Merge tag 'for-f2fs-4.12' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk...
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 8 May 2017 19:24:17 +0000 (12:24 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 8 May 2017 19:24:17 +0000 (12:24 -0700)
Pull f2fs updates from Jaegeuk Kim:
 "In this round, we've focused on enhancing performance with regard to
  block allocation, GC, and discard/in-place-update IO controls. There
  are a bunch of clean-ups as well as minor bug fixes.

  Enhancements:
   - disable heap-based allocation by default
   - issue small-sized discard commands by default
   - change the policy of data hotness for logging
   - distinguish IOs in terms of size and wbc type
   - start SSR earlier to avoid foreground GC
   - enhance data structures managing discard commands
   - enhance in-place update flow
   - add some more fault injection routines
   - secure one more xattr entry

  Bug fixes:
   - calculate victim cost for GC correctly
   - keep the correct victim segment number for GC
   - race condition in nid allocator and initializer
   - stale pointer produced by atomic_writes
   - fix missing REQ_SYNC for flush commands
   - handle missing errors in more corner cases"

* tag 'for-f2fs-4.12' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs: (111 commits)
  f2fs: fix a mount fail for wrong next_scan_nid
  f2fs: enhance scalability of trace macro
  f2fs: relocate inode_{,un}lock in F2FS_IOC_SETFLAGS
  f2fs: Make flush bios explicitely sync
  f2fs: show available_nids in f2fs/status
  f2fs: flush dirty nats periodically
  f2fs: introduce CP_TRIMMED_FLAG to avoid unneeded discard
  f2fs: allow cpc->reason to indicate more than one reason
  f2fs: release cp and dnode lock before IPU
  f2fs: shrink size of struct discard_cmd
  f2fs: don't hold cmd_lock during waiting discard command
  f2fs: nullify fio->encrypted_page for each writes
  f2fs: sanity check segment count
  f2fs: introduce valid_ipu_blkaddr to clean up
  f2fs: lookup extent cache first under IPU scenario
  f2fs: reconstruct code to write a data page
  f2fs: introduce __wait_discard_cmd
  f2fs: introduce __issue_discard_cmd
  f2fs: enable small discard by default
  f2fs: delay awaking discard thread
  ...

22 files changed:
fs/f2fs/checkpoint.c
fs/f2fs/data.c
fs/f2fs/debug.c
fs/f2fs/dir.c
fs/f2fs/extent_cache.c
fs/f2fs/f2fs.h
fs/f2fs/file.c
fs/f2fs/gc.c
fs/f2fs/inline.c
fs/f2fs/inode.c
fs/f2fs/namei.c
fs/f2fs/node.c
fs/f2fs/node.h
fs/f2fs/recovery.c
fs/f2fs/segment.c
fs/f2fs/segment.h
fs/f2fs/super.c
fs/f2fs/trace.c
fs/f2fs/xattr.c
fs/f2fs/xattr.h
include/linux/f2fs_fs.h
include/trace/events/f2fs.h

index 0339daf4ca02fac4090fc59783bbfa745bccc9eb..ea9c317b5916ee51610069c2c58cdb504f95969d 100644 (file)
@@ -275,10 +275,11 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
                get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
                goto skip_write;
 
-       trace_f2fs_writepages(mapping->host, wbc, META);
+       /* if locked failed, cp will flush dirty pages instead */
+       if (!mutex_trylock(&sbi->cp_mutex))
+               goto skip_write;
 
-       /* if mounting is failed, skip writing node pages */
-       mutex_lock(&sbi->cp_mutex);
+       trace_f2fs_writepages(mapping->host, wbc, META);
        diff = nr_pages_to_write(sbi, META, wbc);
        written = sync_meta_pages(sbi, META, wbc->nr_to_write);
        mutex_unlock(&sbi->cp_mutex);
@@ -567,7 +568,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
        if (ni.blk_addr != NULL_ADDR) {
                set_sbi_flag(sbi, SBI_NEED_FSCK);
                f2fs_msg(sbi->sb, KERN_WARNING,
-                       "%s: orphan failed (ino=%x), run fsck to fix.",
+                       "%s: orphan failed (ino=%x) by kernel, retry mount.",
                                __func__, ino);
                return -EIO;
        }
@@ -677,7 +678,7 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr,
        *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page);
 
        crc_offset = le32_to_cpu((*cp_block)->checksum_offset);
-       if (crc_offset >= blk_size) {
+       if (crc_offset > (blk_size - sizeof(__le32))) {
                f2fs_msg(sbi->sb, KERN_WARNING,
                        "invalid crc_offset: %zu", crc_offset);
                return -EINVAL;
@@ -816,7 +817,9 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type)
                return;
 
        set_inode_flag(inode, flag);
-       list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]);
+       if (!f2fs_is_volatile_file(inode))
+               list_add_tail(&F2FS_I(inode)->dirty_list,
+                                               &sbi->inode_list[type]);
        stat_inc_dirty_inode(sbi, type);
 }
 
@@ -941,6 +944,19 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi)
        return 0;
 }
 
+static void __prepare_cp_block(struct f2fs_sb_info *sbi)
+{
+       struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+       struct f2fs_nm_info *nm_i = NM_I(sbi);
+       nid_t last_nid = nm_i->next_scan_nid;
+
+       next_free_nid(sbi, &last_nid);
+       ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
+       ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
+       ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
+       ckpt->next_free_nid = cpu_to_le32(last_nid);
+}
+
 /*
  * Freeze all the FS-operations for checkpoint.
  */
@@ -964,21 +980,26 @@ retry_flush_dents:
                err = sync_dirty_inodes(sbi, DIR_INODE);
                if (err)
                        goto out;
+               cond_resched();
                goto retry_flush_dents;
        }
 
+       /*
+        * POR: we should ensure that there are no dirty node pages
+        * until finishing nat/sit flush. inode->i_blocks can be updated.
+        */
+       down_write(&sbi->node_change);
+
        if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
+               up_write(&sbi->node_change);
                f2fs_unlock_all(sbi);
                err = f2fs_sync_inode_meta(sbi);
                if (err)
                        goto out;
+               cond_resched();
                goto retry_flush_dents;
        }
 
-       /*
-        * POR: we should ensure that there are no dirty node pages
-        * until finishing nat/sit flush.
-        */
 retry_flush_nodes:
        down_write(&sbi->node_write);
 
@@ -986,11 +1007,20 @@ retry_flush_nodes:
                up_write(&sbi->node_write);
                err = sync_node_pages(sbi, &wbc);
                if (err) {
+                       up_write(&sbi->node_change);
                        f2fs_unlock_all(sbi);
                        goto out;
                }
+               cond_resched();
                goto retry_flush_nodes;
        }
+
+       /*
+        * sbi->node_change is used only for AIO write_begin path which produces
+        * dirty node blocks and some checkpoint values by block allocation.
+        */
+       __prepare_cp_block(sbi);
+       up_write(&sbi->node_change);
 out:
        blk_finish_plug(&plug);
        return err;
@@ -1024,16 +1054,20 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
        spin_lock(&sbi->cp_lock);
 
-       if (cpc->reason == CP_UMOUNT && ckpt->cp_pack_total_block_count >
+       if ((cpc->reason & CP_UMOUNT) &&
+                       le32_to_cpu(ckpt->cp_pack_total_block_count) >
                        sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks)
                disable_nat_bits(sbi, false);
 
-       if (cpc->reason == CP_UMOUNT)
+       if (cpc->reason & CP_TRIMMED)
+               __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG);
+
+       if (cpc->reason & CP_UMOUNT)
                __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
        else
                __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
 
-       if (cpc->reason == CP_FASTBOOT)
+       if (cpc->reason & CP_FASTBOOT)
                __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
        else
                __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
@@ -1057,7 +1091,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
        struct f2fs_nm_info *nm_i = NM_I(sbi);
        unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
-       nid_t last_nid = nm_i->next_scan_nid;
        block_t start_blk;
        unsigned int data_sum_blocks, orphan_blocks;
        __u32 crc32 = 0;
@@ -1074,14 +1107,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
                        return -EIO;
        }
 
-       next_free_nid(sbi, &last_nid);
-
        /*
         * modify checkpoint
         * version number is already updated
         */
        ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
-       ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
        ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
        for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
                ckpt->cur_node_segno[i] =
@@ -1100,10 +1130,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
                                curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
        }
 
-       ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
-       ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
-       ckpt->next_free_nid = cpu_to_le32(last_nid);
-
        /* 2 cp  + n data seg summary + orphan inode blocks */
        data_sum_blocks = npages_for_summary_flush(sbi, false);
        spin_lock(&sbi->cp_lock);
@@ -1143,7 +1169,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
        /* write nat bits */
        if (enabled_nat_bits(sbi, cpc)) {
                __u64 cp_ver = cur_cp_version(ckpt);
-               unsigned int i;
                block_t blk;
 
                cp_ver |= ((__u64)crc32 << 32);
@@ -1250,8 +1275,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
        mutex_lock(&sbi->cp_mutex);
 
        if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
-               (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC ||
-               (cpc->reason == CP_DISCARD && !sbi->discard_blks)))
+               ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
+               ((cpc->reason & CP_DISCARD) && !sbi->discard_blks)))
                goto out;
        if (unlikely(f2fs_cp_error(sbi))) {
                err = -EIO;
@@ -1273,7 +1298,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
        f2fs_flush_merged_bios(sbi);
 
        /* this is the case of multiple fstrims without any changes */
-       if (cpc->reason == CP_DISCARD) {
+       if (cpc->reason & CP_DISCARD) {
                if (!exist_trim_candidates(sbi, cpc)) {
                        unblock_operations(sbi);
                        goto out;
@@ -1311,7 +1336,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
        unblock_operations(sbi);
        stat_inc_cp_count(sbi->stat_info);
 
-       if (cpc->reason == CP_RECOVERY)
+       if (cpc->reason & CP_RECOVERY)
                f2fs_msg(sbi->sb, KERN_NOTICE,
                        "checkpoint: version = %llx", ckpt_ver);
 
index 1602b4bccae61e8ac9fcff3d4810b953294773e2..7c0f6bdf817d4370b74b36de9096fad73c75fbd4 100644 (file)
@@ -309,7 +309,7 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
        if (type >= META_FLUSH) {
                io->fio.type = META_FLUSH;
                io->fio.op = REQ_OP_WRITE;
-               io->fio.op_flags = REQ_META | REQ_PRIO;
+               io->fio.op_flags = REQ_META | REQ_PRIO | REQ_SYNC;
                if (!test_opt(sbi, NOBARRIER))
                        io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
        }
@@ -341,7 +341,7 @@ void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi)
 
 /*
  * Fill the locked page with data located in the block address.
- * Return unlocked page.
+ * A caller needs to unlock the page on failure.
  */
 int f2fs_submit_page_bio(struct f2fs_io_info *fio)
 {
@@ -362,6 +362,9 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
        bio_set_op_attrs(bio, fio->op, fio->op_flags);
 
        __submit_bio(fio->sbi, bio, fio->type);
+
+       if (!is_read_io(fio->op))
+               inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page));
        return 0;
 }
 
@@ -787,6 +790,21 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
        return err;
 }
 
+static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
+{
+       if (flag == F2FS_GET_BLOCK_PRE_AIO) {
+               if (lock)
+                       down_read(&sbi->node_change);
+               else
+                       up_read(&sbi->node_change);
+       } else {
+               if (lock)
+                       f2fs_lock_op(sbi);
+               else
+                       f2fs_unlock_op(sbi);
+       }
+}
+
 /*
  * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with
  * f2fs_map_blocks structure.
@@ -829,7 +847,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
 
 next_dnode:
        if (create)
-               f2fs_lock_op(sbi);
+               __do_map_lock(sbi, flag, true);
 
        /* When reading holes, we need its node page */
        set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -939,7 +957,7 @@ skip:
        f2fs_put_dnode(&dn);
 
        if (create) {
-               f2fs_unlock_op(sbi);
+               __do_map_lock(sbi, flag, false);
                f2fs_balance_fs(sbi, dn.node_changed);
        }
        goto next_dnode;
@@ -948,7 +966,7 @@ sync_out:
        f2fs_put_dnode(&dn);
 unlock_out:
        if (create) {
-               f2fs_unlock_op(sbi);
+               __do_map_lock(sbi, flag, false);
                f2fs_balance_fs(sbi, dn.node_changed);
        }
 out:
@@ -1151,9 +1169,10 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
 
        for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
 
-               prefetchw(&page->flags);
                if (pages) {
                        page = list_last_entry(pages, struct page, lru);
+
+                       prefetchw(&page->flags);
                        list_del(&page->lru);
                        if (add_to_page_cache_lru(page, mapping,
                                                  page->index,
@@ -1283,17 +1302,83 @@ static int f2fs_read_data_pages(struct file *file,
        return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages);
 }
 
+static int encrypt_one_page(struct f2fs_io_info *fio)
+{
+       struct inode *inode = fio->page->mapping->host;
+       gfp_t gfp_flags = GFP_NOFS;
+
+       if (!f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode))
+               return 0;
+
+       /* wait for GCed encrypted page writeback */
+       f2fs_wait_on_encrypted_page_writeback(fio->sbi, fio->old_blkaddr);
+
+retry_encrypt:
+       fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
+                       PAGE_SIZE, 0, fio->page->index, gfp_flags);
+       if (!IS_ERR(fio->encrypted_page))
+               return 0;
+
+       /* flush pending IOs and wait for a while in the ENOMEM case */
+       if (PTR_ERR(fio->encrypted_page) == -ENOMEM) {
+               f2fs_flush_merged_bios(fio->sbi);
+               congestion_wait(BLK_RW_ASYNC, HZ/50);
+               gfp_flags |= __GFP_NOFAIL;
+               goto retry_encrypt;
+       }
+       return PTR_ERR(fio->encrypted_page);
+}
+
+static inline bool need_inplace_update(struct f2fs_io_info *fio)
+{
+       struct inode *inode = fio->page->mapping->host;
+
+       if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
+               return false;
+       if (is_cold_data(fio->page))
+               return false;
+       if (IS_ATOMIC_WRITTEN_PAGE(fio->page))
+               return false;
+
+       return need_inplace_update_policy(inode, fio);
+}
+
+static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio)
+{
+       if (fio->old_blkaddr == NEW_ADDR)
+               return false;
+       if (fio->old_blkaddr == NULL_ADDR)
+               return false;
+       return true;
+}
+
 int do_write_data_page(struct f2fs_io_info *fio)
 {
        struct page *page = fio->page;
        struct inode *inode = page->mapping->host;
        struct dnode_of_data dn;
+       struct extent_info ei = {0,0,0};
+       bool ipu_force = false;
        int err = 0;
 
        set_new_dnode(&dn, inode, NULL, NULL, 0);
+       if (need_inplace_update(fio) &&
+                       f2fs_lookup_extent_cache(inode, page->index, &ei)) {
+               fio->old_blkaddr = ei.blk + page->index - ei.fofs;
+
+               if (valid_ipu_blkaddr(fio)) {
+                       ipu_force = true;
+                       fio->need_lock = false;
+                       goto got_it;
+               }
+       }
+
+       if (fio->need_lock)
+               f2fs_lock_op(fio->sbi);
+
        err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
        if (err)
-               return err;
+               goto out;
 
        fio->old_blkaddr = dn.data_blkaddr;
 
@@ -1302,31 +1387,10 @@ int do_write_data_page(struct f2fs_io_info *fio)
                ClearPageUptodate(page);
                goto out_writepage;
        }
-
-       if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
-               gfp_t gfp_flags = GFP_NOFS;
-
-               /* wait for GCed encrypted page writeback */
-               f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode),
-                                                       fio->old_blkaddr);
-retry_encrypt:
-               fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
-                                                       PAGE_SIZE, 0,
-                                                       fio->page->index,
-                                                       gfp_flags);
-               if (IS_ERR(fio->encrypted_page)) {
-                       err = PTR_ERR(fio->encrypted_page);
-                       if (err == -ENOMEM) {
-                               /* flush pending ios and wait for a while */
-                               f2fs_flush_merged_bios(F2FS_I_SB(inode));
-                               congestion_wait(BLK_RW_ASYNC, HZ/50);
-                               gfp_flags |= __GFP_NOFAIL;
-                               err = 0;
-                               goto retry_encrypt;
-                       }
-                       goto out_writepage;
-               }
-       }
+got_it:
+       err = encrypt_one_page(fio);
+       if (err)
+               goto out_writepage;
 
        set_page_writeback(page);
 
@@ -1334,22 +1398,27 @@ retry_encrypt:
         * If current allocation needs SSR,
         * it had better in-place writes for updated data.
         */
-       if (unlikely(fio->old_blkaddr != NEW_ADDR &&
-                       !is_cold_data(page) &&
-                       !IS_ATOMIC_WRITTEN_PAGE(page) &&
-                       need_inplace_update(inode))) {
-               rewrite_data_page(fio);
+       if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) {
+               f2fs_put_dnode(&dn);
+               if (fio->need_lock)
+                       f2fs_unlock_op(fio->sbi);
+               err = rewrite_data_page(fio);
+               trace_f2fs_do_write_data_page(fio->page, IPU);
                set_inode_flag(inode, FI_UPDATE_WRITE);
-               trace_f2fs_do_write_data_page(page, IPU);
-       } else {
-               write_data_page(&dn, fio);
-               trace_f2fs_do_write_data_page(page, OPU);
-               set_inode_flag(inode, FI_APPEND_WRITE);
-               if (page->index == 0)
-                       set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
+               return err;
        }
+
+       /* LFS mode write path */
+       write_data_page(&dn, fio);
+       trace_f2fs_do_write_data_page(page, OPU);
+       set_inode_flag(inode, FI_APPEND_WRITE);
+       if (page->index == 0)
+               set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
 out_writepage:
        f2fs_put_dnode(&dn);
+out:
+       if (fio->need_lock)
+               f2fs_unlock_op(fio->sbi);
        return err;
 }
 
@@ -1370,9 +1439,11 @@ static int __write_data_page(struct page *page, bool *submitted,
                .type = DATA,
                .op = REQ_OP_WRITE,
                .op_flags = wbc_to_write_flags(wbc),
+               .old_blkaddr = NULL_ADDR,
                .page = page,
                .encrypted_page = NULL,
                .submitted = false,
+               .need_lock = true,
        };
 
        trace_f2fs_writepage(page, DATA);
@@ -1408,6 +1479,7 @@ write:
 
        /* Dentry blocks are controlled by checkpoint */
        if (S_ISDIR(inode->i_mode)) {
+               fio.need_lock = false;
                err = do_write_data_page(&fio);
                goto done;
        }
@@ -1416,6 +1488,8 @@ write:
                need_balance_fs = true;
        else if (has_not_enough_free_secs(sbi, 0, 0))
                goto redirty_out;
+       else
+               set_inode_flag(inode, FI_HOT_DATA);
 
        err = -EAGAIN;
        if (f2fs_has_inline_data(inode)) {
@@ -1423,12 +1497,12 @@ write:
                if (!err)
                        goto out;
        }
-       f2fs_lock_op(sbi);
+
        if (err == -EAGAIN)
                err = do_write_data_page(&fio);
        if (F2FS_I(inode)->last_disk_size < psize)
                F2FS_I(inode)->last_disk_size = psize;
-       f2fs_unlock_op(sbi);
+
 done:
        if (err && err != -ENOENT)
                goto redirty_out;
@@ -1441,12 +1515,14 @@ out:
        if (wbc->for_reclaim) {
                f2fs_submit_merged_bio_cond(sbi, inode, 0, page->index,
                                                DATA, WRITE);
+               clear_inode_flag(inode, FI_HOT_DATA);
                remove_dirty_inode(inode);
                submitted = NULL;
        }
 
        unlock_page(page);
-       f2fs_balance_fs(sbi, need_balance_fs);
+       if (!S_ISDIR(inode->i_mode))
+               f2fs_balance_fs(sbi, need_balance_fs);
 
        if (unlikely(f2fs_cp_error(sbi))) {
                f2fs_submit_merged_bio(sbi, DATA, WRITE);
@@ -1495,6 +1571,12 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 
        pagevec_init(&pvec, 0);
 
+       if (get_dirty_pages(mapping->host) <=
+                               SM_I(F2FS_M_SB(mapping))->min_hot_blocks)
+               set_inode_flag(mapping->host, FI_HOT_DATA);
+       else
+               clear_inode_flag(mapping->host, FI_HOT_DATA);
+
        if (wbc->range_cyclic) {
                writeback_index = mapping->writeback_index; /* prev offset */
                index = writeback_index;
@@ -1580,8 +1662,10 @@ continue_unlock:
                                last_idx = page->index;
                        }
 
-                       if (--wbc->nr_to_write <= 0 &&
-                           wbc->sync_mode == WB_SYNC_NONE) {
+                       /* give a priority to WB_SYNC threads */
+                       if ((atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) ||
+                                       --wbc->nr_to_write <= 0) &&
+                                       wbc->sync_mode == WB_SYNC_NONE) {
                                done = 1;
                                break;
                        }
@@ -1637,9 +1721,18 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 
        trace_f2fs_writepages(mapping->host, wbc, DATA);
 
+       /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               atomic_inc(&sbi->wb_sync_req);
+       else if (atomic_read(&sbi->wb_sync_req))
+               goto skip_write;
+
        blk_start_plug(&plug);
        ret = f2fs_write_cache_pages(mapping, wbc);
        blk_finish_plug(&plug);
+
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               atomic_dec(&sbi->wb_sync_req);
        /*
         * if some pages were truncated, we cannot guarantee its mapping->host
         * to detect pending bios.
@@ -1687,7 +1780,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
 
        if (f2fs_has_inline_data(inode) ||
                        (pos & PAGE_MASK) >= i_size_read(inode)) {
-               f2fs_lock_op(sbi);
+               __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
                locked = true;
        }
 restart:
@@ -1723,7 +1816,8 @@ restart:
                        err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
                        if (err || dn.data_blkaddr == NULL_ADDR) {
                                f2fs_put_dnode(&dn);
-                               f2fs_lock_op(sbi);
+                               __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO,
+                                                               true);
                                locked = true;
                                goto restart;
                        }
@@ -1737,7 +1831,7 @@ out:
        f2fs_put_dnode(&dn);
 unlock_out:
        if (locked)
-               f2fs_unlock_op(sbi);
+               __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
        return err;
 }
 
@@ -1951,7 +2045,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
 
        /* This is atomic written page, keep Private */
        if (IS_ATOMIC_WRITTEN_PAGE(page))
-               return;
+               return drop_inmem_page(inode, page);
 
        set_page_private(page, 0);
        ClearPagePrivate(page);
index ee2d0a485fc3478fc5f93b5b85c6dad0431e8ea0..87f449845f5f9a8010ee13c40548fb6620802872 100644 (file)
@@ -51,15 +51,26 @@ static void update_general_status(struct f2fs_sb_info *sbi)
        si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
        si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
        si->aw_cnt = atomic_read(&sbi->aw_cnt);
+       si->vw_cnt = atomic_read(&sbi->vw_cnt);
        si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt);
+       si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt);
        si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA);
        si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA);
-       if (SM_I(sbi) && SM_I(sbi)->fcc_info)
-               si->nr_flush =
-                       atomic_read(&SM_I(sbi)->fcc_info->submit_flush);
-       if (SM_I(sbi) && SM_I(sbi)->dcc_info)
-               si->nr_discard =
-                       atomic_read(&SM_I(sbi)->dcc_info->submit_discard);
+       if (SM_I(sbi) && SM_I(sbi)->fcc_info) {
+               si->nr_flushed =
+                       atomic_read(&SM_I(sbi)->fcc_info->issued_flush);
+               si->nr_flushing =
+                       atomic_read(&SM_I(sbi)->fcc_info->issing_flush);
+       }
+       if (SM_I(sbi) && SM_I(sbi)->dcc_info) {
+               si->nr_discarded =
+                       atomic_read(&SM_I(sbi)->dcc_info->issued_discard);
+               si->nr_discarding =
+                       atomic_read(&SM_I(sbi)->dcc_info->issing_discard);
+               si->nr_discard_cmd =
+                       atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt);
+               si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks;
+       }
        si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
        si->rsvd_segs = reserved_segments(sbi);
        si->overp_segs = overprovision_segments(sbi);
@@ -86,6 +97,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
        si->sits = MAIN_SEGS(sbi);
        si->dirty_sits = SIT_I(sbi)->dirty_sentries;
        si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST];
+       si->avail_nids = NM_I(sbi)->available_nids;
        si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST];
        si->bg_gc = sbi->bg_gc;
        si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
@@ -99,8 +111,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
        for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) {
                struct curseg_info *curseg = CURSEG_I(sbi, i);
                si->curseg[i] = curseg->segno;
-               si->cursec[i] = curseg->segno / sbi->segs_per_sec;
-               si->curzone[i] = si->cursec[i] / sbi->secs_per_zone;
+               si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno);
+               si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]);
        }
 
        for (i = 0; i < 2; i++) {
@@ -124,10 +136,10 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
 
        bimodal = 0;
        total_vblocks = 0;
-       blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg;
+       blks_per_sec = BLKS_PER_SEC(sbi);
        hblks_per_sec = blks_per_sec / 2;
        for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
-               vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+               vblocks = get_valid_blocks(sbi, segno, true);
                dist = abs(vblocks - hblks_per_sec);
                bimodal += dist * dist;
 
@@ -156,7 +168,11 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
        if (si->base_mem)
                goto get_cache;
 
-       si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize;
+       /* build stat */
+       si->base_mem = sizeof(struct f2fs_stat_info);
+
+       /* build superblock */
+       si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize;
        si->base_mem += 2 * sizeof(struct f2fs_inode_info);
        si->base_mem += sizeof(*sbi->ckpt);
        si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE;
@@ -208,8 +224,11 @@ get_cache:
        /* build merge flush thread */
        if (SM_I(sbi)->fcc_info)
                si->cache_mem += sizeof(struct flush_cmd_control);
-       if (SM_I(sbi)->dcc_info)
+       if (SM_I(sbi)->dcc_info) {
                si->cache_mem += sizeof(struct discard_cmd_control);
+               si->cache_mem += sizeof(struct discard_cmd) *
+                       atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt);
+       }
 
        /* free nids */
        si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] +
@@ -330,11 +349,16 @@ static int stat_show(struct seq_file *s, void *v)
                seq_printf(s, "  - Inner Struct Count: tree: %d(%d), node: %d\n",
                                si->ext_tree, si->zombie_tree, si->ext_node);
                seq_puts(s, "\nBalancing F2FS Async:\n");
-               seq_printf(s, "  - IO (CP: %4d, Data: %4d, Flush: %4d, Discard: %4d)\n",
+               seq_printf(s, "  - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), "
+                       "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n",
                           si->nr_wb_cp_data, si->nr_wb_data,
-                          si->nr_flush, si->nr_discard);
-               seq_printf(s, "  - inmem: %4d, atomic IO: %4d (Max. %4d)\n",
-                          si->inmem_pages, si->aw_cnt, si->max_aw_cnt);
+                          si->nr_flushing, si->nr_flushed,
+                          si->nr_discarding, si->nr_discarded,
+                          si->nr_discard_cmd, si->undiscard_blks);
+               seq_printf(s, "  - inmem: %4d, atomic IO: %4d (Max. %4d), "
+                       "volatile IO: %4d (Max. %4d)\n",
+                          si->inmem_pages, si->aw_cnt, si->max_aw_cnt,
+                          si->vw_cnt, si->max_vw_cnt);
                seq_printf(s, "  - nodes: %4d in %4d\n",
                           si->ndirty_node, si->node_pages);
                seq_printf(s, "  - dents: %4d in dirs:%4d (%4d)\n",
@@ -347,8 +371,8 @@ static int stat_show(struct seq_file *s, void *v)
                           si->ndirty_imeta);
                seq_printf(s, "  - NATs: %9d/%9d\n  - SITs: %9d/%9d\n",
                           si->dirty_nats, si->nats, si->dirty_sits, si->sits);
-               seq_printf(s, "  - free_nids: %9d, alloc_nids: %9d\n",
-                          si->free_nids, si->alloc_nids);
+               seq_printf(s, "  - free_nids: %9d/%9d\n  - alloc_nids: %9d\n",
+                          si->free_nids, si->avail_nids, si->alloc_nids);
                seq_puts(s, "\nDistribution of User Blocks:");
                seq_puts(s, " [ valid | invalid | free ]\n");
                seq_puts(s, "  [");
@@ -434,7 +458,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
        atomic_set(&sbi->inplace_count, 0);
 
        atomic_set(&sbi->aw_cnt, 0);
+       atomic_set(&sbi->vw_cnt, 0);
        atomic_set(&sbi->max_aw_cnt, 0);
+       atomic_set(&sbi->max_vw_cnt, 0);
 
        mutex_lock(&f2fs_stat_mutex);
        list_add_tail(&si->stat_list, &f2fs_stat_list);
index e640870528349ecd67dae9bd4688bedc4fa84333..94756f55a97e7d86052cdb49ce79fcc131738cfd 100644 (file)
@@ -94,7 +94,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
 
        dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page);
 
-       make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
+       make_dentry_ptr_block(NULL, &d, dentry_blk);
        de = find_target_dentry(fname, namehash, max_slots, &d);
        if (de)
                *res_page = dentry_page;
@@ -192,13 +192,9 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
                f2fs_put_page(dentry_page, 0);
        }
 
-       /* This is to increase the speed of f2fs_create */
-       if (!de && room) {
-               F2FS_I(dir)->task = current;
-               if (F2FS_I(dir)->chash != namehash) {
-                       F2FS_I(dir)->chash = namehash;
-                       F2FS_I(dir)->clevel = level;
-               }
+       if (!de && room && F2FS_I(dir)->chash != namehash) {
+               F2FS_I(dir)->chash = namehash;
+               F2FS_I(dir)->clevel = level;
        }
 
        return de;
@@ -239,6 +235,9 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
                        break;
        }
 out:
+       /* This is to increase the speed of f2fs_create */
+       if (!de)
+               F2FS_I(dir)->task = current;
        return de;
 }
 
@@ -322,24 +321,6 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
        set_page_dirty(ipage);
 }
 
-int update_dent_inode(struct inode *inode, struct inode *to,
-                                       const struct qstr *name)
-{
-       struct page *page;
-
-       if (file_enc_name(to))
-               return 0;
-
-       page = get_node_page(F2FS_I_SB(inode), inode->i_ino);
-       if (IS_ERR(page))
-               return PTR_ERR(page);
-
-       init_dent_inode(name, page);
-       f2fs_put_page(page, 1);
-
-       return 0;
-}
-
 void do_make_empty_dir(struct inode *inode, struct inode *parent,
                                        struct f2fs_dentry_ptr *d)
 {
@@ -369,7 +350,7 @@ static int make_empty_dir(struct inode *inode,
 
        dentry_blk = kmap_atomic(dentry_page);
 
-       make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
+       make_dentry_ptr_block(NULL, &d, dentry_blk);
        do_make_empty_dir(inode, parent, &d);
 
        kunmap_atomic(dentry_blk);
@@ -423,8 +404,11 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
                set_cold_node(inode, page);
        }
 
-       if (new_name)
+       if (new_name) {
                init_dent_inode(new_name, page);
+               if (f2fs_encrypted_inode(dir))
+                       file_set_enc_name(inode);
+       }
 
        /*
         * This file should be checkpointed during fsync.
@@ -584,11 +568,9 @@ add_dentry:
                        err = PTR_ERR(page);
                        goto fail;
                }
-               if (f2fs_encrypted_inode(dir))
-                       file_set_enc_name(inode);
        }
 
-       make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
+       make_dentry_ptr_block(NULL, &d, dentry_blk);
        f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos);
 
        set_page_dirty(dentry_page);
@@ -896,7 +878,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
 
                dentry_blk = kmap(dentry_page);
 
-               make_dentry_ptr(inode, &d, (void *)dentry_blk, 1);
+               make_dentry_ptr_block(inode, &d, dentry_blk);
 
                err = f2fs_fill_dentries(ctx, &d,
                                n * NR_DENTRY_IN_BLOCK, &fstr);
index c6934f014e0f1d9a458a6489422612a6484fe198..2f98d70397013317c393257c3a957f4fa12e1c00 100644 (file)
 #include "node.h"
 #include <trace/events/f2fs.h>
 
+/*
+ * Fast path of the rb-tree lookup: test only the caller-supplied cached entry.
+ * Returns @cached_re when it is non-NULL and @ofs falls inside
+ * [cached_re->ofs, cached_re->ofs + cached_re->len); otherwise returns NULL.
+ */
+static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re,
+                                                       unsigned int ofs)
+{
+       if (cached_re) {
+               if (cached_re->ofs <= ofs &&
+                               cached_re->ofs + cached_re->len > ofs) {
+                       return cached_re;
+               }
+       }
+       return NULL;
+}
+
+/*
+ * Slow path of the rb-tree lookup: descend from @root, going left when @ofs
+ * is below an entry's start and right when it is at or past the entry's end
+ * (ofs + len). Returns the entry whose range contains @ofs, or NULL if the
+ * descent runs off the tree.
+ */
+static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root,
+                                                       unsigned int ofs)
+{
+       struct rb_node *node = root->rb_node;
+       struct rb_entry *re;
+
+       while (node) {
+               re = rb_entry(node, struct rb_entry, rb_node);
+
+               if (ofs < re->ofs)
+                       node = node->rb_left;
+               else if (ofs >= re->ofs + re->len)
+                       node = node->rb_right;
+               else
+                       return re;
+       }
+       return NULL;
+}
+
+/*
+ * Look up the entry covering @ofs: try @cached_re first (may be NULL) and
+ * fall back to a search of @root only when the cached entry does not match.
+ * Returns the matching entry, or NULL if no entry covers @ofs.
+ */
+struct rb_entry *__lookup_rb_tree(struct rb_root *root,
+                               struct rb_entry *cached_re, unsigned int ofs)
+{
+       struct rb_entry *re;
+
+       re = __lookup_rb_tree_fast(cached_re, ofs);
+       if (!re)
+               return __lookup_rb_tree_slow(root, ofs);
+
+       return re;
+}
+
+/*
+ * Find the link slot where a new entry starting at @ofs should be attached.
+ * Returns the address of the empty child pointer reached by the descent and
+ * stores the last visited node in *@parent. *@parent is left untouched when
+ * the tree is empty.
+ * An existing entry that already covers @ofs is reported via f2fs_bug_on().
+ */
+struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
+                               struct rb_root *root, struct rb_node **parent,
+                               unsigned int ofs)
+{
+       struct rb_node **p = &root->rb_node;
+       struct rb_entry *re;
+
+       while (*p) {
+               *parent = *p;
+               re = rb_entry(*parent, struct rb_entry, rb_node);
+
+               if (ofs < re->ofs)
+                       p = &(*p)->rb_left;
+               else if (ofs >= re->ofs + re->len)
+                       p = &(*p)->rb_right;
+               else
+                       /*
+                        * NOTE(review): this branch does not advance p; if
+                        * f2fs_bug_on() returns (e.g. it only warns), the loop
+                        * would revisit the same node forever - confirm.
+                        */
+                       f2fs_bug_on(sbi, 1);
+       }
+
+       return p;
+}
+
+/*
+ * Look up the rb entry covering @ofs in @root, checking @cached_re first.
+ *
+ * Hit: return the entry. *@prev_entry is looked up only if @ofs is the
+ * entry's first offset or @force is set, *@next_entry only if @ofs is the
+ * entry's last offset or @force is set; *@insert_p and *@insert_parent
+ * stay NULL.
+ *
+ * Miss: return NULL and fill
+ * @prev_entry: entry before @ofs
+ * @next_entry: entry after @ofs
+ * @insert_p, @insert_parent: link slot and parent node for a new entry at
+ * @ofs, in order to simplify the insertion after.
+ * The tree must stay unchanged between lookup and insertion.
+ *
+ * For an empty tree all four outputs are NULL and NULL is returned.
+ */
+struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root,
+                               struct rb_entry *cached_re,
+                               unsigned int ofs,
+                               struct rb_entry **prev_entry,
+                               struct rb_entry **next_entry,
+                               struct rb_node ***insert_p,
+                               struct rb_node **insert_parent,
+                               bool force)
+{
+       struct rb_node **pnode = &root->rb_node;
+       struct rb_node *parent = NULL, *tmp_node;
+       struct rb_entry *re = cached_re;
+
+       *insert_p = NULL;
+       *insert_parent = NULL;
+       *prev_entry = NULL;
+       *next_entry = NULL;
+
+       if (RB_EMPTY_ROOT(root))
+               return NULL;
+
+       if (re) {
+               if (re->ofs <= ofs && re->ofs + re->len > ofs)
+                       goto lookup_neighbors;
+       }
+
+       while (*pnode) {
+               parent = *pnode;
+               re = rb_entry(*pnode, struct rb_entry, rb_node);
+
+               if (ofs < re->ofs)
+                       pnode = &(*pnode)->rb_left;
+               else if (ofs >= re->ofs + re->len)
+                       pnode = &(*pnode)->rb_right;
+               else
+                       goto lookup_neighbors;
+       }
+
+       /* miss: hand back the empty link slot and its parent for insertion */
+       *insert_p = pnode;
+       *insert_parent = parent;
+
+       re = rb_entry(parent, struct rb_entry, rb_node);
+       tmp_node = parent;
+       if (parent && ofs > re->ofs)
+               tmp_node = rb_next(parent);
+       *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+
+       tmp_node = parent;
+       if (parent && ofs < re->ofs)
+               tmp_node = rb_prev(parent);
+       *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+       return NULL;
+
+lookup_neighbors:
+       if (ofs == re->ofs || force) {
+               /* lookup prev node for merging backward later */
+               tmp_node = rb_prev(&re->rb_node);
+               *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+       }
+       if (ofs == re->ofs + re->len - 1 || force) {
+               /* lookup next node for merging frontward later */
+               tmp_node = rb_next(&re->rb_node);
+               *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+       }
+       return re;
+}
+
+/*
+ * Debug check that the entries in @root do not overlap: walk the tree from
+ * rb_first() via rb_next() and compare the end of each entry (ofs + len)
+ * with the start of its successor. On overlap, log both entries and return
+ * false; otherwise return true.
+ * Without CONFIG_F2FS_CHECK_FS the body is compiled out and the function
+ * always returns true.
+ */
+bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi,
+                                               struct rb_root *root)
+{
+#ifdef CONFIG_F2FS_CHECK_FS
+       struct rb_node *cur = rb_first(root), *next;
+       struct rb_entry *cur_re, *next_re;
+
+       if (!cur)
+               return true;
+
+       while (cur) {
+               next = rb_next(cur);
+               if (!next)
+                       return true;
+
+               cur_re = rb_entry(cur, struct rb_entry, rb_node);
+               next_re = rb_entry(next, struct rb_entry, rb_node);
+
+               if (cur_re->ofs + cur_re->len > next_re->ofs) {
+                       f2fs_msg(sbi->sb, KERN_INFO, "inconsistent rbtree, "
+                               "cur(%u, %u) next(%u, %u)",
+                               cur_re->ofs, cur_re->len,
+                               next_re->ofs, next_re->len);
+                       return false;
+               }
+
+               cur = next;
+       }
+#endif
+       return true;
+}
+
 static struct kmem_cache *extent_tree_slab;
 static struct kmem_cache *extent_node_slab;
 
@@ -102,36 +275,6 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode)
        return et;
 }
 
-static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi,
-                               struct extent_tree *et, unsigned int fofs)
-{
-       struct rb_node *node = et->root.rb_node;
-       struct extent_node *en = et->cached_en;
-
-       if (en) {
-               struct extent_info *cei = &en->ei;
-
-               if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) {
-                       stat_inc_cached_node_hit(sbi);
-                       return en;
-               }
-       }
-
-       while (node) {
-               en = rb_entry(node, struct extent_node, rb_node);
-
-               if (fofs < en->ei.fofs) {
-                       node = node->rb_left;
-               } else if (fofs >= en->ei.fofs + en->ei.len) {
-                       node = node->rb_right;
-               } else {
-                       stat_inc_rbtree_node_hit(sbi);
-                       return en;
-               }
-       }
-       return NULL;
-}
-
 static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi,
                                struct extent_tree *et, struct extent_info *ei)
 {
@@ -237,17 +380,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
                goto out;
        }
 
-       en = __lookup_extent_tree(sbi, et, pgofs);
-       if (en) {
-               *ei = en->ei;
-               spin_lock(&sbi->extent_lock);
-               if (!list_empty(&en->list)) {
-                       list_move_tail(&en->list, &sbi->extent_list);
-                       et->cached_en = en;
-               }
-               spin_unlock(&sbi->extent_lock);
-               ret = true;
+       en = (struct extent_node *)__lookup_rb_tree(&et->root,
+                               (struct rb_entry *)et->cached_en, pgofs);
+       if (!en)
+               goto out;
+
+       if (en == et->cached_en)
+               stat_inc_cached_node_hit(sbi);
+       else
+               stat_inc_rbtree_node_hit(sbi);
+
+       *ei = en->ei;
+       spin_lock(&sbi->extent_lock);
+       if (!list_empty(&en->list)) {
+               list_move_tail(&en->list, &sbi->extent_list);
+               et->cached_en = en;
        }
+       spin_unlock(&sbi->extent_lock);
+       ret = true;
 out:
        stat_inc_total_hit(sbi);
        read_unlock(&et->lock);
@@ -256,83 +406,6 @@ out:
        return ret;
 }
 
-
-/*
- * lookup extent at @fofs, if hit, return the extent
- * if not, return NULL and
- * @prev_ex: extent before fofs
- * @next_ex: extent after fofs
- * @insert_p: insert point for new extent at fofs
- * in order to simpfy the insertion after.
- * tree must stay unchanged between lookup and insertion.
- */
-static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et,
-                               unsigned int fofs,
-                               struct extent_node **prev_ex,
-                               struct extent_node **next_ex,
-                               struct rb_node ***insert_p,
-                               struct rb_node **insert_parent)
-{
-       struct rb_node **pnode = &et->root.rb_node;
-       struct rb_node *parent = NULL, *tmp_node;
-       struct extent_node *en = et->cached_en;
-
-       *insert_p = NULL;
-       *insert_parent = NULL;
-       *prev_ex = NULL;
-       *next_ex = NULL;
-
-       if (RB_EMPTY_ROOT(&et->root))
-               return NULL;
-
-       if (en) {
-               struct extent_info *cei = &en->ei;
-
-               if (cei->fofs <= fofs && cei->fofs + cei->len > fofs)
-                       goto lookup_neighbors;
-       }
-
-       while (*pnode) {
-               parent = *pnode;
-               en = rb_entry(*pnode, struct extent_node, rb_node);
-
-               if (fofs < en->ei.fofs)
-                       pnode = &(*pnode)->rb_left;
-               else if (fofs >= en->ei.fofs + en->ei.len)
-                       pnode = &(*pnode)->rb_right;
-               else
-                       goto lookup_neighbors;
-       }
-
-       *insert_p = pnode;
-       *insert_parent = parent;
-
-       en = rb_entry(parent, struct extent_node, rb_node);
-       tmp_node = parent;
-       if (parent && fofs > en->ei.fofs)
-               tmp_node = rb_next(parent);
-       *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node);
-
-       tmp_node = parent;
-       if (parent && fofs < en->ei.fofs)
-               tmp_node = rb_prev(parent);
-       *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node);
-       return NULL;
-
-lookup_neighbors:
-       if (fofs == en->ei.fofs) {
-               /* lookup prev node for merging backward later */
-               tmp_node = rb_prev(&en->rb_node);
-               *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node);
-       }
-       if (fofs == en->ei.fofs + en->ei.len - 1) {
-               /* lookup next node for merging frontward later */
-               tmp_node = rb_next(&en->rb_node);
-               *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node);
-       }
-       return en;
-}
-
 static struct extent_node *__try_merge_extent_node(struct inode *inode,
                                struct extent_tree *et, struct extent_info *ei,
                                struct extent_node *prev_ex,
@@ -387,17 +460,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode,
                goto do_insert;
        }
 
-       while (*p) {
-               parent = *p;
-               en = rb_entry(parent, struct extent_node, rb_node);
-
-               if (ei->fofs < en->ei.fofs)
-                       p = &(*p)->rb_left;
-               else if (ei->fofs >= en->ei.fofs + en->ei.len)
-                       p = &(*p)->rb_right;
-               else
-                       f2fs_bug_on(sbi, 1);
-       }
+       p = __lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs);
 do_insert:
        en = __attach_extent_node(sbi, et, ei, parent, p);
        if (!en)
@@ -447,8 +510,11 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
        __drop_largest_extent(inode, fofs, len);
 
        /* 1. lookup first extent node in range [fofs, fofs + len - 1] */
-       en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en,
-                                       &insert_p, &insert_parent);
+       en = (struct extent_node *)__lookup_rb_tree_ret(&et->root,
+                                       (struct rb_entry *)et->cached_en, fofs,
+                                       (struct rb_entry **)&prev_en,
+                                       (struct rb_entry **)&next_en,
+                                       &insert_p, &insert_parent, false);
        if (!en)
                en = next_en;
 
index 05d7e2cefc566d73115de05a108c21dcc2df0756..e26999a745220a9691614c21df5a11ee6880a021 100644 (file)
@@ -50,6 +50,7 @@ enum {
        FAULT_BLOCK,
        FAULT_DIR_DEPTH,
        FAULT_EVICT_INODE,
+       FAULT_TRUNCATE,
        FAULT_IO,
        FAULT_CHECKPOINT,
        FAULT_MAX,
@@ -62,7 +63,7 @@ struct f2fs_fault_info {
 };
 
 extern char *fault_name[FAULT_MAX];
-#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type)))
+#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type)))
 #endif
 
 /*
@@ -88,9 +89,9 @@ extern char *fault_name[FAULT_MAX];
 #define F2FS_MOUNT_ADAPTIVE            0x00020000
 #define F2FS_MOUNT_LFS                 0x00040000
 
-#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
-#define set_opt(sbi, option)   (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
-#define test_opt(sbi, option)  (sbi->mount_opt.opt & F2FS_MOUNT_##option)
+#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option)
+#define set_opt(sbi, option)   ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option)
+#define test_opt(sbi, option)  ((sbi)->mount_opt.opt & F2FS_MOUNT_##option)
 
 #define ver_after(a, b)        (typecheck(unsigned long long, a) &&            \
                typecheck(unsigned long long, b) &&                     \
@@ -124,22 +125,20 @@ enum {
        SIT_BITMAP
 };
 
-enum {
-       CP_UMOUNT,
-       CP_FASTBOOT,
-       CP_SYNC,
-       CP_RECOVERY,
-       CP_DISCARD,
-};
+#define        CP_UMOUNT       0x00000001
+#define        CP_FASTBOOT     0x00000002
+#define        CP_SYNC         0x00000004
+#define        CP_RECOVERY     0x00000008
+#define        CP_DISCARD      0x00000010
+#define CP_TRIMMED     0x00000020
 
 #define DEF_BATCHED_TRIM_SECTIONS      2048
 #define BATCHED_TRIM_SEGMENTS(sbi)     \
-               (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec)
+               (GET_SEG_FROM_SEC(sbi, SM_I(sbi)->trim_sections))
 #define BATCHED_TRIM_BLOCKS(sbi)       \
                (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg)
-#define MAX_DISCARD_BLOCKS(sbi)                                                \
-               ((1 << (sbi)->log_blocks_per_seg) * (sbi)->segs_per_sec)
-#define DISCARD_ISSUE_RATE     8
+#define MAX_DISCARD_BLOCKS(sbi)                BLKS_PER_SEC(sbi)
+#define DISCARD_ISSUE_RATE             8
 #define DEF_CP_INTERVAL                        60      /* 60 secs */
 #define DEF_IDLE_INTERVAL              5       /* 5 secs */
 
@@ -181,37 +180,63 @@ struct inode_entry {
        struct inode *inode;    /* vfs inode pointer */
 };
 
-/* for the list of blockaddresses to be discarded */
+/* for the bitmap indicate blocks to be discarded */
 struct discard_entry {
        struct list_head list;  /* list head */
-       block_t blkaddr;        /* block address to be discarded */
-       int len;                /* # of consecutive blocks of the discard */
+       block_t start_blkaddr;  /* start blockaddr of current segment */
+       unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */
 };
 
+/* max discard pend list number; plist_idx() clamps to the last list */
+#define MAX_PLIST_NUM          512
+#define plist_idx(blk_num)     ((blk_num) >= MAX_PLIST_NUM ?           \
+                                       (MAX_PLIST_NUM - 1) : ((blk_num) - 1))
+
 enum {
        D_PREP,
        D_SUBMIT,
        D_DONE,
 };
 
+struct discard_info {
+       block_t lstart;                 /* logical start address */
+       block_t len;                    /* length */
+       block_t start;                  /* actual start address in dev */
+};
+
 struct discard_cmd {
+       struct rb_node rb_node;         /* rb node located in rb-tree */
+       union {
+               struct {
+                       block_t lstart; /* logical start address */
+                       block_t len;    /* length */
+                       block_t start;  /* actual start address in dev */
+               };
+               struct discard_info di; /* discard info */
+
+       };
        struct list_head list;          /* command list */
        struct completion wait;         /* compleation */
-       block_t lstart;                 /* logical start address */
-       block_t len;                    /* length */
-       struct bio *bio;                /* bio */
-       int state;                      /* state */
+       struct block_device *bdev;      /* bdev */
+       unsigned short ref;             /* reference count */
+       unsigned char state;            /* state */
+       int error;                      /* bio error */
 };
 
 struct discard_cmd_control {
        struct task_struct *f2fs_issue_discard; /* discard thread */
-       struct list_head discard_entry_list;    /* 4KB discard entry list */
-       int nr_discards;                        /* # of discards in the list */
-       struct list_head discard_cmd_list;      /* discard cmd list */
+       struct list_head entry_list;            /* 4KB discard entry list */
+       struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */
+       struct list_head wait_list;             /* store on-flushing entries */
        wait_queue_head_t discard_wait_queue;   /* waiting queue for wake-up */
        struct mutex cmd_lock;
-       int max_discards;                       /* max. discards to be issued */
-       atomic_t submit_discard;                /* # of issued discard */
+       unsigned int nr_discards;               /* # of discards in the list */
+       unsigned int max_discards;              /* max. discards to be issued */
+       unsigned int undiscard_blks;            /* # of undiscard blocks */
+       atomic_t issued_discard;                /* # of issued discard */
+       atomic_t issing_discard;                /* # of issuing discard */
+       atomic_t discard_cmd_cnt;               /* # of cached discard cmds */
+       struct rb_root root;                    /* root of discard rb-tree */
 };
 
 /* for the list of fsync inodes, used only during recovery */
@@ -222,13 +247,13 @@ struct fsync_inode_entry {
        block_t last_dentry;    /* block address locating the last dentry */
 };
 
-#define nats_in_cursum(jnl)            (le16_to_cpu(jnl->n_nats))
-#define sits_in_cursum(jnl)            (le16_to_cpu(jnl->n_sits))
+#define nats_in_cursum(jnl)            (le16_to_cpu((jnl)->n_nats))
+#define sits_in_cursum(jnl)            (le16_to_cpu((jnl)->n_sits))
 
-#define nat_in_journal(jnl, i)         (jnl->nat_j.entries[i].ne)
-#define nid_in_journal(jnl, i)         (jnl->nat_j.entries[i].nid)
-#define sit_in_journal(jnl, i)         (jnl->sit_j.entries[i].se)
-#define segno_in_journal(jnl, i)       (jnl->sit_j.entries[i].segno)
+#define nat_in_journal(jnl, i)         ((jnl)->nat_j.entries[i].ne)
+#define nid_in_journal(jnl, i)         ((jnl)->nat_j.entries[i].nid)
+#define sit_in_journal(jnl, i)         ((jnl)->sit_j.entries[i].se)
+#define segno_in_journal(jnl, i)       ((jnl)->sit_j.entries[i].segno)
 
 #define MAX_NAT_JENTRIES(jnl)  (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl))
 #define MAX_SIT_JENTRIES(jnl)  (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl))
@@ -270,11 +295,14 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
 #define F2FS_IOC_START_VOLATILE_WRITE  _IO(F2FS_IOCTL_MAGIC, 3)
 #define F2FS_IOC_RELEASE_VOLATILE_WRITE        _IO(F2FS_IOCTL_MAGIC, 4)
 #define F2FS_IOC_ABORT_VOLATILE_WRITE  _IO(F2FS_IOCTL_MAGIC, 5)
-#define F2FS_IOC_GARBAGE_COLLECT       _IO(F2FS_IOCTL_MAGIC, 6)
+#define F2FS_IOC_GARBAGE_COLLECT       _IOW(F2FS_IOCTL_MAGIC, 6, __u32)
 #define F2FS_IOC_WRITE_CHECKPOINT      _IO(F2FS_IOCTL_MAGIC, 7)
-#define F2FS_IOC_DEFRAGMENT            _IO(F2FS_IOCTL_MAGIC, 8)
+#define F2FS_IOC_DEFRAGMENT            _IOWR(F2FS_IOCTL_MAGIC, 8,      \
+                                               struct f2fs_defragment)
 #define F2FS_IOC_MOVE_RANGE            _IOWR(F2FS_IOCTL_MAGIC, 9,      \
                                                struct f2fs_move_range)
+#define F2FS_IOC_FLUSH_DEVICE          _IOW(F2FS_IOCTL_MAGIC, 10,      \
+                                               struct f2fs_flush_device)
 
 #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
 #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
@@ -311,6 +339,11 @@ struct f2fs_move_range {
        u64 len;                /* size to move */
 };
 
+struct f2fs_flush_device {
+       u32 dev_num;            /* device number to flush */
+       u32 segments;           /* # of segments to flush */
+};
+
 /*
  * For INODE and NODE manager
  */
@@ -323,26 +356,24 @@ struct f2fs_dentry_ptr {
        int max;
 };
 
-static inline void make_dentry_ptr(struct inode *inode,
-               struct f2fs_dentry_ptr *d, void *src, int type)
+static inline void make_dentry_ptr_block(struct inode *inode,
+               struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t)
 {
        d->inode = inode;
+       d->max = NR_DENTRY_IN_BLOCK;
+       d->bitmap = &t->dentry_bitmap;
+       d->dentry = t->dentry;
+       d->filename = t->filename;
+}
 
-       if (type == 1) {
-               struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src;
-
-               d->max = NR_DENTRY_IN_BLOCK;
-               d->bitmap = &t->dentry_bitmap;
-               d->dentry = t->dentry;
-               d->filename = t->filename;
-       } else {
-               struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src;
-
-               d->max = NR_INLINE_DENTRY;
-               d->bitmap = &t->dentry_bitmap;
-               d->dentry = t->dentry;
-               d->filename = t->filename;
-       }
+static inline void make_dentry_ptr_inline(struct inode *inode,
+               struct f2fs_dentry_ptr *d, struct f2fs_inline_dentry *t)
+{
+       d->inode = inode;
+       d->max = NR_INLINE_DENTRY;
+       d->bitmap = &t->dentry_bitmap;
+       d->dentry = t->dentry;
+       d->filename = t->filename;
 }
 
 /*
@@ -374,16 +405,30 @@ enum {
 /* number of extent info in extent cache we try to shrink */
 #define EXTENT_CACHE_SHRINK_NUMBER     128
 
+struct rb_entry {               /* struct extent_node is cast to this */
+       struct rb_node rb_node;         /* rb node located in rb-tree */
+       unsigned int ofs;               /* start offset of the entry */
+       unsigned int len;               /* length of the entry */
+};
+
 struct extent_info {
        unsigned int fofs;              /* start offset in a file */
-       u32 blk;                        /* start block address of the extent */
        unsigned int len;               /* length of the extent */
+       u32 blk;                        /* start block address of the extent */
 };
 
 struct extent_node {
-       struct rb_node rb_node;         /* rb node located in rb-tree */
+       struct rb_node rb_node;
+       union {
+               struct {
+                       unsigned int fofs;
+                       unsigned int len;
+                       u32 blk;
+               };
+               struct extent_info ei;  /* extent info */
+
+       };
        struct list_head list;          /* node in global extent list of sbi */
-       struct extent_info ei;          /* extent info */
        struct extent_tree *et;         /* extent tree pointer */
 };
 
@@ -500,6 +545,24 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs,
        ei->len = len;
 }
 
+static inline bool __is_discard_mergeable(struct discard_info *back,
+                                               struct discard_info *front)
+{
+       return back->lstart + back->len == front->lstart;
+}
+
+static inline bool __is_discard_back_mergeable(struct discard_info *cur,
+                                               struct discard_info *back)
+{
+       return __is_discard_mergeable(back, cur);
+}
+
+static inline bool __is_discard_front_mergeable(struct discard_info *cur,
+                                               struct discard_info *front)
+{
+       return __is_discard_mergeable(cur, front);
+}
+
 static inline bool __is_extent_mergeable(struct extent_info *back,
                                                struct extent_info *front)
 {
@@ -562,7 +625,6 @@ struct f2fs_nm_info {
        unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE];
        unsigned char *nat_block_bitmap;
        unsigned short *free_nid_count; /* free nid count of NAT block */
-       spinlock_t free_nid_lock;       /* protect updating of nid count */
 
        /* for checkpoint */
        char *nat_bitmap;               /* NAT bitmap pointer */
@@ -641,7 +703,8 @@ struct flush_cmd {
 struct flush_cmd_control {
        struct task_struct *f2fs_issue_flush;   /* flush thread */
        wait_queue_head_t flush_wait_queue;     /* waiting queue for wake-up */
-       atomic_t submit_flush;                  /* # of issued flushes */
+       atomic_t issued_flush;                  /* # of issued flushes */
+       atomic_t issing_flush;                  /* # of issuing flushes */
        struct llist_head issue_list;           /* list for command issue */
        struct llist_node *dispatch_list;       /* list for command dispatch */
 };
@@ -672,6 +735,7 @@ struct f2fs_sm_info {
        unsigned int ipu_policy;        /* in-place-update policy */
        unsigned int min_ipu_util;      /* in-place-update threshold */
        unsigned int min_fsync_blocks;  /* threshold for fsync */
+       unsigned int min_hot_blocks;    /* threshold for hot block allocation */
 
        /* for flush command control */
        struct flush_cmd_control *fcc_info;
@@ -722,6 +786,7 @@ enum page_type {
        META_FLUSH,
        INMEM,          /* the below types are used by tracepoints only. */
        INMEM_DROP,
+       INMEM_INVALIDATE,
        INMEM_REVOKE,
        IPU,
        OPU,
@@ -737,9 +802,10 @@ struct f2fs_io_info {
        struct page *page;      /* page to be written */
        struct page *encrypted_page;    /* encrypted page */
        bool submitted;         /* indicate IO submission */
+       bool need_lock;         /* indicate we need to lock cp_rwsem */
 };
 
-#define is_read_io(rw) (rw == READ)
+#define is_read_io(rw) ((rw) == READ)
 struct f2fs_bio_info {
        struct f2fs_sb_info *sbi;       /* f2fs superblock */
        struct bio *bio;                /* bios to merge */
@@ -827,6 +893,7 @@ struct f2fs_sb_info {
        struct mutex cp_mutex;                  /* checkpoint procedure lock */
        struct rw_semaphore cp_rwsem;           /* blocking FS operations */
        struct rw_semaphore node_write;         /* locking node writes */
+       struct rw_semaphore node_change;        /* locking node change */
        wait_queue_head_t cp_wait;
        unsigned long last_time[MAX_TIME];      /* to store time in jiffies */
        long interval_time[MAX_TIME];           /* to store thresholds */
@@ -879,6 +946,9 @@ struct f2fs_sb_info {
        /* # of allocated blocks */
        struct percpu_counter alloc_valid_block_count;
 
+       /* writeback control */
+       atomic_t wb_sync_req;                   /* count # of WB_SYNC threads */
+
        /* valid inode count */
        struct percpu_counter total_valid_inode_count;
 
@@ -912,11 +982,12 @@ struct f2fs_sb_info {
        atomic_t inline_inode;                  /* # of inline_data inodes */
        atomic_t inline_dir;                    /* # of inline_dentry inodes */
        atomic_t aw_cnt;                        /* # of atomic writes */
+       atomic_t vw_cnt;                        /* # of volatile writes */
        atomic_t max_aw_cnt;                    /* max # of atomic writes */
+       atomic_t max_vw_cnt;                    /* max # of volatile writes */
        int bg_gc;                              /* background gc calls */
        unsigned int ndirty_inode[NR_INODE_TYPE];       /* # of dirty inodes */
 #endif
-       unsigned int last_victim[2];            /* last victim segment # */
        spinlock_t stat_lock;                   /* lock for stat operations */
 
        /* For sysfs suppport */
@@ -971,8 +1042,8 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
  * and the return value is in kbytes. s is of struct f2fs_sb_info.
  */
 #define BD_PART_WRITTEN(s)                                              \
-(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) -             \
-               s->sectors_written_start) >> 1)
+(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) -           \
+               (s)->sectors_written_start) >> 1)
 
 static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
 {
@@ -1193,7 +1264,7 @@ static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi,
 {
        bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
 
-       return (cpc) ? (cpc->reason == CP_UMOUNT) && set : set;
+       return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set;
 }
 
 static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
@@ -1229,7 +1300,7 @@ static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
 
 static inline bool __remain_node_summaries(int reason)
 {
-       return (reason == CP_UMOUNT || reason == CP_FASTBOOT);
+       return (reason & (CP_UMOUNT | CP_FASTBOOT));
 }
 
 static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi)
@@ -1707,6 +1778,7 @@ enum {
        FI_DO_DEFRAG,           /* indicate defragment is running */
        FI_DIRTY_FILE,          /* indicate regular/symlink has dirty pages */
        FI_NO_PREALLOC,         /* indicate skipped preallocated blocks */
+       FI_HOT_DATA,            /* indicate file is hot */
 };
 
 static inline void __mark_inode_dirty_flag(struct inode *inode,
@@ -1869,12 +1941,6 @@ static inline int f2fs_has_inline_data(struct inode *inode)
        return is_inode_flag_set(inode, FI_INLINE_DATA);
 }
 
-static inline void f2fs_clear_inline_inode(struct inode *inode)
-{
-       clear_inode_flag(inode, FI_INLINE_DATA);
-       clear_inode_flag(inode, FI_DATA_EXIST);
-}
-
 static inline int f2fs_exist_data(struct inode *inode)
 {
        return is_inode_flag_set(inode, FI_DATA_EXIST);
@@ -2029,12 +2095,6 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags)
        ((is_inode_flag_set(i, FI_ACL_MODE)) ? \
         (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
 
-/* get offset of first page in next direct node */
-#define PGOFS_OF_NEXT_DNODE(pgofs, inode)                              \
-       ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) :    \
-       (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) /    \
-       ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode))
-
 /*
  * file.c
  */
@@ -2096,8 +2156,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr,
                        struct page **page);
 void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
                        struct page *page, struct inode *inode);
-int update_dent_inode(struct inode *inode, struct inode *to,
-                       const struct qstr *name);
 void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
                        const struct qstr *name, f2fs_hash_t name_hash,
                        unsigned int bit_pos);
@@ -2185,6 +2243,7 @@ void destroy_node_manager_caches(void);
  */
 void register_inmem_page(struct inode *inode, struct page *page);
 void drop_inmem_pages(struct inode *inode);
+void drop_inmem_page(struct inode *inode, struct page *page);
 int commit_inmem_pages(struct inode *inode);
 void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need);
 void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi);
@@ -2194,7 +2253,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free);
 void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr);
 bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr);
 void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new);
-void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr);
+void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi);
 void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc);
 void release_discard_addrs(struct f2fs_sb_info *sbi);
 int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
@@ -2206,7 +2265,7 @@ void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr);
 void write_meta_page(struct f2fs_sb_info *sbi, struct page *page);
 void write_node_page(unsigned int nid, struct f2fs_io_info *fio);
 void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio);
-void rewrite_data_page(struct f2fs_io_info *fio);
+int rewrite_data_page(struct f2fs_io_info *fio);
 void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
                        block_t old_blkaddr, block_t new_blkaddr,
                        bool recover_curseg, bool recover_newaddr);
@@ -2311,7 +2370,8 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
 int start_gc_thread(struct f2fs_sb_info *sbi);
 void stop_gc_thread(struct f2fs_sb_info *sbi);
 block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode);
-int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background);
+int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background,
+                       unsigned int segno);
 void build_gc_manager(struct f2fs_sb_info *sbi);
 
 /*
@@ -2335,11 +2395,15 @@ struct f2fs_stat_info {
        int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta;
        int inmem_pages;
        unsigned int ndirty_dirs, ndirty_files, ndirty_all;
-       int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids;
+       int nats, dirty_nats, sits, dirty_sits;
+       int free_nids, avail_nids, alloc_nids;
        int total_count, utilization;
-       int bg_gc, nr_wb_cp_data, nr_wb_data, nr_flush, nr_discard;
+       int bg_gc, nr_wb_cp_data, nr_wb_data;
+       int nr_flushing, nr_flushed, nr_discarding, nr_discarded;
+       int nr_discard_cmd;
+       unsigned int undiscard_blks;
        int inline_xattr, inline_inode, inline_dir, append, update, orphans;
-       int aw_cnt, max_aw_cnt;
+       int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt;
        unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks;
        unsigned int bimodal, avg_vblocks;
        int util_free, util_valid, util_invalid;
@@ -2422,11 +2486,22 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
                if (cur > max)                                          \
                        atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \
        } while (0)
+#define stat_inc_volatile_write(inode)                                 \
+               (atomic_inc(&F2FS_I_SB(inode)->vw_cnt))
+#define stat_dec_volatile_write(inode)                                 \
+               (atomic_dec(&F2FS_I_SB(inode)->vw_cnt))
+#define stat_update_max_volatile_write(inode)                          \
+       do {                                                            \
+               int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt);       \
+               int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt);   \
+               if (cur > max)                                          \
+                       atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \
+       } while (0)
 #define stat_inc_seg_count(sbi, type, gc_type)                         \
        do {                                                            \
                struct f2fs_stat_info *si = F2FS_STAT(sbi);             \
-               (si)->tot_segs++;                                       \
-               if (type == SUM_TYPE_DATA) {                            \
+               si->tot_segs++;                                         \
+               if ((type) == SUM_TYPE_DATA) {                          \
                        si->data_segs++;                                \
                        si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \
                } else {                                                \
@@ -2436,14 +2511,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
        } while (0)
 
 #define stat_inc_tot_blk_count(si, blks)                               \
-       (si->tot_blks += (blks))
+       ((si)->tot_blks += (blks))
 
 #define stat_inc_data_blk_count(sbi, blks, gc_type)                    \
        do {                                                            \
                struct f2fs_stat_info *si = F2FS_STAT(sbi);             \
                stat_inc_tot_blk_count(si, blks);                       \
                si->data_blks += (blks);                                \
-               si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0;    \
+               si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0;  \
        } while (0)
 
 #define stat_inc_node_blk_count(sbi, blks, gc_type)                    \
@@ -2451,7 +2526,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
                struct f2fs_stat_info *si = F2FS_STAT(sbi);             \
                stat_inc_tot_blk_count(si, blks);                       \
                si->node_blks += (blks);                                \
-               si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0;    \
+               si->bg_node_blks += ((gc_type) == BG_GC) ? (blks) : 0;  \
        } while (0)
 
 int f2fs_build_stats(struct f2fs_sb_info *sbi);
@@ -2459,32 +2534,35 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi);
 int __init f2fs_create_root_stats(void);
 void f2fs_destroy_root_stats(void);
 #else
-#define stat_inc_cp_count(si)
-#define stat_inc_bg_cp_count(si)
-#define stat_inc_call_count(si)
-#define stat_inc_bggc_count(si)
-#define stat_inc_dirty_inode(sbi, type)
-#define stat_dec_dirty_inode(sbi, type)
-#define stat_inc_total_hit(sb)
-#define stat_inc_rbtree_node_hit(sb)
-#define stat_inc_largest_node_hit(sbi)
-#define stat_inc_cached_node_hit(sbi)
-#define stat_inc_inline_xattr(inode)
-#define stat_dec_inline_xattr(inode)
-#define stat_inc_inline_inode(inode)
-#define stat_dec_inline_inode(inode)
-#define stat_inc_inline_dir(inode)
-#define stat_dec_inline_dir(inode)
-#define stat_inc_atomic_write(inode)
-#define stat_dec_atomic_write(inode)
-#define stat_update_max_atomic_write(inode)
-#define stat_inc_seg_type(sbi, curseg)
-#define stat_inc_block_count(sbi, curseg)
-#define stat_inc_inplace_blocks(sbi)
-#define stat_inc_seg_count(sbi, type, gc_type)
-#define stat_inc_tot_blk_count(si, blks)
-#define stat_inc_data_blk_count(sbi, blks, gc_type)
-#define stat_inc_node_blk_count(sbi, blks, gc_type)
+#define stat_inc_cp_count(si)                          do { } while (0)
+#define stat_inc_bg_cp_count(si)                       do { } while (0)
+#define stat_inc_call_count(si)                                do { } while (0)
+#define stat_inc_bggc_count(si)                                do { } while (0)
+#define stat_inc_dirty_inode(sbi, type)                        do { } while (0)
+#define stat_dec_dirty_inode(sbi, type)                        do { } while (0)
+#define stat_inc_total_hit(sb)                         do { } while (0)
+#define stat_inc_rbtree_node_hit(sb)                   do { } while (0)
+#define stat_inc_largest_node_hit(sbi)                 do { } while (0)
+#define stat_inc_cached_node_hit(sbi)                  do { } while (0)
+#define stat_inc_inline_xattr(inode)                   do { } while (0)
+#define stat_dec_inline_xattr(inode)                   do { } while (0)
+#define stat_inc_inline_inode(inode)                   do { } while (0)
+#define stat_dec_inline_inode(inode)                   do { } while (0)
+#define stat_inc_inline_dir(inode)                     do { } while (0)
+#define stat_dec_inline_dir(inode)                     do { } while (0)
+#define stat_inc_atomic_write(inode)                   do { } while (0)
+#define stat_dec_atomic_write(inode)                   do { } while (0)
+#define stat_update_max_atomic_write(inode)            do { } while (0)
+#define stat_inc_volatile_write(inode)                 do { } while (0)
+#define stat_dec_volatile_write(inode)                 do { } while (0)
+#define stat_update_max_volatile_write(inode)          do { } while (0)
+#define stat_inc_seg_type(sbi, curseg)                 do { } while (0)
+#define stat_inc_block_count(sbi, curseg)              do { } while (0)
+#define stat_inc_inplace_blocks(sbi)                   do { } while (0)
+#define stat_inc_seg_count(sbi, type, gc_type)         do { } while (0)
+#define stat_inc_tot_blk_count(si, blks)               do { } while (0)
+#define stat_inc_data_blk_count(sbi, blks, gc_type)    do { } while (0)
+#define stat_inc_node_blk_count(sbi, blks, gc_type)    do { } while (0)
 
 static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
 static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
@@ -2510,7 +2588,7 @@ extern struct kmem_cache *inode_entry_slab;
 bool f2fs_may_inline_data(struct inode *inode);
 bool f2fs_may_inline_dentry(struct inode *inode);
 void read_inline_data(struct page *page, struct page *ipage);
-bool truncate_inline_inode(struct page *ipage, u64 from);
+void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from);
 int f2fs_read_inline_data(struct inode *inode, struct page *page);
 int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page);
 int f2fs_convert_inline_inode(struct inode *inode);
@@ -2545,6 +2623,18 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi);
 /*
  * extent_cache.c
  */
+struct rb_entry *__lookup_rb_tree(struct rb_root *root,
+                               struct rb_entry *cached_re, unsigned int ofs);
+struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
+                               struct rb_root *root, struct rb_node **parent,
+                               unsigned int ofs);
+struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root,
+               struct rb_entry *cached_re, unsigned int ofs,
+               struct rb_entry **prev_entry, struct rb_entry **next_entry,
+               struct rb_node ***insert_p, struct rb_node **insert_parent,
+               bool force);
+bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi,
+                                               struct rb_root *root);
 unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink);
 bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext);
 void f2fs_drop_extent_tree(struct inode *inode);
index 5f7317875a6726b9ea968f81cbf401b2c8966f8f..abb0403d341484fa96e8cfc4634d1cf504b653c2 100644 (file)
@@ -116,11 +116,6 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
        if (!dentry)
                return 0;
 
-       if (update_dent_inode(inode, inode, &dentry->d_name)) {
-               dput(dentry);
-               return 0;
-       }
-
        *pino = parent_ino(dentry);
        dput(dentry);
        return 1;
@@ -528,7 +523,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
 
        page = get_lock_data_page(inode, index, true);
        if (IS_ERR(page))
-               return 0;
+               return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page);
 truncate_out:
        f2fs_wait_on_page_writeback(page, DATA, true);
        zero_user(page, offset, PAGE_SIZE - offset);
@@ -566,9 +561,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
        }
 
        if (f2fs_has_inline_data(inode)) {
-               truncate_inline_inode(ipage, from);
-               if (from == 0)
-                       clear_inode_flag(inode, FI_DATA_EXIST);
+               truncate_inline_inode(inode, ipage, from);
                f2fs_put_page(ipage, 1);
                truncate_page = true;
                goto out;
@@ -617,6 +610,12 @@ int f2fs_truncate(struct inode *inode)
 
        trace_f2fs_truncate(inode);
 
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+       if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) {
+               f2fs_show_injection_info(FAULT_TRUNCATE);
+               return -EIO;
+       }
+#endif
        /* we should check inline_data size */
        if (!f2fs_may_inline_data(inode)) {
                err = f2fs_convert_inline_inode(inode);
@@ -1188,8 +1187,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
                if (ret)
                        return ret;
 
-               if (offset + len > new_size)
-                       new_size = offset + len;
                new_size = max_t(loff_t, new_size, offset + len);
        } else {
                if (off_start) {
@@ -1257,8 +1254,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
        int ret = 0;
 
        new_size = i_size_read(inode) + len;
-       if (new_size > inode->i_sb->s_maxbytes)
-               return -EFBIG;
+       ret = inode_newsize_ok(inode, new_size);
+       if (ret)
+               return ret;
 
        if (offset >= i_size_read(inode))
                return -EINVAL;
@@ -1428,6 +1426,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp)
                drop_inmem_pages(inode);
        if (f2fs_is_volatile_file(inode)) {
                clear_inode_flag(inode, FI_VOLATILE_FILE);
+               stat_dec_volatile_write(inode);
                set_inode_flag(inode, FI_DROP_CACHE);
                filemap_fdatawrite(inode->i_mapping);
                clear_inode_flag(inode, FI_DROP_CACHE);
@@ -1474,10 +1473,10 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
        if (ret)
                return ret;
 
-       flags = f2fs_mask_flags(inode->i_mode, flags);
-
        inode_lock(inode);
 
+       flags = f2fs_mask_flags(inode->i_mode, flags);
+
        oldflags = fi->i_flags;
 
        if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
@@ -1491,10 +1490,11 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
        flags = flags & FS_FL_USER_MODIFIABLE;
        flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
        fi->i_flags = flags;
-       inode_unlock(inode);
 
        inode->i_ctime = current_time(inode);
        f2fs_set_inode_flags(inode);
+
+       inode_unlock(inode);
 out:
        mnt_drop_write_file(filp);
        return ret;
@@ -1515,6 +1515,9 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
        if (!inode_owner_or_capable(inode))
                return -EACCES;
 
+       if (!S_ISREG(inode->i_mode))
+               return -EINVAL;
+
        ret = mnt_want_write_file(filp);
        if (ret)
                return ret;
@@ -1529,20 +1532,25 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
                goto out;
 
        set_inode_flag(inode, FI_ATOMIC_FILE);
+       set_inode_flag(inode, FI_HOT_DATA);
        f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 
        if (!get_dirty_pages(inode))
-               goto out;
+               goto inc_stat;
 
        f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
                "Unexpected flush for atomic writes: ino=%lu, npages=%u",
                                        inode->i_ino, get_dirty_pages(inode));
        ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
-       if (ret)
+       if (ret) {
                clear_inode_flag(inode, FI_ATOMIC_FILE);
-out:
+               goto out;
+       }
+
+inc_stat:
        stat_inc_atomic_write(inode);
        stat_update_max_atomic_write(inode);
+out:
        inode_unlock(inode);
        mnt_drop_write_file(filp);
        return ret;
@@ -1592,6 +1600,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
        if (!inode_owner_or_capable(inode))
                return -EACCES;
 
+       if (!S_ISREG(inode->i_mode))
+               return -EINVAL;
+
        ret = mnt_want_write_file(filp);
        if (ret)
                return ret;
@@ -1605,6 +1616,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
        if (ret)
                goto out;
 
+       stat_inc_volatile_write(inode);
+       stat_update_max_volatile_write(inode);
+
        set_inode_flag(inode, FI_VOLATILE_FILE);
        f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 out:
@@ -1660,6 +1674,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
                drop_inmem_pages(inode);
        if (f2fs_is_volatile_file(inode)) {
                clear_inode_flag(inode, FI_VOLATILE_FILE);
+               stat_dec_volatile_write(inode);
                ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
        }
 
@@ -1841,7 +1856,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
                mutex_lock(&sbi->gc_mutex);
        }
 
-       ret = f2fs_gc(sbi, sync, true);
+       ret = f2fs_gc(sbi, sync, true, NULL_SEGNO);
 out:
        mnt_drop_write_file(filp);
        return ret;
@@ -1879,13 +1894,12 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
        pgoff_t pg_start, pg_end;
        unsigned int blk_per_seg = sbi->blocks_per_seg;
        unsigned int total = 0, sec_num;
-       unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg;
        block_t blk_end = 0;
        bool fragmented = false;
        int err;
 
        /* if in-place-update policy is enabled, don't waste time here */
-       if (need_inplace_update(inode))
+       if (need_inplace_update_policy(inode, NULL))
                return -EINVAL;
 
        pg_start = range->start >> PAGE_SHIFT;
@@ -1943,7 +1957,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
        map.m_lblk = pg_start;
        map.m_len = pg_end - pg_start;
 
-       sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec;
+       sec_num = (map.m_len + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi);
 
        /*
         * make sure there are enough free section for LFS allocation, this can
@@ -2020,42 +2034,40 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       if (!S_ISREG(inode->i_mode))
+       if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode))
                return -EINVAL;
 
-       err = mnt_want_write_file(filp);
-       if (err)
-               return err;
-
-       if (f2fs_readonly(sbi->sb)) {
-               err = -EROFS;
-               goto out;
-       }
+       if (f2fs_readonly(sbi->sb))
+               return -EROFS;
 
        if (copy_from_user(&range, (struct f2fs_defragment __user *)arg,
-                                                       sizeof(range))) {
-               err = -EFAULT;
-               goto out;
-       }
+                                                       sizeof(range)))
+               return -EFAULT;
 
        /* verify alignment of offset & size */
-       if (range.start & (F2FS_BLKSIZE - 1) ||
-               range.len & (F2FS_BLKSIZE - 1)) {
-               err = -EINVAL;
-               goto out;
-       }
+       if (range.start & (F2FS_BLKSIZE - 1) || range.len & (F2FS_BLKSIZE - 1))
+               return -EINVAL;
+
+       if (unlikely((range.start + range.len) >> PAGE_SHIFT >
+                                       sbi->max_file_blocks))
+               return -EINVAL;
+
+       err = mnt_want_write_file(filp);
+       if (err)
+               return err;
 
        err = f2fs_defragment_range(sbi, filp, &range);
+       mnt_drop_write_file(filp);
+
        f2fs_update_time(sbi, REQ_TIME);
        if (err < 0)
-               goto out;
+               return err;
 
        if (copy_to_user((struct f2fs_defragment __user *)arg, &range,
                                                        sizeof(range)))
-               err = -EFAULT;
-out:
-       mnt_drop_write_file(filp);
-       return err;
+               return -EFAULT;
+
+       return 0;
 }
 
 static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
@@ -2189,6 +2201,8 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg)
                                        range.pos_out, range.len);
 
        mnt_drop_write_file(filp);
+       if (err)
+               goto err_out;
 
        if (copy_to_user((struct f2fs_move_range __user *)arg,
                                                &range, sizeof(range)))
@@ -2198,6 +2212,69 @@ err_out:
        return err;
 }
 
+/* F2FS_IOC_FLUSH_DEVICE handler: run f2fs_gc() on segments of one device */
+static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
+{
+       struct inode *inode = file_inode(filp);
+       struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+       struct sit_info *sm = SIT_I(sbi);
+       unsigned int start_segno = 0, end_segno = 0;
+       unsigned int dev_start_segno = 0, dev_end_segno = 0;
+       struct f2fs_flush_device range;
+       int ret;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (f2fs_readonly(sbi->sb))
+               return -EROFS;
+
+       if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg,
+                                                       sizeof(range)))
+               return -EFAULT;
+       /* need multi-device, dev_num < s_ndevs - 1, one segment per section */
+       if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num ||
+                       sbi->segs_per_sec != 1) {
+               f2fs_msg(sbi->sb, KERN_WARNING,
+                       "Can't flush %u in %d for segs_per_sec %u != 1",
+                       range.dev_num, sbi->s_ndevs, sbi->segs_per_sec);
+               return -EINVAL;
+       }
+
+       ret = mnt_want_write_file(filp);
+       if (ret)
+               return ret;
+       /* segment span of the device; device 0 keeps the default start of 0 */
+       if (range.dev_num != 0)
+               dev_start_segno = GET_SEGNO(sbi, FDEV(range.dev_num).start_blk);
+       dev_end_segno = GET_SEGNO(sbi, FDEV(range.dev_num).end_blk);
+       /* start at last_victim[FLUSH_DEVICE] when it lies inside that span */
+       start_segno = sm->last_victim[FLUSH_DEVICE];
+       if (start_segno < dev_start_segno || start_segno >= dev_end_segno)
+               start_segno = dev_start_segno;
+       end_segno = min(start_segno + range.segments, dev_end_segno);
+       /* gc_mutex is not unlocked here; presumably f2fs_gc() drops it */
+       while (start_segno < end_segno) {
+               if (!mutex_trylock(&sbi->gc_mutex)) {
+                       ret = -EBUSY;
+                       goto out;
+               }
+               sm->last_victim[GC_CB] = end_segno + 1;
+               sm->last_victim[GC_GREEDY] = end_segno + 1;
+               sm->last_victim[ALLOC_NEXT] = end_segno + 1;
+               ret = f2fs_gc(sbi, true, true, start_segno);
+               if (ret == -EAGAIN)
+                       ret = 0;
+               else if (ret < 0)
+                       break;
+               start_segno++;
+       }
+out:
+       mnt_drop_write_file(filp);
+       return ret;
+}
+
+
 long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
        switch (cmd) {
@@ -2235,6 +2312,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                return f2fs_ioc_defragment(filp, arg);
        case F2FS_IOC_MOVE_RANGE:
                return f2fs_ioc_move_range(filp, arg);
+       case F2FS_IOC_FLUSH_DEVICE:
+               return f2fs_ioc_flush_device(filp, arg);
        default:
                return -ENOTTY;
        }
@@ -2302,8 +2381,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        case F2FS_IOC_GARBAGE_COLLECT:
        case F2FS_IOC_WRITE_CHECKPOINT:
        case F2FS_IOC_DEFRAGMENT:
-               break;
        case F2FS_IOC_MOVE_RANGE:
+       case F2FS_IOC_FLUSH_DEVICE:
                break;
        default:
                return -ENOIOCTLCMD;
index 418fd988164677623cf0ad305d34f7855fb608dd..026522107ca3f843dee496b0f1ab5359c10eeb6a 100644 (file)
@@ -84,7 +84,7 @@ static int gc_thread_func(void *data)
                stat_inc_bggc_count(sbi);
 
                /* if return value is not zero, no victim was selected */
-               if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true))
+               if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO))
                        wait_ms = gc_th->no_gc_sleep_time;
 
                trace_f2fs_background_gc(sbi->sb, wait_ms,
@@ -172,7 +172,11 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
        if (gc_type != FG_GC && p->max_search > sbi->max_victim_search)
                p->max_search = sbi->max_victim_search;
 
-       p->offset = sbi->last_victim[p->gc_mode];
+       /* let's select beginning hot/small space first */
+       if (type == CURSEG_HOT_DATA || IS_NODESEG(type))
+               p->offset = 0;
+       else
+               p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
 }
 
 static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
@@ -182,7 +186,7 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
        if (p->alloc_mode == SSR)
                return sbi->blocks_per_seg;
        if (p->gc_mode == GC_GREEDY)
-               return sbi->blocks_per_seg * p->ofs_unit;
+               return 2 * sbi->blocks_per_seg * p->ofs_unit;
        else if (p->gc_mode == GC_CB)
                return UINT_MAX;
        else /* No other gc_mode */
@@ -207,7 +211,7 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
                        continue;
 
                clear_bit(secno, dirty_i->victim_secmap);
-               return secno * sbi->segs_per_sec;
+               return GET_SEG_FROM_SEC(sbi, secno);
        }
        return NULL_SEGNO;
 }
@@ -215,8 +219,8 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
 static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
 {
        struct sit_info *sit_i = SIT_I(sbi);
-       unsigned int secno = GET_SECNO(sbi, segno);
-       unsigned int start = secno * sbi->segs_per_sec;
+       unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+       unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
        unsigned long long mtime = 0;
        unsigned int vblocks;
        unsigned char age = 0;
@@ -225,7 +229,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
 
        for (i = 0; i < sbi->segs_per_sec; i++)
                mtime += get_seg_entry(sbi, start + i)->mtime;
-       vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+       vblocks = get_valid_blocks(sbi, segno, true);
 
        mtime = div_u64(mtime, sbi->segs_per_sec);
        vblocks = div_u64(vblocks, sbi->segs_per_sec);
@@ -248,7 +252,7 @@ static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi,
                                                unsigned int segno)
 {
        unsigned int valid_blocks =
-                       get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+                       get_valid_blocks(sbi, segno, true);
 
        return IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
                                valid_blocks * 2 : valid_blocks;
@@ -291,6 +295,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
                unsigned int *result, int gc_type, int type, char alloc_mode)
 {
        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+       struct sit_info *sm = SIT_I(sbi);
        struct victim_sel_policy p;
        unsigned int secno, last_victim;
        unsigned int last_segment = MAIN_SEGS(sbi);
@@ -304,10 +309,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
        p.min_segno = NULL_SEGNO;
        p.min_cost = get_max_cost(sbi, &p);
 
+       if (*result != NULL_SEGNO) {
+               if (IS_DATASEG(get_seg_entry(sbi, *result)->type) &&
+                       get_valid_blocks(sbi, *result, false) &&
+                       !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))
+                       p.min_segno = *result;
+               goto out;
+       }
+
        if (p.max_search == 0)
                goto out;
 
-       last_victim = sbi->last_victim[p.gc_mode];
+       last_victim = sm->last_victim[p.gc_mode];
        if (p.alloc_mode == LFS && gc_type == FG_GC) {
                p.min_segno = check_bg_victims(sbi);
                if (p.min_segno != NULL_SEGNO)
@@ -320,9 +333,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
 
                segno = find_next_bit(p.dirty_segmap, last_segment, p.offset);
                if (segno >= last_segment) {
-                       if (sbi->last_victim[p.gc_mode]) {
-                               last_segment = sbi->last_victim[p.gc_mode];
-                               sbi->last_victim[p.gc_mode] = 0;
+                       if (sm->last_victim[p.gc_mode]) {
+                               last_segment =
+                                       sm->last_victim[p.gc_mode];
+                               sm->last_victim[p.gc_mode] = 0;
                                p.offset = 0;
                                continue;
                        }
@@ -339,7 +353,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
                        nsearched++;
                }
 
-               secno = GET_SECNO(sbi, segno);
+               secno = GET_SEC_FROM_SEG(sbi, segno);
 
                if (sec_usage_check(sbi, secno))
                        goto next;
@@ -357,17 +371,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
                }
 next:
                if (nsearched >= p.max_search) {
-                       if (!sbi->last_victim[p.gc_mode] && segno <= last_victim)
-                               sbi->last_victim[p.gc_mode] = last_victim + 1;
+                       if (!sm->last_victim[p.gc_mode] && segno <= last_victim)
+                               sm->last_victim[p.gc_mode] = last_victim + 1;
                        else
-                               sbi->last_victim[p.gc_mode] = segno + 1;
+                               sm->last_victim[p.gc_mode] = segno + 1;
+                       sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi);
                        break;
                }
        }
        if (p.min_segno != NULL_SEGNO) {
 got_it:
                if (p.alloc_mode == LFS) {
-                       secno = GET_SECNO(sbi, p.min_segno);
+                       secno = GET_SEC_FROM_SEG(sbi, p.min_segno);
                        if (gc_type == FG_GC)
                                sbi->cur_victim_sec = secno;
                        else
@@ -550,8 +565,10 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
        get_node_info(sbi, nid, dni);
 
        if (sum->version != dni->version) {
-               f2fs_put_page(node_page, 1);
-               return false;
+               f2fs_msg(sbi->sb, KERN_WARNING,
+                               "%s: valid data with mismatched node version.",
+                               __func__);
+               set_sbi_flag(sbi, SBI_NEED_FSCK);
        }
 
        *nofs = ofs_of_node(node_page);
@@ -697,8 +714,10 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
                        .type = DATA,
                        .op = REQ_OP_WRITE,
                        .op_flags = REQ_SYNC,
+                       .old_blkaddr = NULL_ADDR,
                        .page = page,
                        .encrypted_page = NULL,
+                       .need_lock = true,
                };
                bool is_dirty = PageDirty(page);
                int err;
@@ -890,7 +909,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
                                        GET_SUM_BLOCK(sbi, segno));
                f2fs_put_page(sum_page, 0);
 
-               if (get_valid_blocks(sbi, segno, 1) == 0 ||
+               if (get_valid_blocks(sbi, segno, false) == 0 ||
                                !PageUptodate(sum_page) ||
                                unlikely(f2fs_cp_error(sbi)))
                        goto next;
@@ -905,7 +924,6 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
                 *   - mutex_lock(sentry_lock)     - change_curseg()
                 *                                  - lock_page(sum_page)
                 */
-
                if (type == SUM_TYPE_NODE)
                        gc_node_segment(sbi, sum->entries, segno, gc_type);
                else
@@ -924,7 +942,7 @@ next:
        blk_finish_plug(&plug);
 
        if (gc_type == FG_GC &&
-               get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0)
+               get_valid_blocks(sbi, start_segno, true) == 0)
                sec_freed = 1;
 
        stat_inc_call_count(sbi->stat_info);
@@ -932,13 +950,14 @@ next:
        return sec_freed;
 }
 
-int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background)
+int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
+                       bool background, unsigned int segno)
 {
-       unsigned int segno;
        int gc_type = sync ? FG_GC : BG_GC;
        int sec_freed = 0;
        int ret = -EINVAL;
        struct cp_control cpc;
+       unsigned int init_segno = segno;
        struct gc_inode_list gc_list = {
                .ilist = LIST_HEAD_INIT(gc_list.ilist),
                .iroot = RADIX_TREE_INIT(GFP_NOFS),
@@ -959,9 +978,11 @@ gc_more:
                 * threshold, we can make them free by checkpoint. Then, we
                 * secure free segments which doesn't need fggc any more.
                 */
-               ret = write_checkpoint(sbi, &cpc);
-               if (ret)
-                       goto stop;
+               if (prefree_segments(sbi)) {
+                       ret = write_checkpoint(sbi, &cpc);
+                       if (ret)
+                               goto stop;
+               }
                if (has_not_enough_free_secs(sbi, 0, 0))
                        gc_type = FG_GC;
        }
@@ -981,13 +1002,17 @@ gc_more:
                sbi->cur_victim_sec = NULL_SEGNO;
 
        if (!sync) {
-               if (has_not_enough_free_secs(sbi, sec_freed, 0))
+               if (has_not_enough_free_secs(sbi, sec_freed, 0)) {
+                       segno = NULL_SEGNO;
                        goto gc_more;
+               }
 
                if (gc_type == FG_GC)
                        ret = write_checkpoint(sbi, &cpc);
        }
 stop:
+       SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
+       SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno;
        mutex_unlock(&sbi->gc_mutex);
 
        put_gc_inode(&gc_list);
@@ -999,7 +1024,7 @@ stop:
 
 void build_gc_manager(struct f2fs_sb_info *sbi)
 {
-       u64 main_count, resv_count, ovp_count, blocks_per_sec;
+       u64 main_count, resv_count, ovp_count;
 
        DIRTY_I(sbi)->v_ops = &default_v_ops;
 
@@ -1007,8 +1032,12 @@ void build_gc_manager(struct f2fs_sb_info *sbi)
        main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg;
        resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg;
        ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg;
-       blocks_per_sec = sbi->blocks_per_seg * sbi->segs_per_sec;
 
-       sbi->fggc_threshold = div64_u64((main_count - ovp_count) * blocks_per_sec,
-                                       (main_count - resv_count));
+       sbi->fggc_threshold = div64_u64((main_count - ovp_count) *
+                               BLKS_PER_SEC(sbi), (main_count - resv_count));
+
+       /* give warm/cold data area from slower device */
+       if (sbi->s_ndevs && sbi->segs_per_sec == 1)
+               SIT_I(sbi)->last_victim[ALLOC_NEXT] =
+                               GET_SEGNO(sbi, FDEV(0).end_blk) + 1;
 }
index fa729ff6b2f924dd34630fd40a4c7618220bba63..e4c527c4e7d0b382907b0c8fbf95d0f15f4bbc09 100644 (file)
@@ -63,19 +63,21 @@ void read_inline_data(struct page *page, struct page *ipage)
                SetPageUptodate(page);
 }
 
-bool truncate_inline_inode(struct page *ipage, u64 from)
+void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from)
 {
        void *addr;
 
        if (from >= MAX_INLINE_DATA)
-               return false;
+               return;
 
        addr = inline_data_addr(ipage);
 
        f2fs_wait_on_page_writeback(ipage, NODE, true);
        memset(addr + from, 0, MAX_INLINE_DATA - from);
        set_page_dirty(ipage);
-       return true;
+
+       if (from == 0)
+               clear_inode_flag(inode, FI_DATA_EXIST);
 }
 
 int f2fs_read_inline_data(struct inode *inode, struct page *page)
@@ -135,6 +137,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
        /* write data page to try to make data consistent */
        set_page_writeback(page);
        fio.old_blkaddr = dn->data_blkaddr;
+       set_inode_flag(dn->inode, FI_HOT_DATA);
        write_data_page(dn, &fio);
        f2fs_wait_on_page_writeback(page, DATA, true);
        if (dirty) {
@@ -146,11 +149,11 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
        set_inode_flag(dn->inode, FI_APPEND_WRITE);
 
        /* clear inline data and flag after data writeback */
-       truncate_inline_inode(dn->inode_page, 0);
+       truncate_inline_inode(dn->inode, dn->inode_page, 0);
        clear_inline_node(dn->inode_page);
 clear_out:
        stat_dec_inline_inode(dn->inode);
-       f2fs_clear_inline_inode(dn->inode);
+       clear_inode_flag(dn->inode, FI_INLINE_DATA);
        f2fs_put_dnode(dn);
        return 0;
 }
@@ -267,9 +270,8 @@ process_inline:
        if (f2fs_has_inline_data(inode)) {
                ipage = get_node_page(sbi, inode->i_ino);
                f2fs_bug_on(sbi, IS_ERR(ipage));
-               if (!truncate_inline_inode(ipage, 0))
-                       return false;
-               f2fs_clear_inline_inode(inode);
+               truncate_inline_inode(inode, ipage, 0);
+               clear_inode_flag(inode, FI_INLINE_DATA);
                f2fs_put_page(ipage, 1);
        } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
                if (truncate_blocks(inode, 0, false))
@@ -300,7 +302,7 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
 
        inline_dentry = inline_data_addr(ipage);
 
-       make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2);
+       make_dentry_ptr_inline(NULL, &d, inline_dentry);
        de = find_target_dentry(fname, namehash, NULL, &d);
        unlock_page(ipage);
        if (de)
@@ -319,7 +321,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent,
 
        dentry_blk = inline_data_addr(ipage);
 
-       make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2);
+       make_dentry_ptr_inline(NULL, &d, dentry_blk);
        do_make_empty_dir(inode, parent, &d);
 
        set_page_dirty(ipage);
@@ -380,7 +382,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
        set_page_dirty(page);
 
        /* clear inline dir and flag after data writeback */
-       truncate_inline_inode(ipage, 0);
+       truncate_inline_inode(dir, ipage, 0);
 
        stat_dec_inline_dir(dir);
        clear_inode_flag(dir, FI_INLINE_DENTRY);
@@ -400,7 +402,7 @@ static int f2fs_add_inline_entries(struct inode *dir,
        unsigned long bit_pos = 0;
        int err = 0;
 
-       make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2);
+       make_dentry_ptr_inline(NULL, &d, inline_dentry);
 
        while (bit_pos < d.max) {
                struct f2fs_dir_entry *de;
@@ -455,7 +457,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
        }
 
        memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA);
-       truncate_inline_inode(ipage, 0);
+       truncate_inline_inode(dir, ipage, 0);
 
        unlock_page(ipage);
 
@@ -527,14 +529,12 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
                        err = PTR_ERR(page);
                        goto fail;
                }
-               if (f2fs_encrypted_inode(dir))
-                       file_set_enc_name(inode);
        }
 
        f2fs_wait_on_page_writeback(ipage, NODE, true);
 
        name_hash = f2fs_dentry_hash(new_name, NULL);
-       make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2);
+       make_dentry_ptr_inline(NULL, &d, dentry_blk);
        f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos);
 
        set_page_dirty(ipage);
@@ -623,7 +623,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
 
        inline_dentry = inline_data_addr(ipage);
 
-       make_dentry_ptr(inode, &d, (void *)inline_dentry, 2);
+       make_dentry_ptr_inline(inode, &d, inline_dentry);
 
        err = f2fs_fill_dentries(ctx, &d, 0, fstr);
        if (!err)
index 24bb8213d974b710b43f286b1089ccff7946a50e..518f49643092582ae256c425616e942ed2392c40 100644 (file)
@@ -316,7 +316,6 @@ retry:
                } else if (err != -ENOENT) {
                        f2fs_stop_checkpoint(sbi, false);
                }
-               f2fs_inode_synced(inode);
                return 0;
        }
        ret = update_inode(inode, node_page);
@@ -339,7 +338,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
         * We need to balance fs here to prevent from producing dirty node pages
         * during the urgent cleaning time when runing out of free sections.
         */
-       if (update_inode_page(inode) && wbc && wbc->nr_to_write)
+       update_inode_page(inode);
+       if (wbc && wbc->nr_to_write)
                f2fs_balance_fs(sbi, true);
        return 0;
 }
@@ -372,13 +372,6 @@ void f2fs_evict_inode(struct inode *inode)
        if (inode->i_nlink || is_bad_inode(inode))
                goto no_delete;
 
-#ifdef CONFIG_F2FS_FAULT_INJECTION
-       if (time_to_inject(sbi, FAULT_EVICT_INODE)) {
-               f2fs_show_injection_info(FAULT_EVICT_INODE);
-               goto no_delete;
-       }
-#endif
-
        remove_ino_entry(sbi, inode->i_ino, APPEND_INO);
        remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
 
@@ -389,6 +382,12 @@ retry:
        if (F2FS_HAS_BLOCKS(inode))
                err = f2fs_truncate(inode);
 
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+       if (time_to_inject(sbi, FAULT_EVICT_INODE)) {
+               f2fs_show_injection_info(FAULT_EVICT_INODE);
+               err = -EIO;
+       }
+#endif
        if (!err) {
                f2fs_lock_op(sbi);
                err = remove_inode_page(inode);
@@ -411,7 +410,10 @@ no_delete:
        stat_dec_inline_dir(inode);
        stat_dec_inline_inode(inode);
 
-       invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
+       /* ino == 0 if f2fs_new_inode() failed */
+       if (inode->i_ino)
+               invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino,
+                                                       inode->i_ino);
        if (xnid)
                invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
        if (inode->i_nlink) {
@@ -448,6 +450,7 @@ void handle_failed_inode(struct inode *inode)
         * in a panic when flushing dirty inodes in gdirty_list.
         */
        update_inode_page(inode);
+       f2fs_inode_synced(inode);
 
        /* don't make bad inode, since it becomes a regular file. */
        unlock_new_inode(inode);
index 9a5b9fa553181959174dc18f99efc764b558ade4..c31b40e5f9cf6dce167f3c24381df1e31c63acf2 100644 (file)
@@ -148,8 +148,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
        inode->i_mapping->a_ops = &f2fs_dblock_aops;
        ino = inode->i_ino;
 
-       f2fs_balance_fs(sbi, true);
-
        f2fs_lock_op(sbi);
        err = f2fs_add_link(dentry, inode);
        if (err)
@@ -163,6 +161,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 
        if (IS_DIRSYNC(dir))
                f2fs_sync_fs(sbi->sb, 1);
+
+       f2fs_balance_fs(sbi, true);
        return 0;
 out:
        handle_failed_inode(inode);
@@ -424,8 +424,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
        inode_nohighmem(inode);
        inode->i_mapping->a_ops = &f2fs_dblock_aops;
 
-       f2fs_balance_fs(sbi, true);
-
        f2fs_lock_op(sbi);
        err = f2fs_add_link(dentry, inode);
        if (err)
@@ -488,6 +486,8 @@ err_out:
        }
 
        kfree(sd);
+
+       f2fs_balance_fs(sbi, true);
        return err;
 out:
        handle_failed_inode(inode);
@@ -509,8 +509,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
        inode->i_mapping->a_ops = &f2fs_dblock_aops;
        mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
 
-       f2fs_balance_fs(sbi, true);
-
        set_inode_flag(inode, FI_INC_LINK);
        f2fs_lock_op(sbi);
        err = f2fs_add_link(dentry, inode);
@@ -525,6 +523,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 
        if (IS_DIRSYNC(dir))
                f2fs_sync_fs(sbi->sb, 1);
+
+       f2fs_balance_fs(sbi, true);
        return 0;
 
 out_fail:
@@ -555,8 +555,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
        init_special_inode(inode, inode->i_mode, rdev);
        inode->i_op = &f2fs_special_inode_operations;
 
-       f2fs_balance_fs(sbi, true);
-
        f2fs_lock_op(sbi);
        err = f2fs_add_link(dentry, inode);
        if (err)
@@ -570,6 +568,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
 
        if (IS_DIRSYNC(dir))
                f2fs_sync_fs(sbi->sb, 1);
+
+       f2fs_balance_fs(sbi, true);
        return 0;
 out:
        handle_failed_inode(inode);
@@ -596,8 +596,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
                inode->i_mapping->a_ops = &f2fs_dblock_aops;
        }
 
-       f2fs_balance_fs(sbi, true);
-
        f2fs_lock_op(sbi);
        err = acquire_orphan_inode(sbi);
        if (err)
@@ -623,6 +621,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
        /* link_count was changed by d_tmpfile as well. */
        f2fs_unlock_op(sbi);
        unlock_new_inode(inode);
+
+       f2fs_balance_fs(sbi, true);
        return 0;
 
 release_out:
@@ -721,13 +721,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
                if (err)
                        goto put_out_dir;
 
-               err = update_dent_inode(old_inode, new_inode,
-                                               &new_dentry->d_name);
-               if (err) {
-                       release_orphan_inode(sbi);
-                       goto put_out_dir;
-               }
-
                f2fs_set_link(new_dir, new_entry, new_page, old_inode);
 
                new_inode->i_ctime = current_time(new_inode);
@@ -780,8 +773,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
        down_write(&F2FS_I(old_inode)->i_sem);
        file_lost_pino(old_inode);
-       if (new_inode && file_enc_name(new_inode))
-               file_set_enc_name(old_inode);
        up_write(&F2FS_I(old_inode)->i_sem);
 
        old_inode->i_ctime = current_time(old_inode);
@@ -909,8 +900,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
                old_nlink = old_dir_entry ? -1 : 1;
                new_nlink = -old_nlink;
                err = -EMLINK;
-               if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) ||
-                       (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX))
+               if ((old_nlink > 0 && old_dir->i_nlink >= F2FS_LINK_MAX) ||
+                       (new_nlink > 0 && new_dir->i_nlink >= F2FS_LINK_MAX))
                        goto out_new_dir;
        }
 
@@ -918,18 +909,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
 
        f2fs_lock_op(sbi);
 
-       err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name);
-       if (err)
-               goto out_unlock;
-       if (file_enc_name(new_inode))
-               file_set_enc_name(old_inode);
-
-       err = update_dent_inode(new_inode, old_inode, &old_dentry->d_name);
-       if (err)
-               goto out_undo;
-       if (file_enc_name(old_inode))
-               file_set_enc_name(new_inode);
-
        /* update ".." directory entry info of old dentry */
        if (old_dir_entry)
                f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir);
@@ -973,14 +952,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
                f2fs_sync_fs(sbi->sb, 1);
        return 0;
-out_undo:
-       /*
-        * Still we may fail to recover name info of f2fs_inode here
-        * Drop it, once its name is set as encrypted
-        */
-       update_dent_inode(old_inode, old_inode, &old_dentry->d_name);
-out_unlock:
-       f2fs_unlock_op(sbi);
 out_new_dir:
        if (new_dir_entry) {
                f2fs_dentry_kunmap(new_inode, new_dir_page);
index 481aa8dc79f46f4c156cf67cca665e8160e36e6a..98351a4a4da3f41d00fa3e3256ce741b59e6d715 100644 (file)
@@ -22,7 +22,7 @@
 #include "trace.h"
 #include <trace/events/f2fs.h>
 
-#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock)
+#define on_build_free_nids(nmi) mutex_is_locked(&(nmi)->build_lock)
 
 static struct kmem_cache *nat_entry_slab;
 static struct kmem_cache *free_nid_slab;
@@ -63,8 +63,9 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
                int i;
 
                for (i = 0; i <= UPDATE_INO; i++)
-                       mem_size += (sbi->im[i].ino_num *
-                               sizeof(struct ino_entry)) >> PAGE_SHIFT;
+                       mem_size += sbi->im[i].ino_num *
+                                               sizeof(struct ino_entry);
+               mem_size >>= PAGE_SHIFT;
                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
        } else if (type == EXTENT_CACHE) {
                mem_size = (atomic_read(&sbi->total_ext_tree) *
@@ -177,18 +178,12 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
 }
 
 static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
-                                               struct nat_entry *ne)
+               struct nat_entry_set *set, struct nat_entry *ne)
 {
-       nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
-       struct nat_entry_set *head;
-
-       head = radix_tree_lookup(&nm_i->nat_set_root, set);
-       if (head) {
-               list_move_tail(&ne->list, &nm_i->nat_entries);
-               set_nat_flag(ne, IS_DIRTY, false);
-               head->entry_cnt--;
-               nm_i->dirty_nat_cnt--;
-       }
+       list_move_tail(&ne->list, &nm_i->nat_entries);
+       set_nat_flag(ne, IS_DIRTY, false);
+       set->entry_cnt--;
+       nm_i->dirty_nat_cnt--;
 }
 
 static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
@@ -381,6 +376,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
        struct page *page = NULL;
        struct f2fs_nat_entry ne;
        struct nat_entry *e;
+       pgoff_t index;
        int i;
 
        ni->nid = nid;
@@ -406,17 +402,21 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
                node_info_from_raw_nat(ni, &ne);
        }
        up_read(&curseg->journal_rwsem);
-       if (i >= 0)
+       if (i >= 0) {
+               up_read(&nm_i->nat_tree_lock);
                goto cache;
+       }
 
        /* Fill node_info from nat page */
-       page = get_current_nat_page(sbi, start_nid);
+       index = current_nat_addr(sbi, nid);
+       up_read(&nm_i->nat_tree_lock);
+
+       page = get_meta_page(sbi, index);
        nat_blk = (struct f2fs_nat_block *)page_address(page);
        ne = nat_blk->entries[nid - start_nid];
        node_info_from_raw_nat(ni, &ne);
        f2fs_put_page(page, 1);
 cache:
-       up_read(&nm_i->nat_tree_lock);
        /* cache nat entry */
        down_write(&nm_i->nat_tree_lock);
        cache_nat_entry(sbi, nid, &ne);
@@ -1463,6 +1463,9 @@ continue_unlock:
                        f2fs_wait_on_page_writeback(page, NODE, true);
                        BUG_ON(PageWriteback(page));
 
+                       set_fsync_mark(page, 0);
+                       set_dentry_mark(page, 0);
+
                        if (!atomic || page == last_page) {
                                set_fsync_mark(page, 1);
                                if (IS_INODE(page)) {
@@ -1766,40 +1769,67 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi,
 static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
 {
        struct f2fs_nm_info *nm_i = NM_I(sbi);
-       struct free_nid *i;
+       struct free_nid *i, *e;
        struct nat_entry *ne;
-       int err;
+       int err = -EINVAL;
+       bool ret = false;
 
        /* 0 nid should not be used */
        if (unlikely(nid == 0))
                return false;
 
-       if (build) {
-               /* do not add allocated nids */
-               ne = __lookup_nat_cache(nm_i, nid);
-               if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
-                               nat_get_blkaddr(ne) != NULL_ADDR))
-                       return false;
-       }
-
        i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
        i->nid = nid;
        i->state = NID_NEW;
 
-       if (radix_tree_preload(GFP_NOFS)) {
-               kmem_cache_free(free_nid_slab, i);
-               return true;
-       }
+       if (radix_tree_preload(GFP_NOFS))
+               goto err;
 
        spin_lock(&nm_i->nid_list_lock);
+
+       if (build) {
+               /*
+                *   Thread A             Thread B
+                *  - f2fs_create
+                *   - f2fs_new_inode
+                *    - alloc_nid
+                *     - __insert_nid_to_list(ALLOC_NID_LIST)
+                *                     - f2fs_balance_fs_bg
+                *                      - build_free_nids
+                *                       - __build_free_nids
+                *                        - scan_nat_page
+                *                         - add_free_nid
+                *                          - __lookup_nat_cache
+                *  - f2fs_add_link
+                *   - init_inode_metadata
+                *    - new_inode_page
+                *     - new_node_page
+                *      - set_node_addr
+                *  - alloc_nid_done
+                *   - __remove_nid_from_list(ALLOC_NID_LIST)
+                *                         - __insert_nid_to_list(FREE_NID_LIST)
+                */
+               ne = __lookup_nat_cache(nm_i, nid);
+               if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
+                               nat_get_blkaddr(ne) != NULL_ADDR))
+                       goto err_out;
+
+               e = __lookup_free_nid_list(nm_i, nid);
+               if (e) {
+                       if (e->state == NID_NEW)
+                               ret = true;
+                       goto err_out;
+               }
+       }
+       ret = true;
        err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true);
+err_out:
        spin_unlock(&nm_i->nid_list_lock);
        radix_tree_preload_end();
-       if (err) {
+err:
+       if (err)
                kmem_cache_free(free_nid_slab, i);
-               return true;
-       }
-       return true;
+       return ret;
 }
 
 static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
@@ -1821,7 +1851,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
 }
 
 static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
-                       bool set, bool build, bool locked)
+                                                       bool set, bool build)
 {
        struct f2fs_nm_info *nm_i = NM_I(sbi);
        unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid);
@@ -1835,14 +1865,10 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
        else
                __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
 
-       if (!locked)
-               spin_lock(&nm_i->free_nid_lock);
        if (set)
                nm_i->free_nid_count[nat_ofs]++;
        else if (!build)
                nm_i->free_nid_count[nat_ofs]--;
-       if (!locked)
-               spin_unlock(&nm_i->free_nid_lock);
 }
 
 static void scan_nat_page(struct f2fs_sb_info *sbi,
@@ -1871,7 +1897,9 @@ static void scan_nat_page(struct f2fs_sb_info *sbi,
                f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
                if (blk_addr == NULL_ADDR)
                        freed = add_free_nid(sbi, start_nid, true);
-               update_free_nid_bitmap(sbi, start_nid, freed, true, false);
+               spin_lock(&NM_I(sbi)->nid_list_lock);
+               update_free_nid_bitmap(sbi, start_nid, freed, true);
+               spin_unlock(&NM_I(sbi)->nid_list_lock);
        }
 }
 
@@ -1927,6 +1955,9 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
        int i = 0;
        nid_t nid = nm_i->next_scan_nid;
 
+       if (unlikely(nid >= nm_i->max_nid))
+               nid = 0;
+
        /* Enough entries */
        if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK)
                return;
@@ -2026,7 +2057,7 @@ retry:
                __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false);
                nm_i->available_nids--;
 
-               update_free_nid_bitmap(sbi, *nid, false, false, false);
+               update_free_nid_bitmap(sbi, *nid, false, false);
 
                spin_unlock(&nm_i->nid_list_lock);
                return true;
@@ -2082,7 +2113,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
 
        nm_i->available_nids++;
 
-       update_free_nid_bitmap(sbi, nid, true, false, false);
+       update_free_nid_bitmap(sbi, nid, true, false);
 
        spin_unlock(&nm_i->nid_list_lock);
 
@@ -2407,16 +2438,16 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
                }
                raw_nat_from_node_info(raw_ne, &ne->ni);
                nat_reset_flag(ne);
-               __clear_nat_cache_dirty(NM_I(sbi), ne);
+               __clear_nat_cache_dirty(NM_I(sbi), set, ne);
                if (nat_get_blkaddr(ne) == NULL_ADDR) {
                        add_free_nid(sbi, nid, false);
                        spin_lock(&NM_I(sbi)->nid_list_lock);
                        NM_I(sbi)->available_nids++;
-                       update_free_nid_bitmap(sbi, nid, true, false, false);
+                       update_free_nid_bitmap(sbi, nid, true, false);
                        spin_unlock(&NM_I(sbi)->nid_list_lock);
                } else {
                        spin_lock(&NM_I(sbi)->nid_list_lock);
-                       update_free_nid_bitmap(sbi, nid, false, false, false);
+                       update_free_nid_bitmap(sbi, nid, false, false);
                        spin_unlock(&NM_I(sbi)->nid_list_lock);
                }
        }
@@ -2428,10 +2459,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
                f2fs_put_page(page, 1);
        }
 
-       f2fs_bug_on(sbi, set->entry_cnt);
-
-       radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
-       kmem_cache_free(nat_entry_set_slab, set);
+       /* Allow dirty nats by node block allocation in write_begin */
+       if (!set->entry_cnt) {
+               radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
+               kmem_cache_free(nat_entry_set_slab, set);
+       }
 }
 
 /*
@@ -2476,8 +2508,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
                __flush_nat_entry_set(sbi, set, cpc);
 
        up_write(&nm_i->nat_tree_lock);
-
-       f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
+       /* Allow dirty nats by node block allocation in write_begin */
 }
 
 static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
@@ -2541,10 +2572,10 @@ inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi)
                nid = i * NAT_ENTRY_PER_BLOCK;
                last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK;
 
-               spin_lock(&nm_i->free_nid_lock);
+               spin_lock(&NM_I(sbi)->nid_list_lock);
                for (; nid < last_nid; nid++)
-                       update_free_nid_bitmap(sbi, nid, true, true, true);
-               spin_unlock(&nm_i->free_nid_lock);
+                       update_free_nid_bitmap(sbi, nid, true, true);
+               spin_unlock(&NM_I(sbi)->nid_list_lock);
        }
 
        for (i = 0; i < nm_i->nat_blocks; i++) {
@@ -2635,9 +2666,6 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi)
                                        sizeof(unsigned short), GFP_KERNEL);
        if (!nm_i->free_nid_count)
                return -ENOMEM;
-
-       spin_lock_init(&nm_i->free_nid_lock);
-
        return 0;
 }
 
index 2f9603fa85a59842281b3443e4984351b2692af8..558048e33cf9a6c1920f03673a98409c34325f4c 100644 (file)
@@ -9,10 +9,10 @@
  * published by the Free Software Foundation.
  */
 /* start node id of a node block dedicated to the given node id */
-#define        START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK)
+#define        START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK)
 
 /* node block offset on the NAT area dedicated to the given start node id */
-#define        NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK)
+#define        NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK)
 
 /* # of pages to perform synchronous readahead before building free nids */
 #define FREE_NID_PAGES 8
@@ -62,16 +62,16 @@ struct nat_entry {
        struct node_info ni;    /* in-memory node information */
 };
 
-#define nat_get_nid(nat)               (nat->ni.nid)
-#define nat_set_nid(nat, n)            (nat->ni.nid = n)
-#define nat_get_blkaddr(nat)           (nat->ni.blk_addr)
-#define nat_set_blkaddr(nat, b)                (nat->ni.blk_addr = b)
-#define nat_get_ino(nat)               (nat->ni.ino)
-#define nat_set_ino(nat, i)            (nat->ni.ino = i)
-#define nat_get_version(nat)           (nat->ni.version)
-#define nat_set_version(nat, v)                (nat->ni.version = v)
+#define nat_get_nid(nat)               ((nat)->ni.nid)
+#define nat_set_nid(nat, n)            ((nat)->ni.nid = (n))
+#define nat_get_blkaddr(nat)           ((nat)->ni.blk_addr)
+#define nat_set_blkaddr(nat, b)                ((nat)->ni.blk_addr = (b))
+#define nat_get_ino(nat)               ((nat)->ni.ino)
+#define nat_set_ino(nat, i)            ((nat)->ni.ino = (i))
+#define nat_get_version(nat)           ((nat)->ni.version)
+#define nat_set_version(nat, v)                ((nat)->ni.version = (v))
 
-#define inc_node_version(version)      (++version)
+#define inc_node_version(version)      (++(version))
 
 static inline void copy_node_info(struct node_info *dst,
                                                struct node_info *src)
@@ -200,13 +200,16 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start)
        struct f2fs_nm_info *nm_i = NM_I(sbi);
        pgoff_t block_off;
        pgoff_t block_addr;
-       int seg_off;
 
+       /*
+        * block_off = segment_off * 512 + off_in_segment
+        * OLD = (segment_off * 512) * 2 + off_in_segment
+        * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment
+        */
        block_off = NAT_BLOCK_OFFSET(start);
-       seg_off = block_off >> sbi->log_blocks_per_seg;
 
        block_addr = (pgoff_t)(nm_i->nat_blkaddr +
-               (seg_off << sbi->log_blocks_per_seg << 1) +
+               (block_off << 1) -
                (block_off & (sbi->blocks_per_seg - 1)));
 
        if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
index d025aa83fb5bb344fcb5799f080d033895e584aa..907d6b7dde6a7f636ea7027d461cfd029b8b1ae5 100644 (file)
@@ -198,7 +198,8 @@ static void recover_inode(struct inode *inode, struct page *page)
                        ino_of_node(page), name);
 }
 
-static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
+static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
+                               bool check_only)
 {
        struct curseg_info *curseg;
        struct page *page = NULL;
@@ -225,7 +226,8 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
 
                entry = get_fsync_inode(head, ino_of_node(page));
                if (!entry) {
-                       if (IS_INODE(page) && is_dent_dnode(page)) {
+                       if (!check_only &&
+                                       IS_INODE(page) && is_dent_dnode(page)) {
                                err = recover_inode_page(sbi, page);
                                if (err)
                                        break;
@@ -569,7 +571,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
        mutex_lock(&sbi->cp_mutex);
 
        /* step #1: find fsynced inode numbers */
-       err = find_fsync_dnodes(sbi, &inode_list);
+       err = find_fsync_dnodes(sbi, &inode_list, check_only);
        if (err || list_empty(&inode_list))
                goto out;
 
index 29ef7088c5582a480b6a1f7965fbbcca4f07e24e..de31030b5041c4e618508f8478c7a5660de7a1de 100644 (file)
@@ -250,6 +250,36 @@ void drop_inmem_pages(struct inode *inode)
        stat_dec_atomic_write(inode);
 }
 
+/* Unlink @page's entry from @inode's inmem_pages list and free the entry. */
+void drop_inmem_page(struct inode *inode, struct page *page)
+{
+       struct f2fs_inode_info *fi = F2FS_I(inode);
+       struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+       struct inmem_pages *cur;
+
+       f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page));
+       mutex_lock(&fi->inmem_lock);
+       list_for_each_entry(cur, &fi->inmem_pages, list)
+               if (cur->page == page)
+                       break;
+       if (&cur->list == &fi->inmem_pages)
+               cur = NULL;     /* walk ran out: cur aliased the list head */
+       f2fs_bug_on(sbi, !cur);
+       if (cur)
+               list_del(&cur->list);
+       mutex_unlock(&fi->inmem_lock);
+       if (!cur)
+               return;         /* nothing was unlinked, so nothing to free */
+
+       dec_page_count(sbi, F2FS_INMEM_PAGES);
+       kmem_cache_free(inmem_entry_slab, cur);
+       ClearPageUptodate(page);
+       set_page_private(page, 0);
+       ClearPagePrivate(page);
+       f2fs_put_page(page, 0);
+       trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE);
+}
+
 static int __commit_inmem_pages(struct inode *inode,
                                        struct list_head *revoke_list)
 {
@@ -261,7 +291,6 @@ static int __commit_inmem_pages(struct inode *inode,
                .type = DATA,
                .op = REQ_OP_WRITE,
                .op_flags = REQ_SYNC | REQ_PRIO,
-               .encrypted_page = NULL,
        };
        pgoff_t last_idx = ULONG_MAX;
        int err = 0;
@@ -281,6 +310,9 @@ static int __commit_inmem_pages(struct inode *inode,
                        }
 
                        fio.page = page;
+                       fio.old_blkaddr = NULL_ADDR;
+                       fio.encrypted_page = NULL;
+                       fio.need_lock = false,
                        err = do_write_data_page(&fio);
                        if (err) {
                                unlock_page(page);
@@ -358,11 +390,8 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
        }
 #endif
 
-       if (!need)
-               return;
-
        /* balance_fs_bg is able to be pending */
-       if (excess_cached_nats(sbi))
+       if (need && excess_cached_nats(sbi))
                f2fs_balance_fs_bg(sbi);
 
        /*
@@ -371,7 +400,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
         */
        if (has_not_enough_free_secs(sbi, 0, 0)) {
                mutex_lock(&sbi->gc_mutex);
-               f2fs_gc(sbi, false, false);
+               f2fs_gc(sbi, false, false, NULL_SEGNO);
        }
 }
 
@@ -390,7 +419,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
        else
                build_free_nids(sbi, false, false);
 
-       if (!is_idle(sbi))
+       if (!is_idle(sbi) && !excess_dirty_nats(sbi))
                return;
 
        /* checkpoint is the only way to shrink partial cached entries */
@@ -411,32 +440,34 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
        }
 }
 
-static int __submit_flush_wait(struct block_device *bdev)
+static int __submit_flush_wait(struct f2fs_sb_info *sbi,
+                               struct block_device *bdev)
 {
        struct bio *bio = f2fs_bio_alloc(0);
        int ret;
 
-       bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+       bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
        bio->bi_bdev = bdev;
        ret = submit_bio_wait(bio);
        bio_put(bio);
+
+       trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER),
+                               test_opt(sbi, FLUSH_MERGE), ret);
        return ret;
 }
 
 static int submit_flush_wait(struct f2fs_sb_info *sbi)
 {
-       int ret = __submit_flush_wait(sbi->sb->s_bdev);
+       int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev);
        int i;
 
-       if (sbi->s_ndevs && !ret) {
-               for (i = 1; i < sbi->s_ndevs; i++) {
-                       trace_f2fs_issue_flush(FDEV(i).bdev,
-                                       test_opt(sbi, NOBARRIER),
-                                       test_opt(sbi, FLUSH_MERGE));
-                       ret = __submit_flush_wait(FDEV(i).bdev);
-                       if (ret)
-                               break;
-               }
+       if (!sbi->s_ndevs || ret)
+               return ret;
+
+       for (i = 1; i < sbi->s_ndevs; i++) {
+               ret = __submit_flush_wait(sbi, FDEV(i).bdev);
+               if (ret)
+                       break;
        }
        return ret;
 }
@@ -458,6 +489,8 @@ repeat:
                fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
 
                ret = submit_flush_wait(sbi);
+               atomic_inc(&fcc->issued_flush);
+
                llist_for_each_entry_safe(cmd, next,
                                          fcc->dispatch_list, llnode) {
                        cmd->ret = ret;
@@ -475,25 +508,29 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
 {
        struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info;
        struct flush_cmd cmd;
+       int ret;
 
        if (test_opt(sbi, NOBARRIER))
                return 0;
 
-       if (!test_opt(sbi, FLUSH_MERGE))
-               return submit_flush_wait(sbi);
-
-       if (!atomic_read(&fcc->submit_flush)) {
-               int ret;
+       if (!test_opt(sbi, FLUSH_MERGE)) {
+               ret = submit_flush_wait(sbi);
+               atomic_inc(&fcc->issued_flush);
+               return ret;
+       }
 
-               atomic_inc(&fcc->submit_flush);
+       if (!atomic_read(&fcc->issing_flush)) {
+               atomic_inc(&fcc->issing_flush);
                ret = submit_flush_wait(sbi);
-               atomic_dec(&fcc->submit_flush);
+               atomic_dec(&fcc->issing_flush);
+
+               atomic_inc(&fcc->issued_flush);
                return ret;
        }
 
        init_completion(&cmd.wait);
 
-       atomic_inc(&fcc->submit_flush);
+       atomic_inc(&fcc->issing_flush);
        llist_add(&cmd.llnode, &fcc->issue_list);
 
        if (!fcc->dispatch_list)
@@ -501,10 +538,10 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
 
        if (fcc->f2fs_issue_flush) {
                wait_for_completion(&cmd.wait);
-               atomic_dec(&fcc->submit_flush);
+               atomic_dec(&fcc->issing_flush);
        } else {
                llist_del_all(&fcc->issue_list);
-               atomic_set(&fcc->submit_flush, 0);
+               atomic_set(&fcc->issing_flush, 0);
        }
 
        return cmd.ret;
@@ -524,7 +561,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
        fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL);
        if (!fcc)
                return -ENOMEM;
-       atomic_set(&fcc->submit_flush, 0);
+       atomic_set(&fcc->issued_flush, 0);
+       atomic_set(&fcc->issing_flush, 0);
        init_waitqueue_head(&fcc->flush_wait_queue);
        init_llist_head(&fcc->issue_list);
        SM_I(sbi)->fcc_info = fcc;
@@ -597,8 +635,8 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
                if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
                        dirty_i->nr_dirty[t]--;
 
-               if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0)
-                       clear_bit(GET_SECNO(sbi, segno),
+               if (get_valid_blocks(sbi, segno, true) == 0)
+                       clear_bit(GET_SEC_FROM_SEG(sbi, segno),
                                                dirty_i->victim_secmap);
        }
 }
@@ -618,7 +656,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
 
        mutex_lock(&dirty_i->seglist_lock);
 
-       valid_blocks = get_valid_blocks(sbi, segno, 0);
+       valid_blocks = get_valid_blocks(sbi, segno, false);
 
        if (valid_blocks == 0) {
                __locate_dirty_segment(sbi, segno, PRE);
@@ -633,162 +671,407 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
        mutex_unlock(&dirty_i->seglist_lock);
 }
 
-static void __add_discard_cmd(struct f2fs_sb_info *sbi,
-                       struct bio *bio, block_t lstart, block_t len)
+static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi,
+               struct block_device *bdev, block_t lstart,
+               block_t start, block_t len)
 {
        struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
-       struct list_head *cmd_list = &(dcc->discard_cmd_list);
+       struct list_head *pend_list;
        struct discard_cmd *dc;
 
+       f2fs_bug_on(sbi, !len);
+
+       pend_list = &dcc->pend_list[plist_idx(len)];
+
        dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS);
        INIT_LIST_HEAD(&dc->list);
-       dc->bio = bio;
-       bio->bi_private = dc;
+       dc->bdev = bdev;
        dc->lstart = lstart;
+       dc->start = start;
        dc->len = len;
+       dc->ref = 0;
        dc->state = D_PREP;
+       dc->error = 0;
        init_completion(&dc->wait);
+       list_add_tail(&dc->list, pend_list);
+       atomic_inc(&dcc->discard_cmd_cnt);
+       dcc->undiscard_blks += len;
 
-       mutex_lock(&dcc->cmd_lock);
-       list_add_tail(&dc->list, cmd_list);
-       mutex_unlock(&dcc->cmd_lock);
+       return dc;
 }
 
-static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc)
+static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi,
+                               struct block_device *bdev, block_t lstart,
+                               block_t start, block_t len,
+                               struct rb_node *parent, struct rb_node **p)
 {
-       int err = dc->bio->bi_error;
+       struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+       struct discard_cmd *dc;
 
-       if (dc->state == D_DONE)
-               atomic_dec(&(SM_I(sbi)->dcc_info->submit_discard));
+       dc = __create_discard_cmd(sbi, bdev, lstart, start, len);
 
-       if (err == -EOPNOTSUPP)
-               err = 0;
+       rb_link_node(&dc->rb_node, parent, p);
+       rb_insert_color(&dc->rb_node, &dcc->root);
+
+       return dc;
+}
+
+static void __detach_discard_cmd(struct discard_cmd_control *dcc,
+                                                       struct discard_cmd *dc)
+{
+       if (dc->state == D_DONE)
+               atomic_dec(&dcc->issing_discard);
 
-       if (err)
-               f2fs_msg(sbi->sb, KERN_INFO,
-                               "Issue discard failed, ret: %d", err);
-       bio_put(dc->bio);
        list_del(&dc->list);
+       rb_erase(&dc->rb_node, &dcc->root);
+       dcc->undiscard_blks -= dc->len;
+
        kmem_cache_free(discard_cmd_slab, dc);
+
+       atomic_dec(&dcc->discard_cmd_cnt);
 }
 
-/* This should be covered by global mutex, &sit_i->sentry_lock */
-void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr)
+static void __remove_discard_cmd(struct f2fs_sb_info *sbi,
+                                                       struct discard_cmd *dc)
 {
        struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
-       struct list_head *wait_list = &(dcc->discard_cmd_list);
-       struct discard_cmd *dc, *tmp;
-       struct blk_plug plug;
 
-       mutex_lock(&dcc->cmd_lock);
+       if (dc->error == -EOPNOTSUPP)
+               dc->error = 0;
 
-       blk_start_plug(&plug);
+       if (dc->error)
+               f2fs_msg(sbi->sb, KERN_INFO,
+                               "Issue discard failed, ret: %d", dc->error);
+       __detach_discard_cmd(dcc, dc);
+}
 
-       list_for_each_entry_safe(dc, tmp, wait_list, list) {
+static void f2fs_submit_discard_endio(struct bio *bio)
+{
+       struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private;
 
-               if (blkaddr == NULL_ADDR) {
-                       if (dc->state == D_PREP) {
-                               dc->state = D_SUBMIT;
-                               submit_bio(dc->bio);
-                               atomic_inc(&dcc->submit_discard);
-                       }
-                       continue;
+       dc->error = bio->bi_error;
+       dc->state = D_DONE;
+       complete(&dc->wait);
+       bio_put(bio);
+}
+
+/* this function is copied from blkdev_issue_discard from block/blk-lib.c */
+static void __submit_discard_cmd(struct f2fs_sb_info *sbi,
+                               struct discard_cmd *dc)
+{
+       struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+       struct bio *bio = NULL;
+
+       if (dc->state != D_PREP)
+               return;
+
+       trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len);
+
+       dc->error = __blkdev_issue_discard(dc->bdev,
+                               SECTOR_FROM_BLOCK(dc->start),
+                               SECTOR_FROM_BLOCK(dc->len),
+                               GFP_NOFS, 0, &bio);
+       if (!dc->error) {
+               /* should keep before submission to avoid D_DONE right away */
+               dc->state = D_SUBMIT;
+               atomic_inc(&dcc->issued_discard);
+               atomic_inc(&dcc->issing_discard);
+               if (bio) {
+                       bio->bi_private = dc;
+                       bio->bi_end_io = f2fs_submit_discard_endio;
+                       bio->bi_opf |= REQ_SYNC;
+                       submit_bio(bio);
+                       list_move_tail(&dc->list, &dcc->wait_list);
                }
+       } else {
+               __remove_discard_cmd(sbi, dc);
+       }
+}
 
-               if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) {
-                       if (dc->state == D_SUBMIT)
-                               wait_for_completion_io(&dc->wait);
-                       else
-                               __remove_discard_cmd(sbi, dc);
+/*
+ * Link a new discard command into dcc->root, using @insert_p/@insert_parent
+ * as the rb-tree slot when both are non-NULL and looking it up otherwise.
+ */
+static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi,
+                               struct block_device *bdev, block_t lstart,
+                               block_t start, block_t len,
+                               struct rb_node **insert_p,
+                               struct rb_node *insert_parent)
+{
+       struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+       struct rb_node *link_parent = insert_parent;
+       struct rb_node **link = insert_p;
+
+       /* half a hint is no hint: fall back to a lookup keyed by lstart */
+       if (!link || !link_parent) {
+               link_parent = NULL;
+               link = __lookup_rb_tree_for_insert(sbi, &dcc->root,
+                                               &link_parent, lstart);
+       }
+
+       return __attach_discard_cmd(sbi, bdev, lstart, start, len,
+                                               link_parent, link);
+}
+
+/* Requeue @dc at the tail of the pending list bucket chosen by dc->len. */
+static void __relocate_discard_cmd(struct discard_cmd_control *dcc,
+                                               struct discard_cmd *dc)
+{
+       list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]);
+}
+
+static void __punch_discard_cmd(struct f2fs_sb_info *sbi,
+                               struct discard_cmd *dc, block_t blkaddr)
+{
+       struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+       struct discard_info di = dc->di;
+       bool modified = false;
+
+       if (dc->state == D_DONE || dc->len == 1) {
+               __remove_discard_cmd(sbi, dc);
+               return;
+       }
+
+       dcc->undiscard_blks -= di.len;
+
+       if (blkaddr > di.lstart) {
+               dc->len = blkaddr - dc->lstart;
+               dcc->undiscard_blks += dc->len;
+               __relocate_discard_cmd(dcc, dc);
+               f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root));
+               modified = true;
+       }
+
+       if (blkaddr < di.lstart + di.len - 1) {
+               if (modified) {
+                       __insert_discard_tree(sbi, dc->bdev, blkaddr + 1,
+                                       di.start + blkaddr + 1 - di.lstart,
+                                       di.lstart + di.len - 1 - blkaddr,
+                                       NULL, NULL);
+                       f2fs_bug_on(sbi,
+                               !__check_rb_tree_consistence(sbi, &dcc->root));
+               } else {
+                       dc->lstart++;
+                       dc->len--;
+                       dc->start++;
+                       dcc->undiscard_blks += dc->len;
+                       __relocate_discard_cmd(dcc, dc);
+                       f2fs_bug_on(sbi,
+                               !__check_rb_tree_consistence(sbi, &dcc->root));
                }
        }
-       blk_finish_plug(&plug);
+}
 
-       /* this comes from f2fs_put_super */
-       if (blkaddr == NULL_ADDR) {
-               list_for_each_entry_safe(dc, tmp, wait_list, list) {
-                       wait_for_completion_io(&dc->wait);
-                       __remove_discard_cmd(sbi, dc);
+static void __update_discard_tree_range(struct f2fs_sb_info *sbi,
+                               struct block_device *bdev, block_t lstart,
+                               block_t start, block_t len)
+{
+       struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+       struct discard_cmd *prev_dc = NULL, *next_dc = NULL;
+       struct discard_cmd *dc;
+       struct discard_info di = {0};
+       struct rb_node **insert_p = NULL, *insert_parent = NULL;
+       block_t end = lstart + len;
+
+       mutex_lock(&dcc->cmd_lock);
+
+       dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root,
+                                       NULL, lstart,
+                                       (struct rb_entry **)&prev_dc,
+                                       (struct rb_entry **)&next_dc,
+                                       &insert_p, &insert_parent, true);
+       if (dc)
+               prev_dc = dc;
+
+       if (!prev_dc) {
+               di.lstart = lstart;
+               di.len = next_dc ? next_dc->lstart - lstart : len;
+               di.len = min(di.len, len);
+               di.start = start;
+       }
+
+       while (1) {
+               struct rb_node *node;
+               bool merged = false;
+               struct discard_cmd *tdc = NULL;
+
+               if (prev_dc) {
+                       di.lstart = prev_dc->lstart + prev_dc->len;
+                       if (di.lstart < lstart)
+                               di.lstart = lstart;
+                       if (di.lstart >= end)
+                               break;
+
+                       if (!next_dc || next_dc->lstart > end)
+                               di.len = end - di.lstart;
+                       else
+                               di.len = next_dc->lstart - di.lstart;
+                       di.start = start + di.lstart - lstart;
+               }
+
+               if (!di.len)
+                       goto next;
+
+               if (prev_dc && prev_dc->state == D_PREP &&
+                       prev_dc->bdev == bdev &&
+                       __is_discard_back_mergeable(&di, &prev_dc->di)) {
+                       prev_dc->di.len += di.len;
+                       dcc->undiscard_blks += di.len;
+                       __relocate_discard_cmd(dcc, prev_dc);
+                       f2fs_bug_on(sbi,
+                               !__check_rb_tree_consistence(sbi, &dcc->root));
+                       di = prev_dc->di;
+                       tdc = prev_dc;
+                       merged = true;
+               }
+
+               if (next_dc && next_dc->state == D_PREP &&
+                       next_dc->bdev == bdev &&
+                       __is_discard_front_mergeable(&di, &next_dc->di)) {
+                       next_dc->di.lstart = di.lstart;
+                       next_dc->di.len += di.len;
+                       next_dc->di.start = di.start;
+                       dcc->undiscard_blks += di.len;
+                       __relocate_discard_cmd(dcc, next_dc);
+                       if (tdc)
+                               __remove_discard_cmd(sbi, tdc);
+                       f2fs_bug_on(sbi,
+                               !__check_rb_tree_consistence(sbi, &dcc->root));
+                       merged = true;
+               }
+
+               if (!merged) {
+                       __insert_discard_tree(sbi, bdev, di.lstart, di.start,
+                                                       di.len, NULL, NULL);
+                       f2fs_bug_on(sbi,
+                               !__check_rb_tree_consistence(sbi, &dcc->root));
                }
+ next:
+               prev_dc = next_dc;
+               if (!prev_dc)
+                       break;
+
+               node = rb_next(&prev_dc->rb_node);
+               next_dc = rb_entry_safe(node, struct discard_cmd, rb_node);
        }
+
        mutex_unlock(&dcc->cmd_lock);
 }
 
-static void f2fs_submit_discard_endio(struct bio *bio)
+static int __queue_discard_cmd(struct f2fs_sb_info *sbi,
+               struct block_device *bdev, block_t blkstart, block_t blklen)
 {
-       struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private;
+       block_t lblkstart = blkstart;
 
-       complete(&dc->wait);
-       dc->state = D_DONE;
+       trace_f2fs_queue_discard(bdev, blkstart, blklen);
+
+       if (sbi->s_ndevs) {
+               int devi = f2fs_target_device_index(sbi, blkstart);
+
+               blkstart -= FDEV(devi).start_blk;
+       }
+       __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen);
+       return 0;
 }
 
-static int issue_discard_thread(void *data)
+static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond)
 {
-       struct f2fs_sb_info *sbi = data;
        struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
-       wait_queue_head_t *q = &dcc->discard_wait_queue;
-       struct list_head *cmd_list = &dcc->discard_cmd_list;
+       struct list_head *pend_list;
        struct discard_cmd *dc, *tmp;
        struct blk_plug plug;
-       int iter = 0;
-repeat:
-       if (kthread_should_stop())
-               return 0;
+       int i, iter = 0;
 
+       mutex_lock(&dcc->cmd_lock);
        blk_start_plug(&plug);
+       for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
+               pend_list = &dcc->pend_list[i];
+               list_for_each_entry_safe(dc, tmp, pend_list, list) {
+                       f2fs_bug_on(sbi, dc->state != D_PREP);
+
+                       if (!issue_cond || is_idle(sbi))
+                               __submit_discard_cmd(sbi, dc);
+                       if (issue_cond && iter++ > DISCARD_ISSUE_RATE)
+                               goto out;
+               }
+       }
+out:
+       blk_finish_plug(&plug);
+       mutex_unlock(&dcc->cmd_lock);
+}
+
+static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond)
+{
+       struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+       struct list_head *wait_list = &(dcc->wait_list);
+       struct discard_cmd *dc, *tmp;
 
        mutex_lock(&dcc->cmd_lock);
-       list_for_each_entry_safe(dc, tmp, cmd_list, list) {
-               if (dc->state == D_PREP) {
-                       dc->state = D_SUBMIT;
-                       submit_bio(dc->bio);
-                       atomic_inc(&dcc->submit_discard);
-                       if (iter++ > DISCARD_ISSUE_RATE)
-                               break;
-               } else if (dc->state == D_DONE) {
+       list_for_each_entry_safe(dc, tmp, wait_list, list) {
+               if (!wait_cond || dc->state == D_DONE) {
+                       if (dc->ref)
+                               continue;
+                       wait_for_completion_io(&dc->wait);
                        __remove_discard_cmd(sbi, dc);
                }
        }
        mutex_unlock(&dcc->cmd_lock);
+}
 
-       blk_finish_plug(&plug);
+/* This should be covered by global mutex, &sit_i->sentry_lock */
+void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+       struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+       struct discard_cmd *dc;
+       bool need_wait = false;
 
-       iter = 0;
-       congestion_wait(BLK_RW_SYNC, HZ/50);
+       mutex_lock(&dcc->cmd_lock);
+       dc = (struct discard_cmd *)__lookup_rb_tree(&dcc->root, NULL, blkaddr);
+       if (dc) {
+               if (dc->state == D_PREP) {
+                       __punch_discard_cmd(sbi, dc, blkaddr);
+               } else {
+                       dc->ref++;
+                       need_wait = true;
+               }
+       }
+       mutex_unlock(&dcc->cmd_lock);
 
-       wait_event_interruptible(*q,
-               kthread_should_stop() || !list_empty(&dcc->discard_cmd_list));
-       goto repeat;
+       if (need_wait) {
+               wait_for_completion_io(&dc->wait);
+               mutex_lock(&dcc->cmd_lock);
+               f2fs_bug_on(sbi, dc->state != D_DONE);
+               dc->ref--;
+               if (!dc->ref)
+                       __remove_discard_cmd(sbi, dc);
+               mutex_unlock(&dcc->cmd_lock);
+       }
 }
 
-
-/* this function is copied from blkdev_issue_discard from block/blk-lib.c */
-static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi,
-               struct block_device *bdev, block_t blkstart, block_t blklen)
+/* This comes from f2fs_put_super */
+void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi)
 {
-       struct bio *bio = NULL;
-       block_t lblkstart = blkstart;
-       int err;
+       __issue_discard_cmd(sbi, false);
+       __wait_discard_cmd(sbi, false);
+}
 
-       trace_f2fs_issue_discard(bdev, blkstart, blklen);
+static int issue_discard_thread(void *data)
+{
+       struct f2fs_sb_info *sbi = data;
+       struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+       wait_queue_head_t *q = &dcc->discard_wait_queue;
+repeat:
+       if (kthread_should_stop())
+               return 0;
 
-       if (sbi->s_ndevs) {
-               int devi = f2fs_target_device_index(sbi, blkstart);
+       __issue_discard_cmd(sbi, true);
+       __wait_discard_cmd(sbi, true);
 
-               blkstart -= FDEV(devi).start_blk;
-       }
-       err = __blkdev_issue_discard(bdev,
-                               SECTOR_FROM_BLOCK(blkstart),
-                               SECTOR_FROM_BLOCK(blklen),
-                               GFP_NOFS, 0, &bio);
-       if (!err && bio) {
-               bio->bi_end_io = f2fs_submit_discard_endio;
-               bio->bi_opf |= REQ_SYNC;
+       congestion_wait(BLK_RW_SYNC, HZ/50);
 
-               __add_discard_cmd(sbi, bio, lblkstart, blklen);
-               wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue);
-       }
-       return err;
+       wait_event_interruptible(*q, kthread_should_stop() ||
+                               atomic_read(&dcc->discard_cmd_cnt));
+       goto repeat;
 }
 
 #ifdef CONFIG_BLK_DEV_ZONED
@@ -796,6 +1079,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
                struct block_device *bdev, block_t blkstart, block_t blklen)
 {
        sector_t sector, nr_sects;
+       block_t lblkstart = blkstart;
        int devi = 0;
 
        if (sbi->s_ndevs) {
@@ -813,7 +1097,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
        case BLK_ZONE_TYPE_CONVENTIONAL:
                if (!blk_queue_discard(bdev_get_queue(bdev)))
                        return 0;
-               return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
+               return __queue_discard_cmd(sbi, bdev, lblkstart, blklen);
        case BLK_ZONE_TYPE_SEQWRITE_REQ:
        case BLK_ZONE_TYPE_SEQWRITE_PREF:
                sector = SECTOR_FROM_BLOCK(blkstart);
@@ -845,7 +1129,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi,
                                bdev_zoned_model(bdev) != BLK_ZONED_NONE)
                return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
 #endif
-       return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
+       return __queue_discard_cmd(sbi, bdev, blkstart, blklen);
 }
 
 static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
@@ -888,32 +1172,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
        return err;
 }
 
-static void __add_discard_entry(struct f2fs_sb_info *sbi,
-               struct cp_control *cpc, struct seg_entry *se,
-               unsigned int start, unsigned int end)
-{
-       struct list_head *head = &SM_I(sbi)->dcc_info->discard_entry_list;
-       struct discard_entry *new, *last;
-
-       if (!list_empty(head)) {
-               last = list_last_entry(head, struct discard_entry, list);
-               if (START_BLOCK(sbi, cpc->trim_start) + start ==
-                               last->blkaddr + last->len &&
-                               last->len < MAX_DISCARD_BLOCKS(sbi)) {
-                       last->len += end - start;
-                       goto done;
-               }
-       }
-
-       new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
-       INIT_LIST_HEAD(&new->list);
-       new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start;
-       new->len = end - start;
-       list_add_tail(&new->list, head);
-done:
-       SM_I(sbi)->dcc_info->nr_discards += end - start;
-}
-
 static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
                                                        bool check_only)
 {
@@ -925,7 +1183,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
        unsigned long *discard_map = (unsigned long *)se->discard_map;
        unsigned long *dmap = SIT_I(sbi)->tmp_map;
        unsigned int start = 0, end = -1;
-       bool force = (cpc->reason == CP_DISCARD);
+       bool force = (cpc->reason & CP_DISCARD);
+       struct discard_entry *de = NULL;
+       struct list_head *head = &SM_I(sbi)->dcc_info->entry_list;
        int i;
 
        if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi))
@@ -957,14 +1217,24 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
                if (check_only)
                        return true;
 
-               __add_discard_entry(sbi, cpc, se, start, end);
+               if (!de) {
+                       de = f2fs_kmem_cache_alloc(discard_entry_slab,
+                                                               GFP_F2FS_ZERO);
+                       de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start);
+                       list_add_tail(&de->list, head);
+               }
+
+               for (i = start; i < end; i++)
+                       __set_bit_le(i, (void *)de->discard_map);
+
+               SM_I(sbi)->dcc_info->nr_discards += end - start;
        }
        return false;
 }
 
 void release_discard_addrs(struct f2fs_sb_info *sbi)
 {
-       struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list);
+       struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list);
        struct discard_entry *entry, *this;
 
        /* drop caches */
@@ -990,13 +1260,13 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
 
 void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 {
-       struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list);
+       struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list);
        struct discard_entry *entry, *this;
        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
        unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
        unsigned int start = 0, end = -1;
        unsigned int secno, start_segno;
-       bool force = (cpc->reason == CP_DISCARD);
+       bool force = (cpc->reason & CP_DISCARD);
 
        mutex_lock(&dirty_i->seglist_lock);
 
@@ -1026,10 +1296,10 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
                        continue;
                }
 next:
-               secno = GET_SECNO(sbi, start);
-               start_segno = secno * sbi->segs_per_sec;
+               secno = GET_SEC_FROM_SEG(sbi, start);
+               start_segno = GET_SEG_FROM_SEC(sbi, secno);
                if (!IS_CURSEC(sbi, secno) &&
-                       !get_valid_blocks(sbi, start, sbi->segs_per_sec))
+                       !get_valid_blocks(sbi, start, true))
                        f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno),
                                sbi->segs_per_sec << sbi->log_blocks_per_seg);
 
@@ -1043,22 +1313,46 @@ next:
 
        /* send small discards */
        list_for_each_entry_safe(entry, this, head, list) {
-               if (force && entry->len < cpc->trim_minlen)
-                       goto skip;
-               f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
-               cpc->trimmed += entry->len;
+               unsigned int cur_pos = 0, next_pos, len, total_len = 0;
+               bool is_valid = test_bit_le(0, entry->discard_map);
+
+find_next:
+               if (is_valid) {
+                       next_pos = find_next_zero_bit_le(entry->discard_map,
+                                       sbi->blocks_per_seg, cur_pos);
+                       len = next_pos - cur_pos;
+
+                       if (force && len < cpc->trim_minlen)
+                               goto skip;
+
+                       f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos,
+                                                                       len);
+                       cpc->trimmed += len;
+                       total_len += len;
+               } else {
+                       next_pos = find_next_bit_le(entry->discard_map,
+                                       sbi->blocks_per_seg, cur_pos);
+               }
 skip:
+               cur_pos = next_pos;
+               is_valid = !is_valid;
+
+               if (cur_pos < sbi->blocks_per_seg)
+                       goto find_next;
+
                list_del(&entry->list);
-               SM_I(sbi)->dcc_info->nr_discards -= entry->len;
+               SM_I(sbi)->dcc_info->nr_discards -= total_len;
                kmem_cache_free(discard_entry_slab, entry);
        }
+
+       wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue);
 }
 
 static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
 {
        dev_t dev = sbi->sb->s_bdev->bd_dev;
        struct discard_cmd_control *dcc;
-       int err = 0;
+       int err = 0, i;
 
        if (SM_I(sbi)->dcc_info) {
                dcc = SM_I(sbi)->dcc_info;
@@ -1069,12 +1363,18 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
        if (!dcc)
                return -ENOMEM;
 
-       INIT_LIST_HEAD(&dcc->discard_entry_list);
-       INIT_LIST_HEAD(&dcc->discard_cmd_list);
+       INIT_LIST_HEAD(&dcc->entry_list);
+       for (i = 0; i < MAX_PLIST_NUM; i++)
+               INIT_LIST_HEAD(&dcc->pend_list[i]);
+       INIT_LIST_HEAD(&dcc->wait_list);
        mutex_init(&dcc->cmd_lock);
-       atomic_set(&dcc->submit_discard, 0);
+       atomic_set(&dcc->issued_discard, 0);
+       atomic_set(&dcc->issing_discard, 0);
+       atomic_set(&dcc->discard_cmd_cnt, 0);
        dcc->nr_discards = 0;
-       dcc->max_discards = 0;
+       dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg;
+       dcc->undiscard_blks = 0;
+       dcc->root = RB_ROOT;
 
        init_waitqueue_head(&dcc->discard_wait_queue);
        SM_I(sbi)->dcc_info = dcc;
@@ -1091,20 +1391,22 @@ init_thread:
        return err;
 }
 
-static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi, bool free)
+static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi)
 {
        struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
 
-       if (dcc && dcc->f2fs_issue_discard) {
+       if (!dcc)
+               return;
+
+       if (dcc->f2fs_issue_discard) {
                struct task_struct *discard_thread = dcc->f2fs_issue_discard;
 
                dcc->f2fs_issue_discard = NULL;
                kthread_stop(discard_thread);
        }
-       if (free) {
-               kfree(dcc);
-               SM_I(sbi)->dcc_info = NULL;
-       }
+
+       kfree(dcc);
+       SM_I(sbi)->dcc_info = NULL;
 }
 
 static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
@@ -1345,6 +1647,17 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi,
        f2fs_put_page(page, 1);
 }
 
+static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
+{
+       struct curseg_info *curseg = CURSEG_I(sbi, type);
+       unsigned int segno = curseg->segno + 1;
+       struct free_segmap_info *free_i = FREE_I(sbi);
+
+       if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec)
+               return !test_bit(segno, free_i->free_segmap);
+       return 0;
+}
+
 /*
  * Find a new segment from the free segments bitmap to right order
  * This function should be returned with success, otherwise BUG
@@ -1355,8 +1668,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
        struct free_segmap_info *free_i = FREE_I(sbi);
        unsigned int segno, secno, zoneno;
        unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
-       unsigned int hint = *newseg / sbi->segs_per_sec;
-       unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
+       unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg);
+       unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg);
        unsigned int left_start = hint;
        bool init = true;
        int go_left = 0;
@@ -1366,8 +1679,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
 
        if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
                segno = find_next_zero_bit(free_i->free_segmap,
-                               (hint + 1) * sbi->segs_per_sec, *newseg + 1);
-               if (segno < (hint + 1) * sbi->segs_per_sec)
+                       GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1);
+               if (segno < GET_SEG_FROM_SEC(sbi, hint + 1))
                        goto got_it;
        }
 find_other_zone:
@@ -1398,8 +1711,8 @@ find_other_zone:
        secno = left_start;
 skip_left:
        hint = secno;
-       segno = secno * sbi->segs_per_sec;
-       zoneno = secno / sbi->secs_per_zone;
+       segno = GET_SEG_FROM_SEC(sbi, secno);
+       zoneno = GET_ZONE_FROM_SEC(sbi, secno);
 
        /* give up on finding another zone */
        if (!init)
@@ -1443,7 +1756,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
        struct summary_footer *sum_footer;
 
        curseg->segno = curseg->next_segno;
-       curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno);
+       curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno);
        curseg->next_blkoff = 0;
        curseg->next_segno = NULL_SEGNO;
 
@@ -1456,6 +1769,20 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
        __set_sit_entry_type(sbi, type, curseg->segno, modified);
 }
 
+static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
+{
+       /* if segs_per_sec is larger than 1, we need to keep original policy. */
+       if (sbi->segs_per_sec != 1)
+               return CURSEG_I(sbi, type)->segno;
+
+       if (type == CURSEG_HOT_DATA || IS_NODESEG(type))
+               return 0;
+
+       if (SIT_I(sbi)->last_victim[ALLOC_NEXT])
+               return SIT_I(sbi)->last_victim[ALLOC_NEXT];
+       return CURSEG_I(sbi, type)->segno;
+}
+
 /*
  * Allocate a current working segment.
  * This function always allocates a free segment in LFS manner.
@@ -1474,6 +1801,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
        if (test_opt(sbi, NOHEAP))
                dir = ALLOC_RIGHT;
 
+       segno = __get_next_segno(sbi, type);
        get_new_segment(sbi, &segno, new_sec, dir);
        curseg->next_segno = segno;
        reset_curseg(sbi, type, 1);
@@ -1549,12 +1877,15 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
 {
        struct curseg_info *curseg = CURSEG_I(sbi, type);
        const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops;
+       unsigned segno = NULL_SEGNO;
        int i, cnt;
        bool reversed = false;
 
        /* need_SSR() already forces to do this */
-       if (v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR))
+       if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) {
+               curseg->next_segno = segno;
                return 1;
+       }
 
        /* For node segments, let's do SSR more intensively */
        if (IS_NODESEG(type)) {
@@ -1578,9 +1909,10 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
        for (; cnt-- > 0; reversed ? i-- : i++) {
                if (i == type)
                        continue;
-               if (v_ops->get_victim(sbi, &(curseg)->next_segno,
-                                               BG_GC, i, SSR))
+               if (v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) {
+                       curseg->next_segno = segno;
                        return 1;
+               }
        }
        return 0;
 }
@@ -1592,17 +1924,21 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
 static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
                                                int type, bool force)
 {
+       struct curseg_info *curseg = CURSEG_I(sbi, type);
+
        if (force)
                new_curseg(sbi, type, true);
        else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) &&
                                        type == CURSEG_WARM_NODE)
                new_curseg(sbi, type, false);
+       else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
+               new_curseg(sbi, type, false);
        else if (need_SSR(sbi) && get_ssr_segment(sbi, type))
                change_curseg(sbi, type, true);
        else
                new_curseg(sbi, type, false);
 
-       stat_inc_seg_type(sbi, CURSEG_I(sbi, type));
+       stat_inc_seg_type(sbi, curseg);
 }
 
 void allocate_new_segments(struct f2fs_sb_info *sbi)
@@ -1734,18 +2070,16 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type)
        if (p_type == DATA) {
                struct inode *inode = page->mapping->host;
 
-               if (S_ISDIR(inode->i_mode))
-                       return CURSEG_HOT_DATA;
-               else if (is_cold_data(page) || file_is_cold(inode))
+               if (is_cold_data(page) || file_is_cold(inode))
                        return CURSEG_COLD_DATA;
-               else
-                       return CURSEG_WARM_DATA;
+               if (is_inode_flag_set(inode, FI_HOT_DATA))
+                       return CURSEG_HOT_DATA;
+               return CURSEG_WARM_DATA;
        } else {
                if (IS_DNODE(page))
                        return is_cold_node(page) ? CURSEG_WARM_NODE :
                                                CURSEG_HOT_NODE;
-               else
-                       return CURSEG_COLD_NODE;
+               return CURSEG_COLD_NODE;
        }
 }
 
@@ -1788,15 +2122,14 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 
        stat_inc_block_count(sbi, curseg);
 
+       if (!__has_curseg_space(sbi, type))
+               sit_i->s_ops->allocate_segment(sbi, type, false);
        /*
-        * SIT information should be updated before segment allocation,
-        * since SSR needs latest valid block information.
+        * SIT information should be updated after segment allocation,
+        * since we need to keep dirty segments precisely under SSR.
         */
        refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
 
-       if (!__has_curseg_space(sbi, type))
-               sit_i->s_ops->allocate_segment(sbi, type, false);
-
        mutex_unlock(&sit_i->sentry_lock);
 
        if (page && IS_NODESEG(type))
@@ -1868,11 +2201,11 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio)
        f2fs_update_data_blkaddr(dn, fio->new_blkaddr);
 }
 
-void rewrite_data_page(struct f2fs_io_info *fio)
+int rewrite_data_page(struct f2fs_io_info *fio)
 {
        fio->new_blkaddr = fio->old_blkaddr;
        stat_inc_inplace_blocks(fio->sbi);
-       f2fs_submit_page_mbio(fio);
+       return f2fs_submit_page_bio(fio);
 }
 
 void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
@@ -2437,7 +2770,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
                        se = get_seg_entry(sbi, segno);
 
                        /* add discard candidates */
-                       if (cpc->reason != CP_DISCARD) {
+                       if (!(cpc->reason & CP_DISCARD)) {
                                cpc->trim_start = segno;
                                add_discard_addrs(sbi, cpc, false);
                        }
@@ -2473,7 +2806,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
        f2fs_bug_on(sbi, !list_empty(head));
        f2fs_bug_on(sbi, sit_i->dirty_sentries);
 out:
-       if (cpc->reason == CP_DISCARD) {
+       if (cpc->reason & CP_DISCARD) {
                __u64 trim_start = cpc->trim_start;
 
                for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++)
@@ -2672,10 +3005,17 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
 
                        /* build discard map only one time */
                        if (f2fs_discard_en(sbi)) {
-                               memcpy(se->discard_map, se->cur_valid_map,
-                                                       SIT_VBLOCK_MAP_SIZE);
-                               sbi->discard_blks += sbi->blocks_per_seg -
-                                                       se->valid_blocks;
+                               if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
+                                       memset(se->discard_map, 0xff,
+                                               SIT_VBLOCK_MAP_SIZE);
+                               } else {
+                                       memcpy(se->discard_map,
+                                               se->cur_valid_map,
+                                               SIT_VBLOCK_MAP_SIZE);
+                                       sbi->discard_blks +=
+                                               sbi->blocks_per_seg -
+                                               se->valid_blocks;
+                               }
                        }
 
                        if (sbi->segs_per_sec > 1)
@@ -2699,10 +3039,15 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
                seg_info_from_raw_sit(se, &sit);
 
                if (f2fs_discard_en(sbi)) {
-                       memcpy(se->discard_map, se->cur_valid_map,
-                                               SIT_VBLOCK_MAP_SIZE);
-                       sbi->discard_blks += old_valid_blocks -
-                                               se->valid_blocks;
+                       if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
+                               memset(se->discard_map, 0xff,
+                                                       SIT_VBLOCK_MAP_SIZE);
+                       } else {
+                               memcpy(se->discard_map, se->cur_valid_map,
+                                                       SIT_VBLOCK_MAP_SIZE);
+                               sbi->discard_blks += old_valid_blocks -
+                                                       se->valid_blocks;
+                       }
                }
 
                if (sbi->segs_per_sec > 1)
@@ -2746,7 +3091,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
                if (segno >= MAIN_SEGS(sbi))
                        break;
                offset = segno + 1;
-               valid_blocks = get_valid_blocks(sbi, segno, 0);
+               valid_blocks = get_valid_blocks(sbi, segno, false);
                if (valid_blocks == sbi->blocks_per_seg || !valid_blocks)
                        continue;
                if (valid_blocks > sbi->blocks_per_seg) {
@@ -2852,6 +3197,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
                sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
        sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
        sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
+       sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS;
 
        sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS;
 
@@ -2988,7 +3334,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
        if (!sm_info)
                return;
        destroy_flush_cmd_control(sbi, true);
-       destroy_discard_cmd_control(sbi, true);
+       destroy_discard_cmd_control(sbi);
        destroy_dirty_segmap(sbi);
        destroy_curseg(sbi);
        destroy_free_segmap(sbi);
index 5e8ad4280a5016d293817115a31a94daaf8d2d49..10bf05d4cff49be8152a8e6aa434ec0ccb0827d4 100644 (file)
 #define F2FS_MIN_SEGMENTS      9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */
 
 /* L: Logical segment # in volume, R: Relative segment # in main area */
-#define GET_L2R_SEGNO(free_i, segno)   (segno - free_i->start_segno)
-#define GET_R2L_SEGNO(free_i, segno)   (segno + free_i->start_segno)
+#define GET_L2R_SEGNO(free_i, segno)   ((segno) - (free_i)->start_segno)
+#define GET_R2L_SEGNO(free_i, segno)   ((segno) + (free_i)->start_segno)
 
-#define IS_DATASEG(t)  (t <= CURSEG_COLD_DATA)
-#define IS_NODESEG(t)  (t >= CURSEG_HOT_NODE)
+#define IS_DATASEG(t)  ((t) <= CURSEG_COLD_DATA)
+#define IS_NODESEG(t)  ((t) >= CURSEG_HOT_NODE)
 
 #define IS_CURSEG(sbi, seg)                                            \
-       ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) ||      \
-        (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) ||     \
-        (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) ||     \
-        (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) ||      \
-        (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) ||     \
-        (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
+       (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) ||    \
+        ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) ||   \
+        ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) ||   \
+        ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) ||    \
+        ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) ||   \
+        ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
 
 #define IS_CURSEC(sbi, secno)                                          \
-       ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno /              \
-         sbi->segs_per_sec) || \
-        (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno /             \
-         sbi->segs_per_sec) || \
-        (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno /             \
-         sbi->segs_per_sec) || \
-        (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno /              \
-         sbi->segs_per_sec) || \
-        (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno /             \
-         sbi->segs_per_sec) || \
-        (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno /             \
-         sbi->segs_per_sec))   \
+       (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno /            \
+         (sbi)->segs_per_sec) ||       \
+        ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno /           \
+         (sbi)->segs_per_sec) ||       \
+        ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno /           \
+         (sbi)->segs_per_sec) ||       \
+        ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno /            \
+         (sbi)->segs_per_sec) ||       \
+        ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno /           \
+         (sbi)->segs_per_sec) ||       \
+        ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno /           \
+         (sbi)->segs_per_sec)) \
 
 #define MAIN_BLKADDR(sbi)      (SM_I(sbi)->main_blkaddr)
 #define SEG0_BLKADDR(sbi)      (SM_I(sbi)->seg0_blkaddr)
 
 #define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments)
-#define MAIN_SECS(sbi) (sbi->total_sections)
+#define MAIN_SECS(sbi) ((sbi)->total_sections)
 
 #define TOTAL_SEGS(sbi)        (SM_I(sbi)->segment_count)
-#define TOTAL_BLKS(sbi)        (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg)
+#define TOTAL_BLKS(sbi)        (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg)
 
 #define MAX_BLKADDR(sbi)       (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi))
-#define SEGMENT_SIZE(sbi)      (1ULL << (sbi->log_blocksize +          \
-                                       sbi->log_blocks_per_seg))
+#define SEGMENT_SIZE(sbi)      (1ULL << ((sbi)->log_blocksize +        \
+                                       (sbi)->log_blocks_per_seg))
 
 #define START_BLOCK(sbi, segno)        (SEG0_BLKADDR(sbi) +                    \
-        (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg))
+        (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg))
 
 #define NEXT_FREE_BLKADDR(sbi, curseg)                                 \
-       (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff)
+       (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff)
 
 #define GET_SEGOFF_FROM_SEG0(sbi, blk_addr)    ((blk_addr) - SEG0_BLKADDR(sbi))
 #define GET_SEGNO_FROM_SEG0(sbi, blk_addr)                             \
-       (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
+       (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg)
 #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr)                            \
-       (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1))
+       (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1))
 
 #define GET_SEGNO(sbi, blk_addr)                                       \
-       (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ?          \
+       ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ?      \
        NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi),                 \
                GET_SEGNO_FROM_SEG0(sbi, blk_addr)))
-#define GET_SECNO(sbi, segno)                                  \
-       ((segno) / sbi->segs_per_sec)
-#define GET_ZONENO_FROM_SEGNO(sbi, segno)                              \
-       ((segno / sbi->segs_per_sec) / sbi->secs_per_zone)
+#define BLKS_PER_SEC(sbi)                                      \
+       ((sbi)->segs_per_sec * (sbi)->blocks_per_seg)
+#define GET_SEC_FROM_SEG(sbi, segno)                           \
+       ((segno) / (sbi)->segs_per_sec)
+#define GET_SEG_FROM_SEC(sbi, secno)                           \
+       ((secno) * (sbi)->segs_per_sec)
+#define GET_ZONE_FROM_SEC(sbi, secno)                          \
+       ((secno) / (sbi)->secs_per_zone)
+#define GET_ZONE_FROM_SEG(sbi, segno)                          \
+       GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno))
 
 #define GET_SUM_BLOCK(sbi, segno)                              \
-       ((sbi->sm_info->ssa_blkaddr) + segno)
+       ((sbi)->sm_info->ssa_blkaddr + (segno))
 
 #define GET_SUM_TYPE(footer) ((footer)->entry_type)
-#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type)
+#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type))
 
 #define SIT_ENTRY_OFFSET(sit_i, segno)                                 \
-       (segno % sit_i->sents_per_block)
+       ((segno) % (sit_i)->sents_per_block)
 #define SIT_BLOCK_OFFSET(segno)                                        \
-       (segno / SIT_ENTRY_PER_BLOCK)
+       ((segno) / SIT_ENTRY_PER_BLOCK)
 #define        START_SEGNO(segno)              \
        (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK)
 #define SIT_BLK_CNT(sbi)                       \
 #define SECTOR_FROM_BLOCK(blk_addr)                                    \
        (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK)
 #define SECTOR_TO_BLOCK(sectors)                                       \
-       (sectors >> F2FS_LOG_SECTORS_PER_BLOCK)
+       ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK)
 
 /*
  * indicate a block allocation direction: RIGHT and LEFT.
@@ -132,7 +138,10 @@ enum {
  */
 enum {
        GC_CB = 0,
-       GC_GREEDY
+       GC_GREEDY,
+       ALLOC_NEXT,
+       FLUSH_DEVICE,
+       MAX_GC_POLICY,
 };
 
 /*
@@ -227,6 +236,8 @@ struct sit_info {
        unsigned long long mounted_time;        /* mount time */
        unsigned long long min_mtime;           /* min. modification time */
        unsigned long long max_mtime;           /* max. modification time */
+
+       unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */
 };
 
 struct free_segmap_info {
@@ -303,17 +314,17 @@ static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi,
                                                unsigned int segno)
 {
        struct sit_info *sit_i = SIT_I(sbi);
-       return &sit_i->sec_entries[GET_SECNO(sbi, segno)];
+       return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)];
 }
 
 static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi,
-                               unsigned int segno, int section)
+                               unsigned int segno, bool use_section)
 {
        /*
         * In order to get # of valid blocks in a section instantly from many
         * segments, f2fs manages two counting structures separately.
         */
-       if (section > 1)
+       if (use_section && sbi->segs_per_sec > 1)
                return get_sec_entry(sbi, segno)->valid_blocks;
        else
                return get_seg_entry(sbi, segno)->valid_blocks;
@@ -358,8 +369,8 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i,
 static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
 {
        struct free_segmap_info *free_i = FREE_I(sbi);
-       unsigned int secno = segno / sbi->segs_per_sec;
-       unsigned int start_segno = secno * sbi->segs_per_sec;
+       unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+       unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
        unsigned int next;
 
        spin_lock(&free_i->segmap_lock);
@@ -379,7 +390,8 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi,
                unsigned int segno)
 {
        struct free_segmap_info *free_i = FREE_I(sbi);
-       unsigned int secno = segno / sbi->segs_per_sec;
+       unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+
        set_bit(segno, free_i->free_segmap);
        free_i->free_segments--;
        if (!test_and_set_bit(secno, free_i->free_secmap))
@@ -390,8 +402,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
                unsigned int segno)
 {
        struct free_segmap_info *free_i = FREE_I(sbi);
-       unsigned int secno = segno / sbi->segs_per_sec;
-       unsigned int start_segno = secno * sbi->segs_per_sec;
+       unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+       unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
        unsigned int next;
 
        spin_lock(&free_i->segmap_lock);
@@ -412,7 +424,8 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
                unsigned int segno)
 {
        struct free_segmap_info *free_i = FREE_I(sbi);
-       unsigned int secno = segno / sbi->segs_per_sec;
+       unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+
        spin_lock(&free_i->segmap_lock);
        if (!test_and_set_bit(segno, free_i->free_segmap)) {
                free_i->free_segments--;
@@ -477,12 +490,12 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi)
 
 static inline int overprovision_sections(struct f2fs_sb_info *sbi)
 {
-       return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec;
+       return GET_SEC_FROM_SEG(sbi, (unsigned int)overprovision_segments(sbi));
 }
 
 static inline int reserved_sections(struct f2fs_sb_info *sbi)
 {
-       return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec;
+       return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi));
 }
 
 static inline bool need_SSR(struct f2fs_sb_info *sbi)
@@ -495,7 +508,7 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi)
                return false;
 
        return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs +
-                                               reserved_sections(sbi) + 1);
+                                               2 * reserved_sections(sbi));
 }
 
 static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
@@ -540,6 +553,7 @@ static inline int utilization(struct f2fs_sb_info *sbi)
  */
 #define DEF_MIN_IPU_UTIL       70
 #define DEF_MIN_FSYNC_BLOCKS   8
+#define DEF_MIN_HOT_BLOCKS     16
 
 enum {
        F2FS_IPU_FORCE,
@@ -547,17 +561,15 @@ enum {
        F2FS_IPU_UTIL,
        F2FS_IPU_SSR_UTIL,
        F2FS_IPU_FSYNC,
+       F2FS_IPU_ASYNC,
 };
 
-static inline bool need_inplace_update(struct inode *inode)
+static inline bool need_inplace_update_policy(struct inode *inode,
+                               struct f2fs_io_info *fio)
 {
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        unsigned int policy = SM_I(sbi)->ipu_policy;
 
-       /* IPU can be done only for the user data */
-       if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
-               return false;
-
        if (test_opt(sbi, LFS))
                return false;
 
@@ -572,6 +584,15 @@ static inline bool need_inplace_update(struct inode *inode)
                        utilization(sbi) > SM_I(sbi)->min_ipu_util)
                return true;
 
+       /*
+        * IPU for rewriting async pages
+        */
+       if (policy & (0x1 << F2FS_IPU_ASYNC) &&
+                       fio && fio->op == REQ_OP_WRITE &&
+                       !(fio->op_flags & REQ_SYNC) &&
+                       !f2fs_encrypted_inode(inode))
+               return true;
+
        /* this is only set during fdatasync */
        if (policy & (0x1 << F2FS_IPU_FSYNC) &&
                        is_inode_flag_set(inode, FI_NEED_IPU))
@@ -719,7 +740,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type)
 static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi,
                                                unsigned int secno)
 {
-       if (get_valid_blocks(sbi, secno, sbi->segs_per_sec) >=
+       if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >=
                                                sbi->fggc_threshold)
                return true;
        return false;
index 96fe8ed7310001c666c5f370ea735b5aecdc2518..83355ec4a92cdeb86d4be8d4630e01e7b0c96a28 100644 (file)
@@ -49,6 +49,7 @@ char *fault_name[FAULT_MAX] = {
        [FAULT_BLOCK]           = "no more block",
        [FAULT_DIR_DEPTH]       = "too big dir depth",
        [FAULT_EVICT_INODE]     = "evict_inode fail",
+       [FAULT_TRUNCATE]        = "truncate fail",
        [FAULT_IO]              = "IO error",
        [FAULT_CHECKPOINT]      = "checkpoint error",
 };
@@ -82,6 +83,7 @@ enum {
        Opt_discard,
        Opt_nodiscard,
        Opt_noheap,
+       Opt_heap,
        Opt_user_xattr,
        Opt_nouser_xattr,
        Opt_acl,
@@ -116,6 +118,7 @@ static match_table_t f2fs_tokens = {
        {Opt_discard, "discard"},
        {Opt_nodiscard, "nodiscard"},
        {Opt_noheap, "no_heap"},
+       {Opt_heap, "heap"},
        {Opt_user_xattr, "user_xattr"},
        {Opt_nouser_xattr, "nouser_xattr"},
        {Opt_acl, "acl"},
@@ -293,6 +296,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks);
 F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
 F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
 F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
@@ -318,6 +322,7 @@ static struct attribute *f2fs_attrs[] = {
        ATTR_LIST(ipu_policy),
        ATTR_LIST(min_ipu_util),
        ATTR_LIST(min_fsync_blocks),
+       ATTR_LIST(min_hot_blocks),
        ATTR_LIST(max_victim_search),
        ATTR_LIST(dir_level),
        ATTR_LIST(ram_thresh),
@@ -436,6 +441,9 @@ static int parse_options(struct super_block *sb, char *options)
                case Opt_noheap:
                        set_opt(sbi, NOHEAP);
                        break;
+               case Opt_heap:
+                       clear_opt(sbi, NOHEAP);
+                       break;
 #ifdef CONFIG_F2FS_FS_XATTR
                case Opt_user_xattr:
                        set_opt(sbi, XATTR_USER);
@@ -787,7 +795,14 @@ static void f2fs_put_super(struct super_block *sb)
        }
 
        /* be sure to wait for any on-going discard commands */
-       f2fs_wait_discard_bio(sbi, NULL_ADDR);
+       f2fs_wait_discard_bios(sbi);
+
+       if (!sbi->discard_blks) {
+               struct cp_control cpc = {
+                       .reason = CP_UMOUNT | CP_TRIMMED,
+               };
+               write_checkpoint(sbi, &cpc);
+       }
 
        /* write_checkpoint can update stat information */
        f2fs_destroy_stats(sbi);
@@ -913,7 +928,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
        if (test_opt(sbi, DISCARD))
                seq_puts(seq, ",discard");
        if (test_opt(sbi, NOHEAP))
-               seq_puts(seq, ",no_heap_alloc");
+               seq_puts(seq, ",no_heap");
+       else
+               seq_puts(seq, ",heap");
 #ifdef CONFIG_F2FS_FS_XATTR
        if (test_opt(sbi, XATTR_USER))
                seq_puts(seq, ",user_xattr");
@@ -986,7 +1003,7 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset)
                if ((i % 10) == 0)
                        seq_printf(seq, "%-10d", i);
                seq_printf(seq, "%d|%-3u", se->type,
-                                       get_valid_blocks(sbi, i, 1));
+                                       get_valid_blocks(sbi, i, false));
                if ((i % 10) == 9 || i == (total_segs - 1))
                        seq_putc(seq, '\n');
                else
@@ -1012,7 +1029,7 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset)
 
                seq_printf(seq, "%-10d", i);
                seq_printf(seq, "%d|%-3u|", se->type,
-                                       get_valid_blocks(sbi, i, 1));
+                                       get_valid_blocks(sbi, i, false));
                for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++)
                        seq_printf(seq, " %.2x", se->cur_valid_map[j]);
                seq_putc(seq, '\n');
@@ -1046,6 +1063,7 @@ static void default_options(struct f2fs_sb_info *sbi)
        set_opt(sbi, INLINE_DATA);
        set_opt(sbi, INLINE_DENTRY);
        set_opt(sbi, EXTENT_CACHE);
+       set_opt(sbi, NOHEAP);
        sbi->sb->s_flags |= MS_LAZYTIME;
        set_opt(sbi, FLUSH_MERGE);
        if (f2fs_sb_mounted_blkzoned(sbi->sb)) {
@@ -1307,7 +1325,7 @@ static int __f2fs_commit_super(struct buffer_head *bh,
        unlock_buffer(bh);
 
        /* it's a rare case, so we can do FUA all the time */
-       return __sync_dirty_buffer(bh, REQ_PREFLUSH | REQ_FUA);
+       return __sync_dirty_buffer(bh, REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
 }
 
 static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi,
@@ -1483,6 +1501,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
                return 1;
        }
 
+       if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) {
+               f2fs_msg(sb, KERN_INFO,
+                       "Invalid segment count (%u)",
+                       le32_to_cpu(raw_super->segment_count));
+               return 1;
+       }
+
        /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */
        if (sanity_check_area_boundary(sbi, bh))
                return 1;
@@ -1555,6 +1580,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
        for (i = 0; i < NR_COUNT_TYPE; i++)
                atomic_set(&sbi->nr_pages[i], 0);
 
+       atomic_set(&sbi->wb_sync_req, 0);
+
        INIT_LIST_HEAD(&sbi->s_list);
        mutex_init(&sbi->umount_mutex);
        mutex_init(&sbi->wio_mutex[NODE]);
@@ -1917,6 +1944,7 @@ try_onemore:
        mutex_init(&sbi->gc_mutex);
        mutex_init(&sbi->cp_mutex);
        init_rwsem(&sbi->node_write);
+       init_rwsem(&sbi->node_change);
 
        /* disallow all the data/node/meta page writes */
        set_sbi_flag(sbi, SBI_POR_DOING);
@@ -2022,6 +2050,10 @@ try_onemore:
 
        f2fs_join_shrinker(sbi);
 
+       err = f2fs_build_stats(sbi);
+       if (err)
+               goto free_nm;
+
        /* if there are any orphan inodes, free them */
        err = recover_orphan_inodes(sbi);
        if (err)
@@ -2046,10 +2078,6 @@ try_onemore:
                goto free_root_inode;
        }
 
-       err = f2fs_build_stats(sbi);
-       if (err)
-               goto free_root_inode;
-
        if (f2fs_proc_root)
                sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
 
@@ -2143,7 +2171,6 @@ free_proc:
                remove_proc_entry("segment_bits", sbi->s_proc);
                remove_proc_entry(sb->s_id, f2fs_proc_root);
        }
-       f2fs_destroy_stats(sbi);
 free_root_inode:
        dput(sb->s_root);
        sb->s_root = NULL;
@@ -2161,6 +2188,7 @@ free_node_inode:
        truncate_inode_pages_final(META_MAPPING(sbi));
        iput(sbi->node_inode);
        mutex_unlock(&sbi->umount_mutex);
+       f2fs_destroy_stats(sbi);
 free_nm:
        destroy_node_manager(sbi);
 free_sm:
index 73b4e1d1912a7ff7370c1c6f2d649b770ed70acc..bccbbf2616d2b2cd3e5c3380b809f545dc1802a9 100644 (file)
@@ -59,7 +59,7 @@ void f2fs_trace_pid(struct page *page)
        pid_t pid = task_pid_nr(current);
        void *p;
 
-       page->private = pid;
+       set_page_private(page, (unsigned long)pid);
 
        if (radix_tree_preload(GFP_NOFS))
                return;
@@ -138,7 +138,7 @@ static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index,
 
        radix_tree_for_each_slot(slot, &pids, &iter, first_index) {
                results[ret] = iter.index;
-               if (++ret == PIDVEC_SIZE)
+               if (++ret == max_items)
                        break;
        }
        return ret;
index 7298a4488f7f59cfc68587c4fbc3b1ac6293f9aa..832c5110abab51208e77751056436e7ba7e7ff16 100644 (file)
@@ -250,15 +250,13 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
        void *cur_addr, *txattr_addr, *last_addr = NULL;
        nid_t xnid = F2FS_I(inode)->i_xattr_nid;
        unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0;
-       unsigned int inline_size = 0;
+       unsigned int inline_size = inline_xattr_size(inode);
        int err = 0;
 
-       inline_size = inline_xattr_size(inode);
-
        if (!size && !inline_size)
                return -ENODATA;
 
-       txattr_addr = kzalloc(inline_size + size + sizeof(__u32),
+       txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE,
                                                        GFP_F2FS_ZERO);
        if (!txattr_addr)
                return -ENOMEM;
@@ -328,13 +326,14 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage,
 {
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        struct f2fs_xattr_header *header;
-       size_t size = PAGE_SIZE, inline_size = 0;
+       nid_t xnid = F2FS_I(inode)->i_xattr_nid;
+       unsigned int size = VALID_XATTR_BLOCK_SIZE;
+       unsigned int inline_size = inline_xattr_size(inode);
        void *txattr_addr;
        int err;
 
-       inline_size = inline_xattr_size(inode);
-
-       txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO);
+       txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE,
+                                                       GFP_F2FS_ZERO);
        if (!txattr_addr)
                return -ENOMEM;
 
@@ -358,19 +357,19 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage,
        }
 
        /* read from xattr node block */
-       if (F2FS_I(inode)->i_xattr_nid) {
+       if (xnid) {
                struct page *xpage;
                void *xattr_addr;
 
                /* The inode already has an extended attribute block. */
-               xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid);
+               xpage = get_node_page(sbi, xnid);
                if (IS_ERR(xpage)) {
                        err = PTR_ERR(xpage);
                        goto fail;
                }
 
                xattr_addr = page_address(xpage);
-               memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE);
+               memcpy(txattr_addr + inline_size, xattr_addr, size);
                f2fs_put_page(xpage, 1);
        }
 
@@ -392,14 +391,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
                                void *txattr_addr, struct page *ipage)
 {
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-       size_t inline_size = 0;
+       size_t inline_size = inline_xattr_size(inode);
        void *xattr_addr;
        struct page *xpage;
        nid_t new_nid = 0;
        int err;
 
-       inline_size = inline_xattr_size(inode);
-
        if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid)
                if (!alloc_nid(sbi, &new_nid))
                        return -ENOSPC;
@@ -454,7 +451,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
        }
 
        xattr_addr = page_address(xpage);
-       memcpy(xattr_addr, txattr_addr + inline_size, MAX_XATTR_BLOCK_SIZE);
+       memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE);
        set_page_dirty(xpage);
        f2fs_put_page(xpage, 1);
 
@@ -546,7 +543,9 @@ static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry,
                                        const void *value, size_t size)
 {
        void *pval = entry->e_name + entry->e_name_len;
-       return (entry->e_value_size == size) && !memcmp(pval, value, size);
+
+       return (le16_to_cpu(entry->e_value_size) == size) &&
+                                       !memcmp(pval, value, size);
 }
 
 static int __f2fs_setxattr(struct inode *inode, int index,
index d5a94928c11695cde054f6143cb604d0bf850fc0..dbcd1d16e66982e07233e66fe400e95597a6c2df 100644 (file)
@@ -58,10 +58,10 @@ struct f2fs_xattr_entry {
 #define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1))
 #define XATTR_ROUND            (3)
 
-#define XATTR_ALIGN(size)      ((size + XATTR_ROUND) & ~XATTR_ROUND)
+#define XATTR_ALIGN(size)      (((size) + XATTR_ROUND) & ~XATTR_ROUND)
 
 #define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \
-                       entry->e_name_len + le16_to_cpu(entry->e_value_size)))
+                       (entry)->e_name_len + le16_to_cpu((entry)->e_value_size)))
 
 #define XATTR_NEXT_ENTRY(entry)        ((struct f2fs_xattr_entry *)((char *)(entry) +\
                        ENTRY_SIZE(entry)))
@@ -72,8 +72,8 @@ struct f2fs_xattr_entry {
                for (entry = XATTR_FIRST_ENTRY(addr);\
                                !IS_XATTR_LAST_ENTRY(entry);\
                                entry = XATTR_NEXT_ENTRY(entry))
-#define MAX_XATTR_BLOCK_SIZE   (PAGE_SIZE - sizeof(struct node_footer))
-#define VALID_XATTR_BLOCK_SIZE (MAX_XATTR_BLOCK_SIZE - sizeof(__u32))
+#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer))
+#define XATTR_PADDING_SIZE     (sizeof(__u32))
 #define MIN_OFFSET(i)          XATTR_ALIGN(inline_xattr_size(i) +      \
                                                VALID_XATTR_BLOCK_SIZE)
 
index e2d239ed4c60cb05fbf6072e1a3a3ea4e34594f3..b6feed6547ce92862acaf3153c5859d3eb132ccb 100644 (file)
@@ -32,9 +32,9 @@
 /* 0, 1(node nid), 2(meta nid) are reserved node id */
 #define F2FS_RESERVED_NODE_NUM         3
 
-#define F2FS_ROOT_INO(sbi)     (sbi->root_ino_num)
-#define F2FS_NODE_INO(sbi)     (sbi->node_ino_num)
-#define F2FS_META_INO(sbi)     (sbi->meta_ino_num)
+#define F2FS_ROOT_INO(sbi)     ((sbi)->root_ino_num)
+#define F2FS_NODE_INO(sbi)     ((sbi)->node_ino_num)
+#define F2FS_META_INO(sbi)     ((sbi)->meta_ino_num)
 
 #define F2FS_IO_SIZE(sbi)      (1 << (sbi)->write_io_size_bits) /* Blocks */
 #define F2FS_IO_SIZE_KB(sbi)   (1 << ((sbi)->write_io_size_bits + 2)) /* KB */
@@ -114,6 +114,7 @@ struct f2fs_super_block {
 /*
  * For checkpoint
  */
+#define CP_TRIMMED_FLAG                0x00000100
 #define CP_NAT_BITS_FLAG       0x00000080
 #define CP_CRC_RECOVERY_FLAG   0x00000040
 #define CP_FASTBOOT_FLAG       0x00000020
@@ -161,7 +162,7 @@ struct f2fs_checkpoint {
  */
 #define F2FS_ORPHANS_PER_BLOCK 1020
 
-#define GET_ORPHAN_BLOCKS(n)   ((n + F2FS_ORPHANS_PER_BLOCK - 1) / \
+#define GET_ORPHAN_BLOCKS(n)   (((n) + F2FS_ORPHANS_PER_BLOCK - 1) / \
                                        F2FS_ORPHANS_PER_BLOCK)
 
 struct f2fs_orphan_block {
@@ -301,6 +302,12 @@ struct f2fs_nat_block {
 #define SIT_VBLOCK_MAP_SIZE 64
 #define SIT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_sit_entry))
 
+/*
+ * F2FS uses 4 bytes to represent a block address. As a result, the maximum
+ * supported disk size is 16 TB, which equals 16 * 1024 * 1024 / 2 segments.
+ */
+#define F2FS_MAX_SEGMENT       ((16 * 1024 * 1024) / 2)
+
 /*
  * Note that f2fs_sit_entry->vblocks has the following bit-field information.
  * [15:10] : allocation type such as CURSEG_XXXX_TYPE
@@ -449,7 +456,7 @@ typedef __le32      f2fs_hash_t;
 #define F2FS_SLOT_LEN          8
 #define F2FS_SLOT_LEN_BITS     3
 
-#define GET_DENTRY_SLOTS(x)    ((x + F2FS_SLOT_LEN - 1) >> F2FS_SLOT_LEN_BITS)
+#define GET_DENTRY_SLOTS(x) (((x) + F2FS_SLOT_LEN - 1) >> F2FS_SLOT_LEN_BITS)
 
 /* MAX level for dir lookup */
 #define MAX_DIR_HASH_DEPTH     63
index c80fcad0a6c97b6975628fe6dcd236863f295c51..15da88c5c3a4d12ef12ff56a7e6cde8a039920b7 100644 (file)
@@ -15,6 +15,8 @@ TRACE_DEFINE_ENUM(META);
 TRACE_DEFINE_ENUM(META_FLUSH);
 TRACE_DEFINE_ENUM(INMEM);
 TRACE_DEFINE_ENUM(INMEM_DROP);
+TRACE_DEFINE_ENUM(INMEM_INVALIDATE);
+TRACE_DEFINE_ENUM(INMEM_REVOKE);
 TRACE_DEFINE_ENUM(IPU);
 TRACE_DEFINE_ENUM(OPU);
 TRACE_DEFINE_ENUM(CURSEG_HOT_DATA);
@@ -42,6 +44,7 @@ TRACE_DEFINE_ENUM(CP_FASTBOOT);
 TRACE_DEFINE_ENUM(CP_SYNC);
 TRACE_DEFINE_ENUM(CP_RECOVERY);
 TRACE_DEFINE_ENUM(CP_DISCARD);
+TRACE_DEFINE_ENUM(CP_TRIMMED);
 
 #define show_block_type(type)                                          \
        __print_symbolic(type,                                          \
@@ -51,12 +54,13 @@ TRACE_DEFINE_ENUM(CP_DISCARD);
                { META_FLUSH,   "META_FLUSH" },                         \
                { INMEM,        "INMEM" },                              \
                { INMEM_DROP,   "INMEM_DROP" },                         \
+               { INMEM_INVALIDATE,     "INMEM_INVALIDATE" },           \
                { INMEM_REVOKE, "INMEM_REVOKE" },                       \
                { IPU,          "IN-PLACE" },                           \
                { OPU,          "OUT-OF-PLACE" })
 
-#define F2FS_OP_FLAGS (REQ_RAHEAD | REQ_SYNC | REQ_PREFLUSH | REQ_META |\
-                       REQ_PRIO)
+#define F2FS_OP_FLAGS (REQ_RAHEAD | REQ_SYNC | REQ_META | REQ_PRIO |   \
+                       REQ_PREFLUSH | REQ_FUA)
 #define F2FS_BIO_FLAG_MASK(t)  (t & F2FS_OP_FLAGS)
 
 #define show_bio_type(op,op_flags)     show_bio_op(op),                \
@@ -75,16 +79,13 @@ TRACE_DEFINE_ENUM(CP_DISCARD);
                { REQ_OP_WRITE_ZEROES,          "WRITE_ZEROES" })
 
 #define show_bio_op_flags(flags)                                       \
-       __print_symbolic(F2FS_BIO_FLAG_MASK(flags),                     \
-               { REQ_RAHEAD,           "(RA)" },                       \
-               { REQ_SYNC,             "(S)" },                        \
-               { REQ_SYNC | REQ_PRIO,  "(SP)" },                       \
-               { REQ_META,             "(M)" },                        \
-               { REQ_META | REQ_PRIO,  "(MP)" },                       \
-               { REQ_SYNC | REQ_PREFLUSH , "(SF)" },                   \
-               { REQ_SYNC | REQ_META | REQ_PRIO, "(SMP)" },            \
-               { REQ_PREFLUSH | REQ_META | REQ_PRIO, "(FMP)" },        \
-               { 0, " \b" })
+       __print_flags(F2FS_BIO_FLAG_MASK(flags), "|",                   \
+               { REQ_RAHEAD,           "R" },                          \
+               { REQ_SYNC,             "S" },                          \
+               { REQ_META,             "M" },                          \
+               { REQ_PRIO,             "P" },                          \
+               { REQ_PREFLUSH,         "PF" },                         \
+               { REQ_FUA,              "FUA" })
 
 #define show_data_type(type)                                           \
        __print_symbolic(type,                                          \
@@ -117,12 +118,14 @@ TRACE_DEFINE_ENUM(CP_DISCARD);
                { GC_CB,        "Cost-Benefit" })
 
 #define show_cpreason(type)                                            \
-       __print_symbolic(type,                                          \
+       __print_flags(type, "|",                                        \
                { CP_UMOUNT,    "Umount" },                             \
                { CP_FASTBOOT,  "Fastboot" },                           \
                { CP_SYNC,      "Sync" },                               \
                { CP_RECOVERY,  "Recovery" },                           \
-               { CP_DISCARD,   "Discard" })
+               { CP_DISCARD,   "Discard" },                            \
+               { CP_UMOUNT,    "Umount" },                             \
+               { CP_TRIMMED,   "Trimmed" })
 
 struct victim_sel_policy;
 struct f2fs_map_blocks;
@@ -769,7 +772,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio,
        ),
 
        TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, "
-               "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s%s, type = %s",
+               "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s(%s), type = %s",
                show_dev_ino(__entry),
                (unsigned long)__entry->index,
                (unsigned long long)__entry->old_blkaddr,
@@ -822,7 +825,7 @@ DECLARE_EVENT_CLASS(f2fs__bio,
                __entry->size           = bio->bi_iter.bi_size;
        ),
 
-       TP_printk("dev = (%d,%d)/(%d,%d), rw = %s%s, %s, sector = %lld, size = %u",
+       TP_printk("dev = (%d,%d)/(%d,%d), rw = %s(%s), %s, sector = %lld, size = %u",
                show_dev(__entry->target),
                show_dev(__entry->dev),
                show_bio_type(__entry->op, __entry->op_flags),
@@ -1126,7 +1129,7 @@ TRACE_EVENT(f2fs_write_checkpoint,
                __entry->msg)
 );
 
-TRACE_EVENT(f2fs_issue_discard,
+DECLARE_EVENT_CLASS(f2fs_discard,
 
        TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen),
 
@@ -1150,6 +1153,20 @@ TRACE_EVENT(f2fs_issue_discard,
                (unsigned long long)__entry->blklen)
 );
 
+DEFINE_EVENT(f2fs_discard, f2fs_queue_discard,
+
+       TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen),
+
+       TP_ARGS(dev, blkstart, blklen)
+);
+
+DEFINE_EVENT(f2fs_discard, f2fs_issue_discard,
+
+       TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen),
+
+       TP_ARGS(dev, blkstart, blklen)
+);
+
 TRACE_EVENT(f2fs_issue_reset_zone,
 
        TP_PROTO(struct block_device *dev, block_t blkstart),
@@ -1174,26 +1191,29 @@ TRACE_EVENT(f2fs_issue_reset_zone,
 TRACE_EVENT(f2fs_issue_flush,
 
        TP_PROTO(struct block_device *dev, unsigned int nobarrier,
-                                       unsigned int flush_merge),
+                               unsigned int flush_merge, int ret),
 
-       TP_ARGS(dev, nobarrier, flush_merge),
+       TP_ARGS(dev, nobarrier, flush_merge, ret),
 
        TP_STRUCT__entry(
                __field(dev_t,  dev)
                __field(unsigned int, nobarrier)
                __field(unsigned int, flush_merge)
+               __field(int,  ret)
        ),
 
        TP_fast_assign(
                __entry->dev    = dev->bd_dev;
                __entry->nobarrier = nobarrier;
                __entry->flush_merge = flush_merge;
+               __entry->ret = ret;
        ),
 
-       TP_printk("dev = (%d,%d), %s %s",
+       TP_printk("dev = (%d,%d), %s %s, ret = %d",
                show_dev(__entry->dev),
                __entry->nobarrier ? "skip (nobarrier)" : "issue",
-               __entry->flush_merge ? " with flush_merge" : "")
+               __entry->flush_merge ? " with flush_merge" : "",
+               __entry->ret)
 );
 
 TRACE_EVENT(f2fs_lookup_extent_tree_start,