Merge tag 'for-linus-20181102' of git://git.kernel.dk/linux-block

author Linus Torvalds <torvalds@linux-foundation.org>

Fri, 2 Nov 2018 18:25:48 +0000 (11:25 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 2 Nov 2018 18:25:48 +0000 (11:25 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 2 Nov 2018 18:25:48 +0000 (11:25 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 2 Nov 2018 18:25:48 +0000 (11:25 -0700)
diff --combined Documentation/admin-guide/cgroup-v2.rst

index 8384c681a4b2e0cb88595b0e95eb6660ab5d7857,184193bcb262ac908f1f5a7a7c2c662dec0ea4b8..476722b7b6367ca38bf0e3263f3e132b515dcfd6
--- 1/Documentation/admin-guide/cgroup-v2.rst
--- 2/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@@ -966,12 -966,6 +966,12 @@@ All time durations are in microseconds
         $PERIOD duration.  "max" for $MAX indicates no limit.  If only
         one number is written, $MAX is updated.
   
+ +  cpu.pressure
+ +      A read-only nested-key file which exists on non-root cgroups.
+ +
+ +      Shows pressure stall information for CPU. See
+ +      Documentation/accounting/psi.txt for details.
+ +
   
   Memory
   ------
@@@ -1133,10 -1127,6 +1133,10 @@@ PAGE_SIZE multiple when read back
                 disk readahead.  For now OOM in memory cgroup kills
                 tasks iff shortage has happened inside page fault.
   
+ +              This event is not raised if the OOM killer is not
+ +              considered as an option, e.g. for failed high-order
+ +              allocations.
+ +
           oom_kill
                 The number of processes belonging to this cgroup
                 killed by any kind of OOM killer.
@@@ -1281,12 -1271,6 +1281,12 @@@
         higher than the limit for an extended period of time.  This
         reduces the impact on the workload and memory management.
   
+ +  memory.pressure
+ +      A read-only nested-key file which exists on non-root cgroups.
+ +
+ +      Shows pressure stall information for memory. See
+ +      Documentation/accounting/psi.txt for details.
+ +
   
   Usage Guidelines
   ~~~~~~~~~~~~~~~~
@@@ -1424,12 -1408,6 +1424,12 @@@ IO Interface File
   
           8:16 rbps=2097152 wbps=max riops=max wiops=max
   
+ +  io.pressure
+ +      A read-only nested-key file which exists on non-root cgroups.
+ +
+ +      Shows pressure stall information for IO. See
+ +      Documentation/accounting/psi.txt for details.
+ +
   
   Writeback
   ~~~~~~~~~
@@@ -1879,10 -1857,8 +1879,8 @@@ following two functions
   
     wbc_init_bio(@wbc, @bio)
         Should be called for each bio carrying writeback data and
-       associates the bio with the inode's owner cgroup and the
-       corresponding request queue.  This must be called after
-       a queue (device) has been associated with the bio and
-       before submission.
+       associates the bio with the inode's owner cgroup.  Can be
+       called anytime between bio allocation and submission.
   
     wbc_account_io(@wbc, @page, @bytes)
         Should be called for each data segment being written out.
@@@ -1901,7 -1877,7 +1899,7 @@@ the configuration, the bio may be execu
   the writeback session is holding shared resources, e.g. a journal
   entry, may lead to priority inversion.  There is no one easy solution
   for the problem.  Filesystems can try to work around specific problem
- cases by skipping wbc_init_bio() or using bio_associate_create_blkg()
+ cases by skipping wbc_init_bio() or using bio_associate_blkcg()
   directly.
   
   
diff --combined block/bio.c

index c27f77befbacc70a36e194f5f2427e20cde123af,4a5a036268fb8e2dd78be6aba55a255dffb98c52..d5368a4455613452972e0b8b06ea3e1ceacb335e
--- 1/block/bio.c
--- 2/block/bio.c
+++ b/block/bio.c
@@@ -609,9 -609,7 +609,7 @@@ void __bio_clone_fast(struct bio *bio, 
         bio->bi_iter = bio_src->bi_iter;
         bio->bi_io_vec = bio_src->bi_io_vec;
   
-       bio_clone_blkg_association(bio, bio_src);
- 
-       blkcg_bio_issue_init(bio);
+       bio_clone_blkcg_association(bio, bio_src);
   }
   EXPORT_SYMBOL(__bio_clone_fast);
   
@@@ -1256,7 -1254,7 +1254,7 @@@ struct bio *bio_copy_user_iov(struct re
         /*
          * success
          */
- -      if (((iter->type & WRITE) && (!map_data || !map_data->null_mapped)) ||
+ +      if ((iov_iter_rw(iter) == WRITE && (!map_data || !map_data->null_mapped)) ||
             (map_data && map_data->from_user)) {
                 ret = bio_copy_from_iter(bio, iter);
                 if (ret)
@@@ -1956,151 -1954,69 +1954,69 @@@ EXPORT_SYMBOL(bioset_init_from_src)
   
   #ifdef CONFIG_BLK_CGROUP
   
- /**
-  * bio_associate_blkg - associate a bio with the a blkg
-  * @bio: target bio
-  * @blkg: the blkg to associate
-  *
-  * This tries to associate @bio with the specified blkg.  Association failure
-  * is handled by walking up the blkg tree.  Therefore, the blkg associated can
-  * be anything between @blkg and the root_blkg.  This situation only happens
-  * when a cgroup is dying and then the remaining bios will spill to the closest
-  * alive blkg.
-  *
-  * A reference will be taken on the @blkg and will be released when @bio is
-  * freed.
-  */
- int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
- {
-       if (unlikely(bio->bi_blkg))
-               return -EBUSY;
-       bio->bi_blkg = blkg_tryget_closest(blkg);
-       return 0;
- }
- 
- /**
-  * __bio_associate_blkg_from_css - internal blkg association function
-  *
-  * This in the core association function that all association paths rely on.
-  * A blkg reference is taken which is released upon freeing of the bio.
-  */
- static int __bio_associate_blkg_from_css(struct bio *bio,
-                                        struct cgroup_subsys_state *css)
- {
-       struct request_queue *q = bio->bi_disk->queue;
-       struct blkcg_gq *blkg;
-       int ret;
- 
-       rcu_read_lock();
- 
-       if (!css || !css->parent)
-               blkg = q->root_blkg;
-       else
-               blkg = blkg_lookup_create(css_to_blkcg(css), q);
- 
-       ret = bio_associate_blkg(bio, blkg);
- 
-       rcu_read_unlock();
-       return ret;
- }
- 
- /**
-  * bio_associate_blkg_from_css - associate a bio with a specified css
-  * @bio: target bio
-  * @css: target css
-  *
-  * Associate @bio with the blkg found by combining the css's blkg and the
-  * request_queue of the @bio.  This falls back to the queue's root_blkg if
-  * the association fails with the css.
-  */
- int bio_associate_blkg_from_css(struct bio *bio,
-                               struct cgroup_subsys_state *css)
- {
-       if (unlikely(bio->bi_blkg))
-               return -EBUSY;
-       return __bio_associate_blkg_from_css(bio, css);
- }
- EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
- 
   #ifdef CONFIG_MEMCG
   /**
-  * bio_associate_blkg_from_page - associate a bio with the page's blkg
+  * bio_associate_blkcg_from_page - associate a bio with the page's blkcg
    * @bio: target bio
    * @page: the page to lookup the blkcg from
    *
-  * Associate @bio with the blkg from @page's owning memcg and the respective
-  * request_queue.  If cgroup_e_css returns NULL, fall back to the queue's
-  * root_blkg.
-  *
-  * Note: this must be called after bio has an associated device.
+  * Associate @bio with the blkcg from @page's owning memcg.  This works like
+  * every other associate function wrt references.
    */
- int bio_associate_blkg_from_page(struct bio *bio, struct page *page)
+ int bio_associate_blkcg_from_page(struct bio *bio, struct page *page)
   {
-       struct cgroup_subsys_state *css;
-       int ret;
+       struct cgroup_subsys_state *blkcg_css;
   
-       if (unlikely(bio->bi_blkg))
+       if (unlikely(bio->bi_css))
                 return -EBUSY;
         if (!page->mem_cgroup)
                 return 0;
- 
-       rcu_read_lock();
- 
-       css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
- 
-       ret = __bio_associate_blkg_from_css(bio, css);
- 
-       rcu_read_unlock();
-       return ret;
+       blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup,
+                                    &io_cgrp_subsys);
+       bio->bi_css = blkcg_css;
+       return 0;
   }
   #endif /* CONFIG_MEMCG */
   
   /**
-  * bio_associate_create_blkg - associate a bio with a blkg from q
-  * @q: request_queue where bio is going
+  * bio_associate_blkcg - associate a bio with the specified blkcg
    * @bio: target bio
+  * @blkcg_css: css of the blkcg to associate
+  *
+  * Associate @bio with the blkcg specified by @blkcg_css.  Block layer will
+  * treat @bio as if it were issued by a task which belongs to the blkcg.
    *
-  * Associate @bio with the blkg found from the bio's css and the request_queue.
-  * If one is not found, bio_lookup_blkg creates the blkg.  This falls back to
-  * the queue's root_blkg if association fails.
+  * This function takes an extra reference of @blkcg_css which will be put
+  * when @bio is released.  The caller must own @bio and is responsible for
+  * synchronizing calls to this function.
    */
- int bio_associate_create_blkg(struct request_queue *q, struct bio *bio)
+ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
   {
-       struct cgroup_subsys_state *css;
-       int ret = 0;
- 
-       /* someone has already associated this bio with a blkg */
-       if (bio->bi_blkg)
-               return ret;
- 
-       rcu_read_lock();
- 
-       css = blkcg_css();
- 
-       ret = __bio_associate_blkg_from_css(bio, css);
- 
-       rcu_read_unlock();
-       return ret;
+       if (unlikely(bio->bi_css))
+               return -EBUSY;
+       css_get(blkcg_css);
+       bio->bi_css = blkcg_css;
+       return 0;
   }
+ EXPORT_SYMBOL_GPL(bio_associate_blkcg);
   
   /**
-  * bio_reassociate_blkg - reassociate a bio with a blkg from q
-  * @q: request_queue where bio is going
+  * bio_associate_blkg - associate a bio with the specified blkg
    * @bio: target bio
+  * @blkg: the blkg to associate
    *
-  * When submitting a bio, multiple recursive calls to make_request() may occur.
-  * This causes the initial associate done in blkcg_bio_issue_check() to be
-  * incorrect and reference the prior request_queue.  This performs reassociation
-  * when this situation happens.
+  * Associate @bio with the blkg specified by @blkg.  This is the queue specific
+  * blkcg information associated with the @bio, a reference will be taken on the
+  * @blkg and will be freed when the bio is freed.
    */
- int bio_reassociate_blkg(struct request_queue *q, struct bio *bio)
+ int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
   {
-       if (bio->bi_blkg) {
-               blkg_put(bio->bi_blkg);
-               bio->bi_blkg = NULL;
-       }
- 
-       return bio_associate_create_blkg(q, bio);
+       if (unlikely(bio->bi_blkg))
+               return -EBUSY;
+       if (!blkg_try_get(blkg))
+               return -ENODEV;
+       bio->bi_blkg = blkg;
+       return 0;
   }
   
   /**
@@@ -2113,6 -2029,10 +2029,10 @@@ void bio_disassociate_task(struct bio *
                 put_io_context(bio->bi_ioc);
                 bio->bi_ioc = NULL;
         }
+       if (bio->bi_css) {
+               css_put(bio->bi_css);
+               bio->bi_css = NULL;
+       }
         if (bio->bi_blkg) {
                 blkg_put(bio->bi_blkg);
                 bio->bi_blkg = NULL;
@@@ -2120,16 -2040,16 +2040,16 @@@
   }
   
   /**
-  * bio_clone_blkg_association - clone blkg association from src to dst bio
+  * bio_clone_blkcg_association - clone blkcg association from src to dst bio
    * @dst: destination bio
    * @src: source bio
    */
- void bio_clone_blkg_association(struct bio *dst, struct bio *src)
+ void bio_clone_blkcg_association(struct bio *dst, struct bio *src)
   {
-       if (src->bi_blkg)
-               bio_associate_blkg(dst, src->bi_blkg);
+       if (src->bi_css)
+               WARN_ON(bio_associate_blkcg(dst, src->bi_css));
   }
- EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
+ EXPORT_SYMBOL_GPL(bio_clone_blkcg_association);
   #endif /* CONFIG_BLK_CGROUP */
   
   static void __init biovec_init_slabs(void)
diff --combined block/blk-iolatency.c

index 28f80d22752858a2b1fcdfefb5f079469ec480ee,bb240a0c1309f8da1260e2d727406388e3a002d3..38c35c32aff2dcf3fc0e9ac294a649f0be4a1cb1
--- 1/block/blk-iolatency.c
--- 2/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@@ -153,7 -153,7 +153,7 @@@ struct iolatency_grp 
   #define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
   /*
    * These are the constants used to fake the fixed-point moving average
- - * calculation just like load average.  The call to CALC_LOAD folds
+ + * calculation just like load average.  The call to calc_load() folds
    * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg.  The sampling
    * window size is bucketed to try to approximately calculate average
    * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows
@@@ -248,7 -248,7 +248,7 @@@ static inline void iolat_update_total_l
                 return;
   
         /*
- -       * CALC_LOAD takes in a number stored in fixed point representation.
+ +       * calc_load() takes in a number stored in fixed point representation.
          * Because we are using this for IO time in ns, the values stored
          * are significantly larger than the FIXED_1 denominator (2048).
          * Therefore, rounding errors in the calculation are negligible and
@@@ -257,9 -257,7 +257,9 @@@
         exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
                         div64_u64(iolat->cur_win_nsec,
                                   BLKIOLATENCY_EXP_BUCKET_SIZE));
- -      CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat->rqs.mean);
+ +      iolat->lat_avg = calc_load(iolat->lat_avg,
+ +                                 iolatency_exp_factors[exp_idx],
+ +                                 stat->rqs.mean);
   }
   
   static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
@@@ -482,12 -480,34 +482,34 @@@ static void blkcg_iolatency_throttle(st
                                      spinlock_t *lock)
   {
         struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
-       struct blkcg_gq *blkg = bio->bi_blkg;
+       struct blkcg *blkcg;
+       struct blkcg_gq *blkg;
+       struct request_queue *q = rqos->q;
         bool issue_as_root = bio_issue_as_root_blkg(bio);
   
         if (!blk_iolatency_enabled(blkiolat))
                 return;
   
+       rcu_read_lock();
+       blkcg = bio_blkcg(bio);
+       bio_associate_blkcg(bio, &blkcg->css);
+       blkg = blkg_lookup(blkcg, q);
+       if (unlikely(!blkg)) {
+               if (!lock)
+                       spin_lock_irq(q->queue_lock);
+               blkg = blkg_lookup_create(blkcg, q);
+               if (IS_ERR(blkg))
+                       blkg = NULL;
+               if (!lock)
+                       spin_unlock_irq(q->queue_lock);
+       }
+       if (!blkg)
+               goto out;
+ 
+       bio_issue_init(&bio->bi_issue, bio_sectors(bio));
+       bio_associate_blkg(bio, blkg);
+ out:
+       rcu_read_unlock();
         while (blkg && blkg->parent) {
                 struct iolatency_grp *iolat = blkg_to_lat(blkg);
                 if (!iolat) {
@@@ -708,7 -728,7 +730,7 @@@ static void blkiolatency_timer_fn(struc
                  * We could be exiting, don't access the pd unless we have a
                  * ref on the blkg.
                  */
-               if (!blkg_tryget(blkg))
+               if (!blkg_try_get(blkg))
                         continue;
   
                 iolat = blkg_to_lat(blkg);
diff --combined block/bounce.c

index cf49fe02f65cd017eb2132fd3475ba29a5d3cf75,418677dcec60f58d9b8a2d53cbfd8cc611f7032c..36869afc258ccf6ea609e0e74db6cea56e6d2c34
--- 1/block/bounce.c
--- 2/block/bounce.c
+++ b/block/bounce.c
@@@ -18,7 -18,7 +18,7 @@@
   #include <linux/init.h>
   #include <linux/hash.h>
   #include <linux/highmem.h>
- -#include <linux/bootmem.h>
+ +#include <linux/memblock.h>
   #include <linux/printk.h>
   #include <asm/tlbflush.h>
   
@@@ -276,9 -276,7 +276,7 @@@ static struct bio *bounce_clone_bio(str
                 }
         }
   
-       bio_clone_blkg_association(bio, bio_src);
- 
-       blkcg_bio_issue_init(bio);
+       bio_clone_blkcg_association(bio, bio_src);
   
         return bio;
   }
diff --combined drivers/block/loop.c

index e6273ae85246029358272406bfb00cca07226455,ea9debf59b225c19d815e7ff1fd8aa950f5dcb1b..cb0cc868507620513d3de7658ed1f6999ceaa965
--- 1/drivers/block/loop.c
--- 2/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@@ -77,7 -77,6 +77,6 @@@
   #include <linux/falloc.h>
   #include <linux/uio.h>
   #include <linux/ioprio.h>
- #include <linux/blk-cgroup.h>
   
   #include "loop.h"
   
@@@ -269,7 -268,7 +268,7 @@@ static int lo_write_bvec(struct file *f
         struct iov_iter i;
         ssize_t bw;
   
- -      iov_iter_bvec(&i, ITER_BVEC | WRITE, bvec, 1, bvec->bv_len);
+ +      iov_iter_bvec(&i, WRITE, bvec, 1, bvec->bv_len);
   
         file_start_write(file);
         bw = vfs_iter_write(file, &i, ppos, 0);
@@@ -347,7 -346,7 +346,7 @@@ static int lo_read_simple(struct loop_d
         ssize_t len;
   
         rq_for_each_segment(bvec, rq, iter) {
- -              iov_iter_bvec(&i, ITER_BVEC, &bvec, 1, bvec.bv_len);
+ +              iov_iter_bvec(&i, READ, &bvec, 1, bvec.bv_len);
                 len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0);
                 if (len < 0)
                         return len;
@@@ -388,7 -387,7 +387,7 @@@ static int lo_read_transfer(struct loop
                 b.bv_offset = 0;
                 b.bv_len = bvec.bv_len;
   
- -              iov_iter_bvec(&i, ITER_BVEC, &b, 1, b.bv_len);
+ +              iov_iter_bvec(&i, READ, &b, 1, b.bv_len);
                 len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0);
                 if (len < 0) {
                         ret = len;
@@@ -555,7 -554,8 +554,7 @@@ static int lo_rw_aio(struct loop_devic
         }
         atomic_set(&cmd->ref, 2);
   
- -      iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
- -                    segments, blk_rq_bytes(rq));
+ +      iov_iter_bvec(&iter, rw, bvec, segments, blk_rq_bytes(rq));
         iter.iov_offset = offset;
   
         cmd->iocb.ki_pos = pos;
@@@ -1760,8 -1760,8 +1759,8 @@@ static blk_status_t loop_queue_rq(struc
   
         /* always use the first bio's css */
   #ifdef CONFIG_BLK_CGROUP
-       if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) {
-               cmd->css = &bio_blkcg(rq->bio)->css;
+       if (cmd->use_aio && rq->bio && rq->bio->bi_css) {
+               cmd->css = rq->bio->bi_css;
                 css_get(cmd->css);
         } else
   #endif
diff --combined fs/buffer.c

index d60d61e8ed7de495bddd0bc799f16c2606a4c68b,6f1ae3ac97896c6fff85e947b9fe7a6553368457..1286c2b95498de47d2ba08b57a93901bdf4367bd
--- 1/fs/buffer.c
--- 2/fs/buffer.c
+++ b/fs/buffer.c
@@@ -562,7 -562,7 +562,7 @@@ void mark_buffer_dirty_inode(struct buf
   EXPORT_SYMBOL(mark_buffer_dirty_inode);
   
   /*
- - * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
+ + * Mark the page dirty, and set it dirty in the page cache, and mark the inode
    * dirty.
    *
    * If warn is true, then emit a warning if the page is not uptodate and has
@@@ -579,8 -579,8 +579,8 @@@ void __set_page_dirty(struct page *page
         if (page->mapping) {    /* Race with truncate? */
                 WARN_ON_ONCE(warn && !PageUptodate(page));
                 account_page_dirtied(page, mapping);
- -              radix_tree_tag_set(&mapping->i_pages,
- -                              page_index(page), PAGECACHE_TAG_DIRTY);
+ +              __xa_set_mark(&mapping->i_pages, page_index(page),
+ +                              PAGECACHE_TAG_DIRTY);
         }
         xa_unlock_irqrestore(&mapping->i_pages, flags);
   }
@@@ -1050,7 -1050,7 +1050,7 @@@ __getblk_slow(struct block_device *bdev
    * The relationship between dirty buffers and dirty pages:
    *
    * Whenever a page has any dirty buffers, the page's dirty bit is set, and
- - * the page is tagged dirty in its radix tree.
+ + * the page is tagged dirty in the page cache.
    *
    * At all times, the dirtiness of the buffers represents the dirtiness of
    * subsections of the page.  If the page has buffers, the page dirty bit is
@@@ -1073,9 -1073,9 +1073,9 @@@
    * mark_buffer_dirty - mark a buffer_head as needing writeout
    * @bh: the buffer_head to mark dirty
    *
- - * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
- - * backing page dirty, then tag the page as dirty in its address_space's radix
- - * tree and then attach the address_space's inode to its superblock's dirty
+ + * mark_buffer_dirty() will set the dirty bit against the buffer, then set
+ + * its backing page dirty, then tag the page as dirty in the page cache
+ + * and then attach the address_space's inode to its superblock's dirty
    * inode list.
    *
    * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
@@@ -3060,6 -3060,11 +3060,11 @@@ static int submit_bh_wbc(int op, int op
          */
         bio = bio_alloc(GFP_NOIO, 1);
   
+       if (wbc) {
+               wbc_init_bio(wbc, bio);
+               wbc_account_io(wbc, bh->b_page, bh->b_size);
+       }
+ 
         bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
         bio_set_dev(bio, bh->b_bdev);
         bio->bi_write_hint = write_hint;
@@@ -3079,11 -3084,6 +3084,6 @@@
                 op_flags |= REQ_PRIO;
         bio_set_op_attrs(bio, op, op_flags);
   
-       if (wbc) {
-               wbc_init_bio(wbc, bio);
-               wbc_account_io(wbc, bh->b_page, bh->b_size);
-       }
- 
         submit_bio(bio);
         return 0;
   }
diff --combined include/linux/cgroup.h

index 9968332cceed0e64e5fc9bdb814507b0bf67451b,32c553556bbdc1c0d6f2db6c1fb86ddee6713367..9d12757a65b01846486341c5d31d398ee51d89b4
--- 1/include/linux/cgroup.h
--- 2/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@@ -93,8 -93,6 +93,6 @@@ extern struct css_set init_css_set
   
   bool css_has_online_children(struct cgroup_subsys_state *css);
   struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
- struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup,
-                                        struct cgroup_subsys *ss);
   struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
                                              struct cgroup_subsys *ss);
   struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
@@@ -569,11 -567,20 +567,11 @@@ static inline bool cgroup_is_descendant
   static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp,
                                              int ancestor_level)
   {
- -      struct cgroup *ptr;
- -
         if (cgrp->level < ancestor_level)
                 return NULL;
- -
- -      for (ptr = cgrp;
- -           ptr && ptr->level > ancestor_level;
- -           ptr = cgroup_parent(ptr))
- -              ;
- -
- -      if (ptr && ptr->level == ancestor_level)
- -              return ptr;
- -
- -      return NULL;
+ +      while (cgrp && cgrp->level > ancestor_level)
+ +              cgrp = cgroup_parent(cgrp);
+ +      return cgrp;
   }
   
   /**
@@@ -650,11 -657,6 +648,11 @@@ static inline void pr_cont_cgroup_path(
         pr_cont_kernfs_path(cgrp->kn);
   }
   
+ +static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
+ +{
+ +      return &cgrp->psi;
+ +}
+ +
   static inline void cgroup_init_kthreadd(void)
   {
         /*
@@@ -708,16 -710,6 +706,16 @@@ static inline union kernfs_node_id *cgr
         return NULL;
   }
   
+ +static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
+ +{
+ +      return NULL;
+ +}
+ +
+ +static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
+ +{
+ +      return NULL;
+ +}
+ +
   static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
                                                struct cgroup *ancestor)
   {
diff --combined kernel/cgroup/cgroup.c

index 8b79318810ad5c63d9e70cd634f6d6bc928659ef,4a3dae2a8283041b46d2c9ff981a51a7d5c2563e..6aaf5dd5383bba294719772bc76b7c1664d54ea9
--- 1/kernel/cgroup/cgroup.c
--- 2/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@@ -55,7 -55,6 +55,7 @@@
   #include <linux/nsproxy.h>
   #include <linux/file.h>
   #include <linux/sched/cputime.h>
+ +#include <linux/psi.h>
   #include <net/sock.h>
   
   #define CREATE_TRACE_POINTS
@@@ -493,7 -492,7 +493,7 @@@ static struct cgroup_subsys_state *cgro
   }
   
   /**
-  * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
+  * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
    * @cgrp: the cgroup of interest
    * @ss: the subsystem of interest (%NULL returns @cgrp->self)
    *
@@@ -502,8 -501,8 +502,8 @@@
    * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
    * function is guaranteed to return non-NULL css.
    */
- static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
-                                                       struct cgroup_subsys *ss)
+ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+                                               struct cgroup_subsys *ss)
   {
         lockdep_assert_held(&cgroup_mutex);
   
@@@ -523,35 -522,6 +523,6 @@@
         return cgroup_css(cgrp, ss);
   }
   
- /**
-  * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
-  * @cgrp: the cgroup of interest
-  * @ss: the subsystem of interest
-  *
-  * Find and get the effective css of @cgrp for @ss.  The effective css is
-  * defined as the matching css of the nearest ancestor including self which
-  * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
-  * the root css is returned, so this function always returns a valid css.
-  *
-  * The returned css is not guaranteed to be online, and therefore it is the
-  * callers responsiblity to tryget a reference for it.
-  */
- struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
-                                        struct cgroup_subsys *ss)
- {
-       struct cgroup_subsys_state *css;
- 
-       do {
-               css = cgroup_css(cgrp, ss);
- 
-               if (css)
-                       return css;
-               cgrp = cgroup_parent(cgrp);
-       } while (cgrp);
- 
-       return init_css_set.subsys[ss->id];
- }
- 
   /**
    * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
    * @cgrp: the cgroup of interest
@@@ -634,11 -604,10 +605,10 @@@ EXPORT_SYMBOL_GPL(of_css)
    *
    * Should be called under cgroup_[tree_]mutex.
    */
- #define for_each_e_css(css, ssid, cgrp)                                           \
-       for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)            \
-               if (!((css) = cgroup_e_css_by_mask(cgrp,                    \
-                                                  cgroup_subsys[(ssid)]))) \
-                       ;                                                   \
+ #define for_each_e_css(css, ssid, cgrp)                                       \
+       for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)        \
+               if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
+                       ;                                               \
                 else
   
   /**
@@@ -863,7 -832,7 +833,7 @@@ static void css_set_move_task(struct ta
                  */
                 WARN_ON_ONCE(task->flags & PF_EXITING);
   
- -              rcu_assign_pointer(task->cgroups, to_cset);
+ +              cgroup_move_task(task, to_cset);
                 list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
                                                              &to_cset->tasks);
         }
@@@ -1037,7 -1006,7 +1007,7 @@@ static struct css_set *find_existing_cs
                          * @ss is in this hierarchy, so we want the
                          * effective css from @cgrp.
                          */
-                       template[i] = cgroup_e_css_by_mask(cgrp, ss);
+                       template[i] = cgroup_e_css(cgrp, ss);
                 } else {
                         /*
                          * @ss is not in this hierarchy, so we don't want
@@@ -3054,7 -3023,7 +3024,7 @@@ static int cgroup_apply_control(struct 
                 return ret;
   
         /*
-        * At this point, cgroup_e_css_by_mask() results reflect the new csses
+        * At this point, cgroup_e_css() results reflect the new csses
          * making the following cgroup_update_dfl_csses() properly update
          * css associations of all tasks in the subtree.
          */
@@@ -3447,21 -3416,6 +3417,21 @@@ static int cpu_stat_show(struct seq_fil
         return ret;
   }
   
+ +#ifdef CONFIG_PSI
+ +static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
+ +{
+ +      return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO);
+ +}
+ +static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
+ +{
+ +      return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM);
+ +}
+ +static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
+ +{
+ +      return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
+ +}
+ +#endif
+ +
   static int cgroup_file_open(struct kernfs_open_file *of)
   {
         struct cftype *cft = of->kn->priv;
@@@ -4592,23 -4546,6 +4562,23 @@@ static struct cftype cgroup_base_files[
                 .flags = CFTYPE_NOT_ON_ROOT,
                 .seq_show = cpu_stat_show,
         },
+ +#ifdef CONFIG_PSI
+ +      {
+ +              .name = "io.pressure",
+ +              .flags = CFTYPE_NOT_ON_ROOT,
+ +              .seq_show = cgroup_io_pressure_show,
+ +      },
+ +      {
+ +              .name = "memory.pressure",
+ +              .flags = CFTYPE_NOT_ON_ROOT,
+ +              .seq_show = cgroup_memory_pressure_show,
+ +      },
+ +      {
+ +              .name = "cpu.pressure",
+ +              .flags = CFTYPE_NOT_ON_ROOT,
+ +              .seq_show = cgroup_cpu_pressure_show,
+ +      },
+ +#endif
         { }     /* terminate */
   };
   
@@@ -4669,7 -4606,6 +4639,7 @@@ static void css_free_rwork_fn(struct wo
                          */
                         cgroup_put(cgroup_parent(cgrp));
                         kernfs_put(cgrp->kn);
+ +                      psi_cgroup_free(cgrp);
                         if (cgroup_on_dfl(cgrp))
                                 cgroup_rstat_exit(cgrp);
                         kfree(cgrp);
@@@ -4926,15 -4862,10 +4896,15 @@@ static struct cgroup *cgroup_create(str
         cgrp->self.parent = &parent->self;
         cgrp->root = root;
         cgrp->level = level;
- -      ret = cgroup_bpf_inherit(cgrp);
+ +
+ +      ret = psi_cgroup_alloc(cgrp);
         if (ret)
                 goto out_idr_free;
   
+ +      ret = cgroup_bpf_inherit(cgrp);
+ +      if (ret)
+ +              goto out_psi_free;
+ +
         for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
                 cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
   
@@@ -4972,8 -4903,6 +4942,8 @@@
   
         return cgrp;
   
+ +out_psi_free:
+ +      psi_cgroup_free(cgrp);
   out_idr_free:
         cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
   out_stat_exit:
diff --combined mm/page_io.c

index 27b8357284420c21fe9016452ada1075dfdd71c5,aafd19ec1db4667b5924b147d506d7eeeef37270..d4d1c89bcdddcef43dfa04fa02926f21fba228fb
--- 1/mm/page_io.c
--- 2/mm/page_io.c
+++ b/mm/page_io.c
@@@ -283,7 -283,7 +283,7 @@@ int __swap_writepage(struct page *page
         struct swap_info_struct *sis = page_swap_info(page);
   
         VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- -      if (sis->flags & SWP_FILE) {
+ +      if (sis->flags & SWP_FS) {
                 struct kiocb kiocb;
                 struct file *swap_file = sis->swap_file;
                 struct address_space *mapping = swap_file->f_mapping;
@@@ -294,7 -294,7 +294,7 @@@
                 };
                 struct iov_iter from;
   
- -              iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE);
+ +              iov_iter_bvec(&from, WRITE, &bv, 1, PAGE_SIZE);
                 init_sync_kiocb(&kiocb, swap_file);
                 kiocb.ki_pos = page_file_offset(page);
   
@@@ -339,7 -339,7 +339,7 @@@
                 goto out;
         }
         bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
-       bio_associate_blkg_from_page(bio, page);
+       bio_associate_blkcg_from_page(bio, page);
         count_swpout_vm_event(page);
         set_page_writeback(page);
         unlock_page(page);
@@@ -365,7 -365,7 +365,7 @@@ int swap_readpage(struct page *page, bo
                 goto out;
         }
   
- -      if (sis->flags & SWP_FILE) {
+ +      if (sis->flags & SWP_FS) {
                 struct file *swap_file = sis->swap_file;
                 struct address_space *mapping = swap_file->f_mapping;
   
@@@ -423,7 -423,7 +423,7 @@@ int swap_set_page_dirty(struct page *pa
   {
         struct swap_info_struct *sis = page_swap_info(page);
   
- -      if (sis->flags & SWP_FILE) {
+ +      if (sis->flags & SWP_FS) {
                 struct address_space *mapping = sis->swap_file->f_mapping;
   
                 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 2 Nov 2018 18:25:48 +0000 (11:25 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 2 Nov 2018 18:25:48 +0000 (11:25 -0700)
		1	2
Documentation/admin-guide/cgroup-v2.rst	patch |	diff1 |	diff2 |	blob | history
block/bio.c	patch |	diff1 |	diff2 |	blob | history
block/blk-iolatency.c	patch |	diff1 |	diff2 |	blob | history
block/bounce.c	patch |	diff1 |	diff2 |	blob | history
drivers/block/loop.c	patch |	diff1 |	diff2 |	blob | history
fs/buffer.c	patch |	diff1 |	diff2 |	blob | history
include/linux/cgroup.h	patch |	diff1 |	diff2 |	blob | history
kernel/cgroup/cgroup.c	patch |	diff1 |	diff2 |	blob | history
mm/page_io.c	patch |	diff1 |	diff2 |	blob | history