writeback: IO-less balance_dirty_pages()

author Wu Fengguang <fengguang.wu@intel.com>

Sat, 28 Aug 2010 00:45:12 +0000 (18:45 -0600)

committer Wu Fengguang <fengguang.wu@intel.com>

Mon, 3 Oct 2011 13:08:57 +0000 (21:08 +0800)
author Wu Fengguang <fengguang.wu@intel.com>
Sat, 28 Aug 2010 00:45:12 +0000 (18:45 -0600)
committer Wu Fengguang <fengguang.wu@intel.com>
Mon, 3 Oct 2011 13:08:57 +0000 (21:08 +0800)
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h

index 5f172703eb4f2cafc74064388462ac9ec5192144..178c23508d3da7b9b586760df288a3859f357342 100644 (file)
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -104,30 +104,6 @@ DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
  DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister);
  DEFINE_WRITEBACK_EVENT(writeback_thread_start);
  DEFINE_WRITEBACK_EVENT(writeback_thread_stop);
-DEFINE_WRITEBACK_EVENT(balance_dirty_start);
-DEFINE_WRITEBACK_EVENT(balance_dirty_wait);
-
-TRACE_EVENT(balance_dirty_written,
-
-       TP_PROTO(struct backing_dev_info *bdi, int written),
-
-       TP_ARGS(bdi, written),
-
-       TP_STRUCT__entry(
-               __array(char,   name, 32)
-               __field(int,    written)
-       ),
-
-       TP_fast_assign(
-               strncpy(__entry->name, dev_name(bdi->dev), 32);
-               __entry->written = written;
-       ),
-
-       TP_printk("bdi %s written %d",
-                 __entry->name,
-                 __entry->written
-       )
-);
  
  DECLARE_EVENT_CLASS(wbc_class,
         TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
diff --git a/mm/page-writeback.c b/mm/page-writeback.c

index daff320d263febc0112169fb5822a19868603734..f32f25092c66a977b68f57bc4546e587614d43bb 100644 (file)
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -250,50 +250,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
                                 numerator, denominator);
  }
  
-static inline void task_dirties_fraction(struct task_struct *tsk,
-               long *numerator, long *denominator)
-{
-       prop_fraction_single(&vm_dirties, &tsk->dirties,
-                               numerator, denominator);
-}
-
-/*
- * task_dirty_limit - scale down dirty throttling threshold for one task
- *
- * task specific dirty limit:
- *
- *   dirty -= (dirty/8) * p_{t}
- *
- * To protect light/slow dirtying tasks from heavier/fast ones, we start
- * throttling individual tasks before reaching the bdi dirty limit.
- * Relatively low thresholds will be allocated to heavy dirtiers. So when
- * dirty pages grow large, heavy dirtiers will be throttled first, which will
- * effectively curb the growth of dirty pages. Light dirtiers with high enough
- * dirty threshold may never get throttled.
- */
-#define TASK_LIMIT_FRACTION 8
-static unsigned long task_dirty_limit(struct task_struct *tsk,
-                                      unsigned long bdi_dirty)
-{
-       long numerator, denominator;
-       unsigned long dirty = bdi_dirty;
-       u64 inv = dirty / TASK_LIMIT_FRACTION;
-
-       task_dirties_fraction(tsk, &numerator, &denominator);
-       inv *= numerator;
-       do_div(inv, denominator);
-
-       dirty -= inv;
-
-       return max(dirty, bdi_dirty/2);
-}
-
-/* Minimum limit for any task */
-static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
-{
-       return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
-}
-
  /*
   *
   */
@@ -986,30 +942,36 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
  /*
   * balance_dirty_pages() must be called by processes which are generating dirty
   * data.  It looks at the number of dirty pages in the machine and will force
- * the caller to perform writeback if the system is over `vm_dirty_ratio'.
+ * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
   * If we're over `background_thresh' then the writeback threads are woken to
   * perform some writeout.
   */
  static void balance_dirty_pages(struct address_space *mapping,
-                               unsigned long write_chunk)
+                               unsigned long pages_dirtied)
  {
-       unsigned long nr_reclaimable, bdi_nr_reclaimable;
+       unsigned long nr_reclaimable;   /* = file_dirty + unstable_nfs */
+       unsigned long bdi_reclaimable;
         unsigned long nr_dirty;  /* = file_dirty + writeback + unstable_nfs */
         unsigned long bdi_dirty;
         unsigned long freerun;
         unsigned long background_thresh;
         unsigned long dirty_thresh;
         unsigned long bdi_thresh;
-       unsigned long task_bdi_thresh;
-       unsigned long min_task_bdi_thresh;
-       unsigned long pages_written = 0;
-       unsigned long pause = 1;
+       long pause = 0;
         bool dirty_exceeded = false;
-       bool clear_dirty_exceeded = true;
+       unsigned long task_ratelimit;
+       unsigned long dirty_ratelimit;
+       unsigned long pos_ratio;
         struct backing_dev_info *bdi = mapping->backing_dev_info;
         unsigned long start_time = jiffies;
  
         for (;;) {
+               /*
+                * Unstable writes are a feature of certain networked
+                * filesystems (i.e. NFS) in which data may have been
+                * written to the server's write cache, but has not yet
+                * been flushed to permanent storage.
+                */
                 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
                                         global_page_state(NR_UNSTABLE_NFS);
                 nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
@@ -1026,9 +988,23 @@ static void balance_dirty_pages(struct address_space *mapping,
                 if (nr_dirty <= freerun)
                         break;
  
+               if (unlikely(!writeback_in_progress(bdi)))
+                       bdi_start_background_writeback(bdi);
+
+               /*
+                * Unlike dirty_thresh, bdi_thresh is not treated as a
+                * limiting factor, for the following reasons:
+                * - in JBOD setup, bdi_thresh can fluctuate a lot
+                * - in a system with HDD and USB key, the USB key may somehow
+                *   go into state (bdi_dirty >> bdi_thresh) either because
+                *   bdi_dirty starts high, or because bdi_thresh drops low.
+                *   In this case we don't want to hard throttle the USB key
+                *   dirtiers for 100 seconds until bdi_dirty drops under
+                *   bdi_thresh. Instead the auxiliary bdi control line in
+                *   bdi_position_ratio() will let the dirtier task progress
+                *   at some rate <= (write_bw / 2) for bringing down bdi_dirty.
+                */
                 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
-               min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
-               task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
  
                 /*
                  * In order to avoid the stacked BDI deadlock we need
@@ -1040,57 +1016,41 @@ static void balance_dirty_pages(struct address_space *mapping,
                  * actually dirty; with m+n sitting in the percpu
                  * deltas.
                  */
-               if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
-                       bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
-                       bdi_dirty = bdi_nr_reclaimable +
+               if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
+                       bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+                       bdi_dirty = bdi_reclaimable +
                                     bdi_stat_sum(bdi, BDI_WRITEBACK);
                 } else {
-                       bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
-                       bdi_dirty = bdi_nr_reclaimable +
+                       bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+                       bdi_dirty = bdi_reclaimable +
                                     bdi_stat(bdi, BDI_WRITEBACK);
                 }
  
-               /*
-                * The bdi thresh is somehow "soft" limit derived from the
-                * global "hard" limit. The former helps to prevent heavy IO
-                * bdi or process from holding back light ones; The latter is
-                * the last resort safeguard.
-                */
-               dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
+               dirty_exceeded = (bdi_dirty > bdi_thresh) ||
                                   (nr_dirty > dirty_thresh);
-               clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
-                                       (nr_dirty <= dirty_thresh);
-
-               if (!dirty_exceeded)
-                       break;
-
-               if (!bdi->dirty_exceeded)
+               if (dirty_exceeded && !bdi->dirty_exceeded)
                         bdi->dirty_exceeded = 1;
  
                 bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
                                      nr_dirty, bdi_thresh, bdi_dirty,
                                      start_time);
  
-               /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
-                * Unstable writes are a feature of certain networked
-                * filesystems (i.e. NFS) in which data may have been
-                * written to the server's write cache, but has not yet
-                * been flushed to permanent storage.
-                * Only move pages to writeback if this bdi is over its
-                * threshold otherwise wait until the disk writes catch
-                * up.
-                */
-               trace_balance_dirty_start(bdi);
-               if (bdi_nr_reclaimable > task_bdi_thresh) {
-                       pages_written += writeback_inodes_wb(&bdi->wb,
-                                                            write_chunk);
-                       trace_balance_dirty_written(bdi, pages_written);
-                       if (pages_written >= write_chunk)
-                               break;          /* We've done our duty */
+               dirty_ratelimit = bdi->dirty_ratelimit;
+               pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
+                                              background_thresh, nr_dirty,
+                                              bdi_thresh, bdi_dirty);
+               if (unlikely(pos_ratio == 0)) {
+                       pause = MAX_PAUSE;
+                       goto pause;
                 }
+               task_ratelimit = (u64)dirty_ratelimit *
+                                       pos_ratio >> RATELIMIT_CALC_SHIFT;
+               pause = (HZ * pages_dirtied) / (task_ratelimit | 1);
+               pause = min_t(long, pause, MAX_PAUSE);
+
+pause:
                 __set_current_state(TASK_UNINTERRUPTIBLE);
                 io_schedule_timeout(pause);
-               trace_balance_dirty_wait(bdi);
  
                 dirty_thresh = hard_dirty_limit(dirty_thresh);
                 /*
@@ -1099,22 +1059,11 @@ static void balance_dirty_pages(struct address_space *mapping,
                  * 200ms is typically more than enough to curb heavy dirtiers;
                  * (b) the pause time limit makes the dirtiers more responsive.
                  */
-               if (nr_dirty < dirty_thresh &&
-                   bdi_dirty < (task_bdi_thresh + bdi_thresh) / 2 &&
-                   time_after(jiffies, start_time + MAX_PAUSE))
+               if (nr_dirty < dirty_thresh)
                         break;
-
-               /*
-                * Increase the delay for each loop, up to our previous
-                * default of taking a 100ms nap.
-                */
-               pause <<= 1;
-               if (pause > HZ / 10)
-                       pause = HZ / 10;
         }
  
-       /* Clear dirty_exceeded flag only when no task can exceed the limit */
-       if (clear_dirty_exceeded && bdi->dirty_exceeded)
+       if (!dirty_exceeded && bdi->dirty_exceeded)
                 bdi->dirty_exceeded = 0;
  
         current->nr_dirtied = 0;
@@ -1131,8 +1080,10 @@ static void balance_dirty_pages(struct address_space *mapping,
          * In normal mode, we start background writeout at the lower
          * background_thresh, to keep the amount of dirty memory low.
          */
-       if ((laptop_mode && pages_written) ||
-           (!laptop_mode && (nr_reclaimable > background_thresh)))
+       if (laptop_mode)
+               return;
+
+       if (nr_reclaimable > background_thresh)
                 bdi_start_background_writeback(bdi);
  }
author	Wu Fengguang <fengguang.wu@intel.com>
	Sat, 28 Aug 2010 00:45:12 +0000 (18:45 -0600)
committer	Wu Fengguang <fengguang.wu@intel.com>
	Mon, 3 Oct 2011 13:08:57 +0000 (21:08 +0800)
include/trace/events/writeback.h		patch | blob | history
mm/page-writeback.c		patch | blob | history