Merge git://git.kernel.org/pub/scm/linux/kernel/git/sfrench/cifs-2.6
[sfrench/cifs-2.6.git] / mm / page-writeback.c
index 60c7244c42e48760700840debe0b61408fcac7c9..a0f33905744978df4c961e0e29187f3cf1cef54c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * mm/page-writeback.c.
+ * mm/page-writeback.c
  *
  * Copyright (C) 2002, Linus Torvalds.
  *
 #include <linux/backing-dev.h>
 #include <linux/blkdev.h>
 #include <linux/mpage.h>
+#include <linux/rmap.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
+#include <linux/buffer_head.h>
+#include <linux/pagevec.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -45,7 +48,6 @@
  */
 static long ratelimit_pages = 32;
 
-static long total_pages;       /* The total number of pages in the machine. */
 static int dirty_exceeded __cacheline_aligned_in_smp;  /* Dirty mem may be over limit */
 
 /*
@@ -99,22 +101,6 @@ EXPORT_SYMBOL(laptop_mode);
 
 static void background_writeout(unsigned long _min_pages);
 
-struct writeback_state
-{
-       unsigned long nr_dirty;
-       unsigned long nr_unstable;
-       unsigned long nr_mapped;
-       unsigned long nr_writeback;
-};
-
-static void get_writeback_state(struct writeback_state *wbs)
-{
-       wbs->nr_dirty = read_page_state(nr_dirty);
-       wbs->nr_unstable = read_page_state(nr_unstable);
-       wbs->nr_mapped = global_page_state(NR_FILE_MAPPED);
-       wbs->nr_writeback = read_page_state(nr_writeback);
-}
-
 /*
  * Work out the current dirty-memory clamping and background writeout
  * thresholds.
@@ -133,19 +119,17 @@ static void get_writeback_state(struct writeback_state *wbs)
  * clamping level.
  */
 static void
-get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty,
-               struct address_space *mapping)
+get_dirty_limits(long *pbackground, long *pdirty,
+                                       struct address_space *mapping)
 {
        int background_ratio;           /* Percentages */
        int dirty_ratio;
        int unmapped_ratio;
        long background;
        long dirty;
-       unsigned long available_memory = total_pages;
+       unsigned long available_memory = vm_total_pages;
        struct task_struct *tsk;
 
-       get_writeback_state(wbs);
-
 #ifdef CONFIG_HIGHMEM
        /*
         * If this mapping can only allocate from low memory,
@@ -156,7 +140,9 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty,
 #endif
 
 
-       unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages;
+       unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
+                               global_page_state(NR_ANON_PAGES)) * 100) /
+                                       vm_total_pages;
 
        dirty_ratio = vm_dirty_ratio;
        if (dirty_ratio > unmapped_ratio / 2)
@@ -189,7 +175,6 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty,
  */
 static void balance_dirty_pages(struct address_space *mapping)
 {
-       struct writeback_state wbs;
        long nr_reclaimable;
        long background_thresh;
        long dirty_thresh;
@@ -207,11 +192,12 @@ static void balance_dirty_pages(struct address_space *mapping)
                        .range_cyclic   = 1,
                };
 
-               get_dirty_limits(&wbs, &background_thresh,
-                                       &dirty_thresh, mapping);
-               nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
-               if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
-                       break;
+               get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
+               nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
+                                       global_page_state(NR_UNSTABLE_NFS);
+               if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
+                       dirty_thresh)
+                               break;
 
                if (!dirty_exceeded)
                        dirty_exceeded = 1;
@@ -224,11 +210,14 @@ static void balance_dirty_pages(struct address_space *mapping)
                 */
                if (nr_reclaimable) {
                        writeback_inodes(&wbc);
-                       get_dirty_limits(&wbs, &background_thresh,
-                                       &dirty_thresh, mapping);
-                       nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
-                       if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
-                               break;
+                       get_dirty_limits(&background_thresh,
+                                               &dirty_thresh, mapping);
+                       nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
+                                       global_page_state(NR_UNSTABLE_NFS);
+                       if (nr_reclaimable +
+                               global_page_state(NR_WRITEBACK)
+                                       <= dirty_thresh)
+                                               break;
                        pages_written += write_chunk - wbc.nr_to_write;
                        if (pages_written >= write_chunk)
                                break;          /* We've done our duty */
@@ -236,8 +225,9 @@ static void balance_dirty_pages(struct address_space *mapping)
                blk_congestion_wait(WRITE, HZ/10);
        }
 
-       if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded)
-               dirty_exceeded = 0;
+       if (nr_reclaimable + global_page_state(NR_WRITEBACK)
+               <= dirty_thresh && dirty_exceeded)
+                       dirty_exceeded = 0;
 
        if (writeback_in_progress(bdi))
                return;         /* pdflush is already working this queue */
@@ -255,6 +245,16 @@ static void balance_dirty_pages(struct address_space *mapping)
                pdflush_operation(background_writeout, 0);
 }
 
+/*
+ * Dirty @page via set_page_dirty().  When that call returns non-zero and
+ * page_mapping() yields a mapping for the page, hand the mapping to
+ * balance_dirty_pages_ratelimited().
+ */
+void set_page_dirty_balance(struct page *page)
+{
+       struct address_space *mapping;
+
+       /* set_page_dirty() returned zero: nothing further to do */
+       if (!set_page_dirty(page))
+               return;
+
+       mapping = page_mapping(page);
+       if (mapping)
+               balance_dirty_pages_ratelimited(mapping);
+}
+
 /**
  * balance_dirty_pages_ratelimited_nr - balance dirty memory state
  * @mapping: address_space which was dirtied
@@ -299,12 +299,11 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
 
 void throttle_vm_writeout(void)
 {
-       struct writeback_state wbs;
        long background_thresh;
        long dirty_thresh;
 
         for ( ; ; ) {
-               get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL);
+               get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
 
                 /*
                  * Boost the allowable dirty threshold a bit for page
@@ -312,8 +311,9 @@ void throttle_vm_writeout(void)
                  */
                 dirty_thresh += dirty_thresh / 10;      /* wheeee... */
 
-                if (wbs.nr_unstable + wbs.nr_writeback <= dirty_thresh)
-                        break;
+                if (global_page_state(NR_UNSTABLE_NFS) +
+                       global_page_state(NR_WRITEBACK) <= dirty_thresh)
+                               break;
                 blk_congestion_wait(WRITE, HZ/10);
         }
 }
@@ -336,12 +336,12 @@ static void background_writeout(unsigned long _min_pages)
        };
 
        for ( ; ; ) {
-               struct writeback_state wbs;
                long background_thresh;
                long dirty_thresh;
 
-               get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL);
-               if (wbs.nr_dirty + wbs.nr_unstable < background_thresh
+               get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+               if (global_page_state(NR_FILE_DIRTY) +
+                       global_page_state(NR_UNSTABLE_NFS) < background_thresh
                                && min_pages <= 0)
                        break;
                wbc.encountered_congestion = 0;
@@ -365,12 +365,9 @@ static void background_writeout(unsigned long _min_pages)
  */
 int wakeup_pdflush(long nr_pages)
 {
-       if (nr_pages == 0) {
-               struct writeback_state wbs;
-
-               get_writeback_state(&wbs);
-               nr_pages = wbs.nr_dirty + wbs.nr_unstable;
-       }
+       if (nr_pages == 0)
+               nr_pages = global_page_state(NR_FILE_DIRTY) +
+                               global_page_state(NR_UNSTABLE_NFS);
        return pdflush_operation(background_writeout, nr_pages);
 }
 
@@ -401,7 +398,6 @@ static void wb_kupdate(unsigned long arg)
        unsigned long start_jif;
        unsigned long next_jif;
        long nr_to_write;
-       struct writeback_state wbs;
        struct writeback_control wbc = {
                .bdi            = NULL,
                .sync_mode      = WB_SYNC_NONE,
@@ -414,11 +410,11 @@ static void wb_kupdate(unsigned long arg)
 
        sync_supers();
 
-       get_writeback_state(&wbs);
        oldest_jif = jiffies - dirty_expire_interval;
        start_jif = jiffies;
        next_jif = start_jif + dirty_writeback_interval;
-       nr_to_write = wbs.nr_dirty + wbs.nr_unstable +
+       nr_to_write = global_page_state(NR_FILE_DIRTY) +
+                       global_page_state(NR_UNSTABLE_NFS) +
                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);
        while (nr_to_write > 0) {
                wbc.encountered_congestion = 0;
@@ -507,9 +503,9 @@ void laptop_sync_completion(void)
  * will write six megabyte chunks, max.
  */
 
-static void set_ratelimit(void)
+void writeback_set_ratelimit(void)
 {
-       ratelimit_pages = total_pages / (num_online_cpus() * 32);
+       ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
        if (ratelimit_pages < 16)
                ratelimit_pages = 16;
        if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
@@ -519,7 +515,7 @@ static void set_ratelimit(void)
 static int __cpuinit
 ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
 {
-       set_ratelimit();
+       writeback_set_ratelimit();
        return 0;
 }
 
@@ -538,9 +534,7 @@ void __init page_writeback_init(void)
        long buffer_pages = nr_free_buffer_pages();
        long correction;
 
-       total_pages = nr_free_pagecache_pages();
-
-       correction = (100 * 4 * buffer_pages) / total_pages;
+       correction = (100 * 4 * buffer_pages) / vm_total_pages;
 
        if (correction < 100) {
                dirty_background_ratio *= correction;
@@ -554,10 +548,143 @@ void __init page_writeback_init(void)
                        vm_dirty_ratio = 1;
        }
        mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
-       set_ratelimit();
+       writeback_set_ratelimit();
        register_cpu_notifier(&ratelimit_nb);
 }
 
+/**
+ * generic_writepages - walk the list of dirty pages of the given
+ *                      address space and writepage() all of them.
+ *
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ *
+ * If a page is already under I/O, generic_writepages() skips it, even
+ * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made gets new I/O started against it.  If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ *
+ * Returns 0 if no ->writepage() call was made (congested non-blocking
+ * caller, mapping without ->writepage(), or nothing eligible found).
+ * Otherwise returns the result of the last ->writepage() call that was
+ * made, with AOP_WRITEPAGE_ACTIVATE reported as 0: that value is not an
+ * error, so it neither marks the mapping with AS_EIO nor escapes to the
+ * caller.  Genuine errors are also latched in mapping->flags as AS_ENOSPC
+ * (for -ENOSPC) or AS_EIO (anything else).
+ *
+ * Derived from mpage_writepages() - if you fix this you should check that
+ * also!
+ */
+int generic_writepages(struct address_space *mapping,
+                      struct writeback_control *wbc)
+{
+       struct backing_dev_info *bdi = mapping->backing_dev_info;
+       int ret = 0;
+       int done = 0;
+       int (*writepage)(struct page *page, struct writeback_control *wbc);
+       struct pagevec pvec;
+       int nr_pages;
+       pgoff_t index;
+       pgoff_t end;            /* Inclusive */
+       int scanned = 0;
+       int range_whole = 0;
+
+       /* A non-blocking caller gives up at once on a congested device */
+       if (wbc->nonblocking && bdi_write_congested(bdi)) {
+               wbc->encountered_congestion = 1;
+               return 0;
+       }
+
+       writepage = mapping->a_ops->writepage;
+
+       /* deal with chardevs and other special files */
+       if (!writepage)
+               return 0;
+
+       pagevec_init(&pvec, 0);
+       if (wbc->range_cyclic) {
+               index = mapping->writeback_index; /* Start from prev offset */
+               end = -1;
+       } else {
+               index = wbc->range_start >> PAGE_CACHE_SHIFT;
+               end = wbc->range_end >> PAGE_CACHE_SHIFT;
+               if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+                       range_whole = 1;
+               /* An explicit range is walked once: never wrap around */
+               scanned = 1;
+       }
+retry:
+       while (!done && (index <= end) &&
+              (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                                             PAGECACHE_TAG_DIRTY,
+                                             min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+               unsigned i;
+
+               /* The lookup returned pages, so no wrap-around is needed */
+               scanned = 1;
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pvec.pages[i];
+
+                       /*
+                        * At this point we hold neither mapping->tree_lock nor
+                        * lock on the page itself: the page may be truncated or
+                        * invalidated (changing page->mapping to NULL), or even
+                        * swizzled back from swapper_space to tmpfs file
+                        * mapping
+                        */
+                       lock_page(page);
+
+                       /* Recheck under the page lock: not ours any more? */
+                       if (unlikely(page->mapping != mapping)) {
+                               unlock_page(page);
+                               continue;
+                       }
+
+                       /* Past the end of an explicit range: stop scanning */
+                       if (!wbc->range_cyclic && page->index > end) {
+                               done = 1;
+                               unlock_page(page);
+                               continue;
+                       }
+
+                       if (wbc->sync_mode != WB_SYNC_NONE)
+                               wait_on_page_writeback(page);
+
+                       /* Skip pages still under I/O or no longer dirty */
+                       if (PageWriteback(page) ||
+                           !clear_page_dirty_for_io(page)) {
+                               unlock_page(page);
+                               continue;
+                       }
+
+                       ret = (*writepage)(page, wbc);
+                       if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
+                               /*
+                                * Not an I/O error: ->writepage() declined the
+                                * page and it is ours to unlock.  Do not flag
+                                * the mapping, do not stop the scan because of
+                                * it, and do not let this internal marker leak
+                                * out through our return value.
+                                */
+                               unlock_page(page);
+                               ret = 0;
+                       } else if (ret) {
+                               if (ret == -ENOSPC)
+                                       set_bit(AS_ENOSPC, &mapping->flags);
+                               else
+                                       set_bit(AS_EIO, &mapping->flags);
+                       }
+
+                       /*
+                        * Setting done only ends the outer loop; the rest of
+                        * this pagevec is still processed.
+                        */
+                       if (ret || (--(wbc->nr_to_write) <= 0))
+                               done = 1;
+                       if (wbc->nonblocking && bdi_write_congested(bdi)) {
+                               wbc->encountered_congestion = 1;
+                               done = 1;
+                       }
+               }
+               pagevec_release(&pvec);
+               cond_resched();
+       }
+       if (!scanned && !done) {
+               /*
+                * We hit the last page and there is more work to be done: wrap
+                * back to the start of the file
+                */
+               scanned = 1;
+               index = 0;
+               goto retry;
+       }
+       /* Remember where we stopped so a later cyclic pass resumes there */
+       if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+               mapping->writeback_index = index;
+       return ret;
+}
+
+EXPORT_SYMBOL(generic_writepages);
+
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
        int ret;
@@ -566,7 +693,7 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
                return 0;
        wbc->for_writepages = 1;
        if (mapping->a_ops->writepages)
-               ret =  mapping->a_ops->writepages(mapping, wbc);
+               ret = mapping->a_ops->writepages(mapping, wbc);
        else
                ret = generic_writepages(mapping, wbc);
        wbc->for_writepages = 0;
@@ -640,7 +767,8 @@ int __set_page_dirty_nobuffers(struct page *page)
                        if (mapping2) { /* Race with truncate? */
                                BUG_ON(mapping2 != mapping);
                                if (mapping_cap_account_dirty(mapping))
-                                       inc_page_state(nr_dirty);
+                                       __inc_zone_page_state(page,
+                                                               NR_FILE_DIRTY);
                                radix_tree_tag_set(&mapping->page_tree,
                                        page_index(page), PAGECACHE_TAG_DIRTY);
                        }
@@ -679,9 +807,11 @@ int fastcall set_page_dirty(struct page *page)
 
        if (likely(mapping)) {
                int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
-               if (spd)
-                       return (*spd)(page);
-               return __set_page_dirty_buffers(page);
+#ifdef CONFIG_BLOCK
+               if (!spd)
+                       spd = __set_page_dirty_buffers;
+#endif
+               return (*spd)(page);
        }
        if (!PageDirty(page)) {
                if (!TestSetPageDirty(page))
@@ -705,7 +835,7 @@ int set_page_dirty_lock(struct page *page)
 {
        int ret;
 
-       lock_page(page);
+       lock_page_nosync(page);
        ret = set_page_dirty(page);
        unlock_page(page);
        return ret;
@@ -728,8 +858,14 @@ int test_clear_page_dirty(struct page *page)
                                                page_index(page),
                                                PAGECACHE_TAG_DIRTY);
                        write_unlock_irqrestore(&mapping->tree_lock, flags);
-                       if (mapping_cap_account_dirty(mapping))
-                               dec_page_state(nr_dirty);
+                       /*
+                        * We can continue to use `mapping' here because the
+                        * page is locked, which pins the address_space
+                        */
+                       if (mapping_cap_account_dirty(mapping)) {
+                               page_mkclean(page);
+                               dec_zone_page_state(page, NR_FILE_DIRTY);
+                       }
                        return 1;
                }
                write_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -759,8 +895,10 @@ int clear_page_dirty_for_io(struct page *page)
 
        if (mapping) {
                if (TestClearPageDirty(page)) {
-                       if (mapping_cap_account_dirty(mapping))
-                               dec_page_state(nr_dirty);
+                       if (mapping_cap_account_dirty(mapping)) {
+                               page_mkclean(page);
+                               dec_zone_page_state(page, NR_FILE_DIRTY);
+                       }
                        return 1;
                }
                return 0;
@@ -817,6 +955,15 @@ int test_set_page_writeback(struct page *page)
 }
 EXPORT_SYMBOL(test_set_page_writeback);
 
+/*
+ * Wakes up tasks that are being throttled due to writeback congestion.
+ *
+ * Thin exported wrapper: takes no arguments and simply forwards to
+ * blk_congestion_end(WRITE).
+ */
+void writeback_congestion_end(void)
+{
+       blk_congestion_end(WRITE);
+}
+EXPORT_SYMBOL(writeback_congestion_end);
+
 /*
  * Return true if any of the pages in the mapping are marked with the
  * passed tag.