flush icache before set_pte() on ia64: flush icache at set_pte
[sfrench/cifs-2.6.git] mm/memory.c
diff --git a/mm/memory.c b/mm/memory.c
index 9c6ff7fffdc8cf653d1e04e717a6b3a8bb8fe1c9..bd16dcaeefb8ce4e3b504916b81b67c536d0c4dc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -966,7 +966,7 @@ no_page_table:
         * has touched so far, we don't want to allocate page tables.
         */
        if (flags & FOLL_ANON) {
-               page = ZERO_PAGE(address);
+               page = ZERO_PAGE(0);
                if (flags & FOLL_GET)
                        get_page(page);
                BUG_ON(flags & FOLL_WRITE);
@@ -1047,7 +1047,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                if (pages)
                        foll_flags |= FOLL_GET;
                if (!write && !(vma->vm_flags & VM_LOCKED) &&
-                   (!vma->vm_ops || !vma->vm_ops->nopage))
+                   (!vma->vm_ops || (!vma->vm_ops->nopage &&
+                                       !vma->vm_ops->fault)))
                        foll_flags |= FOLL_ANON;
 
                do {
@@ -1067,31 +1068,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                        cond_resched();
                        while (!(page = follow_page(vma, start, foll_flags))) {
                                int ret;
-                               ret = __handle_mm_fault(mm, vma, start,
+                               ret = handle_mm_fault(mm, vma, start,
                                                foll_flags & FOLL_WRITE);
+                               if (ret & VM_FAULT_ERROR) {
+                                       if (ret & VM_FAULT_OOM)
+                                               return i ? i : -ENOMEM;
+                                       else if (ret & VM_FAULT_SIGBUS)
+                                               return i ? i : -EFAULT;
+                                       BUG();
+                               }
+                               if (ret & VM_FAULT_MAJOR)
+                                       tsk->maj_flt++;
+                               else
+                                       tsk->min_flt++;
+
                                /*
-                                * The VM_FAULT_WRITE bit tells us that do_wp_page has
-                                * broken COW when necessary, even if maybe_mkwrite
-                                * decided not to set pte_write. We can thus safely do
-                                * subsequent page lookups as if they were reads.
+                                * The VM_FAULT_WRITE bit tells us that
+                                * do_wp_page has broken COW when necessary,
+                                * even if maybe_mkwrite decided not to set
+                                * pte_write. We can thus safely do subsequent
+                                * page lookups as if they were reads.
                                 */
                                if (ret & VM_FAULT_WRITE)
                                        foll_flags &= ~FOLL_WRITE;
-                               
-                               switch (ret & ~VM_FAULT_WRITE) {
-                               case VM_FAULT_MINOR:
-                                       tsk->min_flt++;
-                                       break;
-                               case VM_FAULT_MAJOR:
-                                       tsk->maj_flt++;
-                                       break;
-                               case VM_FAULT_SIGBUS:
-                                       return i ? i : -EFAULT;
-                               case VM_FAULT_OOM:
-                                       return i ? i : -ENOMEM;
-                               default:
-                                       BUG();
-                               }
+
                                cond_resched();
                        }
                        if (pages) {
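The fault-handling loop above shows the pattern every caller of handle_mm_fault() must now follow: the return value is a bitmask (0 means a plain minor fault), so callers test flags instead of switching on VM_FAULT_MINOR/MAJOR. A sketch of how a hypothetical arch do_page_fault() fragment would decode it (assumes tsk, mm, vma, address and write_access from the surrounding handler; not part of this patch):

        int fault;

        fault = handle_mm_fault(mm, vma, address, write_access);
        if (unlikely(fault & VM_FAULT_ERROR)) {
                if (fault & VM_FAULT_OOM)
                        goto out_of_memory;
                else if (fault & VM_FAULT_SIGBUS)
                        goto do_sigbus;
                BUG();
        }
        if (fault & VM_FAULT_MAJOR)
                tsk->maj_flt++;
        else
                tsk->min_flt++;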
@@ -1111,95 +1111,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 }
 EXPORT_SYMBOL(get_user_pages);
 
-static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
-                       unsigned long addr, unsigned long end, pgprot_t prot)
-{
-       pte_t *pte;
-       spinlock_t *ptl;
-       int err = 0;
-
-       pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
-       if (!pte)
-               return -EAGAIN;
-       arch_enter_lazy_mmu_mode();
-       do {
-               struct page *page = ZERO_PAGE(addr);
-               pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
-
-               if (unlikely(!pte_none(*pte))) {
-                       err = -EEXIST;
-                       pte++;
-                       break;
-               }
-               page_cache_get(page);
-               page_add_file_rmap(page);
-               inc_mm_counter(mm, file_rss);
-               set_pte_at(mm, addr, pte, zero_pte);
-       } while (pte++, addr += PAGE_SIZE, addr != end);
-       arch_leave_lazy_mmu_mode();
-       pte_unmap_unlock(pte - 1, ptl);
-       return err;
-}
-
-static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
-                       unsigned long addr, unsigned long end, pgprot_t prot)
-{
-       pmd_t *pmd;
-       unsigned long next;
-       int err;
-
-       pmd = pmd_alloc(mm, pud, addr);
-       if (!pmd)
-               return -EAGAIN;
-       do {
-               next = pmd_addr_end(addr, end);
-               err = zeromap_pte_range(mm, pmd, addr, next, prot);
-               if (err)
-                       break;
-       } while (pmd++, addr = next, addr != end);
-       return err;
-}
-
-static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
-                       unsigned long addr, unsigned long end, pgprot_t prot)
-{
-       pud_t *pud;
-       unsigned long next;
-       int err;
-
-       pud = pud_alloc(mm, pgd, addr);
-       if (!pud)
-               return -EAGAIN;
-       do {
-               next = pud_addr_end(addr, end);
-               err = zeromap_pmd_range(mm, pud, addr, next, prot);
-               if (err)
-                       break;
-       } while (pud++, addr = next, addr != end);
-       return err;
-}
-
-int zeromap_page_range(struct vm_area_struct *vma,
-                       unsigned long addr, unsigned long size, pgprot_t prot)
-{
-       pgd_t *pgd;
-       unsigned long next;
-       unsigned long end = addr + size;
-       struct mm_struct *mm = vma->vm_mm;
-       int err;
-
-       BUG_ON(addr >= end);
-       pgd = pgd_offset(mm, addr);
-       flush_cache_range(vma, addr, end);
-       do {
-               next = pgd_addr_end(addr, end);
-               err = zeromap_pud_range(mm, pgd, addr, next, prot);
-               if (err)
-                       break;
-       } while (pgd++, addr = next, addr != end);
-       return err;
-}
-
 pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
 {
        pgd_t * pgd = pgd_offset(mm, addr);
@@ -1638,7 +1549,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
        struct page *old_page, *new_page;
        pte_t entry;
-       int reuse = 0, ret = VM_FAULT_MINOR;
+       int reuse = 0, ret = 0;
+       int page_mkwrite = 0;
        struct page *dirty_page = NULL;
 
        old_page = vm_normal_page(vma, address, orig_pte);
@@ -1687,6 +1599,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        page_cache_release(old_page);
                        if (!pte_same(*page_table, orig_pte))
                                goto unlock;
+
+                       page_mkwrite = 1;
                }
                dirty_page = old_page;
                get_page(dirty_page);
@@ -1697,10 +1611,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                flush_cache_page(vma, address, pte_pfn(orig_pte));
                entry = pte_mkyoung(orig_pte);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-               if (ptep_set_access_flags(vma, address, page_table, entry,1)) {
+               if (ptep_set_access_flags(vma, address, page_table, entry,1))
                        update_mmu_cache(vma, address, entry);
-                       lazy_mmu_prot_update(entry);
-               }
                ret |= VM_FAULT_WRITE;
                goto unlock;
        }
@@ -1714,16 +1626,11 @@ gotten:
 
        if (unlikely(anon_vma_prepare(vma)))
                goto oom;
-       if (old_page == ZERO_PAGE(address)) {
-               new_page = alloc_zeroed_user_highpage_movable(vma, address);
-               if (!new_page)
-                       goto oom;
-       } else {
-               new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-               if (!new_page)
-                       goto oom;
-               cow_user_page(new_page, old_page, address, vma);
-       }
+       VM_BUG_ON(old_page == ZERO_PAGE(0));
+       new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+       if (!new_page)
+               goto oom;
+       cow_user_page(new_page, old_page, address, vma);
 
        /*
         * Re-check the pte - we dropped the lock
@@ -1741,7 +1648,6 @@ gotten:
                flush_cache_page(vma, address, pte_pfn(orig_pte));
                entry = mk_pte(new_page, vma->vm_page_prot);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-               lazy_mmu_prot_update(entry);
                /*
                 * Clear the pte entry and flush it first, before updating the
                 * pte with the new entry. This will avoid a race condition
@@ -1765,7 +1671,16 @@ gotten:
 unlock:
        pte_unmap_unlock(page_table, ptl);
        if (dirty_page) {
-               set_page_dirty_balance(dirty_page);
+               /*
+                * Yes, Virginia, this is actually required to prevent a race
+                * with clear_page_dirty_for_io() from clearing the page dirty
+                * bit after it has cleared all dirty ptes, but before a racing
+                * do_wp_page installs a dirty pte.
+                *
+                * __do_fault() is protected similarly.
+                */
+               wait_on_page_locked(dirty_page);
+               set_page_dirty_balance(dirty_page, page_mkwrite);
                put_page(dirty_page);
        }
        return ret;
@@ -1831,6 +1746,13 @@ static int unmap_mapping_range_vma(struct vm_area_struct *vma,
        unsigned long restart_addr;
        int need_break;
 
+       /*
+        * files that support invalidating or truncating portions of the
+        * file from under mmapped areas must have their ->fault function
+        * return a locked page (and set VM_FAULT_LOCKED in the return).
+        * This provides synchronisation against concurrent unmapping here.
+        */
+
 again:
        restart_addr = vma->vm_truncate_count;
        if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
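To illustrate the locking contract described in the new comment above, a hypothetical ->fault method for a file-backed mapping could look like the sketch below (illustration only, made-up examplefs name, not part of this patch; find_lock_page() hands back the pagecache page already locked, so VM_FAULT_LOCKED can be reported and __do_fault() skips its own lock_page()):

static int examplefs_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct address_space *mapping = vma->vm_file->f_mapping;
        struct page *page;

        /* look the page up locked, so a concurrent truncate cannot race us */
        page = find_lock_page(mapping, vmf->pgoff);
        if (!page)
                return VM_FAULT_SIGBUS; /* a real fs would read the page in */

        vmf->page = page;               /* handed back still locked */
        return VM_FAULT_LOCKED;
}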
@@ -1959,17 +1881,8 @@ void unmap_mapping_range(struct address_space *mapping,
 
        spin_lock(&mapping->i_mmap_lock);
 
-       /* serialize i_size write against truncate_count write */
-       smp_wmb();
-       /* Protect against page faults, and endless unmapping loops */
+       /* Protect against endless unmapping loops */
        mapping->truncate_count++;
-       /*
-        * For archs where spin_lock has inclusive semantics like ia64
-        * this smp_mb() will prevent to read pagetable contents
-        * before the truncate_count increment is visible to
-        * other cpus.
-        */
-       smp_mb();
        if (unlikely(is_restart_addr(mapping->truncate_count))) {
                if (mapping->truncate_count == 0)
                        reset_vma_truncate_counts(mapping);
@@ -2008,8 +1921,18 @@ int vmtruncate(struct inode * inode, loff_t offset)
        if (IS_SWAPFILE(inode))
                goto out_busy;
        i_size_write(inode, offset);
+
+       /*
+        * unmap_mapping_range is called twice, first simply for efficiency
+        * so that truncate_inode_pages does fewer single-page unmaps. However
+        * after this first call, and before truncate_inode_pages finishes,
+        * it is possible for private pages to be COWed, which remain after
+        * truncate_inode_pages finishes, hence the second unmap_mapping_range
+        * call must be made for correctness.
+        */
        unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
        truncate_inode_pages(mapping, offset);
+       unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
        goto out_truncate;
 
 do_expand:
@@ -2049,6 +1972,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
        down_write(&inode->i_alloc_sem);
        unmap_mapping_range(mapping, offset, (end - offset), 1);
        truncate_inode_pages_range(mapping, offset, end);
+       unmap_mapping_range(mapping, offset, (end - offset), 1);
        inode->i_op->truncate_range(inode, offset, end);
        up_write(&inode->i_alloc_sem);
        mutex_unlock(&inode->i_mutex);
@@ -2130,7 +2054,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        struct page *page;
        swp_entry_t entry;
        pte_t pte;
-       int ret = VM_FAULT_MINOR;
+       int ret = 0;
 
        if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
                goto out;
@@ -2198,15 +2122,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        unlock_page(page);
 
        if (write_access) {
+               /* XXX: We could OR the do_wp_page code with this one? */
                if (do_wp_page(mm, vma, address,
-                               page_table, pmd, ptl, pte) == VM_FAULT_OOM)
+                               page_table, pmd, ptl, pte) & VM_FAULT_OOM)
                        ret = VM_FAULT_OOM;
                goto out;
        }
 
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, address, pte);
-       lazy_mmu_prot_update(pte);
 unlock:
        pte_unmap_unlock(page_table, ptl);
 out:
@@ -2231,47 +2155,31 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
        spinlock_t *ptl;
        pte_t entry;
 
-       if (write_access) {
-               /* Allocate our own private page. */
-               pte_unmap(page_table);
-
-               if (unlikely(anon_vma_prepare(vma)))
-                       goto oom;
-               page = alloc_zeroed_user_highpage_movable(vma, address);
-               if (!page)
-                       goto oom;
-
-               entry = mk_pte(page, vma->vm_page_prot);
-               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+       /* Allocate our own private page. */
+       pte_unmap(page_table);
 
-               page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-               if (!pte_none(*page_table))
-                       goto release;
-               inc_mm_counter(mm, anon_rss);
-               lru_cache_add_active(page);
-               page_add_new_anon_rmap(page, vma, address);
-       } else {
-               /* Map the ZERO_PAGE - vm_page_prot is readonly */
-               page = ZERO_PAGE(address);
-               page_cache_get(page);
-               entry = mk_pte(page, vma->vm_page_prot);
+       if (unlikely(anon_vma_prepare(vma)))
+               goto oom;
+       page = alloc_zeroed_user_highpage_movable(vma, address);
+       if (!page)
+               goto oom;
 
-               ptl = pte_lockptr(mm, pmd);
-               spin_lock(ptl);
-               if (!pte_none(*page_table))
-                       goto release;
-               inc_mm_counter(mm, file_rss);
-               page_add_file_rmap(page);
-       }
+       entry = mk_pte(page, vma->vm_page_prot);
+       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
+       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (!pte_none(*page_table))
+               goto release;
+       inc_mm_counter(mm, anon_rss);
+       lru_cache_add_active(page);
+       page_add_new_anon_rmap(page, vma, address);
        set_pte_at(mm, address, page_table, entry);
 
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, address, entry);
-       lazy_mmu_prot_update(entry);
 unlock:
        pte_unmap_unlock(page_table, ptl);
-       return VM_FAULT_MINOR;
+       return 0;
 release:
        page_cache_release(page);
        goto unlock;
@@ -2280,102 +2188,114 @@ oom:
 }
 
 /*
- * do_no_page() tries to create a new page mapping. It aggressively
+ * __do_fault() tries to create a new page mapping. It aggressively
  * tries to share with existing pages, but makes a separate copy if
- * the "write_access" parameter is true in order to avoid the next
- * page fault.
+ * FAULT_FLAG_WRITE is set in the flags parameter, in order to avoid
+ * the next page fault.
  *
  * As this is called only for pages that do not currently exist, we
  * do not need to flush old virtual caches or the TLB.
  *
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
+ * but allow concurrent faults), and pte neither mapped nor locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
-               unsigned long address, pte_t *page_table, pmd_t *pmd,
-               int write_access)
+static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+               unsigned long address, pmd_t *pmd,
+               pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
+       pte_t *page_table;
        spinlock_t *ptl;
-       struct page *new_page;
-       struct address_space *mapping = NULL;
+       struct page *page;
        pte_t entry;
-       unsigned int sequence = 0;
-       int ret = VM_FAULT_MINOR;
        int anon = 0;
        struct page *dirty_page = NULL;
+       struct vm_fault vmf;
+       int ret;
+       int page_mkwrite = 0;
+
+       vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+       vmf.pgoff = pgoff;
+       vmf.flags = flags;
+       vmf.page = NULL;
 
-       pte_unmap(page_table);
        BUG_ON(vma->vm_flags & VM_PFNMAP);
 
-       if (vma->vm_file) {
-               mapping = vma->vm_file->f_mapping;
-               sequence = mapping->truncate_count;
-               smp_rmb(); /* serializes i_size against truncate_count */
+       if (likely(vma->vm_ops->fault)) {
+               ret = vma->vm_ops->fault(vma, &vmf);
+               if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+                       return ret;
+       } else {
+               /* Legacy ->nopage path */
+               ret = 0;
+               vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
+               /* no page was available -- either SIGBUS or OOM */
+               if (unlikely(vmf.page == NOPAGE_SIGBUS))
+                       return VM_FAULT_SIGBUS;
+               else if (unlikely(vmf.page == NOPAGE_OOM))
+                       return VM_FAULT_OOM;
        }
-retry:
-       new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
+
        /*
-        * No smp_rmb is needed here as long as there's a full
-        * spin_lock/unlock sequence inside the ->nopage callback
-        * (for the pagecache lookup) that acts as an implicit
-        * smp_mb() and prevents the i_size read to happen
-        * after the next truncate_count read.
+        * For consistency in subsequent calls, make the faulted page always
+        * locked.
         */
-
-       /* no page was available -- either SIGBUS, OOM or REFAULT */
-       if (unlikely(new_page == NOPAGE_SIGBUS))
-               return VM_FAULT_SIGBUS;
-       else if (unlikely(new_page == NOPAGE_OOM))
-               return VM_FAULT_OOM;
-       else if (unlikely(new_page == NOPAGE_REFAULT))
-               return VM_FAULT_MINOR;
+       if (unlikely(!(ret & VM_FAULT_LOCKED)))
+               lock_page(vmf.page);
+       else
+               VM_BUG_ON(!PageLocked(vmf.page));
 
        /*
         * Should we do an early C-O-W break?
         */
-       if (write_access) {
+       page = vmf.page;
+       if (flags & FAULT_FLAG_WRITE) {
                if (!(vma->vm_flags & VM_SHARED)) {
-                       struct page *page;
-
-                       if (unlikely(anon_vma_prepare(vma)))
-                               goto oom;
+                       anon = 1;
+                       if (unlikely(anon_vma_prepare(vma))) {
+                               ret = VM_FAULT_OOM;
+                               goto out;
+                       }
                        page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
                                                vma, address);
-                       if (!page)
-                               goto oom;
-                       copy_user_highpage(page, new_page, address, vma);
-                       page_cache_release(new_page);
-                       new_page = page;
-                       anon = 1;
-
+                       if (!page) {
+                               ret = VM_FAULT_OOM;
+                               goto out;
+                       }
+                       copy_user_highpage(page, vmf.page, address, vma);
                } else {
-                       /* if the page will be shareable, see if the backing
+                       /*
+                        * If the page will be shareable, see if the backing
                         * address space wants to know that the page is about
-                        * to become writable */
-                       if (vma->vm_ops->page_mkwrite &&
-                           vma->vm_ops->page_mkwrite(vma, new_page) < 0
-                           ) {
-                               page_cache_release(new_page);
-                               return VM_FAULT_SIGBUS;
+                        * to become writable
+                        */
+                       if (vma->vm_ops->page_mkwrite) {
+                               unlock_page(page);
+                               if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
+                                       ret = VM_FAULT_SIGBUS;
+                                       anon = 1; /* no anon but release vmf.page */
+                                       goto out_unlocked;
+                               }
+                               lock_page(page);
+                               /*
+                                * XXX: this is not quite right (racy vs
+                                * invalidate) to unlock and relock the page
+                                * like this, however a better fix requires
+                                * reworking page_mkwrite locking API, which
+                                * is better done later.
+                                */
+                               if (!page->mapping) {
+                                       ret = 0;
+                                       anon = 1; /* no anon but release vmf.page */
+                                       goto out;
+                               }
+                               page_mkwrite = 1;
                        }
                }
+
        }
 
        page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-       /*
-        * For a file-backed vma, someone could have truncated or otherwise
-        * invalidated this page.  If unmap_mapping_range got called,
-        * retry getting the page.
-        */
-       if (mapping && unlikely(sequence != mapping->truncate_count)) {
-               pte_unmap_unlock(page_table, ptl);
-               page_cache_release(new_page);
-               cond_resched();
-               sequence = mapping->truncate_count;
-               smp_rmb();
-               goto retry;
-       }
 
        /*
         * This silly early PAGE_DIRTY setting removes a race
@@ -2388,45 +2308,62 @@ retry:
         * handle that later.
         */
        /* Only go through if we didn't race with anybody else... */
-       if (pte_none(*page_table)) {
-               flush_icache_page(vma, new_page);
-               entry = mk_pte(new_page, vma->vm_page_prot);
-               if (write_access)
+       if (likely(pte_same(*page_table, orig_pte))) {
+               flush_icache_page(vma, page);
+               entry = mk_pte(page, vma->vm_page_prot);
+               if (flags & FAULT_FLAG_WRITE)
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                set_pte_at(mm, address, page_table, entry);
                if (anon) {
-                       inc_mm_counter(mm, anon_rss);
-                       lru_cache_add_active(new_page);
-                       page_add_new_anon_rmap(new_page, vma, address);
+                       inc_mm_counter(mm, anon_rss);
+                       lru_cache_add_active(page);
+                       page_add_new_anon_rmap(page, vma, address);
                } else {
                        inc_mm_counter(mm, file_rss);
-                       page_add_file_rmap(new_page);
-                       if (write_access) {
-                               dirty_page = new_page;
+                       page_add_file_rmap(page);
+                       if (flags & FAULT_FLAG_WRITE) {
+                               dirty_page = page;
                                get_page(dirty_page);
                        }
                }
+
+               /* no need to invalidate: a not-present page won't be cached */
+               update_mmu_cache(vma, address, entry);
        } else {
-               /* One of our sibling threads was faster, back out. */
-               page_cache_release(new_page);
-               goto unlock;
+               if (anon)
+                       page_cache_release(page);
+               else
+                       anon = 1; /* no anon but release vmf.page */
        }
 
-       /* no need to invalidate: a not-present page shouldn't be cached */
-       update_mmu_cache(vma, address, entry);
-       lazy_mmu_prot_update(entry);
-unlock:
        pte_unmap_unlock(page_table, ptl);
-       if (dirty_page) {
-               set_page_dirty_balance(dirty_page);
+
+out:
+       unlock_page(vmf.page);
+out_unlocked:
+       if (anon)
+               page_cache_release(vmf.page);
+       else if (dirty_page) {
+               set_page_dirty_balance(dirty_page, page_mkwrite);
                put_page(dirty_page);
        }
+
        return ret;
-oom:
-       page_cache_release(new_page);
-       return VM_FAULT_OOM;
 }
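The page_mkwrite branch in __do_fault() above calls into the filesystem with the page unlocked and maps any negative return onto SIGBUS; a minimal hypothetical ->page_mkwrite, with made-up examplefs names and not taken from this patch, might be:

static int examplefs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
        struct address_space *mapping = vma->vm_file->f_mapping;
        int ret = 0;

        lock_page(page);
        /*
         * Refuse the write if the page has been truncated away meanwhile;
         * a real filesystem would also allocate or reserve blocks here.
         */
        if (page->mapping != mapping ||
            page_offset(page) >= i_size_read(mapping->host))
                ret = -EINVAL;          /* __do_fault() turns this into SIGBUS */
        unlock_page(page);
        return ret;
}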
 
+static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+               unsigned long address, pte_t *page_table, pmd_t *pmd,
+               int write_access, pte_t orig_pte)
+{
+       pgoff_t pgoff = (((address & PAGE_MASK)
+                       - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+       unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
+
+       pte_unmap(page_table);
+       return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+}
+
+
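With do_linear_fault()/__do_fault() in place, converting a driver from ->nopage is mostly mechanical: index by vmf->pgoff instead of the faulting address, hand the page back through vmf->page, and return VM_FAULT_* flags instead of the NOPAGE_* magic pointers. A hypothetical sketch with made-up mydrv names (not from this patch):

static int mydrv_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct mydrv_buf *buf = vma->vm_private_data;

        if (vmf->pgoff >= buf->nr_pages)
                return VM_FAULT_SIGBUS;         /* was: return NOPAGE_SIGBUS; */

        vmf->page = buf->pages[vmf->pgoff];     /* was: return the page itself */
        get_page(vmf->page);
        return 0;                               /* plain minor fault, no flags */
}

static struct vm_operations_struct mydrv_vm_ops = {
        .fault  = mydrv_fault,                  /* was: .nopage = mydrv_nopage, */
};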
 /*
  * do_no_pfn() tries to create a new page mapping for a page without
  * a struct_page backing it
@@ -2450,7 +2387,6 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
        spinlock_t *ptl;
        pte_t entry;
        unsigned long pfn;
-       int ret = VM_FAULT_MINOR;
 
        pte_unmap(page_table);
        BUG_ON(!(vma->vm_flags & VM_PFNMAP));
@@ -2462,7 +2398,7 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
        else if (unlikely(pfn == NOPFN_SIGBUS))
                return VM_FAULT_SIGBUS;
        else if (unlikely(pfn == NOPFN_REFAULT))
-               return VM_FAULT_MINOR;
+               return 0;
 
        page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 
@@ -2474,7 +2410,7 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
                set_pte_at(mm, address, page_table, entry);
        }
        pte_unmap_unlock(page_table, ptl);
-       return ret;
+       return 0;
 }
 
 /*
@@ -2486,33 +2422,28 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pte_t *page_table, pmd_t *pmd,
                int write_access, pte_t orig_pte)
 {
+       unsigned int flags = FAULT_FLAG_NONLINEAR |
+                               (write_access ? FAULT_FLAG_WRITE : 0);
        pgoff_t pgoff;
-       int err;
 
        if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-               return VM_FAULT_MINOR;
+               return 0;
 
-       if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
+       if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
+                       !(vma->vm_flags & VM_CAN_NONLINEAR))) {
                /*
                 * Page table corrupted: show pte and kill process.
                 */
                print_bad_pte(vma, orig_pte, address);
                return VM_FAULT_OOM;
        }
-       /* We can then assume vm->vm_ops && vma->vm_ops->populate */
 
        pgoff = pte_to_pgoff(orig_pte);
-       err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
-                                       vma->vm_page_prot, pgoff, 0);
-       if (err == -ENOMEM)
-               return VM_FAULT_OOM;
-       if (err)
-               return VM_FAULT_SIGBUS;
-       return VM_FAULT_MAJOR;
+       return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
 /*
@@ -2539,10 +2470,9 @@ static inline int handle_pte_fault(struct mm_struct *mm,
        if (!pte_present(entry)) {
                if (pte_none(entry)) {
                        if (vma->vm_ops) {
-                               if (vma->vm_ops->nopage)
-                                       return do_no_page(mm, vma, address,
-                                                         pte, pmd,
-                                                         write_access);
+                               if (vma->vm_ops->fault || vma->vm_ops->nopage)
+                                       return do_linear_fault(mm, vma, address,
+                                               pte, pmd, write_access, entry);
                                if (unlikely(vma->vm_ops->nopfn))
                                        return do_no_pfn(mm, vma, address, pte,
                                                         pmd, write_access);
@@ -2551,7 +2481,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
                                                 pte, pmd, write_access);
                }
                if (pte_file(entry))
-                       return do_file_page(mm, vma, address,
+                       return do_nonlinear_fault(mm, vma, address,
                                        pte, pmd, write_access, entry);
                return do_swap_page(mm, vma, address,
                                        pte, pmd, write_access, entry);
@@ -2570,7 +2500,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
        entry = pte_mkyoung(entry);
        if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
                update_mmu_cache(vma, address, entry);
-               lazy_mmu_prot_update(entry);
        } else {
                /*
                 * This is needed only for protection faults but the arch code
@@ -2583,13 +2512,13 @@ static inline int handle_pte_fault(struct mm_struct *mm,
        }
 unlock:
        pte_unmap_unlock(pte, ptl);
-       return VM_FAULT_MINOR;
+       return 0;
 }
 
 /*
  * By the time we get here, we already hold the mm semaphore
  */
-int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, int write_access)
 {
        pgd_t *pgd;
@@ -2618,8 +2547,6 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
 }
 
-EXPORT_SYMBOL_GPL(__handle_mm_fault);
-
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
  * Allocate page upper directory.
@@ -2824,3 +2751,4 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
 
        return buf - old_buf;
 }
+EXPORT_SYMBOL_GPL(access_process_vm);