[PATCH] paravirt: lazy mmu mode hooks.patch
[sfrench/cifs-2.6.git] / mm / mprotect.c
1 /*
2  *  mm/mprotect.c
3  *
4  *  (C) Copyright 1994 Linus Torvalds
5  *  (C) Copyright 2002 Christoph Hellwig
6  *
7  *  Address space accounting code       <alan@redhat.com>
8  *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
9  */
10
11 #include <linux/mm.h>
12 #include <linux/hugetlb.h>
13 #include <linux/slab.h>
14 #include <linux/shm.h>
15 #include <linux/mman.h>
16 #include <linux/fs.h>
17 #include <linux/highmem.h>
18 #include <linux/security.h>
19 #include <linux/mempolicy.h>
20 #include <linux/personality.h>
21 #include <linux/syscalls.h>
22 #include <linux/swap.h>
23 #include <linux/swapops.h>
24 #include <asm/uaccess.h>
25 #include <asm/pgtable.h>
26 #include <asm/cacheflush.h>
27 #include <asm/tlbflush.h>
28
29 static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
30                 unsigned long addr, unsigned long end, pgprot_t newprot,
31                 int dirty_accountable)
32 {
33         pte_t *pte, oldpte;
34         spinlock_t *ptl;
35
36         pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
37         arch_enter_lazy_mmu_mode();
38         do {
39                 oldpte = *pte;
40                 if (pte_present(oldpte)) {
41                         pte_t ptent;
42
43                         /* Avoid an SMP race with hardware updated dirty/clean
44                          * bits by wiping the pte and then setting the new pte
45                          * into place.
46                          */
47                         ptent = ptep_get_and_clear(mm, addr, pte);
48                         ptent = pte_modify(ptent, newprot);
49                         /*
50                          * Avoid taking write faults for pages we know to be
51                          * dirty.
52                          */
53                         if (dirty_accountable && pte_dirty(ptent))
54                                 ptent = pte_mkwrite(ptent);
55                         set_pte_at(mm, addr, pte, ptent);
56                         lazy_mmu_prot_update(ptent);
57 #ifdef CONFIG_MIGRATION
58                 } else if (!pte_file(oldpte)) {
59                         swp_entry_t entry = pte_to_swp_entry(oldpte);
60
61                         if (is_write_migration_entry(entry)) {
62                                 /*
63                                  * A protection check is difficult so
64                                  * just be safe and disable write
65                                  */
66                                 make_migration_entry_read(&entry);
67                                 set_pte_at(mm, addr, pte,
68                                         swp_entry_to_pte(entry));
69                         }
70 #endif
71                 }
72
73         } while (pte++, addr += PAGE_SIZE, addr != end);
74         arch_leave_lazy_mmu_mode();
75         pte_unmap_unlock(pte - 1, ptl);
76 }
77
78 static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
79                 unsigned long addr, unsigned long end, pgprot_t newprot,
80                 int dirty_accountable)
81 {
82         pmd_t *pmd;
83         unsigned long next;
84
85         pmd = pmd_offset(pud, addr);
86         do {
87                 next = pmd_addr_end(addr, end);
88                 if (pmd_none_or_clear_bad(pmd))
89                         continue;
90                 change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
91         } while (pmd++, addr = next, addr != end);
92 }
93
94 static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
95                 unsigned long addr, unsigned long end, pgprot_t newprot,
96                 int dirty_accountable)
97 {
98         pud_t *pud;
99         unsigned long next;
100
101         pud = pud_offset(pgd, addr);
102         do {
103                 next = pud_addr_end(addr, end);
104                 if (pud_none_or_clear_bad(pud))
105                         continue;
106                 change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable);
107         } while (pud++, addr = next, addr != end);
108 }
109
110 static void change_protection(struct vm_area_struct *vma,
111                 unsigned long addr, unsigned long end, pgprot_t newprot,
112                 int dirty_accountable)
113 {
114         struct mm_struct *mm = vma->vm_mm;
115         pgd_t *pgd;
116         unsigned long next;
117         unsigned long start = addr;
118
119         BUG_ON(addr >= end);
120         pgd = pgd_offset(mm, addr);
121         flush_cache_range(vma, addr, end);
122         do {
123                 next = pgd_addr_end(addr, end);
124                 if (pgd_none_or_clear_bad(pgd))
125                         continue;
126                 change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable);
127         } while (pgd++, addr = next, addr != end);
128         flush_tlb_range(vma, start, end);
129 }
130
131 static int
132 mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
133         unsigned long start, unsigned long end, unsigned long newflags)
134 {
135         struct mm_struct *mm = vma->vm_mm;
136         unsigned long oldflags = vma->vm_flags;
137         long nrpages = (end - start) >> PAGE_SHIFT;
138         unsigned long charged = 0;
139         pgoff_t pgoff;
140         int error;
141         int dirty_accountable = 0;
142
143         if (newflags == oldflags) {
144                 *pprev = vma;
145                 return 0;
146         }
147
148         /*
149          * If we make a private mapping writable we increase our commit;
150          * but (without finer accounting) cannot reduce our commit if we
151          * make it unwritable again.
152          *
153          * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
154          * a MAP_NORESERVE private mapping to writable will now reserve.
155          */
156         if (newflags & VM_WRITE) {
157                 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
158                         charged = nrpages;
159                         if (security_vm_enough_memory(charged))
160                                 return -ENOMEM;
161                         newflags |= VM_ACCOUNT;
162                 }
163         }
164
165         /*
166          * First try to merge with previous and/or next vma.
167          */
168         pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
169         *pprev = vma_merge(mm, *pprev, start, end, newflags,
170                         vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
171         if (*pprev) {
172                 vma = *pprev;
173                 goto success;
174         }
175
176         *pprev = vma;
177
178         if (start != vma->vm_start) {
179                 error = split_vma(mm, vma, start, 1);
180                 if (error)
181                         goto fail;
182         }
183
184         if (end != vma->vm_end) {
185                 error = split_vma(mm, vma, end, 0);
186                 if (error)
187                         goto fail;
188         }
189
190 success:
191         /*
192          * vm_flags and vm_page_prot are protected by the mmap_sem
193          * held in write mode.
194          */
195         vma->vm_flags = newflags;
196         vma->vm_page_prot = protection_map[newflags &
197                 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
198         if (vma_wants_writenotify(vma)) {
199                 vma->vm_page_prot = protection_map[newflags &
200                         (VM_READ|VM_WRITE|VM_EXEC)];
201                 dirty_accountable = 1;
202         }
203
204         if (is_vm_hugetlb_page(vma))
205                 hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
206         else
207                 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
208         vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
209         vm_stat_account(mm, newflags, vma->vm_file, nrpages);
210         return 0;
211
212 fail:
213         vm_unacct_memory(charged);
214         return error;
215 }
216
217 asmlinkage long
218 sys_mprotect(unsigned long start, size_t len, unsigned long prot)
219 {
220         unsigned long vm_flags, nstart, end, tmp, reqprot;
221         struct vm_area_struct *vma, *prev;
222         int error = -EINVAL;
223         const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
224         prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
225         if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
226                 return -EINVAL;
227
228         if (start & ~PAGE_MASK)
229                 return -EINVAL;
230         if (!len)
231                 return 0;
232         len = PAGE_ALIGN(len);
233         end = start + len;
234         if (end <= start)
235                 return -ENOMEM;
236         if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM))
237                 return -EINVAL;
238
239         reqprot = prot;
240         /*
241          * Does the application expect PROT_READ to imply PROT_EXEC:
242          */
243         if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
244                 prot |= PROT_EXEC;
245
246         vm_flags = calc_vm_prot_bits(prot);
247
248         down_write(&current->mm->mmap_sem);
249
250         vma = find_vma_prev(current->mm, start, &prev);
251         error = -ENOMEM;
252         if (!vma)
253                 goto out;
254         if (unlikely(grows & PROT_GROWSDOWN)) {
255                 if (vma->vm_start >= end)
256                         goto out;
257                 start = vma->vm_start;
258                 error = -EINVAL;
259                 if (!(vma->vm_flags & VM_GROWSDOWN))
260                         goto out;
261         }
262         else {
263                 if (vma->vm_start > start)
264                         goto out;
265                 if (unlikely(grows & PROT_GROWSUP)) {
266                         end = vma->vm_end;
267                         error = -EINVAL;
268                         if (!(vma->vm_flags & VM_GROWSUP))
269                                 goto out;
270                 }
271         }
272         if (start > vma->vm_start)
273                 prev = vma;
274
275         for (nstart = start ; ; ) {
276                 unsigned long newflags;
277
278                 /* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
279
280                 newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
281
282                 /* newflags >> 4 shift VM_MAY% in place of VM_% */
283                 if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
284                         error = -EACCES;
285                         goto out;
286                 }
287
288                 error = security_file_mprotect(vma, reqprot, prot);
289                 if (error)
290                         goto out;
291
292                 tmp = vma->vm_end;
293                 if (tmp > end)
294                         tmp = end;
295                 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
296                 if (error)
297                         goto out;
298                 nstart = tmp;
299
300                 if (nstart < prev->vm_end)
301                         nstart = prev->vm_end;
302                 if (nstart >= end)
303                         goto out;
304
305                 vma = prev->vm_next;
306                 if (!vma || vma->vm_start != nstart) {
307                         error = -ENOMEM;
308                         goto out;
309                 }
310         }
311 out:
312         up_write(&current->mm->mmap_sem);
313         return error;
314 }