x86: use the pfn from the page when changing its attributes
[sfrench/cifs-2.6.git] / arch/x86/mm/pageattr.c
/*
 * Copyright 2002 Andi Kleen, SuSE Labs.
 * Thanks to Ben LaHaise for precious feedback.
 */
#include <linux/highmem.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/mm.h>

#include <asm/e820.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>

static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
        return addr >= start && addr < end;
}

/*
 * Flushing functions
 */

/**
 * clflush_cache_range - flush a cache range with clflush
 * @vaddr:      virtual start address
 * @size:       number of bytes to flush
 *
 * clflush is an unordered instruction which needs fencing with mfence
 * to avoid ordering issues.
 */
void clflush_cache_range(void *vaddr, unsigned int size)
{
        void *vend = vaddr + size - 1;

        mb();

        for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
                clflush(vaddr);
        /*
         * Flush any possible final partial cacheline:
         */
        clflush(vend);

        mb();
}
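/*
 * Note: clflush is not available on every CPU. In this file,
 * clflush_cache_range() is only reached via cpa_flush_range(), and
 * change_page_attr_set_clr() below checks cpu_has_clflush first,
 * falling back to cpa_flush_all() (wbinvd) otherwise.
 */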

static void __cpa_flush_all(void *arg)
{
        /*
         * Flush all to work around errata in early Athlons regarding
         * large page flushing.
         */
        __flush_tlb_all();

        if (boot_cpu_data.x86_model >= 4)
                wbinvd();
}

static void cpa_flush_all(void)
{
        BUG_ON(irqs_disabled());

        on_each_cpu(__cpa_flush_all, NULL, 1, 1);
}

static void __cpa_flush_range(void *arg)
{
        /*
         * We could optimize that further and do individual per page
         * tlb invalidates for a low number of pages. Caveat: we must
         * flush the high aliases on 64bit as well.
         */
        __flush_tlb_all();
}

static void cpa_flush_range(unsigned long start, int numpages)
{
        unsigned long addr;
        unsigned int i;
        int level;

        BUG_ON(irqs_disabled());
        WARN_ON(PAGE_ALIGN(start) != start);

        on_each_cpu(__cpa_flush_range, NULL, 1, 1);

        /*
         * We only need to flush on one CPU,
         * clflush is a MESI-coherent instruction that
         * will cause all other CPUs to flush the same
         * cachelines:
         */
        for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
                pte_t *pte = lookup_address(addr, &level);

                /*
                 * Only flush present addresses:
                 */
                if (pte && pte_present(*pte))
                        clflush_cache_range((void *) addr, PAGE_SIZE);
        }
}

#define HIGH_MAP_START  __START_KERNEL_map
#define HIGH_MAP_END    (__START_KERNEL_map + KERNEL_TEXT_SIZE)

/*
 * Converts a virtual address to an x86-64 highmap address
 */
static unsigned long virt_to_highmap(void *address)
{
#ifdef CONFIG_X86_64
        return __pa((unsigned long)address) + HIGH_MAP_START - phys_base;
#else
        return (unsigned long)address;
#endif
}
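/*
 * In other words (64-bit only): take the physical address behind the
 * given linear-mapping address and rebase it into the kernel's high
 * mapping at __START_KERNEL_map. Illustrative example, assuming an
 * unrelocated kernel (phys_base == 0): __va(0x100000) converts to
 * __START_KERNEL_map + 0x100000.
 */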

/*
 * Certain areas of memory on x86 require very specific protection flags,
 * for example the BIOS area or kernel text. Callers don't always get this
 * right (again, ioremap() on BIOS memory is not uncommon) so this function
 * checks and fixes these known static required protection bits.
 */
static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
{
        pgprot_t forbidden = __pgprot(0);

        /*
         * The BIOS area between 640k and 1Mb needs to be executable for
         * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
         */
        if (within(__pa(address), BIOS_BEGIN, BIOS_END))
                pgprot_val(forbidden) |= _PAGE_NX;

        /*
         * The kernel text needs to be executable for obvious reasons
         * Does not cover __inittext since that is gone later on
         */
        if (within(address, (unsigned long)_text, (unsigned long)_etext))
                pgprot_val(forbidden) |= _PAGE_NX;
        /*
         * Do the same for the x86-64 high kernel mapping
         */
        if (within(address, virt_to_highmap(_text), virt_to_highmap(_etext)))
                pgprot_val(forbidden) |= _PAGE_NX;

#ifdef CONFIG_DEBUG_RODATA
        /* The .rodata section needs to be read-only */
        if (within(address, (unsigned long)__start_rodata,
                                (unsigned long)__end_rodata))
                pgprot_val(forbidden) |= _PAGE_RW;
        /*
         * Do the same for the x86-64 high kernel mapping
         */
        if (within(address, virt_to_highmap(__start_rodata),
                                virt_to_highmap(__end_rodata)))
                pgprot_val(forbidden) |= _PAGE_RW;
#endif

        prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));

        return prot;
}
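/*
 * Illustrative effect of the filtering above: a set_memory_nx() request
 * that happens to cover kernel text has _PAGE_NX masked out here, so the
 * text stays executable; with CONFIG_DEBUG_RODATA, a set_memory_rw()
 * request covering .rodata has _PAGE_RW masked out, so .rodata stays
 * read-only. The caller's other attribute changes still go through.
 */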

pte_t *lookup_address(unsigned long address, int *level)
{
        pgd_t *pgd = pgd_offset_k(address);
        pud_t *pud;
        pmd_t *pmd;

        *level = PG_LEVEL_NONE;

        if (pgd_none(*pgd))
                return NULL;
        pud = pud_offset(pgd, address);
        if (pud_none(*pud))
                return NULL;
        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd))
                return NULL;

        *level = PG_LEVEL_2M;
        if (pmd_large(*pmd))
                return (pte_t *)pmd;

        *level = PG_LEVEL_4K;
        return pte_offset_kernel(pmd, address);
}
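/*
 * Note on the return convention above: for a large (PSE) mapping the pmd
 * entry itself is returned, cast to a pte_t *, with *level set to
 * PG_LEVEL_2M. Callers must therefore check *level before treating the
 * result as a regular 4K pte; see __change_page_attr() below.
 */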

static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
{
        /* change init_mm */
        set_pte_atomic(kpte, pte);
#ifdef CONFIG_X86_32
        if (!SHARED_KERNEL_PMD) {
                struct page *page;

                list_for_each_entry(page, &pgd_list, lru) {
                        pgd_t *pgd;
                        pud_t *pud;
                        pmd_t *pmd;

                        pgd = (pgd_t *)page_address(page) + pgd_index(address);
                        pud = pud_offset(pgd, address);
                        pmd = pmd_offset(pud, address);
                        set_pte_atomic((pte_t *)pmd, pte);
                }
        }
#endif
}
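/*
 * On 32-bit, when the kernel pmds are not shared between the per-process
 * page tables (!SHARED_KERNEL_PMD), updating init_mm alone would leave
 * stale entries in other processes' page tables, so the loop above also
 * writes the new entry into the matching slot of every pgd on pgd_list.
 */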

static int split_large_page(pte_t *kpte, unsigned long address)
{
        pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte));
        gfp_t gfp_flags = GFP_KERNEL;
        unsigned long flags;
        unsigned long addr;
        pte_t *pbase, *tmp;
        struct page *base;
        unsigned int i;
        int level;

#ifdef CONFIG_DEBUG_PAGEALLOC
        gfp_flags = GFP_ATOMIC | __GFP_NOWARN;
#endif
        base = alloc_pages(gfp_flags, 0);
        if (!base)
                return -ENOMEM;

        spin_lock_irqsave(&pgd_lock, flags);
        /*
         * Check for races, another CPU might have split this page
         * up for us already:
         */
        tmp = lookup_address(address, &level);
        if (tmp != kpte) {
                WARN_ON_ONCE(1);
                goto out_unlock;
        }

        address = __pa(address);
        addr = address & LARGE_PAGE_MASK;
        pbase = (pte_t *)page_address(base);
#ifdef CONFIG_X86_32
        paravirt_alloc_pt(&init_mm, page_to_pfn(base));
#endif

        for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE)
                set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, ref_prot));

        /*
         * Install the new, split up pagetable. Important detail here:
         *
         * On Intel the NX bit of all levels must be cleared to make a
         * page executable. See section 4.13.2 of the Intel 64 and IA-32
         * Architectures Software Developer's Manual.
         */
        ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
        __set_pmd_pte(kpte, address, mk_pte(base, ref_prot));
        base = NULL;

out_unlock:
        spin_unlock_irqrestore(&pgd_lock, flags);

        if (base)
                __free_pages(base, 0);

        return 0;
}
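/*
 * split_large_page() in short: allocate one page for the new pte table,
 * re-check under pgd_lock that the entry is still the same large pte
 * (another CPU may have split it already), fill the new table with 4K
 * ptes covering the same physical range with the large page's
 * protections, and hook it in via __set_pmd_pte(). On the race path the
 * freshly allocated page is simply freed again.
 */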

static int
__change_page_attr(unsigned long address, pgprot_t mask_set, pgprot_t mask_clr)
{
        struct page *kpte_page;
        int level, err = 0;
        pte_t *kpte;

repeat:
        kpte = lookup_address(address, &level);
        if (!kpte)
                return -EINVAL;

        kpte_page = virt_to_page(kpte);
        BUG_ON(PageLRU(kpte_page));
        BUG_ON(PageCompound(kpte_page));

        if (level == PG_LEVEL_4K) {
                pte_t new_pte, old_pte = *kpte;
                pgprot_t new_prot = pte_pgprot(old_pte);

                if (!pte_val(old_pte)) {
                        WARN_ON_ONCE(1);
                        return -EINVAL;
                }

                pgprot_val(new_prot) &= ~pgprot_val(mask_clr);
                pgprot_val(new_prot) |= pgprot_val(mask_set);

                new_prot = static_protections(new_prot, address);

                /*
                 * We need to keep the pfn from the existing PTE,
                 * after all we're only going to change its attributes,
                 * not the memory it points to.
                 */
                new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
                set_pte_atomic(kpte, new_pte);
        } else {
                err = split_large_page(kpte, address);
                if (!err)
                        goto repeat;
        }
        return err;
}
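/*
 * Note: __change_page_attr() takes a pair of masks so one call can both
 * set and clear attribute bits. If the address is still covered by a
 * large mapping, the mapping is first split into 4K pages and the lookup
 * is repeated, so the actual attribute change always happens on a 4K pte.
 */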

/**
 * change_page_attr_addr - Change page table attributes in linear mapping
 * @address:  Virtual address in linear mapping.
 * @mask_set: Page table attribute bits to set (_PAGE_*)
 * @mask_clr: Page table attribute bits to clear (_PAGE_*)
 *
 * Change page attributes of a page in the direct mapping. This is a variant
 * of change_page_attr() that also works on memory holes that do not have
 * mem_map entry (pfn_valid() is false).
 *
 * See change_page_attr() documentation for more details.
 *
 * Modules and drivers should use the set_memory_* APIs instead.
 */
static int
change_page_attr_addr(unsigned long address, pgprot_t mask_set,
                      pgprot_t mask_clr)
{
        int err;

#ifdef CONFIG_X86_64
        unsigned long phys_addr = __pa(address);

        /*
         * If we are inside the high mapped kernel range, then we
         * fixup the low mapping first. __va() returns the virtual
         * address in the linear mapping:
         */
        if (within(address, HIGH_MAP_START, HIGH_MAP_END))
                address = (unsigned long) __va(phys_addr);
#endif

        err = __change_page_attr(address, mask_set, mask_clr);
        if (err)
                return err;

#ifdef CONFIG_X86_64
        /*
         * If the physical address is inside the kernel map, we need
         * to touch the high mapped kernel as well:
         */
        if (within(phys_addr, 0, KERNEL_TEXT_SIZE)) {
                /*
                 * Calc the high mapping address. See __phys_addr()
                 * for the non obvious details.
                 *
                 * Note that NX and other required permissions are
                 * checked in static_protections().
                 */
                address = phys_addr + HIGH_MAP_START - phys_base;

                /*
                 * Our high aliases are imprecise, because we check
                 * everything between 0 and KERNEL_TEXT_SIZE, so do
                 * not propagate lookup failures back to users:
                 */
                __change_page_attr(address, mask_set, mask_clr);
        }
#endif
        return err;
}

static int __change_page_attr_set_clr(unsigned long addr, int numpages,
                                      pgprot_t mask_set, pgprot_t mask_clr)
{
        unsigned int i;
        int ret;

        for (i = 0; i < numpages; i++, addr += PAGE_SIZE) {
                ret = change_page_attr_addr(addr, mask_set, mask_clr);
                if (ret)
                        return ret;
        }

        return 0;
}

static int change_page_attr_set_clr(unsigned long addr, int numpages,
                                    pgprot_t mask_set, pgprot_t mask_clr)
{
        int ret = __change_page_attr_set_clr(addr, numpages, mask_set,
                                             mask_clr);

        /*
         * On success we use clflush, when the CPU supports it, to
         * avoid the wbinvd. If the CPU does not support clflush, and
         * in the error case, we fall back to cpa_flush_all() (which
         * uses wbinvd):
         */
        if (!ret && cpu_has_clflush)
                cpa_flush_range(addr, numpages);
        else
                cpa_flush_all();

        return ret;
}

static inline int change_page_attr_set(unsigned long addr, int numpages,
                                       pgprot_t mask)
{
        return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
}

static inline int change_page_attr_clear(unsigned long addr, int numpages,
                                         pgprot_t mask)
{
        return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
}

int set_memory_uc(unsigned long addr, int numpages)
{
        return change_page_attr_set(addr, numpages,
                                    __pgprot(_PAGE_PCD | _PAGE_PWT));
}
EXPORT_SYMBOL(set_memory_uc);

int set_memory_wb(unsigned long addr, int numpages)
{
        return change_page_attr_clear(addr, numpages,
                                      __pgprot(_PAGE_PCD | _PAGE_PWT));
}
EXPORT_SYMBOL(set_memory_wb);
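/*
 * Typical (illustrative) use from a driver: mark a kernel buffer uncached
 * while a device accesses it, then restore write-back caching:
 *
 *      set_memory_uc((unsigned long)buf, nrpages);
 *      ... device uses the buffer ...
 *      set_memory_wb((unsigned long)buf, nrpages);
 *
 * where buf is a page-aligned address in the kernel direct mapping and
 * nrpages the number of pages it spans; both names are placeholders.
 */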

int set_memory_x(unsigned long addr, int numpages)
{
        return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
}
EXPORT_SYMBOL(set_memory_x);

int set_memory_nx(unsigned long addr, int numpages)
{
        return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
}
EXPORT_SYMBOL(set_memory_nx);

int set_memory_ro(unsigned long addr, int numpages)
{
        return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
}

int set_memory_rw(unsigned long addr, int numpages)
{
        return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
}

int set_memory_np(unsigned long addr, int numpages)
{
        return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
}

int set_pages_uc(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return set_memory_uc(addr, numpages);
}
EXPORT_SYMBOL(set_pages_uc);

int set_pages_wb(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return set_memory_wb(addr, numpages);
}
EXPORT_SYMBOL(set_pages_wb);

int set_pages_x(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return set_memory_x(addr, numpages);
}
EXPORT_SYMBOL(set_pages_x);

int set_pages_nx(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return set_memory_nx(addr, numpages);
}
EXPORT_SYMBOL(set_pages_nx);

int set_pages_ro(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return set_memory_ro(addr, numpages);
}

int set_pages_rw(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return set_memory_rw(addr, numpages);
}

#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_CPA_DEBUG)
static inline int __change_page_attr_set(unsigned long addr, int numpages,
                                         pgprot_t mask)
{
        return __change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
}

static inline int __change_page_attr_clear(unsigned long addr, int numpages,
                                           pgprot_t mask)
{
        return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
}
#endif
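/*
 * The raw __change_page_attr_set/clear() helpers above change the page
 * tables without the cache/TLB flushing done by change_page_attr_set_clr().
 * They exist for the debug configurations named in the #if above; in
 * particular kernel_map_pages() below cannot use the flushing variants,
 * since flush IPIs from that context could deadlock (see the comment at
 * the end of kernel_map_pages()).
 */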

#ifdef CONFIG_DEBUG_PAGEALLOC

static int __set_pages_p(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return __change_page_attr_set(addr, numpages,
                                      __pgprot(_PAGE_PRESENT | _PAGE_RW));
}

static int __set_pages_np(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return __change_page_attr_clear(addr, numpages,
                                        __pgprot(_PAGE_PRESENT));
}

void kernel_map_pages(struct page *page, int numpages, int enable)
{
        if (PageHighMem(page))
                return;
        if (!enable) {
                debug_check_no_locks_freed(page_address(page),
                                           numpages * PAGE_SIZE);
        }

        /*
         * If the page allocator is not up yet then do not call c_p_a():
         */
        if (!debug_pagealloc_enabled)
                return;

        /*
         * The return value is ignored - the calls cannot fail,
         * large pages are disabled at boot time:
         */
        if (enable)
                __set_pages_p(page, numpages);
        else
                __set_pages_np(page, numpages);

        /*
         * We should perform an IPI and flush all tlbs,
         * but that can deadlock, so flush only the current CPU:
         */
        __flush_tlb_all();
}
#endif

/*
 * The testcases use internal knowledge of the implementation that shouldn't
 * be exposed to the rest of the kernel. Include these directly here.
 */
#ifdef CONFIG_CPA_DEBUG
#include "pageattr-test.c"
#endif