Commit | Line | Data |
---|---|---|
9f4c815c IM |
1 | /* |
2 | * Copyright 2002 Andi Kleen, SuSE Labs. | |
1da177e4 | 3 | * Thanks to Ben LaHaise for precious feedback. |
9f4c815c | 4 | */ |
1da177e4 | 5 | #include <linux/highmem.h> |
8192206d | 6 | #include <linux/bootmem.h> |
1da177e4 | 7 | #include <linux/module.h> |
9f4c815c | 8 | #include <linux/sched.h> |
1da177e4 | 9 | #include <linux/slab.h> |
9f4c815c IM |
10 | #include <linux/mm.h> |
11 | ||
950f9d95 | 12 | #include <asm/e820.h> |
1da177e4 LT |
13 | #include <asm/processor.h> |
14 | #include <asm/tlbflush.h> | |
f8af095d | 15 | #include <asm/sections.h> |
9f4c815c IM |
16 | #include <asm/uaccess.h> |
17 | #include <asm/pgalloc.h> | |
1da177e4 | 18 | |
/*
 * Range test on a half-open interval: returns non-zero when
 * @start <= @addr < @end, zero otherwise.
 */
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
	if (addr < start)
		return 0;

	return addr < end;
}
24 | ||
d7c8f21a TG |
25 | /* |
26 | * Flushing functions | |
27 | */ | |
28 | void clflush_cache_range(void *addr, int size) | |
29 | { | |
30 | int i; | |
31 | ||
32 | for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size) | |
33 | clflush(addr+i); | |
34 | } | |
35 | ||
36 | static void flush_kernel_map(void *arg) | |
37 | { | |
38 | /* | |
39 | * Flush all to work around Errata in early athlons regarding | |
40 | * large page flushing. | |
41 | */ | |
42 | __flush_tlb_all(); | |
43 | ||
44 | if (boot_cpu_data.x86_model >= 4) | |
45 | wbinvd(); | |
46 | } | |
47 | ||
48 | static void global_flush_tlb(void) | |
49 | { | |
50 | BUG_ON(irqs_disabled()); | |
51 | ||
52 | on_each_cpu(flush_kernel_map, NULL, 1, 1); | |
53 | } | |
54 | ||
ed724be6 AV |
55 | /* |
56 | * Certain areas of memory on x86 require very specific protection flags, | |
57 | * for example the BIOS area or kernel text. Callers don't always get this | |
58 | * right (again, ioremap() on BIOS memory is not uncommon) so this function | |
59 | * checks and fixes these known static required protection bits. | |
60 | */ | |
61 | static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) | |
62 | { | |
63 | pgprot_t forbidden = __pgprot(0); | |
64 | ||
687c4825 | 65 | /* |
ed724be6 AV |
66 | * The BIOS area between 640k and 1Mb needs to be executable for |
67 | * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. | |
687c4825 | 68 | */ |
ed724be6 AV |
69 | if (within(__pa(address), BIOS_BEGIN, BIOS_END)) |
70 | pgprot_val(forbidden) |= _PAGE_NX; | |
71 | ||
72 | /* | |
73 | * The kernel text needs to be executable for obvious reasons | |
74 | * Does not cover __inittext since that is gone later on | |
75 | */ | |
76 | if (within(address, (unsigned long)_text, (unsigned long)_etext)) | |
77 | pgprot_val(forbidden) |= _PAGE_NX; | |
78 | ||
79 | #ifdef CONFIG_DEBUG_RODATA | |
80 | /* The .rodata section needs to be read-only */ | |
81 | if (within(address, (unsigned long)__start_rodata, | |
82 | (unsigned long)__end_rodata)) | |
83 | pgprot_val(forbidden) |= _PAGE_RW; | |
84 | #endif | |
85 | ||
86 | prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); | |
687c4825 IM |
87 | |
88 | return prot; | |
89 | } | |
90 | ||
f0646e43 | 91 | pte_t *lookup_address(unsigned long address, int *level) |
9f4c815c | 92 | { |
1da177e4 LT |
93 | pgd_t *pgd = pgd_offset_k(address); |
94 | pud_t *pud; | |
95 | pmd_t *pmd; | |
9f4c815c | 96 | |
30551bb3 TG |
97 | *level = PG_LEVEL_NONE; |
98 | ||
1da177e4 LT |
99 | if (pgd_none(*pgd)) |
100 | return NULL; | |
101 | pud = pud_offset(pgd, address); | |
102 | if (pud_none(*pud)) | |
103 | return NULL; | |
104 | pmd = pmd_offset(pud, address); | |
105 | if (pmd_none(*pmd)) | |
106 | return NULL; | |
30551bb3 TG |
107 | |
108 | *level = PG_LEVEL_2M; | |
1da177e4 LT |
109 | if (pmd_large(*pmd)) |
110 | return (pte_t *)pmd; | |
1da177e4 | 111 | |
30551bb3 | 112 | *level = PG_LEVEL_4K; |
9f4c815c IM |
113 | return pte_offset_kernel(pmd, address); |
114 | } | |
115 | ||
9a3dc780 | 116 | static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) |
9f4c815c | 117 | { |
9f4c815c IM |
118 | /* change init_mm */ |
119 | set_pte_atomic(kpte, pte); | |
44af6c41 | 120 | #ifdef CONFIG_X86_32 |
e4b71dcf | 121 | if (!SHARED_KERNEL_PMD) { |
44af6c41 IM |
122 | struct page *page; |
123 | ||
124 | for (page = pgd_list; page; page = (struct page *)page->index) { | |
125 | pgd_t *pgd; | |
126 | pud_t *pud; | |
127 | pmd_t *pmd; | |
128 | ||
129 | pgd = (pgd_t *)page_address(page) + pgd_index(address); | |
130 | pud = pud_offset(pgd, address); | |
131 | pmd = pmd_offset(pud, address); | |
132 | set_pte_atomic((pte_t *)pmd, pte); | |
133 | } | |
1da177e4 | 134 | } |
44af6c41 | 135 | #endif |
1da177e4 LT |
136 | } |
137 | ||
7afe15b9 | 138 | static int split_large_page(pte_t *kpte, unsigned long address) |
bb5c2dbd | 139 | { |
7afe15b9 | 140 | pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte)); |
12d6f21e | 141 | gfp_t gfp_flags = GFP_KERNEL; |
9a3dc780 | 142 | unsigned long flags; |
bb5c2dbd IM |
143 | unsigned long addr; |
144 | pte_t *pbase, *tmp; | |
145 | struct page *base; | |
7afe15b9 | 146 | int i, level; |
bb5c2dbd | 147 | |
12d6f21e IM |
148 | #ifdef CONFIG_DEBUG_PAGEALLOC |
149 | gfp_flags = GFP_ATOMIC; | |
150 | #endif | |
151 | base = alloc_pages(gfp_flags, 0); | |
bb5c2dbd IM |
152 | if (!base) |
153 | return -ENOMEM; | |
154 | ||
9a3dc780 | 155 | spin_lock_irqsave(&pgd_lock, flags); |
bb5c2dbd IM |
156 | /* |
157 | * Check for races, another CPU might have split this page | |
158 | * up for us already: | |
159 | */ | |
160 | tmp = lookup_address(address, &level); | |
5508a748 IM |
161 | if (tmp != kpte) { |
162 | WARN_ON_ONCE(1); | |
bb5c2dbd | 163 | goto out_unlock; |
5508a748 | 164 | } |
bb5c2dbd IM |
165 | |
166 | address = __pa(address); | |
167 | addr = address & LARGE_PAGE_MASK; | |
168 | pbase = (pte_t *)page_address(base); | |
44af6c41 | 169 | #ifdef CONFIG_X86_32 |
bb5c2dbd | 170 | paravirt_alloc_pt(&init_mm, page_to_pfn(base)); |
44af6c41 | 171 | #endif |
bb5c2dbd IM |
172 | |
173 | for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) | |
174 | set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, ref_prot)); | |
175 | ||
176 | /* | |
4c881ca1 HY |
177 | * Install the new, split up pagetable. Important detail here: |
178 | * | |
179 | * On Intel the NX bit of all levels must be cleared to make a | |
180 | * page executable. See section 4.13.2 of Intel 64 and IA-32 | |
181 | * Architectures Software Developer's Manual). | |
bb5c2dbd | 182 | */ |
4c881ca1 | 183 | ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte))); |
9a3dc780 | 184 | __set_pmd_pte(kpte, address, mk_pte(base, ref_prot)); |
bb5c2dbd IM |
185 | base = NULL; |
186 | ||
187 | out_unlock: | |
9a3dc780 | 188 | spin_unlock_irqrestore(&pgd_lock, flags); |
bb5c2dbd IM |
189 | |
190 | if (base) | |
191 | __free_pages(base, 0); | |
192 | ||
193 | return 0; | |
194 | } | |
195 | ||
44af6c41 | 196 | static int |
8192206d | 197 | __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot) |
9f4c815c | 198 | { |
1da177e4 | 199 | struct page *kpte_page; |
bb5c2dbd | 200 | int level, err = 0; |
9f4c815c | 201 | pte_t *kpte; |
1da177e4 | 202 | |
8192206d IM |
203 | #ifdef CONFIG_X86_32 |
204 | BUG_ON(pfn > max_low_pfn); | |
205 | #endif | |
1da177e4 | 206 | |
97f99fed | 207 | repeat: |
f0646e43 | 208 | kpte = lookup_address(address, &level); |
1da177e4 LT |
209 | if (!kpte) |
210 | return -EINVAL; | |
9f4c815c | 211 | |
1da177e4 | 212 | kpte_page = virt_to_page(kpte); |
65d2f0bc AK |
213 | BUG_ON(PageLRU(kpte_page)); |
214 | BUG_ON(PageCompound(kpte_page)); | |
215 | ||
ed724be6 | 216 | prot = static_protections(prot, address); |
65d2f0bc | 217 | |
30551bb3 | 218 | if (level == PG_LEVEL_4K) { |
a72a08a4 | 219 | WARN_ON_ONCE(pgprot_val(prot) & _PAGE_PSE); |
8192206d | 220 | set_pte_atomic(kpte, pfn_pte(pfn, canon_pgprot(prot))); |
78c94aba | 221 | } else { |
a72a08a4 TG |
222 | /* Clear the PSE bit for the 4k level pages ! */ |
223 | pgprot_val(prot) = pgprot_val(prot) & ~_PAGE_PSE; | |
224 | ||
7afe15b9 | 225 | err = split_large_page(kpte, address); |
bb5c2dbd IM |
226 | if (!err) |
227 | goto repeat; | |
1da177e4 | 228 | } |
bb5c2dbd | 229 | return err; |
9f4c815c | 230 | } |
1da177e4 | 231 | |
44af6c41 IM |
232 | /** |
233 | * change_page_attr_addr - Change page table attributes in linear mapping | |
234 | * @address: Virtual address in linear mapping. | |
44af6c41 | 235 | * @prot: New page table attribute (PAGE_*) |
1da177e4 | 236 | * |
44af6c41 IM |
237 | * Change page attributes of a page in the direct mapping. This is a variant |
238 | * of change_page_attr() that also works on memory holes that do not have | |
239 | * mem_map entry (pfn_valid() is false). | |
9f4c815c | 240 | * |
44af6c41 | 241 | * See change_page_attr() documentation for more details. |
75cbade8 AV |
242 | * |
243 | * Modules and drivers should use the set_memory_* APIs instead. | |
1da177e4 | 244 | */ |
44af6c41 | 245 | |
488fd995 | 246 | static int change_page_attr_addr(unsigned long address, pgprot_t prot) |
1da177e4 | 247 | { |
488fd995 AV |
248 | int err = 0, kernel_map = 0; |
249 | unsigned long pfn = __pa(address) >> PAGE_SHIFT; | |
44af6c41 IM |
250 | |
251 | #ifdef CONFIG_X86_64 | |
252 | if (address >= __START_KERNEL_map && | |
253 | address < __START_KERNEL_map + KERNEL_TEXT_SIZE) { | |
1da177e4 | 254 | |
44af6c41 IM |
255 | address = (unsigned long)__va(__pa(address)); |
256 | kernel_map = 1; | |
257 | } | |
258 | #endif | |
259 | ||
488fd995 AV |
260 | if (!kernel_map || pte_present(pfn_pte(0, prot))) { |
261 | err = __change_page_attr(address, pfn, prot); | |
262 | if (err) | |
263 | return err; | |
264 | } | |
44af6c41 | 265 | |
44af6c41 | 266 | #ifdef CONFIG_X86_64 |
488fd995 AV |
267 | /* |
268 | * Handle kernel mapping too which aliases part of | |
269 | * lowmem: | |
270 | */ | |
271 | if (__pa(address) < KERNEL_TEXT_SIZE) { | |
272 | unsigned long addr2; | |
273 | pgprot_t prot2; | |
274 | ||
275 | addr2 = __START_KERNEL_map + __pa(address); | |
276 | /* Make sure the kernel mappings stay executable */ | |
277 | prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); | |
278 | err = __change_page_attr(addr2, pfn, prot2); | |
9f4c815c | 279 | } |
488fd995 | 280 | #endif |
9f4c815c | 281 | |
1da177e4 LT |
282 | return err; |
283 | } | |
284 | ||
ff31452b TG |
285 | static int __change_page_attr_set_clr(unsigned long addr, int numpages, |
286 | pgprot_t mask_set, pgprot_t mask_clr) | |
287 | { | |
288 | pgprot_t new_prot; | |
289 | int level; | |
290 | pte_t *pte; | |
291 | int i, ret; | |
292 | ||
293 | for (i = 0; i < numpages ; i++) { | |
294 | ||
295 | pte = lookup_address(addr, &level); | |
296 | if (!pte) | |
297 | return -EINVAL; | |
298 | ||
299 | new_prot = pte_pgprot(*pte); | |
300 | ||
301 | pgprot_val(new_prot) &= ~pgprot_val(mask_clr); | |
302 | pgprot_val(new_prot) |= pgprot_val(mask_set); | |
303 | ||
304 | ret = change_page_attr_addr(addr, new_prot); | |
305 | if (ret) | |
306 | return ret; | |
307 | addr += PAGE_SIZE; | |
308 | } | |
309 | ||
310 | return 0; | |
311 | } | |
312 | ||
313 | static int change_page_attr_set_clr(unsigned long addr, int numpages, | |
314 | pgprot_t mask_set, pgprot_t mask_clr) | |
315 | { | |
316 | int ret = __change_page_attr_set_clr(addr, numpages, mask_set, | |
317 | mask_clr); | |
318 | ||
319 | global_flush_tlb(); | |
320 | ||
321 | return ret; | |
322 | } | |
323 | ||
56744546 TG |
324 | static inline int change_page_attr_set(unsigned long addr, int numpages, |
325 | pgprot_t mask) | |
75cbade8 | 326 | { |
56744546 | 327 | return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0)); |
75cbade8 AV |
328 | } |
329 | ||
56744546 TG |
330 | static inline int change_page_attr_clear(unsigned long addr, int numpages, |
331 | pgprot_t mask) | |
72932c7a | 332 | { |
56744546 | 333 | return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask); |
72932c7a TG |
334 | |
335 | } | |
336 | ||
337 | int set_memory_uc(unsigned long addr, int numpages) | |
338 | { | |
339 | return change_page_attr_set(addr, numpages, | |
340 | __pgprot(_PAGE_PCD | _PAGE_PWT)); | |
75cbade8 AV |
341 | } |
342 | EXPORT_SYMBOL(set_memory_uc); | |
343 | ||
344 | int set_memory_wb(unsigned long addr, int numpages) | |
345 | { | |
72932c7a TG |
346 | return change_page_attr_clear(addr, numpages, |
347 | __pgprot(_PAGE_PCD | _PAGE_PWT)); | |
75cbade8 AV |
348 | } |
349 | EXPORT_SYMBOL(set_memory_wb); | |
350 | ||
351 | int set_memory_x(unsigned long addr, int numpages) | |
352 | { | |
72932c7a | 353 | return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); |
75cbade8 AV |
354 | } |
355 | EXPORT_SYMBOL(set_memory_x); | |
356 | ||
357 | int set_memory_nx(unsigned long addr, int numpages) | |
358 | { | |
72932c7a | 359 | return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); |
75cbade8 AV |
360 | } |
361 | EXPORT_SYMBOL(set_memory_nx); | |
362 | ||
363 | int set_memory_ro(unsigned long addr, int numpages) | |
364 | { | |
72932c7a | 365 | return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); |
75cbade8 | 366 | } |
75cbade8 AV |
367 | |
368 | int set_memory_rw(unsigned long addr, int numpages) | |
369 | { | |
72932c7a | 370 | return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); |
75cbade8 | 371 | } |
f62d0f00 IM |
372 | |
373 | int set_memory_np(unsigned long addr, int numpages) | |
374 | { | |
72932c7a | 375 | return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); |
f62d0f00 | 376 | } |
75cbade8 AV |
377 | |
/*
 * struct page front end for set_memory_uc(): resolves @page with
 * page_address() and forwards @numpages unchanged.
 */
int set_pages_uc(struct page *page, int numpages)
{
	return set_memory_uc((unsigned long)page_address(page), numpages);
}
EXPORT_SYMBOL(set_pages_uc);
385 | ||
/*
 * struct page front end for set_memory_wb(): resolves @page with
 * page_address() and forwards @numpages unchanged.
 */
int set_pages_wb(struct page *page, int numpages)
{
	return set_memory_wb((unsigned long)page_address(page), numpages);
}
EXPORT_SYMBOL(set_pages_wb);
393 | ||
/*
 * struct page front end for set_memory_x(): resolves @page with
 * page_address() and forwards @numpages unchanged.
 */
int set_pages_x(struct page *page, int numpages)
{
	return set_memory_x((unsigned long)page_address(page), numpages);
}
EXPORT_SYMBOL(set_pages_x);
401 | ||
/*
 * struct page front end for set_memory_nx(): resolves @page with
 * page_address() and forwards @numpages unchanged.
 */
int set_pages_nx(struct page *page, int numpages)
{
	return set_memory_nx((unsigned long)page_address(page), numpages);
}
EXPORT_SYMBOL(set_pages_nx);
409 | ||
/*
 * struct page front end for set_memory_ro(): resolves @page with
 * page_address() and forwards @numpages unchanged.
 */
int set_pages_ro(struct page *page, int numpages)
{
	return set_memory_ro((unsigned long)page_address(page), numpages);
}
75cbade8 AV |
416 | |
/*
 * struct page front end for set_memory_rw(): resolves @page with
 * page_address() and forwards @numpages unchanged.
 */
int set_pages_rw(struct page *page, int numpages)
{
	return set_memory_rw((unsigned long)page_address(page), numpages);
}
423 | ||
1da177e4 | 424 | |
#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_CPA_DEBUG)
/*
 * Set the bits of @mask on the range through
 * __change_page_attr_set_clr(), i.e. without the global flush.
 */
static inline int __change_page_attr_set(unsigned long addr, int numpages,
					 pgprot_t mask)
{
	pgprot_t clear_nothing = __pgprot(0);

	return __change_page_attr_set_clr(addr, numpages, mask, clear_nothing);
}

/*
 * Clear the bits of @mask on the range through
 * __change_page_attr_set_clr(), i.e. without the global flush.
 */
static inline int __change_page_attr_clear(unsigned long addr, int numpages,
					   pgprot_t mask)
{
	pgprot_t set_nothing = __pgprot(0);

	return __change_page_attr_set_clr(addr, numpages, set_nothing, mask);
}
#endif
438 | ||
1da177e4 | 439 | #ifdef CONFIG_DEBUG_PAGEALLOC |
f62d0f00 IM |
440 | |
441 | static int __set_pages_p(struct page *page, int numpages) | |
442 | { | |
443 | unsigned long addr = (unsigned long)page_address(page); | |
72932c7a TG |
444 | |
445 | return __change_page_attr_set(addr, numpages, | |
446 | __pgprot(_PAGE_PRESENT | _PAGE_RW)); | |
f62d0f00 IM |
447 | } |
448 | ||
449 | static int __set_pages_np(struct page *page, int numpages) | |
450 | { | |
451 | unsigned long addr = (unsigned long)page_address(page); | |
72932c7a TG |
452 | |
453 | return __change_page_attr_clear(addr, numpages, | |
454 | __pgprot(_PAGE_PRESENT)); | |
f62d0f00 IM |
455 | } |
456 | ||
1da177e4 LT |
457 | void kernel_map_pages(struct page *page, int numpages, int enable) |
458 | { | |
459 | if (PageHighMem(page)) | |
460 | return; | |
9f4c815c | 461 | if (!enable) { |
f9b8404c IM |
462 | debug_check_no_locks_freed(page_address(page), |
463 | numpages * PAGE_SIZE); | |
9f4c815c | 464 | } |
de5097c2 | 465 | |
12d6f21e IM |
466 | /* |
467 | * If page allocator is not up yet then do not call c_p_a(): | |
468 | */ | |
469 | if (!debug_pagealloc_enabled) | |
470 | return; | |
471 | ||
9f4c815c | 472 | /* |
e4b71dcf IM |
473 | * The return value is ignored - the calls cannot fail, |
474 | * large pages are disabled at boot time: | |
1da177e4 | 475 | */ |
f62d0f00 IM |
476 | if (enable) |
477 | __set_pages_p(page, numpages); | |
478 | else | |
479 | __set_pages_np(page, numpages); | |
9f4c815c IM |
480 | |
481 | /* | |
e4b71dcf IM |
482 | * We should perform an IPI and flush all tlbs, |
483 | * but that can deadlock->flush only current cpu: | |
1da177e4 LT |
484 | */ |
485 | __flush_tlb_all(); | |
486 | } | |
487 | #endif | |
d1028a15 AV |
488 | |
489 | /* | |
490 | * The testcases use internal knowledge of the implementation that shouldn't | |
491 | * be exposed to the rest of the kernel. Include these directly here. | |
492 | */ | |
493 | #ifdef CONFIG_CPA_DEBUG | |
494 | #include "pageattr-test.c" | |
495 | #endif |