xen: Allow unprivileged Xen domains to create iomap pages
arch/x86/xen/mmu.c
/*
 * Xen mmu operations
 *
 * This file contains the various mmu fetch and update operations.
 * The most important job they must perform is the mapping between the
 * domain's pfn and the overall machine mfns.
 *
 * Xen allows guests to directly update the pagetable, in a controlled
 * fashion. In other words, the guest modifies the same pagetable
 * that the CPU actually uses, which eliminates the overhead of having
 * a separate shadow pagetable.
 *
 * In order to allow this, it falls on the guest domain to map its
 * notion of a "physical" pfn - which is just a domain-local linear
 * address - into a real "machine address" which the CPU's MMU can
 * use.
 *
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
 * inserted directly into the pagetable. When creating a new
 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
 * the mfn back into a pfn.
 *
 * The other constraint is that all pages which make up a pagetable
 * must be mapped read-only in the guest. This prevents uncontrolled
 * guest updates to the pagetable. Xen strictly enforces this, and
 * will disallow any pagetable update which will end up mapping a
 * pagetable page RW, and will disallow using any writable page as a
 * pagetable.
 *
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
 * would need to validate the whole pagetable before going on.
 * Naturally, this is quite slow. The solution is to "pin" a
 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use. This means that Xen can be assured
 * that it is still valid when you do load it into %cr3, and doesn't
 * need to revalidate it.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/debugfs.h>
#include <linux/bug.h>
#include <linux/module.h>
#include <linux/gfp.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
#include <asm/mmu_context.h>
#include <asm/setup.h>
#include <asm/paravirt.h>
#include <asm/e820.h>
#include <asm/linkage.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
#include <xen/interface/memory.h>
#include <xen/hvc-console.h>

#include "multicalls.h"
#include "mmu.h"
#include "debugfs.h"

71#define MMU_UPDATE_HISTO 30
72
73#ifdef CONFIG_XEN_DEBUG_FS
74
75static struct {
76 u32 pgd_update;
77 u32 pgd_update_pinned;
78 u32 pgd_update_batched;
79
80 u32 pud_update;
81 u32 pud_update_pinned;
82 u32 pud_update_batched;
83
84 u32 pmd_update;
85 u32 pmd_update_pinned;
86 u32 pmd_update_batched;
87
88 u32 pte_update;
89 u32 pte_update_pinned;
90 u32 pte_update_batched;
91
92 u32 mmu_update;
93 u32 mmu_update_extended;
94 u32 mmu_update_histo[MMU_UPDATE_HISTO];
95
96 u32 prot_commit;
97 u32 prot_commit_batched;
98
99 u32 set_pte_at;
100 u32 set_pte_at_batched;
101 u32 set_pte_at_pinned;
102 u32 set_pte_at_current;
103 u32 set_pte_at_kernel;
104} mmu_stats;
105
106static u8 zero_stats;
107
108static inline void check_zero(void)
109{
110 if (unlikely(zero_stats)) {
111 memset(&mmu_stats, 0, sizeof(mmu_stats));
112 zero_stats = 0;
113 }
114}
115
116#define ADD_STATS(elem, val) \
117 do { check_zero(); mmu_stats.elem += (val); } while(0)
118
119#else /* !CONFIG_XEN_DEBUG_FS */
120
121#define ADD_STATS(elem, val) do { (void)(val); } while(0)
122
123#endif /* CONFIG_XEN_DEBUG_FS */
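
/*
 * Added usage note (not in the original file): call sites below simply do
 * e.g. ADD_STATS(pte_update, 1). With CONFIG_XEN_DEBUG_FS disabled the
 * macro still evaluates its argument but discards it, so it compiles away.
 */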

/*
 * Identity map, in addition to plain kernel map. This needs to be
 * large enough to allocate the page table pages needed to map the rest.
 * Each page can map 2MB.
 */
static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;

#ifdef CONFIG_X86_64
/* l3 pud for userspace vsyscall mapping */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
#endif /* CONFIG_X86_64 */

/*
 * Note about cr3 (pagetable base) values:
 *
 * xen_cr3 contains the current logical cr3 value; it contains the
 * last set cr3. This may not be the current effective cr3, because
 * its update may be being lazily deferred. However, a vcpu looking
 * at its own cr3 can use this value knowing that everything will
 * be self-consistent.
 *
 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 * hypercall to set the vcpu cr3 is complete (so it may be a little
 * out of date, but it will never be set early). If one vcpu is
 * looking at another vcpu's cr3 value, it should use this variable.
 */
DEFINE_PER_CPU(unsigned long, xen_cr3);		/* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3);	/* actual vcpu cr3 */

/*
 * Just beyond the highest usermode address. STACK_TOP_MAX has a
 * redzone above it, so round it up to a PGD boundary.
 */
#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
161
162
d451bb7a 163#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
cf0923ea 164#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
d451bb7a 165
cf0923ea 166/* Placeholder for holes in the address space */
cbcd79c2 167static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
cf0923ea
JF
168 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
169
170 /* Array of pointers to pages containing p2m entries */
cbcd79c2 171static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
cf0923ea 172 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
d451bb7a 173
d5edbc1f 174/* Arrays of p2m arrays expressed in mfns used for save/restore */
cbcd79c2 175static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
d5edbc1f 176
cbcd79c2
JF
177static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
178 __page_aligned_bss;
d5edbc1f 179
d451bb7a
JF
180static inline unsigned p2m_top_index(unsigned long pfn)
181{
8006ec3e 182 BUG_ON(pfn >= MAX_DOMAIN_PAGES);
d451bb7a
JF
183 return pfn / P2M_ENTRIES_PER_PAGE;
184}
185
186static inline unsigned p2m_index(unsigned long pfn)
187{
188 return pfn % P2M_ENTRIES_PER_PAGE;
189}
190
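/*
 * Illustrative sketch, not part of the original file: how a pfn is
 * resolved through the two-level p2m structure above. This simply
 * restates the lookup that get_phys_to_machine() below performs.
 */
static inline unsigned long p2m_lookup_example(unsigned long pfn)
{
	/* First level: which page of p2m entries covers this pfn? */
	unsigned long *p2m_page = p2m_top[p2m_top_index(pfn)];

	/* Second level: slot within that page; holes read back as ~0UL
	   (INVALID_P2M_ENTRY) via the shared p2m_missing page. */
	return p2m_page[p2m_index(pfn)];
}
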
d5edbc1f 191/* Build the parallel p2m_top_mfn structures */
fa24ba62 192void xen_build_mfn_list_list(void)
d5edbc1f
JF
193{
194 unsigned pfn, idx;
195
f63c2f24 196 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
d5edbc1f
JF
197 unsigned topidx = p2m_top_index(pfn);
198
199 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
200 }
201
f63c2f24 202 for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
d5edbc1f
JF
203 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
204 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
205 }
cdaead6b 206}
d5edbc1f 207
cdaead6b
JF
208void xen_setup_mfn_list_list(void)
209{
d5edbc1f
JF
210 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
211
212 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
213 virt_to_mfn(p2m_top_mfn_list);
214 HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
215}
216
217/* Set up p2m_top to point to the domain-builder provided p2m pages */
d451bb7a
JF
218void __init xen_build_dynamic_phys_to_machine(void)
219{
d451bb7a 220 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
8006ec3e 221 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
d5edbc1f 222 unsigned pfn;
d451bb7a 223
f63c2f24 224 for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
d451bb7a
JF
225 unsigned topidx = p2m_top_index(pfn);
226
227 p2m_top[topidx] = &mfn_list[pfn];
228 }
cdaead6b
JF
229
230 xen_build_mfn_list_list();
d451bb7a
JF
231}
232
233unsigned long get_phys_to_machine(unsigned long pfn)
234{
235 unsigned topidx, idx;
236
8006ec3e
JF
237 if (unlikely(pfn >= MAX_DOMAIN_PAGES))
238 return INVALID_P2M_ENTRY;
239
d451bb7a 240 topidx = p2m_top_index(pfn);
d451bb7a
JF
241 idx = p2m_index(pfn);
242 return p2m_top[topidx][idx];
243}
15ce6005 244EXPORT_SYMBOL_GPL(get_phys_to_machine);
d451bb7a 245
e791ca0f
JF
246/* install a new p2m_top page */
247bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
d451bb7a 248{
e791ca0f
JF
249 unsigned topidx = p2m_top_index(pfn);
250 unsigned long **pfnp, *mfnp;
d451bb7a
JF
251 unsigned i;
252
e791ca0f
JF
253 pfnp = &p2m_top[topidx];
254 mfnp = &p2m_top_mfn[topidx];
d451bb7a 255
f63c2f24 256 for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
d451bb7a
JF
257 p[i] = INVALID_P2M_ENTRY;
258
e791ca0f 259 if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
d5edbc1f 260 *mfnp = virt_to_mfn(p);
e791ca0f
JF
261 return true;
262 }
263
264 return false;
d451bb7a
JF
265}
266
e791ca0f 267static void alloc_p2m(unsigned long pfn)
d451bb7a 268{
e791ca0f 269 unsigned long *p;
d451bb7a 270
e791ca0f
JF
271 p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
272 BUG_ON(p == NULL);
273
274 if (!install_p2mtop_page(pfn, p))
275 free_page((unsigned long)p);
276}
277
278/* Try to install p2m mapping; fail if intermediate bits missing */
279bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
280{
281 unsigned topidx, idx;
8006ec3e
JF
282
283 if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
284 BUG_ON(mfn != INVALID_P2M_ENTRY);
e791ca0f 285 return true;
d451bb7a
JF
286 }
287
288 topidx = p2m_top_index(pfn);
cf0923ea 289 if (p2m_top[topidx] == p2m_missing) {
d451bb7a 290 if (mfn == INVALID_P2M_ENTRY)
e791ca0f
JF
291 return true;
292 return false;
d451bb7a
JF
293 }
294
295 idx = p2m_index(pfn);
296 p2m_top[topidx][idx] = mfn;
e791ca0f
JF
297
298 return true;
299}
300
301void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
302{
303 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
304 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
305 return;
306 }
307
308 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
309 alloc_p2m(pfn);
310
311 if (!__set_phys_to_machine(pfn, mfn))
312 BUG();
313 }
d451bb7a
JF
314}
315
9976b39b
JF
316unsigned long arbitrary_virt_to_mfn(void *vaddr)
317{
318 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
319
320 return PFN_DOWN(maddr.maddr);
321}
322
ce803e70 323xmaddr_t arbitrary_virt_to_machine(void *vaddr)
3b827c1b 324{
ce803e70 325 unsigned long address = (unsigned long)vaddr;
da7bfc50 326 unsigned int level;
9f32d21c
CL
327 pte_t *pte;
328 unsigned offset;
3b827c1b 329
9f32d21c
CL
330 /*
331 * if the PFN is in the linear mapped vaddr range, we can just use
332 * the (quick) virt_to_machine() p2m lookup
333 */
334 if (virt_addr_valid(vaddr))
335 return virt_to_machine(vaddr);
336
337 /* otherwise we have to do a (slower) full page-table walk */
3b827c1b 338
9f32d21c
CL
339 pte = lookup_address(address, &level);
340 BUG_ON(pte == NULL);
341 offset = address & ~PAGE_MASK;
ebd879e3 342 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
3b827c1b
JF
343}
344
345void make_lowmem_page_readonly(void *vaddr)
346{
347 pte_t *pte, ptev;
348 unsigned long address = (unsigned long)vaddr;
da7bfc50 349 unsigned int level;
3b827c1b 350
f0646e43 351 pte = lookup_address(address, &level);
3b827c1b
JF
352 BUG_ON(pte == NULL);
353
354 ptev = pte_wrprotect(*pte);
355
356 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
357 BUG();
358}
359
360void make_lowmem_page_readwrite(void *vaddr)
361{
362 pte_t *pte, ptev;
363 unsigned long address = (unsigned long)vaddr;
da7bfc50 364 unsigned int level;
3b827c1b 365
f0646e43 366 pte = lookup_address(address, &level);
3b827c1b
JF
367 BUG_ON(pte == NULL);
368
369 ptev = pte_mkwrite(*pte);
370
371 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
372 BUG();
373}
374
375
7708ad64 376static bool xen_page_pinned(void *ptr)
e2426cf8
JF
377{
378 struct page *page = virt_to_page(ptr);
379
380 return PagePinned(page);
381}
382
c0011dbf
JF
383static bool xen_iomap_pte(pte_t pte)
384{
7347b408 385 return pte_flags(pte) & _PAGE_IOMAP;
c0011dbf
JF
386}
387
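/*
 * Added note, not in the original file: an I/O mapping is installed with
 * an explicit mmu_update hypercall issued against DOMID_IO, so the frame
 * number in the pte is treated as a raw machine frame (not translated
 * through the p2m) and Xen validates it against the domain's I/O-memory
 * permissions.
 */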
388static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
389{
390 struct multicall_space mcs;
391 struct mmu_update *u;
392
393 mcs = xen_mc_entry(sizeof(*u));
394 u = mcs.args;
395
396 /* ptep might be kmapped when using 32-bit HIGHPTE */
397 u->ptr = arbitrary_virt_to_machine(ptep).maddr;
398 u->val = pte_val_ma(pteval);
399
400 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO);
401
402 xen_mc_issue(PARAVIRT_LAZY_MMU);
403}
404
7708ad64 405static void xen_extend_mmu_update(const struct mmu_update *update)
3b827c1b 406{
d66bf8fc
JF
407 struct multicall_space mcs;
408 struct mmu_update *u;
3b827c1b 409
400d3494
JF
410 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
411
994025ca
JF
412 if (mcs.mc != NULL) {
413 ADD_STATS(mmu_update_extended, 1);
414 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
415
400d3494 416 mcs.mc->args[1]++;
994025ca
JF
417
418 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
419 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
420 else
421 ADD_STATS(mmu_update_histo[0], 1);
422 } else {
423 ADD_STATS(mmu_update, 1);
400d3494
JF
424 mcs = __xen_mc_entry(sizeof(*u));
425 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
994025ca 426 ADD_STATS(mmu_update_histo[1], 1);
400d3494 427 }
d66bf8fc 428
d66bf8fc 429 u = mcs.args;
400d3494
JF
430 *u = *update;
431}
432
433void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
434{
435 struct mmu_update u;
436
437 preempt_disable();
438
439 xen_mc_batch();
440
ce803e70
JF
441 /* ptr may be ioremapped for 64-bit pagetable setup */
442 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
400d3494 443 u.val = pmd_val_ma(val);
7708ad64 444 xen_extend_mmu_update(&u);
d66bf8fc 445
994025ca
JF
446 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
447
d66bf8fc
JF
448 xen_mc_issue(PARAVIRT_LAZY_MMU);
449
450 preempt_enable();
3b827c1b
JF
451}
452
e2426cf8
JF
453void xen_set_pmd(pmd_t *ptr, pmd_t val)
454{
994025ca
JF
455 ADD_STATS(pmd_update, 1);
456
e2426cf8
JF
457 /* If page is not pinned, we can just update the entry
458 directly */
7708ad64 459 if (!xen_page_pinned(ptr)) {
e2426cf8
JF
460 *ptr = val;
461 return;
462 }
463
994025ca
JF
464 ADD_STATS(pmd_update_pinned, 1);
465
e2426cf8
JF
466 xen_set_pmd_hyper(ptr, val);
467}
468
3b827c1b
JF
469/*
470 * Associate a virtual page frame with a given physical page frame
471 * and protection flags for that frame.
472 */
473void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
474{
836fe2f2 475 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
3b827c1b
JF
476}
477
478void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
479 pte_t *ptep, pte_t pteval)
480{
c0011dbf
JF
481 if (xen_iomap_pte(pteval)) {
482 xen_set_iomap_pte(ptep, pteval);
483 goto out;
484 }
485
994025ca
JF
486 ADD_STATS(set_pte_at, 1);
487// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
488 ADD_STATS(set_pte_at_current, mm == current->mm);
489 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
490
d66bf8fc 491 if (mm == current->mm || mm == &init_mm) {
8965c1c0 492 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
d66bf8fc
JF
493 struct multicall_space mcs;
494 mcs = xen_mc_entry(0);
495
496 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
994025ca 497 ADD_STATS(set_pte_at_batched, 1);
d66bf8fc 498 xen_mc_issue(PARAVIRT_LAZY_MMU);
2bd50036 499 goto out;
d66bf8fc
JF
500 } else
501 if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
2bd50036 502 goto out;
d66bf8fc
JF
503 }
504 xen_set_pte(ptep, pteval);
2bd50036 505
2829b449 506out: return;
3b827c1b
JF
507}
508
f63c2f24
T
509pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
510 unsigned long addr, pte_t *ptep)
947a69c9 511{
e57778a1
JF
512 /* Just return the pte as-is. We preserve the bits on commit */
513 return *ptep;
514}
515
516void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
517 pte_t *ptep, pte_t pte)
518{
400d3494 519 struct mmu_update u;
e57778a1 520
400d3494 521 xen_mc_batch();
947a69c9 522
9f32d21c 523 u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
400d3494 524 u.val = pte_val_ma(pte);
7708ad64 525 xen_extend_mmu_update(&u);
947a69c9 526
994025ca
JF
527 ADD_STATS(prot_commit, 1);
528 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
529
e57778a1 530 xen_mc_issue(PARAVIRT_LAZY_MMU);
947a69c9
JF
531}
532
ebb9cfe2
JF
533/* Assume pteval_t is equivalent to all the other *val_t types. */
534static pteval_t pte_mfn_to_pfn(pteval_t val)
947a69c9 535{
ebb9cfe2 536 if (val & _PAGE_PRESENT) {
59438c9f 537 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
77be1fab 538 pteval_t flags = val & PTE_FLAGS_MASK;
d8355aca 539 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
ebb9cfe2 540 }
947a69c9 541
ebb9cfe2 542 return val;
947a69c9
JF
543}
544
ebb9cfe2 545static pteval_t pte_pfn_to_mfn(pteval_t val)
947a69c9 546{
ebb9cfe2 547 if (val & _PAGE_PRESENT) {
59438c9f 548 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
77be1fab 549 pteval_t flags = val & PTE_FLAGS_MASK;
d8355aca 550 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
947a69c9
JF
551 }
552
ebb9cfe2 553 return val;
947a69c9
JF
554}
555
c0011dbf
JF
556static pteval_t iomap_pte(pteval_t val)
557{
558 if (val & _PAGE_PRESENT) {
559 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
560 pteval_t flags = val & PTE_FLAGS_MASK;
561
		/* We assume the pte frame number is an MFN, so
		   just use it as-is. */
564 val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
565 }
566
567 return val;
568}
569
ebb9cfe2 570pteval_t xen_pte_val(pte_t pte)
947a69c9 571{
c0011dbf
JF
572 if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP))
573 return pte.pte;
574
ebb9cfe2 575 return pte_mfn_to_pfn(pte.pte);
947a69c9 576}
da5de7c2 577PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
947a69c9 578
947a69c9
JF
579pgdval_t xen_pgd_val(pgd_t pgd)
580{
ebb9cfe2 581 return pte_mfn_to_pfn(pgd.pgd);
947a69c9 582}
da5de7c2 583PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
947a69c9
JF
584
585pte_t xen_make_pte(pteval_t pte)
586{
7347b408
AN
587 phys_addr_t addr = (pte & PTE_PFN_MASK);
588
	/*
	 * Unprivileged domains are allowed to do IOMAP mappings for
	 * PCI passthrough, but not to map ISA space. The ISA
	 * mappings are just dummy local mappings to keep other
	 * parts of the kernel happy.
	 */
595 if (unlikely(pte & _PAGE_IOMAP) &&
596 (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
c0011dbf 597 pte = iomap_pte(pte);
7347b408
AN
598 } else {
599 pte &= ~_PAGE_IOMAP;
c0011dbf 600 pte = pte_pfn_to_mfn(pte);
7347b408 601 }
c0011dbf 602
ebb9cfe2 603 return native_make_pte(pte);
947a69c9 604}
da5de7c2 605PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
947a69c9
JF
606
607pgd_t xen_make_pgd(pgdval_t pgd)
608{
ebb9cfe2
JF
609 pgd = pte_pfn_to_mfn(pgd);
610 return native_make_pgd(pgd);
947a69c9 611}
da5de7c2 612PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
947a69c9
JF
613
614pmdval_t xen_pmd_val(pmd_t pmd)
615{
ebb9cfe2 616 return pte_mfn_to_pfn(pmd.pmd);
947a69c9 617}
da5de7c2 618PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
28499143 619
e2426cf8 620void xen_set_pud_hyper(pud_t *ptr, pud_t val)
f4f97b3e 621{
400d3494 622 struct mmu_update u;
f4f97b3e 623
d66bf8fc
JF
624 preempt_disable();
625
400d3494
JF
626 xen_mc_batch();
627
ce803e70
JF
628 /* ptr may be ioremapped for 64-bit pagetable setup */
629 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
400d3494 630 u.val = pud_val_ma(val);
7708ad64 631 xen_extend_mmu_update(&u);
d66bf8fc 632
994025ca
JF
633 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
634
d66bf8fc
JF
635 xen_mc_issue(PARAVIRT_LAZY_MMU);
636
637 preempt_enable();
f4f97b3e
JF
638}
639
e2426cf8
JF
640void xen_set_pud(pud_t *ptr, pud_t val)
641{
994025ca
JF
642 ADD_STATS(pud_update, 1);
643
e2426cf8
JF
644 /* If page is not pinned, we can just update the entry
645 directly */
7708ad64 646 if (!xen_page_pinned(ptr)) {
e2426cf8
JF
647 *ptr = val;
648 return;
649 }
650
994025ca
JF
651 ADD_STATS(pud_update_pinned, 1);
652
e2426cf8
JF
653 xen_set_pud_hyper(ptr, val);
654}
655
f4f97b3e
JF
656void xen_set_pte(pte_t *ptep, pte_t pte)
657{
c0011dbf
JF
658 if (xen_iomap_pte(pte)) {
659 xen_set_iomap_pte(ptep, pte);
660 return;
661 }
662
994025ca
JF
663 ADD_STATS(pte_update, 1);
664// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
665 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
666
f6e58732 667#ifdef CONFIG_X86_PAE
f4f97b3e
JF
668 ptep->pte_high = pte.pte_high;
669 smp_wmb();
670 ptep->pte_low = pte.pte_low;
f6e58732
JF
671#else
672 *ptep = pte;
673#endif
f4f97b3e
JF
674}
675
f6e58732 676#ifdef CONFIG_X86_PAE
3b827c1b
JF
677void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
678{
c0011dbf
JF
679 if (xen_iomap_pte(pte)) {
680 xen_set_iomap_pte(ptep, pte);
681 return;
682 }
683
f6e58732 684 set_64bit((u64 *)ptep, native_pte_val(pte));
3b827c1b
JF
685}
686
687void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
688{
689 ptep->pte_low = 0;
690 smp_wmb(); /* make sure low gets written first */
691 ptep->pte_high = 0;
692}
693
694void xen_pmd_clear(pmd_t *pmdp)
695{
e2426cf8 696 set_pmd(pmdp, __pmd(0));
3b827c1b 697}
f6e58732 698#endif /* CONFIG_X86_PAE */
3b827c1b 699
abf33038 700pmd_t xen_make_pmd(pmdval_t pmd)
3b827c1b 701{
ebb9cfe2 702 pmd = pte_pfn_to_mfn(pmd);
947a69c9 703 return native_make_pmd(pmd);
3b827c1b 704}
da5de7c2 705PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
3b827c1b 706
f6e58732
JF
707#if PAGETABLE_LEVELS == 4
708pudval_t xen_pud_val(pud_t pud)
709{
710 return pte_mfn_to_pfn(pud.pud);
711}
da5de7c2 712PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
f6e58732
JF
713
714pud_t xen_make_pud(pudval_t pud)
715{
716 pud = pte_pfn_to_mfn(pud);
717
718 return native_make_pud(pud);
719}
da5de7c2 720PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
f6e58732 721
d6182fbf 722pgd_t *xen_get_user_pgd(pgd_t *pgd)
f6e58732 723{
d6182fbf
JF
724 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
725 unsigned offset = pgd - pgd_page;
726 pgd_t *user_ptr = NULL;
f6e58732 727
d6182fbf
JF
728 if (offset < pgd_index(USER_LIMIT)) {
729 struct page *page = virt_to_page(pgd_page);
730 user_ptr = (pgd_t *)page->private;
731 if (user_ptr)
732 user_ptr += offset;
733 }
f6e58732 734
d6182fbf
JF
735 return user_ptr;
736}
737
738static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
739{
740 struct mmu_update u;
f6e58732
JF
741
742 u.ptr = virt_to_machine(ptr).maddr;
743 u.val = pgd_val_ma(val);
7708ad64 744 xen_extend_mmu_update(&u);
d6182fbf
JF
745}
746
/*
 * Raw hypercall-based set_pgd, intended for use in early boot before
 * there's a page structure. This implies:
 * 1. The only existing pagetable is the kernel's
 * 2. It is always pinned
 * 3. It has no user pagetable attached to it
 */
754void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
755{
756 preempt_disable();
757
758 xen_mc_batch();
759
760 __xen_set_pgd_hyper(ptr, val);
f6e58732
JF
761
762 xen_mc_issue(PARAVIRT_LAZY_MMU);
763
764 preempt_enable();
765}
766
767void xen_set_pgd(pgd_t *ptr, pgd_t val)
768{
d6182fbf
JF
769 pgd_t *user_ptr = xen_get_user_pgd(ptr);
770
994025ca
JF
771 ADD_STATS(pgd_update, 1);
772
f6e58732
JF
773 /* If page is not pinned, we can just update the entry
774 directly */
7708ad64 775 if (!xen_page_pinned(ptr)) {
f6e58732 776 *ptr = val;
d6182fbf 777 if (user_ptr) {
7708ad64 778 WARN_ON(xen_page_pinned(user_ptr));
d6182fbf
JF
779 *user_ptr = val;
780 }
f6e58732
JF
781 return;
782 }
783
994025ca
JF
784 ADD_STATS(pgd_update_pinned, 1);
785 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
786
d6182fbf
JF
787 /* If it's pinned, then we can at least batch the kernel and
788 user updates together. */
789 xen_mc_batch();
790
791 __xen_set_pgd_hyper(ptr, val);
792 if (user_ptr)
793 __xen_set_pgd_hyper(user_ptr, val);
794
795 xen_mc_issue(PARAVIRT_LAZY_MMU);
f6e58732
JF
796}
797#endif /* PAGETABLE_LEVELS == 4 */
798
f4f97b3e 799/*
5deb30d1
JF
800 * (Yet another) pagetable walker. This one is intended for pinning a
801 * pagetable. This means that it walks a pagetable and calls the
802 * callback function on each page it finds making up the page table,
803 * at every level. It walks the entire pagetable, but it only bothers
804 * pinning pte pages which are below limit. In the normal case this
805 * will be STACK_TOP_MAX, but at boot we need to pin up to
806 * FIXADDR_TOP.
807 *
808 * For 32-bit the important bit is that we don't pin beyond there,
809 * because then we start getting into Xen's ptes.
810 *
811 * For 64-bit, we must skip the Xen hole in the middle of the address
812 * space, just after the big x86-64 virtual hole.
813 */
86bbc2c2
IC
814static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
815 int (*func)(struct mm_struct *mm, struct page *,
816 enum pt_level),
817 unsigned long limit)
3b827c1b 818{
f4f97b3e 819 int flush = 0;
5deb30d1
JF
820 unsigned hole_low, hole_high;
821 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
822 unsigned pgdidx, pudidx, pmdidx;
f4f97b3e 823
5deb30d1
JF
824 /* The limit is the last byte to be touched */
825 limit--;
826 BUG_ON(limit >= FIXADDR_TOP);
3b827c1b
JF
827
828 if (xen_feature(XENFEAT_auto_translated_physmap))
f4f97b3e
JF
829 return 0;
830
5deb30d1
JF
	/*
	 * 64-bit has a great big hole in the middle of the address
	 * space, which contains the Xen mappings. On 32-bit this
	 * ends up as a zero-sized hole and so is a no-op.
	 */
d6182fbf 836 hole_low = pgd_index(USER_LIMIT);
5deb30d1
JF
837 hole_high = pgd_index(PAGE_OFFSET);
838
839 pgdidx_limit = pgd_index(limit);
840#if PTRS_PER_PUD > 1
841 pudidx_limit = pud_index(limit);
842#else
843 pudidx_limit = 0;
844#endif
845#if PTRS_PER_PMD > 1
846 pmdidx_limit = pmd_index(limit);
847#else
848 pmdidx_limit = 0;
849#endif
850
5deb30d1 851 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
f4f97b3e 852 pud_t *pud;
3b827c1b 853
5deb30d1
JF
854 if (pgdidx >= hole_low && pgdidx < hole_high)
855 continue;
f4f97b3e 856
5deb30d1 857 if (!pgd_val(pgd[pgdidx]))
3b827c1b 858 continue;
f4f97b3e 859
5deb30d1 860 pud = pud_offset(&pgd[pgdidx], 0);
3b827c1b
JF
861
862 if (PTRS_PER_PUD > 1) /* not folded */
eefb47f6 863 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
f4f97b3e 864
5deb30d1 865 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
f4f97b3e 866 pmd_t *pmd;
f4f97b3e 867
5deb30d1
JF
868 if (pgdidx == pgdidx_limit &&
869 pudidx > pudidx_limit)
870 goto out;
3b827c1b 871
5deb30d1 872 if (pud_none(pud[pudidx]))
3b827c1b 873 continue;
f4f97b3e 874
5deb30d1 875 pmd = pmd_offset(&pud[pudidx], 0);
3b827c1b
JF
876
877 if (PTRS_PER_PMD > 1) /* not folded */
eefb47f6 878 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
f4f97b3e 879
5deb30d1
JF
880 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
881 struct page *pte;
882
883 if (pgdidx == pgdidx_limit &&
884 pudidx == pudidx_limit &&
885 pmdidx > pmdidx_limit)
886 goto out;
3b827c1b 887
5deb30d1 888 if (pmd_none(pmd[pmdidx]))
3b827c1b
JF
889 continue;
890
5deb30d1 891 pte = pmd_page(pmd[pmdidx]);
eefb47f6 892 flush |= (*func)(mm, pte, PT_PTE);
3b827c1b
JF
893 }
894 }
895 }
11ad93e5 896
5deb30d1 897out:
11ad93e5
JF
898 /* Do the top level last, so that the callbacks can use it as
899 a cue to do final things like tlb flushes. */
eefb47f6 900 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
f4f97b3e
JF
901
902 return flush;
3b827c1b
JF
903}
904
86bbc2c2
IC
905static int xen_pgd_walk(struct mm_struct *mm,
906 int (*func)(struct mm_struct *mm, struct page *,
907 enum pt_level),
908 unsigned long limit)
909{
910 return __xen_pgd_walk(mm, mm->pgd, func, limit);
911}
912
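/*
 * Illustrative sketch, not part of the original file: the shape of a
 * callback accepted by xen_pgd_walk()/__xen_pgd_walk() above. A nonzero
 * return is OR-ed into the walker's "flush" result; see xen_pin_page()
 * below for a real callback.
 */
static inline int xen_pgd_walk_callback_example(struct mm_struct *mm,
						struct page *page,
						enum pt_level level)
{
	return 0;	/* nothing to do, no flush requested */
}
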
7708ad64
JF
913/* If we're using split pte locks, then take the page's lock and
914 return a pointer to it. Otherwise return NULL. */
eefb47f6 915static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
74260714
JF
916{
917 spinlock_t *ptl = NULL;
918
f7d0b926 919#if USE_SPLIT_PTLOCKS
74260714 920 ptl = __pte_lockptr(page);
eefb47f6 921 spin_lock_nest_lock(ptl, &mm->page_table_lock);
74260714
JF
922#endif
923
924 return ptl;
925}
926
7708ad64 927static void xen_pte_unlock(void *v)
74260714
JF
928{
929 spinlock_t *ptl = v;
930 spin_unlock(ptl);
931}
932
933static void xen_do_pin(unsigned level, unsigned long pfn)
934{
935 struct mmuext_op *op;
936 struct multicall_space mcs;
937
938 mcs = __xen_mc_entry(sizeof(*op));
939 op = mcs.args;
940 op->cmd = level;
941 op->arg1.mfn = pfn_to_mfn(pfn);
942 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
943}
944
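/*
 * Added note, not in the original file: xen_do_pin() only queues an
 * MMUEXT_{PIN_*,UNPIN}_TABLE operation into the current multicall batch;
 * it reaches Xen when the batch is issued (or flushed), not immediately.
 */
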
eefb47f6
JF
945static int xen_pin_page(struct mm_struct *mm, struct page *page,
946 enum pt_level level)
f4f97b3e 947{
d60cd46b 948 unsigned pgfl = TestSetPagePinned(page);
f4f97b3e
JF
949 int flush;
950
951 if (pgfl)
952 flush = 0; /* already pinned */
953 else if (PageHighMem(page))
954 /* kmaps need flushing if we found an unpinned
955 highpage */
956 flush = 1;
957 else {
958 void *pt = lowmem_page_address(page);
959 unsigned long pfn = page_to_pfn(page);
960 struct multicall_space mcs = __xen_mc_entry(0);
74260714 961 spinlock_t *ptl;
f4f97b3e
JF
962
963 flush = 0;
964
11ad93e5
JF
965 /*
966 * We need to hold the pagetable lock between the time
967 * we make the pagetable RO and when we actually pin
968 * it. If we don't, then other users may come in and
969 * attempt to update the pagetable by writing it,
970 * which will fail because the memory is RO but not
971 * pinned, so Xen won't do the trap'n'emulate.
972 *
973 * If we're using split pte locks, we can't hold the
974 * entire pagetable's worth of locks during the
975 * traverse, because we may wrap the preempt count (8
976 * bits). The solution is to mark RO and pin each PTE
977 * page while holding the lock. This means the number
978 * of locks we end up holding is never more than a
979 * batch size (~32 entries, at present).
980 *
981 * If we're not using split pte locks, we needn't pin
982 * the PTE pages independently, because we're
983 * protected by the overall pagetable lock.
984 */
74260714
JF
985 ptl = NULL;
986 if (level == PT_PTE)
eefb47f6 987 ptl = xen_pte_lock(page, mm);
74260714 988
f4f97b3e
JF
989 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
990 pfn_pte(pfn, PAGE_KERNEL_RO),
74260714
JF
991 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
992
11ad93e5 993 if (ptl) {
74260714
JF
994 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
995
74260714
JF
996 /* Queue a deferred unlock for when this batch
997 is completed. */
7708ad64 998 xen_mc_callback(xen_pte_unlock, ptl);
74260714 999 }
f4f97b3e
JF
1000 }
1001
1002 return flush;
1003}
3b827c1b 1004
f4f97b3e
JF
1005/* This is called just after a mm has been created, but it has not
1006 been used yet. We need to make sure that its pagetable is all
1007 read-only, and can be pinned. */
eefb47f6 1008static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
3b827c1b 1009{
d05fdf31
JF
1010 vm_unmap_aliases();
1011
f4f97b3e 1012 xen_mc_batch();
3b827c1b 1013
86bbc2c2 1014 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
d05fdf31 1015 /* re-enable interrupts for flushing */
f87e4cac 1016 xen_mc_issue(0);
d05fdf31 1017
f4f97b3e 1018 kmap_flush_unused();
d05fdf31 1019
f87e4cac
JF
1020 xen_mc_batch();
1021 }
f4f97b3e 1022
d6182fbf
JF
1023#ifdef CONFIG_X86_64
1024 {
1025 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1026
1027 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
1028
1029 if (user_pgd) {
eefb47f6 1030 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
f63c2f24
T
1031 xen_do_pin(MMUEXT_PIN_L4_TABLE,
1032 PFN_DOWN(__pa(user_pgd)));
d6182fbf
JF
1033 }
1034 }
1035#else /* CONFIG_X86_32 */
5deb30d1
JF
1036#ifdef CONFIG_X86_PAE
1037 /* Need to make sure unshared kernel PMD is pinnable */
47cb2ed9 1038 xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
eefb47f6 1039 PT_PMD);
5deb30d1 1040#endif
28499143 1041 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
d6182fbf 1042#endif /* CONFIG_X86_64 */
f4f97b3e 1043 xen_mc_issue(0);
3b827c1b
JF
1044}
1045
eefb47f6
JF
1046static void xen_pgd_pin(struct mm_struct *mm)
1047{
1048 __xen_pgd_pin(mm, mm->pgd);
1049}
1050
0e91398f
JF
1051/*
1052 * On save, we need to pin all pagetables to make sure they get their
1053 * mfns turned into pfns. Search the list for any unpinned pgds and pin
1054 * them (unpinned pgds are not currently in use, probably because the
1055 * process is under construction or destruction).
eefb47f6
JF
1056 *
1057 * Expected to be called in stop_machine() ("equivalent to taking
1058 * every spinlock in the system"), so the locking doesn't really
1059 * matter all that much.
0e91398f
JF
1060 */
1061void xen_mm_pin_all(void)
1062{
1063 unsigned long flags;
1064 struct page *page;
74260714 1065
0e91398f 1066 spin_lock_irqsave(&pgd_lock, flags);
f4f97b3e 1067
0e91398f
JF
1068 list_for_each_entry(page, &pgd_list, lru) {
1069 if (!PagePinned(page)) {
eefb47f6 1070 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
0e91398f
JF
1071 SetPageSavePinned(page);
1072 }
1073 }
1074
1075 spin_unlock_irqrestore(&pgd_lock, flags);
3b827c1b
JF
1076}
1077
c1f2f09e
EH
/*
 * The init_mm pagetable is really pinned as soon as it's created, but
 * that's before we have page structures to store the bits. So do all
 * the book-keeping now.
 */
eefb47f6
JF
1083static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
1084 enum pt_level level)
3b827c1b 1085{
f4f97b3e
JF
1086 SetPagePinned(page);
1087 return 0;
1088}
3b827c1b 1089
b96229b5 1090static void __init xen_mark_init_mm_pinned(void)
f4f97b3e 1091{
eefb47f6 1092 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
f4f97b3e 1093}
3b827c1b 1094
eefb47f6
JF
1095static int xen_unpin_page(struct mm_struct *mm, struct page *page,
1096 enum pt_level level)
f4f97b3e 1097{
d60cd46b 1098 unsigned pgfl = TestClearPagePinned(page);
3b827c1b 1099
f4f97b3e
JF
1100 if (pgfl && !PageHighMem(page)) {
1101 void *pt = lowmem_page_address(page);
1102 unsigned long pfn = page_to_pfn(page);
74260714
JF
1103 spinlock_t *ptl = NULL;
1104 struct multicall_space mcs;
1105
11ad93e5
JF
		/*
		 * Do the converse to pin_page. If we're using split
		 * pte locks, we must be holding the lock while
		 * the pte page is unpinned but still RO, to prevent
		 * concurrent updates from seeing it in this
		 * partially-pinned state.
		 */
74260714 1113 if (level == PT_PTE) {
eefb47f6 1114 ptl = xen_pte_lock(page, mm);
74260714 1115
11ad93e5
JF
1116 if (ptl)
1117 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
74260714
JF
1118 }
1119
1120 mcs = __xen_mc_entry(0);
f4f97b3e
JF
1121
1122 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1123 pfn_pte(pfn, PAGE_KERNEL),
74260714
JF
1124 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1125
1126 if (ptl) {
1127 /* unlock when batch completed */
7708ad64 1128 xen_mc_callback(xen_pte_unlock, ptl);
74260714 1129 }
f4f97b3e
JF
1130 }
1131
1132 return 0; /* never need to flush on unpin */
3b827c1b
JF
1133}
1134
f4f97b3e 1135/* Release a pagetables pages back as normal RW */
eefb47f6 1136static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
f4f97b3e 1137{
f4f97b3e
JF
1138 xen_mc_batch();
1139
74260714 1140 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
f4f97b3e 1141
d6182fbf
JF
1142#ifdef CONFIG_X86_64
1143 {
1144 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1145
1146 if (user_pgd) {
f63c2f24
T
1147 xen_do_pin(MMUEXT_UNPIN_TABLE,
1148 PFN_DOWN(__pa(user_pgd)));
eefb47f6 1149 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
d6182fbf
JF
1150 }
1151 }
1152#endif
1153
5deb30d1
JF
1154#ifdef CONFIG_X86_PAE
1155 /* Need to make sure unshared kernel PMD is unpinned */
47cb2ed9 1156 xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
eefb47f6 1157 PT_PMD);
5deb30d1 1158#endif
d6182fbf 1159
86bbc2c2 1160 __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
f4f97b3e
JF
1161
1162 xen_mc_issue(0);
1163}
3b827c1b 1164
eefb47f6
JF
1165static void xen_pgd_unpin(struct mm_struct *mm)
1166{
1167 __xen_pgd_unpin(mm, mm->pgd);
1168}
1169
0e91398f
JF
1170/*
1171 * On resume, undo any pinning done at save, so that the rest of the
1172 * kernel doesn't see any unexpected pinned pagetables.
1173 */
1174void xen_mm_unpin_all(void)
1175{
1176 unsigned long flags;
1177 struct page *page;
1178
1179 spin_lock_irqsave(&pgd_lock, flags);
1180
1181 list_for_each_entry(page, &pgd_list, lru) {
1182 if (PageSavePinned(page)) {
1183 BUG_ON(!PagePinned(page));
eefb47f6 1184 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
0e91398f
JF
1185 ClearPageSavePinned(page);
1186 }
1187 }
1188
1189 spin_unlock_irqrestore(&pgd_lock, flags);
1190}
1191
3b827c1b
JF
1192void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1193{
f4f97b3e 1194 spin_lock(&next->page_table_lock);
eefb47f6 1195 xen_pgd_pin(next);
f4f97b3e 1196 spin_unlock(&next->page_table_lock);
3b827c1b
JF
1197}
1198
1199void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1200{
f4f97b3e 1201 spin_lock(&mm->page_table_lock);
eefb47f6 1202 xen_pgd_pin(mm);
f4f97b3e 1203 spin_unlock(&mm->page_table_lock);
3b827c1b
JF
1204}
1205
3b827c1b 1206
f87e4cac
JF
1207#ifdef CONFIG_SMP
/* Another cpu may still have its %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it. */
1210static void drop_other_mm_ref(void *info)
1211{
1212 struct mm_struct *mm = info;
ce87b3d3 1213 struct mm_struct *active_mm;
3b827c1b 1214
9eb912d1 1215 active_mm = percpu_read(cpu_tlbstate.active_mm);
ce87b3d3
JF
1216
1217 if (active_mm == mm)
f87e4cac 1218 leave_mm(smp_processor_id());
9f79991d
JF
1219
1220 /* If this cpu still has a stale cr3 reference, then make sure
1221 it has been flushed. */
7fd7d83d 1222 if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
9f79991d 1223 load_cr3(swapper_pg_dir);
f87e4cac 1224}
3b827c1b 1225
7708ad64 1226static void xen_drop_mm_ref(struct mm_struct *mm)
f87e4cac 1227{
e4d98207 1228 cpumask_var_t mask;
9f79991d
JF
1229 unsigned cpu;
1230
f87e4cac
JF
1231 if (current->active_mm == mm) {
1232 if (current->mm == mm)
1233 load_cr3(swapper_pg_dir);
1234 else
1235 leave_mm(smp_processor_id());
9f79991d
JF
1236 }
1237
1238 /* Get the "official" set of cpus referring to our pagetable. */
e4d98207
MT
1239 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1240 for_each_online_cpu(cpu) {
78f1c4d6 1241 if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
e4d98207
MT
1242 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1243 continue;
1244 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1245 }
1246 return;
1247 }
78f1c4d6 1248 cpumask_copy(mask, mm_cpumask(mm));
9f79991d
JF
1249
	/* It's possible that a vcpu may have a stale reference to our
	   cr3, because it's in lazy mode and hasn't yet flushed its
	   set of pending hypercalls. In this case, we can look at its
	   actual current cr3 value, and force it to flush if needed. */
1255 for_each_online_cpu(cpu) {
1256 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
e4d98207 1257 cpumask_set_cpu(cpu, mask);
3b827c1b
JF
1258 }
1259
e4d98207
MT
1260 if (!cpumask_empty(mask))
1261 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1262 free_cpumask_var(mask);
f87e4cac
JF
1263}
1264#else
7708ad64 1265static void xen_drop_mm_ref(struct mm_struct *mm)
f87e4cac
JF
1266{
1267 if (current->active_mm == mm)
1268 load_cr3(swapper_pg_dir);
1269}
1270#endif
1271
/*
 * While a process runs, Xen pins its pagetables, which means that the
 * hypervisor forces it to be read-only, and it controls all updates
 * to it. This means that all pagetable updates have to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch to use init_mm,
 * unpin the old process pagetable and mark it all read-write, which
 * allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may still be using the
 * pagetable because of lazy tlb flushing. This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
1286void xen_exit_mmap(struct mm_struct *mm)
1287{
1288 get_cpu(); /* make sure we don't move around */
7708ad64 1289 xen_drop_mm_ref(mm);
f87e4cac 1290 put_cpu();
3b827c1b 1291
f120f13e 1292 spin_lock(&mm->page_table_lock);
df912ea4
JF
1293
1294 /* pgd may not be pinned in the error exit path of execve */
7708ad64 1295 if (xen_page_pinned(mm->pgd))
eefb47f6 1296 xen_pgd_unpin(mm);
74260714 1297
f120f13e 1298 spin_unlock(&mm->page_table_lock);
3b827c1b 1299}
994025ca 1300
319f3ba5
JF
1301static __init void xen_pagetable_setup_start(pgd_t *base)
1302{
1303}
1304
f1d7062a
TG
1305static void xen_post_allocator_init(void);
1306
319f3ba5
JF
1307static __init void xen_pagetable_setup_done(pgd_t *base)
1308{
1309 xen_setup_shared_info();
f1d7062a 1310 xen_post_allocator_init();
319f3ba5
JF
1311}
1312
1313static void xen_write_cr2(unsigned long cr2)
1314{
1315 percpu_read(xen_vcpu)->arch.cr2 = cr2;
1316}
1317
1318static unsigned long xen_read_cr2(void)
1319{
1320 return percpu_read(xen_vcpu)->arch.cr2;
1321}
1322
1323unsigned long xen_read_cr2_direct(void)
1324{
1325 return percpu_read(xen_vcpu_info.arch.cr2);
1326}
1327
1328static void xen_flush_tlb(void)
1329{
1330 struct mmuext_op *op;
1331 struct multicall_space mcs;
1332
1333 preempt_disable();
1334
1335 mcs = xen_mc_entry(sizeof(*op));
1336
1337 op = mcs.args;
1338 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1339 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1340
1341 xen_mc_issue(PARAVIRT_LAZY_MMU);
1342
1343 preempt_enable();
1344}
1345
1346static void xen_flush_tlb_single(unsigned long addr)
1347{
1348 struct mmuext_op *op;
1349 struct multicall_space mcs;
1350
1351 preempt_disable();
1352
1353 mcs = xen_mc_entry(sizeof(*op));
1354 op = mcs.args;
1355 op->cmd = MMUEXT_INVLPG_LOCAL;
1356 op->arg1.linear_addr = addr & PAGE_MASK;
1357 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1358
1359 xen_mc_issue(PARAVIRT_LAZY_MMU);
1360
1361 preempt_enable();
1362}
1363
1364static void xen_flush_tlb_others(const struct cpumask *cpus,
1365 struct mm_struct *mm, unsigned long va)
1366{
1367 struct {
1368 struct mmuext_op op;
1369 DECLARE_BITMAP(mask, NR_CPUS);
1370 } *args;
1371 struct multicall_space mcs;
1372
e3f8a74e
JF
1373 if (cpumask_empty(cpus))
1374 return; /* nothing to do */
319f3ba5
JF
1375
1376 mcs = xen_mc_entry(sizeof(*args));
1377 args = mcs.args;
1378 args->op.arg2.vcpumask = to_cpumask(args->mask);
1379
1380 /* Remove us, and any offline CPUS. */
1381 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1382 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
319f3ba5
JF
1383
1384 if (va == TLB_FLUSH_ALL) {
1385 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1386 } else {
1387 args->op.cmd = MMUEXT_INVLPG_MULTI;
1388 args->op.arg1.linear_addr = va;
1389 }
1390
1391 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1392
319f3ba5
JF
1393 xen_mc_issue(PARAVIRT_LAZY_MMU);
1394}
1395
1396static unsigned long xen_read_cr3(void)
1397{
1398 return percpu_read(xen_cr3);
1399}
1400
1401static void set_current_cr3(void *v)
1402{
1403 percpu_write(xen_current_cr3, (unsigned long)v);
1404}
1405
1406static void __xen_write_cr3(bool kernel, unsigned long cr3)
1407{
1408 struct mmuext_op *op;
1409 struct multicall_space mcs;
1410 unsigned long mfn;
1411
1412 if (cr3)
1413 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1414 else
1415 mfn = 0;
1416
1417 WARN_ON(mfn == 0 && kernel);
1418
1419 mcs = __xen_mc_entry(sizeof(*op));
1420
1421 op = mcs.args;
1422 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1423 op->arg1.mfn = mfn;
1424
1425 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1426
1427 if (kernel) {
1428 percpu_write(xen_cr3, cr3);
1429
1430 /* Update xen_current_cr3 once the batch has actually
1431 been submitted. */
1432 xen_mc_callback(set_current_cr3, (void *)cr3);
1433 }
1434}
1435
1436static void xen_write_cr3(unsigned long cr3)
1437{
1438 BUG_ON(preemptible());
1439
1440 xen_mc_batch(); /* disables interrupts */
1441
	/* Update while interrupts are disabled, so it's atomic with
	   respect to ipis */
1444 percpu_write(xen_cr3, cr3);
1445
1446 __xen_write_cr3(true, cr3);
1447
1448#ifdef CONFIG_X86_64
1449 {
1450 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1451 if (user_pgd)
1452 __xen_write_cr3(false, __pa(user_pgd));
1453 else
1454 __xen_write_cr3(false, 0);
1455 }
1456#endif
1457
1458 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1459}
1460
1461static int xen_pgd_alloc(struct mm_struct *mm)
1462{
1463 pgd_t *pgd = mm->pgd;
1464 int ret = 0;
1465
1466 BUG_ON(PagePinned(virt_to_page(pgd)));
1467
1468#ifdef CONFIG_X86_64
1469 {
1470 struct page *page = virt_to_page(pgd);
1471 pgd_t *user_pgd;
1472
1473 BUG_ON(page->private != 0);
1474
1475 ret = -ENOMEM;
1476
1477 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1478 page->private = (unsigned long)user_pgd;
1479
1480 if (user_pgd != NULL) {
1481 user_pgd[pgd_index(VSYSCALL_START)] =
1482 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1483 ret = 0;
1484 }
1485
1486 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1487 }
1488#endif
1489
1490 return ret;
1491}
1492
1493static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1494{
1495#ifdef CONFIG_X86_64
1496 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1497
1498 if (user_pgd)
1499 free_page((unsigned long)user_pgd);
1500#endif
1501}
1502
1f4f9315
JF
1503#ifdef CONFIG_X86_32
1504static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1505{
1506 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1507 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1508 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1509 pte_val_ma(pte));
1510
1511 return pte;
1512}
1513
1514/* Init-time set_pte while constructing initial pagetables, which
1515 doesn't allow RO pagetable pages to be remapped RW */
1516static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1517{
1518 pte = mask_rw_pte(ptep, pte);
1519
1520 xen_set_pte(ptep, pte);
1521}
1522#endif
319f3ba5 1523
b96229b5
JF
1524static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1525{
1526 struct mmuext_op op;
1527 op.cmd = cmd;
1528 op.arg1.mfn = pfn_to_mfn(pfn);
1529 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1530 BUG();
1531}
1532
319f3ba5
JF
1533/* Early in boot, while setting up the initial pagetable, assume
1534 everything is pinned. */
1535static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1536{
b96229b5
JF
1537#ifdef CONFIG_FLATMEM
1538 BUG_ON(mem_map); /* should only be used early */
1539#endif
1540 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1541 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1542}
1543
1544/* Used for pmd and pud */
1545static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1546{
319f3ba5
JF
1547#ifdef CONFIG_FLATMEM
1548 BUG_ON(mem_map); /* should only be used early */
1549#endif
1550 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1551}
1552
1553/* Early release_pte assumes that all pts are pinned, since there's
1554 only init_mm and anything attached to that is pinned. */
b96229b5 1555static __init void xen_release_pte_init(unsigned long pfn)
319f3ba5 1556{
b96229b5 1557 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
319f3ba5
JF
1558 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1559}
1560
b96229b5 1561static __init void xen_release_pmd_init(unsigned long pfn)
319f3ba5 1562{
b96229b5 1563 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
319f3ba5
JF
1564}
1565
/* This needs to make sure the new pte page is pinned iff it's being
   attached to a pinned pagetable. */
1568static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1569{
1570 struct page *page = pfn_to_page(pfn);
1571
1572 if (PagePinned(virt_to_page(mm->pgd))) {
1573 SetPagePinned(page);
1574
1575 vm_unmap_aliases();
1576 if (!PageHighMem(page)) {
1577 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1578 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1579 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1580 } else {
1581 /* make sure there are no stray mappings of
1582 this page */
1583 kmap_flush_unused();
1584 }
1585 }
1586}
1587
1588static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1589{
1590 xen_alloc_ptpage(mm, pfn, PT_PTE);
1591}
1592
1593static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1594{
1595 xen_alloc_ptpage(mm, pfn, PT_PMD);
1596}
1597
1598/* This should never happen until we're OK to use struct page */
1599static void xen_release_ptpage(unsigned long pfn, unsigned level)
1600{
1601 struct page *page = pfn_to_page(pfn);
1602
1603 if (PagePinned(page)) {
1604 if (!PageHighMem(page)) {
1605 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1606 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1607 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1608 }
1609 ClearPagePinned(page);
1610 }
1611}
1612
1613static void xen_release_pte(unsigned long pfn)
1614{
1615 xen_release_ptpage(pfn, PT_PTE);
1616}
1617
1618static void xen_release_pmd(unsigned long pfn)
1619{
1620 xen_release_ptpage(pfn, PT_PMD);
1621}
1622
1623#if PAGETABLE_LEVELS == 4
1624static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1625{
1626 xen_alloc_ptpage(mm, pfn, PT_PUD);
1627}
1628
1629static void xen_release_pud(unsigned long pfn)
1630{
1631 xen_release_ptpage(pfn, PT_PUD);
1632}
1633#endif
1634
1635void __init xen_reserve_top(void)
1636{
1637#ifdef CONFIG_X86_32
1638 unsigned long top = HYPERVISOR_VIRT_START;
1639 struct xen_platform_parameters pp;
1640
1641 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1642 top = pp.virt_start;
1643
1644 reserve_top_address(-top);
1645#endif /* CONFIG_X86_32 */
1646}
1647
/*
 * Like __va(), but returns address in the kernel mapping (which is
 * all we have until the physical memory mapping has been set up).
 */
1652static void *__ka(phys_addr_t paddr)
1653{
1654#ifdef CONFIG_X86_64
1655 return (void *)(paddr + __START_KERNEL_map);
1656#else
1657 return __va(paddr);
1658#endif
1659}
1660
1661/* Convert a machine address to physical address */
1662static unsigned long m2p(phys_addr_t maddr)
1663{
1664 phys_addr_t paddr;
1665
1666 maddr &= PTE_PFN_MASK;
1667 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1668
1669 return paddr;
1670}
1671
1672/* Convert a machine address to kernel virtual */
1673static void *m2v(phys_addr_t maddr)
1674{
1675 return __ka(m2p(maddr));
1676}
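
/*
 * Added example, not in the original file: for a machine address maddr,
 * m2v() composes the helpers above, i.e.
 *
 *	m2v(maddr) == __ka((phys_addr_t)mfn_to_pfn(maddr >> PAGE_SHIFT)
 *			   << PAGE_SHIFT)
 *
 * with the offset bits below PAGE_SHIFT masked off by PTE_PFN_MASK.
 */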
1677
1678static void set_page_prot(void *addr, pgprot_t prot)
1679{
1680 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1681 pte_t pte = pfn_pte(pfn, prot);
1682
1683 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1684 BUG();
1685}
1686
1687static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1688{
1689 unsigned pmdidx, pteidx;
1690 unsigned ident_pte;
1691 unsigned long pfn;
1692
1693 ident_pte = 0;
1694 pfn = 0;
1695 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1696 pte_t *pte_page;
1697
1698 /* Reuse or allocate a page of ptes */
1699 if (pmd_present(pmd[pmdidx]))
1700 pte_page = m2v(pmd[pmdidx].pmd);
1701 else {
1702 /* Check for free pte pages */
1703 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1704 break;
1705
1706 pte_page = &level1_ident_pgt[ident_pte];
1707 ident_pte += PTRS_PER_PTE;
1708
1709 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1710 }
1711
1712 /* Install mappings */
1713 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1714 pte_t pte;
1715
1716 if (pfn > max_pfn_mapped)
1717 max_pfn_mapped = pfn;
1718
1719 if (!pte_none(pte_page[pteidx]))
1720 continue;
1721
1722 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1723 pte_page[pteidx] = pte;
1724 }
1725 }
1726
1727 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1728 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1729
1730 set_page_prot(pmd, PAGE_KERNEL_RO);
1731}
1732
1733#ifdef CONFIG_X86_64
1734static void convert_pfn_mfn(void *v)
1735{
1736 pte_t *pte = v;
1737 int i;
1738
1739 /* All levels are converted the same way, so just treat them
1740 as ptes. */
1741 for (i = 0; i < PTRS_PER_PTE; i++)
1742 pte[i] = xen_make_pte(pte[i].pte);
1743}
1744
/*
 * Set up the initial kernel pagetable.
 *
 * We can construct this by grafting the Xen-provided pagetable into
 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
 * means that only the kernel has a physical mapping to start with -
 * but that's enough to get __va working. We need to fill in the rest
 * of the physical mapping once some sort of allocator has been set
 * up.
 */
1756__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1757 unsigned long max_pfn)
1758{
1759 pud_t *l3;
1760 pmd_t *l2;
1761
1762 /* Zap identity mapping */
1763 init_level4_pgt[0] = __pgd(0);
1764
1765 /* Pre-constructed entries are in pfn, so convert to mfn */
1766 convert_pfn_mfn(init_level4_pgt);
1767 convert_pfn_mfn(level3_ident_pgt);
1768 convert_pfn_mfn(level3_kernel_pgt);
1769
1770 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1771 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1772
1773 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1774 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1775
1776 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1777 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1778 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1779
1780 /* Set up identity map */
1781 xen_map_identity_early(level2_ident_pgt, max_pfn);
1782
1783 /* Make pagetable pieces RO */
1784 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1785 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1786 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1787 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1788 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1789 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1790
1791 /* Pin down new L4 */
1792 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1793 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1794
1795 /* Unpin Xen-provided one */
1796 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1797
1798 /* Switch over */
1799 pgd = init_level4_pgt;
1800
1801 /*
1802 * At this stage there can be no user pgd, and no page
1803 * structure to attach it to, so make sure we just set kernel
1804 * pgd.
1805 */
1806 xen_mc_batch();
1807 __xen_write_cr3(true, __pa(pgd));
1808 xen_mc_issue(PARAVIRT_LAZY_CPU);
1809
1810 reserve_early(__pa(xen_start_info->pt_base),
1811 __pa(xen_start_info->pt_base +
1812 xen_start_info->nr_pt_frames * PAGE_SIZE),
1813 "XEN PAGETABLES");
1814
1815 return pgd;
1816}
1817#else /* !CONFIG_X86_64 */
1818static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1819
1820__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1821 unsigned long max_pfn)
1822{
1823 pmd_t *kernel_pmd;
1824
93dbda7c
JF
1825 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1826 xen_start_info->nr_pt_frames * PAGE_SIZE +
1827 512*1024);
319f3ba5
JF
1828
1829 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1830 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1831
1832 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1833
1834 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1835 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1836 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1837
1838 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1839 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1840 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1841
1842 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1843
1844 xen_write_cr3(__pa(swapper_pg_dir));
1845
1846 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1847
33df4db0
JF
1848 reserve_early(__pa(xen_start_info->pt_base),
1849 __pa(xen_start_info->pt_base +
1850 xen_start_info->nr_pt_frames * PAGE_SIZE),
1851 "XEN PAGETABLES");
1852
319f3ba5
JF
1853 return swapper_pg_dir;
1854}
1855#endif /* CONFIG_X86_64 */
1856
3b3809ac 1857static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
319f3ba5
JF
1858{
1859 pte_t pte;
1860
1861 phys >>= PAGE_SHIFT;
1862
1863 switch (idx) {
1864 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1865#ifdef CONFIG_X86_F00F_BUG
1866 case FIX_F00F_IDT:
1867#endif
1868#ifdef CONFIG_X86_32
1869 case FIX_WP_TEST:
1870 case FIX_VDSO:
1871# ifdef CONFIG_HIGHMEM
1872 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1873# endif
1874#else
1875 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1876#endif
1877#ifdef CONFIG_X86_LOCAL_APIC
1878 case FIX_APIC_BASE: /* maps dummy local APIC */
1879#endif
3ecb1b7d
JF
1880 case FIX_TEXT_POKE0:
1881 case FIX_TEXT_POKE1:
1882 /* All local page mappings */
319f3ba5
JF
1883 pte = pfn_pte(phys, prot);
1884 break;
1885
c0011dbf
JF
1886 case FIX_PARAVIRT_BOOTMAP:
1887 /* This is an MFN, but it isn't an IO mapping from the
1888 IO domain */
319f3ba5
JF
1889 pte = mfn_pte(phys, prot);
1890 break;
c0011dbf
JF
1891
1892 default:
1893 /* By default, set_fixmap is used for hardware mappings */
1894 pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
1895 break;
319f3ba5
JF
1896 }
1897
1898 __native_set_fixmap(idx, pte);
1899
1900#ifdef CONFIG_X86_64
1901 /* Replicate changes to map the vsyscall page into the user
1902 pagetable vsyscall mapping. */
1903 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1904 unsigned long vaddr = __fix_to_virt(idx);
1905 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1906 }
1907#endif
1908}
1909
f1d7062a 1910static __init void xen_post_allocator_init(void)
319f3ba5
JF
1911{
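	/* Runs once the normal page allocator is up: swap the boot-time
	   *_init and *_hyper hooks installed in xen_mmu_ops below for the
	   regular implementations. */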
1912 pv_mmu_ops.set_pte = xen_set_pte;
1913 pv_mmu_ops.set_pmd = xen_set_pmd;
1914 pv_mmu_ops.set_pud = xen_set_pud;
1915#if PAGETABLE_LEVELS == 4
1916 pv_mmu_ops.set_pgd = xen_set_pgd;
1917#endif
1918
1919 /* This will work as long as patching hasn't happened yet
1920 (which it hasn't) */
1921 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1922 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1923 pv_mmu_ops.release_pte = xen_release_pte;
1924 pv_mmu_ops.release_pmd = xen_release_pmd;
1925#if PAGETABLE_LEVELS == 4
1926 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1927 pv_mmu_ops.release_pud = xen_release_pud;
1928#endif
1929
1930#ifdef CONFIG_X86_64
1931 SetPagePinned(virt_to_page(level3_user_vsyscall));
1932#endif
1933 xen_mark_init_mm_pinned();
1934}
1935
b407fc57
JF
1936static void xen_leave_lazy_mmu(void)
1937{
5caecb94 1938 preempt_disable();
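	/* Push out anything still sitting in the per-cpu multicall
	   buffer before we stop batching. */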
b407fc57
JF
1939 xen_mc_flush();
1940 paravirt_leave_lazy_mmu();
5caecb94 1941 preempt_enable();
b407fc57 1942}
319f3ba5 1943
030cb6c0 1944static const struct pv_mmu_ops xen_mmu_ops __initdata = {
319f3ba5
JF
1945 .read_cr2 = xen_read_cr2,
1946 .write_cr2 = xen_write_cr2,
1947
1948 .read_cr3 = xen_read_cr3,
1949 .write_cr3 = xen_write_cr3,
1950
1951 .flush_tlb_user = xen_flush_tlb,
1952 .flush_tlb_kernel = xen_flush_tlb,
1953 .flush_tlb_single = xen_flush_tlb_single,
1954 .flush_tlb_others = xen_flush_tlb_others,
1955
1956 .pte_update = paravirt_nop,
1957 .pte_update_defer = paravirt_nop,
1958
1959 .pgd_alloc = xen_pgd_alloc,
1960 .pgd_free = xen_pgd_free,
1961
1962 .alloc_pte = xen_alloc_pte_init,
1963 .release_pte = xen_release_pte_init,
b96229b5 1964 .alloc_pmd = xen_alloc_pmd_init,
319f3ba5 1965 .alloc_pmd_clone = paravirt_nop,
b96229b5 1966 .release_pmd = xen_release_pmd_init,
319f3ba5 1967
319f3ba5
JF
1968#ifdef CONFIG_X86_64
1969 .set_pte = xen_set_pte,
1970#else
1971 .set_pte = xen_set_pte_init,
1972#endif
1973 .set_pte_at = xen_set_pte_at,
1974 .set_pmd = xen_set_pmd_hyper,
1975
1976 .ptep_modify_prot_start = __ptep_modify_prot_start,
1977 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1978
da5de7c2
JF
1979 .pte_val = PV_CALLEE_SAVE(xen_pte_val),
1980 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
319f3ba5 1981
da5de7c2
JF
1982 .make_pte = PV_CALLEE_SAVE(xen_make_pte),
1983 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
319f3ba5
JF
1984
1985#ifdef CONFIG_X86_PAE
1986 .set_pte_atomic = xen_set_pte_atomic,
319f3ba5
JF
1987 .pte_clear = xen_pte_clear,
1988 .pmd_clear = xen_pmd_clear,
1989#endif /* CONFIG_X86_PAE */
1990 .set_pud = xen_set_pud_hyper,
1991
da5de7c2
JF
1992 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
1993 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
319f3ba5
JF
1994
1995#if PAGETABLE_LEVELS == 4
da5de7c2
JF
1996 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
1997 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
319f3ba5
JF
1998 .set_pgd = xen_set_pgd_hyper,
1999
b96229b5
JF
2000 .alloc_pud = xen_alloc_pmd_init,
2001 .release_pud = xen_release_pmd_init,
319f3ba5
JF
2002#endif /* PAGETABLE_LEVELS == 4 */
2003
2004 .activate_mm = xen_activate_mm,
2005 .dup_mmap = xen_dup_mmap,
2006 .exit_mmap = xen_exit_mmap,
2007
2008 .lazy_mode = {
2009 .enter = paravirt_enter_lazy_mmu,
b407fc57 2010 .leave = xen_leave_lazy_mmu,
319f3ba5
JF
2011 },
2012
2013 .set_fixmap = xen_set_fixmap,
2014};
2015
030cb6c0
TG
2016void __init xen_init_mmu_ops(void)
2017{
2018 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2019 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2020 pv_mmu_ops = xen_mmu_ops;
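	/* From here on every pv_mmu_ops hook is routed through the Xen
	   implementations defined above. */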
2021}
319f3ba5 2022
994025ca
JF
2023#ifdef CONFIG_XEN_DEBUG_FS
2024
2025static struct dentry *d_mmu_debug;
2026
2027static int __init xen_mmu_debugfs(void)
2028{
2029 struct dentry *d_xen = xen_init_debugfs();
2030
2031 if (d_xen == NULL)
2032 return -ENOMEM;
2033
2034 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
2035
2036 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
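	/* zero_stats is writable (0644); presumably writing a non-zero
	   value asks the stat-collection paths in this file to clear the
	   counters on their next update (hence u8 rather than u32). */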
2037
2038 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
2039 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
2040 &mmu_stats.pgd_update_pinned);
2041 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
2042 &mmu_stats.pgd_update_batched);
2043
2044 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
2045 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
2046 &mmu_stats.pud_update_pinned);
2047 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
2048 &mmu_stats.pud_update_batched);
2049
2050 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
2051 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
2052 &mmu_stats.pmd_update_pinned);
2053 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
2054 &mmu_stats.pmd_update_batched);
2055
2056 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
2057// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
2058// &mmu_stats.pte_update_pinned);
2059 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
2060 &mmu_stats.pte_update_batched);
2061
2062 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
2063 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
2064 &mmu_stats.mmu_update_extended);
2065 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
2066 mmu_stats.mmu_update_histo, 20);
2067
2068 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
2069 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
2070 &mmu_stats.set_pte_at_batched);
2071 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
2072 &mmu_stats.set_pte_at_current);
2073 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
2074 &mmu_stats.set_pte_at_kernel);
2075
2076 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
2077 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2078 &mmu_stats.prot_commit_batched);
2079
2080 return 0;
2081}
2082fs_initcall(xen_mmu_debugfs);
2083
2084#endif /* CONFIG_XEN_DEBUG_FS */