/*
 * Xen leaves the responsibility for maintaining p2m mappings to the
 * guests themselves, but it must also access and update the p2m array
 * during suspend/resume when all the pages are reallocated.
 *
 * The p2m table is logically a flat array, but we implement it as a
 * three-level tree to allow the address space to be sparse.
 *
 *                               Xen
 *                                |
 *      p2m_top              p2m_top_mfn
 *         /  \                   /   \
 * p2m_mid p2m_mid        p2m_mid_mfn p2m_mid_mfn
 *    / \      / \           /           /
 *  p2m p2m p2m p2m p2m p2m p2m ...
 *
 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
 *
 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
 * maximum representable pseudo-physical address space is:
 *  P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
 *
 * P2M_PER_PAGE depends on the architecture, as an mfn is always
 * unsigned long (8 bytes on 64-bit, 4 bytes on 32-bit), leading to
 * 512 and 1024 entries respectively.
 */
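
/*
 * Concretely, with 4 KiB pages: a 64-bit build has 512 entries per
 * level, so the tree can name 512^3 = 2^27 pages (a 512 GiB
 * pseudo-physical space); a 32-bit build has 1024 entries per level,
 * for 2^30 pages.
 */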

#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/sched.h>

#include <asm/cache.h>
#include <asm/setup.h>

#include <asm/xen/page.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include "xen-ops.h"

static void __init m2p_override_init(void);

unsigned long xen_max_p2m_pfn __read_mostly;

#define P2M_PER_PAGE		(PAGE_SIZE / sizeof(unsigned long))
#define P2M_MID_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long *))
#define P2M_TOP_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long **))

#define MAX_P2M_PFN		(P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)

/* Placeholders for holes in the address space */
static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);

static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);

RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
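
/*
 * The RESERVE_BRK() reservations above set aside enough early-boot brk
 * space for the mid-level pages that xen_build_dynamic_phys_to_machine()
 * later allocates with extend_brk() when grafting in up to
 * MAX_DOMAIN_PAGES worth of entries.
 */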

static inline unsigned p2m_top_index(unsigned long pfn)
{
	BUG_ON(pfn >= MAX_P2M_PFN);
	return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
}

static inline unsigned p2m_mid_index(unsigned long pfn)
{
	return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
}

static inline unsigned p2m_index(unsigned long pfn)
{
	return pfn % P2M_PER_PAGE;
}
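
/*
 * For example, on a 64-bit build (512 entries per level) pfn 74565
 * splits into topidx = 74565 / (512 * 512) = 0, mididx =
 * (74565 / 512) % 512 = 145 and idx = 74565 % 512 = 325, so its
 * entry lives at p2m_top[0][145][325].
 */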

static void p2m_top_init(unsigned long ***top)
{
	unsigned i;

	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
		top[i] = p2m_mid_missing;
}

static void p2m_top_mfn_init(unsigned long *top)
{
	unsigned i;

	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
		top[i] = virt_to_mfn(p2m_mid_missing_mfn);
}

static void p2m_top_mfn_p_init(unsigned long **top)
{
	unsigned i;

	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
		top[i] = p2m_mid_missing_mfn;
}

static void p2m_mid_init(unsigned long **mid)
{
	unsigned i;

	for (i = 0; i < P2M_MID_PER_PAGE; i++)
		mid[i] = p2m_missing;
}

static void p2m_mid_mfn_init(unsigned long *mid)
{
	unsigned i;

	for (i = 0; i < P2M_MID_PER_PAGE; i++)
		mid[i] = virt_to_mfn(p2m_missing);
}

static void p2m_init(unsigned long *p2m)
{
	unsigned i;

	/* A leaf page holds P2M_PER_PAGE entries (the mid and leaf
	 * counts happen to be equal on both word sizes, but use the
	 * right name). */
	for (i = 0; i < P2M_PER_PAGE; i++)
		p2m[i] = INVALID_P2M_ENTRY;
}
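
/*
 * With these initialisers every lookup path is always walkable: a
 * hole resolves p2m_top[i] -> p2m_mid_missing -> p2m_missing ->
 * INVALID_P2M_ENTRY instead of hitting a NULL pointer.
 */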

/*
 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
 *
 * This is called both at boot time, and after resuming from suspend:
 * - At boot time we're called very early, and must use extend_brk()
 *   to allocate memory.
 *
 * - After resume we're called from within stop_machine, but the mfn
 *   tree should already be completely allocated.
 */
void xen_build_mfn_list_list(void)
{
	unsigned long pfn;

	/* Pre-initialize p2m_top_mfn to be completely missing */
	if (p2m_top_mfn == NULL) {
		p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
		p2m_mid_mfn_init(p2m_mid_missing_mfn);

		p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
		p2m_top_mfn_p_init(p2m_top_mfn_p);

		p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
		p2m_top_mfn_init(p2m_top_mfn);
	} else {
		/* Reinitialise; mfns all change after migration */
		p2m_mid_mfn_init(p2m_mid_missing_mfn);
	}

	for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
		unsigned topidx = p2m_top_index(pfn);
		unsigned mididx = p2m_mid_index(pfn);
		unsigned long **mid;
		unsigned long *mid_mfn_p;

		mid = p2m_top[topidx];
		mid_mfn_p = p2m_top_mfn_p[topidx];

		/* Don't bother allocating any mfn mid levels if
		 * they're just missing; just update the stored mfn,
		 * since all could have changed over a migrate.
		 */
		if (mid == p2m_mid_missing) {
			BUG_ON(mididx);
			BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
			p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
			/* Skip the rest of this top-level entry */
			pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
			continue;
		}

		if (mid_mfn_p == p2m_mid_missing_mfn) {
			/*
			 * XXX boot-time only!  We should never find
			 * missing parts of the mfn tree after
			 * runtime.  extend_brk() will BUG if we call
			 * it too late.
			 */
			mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
			p2m_mid_mfn_init(mid_mfn_p);

			p2m_top_mfn_p[topidx] = mid_mfn_p;
		}

		p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
		mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
	}
}

void xen_setup_mfn_list_list(void)
{
	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);

	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
		virt_to_mfn(p2m_top_mfn);
	HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
}
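
/*
 * pfn_to_mfn_frame_list_list is what the hypervisor and the
 * save/restore tools walk to locate the guest's p2m, which is why
 * xen_build_mfn_list_list() must keep the parallel mfn tree in sync
 * with p2m_top.
 */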

/* Set up p2m_top to point to the domain-builder provided p2m pages */
void __init xen_build_dynamic_phys_to_machine(void)
{
	unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
	unsigned long pfn;

	xen_max_p2m_pfn = max_pfn;

	p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
	p2m_init(p2m_missing);

	p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
	p2m_mid_init(p2m_mid_missing);

	p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
	p2m_top_init(p2m_top);

	/*
	 * The domain builder gives us a pre-constructed p2m array in
	 * mfn_list for all the pages initially given to us, so we just
	 * need to graft that into our tree structure.
	 */
	for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
		unsigned topidx = p2m_top_index(pfn);
		unsigned mididx = p2m_mid_index(pfn);

		if (p2m_top[topidx] == p2m_mid_missing) {
			unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
			p2m_mid_init(mid);

			p2m_top[topidx] = mid;
		}

		/*
		 * As long as the mfn_list has enough entries to completely
		 * fill a p2m page, pointing into the array is ok.  But if
		 * not, the entries beyond the last pfn will be undefined.
		 */
		if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
			unsigned long p2midx;

			/* Mark the tail of the final, partial page invalid */
			p2midx = max_pfn % P2M_PER_PAGE;
			for ( ; p2midx < P2M_PER_PAGE; p2midx++)
				mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY;
		}
		p2m_top[topidx][mididx] = &mfn_list[pfn];
	}

	m2p_override_init();
}

unsigned long get_phys_to_machine(unsigned long pfn)
{
	unsigned topidx, mididx, idx;

	if (unlikely(pfn >= MAX_P2M_PFN))
		return INVALID_P2M_ENTRY;

	topidx = p2m_top_index(pfn);
	mididx = p2m_mid_index(pfn);
	idx = p2m_index(pfn);

	return p2m_top[topidx][mididx][idx];
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);
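
/*
 * Illustrative sketch only (not part of the p2m interface proper):
 * roughly how a lookup wrapper can use get_phys_to_machine(), masking
 * the FOREIGN_FRAME_BIT marker that m2p_add_override() sets below.
 * The name example_pfn_to_mfn is made up for this sketch.
 */
static inline unsigned long __maybe_unused
example_pfn_to_mfn(unsigned long pfn)
{
	unsigned long mfn = get_phys_to_machine(pfn);

	/* INVALID_P2M_ENTRY is all-ones; don't strip bits from it */
	if (mfn != INVALID_P2M_ENTRY)
		mfn &= ~FOREIGN_FRAME_BIT;

	return mfn;
}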

static void *alloc_p2m_page(void)
{
	return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
}

static void free_p2m_page(void *p)
{
	free_page((unsigned long)p);
}

/*
 * Fully allocate the p2m structure for a given pfn.  We need to check
 * that both the top and mid levels are allocated, and make sure the
 * parallel mfn tree is kept in sync.  We may race with other cpus, so
 * the new pages are installed with cmpxchg; if we lose the race then
 * simply free the page we allocated and use the one that's there.
 */
static bool alloc_p2m(unsigned long pfn)
{
	unsigned topidx, mididx;
	unsigned long ***top_p, **mid;
	unsigned long *top_mfn_p, *mid_mfn;

	topidx = p2m_top_index(pfn);
	mididx = p2m_mid_index(pfn);

	top_p = &p2m_top[topidx];
	mid = *top_p;

	if (mid == p2m_mid_missing) {
		/* Mid level is missing, allocate a new one */
		mid = alloc_p2m_page();
		if (!mid)
			return false;

		p2m_mid_init(mid);

		if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
			free_p2m_page(mid);
	}

	top_mfn_p = &p2m_top_mfn[topidx];
	mid_mfn = p2m_top_mfn_p[topidx];

	BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);

	if (mid_mfn == p2m_mid_missing_mfn) {
		/* Separately check the mid mfn level */
		unsigned long missing_mfn;
		unsigned long mid_mfn_mfn;

		mid_mfn = alloc_p2m_page();
		if (!mid_mfn)
			return false;

		p2m_mid_mfn_init(mid_mfn);

		missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
		mid_mfn_mfn = virt_to_mfn(mid_mfn);
		if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
			free_p2m_page(mid_mfn);
		else
			p2m_top_mfn_p[topidx] = mid_mfn;
	}

	if (p2m_top[topidx][mididx] == p2m_missing) {
		/* p2m leaf page is missing */
		unsigned long *p2m;

		p2m = alloc_p2m_page();
		if (!p2m)
			return false;

		p2m_init(p2m);

		if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
			free_p2m_page(p2m);
		else
			mid_mfn[mididx] = virt_to_mfn(p2m);
	}

	return true;
}
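
/*
 * Losing a cmpxchg race in alloc_p2m() is harmless by construction:
 * the winning cpu installed a page with the same "missing" initial
 * contents as ours, so we free our copy and keep walking theirs.
 */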

/* Try to install p2m mapping; fail if intermediate bits missing */
bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
	unsigned topidx, mididx, idx;

	if (unlikely(pfn >= MAX_P2M_PFN)) {
		BUG_ON(mfn != INVALID_P2M_ENTRY);
		return true;
	}

	topidx = p2m_top_index(pfn);
	mididx = p2m_mid_index(pfn);
	idx = p2m_index(pfn);

	if (p2m_top[topidx][mididx] == p2m_missing)
		return mfn == INVALID_P2M_ENTRY;

	p2m_top[topidx][mididx][idx] = mfn;

	return true;
}

bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
		return true;
	}

	if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
		/* Fault in the missing tree levels, then retry */
		if (!alloc_p2m(pfn))
			return false;

		if (!__set_phys_to_machine(pfn, mfn))
			return false;
	}

	return true;
}
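
/*
 * Xen's global m2p table maps an mfn to the owning domain's pfn, so
 * mfns of foreign pages mapped into this domain (e.g. via grant
 * tables) need a local override to make reverse lookups resolve to
 * the local pfn backing the mapping.  The overrides live in a small
 * hash table of struct pages, keyed by mfn.
 */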

#define M2P_OVERRIDE_HASH_SHIFT	10
#define M2P_OVERRIDE_HASH	(1 << M2P_OVERRIDE_HASH_SHIFT)

static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
static DEFINE_SPINLOCK(m2p_override_lock);

static void __init m2p_override_init(void)
{
	unsigned i;

	m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
				   sizeof(unsigned long));

	for (i = 0; i < M2P_OVERRIDE_HASH; i++)
		INIT_LIST_HEAD(&m2p_overrides[i]);
}

static unsigned long mfn_hash(unsigned long mfn)
{
	return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
}

/* Add an MFN override for a particular page */
int m2p_add_override(unsigned long mfn, struct page *page)
{
	unsigned long flags;
	unsigned long pfn;
	unsigned long address;
	unsigned level;
	pte_t *ptep = NULL;

	pfn = page_to_pfn(page);
	if (!PageHighMem(page)) {
		address = (unsigned long)__va(pfn << PAGE_SHIFT);
		ptep = lookup_address(address, &level);

		if (WARN(ptep == NULL || level != PG_LEVEL_4K,
			 "m2p_add_override: pfn %lx not mapped", pfn))
			return -EINVAL;
	}

	/* Remember the mfn for hash lookup and the original mapping */
	page->private = mfn;
	page->index = pfn_to_mfn(pfn);

	__set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
	if (!PageHighMem(page))
		/* Just zap old mapping for now */
		pte_clear(&init_mm, address, ptep);

	spin_lock_irqsave(&m2p_override_lock, flags);
	list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
	spin_unlock_irqrestore(&m2p_override_lock, flags);

	return 0;
}

int m2p_remove_override(struct page *page)
{
	unsigned long flags;
	unsigned long mfn;
	unsigned long pfn;
	unsigned long address;
	unsigned level;
	pte_t *ptep = NULL;

	pfn = page_to_pfn(page);
	mfn = get_phys_to_machine(pfn);
	if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
		return -EINVAL;

	if (!PageHighMem(page)) {
		address = (unsigned long)__va(pfn << PAGE_SHIFT);
		ptep = lookup_address(address, &level);

		if (WARN(ptep == NULL || level != PG_LEVEL_4K,
			 "m2p_remove_override: pfn %lx not mapped", pfn))
			return -EINVAL;
	}

	spin_lock_irqsave(&m2p_override_lock, flags);
	list_del(&page->lru);
	spin_unlock_irqrestore(&m2p_override_lock, flags);
	__set_phys_to_machine(pfn, page->index);

	if (!PageHighMem(page))
		set_pte_at(&init_mm, address, ptep,
			   pfn_pte(pfn, PAGE_KERNEL));
		/* No tlb flush necessary because the caller already
		 * left the pte unmapped. */

	return 0;
}

struct page *m2p_find_override(unsigned long mfn)
{
	unsigned long flags;
	struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
	struct page *p, *ret;

	ret = NULL;

	spin_lock_irqsave(&m2p_override_lock, flags);

	list_for_each_entry(p, bucket, lru) {
		if (p->private == mfn) {
			ret = p;
			break;
		}
	}

	spin_unlock_irqrestore(&m2p_override_lock, flags);

	return ret;
}

unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
{
	struct page *p = m2p_find_override(mfn);
	unsigned long ret = pfn;

	if (p)
		ret = page_to_pfn(p);

	return ret;
}
EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
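
/*
 * Illustrative sketch only: the expected add/find/remove life-cycle of
 * an override for a foreign (grant-mapped) page.  Error handling is
 * elided and the function name is made up for this sketch; the real
 * callers are the grant-mapping drivers.
 */
static void __maybe_unused
example_override_lifecycle(unsigned long foreign_mfn, struct page *page)
{
	/* Point the page's pfn at the foreign mfn, remembering the old one */
	if (m2p_add_override(foreign_mfn, page))
		return;

	/* Reverse (m2p) lookups now resolve to this page's local pfn */
	WARN_ON(m2p_find_override_pfn(foreign_mfn, 0) != page_to_pfn(page));

	/* Restore the original p2m entry and drop the override */
	m2p_remove_override(page);
}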