/*
 *    Copyright IBM Corp. 2007, 2011
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif
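
/*
 * ALLOC_ORDER sizes the multi-page allocations used for the CRST
 * (region/segment) tables. FRAG_MASK tracks which fragments of a 4K page
 * are in use once it is carved up into 2K (64 bit) or 1K (31 bit) page
 * tables; the per-fragment bits live in page->_mapcount (see
 * page_table_alloc() below).
 */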

unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	if (current->active_mm == mm) {
		clear_user_asce();
		set_user_asce(mm);
	}
	__tlb_flush_local();
}

int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;
	int flush;

	BUG_ON(limit > (1UL << 53));
	flush = 0;
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
				_ASCE_USER_BITS | _ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
				_ASCE_USER_BITS | _ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
		flush = 1;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	if (flush)
		on_each_cpu(__crst_table_upgrade, mm, 0);
	return 0;
}
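
/*
 * Each additional CRST level multiplies the addressable range by the 2048
 * entries of one table: a segment table covers 2^31 bytes, a region-third
 * table 2^42, a region-second table 2^53 and a region-first table the full
 * 2^64. crst_table_downgrade() walks back down these levels until the
 * asce_limit fits the requested limit again.
 */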

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (current->active_mm == mm) {
		clear_user_asce();
		__tlb_flush_mm(mm);
	}
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
				_ASCE_USER_BITS | _ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
				_ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	if (current->active_mm == mm)
		set_user_asce(mm);
}
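
/*
 * The gmap ("guest mapping") code below maintains a shadow address space
 * for a guest (used by KVM): crst_list holds the region/segment table pages
 * of the guest ASCE, guest_to_host maps guest segment indices to host
 * segment addresses, and host_to_guest maps host segments back to the guest
 * segment table entries that reference them.
 */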

#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 * @limit: maximum size of the gmap address space
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;
	unsigned long etype, atype;

	if (limit < (1UL << 31)) {
		limit = (1UL << 31) - 1;
		atype = _ASCE_TYPE_SEGMENT;
		etype = _SEGMENT_ENTRY_EMPTY;
	} else if (limit < (1UL << 42)) {
		limit = (1UL << 42) - 1;
		atype = _ASCE_TYPE_REGION3;
		etype = _REGION3_ENTRY_EMPTY;
	} else if (limit < (1UL << 53)) {
		limit = (1UL << 53) - 1;
		atype = _ASCE_TYPE_REGION2;
		etype = _REGION2_ENTRY_EMPTY;
	} else {
		limit = -1UL;
		atype = _ASCE_TYPE_REGION1;
		etype = _REGION1_ENTRY_EMPTY;
	}
	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
	spin_lock_init(&gmap->guest_table_lock);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, etype);
	gmap->table = table;
	gmap->asce = atype | _ASCE_TABLE_LENGTH |
		_ASCE_USER_BITS | __pa(table);
	gmap->asce_end = limit;
	down_write(&mm->mmap_sem);
	list_add(&gmap->list, &mm->context.gmap_list);
	up_write(&mm->mmap_sem);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);
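
/*
 * Typical life cycle of a gmap, sketched for illustration only; the exact
 * call sequence is up to the user of this API (e.g. KVM), and the addresses
 * below are placeholders:
 *
 *	gmap = gmap_alloc(current->mm, 1UL << 42);
 *	gmap_map_segment(gmap, userspace_addr, guest_addr, size);
 *	gmap_enable(gmap);
 *	...
 *	gmap_fault(gmap, guest_addr, FAULT_FLAG_WRITE);
 *	...
 *	gmap_disable(gmap);
 *	gmap_free(gmap);
 */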

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, gmap->asce);
	else
		__tlb_flush_global();
}

static void gmap_radix_tree_free(struct radix_tree_root *root)
{
	struct radix_tree_iter iter;
	unsigned long indices[16];
	unsigned long index;
	void **slot;
	int i, nr;

	/* A radix tree is freed by deleting all of its entries */
	index = 0;
	do {
		nr = 0;
		radix_tree_for_each_slot(slot, root, &iter, index) {
			indices[nr] = iter.index;
			if (++nr == 16)
				break;
		}
		for (i = 0; i < nr; i++) {
			index = indices[i];
			radix_tree_delete(root, index);
		}
	} while (nr > 0);
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, gmap->asce);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
		__free_pages(page, ALLOC_ORDER);
	gmap_radix_tree_free(&gmap->guest_to_host);
	gmap_radix_tree_free(&gmap->host_to_guest);
	down_write(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	up_write(&gmap->mm->mmap_sem);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
			    unsigned long init, unsigned long gaddr)
{
	struct page *page;
	unsigned long *new;

	/* since we dont free the gmap table until gmap_free we can unlock */
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	spin_lock(&gmap->mm->page_table_lock);
	if (*table & _REGION_ENTRY_INVALID) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
		page->index = gaddr;
		page = NULL;
	}
	spin_unlock(&gmap->mm->page_table_lock);
	if (page)
		__free_pages(page, ALLOC_ORDER);
	return 0;
}

/**
 * __gmap_segment_gaddr - find virtual address from segment pointer
 * @entry: pointer to a segment table entry in the guest address space
 *
 * Returns the virtual address in the guest address space for the segment
 */
static unsigned long __gmap_segment_gaddr(unsigned long *entry)
{
	struct page *page;
	unsigned long offset;

	offset = (unsigned long) entry / sizeof(unsigned long);
	offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
	page = pmd_to_page((pmd_t *) entry);
	return page->index + offset;
}

/**
 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
 * @gmap: pointer to the guest address space structure
 * @vmaddr: address in the host process address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
{
	unsigned long *entry;
	int flush = 0;

	spin_lock(&gmap->guest_table_lock);
	entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
	if (entry) {
		flush = (*entry != _SEGMENT_ENTRY_INVALID);
		*entry = _SEGMENT_ENTRY_INVALID;
	}
	spin_unlock(&gmap->guest_table_lock);
	return flush;
}

/**
 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
 * @gmap: pointer to the guest address space structure
 * @gaddr: address in the guest address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE)
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the map succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len < from || to + len < to ||
	    from + len > TASK_MAX_SIZE || to + len > gmap->asce_end)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Remove old translation */
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
		/* Store new translation */
		if (radix_tree_insert(&gmap->guest_to_host,
				      (to + off) >> PMD_SHIFT,
				      (void *) from + off))
			break;
	}
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	if (off >= len)
		return 0;
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

/**
 * __gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long)
		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);
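
/*
 * Both radix trees are indexed by the segment number (address >> PMD_SHIFT,
 * i.e. 1 MB granularity): guest_to_host stores the host segment address for
 * a guest segment, which is why __gmap_translate() only has to or in the
 * page offset within the segment.
 */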

/**
 * gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 */
unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_translate(gmap, gaddr);
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);

/**
 * gmap_unlink - disconnect a page table from the gmap shadow tables
 * @mm: pointer to the parent mm_struct
 * @table: pointer to the host page table
 * @vmaddr: vm address associated with the host page table
 */
static void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			unsigned long vmaddr)
{
	struct gmap *gmap;
	int flush;

	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
		if (flush)
			gmap_flush_tlb(gmap);
	}
}

/**
 * __gmap_link - set up shadow page tables to connect a host to a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @vmaddr: vm address
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
{
	struct mm_struct *mm;
	unsigned long *table;
	spinlock_t *ptl;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	int rc;

	/* Create higher level tables in the gmap page table */
	table = gmap->table;
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
		table += (gaddr >> 53) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
				     gaddr & 0xffe0000000000000))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
		table += (gaddr >> 42) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
				     gaddr & 0xfffffc0000000000))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
		table += (gaddr >> 31) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
				     gaddr & 0xffffffff80000000))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	table += (gaddr >> 20) & 0x7ff;
	/* Walk the parent mm page table */
	mm = gmap->mm;
	pgd = pgd_offset(mm, vmaddr);
	VM_BUG_ON(pgd_none(*pgd));
	pud = pud_offset(pgd, vmaddr);
	VM_BUG_ON(pud_none(*pud));
	pmd = pmd_offset(pud, vmaddr);
	VM_BUG_ON(pmd_none(*pmd));
	/* large pmds cannot yet be handled */
	if (pmd_large(*pmd))
		return -EFAULT;
	/* Link gmap segment table entry location to page table. */
	rc = radix_tree_preload(GFP_KERNEL);
	if (rc)
		return rc;
	ptl = pmd_lock(mm, pmd);
	spin_lock(&gmap->guest_table_lock);
	if (*table == _SEGMENT_ENTRY_INVALID) {
		rc = radix_tree_insert(&gmap->host_to_guest,
				       vmaddr >> PMD_SHIFT, table);
		if (!rc)
			*table = pmd_val(*pmd);
	} else
		rc = 0;
	spin_unlock(&gmap->guest_table_lock);
	spin_unlock(ptl);
	radix_tree_preload_end();
	return rc;
}
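
/*
 * Note on the table walk above: each CRST table has 2048 entries, hence the
 * "& 0x7ff" masks, and the shifts by 53, 42, 31 and 20 select the
 * region-first, region-second, region-third and segment index of the guest
 * address respectively.
 */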

/**
 * gmap_fault - resolve a fault on a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @fault_flags: flags to pass down to handle_mm_fault()
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 */
int gmap_fault(struct gmap *gmap, unsigned long gaddr,
	       unsigned int fault_flags)
{
	unsigned long vmaddr;
	int rc;

	down_read(&gmap->mm->mmap_sem);
	vmaddr = __gmap_translate(gmap, gaddr);
	if (IS_ERR_VALUE(vmaddr)) {
		rc = vmaddr;
		goto out_up;
	}
	if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) {
		rc = -EFAULT;
		goto out_up;
	}
	rc = __gmap_link(gmap, gaddr, vmaddr);
out_up:
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);

static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct page *page = migration_entry_to_page(entry);

		if (PageAnon(page))
			dec_mm_counter(mm, MM_ANONPAGES);
		else
			dec_mm_counter(mm, MM_FILEPAGES);
	}
	free_swap_and_cache(entry);
}

/*
 * this function is assumed to be called with mmap_sem held
 */
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr, ptev, pgstev;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	pgste_t pgste;

	/* Find the vm address for the guest address */
	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	if (!vmaddr)
		return;
	vmaddr |= gaddr & ~PMD_MASK;
	/* Get pointer to the page table entry */
	ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
	if (unlikely(!ptep))
		return;
	pte = *ptep;
	if (!pte_swap(pte))
		goto out_pte;
	/* Zap unused and logically-zero pages */
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	ptev = pte_val(pte);
	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
		gmap_zap_swap_entry(pte_to_swp_entry(pte), gmap->mm);
		pte_clear(gmap->mm, vmaddr, ptep);
	}
	pgste_set_unlock(ptep, pgste);
out_pte:
	pte_unmap_unlock(ptep, ptl);
}
EXPORT_SYMBOL_GPL(__gmap_zap);

void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
{
	unsigned long gaddr, vmaddr, size;
	struct vm_area_struct *vma;

	down_read(&gmap->mm->mmap_sem);
	for (gaddr = from; gaddr < to;
	     gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
		/* Find the vm address for the guest address */
		vmaddr = (unsigned long)
			radix_tree_lookup(&gmap->guest_to_host,
					  gaddr >> PMD_SHIFT);
		if (!vmaddr)
			continue;
		vmaddr |= gaddr & ~PMD_MASK;
		/* Find vma in the parent mm */
		vma = find_vma(gmap->mm, vmaddr);
		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
		zap_page_range(vma, vmaddr, size, NULL);
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);
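
/*
 * The notifier machinery below lets users of the gmap (e.g. KVM) get a
 * callback when a pte that was marked with PGSTE_IN_BIT via
 * gmap_ipte_notify() is invalidated: gmap_do_ipte_notify() translates the
 * host pte location back to a guest address and invokes every registered
 * notifier block.
 */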

static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_add(&nb->list, &gmap_notifier_list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);

/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_del_init(&nb->list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);

/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
{
	unsigned long addr;
	spinlock_t *ptl;
	pte_t *ptep, entry;
	pgste_t pgste;
	int rc = 0;

	if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;
	down_read(&gmap->mm->mmap_sem);
	while (len) {
		/* Convert gmap address and connect the page tables */
		addr = __gmap_translate(gmap, gaddr);
		if (IS_ERR_VALUE(addr)) {
			rc = addr;
			break;
		}
		/* Get the page mapped */
		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
			rc = -EFAULT;
			break;
		}
		rc = __gmap_link(gmap, gaddr, addr);
		if (rc)
			break;
		/* Walk the process page table, lock and get pte pointer */
		ptep = get_locked_pte(gmap->mm, addr, &ptl);
		VM_BUG_ON(!ptep);
		/* Set notification bit in the pgste of the pte */
		entry = *ptep;
		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
			pgste = pgste_get_lock(ptep);
			pgste_val(pgste) |= PGSTE_IN_BIT;
			pgste_set_unlock(ptep, pgste);
			gaddr += PAGE_SIZE;
			len -= PAGE_SIZE;
		}
		pte_unmap_unlock(ptep, ptl);
	}
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);

/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
 */
void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
{
	unsigned long offset, gaddr;
	unsigned long *table;
	struct gmap_notifier *nb;
	struct gmap *gmap;

	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	offset = offset * (4096 / sizeof(pte_t));
	spin_lock(&gmap_notifier_lock);
	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		table = radix_tree_lookup(&gmap->host_to_guest,
					  vmaddr >> PMD_SHIFT);
		if (!table)
			continue;
		gaddr = __gmap_segment_gaddr(table) + offset;
		list_for_each_entry(nb, &gmap_notifier_list, list)
			nb->notifier_call(gmap, gaddr);
	}
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_do_ipte_notify);

static inline int page_table_with_pgste(struct page *page)
{
	return atomic_read(&page->_mapcount) == 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	if (!pgtable_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
	atomic_set(&page->_mapcount, 0);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}
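
/*
 * A pgste page table occupies a full 4K page: the lower 2K hold the 256 pte
 * entries, the upper 2K the matching pgste entries (hence the two
 * clear_table() calls above). Such pages are marked with a page->_mapcount
 * of 0, which is what page_table_with_pgste() tests for.
 */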

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	__free_page(page);
}

static inline unsigned long page_table_reset_pte(struct mm_struct *mm, pmd_t *pmd,
			unsigned long addr, unsigned long end, bool init_skey)
{
	pte_t *start_pte, *pte;
	spinlock_t *ptl;
	pgste_t pgste;

	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	do {
		pgste = pgste_get_lock(pte);
		pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
		if (init_skey) {
			unsigned long address;

			pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
					      PGSTE_GR_BIT | PGSTE_GC_BIT);

			/* skip invalid and not writable pages */
			if (pte_val(*pte) & _PAGE_INVALID ||
			    !(pte_val(*pte) & _PAGE_WRITE)) {
				pgste_set_unlock(pte, pgste);
				continue;
			}

			address = pte_val(*pte) & PAGE_MASK;
			page_set_storage_key(address, PAGE_DEFAULT_KEY, 1);
		}
		pgste_set_unlock(pte, pgste);
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(start_pte, ptl);

	return addr;
}

static inline unsigned long page_table_reset_pmd(struct mm_struct *mm, pud_t *pud,
			unsigned long addr, unsigned long end, bool init_skey)
{
	unsigned long next;
	pmd_t *pmd;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		next = page_table_reset_pte(mm, pmd, addr, next, init_skey);
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static inline unsigned long page_table_reset_pud(struct mm_struct *mm, pgd_t *pgd,
			unsigned long addr, unsigned long end, bool init_skey)
{
	unsigned long next;
	pud_t *pud;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = page_table_reset_pmd(mm, pud, addr, next, init_skey);
	} while (pud++, addr = next, addr != end);

	return addr;
}

void page_table_reset_pgste(struct mm_struct *mm, unsigned long start,
			    unsigned long end, bool init_skey)
{
	unsigned long addr, next;
	pgd_t *pgd;

	down_write(&mm->mmap_sem);
	if (init_skey && mm_use_skey(mm))
		goto out_up;
	addr = start;
	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = page_table_reset_pud(mm, pgd, addr, next, init_skey);
	} while (pgd++, addr = next, addr != end);
	if (init_skey)
		current->mm->context.use_skey = 1;
out_up:
	up_write(&mm->mmap_sem);
}
EXPORT_SYMBOL(page_table_reset_pgste);

int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned long key, bool nq)
{
	spinlock_t *ptl;
	pgste_t old, new;
	pte_t *ptep;

	down_read(&mm->mmap_sem);
retry:
	ptep = get_locked_pte(current->mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}
	if (!(pte_val(*ptep) & _PAGE_INVALID) &&
	    (pte_val(*ptep) & _PAGE_PROTECT)) {
		pte_unmap_unlock(ptep, ptl);
		if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		goto retry;
	}

	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long address, bits, skey;

		address = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(address);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(address, skey, !nq);
		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_UC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	up_read(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

#else /* CONFIG_PGSTE */

static inline int page_table_with_pgste(struct page *page)
{
	return 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
	return NULL;
}

void page_table_reset_pgste(struct mm_struct *mm, unsigned long start,
			    unsigned long end, bool init_skey)
{
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			       unsigned long vmaddr)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		if (!pgtable_page_ctor(page)) {
			__free_page(page);
			return NULL;
		}
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}
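
/*
 * For mms without pgstes the low bits of page->_mapcount act as an
 * allocation bitmap for the 2K (or 1K on 31 bit) fragments of the page; the
 * next nibble marks fragments that are still on their way through an RCU
 * grace period (see page_table_free_rcu() and __tlb_remove_table() below).
 */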

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page))
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
			 unsigned long vmaddr)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page)) {
		gmap_unlink(mm, table, vmaddr);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

static void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->mm->context.flush_mm = 1;
	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm_lazy(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_flush_mmu(tlb);
}
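
/*
 * tlb_remove_table() batches table fragments in an mmu_table_batch that is
 * only freed after an RCU grace period (tlb_remove_table_rcu); if no batch
 * page can be allocated it falls back to tlb_remove_table_one(), which uses
 * an IPI broadcast to synchronize with page table walkers that run with
 * interrupts disabled before freeing immediately.
 */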

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
		follow_page(vma, addr, FOLL_SPLIT);
}

static inline void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
	}
	mm->def_flags |= VM_NOHUGEPAGE;
}
#else
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
				struct mm_struct *mm, pud_t *pud,
				unsigned long addr, unsigned long end)
{
	unsigned long next, *table, *new;
	struct page *page;
	spinlock_t *ptl;
	pmd_t *pmd;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
again:
		if (pmd_none_or_clear_bad(pmd))
			continue;
		table = (unsigned long *) pmd_deref(*pmd);
		page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
		if (page_table_with_pgste(page))
			continue;
		/* Allocate new page table with pgstes */
		new = page_table_alloc_pgste(mm);
		if (!new)
			return -ENOMEM;

		ptl = pmd_lock(mm, pmd);
		if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
			/* Nuke pmd entry pointing to the "short" page table */
			pmdp_flush_lazy(mm, addr, pmd);
			pmd_clear(pmd);

			/* Copy ptes from old table to new table */
			memcpy(new, table, PAGE_SIZE/2);
			clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
			/* Establish new table */
			pmd_populate(mm, pmd, (pte_t *) new);
			/* Free old table with rcu, there might be a walker! */
			page_table_free_rcu(tlb, table, addr);
			new = NULL;
		}
		spin_unlock(ptl);
		if (new) {
			page_table_free_pgste(new);
			goto again;
		}
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
				struct mm_struct *mm, pgd_t *pgd,
				unsigned long addr, unsigned long end)
{
	unsigned long next;
	pud_t *pud;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pud++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
					unsigned long addr, unsigned long end)
{
	unsigned long next;
	pgd_t *pgd;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pgd++, addr = next, addr != end);

	return 0;
}
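
/*
 * The page_table_realloc*() walk above converts every existing 2K page
 * table of the process into a full 4K pgste page table, so that
 * s390_enable_sie() below can mark the mm as having pgstes before it is
 * used for a guest.
 */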

/*
 * switch on pgstes for its userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	struct mmu_gather tlb;

	/* Do we have pgstes? if yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	down_write(&mm->mmap_sem);
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	/* Reallocate the page tables with pgstes */
	tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
	if (!page_table_realloc(&tlb, mm, 0, TASK_SIZE))
		mm->context.has_pgste = 1;
	tlb_finish_mmu(&tlb, 0, TASK_SIZE);
	up_write(&mm->mmap_sem);
	return mm->context.has_pgste ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

/*
 * Enable storage key handling from now on and initialize the storage
 * keys with the default key.
 */
void s390_enable_skey(void)
{
	page_table_reset_pgste(current->mm, 0, TASK_SIZE, true);
}
EXPORT_SYMBOL_GPL(s390_enable_skey);

/*
 * Test and reset if a guest page is dirty
 */
bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *gmap)
{
	pte_t *pte;
	spinlock_t *ptl;
	bool dirty = false;

	pte = get_locked_pte(gmap->mm, address, &ptl);
	if (unlikely(!pte))
		return false;

	if (ptep_test_and_clear_user_dirty(gmap->mm, address, pte))
		dirty = true;

	spin_unlock(ptl);
	return dirty;
}
EXPORT_SYMBOL_GPL(gmap_test_and_clear_dirty);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/* No need to flush TLB
	 * On s390 reference bits are in storage key and never in TLB */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	entry = pmd_mkyoung(entry);
	if (dirty)
		entry = pmd_mkdirty(entry);
	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}

static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}

void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_INVALID;
	ptep++;
	pte_val(*ptep) = _PAGE_INVALID;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */