/*
 * Lockless get_user_pages_fast for powerpc
 *
 * Copyright (C) 2008 Nick Piggin
 * Copyright (C) 2008 Novell Inc.
 */
#undef DEBUG

#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/vmstat.h>
#include <linux/pagemap.h>
#include <linux/rwsem.h>
#include <asm/pgtable.h>

#ifdef __HAVE_ARCH_PTE_SPECIAL

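/*
 * Lockless GUP cannot look at the VMA, so it relies on pte_special()
 * (_PAGE_SPECIAL) to recognize, from the pte alone, mappings whose pages
 * must not be refcounted. That is why everything here is guarded by
 * __HAVE_ARCH_PTE_SPECIAL.
 */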
/*
 * The performance critical leaf functions are made noinline otherwise gcc
 * inlines everything into a single function which results in too much
 * register pressure.
 */
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long mask, result;
	pte_t *ptep;

	result = _PAGE_PRESENT|_PAGE_USER;
	if (write)
		result |= _PAGE_RW;
	mask = result | _PAGE_SPECIAL;

	ptep = pte_offset_kernel(&pmd, addr);
	do {
		pte_t pte = *ptep;
		struct page *page;

		if ((pte_val(pte) & mask) != result)
			return 0;
		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
		page = pte_page(pte);
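		/*
		 * Speculatively take a reference: this succeeds only if the
		 * page's refcount was already non-zero. Re-checking the pte
		 * below catches the window where the page was freed or the
		 * mapping changed between loading the pte and taking the
		 * reference.
		 */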
		if (!page_cache_get_speculative(page))
			return 0;
		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
			put_page(page);
			return 0;
		}
		pages[*nr] = page;
		(*nr)++;

	} while (ptep++, addr += PAGE_SIZE, addr != end);

	return 1;
}

#ifdef CONFIG_HUGETLB_PAGE
static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
				 unsigned long *addr, unsigned long end,
				 int write, struct page **pages, int *nr)
{
	unsigned long mask;
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
	if (pte_end < end)
		end = pte_end;

	pte = *ptep;
	mask = _PAGE_PRESENT|_PAGE_USER;
	if (write)
		mask |= _PAGE_RW;
	if ((pte_val(pte) & mask) != mask)
		return 0;
	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);
	page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (*addr += PAGE_SIZE, *addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}
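	/*
	 * All references for a compound hugepage live on the head page, so
	 * the loop above only counts pages; the single speculative add then
	 * takes all "refs" references at once.
	 */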
	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/*
		 * The pte changed under us: undo the nr accounting and drop
		 * the references we took on the head page before failing.
		 */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	return 1;
}
#endif /* CONFIG_HUGETLB_PAGE */

static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
		int write, struct page **pages, int *nr)
{
	unsigned long next;
	pmd_t *pmdp;

	pmdp = pmd_offset(&pud, addr);
	do {
		pmd_t pmd = *pmdp;

		next = pmd_addr_end(addr, end);
		if (pmd_none(pmd))
			return 0;
		if (!gup_pte_range(pmd, addr, next, write, pages, nr))
			return 0;
	} while (pmdp++, addr = next, addr != end);

	return 1;
}

static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
		int write, struct page **pages, int *nr)
{
	unsigned long next;
	pud_t *pudp;

	pudp = pud_offset(&pgd, addr);
	do {
		pud_t pud = *pudp;

		next = pud_addr_end(addr, end);
		if (pud_none(pud))
			return 0;
		if (!gup_pmd_range(pud, addr, next, write, pages, nr))
			return 0;
	} while (pudp++, addr = next, addr != end);

	return 1;
}

int get_user_pages_fast(unsigned long start, int nr_pages, int write,
			struct page **pages)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr, len, end;
	unsigned long next;
	pgd_t *pgdp;
	int nr = 0;
#ifdef CONFIG_PPC64
	unsigned int shift;
	int psize;
#endif

	pr_debug("%s(%lx,%x,%s)\n", __func__, start, nr_pages,
		 write ? "write" : "read");

	start &= PAGE_MASK;
	addr = start;
	len = (unsigned long) nr_pages << PAGE_SHIFT;
	end = start + len;

	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
					start, len)))
		goto slow_irqon;

	pr_debug(" aligned: %lx .. %lx\n", start, end);

#ifdef CONFIG_HUGETLB_PAGE
	/*
	 * We bail out on slice boundary crossing when hugetlb is enabled
	 * so that we don't have to deal with two different page table
	 * formats in one walk.
	 */
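	/*
	 * On 64-bit powerpc the address space is carved into slices, each
	 * configured with a single page size; staying inside one slice
	 * guarantees a uniform page-table format for the whole range.
	 */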
	if (addr < SLICE_LOW_TOP) {
		if (end > SLICE_LOW_TOP)
			goto slow_irqon;

		if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
			     GET_LOW_SLICE_INDEX(end - 1)))
			goto slow_irqon;
	} else {
		if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
			     GET_HIGH_SLICE_INDEX(end - 1)))
			goto slow_irqon;
	}
#endif /* CONFIG_HUGETLB_PAGE */

	/*
	 * XXX: batch / limit 'nr', to avoid large irq off latency
	 * needs some instrumenting to determine the common sizes used by
	 * important workloads (e.g. DB2), and whether limiting the batch
	 * size will decrease performance.
	 *
	 * It seems like we're in the clear for the moment. Direct-IO is
	 * the main guy that batches up lots of get_user_pages, and even
	 * they are limited to 64-at-a-time which is not so many.
	 */
	/*
	 * This doesn't prevent pagetable teardown, but does prevent
	 * the pagetables from being freed on powerpc.
	 *
	 * So long as we atomically load page table pointers versus teardown,
	 * we can follow the address down to the page and take a ref on it.
	 */
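	/*
	 * (Powerpc frees page-table pages through an RCU-style deferred
	 * batch, and a grace period cannot complete while this CPU runs
	 * with interrupts off, so the tables we walk stay allocated.)
	 */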
	local_irq_disable();

#ifdef CONFIG_PPC64
	/*
	 * These bits are related to the hugetlbfs implementation and only
	 * exist on 64-bit for now.
	 */
	psize = get_slice_psize(mm, addr);
	shift = mmu_psize_defs[psize].shift;
#endif /* CONFIG_PPC64 */
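	/*
	 * Within one slice the entire range uses a single page size, so the
	 * walk strategy is picked once up front: hugepage ptes via
	 * huge_pte_offset(), otherwise the normal multi-level walk.
	 */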
#ifdef CONFIG_HUGETLB_PAGE
	if (unlikely(mmu_huge_psizes[psize])) {
		pte_t *ptep;
		unsigned long a = addr;
		unsigned long sz = ((1UL) << shift);
		struct hstate *hstate = size_to_hstate(sz);

		BUG_ON(!hstate);
		/*
		 * XXX: could be optimized to avoid hstate
		 * lookup entirely (just use shift)
		 */

		do {
			VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
			ptep = huge_pte_offset(mm, a);
			pr_debug(" %016lx: huge ptep %p\n", a, ptep);
			if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write,
						   pages, &nr))
				goto slow;
		} while (a != end);
	} else
#endif /* CONFIG_HUGETLB_PAGE */
	{
		pgdp = pgd_offset(mm, addr);
		do {
			pgd_t pgd = *pgdp;

#ifdef CONFIG_PPC64
			VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
#endif
			pr_debug(" %016lx: normal pgd %p\n", addr,
				 (void *)pgd_val(pgd));
			next = pgd_addr_end(addr, end);
			if (pgd_none(pgd))
				goto slow;
			if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
				goto slow;
		} while (pgdp++, addr = next, addr != end);
	}
	local_irq_enable();

	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
	return nr;

	{
		int ret;

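		/*
		 * "slow" is reached with interrupts still disabled and
		 * re-enables them; "slow_irqon" handles failures taken
		 * before interrupts were turned off.
		 */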
slow:
		local_irq_enable();
slow_irqon:
		pr_debug(" slow path ! nr = %d\n", nr);

		/* Try to get the remaining pages with get_user_pages */
		start += nr << PAGE_SHIFT;
		pages += nr;

		down_read(&mm->mmap_sem);
		ret = get_user_pages(current, mm, start,
				     (end - start) >> PAGE_SHIFT,
				     write, 0, pages, NULL);
		up_read(&mm->mmap_sem);

		/*
		 * Have to be a bit careful with return values: if the fast
		 * path already pinned some pages, report those even when
		 * the slow path fails outright.
		 */
		if (nr > 0) {
			if (ret < 0)
				ret = nr;
			else
				ret += nr;
		}

		return ret;
	}
}

#endif /* __HAVE_ARCH_PTE_SPECIAL */
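
/*
 * Usage sketch (illustrative only, not part of this file): a typical
 * caller pins user pages for I/O and drops each reference afterwards.
 * The buffer address "user_buf" and the count of 16 are hypothetical.
 *
 *	struct page *pages[16];
 *	int i, nr;
 *
 *	nr = get_user_pages_fast(user_buf, 16, 1, pages);
 *	if (nr <= 0)
 *		return nr ? nr : -EFAULT;
 *	// nr may be fewer than requested; only pages[0..nr-1] are pinned
 *	for (i = 0; i < nr; i++)
 *		put_page(pages[i]);
 */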