/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs, i.e. those which set REP_GOOD. On CPUs
 * which also have the enhanced REP MOVSB/STOSB feature (ERMS), those
 * NOPs are changed to a jmp to memcpy_erms, which does the copy with
 * a single REP; MOVSB.
 */

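/*
 * memcpy is declared weak so that an instrumented implementation
 * (for example, KASAN's) can override this one at link time.
 */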
.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

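	/*
	 * Fast path: copy count/8 qwords with REP MOVSQ, then the
	 * remaining count%8 bytes with REP MOVSB. %rax must still hold
	 * the original destination on return.
	 */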
	movq	%rdi, %rax
	movq	%rdx, %rcx
	shrq	$3, %rcx
	andl	$7, %edx
	rep movsq
	movl	%edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
	movq	%rdi, %rax
	movq	%rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)

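/*
 * memcpy_orig - used when neither REP_GOOD nor ERMS is available.
 * Copies in 32-byte blocks, forward or backward depending on the
 * aliasing check below, then finishes the 0..31-byte tail with
 * overlapping loads and stores.
 */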
ENTRY(memcpy_orig)
	movq	%rdi, %rax

	cmpq	$0x20, %rdx
	jb	.Lhandle_tail

	/*
	 * We check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
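	/*
	 * Comparing only the low bytes of the two pointers is a cheap
	 * aliasing heuristic: if the source sits just below the
	 * destination, a forward copy would make loads depend on
	 * recent stores to nearby addresses, so copy backward instead.
	 */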
	cmp	%dil, %sil
	jl	.Lcopy_backward
	subq	$0x20, %rdx
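	/*
	 * The count is biased down by 0x20 so that the subq at the top
	 * of the loop sets CF once fewer than 32 bytes remain; movq and
	 * leaq leave the flags untouched, so the jae at the bottom of
	 * the loop still sees that result.
	 */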
.Lcopy_forward_loop:
	subq	$0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq	0*8(%rsi), %r8
	movq	1*8(%rsi), %r9
	movq	2*8(%rsi), %r10
	movq	3*8(%rsi), %r11
	leaq	4*8(%rsi), %rsi

	movq	%r8, 0*8(%rdi)
	movq	%r9, 1*8(%rdi)
	movq	%r10, 2*8(%rdi)
	movq	%r11, 3*8(%rdi)
	leaq	4*8(%rdi), %rdi
	jae	.Lcopy_forward_loop
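	/* Undo the 0x20 bias: %edx now holds the 0..31-byte tail. */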
	addl	$0x20, %edx
	jmp	.Lhandle_tail

.Lcopy_backward:
	/*
	 * Point the source and destination at the tail, then copy
	 * backward from there.
	 */
	addq	%rdx, %rsi
	addq	%rdx, %rdi
	subq	$0x20, %rdx
	/*
	 * At most 3 ALU operations can issue in one cycle, so pad with
	 * NOPs to keep the loop head within the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq	$0x20, %rdx
	movq	-1*8(%rsi), %r8
	movq	-2*8(%rsi), %r9
	movq	-3*8(%rsi), %r10
	movq	-4*8(%rsi), %r11
	leaq	-4*8(%rsi), %rsi
	movq	%r8, -1*8(%rdi)
	movq	%r9, -2*8(%rdi)
	movq	%r10, -3*8(%rdi)
	movq	%r11, -4*8(%rdi)
	leaq	-4*8(%rdi), %rdi
	jae	.Lcopy_backward_loop

	/*
	 * Undo the 0x20 bias and point the source and destination back
	 * at the head, where the remaining %edx bytes live.
	 */
	addl	$0x20, %edx
	subq	%rdx, %rsi
	subq	%rdx, %rdi
.Lhandle_tail:
	cmpl	$16, %edx
	jb	.Lless_16bytes

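	/*
	 * Each tail case below loads from both ends of the remaining
	 * region and lets the stores overlap, so every length in its
	 * range is covered without a byte loop.
	 */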
	/*
	 * Move 16 to 31 bytes of data.
	 */
	movq	0*8(%rsi), %r8
	movq	1*8(%rsi), %r9
	movq	-2*8(%rsi, %rdx), %r10
	movq	-1*8(%rsi, %rdx), %r11
	movq	%r8, 0*8(%rdi)
	movq	%r9, 1*8(%rdi)
	movq	%r10, -2*8(%rdi, %rdx)
	movq	%r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl	$8, %edx
	jb	.Lless_8bytes
	/*
	 * Move 8 to 15 bytes of data.
	 */
	movq	0*8(%rsi), %r8
	movq	-1*8(%rsi, %rdx), %r9
	movq	%r8, 0*8(%rdi)
	movq	%r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl	$4, %edx
	jb	.Lless_3bytes

	/*
	 * Move 4 to 7 bytes of data.
	 */
	movl	(%rsi), %ecx
	movl	-4(%rsi, %rdx), %r8d
	movl	%ecx, (%rdi)
	movl	%r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl	$1, %edx
	jb	.Lend
	/*
	 * Move 1 to 3 bytes of data.
	 */
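	/*
	 * movzbl does not touch the flags, so the jz below still tests
	 * the subl above: it is taken when exactly one byte remains.
	 */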
	movzbl	(%rsi), %ecx
	jz	.Lstore_1byte
	movzbq	1(%rsi), %r8
	movzbq	(%rsi, %rdx), %r9
	movb	%r8b, 1(%rdi)
	movb	%r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb	%cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML
/*
 * memcpy_mcsafe - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source
 * addresses. Writes to the target are posted and don't generate
 * machine checks.
 */
ENTRY(memcpy_mcsafe)
	cmpl	$8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb	.L_no_whole_words

	/* Check for bad alignment of source */
	testl	$7, %esi
	/* Already aligned */
	jz	.L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
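	/* %ecx = 8 - (src & 7): number of leading bytes to copy. */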
	movl	%esi, %ecx
	andl	$7, %ecx
	subl	$8, %ecx
	negl	%ecx
	subl	%ecx, %edx
.L_copy_leading_bytes:
	movb	(%rsi), %al
	movb	%al, (%rdi)
	incq	%rsi
	incq	%rdi
	decl	%ecx
	jnz	.L_copy_leading_bytes

.L_8byte_aligned:
	/* Figure out how many whole cache lines (64 bytes) to copy */
	movl	%edx, %ecx
	andl	$63, %edx
	shrl	$6, %ecx
	jz	.L_no_whole_cache_lines

	/* Loop copying whole cache lines */
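	/*
	 * Each load below carries its own local label so that the
	 * exception table at the end of the file can send a machine
	 * check on that load to the fixup handler.
	 */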
.L_cache_w0:	movq	(%rsi), %r8
.L_cache_w1:	movq	1*8(%rsi), %r9
.L_cache_w2:	movq	2*8(%rsi), %r10
.L_cache_w3:	movq	3*8(%rsi), %r11
	movq	%r8, (%rdi)
	movq	%r9, 1*8(%rdi)
	movq	%r10, 2*8(%rdi)
	movq	%r11, 3*8(%rdi)
.L_cache_w4:	movq	4*8(%rsi), %r8
.L_cache_w5:	movq	5*8(%rsi), %r9
.L_cache_w6:	movq	6*8(%rsi), %r10
.L_cache_w7:	movq	7*8(%rsi), %r11
	movq	%r8, 4*8(%rdi)
	movq	%r9, 5*8(%rdi)
	movq	%r10, 6*8(%rdi)
	movq	%r11, 7*8(%rdi)
	leaq	64(%rsi), %rsi
	leaq	64(%rdi), %rdi
	decl	%ecx
	jnz	.L_cache_w0

	/* Are there any trailing 8-byte words? */
.L_no_whole_cache_lines:
	movl	%edx, %ecx
	andl	$7, %edx
	shrl	$3, %ecx
	jz	.L_no_whole_words

	/* Copy trailing words */
.L_copy_trailing_words:
	movq	(%rsi), %r8
	movq	%r8, (%rdi)
	leaq	8(%rsi), %rsi
	leaq	8(%rdi), %rdi
	decl	%ecx
	jnz	.L_copy_trailing_words

	/* Any trailing bytes? */
.L_no_whole_words:
	andl	%edx, %edx
	jz	.L_done_memcpy_trap

	/* Copy trailing bytes */
	movl	%edx, %ecx
.L_copy_trailing_bytes:
	movb	(%rsi), %al
	movb	%al, (%rdi)
	incq	%rsi
	incq	%rdi
	decl	%ecx
	jnz	.L_copy_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorq	%rax, %rax
	ret
ENDPROC(memcpy_mcsafe)

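/*
 * The fixup and exception table below pair every faultable load in
 * memcpy_mcsafe with a handler that returns -EFAULT.
 */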
	.section .fixup, "ax"
	/* Return -EFAULT for any failure */
.L_memcpy_mcsafe_fail:
	mov	$-EFAULT, %rax
	ret

	.previous

	_ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
#endif