/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs, which set REP_GOOD. In addition, on CPUs
 * which have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs
 * are changed to a jmp to memcpy_erms, which does the REP; MOVSB mem copy.
 */
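
/*
 * Illustrative sketch (not literal patched bytes) of how the memcpy entry
 * behaves for each CPU class, assuming the usual in-place patching done by
 * the alternatives machinery:
 *
 *   neither REP_GOOD nor ERMS:  jmp memcpy_orig    (unrolled copy below)
 *   REP_GOOD:                   NOPs, fall through (rep movsq + rep movsb)
 *   ERMS:                       jmp memcpy_erms    (single rep movsb)
 */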

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */
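/*
 * These are simply the System V AMD64 argument/return registers for the
 * C prototype void *memcpy(void *dest, const void *src, size_t count):
 * rdi = dest, rsi = src, rdx = count, rax = return value.
 */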
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)
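
/*
 * Worked example (illustrative) for the rep-string path above:
 * count = 29 (0x1d) gives rcx = 29 >> 3 = 3, so rep movsq moves
 * 3 * 8 = 24 bytes, and edx = 29 & 7 = 5, so rep movsb moves the
 * remaining 5 bytes.
 */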

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
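/*
 * ERMS ("Enhanced REP MOVSB/STOSB") is enumerated by
 * CPUID.(EAX=07H, ECX=0):EBX bit 9; X86_FEATURE_ERMS used in the
 * alternatives patching above corresponds to that bit.
 */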
ENTRY(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * We check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
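	/*
	 * One reading of the check above (a heuristic, not a guarantee):
	 * only the low address bytes are compared, a cheap test for the
	 * source trailing the destination closely enough that the forward
	 * loop's loads could falsely alias recent stores (e.g. 4K
	 * aliasing); the backward path avoids that access pattern.
	 */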
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail
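	/*
	 * Worked example (illustrative) of the counting above: for
	 * count = 0x50 (80), the leading subq leaves rdx = 0x30. The loop
	 * body runs twice (rdx goes to 0x10, then to -0x10 with CF set, so
	 * jae falls through; movq/leaq do not touch the flags), copying 64
	 * bytes. addl then restores edx to 0x10 and .Lhandle_tail moves the
	 * remaining 16 bytes.
	 */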

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations execute in one cycle, so pad with NOPs
	 * within the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
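	/*
	 * At this point rsi/rdi have been rewound by the remaining count,
	 * so they again point at the start of the region and .Lhandle_tail
	 * below can finish the uncopied head bytes with ordinary forward
	 * addressing.
	 */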
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
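	/*
	 * Worked example (illustrative) of the overlapping trick above:
	 * for rdx = 20, the first two qwords copy bytes 0-15 and the last
	 * two copy bytes 4-19; the 12-byte overlap is harmless because all
	 * four loads complete before any store is issued.
	 */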
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
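	/*
	 * Flag usage note: movzbl below does not modify EFLAGS, so the jz
	 * still tests the subl result above and is taken when the original
	 * count was exactly 1, skipping the two extra byte moves.
	 */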
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML
/*
 * memcpy_mcsafe - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
 */
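/*
 * Return convention, as implemented below: rax = 0 when the whole copy
 * completed, non-zero when a machine check was taken on one of the
 * annotated loads, in which case the exception table entries at the end
 * of this file redirect execution to .L_memcpy_mcsafe_fail.
 */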
ENTRY(memcpy_mcsafe)
	cmpl $8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb .L_no_whole_words

	/* Check for bad alignment of source */
	testl $7, %esi
	/* Already aligned */
	jz .L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
	movl %esi, %ecx
	andl $7, %ecx
	subl $8, %ecx
	negl %ecx
	subl %ecx, %edx
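	/*
	 * Worked example (illustrative): for a source address ending in 5,
	 * ecx = 5, then 5 - 8 = -3, and negl gives ecx = 3, i.e. three
	 * leading bytes are copied to reach 8-byte alignment; edx is
	 * reduced by the same three bytes.
	 */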
.L_copy_leading_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_leading_bytes

.L_8byte_aligned:
	/* Figure out how many whole cache lines (64 bytes) to copy */
	movl %edx, %ecx
	andl $63, %edx
	shrl $6, %ecx
	jz .L_no_whole_cache_lines
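	/*
	 * Worked example (illustrative): edx = 200 gives ecx = 200 >> 6 = 3
	 * whole cache lines (192 bytes) for the loop below, and
	 * edx = 200 & 63 = 8 bytes left over for the trailing-word path.
	 */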

	/* Loop copying whole cache lines */
.L_cache_w0: movq (%rsi), %r8
.L_cache_w1: movq 1*8(%rsi), %r9
.L_cache_w2: movq 2*8(%rsi), %r10
.L_cache_w3: movq 3*8(%rsi), %r11
	movq %r8, (%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
.L_cache_w4: movq 4*8(%rsi), %r8
.L_cache_w5: movq 5*8(%rsi), %r9
.L_cache_w6: movq 6*8(%rsi), %r10
.L_cache_w7: movq 7*8(%rsi), %r11
	movq %r8, 4*8(%rdi)
	movq %r9, 5*8(%rdi)
	movq %r10, 6*8(%rdi)
	movq %r11, 7*8(%rdi)
	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi
	decl %ecx
	jnz .L_cache_w0
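	/*
	 * Every load above has its own .L_cache_wN label so that the
	 * exception table at the bottom of the file can send a machine
	 * check taken on any individual read to the failure path.
	 */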

	/* Are there any trailing 8-byte words? */
.L_no_whole_cache_lines:
	movl %edx, %ecx
	andl $7, %edx
	shrl $3, %ecx
	jz .L_no_whole_words

	/* Copy trailing words */
.L_copy_trailing_words:
	movq (%rsi), %r8
	mov %r8, (%rdi)
	leaq 8(%rsi), %rsi
	leaq 8(%rdi), %rdi
	decl %ecx
	jnz .L_copy_trailing_words

	/* Any trailing bytes? */
.L_no_whole_words:
	andl %edx, %edx
	jz .L_done_memcpy_trap

	/* Copy trailing bytes */
	movl %edx, %ecx
.L_copy_trailing_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorq %rax, %rax
	ret
ENDPROC(memcpy_mcsafe)

	.section .fixup, "ax"
	/* Return non-zero for any failure */
.L_memcpy_mcsafe_fail:
	mov $1, %rax
	ret

	.previous

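	/*
	 * Only the load instructions are annotated here, in line with the
	 * header comment: writes to the target are posted and do not
	 * generate machine checks. A fault on any annotated load lands in
	 * .L_memcpy_mcsafe_fail above.
	 */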
	_ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
#endif