/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs, which set REP_GOOD. On CPUs that have the
 * enhanced REP MOVSB/STOSB feature (ERMS), those NOPs are instead
 * changed to a jmp to memcpy_erms, which does the copy with REP MOVSB.
 */

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

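	/*
	 * Default (REP_GOOD) path: copy count/8 quadwords with REP MOVSQ,
	 * then the remaining count%8 bytes with REP MOVSB. %rax is set up
	 * first so the original destination is returned.
	 */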
	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
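	/* %rcx holds the full byte count; on ERMS hardware a single REP MOVSB handles it efficiently. */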
	rep movsb
	ret
ENDPROC(memcpy_erms)
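
/*
 * memcpy_orig - unrolled fallback for CPUs without REP_GOOD or ERMS.
 * Copies in 32-byte blocks, choosing a forward or backward direction
 * to avoid memory false dependences, then finishes with size-specific
 * tail code.
 */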
ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
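	/*
	 * Forward copy. The count is pre-biased by 0x20 so that the subq
	 * at the top of the loop sets CF once no full 32-byte block is
	 * left; jae keeps looping only while another block remains, and
	 * the addl after the loop restores the 0..31 byte remainder.
	 */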
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
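	/* Undo the pre-bias: the addl leaves the remaining 0..31 bytes in %edx. */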
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
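	/*
	 * The count was pre-biased by 0x20 above, just as in the forward
	 * copy; the in-loop subq/jae pair below ends the loop once no
	 * full 32-byte block remains.
	 */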
	/*
	 * At most 3 ALU operations can issue per cycle, so the alignment
	 * NOPs are kept within the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes
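	/*
	 * Each tail case below loads from both the start and the end of
	 * the remaining range before doing any stores, so the loads and
	 * stores may overlap; this handles every length in the bracket
	 * without a loop.
	 */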

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
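	/*
	 * %edx now holds count - 1: the jb above caught a zero count, and
	 * the jz below (movzbl does not change the flags) catches a count
	 * of exactly one byte.
	 */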
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML
/*
 * memcpy_mcsafe - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
 * Returns zero on success and a non-zero value if a machine check was
 * taken while reading the source.
 */
ENTRY(memcpy_mcsafe)
	cmpl $8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb .L_no_whole_words

	/* Check for bad alignment of source */
	testl $7, %esi
	/* Already aligned */
	jz .L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
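	/* %ecx = 8 - (src & 7): the number of leading bytes needed to reach alignment; shrink the main count to match. */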
	movl %esi, %ecx
	andl $7, %ecx
	subl $8, %ecx
	negl %ecx
	subl %ecx, %edx
.L_copy_leading_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_leading_bytes

.L_8byte_aligned:
	/* Figure out how many whole cache lines (64 bytes) to copy */
	movl %edx, %ecx
	andl $63, %edx
	shrl $6, %ecx
	jz .L_no_whole_cache_lines

	/* Loop copying whole cache lines */
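	/*
	 * Each source load below carries its own label so that the
	 * _ASM_EXTABLE_FAULT entries at the bottom of this file can
	 * redirect a machine check on that load to the failure path.
	 */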
.L_cache_w0: movq (%rsi), %r8
.L_cache_w1: movq 1*8(%rsi), %r9
.L_cache_w2: movq 2*8(%rsi), %r10
.L_cache_w3: movq 3*8(%rsi), %r11
	movq %r8, (%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
.L_cache_w4: movq 4*8(%rsi), %r8
.L_cache_w5: movq 5*8(%rsi), %r9
.L_cache_w6: movq 6*8(%rsi), %r10
.L_cache_w7: movq 7*8(%rsi), %r11
	movq %r8, 4*8(%rdi)
	movq %r9, 5*8(%rdi)
	movq %r10, 6*8(%rdi)
	movq %r11, 7*8(%rdi)
	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi
	decl %ecx
	jnz .L_cache_w0

	/* Are there any trailing 8-byte words? */
.L_no_whole_cache_lines:
	movl %edx, %ecx
	andl $7, %edx
	shrl $3, %ecx
	jz .L_no_whole_words

	/* Copy trailing words */
.L_copy_trailing_words:
	movq (%rsi), %r8
	mov %r8, (%rdi)
	leaq 8(%rsi), %rsi
	leaq 8(%rdi), %rdi
	decl %ecx
	jnz .L_copy_trailing_words

	/* Any trailing bytes? */
.L_no_whole_words:
	andl %edx, %edx
	jz .L_done_memcpy_trap

	/* Copy trailing bytes */
	movl %edx, %ecx
.L_copy_trailing_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorq %rax, %rax
	ret
ENDPROC(memcpy_mcsafe)

	.section .fixup, "ax"
	/* Return a non-zero value for any failure */
.L_memcpy_mcsafe_fail:
	mov $1, %rax
	ret

	.previous

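	/*
	 * Exception table: a machine check taken on any of the labeled
	 * source loads above lands in .L_memcpy_mcsafe_fail.
	 */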
	_ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
#endif