/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs which set REP_GOOD. In addition, on CPUs
 * which have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs
 * are changed to a jmp to memcpy_erms, which does the copy with
 * REP; MOVSB.
 */

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
        ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
                      "jmp memcpy_erms", X86_FEATURE_ERMS
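        /*
         * The ALTERNATIVE_2 above resolves to one of three paths at boot:
         * by default it is a jmp to memcpy_orig; on REP_GOOD CPUs the jmp
         * is patched to NOPs so we fall through to the rep movsq body
         * below; on ERMS CPUs it becomes a jmp to memcpy_erms.
         *
         * The fall-through body copies count/8 qwords with rep movsq and
         * the remaining count%8 bytes with rep movsb, returning the
         * original destination in rax.
         */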

        movq %rdi, %rax
        movq %rdx, %rcx
        shrq $3, %rcx
        andl $7, %edx
        rep movsq
        movl %edx, %ecx
        rep movsb
        ret
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
        movq %rdi, %rax
        movq %rdx, %rcx
        rep movsb
        ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
        movq %rdi, %rax

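        /* Copies shorter than 32 bytes are handled entirely by the tail code. */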
        cmpq $0x20, %rdx
        jb .Lhandle_tail

        /*
         * Check whether a memory false dependence could occur,
         * then jump to the corresponding copy mode.
         */
        cmp %dil, %sil
        jl .Lcopy_backward
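        /*
         * Only the low bytes of src and dst are compared above: a cheap
         * heuristic meant to catch the case where the source lies just
         * below the destination, so that a forward copy would keep
         * reloading data that was only just stored (a store-forwarding
         * false dependence). In that case we copy backward instead.
         */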
        subq $0x20, %rdx
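        /*
         * %rdx is kept biased by -0x20: the subq at the top of the loop
         * both counts down and sets the flags (the movq/leaq in between
         * do not touch them), so jae keeps looping while at least one
         * more 32-byte block remains. The addl $0x20 after the loop
         * restores the true tail length (0..31) for .Lhandle_tail.
         */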
.Lcopy_forward_loop:
        subq $0x20, %rdx

        /*
         * Move in blocks of 4x8 bytes:
         */
        movq 0*8(%rsi), %r8
        movq 1*8(%rsi), %r9
        movq 2*8(%rsi), %r10
        movq 3*8(%rsi), %r11
        leaq 4*8(%rsi), %rsi

        movq %r8, 0*8(%rdi)
        movq %r9, 1*8(%rdi)
        movq %r10, 2*8(%rdi)
        movq %r11, 3*8(%rdi)
        leaq 4*8(%rdi), %rdi
        jae .Lcopy_forward_loop
        addl $0x20, %edx
        jmp .Lhandle_tail

.Lcopy_backward:
        /*
         * Calculate copy position to tail.
         */
        addq %rdx, %rsi
        addq %rdx, %rdi
        subq $0x20, %rdx
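        /*
         * Same biased-counter scheme as the forward loop, but %rsi/%rdi
         * now point past the end of their buffers and the copy runs from
         * the tail toward the head using negative offsets.
         */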
        /*
         * At most 3 ALU operations in one cycle,
         * so append NOPs in the same 16-byte chunk.
         */
        .p2align 4
.Lcopy_backward_loop:
        subq $0x20, %rdx
        movq -1*8(%rsi), %r8
        movq -2*8(%rsi), %r9
        movq -3*8(%rsi), %r10
        movq -4*8(%rsi), %r11
        leaq -4*8(%rsi), %rsi
        movq %r8, -1*8(%rdi)
        movq %r9, -2*8(%rdi)
        movq %r10, -3*8(%rdi)
        movq %r11, -4*8(%rdi)
        leaq -4*8(%rdi), %rdi
        jae .Lcopy_backward_loop

        /*
         * Calculate copy position to head.
         */
        addl $0x20, %edx
        subq %rdx, %rsi
        subq %rdx, %rdi
.Lhandle_tail:
        cmpl $16, %edx
        jb .Lless_16bytes

        /*
         * Move data from 16 bytes to 31 bytes.
         */
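        /*
         * The 16..31 byte tail is copied as two possibly-overlapping
         * 16-byte pieces: the first 16 bytes of the range and the last
         * 16 bytes (addressed relative to %rdx). All loads complete
         * before any store, so the overlap is harmless. The 8..15 and
         * 4..7 byte cases below use the same trick with smaller pieces.
         */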
        movq 0*8(%rsi), %r8
        movq 1*8(%rsi), %r9
        movq -2*8(%rsi, %rdx), %r10
        movq -1*8(%rsi, %rdx), %r11
        movq %r8, 0*8(%rdi)
        movq %r9, 1*8(%rdi)
        movq %r10, -2*8(%rdi, %rdx)
        movq %r11, -1*8(%rdi, %rdx)
        retq
        .p2align 4
.Lless_16bytes:
        cmpl $8, %edx
        jb .Lless_8bytes
        /*
         * Move data from 8 bytes to 15 bytes.
         */
        movq 0*8(%rsi), %r8
        movq -1*8(%rsi, %rdx), %r9
        movq %r8, 0*8(%rdi)
        movq %r9, -1*8(%rdi, %rdx)
        retq
        .p2align 4
.Lless_8bytes:
        cmpl $4, %edx
        jb .Lless_3bytes

        /*
         * Move data from 4 bytes to 7 bytes.
         */
        movl (%rsi), %ecx
        movl -4(%rsi, %rdx), %r8d
        movl %ecx, (%rdi)
        movl %r8d, -4(%rdi, %rdx)
        retq
        .p2align 4
.Lless_3bytes:
        subl $1, %edx
        jb .Lend
        /*
         * Move data from 1 byte to 3 bytes.
         */
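        /*
         * At this point %edx holds length-1 (0..2) and the flags from
         * the subl above are still live: jb caught length 0, and the jz
         * below catches length 1. For lengths 2 and 3 we store byte 1
         * and byte length-1 (which coincide when the length is 2), then
         * fall through to store byte 0.
         */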
        movzbl (%rsi), %ecx
        jz .Lstore_1byte
        movzbq 1(%rsi), %r8
        movzbq (%rsi, %rdx), %r9
        movb %r8b, 1(%rdi)
        movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
        movb %cl, (%rdi)

.Lend:
        retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML
/*
 * memcpy_mcsafe - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
 */
ENTRY(memcpy_mcsafe)
        cmpl $8, %edx
        /* Less than 8 bytes? Go to byte copy loop */
        jb .L_no_whole_words

        /* Check for bad alignment of source */
        testl $7, %esi
        /* Already aligned */
        jz .L_8byte_aligned

        /* Copy one byte at a time until source is 8-byte aligned */
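        /*
         * %ecx = 8 - (src & 7), i.e. the 1..7 bytes needed to reach
         * 8-byte alignment (the already-aligned case was handled above);
         * subtract that from the remaining count before the byte loop.
         */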
        movl %esi, %ecx
        andl $7, %ecx
        subl $8, %ecx
        negl %ecx
        subl %ecx, %edx
.L_copy_leading_bytes:
        movb (%rsi), %al
        movb %al, (%rdi)
        incq %rsi
        incq %rdi
        decl %ecx
        jnz .L_copy_leading_bytes

.L_8byte_aligned:
        /* Figure out how many whole cache lines (64 bytes each) to copy */
        movl %edx, %ecx
        andl $63, %edx
        shrl $6, %ecx
        jz .L_no_whole_cache_lines

        /* Loop copying whole cache lines */
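        /*
         * Each load below gets its own .L_cache_wN label so the
         * exception table entries at the bottom of the file can direct
         * a machine check on any individual read to the fixup path.
         */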
.L_cache_w0:    movq (%rsi), %r8
.L_cache_w1:    movq 1*8(%rsi), %r9
.L_cache_w2:    movq 2*8(%rsi), %r10
.L_cache_w3:    movq 3*8(%rsi), %r11
        movq %r8, (%rdi)
        movq %r9, 1*8(%rdi)
        movq %r10, 2*8(%rdi)
        movq %r11, 3*8(%rdi)
.L_cache_w4:    movq 4*8(%rsi), %r8
.L_cache_w5:    movq 5*8(%rsi), %r9
.L_cache_w6:    movq 6*8(%rsi), %r10
.L_cache_w7:    movq 7*8(%rsi), %r11
        movq %r8, 4*8(%rdi)
        movq %r9, 5*8(%rdi)
        movq %r10, 6*8(%rdi)
        movq %r11, 7*8(%rdi)
        leaq 64(%rsi), %rsi
        leaq 64(%rdi), %rdi
        decl %ecx
        jnz .L_cache_w0

        /* Are there any trailing 8-byte words? */
.L_no_whole_cache_lines:
        movl %edx, %ecx
        andl $7, %edx
        shrl $3, %ecx
        jz .L_no_whole_words

        /* Copy trailing words */
.L_copy_trailing_words:
        movq (%rsi), %r8
        mov %r8, (%rdi)
        leaq 8(%rsi), %rsi
        leaq 8(%rdi), %rdi
        decl %ecx
        jnz .L_copy_trailing_words

        /* Any trailing bytes? */
.L_no_whole_words:
        andl %edx, %edx
        jz .L_done_memcpy_trap

        /* Copy trailing bytes */
        movl %edx, %ecx
.L_copy_trailing_bytes:
        movb (%rsi), %al
        movb %al, (%rdi)
        incq %rsi
        incq %rdi
        decl %ecx
        jnz .L_copy_trailing_bytes

        /* Copy successful. Return zero */
.L_done_memcpy_trap:
        xorq %rax, %rax
        ret
ENDPROC(memcpy_mcsafe)

        .section .fixup, "ax"
        /* Return a non-zero value for any failure */
.L_memcpy_mcsafe_fail:
        mov $1, %rax
        ret

        .previous

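        /*
         * Each _ASM_EXTABLE_FAULT() entry below maps one of the tagged
         * source reads to .L_memcpy_mcsafe_fail, so a machine check
         * taken while reading the source resumes at the fixup instead
         * of being fatal: one entry per load that can fault.
         */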
        _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
#endif