/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs, which set REP_GOOD. On CPUs that have the
 * enhanced REP MOVSB/STOSB feature (ERMS), those NOPs are instead
 * changed to a jmp to memcpy_erms, which does the copy with REP MOVSB.
 */

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

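	/*
	 * Default (REP_GOOD) path: copy count/8 quadwords with REP MOVSQ,
	 * then the remaining count%8 bytes with REP MOVSB. %rax is set up
	 * first so the original destination is returned.
	 */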
	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
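	/* %rcx holds the full byte count; on ERMS hardware a single REP MOVSB handles it efficiently. */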
	rep movsb
	ret
ENDPROC(memcpy_erms)
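
/*
 * memcpy_orig - unrolled fallback for CPUs without REP_GOOD or ERMS.
 * Copies in 32-byte blocks, choosing a forward or backward direction
 * to avoid memory false dependences, then finishes with size-specific
 * tail code.
 */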
ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
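	/*
	 * Forward copy. The count is pre-biased by 0x20 so that the subq
	 * at the top of the loop sets CF once no full 32-byte block is
	 * left; jae keeps looping only while another block remains, and
	 * the addl after the loop restores the 0..31 byte remainder.
	 */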
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
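	/* Undo the pre-bias: the addl leaves the remaining 0..31 bytes in %edx. */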
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
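	/*
	 * The count was pre-biased by 0x20 above, just as in the forward
	 * copy; the in-loop subq/jae pair below ends the loop once no
	 * full 32-byte block remains.
	 */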
	/*
	 * At most 3 ALU operations can issue per cycle, so the alignment
	 * NOPs are kept within the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes
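	/*
	 * Each tail case below loads from both the start and the end of
	 * the remaining range before doing any stores, so the loads and
	 * stores may overlap; this handles every length in the bracket
	 * without a loop.
	 */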

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
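	/*
	 * %edx now holds count - 1: the jb above caught a zero count, and
	 * the jz below (movzbl does not change the flags) catches a count
	 * of exactly one byte.
	 */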
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML
/*
 * memcpy_mcsafe - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
 * Returns zero on success and a non-zero value if a machine check was
 * taken while reading the source.
 */
ENTRY(memcpy_mcsafe)
	cmpl $8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb .L_no_whole_words

	/* Check for bad alignment of source */
	testl $7, %esi
	/* Already aligned */
	jz .L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
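	/* %ecx = 8 - (src & 7): the number of leading bytes needed to reach alignment; shrink the main count to match. */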
	movl %esi, %ecx
	andl $7, %ecx
	subl $8, %ecx
	negl %ecx
	subl %ecx, %edx
.L_copy_leading_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_leading_bytes

.L_8byte_aligned:
	/* Figure out how many whole cache lines (64 bytes) to copy */
	movl %edx, %ecx
	andl $63, %edx
	shrl $6, %ecx
	jz .L_no_whole_cache_lines

	/* Loop copying whole cache lines */
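	/*
	 * Each source load below carries its own label so that the
	 * _ASM_EXTABLE_FAULT entries at the bottom of this file can
	 * redirect a machine check on that load to the failure path.
	 */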
.L_cache_w0: movq (%rsi), %r8
.L_cache_w1: movq 1*8(%rsi), %r9
.L_cache_w2: movq 2*8(%rsi), %r10
.L_cache_w3: movq 3*8(%rsi), %r11
	movq %r8, (%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
.L_cache_w4: movq 4*8(%rsi), %r8
.L_cache_w5: movq 5*8(%rsi), %r9
.L_cache_w6: movq 6*8(%rsi), %r10
.L_cache_w7: movq 7*8(%rsi), %r11
	movq %r8, 4*8(%rdi)
	movq %r9, 5*8(%rdi)
	movq %r10, 6*8(%rdi)
	movq %r11, 7*8(%rdi)
	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi
	decl %ecx
	jnz .L_cache_w0

	/* Are there any trailing 8-byte words? */
.L_no_whole_cache_lines:
	movl %edx, %ecx
	andl $7, %edx
	shrl $3, %ecx
	jz .L_no_whole_words

	/* Copy trailing words */
.L_copy_trailing_words:
	movq (%rsi), %r8
	mov %r8, (%rdi)
	leaq 8(%rsi), %rsi
	leaq 8(%rdi), %rdi
	decl %ecx
	jnz .L_copy_trailing_words

	/* Any trailing bytes? */
.L_no_whole_words:
	andl %edx, %edx
	jz .L_done_memcpy_trap

	/* Copy trailing bytes */
	movl %edx, %ecx
.L_copy_trailing_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorq %rax, %rax
	ret
ENDPROC(memcpy_mcsafe)

	.section .fixup, "ax"
	/* Return a non-zero value for any failure */
.L_memcpy_mcsafe_fail:
	mov $1, %rax
	ret

	.previous

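	/*
	 * Exception table: a machine check taken on any of the labeled
	 * source loads above lands in .L_memcpy_mcsafe_fail.
	 */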
	_ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
#endif