[deliverable/linux.git] / arch / x86 / lib / memcpy_64.S

/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework.
 *
 * The count in rdx is split into count / 8 quadwords, copied with
 * REP MOVSQ, followed by count % 8 trailing bytes, copied with
 * REP MOVSB.  The quadword count is derived with 64-bit operations so
 * that counts of 4GiB and above are not truncated to their low 32 bits.
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
	movq %rdi, %rax		/* return value: original destination */

	movq %rdx, %rcx
	shrq $3, %rcx		/* rcx = number of whole quadwords */
	andl $7, %edx		/* edx = leftover bytes, 0..7 */
	rep movsq
	movl %edx, %ecx		/* rcx = leftover bytes (zero-extended) */
	rep movsb
	ret
.Lmemcpy_e:
	.previous

/*
 * Unrolled variant: copies 32 bytes per loop iteration through
 * r8-r11, then finishes the remaining 0..31 bytes in .Lhandle_tail
 * using (possibly overlapping) head and tail moves.
 *
 * The count is treated as a full 64-bit quantity until it has been
 * reduced below 32, so counts of 4GiB and above are handled correctly.
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC
	movq %rdi, %rax		/* return value: original destination */

	/*
	 * Compare all 64 bits of the count: looking only at edx would
	 * send a count such as 0x100000010 to the tail code with the
	 * upper bits of rdx still set, and the tail code uses rdx as
	 * an index.
	 */
	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * We check whether memory false dependence could occur,
	 * then jump to corresponding copy mode: copy backward when
	 * the low address byte of the source is (signed) less than
	 * the low address byte of the destination.
	 */
	cmp  %dil, %sil
	jl .Lcopy_backward
	/*
	 * Bias the count by -0x20 so that the SUBQ at the top of the
	 * loop borrows exactly when fewer than 32 bytes remain after
	 * the block being copied.  This must be a 64-bit subtract; a
	 * 32-bit one would clear the upper half of the count.
	 */
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20,	%rdx

	/*
	 * Move in blocks of 4x8 bytes.  MOV and LEA leave the flags
	 * alone, so the JAE below still tests the carry of the SUBQ
	 * above.
	 */
	movq 0*8(%rsi),	%r8
	movq 1*8(%rsi),	%r9
	movq 2*8(%rsi),	%r10
	movq 3*8(%rsi),	%r11
	leaq 4*8(%rsi),	%rsi

	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	2*8(%rdi)
	movq %r11,	3*8(%rdi)
	leaq 4*8(%rdi),	%rdi
	jae  .Lcopy_forward_loop
	/*
	 * rdx is now in [-0x20, -1]; undo the bias.  The result is in
	 * 0..31, so a 32-bit add (which zero-extends into rdx) is
	 * sufficient.
	 */
	addl $0x20,	%edx
	jmp  .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx,	%rsi
	addq %rdx,	%rdi
	subq $0x20,	%rdx	/* same -0x20 bias as the forward path */
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20,	%rdx
	movq -1*8(%rsi),	%r8
	movq -2*8(%rsi),	%r9
	movq -3*8(%rsi),	%r10
	movq -4*8(%rsi),	%r11
	leaq -4*8(%rsi),	%rsi
	movq %r8,		-1*8(%rdi)
	movq %r9,		-2*8(%rdi)
	movq %r10,		-3*8(%rdi)
	movq %r11,		-4*8(%rdi)
	leaq -4*8(%rdi),	%rdi
	jae  .Lcopy_backward_loop

	/*
	 * Calculate copy position to head: rdx becomes the number of
	 * bytes (0..31) still to copy, and rsi/rdi are moved back to
	 * the start of that uncopied head.
	 */
	addl $0x20,	%edx
	subq %rdx,	%rsi
	subq %rdx,	%rdi
.Lhandle_tail:
	/*
	 * Here rdx < 32 with its upper 32 bits clear, so the size
	 * checks below only need to look at edx.
	 */
	cmpl $16,	%edx
	jb   .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes: the first 16 and the
	 * last 16 bytes, which may overlap each other.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi),	%r9
	movq -2*8(%rsi, %rdx),	%r10
	movq -1*8(%rsi, %rdx),	%r11
	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	-2*8(%rdi, %rdx)
	movq %r11,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8,	%edx
	jb   .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi),	%r8
	movq -1*8(%rsi, %rdx),	%r9
	movq %r8,	0*8(%rdi)
	movq %r9,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4,	%edx
	jb   .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	testl %edx, %edx	/* nothing left to copy? */
	jz .Lend
	/*
	 * Move data from 1 bytes to 3 bytes, one byte at a time.
	 */
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %edx
	jnz .Lloop_1

.Lend:
	retq
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

	/*
	 * Some CPUs run faster using the string copy instructions.
	 * It is also a lot simpler. Use this when possible:
	 *
	 * The record emitted below names the code to be patched
	 * (memcpy), the replacement code (.Lmemcpy_c) and the CPU
	 * feature bit tied to the replacement (X86_FEATURE_REP_GOOD).
	 * NOTE(review): the field order and widths presumably mirror
	 * the alternatives framework's record structure, which is not
	 * visible in this file - confirm against its definition.
	 */

	.section .altinstructions, "a"
	.align 8
	.quad memcpy
	.quad .Lmemcpy_c
	.word X86_FEATURE_REP_GOOD

	/*
	 * Replace only beginning, memcpy is used to apply alternatives,
	 * so it is silly to overwrite itself with nops - reboot is the
	 * only outcome...
	 *
	 * For that reason both length bytes below are set to the size
	 * of the replacement code (.Lmemcpy_e - .Lmemcpy_c) rather than
	 * to the size of the whole memcpy function.
	 */
	.byte .Lmemcpy_e - .Lmemcpy_c
	.byte .Lmemcpy_e - .Lmemcpy_c
	.previous
Commit	Line	Data
1da177e4	1	/* Copyright 2002 Andi Kleen */
038b0a6d	2
8d379dad	3	#include <linux/linkage.h>
f3b6eaf0	4
8d379dad	5	#include <asm/cpufeature.h>
f3b6eaf0	6	#include <asm/dwarf2.h>
8d379dad	7
1da177e4 LT	8	/*
	9	* memcpy - Copy a memory block.
	10	*
f3b6eaf0 IM	11	* Input:
	12	* rdi destination
	13	* rsi source
	14	* rdx count
	15	*
1da177e4 LT	16	* Output:
1da177e4 LT	17	* rax original destination
f3b6eaf0	18	*/
1da177e4	19
f3b6eaf0 IM	20	/*
	21	* memcpy_c() - fast string ops (REP MOVSQ) based variant.
	22	*
7269e881	23	* This gets patched over the unrolled variant (below) via the
f3b6eaf0 IM	24	* alternative instructions framework:
f3b6eaf0 IM	25	*/
7269e881 JB	26	.section .altinstr_replacement, "ax", @progbits
7269e881 JB	27	.Lmemcpy_c:
f3b6eaf0 IM	28	movq %rdi, %rax
	29
	30	movl %edx, %ecx
	31	shrl $3, %ecx
	32	andl $7, %edx
8d379dad	33	rep movsq
f3b6eaf0	34	movl %edx, %ecx
8d379dad JB	35	rep movsb
8d379dad JB	36	ret
7269e881 JB	37	.Lmemcpy_e:
7269e881 JB	38	.previous
8d379dad JB	39
	40	ENTRY(__memcpy)
	41	ENTRY(memcpy)
	42	CFI_STARTPROC
59daa706	43	movq %rdi, %rax
7bcd3f34	44
f3b6eaf0	45	/*
59daa706	46	* Use 32bit CMP here to avoid long NOP padding.
f3b6eaf0	47	*/
59daa706 ML	48	cmp $0x20, %edx
59daa706 ML	49	jb .Lhandle_tail
7bcd3f34	50
f3b6eaf0	51	/*
59daa706 ML	52	* We check whether memory false dependece could occur,
59daa706 ML	53	* then jump to corresponding copy mode.
f3b6eaf0	54	*/
59daa706 ML	55	cmp %dil, %sil
	56	jl .Lcopy_backward
	57	subl $0x20, %edx
	58	.Lcopy_forward_loop:
	59	subq $0x20, %rdx
7bcd3f34	60
f3b6eaf0	61	/*
59daa706	62	* Move in blocks of 4x8 bytes:
f3b6eaf0	63	*/
59daa706 ML	64	movq 0*8(%rsi), %r8
	65	movq 1*8(%rsi), %r9
	66	movq 2*8(%rsi), %r10
	67	movq 3*8(%rsi), %r11
	68	leaq 4*8(%rsi), %rsi
	69
	70	movq %r8, 0*8(%rdi)
	71	movq %r9, 1*8(%rdi)
	72	movq %r10, 2*8(%rdi)
	73	movq %r11, 3*8(%rdi)
	74	leaq 4*8(%rdi), %rdi
	75	jae .Lcopy_forward_loop
	76	addq $0x20, %rdx
	77	jmp .Lhandle_tail
	78
	79	.Lcopy_backward:
	80	/*
	81	* Calculate copy position to tail.
	82	*/
	83	addq %rdx, %rsi
	84	addq %rdx, %rdi
	85	subq $0x20, %rdx
	86	/*
	87	* At most 3 ALU operations in one cycle,
	88	* so append NOPS in the same 16bytes trunk.
	89	*/
	90	.p2align 4
	91	.Lcopy_backward_loop:
	92	subq $0x20, %rdx
	93	movq -1*8(%rsi), %r8
	94	movq -2*8(%rsi), %r9
	95	movq -3*8(%rsi), %r10
	96	movq -4*8(%rsi), %r11
	97	leaq -4*8(%rsi), %rsi
	98	movq %r8, -1*8(%rdi)
	99	movq %r9, -2*8(%rdi)
	100	movq %r10, -3*8(%rdi)
	101	movq %r11, -4*8(%rdi)
	102	leaq -4*8(%rdi), %rdi
	103	jae .Lcopy_backward_loop
7bcd3f34	104
59daa706 ML	105	/*
	106	* Calculate copy position to head.
	107	*/
	108	addq $0x20, %rdx
	109	subq %rdx, %rsi
	110	subq %rdx, %rdi
7bcd3f34	111	.Lhandle_tail:
59daa706 ML	112	cmpq $16, %rdx
59daa706 ML	113	jb .Lless_16bytes
f3b6eaf0	114
59daa706 ML	115	/*
	116	* Move data from 16 bytes to 31 bytes.
	117	*/
	118	movq 0*8(%rsi), %r8
	119	movq 1*8(%rsi), %r9
	120	movq -2*8(%rsi, %rdx), %r10
	121	movq -1*8(%rsi, %rdx), %r11
	122	movq %r8, 0*8(%rdi)
	123	movq %r9, 1*8(%rdi)
	124	movq %r10, -2*8(%rdi, %rdx)
	125	movq %r11, -1*8(%rdi, %rdx)
	126	retq
7bcd3f34	127	.p2align 4
59daa706 ML	128	.Lless_16bytes:
	129	cmpq $8, %rdx
	130	jb .Lless_8bytes
	131	/*
	132	* Move data from 8 bytes to 15 bytes.
	133	*/
	134	movq 0*8(%rsi), %r8
	135	movq -1*8(%rsi, %rdx), %r9
	136	movq %r8, 0*8(%rdi)
	137	movq %r9, -1*8(%rdi, %rdx)
	138	retq
	139	.p2align 4
	140	.Lless_8bytes:
	141	cmpq $4, %rdx
	142	jb .Lless_3bytes
f3b6eaf0	143
59daa706 ML	144	/*
	145	* Move data from 4 bytes to 7 bytes.
	146	*/
	147	movl (%rsi), %ecx
	148	movl -4(%rsi, %rdx), %r8d
	149	movl %ecx, (%rdi)
	150	movl %r8d, -4(%rdi, %rdx)
	151	retq
7bcd3f34	152	.p2align 4
59daa706 ML	153	.Lless_3bytes:
	154	cmpl $0, %edx
	155	je .Lend
	156	/*
	157	* Move data from 1 bytes to 3 bytes.
	158	*/
7bcd3f34	159	.Lloop_1:
f3b6eaf0 IM	160	movb (%rsi), %r8b
f3b6eaf0 IM	161	movb %r8b, (%rdi)
7bcd3f34 AK	162	incq %rdi
7bcd3f34 AK	163	incq %rsi
59daa706	164	decl %edx
7bcd3f34 AK	165	jnz .Lloop_1
7bcd3f34 AK	166
f3b6eaf0	167	.Lend:
59daa706	168	retq
8d379dad JB	169	CFI_ENDPROC
	170	ENDPROC(memcpy)
	171	ENDPROC(__memcpy)
7bcd3f34	172
f3b6eaf0 IM	173	/*
	174	* Some CPUs run faster using the string copy instructions.
	175	* It is also a lot simpler. Use this when possible:
	176	*/
7bcd3f34	177
f3b6eaf0	178	.section .altinstructions, "a"
7bcd3f34	179	.align 8
8d379dad	180	.quad memcpy
7269e881	181	.quad .Lmemcpy_c
83a7a2ad	182	.word X86_FEATURE_REP_GOOD
f3b6eaf0 IM	183
	184	/*
	185	* Replace only beginning, memcpy is used to apply alternatives,
	186	* so it is silly to overwrite itself with nops - reboot is the
	187	* only outcome...
	188	*/
7269e881 JB	189	.byte .Lmemcpy_e - .Lmemcpy_c
7269e881 JB	190	.byte .Lmemcpy_e - .Lmemcpy_c
7bcd3f34	191	.previous