x86: memcpy, clean up
arch/x86/lib/memcpy_64.S
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *	rdi destination
 *	rsi source
 *	rdx count
 *
 * Output:
 *	rax original destination
 */
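
/*
 * For reference, this corresponds to the standard C prototype
 *
 *	void *memcpy(void *dest, const void *src, size_t n);
 *
 * with dest in %rdi, src in %rsi, n in %rdx and the return value in
 * %rax, per the x86-64 calling convention.
 */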

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * Calls to this get patched into the kernel image via the
 * alternative instructions framework:
 */
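/*
 * For example, a 100-byte copy is split below into 100 >> 3 = 12
 * quadwords moved by REP MOVSQ, followed by the 100 & 7 = 4
 * remaining bytes moved by REP MOVSB.
 */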
	ALIGN
memcpy_c:
	CFI_STARTPROC
	movq %rdi, %rax			/* return the original destination */

	movl %edx, %ecx
	shrl $3, %ecx			/* quadword count */
	andl $7, %edx			/* trailing byte count */
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
	CFI_ENDPROC
ENDPROC(memcpy_c)

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC

	/*
	 * Put the number of full 64-byte blocks into %ecx.
	 * Tail portion is handled at the end:
	 */
	movq %rdi, %rax
	movl %edx, %ecx
	shrl $6, %ecx
	jz .Lhandle_tail
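	/*
	 * For example, a 200-byte copy runs .Lloop_64 three times
	 * (200 >> 6 = 3 blocks, 192 bytes); the remaining 200 & 63 = 8
	 * bytes are picked up by the tail handling below.
	 */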

	.p2align 4
.Lloop_64:
	/*
	 * We decrement the loop index here - and the zero-flag is
	 * checked at the end of the loop (the instructions in between
	 * do not change the zero flag):
	 */
	decl %ecx

	/*
	 * Move in blocks of 4x16 bytes:
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r8
	movq %r11, 0*8(%rdi)
	movq %r8, 1*8(%rdi)

	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r10
	movq %r9, 2*8(%rdi)
	movq %r10, 3*8(%rdi)

	movq 4*8(%rsi), %r11
	movq 5*8(%rsi), %r8
	movq %r11, 4*8(%rdi)
	movq %r8, 5*8(%rdi)

	movq 6*8(%rsi), %r9
	movq 7*8(%rsi), %r10
	movq %r9, 6*8(%rdi)
	movq %r10, 7*8(%rdi)

	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi

	jnz .Lloop_64

.Lhandle_tail:
	movl %edx, %ecx
	andl $63, %ecx			/* bytes left after the 64-byte loop */
	shrl $3, %ecx			/* quadwords left */
	jz .Lhandle_7

	.p2align 4
.Lloop_8:
	decl %ecx
	movq (%rsi), %r8
	movq %r8, (%rdi)
	leaq 8(%rdi), %rdi
	leaq 8(%rsi), %rsi
	jnz .Lloop_8

.Lhandle_7:
	movl %edx, %ecx
	andl $7, %ecx			/* final sub-quadword bytes */
	jz .Lend

	.p2align 4
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lend:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

	/*
	 * Some CPUs run faster using the string copy instructions.
	 * It is also a lot simpler. Use this when possible:
	 */

	.section .altinstr_replacement, "ax"
1:	.byte 0xeb				/* jmp <disp8> */
	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
2:
	.previous
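
	/*
	 * At boot, the alternatives code copies the two bytes above over
	 * the start of memcpy on CPUs that set X86_FEATURE_REP_GOOD,
	 * turning the entry point into a short "jmp memcpy_c".
	 */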

	.section .altinstructions, "a"
	.align 8
	.quad memcpy			/* original instruction */
	.quad 1b			/* replacement */
	.byte X86_FEATURE_REP_GOOD	/* required CPU feature bit */

	/*
	 * Replace only the beginning: memcpy is itself used to apply
	 * the alternatives, so it would be silly to overwrite it with
	 * NOPs - a reboot would be the only outcome...
	 */
	.byte 2b - 1b			/* length of original to patch */
	.byte 2b - 1b			/* length of replacement */
	.previous
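
/*
 * The entry above hand-assembles one struct alt_instr record (see
 * asm/alternative.h); roughly, with illustrative field comments:
 *
 *	struct alt_instr {
 *		u8 *instr;		// original: memcpy
 *		u8 *replacement;	// patch source: 1b (the short jmp)
 *		u8  cpuid;		// X86_FEATURE_REP_GOOD
 *		u8  instrlen;		// bytes to patch at memcpy (2)
 *		u8  replacementlen;	// bytes to copy from 1b (2)
 *	};
 *
 * Explicit padding is not emitted here; the .align 8 before each
 * entry keeps the records properly aligned.
 */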