/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
038b0a6d | 2 | |
8d379dad JB |
3 | #include <linux/linkage.h> |
4 | #include <asm/dwarf2.h> | |
59e97e4d | 5 | #include <asm/alternative-asm.h> |
8d379dad JB |
6 | |
/*
 * copy_page_rep - copy one 4K page with a single "rep movsq".
 *
 * In:  %rdi = destination page, %rsi = source page
 * Clobbers: %rcx, %rsi, %rdi (advanced past the page by movsq), flags.
 *
 * This variant is not called directly: the alternatives machinery below
 * patches the start of copy_page() to jump here on CPUs that advertise
 * X86_FEATURE_REP_GOOD (fast string operations).
 */
	ALIGN
copy_page_rep:
	CFI_STARTPROC
	movl	$4096/8, %ecx		/* 512 quadwords per 4K page */
	rep	movsq			/* copy %rcx quadwords (%rsi) -> (%rdi) */
	ret
	CFI_ENDPROC
ENDPROC(copy_page_rep)
/*
 * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD.
 * Could vary the prefetch distance based on SMP/UP.
 */

/*
 * void copy_page(void *to, void *from)
 *
 * In:  %rdi = destination page, %rsi = source page (4096 bytes each)
 * Unrolled copy: 8 quadwords (one 64-byte cache line) per iteration,
 * staged through %rax, %rbx, %rdx, %r8-%r12.  %rbx and %r12 are
 * callee-saved, so they are spilled to a small stack frame first.
 *
 * The page is copied in two loops:
 *   .Loop64 - (4096/64)-5 = 59 iterations, prefetching 5 lines ahead
 *   .Loop2  - the final 5 lines, with no prefetch, so that
 *             "prefetcht0 5*64(%rsi)" never targets beyond the end of
 *             the source page (last .Loop64 prefetch hits offset 63*64).
 */
ENTRY(copy_page)
	CFI_STARTPROC
	subq	$2*8, %rsp		/* frame for two callee-saved regs */
	CFI_ADJUST_CFA_OFFSET 2*8
	movq	%rbx, (%rsp)
	CFI_REL_OFFSET rbx, 0
	movq	%r12, 1*8(%rsp)
	CFI_REL_OFFSET r12, 1*8

	movl	$(4096/64)-5, %ecx	/* all lines except the last 5 */
	.p2align 4
.Loop64:
	dec	%rcx			/* early dec: ZF tested at jnz below */
	movq	0x8*0(%rsi), %rax	/* read a full 64-byte line ... */
	movq	0x8*1(%rsi), %rbx
	movq	0x8*2(%rsi), %rdx
	movq	0x8*3(%rsi), %r8
	movq	0x8*4(%rsi), %r9
	movq	0x8*5(%rsi), %r10
	movq	0x8*6(%rsi), %r11
	movq	0x8*7(%rsi), %r12

	prefetcht0 5*64(%rsi)		/* pull line +5 toward L1 */

	movq	%rax, 0x8*0(%rdi)	/* ... then write it out */
	movq	%rbx, 0x8*1(%rdi)
	movq	%rdx, 0x8*2(%rdi)
	movq	%r8, 0x8*3(%rdi)
	movq	%r9, 0x8*4(%rdi)
	movq	%r10, 0x8*5(%rdi)
	movq	%r11, 0x8*6(%rdi)
	movq	%r12, 0x8*7(%rdi)

	leaq	64 (%rsi), %rsi		/* advance one cache line */
	leaq	64 (%rdi), %rdi

	jnz	.Loop64			/* ZF still from "dec %rcx" above */

	movl	$5, %ecx		/* remaining 5 lines, no prefetch */
	.p2align 4
.Loop2:
	decl	%ecx

	movq	0x8*0(%rsi), %rax
	movq	0x8*1(%rsi), %rbx
	movq	0x8*2(%rsi), %rdx
	movq	0x8*3(%rsi), %r8
	movq	0x8*4(%rsi), %r9
	movq	0x8*5(%rsi), %r10
	movq	0x8*6(%rsi), %r11
	movq	0x8*7(%rsi), %r12

	movq	%rax, 0x8*0(%rdi)
	movq	%rbx, 0x8*1(%rdi)
	movq	%rdx, 0x8*2(%rdi)
	movq	%r8, 0x8*3(%rdi)
	movq	%r9, 0x8*4(%rdi)
	movq	%r10, 0x8*5(%rdi)
	movq	%r11, 0x8*6(%rdi)
	movq	%r12, 0x8*7(%rdi)

	leaq	64(%rdi), %rdi
	leaq	64(%rsi), %rsi

	jnz	.Loop2

	movq	(%rsp), %rbx		/* restore callee-saved regs */
	CFI_RESTORE rbx
	movq	1*8(%rsp), %r12
	CFI_RESTORE r12
	addq	$2*8, %rsp
	CFI_ADJUST_CFA_OFFSET -2*8
	ret
.Lcopy_page_end:			/* end marker: patchable length below */
	CFI_ENDPROC
ENDPROC(copy_page)
/* Some CPUs run faster using the string copy instructions.
   It is also a lot simpler. Use this when possible */

#include <asm/cpufeature.h>

	/*
	 * Alternatives patching: on CPUs with X86_FEATURE_REP_GOOD, the
	 * start of copy_page() is overwritten at boot with the 2-byte
	 * replacement below - a short jmp to copy_page_rep.  The rel8
	 * displacement is computed by hand: target offset relative to
	 * copy_page, minus the replacement's own length (2f - 1b).
	 */
	.section .altinstr_replacement,"ax"
1:	.byte 0xeb					/* jmp <disp8> */
	.byte (copy_page_rep - copy_page) - (2f - 1b)	/* offset */
2:
	.previous
	/*
	 * Patch-site descriptor: patch copy_page (instr, length
	 * .Lcopy_page_end-copy_page) with replacement 1b (length 2b-1b)
	 * when the feature bit is set.
	 */
	.section .altinstructions,"a"
	altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD,	\
		.Lcopy_page_end-copy_page, 2b-1b
	.previous