x86: memcpy, clean up
arch/x86/lib/memcpy_64.S
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *	rdi destination
 *	rsi source
 *	rdx count
 *
 * Output:
 *	rax original destination
 */
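
/*
 * For reference, this corresponds to the standard C prototype
 *
 *	void *memcpy(void *dest, const void *src, size_t n);
 *
 * with dest in %rdi, src in %rsi, n in %rdx and the return value in
 * %rax, per the x86-64 calling convention.
 */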

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * Calls to this get patched into the kernel image via the
 * alternative instructions framework:
 */
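/*
 * For example, a 100-byte copy is split below into 100 >> 3 = 12
 * quadwords moved by REP MOVSQ, followed by the 100 & 7 = 4
 * remaining bytes moved by REP MOVSB.
 */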
	ALIGN
memcpy_c:
	CFI_STARTPROC
	movq %rdi, %rax			/* return the original destination */

	movl %edx, %ecx
	shrl $3, %ecx			/* quadword count */
	andl $7, %edx			/* trailing byte count */
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
	CFI_ENDPROC
ENDPROC(memcpy_c)

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC

	/*
	 * Put the number of full 64-byte blocks into %ecx.
	 * Tail portion is handled at the end:
	 */
	movq %rdi, %rax
	movl %edx, %ecx
	shrl $6, %ecx
	jz .Lhandle_tail
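	/*
	 * For example, a 200-byte copy runs .Lloop_64 three times
	 * (200 >> 6 = 3 blocks, 192 bytes); the remaining 200 & 63 = 8
	 * bytes are picked up by the tail handling below.
	 */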

	.p2align 4
.Lloop_64:
	/*
	 * We decrement the loop index here - and the zero-flag is
	 * checked at the end of the loop (the instructions in between
	 * do not change the zero flag):
	 */
	decl %ecx

	/*
	 * Move in blocks of 4x16 bytes:
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r8
	movq %r11, 0*8(%rdi)
	movq %r8, 1*8(%rdi)

	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r10
	movq %r9, 2*8(%rdi)
	movq %r10, 3*8(%rdi)

	movq 4*8(%rsi), %r11
	movq 5*8(%rsi), %r8
	movq %r11, 4*8(%rdi)
	movq %r8, 5*8(%rdi)

	movq 6*8(%rsi), %r9
	movq 7*8(%rsi), %r10
	movq %r9, 6*8(%rdi)
	movq %r10, 7*8(%rdi)

	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi

	jnz .Lloop_64

.Lhandle_tail:
	movl %edx, %ecx
	andl $63, %ecx			/* bytes left after the 64-byte loop */
	shrl $3, %ecx			/* quadwords left */
	jz .Lhandle_7

	.p2align 4
.Lloop_8:
	decl %ecx
	movq (%rsi), %r8
	movq %r8, (%rdi)
	leaq 8(%rdi), %rdi
	leaq 8(%rsi), %rsi
	jnz .Lloop_8

.Lhandle_7:
	movl %edx, %ecx
	andl $7, %ecx			/* final sub-quadword bytes */
	jz .Lend

	.p2align 4
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lend:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

	/*
	 * Some CPUs run faster using the string copy instructions.
	 * It is also a lot simpler. Use this when possible:
	 */

	.section .altinstr_replacement, "ax"
1:	.byte 0xeb				/* jmp <disp8> */
	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
2:
	.previous
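
	/*
	 * At boot, the alternatives code copies the two bytes above over
	 * the start of memcpy on CPUs that set X86_FEATURE_REP_GOOD,
	 * turning the entry point into a short "jmp memcpy_c".
	 */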

	.section .altinstructions, "a"
	.align 8
	.quad memcpy			/* original instruction */
	.quad 1b			/* replacement */
	.byte X86_FEATURE_REP_GOOD	/* required CPU feature bit */

	/*
	 * Replace only the beginning: memcpy is itself used to apply
	 * the alternatives, so it would be silly to overwrite it with
	 * NOPs - a reboot would be the only outcome...
	 */
	.byte 2b - 1b			/* length of original to patch */
	.byte 2b - 1b			/* length of replacement */
	.previous
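
/*
 * The entry above hand-assembles one struct alt_instr record (see
 * asm/alternative.h); roughly, with illustrative field comments:
 *
 *	struct alt_instr {
 *		u8 *instr;		// original: memcpy
 *		u8 *replacement;	// patch source: 1b (the short jmp)
 *		u8  cpuid;		// X86_FEATURE_REP_GOOD
 *		u8  instrlen;		// bytes to patch at memcpy (2)
 *		u8  replacementlen;	// bytes to copy from 1b (2)
 *	};
 *
 * Explicit padding is not emitted here; the .align 8 before each
 * entry keeps the records properly aligned.
 */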