/* arch/x86/lib/memcpy_64.S */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>

/*
 * We build a jmp to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs, i.e. those that set REP_GOOD. In addition,
 * on CPUs that have the enhanced REP MOVSB/STOSB feature (ERMS), those
 * NOPs are changed into a jmp to memcpy_erms, which does the copy with
 * a single REP MOVSB.
 */
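/*
 * A rough sketch of the dispatch that results after boot-time patching
 * (illustrative only, not the exact patching rules of ALTERNATIVE_2):
 *
 *	if (ERMS)		-> jmp memcpy_erms
 *	else if (REP_GOOD)	-> fall through to the rep movsq code
 *	else			-> jmp memcpy_orig
 */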

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

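	/*
	 * Default (REP_GOOD) path: rep movsq copies len / 8 qwords and
	 * advances rsi/rdi past them, then rep movsb copies the
	 * remaining len % 8 bytes. rax is loaded first so that memcpy
	 * returns the original destination.
	 */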
	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
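/*
 * Note: the ERMS feature flag indicates enhanced REP MOVSB/STOSB, i.e.
 * the microcoded REP MOVSB is expected to perform well even for
 * unaligned or oddly sized copies, so a single REP MOVSB handles the
 * whole length, tail included.
 */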
ENTRY(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur between
	 * the copy's loads and its earlier stores, then jump to the
	 * corresponding copy mode.
	 */
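	/*
	 * Only the low bytes of the pointers are compared below: a
	 * cheap heuristic for the CPU's partial-address aliasing check,
	 * which may make a load falsely wait on an earlier store whose
	 * address matches in the low bits. Copying backwards in that
	 * case keeps the loop's loads from appearing to depend on its
	 * own stores.
	 */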
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
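	/*
	 * The mov and lea instructions above do not modify the flags,
	 * so the jae below still tests the subq at the top of the loop:
	 * loop again while another full 32-byte block remains.
	 */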
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations execute in one cycle, so append
	 * NOPs within the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
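	/*
	 * Same flags trick as the forward loop: mov and lea leave the
	 * flags from the subq at the top of the loop untouched for the
	 * jae below.
	 */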
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
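	/*
	 * Overlap trick: load both the first 16 and the last 16 bytes
	 * before storing anything. For counts under 32 the two halves
	 * overlap, which is harmless since every load completes before
	 * the first store. Illustrative sketch (load64/store64 are
	 * hypothetical helpers, len is 16..31 here):
	 *
	 *	q0 = load64(src);		q1 = load64(src + 8);
	 *	q2 = load64(src + len - 16);	q3 = load64(src + len - 8);
	 *	store64(dst, q0);		store64(dst + 8, q1);
	 *	store64(dst + len - 16, q2);	store64(dst + len - 8, q3);
	 */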
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
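	/*
	 * The flags from the subl above are still live here: jb already
	 * handled a zero count, and the jz below (movzbl does not touch
	 * the flags) fires when the count was exactly 1. For 2 or 3
	 * bytes, rdx now holds count - 1, so (%rsi, %rdx) is the last
	 * byte.
	 */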
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)