arch/x86/lib/memcpy_64.S
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *	rdi destination
 *	rsi source
 *	rdx count
 *
 * Output:
 *	rax original destination
 */
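
/*
 * Note: this register assignment is simply the x86-64 SysV calling
 * convention applied to the C prototype
 *	void *memcpy(void *dest, const void *src, size_t n);
 * arguments arrive in rdi/rsi/rdx, and returning the original
 * destination in rax matches the C return value.
 */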

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
	movq %rdi, %rax

	movl %edx, %ecx
	shrl $3, %ecx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
.Lmemcpy_e:
	.previous
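
/*
 * Note on the variant above: the count is split into a quadword part
 * and a byte remainder. For example (illustrative values), with
 * rdx = 20 the SHRL leaves rcx = 2, so REP MOVSQ copies 16 bytes, and
 * the ANDL leaves rdx = 4 for the trailing REP MOVSB.
 */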

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC
	movq %rdi, %rax

	/*
	 * Use a 32-bit CMP here: it encodes one byte shorter than the
	 * 64-bit form (no REX.W prefix), which avoids long NOP padding.
	 */
	cmp  $0x20, %edx
	jb .Lhandle_tail

	/*
	 * We check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	cmp  %dil, %sil
	jl .Lcopy_backward
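
	/*
	 * Note (an interpretation, not from the original source):
	 * comparing only the low address bytes is a cheap heuristic for
	 * the hazard. When the source sits slightly below the
	 * destination, forward copying can make each load alias a store
	 * issued a few iterations earlier (a false store-to-load
	 * dependence); copying backward reverses the access pattern and
	 * avoids the stall.
	 */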
	subl $0x20, %edx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addq $0x20, %rdx
	jmp .Lhandle_tail
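
	/*
	 * Note (editorial gloss on the loop above): the count is biased
	 * down by 0x20 before entering the loop, so the SUBQ at the top
	 * borrows (CF=1) exactly when the block being copied is the last
	 * full 32-byte one. JAE loops while no borrow occurred, and the
	 * ADDQ afterwards removes the bias so rdx again holds the true
	 * remaining count (0..0x1f) for the tail code.
	 */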

.Lcopy_backward:
	/*
	 * Calculate the copy position from the tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * The CPU can execute at most 3 ALU operations per cycle, so the
	 * .p2align below pads with NOPs to keep the loop within the same
	 * 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate the copy position back to the head.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi
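
	/*
	 * Note (editorial): the backward path copies whole 32-byte
	 * blocks from the end, so whatever remains uncopied sits at the
	 * front; moving rsi/rdi back by the residual count lets the
	 * forward tail code below finish the job.
	 */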
.Lhandle_tail:
	cmpq $16, %rdx
	jb .Lless_16bytes

	/*
	 * Move data: from 16 to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
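
	/*
	 * Note (editorial): the loads taken relative to the start and
	 * those taken relative to the end overlap on purpose. For
	 * example, with rdx = 20 the windows [0,16) and [4,20) together
	 * cover all 20 bytes with no extra branching; the 8..15 and 4..7
	 * byte cases below play the same trick at narrower widths.
	 */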
	.p2align 4
.Lless_16bytes:
	cmpq $8, %rdx
	jb .Lless_8bytes
	/*
	 * Move data: from 8 to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpq $4, %rdx
	jb .Lless_3bytes

	/*
	 * Move data: from 4 to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	cmpl $0, %edx
	je .Lend
	/*
	 * Move data: from 1 to 3 bytes.
	 */
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %edx
	jnz .Lloop_1

.Lend:
	retq
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

	/*
	 * Some CPUs run faster using the string copy instructions.
	 * It is also a lot simpler. Use this when possible:
	 */

	.section .altinstructions, "a"
	.align 8
	.quad memcpy
	.quad .Lmemcpy_c
	.word X86_FEATURE_REP_GOOD

	/*
	 * Replace only the beginning: memcpy itself is used to apply the
	 * alternatives, so it would be silly to overwrite it with NOPs;
	 * a reboot would be the only outcome...
	 */
	.byte .Lmemcpy_e - .Lmemcpy_c
	.byte .Lmemcpy_e - .Lmemcpy_c
	.previous
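
/*
 * Sketch (an assumption about the contemporary layout of struct
 * alt_instr in <asm/alternative.h>, shown for orientation only, padding
 * fields omitted) of what each record above encodes:
 *
 *	struct alt_instr {
 *		u8  *instr;		// .quad memcpy (site to patch)
 *		u8  *replacement;	// .quad .Lmemcpy_c
 *		u16 cpuid;		// .word X86_FEATURE_REP_GOOD
 *		u8  instrlen;		// .byte .Lmemcpy_e - .Lmemcpy_c
 *		u8  replacementlen;	// .byte .Lmemcpy_e - .Lmemcpy_c
 *	};
 *
 * Both length fields are set to the replacement's size, per the comment
 * above, so only the start of memcpy is patched.
 */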