Commit | Line | Data |
---|---|---|
ae2c6ca6 DM |
1 | /* NG4memcpy.S: Niagara-4 optimized memcpy. |
2 | * | |
3 | * Copyright (C) 2012 David S. Miller (davem@davemloft.net) | |
4 | */ | |
5 | ||
6 | #ifdef __KERNEL__ /* kernel build: VISEntryHalf/VISExitHalf and the ASI constant are not defined here, presumably they come from the headers below */ | |
7 | #include <asm/visasm.h> | |
8 | #include <asm/asi.h> | |
9 | #define GLOBAL_SPARE %g7 /* extra integer scratch register used by the copy loops */ | |
10 | #else /* not __KERNEL__: define the constants and FPU helpers locally */ | |
11 | #define ASI_BLK_INIT_QUAD_LDD_P 0xe2 | |
12 | #define FPRS_FEF 0x04 /* mask for the FEF bit of %fprs, tested and set by FPU_ENTER */ | |
13 | ||
14 | /* On T4 it is very expensive to access ASRs like %fprs and | |
15 | * %asi; avoiding a read or a write can save ~50 cycles. | |
16 | */ | |
17 | #define FPU_ENTER /* make sure FPRS_FEF is set; leaves the value read from %fprs in %o5 */ \ | |
18 | rd %fprs, %o5; /* %o5 = current %fprs */ \ | |
19 | andcc %o5, FPRS_FEF, %g0; /* test the FEF bit */ \ | |
20 | be,a,pn %icc, 999f; /* taken when FEF is clear; annul bit means the write below runs only then */ \ | |
21 | wr %g0, FPRS_FEF, %fprs; /* delay slot: set FEF, skipped when it was already set */ \ | |
22 | 999: | |
23 | ||
24 | #ifdef MEMCPY_DEBUG /* debug build: entry also zeroes %g1-%g3 and %g5 and sets the condition codes from 0 - 0 */ | |
25 | #define VISEntryHalf FPU_ENTER; \ | |
26 | clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0; |
27 | #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs /* write back only the FEF bit saved in %o5 by FPU_ENTER */ |
28 | #else | |
29 | #define VISEntryHalf FPU_ENTER /* %o5 must stay untouched until VISExitHalf */ | |
30 | #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs /* write back only the FEF bit saved in %o5 by FPU_ENTER */ |
31 | #endif | |
32 | ||
33 | #define GLOBAL_SPARE %g5 /* extra integer scratch register used by the copy loops */ | |
34 | #endif | |
35 | ||
36 | #ifndef STORE_ASI /* ASI used by STORE_INIT; honoured if already defined before this point */ | |
37 | #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA | |
38 | #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P | |
39 | #else | |
40 | #define STORE_ASI 0x80 /* ASI_P */ | |
41 | #endif | |
42 | #endif | |
43 | ||
44 | #ifndef EX_LD /* wrapper applied to the data loads; default expands to its argument unchanged */ | |
45 | #define EX_LD(x) x | |
46 | #endif | |
47 | ||
48 | #ifndef EX_ST /* wrapper applied to the data stores; default expands to its argument unchanged */ | |
49 | #define EX_ST(x) x | |
50 | #endif | |
51 | ||
52 | #ifndef EX_RETVAL /* wrapper applied to the return value at .Lexit; default expands to its argument unchanged */ | |
53 | #define EX_RETVAL(x) x | |
54 | #endif | |
55 | ||
56 | #ifndef LOAD /* LOAD(insn, addr, dest) emits: insn [addr], dest (also used for prefetch) */ | |
57 | #define LOAD(type,addr,dest) type [addr], dest | |
58 | #endif | |
59 | ||
60 | #ifndef STORE /* STORE(insn, src, addr) emits a plain store, or the %asi-relative form in debug builds */ | |
61 | #ifndef MEMCPY_DEBUG | |
62 | #define STORE(type,src,addr) type src, [addr] | |
63 | #else | |
64 | #define STORE(type,src,addr) type##a src, [addr] %asi | |
65 | #endif | |
66 | #endif | |
67 | ||
68 | #ifndef STORE_INIT /* 8-byte store through STORE_ASI; used only by the 64-byte aligned large-copy loop */ | |
69 | #define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI | |
70 | #endif | |
71 | ||
72 | #ifndef FUNC_NAME /* exported symbol name; default is NG4memcpy */ | |
73 | #define FUNC_NAME NG4memcpy | |
74 | #endif | |
75 | #ifndef PREAMBLE /* optional code inserted at function entry; empty by default */ | |
76 | #define PREAMBLE | |
77 | #endif | |
78 | ||
79 | #ifndef XCC /* condition-code register name used by the length-check trap at function entry */ | |
80 | #define XCC xcc | |
81 | #endif | |
82 | ||
83 | .register %g2,#scratch /* declare %g2 as a scratch register to the assembler */ | |
84 | .register %g3,#scratch /* declare %g3 as a scratch register to the assembler */ | |
85 | ||
86 | .text | |
87 | .align 64 | |
88 | ||
89 | .globl FUNC_NAME | |
90 | .type FUNC_NAME,#function | |
91 | FUNC_NAME: /* %o0=dst, %o1=src, %o2=len; returns the original dst. Dispatches on len: 0 exit, 1..3 tiny, 4..19 small, 20..127 medium, 128 and up large */ | |
92 | #ifdef MEMCPY_DEBUG | |
93 | wr %g0, 0x80, %asi /* debug build: STORE uses the %asi-relative form, so load %asi with 0x80 first */ | |
94 | #endif | |
95 | srlx %o2, 31, %g2 /* %g2 = len >> 31 */ | |
96 | cmp %g2, 0 | |
97 | tne %XCC, 5 /* software trap 5 if len has any bit at or above bit 31 set */ | |
98 | PREAMBLE | |
99 | mov %o0, %o3 /* keep the original dst in %o3 for the return value */ | |
100 | brz,pn %o2, .Lexit /* len == 0: nothing to copy */ | |
101 | cmp %o2, 3 /* delay slot */ | |
102 | ble,pn %icc, .Ltiny /* len <= 3 */ | |
103 | cmp %o2, 19 /* delay slot */ | |
104 | ble,pn %icc, .Lsmall /* len <= 19 */ | |
105 | or %o0, %o1, %g2 /* delay slot: %g2 = dst OR src, used by .Lsmall and .Lmedium for alignment tests */ | |
106 | cmp %o2, 128 | |
107 | bl,pn %icc, .Lmedium /* len < 128 */ | |
108 | nop | |
109 | ||
110 | .Llarge:/* len >= 0x80 */ | |
111 | /* First get dest 8 byte aligned. */ | |
112 | sub %g0, %o0, %g1 /* %g1 = -dst */ | |
113 | and %g1, 0x7, %g1 /* %g1 = bytes needed to 8-byte align dst (0..7) */ | |
114 | brz,pt %g1, 51f /* already aligned: skip the byte loop */ | |
115 | sub %o2, %g1, %o2 /* delay slot: len -= alignment bytes */ | |
42a4172b | 116 |
ae2c6ca6 DM |
117 | 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2)) /* byte-copy loop, runs %g1 times */ |
118 | add %o1, 1, %o1 | |
119 | subcc %g1, 1, %g1 | |
120 | add %o0, 1, %o0 | |
121 | bne,pt %icc, 1b | |
122 | EX_ST(STORE(stb, %g2, %o0 - 0x01)) /* delay slot: dst was already incremented, hence the -1 offset */ | |
123 | ||
124 | 51: LOAD(prefetch, %o1 + 0x040, #n_reads_strong) /* prefetch src at eight offsets, 0x40 apart, from +0x40 to +0x200 */ | |
125 | LOAD(prefetch, %o1 + 0x080, #n_reads_strong) | |
126 | LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong) | |
127 | LOAD(prefetch, %o1 + 0x100, #n_reads_strong) | |
128 | LOAD(prefetch, %o1 + 0x140, #n_reads_strong) | |
129 | LOAD(prefetch, %o1 + 0x180, #n_reads_strong) | |
130 | LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong) | |
131 | LOAD(prefetch, %o1 + 0x200, #n_reads_strong) | |
132 | ||
133 | /* Check if we can use the straight fully aligned | |
134 | * loop, or we require the alignaddr/faligndata variant. | |
135 | */ | |
136 | andcc %o1, 0x7, %o5 /* is src 8-byte aligned? */ | |
137 | bne,pn %icc, .Llarge_src_unaligned | |
138 | sub %g0, %o0, %g1 /* delay slot: %g1 = -dst, consumed just below */ | |
139 | ||
140 | /* Legitimize the use of initializing stores by getting dest | |
141 | * to be 64-byte aligned. | |
142 | */ | |
143 | and %g1, 0x3f, %g1 /* %g1 = bytes needed to 64-byte align dst (a multiple of 8 here) */ | |
144 | brz,pt %g1, .Llarge_aligned | |
145 | sub %o2, %g1, %o2 /* delay slot: len -= alignment bytes */ | |
42a4172b | 146 |
ae2c6ca6 DM |
147 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g2)) /* copy 8 bytes per iteration until dst is 64-byte aligned */ |
148 | add %o1, 8, %o1 | |
149 | subcc %g1, 8, %g1 | |
150 | add %o0, 8, %o0 | |
151 | bne,pt %icc, 1b | |
152 | EX_ST(STORE(stx, %g2, %o0 - 0x08)) /* delay slot */ | |
153 | ||
154 | .Llarge_aligned: | |
155 | /* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */ | |
156 | andn %o2, 0x3f, %o4 /* %o4 = byte count covered by whole 64-byte blocks */ | |
157 | sub %o2, %o4, %o2 /* %o2 = leftover bytes, len AND 0x3f */ | |
158 | ||
159 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) /* main loop: 64 bytes per iteration, eight ldx interleaved with eight STORE_INIT */ | |
160 | add %o1, 0x40, %o1 /* src is advanced early; the remaining loads use negative offsets */ | |
161 | EX_LD(LOAD(ldx, %o1 - 0x38, %g2)) | |
162 | subcc %o4, 0x40, %o4 /* sets the condition codes tested by the bne at the loop end */ | |
163 | EX_LD(LOAD(ldx, %o1 - 0x30, %g3)) | |
164 | EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE)) | |
165 | EX_LD(LOAD(ldx, %o1 - 0x20, %o5)) | |
166 | EX_ST(STORE_INIT(%g1, %o0)) | |
167 | add %o0, 0x08, %o0 | |
168 | EX_ST(STORE_INIT(%g2, %o0)) | |
169 | add %o0, 0x08, %o0 | |
170 | EX_LD(LOAD(ldx, %o1 - 0x18, %g2)) /* %g2 is reused once its previous value has been stored */ | |
171 | EX_ST(STORE_INIT(%g3, %o0)) | |
172 | add %o0, 0x08, %o0 | |
173 | EX_LD(LOAD(ldx, %o1 - 0x10, %g3)) /* likewise %g3 */ | |
174 | EX_ST(STORE_INIT(GLOBAL_SPARE, %o0)) | |
175 | add %o0, 0x08, %o0 | |
176 | EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE)) /* likewise GLOBAL_SPARE */ | |
177 | EX_ST(STORE_INIT(%o5, %o0)) | |
178 | add %o0, 0x08, %o0 | |
179 | EX_ST(STORE_INIT(%g2, %o0)) | |
180 | add %o0, 0x08, %o0 | |
181 | EX_ST(STORE_INIT(%g3, %o0)) | |
182 | add %o0, 0x08, %o0 | |
183 | EX_ST(STORE_INIT(GLOBAL_SPARE, %o0)) | |
184 | add %o0, 0x08, %o0 | |
185 | bne,pt %icc, 1b /* loop while %o4 is non-zero */ | |
186 | LOAD(prefetch, %o1 + 0x200, #n_reads_strong) /* delay slot: prefetch 0x200 bytes ahead of src */ | |
187 | ||
188 | membar #StoreLoad | #StoreStore /* order the stores above before any later loads and stores */ | |
189 | ||
190 | brz,pn %o2, .Lexit /* no leftover bytes: done */ | |
191 | cmp %o2, 19 /* delay slot */ | |
192 | ble,pn %icc, .Lsmall_unaligned /* 1..19 leftover bytes: finish with the byte loop */ | |
193 | nop | |
194 | ba,a,pt %icc, .Lmedium_noprefetch /* 20..63 leftover bytes: finish with the aligned medium path */ | |
195 | ||
196 | .Lexit: retl /* common return point for every path */ | |
197 | mov EX_RETVAL(%o3), %o0 /* delay slot: return the dst pointer saved in %o3 at entry, passed through EX_RETVAL */ | |
198 | ||
199 | .Llarge_src_unaligned: /* large copy, dst 8-byte aligned, src not: load aligned doublewords and realign them with faligndata */ | |
200 | andn %o2, 0x3f, %o4 /* %o4 = byte count covered by whole 64-byte blocks */ | |
201 | sub %o2, %o4, %o2 /* %o2 = leftover bytes, len AND 0x3f */ | |
202 | VISEntryHalf /* enable FPU use; the non-kernel definition keeps the old %fprs in %o5 */ | |
203 | alignaddr %o1, %g0, %g1 /* %g1 = src rounded down to 8 bytes; also sets the byte offset that faligndata uses */ | |
204 | add %o1, %o4, %o1 /* src now points past the blocks this loop copies; the loop reads through %g1 */ | |
205 | EX_LD(LOAD(ldd, %g1 + 0x00, %f0)) /* prime %f0 with the first aligned doubleword */ | |
206 | 1: EX_LD(LOAD(ldd, %g1 + 0x08, %f2)) /* loop: 64 output bytes per iteration */ | |
207 | subcc %o4, 0x40, %o4 /* sets the condition codes tested by the bne at the loop end */ | |
208 | EX_LD(LOAD(ldd, %g1 + 0x10, %f4)) | |
209 | EX_LD(LOAD(ldd, %g1 + 0x18, %f6)) | |
210 | EX_LD(LOAD(ldd, %g1 + 0x20, %f8)) | |
211 | EX_LD(LOAD(ldd, %g1 + 0x28, %f10)) | |
212 | EX_LD(LOAD(ldd, %g1 + 0x30, %f12)) | |
213 | EX_LD(LOAD(ldd, %g1 + 0x38, %f14)) | |
214 | faligndata %f0, %f2, %f16 /* each faligndata merges two neighbouring input doublewords into one output doubleword */ | |
215 | EX_LD(LOAD(ldd, %g1 + 0x40, %f0)) /* ninth doubleword: completes this block and becomes %f0 for the next iteration */ | |
216 | faligndata %f2, %f4, %f18 | |
217 | add %g1, 0x40, %g1 /* advance the aligned read pointer */ | |
218 | faligndata %f4, %f6, %f20 | |
219 | faligndata %f6, %f8, %f22 | |
220 | faligndata %f8, %f10, %f24 | |
221 | faligndata %f10, %f12, %f26 | |
222 | faligndata %f12, %f14, %f28 | |
223 | faligndata %f14, %f0, %f30 | |
224 | EX_ST(STORE(std, %f16, %o0 + 0x00)) /* store the eight realigned doublewords */ | |
225 | EX_ST(STORE(std, %f18, %o0 + 0x08)) | |
226 | EX_ST(STORE(std, %f20, %o0 + 0x10)) | |
227 | EX_ST(STORE(std, %f22, %o0 + 0x18)) | |
228 | EX_ST(STORE(std, %f24, %o0 + 0x20)) | |
229 | EX_ST(STORE(std, %f26, %o0 + 0x28)) | |
230 | EX_ST(STORE(std, %f28, %o0 + 0x30)) | |
231 | EX_ST(STORE(std, %f30, %o0 + 0x38)) | |
232 | add %o0, 0x40, %o0 | |
233 | bne,pt %icc, 1b /* loop while %o4 is non-zero */ | |
234 | LOAD(prefetch, %g1 + 0x200, #n_reads_strong) /* delay slot: prefetch 0x200 bytes ahead of the read pointer */ | |
235 | VISExitHalf /* the non-kernel definition writes the saved FEF bit from %o5 back to %fprs */ | |
236 | ||
237 | brz,pn %o2, .Lexit /* no leftover bytes: done */ | |
238 | cmp %o2, 19 /* delay slot */ | |
239 | ble,pn %icc, .Lsmall_unaligned /* 1..19 leftover bytes: finish with the byte loop */ | |
240 | nop | |
241 | ba,a,pt %icc, .Lmedium_unaligned /* 20..63 leftover bytes: finish with the shift-and-merge path */ | |
242 | ||
243 | .Lmedium: /* 20 <= len < 128; %g2 = dst OR src from the entry dispatch */ | |
244 | LOAD(prefetch, %o1 + 0x40, #n_reads_strong) | |
245 | andcc %g2, 0x7, %g0 /* are both dst and src 8-byte aligned? */ | |
246 | bne,pn %icc, .Lmedium_unaligned | |
247 | nop | |
248 | .Lmedium_noprefetch: /* aligned copy: 32-byte chunks, then 8-byte words, then a 4-byte word, then .Ltiny */ | |
249 | andncc %o2, 0x20 - 1, %o5 /* %o5 = byte count covered by whole 32-byte chunks */ | |
250 | be,pn %icc, 2f /* none: skip the 32-byte loop */ | |
251 | sub %o2, %o5, %o2 /* delay slot: %o2 = len AND 0x1f */ | |
252 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) /* 32-byte loop: four loads, then four stores */ | |
253 | EX_LD(LOAD(ldx, %o1 + 0x08, %g2)) | |
254 | EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE)) | |
255 | EX_LD(LOAD(ldx, %o1 + 0x18, %o4)) | |
256 | add %o1, 0x20, %o1 | |
257 | subcc %o5, 0x20, %o5 /* sets the condition codes tested by the bne below */ | |
258 | EX_ST(STORE(stx, %g1, %o0 + 0x00)) | |
259 | EX_ST(STORE(stx, %g2, %o0 + 0x08)) | |
260 | EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10)) | |
261 | EX_ST(STORE(stx, %o4, %o0 + 0x18)) | |
262 | bne,pt %icc, 1b | |
263 | add %o0, 0x20, %o0 /* delay slot */ | |
264 | 2: andcc %o2, 0x18, %o5 /* %o5 = remaining bytes in whole 8-byte words (0, 8, 16 or 24) */ | |
265 | be,pt %icc, 3f | |
266 | sub %o2, %o5, %o2 /* delay slot: %o2 = len AND 0x7 */ | |
267 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) /* 8-byte loop */ | |
268 | add %o1, 0x08, %o1 | |
269 | add %o0, 0x08, %o0 | |
270 | subcc %o5, 0x08, %o5 | |
271 | bne,pt %icc, 1b | |
272 | EX_ST(STORE(stx, %g1, %o0 - 0x08)) /* delay slot: dst was already incremented */ | |
273 | 3: brz,pt %o2, .Lexit /* nothing left: done */ | |
274 | cmp %o2, 0x04 /* delay slot */ | |
275 | bl,pn %icc, .Ltiny /* 1..3 bytes left */ | |
276 | nop | |
277 | EX_LD(LOAD(lduw, %o1 + 0x00, %g1)) /* 4..7 bytes left: copy one 4-byte word */ | |
278 | add %o1, 0x04, %o1 | |
279 | add %o0, 0x04, %o0 | |
280 | subcc %o2, 0x04, %o2 | |
281 | bne,pn %icc, .Ltiny /* 1..3 bytes still left */ | |
282 | EX_ST(STORE(stw, %g1, %o0 - 0x04)) /* delay slot: executed on both paths */ | |
283 | ba,a,pt %icc, .Lexit | |
284 | .Lmedium_unaligned: /* at least 20 bytes left, dst and/or src not 8-byte aligned: align dst, then shift-and-merge aligned src words */ | |
285 | /* First get dest 8 byte aligned. */ | |
286 | sub %g0, %o0, %g1 /* %g1 = -dst */ | |
287 | and %g1, 0x7, %g1 /* %g1 = bytes needed to 8-byte align dst (0..7) */ | |
288 | brz,pt %g1, 2f /* already aligned: skip the byte loop */ | |
289 | sub %o2, %g1, %o2 /* delay slot: len -= alignment bytes */ | |
42a4172b | 290 |
ae2c6ca6 DM |
291 | 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2)) /* byte-copy loop, runs %g1 times */ |
292 | add %o1, 1, %o1 | |
293 | subcc %g1, 1, %g1 | |
294 | add %o0, 1, %o0 | |
295 | bne,pt %icc, 1b | |
296 | EX_ST(STORE(stb, %g2, %o0 - 0x01)) /* delay slot: dst was already incremented */ | |
297 | 2: | |
298 | and %o1, 0x7, %g1 /* %g1 = src offset within its 8-byte word */ | |
299 | brz,pn %g1, .Lmedium_noprefetch /* src is aligned too: use the aligned path */ | |
300 | sll %g1, 3, %g1 /* delay slot: %g1 = left-shift amount in bits */ | |
301 | mov 64, %g2 | |
302 | sub %g2, %g1, %g2 /* %g2 = 64 - %g1, the matching right-shift amount */ | |
303 | andn %o1, 0x7, %o1 /* round src down to an 8-byte boundary */ | |
304 | EX_LD(LOAD(ldx, %o1 + 0x00, %o4)) /* first aligned word */ | |
305 | sllx %o4, %g1, %o4 /* %o4 = high part of the next output word */ | |
306 | andn %o2, 0x08 - 1, %o5 /* %o5 = byte count covered by whole 8-byte words */ | |
307 | sub %o2, %o5, %o2 /* %o2 = leftover bytes, len AND 0x7 */ | |
308 | 1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3)) /* loop: load the next aligned word */ | |
309 | add %o1, 0x08, %o1 | |
310 | subcc %o5, 0x08, %o5 /* sets the condition codes tested by the bne below */ | |
311 | srlx %g3, %g2, GLOBAL_SPARE /* low part of the output word */ | |
312 | or GLOBAL_SPARE, %o4, GLOBAL_SPARE /* merge with the high part kept from the previous word */ | |
313 | EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00)) | |
314 | add %o0, 0x08, %o0 | |
315 | bne,pt %icc, 1b | |
316 | sllx %g3, %g1, %o4 /* delay slot: keep the high part for the next iteration */ | |
317 | srl %g1, 3, %g1 /* back from a bit count to a byte offset */ | |
318 | add %o1, %g1, %o1 /* restore src to its true unaligned address */ | |
319 | brz,pn %o2, .Lexit /* nothing left: done */ | |
320 | nop | |
321 | ba,a,pt %icc, .Lsmall_unaligned /* annulled like the other tail branches, so the instruction that follows is not executed as a delay slot */ | |
322 | ||
323 | .Ltiny: /* copy 1..3 bytes; every branch to this label in the file arrives with len in that range */ | |
324 | EX_LD(LOAD(ldub, %o1 + 0x00, %g1)) /* byte 0 is always copied */ | |
325 | subcc %o2, 1, %o2 | |
326 | be,pn %icc, .Lexit /* len was 1 */ | |
327 | EX_ST(STORE(stb, %g1, %o0 + 0x00)) /* delay slot: executed on both paths */ | |
328 | EX_LD(LOAD(ldub, %o1 + 0x01, %g1)) | |
329 | subcc %o2, 1, %o2 | |
330 | be,pn %icc, .Lexit /* len was 2 */ | |
331 | EX_ST(STORE(stb, %g1, %o0 + 0x01)) /* delay slot: executed on both paths */ | |
332 | EX_LD(LOAD(ldub, %o1 + 0x02, %g1)) /* len was 3 */ | |
333 | ba,pt %icc, .Lexit | |
334 | EX_ST(STORE(stb, %g1, %o0 + 0x02)) /* delay slot */ | |
335 | ||
336 | .Lsmall: /* 4 <= len <= 19; %g2 = dst OR src from the entry dispatch */ | |
337 | andcc %g2, 0x3, %g0 /* are both dst and src 4-byte aligned? */ | |
338 | bne,pn %icc, .Lsmall_unaligned /* no: use the byte loop for the whole length */ | |
339 | andn %o2, 0x4 - 1, %o5 /* delay slot: %o5 = byte count covered by whole 4-byte words */ | |
340 | sub %o2, %o5, %o2 /* %o2 = leftover bytes, len AND 0x3 */ | |
341 | 1: | |
342 | EX_LD(LOAD(lduw, %o1 + 0x00, %g1)) /* 4-byte loop */ | |
343 | add %o1, 0x04, %o1 | |
344 | subcc %o5, 0x04, %o5 | |
345 | add %o0, 0x04, %o0 | |
346 | bne,pt %icc, 1b | |
347 | EX_ST(STORE(stw, %g1, %o0 - 0x04)) /* delay slot: dst was already incremented */ | |
348 | brz,pt %o2, .Lexit /* nothing left: done */ | |
349 | nop | |
350 | ba,a,pt %icc, .Ltiny /* 1..3 bytes left */ | |
351 | ||
352 | .Lsmall_unaligned: /* byte-at-a-time copy of %o2 bytes; the count is tested only after each copy, so %o2 must be non-zero on entry */ | |
353 | 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g1)) | |
354 | add %o1, 1, %o1 | |
355 | add %o0, 1, %o0 | |
356 | subcc %o2, 1, %o2 | |
357 | bne,pt %icc, 1b | |
358 | EX_ST(STORE(stb, %g1, %o0 - 0x01)) /* delay slot: dst was already incremented */ | |
359 | ba,a,pt %icc, .Lexit | |
360 | .size FUNC_NAME, .-FUNC_NAME |