Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* U1memcpy.S: UltraSPARC-I/II/IIi/IIe optimized memcpy. |
2 | * | |
3 | * Copyright (C) 1997, 2004 David S. Miller (davem@redhat.com) | |
4 | * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek (jj@ultra.linux.cz) | |
5 | */ | |
6 | ||
7 | #ifdef __KERNEL__ | |
8 | #include <asm/visasm.h> | |
9 | #include <asm/asi.h> | |
d3867f04 | 10 | #include <asm/export.h> |
1da177e4 LT |
| /* In-kernel build: %g7 is free to serve as the extra scratch register. */ | |
11 | #define GLOBAL_SPARE g7 |
12 | #else | |
| /* Standalone/userspace build: supply the constants the kernel headers provide
   and use %g5 as the spare register instead. */ | |
13 | #define GLOBAL_SPARE g5 | |
14 | #define ASI_BLK_P 0xf0 | |
15 | #define FPRS_FEF 0x04 | |
| /* Debug flavor of VISEntry also clears the scratch registers and condition
   codes so stale-state bugs show up immediately. */ | |
16 | #ifdef MEMCPY_DEBUG | |
17 | #define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \ | |
18 | clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0; | |
19 | #define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs | |
20 | #else | |
21 | #define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs | |
22 | #define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs | |
23 | #endif | |
24 | #endif | |
25 | ||
| /* EX_LD/EX_ST (and the _FP variants used around VIS/FPU accesses) are hooks
   that user-copy wrappers override to attach fault handlers to each memory
   operation; the plain memcpy build uses the bare instruction. */ | |
26 | #ifndef EX_LD | |
27 | #define EX_LD(x) x | |
28 | #endif | |
a7c5724b RG |
29 | #ifndef EX_LD_FP |
30 | #define EX_LD_FP(x) x | |
31 | #endif | |
1da177e4 LT |
32 | |
33 | #ifndef EX_ST | |
34 | #define EX_ST(x) x | |
35 | #endif | |
a7c5724b RG |
36 | #ifndef EX_ST_FP |
37 | #define EX_ST_FP(x) x | |
38 | #endif | |
1da177e4 LT |
39 | |
| /* EX_RETVAL maps the saved destination pointer into the return value
   (identity for plain memcpy). */ | |
40 | #ifndef EX_RETVAL | |
41 | #define EX_RETVAL(x) x | |
42 | #endif | |
43 | ||
| /* LOAD/STORE and the 64-byte block variants are overridable so user-copy
   flavors can substitute alternate ASIs for the same code body. */ | |
44 | #ifndef LOAD | |
45 | #define LOAD(type,addr,dest) type [addr], dest | |
46 | #endif | |
47 | ||
48 | #ifndef LOAD_BLK | |
49 | #define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_P, dest | |
50 | #endif | |
51 | ||
52 | #ifndef STORE | |
53 | #define STORE(type,src,addr) type src, [addr] | |
54 | #endif | |
55 | ||
56 | #ifndef STORE_BLK | |
57 | #define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P | |
58 | #endif | |
59 | ||
| /* Hooks: exported symbol name, optional entry preamble, and which condition
   code set (%xcc by default) the shared branches test. */ | |
60 | #ifndef FUNC_NAME | |
61 | #define FUNC_NAME memcpy | |
62 | #endif | |
63 | ||
64 | #ifndef PREAMBLE | |
65 | #define PREAMBLE | |
66 | #endif | |
67 | ||
68 | #ifndef XCC | |
69 | #define XCC xcc | |
70 | #endif | |
71 | ||
| /* Shift-align eight consecutive 8-byte source registers into the %f48-%f62
   output block (one 64-byte store unit) using the alignaddr-programmed GSR. */ | |
72 | #define FREG_FROB(f1, f2, f3, f4, f5, f6, f7, f8, f9) \ | |
73 | faligndata %f1, %f2, %f48; \ | |
74 | faligndata %f2, %f3, %f50; \ | |
75 | faligndata %f3, %f4, %f52; \ | |
76 | faligndata %f4, %f5, %f54; \ | |
77 | faligndata %f5, %f6, %f56; \ | |
78 | faligndata %f6, %f7, %f58; \ | |
79 | faligndata %f7, %f8, %f60; \ | |
80 | faligndata %f8, %f9, %f62; | |
81 | ||
| /* One 64-byte step of the main loop: block-load the next source chunk into
   fdest, block-store the previously aligned chunk from fsrc, advance both
   pointers, and branch to jmptgt when len hits zero (the dest increment sits
   in the branch delay slot, so it always executes). */ | |
82 | #define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, len, jmptgt) \ | |
a7c5724b RG |
83 | EX_LD_FP(LOAD_BLK(%src, %fdest)); \ |
84 | EX_ST_FP(STORE_BLK(%fsrc, %dest)); \ | |
1da177e4 LT |
85 | add %src, 0x40, %src; \ |
86 | subcc %len, 0x40, %len; \ | |
87 | be,pn %xcc, jmptgt; \ | |
88 | add %dest, 0x40, %dest; \ | |
89 | ||
| /* Three rotations of the triple-buffered pipeline: the next load lands in
   %f0, %f16, or %f32 while %f48 always holds the aligned output block. */ | |
90 | #define LOOP_CHUNK1(src, dest, len, branch_dest) \ | |
91 | MAIN_LOOP_CHUNK(src, dest, f0, f48, len, branch_dest) | |
92 | #define LOOP_CHUNK2(src, dest, len, branch_dest) \ | |
93 | MAIN_LOOP_CHUNK(src, dest, f16, f48, len, branch_dest) | |
94 | #define LOOP_CHUNK3(src, dest, len, branch_dest) \ | |
95 | MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest) | |
96 | ||
| /* Block stores are weakly ordered; membar #Sync forces completion before the
   following FP-register reuse. */ | |
b445e26c | 97 | #define DO_SYNC membar #Sync; |
98 | #define STORE_SYNC(dest, fsrc) \ |
a7c5724b | 99 | EX_ST_FP(STORE_BLK(%fsrc, %dest)); \ |
b445e26c DM |
100 | add %dest, 0x40, %dest; \ |
101 | DO_SYNC | |
1da177e4 LT |
102 | |
| /* Final block store of a loop instance, then jump to that instance's
   numbered tail handler. */ | |
103 | #define STORE_JUMP(dest, fsrc, target) \ | |
a7c5724b | 104 | EX_ST_FP(STORE_BLK(%fsrc, %dest)); \ |
1da177e4 | 105 | add %dest, 0x40, %dest; \ |
b445e26c DM |
106 | ba,pt %xcc, target; \ |
107 | nop; |
1da177e4 LT |
108 | |
| /* Emit one aligned 8-byte word of the tail, or bail to the byte finisher at
   95f once fewer than 8 bytes remain (the faligndata is in the delay slot). */ | |
109 | #define FINISH_VISCHUNK(dest, f0, f1, left) \ | |
110 | subcc %left, 8, %left;\ | |
111 | bl,pn %xcc, 95f; \ | |
112 | faligndata %f0, %f1, %f48; \ | |
a7c5724b | 113 | EX_ST_FP(STORE(std, %f48, %dest)); \ |
1da177e4 LT |
114 | add %dest, 8, %dest;
115 | ||
| /* Odd tail: move the surviving data register into place (fsrc2 in the delay
   slot) and, unless this is the last stub, continue at the 93f drain loop. */ | |
116 | #define UNEVEN_VISCHUNK_LAST(dest, f0, f1, left) \ | |
117 | subcc %left, 8, %left; \ | |
118 | bl,pn %xcc, 95f; \ | |
6f1d827f | 119 | fsrc2 %f0, %f1; |
1da177e4 LT |
120 | |
121 | #define UNEVEN_VISCHUNK(dest, f0, f1, left) \ | |
122 | UNEVEN_VISCHUNK_LAST(dest, f0, f1, left) \ | |
123 | ba,a,pt %xcc, 93f; | |
124 | ||
125 | .register %g2,#scratch | |
126 | .register %g3,#scratch | |
127 | ||
128 | .text | |
129 | .align 64 | |
130 | ||
131 | .globl FUNC_NAME | |
132 | .type FUNC_NAME,#function | |
133 | FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ | |
| /* Sanity check: software trap 5 if len has bit 31 or above set — catches
   absurd/negative sizes early. */ | |
134 | srlx %o2, 31, %g2 | |
135 | cmp %g2, 0 | |
136 | tne %xcc, 5 | |
137 | PREAMBLE | |
| /* %o4 preserves the original dst so every exit can return it. */ | |
138 | mov %o0, %o4 | |
139 | cmp %o2, 0 | |
140 | be,pn %XCC, 85f | |
141 | or %o0, %o1, %o3 | |
| /* len < 16: small-copy path (delay slot folds len into the alignment OR). */ | |
142 | cmp %o2, 16 | |
143 | blu,a,pn %XCC, 80f | |
144 | or %o3, %o2, %o3 | |
145 | ||
| /* len < 320: integer-register medium path; otherwise the VIS block path. */ | |
146 | cmp %o2, (5 * 64) | |
147 | blu,pt %XCC, 70f | |
148 | andcc %o3, 0x7, %g0 | |
149 | ||
150 | /* Clobbers o5/g1/g2/g3/g7/icc/xcc. */ | |
151 | VISEntry | |
152 | ||
153 | /* Is 'dst' already aligned on an 64-byte boundary? */ | |
154 | andcc %o0, 0x3f, %g2 | |
155 | be,pt %XCC, 2f | |
156 | ||
157 | /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number | |
158 | * of bytes to copy to make 'dst' 64-byte aligned. We pre- | |
159 | * subtract this from 'len'. | |
160 | */ | |
161 | sub %o0, %o1, %GLOBAL_SPARE | |
162 | sub %g2, 0x40, %g2 | |
163 | sub %g0, %g2, %g2 | |
164 | sub %o2, %g2, %o2 | |
165 | andcc %g2, 0x7, %g1 | |
166 | be,pt %icc, 2f | |
167 | and %g2, 0x38, %g2 | |
168 | ||
| /* Byte loop: copy %g1 bytes so src hits 8-byte alignment; dst tracks as
   src + GLOBAL_SPARE (the precomputed dst-src delta). */ | |
169 | 1: subcc %g1, 0x1, %g1 | |
a7c5724b RG |
170 | EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3)) |
171 | EX_ST_FP(STORE(stb, %o3, %o1 + %GLOBAL_SPARE)) | |
1da177e4 LT |
172 | bgu,pt %XCC, 1b |
173 | add %o1, 0x1, %o1 | |
174 | ||
175 | add %o1, %GLOBAL_SPARE, %o0 | |
176 | ||
| /* 8-byte faligndata loop until dst reaches 64-byte alignment; alignaddr
   also programs the GSR shift used by all later faligndata ops. */ | |
177 | 2: cmp %g2, 0x0 | |
178 | and %o1, 0x7, %g1 | |
179 | be,pt %icc, 3f | |
180 | alignaddr %o1, %g0, %o1 | |
181 | ||
a7c5724b RG |
182 | EX_LD_FP(LOAD(ldd, %o1, %f4)) |
183 | 1: EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6)) | |
1da177e4 LT |
184 | add %o1, 0x8, %o1 |
185 | subcc %g2, 0x8, %g2 | |
186 | faligndata %f4, %f6, %f0 | |
a7c5724b | 187 | EX_ST_FP(STORE(std, %f0, %o0)) |
1da177e4 LT |
188 | be,pn %icc, 3f |
189 | add %o0, 0x8, %o0 | |
190 | ||
a7c5724b | 191 | EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4)) |
1da177e4 LT |
192 | add %o1, 0x8, %o1 |
193 | subcc %g2, 0x8, %g2 | |
194 | faligndata %f6, %f4, %f0 | |
a7c5724b | 195 | EX_ST_FP(STORE(std, %f0, %o0)) |
1da177e4 LT |
196 | bne,pt %icc, 1b |
197 | add %o0, 0x8, %o0 | |
198 | ||
199 | /* Destination is 64-byte aligned. */ | |
200 | 3: | |
201 | membar #LoadStore | #StoreStore | #StoreLoad | |
202 | ||
| /* Carve len into: GLOBAL_SPARE = whole 64-byte blocks, %g3 = 8-byte words
   handled by the FP drain loop, %o2 = final byte remainder; %g1 becomes the
   source cursor for the tail and %g2 the src phase (0..7 doublewords). */ | |
203 | subcc %o2, 0x40, %GLOBAL_SPARE | |
204 | add %o1, %g1, %g1 | |
205 | andncc %GLOBAL_SPARE, (0x40 - 1), %GLOBAL_SPARE | |
206 | srl %g1, 3, %g2 | |
207 | sub %o2, %GLOBAL_SPARE, %g3 | |
208 | andn %o1, (0x40 - 1), %o1 | |
209 | and %g2, 7, %g2 | |
210 | andncc %g3, 0x7, %g3 | |
6f1d827f | 211 | fsrc2 %f0, %f2 |
1da177e4 LT |
212 | sub %g3, 0x8, %g3 |
213 | sub %o2, %GLOBAL_SPARE, %o2 | |
214 | ||
215 | add %g1, %GLOBAL_SPARE, %g1 | |
216 | subcc %o2, %g3, %o2 | |
217 | ||
| /* Prime the three-deep block-load pipeline (%f0, %f16, %f32). */ | |
a7c5724b | 218 | EX_LD_FP(LOAD_BLK(%o1, %f0)) |
1da177e4 LT |
219 | add %o1, 0x40, %o1 |
220 | add %g1, %g3, %g1 | |
a7c5724b | 221 | EX_LD_FP(LOAD_BLK(%o1, %f16)) |
1da177e4 LT |
222 | add %o1, 0x40, %o1 |
223 | sub %GLOBAL_SPARE, 0x80, %GLOBAL_SPARE | |
a7c5724b | 224 | EX_LD_FP(LOAD_BLK(%o1, %f32)) |
1da177e4 LT |
225 | add %o1, 0x40, %o1 |
226 | ||
227 | /* There are 8 instances of the unrolled loop, | |
228 | * one for each possible alignment of the | |
229 | * source buffer. Each loop instance is 452 | |
230 | * bytes. | |
231 | */ | |
| /* %g2 = phase * 452: (8g - g) = 7g, <<4 = 112g, +g = 113g, <<2 = 452g. */ | |
232 | sll %g2, 3, %o3 | |
233 | sub %o3, %g2, %o3 | |
234 | sllx %o3, 4, %o3 | |
235 | add %o3, %g2, %o3 | |
236 | sllx %o3, 2, %g2 | |
| /* PC-relative dispatch into the matching unrolled-loop instance below. */ | |
237 | 1: rd %pc, %o3 | |
238 | add %o3, %lo(1f - 1b), %o3 | |
239 | jmpl %o3 + %g2, %g0 | |
240 | nop | |
241 | ||
| /* Eight unrolled loop instances follow, one per source doubleword phase.
   Each instance MUST remain exactly 452 bytes: the computed jump above
   indexes by that size. Do not add, remove, or reorder instructions here.
   Within each instance the 1:/2:/3: stubs handle the pipeline stopping
   after chunk 1, 2, or 3 respectively, draining the in-flight FP blocks
   and jumping to the matching 40f-63f tail stub. */ | |
242 | .align 64 | |
243 | 1: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) | |
244 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | |
245 | FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) | |
246 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | |
247 | FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) | |
248 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | |
249 | ba,pt %xcc, 1b+4 | |
250 | faligndata %f0, %f2, %f48 | |
251 | 1: FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) | |
b445e26c | 252 | STORE_SYNC(o0, f48) |
1da177e4 | 253 | FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) |
b445e26c | 254 | STORE_JUMP(o0, f48, 40f) |
1da177e4 | 255 | 2: FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) |
b445e26c | 256 | STORE_SYNC(o0, f48) |
1da177e4 | 257 | FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) |
b445e26c | 258 | STORE_JUMP(o0, f48, 48f) |
1da177e4 | 259 | 3: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) |
b445e26c | 260 | STORE_SYNC(o0, f48) |
1da177e4 | 261 | FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) |
b445e26c | 262 | STORE_JUMP(o0, f48, 56f) |
1da177e4 LT |
263 | |
264 | 1: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) | |
265 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | |
266 | FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) | |
267 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | |
268 | FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) | |
269 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | |
270 | ba,pt %xcc, 1b+4 | |
271 | faligndata %f2, %f4, %f48 | |
272 | 1: FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) | |
b445e26c | 273 | STORE_SYNC(o0, f48) |
1da177e4 | 274 | FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) |
b445e26c | 275 | STORE_JUMP(o0, f48, 41f) |
1da177e4 | 276 | 2: FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) |
b445e26c | 277 | STORE_SYNC(o0, f48) |
1da177e4 | 278 | FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) |
b445e26c | 279 | STORE_JUMP(o0, f48, 49f) |
1da177e4 | 280 | 3: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) |
b445e26c | 281 | STORE_SYNC(o0, f48) |
1da177e4 | 282 | FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) |
b445e26c | 283 | STORE_JUMP(o0, f48, 57f) |
1da177e4 LT |
284 | |
285 | 1: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) | |
286 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | |
287 | FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) | |
288 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | |
289 | FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) | |
290 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | |
291 | ba,pt %xcc, 1b+4 | |
292 | faligndata %f4, %f6, %f48 | |
293 | 1: FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) | |
b445e26c | 294 | STORE_SYNC(o0, f48) |
1da177e4 | 295 | FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) |
b445e26c | 296 | STORE_JUMP(o0, f48, 42f) |
1da177e4 | 297 | 2: FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) |
b445e26c | 298 | STORE_SYNC(o0, f48) |
1da177e4 | 299 | FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) |
b445e26c | 300 | STORE_JUMP(o0, f48, 50f) |
1da177e4 | 301 | 3: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) |
b445e26c | 302 | STORE_SYNC(o0, f48) |
1da177e4 | 303 | FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) |
b445e26c | 304 | STORE_JUMP(o0, f48, 58f) |
1da177e4 LT |
305 | |
306 | 1: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) | |
307 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | |
308 | FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) | |
309 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | |
310 | FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) | |
311 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | |
312 | ba,pt %xcc, 1b+4 | |
313 | faligndata %f6, %f8, %f48 | |
314 | 1: FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) | |
b445e26c | 315 | STORE_SYNC(o0, f48) |
1da177e4 | 316 | FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) |
b445e26c | 317 | STORE_JUMP(o0, f48, 43f) |
1da177e4 | 318 | 2: FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) |
b445e26c | 319 | STORE_SYNC(o0, f48) |
1da177e4 | 320 | FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) |
b445e26c | 321 | STORE_JUMP(o0, f48, 51f) |
1da177e4 | 322 | 3: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) |
b445e26c | 323 | STORE_SYNC(o0, f48) |
1da177e4 | 324 | FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) |
b445e26c | 325 | STORE_JUMP(o0, f48, 59f) |
1da177e4 LT |
326 | |
327 | 1: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) | |
328 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | |
329 | FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) | |
330 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | |
331 | FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) | |
332 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | |
333 | ba,pt %xcc, 1b+4 | |
334 | faligndata %f8, %f10, %f48 | |
335 | 1: FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) | |
b445e26c | 336 | STORE_SYNC(o0, f48) |
1da177e4 | 337 | FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) |
b445e26c | 338 | STORE_JUMP(o0, f48, 44f) |
1da177e4 | 339 | 2: FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) |
b445e26c | 340 | STORE_SYNC(o0, f48) |
1da177e4 | 341 | FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) |
b445e26c | 342 | STORE_JUMP(o0, f48, 52f) |
1da177e4 | 343 | 3: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) |
b445e26c | 344 | STORE_SYNC(o0, f48) |
1da177e4 | 345 | FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) |
b445e26c | 346 | STORE_JUMP(o0, f48, 60f) |
1da177e4 LT |
347 | |
348 | 1: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) | |
349 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | |
350 | FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) | |
351 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | |
352 | FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) | |
353 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | |
354 | ba,pt %xcc, 1b+4 | |
355 | faligndata %f10, %f12, %f48 | |
356 | 1: FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) | |
b445e26c | 357 | STORE_SYNC(o0, f48) |
1da177e4 | 358 | FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) |
b445e26c | 359 | STORE_JUMP(o0, f48, 45f) |
1da177e4 | 360 | 2: FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) |
b445e26c | 361 | STORE_SYNC(o0, f48) |
1da177e4 | 362 | FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) |
b445e26c | 363 | STORE_JUMP(o0, f48, 53f) |
1da177e4 | 364 | 3: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) |
b445e26c | 365 | STORE_SYNC(o0, f48) |
1da177e4 | 366 | FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) |
b445e26c | 367 | STORE_JUMP(o0, f48, 61f) |
1da177e4 LT |
368 | |
369 | 1: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) | |
370 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | |
371 | FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) | |
372 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | |
373 | FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) | |
374 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | |
375 | ba,pt %xcc, 1b+4 | |
376 | faligndata %f12, %f14, %f48 | |
377 | 1: FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) | |
b445e26c | 378 | STORE_SYNC(o0, f48) |
1da177e4 | 379 | FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) |
b445e26c | 380 | STORE_JUMP(o0, f48, 46f) |
1da177e4 | 381 | 2: FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) |
b445e26c | 382 | STORE_SYNC(o0, f48) |
1da177e4 | 383 | FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) |
b445e26c | 384 | STORE_JUMP(o0, f48, 54f) |
1da177e4 | 385 | 3: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) |
b445e26c | 386 | STORE_SYNC(o0, f48) |
1da177e4 | 387 | FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) |
b445e26c | 388 | STORE_JUMP(o0, f48, 62f) |
1da177e4 LT |
389 | |
390 | 1: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) | |
391 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | |
392 | FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) | |
393 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | |
394 | FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) | |
395 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | |
396 | ba,pt %xcc, 1b+4 | |
397 | faligndata %f14, %f16, %f48 | |
398 | 1: FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) | |
b445e26c | 399 | STORE_SYNC(o0, f48) |
1da177e4 | 400 | FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) |
b445e26c | 401 | STORE_JUMP(o0, f48, 47f) |
1da177e4 | 402 | 2: FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) |
b445e26c | 403 | STORE_SYNC(o0, f48) |
1da177e4 | 404 | FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) |
b445e26c | 405 | STORE_JUMP(o0, f48, 55f) |
1da177e4 | 406 | 3: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) |
b445e26c | 407 | STORE_SYNC(o0, f48) |
1da177e4 | 408 | FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) |
b445e26c | 409 | STORE_JUMP(o0, f48, 63f) |
1da177e4 LT |
410 | |
| /* Tail stubs 40-63: flush the 8-byte words still held in FP registers
   (%g3 counts them), then fall into the shared 93f/95f finishers. */ | |
411 | 40: FINISH_VISCHUNK(o0, f0, f2, g3) | |
412 | 41: FINISH_VISCHUNK(o0, f2, f4, g3) | |
413 | 42: FINISH_VISCHUNK(o0, f4, f6, g3) | |
414 | 43: FINISH_VISCHUNK(o0, f6, f8, g3) | |
415 | 44: FINISH_VISCHUNK(o0, f8, f10, g3) | |
416 | 45: FINISH_VISCHUNK(o0, f10, f12, g3) | |
417 | 46: FINISH_VISCHUNK(o0, f12, f14, g3) | |
418 | 47: UNEVEN_VISCHUNK(o0, f14, f0, g3) | |
419 | 48: FINISH_VISCHUNK(o0, f16, f18, g3) | |
420 | 49: FINISH_VISCHUNK(o0, f18, f20, g3) | |
421 | 50: FINISH_VISCHUNK(o0, f20, f22, g3) | |
422 | 51: FINISH_VISCHUNK(o0, f22, f24, g3) | |
423 | 52: FINISH_VISCHUNK(o0, f24, f26, g3) | |
424 | 53: FINISH_VISCHUNK(o0, f26, f28, g3) | |
425 | 54: FINISH_VISCHUNK(o0, f28, f30, g3) | |
426 | 55: UNEVEN_VISCHUNK(o0, f30, f0, g3) | |
427 | 56: FINISH_VISCHUNK(o0, f32, f34, g3) | |
428 | 57: FINISH_VISCHUNK(o0, f34, f36, g3) | |
429 | 58: FINISH_VISCHUNK(o0, f36, f38, g3) | |
430 | 59: FINISH_VISCHUNK(o0, f38, f40, g3) | |
431 | 60: FINISH_VISCHUNK(o0, f40, f42, g3) | |
432 | 61: FINISH_VISCHUNK(o0, f42, f44, g3) | |
433 | 62: FINISH_VISCHUNK(o0, f44, f46, g3) | |
434 | 63: UNEVEN_VISCHUNK_LAST(o0, f46, f0, g3) | |
435 | ||
| /* 93: drain the remaining 8-byte words (%g3 = count*8 minus bias) with a
   two-stage faligndata ping-pong between %f0/%f2. */ | |
a7c5724b | 436 | 93: EX_LD_FP(LOAD(ldd, %o1, %f2)) |
1da177e4 LT |
437 | add %o1, 8, %o1 |
438 | subcc %g3, 8, %g3 | |
439 | faligndata %f0, %f2, %f8 | |
a7c5724b | 440 | EX_ST_FP(STORE(std, %f8, %o0)) |
1da177e4 LT |
441 | bl,pn %xcc, 95f |
442 | add %o0, 8, %o0 | |
a7c5724b | 443 | EX_LD_FP(LOAD(ldd, %o1, %f0)) |
1da177e4 LT |
444 | add %o1, 8, %o1 |
445 | subcc %g3, 8, %g3 | |
446 | faligndata %f2, %f0, %f8 | |
a7c5724b | 447 | EX_ST_FP(STORE(std, %f8, %o0)) |
1da177e4 LT |
448 | bge,pt %xcc, 93b |
449 | add %o0, 8, %o0 | |
450 | ||
| /* 95: final byte loop; %o2 is the sub-8-byte remainder and %g1 the saved
   source cursor for it. */ | |
451 | 95: brz,pt %o2, 2f | |
452 | mov %g1, %o1 | |
453 | ||
a7c5724b | 454 | 1: EX_LD_FP(LOAD(ldub, %o1, %o3)) |
1da177e4 LT |
455 | add %o1, 1, %o1 |
456 | subcc %o2, 1, %o2 | |
a7c5724b | 457 | EX_ST_FP(STORE(stb, %o3, %o0)) |
1da177e4 LT |
458 | bne,pt %xcc, 1b |
459 | add %o0, 1, %o0 | |
460 | ||
| /* Order the stores, restore FPU state, and return the original dst (%o4). */ | |
461 | 2: membar #StoreLoad | #StoreStore | |
462 | VISExit | |
463 | retl | |
464 | mov EX_RETVAL(%o4), %o0 | |
465 | ||
466 | .align 64 | |
467 | 70: /* 16 < len <= (5 * 64) */ | |
468 | bne,pn %XCC, 75f | |
469 | sub %o0, %o1, %o3 | |
470 | ||
| /* 72: src and dst both 8-byte aligned: copy 16 bytes per iteration, with
   %o3 holding the dst-src delta so one cursor drives both sides. */ | |
471 | 72: andn %o2, 0xf, %GLOBAL_SPARE | |
472 | and %o2, 0xf, %o2 | |
473 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %o5)) | |
474 | EX_LD(LOAD(ldx, %o1 + 0x08, %g1)) | |
475 | subcc %GLOBAL_SPARE, 0x10, %GLOBAL_SPARE | |
476 | EX_ST(STORE(stx, %o5, %o1 + %o3)) | |
477 | add %o1, 0x8, %o1 | |
478 | EX_ST(STORE(stx, %g1, %o1 + %o3)) | |
479 | bgu,pt %XCC, 1b | |
480 | add %o1, 0x8, %o1 | |
| /* 73: mop up an optional 8-byte then 4-byte remainder; anything smaller
   falls through to the byte loop at 90f. */ | |
481 | 73: andcc %o2, 0x8, %g0 | |
482 | be,pt %XCC, 1f | |
483 | nop | |
484 | EX_LD(LOAD(ldx, %o1, %o5)) | |
485 | sub %o2, 0x8, %o2 | |
486 | EX_ST(STORE(stx, %o5, %o1 + %o3)) | |
487 | add %o1, 0x8, %o1 | |
488 | 1: andcc %o2, 0x4, %g0 | |
489 | be,pt %XCC, 1f | |
490 | nop | |
491 | EX_LD(LOAD(lduw, %o1, %o5)) | |
492 | sub %o2, 0x4, %o2 | |
493 | EX_ST(STORE(stw, %o5, %o1 + %o3)) | |
494 | add %o1, 0x4, %o1 | |
495 | 1: cmp %o2, 0 | |
496 | be,pt %XCC, 85f | |
497 | nop | |
498 | ba,pt %xcc, 90f | |
499 | nop | |
500 | ||
| /* 75: pointers not mutually 8-byte aligned: byte-copy up to 7 bytes so dst
   becomes 8-byte aligned (%g1 = bytes needed). */ | |
501 | 75: andcc %o0, 0x7, %g1 | |
502 | sub %g1, 0x8, %g1 | |
503 | be,pn %icc, 2f | |
504 | sub %g0, %g1, %g1 | |
505 | sub %o2, %g1, %o2 | |
506 | ||
507 | 1: EX_LD(LOAD(ldub, %o1, %o5)) | |
508 | subcc %g1, 1, %g1 | |
509 | EX_ST(STORE(stb, %o5, %o1 + %o3)) | |
510 | bgu,pt %icc, 1b | |
511 | add %o1, 1, %o1 | |
512 | ||
| /* dst is now aligned; if src is too, rejoin the aligned path (72b/73b),
   otherwise use the shifting copy at 8f (%g1 = src misalignment in bits). */ | |
513 | 2: add %o1, %o3, %o0 | |
514 | andcc %o1, 0x7, %g1 | |
515 | bne,pt %icc, 8f | |
516 | sll %g1, 3, %g1 | |
517 | ||
518 | cmp %o2, 16 | |
519 | bgeu,pt %icc, 72b | |
520 | nop | |
521 | ba,a,pt %xcc, 73b | |
522 | ||
| /* 8: misaligned src: read aligned 8-byte words and merge adjacent pairs
   with sllx/srlx (%g1 = left shift, %o3 = 64 - %g1 = right shift). */ | |
523 | 8: mov 64, %o3 | |
524 | andn %o1, 0x7, %o1 | |
525 | EX_LD(LOAD(ldx, %o1, %g2)) | |
526 | sub %o3, %g1, %o3 | |
527 | andn %o2, 0x7, %GLOBAL_SPARE | |
528 | sllx %g2, %g1, %g2 | |
529 | 1: EX_LD(LOAD(ldx, %o1 + 0x8, %g3)) | |
530 | subcc %GLOBAL_SPARE, 0x8, %GLOBAL_SPARE | |
531 | add %o1, 0x8, %o1 | |
532 | srlx %g3, %o3, %o5 | |
533 | or %o5, %g2, %o5 | |
534 | EX_ST(STORE(stx, %o5, %o0)) | |
535 | add %o0, 0x8, %o0 | |
536 | bgu,pt %icc, 1b | |
537 | sllx %g3, %g1, %g2 | |
538 | ||
| /* Convert %g1 back from bits to bytes, then finish any remainder at 90f. */ | |
539 | srl %g1, 3, %g1 | |
540 | andcc %o2, 0x7, %o2 | |
541 | be,pn %icc, 85f | |
542 | add %o1, %g1, %o1 | |
543 | ba,pt %xcc, 90f | |
544 | sub %o0, %o1, %o3 | |
545 | ||
| /* 80: tiny copy (len <= 16): 4-byte word loop when everything is 4-byte
   aligned, else the byte loop at 90f. */ | |
546 | .align 64 | |
547 | 80: /* 0 < len <= 16 */ | |
548 | andcc %o3, 0x3, %g0 | |
549 | bne,pn %XCC, 90f | |
550 | sub %o0, %o1, %o3 | |
551 | ||
552 | 1: EX_LD(LOAD(lduw, %o1, %g1)) | |
553 | subcc %o2, 4, %o2 | |
554 | EX_ST(STORE(stw, %g1, %o1 + %o3)) | |
555 | bgu,pt %XCC, 1b | |
556 | add %o1, 4, %o1 | |
557 | ||
558 | 85: retl | |
559 | mov EX_RETVAL(%o4), %o0 | |
560 | ||
| /* 90: generic byte-at-a-time fallback (%o3 = dst-src delta). */ | |
561 | .align 32 | |
562 | 90: EX_LD(LOAD(ldub, %o1, %g1)) | |
563 | subcc %o2, 1, %o2 | |
564 | EX_ST(STORE(stb, %g1, %o1 + %o3)) | |
565 | bgu,pt %XCC, 90b | |
566 | add %o1, 1, %o1 | |
567 | retl | |
568 | mov EX_RETVAL(%o4), %o0 | |
569 | ||
570 | .size FUNC_NAME, .-FUNC_NAME | |
d3867f04 | 571 | EXPORT_SYMBOL(FUNC_NAME) |