Commit | Line | Data |
---|---|---|
1da177e4 | 1 | /* |
1da177e4 LT |
2 | * Optimized RAID-5 checksumming functions for MMX and SSE. |
3 | * | |
4 | * This program is free software; you can redistribute it and/or modify | |
5 | * it under the terms of the GNU General Public License as published by | |
6 | * the Free Software Foundation; either version 2, or (at your option) | |
7 | * any later version. | |
8 | * | |
9 | * You should have received a copy of the GNU General Public License | |
10 | * (for example /usr/src/linux/COPYING); if not, write to the Free | |
11 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
12 | */ | |
13 | ||
14 | /* | |
15 | * High-speed RAID5 checksumming functions utilizing MMX instructions. | |
16 | * Copyright (C) 1998 Ingo Molnar. | |
17 | */ | |
18 | ||
/*
 * One-instruction helper macros for the MMX loops below.
 * 'x' is a quadword (8-byte) index into the current chunk, 'y' is an
 * MMX register number.  Asm operand %1 is the source/destination
 * buffer p1; %2..%5 are the extra source buffers p2..p5.
 */
#define LD(x, y)	" movq 8*("#x")(%1), %%mm"#y" ;\n"	/* mmY  = p1[x]  */
#define ST(x, y)	" movq %%mm"#y", 8*("#x")(%1) ;\n"	/* p1[x] = mmY   */
#define XO1(x, y)	" pxor 8*("#x")(%2), %%mm"#y" ;\n"	/* mmY ^= p2[x]  */
#define XO2(x, y)	" pxor 8*("#x")(%3), %%mm"#y" ;\n"	/* mmY ^= p3[x]  */
#define XO3(x, y)	" pxor 8*("#x")(%4), %%mm"#y" ;\n"	/* mmY ^= p4[x]  */
#define XO4(x, y)	" pxor 8*("#x")(%5), %%mm"#y" ;\n"	/* mmY ^= p5[x]  */

#include <asm/i387.h>	/* kernel_fpu_begin()/kernel_fpu_end() */
27 | ||
/*
 * p1 ^= p2 over 'bytes' bytes using MMX registers mm0-mm3.
 * Consumes 128 bytes (16 quadwords) per loop iteration, so the caller
 * must pass a 'bytes' that is a multiple of 128.  Pentium II tuned.
 */
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 7;	/* number of 128-byte chunks */

	kernel_fpu_begin();	/* we clobber the FPU/MMX register state */

	asm volatile(
	/* BLOCK(i): xor 4 quadwords of p2 into p1, starting at quadword i */
#undef BLOCK
#define BLOCK(i) \
	LD(i, 0) \
	LD(i + 1, 1) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	XO1(i, 0) \
	ST(i, 0) \
	XO1(i+1, 1) \
	ST(i+1, 1) \
	XO1(i + 2, 2) \
	ST(i + 2, 2) \
	XO1(i + 3, 3) \
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"	/* advance both buffers one chunk */
	" addl $128, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}
70 | ||
/*
 * p1 ^= p2 ^ p3 over 'bytes' bytes (MMX, Pentium II tuned).
 * 128 bytes per loop iteration; 'bytes' must be a multiple of 128.
 */
static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3)
{
	unsigned long lines = bytes >> 7;	/* number of 128-byte chunks */

	kernel_fpu_begin();	/* we clobber the FPU/MMX register state */

	asm volatile(
	/* BLOCK(i): p1[i..i+3] ^= p2[i..i+3] ^ p3[i..i+3] (quadwords) */
#undef BLOCK
#define BLOCK(i) \
	LD(i, 0) \
	LD(i + 1, 1) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	XO1(i, 0) \
	XO1(i + 1, 1) \
	XO1(i + 2, 2) \
	XO1(i + 3, 3) \
	XO2(i, 0) \
	ST(i, 0) \
	XO2(i + 1, 1) \
	ST(i + 1, 1) \
	XO2(i + 2, 2) \
	ST(i + 2, 2) \
	XO2(i + 3, 3) \
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"	/* advance all three buffers one chunk */
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	kernel_fpu_end();
}
119 | ||
/*
 * p1 ^= p2 ^ p3 ^ p4 over 'bytes' bytes (MMX, Pentium II tuned).
 * 128 bytes per loop iteration; 'bytes' must be a multiple of 128.
 */
static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 7;	/* number of 128-byte chunks */

	kernel_fpu_begin();	/* we clobber the FPU/MMX register state */

	asm volatile(
	/* BLOCK(i): p1[i..i+3] ^= p2 ^ p3 ^ p4 at quadwords i..i+3 */
#undef BLOCK
#define BLOCK(i) \
	LD(i, 0) \
	LD(i + 1, 1) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	XO1(i, 0) \
	XO1(i + 1, 1) \
	XO1(i + 2, 2) \
	XO1(i + 3, 3) \
	XO2(i, 0) \
	XO2(i + 1, 1) \
	XO2(i + 2, 2) \
	XO2(i + 3, 3) \
	XO3(i, 0) \
	ST(i, 0) \
	XO3(i + 1, 1) \
	ST(i + 1, 1) \
	XO3(i + 2, 2) \
	ST(i + 2, 2) \
	XO3(i + 3, 3) \
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"	/* advance all four buffers one chunk */
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" addl $128, %4 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}
173 | ||
174 | ||
/*
 * p1 ^= p2 ^ p3 ^ p4 ^ p5 over 'bytes' bytes (MMX, Pentium II tuned).
 * 128 bytes per loop iteration; 'bytes' must be a multiple of 128.
 *
 * Note: p4 and p5 are modified by the asm loop but cannot be listed as
 * read/write operands (see the comment below), hence the explicit
 * compiler barriers around the main asm statement.
 */
static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 7;	/* number of 128-byte chunks */

	kernel_fpu_begin();	/* we clobber the FPU/MMX register state */

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1.  */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
	/* BLOCK(i): p1[i..i+3] ^= p2 ^ p3 ^ p4 ^ p5 at quadwords i..i+3 */
#undef BLOCK
#define BLOCK(i) \
	LD(i, 0) \
	LD(i + 1, 1) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	XO1(i, 0) \
	XO1(i + 1, 1) \
	XO1(i + 2, 2) \
	XO1(i + 3, 3) \
	XO2(i, 0) \
	XO2(i + 1, 1) \
	XO2(i + 2, 2) \
	XO2(i + 3, 3) \
	XO3(i, 0) \
	XO3(i + 1, 1) \
	XO3(i + 2, 2) \
	XO3(i + 3, 3) \
	XO4(i, 0) \
	ST(i, 0) \
	XO4(i + 1, 1) \
	ST(i + 1, 1) \
	XO4(i + 2, 2) \
	ST(i + 2, 2) \
	XO4(i + 3, 3) \
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"	/* advance all five buffers one chunk */
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" addl $128, %4 ;\n"
	" addl $128, %5 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value.  */
	asm("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}
246 | ||
/* The pII_mmx routines are done with these helpers; drop them so the
 * SSE section below can redefine LD/ST/XO* with xmm semantics. */
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK
254 | ||
/*
 * p1 ^= p2 over 'bytes' bytes, hand-scheduled MMX for the original
 * Pentium ("p5").  Consumes 64 bytes (8 quadwords, mm0-mm7) per loop
 * iteration, so 'bytes' must be a multiple of 64.  Loads, xors and
 * stores are interleaved to pair on the P5 pipelines.
 */
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 6;	/* number of 64-byte chunks */

	kernel_fpu_begin();	/* we clobber the FPU/MMX register state */

	asm volatile(
	" .align 32 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"	/* advance both buffers one chunk */
	" addl $64, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}
301 | ||
/*
 * p1 ^= p2 ^ p3 over 'bytes' bytes, hand-scheduled MMX for the
 * original Pentium ("p5").  64 bytes per loop iteration; 'bytes'
 * must be a multiple of 64.  Instruction order is tuned for P5
 * pairing - do not reorder.
 */
static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3)
{
	unsigned long lines = bytes >> 6;	/* number of 64-byte chunks */

	kernel_fpu_begin();	/* we clobber the FPU/MMX register state */

	asm volatile(
	" .align 32,0x90 ;\n"	/* pad with NOPs (0x90) */
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"	/* advance all three buffers one chunk */
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory" );

	kernel_fpu_end();
}
358 | ||
/*
 * p1 ^= p2 ^ p3 ^ p4 over 'bytes' bytes, hand-scheduled MMX for the
 * original Pentium ("p5").  64 bytes per loop iteration; 'bytes'
 * must be a multiple of 64.  Instruction order is tuned for P5
 * pairing - do not reorder.
 */
static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 6;	/* number of 64-byte chunks */

	kernel_fpu_begin();	/* we clobber the FPU/MMX register state */

	asm volatile(
	" .align 32,0x90 ;\n"	/* pad with NOPs (0x90) */
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor (%4), %%mm0 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" pxor 8(%4), %%mm1 ;\n"
	" movq %%mm0, (%1) ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" pxor 16(%4), %%mm2 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 24(%4), %%mm3 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" pxor 32(%4), %%mm4 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" pxor 40(%4), %%mm5 ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%4), %%mm6 ;\n"
	" pxor 56(%4), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"	/* advance all four buffers one chunk */
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" addl $64, %4 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}
424 | ||
/*
 * p1 ^= p2 ^ p3 ^ p4 ^ p5 over 'bytes' bytes, hand-scheduled MMX for
 * the original Pentium ("p5").  64 bytes per loop iteration; 'bytes'
 * must be a multiple of 64.  Instruction order is tuned for P5
 * pairing - do not reorder.
 *
 * p4 and p5 are advanced by the asm loop but cannot be read/write
 * operands (see the comment below), hence the barriers around it.
 */
static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 6;	/* number of 64-byte chunks */

	kernel_fpu_begin();	/* we clobber the FPU/MMX register state */

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1.  */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
	" .align 32,0x90 ;\n"	/* pad with NOPs (0x90) */
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" pxor (%4), %%mm0 ;\n"
	" pxor 8(%4), %%mm1 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" pxor (%5), %%mm0 ;\n"
	" pxor 8(%5), %%mm1 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 16(%4), %%mm2 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" pxor 16(%5), %%mm2 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 24(%4), %%mm3 ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 24(%5), %%mm3 ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%4), %%mm4 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" pxor 32(%5), %%mm4 ;\n"
	" pxor 40(%4), %%mm5 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" pxor 40(%5), %%mm5 ;\n"
	" pxor 48(%4), %%mm6 ;\n"
	" pxor 56(%4), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%5), %%mm6 ;\n"
	" pxor 56(%5), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"	/* advance all five buffers one chunk */
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" addl $64, %4 ;\n"
	" addl $64, %5 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value.  */
	asm("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}
512 | ||
/* Template exporting the Pentium II tuned MMX routines above. */
static struct xor_block_template xor_block_pII_mmx = {
	.name = "pII_mmx",
	.do_2 = xor_pII_mmx_2,
	.do_3 = xor_pII_mmx_3,
	.do_4 = xor_pII_mmx_4,
	.do_5 = xor_pII_mmx_5,
};
520 | ||
/* Template exporting the original-Pentium tuned MMX routines above. */
static struct xor_block_template xor_block_p5_mmx = {
	.name = "p5_mmx",
	.do_2 = xor_p5_mmx_2,
	.do_3 = xor_p5_mmx_3,
	.do_4 = xor_p5_mmx_4,
	.do_5 = xor_p5_mmx_5,
};
528 | ||
529 | /* | |
530 | * Cache avoiding checksumming functions utilizing KNI instructions | |
531 | * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) | |
532 | */ | |
533 | ||
/*
 * Spill xmm0-xmm3 to the caller's 'xmm_save' area and take over the
 * SSE unit: disables preemption, saves CR0 into the caller's 'cr0'
 * and clears CR0.TS so SSE insns do not fault.  Must be paired with
 * XMMS_RESTORE in the same function; needs 'xmm_save' (64 bytes,
 * 16-byte aligned) and 'cr0' in scope.
 */
#define XMMS_SAVE				\
do {						\
	preempt_disable();			\
	cr0 = read_cr0();			\
	clts();					\
	asm volatile(				\
		"movups %%xmm0,(%0)	;\n\t"	\
		"movups %%xmm1,0x10(%0)	;\n\t"	\
		"movups %%xmm2,0x20(%0)	;\n\t"	\
		"movups %%xmm3,0x30(%0)	;\n\t"	\
		:				\
		: "r" (xmm_save)		\
		: "memory");			\
} while (0)
1da177e4 | 548 | |
/*
 * Counterpart of XMMS_SAVE: drains the non-temporal stores (sfence),
 * reloads xmm0-xmm3 from 'xmm_save', restores the saved CR0 (and with
 * it the original TS bit) and re-enables preemption.
 */
#define XMMS_RESTORE				\
do {						\
	asm volatile(				\
		"sfence			;\n\t"	\
		"movups (%0),%%xmm0	;\n\t"	\
		"movups 0x10(%0),%%xmm1	;\n\t"	\
		"movups 0x20(%0),%%xmm2	;\n\t"	\
		"movups 0x30(%0),%%xmm3	;\n\t"	\
		:				\
		: "r" (xmm_save)		\
		: "memory");			\
	write_cr0(cr0);				\
	preempt_enable();			\
} while (0)
1da177e4 LT |
563 | |
/* 16-byte alignment attribute for the on-stack xmm save area. */
#define ALIGN16 __attribute__((aligned(16)))

/*
 * SSE variants of the one-instruction helpers: 'x' is now a 16-byte
 * (xmmword) index, 'y' an %xmm register number.  PFn prefetches 256
 * bytes ahead in buffer n with prefetchnta to avoid polluting the
 * caches; %1 is the destination p1, %2..%6 the sources p2..p6.
 */
#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		" prefetchnta "PF_OFFS(x)"(%1) ;\n"
#define LD(x, y)	" movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
#define ST(x, y)	" movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
#define PF1(x)		" prefetchnta "PF_OFFS(x)"(%2) ;\n"
#define PF2(x)		" prefetchnta "PF_OFFS(x)"(%3) ;\n"
#define PF3(x)		" prefetchnta "PF_OFFS(x)"(%4) ;\n"
#define PF4(x)		" prefetchnta "PF_OFFS(x)"(%5) ;\n"
#define PF5(x)		" prefetchnta "PF_OFFS(x)"(%6) ;\n"
#define XO1(x, y)	" xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
#define XO2(x, y)	" xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
#define XO3(x, y)	" xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
#define XO4(x, y)	" xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
#define XO5(x, y)	" xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
1da177e4 LT |
581 | |
582 | ||
/*
 * p1 ^= p2 over 'bytes' bytes using SSE (xmm0-xmm3) with prefetchnta
 * so sources stream past the caches.  Consumes 256 bytes per loop
 * iteration; 'bytes' must be a multiple of 256 and the buffers
 * 16-byte aligned (movaps).
 */
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte chunks */
	char xmm_save[16*4] ALIGN16;		/* spill area for xmm0-xmm3 */
	int cr0;

	XMMS_SAVE;

	asm volatile(
	/* BLOCK(i): xor 4 xmmwords of p2 into p1, prefetching ahead */
#undef BLOCK
#define BLOCK(i) \
	LD(i, 0) \
	LD(i + 1, 1) \
	PF1(i) \
	PF1(i + 2) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	PF0(i + 4) \
	PF0(i + 6) \
	XO1(i, 0) \
	XO1(i + 1, 1) \
	XO1(i + 2, 2) \
	XO1(i + 3, 3) \
	ST(i, 0) \
	ST(i + 1, 1) \
	ST(i + 2, 2) \
	ST(i + 3, 3) \


	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $256, %1 ;\n"	/* advance both buffers one chunk */
	" addl $256, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	XMMS_RESTORE;
}
635 | ||
/*
 * p1 ^= p2 ^ p3 over 'bytes' bytes using SSE with prefetchnta.
 * 256 bytes per loop iteration; 'bytes' must be a multiple of 256
 * and the buffers 16-byte aligned (movaps).
 */
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte chunks */
	char xmm_save[16*4] ALIGN16;		/* spill area for xmm0-xmm3 */
	int cr0;

	XMMS_SAVE;

	asm volatile(
	/* BLOCK(i): p1[i..i+3] ^= p2 ^ p3 (xmmwords), prefetching ahead */
#undef BLOCK
#define BLOCK(i) \
	PF1(i) \
	PF1(i + 2) \
	LD(i,0) \
	LD(i + 1, 1) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	PF2(i) \
	PF2(i + 2) \
	PF0(i + 4) \
	PF0(i + 6) \
	XO1(i,0) \
	XO1(i + 1, 1) \
	XO1(i + 2, 2) \
	XO1(i + 3, 3) \
	XO2(i,0) \
	XO2(i + 1, 1) \
	XO2(i + 2, 2) \
	XO2(i + 3, 3) \
	ST(i,0) \
	ST(i + 1, 1) \
	ST(i + 2, 2) \
	ST(i + 3, 3) \


	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $256, %1 ;\n"	/* advance all three buffers one chunk */
	" addl $256, %2 ;\n"
	" addl $256, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r"(p2), "+r"(p3)
	:
	: "memory" );

	XMMS_RESTORE;
}
696 | ||
/*
 * p1 ^= p2 ^ p3 ^ p4 over 'bytes' bytes using SSE with prefetchnta.
 * 256 bytes per loop iteration; 'bytes' must be a multiple of 256
 * and the buffers 16-byte aligned (movaps).
 */
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte chunks */
	char xmm_save[16*4] ALIGN16;		/* spill area for xmm0-xmm3 */
	int cr0;

	XMMS_SAVE;

	asm volatile(
	/* BLOCK(i): p1[i..i+3] ^= p2 ^ p3 ^ p4 (xmmwords), prefetching ahead */
#undef BLOCK
#define BLOCK(i) \
	PF1(i) \
	PF1(i + 2) \
	LD(i,0) \
	LD(i + 1, 1) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	PF2(i) \
	PF2(i + 2) \
	XO1(i,0) \
	XO1(i + 1, 1) \
	XO1(i + 2, 2) \
	XO1(i + 3, 3) \
	PF3(i) \
	PF3(i + 2) \
	PF0(i + 4) \
	PF0(i + 6) \
	XO2(i,0) \
	XO2(i + 1, 1) \
	XO2(i + 2, 2) \
	XO2(i + 3, 3) \
	XO3(i,0) \
	XO3(i + 1, 1) \
	XO3(i + 2, 2) \
	XO3(i + 3, 3) \
	ST(i,0) \
	ST(i + 1, 1) \
	ST(i + 2, 2) \
	ST(i + 3, 3) \


	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $256, %1 ;\n"	/* advance all four buffers one chunk */
	" addl $256, %2 ;\n"
	" addl $256, %3 ;\n"
	" addl $256, %4 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory" );

	XMMS_RESTORE;
}
764 | ||
/*
 * p1 ^= p2 ^ p3 ^ p4 ^ p5 over 'bytes' bytes using SSE with
 * prefetchnta.  256 bytes per loop iteration; 'bytes' must be a
 * multiple of 256 and the buffers 16-byte aligned (movaps).
 *
 * p4 and p5 are advanced by the asm loop but cannot be read/write
 * operands (see the comment below), hence the barriers around it.
 */
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte chunks */
	char xmm_save[16*4] ALIGN16;		/* spill area for xmm0-xmm3 */
	int cr0;

	XMMS_SAVE;

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1.  */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
	/* BLOCK(i): p1[i..i+3] ^= p2 ^ p3 ^ p4 ^ p5, prefetching ahead */
#undef BLOCK
#define BLOCK(i) \
	PF1(i) \
	PF1(i + 2) \
	LD(i,0) \
	LD(i + 1, 1) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	PF2(i) \
	PF2(i + 2) \
	XO1(i,0) \
	XO1(i + 1, 1) \
	XO1(i + 2, 2) \
	XO1(i + 3, 3) \
	PF3(i) \
	PF3(i + 2) \
	XO2(i,0) \
	XO2(i + 1, 1) \
	XO2(i + 2, 2) \
	XO2(i + 3, 3) \
	PF4(i) \
	PF4(i + 2) \
	PF0(i + 4) \
	PF0(i + 6) \
	XO3(i,0) \
	XO3(i + 1, 1) \
	XO3(i + 2, 2) \
	XO3(i + 3, 3) \
	XO4(i,0) \
	XO4(i + 1, 1) \
	XO4(i + 2, 2) \
	XO4(i + 3, 3) \
	ST(i,0) \
	ST(i + 1, 1) \
	ST(i + 2, 2) \
	ST(i + 3, 3) \


	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $256, %1 ;\n"	/* advance all five buffers one chunk */
	" addl $256, %2 ;\n"
	" addl $256, %3 ;\n"
	" addl $256, %4 ;\n"
	" addl $256, %5 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value.  */
	asm("" : "=r" (p4), "=r" (p5));

	XMMS_RESTORE;
}
852 | ||
/* Template exporting the cache-avoiding SSE routines above. */
static struct xor_block_template xor_block_pIII_sse = {
	.name = "pIII_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};
860 | ||
861 | /* Also try the generic routines. */ | |
862 | #include <asm-generic/xor.h> | |
863 | ||
/*
 * Benchmark every template this CPU supports: the generic register
 * variants always, the SSE one only with XMM support, the two MMX
 * ones only with MMX support.
 */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES				\
do {							\
	xor_speed(&xor_block_8regs);			\
	xor_speed(&xor_block_8regs_p);			\
	xor_speed(&xor_block_32regs);			\
	xor_speed(&xor_block_32regs_p);			\
	if (cpu_has_xmm)				\
		xor_speed(&xor_block_pIII_sse);		\
	if (cpu_has_mmx) {				\
		xor_speed(&xor_block_pII_mmx);		\
		xor_speed(&xor_block_p5_mmx);		\
	}						\
} while (0)
1da177e4 LT |
878 | |
/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched.  */
#define XOR_SELECT_TEMPLATE(FASTEST)			\
	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)