Commit | Line | Data |
---|---|---|
f8561296 VN |
1 | #ifdef CONFIG_KMEMCHECK |
2 | /* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ | |
3 | # include <asm-generic/xor.h> | |
e8f6e3f8 JB |
4 | #elif !defined(_ASM_X86_XOR_H) |
5 | #define _ASM_X86_XOR_H | |
6 | ||
7 | /* | |
8 | * Optimized RAID-5 checksumming functions for SSE. | |
9 | * | |
10 | * This program is free software; you can redistribute it and/or modify | |
11 | * it under the terms of the GNU General Public License as published by | |
12 | * the Free Software Foundation; either version 2, or (at your option) | |
13 | * any later version. | |
14 | * | |
15 | * You should have received a copy of the GNU General Public License | |
16 | * (for example /usr/src/linux/COPYING); if not, write to the Free | |
17 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
18 | */ | |
19 | ||
20 | /* | |
21 | * Cache avoiding checksumming functions utilizing KNI instructions | |
22 | * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) | |
23 | */ | |
24 | ||
25 | /* | |
26 | * Based on | |
27 | * High-speed RAID5 checksumming functions utilizing SSE instructions. | |
28 | * Copyright (C) 1998 Ingo Molnar. | |
29 | */ | |
30 | ||
31 | /* | |
32 | * x86-64 changes / gcc fixes from Andi Kleen. | |
33 | * Copyright 2002 Andi Kleen, SuSE Labs. | |
34 | * | |
35 | * This hasn't been optimized for the hammer yet, but there are likely | |
36 | * no advantages to be gotten from x86-64 here anyways. | |
37 | */ | |
38 | ||
df6b35f4 | 39 | #include <asm/fpu/api.h> |
e8f6e3f8 JB |
40 | |
#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

/*
 * Inline-asm fragment builders.  Each expands to one instruction string
 * operating on a 16-byte (one xmm register wide) slot of the 256-byte
 * line currently being processed: 'x' is the slot index within the
 * line, 'y' the xmm register number (0-3).
 */
#define OFFS(x)		"16*("#x")"
/* Prefetch target: same slot, but one full 256-byte line ahead. */
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		" prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define LD(x, y)	" movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x, y)	" movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x)		" prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x)		" prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x)		" prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x)		" prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define XO1(x, y)	" xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y)	" xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y)	" xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y)	" xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
/* Placeholder "op" for BLK64 users that need no per-block prefetch. */
#define NOP(x)

/*
 * One 64-byte sub-block for the *_pf64 variants: a single prefetch for
 * the block, then the same operation applied to its four 16-byte slots
 * in xmm0-xmm3.
 */
#define BLK64(pf, op, i)	\
	pf(i)			\
	op(i, 0)		\
	op(i + 1, 1)		\
	op(i + 2, 2)		\
	op(i + 3, 3)
e8f6e3f8 JB |
69 | |
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	/*
	 * p1[i] ^= p2[i] over 'bytes' bytes, one 256-byte line per loop
	 * iteration, 64 bytes at a time in xmm0-xmm3.
	 *
	 * NOTE(review): the loop uses dec/jnz, so lines == 0
	 * (bytes < 256) would wrap the counter, and any tail below 256
	 * bytes is ignored -- presumably callers always pass a non-zero
	 * multiple of 256; confirm against the xor-template users.
	 */
	unsigned long lines = bytes >> 8;

	/* xmm registers hold user FPU state; bracket all SSE use. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	/* Kick off prefetch of p1 before entering the loop. */
	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	/* Four 64-byte blocks = one full 256-byte line per iteration. */
	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	/* Advance both pointers one line and loop until done. */
	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
120 | ||
f317820c JB |
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	/*
	 * Same p1 ^= p2 operation as xor_sse_2(), but with a simpler
	 * prefetch schedule: one prefetchnta per 64-byte sub-block (see
	 * BLK64) instead of the hand-pipelined pattern above.  Same
	 * caller contract: bytes is treated as whole 256-byte lines.
	 */
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(NOP, ST, i)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
154 | ||
e8f6e3f8 JB |
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	/*
	 * p1[i] ^= p2[i] ^ p3[i], one 256-byte line per loop iteration.
	 * Loads/stores go through p1; p2 and p3 are XORed in from memory
	 * (xorps with a memory source operand).
	 */
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	/* Prime the p1 prefetch stream before the first iteration. */
	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
213 | ||
f317820c JB |
static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3)
{
	/*
	 * Three-buffer XOR (p1 ^= p2 ^ p3) with the per-64-byte-block
	 * prefetch schedule; see xor_sse_3() for the pipelined variant.
	 */
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(PF2, XO2, i)	\
	BLK64(NOP, ST, i)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
250 | ||
e8f6e3f8 JB |
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	/*
	 * p1[i] ^= p2[i] ^ p3[i] ^ p4[i], one 256-byte line per loop
	 * iteration; prefetches for each source are interleaved with the
	 * XOR work on the previous source.
	 */
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	/* Prime the p1 prefetch stream before the first iteration. */
	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
316 | ||
f317820c JB |
static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4)
{
	/*
	 * Four-buffer XOR (p1 ^= p2 ^ p3 ^ p4) with the per-64-byte-block
	 * prefetch schedule; see xor_sse_4() for the pipelined variant.
	 */
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(PF2, XO2, i)	\
	BLK64(PF3, XO3, i)	\
	BLK64(NOP, ST, i)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
355 | ||
e8f6e3f8 JB |
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	/*
	 * p1[i] ^= p2[i] ^ p3[i] ^ p4[i] ^ p5[i], one 256-byte line per
	 * loop iteration -- the widest variant, XORing four sources into
	 * the destination.
	 */
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	PF4(i)			\
	PF4(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	XO4(i, 0)		\
	XO4(i + 1, 1)		\
	XO4(i + 2, 2)		\
	XO4(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	/* Prime the p1 prefetch stream before the first iteration. */
	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" add %[inc], %[p5] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
428 | ||
f317820c JB |
static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	/*
	 * Five-buffer XOR (p1 ^= p2 ^ p3 ^ p4 ^ p5) with the
	 * per-64-byte-block prefetch schedule; see xor_sse_5() for the
	 * pipelined variant.
	 */
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(PF2, XO2, i)	\
	BLK64(PF3, XO3, i)	\
	BLK64(PF4, XO4, i)	\
	BLK64(NOP, ST, i)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" add %[inc], %[p5] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
469 | ||
/*
 * XOR template wiring up the simple-prefetch (*_pf64) routines for
 * 2..5 input buffers, identified as "prefetch64-sse" to whatever
 * selects among xor_block_template entries.
 */
static struct xor_block_template xor_block_sse_pf64 = {
	.name = "prefetch64-sse",
	.do_2 = xor_sse_2_pf64,
	.do_3 = xor_sse_3_pf64,
	.do_4 = xor_sse_4_pf64,
	.do_5 = xor_sse_5_pf64,
};
477 | ||
e8f6e3f8 JB |
/* Tear down the helper macros so they do not leak to header users. */
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

/* Pull in the word-width specific template lists. */
#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

/*
 * Prefer an AVX implementation over FASTEST when one is usable.
 * NOTE(review): AVX_SELECT is not defined in this file -- presumably it
 * comes from one of the headers included above; verify before relying
 * on it here.
 */
#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(FASTEST)

#endif /* _ASM_X86_XOR_H */