#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */

#ifdef CONFIG_AS_AVX

#include <linux/compiler.h>
#include <asm/i387.h>

#define ALIGN32 __aligned(32)

#define YMM_SAVED_REGS 4

#define YMMS_SAVE \
do { \
	preempt_disable(); \
	cr0 = read_cr0(); \
	clts(); \
	asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
	asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
	asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
	asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
} while (0);

#define YMMS_RESTORE \
do { \
	asm volatile("sfence" : : : "memory"); \
	asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
	asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
	asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
	asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
	write_cr0(cr0); \
	preempt_enable(); \
} while (0);

#define BLOCK4(i) \
	BLOCK(32 * i, 0) \
	BLOCK(32 * (i + 1), 1) \
	BLOCK(32 * (i + 2), 2) \
	BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
	BLOCK4(0) \
	BLOCK4(4) \
	BLOCK4(8) \
	BLOCK4(12)

static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
{
	unsigned long cr0, lines = bytes >> 9;
	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

	YMMS_SAVE

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	YMMS_RESTORE
}

static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2)
{
	unsigned long cr0, lines = bytes >> 9;
	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

	YMMS_SAVE

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	YMMS_RESTORE
}

static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	unsigned long cr0, lines = bytes >> 9;
	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

	YMMS_SAVE

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	YMMS_RESTORE
}

static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	unsigned long cr0, lines = bytes >> 9;
	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

	YMMS_SAVE

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	YMMS_RESTORE
}

static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};

#define AVX_XOR_SPEED \
do { \
	if (cpu_has_avx) \
		xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
	(cpu_has_avx ? &xor_block_avx : FASTEST)

#else

#define AVX_XOR_SPEED {}

#define AVX_SELECT(FASTEST) (FASTEST)

#endif
#endif