Commit | Line | Data |
---|---|---|
20f1b1f1 MS |
1 | /* |
2 | * Fast SHA-1 implementation for SPE instruction set (PPC) | |
3 | * | |
4 | * This code makes use of the SPE SIMD instruction set as defined in | |
5 | * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf | |
6 | * Implementation is based on optimization guide notes from | |
7 | * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf | |
8 | * | |
9 | * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> | |
10 | * | |
11 | * This program is free software; you can redistribute it and/or modify it | |
12 | * under the terms of the GNU General Public License as published by the Free | |
13 | * Software Foundation; either version 2 of the License, or (at your option) | |
14 | * any later version. | |
15 | * | |
16 | */ | |
17 | ||
18 | #include <asm/ppc_asm.h> | |
19 | #include <asm/asm-offsets.h> | |
20 | ||
21 | #define rHP r3 /* pointer to hash value (five 32 bit words) */ | |
22 | #define rWP r4 /* pointer to input, consumed 64 bytes per block */ | |
23 | #define rKP r5 /* pointer to constants; on entry r5 is the loop count, which is moved to CTR before r5 is reused */ | |
24 | ||
25 | #define rW0 r14 /* 64 bit round words: two 32 bit W values each */ | |
26 | #define rW1 r15 /* r14-r21 are spilled to the stack by INITIALIZE */ | |
27 | #define rW2 r16 | |
28 | #define rW3 r17 | |
29 | #define rW4 r18 | |
30 | #define rW5 r19 | |
31 | #define rW6 r20 | |
32 | #define rW7 r21 | |
33 | ||
34 | #define rH0 r6 /* 32 bit hash values, used as the round working variables */ | |
35 | #define rH1 r7 | |
36 | #define rH2 r8 | |
37 | #define rH3 r9 | |
38 | #define rH4 r10 | |
39 | ||
40 | #define rT0 r22 /* 64 bit temporary (spilled by INITIALIZE) */ | |
41 | #define rT1 r0 /* 32 bit temporaries */ | |
42 | #define rT2 r11 | |
43 | #define rT3 r12 /* also scratch for two input words and the old H0 */ | |
44 | ||
45 | #define rK r23 /* 64 bit round constant; r23 is non volatile, so INITIALIZE/FINALIZE save and restore it */ | |
46 | ||
47 | #define LOAD_K01 /* k = 0: expands to nothing, rK keeps its value */ | |
48 | ||
49 | #define LOAD_K11 /* k = 1: constant for rounds 0-19 */ \ | |
50 | evlwwsplat rK,0(rKP); /* rK = first table word in both halves */ | |
51 | ||
52 | #define LOAD_K21 /* k = 2: constant for rounds 20-39 */ \ | |
53 | evlwwsplat rK,4(rKP); /* rK = second table word in both halves */ | |
54 | ||
55 | #define LOAD_K31 /* k = 3: constant for rounds 40-59 */ \ | |
56 | evlwwsplat rK,8(rKP); /* rK = third table word in both halves */ | |
57 | ||
58 | #define LOAD_K41 /* k = 4: constant for rounds 60-79 */ \ | |
59 | evlwwsplat rK,12(rKP); /* rK = fourth table word in both halves */ | |
60 | ||
61 | #define INITIALIZE /* prologue: 128 byte frame, spill r14-r23 as 64 bit values */ \ | |
62 | stwu r1,-128(r1); /* open the frame before anything is stored in it */ \ | |
63 | evstdw r23,80(r1); /* rK: non volatile registers are */ \ | |
64 | evstdw r22,72(r1); /* rT0: stored with their SPE half so */ \ | |
65 | evstdw r21,64(r1); /* rW7: FINALIZE can bring back all */ \ | |
66 | evstdw r20,56(r1); /* rW6: 64 bits of each one */ \ | |
67 | evstdw r19,48(r1); /* rW5 */ \ | |
68 | evstdw r18,40(r1); /* rW4 */ \ | |
69 | evstdw r17,32(r1); /* rW3 */ \ | |
70 | evstdw r16,24(r1); /* rW2 */ \ | |
71 | evstdw r15,16(r1); /* rW1 */ \ | |
72 | evstdw r14,8(r1); /* rW0 */ | |
73 | ||
74 | ||
75 | #define FINALIZE /* epilogue: reload r14-r23, scrub their spill slots, drop the frame */ \ | |
76 | li r0,0; /* scrub value */ \ | |
77 | evldw r14,8(r1); /* reload the full 64 bit register ... */ \ | |
78 | stw r0,8(r1); /* ... then wipe the first word of its slot */ \ | |
79 | evldw r15,16(r1); /* The slots may hold SPE state pushed */ \ | |
80 | stw r0,16(r1); /* from another context that runs the */ \ | |
81 | evldw r16,24(r1); /* same code, so it must not linger on */ \ | |
82 | stw r0,24(r1); /* the stack. The lower GPR parts are */ \ | |
83 | evldw r17,32(r1); /* assumed to have been overwritten on */ \ | |
84 | stw r0,32(r1); /* the way down to here already. */ \ | |
85 | evldw r18,40(r1); \ | |
86 | stw r0,40(r1); \ | |
87 | evldw r19,48(r1); \ | |
88 | stw r0,48(r1); \ | |
89 | evldw r20,56(r1); \ | |
90 | stw r0,56(r1); \ | |
91 | evldw r21,64(r1); \ | |
92 | stw r0,64(r1); \ | |
93 | evldw r22,72(r1); \ | |
94 | stw r0,72(r1); \ | |
95 | evldw r23,80(r1); \ | |
96 | stw r0,80(r1); \ | |
97 | addi r1,r1,128; /* release the 128 byte frame */ | |
98 | ||
99 | #ifndef __BIG_ENDIAN__ /* little endian: input words are byte-reversed on load */ | |
100 | #define LOAD_DATA(reg, off) /* off is ignored here, rWP itself walks the block */ \ | |
101 | lwbrx reg,0,rWP; /* reg = byte-reversed word at rWP */ \ | |
102 | addi rWP,rWP,4; /* step rWP to the next word */ | |
103 | #define NEXT_BLOCK /* empty: the 16 loads of a block already moved rWP by 64 */ | |
104 | #else /* big endian: words are loaded as stored */ | |
105 | #define LOAD_DATA(reg, off) \ | |
106 | lwz reg,off(rWP); /* reg = word at rWP + off, rWP unchanged */ | |
107 | #define NEXT_BLOCK \ | |
108 | addi rWP,rWP,64; /* advance rWP by one 64 byte block */ | |
109 | #endif | |
110 | ||
111 | #define R_00_15(a, b, c, d, e, w0, w1, k, off) /* rounds t and t+1 (t < 16), interleaved: a-e are the working variables A-E of round t, w0/w1 receive W[t]/W[t+1], k selects LOAD_K<k>1, off is the byte offset of W[t] in the block */ \ | |
112 | LOAD_DATA(w0, off) /* 1: W = W[t] */ \ | |
113 | and rT2,b,c; /* 1: F' = B and C */ \ | |
114 | LOAD_K##k##1 /* k = 1 reloads rK here, before its first use below */ \ | |
115 | andc rT1,d,b; /* 1: F" = ~B and D */ \ | |
116 | rotrwi rT0,a,27; /* 1: A' = A rotl 5 (rotate right by 27) */ \ | |
117 | or rT2,rT2,rT1; /* 1: F = F' or F" */ \ | |
118 | add e,e,rT0; /* 1: E = E + A' */ \ | |
119 | rotrwi b,b,2; /* 1: B = B rotl 30 (becomes C of round 2) */ \ | |
120 | add e,e,w0; /* 1: E = E + W */ \ | |
121 | LOAD_DATA(w1, off+4) /* 2: W = W[t+1] */ \ | |
122 | add e,e,rT2; /* 1: E = E + F */ \ | |
123 | and rT1,a,b; /* 2: F' = B and C (B is a, C is b) */ \ | |
124 | add e,e,rK; /* 1: E = E + K (e is A of round 2) */ \ | |
125 | andc rT2,c,a; /* 2: F" = ~B and D (D is c) */ \ | |
126 | add d,d,rK; /* 2: E = E + K (E is d) */ \ | |
127 | or rT2,rT2,rT1; /* 2: F = F' or F" */ \ | |
128 | rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ | |
129 | add d,d,w1; /* 2: E = E + W */ \ | |
130 | rotrwi a,a,2; /* 2: B = B rotl 30 */ \ | |
131 | add d,d,rT0; /* 2: E = E + A' */ \ | |
132 | evmergelo w1,w1,w0; /* w1 = W[t+1] (upper word) / W[t] (lower word) */ \ | |
133 | add d,d,rT2 /* 2: E = E + F */ | |
134 | ||
135 | #define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) /* rounds t and t+1 for t = 16, 18: w0 holds W[t-16]/W[t-15] on entry and W[t]/W[t+1] on exit, w1 = W[t-14..], w4 = W[t-8..], w6/w7 = the two newest pairs, k selects LOAD_K<k>1 */ \ | |
136 | and rT2,b,c; /* 1: F' = B and C */ \ | |
137 | evmergelohi rT0,w7,w6; /* W[-3]: lower word of w7 / upper word of w6 */ \ | |
138 | andc rT1,d,b; /* 1: F" = ~B and D */ \ | |
139 | evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \ | |
140 | or rT1,rT1,rT2; /* 1: F = F' or F" */ \ | |
141 | evxor w0,w0,w4; /* W = W xor W[-8] */ \ | |
142 | add e,e,rT1; /* 1: E = E + F */ \ | |
143 | evxor w0,w0,w1; /* W = W xor W[-14] */ \ | |
144 | rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \ | |
145 | evrlwi w0,w0,1; /* W = W rotl 1 (both words at once) */ \ | |
146 | add e,e,rT2; /* 1: E = E + A' */ \ | |
147 | evaddw rT0,w0,rK; /* WK = W + K for both rounds */ \ | |
148 | rotrwi b,b,2; /* 1: B = B rotl 30 */ \ | |
149 | LOAD_K##k##1 /* reload after WK is formed: new K applies from the next macro on */ \ | |
150 | evmergehi rT1,rT1,rT0; /* WK1/WK2: lower word of rT1 = upper word of WK */ \ | |
151 | add e,e,rT0; /* 1: E = E + WK */ \ | |
152 | add d,d,rT1; /* 2: E = E + WK (E is d) */ \ | |
153 | and rT2,a,b; /* 2: F' = B and C (B is a, C is b) */ \ | |
154 | andc rT1,c,a; /* 2: F" = ~B and D (D is c) */ \ | |
155 | rotrwi rT0,e,27; /* 2: A' = A rotl 5 (A is e) */ \ | |
156 | or rT1,rT1,rT2; /* 2: F = F' or F" */ \ | |
157 | add d,d,rT0; /* 2: E = E + A' */ \ | |
158 | rotrwi a,a,2; /* 2: B = B rotl 30 */ \ | |
159 | add d,d,rT1 /* 2: E = E + F */ | |
160 | ||
161 | #define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) /* rounds t and t+1 with F = B xor C xor D: w0 holds W[t-16]/W[t-15] on entry and W[t]/W[t+1] on exit, w1 = W[t-14..], w4 = W[t-8..], w6/w7 = the two newest pairs, k selects LOAD_K<k>1 */ \ | |
162 | evmergelohi rT0,w7,w6; /* W[-3]: lower word of w7 / upper word of w6 */ \ | |
163 | xor rT2,b,c; /* 1: F' = B xor C */ \ | |
164 | evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \ | |
165 | xor rT2,rT2,d; /* 1: F = F' xor D */ \ | |
166 | evxor w0,w0,w4; /* W = W xor W[-8] */ \ | |
167 | add e,e,rT2; /* 1: E = E + F */ \ | |
168 | evxor w0,w0,w1; /* W = W xor W[-14] */ \ | |
169 | rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \ | |
170 | evrlwi w0,w0,1; /* W = W rotl 1 (both words at once) */ \ | |
171 | add e,e,rT2; /* 1: E = E + A' */ \ | |
172 | evaddw rT0,w0,rK; /* WK = W + K for both rounds */ \ | |
173 | rotrwi b,b,2; /* 1: B = B rotl 30 */ \ | |
174 | LOAD_K##k##1 /* reload after WK is formed: new K applies from the next macro on */ \ | |
175 | evmergehi rT1,rT1,rT0; /* WK1/WK2: lower word of rT1 = upper word of WK */ \ | |
176 | add e,e,rT0; /* 1: E = E + WK */ \ | |
177 | xor rT2,a,b; /* 2: F' = B xor C (B is a, C is b) */ \ | |
178 | add d,d,rT1; /* 2: E = E + WK (E is d) */ \ | |
179 | xor rT2,rT2,c; /* 2: F = F' xor D (D is c) */ \ | |
180 | rotrwi rT0,e,27; /* 2: A' = A rotl 5 (A is e) */ \ | |
181 | add d,d,rT2; /* 2: E = E + F */ \ | |
182 | rotrwi a,a,2; /* 2: B = B rotl 30 */ \ | |
183 | add d,d,rT0 /* 2: E = E + A' */ | |
184 | ||
185 | #define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) /* rounds t and t+1 with F = (B and C) or ((B or C) and D): w0 holds W[t-16]/W[t-15] on entry and W[t]/W[t+1] on exit, w1 = W[t-14..], w4 = W[t-8..], w6/w7 = the two newest pairs, k selects LOAD_K<k>1 */ \ | |
186 | and rT2,b,c; /* 1: F' = B and C */ \ | |
187 | evmergelohi rT0,w7,w6; /* W[-3]: lower word of w7 / upper word of w6 */ \ | |
188 | or rT1,b,c; /* 1: F" = B or C */ \ | |
189 | evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \ | |
190 | and rT1,d,rT1; /* 1: F" = F" and D */ \ | |
191 | evxor w0,w0,w4; /* W = W xor W[-8] */ \ | |
192 | or rT2,rT2,rT1; /* 1: F = F' or F" */ \ | |
193 | evxor w0,w0,w1; /* W = W xor W[-14] */ \ | |
194 | add e,e,rT2; /* 1: E = E + F */ \ | |
195 | evrlwi w0,w0,1; /* W = W rotl 1 (both words at once) */ \ | |
196 | rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \ | |
197 | evaddw rT0,w0,rK; /* WK = W + K for both rounds */ \ | |
198 | add e,e,rT2; /* 1: E = E + A' */ \ | |
199 | LOAD_K##k##1 /* reload after WK is formed: new K applies from the next macro on */ \ | |
200 | evmergehi rT1,rT1,rT0; /* WK1/WK2: lower word of rT1 = upper word of WK */ \ | |
201 | rotrwi b,b,2; /* 1: B = B rotl 30 */ \ | |
202 | add e,e,rT0; /* 1: E = E + WK */ \ | |
203 | and rT2,a,b; /* 2: F' = B and C (B is a, C is b) */ \ | |
204 | or rT0,a,b; /* 2: F" = B or C (WK in rT0 is no longer needed) */ \ | |
205 | add d,d,rT1; /* 2: E = E + WK (E is d) */ \ | |
206 | and rT0,c,rT0; /* 2: F" = F" and D (D is c) */ \ | |
207 | rotrwi a,a,2; /* 2: B = B rotl 30 */ \ | |
208 | or rT2,rT2,rT0; /* 2: F = F' or F" */ \ | |
209 | rotrwi rT0,e,27; /* 2: A' = A rotl 5 (A is e) */ \ | |
210 | add d,d,rT2; /* 2: E = E + F */ \ | |
211 | add d,d,rT0 /* 2: E = E + A' */ | |
212 | ||
213 | #define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) /* rounds 60-79 reuse the R_20_39 code unchanged; only the constant held in rK differs */ \ | |
214 | R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) | |
215 | ||
216 | _GLOBAL(ppc_spe_sha1_transform) /* in: r3 = hash state (5 words, updated in place), r4 = input data, r5 = number of 64 byte blocks */ | |
217 | INITIALIZE /* open the stack frame and save r14-r23 */ | |
218 | ||
219 | lwz rH0,0(rHP) /* load H0-H4 as the working variables */ | |
220 | lwz rH1,4(rHP) | |
221 | mtctr r5 /* CTR = block count; must be >= 1 because the loop body runs before bdnz tests CTR */ | |
222 | lwz rH2,8(rHP) | |
223 | lis rKP,PPC_SPE_SHA1_K@h /* r5 is free now: rKP = address of the constant table */ | |
224 | lwz rH3,12(rHP) | |
225 | ori rKP,rKP,PPC_SPE_SHA1_K@l | |
226 | lwz rH4,16(rHP) | |
227 | ||
228 | ppc_spe_sha1_main: /* one iteration per 64 byte block; each macro below does two rounds and shifts the A-E roles by two registers */ | |
229 | R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0) /* rounds 0-1, k = 1 loads the first constant; rW1 is only scratch for W[0] here */ | |
230 | R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8) | |
231 | R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16) | |
232 | R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24) | |
233 | R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32) | |
234 | R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40) | |
235 | R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48) /* rT3 is scratch for W[12] */ | |
236 | R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56) /* rounds 14-15: rW0-rW7 now hold W[0]-W[15] in pairs */ | |
237 | ||
238 | R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0) /* rounds 16-17 */ | |
239 | R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2) /* rounds 18-19, k = 2 loads the constant for rounds 20-39 */ | |
240 | ||
241 | R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0) /* rounds 20-21 */ | |
242 | R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0) | |
243 | R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0) | |
244 | R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0) | |
245 | R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0) | |
246 | R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0) | |
247 | R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0) | |
248 | R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0) | |
249 | R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0) | |
250 | R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3) /* rounds 38-39, k = 3 loads the constant for rounds 40-59 */ | |
251 | ||
252 | R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0) /* rounds 40-41 */ | |
253 | R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0) | |
254 | R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0) | |
255 | R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0) | |
256 | R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0) | |
257 | R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0) | |
258 | R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0) | |
259 | R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0) | |
260 | R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0) | |
261 | R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4) /* rounds 58-59, k = 4 loads the constant for rounds 60-79 */ | |
262 | ||
263 | R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0) /* rounds 60-61 */ | |
264 | R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0) | |
265 | R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0) | |
266 | R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0) | |
267 | R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0) | |
268 | R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0) | |
269 | R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0) | |
270 | lwz rT3,0(rHP) /* fetch the previous H0; rT3 is not used by the round macros */ | |
271 | R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0) | |
272 | lwz rW1,4(rHP) /* previous H1; rW1 is not read by the remaining rounds */ | |
273 | R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0) | |
274 | lwz rW2,8(rHP) /* previous H2; rW2 is not read by the last round pair */ | |
275 | R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0) /* rounds 78-79: after 40 macros A-E are back in rH0-rH4 */ | |
276 | lwz rW3,12(rHP) /* previous H3 */ | |
277 | NEXT_BLOCK /* make rWP point at the next input block */ | |
278 | lwz rW4,16(rHP) /* previous H4 */ | |
279 | ||
280 | add rH0,rH0,rT3 /* H0 = A + previous H0 */ | |
281 | stw rH0,0(rHP) | |
282 | add rH1,rH1,rW1 /* H1 = B + previous H1 */ | |
283 | stw rH1,4(rHP) | |
284 | add rH2,rH2,rW2 /* H2 = C + previous H2 */ | |
285 | stw rH2,8(rHP) | |
286 | add rH3,rH3,rW3 /* H3 = D + previous H3 */ | |
287 | stw rH3,12(rHP) | |
288 | add rH4,rH4,rW4 /* H4 = E + previous H4 */ | |
289 | stw rH4,16(rHP) | |
290 | ||
291 | bdnz ppc_spe_sha1_main /* decrement CTR, process the next block while it is not zero */ | |
292 | ||
293 | FINALIZE /* restore r14-r23, scrub the spill slots, drop the frame */ | |
294 | blr | |
295 | ||
296 | .section .rodata /* the round constants are only ever read, keep them out of writable .data */ | |
297 | .align 4 | |
298 | PPC_SPE_SHA1_K: /* SHA-1 constants for rounds 0-19, 20-39, 40-59, 60-79; read at offsets 0/4/8/12 by LOAD_K11..LOAD_K41 */ | |
299 | .long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6 |