########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
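#
# Two-block interleaving, in brief: after the byte swap and the
# vperm2i128 transpose below, the low 128-bit lane of each ymm register
# holds the message-schedule dwords of the first block and the high lane
# holds those of the second block.  One pass over the schedule therefore
# yields K[t]+W[t] for both blocks at once; the first block's rounds are
# computed while the schedule is built, and the second block is replayed
# afterwards (see loop3) from the K+W values saved in the _XFER area.
########################################################################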

#ifdef CONFIG_AS_AVX2
#include <linux/linkage.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add \p1, \p2
	mov \p2, \p1
.endm
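# For example, "addm (4*0)(CTX), a" (used in the digest update below)
# adds working variable a into the first saved digest word and leaves
# the sum in both the memory word and the register.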

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER = %ymm9
XTMP5 = %ymm11

SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx # 3rd arg
INP = %rsi # 2nd arg
CTX = %rdi # 1st arg
c = %ecx
d = %r8d
e = %edx # clobbers NUM_BLKS
y3 = %esi # clobbers INP


TBL = %rbp
SRND = CTX # SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d
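# Note on register reuse: e shares %rdx with NUM_BLKS, y3 shares %rsi
# with INP, and SRND reuses the CTX register (%rdi), so the block-end
# pointer, the input pointer and CTX are all spilled to the stack frame
# defined below before the round code clobbers them.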


_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE = 0
_INP_END_SIZE = 8
_INP_SIZE = 8
_CTX_SIZE = 8
_RSP_SIZE = 8

_XFER = 0
_XMM_SAVE = _XFER + _XFER_SIZE
_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
_INP = _INP_END + _INP_END_SIZE
_CTX = _INP + _INP_SIZE
_RSP = _CTX + _CTX_SIZE
STACK_SIZE = _RSP + _RSP_SIZE
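
# Resulting frame layout (byte offsets from the 32-byte-aligned %rsp):
#   _XFER    [  0..511]  precomputed K[t]+W[t], 2 blocks x 64 rounds x 4 bytes
#   _INP_END [512]       pointer to the last input block
#   _INP     [520]       current input pointer
#   _CTX     [528]       digest/context pointer
#   _RSP     [536]       caller %rsp saved before alignment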

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
old_h = h
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
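# These two macros only rebind assembler symbols; they emit no code.
# After ROTATE_ARGS the register that held "h" is renamed "a", the one
# that held "a" becomes "b", and so on, so the unrolled rounds below
# cycle the eight working variables without any register moves (e.g.
# after one rotation "a" names %r11d, the old "h", and "h" names
# %r10d, the old "g").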
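########################################################################
# FOUR_ROUNDS_AND_SCHED interleaves four scalar SHA-256 rounds with the
# vector computation of four new message-schedule dwords per block.
# For reference, the round being implemented is (all ops mod 2^32):
#
#   S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
#   CH  = ((f ^ g) & e) ^ g
#   S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
#   MAJ = ((a | c) & b) | (a & c)
#   t1  = h + S1 + CH + K[t] + W[t]    # K[t]+W[t] is read from _XFER
#   d  += t1
#   h   = t1 + S0 + MAJ
#
# and the schedule update is:
#
#   s0   = (W[-15] ror 7) ^ (W[-15] ror 18) ^ (W[-15] >> 3)
#   s1   = (W[-2] ror 17) ^ (W[-2] ror 19) ^ (W[-2] >> 10)
#   W[0] = W[-16] + s0 + W[-7] + s1
########################################################################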
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov a, y3 # y3 = a # MAJA
	rorx $25, e, y0 # y0 = e >> 25 # S1A
	rorx $11, e, y1 # y1 = e >> 11 # S1B

	addl \disp(%rsp, SRND), h # h = k + w + h # --
	or c, y3 # y3 = a|c # MAJA
	vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
	mov f, y2 # y2 = f # CH
	rorx $13, a, T1 # T1 = a >> 13 # S0B

	xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
	xor g, y2 # y2 = f^g # CH
	vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
	rorx $6, e, y1 # y1 = (e >> 6) # S1

	and e, y2 # y2 = (f^g)&e # CH
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $22, a, y1 # y1 = a >> 22 # S0A
	add h, d # d = k + w + h + d # --

	and b, y3 # y3 = (a|c)&b # MAJA
	vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
	rorx $2, a, T1 # T1 = (a >> 2) # S0

	xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
	vpsrld $7, XTMP1, XTMP2
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1 # T1 = a # MAJB
	and c, T1 # T1 = a&c # MAJB

	add y0, y2 # y2 = S1 + CH # --
	vpslld $(32-7), XTMP1, XTMP3
	or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h # h = k + w + h + S0 # --

	add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
	vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7

	vpsrld $18, XTMP1, XTMP2
	add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add y3, h # h = t1 + S0 + MAJ # --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov a, y3 # y3 = a # MAJA
	rorx $25, e, y0 # y0 = e >> 25 # S1A
	rorx $11, e, y1 # y1 = e >> 11 # S1B
	offset = \disp + 1*4
	addl offset(%rsp, SRND), h # h = k + w + h # --
	or c, y3 # y3 = a|c # MAJA


	vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
	mov f, y2 # y2 = f # CH
	rorx $13, a, T1 # T1 = a >> 13 # S0B
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
	xor g, y2 # y2 = f^g # CH


	rorx $6, e, y1 # y1 = (e >> 6) # S1
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $22, a, y1 # y1 = a >> 22 # S0A
	and e, y2 # y2 = (f^g)&e # CH
	add h, d # d = k + w + h + d # --

	vpslld $(32-18), XTMP1, XTMP1
	and b, y3 # y3 = (a|c)&b # MAJA
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0

	vpxor XTMP1, XTMP3, XTMP3
	rorx $2, a, T1 # T1 = (a >> 2) # S0
	xor g, y2 # y2 = CH = ((f^g)&e)^g # CH

	vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1 # T1 = a # MAJB
	and c, T1 # T1 = a&c # MAJB
	add y0, y2 # y2 = S1 + CH # --

	vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
	vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
	or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h # h = k + w + h + S0 # --

	vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
	add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
	add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add y3, h # h = t1 + S0 + MAJ # --

	vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov a, y3 # y3 = a # MAJA
	rorx $25, e, y0 # y0 = e >> 25 # S1A
	offset = \disp + 2*4
	addl offset(%rsp, SRND), h # h = k + w + h # --

	vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
	rorx $11, e, y1 # y1 = e >> 11 # S1B
	or c, y3 # y3 = a|c # MAJA
	mov f, y2 # y2 = f # CH
	xor g, y2 # y2 = f^g # CH

	rorx $13, a, T1 # T1 = a >> 13 # S0B
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
	vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
	and e, y2 # y2 = (f^g)&e # CH

	rorx $6, e, y1 # y1 = (e >> 6) # S1
	vpxor XTMP3, XTMP2, XTMP2
	add h, d # d = k + w + h + d # --
	and b, y3 # y3 = (a|c)&b # MAJA

	xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $22, a, y1 # y1 = a >> 22 # S0A
	vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
	xor g, y2 # y2 = CH = ((f^g)&e)^g # CH

	vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
	rorx $2, a, T1 # T1 = (a >> 2) # S0
	vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}

	xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1 # T1 = a # MAJB
	and c, T1 # T1 = a&c # MAJB
	add y0, y2 # y2 = S1 + CH # --
	vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}

	or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1,h # h = k + w + h + S0 # --
	add y2,d # d = k + w + h + d + S1 + CH = d + t1 # --
	add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --

	add y3,h # h = t1 + S0 + MAJ # --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov a, y3 # y3 = a # MAJA
	rorx $25, e, y0 # y0 = e >> 25 # S1A
	rorx $11, e, y1 # y1 = e >> 11 # S1B
	offset = \disp + 3*4
	addl offset(%rsp, SRND), h # h = k + w + h # --
	or c, y3 # y3 = a|c # MAJA


	vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
	mov f, y2 # y2 = f # CH
	rorx $13, a, T1 # T1 = a >> 13 # S0B
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
	xor g, y2 # y2 = f^g # CH


	vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
	rorx $6, e, y1 # y1 = (e >> 6) # S1
	and e, y2 # y2 = (f^g)&e # CH
	add h, d # d = k + w + h + d # --
	and b, y3 # y3 = (a|c)&b # MAJA

	vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	xor g, y2 # y2 = CH = ((f^g)&e)^g # CH

	vpxor XTMP3, XTMP2, XTMP2
	rorx $22, a, y1 # y1 = a >> 22 # S0A
	add y0, y2 # y2 = S1 + CH # --

	vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
	add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --

	rorx $2, a, T1 # T1 = (a >> 2) # S0
	vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}

	vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1 # T1 = a # MAJB
	and c, T1 # T1 = a&c # MAJB
	or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ

	add y1, h # h = k + w + h + S0 # --
	add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add y3, h # h = t1 + S0 + MAJ # --

	ROTATE_ARGS
	rotate_Xs
.endm
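########################################################################
# DO_4ROUNDS performs the same four scalar rounds as above but without
# any message-schedule computation.  It is used for the final 16 rounds
# of the first block and, in loop3, for all 64 rounds of the second
# block, whose K[t]+W[t] values were already stored in the high half of
# each 32-byte _XFER slot while the first block was being processed.
########################################################################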
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov f, y2 # y2 = f # CH
	rorx $25, e, y0 # y0 = e >> 25 # S1A
	rorx $11, e, y1 # y1 = e >> 11 # S1B
	xor g, y2 # y2 = f^g # CH

	xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
	rorx $6, e, y1 # y1 = (e >> 6) # S1
	and e, y2 # y2 = (f^g)&e # CH

	xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $13, a, T1 # T1 = a >> 13 # S0B
	xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
	rorx $22, a, y1 # y1 = a >> 22 # S0A
	mov a, y3 # y3 = a # MAJA

	xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
	rorx $2, a, T1 # T1 = (a >> 2) # S0
	addl \disp(%rsp, SRND), h # h = k + w + h # --
	or c, y3 # y3 = a|c # MAJA

	xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1 # T1 = a # MAJB
	and b, y3 # y3 = (a|c)&b # MAJA
	and c, T1 # T1 = a&c # MAJB
	add y0, y2 # y2 = S1 + CH # --


	add h, d # d = k + w + h + d # --
	or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h # h = k + w + h + S0 # --
	add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov f, y2 # y2 = f # CH
	rorx $25, e, y0 # y0 = e >> 25 # S1A
	rorx $11, e, y1 # y1 = e >> 11 # S1B
	xor g, y2 # y2 = f^g # CH

	xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
	rorx $6, e, y1 # y1 = (e >> 6) # S1
	and e, y2 # y2 = (f^g)&e # CH
	add y3, old_h # h = t1 + S0 + MAJ # --

	xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $13, a, T1 # T1 = a >> 13 # S0B
	xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
	rorx $22, a, y1 # y1 = a >> 22 # S0A
	mov a, y3 # y3 = a # MAJA

	xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
	rorx $2, a, T1 # T1 = (a >> 2) # S0
	offset = 4*1 + \disp
	addl offset(%rsp, SRND), h # h = k + w + h # --
	or c, y3 # y3 = a|c # MAJA

	xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1 # T1 = a # MAJB
	and b, y3 # y3 = (a|c)&b # MAJA
	and c, T1 # T1 = a&c # MAJB
	add y0, y2 # y2 = S1 + CH # --


	add h, d # d = k + w + h + d # --
	or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h # h = k + w + h + S0 # --

	add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov f, y2 # y2 = f # CH
	rorx $25, e, y0 # y0 = e >> 25 # S1A
	rorx $11, e, y1 # y1 = e >> 11 # S1B
	xor g, y2 # y2 = f^g # CH

	xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
	rorx $6, e, y1 # y1 = (e >> 6) # S1
	and e, y2 # y2 = (f^g)&e # CH
	add y3, old_h # h = t1 + S0 + MAJ # --

	xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $13, a, T1 # T1 = a >> 13 # S0B
	xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
	rorx $22, a, y1 # y1 = a >> 22 # S0A
	mov a, y3 # y3 = a # MAJA

	xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
	rorx $2, a, T1 # T1 = (a >> 2) # S0
	offset = 4*2 + \disp
	addl offset(%rsp, SRND), h # h = k + w + h # --
	or c, y3 # y3 = a|c # MAJA

	xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1 # T1 = a # MAJB
	and b, y3 # y3 = (a|c)&b # MAJA
	and c, T1 # T1 = a&c # MAJB
	add y0, y2 # y2 = S1 + CH # --


	add h, d # d = k + w + h + d # --
	or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h # h = k + w + h + S0 # --

	add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov f, y2 # y2 = f # CH
	rorx $25, e, y0 # y0 = e >> 25 # S1A
	rorx $11, e, y1 # y1 = e >> 11 # S1B
	xor g, y2 # y2 = f^g # CH

	xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
	rorx $6, e, y1 # y1 = (e >> 6) # S1
	and e, y2 # y2 = (f^g)&e # CH
	add y3, old_h # h = t1 + S0 + MAJ # --

	xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $13, a, T1 # T1 = a >> 13 # S0B
	xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
	rorx $22, a, y1 # y1 = a >> 22 # S0A
	mov a, y3 # y3 = a # MAJA

	xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
	rorx $2, a, T1 # T1 = (a >> 2) # S0
	offset = 4*3 + \disp
	addl offset(%rsp, SRND), h # h = k + w + h # --
	or c, y3 # y3 = a|c # MAJA

	xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1 # T1 = a # MAJB
	and b, y3 # y3 = (a|c)&b # MAJA
	and c, T1 # T1 = a&c # MAJB
	add y0, y2 # y2 = S1 + CH # --


	add h, d # d = k + w + h + d # --
	or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h # h = k + w + h + S0 # --

	add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --


	add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --

	add y3, h # h = t1 + S0 + MAJ # --

	ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(UINT32 digest[8], void *input_data, UINT64 num_blks)
## arg 1 : pointer to digest
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
.text
ENTRY(sha256_transform_rorx)
.align 32
	pushq %rbx
	pushq %rbp
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15

	mov %rsp, %rax
	subq $STACK_SIZE, %rsp
	and $-32, %rsp # align rsp to 32 byte boundary
	mov %rax, _RSP(%rsp)
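	# The frame is rounded down to a 32-byte boundary so the vmovdqa
	# stores into the _XFER area below are aligned; the caller's %rsp
	# (saved in %rax above) is kept in _RSP and restored at done_hash.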


	shl $6, NUM_BLKS # convert to bytes
	jz done_hash
	lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov NUM_BLKS, _INP_END(%rsp)

	cmp NUM_BLKS, INP
	je only_one_block

	## load initial digest
	mov (CTX), a
	mov 4*1(CTX), b
	mov 4*2(CTX), c
	mov 4*3(CTX), d
	mov 4*4(CTX), e
	mov 4*5(CTX), f
	mov 4*6(CTX), g
	mov 4*7(CTX), h

	vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa _SHUF_DC00(%rip), SHUF_DC00

	mov CTX, _CTX(%rsp)

loop0:
	lea K256(%rip), TBL

	## Load first 16 dwords from two blocks
	VMOVDQ 0*32(INP),XTMP0
	VMOVDQ 1*32(INP),XTMP1
	VMOVDQ 2*32(INP),XTMP2
	VMOVDQ 3*32(INP),XTMP3

	## byte swap data
	vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128 $0x20, XTMP2, XTMP0, X0
	vperm2i128 $0x31, XTMP2, XTMP0, X1
	vperm2i128 $0x20, XTMP3, XTMP1, X2
	vperm2i128 $0x31, XTMP3, XTMP1, X3
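	# After the transpose each X register pairs the two blocks:
	# X0 = {blk2 W[3..0],   blk1 W[3..0]},  X1 = {blk2 W[7..4],   blk1 W[7..4]},
	# X2 = {blk2 W[11..8],  blk1 W[11..8]}, X3 = {blk2 W[15..12], blk1 W[15..12]},
	# with block 1 in the low 128-bit lane and block 2 in the high lane.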

last_block_enter:
	add $64, INP
	mov INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor SRND, SRND

.align 16
loop1:
	vpaddd 0*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED _XFER + 0*32

	vpaddd 1*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED _XFER + 1*32

	vpaddd 2*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED _XFER + 2*32

	vpaddd 3*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED _XFER + 3*32

	add $4*32, SRND
	cmp $3*4*32, SRND
	jb loop1
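	# SRND advances by 128 bytes per iteration and doubles as the byte
	# index into both the K256 table (via TBL) and the _XFER spill area,
	# so each 32-byte slot holds K[t..t+3]+W[t..t+3] for both blocks.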

loop2:
	## Do last 16 rounds with no scheduling
	vpaddd 0*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS _XFER + 0*32
	vpaddd 1*32(TBL, SRND), X1, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS _XFER + 1*32
	add $2*32, SRND

	vmovdqa X2, X0
	vmovdqa X3, X1

	cmp $4*4*32, SRND
	jb loop2

	mov _CTX(%rsp), CTX
	mov _INP(%rsp), INP

	addm (4*0)(CTX),a
	addm (4*1)(CTX),b
	addm (4*2)(CTX),c
	addm (4*3)(CTX),d
	addm (4*4)(CTX),e
	addm (4*5)(CTX),f
	addm (4*6)(CTX),g
	addm (4*7)(CTX),h

	cmp _INP_END(%rsp), INP
	ja done_hash

#### Do second block using previously scheduled results
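#### (the K+W values for this block already sit in the high 16 bytes of
#### each 32-byte _XFER slot, hence the "+ 16" displacement below)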
	xor SRND, SRND
.align 16
loop3:
	DO_4ROUNDS _XFER + 0*32 + 16
	DO_4ROUNDS _XFER + 1*32 + 16
	add $2*32, SRND
	cmp $4*4*32, SRND
	jb loop3

	mov _CTX(%rsp), CTX
	mov _INP(%rsp), INP
	add $64, INP

	addm (4*0)(CTX),a
	addm (4*1)(CTX),b
	addm (4*2)(CTX),c
	addm (4*3)(CTX),d
	addm (4*4)(CTX),e
	addm (4*5)(CTX),f
	addm (4*6)(CTX),g
	addm (4*7)(CTX),h

	cmp _INP_END(%rsp), INP
	jb loop0
	ja done_hash

do_last_block:
	#### do last block
	lea K256(%rip), TBL

	VMOVDQ 0*16(INP),XWORD0
	VMOVDQ 1*16(INP),XWORD1
	VMOVDQ 2*16(INP),XWORD2
	VMOVDQ 3*16(INP),XWORD3

	vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp last_block_enter

only_one_block:

	## load initial digest
	mov (4*0)(CTX),a
	mov (4*1)(CTX),b
	mov (4*2)(CTX),c
	mov (4*3)(CTX),d
	mov (4*4)(CTX),e
	mov (4*5)(CTX),f
	mov (4*6)(CTX),g
	mov (4*7)(CTX),h

	vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa _SHUF_DC00(%rip), SHUF_DC00

	mov CTX, _CTX(%rsp)
	jmp do_last_block

done_hash:

	mov _RSP(%rsp), %rsp

	popq %r15
	popq %r14
	popq %r13
	popq %r12
	popq %rbp
	popq %rbx
	ret
ENDPROC(sha256_transform_rorx)

.data
.align 64
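# Each group of four round constants is stored twice so that a single
# vpaddd against a 256-bit K256 entry adds K[t..t+3] to the schedule
# dwords of both blocks (low and high 128-bit lanes) at once.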
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
#endif