/*
 * Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release:
 * Visit http://software.intel.com/en-us/articles/
 * and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Updates the 20-byte SHA-1 record at 'hash' for an even number
 * ('num_blocks') of consecutive 64-byte blocks.
 *
 * extern "C" void sha1_transform_avx2(
 *	int *hash, const char* input, size_t num_blocks );
 */
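
/*
 * Illustrative call from C (a sketch only; the state-word initializers
 * and the rounding of the block count below are hypothetical and not
 * taken from the kernel glue code):
 *
 *	int state[5] = { 0x67452301, 0xefcdab89, 0x98badcfe,
 *			 0x10325476, 0xc3d2e1f0 };
 *	sha1_transform_avx2(state, data, num_blocks & ~(size_t)1);
 *
 * An odd trailing block has to be handled separately by the caller.
 */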

#include <linux/linkage.h>

#define CTX %rdi /* arg1 */
#define BUF %rsi /* arg2 */
#define CNT %rdx /* arg3 */

#define REG_A %ecx
#define REG_B %esi
#define REG_C %edi
#define REG_D %eax
#define REG_E %edx
#define REG_TB %ebx
#define REG_TA %r12d
#define REG_RA %rcx
#define REG_RB %rsi
#define REG_RC %rdi
#define REG_RD %rax
#define REG_RE %rdx
#define REG_RTA %r12
#define REG_RTB %rbx
#define REG_T1 %ebp
#define xmm_mov vmovups
#define avx2_zeroupper vzeroupper
#define RND_F1 1
#define RND_F2 2
#define RND_F3 3

.macro REGALLOC
.set A, REG_A
.set B, REG_B
.set C, REG_C
.set D, REG_D
.set E, REG_E
.set TB, REG_TB
.set TA, REG_TA

.set RA, REG_RA
.set RB, REG_RB
.set RC, REG_RC
.set RD, REG_RD
.set RE, REG_RE

.set RTA, REG_RTA
.set RTB, REG_RTB

.set T1, REG_T1
.endm

#define K_BASE %r8
#define HASH_PTR %r9
#define BUFFER_PTR %r10
#define BUFFER_PTR2 %r13
#define BUFFER_END %r11

#define PRECALC_BUF %r14
#define WK_BUF %r15

#define W_TMP %xmm0
#define WY_TMP %ymm0
#define WY_TMP2 %ymm9

# AVX2 variables
#define WY0 %ymm3
#define WY4 %ymm5
#define WY08 %ymm7
#define WY12 %ymm8
#define WY16 %ymm12
#define WY20 %ymm13
#define WY24 %ymm14
#define WY28 %ymm15

#define YMM_SHUFB_BSWAP %ymm10

/*
 * Keep 2 iterations precalculated at a time:
 * - 80 DWORDs per iteration * 2
 */
#define W_SIZE (80*2*2 +16)

#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF)
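
/*
 * Layout note (worked out here for illustration): each vmovdqu of
 * WY_TMP stores one 32-byte group holding W+K for four rounds of both
 * pipelined blocks - bytes 0..15 for the block read via BUFFER_PTR,
 * bytes 16..31 for the block read via BUFFER_PTR2.  WK(t) indexes that
 * layout: ((t % 80) / 4) picks the group, (t % 4) the dword inside it,
 * and (t / 80) adds 16 to select the second block's half.  For example,
 * WK(81) = 0*32 + 1*4 + 16 = offset 20.  PRECALC_WK(i & ~7) likewise
 * advances by 32 bytes per group of 8 precalc steps.
 */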


.macro UPDATE_HASH hash, val
add \hash, \val
mov \val, \hash
.endm

.macro PRECALC_RESET_WY
.set WY_00, WY0
.set WY_04, WY4
.set WY_08, WY08
.set WY_12, WY12
.set WY_16, WY16
.set WY_20, WY20
.set WY_24, WY24
.set WY_28, WY28
.set WY_32, WY_00
.endm

.macro PRECALC_ROTATE_WY
/* Rotate macros */
.set WY_32, WY_28
.set WY_28, WY_24
.set WY_24, WY_20
.set WY_20, WY_16
.set WY_16, WY_12
.set WY_12, WY_08
.set WY_08, WY_04
.set WY_04, WY_00
.set WY_00, WY_32

/* Define register aliases */
.set WY, WY_00
.set WY_minus_04, WY_04
.set WY_minus_08, WY_08
.set WY_minus_12, WY_12
.set WY_minus_16, WY_16
.set WY_minus_20, WY_20
.set WY_minus_24, WY_24
.set WY_minus_28, WY_28
.set WY_minus_32, WY
.endm

.macro PRECALC_00_15
.if (i == 0) # Initialize and rotate registers
PRECALC_RESET_WY
PRECALC_ROTATE_WY
.endif

/* message scheduling pre-compute for rounds 0-15 */
.if ((i & 7) == 0)
/*
 * blended AVX2 and ALU instruction scheduling
 * 1 vector iteration per 8 rounds
 */
vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
.elseif ((i & 7) == 1)
vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\
WY_TMP, WY_TMP
.elseif ((i & 7) == 2)
vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
.elseif ((i & 7) == 4)
vpaddd K_XMM(K_BASE), WY, WY_TMP
.elseif ((i & 7) == 7)
vmovdqu WY_TMP, PRECALC_WK(i&~7)

PRECALC_ROTATE_WY
.endif
.endm

.macro PRECALC_16_31
/*
 * message scheduling pre-compute for rounds 16-31
 * calculating last 32 w[i] values in 8 XMM registers
 * pre-calculate K+w[i] values and store to mem
 * for later load by ALU add instruction
 *
 * "brute force" vectorization for rounds 16-31 only
 * due to w[i]->w[i-3] dependency
 */
.if ((i & 7) == 0)
/*
 * blended AVX2 and ALU instruction scheduling
 * 1 vector iteration per 8 rounds
 */
/* w[i-14] */
vpalignr $8, WY_minus_16, WY_minus_12, WY
vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */
.elseif ((i & 7) == 1)
vpxor WY_minus_08, WY, WY
vpxor WY_minus_16, WY_TMP, WY_TMP
.elseif ((i & 7) == 2)
vpxor WY_TMP, WY, WY
vpslldq $12, WY, WY_TMP2
.elseif ((i & 7) == 3)
vpslld $1, WY, WY_TMP
vpsrld $31, WY, WY
.elseif ((i & 7) == 4)
vpor WY, WY_TMP, WY_TMP
vpslld $2, WY_TMP2, WY
.elseif ((i & 7) == 5)
vpsrld $30, WY_TMP2, WY_TMP2
vpxor WY, WY_TMP, WY_TMP
.elseif ((i & 7) == 7)
vpxor WY_TMP2, WY_TMP, WY
vpaddd K_XMM(K_BASE), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)

PRECALC_ROTATE_WY
.endif
.endm

.macro PRECALC_32_79
/*
 * in SHA-1 specification:
 * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
 * instead we use the equivalent form:
 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization,
 * since the w[i]->w[i-3] dependency is broken
 */
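/*
 * Why the two forms are equal (added sketch): expand w[i-3], w[i-8],
 * w[i-14] and w[i-16] once more with the original recurrence.  rol
 * distributes over xor, and the terms w[i-11], w[i-17], w[i-19],
 * w[i-22], w[i-24] and w[i-30] each occur twice and cancel, so
 *	w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]
 *		= (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 1
 * and the outer rol 1 of the original recurrence gives the rol 2 form,
 * valid for i >= 32.
 */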

.if ((i & 7) == 0)
/*
 * blended AVX2 and ALU instruction scheduling
 * 1 vector iteration per 8 rounds
 */
vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
.elseif ((i & 7) == 1)
/* W is W_minus_32 before xor */
vpxor WY_minus_28, WY, WY
.elseif ((i & 7) == 2)
vpxor WY_minus_16, WY_TMP, WY_TMP
.elseif ((i & 7) == 3)
vpxor WY_TMP, WY, WY
.elseif ((i & 7) == 4)
vpslld $2, WY, WY_TMP
.elseif ((i & 7) == 5)
vpsrld $30, WY, WY
vpor WY, WY_TMP, WY
.elseif ((i & 7) == 7)
vpaddd K_XMM(K_BASE), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)

PRECALC_ROTATE_WY
.endif
.endm

.macro PRECALC r, s
.set i, \r

.if (i < 40)
.set K_XMM, 32*0
.elseif (i < 80)
.set K_XMM, 32*1
.elseif (i < 120)
.set K_XMM, 32*2
.else
.set K_XMM, 32*3
.endif
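/*
 * Note: i counts precalc steps, not rounds - each group of 8 steps
 * produces W+K for 4 rounds of both blocks, i.e. roughly two steps per
 * round - so the K constant boundaries fall at i = 40/80/120 instead
 * of at rounds 20/40/60.
 */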

.if (i<32)
PRECALC_00_15 \s
.elseif (i<64)
PRECALC_16_31 \s
.elseif (i < 160)
PRECALC_32_79 \s
.endif
.endm

.macro ROTATE_STATE
.set T_REG, E
.set E, D
.set D, C
.set C, B
.set B, TB
.set TB, A
.set A, T_REG

.set T_REG, RE
.set RE, RD
.set RD, RC
.set RC, RB
.set RB, RTB
.set RTB, RA
.set RA, T_REG
.endm

/* Macro relies on saved ROUND_Fx */

.macro RND_FUN f, r
.if (\f == RND_F1)
ROUND_F1 \r
.elseif (\f == RND_F2)
ROUND_F2 \r
.elseif (\f == RND_F3)
ROUND_F3 \r
.endif
.endm

.macro RR r
.set round_id, (\r % 80)

.if (round_id == 0) /* Precalculate F for first round */
.set ROUND_FUNC, RND_F1
mov B, TB

rorx $(32-30), B, B /* b>>>2 */
andn D, TB, T1
and C, TB
xor T1, TB
.endif

RND_FUN ROUND_FUNC, \r
ROTATE_STATE

.if (round_id == 18)
.set ROUND_FUNC, RND_F2
.elseif (round_id == 38)
.set ROUND_FUNC, RND_F3
.elseif (round_id == 58)
.set ROUND_FUNC, RND_F2
.endif

.set round_id, ( (\r+1) % 80)

RND_FUN ROUND_FUNC, (\r+1)
ROTATE_STATE
.endm
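
/*
 * The round function is switched one round early (the checks against
 * round_id 18, 38 and 58 above): each ROUND_Fx macro computes F for
 * the *next* round, so rounds 19, 39 and 59 must already run the macro
 * of the following stage for F2/F3/F2 to be ready when rounds 20, 40
 * and 60 consume it.
 */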

.macro ROUND_F1 r
add WK(\r), E

andn C, A, T1 /* ~b&d */
lea (RE,RTB), E /* Add F from the previous round */

rorx $(32-5), A, TA /* T2 = A >>> 5 */
rorx $(32-30),A, TB /* b>>>2 for next round */

PRECALC (\r) /* msg scheduling for next 2 blocks */

/*
 * Calculate F for the next round
 * (b & c) ^ andn[b, d]
 */
and B, A /* b&c */
xor T1, A /* F1 = (b&c) ^ (~b&d) */

lea (RE,RTA), E /* E += A >>> 5 */
.endm

.macro ROUND_F2 r
add WK(\r), E
lea (RE,RTB), E /* Add F from the previous round */

/* Calculate F for the next round */
rorx $(32-5), A, TA /* T2 = A >>> 5 */
.if ((round_id) < 79)
rorx $(32-30), A, TB /* b>>>2 for next round */
.endif
PRECALC (\r) /* msg scheduling for next 2 blocks */

.if ((round_id) < 79)
xor B, A
.endif

add TA, E /* E += A >>> 5 */

.if ((round_id) < 79)
xor C, A
.endif
.endm

.macro ROUND_F3 r
add WK(\r), E
PRECALC (\r) /* msg scheduling for next 2 blocks */

lea (RE,RTB), E /* Add F from the previous round */

mov B, T1
or A, T1

rorx $(32-5), A, TA /* T2 = A >>> 5 */
rorx $(32-30), A, TB /* b>>>2 for next round */

/* Calculate F for the next round
 * (b and c) or (d and (b or c))
 */
and C, T1
and B, A
or T1, A

add TA, E /* E += A >>> 5 */

.endm

/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 */
.macro SHA1_PIPELINED_MAIN_BODY

REGALLOC

mov (HASH_PTR), A
mov 4(HASH_PTR), B
mov 8(HASH_PTR), C
mov 12(HASH_PTR), D
mov 16(HASH_PTR), E

mov %rsp, PRECALC_BUF
lea (2*4*80+32)(%rsp), WK_BUF

# Precalc WK for first 2 blocks
PRECALC_OFFSET = 0
.set i, 0
.rept 160
PRECALC i
.set i, i + 1
.endr
PRECALC_OFFSET = 128
xchg WK_BUF, PRECALC_BUF
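/*
 * Double buffering: after the xchg above, WK_BUF points at the W+K
 * values just precalculated for the first two blocks, while PRECALC_BUF
 * points at the other half of the stack area.  In the main loop the
 * round macros read from WK_BUF and, via PRECALC, fill PRECALC_BUF for
 * the following pair of blocks; the xchg at the bottom of the loop then
 * swaps the two roles again.
 */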

.align 32
_loop:
/*
 * The code loops through more than one block; we use the K_BASE value
 * as a sentinel marking the last block. It is set below by:
 * cmovae BUFFER_PTR, K_BASE
 */
cmp K_BASE, BUFFER_PTR
jne _begin
.align 32
jmp _end
.align 32
_begin:

/*
 * Do first block
 * rounds: 0,2,4,6,8
 */
.set j, 0
.rept 5
RR j
.set j, j+2
.endr

jmp _loop0
_loop0:

/*
 * rounds:
 * 10,12,14,16,18
 * 20,22,24,26,28
 * 30,32,34,36,38
 * 40,42,44,46,48
 * 50,52,54,56,58
 */
.rept 25
RR j
.set j, j+2
.endr

add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */
cmp BUFFER_END, BUFFER_PTR /* is current block the last one? */
cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */

/*
 * rounds
 * 60,62,64,66,68
 * 70,72,74,76,78
 */
.rept 10
RR j
.set j, j+2
.endr

UPDATE_HASH (HASH_PTR), A
UPDATE_HASH 4(HASH_PTR), TB
UPDATE_HASH 8(HASH_PTR), C
UPDATE_HASH 12(HASH_PTR), D
UPDATE_HASH 16(HASH_PTR), E

cmp K_BASE, BUFFER_PTR /* is current block the last one? */
je _loop

mov TB, B

/* Process second block */
/*
 * rounds
 * 0+80, 2+80, 4+80, 6+80, 8+80
 * 10+80,12+80,14+80,16+80,18+80
 */

.set j, 0
.rept 10
RR j+80
.set j, j+2
.endr

jmp _loop1
_loop1:
/*
 * rounds
 * 20+80,22+80,24+80,26+80,28+80
 * 30+80,32+80,34+80,36+80,38+80
 */
.rept 10
RR j+80
.set j, j+2
.endr

jmp _loop2
_loop2:

/*
 * rounds
 * 40+80,42+80,44+80,46+80,48+80
 * 50+80,52+80,54+80,56+80,58+80
 */
.rept 10
RR j+80
.set j, j+2
.endr

add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */

cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one? */
cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */

jmp _loop3
_loop3:

/*
 * rounds
 * 60+80,62+80,64+80,66+80,68+80
 * 70+80,72+80,74+80,76+80,78+80
 */
.rept 10
RR j+80
.set j, j+2
.endr

UPDATE_HASH (HASH_PTR), A
UPDATE_HASH 4(HASH_PTR), TB
UPDATE_HASH 8(HASH_PTR), C
UPDATE_HASH 12(HASH_PTR), D
UPDATE_HASH 16(HASH_PTR), E

/* Reset state for AVX2 reg permutation */
mov A, TA
mov TB, A
mov C, TB
mov E, C
mov D, B
mov TA, D
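/*
 * The logical a..e names have rotated away from the fixed register
 * assignment over the 160 rounds above; the moves shuffle the updated
 * state so that, after REGALLOC rebinds the names below, a,b,c,d,e are
 * again in REG_A..REG_E for the next loop iteration.
 */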

REGALLOC

xchg WK_BUF, PRECALC_BUF

jmp _loop

.align 32
_end:

.endm
/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM name
ENTRY(\name)

push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15

RESERVE_STACK = (W_SIZE*4 + 8+24)

/* Align stack */
mov %rsp, %rbx
and $~(0x20-1), %rsp
push %rbx
sub $RESERVE_STACK, %rsp

avx2_zeroupper

lea K_XMM_AR(%rip), K_BASE

mov CTX, HASH_PTR
mov BUF, BUFFER_PTR
lea 64(BUF), BUFFER_PTR2

shl $6, CNT /* mul by 64 */
add BUF, CNT
add $64, CNT
mov CNT, BUFFER_END

cmp BUFFER_END, BUFFER_PTR2
cmovae K_BASE, BUFFER_PTR2

xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

SHA1_PIPELINED_MAIN_BODY

avx2_zeroupper

add $RESERVE_STACK, %rsp
pop %rsp

pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx

ret

ENDPROC(\name)
.endm

.section .rodata

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

.align 128
K_XMM_AR:
.long K1, K1, K1, K1
.long K1, K1, K1, K1
.long K2, K2, K2, K2
.long K2, K2, K2, K2
.long K3, K3, K3, K3
.long K3, K3, K3, K3
.long K4, K4, K4, K4
.long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
.text

SHA1_VECTOR_ASM sha1_transform_avx2