/*
 * Multi-buffer SHA1 algorithm hash compute routine
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 *	James Guilford <james.guilford@intel.com>
 *	Tim Chen <tim.c.chen@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <linux/linkage.h>
#include "sha1_mb_mgr_datastruct.S"

## code to compute oct SHA1 using AVX2
## outer calling routine takes care of save and restore of XMM registers

## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
##
## Linux clobbers:    rax rbx rcx rdx rsi	r9 r10 r11 r12 r13 r14 r15
## Linux preserves:   rdi rbp r8
##
## clobbers ymm0-15

# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
# "transpose" data in {r0...r7} using temps {t0...t1}
# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {a7 a6 a5 a4   a3 a2 a1 a0}
# r1 = {b7 b6 b5 b4   b3 b2 b1 b0}
# r2 = {c7 c6 c5 c4   c3 c2 c1 c0}
# r3 = {d7 d6 d5 d4   d3 d2 d1 d0}
# r4 = {e7 e6 e5 e4   e3 e2 e1 e0}
# r5 = {f7 f6 f5 f4   f3 f2 f1 f0}
# r6 = {g7 g6 g5 g4   g3 g2 g1 g0}
# r7 = {h7 h6 h5 h4   h3 h2 h1 h0}
#
# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {h0 g0 f0 e0   d0 c0 b0 a0}
# r1 = {h1 g1 f1 e1   d1 c1 b1 a1}
# r2 = {h2 g2 f2 e2   d2 c2 b2 a2}
# r3 = {h3 g3 f3 e3   d3 c3 b3 a3}
# r4 = {h4 g4 f4 e4   d4 c4 b4 a4}
# r5 = {h5 g5 f5 e5   d5 c5 b5 a5}
# r6 = {h6 g6 f6 e6   d6 c6 b6 a6}
# r7 = {h7 g7 f7 e7   d7 c7 b7 a7}
#
.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
	# process top half (r0..r3) {a...d}
	vshufps	$0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4   b1 b0 a1 a0}
	vshufps	$0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6   b3 b2 a3 a2}
	vshufps	$0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4   d1 d0 c1 c0}
	vshufps	$0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6   d3 d2 c3 c2}
	vshufps	$0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5   d1 c1 b1 a1}
	vshufps	$0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6   d2 c2 b2 a2}
	vshufps	$0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7   d3 c3 b3 a3}
	vshufps	$0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4   d0 c0 b0 a0}

	# use r2 in place of t0
	# process bottom half (r4..r7) {e...h}
	vshufps	$0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4   f1 f0 e1 e0}
	vshufps	$0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6   f3 f2 e3 e2}
	vshufps	$0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4   h1 h0 g1 g0}
	vshufps	$0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6   h3 h2 g3 g2}
	vshufps	$0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5   h1 g1 f1 e1}
	vshufps	$0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6   h2 g2 f2 e2}
	vshufps	$0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7   h3 g3 f3 e3}
	vshufps	$0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4   h0 g0 f0 e0}

	vperm2f128	$0x13, \r1, \r5, \r6 # h6...a6
	vperm2f128	$0x02, \r1, \r5, \r2 # h2...a2
	vperm2f128	$0x13, \r3, \r7, \r5 # h5...a5
	vperm2f128	$0x02, \r3, \r7, \r1 # h1...a1
	vperm2f128	$0x13, \r0, \r4, \r7 # h7...a7
	vperm2f128	$0x02, \r0, \r4, \r3 # h3...a3
	vperm2f128	$0x13, \t0, \t1, \r4 # h4...a4
	vperm2f128	$0x02, \t0, \t1, \r0 # h0...a0

.endm
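
# For orientation, a scalar C model of what TRANSPOSE8 computes (an
# illustrative sketch, not part of the build): with each ymm register
# holding one row of an 8x8 matrix of 32-bit words, the macro leaves
# the transposed matrix in r0..r7.
#
#	/* assumes <stdint.h> and <string.h> */
#	static void transpose8x8(uint32_t m[8][8])
#	{
#		uint32_t t[8][8];
#		int i, j;
#
#		for (i = 0; i < 8; i++)
#			for (j = 0; j < 8; j++)
#				t[j][i] = m[i][j];
#		memcpy(m, t, sizeof(t));
#	}
#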
##
## Magic functions defined in FIPS 180-1
##
# macro MAGIC_F0 F,B,C,D,T   ## F = (D ^ (B & (C ^ D)))
.macro MAGIC_F0 regF regB regC regD regT
	vpxor	\regD, \regC, \regF
	vpand	\regB, \regF, \regF
	vpxor	\regD, \regF, \regF
.endm

# macro MAGIC_F1 F,B,C,D,T   ## F = (B ^ C ^ D)
.macro MAGIC_F1 regF regB regC regD regT
	vpxor	\regC, \regD, \regF
	vpxor	\regB, \regF, \regF
.endm

# macro MAGIC_F2 F,B,C,D,T   ## F = ((B & C) | (B & D) | (C & D))
.macro MAGIC_F2 regF regB regC regD regT
	vpor	\regC, \regB, \regF
	vpand	\regC, \regB, \regT
	vpand	\regD, \regF, \regF
	vpor	\regT, \regF, \regF
.endm

# macro MAGIC_F3 F,B,C,D,T   ## F = (B ^ C ^ D)
.macro MAGIC_F3 regF regB regC regD regT
	MAGIC_F1 \regF,\regB,\regC,\regD,\regT
.endm
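
# Scalar reference for the round functions (a sketch following FIPS
# 180-1; the vector macros above apply the same boolean logic to each
# of the eight 32-bit lanes of a ymm register):
#
#	uint32_t f0(uint32_t b, uint32_t c, uint32_t d)	/* rounds  0-19 */
#	{
#		return d ^ (b & (c ^ d));	/* Ch, i.e. (b&c)|(~b&d) */
#	}
#
#	uint32_t f1(uint32_t b, uint32_t c, uint32_t d)	/* rounds 20-39 */
#	{
#		return b ^ c ^ d;		/* Parity */
#	}
#
#	uint32_t f2(uint32_t b, uint32_t c, uint32_t d)	/* rounds 40-59 */
#	{
#		return (b & c) | (d & (b | c));	/* Maj, as MAGIC_F2 computes it */
#	}
#
# f3 (rounds 60-79) reuses f1, exactly as MAGIC_F3 forwards to MAGIC_F1.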

# PROLD reg, imm, tmp
.macro PROLD reg imm tmp
	vpsrld	$(32-\imm), \reg, \tmp
	vpslld	$\imm, \reg, \reg
	vpor	\tmp, \reg, \reg
.endm

.macro PROLD_nd reg imm tmp src
	vpsrld	$(32-\imm), \src, \tmp
	vpslld	$\imm, \src, \reg
	vpor	\tmp, \reg, \reg
.endm
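
# Both macros rotate each 32-bit lane left by an immediate; AVX2 has
# no packed-rotate instruction, hence the shift/shift/or sequence.
# Scalar model (valid for 1 <= n <= 31, which covers every use here):
#
#	uint32_t rol32(uint32_t x, int n)
#	{
#		return (x << n) | (x >> (32 - n));
#	}
#
# PROLD rotates \reg in place; PROLD_nd is the non-destructive form,
# reading \src and leaving the result in \reg.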

.macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC
	vpaddd	\immCNT, \regE, \regE
	vpaddd	\memW*32(%rsp), \regE, \regE
	PROLD_nd \regT, 5, \regF, \regA
	vpaddd	\regT, \regE, \regE
	\MAGIC	\regF, \regB, \regC, \regD, \regT
	PROLD	\regB, 30, \regT
	vpaddd	\regF, \regE, \regE
.endm

.macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC
	vpaddd	\immCNT, \regE, \regE
	offset = ((\memW - 14) & 15) * 32
	vmovdqu	offset(%rsp), W14
	vpxor	W14, W16, W16
	offset = ((\memW - 8) & 15) * 32
	vpxor	offset(%rsp), W16, W16
	offset = ((\memW - 3) & 15) * 32
	vpxor	offset(%rsp), W16, W16
	vpsrld	$(32-1), W16, \regF
	vpslld	$1, W16, W16
	vpor	W16, \regF, \regF

	ROTATE_W

	offset = ((\memW - 0) & 15) * 32
	vmovdqu	\regF, offset(%rsp)
	vpaddd	\regF, \regE, \regE
	PROLD_nd \regT, 5, \regF, \regA
	vpaddd	\regT, \regE, \regE
	\MAGIC	\regF,\regB,\regC,\regD,\regT	## FUN = MAGIC_Fi(B,C,D)
	PROLD	\regB,30, \regT
	vpaddd	\regF, \regE, \regE
.endm
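
# A scalar sketch of what one step computes per lane, for orientation
# (w[] is the 16-entry circular message schedule kept on the stack,
# f() the round function, k the round constant):
#
#	/* rounds 16-79 first extend the schedule: */
#	w[i & 15] = rol32(w[(i - 3) & 15] ^ w[(i - 8) & 15] ^
#			  w[(i - 14) & 15] ^ w[(i - 16) & 15], 1);
#	/* then every round does: */
#	e += rol32(a, 5) + f(b, c, d) + k + w[i & 15];
#	b = rol32(b, 30);
#	/* followed by the a..e renaming done by ROTATE_ARGS below */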

########################################################################
########################################################################
########################################################################

## FRAMESZ plus pushes must be an odd multiple of 8
YMM_SAVE = (15-15)*32
FRAMESZ = 32*16 + YMM_SAVE
_YMM = FRAMESZ - YMM_SAVE

#define VMOVPS	vmovups
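
# The 32*16 bytes of frame hold the rolling message schedule: 16 rows
# of eight 32-bit words, one word per lane. YMM_SAVE is zero because,
# as noted above, the outer calling routine saves the vector registers.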

IDX = %rax
inp0 = %r9
inp1 = %r10
inp2 = %r11
inp3 = %r12
inp4 = %r13
inp5 = %r14
inp6 = %r15
inp7 = %rcx
arg1 = %rdi
arg2 = %rsi
RSP_SAVE = %rdx

# ymm register usage (first name: round work, second: digest-save alias)
# ymm0	A
# ymm1	B
# ymm2	C
# ymm3	D
# ymm4	E
# ymm5	F	AA
# ymm6	T0	BB
# ymm7	T1	CC
# ymm8	T2	DD
# ymm9	T3	EE
# ymm10	T4	TMP
# ymm11	T5	FUN
# ymm12	T6	K
# ymm13	T7	W14
# ymm14	T8	W15
# ymm15	T9	W16


A = %ymm0
B = %ymm1
C = %ymm2
D = %ymm3
E = %ymm4
F = %ymm5
T0 = %ymm6
T1 = %ymm7
T2 = %ymm8
T3 = %ymm9
T4 = %ymm10
T5 = %ymm11
T6 = %ymm12
T7 = %ymm13
T8 = %ymm14
T9 = %ymm15

AA  = %ymm5
BB  = %ymm6
CC  = %ymm7
DD  = %ymm8
EE  = %ymm9
TMP = %ymm10
FUN = %ymm11
K   = %ymm12
W14 = %ymm13
W15 = %ymm14
W16 = %ymm15

.macro ROTATE_ARGS
 TMP_ = E
 E = D
 D = C
 C = B
 B = A
 A = TMP_
.endm

.macro ROTATE_W
TMP_ = W16
W16  = W15
W15  = W14
W14  = TMP_
.endm
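
# Neither macro moves any data: both are assembler-time renamings, so
# the usual end-of-round shuffle
#
#	e = d; d = c; c = b; b = a; a = t;
#
# costs nothing at run time. Each round accumulates its result into E,
# and ROTATE_ARGS makes that register the next round's A.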

# 8 streams x 5 32bit words per digest x 4 bytes per word
#define DIGEST_SIZE (8*5*4)

.align 32

# void sha1_x8_avx2(void **input_data, UINT128 *digest, UINT32 size)
# arg 1 : pointer to array[8] of pointer to input data
# arg 2 : size (in blocks) ;; assumed to be >= 1
#
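# The loads and stores below imply a layout along these lines (a
# hypothetical C view for illustration only; the real offsets,
# including _data_ptr, come from sha1_mb_mgr_datastruct.S):
#
#	struct sha1_x8_args {		/* hypothetical name */
#		uint32_t digest[5][8];	/* one row per word A..E, 8 lanes each */
#		uint8_t *data_ptr[8];	/* per-lane input pointer */
#	};
#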
ENTRY(sha1_x8_avx2)

	push	RSP_SAVE

	# save rsp
	mov	%rsp, RSP_SAVE
	sub	$FRAMESZ, %rsp

	# align rsp to 32 bytes
	and	$~0x1F, %rsp

	## Initialize digests
	vmovdqu	0*32(arg1), A
	vmovdqu	1*32(arg1), B
	vmovdqu	2*32(arg1), C
	vmovdqu	3*32(arg1), D
	vmovdqu	4*32(arg1), E

	## transpose input onto stack
	mov	_data_ptr+0*8(arg1),inp0
	mov	_data_ptr+1*8(arg1),inp1
	mov	_data_ptr+2*8(arg1),inp2
	mov	_data_ptr+3*8(arg1),inp3
	mov	_data_ptr+4*8(arg1),inp4
	mov	_data_ptr+5*8(arg1),inp5
	mov	_data_ptr+6*8(arg1),inp6
	mov	_data_ptr+7*8(arg1),inp7

	xor	IDX, IDX
lloop:
	vmovdqu	PSHUFFLE_BYTE_FLIP_MASK(%rip), F
	I=0
.rep 2
	VMOVPS	(inp0, IDX), T0
	VMOVPS	(inp1, IDX), T1
	VMOVPS	(inp2, IDX), T2
	VMOVPS	(inp3, IDX), T3
	VMOVPS	(inp4, IDX), T4
	VMOVPS	(inp5, IDX), T5
	VMOVPS	(inp6, IDX), T6
	VMOVPS	(inp7, IDX), T7

	TRANSPOSE8	T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
	vpshufb	F, T0, T0
	vmovdqu	T0, (I*8)*32(%rsp)
	vpshufb	F, T1, T1
	vmovdqu	T1, (I*8+1)*32(%rsp)
	vpshufb	F, T2, T2
	vmovdqu	T2, (I*8+2)*32(%rsp)
	vpshufb	F, T3, T3
	vmovdqu	T3, (I*8+3)*32(%rsp)
	vpshufb	F, T4, T4
	vmovdqu	T4, (I*8+4)*32(%rsp)
	vpshufb	F, T5, T5
	vmovdqu	T5, (I*8+5)*32(%rsp)
	vpshufb	F, T6, T6
	vmovdqu	T6, (I*8+6)*32(%rsp)
	vpshufb	F, T7, T7
	vmovdqu	T7, (I*8+7)*32(%rsp)
	add	$32, IDX
	I = (I+1)
.endr
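
	# Each pass of the .rep above pulled in 32 bytes per lane, so the
	# two passes stage one full 64-byte SHA1 block per lane: transposed
	# so every row holds the same dword from all eight lanes, byte-
	# swapped to big-endian via the mask in F, and parked on the stack.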
	# save old digests
	vmovdqu	A,AA
	vmovdqu	B,BB
	vmovdqu	C,CC
	vmovdqu	D,DD
	vmovdqu	E,EE

##
## perform 0-79 steps
##
	vmovdqu	K00_19(%rip), K
## do rounds 0...15
	I = 0
.rep 16
	SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
	ROTATE_ARGS
	I = (I+1)
.endr

## do rounds 16...19
	vmovdqu	((16 - 16) & 15) * 32 (%rsp), W16
	vmovdqu	((16 - 15) & 15) * 32 (%rsp), W15
.rep 4
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
	ROTATE_ARGS
	I = (I+1)
.endr

## do rounds 20...39
	vmovdqu	K20_39(%rip), K
.rep 20
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
	ROTATE_ARGS
	I = (I+1)
.endr

## do rounds 40...59
	vmovdqu	K40_59(%rip), K
.rep 20
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
	ROTATE_ARGS
	I = (I+1)
.endr

## do rounds 60...79
	vmovdqu	K60_79(%rip), K
.rep 20
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
	ROTATE_ARGS
	I = (I+1)
.endr

	vpaddd	AA,A,A
	vpaddd	BB,B,B
	vpaddd	CC,C,C
	vpaddd	DD,D,D
	vpaddd	EE,E,E

	sub	$1, arg2
	jne	lloop

	# write out digests
	vmovdqu	A, 0*32(arg1)
	vmovdqu	B, 1*32(arg1)
	vmovdqu	C, 2*32(arg1)
	vmovdqu	D, 3*32(arg1)
	vmovdqu	E, 4*32(arg1)

	# update input pointers
	add	IDX, inp0
	add	IDX, inp1
	add	IDX, inp2
	add	IDX, inp3
	add	IDX, inp4
	add	IDX, inp5
	add	IDX, inp6
	add	IDX, inp7
	mov	inp0, _data_ptr(arg1)
	mov	inp1, _data_ptr + 1*8(arg1)
	mov	inp2, _data_ptr + 2*8(arg1)
	mov	inp3, _data_ptr + 3*8(arg1)
	mov	inp4, _data_ptr + 4*8(arg1)
	mov	inp5, _data_ptr + 5*8(arg1)
	mov	inp6, _data_ptr + 6*8(arg1)
	mov	inp7, _data_ptr + 7*8(arg1)

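	# IDX advanced 64 bytes for every block hashed, so each lane's
	# data pointer now points just past the data consumed above.
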
	################
	## Postamble

	mov	RSP_SAVE, %rsp
	pop	RSP_SAVE

	ret
ENDPROC(sha1_x8_avx2)


.data

.align 32
K00_19:
.octa 0x5A8279995A8279995A8279995A827999
.octa 0x5A8279995A8279995A8279995A827999
K20_39:
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
K40_59:
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
K60_79:
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203
.octa 0x0c0d0e0f08090a0b0405060700010203
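
# K00_19..K60_79 are the four SHA1 round constants from FIPS 180-1,
# each replicated into all eight 32-bit lanes of a ymm register.
# PSHUFFLE_BYTE_FLIP_MASK byte-swaps each dword so the little-endian
# loads match the big-endian word order SHA1 expects.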