/*
 * Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release:
 * Visit http://software.intel.com/en-us/articles/
 * and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Updates the 20-byte SHA-1 record at 'hash' for an even number
 * ('num_blocks') of consecutive 64-byte blocks.
 *
 * extern "C" void sha1_transform_avx2(
 *	int *hash, const char* input, size_t num_blocks );
 */
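
/*
 * Illustrative call from C (a sketch only; the state-word initializers
 * and the rounding of the block count below are hypothetical and not
 * taken from the kernel glue code):
 *
 *	int state[5] = { 0x67452301, 0xefcdab89, 0x98badcfe,
 *			 0x10325476, 0xc3d2e1f0 };
 *	sha1_transform_avx2(state, data, num_blocks & ~(size_t)1);
 *
 * An odd trailing block has to be handled separately by the caller.
 */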

#include <linux/linkage.h>

#define CTX %rdi /* arg1 */
#define BUF %rsi /* arg2 */
#define CNT %rdx /* arg3 */

#define REG_A %ecx
#define REG_B %esi
#define REG_C %edi
#define REG_D %eax
#define REG_E %edx
#define REG_TB %ebx
#define REG_TA %r12d
#define REG_RA %rcx
#define REG_RB %rsi
#define REG_RC %rdi
#define REG_RD %rax
#define REG_RE %rdx
#define REG_RTA %r12
#define REG_RTB %rbx
#define REG_T1 %ebp
#define xmm_mov vmovups
#define avx2_zeroupper vzeroupper
#define RND_F1 1
#define RND_F2 2
#define RND_F3 3

.macro REGALLOC
.set A, REG_A
.set B, REG_B
.set C, REG_C
.set D, REG_D
.set E, REG_E
.set TB, REG_TB
.set TA, REG_TA

.set RA, REG_RA
.set RB, REG_RB
.set RC, REG_RC
.set RD, REG_RD
.set RE, REG_RE

.set RTA, REG_RTA
.set RTB, REG_RTB

.set T1, REG_T1
.endm

#define K_BASE %r8
#define HASH_PTR %r9
#define BUFFER_PTR %r10
#define BUFFER_PTR2 %r13
#define BUFFER_END %r11

#define PRECALC_BUF %r14
#define WK_BUF %r15

#define W_TMP %xmm0
#define WY_TMP %ymm0
#define WY_TMP2 %ymm9

# AVX2 variables
#define WY0 %ymm3
#define WY4 %ymm5
#define WY08 %ymm7
#define WY12 %ymm8
#define WY16 %ymm12
#define WY20 %ymm13
#define WY24 %ymm14
#define WY28 %ymm15

#define YMM_SHUFB_BSWAP %ymm10

/*
 * Keep 2 iterations precalculated at a time:
 * - 80 DWORDs per iteration * 2
 */
#define W_SIZE (80*2*2 +16)

#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF)
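
/*
 * Layout note (worked out here for illustration): each vmovdqu of
 * WY_TMP stores one 32-byte group holding W+K for four rounds of both
 * pipelined blocks - bytes 0..15 for the block read via BUFFER_PTR,
 * bytes 16..31 for the block read via BUFFER_PTR2.  WK(t) indexes that
 * layout: ((t % 80) / 4) picks the group, (t % 4) the dword inside it,
 * and (t / 80) adds 16 to select the second block's half.  For example,
 * WK(81) = 0*32 + 1*4 + 16 = offset 20.  PRECALC_WK(i & ~7) likewise
 * advances by 32 bytes per group of 8 precalc steps.
 */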


.macro UPDATE_HASH hash, val
add \hash, \val
mov \val, \hash
.endm

.macro PRECALC_RESET_WY
.set WY_00, WY0
.set WY_04, WY4
.set WY_08, WY08
.set WY_12, WY12
.set WY_16, WY16
.set WY_20, WY20
.set WY_24, WY24
.set WY_28, WY28
.set WY_32, WY_00
.endm

.macro PRECALC_ROTATE_WY
/* Rotate macros */
.set WY_32, WY_28
.set WY_28, WY_24
.set WY_24, WY_20
.set WY_20, WY_16
.set WY_16, WY_12
.set WY_12, WY_08
.set WY_08, WY_04
.set WY_04, WY_00
.set WY_00, WY_32

/* Define register aliases */
.set WY, WY_00
.set WY_minus_04, WY_04
.set WY_minus_08, WY_08
.set WY_minus_12, WY_12
.set WY_minus_16, WY_16
.set WY_minus_20, WY_20
.set WY_minus_24, WY_24
.set WY_minus_28, WY_28
.set WY_minus_32, WY
.endm

.macro PRECALC_00_15
.if (i == 0) # Initialize and rotate registers
PRECALC_RESET_WY
PRECALC_ROTATE_WY
.endif

/* message scheduling pre-compute for rounds 0-15 */
.if ((i & 7) == 0)
/*
 * blended AVX2 and ALU instruction scheduling
 * 1 vector iteration per 8 rounds
 */
vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
.elseif ((i & 7) == 1)
vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\
WY_TMP, WY_TMP
.elseif ((i & 7) == 2)
vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
.elseif ((i & 7) == 4)
vpaddd K_XMM(K_BASE), WY, WY_TMP
.elseif ((i & 7) == 7)
vmovdqu WY_TMP, PRECALC_WK(i&~7)

PRECALC_ROTATE_WY
.endif
.endm

.macro PRECALC_16_31
/*
 * message scheduling pre-compute for rounds 16-31
 * calculating last 32 w[i] values in 8 XMM registers
 * pre-calculate K+w[i] values and store to mem
 * for later load by ALU add instruction
 *
 * "brute force" vectorization for rounds 16-31 only
 * due to w[i]->w[i-3] dependency
 */
.if ((i & 7) == 0)
/*
 * blended AVX2 and ALU instruction scheduling
 * 1 vector iteration per 8 rounds
 */
/* w[i-14] */
vpalignr $8, WY_minus_16, WY_minus_12, WY
vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */
.elseif ((i & 7) == 1)
vpxor WY_minus_08, WY, WY
vpxor WY_minus_16, WY_TMP, WY_TMP
.elseif ((i & 7) == 2)
vpxor WY_TMP, WY, WY
vpslldq $12, WY, WY_TMP2
.elseif ((i & 7) == 3)
vpslld $1, WY, WY_TMP
vpsrld $31, WY, WY
.elseif ((i & 7) == 4)
vpor WY, WY_TMP, WY_TMP
vpslld $2, WY_TMP2, WY
.elseif ((i & 7) == 5)
vpsrld $30, WY_TMP2, WY_TMP2
vpxor WY, WY_TMP, WY_TMP
.elseif ((i & 7) == 7)
vpxor WY_TMP2, WY_TMP, WY
vpaddd K_XMM(K_BASE), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)

PRECALC_ROTATE_WY
.endif
.endm

.macro PRECALC_32_79
/*
 * in SHA-1 specification:
 * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
 * instead we use the equivalent form:
 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization,
 * since the w[i]->w[i-3] dependency is broken
 */
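/*
 * Why the two forms are equal (added sketch): expand w[i-3], w[i-8],
 * w[i-14] and w[i-16] once more with the original recurrence.  rol
 * distributes over xor, and the terms w[i-11], w[i-17], w[i-19],
 * w[i-22], w[i-24] and w[i-30] each occur twice and cancel, so
 *	w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]
 *		= (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 1
 * and the outer rol 1 of the original recurrence gives the rol 2 form,
 * valid for i >= 32.
 */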

.if ((i & 7) == 0)
/*
 * blended AVX2 and ALU instruction scheduling
 * 1 vector iteration per 8 rounds
 */
vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
.elseif ((i & 7) == 1)
/* W is W_minus_32 before xor */
vpxor WY_minus_28, WY, WY
.elseif ((i & 7) == 2)
vpxor WY_minus_16, WY_TMP, WY_TMP
.elseif ((i & 7) == 3)
vpxor WY_TMP, WY, WY
.elseif ((i & 7) == 4)
vpslld $2, WY, WY_TMP
.elseif ((i & 7) == 5)
vpsrld $30, WY, WY
vpor WY, WY_TMP, WY
.elseif ((i & 7) == 7)
vpaddd K_XMM(K_BASE), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)

PRECALC_ROTATE_WY
.endif
.endm

.macro PRECALC r, s
.set i, \r

.if (i < 40)
.set K_XMM, 32*0
.elseif (i < 80)
.set K_XMM, 32*1
.elseif (i < 120)
.set K_XMM, 32*2
.else
.set K_XMM, 32*3
.endif
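/*
 * Note: i counts precalc steps, not rounds - each group of 8 steps
 * produces W+K for 4 rounds of both blocks, i.e. roughly two steps per
 * round - so the K constant boundaries fall at i = 40/80/120 instead
 * of at rounds 20/40/60.
 */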

.if (i<32)
PRECALC_00_15 \s
.elseif (i<64)
PRECALC_16_31 \s
.elseif (i < 160)
PRECALC_32_79 \s
.endif
.endm

.macro ROTATE_STATE
.set T_REG, E
.set E, D
.set D, C
.set C, B
.set B, TB
.set TB, A
.set A, T_REG

.set T_REG, RE
.set RE, RD
.set RD, RC
.set RC, RB
.set RB, RTB
.set RTB, RA
.set RA, T_REG
.endm

/* Macro relies on saved ROUND_Fx */

.macro RND_FUN f, r
.if (\f == RND_F1)
ROUND_F1 \r
.elseif (\f == RND_F2)
ROUND_F2 \r
.elseif (\f == RND_F3)
ROUND_F3 \r
.endif
.endm

.macro RR r
.set round_id, (\r % 80)

.if (round_id == 0) /* Precalculate F for first round */
.set ROUND_FUNC, RND_F1
mov B, TB

rorx $(32-30), B, B /* b>>>2 */
andn D, TB, T1
and C, TB
xor T1, TB
.endif

RND_FUN ROUND_FUNC, \r
ROTATE_STATE

.if (round_id == 18)
.set ROUND_FUNC, RND_F2
.elseif (round_id == 38)
.set ROUND_FUNC, RND_F3
.elseif (round_id == 58)
.set ROUND_FUNC, RND_F2
.endif

.set round_id, ( (\r+1) % 80)

RND_FUN ROUND_FUNC, (\r+1)
ROTATE_STATE
.endm
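
/*
 * The round function is switched one round early (the checks against
 * round_id 18, 38 and 58 above): each ROUND_Fx macro computes F for
 * the *next* round, so rounds 19, 39 and 59 must already run the macro
 * of the following stage for F2/F3/F2 to be ready when rounds 20, 40
 * and 60 consume it.
 */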

.macro ROUND_F1 r
add WK(\r), E

andn C, A, T1 /* ~b&d */
lea (RE,RTB), E /* Add F from the previous round */

rorx $(32-5), A, TA /* T2 = A >>> 5 */
rorx $(32-30),A, TB /* b>>>2 for next round */

PRECALC (\r) /* msg scheduling for next 2 blocks */

/*
 * Calculate F for the next round
 * (b & c) ^ andn[b, d]
 */
and B, A /* b&c */
xor T1, A /* F1 = (b&c) ^ (~b&d) */

lea (RE,RTA), E /* E += A >>> 5 */
.endm

.macro ROUND_F2 r
add WK(\r), E
lea (RE,RTB), E /* Add F from the previous round */

/* Calculate F for the next round */
rorx $(32-5), A, TA /* T2 = A >>> 5 */
.if ((round_id) < 79)
rorx $(32-30), A, TB /* b>>>2 for next round */
.endif
PRECALC (\r) /* msg scheduling for next 2 blocks */

.if ((round_id) < 79)
xor B, A
.endif

add TA, E /* E += A >>> 5 */

.if ((round_id) < 79)
xor C, A
.endif
.endm

.macro ROUND_F3 r
add WK(\r), E
PRECALC (\r) /* msg scheduling for next 2 blocks */

lea (RE,RTB), E /* Add F from the previous round */

mov B, T1
or A, T1

rorx $(32-5), A, TA /* T2 = A >>> 5 */
rorx $(32-30), A, TB /* b>>>2 for next round */

/* Calculate F for the next round
 * (b and c) or (d and (b or c))
 */
and C, T1
and B, A
or T1, A

add TA, E /* E += A >>> 5 */

.endm

/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 */
.macro SHA1_PIPELINED_MAIN_BODY

REGALLOC

mov (HASH_PTR), A
mov 4(HASH_PTR), B
mov 8(HASH_PTR), C
mov 12(HASH_PTR), D
mov 16(HASH_PTR), E

mov %rsp, PRECALC_BUF
lea (2*4*80+32)(%rsp), WK_BUF

# Precalc WK for first 2 blocks
PRECALC_OFFSET = 0
.set i, 0
.rept 160
PRECALC i
.set i, i + 1
.endr
PRECALC_OFFSET = 128
xchg WK_BUF, PRECALC_BUF
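/*
 * Double buffering: after the xchg above, WK_BUF points at the W+K
 * values just precalculated for the first two blocks, while PRECALC_BUF
 * points at the other half of the stack area.  In the main loop the
 * round macros read from WK_BUF and, via PRECALC, fill PRECALC_BUF for
 * the following pair of blocks; the xchg at the bottom of the loop then
 * swaps the two roles again.
 */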

.align 32
_loop:
/*
 * The code loops through more than one block; we use the K_BASE value
 * as a sentinel marking the last block. It is set below by:
 * cmovae BUFFER_PTR, K_BASE
 */
cmp K_BASE, BUFFER_PTR
jne _begin
.align 32
jmp _end
.align 32
_begin:

/*
 * Do first block
 * rounds: 0,2,4,6,8
 */
.set j, 0
.rept 5
RR j
.set j, j+2
.endr

jmp _loop0
_loop0:

/*
 * rounds:
 * 10,12,14,16,18
 * 20,22,24,26,28
 * 30,32,34,36,38
 * 40,42,44,46,48
 * 50,52,54,56,58
 */
.rept 25
RR j
.set j, j+2
.endr

add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */
cmp BUFFER_END, BUFFER_PTR /* is current block the last one? */
cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */

/*
 * rounds
 * 60,62,64,66,68
 * 70,72,74,76,78
 */
.rept 10
RR j
.set j, j+2
.endr

UPDATE_HASH (HASH_PTR), A
UPDATE_HASH 4(HASH_PTR), TB
UPDATE_HASH 8(HASH_PTR), C
UPDATE_HASH 12(HASH_PTR), D
UPDATE_HASH 16(HASH_PTR), E

cmp K_BASE, BUFFER_PTR /* is current block the last one? */
je _loop

mov TB, B

/* Process second block */
/*
 * rounds
 * 0+80, 2+80, 4+80, 6+80, 8+80
 * 10+80,12+80,14+80,16+80,18+80
 */

.set j, 0
.rept 10
RR j+80
.set j, j+2
.endr

jmp _loop1
_loop1:
/*
 * rounds
 * 20+80,22+80,24+80,26+80,28+80
 * 30+80,32+80,34+80,36+80,38+80
 */
.rept 10
RR j+80
.set j, j+2
.endr

jmp _loop2
_loop2:

/*
 * rounds
 * 40+80,42+80,44+80,46+80,48+80
 * 50+80,52+80,54+80,56+80,58+80
 */
.rept 10
RR j+80
.set j, j+2
.endr

add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */

cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one? */
cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */

jmp _loop3
_loop3:

/*
 * rounds
 * 60+80,62+80,64+80,66+80,68+80
 * 70+80,72+80,74+80,76+80,78+80
 */
.rept 10
RR j+80
.set j, j+2
.endr

UPDATE_HASH (HASH_PTR), A
UPDATE_HASH 4(HASH_PTR), TB
UPDATE_HASH 8(HASH_PTR), C
UPDATE_HASH 12(HASH_PTR), D
UPDATE_HASH 16(HASH_PTR), E

/* Reset state for AVX2 reg permutation */
mov A, TA
mov TB, A
mov C, TB
mov E, C
mov D, B
mov TA, D
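/*
 * The logical a..e names have rotated away from the fixed register
 * assignment over the 160 rounds above; the moves shuffle the updated
 * state so that, after REGALLOC rebinds the names below, a,b,c,d,e are
 * again in REG_A..REG_E for the next loop iteration.
 */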

REGALLOC

xchg WK_BUF, PRECALC_BUF

jmp _loop

.align 32
_end:

.endm
/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM name
ENTRY(\name)

push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15

RESERVE_STACK = (W_SIZE*4 + 8+24)

/* Align stack */
mov %rsp, %rbx
and $~(0x20-1), %rsp
push %rbx
sub $RESERVE_STACK, %rsp

avx2_zeroupper

lea K_XMM_AR(%rip), K_BASE

mov CTX, HASH_PTR
mov BUF, BUFFER_PTR
lea 64(BUF), BUFFER_PTR2

shl $6, CNT /* mul by 64 */
add BUF, CNT
add $64, CNT
mov CNT, BUFFER_END

cmp BUFFER_END, BUFFER_PTR2
cmovae K_BASE, BUFFER_PTR2

xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

SHA1_PIPELINED_MAIN_BODY

avx2_zeroupper

add $RESERVE_STACK, %rsp
pop %rsp

pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx

ret

ENDPROC(\name)
.endm

.section .rodata

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

.align 128
K_XMM_AR:
.long K1, K1, K1, K1
.long K1, K1, K1, K1
.long K2, K2, K2, K2
.long K2, K2, K2, K2
.long K3, K3, K3, K3
.long K3, K3, K3, K3
.long K4, K4, K4, K4
.long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
.text

SHA1_VECTOR_ASM sha1_transform_avx2