1 | ######################################################################## |
2 | # Copyright (c) 2013, Intel Corporation | |
3 | # | |
4 | # This software is available to you under a choice of one of two | |
5 | # licenses. You may choose to be licensed under the terms of the GNU | |
6 | # General Public License (GPL) Version 2, available from the file | |
7 | # COPYING in the main directory of this source tree, or the | |
8 | # OpenIB.org BSD license below: | |
9 | # | |
10 | # Redistribution and use in source and binary forms, with or without | |
11 | # modification, are permitted provided that the following conditions are | |
12 | # met: | |
13 | # | |
14 | # * Redistributions of source code must retain the above copyright | |
15 | # notice, this list of conditions and the following disclaimer. | |
16 | # | |
17 | # * Redistributions in binary form must reproduce the above copyright | |
18 | # notice, this list of conditions and the following disclaimer in the | |
19 | # documentation and/or other materials provided with the | |
20 | # distribution. | |
21 | # | |
22 | # * Neither the name of the Intel Corporation nor the names of its | |
23 | # contributors may be used to endorse or promote products derived from | |
24 | # this software without specific prior written permission. | |
25 | # | |
26 | # | |
27 | # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY | |
28 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
29 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
30 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR | |
31 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
32 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
33 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
34 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | |
35 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
36 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
37 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
38 | ######################################################################## | |
39 | ## | |
40 | ## Authors: | |
41 | ## Erdinc Ozturk <erdinc.ozturk@intel.com> | |
42 | ## Vinodh Gopal <vinodh.gopal@intel.com> | |
43 | ## James Guilford <james.guilford@intel.com> | |
44 | ## Tim Chen <tim.c.chen@linux.intel.com> | |
45 | ## | |
46 | ## References: | |
47 | ## This code was derived and highly optimized from the code described in the paper: | |
48 | ## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation | |
49 | ## on Intel Architecture Processors. August, 2010 | |
50 | ## The details of the implementation are explained in: | |
51 | ## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode | |
52 | ## on Intel Architecture Processors. October, 2012. | |
53 | ## | |
54 | ## Assumptions: | |
55 | ## | |
56 | ## | |
57 | ## | |
58 | ## iv: | |
59 | ## 0 1 2 3 | |
60 | ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |
61 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
62 | ## | Salt (From the SA) | | |
63 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
64 | ## | Initialization Vector | | |
65 | ## | (This is the sequence number from IPSec header) | | |
66 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
67 | ## | 0x1 | | |
68 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
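 | ## | |
 | ## Illustrative only (not part of the original source): a minimal C sketch of | |
 | ## how a caller could lay out this 16-byte pre-counter block, where salt and | |
 | ## seq_iv are hypothetical buffers taken from the SA and the IPSec header: | |
 | ## | |
 | ##     static void build_j0(u8 j0[16], const u8 salt[4], const u8 seq_iv[8]) | |
 | ##     { | |
 | ##             memcpy(j0, salt, 4);            /* Salt (from the SA)        */ | |
 | ##             memcpy(j0 + 4, seq_iv, 8);      /* sequence number / IV      */ | |
 | ##             j0[12] = j0[13] = j0[14] = 0;   /* final 32-bit word is 0x1, */ | |
 | ##             j0[15] = 0x1;                   /*   stored big-endian       */ | |
 | ##     } | |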
69 | ## | |
70 | ## | |
71 | ## | |
72 | ## AAD: | |
73 | ## AAD padded to 128 bits with 0 | |
74 | ## for example, assume AAD is a u32 vector | |
75 | ## | |
76 | ## if AAD is 8 bytes: | |
77 | ## AAD[3] = {A0, A1}; | |
78 | ## padded AAD in xmm register = {A1 A0 0 0} | |
79 | ## | |
80 | ## 0 1 2 3 | |
81 | ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |
82 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
83 | ## | SPI (A1) | | |
84 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
85 | ## | 32-bit Sequence Number (A0) | | |
86 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
87 | ## | 0x0 | | |
88 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
89 | ## | |
90 | ## AAD Format with 32-bit Sequence Number | |
91 | ## | |
92 | ## if AAD is 12 bytes: | |
93 | ## AAD[3] = {A0, A1, A2}; | |
94 | ## padded AAD in xmm register = {A2 A1 A0 0} | |
95 | ## | |
96 | ## 0 1 2 3 | |
97 | ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |
98 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
99 | ## | SPI (A2) | | |
100 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
101 | ## | 64-bit Extended Sequence Number {A1,A0} | | |
102 | ## | | | |
103 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
104 | ## | 0x0 | | |
105 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
106 | ## | |
107 | ## AAD Format with 64-bit Extended Sequence Number | |
108 | ## | |
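 | ## Illustrative only: padding the AAD as described above, in C (the | |
 | ## byte-reflection into the xmm register is done by the code itself): | |
 | ## | |
 | ##     u8 padded_aad[16] = { 0 }; | |
 | ##     memcpy(padded_aad, aad, aad_len);       /* aad_len = 8, 12 (or 16) */ | |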
109 | ## | |
110 | ## aadLen: | |
111 | ## From the definition of the spec, aadLen can only be 8 or 12 bytes. | |
112 | ## The code additionally supports an aadLen of 16 bytes. | |
113 | ## | |
114 | ## TLen: | |
115 | ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes. | |
116 | ## | |
117 | ## poly = x^128 + x^127 + x^126 + x^121 + 1 | |
118 | ## Throughout the code, one-tab and two-tab indentation is used: one tab | |
119 | ## is for the GHASH part, two tabs are for the AES part. | |
120 | ## | |
121 | ||
122 | #include <linux/linkage.h> | |
123 | #include <asm/inst.h> | |
124 | ||
125 | .data | |
126 | .align 16 | |
127 | ||
128 | POLY: .octa 0xC2000000000000000000000000000001 | |
129 | POLY2: .octa 0xC20000000000000000000001C2000000 | |
130 | TWOONE: .octa 0x00000001000000000000000000000001 | |
131 | ||
132 | # order of these constants should not change. | |
133 | # more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F | |
134 | ||
135 | SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F | |
136 | SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 | |
137 | ALL_F: .octa 0xffffffffffffffffffffffffffffffff | |
138 | ZERO: .octa 0x00000000000000000000000000000000 | |
139 | ONE: .octa 0x00000000000000000000000000000001 | |
140 | ONEf: .octa 0x01000000000000000000000000000000 | |
141 | ||
142 | .text | |
143 | ||
144 | ||
145 | ##define the fields of the gcm aes context | |
146 | #{ | |
147 | # u8 expanded_keys[16*11] store expanded keys | |
148 | # u8 shifted_hkey_1[16] store HashKey <<1 mod poly here | |
149 | # u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here | |
150 | # u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here | |
151 | # u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here | |
152 | # u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here | |
153 | # u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here | |
154 | # u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here | |
155 | # u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here | |
156 | # u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes) | |
157 | # u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes) | |
158 | # u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes) | |
159 | # u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes) | |
160 | # u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes) | |
161 | # u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes) | |
162 | # u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes) | |
163 | # u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes) | |
164 | #} gcm_ctx; | |
165 | ||
166 | HashKey = 16*11 # store HashKey <<1 mod poly here | |
167 | HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here | |
168 | HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here | |
169 | HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here | |
170 | HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here | |
171 | HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here | |
172 | HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here | |
173 | HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here | |
174 | HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) | |
175 | HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) | |
176 | HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) | |
177 | HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) | |
178 | HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) | |
179 | HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) | |
180 | HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) | |
181 | HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) | |
182 | ||
183 | #define arg1 %rdi | |
184 | #define arg2 %rsi | |
185 | #define arg3 %rdx | |
186 | #define arg4 %rcx | |
187 | #define arg5 %r8 | |
188 | #define arg6 %r9 | |
189 | #define arg7 STACK_OFFSET+8*1(%r14) | |
190 | #define arg8 STACK_OFFSET+8*2(%r14) | |
191 | #define arg9 STACK_OFFSET+8*3(%r14) | |
192 | ||
193 | i = 0 | |
194 | j = 0 | |
195 | ||
196 | out_order = 0 | |
197 | in_order = 1 | |
198 | DEC = 0 | |
199 | ENC = 1 | |
200 | ||
201 | .macro define_reg r n | |
202 | reg_\r = %xmm\n | |
203 | .endm | |
204 | ||
205 | .macro setreg | |
206 | .altmacro | |
207 | define_reg i %i | |
208 | define_reg j %j | |
209 | .noaltmacro | |
210 | .endm | |
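 | # setreg relies on .altmacro so that %i and %j are passed to define_reg as | |
 | # their numeric values: after "i = 3" and "setreg", reg_i expands to %xmm3. | |
 | # It must be re-invoked every time i or j changes. | |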
211 | ||
212 | # 4 registers are pushed onto the stack, so the stack arguments arg7..arg9 live at STACK_OFFSET+8*n(%r14), above the saved registers and the return address | |
213 | STACK_OFFSET = 8*4 | |
214 | ||
215 | TMP1 = 16*0 # Temporary storage for AAD | |
216 | TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register) | |
217 | TMP3 = 16*2 # Temporary storage for AES State 3 | |
218 | TMP4 = 16*3 # Temporary storage for AES State 4 | |
219 | TMP5 = 16*4 # Temporary storage for AES State 5 | |
220 | TMP6 = 16*5 # Temporary storage for AES State 6 | |
221 | TMP7 = 16*6 # Temporary storage for AES State 7 | |
222 | TMP8 = 16*7 # Temporary storage for AES State 8 | |
223 | ||
224 | VARIABLE_OFFSET = 16*8 | |
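 | # Stack layout used below: %r14 snapshots %rsp right after the four pushes, | |
 | # so arg7..arg9 stay reachable at STACK_OFFSET+8*n(%r14), while %rsp itself | |
 | # is lowered by VARIABLE_OFFSET and 64-byte aligned to hold TMP1..TMP8. | |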
225 | ||
226 | ################################ | |
227 | # Utility Macros | |
228 | ################################ | |
229 | ||
230 | # Encryption of a single block | |
231 | .macro ENCRYPT_SINGLE_BLOCK XMM0 | |
232 | vpxor (arg1), \XMM0, \XMM0 | |
233 | i = 1 | |
234 | setreg | |
235 | .rep 9 | |
236 | vaesenc 16*i(arg1), \XMM0, \XMM0 | |
237 | i = (i+1) | |
238 | setreg | |
239 | .endr | |
240 | vaesenclast 16*10(arg1), \XMM0, \XMM0 | |
241 | .endm | |
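 | # Illustrative only (not part of the original source): the same single-block | |
 | # AES-128 flow written with C intrinsics, with the 11 round keys laid out as | |
 | # at 16*i(arg1) above: | |
 | # | |
 | #     #include <wmmintrin.h> | |
 | #     __m128i aes128_enc_block(const __m128i rk[11], __m128i b) | |
 | #     { | |
 | #             b = _mm_xor_si128(b, rk[0]); | |
 | #             for (int i = 1; i < 10; i++) | |
 | #                     b = _mm_aesenc_si128(b, rk[i]); | |
 | #             return _mm_aesenclast_si128(b, rk[10]); | |
 | #     } | |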
242 | ||
243 | #ifdef CONFIG_AS_AVX | |
244 | ############################################################################### | |
245 | # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) | |
246 | # Input: A and B (128-bits each, bit-reflected) | |
247 | # Output: C = A*B*x mod poly, (i.e. >>1 ) | |
248 | # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input | |
249 | # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. | |
250 | ############################################################################### | |
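 | # The multiplication below uses one level of Karatsuba: with A = a1*x^64 + a0 | |
 | # and B = b1*x^64 + b0 (64-bit halves, carry-less arithmetic), | |
 | # | |
 | #     A*B           = a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0 | |
 | #     a1*b0 + a0*b1 = (a1+a0)*(b1+b0) + a1*b1 + a0*b0 | |
 | # | |
 | # so only three vpclmulqdq multiplications are needed per block. | |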
251 | .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 | |
252 | ||
253 | vpshufd $0b01001110, \GH, \T2 | |
254 | vpshufd $0b01001110, \HK, \T3 | |
255 | vpxor \GH , \T2, \T2 # T2 = (a1+a0) | |
256 | vpxor \HK , \T3, \T3 # T3 = (b1+b0) | |
257 | ||
258 | vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 | |
259 | vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 | |
260 | vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) | |
261 | vpxor \GH, \T2,\T2 | |
262 | vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 | |
263 | ||
264 | vpslldq $8, \T2,\T3 # shift-L T3 2 DWs | |
265 | vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs | |
266 | vpxor \T3, \GH, \GH | |
267 | vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK | |
268 | ||
269 | #first phase of the reduction | |
270 | vpslld $31, \GH, \T2 # packed left shift << 31 | |
271 | vpslld $30, \GH, \T3 # packed left shift << 30 | |
272 | vpslld $25, \GH, \T4 # packed left shift << 25 | |
273 | ||
274 | vpxor \T3, \T2, \T2 # xor the shifted versions | |
275 | vpxor \T4, \T2, \T2 | |
276 | ||
277 | vpsrldq $4, \T2, \T5 # shift-R T5 1 DW | |
278 | ||
279 | vpslldq $12, \T2, \T2 # shift-L T2 3 DWs | |
280 | vpxor \T2, \GH, \GH # first phase of the reduction complete | |
281 | ||
282 | #second phase of the reduction | |
283 | ||
284 | vpsrld $1,\GH, \T2 # packed right shift >> 1 | |
285 | vpsrld $2,\GH, \T3 # packed right shift >> 2 | |
286 | vpsrld $7,\GH, \T4 # packed right shift >> 7 | |
287 | vpxor \T3, \T2, \T2 # xor the shifted versions | |
288 | vpxor \T4, \T2, \T2 | |
289 | ||
290 | vpxor \T5, \T2, \T2 | |
291 | vpxor \T2, \GH, \GH | |
292 | vpxor \T1, \GH, \GH # the result is in GH | |
293 | ||
294 | ||
295 | .endm | |
296 | ||
297 | .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 | |
298 | ||
299 | # HashKey_i_k holds XORed values of the low and high parts of HashKey^i | |
300 | vmovdqa \HK, \T5 | |
301 | ||
302 | vpshufd $0b01001110, \T5, \T1 | |
303 | vpxor \T5, \T1, \T1 | |
304 | vmovdqa \T1, HashKey_k(arg1) | |
305 | ||
306 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly | |
307 | vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly | |
308 | vpshufd $0b01001110, \T5, \T1 | |
309 | vpxor \T5, \T1, \T1 | |
310 | vmovdqa \T1, HashKey_2_k(arg1) | |
311 | ||
312 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly | |
313 | vmovdqa \T5, HashKey_3(arg1) | |
314 | vpshufd $0b01001110, \T5, \T1 | |
315 | vpxor \T5, \T1, \T1 | |
316 | vmovdqa \T1, HashKey_3_k(arg1) | |
317 | ||
318 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly | |
319 | vmovdqa \T5, HashKey_4(arg1) | |
320 | vpshufd $0b01001110, \T5, \T1 | |
321 | vpxor \T5, \T1, \T1 | |
322 | vmovdqa \T1, HashKey_4_k(arg1) | |
323 | ||
324 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly | |
325 | vmovdqa \T5, HashKey_5(arg1) | |
326 | vpshufd $0b01001110, \T5, \T1 | |
327 | vpxor \T5, \T1, \T1 | |
328 | vmovdqa \T1, HashKey_5_k(arg1) | |
329 | ||
330 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly | |
331 | vmovdqa \T5, HashKey_6(arg1) | |
332 | vpshufd $0b01001110, \T5, \T1 | |
333 | vpxor \T5, \T1, \T1 | |
334 | vmovdqa \T1, HashKey_6_k(arg1) | |
335 | ||
336 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly | |
337 | vmovdqa \T5, HashKey_7(arg1) | |
338 | vpshufd $0b01001110, \T5, \T1 | |
339 | vpxor \T5, \T1, \T1 | |
340 | vmovdqa \T1, HashKey_7_k(arg1) | |
341 | ||
342 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly | |
343 | vmovdqa \T5, HashKey_8(arg1) | |
344 | vpshufd $0b01001110, \T5, \T1 | |
345 | vpxor \T5, \T1, \T1 | |
346 | vmovdqa \T1, HashKey_8_k(arg1) | |
347 | ||
348 | .endm | |
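 | # Each HashKey_i_k value stored above is hi64(HashKey^i) XOR lo64(HashKey^i), | |
 | # i.e. the precomputed (b1+b0) operand for the Karatsuba middle product. | |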
349 | ||
350 | ## if a = number of total plaintext bytes | |
351 | ## b = floor(a/16) | |
352 | ## num_initial_blocks = b mod 8; | |
353 | ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext | |
354 | ## r10, r11, r12, rax are clobbered | |
355 | ## arg1, arg2, arg3, r14 are used as a pointer only, not modified | |
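 | ## e.g. for a = 200 bytes: b = 12 full blocks, num_initial_blocks = 12 mod 8 = 4; | |
 | ## the remaining 8 full blocks are handled by GHASH_8_ENCRYPT_8_PARALLEL_AVX and | |
 | ## the final 8 bytes by the partial-block path in GCM_ENC_DEC_AVX. | |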
356 | ||
357 | .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC | |
358 | i = (8-\num_initial_blocks) | |
359 | setreg | |
360 | ||
361 | mov arg6, %r10 # r10 = AAD | |
362 | mov arg7, %r12 # r12 = aadLen | |
363 | ||
364 | ||
365 | mov %r12, %r11 | |
366 | ||
367 | vpxor reg_i, reg_i, reg_i | |
368 | _get_AAD_loop\@: | |
369 | vmovd (%r10), \T1 | |
370 | vpslldq $12, \T1, \T1 | |
371 | vpsrldq $4, reg_i, reg_i | |
372 | vpxor \T1, reg_i, reg_i | |
373 | ||
374 | add $4, %r10 | |
375 | sub $4, %r12 | |
376 | jg _get_AAD_loop\@ | |
377 | ||
378 | ||
379 | cmp $16, %r11 | |
380 | je _get_AAD_loop2_done\@ | |
381 | mov $16, %r12 | |
382 | ||
383 | _get_AAD_loop2\@: | |
384 | vpsrldq $4, reg_i, reg_i | |
385 | sub $4, %r12 | |
386 | cmp %r11, %r12 | |
387 | jg _get_AAD_loop2\@ | |
388 | ||
389 | _get_AAD_loop2_done\@: | |
390 | ||
391 | #byte-reflect the AAD data | |
392 | vpshufb SHUF_MASK(%rip), reg_i, reg_i | |
393 | ||
394 | # initialize the data pointer offset as zero | |
395 | xor %r11, %r11 | |
396 | ||
397 | # start AES for num_initial_blocks blocks | |
398 | mov arg5, %rax # rax = *Y0 | |
399 | vmovdqu (%rax), \CTR # CTR = Y0 | |
400 | vpshufb SHUF_MASK(%rip), \CTR, \CTR | |
401 | ||
402 | ||
403 | i = (9-\num_initial_blocks) | |
404 | setreg | |
405 | .rep \num_initial_blocks | |
406 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | |
407 | vmovdqa \CTR, reg_i | |
408 | vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap | |
409 | i = (i+1) | |
410 | setreg | |
411 | .endr | |
412 | ||
413 | vmovdqa (arg1), \T_key | |
414 | i = (9-\num_initial_blocks) | |
415 | setreg | |
416 | .rep \num_initial_blocks | |
417 | vpxor \T_key, reg_i, reg_i | |
418 | i = (i+1) | |
419 | setreg | |
420 | .endr | |
421 | ||
422 | j = 1 | |
423 | setreg | |
424 | .rep 9 | |
425 | vmovdqa 16*j(arg1), \T_key | |
426 | i = (9-\num_initial_blocks) | |
427 | setreg | |
428 | .rep \num_initial_blocks | |
429 | vaesenc \T_key, reg_i, reg_i | |
430 | i = (i+1) | |
431 | setreg | |
432 | .endr | |
433 | ||
434 | j = (j+1) | |
435 | setreg | |
436 | .endr | |
437 | ||
438 | ||
439 | vmovdqa 16*10(arg1), \T_key | |
440 | i = (9-\num_initial_blocks) | |
441 | setreg | |
442 | .rep \num_initial_blocks | |
443 | vaesenclast \T_key, reg_i, reg_i | |
444 | i = (i+1) | |
445 | setreg | |
446 | .endr | |
447 | ||
448 | i = (9-\num_initial_blocks) | |
449 | setreg | |
450 | .rep \num_initial_blocks | |
451 | vmovdqu (arg3, %r11), \T1 | |
452 | vpxor \T1, reg_i, reg_i | |
453 | vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks | |
454 | add $16, %r11 | |
455 | .if \ENC_DEC == DEC | |
456 | vmovdqa \T1, reg_i | |
457 | .endif | |
458 | vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations | |
459 | i = (i+1) | |
460 | setreg | |
461 | .endr | |
462 | ||
463 | ||
464 | i = (8-\num_initial_blocks) | |
465 | j = (9-\num_initial_blocks) | |
466 | setreg | |
467 | GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 | |
468 | ||
469 | .rep \num_initial_blocks | |
470 | vpxor reg_i, reg_j, reg_j | |
471 | GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks | |
472 | i = (i+1) | |
473 | j = (j+1) | |
474 | setreg | |
475 | .endr | |
476 | # XMM8 has the combined result here | |
477 | ||
478 | vmovdqa \XMM8, TMP1(%rsp) | |
479 | vmovdqa \XMM8, \T3 | |
480 | ||
481 | cmp $128, %r13 | |
482 | jl _initial_blocks_done\@ # no need for precomputed constants | |
483 | ||
484 | ############################################################################### | |
485 | # encrypt 8 more counter blocks and use them to process the next 128 bytes of data | |
486 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | |
487 | vmovdqa \CTR, \XMM1 | |
488 | vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap | |
489 | ||
490 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | |
491 | vmovdqa \CTR, \XMM2 | |
492 | vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap | |
493 | ||
494 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | |
495 | vmovdqa \CTR, \XMM3 | |
496 | vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap | |
497 | ||
498 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | |
499 | vmovdqa \CTR, \XMM4 | |
500 | vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap | |
501 | ||
502 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | |
503 | vmovdqa \CTR, \XMM5 | |
504 | vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap | |
505 | ||
506 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | |
507 | vmovdqa \CTR, \XMM6 | |
508 | vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap | |
509 | ||
510 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | |
511 | vmovdqa \CTR, \XMM7 | |
512 | vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap | |
513 | ||
514 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | |
515 | vmovdqa \CTR, \XMM8 | |
516 | vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap | |
517 | ||
518 | vmovdqa (arg1), \T_key | |
519 | vpxor \T_key, \XMM1, \XMM1 | |
520 | vpxor \T_key, \XMM2, \XMM2 | |
521 | vpxor \T_key, \XMM3, \XMM3 | |
522 | vpxor \T_key, \XMM4, \XMM4 | |
523 | vpxor \T_key, \XMM5, \XMM5 | |
524 | vpxor \T_key, \XMM6, \XMM6 | |
525 | vpxor \T_key, \XMM7, \XMM7 | |
526 | vpxor \T_key, \XMM8, \XMM8 | |
527 | ||
528 | i = 1 | |
529 | setreg | |
530 | .rep 9 # do 9 rounds | |
531 | vmovdqa 16*i(arg1), \T_key | |
532 | vaesenc \T_key, \XMM1, \XMM1 | |
533 | vaesenc \T_key, \XMM2, \XMM2 | |
534 | vaesenc \T_key, \XMM3, \XMM3 | |
535 | vaesenc \T_key, \XMM4, \XMM4 | |
536 | vaesenc \T_key, \XMM5, \XMM5 | |
537 | vaesenc \T_key, \XMM6, \XMM6 | |
538 | vaesenc \T_key, \XMM7, \XMM7 | |
539 | vaesenc \T_key, \XMM8, \XMM8 | |
540 | i = (i+1) | |
541 | setreg | |
542 | .endr | |
543 | ||
544 | ||
545 | vmovdqa 16*i(arg1), \T_key | |
546 | vaesenclast \T_key, \XMM1, \XMM1 | |
547 | vaesenclast \T_key, \XMM2, \XMM2 | |
548 | vaesenclast \T_key, \XMM3, \XMM3 | |
549 | vaesenclast \T_key, \XMM4, \XMM4 | |
550 | vaesenclast \T_key, \XMM5, \XMM5 | |
551 | vaesenclast \T_key, \XMM6, \XMM6 | |
552 | vaesenclast \T_key, \XMM7, \XMM7 | |
553 | vaesenclast \T_key, \XMM8, \XMM8 | |
554 | ||
555 | vmovdqu (arg3, %r11), \T1 | |
556 | vpxor \T1, \XMM1, \XMM1 | |
557 | vmovdqu \XMM1, (arg2 , %r11) | |
558 | .if \ENC_DEC == DEC | |
559 | vmovdqa \T1, \XMM1 | |
560 | .endif | |
561 | ||
562 | vmovdqu 16*1(arg3, %r11), \T1 | |
563 | vpxor \T1, \XMM2, \XMM2 | |
564 | vmovdqu \XMM2, 16*1(arg2 , %r11) | |
565 | .if \ENC_DEC == DEC | |
566 | vmovdqa \T1, \XMM2 | |
567 | .endif | |
568 | ||
569 | vmovdqu 16*2(arg3, %r11), \T1 | |
570 | vpxor \T1, \XMM3, \XMM3 | |
571 | vmovdqu \XMM3, 16*2(arg2 , %r11) | |
572 | .if \ENC_DEC == DEC | |
573 | vmovdqa \T1, \XMM3 | |
574 | .endif | |
575 | ||
576 | vmovdqu 16*3(arg3, %r11), \T1 | |
577 | vpxor \T1, \XMM4, \XMM4 | |
578 | vmovdqu \XMM4, 16*3(arg2 , %r11) | |
579 | .if \ENC_DEC == DEC | |
580 | vmovdqa \T1, \XMM4 | |
581 | .endif | |
582 | ||
583 | vmovdqu 16*4(arg3, %r11), \T1 | |
584 | vpxor \T1, \XMM5, \XMM5 | |
585 | vmovdqu \XMM5, 16*4(arg2 , %r11) | |
586 | .if \ENC_DEC == DEC | |
587 | vmovdqa \T1, \XMM5 | |
588 | .endif | |
589 | ||
590 | vmovdqu 16*5(arg3, %r11), \T1 | |
591 | vpxor \T1, \XMM6, \XMM6 | |
592 | vmovdqu \XMM6, 16*5(arg2 , %r11) | |
593 | .if \ENC_DEC == DEC | |
594 | vmovdqa \T1, \XMM6 | |
595 | .endif | |
596 | ||
597 | vmovdqu 16*6(arg3, %r11), \T1 | |
598 | vpxor \T1, \XMM7, \XMM7 | |
599 | vmovdqu \XMM7, 16*6(arg2 , %r11) | |
600 | .if \ENC_DEC == DEC | |
601 | vmovdqa \T1, \XMM7 | |
602 | .endif | |
603 | ||
604 | vmovdqu 16*7(arg3, %r11), \T1 | |
605 | vpxor \T1, \XMM8, \XMM8 | |
606 | vmovdqu \XMM8, 16*7(arg2 , %r11) | |
607 | .if \ENC_DEC == DEC | |
608 | vmovdqa \T1, \XMM8 | |
609 | .endif | |
610 | ||
611 | add $128, %r11 | |
612 | ||
613 | vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap | |
614 | vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext | |
615 | vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap | |
616 | vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap | |
617 | vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap | |
618 | vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap | |
619 | vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap | |
620 | vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap | |
621 | vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap | |
622 | ||
623 | ############################################################################### | |
624 | ||
625 | _initial_blocks_done\@: | |
626 | ||
627 | .endm | |
628 | ||
629 | # encrypt 8 blocks at a time | |
630 | # ghash the 8 previously encrypted ciphertext blocks | |
631 | # arg1, arg2, arg3 are used as pointers only, not modified | |
632 | # r11 is the data offset value | |
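 | # The AES rounds for the 8 new counter blocks (two-tab indentation) are | |
 | # interleaved with the vpclmulqdq/GHASH work on the 8 ciphertext blocks saved | |
 | # at the top of the macro in \T2 and TMP2..TMP8 (one-tab indentation), so the | |
 | # AES and carry-less multiply units are kept busy in parallel. | |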
633 | .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC | |
634 | ||
635 | vmovdqa \XMM1, \T2 | |
636 | vmovdqa \XMM2, TMP2(%rsp) | |
637 | vmovdqa \XMM3, TMP3(%rsp) | |
638 | vmovdqa \XMM4, TMP4(%rsp) | |
639 | vmovdqa \XMM5, TMP5(%rsp) | |
640 | vmovdqa \XMM6, TMP6(%rsp) | |
641 | vmovdqa \XMM7, TMP7(%rsp) | |
642 | vmovdqa \XMM8, TMP8(%rsp) | |
643 | ||
644 | .if \loop_idx == in_order | |
645 | vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT | |
646 | vpaddd ONE(%rip), \XMM1, \XMM2 | |
647 | vpaddd ONE(%rip), \XMM2, \XMM3 | |
648 | vpaddd ONE(%rip), \XMM3, \XMM4 | |
649 | vpaddd ONE(%rip), \XMM4, \XMM5 | |
650 | vpaddd ONE(%rip), \XMM5, \XMM6 | |
651 | vpaddd ONE(%rip), \XMM6, \XMM7 | |
652 | vpaddd ONE(%rip), \XMM7, \XMM8 | |
653 | vmovdqa \XMM8, \CTR | |
654 | ||
655 | vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap | |
656 | vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap | |
657 | vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap | |
658 | vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap | |
659 | vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap | |
660 | vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap | |
661 | vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap | |
662 | vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap | |
663 | .else | |
664 | vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT | |
665 | vpaddd ONEf(%rip), \XMM1, \XMM2 | |
666 | vpaddd ONEf(%rip), \XMM2, \XMM3 | |
667 | vpaddd ONEf(%rip), \XMM3, \XMM4 | |
668 | vpaddd ONEf(%rip), \XMM4, \XMM5 | |
669 | vpaddd ONEf(%rip), \XMM5, \XMM6 | |
670 | vpaddd ONEf(%rip), \XMM6, \XMM7 | |
671 | vpaddd ONEf(%rip), \XMM7, \XMM8 | |
672 | vmovdqa \XMM8, \CTR | |
673 | .endif | |
674 | ||
675 | ||
676 | ####################################################################### | |
677 | ||
678 | vmovdqu (arg1), \T1 | |
679 | vpxor \T1, \XMM1, \XMM1 | |
680 | vpxor \T1, \XMM2, \XMM2 | |
681 | vpxor \T1, \XMM3, \XMM3 | |
682 | vpxor \T1, \XMM4, \XMM4 | |
683 | vpxor \T1, \XMM5, \XMM5 | |
684 | vpxor \T1, \XMM6, \XMM6 | |
685 | vpxor \T1, \XMM7, \XMM7 | |
686 | vpxor \T1, \XMM8, \XMM8 | |
687 | ||
688 | ####################################################################### | |
689 | ||
690 | ||
691 | ||
692 | ||
693 | ||
694 | vmovdqu 16*1(arg1), \T1 | |
695 | vaesenc \T1, \XMM1, \XMM1 | |
696 | vaesenc \T1, \XMM2, \XMM2 | |
697 | vaesenc \T1, \XMM3, \XMM3 | |
698 | vaesenc \T1, \XMM4, \XMM4 | |
699 | vaesenc \T1, \XMM5, \XMM5 | |
700 | vaesenc \T1, \XMM6, \XMM6 | |
701 | vaesenc \T1, \XMM7, \XMM7 | |
702 | vaesenc \T1, \XMM8, \XMM8 | |
703 | ||
704 | vmovdqu 16*2(arg1), \T1 | |
705 | vaesenc \T1, \XMM1, \XMM1 | |
706 | vaesenc \T1, \XMM2, \XMM2 | |
707 | vaesenc \T1, \XMM3, \XMM3 | |
708 | vaesenc \T1, \XMM4, \XMM4 | |
709 | vaesenc \T1, \XMM5, \XMM5 | |
710 | vaesenc \T1, \XMM6, \XMM6 | |
711 | vaesenc \T1, \XMM7, \XMM7 | |
712 | vaesenc \T1, \XMM8, \XMM8 | |
713 | ||
714 | ||
715 | ####################################################################### | |
716 | ||
717 | vmovdqa HashKey_8(arg1), \T5 | |
718 | vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 | |
719 | vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 | |
720 | ||
721 | vpshufd $0b01001110, \T2, \T6 | |
722 | vpxor \T2, \T6, \T6 | |
723 | ||
724 | vmovdqa HashKey_8_k(arg1), \T5 | |
725 | vpclmulqdq $0x00, \T5, \T6, \T6 | |
726 | ||
727 | vmovdqu 16*3(arg1), \T1 | |
728 | vaesenc \T1, \XMM1, \XMM1 | |
729 | vaesenc \T1, \XMM2, \XMM2 | |
730 | vaesenc \T1, \XMM3, \XMM3 | |
731 | vaesenc \T1, \XMM4, \XMM4 | |
732 | vaesenc \T1, \XMM5, \XMM5 | |
733 | vaesenc \T1, \XMM6, \XMM6 | |
734 | vaesenc \T1, \XMM7, \XMM7 | |
735 | vaesenc \T1, \XMM8, \XMM8 | |
736 | ||
737 | vmovdqa TMP2(%rsp), \T1 | |
738 | vmovdqa HashKey_7(arg1), \T5 | |
739 | vpclmulqdq $0x11, \T5, \T1, \T3 | |
740 | vpxor \T3, \T4, \T4 | |
741 | vpclmulqdq $0x00, \T5, \T1, \T3 | |
742 | vpxor \T3, \T7, \T7 | |
743 | ||
744 | vpshufd $0b01001110, \T1, \T3 | |
745 | vpxor \T1, \T3, \T3 | |
746 | vmovdqa HashKey_7_k(arg1), \T5 | |
747 | vpclmulqdq $0x10, \T5, \T3, \T3 | |
748 | vpxor \T3, \T6, \T6 | |
749 | ||
750 | vmovdqu 16*4(arg1), \T1 | |
751 | vaesenc \T1, \XMM1, \XMM1 | |
752 | vaesenc \T1, \XMM2, \XMM2 | |
753 | vaesenc \T1, \XMM3, \XMM3 | |
754 | vaesenc \T1, \XMM4, \XMM4 | |
755 | vaesenc \T1, \XMM5, \XMM5 | |
756 | vaesenc \T1, \XMM6, \XMM6 | |
757 | vaesenc \T1, \XMM7, \XMM7 | |
758 | vaesenc \T1, \XMM8, \XMM8 | |
759 | ||
760 | ####################################################################### | |
761 | ||
762 | vmovdqa TMP3(%rsp), \T1 | |
763 | vmovdqa HashKey_6(arg1), \T5 | |
764 | vpclmulqdq $0x11, \T5, \T1, \T3 | |
765 | vpxor \T3, \T4, \T4 | |
766 | vpclmulqdq $0x00, \T5, \T1, \T3 | |
767 | vpxor \T3, \T7, \T7 | |
768 | ||
769 | vpshufd $0b01001110, \T1, \T3 | |
770 | vpxor \T1, \T3, \T3 | |
771 | vmovdqa HashKey_6_k(arg1), \T5 | |
772 | vpclmulqdq $0x10, \T5, \T3, \T3 | |
773 | vpxor \T3, \T6, \T6 | |
774 | ||
775 | vmovdqu 16*5(arg1), \T1 | |
776 | vaesenc \T1, \XMM1, \XMM1 | |
777 | vaesenc \T1, \XMM2, \XMM2 | |
778 | vaesenc \T1, \XMM3, \XMM3 | |
779 | vaesenc \T1, \XMM4, \XMM4 | |
780 | vaesenc \T1, \XMM5, \XMM5 | |
781 | vaesenc \T1, \XMM6, \XMM6 | |
782 | vaesenc \T1, \XMM7, \XMM7 | |
783 | vaesenc \T1, \XMM8, \XMM8 | |
784 | ||
785 | vmovdqa TMP4(%rsp), \T1 | |
786 | vmovdqa HashKey_5(arg1), \T5 | |
787 | vpclmulqdq $0x11, \T5, \T1, \T3 | |
788 | vpxor \T3, \T4, \T4 | |
789 | vpclmulqdq $0x00, \T5, \T1, \T3 | |
790 | vpxor \T3, \T7, \T7 | |
791 | ||
792 | vpshufd $0b01001110, \T1, \T3 | |
793 | vpxor \T1, \T3, \T3 | |
794 | vmovdqa HashKey_5_k(arg1), \T5 | |
795 | vpclmulqdq $0x10, \T5, \T3, \T3 | |
796 | vpxor \T3, \T6, \T6 | |
797 | ||
798 | vmovdqu 16*6(arg1), \T1 | |
799 | vaesenc \T1, \XMM1, \XMM1 | |
800 | vaesenc \T1, \XMM2, \XMM2 | |
801 | vaesenc \T1, \XMM3, \XMM3 | |
802 | vaesenc \T1, \XMM4, \XMM4 | |
803 | vaesenc \T1, \XMM5, \XMM5 | |
804 | vaesenc \T1, \XMM6, \XMM6 | |
805 | vaesenc \T1, \XMM7, \XMM7 | |
806 | vaesenc \T1, \XMM8, \XMM8 | |
807 | ||
808 | ||
809 | vmovdqa TMP5(%rsp), \T1 | |
810 | vmovdqa HashKey_4(arg1), \T5 | |
811 | vpclmulqdq $0x11, \T5, \T1, \T3 | |
812 | vpxor \T3, \T4, \T4 | |
813 | vpclmulqdq $0x00, \T5, \T1, \T3 | |
814 | vpxor \T3, \T7, \T7 | |
815 | ||
816 | vpshufd $0b01001110, \T1, \T3 | |
817 | vpxor \T1, \T3, \T3 | |
818 | vmovdqa HashKey_4_k(arg1), \T5 | |
819 | vpclmulqdq $0x10, \T5, \T3, \T3 | |
820 | vpxor \T3, \T6, \T6 | |
821 | ||
822 | vmovdqu 16*7(arg1), \T1 | |
823 | vaesenc \T1, \XMM1, \XMM1 | |
824 | vaesenc \T1, \XMM2, \XMM2 | |
825 | vaesenc \T1, \XMM3, \XMM3 | |
826 | vaesenc \T1, \XMM4, \XMM4 | |
827 | vaesenc \T1, \XMM5, \XMM5 | |
828 | vaesenc \T1, \XMM6, \XMM6 | |
829 | vaesenc \T1, \XMM7, \XMM7 | |
830 | vaesenc \T1, \XMM8, \XMM8 | |
831 | ||
832 | vmovdqa TMP6(%rsp), \T1 | |
833 | vmovdqa HashKey_3(arg1), \T5 | |
834 | vpclmulqdq $0x11, \T5, \T1, \T3 | |
835 | vpxor \T3, \T4, \T4 | |
836 | vpclmulqdq $0x00, \T5, \T1, \T3 | |
837 | vpxor \T3, \T7, \T7 | |
838 | ||
839 | vpshufd $0b01001110, \T1, \T3 | |
840 | vpxor \T1, \T3, \T3 | |
841 | vmovdqa HashKey_3_k(arg1), \T5 | |
842 | vpclmulqdq $0x10, \T5, \T3, \T3 | |
843 | vpxor \T3, \T6, \T6 | |
844 | ||
845 | ||
846 | vmovdqu 16*8(arg1), \T1 | |
847 | vaesenc \T1, \XMM1, \XMM1 | |
848 | vaesenc \T1, \XMM2, \XMM2 | |
849 | vaesenc \T1, \XMM3, \XMM3 | |
850 | vaesenc \T1, \XMM4, \XMM4 | |
851 | vaesenc \T1, \XMM5, \XMM5 | |
852 | vaesenc \T1, \XMM6, \XMM6 | |
853 | vaesenc \T1, \XMM7, \XMM7 | |
854 | vaesenc \T1, \XMM8, \XMM8 | |
855 | ||
856 | vmovdqa TMP7(%rsp), \T1 | |
857 | vmovdqa HashKey_2(arg1), \T5 | |
858 | vpclmulqdq $0x11, \T5, \T1, \T3 | |
859 | vpxor \T3, \T4, \T4 | |
860 | vpclmulqdq $0x00, \T5, \T1, \T3 | |
861 | vpxor \T3, \T7, \T7 | |
862 | ||
863 | vpshufd $0b01001110, \T1, \T3 | |
864 | vpxor \T1, \T3, \T3 | |
865 | vmovdqa HashKey_2_k(arg1), \T5 | |
866 | vpclmulqdq $0x10, \T5, \T3, \T3 | |
867 | vpxor \T3, \T6, \T6 | |
868 | ||
869 | ####################################################################### | |
870 | ||
871 | vmovdqu 16*9(arg1), \T5 | |
872 | vaesenc \T5, \XMM1, \XMM1 | |
873 | vaesenc \T5, \XMM2, \XMM2 | |
874 | vaesenc \T5, \XMM3, \XMM3 | |
875 | vaesenc \T5, \XMM4, \XMM4 | |
876 | vaesenc \T5, \XMM5, \XMM5 | |
877 | vaesenc \T5, \XMM6, \XMM6 | |
878 | vaesenc \T5, \XMM7, \XMM7 | |
879 | vaesenc \T5, \XMM8, \XMM8 | |
880 | ||
881 | vmovdqa TMP8(%rsp), \T1 | |
882 | vmovdqa HashKey(arg1), \T5 | |
883 | vpclmulqdq $0x11, \T5, \T1, \T3 | |
884 | vpxor \T3, \T4, \T4 | |
885 | vpclmulqdq $0x00, \T5, \T1, \T3 | |
886 | vpxor \T3, \T7, \T7 | |
887 | ||
888 | vpshufd $0b01001110, \T1, \T3 | |
889 | vpxor \T1, \T3, \T3 | |
890 | vmovdqa HashKey_k(arg1), \T5 | |
891 | vpclmulqdq $0x10, \T5, \T3, \T3 | |
892 | vpxor \T3, \T6, \T6 | |
893 | ||
894 | vpxor \T4, \T6, \T6 | |
895 | vpxor \T7, \T6, \T6 | |
896 | ||
897 | vmovdqu 16*10(arg1), \T5 | |
898 | ||
899 | i = 0 | |
900 | j = 1 | |
901 | setreg | |
902 | .rep 8 | |
903 | vpxor 16*i(arg3, %r11), \T5, \T2 | |
904 | .if \ENC_DEC == ENC | |
905 | vaesenclast \T2, reg_j, reg_j | |
906 | .else | |
907 | vaesenclast \T2, reg_j, \T3 | |
908 | vmovdqu 16*i(arg3, %r11), reg_j | |
909 | vmovdqu \T3, 16*i(arg2, %r11) | |
910 | .endif | |
911 | i = (i+1) | |
912 | j = (j+1) | |
913 | setreg | |
914 | .endr | |
915 | ####################################################################### | |
916 | ||
917 | ||
918 | vpslldq $8, \T6, \T3 # shift-L T3 2 DWs | |
919 | vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs | |
920 | vpxor \T3, \T7, \T7 | |
921 | vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 | |
922 | ||
923 | ||
924 | ||
925 | ####################################################################### | |
926 | #first phase of the reduction | |
927 | ####################################################################### | |
928 | vpslld $31, \T7, \T2 # packed left shift << 31 | |
929 | vpslld $30, \T7, \T3 # packed left shift << 30 | |
930 | vpslld $25, \T7, \T4 # packed left shift << 25 | |
931 | ||
932 | vpxor \T3, \T2, \T2 # xor the shifted versions | |
933 | vpxor \T4, \T2, \T2 | |
934 | ||
935 | vpsrldq $4, \T2, \T1 # shift-R T1 1 DW | |
936 | ||
937 | vpslldq $12, \T2, \T2 # shift-L T2 3 DWs | |
938 | vpxor \T2, \T7, \T7 # first phase of the reduction complete | |
939 | ####################################################################### | |
940 | .if \ENC_DEC == ENC | |
941 | vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer | |
942 | vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer | |
943 | vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer | |
944 | vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer | |
945 | vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer | |
946 | vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer | |
947 | vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer | |
948 | vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer | |
949 | .endif | |
950 | ||
951 | ####################################################################### | |
952 | #second phase of the reduction | |
953 | vpsrld $1, \T7, \T2 # packed right shift >> 1 | |
954 | vpsrld $2, \T7, \T3 # packed right shift >> 2 | |
955 | vpsrld $7, \T7, \T4 # packed right shift >> 7 | |
956 | vpxor \T3, \T2, \T2 # xor the shifted versions | |
957 | vpxor \T4, \T2, \T2 | |
958 | ||
959 | vpxor \T1, \T2, \T2 | |
960 | vpxor \T2, \T7, \T7 | |
961 | vpxor \T7, \T6, \T6 # the result is in T6 | |
962 | ####################################################################### | |
963 | ||
964 | vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap | |
965 | vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap | |
966 | vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap | |
967 | vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap | |
968 | vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap | |
969 | vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap | |
970 | vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap | |
971 | vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap | |
972 | ||
973 | ||
974 | vpxor \T6, \XMM1, \XMM1 | |
975 | ||
976 | ||
977 | ||
978 | .endm | |
979 | ||
980 | ||
981 | # GHASH the last 8 ciphertext blocks. | |
982 | .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 | |
983 | ||
984 | ## Karatsuba Method | |
985 | ||
986 | ||
987 | vpshufd $0b01001110, \XMM1, \T2 | |
988 | vpxor \XMM1, \T2, \T2 | |
989 | vmovdqa HashKey_8(arg1), \T5 | |
990 | vpclmulqdq $0x11, \T5, \XMM1, \T6 | |
991 | vpclmulqdq $0x00, \T5, \XMM1, \T7 | |
992 | ||
993 | vmovdqa HashKey_8_k(arg1), \T3 | |
994 | vpclmulqdq $0x00, \T3, \T2, \XMM1 | |
995 | ||
996 | ###################### | |
997 | ||
998 | vpshufd $0b01001110, \XMM2, \T2 | |
999 | vpxor \XMM2, \T2, \T2 | |
1000 | vmovdqa HashKey_7(arg1), \T5 | |
1001 | vpclmulqdq $0x11, \T5, \XMM2, \T4 | |
1002 | vpxor \T4, \T6, \T6 | |
1003 | ||
1004 | vpclmulqdq $0x00, \T5, \XMM2, \T4 | |
1005 | vpxor \T4, \T7, \T7 | |
1006 | ||
1007 | vmovdqa HashKey_7_k(arg1), \T3 | |
1008 | vpclmulqdq $0x00, \T3, \T2, \T2 | |
1009 | vpxor \T2, \XMM1, \XMM1 | |
1010 | ||
1011 | ###################### | |
1012 | ||
1013 | vpshufd $0b01001110, \XMM3, \T2 | |
1014 | vpxor \XMM3, \T2, \T2 | |
1015 | vmovdqa HashKey_6(arg1), \T5 | |
1016 | vpclmulqdq $0x11, \T5, \XMM3, \T4 | |
1017 | vpxor \T4, \T6, \T6 | |
1018 | ||
1019 | vpclmulqdq $0x00, \T5, \XMM3, \T4 | |
1020 | vpxor \T4, \T7, \T7 | |
1021 | ||
1022 | vmovdqa HashKey_6_k(arg1), \T3 | |
1023 | vpclmulqdq $0x00, \T3, \T2, \T2 | |
1024 | vpxor \T2, \XMM1, \XMM1 | |
1025 | ||
1026 | ###################### | |
1027 | ||
1028 | vpshufd $0b01001110, \XMM4, \T2 | |
1029 | vpxor \XMM4, \T2, \T2 | |
1030 | vmovdqa HashKey_5(arg1), \T5 | |
1031 | vpclmulqdq $0x11, \T5, \XMM4, \T4 | |
1032 | vpxor \T4, \T6, \T6 | |
1033 | ||
1034 | vpclmulqdq $0x00, \T5, \XMM4, \T4 | |
1035 | vpxor \T4, \T7, \T7 | |
1036 | ||
1037 | vmovdqa HashKey_5_k(arg1), \T3 | |
1038 | vpclmulqdq $0x00, \T3, \T2, \T2 | |
1039 | vpxor \T2, \XMM1, \XMM1 | |
1040 | ||
1041 | ###################### | |
1042 | ||
1043 | vpshufd $0b01001110, \XMM5, \T2 | |
1044 | vpxor \XMM5, \T2, \T2 | |
1045 | vmovdqa HashKey_4(arg1), \T5 | |
1046 | vpclmulqdq $0x11, \T5, \XMM5, \T4 | |
1047 | vpxor \T4, \T6, \T6 | |
1048 | ||
1049 | vpclmulqdq $0x00, \T5, \XMM5, \T4 | |
1050 | vpxor \T4, \T7, \T7 | |
1051 | ||
1052 | vmovdqa HashKey_4_k(arg1), \T3 | |
1053 | vpclmulqdq $0x00, \T3, \T2, \T2 | |
1054 | vpxor \T2, \XMM1, \XMM1 | |
1055 | ||
1056 | ###################### | |
1057 | ||
1058 | vpshufd $0b01001110, \XMM6, \T2 | |
1059 | vpxor \XMM6, \T2, \T2 | |
1060 | vmovdqa HashKey_3(arg1), \T5 | |
1061 | vpclmulqdq $0x11, \T5, \XMM6, \T4 | |
1062 | vpxor \T4, \T6, \T6 | |
1063 | ||
1064 | vpclmulqdq $0x00, \T5, \XMM6, \T4 | |
1065 | vpxor \T4, \T7, \T7 | |
1066 | ||
1067 | vmovdqa HashKey_3_k(arg1), \T3 | |
1068 | vpclmulqdq $0x00, \T3, \T2, \T2 | |
1069 | vpxor \T2, \XMM1, \XMM1 | |
1070 | ||
1071 | ###################### | |
1072 | ||
1073 | vpshufd $0b01001110, \XMM7, \T2 | |
1074 | vpxor \XMM7, \T2, \T2 | |
1075 | vmovdqa HashKey_2(arg1), \T5 | |
1076 | vpclmulqdq $0x11, \T5, \XMM7, \T4 | |
1077 | vpxor \T4, \T6, \T6 | |
1078 | ||
1079 | vpclmulqdq $0x00, \T5, \XMM7, \T4 | |
1080 | vpxor \T4, \T7, \T7 | |
1081 | ||
1082 | vmovdqa HashKey_2_k(arg1), \T3 | |
1083 | vpclmulqdq $0x00, \T3, \T2, \T2 | |
1084 | vpxor \T2, \XMM1, \XMM1 | |
1085 | ||
1086 | ###################### | |
1087 | ||
1088 | vpshufd $0b01001110, \XMM8, \T2 | |
1089 | vpxor \XMM8, \T2, \T2 | |
1090 | vmovdqa HashKey(arg1), \T5 | |
1091 | vpclmulqdq $0x11, \T5, \XMM8, \T4 | |
1092 | vpxor \T4, \T6, \T6 | |
1093 | ||
1094 | vpclmulqdq $0x00, \T5, \XMM8, \T4 | |
1095 | vpxor \T4, \T7, \T7 | |
1096 | ||
1097 | vmovdqa HashKey_k(arg1), \T3 | |
1098 | vpclmulqdq $0x00, \T3, \T2, \T2 | |
1099 | ||
1100 | vpxor \T2, \XMM1, \XMM1 | |
1101 | vpxor \T6, \XMM1, \XMM1 | |
1102 | vpxor \T7, \XMM1, \T2 | |
1103 | ||
1104 | ||
1105 | ||
1106 | ||
1107 | vpslldq $8, \T2, \T4 | |
1108 | vpsrldq $8, \T2, \T2 | |
1109 | ||
1110 | vpxor \T4, \T7, \T7 | |
1111 | vpxor \T2, \T6, \T6 # <T6:T7> holds the result of | |
1112 | # the accumulated carry-less multiplications | |
1113 | ||
1114 | ####################################################################### | |
1115 | #first phase of the reduction | |
1116 | vpslld $31, \T7, \T2 # packed left shift << 31 | |
1117 | vpslld $30, \T7, \T3 # packed left shift << 30 | |
1118 | vpslld $25, \T7, \T4 # packed left shift << 25 | |
1119 | ||
1120 | vpxor \T3, \T2, \T2 # xor the shifted versions | |
1121 | vpxor \T4, \T2, \T2 | |
1122 | ||
1123 | vpsrldq $4, \T2, \T1 # shift-R T1 1 DW | |
1124 | ||
1125 | vpslldq $12, \T2, \T2 # shift-L T2 3 DWs | |
1126 | vpxor \T2, \T7, \T7 # first phase of the reduction complete | |
1127 | ####################################################################### | |
1128 | ||
1129 | ||
1130 | #second phase of the reduction | |
1131 | vpsrld $1, \T7, \T2 # packed right shift >> 1 | |
1132 | vpsrld $2, \T7, \T3 # packed right shift >> 2 | |
1133 | vpsrld $7, \T7, \T4 # packed right shift >> 7 | |
1134 | vpxor \T3, \T2, \T2 # xor the shifted versions | |
1135 | vpxor \T4, \T2, \T2 | |
1136 | ||
1137 | vpxor \T1, \T2, \T2 | |
1138 | vpxor \T2, \T7, \T7 | |
1139 | vpxor \T7, \T6, \T6 # the result is in T6 | |
1140 | ||
1141 | .endm | |
1142 | ||
1143 | ||
1144 | # combined for GCM encrypt and decrypt functions | |
1145 | # clobbering all xmm registers | |
1146 | # clobbering r10, r11, r12, r13, r14, r15 | |
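 | # Flow: save registers and align the stack, encrypt/decrypt 0..7 initial | |
 | # blocks (INITIAL_BLOCKS_AVX), process the bulk of the data 8 blocks at a | |
 | # time (GHASH_8_ENCRYPT_8_PARALLEL_AVX), fold the last 8 ciphertext blocks | |
 | # into the hash (GHASH_LAST_8_AVX), handle a trailing partial block, hash | |
 | # len(A)||len(C), and XOR with E(K,Y0) to produce the 8/12/16-byte tag. | |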
1147 | .macro GCM_ENC_DEC_AVX ENC_DEC | |
1148 | ||
1149 | #the number of pushes must equal STACK_OFFSET | |
1150 | push %r12 | |
1151 | push %r13 | |
1152 | push %r14 | |
1153 | push %r15 | |
1154 | ||
1155 | mov %rsp, %r14 | |
1156 | ||
1157 | ||
1158 | ||
1159 | ||
1160 | sub $VARIABLE_OFFSET, %rsp | |
1161 | and $~63, %rsp # align rsp to 64 bytes | |
1162 | ||
1163 | ||
1164 | vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey | |
1165 | ||
1166 | mov arg4, %r13 # save the number of bytes of plaintext/ciphertext | |
1167 | and $-16, %r13 # r13 = r13 - (r13 mod 16) | |
1168 | ||
1169 | mov %r13, %r12 | |
1170 | shr $4, %r12 | |
1171 | and $7, %r12 | |
1172 | jz _initial_num_blocks_is_0\@ | |
1173 | ||
1174 | cmp $7, %r12 | |
1175 | je _initial_num_blocks_is_7\@ | |
1176 | cmp $6, %r12 | |
1177 | je _initial_num_blocks_is_6\@ | |
1178 | cmp $5, %r12 | |
1179 | je _initial_num_blocks_is_5\@ | |
1180 | cmp $4, %r12 | |
1181 | je _initial_num_blocks_is_4\@ | |
1182 | cmp $3, %r12 | |
1183 | je _initial_num_blocks_is_3\@ | |
1184 | cmp $2, %r12 | |
1185 | je _initial_num_blocks_is_2\@ | |
1186 | ||
1187 | jmp _initial_num_blocks_is_1\@ | |
1188 | ||
1189 | _initial_num_blocks_is_7\@: | |
1190 | INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | |
1191 | sub $16*7, %r13 | |
1192 | jmp _initial_blocks_encrypted\@ | |
1193 | ||
1194 | _initial_num_blocks_is_6\@: | |
1195 | INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | |
1196 | sub $16*6, %r13 | |
1197 | jmp _initial_blocks_encrypted\@ | |
1198 | ||
1199 | _initial_num_blocks_is_5\@: | |
1200 | INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | |
1201 | sub $16*5, %r13 | |
1202 | jmp _initial_blocks_encrypted\@ | |
1203 | ||
1204 | _initial_num_blocks_is_4\@: | |
1205 | INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | |
1206 | sub $16*4, %r13 | |
1207 | jmp _initial_blocks_encrypted\@ | |
1208 | ||
1209 | _initial_num_blocks_is_3\@: | |
1210 | INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | |
1211 | sub $16*3, %r13 | |
1212 | jmp _initial_blocks_encrypted\@ | |
1213 | ||
1214 | _initial_num_blocks_is_2\@: | |
1215 | INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | |
1216 | sub $16*2, %r13 | |
1217 | jmp _initial_blocks_encrypted\@ | |
1218 | ||
1219 | _initial_num_blocks_is_1\@: | |
1220 | INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | |
1221 | sub $16*1, %r13 | |
1222 | jmp _initial_blocks_encrypted\@ | |
1223 | ||
1224 | _initial_num_blocks_is_0\@: | |
1225 | INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | |
1226 | ||
1227 | ||
1228 | _initial_blocks_encrypted\@: | |
1229 | cmp $0, %r13 | |
1230 | je _zero_cipher_left\@ | |
1231 | ||
1232 | sub $128, %r13 | |
1233 | je _eight_cipher_left\@ | |
1234 | ||
1235 | ||
1236 | ||
1237 | ||
1238 | vmovd %xmm9, %r15d | |
1239 | and $255, %r15d | |
1240 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | |
1241 | ||
1242 | ||
1243 | _encrypt_by_8_new\@: | |
1244 | cmp $(255-8), %r15d | |
1245 | jg _encrypt_by_8\@ | |
1246 | ||
1247 | ||
1248 | ||
1249 | add $8, %r15b | |
1250 | GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC | |
1251 | add $128, %r11 | |
1252 | sub $128, %r13 | |
1253 | jne _encrypt_by_8_new\@ | |
1254 | ||
1255 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | |
1256 | jmp _eight_cipher_left\@ | |
1257 | ||
1258 | _encrypt_by_8\@: | |
1259 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | |
1260 | add $8, %r15b | |
1261 | GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC | |
1262 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | |
1263 | add $128, %r11 | |
1264 | sub $128, %r13 | |
1265 | jne _encrypt_by_8_new\@ | |
1266 | ||
1267 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | |
1268 | ||
1269 | ||
1270 | ||
1271 | ||
1272 | _eight_cipher_left\@: | |
1273 | GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 | |
1274 | ||
1275 | ||
1276 | _zero_cipher_left\@: | |
1277 | cmp $16, arg4 | |
1278 | jl _only_less_than_16\@ | |
1279 | ||
1280 | mov arg4, %r13 | |
1281 | and $15, %r13 # r13 = (arg4 mod 16) | |
1282 | ||
1283 | je _multiple_of_16_bytes\@ | |
1284 | ||
1285 | # handle the last <16 Byte block separately | |
1286 | ||
1287 | ||
1288 | vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn | |
1289 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | |
1290 | ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) | |
1291 | ||
1292 | sub $16, %r11 | |
1293 | add %r13, %r11 | |
1294 | vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block | |
1295 | ||
1296 | lea SHIFT_MASK+16(%rip), %r12 | |
1297 | sub %r13, %r12 # adjust the shuffle mask pointer to be | |
1298 | # able to shift 16-r13 bytes (r13 is the | |
1299 | # number of bytes in plaintext mod 16) | |
1300 | vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask | |
1301 | vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes | |
1302 | jmp _final_ghash_mul\@ | |
1303 | ||
1304 | _only_less_than_16\@: | |
1305 | # check for 0 length | |
1306 | mov arg4, %r13 | |
1307 | and $15, %r13 # r13 = (arg4 mod 16) | |
1308 | ||
1309 | je _multiple_of_16_bytes\@ | |
1310 | ||
1311 | # handle the last <16 Byte block separately | |
1312 | ||
1313 | ||
1314 | vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn | |
1315 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | |
1316 | ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) | |
1317 | ||
1318 | ||
1319 | lea SHIFT_MASK+16(%rip), %r12 | |
1320 | sub %r13, %r12 # adjust the shuffle mask pointer to be | |
1321 | # able to shift 16-r13 bytes (r13 is the | |
1322 | # number of bytes in plaintext mod 16) | |
1323 | ||
1324 | _get_last_16_byte_loop\@: | |
1325 | movb (arg3, %r11), %al | |
1326 | movb %al, TMP1 (%rsp , %r11) | |
1327 | add $1, %r11 | |
1328 | cmp %r13, %r11 | |
1329 | jne _get_last_16_byte_loop\@ | |
1330 | ||
1331 | vmovdqu TMP1(%rsp), %xmm1 | |
1332 | ||
1333 | sub $16, %r11 | |
1334 | ||
1335 | _final_ghash_mul\@: | |
1336 | .if \ENC_DEC == DEC | |
1337 | vmovdqa %xmm1, %xmm2 | |
1338 | vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) | |
1339 | vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to | |
1340 | # mask out top 16-r13 bytes of xmm9 | |
1341 | vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 | |
1342 | vpand %xmm1, %xmm2, %xmm2 | |
1343 | vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 | |
1344 | vpxor %xmm2, %xmm14, %xmm14 | |
1345 | #GHASH computation for the last <16 Byte block | |
1346 | GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 | |
1347 | sub %r13, %r11 | |
1348 | add $16, %r11 | |
1349 | .else | |
1350 | vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) | |
1351 | vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to | |
1352 | # mask out top 16-r13 bytes of xmm9 | |
1353 | vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 | |
1354 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | |
1355 | vpxor %xmm9, %xmm14, %xmm14 | |
1356 | #GHASH computation for the last <16 Byte block | |
1357 | GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 | |
1358 | sub %r13, %r11 | |
1359 | add $16, %r11 | |
1360 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext | |
1361 | .endif | |
1362 | ||
1363 | ||
1364 | ############################# | |
1365 | # output r13 Bytes | |
1366 | vmovq %xmm9, %rax | |
1367 | cmp $8, %r13 | |
1368 | jle _less_than_8_bytes_left\@ | |
1369 | ||
1370 | mov %rax, (arg2 , %r11) | |
1371 | add $8, %r11 | |
1372 | vpsrldq $8, %xmm9, %xmm9 | |
1373 | vmovq %xmm9, %rax | |
1374 | sub $8, %r13 | |
1375 | ||
1376 | _less_than_8_bytes_left\@: | |
1377 | movb %al, (arg2 , %r11) | |
1378 | add $1, %r11 | |
1379 | shr $8, %rax | |
1380 | sub $1, %r13 | |
1381 | jne _less_than_8_bytes_left\@ | |
1382 | ############################# | |
1383 | ||
1384 | _multiple_of_16_bytes\@: | |
1385 | mov arg7, %r12 # r12 = aadLen (number of bytes) | |
1386 | shl $3, %r12 # convert into number of bits | |
1387 | vmovd %r12d, %xmm15 # len(A) in xmm15 | |
1388 | ||
1389 | shl $3, arg4 # len(C) in bits (*8) | |
1390 | vmovq arg4, %xmm1 | |
1391 | vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 | |
1392 | vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) | |
1393 | ||
1394 | vpxor %xmm15, %xmm14, %xmm14 | |
1395 | GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation | |
1396 | vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap | |
1397 | ||
1398 | mov arg5, %rax # rax = *Y0 | |
1399 | vmovdqu (%rax), %xmm9 # xmm9 = Y0 | |
1400 | ||
1401 | ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) | |
1402 | ||
1403 | vpxor %xmm14, %xmm9, %xmm9 | |
1404 | ||
1405 | ||
1406 | ||
1407 | _return_T\@: | |
1408 | mov arg8, %r10 # r10 = authTag | |
1409 | mov arg9, %r11 # r11 = auth_tag_len | |
1410 | ||
1411 | cmp $16, %r11 | |
1412 | je _T_16\@ | |
1413 | ||
1414 | cmp $12, %r11 | |
1415 | je _T_12\@ | |
1416 | ||
1417 | _T_8\@: | |
1418 | vmovq %xmm9, %rax | |
1419 | mov %rax, (%r10) | |
1420 | jmp _return_T_done\@ | |
1421 | _T_12\@: | |
1422 | vmovq %xmm9, %rax | |
1423 | mov %rax, (%r10) | |
1424 | vpsrldq $8, %xmm9, %xmm9 | |
1425 | vmovd %xmm9, %eax | |
1426 | mov %eax, 8(%r10) | |
1427 | jmp _return_T_done\@ | |
1428 | ||
1429 | _T_16\@: | |
1430 | vmovdqu %xmm9, (%r10) | |
1431 | ||
1432 | _return_T_done\@: | |
1433 | mov %r14, %rsp | |
1434 | ||
1435 | pop %r15 | |
1436 | pop %r14 | |
1437 | pop %r13 | |
1438 | pop %r12 | |
1439 | .endm | |
1440 | ||
1441 | ||
1442 | ############################################################# | |
1443 | #void aesni_gcm_precomp_avx_gen2 | |
1444 | # (gcm_data *my_ctx_data, | |
1445 | # u8 *hash_subkey); /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ | |
1446 | ############################################################# | |
1447 | ENTRY(aesni_gcm_precomp_avx_gen2) | |
1448 | #the number of pushes must equal STACK_OFFSET | |
1449 | push %r12 | |
1450 | push %r13 | |
1451 | push %r14 | |
1452 | push %r15 | |
1453 | ||
1454 | mov %rsp, %r14 | |
1455 | ||
1456 | ||
1457 | ||
1458 | sub $VARIABLE_OFFSET, %rsp | |
1459 | and $~63, %rsp # align rsp to 64 bytes | |
1460 | ||
1461 | vmovdqu (arg2), %xmm6 # xmm6 = HashKey | |
1462 | ||
1463 | vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 | |
1464 | ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey | |
1465 | vmovdqa %xmm6, %xmm2 | |
1466 | vpsllq $1, %xmm6, %xmm6 | |
1467 | vpsrlq $63, %xmm2, %xmm2 | |
1468 | vmovdqa %xmm2, %xmm1 | |
1469 | vpslldq $8, %xmm2, %xmm2 | |
1470 | vpsrldq $8, %xmm1, %xmm1 | |
1471 | vpor %xmm2, %xmm6, %xmm6 | |
1472 | #reduction | |
1473 | vpshufd $0b00100100, %xmm1, %xmm2 | |
1474 | vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 | |
1475 | vpand POLY(%rip), %xmm2, %xmm2 | |
1476 | vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly | |
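 | # i.e. HashKey has been multiplied by x (shifted left one bit) and, if the | |
 | # bit shifted out of the top was set, reduced by XORing in POLY. | |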
1477 | ####################################################################### | |
1478 | vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly | |
1479 | ||
1480 | ||
1481 | PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 | |
1482 | ||
1483 | mov %r14, %rsp | |
1484 | ||
1485 | pop %r15 | |
1486 | pop %r14 | |
1487 | pop %r13 | |
1488 | pop %r12 | |
1489 | ret | |
1490 | ENDPROC(aesni_gcm_precomp_avx_gen2) | |
1491 | ||
1492 | ############################################################################### | |
1493 | #void aesni_gcm_enc_avx_gen2( | |
1494 | # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ | |
1495 | # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ | |
1496 | # const u8 *in, /* Plaintext input */ | |
1497 | # u64 plaintext_len, /* Length of data in Bytes for encryption. */ | |
1498 | # u8 *iv, /* Pre-counter block j0: 4 byte salt | |
1499 | # (from Security Association) concatenated with 8 byte | |
1500 | # Initialisation Vector (from IPSec ESP Payload) | |
1501 | # concatenated with 0x00000001. 16-byte aligned pointer. */ | |
1502 | # const u8 *aad, /* Additional Authentication Data (AAD)*/ | |
1503 | # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ | |
1504 | # u8 *auth_tag, /* Authenticated Tag output. */ | |
1505 | # u64 auth_tag_len); /* Authenticated Tag Length in bytes. | |
1506 | # Valid values are 16 (most likely), 12 or 8. */ | |
1507 | ############################################################################### | |
1508 | ENTRY(aesni_gcm_enc_avx_gen2) | |
1509 | GCM_ENC_DEC_AVX ENC | |
1510 | ret | |
1511 | ENDPROC(aesni_gcm_enc_avx_gen2) | |
1512 | ||
1513 | ############################################################################### | |
1514 | #void aesni_gcm_dec_avx_gen2( | |
1515 | # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ | |
1516 | # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ | |
1517 | # const u8 *in, /* Ciphertext input */ | |
1518 | # u64 plaintext_len, /* Length of data in Bytes for encryption. */ | |
1519 | # u8 *iv, /* Pre-counter block j0: 4 byte salt | |
1520 | # (from Security Association) concatenated with 8 byte | |
1521 | # Initialisation Vector (from IPSec ESP Payload) | |
1522 | # concatenated with 0x00000001. 16-byte aligned pointer. */ | |
1523 | # const u8 *aad, /* Additional Authentication Data (AAD)*/ | |
1524 | # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ | |
1525 | # u8 *auth_tag, /* Authenticated Tag output. */ | |
1526 | # u64 auth_tag_len)# /* Authenticated Tag Length in bytes. | |
1527 | # Valid values are 16 (most likely), 12 or 8. */ | |
1528 | ############################################################################### | |
1529 | ENTRY(aesni_gcm_dec_avx_gen2) | |
1530 | GCM_ENC_DEC_AVX DEC | |
1531 | ret | |
1532 | ENDPROC(aesni_gcm_dec_avx_gen2) | |
1533 | #endif /* CONFIG_AS_AVX */ | |
1534 | ||
1535 | #ifdef CONFIG_AS_AVX2 | |
1536 | ############################################################################### | |
1537 | # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) | |
1538 | # Input: A and B (128-bits each, bit-reflected) | |
1539 | # Output: C = A*B*x mod poly (i.e. >>1) | |
1540 | # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input | |
1541 | # the macro then computes GH = GH * HK * x mod poly, which is equivalent to GH*HashKey mod poly. | |
1542 | ############################################################################### | |
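# For reference, the four vpclmulqdq below form the 256-bit carry-less
# product schoolbook-style, <T1:GH> = a1*b1 : a0*b0 with the middle term
# (a1*b0 xor a0*b1) folded across the boundary, and POLY2 is then used for
# a two-phase reduction of that 256-bit value back to 128 bits modulo the
# polynomial (128,127,126,121,0).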
1543 | .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5 | |
1544 | ||
1545 | vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1 | |
1546 | vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0 | |
1547 | vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0 | |
1548 | vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1 | |
1549 | vpxor \T3, \GH, \GH | |
1550 | ||
1551 | ||
1552 | vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs | |
1553 | vpslldq $8 , \GH, \GH # shift-L GH 2 DWs | |
1554 | ||
1555 | vpxor \T3, \T1, \T1 | |
1556 | vpxor \T2, \GH, \GH | |
1557 | ||
1558 | ####################################################################### | |
1559 | #first phase of the reduction | |
1560 | vmovdqa POLY2(%rip), \T3 | |
1561 | ||
1562 | vpclmulqdq $0x01, \GH, \T3, \T2 | |
1563 | vpslldq $8, \T2, \T2 # shift-L T2 2 DWs | |
1564 | ||
1565 | vpxor \T2, \GH, \GH # first phase of the reduction complete | |
1566 | ####################################################################### | |
1567 | #second phase of the reduction | |
1568 | vpclmulqdq $0x00, \GH, \T3, \T2 | |
1569 | vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) | |
1570 | ||
1571 | vpclmulqdq $0x10, \GH, \T3, \GH | |
1572 | vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts) | |
1573 | ||
1574 | vpxor \T2, \GH, \GH # second phase of the reduction complete | |
1575 | ####################################################################### | |
1576 | vpxor \T1, \GH, \GH # the result is in GH | |
1577 | ||
1578 | ||
1579 | .endm | |
1580 | ||
1581 | .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6 | |
1582 | ||
1583 | # Precompute HashKey^i<<1 mod poly for i = 2..8 for use by the 8-block parallel GHASH | |
1584 | vmovdqa \HK, \T5 | |
1585 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly | |
1586 | vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly | |
1587 | ||
1588 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly | |
1589 | vmovdqa \T5, HashKey_3(arg1) | |
1590 | ||
1591 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly | |
1592 | vmovdqa \T5, HashKey_4(arg1) | |
1593 | ||
1594 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly | |
1595 | vmovdqa \T5, HashKey_5(arg1) | |
1596 | ||
1597 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly | |
1598 | vmovdqa \T5, HashKey_6(arg1) | |
1599 | ||
1600 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly | |
1601 | vmovdqa \T5, HashKey_7(arg1) | |
1602 | ||
1603 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly | |
1604 | vmovdqa \T5, HashKey_8(arg1) | |
1605 | ||
1606 | .endm | |
1607 | ||
1608 | ||
1609 | ## if a = number of total plaintext bytes | |
1610 | ## b = floor(a/16) | |
1611 | ## num_initial_blocks = b mod 8 | |
1612 | ## encrypt the initial num_initial_blocks blocks and apply GHASH on the ciphertext | |
1613 | ## r10, r11, r12, rax are clobbered | |
1614 | ## arg1, arg2, arg3, r14 are used as pointers only, not modified | |
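## Example: for a = 250 plaintext bytes, b = 15 and num_initial_blocks =
## 15 mod 8 = 7, so 7 blocks are handled here, the next 8 blocks go through
## GHASH_8_ENCRYPT_8_PARALLEL_AVX2, and the final 10 bytes are handled by
## the partial-block path in GCM_ENC_DEC_AVX2.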
1615 | ||
1616 | .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER | |
1617 | i = (8-\num_initial_blocks) | |
1618 | setreg | |
1619 | ||
1620 | mov arg6, %r10 # r10 = AAD | |
1621 | mov arg7, %r12 # r12 = aadLen | |
1622 | ||
1623 | ||
1624 | mov %r12, %r11 | |
1625 | ||
1626 | vpxor reg_i, reg_i, reg_i | |
1627 | _get_AAD_loop\@: | |
1628 | vmovd (%r10), \T1 | |
1629 | vpslldq $12, \T1, \T1 | |
1630 | vpsrldq $4, reg_i, reg_i | |
1631 | vpxor \T1, reg_i, reg_i | |
1632 | ||
1633 | add $4, %r10 | |
1634 | sub $4, %r12 | |
1635 | jg _get_AAD_loop\@ | |
1636 | ||
1637 | ||
1638 | cmp $16, %r11 | |
1639 | je _get_AAD_loop2_done\@ | |
1640 | mov $16, %r12 | |
1641 | ||
1642 | _get_AAD_loop2\@: | |
1643 | vpsrldq $4, reg_i, reg_i | |
1644 | sub $4, %r12 | |
1645 | cmp %r11, %r12 | |
1646 | jg _get_AAD_loop2\@ | |
1647 | ||
1648 | _get_AAD_loop2_done\@: | |
1649 | ||
1650 | #byte-reflect the AAD data | |
1651 | vpshufb SHUF_MASK(%rip), reg_i, reg_i | |
1652 | ||
1653 | # initialize the data pointer offset to zero | |
1654 | xor %r11, %r11 | |
1655 | ||
1656 | # start AES for num_initial_blocks blocks | |
1657 | mov arg5, %rax # rax = *Y0 | |
1658 | vmovdqu (%rax), \CTR # CTR = Y0 | |
1659 | vpshufb SHUF_MASK(%rip), \CTR, \CTR | |
1660 | ||
1661 | ||
1662 | i = (9-\num_initial_blocks) | |
1663 | setreg | |
1664 | .rep \num_initial_blocks | |
1665 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | |
1666 | vmovdqa \CTR, reg_i | |
1667 | vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap | |
1668 | i = (i+1) | |
1669 | setreg | |
1670 | .endr | |
1671 | ||
1672 | vmovdqa (arg1), \T_key | |
1673 | i = (9-\num_initial_blocks) | |
1674 | setreg | |
1675 | .rep \num_initial_blocks | |
1676 | vpxor \T_key, reg_i, reg_i | |
1677 | i = (i+1) | |
1678 | setreg | |
1679 | .endr | |
1680 | ||
1681 | j = 1 | |
1682 | setreg | |
1683 | .rep 9 | |
1684 | vmovdqa 16*j(arg1), \T_key | |
1685 | i = (9-\num_initial_blocks) | |
1686 | setreg | |
1687 | .rep \num_initial_blocks | |
1688 | vaesenc \T_key, reg_i, reg_i | |
1689 | i = (i+1) | |
1690 | setreg | |
1691 | .endr | |
1692 | ||
1693 | j = (j+1) | |
1694 | setreg | |
1695 | .endr | |
1696 | ||
1697 | ||
1698 | vmovdqa 16*10(arg1), \T_key | |
1699 | i = (9-\num_initial_blocks) | |
1700 | setreg | |
1701 | .rep \num_initial_blocks | |
1702 | vaesenclast \T_key, reg_i, reg_i | |
1703 | i = (i+1) | |
1704 | setreg | |
1705 | .endr | |
1706 | ||
1707 | i = (9-\num_initial_blocks) | |
1708 | setreg | |
1709 | .rep \num_initial_blocks | |
1710 | vmovdqu (arg3, %r11), \T1 | |
1711 | vpxor \T1, reg_i, reg_i | |
1712 | vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for | |
1713 | # num_initial_blocks blocks | |
1714 | add $16, %r11 | |
1715 | .if \ENC_DEC == DEC | |
1716 | vmovdqa \T1, reg_i | |
1717 | .endif | |
1718 | vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations | |
1719 | i = (i+1) | |
1720 | setreg | |
1721 | .endr | |
1722 | ||
1723 | ||
1724 | i = (8-\num_initial_blocks) | |
1725 | j = (9-\num_initial_blocks) | |
1726 | setreg | |
1727 | GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6 | |
1728 | ||
1729 | .rep \num_initial_blocks | |
1730 | vpxor reg_i, reg_j, reg_j | |
1731 | GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks | |
1732 | i = (i+1) | |
1733 | j = (j+1) | |
1734 | setreg | |
1735 | .endr | |
1736 | # XMM8 has the combined result here | |
1737 | ||
1738 | vmovdqa \XMM8, TMP1(%rsp) | |
1739 | vmovdqa \XMM8, \T3 | |
1740 | ||
1741 | cmp $128, %r13 | |
1742 | jl _initial_blocks_done\@ # no need for precomputed constants | |
1743 | ||
1744 | ############################################################################### | |
1745 | # Prepare eight counter blocks and run them through the AES rounds to prime the 8-block main loop | |
1746 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | |
1747 | vmovdqa \CTR, \XMM1 | |
1748 | vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap | |
1749 | ||
1750 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | |
1751 | vmovdqa \CTR, \XMM2 | |
1752 | vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap | |
1753 | ||
1754 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | |
1755 | vmovdqa \CTR, \XMM3 | |
1756 | vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap | |
1757 | ||
1758 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | |
1759 | vmovdqa \CTR, \XMM4 | |
1760 | vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap | |
1761 | ||
1762 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | |
1763 | vmovdqa \CTR, \XMM5 | |
1764 | vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap | |
1765 | ||
1766 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | |
1767 | vmovdqa \CTR, \XMM6 | |
1768 | vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap | |
1769 | ||
1770 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | |
1771 | vmovdqa \CTR, \XMM7 | |
1772 | vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap | |
1773 | ||
1774 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | |
1775 | vmovdqa \CTR, \XMM8 | |
1776 | vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap | |
1777 | ||
1778 | vmovdqa (arg1), \T_key | |
1779 | vpxor \T_key, \XMM1, \XMM1 | |
1780 | vpxor \T_key, \XMM2, \XMM2 | |
1781 | vpxor \T_key, \XMM3, \XMM3 | |
1782 | vpxor \T_key, \XMM4, \XMM4 | |
1783 | vpxor \T_key, \XMM5, \XMM5 | |
1784 | vpxor \T_key, \XMM6, \XMM6 | |
1785 | vpxor \T_key, \XMM7, \XMM7 | |
1786 | vpxor \T_key, \XMM8, \XMM8 | |
1787 | ||
1788 | i = 1 | |
1789 | setreg | |
1790 | .rep 9 # do 9 rounds | |
1791 | vmovdqa 16*i(arg1), \T_key | |
1792 | vaesenc \T_key, \XMM1, \XMM1 | |
1793 | vaesenc \T_key, \XMM2, \XMM2 | |
1794 | vaesenc \T_key, \XMM3, \XMM3 | |
1795 | vaesenc \T_key, \XMM4, \XMM4 | |
1796 | vaesenc \T_key, \XMM5, \XMM5 | |
1797 | vaesenc \T_key, \XMM6, \XMM6 | |
1798 | vaesenc \T_key, \XMM7, \XMM7 | |
1799 | vaesenc \T_key, \XMM8, \XMM8 | |
1800 | i = (i+1) | |
1801 | setreg | |
1802 | .endr | |
1803 | ||
1804 | ||
1805 | vmovdqa 16*i(arg1), \T_key | |
1806 | vaesenclast \T_key, \XMM1, \XMM1 | |
1807 | vaesenclast \T_key, \XMM2, \XMM2 | |
1808 | vaesenclast \T_key, \XMM3, \XMM3 | |
1809 | vaesenclast \T_key, \XMM4, \XMM4 | |
1810 | vaesenclast \T_key, \XMM5, \XMM5 | |
1811 | vaesenclast \T_key, \XMM6, \XMM6 | |
1812 | vaesenclast \T_key, \XMM7, \XMM7 | |
1813 | vaesenclast \T_key, \XMM8, \XMM8 | |
1814 | ||
1815 | vmovdqu (arg3, %r11), \T1 | |
1816 | vpxor \T1, \XMM1, \XMM1 | |
1817 | vmovdqu \XMM1, (arg2 , %r11) | |
1818 | .if \ENC_DEC == DEC | |
1819 | vmovdqa \T1, \XMM1 | |
1820 | .endif | |
1821 | ||
1822 | vmovdqu 16*1(arg3, %r11), \T1 | |
1823 | vpxor \T1, \XMM2, \XMM2 | |
1824 | vmovdqu \XMM2, 16*1(arg2 , %r11) | |
1825 | .if \ENC_DEC == DEC | |
1826 | vmovdqa \T1, \XMM2 | |
1827 | .endif | |
1828 | ||
1829 | vmovdqu 16*2(arg3, %r11), \T1 | |
1830 | vpxor \T1, \XMM3, \XMM3 | |
1831 | vmovdqu \XMM3, 16*2(arg2 , %r11) | |
1832 | .if \ENC_DEC == DEC | |
1833 | vmovdqa \T1, \XMM3 | |
1834 | .endif | |
1835 | ||
1836 | vmovdqu 16*3(arg3, %r11), \T1 | |
1837 | vpxor \T1, \XMM4, \XMM4 | |
1838 | vmovdqu \XMM4, 16*3(arg2 , %r11) | |
1839 | .if \ENC_DEC == DEC | |
1840 | vmovdqa \T1, \XMM4 | |
1841 | .endif | |
1842 | ||
1843 | vmovdqu 16*4(arg3, %r11), \T1 | |
1844 | vpxor \T1, \XMM5, \XMM5 | |
1845 | vmovdqu \XMM5, 16*4(arg2 , %r11) | |
1846 | .if \ENC_DEC == DEC | |
1847 | vmovdqa \T1, \XMM5 | |
1848 | .endif | |
1849 | ||
1850 | vmovdqu 16*5(arg3, %r11), \T1 | |
1851 | vpxor \T1, \XMM6, \XMM6 | |
1852 | vmovdqu \XMM6, 16*5(arg2 , %r11) | |
1853 | .if \ENC_DEC == DEC | |
1854 | vmovdqa \T1, \XMM6 | |
1855 | .endif | |
1856 | ||
1857 | vmovdqu 16*6(arg3, %r11), \T1 | |
1858 | vpxor \T1, \XMM7, \XMM7 | |
1859 | vmovdqu \XMM7, 16*6(arg2 , %r11) | |
1860 | .if \ENC_DEC == DEC | |
1861 | vmovdqa \T1, \XMM7 | |
1862 | .endif | |
1863 | ||
1864 | vmovdqu 16*7(arg3, %r11), \T1 | |
1865 | vpxor \T1, \XMM8, \XMM8 | |
1866 | vmovdqu \XMM8, 16*7(arg2 , %r11) | |
1867 | .if \ENC_DEC == DEC | |
1868 | vmovdqa \T1, \XMM8 | |
1869 | .endif | |
1870 | ||
1871 | add $128, %r11 | |
1872 | ||
1873 | vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap | |
1874 | vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with | |
1875 | # the corresponding ciphertext | |
1876 | vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap | |
1877 | vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap | |
1878 | vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap | |
1879 | vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap | |
1880 | vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap | |
1881 | vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap | |
1882 | vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap | |
1883 | ||
1884 | ############################################################################### | |
1885 | ||
1886 | _initial_blocks_done\@: | |
1887 | ||
1888 | ||
1889 | .endm | |
1890 | ||
1891 | ||
1892 | ||
1893 | # encrypt 8 blocks at a time | |
1894 | # GHASH the 8 previously encrypted ciphertext blocks | |
1895 | # arg1, arg2, arg3 are used as pointers only, not modified | |
1896 | # r11 is the data offset value | |
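# The AES rounds for the current 8 counter blocks are interleaved with the
# carry-less multiplies that GHASH the previous 8 ciphertext blocks (saved
# in \T2 and TMP2-TMP8), so vaesenc and vpclmulqdq latencies overlap.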
1897 | .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC | |
1898 | ||
1899 | vmovdqa \XMM1, \T2 | |
1900 | vmovdqa \XMM2, TMP2(%rsp) | |
1901 | vmovdqa \XMM3, TMP3(%rsp) | |
1902 | vmovdqa \XMM4, TMP4(%rsp) | |
1903 | vmovdqa \XMM5, TMP5(%rsp) | |
1904 | vmovdqa \XMM6, TMP6(%rsp) | |
1905 | vmovdqa \XMM7, TMP7(%rsp) | |
1906 | vmovdqa \XMM8, TMP8(%rsp) | |
1907 | ||
1908 | .if \loop_idx == in_order | |
1909 | vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT | |
1910 | vpaddd ONE(%rip), \XMM1, \XMM2 | |
1911 | vpaddd ONE(%rip), \XMM2, \XMM3 | |
1912 | vpaddd ONE(%rip), \XMM3, \XMM4 | |
1913 | vpaddd ONE(%rip), \XMM4, \XMM5 | |
1914 | vpaddd ONE(%rip), \XMM5, \XMM6 | |
1915 | vpaddd ONE(%rip), \XMM6, \XMM7 | |
1916 | vpaddd ONE(%rip), \XMM7, \XMM8 | |
1917 | vmovdqa \XMM8, \CTR | |
1918 | ||
1919 | vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap | |
1920 | vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap | |
1921 | vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap | |
1922 | vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap | |
1923 | vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap | |
1924 | vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap | |
1925 | vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap | |
1926 | vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap | |
1927 | .else | |
1928 | vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT | |
1929 | vpaddd ONEf(%rip), \XMM1, \XMM2 | |
1930 | vpaddd ONEf(%rip), \XMM2, \XMM3 | |
1931 | vpaddd ONEf(%rip), \XMM3, \XMM4 | |
1932 | vpaddd ONEf(%rip), \XMM4, \XMM5 | |
1933 | vpaddd ONEf(%rip), \XMM5, \XMM6 | |
1934 | vpaddd ONEf(%rip), \XMM6, \XMM7 | |
1935 | vpaddd ONEf(%rip), \XMM7, \XMM8 | |
1936 | vmovdqa \XMM8, \CTR | |
1937 | .endif | |
1938 | ||
1939 | ||
1940 | ####################################################################### | |
1941 | ||
1942 | vmovdqu (arg1), \T1 | |
1943 | vpxor \T1, \XMM1, \XMM1 | |
1944 | vpxor \T1, \XMM2, \XMM2 | |
1945 | vpxor \T1, \XMM3, \XMM3 | |
1946 | vpxor \T1, \XMM4, \XMM4 | |
1947 | vpxor \T1, \XMM5, \XMM5 | |
1948 | vpxor \T1, \XMM6, \XMM6 | |
1949 | vpxor \T1, \XMM7, \XMM7 | |
1950 | vpxor \T1, \XMM8, \XMM8 | |
1951 | ||
1952 | ####################################################################### | |
1953 | ||
1954 | ||
1955 | ||
1956 | ||
1957 | ||
1958 | vmovdqu 16*1(arg1), \T1 | |
1959 | vaesenc \T1, \XMM1, \XMM1 | |
1960 | vaesenc \T1, \XMM2, \XMM2 | |
1961 | vaesenc \T1, \XMM3, \XMM3 | |
1962 | vaesenc \T1, \XMM4, \XMM4 | |
1963 | vaesenc \T1, \XMM5, \XMM5 | |
1964 | vaesenc \T1, \XMM6, \XMM6 | |
1965 | vaesenc \T1, \XMM7, \XMM7 | |
1966 | vaesenc \T1, \XMM8, \XMM8 | |
1967 | ||
1968 | vmovdqu 16*2(arg1), \T1 | |
1969 | vaesenc \T1, \XMM1, \XMM1 | |
1970 | vaesenc \T1, \XMM2, \XMM2 | |
1971 | vaesenc \T1, \XMM3, \XMM3 | |
1972 | vaesenc \T1, \XMM4, \XMM4 | |
1973 | vaesenc \T1, \XMM5, \XMM5 | |
1974 | vaesenc \T1, \XMM6, \XMM6 | |
1975 | vaesenc \T1, \XMM7, \XMM7 | |
1976 | vaesenc \T1, \XMM8, \XMM8 | |
1977 | ||
1978 | ||
1979 | ####################################################################### | |
1980 | ||
1981 | vmovdqa HashKey_8(arg1), \T5 | |
1982 | vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 | |
1983 | vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 | |
1984 | vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 | |
1985 | vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1 | |
1986 | vpxor \T5, \T6, \T6 | |
1987 | ||
1988 | vmovdqu 16*3(arg1), \T1 | |
1989 | vaesenc \T1, \XMM1, \XMM1 | |
1990 | vaesenc \T1, \XMM2, \XMM2 | |
1991 | vaesenc \T1, \XMM3, \XMM3 | |
1992 | vaesenc \T1, \XMM4, \XMM4 | |
1993 | vaesenc \T1, \XMM5, \XMM5 | |
1994 | vaesenc \T1, \XMM6, \XMM6 | |
1995 | vaesenc \T1, \XMM7, \XMM7 | |
1996 | vaesenc \T1, \XMM8, \XMM8 | |
1997 | ||
1998 | vmovdqa TMP2(%rsp), \T1 | |
1999 | vmovdqa HashKey_7(arg1), \T5 | |
2000 | vpclmulqdq $0x11, \T5, \T1, \T3 | |
2001 | vpxor \T3, \T4, \T4 | |
2002 | ||
2003 | vpclmulqdq $0x00, \T5, \T1, \T3 | |
2004 | vpxor \T3, \T7, \T7 | |
2005 | ||
2006 | vpclmulqdq $0x01, \T5, \T1, \T3 | |
2007 | vpxor \T3, \T6, \T6 | |
2008 | ||
2009 | vpclmulqdq $0x10, \T5, \T1, \T3 | |
2010 | vpxor \T3, \T6, \T6 | |
2011 | ||
2012 | vmovdqu 16*4(arg1), \T1 | |
2013 | vaesenc \T1, \XMM1, \XMM1 | |
2014 | vaesenc \T1, \XMM2, \XMM2 | |
2015 | vaesenc \T1, \XMM3, \XMM3 | |
2016 | vaesenc \T1, \XMM4, \XMM4 | |
2017 | vaesenc \T1, \XMM5, \XMM5 | |
2018 | vaesenc \T1, \XMM6, \XMM6 | |
2019 | vaesenc \T1, \XMM7, \XMM7 | |
2020 | vaesenc \T1, \XMM8, \XMM8 | |
2021 | ||
2022 | ####################################################################### | |
2023 | ||
2024 | vmovdqa TMP3(%rsp), \T1 | |
2025 | vmovdqa HashKey_6(arg1), \T5 | |
2026 | vpclmulqdq $0x11, \T5, \T1, \T3 | |
2027 | vpxor \T3, \T4, \T4 | |
2028 | ||
2029 | vpclmulqdq $0x00, \T5, \T1, \T3 | |
2030 | vpxor \T3, \T7, \T7 | |
2031 | ||
2032 | vpclmulqdq $0x01, \T5, \T1, \T3 | |
2033 | vpxor \T3, \T6, \T6 | |
2034 | ||
2035 | vpclmulqdq $0x10, \T5, \T1, \T3 | |
2036 | vpxor \T3, \T6, \T6 | |
2037 | ||
2038 | vmovdqu 16*5(arg1), \T1 | |
2039 | vaesenc \T1, \XMM1, \XMM1 | |
2040 | vaesenc \T1, \XMM2, \XMM2 | |
2041 | vaesenc \T1, \XMM3, \XMM3 | |
2042 | vaesenc \T1, \XMM4, \XMM4 | |
2043 | vaesenc \T1, \XMM5, \XMM5 | |
2044 | vaesenc \T1, \XMM6, \XMM6 | |
2045 | vaesenc \T1, \XMM7, \XMM7 | |
2046 | vaesenc \T1, \XMM8, \XMM8 | |
2047 | ||
2048 | vmovdqa TMP4(%rsp), \T1 | |
2049 | vmovdqa HashKey_5(arg1), \T5 | |
2050 | vpclmulqdq $0x11, \T5, \T1, \T3 | |
2051 | vpxor \T3, \T4, \T4 | |
2052 | ||
2053 | vpclmulqdq $0x00, \T5, \T1, \T3 | |
2054 | vpxor \T3, \T7, \T7 | |
2055 | ||
2056 | vpclmulqdq $0x01, \T5, \T1, \T3 | |
2057 | vpxor \T3, \T6, \T6 | |
2058 | ||
2059 | vpclmulqdq $0x10, \T5, \T1, \T3 | |
2060 | vpxor \T3, \T6, \T6 | |
2061 | ||
2062 | vmovdqu 16*6(arg1), \T1 | |
2063 | vaesenc \T1, \XMM1, \XMM1 | |
2064 | vaesenc \T1, \XMM2, \XMM2 | |
2065 | vaesenc \T1, \XMM3, \XMM3 | |
2066 | vaesenc \T1, \XMM4, \XMM4 | |
2067 | vaesenc \T1, \XMM5, \XMM5 | |
2068 | vaesenc \T1, \XMM6, \XMM6 | |
2069 | vaesenc \T1, \XMM7, \XMM7 | |
2070 | vaesenc \T1, \XMM8, \XMM8 | |
2071 | ||
2072 | ||
2073 | vmovdqa TMP5(%rsp), \T1 | |
2074 | vmovdqa HashKey_4(arg1), \T5 | |
2075 | vpclmulqdq $0x11, \T5, \T1, \T3 | |
2076 | vpxor \T3, \T4, \T4 | |
2077 | ||
2078 | vpclmulqdq $0x00, \T5, \T1, \T3 | |
2079 | vpxor \T3, \T7, \T7 | |
2080 | ||
2081 | vpclmulqdq $0x01, \T5, \T1, \T3 | |
2082 | vpxor \T3, \T6, \T6 | |
2083 | ||
2084 | vpclmulqdq $0x10, \T5, \T1, \T3 | |
2085 | vpxor \T3, \T6, \T6 | |
2086 | ||
2087 | vmovdqu 16*7(arg1), \T1 | |
2088 | vaesenc \T1, \XMM1, \XMM1 | |
2089 | vaesenc \T1, \XMM2, \XMM2 | |
2090 | vaesenc \T1, \XMM3, \XMM3 | |
2091 | vaesenc \T1, \XMM4, \XMM4 | |
2092 | vaesenc \T1, \XMM5, \XMM5 | |
2093 | vaesenc \T1, \XMM6, \XMM6 | |
2094 | vaesenc \T1, \XMM7, \XMM7 | |
2095 | vaesenc \T1, \XMM8, \XMM8 | |
2096 | ||
2097 | vmovdqa TMP6(%rsp), \T1 | |
2098 | vmovdqa HashKey_3(arg1), \T5 | |
2099 | vpclmulqdq $0x11, \T5, \T1, \T3 | |
2100 | vpxor \T3, \T4, \T4 | |
2101 | ||
2102 | vpclmulqdq $0x00, \T5, \T1, \T3 | |
2103 | vpxor \T3, \T7, \T7 | |
2104 | ||
2105 | vpclmulqdq $0x01, \T5, \T1, \T3 | |
2106 | vpxor \T3, \T6, \T6 | |
2107 | ||
2108 | vpclmulqdq $0x10, \T5, \T1, \T3 | |
2109 | vpxor \T3, \T6, \T6 | |
2110 | ||
2111 | vmovdqu 16*8(arg1), \T1 | |
2112 | vaesenc \T1, \XMM1, \XMM1 | |
2113 | vaesenc \T1, \XMM2, \XMM2 | |
2114 | vaesenc \T1, \XMM3, \XMM3 | |
2115 | vaesenc \T1, \XMM4, \XMM4 | |
2116 | vaesenc \T1, \XMM5, \XMM5 | |
2117 | vaesenc \T1, \XMM6, \XMM6 | |
2118 | vaesenc \T1, \XMM7, \XMM7 | |
2119 | vaesenc \T1, \XMM8, \XMM8 | |
2120 | ||
2121 | vmovdqa TMP7(%rsp), \T1 | |
2122 | vmovdqa HashKey_2(arg1), \T5 | |
2123 | vpclmulqdq $0x11, \T5, \T1, \T3 | |
2124 | vpxor \T3, \T4, \T4 | |
2125 | ||
2126 | vpclmulqdq $0x00, \T5, \T1, \T3 | |
2127 | vpxor \T3, \T7, \T7 | |
2128 | ||
2129 | vpclmulqdq $0x01, \T5, \T1, \T3 | |
2130 | vpxor \T3, \T6, \T6 | |
2131 | ||
2132 | vpclmulqdq $0x10, \T5, \T1, \T3 | |
2133 | vpxor \T3, \T6, \T6 | |
2134 | ||
2135 | ||
2136 | ####################################################################### | |
2137 | ||
2138 | vmovdqu 16*9(arg1), \T5 | |
2139 | vaesenc \T5, \XMM1, \XMM1 | |
2140 | vaesenc \T5, \XMM2, \XMM2 | |
2141 | vaesenc \T5, \XMM3, \XMM3 | |
2142 | vaesenc \T5, \XMM4, \XMM4 | |
2143 | vaesenc \T5, \XMM5, \XMM5 | |
2144 | vaesenc \T5, \XMM6, \XMM6 | |
2145 | vaesenc \T5, \XMM7, \XMM7 | |
2146 | vaesenc \T5, \XMM8, \XMM8 | |
2147 | ||
2148 | vmovdqa TMP8(%rsp), \T1 | |
2149 | vmovdqa HashKey(arg1), \T5 | |
2150 | ||
2151 | vpclmulqdq $0x00, \T5, \T1, \T3 | |
2152 | vpxor \T3, \T7, \T7 | |
2153 | ||
2154 | vpclmulqdq $0x01, \T5, \T1, \T3 | |
2155 | vpxor \T3, \T6, \T6 | |
2156 | ||
2157 | vpclmulqdq $0x10, \T5, \T1, \T3 | |
2158 | vpxor \T3, \T6, \T6 | |
2159 | ||
2160 | vpclmulqdq $0x11, \T5, \T1, \T3 | |
2161 | vpxor \T3, \T4, \T1 | |
2162 | ||
2163 | ||
2164 | vmovdqu 16*10(arg1), \T5 | |
2165 | ||
2166 | i = 0 | |
2167 | j = 1 | |
2168 | setreg | |
2169 | .rep 8 | |
2170 | vpxor 16*i(arg3, %r11), \T5, \T2 | |
2171 | .if \ENC_DEC == ENC | |
2172 | vaesenclast \T2, reg_j, reg_j | |
2173 | .else | |
2174 | vaesenclast \T2, reg_j, \T3 | |
2175 | vmovdqu 16*i(arg3, %r11), reg_j | |
2176 | vmovdqu \T3, 16*i(arg2, %r11) | |
2177 | .endif | |
2178 | i = (i+1) | |
2179 | j = (j+1) | |
2180 | setreg | |
2181 | .endr | |
2182 | ####################################################################### | |
2183 | ||
2184 | ||
2185 | vpslldq $8, \T6, \T3 # shift-L T6 2 DWs (into T3) | |
2186 | vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs | |
2187 | vpxor \T3, \T7, \T7 | |
2188 | vpxor \T6, \T1, \T1 # accumulate the results in T1:T7 | |
2189 | ||
2190 | ||
2191 | ||
2192 | ####################################################################### | |
2193 | #first phase of the reduction | |
2194 | vmovdqa POLY2(%rip), \T3 | |
2195 | ||
2196 | vpclmulqdq $0x01, \T7, \T3, \T2 | |
2197 | vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs | |
2198 | ||
2199 | vpxor \T2, \T7, \T7 # first phase of the reduction complete | |
2200 | ####################################################################### | |
2201 | .if \ENC_DEC == ENC | |
2202 | vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer | |
2203 | vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer | |
2204 | vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer | |
2205 | vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer | |
2206 | vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer | |
2207 | vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer | |
2208 | vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer | |
2209 | vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer | |
2210 | .endif | |
2211 | ||
2212 | ####################################################################### | |
2213 | #second phase of the reduction | |
2214 | vpclmulqdq $0x00, \T7, \T3, \T2 | |
2215 | vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) | |
2216 | ||
2217 | vpclmulqdq $0x10, \T7, \T3, \T4 | |
2218 | vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) | |
2219 | ||
2220 | vpxor \T2, \T4, \T4 # second phase of the reduction complete | |
2221 | ####################################################################### | |
2222 | vpxor \T4, \T1, \T1 # the result is in T1 | |
2223 | ||
2224 | vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap | |
2225 | vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap | |
2226 | vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap | |
2227 | vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap | |
2228 | vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap | |
2229 | vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap | |
2230 | vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap | |
2231 | vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap | |
2232 | ||
2233 | ||
2234 | vpxor \T1, \XMM1, \XMM1 | |
2235 | ||
2236 | ||
2237 | ||
2238 | .endm | |
2239 | ||
2240 | ||
2241 | # GHASH the last 8 ciphertext blocks. | |
2242 | .macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 | |
2243 | ||
2244 | ## Karatsuba Method | |
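## Each block uses three vpclmulqdq instead of four: the middle product is
## built from (a1 xor a0)*(b1 xor b0), and the accumulated high (T6) and
## low (T7) products are XORed out of the accumulated middle term at the
## end, before the two-phase reduction.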
2245 | ||
2246 | vmovdqa HashKey_8(arg1), \T5 | |
2247 | ||
2248 | vpshufd $0b01001110, \XMM1, \T2 | |
2249 | vpshufd $0b01001110, \T5, \T3 | |
2250 | vpxor \XMM1, \T2, \T2 | |
2251 | vpxor \T5, \T3, \T3 | |
2252 | ||
2253 | vpclmulqdq $0x11, \T5, \XMM1, \T6 | |
2254 | vpclmulqdq $0x00, \T5, \XMM1, \T7 | |
2255 | ||
2256 | vpclmulqdq $0x00, \T3, \T2, \XMM1 | |
2257 | ||
2258 | ###################### | |
2259 | ||
2260 | vmovdqa HashKey_7(arg1), \T5 | |
2261 | vpshufd $0b01001110, \XMM2, \T2 | |
2262 | vpshufd $0b01001110, \T5, \T3 | |
2263 | vpxor \XMM2, \T2, \T2 | |
2264 | vpxor \T5, \T3, \T3 | |
2265 | ||
2266 | vpclmulqdq $0x11, \T5, \XMM2, \T4 | |
2267 | vpxor \T4, \T6, \T6 | |
2268 | ||
2269 | vpclmulqdq $0x00, \T5, \XMM2, \T4 | |
2270 | vpxor \T4, \T7, \T7 | |
2271 | ||
2272 | vpclmulqdq $0x00, \T3, \T2, \T2 | |
2273 | ||
2274 | vpxor \T2, \XMM1, \XMM1 | |
2275 | ||
2276 | ###################### | |
2277 | ||
2278 | vmovdqa HashKey_6(arg1), \T5 | |
2279 | vpshufd $0b01001110, \XMM3, \T2 | |
2280 | vpshufd $0b01001110, \T5, \T3 | |
2281 | vpxor \XMM3, \T2, \T2 | |
2282 | vpxor \T5, \T3, \T3 | |
2283 | ||
2284 | vpclmulqdq $0x11, \T5, \XMM3, \T4 | |
2285 | vpxor \T4, \T6, \T6 | |
2286 | ||
2287 | vpclmulqdq $0x00, \T5, \XMM3, \T4 | |
2288 | vpxor \T4, \T7, \T7 | |
2289 | ||
2290 | vpclmulqdq $0x00, \T3, \T2, \T2 | |
2291 | ||
2292 | vpxor \T2, \XMM1, \XMM1 | |
2293 | ||
2294 | ###################### | |
2295 | ||
2296 | vmovdqa HashKey_5(arg1), \T5 | |
2297 | vpshufd $0b01001110, \XMM4, \T2 | |
2298 | vpshufd $0b01001110, \T5, \T3 | |
2299 | vpxor \XMM4, \T2, \T2 | |
2300 | vpxor \T5, \T3, \T3 | |
2301 | ||
2302 | vpclmulqdq $0x11, \T5, \XMM4, \T4 | |
2303 | vpxor \T4, \T6, \T6 | |
2304 | ||
2305 | vpclmulqdq $0x00, \T5, \XMM4, \T4 | |
2306 | vpxor \T4, \T7, \T7 | |
2307 | ||
2308 | vpclmulqdq $0x00, \T3, \T2, \T2 | |
2309 | ||
2310 | vpxor \T2, \XMM1, \XMM1 | |
2311 | ||
2312 | ###################### | |
2313 | ||
2314 | vmovdqa HashKey_4(arg1), \T5 | |
2315 | vpshufd $0b01001110, \XMM5, \T2 | |
2316 | vpshufd $0b01001110, \T5, \T3 | |
2317 | vpxor \XMM5, \T2, \T2 | |
2318 | vpxor \T5, \T3, \T3 | |
2319 | ||
2320 | vpclmulqdq $0x11, \T5, \XMM5, \T4 | |
2321 | vpxor \T4, \T6, \T6 | |
2322 | ||
2323 | vpclmulqdq $0x00, \T5, \XMM5, \T4 | |
2324 | vpxor \T4, \T7, \T7 | |
2325 | ||
2326 | vpclmulqdq $0x00, \T3, \T2, \T2 | |
2327 | ||
2328 | vpxor \T2, \XMM1, \XMM1 | |
2329 | ||
2330 | ###################### | |
2331 | ||
2332 | vmovdqa HashKey_3(arg1), \T5 | |
2333 | vpshufd $0b01001110, \XMM6, \T2 | |
2334 | vpshufd $0b01001110, \T5, \T3 | |
2335 | vpxor \XMM6, \T2, \T2 | |
2336 | vpxor \T5, \T3, \T3 | |
2337 | ||
2338 | vpclmulqdq $0x11, \T5, \XMM6, \T4 | |
2339 | vpxor \T4, \T6, \T6 | |
2340 | ||
2341 | vpclmulqdq $0x00, \T5, \XMM6, \T4 | |
2342 | vpxor \T4, \T7, \T7 | |
2343 | ||
2344 | vpclmulqdq $0x00, \T3, \T2, \T2 | |
2345 | ||
2346 | vpxor \T2, \XMM1, \XMM1 | |
2347 | ||
2348 | ###################### | |
2349 | ||
2350 | vmovdqa HashKey_2(arg1), \T5 | |
2351 | vpshufd $0b01001110, \XMM7, \T2 | |
2352 | vpshufd $0b01001110, \T5, \T3 | |
2353 | vpxor \XMM7, \T2, \T2 | |
2354 | vpxor \T5, \T3, \T3 | |
2355 | ||
2356 | vpclmulqdq $0x11, \T5, \XMM7, \T4 | |
2357 | vpxor \T4, \T6, \T6 | |
2358 | ||
2359 | vpclmulqdq $0x00, \T5, \XMM7, \T4 | |
2360 | vpxor \T4, \T7, \T7 | |
2361 | ||
2362 | vpclmulqdq $0x00, \T3, \T2, \T2 | |
2363 | ||
2364 | vpxor \T2, \XMM1, \XMM1 | |
2365 | ||
2366 | ###################### | |
2367 | ||
2368 | vmovdqa HashKey(arg1), \T5 | |
2369 | vpshufd $0b01001110, \XMM8, \T2 | |
2370 | vpshufd $0b01001110, \T5, \T3 | |
2371 | vpxor \XMM8, \T2, \T2 | |
2372 | vpxor \T5, \T3, \T3 | |
2373 | ||
2374 | vpclmulqdq $0x11, \T5, \XMM8, \T4 | |
2375 | vpxor \T4, \T6, \T6 | |
2376 | ||
2377 | vpclmulqdq $0x00, \T5, \XMM8, \T4 | |
2378 | vpxor \T4, \T7, \T7 | |
2379 | ||
2380 | vpclmulqdq $0x00, \T3, \T2, \T2 | |
2381 | ||
2382 | vpxor \T2, \XMM1, \XMM1 | |
2383 | vpxor \T6, \XMM1, \XMM1 | |
2384 | vpxor \T7, \XMM1, \T2 | |
2385 | ||
2386 | ||
2387 | ||
2388 | ||
2389 | vpslldq $8, \T2, \T4 | |
2390 | vpsrldq $8, \T2, \T2 | |
2391 | ||
2392 | vpxor \T4, \T7, \T7 | |
2393 | vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the | |
2394 | # accumulated carry-less multiplications | |
2395 | ||
2396 | ####################################################################### | |
2397 | #first phase of the reduction | |
2398 | vmovdqa POLY2(%rip), \T3 | |
2399 | ||
2400 | vpclmulqdq $0x01, \T7, \T3, \T2 | |
2401 | vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs | |
2402 | ||
2403 | vpxor \T2, \T7, \T7 # first phase of the reduction complete | |
2404 | ####################################################################### | |
2405 | ||
2406 | ||
2407 | #second phase of the reduction | |
2408 | vpclmulqdq $0x00, \T7, \T3, \T2 | |
2409 | vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) | |
2410 | ||
2411 | vpclmulqdq $0x10, \T7, \T3, \T4 | |
2412 | vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) | |
2413 | ||
2414 | vpxor \T2, \T4, \T4 # second phase of the reduction complete | |
2415 | ####################################################################### | |
2416 | vpxor \T4, \T6, \T6 # the result is in T6 | |
2417 | .endm | |
2418 | ||
2419 | ||
2420 | ||
2421 | # combined for GCM encrypt and decrypt functions | |
2422 | # clobbering all xmm registers | |
2423 | # clobbering r10, r11, r12, r13, r14, r15 | |
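# Flow: encrypt/GHASH 0-7 initial blocks so the remaining length becomes a
# multiple of 128 bytes, run the 8-blocks-at-a-time main loop, fold the last
# 8 ciphertext blocks with GHASH_LAST_8_AVX2, handle a trailing partial block
# if any, then compute the tag from len(A)||len(C) and E(K, Y0).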
2424 | .macro GCM_ENC_DEC_AVX2 ENC_DEC | |
2425 | ||
2426 | #the number of pushes must equal STACK_OFFSET | |
2427 | push %r12 | |
2428 | push %r13 | |
2429 | push %r14 | |
2430 | push %r15 | |
2431 | ||
2432 | mov %rsp, %r14 | |
2433 | ||
2434 | ||
2435 | ||
2436 | ||
2437 | sub $VARIABLE_OFFSET, %rsp | |
2438 | and $~63, %rsp # align rsp to 64 bytes | |
2439 | ||
2440 | ||
2441 | vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey | |
2442 | ||
2443 | mov arg4, %r13 # save the number of bytes of plaintext/ciphertext | |
2444 | and $-16, %r13 # r13 = r13 - (r13 mod 16) | |
2445 | ||
2446 | mov %r13, %r12 | |
2447 | shr $4, %r12 | |
2448 | and $7, %r12 | |
2449 | jz _initial_num_blocks_is_0\@ | |
2450 | ||
2451 | cmp $7, %r12 | |
2452 | je _initial_num_blocks_is_7\@ | |
2453 | cmp $6, %r12 | |
2454 | je _initial_num_blocks_is_6\@ | |
2455 | cmp $5, %r12 | |
2456 | je _initial_num_blocks_is_5\@ | |
2457 | cmp $4, %r12 | |
2458 | je _initial_num_blocks_is_4\@ | |
2459 | cmp $3, %r12 | |
2460 | je _initial_num_blocks_is_3\@ | |
2461 | cmp $2, %r12 | |
2462 | je _initial_num_blocks_is_2\@ | |
2463 | ||
2464 | jmp _initial_num_blocks_is_1\@ | |
2465 | ||
2466 | _initial_num_blocks_is_7\@: | |
2467 | INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | |
2468 | sub $16*7, %r13 | |
2469 | jmp _initial_blocks_encrypted\@ | |
2470 | ||
2471 | _initial_num_blocks_is_6\@: | |
2472 | INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | |
2473 | sub $16*6, %r13 | |
2474 | jmp _initial_blocks_encrypted\@ | |
2475 | ||
2476 | _initial_num_blocks_is_5\@: | |
2477 | INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | |
2478 | sub $16*5, %r13 | |
2479 | jmp _initial_blocks_encrypted\@ | |
2480 | ||
2481 | _initial_num_blocks_is_4\@: | |
2482 | INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | |
2483 | sub $16*4, %r13 | |
2484 | jmp _initial_blocks_encrypted\@ | |
2485 | ||
2486 | _initial_num_blocks_is_3\@: | |
2487 | INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | |
2488 | sub $16*3, %r13 | |
2489 | jmp _initial_blocks_encrypted\@ | |
2490 | ||
2491 | _initial_num_blocks_is_2\@: | |
2492 | INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | |
2493 | sub $16*2, %r13 | |
2494 | jmp _initial_blocks_encrypted\@ | |
2495 | ||
2496 | _initial_num_blocks_is_1\@: | |
2497 | INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | |
2498 | sub $16*1, %r13 | |
2499 | jmp _initial_blocks_encrypted\@ | |
2500 | ||
2501 | _initial_num_blocks_is_0\@: | |
2502 | INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | |
2503 | ||
2504 | ||
2505 | _initial_blocks_encrypted\@: | |
2506 | cmp $0, %r13 | |
2507 | je _zero_cipher_left\@ | |
2508 | ||
2509 | sub $128, %r13 | |
2510 | je _eight_cipher_left\@ | |
2511 | ||
2512 | ||
2513 | ||
2514 | ||
2515 | vmovd %xmm9, %r15d | |
2516 | and $255, %r15d | |
2517 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | |
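# r15d caches the low byte of the counter. As long as adding 8 cannot wrap
# that byte, the counters stay in AES byte order and are incremented with
# ONEf (out_order), avoiding a byte swap per block; when the byte is about
# to wrap, the in_order path byte-swaps, increments with ONE so the carry
# propagates, and swaps back.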
2518 | ||
2519 | ||
2520 | _encrypt_by_8_new\@: | |
2521 | cmp $(255-8), %r15d | |
2522 | jg _encrypt_by_8\@ | |
2523 | ||
2524 | ||
2525 | ||
2526 | add $8, %r15b | |
2527 | GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC | |
2528 | add $128, %r11 | |
2529 | sub $128, %r13 | |
2530 | jne _encrypt_by_8_new\@ | |
2531 | ||
2532 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | |
2533 | jmp _eight_cipher_left\@ | |
2534 | ||
2535 | _encrypt_by_8\@: | |
2536 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | |
2537 | add $8, %r15b | |
2538 | GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC | |
2539 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | |
2540 | add $128, %r11 | |
2541 | sub $128, %r13 | |
2542 | jne _encrypt_by_8_new\@ | |
2543 | ||
2544 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | |
2545 | ||
2546 | ||
2547 | ||
2548 | ||
2549 | _eight_cipher_left\@: | |
2550 | GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 | |
2551 | ||
2552 | ||
2553 | _zero_cipher_left\@: | |
2554 | cmp $16, arg4 | |
2555 | jl _only_less_than_16\@ | |
2556 | ||
2557 | mov arg4, %r13 | |
2558 | and $15, %r13 # r13 = (arg4 mod 16) | |
2559 | ||
2560 | je _multiple_of_16_bytes\@ | |
2561 | ||
2562 | # handle the last <16 Byte block separately | |
2563 | ||
2564 | ||
2565 | vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn | |
2566 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | |
2567 | ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) | |
2568 | ||
2569 | sub $16, %r11 | |
2570 | add %r13, %r11 | |
2571 | vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block | |
2572 | ||
2573 | lea SHIFT_MASK+16(%rip), %r12 | |
2574 | sub %r13, %r12 # adjust the shuffle mask pointer | |
2575 | # to be able to shift 16-r13 bytes | |
2576 | # (r13 is the number of bytes in plaintext mod 16) | |
2577 | vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask | |
2578 | vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes | |
2579 | jmp _final_ghash_mul\@ | |
2580 | ||
2581 | _only_less_than_16\@: | |
2582 | # check for 0 length | |
2583 | mov arg4, %r13 | |
2584 | and $15, %r13 # r13 = (arg4 mod 16) | |
2585 | ||
2586 | je _multiple_of_16_bytes\@ | |
2587 | ||
2588 | # handle the last <16 Byte block separately | |
2589 | ||
2590 | ||
2591 | vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn | |
2592 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | |
2593 | ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) | |
2594 | ||
2595 | ||
2596 | lea SHIFT_MASK+16(%rip), %r12 | |
2597 | sub %r13, %r12 # adjust the shuffle mask pointer to be | |
2598 | # able to shift 16-r13 bytes (r13 is the | |
2599 | # number of bytes in plaintext mod 16) | |
2600 | ||
2601 | _get_last_16_byte_loop\@: | |
2602 | movb (arg3, %r11), %al | |
2603 | movb %al, TMP1 (%rsp , %r11) | |
2604 | add $1, %r11 | |
2605 | cmp %r13, %r11 | |
2606 | jne _get_last_16_byte_loop\@ | |
2607 | ||
2608 | vmovdqu TMP1(%rsp), %xmm1 | |
2609 | ||
2610 | sub $16, %r11 | |
2611 | ||
2612 | _final_ghash_mul\@: | |
2613 | .if \ENC_DEC == DEC | |
2614 | vmovdqa %xmm1, %xmm2 | |
2615 | vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) | |
2616 | vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 | |
2617 | vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 | |
2618 | vpand %xmm1, %xmm2, %xmm2 | |
2619 | vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 | |
2620 | vpxor %xmm2, %xmm14, %xmm14 | |
2621 | #GHASH computation for the last <16 Byte block | |
2622 | GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 | |
2623 | sub %r13, %r11 | |
2624 | add $16, %r11 | |
2625 | .else | |
2626 | vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) | |
2627 | vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 | |
2628 | vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 | |
2629 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | |
2630 | vpxor %xmm9, %xmm14, %xmm14 | |
2631 | #GHASH computation for the last <16 Byte block | |
2632 | GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 | |
2633 | sub %r13, %r11 | |
2634 | add $16, %r11 | |
2635 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext | |
2636 | .endif | |
2637 | ||
2638 | ||
2639 | ############################# | |
2640 | # output r13 Bytes | |
2641 | vmovq %xmm9, %rax | |
2642 | cmp $8, %r13 | |
2643 | jle _less_than_8_bytes_left\@ | |
2644 | ||
2645 | mov %rax, (arg2 , %r11) | |
2646 | add $8, %r11 | |
2647 | vpsrldq $8, %xmm9, %xmm9 | |
2648 | vmovq %xmm9, %rax | |
2649 | sub $8, %r13 | |
2650 | ||
2651 | _less_than_8_bytes_left\@: | |
2652 | movb %al, (arg2 , %r11) | |
2653 | add $1, %r11 | |
2654 | shr $8, %rax | |
2655 | sub $1, %r13 | |
2656 | jne _less_than_8_bytes_left\@ | |
2657 | ############################# | |
2658 | ||
2659 | _multiple_of_16_bytes\@: | |
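# Tag computation: GHASH the block len(A) || len(C) (both lengths in bits),
# byte-swap the result, XOR it with E(K, Y0), and return auth_tag_len bytes
# of it as the tag.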
2660 | mov arg7, %r12 # r12 = aadLen (number of bytes) | |
2661 | shl $3, %r12 # convert into number of bits | |
2662 | vmovd %r12d, %xmm15 # len(A) in xmm15 | |
2663 | ||
2664 | shl $3, arg4 # len(C) in bits (*8) | |
2665 | vmovq arg4, %xmm1 | |
2666 | vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 | |
2667 | vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) | |
2668 | ||
2669 | vpxor %xmm15, %xmm14, %xmm14 | |
2670 | GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation | |
2671 | vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap | |
2672 | ||
2673 | mov arg5, %rax # rax = *Y0 | |
2674 | vmovdqu (%rax), %xmm9 # xmm9 = Y0 | |
2675 | ||
2676 | ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) | |
2677 | ||
2678 | vpxor %xmm14, %xmm9, %xmm9 | |
2679 | ||
2680 | ||
2681 | ||
2682 | _return_T\@: | |
2683 | mov arg8, %r10 # r10 = authTag | |
2684 | mov arg9, %r11 # r11 = auth_tag_len | |
2685 | ||
2686 | cmp $16, %r11 | |
2687 | je _T_16\@ | |
2688 | ||
2689 | cmp $12, %r11 | |
2690 | je _T_12\@ | |
2691 | ||
2692 | _T_8\@: | |
2693 | vmovq %xmm9, %rax | |
2694 | mov %rax, (%r10) | |
2695 | jmp _return_T_done\@ | |
2696 | _T_12\@: | |
2697 | vmovq %xmm9, %rax | |
2698 | mov %rax, (%r10) | |
2699 | vpsrldq $8, %xmm9, %xmm9 | |
2700 | vmovd %xmm9, %eax | |
2701 | mov %eax, 8(%r10) | |
2702 | jmp _return_T_done\@ | |
2703 | ||
2704 | _T_16\@: | |
2705 | vmovdqu %xmm9, (%r10) | |
2706 | ||
2707 | _return_T_done\@: | |
2708 | mov %r14, %rsp | |
2709 | ||
2710 | pop %r15 | |
2711 | pop %r14 | |
2712 | pop %r13 | |
2713 | pop %r12 | |
2714 | .endm | |
2715 | ||
2716 | ||
2717 | ############################################################# | |
2718 | #void aesni_gcm_precomp_avx_gen4 | |
2719 | # (gcm_data *my_ctx_data, | |
2720 | # u8 *hash_subkey)# /* H, the Hash sub key input. | |
2721 | # Data starts on a 16-byte boundary. */ | |
2722 | ############################################################# | |
2723 | ENTRY(aesni_gcm_precomp_avx_gen4) | |
2724 | #the number of pushes must equal STACK_OFFSET | |
2725 | push %r12 | |
2726 | push %r13 | |
2727 | push %r14 | |
2728 | push %r15 | |
2729 | ||
2730 | mov %rsp, %r14 | |
2731 | ||
2732 | ||
2733 | ||
2734 | sub $VARIABLE_OFFSET, %rsp | |
2735 | and $~63, %rsp # align rsp to 64 bytes | |
2736 | ||
2737 | vmovdqu (arg2), %xmm6 # xmm6 = HashKey | |
2738 | ||
2739 | vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 | |
2740 | ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey | |
2741 | vmovdqa %xmm6, %xmm2 | |
2742 | vpsllq $1, %xmm6, %xmm6 | |
2743 | vpsrlq $63, %xmm2, %xmm2 | |
2744 | vmovdqa %xmm2, %xmm1 | |
2745 | vpslldq $8, %xmm2, %xmm2 | |
2746 | vpsrldq $8, %xmm1, %xmm1 | |
2747 | vpor %xmm2, %xmm6, %xmm6 | |
2748 | #reduction | |
2749 | vpshufd $0b00100100, %xmm1, %xmm2 | |
2750 | vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 | |
2751 | vpand POLY(%rip), %xmm2, %xmm2 | |
2752 | vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly | |
2753 | ####################################################################### | |
2754 | vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly | |
2755 | ||
2756 | ||
2757 | PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 | |
2758 | ||
2759 | mov %r14, %rsp | |
2760 | ||
2761 | pop %r15 | |
2762 | pop %r14 | |
2763 | pop %r13 | |
2764 | pop %r12 | |
2765 | ret | |
2766 | ENDPROC(aesni_gcm_precomp_avx_gen4) | |
2767 | ||
2768 | ||
2769 | ############################################################################### | |
2770 | #void aesni_gcm_enc_avx_gen4( | |
2771 | # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ | |
2772 | # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ | |
2773 | # const u8 *in, /* Plaintext input */ | |
2774 | # u64 plaintext_len, /* Length of data in Bytes for encryption. */ | |
2775 | # u8 *iv, /* Pre-counter block j0: 4 byte salt | |
2776 | # (from Security Association) concatenated with 8 byte | |
2777 | # Initialisation Vector (from IPSec ESP Payload) | |
2778 | # concatenated with 0x00000001. 16-byte aligned pointer. */ | |
2779 | # const u8 *aad, /* Additional Authentication Data (AAD)*/ | |
2780 | # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ | |
2781 | # u8 *auth_tag, /* Authenticated Tag output. */ | |
2782 | # u64 auth_tag_len)# /* Authenticated Tag Length in bytes. | |
2783 | # Valid values are 16 (most likely), 12 or 8. */ | |
2784 | ############################################################################### | |
2785 | ENTRY(aesni_gcm_enc_avx_gen4) | |
2786 | GCM_ENC_DEC_AVX2 ENC | |
2787 | ret | |
2788 | ENDPROC(aesni_gcm_enc_avx_gen4) | |
2789 | ||
2790 | ############################################################################### | |
2791 | #void aesni_gcm_dec_avx_gen4( | |
2792 | # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ | |
2793 | # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ | |
2794 | # const u8 *in, /* Ciphertext input */ | |
2795 | # u64 plaintext_len, /* Length of data in Bytes for encryption. */ | |
2796 | # u8 *iv, /* Pre-counter block j0: 4 byte salt | |
2797 | # (from Security Association) concatenated with 8 byte | |
2798 | # Initialisation Vector (from IPSec ESP Payload) | |
2799 | # concatenated with 0x00000001. 16-byte aligned pointer. */ | |
2800 | # const u8 *aad, /* Additional Authentication Data (AAD)*/ | |
2801 | # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ | |
2802 | # u8 *auth_tag, /* Authenticated Tag output. */ | |
2803 | # u64 auth_tag_len)# /* Authenticated Tag Length in bytes. | |
2804 | # Valid values are 16 (most likely), 12 or 8. */ | |
2805 | ############################################################################### | |
2806 | ENTRY(aesni_gcm_dec_avx_gen4) | |
2807 | GCM_ENC_DEC_AVX2 DEC | |
2808 | ret | |
2809 | ENDPROC(aesni_gcm_dec_avx_gen4) | |
2810 | ||
2811 | #endif /* CONFIG_AS_AVX2 */ |