Commit | Line | Data |
---|---|---|
22cddcc7 | 1 | /* |
2 | * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64) | |
3 | * | |
4 | * This is AES128/192/256 CTR mode optimization implementation. It requires | |
5 | * the support of Intel(R) AESNI and AVX instructions. | |
6 | * | |
7 | * This work was inspired by the AES CTR mode optimization published | |
8 | * in Intel Optimized IPSEC Cryptographic library. | |
9 | * Additional information on it can be found at: | |
10 | * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972 | |
11 | * | |
12 | * This file is provided under a dual BSD/GPLv2 license. When using or | |
13 | * redistributing this file, you may do so under either license. | |
14 | * | |
15 | * GPL LICENSE SUMMARY | |
16 | * | |
17 | * Copyright(c) 2014 Intel Corporation. | |
18 | * | |
19 | * This program is free software; you can redistribute it and/or modify | |
20 | * it under the terms of version 2 of the GNU General Public License as | |
21 | * published by the Free Software Foundation. | |
22 | * | |
23 | * This program is distributed in the hope that it will be useful, but | |
24 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
26 | * General Public License for more details. | |
27 | * | |
28 | * Contact Information: | |
29 | * James Guilford <james.guilford@intel.com> | |
30 | * Sean Gulley <sean.m.gulley@intel.com> | |
31 | * Chandramouli Narayanan <mouli@linux.intel.com> | |
32 | * | |
33 | * BSD LICENSE | |
34 | * | |
35 | * Copyright(c) 2014 Intel Corporation. | |
36 | * | |
37 | * Redistribution and use in source and binary forms, with or without | |
38 | * modification, are permitted provided that the following conditions | |
39 | * are met: | |
40 | * | |
41 | * Redistributions of source code must retain the above copyright | |
42 | * notice, this list of conditions and the following disclaimer. | |
43 | * Redistributions in binary form must reproduce the above copyright | |
44 | * notice, this list of conditions and the following disclaimer in | |
45 | * the documentation and/or other materials provided with the | |
46 | * distribution. | |
47 | * Neither the name of Intel Corporation nor the names of its | |
48 | * contributors may be used to endorse or promote products derived | |
49 | * from this software without specific prior written permission. | |
50 | * | |
51 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
52 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
53 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
54 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
55 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
56 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
57 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
58 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
59 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
60 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
61 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
62 | * | |
63 | */ | |
64 | ||
65 | #include <linux/linkage.h> | |
66 | #include <asm/inst.h> | |
67 | ||
#define CONCAT(a,b)	a##b
#define VMOVDQ		vmovdqu		/* unaligned load/store for in/out data */

/* eight AES state registers - "by8" = up to 8 blocks processed in parallel */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
#define xcounter	%xmm8	/* running 128-bit counter (little-endian in reg) */
#define xbyteswap	%xmm9	/* byte-reversal shuffle mask (byteswap_const) */
/* four cached round keys; for AES-128 these hold rounds 0/3/6/9, for
 * AES-192/256 rounds 0/4/8/12 - see .Lmult_of_8_blks in do_aes_ctrmain */
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
/* scratch: streams the non-cached round keys, then the plaintext XOR */
#define xkeyA		%xmm14
#define xkeyB		%xmm15

/* function arguments (System V AMD64 calling convention) */
#define p_in		%rdi	/* input buffer */
#define p_iv		%rsi	/* IV / initial counter block (updated on return) */
#define p_keys		%rdx	/* expanded AES key schedule, 16-byte aligned */
#define p_out		%rcx	/* output buffer */
#define num_bytes	%r8	/* byte count; assumes whole 16-byte blocks - see do_aes_ctrmain */

#define tmp		%r10	/* scratch: residual (mod 8 blocks) byte count */
#define	DDQ(i)		CONCAT(ddq_add_,i)	/* builds symbol name ddq_add_<i> */
#define XMM(i)		CONCAT(%xmm, i)		/* builds register name %xmm<i> */
#define DDQ_DATA	0	/* 'club' selector: bind var_ddq_add */
#define XDATA		1	/* 'club' selector: bind var_xdata */
#define KEY_128	1
#define KEY_192	2
#define KEY_256	3
102 | ||
.section .rodata
.align 16

/* shuffle mask reversing byte order: converts the counter between the
 * little-endian register layout and big-endian on-the-wire CTR format */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* selects the low 64 bits of the counter; used with vptest - ZF set
 * (i.e. low qword == 0) means the low half just wrapped */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* +1 into the high qword: the carry applied when the low half wraps */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
/* per-lane counter increments for up to 8 parallel blocks */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008
128 | ||
.text

/* generate a unique variable for ddq_add_x */

.macro setddq n
	var_ddq_add = DDQ(\n)
.endm

/* generate a unique variable for xmm register */
.macro setxdata n
	var_xdata = XMM(\n)
.endm

/*
 * club the numeric 'id' to the symbol 'name'.
 * Under .altmacro, %\id expands the macro argument to its numeric value,
 * so DDQ_DATA binds var_ddq_add = ddq_add_<id> and XDATA binds
 * var_xdata = %xmm<id>.  This is how the .rept loops below index
 * registers and increment constants by loop counter.
 */
.macro club name, id
.altmacro
	.if \name == DDQ_DATA
		setddq %\id
	.elseif \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm
153 | ||
/*
 * do_aes num_in_par load_keys key_len
 *
 * Encrypt 'b' (1..8) consecutive counter blocks with AES-NI and XOR the
 * keystream into the input to produce the output (CTR mode).
 * 'k' (load_keys) selects whether the cached round keys
 * (xkey0/xkey4/xkey8/xkey12) are (re)loaded from p_keys; 'key_len' is
 * KEY_128/KEY_192/KEY_256.  All remaining round keys stream through
 * xkeyA/xkeyB, interleaved with the vaesenc rounds to hide load latency.
 *
 * This increments p_in, but not p_out
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
	vmovdqa	0*16(p_keys), xkey0
	.endif

	/* block 0 uses the current counter value (byte-swapped to BE) */
	vpshufb	xbyteswap, xcounter, xdata0

	/* blocks 1..by-1: counter + i, with carry into the high qword
	 * when the low 64 bits wrap to zero (vptest sets ZF) */
	.set i, 1
	.rept (by - 1)
		club DDQ_DATA, i
		club XDATA, i
		vpaddq	var_ddq_add(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz 1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	/* round 0 (AddRoundKey) for block 0, interleaved with advancing
	 * xcounter by 'by' - again carrying into the high qword on wrap */
	vpxor	xkey0, xdata0, xdata0
	club DDQ_DATA, by
	vpaddq	var_ddq_add(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	/* round 0 for the remaining blocks */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	/* key 3 lives in cached xkey4 for AES-128, in scratch xkeyA otherwise */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	/* input consumed below via negative offsets; p_out advances in caller */
	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10 - final round for AES-128 */
		.if (klen == KEY_128)
			vaesenclast xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	/* extra rounds for AES-192/256 */
	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				/* key 12 - final round for AES-192 */
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14 - final round for AES-256 */
				vaesenclast xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif

	/* XOR keystream with input, two blocks per iteration; p_in was
	 * already advanced, hence the negative offsets.  xkeyA/xkeyB are
	 * dead as round keys here and serve as load scratch. */
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	/* odd 'by': one unpaired block remains */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	/* store the 'by' output blocks */
	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm
428 | ||
/* do_aes with the cached round keys (re)loaded from p_keys */
.macro do_aes_load val, key_len
	do_aes \val, 1, \key_len
.endm

/* do_aes reusing xkey0/xkey4/xkey8/xkey12 already resident in registers */
.macro do_aes_noload val, key_len
	do_aes \val, 0, \key_len
.endm
436 | ||
/* main body of aes ctr load */

/*
 * do_aes_ctrmain key_len
 *
 * Shared body of the three entry points below.  First processes the
 * residual (num_bytes mod 8 blocks) via a dispatch on the block count,
 * then runs the 8-blocks-at-a-time main loop with cached round keys,
 * and finally writes the updated IV back to p_iv.
 *
 * NOTE(review): num_bytes is assumed to be a whole number of 16-byte
 * blocks; a tail below 16 bytes is not processed here (the initial
 * 'jb' returns, and the masks ignore the low 4 bits) - presumably the
 * caller handles partial blocks; confirm at the call site.
 */
.macro do_aes_ctrmain key_len
	cmp	$16, num_bytes
	jb	.Ldo_return2\key_len	/* less than one block: nothing to do */

	vmovdqa	byteswap_const(%rip), xbyteswap
	/* load the IV; counter is kept byte-swapped (LE) in xcounter */
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	/* tmp = residual bytes beyond a multiple of 8 blocks: (1..7)*16 */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* 1 <= tmp/16 <= 7: binary dispatch on residual block count */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	/* drop the residual block count; what remains is a multiple of
	 * 8 blocks (relies on the low 4 bits of num_bytes being zero) */
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len


.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
	/* cache four round keys for the noload main loop; the AES-128
	 * schedule is shorter, so a different set of rounds is kept
	 * resident (matches the klen cases inside do_aes) */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a multiple of 8 blocks (8*16 bytes) and > 0 */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV (byte-swapped back to big-endian) */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
	ret
.endm
542 | ||
/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * (NOTE(review): in-kernel, presumably via kernel_fpu_begin/end in the
 * C caller - confirm at the call site)
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128

ENDPROC(aes_ctr_enc_128_avx_by8)
555 | ||
/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192

ENDPROC(aes_ctr_enc_192_avx_by8)
568 | ||
/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256

ENDPROC(aes_ctr_enc_256_avx_by8)