22cddcc7 |
1 | /* |
2 | * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64) |
3 | * |
4 | * This is AES128/192/256 CTR mode optimization implementation. It requires |
5 | * the support of Intel(R) AESNI and AVX instructions. |
6 | * |
7 | * This work was inspired by the AES CTR mode optimization published |
8 | * in Intel Optimized IPSEC Cryptographic library. |
9 | * Additional information on it can be found at: |
10 | * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972 |
11 | * |
12 | * This file is provided under a dual BSD/GPLv2 license. When using or |
13 | * redistributing this file, you may do so under either license. |
14 | * |
15 | * GPL LICENSE SUMMARY |
16 | * |
17 | * Copyright(c) 2014 Intel Corporation. |
18 | * |
19 | * This program is free software; you can redistribute it and/or modify |
20 | * it under the terms of version 2 of the GNU General Public License as |
21 | * published by the Free Software Foundation. |
22 | * |
23 | * This program is distributed in the hope that it will be useful, but |
24 | * WITHOUT ANY WARRANTY; without even the implied warranty of |
25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
26 | * General Public License for more details. |
27 | * |
28 | * Contact Information: |
29 | * James Guilford <james.guilford@intel.com> |
30 | * Sean Gulley <sean.m.gulley@intel.com> |
31 | * Chandramouli Narayanan <mouli@linux.intel.com> |
32 | * |
33 | * BSD LICENSE |
34 | * |
35 | * Copyright(c) 2014 Intel Corporation. |
36 | * |
37 | * Redistribution and use in source and binary forms, with or without |
38 | * modification, are permitted provided that the following conditions |
39 | * are met: |
40 | * |
41 | * Redistributions of source code must retain the above copyright |
42 | * notice, this list of conditions and the following disclaimer. |
43 | * Redistributions in binary form must reproduce the above copyright |
44 | * notice, this list of conditions and the following disclaimer in |
45 | * the documentation and/or other materials provided with the |
46 | * distribution. |
47 | * Neither the name of Intel Corporation nor the names of its |
48 | * contributors may be used to endorse or promote products derived |
49 | * from this software without specific prior written permission. |
50 | * |
51 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
52 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
53 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
54 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
55 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
56 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
57 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
58 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
59 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
60 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
61 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
62 | * |
63 | */ |
64 | |
65 | #include <linux/linkage.h> |
66 | #include <asm/inst.h> |
67 | |
/* ---- preprocessor helpers ------------------------------------------- */
#define CONCAT(a,b)	a##b
#define	DDQ(i)		CONCAT(ddq_add_,i)	/* name of the constant ddq_add_<i> */
#define	XMM(i)		CONCAT(%xmm, i)		/* name of the register %xmm<i>     */

/* unaligned 128-bit move used for the input and output buffers */
#define VMOVDQ		vmovdqu

/* ---- selectors understood by the 'club' macro ------------------------ */
#define	DDQ_DATA	0
#define	XDATA		1

/* ---- key-length identifiers passed to do_aes / do_aes_ctrmain -------- */
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3

/* ---- general purpose registers: the five arguments plus one scratch -- */
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8

#define tmp		%r10

/* ---- XMM registers ---------------------------------------------------- */
/* up to eight blocks processed in parallel */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7

/* running counter and the shuffle mask applied to it */
#define xcounter	%xmm8
#define xbyteswap	%xmm9

/* round keys that may stay cached across do_aes invocations */
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13

/*
 * Alternative names for the same three registers as xkey4/xkey8/xkey12.
 * None of the code visible in this file refers to them.
 */
#define xkey3		%xmm11
#define xkey6		%xmm12
#define xkey9		%xmm13

/* scratch registers for round keys that are loaded right before use */
#define xkeyA		%xmm14
#define xkeyB		%xmm15
105 | |
.section	.rodata
.align 16

/*
 * vpshufb control mask: output byte i is taken from input byte 15 - i,
 * i.e. the 16 bytes of a block are reversed.
 */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F

/* ddq_add_<n> holds the 128-bit integer n, for n = 1 .. 8 */
.irp	n, 1, 2, 3, 4, 5, 6, 7, 8
ddq_add_\n:
	.octa	\n
.endr

.text
129 | |
/* make the assembler symbol var_ddq_add refer to the constant ddq_add_<n> */

.macro setddq n
	.set	var_ddq_add, DDQ(\n)
.endm
135 | |
/* make the assembler symbol var_xdata refer to the register %xmm<n> */
.macro setxdata n
	.set	var_xdata, XMM(\n)
.endm
140 | |
/*
 * club name, id
 *
 * Evaluate the expression 'id' to a number (via .altmacro's % operator) and
 * bind it to the symbol selected by 'name':
 *   DDQ_DATA -> var_ddq_add becomes ddq_add_<id>
 *   XDATA    -> var_xdata   becomes %xmm<id>
 * Any other 'name' leaves both symbols untouched.
 */

.macro club name, id
.altmacro
	.if \name == DDQ_DATA
		setddq %\id
	.endif
	.if \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm
152 | |
/*
 * Constants used by do_aes to carry a counter overflow from the low into
 * the high quadword of the (byte-swapped) 128-bit counter.
 */
.pushsection .rodata
.align 16
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
.popsection

/*
 * do_aes num_in_par load_keys key_len
 *
 * Generate 'b' consecutive counter blocks, encrypt them with the key
 * schedule at p_keys and XOR the result with 'b' 16-byte input blocks.
 *
 *   b        number of blocks handled in parallel (1 .. 8)
 *   k        non-zero: load the cached round keys (xkey0, xkey4, xkey8 and,
 *            for 192/256-bit keys, xkey12) from p_keys;
 *            zero: they are expected to be loaded already
 *   key_len  KEY_128, KEY_192 or KEY_256
 *
 * In:   xcounter   counter value after it went through xbyteswap
 *       xbyteswap  byte shuffle mask
 *       p_in, p_out, p_keys
 * Out:  xcounter   advanced by 'b' (128-bit increment with carry)
 *       p_in       advanced by 16 * b
 *       p_out      NOT advanced; blocks are stored at 0 .. 16*(b-1)(p_out)
 * Clobbers xdata0 .. xdata<b-1>, xkeyA, xkeyB and the flags.
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	vpshufb	xbyteswap, xcounter, xdata0

	/*
	 * Blocks 1 .. by-1 use counter + i.  The addition is done on the low
	 * quadword; if that quadword becomes zero it has just wrapped, so the
	 * carry is added to the high quadword of this block and of xcounter
	 * (which makes all following blocks pick it up automatically).
	 */
	.set i, 1
	.rept (by - 1)
		club DDQ_DATA, i
		club XDATA, i
		vpaddq	var_ddq_add(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz	1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	vpxor	xkey0, xdata0, xdata0

	/* advance the running counter by 'by', again with carry propagation */
	club DDQ_DATA, by
	vpaddq	var_ddq_add(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	/*
	 * xkeyA still holds key 1 here, so key 3 has to be loaded on every
	 * invocation and for every key length.
	 */
	vmovdqa	3*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	add	$(16*by), p_in

	/* 128-bit keys always reload xkey4; other lengths keep it cached */
	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkey4
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 3 */
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkey4, var_xdata, var_xdata		/* key 4 */
		.set i, (i +1)
	.endr

	/* xkeyB still holds key 2: key 6 must always be loaded */
	vmovdqa	6*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 6 */
		.set i, (i +1)
	.endr

	/* 128-bit keys always reload xkey8; other lengths keep it cached */
	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkey8
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	/* xkeyA still holds key 7: key 9 must always be loaded */
	vmovdqa	9*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkey8, var_xdata, var_xdata		/* key 8 */
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 9 */
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10: final round for 128-bit keys */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			/* key 12: final round for 192-bit keys */
			.if (klen == KEY_256)
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14: final round for 256-bit keys */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif

	/*
	 * XOR the key stream with the input, two blocks at a time.  p_in was
	 * already advanced by 16*by, hence the negative displacements.
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	/* odd 'by': one block is left over */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm
393 | |
/* do_aes for 'val' blocks, loading the cached round keys first */
.macro do_aes_load val, key_len
	do_aes b=\val, k=1, key_len=\key_len
.endm
397 | |
/* do_aes for 'val' blocks, relying on round keys that are already cached */
.macro do_aes_noload val, key_len
	do_aes b=\val, k=0, key_len=\key_len
.endm
401 | |
/*
 * do_aes_ctrmain key_len
 *
 * Body shared by the aes_ctr_enc_{128,192,256}_avx_by8 entry points.
 *
 *   key_len  KEY_128, KEY_192 or KEY_256
 *
 * Processes num_bytes / 16 whole blocks; a trailing partial block is
 * ignored.  The (block count mod 8) leading blocks are handled by one
 * do_aes_load invocation, the rest in groups of eight by do_aes_noload.
 * Before returning, the advanced counter is written back to (p_iv).
 *
 * If num_bytes < 16 nothing is processed and (p_iv) is left untouched.
 *
 * Modifies p_in, p_out, num_bytes, tmp, the XMM registers and the flags.
 */

.macro do_aes_ctrmain key_len

	/*
	 * Less than one block: leave right away.  xcounter/xbyteswap are not
	 * loaded yet, so the IV write-back must be skipped as well.
	 */
	cmp	$16, num_bytes
	jb	.Lno_blocks\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* tmp = 16 * n with 1 <= n <= 7: dispatch on the leading block count */
	cmp	$(4*16), tmp
	ja	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	ja	.Leq3\key_len
	je	.Leq2\key_len

.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	jmp	.Lpartial_done\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	jmp	.Lpartial_done\key_len

.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	jmp	.Lpartial_done\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	jmp	.Lpartial_done\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	ja	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	jmp	.Lpartial_done\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	jmp	.Lpartial_done\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	/* fall through */

.Lpartial_done\key_len:
	/*
	 * Drop the bytes just handled and any trailing partial block:
	 * (~7)*16 == -128, so only multiples of 8 blocks remain.
	 */
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
	/*
	 * No do_aes_load ran on this path, so load the round keys that
	 * do_aes_noload expects to find cached in registers.
	 */
	vmovdqa	0*16(p_keys), xkey0
	vmovdqa	4*16(p_keys), xkey4
	vmovdqa	8*16(p_keys), xkey8
	.if (\key_len != KEY_128)
		vmovdqa	12*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a non-zero multiple of 8 blocks (128 bytes) */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
.Lno_blocks\key_len:
	ret
.endm
508 | |
/*
 * AES-128 CTR encryption/decryption, up to eight blocks at a time.
 *
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * Arguments arrive in %rdi (in), %rsi (iv), %rdx (keys), %rcx (out) and
 * %r8 (num_bytes).  Only whole 16-byte blocks are processed.
 * The XMM registers are clobbered; saving and restoring them is the
 * responsibility of the caller.
 */
ENTRY(aes_ctr_enc_128_avx_by8)
	/* expand the shared CTR body for 128-bit keys */
	do_aes_ctrmain key_len=KEY_128

ENDPROC(aes_ctr_enc_128_avx_by8)
521 | |
/*
 * AES-192 CTR encryption/decryption, up to eight blocks at a time.
 *
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * Arguments arrive in %rdi (in), %rsi (iv), %rdx (keys), %rcx (out) and
 * %r8 (num_bytes).  Only whole 16-byte blocks are processed.
 * The XMM registers are clobbered; saving and restoring them is the
 * responsibility of the caller.
 */
ENTRY(aes_ctr_enc_192_avx_by8)
	/* expand the shared CTR body for 192-bit keys */
	do_aes_ctrmain key_len=KEY_192

ENDPROC(aes_ctr_enc_192_avx_by8)
534 | |
/*
 * AES-256 CTR encryption/decryption, up to eight blocks at a time.
 *
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * Arguments arrive in %rdi (in), %rsi (iv), %rdx (keys), %rcx (out) and
 * %r8 (num_bytes).  Only whole 16-byte blocks are processed.
 * The XMM registers are clobbered; saving and restoring them is the
 * responsibility of the caller.
 */
ENTRY(aes_ctr_enc_256_avx_by8)
	/* expand the shared CTR body for 256-bit keys */
	do_aes_ctrmain key_len=KEY_256

ENDPROC(aes_ctr_enc_256_avx_by8)