/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 */

#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"

.extern cast6_s1
.extern cast6_s2
.extern cast6_s3
.extern cast6_s4

/* structure of crypto context */
#define km	0
#define kr	(12*4*4)
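/*
 * kr follows the 12x4 array of 32-bit masking keys, i.e. the rotation-key
 * bytes start at byte offset 12*4*4 into the context (one byte per
 * rotation key, only the low five bits are used).
 */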

/* s-boxes */
#define s1	cast6_s1
#define s2	cast6_s2
#define s3	cast6_s3
#define s4	cast6_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/
#define CTX %rdi

#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

#define RX %xmm8

#define RKM  %xmm9
#define RKR  %xmm10
#define RKRF %xmm11
#define RKRR %xmm12
#define R32  %xmm13
#define R1ST %xmm14

#define RTMP %xmm15

#define RID1  %rbp
#define RID1d %ebp
#define RID2  %rsi
#define RID2d %esi

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d


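/*
 * lookup_32bit: use the four bytes of the low 32-bit word of
 * general-purpose register 'src' as indices into the s1..s4 tables and
 * fold the looked-up values into 'dst' with op1/op2/op3.
 * interleave_op(il_reg) lets the caller interleave an extra 16-bit shift
 * of il_reg (shr_next) with the table loads, or do nothing (dummy).
 */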
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl src ## bh, RID1d; \
	movzbl src ## bl, RID2d; \
	shrq $16, src; \
	movl s1(, RID1, 4), dst ## d; \
	op1 s2(, RID2, 4), dst ## d; \
	movzbl src ## bh, RID1d; \
	movzbl src ## bl, RID2d; \
	interleave_op(il_reg); \
	op2 s3(, RID1, 4), dst ## d; \
	op3 s4(, RID2, 4), dst ## d;

#define dummy(d) /* do nothing */

#define shr_next(reg) \
	shrq $16, reg;

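/*
 * F_head/F_tail implement one CAST-256 round function on four 32-bit
 * words in parallel: F_head combines the input with the masking key
 * (op0 is vpaddd, vpxor or vpsubd), rotates each dword left by the
 * current rotation key (shift left by RKRF, shift right by
 * RKRR = 32 - RKRF, then OR), and moves the two 64-bit halves into
 * general-purpose registers for the s-box lookups; F_tail performs the
 * lookups and reassembles the four results into an xmm register.
 * F_2 runs this on both 4-block groups and xors the result into a1/a2;
 * F1_2/F2_2/F3_2 select the op combinations of the three CAST-256
 * round function types f1/f2/f3.
 */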
#define F_head(a, x, gi1, gi2, op0) \
	op0 a, RKM, x; \
	vpslld RKRF, x, RTMP; \
	vpsrld RKRR, x, x; \
	vpor RTMP, x, x; \
	\
	vmovq x, gi1; \
	vpextrq $1, x, gi2;

#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
	shlq $32, RFS2; \
	orq RFS1, RFS2; \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
	shlq $32, RFS1; \
	orq RFS1, RFS3; \
	\
	vmovq RFS2, x; \
	vpinsrq $1, RFS3, x, x;

#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0); \
	F_head(b2, RX, RGI3, RGI4, op0); \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
	\
	vpxor a1, RX, a1; \
	vpxor a2, RTMP, a2;

#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);

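/*
 * get_round_keys: broadcast masking key km[nn] into RKM, take the low
 * five bits of the lowest rotation-key byte in RKR as the left-rotate
 * count (RKRF), compute the matching right-shift count RKRR = 32 - RKRF,
 * and shift RKR down one byte so the next call sees the next rotation
 * key.
 */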
#define get_round_keys(nn) \
	vbroadcastss (km+(4*(nn)))(CTX), RKM; \
	vpand R1ST, RKR, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	vpsrldq $1, RKR, RKR;

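/*
 * Q is the CAST-256 forward quad-round: it xors f1(D) into C, f2(C)
 * into B, f3(B) into A and f1(A) into D.  QBAR is the inverse
 * quad-round, applying the same operations in reverse order.
 * Encryption below uses Q for rounds 0-5 and QBAR for rounds 6-11;
 * decryption does the opposite, walking the rounds backwards.
 */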
#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1); \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	\
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1); \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	\
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);

#define shuffle(mask) \
	vpshufb mask, RKR, RKR;

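/*
 * preload_rkr: load the 16 rotation-key bytes for quad-rounds
 * 4*n..4*n+3 into RKR, xor each with 16 (i.e. add 16 mod 32 to every
 * 5-bit rotation, see the comment inside), and optionally shuffle the
 * byte order so that get_round_keys consumes the keys in the order the
 * following Q/QBAR calls need them.
 */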
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss .L16_mask, RKR; \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor (kr+n*16)(CTX), RKR, RKR; \
	do_mask(mask);

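/*
 * transpose_4x4: standard 4x4 32-bit transpose using unpack
 * instructions, so that after inpack_blocks each of x0..x3 holds the
 * same word (A, B, C or D) from four different blocks.
 */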
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;

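/*
 * inpack_blocks/outunpack_blocks: byte-swap every 32-bit word (rmask is
 * .Lbswap_mask; CAST-256 treats the block words as big-endian) and
 * transpose between the one-block-per-register layout used for loads
 * and stores and the one-word-per-register layout used by the rounds.
 */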
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3;

.data

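/*
 * Constants: .Lbswap_mask reverses the bytes of each 32-bit word,
 * .Lbswap128_mask reverses a whole 128-bit value (used for the CTR
 * counter), the .Lrkr_* masks reorder the rotation-key bytes for the
 * Q/QBAR sequences selected by preload_rkr, and .L16_mask, .L32_mask
 * and .Lfirst_mask feed preload_rkr and get_round_keys.
 */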
.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.L16_mask:
	.byte 16, 16, 16, 16
.L32_mask:
	.byte 32, 0, 0, 0
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text

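/*
 * Note: the __cast6_{enc,dec}_blk8 routines use %rsi, %rdx, %rcx, %rax
 * and %r8-%r10 as scratch registers (see the RID, RGI and RFS defines)
 * and clobber %xmm0-%xmm15; %rbp and %rbx are saved and restored on the
 * stack.  This is why the entry points further below stash dst/src in
 * %r11/%r12 before calling them.
 */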
.align 8
.type __cast6_enc_blk8,@function;

__cast6_enc_blk8:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */

	pushq %rbp;
	pushq %rbx;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rbx;
	popq %rbp;

	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	ret;

.align 8
.type __cast6_dec_blk8,@function;

__cast6_dec_blk8:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */

	pushq %rbp;
	pushq %rbx;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx;
	popq %rbp;

	vmovdqa .Lbswap_mask, RKM;
	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	ret;

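/*
 * The load_8way/store_8way, store_cbc_8way and load_ctr_8way/
 * store_ctr_8way helpers used by the entry points below are provided
 * by the included glue_helper-asm-avx.S.
 */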
.align 8
.global cast6_ecb_enc_8way
.type cast6_ecb_enc_8way,@function;

cast6_ecb_enc_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret;

.align 8
.global cast6_ecb_dec_8way
.type cast6_ecb_dec_8way,@function;

cast6_ecb_dec_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret;

.align 8
.global cast6_cbc_dec_8way
.type cast6_cbc_dec_8way,@function;

cast6_cbc_dec_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	pushq %r12;

	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r12;

	ret;

.align 8
.global cast6_ctr_8way
.type cast6_ctr_8way,@function;

cast6_ctr_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */

	pushq %r12;

	movq %rsi, %r11;
	movq %rdx, %r12;

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RX, RKR, RKM);

	call __cast6_enc_blk8;

	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r12;

	ret;