Commit | Line | Data |
---|---|---|
4ea1277d JG |
1 | /* |
2 | * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64) | |
3 | * | |
4 | * Copyright (C) 2012 Johannes Goetzfried | |
5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> | |
6 | * | |
70177286 | 7 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
c09220e1 | 8 | * |
4ea1277d JG |
9 | * This program is free software; you can redistribute it and/or modify |
10 | * it under the terms of the GNU General Public License as published by | |
11 | * the Free Software Foundation; either version 2 of the License, or | |
12 | * (at your option) any later version. | |
13 | * | |
14 | * This program is distributed in the hope that it will be useful, | |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 | * GNU General Public License for more details. | |
18 | * | |
19 | * You should have received a copy of the GNU General Public License | |
20 | * along with this program; if not, write to the Free Software | |
21 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | |
22 | * USA | |
23 | * | |
24 | */ | |
25 | ||
1985fecf | 26 | #include <linux/linkage.h> |
cba1cce0 JK |
27 | #include "glue_helper-asm-avx.S" |
28 | ||
4ea1277d | 29 | .file "cast6-avx-x86_64-asm_64.S" |
4ea1277d | 30 | |
044ab525 JK |
31 | .extern cast_s1 |
32 | .extern cast_s2 | |
33 | .extern cast_s3 | |
34 | .extern cast_s4 | |
4ea1277d JG |
35 | |
/*
 * Byte offsets into the crypto context addressed through CTX:
 *   km - 32-bit masking keys; get_round_keys(nn) reads km + 4 * nn
 *   kr - one rotation byte per round, fetched 16 bytes at a time by
 *        preload_rkr(); starts 12*4*4 bytes in, i.e. right after
 *        48 four-byte km entries
 */
#define km	0
#define kr	(12*4*4)

/* short aliases for the external CAST s-box tables used by lookup_32bit() */
#define s1	cast_s1
#define s2	cast_s2
#define s3	cast_s3
#define s4	cast_s4
4ea1277d JG |
45 | |
46 | /********************************************************************** | |
47 | 8-way AVX cast6 | |
48 | **********************************************************************/ | |
/* pointer to the crypto context (arrives in %rdi) */
#define CTX	%rdi

/* block data, first group of four blocks */
#define RA1	%xmm0
#define RB1	%xmm1
#define RC1	%xmm2
#define RD1	%xmm3

/* block data, second group of four blocks */
#define RA2	%xmm4
#define RB2	%xmm5
#define RC2	%xmm6
#define RD2	%xmm7

/* working value / result of the round function for the first group */
#define RX	%xmm8

/*
 * Round-key state:
 *   RKM  - masking key of the current round, broadcast to every lane
 *   RKR  - rotation bytes still to be consumed (shifted down one byte
 *          per round by get_round_keys)
 *   RKRF - rotate-left count of the current round (RKR & R1ST)
 *   RKRR - complementary shift-right count (R32 - RKRF)
 *   R32  - constant 32
 *   R1ST - constant 0x1f, selects the low five bits of RKR's first byte
 */
#define RKM	%xmm9
#define RKR	%xmm10
#define RKRF	%xmm11
#define RKRR	%xmm12
#define R32	%xmm13
#define R1ST	%xmm14

/* scratch; also carries the round function result for the second group */
#define RTMP	%xmm15

/*
 * s-box indices for lookup_32bit.  Note that RID1 is %rbp, which is
 * callee-saved and therefore pushed/popped by __cast6_{enc,dec}_blk8.
 */
#define RID1	%rbp
#define RID1d	%ebp
#define RID2	%rsi
#define RID2d	%esi

/*
 * 64-bit halves of the vector being looked up.  Only registers that
 * have both a low-byte and a high-byte alias (bl/bh) can be used here.
 */
#define RGI1	%rdx
#define RGI1bl	%dl
#define RGI1bh	%dh
#define RGI2	%rcx
#define RGI2bl	%cl
#define RGI2bh	%ch

/* %rbx is callee-saved as well; see the push/pop in the blk8 functions */
#define RGI3	%rax
#define RGI3bl	%al
#define RGI3bh	%ah
#define RGI4	%rbx
#define RGI4bl	%bl
#define RGI4bh	%bh

/* accumulators for the combined s-box results */
#define RFS1	%r8
#define RFS1d	%r8d
#define RFS2	%r9
#define RFS2d	%r9d
#define RFS3	%r10
#define RFS3d	%r10d
97 | ||
98 | ||
c09220e1 JK |
/*
 * lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg)
 *
 * Runs the low 32 bits of the GPR alias 'src' through the four s-boxes.
 * With b0..b3 being the bytes of that word (b0 least significant):
 *
 *	dst = ((s1[b1] op1 s2[b0]) op2 s3[b3]) op3 s4[b2]
 *
 * 'src' must be an alias with bl/bh variants (RGI1..RGI4) and 'dst' one
 * with a 'd' variant (RFS1..RFS3).  Because only the 32-bit 'd' register
 * is written, the upper half of 'dst' ends up zero.
 *
 * Side effects: 'src' is shifted right by 16; interleave_op(il_reg) is
 * expanded between the index extraction and the last two table reads
 * (shr_next shifts il_reg by another 16, dummy does nothing); RID1 and
 * RID2 are clobbered.
 */
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	shrq $16,	src;                     \
	movl		s1(, RID1, 4), dst ## d; \
	op1		s2(, RID2, 4), dst ## d; \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	interleave_op(il_reg);                   \
	op2		s3(, RID1, 4), dst ## d; \
	op3		s4(, RID2, 4), dst ## d;
110 | ||
c09220e1 JK |
/* interleave_op for lookup_32bit that expands to nothing */
#define dummy(d) /* do nothing */

/*
 * interleave_op for lookup_32bit that drops another 16 bits of 'reg', so
 * that together with the shift inside lookup_32bit the next 32-bit word
 * ends up in the low half.
 */
#define shr_next(reg) \
	shrq $16,	reg;
115 | ||
/*
 * F_head(a, x, gi1, gi2, op0)
 *
 * First half of the round function for four blocks at once:
 *   x = RKM op0 a		(op0 is vpaddd, vpxor or vpsubd; for vpsubd
 *				 this is RKM - a, AT&T operand order)
 *   x = rotate each 32-bit lane of x left by RKRF
 *	 (built as (x << RKRF) | (x >> RKRR), with RKRR = 32 - RKRF)
 *   gi1 = low 64 bits of x, gi2 = high 64 bits of x
 *
 * Clobbers RTMP.
 */
#define F_head(a, x, gi1, gi2, op0) \
	op0	a,	RKM,  x;                 \
	vpslld	RKRF,	x,    RTMP;              \
	vpsrld	RKRR,	x,    x;                 \
	vpor	RTMP,	x,    x;                 \
	\
	vmovq		x,    gi1;               \
	vpextrq $1,	x,    gi2;
124 | ||
/*
 * F_tail(a, x, gi1, gi2, op1, op2, op3)
 *
 * Second half of the round function: pushes the four 32-bit words held
 * in gi1 (words 0 and 1) and gi2 (words 2 and 3) through lookup_32bit
 * and reassembles the results into the vector register x:
 *
 *   word 0 -> RFS1, word 2 -> RFS3	(shr_next exposes words 1 and 3)
 *   word 1 -> RFS2, RFS2 = RFS2 << 32 | RFS1	(low  qword of x)
 *   word 3 -> RFS1, RFS3 = RFS1 << 32 | RFS3	(high qword of x)
 *
 * The parameter 'a' is not referenced.  gi1/gi2 are handed on with a
 * leading ## so that lookup_32bit receives the alias name itself (e.g.
 * RGI1) and can paste the bl/bh suffix onto it.
 *
 * Clobbers gi1, gi2, RFS1, RFS2, RFS3, RID1, RID2.
 */
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS2;                                      \
	orq		RFS1, RFS2;                                \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS1;                                      \
	orq		RFS1, RFS3;                                \
	\
	vmovq		RFS2, x;                                   \
	vpinsrq $1,	RFS3, x, x;
138 | ||
c09220e1 JK |
/*
 * F_2(a1, b1, a2, b2, op0, op1, op2, op3)
 *
 * Applies one round function to both groups of four blocks:
 *	a1 ^= F(b1);  a2 ^= F(b2);
 *
 * Both F_head expansions use RX as their vector scratch; their outputs
 * live in RGI1/RGI2 (first group) and RGI3/RGI4 (second group) until the
 * F_tail expansions rebuild them in RX and RTMP respectively.
 */
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0);              \
	F_head(b2, RX, RGI3, RGI4, op0);              \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);    \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3);  \
	\
	vpxor		a1, RX,   a1;                 \
	vpxor		a2, RTMP, a2;

/*
 * The three round function flavours; they differ only in the operation
 * used to mix in RKM and in the order of the s-box combining operations.
 */
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
4ea1277d | 155 | |
c09220e1 JK |
/*
 * qop(in, out, f): out ^= F<f>(in) for both block groups.  'in' and
 * 'out' are register name stems (RA..RD); the group suffix 1/2 is
 * pasted on here.
 */
#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);

/*
 * get_round_keys(nn): set up the key registers for round nn.
 *   RKM  = 32-bit masking key nn, broadcast to all lanes
 *   RKRF = low five bits of the first byte of RKR (rotate-left count)
 *   RKRR = 32 - RKRF
 *   RKR is then shifted right by one byte so the next round's rotation
 *   byte moves into first position.
 */
#define get_round_keys(nn) \
	vbroadcastss	(km+(4*(nn)))(CTX), RKM;        \
	vpand		R1ST,               RKR,  RKRF; \
	vpsubq		RKRF,               R32,  RKRR; \
	vpsrldq $1,	RKR,                RKR;
4ea1277d JG |
164 | |
/*
 * Q(n): forward quad-round n.  Uses round keys 4n+0 .. 4n+3 in ascending
 * order:
 *	C ^= F1(D);  B ^= F2(C);  A ^= F3(B);  D ^= F1(A);
 */
#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);        \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	\
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

/*
 * QBAR(n): the same four steps as Q(n) executed in the opposite order,
 * consuming round keys 4n+3 .. 4n+0:
 *	D ^= F1(A);  A ^= F3(B);  B ^= F2(C);  C ^= F1(D);
 */
#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);        \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	\
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);
190 | ||
/* do_mask helper for preload_rkr: permute the bytes of RKR through 'mask' */
#define shuffle(mask) \
	vpshufb		mask,            RKR, RKR;

/*
 * preload_rkr(n, do_mask, mask)
 *
 * Loads the n-th group of 16 rotation bytes from the context into RKR.
 * Every byte is XORed with 16 on the way in.  do_mask(mask) then
 * optionally reorders the bytes (pass 'shuffle' and a .Lrkr_* table) so
 * that get_round_keys, which always consumes the first byte, sees them
 * in the order the following Q/QBAR sequence needs; pass 'dummy' to keep
 * the stored order.
 */
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss	.L16_mask,                RKR;      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		(kr+n*16)(CTX),           RKR, RKR; \
	do_mask(mask);
4ea1277d JG |
199 | |
/*
 * transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 *
 * Treats x0..x3 as the rows of a 4x4 matrix of 32-bit words and
 * transposes it in place.  t0, t1 and t2 are clobbered.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;
210 | ||
/*
 * inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask)
 *
 * Prepares four loaded blocks for the round code: each register is
 * byte-permuted through rmask, then the four registers are transposed
 * so that every register holds the same word position of all four
 * blocks.  t0..t2 are clobbered.
 */
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask, x0,	x0; \
	vpshufb rmask, x1,	x1; \
	vpshufb rmask, x2,	x2; \
	vpshufb rmask, x3,	x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/*
 * outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask)
 *
 * Inverse of inpack_blocks: transpose back to one block per register,
 * then byte-permute each register through rmask.  t0..t2 are clobbered.
 */
#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask,	x0, x0;       \
	vpshufb rmask,	x1, x1;       \
	vpshufb rmask,	x2, x2;       \
	vpshufb rmask,	x3, x3;
4ea1277d | 226 | |
c09220e1 JK |
/*
 * Constant tables.  Nothing in this file writes to them, so they are
 * placed in read-only data instead of the writable .data section.
 *
 * The 16-byte tables are used as vmovdqa/vpshufb memory operands and
 * must stay 16-byte aligned; the 4-byte constants follow eight such
 * tables and are therefore naturally aligned as well.
 */
.section	.rodata

.align 16
/* handed to load_xts_8way() by the XTS entry points */
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
/* reverses the byte order inside each 32-bit word */
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
/* reverses the byte order of the whole 128-bit value */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/*
 * Byte orders for RKR (see preload_rkr/shuffle), named after the four
 * quad-rounds that consume them.  A Q keeps its four rotation bytes in
 * ascending order, a QBAR needs them reversed; for decryption the
 * quad-rounds themselves are additionally visited back to front.
 */
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/* 16 in every byte; broadcast into RKR by preload_rkr */
.L16_mask:
	.byte 16, 16, 16, 16
/* the value 32, loaded into R32 */
.L32_mask:
	.byte 32, 0, 0, 0
/* 0x1f, loaded into R1ST to isolate a five-bit rotation count */
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0
252 | ||
253 | .text | |
4ea1277d | 254 | |
.align 8
__cast6_enc_blk8:
	/*
	 * Encrypts eight blocks that are already held in registers.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * clobbers:
	 *	%rax, %rcx, %rdx, %rsi, %r8-%r10, %xmm8-%xmm15
	 *	(%rbp and %rbx are used too, but saved and restored here)
	 */

	pushq %rbp
	pushq %rbx

	/* loop-invariant constants; RKM doubles as the byte-swap mask here */
	vmovdqa .Lbswap_mask, RKM
	vmovd .Lfirst_mask, R1ST
	vmovd .L32_mask, R32

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	/* six forward quad-rounds followed by six reverse quad-rounds;
	 * RKR is reloaded every four quad-rounds (16 rotation bytes) */
	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rbx
	popq %rbp

	/* RKM held round keys during the rounds; reload the byte-swap mask */
	vmovdqa .Lbswap_mask, RKM

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	ret
ENDPROC(__cast6_enc_blk8)
4ea1277d | 300 | |
.align 8
__cast6_dec_blk8:
	/*
	 * Decrypts eight blocks that are already held in registers.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 * clobbers:
	 *	%rax, %rcx, %rdx, %rsi, %r8-%r10, %xmm8-%xmm15
	 *	(%rbp and %rbx are used too, but saved and restored here)
	 */

	pushq %rbp
	pushq %rbx

	/* loop-invariant constants; RKM doubles as the byte-swap mask here */
	vmovdqa .Lbswap_mask, RKM
	vmovd .Lfirst_mask, R1ST
	vmovd .L32_mask, R32

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	/* same quad-round pattern as encryption (six Q, six QBAR) but with
	 * the round-key groups walked from the last one down to the first */
	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx
	popq %rbp

	/* RKM held round keys during the rounds; reload the byte-swap mask */
	vmovdqa .Lbswap_mask, RKM
	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	ret
ENDPROC(__cast6_dec_blk8)
cba1cce0 | 345 | |
ENTRY(cast6_ecb_enc_8way)
	/*
	 * ECB-encrypt eight blocks from src to dst.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	/* %rsi is RID2 inside __cast6_enc_blk8, so park dst in %r11 */
	movq %rsi, %r11

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret
ENDPROC(cast6_ecb_enc_8way)
cba1cce0 | 363 | |
ENTRY(cast6_ecb_dec_8way)
	/*
	 * ECB-decrypt eight blocks from src to dst.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	/* %rsi is RID2 inside __cast6_dec_blk8, so park dst in %r11 */
	movq %rsi, %r11

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret
ENDPROC(cast6_ecb_dec_8way)
cba1cce0 | 381 | |
ENTRY(cast6_cbc_dec_8way)
	/*
	 * CBC-decrypt eight blocks from src to dst.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	/* %r12 is callee-saved; it keeps src alive across the call */
	pushq %r12

	/* both %rsi (RID2) and %rdx (RGI1) are overwritten by the block
	 * function, so keep copies of dst and src */
	movq %rsi, %r11
	movq %rdx, %r12

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8

	/* NOTE(review): store_cbc_8way comes from glue_helper-asm-avx.S;
	 * presumably it applies the CBC chaining XOR using src before
	 * writing to dst - confirm against the helper */
	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r12

	ret
ENDPROC(cast6_cbc_dec_8way)
cba1cce0 | 404 | |
ENTRY(cast6_ctr_8way)
	/*
	 * CTR mode over eight blocks: the counter blocks are run through
	 * the encryption core and the result is combined with src by
	 * store_ctr_8way.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */

	/* %r12 is callee-saved; it keeps src alive across the call */
	pushq %r12

	/* %rsi (RID2) and %rdx (RGI1) are overwritten by the block function */
	movq %rsi, %r11
	movq %rdx, %r12

	/* RX, RKR and RKM serve as temporaries for the helper */
	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RX, RKR, RKM);

	call __cast6_enc_blk8

	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r12

	ret
ENDPROC(cast6_ctr_8way)
70177286 JK |
429 | |
ENTRY(cast6_xts_enc_8way)
	/*
	 * XTS-encrypt eight blocks from src to dst.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */

	/* %rsi is RID2 inside __cast6_enc_blk8, so park dst in %r11 */
	movq %rsi, %r11

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);

	call __cast6_enc_blk8

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret
ENDPROC(cast6_xts_enc_8way)
451 | ||
ENTRY(cast6_xts_dec_8way)
	/*
	 * XTS-decrypt eight blocks from src to dst.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */

	/* %rsi is RID2 inside __cast6_dec_blk8, so park dst in %r11 */
	movq %rsi, %r11

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);

	call __cast6_dec_blk8

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret
ENDPROC(cast6_xts_dec_8way)