Commit | Line | Data |
---|---|---|
8280daad JK |
1 | /* |
2 | * Twofish Cipher 3-way parallel algorithm (x86_64) | |
3 | * | |
4 | * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License as published by | |
8 | * the Free Software Foundation; either version 2 of the License, or | |
9 | * (at your option) any later version. | |
10 | * | |
11 | * This program is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | * GNU General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * along with this program; if not, write to the Free Software | |
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | |
19 | * USA | |
20 | * | |
21 | */ | |
22 | ||
23 | .file "twofish-x86_64-asm-3way.S" | |
24 | .text | |
25 | ||
26 | /* structure of crypto context */ | |
/*
 * Byte offsets of the fields read through CTX.
 *
 * s0..s3 are addressed as <table>(CTX, index, 4) with a zero-extended byte
 * as index, so consecutive tables sit 256 * 4 = 1024 bytes apart.
 * w is read as 64-bit quantities at w + 4*m, and k as 32-bit words at
 * k + 4*(2*n) and k + 4*(2*n + 1).
 */
#define s0	0
#define s1	1024
#define s2	2048
#define s3	3072
#define w	4096
#define k	4128
33 | ||
34 | /********************************************************************** | |
35 | 3-way twofish | |
36 | **********************************************************************/ | |
/* Pointer registers: crypto context and the current source/destination. */
#define CTX	%rdi
#define RIO	%rdx

/*
 * Per-lane state.  The suffix 0/1/2 selects the lane, 'd' the 32-bit view,
 * 'bl'/'bh' the lowest and second-lowest byte.  The RAB registers are the
 * ones whose second byte is directly addressable (%ah/%bh/%ch), which
 * do16bit_ror needs.
 */

/* lane 0 */
#define RAB0	%rax
#define RAB0d	%eax
#define RAB0bl	%al
#define RAB0bh	%ah
#define RCD0	%r8
#define RCD0d	%r8d
#define RX0	%rbp
#define RX0d	%ebp
#define RY0	%r13
#define RY0d	%r13d

/* lane 1 */
#define RAB1	%rbx
#define RAB1d	%ebx
#define RAB1bl	%bl
#define RAB1bh	%bh
#define RCD1	%r9
#define RCD1d	%r9d
#define RX1	%r11
#define RX1d	%r11d
#define RY1	%r14
#define RY1d	%r14d

/* lane 2 */
#define RAB2	%rcx
#define RAB2d	%ecx
#define RAB2bl	%cl
#define RAB2bh	%ch
#define RCD2	%r10
#define RCD2d	%r10d
#define RX2	%r12
#define RX2d	%r12d
#define RY2	%r15
#define RY2d	%r15d

/*
 * Scratch registers for table indices.  RT0 is the same register as RIO
 * (%rdx): the I/O pointer is only used by the pack/unpack macros, outside
 * the rounds that clobber RT0.
 */
#define RT0	%rdx
#define RT0d	%edx
#define RT1	%rsi
#define RT1d	%esi
85 | ||
/*
 * do16bit_ror - combine two table entries, selected by the two lowest bytes
 * of a 64-bit register, into a 32-bit accumulator and rotate the source.
 *
 *   nrot	right-rotate count applied to src once both bytes are extracted
 *   opa, opb	32-bit operations (the 'l' suffix is appended here) applied
 *		to acc with the first and the second table entry
 *   tab_b0	context offset of the table indexed by byte 0 of src
 *   tab_b1	context offset of the table indexed by byte 1 of src
 *   idx_b1	64-bit scratch register receiving byte 1
 *   idx_b0	64-bit scratch register receiving byte 0; may be the same
 *		register as acc when opa is mov, since the index is consumed
 *		by the load that overwrites it
 *   src	64-bit source register (needs 'bl' and 'bh' aliases)
 *   acc	64-bit register whose 32-bit view accumulates the result
 */
#define do16bit_ror(nrot, opa, opb, tab_b0, tab_b1, idx_b1, idx_b0, src, acc) \
	movzbl src ## bl,			idx_b0 ## d; \
	movzbl src ## bh,			idx_b1 ## d; \
	rorq $(nrot),				src; \
	opa ## l tab_b0(CTX, idx_b0, 4),	acc ## d; \
	opb ## l tab_b1(CTX, idx_b1, 4),	acc ## d;
92 | ||
93 | /* | |
94 | * Combined G1 & G2 function. Reordered with help of rotates to have moves | |
95 | * at beginning. | |
96 | */ | |
/*
 * First half of G1/G2 for lane i.  Each do16bit_ror consumes the two lowest
 * bytes of ab<i> and then rotates it.  'mov' initialises the accumulators
 * x<i> and y<i>, which double as the byte-0 index register; RT0 holds the
 * byte-1 index.
 */
#define g1g2_lo16(ab, Tx0, Tx1, Ty1, Ty2, x, y, i) \
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## i, ab ## i, x ## i); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## i, ab ## i, y ## i);

/*
 * Second half for lane i: xor the remaining two table entries into each
 * accumulator, using RT0/RT1 for the indices.  Together with the first half
 * the rotate counts add up to 32 + 48 + 32 + 16 = 128 bits, so ab<i> holds
 * its original value again when it is exchanged with cd<i>.
 */
#define g1g2_hi16(ab, cd, Tx2, Tx3, Ty3, Ty0, x, y, i) \
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## i, x ## i); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## i, y ## i); \
	xchgq cd ## i, ab ## i;

/*
 * g1g2_3 - table-lookup stage of one round for all three lanes.
 *
 *   ab, cd	register-name prefixes; the lane digit 0/1/2 is appended
 *   Tx0..Tx3	context offsets of the tables for the low 32 bits of ab<i>
 *   Ty0..Ty3	context offsets of the tables for the high 32 bits of ab<i>
 *   x, y	register-name prefixes of the two 32-bit accumulators
 *
 * On exit x<i>/y<i> hold the xor of four table entries each and the
 * contents of ab<i> and cd<i> have been swapped.  Clobbers RT0 and RT1.
 */
#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
	/* G1,1 && G2,1 */ \
	g1g2_lo16(ab, Tx0, Tx1, Ty1, Ty2, x, y, 0); \
	\
	g1g2_lo16(ab, Tx0, Tx1, Ty1, Ty2, x, y, 1); \
	\
	g1g2_lo16(ab, Tx0, Tx1, Ty1, Ty2, x, y, 2); \
	\
	/* G1,2 && G2,2 */ \
	g1g2_hi16(ab, cd, Tx2, Tx3, Ty3, Ty0, x, y, 0); \
	\
	g1g2_hi16(ab, cd, Tx2, Tx3, Ty3, Ty0, x, y, 1); \
	\
	g1g2_hi16(ab, cd, Tx2, Tx3, Ty3, Ty0, x, y, 2);
120 | ||
/*
 * enc_round_end - finish one encryption round for a single lane.
 *
 *   blk	64-bit register holding the two 32-bit words updated by the round
 *   f0, f1	64-bit registers whose low halves hold the lookup results
 *   rnd	round number; selects the 32-bit words at k + 4*(2*rnd) and
 *		k + 4*(2*rnd + 1)
 *
 * f0 and f1 are clobbered.
 */
#define enc_round_end(blk, f0, f1, rnd) \
	/* f0 = f0 + f1, f1 = f0 + 2*f1 */ \
	addl f1 ## d,			f0 ## d; \
	addl f0 ## d,			f1 ## d; \
	/* add the two words from k; low word of blk: xor first, rotate later */ \
	addl k+4*(2*(rnd))(CTX),	f0 ## d; \
	xorl blk ## d,			f0 ## d; \
	addl k+4*(2*(rnd)+1)(CTX),	f1 ## d; \
	/* high word of blk: rotate left by one, then xor */ \
	shrq $32,			blk; \
	roll $1,			blk ## d; \
	xorl f1 ## d,			blk ## d; \
	shlq $32,			blk; \
	/* the 32-bit rotate leaves the upper half of f0 zero for the or */ \
	rorl $1,			f0 ## d; \
	orq f0,				blk;
133 | ||
/*
 * dec_round_end - finish one decryption round for a single lane.
 *
 *   blk	64-bit register holding the two 32-bit words updated by the round
 *   f0, f1	64-bit registers whose low halves hold the lookup results
 *   rnd	round number; selects the 32-bit words at k + 4*(2*rnd) and
 *		k + 4*(2*rnd + 1)
 *
 * Compared with enc_round_end the roles of f0 and f1 are exchanged when the
 * results are folded into blk.  f0 and f1 are clobbered.
 */
#define dec_round_end(blk, f0, f1, rnd) \
	/* f0 = f0 + f1, f1 = f0 + 2*f1 */ \
	addl f1 ## d,			f0 ## d; \
	addl f0 ## d,			f1 ## d; \
	addl k+4*(2*(rnd))(CTX),	f0 ## d; \
	addl k+4*(2*(rnd)+1)(CTX),	f1 ## d; \
	/* low word of blk: xor with f1 now, rotate right at the end */ \
	xorl blk ## d,			f1 ## d; \
	/* high word of blk: rotate left by one, then xor with f0 */ \
	shrq $32,			blk; \
	roll $1,			blk ## d; \
	xorl f0 ## d,			blk ## d; \
	shlq $32,			blk; \
	/* the 32-bit rotate leaves the upper half of f1 zero for the or */ \
	rorl $1,			f1 ## d; \
	orq f1,				blk;
146 | ||
/*
 * encrypt_round3 - one encryption round on all three lanes.
 *
 *   cur, nxt	register-name prefixes (lane digit is appended); g1g2_3
 *		exchanges their contents, so the round-end step below
 *		updates the value that entered the round in nxt<i>
 *   rnd	round number passed on to enc_round_end
 */
#define encrypt_round3(cur, nxt, rnd) \
	g1g2_3(cur, nxt, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
	\
	enc_round_end(cur ## 0, RX0, RY0, rnd); \
	enc_round_end(cur ## 1, RX1, RY1, rnd); \
	enc_round_end(cur ## 2, RX2, RY2, rnd);
153 | ||
/*
 * decrypt_round3 - one decryption round on all three lanes.
 *
 * Same structure as the encryption round, but g1g2_3 is called with the
 * table offsets rotated and with RY/RX as accumulators, i.e. the low half
 * of cur<i> is looked up into RY<i> and the high half into RX<i>.
 *
 *   cur, nxt	register-name prefixes (lane digit is appended); their
 *		contents are exchanged by g1g2_3
 *   rnd	round number passed on to dec_round_end
 */
#define decrypt_round3(cur, nxt, rnd) \
	g1g2_3(cur, nxt, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
	\
	dec_round_end(cur ## 0, RX0, RY0, rnd); \
	dec_round_end(cur ## 1, RX1, RY1, rnd); \
	dec_round_end(cur ## 2, RX2, RY2, rnd);
160 | ||
/*
 * encrypt_cycle3 - two consecutive encryption rounds (2*n and 2*n + 1) on
 * all three lanes.
 *
 *   ab, cd	register-name prefixes handed to encrypt_round3
 *   n		cycle number; parenthesized so that an expression argument
 *		still yields the intended round numbers
 */
#define encrypt_cycle3(ab, cd, n) \
	encrypt_round3(ab, cd, (n)*2); \
	encrypt_round3(ab, cd, ((n)*2)+1);
164 | ||
/*
 * decrypt_cycle3 - two consecutive decryption rounds on all three lanes,
 * in reverse order: round 2*n + 1 first, then round 2*n.
 *
 *   ba, dc	register-name prefixes handed to decrypt_round3
 *   n		cycle number; parenthesized so that an expression argument
 *		still yields the intended round numbers
 */
#define decrypt_cycle3(ba, dc, n) \
	decrypt_round3(ba, dc, ((n)*2)+1); \
	decrypt_round3(ba, dc, ((n)*2));
168 | ||
/*
 * inpack3 - load one 64-bit half of each of the three blocks.
 *
 *   in		register pointing at the input
 *   n		32-bit word index inside a block; the lanes read 8 bytes at
 *		byte offsets 4*n, 4*(4 + n) and 4*(8 + n)
 *   xy		register-name prefix of the destinations xy0, xy1, xy2
 *   m		32-bit word index into w; the same 8 bytes at w + 4*m are
 *		xored into every lane (parenthesized so that an expression
 *		argument expands correctly)
 */
#define inpack3(in, n, xy, m) \
	movq 4*(n)(in),			xy ## 0; \
	xorq w+4*(m)(CTX),		xy ## 0; \
	\
	movq 4*(4+(n))(in),		xy ## 1; \
	xorq w+4*(m)(CTX),		xy ## 1; \
	\
	movq 4*(8+(n))(in),		xy ## 2; \
	xorq w+4*(m)(CTX),		xy ## 2;
178 | ||
/*
 * outunpack3 - xor the 8 bytes at w + 4*m into each lane and write the
 * result out.
 *
 *   op		instruction stem used for the store ('q' is appended); the
 *		callers in this file pass mov (overwrite) or xor (xor into
 *		the destination)
 *   out	register pointing at the output
 *   n		32-bit word index inside a block; the lanes write 8 bytes at
 *		byte offsets 4*n, 4*(4 + n) and 4*(8 + n)
 *   xy		register-name prefix of the sources xy0, xy1, xy2 (modified)
 *   m		32-bit word index into w (parenthesized so that an
 *		expression argument expands correctly)
 */
#define outunpack3(op, out, n, xy, m) \
	xorq w+4*(m)(CTX),		xy ## 0; \
	op ## q xy ## 0,		4*(n)(out); \
	\
	xorq w+4*(m)(CTX),		xy ## 1; \
	op ## q xy ## 1,		4*(4+(n))(out); \
	\
	xorq w+4*(m)(CTX),		xy ## 2; \
	op ## q xy ## 2,		4*(8+(n))(out);
188 | ||
/*
 * inpack_enc3 - load three blocks from RIO for encryption: 32-bit words
 * 0-1 of each block go to RAB<i> xored with w[0..1], words 2-3 go to
 * RCD<i> xored with w[2..3].
 */
#define inpack_enc3() \
	inpack3(RIO, 0, RAB, 0);	/* first  half, w + 4*0 */ \
	inpack3(RIO, 2, RCD, 2);	/* second half, w + 4*2 */
192 | ||
/*
 * outunpack_enc3 - write the three encrypted blocks to RIO.
 *
 * RAB<i> is xored with w[6..7] and stored to words 2-3 of block i, RCD<i>
 * is xored with w[4..5] and stored to words 0-1.
 *
 *   insn	store instruction stem handed to outunpack3 (mov or xor)
 */
#define outunpack_enc3(insn) \
	outunpack3(insn, RIO, 2, RAB, 6); \
	outunpack3(insn, RIO, 0, RCD, 4);
196 | ||
/*
 * inpack_dec3 - load three blocks from RIO for decryption.
 *
 * Words 0-1 of each block are xored with w[4..5] into RAB<i>, words 2-3
 * with w[6..7] into RCD<i>.  Every register is then rotated by 32 bits,
 * which exchanges its two 32-bit halves.
 */
#define inpack_dec3() \
	inpack3(RIO, 0, RAB, 4); \
	rorq $32,			RAB0; \
	rorq $32,			RAB1; \
	rorq $32,			RAB2; \
	\
	inpack3(RIO, 2, RCD, 6); \
	rorq $32,			RCD0; \
	rorq $32,			RCD1; \
	rorq $32,			RCD2;
206 | ||
/*
 * outunpack_dec3 - write the three decrypted blocks to RIO.
 *
 * The 32-bit halves of each register are exchanged back (rotate by 32)
 * before the store.  RCD<i> is xored with w[0..1] and stored to words 0-1
 * of block i, RAB<i> is xored with w[2..3] and stored to words 2-3.  The
 * output is always written with mov.
 */
#define outunpack_dec3() \
	rorq $32,			RCD0; \
	rorq $32,			RCD1; \
	rorq $32,			RCD2; \
	outunpack3(mov, RIO, 0, RCD, 0); \
	\
	rorq $32,			RAB0; \
	rorq $32,			RAB1; \
	rorq $32,			RAB2; \
	outunpack3(mov, RIO, 2, RAB, 2);
216 | ||
.align 8
.global __twofish_enc_blk_3way
.type   __twofish_enc_blk_3way,@function;

/*
 * __twofish_enc_blk_3way - encrypt three consecutive 16-byte blocks.
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src, RIO
 *	%rcx: bool, if true: xor output (only the low byte is tested)
 *
 * Reads 48 bytes from src and either stores (mov) or xors the 48 bytes of
 * result into dst.  %rbx, %rbp and %r12-%r15 are saved and restored; all
 * other registers named by the macros above are clobbered.  The function
 * makes no calls and uses 64 bytes of stack.
 *
 * NOTE: RX0 is %rbp, so %rbp does not hold a frame pointer while the
 * rounds execute.
 */
__twofish_enc_blk_3way:
	pushq %r15;
	pushq %r14;
	pushq %r13;
	pushq %r12;
	pushq %rbp;
	pushq %rbx;

	/* %rcx (RAB2) and %rsi (RT1) are reused by the rounds: park the args */
	pushq %rcx; /* bool xor */
	pushq %rsi; /* dst */

	inpack_enc3();

	encrypt_cycle3(RAB, RCD, 0);
	encrypt_cycle3(RAB, RCD, 1);
	encrypt_cycle3(RAB, RCD, 2);
	encrypt_cycle3(RAB, RCD, 3);
	encrypt_cycle3(RAB, RCD, 4);
	encrypt_cycle3(RAB, RCD, 5);
	encrypt_cycle3(RAB, RCD, 6);
	encrypt_cycle3(RAB, RCD, 7);

	popq RIO; /* dst */
	popq %rbp; /* bool xor; RX0 is no longer live here */

	testb %bpl, %bpl;
	jnz .L__enc_xor3;

	outunpack_enc3(mov);

	popq %rbx;
	popq %rbp;
	popq %r12;
	popq %r13;
	popq %r14;
	popq %r15;
	ret;

.L__enc_xor3:
	outunpack_enc3(xor);

	popq %rbx;
	popq %rbp;
	popq %r12;
	popq %r13;
	popq %r14;
	popq %r15;
	ret;
.size __twofish_enc_blk_3way,.-__twofish_enc_blk_3way
275 | ||
.align 8
.global twofish_dec_blk_3way
.type   twofish_dec_blk_3way,@function;

/*
 * twofish_dec_blk_3way - decrypt three consecutive 16-byte blocks.
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src, RIO
 *
 * Reads 48 bytes from src and stores the 48 bytes of result to dst.
 * %rbx, %rbp and %r12-%r15 are saved and restored; all other registers
 * named by the macros above are clobbered.  The function makes no calls
 * and uses 56 bytes of stack.
 *
 * NOTE: RX0 is %rbp, so %rbp does not hold a frame pointer while the
 * rounds execute.
 */
twofish_dec_blk_3way:
	pushq %r15;
	pushq %r14;
	pushq %r13;
	pushq %r12;
	pushq %rbp;
	pushq %rbx;

	/* %rsi doubles as RT1 during the rounds: park dst on the stack */
	pushq %rsi; /* dst */

	inpack_dec3();

	/* cycles run from the last one down to the first */
	decrypt_cycle3(RAB, RCD, 7);
	decrypt_cycle3(RAB, RCD, 6);
	decrypt_cycle3(RAB, RCD, 5);
	decrypt_cycle3(RAB, RCD, 4);
	decrypt_cycle3(RAB, RCD, 3);
	decrypt_cycle3(RAB, RCD, 2);
	decrypt_cycle3(RAB, RCD, 1);
	decrypt_cycle3(RAB, RCD, 0);

	popq RIO; /* dst */

	outunpack_dec3();

	popq %rbx;
	popq %rbp;
	popq %r12;
	popq %r13;
	popq %r14;
	popq %r15;
	ret;
.size twofish_dec_blk_3way,.-twofish_dec_blk_3way
316 |