Commit | Line | Data |
---|---|---|
64b94cea JK |
1 | /* |
2 | * Blowfish Cipher Algorithm (x86_64) | |
3 | * | |
4 | * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License as published by | |
8 | * the Free Software Foundation; either version 2 of the License, or | |
9 | * (at your option) any later version. | |
10 | * | |
11 | * This program is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | * GNU General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * along with this program; if not, write to the Free Software | |
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | |
19 | * USA | |
20 | * | |
21 | */ | |
22 | ||
23 | .file "blowfish-x86_64-asm.S" | |
24 | .text | |
25 | ||
26 | /* structure of crypto context */ | |
/*
 * Byte offsets into the context addressed by CTX: the P-array starts at
 * offset 0 and occupies (16 + 2) 32-bit words; the S-boxes s0..s3 follow
 * it back to back, 256 32-bit words each.
 */
27 | #define p 0 | |
28 | #define s0 ((16 + 2) * 4) | |
29 | #define s1 ((16 + 2 + (1 * 256)) * 4) | |
30 | #define s2 ((16 + 2 + (2 * 256)) * 4) | |
31 | #define s3 ((16 + 2 + (3 * 256)) * 4) | |
32 | ||
33 | /* register macros */ | |
/*
 * Register roles:
 *   CTX     context pointer (%rdi)
 *   RIO     pointer used by the block load/store macros (%rsi)
 *   RX0-3   64-bit block state, one register per block. These are
 *           %rax/%rbx/%rcx/%rdx because F()/F4() read their high-byte
 *           halves (%ah..%dh), named RXnbh below.
 *   RT0-3   scratch for S-box indices and the F result
 *   RKEY    preloaded round-key pair used by the 4-way code
 *
 * Note: RT1 and RIO are both %rsi, so RIO is destroyed by F()/F4() and
 * must be reloaded before the result is stored.
 */
34 | #define CTX %rdi | |
35 | #define RIO %rsi | |
36 | ||
37 | #define RX0 %rax | |
38 | #define RX1 %rbx | |
39 | #define RX2 %rcx | |
40 | #define RX3 %rdx | |
41 | ||
42 | #define RX0d %eax | |
43 | #define RX1d %ebx | |
44 | #define RX2d %ecx | |
45 | #define RX3d %edx | |
46 | ||
47 | #define RX0bl %al | |
48 | #define RX1bl %bl | |
49 | #define RX2bl %cl | |
50 | #define RX3bl %dl | |
51 | ||
52 | #define RX0bh %ah | |
53 | #define RX1bh %bh | |
54 | #define RX2bh %ch | |
55 | #define RX3bh %dh | |
56 | ||
57 | #define RT0 %rbp | |
58 | #define RT1 %rsi | |
e827bb09 JK |
59 | #define RT2 %r8 |
60 | #define RT3 %r9 | |
64b94cea JK |
61 | |
62 | #define RT0d %ebp | |
63 | #define RT1d %esi | |
e827bb09 JK |
64 | #define RT2d %r8d |
65 | #define RT3d %r9d | |
64b94cea | 66 | |
e827bb09 | 67 | #define RKEY %r10 |
64b94cea JK |
68 | |
69 | /*********************************************************************** | |
70 | * 1-way blowfish | |
71 | ***********************************************************************/ | |
e827bb09 JK |
/*
 * F(): one Feistel round on the 1-way state in RX0.
 *
 * The low 32 bits of RX0 are split into four bytes that index the
 * S-boxes: RT0d = ((s0[byte3] + s1[byte2]) ^ s2[byte1]) + s3[byte0],
 * where byte3 is bits 31..24 and byte0 is bits 7..0. The ror/rol by 16
 * only exposes bytes 3 and 2 through %ah/%al and is undone again. The
 * final rol by 32 swaps the two halves of RX0, and the zero-extended
 * 32-bit result in RT0 is XORed into the new low half (the previous
 * high half).
 *
 * Clobbers RT0, RT1 (the same register as RIO) and RT2.
 */
72 | #define F() \ |
73 | rorq $16, RX0; \ | |
74 | movzbl RX0bh, RT0d; \ | |
75 | movzbl RX0bl, RT1d; \ | |
76 | rolq $16, RX0; \ | |
77 | movl s0(CTX,RT0,4), RT0d; \ | |
78 | addl s1(CTX,RT1,4), RT0d; \ | |
79 | movzbl RX0bh, RT1d; \ | |
80 | movzbl RX0bl, RT2d; \ | |
81 | rolq $32, RX0; \ | |
82 | xorl s2(CTX,RT1,4), RT0d; \ | |
83 | addl s3(CTX,RT2,4), RT0d; \ | |
84 | xorq RT0, RX0; | |
64b94cea JK |
85 | |
/*
 * XOR the 8 bytes at p + 4*n into RX0 with a single 64-bit access, i.e.
 * P-array word n into the low half and word n+1 into the high half.
 */
86 | #define add_roundkey_enc(n) \ | |
87 | xorq p+4*(n)(CTX), RX0; | |
88 | ||
/*
 * Two encryption rounds on RX0: mix in P-array words n and n+1 at once,
 * then run F() twice. Clobbers whatever F() clobbers.
 */
89 | #define round_enc(n) \ | |
90 | add_roundkey_enc(n); \ | |
91 | \ | |
e827bb09 JK |
92 | F(); \ |
93 | F(); | |
64b94cea JK |
94 | |
/*
 * Decryption counterpart of add_roundkey_enc(): load the 8 bytes at
 * p + 4*(n-1) (P-array words n-1 and n), swap the two halves so that
 * word n ends up in the low half, and XOR the pair into RX0.
 * Clobbers RT0.
 *
 * The argument is parenthesized, as in preload_roundkey_dec(), so that
 * an expression argument cannot change the offset arithmetic.
 */
95 | #define add_roundkey_dec(n) \ | |
96 | movq p+4*((n)-1)(CTX), RT0; \ | |
97 | rorq $32, RT0; \ | |
98 | xorq RT0, RX0; | |
99 | ||
/*
 * Two decryption rounds on RX0: mix in P-array words n and n-1 (halves
 * swapped by add_roundkey_dec), then run F() twice. Clobbers whatever
 * add_roundkey_dec() and F() clobber.
 *
 * The last line carries no line continuation, matching round_enc(), so
 * a line added directly below cannot be pulled into the macro.
 */
100 | #define round_dec(n) \ | |
101 | add_roundkey_dec(n); \ | |
102 | \ | |
e827bb09 JK |
103 | F(); \ |
104 | F(); | |
64b94cea JK |
105 | |
/*
 * Load the 8-byte block at RIO into RX0. The half swap followed by the
 * full byte reversal leaves the first four input bytes, read as a
 * big-endian word, in the low half of RX0 and the next four in the
 * high half.
 */
106 | #define read_block() \ | |
107 | movq (RIO), RX0; \ | |
108 | rorq $32, RX0; \ | |
109 | bswapq RX0; | |
110 | ||
/*
 * Byte-reverse RX0 and store it to the 8 bytes at RIO, overwriting
 * them. The high half of RX0 is therefore written first. RX0 is left
 * byte-reversed.
 */
111 | #define write_block() \ | |
112 | bswapq RX0; \ | |
113 | movq RX0, (RIO); | |
114 | ||
/*
 * Same as write_block(), but XOR the byte-reversed RX0 into the 8 bytes
 * already at RIO instead of overwriting them.
 */
115 | #define xor_block() \ | |
116 | bswapq RX0; \ | |
117 | xorq RX0, (RIO); | |
118 | ||
/*
 * __blowfish_enc_blk: encrypt one 8-byte block.
 *
 * In:    %rdi = ctx (CTX), %rsi = dst, %rdx = src,
 *        %cl  = if non-zero, XOR the result into dst instead of
 *               overwriting it (only the low byte of %rcx is tested)
 * Out:   8 bytes at dst
 * Clobb: %rax, %rsi, %r8, %r10, %r11, flags. %rbp is used as scratch
 *        (RT0) but preserved through %r11. The stack is not touched.
 *
 * The xor branch target is an assembler-local .L label so that it does
 * not appear as a separate symbol in the middle of this function.
 */
119 | .align 8 | |
120 | .global __blowfish_enc_blk | |
121 | .type __blowfish_enc_blk,@function; | |
122 | ||
123 | __blowfish_enc_blk: | |
e827bb09 JK |
124 | /* input: |
125 | * %rdi: ctx, CTX | |
126 | * %rsi: dst | |
127 | * %rdx: src | |
128 | * %rcx: bool, if true: xor output | |
129 | */ | |
/* RT0 is %rbp: keep the caller's value in %r11 instead of on the stack */
130 | movq %rbp, %r11; | |
131 | ||
/* F() overwrites %rsi (RT1 == RIO), so park dst in %r10 meanwhile */
132 | movq %rsi, %r10; | |
64b94cea JK |
133 | movq %rdx, RIO; |
134 | ||
135 | read_block(); | |
136 | ||
/* 8 x round_enc = 16 F() rounds, then the final P[16]/P[17] pair */
137 | round_enc(0); | |
138 | round_enc(2); | |
139 | round_enc(4); | |
140 | round_enc(6); | |
141 | round_enc(8); | |
142 | round_enc(10); | |
143 | round_enc(12); | |
144 | round_enc(14); | |
145 | add_roundkey_enc(16); | |
146 | ||
e827bb09 | 147 | movq %r11, %rbp; |
64b94cea | 148 | |
e827bb09 JK |
149 | movq %r10, RIO; |
150 | test %cl, %cl; | |
64b94cea JK |
151 | jnz .L__enc_xor; |
152 | ||
153 | write_block(); | |
64b94cea | 154 | ret; |
64b94cea JK |
155 | .L__enc_xor: |
156 | xor_block(); | |
e827bb09 | 157 | ret; |
64b94cea JK |
158 | |
/*
 * blowfish_dec_blk: decrypt one 8-byte block.
 *
 * In:    %rdi = ctx (CTX), %rsi = dst, %rdx = src
 * Out:   8 bytes at dst (always overwritten; there is no xor mode)
 * Clobb: %rax, %rsi, %r8, %r10, %r11, flags. %rbp is used as scratch
 *        (RT0) but preserved through %r11. The stack is not touched.
 */
159 | .align 8 | |
160 | .global blowfish_dec_blk | |
161 | .type blowfish_dec_blk,@function; | |
162 | ||
163 | blowfish_dec_blk: | |
e827bb09 JK |
164 | /* input: |
165 | * %rdi: ctx, CTX | |
166 | * %rsi: dst | |
167 | * %rdx: src | |
168 | */ | |
/* RT0 is %rbp: keep the caller's value in %r11 instead of on the stack */
169 | movq %rbp, %r11; | |
170 | ||
/* F() overwrites %rsi (RT1 == RIO), so park dst in %r10 meanwhile */
171 | movq %rsi, %r10; | |
64b94cea JK |
172 | movq %rdx, RIO; |
173 | ||
174 | read_block(); | |
175 | ||
/* P-array pairs are consumed from the top (17/16) down to 1/0 */
176 | round_dec(17); | |
177 | round_dec(15); | |
178 | round_dec(13); | |
179 | round_dec(11); | |
180 | round_dec(9); | |
181 | round_dec(7); | |
182 | round_dec(5); | |
183 | round_dec(3); | |
184 | add_roundkey_dec(1); | |
185 | ||
e827bb09 | 186 | movq %r10, RIO; |
64b94cea JK |
187 | write_block(); |
188 | ||
e827bb09 | 189 | movq %r11, %rbp; |
64b94cea JK |
190 | |
191 | ret; | |
192 | ||
193 | /********************************************************************** | |
194 | 4-way blowfish, four blocks parallel | |
195 | **********************************************************************/ | |
e827bb09 JK |
196 | |
197 | /* F() for 4-way. Slower when used alone/1-way, but faster when used | |
198 | * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330). | |
199 | */ | |
/*
 * F4(x): one Feistel round on the state register x (one of RX0..RX3;
 * the name is token-pasted to form the xbl/xbh byte views).
 *
 * All four S-box indices are extracted before any table lookup: bytes
 * 1 and 0 of the low half first, then bytes 3 and 2 after a rotate by
 * 16. The second rotate by 16 completes a 32-bit rotation, i.e. the
 * halves of x are swapped. The result
 * ((s0[byte3] + s1[byte2]) ^ s2[byte1]) + s3[byte0] is then XORed into
 * the new low half of x.
 *
 * Clobbers RT0, RT1 (the same register as RIO), RT2 and RT3.
 */
200 | #define F4(x) \ | |
201 | movzbl x ## bh, RT1d; \ | |
202 | movzbl x ## bl, RT3d; \ | |
203 | rorq $16, x; \ | |
204 | movzbl x ## bh, RT0d; \ | |
205 | movzbl x ## bl, RT2d; \ | |
206 | rorq $16, x; \ | |
207 | movl s0(CTX,RT0,4), RT0d; \ | |
208 | addl s1(CTX,RT2,4), RT0d; \ | |
209 | xorl s2(CTX,RT1,4), RT0d; \ | |
210 | addl s3(CTX,RT3,4), RT0d; \ | |
211 | xorq RT0, x; | |
212 | ||
64b94cea JK |
/* XOR the round-key pair currently held in RKEY into all four blocks. */
213 | #define add_preloaded_roundkey4() \ |
214 | xorq RKEY, RX0; \ | |
215 | xorq RKEY, RX1; \ | |
216 | xorq RKEY, RX2; \ | |
217 | xorq RKEY, RX3; | |
218 | ||
/* Load P-array words n (low half) and n+1 (high half) into RKEY. */
219 | #define preload_roundkey_enc(n) \ | |
220 | movq p+4*(n)(CTX), RKEY; | |
221 | ||
/*
 * Apply the key pair that is already in RKEY, then fetch the pair for
 * the next double round (n + 2). The key for n itself is not loaded
 * here; the caller must have preloaded it.
 */
222 | #define add_roundkey_enc4(n) \ | |
223 | add_preloaded_roundkey4(); \ | |
224 | preload_roundkey_enc(n + 2); | |
225 | ||
/*
 * Two encryption rounds on all four blocks: apply the preloaded key
 * pair (and preload the next one), then run F4() over RX0..RX3 twice,
 * one block after the other within each round.
 */
226 | #define round_enc4(n) \ | |
227 | add_roundkey_enc4(n); \ | |
228 | \ | |
e827bb09 JK |
229 | F4(RX0); \ |
230 | F4(RX1); \ | |
231 | F4(RX2); \ | |
232 | F4(RX3); \ | |
64b94cea | 233 | \ |
e827bb09 JK |
234 | F4(RX0); \ |
235 | F4(RX1); \ | |
236 | F4(RX2); \ | |
237 | F4(RX3); | |
64b94cea JK |
238 | |
/*
 * Load P-array words n-1 and n into RKEY and swap the halves, so that
 * word n sits in the low half and word n-1 in the high half.
 */
239 | #define preload_roundkey_dec(n) \ | |
240 | movq p+4*((n)-1)(CTX), RKEY; \ | |
241 | rorq $32, RKEY; | |
242 | ||
/*
 * Apply the key pair that is already in RKEY, then fetch the pair for
 * the next double round (n - 2). The key for n itself is not loaded
 * here; the caller must have preloaded it.
 */
243 | #define add_roundkey_dec4(n) \ | |
244 | add_preloaded_roundkey4(); \ | |
245 | preload_roundkey_dec(n - 2); | |
246 | ||
/*
 * Two decryption rounds on all four blocks: apply the preloaded key
 * pair (and preload the next lower one), then run F4() over RX0..RX3
 * twice, one block after the other within each round.
 */
247 | #define round_dec4(n) \ | |
248 | add_roundkey_dec4(n); \ | |
249 | \ | |
e827bb09 JK |
250 | F4(RX0); \ |
251 | F4(RX1); \ | |
252 | F4(RX2); \ | |
253 | F4(RX3); \ | |
64b94cea | 254 | \ |
e827bb09 JK |
255 | F4(RX0); \ |
256 | F4(RX1); \ | |
257 | F4(RX2); \ | |
258 | F4(RX3); | |
64b94cea JK |
259 | |
/*
 * Load four consecutive 8-byte blocks (32 bytes) starting at RIO into
 * RX0..RX3. Each block gets the same half swap and byte reversal as in
 * read_block(): its first four bytes, read as a big-endian word, end
 * up in the low half of the register.
 */
260 | #define read_block4() \ | |
261 | movq (RIO), RX0; \ | |
262 | rorq $32, RX0; \ | |
263 | bswapq RX0; \ | |
264 | \ | |
265 | movq 8(RIO), RX1; \ | |
266 | rorq $32, RX1; \ | |
267 | bswapq RX1; \ | |
268 | \ | |
269 | movq 16(RIO), RX2; \ | |
270 | rorq $32, RX2; \ | |
271 | bswapq RX2; \ | |
272 | \ | |
273 | movq 24(RIO), RX3; \ | |
274 | rorq $32, RX3; \ | |
275 | bswapq RX3; | |
276 | ||
/*
 * Byte-reverse RX0..RX3 and store them to the 32 bytes at RIO,
 * overwriting them. The registers are left byte-reversed.
 */
277 | #define write_block4() \ | |
278 | bswapq RX0; \ | |
279 | movq RX0, (RIO); \ | |
280 | \ | |
281 | bswapq RX1; \ | |
282 | movq RX1, 8(RIO); \ | |
283 | \ | |
284 | bswapq RX2; \ | |
285 | movq RX2, 16(RIO); \ | |
286 | \ | |
287 | bswapq RX3; \ | |
288 | movq RX3, 24(RIO); | |
289 | ||
/*
 * Same as write_block4(), but XOR the byte-reversed RX0..RX3 into the
 * 32 bytes already at RIO instead of overwriting them.
 */
290 | #define xor_block4() \ | |
291 | bswapq RX0; \ | |
292 | xorq RX0, (RIO); \ | |
293 | \ | |
294 | bswapq RX1; \ | |
295 | xorq RX1, 8(RIO); \ | |
296 | \ | |
297 | bswapq RX2; \ | |
298 | xorq RX2, 16(RIO); \ | |
299 | \ | |
300 | bswapq RX3; \ | |
301 | xorq RX3, 24(RIO); | |
302 | ||
/*
 * __blowfish_enc_blk_4way: encrypt four consecutive 8-byte blocks.
 *
 * In:    %rdi = ctx (CTX), %rsi = dst, %rdx = src,
 *        %cl  = if non-zero, XOR the results into dst instead of
 *               overwriting it (only the low byte is tested)
 * Out:   32 bytes at dst
 * Clobb: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags. %rbx and %rbp are
 *        used (RX1, RT0) but saved and restored on the stack.
 * Stack: three 8-byte pushes.
 *
 * The xor branch target is an assembler-local .L label so that it does
 * not appear as a separate symbol in the middle of this function.
 */
303 | .align 8 | |
304 | .global __blowfish_enc_blk_4way | |
305 | .type __blowfish_enc_blk_4way,@function; | |
306 | ||
307 | __blowfish_enc_blk_4way: | |
e827bb09 JK |
308 | /* input: |
309 | * %rdi: ctx, CTX | |
310 | * %rsi: dst | |
311 | * %rdx: src | |
312 | * %rcx: bool, if true: xor output | |
313 | */ | |
64b94cea JK |
314 | pushq %rbp; |
315 | pushq %rbx; | |
e827bb09 JK |
/* %rcx is RX2 and gets overwritten: keep the xor flag on the stack */
316 | pushq %rcx; |
317 | ||
64b94cea JK |
318 | preload_roundkey_enc(0); |
319 | ||
/* F4() overwrites %rsi (RT1 == RIO), so park dst in %r11 meanwhile */
e827bb09 | 320 | movq %rsi, %r11; |
64b94cea JK |
321 | movq %rdx, RIO; |
322 | ||
323 | read_block4(); | |
324 | ||
/* round_enc4(14) preloads P[16]/P[17] for the final key addition */
325 | round_enc4(0); | |
326 | round_enc4(2); | |
327 | round_enc4(4); | |
328 | round_enc4(6); | |
329 | round_enc4(8); | |
330 | round_enc4(10); | |
331 | round_enc4(12); | |
332 | round_enc4(14); | |
333 | add_preloaded_roundkey4(); | |
334 | ||
/* pop the saved xor flag (pushed from %rcx) into the free %rbp */
335 | popq %rbp; | |
e827bb09 | 336 | movq %r11, RIO; |
64b94cea JK |
337 | |
338 | test %bpl, %bpl; | |
339 | jnz .L__enc_xor4; | |
340 | ||
341 | write_block4(); | |
342 | ||
64b94cea JK |
343 | popq %rbx; |
344 | popq %rbp; | |
64b94cea JK |
345 | ret; |
346 | ||
347 | .L__enc_xor4: | |
348 | xor_block4(); | |
349 | ||
e827bb09 JK |
350 | popq %rbx; |
351 | popq %rbp; | |
352 | ret; | |
64b94cea JK |
353 | |
/*
 * blowfish_dec_blk_4way: decrypt four consecutive 8-byte blocks.
 *
 * In:    %rdi = ctx (CTX), %rsi = dst, %rdx = src
 * Out:   32 bytes at dst (always overwritten; there is no xor mode)
 * Clobb: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags. %rbx and %rbp are
 *        used (RX1, RT0) but saved and restored on the stack.
 * Stack: two 8-byte pushes.
 */
354 | .align 8 | |
355 | .global blowfish_dec_blk_4way | |
356 | .type blowfish_dec_blk_4way,@function; | |
357 | ||
358 | blowfish_dec_blk_4way: | |
e827bb09 JK |
359 | /* input: |
360 | * %rdi: ctx, CTX | |
361 | * %rsi: dst | |
362 | * %rdx: src | |
363 | */ | |
64b94cea JK |
364 | pushq %rbp; |
365 | pushq %rbx; | |
64b94cea JK |
366 | preload_roundkey_dec(17); |
367 | ||
/* F4() overwrites %rsi (RT1 == RIO), so park dst in %r11 meanwhile */
e827bb09 | 368 | movq %rsi, %r11; |
64b94cea JK |
369 | movq %rdx, RIO; |
370 | ||
371 | read_block4(); | |
372 | ||
/* round_dec4(3) preloads P[1]/P[0] for the final key addition */
373 | round_dec4(17); | |
374 | round_dec4(15); | |
375 | round_dec4(13); | |
376 | round_dec4(11); | |
377 | round_dec4(9); | |
378 | round_dec4(7); | |
379 | round_dec4(5); | |
380 | round_dec4(3); | |
381 | add_preloaded_roundkey4(); | |
382 | ||
e827bb09 | 383 | movq %r11, RIO; |
64b94cea JK |
384 | write_block4(); |
385 | ||
64b94cea JK |
386 | popq %rbx; |
387 | popq %rbp; | |
388 | ||
389 | ret; | |
390 |