Commit | Line | Data |
---|---|---|
64b94cea JK |
1 | /* |
2 | * Blowfish Cipher Algorithm (x86_64) | |
3 | * | |
4 | * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License as published by | |
8 | * the Free Software Foundation; either version 2 of the License, or | |
9 | * (at your option) any later version. | |
10 | * | |
11 | * This program is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | * GNU General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * along with this program; if not, write to the Free Software | |
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | |
19 | * USA | |
20 | * | |
21 | */ | |
22 | ||
5186e395 JK |
23 | #include <linux/linkage.h> |
24 | ||
64b94cea JK |
/* Name the logical source file for the assembler, then switch to the
 * code section so everything below is emitted as executable text. */
25 | .file "blowfish-x86_64-asm.S" |
26 | .text | |
27 | ||
28 | /* structure of crypto context */ | |
/*
 * Byte offsets from CTX.  p starts at offset 0; s0 begins right after
 * (16 + 2) 4-byte entries, and s0..s3 are spaced 256 4-byte entries
 * apart.  The round macros index each sN with a byte value scaled by 4
 * and index p in 4-byte steps.
 */
29 | #define p 0 | |
30 | #define s0 ((16 + 2) * 4) | |
31 | #define s1 ((16 + 2 + (1 * 256)) * 4) | |
32 | #define s2 ((16 + 2 + (2 * 256)) * 4) | |
33 | #define s3 ((16 + 2 + (3 * 256)) * 4) | |
34 | ||
35 | /* register macros */ | |
/*
 * Register roles (AT&T syntax):
 *   CTX       base pointer for the p / s0..s3 offsets.
 *   RIO       pointer the block load/store macros dereference.
 *   RX0..RX3  64-bit block registers; RXnd is the low 32 bits, RXnbl and
 *             RXnbh are bits 0..7 and 8..15.  They map to a/b/c/d
 *             because only those registers have an addressable "h" byte.
 *   RT0..RT3  scratch for table indices and looked-up values.
 *   RKEY      preloaded round key used by the 4-way macros.
 *
 * Aliasing to keep in mind:
 *   - RIO and RT1 are both %rsi, so any macro that writes RT1 destroys
 *     RIO; the entry points keep dst in another register and reload RIO.
 *   - RT0 is %rbp and RX1 is %rbx, both callee-saved in the SysV AMD64
 *     ABI; the entry points save and restore whichever of them they use.
 */
36 | #define CTX %rdi | |
37 | #define RIO %rsi | |
38 | ||
39 | #define RX0 %rax | |
40 | #define RX1 %rbx | |
41 | #define RX2 %rcx | |
42 | #define RX3 %rdx | |
43 | ||
44 | #define RX0d %eax | |
45 | #define RX1d %ebx | |
46 | #define RX2d %ecx | |
47 | #define RX3d %edx | |
48 | ||
49 | #define RX0bl %al | |
50 | #define RX1bl %bl | |
51 | #define RX2bl %cl | |
52 | #define RX3bl %dl | |
53 | ||
54 | #define RX0bh %ah | |
55 | #define RX1bh %bh | |
56 | #define RX2bh %ch | |
57 | #define RX3bh %dh | |
58 | ||
59 | #define RT0 %rbp | |
60 | #define RT1 %rsi | |
e827bb09 JK |
61 | #define RT2 %r8 |
62 | #define RT3 %r9 | |
64b94cea JK |
63 | 
64 | #define RT0d %ebp | |
65 | #define RT1d %esi | |
e827bb09 JK |
66 | #define RT2d %r8d |
67 | #define RT3d %r9d | |
64b94cea | 68 | 
e827bb09 | 69 | #define RKEY %r10 |
64b94cea JK |
70 | |
71 | /*********************************************************************** | |
72 | * 1-way blowfish | |
73 | ***********************************************************************/ | |
e827bb09 JK |
/*
 * F(): one round-function step on RX0 (1-way path).
 *
 * Let a, b, c, d be bits 31..24, 23..16, 15..8 and 7..0 of RX0 on entry.
 * The macro computes, in 32-bit arithmetic,
 *     t = ((s0[a] + s1[b]) ^ s2[c]) + s3[d]
 * swaps the two 32-bit halves of RX0 (rolq $32), and XORs t into the new
 * low half.  The rorq/rolq $16 pair only exists to bring a and b into
 * %ah/%al and cancels out.  t is built with 32-bit operations, so the
 * upper half of RT0 is zero and the final xorq leaves the high half of
 * RX0 unchanged.
 *
 * Clobbers: RT0 (%rbp), RT1 (%rsi, i.e. RIO), RT2 (%r8), flags.
 */
74 | #define F() \ |
75 | rorq $16, RX0; \ | |
76 | movzbl RX0bh, RT0d; \ | |
77 | movzbl RX0bl, RT1d; \ | |
78 | rolq $16, RX0; \ | |
79 | movl s0(CTX,RT0,4), RT0d; \ | |
80 | addl s1(CTX,RT1,4), RT0d; \ | |
81 | movzbl RX0bh, RT1d; \ | |
82 | movzbl RX0bl, RT2d; \ | |
83 | rolq $32, RX0; \ | |
84 | xorl s2(CTX,RT1,4), RT0d; \ | |
85 | addl s3(CTX,RT2,4), RT0d; \ | |
86 | xorq RT0, RX0; | |
64b94cea JK |
87 | |
/*
 * add_roundkey_enc(n): XOR the 8 bytes at p + 4*n into RX0, i.e. the
 * 4-byte entries n (into the low half) and n+1 (into the high half) in a
 * single 64-bit operation.  Clobbers flags only.
 */
88 | #define add_roundkey_enc(n) \ | |
89 | xorq p+4*(n)(CTX), RX0; | |
90 | ||
/*
 * round_enc(n): add the key pair n/n+1 to RX0, then apply F() twice.
 * Each F() swaps the halves of RX0, so after the pair the halves are back
 * in their starting positions, ready for the next 64-bit key add.
 * Clobbers everything F() clobbers.
 */
91 | #define round_enc(n) \ | |
92 | add_roundkey_enc(n); \ | |
93 | \ | |
e827bb09 JK |
94 | F(); \ |
95 | F(); | |
64b94cea JK |
96 | |
/*
 * add_roundkey_dec(n): XOR the decryption key pair into RX0.
 *
 * Loads the 8 bytes at p + 4*(n-1) (4-byte entries n-1 and n), rotates by
 * 32 so that entry n ends up in the low half and entry n-1 in the high
 * half, and XORs the result into RX0.
 *
 * The argument is parenthesized, matching preload_roundkey_dec(), so an
 * expression argument cannot change the offset arithmetic.
 *
 * Clobbers: RT0 (%rbp), flags.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;
101 | ||
/*
 * round_dec(n): add the decryption key pair n/n-1 to RX0, then apply F()
 * twice.  Mirror image of round_enc().
 *
 * The final line deliberately has no trailing backslash: a continuation
 * there would splice whatever follows the macro into its body.
 *
 * Clobbers everything add_roundkey_dec() and F() clobber.
 */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();
64b94cea JK |
107 | |
/*
 * read_block(): load 8 bytes from (RIO) into RX0.
 * The rotate-by-32 followed by a 64-bit byte swap is equivalent to byte
 * swapping each 32-bit half in place: the low half of RX0 becomes the
 * big-endian value of bytes 0..3 and the high half that of bytes 4..7.
 * Clobbers flags.
 */
108 | #define read_block() \ | |
109 | movq (RIO), RX0; \ | |
110 | rorq $32, RX0; \ | |
111 | bswapq RX0; | |
112 | ||
/*
 * write_block(): byte-swap RX0 as a whole and store it to (RIO).
 * Unlike read_block() there is no rotate, so memory receives the high
 * half of RX0 first (big-endian) followed by the low half.  RX0 is left
 * byte-swapped.
 */
113 | #define write_block() \ | |
114 | bswapq RX0; \ | |
115 | movq RX0, (RIO); | |
116 | ||
/*
 * xor_block(): same byte order as write_block(), but XORs the 8 bytes
 * into the memory at (RIO) instead of overwriting it.  Clobbers flags and
 * leaves RX0 byte-swapped.
 */
117 | #define xor_block() \ | |
118 | bswapq RX0; \ | |
119 | xorq RX0, (RIO); | |
120 | ||
/*
 * __blowfish_enc_blk: process one 8-byte block from src through eight
 * round_enc() steps plus a final key add, then either store the result to
 * dst or XOR it into dst, depending on the low byte of %rcx.
 *
 * Registers written: %rax, %rsi, %r8, %r10, %r11, flags; %rbp is used as
 * scratch but restored before returning.  No stack is used.
 */
5186e395 | 121 | ENTRY(__blowfish_enc_blk) |
e827bb09 JK |
122 | /* input: |
123 | * %rdi: ctx, CTX | |
124 | * %rsi: dst | |
125 | * %rdx: src | |
126 | * %rcx: bool, if true: xor output | |
127 | */ | |
/* F() uses RT0 (%rbp) as scratch; park the caller's value in %r11. */
128 | movq %rbp, %r11; | |
129 | ||
/* RIO shares %rsi with RT1, so dst must live elsewhere during the rounds. */
130 | movq %rsi, %r10; |
64b94cea JK |
131 | movq %rdx, RIO; |
132 | ||
133 | read_block(); | |
134 | ||
/* Each round_enc() consumes two key entries (n, n+1) and runs F() twice. */
135 | round_enc(0); | |
136 | round_enc(2); | |
137 | round_enc(4); | |
138 | round_enc(6); | |
139 | round_enc(8); | |
140 | round_enc(10); | |
141 | round_enc(12); | |
142 | round_enc(14); | |
/* Final key add with entries 16 and 17; no F() follows. */
143 | add_roundkey_enc(16); | |
144 | ||
e827bb09 | 145 | movq %r11, %rbp; |
64b94cea | 146 | 
e827bb09 JK |
/* Point RIO at dst again; %rcx is untouched by the 1-way macros. */
147 | movq %r10, RIO; |
148 | test %cl, %cl; | |
5186e395 | 149 | jnz .L__enc_xor; |
64b94cea JK |
150 | 
151 | write_block(); | |
64b94cea | 152 | ret; |
5186e395 | 153 | .L__enc_xor: |
64b94cea | 154 | xor_block(); |
e827bb09 | 155 | ret; |
5186e395 | 156 | ENDPROC(__blowfish_enc_blk) |
64b94cea | 157 | |
/*
 * blowfish_dec_blk: process one 8-byte block from src through eight
 * round_dec() steps (key entries walked from 17 down to 0) plus a final
 * key add, and store the result to dst.
 *
 * Registers written: %rax, %rsi, %r8, %r10, %r11, flags; %rbp is used as
 * scratch but restored before returning.  No stack is used.
 */
5186e395 | 158 | ENTRY(blowfish_dec_blk) |
e827bb09 JK |
159 | /* input: |
160 | * %rdi: ctx, CTX | |
161 | * %rsi: dst | |
162 | * %rdx: src | |
163 | */ | |
/* RT0 (%rbp) is scratch for both F() and add_roundkey_dec(). */
164 | movq %rbp, %r11; | |
165 | ||
/* RIO shares %rsi with RT1, so dst must live elsewhere during the rounds. */
166 | movq %rsi, %r10; |
64b94cea JK |
167 | movq %rdx, RIO; |
168 | ||
169 | read_block(); | |
170 | ||
/* Each round_dec(n) consumes key entries n and n-1 and runs F() twice. */
171 | round_dec(17); | |
172 | round_dec(15); | |
173 | round_dec(13); | |
174 | round_dec(11); | |
175 | round_dec(9); | |
176 | round_dec(7); | |
177 | round_dec(5); | |
178 | round_dec(3); | |
/* Final key add with entries 1 and 0; this still clobbers %rbp. */
179 | add_roundkey_dec(1); | |
180 | ||
e827bb09 | 181 | movq %r10, RIO; |
64b94cea JK |
182 | write_block(); |
183 | ||
e827bb09 | 184 | movq %r11, %rbp; |
64b94cea JK |
185 | 
186 | ret; | |
5186e395 | 187 | ENDPROC(blowfish_dec_blk) |
64b94cea JK |
188 | |
189 | /********************************************************************** | |
190 | 4-way blowfish, four blocks parallel | |
191 | **********************************************************************/ | |
e827bb09 JK |
192 | |
193 | /* F() for 4-way. Slower than F() when only one block is processed, but | |
194 | * faster when four blocks run in parallel (tested on AMD Phenom II & Intel Xeon E7330). | |
195 | */ | |
/*
 * F4(x): round-function step on block register x (one of RX0..RX3).
 *
 * With a, b, c, d = bits 31..24, 23..16, 15..8, 7..0 of x on entry, it
 * computes t = ((s0[a] + s1[b]) ^ s2[c]) + s3[d] in 32-bit arithmetic.
 * The two rorq $16 add up to a rotate by 32, so the halves of x are
 * swapped before t is XORed into the new low half.
 *
 * "x ## bh" / "x ## bl" paste the argument name into RXnbh / RXnbl, so x
 * must be passed as the macro name (RX0), not as a raw register.
 *
 * Clobbers: RT0 (%rbp), RT1 (%rsi, i.e. RIO), RT2 (%r8), RT3 (%r9), flags.
 */
196 | #define F4(x) \ | |
197 | movzbl x ## bh, RT1d; \ | |
198 | movzbl x ## bl, RT3d; \ | |
199 | rorq $16, x; \ | |
200 | movzbl x ## bh, RT0d; \ | |
201 | movzbl x ## bl, RT2d; \ | |
202 | rorq $16, x; \ | |
203 | movl s0(CTX,RT0,4), RT0d; \ | |
204 | addl s1(CTX,RT2,4), RT0d; \ | |
205 | xorl s2(CTX,RT1,4), RT0d; \ | |
206 | addl s3(CTX,RT3,4), RT0d; \ | |
207 | xorq RT0, x; | |
208 | ||
64b94cea JK |
/*
 * add_preloaded_roundkey4(): XOR the 64-bit key already held in RKEY into
 * all four block registers.  RKEY itself is not modified.  Clobbers flags.
 */
209 | #define add_preloaded_roundkey4() \ |
210 | xorq RKEY, RX0; \ | |
211 | xorq RKEY, RX1; \ | |
212 | xorq RKEY, RX2; \ | |
213 | xorq RKEY, RX3; | |
214 | ||
/*
 * preload_roundkey_enc(n): load the 8 bytes at p + 4*n (4-byte entries n
 * and n+1) into RKEY for a later add_preloaded_roundkey4().
 */
215 | #define preload_roundkey_enc(n) \ | |
216 | movq p+4*(n)(CTX), RKEY; | |
217 | ||
/*
 * add_roundkey_enc4(n): apply the key currently in RKEY to all four
 * blocks, then immediately start loading the key for step n + 2 so the
 * load overlaps the F4() work that follows.  The key for step n itself
 * must already be in RKEY when this is invoked.
 */
218 | #define add_roundkey_enc4(n) \ | |
219 | add_preloaded_roundkey4(); \ | |
220 | preload_roundkey_enc(n + 2); | |
221 | ||
/*
 * round_enc4(n): 4-way counterpart of round_enc().  Adds the preloaded
 * key to RX0..RX3 (and preloads the next one), then runs F4() on each of
 * the four blocks, twice.  The four F4() invocations in a pass touch
 * different block registers, so they are independent of each other.
 * Clobbers RKEY plus everything F4() clobbers.
 */
222 | #define round_enc4(n) \ | |
223 | add_roundkey_enc4(n); \ | |
224 | \ | |
e827bb09 JK |
225 | F4(RX0); \ |
226 | F4(RX1); \ | |
227 | F4(RX2); \ | |
228 | F4(RX3); \ | |
64b94cea | 229 | \ |
e827bb09 JK |
230 | F4(RX0); \ |
231 | F4(RX1); \ | |
232 | F4(RX2); \ | |
233 | F4(RX3); | |
64b94cea JK |
234 | |
/*
 * preload_roundkey_dec(n): load the 8 bytes at p + 4*(n-1) (4-byte
 * entries n-1 and n) into RKEY and rotate by 32, leaving entry n in the
 * low half and entry n-1 in the high half.  Clobbers flags.
 */
235 | #define preload_roundkey_dec(n) \ | |
236 | movq p+4*((n)-1)(CTX), RKEY; \ | |
237 | rorq $32, RKEY; | |
238 | ||
/*
 * add_roundkey_dec4(n): apply the key currently in RKEY to all four
 * blocks, then preload the key for step n - 2.  The key for step n itself
 * must already be in RKEY when this is invoked.
 */
239 | #define add_roundkey_dec4(n) \ | |
240 | add_preloaded_roundkey4(); \ | |
241 | preload_roundkey_dec(n - 2); | |
242 | ||
/*
 * round_dec4(n): 4-way counterpart of round_dec().  Adds the preloaded
 * key to RX0..RX3 (and preloads the one for n - 2), then runs F4() on
 * each of the four blocks, twice.
 * Clobbers RKEY plus everything F4() clobbers.
 */
243 | #define round_dec4(n) \ | |
244 | add_roundkey_dec4(n); \ | |
245 | \ | |
e827bb09 JK |
246 | F4(RX0); \ |
247 | F4(RX1); \ | |
248 | F4(RX2); \ | |
249 | F4(RX3); \ | |
64b94cea | 250 | \ |
e827bb09 JK |
251 | F4(RX0); \ |
252 | F4(RX1); \ | |
253 | F4(RX2); \ | |
254 | F4(RX3); | |
64b94cea JK |
255 | |
/*
 * read_block4(): load four consecutive 8-byte blocks from RIO+0, +8, +16
 * and +24 into RX0..RX3, applying the same rotate-by-32 + byte swap as
 * read_block() to each (every 32-bit half is byte-swapped in place).
 * Overwrites %rax, %rbx, %rcx and %rdx, so any argument still held in
 * %rcx or %rdx must have been saved beforehand.  Clobbers flags.
 */
256 | #define read_block4() \ | |
257 | movq (RIO), RX0; \ | |
258 | rorq $32, RX0; \ | |
259 | bswapq RX0; \ | |
260 | \ | |
261 | movq 8(RIO), RX1; \ | |
262 | rorq $32, RX1; \ | |
263 | bswapq RX1; \ | |
264 | \ | |
265 | movq 16(RIO), RX2; \ | |
266 | rorq $32, RX2; \ | |
267 | bswapq RX2; \ | |
268 | \ | |
269 | movq 24(RIO), RX3; \ | |
270 | rorq $32, RX3; \ | |
271 | bswapq RX3; | |
272 | ||
/*
 * write_block4(): byte-swap RX0..RX3 and store them to RIO+0, +8, +16 and
 * +24.  As in write_block() there is no rotate, so for each block the
 * high half of the register is written first.  The registers are left
 * byte-swapped.
 */
273 | #define write_block4() \ | |
274 | bswapq RX0; \ | |
275 | movq RX0, (RIO); \ | |
276 | \ | |
277 | bswapq RX1; \ | |
278 | movq RX1, 8(RIO); \ | |
279 | \ | |
280 | bswapq RX2; \ | |
281 | movq RX2, 16(RIO); \ | |
282 | \ | |
283 | bswapq RX3; \ | |
284 | movq RX3, 24(RIO); | |
285 | ||
/*
 * xor_block4(): same layout as write_block4(), but XORs each byte-swapped
 * register into the 8 bytes already at RIO+0, +8, +16 and +24 instead of
 * overwriting them.  Clobbers flags; the registers are left byte-swapped.
 */
286 | #define xor_block4() \ | |
287 | bswapq RX0; \ | |
288 | xorq RX0, (RIO); \ | |
289 | \ | |
290 | bswapq RX1; \ | |
291 | xorq RX1, 8(RIO); \ | |
292 | \ | |
293 | bswapq RX2; \ | |
294 | xorq RX2, 16(RIO); \ | |
295 | \ | |
296 | bswapq RX3; \ | |
297 | xorq RX3, 24(RIO); | |
298 | ||
/*
 * __blowfish_enc_blk_4way: process four consecutive 8-byte blocks (32
 * bytes) from src through eight round_enc4() steps plus a final key add,
 * then either store the results to dst or XOR them into dst, depending on
 * the low byte of the %rcx argument.
 *
 * %rbp and %rbx are saved on the stack and restored.  Other registers
 * written: %rax, %rcx, %rdx, %rsi, %r8, %r9, %r10, %r11, flags.
 */
5186e395 | 299 | ENTRY(__blowfish_enc_blk_4way) |
e827bb09 JK |
300 | /* input: |
301 | * %rdi: ctx, CTX | |
302 | * %rsi: dst | |
303 | * %rdx: src | |
304 | * %rcx: bool, if true: xor output | |
305 | */ | |
64b94cea JK |
/* %rbp is RT0 and %rbx is RX1; both are overwritten below. */
306 | pushq %rbp; |
307 | pushq %rbx; | |
e827bb09 JK |
/* %rcx doubles as RX2, so the xor flag has to survive on the stack. */
308 | pushq %rcx; |
309 | ||
64b94cea JK |
310 | preload_roundkey_enc(0); |
311 | ||
/* RIO shares %rsi with RT1; keep dst in %r11 during the rounds. */
e827bb09 | 312 | movq %rsi, %r11; |
64b94cea JK |
313 | movq %rdx, RIO; |
314 | ||
315 | read_block4(); | |
316 | ||
/* round_enc4(n) applies the preloaded key and fetches the one for n + 2. */
317 | round_enc4(0); | |
318 | round_enc4(2); | |
319 | round_enc4(4); | |
320 | round_enc4(6); | |
321 | round_enc4(8); | |
322 | round_enc4(10); | |
323 | round_enc4(12); | |
324 | round_enc4(14); | |
/* Key for step 16 was preloaded by round_enc4(14). */
325 | add_preloaded_roundkey4(); | |
326 | ||
/* This pops the saved %rcx (xor flag), not the caller's %rbp: %rbp is
 * free as scratch now and %rcx still holds block data. */
327 | popq %rbp; | |
e827bb09 | 328 | movq %r11, RIO; |
64b94cea JK |
329 | 
330 | test %bpl, %bpl; | |
5186e395 | 331 | jnz .L__enc_xor4; |
64b94cea JK |
332 | 
333 | write_block4(); | |
334 | ||
64b94cea JK |
335 | popq %rbx; |
336 | popq %rbp; | |
64b94cea JK |
337 | ret; |
338 | ||
5186e395 | 339 | .L__enc_xor4: |
64b94cea JK |
340 | xor_block4(); |
341 | ||
e827bb09 JK |
342 | popq %rbx; |
343 | popq %rbp; | |
344 | ret; | |
5186e395 | 345 | ENDPROC(__blowfish_enc_blk_4way) |
64b94cea | 346 | |
/*
 * blowfish_dec_blk_4way: process four consecutive 8-byte blocks (32
 * bytes) from src through eight round_dec4() steps (key entries walked
 * from 17 down to 0) plus a final key add, and store the results to dst.
 *
 * %rbp and %rbx are saved on the stack and restored.  Other registers
 * written: %rax, %rcx, %rdx, %rsi, %r8, %r9, %r10, %r11, flags.
 */
5186e395 | 347 | ENTRY(blowfish_dec_blk_4way) |
e827bb09 JK |
348 | /* input: |
349 | * %rdi: ctx, CTX | |
350 | * %rsi: dst | |
351 | * %rdx: src | |
352 | */ | |
64b94cea JK |
/* %rbp is RT0 and %rbx is RX1; both are overwritten below. */
353 | pushq %rbp; |
354 | pushq %rbx; | |
64b94cea JK |
355 | preload_roundkey_dec(17); |
356 | ||
/* RIO shares %rsi with RT1; keep dst in %r11 during the rounds. */
e827bb09 | 357 | movq %rsi, %r11; |
64b94cea JK |
358 | movq %rdx, RIO; |
359 | ||
360 | read_block4(); | |
361 | ||
/* round_dec4(n) applies the preloaded key and fetches the one for n - 2. */
362 | round_dec4(17); | |
363 | round_dec4(15); | |
364 | round_dec4(13); | |
365 | round_dec4(11); | |
366 | round_dec4(9); | |
367 | round_dec4(7); | |
368 | round_dec4(5); | |
369 | round_dec4(3); | |
/* Key for step 1 was preloaded by round_dec4(3). */
370 | add_preloaded_roundkey4(); | |
371 | ||
e827bb09 | 372 | movq %r11, RIO; |
64b94cea JK |
373 | write_block4(); |
374 | ||
64b94cea JK |
375 | popq %rbx; |
376 | popq %rbp; | |
377 | ||
378 | ret; | |
5186e395 | 379 | ENDPROC(blowfish_dec_blk_4way) |