Commit | Line | Data |
---|---|---|
1c201e64 MS |
1 | /* |
2 | * Fast AES implementation for SPE instruction set (PPC) | |
3 | * | |
4 | * This code makes use of the SPE SIMD instruction set as defined in | |
5 | * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf | |
6 | * Implementation is based on optimization guide notes from | |
7 | * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf | |
8 | * | |
9 | * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> | |
10 | * | |
11 | * This program is free software; you can redistribute it and/or modify it | |
12 | * under the terms of the GNU General Public License as published by the Free | |
13 | * Software Foundation; either version 2 of the License, or (at your option) | |
14 | * any later version. | |
15 | * | |
16 | */ | |
17 | ||
18 | #include <asm/ppc_asm.h> | |
19 | #include "aes-spe-regs.h" | |
20 | ||
21 | #define EAD(in, bpos) /* table index = byte bpos of in (0 = least significant, 3 = most significant) */ \ | |
22 | rlwimi rT0,in,28-((bpos+3)%4)*8,20,27; /* insert it, scaled by 16, into rT0 bits 20-27 (bit 0 = MSB); other rT0 bits kept */ | |
23 | ||
24 | #define DAD(in, bpos) /* table index = byte bpos of in (0 = least significant, 3 = most significant) */ \ | |
25 | rlwimi rT1,in,24-((bpos+3)%4)*8,24,31; /* insert it, unscaled, into the low byte of rT1; other rT1 bits kept */ | |
26 | ||
27 | #define LWH(out, off) /* word at off(rT0) is placed in both halves of out; callers pair it with LWL to replace the low word */ \ | |
28 | evlwwsplat out,off(rT0); /* load word high */ | |
29 | ||
30 | #define LWL(out, off) /* plain 32-bit load of the word at off(rT0) into out */ \ | |
31 | lwz out,off(rT0); /* load word low */ | |
32 | ||
33 | #define LBZ(out, tab, off) /* out = byte at off(tab), zero-extended */ \ | |
34 | lbz out,off(tab); /* load byte */ | |
35 | ||
36 | #define LAH(out, in, bpos, off) /* EAD followed by LWH: look up byte bpos of in */ \ | |
37 | EAD(in, bpos) /* calc addr + load word high */ \ | |
38 | LWH(out, off) /* off (0, 4, 8 or 12 in this file) picks the word inside the indexed entry */ | |
39 | ||
40 | #define LAL(out, in, bpos, off) /* EAD followed by LWL: look up byte bpos of in */ \ | |
41 | EAD(in, bpos) /* calc addr + load word low */ \ | |
42 | LWL(out, off) /* off (0, 4, 8 or 12 in this file) picks the word inside the indexed entry */ | |
43 | ||
44 | #define LAE(out, in, bpos) /* EAD followed by a byte load from offset 8 of the indexed entry */ \ | |
45 | EAD(in, bpos) /* calc addr + load enc byte */ \ | |
46 | LBZ(out, rT0, 8) | |
47 | ||
48 | #define LBE(out) /* same load as LAE, but uses the address already present in rT0 (no EAD) */ \ | |
49 | LBZ(out, rT0, 8) /* load enc byte */ | |
50 | ||
51 | #define LAD(out, in, bpos) /* DAD followed by a byte load from 0(rT1) */ \ | |
52 | DAD(in, bpos) /* calc addr + load dec byte */ \ | |
53 | LBZ(out, rT1, 0) | |
54 | ||
55 | #define LBD(out) /* same load as LAD, but uses the address already present in rT1 (no DAD) */ \ | |
56 | LBZ(out, rT1, 0) /* load dec byte */ | |
57 | ||
58 | /* | |
59 | * ppc_encrypt_block: The central encryption function for a single 16 bytes | |
60 | * block. It does no stack handling or register saving to support fast calls | |
61 | * via bl/blr. It expects that caller has pre-xored input data with first | |
62 | * 4 words of encryption key into rD0-rD3. Pointer/counter registers must | |
63 | * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3 | |
64 | * and rW0-rW3 and caller must execute a final xor on the output registers. | |
65 | * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing. | |
66 | * Also modified: the table index bits of rT0, rKP (+32 per loop pass) and CTR. | |
67 | */ | |
68 | _GLOBAL(ppc_encrypt_block) | |
69 | LAH(rW4, rD1, 2, 4) /* pre-load the three lookups (rW4, rW6, rW3) that every round below issues for its successor */ | |
70 | LAH(rW6, rD0, 3, 0) | |
71 | LAH(rW3, rD0, 1, 8) | |
72 | ppc_encrypt_block_loop: /* two unrolled rounds per pass; CTR holds the number of passes */ | |
73 | LAH(rW0, rD3, 0, 12) /* round 1 of the pass: table lookups, LAH fills the high word, LAL the low word */ | |
74 | LAL(rW0, rD0, 0, 12) | |
75 | LAH(rW1, rD1, 0, 12) | |
76 | LAH(rW2, rD2, 1, 8) | |
77 | LAL(rW2, rD3, 1, 8) | |
78 | LAL(rW3, rD1, 1, 8) | |
79 | LAL(rW4, rD2, 2, 4) | |
80 | LAL(rW6, rD1, 3, 0) | |
81 | LAH(rW5, rD3, 2, 4) | |
82 | LAL(rW5, rD0, 2, 4) | |
83 | LAH(rW7, rD2, 3, 0) | |
84 | evldw rD1,16(rKP) /* rD1 = two key words at 16(rKP); old rD1 is no longer needed */ | |
85 | EAD(rD3, 3) | |
86 | evxor rW2,rW2,rW4 | |
87 | LWL(rW7, 0) | |
88 | evxor rW2,rW2,rW6 | |
89 | EAD(rD2, 0) | |
90 | evxor rD1,rD1,rW2 | |
91 | LWL(rW1, 12) | |
92 | evxor rD1,rD1,rW0 /* rD1 complete: high word = next rD0, low word = next rD1 */ | |
93 | evldw rD3,24(rKP) /* rD3 = two key words at 24(rKP) */ | |
94 | evmergehi rD0,rD0,rD1 /* low word of rD0 = high word of rD1 */ | |
95 | EAD(rD1, 2) /* from here the lookups for the following round are interleaved */ | |
96 | evxor rW3,rW3,rW5 | |
97 | LWH(rW4, 4) | |
98 | evxor rW3,rW3,rW7 | |
99 | EAD(rD0, 3) | |
100 | evxor rD3,rD3,rW3 | |
101 | LWH(rW6, 0) | |
102 | evxor rD3,rD3,rW1 /* rD3 complete: high word = next rD2, low word = next rD3 */ | |
103 | EAD(rD0, 1) | |
104 | evmergehi rD2,rD2,rD3 /* low word of rD2 = high word of rD3 */ | |
105 | LWH(rW3, 8) | |
106 | LAH(rW0, rD3, 0, 12) /* round 2 of the pass: same sequence, key words at 32(rKP) and 40(rKP) */ | |
107 | LAL(rW0, rD0, 0, 12) | |
108 | LAH(rW1, rD1, 0, 12) | |
109 | LAH(rW2, rD2, 1, 8) | |
110 | LAL(rW2, rD3, 1, 8) | |
111 | LAL(rW3, rD1, 1, 8) | |
112 | LAL(rW4, rD2, 2, 4) | |
113 | LAL(rW6, rD1, 3, 0) | |
114 | LAH(rW5, rD3, 2, 4) | |
115 | LAL(rW5, rD0, 2, 4) | |
116 | LAH(rW7, rD2, 3, 0) | |
117 | evldw rD1,32(rKP) | |
118 | EAD(rD3, 3) | |
119 | evxor rW2,rW2,rW4 | |
120 | LWL(rW7, 0) | |
121 | evxor rW2,rW2,rW6 | |
122 | EAD(rD2, 0) | |
123 | evxor rD1,rD1,rW2 | |
124 | LWL(rW1, 12) | |
125 | evxor rD1,rD1,rW0 | |
126 | evldw rD3,40(rKP) | |
127 | evmergehi rD0,rD0,rD1 | |
128 | EAD(rD1, 2) | |
129 | evxor rW3,rW3,rW5 | |
130 | LWH(rW4, 4) | |
131 | evxor rW3,rW3,rW7 | |
132 | EAD(rD0, 3) | |
133 | evxor rD3,rD3,rW3 | |
134 | LWH(rW6, 0) | |
135 | evxor rD3,rD3,rW1 | |
136 | EAD(rD0, 1) | |
137 | evmergehi rD2,rD2,rD3 | |
138 | LWH(rW3, 8) | |
139 | addi rKP,rKP,32 /* step the key pointer past the 32 key bytes used by this pass */ | |
140 | bdnz ppc_encrypt_block_loop /* decrement CTR, loop while it is non-zero */ | |
141 | LAH(rW0, rD3, 0, 12) /* last word-table round, key words at 16(rKP) and 24(rKP) */ | |
142 | LAL(rW0, rD0, 0, 12) | |
143 | LAH(rW1, rD1, 0, 12) | |
144 | LAH(rW2, rD2, 1, 8) | |
145 | LAL(rW2, rD3, 1, 8) | |
146 | LAL(rW3, rD1, 1, 8) | |
147 | LAL(rW4, rD2, 2, 4) | |
148 | LAH(rW5, rD3, 2, 4) | |
149 | LAL(rW6, rD1, 3, 0) | |
150 | LAL(rW5, rD0, 2, 4) | |
151 | LAH(rW7, rD2, 3, 0) | |
152 | evldw rD1,16(rKP) | |
153 | EAD(rD3, 3) | |
154 | evxor rW2,rW2,rW4 | |
155 | LWL(rW7, 0) | |
156 | evxor rW2,rW2,rW6 | |
157 | EAD(rD2, 0) | |
158 | evxor rD1,rD1,rW2 | |
159 | LWL(rW1, 12) | |
160 | evxor rD1,rD1,rW0 | |
161 | evldw rD3,24(rKP) | |
162 | evmergehi rD0,rD0,rD1 | |
163 | EAD(rD1, 0) /* final round starts interleaved here: single-byte lookups (LBE/LAE) replace the word lookups */ | |
164 | evxor rW3,rW3,rW5 | |
165 | LBE(rW2) | |
166 | evxor rW3,rW3,rW7 | |
167 | EAD(rD0, 1) | |
168 | evxor rD3,rD3,rW3 | |
169 | LBE(rW6) | |
170 | evxor rD3,rD3,rW1 | |
171 | EAD(rD0, 0) | |
172 | evmergehi rD2,rD2,rD3 | |
173 | LBE(rW1) | |
174 | LAE(rW0, rD3, 0) /* rW0-rW3 start with their low byte; rW4-rW7 carry the bytes merged in below */ | |
175 | LAE(rW1, rD0, 0) /* repeats the lookup of the preceding EAD(rD0, 0)/LBE(rW1) pair */ | |
176 | LAE(rW4, rD2, 1) | |
177 | LAE(rW5, rD3, 1) | |
178 | LAE(rW3, rD2, 0) | |
179 | LAE(rW7, rD1, 1) | |
180 | rlwimi rW0,rW4,8,16,23 /* merge looked-up byte into bits 16-23 */ | |
181 | rlwimi rW1,rW5,8,16,23 | |
182 | LAE(rW4, rD1, 2) | |
183 | LAE(rW5, rD2, 2) | |
184 | rlwimi rW2,rW6,8,16,23 | |
185 | rlwimi rW3,rW7,8,16,23 | |
186 | LAE(rW6, rD3, 2) | |
187 | LAE(rW7, rD0, 2) | |
188 | rlwimi rW0,rW4,16,8,15 /* merge looked-up byte into bits 8-15 */ | |
189 | rlwimi rW1,rW5,16,8,15 | |
190 | LAE(rW4, rD0, 3) | |
191 | LAE(rW5, rD1, 3) | |
192 | rlwimi rW2,rW6,16,8,15 | |
193 | lwz rD0,32(rKP) /* rD0-rD3 are reloaded with the key words at 32..44(rKP) for the caller's final xor */ | |
194 | rlwimi rW3,rW7,16,8,15 | |
195 | lwz rD1,36(rKP) | |
196 | LAE(rW6, rD2, 3) /* last reads of rD2/rD3 before they are reloaded below */ | |
197 | LAE(rW7, rD3, 3) | |
198 | rlwimi rW0,rW4,24,0,7 /* merge looked-up byte into bits 0-7 (most significant byte) */ | |
199 | lwz rD2,40(rKP) | |
200 | rlwimi rW1,rW5,24,0,7 | |
201 | lwz rD3,44(rKP) | |
202 | rlwimi rW2,rW6,24,0,7 | |
203 | rlwimi rW3,rW7,24,0,7 | |
204 | blr /* return: rW0-rW3 = assembled words, rD0-rD3 = key words to xor in */ | |
205 | ||
206 | /* | |
207 | * ppc_decrypt_block: The central decryption function for a single 16 bytes | |
208 | * block. It does no stack handling or register saving to support fast calls | |
209 | * via bl/blr. It expects that caller has pre-xored input data with first | |
210 | * 4 words of encryption key into rD0-rD3. Pointer/counter registers must | |
211 | * have also been set up before (rT0, rT1, rKP, CTR). Output is stored in rD0-rD3 | |
212 | * and rW0-rW3 and caller must execute a final xor on the output registers. | |
213 | * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing. | |
214 | * Also modified: the table index bits of rT0 and rT1, rKP (+32 per loop pass) and CTR. | |
215 | */ | |
216 | _GLOBAL(ppc_decrypt_block) | |
217 | LAH(rW0, rD1, 0, 12) /* pre-load the three lookups (rW0, rW6, rW3) that every round below issues for its successor */ | |
218 | LAH(rW6, rD0, 3, 0) | |
219 | LAH(rW3, rD0, 1, 8) | |
220 | ppc_decrypt_block_loop: /* two unrolled rounds per pass; CTR holds the number of passes */ | |
221 | LAH(rW1, rD3, 0, 12) /* round 1 of the pass: table lookups, LAH fills the high word, LAL the low word */ | |
222 | LAL(rW0, rD2, 0, 12) | |
223 | LAH(rW2, rD2, 1, 8) | |
224 | LAL(rW2, rD3, 1, 8) | |
225 | LAH(rW4, rD3, 2, 4) | |
226 | LAL(rW4, rD0, 2, 4) | |
227 | LAL(rW6, rD1, 3, 0) | |
228 | LAH(rW5, rD1, 2, 4) | |
229 | LAH(rW7, rD2, 3, 0) | |
230 | LAL(rW7, rD3, 3, 0) | |
231 | LAL(rW3, rD1, 1, 8) | |
232 | evldw rD1,16(rKP) /* rD1 = two key words at 16(rKP); old rD1 is no longer needed */ | |
233 | EAD(rD0, 0) | |
234 | evxor rW4,rW4,rW6 | |
235 | LWL(rW1, 12) | |
236 | evxor rW0,rW0,rW4 | |
237 | EAD(rD2, 2) | |
238 | evxor rW0,rW0,rW2 | |
239 | LWL(rW5, 4) | |
240 | evxor rD1,rD1,rW0 /* rD1 complete: high word = next rD0, low word = next rD1 */ | |
241 | evldw rD3,24(rKP) /* rD3 = two key words at 24(rKP) */ | |
242 | evmergehi rD0,rD0,rD1 /* low word of rD0 = high word of rD1 */ | |
243 | EAD(rD1, 0) /* from here the lookups for the following round are interleaved */ | |
244 | evxor rW3,rW3,rW7 | |
245 | LWH(rW0, 12) | |
246 | evxor rW3,rW3,rW1 | |
247 | EAD(rD0, 3) | |
248 | evxor rD3,rD3,rW3 | |
249 | LWH(rW6, 0) | |
250 | evxor rD3,rD3,rW5 /* rD3 complete: high word = next rD2, low word = next rD3 */ | |
251 | EAD(rD0, 1) | |
252 | evmergehi rD2,rD2,rD3 /* low word of rD2 = high word of rD3 */ | |
253 | LWH(rW3, 8) | |
254 | LAH(rW1, rD3, 0, 12) /* round 2 of the pass: same sequence, key words at 32(rKP) and 40(rKP) */ | |
255 | LAL(rW0, rD2, 0, 12) | |
256 | LAH(rW2, rD2, 1, 8) | |
257 | LAL(rW2, rD3, 1, 8) | |
258 | LAH(rW4, rD3, 2, 4) | |
259 | LAL(rW4, rD0, 2, 4) | |
260 | LAL(rW6, rD1, 3, 0) | |
261 | LAH(rW5, rD1, 2, 4) | |
262 | LAH(rW7, rD2, 3, 0) | |
263 | LAL(rW7, rD3, 3, 0) | |
264 | LAL(rW3, rD1, 1, 8) | |
265 | evldw rD1,32(rKP) | |
266 | EAD(rD0, 0) | |
267 | evxor rW4,rW4,rW6 | |
268 | LWL(rW1, 12) | |
269 | evxor rW0,rW0,rW4 | |
270 | EAD(rD2, 2) | |
271 | evxor rW0,rW0,rW2 | |
272 | LWL(rW5, 4) | |
273 | evxor rD1,rD1,rW0 | |
274 | evldw rD3,40(rKP) | |
275 | evmergehi rD0,rD0,rD1 | |
276 | EAD(rD1, 0) | |
277 | evxor rW3,rW3,rW7 | |
278 | LWH(rW0, 12) | |
279 | evxor rW3,rW3,rW1 | |
280 | EAD(rD0, 3) | |
281 | evxor rD3,rD3,rW3 | |
282 | LWH(rW6, 0) | |
283 | evxor rD3,rD3,rW5 | |
284 | EAD(rD0, 1) | |
285 | evmergehi rD2,rD2,rD3 | |
286 | LWH(rW3, 8) | |
287 | addi rKP,rKP,32 /* step the key pointer past the 32 key bytes used by this pass */ | |
288 | bdnz ppc_decrypt_block_loop /* decrement CTR, loop while it is non-zero */ | |
289 | LAH(rW1, rD3, 0, 12) /* last word-table round, key words at 16(rKP) and 24(rKP) */ | |
290 | LAL(rW0, rD2, 0, 12) | |
291 | LAH(rW2, rD2, 1, 8) | |
292 | LAL(rW2, rD3, 1, 8) | |
293 | LAH(rW4, rD3, 2, 4) | |
294 | LAL(rW4, rD0, 2, 4) | |
295 | LAL(rW6, rD1, 3, 0) | |
296 | LAH(rW5, rD1, 2, 4) | |
297 | LAH(rW7, rD2, 3, 0) | |
298 | LAL(rW7, rD3, 3, 0) | |
299 | LAL(rW3, rD1, 1, 8) | |
300 | evldw rD1,16(rKP) | |
301 | EAD(rD0, 0) | |
302 | evxor rW4,rW4,rW6 | |
303 | LWL(rW1, 12) | |
304 | evxor rW0,rW0,rW4 | |
305 | EAD(rD2, 2) | |
306 | evxor rW0,rW0,rW2 | |
307 | LWL(rW5, 4) | |
308 | evxor rD1,rD1,rW0 | |
309 | evldw rD3,24(rKP) | |
310 | evmergehi rD0,rD0,rD1 | |
311 | DAD(rD1, 0) /* final round starts interleaved here: single-byte lookups through rT1 (DAD/LBD/LAD) replace the word lookups */ | |
312 | evxor rW3,rW3,rW7 | |
313 | LBD(rW0) | |
314 | evxor rW3,rW3,rW1 | |
315 | DAD(rD0, 1) | |
316 | evxor rD3,rD3,rW3 | |
317 | LBD(rW6) | |
318 | evxor rD3,rD3,rW5 | |
319 | DAD(rD0, 0) | |
320 | evmergehi rD2,rD2,rD3 | |
321 | LBD(rW3) | |
322 | LAD(rW2, rD3, 0) /* rW0-rW3 start with their low byte; rW4-rW7 carry the bytes merged in below */ | |
323 | LAD(rW1, rD2, 0) | |
324 | LAD(rW4, rD2, 1) | |
325 | LAD(rW5, rD3, 1) | |
326 | LAD(rW7, rD1, 1) | |
327 | rlwimi rW0,rW4,8,16,23 /* merge looked-up byte into bits 16-23 */ | |
328 | rlwimi rW1,rW5,8,16,23 | |
329 | LAD(rW4, rD3, 2) | |
330 | LAD(rW5, rD0, 2) | |
331 | rlwimi rW2,rW6,8,16,23 | |
332 | rlwimi rW3,rW7,8,16,23 | |
333 | LAD(rW6, rD1, 2) | |
334 | LAD(rW7, rD2, 2) | |
335 | rlwimi rW0,rW4,16,8,15 /* merge looked-up byte into bits 8-15 */ | |
336 | rlwimi rW1,rW5,16,8,15 | |
337 | LAD(rW4, rD0, 3) | |
338 | LAD(rW5, rD1, 3) | |
339 | rlwimi rW2,rW6,16,8,15 | |
340 | lwz rD0,32(rKP) /* rD0-rD3 are reloaded with the key words at 32..44(rKP) for the caller's final xor */ | |
341 | rlwimi rW3,rW7,16,8,15 | |
342 | lwz rD1,36(rKP) | |
343 | LAD(rW6, rD2, 3) /* last reads of rD2/rD3 before they are reloaded below */ | |
344 | LAD(rW7, rD3, 3) | |
345 | rlwimi rW0,rW4,24,0,7 /* merge looked-up byte into bits 0-7 (most significant byte) */ | |
346 | lwz rD2,40(rKP) | |
347 | rlwimi rW1,rW5,24,0,7 | |
348 | lwz rD3,44(rKP) | |
349 | rlwimi rW2,rW6,24,0,7 | |
350 | rlwimi rW3,rW7,24,0,7 | |
351 | blr /* return: rW0-rW3 = assembled words, rD0-rD3 = key words to xor in */ | |