#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. As a side note, Cortex-A15 processes one byte in
# 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword* order
# in h[0-7], namely with the most significant dword at the *lower* address,
# which was reflected in the two parameters below as 0 and 4. Now the caller
# is expected to maintain native byte order for whole 64-bit values.
$hi="HI";
$lo="LO";
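# HI and LO are assembler macros defined further down: on little-endian
# targets HI=4 and LO=0, on big-endian targets HI=0 and LO=4, so the same
# loads/stores access h[0-7] in native byte order either way.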
# ====================================================================

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
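# Offsets into the stack frame: copies of the eight 64-bit working variables
# live at $Aoff..$Hoff, and the message schedule starts at $Xoff.  Each round
# lowers sp by 8, so older X[] words remain reachable at fixed offsets; the
# 80*8=640 bytes accumulated this way are popped in one go after each block
# (the "add sp,sp,#640" in the epilogue of the .Loop body).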

sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
	@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov $t0,$Elo,lsr#14
	str $Tlo,[sp,#$Xoff+0]
	mov $t1,$Ehi,lsr#14
	str $Thi,[sp,#$Xoff+4]
	eor $t0,$t0,$Ehi,lsl#18
	ldr $t2,[sp,#$Hoff+0] @ h.lo
	eor $t1,$t1,$Elo,lsl#18
	ldr $t3,[sp,#$Hoff+4] @ h.hi
	eor $t0,$t0,$Elo,lsr#18
	eor $t1,$t1,$Ehi,lsr#18
	eor $t0,$t0,$Ehi,lsl#14
	eor $t1,$t1,$Elo,lsl#14
	eor $t0,$t0,$Ehi,lsr#9
	eor $t1,$t1,$Elo,lsr#9
	eor $t0,$t0,$Elo,lsl#23
	eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
	adds $Tlo,$Tlo,$t0
	ldr $t0,[sp,#$Foff+0] @ f.lo
	adc $Thi,$Thi,$t1 @ T += Sigma1(e)
	ldr $t1,[sp,#$Foff+4] @ f.hi
	adds $Tlo,$Tlo,$t2
	ldr $t2,[sp,#$Goff+0] @ g.lo
	adc $Thi,$Thi,$t3 @ T += h
	ldr $t3,[sp,#$Goff+4] @ g.hi

	eor $t0,$t0,$t2
	str $Elo,[sp,#$Eoff+0]
	eor $t1,$t1,$t3
	str $Ehi,[sp,#$Eoff+4]
	and $t0,$t0,$Elo
	str $Alo,[sp,#$Aoff+0]
	and $t1,$t1,$Ehi
	str $Ahi,[sp,#$Aoff+4]
	eor $t0,$t0,$t2
	ldr $t2,[$Ktbl,#$lo] @ K[i].lo
	eor $t1,$t1,$t3 @ Ch(e,f,g)
	ldr $t3,[$Ktbl,#$hi] @ K[i].hi

	adds $Tlo,$Tlo,$t0
	ldr $Elo,[sp,#$Doff+0] @ d.lo
	adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
	ldr $Ehi,[sp,#$Doff+4] @ d.hi
	adds $Tlo,$Tlo,$t2
	and $t0,$t2,#0xff
	adc $Thi,$Thi,$t3 @ T += K[i]
	adds $Elo,$Elo,$Tlo
	ldr $t2,[sp,#$Boff+0] @ b.lo
	adc $Ehi,$Ehi,$Thi @ d += T
	teq $t0,#$magic

	ldr $t3,[sp,#$Coff+0] @ c.lo
#if __ARM_ARCH__>=7
	it eq @ Thumb2 thing, sanity check in ARM
#endif
	orreq $Ktbl,$Ktbl,#1
	@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov $t0,$Alo,lsr#28
	mov $t1,$Ahi,lsr#28
	eor $t0,$t0,$Ahi,lsl#4
	eor $t1,$t1,$Alo,lsl#4
	eor $t0,$t0,$Ahi,lsr#2
	eor $t1,$t1,$Alo,lsr#2
	eor $t0,$t0,$Alo,lsl#30
	eor $t1,$t1,$Ahi,lsl#30
	eor $t0,$t0,$Ahi,lsr#7
	eor $t1,$t1,$Alo,lsr#7
	eor $t0,$t0,$Alo,lsl#25
	eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
	adds $Tlo,$Tlo,$t0
	and $t0,$Alo,$t2
	adc $Thi,$Thi,$t1 @ T += Sigma0(a)

	ldr $t1,[sp,#$Boff+4] @ b.hi
	orr $Alo,$Alo,$t2
	ldr $t2,[sp,#$Coff+4] @ c.hi
	and $Alo,$Alo,$t3
	and $t3,$Ahi,$t1
	orr $Ahi,$Ahi,$t1
	orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
	and $Ahi,$Ahi,$t2
	adds $Alo,$Alo,$Tlo
	orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
	sub sp,sp,#8
	adc $Ahi,$Ahi,$Thi @ h += T
	tst $Ktbl,#1
	add $Ktbl,$Ktbl,#8
___
}
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
# define VFP_ABI_POP vldmia sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
#endif

.text
#if __ARM_ARCH__<7
.code 32
#else
.syntax unified
# ifdef __thumb2__
# define adrl adr
.thumb
# else
.code 32
# endif
#endif

.type K512,%object
.align 5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha512_block_data_order
.skip 32-4
#else
.skip 32
#endif

.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
#if __ARM_ARCH__<7
	sub r3,pc,#8 @ sha512_block_data_order
#else
	adr r3,sha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr r12,.LOPENSSL_armcap
	ldr r12,[r3,r12] @ OPENSSL_armcap_P
	tst r12,#1
	bne .LNEON
#endif
	add $len,$inp,$len,lsl#7 @ len to point at the end of inp
	stmdb sp!,{r4-r12,lr}
	sub $Ktbl,r3,#672 @ K512
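	@ 672 = 80*8 bytes of K512 plus the 32-byte gap emitted before this entry point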
	sub sp,sp,#9*8

	ldr $Elo,[$ctx,#$Eoff+$lo]
	ldr $Ehi,[$ctx,#$Eoff+$hi]
	ldr $t0, [$ctx,#$Goff+$lo]
	ldr $t1, [$ctx,#$Goff+$hi]
	ldr $t2, [$ctx,#$Hoff+$lo]
	ldr $t3, [$ctx,#$Hoff+$hi]
.Loop:
	str $t0, [sp,#$Goff+0]
	str $t1, [sp,#$Goff+4]
	str $t2, [sp,#$Hoff+0]
	str $t3, [sp,#$Hoff+4]
	ldr $Alo,[$ctx,#$Aoff+$lo]
	ldr $Ahi,[$ctx,#$Aoff+$hi]
	ldr $Tlo,[$ctx,#$Boff+$lo]
	ldr $Thi,[$ctx,#$Boff+$hi]
	ldr $t0, [$ctx,#$Coff+$lo]
	ldr $t1, [$ctx,#$Coff+$hi]
	ldr $t2, [$ctx,#$Doff+$lo]
	ldr $t3, [$ctx,#$Doff+$hi]
	str $Tlo,[sp,#$Boff+0]
	str $Thi,[sp,#$Boff+4]
	str $t0, [sp,#$Coff+0]
	str $t1, [sp,#$Coff+4]
	str $t2, [sp,#$Doff+0]
	str $t3, [sp,#$Doff+4]
	ldr $Tlo,[$ctx,#$Foff+$lo]
	ldr $Thi,[$ctx,#$Foff+$hi]
	str $Tlo,[sp,#$Foff+0]
	str $Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb $Tlo,[$inp,#7]
	ldrb $t0, [$inp,#6]
	ldrb $t1, [$inp,#5]
	ldrb $t2, [$inp,#4]
	ldrb $Thi,[$inp,#3]
	ldrb $t3, [$inp,#2]
	orr $Tlo,$Tlo,$t0,lsl#8
	ldrb $t0, [$inp,#1]
	orr $Tlo,$Tlo,$t1,lsl#16
	ldrb $t1, [$inp],#8
	orr $Tlo,$Tlo,$t2,lsl#24
	orr $Thi,$Thi,$t3,lsl#8
	orr $Thi,$Thi,$t0,lsl#16
	orr $Thi,$Thi,$t1,lsl#24
#else
	ldr $Tlo,[$inp,#4]
	ldr $Thi,[$inp],#8
#ifdef __ARMEL__
	rev $Tlo,$Tlo
	rev $Thi,$Thi
#endif
#endif
___
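# 0x94 and 0x17 below are the low bytes of the last round constants of the
# respective loops (K[15].lo = 0xcf692694, K[79].lo = 0x4a475817): BODY_00_15
# compares K[i].lo & 0xff against this magic value and, on a match, sets bit 0
# of the K512 pointer, which "tst $Ktbl,#1" then uses as the loop-exit flag.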
&BODY_00_15(0x94);
$code.=<<___;
	tst $Ktbl,#1
	beq .L00_15
	ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic $Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
	@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
	mov $Tlo,$t0,lsr#1
	ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov $Thi,$t1,lsr#1
	ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor $Tlo,$Tlo,$t1,lsl#31
	eor $Thi,$Thi,$t0,lsl#31
	eor $Tlo,$Tlo,$t0,lsr#8
	eor $Thi,$Thi,$t1,lsr#8
	eor $Tlo,$Tlo,$t1,lsl#24
	eor $Thi,$Thi,$t0,lsl#24
	eor $Tlo,$Tlo,$t0,lsr#7
	eor $Thi,$Thi,$t1,lsr#7
	eor $Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov $t0,$t2,lsr#19
	mov $t1,$t3,lsr#19
	eor $t0,$t0,$t3,lsl#13
	eor $t1,$t1,$t2,lsl#13
	eor $t0,$t0,$t3,lsr#29
	eor $t1,$t1,$t2,lsr#29
	eor $t0,$t0,$t2,lsl#3
	eor $t1,$t1,$t3,lsl#3
	eor $t0,$t0,$t2,lsr#6
	eor $t1,$t1,$t3,lsr#6
	ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor $t0,$t0,$t3,lsl#26

	ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds $Tlo,$Tlo,$t0
	ldr $t0,[sp,#`$Xoff+8*16`+0]
	adc $Thi,$Thi,$t1

	ldr $t1,[sp,#`$Xoff+8*16`+4]
	adds $Tlo,$Tlo,$t2
	adc $Thi,$Thi,$t3
	adds $Tlo,$Tlo,$t0
	adc $Thi,$Thi,$t1
___
&BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH__>=7
	ittt eq @ Thumb2 thing, sanity check in ARM
#endif
	ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq .L16_79
	bic $Ktbl,$Ktbl,#1

	ldr $Tlo,[sp,#$Boff+0]
	ldr $Thi,[sp,#$Boff+4]
	ldr $t0, [$ctx,#$Aoff+$lo]
	ldr $t1, [$ctx,#$Aoff+$hi]
	ldr $t2, [$ctx,#$Boff+$lo]
	ldr $t3, [$ctx,#$Boff+$hi]
	adds $t0,$Alo,$t0
	str $t0, [$ctx,#$Aoff+$lo]
	adc $t1,$Ahi,$t1
	str $t1, [$ctx,#$Aoff+$hi]
	adds $t2,$Tlo,$t2
	str $t2, [$ctx,#$Boff+$lo]
	adc $t3,$Thi,$t3
	str $t3, [$ctx,#$Boff+$hi]

	ldr $Alo,[sp,#$Coff+0]
	ldr $Ahi,[sp,#$Coff+4]
	ldr $Tlo,[sp,#$Doff+0]
	ldr $Thi,[sp,#$Doff+4]
	ldr $t0, [$ctx,#$Coff+$lo]
	ldr $t1, [$ctx,#$Coff+$hi]
	ldr $t2, [$ctx,#$Doff+$lo]
	ldr $t3, [$ctx,#$Doff+$hi]
	adds $t0,$Alo,$t0
	str $t0, [$ctx,#$Coff+$lo]
	adc $t1,$Ahi,$t1
	str $t1, [$ctx,#$Coff+$hi]
	adds $t2,$Tlo,$t2
	str $t2, [$ctx,#$Doff+$lo]
	adc $t3,$Thi,$t3
	str $t3, [$ctx,#$Doff+$hi]

	ldr $Tlo,[sp,#$Foff+0]
	ldr $Thi,[sp,#$Foff+4]
	ldr $t0, [$ctx,#$Eoff+$lo]
	ldr $t1, [$ctx,#$Eoff+$hi]
	ldr $t2, [$ctx,#$Foff+$lo]
	ldr $t3, [$ctx,#$Foff+$hi]
	adds $Elo,$Elo,$t0
	str $Elo,[$ctx,#$Eoff+$lo]
	adc $Ehi,$Ehi,$t1
	str $Ehi,[$ctx,#$Eoff+$hi]
	adds $t2,$Tlo,$t2
	str $t2, [$ctx,#$Foff+$lo]
	adc $t3,$Thi,$t3
	str $t3, [$ctx,#$Foff+$hi]

	ldr $Alo,[sp,#$Goff+0]
	ldr $Ahi,[sp,#$Goff+4]
	ldr $Tlo,[sp,#$Hoff+0]
	ldr $Thi,[sp,#$Hoff+4]
	ldr $t0, [$ctx,#$Goff+$lo]
	ldr $t1, [$ctx,#$Goff+$hi]
	ldr $t2, [$ctx,#$Hoff+$lo]
	ldr $t3, [$ctx,#$Hoff+$hi]
	adds $t0,$Alo,$t0
	str $t0, [$ctx,#$Goff+$lo]
	adc $t1,$Ahi,$t1
	str $t1, [$ctx,#$Goff+$hi]
	adds $t2,$Tlo,$t2
	str $t2, [$ctx,#$Hoff+$lo]
	adc $t3,$Thi,$t3
	str $t3, [$ctx,#$Hoff+$hi]

	add sp,sp,#640
	sub $Ktbl,$Ktbl,#640

	teq $inp,$len
	bne .Loop

	add sp,sp,#8*9 @ destroy frame
#if __ARM_ARCH__>=5
	ldmia sp!,{r4-r12,pc}
#else
	ldmia sp!,{r4-r12,lr}
	tst lr,#1
	moveq pc,lr @ be binary compatible with V4, yet
	bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha512_block_data_order,.-sha512_block_data_order
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
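# NEON register allocation: the 16 message schedule words live in d0-d15
# (viewed as q0-q7 when they are updated two at a time in NEON_16_79), the
# eight state variables a..h in d16-d23, and d24-d31 serve as temporaries.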

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64 $t0,$e,#@Sigma1[0] @ $i
#if $i<16
	vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
#endif
	vshr.u64 $t1,$e,#@Sigma1[1]
#if $i>0
	vadd.i64 $a,$Maj @ h+=Maj from the past
#endif
	vshr.u64 $t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
	vsli.64 $t0,$e,#`64-@Sigma1[0]`
	vsli.64 $t1,$e,#`64-@Sigma1[1]`
	vmov $Ch,$e
	vsli.64 $t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8 @X[$i],@X[$i]
#endif
	veor $t1,$t0
	vbsl $Ch,$f,$g @ Ch(e,f,g)
	vshr.u64 $t0,$a,#@Sigma0[0]
	veor $t2,$t1 @ Sigma1(e)
	vadd.i64 $T1,$Ch,$h
	vshr.u64 $t1,$a,#@Sigma0[1]
	vsli.64 $t0,$a,#`64-@Sigma0[0]`
	vadd.i64 $T1,$t2
	vshr.u64 $t2,$a,#@Sigma0[2]
	vadd.i64 $K,@X[$i%16]
	vsli.64 $t1,$a,#`64-@Sigma0[1]`
	veor $Maj,$a,$b
	vsli.64 $t2,$a,#`64-@Sigma0[2]`
	veor $h,$t0,$t1
	vadd.i64 $T1,$K
	vbsl $Maj,$c,$b @ Maj(a,b,c)
	veor $h,$t2 @ Sigma0(a)
	vadd.i64 $d,$T1
	vadd.i64 $Maj,$T1
	@ vadd.i64 $h,$Maj
___
}

sub NEON_16_79() {
my $i=shift;

if ($i&1) { &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
	vadd.i64 @_[0],d30 @ h+=Maj from the past
	vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
	vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor $s1,$t0
	vshr.u64 $t0,$s0,#@sigma0[0]
	veor $s1,$t1 @ sigma1(X[i+14])
	vshr.u64 $t1,$s0,#@sigma0[1]
	vadd.i64 @X[$i%8],$s1
	vshr.u64 $s1,$s0,#@sigma0[2]
	vsli.64 $t0,$s0,#`64-@sigma0[0]`
	vsli.64 $t1,$s0,#`64-@sigma0[1]`
	vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
	veor $s1,$t0
	vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
	vadd.i64 @X[$i%8],$s0
	vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
	veor $s1,$t1 @ sigma0(X[i+1])
	vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
	vadd.i64 @X[$i%8],$s1
___
&NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon

.global sha512_block_data_order_neon
.type sha512_block_data_order_neon,%function
.align 4
sha512_block_data_order_neon:
.LNEON:
	dmb @ errata #451034 on early Cortex A8
	add $len,$inp,$len,lsl#7 @ len to point at the end of inp
	VFP_ABI_PUSH
	adrl $Ktbl,K512
	vldmia $ctx,{$A-$H} @ load context
.Loop_neon:
___
for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
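# unshift(@V,pop(@V)) rotates the a..h register assignment after each round,
# so the round body is emitted with permuted register roles instead of moving
# data between registers.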
$code.=<<___;
	mov $cnt,#4
.L16_79_neon:
	subs $cnt,#1
___
for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne .L16_79_neon

	vadd.i64 $A,d30 @ h+=Maj from the past
	vldmia $ctx,{d24-d31} @ load context to temp
	vadd.i64 q8,q12 @ vectorized accumulate
	vadd.i64 q9,q13
	vadd.i64 q10,q14
	vadd.i64 q11,q15
	vstmia $ctx,{$A-$H} @ save context
	teq $inp,$len
	sub $Ktbl,#640 @ rewind K512
	bne .Loop_neon

	VFP_ABI_POP
	ret @ bx lr
.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
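# Note the order of the two substitutions above: a literal "bx lr" written in
# the source is first replaced with its instruction encoding so the file still
# assembles with -march=armv4, and only then is "ret" (used on the NEON-only
# path, which requires ARMv7 anyway) rewritten into a real "bx lr".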

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

print $code;
close STDOUT; # enforce flush