3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # SHA512 block procedure for ARMv4. September 2007.
12 # This code is ~4.5 (four and a half) times faster than code generated
13 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
14 # Xscale PXA250 core].
18 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
19 # Cortex A8 core and ~40 cycles per processed byte.
23 # Profiler-assisted and platform-specific optimization resulted in 7%
24 # improvement on Cortex A8 core and ~38 cycles per byte.
28 # Add NEON implementation. On Cortex A8 it was measured to process
29 # one byte in 23.3 cycles or ~60% faster than integer-only code.
33 # Improve NEON performance by 12% on Snapdragon S4. In absolute
34 # terms it's 22.6 cycles per byte, which is a disappointing result.
35 # Technical writers asserted that 3-way S4 pipeline can sustain
36 # multiple NEON instructions per cycle, but dual NEON issue could
37 # not be observed, and for NEON-only sequences IPC(*) was found to
38 # be limited by 1:-( 0.33 and 0.66 were measured for sequences with
39 # ILPs(*) of 1 and 2 respectively. This in turn means that you can
40 # even find yourself striving, as I did here, for achieving IPC
41 # adequate to one delivered by Cortex A8 [for reference, it's
42 # 0.5 for ILP of 1, and 1 for higher ILPs].
44 # (*) ILP, instruction-level parallelism, how many instructions
45 # *can* execute at the same time. IPC, instructions per cycle,
46 # indicates how many instructions actually execute.
48 # Byte order [in]dependence. =========================================
50 # Originally caller was expected to maintain specific *dword* order in
51 # h[0-7], namely with most significant dword at *lower* address, which
52 # was reflected in below two parameters as 0 and 4. Now caller is
53 # expected to maintain native byte order for whole 64-bit values.
56 # ====================================================================
58 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} # skip flag-like args; first "file.ext"-shaped arg is the output
59 open STDOUT, ">", $output or die "can't open $output: $!"; # 3-arg open avoids mode injection; die on failure instead of writing nowhere
61 $ctx="r0"; # parameter block
75 ############ r13 is stack pointer
77 ############ r15 is program counter
92 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
93 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
94 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
96 str $Tlo,[sp,#$Xoff+0]
98 str $Thi,[sp,#$Xoff+4]
99 eor $t0,$t0,$Ehi,lsl#18
100 ldr $t2,[sp,#$Hoff+0] @ h.lo
101 eor $t1,$t1,$Elo,lsl#18
102 ldr $t3,[sp,#$Hoff+4] @ h.hi
103 eor $t0,$t0,$Elo,lsr#18
104 eor $t1,$t1,$Ehi,lsr#18
105 eor $t0,$t0,$Ehi,lsl#14
106 eor $t1,$t1,$Elo,lsl#14
107 eor $t0,$t0,$Ehi,lsr#9
108 eor $t1,$t1,$Elo,lsr#9
109 eor $t0,$t0,$Elo,lsl#23
110 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
112 ldr $t0,[sp,#$Foff+0] @ f.lo
113 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
114 ldr $t1,[sp,#$Foff+4] @ f.hi
116 ldr $t2,[sp,#$Goff+0] @ g.lo
117 adc $Thi,$Thi,$t3 @ T += h
118 ldr $t3,[sp,#$Goff+4] @ g.hi
121 str $Elo,[sp,#$Eoff+0]
123 str $Ehi,[sp,#$Eoff+4]
125 str $Alo,[sp,#$Aoff+0]
127 str $Ahi,[sp,#$Aoff+4]
129 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
130 eor $t1,$t1,$t3 @ Ch(e,f,g)
131 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
134 ldr $Elo,[sp,#$Doff+0] @ d.lo
135 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
136 ldr $Ehi,[sp,#$Doff+4] @ d.hi
139 adc $Thi,$Thi,$t3 @ T += K[i]
141 ldr $t2,[sp,#$Boff+0] @ b.lo
142 adc $Ehi,$Ehi,$Thi @ d += T
145 ldr $t3,[sp,#$Coff+0] @ c.lo
147 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
148 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
149 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
152 eor $t0,$t0,$Ahi,lsl#4
153 eor $t1,$t1,$Alo,lsl#4
154 eor $t0,$t0,$Ahi,lsr#2
155 eor $t1,$t1,$Alo,lsr#2
156 eor $t0,$t0,$Alo,lsl#30
157 eor $t1,$t1,$Ahi,lsl#30
158 eor $t0,$t0,$Ahi,lsr#7
159 eor $t1,$t1,$Alo,lsr#7
160 eor $t0,$t0,$Alo,lsl#25
161 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
164 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
166 ldr $t1,[sp,#$Boff+4] @ b.hi
168 ldr $t2,[sp,#$Coff+4] @ c.hi
172 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
175 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
177 adc $Ahi,$Ahi,$Thi @ h += T
183 #include "arm_arch.h"
187 # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
191 # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
199 WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
200 WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
201 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
202 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
203 WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
204 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
205 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
206 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
207 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
208 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
209 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
210 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
211 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
212 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
213 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
214 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
215 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
216 WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
217 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
218 WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
219 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
220 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
221 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
222 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
223 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
224 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
225 WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
226 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
227 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
228 WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
229 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
230 WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
231 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
232 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
233 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
234 WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
235 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
236 WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
237 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
238 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
240 #if __ARM_MAX_ARCH__>=7
242 .word OPENSSL_armcap_P-sha512_block_data_order
248 .global sha512_block_data_order
249 .type sha512_block_data_order,%function
250 sha512_block_data_order:
251 sub r3,pc,#8 @ sha512_block_data_order
252 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
253 #if __ARM_MAX_ARCH__>=7
254 ldr r12,.LOPENSSL_armcap
255 ldr r12,[r3,r12] @ OPENSSL_armcap_P
259 stmdb sp!,{r4-r12,lr}
260 sub $Ktbl,r3,#672 @ K512
263 ldr $Elo,[$ctx,#$Eoff+$lo]
264 ldr $Ehi,[$ctx,#$Eoff+$hi]
265 ldr $t0, [$ctx,#$Goff+$lo]
266 ldr $t1, [$ctx,#$Goff+$hi]
267 ldr $t2, [$ctx,#$Hoff+$lo]
268 ldr $t3, [$ctx,#$Hoff+$hi]
270 str $t0, [sp,#$Goff+0]
271 str $t1, [sp,#$Goff+4]
272 str $t2, [sp,#$Hoff+0]
273 str $t3, [sp,#$Hoff+4]
274 ldr $Alo,[$ctx,#$Aoff+$lo]
275 ldr $Ahi,[$ctx,#$Aoff+$hi]
276 ldr $Tlo,[$ctx,#$Boff+$lo]
277 ldr $Thi,[$ctx,#$Boff+$hi]
278 ldr $t0, [$ctx,#$Coff+$lo]
279 ldr $t1, [$ctx,#$Coff+$hi]
280 ldr $t2, [$ctx,#$Doff+$lo]
281 ldr $t3, [$ctx,#$Doff+$hi]
282 str $Tlo,[sp,#$Boff+0]
283 str $Thi,[sp,#$Boff+4]
284 str $t0, [sp,#$Coff+0]
285 str $t1, [sp,#$Coff+4]
286 str $t2, [sp,#$Doff+0]
287 str $t3, [sp,#$Doff+4]
288 ldr $Tlo,[$ctx,#$Foff+$lo]
289 ldr $Thi,[$ctx,#$Foff+$hi]
290 str $Tlo,[sp,#$Foff+0]
291 str $Thi,[sp,#$Foff+4]
301 orr $Tlo,$Tlo,$t0,lsl#8
303 orr $Tlo,$Tlo,$t1,lsl#16
305 orr $Tlo,$Tlo,$t2,lsl#24
306 orr $Thi,$Thi,$t3,lsl#8
307 orr $Thi,$Thi,$t0,lsl#16
308 orr $Thi,$Thi,$t1,lsl#24
322 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
323 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
326 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
327 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
328 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
330 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
332 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
333 eor $Tlo,$Tlo,$t1,lsl#31
334 eor $Thi,$Thi,$t0,lsl#31
335 eor $Tlo,$Tlo,$t0,lsr#8
336 eor $Thi,$Thi,$t1,lsr#8
337 eor $Tlo,$Tlo,$t1,lsl#24
338 eor $Thi,$Thi,$t0,lsl#24
339 eor $Tlo,$Tlo,$t0,lsr#7
340 eor $Thi,$Thi,$t1,lsr#7
341 eor $Tlo,$Tlo,$t1,lsl#25
343 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
344 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
345 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
348 eor $t0,$t0,$t3,lsl#13
349 eor $t1,$t1,$t2,lsl#13
350 eor $t0,$t0,$t3,lsr#29
351 eor $t1,$t1,$t2,lsr#29
352 eor $t0,$t0,$t2,lsl#3
353 eor $t1,$t1,$t3,lsl#3
354 eor $t0,$t0,$t2,lsr#6
355 eor $t1,$t1,$t3,lsr#6
356 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
357 eor $t0,$t0,$t3,lsl#26
359 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
361 ldr $t0,[sp,#`$Xoff+8*16`+0]
364 ldr $t1,[sp,#`$Xoff+8*16`+4]
372 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
373 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
377 ldr $Tlo,[sp,#$Boff+0]
378 ldr $Thi,[sp,#$Boff+4]
379 ldr $t0, [$ctx,#$Aoff+$lo]
380 ldr $t1, [$ctx,#$Aoff+$hi]
381 ldr $t2, [$ctx,#$Boff+$lo]
382 ldr $t3, [$ctx,#$Boff+$hi]
384 str $t0, [$ctx,#$Aoff+$lo]
386 str $t1, [$ctx,#$Aoff+$hi]
388 str $t2, [$ctx,#$Boff+$lo]
390 str $t3, [$ctx,#$Boff+$hi]
392 ldr $Alo,[sp,#$Coff+0]
393 ldr $Ahi,[sp,#$Coff+4]
394 ldr $Tlo,[sp,#$Doff+0]
395 ldr $Thi,[sp,#$Doff+4]
396 ldr $t0, [$ctx,#$Coff+$lo]
397 ldr $t1, [$ctx,#$Coff+$hi]
398 ldr $t2, [$ctx,#$Doff+$lo]
399 ldr $t3, [$ctx,#$Doff+$hi]
401 str $t0, [$ctx,#$Coff+$lo]
403 str $t1, [$ctx,#$Coff+$hi]
405 str $t2, [$ctx,#$Doff+$lo]
407 str $t3, [$ctx,#$Doff+$hi]
409 ldr $Tlo,[sp,#$Foff+0]
410 ldr $Thi,[sp,#$Foff+4]
411 ldr $t0, [$ctx,#$Eoff+$lo]
412 ldr $t1, [$ctx,#$Eoff+$hi]
413 ldr $t2, [$ctx,#$Foff+$lo]
414 ldr $t3, [$ctx,#$Foff+$hi]
416 str $Elo,[$ctx,#$Eoff+$lo]
418 str $Ehi,[$ctx,#$Eoff+$hi]
420 str $t2, [$ctx,#$Foff+$lo]
422 str $t3, [$ctx,#$Foff+$hi]
424 ldr $Alo,[sp,#$Goff+0]
425 ldr $Ahi,[sp,#$Goff+4]
426 ldr $Tlo,[sp,#$Hoff+0]
427 ldr $Thi,[sp,#$Hoff+4]
428 ldr $t0, [$ctx,#$Goff+$lo]
429 ldr $t1, [$ctx,#$Goff+$hi]
430 ldr $t2, [$ctx,#$Hoff+$lo]
431 ldr $t3, [$ctx,#$Hoff+$hi]
433 str $t0, [$ctx,#$Goff+$lo]
435 str $t1, [$ctx,#$Goff+$hi]
437 str $t2, [$ctx,#$Hoff+$lo]
439 str $t3, [$ctx,#$Hoff+$hi]
447 add sp,sp,#8*9 @ destroy frame
449 ldmia sp!,{r4-r12,pc}
451 ldmia sp!,{r4-r12,lr}
453 moveq pc,lr @ be binary compatible with V4, yet
454 bx lr @ interoperable with Thumb ISA:-)
459 my @Sigma0=(28,34,39);	# SHA-512 Sigma0 rotate amounts (ROTR 28,34,39)
460 my @Sigma1=(14,18,41);	# SHA-512 Sigma1 rotate amounts (ROTR 14,18,41)
461 my @sigma0=(1, 8, 7);	# sigma0: two rotates (1,8) and a plain shift (7)
462 my @sigma1=(19,61,6);	# sigma1: two rotates (19,61) and a plain shift (6)
465 my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
467 my @X=map("d$_",(0..15));	# message schedule X[0..15] kept in NEON d0-d15
468 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));	# working vars a..h in d16-d23
472 my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
473 my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
475 $code.=<<___ if ($i<16 || $i&1);
476 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
478 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
480 vshr.u64 $t1,$e,#@Sigma1[1]
482 vadd.i64 $a,$Maj @ h+=Maj from the past
484 vshr.u64 $t2,$e,#@Sigma1[2]
487 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
488 vsli.64 $t0,$e,#`64-@Sigma1[0]`
489 vsli.64 $t1,$e,#`64-@Sigma1[1]`
491 vsli.64 $t2,$e,#`64-@Sigma1[2]`
492 #if $i<16 && defined(__ARMEL__)
493 vrev64.8 @X[$i],@X[$i]
496 vbsl $Ch,$f,$g @ Ch(e,f,g)
497 vshr.u64 $t0,$a,#@Sigma0[0]
498 veor $t2,$t1 @ Sigma1(e)
500 vshr.u64 $t1,$a,#@Sigma0[1]
501 vsli.64 $t0,$a,#`64-@Sigma0[0]`
503 vshr.u64 $t2,$a,#@Sigma0[2]
504 vadd.i64 $K,@X[$i%16]
505 vsli.64 $t1,$a,#`64-@Sigma0[1]`
507 vsli.64 $t2,$a,#`64-@Sigma0[2]`
510 vbsl $Maj,$c,$b @ Maj(a,b,c)
511 veor $h,$t2 @ Sigma0(a)
521 if ($i&1) { &NEON_00_15($i,@_); return; }
523 # 2x-vectorized, therefore runs every 2nd round
524 my @X=map("q$_",(0..7)); # view @X as 128-bit vector
525 my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
526 my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
527 my $e=@_[4]; # $e from NEON_00_15
530 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
531 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
532 vadd.i64 @_[0],d30 @ h+=Maj from the past
533 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
534 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
535 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
536 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
538 vshr.u64 $t0,$s0,#@sigma0[0]
539 veor $s1,$t1 @ sigma1(X[i+14])
540 vshr.u64 $t1,$s0,#@sigma0[1]
541 vadd.i64 @X[$i%8],$s1
542 vshr.u64 $s1,$s0,#@sigma0[2]
543 vsli.64 $t0,$s0,#`64-@sigma0[0]`
544 vsli.64 $t1,$s0,#`64-@sigma0[1]`
545 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
547 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
548 vadd.i64 @X[$i%8],$s0
549 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
550 veor $s1,$t1 @ sigma0(X[i+1])
551 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
552 vadd.i64 @X[$i%8],$s1
554 &NEON_00_15(2*$i,@_);
558 #if __ARM_MAX_ARCH__>=7
564 dmb @ errata #451034 on early Cortex A8
565 vstmdb sp!,{d8-d15} @ ABI specification says so
566 sub $Ktbl,r3,#672 @ K512
567 vldmia $ctx,{$A-$H} @ load context
570 for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }	# rounds 0..15: emit one round, then rotate the a..h register map
576 for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }	# schedule-updating rounds; NEON_16_79 is 2x-vectorized (emits 2*$i round) — presumably looped in the asm for rounds beyond 31
580 vadd.i64 $A,d30 @ h+=Maj from the past
581 vldmia $ctx,{d24-d31} @ load context to temp
582 vadd.i64 q8,q12 @ vectorized accumulate
586 vstmia $ctx,{$A-$H} @ save context
588 sub $Ktbl,#640 @ rewind K512
591 vldmia sp!,{d8-d15} @ epilogue
597 .size sha512_block_data_order,.-sha512_block_data_order
598 .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
600 #if __ARM_MAX_ARCH__>=7
601 .comm OPENSSL_armcap_P,4,4
605 $code =~ s/\`([^\`]*)\`/eval $1/gem; # expand `...` spans by evaluating them as Perl (constant folding)
606 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
607 $code =~ s/\bret\b/bx lr/gm; # "ret" mnemonic is not available on these targets
609 close STDOUT or die "error closing STDOUT: $!"; # enforce flush; checked so buffered write errors are not silently lost