# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally, nothing was done
# about it).

# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
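
# For reference, all three code paths below implement the same FIPS 180-4
# compression step. With Sigma1(e) = ROR(e,6)^ROR(e,11)^ROR(e,25),
# Sigma0(a) = ROR(a,2)^ROR(a,13)^ROR(a,22), Ch(e,f,g) = (e&f)^(~e&g) and
# Maj(a,b,c) = (a&b)^(a&c)^(b&c), each of the 64 rounds computes
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K256[i] + X[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	h=g; g=f; f=e; e=d+T1; d=c; c=b; b=a; a=T1+T2
#
# except that the implementations rotate the register assignment instead
# of moving values between registers.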

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

@V=($A,$B,$C,$D,$E,$F,$G,$H);
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
	@ ldr	$t1,[$inp],#4			@ $i
	str	$inp,[sp,#17*4]			@ make room for $t4
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	orr	$t1,$t1,$t0,lsl#16
	str	$inp,[sp,#17*4]			@ make room for $t4
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
	cmp	$t2,#0xf2			@ done?
	ldr	$t1,[$inp],#4			@ prefetch
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
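
	@ Note: Ch is computed branch-free as ((f^g)&e)^g and Maj as
	@ ((b^c)&(a^b))^b; the final "h+=Maj(a,b,c)" is deliberately left
	@ for the next round's "add a,a,t2 ... from the past" to retire,
	@ which hides the result latency of the eor above.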
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
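
	@ This is the FIPS 180-4 schedule X[i] = sigma0(X[i+1]) + X[i+9] +
	@ sigma1(X[i+14]) + X[i] over the 16-word circular buffer, with
	@ sigma0(x) = ROR(x,7)^ROR(x,18)^(x>>3) and
	@ sigma1(x) = ROR(x,17)^ROR(x,19)^(x>>10), hence the ror/lsr pairs.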

#include "arm_arch.h"

.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
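@ The table above is the standard FIPS 180-4 K256 constant table: the
@ first 32 bits of the fractional parts of the cube roots of the first
@ 64 prime numbers.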
.word	OPENSSL_armcap_P-sha256_block_data_order

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
	sub	r3,pc,#8		@ sha256_block_data_order
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
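	@ Stack frame from here on: sp+0..sp+63 hold the 16-word circular
	@ X[] buffer, sp+16*4 the saved ctx, sp+17*4 inp and sp+18*4 the
	@ inp+len end pointer, matching the "pull ..." loads in the
	@ epilogue below.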
	eor	$t3,$B,$C		@ magic

for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }

	ldreq	$t3,[sp,#16*4]		@ pull ctx
	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	add	sp,sp,#`16+3`*4		@ destroy frame
	ldmia	sp!,{r4-r11,pc}
	ldmia	sp!,{r4-r11,lr}
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size	sha256_block_data_order,.-sha256_block_data_order

######################################################################

my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
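
# NEON overlays each 128-bit register qN on two 64-bit halves, d(2N) and
# d(2N+1), so e.g. Dlo("q1") yields "d2" and Dhi("q1") yields "d3"; the
# schedule code below uses this to run sigma1 on 64-bit halves of X[].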

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
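    # In effect &vshr_u32($T2,$T0,7) appends "\tvshr.u32\tq10,q8,#7\n" to
    # $code: the underscore becomes a dot and a numeric last argument is
    # prefixed with '#', which is all the NEON perlasm below relies on.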

my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	&vshr_u32	($T2,$T0,$sigma0[0]);
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	&vshr_u32	($T1,$T0,$sigma0[2]);
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	&vshr_u32	($T3,$T0,$sigma0[1]);
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);	# X[0..1] += sigma1(X[14..15])
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);	# X[2..3] += sigma1(X[16..17])
	&vadd_i32	($T0,$T0,@X[0]);
	while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
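
# NEON has no vector rotate, so each vshr.u32/vsli.32 pair above builds
# ROR(x,n): the shift right leaves x>>n in the destination, and vsli.32
# then shifts x left by 32-n bits and inserts it over the cleared high
# bits, giving (x>>n)|(x<<(32-n)).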

my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	&vrev32_8	(@X[0],@X[0]);
	&vadd_i32	($T0,$T0,@X[0]);
	foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]

	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")			if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")		if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V));	($t2,$t3)=($t3,$t2);'
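
	# Note the software pipelining: each round leaves Maj(a,b,c) in $t2
	# and defers "h+=Maj" to the next round's '&add ($a,$a,$t2)' (by then
	# the register names have rotated, so that round's a is this round's
	# h), shortening the per-round dependency chain.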

.type	sha256_block_data_order_neon,%function
sha256_block_data_order_neon:
	stmdb	sp!,{r4-r12,lr}
	sub	sp,sp,#16*4+16		@ alloca
	sub	$Ktbl,r3,#256+32	@ K256
	bic	sp,sp,#15		@ align for 128-bit stores
	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	vrev32.8	@X[1],@X[1]		@ big-endian
	str	$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!
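
	@ Division of labour: the vector unit runs the message schedule and
	@ the K256 additions four words at a time, parking X[i]+K256[i] in
	@ the 16-word stack ring addressed by Xfer, which the scalar rounds
	@ then consume, so the two units overlap almost completely.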

	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);

	teq	$t1,#0				@ check for K256 terminator
	sub	$Ktbl,$Ktbl,#256		@ rewind $Ktbl
	subeq	$inp,$inp,#64			@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!

	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);

	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	add	$A,$A,$t0			@ accumulate
	ldreq	sp,[sp,#76]			@ restore original sp
	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon

######################################################################

my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
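
# The ARMv8 path leans on the Cryptography Extension: sha256h/sha256h2
# together advance the state by four rounds, sha256h producing the new
# ABCD half and sha256h2 the new EFGH half, while sha256su0/sha256su1
# advance four words of the message schedule; $abcd holds a copy of the
# pre-round $ABCD that sha256h2 needs as its second operand.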

.type	sha256_block_data_order_armv8,%function
sha256_block_data_order_armv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,r3,#sha256_block_data_order-K256
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH

for($i=0;$i<12;$i++) {
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]

($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));

	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub	$Ktbl,$Ktbl,#256-16	@ rewind
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE

	vst1.32	{$ABCD,$EFGH},[$ctx]
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8

.asciz	"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.comm	OPENSSL_armcap_P,4,4

	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

my ($mnemonic,$arg)=@_;

    $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o
    sprintf ".long\t0x%08x\t@ %s %s",
	    $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
			      |(($2&7)<<17)|(($2&8)<<4)
			      |(($3&7)<<1) |(($3&8)<<2),

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
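	# (0xe12fff1e is the fixed ARM encoding of "bx lr"; emitting the raw
	# word sidesteps assemblers that reject the mnemonic before ARMv4T.)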

close STDOUT;	# enforce flush