2 # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
16 # Permission to use under GPL terms is granted.
17 # ====================================================================
19 # SHA256 block procedure for ARMv4. May 2007.
21 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
22 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
23 # byte [on single-issue Xscale PXA250 core].
27 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
28 # Cortex A8 core and ~20 cycles per processed byte.
32 # Profiler-assisted and platform-specific optimization resulted in 16%
33 # improvement on Cortex A8 core and ~15.4 cycles per processed byte.
37 # Add NEON implementation. On Cortex A8 it was measured to process one
38 # byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
39 # S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
40 # code (meaning that latter performs sub-optimally, nothing was done
45 # Add ARMv8 code path performing at 2.0 cpb on Apple A7.
# Command-line handling: an argument that looks like a file name
# (\w[\w\-]*\.\w+) is taken as the output file; anything before it is the
# flavour. NOTE(review): $flavour appears to be consumed from @ARGV on an
# elided line -- confirm.
48 if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
49 else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
# When a real flavour is requested, locate the arm-xlate.pl preprocessor next
# to this script or under ../../perlasm and pipe all generated code through
# it; otherwise STDOUT is opened directly on the output file.
51 if ($flavour && $flavour ne "void") {
52 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
53 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
54 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
55 die "can't locate arm-xlate.pl";
57 open STDOUT,"| \"$^X\" $xlate $flavour $output";
59 open STDOUT,">$output";
# @V lists the eight SHA-256 working variables; the $A..$H register names are
# assigned on elided lines. @V is rotated after each generated round so the
# same round body serves all eight role permutations.
74 @V=($A,$B,$C,$D,$E,$F,$G,$H);
# BODY_00_15 -- emits one round of the integer-only SHA-256 code into $code.
# NOTE(review): the enclosing "sub BODY_00_15 {" header is on an elided line.
84 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
# Rounds 0..15 fetch the message word straight from the input stream (the
# ldr/ldrb variants in the heredoc below); $i selects which variant is emitted.
86 $code.=<<___ if ($i<16);
88 @ ldr $t1,[$inp],#4 @ $i
90 str $inp,[sp,#17*4] @ make room for $t4
92 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
93 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
94 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
99 @ ldrb $t1,[$inp,#3] @ $i
100 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
103 orr $t1,$t1,$t2,lsl#8
105 orr $t1,$t1,$t0,lsl#16
107 str $inp,[sp,#17*4] @ make room for $t4
109 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
110 orr $t1,$t1,$t2,lsl#24
111 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
115 ldr $t2,[$Ktbl],#4 @ *K256++
116 add $h,$h,$t1 @ h+=X[i]
117 str $t1,[sp,#`$i%16`*4]
119 add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
121 add $h,$h,$t2 @ h+=K256[i]
122 eor $t1,$t1,$g @ Ch(e,f,g)
123 eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
124 add $h,$h,$t1 @ h+=Ch(e,f,g)
127 cmp $t2,#0xf2 @ done?
131 ldr $t1,[$inp],#4 @ prefetch
135 eor $t2,$a,$b @ a^b, b^c in next round
137 ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
138 eor $t2,$a,$b @ a^b, b^c in next round
139 ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
141 eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
142 and $t3,$t3,$t2 @ (b^c)&=(a^b)
144 eor $t3,$t3,$b @ Maj(a,b,c)
145 add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
146 @ add $h,$h,$t3 @ h+=Maj(a,b,c)
# BODY_16_XX -- message-schedule rounds (i >= 16): reads X[i+1] and X[i+14]
# back from the 16-word stack ring (see the sp,#(($i+1)%16)*4 loads below)
# and applies sigma0/sigma1 before falling into the common round logic.
# NOTE(review): the enclosing "sub BODY_16_XX {" header is on an elided line.
152 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
155 @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
156 @ ldr $t4,[sp,#`($i+14)%16`*4]
157 mov $t0,$t1,ror#$sigma0[0]
158 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
159 mov $t2,$t4,ror#$sigma1[0]
160 eor $t0,$t0,$t1,ror#$sigma0[1]
161 eor $t2,$t2,$t4,ror#$sigma1[1]
162 eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
163 ldr $t1,[sp,#`($i+0)%16`*4]
164 eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
165 ldr $t4,[sp,#`($i+9)%16`*4]
168 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
170 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
171 add $t1,$t1,$t4 @ X[i]
178 # include "arm_arch.h"
180 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
181 # define __ARM_MAX_ARCH__ 7
185 #if defined(__thumb2__)
195 .word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
196 .word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
197 .word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
198 .word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
199 .word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
200 .word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
201 .word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
202 .word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
203 .word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
204 .word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
205 .word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
206 .word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
207 .word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
208 .word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
209 .word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
210 .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
213 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
215 .word OPENSSL_armcap_P-.Lsha256_block_data_order
219 .global sha256_block_data_order
220 .type sha256_block_data_order,%function
221 sha256_block_data_order:
222 .Lsha256_block_data_order:
223 #if __ARM_ARCH__<7 && !defined(__thumb2__)
224 sub r3,pc,#8 @ sha256_block_data_order
226 adr r3,.Lsha256_block_data_order
228 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
229 ldr r12,.LOPENSSL_armcap
230 ldr r12,[r3,r12] @ OPENSSL_armcap_P
234 tst r12,#ARMV8_SHA256
239 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
240 stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
241 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
242 sub $Ktbl,r3,#256+32 @ K256
243 sub sp,sp,#16*4 @ alloca(X[16])
250 eor $t3,$B,$C @ magic
# Emit the first 16 rounds fully unrolled, rotating @V so each generated
# round sees the correctly shifted a..h roles.
253 for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
254 $code.=".Lrounds_16_xx:\n";
# Rounds 16..31 are emitted once and re-executed at run time: the generated
# code branches back to .Lrounds_16_xx until the K256 terminator byte 0xf2 is
# seen (the "cmp $t2,#0xf2 @ done?" test emitted above).
255 for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
258 ite eq @ Thumb2 thing, sanity check in ARM
260 ldreq $t3,[sp,#16*4] @ pull ctx
263 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
278 ldr $inp,[sp,#17*4] @ pull inp
279 ldr $t2,[sp,#18*4] @ pull inp+len
282 stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
284 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
287 add sp,sp,#`16+3`*4 @ destroy frame
289 ldmia sp!,{r4-r11,pc}
291 ldmia sp!,{r4-r11,lr}
293 moveq pc,lr @ be binary compatible with V4, yet
294 bx lr @ interoperable with Thumb ISA:-)
296 .size sha256_block_data_order,.-sha256_block_data_order
298 ######################################################################
# NEON register allocation: @X[0..3] hold the 16 message-schedule words (four
# 32-bit words per q-register); $T0..$T3 are q-sized scratch, $T4/$T5 d-sized
# scratch halves.
302 my @X=map("q$_",(0..3));
303 my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
sub Dlo()
{
	# Low d-register half of a NEON quad register: "qN" -> "d(2N)".
	# Yields "" when the argument contains no q0..q19 register name.
	my ($qreg) = @_;
	return "" unless $qreg =~ m|q(1?[0-9])|;
	return "d".(2*$1);
}
sub Dhi()
{
	# High d-register half of a NEON quad register: "qN" -> "d(2N+1)".
	# Yields "" when the argument contains no q0..q19 register name.
	my ($qreg) = @_;
	return "" unless $qreg =~ m|q(1?[0-9])|;
	return "d".(2*$1+1);
}
# Catch-all for otherwise-undefined &opcode_suffix(...) calls: converts the
# sub name into an assembler mnemonic ("vadd_i32" -> "vadd.i32") and appends
# the instruction line to $code.
310 sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
311 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
# NOTE(review): $arg is assigned on an elided line (presumably popped from
# @_); a purely numeric $arg is rewritten as an immediate operand "#n".
313 $arg = "#$arg" if ($arg*1 eq $arg);
314 $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
# Xupdate fragment -- NOTE(review): the "sub Xupdate" header is on an elided
# line. Collect four rounds' worth of scalar-instruction snippets from &$body
# up front; they are eval'd between the NEON ops that follow (interleaving
# lines elided from this view).
320 my @insns = (&$body,&$body,&$body,&$body);
321 my ($a,$b,$c,$d,$e,$f,$g,$h);
# NEON message-schedule step, advancing X[] by four words:
#   X[0..3] += X[9..12]; X[0..3] += sigma0(X[1..4]); then sigma1 of the two
#   newest words is folded in half a vector at a time (Dlo/Dhi halves), and
#   the next four K256 constants are added, staging K+W in the Xfer ring for
#   the scalar rounds. Scalar snippets from @insns are eval'd between these
#   calls on elided lines.
323 &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
327 &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
331 &vshr_u32 ($T2,$T0,$sigma0[0]);
334 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
337 &vshr_u32 ($T1,$T0,$sigma0[2]);
340 &vsli_32 ($T2,$T0,32-$sigma0[0]);
343 &vshr_u32 ($T3,$T0,$sigma0[1]);
349 &vsli_32 ($T3,$T0,32-$sigma0[1]);
352 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
355 &veor ($T1,$T1,$T3); # sigma0(X[1..4])
358 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
361 &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
364 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
370 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
373 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
376 &veor ($T5,$T5,$T4); # sigma1(X[14..15])
379 &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
382 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
385 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
388 &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
394 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
397 &vld1_32 ("{$T0}","[$Ktbl,:128]!");
400 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
403 &veor ($T5,$T5,$T4); # sigma1(X[16..17])
406 &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
409 &vadd_i32 ($T0,$T0,@X[0]);
# drain the interleaved scalar snippets down to the last two
410 while($#insns>=2) { eval(shift(@insns)); }
411 &vst1_32 ("{$T0}","[$Xfer,:128]!");
415 push(@X,shift(@X)); # "rotate" X[]
# Xpreload fragment -- NOTE(review): the "sub Xpreload" header is on an
# elided line. Used when the schedule no longer needs updating (presumably
# the last 16 rounds of a block): byte-swap the freshly loaded input words,
# add the round constants and stage K+W in the Xfer ring, with scalar rounds
# from @insns interleaved on elided eval lines.
421 my @insns = (&$body,&$body,&$body,&$body);
422 my ($a,$b,$c,$d,$e,$f,$g,$h);
428 &vld1_32 ("{$T0}","[$Ktbl,:128]!");
433 &vrev32_8 (@X[0],@X[0]);
438 &vadd_i32 ($T0,$T0,@X[0]);
439 foreach (@insns) { eval; } # remaining instructions
440 &vst1_32 ("{$T0}","[$Xfer,:128]!");
442 push(@X,shift(@X)); # "rotate" X[]
# body_00_15 fragment -- one scalar SHA-256 round expressed as a list of
# stringified instruction generators; Xupdate/Xpreload eval these one at a
# time between NEON ops. $j counts rounds; @V and the $t2/$t3 Maj
# accumulators swap at the end of each round. NOTE(review): the
# "sub body_00_15" opener is on an elided line; the strings themselves are
# runtime data and are reproduced untouched.
447 '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
448 '&add ($h,$h,$t1)', # h+=X[i]+K[i]
450 '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
451 '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
453 '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
454 '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
455 '&eor ($t1,$t1,$g)', # Ch(e,f,g)
456 '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
457 '&eor ($t2,$a,$b)', # a^b, b^c in next round
458 '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
459 '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
460 '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
461 '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
462 '&ldr ($t1,"[sp,#64]") if ($j==31)',
463 '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
464 '&add ($d,$d,$h)', # d+=h
465 '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
466 '&eor ($t3,$t3,$b)', # Maj(a,b,c)
467 '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
472 #if __ARM_MAX_ARCH__>=7
476 .global sha256_block_data_order_neon
477 .type sha256_block_data_order_neon,%function
480 sha256_block_data_order_neon:
482 stmdb sp!,{r4-r12,lr}
486 bic $H,$H,#15 @ align for 128-bit stores
489 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
491 vld1.8 {@X[0]},[$inp]!
492 vld1.8 {@X[1]},[$inp]!
493 vld1.8 {@X[2]},[$inp]!
494 vld1.8 {@X[3]},[$inp]!
495 vld1.32 {$T0},[$Ktbl,:128]!
496 vld1.32 {$T1},[$Ktbl,:128]!
497 vld1.32 {$T2},[$Ktbl,:128]!
498 vld1.32 {$T3},[$Ktbl,:128]!
499 vrev32.8 @X[0],@X[0] @ yes, even on
501 vrev32.8 @X[1],@X[1] @ big-endian
507 str $t2,[sp,#76] @ save original sp
508 vadd.i32 $T0,$T0,@X[0]
509 vadd.i32 $T1,$T1,@X[1]
510 vst1.32 {$T0},[$Xfer,:128]!
511 vadd.i32 $T2,$T2,@X[2]
512 vst1.32 {$T1},[$Xfer,:128]!
513 vadd.i32 $T3,$T3,@X[3]
514 vst1.32 {$T2},[$Xfer,:128]!
515 vst1.32 {$T3},[$Xfer,:128]!
# Sixteen schedule-updating rounds per loop pass: each Xupdate emits the NEON
# update for four message words plus four interleaved scalar rounds.
527 &Xupdate(\&body_00_15);
528 &Xupdate(\&body_00_15);
529 &Xupdate(\&body_00_15);
530 &Xupdate(\&body_00_15);
532 teq $t1,#0 @ check for K256 terminator
539 sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
542 subeq $inp,$inp,#64 @ avoid SEGV
543 vld1.8 {@X[0]},[$inp]! @ load next input block
544 vld1.8 {@X[1]},[$inp]!
545 vld1.8 {@X[2]},[$inp]!
546 vld1.8 {@X[3]},[$inp]!
# Final 16 rounds of the block: no more schedule updates, just the round
# constants added to the byte-swapped next input block (see Xpreload).
551 &Xpreload(\&body_00_15);
552 &Xpreload(\&body_00_15);
553 &Xpreload(\&body_00_15);
554 &Xpreload(\&body_00_15);
557 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
561 add $A,$A,$t0 @ accumulate
583 ldreq sp,[sp,#76] @ restore original sp
588 ldmia sp!,{r4-r12,pc}
589 .size sha256_block_data_order_neon,.-sha256_block_data_order_neon
593 ######################################################################
# ARMv8 Crypto Extensions register map: hash state halves ABCD/EFGH in q0/q1
# with q2 ($abcd) as round scratch, message block in q8-q11, K+W words and
# the per-block state backup in q12-q15.
597 my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
598 my @MSG=map("q$_",(8..11));
599 my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
603 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
605 # if defined(__thumb2__)
606 # define INST(a,b,c,d) .byte c,d|0xc,a,b
608 # define INST(a,b,c,d) .byte a,b,c,d
611 .type sha256_block_data_order_armv8,%function
613 sha256_block_data_order_armv8:
615 vld1.32 {$ABCD,$EFGH},[$ctx]
616 sub $Ktbl,$Ktbl,#256+32
617 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
622 vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
623 vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
624 vld1.32 {$W0},[$Ktbl]!
625 vrev32.8 @MSG[0],@MSG[0]
626 vrev32.8 @MSG[1],@MSG[1]
627 vrev32.8 @MSG[2],@MSG[2]
628 vrev32.8 @MSG[3],@MSG[3]
629 vmov $ABCD_SAVE,$ABCD @ offload
630 vmov $EFGH_SAVE,$EFGH
# Twelve quad-rounds that still update the message schedule: load the next
# four constants, add them to the oldest message quad, run
# sha256su0/sha256h/sha256h2/sha256su1, then ping-pong $W0/$W1 and rotate
# @MSG. NOTE(review): the per-iteration heredoc open/close and the scratch
# copy into $abcd appear on elided lines.
633 for($i=0;$i<12;$i++) {
635 vld1.32 {$W1},[$Ktbl]!
636 vadd.i32 $W0,$W0,@MSG[0]
637 sha256su0 @MSG[0],@MSG[1]
639 sha256h $ABCD,$EFGH,$W0
640 sha256h2 $EFGH,$abcd,$W0
641 sha256su1 @MSG[0],@MSG[2],@MSG[3]
643 ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
646 vld1.32 {$W1},[$Ktbl]!
647 vadd.i32 $W0,$W0,@MSG[0]
649 sha256h $ABCD,$EFGH,$W0
650 sha256h2 $EFGH,$abcd,$W0
652 vld1.32 {$W0},[$Ktbl]!
653 vadd.i32 $W1,$W1,@MSG[1]
655 sha256h $ABCD,$EFGH,$W1
656 sha256h2 $EFGH,$abcd,$W1
658 vld1.32 {$W1},[$Ktbl]
659 vadd.i32 $W0,$W0,@MSG[2]
660 sub $Ktbl,$Ktbl,#256-16 @ rewind
662 sha256h $ABCD,$EFGH,$W0
663 sha256h2 $EFGH,$abcd,$W0
665 vadd.i32 $W1,$W1,@MSG[3]
667 sha256h $ABCD,$EFGH,$W1
668 sha256h2 $EFGH,$abcd,$W1
670 vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
671 vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
675 vst1.32 {$ABCD,$EFGH},[$ctx]
678 .size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
683 .asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
685 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
686 .comm OPENSSL_armcap_P,4,4
# NOTE(review): appears to be part of a loop that copies this script's own
# leading comment block into the output (the surrounding read loop is on
# elided lines): '#' comment lines are rewritten in place as assembler '@'
# comments; the first line that is neither a comment nor blank ends the copy.
693 last if (!s/^#/@/ and !/^$/);
# Base 32-bit encodings for the ARMv8 SHA-256 instructions; unsha256() ORs
# the q-register operand fields into these. NOTE(review): the
# "my %opcode = (" opener is on an elided line.
699 "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
700 "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
# unsha256 fragment -- NOTE(review): the "sub unsha256 {" header and the
# trailing sprintf arguments/closing braces are on elided lines. Encodes a
# sha256* mnemonic with two or three q-register operands into its 32-bit
# instruction word and emits it byte-by-byte through the INST macro, so the
# file still assembles on toolchains without Crypto Extension mnemonics.
703 my ($mnemonic,$arg)=@_;
705 if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
# Each register number is split into its low 3 bits and high bit and placed
# at the encoding's register-field bit positions.
706 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
707 |(($2&7)<<17)|(($2&8)<<4)
708 |(($3&7)<<1) |(($3&8)<<2);
709 # since ARMv7 instructions are always encoded little-endian.
710 # correct solution is to use .inst directive, but older
711 # assemblers don't implement it:-(
712 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
713 $word&0xff,($word>>8)&0xff,
714 ($word>>16)&0xff,($word>>24)&0xff,
# Final pass over the generated code, one line at a time, before printing
# (the print call and loop close are on elided lines).
720 foreach (split($/,$code)) {
# resolve backtick-quoted compile-time arithmetic (rotate amounts, offsets)
722 s/\`([^\`]*)\`/eval $1/geo;
# rewrite sha256* mnemonics as raw INST() byte sequences (see unsha256)
724 s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
# "ret" becomes "bx lr"; only a pre-existing "bx lr" is demoted to its raw
# opcode word, keeping the output assemblable with -march=armv4
726 s/\bret\b/bx lr/go or
727 s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
# Flush and close the output; buffered-write or downstream-pipe (xlate)
# failures only surface at close on a write handle, so they must not be
# silently ignored.
close STDOUT or die "error closing STDOUT: $!";