3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # SHA256 block procedure for ARMv4. May 2007.
12 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
13 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
14 # byte [on single-issue Xscale PXA250 core].
18 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
19 # Cortex A8 core and ~20 cycles per processed byte.
23 # Profiler-assisted and platform-specific optimization resulted in 16%
24 # improvement on Cortex A8 core and ~15.4 cycles per processed byte.
28 # Add NEON implementation. On Cortex A8 it was measured to process one
29 # byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
30 # S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
31 # code (meaning that latter performs sub-optimally, nothing was done
# Scan the command line for the output file: skip any leading arguments
# that do not look like a file name (word.ext), e.g. flavour/compiler
# flags passed through by the build system.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
# Redirect STDOUT to the output file.  Three-argument open avoids mode
# injection via the file name, and the explicit check makes a bad or
# missing path fail loudly instead of silently emitting nothing.
open STDOUT,">",$output or die "can't open $output: $!";
49 @V=($A,$B,$C,$D,$E,$F,$G,$H);
59 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
61 $code.=<<___ if ($i<16);
63 @ ldr $t1,[$inp],#4 @ $i
65 str $inp,[sp,#17*4] @ make room for $t4
67 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
68 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
69 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
72 @ ldrb $t1,[$inp,#3] @ $i
73 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
78 orr $t1,$t1,$t0,lsl#16
80 str $inp,[sp,#17*4] @ make room for $t4
82 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
83 orr $t1,$t1,$t2,lsl#24
84 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
88 ldr $t2,[$Ktbl],#4 @ *K256++
89 add $h,$h,$t1 @ h+=X[i]
90 str $t1,[sp,#`$i%16`*4]
92 add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
94 add $h,$h,$t2 @ h+=K256[i]
95 eor $t1,$t1,$g @ Ch(e,f,g)
96 eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
97 add $h,$h,$t1 @ h+=Ch(e,f,g)
100 cmp $t2,#0xf2 @ done?
104 ldr $t1,[$inp],#4 @ prefetch
108 eor $t2,$a,$b @ a^b, b^c in next round
110 ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
111 eor $t2,$a,$b @ a^b, b^c in next round
112 ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
114 eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
115 and $t3,$t3,$t2 @ (b^c)&=(a^b)
117 eor $t3,$t3,$b @ Maj(a,b,c)
118 add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
119 @ add $h,$h,$t3 @ h+=Maj(a,b,c)
125 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
128 @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
129 @ ldr $t4,[sp,#`($i+14)%16`*4]
130 mov $t0,$t1,ror#$sigma0[0]
131 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
132 mov $t2,$t4,ror#$sigma1[0]
133 eor $t0,$t0,$t1,ror#$sigma0[1]
134 eor $t2,$t2,$t4,ror#$sigma1[1]
135 eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
136 ldr $t1,[sp,#`($i+0)%16`*4]
137 eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
138 ldr $t4,[sp,#`($i+9)%16`*4]
141 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
143 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
144 add $t1,$t1,$t4 @ X[i]
150 #include "arm_arch.h"
158 .word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
159 .word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
160 .word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
161 .word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
162 .word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
163 .word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
164 .word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
165 .word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
166 .word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
167 .word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
168 .word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
169 .word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
170 .word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
171 .word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
172 .word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
173 .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
177 .word OPENSSL_armcap_P-sha256_block_data_order
180 .global sha256_block_data_order
181 .type sha256_block_data_order,%function
182 sha256_block_data_order:
183 sub r3,pc,#8 @ sha256_block_data_order
184 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
186 ldr r12,.LOPENSSL_armcap
187 ldr r12,[r3,r12] @ OPENSSL_armcap_P
191 stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
192 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
193 sub $Ktbl,r3,#256+32 @ K256
194 sub sp,sp,#16*4 @ alloca(X[16])
201 eor $t3,$B,$C @ magic
204 for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
205 $code.=".Lrounds_16_xx:\n";
206 for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
208 ldreq $t3,[sp,#16*4] @ pull ctx
211 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
226 ldr $inp,[sp,#17*4] @ pull inp
227 ldr $t2,[sp,#18*4] @ pull inp+len
230 stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
232 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
235 add sp,sp,#`16+3`*4 @ destroy frame
237 ldmia sp!,{r4-r11,pc}
239 ldmia sp!,{r4-r11,lr}
241 moveq pc,lr @ be binary compatible with V4, yet
242 bx lr @ interoperable with Thumb ISA:-)
245 ######################################################################
249 my @X=map("q$_",(0..3));
250 my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
# Map a NEON quad register name "qN" to its low 64-bit half "d(2N)";
# yields "" when the argument does not look like a q register.
sub Dlo() { my $q = shift; return ($q =~ m|q([1]?[0-9])|) ? "d".(2*$1) : ""; }
# Map a NEON quad register name "qN" to its high 64-bit half "d(2N+1)";
# yields "" when the argument does not look like a q register.
sub Dhi() { my $q = shift; return ($q =~ m|q([1]?[0-9])|) ? "d".(2*$1+1) : ""; }
257 sub AUTOLOAD() # thunk [simplified] x86-style perlasm
258 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
260 $arg = "#$arg" if ($arg*1 eq $arg);
261 $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
267 my @insns = (&$body,&$body,&$body,&$body);
268 my ($a,$b,$c,$d,$e,$f,$g,$h);
270 &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
274 &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
278 &vshr_u32 ($T2,$T0,$sigma0[0]);
281 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
284 &vshr_u32 ($T1,$T0,$sigma0[2]);
287 &vsli_32 ($T2,$T0,32-$sigma0[0]);
290 &vshr_u32 ($T3,$T0,$sigma0[1]);
296 &vsli_32 ($T3,$T0,32-$sigma0[1]);
299 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
302 &veor ($T1,$T1,$T3); # sigma0(X[1..4])
305 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
308 &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
311 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
317 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
320 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
323 &veor ($T5,$T5,$T4); # sigma1(X[14..15])
326 &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
329 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
332 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
335 &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
341 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
344 &vld1_32 ("{$T0}","[$Ktbl,:128]!");
347 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
350 &veor ($T5,$T5,$T4); # sigma1(X[16..17])
353 &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
356 &vadd_i32 ($T0,$T0,@X[0]);
357 while($#insns>=2) { eval(shift(@insns)); }
358 &vst1_32 ("{$T0}","[$Xfer,:128]!");
362 push(@X,shift(@X)); # "rotate" X[]
368 my @insns = (&$body,&$body,&$body,&$body);
369 my ($a,$b,$c,$d,$e,$f,$g,$h);
375 &vld1_32 ("{$T0}","[$Ktbl,:128]!");
380 &vrev32_8 (@X[0],@X[0]);
385 &vadd_i32 ($T0,$T0,@X[0]);
386 foreach (@insns) { eval; } # remaining instructions
387 &vst1_32 ("{$T0}","[$Xfer,:128]!");
389 push(@X,shift(@X)); # "rotate" X[]
394 '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
395 '&add ($h,$h,$t1)', # h+=X[i]+K[i]
397 '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
398 '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
400 '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
401 '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
402 '&eor ($t1,$t1,$g)', # Ch(e,f,g)
403 '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
404 '&eor ($t2,$a,$b)', # a^b, b^c in next round
405 '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
406 '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
407 '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
408 '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
409 '&ldr ($t1,"[sp,#64]") if ($j==31)',
410 '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
411 '&add ($d,$d,$h)', # d+=h
412 '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
413 '&eor ($t3,$t3,$b)', # Maj(a,b,c)
414 '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
423 stmdb sp!,{r4-r12,lr}
426 sub sp,sp,#16*4+16 @ alloca
427 sub $Ktbl,r3,#256+32 @ K256
428 bic sp,sp,#15 @ align for 128-bit stores
430 vld1.8 {@X[0]},[$inp]!
431 vld1.8 {@X[1]},[$inp]!
432 vld1.8 {@X[2]},[$inp]!
433 vld1.8 {@X[3]},[$inp]!
434 vld1.32 {$T0},[$Ktbl,:128]!
435 vld1.32 {$T1},[$Ktbl,:128]!
436 vld1.32 {$T2},[$Ktbl,:128]!
437 vld1.32 {$T3},[$Ktbl,:128]!
438 vrev32.8 @X[0],@X[0] @ yes, even on
440 vrev32.8 @X[1],@X[1] @ big-endian
446 str $t2,[sp,#76] @ save original sp
447 vadd.i32 $T0,$T0,@X[0]
448 vadd.i32 $T1,$T1,@X[1]
449 vst1.32 {$T0},[$Xfer,:128]!
450 vadd.i32 $T2,$T2,@X[2]
451 vst1.32 {$T1},[$Xfer,:128]!
452 vadd.i32 $T3,$T3,@X[3]
453 vst1.32 {$T2},[$Xfer,:128]!
454 vst1.32 {$T3},[$Xfer,:128]!
466 &Xupdate(\&body_00_15);
467 &Xupdate(\&body_00_15);
468 &Xupdate(\&body_00_15);
469 &Xupdate(\&body_00_15);
471 teq $t1,#0 @ check for K256 terminator
478 sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
480 subeq $inp,$inp,#64 @ avoid SEGV
481 vld1.8 {@X[0]},[$inp]! @ load next input block
482 vld1.8 {@X[1]},[$inp]!
483 vld1.8 {@X[2]},[$inp]!
484 vld1.8 {@X[3]},[$inp]!
488 &Xpreload(\&body_00_15);
489 &Xpreload(\&body_00_15);
490 &Xpreload(\&body_00_15);
491 &Xpreload(\&body_00_15);
494 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
498 add $A,$A,$t0 @ accumulate
519 ldreq sp,[sp,#76] @ restore original sp
523 ldmia sp!,{r4-r12,pc}
528 .size sha256_block_data_order,.-sha256_block_data_order
529 .asciz "SHA256 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
531 .comm OPENSSL_armcap_P,4,4
534 $code =~ s/\`([^\`]*)\`/eval $1/gem;
535 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
537 close STDOUT; # enforce flush