3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # SHA256 block procedure for ARMv4. May 2007.
12 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
13 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
14 # byte [on single-issue Xscale PXA250 core].
18 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
19 # Cortex A8 core and ~20 cycles per processed byte.
23 # Profiler-assisted and platform-specific optimization resulted in 16%
24 # improvement on Cortex A8 core and ~15.4 cycles per processed byte.
28 # Add NEON implementation. On Cortex A8 it was measured to process one
29 # byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
30 # S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
31 # code (meaning that latter performs sub-optimally, nothing was done
# Scan the command line for the output file: skip any leading arguments
# that do not look like a file name (word.ext), e.g. flavour/compiler
# flags passed through by the build system.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
# Redirect STDOUT to the output file.  Three-argument open avoids mode
# injection via the file name, and the explicit check makes a bad or
# missing path fail loudly instead of silently emitting nothing.
open STDOUT,">",$output or die "can't open $output: $!";
49 @V=($A,$B,$C,$D,$E,$F,$G,$H);
59 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
61 $code.=<<___ if ($i<16);
63 @ ldr $t1,[$inp],#4 @ $i
65 str $inp,[sp,#17*4] @ make room for $t4
67 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
68 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
69 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
72 @ ldrb $t1,[$inp,#3] @ $i
73 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
78 orr $t1,$t1,$t0,lsl#16
80 str $inp,[sp,#17*4] @ make room for $t4
82 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
83 orr $t1,$t1,$t2,lsl#24
84 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
88 ldr $t2,[$Ktbl],#4 @ *K256++
89 add $h,$h,$t1 @ h+=X[i]
90 str $t1,[sp,#`$i%16`*4]
92 add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
94 add $h,$h,$t2 @ h+=K256[i]
95 eor $t1,$t1,$g @ Ch(e,f,g)
96 eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
97 add $h,$h,$t1 @ h+=Ch(e,f,g)
100 cmp $t2,#0xf2 @ done?
104 ldr $t1,[$inp],#4 @ prefetch
108 eor $t2,$a,$b @ a^b, b^c in next round
110 ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
111 eor $t2,$a,$b @ a^b, b^c in next round
112 ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
114 eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
115 and $t3,$t3,$t2 @ (b^c)&=(a^b)
117 eor $t3,$t3,$b @ Maj(a,b,c)
118 add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
119 @ add $h,$h,$t3 @ h+=Maj(a,b,c)
125 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
128 @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
129 @ ldr $t4,[sp,#`($i+14)%16`*4]
130 mov $t0,$t1,ror#$sigma0[0]
131 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
132 mov $t2,$t4,ror#$sigma1[0]
133 eor $t0,$t0,$t1,ror#$sigma0[1]
134 eor $t2,$t2,$t4,ror#$sigma1[1]
135 eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
136 ldr $t1,[sp,#`($i+0)%16`*4]
137 eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
138 ldr $t4,[sp,#`($i+9)%16`*4]
141 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
143 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
144 add $t1,$t1,$t4 @ X[i]
150 #include "arm_arch.h"
158 .word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
159 .word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
160 .word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
161 .word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
162 .word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
163 .word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
164 .word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
165 .word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
166 .word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
167 .word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
168 .word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
169 .word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
170 .word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
171 .word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
172 .word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
173 .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
177 .word OPENSSL_armcap_P-sha256_block_data_order
180 .global sha256_block_data_order
181 .type sha256_block_data_order,%function
182 sha256_block_data_order:
183 sub r3,pc,#8 @ sha256_block_data_order
184 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
186 ldr r12,.LOPENSSL_armcap
187 ldr r12,[r3,r12] @ OPENSSL_armcap_P
191 stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
192 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
193 sub $Ktbl,r3,#256+32 @ K256
194 sub sp,sp,#16*4 @ alloca(X[16])
201 eor $t3,$B,$C @ magic
204 for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
205 $code.=".Lrounds_16_xx:\n";
206 for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
208 ldreq $t3,[sp,#16*4] @ pull ctx
211 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
226 ldr $inp,[sp,#17*4] @ pull inp
227 ldr $t2,[sp,#18*4] @ pull inp+len
230 stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
232 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
235 add sp,sp,#`16+3`*4 @ destroy frame
237 ldmia sp!,{r4-r11,pc}
239 ldmia sp!,{r4-r11,lr}
241 moveq pc,lr @ be binary compatible with V4, yet
242 bx lr @ interoperable with Thumb ISA:-)
245 ######################################################################
249 my @X=map("q$_",(0..3));
250 my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
# Map a NEON quad register name "qN" to its low 64-bit half "d(2N)";
# yields "" when the argument does not look like a q register.
sub Dlo() { my $q = shift; return ($q =~ m|q([1]?[0-9])|) ? "d".(2*$1) : ""; }
# Map a NEON quad register name "qN" to its high 64-bit half "d(2N+1)";
# yields "" when the argument does not look like a q register.
sub Dhi() { my $q = shift; return ($q =~ m|q([1]?[0-9])|) ? "d".(2*$1+1) : ""; }
257 sub AUTOLOAD() # thunk [simplified] x86-style perlasm
258 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
260 $arg = "#$arg" if ($arg*1 eq $arg);
261 $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
267 my @insns = (&$body,&$body,&$body,&$body);
268 my ($a,$b,$c,$d,$e,$f,$g,$h);
270 &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
274 &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
278 &vshr_u32 ($T2,$T0,$sigma0[0]);
281 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
284 &vshr_u32 ($T1,$T0,$sigma0[2]);
287 &vsli_32 ($T2,$T0,32-$sigma0[0]);
290 &vshr_u32 ($T3,$T0,$sigma0[1]);
296 &vsli_32 ($T3,$T0,32-$sigma0[1]);
299 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
302 &veor ($T1,$T1,$T3); # sigma0(X[1..4])
305 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
308 &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
311 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
317 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
320 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
323 &veor ($T5,$T5,$T4); # sigma1(X[14..15])
326 &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
329 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
332 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
335 &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
341 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
344 &vld1_32 ("{$T0}","[$Ktbl,:128]!");
347 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
350 &veor ($T5,$T5,$T4); # sigma1(X[16..17])
353 &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
356 &vadd_i32 ($T0,$T0,@X[0]);
357 while($#insns>=2) { eval(shift(@insns)); }
358 &vst1_32 ("{$T0}","[$Xfer,:128]!");
362 push(@X,shift(@X)); # "rotate" X[]
368 my @insns = (&$body,&$body,&$body,&$body);
369 my ($a,$b,$c,$d,$e,$f,$g,$h);
375 &vld1_32 ("{$T0}","[$Ktbl,:128]!");
380 &vrev32_8 (@X[0],@X[0]);
385 &vadd_i32 ($T0,$T0,@X[0]);
386 foreach (@insns) { eval; } # remaining instructions
387 &vst1_32 ("{$T0}","[$Xfer,:128]!");
389 push(@X,shift(@X)); # "rotate" X[]
394 '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
395 '&add ($h,$h,$t1)', # h+=X[i]+K[i]
397 '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
398 '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
400 '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
401 '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
402 '&eor ($t1,$t1,$g)', # Ch(e,f,g)
403 '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
404 '&eor ($t2,$a,$b)', # a^b, b^c in next round
405 '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
406 '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
407 '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
408 '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
409 '&ldr ($t1,"[sp,#64]") if ($j==31)',
410 '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
411 '&add ($d,$d,$h)', # d+=h
412 '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
413 '&eor ($t3,$t3,$b)', # Maj(a,b,c)
414 '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
423 stmdb sp!,{r4-r12,lr}
426 sub sp,sp,#16*4+16 @ alloca
427 sub $Ktbl,r3,#256+32 @ K256
428 bic sp,sp,#15 @ align for 128-bit stores
430 vld1.8 {@X[0]},[$inp]!
431 vld1.8 {@X[1]},[$inp]!
432 vld1.8 {@X[2]},[$inp]!
433 vld1.8 {@X[3]},[$inp]!
434 vld1.32 {$T0},[$Ktbl,:128]!
435 vld1.32 {$T1},[$Ktbl,:128]!
436 vld1.32 {$T2},[$Ktbl,:128]!
437 vld1.32 {$T3},[$Ktbl,:128]!
438 vrev32.8 @X[0],@X[0] @ yes, even on
440 vrev32.8 @X[1],@X[1] @ big-endian
446 str $t2,[sp,#76] @ save original sp
447 vadd.i32 $T0,$T0,@X[0]
448 vadd.i32 $T1,$T1,@X[1]
449 vst1.32 {$T0},[$Xfer,:128]!
450 vadd.i32 $T2,$T2,@X[2]
451 vst1.32 {$T1},[$Xfer,:128]!
452 vadd.i32 $T3,$T3,@X[3]
453 vst1.32 {$T2},[$Xfer,:128]!
454 vst1.32 {$T3},[$Xfer,:128]!
466 &Xupdate(\&body_00_15);
467 &Xupdate(\&body_00_15);
468 &Xupdate(\&body_00_15);
469 &Xupdate(\&body_00_15);
471 teq $t1,#0 @ check for K256 terminator
478 sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
480 subeq $inp,$inp,#64 @ avoid SEGV
481 vld1.8 {@X[0]},[$inp]! @ load next input block
482 vld1.8 {@X[1]},[$inp]!
483 vld1.8 {@X[2]},[$inp]!
484 vld1.8 {@X[3]},[$inp]!
488 &Xpreload(\&body_00_15);
489 &Xpreload(\&body_00_15);
490 &Xpreload(\&body_00_15);
491 &Xpreload(\&body_00_15);
494 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
498 add $A,$A,$t0 @ accumulate
519 ldreq sp,[sp,#76] @ restore original sp
523 ldmia sp!,{r4-r12,pc}
528 .size sha256_block_data_order,.-sha256_block_data_order
529 .asciz "SHA256 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
531 .comm OPENSSL_armcap_P,4,4
534 $code =~ s/\`([^\`]*)\`/eval $1/gem;
535 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
537 close STDOUT; # enforce flush