2 # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
16 # Permission to use under GPL terms is granted.
17 # ====================================================================
19 # SHA256 block procedure for ARMv4. May 2007.
21 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
22 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
23 # byte [on single-issue Xscale PXA250 core].
27 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
28 # Cortex A8 core and ~20 cycles per processed byte.
32 # Profiler-assisted and platform-specific optimization resulted in 16%
33 # improvement on Cortex A8 core and ~15.4 cycles per processed byte.
37 # Add NEON implementation. On Cortex A8 it was measured to process one
38 # byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
39 # S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
40 # code (meaning that latter performs sub-optimally, nothing was done
45 # Add ARMv8 code path performing at 2.0 cpb on Apple A7.
# Command-line handling: an argument that looks like a file name
# (\w[\w\-]*\.\w+) is taken as the output file; anything before it is the
# flavour. NOTE(review): $flavour appears to be consumed from @ARGV on an
# elided line -- confirm.
48 if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
49 else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
# When a real flavour is requested, locate the arm-xlate.pl preprocessor next
# to this script or under ../../perlasm and pipe all generated code through
# it; otherwise STDOUT is opened directly on the output file.
51 if ($flavour && $flavour ne "void") {
52 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
53 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
54 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
55 die "can't locate arm-xlate.pl";
57 open STDOUT,"| \"$^X\" $xlate $flavour $output";
59 open STDOUT,">$output";
# @V lists the eight SHA-256 working variables; the $A..$H register names are
# assigned on elided lines. @V is rotated after each generated round so the
# same round body serves all eight role permutations.
74 @V=($A,$B,$C,$D,$E,$F,$G,$H);
# BODY_00_15 -- emits one round of the integer-only SHA-256 code into $code.
# NOTE(review): the enclosing "sub BODY_00_15 {" header is on an elided line.
84 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
# Rounds 0..15 fetch the message word straight from the input stream (the
# ldr/ldrb variants in the heredoc below); $i selects which variant is emitted.
86 $code.=<<___ if ($i<16);
88 @ ldr $t1,[$inp],#4 @ $i
90 str $inp,[sp,#17*4] @ make room for $t4
92 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
93 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
94 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
99 @ ldrb $t1,[$inp,#3] @ $i
100 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
103 orr $t1,$t1,$t2,lsl#8
105 orr $t1,$t1,$t0,lsl#16
107 str $inp,[sp,#17*4] @ make room for $t4
109 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
110 orr $t1,$t1,$t2,lsl#24
111 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
115 ldr $t2,[$Ktbl],#4 @ *K256++
116 add $h,$h,$t1 @ h+=X[i]
117 str $t1,[sp,#`$i%16`*4]
119 add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
121 add $h,$h,$t2 @ h+=K256[i]
122 eor $t1,$t1,$g @ Ch(e,f,g)
123 eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
124 add $h,$h,$t1 @ h+=Ch(e,f,g)
127 cmp $t2,#0xf2 @ done?
131 ldr $t1,[$inp],#4 @ prefetch
135 eor $t2,$a,$b @ a^b, b^c in next round
137 ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
138 eor $t2,$a,$b @ a^b, b^c in next round
139 ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
141 eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
142 and $t3,$t3,$t2 @ (b^c)&=(a^b)
144 eor $t3,$t3,$b @ Maj(a,b,c)
145 add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
146 @ add $h,$h,$t3 @ h+=Maj(a,b,c)
# BODY_16_XX -- message-schedule rounds (i >= 16): reads X[i+1] and X[i+14]
# back from the 16-word stack ring (see the sp,#(($i+1)%16)*4 loads below)
# and applies sigma0/sigma1 before falling into the common round logic.
# NOTE(review): the enclosing "sub BODY_16_XX {" header is on an elided line.
152 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
155 @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
156 @ ldr $t4,[sp,#`($i+14)%16`*4]
157 mov $t0,$t1,ror#$sigma0[0]
158 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
159 mov $t2,$t4,ror#$sigma1[0]
160 eor $t0,$t0,$t1,ror#$sigma0[1]
161 eor $t2,$t2,$t4,ror#$sigma1[1]
162 eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
163 ldr $t1,[sp,#`($i+0)%16`*4]
164 eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
165 ldr $t4,[sp,#`($i+9)%16`*4]
168 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
170 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
171 add $t1,$t1,$t4 @ X[i]
178 # include "arm_arch.h"
180 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
181 # define __ARM_MAX_ARCH__ 7
185 #if defined(__thumb2__)
195 .word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
196 .word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
197 .word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
198 .word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
199 .word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
200 .word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
201 .word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
202 .word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
203 .word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
204 .word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
205 .word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
206 .word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
207 .word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
208 .word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
209 .word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
210 .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
213 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
215 .word OPENSSL_armcap_P-.Lsha256_block_data_order
219 .global sha256_block_data_order
220 .type sha256_block_data_order,%function
221 sha256_block_data_order:
222 .Lsha256_block_data_order:
223 #if __ARM_ARCH__<7 && !defined(__thumb2__)
224 sub r3,pc,#8 @ sha256_block_data_order
226 adr r3,.Lsha256_block_data_order
228 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
229 ldr r12,.LOPENSSL_armcap
230 ldr r12,[r3,r12] @ OPENSSL_armcap_P
234 tst r12,#ARMV8_SHA256
239 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
240 stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
241 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
242 sub $Ktbl,r3,#256+32 @ K256
243 sub sp,sp,#16*4 @ alloca(X[16])
250 eor $t3,$B,$C @ magic
# Emit the first 16 rounds fully unrolled, rotating @V so each generated
# round sees the correctly shifted a..h roles.
253 for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
254 $code.=".Lrounds_16_xx:\n";
# Rounds 16..31 are emitted once and re-executed at run time: the generated
# code branches back to .Lrounds_16_xx until the K256 terminator byte 0xf2 is
# seen (the "cmp $t2,#0xf2 @ done?" test emitted above).
255 for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
258 ite eq @ Thumb2 thing, sanity check in ARM
260 ldreq $t3,[sp,#16*4] @ pull ctx
263 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
278 ldr $inp,[sp,#17*4] @ pull inp
279 ldr $t2,[sp,#18*4] @ pull inp+len
282 stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
284 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
287 add sp,sp,#`16+3`*4 @ destroy frame
289 ldmia sp!,{r4-r11,pc}
291 ldmia sp!,{r4-r11,lr}
293 moveq pc,lr @ be binary compatible with V4, yet
294 bx lr @ interoperable with Thumb ISA:-)
296 .size sha256_block_data_order,.-sha256_block_data_order
298 ######################################################################
# NEON register allocation: @X[0..3] hold the 16 message-schedule words (four
# 32-bit words per q-register); $T0..$T3 are q-sized scratch, $T4/$T5 d-sized
# scratch halves.
302 my @X=map("q$_",(0..3));
303 my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
sub Dlo()
{
	# Low d-register half of a NEON quad register: "qN" -> "d(2N)".
	# Yields "" when the argument contains no q0..q19 register name.
	my ($qreg) = @_;
	return "" unless $qreg =~ m|q(1?[0-9])|;
	return "d".(2*$1);
}
sub Dhi()
{
	# High d-register half of a NEON quad register: "qN" -> "d(2N+1)".
	# Yields "" when the argument contains no q0..q19 register name.
	my ($qreg) = @_;
	return "" unless $qreg =~ m|q(1?[0-9])|;
	return "d".(2*$1+1);
}
# Catch-all for otherwise-undefined &opcode_suffix(...) calls: converts the
# sub name into an assembler mnemonic ("vadd_i32" -> "vadd.i32") and appends
# the instruction line to $code.
310 sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
311 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
# NOTE(review): $arg is assigned on an elided line (presumably popped from
# @_); a purely numeric $arg is rewritten as an immediate operand "#n".
313 $arg = "#$arg" if ($arg*1 eq $arg);
314 $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
# Xupdate fragment -- NOTE(review): the "sub Xupdate" header is on an elided
# line. Collect four rounds' worth of scalar-instruction snippets from &$body
# up front; they are eval'd between the NEON ops that follow (interleaving
# lines elided from this view).
320 my @insns = (&$body,&$body,&$body,&$body);
321 my ($a,$b,$c,$d,$e,$f,$g,$h);
# NEON message-schedule step, advancing X[] by four words:
#   X[0..3] += X[9..12]; X[0..3] += sigma0(X[1..4]); then sigma1 of the two
#   newest words is folded in half a vector at a time (Dlo/Dhi halves), and
#   the next four K256 constants are added, staging K+W in the Xfer ring for
#   the scalar rounds. Scalar snippets from @insns are eval'd between these
#   calls on elided lines.
323 &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
327 &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
331 &vshr_u32 ($T2,$T0,$sigma0[0]);
334 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
337 &vshr_u32 ($T1,$T0,$sigma0[2]);
340 &vsli_32 ($T2,$T0,32-$sigma0[0]);
343 &vshr_u32 ($T3,$T0,$sigma0[1]);
349 &vsli_32 ($T3,$T0,32-$sigma0[1]);
352 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
355 &veor ($T1,$T1,$T3); # sigma0(X[1..4])
358 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
361 &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
364 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
370 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
373 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
376 &veor ($T5,$T5,$T4); # sigma1(X[14..15])
379 &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
382 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
385 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
388 &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
394 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
397 &vld1_32 ("{$T0}","[$Ktbl,:128]!");
400 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
403 &veor ($T5,$T5,$T4); # sigma1(X[16..17])
406 &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
409 &vadd_i32 ($T0,$T0,@X[0]);
# drain the interleaved scalar snippets down to the last two
410 while($#insns>=2) { eval(shift(@insns)); }
411 &vst1_32 ("{$T0}","[$Xfer,:128]!");
415 push(@X,shift(@X)); # "rotate" X[]
# Xpreload fragment -- NOTE(review): the "sub Xpreload" header is on an
# elided line. Used when the schedule no longer needs updating (presumably
# the last 16 rounds of a block): byte-swap the freshly loaded input words,
# add the round constants and stage K+W in the Xfer ring, with scalar rounds
# from @insns interleaved on elided eval lines.
421 my @insns = (&$body,&$body,&$body,&$body);
422 my ($a,$b,$c,$d,$e,$f,$g,$h);
428 &vld1_32 ("{$T0}","[$Ktbl,:128]!");
433 &vrev32_8 (@X[0],@X[0]);
438 &vadd_i32 ($T0,$T0,@X[0]);
439 foreach (@insns) { eval; } # remaining instructions
440 &vst1_32 ("{$T0}","[$Xfer,:128]!");
442 push(@X,shift(@X)); # "rotate" X[]
# body_00_15 fragment -- one scalar SHA-256 round expressed as a list of
# stringified instruction generators; Xupdate/Xpreload eval these one at a
# time between NEON ops. $j counts rounds; @V and the $t2/$t3 Maj
# accumulators swap at the end of each round. NOTE(review): the
# "sub body_00_15" opener is on an elided line; the strings themselves are
# runtime data and are reproduced untouched.
447 '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
448 '&add ($h,$h,$t1)', # h+=X[i]+K[i]
450 '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
451 '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
453 '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
454 '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
455 '&eor ($t1,$t1,$g)', # Ch(e,f,g)
456 '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
457 '&eor ($t2,$a,$b)', # a^b, b^c in next round
458 '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
459 '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
460 '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
461 '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
462 '&ldr ($t1,"[sp,#64]") if ($j==31)',
463 '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
464 '&add ($d,$d,$h)', # d+=h
465 '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
466 '&eor ($t3,$t3,$b)', # Maj(a,b,c)
467 '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
472 #if __ARM_MAX_ARCH__>=7
476 .global sha256_block_data_order_neon
477 .type sha256_block_data_order_neon,%function
480 sha256_block_data_order_neon:
482 stmdb sp!,{r4-r12,lr}
486 bic $H,$H,#15 @ align for 128-bit stores
489 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
491 vld1.8 {@X[0]},[$inp]!
492 vld1.8 {@X[1]},[$inp]!
493 vld1.8 {@X[2]},[$inp]!
494 vld1.8 {@X[3]},[$inp]!
495 vld1.32 {$T0},[$Ktbl,:128]!
496 vld1.32 {$T1},[$Ktbl,:128]!
497 vld1.32 {$T2},[$Ktbl,:128]!
498 vld1.32 {$T3},[$Ktbl,:128]!
499 vrev32.8 @X[0],@X[0] @ yes, even on
501 vrev32.8 @X[1],@X[1] @ big-endian
507 str $t2,[sp,#76] @ save original sp
508 vadd.i32 $T0,$T0,@X[0]
509 vadd.i32 $T1,$T1,@X[1]
510 vst1.32 {$T0},[$Xfer,:128]!
511 vadd.i32 $T2,$T2,@X[2]
512 vst1.32 {$T1},[$Xfer,:128]!
513 vadd.i32 $T3,$T3,@X[3]
514 vst1.32 {$T2},[$Xfer,:128]!
515 vst1.32 {$T3},[$Xfer,:128]!
# Sixteen schedule-updating rounds per loop pass: each Xupdate emits the NEON
# update for four message words plus four interleaved scalar rounds.
527 &Xupdate(\&body_00_15);
528 &Xupdate(\&body_00_15);
529 &Xupdate(\&body_00_15);
530 &Xupdate(\&body_00_15);
532 teq $t1,#0 @ check for K256 terminator
539 sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
542 subeq $inp,$inp,#64 @ avoid SEGV
543 vld1.8 {@X[0]},[$inp]! @ load next input block
544 vld1.8 {@X[1]},[$inp]!
545 vld1.8 {@X[2]},[$inp]!
546 vld1.8 {@X[3]},[$inp]!
# Final 16 rounds of the block: no more schedule updates, just the round
# constants added to the byte-swapped next input block (see Xpreload).
551 &Xpreload(\&body_00_15);
552 &Xpreload(\&body_00_15);
553 &Xpreload(\&body_00_15);
554 &Xpreload(\&body_00_15);
557 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
561 add $A,$A,$t0 @ accumulate
583 ldreq sp,[sp,#76] @ restore original sp
588 ldmia sp!,{r4-r12,pc}
589 .size sha256_block_data_order_neon,.-sha256_block_data_order_neon
593 ######################################################################
# ARMv8 Crypto Extensions register map: hash state halves ABCD/EFGH in q0/q1
# with q2 ($abcd) as round scratch, message block in q8-q11, K+W words and
# the per-block state backup in q12-q15.
597 my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
598 my @MSG=map("q$_",(8..11));
599 my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
603 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
605 # if defined(__thumb2__)
606 # define INST(a,b,c,d) .byte c,d|0xc,a,b
608 # define INST(a,b,c,d) .byte a,b,c,d
611 .type sha256_block_data_order_armv8,%function
613 sha256_block_data_order_armv8:
615 vld1.32 {$ABCD,$EFGH},[$ctx]
616 sub $Ktbl,$Ktbl,#256+32
617 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
622 vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
623 vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
624 vld1.32 {$W0},[$Ktbl]!
625 vrev32.8 @MSG[0],@MSG[0]
626 vrev32.8 @MSG[1],@MSG[1]
627 vrev32.8 @MSG[2],@MSG[2]
628 vrev32.8 @MSG[3],@MSG[3]
629 vmov $ABCD_SAVE,$ABCD @ offload
630 vmov $EFGH_SAVE,$EFGH
# Twelve quad-rounds that still update the message schedule: load the next
# four constants, add them to the oldest message quad, run
# sha256su0/sha256h/sha256h2/sha256su1, then ping-pong $W0/$W1 and rotate
# @MSG. NOTE(review): the per-iteration heredoc open/close and the scratch
# copy into $abcd appear on elided lines.
633 for($i=0;$i<12;$i++) {
635 vld1.32 {$W1},[$Ktbl]!
636 vadd.i32 $W0,$W0,@MSG[0]
637 sha256su0 @MSG[0],@MSG[1]
639 sha256h $ABCD,$EFGH,$W0
640 sha256h2 $EFGH,$abcd,$W0
641 sha256su1 @MSG[0],@MSG[2],@MSG[3]
643 ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
646 vld1.32 {$W1},[$Ktbl]!
647 vadd.i32 $W0,$W0,@MSG[0]
649 sha256h $ABCD,$EFGH,$W0
650 sha256h2 $EFGH,$abcd,$W0
652 vld1.32 {$W0},[$Ktbl]!
653 vadd.i32 $W1,$W1,@MSG[1]
655 sha256h $ABCD,$EFGH,$W1
656 sha256h2 $EFGH,$abcd,$W1
658 vld1.32 {$W1},[$Ktbl]
659 vadd.i32 $W0,$W0,@MSG[2]
660 sub $Ktbl,$Ktbl,#256-16 @ rewind
662 sha256h $ABCD,$EFGH,$W0
663 sha256h2 $EFGH,$abcd,$W0
665 vadd.i32 $W1,$W1,@MSG[3]
667 sha256h $ABCD,$EFGH,$W1
668 sha256h2 $EFGH,$abcd,$W1
670 vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
671 vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
675 vst1.32 {$ABCD,$EFGH},[$ctx]
678 .size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
683 .asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
685 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
686 .comm OPENSSL_armcap_P,4,4
# NOTE(review): appears to be part of a loop that copies this script's own
# leading comment block into the output (the surrounding read loop is on
# elided lines): '#' comment lines are rewritten in place as assembler '@'
# comments; the first line that is neither a comment nor blank ends the copy.
693 last if (!s/^#/@/ and !/^$/);
# Base 32-bit encodings for the ARMv8 SHA-256 instructions; unsha256() ORs
# the q-register operand fields into these. NOTE(review): the
# "my %opcode = (" opener is on an elided line.
699 "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
700 "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
# unsha256 fragment -- NOTE(review): the "sub unsha256 {" header and the
# trailing sprintf arguments/closing braces are on elided lines. Encodes a
# sha256* mnemonic with two or three q-register operands into its 32-bit
# instruction word and emits it byte-by-byte through the INST macro, so the
# file still assembles on toolchains without Crypto Extension mnemonics.
703 my ($mnemonic,$arg)=@_;
705 if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
# Each register number is split into its low 3 bits and high bit and placed
# at the encoding's register-field bit positions.
706 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
707 |(($2&7)<<17)|(($2&8)<<4)
708 |(($3&7)<<1) |(($3&8)<<2);
709 # since ARMv7 instructions are always encoded little-endian.
710 # correct solution is to use .inst directive, but older
711 # assemblers don't implement it:-(
712 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
713 $word&0xff,($word>>8)&0xff,
714 ($word>>16)&0xff,($word>>24)&0xff,
# Final pass over the generated code, one line at a time, before printing
# (the print call and loop close are on elided lines).
720 foreach (split($/,$code)) {
# resolve backtick-quoted compile-time arithmetic (rotate amounts, offsets)
722 s/\`([^\`]*)\`/eval $1/geo;
# rewrite sha256* mnemonics as raw INST() byte sequences (see unsha256)
724 s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
# "ret" becomes "bx lr"; only a pre-existing "bx lr" is demoted to its raw
# opcode word, keeping the output assemblable with -march=armv4
726 s/\bret\b/bx lr/go or
727 s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
# Flush and close the output; buffered-write or downstream-pipe (xlate)
# failures only surface at close on a write handle, so they must not be
# silently ignored.
close STDOUT or die "error closing STDOUT: $!";