3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # SHA512 block procedure for ARMv4. September 2007.
12 # This code is ~4.5 (four and a half) times faster than code generated
13 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
14 # Xscale PXA250 core].
18 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
19 # Cortex A8 core and ~40 cycles per processed byte.
23 # Profiler-assisted and platform-specific optimization resulted in 7%
24 # improvement on Cortex A8 core and ~38 cycles per byte.
28 # Add NEON implementation. On Cortex A8 it was measured to process
29 # one byte in 23.3 cycles or ~60% faster than integer-only code.
33 # Improve NEON performance by 12% on Snapdragon S4. In absolute
34 # terms it's 22.6 cycles per byte, which is a disappointing result.
35 # Technical writers asserted that 3-way S4 pipeline can sustain
36 # multiple NEON instructions per cycle, but dual NEON issue could
37 # not be observed, and for NEON-only sequences IPC(*) was found to
38 # be limited by 1:-( 0.33 and 0.66 were measured for sequences with
39 # ILPs(*) of 1 and 2 respectively. This in turn means that you can
40 # even find yourself striving, as I did here, for achieving IPC
41 # adequate to one delivered by Cortex A8 [for reference, it's
42 # 0.5 for ILP of 1, and 1 for higher ILPs].
44 # (*) ILP, instruction-level parallelism, how many instructions
45 # *can* execute at the same time. IPC, instructions per cycle,
46 # indicates how many instructions actually execute.
48 # Byte order [in]dependence. =========================================
50 # Originally caller was expected to maintain specific *dword* order in
51 # h[0-7], namely with most significant dword at *lower* address, which
52 # was reflected in below two parameters as 0 and 4. Now caller is
53 # expected to maintain native byte order for whole 64-bit values.
56 # ====================================================================
58 # Scan the argument list for the first token that looks like an output
58 # filename (word chars/hyphens plus an extension), discarding any
58 # flag-style arguments that precede it.
58 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
59 # Redirect STDOUT to that file so all generated assembly goes there.
59 # Three-arg open prevents mode injection via the filename, and a
59 # failed open must be fatal — silently generating nothing is worse.
59 open STDOUT,">",$output or die "can't open $output: $!";
61 $ctx="r0"; # parameter block: pointer to the hash state h[0-7], read/written via $Aoff..$Hoff offsets below
75 ############ r13 is stack pointer (reserved — never allocated as a temporary)
77 ############ r15 is program counter (reserved)
92 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
93 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
94 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
96 str $Tlo,[sp,#$Xoff+0]
98 str $Thi,[sp,#$Xoff+4]
99 eor $t0,$t0,$Ehi,lsl#18
100 ldr $t2,[sp,#$Hoff+0] @ h.lo
101 eor $t1,$t1,$Elo,lsl#18
102 ldr $t3,[sp,#$Hoff+4] @ h.hi
103 eor $t0,$t0,$Elo,lsr#18
104 eor $t1,$t1,$Ehi,lsr#18
105 eor $t0,$t0,$Ehi,lsl#14
106 eor $t1,$t1,$Elo,lsl#14
107 eor $t0,$t0,$Ehi,lsr#9
108 eor $t1,$t1,$Elo,lsr#9
109 eor $t0,$t0,$Elo,lsl#23
110 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
112 ldr $t0,[sp,#$Foff+0] @ f.lo
113 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
114 ldr $t1,[sp,#$Foff+4] @ f.hi
116 ldr $t2,[sp,#$Goff+0] @ g.lo
117 adc $Thi,$Thi,$t3 @ T += h
118 ldr $t3,[sp,#$Goff+4] @ g.hi
121 str $Elo,[sp,#$Eoff+0]
123 str $Ehi,[sp,#$Eoff+4]
125 str $Alo,[sp,#$Aoff+0]
127 str $Ahi,[sp,#$Aoff+4]
129 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
130 eor $t1,$t1,$t3 @ Ch(e,f,g)
131 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
134 ldr $Elo,[sp,#$Doff+0] @ d.lo
135 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
136 ldr $Ehi,[sp,#$Doff+4] @ d.hi
139 adc $Thi,$Thi,$t3 @ T += K[i]
141 ldr $t2,[sp,#$Boff+0] @ b.lo
142 adc $Ehi,$Ehi,$Thi @ d += T
145 ldr $t3,[sp,#$Coff+0] @ c.lo
147 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
148 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
149 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
152 eor $t0,$t0,$Ahi,lsl#4
153 eor $t1,$t1,$Alo,lsl#4
154 eor $t0,$t0,$Ahi,lsr#2
155 eor $t1,$t1,$Alo,lsr#2
156 eor $t0,$t0,$Alo,lsl#30
157 eor $t1,$t1,$Ahi,lsl#30
158 eor $t0,$t0,$Ahi,lsr#7
159 eor $t1,$t1,$Alo,lsr#7
160 eor $t0,$t0,$Alo,lsl#25
161 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
164 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
166 ldr $t1,[sp,#$Boff+4] @ b.hi
168 ldr $t2,[sp,#$Coff+4] @ c.hi
172 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
175 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
177 adc $Ahi,$Ahi,$Thi @ h += T
183 #include "arm_arch.h"
187 # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
191 # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
199 WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
200 WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
201 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
202 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
203 WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
204 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
205 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
206 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
207 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
208 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
209 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
210 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
211 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
212 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
213 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
214 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
215 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
216 WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
217 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
218 WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
219 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
220 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
221 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
222 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
223 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
224 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
225 WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
226 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
227 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
228 WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
229 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
230 WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
231 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
232 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
233 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
234 WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
235 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
236 WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
237 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
238 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
241 .word OPENSSL_armcap_P-sha512_block_data_order
244 .global sha512_block_data_order
245 .type sha512_block_data_order,%function
246 sha512_block_data_order:
247 sub r3,pc,#8 @ sha512_block_data_order
248 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
250 ldr r12,.LOPENSSL_armcap
251 ldr r12,[r3,r12] @ OPENSSL_armcap_P
255 stmdb sp!,{r4-r12,lr}
256 sub $Ktbl,r3,#672 @ K512
259 ldr $Elo,[$ctx,#$Eoff+$lo]
260 ldr $Ehi,[$ctx,#$Eoff+$hi]
261 ldr $t0, [$ctx,#$Goff+$lo]
262 ldr $t1, [$ctx,#$Goff+$hi]
263 ldr $t2, [$ctx,#$Hoff+$lo]
264 ldr $t3, [$ctx,#$Hoff+$hi]
266 str $t0, [sp,#$Goff+0]
267 str $t1, [sp,#$Goff+4]
268 str $t2, [sp,#$Hoff+0]
269 str $t3, [sp,#$Hoff+4]
270 ldr $Alo,[$ctx,#$Aoff+$lo]
271 ldr $Ahi,[$ctx,#$Aoff+$hi]
272 ldr $Tlo,[$ctx,#$Boff+$lo]
273 ldr $Thi,[$ctx,#$Boff+$hi]
274 ldr $t0, [$ctx,#$Coff+$lo]
275 ldr $t1, [$ctx,#$Coff+$hi]
276 ldr $t2, [$ctx,#$Doff+$lo]
277 ldr $t3, [$ctx,#$Doff+$hi]
278 str $Tlo,[sp,#$Boff+0]
279 str $Thi,[sp,#$Boff+4]
280 str $t0, [sp,#$Coff+0]
281 str $t1, [sp,#$Coff+4]
282 str $t2, [sp,#$Doff+0]
283 str $t3, [sp,#$Doff+4]
284 ldr $Tlo,[$ctx,#$Foff+$lo]
285 ldr $Thi,[$ctx,#$Foff+$hi]
286 str $Tlo,[sp,#$Foff+0]
287 str $Thi,[sp,#$Foff+4]
297 orr $Tlo,$Tlo,$t0,lsl#8
299 orr $Tlo,$Tlo,$t1,lsl#16
301 orr $Tlo,$Tlo,$t2,lsl#24
302 orr $Thi,$Thi,$t3,lsl#8
303 orr $Thi,$Thi,$t0,lsl#16
304 orr $Thi,$Thi,$t1,lsl#24
318 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
319 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
322 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
323 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
324 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
326 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
328 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
329 eor $Tlo,$Tlo,$t1,lsl#31
330 eor $Thi,$Thi,$t0,lsl#31
331 eor $Tlo,$Tlo,$t0,lsr#8
332 eor $Thi,$Thi,$t1,lsr#8
333 eor $Tlo,$Tlo,$t1,lsl#24
334 eor $Thi,$Thi,$t0,lsl#24
335 eor $Tlo,$Tlo,$t0,lsr#7
336 eor $Thi,$Thi,$t1,lsr#7
337 eor $Tlo,$Tlo,$t1,lsl#25
339 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
340 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
341 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
344 eor $t0,$t0,$t3,lsl#13
345 eor $t1,$t1,$t2,lsl#13
346 eor $t0,$t0,$t3,lsr#29
347 eor $t1,$t1,$t2,lsr#29
348 eor $t0,$t0,$t2,lsl#3
349 eor $t1,$t1,$t3,lsl#3
350 eor $t0,$t0,$t2,lsr#6
351 eor $t1,$t1,$t3,lsr#6
352 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
353 eor $t0,$t0,$t3,lsl#26
355 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
357 ldr $t0,[sp,#`$Xoff+8*16`+0]
360 ldr $t1,[sp,#`$Xoff+8*16`+4]
368 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
369 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
373 ldr $Tlo,[sp,#$Boff+0]
374 ldr $Thi,[sp,#$Boff+4]
375 ldr $t0, [$ctx,#$Aoff+$lo]
376 ldr $t1, [$ctx,#$Aoff+$hi]
377 ldr $t2, [$ctx,#$Boff+$lo]
378 ldr $t3, [$ctx,#$Boff+$hi]
380 str $t0, [$ctx,#$Aoff+$lo]
382 str $t1, [$ctx,#$Aoff+$hi]
384 str $t2, [$ctx,#$Boff+$lo]
386 str $t3, [$ctx,#$Boff+$hi]
388 ldr $Alo,[sp,#$Coff+0]
389 ldr $Ahi,[sp,#$Coff+4]
390 ldr $Tlo,[sp,#$Doff+0]
391 ldr $Thi,[sp,#$Doff+4]
392 ldr $t0, [$ctx,#$Coff+$lo]
393 ldr $t1, [$ctx,#$Coff+$hi]
394 ldr $t2, [$ctx,#$Doff+$lo]
395 ldr $t3, [$ctx,#$Doff+$hi]
397 str $t0, [$ctx,#$Coff+$lo]
399 str $t1, [$ctx,#$Coff+$hi]
401 str $t2, [$ctx,#$Doff+$lo]
403 str $t3, [$ctx,#$Doff+$hi]
405 ldr $Tlo,[sp,#$Foff+0]
406 ldr $Thi,[sp,#$Foff+4]
407 ldr $t0, [$ctx,#$Eoff+$lo]
408 ldr $t1, [$ctx,#$Eoff+$hi]
409 ldr $t2, [$ctx,#$Foff+$lo]
410 ldr $t3, [$ctx,#$Foff+$hi]
412 str $Elo,[$ctx,#$Eoff+$lo]
414 str $Ehi,[$ctx,#$Eoff+$hi]
416 str $t2, [$ctx,#$Foff+$lo]
418 str $t3, [$ctx,#$Foff+$hi]
420 ldr $Alo,[sp,#$Goff+0]
421 ldr $Ahi,[sp,#$Goff+4]
422 ldr $Tlo,[sp,#$Hoff+0]
423 ldr $Thi,[sp,#$Hoff+4]
424 ldr $t0, [$ctx,#$Goff+$lo]
425 ldr $t1, [$ctx,#$Goff+$hi]
426 ldr $t2, [$ctx,#$Hoff+$lo]
427 ldr $t3, [$ctx,#$Hoff+$hi]
429 str $t0, [$ctx,#$Goff+$lo]
431 str $t1, [$ctx,#$Goff+$hi]
433 str $t2, [$ctx,#$Hoff+$lo]
435 str $t3, [$ctx,#$Hoff+$hi]
443 add sp,sp,#8*9 @ destroy frame
445 ldmia sp!,{r4-r12,pc}
447 ldmia sp!,{r4-r12,lr}
449 moveq pc,lr @ be binary compatible with V4, yet
450 bx lr @ interoperable with Thumb ISA:-)
455 my @Sigma0=(28,34,39);
456 my @Sigma1=(14,18,41);
457 my @sigma0=(1, 8, 7);
458 my @sigma1=(19,61,6);
461 my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
463 my @X=map("d$_",(0..15));
464 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
468 my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
469 my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
471 $code.=<<___ if ($i<16 || $i&1);
472 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
474 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
476 vshr.u64 $t1,$e,#@Sigma1[1]
478 vadd.i64 $a,$Maj @ h+=Maj from the past
480 vshr.u64 $t2,$e,#@Sigma1[2]
483 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
484 vsli.64 $t0,$e,#`64-@Sigma1[0]`
485 vsli.64 $t1,$e,#`64-@Sigma1[1]`
487 vsli.64 $t2,$e,#`64-@Sigma1[2]`
488 #if $i<16 && defined(__ARMEL__)
489 vrev64.8 @X[$i],@X[$i]
492 vbsl $Ch,$f,$g @ Ch(e,f,g)
493 vshr.u64 $t0,$a,#@Sigma0[0]
494 veor $t2,$t1 @ Sigma1(e)
496 vshr.u64 $t1,$a,#@Sigma0[1]
497 vsli.64 $t0,$a,#`64-@Sigma0[0]`
499 vshr.u64 $t2,$a,#@Sigma0[2]
500 vadd.i64 $K,@X[$i%16]
501 vsli.64 $t1,$a,#`64-@Sigma0[1]`
503 vsli.64 $t2,$a,#`64-@Sigma0[2]`
506 vbsl $Maj,$c,$b @ Maj(a,b,c)
507 veor $h,$t2 @ Sigma0(a)
517 if ($i&1) { &NEON_00_15($i,@_); return; }
519 # 2x-vectorized, therefore runs every 2nd round
520 my @X=map("q$_",(0..7)); # view @X as 128-bit vector
521 my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
522 my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
523 my $e=@_[4]; # $e from NEON_00_15
526 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
527 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
528 vadd.i64 @_[0],d30 @ h+=Maj from the past
529 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
530 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
531 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
532 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
534 vshr.u64 $t0,$s0,#@sigma0[0]
535 veor $s1,$t1 @ sigma1(X[i+14])
536 vshr.u64 $t1,$s0,#@sigma0[1]
537 vadd.i64 @X[$i%8],$s1
538 vshr.u64 $s1,$s0,#@sigma0[2]
539 vsli.64 $t0,$s0,#`64-@sigma0[0]`
540 vsli.64 $t1,$s0,#`64-@sigma0[1]`
541 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
543 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
544 vadd.i64 @X[$i%8],$s0
545 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
546 veor $s1,$t1 @ sigma0(X[i+1])
547 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
548 vadd.i64 @X[$i%8],$s1
550 &NEON_00_15(2*$i,@_);
559 dmb @ errata #451034 on early Cortex A8
560 vstmdb sp!,{d8-d15} @ ABI specification says so
561 sub $Ktbl,r3,#672 @ K512
562 vldmia $ctx,{$A-$H} @ load context
565 for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
571 for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
575 vadd.i64 $A,d30 @ h+=Maj from the past
576 vldmia $ctx,{d24-d31} @ load context to temp
577 vadd.i64 q8,q12 @ vectorized accumulate
581 vstmia $ctx,{$A-$H} @ save context
583 sub $Ktbl,#640 @ rewind K512
586 vldmia sp!,{d8-d15} @ epilogue
592 .size sha512_block_data_order,.-sha512_block_data_order
593 .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
595 .comm OPENSSL_armcap_P,4,4
598 # Evaluate `...` constructs embedded in the assembly templates: each
598 # backquoted span is a Perl expression (typically offset arithmetic)
598 # computed at generation time and spliced into the output.
598 $code =~ s/\`([^\`]*)\`/eval $1/gem;
599 # Encode "bx lr" as its raw opcode so the output still assembles with
599 # -march=armv4 toolchains that reject the BX mnemonic.
599 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;
601 # STDOUT is a buffered write handle redirected to the output file;
601 # write errors may only surface at close, so a failed close is fatal.
601 close STDOUT or die "error closing STDOUT: $!"; # enforce flush