3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # SHA512 block procedure for ARMv4. September 2007.
12 # This code is ~4.5 (four and a half) times faster than code generated
13 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
14 # Xscale PXA250 core].
18 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
19 # Cortex A8 core and ~40 cycles per processed byte.
23 # Profiler-assisted and platform-specific optimization resulted in 7%
24 # improvement on Cortex A8 core and ~38 cycles per byte.
28 # Add NEON implementation. On Cortex A8 it was measured to process
29 # one byte in 23.3 cycles or ~60% faster than integer-only code.
33 # Improve NEON performance by 12% on Snapdragon S4. In absolute
34 # terms it's 22.6 cycles per byte, which is a disappointing result.
35 # Technical writers asserted that 3-way S4 pipeline can sustain
36 # multiple NEON instructions per cycle, but dual NEON issue could
37 # not be observed, and for NEON-only sequences IPC(*) was found to
38 # be limited by 1:-( 0.33 and 0.66 were measured for sequences with
39 # ILPs(*) of 1 and 2 respectively. This in turn means that you can
40 # even find yourself striving, as I did here, for achieving IPC
41 # adequate to one delivered by Cortex A8 [for reference, it's
42 # 0.5 for ILP of 1, and 1 for higher ILPs].
44 # (*) ILP, instruction-level parallelism, how many instructions
45 # *can* execute at the same time. IPC, instructions per cycle,
46 # indicates how many instructions actually execute.
48 # Byte order [in]dependence. =========================================
50 # Originally caller was expected to maintain specific *dword* order in
51 # h[0-7], namely with most significant dword at *lower* address, which
52 # was reflected in below two parameters as 0 and 4. Now caller is
53 # expected to maintain native byte order for whole 64-bit values.
56 # ====================================================================
58 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} # skip flag-like args; first "file.ext"-shaped arg is the output
59 open STDOUT, ">", $output or die "can't open $output: $!"; # 3-arg open avoids mode injection; die on failure instead of writing nowhere
61 $ctx="r0"; # parameter block
75 ############ r13 is stack pointer
77 ############ r15 is program counter
92 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
93 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
94 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
96 str $Tlo,[sp,#$Xoff+0]
98 str $Thi,[sp,#$Xoff+4]
99 eor $t0,$t0,$Ehi,lsl#18
100 ldr $t2,[sp,#$Hoff+0] @ h.lo
101 eor $t1,$t1,$Elo,lsl#18
102 ldr $t3,[sp,#$Hoff+4] @ h.hi
103 eor $t0,$t0,$Elo,lsr#18
104 eor $t1,$t1,$Ehi,lsr#18
105 eor $t0,$t0,$Ehi,lsl#14
106 eor $t1,$t1,$Elo,lsl#14
107 eor $t0,$t0,$Ehi,lsr#9
108 eor $t1,$t1,$Elo,lsr#9
109 eor $t0,$t0,$Elo,lsl#23
110 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
112 ldr $t0,[sp,#$Foff+0] @ f.lo
113 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
114 ldr $t1,[sp,#$Foff+4] @ f.hi
116 ldr $t2,[sp,#$Goff+0] @ g.lo
117 adc $Thi,$Thi,$t3 @ T += h
118 ldr $t3,[sp,#$Goff+4] @ g.hi
121 str $Elo,[sp,#$Eoff+0]
123 str $Ehi,[sp,#$Eoff+4]
125 str $Alo,[sp,#$Aoff+0]
127 str $Ahi,[sp,#$Aoff+4]
129 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
130 eor $t1,$t1,$t3 @ Ch(e,f,g)
131 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
134 ldr $Elo,[sp,#$Doff+0] @ d.lo
135 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
136 ldr $Ehi,[sp,#$Doff+4] @ d.hi
139 adc $Thi,$Thi,$t3 @ T += K[i]
141 ldr $t2,[sp,#$Boff+0] @ b.lo
142 adc $Ehi,$Ehi,$Thi @ d += T
145 ldr $t3,[sp,#$Coff+0] @ c.lo
147 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
148 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
149 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
152 eor $t0,$t0,$Ahi,lsl#4
153 eor $t1,$t1,$Alo,lsl#4
154 eor $t0,$t0,$Ahi,lsr#2
155 eor $t1,$t1,$Alo,lsr#2
156 eor $t0,$t0,$Alo,lsl#30
157 eor $t1,$t1,$Ahi,lsl#30
158 eor $t0,$t0,$Ahi,lsr#7
159 eor $t1,$t1,$Alo,lsr#7
160 eor $t0,$t0,$Alo,lsl#25
161 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
164 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
166 ldr $t1,[sp,#$Boff+4] @ b.hi
168 ldr $t2,[sp,#$Coff+4] @ c.hi
172 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
175 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
177 adc $Ahi,$Ahi,$Thi @ h += T
183 #include "arm_arch.h"
187 # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
191 # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
199 WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
200 WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
201 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
202 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
203 WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
204 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
205 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
206 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
207 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
208 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
209 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
210 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
211 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
212 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
213 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
214 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
215 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
216 WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
217 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
218 WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
219 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
220 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
221 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
222 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
223 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
224 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
225 WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
226 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
227 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
228 WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
229 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
230 WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
231 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
232 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
233 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
234 WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
235 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
236 WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
237 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
238 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
240 #if __ARM_MAX_ARCH__>=7
242 .word OPENSSL_armcap_P-sha512_block_data_order
248 .global sha512_block_data_order
249 .type sha512_block_data_order,%function
250 sha512_block_data_order:
251 sub r3,pc,#8 @ sha512_block_data_order
252 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
253 #if __ARM_MAX_ARCH__>=7
254 ldr r12,.LOPENSSL_armcap
255 ldr r12,[r3,r12] @ OPENSSL_armcap_P
259 stmdb sp!,{r4-r12,lr}
260 sub $Ktbl,r3,#672 @ K512
263 ldr $Elo,[$ctx,#$Eoff+$lo]
264 ldr $Ehi,[$ctx,#$Eoff+$hi]
265 ldr $t0, [$ctx,#$Goff+$lo]
266 ldr $t1, [$ctx,#$Goff+$hi]
267 ldr $t2, [$ctx,#$Hoff+$lo]
268 ldr $t3, [$ctx,#$Hoff+$hi]
270 str $t0, [sp,#$Goff+0]
271 str $t1, [sp,#$Goff+4]
272 str $t2, [sp,#$Hoff+0]
273 str $t3, [sp,#$Hoff+4]
274 ldr $Alo,[$ctx,#$Aoff+$lo]
275 ldr $Ahi,[$ctx,#$Aoff+$hi]
276 ldr $Tlo,[$ctx,#$Boff+$lo]
277 ldr $Thi,[$ctx,#$Boff+$hi]
278 ldr $t0, [$ctx,#$Coff+$lo]
279 ldr $t1, [$ctx,#$Coff+$hi]
280 ldr $t2, [$ctx,#$Doff+$lo]
281 ldr $t3, [$ctx,#$Doff+$hi]
282 str $Tlo,[sp,#$Boff+0]
283 str $Thi,[sp,#$Boff+4]
284 str $t0, [sp,#$Coff+0]
285 str $t1, [sp,#$Coff+4]
286 str $t2, [sp,#$Doff+0]
287 str $t3, [sp,#$Doff+4]
288 ldr $Tlo,[$ctx,#$Foff+$lo]
289 ldr $Thi,[$ctx,#$Foff+$hi]
290 str $Tlo,[sp,#$Foff+0]
291 str $Thi,[sp,#$Foff+4]
301 orr $Tlo,$Tlo,$t0,lsl#8
303 orr $Tlo,$Tlo,$t1,lsl#16
305 orr $Tlo,$Tlo,$t2,lsl#24
306 orr $Thi,$Thi,$t3,lsl#8
307 orr $Thi,$Thi,$t0,lsl#16
308 orr $Thi,$Thi,$t1,lsl#24
322 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
323 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
326 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
327 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
328 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
330 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
332 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
333 eor $Tlo,$Tlo,$t1,lsl#31
334 eor $Thi,$Thi,$t0,lsl#31
335 eor $Tlo,$Tlo,$t0,lsr#8
336 eor $Thi,$Thi,$t1,lsr#8
337 eor $Tlo,$Tlo,$t1,lsl#24
338 eor $Thi,$Thi,$t0,lsl#24
339 eor $Tlo,$Tlo,$t0,lsr#7
340 eor $Thi,$Thi,$t1,lsr#7
341 eor $Tlo,$Tlo,$t1,lsl#25
343 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
344 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
345 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
348 eor $t0,$t0,$t3,lsl#13
349 eor $t1,$t1,$t2,lsl#13
350 eor $t0,$t0,$t3,lsr#29
351 eor $t1,$t1,$t2,lsr#29
352 eor $t0,$t0,$t2,lsl#3
353 eor $t1,$t1,$t3,lsl#3
354 eor $t0,$t0,$t2,lsr#6
355 eor $t1,$t1,$t3,lsr#6
356 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
357 eor $t0,$t0,$t3,lsl#26
359 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
361 ldr $t0,[sp,#`$Xoff+8*16`+0]
364 ldr $t1,[sp,#`$Xoff+8*16`+4]
372 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
373 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
377 ldr $Tlo,[sp,#$Boff+0]
378 ldr $Thi,[sp,#$Boff+4]
379 ldr $t0, [$ctx,#$Aoff+$lo]
380 ldr $t1, [$ctx,#$Aoff+$hi]
381 ldr $t2, [$ctx,#$Boff+$lo]
382 ldr $t3, [$ctx,#$Boff+$hi]
384 str $t0, [$ctx,#$Aoff+$lo]
386 str $t1, [$ctx,#$Aoff+$hi]
388 str $t2, [$ctx,#$Boff+$lo]
390 str $t3, [$ctx,#$Boff+$hi]
392 ldr $Alo,[sp,#$Coff+0]
393 ldr $Ahi,[sp,#$Coff+4]
394 ldr $Tlo,[sp,#$Doff+0]
395 ldr $Thi,[sp,#$Doff+4]
396 ldr $t0, [$ctx,#$Coff+$lo]
397 ldr $t1, [$ctx,#$Coff+$hi]
398 ldr $t2, [$ctx,#$Doff+$lo]
399 ldr $t3, [$ctx,#$Doff+$hi]
401 str $t0, [$ctx,#$Coff+$lo]
403 str $t1, [$ctx,#$Coff+$hi]
405 str $t2, [$ctx,#$Doff+$lo]
407 str $t3, [$ctx,#$Doff+$hi]
409 ldr $Tlo,[sp,#$Foff+0]
410 ldr $Thi,[sp,#$Foff+4]
411 ldr $t0, [$ctx,#$Eoff+$lo]
412 ldr $t1, [$ctx,#$Eoff+$hi]
413 ldr $t2, [$ctx,#$Foff+$lo]
414 ldr $t3, [$ctx,#$Foff+$hi]
416 str $Elo,[$ctx,#$Eoff+$lo]
418 str $Ehi,[$ctx,#$Eoff+$hi]
420 str $t2, [$ctx,#$Foff+$lo]
422 str $t3, [$ctx,#$Foff+$hi]
424 ldr $Alo,[sp,#$Goff+0]
425 ldr $Ahi,[sp,#$Goff+4]
426 ldr $Tlo,[sp,#$Hoff+0]
427 ldr $Thi,[sp,#$Hoff+4]
428 ldr $t0, [$ctx,#$Goff+$lo]
429 ldr $t1, [$ctx,#$Goff+$hi]
430 ldr $t2, [$ctx,#$Hoff+$lo]
431 ldr $t3, [$ctx,#$Hoff+$hi]
433 str $t0, [$ctx,#$Goff+$lo]
435 str $t1, [$ctx,#$Goff+$hi]
437 str $t2, [$ctx,#$Hoff+$lo]
439 str $t3, [$ctx,#$Hoff+$hi]
447 add sp,sp,#8*9 @ destroy frame
449 ldmia sp!,{r4-r12,pc}
451 ldmia sp!,{r4-r12,lr}
453 moveq pc,lr @ be binary compatible with V4, yet
454 bx lr @ interoperable with Thumb ISA:-)
459 my @Sigma0=(28,34,39);	# SHA-512 Sigma0 rotate amounts (ROTR 28,34,39)
460 my @Sigma1=(14,18,41);	# SHA-512 Sigma1 rotate amounts (ROTR 14,18,41)
461 my @sigma0=(1, 8, 7);	# sigma0: two rotates (1,8) and a plain shift (7)
462 my @sigma1=(19,61,6);	# sigma1: two rotates (19,61) and a plain shift (6)
465 my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
467 my @X=map("d$_",(0..15));	# message schedule X[0..15] kept in NEON d0-d15
468 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));	# working vars a..h in d16-d23
472 my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
473 my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
475 $code.=<<___ if ($i<16 || $i&1);
476 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
478 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
480 vshr.u64 $t1,$e,#@Sigma1[1]
482 vadd.i64 $a,$Maj @ h+=Maj from the past
484 vshr.u64 $t2,$e,#@Sigma1[2]
487 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
488 vsli.64 $t0,$e,#`64-@Sigma1[0]`
489 vsli.64 $t1,$e,#`64-@Sigma1[1]`
491 vsli.64 $t2,$e,#`64-@Sigma1[2]`
492 #if $i<16 && defined(__ARMEL__)
493 vrev64.8 @X[$i],@X[$i]
496 vbsl $Ch,$f,$g @ Ch(e,f,g)
497 vshr.u64 $t0,$a,#@Sigma0[0]
498 veor $t2,$t1 @ Sigma1(e)
500 vshr.u64 $t1,$a,#@Sigma0[1]
501 vsli.64 $t0,$a,#`64-@Sigma0[0]`
503 vshr.u64 $t2,$a,#@Sigma0[2]
504 vadd.i64 $K,@X[$i%16]
505 vsli.64 $t1,$a,#`64-@Sigma0[1]`
507 vsli.64 $t2,$a,#`64-@Sigma0[2]`
510 vbsl $Maj,$c,$b @ Maj(a,b,c)
511 veor $h,$t2 @ Sigma0(a)
521 if ($i&1) { &NEON_00_15($i,@_); return; }
523 # 2x-vectorized, therefore runs every 2nd round
524 my @X=map("q$_",(0..7)); # view @X as 128-bit vector
525 my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
526 my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
527 my $e=@_[4]; # $e from NEON_00_15
530 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
531 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
532 vadd.i64 @_[0],d30 @ h+=Maj from the past
533 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
534 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
535 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
536 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
538 vshr.u64 $t0,$s0,#@sigma0[0]
539 veor $s1,$t1 @ sigma1(X[i+14])
540 vshr.u64 $t1,$s0,#@sigma0[1]
541 vadd.i64 @X[$i%8],$s1
542 vshr.u64 $s1,$s0,#@sigma0[2]
543 vsli.64 $t0,$s0,#`64-@sigma0[0]`
544 vsli.64 $t1,$s0,#`64-@sigma0[1]`
545 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
547 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
548 vadd.i64 @X[$i%8],$s0
549 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
550 veor $s1,$t1 @ sigma0(X[i+1])
551 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
552 vadd.i64 @X[$i%8],$s1
554 &NEON_00_15(2*$i,@_);
558 #if __ARM_MAX_ARCH__>=7
564 dmb @ errata #451034 on early Cortex A8
565 vstmdb sp!,{d8-d15} @ ABI specification says so
566 sub $Ktbl,r3,#672 @ K512
567 vldmia $ctx,{$A-$H} @ load context
570 for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }	# rounds 0..15: emit one round, then rotate the a..h register map
576 for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }	# schedule-updating rounds; NEON_16_79 is 2x-vectorized (emits 2*$i round) — presumably looped in the asm for rounds beyond 31
580 vadd.i64 $A,d30 @ h+=Maj from the past
581 vldmia $ctx,{d24-d31} @ load context to temp
582 vadd.i64 q8,q12 @ vectorized accumulate
586 vstmia $ctx,{$A-$H} @ save context
588 sub $Ktbl,#640 @ rewind K512
591 vldmia sp!,{d8-d15} @ epilogue
597 .size sha512_block_data_order,.-sha512_block_data_order
598 .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
600 #if __ARM_MAX_ARCH__>=7
601 .comm OPENSSL_armcap_P,4,4
605 $code =~ s/\`([^\`]*)\`/eval $1/gem; # expand `...` spans by evaluating them as Perl (constant folding)
606 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
607 $code =~ s/\bret\b/bx lr/gm; # "ret" mnemonic is not available on these targets
609 close STDOUT or die "error closing STDOUT: $!"; # enforce flush; checked so buffered write errors are not silently lost