crypto/aes/asm/aesv8-armx.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # This module implements support for ARMv8 AES instructions. The
  11 # module is endian-agnostic in sense that it supports both big- and
  12 # little-endian cases. As does it support both 32- and 64-bit modes
  13 # of operation. Latter is achieved by limiting amount of utilized
  14 # registers to 16, which implies additional NEON load and integer
  15 # instructions. This has no effect on mighty Apple A7, where results
  16 # are literally equal to the theoretical estimates based on AES
  17 # instruction latencies and issue rates. On Cortex-A53, an in-order
  18 # execution core, this costs up to 10-15%, which is partially
  19 # compensated by implementing dedicated code path for 128-bit
  20 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
  21 # seems to be limited by sheer amount of NEON instructions...
  22 #
  23 # Performance in cycles per byte processed with 128-bit key:
  24 #
  25 #               CBC enc         CBC dec         CTR
  26 # Apple A7      2.39            1.20            1.20
  27 # Cortex-A53    2.45            1.87            1.94
  28 # Cortex-A57    3.64            1.34            1.32
  29
  30 $flavour = shift;
  31 open STDOUT,">".shift;
  32
  33 $prefix="aes_v8";
  34
  35 $code=<<___;
  36 #include "arm_arch.h"
  37
  38 #if __ARM_ARCH__>=7
  39 .text
  40 ___
  41 $code.=".arch   armv8-a+crypto\n"       if ($flavour =~ /64/);
  42 $code.=".fpu    neon\n.code     32\n"   if ($flavour !~ /64/);
  43
  44 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
  45 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
  46 # maintain both 32- and 64-bit codes within single module and
  47 # transliterate common code to either flavour with regex vodoo.
  48 #
  49 {{{
  50 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
  51 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
  52         $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
  53
  54
  55 $code.=<<___;
  56 .align  5
  57 rcon:
  58 .long   0x01,0x01,0x01,0x01
  59 .long   0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d     // rotate-n-splat
  60 .long   0x1b,0x1b,0x1b,0x1b
  61
  62 .globl  ${prefix}_set_encrypt_key
  63 .type   ${prefix}_set_encrypt_key,%function
  64 .align  5
  65 ${prefix}_set_encrypt_key:
  66 .Lenc_key:
  67 ___
  68 $code.=<<___    if ($flavour =~ /64/);
  69         stp     x29,x30,[sp,#-16]!
  70         add     x29,sp,#0
  71 ___
  72 $code.=<<___;
  73         mov     $ptr,#-1
  74         cmp     $inp,#0
  75         b.eq    .Lenc_key_abort
  76         cmp     $out,#0
  77         b.eq    .Lenc_key_abort
  78         mov     $ptr,#-2
  79         cmp     $bits,#128
  80         b.lt    .Lenc_key_abort
  81         cmp     $bits,#256
  82         b.gt    .Lenc_key_abort
  83         tst     $bits,#0x3f
  84         b.ne    .Lenc_key_abort
  85
  86         adr     $ptr,rcon
  87         cmp     $bits,#192
  88
  89         veor    $zero,$zero,$zero
  90         vld1.8  {$in0},[$inp],#16
  91         mov     $bits,#8                // reuse $bits
  92         vld1.32 {$rcon,$mask},[$ptr],#32
  93
  94         b.lt    .Loop128
  95         b.eq    .L192
  96         b       .L256
  97
  98 .align  4
  99 .Loop128:
 100         vtbl.8  $key,{$in0},$mask
 101         vext.8  $tmp,$zero,$in0,#12
 102         vst1.32 {$in0},[$out],#16
 103         aese    $key,$zero
 104         subs    $bits,$bits,#1
 105
 106         veor    $in0,$in0,$tmp
 107         vext.8  $tmp,$zero,$tmp,#12
 108         veor    $in0,$in0,$tmp
 109         vext.8  $tmp,$zero,$tmp,#12
 110          veor   $key,$key,$rcon
 111         veor    $in0,$in0,$tmp
 112         vshl.u8 $rcon,$rcon,#1
 113         veor    $in0,$in0,$key
 114         b.ne    .Loop128
 115
 116         vld1.32 {$rcon},[$ptr]
 117
 118         vtbl.8  $key,{$in0},$mask
 119         vext.8  $tmp,$zero,$in0,#12
 120         vst1.32 {$in0},[$out],#16
 121         aese    $key,$zero
 122
 123         veor    $in0,$in0,$tmp
 124         vext.8  $tmp,$zero,$tmp,#12
 125         veor    $in0,$in0,$tmp
 126         vext.8  $tmp,$zero,$tmp,#12
 127          veor   $key,$key,$rcon
 128         veor    $in0,$in0,$tmp
 129         vshl.u8 $rcon,$rcon,#1
 130         veor    $in0,$in0,$key
 131
 132         vtbl.8  $key,{$in0},$mask
 133         vext.8  $tmp,$zero,$in0,#12
 134         vst1.32 {$in0},[$out],#16
 135         aese    $key,$zero
 136
 137         veor    $in0,$in0,$tmp
 138         vext.8  $tmp,$zero,$tmp,#12
 139         veor    $in0,$in0,$tmp
 140         vext.8  $tmp,$zero,$tmp,#12
 141          veor   $key,$key,$rcon
 142         veor    $in0,$in0,$tmp
 143         veor    $in0,$in0,$key
 144         vst1.32 {$in0},[$out]
 145         add     $out,$out,#0x50
 146
 147         mov     $rounds,#10
 148         b       .Ldone
 149
 150 .align  4
 151 .L192:
 152         vld1.8  {$in1},[$inp],#8
 153         vmov.i8 $key,#8                 // borrow $key
 154         vst1.32 {$in0},[$out],#16
 155         vsub.i8 $mask,$mask,$key        // adjust the mask
 156
 157 .Loop192:
 158         vtbl.8  $key,{$in1},$mask
 159         vext.8  $tmp,$zero,$in0,#12
 160         vst1.32 {$in1},[$out],#8
 161         aese    $key,$zero
 162         subs    $bits,$bits,#1
 163
 164         veor    $in0,$in0,$tmp
 165         vext.8  $tmp,$zero,$tmp,#12
 166         veor    $in0,$in0,$tmp
 167         vext.8  $tmp,$zero,$tmp,#12
 168         veor    $in0,$in0,$tmp
 169
 170         vdup.32 $tmp,${in0}[3]
 171         veor    $tmp,$tmp,$in1
 172          veor   $key,$key,$rcon
 173         vext.8  $in1,$zero,$in1,#12
 174         vshl.u8 $rcon,$rcon,#1
 175         veor    $in1,$in1,$tmp
 176         veor    $in0,$in0,$key
 177         veor    $in1,$in1,$key
 178         vst1.32 {$in0},[$out],#16
 179         b.ne    .Loop192
 180
 181         mov     $rounds,#12
 182         add     $out,$out,#0x20
 183         b       .Ldone
 184
 185 .align  4
 186 .L256:
 187         vld1.8  {$in1},[$inp]
 188         mov     $bits,#7
 189         mov     $rounds,#14
 190         vst1.32 {$in0},[$out],#16
 191
 192 .Loop256:
 193         vtbl.8  $key,{$in1},$mask
 194         vext.8  $tmp,$zero,$in0,#12
 195         vst1.32 {$in1},[$out],#16
 196         aese    $key,$zero
 197         subs    $bits,$bits,#1
 198
 199         veor    $in0,$in0,$tmp
 200         vext.8  $tmp,$zero,$tmp,#12
 201         veor    $in0,$in0,$tmp
 202         vext.8  $tmp,$zero,$tmp,#12
 203          veor   $key,$key,$rcon
 204         veor    $in0,$in0,$tmp
 205         vshl.u8 $rcon,$rcon,#1
 206         veor    $in0,$in0,$key
 207         vst1.32 {$in0},[$out],#16
 208         b.eq    .Ldone
 209
 210         vdup.32 $key,${in0}[3]          // just splat
 211         vext.8  $tmp,$zero,$in1,#12
 212         aese    $key,$zero
 213
 214         veor    $in1,$in1,$tmp
 215         vext.8  $tmp,$zero,$tmp,#12
 216         veor    $in1,$in1,$tmp
 217         vext.8  $tmp,$zero,$tmp,#12
 218         veor    $in1,$in1,$tmp
 219
 220         veor    $in1,$in1,$key
 221         b       .Loop256
 222
 223 .Ldone:
 224         str     $rounds,[$out]
 225         mov     $ptr,#0
 226
 227 .Lenc_key_abort:
 228         mov     x0,$ptr                 // return value
 229         `"ldr   x29,[sp],#16"           if ($flavour =~ /64/)`
 230         ret
 231 .size   ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
 232
 233 .globl  ${prefix}_set_decrypt_key
 234 .type   ${prefix}_set_decrypt_key,%function
 235 .align  5
 236 ${prefix}_set_decrypt_key:
 237 ___
 238 $code.=<<___    if ($flavour =~ /64/);
 239         stp     x29,x30,[sp,#-16]!
 240         add     x29,sp,#0
 241 ___
 242 $code.=<<___    if ($flavour !~ /64/);
 243         stmdb   sp!,{r4,lr}
 244 ___
 245 $code.=<<___;
 246         bl      .Lenc_key
 247
 248         cmp     x0,#0
 249         b.ne    .Ldec_key_abort
 250
 251         sub     $out,$out,#240          // restore original $out
 252         mov     x4,#-16
 253         add     $inp,$out,x12,lsl#4     // end of key schedule
 254
 255         vld1.32 {v0.16b},[$out]
 256         vld1.32 {v1.16b},[$inp]
 257         vst1.32 {v0.16b},[$inp],x4
 258         vst1.32 {v1.16b},[$out],#16
 259
 260 .Loop_imc:
 261         vld1.32 {v0.16b},[$out]
 262         vld1.32 {v1.16b},[$inp]
 263         aesimc  v0.16b,v0.16b
 264         aesimc  v1.16b,v1.16b
 265         vst1.32 {v0.16b},[$inp],x4
 266         vst1.32 {v1.16b},[$out],#16
 267         cmp     $inp,$out
 268         b.hi    .Loop_imc
 269
 270         vld1.32 {v0.16b},[$out]
 271         aesimc  v0.16b,v0.16b
 272         vst1.32 {v0.16b},[$inp]
 273
 274         eor     x0,x0,x0                // return value
 275 .Ldec_key_abort:
 276 ___
 277 $code.=<<___    if ($flavour !~ /64/);
 278         ldmia   sp!,{r4,pc}
 279 ___
 280 $code.=<<___    if ($flavour =~ /64/);
 281         ldp     x29,x30,[sp],#16
 282         ret
 283 ___
 284 $code.=<<___;
 285 .size   ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
 286 ___
 287 }}}
 288 {{{
 289 sub gen_block () {
 290 my $dir = shift;
 291 my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
 292 my ($inp,$out,$key)=map("x$_",(0..2));
 293 my $rounds="w3";
 294 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
 295
 296 $code.=<<___;
 297 .globl  ${prefix}_${dir}crypt
 298 .type   ${prefix}_${dir}crypt,%function
 299 .align  5
 300 ${prefix}_${dir}crypt:
 301         ldr     $rounds,[$key,#240]
 302         vld1.32 {$rndkey0},[$key],#16
 303         vld1.8  {$inout},[$inp]
 304         sub     $rounds,$rounds,#2
 305         vld1.32 {$rndkey1},[$key],#16
 306
 307 .Loop_${dir}c:
 308         aes$e   $inout,$rndkey0
 309         vld1.32 {$rndkey0},[$key],#16
 310         aes$mc  $inout,$inout
 311         subs    $rounds,$rounds,#2
 312         aes$e   $inout,$rndkey1
 313         vld1.32 {$rndkey1},[$key],#16
 314         aes$mc  $inout,$inout
 315         b.gt    .Loop_${dir}c
 316
 317         aes$e   $inout,$rndkey0
 318         vld1.32 {$rndkey0},[$key]
 319         aes$mc  $inout,$inout
 320         aes$e   $inout,$rndkey1
 321         veor    $inout,$inout,$rndkey0
 322
 323         vst1.8  {$inout},[$out]
 324         ret
 325 .size   ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
 326 ___
 327 }
 328 &gen_block("en");
 329 &gen_block("de");
 330 }}}
 331 {{{
 332 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
 333 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
 334 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
 335
 336 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
 337
 338 ### q8-q15      preloaded key schedule
 339
 340 $code.=<<___;
 341 .globl  ${prefix}_cbc_encrypt
 342 .type   ${prefix}_cbc_encrypt,%function
 343 .align  5
 344 ${prefix}_cbc_encrypt:
 345 ___
 346 $code.=<<___    if ($flavour =~ /64/);
 347         stp     x29,x30,[sp,#-16]!
 348         add     x29,sp,#0
 349 ___
 350 $code.=<<___    if ($flavour !~ /64/);
 351         mov     ip,sp
 352         stmdb   sp!,{r4-r8,lr}
 353         vstmdb  sp!,{d8-d15}            @ ABI specification says so
 354         ldmia   ip,{r4-r5}              @ load remaining args
 355 ___
 356 $code.=<<___;
 357         subs    $len,$len,#16
 358         mov     $step,#16
 359         b.lo    .Lcbc_abort
 360         cclr    $step,eq
 361
 362         cmp     $enc,#0                 // en- or decrypting?
 363         ldr     $rounds,[$key,#240]
 364         and     $len,$len,#-16
 365         vld1.8  {$ivec},[$ivp]
 366         vld1.8  {$dat},[$inp],$step
 367
 368         vld1.32 {q8-q9},[$key]          // load key schedule...
 369         sub     $rounds,$rounds,#6
 370         add     $key_,$key,x5,lsl#4     // pointer to last 7 round keys
 371         sub     $rounds,$rounds,#2
 372         vld1.32 {q10-q11},[$key_],#32
 373         vld1.32 {q12-q13},[$key_],#32
 374         vld1.32 {q14-q15},[$key_],#32
 375         vld1.32 {$rndlast},[$key_]
 376
 377         add     $key_,$key,#32
 378         mov     $cnt,$rounds
 379         b.eq    .Lcbc_dec
 380
 381         cmp     $rounds,#2
 382         veor    $dat,$dat,$ivec
 383         veor    $rndzero_n_last,q8,$rndlast
 384         b.eq    .Lcbc_enc128
 385
 386 .Loop_cbc_enc:
 387         aese    $dat,q8
 388         vld1.32 {q8},[$key_],#16
 389         aesmc   $dat,$dat
 390         subs    $cnt,$cnt,#2
 391         aese    $dat,q9
 392         vld1.32 {q9},[$key_],#16
 393         aesmc   $dat,$dat
 394         b.gt    .Loop_cbc_enc
 395
 396         aese    $dat,q8
 397         aesmc   $dat,$dat
 398          subs   $len,$len,#16
 399         aese    $dat,q9
 400         aesmc   $dat,$dat
 401          cclr   $step,eq
 402         aese    $dat,q10
 403         aesmc   $dat,$dat
 404          add    $key_,$key,#16
 405         aese    $dat,q11
 406         aesmc   $dat,$dat
 407          vld1.8 {q8},[$inp],$step
 408         aese    $dat,q12
 409         aesmc   $dat,$dat
 410          veor   q8,q8,$rndzero_n_last
 411         aese    $dat,q13
 412         aesmc   $dat,$dat
 413          vld1.32 {q9},[$key_],#16       // re-pre-load rndkey[1]
 414         aese    $dat,q14
 415         aesmc   $dat,$dat
 416         aese    $dat,q15
 417
 418          mov    $cnt,$rounds
 419         veor    $ivec,$dat,$rndlast
 420         vst1.8  {$ivec},[$out],#16
 421         b.hs    .Loop_cbc_enc
 422
 423         b       .Lcbc_done
 424
 425 .align  5
 426 .Lcbc_enc128:
 427         vld1.32 {$in0-$in1},[$key_]
 428         aese    $dat,q8
 429         aesmc   $dat,$dat
 430         b       .Lenter_cbc_enc128
 431 .Loop_cbc_enc128:
 432         aese    $dat,q8
 433         aesmc   $dat,$dat
 434          vst1.8 {$ivec},[$out],#16
 435 .Lenter_cbc_enc128:
 436         aese    $dat,q9
 437         aesmc   $dat,$dat
 438          subs   $len,$len,#16
 439         aese    $dat,$in0
 440         aesmc   $dat,$dat
 441          cclr   $step,eq
 442         aese    $dat,$in1
 443         aesmc   $dat,$dat
 444         aese    $dat,q10
 445         aesmc   $dat,$dat
 446         aese    $dat,q11
 447         aesmc   $dat,$dat
 448          vld1.8 {q8},[$inp],$step
 449         aese    $dat,q12
 450         aesmc   $dat,$dat
 451         aese    $dat,q13
 452         aesmc   $dat,$dat
 453         aese    $dat,q14
 454         aesmc   $dat,$dat
 455          veor   q8,q8,$rndzero_n_last
 456         aese    $dat,q15
 457         veor    $ivec,$dat,$rndlast
 458         b.hs    .Loop_cbc_enc128
 459
 460         vst1.8  {$ivec},[$out],#16
 461         b       .Lcbc_done
 462 ___
 463 {
 464 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
 465 $code.=<<___;
 466 .align  5
 467 .Lcbc_dec:
 468         vld1.8  {$dat2},[$inp],#16
 469         subs    $len,$len,#32           // bias
 470         add     $cnt,$rounds,#2
 471         vorr    $in1,$dat,$dat
 472         vorr    $dat1,$dat,$dat
 473         vorr    $in2,$dat2,$dat2
 474         b.lo    .Lcbc_dec_tail
 475
 476         vorr    $dat1,$dat2,$dat2
 477         vld1.8  {$dat2},[$inp],#16
 478         vorr    $in0,$dat,$dat
 479         vorr    $in1,$dat1,$dat1
 480         vorr    $in2,$dat2,$dat2
 481
 482 .Loop3x_cbc_dec:
 483         aesd    $dat0,q8
 484         aesd    $dat1,q8
 485         aesd    $dat2,q8
 486         vld1.32 {q8},[$key_],#16
 487         aesimc  $dat0,$dat0
 488         aesimc  $dat1,$dat1
 489         aesimc  $dat2,$dat2
 490         subs    $cnt,$cnt,#2
 491         aesd    $dat0,q9
 492         aesd    $dat1,q9
 493         aesd    $dat2,q9
 494         vld1.32 {q9},[$key_],#16
 495         aesimc  $dat0,$dat0
 496         aesimc  $dat1,$dat1
 497         aesimc  $dat2,$dat2
 498         b.gt    .Loop3x_cbc_dec
 499
 500         aesd    $dat0,q8
 501         aesd    $dat1,q8
 502         aesd    $dat2,q8
 503          veor   $tmp0,$ivec,$rndlast
 504         aesimc  $dat0,$dat0
 505         aesimc  $dat1,$dat1
 506         aesimc  $dat2,$dat2
 507          veor   $tmp1,$in0,$rndlast
 508         aesd    $dat0,q9
 509         aesd    $dat1,q9
 510         aesd    $dat2,q9
 511          veor   $tmp2,$in1,$rndlast
 512          subs   $len,$len,#0x30
 513         aesimc  $dat0,$dat0
 514         aesimc  $dat1,$dat1
 515         aesimc  $dat2,$dat2
 516          vorr   $ivec,$in2,$in2
 517          mov.lo x6,$len                 // x6, $cnt, is zero at this point
 518         aesd    $dat0,q12
 519         aesd    $dat1,q12
 520         aesd    $dat2,q12
 521          add    $inp,$inp,x6            // $inp is adjusted in such way that
 522                                         // at exit from the loop $dat1-$dat2
 523                                         // are loaded with last "words"
 524         aesimc  $dat0,$dat0
 525         aesimc  $dat1,$dat1
 526         aesimc  $dat2,$dat2
 527          mov    $key_,$key
 528         aesd    $dat0,q13
 529         aesd    $dat1,q13
 530         aesd    $dat2,q13
 531          vld1.8 {$in0},[$inp],#16
 532         aesimc  $dat0,$dat0
 533         aesimc  $dat1,$dat1
 534         aesimc  $dat2,$dat2
 535          vld1.8 {$in1},[$inp],#16
 536         aesd    $dat0,q14
 537         aesd    $dat1,q14
 538         aesd    $dat2,q14
 539          vld1.8 {$in2},[$inp],#16
 540         aesimc  $dat0,$dat0
 541         aesimc  $dat1,$dat1
 542         aesimc  $dat2,$dat2
 543          vld1.32 {q8},[$key_],#16       // re-pre-load rndkey[0]
 544         aesd    $dat0,q15
 545         aesd    $dat1,q15
 546         aesd    $dat2,q15
 547
 548          add    $cnt,$rounds,#2
 549         veor    $tmp0,$tmp0,$dat0
 550         veor    $tmp1,$tmp1,$dat1
 551         veor    $dat2,$dat2,$tmp2
 552          vld1.32 {q9},[$key_],#16       // re-pre-load rndkey[1]
 553          vorr   $dat0,$in0,$in0
 554         vst1.8  {$tmp0},[$out],#16
 555          vorr   $dat1,$in1,$in1
 556         vst1.8  {$tmp1},[$out],#16
 557         vst1.8  {$dat2},[$out],#16
 558          vorr   $dat2,$in2,$in2
 559         b.hs    .Loop3x_cbc_dec
 560
 561         cmn     $len,#0x30
 562         b.eq    .Lcbc_done
 563         nop
 564
 565 .Lcbc_dec_tail:
 566         aesd    $dat1,q8
 567         aesd    $dat2,q8
 568         vld1.32 {q8},[$key_],#16
 569         aesimc  $dat1,$dat1
 570         aesimc  $dat2,$dat2
 571         subs    $cnt,$cnt,#2
 572         aesd    $dat1,q9
 573         aesd    $dat2,q9
 574         vld1.32 {q9},[$key_],#16
 575         aesimc  $dat1,$dat1
 576         aesimc  $dat2,$dat2
 577         b.gt    .Lcbc_dec_tail
 578
 579         aesd    $dat1,q8
 580         aesd    $dat2,q8
 581         aesimc  $dat1,$dat1
 582         aesimc  $dat2,$dat2
 583         aesd    $dat1,q9
 584         aesd    $dat2,q9
 585         aesimc  $dat1,$dat1
 586         aesimc  $dat2,$dat2
 587         aesd    $dat1,q12
 588         aesd    $dat2,q12
 589         aesimc  $dat1,$dat1
 590         aesimc  $dat2,$dat2
 591          cmn    $len,#0x20
 592         aesd    $dat1,q13
 593         aesd    $dat2,q13
 594         aesimc  $dat1,$dat1
 595         aesimc  $dat2,$dat2
 596          veor   $tmp1,$ivec,$rndlast
 597         aesd    $dat1,q14
 598         aesd    $dat2,q14
 599         aesimc  $dat1,$dat1
 600         aesimc  $dat2,$dat2
 601          veor   $tmp2,$in1,$rndlast
 602         aesd    $dat1,q15
 603         aesd    $dat2,q15
 604         b.eq    .Lcbc_dec_one
 605         veor    $tmp1,$tmp1,$dat1
 606         veor    $tmp2,$tmp2,$dat2
 607          vorr   $ivec,$in2,$in2
 608         vst1.8  {$tmp1},[$out],#16
 609         vst1.8  {$tmp2},[$out],#16
 610         b       .Lcbc_done
 611
 612 .Lcbc_dec_one:
 613         veor    $tmp1,$tmp1,$dat2
 614          vorr   $ivec,$in2,$in2
 615         vst1.8  {$tmp1},[$out],#16
 616
 617 .Lcbc_done:
 618         vst1.8  {$ivec},[$ivp]
 619 .Lcbc_abort:
 620 ___
 621 }
 622 $code.=<<___    if ($flavour !~ /64/);
 623         vldmia  sp!,{d8-d15}
 624         ldmia   sp!,{r4-r8,pc}
 625 ___
 626 $code.=<<___    if ($flavour =~ /64/);
 627         ldr     x29,[sp],#16
 628         ret
 629 ___
 630 $code.=<<___;
 631 .size   ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
 632 ___
 633 }}}
 634 {{{
 635 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
 636 my ($rounds,$cnt,$key_)=("w5","w6","x7");
 637 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
 638 my $step="x12";         # aliases with $tctr2
 639
 640 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
 641 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
 642
 643 my ($dat,$tmp)=($dat0,$tmp0);
 644
 645 ### q8-q15      preloaded key schedule
 646
 647 $code.=<<___;
 648 .globl  ${prefix}_ctr32_encrypt_blocks
 649 .type   ${prefix}_ctr32_encrypt_blocks,%function
 650 .align  5
 651 ${prefix}_ctr32_encrypt_blocks:
 652 ___
 653 $code.=<<___    if ($flavour =~ /64/);
 654         stp             x29,x30,[sp,#-16]!
 655         add             x29,sp,#0
 656 ___
 657 $code.=<<___    if ($flavour !~ /64/);
 658         mov             ip,sp
 659         stmdb           sp!,{r4-r10,lr}
 660         vstmdb          sp!,{d8-d15}            @ ABI specification says so
 661         ldr             r4, [ip]                @ load remaining arg
 662 ___
 663 $code.=<<___;
 664         ldr             $rounds,[$key,#240]
 665
 666         ldr             $ctr, [$ivp, #12]
 667         vld1.32         {$dat0},[$ivp]
 668
 669         vld1.32         {q8-q9},[$key]          // load key schedule...
 670         sub             $rounds,$rounds,#4
 671         mov             $step,#16
 672         cmp             $len,#2
 673         add             $key_,$key,x5,lsl#4     // pointer to last 5 round keys
 674         sub             $rounds,$rounds,#2
 675         vld1.32         {q12-q13},[$key_],#32
 676         vld1.32         {q14-q15},[$key_],#32
 677         vld1.32         {$rndlast},[$key_]
 678         add             $key_,$key,#32
 679         mov             $cnt,$rounds
 680         cclr            $step,lo
 681 #ifndef __ARMEB__
 682         rev             $ctr, $ctr
 683 #endif
 684         vorr            $dat1,$dat0,$dat0
 685         add             $tctr1, $ctr, #1
 686         vorr            $dat2,$dat0,$dat0
 687         add             $ctr, $ctr, #2
 688         vorr            $ivec,$dat0,$dat0
 689         rev             $tctr1, $tctr1
 690         vmov.32         ${dat1}[3],$tctr1
 691         b.ls            .Lctr32_tail
 692         rev             $tctr2, $ctr
 693         sub             $len,$len,#3            // bias
 694         vmov.32         ${dat2}[3],$tctr2
 695         b               .Loop3x_ctr32
 696
 697 .align  4
 698 .Loop3x_ctr32:
 699         aese            $dat0,q8
 700         aese            $dat1,q8
 701         aese            $dat2,q8
 702         vld1.32         {q8},[$key_],#16
 703         aesmc           $dat0,$dat0
 704         aesmc           $dat1,$dat1
 705         aesmc           $dat2,$dat2
 706         subs            $cnt,$cnt,#2
 707         aese            $dat0,q9
 708         aese            $dat1,q9
 709         aese            $dat2,q9
 710         vld1.32         {q9},[$key_],#16
 711         aesmc           $dat0,$dat0
 712         aesmc           $dat1,$dat1
 713         aesmc           $dat2,$dat2
 714         b.gt            .Loop3x_ctr32
 715
 716         aese            $dat0,q8
 717         aese            $dat1,q8
 718         aese            $dat2,q8
 719          mov            $key_,$key
 720         aesmc           $tmp0,$dat0
 721          vld1.8         {$in0},[$inp],#16
 722         aesmc           $tmp1,$dat1
 723         aesmc           $dat2,$dat2
 724          vorr           $dat0,$ivec,$ivec
 725         aese            $tmp0,q9
 726          vld1.8         {$in1},[$inp],#16
 727         aese            $tmp1,q9
 728         aese            $dat2,q9
 729          vorr           $dat1,$ivec,$ivec
 730         aesmc           $tmp0,$tmp0
 731          vld1.8         {$in2},[$inp],#16
 732         aesmc           $tmp1,$tmp1
 733         aesmc           $tmp2,$dat2
 734          vorr           $dat2,$ivec,$ivec
 735          add            $tctr0,$ctr,#1
 736         aese            $tmp0,q12
 737         aese            $tmp1,q12
 738         aese            $tmp2,q12
 739          veor           $in0,$in0,$rndlast
 740          add            $tctr1,$ctr,#2
 741         aesmc           $tmp0,$tmp0
 742         aesmc           $tmp1,$tmp1
 743         aesmc           $tmp2,$tmp2
 744          veor           $in1,$in1,$rndlast
 745          add            $ctr,$ctr,#3
 746         aese            $tmp0,q13
 747         aese            $tmp1,q13
 748         aese            $tmp2,q13
 749          veor           $in2,$in2,$rndlast
 750          rev            $tctr0,$tctr0
 751         aesmc           $tmp0,$tmp0
 752          vld1.32         {q8},[$key_],#16       // re-pre-load rndkey[0]
 753         aesmc           $tmp1,$tmp1
 754         aesmc           $tmp2,$tmp2
 755          vmov.32        ${dat0}[3], $tctr0
 756          rev            $tctr1,$tctr1
 757         aese            $tmp0,q14
 758         aese            $tmp1,q14
 759         aese            $tmp2,q14
 760          vmov.32        ${dat1}[3], $tctr1
 761          rev            $tctr2,$ctr
 762         aesmc           $tmp0,$tmp0
 763         aesmc           $tmp1,$tmp1
 764         aesmc           $tmp2,$tmp2
 765          vmov.32        ${dat2}[3], $tctr2
 766          subs           $len,$len,#3
 767         aese            $tmp0,q15
 768         aese            $tmp1,q15
 769         aese            $tmp2,q15
 770
 771          mov            $cnt,$rounds
 772         veor            $in0,$in0,$tmp0
 773         veor            $in1,$in1,$tmp1
 774         veor            $in2,$in2,$tmp2
 775          vld1.32         {q9},[$key_],#16       // re-pre-load rndkey[1]
 776         vst1.8          {$in0},[$out],#16
 777         vst1.8          {$in1},[$out],#16
 778         vst1.8          {$in2},[$out],#16
 779         b.hs            .Loop3x_ctr32
 780
 781         adds            $len,$len,#3
 782         b.eq            .Lctr32_done
 783         cmp             $len,#1
 784         mov             $step,#16
 785         cclr            $step,eq
 786
 787 .Lctr32_tail:
 788         aese            $dat0,q8
 789         aese            $dat1,q8
 790         vld1.32         {q8},[$key_],#16
 791         aesmc           $dat0,$dat0
 792         aesmc           $dat1,$dat1
 793         subs            $cnt,$cnt,#2
 794         aese            $dat0,q9
 795         aese            $dat1,q9
 796         vld1.32         {q9},[$key_],#16
 797         aesmc           $dat0,$dat0
 798         aesmc           $dat1,$dat1
 799         b.gt            .Lctr32_tail
 800
 801         aese            $dat0,q8
 802         aese            $dat1,q8
 803         aesmc           $dat0,$dat0
 804         aesmc           $dat1,$dat1
 805         aese            $dat0,q9
 806         aese            $dat1,q9
 807         aesmc           $dat0,$dat0
 808         aesmc           $dat1,$dat1
 809          vld1.8         {$in0},[$inp],$step
 810         aese            $dat0,q12
 811         aese            $dat1,q12
 812          vld1.8         {$in1},[$inp]
 813         aesmc           $dat0,$dat0
 814         aesmc           $dat1,$dat1
 815         aese            $dat0,q13
 816         aese            $dat1,q13
 817         aesmc           $dat0,$dat0
 818         aesmc           $dat1,$dat1
 819         aese            $dat0,q14
 820         aese            $dat1,q14
 821          veor           $in0,$in0,$rndlast
 822         aesmc           $dat0,$dat0
 823         aesmc           $dat1,$dat1
 824          veor           $in1,$in1,$rndlast
 825         aese            $dat0,q15
 826         aese            $dat1,q15
 827
 828         cmp             $len,#1
 829         veor            $in0,$in0,$dat0
 830         veor            $in1,$in1,$dat1
 831         vst1.8          {$in0},[$out],#16
 832         b.eq            .Lctr32_done
 833         vst1.8          {$in1},[$out]
 834
 835 .Lctr32_done:
 836 ___
 837 $code.=<<___    if ($flavour !~ /64/);
 838         vldmia          sp!,{d8-d15}
 839         ldmia           sp!,{r4-r10,pc}
 840 ___
 841 $code.=<<___    if ($flavour =~ /64/);
 842         ldr             x29,[sp],#16
 843         ret
 844 ___
 845 $code.=<<___;
 846 .size   ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
 847 ___
 848 }}}
 849 $code.=<<___;
 850 #endif
 851 ___
 852 ########################################
 853 if ($flavour =~ /64/) {                 ######## 64-bit code
 854     my %opcode = (
 855         "aesd"  =>      0x4e285800,     "aese"  =>      0x4e284800,
 856         "aesimc"=>      0x4e287800,     "aesmc" =>      0x4e286800      );
 857
 858     local *unaes = sub {
 859         my ($mnemonic,$arg)=@_;
 860
 861         $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o   &&
 862         sprintf ".inst\t0x%08x\t//%s %s",
 863                         $opcode{$mnemonic}|$1|($2<<5),
 864                         $mnemonic,$arg;
 865     };
 866
 867     foreach(split("\n",$code)) {
 868         s/\`([^\`]*)\`/eval($1)/geo;
 869
 870         s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
 871         s/@\s/\/\//o;                   # old->new style commentary
 872
 873         #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo     or
 874         s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o     or
 875         s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel     $2,$3,$2,$1/o   or
 876         s/vmov\.i8/movi/o       or      # fix up legacy mnemonics
 877         s/vext\.8/ext/o         or
 878         s/vrev32\.8/rev32/o     or
 879         s/vtst\.8/cmtst/o       or
 880         s/vshr/ushr/o           or
 881         s/^(\s+)v/$1/o          or      # strip off v prefix
 882         s/\bbx\s+lr\b/ret/o;
 883
 884         # fix up remainig legacy suffixes
 885         s/\.[ui]?8//o;
 886         m/\],#8/o and s/\.16b/\.8b/go;
 887         s/\.[ui]?32//o and s/\.16b/\.4s/go;
 888         s/\.[ui]?64//o and s/\.16b/\.2d/go;
 889         s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
 890
 891         print $_,"\n";
 892     }
 893 } else {                                ######## 32-bit code
 894     my %opcode = (
 895         "aesd"  =>      0xf3b00340,     "aese"  =>      0xf3b00300,
 896         "aesimc"=>      0xf3b003c0,     "aesmc" =>      0xf3b00380      );
 897
 898     local *unaes = sub {
 899         my ($mnemonic,$arg)=@_;
 900
 901         if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
 902             my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
 903                                          |(($2&7)<<1) |(($2&8)<<2);
 904             # since ARMv7 instructions are always encoded little-endian.
 905             # correct solution is to use .inst directive, but older
 906             # assemblers don't implement it:-(
 907             sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
 908                         $word&0xff,($word>>8)&0xff,
 909                         ($word>>16)&0xff,($word>>24)&0xff,
 910                         $mnemonic,$arg;
 911         }
 912     };
 913
 914     sub unvtbl {
 915         my $arg=shift;
 916
 917         $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
 918         sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
 919                 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
 920     }
 921
 922     sub unvdup32 {
 923         my $arg=shift;
 924
 925         $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
 926         sprintf "vdup.32        q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
 927     }
 928
 929     sub unvmov32 {
 930         my $arg=shift;
 931
 932         $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
 933         sprintf "vmov.32        d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
 934     }
 935
 936     foreach(split("\n",$code)) {
 937         s/\`([^\`]*)\`/eval($1)/geo;
 938
 939         s/\b[wx]([0-9]+)\b/r$1/go;              # new->old registers
 940         s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
 941         s/\/\/\s?/@ /o;                         # new->old style commentary
 942
 943         # fix up remainig new-style suffixes
 944         s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo    or
 945         s/\],#[0-9]+/]!/o;
 946
 947         s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo      or
 948         s/cclr\s+([^,]+),\s*([a-z]+)/mov$2      $1,#0/o or
 949         s/vtbl\.8\s+(.*)/unvtbl($1)/geo                 or
 950         s/vdup\.32\s+(.*)/unvdup32($1)/geo              or
 951         s/vmov\.32\s+(.*)/unvmov32($1)/geo              or
 952         s/^(\s+)b\./$1b/o                               or
 953         s/^(\s+)mov\./$1mov/o                           or
 954         s/^(\s+)ret/$1bx\tlr/o;
 955
 956         print $_,"\n";
 957     }
 958 }
 959
 960 close STDOUT;