crypto/aes/asm/aesv8-armx.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 #
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16 #
  17 # This module implements support for ARMv8 AES instructions. The
  18 # module is endian-agnostic in the sense that it supports both big- and
  19 # little-endian cases. It likewise supports both 32- and 64-bit modes
  20 # of operation. The latter is achieved by limiting the number of utilized
  21 # registers to 16, which implies additional NEON load and integer
  22 # instructions. This has no effect on mighty Apple A7, where results
  23 # are literally equal to the theoretical estimates based on AES
  24 # instruction latencies and issue rates. On Cortex-A53, an in-order
  25 # execution core, this costs up to 10-15%, which is partially
  26 # compensated by implementing dedicated code path for 128-bit
  27 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
  28 # seems to be limited by sheer amount of NEON instructions...
  29 #
  30 # Performance in cycles per byte processed with 128-bit key:
  31 #
  32 #               CBC enc         CBC dec         CTR
  33 # Apple A7      2.39            1.20            1.20
  34 # Cortex-A53    1.32            1.29            1.46
  35 # Cortex-A57(*) 1.95            0.85            0.93
  36 # Denver        1.96            0.86            0.80
  37 # Mongoose      1.33            1.20            1.20
  38 # Kryo          1.26            0.94            1.00
  39 #
  40 # (*)   original 3.64/1.34/1.32 results were for r0p0 revision
  41 #       and are still same even for updated module;
  42
  43 $flavour = shift;       # first argument; any value matching /64/ selects the 64-bit path
  44 $output  = shift;       # second argument, handed on unchanged to arm-xlate.pl
  45
     # Look for arm-xlate.pl next to this script first, then in
     # ../../perlasm/ relative to it; give up if neither file exists.
  46 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  47 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  48 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  49 die "can't locate arm-xlate.pl";
  50
     # From here on everything printed to STDOUT is piped through
     # arm-xlate.pl, run under the same perl binary ($^X).  Fail loudly if the
     # pipe cannot be started, and quote $output so that a path containing
     # blanks reaches the translator as a single argument.
  51 open OUT,"| \"$^X\" $xlate $flavour \"$output\"" or die "can't call $xlate: $!";
  52 *STDOUT=*OUT;
  53
  54 $prefix="aes_v8";
     # $prefix is prepended to every global symbol emitted below
     # (${prefix}_set_encrypt_key, ${prefix}_cbc_encrypt, ...).
  55
     # Common preamble.  All generated code sits inside the
     # "#if __ARM_MAX_ARCH__>=7" opened here; the matching "#endif" is appended
     # to $code just before the transliteration pass at the end of the script.
  56 $code=<<___;
  57 #include "arm_arch.h"
  58
  59 #if __ARM_MAX_ARCH__>=7
  60 .text
  61 ___
     # Architecture directives differ per flavour: 64-bit flavours enable the
     # crypto extension in the assembler, all other flavours declare plain
     # armv7-a with NEON in ARM (not Thumb-2) mode.
  62 $code.=".arch   armv8-a+crypto\n"                       if ($flavour =~ /64/);
  63 $code.=<<___                                            if ($flavour !~ /64/);
  64 .arch   armv7-a // don't confuse not-so-latest binutils with argv8 :-)
  65 .fpu    neon
  66 .code   32
  67 #undef  __thumb2__
  68 ___
  69
  70 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
  71 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
  72 # maintain both 32- and 64-bit codes within single module and
  73 # transliterate common code to either flavour with regex voodoo.
  74 #
  75 {{{
  76 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
     # Integer register aliases for the key-setup code (written in 64-bit
     # names; the 32-bit pass at the end of the script renames w/x to r).
     #   $inp    - tested against zero, then read as the user key
     #   $bits   - key length in bits; reused as a loop counter once checked
     #   $out    - tested against zero, then written as the key schedule
     #   $ptr    - holds the pending return value (-1, -2, 0) and, in between,
     #             the address of .Lrcon
     #   $rounds - round count (10/12/14) stored at [$out] on success
  77 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
  78         $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
     # Vector aliases: q0-q6 for 64-bit flavours, q0-q3 plus q8-q10 otherwise.
     # NOTE(review): skipping q4-q7 on 32-bit presumably avoids d8-d15, which
     # the other routines in this file save "because the ABI says so" while
     # the key-setup code has no such spill - confirm against the AAPCS.
  79
  80
     # Constant table and entry point of ${prefix}_set_encrypt_key.
     # .Lrcon holds three rows of four words: the initial round constant
     # (0x01), the byte-shuffle mask used with vtbl ("rotate-n-splat"), and
     # 0x1b, which the 128-bit path reloads for its last two rounds.
     # .Lenc_key is a local alias of the entry point; set_decrypt_key reaches
     # the routine through it with "bl .Lenc_key".
     # The routine returns, in x0: -1 if $inp or $out is zero, -2 if $bits is
     # below 128, above 256 or has any of its low six bits set, 0 on success.
  81 $code.=<<___;
  82 .align  5
  83 .Lrcon:
  84 .long   0x01,0x01,0x01,0x01
  85 .long   0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d     // rotate-n-splat
  86 .long   0x1b,0x1b,0x1b,0x1b
  87
  88 .globl  ${prefix}_set_encrypt_key
  89 .type   ${prefix}_set_encrypt_key,%function
  90 .align  5
  91 ${prefix}_set_encrypt_key:
  92 .Lenc_key:
  93 ___
  94 $code.=<<___    if ($flavour =~ /64/);
  95         stp     x29,x30,[sp,#-16]!
  96         add     x29,sp,#0
  97 ___
  98 $code.=<<___;
  99         mov     $ptr,#-1
 100         cmp     $inp,#0
 101         b.eq    .Lenc_key_abort
 102         cmp     $out,#0
 103         b.eq    .Lenc_key_abort
 104         mov     $ptr,#-2
 105         cmp     $bits,#128
 106         b.lt    .Lenc_key_abort
 107         cmp     $bits,#256
 108         b.gt    .Lenc_key_abort
 109         tst     $bits,#0x3f
 110         b.ne    .Lenc_key_abort
 111
 112         adr     $ptr,.Lrcon
 113         cmp     $bits,#192
 114
 115         veor    $zero,$zero,$zero
 116         vld1.8  {$in0},[$inp],#16
 117         mov     $bits,#8                // reuse $bits
 118         vld1.32 {$rcon,$mask},[$ptr],#32
 119
 120         b.lt    .Loop128
 121         b.eq    .L192
 122         b       .L256
 123
 124 .align  4
 125 .Loop128:
 126         vtbl.8  $key,{$in0},$mask
 127         vext.8  $tmp,$zero,$in0,#12
 128         vst1.32 {$in0},[$out],#16
 129         aese    $key,$zero
 130         subs    $bits,$bits,#1
 131
 132         veor    $in0,$in0,$tmp
 133         vext.8  $tmp,$zero,$tmp,#12
 134         veor    $in0,$in0,$tmp
 135         vext.8  $tmp,$zero,$tmp,#12
 136          veor   $key,$key,$rcon
 137         veor    $in0,$in0,$tmp
 138         vshl.u8 $rcon,$rcon,#1
 139         veor    $in0,$in0,$key
 140         b.ne    .Loop128
 141
 142         vld1.32 {$rcon},[$ptr]
 143
 144         vtbl.8  $key,{$in0},$mask
 145         vext.8  $tmp,$zero,$in0,#12
 146         vst1.32 {$in0},[$out],#16
 147         aese    $key,$zero
 148
 149         veor    $in0,$in0,$tmp
 150         vext.8  $tmp,$zero,$tmp,#12
 151         veor    $in0,$in0,$tmp
 152         vext.8  $tmp,$zero,$tmp,#12
 153          veor   $key,$key,$rcon
 154         veor    $in0,$in0,$tmp
 155         vshl.u8 $rcon,$rcon,#1
 156         veor    $in0,$in0,$key
 157
 158         vtbl.8  $key,{$in0},$mask
 159         vext.8  $tmp,$zero,$in0,#12
 160         vst1.32 {$in0},[$out],#16
 161         aese    $key,$zero
 162
 163         veor    $in0,$in0,$tmp
 164         vext.8  $tmp,$zero,$tmp,#12
 165         veor    $in0,$in0,$tmp
 166         vext.8  $tmp,$zero,$tmp,#12
 167          veor   $key,$key,$rcon
 168         veor    $in0,$in0,$tmp
 169         veor    $in0,$in0,$key
 170         vst1.32 {$in0},[$out]
 171         add     $out,$out,#0x50
 172
 173         mov     $rounds,#10
 174         b       .Ldone
 175
 176 .align  4
 177 .L192:
 178         vld1.8  {$in1},[$inp],#8
 179         vmov.i8 $key,#8                 // borrow $key
 180         vst1.32 {$in0},[$out],#16
 181         vsub.i8 $mask,$mask,$key        // adjust the mask
 182
 183 .Loop192:
 184         vtbl.8  $key,{$in1},$mask
 185         vext.8  $tmp,$zero,$in0,#12
 186         vst1.32 {$in1},[$out],#8
 187         aese    $key,$zero
 188         subs    $bits,$bits,#1
 189
 190         veor    $in0,$in0,$tmp
 191         vext.8  $tmp,$zero,$tmp,#12
 192         veor    $in0,$in0,$tmp
 193         vext.8  $tmp,$zero,$tmp,#12
 194         veor    $in0,$in0,$tmp
 195
 196         vdup.32 $tmp,${in0}[3]
 197         veor    $tmp,$tmp,$in1
 198          veor   $key,$key,$rcon
 199         vext.8  $in1,$zero,$in1,#12
 200         vshl.u8 $rcon,$rcon,#1
 201         veor    $in1,$in1,$tmp
 202         veor    $in0,$in0,$key
 203         veor    $in1,$in1,$key
 204         vst1.32 {$in0},[$out],#16
 205         b.ne    .Loop192
 206
 207         mov     $rounds,#12
 208         add     $out,$out,#0x20
 209         b       .Ldone
 210
 211 .align  4
 212 .L256:
 213         vld1.8  {$in1},[$inp]
 214         mov     $bits,#7
 215         mov     $rounds,#14
 216         vst1.32 {$in0},[$out],#16
 217
 218 .Loop256:
 219         vtbl.8  $key,{$in1},$mask
 220         vext.8  $tmp,$zero,$in0,#12
 221         vst1.32 {$in1},[$out],#16
 222         aese    $key,$zero
 223         subs    $bits,$bits,#1
 224
 225         veor    $in0,$in0,$tmp
 226         vext.8  $tmp,$zero,$tmp,#12
 227         veor    $in0,$in0,$tmp
 228         vext.8  $tmp,$zero,$tmp,#12
 229          veor   $key,$key,$rcon
 230         veor    $in0,$in0,$tmp
 231         vshl.u8 $rcon,$rcon,#1
 232         veor    $in0,$in0,$key
 233         vst1.32 {$in0},[$out],#16
 234         b.eq    .Ldone
 235
 236         vdup.32 $key,${in0}[3]          // just splat
 237         vext.8  $tmp,$zero,$in1,#12
 238         aese    $key,$zero
 239
 240         veor    $in1,$in1,$tmp
 241         vext.8  $tmp,$zero,$tmp,#12
 242         veor    $in1,$in1,$tmp
 243         vext.8  $tmp,$zero,$tmp,#12
 244         veor    $in1,$in1,$tmp
 245
 246         veor    $in1,$in1,$key
 247         b       .Loop256
 248
 249 .Ldone:
 250         str     $rounds,[$out]
 251         mov     $ptr,#0
 252
 253 .Lenc_key_abort:
 254         mov     x0,$ptr                 // return value
 255         `"ldr   x29,[sp],#16"           if ($flavour =~ /64/)`
 256         ret
 257 .size   ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
 258
 259 .globl  ${prefix}_set_decrypt_key
 260 .type   ${prefix}_set_decrypt_key,%function
 261 .align  5
 262 ${prefix}_set_decrypt_key:
 263 ___
 264 $code.=<<___    if ($flavour =~ /64/);
 265         stp     x29,x30,[sp,#-16]!
 266         add     x29,sp,#0
 267 ___
 268 $code.=<<___    if ($flavour !~ /64/);
 269         stmdb   sp!,{r4,lr}
 270 ___
 271 $code.=<<___;
 272         bl      .Lenc_key
 273
 274         cmp     x0,#0
 275         b.ne    .Ldec_key_abort
 276
 277         sub     $out,$out,#240          // restore original $out
 278         mov     x4,#-16
 279         add     $inp,$out,x12,lsl#4     // end of key schedule
 280
 281         vld1.32 {v0.16b},[$out]
 282         vld1.32 {v1.16b},[$inp]
 283         vst1.32 {v0.16b},[$inp],x4
 284         vst1.32 {v1.16b},[$out],#16
 285
 286 .Loop_imc:
 287         vld1.32 {v0.16b},[$out]
 288         vld1.32 {v1.16b},[$inp]
 289         aesimc  v0.16b,v0.16b
 290         aesimc  v1.16b,v1.16b
 291         vst1.32 {v0.16b},[$inp],x4
 292         vst1.32 {v1.16b},[$out],#16
 293         cmp     $inp,$out
 294         b.hi    .Loop_imc
 295
 296         vld1.32 {v0.16b},[$out]
 297         aesimc  v0.16b,v0.16b
 298         vst1.32 {v0.16b},[$inp]
 299
 300         eor     x0,x0,x0                // return value
 301 .Ldec_key_abort:
 302 ___
 303 $code.=<<___    if ($flavour !~ /64/);
 304         ldmia   sp!,{r4,pc}
 305 ___
 306 $code.=<<___    if ($flavour =~ /64/);
 307         ldp     x29,x30,[sp],#16
 308         ret
 309 ___
 310 $code.=<<___;
 311 .size   ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
 312 ___
 313 }}}
 314 {{{
     # gen_block($dir) appends one single-block routine to $code:
     # ${prefix}_encrypt for $dir eq "en", ${prefix}_decrypt for anything else
     # (called with "de" below).  Generated register use: x0 = input block,
     # x1 = output block, x2 = key schedule whose round count is read from
     # byte offset 240.  The loop consumes two round keys per iteration; the
     # tail applies the last two rounds, the final one without the (inverse)
     # mix-columns step, and xors in the last round key.
     #
     # The sub takes exactly one argument, so it is declared without the
     # former empty "()" prototype, which only worked because callers used the
     # prototype-bypassing &-form.
 315 sub gen_block {
 316 my $dir = shift;
 317 my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
 318 my ($inp,$out,$key)=map("x$_",(0..2));
 319 my $rounds="w3";
 320 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..2));
 321
 322 $code.=<<___;
 323 .globl  ${prefix}_${dir}crypt
 324 .type   ${prefix}_${dir}crypt,%function
 325 .align  5
 326 ${prefix}_${dir}crypt:
 327         ldr     $rounds,[$key,#240]
 328         vld1.32 {$rndkey0},[$key],#16
 329         vld1.8  {$inout},[$inp]
 330         sub     $rounds,$rounds,#2
 331         vld1.32 {$rndkey1},[$key],#16
 332
 333 .Loop_${dir}c:
 334         aes$e   $inout,$rndkey0
 335         aes$mc  $inout,$inout
 336         vld1.32 {$rndkey0},[$key],#16
 337         subs    $rounds,$rounds,#2
 338         aes$e   $inout,$rndkey1
 339         aes$mc  $inout,$inout
 340         vld1.32 {$rndkey1},[$key],#16
 341         b.gt    .Loop_${dir}c
 342
 343         aes$e   $inout,$rndkey0
 344         aes$mc  $inout,$inout
 345         vld1.32 {$rndkey0},[$key]
 346         aes$e   $inout,$rndkey1
 347         veor    $inout,$inout,$rndkey0
 348
 349         vst1.8  {$inout},[$out]
 350         ret
 351 .size   ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
 352 ___
 353 }
 354 gen_block("en");
 355 gen_block("de");
 356 }}}
 357 {{{
 358 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
     # Above: x0-x4 are input pointer, output pointer, length, key schedule
     # pointer and IV pointer; w5 is the direction flag.  The flag is compared
     # with zero and a zero value later branches to .Lcbc_dec.  The length is
     # consumed 16 at a time and masked with -16; fewer than 16 branches
     # straight to .Lcbc_abort.  For 32-bit flavours the prologue fetches the
     # last two arguments from the caller's stack (ldmia ip,{r4-r5}).
 359 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
     # Above: $rounds deliberately shares w5 with $enc - the flag is compared
     # first and the round count is then loaded over it from offset 240 of
     # the key schedule.  $step is the post-increment applied to $inp: 16, or
     # cleared to 0 when the remaining length reaches zero.  $step1 is not
     # referenced by the templates in this section.
 360 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
 361
 362 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
     # Above: $rndzero_n_last shares q5 with $tmp1.  On the encrypt path it is
     # set to q8 xor $rndlast and xored into every freshly loaded input block.
 363 my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
     # Above: pointers to round keys 4..7 ($key + 16*4 .. $key + 16*7), set up
     # only on the non-128-bit encrypt path.  $key4 overlays $cnt (x6/w6) and
     # $key7 overlays $key itself.
 364
 365 ### q8-q15      preloaded key schedule
 366
     # Entry label of ${prefix}_cbc_encrypt; the flavour-specific prologue and
     # the body are appended by the statements that follow.
 367 $code.=<<___;
 368 .globl  ${prefix}_cbc_encrypt
 369 .type   ${prefix}_cbc_encrypt,%function
 370 .align  5
 371 ${prefix}_cbc_encrypt:
 372 ___
 373 $code.=<<___    if ($flavour =~ /64/);
 374         stp     x29,x30,[sp,#-16]!
 375         add     x29,sp,#0
 376 ___
 377 $code.=<<___    if ($flavour !~ /64/);
 378         mov     ip,sp
 379         stmdb   sp!,{r4-r8,lr}
 380         vstmdb  sp!,{d8-d15}            @ ABI specification says so
 381         ldmia   ip,{r4-r5}              @ load remaining args
 382 ___
 383 $code.=<<___;
 384         subs    $len,$len,#16
 385         mov     $step,#16
 386         b.lo    .Lcbc_abort
 387         cclr    $step,eq
 388
 389         cmp     $enc,#0                 // en- or decrypting?
 390         ldr     $rounds,[$key,#240]
 391         and     $len,$len,#-16
 392         vld1.8  {$ivec},[$ivp]
 393         vld1.8  {$dat},[$inp],$step
 394
 395         vld1.32 {q8-q9},[$key]          // load key schedule...
 396         sub     $rounds,$rounds,#6
 397         add     $key_,$key,x5,lsl#4     // pointer to last 7 round keys
 398         sub     $rounds,$rounds,#2
 399         vld1.32 {q10-q11},[$key_],#32
 400         vld1.32 {q12-q13},[$key_],#32
 401         vld1.32 {q14-q15},[$key_],#32
 402         vld1.32 {$rndlast},[$key_]
 403
 404         add     $key_,$key,#32
 405         mov     $cnt,$rounds
 406         b.eq    .Lcbc_dec
 407
 408         cmp     $rounds,#2
 409         veor    $dat,$dat,$ivec
 410         veor    $rndzero_n_last,q8,$rndlast
 411         b.eq    .Lcbc_enc128
 412
 413         vld1.32 {$in0-$in1},[$key_]
 414         add     $key_,$key,#16
 415         add     $key4,$key,#16*4
 416         add     $key5,$key,#16*5
 417         aese    $dat,q8
 418         aesmc   $dat,$dat
 419         add     $key6,$key,#16*6
 420         add     $key7,$key,#16*7
 421         b       .Lenter_cbc_enc
 422
 423 .align  4
 424 .Loop_cbc_enc:
 425         aese    $dat,q8
 426         aesmc   $dat,$dat
 427          vst1.8 {$ivec},[$out],#16
 428 .Lenter_cbc_enc:
 429         aese    $dat,q9
 430         aesmc   $dat,$dat
 431         aese    $dat,$in0
 432         aesmc   $dat,$dat
 433         vld1.32 {q8},[$key4]
 434         cmp     $rounds,#4
 435         aese    $dat,$in1
 436         aesmc   $dat,$dat
 437         vld1.32 {q9},[$key5]
 438         b.eq    .Lcbc_enc192
 439
 440         aese    $dat,q8
 441         aesmc   $dat,$dat
 442         vld1.32 {q8},[$key6]
 443         aese    $dat,q9
 444         aesmc   $dat,$dat
 445         vld1.32 {q9},[$key7]
 446         nop
 447
 448 .Lcbc_enc192:
 449         aese    $dat,q8
 450         aesmc   $dat,$dat
 451          subs   $len,$len,#16
 452         aese    $dat,q9
 453         aesmc   $dat,$dat
 454          cclr   $step,eq
 455         aese    $dat,q10
 456         aesmc   $dat,$dat
 457         aese    $dat,q11
 458         aesmc   $dat,$dat
 459          vld1.8 {q8},[$inp],$step
 460         aese    $dat,q12
 461         aesmc   $dat,$dat
 462          veor   q8,q8,$rndzero_n_last
 463         aese    $dat,q13
 464         aesmc   $dat,$dat
 465          vld1.32 {q9},[$key_]           // re-pre-load rndkey[1]
 466         aese    $dat,q14
 467         aesmc   $dat,$dat
 468         aese    $dat,q15
 469         veor    $ivec,$dat,$rndlast
 470         b.hs    .Loop_cbc_enc
 471
 472         vst1.8  {$ivec},[$out],#16
 473         b       .Lcbc_done
 474
 475 .align  5
 476 .Lcbc_enc128:
 477         vld1.32 {$in0-$in1},[$key_]
 478         aese    $dat,q8
 479         aesmc   $dat,$dat
 480         b       .Lenter_cbc_enc128
 481 .Loop_cbc_enc128:
 482         aese    $dat,q8
 483         aesmc   $dat,$dat
 484          vst1.8 {$ivec},[$out],#16
 485 .Lenter_cbc_enc128:
 486         aese    $dat,q9
 487         aesmc   $dat,$dat
 488          subs   $len,$len,#16
 489         aese    $dat,$in0
 490         aesmc   $dat,$dat
 491          cclr   $step,eq
 492         aese    $dat,$in1
 493         aesmc   $dat,$dat
 494         aese    $dat,q10
 495         aesmc   $dat,$dat
 496         aese    $dat,q11
 497         aesmc   $dat,$dat
 498          vld1.8 {q8},[$inp],$step
 499         aese    $dat,q12
 500         aesmc   $dat,$dat
 501         aese    $dat,q13
 502         aesmc   $dat,$dat
 503         aese    $dat,q14
 504         aesmc   $dat,$dat
 505          veor   q8,q8,$rndzero_n_last
 506         aese    $dat,q15
 507         veor    $ivec,$dat,$rndlast
 508         b.hs    .Loop_cbc_enc128
 509
 510         vst1.8  {$ivec},[$out],#16
 511         b       .Lcbc_done
 512 ___
 513 {
 514 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
 515 $code.=<<___;
 516 .align  5
 517 .Lcbc_dec:
 518         vld1.8  {$dat2},[$inp],#16
 519         subs    $len,$len,#32           // bias
 520         add     $cnt,$rounds,#2
 521         vorr    $in1,$dat,$dat
 522         vorr    $dat1,$dat,$dat
 523         vorr    $in2,$dat2,$dat2
 524         b.lo    .Lcbc_dec_tail
 525
 526         vorr    $dat1,$dat2,$dat2
 527         vld1.8  {$dat2},[$inp],#16
 528         vorr    $in0,$dat,$dat
 529         vorr    $in1,$dat1,$dat1
 530         vorr    $in2,$dat2,$dat2
 531
 532 .Loop3x_cbc_dec:
 533         aesd    $dat0,q8
 534         aesimc  $dat0,$dat0
 535         aesd    $dat1,q8
 536         aesimc  $dat1,$dat1
 537         aesd    $dat2,q8
 538         aesimc  $dat2,$dat2
 539         vld1.32 {q8},[$key_],#16
 540         subs    $cnt,$cnt,#2
 541         aesd    $dat0,q9
 542         aesimc  $dat0,$dat0
 543         aesd    $dat1,q9
 544         aesimc  $dat1,$dat1
 545         aesd    $dat2,q9
 546         aesimc  $dat2,$dat2
 547         vld1.32 {q9},[$key_],#16
 548         b.gt    .Loop3x_cbc_dec
 549
 550         aesd    $dat0,q8
 551         aesimc  $dat0,$dat0
 552         aesd    $dat1,q8
 553         aesimc  $dat1,$dat1
 554         aesd    $dat2,q8
 555         aesimc  $dat2,$dat2
 556          veor   $tmp0,$ivec,$rndlast
 557          subs   $len,$len,#0x30
 558          veor   $tmp1,$in0,$rndlast
 559          mov.lo x6,$len                 // x6, $cnt, is zero at this point
 560         aesd    $dat0,q9
 561         aesimc  $dat0,$dat0
 562         aesd    $dat1,q9
 563         aesimc  $dat1,$dat1
 564         aesd    $dat2,q9
 565         aesimc  $dat2,$dat2
 566          veor   $tmp2,$in1,$rndlast
 567          add    $inp,$inp,x6            // $inp is adjusted in such way that
 568                                         // at exit from the loop $dat1-$dat2
 569                                         // are loaded with last "words"
 570          vorr   $ivec,$in2,$in2
 571          mov    $key_,$key
 572         aesd    $dat0,q12
 573         aesimc  $dat0,$dat0
 574         aesd    $dat1,q12
 575         aesimc  $dat1,$dat1
 576         aesd    $dat2,q12
 577         aesimc  $dat2,$dat2
 578          vld1.8 {$in0},[$inp],#16
 579         aesd    $dat0,q13
 580         aesimc  $dat0,$dat0
 581         aesd    $dat1,q13
 582         aesimc  $dat1,$dat1
 583         aesd    $dat2,q13
 584         aesimc  $dat2,$dat2
 585          vld1.8 {$in1},[$inp],#16
 586         aesd    $dat0,q14
 587         aesimc  $dat0,$dat0
 588         aesd    $dat1,q14
 589         aesimc  $dat1,$dat1
 590         aesd    $dat2,q14
 591         aesimc  $dat2,$dat2
 592          vld1.8 {$in2},[$inp],#16
 593         aesd    $dat0,q15
 594         aesd    $dat1,q15
 595         aesd    $dat2,q15
 596          vld1.32 {q8},[$key_],#16       // re-pre-load rndkey[0]
 597          add    $cnt,$rounds,#2
 598         veor    $tmp0,$tmp0,$dat0
 599         veor    $tmp1,$tmp1,$dat1
 600         veor    $dat2,$dat2,$tmp2
 601          vld1.32 {q9},[$key_],#16       // re-pre-load rndkey[1]
 602         vst1.8  {$tmp0},[$out],#16
 603          vorr   $dat0,$in0,$in0
 604         vst1.8  {$tmp1},[$out],#16
 605          vorr   $dat1,$in1,$in1
 606         vst1.8  {$dat2},[$out],#16
 607          vorr   $dat2,$in2,$in2
 608         b.hs    .Loop3x_cbc_dec
 609
 610         cmn     $len,#0x30
 611         b.eq    .Lcbc_done
 612         nop
 613
 614 .Lcbc_dec_tail:
 615         aesd    $dat1,q8
 616         aesimc  $dat1,$dat1
 617         aesd    $dat2,q8
 618         aesimc  $dat2,$dat2
 619         vld1.32 {q8},[$key_],#16
 620         subs    $cnt,$cnt,#2
 621         aesd    $dat1,q9
 622         aesimc  $dat1,$dat1
 623         aesd    $dat2,q9
 624         aesimc  $dat2,$dat2
 625         vld1.32 {q9},[$key_],#16
 626         b.gt    .Lcbc_dec_tail
 627
 628         aesd    $dat1,q8
 629         aesimc  $dat1,$dat1
 630         aesd    $dat2,q8
 631         aesimc  $dat2,$dat2
 632         aesd    $dat1,q9
 633         aesimc  $dat1,$dat1
 634         aesd    $dat2,q9
 635         aesimc  $dat2,$dat2
 636         aesd    $dat1,q12
 637         aesimc  $dat1,$dat1
 638         aesd    $dat2,q12
 639         aesimc  $dat2,$dat2
 640          cmn    $len,#0x20
 641         aesd    $dat1,q13
 642         aesimc  $dat1,$dat1
 643         aesd    $dat2,q13
 644         aesimc  $dat2,$dat2
 645          veor   $tmp1,$ivec,$rndlast
 646         aesd    $dat1,q14
 647         aesimc  $dat1,$dat1
 648         aesd    $dat2,q14
 649         aesimc  $dat2,$dat2
 650          veor   $tmp2,$in1,$rndlast
 651         aesd    $dat1,q15
 652         aesd    $dat2,q15
 653         b.eq    .Lcbc_dec_one
 654         veor    $tmp1,$tmp1,$dat1
 655         veor    $tmp2,$tmp2,$dat2
 656          vorr   $ivec,$in2,$in2
 657         vst1.8  {$tmp1},[$out],#16
 658         vst1.8  {$tmp2},[$out],#16
 659         b       .Lcbc_done
 660
 661 .Lcbc_dec_one:
 662         veor    $tmp1,$tmp1,$dat2
 663          vorr   $ivec,$in2,$in2
 664         vst1.8  {$tmp1},[$out],#16
 665
 666 .Lcbc_done:
 667         vst1.8  {$ivec},[$ivp]
 668 .Lcbc_abort:
 669 ___
 670 }
 671 $code.=<<___    if ($flavour !~ /64/);
 672         vldmia  sp!,{d8-d15}
 673         ldmia   sp!,{r4-r8,pc}
 674 ___
 675 $code.=<<___    if ($flavour =~ /64/);
 676         ldr     x29,[sp],#16
 677         ret
 678 ___
 679 $code.=<<___;
 680 .size   ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
 681 ___
 682 }}}
 683 {{{
 684 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
     # Above: x0-x4 are input pointer, output pointer, length, key schedule
     # pointer and IV pointer.  The length is counted in 16-byte blocks: the
     # main loop handles three blocks and subtracts 3 per iteration.  For
     # 32-bit flavours the fifth argument is loaded from the caller's stack
     # by the prologue (ldr r4,[ip]).
 685 my ($rounds,$cnt,$key_)=("w5","w6","x7");
 686 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
     # Above: $ctr is loaded from byte offset 12 of the IV and byte-reversed
     # unless __ARMEB__ is defined; $tctr0-$tctr2 hold the incremented,
     # re-reversed counter values that are inserted into lane 3 of the data
     # registers.
 687 my $step="x12";         # aliases with $tctr2
 688
 689 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
 690 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
     # Above: $tmp2 shares q9 with the second pre-loaded round key; the 3x
     # loop writes it only after its last use of q9 as a key and reloads q9
     # before iterating.
 691
 692 my ($dat,$tmp)=($dat0,$tmp0);
 693
     # In this routine only q8-q9 and q12-q15 are loaded from the key
     # schedule (plus $rndlast); q10/q11 serve as $dat2/$in2.
 694 ### q8-q15      preloaded key schedule
 695
     # Entry label of ${prefix}_ctr32_encrypt_blocks; the flavour-specific
     # prologue and the body are appended by the statements that follow.
 696 $code.=<<___;
 697 .globl  ${prefix}_ctr32_encrypt_blocks
 698 .type   ${prefix}_ctr32_encrypt_blocks,%function
 699 .align  5
 700 ${prefix}_ctr32_encrypt_blocks:
 701 ___
 702 $code.=<<___    if ($flavour =~ /64/);
 703         stp             x29,x30,[sp,#-16]!
 704         add             x29,sp,#0
 705 ___
 706 $code.=<<___    if ($flavour !~ /64/);
 707         mov             ip,sp
 708         stmdb           sp!,{r4-r10,lr}
 709         vstmdb          sp!,{d8-d15}            @ ABI specification says so
 710         ldr             r4, [ip]                @ load remaining arg
 711 ___
 712 $code.=<<___;
 713         ldr             $rounds,[$key,#240]
 714
 715         ldr             $ctr, [$ivp, #12]
 716         vld1.32         {$dat0},[$ivp]
 717
 718         vld1.32         {q8-q9},[$key]          // load key schedule...
 719         sub             $rounds,$rounds,#4
 720         mov             $step,#16
 721         cmp             $len,#2
 722         add             $key_,$key,x5,lsl#4     // pointer to last 5 round keys
 723         sub             $rounds,$rounds,#2
 724         vld1.32         {q12-q13},[$key_],#32
 725         vld1.32         {q14-q15},[$key_],#32
 726         vld1.32         {$rndlast},[$key_]
 727         add             $key_,$key,#32
 728         mov             $cnt,$rounds
 729         cclr            $step,lo
 730 #ifndef __ARMEB__
 731         rev             $ctr, $ctr
 732 #endif
 733         vorr            $dat1,$dat0,$dat0
 734         add             $tctr1, $ctr, #1
 735         vorr            $dat2,$dat0,$dat0
 736         add             $ctr, $ctr, #2
 737         vorr            $ivec,$dat0,$dat0
 738         rev             $tctr1, $tctr1
 739         vmov.32         ${dat1}[3],$tctr1
 740         b.ls            .Lctr32_tail
 741         rev             $tctr2, $ctr
 742         sub             $len,$len,#3            // bias
 743         vmov.32         ${dat2}[3],$tctr2
 744         b               .Loop3x_ctr32
 745
 746 .align  4
 747 .Loop3x_ctr32:
 748         aese            $dat0,q8
 749         aesmc           $dat0,$dat0
 750         aese            $dat1,q8
 751         aesmc           $dat1,$dat1
 752         aese            $dat2,q8
 753         aesmc           $dat2,$dat2
 754         vld1.32         {q8},[$key_],#16
 755         subs            $cnt,$cnt,#2
 756         aese            $dat0,q9
 757         aesmc           $dat0,$dat0
 758         aese            $dat1,q9
 759         aesmc           $dat1,$dat1
 760         aese            $dat2,q9
 761         aesmc           $dat2,$dat2
 762         vld1.32         {q9},[$key_],#16
 763         b.gt            .Loop3x_ctr32
 764
 765         aese            $dat0,q8
 766         aesmc           $tmp0,$dat0
 767         aese            $dat1,q8
 768         aesmc           $tmp1,$dat1
 769          vld1.8         {$in0},[$inp],#16
 770          vorr           $dat0,$ivec,$ivec
 771         aese            $dat2,q8
 772         aesmc           $dat2,$dat2
 773          vld1.8         {$in1},[$inp],#16
 774          vorr           $dat1,$ivec,$ivec
 775         aese            $tmp0,q9
 776         aesmc           $tmp0,$tmp0
 777         aese            $tmp1,q9
 778         aesmc           $tmp1,$tmp1
 779          vld1.8         {$in2},[$inp],#16
 780          mov            $key_,$key
 781         aese            $dat2,q9
 782         aesmc           $tmp2,$dat2
 783          vorr           $dat2,$ivec,$ivec
 784          add            $tctr0,$ctr,#1
 785         aese            $tmp0,q12
 786         aesmc           $tmp0,$tmp0
 787         aese            $tmp1,q12
 788         aesmc           $tmp1,$tmp1
 789          veor           $in0,$in0,$rndlast
 790          add            $tctr1,$ctr,#2
 791         aese            $tmp2,q12
 792         aesmc           $tmp2,$tmp2
 793          veor           $in1,$in1,$rndlast
 794          add            $ctr,$ctr,#3
 795         aese            $tmp0,q13
 796         aesmc           $tmp0,$tmp0
 797         aese            $tmp1,q13
 798         aesmc           $tmp1,$tmp1
 799          veor           $in2,$in2,$rndlast
 800          rev            $tctr0,$tctr0
 801         aese            $tmp2,q13
 802         aesmc           $tmp2,$tmp2
 803          vmov.32        ${dat0}[3], $tctr0
 804          rev            $tctr1,$tctr1
 805         aese            $tmp0,q14
 806         aesmc           $tmp0,$tmp0
 807         aese            $tmp1,q14
 808         aesmc           $tmp1,$tmp1
 809          vmov.32        ${dat1}[3], $tctr1
 810          rev            $tctr2,$ctr
 811         aese            $tmp2,q14
 812         aesmc           $tmp2,$tmp2
 813          vmov.32        ${dat2}[3], $tctr2
 814          subs           $len,$len,#3
 815         aese            $tmp0,q15
 816         aese            $tmp1,q15
 817         aese            $tmp2,q15
 818
 819         veor            $in0,$in0,$tmp0
 820          vld1.32         {q8},[$key_],#16       // re-pre-load rndkey[0]
 821         vst1.8          {$in0},[$out],#16
 822         veor            $in1,$in1,$tmp1
 823          mov            $cnt,$rounds
 824         vst1.8          {$in1},[$out],#16
 825         veor            $in2,$in2,$tmp2
 826          vld1.32         {q9},[$key_],#16       // re-pre-load rndkey[1]
 827         vst1.8          {$in2},[$out],#16
 828         b.hs            .Loop3x_ctr32
 829
 830         adds            $len,$len,#3
 831         b.eq            .Lctr32_done
 832         cmp             $len,#1
 833         mov             $step,#16
 834         cclr            $step,eq
 835
 836 .Lctr32_tail:
 837         aese            $dat0,q8
 838         aesmc           $dat0,$dat0
 839         aese            $dat1,q8
 840         aesmc           $dat1,$dat1
 841         vld1.32         {q8},[$key_],#16
 842         subs            $cnt,$cnt,#2
 843         aese            $dat0,q9
 844         aesmc           $dat0,$dat0
 845         aese            $dat1,q9
 846         aesmc           $dat1,$dat1
 847         vld1.32         {q9},[$key_],#16
 848         b.gt            .Lctr32_tail
 849
 850         aese            $dat0,q8
 851         aesmc           $dat0,$dat0
 852         aese            $dat1,q8
 853         aesmc           $dat1,$dat1
 854         aese            $dat0,q9
 855         aesmc           $dat0,$dat0
 856         aese            $dat1,q9
 857         aesmc           $dat1,$dat1
 858          vld1.8         {$in0},[$inp],$step
 859         aese            $dat0,q12
 860         aesmc           $dat0,$dat0
 861         aese            $dat1,q12
 862         aesmc           $dat1,$dat1
 863          vld1.8         {$in1},[$inp]
 864         aese            $dat0,q13
 865         aesmc           $dat0,$dat0
 866         aese            $dat1,q13
 867         aesmc           $dat1,$dat1
 868          veor           $in0,$in0,$rndlast
 869         aese            $dat0,q14
 870         aesmc           $dat0,$dat0
 871         aese            $dat1,q14
 872         aesmc           $dat1,$dat1
 873          veor           $in1,$in1,$rndlast
 874         aese            $dat0,q15
 875         aese            $dat1,q15
 876
 877         cmp             $len,#1
 878         veor            $in0,$in0,$dat0
 879         veor            $in1,$in1,$dat1
 880         vst1.8          {$in0},[$out],#16
 881         b.eq            .Lctr32_done
 882         vst1.8          {$in1},[$out]
 883
 884 .Lctr32_done:
 885 ___
 886 $code.=<<___    if ($flavour !~ /64/);
 887         vldmia          sp!,{d8-d15}
 888         ldmia           sp!,{r4-r10,pc}
 889 ___
 890 $code.=<<___    if ($flavour =~ /64/);
 891         ldr             x29,[sp],#16
 892         ret
 893 ___
 894 $code.=<<___;
 895 .size   ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
 896 ___
 897 }}}
 898 $code.=<<___;
 899 #endif
 900 ___
 901 ########################################
     # Transliteration pass.  $code was written in a mix of 32-bit NEON
     # mnemonics and 64-bit integer syntax; each line is now rewritten for the
     # selected flavour and printed to STDOUT.  In both branches the
     # substitutions joined by "or" form a first-match-wins chain: once one of
     # them succeeds the rest of that chain is skipped for the line.
 902 if ($flavour =~ /64/) {                 ######## 64-bit code
         # Base encodings of the four AES instructions.  unaes() would turn a
         # mnemonic into a raw ".inst" word (first register number in the low
         # bits, second shifted left by 5), but the substitution that calls it
         # is commented out in the loop below, so in this branch the mnemonics
         # are printed as they are.
 903     my %opcode = (
 904         "aesd"  =>      0x4e285800,     "aese"  =>      0x4e284800,
 905         "aesimc"=>      0x4e287800,     "aesmc" =>      0x4e286800      );
 906
 907     local *unaes = sub {
 908         my ($mnemonic,$arg)=@_;
 909
 910         $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o   &&
 911         sprintf ".inst\t0x%08x\t//%s %s",
 912                         $opcode{$mnemonic}|$1|($2<<5),
 913                         $mnemonic,$arg;
 914     };
 915
 916     foreach(split("\n",$code)) {
             # Expand `...` fragments embedded in the templates by evaluating
             # them as Perl.
 917         s/\`([^\`]*)\`/eval($1)/geo;
 918
             # qN becomes vN.16b for N<8 and v(N+8).16b otherwise, i.e. q8-q15
             # land in v16-v23.
             # NOTE(review): presumably this keeps clear of v8-v15, which are
             # callee-saved in the 64-bit ABI - confirm.
 919         s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
 920         s/@\s/\/\//o;                   # old->new style commentary
 921
             # cclr R,cond      -> csel R,{w,x}zr,R,cond  (clear R if cond)
             # mov.cond Rd,Rn   -> csel Rd,Rn,Rd,cond
 922         #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo     or
 923         s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o     or
 924         s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel     $2,$3,$2,$1/o   or
 925         s/vmov\.i8/movi/o       or      # fix up legacy mnemonics
 926         s/vext\.8/ext/o         or
 927         s/vrev32\.8/rev32/o     or
 928         s/vtst\.8/cmtst/o       or
 929         s/vshr/ushr/o           or
 930         s/^(\s+)v/$1/o          or      # strip off v prefix
 931         s/\bbx\s+lr\b/ret/o;
 932
             # The element-size suffix moves from the mnemonic to the register
             # arrangement: ".8" is simply dropped (".16b" stays, or becomes
             # ".8b" on lines with an 8-byte post-increment), ".32" turns every
             # ".16b" into ".4s", ".64" into ".2d"; a lane reference such as
             # ".4s[3]" is shortened to ".s[3]".
 933         # fix up remaining legacy suffixes
 934         s/\.[ui]?8//o;
 935         m/\],#8/o and s/\.16b/\.8b/go;
 936         s/\.[ui]?32//o and s/\.16b/\.4s/go;
 937         s/\.[ui]?64//o and s/\.16b/\.2d/go;
 938         s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
 939
 940         print $_,"\n";
 941     }
 942 } else {                                ######## 32-bit code
         # Base encodings of the four AES instructions for the 32-bit flavour.
 943     my %opcode = (
 944         "aesd"  =>      0xf3b00340,     "aese"  =>      0xf3b00300,
 945         "aesimc"=>      0xf3b003c0,     "aesmc" =>      0xf3b00380      );
 946
         # unaes(mnemonic, operands): hand-assemble an AES instruction.  The
         # first q-register number is scattered into bits 13-15 and bit 22 of
         # the word, the second into bits 1-3 and bit 5; the word is emitted
         # as four bytes, least significant first.  Returns nothing useful if
         # the operand pattern does not match.
 947     local *unaes = sub {
 948         my ($mnemonic,$arg)=@_;
 949
 950         if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
 951             my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
 952                                          |(($2&7)<<1) |(($2&8)<<2);
 953             # since ARMv7 instructions are always encoded little-endian.
 954             # correct solution is to use .inst directive, but older
 955             # assemblers don't implement it:-(
 956             sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
 957                         $word&0xff,($word>>8)&0xff,
 958                         ($word>>16)&0xff,($word>>24)&0xff,
 959                         $mnemonic,$arg;
 960         }
 961     };
 962
         # unvtbl("qD,{qT},qM"): split a q-register table lookup into two
         # d-register vtbl.8 instructions, one per half of qD/qM, both using
         # {qT} as the table.  Yields a false value when the operands do not
         # match the expected shape.
 963     sub unvtbl {
 964         my $arg=shift;
 965
 966         $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
 967         sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
 968                 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
 969     }
 970
         # unvdup32("qD,qS[i]"): rewrite the 32-bit lane reference qS[i] as
         # the d register holding that lane, d(2*S + i/2)[i%2].
 971     sub unvdup32 {
 972         my $arg=shift;
 973
 974         $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
 975         sprintf "vdup.32        q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
 976     }
 977
         # unvmov32("qD[i],src"): same lane rewrite for the destination of a
         # vmov.32; the source operand text is passed through unchanged.
 978     sub unvmov32 {
 979         my $arg=shift;
 980
 981         $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
 982         sprintf "vmov.32        d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
 983     }
 984
 985     foreach(split("\n",$code)) {
             # Expand `...` fragments embedded in the templates by evaluating
             # them as Perl.
 986         s/\`([^\`]*)\`/eval($1)/geo;
 987
             # wN/xN become rN.  The vN.<arrangement> pattern accepts a single
             # digit only, which covers the v0/v1 operands used by the
             # templates in this file.
 988         s/\b[wx]([0-9]+)\b/r$1/go;              # new->old registers
 989         s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
 990         s/\/\/\s?/@ /o;                         # new->old style commentary
 991
             # Post-index immediates become write-back "!"; an 8-byte access
             # through {qN} is narrowed to the low half, {d(2N)}.
 992         # fix up remaining new-style suffixes
 993         s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo    or
 994         s/\],#[0-9]+/]!/o;
 995
             # AES mnemonics are replaced by raw bytes, "cclr R,cond" becomes
             # "mov<cond> R,#0", q-register vtbl/vdup/vmov forms are rewritten
             # by the helpers above, and the "." is dropped from "b.<cond>" and
             # "mov.<cond>"; "ret" becomes "bx lr".
 996         s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo      or
 997         s/cclr\s+([^,]+),\s*([a-z]+)/mov$2      $1,#0/o or
 998         s/vtbl\.8\s+(.*)/unvtbl($1)/geo                 or
 999         s/vdup\.32\s+(.*)/unvdup32($1)/geo              or
1000         s/vmov\.32\s+(.*)/unvmov32($1)/geo              or
1001         s/^(\s+)b\./$1b/o                               or
1002         s/^(\s+)mov\./$1mov/o                           or
1003         s/^(\s+)ret/$1bx\tlr/o;
1004
1005         print $_,"\n";
1006     }
1007 }
1008
1009 close STDOUT or die "error closing STDOUT: $!";    # STDOUT is the pipe to arm-xlate.pl: closing flushes it and waits for the translator, so a failure must not pass silently