2 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements support for ARMv8 AES instructions. The
18 # module is endian-agnostic in sense that it supports both big- and
# little-endian cases. It likewise supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the amount of utilized
21 # registers to 16, which implies additional NEON load and integer
22 # instructions. This has no effect on mighty Apple A7, where results
23 # are literally equal to the theoretical estimates based on AES
24 # instruction latencies and issue rates. On Cortex-A53, an in-order
25 # execution core, this costs up to 10-15%, which is partially
26 # compensated by implementing dedicated code path for 128-bit
27 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
28 # seems to be limited by sheer amount of NEON instructions...
32 # Key to performance of parallelize-able modes is round instruction
33 # interleaving. But which factor to use? There is optimal one for
34 # each combination of instruction latency and issue rate, beyond
35 # which increasing interleave factor doesn't pay off. While on cons
36 # side we have code size increase and resource waste on platforms for
37 # which interleave factor is too high. In other words you want it to
38 # be just right. So far interleave factor of 3x was serving well all
39 # platforms. But for ThunderX2 optimal interleave factor was measured
42 # Performance in cycles per byte processed with 128-bit key:
45 # Apple A7 2.39 1.20 1.20
46 # Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46
47 # Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93
48 # Cortex-A72 1.33 0.85/0.88 0.92/0.96
49 # Denver 1.96 0.65/0.86 0.76/0.80
50 # Mongoose 1.33 1.23/1.20 1.30/1.20
51 # Kryo 1.26 0.87/0.94 1.00/1.00
52 # ThunderX2 5.95 1.25 1.30
54 # (*) original 3.64/1.34/1.32 results were for r0p0 revision
55 # and are still same even for updated module;
56 # (**) numbers after slash are for 32-bit code, which is 3x-
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : undef;
# Locate the arm-xlate.pl "perlasm" translator: first next to this
# script, then in the shared ../../perlasm directory.  $dir is the
# directory component of our own path; NOTE(review): when $0 has no
# path separator the match fails and $dir relies on $1 being unset,
# yielding a bare relative filename — confirm that is intended.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
# Pipe everything printed to OUT through the translator ($^X is the
# running perl binary), which renders the unified 32-/64-bit "flavour"
# syntax for the target assembler and writes the result to $output.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
or die "can't call $xlate: $!";
# The raw-byte directive is assembler-specific: the Windows flavour
# (armasm) wants "DCB", the GNU assembler wants ".byte".
if ($flavour =~ /win/) {
    $_byte = "DCB";
} else {
    $_byte = ".byte";
}
80 #if __ARM_MAX_ARCH__>=7
82 $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
83 $code.=<<___ if ($flavour !~ /64/);
84 .arch armv7-a // don't confuse not-so-latest binutils with argv8 :-)
89 # define INST(a,b,c,d) $_byte c,d|0xc,a,b
92 # define INST(a,b,c,d) $_byte a,b,c,d
98 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
99 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
100 # maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
# Register aliases for ${prefix}_set_encrypt_key: integer arguments and
# counters in x/w registers, working NEON vectors in q registers.  The
# 32-bit flavour binds the vectors to q0-q3,q8-q10 instead of q0-q6 —
# presumably to stay clear of the AAPCS callee-saved d8-d15 (q4-q7)
# range; confirm against the 32-bit prologue.
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
112 .long 0x01,0x01,0x01,0x01
113 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
114 .long 0x1b,0x1b,0x1b,0x1b
116 .globl ${prefix}_set_encrypt_key
117 .type ${prefix}_set_encrypt_key,%function
119 ${prefix}_set_encrypt_key:
122 $code.=<<___ if ($flavour =~ /64/);
123 stp x29,x30,[sp,#-16]!
143 veor $zero,$zero,$zero
144 vld1.8 {$in0},[$inp],#16
145 mov $bits,#8 // reuse $bits
146 vld1.32 {$rcon,$mask},[$ptr],#32
154 vtbl.8 $key,{$in0},$mask
155 vext.8 $tmp,$zero,$in0,#12
156 vst1.32 {$in0},[$out],#16
161 vext.8 $tmp,$zero,$tmp,#12
163 vext.8 $tmp,$zero,$tmp,#12
166 vshl.u8 $rcon,$rcon,#1
170 vld1.32 {$rcon},[$ptr]
172 vtbl.8 $key,{$in0},$mask
173 vext.8 $tmp,$zero,$in0,#12
174 vst1.32 {$in0},[$out],#16
178 vext.8 $tmp,$zero,$tmp,#12
180 vext.8 $tmp,$zero,$tmp,#12
183 vshl.u8 $rcon,$rcon,#1
186 vtbl.8 $key,{$in0},$mask
187 vext.8 $tmp,$zero,$in0,#12
188 vst1.32 {$in0},[$out],#16
192 vext.8 $tmp,$zero,$tmp,#12
194 vext.8 $tmp,$zero,$tmp,#12
198 vst1.32 {$in0},[$out]
206 vld1.8 {$in1},[$inp],#8
207 vmov.i8 $key,#8 // borrow $key
208 vst1.32 {$in0},[$out],#16
209 vsub.i8 $mask,$mask,$key // adjust the mask
212 vtbl.8 $key,{$in1},$mask
213 vext.8 $tmp,$zero,$in0,#12
215 vst1.32 {$in1},[$out],#16
218 vst1.32 {$in1},[$out],#8
224 vext.8 $tmp,$zero,$tmp,#12
226 vext.8 $tmp,$zero,$tmp,#12
229 vdup.32 $tmp,${in0}[3]
232 vext.8 $in1,$zero,$in1,#12
233 vshl.u8 $rcon,$rcon,#1
237 vst1.32 {$in0},[$out],#16
249 vst1.32 {$in0},[$out],#16
252 vtbl.8 $key,{$in1},$mask
253 vext.8 $tmp,$zero,$in0,#12
254 vst1.32 {$in1},[$out],#16
259 vext.8 $tmp,$zero,$tmp,#12
261 vext.8 $tmp,$zero,$tmp,#12
264 vshl.u8 $rcon,$rcon,#1
266 vst1.32 {$in0},[$out],#16
269 vdup.32 $key,${in0}[3] // just splat
270 vext.8 $tmp,$zero,$in1,#12
274 vext.8 $tmp,$zero,$tmp,#12
276 vext.8 $tmp,$zero,$tmp,#12
287 mov x0,$ptr // return value
288 `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
290 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
292 .globl ${prefix}_set_decrypt_key
293 .type ${prefix}_set_decrypt_key,%function
295 ${prefix}_set_decrypt_key:
297 $code.=<<___ if ($flavour =~ /64/);
298 .inst 0xd503233f // paciasp
299 stp x29,x30,[sp,#-16]!
302 $code.=<<___ if ($flavour !~ /64/);
311 sub $out,$out,#240 // restore original $out
313 add $inp,$out,x12,lsl#4 // end of key schedule
315 vld1.32 {v0.16b},[$out]
316 vld1.32 {v1.16b},[$inp]
317 vst1.32 {v0.16b},[$inp],x4
318 vst1.32 {v1.16b},[$out],#16
321 vld1.32 {v0.16b},[$out]
322 vld1.32 {v1.16b},[$inp]
325 vst1.32 {v0.16b},[$inp],x4
326 vst1.32 {v1.16b},[$out],#16
330 vld1.32 {v0.16b},[$out]
332 vst1.32 {v0.16b},[$inp]
334 eor x0,x0,x0 // return value
337 $code.=<<___ if ($flavour !~ /64/);
340 $code.=<<___ if ($flavour =~ /64/);
342 .inst 0xd50323bf // autiasp
346 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
# Per-direction setup for the single-block ${prefix}_en/decrypt pair:
# $e/$mc select the AES instruction suffixes (aese/aesmc when
# encrypting, aesd/aesimc when decrypting).  Arguments live in x0-x2;
# note map("q$_",(0..3)) yields four values but only three names bind,
# so q3 is simply discarded.
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
358 .globl ${prefix}_${dir}crypt
359 .type ${prefix}_${dir}crypt,%function
361 ${prefix}_${dir}crypt:
362 ldr $rounds,[$key,#240]
363 vld1.32 {$rndkey0},[$key],#16
364 vld1.8 {$inout},[$inp]
365 sub $rounds,$rounds,#2
366 vld1.32 {$rndkey1},[$key],#16
369 aes$e $inout,$rndkey0
371 vld1.32 {$rndkey0},[$key],#16
372 subs $rounds,$rounds,#2
373 aes$e $inout,$rndkey1
375 vld1.32 {$rndkey1},[$key],#16
378 aes$e $inout,$rndkey0
380 vld1.32 {$rndkey0},[$key]
381 aes$e $inout,$rndkey1
382 veor $inout,$inout,$rndkey0
384 vst1.8 {$inout},[$out]
386 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
393 # Performance in cycles per byte.
394 # Processed with AES-ECB different key size.
395 # It shows the value before and after optimization as below:
398 # AES-128-ECB AES-192-ECB AES-256-ECB
399 # Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10
400 # Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14
402 # Optimization is implemented by loop unrolling and interleaving.
403 # Commonly, we choose the unrolling factor as 5, if the input
404 # data size smaller than 5 blocks, but not smaller than 3 blocks,
405 # choose 3 as the unrolling factor.
406 # If the input data size dsize >= 5*16 bytes, then take 5 blocks
407 # as one iteration, every loop the left size lsize -= 5*16.
408 # If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
409 # every loop lsize -=3*16.
410 # If lsize < 3*16 bytes, treat them as the tail, interleave the
411 # two blocks AES instructions.
412 # There is one special case, if the original input data size dsize
# = 16 bytes, we will treat it separately to improve the
414 # performance: one independent code block without LR, FP load and
415 # store, just looks like what the original ECB implementation does.
# Register aliases for ${prefix}_ecb_encrypt: pointer/length arguments
# in x0-x3, control words in w4-w6 with the key-tail pointer in x7 and
# the load step in x8, and data/temporary vectors in q0-q7 ($rndlast
# holds the final round key).
my ($inp,$out,$len,$key)=map("x$_",(0..3));
my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
# Convenience aliases for the single-block tail path.
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
424 ### q7 last round key
425 ### q10-q15 q7 Last 7 round keys
426 ### q8-q9 preloaded round keys except last 7 keys for big size
427 ### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte
# Extra lanes for the interleaved ECB loops.  The third lane reuses
# q9-q11; the fourth/fifth lanes are declared here but only assigned
# registers in the 64-bit flavour.
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3); # used only in 64-bit mode
my ($dat4,$in4,$tmp4);
434 if ($flavour =~ /64/) {
435 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
439 .globl ${prefix}_ecb_encrypt
440 .type ${prefix}_ecb_encrypt,%function
442 ${prefix}_ecb_encrypt:
444 $code.=<<___ if ($flavour =~ /64/);
446 // Original input data size bigger than 16, jump to big size processing.
448 vld1.8 {$dat0},[$inp]
449 cmp $enc,#0 // en- or decrypting?
450 ldr $rounds,[$key,#240]
451 vld1.32 {q5-q6},[$key],#32 // load key schedule...
456 vld1.32 {q8-q9},[$key],#32 // load key schedule...
459 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-ecb processing
464 vld1.32 {q8},[$key],#16 // load key schedule...
467 vld1.32 {q9},[$key],#16 // load key schedule...
468 subs $rounds,$rounds,#2 // bias
469 b.gt .Lecb_round_loop
471 vld1.32 {q10-q11},[$key],#32 // load key schedule...
476 vld1.32 {q12-q13},[$key],#32 // load key schedule...
481 vld1.32 {q14-q15},[$key],#32 // load key schedule...
486 vld1.32 {$rndlast},[$key]
490 veor $dat0,$dat0,$rndlast
491 vst1.8 {$dat0},[$out]
496 vld1.32 {q8-q9},[$key],#32 // load key schedule...
499 subs $rounds,$rounds,#10 // bias
501 .Lecb_dec_round_loop:
504 vld1.32 {q8},[$key],#16 // load key schedule...
507 vld1.32 {q9},[$key],#16 // load key schedule...
508 subs $rounds,$rounds,#2 // bias
509 b.gt .Lecb_dec_round_loop
511 vld1.32 {q10-q11},[$key],#32 // load key schedule...
516 vld1.32 {q12-q13},[$key],#32 // load key schedule...
521 vld1.32 {q14-q15},[$key],#32 // load key schedule...
526 vld1.32 {$rndlast},[$key]
530 veor $dat0,$dat0,$rndlast
531 vst1.8 {$dat0},[$out]
535 $code.=<<___ if ($flavour =~ /64/);
536 stp x29,x30,[sp,#-16]!
539 $code.=<<___ if ($flavour !~ /64/);
542 vstmdb sp!,{d8-d15} @ ABI specification says so
543 ldmia ip,{r4-r5} @ load remaining args
551 cmp $enc,#0 // en- or decrypting?
552 ldr $rounds,[$key,#240]
554 vld1.8 {$dat},[$inp],$step
556 vld1.32 {q8-q9},[$key] // load key schedule...
557 sub $rounds,$rounds,#6
558 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
559 sub $rounds,$rounds,#2
560 vld1.32 {q10-q11},[$key_],#32
561 vld1.32 {q12-q13},[$key_],#32
562 vld1.32 {q14-q15},[$key_],#32
563 vld1.32 {$rndlast},[$key_]
569 vld1.8 {$dat1},[$inp],#16
570 subs $len,$len,#32 // bias
572 vorr $in1,$dat1,$dat1
573 vorr $dat2,$dat1,$dat1
578 vld1.8 {$dat2},[$inp],#16
580 $code.=<<___ if ($flavour =~ /64/);
584 vld1.8 {$dat3},[$inp],#16
585 vld1.8 {$dat4},[$inp],#16
586 sub $len,$len,#32 // bias
600 vld1.32 {q8},[$key_],#16
612 vld1.32 {q9},[$key_],#16
625 cmp $len,#0x40 // because .Lecb_enc_tail4x
638 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
651 add $inp,$inp,x6 // $inp is adjusted in such way that
652 // at exit from the loop $dat1-$dat4
653 // are loaded with last "words"
654 add x6,$len,#0x60 // because .Lecb_enc_tail4x
701 vld1.8 {$in0},[$inp],#16
703 vld1.8 {$in1},[$inp],#16
705 vld1.8 {$in2},[$inp],#16
707 vld1.8 {$in3},[$inp],#16
709 vld1.8 {$in4},[$inp],#16
710 cbz x6,.Lecb_enc_tail4x
711 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
712 veor $tmp0,$rndlast,$dat0
714 veor $tmp1,$rndlast,$dat1
716 veor $tmp2,$rndlast,$dat2
718 veor $tmp3,$rndlast,$dat3
720 veor $tmp4,$rndlast,$dat4
721 vst1.8 {$tmp0},[$out],#16
723 vst1.8 {$tmp1},[$out],#16
725 vst1.8 {$tmp2},[$out],#16
726 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
727 vst1.8 {$tmp3},[$out],#16
728 vst1.8 {$tmp4},[$out],#16
745 veor $tmp1,$rndlast,$dat1
746 veor $tmp2,$rndlast,$dat2
747 veor $tmp3,$rndlast,$dat3
748 veor $tmp4,$rndlast,$dat4
749 vst1.8 {$tmp1},[$out],#16
750 vst1.8 {$tmp2},[$out],#16
751 vst1.8 {$tmp3},[$out],#16
752 vst1.8 {$tmp4},[$out],#16
765 vld1.32 {q8},[$key_],#16
773 vld1.32 {q9},[$key_],#16
783 mov.lo x6,$len // x6, $cnt, is zero at this point
790 add $inp,$inp,x6 // $inp is adjusted in such way that
791 // at exit from the loop $dat1-$dat2
792 // are loaded with last "words"
800 vld1.8 {$in0},[$inp],#16
807 vld1.8 {$in1},[$inp],#16
814 vld1.8 {$in2},[$inp],#16
818 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
820 veor $tmp0,$rndlast,$dat0
821 veor $tmp1,$rndlast,$dat1
822 veor $dat2,$dat2,$rndlast
823 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
824 vst1.8 {$tmp0},[$out],#16
826 vst1.8 {$tmp1},[$out],#16
828 vst1.8 {$dat2},[$out],#16
841 vld1.32 {q8},[$key_],#16
847 vld1.32 {q9},[$key_],#16
874 veor $tmp1,$rndlast,$dat1
875 veor $tmp2,$rndlast,$dat2
876 vst1.8 {$tmp1},[$out],#16
877 vst1.8 {$tmp2},[$out],#16
881 veor $tmp1,$rndlast,$dat2
882 vst1.8 {$tmp1},[$out],#16
889 vld1.8 {$dat1},[$inp],#16
890 subs $len,$len,#32 // bias
892 vorr $in1,$dat1,$dat1
893 vorr $dat2,$dat1,$dat1
898 vld1.8 {$dat2},[$inp],#16
900 $code.=<<___ if ($flavour =~ /64/);
904 vld1.8 {$dat3},[$inp],#16
905 vld1.8 {$dat4},[$inp],#16
906 sub $len,$len,#32 // bias
920 vld1.32 {q8},[$key_],#16
932 vld1.32 {q9},[$key_],#16
945 cmp $len,#0x40 // because .Lecb_tail4x
958 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
971 add $inp,$inp,x6 // $inp is adjusted in such way that
972 // at exit from the loop $dat1-$dat4
973 // are loaded with last "words"
974 add x6,$len,#0x60 // because .Lecb_tail4x
1021 vld1.8 {$in0},[$inp],#16
1023 vld1.8 {$in1},[$inp],#16
1025 vld1.8 {$in2},[$inp],#16
1027 vld1.8 {$in3},[$inp],#16
1029 vld1.8 {$in4},[$inp],#16
1031 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1032 veor $tmp0,$rndlast,$dat0
1033 vorr $dat0,$in0,$in0
1034 veor $tmp1,$rndlast,$dat1
1035 vorr $dat1,$in1,$in1
1036 veor $tmp2,$rndlast,$dat2
1037 vorr $dat2,$in2,$in2
1038 veor $tmp3,$rndlast,$dat3
1039 vorr $dat3,$in3,$in3
1040 veor $tmp4,$rndlast,$dat4
1041 vst1.8 {$tmp0},[$out],#16
1042 vorr $dat4,$in4,$in4
1043 vst1.8 {$tmp1},[$out],#16
1045 vst1.8 {$tmp2},[$out],#16
1046 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1047 vst1.8 {$tmp3},[$out],#16
1048 vst1.8 {$tmp4},[$out],#16
1049 b.hs .Loop5x_ecb_dec
1055 subs $len,$len,#0x30
1056 vorr $dat0,$in2,$in2
1057 vorr $dat1,$in3,$in3
1058 vorr $dat2,$in4,$in4
1065 veor $tmp1,$rndlast,$dat1
1066 veor $tmp2,$rndlast,$dat2
1067 veor $tmp3,$rndlast,$dat3
1068 veor $tmp4,$rndlast,$dat4
1069 vst1.8 {$tmp1},[$out],#16
1070 vst1.8 {$tmp2},[$out],#16
1071 vst1.8 {$tmp3},[$out],#16
1072 vst1.8 {$tmp4},[$out],#16
1085 vld1.32 {q8},[$key_],#16
1093 vld1.32 {q9},[$key_],#16
1094 b.gt .Loop3x_ecb_dec
1102 subs $len,$len,#0x30
1103 mov.lo x6,$len // x6, $cnt, is zero at this point
1110 add $inp,$inp,x6 // $inp is adjusted in such way that
1111 // at exit from the loop $dat1-$dat2
1112 // are loaded with last "words"
1120 vld1.8 {$in0},[$inp],#16
1127 vld1.8 {$in1},[$inp],#16
1134 vld1.8 {$in2},[$inp],#16
1138 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1140 veor $tmp0,$rndlast,$dat0
1141 veor $tmp1,$rndlast,$dat1
1142 veor $dat2,$dat2,$rndlast
1143 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1144 vst1.8 {$tmp0},[$out],#16
1145 vorr $dat0,$in0,$in0
1146 vst1.8 {$tmp1},[$out],#16
1147 vorr $dat1,$in1,$in1
1148 vst1.8 {$dat2},[$out],#16
1149 vorr $dat2,$in2,$in2
1150 b.hs .Loop3x_ecb_dec
1161 vld1.32 {q8},[$key_],#16
1167 vld1.32 {q9},[$key_],#16
1194 veor $tmp1,$rndlast,$dat1
1195 veor $tmp2,$rndlast,$dat2
1196 vst1.8 {$tmp1},[$out],#16
1197 vst1.8 {$tmp2},[$out],#16
1201 veor $tmp1,$rndlast,$dat2
1202 vst1.8 {$tmp1},[$out],#16
1207 $code.=<<___ if ($flavour !~ /64/);
1209 ldmia sp!,{r4-r8,pc}
1211 $code.=<<___ if ($flavour =~ /64/);
1214 $code.=<<___ if ($flavour =~ /64/);
1219 .size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
# Register aliases for ${prefix}_cbc_encrypt.  Note the deliberate
# aliasing: $rounds shares w5 with $enc, $step1/$key5 both map to x12,
# and $key4 shares register 6 with $cnt — the corresponding live
# ranges must not overlap.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
# Convenience aliases for the single-block tail path.
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
1230 ### q8-q15 preloaded key schedule
1233 .globl ${prefix}_cbc_encrypt
1234 .type ${prefix}_cbc_encrypt,%function
1236 ${prefix}_cbc_encrypt:
1238 $code.=<<___ if ($flavour =~ /64/);
1239 stp x29,x30,[sp,#-16]!
1242 $code.=<<___ if ($flavour !~ /64/);
1244 stmdb sp!,{r4-r8,lr}
1245 vstmdb sp!,{d8-d15} @ ABI specification says so
1246 ldmia ip,{r4-r5} @ load remaining args
1254 cmp $enc,#0 // en- or decrypting?
1255 ldr $rounds,[$key,#240]
1257 vld1.8 {$ivec},[$ivp]
1258 vld1.8 {$dat},[$inp],$step
1260 vld1.32 {q8-q9},[$key] // load key schedule...
1261 sub $rounds,$rounds,#6
1262 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
1263 sub $rounds,$rounds,#2
1264 vld1.32 {q10-q11},[$key_],#32
1265 vld1.32 {q12-q13},[$key_],#32
1266 vld1.32 {q14-q15},[$key_],#32
1267 vld1.32 {$rndlast},[$key_]
1274 veor $dat,$dat,$ivec
1275 veor $rndzero_n_last,q8,$rndlast
1278 vld1.32 {$in0-$in1},[$key_]
1280 add $key4,$key,#16*4
1281 add $key5,$key,#16*5
1284 add $key6,$key,#16*6
1285 add $key7,$key,#16*7
1292 vst1.8 {$ivec},[$out],#16
1298 vld1.32 {q8},[$key4]
1302 vld1.32 {q9},[$key5]
1307 vld1.32 {q8},[$key6]
1310 vld1.32 {q9},[$key7]
1324 vld1.8 {q8},[$inp],$step
1327 veor q8,q8,$rndzero_n_last
1330 vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
1334 veor $ivec,$dat,$rndlast
1337 vst1.8 {$ivec},[$out],#16
1342 vld1.32 {$in0-$in1},[$key_]
1345 b .Lenter_cbc_enc128
1349 vst1.8 {$ivec},[$out],#16
1363 vld1.8 {q8},[$inp],$step
1370 veor q8,q8,$rndzero_n_last
1372 veor $ivec,$dat,$rndlast
1373 b.hs .Loop_cbc_enc128
1375 vst1.8 {$ivec},[$out],#16
# Extra lanes for the interleaved CBC-decrypt loops.  The third lane
# reuses q9-q11; lanes four and five are declared here but only bound
# to registers in the 64-bit flavour.
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3); # used only in 64-bit mode
my ($dat4,$in4,$tmp4);
1383 if ($flavour =~ /64/) {
1384 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
1390 vld1.8 {$dat2},[$inp],#16
1391 subs $len,$len,#32 // bias
1394 vorr $dat1,$dat,$dat
1395 vorr $in2,$dat2,$dat2
1398 vorr $dat1,$dat2,$dat2
1399 vld1.8 {$dat2},[$inp],#16
1401 vorr $in1,$dat1,$dat1
1402 vorr $in2,$dat2,$dat2
1404 $code.=<<___ if ($flavour =~ /64/);
1406 b.lo .Loop3x_cbc_dec
1408 vld1.8 {$dat3},[$inp],#16
1409 vld1.8 {$dat4},[$inp],#16
1410 sub $len,$len,#32 // bias
1412 vorr $in3,$dat3,$dat3
1413 vorr $in4,$dat4,$dat4
1426 vld1.32 {q8},[$key_],#16
1438 vld1.32 {q9},[$key_],#16
1439 b.gt .Loop5x_cbc_dec
1451 cmp $len,#0x40 // because .Lcbc_tail4x
1464 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
1477 add $inp,$inp,x6 // $inp is adjusted in such way that
1478 // at exit from the loop $dat1-$dat4
1479 // are loaded with last "words"
1480 add x6,$len,#0x60 // because .Lcbc_tail4x
1526 veor $tmp0,$ivec,$rndlast
1528 veor $tmp1,$in0,$rndlast
1529 vld1.8 {$in0},[$inp],#16
1531 veor $tmp2,$in1,$rndlast
1532 vld1.8 {$in1},[$inp],#16
1534 veor $tmp3,$in2,$rndlast
1535 vld1.8 {$in2},[$inp],#16
1537 veor $tmp4,$in3,$rndlast
1538 vld1.8 {$in3},[$inp],#16
1540 vorr $ivec,$in4,$in4
1541 vld1.8 {$in4},[$inp],#16
1543 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1544 veor $tmp0,$tmp0,$dat0
1545 vorr $dat0,$in0,$in0
1546 veor $tmp1,$tmp1,$dat1
1547 vorr $dat1,$in1,$in1
1548 veor $tmp2,$tmp2,$dat2
1549 vorr $dat2,$in2,$in2
1550 veor $tmp3,$tmp3,$dat3
1551 vorr $dat3,$in3,$in3
1552 veor $tmp4,$tmp4,$dat4
1553 vst1.8 {$tmp0},[$out],#16
1554 vorr $dat4,$in4,$in4
1555 vst1.8 {$tmp1},[$out],#16
1557 vst1.8 {$tmp2},[$out],#16
1558 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1559 vst1.8 {$tmp3},[$out],#16
1560 vst1.8 {$tmp4},[$out],#16
1561 b.hs .Loop5x_cbc_dec
1567 subs $len,$len,#0x30
1568 vorr $dat0,$in2,$in2
1570 vorr $dat1,$in3,$in3
1572 vorr $dat2,$in4,$in4
1580 veor $tmp1,$tmp0,$dat1
1581 veor $tmp2,$tmp2,$dat2
1582 veor $tmp3,$tmp3,$dat3
1583 veor $tmp4,$tmp4,$dat4
1584 vst1.8 {$tmp1},[$out],#16
1585 vst1.8 {$tmp2},[$out],#16
1586 vst1.8 {$tmp3},[$out],#16
1587 vst1.8 {$tmp4},[$out],#16
1600 vld1.32 {q8},[$key_],#16
1608 vld1.32 {q9},[$key_],#16
1609 b.gt .Loop3x_cbc_dec
1617 veor $tmp0,$ivec,$rndlast
1618 subs $len,$len,#0x30
1619 veor $tmp1,$in0,$rndlast
1620 mov.lo x6,$len // x6, $cnt, is zero at this point
1627 veor $tmp2,$in1,$rndlast
1628 add $inp,$inp,x6 // $inp is adjusted in such way that
1629 // at exit from the loop $dat1-$dat2
1630 // are loaded with last "words"
1631 vorr $ivec,$in2,$in2
1639 vld1.8 {$in0},[$inp],#16
1646 vld1.8 {$in1},[$inp],#16
1653 vld1.8 {$in2},[$inp],#16
1657 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1659 veor $tmp0,$tmp0,$dat0
1660 veor $tmp1,$tmp1,$dat1
1661 veor $dat2,$dat2,$tmp2
1662 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1663 vst1.8 {$tmp0},[$out],#16
1664 vorr $dat0,$in0,$in0
1665 vst1.8 {$tmp1},[$out],#16
1666 vorr $dat1,$in1,$in1
1667 vst1.8 {$dat2},[$out],#16
1668 vorr $dat2,$in2,$in2
1669 b.hs .Loop3x_cbc_dec
1680 vld1.32 {q8},[$key_],#16
1686 vld1.32 {q9},[$key_],#16
1706 veor $tmp1,$ivec,$rndlast
1711 veor $tmp2,$in1,$rndlast
1715 veor $tmp1,$tmp1,$dat1
1716 veor $tmp2,$tmp2,$dat2
1717 vorr $ivec,$in2,$in2
1718 vst1.8 {$tmp1},[$out],#16
1719 vst1.8 {$tmp2},[$out],#16
1723 veor $tmp1,$tmp1,$dat2
1724 vorr $ivec,$in2,$in2
1725 vst1.8 {$tmp1},[$out],#16
1728 vst1.8 {$ivec},[$ivp]
1732 $code.=<<___ if ($flavour !~ /64/);
1734 ldmia sp!,{r4-r8,pc}
1736 $code.=<<___ if ($flavour =~ /64/);
1741 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
# Register aliases for ${prefix}_ctr32_encrypt_blocks: arguments in
# x0-x4, counter words in w8-w10/w12.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12"; # aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

# used only in 64-bit mode...
# (map yields q16..q23 but only the first four values bind here)
my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));

my ($dat,$tmp)=($dat0,$tmp0);
1758 ### q8-q15 preloaded key schedule
1761 .globl ${prefix}_ctr32_encrypt_blocks
1762 .type ${prefix}_ctr32_encrypt_blocks,%function
1764 ${prefix}_ctr32_encrypt_blocks:
1766 $code.=<<___ if ($flavour =~ /64/);
1767 stp x29,x30,[sp,#-16]!
1770 $code.=<<___ if ($flavour !~ /64/);
1772 stmdb sp!,{r4-r10,lr}
1773 vstmdb sp!,{d8-d15} @ ABI specification says so
1774 ldr r4, [ip] @ load remaining arg
1777 ldr $rounds,[$key,#240]
1779 ldr $ctr, [$ivp, #12]
1781 vld1.8 {$dat0},[$ivp]
1783 vld1.32 {$dat0},[$ivp]
1785 vld1.32 {q8-q9},[$key] // load key schedule...
1786 sub $rounds,$rounds,#4
1789 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
1790 sub $rounds,$rounds,#2
1791 vld1.32 {q12-q13},[$key_],#32
1792 vld1.32 {q14-q15},[$key_],#32
1793 vld1.32 {$rndlast},[$key_]
1800 vorr $dat1,$dat0,$dat0
1801 add $tctr1, $ctr, #1
1802 vorr $dat2,$dat0,$dat0
1804 vorr $ivec,$dat0,$dat0
1806 vmov.32 ${dat1}[3],$tctr1
1809 sub $len,$len,#3 // bias
1810 vmov.32 ${dat2}[3],$tctr2
1812 $code.=<<___ if ($flavour =~ /64/);
1818 vorr $dat3,$dat0,$dat0
1820 vorr $dat4,$dat0,$dat0
1822 vmov.32 ${dat3}[3],w13
1823 sub $len,$len,#2 // bias
1824 vmov.32 ${dat4}[3],w14
1840 vld1.32 {q8},[$key_],#16
1852 vld1.32 {q9},[$key_],#16
1866 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1878 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1914 vld1.8 {$in0},[$inp],#16
1917 vld1.8 {$in1},[$inp],#16
1920 vld1.8 {$in2},[$inp],#16
1923 vld1.8 {$in3},[$inp],#16
1926 vld1.8 {$in4},[$inp],#16
1929 veor $in0,$in0,$rndlast
1931 veor $in1,$in1,$rndlast
1933 veor $in2,$in2,$rndlast
1935 veor $in3,$in3,$rndlast
1937 veor $in4,$in4,$rndlast
1939 veor $in0,$in0,$dat0
1940 vorr $dat0,$ivec,$ivec
1941 veor $in1,$in1,$dat1
1942 vorr $dat1,$ivec,$ivec
1943 veor $in2,$in2,$dat2
1944 vorr $dat2,$ivec,$ivec
1945 veor $in3,$in3,$dat3
1946 vorr $dat3,$ivec,$ivec
1947 veor $in4,$in4,$dat4
1948 vorr $dat4,$ivec,$ivec
1950 vst1.8 {$in0},[$out],#16
1951 vmov.32 ${dat0}[3],$tctr0
1952 vst1.8 {$in1},[$out],#16
1953 vmov.32 ${dat1}[3],$tctr1
1954 vst1.8 {$in2},[$out],#16
1955 vmov.32 ${dat2}[3],$tctr2
1956 vst1.8 {$in3},[$out],#16
1957 vmov.32 ${dat3}[3],w13
1958 vst1.8 {$in4},[$out],#16
1959 vmov.32 ${dat4}[3],w14
1962 cbz $len,.Lctr32_done
1976 sub $len,$len,#3 // bias
1990 vld1.32 {q8},[$key_],#16
1998 vld1.32 {q9},[$key_],#16
2005 vld1.8 {$in0},[$inp],#16
2006 vorr $dat0,$ivec,$ivec
2009 vld1.8 {$in1},[$inp],#16
2010 vorr $dat1,$ivec,$ivec
2015 vld1.8 {$in2},[$inp],#16
2019 vorr $dat2,$ivec,$ivec
2025 veor $in0,$in0,$rndlast
2029 veor $in1,$in1,$rndlast
2035 veor $in2,$in2,$rndlast
2039 vmov.32 ${dat0}[3], $tctr0
2045 vmov.32 ${dat1}[3], $tctr1
2049 vmov.32 ${dat2}[3], $tctr2
2055 veor $in0,$in0,$tmp0
2056 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2057 vst1.8 {$in0},[$out],#16
2058 veor $in1,$in1,$tmp1
2060 vst1.8 {$in1},[$out],#16
2061 veor $in2,$in2,$tmp2
2062 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2063 vst1.8 {$in2},[$out],#16
2077 vld1.32 {q8},[$key_],#16
2083 vld1.32 {q9},[$key_],#16
2094 vld1.8 {$in0},[$inp],$step
2099 vld1.8 {$in1},[$inp]
2104 veor $in0,$in0,$rndlast
2109 veor $in1,$in1,$rndlast
2114 veor $in0,$in0,$dat0
2115 veor $in1,$in1,$dat1
2116 vst1.8 {$in0},[$out],#16
2118 vst1.8 {$in1},[$out]
2122 $code.=<<___ if ($flavour !~ /64/);
2124 ldmia sp!,{r4-r10,pc}
2126 $code.=<<___ if ($flavour =~ /64/);
2131 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
2137 ########################################
2138 if ($flavour =~ /64/) { ######## 64-bit code
2140 "aesd" => 0x4e285800, "aese" => 0x4e284800,
2141 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
2143 local *unaes = sub {
2144 my ($mnemonic,$arg)=@_;
2146 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
2147 sprintf ".inst\t0x%08x\t//%s %s",
2148 $opcode{$mnemonic}|$1|($2<<5),
2152 foreach(split("\n",$code)) {
2153 s/\`([^\`]*)\`/eval($1)/geo;
2155 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
2156 s/@\s/\/\//o; # old->new style commentary
2158 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
2159 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
2160 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
2161 s/vmov\.i8/movi/o or # fix up legacy mnemonics
2163 s/vrev32\.8/rev32/o or
2164 s/vtst\.8/cmtst/o or
2166 s/^(\s+)v/$1/o or # strip off v prefix
2167 s/\bbx\s+lr\b/ret/o;
2169 # fix up remaining legacy suffixes
2171 m/\],#8/o and s/\.16b/\.8b/go;
2172 s/\.[ui]?32//o and s/\.16b/\.4s/go;
2173 s/\.[ui]?64//o and s/\.16b/\.2d/go;
2174 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
2178 } else { ######## 32-bit code
2180 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
2181 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
2183 local *unaes = sub {
2184 my ($mnemonic,$arg)=@_;
2186 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
2187 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
2188 |(($2&7)<<1) |(($2&8)<<2);
2189 # since ARMv7 instructions are always encoded little-endian.
2190 # correct solution is to use .inst directive, but older
2191 # assemblers don't implement it:-(
2192 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
2193 $word&0xff,($word>>8)&0xff,
2194 ($word>>16)&0xff,($word>>24)&0xff,
2202 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
2203 sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
2204 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
2210 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
2211 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
2217 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
2218 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
2221 foreach(split("\n",$code)) {
2222 s/\`([^\`]*)\`/eval($1)/geo;
2224 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
2225 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
2226 s/\/\/\s?/@ /o; # new->old style commentary
2228 # fix up remaining new-style suffixes
2229 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
2232 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
2233 s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
2234 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
2235 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
2236 s/vmov\.32\s+(.*)/unvmov32($1)/geo or
2237 s/^(\s+)b\./$1b/o or
2238 s/^(\s+)ret/$1bx\tlr/o;
2240 if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
# Flush and close STDOUT, propagating any failure — NOTE(review):
# STDOUT is presumably redirected to the arm-xlate.pl pipe earlier in
# the file (not visible in this chunk), so a failure here means the
# translator or the final write failed.
close STDOUT or die "error closing STDOUT: $!";