crypto/aes/asm/bsaes-armv7.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2012-2018 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 #
  16 # Specific modes and adaptation for Linux kernel by Ard Biesheuvel
  17 # <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
  18 # granted.
  19 # ====================================================================
  20
  21 # Bit-sliced AES for ARM NEON
  22 #
  23 # February 2012.
  24 #
  25 # This implementation is a direct adaptation of the bsaes-x86_64 module
  26 # for ARM NEON, except that this module is endian-neutral [in the sense
  27 # that it can be compiled for either endianness] by courtesy of vld1.8's
  28 # neutrality. The initial version doesn't implement an interface to
  29 # OpenSSL, only low-level primitives and unsupported entry points, just
  30 # enough to collect performance results, which for a Cortex-A8 core are:
  31 #
  32 # encrypt       19.5 cycles per byte processed with 128-bit key
  33 # decrypt       22.1 cycles per byte processed with 128-bit key
  34 # key conv.     440  cycles per 128-bit key/0.18 of 8x block
  35 #
  36 # Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts one in 19.7,
  37 # which is [much] worse than anticipated (for further details see
  38 # http://www.openssl.org/~appro/Snapdragon-S4.html).
  39 #
  40 # Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
  41 # manages in 20.0 cycles].
  42 #
  43 # When comparing to x86_64 results keep in mind that NEON unit is
  44 # [mostly] single-issue and thus can't [fully] benefit from
  45 # instruction-level parallelism. And when comparing to aes-armv4
  46 # results keep in mind key schedule conversion overhead (see
  47 # bsaes-x86_64.pl for further details)...
  48 #
  49 #                                               <appro@openssl.org>
  50
  51 # April-August 2013
  52 #
  53 # Add CBC, CTR and XTS subroutines, adapt for kernel use.
  54 #
  55 #                                       <ard.biesheuvel@linaro.org>
  56
  57 $flavour = shift;
     # Command line is "[flavour] ... output": an argument shaped like a file
     # name (word characters, a dot, an extension) is taken as the output;
     # a first argument of any other shape is the flavour.
  58 if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
  59 else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
  60
  61 if ($flavour && $flavour ne "void") {
     # Pipe the generated text through arm-xlate.pl, looked up first next to
     # this script and then under ../../perlasm/.
  62     $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  63     ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  64     ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  65     die "can't locate arm-xlate.pl";
  66
     # "or" (not "||") so that the check applies to open() itself.
  67     open STDOUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
  68 } else {
     # No translator: write straight to the output file.  When no output name
     # was parsed, leave the inherited STDOUT alone instead of re-opening it
     # on an empty file name.
  69     if ($output) { open STDOUT,">$output" or die "can't open $output: $!"; }
  70 }
  71
  72 my ($inp,$out,$len,$key)=qw(r0 r1 r2 r3);      # core registers r0..r3, named by role
  73 my @XMM=map { "q$_" } 0..15;                   # register names "q0".."q15"
  74
  75 {
  76 my ($key,$rounds,$const)=("r4","r5","r6");
  77
  78 sub Dlo     { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }      # "qN" -> "d(2N)" (first half of qN); "" when the argument names no q register.  No () prototype: the sub takes one argument.
  79 sub Dhi     { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }      # "qN" -> "d(2N+1)" (second half of qN); "" when the argument names no q register.  No () prototype: the sub takes one argument.
  80
  81 sub Sbox {
  82 # Three chained stages: InBasisChange, Inv_GF256, OutBasisChange.
  83 # input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  84 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
  85 my @bits    = @_[0..7];         # the eight registers being transformed
  86 my @scratch = @_[8..15];        # t0-t3 followed by s0-s3, handed to Inv_GF256
  87         InBasisChange   (@bits);
  88         Inv_GF256       (@bits[6,5,0,3,7,1,4,2], @scratch);
  89         OutBasisChange  (@bits[7,1,4,2,6,5,0,3]);
  90 }
  91
  92 sub InBasisChange {
     # Appends 13 veor instructions to $code that combine the eight registers
     # passed in, in place; no scratch registers are used.  First stage of Sbox.
  93 # input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  94 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
  95 my @b=@_[0..7];
  96 $code.=<<___;
  97         veor    @b[2], @b[2], @b[1]
  98         veor    @b[5], @b[5], @b[6]
  99         veor    @b[3], @b[3], @b[0]
 100         veor    @b[6], @b[6], @b[2]
 101         veor    @b[5], @b[5], @b[0]
 102
 103         veor    @b[6], @b[6], @b[3]
 104         veor    @b[3], @b[3], @b[7]
 105         veor    @b[7], @b[7], @b[5]
 106         veor    @b[3], @b[3], @b[4]
 107         veor    @b[4], @b[4], @b[5]
 108
 109         veor    @b[2], @b[2], @b[7]
 110         veor    @b[3], @b[3], @b[1]
 111         veor    @b[1], @b[1], @b[5]
 112 ___
 113 }
 114
 115 sub OutBasisChange {
     # Appends 11 veor instructions to $code that combine the eight registers
     # passed in, in place; no scratch registers are used.  Last stage of Sbox.
 116 # input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
 117 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
 118 my @b=@_[0..7];
 119 $code.=<<___;
 120         veor    @b[0], @b[0], @b[6]
 121         veor    @b[1], @b[1], @b[4]
 122         veor    @b[4], @b[4], @b[6]
 123         veor    @b[2], @b[2], @b[0]
 124         veor    @b[6], @b[6], @b[1]
 125
 126         veor    @b[1], @b[1], @b[5]
 127         veor    @b[5], @b[5], @b[3]
 128         veor    @b[3], @b[3], @b[7]
 129         veor    @b[7], @b[7], @b[5]
 130         veor    @b[2], @b[2], @b[5]
 131
 132         veor    @b[4], @b[4], @b[7]
 133 ___
 134 }
 135
 136 sub InvSbox {
 137 # Three chained stages: InvInBasisChange, Inv_GF256, InvOutBasisChange.
 138 # input in lsb  > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
 139 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
 140 my @bits    = @_[0..7];         # the eight registers being transformed
 141 my @scratch = @_[8..15];        # t0-t3 followed by s0-s3, handed to Inv_GF256
 142         InvInBasisChange        (@bits);
 143         Inv_GF256               (@bits[5,1,2,6,3,7,0,4], @scratch);
 144         InvOutBasisChange       (@bits[3,7,0,4,5,1,2,6]);
 145 }
 146
 147 sub InvInBasisChange {          # OutBasisChange in reverse (with twist)
     # Appends 12 veor instructions to $code, in place on the eight registers
     # passed in.  The arguments are re-indexed on entry, so @b[i] below is
     # NOT the caller's i-th register.
 148 my @b=@_[5,1,2,6,3,7,0,4];
     # The heredoc statement is terminated with ';' like every other sub here,
     # so that code can safely be added after it.
 149 $code.=<<___;
 150          veor   @b[1], @b[1], @b[7]
 151         veor    @b[4], @b[4], @b[7]
 152
 153         veor    @b[7], @b[7], @b[5]
 154          veor   @b[1], @b[1], @b[3]
 155         veor    @b[2], @b[2], @b[5]
 156         veor    @b[3], @b[3], @b[7]
 157
 158         veor    @b[6], @b[6], @b[1]
 159         veor    @b[2], @b[2], @b[0]
 160          veor   @b[5], @b[5], @b[3]
 161         veor    @b[4], @b[4], @b[6]
 162         veor    @b[0], @b[0], @b[6]
 163         veor    @b[1], @b[1], @b[4]
 164 ___
 165 }
 166
 167 sub InvOutBasisChange {         # InBasisChange in reverse
     # Appends 13 veor instructions to $code, in place on the eight registers
     # passed in.  The arguments are re-indexed on entry, so @b[i] below is
     # NOT the caller's i-th register.
 168 my @b=@_[2,5,7,3,6,1,0,4];
 169 $code.=<<___;
 170         veor    @b[1], @b[1], @b[5]
 171         veor    @b[2], @b[2], @b[7]
 172
 173         veor    @b[3], @b[3], @b[1]
 174         veor    @b[4], @b[4], @b[5]
 175         veor    @b[7], @b[7], @b[5]
 176         veor    @b[3], @b[3], @b[4]
 177          veor   @b[5], @b[5], @b[0]
 178         veor    @b[3], @b[3], @b[7]
 179          veor   @b[6], @b[6], @b[2]
 180          veor   @b[2], @b[2], @b[1]
 181         veor    @b[6], @b[6], @b[3]
 182
 183         veor    @b[3], @b[3], @b[0]
 184         veor    @b[5], @b[5], @b[6]
 185 ___
 186 }
 187
 188 sub Mul_GF4 {
 189 #;*************************************************************
 190 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
 191 #;*************************************************************
     # Note: this version takes TWO temporaries, $t0 and $t1; both are
     # overwritten.  $y0/$y1 are only read.  Emits 7 instructions.
 192 my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
 193 $code.=<<___;
 194         veor    $t0, $y0, $y1
 195         vand    $t0, $t0, $x0
 196         veor    $x0, $x0, $x1
 197         vand    $t1, $x1, $y0
 198         vand    $x0, $x0, $y1
 199         veor    $x1, $t1, $t0
 200         veor    $x0, $x0, $t1
 201 ___
 202 }
 203
 204 sub Mul_GF4_N {                         # not used, see next subroutine
 205 # multiply and scale by N
     # Result replaces $x0/$x1; $y0/$y1 are only read; the single temporary
     # $t0 is overwritten.  Emits 7 instructions.
 206 my ($x0,$x1,$y0,$y1,$t0)=@_;
 207 $code.=<<___;
 208         veor    $t0, $y0, $y1
 209         vand    $t0, $t0, $x0
 210         veor    $x0, $x0, $x1
 211         vand    $x1, $x1, $y0
 212         vand    $x0, $x0, $y1
 213         veor    $x1, $x1, $x0
 214         veor    $x0, $x0, $t0
 215 ___
 216 }
 217
 218 sub Mul_GF4_N_GF4 {
 219 # interleaved Mul_GF4_N and Mul_GF4
     # Arguments come as two groups of five: (x0,x1,y0,y1,t0) for the
     # Mul_GF4_N sequence and (x2,x3,y2,y3,t1) for the Mul_GF4 one.  In the
     # heredoc the second group's instructions are the ones indented by one
     # extra space.  Unlike the standalone Mul_GF4, the second multiplication
     # needs only one temporary here because it reuses $x3 as scratch.
     # Results replace x0-x3; the y registers are only read.
 220 my ($x0,$x1,$y0,$y1,$t0,
 221     $x2,$x3,$y2,$y3,$t1)=@_;
 222 $code.=<<___;
 223         veor    $t0, $y0, $y1
 224          veor   $t1, $y2, $y3
 225         vand    $t0, $t0, $x0
 226          vand   $t1, $t1, $x2
 227         veor    $x0, $x0, $x1
 228          veor   $x2, $x2, $x3
 229         vand    $x1, $x1, $y0
 230          vand   $x3, $x3, $y2
 231         vand    $x0, $x0, $y1
 232          vand   $x2, $x2, $y3
 233         veor    $x1, $x1, $x0
 234          veor   $x2, $x2, $x3
 235         veor    $x0, $x0, $t0
 236          veor   $x3, $x3, $t1
 237 ___
 238 }
 239 sub Mul_GF16_2 {
     # Arguments: eight registers @x (handled as two groups, x0-x3 and x4-x7),
     # four registers @y used as the second operand for both groups, and four
     # temporaries @t.  Results replace @x; all of @t is overwritten.
     # @y[0] and @y[1] are XORed with @y[2] and @y[3] part-way through and
     # XORed with them again later, which puts the original values back.
     # NOTE(review): the name suggests two GF(2^4) multiplications by a shared
     # multiplier -- confirm against the reference implementation.
 240 my @x=@_[0..7];
 241 my @y=@_[8..11];
 242 my @t=@_[12..15];
 243 $code.=<<___;
 244         veor    @t[0], @x[0], @x[2]
 245         veor    @t[1], @x[1], @x[3]
 246 ___
 247         &Mul_GF4        (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
 248 $code.=<<___;
 249         veor    @y[0], @y[0], @y[2]
 250         veor    @y[1], @y[1], @y[3]
 251 ___
 252         Mul_GF4_N_GF4   (@t[0], @t[1], @y[0], @y[1], @t[3],
 253                          @x[2], @x[3], @y[2], @y[3], @t[2]);
 254 $code.=<<___;
 255         veor    @x[0], @x[0], @t[0]
 256         veor    @x[2], @x[2], @t[0]
 257         veor    @x[1], @x[1], @t[1]
 258         veor    @x[3], @x[3], @t[1]
 259
 260         veor    @t[0], @x[4], @x[6]
 261         veor    @t[1], @x[5], @x[7]
 262 ___
 263         &Mul_GF4_N_GF4  (@t[0], @t[1], @y[0], @y[1], @t[3],
 264                          @x[6], @x[7], @y[2], @y[3], @t[2]);
 265 $code.=<<___;
 266         veor    @y[0], @y[0], @y[2]
 267         veor    @y[1], @y[1], @y[3]
 268 ___
 269         &Mul_GF4        (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
 270 $code.=<<___;
 271         veor    @x[4], @x[4], @t[0]
 272         veor    @x[6], @x[6], @t[0]
 273         veor    @x[5], @x[5], @t[1]
 274         veor    @x[7], @x[7], @t[1]
 275 ___
 276 }
 277 sub Inv_GF256 {
 278 #;********************************************************************
 279 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
 280 #;********************************************************************
     # Arguments: eight value registers @x, four temporaries @t and four more
     # temporaries @s; all sixteen are written.  The straight-line sequence
     # below reuses registers aggressively, so instruction order matters.
     # It ends by handing @x plus four intermediate registers to Mul_GF16_2.
 281 my @x=@_[0..7];
 282 my @t=@_[8..11];
 283 my @s=@_[12..15];
 284 # direct optimizations from hardware
 285 $code.=<<___;
 286         veor    @t[3], @x[4], @x[6]
 287         veor    @t[2], @x[5], @x[7]
 288         veor    @t[1], @x[1], @x[3]
 289         veor    @s[1], @x[7], @x[6]
 290          vmov   @t[0], @t[2]
 291         veor    @s[0], @x[0], @x[2]
 292
 293         vorr    @t[2], @t[2], @t[1]
 294         veor    @s[3], @t[3], @t[0]
 295         vand    @s[2], @t[3], @s[0]
 296         vorr    @t[3], @t[3], @s[0]
 297         veor    @s[0], @s[0], @t[1]
 298         vand    @t[0], @t[0], @t[1]
 299         veor    @t[1], @x[3], @x[2]
 300         vand    @s[3], @s[3], @s[0]
 301         vand    @s[1], @s[1], @t[1]
 302         veor    @t[1], @x[4], @x[5]
 303         veor    @s[0], @x[1], @x[0]
 304         veor    @t[3], @t[3], @s[1]
 305         veor    @t[2], @t[2], @s[1]
 306         vand    @s[1], @t[1], @s[0]
 307         vorr    @t[1], @t[1], @s[0]
 308         veor    @t[3], @t[3], @s[3]
 309         veor    @t[0], @t[0], @s[1]
 310         veor    @t[2], @t[2], @s[2]
 311         veor    @t[1], @t[1], @s[3]
 312         veor    @t[0], @t[0], @s[2]
 313         vand    @s[0], @x[7], @x[3]
 314         veor    @t[1], @t[1], @s[2]
 315         vand    @s[1], @x[6], @x[2]
 316         vand    @s[2], @x[5], @x[1]
 317         vorr    @s[3], @x[4], @x[0]
 318         veor    @t[3], @t[3], @s[0]
 319         veor    @t[1], @t[1], @s[2]
 320         veor    @t[0], @t[0], @s[3]
 321         veor    @t[2], @t[2], @s[1]
 322
 323         @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
 324
 325         @ new smaller inversion
 326
 327         vand    @s[2], @t[3], @t[1]
 328         vmov    @s[0], @t[0]
 329
 330         veor    @s[1], @t[2], @s[2]
 331         veor    @s[3], @t[0], @s[2]
 332         veor    @s[2], @t[0], @s[2]     @ @s[2]=@s[3]
 333
 334         vbsl    @s[1], @t[1], @t[0]
 335         vbsl    @s[3], @t[3], @t[2]
 336         veor    @t[3], @t[3], @t[2]
 337
 338         vbsl    @s[0], @s[1], @s[2]
 339         vbsl    @t[0], @s[2], @s[1]
 340
 341         vand    @s[2], @s[0], @s[3]
 342         veor    @t[1], @t[1], @t[0]
 343
 344         veor    @s[2], @s[2], @t[3]
 345 ___
 346 # output in s3, s2, s1, t1
 347
 348 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
 349
 350 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
     # The four inversion outputs (s3, s2, s1, t1) become Mul_GF16_2's @y and
     # the remaining registers (s0, t0, t2, t3) become its temporaries.
 351         &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
 352
 353 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
 354 }
 355
 356 # AES linear components
 357
 358 sub ShiftRows {
     # Arguments: eight state registers, four temporaries, and the permutation
     # mask register as the LAST argument (it is taken with pop).
     # For each state register the emitted code XORs in one q-register of key
     # material loaded from [$key] (post-incremented; eight loads in total),
     # then permutes the bytes with vtbl.8 through the mask, one d-register
     # half at a time via Dlo/Dhi.  So the round-key addition is folded into
     # this step.
 359 my @x=@_[0..7];
 360 my @t=@_[8..11];
 361 my $mask=pop;
 362 $code.=<<___;
 363         vldmia  $key!, {@t[0]-@t[3]}
 364         veor    @t[0], @t[0], @x[0]
 365         veor    @t[1], @t[1], @x[1]
 366         vtbl.8  `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
 367         vtbl.8  `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
 368         vldmia  $key!, {@t[0]}
 369         veor    @t[2], @t[2], @x[2]
 370         vtbl.8  `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
 371         vtbl.8  `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
 372         vldmia  $key!, {@t[1]}
 373         veor    @t[3], @t[3], @x[3]
 374         vtbl.8  `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
 375         vtbl.8  `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
 376         vldmia  $key!, {@t[2]}
 377         vtbl.8  `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
 378         vtbl.8  `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
 379         vldmia  $key!, {@t[3]}
 380         veor    @t[0], @t[0], @x[4]
 381         veor    @t[1], @t[1], @x[5]
 382         vtbl.8  `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
 383         vtbl.8  `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
 384         veor    @t[2], @t[2], @x[6]
 385         vtbl.8  `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
 386         vtbl.8  `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
 387         veor    @t[3], @t[3], @x[7]
 388         vtbl.8  `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
 389         vtbl.8  `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
 390         vtbl.8  `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
 391         vtbl.8  `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
 392 ___
 393 }
 394
 395 sub MixColumns {
 396 # modified to emit output in order suitable for feeding back to aesenc[last]
     # Arguments: eight state registers, eight temporaries, and an optional
     # 17th flag.  The common part is emitted unconditionally; the tail is
     # chosen by the flag.  With a true flag the tail stores its results in a
     # different register order (InvMixColumns, the caller that passes 1,
     # notes it as "flipped 2<->3 and 4<->6").
 397 my @x=@_[0..7];
 398 my @t=@_[8..15];
     # Scalar element access; the one-element slice form @_[16] only happened
     # to work and draws a warning under -w.
 399 my $inv=$_[16]; # optional
 400 $code.=<<___;
 401         vext.8  @t[0], @x[0], @x[0], #12        @ x0 <<< 32
 402         vext.8  @t[1], @x[1], @x[1], #12
 403          veor   @x[0], @x[0], @t[0]             @ x0 ^ (x0 <<< 32)
 404         vext.8  @t[2], @x[2], @x[2], #12
 405          veor   @x[1], @x[1], @t[1]
 406         vext.8  @t[3], @x[3], @x[3], #12
 407          veor   @x[2], @x[2], @t[2]
 408         vext.8  @t[4], @x[4], @x[4], #12
 409          veor   @x[3], @x[3], @t[3]
 410         vext.8  @t[5], @x[5], @x[5], #12
 411          veor   @x[4], @x[4], @t[4]
 412         vext.8  @t[6], @x[6], @x[6], #12
 413          veor   @x[5], @x[5], @t[5]
 414         vext.8  @t[7], @x[7], @x[7], #12
 415          veor   @x[6], @x[6], @t[6]
 416
 417         veor    @t[1], @t[1], @x[0]
 418          veor   @x[7], @x[7], @t[7]
 419          vext.8 @x[0], @x[0], @x[0], #8         @ (x0 ^ (x0 <<< 32)) <<< 64)
 420         veor    @t[2], @t[2], @x[1]
 421         veor    @t[0], @t[0], @x[7]
 422         veor    @t[1], @t[1], @x[7]
 423          vext.8 @x[1], @x[1], @x[1], #8
 424         veor    @t[5], @t[5], @x[4]
 425          veor   @x[0], @x[0], @t[0]
 426         veor    @t[6], @t[6], @x[5]
 427          veor   @x[1], @x[1], @t[1]
 428          vext.8 @t[0], @x[4], @x[4], #8
 429         veor    @t[4], @t[4], @x[3]
 430          vext.8 @t[1], @x[5], @x[5], #8
 431         veor    @t[7], @t[7], @x[6]
 432          vext.8 @x[4], @x[3], @x[3], #8
 433         veor    @t[3], @t[3], @x[2]
 434          vext.8 @x[5], @x[7], @x[7], #8
 435         veor    @t[4], @t[4], @x[7]
 436          vext.8 @x[3], @x[6], @x[6], #8
 437         veor    @t[3], @t[3], @x[7]
 438          vext.8 @x[6], @x[2], @x[2], #8
 439         veor    @x[7], @t[1], @t[5]
 440 ___
 441 $code.=<<___ if (!$inv);
 442         veor    @x[2], @t[0], @t[4]
 443         veor    @x[4], @x[4], @t[3]
 444         veor    @x[5], @x[5], @t[7]
 445         veor    @x[3], @x[3], @t[6]
 446          @ vmov @x[2], @t[0]
 447         veor    @x[6], @x[6], @t[2]
 448          @ vmov @x[7], @t[1]
 449 ___
 450 $code.=<<___ if ($inv);
 451         veor    @t[3], @t[3], @x[4]
 452         veor    @x[5], @x[5], @t[7]
 453         veor    @x[2], @x[3], @t[6]
 454         veor    @x[3], @t[0], @t[4]
 455         veor    @x[4], @x[6], @t[2]
 456         vmov    @x[6], @t[3]
 457          @ vmov @x[7], @t[1]
 458 ___
 459 }
 460
 461 sub InvMixColumns_orig {
     # NOTE(review): nothing in the visible part of this file calls this sub;
     # _bsaes_decrypt8 uses InvMixColumns instead.
     # Arguments: eight state registers and eight temporaries.  The work is
     # done in four stages labelled in the emitted text (0x0e, 0x0b, 0x0d,
     # 0x09).  Beware that the final results are written to the fixed
     # registers @XMM[0..7], not back through @x.
 462 my @x=@_[0..7];
 463 my @t=@_[8..15];
 464
 465 $code.=<<___;
 466         @ multiplication by 0x0e
 467         vext.8  @t[7], @x[7], @x[7], #12
 468         vmov    @t[2], @x[2]
 469         veor    @x[2], @x[2], @x[5]             @ 2 5
 470         veor    @x[7], @x[7], @x[5]             @ 7 5
 471         vext.8  @t[0], @x[0], @x[0], #12
 472         vmov    @t[5], @x[5]
 473         veor    @x[5], @x[5], @x[0]             @ 5 0           [1]
 474         veor    @x[0], @x[0], @x[1]             @ 0 1
 475         vext.8  @t[1], @x[1], @x[1], #12
 476         veor    @x[1], @x[1], @x[2]             @ 1 25
 477         veor    @x[0], @x[0], @x[6]             @ 01 6          [2]
 478         vext.8  @t[3], @x[3], @x[3], #12
 479         veor    @x[1], @x[1], @x[3]             @ 125 3         [4]
 480         veor    @x[2], @x[2], @x[0]             @ 25 016        [3]
 481         veor    @x[3], @x[3], @x[7]             @ 3 75
 482         veor    @x[7], @x[7], @x[6]             @ 75 6          [0]
 483         vext.8  @t[6], @x[6], @x[6], #12
 484         vmov    @t[4], @x[4]
 485         veor    @x[6], @x[6], @x[4]             @ 6 4
 486         veor    @x[4], @x[4], @x[3]             @ 4 375         [6]
 487         veor    @x[3], @x[3], @x[7]             @ 375 756=36
 488         veor    @x[6], @x[6], @t[5]             @ 64 5          [7]
 489         veor    @x[3], @x[3], @t[2]             @ 36 2
 490         vext.8  @t[5], @t[5], @t[5], #12
 491         veor    @x[3], @x[3], @t[4]             @ 362 4         [5]
 492 ___
     # @y is @x re-indexed by the bracketed numbers in the comments above.
 493                                         my @y = @x[7,5,0,2,1,3,4,6];
 494 $code.=<<___;
 495         @ multiplication by 0x0b
 496         veor    @y[1], @y[1], @y[0]
 497         veor    @y[0], @y[0], @t[0]
 498         vext.8  @t[2], @t[2], @t[2], #12
 499         veor    @y[1], @y[1], @t[1]
 500         veor    @y[0], @y[0], @t[5]
 501         vext.8  @t[4], @t[4], @t[4], #12
 502         veor    @y[1], @y[1], @t[6]
 503         veor    @y[0], @y[0], @t[7]
 504         veor    @t[7], @t[7], @t[6]             @ clobber t[7]
 505
 506         veor    @y[3], @y[3], @t[0]
 507          veor   @y[1], @y[1], @y[0]
 508         vext.8  @t[0], @t[0], @t[0], #12
 509         veor    @y[2], @y[2], @t[1]
 510         veor    @y[4], @y[4], @t[1]
 511         vext.8  @t[1], @t[1], @t[1], #12
 512         veor    @y[2], @y[2], @t[2]
 513         veor    @y[3], @y[3], @t[2]
 514         veor    @y[5], @y[5], @t[2]
 515         veor    @y[2], @y[2], @t[7]
 516         vext.8  @t[2], @t[2], @t[2], #12
 517         veor    @y[3], @y[3], @t[3]
 518         veor    @y[6], @y[6], @t[3]
 519         veor    @y[4], @y[4], @t[3]
 520         veor    @y[7], @y[7], @t[4]
 521         vext.8  @t[3], @t[3], @t[3], #12
 522         veor    @y[5], @y[5], @t[4]
 523         veor    @y[7], @y[7], @t[7]
 524         veor    @t[7], @t[7], @t[5]             @ clobber t[7] even more
 525         veor    @y[3], @y[3], @t[5]
 526         veor    @y[4], @y[4], @t[4]
 527
 528         veor    @y[5], @y[5], @t[7]
 529         vext.8  @t[4], @t[4], @t[4], #12
 530         veor    @y[6], @y[6], @t[7]
 531         veor    @y[4], @y[4], @t[7]
 532
 533         veor    @t[7], @t[7], @t[5]
 534         vext.8  @t[5], @t[5], @t[5], #12
 535
 536         @ multiplication by 0x0d
 537         veor    @y[4], @y[4], @y[7]
 538          veor   @t[7], @t[7], @t[6]             @ restore t[7]
 539         veor    @y[7], @y[7], @t[4]
 540         vext.8  @t[6], @t[6], @t[6], #12
 541         veor    @y[2], @y[2], @t[0]
 542         veor    @y[7], @y[7], @t[5]
 543         vext.8  @t[7], @t[7], @t[7], #12
 544         veor    @y[2], @y[2], @t[2]
 545
 546         veor    @y[3], @y[3], @y[1]
 547         veor    @y[1], @y[1], @t[1]
 548         veor    @y[0], @y[0], @t[0]
 549         veor    @y[3], @y[3], @t[0]
 550         veor    @y[1], @y[1], @t[5]
 551         veor    @y[0], @y[0], @t[5]
 552         vext.8  @t[0], @t[0], @t[0], #12
 553         veor    @y[1], @y[1], @t[7]
 554         veor    @y[0], @y[0], @t[6]
 555         veor    @y[3], @y[3], @y[1]
 556         veor    @y[4], @y[4], @t[1]
 557         vext.8  @t[1], @t[1], @t[1], #12
 558
 559         veor    @y[7], @y[7], @t[7]
 560         veor    @y[4], @y[4], @t[2]
 561         veor    @y[5], @y[5], @t[2]
 562         veor    @y[2], @y[2], @t[6]
 563         veor    @t[6], @t[6], @t[3]             @ clobber t[6]
 564         vext.8  @t[2], @t[2], @t[2], #12
 565         veor    @y[4], @y[4], @y[7]
 566         veor    @y[3], @y[3], @t[6]
 567
 568         veor    @y[6], @y[6], @t[6]
 569         veor    @y[5], @y[5], @t[5]
 570         vext.8  @t[5], @t[5], @t[5], #12
 571         veor    @y[6], @y[6], @t[4]
 572         vext.8  @t[4], @t[4], @t[4], #12
 573         veor    @y[5], @y[5], @t[6]
 574         veor    @y[6], @y[6], @t[7]
 575         vext.8  @t[7], @t[7], @t[7], #12
 576         veor    @t[6], @t[6], @t[3]             @ restore t[6]
 577         vext.8  @t[3], @t[3], @t[3], #12
 578
 579         @ multiplication by 0x09
 580         veor    @y[4], @y[4], @y[1]
 581         veor    @t[1], @t[1], @y[1]             @ t[1]=y[1]
 582         veor    @t[0], @t[0], @t[5]             @ clobber t[0]
 583         vext.8  @t[6], @t[6], @t[6], #12
 584         veor    @t[1], @t[1], @t[5]
 585         veor    @y[3], @y[3], @t[0]
 586         veor    @t[0], @t[0], @y[0]             @ t[0]=y[0]
 587         veor    @t[1], @t[1], @t[6]
 588         veor    @t[6], @t[6], @t[7]             @ clobber t[6]
 589         veor    @y[4], @y[4], @t[1]
 590         veor    @y[7], @y[7], @t[4]
 591         veor    @y[6], @y[6], @t[3]
 592         veor    @y[5], @y[5], @t[2]
 593         veor    @t[4], @t[4], @y[4]             @ t[4]=y[4]
 594         veor    @t[3], @t[3], @y[3]             @ t[3]=y[3]
 595         veor    @t[5], @t[5], @y[5]             @ t[5]=y[5]
 596         veor    @t[2], @t[2], @y[2]             @ t[2]=y[2]
 597         veor    @t[3], @t[3], @t[7]
 598         veor    @XMM[5], @t[5], @t[6]
 599         veor    @XMM[6], @t[6], @y[6]           @ t[6]=y[6]
 600         veor    @XMM[2], @t[2], @t[6]
 601         veor    @XMM[7], @t[7], @y[7]           @ t[7]=y[7]
 602
 603         vmov    @XMM[0], @t[0]
 604         vmov    @XMM[1], @t[1]
 605         @ vmov  @XMM[2], @t[2]
 606         vmov    @XMM[3], @t[3]
 607         vmov    @XMM[4], @t[4]
 608         @ vmov  @XMM[5], @t[5]
 609         @ vmov  @XMM[6], @t[6]
 610         @ vmov  @XMM[7], @t[7]
 611 ___
 612 }
 613
 614 sub InvMixColumns {
     # Arguments: eight state registers and eight temporaries.
     # Implemented as the factorisation shown below: first the sparse
     # 05-00-04-00 matrix is applied by the code emitted here, then the
     # ordinary MixColumns generator is reused with its inverse flag set.
 615 my @x=@_[0..7];
 616 my @t=@_[8..15];
 617
 618 # Thanks to Jussi Kivilinna for providing pointer to
 619 #
 620 # | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
 621 # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
 622 # | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
 623 # | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
 624
 625 $code.=<<___;
 626         @ multiplication by 0x05-0x00-0x04-0x00
 627         vext.8  @t[0], @x[0], @x[0], #8
 628         vext.8  @t[6], @x[6], @x[6], #8
 629         vext.8  @t[7], @x[7], @x[7], #8
 630         veor    @t[0], @t[0], @x[0]
 631         vext.8  @t[1], @x[1], @x[1], #8
 632         veor    @t[6], @t[6], @x[6]
 633         vext.8  @t[2], @x[2], @x[2], #8
 634         veor    @t[7], @t[7], @x[7]
 635         vext.8  @t[3], @x[3], @x[3], #8
 636         veor    @t[1], @t[1], @x[1]
 637         vext.8  @t[4], @x[4], @x[4], #8
 638         veor    @t[2], @t[2], @x[2]
 639         vext.8  @t[5], @x[5], @x[5], #8
 640         veor    @t[3], @t[3], @x[3]
 641         veor    @t[4], @t[4], @x[4]
 642         veor    @t[5], @t[5], @x[5]
 643
 644          veor   @x[0], @x[0], @t[6]
 645          veor   @x[1], @x[1], @t[6]
 646          veor   @x[2], @x[2], @t[0]
 647          veor   @x[4], @x[4], @t[2]
 648          veor   @x[3], @x[3], @t[1]
 649          veor   @x[1], @x[1], @t[7]
 650          veor   @x[2], @x[2], @t[7]
 651          veor   @x[4], @x[4], @t[6]
 652          veor   @x[5], @x[5], @t[3]
 653          veor   @x[3], @x[3], @t[6]
 654          veor   @x[6], @x[6], @t[4]
 655          veor   @x[4], @x[4], @t[7]
 656          veor   @x[5], @x[5], @t[7]
 657          veor   @x[7], @x[7], @t[5]
 658 ___
 659         &MixColumns     (@x,@t,1);      # flipped 2<->3 and 4<->6
 660 }
 661
 662 sub swapmove {
     # Emits:  t = ((b >> n) ^ a) & mask;  a ^= t;  b ^= t << n
     # i.e. the bits of $a selected by $mask trade places with the bits of $b
     # selected by ($mask << $n).  $n is an immediate shift count, $t a scratch
     # register; the shifts operate on 64-bit lanes (vshr.u64 / vshl.u64).
 663 my ($a,$b,$n,$mask,$t)=@_;
 664 $code.=<<___;
 665         vshr.u64        $t, $b, #$n
 666         veor            $t, $t, $a
 667         vand            $t, $t, $mask
 668         veor            $a, $a, $t
 669         vshl.u64        $t, $t, #$n
 670         veor            $b, $b, $t
 671 ___
 672 }
 673 sub swapmove2x {
     # Two independent swapmove sequences, (a0,b0) using $t0 and (a1,b1) using
     # $t1, sharing the same shift count $n and $mask and emitted interleaved.
     # The second pair's instructions carry one extra leading space.
 674 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
 675 $code.=<<___;
 676         vshr.u64        $t0, $b0, #$n
 677          vshr.u64       $t1, $b1, #$n
 678         veor            $t0, $t0, $a0
 679          veor           $t1, $t1, $a1
 680         vand            $t0, $t0, $mask
 681          vand           $t1, $t1, $mask
 682         veor            $a0, $a0, $t0
 683         vshl.u64        $t0, $t0, #$n
 684          veor           $a1, $a1, $t1
 685          vshl.u64       $t1, $t1, #$n
 686         veor            $b0, $b0, $t0
 687          veor           $b1, $b1, $t1
 688 ___
 689 }
 690
 691 sub bitslice {
     # Arguments: eight data registers followed by four scratch registers.
     # The data registers are handled in REVERSED order (@x[0] is the caller's
     # eighth register).  Three rounds of swapmove2x are emitted:
     #   shift 1 with mask 0x55 ($t0), shift 2 with mask 0x33 ($t1),
     #   shift 4 with mask 0x0f ($t0, reloaded once the 0x55 mask is done).
     # $t2/$t3 serve as the swapmove temporaries throughout.
 692 my @x=reverse(@_[0..7]);
 693 my ($t0,$t1,$t2,$t3)=@_[8..11];
 694 $code.=<<___;
 695         vmov.i8 $t0,#0x55                       @ compose .LBS0
 696         vmov.i8 $t1,#0x33                       @ compose .LBS1
 697 ___
 698         &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
 699         &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
 700 $code.=<<___;
 701         vmov.i8 $t0,#0x0f                       @ compose .LBS2
 702 ___
 703         &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
 704         &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
 705
 706         &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
 707         &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
 708 }
 709
 710 $code.=<<___;
 711 #ifndef __KERNEL__
 712 # include "arm_arch.h"
 713
 714 # define VFP_ABI_PUSH   vstmdb  sp!,{d8-d15}
 715 # define VFP_ABI_POP    vldmia  sp!,{d8-d15}
 716 # define VFP_ABI_FRAME  0x40
 717 #else
 718 # define VFP_ABI_PUSH
 719 # define VFP_ABI_POP
 720 # define VFP_ABI_FRAME  0
 721 # define BSAES_ASM_EXTENDED_KEY
 722 # define XTS_CHAIN_TWEAK
 723 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
 724 # define __ARM_MAX_ARCH__ 7
 725 #endif
 726
 727 #ifdef __thumb__
 728 # define adrl adr
 729 #endif
 730
 731 #if __ARM_MAX_ARCH__>=7
 732 .arch   armv7-a
 733 .fpu    neon
 734
 735 .text
 736 .syntax unified         @ ARMv7-capable assembler is expected to handle this
 737 #if defined(__thumb2__) && !defined(__APPLE__)
 738 .thumb
 739 #else
 740 .code   32
 741 # undef __thumb2__
 742 #endif
 743
 744 .type   _bsaes_decrypt8,%function
 745 .align  4
 746 _bsaes_decrypt8:
 747         adr     $const,.
 748         vldmia  $key!, {@XMM[9]}                @ round 0 key
 749 #if defined(__thumb2__) || defined(__APPLE__)
 750         adr     $const,.LM0ISR
 751 #else
 752         add     $const,$const,#.LM0ISR-_bsaes_decrypt8
 753 #endif
 754
 755         vldmia  $const!, {@XMM[8]}              @ .LM0ISR
 756         veor    @XMM[10], @XMM[0], @XMM[9]      @ xor with round0 key
 757         veor    @XMM[11], @XMM[1], @XMM[9]
 758          vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
 759          vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
 760         veor    @XMM[12], @XMM[2], @XMM[9]
 761          vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
 762          vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
 763         veor    @XMM[13], @XMM[3], @XMM[9]
 764          vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
 765          vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
 766         veor    @XMM[14], @XMM[4], @XMM[9]
 767          vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
 768          vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
 769         veor    @XMM[15], @XMM[5], @XMM[9]
 770          vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
 771          vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
 772         veor    @XMM[10], @XMM[6], @XMM[9]
 773          vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
 774          vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
 775         veor    @XMM[11], @XMM[7], @XMM[9]
 776          vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
 777          vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
 778          vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
 779          vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
 780 ___
     # The eight inputs in q0-q7 have now been XORed with the round-0 key and
     # byte-permuted through the .LM0ISR mask; run them through bitslice
     # (q8-q11 are scratch).
 781         &bitslice       (@XMM[0..7, 8..11]);
 782 $code.=<<___;
 783         sub     $rounds,$rounds,#1
 784         b       .Ldec_sbox
 785 .align  4
 786 .Ldec_loop:
 787 ___
     # Loop body.  Execution enters at .Ldec_sbox, skipping this ShiftRows the
     # first time round.  ShiftRows takes its mask from the last register
     # passed (q12), which is loaded at the bottom of the loop.
 788         &ShiftRows      (@XMM[0..7, 8..12]);
 789 $code.=".Ldec_sbox:\n";
 790         &InvSbox        (@XMM[0..7, 8..15]);
 791 $code.=<<___;
 792         subs    $rounds,$rounds,#1
 793         bcc     .Ldec_done
 794 ___
     # The register list is permuted to match the output order documented on
     # InvSbox: [b0, b1, b6, b4, b2, b7, b3, b5].
 795         &InvMixColumns  (@XMM[0,1,6,4,2,7,3,5, 8..15]);
 796 $code.=<<___;
 797         vldmia  $const, {@XMM[12]}              @ .LISR
 798         ite     eq                              @ Thumb2 thing, sanity check in ARM
 799         addeq   $const,$const,#0x10
 800         bne     .Ldec_loop
 801         vldmia  $const, {@XMM[12]}              @ .LISRM0
 802         b       .Ldec_loop
 803 .align  4
 804 .Ldec_done:
 805 ___
     # Same permuted register order as above for the final bitslice pass.
 806         &bitslice       (@XMM[0,1,6,4,2,7,3,5, 8..11]);
 807 $code.=<<___;
 808         vldmia  $key, {@XMM[8]}                 @ last round key
 809         veor    @XMM[6], @XMM[6], @XMM[8]
 810         veor    @XMM[4], @XMM[4], @XMM[8]
 811         veor    @XMM[2], @XMM[2], @XMM[8]
 812         veor    @XMM[7], @XMM[7], @XMM[8]
 813         veor    @XMM[3], @XMM[3], @XMM[8]
 814         veor    @XMM[5], @XMM[5], @XMM[8]
 815         veor    @XMM[0], @XMM[0], @XMM[8]
 816         veor    @XMM[1], @XMM[1], @XMM[8]
 817         bx      lr
 818 .size   _bsaes_decrypt8,.-_bsaes_decrypt8
 819
 820 .type   _bsaes_const,%object
 821 .align  6
 822 _bsaes_const:
 823 .LM0ISR:        @ InvShiftRows constants
 824         .quad   0x0a0e0206070b0f03, 0x0004080c0d010509
 825 .LISR:
 826         .quad   0x0504070602010003, 0x0f0e0d0c080b0a09
 827 .LISRM0:
 828         .quad   0x01040b0e0205080f, 0x0306090c00070a0d
 829 .LM0SR:         @ ShiftRows constants
 830         .quad   0x0a0e02060f03070b, 0x0004080c05090d01
 831 .LSR:
 832         .quad   0x0504070600030201, 0x0f0e0d0c0a09080b
 833 .LSRM0:
 834         .quad   0x0304090e00050a0f, 0x01060b0c0207080d
 835 .LM0:
 836         .quad   0x02060a0e03070b0f, 0x0004080c0105090d
 837 .LREVM0SR:
 838         .quad   0x090d01050c000408, 0x03070b0f060a0e02
 839 .asciz  "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
 840 .align  6
 841 .size   _bsaes_const,.-_bsaes_const
 842
 843 .type   _bsaes_encrypt8,%function
 844 .align  4
 845 _bsaes_encrypt8:
 846         adr     $const,.
 847         vldmia  $key!, {@XMM[9]}                @ round 0 key
 848 #if defined(__thumb2__) || defined(__APPLE__)
 849         adr     $const,.LM0SR
 850 #else
 851         sub     $const,$const,#_bsaes_encrypt8-.LM0SR
 852 #endif
 853
 854         vldmia  $const!, {@XMM[8]}              @ .LM0SR
 855 _bsaes_encrypt8_alt:
 856         veor    @XMM[10], @XMM[0], @XMM[9]      @ xor with round0 key
 857         veor    @XMM[11], @XMM[1], @XMM[9]
 858          vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
 859          vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
 860         veor    @XMM[12], @XMM[2], @XMM[9]
 861          vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
 862          vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
 863         veor    @XMM[13], @XMM[3], @XMM[9]
 864          vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
 865          vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
 866         veor    @XMM[14], @XMM[4], @XMM[9]
 867          vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
 868          vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
 869         veor    @XMM[15], @XMM[5], @XMM[9]
 870          vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
 871          vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
 872         veor    @XMM[10], @XMM[6], @XMM[9]
 873          vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
 874          vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
 875         veor    @XMM[11], @XMM[7], @XMM[9]
 876          vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
 877          vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
 878          vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
 879          vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
 880 _bsaes_encrypt8_bitslice:
 881 ___
     # The text above finishes _bsaes_decrypt8 (last round key XORed into the
     # eight outputs), lays down the _bsaes_const permutation tables, and opens
     # _bsaes_encrypt8 with the same round-0 sequence as the decrypt path but
     # using the .LM0SR mask.  The _bsaes_encrypt8_alt and
     # _bsaes_encrypt8_bitslice labels mark two additional points in that
     # sequence.  Next: bitslice q0-q7 with q8-q11 as scratch.
 882         &bitslice       (@XMM[0..7, 8..11]);
 883 $code.=<<___;
 884         sub     $rounds,$rounds,#1
 885         b       .Lenc_sbox
 886 .align  4
 887 .Lenc_loop:
 888 ___
 889         &ShiftRows      (@XMM[0..7, 8..12]);
 890 $code.=".Lenc_sbox:\n";
 891         &Sbox           (@XMM[0..7, 8..15]);
 892 $code.=<<___;
 893         subs    $rounds,$rounds,#1
 894         bcc     .Lenc_done
 895 ___
 896         &MixColumns     (@XMM[0,1,4,6,3,7,2,5, 8..15]);
 897 $code.=<<___;
 898         vldmia  $const, {@XMM[12]}              @ .LSR
 899         ite     eq                              @ Thumb2 thing, sanity check in ARM
 900         addeq   $const,$const,#0x10
 901         bne     .Lenc_loop
 902         vldmia  $const, {@XMM[12]}              @ .LSRM0
 903         b       .Lenc_loop
 904 .align  4
 905 .Lenc_done:
 906 ___
 907         # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
 908         &bitslice       (@XMM[0,1,4,6,3,7,2,5, 8..11]);
 909 $code.=<<___;
 910         vldmia  $key, {@XMM[8]}                 @ last round key
 911         veor    @XMM[4], @XMM[4], @XMM[8]
 912         veor    @XMM[6], @XMM[6], @XMM[8]
 913         veor    @XMM[3], @XMM[3], @XMM[8]
 914         veor    @XMM[7], @XMM[7], @XMM[8]
 915         veor    @XMM[2], @XMM[2], @XMM[8]
 916         veor    @XMM[5], @XMM[5], @XMM[8]
 917         veor    @XMM[0], @XMM[0], @XMM[8]
 918         veor    @XMM[1], @XMM[1], @XMM[8]
 919         bx      lr
 920 .size   _bsaes_encrypt8,.-_bsaes_encrypt8
 921 ___
 922 }
 923 {
 924 my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
 925
 926 sub bitslice_key {
     # Appends to $code a three-stage swapmove sequence (shift counts 1, 2
     # and 4) in which part of each of the first two stages is emitted as
     # plain vmov register copies instead of a swap.  swapmove/swapmove2x
     # are defined outside this part of the file.
     #
     # Arguments (13 register names):
     #   @_[0..7]   data registers; used in reversed order as @x
     #   @_[8..12]  forwarded to the swap helpers ($bs0..$bs2, one per
     #              stage; $t2/$t3 to every call) -- presumably masks and
     #              temporaries; confirm against swapmove
 927 my @x=reverse(@_[0..7]);
 928 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
 929
     # Stage 1 (shift 1): only @x[0,1] are swapped; @x[2,3] are then
     # overwritten with copies of @x[0,1] by the vmov pair emitted below.
 930         &swapmove       (@x[0,1],1,$bs0,$t2,$t3);
 931 $code.=<<___;
 932         @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
 933         vmov    @x[2], @x[0]
 934         vmov    @x[3], @x[1]
 935 ___
 936         #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
 937
     # Stage 2 (shift 2): @x[0..3] are swapped; @x[4..7] are then
     # overwritten with copies of @x[0..3] by the vmov group emitted below.
 938         &swapmove2x     (@x[0,2,1,3],2,$bs1,$t2,$t3);
 939 $code.=<<___;
 940         @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
 941         vmov    @x[4], @x[0]
 942         vmov    @x[6], @x[2]
 943         vmov    @x[5], @x[1]
 944         vmov    @x[7], @x[3]
 945 ___
     # Stage 3 (shift 4): all eight registers take part, in two calls that
     # share $bs2.
 946         &swapmove2x     (@x[0,4,1,5],4,$bs2,$t2,$t3);
 947         &swapmove2x     (@x[2,6,3,7],4,$bs2,$t2,$t3);
 948 }
 949
     # Emits _bsaes_key_convert, the internal routine that turns a
     # conventional AES key schedule into the bit-sliced layout.
     #
     # Register interface (names from the my() list opening this block):
     #   $inp    (r4)   source key schedule, read 16 bytes at a time with
     #                  post-increment
     #   $rounds (r5)   number of rounds; decremented once up front, so the
     #                  loop body runs $rounds-1 times
     #   $out    (r12)  destination, written with post-increment
     #   $const  (r6)   set to the address of .LM0 and left there on return
     #
     # What the emitted code does:
     #  - the round 0 key is stored as loaded (after vrev32.8 when
     #    __ARMEL__ is defined);
     #  - each loop iteration permutes the current round key through .LM0
     #    with vtbl.8, tests it against the byte masks 0x01..0x80 with
     #    vtst.8 to produce eight registers, inverts registers 0, 1, 5 and
     #    6 with vmvn, and stores all eight to $out;
     #  - the last round key is loaded into @XMM[15] but not stored, and
     #    @XMM[7] is filled with 0x63 bytes; the callers in this file use
     #    both registers for their own fix-up after the call returns.
 950 $code.=<<___;
 951 .type   _bsaes_key_convert,%function
 952 .align  4
 953 _bsaes_key_convert:
 954         adr     $const,.
 955         vld1.8  {@XMM[7]},  [$inp]!             @ load round 0 key
 956 #if defined(__thumb2__) || defined(__APPLE__)
 957         adr     $const,.LM0
 958 #else
 959         sub     $const,$const,#_bsaes_key_convert-.LM0
 960 #endif
 961         vld1.8  {@XMM[15]}, [$inp]!             @ load round 1 key
 962
 963         vmov.i8 @XMM[8],  #0x01                 @ bit masks
 964         vmov.i8 @XMM[9],  #0x02
 965         vmov.i8 @XMM[10], #0x04
 966         vmov.i8 @XMM[11], #0x08
 967         vmov.i8 @XMM[12], #0x10
 968         vmov.i8 @XMM[13], #0x20
 969         vldmia  $const, {@XMM[14]}              @ .LM0
 970
 971 #ifdef __ARMEL__
 972         vrev32.8        @XMM[7],  @XMM[7]
 973         vrev32.8        @XMM[15], @XMM[15]
 974 #endif
 975         sub     $rounds,$rounds,#1
 976         vstmia  $out!, {@XMM[7]}                @ save round 0 key
 977         b       .Lkey_loop
 978
 979 .align  4
 980 .Lkey_loop:
 981         vtbl.8  `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
 982         vtbl.8  `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
 983         vmov.i8 @XMM[6],  #0x40
 984         vmov.i8 @XMM[15], #0x80
 985
 986         vtst.8  @XMM[0], @XMM[7], @XMM[8]
 987         vtst.8  @XMM[1], @XMM[7], @XMM[9]
 988         vtst.8  @XMM[2], @XMM[7], @XMM[10]
 989         vtst.8  @XMM[3], @XMM[7], @XMM[11]
 990         vtst.8  @XMM[4], @XMM[7], @XMM[12]
 991         vtst.8  @XMM[5], @XMM[7], @XMM[13]
 992         vtst.8  @XMM[6], @XMM[7], @XMM[6]
 993         vtst.8  @XMM[7], @XMM[7], @XMM[15]
 994         vld1.8  {@XMM[15]}, [$inp]!             @ load next round key
 995         vmvn    @XMM[0], @XMM[0]                @ "pnot"
 996         vmvn    @XMM[1], @XMM[1]
 997         vmvn    @XMM[5], @XMM[5]
 998         vmvn    @XMM[6], @XMM[6]
 999 #ifdef __ARMEL__
1000         vrev32.8        @XMM[15], @XMM[15]
1001 #endif
1002         subs    $rounds,$rounds,#1
1003         vstmia  $out!,{@XMM[0]-@XMM[7]}         @ write bit-sliced round key
1004         bne     .Lkey_loop
1005
1006         vmov.i8 @XMM[7],#0x63                   @ compose .L63
1007         @ don't save last round key
1008         bx      lr
1009 .size   _bsaes_key_convert,.-_bsaes_key_convert
1010 ___
1011 }
1012
1013 if (0) {                # following four functions are unsupported interface
1014                         # used for benchmarking...
     # The condition is constant false, so the heredoc below is never
     # appended to $code; it is kept for reference only.
     #
     # Entry points it would emit:
     #   bsaes_enc_key_convert  reads the round count from offset 240 of
     #                          the key, calls _bsaes_key_convert, then
     #                          XORs @XMM[7] into the last round key
     #                          (@XMM[15]) and stores the result at r12.
     #   bsaes_dec_key_convert  same call, but stores the last round key
     #                          unchanged and XORs @XMM[7] into the round 0
     #                          key at $out instead.
     #   bsaes_encrypt_128      loops over 0x80-byte (eight-block) chunks
     #                          with r5 fixed at 10 rounds, calling
     #                          _bsaes_encrypt8; results are stored from
     #                          registers 0,1,4,6,3,7,2,5 in that order.
     #   bsaes_decrypt_128      same shape around _bsaes_decrypt8; results
     #                          are stored from registers 0,1,6,4,2,7,3,5.
     #
     # NOTE(review): $inp, $out, $len and $key are not declared in the
     # visible part of the file; they are presumably file-level variables --
     # confirm before enabling this block.
1015 $code.=<<___;
1016 .globl  bsaes_enc_key_convert
1017 .type   bsaes_enc_key_convert,%function
1018 .align  4
1019 bsaes_enc_key_convert:
1020         stmdb   sp!,{r4-r6,lr}
1021         vstmdb  sp!,{d8-d15}            @ ABI specification says so
1022
1023         ldr     r5,[$inp,#240]                  @ pass rounds
1024         mov     r4,$inp                         @ pass key
1025         mov     r12,$out                        @ pass key schedule
1026         bl      _bsaes_key_convert
1027         veor    @XMM[7],@XMM[7],@XMM[15]        @ fix up last round key
1028         vstmia  r12, {@XMM[7]}                  @ save last round key
1029
1030         vldmia  sp!,{d8-d15}
1031         ldmia   sp!,{r4-r6,pc}
1032 .size   bsaes_enc_key_convert,.-bsaes_enc_key_convert
1033
1034 .globl  bsaes_encrypt_128
1035 .type   bsaes_encrypt_128,%function
1036 .align  4
1037 bsaes_encrypt_128:
1038         stmdb   sp!,{r4-r6,lr}
1039         vstmdb  sp!,{d8-d15}            @ ABI specification says so
1040 .Lenc128_loop:
1041         vld1.8  {@XMM[0]-@XMM[1]}, [$inp]!      @ load input
1042         vld1.8  {@XMM[2]-@XMM[3]}, [$inp]!
1043         mov     r4,$key                         @ pass the key
1044         vld1.8  {@XMM[4]-@XMM[5]}, [$inp]!
1045         mov     r5,#10                          @ pass rounds
1046         vld1.8  {@XMM[6]-@XMM[7]}, [$inp]!
1047
1048         bl      _bsaes_encrypt8
1049
1050         vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
1051         vst1.8  {@XMM[4]}, [$out]!
1052         vst1.8  {@XMM[6]}, [$out]!
1053         vst1.8  {@XMM[3]}, [$out]!
1054         vst1.8  {@XMM[7]}, [$out]!
1055         vst1.8  {@XMM[2]}, [$out]!
1056         subs    $len,$len,#0x80
1057         vst1.8  {@XMM[5]}, [$out]!
1058         bhi     .Lenc128_loop
1059
1060         vldmia  sp!,{d8-d15}
1061         ldmia   sp!,{r4-r6,pc}
1062 .size   bsaes_encrypt_128,.-bsaes_encrypt_128
1063
1064 .globl  bsaes_dec_key_convert
1065 .type   bsaes_dec_key_convert,%function
1066 .align  4
1067 bsaes_dec_key_convert:
1068         stmdb   sp!,{r4-r6,lr}
1069         vstmdb  sp!,{d8-d15}            @ ABI specification says so
1070
1071         ldr     r5,[$inp,#240]                  @ pass rounds
1072         mov     r4,$inp                         @ pass key
1073         mov     r12,$out                        @ pass key schedule
1074         bl      _bsaes_key_convert
1075         vldmia  $out, {@XMM[6]}
1076         vstmia  r12,  {@XMM[15]}                @ save last round key
1077         veor    @XMM[7], @XMM[7], @XMM[6]       @ fix up round 0 key
1078         vstmia  $out, {@XMM[7]}
1079
1080         vldmia  sp!,{d8-d15}
1081         ldmia   sp!,{r4-r6,pc}
1082 .size   bsaes_dec_key_convert,.-bsaes_dec_key_convert
1083
1084 .globl  bsaes_decrypt_128
1085 .type   bsaes_decrypt_128,%function
1086 .align  4
1087 bsaes_decrypt_128:
1088         stmdb   sp!,{r4-r6,lr}
1089         vstmdb  sp!,{d8-d15}            @ ABI specification says so
1090 .Ldec128_loop:
1091         vld1.8  {@XMM[0]-@XMM[1]}, [$inp]!      @ load input
1092         vld1.8  {@XMM[2]-@XMM[3]}, [$inp]!
1093         mov     r4,$key                         @ pass the key
1094         vld1.8  {@XMM[4]-@XMM[5]}, [$inp]!
1095         mov     r5,#10                          @ pass rounds
1096         vld1.8  {@XMM[6]-@XMM[7]}, [$inp]!
1097
1098         bl      _bsaes_decrypt8
1099
1100         vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
1101         vst1.8  {@XMM[6]}, [$out]!
1102         vst1.8  {@XMM[4]}, [$out]!
1103         vst1.8  {@XMM[2]}, [$out]!
1104         vst1.8  {@XMM[7]}, [$out]!
1105         vst1.8  {@XMM[3]}, [$out]!
1106         subs    $len,$len,#0x80
1107         vst1.8  {@XMM[5]}, [$out]!
1108         bhi     .Ldec128_loop
1109
1110         vldmia  sp!,{d8-d15}
1111         ldmia   sp!,{r4-r6,pc}
1112 .size   bsaes_decrypt_128,.-bsaes_decrypt_128
1113 ___
1114 }
1115 {
1116 my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
1117 my ($keysched)=("sp");
     # Register assignment for bsaes_cbc_encrypt, emitted by the heredoc
     # that follows: r0-r3 = $inp,$out,$len,$key; r8 = $ivp; r9 = $fp;
     # r10 = $rounds.  $keysched is sp.
     #
     # Outline of the emitted routine (decryption only; the assembly notes
     # that the caller must make sure enc == 0):
     #  - unless __KERNEL__ is defined, inputs shorter than 128 bytes are
     #    handed straight to AES_cbc_encrypt;
     #  - $len is converted to a count of 16-byte blocks, 16 bytes of stack
     #    are reserved to carry the IV, and $fp remembers that address;
     #  - without BSAES_ASM_EXTENDED_KEY the bit-sliced key schedule is
     #    built on the stack below $fp and zeroed again before returning;
     #    with it, the schedule lives at $key+248 and the word at $key+244
     #    marks whether it has already been converted;
     #  - blocks are decrypted eight at a time through _bsaes_decrypt8; a
     #    tail of two to seven blocks also goes through _bsaes_decrypt8,
     #    a single trailing block through AES_decrypt;
     #  - @XMM[15], which tracks the last ciphertext block processed, is
     #    written back to [$ivp] as the next IV.
1118
1119 $code.=<<___;
1120 .extern AES_cbc_encrypt
1121 .extern AES_decrypt
1122
1123 .global bsaes_cbc_encrypt
1124 .type   bsaes_cbc_encrypt,%function
1125 .align  5
1126 bsaes_cbc_encrypt:
1127 #ifndef __KERNEL__
1128         cmp     $len, #128
1129 #ifndef __thumb__
1130         blo     AES_cbc_encrypt
1131 #else
1132         bhs     1f
1133         b       AES_cbc_encrypt
1134 1:
1135 #endif
1136 #endif
1137
1138         @ it is up to the caller to make sure we are called with enc == 0
1139
1140         mov     ip, sp
1141         stmdb   sp!, {r4-r10, lr}
1142         VFP_ABI_PUSH
1143         ldr     $ivp, [ip]                      @ IV is 1st arg on the stack
1144         mov     $len, $len, lsr#4               @ len in 16 byte blocks
1145         sub     sp, #0x10                       @ scratch space to carry over the IV
1146         mov     $fp, sp                         @ save sp
1147
1148         ldr     $rounds, [$key, #240]           @ get # of rounds
1149 #ifndef BSAES_ASM_EXTENDED_KEY
1150         @ allocate the key schedule on the stack
1151         sub     r12, sp, $rounds, lsl#7         @ 128 bytes per inner round key
1152         add     r12, #`128-32`                  @ size of bit-sliced key schedule
1153
1154         @ populate the key schedule
1155         mov     r4, $key                        @ pass key
1156         mov     r5, $rounds                     @ pass # of rounds
1157         mov     sp, r12                         @ sp is $keysched
1158         bl      _bsaes_key_convert
1159         vldmia  $keysched, {@XMM[6]}
1160         vstmia  r12,  {@XMM[15]}                @ save last round key
1161         veor    @XMM[7], @XMM[7], @XMM[6]       @ fix up round 0 key
1162         vstmia  $keysched, {@XMM[7]}
1163 #else
1164         ldr     r12, [$key, #244]
1165         eors    r12, #1
1166         beq     0f
1167
1168         @ populate the key schedule
1169         str     r12, [$key, #244]
1170         mov     r4, $key                        @ pass key
1171         mov     r5, $rounds                     @ pass # of rounds
1172         add     r12, $key, #248                 @ pass key schedule
1173         bl      _bsaes_key_convert
1174         add     r4, $key, #248
1175         vldmia  r4, {@XMM[6]}
1176         vstmia  r12, {@XMM[15]}                 @ save last round key
1177         veor    @XMM[7], @XMM[7], @XMM[6]       @ fix up round 0 key
1178         vstmia  r4, {@XMM[7]}
1179
1180 .align  2
1181 0:
1182 #endif
1183
1184         vld1.8  {@XMM[15]}, [$ivp]              @ load IV
1185         b       .Lcbc_dec_loop
1186
1187 .align  4
1188 .Lcbc_dec_loop:
1189         subs    $len, $len, #0x8
1190         bmi     .Lcbc_dec_loop_finish
1191
1192         vld1.8  {@XMM[0]-@XMM[1]}, [$inp]!      @ load input
1193         vld1.8  {@XMM[2]-@XMM[3]}, [$inp]!
1194 #ifndef BSAES_ASM_EXTENDED_KEY
1195         mov     r4, $keysched                   @ pass the key
1196 #else
1197         add     r4, $key, #248
1198 #endif
1199         vld1.8  {@XMM[4]-@XMM[5]}, [$inp]!
1200         mov     r5, $rounds
1201         vld1.8  {@XMM[6]-@XMM[7]}, [$inp]
1202         sub     $inp, $inp, #0x60
1203         vstmia  $fp, {@XMM[15]}                 @ put aside IV
1204
1205         bl      _bsaes_decrypt8
1206
1207         vldmia  $fp, {@XMM[14]}                 @ reload IV
1208         vld1.8  {@XMM[8]-@XMM[9]}, [$inp]!      @ reload input
1209         veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
1210         vld1.8  {@XMM[10]-@XMM[11]}, [$inp]!
1211         veor    @XMM[1], @XMM[1], @XMM[8]
1212         veor    @XMM[6], @XMM[6], @XMM[9]
1213         vld1.8  {@XMM[12]-@XMM[13]}, [$inp]!
1214         veor    @XMM[4], @XMM[4], @XMM[10]
1215         veor    @XMM[2], @XMM[2], @XMM[11]
1216         vld1.8  {@XMM[14]-@XMM[15]}, [$inp]!
1217         veor    @XMM[7], @XMM[7], @XMM[12]
1218         vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
1219         veor    @XMM[3], @XMM[3], @XMM[13]
1220         vst1.8  {@XMM[6]}, [$out]!
1221         veor    @XMM[5], @XMM[5], @XMM[14]
1222         vst1.8  {@XMM[4]}, [$out]!
1223         vst1.8  {@XMM[2]}, [$out]!
1224         vst1.8  {@XMM[7]}, [$out]!
1225         vst1.8  {@XMM[3]}, [$out]!
1226         vst1.8  {@XMM[5]}, [$out]!
1227
1228         b       .Lcbc_dec_loop
1229
1230 .Lcbc_dec_loop_finish:
1231         adds    $len, $len, #8
1232         beq     .Lcbc_dec_done
1233
1234         vld1.8  {@XMM[0]}, [$inp]!              @ load input
1235         cmp     $len, #2
1236         blo     .Lcbc_dec_one
1237         vld1.8  {@XMM[1]}, [$inp]!
1238 #ifndef BSAES_ASM_EXTENDED_KEY
1239         mov     r4, $keysched                   @ pass the key
1240 #else
1241         add     r4, $key, #248
1242 #endif
1243         mov     r5, $rounds
1244         vstmia  $fp, {@XMM[15]}                 @ put aside IV
1245         beq     .Lcbc_dec_two
1246         vld1.8  {@XMM[2]}, [$inp]!
1247         cmp     $len, #4
1248         blo     .Lcbc_dec_three
1249         vld1.8  {@XMM[3]}, [$inp]!
1250         beq     .Lcbc_dec_four
1251         vld1.8  {@XMM[4]}, [$inp]!
1252         cmp     $len, #6
1253         blo     .Lcbc_dec_five
1254         vld1.8  {@XMM[5]}, [$inp]!
1255         beq     .Lcbc_dec_six
1256         vld1.8  {@XMM[6]}, [$inp]!
1257         sub     $inp, $inp, #0x70
1258
1259         bl      _bsaes_decrypt8
1260
1261         vldmia  $fp, {@XMM[14]}                 @ reload IV
1262         vld1.8  {@XMM[8]-@XMM[9]}, [$inp]!      @ reload input
1263         veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
1264         vld1.8  {@XMM[10]-@XMM[11]}, [$inp]!
1265         veor    @XMM[1], @XMM[1], @XMM[8]
1266         veor    @XMM[6], @XMM[6], @XMM[9]
1267         vld1.8  {@XMM[12]-@XMM[13]}, [$inp]!
1268         veor    @XMM[4], @XMM[4], @XMM[10]
1269         veor    @XMM[2], @XMM[2], @XMM[11]
1270         vld1.8  {@XMM[15]}, [$inp]!
1271         veor    @XMM[7], @XMM[7], @XMM[12]
1272         vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
1273         veor    @XMM[3], @XMM[3], @XMM[13]
1274         vst1.8  {@XMM[6]}, [$out]!
1275         vst1.8  {@XMM[4]}, [$out]!
1276         vst1.8  {@XMM[2]}, [$out]!
1277         vst1.8  {@XMM[7]}, [$out]!
1278         vst1.8  {@XMM[3]}, [$out]!
1279         b       .Lcbc_dec_done
1280 .align  4
1281 .Lcbc_dec_six:
1282         sub     $inp, $inp, #0x60
1283         bl      _bsaes_decrypt8
1284         vldmia  $fp,{@XMM[14]}                  @ reload IV
1285         vld1.8  {@XMM[8]-@XMM[9]}, [$inp]!      @ reload input
1286         veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
1287         vld1.8  {@XMM[10]-@XMM[11]}, [$inp]!
1288         veor    @XMM[1], @XMM[1], @XMM[8]
1289         veor    @XMM[6], @XMM[6], @XMM[9]
1290         vld1.8  {@XMM[12]}, [$inp]!
1291         veor    @XMM[4], @XMM[4], @XMM[10]
1292         veor    @XMM[2], @XMM[2], @XMM[11]
1293         vld1.8  {@XMM[15]}, [$inp]!
1294         veor    @XMM[7], @XMM[7], @XMM[12]
1295         vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
1296         vst1.8  {@XMM[6]}, [$out]!
1297         vst1.8  {@XMM[4]}, [$out]!
1298         vst1.8  {@XMM[2]}, [$out]!
1299         vst1.8  {@XMM[7]}, [$out]!
1300         b       .Lcbc_dec_done
1301 .align  4
1302 .Lcbc_dec_five:
1303         sub     $inp, $inp, #0x50
1304         bl      _bsaes_decrypt8
1305         vldmia  $fp, {@XMM[14]}                 @ reload IV
1306         vld1.8  {@XMM[8]-@XMM[9]}, [$inp]!      @ reload input
1307         veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
1308         vld1.8  {@XMM[10]-@XMM[11]}, [$inp]!
1309         veor    @XMM[1], @XMM[1], @XMM[8]
1310         veor    @XMM[6], @XMM[6], @XMM[9]
1311         vld1.8  {@XMM[15]}, [$inp]!
1312         veor    @XMM[4], @XMM[4], @XMM[10]
1313         vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
1314         veor    @XMM[2], @XMM[2], @XMM[11]
1315         vst1.8  {@XMM[6]}, [$out]!
1316         vst1.8  {@XMM[4]}, [$out]!
1317         vst1.8  {@XMM[2]}, [$out]!
1318         b       .Lcbc_dec_done
1319 .align  4
1320 .Lcbc_dec_four:
1321         sub     $inp, $inp, #0x40
1322         bl      _bsaes_decrypt8
1323         vldmia  $fp, {@XMM[14]}                 @ reload IV
1324         vld1.8  {@XMM[8]-@XMM[9]}, [$inp]!      @ reload input
1325         veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
1326         vld1.8  {@XMM[10]}, [$inp]!
1327         veor    @XMM[1], @XMM[1], @XMM[8]
1328         veor    @XMM[6], @XMM[6], @XMM[9]
1329         vld1.8  {@XMM[15]}, [$inp]!
1330         veor    @XMM[4], @XMM[4], @XMM[10]
1331         vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
1332         vst1.8  {@XMM[6]}, [$out]!
1333         vst1.8  {@XMM[4]}, [$out]!
1334         b       .Lcbc_dec_done
1335 .align  4
1336 .Lcbc_dec_three:
1337         sub     $inp, $inp, #0x30
1338         bl      _bsaes_decrypt8
1339         vldmia  $fp, {@XMM[14]}                 @ reload IV
1340         vld1.8  {@XMM[8]-@XMM[9]}, [$inp]!      @ reload input
1341         veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
1342         vld1.8  {@XMM[15]}, [$inp]!
1343         veor    @XMM[1], @XMM[1], @XMM[8]
1344         veor    @XMM[6], @XMM[6], @XMM[9]
1345         vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
1346         vst1.8  {@XMM[6]}, [$out]!
1347         b       .Lcbc_dec_done
1348 .align  4
1349 .Lcbc_dec_two:
1350         sub     $inp, $inp, #0x20
1351         bl      _bsaes_decrypt8
1352         vldmia  $fp, {@XMM[14]}                 @ reload IV
1353         vld1.8  {@XMM[8]}, [$inp]!              @ reload input
1354         veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
1355         vld1.8  {@XMM[15]}, [$inp]!             @ reload input
1356         veor    @XMM[1], @XMM[1], @XMM[8]
1357         vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
1358         b       .Lcbc_dec_done
1359 .align  4
1360 .Lcbc_dec_one:
1361         sub     $inp, $inp, #0x10
1362         mov     $rounds, $out                   @ save original out pointer
1363         mov     $out, $fp                       @ use the iv scratch space as out buffer
1364         mov     r2, $key
1365         vmov    @XMM[4],@XMM[15]                @ just in case ensure that IV
1366         vmov    @XMM[5],@XMM[0]                 @ and input are preserved
1367         bl      AES_decrypt
1368         vld1.8  {@XMM[0]}, [$fp]                @ load result
1369         veor    @XMM[0], @XMM[0], @XMM[4]       @ ^= IV
1370         vmov    @XMM[15], @XMM[5]               @ @XMM[5] holds input
1371         vst1.8  {@XMM[0]}, [$rounds]            @ write output
1372
1373 .Lcbc_dec_done:
1374 #ifndef BSAES_ASM_EXTENDED_KEY
1375         vmov.i32        q0, #0
1376         vmov.i32        q1, #0
1377 .Lcbc_dec_bzero:                                @ wipe key schedule [if any]
1378         vstmia          $keysched!, {q0-q1}
1379         cmp             $keysched, $fp
1380         bne             .Lcbc_dec_bzero
1381 #endif
1382
1383         mov     sp, $fp
1384         add     sp, #0x10                       @ add sp,$fp,#0x10 is no good for thumb
1385         vst1.8  {@XMM[15]}, [$ivp]              @ return IV
1386         VFP_ABI_POP
1387         ldmia   sp!, {r4-r10, pc}
1388 .size   bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1389 ___
1390 }
1391 {
1392 my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
1393 my $const = "r6";       # shared with _bsaes_encrypt8_alt
1394 my $keysched = "sp";
     # Register assignment for bsaes_ctr32_encrypt_blocks, emitted by the
     # heredoc that follows: r0-r3 = $inp,$out,$len,$key; r8 = $ctr;
     # r9 = $fp; r10 = $rounds.  $keysched is sp.
     #
     # Outline of the emitted routine:
     #  - $len counts blocks; fewer than 8 take the .Lctr_enc_short path,
     #    which calls AES_encrypt once per block on a stack copy of the
     #    counter and increments that copy's last 32-bit word between calls;
     #  - otherwise the key schedule is converted (on the stack unless
     #    BSAES_ASM_EXTENDED_KEY is defined), the counter and the round 0
     #    key are byte-reversed per 32-bit word, and eight counter values
     #    per iteration are encrypted through _bsaes_encrypt8_alt and XORed
     #    with the input;
     #  - once the counter has been loaded, $ctr is reused as a pointer to
     #    .LREVM0SR; without BSAES_ASM_EXTENDED_KEY that pointer is derived
     #    from $const, which relies on _bsaes_key_convert leaving r6 at .LM0;
     #  - the stack key schedule (or, with BSAES_ASM_EXTENDED_KEY, the
     #    adjusted round 0 key slot) is zeroed before returning.
1395
1396 $code.=<<___;
1397 .extern AES_encrypt
1398 .global bsaes_ctr32_encrypt_blocks
1399 .type   bsaes_ctr32_encrypt_blocks,%function
1400 .align  5
1401 bsaes_ctr32_encrypt_blocks:
1402         cmp     $len, #8                        @ use plain AES for
1403         blo     .Lctr_enc_short                 @ small sizes
1404
1405         mov     ip, sp
1406         stmdb   sp!, {r4-r10, lr}
1407         VFP_ABI_PUSH
1408         ldr     $ctr, [ip]                      @ ctr is 1st arg on the stack
1409         sub     sp, sp, #0x10                   @ scratch space to carry over the ctr
1410         mov     $fp, sp                         @ save sp
1411
1412         ldr     $rounds, [$key, #240]           @ get # of rounds
1413 #ifndef BSAES_ASM_EXTENDED_KEY
1414         @ allocate the key schedule on the stack
1415         sub     r12, sp, $rounds, lsl#7         @ 128 bytes per inner round key
1416         add     r12, #`128-32`                  @ size of bit-sliced key schedule
1417
1418         @ populate the key schedule
1419         mov     r4, $key                        @ pass key
1420         mov     r5, $rounds                     @ pass # of rounds
1421         mov     sp, r12                         @ sp is $keysched
1422         bl      _bsaes_key_convert
1423         veor    @XMM[7],@XMM[7],@XMM[15]        @ fix up last round key
1424         vstmia  r12, {@XMM[7]}                  @ save last round key
1425
1426         vld1.8  {@XMM[0]}, [$ctr]               @ load counter
1427 #ifdef  __APPLE__
1428         mov     $ctr, #:lower16:(.LREVM0SR-.LM0)
1429         add     $ctr, $const, $ctr
1430 #else
1431         add     $ctr, $const, #.LREVM0SR-.LM0   @ borrow $ctr
1432 #endif
1433         vldmia  $keysched, {@XMM[4]}            @ load round0 key
1434 #else
1435         ldr     r12, [$key, #244]
1436         eors    r12, #1
1437         beq     0f
1438
1439         @ populate the key schedule
1440         str     r12, [$key, #244]
1441         mov     r4, $key                        @ pass key
1442         mov     r5, $rounds                     @ pass # of rounds
1443         add     r12, $key, #248                 @ pass key schedule
1444         bl      _bsaes_key_convert
1445         veor    @XMM[7],@XMM[7],@XMM[15]        @ fix up last round key
1446         vstmia  r12, {@XMM[7]}                  @ save last round key
1447
1448 .align  2
1449 0:      add     r12, $key, #248
1450         vld1.8  {@XMM[0]}, [$ctr]               @ load counter
1451         adrl    $ctr, .LREVM0SR                 @ borrow $ctr
1452         vldmia  r12, {@XMM[4]}                  @ load round0 key
1453         sub     sp, #0x10                       @ place for adjusted round0 key
1454 #endif
1455
1456         vmov.i32        @XMM[8],#1              @ compose 1<<96
1457         veor            @XMM[9],@XMM[9],@XMM[9]
1458         vrev32.8        @XMM[0],@XMM[0]
1459         vext.8          @XMM[8],@XMM[9],@XMM[8],#4
1460         vrev32.8        @XMM[4],@XMM[4]
1461         vadd.u32        @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
1462         vstmia  $keysched, {@XMM[4]}            @ save adjusted round0 key
1463         b       .Lctr_enc_loop
1464
1465 .align  4
1466 .Lctr_enc_loop:
1467         vadd.u32        @XMM[10], @XMM[8], @XMM[9]      @ compose 3<<96
1468         vadd.u32        @XMM[1], @XMM[0], @XMM[8]       @ +1
1469         vadd.u32        @XMM[2], @XMM[0], @XMM[9]       @ +2
1470         vadd.u32        @XMM[3], @XMM[0], @XMM[10]      @ +3
1471         vadd.u32        @XMM[4], @XMM[1], @XMM[10]
1472         vadd.u32        @XMM[5], @XMM[2], @XMM[10]
1473         vadd.u32        @XMM[6], @XMM[3], @XMM[10]
1474         vadd.u32        @XMM[7], @XMM[4], @XMM[10]
1475         vadd.u32        @XMM[10], @XMM[5], @XMM[10]     @ next counter
1476
1477         @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
1478         @ to flip byte order in 32-bit counter
1479
1480         vldmia          $keysched, {@XMM[9]}            @ load round0 key
1481 #ifndef BSAES_ASM_EXTENDED_KEY
1482         add             r4, $keysched, #0x10            @ pass next round key
1483 #else
1484         add             r4, $key, #`248+16`
1485 #endif
1486         vldmia          $ctr, {@XMM[8]}                 @ .LREVM0SR
1487         mov             r5, $rounds                     @ pass rounds
1488         vstmia          $fp, {@XMM[10]}                 @ save next counter
1489 #ifdef  __APPLE__
1490         mov             $const, #:lower16:(.LREVM0SR-.LSR)
1491         sub             $const, $ctr, $const
1492 #else
1493         sub             $const, $ctr, #.LREVM0SR-.LSR   @ pass constants
1494 #endif
1495
1496         bl              _bsaes_encrypt8_alt
1497
1498         subs            $len, $len, #8
1499         blo             .Lctr_enc_loop_done
1500
1501         vld1.8          {@XMM[8]-@XMM[9]}, [$inp]!      @ load input
1502         vld1.8          {@XMM[10]-@XMM[11]}, [$inp]!
1503         veor            @XMM[0], @XMM[8]
1504         veor            @XMM[1], @XMM[9]
1505         vld1.8          {@XMM[12]-@XMM[13]}, [$inp]!
1506         veor            @XMM[4], @XMM[10]
1507         veor            @XMM[6], @XMM[11]
1508         vld1.8          {@XMM[14]-@XMM[15]}, [$inp]!
1509         veor            @XMM[3], @XMM[12]
1510         vst1.8          {@XMM[0]-@XMM[1]}, [$out]!      @ write output
1511         veor            @XMM[7], @XMM[13]
1512         veor            @XMM[2], @XMM[14]
1513         vst1.8          {@XMM[4]}, [$out]!
1514         veor            @XMM[5], @XMM[15]
1515         vst1.8          {@XMM[6]}, [$out]!
1516         vmov.i32        @XMM[8], #1                     @ compose 1<<96
1517         vst1.8          {@XMM[3]}, [$out]!
1518         veor            @XMM[9], @XMM[9], @XMM[9]
1519         vst1.8          {@XMM[7]}, [$out]!
1520         vext.8          @XMM[8], @XMM[9], @XMM[8], #4
1521         vst1.8          {@XMM[2]}, [$out]!
1522         vadd.u32        @XMM[9],@XMM[8],@XMM[8]         @ compose 2<<96
1523         vst1.8          {@XMM[5]}, [$out]!
1524         vldmia          $fp, {@XMM[0]}                  @ load counter
1525
1526         bne             .Lctr_enc_loop
1527         b               .Lctr_enc_done
1528
1529 .align  4
1530 .Lctr_enc_loop_done:
1531         add             $len, $len, #8
1532         vld1.8          {@XMM[8]}, [$inp]!      @ load input
1533         veor            @XMM[0], @XMM[8]
1534         vst1.8          {@XMM[0]}, [$out]!      @ write output
1535         cmp             $len, #2
1536         blo             .Lctr_enc_done
1537         vld1.8          {@XMM[9]}, [$inp]!
1538         veor            @XMM[1], @XMM[9]
1539         vst1.8          {@XMM[1]}, [$out]!
1540         beq             .Lctr_enc_done
1541         vld1.8          {@XMM[10]}, [$inp]!
1542         veor            @XMM[4], @XMM[10]
1543         vst1.8          {@XMM[4]}, [$out]!
1544         cmp             $len, #4
1545         blo             .Lctr_enc_done
1546         vld1.8          {@XMM[11]}, [$inp]!
1547         veor            @XMM[6], @XMM[11]
1548         vst1.8          {@XMM[6]}, [$out]!
1549         beq             .Lctr_enc_done
1550         vld1.8          {@XMM[12]}, [$inp]!
1551         veor            @XMM[3], @XMM[12]
1552         vst1.8          {@XMM[3]}, [$out]!
1553         cmp             $len, #6
1554         blo             .Lctr_enc_done
1555         vld1.8          {@XMM[13]}, [$inp]!
1556         veor            @XMM[7], @XMM[13]
1557         vst1.8          {@XMM[7]}, [$out]!
1558         beq             .Lctr_enc_done
1559         vld1.8          {@XMM[14]}, [$inp]
1560         veor            @XMM[2], @XMM[14]
1561         vst1.8          {@XMM[2]}, [$out]!
1562
1563 .Lctr_enc_done:
1564         vmov.i32        q0, #0
1565         vmov.i32        q1, #0
1566 #ifndef BSAES_ASM_EXTENDED_KEY
1567 .Lctr_enc_bzero:                        @ wipe key schedule [if any]
1568         vstmia          $keysched!, {q0-q1}
1569         cmp             $keysched, $fp
1570         bne             .Lctr_enc_bzero
1571 #else
1572         vstmia          $keysched, {q0-q1}
1573 #endif
1574
1575         mov     sp, $fp
1576         add     sp, #0x10               @ add sp,$fp,#0x10 is no good for thumb
1577         VFP_ABI_POP
1578         ldmia   sp!, {r4-r10, pc}       @ return
1579
1580 .align  4
1581 .Lctr_enc_short:
1582         ldr     ip, [sp]                @ ctr pointer is passed on stack
1583         stmdb   sp!, {r4-r8, lr}
1584
1585         mov     r4, $inp                @ copy arguments
1586         mov     r5, $out
1587         mov     r6, $len
1588         mov     r7, $key
1589         ldr     r8, [ip, #12]           @ load counter LSW
1590         vld1.8  {@XMM[1]}, [ip]         @ load whole counter value
1591 #ifdef __ARMEL__
1592         rev     r8, r8
1593 #endif
1594         sub     sp, sp, #0x10
1595         vst1.8  {@XMM[1]}, [sp]         @ copy counter value
1596         sub     sp, sp, #0x10
1597
1598 .Lctr_enc_short_loop:
1599         add     r0, sp, #0x10           @ input counter value
1600         mov     r1, sp                  @ output on the stack
1601         mov     r2, r7                  @ key
1602
1603         bl      AES_encrypt
1604
1605         vld1.8  {@XMM[0]}, [r4]!        @ load input
1606         vld1.8  {@XMM[1]}, [sp]         @ load encrypted counter
1607         add     r8, r8, #1
1608 #ifdef __ARMEL__
1609         rev     r0, r8
1610         str     r0, [sp, #0x1c]         @ next counter value
1611 #else
1612         str     r8, [sp, #0x1c]         @ next counter value
1613 #endif
1614         veor    @XMM[0],@XMM[0],@XMM[1]
1615         vst1.8  {@XMM[0]}, [r5]!        @ store output
1616         subs    r6, r6, #1
1617         bne     .Lctr_enc_short_loop
1618
1619         vmov.i32        q0, #0
1620         vmov.i32        q1, #0
1621         vstmia          sp!, {q0-q1}
1622
1623         ldmia   sp!, {r4-r8, pc}
1624 .size   bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
1625 ___
1626 }
1627 {
1628 ######################################################################
1629 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1630 #       const AES_KEY *key1, const AES_KEY *key2,
1631 #       const unsigned char iv[16]);
1632 #
     # Register names used by the XTS code in this scope.  The first four
     # receive the arguments that arrive in r0-r3 (the entry code copies
     # them over); key2 and iv are fetched from the caller's stack instead.
1633 my ($inp,$out,$len,$key,$rounds,$magic,$fp)=qw(r7 r8 r9 r10 r1 r2 r3);
1634 my $const='r6';         # returned by _bsaes_key_convert
1635 my $twmask=$XMM[5];     # keeps the .Lxts_magic constant while tweaks are derived
1636 my @T=@XMM[6,7];        # scratch pair for the tweak carry masks
1637
     # ------------------------------------------------------------------
     # bsaes_xts_encrypt -- assembly emitted by the heredocs that follow.
     # The heredoc text is written out verbatim, so these notes are kept
     # on the Perl side.  Register aliases come from the my(...) list
     # above: $inp=r7, $out=r8, $len=r9, $key=r10, $rounds=r1, $magic=r2,
     # $fp=r3.
     #
     # Prologue
     #   * ip remembers the caller's sp, so [ip] and [ip,#4] address the
     #     two stack-passed arguments (key2 and iv per the inline notes).
     #   * r4-r10 and lr are pushed, VFP_ABI_PUSH follows (macro defined
     #     outside this section -- presumably saves VFP registers), and
     #     the resulting sp is parked in r6 until it is moved to $fp.
     #   * r0-r3 are copied to $inp/$out/$len/$key, then a 16-byte slot,
     #     aligned to 16 bytes, is carved out below sp.
     #   * Without XTS_CHAIN_TWEAK, AES_encrypt is called with r0=iv,
     #     r1=sp, r2=key2 and the slot then serves as the initial tweak;
     #     with XTS_CHAIN_TWEAK, [ip] is taken as a pointer to the tweak.
     #
     # Key schedule
     #   * $rounds is read from offset 240 of the key structure.
     #   * Without BSAES_ASM_EXTENDED_KEY the stack grows by
     #     rounds*128+48 bytes; _bsaes_key_convert gets r4=key, r5=rounds,
     #     r12=sp+0x90, which leaves the low 0x90 bytes for nine tweaks.
     #   * With BSAES_ASM_EXTENDED_KEY the word at key+244 gates the
     #     conversion: if it equals 1 the conversion is skipped, otherwise
     #     word^1 is stored back and the schedule is written at key+248;
     #     0x90 bytes of stack are reserved for the tweaks either way.
     #   * Whenever the conversion runs, XMM[7]^XMM[15] is stored through
     #     r12 afterwards (the "last round key" fix-up).
     #
     # Dispatch
     #   XMM[8] receives the initial tweak, $magic the address of
     #   .Lxts_magic, and $len is pre-decremented by 0x80: inputs shorter
     #   than 0x80 bytes go straight to .Lxts_enc_short.
     # ------------------------------------------------------------------
1638 $code.=<<___;
1639 .globl  bsaes_xts_encrypt
1640 .type   bsaes_xts_encrypt,%function
1641 .align  4
1642 bsaes_xts_encrypt:
1643         mov     ip, sp
1644         stmdb   sp!, {r4-r10, lr}               @ 0x20
1645         VFP_ABI_PUSH
1646         mov     r6, sp                          @ future $fp
1647
1648         mov     $inp, r0
1649         mov     $out, r1
1650         mov     $len, r2
1651         mov     $key, r3
1652
1653         sub     r0, sp, #0x10                   @ 0x10
1654         bic     r0, #0xf                        @ align at 16 bytes
1655         mov     sp, r0
1656
1657 #ifdef  XTS_CHAIN_TWEAK
1658         ldr     r0, [ip]                        @ pointer to input tweak
1659 #else
1660         @ generate initial tweak
1661         ldr     r0, [ip, #4]                    @ iv[]
1662         mov     r1, sp
1663         ldr     r2, [ip, #0]                    @ key2
1664         bl      AES_encrypt
1665         mov     r0,sp                           @ pointer to initial tweak
1666 #endif
1667
1668         ldr     $rounds, [$key, #240]           @ get # of rounds
1669         mov     $fp, r6
1670 #ifndef BSAES_ASM_EXTENDED_KEY
1671         @ allocate the key schedule on the stack
1672         sub     r12, sp, $rounds, lsl#7         @ 128 bytes per inner round key
1673         @ add   r12, #`128-32`                  @ size of bit-sliced key schedule
1674         sub     r12, #`32+16`                   @ place for tweak[9]
1675
1676         @ populate the key schedule
1677         mov     r4, $key                        @ pass key
1678         mov     r5, $rounds                     @ pass # of rounds
1679         mov     sp, r12
1680         add     r12, #0x90                      @ pass key schedule
1681         bl      _bsaes_key_convert
1682         veor    @XMM[7], @XMM[7], @XMM[15]      @ fix up last round key
1683         vstmia  r12, {@XMM[7]}                  @ save last round key
1684 #else
1685         ldr     r12, [$key, #244]
1686         eors    r12, #1
1687         beq     0f
1688
1689         str     r12, [$key, #244]
1690         mov     r4, $key                        @ pass key
1691         mov     r5, $rounds                     @ pass # of rounds
1692         add     r12, $key, #248                 @ pass key schedule
1693         bl      _bsaes_key_convert
1694         veor    @XMM[7], @XMM[7], @XMM[15]      @ fix up last round key
1695         vstmia  r12, {@XMM[7]}
1696
1697 .align  2
1698 0:      sub     sp, #0x90                       @ place for tweak[9]
1699 #endif
1700
1701         vld1.8  {@XMM[8]}, [r0]                 @ initial tweak
1702         adr     $magic, .Lxts_magic
1703
1704         subs    $len, #0x80
1705         blo     .Lxts_enc_short
1706         b       .Lxts_enc_loop
1707
1708 .align  4
1709 .Lxts_enc_loop:
1710         vldmia          $magic, {$twmask}       @ load XTS magic
1711         vshr.s64        @T[0], @XMM[8], #63
1712         mov             r0, sp
1713         vand            @T[0], @T[0], $twmask
1714 ___
     # .Lxts_enc_loop: one pass handles 0x80 bytes, i.e. eight blocks.  The
     # preamble emitted above loads the {1, 0x87} constant into $twmask,
     # points r0 at the tweak buffer (sp) and builds the first carry mask
     # from the lane sign bits of XMM[8].
     #
     # The Perl loop below unrolls the tweak chain XMM[9]..XMM[15].  Each
     # step emits:
     #   vadd.u64   XMM[i] = XMM[i-1] + XMM[i-1], a one-bit left shift of
     #              both 64-bit lanes;
     #   vst1.64    the previous tweak XMM[i-1] goes to [r0], r0 advances;
     #   vswp/veor  the mask from the previous step is folded in with its
     #              halves swapped, so the bit shifted out of the low lane
     #              enters the high lane and the bit shifted out of the
     #              high lane becomes the 0x87 feedback in the low lane
     #              (multiplication of the tweak by x);
     #   vshr/vand  the mask for the next step is built from the lane sign
     #              bits of XMM[i].
     #
     # $twmask is XMM[5] and @T is XMM[6..7]; the same registers later
     # receive input blocks 5..7.  The emitted order makes sure each mask
     # is consumed before the corresponding load overwrites it.
1715 for($i=9;$i<16;$i++) {             # $i = index of the tweak register being derived
1716 $code.=<<___;
1717         vadd.u64        @XMM[$i], @XMM[$i-1], @XMM[$i-1]
1718         vst1.64         {@XMM[$i-1]}, [r0,:128]!
1719         vswp            `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
1720         vshr.s64        @T[1], @XMM[$i], #63
1721         veor            @XMM[$i], @XMM[$i], @T[0]
1722         vand            @T[1], @T[1], $twmask
1723 ___
1724         @T=reverse(@T);            # the mask just built as T[1] is T[0] in the next step
1725
1726 $code.=<<___ if ($i>=10);          # steps 10..15 also load input block i-10
1727         vld1.8          {@XMM[$i-10]}, [$inp]!
1728 ___
1729 $code.=<<___ if ($i>=11);          # steps 11..15 XOR block i-11 with its tweak XMM[i-3]
1730         veor            @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
1731 ___
1732 }
     # Remainder of the loop body (emitted once):
     #   * XMM[8] becomes the first tweak of the next pass, derived from
     #     XMM[15]; XMM[15] and then the new XMM[8] are stored, which
     #     fills slots 7 and 8 of the nine-slot tweak buffer.
     #   * Blocks 6 and 7 are loaded into XMM[6]-XMM[7], and blocks 5..7
     #     are XORed with XMM[13..15].
     #   * _bsaes_encrypt8 is called with r4 = key schedule, r5 = rounds.
     #   * r0 is reset to sp before the call and used right after it to
     #     reload the eight tweaks.  This relies on _bsaes_encrypt8 leaving
     #     r0 alone; the reload suggests XMM[8..15] do not survive the
     #     call (not visible in this section).
     #   * Results are picked up in the register order XMM[0], [1], [4],
     #     [6], [3], [7], [2], [5], XORed with tweaks 0..7 and written to
     #     $out.
     #   * XMM[8] is reloaded from slot 8 and the loop repeats while
     #     $len - 0x80 stays non-negative (bpl).
     #
     # .Lxts_enc_short: $len arrives biased by -0x80.  Adding 0x70 leaves
     # (bytes left - 0x10), so a negative value means no whole block is
     # left and control goes to .Lxts_enc_done.  Otherwise the tweak chain
     # is started again exactly as at the top of the main loop.
1733 $code.=<<___;
1734         vadd.u64        @XMM[8], @XMM[15], @XMM[15]
1735         vst1.64         {@XMM[15]}, [r0,:128]!
1736         vswp            `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
1737         veor            @XMM[8], @XMM[8], @T[0]
1738         vst1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
1739
1740         vld1.8          {@XMM[6]-@XMM[7]}, [$inp]!
1741         veor            @XMM[5], @XMM[5], @XMM[13]
1742 #ifndef BSAES_ASM_EXTENDED_KEY
1743         add             r4, sp, #0x90                   @ pass key schedule
1744 #else
1745         add             r4, $key, #248                  @ pass key schedule
1746 #endif
1747         veor            @XMM[6], @XMM[6], @XMM[14]
1748         mov             r5, $rounds                     @ pass rounds
1749         veor            @XMM[7], @XMM[7], @XMM[15]
1750         mov             r0, sp
1751
1752         bl              _bsaes_encrypt8
1753
1754         vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1755         vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
1756         veor            @XMM[0], @XMM[0], @XMM[ 8]
1757         vld1.64         {@XMM[12]-@XMM[13]}, [r0,:128]!
1758         veor            @XMM[1], @XMM[1], @XMM[ 9]
1759         veor            @XMM[8], @XMM[4], @XMM[10]
1760         vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
1761         veor            @XMM[9], @XMM[6], @XMM[11]
1762         vld1.64         {@XMM[14]-@XMM[15]}, [r0,:128]!
1763         veor            @XMM[10], @XMM[3], @XMM[12]
1764         vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
1765         veor            @XMM[11], @XMM[7], @XMM[13]
1766         veor            @XMM[12], @XMM[2], @XMM[14]
1767         vst1.8          {@XMM[10]-@XMM[11]}, [$out]!
1768         veor            @XMM[13], @XMM[5], @XMM[15]
1769         vst1.8          {@XMM[12]-@XMM[13]}, [$out]!
1770
1771         vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
1772
1773         subs            $len, #0x80
1774         bpl             .Lxts_enc_loop
1775
1776 .Lxts_enc_short:
1777         adds            $len, #0x70
1778         bmi             .Lxts_enc_done
1779
1780         vldmia          $magic, {$twmask}       @ load XTS magic
1781         vshr.s64        @T[0], @XMM[8], #63
1782         mov             r0, sp
1783         vand            @T[0], @T[0], $twmask
1784 ___
     # Tail variant of the unrolled chain (at most seven whole blocks are
     # left).  The per-step tweak code is the same as in the main loop, but
     # after each input load $len drops by 0x10 and, once it goes negative,
     # the code branches to .Lxts_enc_N with N = i-9 = number of blocks
     # loaded so far.  At that point tweaks XMM[8]..XMM[i-1] are already in
     # buffer slots 0..N, slot N being the tweak that follows the last
     # block.  The branch sits before the XOR of block i-11, so on arrival
     # blocks 0..N-3 are already XORed with their tweaks and each handler
     # starts by finishing blocks N-2 and N-1.
1785 for($i=9;$i<16;$i++) {             # $i = index of the tweak register being derived
1786 $code.=<<___;
1787         vadd.u64        @XMM[$i], @XMM[$i-1], @XMM[$i-1]
1788         vst1.64         {@XMM[$i-1]}, [r0,:128]!
1789         vswp            `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
1790         vshr.s64        @T[1], @XMM[$i], #63
1791         veor            @XMM[$i], @XMM[$i], @T[0]
1792         vand            @T[1], @T[1], $twmask
1793 ___
1794         @T=reverse(@T);            # the mask just built as T[1] is T[0] in the next step
1795
1796 $code.=<<___ if ($i>=10);          # load block i-10, leave for .Lxts_enc_(i-9) when input runs out
1797         vld1.8          {@XMM[$i-10]}, [$inp]!
1798         subs            $len, #0x10
1799         bmi             .Lxts_enc_`$i-9`
1800 ___
1801 $code.=<<___ if ($i>=11);          # still here: XOR block i-11 with its tweak XMM[i-3]
1802         veor            @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
1803 ___
1804 }
     # Everything from here to the .size directive comes from one heredoc.
     #
     # Seven-block fall-through: $len drops by a further 0x10, XMM[15] is
     # stored in slot 7 as the following tweak, block 6 is loaded into
     # XMM[6], blocks 5 and 6 are XORed with XMM[13] and XMM[14], and
     # _bsaes_encrypt8 runs.  Seven tweaks are reloaded, the results
     # XMM[0],[1],[4],[6],[3],[7],[2] are XORed with them and stored, and
     # XMM[8] is reloaded from slot 7.
     #
     # .Lxts_enc_6 ... .Lxts_enc_2: the same pattern for N blocks -- XOR
     # the last two loaded blocks with their tweaks, pass r4/r5, call
     # _bsaes_encrypt8, reload N tweaks through r0, XOR and store the first
     # N results (same register order as above), then load slot N into
     # XMM[8] as the following tweak.
     #
     # .Lxts_magic: the 16-byte constant (.quad 1, 0x87) sits between the
     # handlers; per its inline note this keeps it within adr range in
     # both ARM and Thumb modes.
     #
     # .Lxts_enc_1: a single block goes through AES_encrypt by way of a
     # stack copy (r0 = r1 = sp, r2 = $key).  $fp (r3) is kept in r4 across
     # the call, XMM[8] is used again after the call (so it is expected to
     # survive AES_encrypt), and XMM[9] becomes the following tweak.
     #
     # .Lxts_enc_done (body compiled only without XTS_CHAIN_TWEAK):
     # $len + 0x10 is the number of trailing bytes; zero means finished.
     # Otherwise ciphertext stealing is performed: r6 = $out - 0x10 marks
     # the last full ciphertext block; for every trailing byte the input
     # byte replaces the matching byte of that block while the displaced
     # ciphertext byte is written to the output tail.  The patched block at
     # r6 is then XORed with XMM[8], run through AES_encrypt, XORed with
     # XMM[8] again and stored back at r6.
     #
     # .Lxts_enc_ret: r0 = $fp with its low four bits cleared; zeroes are
     # stored 32 bytes at a time from sp upwards until sp reaches r0, which
     # wipes the tweak buffer and any key schedule kept on the stack.  sp
     # is restored from $fp; with XTS_CHAIN_TWEAK the final XMM[8] is
     # written through the pointer loaded from [$fp, #0x20+VFP_ABI_FRAME].
     # VFP_ABI_POP and the pop of r4-r10 and pc complete the return.
1805 $code.=<<___;
1806         sub             $len, #0x10
1807         vst1.64         {@XMM[15]}, [r0,:128]           @ next round tweak
1808
1809         vld1.8          {@XMM[6]}, [$inp]!
1810         veor            @XMM[5], @XMM[5], @XMM[13]
1811 #ifndef BSAES_ASM_EXTENDED_KEY
1812         add             r4, sp, #0x90                   @ pass key schedule
1813 #else
1814         add             r4, $key, #248                  @ pass key schedule
1815 #endif
1816         veor            @XMM[6], @XMM[6], @XMM[14]
1817         mov             r5, $rounds                     @ pass rounds
1818         mov             r0, sp
1819
1820         bl              _bsaes_encrypt8
1821
1822         vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1823         vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
1824         veor            @XMM[0], @XMM[0], @XMM[ 8]
1825         vld1.64         {@XMM[12]-@XMM[13]}, [r0,:128]!
1826         veor            @XMM[1], @XMM[1], @XMM[ 9]
1827         veor            @XMM[8], @XMM[4], @XMM[10]
1828         vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
1829         veor            @XMM[9], @XMM[6], @XMM[11]
1830         vld1.64         {@XMM[14]}, [r0,:128]!
1831         veor            @XMM[10], @XMM[3], @XMM[12]
1832         vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
1833         veor            @XMM[11], @XMM[7], @XMM[13]
1834         veor            @XMM[12], @XMM[2], @XMM[14]
1835         vst1.8          {@XMM[10]-@XMM[11]}, [$out]!
1836         vst1.8          {@XMM[12]}, [$out]!
1837
1838         vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
1839         b               .Lxts_enc_done
1840 .align  4
1841 .Lxts_enc_6:
1842         veor            @XMM[4], @XMM[4], @XMM[12]
1843 #ifndef BSAES_ASM_EXTENDED_KEY
1844         add             r4, sp, #0x90                   @ pass key schedule
1845 #else
1846         add             r4, $key, #248                  @ pass key schedule
1847 #endif
1848         veor            @XMM[5], @XMM[5], @XMM[13]
1849         mov             r5, $rounds                     @ pass rounds
1850         mov             r0, sp
1851
1852         bl              _bsaes_encrypt8
1853
1854         vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1855         vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
1856         veor            @XMM[0], @XMM[0], @XMM[ 8]
1857         vld1.64         {@XMM[12]-@XMM[13]}, [r0,:128]!
1858         veor            @XMM[1], @XMM[1], @XMM[ 9]
1859         veor            @XMM[8], @XMM[4], @XMM[10]
1860         vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
1861         veor            @XMM[9], @XMM[6], @XMM[11]
1862         veor            @XMM[10], @XMM[3], @XMM[12]
1863         vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
1864         veor            @XMM[11], @XMM[7], @XMM[13]
1865         vst1.8          {@XMM[10]-@XMM[11]}, [$out]!
1866
1867         vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
1868         b               .Lxts_enc_done
1869
1870 @ put this in range for both ARM and Thumb mode adr instructions
1871 .align  5
1872 .Lxts_magic:
1873         .quad   1, 0x87
1874
1875 .align  5
1876 .Lxts_enc_5:
1877         veor            @XMM[3], @XMM[3], @XMM[11]
1878 #ifndef BSAES_ASM_EXTENDED_KEY
1879         add             r4, sp, #0x90                   @ pass key schedule
1880 #else
1881         add             r4, $key, #248                  @ pass key schedule
1882 #endif
1883         veor            @XMM[4], @XMM[4], @XMM[12]
1884         mov             r5, $rounds                     @ pass rounds
1885         mov             r0, sp
1886
1887         bl              _bsaes_encrypt8
1888
1889         vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1890         vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
1891         veor            @XMM[0], @XMM[0], @XMM[ 8]
1892         vld1.64         {@XMM[12]}, [r0,:128]!
1893         veor            @XMM[1], @XMM[1], @XMM[ 9]
1894         veor            @XMM[8], @XMM[4], @XMM[10]
1895         vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
1896         veor            @XMM[9], @XMM[6], @XMM[11]
1897         veor            @XMM[10], @XMM[3], @XMM[12]
1898         vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
1899         vst1.8          {@XMM[10]}, [$out]!
1900
1901         vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
1902         b               .Lxts_enc_done
1903 .align  4
1904 .Lxts_enc_4:
1905         veor            @XMM[2], @XMM[2], @XMM[10]
1906 #ifndef BSAES_ASM_EXTENDED_KEY
1907         add             r4, sp, #0x90                   @ pass key schedule
1908 #else
1909         add             r4, $key, #248                  @ pass key schedule
1910 #endif
1911         veor            @XMM[3], @XMM[3], @XMM[11]
1912         mov             r5, $rounds                     @ pass rounds
1913         mov             r0, sp
1914
1915         bl              _bsaes_encrypt8
1916
1917         vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1918         vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
1919         veor            @XMM[0], @XMM[0], @XMM[ 8]
1920         veor            @XMM[1], @XMM[1], @XMM[ 9]
1921         veor            @XMM[8], @XMM[4], @XMM[10]
1922         vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
1923         veor            @XMM[9], @XMM[6], @XMM[11]
1924         vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
1925
1926         vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
1927         b               .Lxts_enc_done
1928 .align  4
1929 .Lxts_enc_3:
1930         veor            @XMM[1], @XMM[1], @XMM[9]
1931 #ifndef BSAES_ASM_EXTENDED_KEY
1932         add             r4, sp, #0x90                   @ pass key schedule
1933 #else
1934         add             r4, $key, #248                  @ pass key schedule
1935 #endif
1936         veor            @XMM[2], @XMM[2], @XMM[10]
1937         mov             r5, $rounds                     @ pass rounds
1938         mov             r0, sp
1939
1940         bl              _bsaes_encrypt8
1941
1942         vld1.64         {@XMM[8]-@XMM[9]}, [r0,:128]!
1943         vld1.64         {@XMM[10]}, [r0,:128]!
1944         veor            @XMM[0], @XMM[0], @XMM[ 8]
1945         veor            @XMM[1], @XMM[1], @XMM[ 9]
1946         veor            @XMM[8], @XMM[4], @XMM[10]
1947         vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
1948         vst1.8          {@XMM[8]}, [$out]!
1949
1950         vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
1951         b               .Lxts_enc_done
1952 .align  4
1953 .Lxts_enc_2:
1954         veor            @XMM[0], @XMM[0], @XMM[8]
1955 #ifndef BSAES_ASM_EXTENDED_KEY
1956         add             r4, sp, #0x90                   @ pass key schedule
1957 #else
1958         add             r4, $key, #248                  @ pass key schedule
1959 #endif
1960         veor            @XMM[1], @XMM[1], @XMM[9]
1961         mov             r5, $rounds                     @ pass rounds
1962         mov             r0, sp
1963
1964         bl              _bsaes_encrypt8
1965
1966         vld1.64         {@XMM[8]-@XMM[9]}, [r0,:128]!
1967         veor            @XMM[0], @XMM[0], @XMM[ 8]
1968         veor            @XMM[1], @XMM[1], @XMM[ 9]
1969         vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
1970
1971         vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
1972         b               .Lxts_enc_done
1973 .align  4
1974 .Lxts_enc_1:
1975         mov             r0, sp
1976         veor            @XMM[0], @XMM[0], @XMM[8]
1977         mov             r1, sp
1978         vst1.8          {@XMM[0]}, [sp,:128]
1979         mov             r2, $key
1980         mov             r4, $fp                         @ preserve fp
1981
1982         bl              AES_encrypt
1983
1984         vld1.8          {@XMM[0]}, [sp,:128]
1985         veor            @XMM[0], @XMM[0], @XMM[8]
1986         vst1.8          {@XMM[0]}, [$out]!
1987         mov             $fp, r4
1988
1989         vmov            @XMM[8], @XMM[9]                @ next round tweak
1990
1991 .Lxts_enc_done:
1992 #ifndef XTS_CHAIN_TWEAK
1993         adds            $len, #0x10
1994         beq             .Lxts_enc_ret
1995         sub             r6, $out, #0x10
1996
1997 .Lxts_enc_steal:
1998         ldrb            r0, [$inp], #1
1999         ldrb            r1, [$out, #-0x10]
2000         strb            r0, [$out, #-0x10]
2001         strb            r1, [$out], #1
2002
2003         subs            $len, #1
2004         bhi             .Lxts_enc_steal
2005
2006         vld1.8          {@XMM[0]}, [r6]
2007         mov             r0, sp
2008         veor            @XMM[0], @XMM[0], @XMM[8]
2009         mov             r1, sp
2010         vst1.8          {@XMM[0]}, [sp,:128]
2011         mov             r2, $key
2012         mov             r4, $fp                 @ preserve fp
2013
2014         bl              AES_encrypt
2015
2016         vld1.8          {@XMM[0]}, [sp,:128]
2017         veor            @XMM[0], @XMM[0], @XMM[8]
2018         vst1.8          {@XMM[0]}, [r6]
2019         mov             $fp, r4
2020 #endif
2021
2022 .Lxts_enc_ret:
2023         bic             r0, $fp, #0xf
2024         vmov.i32        q0, #0
2025         vmov.i32        q1, #0
2026 #ifdef  XTS_CHAIN_TWEAK
2027         ldr             r1, [$fp, #0x20+VFP_ABI_FRAME]  @ chain tweak
2028 #endif
2029 .Lxts_enc_bzero:                                @ wipe key schedule [if any]
2030         vstmia          sp!, {q0-q1}
2031         cmp             sp, r0
2032         bne             .Lxts_enc_bzero
2033
2034         mov             sp, $fp
2035 #ifdef  XTS_CHAIN_TWEAK
2036         vst1.8          {@XMM[8]}, [r1]
2037 #endif
2038         VFP_ABI_POP
2039         ldmia           sp!, {r4-r10, pc}       @ return
2040
2041 .size   bsaes_xts_encrypt,.-bsaes_xts_encrypt
2042
2043 .globl  bsaes_xts_decrypt
2044 .type   bsaes_xts_decrypt,%function
2045 .align  4
2046 bsaes_xts_decrypt:
2047         mov     ip, sp
2048         stmdb   sp!, {r4-r10, lr}               @ 0x20
2049         VFP_ABI_PUSH
2050         mov     r6, sp                          @ future $fp
2051
2052         mov     $inp, r0
2053         mov     $out, r1
2054         mov     $len, r2
2055         mov     $key, r3
2056
2057         sub     r0, sp, #0x10                   @ 0x10
2058         bic     r0, #0xf                        @ align at 16 bytes
2059         mov     sp, r0
2060
2061 #ifdef  XTS_CHAIN_TWEAK
2062         ldr     r0, [ip]                        @ pointer to input tweak
2063 #else
2064         @ generate initial tweak
2065         ldr     r0, [ip, #4]                    @ iv[]
2066         mov     r1, sp
2067         ldr     r2, [ip, #0]                    @ key2
2068         bl      AES_encrypt
2069         mov     r0, sp                          @ pointer to initial tweak
2070 #endif
2071
2072         ldr     $rounds, [$key, #240]           @ get # of rounds
2073         mov     $fp, r6
2074 #ifndef BSAES_ASM_EXTENDED_KEY
2075         @ allocate the key schedule on the stack
2076         sub     r12, sp, $rounds, lsl#7         @ 128 bytes per inner round key
2077         @ add   r12, #`128-32`                  @ size of bit-sliced key schedule
2078         sub     r12, #`32+16`                   @ place for tweak[9]
2079
2080         @ populate the key schedule
2081         mov     r4, $key                        @ pass key
2082         mov     r5, $rounds                     @ pass # of rounds
2083         mov     sp, r12
2084         add     r12, #0x90                      @ pass key schedule
2085         bl      _bsaes_key_convert
2086         add     r4, sp, #0x90
2087         vldmia  r4, {@XMM[6]}
2088         vstmia  r12,  {@XMM[15]}                @ save last round key
2089         veor    @XMM[7], @XMM[7], @XMM[6]       @ fix up round 0 key
2090         vstmia  r4, {@XMM[7]}
2091 #else
2092         ldr     r12, [$key, #244]
2093         eors    r12, #1
2094         beq     0f
2095
2096         str     r12, [$key, #244]
2097         mov     r4, $key                        @ pass key
2098         mov     r5, $rounds                     @ pass # of rounds
2099         add     r12, $key, #248                 @ pass key schedule
2100         bl      _bsaes_key_convert
2101         add     r4, $key, #248
2102         vldmia  r4, {@XMM[6]}
2103         vstmia  r12,  {@XMM[15]}                @ save last round key
2104         veor    @XMM[7], @XMM[7], @XMM[6]       @ fix up round 0 key
2105         vstmia  r4, {@XMM[7]}
2106
2107 .align  2
2108 0:      sub     sp, #0x90                       @ place for tweak[9]
2109 #endif
2110         vld1.8  {@XMM[8]}, [r0]                 @ initial tweak
2111         adr     $magic, .Lxts_magic
2112
2113 #ifndef XTS_CHAIN_TWEAK
2114         tst     $len, #0xf                      @ if not multiple of 16
2115         it      ne                              @ Thumb2 thing, sanity check in ARM
2116         subne   $len, #0x10                     @ subtract another 16 bytes
2117 #endif
2118         subs    $len, #0x80
2119
2120         blo     .Lxts_dec_short
2121         b       .Lxts_dec_loop
2122
2123 .align  4
2124 .Lxts_dec_loop:
2125         vldmia          $magic, {$twmask}       @ load XTS magic
2126         vshr.s64        @T[0], @XMM[8], #63
2127         mov             r0, sp
2128         vand            @T[0], @T[0], $twmask
2129 ___
2130 for($i=9;$i<16;$i++) {
2131 $code.=<<___;
2132         vadd.u64        @XMM[$i], @XMM[$i-1], @XMM[$i-1]
2133         vst1.64         {@XMM[$i-1]}, [r0,:128]!
2134         vswp            `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
2135         vshr.s64        @T[1], @XMM[$i], #63
2136         veor            @XMM[$i], @XMM[$i], @T[0]
2137         vand            @T[1], @T[1], $twmask
2138 ___
2139         @T=reverse(@T);
2140
2141 $code.=<<___ if ($i>=10);
2142         vld1.8          {@XMM[$i-10]}, [$inp]!
2143 ___
2144 $code.=<<___ if ($i>=11);
2145         veor            @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
2146 ___
2147 }
2148 $code.=<<___;
2149         vadd.u64        @XMM[8], @XMM[15], @XMM[15]
2150         vst1.64         {@XMM[15]}, [r0,:128]!
2151         vswp            `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
2152         veor            @XMM[8], @XMM[8], @T[0]
2153         vst1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
2154
2155         vld1.8          {@XMM[6]-@XMM[7]}, [$inp]!
2156         veor            @XMM[5], @XMM[5], @XMM[13]
2157 #ifndef BSAES_ASM_EXTENDED_KEY
2158         add             r4, sp, #0x90                   @ pass key schedule
2159 #else
2160         add             r4, $key, #248                  @ pass key schedule
2161 #endif
2162         veor            @XMM[6], @XMM[6], @XMM[14]
2163         mov             r5, $rounds                     @ pass rounds
2164         veor            @XMM[7], @XMM[7], @XMM[15]
2165         mov             r0, sp
2166
2167         bl              _bsaes_decrypt8
2168
2169         vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2170         vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
2171         veor            @XMM[0], @XMM[0], @XMM[ 8]
2172         vld1.64         {@XMM[12]-@XMM[13]}, [r0,:128]!
2173         veor            @XMM[1], @XMM[1], @XMM[ 9]
2174         veor            @XMM[8], @XMM[6], @XMM[10]
2175         vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
2176         veor            @XMM[9], @XMM[4], @XMM[11]
2177         vld1.64         {@XMM[14]-@XMM[15]}, [r0,:128]!
2178         veor            @XMM[10], @XMM[2], @XMM[12]
2179         vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
2180         veor            @XMM[11], @XMM[7], @XMM[13]
2181         veor            @XMM[12], @XMM[3], @XMM[14]
2182         vst1.8          {@XMM[10]-@XMM[11]}, [$out]!
2183         veor            @XMM[13], @XMM[5], @XMM[15]
2184         vst1.8          {@XMM[12]-@XMM[13]}, [$out]!
2185
2186         vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
2187
2188         subs            $len, #0x80
2189         bpl             .Lxts_dec_loop
2190
2191 .Lxts_dec_short:
2192         adds            $len, #0x70
2193         bmi             .Lxts_dec_done
2194
2195         vldmia          $magic, {$twmask}       @ load XTS magic
2196         vshr.s64        @T[0], @XMM[8], #63
2197         mov             r0, sp
2198         vand            @T[0], @T[0], $twmask
2199 ___
2200 for($i=9;$i<16;$i++) {
2201 $code.=<<___;
2202         vadd.u64        @XMM[$i], @XMM[$i-1], @XMM[$i-1]
2203         vst1.64         {@XMM[$i-1]}, [r0,:128]!
2204         vswp            `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
2205         vshr.s64        @T[1], @XMM[$i], #63
2206         veor            @XMM[$i], @XMM[$i], @T[0]
2207         vand            @T[1], @T[1], $twmask
2208 ___
2209         @T=reverse(@T);
2210
2211 $code.=<<___ if ($i>=10);
2212         vld1.8          {@XMM[$i-10]}, [$inp]!
2213         subs            $len, #0x10
2214         bmi             .Lxts_dec_`$i-9`
2215 ___
2216 $code.=<<___ if ($i>=11);
2217         veor            @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
2218 ___
2219 }
2220 $code.=<<___;
2221         sub             $len, #0x10
2222         vst1.64         {@XMM[15]}, [r0,:128]           @ next round tweak
2223
2224         vld1.8          {@XMM[6]}, [$inp]!
2225         veor            @XMM[5], @XMM[5], @XMM[13]
2226 #ifndef BSAES_ASM_EXTENDED_KEY
2227         add             r4, sp, #0x90                   @ pass key schedule
2228 #else
2229         add             r4, $key, #248                  @ pass key schedule
2230 #endif
2231         veor            @XMM[6], @XMM[6], @XMM[14]
2232         mov             r5, $rounds                     @ pass rounds
2233         mov             r0, sp
2234
2235         bl              _bsaes_decrypt8
2236
2237         vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2238         vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
2239         veor            @XMM[0], @XMM[0], @XMM[ 8]
2240         vld1.64         {@XMM[12]-@XMM[13]}, [r0,:128]!
2241         veor            @XMM[1], @XMM[1], @XMM[ 9]
2242         veor            @XMM[8], @XMM[6], @XMM[10]
2243         vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
2244         veor            @XMM[9], @XMM[4], @XMM[11]
2245         vld1.64         {@XMM[14]}, [r0,:128]!
2246         veor            @XMM[10], @XMM[2], @XMM[12]
2247         vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
2248         veor            @XMM[11], @XMM[7], @XMM[13]
2249         veor            @XMM[12], @XMM[3], @XMM[14]
2250         vst1.8          {@XMM[10]-@XMM[11]}, [$out]!
2251         vst1.8          {@XMM[12]}, [$out]!
2252
2253         vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
2254         b               .Lxts_dec_done
2255 .align  4
2256 .Lxts_dec_6:
2257         vst1.64         {@XMM[14]}, [r0,:128]           @ next round tweak
2258
2259         veor            @XMM[4], @XMM[4], @XMM[12]
2260 #ifndef BSAES_ASM_EXTENDED_KEY
2261         add             r4, sp, #0x90                   @ pass key schedule
2262 #else
2263         add             r4, $key, #248                  @ pass key schedule
2264 #endif
2265         veor            @XMM[5], @XMM[5], @XMM[13]
2266         mov             r5, $rounds                     @ pass rounds
2267         mov             r0, sp
2268
2269         bl              _bsaes_decrypt8
2270
2271         vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2272         vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
2273         veor            @XMM[0], @XMM[0], @XMM[ 8]
2274         vld1.64         {@XMM[12]-@XMM[13]}, [r0,:128]!
2275         veor            @XMM[1], @XMM[1], @XMM[ 9]
2276         veor            @XMM[8], @XMM[6], @XMM[10]
2277         vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
2278         veor            @XMM[9], @XMM[4], @XMM[11]
2279         veor            @XMM[10], @XMM[2], @XMM[12]
2280         vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
2281         veor            @XMM[11], @XMM[7], @XMM[13]
2282         vst1.8          {@XMM[10]-@XMM[11]}, [$out]!
2283
2284         vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
2285         b               .Lxts_dec_done
2286 .align  4
2287 .Lxts_dec_5:
2288         veor            @XMM[3], @XMM[3], @XMM[11]
2289 #ifndef BSAES_ASM_EXTENDED_KEY
2290         add             r4, sp, #0x90                   @ pass key schedule
2291 #else
2292         add             r4, $key, #248                  @ pass key schedule
2293 #endif
2294         veor            @XMM[4], @XMM[4], @XMM[12]
2295         mov             r5, $rounds                     @ pass rounds
2296         mov             r0, sp
2297
2298         bl              _bsaes_decrypt8
2299
2300         vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2301         vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
2302         veor            @XMM[0], @XMM[0], @XMM[ 8]
2303         vld1.64         {@XMM[12]}, [r0,:128]!
2304         veor            @XMM[1], @XMM[1], @XMM[ 9]
2305         veor            @XMM[8], @XMM[6], @XMM[10]
2306         vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
2307         veor            @XMM[9], @XMM[4], @XMM[11]
2308         veor            @XMM[10], @XMM[2], @XMM[12]
2309         vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
2310         vst1.8          {@XMM[10]}, [$out]!
2311
2312         vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
2313         b               .Lxts_dec_done
2314 .align  4
2315 .Lxts_dec_4:
2316         veor            @XMM[2], @XMM[2], @XMM[10]
2317 #ifndef BSAES_ASM_EXTENDED_KEY
2318         add             r4, sp, #0x90                   @ pass key schedule
2319 #else
2320         add             r4, $key, #248                  @ pass key schedule
2321 #endif
2322         veor            @XMM[3], @XMM[3], @XMM[11]
2323         mov             r5, $rounds                     @ pass rounds
2324         mov             r0, sp
2325
2326         bl              _bsaes_decrypt8
2327
2328         vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2329         vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
2330         veor            @XMM[0], @XMM[0], @XMM[ 8]
2331         veor            @XMM[1], @XMM[1], @XMM[ 9]
2332         veor            @XMM[8], @XMM[6], @XMM[10]
2333         vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
2334         veor            @XMM[9], @XMM[4], @XMM[11]
2335         vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
2336
2337         vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
2338         b               .Lxts_dec_done
2339 .align  4
2340 .Lxts_dec_3:
2341         veor            @XMM[1], @XMM[1], @XMM[9]
2342 #ifndef BSAES_ASM_EXTENDED_KEY
2343         add             r4, sp, #0x90                   @ pass key schedule
2344 #else
2345         add             r4, $key, #248                  @ pass key schedule
2346 #endif
2347         veor            @XMM[2], @XMM[2], @XMM[10]
2348         mov             r5, $rounds                     @ pass rounds
2349         mov             r0, sp
2350
2351         bl              _bsaes_decrypt8
2352
2353         vld1.64         {@XMM[8]-@XMM[9]}, [r0,:128]!
2354         vld1.64         {@XMM[10]}, [r0,:128]!
2355         veor            @XMM[0], @XMM[0], @XMM[ 8]
2356         veor            @XMM[1], @XMM[1], @XMM[ 9]
2357         veor            @XMM[8], @XMM[6], @XMM[10]
2358         vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
2359         vst1.8          {@XMM[8]}, [$out]!
2360
2361         vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
2362         b               .Lxts_dec_done
2363 .align  4
2364 .Lxts_dec_2:
2365         veor            @XMM[0], @XMM[0], @XMM[8]
2366 #ifndef BSAES_ASM_EXTENDED_KEY
2367         add             r4, sp, #0x90                   @ pass key schedule
2368 #else
2369         add             r4, $key, #248                  @ pass key schedule
2370 #endif
2371         veor            @XMM[1], @XMM[1], @XMM[9]
2372         mov             r5, $rounds                     @ pass rounds
2373         mov             r0, sp
2374
2375         bl              _bsaes_decrypt8
2376
2377         vld1.64         {@XMM[8]-@XMM[9]}, [r0,:128]!
2378         veor            @XMM[0], @XMM[0], @XMM[ 8]
2379         veor            @XMM[1], @XMM[1], @XMM[ 9]
2380         vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
2381
2382         vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
2383         b               .Lxts_dec_done
2384 .align  4
2385 .Lxts_dec_1:
2386         mov             r0, sp
2387         veor            @XMM[0], @XMM[0], @XMM[8]
2388         mov             r1, sp
2389         vst1.8          {@XMM[0]}, [sp,:128]
2390         mov             r5, $magic                      @ preserve magic
2391         mov             r2, $key
2392         mov             r4, $fp                         @ preserve fp
2393
2394         bl              AES_decrypt
2395
2396         vld1.8          {@XMM[0]}, [sp,:128]
2397         veor            @XMM[0], @XMM[0], @XMM[8]
2398         vst1.8          {@XMM[0]}, [$out]!
2399         mov             $fp, r4
2400         mov             $magic, r5
2401
2402         vmov            @XMM[8], @XMM[9]                @ next round tweak
2403
2404 .Lxts_dec_done:
2405 #ifndef XTS_CHAIN_TWEAK
2406         adds            $len, #0x10
2407         beq             .Lxts_dec_ret
2408
2409         @ calculate one round of extra tweak for the stolen ciphertext
2410         vldmia          $magic, {$twmask}
2411         vshr.s64        @XMM[6], @XMM[8], #63
2412         vand            @XMM[6], @XMM[6], $twmask
2413         vadd.u64        @XMM[9], @XMM[8], @XMM[8]
2414         vswp            `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
2415         veor            @XMM[9], @XMM[9], @XMM[6]
2416
2417         @ perform the final decryption with the last tweak value
2418         vld1.8          {@XMM[0]}, [$inp]!
2419         mov             r0, sp
2420         veor            @XMM[0], @XMM[0], @XMM[9]
2421         mov             r1, sp
2422         vst1.8          {@XMM[0]}, [sp,:128]
2423         mov             r2, $key
2424         mov             r4, $fp                 @ preserve fp
2425
2426         bl              AES_decrypt
2427
2428         vld1.8          {@XMM[0]}, [sp,:128]
2429         veor            @XMM[0], @XMM[0], @XMM[9]
2430         vst1.8          {@XMM[0]}, [$out]
2431
2432         mov             r6, $out
2433 .Lxts_dec_steal:
2434         ldrb            r1, [$out]
2435         ldrb            r0, [$inp], #1
2436         strb            r1, [$out, #0x10]
2437         strb            r0, [$out], #1
2438
2439         subs            $len, #1
2440         bhi             .Lxts_dec_steal
2441
2442         vld1.8          {@XMM[0]}, [r6]
2443         mov             r0, sp
2444         veor            @XMM[0], @XMM[8]
2445         mov             r1, sp
2446         vst1.8          {@XMM[0]}, [sp,:128]
2447         mov             r2, $key
2448
2449         bl              AES_decrypt
2450
2451         vld1.8          {@XMM[0]}, [sp,:128]
2452         veor            @XMM[0], @XMM[0], @XMM[8]
2453         vst1.8          {@XMM[0]}, [r6]
2454         mov             $fp, r4
2455 #endif
2456
2457 .Lxts_dec_ret:
2458         bic             r0, $fp, #0xf
2459         vmov.i32        q0, #0
2460         vmov.i32        q1, #0
2461 #ifdef  XTS_CHAIN_TWEAK
2462         ldr             r1, [$fp, #0x20+VFP_ABI_FRAME]  @ chain tweak
2463 #endif
2464 .Lxts_dec_bzero:                                @ wipe key schedule [if any]
2465         vstmia          sp!, {q0-q1}
2466         cmp             sp, r0
2467         bne             .Lxts_dec_bzero
2468
2469         mov             sp, $fp
2470 #ifdef  XTS_CHAIN_TWEAK
2471         vst1.8          {@XMM[8]}, [r1]
2472 #endif
2473         VFP_ABI_POP
2474         ldmia           sp!, {r4-r10, pc}       @ return
2475
2476 .size   bsaes_xts_decrypt,.-bsaes_xts_decrypt
2477 ___
2478 }
# Append the final "#endif" to the generated assembly.  It presumably
# balances a preprocessor conditional opened earlier in $code -- the
# opening directive is outside this part of the file.
$code.=<<___;
#endif
___

# Expand every `...` span in the accumulated assembly by evaluating its
# contents as Perl, e.g. the &Dhi()/&Dlo() register-name helpers used in
# the heredocs above.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# Copy this script's leading comment block into the output, turning the
# Perl '#' comment leader into the '@' leader used by the assembly
# comments.  The shebang line is skipped, and copying stops at the first
# line that is neither a comment nor empty.
#
# Three-argument open on a lexical handle keeps the path in $0 from being
# interpreted as an open mode.  The banner is best-effort: failing to
# read it only costs the header comment, so warn instead of aborting.
if (open(my $self, '<', $0)) {
        while (<$self>) {
                next if (/^#!/);
                last if (!s/^#/@/ and !/^$/);
                print;
        }
        close $self;
} else {
        warn "can't open $0 to copy header comment: $!";
}

print $code;

# Output is buffered, so write errors (disk full, broken pipe, ...) may
# only surface at close time.  Fail loudly rather than leave a silently
# truncated assembly file behind with a zero exit status.
close STDOUT or die "error closing STDOUT: $!";