2 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements support for ARMv8 AES instructions. The
18 # module is endian-agnostic in sense that it supports both big- and
# little-endian cases. It likewise supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the amount of utilized
21 # registers to 16, which implies additional NEON load and integer
22 # instructions. This has no effect on mighty Apple A7, where results
23 # are literally equal to the theoretical estimates based on AES
24 # instruction latencies and issue rates. On Cortex-A53, an in-order
25 # execution core, this costs up to 10-15%, which is partially
26 # compensated by implementing dedicated code path for 128-bit
27 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
28 # seems to be limited by sheer amount of NEON instructions...
32 # Key to performance of parallelize-able modes is round instruction
33 # interleaving. But which factor to use? There is optimal one for
34 # each combination of instruction latency and issue rate, beyond
35 # which increasing interleave factor doesn't pay off. While on cons
36 # side we have code size increase and resource waste on platforms for
37 # which interleave factor is too high. In other words you want it to
38 # be just right. So far interleave factor of 3x was serving well all
39 # platforms. But for ThunderX2 optimal interleave factor was measured
42 # Performance in cycles per byte processed with 128-bit key:
45 # Apple A7 2.39 1.20 1.20
46 # Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46
47 # Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93
48 # Cortex-A72 1.33 0.85/0.88 0.92/0.96
49 # Denver 1.96 0.65/0.86 0.76/0.80
50 # Mongoose 1.33 1.23/1.20 1.30/1.20
51 # Kryo 1.26 0.87/0.94 1.00/1.00
52 # ThunderX2 5.95 1.25 1.30
54 # (*) original 3.64/1.34/1.32 results were for r0p0 revision
55 # and are still same even for updated module;
56 # (**) numbers after slash are for 32-bit code, which is 3x-
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : undef;
# Locate the arm-xlate.pl "perlasm" translator: first next to this
# script, then in the shared ../../perlasm directory.  $dir is the
# directory component of our own path; NOTE(review): when $0 has no
# path separator the match fails and $dir relies on $1 being unset,
# yielding a bare relative filename — confirm that is intended.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
# Pipe everything printed to OUT through the translator ($^X is the
# running perl binary), which renders the unified 32-/64-bit "flavour"
# syntax for the target assembler and writes the result to $output.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
or die "can't call $xlate: $!";
# The raw-byte directive is assembler-specific: the Windows flavour
# (armasm) wants "DCB", the GNU assembler wants ".byte".
if ($flavour =~ /win/) {
    $_byte = "DCB";
} else {
    $_byte = ".byte";
}
80 #if __ARM_MAX_ARCH__>=7
82 $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
83 $code.=<<___ if ($flavour !~ /64/);
84 .arch armv7-a // don't confuse not-so-latest binutils with argv8 :-)
89 # define INST(a,b,c,d) $_byte c,d|0xc,a,b
92 # define INST(a,b,c,d) $_byte a,b,c,d
98 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
99 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
100 # maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
# Register aliases for ${prefix}_set_encrypt_key: integer arguments and
# counters in x/w registers, working NEON vectors in q registers.  The
# 32-bit flavour binds the vectors to q0-q3,q8-q10 instead of q0-q6 —
# presumably to stay clear of the AAPCS callee-saved d8-d15 (q4-q7)
# range; confirm against the 32-bit prologue.
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
112 .long 0x01,0x01,0x01,0x01
113 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
114 .long 0x1b,0x1b,0x1b,0x1b
116 .globl ${prefix}_set_encrypt_key
117 .type ${prefix}_set_encrypt_key,%function
119 ${prefix}_set_encrypt_key:
122 $code.=<<___ if ($flavour =~ /64/);
123 stp x29,x30,[sp,#-16]!
143 veor $zero,$zero,$zero
144 vld1.8 {$in0},[$inp],#16
145 mov $bits,#8 // reuse $bits
146 vld1.32 {$rcon,$mask},[$ptr],#32
154 vtbl.8 $key,{$in0},$mask
155 vext.8 $tmp,$zero,$in0,#12
156 vst1.32 {$in0},[$out],#16
161 vext.8 $tmp,$zero,$tmp,#12
163 vext.8 $tmp,$zero,$tmp,#12
166 vshl.u8 $rcon,$rcon,#1
170 vld1.32 {$rcon},[$ptr]
172 vtbl.8 $key,{$in0},$mask
173 vext.8 $tmp,$zero,$in0,#12
174 vst1.32 {$in0},[$out],#16
178 vext.8 $tmp,$zero,$tmp,#12
180 vext.8 $tmp,$zero,$tmp,#12
183 vshl.u8 $rcon,$rcon,#1
186 vtbl.8 $key,{$in0},$mask
187 vext.8 $tmp,$zero,$in0,#12
188 vst1.32 {$in0},[$out],#16
192 vext.8 $tmp,$zero,$tmp,#12
194 vext.8 $tmp,$zero,$tmp,#12
198 vst1.32 {$in0},[$out]
206 vld1.8 {$in1},[$inp],#8
207 vmov.i8 $key,#8 // borrow $key
208 vst1.32 {$in0},[$out],#16
209 vsub.i8 $mask,$mask,$key // adjust the mask
212 vtbl.8 $key,{$in1},$mask
213 vext.8 $tmp,$zero,$in0,#12
215 vst1.32 {$in1},[$out],#16
218 vst1.32 {$in1},[$out],#8
224 vext.8 $tmp,$zero,$tmp,#12
226 vext.8 $tmp,$zero,$tmp,#12
229 vdup.32 $tmp,${in0}[3]
232 vext.8 $in1,$zero,$in1,#12
233 vshl.u8 $rcon,$rcon,#1
237 vst1.32 {$in0},[$out],#16
249 vst1.32 {$in0},[$out],#16
252 vtbl.8 $key,{$in1},$mask
253 vext.8 $tmp,$zero,$in0,#12
254 vst1.32 {$in1},[$out],#16
259 vext.8 $tmp,$zero,$tmp,#12
261 vext.8 $tmp,$zero,$tmp,#12
264 vshl.u8 $rcon,$rcon,#1
266 vst1.32 {$in0},[$out],#16
269 vdup.32 $key,${in0}[3] // just splat
270 vext.8 $tmp,$zero,$in1,#12
274 vext.8 $tmp,$zero,$tmp,#12
276 vext.8 $tmp,$zero,$tmp,#12
287 mov x0,$ptr // return value
288 `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
290 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
292 .globl ${prefix}_set_decrypt_key
293 .type ${prefix}_set_decrypt_key,%function
295 ${prefix}_set_decrypt_key:
297 $code.=<<___ if ($flavour =~ /64/);
298 .inst 0xd503233f // paciasp
299 stp x29,x30,[sp,#-16]!
302 $code.=<<___ if ($flavour !~ /64/);
311 sub $out,$out,#240 // restore original $out
313 add $inp,$out,x12,lsl#4 // end of key schedule
315 vld1.32 {v0.16b},[$out]
316 vld1.32 {v1.16b},[$inp]
317 vst1.32 {v0.16b},[$inp],x4
318 vst1.32 {v1.16b},[$out],#16
321 vld1.32 {v0.16b},[$out]
322 vld1.32 {v1.16b},[$inp]
325 vst1.32 {v0.16b},[$inp],x4
326 vst1.32 {v1.16b},[$out],#16
330 vld1.32 {v0.16b},[$out]
332 vst1.32 {v0.16b},[$inp]
334 eor x0,x0,x0 // return value
337 $code.=<<___ if ($flavour !~ /64/);
340 $code.=<<___ if ($flavour =~ /64/);
342 .inst 0xd50323bf // autiasp
346 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
# Per-direction setup for the single-block ${prefix}_en/decrypt pair:
# $e/$mc select the AES instruction suffixes (aese/aesmc when
# encrypting, aesd/aesimc when decrypting).  Arguments live in x0-x2;
# note map("q$_",(0..3)) yields four values but only three names bind,
# so q3 is simply discarded.
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
358 .globl ${prefix}_${dir}crypt
359 .type ${prefix}_${dir}crypt,%function
361 ${prefix}_${dir}crypt:
362 ldr $rounds,[$key,#240]
363 vld1.32 {$rndkey0},[$key],#16
364 vld1.8 {$inout},[$inp]
365 sub $rounds,$rounds,#2
366 vld1.32 {$rndkey1},[$key],#16
369 aes$e $inout,$rndkey0
371 vld1.32 {$rndkey0},[$key],#16
372 subs $rounds,$rounds,#2
373 aes$e $inout,$rndkey1
375 vld1.32 {$rndkey1},[$key],#16
378 aes$e $inout,$rndkey0
380 vld1.32 {$rndkey0},[$key]
381 aes$e $inout,$rndkey1
382 veor $inout,$inout,$rndkey0
384 vst1.8 {$inout},[$out]
386 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
393 # Performance in cycles per byte.
394 # Processed with AES-ECB different key size.
395 # It shows the value before and after optimization as below:
398 # AES-128-ECB AES-192-ECB AES-256-ECB
399 # Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10
400 # Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14
402 # Optimization is implemented by loop unrolling and interleaving.
403 # Commonly, we choose the unrolling factor as 5, if the input
404 # data size smaller than 5 blocks, but not smaller than 3 blocks,
405 # choose 3 as the unrolling factor.
406 # If the input data size dsize >= 5*16 bytes, then take 5 blocks
407 # as one iteration, every loop the left size lsize -= 5*16.
408 # If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
409 # every loop lsize -=3*16.
410 # If lsize < 3*16 bytes, treat them as the tail, interleave the
411 # two blocks AES instructions.
412 # There is one special case, if the original input data size dsize
# = 16 bytes, we will treat it separately to improve the
414 # performance: one independent code block without LR, FP load and
415 # store, just looks like what the original ECB implementation does.
# Register aliases for ${prefix}_ecb_encrypt: pointer/length arguments
# in x0-x3, control words in w4-w6 with the key-tail pointer in x7 and
# the load step in x8, and data/temporary vectors in q0-q7 ($rndlast
# holds the final round key).
my ($inp,$out,$len,$key)=map("x$_",(0..3));
my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
# Convenience aliases for the single-block tail path.
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
424 ### q7 last round key
425 ### q10-q15 q7 Last 7 round keys
426 ### q8-q9 preloaded round keys except last 7 keys for big size
427 ### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte
# Extra lanes for the interleaved ECB loops.  The third lane reuses
# q9-q11; the fourth/fifth lanes are declared here but only assigned
# registers in the 64-bit flavour.
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3); # used only in 64-bit mode
my ($dat4,$in4,$tmp4);
434 if ($flavour =~ /64/) {
435 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
439 .globl ${prefix}_ecb_encrypt
440 .type ${prefix}_ecb_encrypt,%function
442 ${prefix}_ecb_encrypt:
444 $code.=<<___ if ($flavour =~ /64/);
446 // Original input data size bigger than 16, jump to big size processing.
448 vld1.8 {$dat0},[$inp]
449 cmp $enc,#0 // en- or decrypting?
450 ldr $rounds,[$key,#240]
451 vld1.32 {q5-q6},[$key],#32 // load key schedule...
456 vld1.32 {q8-q9},[$key],#32 // load key schedule...
459 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-ecb processing
464 vld1.32 {q8},[$key],#16 // load key schedule...
467 vld1.32 {q9},[$key],#16 // load key schedule...
468 subs $rounds,$rounds,#2 // bias
469 b.gt .Lecb_round_loop
471 vld1.32 {q10-q11},[$key],#32 // load key schedule...
476 vld1.32 {q12-q13},[$key],#32 // load key schedule...
481 vld1.32 {q14-q15},[$key],#32 // load key schedule...
486 vld1.32 {$rndlast},[$key]
490 veor $dat0,$dat0,$rndlast
491 vst1.8 {$dat0},[$out]
496 vld1.32 {q8-q9},[$key],#32 // load key schedule...
499 subs $rounds,$rounds,#10 // bias
501 .Lecb_dec_round_loop:
504 vld1.32 {q8},[$key],#16 // load key schedule...
507 vld1.32 {q9},[$key],#16 // load key schedule...
508 subs $rounds,$rounds,#2 // bias
509 b.gt .Lecb_dec_round_loop
511 vld1.32 {q10-q11},[$key],#32 // load key schedule...
516 vld1.32 {q12-q13},[$key],#32 // load key schedule...
521 vld1.32 {q14-q15},[$key],#32 // load key schedule...
526 vld1.32 {$rndlast},[$key]
530 veor $dat0,$dat0,$rndlast
531 vst1.8 {$dat0},[$out]
535 $code.=<<___ if ($flavour =~ /64/);
536 stp x29,x30,[sp,#-16]!
539 $code.=<<___ if ($flavour !~ /64/);
542 vstmdb sp!,{d8-d15} @ ABI specification says so
543 ldmia ip,{r4-r5} @ load remaining args
551 cmp $enc,#0 // en- or decrypting?
552 ldr $rounds,[$key,#240]
554 vld1.8 {$dat},[$inp],$step
556 vld1.32 {q8-q9},[$key] // load key schedule...
557 sub $rounds,$rounds,#6
558 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
559 sub $rounds,$rounds,#2
560 vld1.32 {q10-q11},[$key_],#32
561 vld1.32 {q12-q13},[$key_],#32
562 vld1.32 {q14-q15},[$key_],#32
563 vld1.32 {$rndlast},[$key_]
569 vld1.8 {$dat1},[$inp],#16
570 subs $len,$len,#32 // bias
572 vorr $in1,$dat1,$dat1
573 vorr $dat2,$dat1,$dat1
578 vld1.8 {$dat2},[$inp],#16
580 $code.=<<___ if ($flavour =~ /64/);
584 vld1.8 {$dat3},[$inp],#16
585 vld1.8 {$dat4},[$inp],#16
586 sub $len,$len,#32 // bias
600 vld1.32 {q8},[$key_],#16
612 vld1.32 {q9},[$key_],#16
625 cmp $len,#0x40 // because .Lecb_enc_tail4x
638 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
651 add $inp,$inp,x6 // $inp is adjusted in such way that
652 // at exit from the loop $dat1-$dat4
653 // are loaded with last "words"
654 add x6,$len,#0x60 // because .Lecb_enc_tail4x
701 vld1.8 {$in0},[$inp],#16
703 vld1.8 {$in1},[$inp],#16
705 vld1.8 {$in2},[$inp],#16
707 vld1.8 {$in3},[$inp],#16
709 vld1.8 {$in4},[$inp],#16
710 cbz x6,.Lecb_enc_tail4x
711 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
712 veor $tmp0,$rndlast,$dat0
714 veor $tmp1,$rndlast,$dat1
716 veor $tmp2,$rndlast,$dat2
718 veor $tmp3,$rndlast,$dat3
720 veor $tmp4,$rndlast,$dat4
721 vst1.8 {$tmp0},[$out],#16
723 vst1.8 {$tmp1},[$out],#16
725 vst1.8 {$tmp2},[$out],#16
726 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
727 vst1.8 {$tmp3},[$out],#16
728 vst1.8 {$tmp4},[$out],#16
745 veor $tmp1,$rndlast,$dat1
746 veor $tmp2,$rndlast,$dat2
747 veor $tmp3,$rndlast,$dat3
748 veor $tmp4,$rndlast,$dat4
749 vst1.8 {$tmp1},[$out],#16
750 vst1.8 {$tmp2},[$out],#16
751 vst1.8 {$tmp3},[$out],#16
752 vst1.8 {$tmp4},[$out],#16
765 vld1.32 {q8},[$key_],#16
773 vld1.32 {q9},[$key_],#16
783 mov.lo x6,$len // x6, $cnt, is zero at this point
790 add $inp,$inp,x6 // $inp is adjusted in such way that
791 // at exit from the loop $dat1-$dat2
792 // are loaded with last "words"
800 vld1.8 {$in0},[$inp],#16
807 vld1.8 {$in1},[$inp],#16
814 vld1.8 {$in2},[$inp],#16
818 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
820 veor $tmp0,$rndlast,$dat0
821 veor $tmp1,$rndlast,$dat1
822 veor $dat2,$dat2,$rndlast
823 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
824 vst1.8 {$tmp0},[$out],#16
826 vst1.8 {$tmp1},[$out],#16
828 vst1.8 {$dat2},[$out],#16
841 vld1.32 {q8},[$key_],#16
847 vld1.32 {q9},[$key_],#16
874 veor $tmp1,$rndlast,$dat1
875 veor $tmp2,$rndlast,$dat2
876 vst1.8 {$tmp1},[$out],#16
877 vst1.8 {$tmp2},[$out],#16
881 veor $tmp1,$rndlast,$dat2
882 vst1.8 {$tmp1},[$out],#16
889 vld1.8 {$dat1},[$inp],#16
890 subs $len,$len,#32 // bias
892 vorr $in1,$dat1,$dat1
893 vorr $dat2,$dat1,$dat1
898 vld1.8 {$dat2},[$inp],#16
900 $code.=<<___ if ($flavour =~ /64/);
904 vld1.8 {$dat3},[$inp],#16
905 vld1.8 {$dat4},[$inp],#16
906 sub $len,$len,#32 // bias
920 vld1.32 {q8},[$key_],#16
932 vld1.32 {q9},[$key_],#16
945 cmp $len,#0x40 // because .Lecb_tail4x
958 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
971 add $inp,$inp,x6 // $inp is adjusted in such way that
972 // at exit from the loop $dat1-$dat4
973 // are loaded with last "words"
974 add x6,$len,#0x60 // because .Lecb_tail4x
1021 vld1.8 {$in0},[$inp],#16
1023 vld1.8 {$in1},[$inp],#16
1025 vld1.8 {$in2},[$inp],#16
1027 vld1.8 {$in3},[$inp],#16
1029 vld1.8 {$in4},[$inp],#16
1031 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1032 veor $tmp0,$rndlast,$dat0
1033 vorr $dat0,$in0,$in0
1034 veor $tmp1,$rndlast,$dat1
1035 vorr $dat1,$in1,$in1
1036 veor $tmp2,$rndlast,$dat2
1037 vorr $dat2,$in2,$in2
1038 veor $tmp3,$rndlast,$dat3
1039 vorr $dat3,$in3,$in3
1040 veor $tmp4,$rndlast,$dat4
1041 vst1.8 {$tmp0},[$out],#16
1042 vorr $dat4,$in4,$in4
1043 vst1.8 {$tmp1},[$out],#16
1045 vst1.8 {$tmp2},[$out],#16
1046 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1047 vst1.8 {$tmp3},[$out],#16
1048 vst1.8 {$tmp4},[$out],#16
1049 b.hs .Loop5x_ecb_dec
1055 subs $len,$len,#0x30
1056 vorr $dat0,$in2,$in2
1057 vorr $dat1,$in3,$in3
1058 vorr $dat2,$in4,$in4
1065 veor $tmp1,$rndlast,$dat1
1066 veor $tmp2,$rndlast,$dat2
1067 veor $tmp3,$rndlast,$dat3
1068 veor $tmp4,$rndlast,$dat4
1069 vst1.8 {$tmp1},[$out],#16
1070 vst1.8 {$tmp2},[$out],#16
1071 vst1.8 {$tmp3},[$out],#16
1072 vst1.8 {$tmp4},[$out],#16
1085 vld1.32 {q8},[$key_],#16
1093 vld1.32 {q9},[$key_],#16
1094 b.gt .Loop3x_ecb_dec
1102 subs $len,$len,#0x30
1103 mov.lo x6,$len // x6, $cnt, is zero at this point
1110 add $inp,$inp,x6 // $inp is adjusted in such way that
1111 // at exit from the loop $dat1-$dat2
1112 // are loaded with last "words"
1120 vld1.8 {$in0},[$inp],#16
1127 vld1.8 {$in1},[$inp],#16
1134 vld1.8 {$in2},[$inp],#16
1138 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1140 veor $tmp0,$rndlast,$dat0
1141 veor $tmp1,$rndlast,$dat1
1142 veor $dat2,$dat2,$rndlast
1143 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1144 vst1.8 {$tmp0},[$out],#16
1145 vorr $dat0,$in0,$in0
1146 vst1.8 {$tmp1},[$out],#16
1147 vorr $dat1,$in1,$in1
1148 vst1.8 {$dat2},[$out],#16
1149 vorr $dat2,$in2,$in2
1150 b.hs .Loop3x_ecb_dec
1161 vld1.32 {q8},[$key_],#16
1167 vld1.32 {q9},[$key_],#16
1194 veor $tmp1,$rndlast,$dat1
1195 veor $tmp2,$rndlast,$dat2
1196 vst1.8 {$tmp1},[$out],#16
1197 vst1.8 {$tmp2},[$out],#16
1201 veor $tmp1,$rndlast,$dat2
1202 vst1.8 {$tmp1},[$out],#16
1207 $code.=<<___ if ($flavour !~ /64/);
1209 ldmia sp!,{r4-r8,pc}
1211 $code.=<<___ if ($flavour =~ /64/);
1214 $code.=<<___ if ($flavour =~ /64/);
1219 .size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
# Register aliases for ${prefix}_cbc_encrypt.  Note the deliberate
# aliasing: $rounds shares w5 with $enc, $step1/$key5 both map to x12,
# and $key4 shares register 6 with $cnt — the corresponding live
# ranges must not overlap.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
# Convenience aliases for the single-block tail path.
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
1230 ### q8-q15 preloaded key schedule
1233 .globl ${prefix}_cbc_encrypt
1234 .type ${prefix}_cbc_encrypt,%function
1236 ${prefix}_cbc_encrypt:
1238 $code.=<<___ if ($flavour =~ /64/);
1239 stp x29,x30,[sp,#-16]!
1242 $code.=<<___ if ($flavour !~ /64/);
1244 stmdb sp!,{r4-r8,lr}
1245 vstmdb sp!,{d8-d15} @ ABI specification says so
1246 ldmia ip,{r4-r5} @ load remaining args
1254 cmp $enc,#0 // en- or decrypting?
1255 ldr $rounds,[$key,#240]
1257 vld1.8 {$ivec},[$ivp]
1258 vld1.8 {$dat},[$inp],$step
1260 vld1.32 {q8-q9},[$key] // load key schedule...
1261 sub $rounds,$rounds,#6
1262 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
1263 sub $rounds,$rounds,#2
1264 vld1.32 {q10-q11},[$key_],#32
1265 vld1.32 {q12-q13},[$key_],#32
1266 vld1.32 {q14-q15},[$key_],#32
1267 vld1.32 {$rndlast},[$key_]
1274 veor $dat,$dat,$ivec
1275 veor $rndzero_n_last,q8,$rndlast
1278 vld1.32 {$in0-$in1},[$key_]
1280 add $key4,$key,#16*4
1281 add $key5,$key,#16*5
1284 add $key6,$key,#16*6
1285 add $key7,$key,#16*7
1292 vst1.8 {$ivec},[$out],#16
1298 vld1.32 {q8},[$key4]
1302 vld1.32 {q9},[$key5]
1307 vld1.32 {q8},[$key6]
1310 vld1.32 {q9},[$key7]
1324 vld1.8 {q8},[$inp],$step
1327 veor q8,q8,$rndzero_n_last
1330 vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
1334 veor $ivec,$dat,$rndlast
1337 vst1.8 {$ivec},[$out],#16
1342 vld1.32 {$in0-$in1},[$key_]
1345 b .Lenter_cbc_enc128
1349 vst1.8 {$ivec},[$out],#16
1363 vld1.8 {q8},[$inp],$step
1370 veor q8,q8,$rndzero_n_last
1372 veor $ivec,$dat,$rndlast
1373 b.hs .Loop_cbc_enc128
1375 vst1.8 {$ivec},[$out],#16
# Extra lanes for the interleaved CBC-decrypt loops.  The third lane
# reuses q9-q11; lanes four and five are declared here but only bound
# to registers in the 64-bit flavour.
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3); # used only in 64-bit mode
my ($dat4,$in4,$tmp4);
1383 if ($flavour =~ /64/) {
1384 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
1390 vld1.8 {$dat2},[$inp],#16
1391 subs $len,$len,#32 // bias
1394 vorr $dat1,$dat,$dat
1395 vorr $in2,$dat2,$dat2
1398 vorr $dat1,$dat2,$dat2
1399 vld1.8 {$dat2},[$inp],#16
1401 vorr $in1,$dat1,$dat1
1402 vorr $in2,$dat2,$dat2
1404 $code.=<<___ if ($flavour =~ /64/);
1406 b.lo .Loop3x_cbc_dec
1408 vld1.8 {$dat3},[$inp],#16
1409 vld1.8 {$dat4},[$inp],#16
1410 sub $len,$len,#32 // bias
1412 vorr $in3,$dat3,$dat3
1413 vorr $in4,$dat4,$dat4
1426 vld1.32 {q8},[$key_],#16
1438 vld1.32 {q9},[$key_],#16
1439 b.gt .Loop5x_cbc_dec
1451 cmp $len,#0x40 // because .Lcbc_tail4x
1464 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
1477 add $inp,$inp,x6 // $inp is adjusted in such way that
1478 // at exit from the loop $dat1-$dat4
1479 // are loaded with last "words"
1480 add x6,$len,#0x60 // because .Lcbc_tail4x
1526 veor $tmp0,$ivec,$rndlast
1528 veor $tmp1,$in0,$rndlast
1529 vld1.8 {$in0},[$inp],#16
1531 veor $tmp2,$in1,$rndlast
1532 vld1.8 {$in1},[$inp],#16
1534 veor $tmp3,$in2,$rndlast
1535 vld1.8 {$in2},[$inp],#16
1537 veor $tmp4,$in3,$rndlast
1538 vld1.8 {$in3},[$inp],#16
1540 vorr $ivec,$in4,$in4
1541 vld1.8 {$in4},[$inp],#16
1543 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1544 veor $tmp0,$tmp0,$dat0
1545 vorr $dat0,$in0,$in0
1546 veor $tmp1,$tmp1,$dat1
1547 vorr $dat1,$in1,$in1
1548 veor $tmp2,$tmp2,$dat2
1549 vorr $dat2,$in2,$in2
1550 veor $tmp3,$tmp3,$dat3
1551 vorr $dat3,$in3,$in3
1552 veor $tmp4,$tmp4,$dat4
1553 vst1.8 {$tmp0},[$out],#16
1554 vorr $dat4,$in4,$in4
1555 vst1.8 {$tmp1},[$out],#16
1557 vst1.8 {$tmp2},[$out],#16
1558 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1559 vst1.8 {$tmp3},[$out],#16
1560 vst1.8 {$tmp4},[$out],#16
1561 b.hs .Loop5x_cbc_dec
1567 subs $len,$len,#0x30
1568 vorr $dat0,$in2,$in2
1570 vorr $dat1,$in3,$in3
1572 vorr $dat2,$in4,$in4
1580 veor $tmp1,$tmp0,$dat1
1581 veor $tmp2,$tmp2,$dat2
1582 veor $tmp3,$tmp3,$dat3
1583 veor $tmp4,$tmp4,$dat4
1584 vst1.8 {$tmp1},[$out],#16
1585 vst1.8 {$tmp2},[$out],#16
1586 vst1.8 {$tmp3},[$out],#16
1587 vst1.8 {$tmp4},[$out],#16
1600 vld1.32 {q8},[$key_],#16
1608 vld1.32 {q9},[$key_],#16
1609 b.gt .Loop3x_cbc_dec
1617 veor $tmp0,$ivec,$rndlast
1618 subs $len,$len,#0x30
1619 veor $tmp1,$in0,$rndlast
1620 mov.lo x6,$len // x6, $cnt, is zero at this point
1627 veor $tmp2,$in1,$rndlast
1628 add $inp,$inp,x6 // $inp is adjusted in such way that
1629 // at exit from the loop $dat1-$dat2
1630 // are loaded with last "words"
1631 vorr $ivec,$in2,$in2
1639 vld1.8 {$in0},[$inp],#16
1646 vld1.8 {$in1},[$inp],#16
1653 vld1.8 {$in2},[$inp],#16
1657 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1659 veor $tmp0,$tmp0,$dat0
1660 veor $tmp1,$tmp1,$dat1
1661 veor $dat2,$dat2,$tmp2
1662 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1663 vst1.8 {$tmp0},[$out],#16
1664 vorr $dat0,$in0,$in0
1665 vst1.8 {$tmp1},[$out],#16
1666 vorr $dat1,$in1,$in1
1667 vst1.8 {$dat2},[$out],#16
1668 vorr $dat2,$in2,$in2
1669 b.hs .Loop3x_cbc_dec
1680 vld1.32 {q8},[$key_],#16
1686 vld1.32 {q9},[$key_],#16
1706 veor $tmp1,$ivec,$rndlast
1711 veor $tmp2,$in1,$rndlast
1715 veor $tmp1,$tmp1,$dat1
1716 veor $tmp2,$tmp2,$dat2
1717 vorr $ivec,$in2,$in2
1718 vst1.8 {$tmp1},[$out],#16
1719 vst1.8 {$tmp2},[$out],#16
1723 veor $tmp1,$tmp1,$dat2
1724 vorr $ivec,$in2,$in2
1725 vst1.8 {$tmp1},[$out],#16
1728 vst1.8 {$ivec},[$ivp]
1732 $code.=<<___ if ($flavour !~ /64/);
1734 ldmia sp!,{r4-r8,pc}
1736 $code.=<<___ if ($flavour =~ /64/);
1741 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
# Register aliases for ${prefix}_ctr32_encrypt_blocks: arguments in
# x0-x4, counter words in w8-w10/w12.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12"; # aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

# used only in 64-bit mode...
# (map yields q16..q23 but only the first four values bind here)
my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));

my ($dat,$tmp)=($dat0,$tmp0);
1758 ### q8-q15 preloaded key schedule
1761 .globl ${prefix}_ctr32_encrypt_blocks
1762 .type ${prefix}_ctr32_encrypt_blocks,%function
1764 ${prefix}_ctr32_encrypt_blocks:
1766 $code.=<<___ if ($flavour =~ /64/);
1767 stp x29,x30,[sp,#-16]!
1770 $code.=<<___ if ($flavour !~ /64/);
1772 stmdb sp!,{r4-r10,lr}
1773 vstmdb sp!,{d8-d15} @ ABI specification says so
1774 ldr r4, [ip] @ load remaining arg
1777 ldr $rounds,[$key,#240]
1779 ldr $ctr, [$ivp, #12]
1781 vld1.8 {$dat0},[$ivp]
1783 vld1.32 {$dat0},[$ivp]
1785 vld1.32 {q8-q9},[$key] // load key schedule...
1786 sub $rounds,$rounds,#4
1789 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
1790 sub $rounds,$rounds,#2
1791 vld1.32 {q12-q13},[$key_],#32
1792 vld1.32 {q14-q15},[$key_],#32
1793 vld1.32 {$rndlast},[$key_]
1800 vorr $dat1,$dat0,$dat0
1801 add $tctr1, $ctr, #1
1802 vorr $dat2,$dat0,$dat0
1804 vorr $ivec,$dat0,$dat0
1806 vmov.32 ${dat1}[3],$tctr1
1809 sub $len,$len,#3 // bias
1810 vmov.32 ${dat2}[3],$tctr2
1812 $code.=<<___ if ($flavour =~ /64/);
1818 vorr $dat3,$dat0,$dat0
1820 vorr $dat4,$dat0,$dat0
1822 vmov.32 ${dat3}[3],w13
1823 sub $len,$len,#2 // bias
1824 vmov.32 ${dat4}[3],w14
1840 vld1.32 {q8},[$key_],#16
1852 vld1.32 {q9},[$key_],#16
1866 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1878 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1914 vld1.8 {$in0},[$inp],#16
1917 vld1.8 {$in1},[$inp],#16
1920 vld1.8 {$in2},[$inp],#16
1923 vld1.8 {$in3},[$inp],#16
1926 vld1.8 {$in4},[$inp],#16
1929 veor $in0,$in0,$rndlast
1931 veor $in1,$in1,$rndlast
1933 veor $in2,$in2,$rndlast
1935 veor $in3,$in3,$rndlast
1937 veor $in4,$in4,$rndlast
1939 veor $in0,$in0,$dat0
1940 vorr $dat0,$ivec,$ivec
1941 veor $in1,$in1,$dat1
1942 vorr $dat1,$ivec,$ivec
1943 veor $in2,$in2,$dat2
1944 vorr $dat2,$ivec,$ivec
1945 veor $in3,$in3,$dat3
1946 vorr $dat3,$ivec,$ivec
1947 veor $in4,$in4,$dat4
1948 vorr $dat4,$ivec,$ivec
1950 vst1.8 {$in0},[$out],#16
1951 vmov.32 ${dat0}[3],$tctr0
1952 vst1.8 {$in1},[$out],#16
1953 vmov.32 ${dat1}[3],$tctr1
1954 vst1.8 {$in2},[$out],#16
1955 vmov.32 ${dat2}[3],$tctr2
1956 vst1.8 {$in3},[$out],#16
1957 vmov.32 ${dat3}[3],w13
1958 vst1.8 {$in4},[$out],#16
1959 vmov.32 ${dat4}[3],w14
1962 cbz $len,.Lctr32_done
1976 sub $len,$len,#3 // bias
1990 vld1.32 {q8},[$key_],#16
1998 vld1.32 {q9},[$key_],#16
2005 vld1.8 {$in0},[$inp],#16
2006 vorr $dat0,$ivec,$ivec
2009 vld1.8 {$in1},[$inp],#16
2010 vorr $dat1,$ivec,$ivec
2015 vld1.8 {$in2},[$inp],#16
2019 vorr $dat2,$ivec,$ivec
2025 veor $in0,$in0,$rndlast
2029 veor $in1,$in1,$rndlast
2035 veor $in2,$in2,$rndlast
2039 vmov.32 ${dat0}[3], $tctr0
2045 vmov.32 ${dat1}[3], $tctr1
2049 vmov.32 ${dat2}[3], $tctr2
2055 veor $in0,$in0,$tmp0
2056 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2057 vst1.8 {$in0},[$out],#16
2058 veor $in1,$in1,$tmp1
2060 vst1.8 {$in1},[$out],#16
2061 veor $in2,$in2,$tmp2
2062 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2063 vst1.8 {$in2},[$out],#16
2077 vld1.32 {q8},[$key_],#16
2083 vld1.32 {q9},[$key_],#16
2094 vld1.8 {$in0},[$inp],$step
2099 vld1.8 {$in1},[$inp]
2104 veor $in0,$in0,$rndlast
2109 veor $in1,$in1,$rndlast
2114 veor $in0,$in0,$dat0
2115 veor $in1,$in1,$dat1
2116 vst1.8 {$in0},[$out],#16
2118 vst1.8 {$in1},[$out]
2122 $code.=<<___ if ($flavour !~ /64/);
2124 ldmia sp!,{r4-r10,pc}
2126 $code.=<<___ if ($flavour =~ /64/);
2131 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
2137 ########################################
2138 if ($flavour =~ /64/) { ######## 64-bit code
2140 "aesd" => 0x4e285800, "aese" => 0x4e284800,
2141 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
2143 local *unaes = sub {
2144 my ($mnemonic,$arg)=@_;
2146 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
2147 sprintf ".inst\t0x%08x\t//%s %s",
2148 $opcode{$mnemonic}|$1|($2<<5),
2152 foreach(split("\n",$code)) {
2153 s/\`([^\`]*)\`/eval($1)/geo;
2155 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
2156 s/@\s/\/\//o; # old->new style commentary
2158 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
2159 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
2160 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
2161 s/vmov\.i8/movi/o or # fix up legacy mnemonics
2163 s/vrev32\.8/rev32/o or
2164 s/vtst\.8/cmtst/o or
2166 s/^(\s+)v/$1/o or # strip off v prefix
2167 s/\bbx\s+lr\b/ret/o;
2169 # fix up remaining legacy suffixes
2171 m/\],#8/o and s/\.16b/\.8b/go;
2172 s/\.[ui]?32//o and s/\.16b/\.4s/go;
2173 s/\.[ui]?64//o and s/\.16b/\.2d/go;
2174 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
2178 } else { ######## 32-bit code
2180 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
2181 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
2183 local *unaes = sub {
2184 my ($mnemonic,$arg)=@_;
2186 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
2187 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
2188 |(($2&7)<<1) |(($2&8)<<2);
2189 # since ARMv7 instructions are always encoded little-endian.
2190 # correct solution is to use .inst directive, but older
2191 # assemblers don't implement it:-(
2192 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
2193 $word&0xff,($word>>8)&0xff,
2194 ($word>>16)&0xff,($word>>24)&0xff,
2202 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
2203 sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
2204 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
2210 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
2211 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
2217 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
2218 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
2221 foreach(split("\n",$code)) {
2222 s/\`([^\`]*)\`/eval($1)/geo;
2224 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
2225 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
2226 s/\/\/\s?/@ /o; # new->old style commentary
2228 # fix up remaining new-style suffixes
2229 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
2232 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
2233 s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
2234 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
2235 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
2236 s/vmov\.32\s+(.*)/unvmov32($1)/geo or
2237 s/^(\s+)b\./$1b/o or
2238 s/^(\s+)ret/$1bx\tlr/o;
2240 if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
# Flush and close STDOUT, propagating any failure — NOTE(review):
# STDOUT is presumably redirected to the arm-xlate.pl pipe earlier in
# the file (not visible in this chunk), so a failure here means the
# translator or the final write failed.
close STDOUT or die "error closing STDOUT: $!";