3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # This module implements support for ARMv8 AES instructions. The
11 # module is endian-agnostic in sense that it supports both big- and
# little-endian cases. It also supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of utilized
14 # registers to 16, which implies additional instructions. This has
15 # no effect on mighty Apple A7, as results are literally equal to
16 # the theoretical estimates based on instruction latencies and issue
# rate. It remains to be seen how it affects other platforms...
19 # Performance in cycles per byte processed with 128-bit key:
# Emit the flavour-specific assembler preamble: 64-bit builds enable the
# ARMv8 Crypto Extensions via ".arch"; 32-bit builds enable NEON and
# select A32 (ARM) instruction state with ".fpu"/".code 32".
$code.=".arch	armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=".fpu	neon\n.code	32\n" if ($flavour !~ /64/);
31 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
32 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
33 # maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
# Register aliases for the set_encrypt_key routine.  Scalar arguments
# arrive in x0..x3 per the AAPCS64; the round counter lives in w12.
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
# Vector temporaries: q0..q6 in 64-bit mode.  The 32-bit mapping skips
# q4..q7 and uses q0..q3,q8..q10 instead -- presumably to stay clear of
# the callee-saved d8-d15 range (AAPCS32); confirm against the 32-bit
# prologue, which saves d8-d15 explicitly.
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
45 .long 0x01,0x01,0x01,0x01
46 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
47 .long 0x1b,0x1b,0x1b,0x1b
49 .globl ${prefix}_set_encrypt_key
50 .type ${prefix}_set_encrypt_key,%function
52 ${prefix}_set_encrypt_key:
55 $code.=<<___ if ($flavour =~ /64/);
56 stp x29,x30,[sp,#-16]!
63 veor $zero,$zero,$zero
64 vld1.8 {$in0},[$inp],#16
65 mov $bits,#8 // reuse $bits
66 vld1.32 {$rcon,$mask},[$ptr],#32
74 vtbl.8 $key,{$in0},$mask
75 vext.8 $tmp,$zero,$in0,#12
76 vst1.32 {$in0},[$out],#16
81 vext.8 $tmp,$zero,$tmp,#12
83 vext.8 $tmp,$zero,$tmp,#12
86 vshl.u8 $rcon,$rcon,#1
90 vld1.32 {$rcon},[$ptr]
92 vtbl.8 $key,{$in0},$mask
93 vext.8 $tmp,$zero,$in0,#12
94 vst1.32 {$in0},[$out],#16
98 vext.8 $tmp,$zero,$tmp,#12
100 vext.8 $tmp,$zero,$tmp,#12
103 vshl.u8 $rcon,$rcon,#1
106 vtbl.8 $key,{$in0},$mask
107 vext.8 $tmp,$zero,$in0,#12
108 vst1.32 {$in0},[$out],#16
112 vext.8 $tmp,$zero,$tmp,#12
114 vext.8 $tmp,$zero,$tmp,#12
118 vst1.32 {$in0},[$out]
126 vld1.8 {$in1},[$inp],#8
127 vmov.i8 $key,#8 // borrow $key
128 vst1.32 {$in0},[$out],#16
129 vsub.i8 $mask,$mask,$key // adjust the mask
132 vtbl.8 $key,{$in1},$mask
133 vext.8 $tmp,$zero,$in0,#12
134 vst1.32 {$in1},[$out],#8
139 vext.8 $tmp,$zero,$tmp,#12
141 vext.8 $tmp,$zero,$tmp,#12
144 vdup.32 $tmp,${in0}[3]
147 vext.8 $in1,$zero,$in1,#12
148 vshl.u8 $rcon,$rcon,#1
152 vst1.32 {$in0},[$out],#16
164 vst1.32 {$in0},[$out],#16
167 vtbl.8 $key,{$in1},$mask
168 vext.8 $tmp,$zero,$in0,#12
169 vst1.32 {$in1},[$out],#16
174 vext.8 $tmp,$zero,$tmp,#12
176 vext.8 $tmp,$zero,$tmp,#12
179 vshl.u8 $rcon,$rcon,#1
181 vst1.32 {$in0},[$out],#16
184 vdup.32 $key,${in0}[3] // just splat
185 vext.8 $tmp,$zero,$in1,#12
189 vext.8 $tmp,$zero,$tmp,#12
191 vext.8 $tmp,$zero,$tmp,#12
200 eor x0,x0,x0 // return value
201 `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
203 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
205 .globl ${prefix}_set_decrypt_key
206 .type ${prefix}_set_decrypt_key,%function
208 ${prefix}_set_decrypt_key:
210 $code.=<<___ if ($flavour =~ /64/);
211 stp x29,x30,[sp,#-16]!
214 $code.=<<___ if ($flavour !~ /64/);
220 sub $out,$out,#240 // restore original $out
222 add $inp,$out,x12,lsl#4 // end of key schedule
224 vld1.32 {v0.16b},[$out]
225 vld1.32 {v1.16b},[$inp]
226 vst1.32 {v0.16b},[$inp],x4
227 vst1.32 {v1.16b},[$out],#16
230 vld1.32 {v0.16b},[$out]
231 vld1.32 {v1.16b},[$inp]
234 vst1.32 {v0.16b},[$inp],x4
235 vst1.32 {v1.16b},[$out],#16
239 vld1.32 {v0.16b},[$out]
241 vst1.32 {v0.16b},[$inp]
243 eor x0,x0,x0 // return value
245 $code.=<<___ if ($flavour !~ /64/);
248 $code.=<<___ if ($flavour =~ /64/);
253 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
# Per-direction mnemonic suffixes for the single-block routine:
# encryption uses aese/aesmc, decryption uses aesd/aesimc.
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
# Arguments of ${prefix}_${dir}crypt: input, output, key schedule.
my ($inp,$out,$key)=map("x$_",(0..2));
# Round-key and data registers; the fourth list element (q3) is
# deliberately left unassigned.
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
265 .globl ${prefix}_${dir}crypt
266 .type ${prefix}_${dir}crypt,%function
268 ${prefix}_${dir}crypt:
269 ldr $rounds,[$key,#240]
270 vld1.32 {$rndkey0},[$key],#16
271 vld1.8 {$inout},[$inp]
272 sub $rounds,$rounds,#2
273 vld1.32 {$rndkey1},[$key],#16
276 aes$e $inout,$rndkey0
277 vld1.32 {$rndkey0},[$key],#16
279 subs $rounds,$rounds,#2
280 aes$e $inout,$rndkey1
281 vld1.32 {$rndkey1},[$key],#16
285 aes$e $inout,$rndkey0
286 vld1.32 {$rndkey0},[$key]
288 aes$e $inout,$rndkey1
289 veor $inout,$inout,$rndkey0
291 vst1.8 {$inout},[$out]
293 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
# ${prefix}_cbc_encrypt arguments (x0..x4) plus the en/decrypt flag.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
# $rounds shares w5 with $enc: the direction flag is consumed before
# the round count is loaded into the same register.
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
# Data and scratch vector registers q0..q7 (q8-q15 are reserved for the
# preloaded key schedule).
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
# Single-block aliases reusing the even-indexed temporaries.
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
306 ### q8-q15 preloaded key schedule
309 .globl ${prefix}_cbc_encrypt
310 .type ${prefix}_cbc_encrypt,%function
312 ${prefix}_cbc_encrypt:
314 $code.=<<___ if ($flavour =~ /64/);
315 stp x29,x30,[sp,#-16]!
318 $code.=<<___ if ($flavour !~ /64/);
321 vstmdb sp!,{d8-d15} @ ABI specification says so
322 ldmia ip,{r4-r5} @ load remaining args
330 cmp $enc,#0 // en- or decrypting?
331 ldr $rounds,[$key,#240]
333 vld1.8 {$ivec},[$ivp]
334 vld1.8 {$dat},[$inp],$step
336 vld1.32 {q8-q9},[$key] // load key schedule...
337 sub $rounds,$rounds,#6
338 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
339 sub $rounds,$rounds,#2
340 vld1.32 {q10-q11},[$key_],#32
341 vld1.32 {q12-q13},[$key_],#32
342 vld1.32 {q14-q15},[$key_],#32
343 vld1.32 {$rndlast},[$key_]
351 veor $rndzero_n_last,q8,$rndlast
356 vld1.32 {q8},[$key_],#16
360 vld1.32 {q9},[$key_],#16
375 vld1.8 {q8},[$inp],$step
378 veor q8,q8,$rndzero_n_last
381 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
387 veor $ivec,$dat,$rndlast
388 vst1.8 {$ivec},[$out],#16
395 vld1.32 {$in0-$in1},[$key_]
402 vst1.8 {$ivec},[$out],#16
416 vld1.8 {q8},[$inp],$step
423 veor q8,q8,$rndzero_n_last
425 veor $ivec,$dat,$rndlast
426 b.hs .Loop_cbc_enc128
428 vst1.8 {$ivec},[$out],#16
433 vld1.32 {$tmp0-$tmp1},[$key_]
434 veor $ivec,$ivec,$rndlast
435 veor $in0,$dat0,$rndlast
481 veor $ivec,$ivec,$dat0
483 vld1.8 {$dat0},[$inp],$step
484 vld1.8 {$dat1},[$inp],$step1
485 vst1.8 {$ivec},[$out],#16
486 veor $ivec,$in1,$rndlast
487 vst1.8 {$in0},[$out],#16
488 veor $in0,$dat0,$rndlast
489 vorr $in1,$dat1,$dat1
490 b.hs .Loop2x_cbc_dec128
493 veor $ivec,$ivec,$rndlast
495 veor $in0,$in0,$rndlast
506 vld1.8 {$dat1},[$inp],$step
507 vorr $in1,$dat1,$dat1
513 vld1.32 {q8},[$key_],#16
519 vld1.32 {q9},[$key_],#16
528 veor $tmp0,$ivec,$rndlast
529 veor $tmp1,$in0,$rndlast
545 vld1.8 {$in0},[$inp],$step
552 vld1.8 {$in1},[$inp],$step
557 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
562 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
567 veor $tmp0,$tmp0,$dat0
568 veor $tmp1,$tmp1,$dat1
570 vst1.8 {$tmp0},[$out],#32
572 vst1.8 {$tmp1},[$out],#32
580 vld1.32 {q8},[$key_],#16
584 vld1.32 {q9},[$key_],#16
592 veor $tmp,$ivec,$rndlast
607 vst1.8 {$tmp},[$out],#16
610 vst1.8 {$ivec},[$ivp]
613 $code.=<<___ if ($flavour !~ /64/);
617 $code.=<<___ if ($flavour =~ /64/);
622 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
625 ########################################
626 if ($flavour =~ /64/) { ######## 64-bit code
628 "aesd" => 0x4e285800, "aese" => 0x4e284800,
629 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
632 my ($mnemonic,$arg)=@_;
634 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
635 sprintf ".long\t0x%08x\t//%s %s",
636 $opcode{$mnemonic}|$1|($2<<5),
640 foreach(split("\n",$code)) {
641 s/\`([^\`]*)\`/eval($1)/geo;
643 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
644 s/@\s/\/\//o; # old->new style commentary
646 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
647 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
648 s/vmov\.i8/movi/o or # fix up legacy mnemonics
650 s/vrev32\.8/rev32/o or
653 s/^(\s+)v/$1/o or # strip off v prefix
# fix up remaining legacy suffixes
658 m/\],#8/o and s/\.16b/\.8b/go;
659 s/\.[ui]?32//o and s/\.16b/\.4s/go;
660 s/\.[ui]?64//o and s/\.16b/\.2d/go;
661 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
665 } else { ######## 32-bit code
667 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
668 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
671 my ($mnemonic,$arg)=@_;
673 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
674 sprintf ".long\t0x%08x\t@ %s %s",
675 $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
676 |(($2&7)<<1) |(($2&8)<<2),
683 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
684 sprintf "vtbl.8 d%d,{q%d},d%d\n\tvtbl.8 d%d,{q%d},d%d",2*$1,$2,2*$3,2*$1+1,$2,2*$3+1;
690 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
691 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+$3>>1,$3&1;
694 foreach(split("\n",$code)) {
695 s/\`([^\`]*)\`/eval($1)/geo;
697 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
698 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
699 s/\/\/\s?/@ /o; # new->old style commentary
# fix up remaining new-style suffixes
704 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
705 s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
706 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
707 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
709 s/^(\s+)ret/$1bx\tlr/o;