#
# CBC enc CBC dec
# Apple A7 2.39 1.20
+# Cortex-A5x n/a n/a
$flavour = shift;	# target flavour; later matched against /64/ to choose 64-bit vs. other code (presumably taken from @ARGV at file scope -- confirm)
$prefix="AES";	# prepended to every emitted symbol name, e.g. ${prefix}_cbc_encrypt
aesd $dat1,q15
veor $ivec,$ivec,$dat0
- veor $in0,$in0,$dat1
vld1.8 {$dat0},[$inp],$step
+ veor $in0,$in0,$dat1
vld1.8 {$dat1},[$inp],$step1
vst1.8 {$ivec},[$out],#16
veor $ivec,$in1,$rndlast
.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
+{{{ # lexical scope for the generator of ${prefix}_ctr32_encrypt_blocks
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); # x0..x4; the non-64 prologue below loads r4 from the caller's stack ("remaining arg")
+my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10"); # general-purpose scratch: round count, loop counter, key pointer, counter word and its byte-reversed copies
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); # q0..q7
+# $dat0 is loaded from [$ivp]; $dat1 and $ivec start as copies of it; $in0/$in1 are loaded from $inp and stored to $out.
+my ($dat,$tmp)=($dat0,$tmp0); # aliases; $dat is used by .Lctr32_tail, $tmp is not referenced in this block
+# Emitted code layout: .Loop2x_ctr32 (two blocks per iteration), .Loop2x_ctr32_128 (taken when the adjusted $rounds equals 2), .Lctr32_tail (one block).
+### q8-q15 preloaded key schedule
+
+$code.=<<___; # symbol directives and entry label
+.globl ${prefix}_ctr32_encrypt_blocks
+.type ${prefix}_ctr32_encrypt_blocks,%function
+.align 5
+${prefix}_ctr32_encrypt_blocks:
+___
+$code.=<<___ if ($flavour =~ /64/); # prologue emitted only when $flavour matches /64/: pushes x29/x30 and copies sp into x29
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___ if ($flavour !~ /64/); # prologue for every other flavour: saves r4-r10,lr and d8-d15, then loads r4 from the original sp
+ mov ip,sp
+ stmdb sp!,{r4-r10,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ ldr r4, [ip] @ load remaining arg
+___
+$code.=<<___; # main body, emitted unconditionally for both flavours
+ ldr $rounds,[$key,#240]
+
+ ldr $ctr, [$ivp, #12]
+ vld1.32 {$dat0},[$ivp]
+
+ vld1.32 {q8-q9},[$key] // load key schedule...
+ sub $rounds,$rounds,#6
+ add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
+ sub $rounds,$rounds,#2
+ vld1.32 {q10-q11},[$key_],#32
+ vld1.32 {q12-q13},[$key_],#32
+ vld1.32 {q14-q15},[$key_],#32
+ vld1.32 {$rndlast},[$key_]
+
+ add $key_,$key,#32
+ mov $cnt,$rounds
+
+ subs $len,$len,#2
+ b.lo .Lctr32_tail
+
+#ifndef BIG_ENDIAN
+ rev $ctr, $ctr
+#endif
+ vorr $dat1,$dat0,$dat0
+ add $ctr, $ctr, #1
+ vorr $ivec,$dat0,$dat0
+ rev $tctr1, $ctr
+ cmp $rounds,#2
+ vmov.32 ${dat1}[3],$tctr1
+ b.eq .Lctr32_128
+
+.Loop2x_ctr32:
+ aese $dat0,q8
+ aese $dat1,q8
+ vld1.32 {q8},[$key_],#16
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ subs $cnt,$cnt,#2
+ aese $dat0,q9
+ aese $dat1,q9
+ vld1.32 {q9},[$key_],#16
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ b.gt .Loop2x_ctr32
+
+ aese $dat0,q8
+ aese $dat1,q8
+ aesmc $tmp0,$dat0
+ vorr $dat0,$ivec,$ivec
+ aesmc $tmp1,$dat1
+ vorr $dat1,$ivec,$ivec
+ aese $tmp0,q9
+ aese $tmp1,q9
+ vld1.8 {$in0},[$inp],#16
+ aesmc $tmp0,$tmp0
+ vld1.8 {$in1},[$inp],#16
+ aesmc $tmp1,$tmp1
+ add $ctr,$ctr,#1
+ aese $tmp0,q10
+ aese $tmp1,q10
+ rev $tctr,$ctr
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ add $ctr,$ctr,#1
+ aese $tmp0,q11
+ aese $tmp1,q11
+ veor $in0,$in0,$rndlast
+ rev $tctr1,$ctr
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ veor $in1,$in1,$rndlast
+ mov $key_,$key
+ aese $tmp0,q12
+ aese $tmp1,q12
+ subs $len,$len,#2
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ aese $tmp0,q13
+ aese $tmp1,q13
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ aese $tmp0,q14
+ aese $tmp1,q14
+ vmov.32 ${dat0}[3], $tctr
+ aesmc $tmp0,$tmp0
+ vmov.32 ${dat1}[3], $tctr1
+ aesmc $tmp1,$tmp1
+ aese $tmp0,q15
+ aese $tmp1,q15
+
+ mov $cnt,$rounds
+ veor $in0,$in0,$tmp0
+ veor $in1,$in1,$tmp1
+ vst1.8 {$in0},[$out],#16
+ vst1.8 {$in1},[$out],#16
+ b.hs .Loop2x_ctr32
+
+ adds $len,$len,#2
+ b.eq .Lctr32_done
+ b .Lctr32_tail
+
+.Lctr32_128:
+ vld1.32 {$tmp0-$tmp1},[$key_]
+
+.Loop2x_ctr32_128:
+ aese $dat0,q8
+ aese $dat1,q8
+ aesmc $dat0,$dat0
+ vld1.8 {$in0},[$inp],#16
+ aesmc $dat1,$dat1
+ vld1.8 {$in1},[$inp],#16
+ aese $dat0,q9
+ aese $dat1,q9
+ add $ctr,$ctr,#1
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ rev $tctr,$ctr
+ aese $dat0,$tmp0
+ aese $dat1,$tmp0
+ add $ctr,$ctr,#1
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ rev $tctr1,$ctr
+ aese $dat0,$tmp1
+ aese $dat1,$tmp1
+ subs $len,$len,#2
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q10
+ aese $dat1,q10
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q11
+ aese $dat1,q11
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q12
+ aese $dat1,q12
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q13
+ aese $dat1,q13
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q14
+ aese $dat1,q14
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ veor $in0,$in0,$rndlast
+ aese $dat0,q15
+ veor $in1,$in1,$rndlast
+ aese $dat1,q15
+
+ veor $in0,$in0,$dat0
+ vorr $dat0,$ivec,$ivec
+ veor $in1,$in1,$dat1
+ vorr $dat1,$ivec,$ivec
+ vst1.8 {$in0},[$out],#16
+ vmov.32 ${dat0}[3], $tctr
+ vst1.8 {$in1},[$out],#16
+ vmov.32 ${dat1}[3], $tctr1
+ b.hs .Loop2x_ctr32_128
+
+ adds $len,$len,#2
+ b.eq .Lctr32_done
+
+.Lctr32_tail:
+ aese $dat,q8
+ vld1.32 {q8},[$key_],#16
+ aesmc $dat,$dat
+ subs $cnt,$cnt,#2
+ aese $dat,q9
+ vld1.32 {q9},[$key_],#16
+ aesmc $dat,$dat
+ b.gt .Lctr32_tail
+
+ aese $dat,q8
+ aesmc $dat,$dat
+ aese $dat,q9
+ aesmc $dat,$dat
+ vld1.8 {$in0},[$inp]
+ aese $dat,q10
+ aesmc $dat,$dat
+ aese $dat,q11
+ aesmc $dat,$dat
+ aese $dat,q12
+ aesmc $dat,$dat
+ aese $dat,q13
+ aesmc $dat,$dat
+ aese $dat,q14
+ aesmc $dat,$dat
+ veor $in0,$in0,$rndlast
+ aese $dat,q15
+
+ veor $in0,$in0,$dat
+ vst1.8 {$in0},[$out]
+
+.Lctr32_done:
+___
+$code.=<<___ if ($flavour !~ /64/); # epilogue for the non-64 prologue: restores d8-d15 and r4-r10, returns by popping pc
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r10,pc}
+___
+$code.=<<___ if ($flavour =~ /64/); # epilogue for the /64/ prologue: reloads x29, releases the 16-byte frame, returns
+ ldr x29,[sp],#16
+ ret
+___
+$code.=<<___; # record the size of the emitted function symbol
+.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
+___
+}}}
########################################
if ($flavour =~ /64/) { ######## 64-bit code
my %opcode = (
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+$3>>1,$3&1;
}
+ # Rewrite the operand list of a "vmov.32 qN[lane],src" instruction so that
+ # it addresses the D register that holds the requested 32-bit lane.
+ #
+ # $arg is the text following the mnemonic, e.g. "q1[3],r10".
+ # Returns the rewritten instruction, e.g. "vmov.32 d3[1],r10"; when $arg
+ # does not have the qN[lane],src shape the failed match's false value is
+ # returned, exactly as before.
+ sub unvmov32 {
+ my $arg=shift;
+
+ # qN overlays d(2N) and d(2N+1): lanes 0-1 live in the low half and
+ # lanes 2-3 in the high half. "+" binds tighter than ">>" in Perl, so
+ # the shift must be parenthesised; the unparenthesised form computed
+ # (2*N+lane)>>1 and picked the wrong D register (q1[3] became d2[1]).
+ $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
+ sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
+ }
+
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
+ s/vmov\.32\s+(.*)/unvmov32($1)/geo or
s/^(\s+)b\./$1b/o or
s/^(\s+)ret/$1bx\tlr/o;