aesv8-armx.pl: add CTR implementation.

author Andy Polyakov <appro@openssl.org>

Thu, 29 May 2014 20:45:35 +0000 (22:45 +0200)

committer Andy Polyakov <appro@openssl.org>

Thu, 29 May 2014 20:45:35 +0000 (22:45 +0200)
author Andy Polyakov <appro@openssl.org>
Thu, 29 May 2014 20:45:35 +0000 (22:45 +0200)
committer Andy Polyakov <appro@openssl.org>
Thu, 29 May 2014 20:45:35 +0000 (22:45 +0200)
diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl

index c4481b56b1d14dfe3f06fd189e48c6a8b26f7f5f..763377f2ccbda7fb503f182ec63a8d7c9273fb6f 100755 (executable)
--- a/crypto/aes/asm/aesv8-armx.pl
+++ b/crypto/aes/asm/aesv8-armx.pl
@@ -20,6 +20,7 @@
  #
  #              CBC enc         CBC dec
  # Apple A7     2.39            1.20
+# Cortex-A5x   n/a             n/a
  
  $flavour = shift;
  $prefix="AES";
@@ -479,8 +480,8 @@ $code.=<<___;
         aesd    $dat1,q15
  
         veor    $ivec,$ivec,$dat0
-       veor    $in0,$in0,$dat1
         vld1.8  {$dat0},[$inp],$step
+       veor    $in0,$in0,$dat1
         vld1.8  {$dat1},[$inp],$step1
         vst1.8  {$ivec},[$out],#16
         veor    $ivec,$in1,$rndlast
@@ -622,6 +623,245 @@ $code.=<<___;
  .size  ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
  ___
  }}}
+{{{	# appends the ${prefix}_ctr32_encrypt_blocks routine to $code
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));	# register names x0..x4
+my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10");
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));	# q0..q7
+# $dat/$tmp are aliases of $dat0/$tmp0; $dat is what the one-block tail uses.
+my ($dat,$tmp)=($dat0,$tmp0);
+# Body: 2-block loop (.Loop2x_ctr32), 10-round variant (.Lctr32_128), 1-block tail.
+### q8-q15     preloaded key schedule
+# NOTE(review): body tests `#ifndef BIG_ENDIAN`; confirm the build defines that macro on BE targets.
+$code.=<<___;	# symbol, type and alignment directives plus the entry label
+.globl ${prefix}_ctr32_encrypt_blocks
+.type  ${prefix}_ctr32_encrypt_blocks,%function
+.align 5
+${prefix}_ctr32_encrypt_blocks:
+___
+$code.=<<___   if ($flavour =~ /64/);	# 64-bit prologue: push x29/x30, x29=sp
+       stp             x29,x30,[sp,#-16]!
+       add             x29,sp,#0
+___
+$code.=<<___   if ($flavour !~ /64/);	# other flavours: save r4-r10,lr and d8-d15, load r4 from entry sp
+       mov             ip,sp
+       stmdb           sp!,{r4-r10,lr}
+       vstmdb          sp!,{d8-d15}            @ ABI specification says so
+       ldr             r4, [ip]                @ load remaining arg
+___
+$code.=<<___;	# body shared by both flavours
+       ldr             $rounds,[$key,#240]
+
+       ldr             $ctr, [$ivp, #12]
+       vld1.32         {$dat0},[$ivp]
+
+       vld1.32         {q8-q9},[$key]          // load key schedule...
+       sub             $rounds,$rounds,#6
+       add             $key_,$key,x5,lsl#4     // pointer to last 7 round keys
+       sub             $rounds,$rounds,#2
+       vld1.32         {q10-q11},[$key_],#32
+       vld1.32         {q12-q13},[$key_],#32
+       vld1.32         {q14-q15},[$key_],#32
+       vld1.32         {$rndlast},[$key_]
+
+       add             $key_,$key,#32
+       mov             $cnt,$rounds
+
+       subs            $len,$len,#2
+       b.lo            .Lctr32_tail
+
+#ifndef BIG_ENDIAN
+       rev             $ctr, $ctr
+#endif
+       vorr            $dat1,$dat0,$dat0
+       add             $ctr, $ctr, #1
+       vorr            $ivec,$dat0,$dat0
+       rev             $tctr1, $ctr
+       cmp             $rounds,#2
+       vmov.32         ${dat1}[3],$tctr1
+       b.eq            .Lctr32_128
+
+.Loop2x_ctr32:
+       aese            $dat0,q8
+       aese            $dat1,q8
+       vld1.32         {q8},[$key_],#16
+       aesmc           $dat0,$dat0
+       aesmc           $dat1,$dat1
+       subs            $cnt,$cnt,#2
+       aese            $dat0,q9
+       aese            $dat1,q9
+       vld1.32         {q9},[$key_],#16
+       aesmc           $dat0,$dat0
+       aesmc           $dat1,$dat1
+       b.gt            .Loop2x_ctr32
+
+       aese            $dat0,q8
+       aese            $dat1,q8
+       aesmc           $tmp0,$dat0
+        vorr           $dat0,$ivec,$ivec
+       aesmc           $tmp1,$dat1
+        vorr           $dat1,$ivec,$ivec
+       aese            $tmp0,q9
+       aese            $tmp1,q9
+        vld1.8         {$in0},[$inp],#16
+       aesmc           $tmp0,$tmp0
+        vld1.8         {$in1},[$inp],#16
+       aesmc           $tmp1,$tmp1
+        add            $ctr,$ctr,#1
+       aese            $tmp0,q10
+       aese            $tmp1,q10
+        rev            $tctr,$ctr
+       aesmc           $tmp0,$tmp0
+       aesmc           $tmp1,$tmp1
+        add            $ctr,$ctr,#1
+       aese            $tmp0,q11
+       aese            $tmp1,q11
+        veor           $in0,$in0,$rndlast
+        rev            $tctr1,$ctr
+       aesmc           $tmp0,$tmp0
+       aesmc           $tmp1,$tmp1
+        veor           $in1,$in1,$rndlast
+        mov            $key_,$key
+       aese            $tmp0,q12
+       aese            $tmp1,q12
+        subs           $len,$len,#2
+       aesmc           $tmp0,$tmp0
+       aesmc           $tmp1,$tmp1
+        vld1.32         {q8},[$key_],#16       // re-pre-load rndkey[0]
+       aese            $tmp0,q13
+       aese            $tmp1,q13
+       aesmc           $tmp0,$tmp0
+       aesmc           $tmp1,$tmp1
+        vld1.32         {q9},[$key_],#16       // re-pre-load rndkey[1]
+       aese            $tmp0,q14
+       aese            $tmp1,q14
+        vmov.32        ${dat0}[3], $tctr
+       aesmc           $tmp0,$tmp0
+        vmov.32        ${dat1}[3], $tctr1
+       aesmc           $tmp1,$tmp1
+       aese            $tmp0,q15
+       aese            $tmp1,q15
+
+        mov            $cnt,$rounds
+       veor            $in0,$in0,$tmp0
+       veor            $in1,$in1,$tmp1
+       vst1.8          {$in0},[$out],#16
+       vst1.8          {$in1},[$out],#16
+       b.hs            .Loop2x_ctr32
+
+       adds            $len,$len,#2
+       b.eq            .Lctr32_done
+       b               .Lctr32_tail
+
+.Lctr32_128:
+       vld1.32         {$tmp0-$tmp1},[$key_]
+
+.Loop2x_ctr32_128:
+       aese            $dat0,q8
+       aese            $dat1,q8
+       aesmc           $dat0,$dat0
+        vld1.8         {$in0},[$inp],#16
+       aesmc           $dat1,$dat1
+        vld1.8         {$in1},[$inp],#16
+       aese            $dat0,q9
+       aese            $dat1,q9
+        add            $ctr,$ctr,#1
+       aesmc           $dat0,$dat0
+       aesmc           $dat1,$dat1
+        rev            $tctr,$ctr
+       aese            $dat0,$tmp0
+       aese            $dat1,$tmp0
+        add            $ctr,$ctr,#1
+       aesmc           $dat0,$dat0
+       aesmc           $dat1,$dat1
+        rev            $tctr1,$ctr
+       aese            $dat0,$tmp1
+       aese            $dat1,$tmp1
+        subs           $len,$len,#2
+       aesmc           $dat0,$dat0
+       aesmc           $dat1,$dat1
+       aese            $dat0,q10
+       aese            $dat1,q10
+       aesmc           $dat0,$dat0
+       aesmc           $dat1,$dat1
+       aese            $dat0,q11
+       aese            $dat1,q11
+       aesmc           $dat0,$dat0
+       aesmc           $dat1,$dat1
+       aese            $dat0,q12
+       aese            $dat1,q12
+       aesmc           $dat0,$dat0
+       aesmc           $dat1,$dat1
+       aese            $dat0,q13
+       aese            $dat1,q13
+       aesmc           $dat0,$dat0
+       aesmc           $dat1,$dat1
+       aese            $dat0,q14
+       aese            $dat1,q14
+       aesmc           $dat0,$dat0
+       aesmc           $dat1,$dat1
+        veor           $in0,$in0,$rndlast
+       aese            $dat0,q15
+        veor           $in1,$in1,$rndlast
+       aese            $dat1,q15
+
+       veor            $in0,$in0,$dat0
+       vorr            $dat0,$ivec,$ivec
+       veor            $in1,$in1,$dat1
+       vorr            $dat1,$ivec,$ivec
+       vst1.8          {$in0},[$out],#16
+       vmov.32         ${dat0}[3], $tctr
+       vst1.8          {$in1},[$out],#16
+       vmov.32         ${dat1}[3], $tctr1
+       b.hs            .Loop2x_ctr32_128
+
+       adds            $len,$len,#2
+       b.eq            .Lctr32_done
+
+.Lctr32_tail:
+       aese            $dat,q8
+       vld1.32         {q8},[$key_],#16
+       aesmc           $dat,$dat
+       subs            $cnt,$cnt,#2
+       aese            $dat,q9
+       vld1.32         {q9},[$key_],#16
+       aesmc           $dat,$dat
+       b.gt            .Lctr32_tail
+
+       aese            $dat,q8
+       aesmc           $dat,$dat
+       aese            $dat,q9
+       aesmc           $dat,$dat
+        vld1.8         {$in0},[$inp]
+       aese            $dat,q10
+       aesmc           $dat,$dat
+       aese            $dat,q11
+       aesmc           $dat,$dat
+       aese            $dat,q12
+       aesmc           $dat,$dat
+       aese            $dat,q13
+       aesmc           $dat,$dat
+       aese            $dat,q14
+       aesmc           $dat,$dat
+        veor           $in0,$in0,$rndlast
+       aese            $dat,q15
+
+       veor            $in0,$in0,$dat
+       vst1.8          {$in0},[$out]
+
+.Lctr32_done:
+___
+$code.=<<___   if ($flavour !~ /64/);	# non-64 epilogue: restore d8-d15 and r4-r10, return by loading pc
+       vldmia          sp!,{d8-d15}
+       ldmia           sp!,{r4-r10,pc}
+___
+$code.=<<___   if ($flavour =~ /64/);	# 64-bit epilogue: reload x29 (only), pop 16 bytes, ret
+       ldr             x29,[sp],#16
+       ret
+___
+$code.=<<___;	# .size directive closing the routine
+.size  ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
+___
+}}}
  ########################################
  if ($flavour =~ /64/) {                        ######## 64-bit code
      my %opcode = (
@@ -691,6 +931,13 @@ if ($flavour =~ /64/) {                    ######## 64-bit code
         sprintf "vdup.32        q%d,d%d[%d]",$1,2*$2+$3>>1,$3&1;        
      }
  
+    sub unvmov32 {	# "qN[i],src" operand text -> "vmov.32 dM[j],src", or false if no match
+       my $arg=shift;
+       # lane i of qN is lane i&1 of d(2N+(i>>1)); parens needed, >> binds looser than +
+       $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
+       sprintf "vmov.32        d%d[%d],%s",2*$1+($2>>1),$2&1,$3; 
+    }
+
      foreach(split("\n",$code)) {
          s/\`([^\`]*)\`/eval($1)/geo;
  
@@ -705,6 +952,7 @@ if ($flavour =~ /64/) {                     ######## 64-bit code
         s/cclr\s+([^,]+),\s*([a-z]+)/mov$2      $1,#0/o or
         s/vtbl\.8\s+(.*)/unvtbl($1)/geo                 or
         s/vdup\.32\s+(.*)/unvdup32($1)/geo              or
+       s/vmov\.32\s+(.*)/unvmov32($1)/geo              or
         s/^(\s+)b\./$1b/o                               or
         s/^(\s+)ret/$1bx\tlr/o;
author	Andy Polyakov <appro@openssl.org>
	Thu, 29 May 2014 20:45:35 +0000 (22:45 +0200)
committer	Andy Polyakov <appro@openssl.org>
	Thu, 29 May 2014 20:45:35 +0000 (22:45 +0200)