bn/asm/armv4-mont.pl: boost NEON performance.
author    Andy Polyakov <appro@openssl.org>
          Fri, 25 Sep 2015 11:43:00 +0000 (13:43 +0200)
committer Andy Polyakov <appro@openssl.org>
          Wed, 30 Sep 2015 15:48:30 +0000 (17:48 +0200)
Close the performance gap on Cortex-A9; the change resulted in further
improvement even on other processors.

Reviewed-by: Richard Levitte <levitte@openssl.org>
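
For context, below is a minimal reference sketch of the Montgomery multiplication
that bn_mul_mont and bn_mul8x_mont_neon compute. The helpers mont_mul_ref and
mont_mul_words are hypothetical and not part of the patch; they assume n is odd
and n0 == -n^-1 mod 2^32, and they model the arithmetic at 32-bit word
granularity (the NEON path itself works on 16-bit halves via vzip.16/vmull.u32).

    use strict;
    use warnings;
    use Math::BigInt;

    # Reference definition: result == a*b*R^-1 mod n, with R = 2^(32*$num).
    sub mont_mul_ref {
        my ($a, $b, $n, $num) = @_;              # Math::BigInt inputs, $num = word count
        my $R = Math::BigInt->new(1)->blsft(32*$num);
        return ($a * $b * $R->copy->bmodinv($n)) % $n;
    }

    # Word-serial variant mirroring the assembly's flow: for each word of b,
    # accumulate a*b[i], then add m*n with m chosen via n0 so that the low
    # word of the accumulator becomes zero and can be shifted out.
    sub mont_mul_words {
        my ($a, $b, $n, $n0, $num) = @_;
        my $mask = Math::BigInt->new(0xffffffff);
        my $t    = Math::BigInt->bzero();
        for my $i (0 .. $num-1) {
            my $bi = ($b->copy->brsft(32*$i)) & $mask;
            $t += $a * $bi;
            my $m = (($t & $mask) * $n0) & $mask;   # kills the low word of t+m*n
            $t = ($t + $m * $n)->brsft(32);
        }
        $t -= $n if $t >= $n;                        # final conditional subtraction
        return $t;
    }

With R = 2^(32*num) and a,b < n, both return a*b*R^-1 mod n, the quantity the
routine stores through the result pointer rp.
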
crypto/bn/asm/armv4-mont.pl

index bd56f989c70a158451571541f06bb1936a41ca51..48e523013fe4523d9a9feac649dc6b491f5cdc4f 100644
 # for execution on all NEON-capable processors, because gain on
 # others outweighs the marginal loss on Cortex-A9.
 
+# September 2015
+#
+# Align Cortex-A9 performance with November 2013 improvements, i.e.
+# NEON code is now ~20-105% faster than integer-only one on this
+# processor. But this optimization further improved performance even
+# on other processors: NEON code path is ~45-180% faster than original
+# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
+# Snapdragon S4.
+
 $flavour = shift;
 if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
 else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
@@ -272,19 +281,16 @@ bn_mul_mont:
 .size  bn_mul_mont,.-bn_mul_mont
 ___
 {
-sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
-sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
-
 my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
 my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
 my ($Z,$Temp)=("q4","q5");
-my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
+my @ACC=map("q$_",(6..13));
 my ($Bi,$Ni,$M0)=map("d$_",(28..31));
-my $zero=&Dlo($Z);
-my $temp=&Dlo($Temp);
+my $zero="$Z#lo";
+my $temp="$Temp#lo";
 
 my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
-my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
+my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));
 
 $code.=<<___;
 #if __ARM_MAX_ARCH__>=7
@@ -300,59 +306,58 @@ bn_mul8x_mont_neon:
        ldmia   ip,{r4-r5}              @ load rest of parameter block
        mov     ip,sp
 
-       sub             $toutptr,sp,#16
+       cmp     $num,#8
+       bhi     .LNEON_8n
+
+       @ special case for $num==8, everything is in register bank...
+
        vld1.32         {${Bi}[0]}, [$bptr,:32]!
-       sub             $toutptr,$toutptr,$num,lsl#4
+       veor            $zero,$zero,$zero
+       sub             $toutptr,sp,$num,lsl#4
        vld1.32         {$A0-$A3},  [$aptr]!            @ can't specify :32 :-(
        and             $toutptr,$toutptr,#-64
        vld1.32         {${M0}[0]}, [$n0,:32]
        mov             sp,$toutptr                     @ alloca
-       veor            $zero,$zero,$zero
-       subs            $inner,$num,#8
        vzip.16         $Bi,$zero
 
-       vmull.u32       $A0xB,$Bi,${A0}[0]
-       vmull.u32       $A1xB,$Bi,${A0}[1]
-       vmull.u32       $A2xB,$Bi,${A1}[0]
-       vshl.i64        $temp,`&Dhi("$A0xB")`,#16
-       vmull.u32       $A3xB,$Bi,${A1}[1]
+       vmull.u32       @ACC[0],$Bi,${A0}[0]
+       vmull.u32       @ACC[1],$Bi,${A0}[1]
+       vmull.u32       @ACC[2],$Bi,${A1}[0]
+       vshl.i64        $Ni,@ACC[0]#hi,#16
+       vmull.u32       @ACC[3],$Bi,${A1}[1]
 
-       vadd.u64        $temp,$temp,`&Dlo("$A0xB")`
+       vadd.u64        $Ni,$Ni,@ACC[0]#lo
        veor            $zero,$zero,$zero
-       vmul.u32        $Ni,$temp,$M0
+       vmul.u32        $Ni,$Ni,$M0
 
-       vmull.u32       $A4xB,$Bi,${A2}[0]
+       vmull.u32       @ACC[4],$Bi,${A2}[0]
         vld1.32        {$N0-$N3}, [$nptr]!
-       vmull.u32       $A5xB,$Bi,${A2}[1]
-       vmull.u32       $A6xB,$Bi,${A3}[0]
+       vmull.u32       @ACC[5],$Bi,${A2}[1]
+       vmull.u32       @ACC[6],$Bi,${A3}[0]
        vzip.16         $Ni,$zero
-       vmull.u32       $A7xB,$Bi,${A3}[1]
-
-       bne     .LNEON_1st
+       vmull.u32       @ACC[7],$Bi,${A3}[1]
 
-       @ special case for num=8, everything is in register bank...
-
-       vmlal.u32       $A0xB,$Ni,${N0}[0]
+       vmlal.u32       @ACC[0],$Ni,${N0}[0]
        sub             $outer,$num,#1
-       vmlal.u32       $A1xB,$Ni,${N0}[1]
-       vmlal.u32       $A2xB,$Ni,${N1}[0]
-       vmlal.u32       $A3xB,$Ni,${N1}[1]
-
-       vmlal.u32       $A4xB,$Ni,${N2}[0]
-       vmov            $Temp,$A0xB
-       vmlal.u32       $A5xB,$Ni,${N2}[1]
-       vmov            $A0xB,$A1xB
-       vmlal.u32       $A6xB,$Ni,${N3}[0]
-       vmov            $A1xB,$A2xB
-       vmlal.u32       $A7xB,$Ni,${N3}[1]
-       vmov            $A2xB,$A3xB
-       vmov            $A3xB,$A4xB
+       vmlal.u32       @ACC[1],$Ni,${N0}[1]
+       vmlal.u32       @ACC[2],$Ni,${N1}[0]
+       vmlal.u32       @ACC[3],$Ni,${N1}[1]
+
+       vmlal.u32       @ACC[4],$Ni,${N2}[0]
+       vmov            $Temp,@ACC[0]
+       vmlal.u32       @ACC[5],$Ni,${N2}[1]
+       vmov            @ACC[0],@ACC[1]
+       vmlal.u32       @ACC[6],$Ni,${N3}[0]
+       vmov            @ACC[1],@ACC[2]
+       vmlal.u32       @ACC[7],$Ni,${N3}[1]
+       vmov            @ACC[2],@ACC[3]
+       vmov            @ACC[3],@ACC[4]
        vshr.u64        $temp,$temp,#16
-       vmov            $A4xB,$A5xB
-       vmov            $A5xB,$A6xB
-       vadd.u64        $temp,$temp,`&Dhi("$Temp")`
-       vmov            $A6xB,$A7xB
-       veor            $A7xB,$A7xB
+       vmov            @ACC[4],@ACC[5]
+       vmov            @ACC[5],@ACC[6]
+       vadd.u64        $temp,$temp,$Temp#hi
+       vmov            @ACC[6],@ACC[7]
+       veor            @ACC[7],@ACC[7]
        vshr.u64        $temp,$temp,#16
 
        b       .LNEON_outer8
@@ -362,279 +367,302 @@ bn_mul8x_mont_neon:
        vld1.32         {${Bi}[0]}, [$bptr,:32]!
        veor            $zero,$zero,$zero
        vzip.16         $Bi,$zero
-       vadd.u64        `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+       vadd.u64        @ACC[0]#lo,@ACC[0]#lo,$temp
 
-       vmlal.u32       $A0xB,$Bi,${A0}[0]
-       vmlal.u32       $A1xB,$Bi,${A0}[1]
-       vmlal.u32       $A2xB,$Bi,${A1}[0]
-       vshl.i64        $temp,`&Dhi("$A0xB")`,#16
-       vmlal.u32       $A3xB,$Bi,${A1}[1]
+       vmlal.u32       @ACC[0],$Bi,${A0}[0]
+       vmlal.u32       @ACC[1],$Bi,${A0}[1]
+       vmlal.u32       @ACC[2],$Bi,${A1}[0]
+       vshl.i64        $Ni,@ACC[0]#hi,#16
+       vmlal.u32       @ACC[3],$Bi,${A1}[1]
 
-       vadd.u64        $temp,$temp,`&Dlo("$A0xB")`
+       vadd.u64        $Ni,$Ni,@ACC[0]#lo
        veor            $zero,$zero,$zero
        subs            $outer,$outer,#1
-       vmul.u32        $Ni,$temp,$M0
+       vmul.u32        $Ni,$Ni,$M0
 
-       vmlal.u32       $A4xB,$Bi,${A2}[0]
-       vmlal.u32       $A5xB,$Bi,${A2}[1]
-       vmlal.u32       $A6xB,$Bi,${A3}[0]
+       vmlal.u32       @ACC[4],$Bi,${A2}[0]
+       vmlal.u32       @ACC[5],$Bi,${A2}[1]
+       vmlal.u32       @ACC[6],$Bi,${A3}[0]
        vzip.16         $Ni,$zero
-       vmlal.u32       $A7xB,$Bi,${A3}[1]
-
-       vmlal.u32       $A0xB,$Ni,${N0}[0]
-       vmlal.u32       $A1xB,$Ni,${N0}[1]
-       vmlal.u32       $A2xB,$Ni,${N1}[0]
-       vmlal.u32       $A3xB,$Ni,${N1}[1]
-
-       vmlal.u32       $A4xB,$Ni,${N2}[0]
-       vmov            $Temp,$A0xB
-       vmlal.u32       $A5xB,$Ni,${N2}[1]
-       vmov            $A0xB,$A1xB
-       vmlal.u32       $A6xB,$Ni,${N3}[0]
-       vmov            $A1xB,$A2xB
-       vmlal.u32       $A7xB,$Ni,${N3}[1]
-       vmov            $A2xB,$A3xB
-       vmov            $A3xB,$A4xB
+       vmlal.u32       @ACC[7],$Bi,${A3}[1]
+
+       vmlal.u32       @ACC[0],$Ni,${N0}[0]
+       vmlal.u32       @ACC[1],$Ni,${N0}[1]
+       vmlal.u32       @ACC[2],$Ni,${N1}[0]
+       vmlal.u32       @ACC[3],$Ni,${N1}[1]
+
+       vmlal.u32       @ACC[4],$Ni,${N2}[0]
+       vmov            $Temp,@ACC[0]
+       vmlal.u32       @ACC[5],$Ni,${N2}[1]
+       vmov            @ACC[0],@ACC[1]
+       vmlal.u32       @ACC[6],$Ni,${N3}[0]
+       vmov            @ACC[1],@ACC[2]
+       vmlal.u32       @ACC[7],$Ni,${N3}[1]
+       vmov            @ACC[2],@ACC[3]
+       vmov            @ACC[3],@ACC[4]
        vshr.u64        $temp,$temp,#16
-       vmov            $A4xB,$A5xB
-       vmov            $A5xB,$A6xB
-       vadd.u64        $temp,$temp,`&Dhi("$Temp")`
-       vmov            $A6xB,$A7xB
-       veor            $A7xB,$A7xB
+       vmov            @ACC[4],@ACC[5]
+       vmov            @ACC[5],@ACC[6]
+       vadd.u64        $temp,$temp,$Temp#hi
+       vmov            @ACC[6],@ACC[7]
+       veor            @ACC[7],@ACC[7]
        vshr.u64        $temp,$temp,#16
 
        bne     .LNEON_outer8
 
-       vadd.u64        `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+       vadd.u64        @ACC[0]#lo,@ACC[0]#lo,$temp
        mov             $toutptr,sp
-       vshr.u64        $temp,`&Dlo("$A0xB")`,#16
+       vshr.u64        $temp,@ACC[0]#lo,#16
        mov             $inner,$num
-       vadd.u64        `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
-       add             $tinptr,sp,#16
-       vshr.u64        $temp,`&Dhi("$A0xB")`,#16
-       vzip.16         `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
+       vadd.u64        @ACC[0]#hi,@ACC[0]#hi,$temp
+       add             $tinptr,sp,#96
+       vshr.u64        $temp,@ACC[0]#hi,#16
+       vzip.16         @ACC[0]#lo,@ACC[0]#hi
 
-       b       .LNEON_tail2
+       b       .LNEON_tail_entry
 
 .align 4
-.LNEON_1st:
-       vmlal.u32       $A0xB,$Ni,${N0}[0]
-        vld1.32        {$A0-$A3}, [$aptr]!
-       vmlal.u32       $A1xB,$Ni,${N0}[1]
+.LNEON_8n:
+       veor            @ACC[0],@ACC[0],@ACC[0]
+        sub            $toutptr,sp,#128
+       veor            @ACC[1],@ACC[1],@ACC[1]
+        sub            $toutptr,$toutptr,$num,lsl#4
+       veor            @ACC[2],@ACC[2],@ACC[2]
+        and            $toutptr,$toutptr,#-64
+       veor            @ACC[3],@ACC[3],@ACC[3]
+        mov            sp,$toutptr                     @ alloca
+       veor            @ACC[4],@ACC[4],@ACC[4]
+        add            $toutptr,$toutptr,#256
+       veor            @ACC[5],@ACC[5],@ACC[5]
+        sub            $inner,$num,#8
+       veor            @ACC[6],@ACC[6],@ACC[6]
+       veor            @ACC[7],@ACC[7],@ACC[7]
+
+.LNEON_8n_init:
+       vst1.64         {@ACC[0]-@ACC[1]},[$toutptr,:256]!
        subs            $inner,$inner,#8
-       vmlal.u32       $A2xB,$Ni,${N1}[0]
-       vmlal.u32       $A3xB,$Ni,${N1}[1]
-
-       vmlal.u32       $A4xB,$Ni,${N2}[0]
-        vld1.32        {$N0-$N1}, [$nptr]!
-       vmlal.u32       $A5xB,$Ni,${N2}[1]
-        vst1.64        {$A0xB-$A1xB}, [$toutptr,:256]!
-       vmlal.u32       $A6xB,$Ni,${N3}[0]
-       vmlal.u32       $A7xB,$Ni,${N3}[1]
-        vst1.64        {$A2xB-$A3xB}, [$toutptr,:256]!
-
-       vmull.u32       $A0xB,$Bi,${A0}[0]
-        vld1.32        {$N2-$N3}, [$nptr]!
-       vmull.u32       $A1xB,$Bi,${A0}[1]
-        vst1.64        {$A4xB-$A5xB}, [$toutptr,:256]!
-       vmull.u32       $A2xB,$Bi,${A1}[0]
-       vmull.u32       $A3xB,$Bi,${A1}[1]
-        vst1.64        {$A6xB-$A7xB}, [$toutptr,:256]!
-
-       vmull.u32       $A4xB,$Bi,${A2}[0]
-       vmull.u32       $A5xB,$Bi,${A2}[1]
-       vmull.u32       $A6xB,$Bi,${A3}[0]
-       vmull.u32       $A7xB,$Bi,${A3}[1]
-
-       bne     .LNEON_1st
-
-       vmlal.u32       $A0xB,$Ni,${N0}[0]
-       add             $tinptr,sp,#16
-       vmlal.u32       $A1xB,$Ni,${N0}[1]
-       sub             $aptr,$aptr,$num,lsl#2          @ rewind $aptr
-       vmlal.u32       $A2xB,$Ni,${N1}[0]
-        vld1.64        {$Temp}, [sp,:128]
-       vmlal.u32       $A3xB,$Ni,${N1}[1]
-       sub             $outer,$num,#1
-
-       vmlal.u32       $A4xB,$Ni,${N2}[0]
-       vst1.64         {$A0xB-$A1xB}, [$toutptr,:256]!
-       vmlal.u32       $A5xB,$Ni,${N2}[1]
-       vshr.u64        $temp,$temp,#16
-        vld1.64        {$A0xB},       [$tinptr, :128]!
-       vmlal.u32       $A6xB,$Ni,${N3}[0]
-       vst1.64         {$A2xB-$A3xB}, [$toutptr,:256]!
-       vmlal.u32       $A7xB,$Ni,${N3}[1]
-
-       vst1.64         {$A4xB-$A5xB}, [$toutptr,:256]!
-       vadd.u64        $temp,$temp,`&Dhi("$Temp")`
-       veor            $Z,$Z,$Z
-       vst1.64         {$A6xB-$A7xB}, [$toutptr,:256]!
-        vld1.64        {$A1xB-$A2xB}, [$tinptr, :256]!
-       vst1.64         {$Z},          [$toutptr,:128]
-       vshr.u64        $temp,$temp,#16
-
-       b               .LNEON_outer
+       vst1.64         {@ACC[2]-@ACC[3]},[$toutptr,:256]!
+       vst1.64         {@ACC[4]-@ACC[5]},[$toutptr,:256]!
+       vst1.64         {@ACC[6]-@ACC[7]},[$toutptr,:256]!
+       bne             .LNEON_8n_init
+
+       add             $tinptr,sp,#256
+       vld1.32         {$A0-$A3},[$aptr]!
+       add             $bnptr,sp,#8
+       vld1.32         {${M0}[0]},[$n0,:32]
+       mov             $outer,$num
+       b               .LNEON_8n_outer
 
 .align 4
-.LNEON_outer:
-       vld1.32         {${Bi}[0]}, [$bptr,:32]!
-       sub             $nptr,$nptr,$num,lsl#2          @ rewind $nptr
-       vld1.32         {$A0-$A3},  [$aptr]!
+.LNEON_8n_outer:
+       vld1.32         {${Bi}[0]},[$bptr,:32]! @ *b++
        veor            $zero,$zero,$zero
-       mov             $toutptr,sp
        vzip.16         $Bi,$zero
+       add             $toutptr,sp,#128
+       vld1.32         {$N0-$N3},[$nptr]!
+
+       vmlal.u32       @ACC[0],$Bi,${A0}[0]
+       vmlal.u32       @ACC[1],$Bi,${A0}[1]
+        veor           $zero,$zero,$zero
+       vmlal.u32       @ACC[2],$Bi,${A1}[0]
+        vshl.i64       $Ni,@ACC[0]#hi,#16
+       vmlal.u32       @ACC[3],$Bi,${A1}[1]
+        vadd.u64       $Ni,$Ni,@ACC[0]#lo
+       vmlal.u32       @ACC[4],$Bi,${A2}[0]
+        vmul.u32       $Ni,$Ni,$M0
+       vmlal.u32       @ACC[5],$Bi,${A2}[1]
+       vst1.32         {$Bi},[sp,:64]          @ put aside smashed b[8*i+0]
+       vmlal.u32       @ACC[6],$Bi,${A3}[0]
+        vzip.16        $Ni,$zero
+       vmlal.u32       @ACC[7],$Bi,${A3}[1]
+___
+for ($i=0; $i<7;) {
+$code.=<<___;
+       vld1.32         {${Bi}[0]},[$bptr,:32]! @ *b++
+       vmlal.u32       @ACC[0],$Ni,${N0}[0]
+       veor            $temp,$temp,$temp
+       vmlal.u32       @ACC[1],$Ni,${N0}[1]
+       vzip.16         $Bi,$temp
+       vmlal.u32       @ACC[2],$Ni,${N1}[0]
+        vshr.u64       @ACC[0]#lo,@ACC[0]#lo,#16
+       vmlal.u32       @ACC[3],$Ni,${N1}[1]
+       vmlal.u32       @ACC[4],$Ni,${N2}[0]
+        vadd.u64       @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
+       vmlal.u32       @ACC[5],$Ni,${N2}[1]
+        vshr.u64       @ACC[0]#lo,@ACC[0]#lo,#16
+       vmlal.u32       @ACC[6],$Ni,${N3}[0]
+       vmlal.u32       @ACC[7],$Ni,${N3}[1]
+        vadd.u64       @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
+       vst1.32         {$Ni},[$bnptr,:64]!     @ put aside smashed m[8*i+$i]
+___
+       push(@ACC,shift(@ACC)); $i++;
+$code.=<<___;
+       vmlal.u32       @ACC[0],$Bi,${A0}[0]
+       vld1.64         {@ACC[7]},[$tinptr,:128]!
+       vmlal.u32       @ACC[1],$Bi,${A0}[1]
+        veor           $zero,$zero,$zero
+       vmlal.u32       @ACC[2],$Bi,${A1}[0]
+        vshl.i64       $Ni,@ACC[0]#hi,#16
+       vmlal.u32       @ACC[3],$Bi,${A1}[1]
+        vadd.u64       $Ni,$Ni,@ACC[0]#lo
+       vmlal.u32       @ACC[4],$Bi,${A2}[0]
+        vmul.u32       $Ni,$Ni,$M0
+       vmlal.u32       @ACC[5],$Bi,${A2}[1]
+       vst1.32         {$Bi},[$bnptr,:64]!     @ put aside smashed b[8*i+$i]
+       vmlal.u32       @ACC[6],$Bi,${A3}[0]
+        vzip.16        $Ni,$zero
+       vmlal.u32       @ACC[7],$Bi,${A3}[1]
+___
+}
+$code.=<<___;
+       vld1.32         {$Bi},[sp,:64]          @ pull smashed b[8*i+0]
+       vmlal.u32       @ACC[0],$Ni,${N0}[0]
+       vld1.32         {$A0-$A3},[$aptr]!
+       vmlal.u32       @ACC[1],$Ni,${N0}[1]
+       vmlal.u32       @ACC[2],$Ni,${N1}[0]
+        vshr.u64       @ACC[0]#lo,@ACC[0]#lo,#16
+       vmlal.u32       @ACC[3],$Ni,${N1}[1]
+       vmlal.u32       @ACC[4],$Ni,${N2}[0]
+        vadd.u64       @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
+       vmlal.u32       @ACC[5],$Ni,${N2}[1]
+        vshr.u64       @ACC[0]#lo,@ACC[0]#lo,#16
+       vmlal.u32       @ACC[6],$Ni,${N3}[0]
+       vmlal.u32       @ACC[7],$Ni,${N3}[1]
+        vadd.u64       @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
+       vst1.32         {$Ni},[$bnptr,:64]      @ put aside smashed m[8*i+$i]
+       add             $bnptr,sp,#8            @ rewind
+___
+       push(@ACC,shift(@ACC));
+$code.=<<___;
        sub             $inner,$num,#8
-       vadd.u64        `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
-
-       vmlal.u32       $A0xB,$Bi,${A0}[0]
-        vld1.64        {$A3xB-$A4xB},[$tinptr,:256]!
-       vmlal.u32       $A1xB,$Bi,${A0}[1]
-       vmlal.u32       $A2xB,$Bi,${A1}[0]
-        vld1.64        {$A5xB-$A6xB},[$tinptr,:256]!
-       vmlal.u32       $A3xB,$Bi,${A1}[1]
-
-       vshl.i64        $temp,`&Dhi("$A0xB")`,#16
-       veor            $zero,$zero,$zero
-       vadd.u64        $temp,$temp,`&Dlo("$A0xB")`
-        vld1.64        {$A7xB},[$tinptr,:128]!
-       vmul.u32        $Ni,$temp,$M0
-
-       vmlal.u32       $A4xB,$Bi,${A2}[0]
-        vld1.32        {$N0-$N3}, [$nptr]!
-       vmlal.u32       $A5xB,$Bi,${A2}[1]
-       vmlal.u32       $A6xB,$Bi,${A3}[0]
-       vzip.16         $Ni,$zero
-       vmlal.u32       $A7xB,$Bi,${A3}[1]
-
-.LNEON_inner:
-       vmlal.u32       $A0xB,$Ni,${N0}[0]
-        vld1.32        {$A0-$A3}, [$aptr]!
-       vmlal.u32       $A1xB,$Ni,${N0}[1]
-        subs           $inner,$inner,#8
-       vmlal.u32       $A2xB,$Ni,${N1}[0]
-       vmlal.u32       $A3xB,$Ni,${N1}[1]
-       vst1.64         {$A0xB-$A1xB}, [$toutptr,:256]!
-
-       vmlal.u32       $A4xB,$Ni,${N2}[0]
-        vld1.64        {$A0xB},       [$tinptr, :128]!
-       vmlal.u32       $A5xB,$Ni,${N2}[1]
-       vst1.64         {$A2xB-$A3xB}, [$toutptr,:256]!
-       vmlal.u32       $A6xB,$Ni,${N3}[0]
-        vld1.64        {$A1xB-$A2xB}, [$tinptr, :256]!
-       vmlal.u32       $A7xB,$Ni,${N3}[1]
-       vst1.64         {$A4xB-$A5xB}, [$toutptr,:256]!
-
-       vmlal.u32       $A0xB,$Bi,${A0}[0]
-        vld1.64        {$A3xB-$A4xB}, [$tinptr, :256]!
-       vmlal.u32       $A1xB,$Bi,${A0}[1]
-       vst1.64         {$A6xB-$A7xB}, [$toutptr,:256]!
-       vmlal.u32       $A2xB,$Bi,${A1}[0]
-        vld1.64        {$A5xB-$A6xB}, [$tinptr, :256]!
-       vmlal.u32       $A3xB,$Bi,${A1}[1]
-        vld1.32        {$N0-$N3}, [$nptr]!
-
-       vmlal.u32       $A4xB,$Bi,${A2}[0]
-        vld1.64        {$A7xB},       [$tinptr, :128]!
-       vmlal.u32       $A5xB,$Bi,${A2}[1]
-       vmlal.u32       $A6xB,$Bi,${A3}[0]
-       vmlal.u32       $A7xB,$Bi,${A3}[1]
-
-       bne     .LNEON_inner
-
-       vmlal.u32       $A0xB,$Ni,${N0}[0]
-       add             $tinptr,sp,#16
-       vmlal.u32       $A1xB,$Ni,${N0}[1]
-       sub             $aptr,$aptr,$num,lsl#2          @ rewind $aptr
-       vmlal.u32       $A2xB,$Ni,${N1}[0]
-        vld1.64        {$Temp}, [sp,:128]
-       vmlal.u32       $A3xB,$Ni,${N1}[1]
-       subs            $outer,$outer,#1
+       b               .LNEON_8n_inner
 
-       vmlal.u32       $A4xB,$Ni,${N2}[0]
-       vst1.64         {$A0xB-$A1xB}, [$toutptr,:256]!
-       vmlal.u32       $A5xB,$Ni,${N2}[1]
-        vld1.64        {$A0xB},       [$tinptr, :128]!
-       vshr.u64        $temp,$temp,#16
-       vst1.64         {$A2xB-$A3xB}, [$toutptr,:256]!
-       vmlal.u32       $A6xB,$Ni,${N3}[0]
-        vld1.64        {$A1xB-$A2xB}, [$tinptr, :256]!
-       vmlal.u32       $A7xB,$Ni,${N3}[1]
-
-       vst1.64         {$A4xB-$A5xB}, [$toutptr,:256]!
-       vadd.u64        $temp,$temp,`&Dhi("$Temp")`
-       vst1.64         {$A6xB-$A7xB}, [$toutptr,:256]!
-       vshr.u64        $temp,$temp,#16
-
-       bne     .LNEON_outer
+.align 4
+.LNEON_8n_inner:
+       subs            $inner,$inner,#8
+       vmlal.u32       @ACC[0],$Bi,${A0}[0]
+       vld1.64         {@ACC[7]},[$tinptr,:128]
+       vmlal.u32       @ACC[1],$Bi,${A0}[1]
+       vld1.32         {$Ni},[$bnptr,:64]!     @ pull smashed m[8*i+0]
+       vmlal.u32       @ACC[2],$Bi,${A1}[0]
+       vld1.32         {$N0-$N3},[$nptr]!
+       vmlal.u32       @ACC[3],$Bi,${A1}[1]
+       it              ne
+       addne           $tinptr,$tinptr,#16     @ don't advance in last iteration
+       vmlal.u32       @ACC[4],$Bi,${A2}[0]
+       vmlal.u32       @ACC[5],$Bi,${A2}[1]
+       vmlal.u32       @ACC[6],$Bi,${A3}[0]
+       vmlal.u32       @ACC[7],$Bi,${A3}[1]
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+       vld1.32         {$Bi},[$bnptr,:64]!     @ pull smashed b[8*i+$i]
+       vmlal.u32       @ACC[0],$Ni,${N0}[0]
+       vmlal.u32       @ACC[1],$Ni,${N0}[1]
+       vmlal.u32       @ACC[2],$Ni,${N1}[0]
+       vmlal.u32       @ACC[3],$Ni,${N1}[1]
+       vmlal.u32       @ACC[4],$Ni,${N2}[0]
+       vmlal.u32       @ACC[5],$Ni,${N2}[1]
+       vmlal.u32       @ACC[6],$Ni,${N3}[0]
+       vmlal.u32       @ACC[7],$Ni,${N3}[1]
+       vst1.64         {@ACC[0]},[$toutptr,:128]!
+___
+       push(@ACC,shift(@ACC));
+$code.=<<___;
+       vmlal.u32       @ACC[0],$Bi,${A0}[0]
+       vld1.64         {@ACC[7]},[$tinptr,:128]
+       vmlal.u32       @ACC[1],$Bi,${A0}[1]
+       vld1.32         {$Ni},[$bnptr,:64]!     @ pull smashed m[8*i+$i]
+       vmlal.u32       @ACC[2],$Bi,${A1}[0]
+       it              ne
+       addne           $tinptr,$tinptr,#16     @ don't advance in last iteration
+       vmlal.u32       @ACC[3],$Bi,${A1}[1]
+       vmlal.u32       @ACC[4],$Bi,${A2}[0]
+       vmlal.u32       @ACC[5],$Bi,${A2}[1]
+       vmlal.u32       @ACC[6],$Bi,${A3}[0]
+       vmlal.u32       @ACC[7],$Bi,${A3}[1]
+___
+}
+$code.=<<___;
+       it              eq
+       subeq           $aptr,$aptr,$num,lsl#2  @ rewind
+       vmlal.u32       @ACC[0],$Ni,${N0}[0]
+       vld1.32         {$Bi},[sp,:64]          @ pull smashed b[8*i+0]
+       vmlal.u32       @ACC[1],$Ni,${N0}[1]
+       vld1.32         {$A0-$A3},[$aptr]!
+       vmlal.u32       @ACC[2],$Ni,${N1}[0]
+       add             $bnptr,sp,#8            @ rewind
+       vmlal.u32       @ACC[3],$Ni,${N1}[1]
+       vmlal.u32       @ACC[4],$Ni,${N2}[0]
+       vmlal.u32       @ACC[5],$Ni,${N2}[1]
+       vmlal.u32       @ACC[6],$Ni,${N3}[0]
+       vst1.64         {@ACC[0]},[$toutptr,:128]!
+       vmlal.u32       @ACC[7],$Ni,${N3}[1]
+
+       bne             .LNEON_8n_inner
+___
+       push(@ACC,shift(@ACC));
+$code.=<<___;
+       add             $tinptr,sp,#128
+       vst1.64         {@ACC[0]-@ACC[1]},[$toutptr,:256]!
+       veor            q2,q2,q2                @ $N0-$N1
+       vst1.64         {@ACC[2]-@ACC[3]},[$toutptr,:256]!
+       veor            q3,q3,q3                @ $N2-$N3
+       vst1.64         {@ACC[4]-@ACC[5]},[$toutptr,:256]!
+       vst1.64         {@ACC[6]},[$toutptr,:128]
+
+       subs            $outer,$outer,#8
+       vld1.64         {@ACC[0]-@ACC[1]},[$tinptr,:256]!
+       vld1.64         {@ACC[2]-@ACC[3]},[$tinptr,:256]!
+       vld1.64         {@ACC[4]-@ACC[5]},[$tinptr,:256]!
+       vld1.64         {@ACC[6]-@ACC[7]},[$tinptr,:256]!
+
+       itt             ne
+       subne           $nptr,$nptr,$num,lsl#2  @ rewind
+       bne             .LNEON_8n_outer
+
+       add             $toutptr,sp,#128
+       vst1.64         {q2-q3}, [sp,:256]!     @ start wiping stack frame
+       vshr.u64        $temp,@ACC[0]#lo,#16
+       vst1.64         {q2-q3},[sp,:256]!
+       vadd.u64        @ACC[0]#hi,@ACC[0]#hi,$temp
+       vst1.64         {q2-q3}, [sp,:256]!
+       vshr.u64        $temp,@ACC[0]#hi,#16
+       vst1.64         {q2-q3}, [sp,:256]!
+       vzip.16         @ACC[0]#lo,@ACC[0]#hi
 
-       mov             $toutptr,sp
        mov             $inner,$num
+       b               .LNEON_tail_entry
 
+.align 4
 .LNEON_tail:
-       vadd.u64        `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
-       vld1.64         {$A3xB-$A4xB}, [$tinptr, :256]!
-       vshr.u64        $temp,`&Dlo("$A0xB")`,#16
-       vadd.u64        `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
-       vld1.64         {$A5xB-$A6xB}, [$tinptr, :256]!
-       vshr.u64        $temp,`&Dhi("$A0xB")`,#16
-       vld1.64         {$A7xB},       [$tinptr, :128]!
-       vzip.16         `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
-
-.LNEON_tail2:
-       vadd.u64        `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
-       vst1.32         {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
-       vshr.u64        $temp,`&Dlo("$A1xB")`,#16
-       vadd.u64        `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
-       vshr.u64        $temp,`&Dhi("$A1xB")`,#16
-       vzip.16         `&Dlo("$A1xB")`,`&Dhi("$A1xB")`
-
-       vadd.u64        `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
-       vst1.32         {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
-       vshr.u64        $temp,`&Dlo("$A2xB")`,#16
-       vadd.u64        `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
-       vshr.u64        $temp,`&Dhi("$A2xB")`,#16
-       vzip.16         `&Dlo("$A2xB")`,`&Dhi("$A2xB")`
-
-       vadd.u64        `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
-       vst1.32         {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
-       vshr.u64        $temp,`&Dlo("$A3xB")`,#16
-       vadd.u64        `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
-       vshr.u64        $temp,`&Dhi("$A3xB")`,#16
-       vzip.16         `&Dlo("$A3xB")`,`&Dhi("$A3xB")`
-
-       vadd.u64        `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
-       vst1.32         {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
-       vshr.u64        $temp,`&Dlo("$A4xB")`,#16
-       vadd.u64        `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
-       vshr.u64        $temp,`&Dhi("$A4xB")`,#16
-       vzip.16         `&Dlo("$A4xB")`,`&Dhi("$A4xB")`
-
-       vadd.u64        `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
-       vst1.32         {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
-       vshr.u64        $temp,`&Dlo("$A5xB")`,#16
-       vadd.u64        `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
-       vshr.u64        $temp,`&Dhi("$A5xB")`,#16
-       vzip.16         `&Dlo("$A5xB")`,`&Dhi("$A5xB")`
-
-       vadd.u64        `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
-       vst1.32         {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
-       vshr.u64        $temp,`&Dlo("$A6xB")`,#16
-       vadd.u64        `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
-       vld1.64         {$A0xB}, [$tinptr, :128]!
-       vshr.u64        $temp,`&Dhi("$A6xB")`,#16
-       vzip.16         `&Dlo("$A6xB")`,`&Dhi("$A6xB")`
-
-       vadd.u64        `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
-       vst1.32         {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
-       vshr.u64        $temp,`&Dlo("$A7xB")`,#16
-       vadd.u64        `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
-       vld1.64         {$A1xB-$A2xB},  [$tinptr, :256]!
-       vshr.u64        $temp,`&Dhi("$A7xB")`,#16
-       vzip.16         `&Dlo("$A7xB")`,`&Dhi("$A7xB")`
+       vadd.u64        @ACC[0]#lo,@ACC[0]#lo,$temp
+       vshr.u64        $temp,@ACC[0]#lo,#16
+       vld1.64         {@ACC[2]-@ACC[3]}, [$tinptr, :256]!
+       vadd.u64        @ACC[0]#hi,@ACC[0]#hi,$temp
+       vld1.64         {@ACC[4]-@ACC[5]}, [$tinptr, :256]!
+       vshr.u64        $temp,@ACC[0]#hi,#16
+       vld1.64         {@ACC[6]-@ACC[7]}, [$tinptr, :256]!
+       vzip.16         @ACC[0]#lo,@ACC[0]#hi
+
+.LNEON_tail_entry:
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+       vadd.u64        @ACC[1]#lo,@ACC[1]#lo,$temp
+       vst1.32         {@ACC[0]#lo[0]}, [$toutptr, :32]!
+       vshr.u64        $temp,@ACC[1]#lo,#16
+       vadd.u64        @ACC[1]#hi,@ACC[1]#hi,$temp
+       vshr.u64        $temp,@ACC[1]#hi,#16
+       vzip.16         @ACC[1]#lo,@ACC[1]#hi
+___
+       push(@ACC,shift(@ACC));
+}
+       push(@ACC,shift(@ACC));
+$code.=<<___;
+       vld1.64         {@ACC[0]-@ACC[1]}, [$tinptr, :256]!
        subs            $inner,$inner,#8
-       vst1.32         {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
-
+       vst1.32         {@ACC[7]#lo[0]},   [$toutptr, :32]!
        bne     .LNEON_tail
 
        vst1.32 {${temp}[0]}, [$toutptr, :32]           @ top-most bit
@@ -708,8 +736,14 @@ $code.=<<___;
 #endif
 ___
 
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;   # make it possible to compile with -march=armv4
-$code =~ s/\bret\b/bx  lr/gm;
-print $code;
+foreach (split("\n",$code)) {
+       s/\`([^\`]*)\`/eval $1/ge;
+
+       s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge        or
+       s/\bret\b/bx    lr/g                                            or
+       s/\bbx\s+lr\b/.word\t0xe12fff1e/g;      # make it possible to compile with -march=armv4
+
+       print $_,"\n";
+}
+
 close STDOUT;
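
The old &Dlo()/&Dhi() helpers are replaced by an inline q<N>#lo / q<N>#hi
notation that the new output loop resolves with a single substitution: qN
aliases d(2N) as its low half and d(2N+1) as its high half. A standalone sketch
of that mapping (the sample instructions are illustrative only):

    use strict;
    use warnings;

    my @samples = ("vshl.i64\tq5#lo,q6#hi,#16",
                   "vadd.u64\tq5#lo,q5#lo,q6#lo");
    for (@samples) {
        # same substitution as in the output loop above
        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge;
        print "$_\n";    # q5#lo->d10, q6#hi->d13, q6#lo->d12
    }

In the same spirit, the eight accumulator registers are now kept in the @ACC
array and rotated with push(@ACC,shift(@ACC)), so each unrolled step of the
8n loops is emitted from one template instead of hand-named $A0xB..$A7xB.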