x86_64-mont.pl: further optimization resulting in up to 48% improvement
author    Andy Polyakov <appro@openssl.org>
          Tue, 9 Aug 2011 13:05:05 +0000 (13:05 +0000)
committer Andy Polyakov <appro@openssl.org>
          Tue, 9 Aug 2011 13:05:05 +0000 (13:05 +0000)
(4096-bit RSA sign benchmark on Core2) in comparison to initial version
from 2005.

crypto/bn/asm/x86_64-mont.pl

index 71b0fe0dd21a2c7ee399ddbb0763e0ffe4d980e2..4f330b81f382c7dedc816336925f0743367491f3 100755
 # from platform to platform, but in average it's ~5%/15%/25%/33%
 # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
 
+# August 2011.
+#
+# Unroll and modulo-schedule the inner loops so that they are "fallen
+# through" (executed straight-line) for an input length of 8, which is
+# critical for 1024-bit RSA *sign*. Average performance improvement in
+# comparison to the *initial* 2005 version of this module is ~0%/30%/40%/45%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
 $flavour = shift;
 $output  = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
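For reference, the word-wise Montgomery multiplication that bn_mul_mont's loops implement can be modelled in C as below. This is an illustrative sketch, not the generated code: mont_mul_ref and the u64/u128 typedefs are names invented here, 64-bit limbs and a compiler with unsigned __int128 are assumed, and tp[] is zeroed up front where the assembly's first pass (.L1st) simply skips reading it. n0 is -n[0]^-1 mod 2^64, as usual for this module.

    typedef unsigned long long u64;
    typedef unsigned __int128 u128;

    /* rp[] = ap[]*bp[] / 2^(64*num) mod np[]; tp[num] is the
     * "upmost overflow bit" of the comments below. */
    static void mont_mul_ref(u64 *rp, const u64 *ap, const u64 *bp,
                             const u64 *np, u64 n0, int num)
    {
        u64 tp[num + 1];
        for (int k = 0; k <= num; k++) tp[k] = 0;

        for (int i = 0; i < num; i++) {              /* .Louter */
            u64 m0 = bp[i];                          /* m0=bp[i] */
            u128 t = (u128)ap[0] * m0 + tp[0];
            u64 m1 = (u64)t * n0;                    /* "tp[0]"*n0 */
            u128 u = (u128)np[0] * m1 + (u64)t;      /* low word discarded */
            u64 hi0 = (u64)(t >> 64), hi1 = (u64)(u >> 64);

            for (int j = 1; j < num; j++) {          /* .Linner */
                t = (u128)ap[j] * m0 + tp[j] + hi0;  /* ap[j]*bp[i]+tp[j] */
                u = (u128)np[j] * m1 + (u64)t + hi1; /* np[j]*m1+... */
                hi0 = (u64)(t >> 64);
                hi1 = (u64)(u >> 64);
                tp[j - 1] = (u64)u;                  /* tp[j-1] */
            }
            u128 v = (u128)hi0 + hi1 + tp[num];
            tp[num - 1] = (u64)v;
            tp[num] = (u64)(v >> 64);                /* overflow bit */
        }

        u64 borrow = 0;                              /* .Lsub */
        for (int j = 0; j < num; j++) {
            u128 d = (u128)tp[j] - np[j] - borrow;
            rp[j] = (u64)d;
            borrow = (u64)(d >> 127);
        }
        if (tp[num] < borrow)                        /* branch-free in asm */
            for (int j = 0; j < num; j++) rp[j] = tp[j];
    }

bn_mul4x_mont below keeps exactly this recurrence and only restructures it; the squaring path uses the separate two-phase scheme of bn_sqr4x_mont.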
@@ -56,13 +64,13 @@ $code=<<___;
 .type  bn_mul_mont,\@function,6
 .align 16
 bn_mul_mont:
-       cmp     $ap,$bp
-       jne     .Lmul_enter
-       test    \$1,${num}d
+       test    \$3,${num}d
        jnz     .Lmul_enter
-       cmp     \$4,${num}d
+       cmp     \$8,${num}d
        jb      .Lmul_enter
-       jmp     __bn_sqr_enter
+       cmp     $ap,$bp
+       jne     .Lmul4x_enter
+       jmp     .Lsqr4x_enter
 
 .align 16
 .Lmul_enter:
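In C terms the new entry sequence picks one of three code paths; a rough rendering of the four instructions above (mont_path and its return codes are illustrative only):

    /* 1 = .Lmul_enter (original 1x loop), 2 = .Lmul4x_enter,
     * 3 = .Lsqr4x_enter; ap == bp means the caller is squaring */
    static int mont_path(const void *ap, const void *bp, unsigned num)
    {
        if ((num & 3) != 0 || num < 8)
            return 1;   /* length not divisible by 4, or shorter than 8 */
        if (ap != bp)
            return 2;   /* 4x unrolled multiplication */
        return 3;       /* dedicated squaring code */
    }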
@@ -81,50 +89,65 @@ bn_mul_mont:
        and     \$-1024,%rsp            # minimize TLB usage
 
        mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
-.Lprologue:
+.Lmul_body:
        mov     $bp,%r12                # reassign $bp
 ___
                $bp="%r12";
 $code.=<<___;
        mov     ($n0),$n0               # pull n0[0] value
+       mov     ($bp),$m0               # m0=bp[0]
+       mov     ($ap),%rax
 
        xor     $i,$i                   # i=0
        xor     $j,$j                   # j=0
 
-       mov     ($bp),$m0               # m0=bp[0]
-       mov     ($ap),%rax
+       mov     $n0,$m1
        mulq    $m0                     # ap[0]*bp[0]
        mov     %rax,$lo0
-       mov     %rdx,$hi0
+       mov     ($np),%rax
 
-       imulq   $n0,%rax                # "tp[0]"*n0
-       mov     %rax,$m1
+       imulq   $lo0,$m1                # "tp[0]"*n0
+       mov     %rdx,$hi0
 
-       mulq    ($np)                   # np[0]*m1
-       add     $lo0,%rax               # discarded
+       mulq    $m1                     # np[0]*m1
+       add     %rax,$lo0               # discarded
+       mov     8($ap),%rax
        adc     \$0,%rdx
        mov     %rdx,$hi1
 
        lea     1($j),$j                # j++
+       jmp     .L1st_enter
+
+.align 16
 .L1st:
+       add     %rax,$hi1
        mov     ($ap,$j,8),%rax
-       mulq    $m0                     # ap[j]*bp[0]
-       add     $hi0,%rax
        adc     \$0,%rdx
-       mov     %rax,$lo0
+       add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
+       mov     $lo0,$hi0
+       adc     \$0,%rdx
+       mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
+       mov     %rdx,$hi1
+
+.L1st_enter:
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$hi0
        mov     ($np,$j,8),%rax
-       mov     %rdx,$hi0
+       adc     \$0,%rdx
+       lea     1($j),$j                # j++
+       mov     %rdx,$lo0
 
        mulq    $m1                     # np[j]*m1
-       add     $hi1,%rax
-       lea     1($j),$j                # j++
+       cmp     $num,$j
+       jne     .L1st
+
+       add     %rax,$hi1
        adc     \$0,%rdx
-       add     $lo0,%rax               # np[j]*m1+ap[j]*bp[0]
+       add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
        adc     \$0,%rdx
-       mov     %rax,-16(%rsp,$j,8)     # tp[j-1]
-       cmp     $num,$j
+       mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
        mov     %rdx,$hi1
-       jl      .L1st
+       mov     $lo0,$hi0
 
        xor     %rdx,%rdx
        add     $hi0,$hi1
@@ -133,51 +156,64 @@ $code.=<<___;
        mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit
 
        lea     1($i),$i                # i++
-.align 4
+       jmp     .Louter
+.align 16
 .Louter:
-       xor     $j,$j                   # j=0
-
        mov     ($bp,$i,8),$m0          # m0=bp[i]
+       xor     $j,$j                   # j=0
        mov     ($ap),%rax              # ap[0]
+       mov     $n0,$m1
+       mov     (%rsp),$lo0
        mulq    $m0                     # ap[0]*bp[i]
-       add     (%rsp),%rax             # ap[0]*bp[i]+tp[0]
+       add     %rax,$lo0               # ap[0]*bp[i]+tp[0]
+       mov     ($np),%rax
        adc     \$0,%rdx
-       mov     %rax,$lo0
-       mov     %rdx,$hi0
 
-       imulq   $n0,%rax                # tp[0]*n0
-       mov     %rax,$m1
+       imulq   $lo0,$m1                # tp[0]*n0
+       mov     %rdx,$hi0
 
-       mulq    ($np,$j,8)              # np[0]*m1
-       add     $lo0,%rax               # discarded
-       mov     8(%rsp),$lo0            # tp[1]
+       mulq    $m1                     # np[0]*m1
+       add     %rax,$lo0               # discarded
+       mov     8($ap),%rax
        adc     \$0,%rdx
+       mov     8(%rsp),$lo0            # tp[1]
        mov     %rdx,$hi1
 
        lea     1($j),$j                # j++
-       jmp     .Linner
+       jmp     .Linner_enter
+
 .align 16
 .Linner:
+       add     %rax,$hi1
        mov     ($ap,$j,8),%rax
-       mulq    $m0                     # ap[j]*bp[i]
-       add     $hi0,%rax
        adc     \$0,%rdx
-       add     %rax,$lo0               # ap[j]*bp[i]+tp[j]
+       add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
+       mov     (%rsp,$j,8),$lo0
+       adc     \$0,%rdx
+       mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
+       mov     %rdx,$hi1
+
+.Linner_enter:
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$hi0
        mov     ($np,$j,8),%rax
        adc     \$0,%rdx
+       add     $hi0,$lo0               # ap[j]*bp[i]+tp[j]
        mov     %rdx,$hi0
+       adc     \$0,$hi0
+       lea     1($j),$j                # j++
 
        mulq    $m1                     # np[j]*m1
-       add     $hi1,%rax
-       lea     1($j),$j                # j++
-       adc     \$0,%rdx
-       add     $lo0,%rax               # np[j]*m1+ap[j]*bp[i]+tp[j]
+       cmp     $num,$j
+       jne     .Linner
+
+       add     %rax,$hi1
        adc     \$0,%rdx
+       add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
        mov     (%rsp,$j,8),$lo0
-       cmp     $num,$j
-       mov     %rax,-16(%rsp,$j,8)     # tp[j-1]
+       adc     \$0,%rdx
+       mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
        mov     %rdx,$hi1
-       jl      .Linner
 
        xor     %rdx,%rdx
        add     $hi0,$hi1
@@ -191,35 +227,449 @@ $code.=<<___;
        cmp     $num,$i
        jl      .Louter
 
-       lea     (%rsp),$ap              # borrow ap for tp
-       lea     -1($num),$j             # j=num-1
-
-       mov     ($ap),%rax              # tp[0]
        xor     $i,$i                   # i=0 and clear CF!
+       mov     (%rsp),%rax             # tp[0]
+       lea     (%rsp),$ap              # borrow ap for tp
+       mov     $num,$j                 # j=num
        jmp     .Lsub
 .align 16
 .Lsub: sbb     ($np,$i,8),%rax
        mov     %rax,($rp,$i,8)         # rp[i]=tp[i]-np[i]
-       dec     $j                      # doesn't affect CF!
        mov     8($ap,$i,8),%rax        # tp[i+1]
        lea     1($i),$i                # i++
-       jge     .Lsub
+       dec     $j                      # doesn't affect CF!
+       jnz     .Lsub
 
        sbb     \$0,%rax                # handle upmost overflow bit
+       xor     $i,$i
        and     %rax,$ap
        not     %rax
        mov     $rp,$np
        and     %rax,$np
-       lea     -1($num),$j
+       mov     $num,$j                 # j=num
        or      $np,$ap                 # ap=borrow?tp:rp
 .align 16
 .Lcopy:                                        # copy or in-place refresh
+       mov     ($ap,$i,8),%rax
+       mov     $i,(%rsp,$i,8)          # zap temporary vector
+       mov     %rax,($rp,$i,8)         # rp[i]=tp[i]
+       lea     1($i),$i
+       sub     \$1,$j
+       jnz     .Lcopy
+
+       mov     8(%rsp,$num,8),%rsi     # restore %rsp
+       mov     \$1,%rax
+       mov     (%rsi),%r15
+       mov     8(%rsi),%r14
+       mov     16(%rsi),%r13
+       mov     24(%rsi),%r12
+       mov     32(%rsi),%rbp
+       mov     40(%rsi),%rbx
+       lea     48(%rsi),%rsp
+.Lmul_epilogue:
+       ret
+.size  bn_mul_mont,.-bn_mul_mont
+___
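.Lsub/.Lcopy above perform the final reduction without a data-dependent branch: the borrow out of tp - np becomes an all-ones-or-zero mask that selects between the two sources, and the copy loop zaps the temporary vector as it reads it. A C model of the idea (mont_fixup_ref is an invented name; the Montgomery bound T < 2N is what guarantees mask ends up 0 or ~0; the assembly selects a pointer and stores the loop counter into tp[] rather than blending values and storing zeros):

    typedef unsigned long long u64;
    typedef unsigned __int128 u128;

    static void mont_fixup_ref(u64 *rp, u64 *tp /* num+1 words */,
                               const u64 *np, int num)
    {
        u64 borrow = 0;
        for (int i = 0; i < num; i++) {          /* .Lsub */
            u128 d = (u128)tp[i] - np[i] - borrow;
            rp[i] = (u64)d;
            borrow = (u64)(d >> 127);            /* 1 on underflow */
        }
        u64 mask = tp[num] - borrow;             /* sbb $0,%rax: 0 or ~0 */
        for (int i = 0; i < num; i++) {          /* .Lcopy */
            rp[i] = (tp[i] & mask) | (rp[i] & ~mask);
            tp[i] = 0;                           /* zap temporary vector */
        }
    }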
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type  bn_mul4x_mont,\@function,6
+.align 16
+bn_mul4x_mont:
+.Lmul4x_enter:
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+
+       mov     ${num}d,${num}d
+       lea     4($num),%r10
+       mov     %rsp,%r11
+       neg     %r10
+       lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+4))
+       and     \$-1024,%rsp            # minimize TLB usage
+
+       mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
+.Lmul4x_body:
+       mov     $rp,16(%rsp,$num,8)     # tp[num+2]=$rp
+       mov     %rdx,%r12               # reassign $bp
+___
+               $bp="%r12";
+$code.=<<___;
+       mov     ($n0),$n0               # pull n0[0] value
+       mov     ($bp),$m0               # m0=bp[0]
+       mov     ($ap),%rax
+
+       xor     $i,$i                   # i=0
+       xor     $j,$j                   # j=0
+
+       mov     $n0,$m1
+       mulq    $m0                     # ap[0]*bp[0]
+       mov     %rax,$A[0]
+       mov     ($np),%rax
+
+       imulq   $A[0],$m1               # "tp[0]"*n0
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[0]*m1
+       add     %rax,$A[0]              # discarded
+       mov     8($ap),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$N[1]
+
+       mulq    $m0
+       add     %rax,$A[1]
+       mov     8($np),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1
+       add     %rax,$N[1]
+       mov     16($ap),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]
+       lea     4($j),$j                # j+=4
+       adc     \$0,%rdx
+       mov     $N[1],(%rsp)
+       mov     %rdx,$N[0]
+       jmp     .L1st4x
+.align 16
+.L1st4x:
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[0]
+       mov     -16($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     -8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[1]
+       mov     -8($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
        mov     ($ap,$j,8),%rax
-       mov     %rax,($rp,$j,8)         # rp[i]=tp[i]
-       mov     $i,(%rsp,$j,8)          # zap temporary vector
+       adc     \$0,%rdx
+       add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[0]
+       mov     ($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[1]
+       mov     8($np,$j,8),%rax
+       adc     \$0,%rdx
+       lea     4($j),$j                # j+=4
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     -16($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+       cmp     $num,$j
+       jl      .L1st4x
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[0]
+       mov     -16($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     -8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[1]
+       mov     -8($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     ($ap),%rax              # ap[0]
+       adc     \$0,%rdx
+       add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+
+       xor     $N[1],$N[1]
+       add     $A[0],$N[0]
+       adc     \$0,$N[1]
+       mov     $N[0],-8(%rsp,$j,8)
+       mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
+
+       lea     1($i),$i                # i++
+.align 4
+.Louter4x:
+       mov     ($bp,$i,8),$m0          # m0=bp[i]
+       xor     $j,$j                   # j=0
+       mov     (%rsp),$A[0]
+       mov     $n0,$m1
+       mulq    $m0                     # ap[0]*bp[i]
+       add     %rax,$A[0]              # ap[0]*bp[i]+tp[0]
+       mov     ($np),%rax
+       adc     \$0,%rdx
+
+       imulq   $A[0],$m1               # tp[0]*n0
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[0]*m1
+       add     %rax,$A[0]              # "$N[0]", discarded
+       mov     8($ap),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[1]
+       mov     8($np),%rax
+       adc     \$0,%rdx
+       add     8(%rsp),$A[1]           # +tp[1]
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     16($ap),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[i]+tp[j]
+       lea     4($j),$j                # j+=4
+       adc     \$0,%rdx
+       mov     $N[1],(%rsp)            # tp[j-1]
+       mov     %rdx,$N[0]
+       jmp     .Linner4x
+.align 16
+.Linner4x:
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[0]
+       mov     -16($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     -8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]
+       adc     \$0,%rdx
+       mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[1]
+       mov     -8($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     -8(%rsp,$j,8),$A[1]
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     ($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]
+       adc     \$0,%rdx
+       mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[0]
+       mov     ($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     (%rsp,$j,8),$A[0]       # ap[j]*bp[i]+tp[j]
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]
+       adc     \$0,%rdx
+       mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[1]
+       mov     8($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     8(%rsp,$j,8),$A[1]
+       adc     \$0,%rdx
+       lea     4($j),$j                # j+=4
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     -16($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]
+       adc     \$0,%rdx
+       mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+       cmp     $num,$j
+       jl      .Linner4x
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[0]
+       mov     -16($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     -8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]
+       adc     \$0,%rdx
+       mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[1]
+       mov     -8($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     -8(%rsp,$j,8),$A[1]
+       adc     \$0,%rdx
+       lea     1($i),$i                # i++
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     ($ap),%rax              # ap[0]
+       adc     \$0,%rdx
+       add     $A[1],$N[1]
+       adc     \$0,%rdx
+       mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+
+       xor     $N[1],$N[1]
+       add     $A[0],$N[0]
+       adc     \$0,$N[1]
+       add     (%rsp,$num,8),$N[0]     # pull upmost overflow bit
+       adc     \$0,$N[1]
+       mov     $N[0],-8(%rsp,$j,8)
+       mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
+
+       cmp     $num,$i
+       jl      .Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+$code.=<<___;
+       mov     16(%rsp,$num,8),$rp     # restore $rp
+       mov     0(%rsp),@ri[0]          # tp[0]
+       pxor    %xmm0,%xmm0
+       mov     8(%rsp),@ri[1]          # tp[1]
+       shr     \$2,$num                # num/=4
+       lea     (%rsp),$ap              # borrow ap for tp
+       xor     $i,$i                   # i=0 and clear CF!
+
+       sub     0($np),@ri[0]
+       mov     16($ap),@ri[2]          # tp[2]
+       mov     24($ap),@ri[3]          # tp[3]
+       sbb     8($np),@ri[1]
+       lea     -1($num),$j             # j=num/4-1
+       jmp     .Lsub4x
+.align 16
+.Lsub4x:
+       mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
+       mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
+       sbb     16($np,$i,8),@ri[2]
+       mov     32($ap,$i,8),@ri[0]     # tp[i+1]
+       mov     40($ap,$i,8),@ri[1]
+       sbb     24($np,$i,8),@ri[3]
+       mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
+       mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
+       sbb     32($np,$i,8),@ri[0]
+       mov     48($ap,$i,8),@ri[2]
+       mov     56($ap,$i,8),@ri[3]
+       sbb     40($np,$i,8),@ri[1]
+       lea     4($i),$i                # i+=4
+       dec     $j                      # doesn't affect CF!
+       jnz     .Lsub4x
+
+       mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
+       mov     32($ap,$i,8),@ri[0]     # load overflow bit
+       sbb     16($np,$i,8),@ri[2]
+       mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
+       sbb     24($np,$i,8),@ri[3]
+       mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
+
+       sbb     \$0,@ri[0]              # handle upmost overflow bit
+       mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
+       xor     $i,$i                   # i=0
+       and     @ri[0],$ap
+       not     @ri[0]
+       mov     $rp,$np
+       and     @ri[0],$np
+       lea     -1($num),$j
+       or      $np,$ap                 # ap=borrow?tp:rp
+
+       movdqu  ($ap),%xmm1
+       movdqa  %xmm0,(%rsp)
+       movdqu  %xmm1,($rp)
+       jmp     .Lcopy4x
+.align 16
+.Lcopy4x:                                      # copy or in-place refresh
+       movdqu  16($ap,$i),%xmm2
+       movdqu  32($ap,$i),%xmm1
+       movdqa  %xmm0,16(%rsp,$i)
+       movdqu  %xmm2,16($rp,$i)
+       movdqa  %xmm0,32(%rsp,$i)
+       movdqu  %xmm1,32($rp,$i)
+       lea     32($i),$i
        dec     $j
-       jge     .Lcopy
+       jnz     .Lcopy4x
 
+       shl     \$2,$num
+       movdqu  16($ap,$i),%xmm2
+       movdqa  %xmm0,16(%rsp,$i)
+       movdqu  %xmm2,16($rp,$i)
+___
+}
+$code.=<<___;
        mov     8(%rsp,$num,8),%rsi     # restore %rsp
        mov     \$1,%rax
        mov     (%rsi),%r15
@@ -229,19 +679,21 @@ $code.=<<___;
        mov     32(%rsi),%rbp
        mov     40(%rsi),%rbx
        lea     48(%rsi),%rsp
-.Lepilogue:
+.Lmul4x_epilogue:
        ret
-.size  bn_mul_mont,.-bn_mul_mont
+.size  bn_mul4x_mont,.-bn_mul4x_mont
 ___
+}}}
 \f{{{
 ######################################################################
-# void bn_sqr_mont(
+# void bn_sqr4x_mont(
 my $rptr="%rdi";       # const BN_ULONG *rptr,
 my $aptr="%rsi";       # const BN_ULONG *aptr,
 my $bptr="%rdx";       # not used
 my $nptr="%rcx";       # const BN_ULONG *nptr,
 my $n0  ="%r8";                # const BN_ULONG *n0);
-my $num ="%r9";                # int num, has to be even and not less than 4
+my $num ="%r9";                # int num, has to be divisible by 4 and
+                       # not less than 8
 
 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
 my @A0=("%r10","%r11");
@@ -249,10 +701,10 @@ my @A1=("%r12","%r13");
 my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
 
 $code.=<<___;
-.type  bn_sqr_mont,\@function,5
+.type  bn_sqr4x_mont,\@function,6
 .align 16
-bn_sqr_mont:
-__bn_sqr_enter:
+bn_sqr4x_mont:
+.Lsqr4x_enter:
        push    %rbx
        push    %rbp
        push    %r12
@@ -282,7 +734,7 @@ __bn_sqr_enter:
        mov     $nptr,40(%rsp)
        mov     $n0,  48(%rsp)
        mov     %r11, 56(%rsp)          # save original %rsp
-.Lsqr_body:
+.Lsqr4x_body:
        ##############################################################
        # Squaring part:
        #
@@ -290,51 +742,167 @@ __bn_sqr_enter:
        # b) shift result of a) by 1 to the left and accumulate
        #    a[i]*a[i] products;
        #
-       lea     16(%r10),$i             # $i=-($num-16)
+       lea     32(%r10),$i             # $i=-($num-32)
        lea     ($aptr,$num),$aptr      # end of a[] buffer, ($aptr,$i)=&ap[2]
 
        mov     $num,$j                 # $j=$num
-       pxor    %xmm0,%xmm0
-       lea     64(%rsp),$tptr
-.Lbzero:                               # clear t[$num]
-       movdqa  %xmm0,($tptr)
-       lea     16($tptr),$tptr
-       sub     \$16,$j
-       jnz     .Lbzero
-       jmp     .Lsqr_outer
+
+                                       # comments apply to $num==8 case
+       mov     -32($aptr,$i),$a0       # a[0]
+       lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
+       mov     -24($aptr,$i),%rax      # a[1]
+       lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
+       mov     -16($aptr,$i),$ai       # a[2]
+       mov     %rax,$a1
+
+       mul     $a0                     # a[1]*a[0]
+       mov     %rax,$A0[0]             # a[1]*a[0]
+        mov    $ai,%rax                # a[2]
+       mov     %rdx,$A0[1]
+       mov     $A0[0],-24($tptr,$i)    # t[1]
+
+       xor     $A0[0],$A0[0]
+       mul     $a0                     # a[2]*a[0]
+       add     %rax,$A0[1]
+        mov    $ai,%rax
+       adc     %rdx,$A0[0]
+       mov     $A0[1],-16($tptr,$i)    # t[2]
+
+       lea     -16($i),$j              # j=-16
+
+
+        mov    8($aptr,$j),$ai         # a[3]
+       mul     $a1                     # a[2]*a[1]
+       mov     %rax,$A1[0]             # a[2]*a[1]+t[3]
+        mov    $ai,%rax
+       mov     %rdx,$A1[1]
+
+       xor     $A0[1],$A0[1]
+       add     $A1[0],$A0[0]
+        lea    16($j),$j
+       adc     \$0,$A0[1]
+       mul     $a0                     # a[3]*a[0]
+       add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
+        mov    $ai,%rax
+       adc     %rdx,$A0[1]
+       mov     $A0[0],-8($tptr,$j)     # t[3]
+       jmp     .Lsqr4x_1st
 
 .align 16
-.Lsqr_outer:                           # comments apply to $num==4 case
-       mov     -16($aptr,$i),$a0       # a[0]
+.Lsqr4x_1st:
+        mov    ($aptr,$j),$ai          # a[4]
+       xor     $A1[0],$A1[0]
+       mul     $a1                     # a[3]*a[1]
+       add     %rax,$A1[1]             # a[3]*a[1]+t[4]
+        mov    $ai,%rax
+       adc     %rdx,$A1[0]
+
+       xor     $A0[0],$A0[0]
+       add     $A1[1],$A0[1]
+       adc     \$0,$A0[0]
+       mul     $a0                     # a[4]*a[0]
+       add     %rax,$A0[1]             # a[4]*a[0]+a[3]*a[1]+t[4]
+        mov    $ai,%rax                # a[4]
+       adc     %rdx,$A0[0]
+       mov     $A0[1],($tptr,$j)       # t[4]
+
+
+        mov    8($aptr,$j),$ai         # a[5]
+       xor     $A1[1],$A1[1]
+       mul     $a1                     # a[4]*a[1]
+       add     %rax,$A1[0]             # a[4]*a[1]+t[5]
+        mov    $ai,%rax
+       adc     %rdx,$A1[1]
+
+       xor     $A0[1],$A0[1]
+       add     $A1[0],$A0[0]
+        lea    16($j),$j
+       adc     \$0,$A0[1]
+       mul     $a0                     # a[5]*a[0]
+       add     %rax,$A0[0]             # a[5]*a[0]+a[4]*a[1]+t[5]
+        mov    $ai,%rax
+       adc     %rdx,$A0[1]
+       mov     $A0[0],-8($tptr,$j)     # t[5]
+
+        mov    ($aptr,$j),$ai          # a[6]
+       xor     $A1[0],$A1[0]
+       mul     $a1                     # a[5]*a[1]
+       add     %rax,$A1[1]             # a[5]*a[1]+t[6]
+        mov    $ai,%rax
+       adc     %rdx,$A1[0]
+
+       xor     $A0[0],$A0[0]
+       add     $A1[1],$A0[1]
+       adc     \$0,$A0[0]
+       mul     $a0                     # a[6]*a[0]
+       add     %rax,$A0[1]             # a[6]*a[0]+a[5]*a[1]+t[6]
+        mov    $ai,%rax                # a[6]
+       adc     %rdx,$A0[0]
+       mov     $A0[1],($tptr,$j)       # t[6]
+
+
+        mov    8($aptr,$j),$ai         # a[7]
+       xor     $A1[1],$A1[1]
+       mul     $a1                     # a[6]*a[1]
+       add     %rax,$A1[0]             # a[6]*a[1]+t[7]
+        mov    $ai,%rax
+       adc     %rdx,$A1[1]
+
+       xor     $A0[1],$A0[1]
+       add     $A1[0],$A0[0]
+        lea    16($j),$j
+       adc     \$0,$A0[1]
+       mul     $a0                     # a[7]*a[0]
+       add     %rax,$A0[0]             # a[7]*a[0]+a[6]*a[1]+t[7]
+        mov    $ai,%rax
+       adc     %rdx,$A0[1]
+       mov     $A0[0],-8($tptr,$j)     # t[7]
+
+       cmp     \$0,$j
+       jne     .Lsqr4x_1st
+
+       xor     $A1[0],$A1[0]
+       add     $A0[1],$A1[1]
+       adc     \$0,$A1[0]
+       mul     $a1                     # a[7]*a[1]
+       add     %rax,$A1[1]
+       adc     %rdx,$A1[0]
+
+       mov     $A1[1],($tptr)          # t[8]
+       lea     16($i),$i
+       mov     $A1[0],8($tptr)         # t[9]
+       jmp     .Lsqr4x_outer
+
+.align 16
+.Lsqr4x_outer:                         # comments apply to $num==6 case
+       mov     -32($aptr,$i),$a0       # a[0]
        lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
-       mov     -8($aptr,$i),%rax       # a[1]
-       lea     -16($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
-       mov     ($aptr,$i),$ai          # a[2]
+       mov     -24($aptr,$i),%rax      # a[1]
+       lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
+       mov     -16($aptr,$i),$ai       # a[2]
        mov     %rax,$a1
 
-       mov     -8($tptr,$i),$A0[0]     # t[1]
+       mov     -24($tptr,$i),$A0[0]    # t[1]
        xor     $A0[1],$A0[1]
        mul     $a0                     # a[1]*a[0]
        add     %rax,$A0[0]             # a[1]*a[0]+t[1]
         mov    $ai,%rax                # a[2]
        adc     %rdx,$A0[1]
-       mov     $A0[0],-8($tptr,$i)     # t[1]
+       mov     $A0[0],-24($tptr,$i)    # t[1]
 
        xor     $A0[0],$A0[0]
-       add     ($tptr,$i),$A0[1]       # a[2]*a[0]+t[2]
+       add     -16($tptr,$i),$A0[1]    # a[2]*a[0]+t[2]
        adc     \$0,$A0[0]
        mul     $a0                     # a[2]*a[0]
        add     %rax,$A0[1]
         mov    $ai,%rax
        adc     %rdx,$A0[0]
-       mov     $A0[1],($tptr,$i)       # t[2]
+       mov     $A0[1],-16($tptr,$i)    # t[2]
 
-       lea     ($i),$j                 # j=-16
+       lea     -16($i),$j              # j=-16
        xor     $A1[0],$A1[0]
-       jmp     .Lsqr_inner
 
-.align 16
-.Lsqr_inner:
+
         mov    8($aptr,$j),$ai         # a[3]
        xor     $A1[1],$A1[1]
        add     8($tptr,$j),$A1[0]
@@ -353,9 +921,11 @@ __bn_sqr_enter:
        adc     %rdx,$A0[1]
        mov     $A0[0],8($tptr,$j)      # t[3]
 
-       add     \$16,$j
-       jz      .Lsqr_inner_done
+       lea     16($j),$j
+       jmp     .Lsqr4x_inner
 
+.align 16
+.Lsqr4x_inner:
         mov    ($aptr,$j),$ai          # a[4]
        xor     $A1[0],$A1[0]
        add     ($tptr,$j),$A1[1]
@@ -373,10 +943,86 @@ __bn_sqr_enter:
         mov    $ai,%rax                # a[4]
        adc     %rdx,$A0[0]
        mov     $A0[1],($tptr,$j)       # t[4]
-       jmp     .Lsqr_inner
 
-.align 16
-.Lsqr_inner_done:
+        mov    8($aptr,$j),$ai         # a[5]
+       xor     $A1[1],$A1[1]
+       add     8($tptr,$j),$A1[0]
+       adc     \$0,$A1[1]
+       mul     $a1                     # a[4]*a[1]
+       add     %rax,$A1[0]             # a[4]*a[1]+t[5]
+        mov    $ai,%rax
+       adc     %rdx,$A1[1]
+
+       xor     $A0[1],$A0[1]
+       add     $A1[0],$A0[0]
+       lea     16($j),$j               # j++
+       adc     \$0,$A0[1]
+       mul     $a0                     # a[5]*a[0]
+       add     %rax,$A0[0]             # a[5]*a[0]+a[4]*a[1]+t[5]
+        mov    $ai,%rax
+       adc     %rdx,$A0[1]
+       mov     $A0[0],-8($tptr,$j)     # t[5]
+
+       cmp     \$0,$j
+       jne     .Lsqr4x_inner
+
+       xor     $A1[0],$A1[0]
+       add     $A0[1],$A1[1]
+       adc     \$0,$A1[0]
+       mul     $a1                     # a[5]*a[1]
+       add     %rax,$A1[1]
+       adc     %rdx,$A1[0]
+
+       mov     $A1[1],($tptr)          # t[6]
+       mov     $A1[0],8($tptr)         # t[7]
+
+       add     \$16,$i
+       jnz     .Lsqr4x_outer
+
+                                       # comments apply to $num==4 case
+       mov     -32($aptr),$a0          # a[0]
+       lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
+       mov     -24($aptr),%rax         # a[1]
+       lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
+       mov     -16($aptr),$ai          # a[2]
+       mov     %rax,$a1
+
+       mov     -24($tptr),$A0[0]       # t[1]
+       xor     $A0[1],$A0[1]
+       mul     $a0                     # a[1]*a[0]
+       add     %rax,$A0[0]             # a[1]*a[0]+t[1]
+        mov    $ai,%rax                # a[2]
+       adc     %rdx,$A0[1]
+       mov     $A0[0],-24($tptr)       # t[1]
+
+       xor     $A0[0],$A0[0]
+       add     -16($tptr),$A0[1]       # a[2]*a[0]+t[2]
+       adc     \$0,$A0[0]
+       mul     $a0                     # a[2]*a[0]
+       add     %rax,$A0[1]
+        mov    $ai,%rax
+       adc     %rdx,$A0[0]
+       mov     $A0[1],-16($tptr)       # t[2]
+
+       xor     $A1[0],$A1[0]
+        mov    -8($aptr),$ai           # a[3]
+       xor     $A1[1],$A1[1]
+       add     -8($tptr),$A1[0]
+       adc     \$0,$A1[1]
+       mul     $a1                     # a[2]*a[1]
+       add     %rax,$A1[0]             # a[2]*a[1]+t[3]
+        mov    $ai,%rax
+       adc     %rdx,$A1[1]
+
+       xor     $A0[1],$A0[1]
+       add     $A1[0],$A0[0]
+       adc     \$0,$A0[1]
+       mul     $a0                     # a[3]*a[0]
+       add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
+        mov    $ai,%rax
+       adc     %rdx,$A0[1]
+       mov     $A0[0],-8($tptr)        # t[3]
+
        xor     $A1[0],$A1[0]
        add     $A0[1],$A1[1]
        adc     \$0,$A1[0]
@@ -388,17 +1034,15 @@ __bn_sqr_enter:
        mov     $A1[1],($tptr)          # t[4]
        mov     $A1[0],8($tptr)         # t[5]
 
-       add     \$16,$i
-       jnz     .Lsqr_outer
-
        mul     $ai                     # a[2]*a[3]
 ___
 {
 my ($shift,$carry)=($a0,$a1);
+my @S=(@A1,$ai,$n0);
 $code.=<<___;
-        add    \$8,$i
+        add    \$16,$i
         xor    $shift,$shift
-        sub    $num,$i                 # $i=8-$num
+        sub    $num,$i                 # $i=16-$num
         xor    $carry,$carry
 
        add     $A1[0],%rax             # t[5]
@@ -407,44 +1051,147 @@ $code.=<<___;
        mov     %rdx,16($tptr)          # t[6]
        mov     $carry,24($tptr)        # t[7]
 
-        mov    -8($aptr,$i),%rax       # a[0]
+        mov    -16($aptr,$i),%rax      # a[0]
        lea     64(%rsp,$num,2),$tptr
-        mov    -16($tptr,$i,2),$A0[0]  # t[0]
-        mov    -8($tptr,$i,2),$A0[1]   # t[1]
-       jmp     .Lsqr_shift_n_add
+        xor    $A0[0],$A0[0]           # t[0]
+        mov    -24($tptr,$i,2),$A0[1]  # t[1]
 
-.align 16
-.Lsqr_shift_n_add:
-       lea     ($shift,$A0[0],2),$A1[0]# t[2*i]<<1 | shift
+       lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+       shr     \$63,$A0[0]
+       lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
+       shr     \$63,$A0[1]
+       or      $A0[0],$S[1]            # | t[2*i]>>63
+        mov    -16($tptr,$i,2),$A0[0]  # t[2*i+2]      # prefetch
+       mov     $A0[1],$shift           # shift=t[2*i+1]>>63
+       mul     %rax                    # a[i]*a[i]
+       neg     $carry                  # mov $carry,cf
+        mov    -8($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
+       adc     %rax,$S[0]
+        mov    -8($aptr,$i),%rax       # a[i+1]        # prefetch
+       mov     $S[0],-32($tptr,$i,2)
+       adc     %rdx,$S[1]
+
+       lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+        mov    $S[1],-24($tptr,$i,2)
+        sbb    $carry,$carry           # mov cf,$carry
        shr     \$63,$A0[0]
-       lea     (,$A0[1],2),$A1[1]      # t[2*i+1]<<1 |
+       lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
        shr     \$63,$A0[1]
-       or      $A0[0],$A1[1]           # | t[2*i]>>63
+       or      $A0[0],$S[3]            # | t[2*i]>>63
         mov    0($tptr,$i,2),$A0[0]    # t[2*i+2]      # prefetch
        mov     $A0[1],$shift           # shift=t[2*i+1]>>63
        mul     %rax                    # a[i]*a[i]
+       neg     $carry                  # mov $carry,cf
         mov    8($tptr,$i,2),$A0[1]    # t[2*i+2+1]    # prefetch
+       adc     %rax,$S[2]
+        mov    0($aptr,$i),%rax        # a[i+1]        # prefetch
+       mov     $S[2],-16($tptr,$i,2)
+       adc     %rdx,$S[3]
+       lea     16($i),$i
+       mov     $S[3],-40($tptr,$i,2)
+       sbb     $carry,$carry           # mov cf,$carry
+       jmp     .Lsqr4x_shift_n_add
+
+.align 16
+.Lsqr4x_shift_n_add:
+       lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+       shr     \$63,$A0[0]
+       lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
+       shr     \$63,$A0[1]
+       or      $A0[0],$S[1]            # | t[2*i]>>63
+        mov    -16($tptr,$i,2),$A0[0]  # t[2*i+2]      # prefetch
+       mov     $A0[1],$shift           # shift=t[2*i+1]>>63
+       mul     %rax                    # a[i]*a[i]
+       neg     $carry                  # mov $carry,cf
+        mov    -8($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
+       adc     %rax,$S[0]
+        mov    -8($aptr,$i),%rax       # a[i+1]        # prefetch
+       mov     $S[0],-32($tptr,$i,2)
+       adc     %rdx,$S[1]
+
+       lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+        mov    $S[1],-24($tptr,$i,2)
+        sbb    $carry,$carry           # mov cf,$carry
+       shr     \$63,$A0[0]
+       lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
+       shr     \$63,$A0[1]
+       or      $A0[0],$S[3]            # | t[2*i]>>63
+        mov    0($tptr,$i,2),$A0[0]    # t[2*i+2]      # prefetch
+       mov     $A0[1],$shift           # shift=t[2*i+1]>>63
+       mul     %rax                    # a[i]*a[i]
        neg     $carry                  # mov $carry,cf
-       adc     %rax,$A1[0]
+        mov    8($tptr,$i,2),$A0[1]    # t[2*i+2+1]    # prefetch
+       adc     %rax,$S[2]
         mov    0($aptr,$i),%rax        # a[i+1]        # prefetch
-       adc     %rdx,$A1[1]
-       mov     $A1[0],-16($tptr,$i,2)
+       mov     $S[2],-16($tptr,$i,2)
+       adc     %rdx,$S[3]
+
+       lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+        mov    $S[3],-8($tptr,$i,2)
+        sbb    $carry,$carry           # mov cf,$carry
+       shr     \$63,$A0[0]
+       lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
+       shr     \$63,$A0[1]
+       or      $A0[0],$S[1]            # | t[2*i]>>63
+        mov    16($tptr,$i,2),$A0[0]   # t[2*i+2]      # prefetch
+       mov     $A0[1],$shift           # shift=t[2*i+1]>>63
+       mul     %rax                    # a[i]*a[i]
+       neg     $carry                  # mov $carry,cf
+        mov    24($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
+       adc     %rax,$S[0]
+        mov    8($aptr,$i),%rax        # a[i+1]        # prefetch
+       mov     $S[0],0($tptr,$i,2)
+       adc     %rdx,$S[1]
+
+       lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+        mov    $S[1],8($tptr,$i,2)
+        sbb    $carry,$carry           # mov cf,$carry
+       shr     \$63,$A0[0]
+       lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
+       shr     \$63,$A0[1]
+       or      $A0[0],$S[3]            # | t[2*i]>>63
+        mov    32($tptr,$i,2),$A0[0]   # t[2*i+2]      # prefetch
+       mov     $A0[1],$shift           # shift=t[2*i+1]>>63
+       mul     %rax                    # a[i]*a[i]
+       neg     $carry                  # mov $carry,cf
+        mov    40($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
+       adc     %rax,$S[2]
+        mov    16($aptr,$i),%rax       # a[i+1]        # prefetch
+       mov     $S[2],16($tptr,$i,2)
+       adc     %rdx,$S[3]
+       mov     $S[3],24($tptr,$i,2)
        sbb     $carry,$carry           # mov cf,$carry
-       mov     $A1[1],-8($tptr,$i,2)
-       add     \$8,$i
-       jnz     .Lsqr_shift_n_add
+       add     \$32,$i
+       jnz     .Lsqr4x_shift_n_add
 
-       lea     ($shift,$A0[0],2),$A1[0]# t[2*i]<<1|shift
+       lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
        shr     \$63,$A0[0]
-       lea     (,$A0[1],2),$A1[1]      # t[2*i+1]<<1 |
+       lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
        shr     \$63,$A0[1]
-       or      $A0[0],$A1[1]           # | t[2*i]>>63
+       or      $A0[0],$S[1]            # | t[2*i]>>63
+        mov    -16($tptr),$A0[0]       # t[2*i+2]      # prefetch
+       mov     $A0[1],$shift           # shift=t[2*i+1]>>63
        mul     %rax                    # a[i]*a[i]
        neg     $carry                  # mov $carry,cf
-       adc     %rax,$A1[0]
-       adc     %rdx,$A1[1]
-       mov     $A1[0],-16($tptr)
-       mov     $A1[1],-8($tptr)
+        mov    -8($tptr),$A0[1]        # t[2*i+2+1]    # prefetch
+       adc     %rax,$S[0]
+        mov    -8($aptr),%rax          # a[i+1]        # prefetch
+       mov     $S[0],-32($tptr)
+       adc     %rdx,$S[1]
+
+       lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
+        mov    $S[1],-24($tptr)
+        sbb    $carry,$carry           # mov cf,$carry
+       shr     \$63,$A0[0]
+       lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
+       shr     \$63,$A0[1]
+       or      $A0[0],$S[3]            # | t[2*i]>>63
+       mul     %rax                    # a[i]*a[i]
+       neg     $carry                  # mov $carry,cf
+       adc     %rax,$S[2]
+       adc     %rdx,$S[3]
+       mov     $S[2],-16($tptr)
+       mov     $S[3],-8($tptr)
 ___
 }\f
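Step b) doubles the accumulated cross products one word pair at a time while folding in the diagonal a[i]*a[i] terms; the running carry lives in $carry and is moved in and out of CF with neg/sbb because mul clobbers the flags. A C model of one two-word step (sqr_diag_ref is an invented name, types as in the earlier sketches; t[] comes from step a) with t[0] = 0):

    /* step b): t = 2*t + sum of a[i]^2 * 2^(128*i) */
    static void sqr_diag_ref(u64 *t /* 2*num words */, const u64 *a, int num)
    {
        u64 shift = 0, carry = 0;
        for (int i = 0; i < num; i++) {
            u128 sq = (u128)a[i] * a[i];
            u64 lo = (t[2*i]     << 1) | shift;           /* t[2*i]<<1 | shift */
            u64 hi = (t[2*i + 1] << 1) | (t[2*i] >> 63);  /* | t[2*i]>>63 */
            shift  = t[2*i + 1] >> 63;                    /* next shift */
            u128 s = (u128)lo + (u64)sq + carry;
            t[2*i] = (u64)s;
            s = (u128)hi + (u64)(sq >> 64) + (u64)(s >> 64);
            t[2*i + 1] = (u64)s;
            carry = (u64)(s >> 64);                       /* kept in $carry */
        }
    }

.Lsqr4x_shift_n_add performs four such two-word steps per iteration, hoisting the t[] and a[] loads ahead of each mul as prefetches.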
 ##############################################################
@@ -456,6 +1203,7 @@ my ($m0,$m1)=($a0,$a1);
 my @Ni=("%rbx","%r9");
 $code.=<<___;
        mov     40(%rsp),$nptr          # restore $nptr
+       mov     48(%rsp),$n0            # restore *n0
        xor     $j,$j
        mov     $num,0(%rsp)            # save $num
        sub     $num,$j                 # $j=-$num
@@ -471,10 +1219,10 @@ $code.=<<___;
        mov     8($nptr,$j),$Ni[1]      # n[1]          # modsched #
         imulq  $A0[0],$m0              # m0=t[0]*n0    # modsched #
         mov    %rax,$Ni[0]             #               # modsched #
-       jmp     .Lmont_outer
+       jmp     .Lsqr4x_mont_outer
 
 .align 16
-.Lmont_outer:
+.Lsqr4x_mont_outer:
        xor     $A0[1],$A0[1]
        mul     $m0                     # n[0]*m0
        add     %rax,$A0[0]             # n[0]*m0+t[0]
@@ -491,12 +1239,8 @@ $code.=<<___;
        adc     %rdx,$A0[0]
 
        imulq   $A0[1],$m1
-       lea     16($j),$j
-       jmp     .Lmont_inner
 
-.align 16
-.Lmont_inner:
-       mov     ($nptr,$j),$Ni[0]       # n[2]
+       mov     16($nptr,$j),$Ni[0]     # n[2]
        xor     $A1[1],$A1[1]
        add     $A0[1],$A1[0]
        adc     \$0,$A1[1]
@@ -504,17 +1248,17 @@ $code.=<<___;
        add     %rax,$A1[0]             # n[0]*m1+"t[1]"
         mov    $Ni[0],%rax
        adc     %rdx,$A1[1]
-       mov     $A1[0],-8($tptr,$j)     # "t[1]"
+       mov     $A1[0],8($tptr,$j)      # "t[1]"
 
        xor     $A0[1],$A0[1]
-       add     ($tptr,$j),$A0[0]
+       add     16($tptr,$j),$A0[0]
        adc     \$0,$A0[1]
        mul     $m0                     # n[2]*m0
        add     %rax,$A0[0]             # n[2]*m0+t[2]
         mov    $Ni[1],%rax
        adc     %rdx,$A0[1]
 
-       mov     8($nptr,$j),$Ni[1]      # n[3]
+       mov     24($nptr,$j),$Ni[1]     # n[3]
        xor     $A1[0],$A1[0]
        add     $A0[0],$A1[1]
        adc     \$0,$A1[0]
@@ -522,18 +1266,95 @@ $code.=<<___;
        add     %rax,$A1[1]             # n[1]*m1+"t[2]"
         mov    $Ni[1],%rax
        adc     %rdx,$A1[0]
-       mov     $A1[1],($tptr,$j)       # "t[2]"
+       mov     $A1[1],16($tptr,$j)     # "t[2]"
 
        xor     $A0[0],$A0[0]
-       add     8($tptr,$j),$A0[1]
-       lea     16($j),$j
+       add     24($tptr,$j),$A0[1]
+       lea     32($j),$j
        adc     \$0,$A0[0]
        mul     $m0                     # n[3]*m0
        add     %rax,$A0[1]             # n[3]*m0+t[3]
         mov    $Ni[0],%rax
        adc     %rdx,$A0[0]
+       jmp     .Lsqr4x_mont_inner
+
+.align 16
+.Lsqr4x_mont_inner:
+       mov     ($nptr,$j),$Ni[0]       # n[4]
+       xor     $A1[1],$A1[1]
+       add     $A0[1],$A1[0]
+       adc     \$0,$A1[1]
+       mul     $m1                     # n[2]*m1
+       add     %rax,$A1[0]             # n[2]*m1+"t[3]"
+        mov    $Ni[0],%rax
+       adc     %rdx,$A1[1]
+       mov     $A1[0],-8($tptr,$j)     # "t[3]"
+
+       xor     $A0[1],$A0[1]
+       add     ($tptr,$j),$A0[0]
+       adc     \$0,$A0[1]
+       mul     $m0                     # n[4]*m0
+       add     %rax,$A0[0]             # n[4]*m0+t[4]
+        mov    $Ni[1],%rax
+       adc     %rdx,$A0[1]
+
+       mov     8($nptr,$j),$Ni[1]      # n[5]
+       xor     $A1[0],$A1[0]
+       add     $A0[0],$A1[1]
+       adc     \$0,$A1[0]
+       mul     $m1                     # n[3]*m1
+       add     %rax,$A1[1]             # n[3]*m1+"t[4]"
+        mov    $Ni[1],%rax
+       adc     %rdx,$A1[0]
+       mov     $A1[1],($tptr,$j)       # "t[4]"
+
+       xor     $A0[0],$A0[0]
+       add     8($tptr,$j),$A0[1]
+       adc     \$0,$A0[0]
+       mul     $m0                     # n[5]*m0
+       add     %rax,$A0[1]             # n[5]*m0+t[5]
+        mov    $Ni[0],%rax
+       adc     %rdx,$A0[0]
+
+
+       mov     16($nptr,$j),$Ni[0]     # n[6]
+       xor     $A1[1],$A1[1]
+       add     $A0[1],$A1[0]
+       adc     \$0,$A1[1]
+       mul     $m1                     # n[4]*m1
+       add     %rax,$A1[0]             # n[4]*m1+"t[5]"
+        mov    $Ni[0],%rax
+       adc     %rdx,$A1[1]
+       mov     $A1[0],8($tptr,$j)      # "t[5]"
+
+       xor     $A0[1],$A0[1]
+       add     16($tptr,$j),$A0[0]
+       adc     \$0,$A0[1]
+       mul     $m0                     # n[6]*m0
+       add     %rax,$A0[0]             # n[6]*m0+t[6]
+        mov    $Ni[1],%rax
+       adc     %rdx,$A0[1]
+
+       mov     24($nptr,$j),$Ni[1]     # n[7]
+       xor     $A1[0],$A1[0]
+       add     $A0[0],$A1[1]
+       adc     \$0,$A1[0]
+       mul     $m1                     # n[5]*m1
+       add     %rax,$A1[1]             # n[5]*m1+"t[6]"
+        mov    $Ni[1],%rax
+       adc     %rdx,$A1[0]
+       mov     $A1[1],16($tptr,$j)     # "t[6]"
+
+       xor     $A0[0],$A0[0]
+       add     24($tptr,$j),$A0[1]
+       lea     32($j),$j
+       adc     \$0,$A0[0]
+       mul     $m0                     # n[7]*m0
+       add     %rax,$A0[1]             # n[7]*m0+t[7]
+        mov    $Ni[0],%rax
+       adc     %rdx,$A0[0]
        cmp     \$0,$j
-       jne     .Lmont_inner
+       jne     .Lsqr4x_mont_inner
 
         sub    0(%rsp),$j              # $j=-$num      # modsched #
         mov    $n0,$m0                 #               # modsched #
@@ -541,14 +1362,14 @@ $code.=<<___;
        xor     $A1[1],$A1[1]
        add     $A0[1],$A1[0]
        adc     \$0,$A1[1]
-       mul     $m1                     # n[2]*m1
-       add     %rax,$A1[0]             # n[2]*m1+"t[3]"
+       mul     $m1                     # n[6]*m1
+       add     %rax,$A1[0]             # n[6]*m1+"t[7]"
        mov     $Ni[1],%rax
        adc     %rdx,$A1[1]
-       mov     $A1[0],-8($tptr)        # "t[3]"
+       mov     $A1[0],-8($tptr)        # "t[7]"
 
        xor     $A0[1],$A0[1]
-       add     ($tptr),$A0[0]          # +t[4]
+       add     ($tptr),$A0[0]          # +t[8]
        adc     \$0,$A0[1]
         mov    0($nptr,$j),$Ni[0]      # n[0]          # modsched #
        add     $topbit,$A0[0]
@@ -560,74 +1381,108 @@ $code.=<<___;
        add     $A0[0],$A1[1]
         mov    16($tptr,$j),$A0[0]     # t[0]          # modsched #
        adc     \$0,$A1[0]
-       mul     $m1                     # n[3]*m1
-       add     %rax,$A1[1]             # n[3]*m1+"t[4]"
+       mul     $m1                     # n[7]*m1
+       add     %rax,$A1[1]             # n[7]*m1+"t[8]"
         mov    $Ni[0],%rax             #               # modsched #
        adc     %rdx,$A1[0]
-       mov     $A1[1],($tptr)          # "t[4]"
+       mov     $A1[1],($tptr)          # "t[8]"
 
        xor     $topbit,$topbit
-       add     8($tptr),$A1[0]         # +t[5]
+       add     8($tptr),$A1[0]         # +t[9]
        adc     $topbit,$topbit
        add     $A0[1],$A1[0]
        lea     16($tptr),$tptr         # "t[$num]>>128"
        adc     \$0,$topbit
-       mov     $A1[0],-8($tptr)        # "t[5]"
+       mov     $A1[0],-8($tptr)        # "t[9]"
        cmp     8(%rsp),$tptr           # are we done?
-       jb      .Lmont_outer
+       jb      .Lsqr4x_mont_outer
 
        mov     0(%rsp),$num            # restore $num
        mov     $topbit,($tptr)         # save $topbit
 ___
 }\f
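The reduction pass then consumes t[] = a*a with num word-wise Montgomery steps. The assembly fuses two n[]*m chains per .Lsqr4x_mont_outer pass and carries the running top word in $topbit, but the arithmetic is equivalent to this sketch (mont_reduce_ref is an invented name, types as above):

    /* t[] holds 2*num+1 words; the top word absorbs the final carry */
    static void mont_reduce_ref(u64 *t, const u64 *n, u64 n0, int num)
    {
        for (int i = 0; i < num; i++) {
            u64 m = t[i] * n0;                   /* m0=t[0]*n0 */
            u64 c = 0;
            for (int j = 0; j < num; j++) {      /* t += m*n[] */
                u128 s = (u128)n[j] * m + t[i + j] + c;
                t[i + j] = (u64)s;               /* t[i] becomes zero */
                c = (u64)(s >> 64);
            }
            for (int k = i + num; c != 0; k++) { /* bounded: t < 2N*2^(64*num) */
                u128 s = (u128)t[k] + c;
                t[k] = (u64)s;
                c = (u64)(s >> 64);
            }
        }
        /* result in t[num..2*num]; one conditional subtraction remains,
         * done by the 4x unrolled .Lsqr4x_sub/.Lsqr4x_copy below */
    }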
 ##############################################################
-# Post-condition, 2x unrolled copy from bn_mul_mont
+# Post-condition, 4x unrolled copy from bn_mul_mont
 #
 {
 my ($tptr,$nptr)=("%rbx",$aptr);
+my @ri=("%rax","%rdx","%r10","%r11");
 $code.=<<___;
+       mov     64(%rsp,$num),@ri[0]    # tp[0]
        lea     64(%rsp,$num),$tptr     # upper half of t[2*$num] holds result
-       shr     \$4,$num                # num/2
-       mov     32(%rsp),$rptr          # restore $rptr
        mov     40(%rsp),$nptr          # restore $nptr
-       lea     -1($num),$j             # j=num/2-1
-
-       mov     ($tptr),%rax            # tp[0]
+       shr     \$5,$num                # num/4
+       mov     8($tptr),@ri[1]         # t[1]
        xor     $i,$i                   # i=0 and clear CF!
-       jmp     .Lsqr_sub
+
+       mov     32(%rsp),$rptr          # restore $rptr
+       sub     0($nptr),@ri[0]
+       mov     16($tptr),@ri[2]        # t[2]
+       mov     24($tptr),@ri[3]        # t[3]
+       sbb     8($nptr),@ri[1]
+       lea     -1($num),$j             # j=num/4-1
+       jmp     .Lsqr4x_sub
 .align 16
-.Lsqr_sub:
-       mov     8($tptr,$i,8),%rdx
-       sbb     0($nptr,$i,8),%rax
-       sbb     8($nptr,$i,8),%rdx
-       mov     %rax,0($rptr,$i,8)      # rp[i]=tp[i]-np[i]
-       mov     %rdx,8($rptr,$i,8)      # rp[i]=tp[i]-np[i]
-       mov     16($tptr,$i,8),%rax     # tp[i+1]
-       lea     2($i),$i                # i++
+.Lsqr4x_sub:
+       mov     @ri[0],0($rptr,$i,8)    # rp[i]=tp[i]-np[i]
+       mov     @ri[1],8($rptr,$i,8)    # rp[i]=tp[i]-np[i]
+       sbb     16($nptr,$i,8),@ri[2]
+       mov     32($tptr,$i,8),@ri[0]   # tp[i+1]
+       mov     40($tptr,$i,8),@ri[1]
+       sbb     24($nptr,$i,8),@ri[3]
+       mov     @ri[2],16($rptr,$i,8)   # rp[i]=tp[i]-np[i]
+       mov     @ri[3],24($rptr,$i,8)   # rp[i]=tp[i]-np[i]
+       sbb     32($nptr,$i,8),@ri[0]
+       mov     48($tptr,$i,8),@ri[2]
+       mov     56($tptr,$i,8),@ri[3]
+       sbb     40($nptr,$i,8),@ri[1]
+       lea     4($i),$i                # i+=4
        dec     $j                      # doesn't affect CF!
-       jge     .Lsqr_sub
+       jnz     .Lsqr4x_sub
 
-       sbb     \$0,%rax                # handle upmost overflow bit
+       mov     @ri[0],0($rptr,$i,8)    # rp[i]=tp[i]-np[i]
+       mov     32($tptr,$i,8),@ri[0]   # load overflow bit
+       sbb     16($nptr,$i,8),@ri[2]
+       mov     @ri[1],8($rptr,$i,8)    # rp[i]=tp[i]-np[i]
+       sbb     24($nptr,$i,8),@ri[3]
+       mov     @ri[2],16($rptr,$i,8)   # rp[i]=tp[i]-np[i]
+
+       sbb     \$0,@ri[0]              # handle upmost overflow bit
+       mov     @ri[3],24($rptr,$i,8)   # rp[i]=tp[i]-np[i]
        xor     $i,$i                   # i=0
-       and     %rax,$tptr
-       not     %rax
+       and     @ri[0],$tptr
+       not     @ri[0]
        mov     $rptr,$nptr
-       and     %rax,$nptr
+       and     @ri[0],$nptr
        lea     -1($num),$j
        or      $nptr,$tptr             # tp=borrow?tp:rp
 
+       pxor    %xmm0,%xmm0
        lea     64(%rsp,$num,8),$nptr
+       movdqu  ($tptr),%xmm1
        lea     ($nptr,$num,8),$nptr
-       jmp     .Lsqr_copy
+       movdqa  %xmm0,64(%rsp)          # zap lower half of temporary vector
+       movdqa  %xmm0,($nptr)           # zap upper half of temporary vector
+       movdqu  %xmm1,($rptr)
+       jmp     .Lsqr4x_copy
 .align 16
-.Lsqr_copy:                            # copy or in-place refresh
-       movdqu  ($tptr,$i),%xmm1
-       movdqa  %xmm0,64(%rsp,$i)       # zap lower half of temporary vector
-       movdqa  %xmm0,($nptr,$i)        # zap upper half of temporary vector
-       movdqu  %xmm1,($rptr,$i)
-       lea     16($i),$i
+.Lsqr4x_copy:                          # copy or in-place refresh
+       movdqu  16($tptr,$i),%xmm2
+       movdqu  32($tptr,$i),%xmm1
+       movdqa  %xmm0,80(%rsp,$i)       # zap lower half of temporary vector
+       movdqa  %xmm0,96(%rsp,$i)       # zap lower half of temporary vector
+       movdqa  %xmm0,16($nptr,$i)      # zap upper half of temporary vector
+       movdqa  %xmm0,32($nptr,$i)      # zap upper half of temporary vector
+       movdqu  %xmm2,16($rptr,$i)
+       movdqu  %xmm1,32($rptr,$i)
+       lea     32($i),$i
        dec     $j
-       jge     .Lsqr_copy
+       jnz     .Lsqr4x_copy
+
+       movdqu  16($tptr,$i),%xmm2
+       movdqa  %xmm0,80(%rsp,$i)       # zap lower half of temporary vector
+       movdqa  %xmm0,16($nptr,$i)      # zap upper half of temporary vector
+       movdqu  %xmm2,16($rptr,$i)
 ___
 }
 $code.=<<___;
@@ -640,9 +1495,9 @@ $code.=<<___;
        mov     32(%rsi),%rbp
        mov     40(%rsi),%rbx
        lea     48(%rsi),%rsp
-.Lsqr_epilogue:
+.Lsqr4x_epilogue:
        ret
-.size  bn_sqr_mont,.-bn_sqr_mont
+.size  bn_sqr4x_mont,.-bn_sqr4x_mont
 ___
 }}}
 $code.=<<___;
@@ -677,14 +1532,19 @@ mul_handler:
        mov     120($context),%rax      # pull context->Rax
        mov     248($context),%rbx      # pull context->Rip
 
-       lea     .Lprologue(%rip),%r10
-       cmp     %r10,%rbx               # context->Rip<.Lprologue
+       mov     8($disp),%rsi           # disp->ImageBase
+       mov     56($disp),%r11          # disp->HandlerData
+
+       mov     0(%r11),%r10d           # HandlerData[0]
+       lea     (%rsi,%r10),%r10        # end of prologue label
+       cmp     %r10,%rbx               # context->Rip<end of prologue label
        jb      .Lcommon_seh_tail
 
        mov     152($context),%rax      # pull context->Rsp
 
-       lea     .Lepilogue(%rip),%r10
-       cmp     %r10,%rbx               # context->Rip>=.Lepilogue
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # epilogue label
+       cmp     %r10,%rbx               # context->Rip>=epilogue label
        jae     .Lcommon_seh_tail
 
        mov     192($context),%r10      # pull $num
@@ -724,13 +1584,13 @@ sqr_handler:
        mov     120($context),%rax      # pull context->Rax
        mov     248($context),%rbx      # pull context->Rip
 
-       lea     .Lsqr_body(%rip),%r10
+       lea     .Lsqr4x_body(%rip),%r10
        cmp     %r10,%rbx               # context->Rip<.Lsqr_body
        jb      .Lcommon_seh_tail
 
        mov     152($context),%rax      # pull context->Rsp
 
-       lea     .Lsqr_epilogue(%rip),%r10
+       lea     .Lsqr4x_epilogue(%rip),%r10
        cmp     %r10,%rbx               # context->Rip>=.Lsqr_epilogue
        jae     .Lcommon_seh_tail
 
@@ -796,16 +1656,25 @@ sqr_handler:
        .rva    .LSEH_end_bn_mul_mont
        .rva    .LSEH_info_bn_mul_mont
 
-       .rva    .LSEH_begin_bn_sqr_mont
-       .rva    .LSEH_end_bn_sqr_mont
-       .rva    .LSEH_info_bn_sqr_mont
+       .rva    .LSEH_begin_bn_mul4x_mont
+       .rva    .LSEH_end_bn_mul4x_mont
+       .rva    .LSEH_info_bn_mul4x_mont
+
+       .rva    .LSEH_begin_bn_sqr4x_mont
+       .rva    .LSEH_end_bn_sqr4x_mont
+       .rva    .LSEH_info_bn_sqr4x_mont
 
 .section       .xdata
 .align 8
 .LSEH_info_bn_mul_mont:
        .byte   9,0,0,0
        .rva    mul_handler
-.LSEH_info_bn_sqr_mont:
+       .rva    .Lmul_body,.Lmul_epilogue       # HandlerData[]
+.LSEH_info_bn_mul4x_mont:
+       .byte   9,0,0,0
+       .rva    mul_handler
+       .rva    .Lmul4x_body,.Lmul4x_epilogue   # HandlerData[]
+.LSEH_info_bn_sqr4x_mont:
        .byte   9,0,0,0
        .rva    sqr_handler
 ___
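A note on the SEH changes at the end: mul_handler no longer hard-codes .Lprologue/.Lepilogue but reads the two .rva entries that each .LSEH_info block now carries in HandlerData[], so one handler serves both bn_mul_mont and bn_mul4x_mont. Schematically (field names per the Win64 CONTEXT/DISPATCHER_CONTEXT structures; this is a sketch of the control flow, not compilable handler code):

    /* HandlerData[0] = image-relative label where the frame is fully
     * set up (.Lmul_body/.Lmul4x_body), HandlerData[1] = epilogue label */
    if (Context->Rip <  ImageBase + HandlerData[0] ||
        Context->Rip >= ImageBase + HandlerData[1])
        /* nothing pushed yet, or already unwound: take .Lcommon_seh_tail */;

sqr_handler keeps fixed labels, merely renamed to the .Lsqr4x_* variants.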