Throw in bn_mul_mont for x86.
author Andy Polyakov <appro@openssl.org>
Tue, 24 Jul 2007 15:29:21 +0000 (15:29 +0000)
committer Andy Polyakov <appro@openssl.org>
Tue, 24 Jul 2007 15:29:21 +0000 (15:29 +0000)
Configure
TABLE
crypto/bn/asm/x86-mont.pl [new file with mode: 0755]

index de32ac08bce2ce19b0ddfd86c466571bf38565c4..a1da13a2b79458bfd7c5443d0034098d0aa68269 100755 (executable)
--- a/Configure
+++ b/Configure
@@ -116,9 +116,9 @@ my $tlib="-lnsl -lsocket";
 my $bits1="THIRTY_TWO_BIT ";
 my $bits2="SIXTY_FOUR_BIT ";
 
-my $x86_elf_asm="x86cpuid-elf.o:bn86-elf.o co86-elf.o:dx86-elf.o yx86-elf.o:ax86-elf.o:bx86-elf.o:mx86-elf.o:sx86-elf.o s512sse2-elf.o:cx86-elf.o:rx86-elf.o:rm86-elf.o:r586-elf.o";
-my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o:rm86-cof.o:r586-cof.o";
-my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o:rm86-out.o:r586-out.o";
+my $x86_elf_asm="x86cpuid-elf.o:bn86-elf.o co86-elf.o mo86-elf.o:dx86-elf.o yx86-elf.o:ax86-elf.o:bx86-elf.o:mx86-elf.o:sx86-elf.o s512sse2-elf.o:cx86-elf.o:rx86-elf.o:rm86-elf.o:r586-elf.o";
+my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o mo86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o:rm86-cof.o:r586-cof.o";
+my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o mo86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o:rm86-out.o:r586-out.o";
 
 my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o::";
 my $ia64_asm=":bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o:::sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o::";
diff --git a/TABLE b/TABLE
index 11e157b6684e6de985a294f949ccd275e90f97fb..335cefd7545f7d994f85981ea745dda59a0af8ef 100644 (file)
--- a/TABLE
+++ b/TABLE
@@ -198,7 +198,7 @@ $sys_id       =
 $lflags       = 
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-out.o
-$bn_obj       = bn86-out.o co86-out.o
+$bn_obj       = bn86-out.o co86-out.o mo86-out.o
 $des_obj      = dx86-out.o yx86-out.o
 $aes_obj      = ax86-out.o
 $bf_obj       = bx86-out.o
@@ -225,7 +225,7 @@ $sys_id       =
 $lflags       = 
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -279,7 +279,7 @@ $sys_id       = CYGWIN32
 $lflags       = 
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-cof.o
-$bn_obj       = bn86-cof.o co86-cof.o
+$bn_obj       = bn86-cof.o co86-cof.o mo86-cof.o
 $des_obj      = dx86-cof.o yx86-cof.o
 $aes_obj      = ax86-cof.o
 $bf_obj       = bx86-cof.o
@@ -333,7 +333,7 @@ $sys_id       = MSDOS
 $lflags       = -L/dev/env/WATT_ROOT/lib -lwatt
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-out.o
-$bn_obj       = bn86-out.o co86-out.o
+$bn_obj       = bn86-out.o co86-out.o mo86-out.o
 $des_obj      = dx86-out.o yx86-out.o
 $aes_obj      = ax86-out.o
 $bf_obj       = bx86-out.o
@@ -846,7 +846,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -1035,7 +1035,7 @@ $sys_id       =
 $lflags       = 
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -1224,7 +1224,7 @@ $sys_id       =
 $lflags       = 
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -1305,7 +1305,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -1332,7 +1332,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -1413,7 +1413,7 @@ $sys_id       =
 $lflags       = -lefence -ldl
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -1440,7 +1440,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -1467,7 +1467,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -1494,7 +1494,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -1521,7 +1521,7 @@ $sys_id       =
 $lflags       = 
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -1656,7 +1656,7 @@ $sys_id       =
 $lflags       = -rdynamic -ldl
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -1818,7 +1818,7 @@ $sys_id       =
 $lflags       = -lnsl -lsocket
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -2277,7 +2277,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -2574,7 +2574,7 @@ $sys_id       =
 $lflags       = 
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-out.o
-$bn_obj       = bn86-out.o co86-out.o
+$bn_obj       = bn86-out.o co86-out.o mo86-out.o
 $des_obj      = dx86-out.o yx86-out.o
 $aes_obj      = ax86-out.o
 $bf_obj       = bx86-out.o
@@ -2601,7 +2601,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -2682,7 +2682,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -2979,7 +2979,7 @@ $sys_id       = MINGW32
 $lflags       = -lwsock32 -lgdi32
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT EXPORT_VAR_AS_FN
 $cpuid_obj    = x86cpuid-cof.o
-$bn_obj       = bn86-cof.o co86-cof.o
+$bn_obj       = bn86-cof.o co86-cof.o mo86-cof.o
 $des_obj      = dx86-cof.o yx86-cof.o
 $aes_obj      = ax86-cof.o
 $bf_obj       = bx86-cof.o
@@ -3384,7 +3384,7 @@ $sys_id       =
 $lflags       = -lsocket -lnsl
 $bn_ops       = DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -3411,7 +3411,7 @@ $sys_id       =
 $lflags       = -lsocket -lnsl
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -3627,7 +3627,7 @@ $sys_id       =
 $lflags       = -lsocket -lnsl -ldl
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -3951,7 +3951,7 @@ $sys_id       =
 $lflags       = -lsocket -lnsl
 $bn_ops       = BN_LLONG MD2_CHAR RC4_INDEX DES_PTR DES_RISC1 DES_UNROLL
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
@@ -3978,7 +3978,7 @@ $sys_id       =
 $lflags       = -lsocket -lnsl
 $bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
-$bn_obj       = bn86-elf.o co86-elf.o
+$bn_obj       = bn86-elf.o co86-elf.o mo86-elf.o
 $des_obj      = dx86-elf.o yx86-elf.o
 $aes_obj      = ax86-elf.o
 $bf_obj       = bx86-elf.o
diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl
new file mode 100755 (executable)
index 0000000..5cd3cd2
--- /dev/null
@@ -0,0 +1,591 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005
+#
+# This is a "teaser" code, as it can be improved in several ways...
+# First of all non-SSE2 path should be implemented (yes, for now it
+# performs Montgomery multiplication/convolution only on SSE2-capable
+# CPUs such as P4, others fall back to original code). Then inner loop
+# can be unrolled and modulo-scheduled to improve ILP and possibly
+# moved to 128-bit XMM register bank (though it would require input
+# rearrangement and/or increase bus bandwidth utilization). Dedicated
+# squaring procedure should give further performance improvement...
+# Yet, for being draft, the code improves rsa512 *sign* benchmark by
+# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
+
+# December 2006
+#
+# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
+# Integer-only code [being equipped with dedicated squaring procedure]
+# gives ~40% on rsa512 sign benchmark...
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+&function_begin("bn_mul_mont");
+
+$i="edx";
+$j="ecx";
+$ap="esi";     $tp="esi";              # overlapping variables!!!
+$rp="edi";     $bp="edi";              # overlapping variables!!!
+$np="ebp";
+$num="ebx";
+
+$_num=&DWP(4*0,"esp");                 # stack top layout
+$_rp=&DWP(4*1,"esp");
+$_ap=&DWP(4*2,"esp");
+$_bp=&DWP(4*3,"esp");
+$_np=&DWP(4*4,"esp");
+$_n0=&DWP(4*5,"esp");  $_n0q=&QWP(4*5,"esp");
+$_sp=&DWP(4*6,"esp");
+$_bpend=&DWP(4*7,"esp");
+$frame=32;                             # size of above frame rounded up to 16n
+
+       &xor    ("eax","eax");
+       &mov    ("edi",&wparam(5));     # int num
+       &cmp    ("edi",4);
+       &jl     (&label("just_leave"));
+
+       &lea    ("esi",&wparam(0));     # put aside pointer to argument block
+       &lea    ("edx",&wparam(1));     # address of ap's argument slot (lea, not the ap value itself)
+       &mov    ("ebp","esp");          # saved stack pointer!
+       &add    ("edi",2);              # extra two words on top of tp
+       &neg    ("edi");
+       &lea    ("esp",&DWP(-$frame,"esp","edi",4));    # alloca($frame+4*(num+2))
+       &neg    ("edi");
+
+       # minimize cache contention by arranging 2K window between stack
+       # pointer and ap argument [np is also position sensitive vector,
+       # but it's assumed to be near ap, as it's allocated at ~same
+       # time].
+       &mov    ("eax","esp");
+       &sub    ("eax","edx");
+       &and    ("eax",2047);
+       &sub    ("esp","eax");          # this aligns sp and ap modulo 2048
+
+       &xor    ("edx","esp");
+       &and    ("edx",2048);
+       &xor    ("edx",2048);
+       &sub    ("esp","edx");          # this splits them apart modulo 4096
+
+       &and    ("esp",-64);            # align to cache line
+
+       ################################# load argument block...
+       &mov    ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
+       &mov    ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
+       &mov    ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
+       &mov    ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
+       &mov    ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
+       #&mov   ("edi",&DWP(5*4,"esi"));# int num
+
+       &mov    ("esi",&DWP(0,"esi"));  # pull n0[0]
+       &mov    ($_rp,"eax");           # ... save a copy of argument block
+       &mov    ($_ap,"ebx");
+       &mov    ($_bp,"ecx");
+       &mov    ($_np,"edx");
+       &mov    ($_n0,"esi");
+       &lea    ($num,&DWP(-3,"edi"));  # num=num-1 to assist modulo-scheduling
+       #&mov   ($_num,$num);           # redundant as $num is not reused
+       &mov    ($_sp,"ebp");           # saved stack pointer!
+\f
+if($sse2) {
+$acc0="mm0";   # mmx register bank layout
+$acc1="mm1";
+$car0="mm2";
+$car1="mm3";
+$mul0="mm4";
+$mul1="mm5";
+$temp="mm6";
+$mask="mm7";
+
+       &picmeup("eax","OPENSSL_ia32cap_P");
+       &bt     (&DWP(0,"eax"),26);
+       &jnc    (&label("non_sse2"));
+
+       &mov    ("eax",-1);
+       &movd   ($mask,"eax");          # mask 32 lower bits
+
+       &mov    ($ap,$_ap);             # load input pointers
+       &mov    ($bp,$_bp);
+       &mov    ($np,$_np);
+
+       &xor    ($i,$i);                # i=0
+       &xor    ($j,$j);                # j=0
+
+       &movd   ($mul0,&DWP(0,$bp));            # bp[0]
+       &movd   ($mul1,&DWP(0,$ap));            # ap[0]
+       &movd   ($car1,&DWP(0,$np));            # np[0]
+
+       &pmuludq($mul1,$mul0);                  # ap[0]*bp[0]
+       &movq   ($car0,$mul1);
+       &movq   ($acc0,$mul1);                  # I wish movd worked for
+       &pand   ($acc0,$mask);                  # inter-register transfers
+
+       &pmuludq($mul1,$_n0q);                  # *=n0
+
+       &pmuludq($car1,$mul1);                  # "t[0]"*np[0]*n0
+       &paddq  ($car1,$acc0);
+
+       &movd   ($acc1,&DWP(4,$np));            # np[1]
+       &movd   ($acc0,&DWP(4,$ap));            # ap[1]
+
+       &psrlq  ($car0,32);
+       &psrlq  ($car1,32);
+
+       &inc    ($j);                           # j++
+&set_label("1st",16);
+       &pmuludq($acc0,$mul0);                  # ap[j]*bp[0]
+       &pmuludq($acc1,$mul1);                  # np[j]*m1
+       &paddq  ($car0,$acc0);                  # +=c0
+       &paddq  ($car1,$acc1);                  # +=c1
+
+       &movq   ($acc0,$car0);
+       &pand   ($acc0,$mask);
+       &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
+       &paddq  ($car1,$acc0);                  # +=ap[j]*bp[0];
+       &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
+       &psrlq  ($car0,32);
+       &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[j-1]=
+       &psrlq  ($car1,32);
+
+       &lea    ($j,&DWP(1,$j));
+       &cmp    ($j,$num);
+       &jl     (&label("1st"));
+
+       &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[0]
+       &pmuludq($acc1,$mul1);                  # np[num-1]*m1
+       &paddq  ($car0,$acc0);                  # +=c0
+       &paddq  ($car1,$acc1);                  # +=c1
+
+       &movq   ($acc0,$car0);
+       &pand   ($acc0,$mask);
+       &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[0];
+       &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
+
+       &psrlq  ($car0,32);
+       &psrlq  ($car1,32);
+
+       &paddq  ($car1,$car0);
+       &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
+\f
+       &inc    ($i);                           # i++
+&set_label("outer");
+       &xor    ($j,$j);                        # j=0
+
+       &movd   ($mul0,&DWP(0,$bp,$i,4));       # bp[i]
+       &movd   ($mul1,&DWP(0,$ap));            # ap[0]
+       &movd   ($temp,&DWP($frame,"esp"));     # tp[0]
+       &movd   ($car1,&DWP(0,$np));            # np[0]
+       &pmuludq($mul1,$mul0);                  # ap[0]*bp[i]
+
+       &paddq  ($mul1,$temp);                  # +=tp[0]
+       &movq   ($acc0,$mul1);
+       &movq   ($car0,$mul1);
+       &pand   ($acc0,$mask);
+
+       &pmuludq($mul1,$_n0q);                  # *=n0
+
+       &pmuludq($car1,$mul1);
+       &paddq  ($car1,$acc0);
+
+       &movd   ($temp,&DWP($frame+4,"esp"));   # tp[1]
+       &movd   ($acc1,&DWP(4,$np));            # np[1]
+       &movd   ($acc0,&DWP(4,$ap));            # ap[1]
+
+       &psrlq  ($car0,32);
+       &psrlq  ($car1,32);
+       &paddq  ($car0,$temp);                  # +=tp[1]
+
+       &inc    ($j);                           # j++
+       &dec    ($num);
+&set_label("inner");
+       &pmuludq($acc0,$mul0);                  # ap[j]*bp[i]
+       &pmuludq($acc1,$mul1);                  # np[j]*m1
+       &paddq  ($car0,$acc0);                  # +=c0
+       &paddq  ($car1,$acc1);                  # +=c1
+
+       &movq   ($acc0,$car0);
+       &movd   ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
+       &pand   ($acc0,$mask);
+       &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
+       &paddq  ($car1,$acc0);                  # +=ap[j]*bp[i]+tp[j]
+       &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
+       &psrlq  ($car0,32);
+       &movd   (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
+       &psrlq  ($car1,32);
+       &paddq  ($car0,$temp);                  # +=tp[j+1]
+
+       &dec    ($num);
+       &lea    ($j,&DWP(1,$j));                # j++
+       &jnz    (&label("inner"));
+
+       &mov    ($num,$j);
+       &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[i]
+       &pmuludq($acc1,$mul1);                  # np[num-1]*m1
+       &paddq  ($car0,$acc0);                  # +=c0
+       &paddq  ($car1,$acc1);                  # +=c1
+
+       &movq   ($acc0,$car0);
+       &pand   ($acc0,$mask);
+       &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[i]+tp[num-1]
+       &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
+       &psrlq  ($car0,32);
+       &psrlq  ($car1,32);
+
+       &movd   ($temp,&DWP($frame+4,"esp",$num,4));    # += tp[num]
+       &paddq  ($car1,$car0);
+       &paddq  ($car1,$temp);
+       &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
+
+       &lea    ($i,&DWP(1,$i));                # i++
+       &cmp    ($i,$num);
+       &jle    (&label("outer"));
+
+       &emms   ();                             # done with mmx bank
+       &jmp    (&label("common_tail"));
+
+&set_label("non_sse2",16);
+}
+\f
+if (0) {
+       &mov    ("esp",$_sp);
+       &xor    ("eax","eax");  # signal "not fast enough [yet]"
+       &jmp    (&label("just_leave"));
+       # While the below code provides competitive performance for
+       # all key lengths on modern Intel cores, it's still more
+       # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
+       # means compared to the original integer-only assembler.
+       # 512-bit RSA sign is better by ~40%, but that's about all
+       # one can say about all CPUs...
+} else {
+$inp="esi";    # integer path uses these registers differently
+$word="edi";
+$carry="ebp";
+
+       &mov    ($inp,$_ap);
+       &lea    ($carry,&DWP(1,$num));
+       &mov    ($word,$_bp);
+       &xor    ($j,$j);                                # j=0
+       &mov    ("edx",$inp);
+       &and    ($carry,1);                             # see if num is even
+       &sub    ("edx",$word);                          # see if ap==bp
+       &lea    ("eax",&DWP(4,$word,$num,4));           # &bp[num]
+       &or     ($carry,"edx");
+       &mov    ($word,&DWP(0,$word));                  # bp[0]
+       &jz     (&label("bn_sqr_mont"));
+       &mov    ($_bpend,"eax");
+       &mov    ("eax",&DWP(0,$inp));
+       &xor    ("edx","edx");
+
+&set_label("mull",16);
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # ap[j]*bp[0]
+       &add    ($carry,"eax");
+       &lea    ($j,&DWP(1,$j));
+       &adc    ("edx",0);
+       &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
+       &cmp    ($j,$num);
+       &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
+       &jl     (&label("mull"));
+
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # ap[num-1]*bp[0]
+        &mov   ($word,$_n0);
+       &add    ("eax",$carry);
+        &mov   ($inp,$_np);
+       &adc    ("edx",0);
+        &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
+
+       &mov    (&DWP($frame,"esp",$num,4),"eax");      # tp[num-1]=
+       &xor    ($j,$j);
+       &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
+       &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
+
+       &mov    ("eax",&DWP(0,$inp));                   # np[0]
+       &mul    ($word);                                # np[0]*m
+       &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
+       &mov    ("eax",&DWP(4,$inp));                   # np[1]
+       &adc    ("edx",0);
+       &inc    ($j);
+
+       &jmp    (&label("2ndmadd"));
+\f\f
+&set_label("1stmadd",16);
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # ap[j]*bp[i]
+       &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
+       &lea    ($j,&DWP(1,$j));
+       &adc    ("edx",0);
+       &add    ($carry,"eax");
+       &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
+       &adc    ("edx",0);
+       &cmp    ($j,$num);
+       &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
+       &jl     (&label("1stmadd"));
+
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # ap[num-1]*bp[i]
+       &add    ("eax",&DWP($frame,"esp",$num,4));      # +=tp[num-1]
+        &mov   ($word,$_n0);
+       &adc    ("edx",0);
+        &mov   ($inp,$_np);
+       &add    ($carry,"eax");
+       &adc    ("edx",0);
+        &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
+
+       &xor    ($j,$j);
+       &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
+       &mov    (&DWP($frame,"esp",$num,4),$carry);     # tp[num-1]=
+       &adc    ($j,0);
+        &mov   ("eax",&DWP(0,$inp));                   # np[0]
+       &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
+       &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
+
+       &mul    ($word);                                # np[0]*m
+       &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
+       &mov    ("eax",&DWP(4,$inp));                   # np[1]
+       &adc    ("edx",0);
+       &mov    ($j,1);
+\f
+&set_label("2ndmadd",16);
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # np[j]*m
+       &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
+       &lea    ($j,&DWP(1,$j));
+       &adc    ("edx",0);
+       &add    ($carry,"eax");
+       &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+1]
+       &adc    ("edx",0);
+       &cmp    ($j,$num);
+       &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j-1]=
+       &jl     (&label("2ndmadd"));
+
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # np[j]*m
+       &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
+       &adc    ("edx",0);
+       &add    ($carry,"eax");
+       &adc    ("edx",0);
+       &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
+
+       &xor    ("eax","eax");
+        &mov   ($j,$_bp);                              # &bp[i]
+       &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
+       &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
+        &lea   ($j,&DWP(4,$j));
+       &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
+        &cmp   ($j,$_bpend);
+       &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
+       &je     (&label("common_tail"));
+
+       &mov    ($word,&DWP(0,$j));                     # bp[i+1]
+       &mov    ($inp,$_ap);
+       &mov    ($_bp,$j);                              # &bp[++i]
+       &xor    ($j,$j);
+       &xor    ("edx","edx");
+       &mov    ("eax",&DWP(0,$inp));
+       &jmp    (&label("1stmadd"));
+\f
+&set_label("bn_sqr_mont",16);
+$sbit=$num;
+       &mov    ($_num,$num);
+       &mov    ($_bp,$j);                              # i=0
+
+       &mov    ("eax",$word);                          # ap[0]
+       &mul    ($word);                                # ap[0]*ap[0]
+       &mov    (&DWP($frame,"esp"),"eax");             # tp[0]=
+       &mov    ($sbit,"edx");
+       &shr    ("edx",1);
+       &and    ($sbit,1);
+       &inc    ($j);
+&set_label("sqr",16);
+       &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # ap[j]*ap[0]
+       &add    ("eax",$carry);
+       &lea    ($j,&DWP(1,$j));
+       &adc    ("edx",0);
+       &lea    ($carry,&DWP(0,$sbit,"eax",2));
+       &shr    ("eax",31);
+       &cmp    ($j,$_num);
+       &mov    ($sbit,"eax");
+       &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
+       &jl     (&label("sqr"));
+
+       &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[num-1]
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # ap[num-1]*ap[0]
+       &add    ("eax",$carry);
+        &mov   ($word,$_n0);
+       &adc    ("edx",0);
+        &mov   ($inp,$_np);
+       &lea    ($carry,&DWP(0,$sbit,"eax",2));
+        &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
+       &shr    ("eax",31);
+       &mov    (&DWP($frame,"esp",$j,4),$carry);       # tp[num-1]=
+
+       &lea    ($carry,&DWP(0,"eax","edx",2));
+        &mov   ("eax",&DWP(0,$inp));                   # np[0]
+       &shr    ("edx",31);
+       &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num]=
+       &mov    (&DWP($frame+8,"esp",$j,4),"edx");      # tp[num+1]=
+
+       &mul    ($word);                                # np[0]*m
+       &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
+       &mov    ($num,$j);
+       &adc    ("edx",0);
+       &mov    ("eax",&DWP(4,$inp));                   # np[1]
+       &mov    ($j,1);
+\f\f
+&set_label("3rdmadd",16);
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # np[j]*m
+       &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
+       &adc    ("edx",0);
+       &add    ($carry,"eax");
+       &mov    ("eax",&DWP(4,$inp,$j,4));              # np[j+1]
+       &adc    ("edx",0);
+       &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j-1]=
+
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # np[j+1]*m
+       &add    ($carry,&DWP($frame+4,"esp",$j,4));     # +=tp[j+1]
+       &lea    ($j,&DWP(2,$j));
+       &adc    ("edx",0);
+       &add    ($carry,"eax");
+       &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+2]
+       &adc    ("edx",0);
+       &cmp    ($j,$num);
+       &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j]=
+       &jl     (&label("3rdmadd"));
+
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # np[j]*m
+       &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
+       &adc    ("edx",0);
+       &add    ($carry,"eax");
+       &adc    ("edx",0);
+       &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
+
+       &mov    ($j,$_bp);                              # i
+       &xor    ("eax","eax");
+       &mov    ($inp,$_ap);
+       &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
+       &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
+       &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
+       &cmp    ($j,$num);
+       &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
+       &je     (&label("common_tail"));
+\f
+       &mov    ($word,&DWP(4,$inp,$j,4));              # ap[i]
+       &lea    ($j,&DWP(1,$j));
+       &mov    ("eax",$word);
+       &mov    ($_bp,$j);                              # ++i
+       &mul    ($word);                                # ap[i]*ap[i]
+       &add    ("eax",&DWP($frame,"esp",$j,4));        # +=tp[i]
+       &adc    ("edx",0);
+       &mov    (&DWP($frame,"esp",$j,4),"eax");        # tp[i]=
+       &xor    ($carry,$carry);
+       &cmp    ($j,$num);
+       &lea    ($j,&DWP(1,$j));
+       &je     (&label("sqrlast"));
+
+       &mov    ($sbit,"edx");                          # zaps $num
+       &shr    ("edx",1);
+       &and    ($sbit,1);
+&set_label("sqradd",16);
+       &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # ap[j]*ap[i]
+       &add    ("eax",$carry);
+       &lea    ($carry,&DWP(0,"eax","eax"));
+       &adc    ("edx",0);
+       &shr    ("eax",31);
+       &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
+       &lea    ($j,&DWP(1,$j));
+       &adc    ("eax",0);
+       &add    ($carry,$sbit);
+       &adc    ("eax",0);
+       &cmp    ($j,$_num);
+       &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
+       &mov    ($sbit,"eax");
+       &jle    (&label("sqradd"));
+
+       &mov    ($carry,"edx");
+       &lea    ("edx",&DWP(0,$sbit,"edx",2));
+       &shr    ($carry,31);
+&set_label("sqrlast");
+       &mov    ($word,$_n0);
+       &mov    ($inp,$_np);
+       &imul   ($word,&DWP($frame,"esp"));             # n0*tp[0]
+
+       &add    ("edx",&DWP($frame,"esp",$j,4));        # +=tp[num]
+       &mov    ("eax",&DWP(0,$inp));                   # np[0]
+       &adc    ($carry,0);
+       &mov    (&DWP($frame,"esp",$j,4),"edx");        # tp[num]=
+       &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num+1]=
+
+       &mul    ($word);                                # np[0]*m
+       &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
+       &lea    ($num,&DWP(-1,$j));
+       &adc    ("edx",0);
+       &mov    ($j,1);
+       &mov    ("eax",&DWP(4,$inp));                   # np[1]
+
+       &jmp    (&label("3rdmadd"));
+}
+\f
+&set_label("common_tail",16);
+       &mov    ($np,$_np);                     # load modulus pointer
+       &mov    ($rp,$_rp);                     # load result pointer
+       &lea    ($tp,&DWP($frame,"esp"));       # [$ap and $bp are zapped]
+
+       &mov    ("eax",&DWP(0,$tp));            # tp[0]
+       &mov    ($j,$num);                      # j=num-1
+       &xor    ($i,$i);                        # i=0 and clear CF!
+
+&set_label("sub",16);
+       &sbb    ("eax",&DWP(0,$np,$i,4));
+       &mov    (&DWP(0,$rp,$i,4),"eax");       # rp[i]=tp[i]-np[i]
+       &dec    ($j);                           # doesn't affect CF!
+       &mov    ("eax",&DWP(4,$tp,$i,4));       # tp[i+1]
+       &lea    ($i,&DWP(1,$i));                # i++
+       &jge    (&label("sub"));
+
+       &sbb    ("eax",0);                      # handle upmost overflow bit
+       &and    ($tp,"eax");
+       &not    ("eax");
+       &mov    ($np,$rp);
+       &and    ($np,"eax");
+       &or     ($tp,$np);                      # tp=carry?tp:rp
+
+&set_label("copy",16);                         # copy or in-place refresh
+       &mov    ("eax",&DWP(0,$tp,$num,4));
+       &mov    (&DWP(0,$rp,$num,4),"eax");     # rp[i]=tp[i]
+       &mov    (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
+       &dec    ($num);
+       &jge    (&label("copy"));
+
+       &mov    ("esp",$_sp);           # pull saved stack pointer
+       &mov    ("eax",1);
+&set_label("just_leave");
+&function_end("bn_mul_mont");
+
+&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();