my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o:::::::";
my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o:::::::";
my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o";
-my $armv4_asm=":bn_asm.o armv4-mont.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o:void";
+my $armv4_asm=":bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o:void";
my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:32";
my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:64";
my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o:::::::";
$cflags.=" -DOPENSSL_IA32_SSE2" if (!$no_sse2 && $bn_obj =~ /86/);
$cflags.=" -DOPENSSL_BN_ASM_MONT" if ($bn_obj =~ /-mont/);
+$cflags.=" -DOPENSSL_BN_ASM_GF2m" if ($bn_obj =~ /-gf2m/);
if ($fips)
{
$lflags = -ldl
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR
$cpuid_obj =
-$bn_obj = bn_asm.o armv4-mont.o
+$bn_obj = bn_asm.o armv4-mont.o armv4-gf2m.o
$des_obj =
$aes_obj = aes_cbc.o aes-armv4.o
$bf_obj =
$lflags = -ldl
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR
$cpuid_obj =
-$bn_obj = bn_asm.o armv4-mont.o
+$bn_obj = bn_asm.o armv4-mont.o armv4-gf2m.o
$des_obj =
$aes_obj = aes_cbc.o aes-armv4.o
$bf_obj =
# runs in even less cycles, ~30, improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get NEON glow...
-$a="r1";
-$b="r0";
-
-($a0,$a1,$a2,$a12,$a4,$a14)=
-($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
-
-$mask="r12";
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
bx lr
.size mul_1x1_neon,.-mul_1x1_neon
#endif
+___
+################
+# private interface to mul_1x1_ialu
+#
+$a="r1";
+$b="r0";
-.align 5
+($a0,$a1,$a2,$a12,$a4,$a14)=
+($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
+
+$mask="r12";
+
+$code.=<<___;
.type mul_1x1_ialu,%function
+.align 5
mul_1x1_ialu:
mov $a0,#0
bic $a1,$a,#3<<30 @ a1=a&0x3fffffff
mov pc,lr
.size mul_1x1_ialu,.-mul_1x1_ialu
+___
+################
+# void bn_GF2m_mul_2x2(BN_ULONG *r,
+# BN_ULONG a1,BN_ULONG a0,
+# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
+
+($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
+$code.=<<___;
.global bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,%function
.align 5
.Lpic: ldr r12,[pc,r12]
tst r12,#1
beq .Lialu
-___
-($A1,$B1,$A0,$B0,$A0B0,$A1B1)=map("d$_",(18..23));
-$code.=<<___;
+
veor $A1,$A1
vmov.32 $B1,r3,r3 @ two copies of b1
vmov.32 ${A1}[0],r1 @ a1
SQR_tb[(w) >> 4 & 0xF] << 8 | SQR_tb[(w) & 0xF]
#endif
+#if !defined(OPENSSL_BN_ASM_GF2m)
/* Product of two polynomials a, b each with degree < BN_BITS2 - 1,
* result is a polynomial r with degree < 2 * BN_BITS - 1
* The caller MUST ensure that the variables have the right amount
r[2] ^= m1 ^ r[1] ^ r[3]; /* h0 ^= m1 ^ l1 ^ h1; */
r[1] = r[3] ^ r[2] ^ r[0] ^ m1 ^ m0; /* l1 ^= l0 ^ h0 ^ m0; */
}
-
+#else
+void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
+#endif
/* Add polynomials a and b and store result in r; r could be a or b, a and b
* could be equal; r is the bitwise XOR of a and b.