From 6df8c74d5bc9504da54ae54bd3144aef2cb9b1bc Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Thu, 15 Dec 2005 22:40:58 +0000 Subject: [PATCH] Switch 64-bit sparcv9 platforms from bn(64,64) to bn(64,32). This doesn't have an impact on performance, because the number of multiplications does not increase with this switch, not on sparcv9 that is. On the contrary, it actually improves performance, because it spares a load of instructions used to chase carries. Not to mention that BN assembler modules can be shared more freely between 32- and 64-bit builds. --- Configure | 8 +- TABLE | 8 +- crypto/bn/asm/sparcv9a-mont.pl | 157 ++++++++++++++------------------- 3 files changed, 72 insertions(+), 101 deletions(-) diff --git a/Configure b/Configure index f8f2e9f0f7..aa818ac053 100755 --- a/Configure +++ b/Configure @@ -202,7 +202,7 @@ my %table=( "solaris-sparcv8-gcc","gcc:-mv8 -O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::dlfcn:solaris-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", # -m32 should be safe to add as long as driver recognizes -mcpu=ultrasparc "solaris-sparcv9-gcc","gcc:-m32 -mcpu=ultrasparc -O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8plus.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv8plus.o::::::dlfcn:solaris-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -"solaris64-sparcv9-gcc","gcc:-m64 -mcpu=ultrasparc -O3 -Wall -DB_ENDIAN::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:solaris-shared:-fPIC:-m64 -shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", 
+"solaris64-sparcv9-gcc","gcc:-m64 -mcpu=ultrasparc -O3 -Wall -DB_ENDIAN::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:solaris-shared:-fPIC:-m64 -shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", #### "debug-solaris-sparcv8-gcc","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG_ALL -O -g -mv8 -Wall -DB_ENDIAN::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8.o::::::::::dlfcn:solaris-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "debug-solaris-sparcv9-gcc","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG_ALL -DPEDANTIC -O -g -mcpu=ultrasparc -pedantic -ansi -Wall -Wshadow -Wno-long-long -D__EXTENSIONS__ -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8plus.o:des_enc-sparc.o fcrypt_b.o:::md5-sparcv8plus.o::::::dlfcn:solaris-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", @@ -214,7 +214,7 @@ my %table=( "solaris-sparcv7-cc","cc:-xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC1 DES_UNROLL BF_PTR:${no_asm}:dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "solaris-sparcv8-cc","cc:-xarch=v8 -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC1 DES_UNROLL BF_PTR::sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "solaris-sparcv9-cc","cc:-xtarget=ultra -xarch=v8plusa -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR::sparcv8plus.o 
sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv8plus.o::::::dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -"solaris64-sparcv9-cc","cc:-xtarget=ultra -xarch=v9a -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:solaris-shared:-KPIC:-xarch=v9 -G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):/usr/ccs/bin/ar rs", +"solaris64-sparcv9-cc","cc:-xtarget=ultra -xarch=v9a -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:solaris-shared:-KPIC:-xarch=v9 -G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):/usr/ccs/bin/ar rs", #### "debug-solaris-sparcv8-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG_ALL -xarch=v8 -g -O -xstrconst -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC1 DES_UNROLL BF_PTR::sparcv8.o::::::::::dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "debug-solaris-sparcv9-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG_ALL -xtarget=ultra -xarch=v8plus -g -O -xstrconst -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR::sparcv8plus.o::::md5-sparcv8plus.o::::::dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", @@ -335,7 +335,7 @@ my %table=( # -Wa,-Av8plus should do the trick no matter what. 
"linux-sparcv9","gcc:-m32 -mcpu=ultrasparc -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -Wa,-Av8plusa -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8plus.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv8plus.o::::::dlfcn:linux-shared:-fPIC:-m32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", # GCC 3.1 is a requirement -"linux64-sparcv9","gcc:-m64 -mcpu=ultrasparc -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT:ULTRASPARC:-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"linux64-sparcv9","gcc:-m64 -mcpu=ultrasparc -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT:ULTRASPARC:-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", #### Alpha Linux with GNU C and Compaq C setups # Special notes: # - linux-alpha+bwx-gcc is ment to be used from ./config only. If you @@ -365,7 +365,7 @@ my %table=( # -DMD32_REG_T=int doesn't actually belong in sparc64 target, it # simply *happens* to work around a compiler bug in gcc 3.3.3, # triggered by RIPEMD160 code. 
-"BSD-sparc64", "gcc:-DB_ENDIAN -DTERMIOS -O3 -DMD32_REG_T=int -Wall::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC2_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC2 BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"BSD-sparc64", "gcc:-DB_ENDIAN -DTERMIOS -O3 -DMD32_REG_T=int -Wall::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC2 BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "BSD-ia64", "gcc:-DL_ENDIAN -DTERMIOS -O3 -Wall::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_INT:${ia64_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "BSD-x86_64", "gcc:-DL_ENDIAN -DTERMIOS -O3 -DMD32_REG_T=int -Wall::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", diff --git a/TABLE b/TABLE index 1c59c9ce5c..f2acd1472f 100644 --- a/TABLE +++ b/TABLE @@ -142,7 +142,7 @@ $unistd = $thread_cflag = -pthread -D_THREAD_SAFE -D_REENTRANT $sys_id = $lflags = -$bn_ops = SIXTY_FOUR_BIT_LONG RC2_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC2 BF_PTR +$bn_ops = BN_LLONG RC2_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC2 BF_PTR $cpuid_obj = $bn_obj = bn_asm.o sparcv9a-mont.o $des_obj = des_enc-sparc.o fcrypt_b.o @@ -2923,7 +2923,7 @@ $unistd = $thread_cflag = -D_REENTRANT $sys_id = ULTRASPARC $lflags = -ldl -$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $cpuid_obj = $bn_obj = bn_asm.o sparcv9a-mont.o $des_obj = des_enc-sparc.o fcrypt_b.o @@ -3625,7 +3625,7 @@ $unistd = $thread_cflag = -D_REENTRANT $sys_id = ULTRASPARC $lflags = -lsocket -lnsl -ldl -$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL 
BF_PTR +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR $cpuid_obj = $bn_obj = bn_asm.o sparcv9a-mont.o $des_obj = des_enc-sparc.o fcrypt_b.o @@ -3652,7 +3652,7 @@ $unistd = $thread_cflag = -D_REENTRANT $sys_id = ULTRASPARC $lflags = -lsocket -lnsl -ldl -$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR $cpuid_obj = $bn_obj = bn_asm.o sparcv9a-mont.o $des_obj = des_enc-sparc.o fcrypt_b.o diff --git a/crypto/bn/asm/sparcv9a-mont.pl b/crypto/bn/asm/sparcv9a-mont.pl index 87f6ac1e9d..81d7ef608f 100755 --- a/crypto/bn/asm/sparcv9a-mont.pl +++ b/crypto/bn/asm/sparcv9a-mont.pl @@ -138,11 +138,7 @@ $fname: save %sp,-$frame-$locals,%sp sethi %hi(0xffff),$mask or $mask,%lo(0xffff),$mask -___ -$code.=<<___ if ($bits==64); - ldx [%i4],$n0 ! $n0 reassigned, remember? -___ -$code.=<<___ if ($bits==32); + cmp $num,4 bl,a,pn %icc,.Lret clr %i0 @@ -160,8 +156,7 @@ $code.=<<___ if ($bits==32); ld [%i4+4],%o0 sllx %o0,32,%o0 or %o0,$n0,$n0 ! $n0=n0[1].n0[0] -___ -$code.=<<___; + sll $num,3,$num ! num*=8 add %sp,$bias,%o0 ! real top of stack @@ -188,48 +183,44 @@ $code.=<<___; stx %o7,[%sp+$bias+$frame+48] ! save %asi - sub %g0,$num,$i - sub %g0,$num,$j + sub %g0,$num,$i ! i=-num + sub %g0,$num,$j ! j=-num add $ap,$j,%o3 add $bp,$i,%o4 -___ -$code.=<<___ if ($bits==64); + ldx [$bp+$i],%o0 ! bp[0] ldx [$ap+$j],%o1 ! ap[0] -___ -$code.=<<___ if ($bits==32); - ldd [$bp+$i],%o0 ! bp[0] - ldd [$ap+$j],%g2 ! ap[0] - sllx %o1,32,%o1 - sllx %g3,32,%g3 - or %o0,%o1,%o0 - or %g2,%g3,%o1 -___ -$code.=<<___; + sllx %o0,32,%g1 + sllx %o1,32,%g5 + srlx %o0,32,%o0 + srlx %o1,32,%o1 + or %g1,%o0,%o0 + or %g5,%o1,%o1 + add $np,$j,%o5 mulx %o1,%o0,%o0 ! ap[0]*bp[0] mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0 stx %o0,[%sp+$bias+$frame+0] - ld [%o3+`$bits==32 ? 0 : 4`],$alo_ ! load a[j] as pair of 32-bit words + ld [%o3+0],$alo_ ! 
load a[j] as pair of 32-bit words fzeros $alo - ld [%o3+`$bits==32 ? 4 : 0`],$ahi_ + ld [%o3+4],$ahi_ fzeros $ahi - ld [%o5+`$bits==32 ? 0 : 4`],$nlo_ ! load n[j] as pair of 32-bit words + ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words fzeros $nlo - ld [%o5+`$bits==32 ? 4 : 0`],$nhi_ + ld [%o5+4],$nhi_ fzeros $nhi ! transfer b[i] to FPU as 4x16-bit values - ldda [%o4+`$bits==32 ? 2 : 6`]%asi,$ba + ldda [%o4+2]%asi,$ba fxtod $alo,$alo - ldda [%o4+`$bits==32 ? 0 : 4`]%asi,$bb + ldda [%o4+0]%asi,$bb fxtod $ahi,$ahi - ldda [%o4+`$bits==32 ? 6 : 2`]%asi,$bc + ldda [%o4+6]%asi,$bc fxtod $nlo,$nlo - ldda [%o4+`$bits==32 ? 4 : 0`]%asi,$bd + ldda [%o4+4]%asi,$bd fxtod $nhi,$nhi ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values @@ -256,24 +247,24 @@ $code.=<<___; fmuld $alo,$bb,$alob fmuld $nlo,$nb,$nlob fmuld $alo,$bc,$aloc - fmuld $nlo,$nc,$nloc faddd $aloa,$nloa,$nloa + fmuld $nlo,$nc,$nloc fmuld $alo,$bd,$alod - fmuld $nlo,$nd,$nlod faddd $alob,$nlob,$nlob + fmuld $nlo,$nd,$nlod fmuld $ahi,$ba,$ahia - fmuld $nhi,$na,$nhia faddd $aloc,$nloc,$nloc + fmuld $nhi,$na,$nhia fmuld $ahi,$bb,$ahib - fmuld $nhi,$nb,$nhib faddd $alod,$nlod,$nlod + fmuld $nhi,$nb,$nhib fmuld $ahi,$bc,$ahic - fmuld $nhi,$nc,$nhic faddd $ahia,$nhia,$nhia + fmuld $nhi,$nc,$nhic fmuld $ahi,$bd,$ahid + faddd $ahib,$nhib,$nhib fmuld $nhi,$nd,$nhid - faddd $ahib,$nhib,$nhib faddd $ahic,$nhic,$dota ! $nhic faddd $ahid,$nhid,$dotb ! $nhid @@ -317,13 +308,13 @@ $code.=<<___; .L1st: add $ap,$j,%o3 add $np,$j,%o4 - ld [%o3+`$bits==32 ? 0 : 4`],$alo_ ! load a[j] as pair of 32-bit words + ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words fzeros $alo - ld [%o3+`$bits==32 ? 4 : 0`],$ahi_ + ld [%o3+4],$ahi_ fzeros $ahi - ld [%o4+`$bits==32 ? 0 : 4`],$nlo_ ! load n[j] as pair of 32-bit words + ld [%o4+0],$nlo_ ! load n[j] as pair of 32-bit words fzeros $nlo - ld [%o4+`$bits==32 ? 
4 : 0`],$nhi_ + ld [%o4+4],$nhi_ fzeros $nhi fxtod $alo,$alo @@ -340,23 +331,23 @@ $code.=<<___; std $nhi,[$np_h+$j] fmuld $nlo,$nb,$nlob fmuld $alo,$bc,$aloc - fmuld $nlo,$nc,$nloc faddd $aloa,$nloa,$nloa + fmuld $nlo,$nc,$nloc fmuld $alo,$bd,$alod - fmuld $nlo,$nd,$nlod faddd $alob,$nlob,$nlob + fmuld $nlo,$nd,$nlod fmuld $ahi,$ba,$ahia - fmuld $nhi,$na,$nhia faddd $aloc,$nloc,$nloc + fmuld $nhi,$na,$nhia fmuld $ahi,$bb,$ahib - fmuld $nhi,$nb,$nhib faddd $alod,$nlod,$nlod + fmuld $nhi,$nb,$nhib fmuld $ahi,$bc,$ahic - fmuld $nhi,$nc,$nhic faddd $ahia,$nhia,$nhia + fmuld $nhi,$nc,$nhic fmuld $ahi,$bd,$ahid - fmuld $nhi,$nd,$nhid faddd $ahib,$nhib,$nhib + fmuld $nhi,$nd,$nhid faddd $dota,$nloa,$nloa faddd $dotb,$nlob,$nlob @@ -429,36 +420,31 @@ $code.=<<___; add $i,8,$i .align 32 .Louter: - sub %g0,$num,$j + sub %g0,$num,$j ! j=-num add %sp,$bias+$frame+$locals,$tp add $bp,$i,%o4 -___ -$code.=<<___ if ($bits==64); + ldx [$bp+$i],%o0 ! bp[i] ldx [$ap+$j],%o1 ! ap[0] -___ -$code.=<<___ if ($bits==32); - ldd [$bp+$i],%o0 ! bp[i] - ldd [$ap+$j],%g2 ! ap[0] - sllx %o1,32,%o1 - sllx %g3,32,%g3 - or %o0,%o1,%o0 - or %g2,%g3,%o1 -___ -$code.=<<___; + sllx %o0,32,%g1 + sllx %o1,32,%g5 + srlx %o0,32,%o0 + srlx %o1,32,%o1 + or %g1,%o0,%o0 + or %g5,%o1,%o1 + ldx [$tp],%o2 ! tp[0] mulx %o1,%o0,%o0 addcc %o2,%o0,%o0 mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0 stx %o0,[%sp+$bias+$frame+0] - ! transfer b[i] to FPU as 4x16-bit values - ldda [%o4+`$bits==32 ? 2 : 6`]%asi,$ba - ldda [%o4+`$bits==32 ? 0 : 4`]%asi,$bb - ldda [%o4+`$bits==32 ? 6 : 2`]%asi,$bc - ldda [%o4+`$bits==32 ? 4 : 0`]%asi,$bd + ldda [%o4+2]%asi,$ba + ldda [%o4+0]%asi,$bb + ldda [%o4+6]%asi,$bc + ldda [%o4+4]%asi,$bd ! 
transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values ldda [%sp+$bias+$frame+6]%asi,$na @@ -483,24 +469,24 @@ $code.=<<___; fmuld $alo,$bb,$alob fmuld $nlo,$nb,$nlob fmuld $alo,$bc,$aloc - fmuld $nlo,$nc,$nloc faddd $aloa,$nloa,$nloa + fmuld $nlo,$nc,$nloc fmuld $alo,$bd,$alod - fmuld $nlo,$nd,$nlod faddd $alob,$nlob,$nlob + fmuld $nlo,$nd,$nlod fmuld $ahi,$ba,$ahia - fmuld $nhi,$na,$nhia faddd $aloc,$nloc,$nloc + fmuld $nhi,$na,$nhia fmuld $ahi,$bb,$ahib - fmuld $nhi,$nb,$nhib faddd $alod,$nlod,$nlod + fmuld $nhi,$nb,$nhib fmuld $ahi,$bc,$ahic - fmuld $nhi,$nc,$nhic faddd $ahia,$nhia,$nhia + fmuld $nhi,$nc,$nhic fmuld $ahi,$bd,$ahid + faddd $ahib,$nhib,$nhib fmuld $nhi,$nd,$nhid - faddd $ahib,$nhib,$nhib faddd $ahic,$nhic,$dota ! $nhic faddd $ahid,$nhid,$dotb ! $nhid @@ -558,24 +544,24 @@ $code.=<<___; fmuld $alo,$bb,$alob fmuld $nlo,$nb,$nlob fmuld $alo,$bc,$aloc - fmuld $nlo,$nc,$nloc faddd $aloa,$nloa,$nloa + fmuld $nlo,$nc,$nloc fmuld $alo,$bd,$alod - fmuld $nlo,$nd,$nlod faddd $alob,$nlob,$nlob + fmuld $nlo,$nd,$nlod fmuld $ahi,$ba,$ahia - fmuld $nhi,$na,$nhia faddd $aloc,$nloc,$nloc + fmuld $nhi,$na,$nhia fmuld $ahi,$bb,$ahib - fmuld $nhi,$nb,$nhib faddd $alod,$nlod,$nlod + fmuld $nhi,$nb,$nhib fmuld $ahi,$bc,$ahic - fmuld $nhi,$nc,$nhic faddd $ahia,$nhia,$nhia + fmuld $nhi,$nc,$nhic fmuld $ahi,$bd,$ahid + faddd $ahib,$nhib,$nhib fmuld $nhi,$nd,$nhid - faddd $ahib,$nhib,$nhib faddd $dota,$nloa,$nloa faddd $dotb,$nlob,$nlob faddd $ahic,$nhic,$dota ! $nhic @@ -661,7 +647,7 @@ $code.=<<___; add $tp,8,$tp ! adjust tp to point at the end ld [$tp-8],%o0 - ld [$np-`$bits==32 ? 4 : 8`],%o1 + ld [$np-4],%o1 cmp %o0,%o1 ! compare topmost words bcs,pt %icc,.Lcopy ! 
%icc.c is clean if not taken nop @@ -670,41 +656,26 @@ $code.=<<___; .Lsub: ldd [$tp+%o7],%o0 ldd [$np+%o7],%o2 -___ -$code.=<<___ if ($bits==64); - subccc %o1,%o3,%o3 - subccc %o0,%o2,%o2 -___ -$code.=<<___ if ($bits==32); subccc %o1,%o2,%o2 subccc %o0,%o3,%o3 -___ -$code.=<<___; std %o2,[$rp+%o7] add %o7,8,%o7 brnz,pt %o7,.Lsub nop subccc $carry,0,$carry bcc,pt %icc,.Lzap - sub %g0,$num,%o7 + sub %g0,$num,%o7 ! n=-num .align 16,0x1000000 .Lcopy: ldx [$tp+%o7],%o0 -___ -$code.=<<___ if ($bits==64); - stx %o0,[$rp+%o7] -___ -$code.=<<___ if ($bits==32); srlx %o0,32,%o1 std %o0,[$rp+%o7] -___ -$code.=<<___; add %o7,8,%o7 brnz,pt %o7,.Lcopy nop ba .Lzap - sub %g0,$num,%o7 + sub %g0,$num,%o7 ! n=-num .align 32 .Lzap: -- 2.25.1