From 904732f68bcc6ebd3f8961a9272bc811dc26bcbd Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Wed, 28 Nov 2012 13:19:10 +0000 Subject: [PATCH] C64x+ assembly pack: improve EABI support. --- Configure | 2 +- TABLE | 2 +- crypto/aes/asm/aes-c64xplus.pl | 10 ++++++ crypto/bn/asm/bn-c64xplus.asm | 51 ++++++++++++++++++++++++++---- crypto/bn/asm/c64xplus-gf2m.pl | 3 ++ crypto/c64xpluscpuid.pl | 8 +++++ crypto/modes/asm/ghash-c64xplus.pl | 9 ++++-- crypto/sha/asm/sha1-c64xplus.pl | 3 ++ crypto/sha/asm/sha256-c64xplus.pl | 6 ++++ crypto/sha/asm/sha512-c64xplus.pl | 5 +++ 10 files changed, 88 insertions(+), 11 deletions(-) diff --git a/Configure b/Configure index 94fa7a8821..95c701af31 100755 --- a/Configure +++ b/Configure @@ -410,7 +410,7 @@ my %table=( "linux-alpha+bwx-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}", # # TI_CGT_C6000_7.3.x is a requirement -"linux-c64xplus","cl6x:--linux --strip_coff_underscore -ea=.s -eo=.o -mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT::-D_REENTRANT:::BN_LLONG:c64xpluscpuid.o:bn-c64xplus.o c64xplus-gf2m.o::aes-c64xplus.o aes_cbc.o aes_ctr.o:::sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o:::::::ghash-c64xplus.o::void:dlfcn:linux-shared:--pic:-z --sysv --shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):true", +"linux-c64xplus","cl6x:--linux -ea=.s -eo=.o -mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT::-D_REENTRANT:::BN_LLONG:c64xpluscpuid.o:bn-c64xplus.o c64xplus-gf2m.o::aes-c64xplus.o aes_cbc.o aes_ctr.o:::sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o:::::::ghash-c64xplus.o::void:dlfcn:linux-shared:--pic:-z --sysv --shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):true", # Android: linux-* but without -DTERMIO and pointers to headers and libs. "android","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", diff --git a/TABLE b/TABLE index 52d88b998f..9d1a80bf94 100644 --- a/TABLE +++ b/TABLE @@ -3995,7 +3995,7 @@ $multilib = *** linux-c64xplus $cc = cl6x -$cflags = --linux --strip_coff_underscore -ea=.s -eo=.o -mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT +$cflags = --linux -ea=.s -eo=.o -mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT $unistd = $thread_cflag = -D_REENTRANT $sys_id = diff --git a/crypto/aes/asm/aes-c64xplus.pl b/crypto/aes/asm/aes-c64xplus.pl index ad0c15a36f..cc14ae3157 100644 --- a/crypto/aes/asm/aes-c64xplus.pl +++ b/crypto/aes/asm/aes-c64xplus.pl @@ -46,6 +46,11 @@ $code=<<___; .text .if __TI_EABI__ .nocmp + .asg AES_encrypt,_AES_encrypt + .asg AES_decrypt,_AES_decrypt + .asg AES_set_encrypt_key,_AES_set_encrypt_key + .asg AES_set_decrypt_key,_AES_set_decrypt_key + .asg AES_ctr32_encrypt,_AES_ctr32_encrypt .endif .asg B3,RA @@ -1021,7 +1026,11 @@ ___ } # Tables are kept in endian-neutral manner $code.=<<___; + .if __TI_EABI__ + .sect ".text:aes_asm.const" + .else .sect ".const:aes_asm" + .endif .align 128 AES_Te: .byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 @@ -1359,3 +1368,4 @@ AES_Td4: ___ print $code; +close STDOUT; diff --git a/crypto/bn/asm/bn-c64xplus.asm b/crypto/bn/asm/bn-c64xplus.asm index 161547c3b0..f07b09e439 100644 --- a/crypto/bn/asm/bn-c64xplus.asm +++ b/crypto/bn/asm/bn-c64xplus.asm @@ -12,6 +12,18 @@ ;; SPLOOPs spin at ... 2*n cycles [plus epilogue]. ;;==================================================================== .text + .if __TI_EABI__ + .asg bn_mul_add_words,_bn_mul_add_words + .asg bn_mul_words,_bn_mul_words + .asg bn_sqr_words,_bn_sqr_words + .asg bn_add_words,_bn_add_words + .asg bn_sub_words,_bn_sub_words + .asg bn_div_words,_bn_div_words + .asg bn_sqr_comba8,_bn_sqr_comba8 + .asg bn_mul_comba8,_bn_mul_comba8 + .asg bn_sqr_comba4,_bn_sqr_comba4 + .asg bn_mul_comba4,_bn_mul_comba4 + .endif .asg B3,RA .asg A4,ARG0 @@ -158,14 +170,39 @@ _bn_sub_words: .endasmfunc .global _bn_div_words - .global __divull _bn_div_words: .asmfunc - CALLP __divull,A3 ; jump to rts64plus.lib -|| MV ARG0,A5 -|| MV ARG1,ARG0 -|| MV ARG2,ARG1 -|| ZERO B5 + LMBD 1,A6,A0 ; leading zero bits in dv + LMBD 1,A4,A1 ; leading zero bits in hi +|| MVK 32,B0 + CMPLTU A1,A0,A2 +|| ADD A0,B0,B0 + [ A2] BNOP RA +||[ A2] MVK -1,A4 ; return overflow +||[!A2] MV A4,A3 ; reassign hi + [!A2] MV B4,A4 ; reassign lo, will be quotient +||[!A2] MVC B0,ILC + [!A2] SHL A6,A0,A6 ; normalize dv +|| MVK 1,A1 + + [!A2] CMPLTU A3,A6,A1 ; hi>31 + + SPLOOP 3 + [!A1] CMPLTU A3,A6,A1 ; hi>31 + SPKERNEL + + BNOP RA,5 .endasmfunc ;;==================================================================== @@ -256,7 +293,7 @@ _bn_mul_comba4: || LDW *A5++,B6 ; ap[0] || MV A0,A3 ; const A3=M .else - ;; This alternative is exercise in fully unrolled Comba + ;; This alternative is an exercise in fully unrolled Comba ;; algorithm implementation that operates at n*(n+1)+12, or ;; as little as 32 cycles... LDW *ARG1[0],B16 ; a[0] diff --git a/crypto/bn/asm/c64xplus-gf2m.pl b/crypto/bn/asm/c64xplus-gf2m.pl index cef83942c9..1b3ecc2c94 100644 --- a/crypto/bn/asm/c64xplus-gf2m.pl +++ b/crypto/bn/asm/c64xplus-gf2m.pl @@ -107,6 +107,9 @@ ___ } $code.=<<___; .text + .if __TI_EABI__ + .asg bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2 + .endif .global _bn_GF2m_mul_2x2 _bn_GF2m_mul_2x2: diff --git a/crypto/c64xpluscpuid.pl b/crypto/c64xpluscpuid.pl index 067b693d5c..0ee0a4e86f 100644 --- a/crypto/c64xpluscpuid.pl +++ b/crypto/c64xpluscpuid.pl @@ -6,6 +6,14 @@ open STDOUT,">$output"; $code.=<<___; .text + .if __TI_EABI__ + .asg OPENSSL_rdtsc,_OPENSSL_rdtsc + .asg OPENSSL_cleanse,_OPENSSL_cleanse + .asg OPENSSL_atomic_add,_OPENSSL_atomic_add + .asg OPENSSL_wipe_cpu,_OPENSSL_wipe_cpu + .asg OPENSSL_instrument_bus,_OPENSSL_instrument_bus + .asg OPENSSL_instrument_bus2,_OPENSSL_instrument_bus2 + .endif .asg B3,RA diff --git a/crypto/modes/asm/ghash-c64xplus.pl b/crypto/modes/asm/ghash-c64xplus.pl index 1ac4d927d0..409b0d61b9 100644 --- a/crypto/modes/asm/ghash-c64xplus.pl +++ b/crypto/modes/asm/ghash-c64xplus.pl @@ -35,6 +35,11 @@ open STDOUT,">$output"; $code.=<<___; .text + .if __TI_EABI__ + .asg gcm_gmult_1bit,_gcm_gmult_1bit + .asg gcm_gmult_4bit,_gcm_gmult_4bit + .asg gcm_ghash_4bit,_gcm_ghash_4bit + .endif .asg B3,RA @@ -144,7 +149,7 @@ ___ # 8/2 S1 L1x S2 | .... #####... ................|............ $code.=<<___; - XORMPY $H0,$xia,$H0x ; 0 ; H·Xi[i] + XORMPY $H0,$xia,$H0x ; 0 ; H·(Xi[i]<<1) || XORMPY $H01u,$xib,$H01y || [A0] LDBU *--${xip},$x0 XORMPY $H1,$xia,$H1x ; 1 @@ -153,7 +158,7 @@ $code.=<<___; XORMPY $H3,$xia,$H3x ; 3 || XORMPY $H3u,$xib,$H3y ||[!A0] MVK.D 15,A0 ; *--${xip} counter - XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H·Xi[i] + XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H·(Xi[i]<<1) || [A0] SUB.S A0,1,A0 XOR.L $H1x,$Z1,$Z1 ; 5 || AND.D $H01y,$FF000000,$H0z diff --git a/crypto/sha/asm/sha1-c64xplus.pl b/crypto/sha/asm/sha1-c64xplus.pl index 87000d1e8f..456f80a86e 100644 --- a/crypto/sha/asm/sha1-c64xplus.pl +++ b/crypto/sha/asm/sha1-c64xplus.pl @@ -38,6 +38,9 @@ open STDOUT,">$output"; $code=<<___; .text + .if __TI_EABI__ + .asg sha1_block_data_order,_sha1_block_data_order + .endif .asg B3,RA .asg A15,FP diff --git a/crypto/sha/asm/sha256-c64xplus.pl b/crypto/sha/asm/sha256-c64xplus.pl index 5a057868b4..798f78309b 100644 --- a/crypto/sha/asm/sha256-c64xplus.pl +++ b/crypto/sha/asm/sha256-c64xplus.pl @@ -40,6 +40,7 @@ $code.=<<___; .text .if __TI_EABI__ .nocmp + .asg sha256_block_data_order,_sha256_block_data_order .endif .asg B3,RA @@ -275,7 +276,11 @@ outerloop?: || STW $H,*${CTXB}[7] .endasmfunc + .if __TI_EABI__ + .sect ".text:sha_asm.const" + .else .sect ".const:sha_asm" + .endif .align 128 K256: .uword 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 @@ -300,3 +305,4 @@ K256: ___ print $code; +close STDOUT; diff --git a/crypto/sha/asm/sha512-c64xplus.pl b/crypto/sha/asm/sha512-c64xplus.pl index e4e7c042fd..77a62523e5 100644 --- a/crypto/sha/asm/sha512-c64xplus.pl +++ b/crypto/sha/asm/sha512-c64xplus.pl @@ -48,6 +48,7 @@ $code.=<<___; .text .if __TI_EABI__ .nocmp + .asg sha512_block_data_order,_sha512_block_data_order .endif .asg B3,RA @@ -370,7 +371,11 @@ break?: NOP 2 ; wait till FP is committed .endasmfunc + .if __TI_EABI__ + .sect ".text:sha_asm.const" + .else .sect ".const:sha_asm" + .endif .align 128 K512: .uword 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd -- 2.25.1