From 4f39edbff1213c3c97f5a8367aa6fb650f1d57b3 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@openssl.org>
Date: Wed, 14 Apr 2010 19:04:51 +0000
Subject: [PATCH] gcm128.c and assembler modules: change argument order for
 gcm_ghash_4bit. ghash-x86*.pl: fix performance numbers for Core2, as it
 turned out previous ones were "tainted" by variable clock frequency.

---
 crypto/modes/asm/ghash-alpha.pl   | 12 +++--------
 crypto/modes/asm/ghash-ia64.pl    |  8 ++++----
 crypto/modes/asm/ghash-sparcv9.pl | 10 ++++-----
 crypto/modes/asm/ghash-x86.pl     | 34 +++++++++++++++----------------
 crypto/modes/asm/ghash-x86_64.pl  | 10 ++++-----
 crypto/modes/gcm128.c             | 11 +++++-----
 6 files changed, 38 insertions(+), 47 deletions(-)

diff --git a/crypto/modes/asm/ghash-alpha.pl b/crypto/modes/asm/ghash-alpha.pl
index d75dc78836..be3c7ef5c0 100644
--- a/crypto/modes/asm/ghash-alpha.pl
+++ b/crypto/modes/asm/ghash-alpha.pl
@@ -31,10 +31,10 @@ $Thi1="t5";
 $Tlo1="t6";
 $rem="t7";	# $8
 #################
-$Xi="a0";	# $16
+$Xi="a0";	# $16, input argument block
 $Htbl="a1";
-
-
+$inp="a2";
+$len="a3";
 $nlo="a4";	# $20
 $nhi="a5";
 $Zhi="t8";
@@ -314,12 +314,6 @@ $code.=<<___;
 	.end	gcm_gmult_4bit
 ___
 
-# argument block for gcm_ghash_4bit
-$inp="a0";	# $16
-$len="a1";
-$Xi ="a2";
-$Htbl="a3";
-
 $inhi="s0";
 $inlo="s1";
 
diff --git a/crypto/modes/asm/ghash-ia64.pl b/crypto/modes/asm/ghash-ia64.pl
index 3fd1446cd9..d24a44fea2 100755
--- a/crypto/modes/asm/ghash-ia64.pl
+++ b/crypto/modes/asm/ghash-ia64.pl
@@ -142,13 +142,13 @@ gcm_ghash_4bit:
 	.prologue
 { .mmi;	.save	ar.pfs,prevfs
 	alloc	prevfs=ar.pfs,4,4,0,8
-	$ADDP	inp=15,in0	// &inp[15]
+	$ADDP	inp=15,in2	// &inp[15]
 	mov	rem_4bitp=ip	}
-{ .mmi;	$ADDP	end=in1,in0	// &inp[len]
-	$ADDP	Xi=15,in2	// &Xi[15]
+{ .mmi;	$ADDP	end=in3,in2	// &inp[len]
+	$ADDP	Xi=15,in0	// &Xi[15]
 	.save	ar.lc,prevlc
 	mov	prevlc=ar.lc	};;
-{ .mmi;	$ADDP	Htbl=8,in3	// &Htbl[0].lo
+{ .mmi;	$ADDP	Htbl=8,in1	// &Htbl[0].lo
 	mov	mask0xf0=0xf0
 	.save	pr,prevpr
 	mov	prevpr=pr	}
diff --git a/crypto/modes/asm/ghash-sparcv9.pl b/crypto/modes/asm/ghash-sparcv9.pl
index 47d7a1dca3..708ae0c893 100644
--- a/crypto/modes/asm/ghash-sparcv9.pl
+++ b/crypto/modes/asm/ghash-sparcv9.pl
@@ -54,10 +54,10 @@ $remi="%l5";
 $Htblo="%l6";
 $cnt="%l7";
 
-$inp="%i0";	# input arguments for gcm_ghash_4bit
-$len="%i1";
-$Xi="%i2";
-$Htbl="%i3";
+$Xi="%i0";	# input argument block
+$Htbl="%i1";
+$inp="%i2";
+$len="%i3";
 
 $code.=<<___;
 .section ".text",#alloc,#execinstr
@@ -208,8 +208,6 @@ gcm_ghash_4bit:
 .size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
 ___
 
-$Xi="%i0";	# input arguments for gcm_gmult_4bit
-$Htbl="%i1";
 
 undef $inp;
 undef $len;
diff --git a/crypto/modes/asm/ghash-x86.pl b/crypto/modes/asm/ghash-x86.pl
index 13efbcef6a..1dbeff09c2 100644
--- a/crypto/modes/asm/ghash-x86.pl
+++ b/crypto/modes/asm/ghash-x86.pl
@@ -23,7 +23,7 @@
 # PIII		63 /77		16	24
 # P4		96 /122		30	84(***)
 # Opteron	50 /71		21	30
-# Core2		63 /102		19	28
+# Core2		54 /68		13	18
 #
 # (*)	gcc 3.4.x was observed to generate few percent slower code,
 #	which is one of reasons why 2.95.3 results were chosen,
@@ -317,12 +317,12 @@ if ($unroll) {
 
 	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
 
-	&mov	($inp,&wparam(0));	# load in
-	&mov	($Zlh,&wparam(1));	# load len
-	&mov	($Zhh,&wparam(2));	# load Xi
-	&mov	($Htbl,&wparam(3));	# load Htable
+	&mov	($Zhh,&wparam(0));	# load Xi
+	&mov	($Htbl,&wparam(1));	# load Htable
+	&mov	($inp,&wparam(2));	# load in
+	&mov	($Zlh,&wparam(3));	# load len
 	&add	($Zlh,$inp);
-	&mov	(&wparam(1),$Zlh);	# len to point at the end of input
+	&mov	(&wparam(3),$Zlh);	# len to point at the end of input
 	&stack_push(4+1);		# +1 for stack alignment
 	&mov	($Zll,&DWP(12,$Zhh));	# load Xi[16]
 	&mov	($Zhl,&DWP(4,$Zhh));
@@ -344,10 +344,10 @@ if ($unroll) {
 	&mmx_loop("esp","eax");
 
 	&lea	($inp,&DWP(16,$inp));
-	&cmp	($inp,&wparam(1));
+	&cmp	($inp,&wparam(3));
 	&jb	(&label("mmx_outer_loop"));
 
-	&mov	($inp,&wparam(2));	# load Xi
+	&mov	($inp,&wparam(0));	# load Xi
 	&emms	();
 	&mov	(&DWP(12,$inp),$Zll);
 	&mov	(&DWP(4,$inp),$Zhl);
@@ -359,12 +359,12 @@ if ($unroll) {
 	&set_label("x86",16);
 }
 	&stack_push(16+4+1);		# +1 for 64-bit alignment
-	&mov	($inp,&wparam(0));	# load in
-	&mov	("ecx",&wparam(1));	# load len
-	&mov	($Zll,&wparam(2));	# load Xi
-	&mov	($Htbl,&wparam(3));	# load Htable
+	&mov	($Zll,&wparam(0));	# load Xi
+	&mov	($Htbl,&wparam(1));	# load Htable
+	&mov	($inp,&wparam(2));	# load in
+	&mov	("ecx",&wparam(3));	# load len
 	&add	("ecx",$inp);
-	&mov	(&wparam(1),"ecx");
+	&mov	(&wparam(3),"ecx");
 
 	&mov	($Zhh,&DWP(0,$Zll));	# load Xi[16]
 	&mov	($Zhl,&DWP(4,$Zll));
@@ -390,14 +390,14 @@ if ($unroll) {
 	&call	("_x86_gmult_4bit_inner");
 } else {
 	&x86_loop(0);
-	&mov	($inp,&wparam(0));
+	&mov	($inp,&wparam(2));
 }
 	&lea	($inp,&DWP(16,$inp));
-	&cmp	($inp,&wparam(1));
-	&mov	(&wparam(0),$inp) if (!$unroll);
+	&cmp	($inp,&wparam(3));
+	&mov	(&wparam(2),$inp) if (!$unroll);
 	&jb	(&label("x86_outer_loop"));
 
-	&mov	($inp,&wparam(2));	# load Xi
+	&mov	($inp,&wparam(0));	# load Xi
 	&mov	(&DWP(12,$inp),$Zll);
 	&mov	(&DWP(8,$inp),$Zlh);
 	&mov	(&DWP(4,$inp),$Zhl);
diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl
index 1072979829..25005b9062 100644
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@@ -18,7 +18,7 @@
 # gcc 3.4.x	assembler
 #
 # Opteron	18.5	10.2	+80%
-# Core2		26.0	16.4	+58%
+# Core2		17.5	11.0	+59%
 
 $flavour = shift;
 $output = shift;
@@ -41,10 +41,10 @@ $Zhi="%r9";
 $tmp="%r10";
 $rem_4bit = "%r11";
 
-# per-function register layout
 $Xi="%rdi";
 $Htbl="%rsi";
 
+# per-function register layout
 $cnt="%rcx";
 $rem="%rdx";
 
@@ -159,10 +159,8 @@ ___
 
 # per-function register layout
 
-$inp="%rdi";
-$len="%rsi";
-$Xi="%rdx";
-$Htbl="%rcx";
+$inp="%rdx";
+$len="%rcx";
 $cnt="%rbp";
 $rem="%r12";
 
diff --git a/crypto/modes/gcm128.c b/crypto/modes/gcm128.c
index ce2d178215..7501833007 100644
--- a/crypto/modes/gcm128.c
+++ b/crypto/modes/gcm128.c
@@ -339,7 +339,7 @@ static const size_t rem_4bit[16] = {
 	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
 	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
 
-static void gcm_gmult_4bit(u64 Xi[2], u128 Htable[16])
+static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
 {
 	u128 Z;
 	int cnt = 15;
@@ -410,7 +410,8 @@ static void gcm_gmult_4bit(u64 Xi[2], u128 Htable[16])
  * performance improvement, at least not on x86[_64]. It's here
  * mostly as reference and a placeholder for possible future
  * non-trivial optimization[s]... */
-static void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2], u128 Htable[16])
+static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
+				const u8 *inp,size_t len)
 {
 	u128 Z;
 	int cnt;
@@ -479,13 +480,13 @@ static void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2], u128 Htable[16])
 }
 #endif
 #else
-void gcm_gmult_4bit(u64 Xi[2],u128 Htable[16]);
-void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2],u128 Htable[16]);
+void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
 #endif
 
 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
-#define GHASH(in,len,ctx) gcm_ghash_4bit(in,len,(ctx)->Xi.u,(ctx)->Htable)
+#define GHASH(in,len,ctx) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
  * trashing effect. In other words idea is to hash data while it's
  * still in L1 cache after encryption pass... */
-- 
2.25.1
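
A note on the interface change, for readers outside the patch context:
gcm_ghash_4bit() now takes the hash state Xi and the precomputed table
first, in the same order as gcm_gmult_4bit(Xi,Htable), followed by the
data pointer and length. That shared order is what lets the assembler
modules keep one Xi/Htbl register assignment for both entry points; in
the ghash-x86_64.pl hunks above, only cnt and rem remain "per-function".
Below is a minimal compilable sketch of the new call site, not OpenSSL
code: the u8/u64/u128 typedefs and the SKETCH_CTX struct are simplified
stand-ins for the internal types in crypto/modes, and the function body
is a placeholder XOR fold rather than the real GF(2^128) arithmetic,
which this patch does not change.

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Simplified stand-ins for the internal typedefs in
	 * crypto/modes (not the real OpenSSL definitions). */
	typedef uint8_t  u8;
	typedef uint64_t u64;
	typedef struct { u64 hi, lo; } u128;

	/* New argument order introduced by this commit: hash state and
	 * table first, matching gcm_gmult_4bit(Xi,Htable), then the
	 * data pointer and length. The pre-commit order was
	 * (inp, len, Xi, Htable). */
	static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
				   const u8 *inp, size_t len)
	{
		(void)Htable;	/* real code indexes Htable by nibbles */

		/* Placeholder body: the real function multiplies
		 * (Xi ^ block) by H in GF(2^128) via the 4-bit table;
		 * a plain XOR fold is used here only so the sketch is
		 * self-contained and runnable. */
		while (len >= 16) {
			for (int i = 0; i < 16; ++i)
				((u8 *)Xi)[i] ^= inp[i]; /* not real GHASH */
			inp += 16;
			len -= 16;
		}
	}

	/* The call-site macro from gcm128.c, adapted to the new order. */
	#define GHASH(in,len,ctx) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)

	/* Just enough of the GCM context for the macro to compile;
	 * SKETCH_CTX is a hypothetical name, not GCM128_CONTEXT. */
	typedef struct {
		union { u64 u[2]; u8 c[16]; } Xi;
		u128 Htable[16];
	} SKETCH_CTX;

	int main(void)
	{
		SKETCH_CTX ctx = {0};
		u8 buf[32] = {0};

		GHASH(buf, sizeof(buf), &ctx);	/* two 16-byte blocks */
		printf("Xi = %016llx %016llx\n",
		       (unsigned long long)ctx.Xi.u[0],
		       (unsigned long long)ctx.Xi.u[1]);
		return 0;
	}

With the old order the macro body read
gcm_ghash_4bit(in,len,(ctx)->Xi.u,(ctx)->Htable), exactly as on the
removed line in the last hunk; only the argument order changes, the
computation does not.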