.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,4,4,0,8
- $ADDP inp=15,in0 // &inp[15]
+ $ADDP inp=15,in2 // &inp[15]
mov rem_4bitp=ip }
-{ .mmi; $ADDP end=in1,in0 // &inp[len]
- $ADDP Xi=15,in2 // &Xi[15]
+{ .mmi; $ADDP end=in3,in2 // &inp[len]
+ $ADDP Xi=15,in0 // &Xi[15]
.save ar.lc,prevlc
mov prevlc=ar.lc };;
-{ .mmi; $ADDP Htbl=8,in3 // &Htbl[0].lo
+{ .mmi; $ADDP Htbl=8,in1 // &Htbl[0].lo
mov mask0xf0=0xf0
.save pr,prevpr
mov prevpr=pr }
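# The reordered prologue binds the rotated input registers to the new
# Xi-first argument order: Xi from in0, Htbl from in1, inp from in2 and
# len from in3. Below is a C model of the pointer setup it performs —
# a sketch under that assumption; names are illustrative, not from the
# source.

#include <stddef.h>

/* Sketch of the pointer arithmetic in the prologue above, assuming the
 * new (Xi, Htable, inp, len) argument order. */
static void ghash_ptr_setup(unsigned char Xi[16], const void *Htable,
                            const unsigned char *inp, size_t len)
{
    const unsigned char *inp15 = inp + 15;   /* $ADDP inp=15,in2  : &inp[15]  */
    const unsigned char *end   = inp + len;  /* $ADDP end=in3,in2 : &inp[len] */
    unsigned char       *xi15  = Xi + 15;    /* $ADDP Xi=15,in0   : &Xi[15]   */
    const unsigned char *htlo  = (const unsigned char *)Htable + 8;
                                             /* $ADDP Htbl=8,in1  : &Htbl[0].lo */
    (void)inp15; (void)end; (void)xi15; (void)htlo;
}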
# PIII 63 /77 16 24
# P4 96 /122 30 84(***)
# Opteron 50 /71 21 30
-# Core2 63 /102 19 28
+# Core2 54 /68 13 18
#
# (*)	gcc 3.4.x was observed to generate a few percent slower code,
#	which is one of the reasons why the 2.95.3 results were chosen,
&lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
- &mov ($inp,&wparam(0)); # load in
- &mov ($Zlh,&wparam(1)); # load len
- &mov ($Zhh,&wparam(2)); # load Xi
- &mov ($Htbl,&wparam(3)); # load Htable
+ &mov ($Zhh,&wparam(0)); # load Xi
+ &mov ($Htbl,&wparam(1)); # load Htable
+ &mov ($inp,&wparam(2)); # load in
+ &mov ($Zlh,&wparam(3)); # load len
&add ($Zlh,$inp);
- &mov (&wparam(1),$Zlh); # len to point at the end of input
+ &mov (&wparam(3),$Zlh); # len to point at the end of input
&stack_push(4+1); # +1 for stack alignment
&mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
&mov ($Zhl,&DWP(4,$Zhh));
&mmx_loop("esp","eax");
&lea ($inp,&DWP(16,$inp));
- &cmp ($inp,&wparam(1));
+ &cmp ($inp,&wparam(3));
&jb (&label("mmx_outer_loop"));
- &mov ($inp,&wparam(2)); # load Xi
+ &mov ($inp,&wparam(0)); # load Xi
&emms ();
&mov (&DWP(12,$inp),$Zll);
&mov (&DWP(4,$inp),$Zhl);
&set_label("x86",16);
}
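# In both the MMX path above and the plain x86 path below, wparam(i)
# addresses the i-th dword stack argument, so the reordered loads follow
# the same Xi-first convention; wparam(3) (len) is then overwritten with
# &inp[len] so the outer loop can compare pointers directly. A minimal C
# model of that end-pointer trick (a hypothetical helper, not the source
# routine):

#include <stddef.h>

/* Sketch: len is turned into an end pointer once, then the loop bound
 * is a plain pointer comparison, mirroring "cmp ($inp,&wparam(3))". */
static void ghash_blocks(const unsigned char *inp, size_t len)
{
    const unsigned char *end = inp + len;   /* mov (&wparam(3),$Zlh)    */
    while (inp < end) {                     /* jb  .L*_outer_loop       */
        /* ... process one 16-byte block here ... */
        inp += 16;                          /* lea ($inp,&DWP(16,$inp)) */
    }
}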
&stack_push(16+4+1); # +1 for 64-bit alignment
- &mov ($inp,&wparam(0)); # load in
- &mov ("ecx",&wparam(1)); # load len
- &mov ($Zll,&wparam(2)); # load Xi
- &mov ($Htbl,&wparam(3)); # load Htable
+ &mov ($Zll,&wparam(0)); # load Xi
+ &mov ($Htbl,&wparam(1)); # load Htable
+ &mov ($inp,&wparam(2)); # load in
+ &mov ("ecx",&wparam(3)); # load len
&add ("ecx",$inp);
- &mov (&wparam(1),"ecx");
+ &mov (&wparam(3),"ecx");
&mov ($Zhh,&DWP(0,$Zll)); # load Xi[16]
&mov ($Zhl,&DWP(4,$Zll));
&call ("_x86_gmult_4bit_inner");
} else {
&x86_loop(0);
- &mov ($inp,&wparam(0));
+ &mov ($inp,&wparam(2));
}
&lea ($inp,&DWP(16,$inp));
- &cmp ($inp,&wparam(1));
- &mov (&wparam(0),$inp) if (!$unroll);
+ &cmp ($inp,&wparam(3));
+ &mov (&wparam(2),$inp) if (!$unroll);
&jb (&label("x86_outer_loop"));
- &mov ($inp,&wparam(2)); # load Xi
+ &mov ($inp,&wparam(0)); # load Xi
&mov (&DWP(12,$inp),$Zll);
&mov (&DWP(8,$inp),$Zlh);
&mov (&DWP(4,$inp),$Zhl);
PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
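/* The eight constants above are the upper half of the rem_4bit
 * reduction table. A self-contained sketch that regenerates all sixteen
 * entries, assuming each is the GF(2)-linear image of its index under
 * the pattern 0x1C20 (the GHASH reduction byte 0xE1 shifted left by 5):
 */

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    unsigned i, b;
    for (i = 0; i < 16; i++) {
        uint16_t v = 0;
        for (b = 0; b < 4; b++)
            if (i & (1u << b))
                v ^= (uint16_t)(0x1C20 << b);   /* XOR in the shifted pattern */
        printf("PACK(0x%04X)%s", v, (i & 3) == 3 ? ",\n" : ", ");
    }
    return 0;
}

/* The last eight values it prints match the PACK(...) entries above. */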
-static void gcm_gmult_4bit(u64 Xi[2], u128 Htable[16])
+static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
u128 Z;
int cnt = 15;
 * mostly as a reference and a placeholder for possible future
 * non-trivial optimization[s]...
 */
-static void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2], u128 Htable[16])
+static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
+ const u8 *inp,size_t len)
{
u128 Z;
int cnt;
}
#endif
#else
-void gcm_gmult_4bit(u64 Xi[2],u128 Htable[16]);
-void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2],u128 Htable[16]);
+void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif
#define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
-#define GHASH(in,len,ctx) gcm_ghash_4bit(in,len,(ctx)->Xi.u,(ctx)->Htable)
+#define GHASH(in,len,ctx) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
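/* A self-contained usage sketch of the reordered convention, with a
 * stub context and a stub body so it compiles stand-alone; the real
 * GCM128_CONTEXT and routine live in gcm128.c, so everything here is
 * illustrative only.
 */

#include <stddef.h>
#include <stdint.h>

typedef uint64_t u64;
typedef unsigned char u8;
typedef struct { u64 hi, lo; } u128;

/* Stub context: the real GCM128_CONTEXT carries more state. */
typedef struct { union { u64 u[2]; u8 c[16]; } Xi; u128 Htable[16]; } CTX;

/* Stub body standing in for the 4-bit table-driven GHASH. */
static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
                           const u8 *inp, size_t len)
{ (void)Xi; (void)Htable; (void)inp; (void)len; }

#define GHASH(in,len,ctx) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)

int main(void)
{
    CTX ctx = {0};
    u8 buf[32] = {0};
    GHASH(buf, sizeof(buf), &ctx);   /* Xi and Htable now lead the list */
    return 0;
}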
/* GHASH_CHUNK is a "stride parameter" whose mission is to mitigate the
 * cache-thrashing effect. In other words, the idea is to hash data while
 * it is still in the L1 cache after the encryption pass...