# ====================================================================
# September 2010.
+#
+# The module implements the "4-bit" GCM GHASH function and the
+# underlying single multiplication operation in GF(2^128). "4-bit"
+# means that it uses a 256-byte per-key table [plus a 128-byte shared
+# table]. Performance was measured to be ~18 cycles per processed
+# byte on z10, which is almost 40% better than gcc-generated code.
+# It should be noted that 18 cycles is a worse result than expected:
+# the loop is scheduled for 12 cycles per byte, so the result should
+# be close to 12. In the absence of instruction-level profiling data
+# it's impossible to tell why...
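+#
+# For reference, below is an abridged C-style sketch of the same
+# "4-bit" algorithm (cf. gcm_gmult_4bit in crypto/modes/gcm128.c,
+# which this module mirrors). It is illustration only, not part of
+# the build; it assumes a big-endian host, as on s390x, and the
+# obvious fixed-width types u8/u16/u64. Htable[i] holds i*H, and
+# rem_4bit[] folds the 4 bits shifted out on the right back into
+# the top of Z, i.e. performs reduction modulo the GHASH polynomial.
+#
+#	typedef struct { u64 hi,lo; } u128;
+#
+#	static const u16 rem_4bit[16] = {
+#	    0x0000,0x1C20,0x3840,0x2460, 0x7080,0x6CA0,0x48C0,0x54E0,
+#	    0xE100,0xFD20,0xD940,0xC560, 0x9180,0x8DA0,0xA9C0,0xB5E0 };
+#
+#	void gmult_4bit(u64 Xi[2], const u128 Htable[16])
+#	{
+#	    u128 Z;
+#	    int cnt = 15;
+#	    size_t rem, nlo, nhi;
+#
+#	    nlo  = ((const u8 *)Xi)[15];
+#	    nhi  = nlo>>4;
+#	    nlo &= 0xf;
+#	    Z.hi = Htable[nlo].hi;		/* Z = (low nibble)*H */
+#	    Z.lo = Htable[nlo].lo;
+#
+#	    while (1) {
+#	        rem  = Z.lo & 0xf;		/* bits about to fall off */
+#	        Z.lo = (Z.hi<<60)|(Z.lo>>4);	/* Z >>= 4 */
+#	        Z.hi = (Z.hi>>4) ^ ((u64)rem_4bit[rem]<<48);
+#	        Z.hi ^= Htable[nhi].hi;		/* Z ^= (next nibble)*H */
+#	        Z.lo ^= Htable[nhi].lo;
+#
+#	        if (--cnt < 0) break;
+#
+#	        nlo  = ((const u8 *)Xi)[cnt];
+#	        nhi  = nlo>>4;
+#	        nlo &= 0xf;
+#
+#	        rem  = Z.lo & 0xf;
+#	        Z.lo = (Z.hi<<60)|(Z.lo>>4);
+#	        Z.hi = (Z.hi>>4) ^ ((u64)rem_4bit[rem]<<48);
+#	        Z.hi ^= Htable[nlo].hi;
+#	        Z.lo ^= Htable[nlo].lo;
+#	    }
+#	    Xi[0] = Z.hi;			/* big-endian store */
+#	    Xi[1] = Z.lo;
+#	}
+#
+# gcm_ghash_4bit is the same multiplication wrapped in a per-block
+# loop, "Xi ^= inp[i]; Xi *= H" (see .Louter below).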
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+open STDOUT,">$output" or die "can't open $output: $!";
+$softonly=1; # disable hardware support for now
+
$Zhi="%r0";
$Zlo="%r1";
.globl gcm_gmult_4bit
.align 32
gcm_gmult_4bit:
+___
+$code.=<<___ if(!$softonly);
+ larl %r1,OPENSSL_s390xcap_P
+ lg %r0,0(%r1)
+ tmhl %r0,0x4000 # check for message-security-assist
+ jz .Lsoft_gmult
+	lghi	%r0,0			# function code 0: query
+	la	%r1,16($sp)		# 16-byte status is stored here
+	.long	0xb93e0004	# kimd %r0,%r4
+ lg %r1,24($sp)
+ tmhh %r1,0x4000 # check for function 65
+ jz .Lsoft_gmult
+ stg %r0,16($sp) # arrange 16 bytes of zero input
+ stg %r0,24($sp)
+ lghi %r0,65 # function 65
+ la %r1,0($Xi) # H lies right after Xi in gcm128_context
+ la $inp,16($sp)
+ lghi $len,16
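+					# all-zero input means kimd computes
+					# Xi=(Xi^0)*H, i.e. a single gmult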
+ .long 0xb93e0004 # kimd %r0,$inp
+	brc	1,.-4			# re-execute kimd on "partial completion" (cc 3)
+ br %r14
+.align 32
+.Lsoft_gmult:
+___
+$code.=<<___;
stmg %r6,%r14,48($sp)
aghi $Xi,-1
.globl gcm_ghash_4bit
.align 32
gcm_ghash_4bit:
+___
+$code.=<<___ if(!$softonly);
+ larl %r1,OPENSSL_s390xcap_P
+ lg %r0,0(%r1)
+ tmhl %r0,0x4000 # check for message-security-assist
+ jz .Lsoft_ghash
+	lghi	%r0,0			# function code 0: query
+	la	%r1,16($sp)		# 16-byte status is stored here
+	.long	0xb93e0004	# kimd %r0,%r4
+ lg %r1,24($sp)
+ tmhh %r1,0x4000 # check for function 65
+ jz .Lsoft_ghash
+ lghi %r0,65 # function 65
+ la %r1,0($Xi) # H lies right after Xi in gcm128_context
+ .long 0xb93e0004 # kimd %r0,$inp
+	brc	1,.-4			# re-execute kimd on "partial completion" (cc 3)
+ br %r14
+.align 32
+.Lsoft_ghash:
+___
+$code.=<<___;
stmg %r6,%r14,48($sp)
aghi $Xi,-1
lg $Zlo,8+1($Xi) # Xi
lg $Zhi,0+1($Xi)
+	lghi	$tmp,0			# no rem_4bit correction on first pass
.Louter:
- xg $Zlo,8($inp) # Xi ^= inp
- xg $Zhi,0($inp)
+ xg $Zhi,0($inp) # Xi ^= inp
+ xg $Zlo,8($inp)
+	xgr	$Zhi,$tmp		# deferred rem_4bit correction from previous block
stg $Zlo,8+1($Xi)
stg $Zhi,0+1($Xi)
.Lgmult_shortcut:
- lghi $tmp,0xff
- srlg $xi,$Zlo,8 # extract first two bytes
+ lghi $tmp,0xf0
+ sllg $nlo,$Zlo,4
+ srlg $xi,$Zlo,8 # extract second byte
+ ngr $nlo,$tmp
lgr $nhi,$Zlo
- ngr $xi,$tmp
- ngr $nhi,$tmp
-
- sllg $nlo,$nhi,4
- nill $nhi,0xf0
- nill $nlo,0xf0
lghi $cnt,14
+ ngr $nhi,$tmp
lg $Zlo,8($nlo,$Htbl)
lg $Zhi,0($nlo,$Htbl)
sllg $nlo,$xi,4
- nill $xi,0xf0
sllg $rem0,$Zlo,3
- nill $nlo,0xf0
-
- srlg $Zlo,$Zlo,4
+ ngr $nlo,$tmp
ngr $rem0,$x78
+ ngr $xi,$tmp
+
sllg $tmp,$Zhi,60
- xg $Zlo,8($nhi,$Htbl)
+ srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
- xgr $Zlo,$tmp
+ xg $Zlo,8($nhi,$Htbl)
xg $Zhi,0($nhi,$Htbl)
lgr $nhi,$xi
sllg $rem1,$Zlo,3
-
+ xgr $Zlo,$tmp
+ ngr $rem1,$x78
+	j	.Lghash_inner		# branch over alignment padding
+.align	16
.Lghash_inner:
srlg $Zlo,$Zlo,4
- ngr $rem1,$x78
- xg $Zlo,8($nlo,$Htbl)
sllg $tmp,$Zhi,60
- xg $Zhi,0($rem0,$rem_4bit)
- xgr $Zlo,$tmp
+ xg $Zlo,8($nlo,$Htbl)
srlg $Zhi,$Zhi,4
llgc $xi,0($cnt,$Xi)
- sllg $rem0,$Zlo,3
xg $Zhi,0($nlo,$Htbl)
sllg $nlo,$xi,4
- nill $xi,0xf0
+ xg $Zhi,0($rem0,$rem_4bit)
nill $nlo,0xf0
-
- srlg $Zlo,$Zlo,4
+ sllg $rem0,$Zlo,3
+ xgr $Zlo,$tmp
ngr $rem0,$x78
- xg $Zlo,8($nhi,$Htbl)
+ nill $xi,0xf0
+
sllg $tmp,$Zhi,60
- xg $Zhi,0($rem1,$rem_4bit)
- xgr $Zlo,$tmp
+ srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
- sllg $rem1,$Zlo,3
+ xg $Zlo,8($nhi,$Htbl)
xg $Zhi,0($nhi,$Htbl)
lgr $nhi,$xi
+ xg $Zhi,0($rem1,$rem_4bit)
+ sllg $rem1,$Zlo,3
+ xgr $Zlo,$tmp
+ ngr $rem1,$x78
brct $cnt,.Lghash_inner
+ sllg $tmp,$Zhi,60
srlg $Zlo,$Zlo,4
- ngr $rem1,$x78
+ srlg $Zhi,$Zhi,4
xg $Zlo,8($nlo,$Htbl)
- sllg $tmp,$Zhi,60
+ xg $Zhi,0($nlo,$Htbl)
+ sllg $xi,$Zlo,3
xg $Zhi,0($rem0,$rem_4bit)
xgr $Zlo,$tmp
- srlg $Zhi,$Zhi,4
- sllg $rem0,$Zlo,3
- xg $Zhi,0($nlo,$Htbl)
+ ngr $xi,$x78
- srlg $Zlo,$Zlo,4
- ngr $rem0,$x78
- xg $Zhi,0($rem1,$rem_4bit)
sllg $tmp,$Zhi,60
- xg $Zlo,8($nhi,$Htbl)
+ srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
- xgr $Zlo,$tmp
+ xg $Zlo,8($nhi,$Htbl)
xg $Zhi,0($nhi,$Htbl)
+ xgr $Zlo,$tmp
+ xg $Zhi,0($rem1,$rem_4bit)
+ lg $tmp,0($xi,$rem_4bit)
la $inp,16($inp)
- xg $Zhi,0($rem0,$rem_4bit)
+ sllg $tmp,$tmp,4 # correct last rem_4bit[rem]
brctg $len,.Louter
+	xgr	$Zhi,$tmp		# apply the last rem_4bit correction
stg $Zlo,8+1($Xi)
stg $Zhi,0+1($Xi)
lmg %r6,%r14,48($sp)
.align 64
rem_4bit:
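+# These entries are the rem_4bit[rem] reduction constants, pre-shifted
+# right by 4 bits (<<12 in the high word where it used to be <<16):
+# the rescheduled loop XORs rem_4bit[rem] into Zhi only after Zhi has
+# already been shifted right by 4. The final lookup, which no shift
+# follows, is compensated by the "correct last rem_4bit[rem]" step
+# above.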
- .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
- .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
- .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
- .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
+ .long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
+ .long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
+ .long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
+ .long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
.type rem_4bit,\@object
.size rem_4bit,(.-rem_4bit)
.string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"