From 8986e37249ab0045a48a14e63e3cda2f593e1f05 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@openssl.org>
Date: Tue, 21 Sep 2010 11:37:00 +0000
Subject: [PATCH] ghash-s390x.pl: reschedule instructions for better
 performance.

---
 crypto/modes/asm/ghash-s390x.pl | 147 ++++++++++++++++++++++----------
 1 file changed, 103 insertions(+), 44 deletions(-)

diff --git a/crypto/modes/asm/ghash-s390x.pl b/crypto/modes/asm/ghash-s390x.pl
index 18135ddb45..d7689de541 100644
--- a/crypto/modes/asm/ghash-s390x.pl
+++ b/crypto/modes/asm/ghash-s390x.pl
@@ -8,10 +8,21 @@
 # ====================================================================
 
 # September 2010.
+#
+# The module implements the "4-bit" GCM GHASH function and the
+# underlying multiplication in GF(2^128). "4-bit" means it uses a
+# 256-byte per-key table [+128 bytes shared table]. Performance was
+# measured at ~18 cycles per processed byte on z10, almost 40% better
+# than gcc-generated code. Note that 18 cycles is a worse result than
+# expected: the loop is scheduled for 12, so the result should be
+# close to 12. In the absence of instruction-level profiling data
+# it's impossible to tell why...
 
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
+$softonly=1;	# disable hardware support for now
+
 $Zhi="%r0";
 $Zlo="%r1";
 
@@ -38,6 +49,31 @@ $code.=<<___;
 .globl	gcm_gmult_4bit
 .align	32
 gcm_gmult_4bit:
+___
+$code.=<<___ if(!$softonly);
+	larl	%r1,OPENSSL_s390xcap_P
+	lg	%r0,0(%r1)
+	tmhl	%r0,0x4000	# check for message-security-assist
+	jz	.Lsoft_gmult
+	lghi	%r0,0
+	la	%r1,16($sp)
+	.long	0xb93e0004	# kimd %r0,%r4
+	lg	%r1,24($sp)
+	tmhh	%r1,0x4000	# check for function 65
+	jz	.Lsoft_gmult
+	stg	%r0,16($sp)	# arrange 16 bytes of zero input
+	stg	%r0,24($sp)
+	lghi	%r0,65		# function 65
+	la	%r1,0($Xi)	# H lies right after Xi in gcm128_context
+	la	$inp,16($sp)
+	lghi	$len,16
+	.long	0xb93e0004	# kimd %r0,$inp
+	brc	1,.-4		# pay attention to "partial completion"
+	br	%r14
+.align	32
+.Lsoft_gmult:
+___
+$code.=<<___;
 	stmg	%r6,%r14,48($sp)
 
 	aghi	$Xi,-1
@@ -53,6 +89,27 @@
 .globl	gcm_ghash_4bit
 .align	32
 gcm_ghash_4bit:
+___
+$code.=<<___ if(!$softonly);
+	larl	%r1,OPENSSL_s390xcap_P
+	lg	%r0,0(%r1)
+	tmhl	%r0,0x4000	# check for message-security-assist
+	jz	.Lsoft_ghash
+	lghi	%r0,0
+	la	%r1,16($sp)
+	.long	0xb93e0004	# kimd %r0,%r4
+	lg	%r1,24($sp)
+	tmhh	%r1,0x4000	# check for function 65
+	jz	.Lsoft_ghash
+	lghi	%r0,65		# function 65
+	la	%r1,0($Xi)	# H lies right after Xi in gcm128_context
+	.long	0xb93e0004	# kimd %r0,$inp
+	brc	1,.-4		# pay attention to "partial completion"
+	br	%r14
+.align	32
+.Lsoft_ghash:
+___
+$code.=<<___;
 	stmg	%r6,%r14,48($sp)
 
 	aghi	$Xi,-1
@@ -62,92 +119,94 @@ gcm_ghash_4bit:
 	lg	$Zlo,8+1($Xi)		# Xi
 	lg	$Zhi,0+1($Xi)
+	lghi	$tmp,0
 .Louter:
-	xg	$Zlo,8($inp)		# Xi ^= inp
-	xg	$Zhi,0($inp)
+	xg	$Zhi,0($inp)		# Xi ^= inp
+	xg	$Zlo,8($inp)
+	xgr	$Zhi,$tmp
 	stg	$Zlo,8+1($Xi)
 	stg	$Zhi,0+1($Xi)
 
 .Lgmult_shortcut:
-	lghi	$tmp,0xff
-	srlg	$xi,$Zlo,8		# extract first two bytes
+	lghi	$tmp,0xf0
+	sllg	$nlo,$Zlo,4
+	srlg	$xi,$Zlo,8		# extract second byte
+	ngr	$nlo,$tmp
 	lgr	$nhi,$Zlo
-	ngr	$xi,$tmp
-	ngr	$nhi,$tmp
-
-	sllg	$nlo,$nhi,4
-	nill	$nhi,0xf0
-	nill	$nlo,0xf0
 	lghi	$cnt,14
+	ngr	$nhi,$tmp
 
 	lg	$Zlo,8($nlo,$Htbl)
 	lg	$Zhi,0($nlo,$Htbl)
 	sllg	$nlo,$xi,4
-	nill	$xi,0xf0
 	sllg	$rem0,$Zlo,3
-	nill	$nlo,0xf0
-
-	srlg	$Zlo,$Zlo,4
+	ngr	$nlo,$tmp
 	ngr	$rem0,$x78
+	ngr	$xi,$tmp
+
 	sllg	$tmp,$Zhi,60
-	xg	$Zlo,8($nhi,$Htbl)
+	srlg	$Zlo,$Zlo,4
 	srlg	$Zhi,$Zhi,4
-	xgr	$Zlo,$tmp
+	xg	$Zlo,8($nhi,$Htbl)
 	xg	$Zhi,0($nhi,$Htbl)
 	lgr	$nhi,$xi
 	sllg	$rem1,$Zlo,3
-
+	xgr	$Zlo,$tmp
+	ngr	$rem1,$x78
+	j	.Lghash_inner
+.align	16
 .Lghash_inner:
 	srlg	$Zlo,$Zlo,4
-	ngr	$rem1,$x78
-	xg	$Zlo,8($nlo,$Htbl)
 	sllg	$tmp,$Zhi,60
-	xg	$Zhi,0($rem0,$rem_4bit)
-	xgr	$Zlo,$tmp
+	xg	$Zlo,8($nlo,$Htbl)
 	srlg	$Zhi,$Zhi,4
 	llgc	$xi,0($cnt,$Xi)
-	sllg	$rem0,$Zlo,3
 	xg	$Zhi,0($nlo,$Htbl)
 	sllg	$nlo,$xi,4
-	nill	$xi,0xf0
+	xg	$Zhi,0($rem0,$rem_4bit)
 	nill	$nlo,0xf0
-
-	srlg	$Zlo,$Zlo,4
+	sllg	$rem0,$Zlo,3
+	xgr	$Zlo,$tmp
 	ngr	$rem0,$x78
-	xg	$Zlo,8($nhi,$Htbl)
+	nill	$xi,0xf0
+
 	sllg	$tmp,$Zhi,60
-	xg	$Zhi,0($rem1,$rem_4bit)
-	xgr	$Zlo,$tmp
+	srlg	$Zlo,$Zlo,4
 	srlg	$Zhi,$Zhi,4
-	sllg	$rem1,$Zlo,3
+	xg	$Zlo,8($nhi,$Htbl)
 	xg	$Zhi,0($nhi,$Htbl)
 	lgr	$nhi,$xi
+	xg	$Zhi,0($rem1,$rem_4bit)
+	sllg	$rem1,$Zlo,3
+	xgr	$Zlo,$tmp
+	ngr	$rem1,$x78
 	brct	$cnt,.Lghash_inner
 
+	sllg	$tmp,$Zhi,60
 	srlg	$Zlo,$Zlo,4
-	ngr	$rem1,$x78
+	srlg	$Zhi,$Zhi,4
 	xg	$Zlo,8($nlo,$Htbl)
-	sllg	$tmp,$Zhi,60
+	xg	$Zhi,0($nlo,$Htbl)
+	sllg	$xi,$Zlo,3
 	xg	$Zhi,0($rem0,$rem_4bit)
 	xgr	$Zlo,$tmp
-	srlg	$Zhi,$Zhi,4
-	sllg	$rem0,$Zlo,3
-	xg	$Zhi,0($nlo,$Htbl)
+	ngr	$xi,$x78
 
-	srlg	$Zlo,$Zlo,4
-	ngr	$rem0,$x78
-	xg	$Zhi,0($rem1,$rem_4bit)
 	sllg	$tmp,$Zhi,60
-	xg	$Zlo,8($nhi,$Htbl)
+	srlg	$Zlo,$Zlo,4
 	srlg	$Zhi,$Zhi,4
-	xgr	$Zlo,$tmp
+	xg	$Zlo,8($nhi,$Htbl)
 	xg	$Zhi,0($nhi,$Htbl)
+	xgr	$Zlo,$tmp
+	xg	$Zhi,0($rem1,$rem_4bit)
 
+	lg	$tmp,0($xi,$rem_4bit)
 	la	$inp,16($inp)
-	xg	$Zhi,0($rem0,$rem_4bit)
+	sllg	$tmp,$tmp,4	# correct last rem_4bit[rem]
 	brctg	$len,.Louter
 
+	xgr	$Zhi,$tmp
 	stg	$Zlo,8+1($Xi)
 	stg	$Zhi,0+1($Xi)
 	lmg	%r6,%r14,48($sp)
@@ -157,10 +216,10 @@
 
 .align	64
 rem_4bit:
-	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
-	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
-	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
-	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
+	.long	`0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
+	.long	`0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
+	.long	`0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
+	.long	`0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
 .type	rem_4bit,\@object
 .size	rem_4bit,(.-rem_4bit)
 .string	"GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-- 
2.25.1
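
Not part of the patch: for reference, below is a minimal C sketch of the
"4-bit" table-driven multiplication Xi = Xi*H in GF(2^128) that the
rescheduled assembly computes, in the spirit of OpenSSL's generic C code
in crypto/modes/gcm128.c. The names (init_4bit, gmult_4bit, u128) and the
64-bit-only layout are illustrative assumptions, not the module's exports.

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

/* Shared 128-byte table: reduction terms for the 4 bits shifted out of Z
 * each step, pre-shifted into the top 16 bits of a 64-bit word. */
static const uint64_t rem_4bit[16] = {
    0x0000ULL << 48, 0x1C20ULL << 48, 0x3840ULL << 48, 0x2460ULL << 48,
    0x7080ULL << 48, 0x6CA0ULL << 48, 0x48C0ULL << 48, 0x54E0ULL << 48,
    0xE100ULL << 48, 0xFD20ULL << 48, 0xD940ULL << 48, 0xC560ULL << 48,
    0x9180ULL << 48, 0x8DA0ULL << 48, 0xA9C0ULL << 48, 0xB5E0ULL << 48
};

/* Build the 256-byte per-key table, Htable[i] = i*H in GF(2^128);
 * H is the hash key as two big-endian 64-bit halves. */
static void init_4bit(u128 Htable[16], const uint64_t H[2])
{
    u128 V = { H[0], H[1] };
    int i, j;

    Htable[0].hi = Htable[0].lo = 0;
    Htable[8] = V;                          /* bit-reflected: index 8 is H */
    for (i = 4; i > 0; i >>= 1) {           /* halve with polynomial reduction */
        uint64_t T = 0xe100000000000000ULL & (0 - (V.lo & 1));
        V.lo = (V.hi << 63) | (V.lo >> 1);
        V.hi = (V.hi >> 1) ^ T;
        Htable[i] = V;
    }
    for (i = 2; i < 16; i <<= 1)            /* remaining entries by linearity */
        for (j = 1; j < i; j++) {
            Htable[i + j].hi = Htable[i].hi ^ Htable[j].hi;
            Htable[i + j].lo = Htable[i].lo ^ Htable[j].lo;
        }
}

/* Xi (16 bytes, big-endian) = Xi * H: two table lookups per input byte,
 * low nibble first, scanning Xi from its last byte. */
static void gmult_4bit(uint8_t Xi[16], const u128 Htable[16])
{
    u128 Z;
    uint64_t rem;
    int cnt = 15, nlo, nhi;

    nlo = Xi[15] & 0xf;
    nhi = Xi[15] >> 4;
    Z = Htable[nlo];

    for (;;) {
        /* Z = (Z >> 4) + reduction, then fold in the next table entry */
        rem  = Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];
        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;

        if (--cnt < 0)
            break;

        nlo = Xi[cnt] & 0xf;
        nhi = Xi[cnt] >> 4;

        rem  = Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];
        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
    }

    for (cnt = 7; cnt >= 0; cnt--, Z.hi >>= 8, Z.lo >>= 8) {
        Xi[cnt]     = (uint8_t)Z.hi;        /* store back big-endian */
        Xi[cnt + 8] = (uint8_t)Z.lo;
    }
}

On the rem_4bit change at the end of the patch: the rescheduled loop XORs
each reduction constant in after $Zhi has already been shifted right by one
more nibble, so the table entries are stored 4 bits lower (`0x1C20<<12`
instead of `0x1C20<<16`); the very last lookup has no shift after it, which
is what the trailing "sllg $tmp,$tmp,4" marked "correct last rem_4bit[rem]"
compensates for.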