X-Git-Url: https://git.librecmc.org/?a=blobdiff_plain;f=crypto%2Fmodes%2Fasm%2Fghash-s390x.pl;h=bd1c811b7055a2398dceb3324a6a8b983fc27972;hb=a21314dbbc56cd30580123d74b3106a628540965;hp=18135ddb455ab8c6be753d7f87ce9c9c9b2a7b30;hpb=7d1f55e9d9a4c2d2d136735a07975504b12c1661;p=oweals%2Fopenssl.git diff --git a/crypto/modes/asm/ghash-s390x.pl b/crypto/modes/asm/ghash-s390x.pl index 18135ddb45..bd1c811b70 100644 --- a/crypto/modes/asm/ghash-s390x.pl +++ b/crypto/modes/asm/ghash-s390x.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov for the OpenSSL @@ -8,9 +15,51 @@ # ==================================================================== # September 2010. +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+128 bytes shared table]. Performance +# was measured to be ~18 cycles per processed byte on z10, which is +# almost 40% better than gcc-generated code. It should be noted that +# 18 cycles is worse result than expected: loop is scheduled for 12 +# and the result should be close to 12. In the lack of instruction- +# level profiling data it's impossible to tell why... + +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. On z990 it was measured to perform +# 2.8x better than 32-bit code generated by gcc 4.3. + +# March 2011. +# +# Support for hardware KIMD-GHASH is verified to produce correct +# result and therefore is engaged. On z196 it was measured to process +# 8KB buffer ~7 faster than software implementation. It's not as +# impressive for smaller buffer sizes and for smallest 16-bytes buffer +# it's actually almost 2 times slower. Which is the reason why +# KIMD-GHASH is not used in gcm_gmult_4bit. + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$output and open STDOUT,">$output"; + +$softonly=0; $Zhi="%r0"; $Zlo="%r1"; @@ -33,12 +82,35 @@ $rem_4bit="%r14"; $sp="%r15"; $code.=<<___; +#include "s390x_arch.h" + .text .globl gcm_gmult_4bit .align 32 gcm_gmult_4bit: - stmg %r6,%r14,48($sp) +___ +$code.=<<___ if(!$softonly && 0); # hardware is slow for single block... + larl %r1,OPENSSL_s390xcap_P + lghi %r0,0 + lg %r1,S390X_KIMD+8(%r1) # load second word of kimd capabilities + # vector + tmhh %r1,0x4000 # check for function 65 + jz .Lsoft_gmult + stg %r0,16($sp) # arrange 16 bytes of zero input + stg %r0,24($sp) + lghi %r0,S390X_GHASH # function 65 + la %r1,0($Xi) # H lies right after Xi in gcm128_context + la $inp,16($sp) + lghi $len,16 + .long 0xb93e0004 # kimd %r0,$inp + brc 1,.-4 # pay attention to "partial completion" + br %r14 +.align 32 +.Lsoft_gmult: +___ +$code.=<<___; + stm${g} %r6,%r14,6*$SIZE_T($sp) aghi $Xi,-1 lghi $len,1 @@ -53,7 +125,26 @@ gcm_gmult_4bit: .globl gcm_ghash_4bit .align 32 gcm_ghash_4bit: - stmg %r6,%r14,48($sp) +___ +$code.=<<___ if(!$softonly); + larl %r1,OPENSSL_s390xcap_P + lg %r0,S390X_KIMD+8(%r1) # load second word of kimd capabilities + # vector + tmhh %r0,0x4000 # check for function 65 + jz .Lsoft_ghash + lghi %r0,S390X_GHASH # function 65 + la %r1,0($Xi) # H lies right after Xi in gcm128_context + .long 0xb93e0004 # kimd %r0,$inp + brc 1,.-4 # pay attention to "partial completion" + br %r14 +.align 32 +.Lsoft_ghash: +___ +$code.=<<___ if ($flavour =~ /3[12]/); + llgfr $len,$len +___ +$code.=<<___; + stm${g} %r6,%r14,6*$SIZE_T($sp) aghi $Xi,-1 srlg $len,$len,4 @@ -62,105 +153,107 @@ gcm_ghash_4bit: lg $Zlo,8+1($Xi) # Xi lg $Zhi,0+1($Xi) + lghi $tmp,0 .Louter: - xg $Zlo,8($inp) # Xi ^= inp - xg $Zhi,0($inp) + xg $Zhi,0($inp) # Xi ^= inp + xg $Zlo,8($inp) + xgr $Zhi,$tmp stg $Zlo,8+1($Xi) stg $Zhi,0+1($Xi) .Lgmult_shortcut: - lghi $tmp,0xff - srlg $xi,$Zlo,8 # extract first two bytes + lghi $tmp,0xf0 + sllg $nlo,$Zlo,4 + srlg $xi,$Zlo,8 # extract second byte + ngr $nlo,$tmp lgr $nhi,$Zlo - ngr $xi,$tmp - ngr $nhi,$tmp - - sllg $nlo,$nhi,4 - nill $nhi,0xf0 - nill $nlo,0xf0 lghi $cnt,14 + ngr $nhi,$tmp lg $Zlo,8($nlo,$Htbl) lg $Zhi,0($nlo,$Htbl) sllg $nlo,$xi,4 - nill $xi,0xf0 sllg $rem0,$Zlo,3 - nill $nlo,0xf0 - - srlg $Zlo,$Zlo,4 + ngr $nlo,$tmp ngr $rem0,$x78 + ngr $xi,$tmp + sllg $tmp,$Zhi,60 - xg $Zlo,8($nhi,$Htbl) + srlg $Zlo,$Zlo,4 srlg $Zhi,$Zhi,4 - xgr $Zlo,$tmp + xg $Zlo,8($nhi,$Htbl) xg $Zhi,0($nhi,$Htbl) lgr $nhi,$xi sllg $rem1,$Zlo,3 - -.Lghash_inner: - srlg $Zlo,$Zlo,4 + xgr $Zlo,$tmp ngr $rem1,$x78 - xg $Zlo,8($nlo,$Htbl) sllg $tmp,$Zhi,60 - xg $Zhi,0($rem0,$rem_4bit) - xgr $Zlo,$tmp + j .Lghash_inner +.align 16 +.Lghash_inner: + srlg $Zlo,$Zlo,4 srlg $Zhi,$Zhi,4 + xg $Zlo,8($nlo,$Htbl) llgc $xi,0($cnt,$Xi) - sllg $rem0,$Zlo,3 xg $Zhi,0($nlo,$Htbl) sllg $nlo,$xi,4 - nill $xi,0xf0 + xg $Zhi,0($rem0,$rem_4bit) nill $nlo,0xf0 - - srlg $Zlo,$Zlo,4 + sllg $rem0,$Zlo,3 + xgr $Zlo,$tmp ngr $rem0,$x78 - xg $Zlo,8($nhi,$Htbl) + nill $xi,0xf0 + sllg $tmp,$Zhi,60 - xg $Zhi,0($rem1,$rem_4bit) - xgr $Zlo,$tmp + srlg $Zlo,$Zlo,4 srlg $Zhi,$Zhi,4 - sllg $rem1,$Zlo,3 + xg $Zlo,8($nhi,$Htbl) xg $Zhi,0($nhi,$Htbl) lgr $nhi,$xi + xg $Zhi,0($rem1,$rem_4bit) + sllg $rem1,$Zlo,3 + xgr $Zlo,$tmp + ngr $rem1,$x78 + sllg $tmp,$Zhi,60 brct $cnt,.Lghash_inner srlg $Zlo,$Zlo,4 - ngr $rem1,$x78 + srlg $Zhi,$Zhi,4 xg $Zlo,8($nlo,$Htbl) - sllg $tmp,$Zhi,60 + xg $Zhi,0($nlo,$Htbl) + sllg $xi,$Zlo,3 xg $Zhi,0($rem0,$rem_4bit) xgr $Zlo,$tmp - srlg $Zhi,$Zhi,4 - sllg $rem0,$Zlo,3 - xg $Zhi,0($nlo,$Htbl) + ngr $xi,$x78 - srlg $Zlo,$Zlo,4 - ngr $rem0,$x78 - xg $Zhi,0($rem1,$rem_4bit) sllg $tmp,$Zhi,60 - xg $Zlo,8($nhi,$Htbl) + srlg $Zlo,$Zlo,4 srlg $Zhi,$Zhi,4 - xgr $Zlo,$tmp + xg $Zlo,8($nhi,$Htbl) xg $Zhi,0($nhi,$Htbl) + xgr $Zlo,$tmp + xg $Zhi,0($rem1,$rem_4bit) + lg $tmp,0($xi,$rem_4bit) la $inp,16($inp) - xg $Zhi,0($rem0,$rem_4bit) + sllg $tmp,$tmp,4 # correct last rem_4bit[rem] brctg $len,.Louter + xgr $Zhi,$tmp stg $Zlo,8+1($Xi) stg $Zhi,0+1($Xi) - lmg %r6,%r14,48($sp) + lm${g} %r6,%r14,6*$SIZE_T($sp) br %r14 .type gcm_ghash_4bit,\@function .size gcm_ghash_4bit,(.-gcm_ghash_4bit) .align 64 rem_4bit: - .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0 - .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0 - .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0 - .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0 + .long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0 + .long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0 + .long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0 + .long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0 .type rem_4bit,\@object .size rem_4bit,(.-rem_4bit) .string "GHASH for s390x, CRYPTOGAMS by " @@ -168,4 +261,4 @@ ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; print $code; -close STDOUT; +close STDOUT or die "error closing STDOUT: $!";