X-Git-Url: https://git.librecmc.org/?a=blobdiff_plain;ds=sidebyside;f=crypto%2Fsha%2Fasm%2Fsha512-s390x.pl;h=92d7a7725a67313480078ccb39b066ff4a4b0068;hb=0d7903f83f84bba1d29225efd999c633a0c5ba01;hp=67a17d3808a5ef4f4491fb09a87cc804cd4268c9;hpb=a2a54ffc5f3e27a5e12547063b0ebbb4ba30956f;p=oweals%2Fopenssl.git

diff --git a/crypto/sha/asm/sha512-s390x.pl b/crypto/sha/asm/sha512-s390x.pl
index 67a17d3808..92d7a7725a 100644
--- a/crypto/sha/asm/sha512-s390x.pl
+++ b/crypto/sha/asm/sha512-s390x.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 # ====================================================================
 # Written by Andy Polyakov for the OpenSSL
@@ -12,17 +19,43 @@
 # April 2007.
 #
 # sha256_block_data_order is reportedly >3 times faster than gcc 3.3
-# generated code (must to be a bug in compiler, as improvement is
+# generated code (must be a bug in compiler, as improvement is
 # "pathologically" high, in particular in comparison to other SHA
 # modules). But the real twist is that it detects if hardware support
 # for SHA256 is available and in such case utilizes it. Then the
-# performance can reach >12x of assembler one for larger chunks.
+# performance can reach >6.5x of assembler one for larger chunks.
 #
 # sha512_block_data_order is ~70% faster than gcc 3.3 generated code.
 
+# January 2009.
+#
+# Add support for hardware SHA512 and reschedule instructions to
+# favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
+# than software.
+
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. On z990 SHA256 was measured to
+# perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
 $t0="%r0";
 $t1="%r1";
-$ctx="%r2";
+$ctx="%r2";	$t2="%r2";
 $inp="%r3";
 $len="%r4";	# used as index in inner loop
 
@@ -38,7 +71,7 @@ $tbl="%r13";
 $T1="%r14";
 $sp="%r15";
 
-$output=shift;
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
 if ($output =~ /512/) {
@@ -54,7 +87,7 @@ if ($output =~ /512/) {
 	@sigma0=(56,63, 7);
 	@sigma1=( 3,45, 6);
 	$rounds=80;
-	$kimdfunc=0;	# 0 means unknown/unsupported/unimplemented
+	$kimdfunc=3;	# 0 means unknown/unsupported/unimplemented/disabled
 } else {
 	$label="256";
 	$SZ=4;
@@ -72,7 +105,8 @@ if ($output =~ /512/) {
 }
 $Func="sha${label}_block_data_order";
 $Table="K${label}";
-$frame=160+16*$SZ;
+$stdframe=16*$SIZE_T+4*8;
+$frame=$stdframe+16*$SZ;
 
 sub BODY_00_15 {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
@@ -83,32 +117,32 @@ ___
 $code.=<<___;
 	$ROT	$t0,$e,$Sigma1[0]
 	$ROT	$t1,$e,$Sigma1[1]
+	lgr	$t2,$f
 	xgr	$t0,$t1
 	$ROT	$t1,$t1,`$Sigma1[2]-$Sigma1[1]`
+	xgr	$t2,$g
+	$ST	$T1,`$stdframe+$SZ*($i%16)`($sp)
 	xgr	$t0,$t1			# Sigma1(e)
-	$ST	$T1,`160+$SZ*($i%16)`($sp)
-	algr	$T1,$t0			# T1+=Sigma1(e)
 	algr	$T1,$h			# T1+=h
-	$ADD	$T1,`$i*$SZ`($len,$tbl)	# T1+=K[i]
-	lgr	$t0,$f
-	xgr	$t0,$g
-	ngr	$t0,$e
-	xgr	$t0,$g			# Ch(e,f,g)
-	algr	$T1,$t0			# T1+=Ch(e,f,g)
+	ngr	$t2,$e
+	lgr	$t1,$a
+	algr	$T1,$t0			# T1+=Sigma1(e)
 	$ROT	$h,$a,$Sigma0[0]
+	xgr	$t2,$g			# Ch(e,f,g)
+	$ADD	$T1,`$i*$SZ`($len,$tbl)	# T1+=K[i]
 	$ROT	$t0,$a,$Sigma0[1]
+	algr	$T1,$t2			# T1+=Ch(e,f,g)
+	ogr	$t1,$b
 	xgr	$h,$t0
+	lgr	$t2,$a
+	ngr	$t1,$c
 	$ROT	$t0,$t0,`$Sigma0[2]-$Sigma0[1]`
 	xgr	$h,$t0			# h=Sigma0(a)
-	lgr	$t0,$a
-	ogr	$t0,$b
-	ngr	$t0,$c
-	lgr	$t1,$a
-	ngr	$t1,$b
-	ogr	$t0,$t1			# Maj(a,b,c)
-	algr	$h,$t0			# h+=Maj(a,b,c)
-	algr	$d,$T1			# d+=T1
+	ngr	$t2,$b
 	algr	$h,$T1			# h+=T1
+	ogr	$t2,$t1			# Maj(a,b,c)
+	algr	$d,$T1			# d+=T1
+	algr	$h,$t2			# h+=Maj(a,b,c)
 ___
 }
 
@@ -116,19 +150,19 @@ sub BODY_16_XX {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
 
 $code.=<<___;
-	$LD	$T1,`160+$SZ*(($i+1)%16)`($sp)	### $i
-	$LD	$t1,`160+$SZ*(($i+14)%16)`($sp)
+	$LD	$T1,`$stdframe+$SZ*(($i+1)%16)`($sp)	### $i
+	$LD	$t1,`$stdframe+$SZ*(($i+14)%16)`($sp)
 	$ROT	$t0,$T1,$sigma0[0]
 	$SHR	$T1,$sigma0[2]
+	$ROT	$t2,$t0,`$sigma0[1]-$sigma0[0]`
 	xgr	$T1,$t0
-	$ROT	$t0,$t0,`$sigma0[1]-$sigma0[0]`
-	xgr	$T1,$t0			# sigma0(X[i+1])
 	$ROT	$t0,$t1,$sigma1[0]
-	$ADD	$T1,`160+$SZ*($i%16)`($sp)	# +=X[i]
+	xgr	$T1,$t2			# sigma0(X[i+1])
 	$SHR	$t1,$sigma1[2]
+	$ADD	$T1,`$stdframe+$SZ*($i%16)`($sp)	# +=X[i]
 	xgr	$t1,$t0
-	$ADD	$T1,`160+$SZ*(($i+9)%16)`($sp)	# +=X[i+9]
 	$ROT	$t0,$t0,`$sigma1[1]-$sigma1[0]`
+	$ADD	$T1,`$stdframe+$SZ*(($i+9)%16)`($sp)	# +=X[i+9]
 	xgr	$t1,$t0			# sigma1(X[i+14])
 	algr	$T1,$t1			# +=sigma1(X[i+14])
 ___
@@ -206,33 +240,35 @@ $code.=<<___;
 .globl	$Func
 .type	$Func,\@function
 $Func:
+	sllg	$len,$len,`log(16*$SZ)/log(2)`
 ___
 $code.=<<___ if ($kimdfunc);
-	lghi	%r0,0
-	la	%r1,16($sp)
-	.long	0xb93e0002	# kimd %r0,%r2
-	lg	%r0,16($sp)
+	larl	%r1,OPENSSL_s390xcap_P
+	lg	%r0,0(%r1)
+	tmhl	%r0,0x4000	# check for message-security assist
+	jz	.Lsoftware
+	lg	%r0,16(%r1)	# check kimd capabilities
 	tmhh	%r0,`0x8000>>$kimdfunc`
 	jz	.Lsoftware
 	lghi	%r0,$kimdfunc
 	lgr	%r1,$ctx
 	lgr	%r2,$inp
-	sllg	%r3,$len,`log(16*$SZ)/log(2)`
+	lgr	%r3,$len
 	.long	0xb93e0002	# kimd %r0,%r2
+	brc	1,.-4		# pay attention to "partial completion"
 	br	%r14
+.align	16
 .Lsoftware:
 ___
 $code.=<<___;
-	sllg	$len,$len,`log(16*$SZ)/log(2)`
-	la	$len,0($inp,$len)
-	stmg	$len,%r15,32($sp)
+	lghi	%r1,-$frame
+	la	$len,0($len,$inp)
+	stm${g}	$ctx,%r15,`2*$SIZE_T`($sp)
 	lgr	%r0,$sp
-	aghi	$sp,-$frame
-	stg	%r0,0($sp)
-
-	bras	$tbl,.Lpic
-.Lpic:	aghi	$tbl,$Table-.Lpic
+	la	$sp,0(%r1,$sp)
+	st${g}	%r0,0($sp)
+	larl	$tbl,$Table
 	$LD	$A,`0*$SZ`($ctx)
 	$LD	$B,`1*$SZ`($ctx)
 	$LD	$C,`2*$SZ`($ctx)
@@ -254,6 +290,8 @@ $code.=<<___;
 	clgr	$len,$t0
 	jne	.Lrounds_16_xx
 
+	l${g}	$ctx,`$frame+2*$SIZE_T`($sp)
+	la	$inp,`16*$SZ`($inp)
 	$ADD	$A,`0*$SZ`($ctx)
 	$ADD	$B,`1*$SZ`($ctx)
 	$ADD	$C,`2*$SZ`($ctx)
@@ -270,14 +308,14 @@ $code.=<<___;
 	$ST	$F,`5*$SZ`($ctx)
 	$ST	$G,`6*$SZ`($ctx)
 	$ST	$H,`7*$SZ`($ctx)
-	la	$inp,`16*$SZ`($inp)
-	clg	$inp,`$frame+32`($sp)
+	cl${g}	$inp,`$frame+4*$SIZE_T`($sp)
 	jne	.Lloop
 
-	lmg	%r6,%r15,`$frame+48`($sp)
+	lm${g}	%r6,%r15,`$frame+6*$SIZE_T`($sp)
 	br	%r14
.size	$Func,.-$Func
.string	"SHA${label} block transform for s390x, CRYPTOGAMS by "
+.comm	OPENSSL_s390xcap_P,80,8
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
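
Reviewer notes on the change above. Nothing below is part of the patch; the
sketches are plain-Perl illustrations written against the code shown in the
diff, and any names they introduce are for illustration only.

Both the old and the rescheduled BODY_00_15 compute the round logic with the
reduced forms Ch(e,f,g) = ((f^g)&e)^g and Maj(a,b,c) = ((a|b)&c)|(a&b); what
the patch changes is the instruction order and the extra temporary $t2
(aliasing %r2, which the new prologue saves with stm${g} and reloads after the
rounds), so the Ch/Maj sequences can interleave with the Sigma rotates on the
dual-issue z10 pipeline. A quick check of those identities, assuming nothing
beyond core Perl:

#!/usr/bin/env perl
# Reviewer sketch, not part of the patch: verify the reduced Ch/Maj forms
# used by BODY_00_15 against the textbook definitions (32-bit words).
use strict;
use warnings;

sub Ch_ref  { my ($e,$f,$g)=@_; ($e & $f) ^ (~$e & $g) }
sub Maj_ref { my ($a,$b,$c)=@_; ($a & $b) ^ ($a & $c) ^ ($b & $c) }

# Forms computed by the assembly with a single temporary each:
#   Ch:  xgr/ngr/xgr      ->  ((f ^ g) & e) ^ g
#   Maj: ogr/ngr, ngr/ogr ->  ((a | b) & c) | (a & b)
sub Ch_asm  { my ($e,$f,$g)=@_; (($f ^ $g) & $e) ^ $g }
sub Maj_asm { my ($a,$b,$c)=@_; (($a | $b) & $c) | ($a & $b) }

for (1..10000) {
	my @v = map { int(rand(2**32)) } 1..3;
	die "Ch mismatch"  if Ch_ref(@v)  != Ch_asm(@v);
	die "Maj mismatch" if Maj_ref(@v) != Maj_asm(@v);
}
print "Ch/Maj identities hold\n";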
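
BODY_16_XX implements the SHA-2 message expansion
W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
over a 16-slot window that now sits at $stdframe($sp) instead of the
hard-coded offset 160. Reduced mod 16, the four source slots are (i+14),
(i+9), (i+1) and i, exactly the offsets in the rewritten loads and adds. A
minimal sketch of that circular schedule with SHA-256 parameters (plain Perl,
names are illustrative):

#!/usr/bin/env perl
# Reviewer sketch, not part of the patch: the 16-slot circular message
# schedule kept at $stdframe($sp) by BODY_16_XX, here with SHA-256
# rotation/shift amounts and plain Perl arithmetic.
use strict;
use warnings;

sub rotr32 { my ($x,$n)=@_; (($x >> $n) | ($x << (32-$n))) & 0xffffffff }
sub sigma0 { my $x=shift; rotr32($x,7)  ^ rotr32($x,18) ^ ($x >> 3)  }
sub sigma1 { my $x=shift; rotr32($x,17) ^ rotr32($x,19) ^ ($x >> 10) }

my @W = map { int(rand(2**32)) } 0..15;		# X[0..15], first 16 words

for my $i (16..63) {
	# same slot indices as the assembly: (i+1), (i+9), (i+14), i mod 16
	my $w = (sigma0($W[($i+1)%16]) + $W[($i+9)%16]
	      +  sigma1($W[($i+14)%16]) + $W[$i%16]) & 0xffffffff;
	$W[$i%16] = $w;				# overwrite slot i%16 in place
}
printf "last schedule word: %08x\n", $W[63%16];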
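
Replacing $frame=160+16*$SZ with $stdframe=16*$SIZE_T+4*8 parameterizes the
stack layout over the ABI pointer size: the formula is the standard s390
register save area (16 pointer-sized slots plus 4 eight-byte slots, i.e. 160
bytes in 64-bit mode and 96 bytes with -m31), and the 16 message-schedule
slots of $SZ bytes sit above it. That is also why the saved $ctx and the
block-end pointer are now addressed at $frame+2*$SIZE_T and $frame+4*$SIZE_T
respectively, instead of the old hard-coded $frame+32 and $frame+48 offsets.
The resulting sizes, worked out in a small sketch (the flavour strings here
are placeholders):

#!/usr/bin/env perl
# Reviewer sketch, not part of the patch: frame sizes produced by
# $stdframe=16*$SIZE_T+4*8 and $frame=$stdframe+16*$SZ.
use strict;
use warnings;

for my $flavour (qw(64 31)) {
	my $SIZE_T = ($flavour =~ /3[12]/) ? 4 : 8;	# same test as the patch
	for my $SZ (8, 4) {				# 8 = SHA-512, 4 = SHA-256
		my $stdframe = 16*$SIZE_T + 4*8;	# ABI register save area
		my $frame    = $stdframe + 16*$SZ;	# + 16 schedule slots
		printf "%s-bit SHA-%d: stdframe=%3d frame=%3d\n",
		       $flavour, $SZ == 8 ? 512 : 256, $stdframe, $frame;
	}
}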
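
The old hardware probe ran a KIMD query on every call, using 16($sp) as the
parameter block; the new prologue instead consults OPENSSL_s390xcap_P, the
capability vector that OPENSSL_cpuid_setup fills in once at start-up (hence
the added .comm). The first doubleword holds facility bits and is tested with
tmhl %r0,0x4000 ("check for message-security assist", per the in-line
comment); the doubleword at offset 16 holds the cached KIMD query result, and
tmhh %r0,`0x8000>>$kimdfunc` tests the bit for function code $kimdfunc (3,
i.e. KIMD-SHA-512, in this build). If both tests pass, the routine hands
%r0=$kimdfunc, %r1=$ctx, %r2=$inp, %r3=byte length to kimd and loops on
condition code 3 via brc 1,.-4, the "partial completion" case the comment
points at. A plain-Perl rendering of the two bit tests, with made-up
capability words (an illustration of the bit arithmetic, not of how libcrypto
reads the vector):

#!/usr/bin/env perl
# Reviewer sketch, not part of the patch: the two capability tests of the
# new prologue redone in Perl.  The capability words below are invented;
# in libcrypto they live in OPENSSL_s390xcap_P.
use strict;
use warnings;

# First doubleword: facility bits.  tmhl tests register bits 16..31, with
# mask bit 0x8000 == bit 16, so the 0x4000 in the patch tests bit 17
# (message-security assist).
my $facility_word = 1 << (63 - 17);		# pretend the facility is there

# Doubleword at offset 16: cached KIMD query result; bit n is set when
# function code n is available (2 = SHA-256, 3 = SHA-512).
my $kimd_word = (1 << (63 - 2)) | (1 << (63 - 3));

sub kimd_usable {
	my $kimdfunc = shift;					# e.g. 3 for SHA-512
	return 0 unless $facility_word & (1 << (63 - 17));	# tmhl %r0,0x4000
	return 0 unless $kimd_word & (1 << (63 - $kimdfunc));	# tmhh 0x8000>>fc
	return 1;
}

printf "SHA-512 block transform takes the %s path\n",
       kimd_usable(3) ? "kimd hardware" : ".Lsoftware";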