#
# February 2009
#
-# Performance is 2x of gcc 3.4.6 on z10. Coding "secret" is to avoid
-# arithmetic instructions, but adhere to load and load address in
-# order to minimize Address Generation Interlock.
+# Performance is 2x that of gcc 3.4.6 on z10. The coding "secret" is to
+# "cluster" Address Generation Interlocks, so that one pipeline stall
+# resolves several dependencies.
+
+# November 2010.
+#
+# Adapt for -m31 build. If the kernel supports the so-called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in a 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's a "z-CPU". The latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 50% better than code generated by gcc 4.3.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
$rp="%r14";
$sp="%r15";
.type RC4,\@function
.align 64
RC4:
- stmg %r6,%r11,48($sp)
+ stm${g} %r6,%r11,6*$SIZE_T($sp)
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+ llgfr $len,$len
+___
+$code.=<<___;
llgc $XX[0],0($key)
llgc $YY,1($key)
la $XX[0],1($XX[0])
- llgcr $XX[0],$XX[0]
- llgc $TX[0],2($XX[0],$key)
+ nill $XX[0],0xff
srlg $cnt,$len,3
ltgr $cnt,$cnt
+ llgc $TX[0],2($XX[0],$key)
jz .Lshort
j .Loop8
for ($i=0;$i<8;$i++) {
$code.=<<___;
la $YY,0($YY,$TX[0]) # $i
- llgcr $YY,$YY
+ nill $YY,255
la $XX[1],1($XX[0])
- llgcr $XX[1],$XX[1]
+ nill $XX[1],255
+___
+$code.=<<___ if ($i==1);
+ llgc $acc,2($TY,$key)
___
$code.=<<___ if ($i>1);
sllg $acc,$acc,8
ic $acc,2($TY,$key)
___
-$code.=<<___ if ($i==1);
- llgc $acc,2($TY,$key)
-___
$code.=<<___;
llgc $TY,2($YY,$key)
stc $TX[0],2($YY,$key)
la $TX[1],0($TX[0])
.Lcmov$i:
la $TY,0($TY,$TX[0])
- llgcr $TY,$TY
+ nill $TY,255
___
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
}
xgr $acc,$TX[1]
stg $acc,0($out)
la $out,8($out)
- brct $cnt,.Loop8
+ brctg $cnt,.Loop8
.Lshort:
lghi $acc,7
.align 16
.Loop1:
la $YY,0($YY,$TX[0])
- llgcr $YY,$YY
+ nill $YY,255
llgc $TY,2($YY,$key)
stc $TX[0],2($YY,$key)
stc $TY,2($XX[0],$key)
- la $TY,0($TY,$TX[0])
- llgcr $TY,$TY
- la $XX[0],1($XX[0])
- llgcr $XX[0],$XX[0]
- llgc $TY,2($TY,$key)
- llgc $TX[0],2($XX[0],$key)
+ ar $TY,$TX[0]
+ ahi $XX[0],1
+ nill $TY,255
+ nill $XX[0],255
llgc $acc,0($inp)
la $inp,1($inp)
+ llgc $TY,2($TY,$key)
+ llgc $TX[0],2($XX[0],$key)
xr $acc,$TY
stc $acc,0($out)
la $out,1($out)
ahi $XX[0],-1
stc $XX[0],0($key)
stc $YY,1($key)
- lmg %r6,%r11,48($sp)
+ lm${g} %r6,%r11,6*$SIZE_T($sp)
br $rp
.size RC4,.-RC4
.string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
.type RC4_set_key,\@function
.align 64
RC4_set_key:
- stmg %r6,%r8,48($sp)
+ stm${g} %r6,%r8,6*$SIZE_T($sp)
lhi $cnt,256
la $idx,0(%r0)
sth $idx,0($key)
la $idx,0($idx,$acc)
la $ikey,1($ikey)
la $idx,0($idx,$dat)
+ nill $idx,255
la $iinp,1($iinp)
- llgcr $idx,$idx
tml $ikey,255
llgc $dat,2($idx,$key)
stc $dat,2+256-1($ikey,$key)
la $iinp,0(%r0)
j .L2ndloop
.Ldone:
- lmg %r6,%r8,48($sp)
+ lm${g} %r6,%r8,6*$SIZE_T($sp)
br $rp
.size RC4_set_key,.-RC4_set_key
___
print $code;
+close STDOUT; # force flush