dlfcn: always define _GNU_SOURCE

[oweals/openssl.git] / crypto / rc4 / asm / rc4-s390x.pl
diff --git a/crypto/rc4/asm/rc4-s390x.pl b/crypto/rc4/asm/rc4-s390x.pl

index 4366c4fc1a6c37a5440505c4697ed96b80310f86..1aa754820c64e905cf75956c27aff6866e172dae 100644 (file)
--- a/crypto/rc4/asm/rc4-s390x.pl
+++ b/crypto/rc4/asm/rc4-s390x.pl
@@ -9,9 +9,32 @@
  #
  # February 2009
  #
-# Performance is 2x of gcc 3.4.6 on z10. Coding "secret" is to avoid
-# arithmetic instructions, but adhere to load and load address in
-# order to minimize Address Generation Interlock.
+# Performance is 2x of gcc 3.4.6 on z10. Coding "secret" is to
+# "cluster" Address Generation Interlocks, so that one pipeline stall
+# resolves several dependencies.
+
+# November 2010.
+#
+# Adapt for -m31 build. If the kernel supports what's called the
+# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
+# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
+# legacy application context. The feature is not specific to any
+# particular processor, as long as it's a "z-CPU". The latter implies
+# that the code remains z/Architecture specific. On z990 it was measured
+# to perform 50% better than code generated by gcc 4.3.
+
+$flavour = shift;      # first command-line argument selects the build flavour
+
+if ($flavour =~ /3[12]/) {     # flavour string contains "31" or "32"
+       $SIZE_T=4;      # save-area offsets are emitted as 6*4(...)
+       $g="";          # stm${g}/lm${g} are emitted as stm/lm
+} else {
+       $SIZE_T=8;      # save-area offsets are emitted as 6*8(...)
+       $g="g";         # stm${g}/lm${g} are emitted as stmg/lmg
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}   # consume arguments until one looks like name.ext, or none remain
+open STDOUT,">$output";        # send everything printed on STDOUT to that file; the result of open is not checked
  
  $rp="%r14";
  $sp="%r15";
@@ -39,14 +62,19 @@ $code.=<<___;
  .type  RC4,\@function
  .align 64
  RC4:
-       stmg    %r6,%r11,48($sp)
+       stm${g} %r6,%r11,6*$SIZE_T($sp)
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+       llgfr   $len,$len
+___
+$code.=<<___;
         llgc    $XX[0],0($key)
         llgc    $YY,1($key)
         la      $XX[0],1($XX[0])
-       llgcr   $XX[0],$XX[0]
-       llgc    $TX[0],2($XX[0],$key)
+       nill    $XX[0],0xff
         srlg    $cnt,$len,3
         ltgr    $cnt,$cnt
+       llgc    $TX[0],2($XX[0],$key)
         jz      .Lshort
         j       .Loop8
  
@@ -56,17 +84,17 @@ ___
  for ($i=0;$i<8;$i++) {
  $code.=<<___;
         la      $YY,0($YY,$TX[0])       # $i
-       llgcr   $YY,$YY
+       nill    $YY,255
         la      $XX[1],1($XX[0])
-       llgcr   $XX[1],$XX[1]
+       nill    $XX[1],255
+___
+$code.=<<___ if ($i==1);
+       llgc    $acc,2($TY,$key)
  ___
  $code.=<<___ if ($i>1);
         sllg    $acc,$acc,8
         ic      $acc,2($TY,$key)
  ___
-$code.=<<___ if ($i==1);
-       llgc    $acc,2($TY,$key)
-___
  $code.=<<___;
         llgc    $TY,2($YY,$key)
         stc     $TX[0],2($YY,$key)
@@ -77,7 +105,7 @@ $code.=<<___;
         la      $TX[1],0($TX[0])
  .Lcmov$i:
         la      $TY,0($TY,$TX[0])
-       llgcr   $TY,$TY
+       nill    $TY,255
  ___
  push(@TX,shift(@TX)); push(@XX,shift(@XX));     # "rotate" registers
  }
@@ -90,7 +118,7 @@ $code.=<<___;
         xgr     $acc,$TX[1]
         stg     $acc,0($out)
         la      $out,8($out)
-       brct    $cnt,.Loop8
+       brctg   $cnt,.Loop8
  
  .Lshort:
         lghi    $acc,7
@@ -101,18 +129,18 @@ $code.=<<___;
  .align 16
  .Loop1:
         la      $YY,0($YY,$TX[0])
-       llgcr   $YY,$YY
+       nill    $YY,255
         llgc    $TY,2($YY,$key)
         stc     $TX[0],2($YY,$key)
         stc     $TY,2($XX[0],$key)
-       la      $TY,0($TY,$TX[0])
-       llgcr   $TY,$TY
-       la      $XX[0],1($XX[0])
-       llgcr   $XX[0],$XX[0]
-       llgc    $TY,2($TY,$key)
-       llgc    $TX[0],2($XX[0],$key)
+       ar      $TY,$TX[0]
+       ahi     $XX[0],1
+       nill    $TY,255
+       nill    $XX[0],255
         llgc    $acc,0($inp)
         la      $inp,1($inp)
+       llgc    $TY,2($TY,$key)
+       llgc    $TX[0],2($XX[0],$key)
         xr      $acc,$TY
         stc     $acc,0($out)
         la      $out,1($out)
@@ -122,7 +150,7 @@ $code.=<<___;
         ahi     $XX[0],-1
         stc     $XX[0],0($key)
         stc     $YY,1($key)
-       lmg     %r6,%r11,48($sp)
+       lm${g}  %r6,%r11,6*$SIZE_T($sp)
         br      $rp
  .size  RC4,.-RC4
  .string        "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
@@ -147,7 +175,7 @@ $code.=<<___;
  .type  RC4_set_key,\@function
  .align 64
  RC4_set_key:
-       stmg    %r6,%r8,48($sp)
+       stm${g} %r6,%r8,6*$SIZE_T($sp)
         lhi     $cnt,256
         la      $idx,0(%r0)
         sth     $idx,0($key)
@@ -168,8 +196,8 @@ RC4_set_key:
         la      $idx,0($idx,$acc)
         la      $ikey,1($ikey)
         la      $idx,0($idx,$dat)
+       nill    $idx,255
         la      $iinp,1($iinp)
-       llgcr   $idx,$idx
         tml     $ikey,255
         llgc    $dat,2($idx,$key)
         stc     $dat,2+256-1($ikey,$key)
@@ -180,7 +208,7 @@ RC4_set_key:
         la      $iinp,0(%r0)
         j       .L2ndloop
  .Ldone:
-       lmg     %r6,%r8,48($sp)
+       lm${g}  %r6,%r8,6*$SIZE_T($sp)
         br      $rp
  .size  RC4_set_key,.-RC4_set_key
  
@@ -203,3 +231,4 @@ RC4_options:
  ___
  
  print $code;
+close STDOUT;  # force flush