-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 1998-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# May 2011
#
# Optimize for Core2 and Westmere [and incidentally Opteron]. Current
-# performance in cycles per processed byte (less is better) is:
+# performance in cycles per processed byte (less is better) and
+# improvement relative to previous version of this module is:
#
# Pentium 10.2 # original numbers
# Pentium III 7.8(*)
# Intel P4 7.5
#
-# Opteron 6.4/+14% # new MMX numbers
-# Core2 5.8/+50%(**)
-# Westmere 5.5/+80%(**)
-# Sandy Bridge 5.4/0%
+# Opteron 6.1/+20% # new MMX numbers
+# Core2 5.3/+67%(**)
+# Westmere 5.1/+94%(**)
+# Sandy Bridge 5.0/+8%
+# Atom 12.6/+6%
+# VIA Nano 6.4/+9%
+# Ivy Bridge 4.9/±0%
+# Bulldozer 4.9/+15%
#
# (*) PIII can actually deliver 6.6 cycles per byte with MMX code,
# but this specific code performs poorly on Core2. And vice
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
-&asm_init($ARGV[0],"rc4-586.pl");
+$output=pop;
+open STDOUT,">$output";
+
+&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
$xx="eax";
$yy="ebx";
$RC4_loop_mmx = sub {
my $i=shift;
- &add ($yy,$tx);
- &movz ($yy,&LB($yy)); # (*)
+ &add (&LB($yy),&LB($tx));
&psllq ("mm1",8*(($i-1)&7)) if (abs($i)!=1);
&mov ($ty,&DWP(0,$dat,$yy,4));
&mov (&DWP(0,$dat,$yy,4),$tx);
&movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4));
# (*) This is the key to Core2 and Westmere performance.
- # Whithout movz out-of-order execution logic confuses
+ # Without movz out-of-order execution logic confuses
# itself and fails to reorder loads and stores. Problem
# appears to be fixed in Sandy Bridge...
}
&and ($ty,-4); # how many 4-byte chunks?
&jz (&label("loop1"));
- &test ($ty,-8);
&mov (&wparam(3),$out); # $out as accumulator in these loops
+ if ($x86only) {
+ &jmp (&label("go4loop4"));
+ } else {
+ &test ($ty,-8);
&jz (&label("go4loop4"));
&picmeup($out,"OPENSSL_ia32cap_P");
&$RC4_loop_mmx(0);
&set_label("loop_mmx_enter");
for ($i=1;$i<8;$i++) { &$RC4_loop_mmx($i); }
+ &mov ($ty,$yy);
+ &xor ($yy,$yy); # this is second key to Core2
+ &mov (&LB($yy),&LB($ty)); # and Westmere performance...
&cmp ($inp,&DWP(-4,$dat));
&lea ($inp,&DWP(8,$inp));
&jb (&label("loop_mmx"));
&cmp ($inp,&wparam(1)); # compare to input+len
&je (&label("done"));
&jmp (&label("loop1"));
+ }
&set_label("go4loop4",16);
&lea ($ty,&DWP(-4,$inp,$ty));
&asm_finish();
+close STDOUT;