X-Git-Url: https://git.librecmc.org/?a=blobdiff_plain;f=crypto%2Frc4%2Fasm%2Frc4-586.pl;h=ab16a97b6805835023b97259a1f32ad92d726c37;hb=9be34ee5c8576539a929d5b396ad071aed525f43;hp=0c4cac4e8986e24970ca04fe381526a1b2df1444;hpb=6a99984b57cbc7ecedf198399ed745a50c87a495;p=oweals%2Fopenssl.git diff --git a/crypto/rc4/asm/rc4-586.pl b/crypto/rc4/asm/rc4-586.pl index 0c4cac4e89..ab16a97b68 100644 --- a/crypto/rc4/asm/rc4-586.pl +++ b/crypto/rc4/asm/rc4-586.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 1998-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # [Re]written by Andy Polyakov for the OpenSSL @@ -31,16 +38,21 @@ # May 2011 # # Optimize for Core2 and Westmere [and incidentally Opteron]. Current -# performance in cycles per processed byte (less is better) is: +# performance in cycles per processed byte (less is better) and +# improvement relative to previous version of this module is: # # Pentium 10.2 # original numbers # Pentium III 7.8(*) # Intel P4 7.5 # -# Opteron 6.4/+14% # new MMX numbers -# Core2 5.8/+50%(**) -# Westmere 5.5/+80%(**) -# Sandy Bridge 5.4/0% +# Opteron 6.1/+20% # new MMX numbers +# Core2 5.3/+67%(**) +# Westmere 5.1/+94%(**) +# Sandy Bridge 5.0/+8% +# Atom 12.6/+6% +# VIA Nano 6.4/+9% +# Ivy Bridge 4.9/±0% +# Bulldozer 4.9/+15% # # (*) PIII can actually deliver 6.6 cycles per byte with MMX code, # but this specific code performs poorly on Core2. And vice @@ -58,7 +70,10 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; -&asm_init($ARGV[0],"rc4-586.pl"); +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386"); $xx="eax"; $yy="ebx"; @@ -126,8 +141,7 @@ if ($alt=0) { $RC4_loop_mmx = sub { my $i=shift; - &add ($yy,$tx); - &movz ($yy,&LB($yy)); # (*) + &add (&LB($yy),&LB($tx)); &psllq ("mm1",8*(($i-1)&7)) if (abs($i)!=1); &mov ($ty,&DWP(0,$dat,$yy,4)); &mov (&DWP(0,$dat,$yy,4),$tx); @@ -143,7 +157,7 @@ if ($alt=0) { &movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4)); # (*) This is the key to Core2 and Westmere performance. - # Whithout movz out-of-order execution logic confuses + # Without movz out-of-order execution logic confuses # itself and fails to reorder loads and stores. Problem # appears to be fixed in Sandy Bridge... } @@ -183,8 +197,11 @@ if ($alt=0) { &and ($ty,-4); # how many 4-byte chunks? &jz (&label("loop1")); - &test ($ty,-8); &mov (&wparam(3),$out); # $out as accumulator in these loops + if ($x86only) { + &jmp (&label("go4loop4")); + } else { + &test ($ty,-8); &jz (&label("go4loop4")); &picmeup($out,"OPENSSL_ia32cap_P"); @@ -204,6 +221,9 @@ if ($alt=0) { &$RC4_loop_mmx(0); &set_label("loop_mmx_enter"); for ($i=1;$i<8;$i++) { &$RC4_loop_mmx($i); } + &mov ($ty,$yy); + &xor ($yy,$yy); # this is second key to Core2 + &mov (&LB($yy),&LB($ty)); # and Westmere performance... &cmp ($inp,&DWP(-4,$dat)); &lea ($inp,&DWP(8,$inp)); &jb (&label("loop_mmx")); @@ -224,6 +244,7 @@ if ($alt=0) { &cmp ($inp,&wparam(1)); # compare to input+len &je (&label("done")); &jmp (&label("loop1")); + } &set_label("go4loop4",16); &lea ($ty,&DWP(-4,$inp,$ty)); @@ -404,3 +425,4 @@ $idx="edx"; &asm_finish(); +close STDOUT;