X-Git-Url: https://git.librecmc.org/?a=blobdiff_plain;f=crypto%2Frc4%2Fasm%2Frc4-586.pl;h=ab16a97b6805835023b97259a1f32ad92d726c37;hb=9be34ee5c8576539a929d5b396ad071aed525f43;hp=4b8bc78b33b294960cc1bdf56354a0ad55ce0a54;hpb=0dff8ba2483520130cc8281b7dab604e9e6ca6da;p=oweals%2Fopenssl.git diff --git a/crypto/rc4/asm/rc4-586.pl b/crypto/rc4/asm/rc4-586.pl index 4b8bc78b33..ab16a97b68 100644 --- a/crypto/rc4/asm/rc4-586.pl +++ b/crypto/rc4/asm/rc4-586.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 1998-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # [Re]written by Andy Polyakov for the OpenSSL @@ -31,7 +38,8 @@ # May 2011 # # Optimize for Core2 and Westmere [and incidentally Opteron]. Current -# performance in cycles per processed byte (less is better) is: +# performance in cycles per processed byte (less is better) and +# improvement relative to previous version of this module is: # # Pentium 10.2 # original numbers # Pentium III 7.8(*) @@ -41,6 +49,10 @@ # Core2 5.3/+67%(**) # Westmere 5.1/+94%(**) # Sandy Bridge 5.0/+8% +# Atom 12.6/+6% +# VIA Nano 6.4/+9% +# Ivy Bridge 4.9/±0% +# Bulldozer 4.9/+15% # # (*) PIII can actually deliver 6.6 cycles per byte with MMX code, # but this specific code performs poorly on Core2. And vice @@ -58,7 +70,10 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; -&asm_init($ARGV[0],"rc4-586.pl"); +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386"); $xx="eax"; $yy="ebx"; @@ -142,7 +157,7 @@ if ($alt=0) { &movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4)); # (*) This is the key to Core2 and Westmere performance. - # Whithout movz out-of-order execution logic confuses + # Without movz out-of-order execution logic confuses # itself and fails to reorder loads and stores. Problem # appears to be fixed in Sandy Bridge... } @@ -182,8 +197,11 @@ if ($alt=0) { &and ($ty,-4); # how many 4-byte chunks? &jz (&label("loop1")); - &test ($ty,-8); &mov (&wparam(3),$out); # $out as accumulator in these loops + if ($x86only) { + &jmp (&label("go4loop4")); + } else { + &test ($ty,-8); &jz (&label("go4loop4")); &picmeup($out,"OPENSSL_ia32cap_P"); @@ -226,6 +244,7 @@ if ($alt=0) { &cmp ($inp,&wparam(1)); # compare to input+len &je (&label("done")); &jmp (&label("loop1")); + } &set_label("go4loop4",16); &lea ($ty,&DWP(-4,$inp,$ty)); @@ -406,3 +425,4 @@ $idx="edx"; &asm_finish(); +close STDOUT;