From 0d51cf3ccc0224def10c32b6defd4a77a1b4322a Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 9 Jan 2020 06:20:09 -0800 Subject: [PATCH] x86_64: Don't assume 8-byte pointer size Since a pointer in x32 is 4 bytes, add x86_64-support.pl to define pointer_size and pointer_register based on flavour to support structures like: struct { void *ptr; int blocks; } This fixes 90-test_sslapi.t on x32. Verified with $ ./Configure shared linux-x86_64 $ make $ make test and $ ./Configure shared linux-x32 $ make $ make test Reviewed-by: Richard Levitte Reviewed-by: Tomas Mraz (Merged from https://github.com/openssl/openssl/pull/10988) --- crypto/aes/asm/aesni-mb-x86_64.pl | 84 ++++++++++++++++++++---------- crypto/perlasm/x86_64-support.pl | 51 ++++++++++++++++++ crypto/sha/asm/sha1-mb-x86_64.pl | 42 ++++++++++----- crypto/sha/asm/sha256-mb-x86_64.pl | 42 ++++++++++----- 4 files changed, 167 insertions(+), 52 deletions(-) create mode 100644 crypto/perlasm/x86_64-support.pl diff --git a/crypto/aes/asm/aesni-mb-x86_64.pl b/crypto/aes/asm/aesni-mb-x86_64.pl index 3b2b569481..0b86285d30 100644 --- a/crypto/aes/asm/aesni-mb-x86_64.pl +++ b/crypto/aes/asm/aesni-mb-x86_64.pl @@ -54,6 +54,11 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86_64-support.pl"; + +$ptr_size=&pointer_size($flavour); + $avx=0; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` @@ -88,6 +93,8 @@ $inp="%rdi"; # 1st arg $key="%rsi"; # 2nd arg $num="%edx"; +$inp_elm_size=2*$ptr_size+8+16; + @inptr=map("%r$_",(8..11)); @outptr=map("%r$_",(12..15)); @@ -163,21 +170,25 @@ $code.=<<___; .Lenc4x_body: movdqu ($key),$zero # 0-round key lea 0x78($key),$key # size optimization - lea 40*2($inp),$inp + lea $inp_elm_size*2($inp),$inp .Lenc4x_loop_grande: mov $num,24(%rsp) # original $num xor $num,$num ___ for($i=0;$i<4;$i++) { + 
$inptr_reg=&pointer_register($flavour,@inptr[$i]); + $outptr_reg=&pointer_register($flavour,@outptr[$i]); $code.=<<___; - mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks - mov `40*$i+0-40*2`($inp),@inptr[$i] + # borrow $one for number of blocks + mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one + mov `$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg cmp $num,$one - mov `40*$i+8-40*2`($inp),@outptr[$i] + mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg cmovg $one,$num # find maximum test $one,$one - movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV + # load IV + movdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@out[$i] mov $one,`32+4*$i`(%rsp) # initialize counters cmovle %rsp,@inptr[$i] # cancel input ___ @@ -335,14 +346,15 @@ $code.=<<___; #pxor @inp[0],@out[0] #pxor @inp[1],@out[1] - #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME! + # output iv FIX ME! + #movdqu @out[0],`$inp_elm_size*0+2*$ptr_size+8-$inp_elm_size*2`($inp) #pxor @inp[2],@out[2] - #movdqu @out[1],`40*1+24-40*2`($inp) + #movdqu @out[1],`$inp_elm_size*1+2*$ptr_size+8-$inp_elm_size*2`($inp) #pxor @inp[3],@out[3] - #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller - #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out... + #movdqu @out[2],`$inp_elm_size*2+2*$ptr_size+8-$inp_elm_size*2`($inp) # won't fix, let caller + #movdqu @out[3],`$inp_elm_size*3+2*$ptr_size+8-$inp_elm_size*2`($inp) # figure this out... 
- lea `40*4`($inp),$inp + lea `$inp_elm_size*4`($inp),$inp dec $num jnz .Lenc4x_loop_grande @@ -440,21 +452,25 @@ $code.=<<___; .Ldec4x_body: movdqu ($key),$zero # 0-round key lea 0x78($key),$key # size optimization - lea 40*2($inp),$inp + lea $inp_elm_size*2($inp),$inp .Ldec4x_loop_grande: mov $num,24(%rsp) # original $num xor $num,$num ___ for($i=0;$i<4;$i++) { + $inptr_reg=&pointer_register($flavour,@inptr[$i]); + $outptr_reg=&pointer_register($flavour,@outptr[$i]); $code.=<<___; - mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks - mov `40*$i+0-40*2`($inp),@inptr[$i] + # borrow $one for number of blocks + mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one + mov `$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg cmp $num,$one - mov `40*$i+8-40*2`($inp),@outptr[$i] + mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg cmovg $one,$num # find maximum test $one,$one - movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV + # load IV + movdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@inp[$i] mov $one,`32+4*$i`(%rsp) # initialize counters cmovle %rsp,@inptr[$i] # cancel input ___ @@ -610,7 +626,7 @@ $code.=<<___; .cfi_def_cfa %rax,8 mov 24(%rsp),$num - lea `40*4`($inp),$inp + lea `$inp_elm_size*4`($inp),$inp dec $num jnz .Ldec4x_loop_grande @@ -709,7 +725,7 @@ $code.=<<___; vzeroupper vmovdqu ($key),$zero # 0-round key lea 0x78($key),$key # size optimization - lea 40*4($inp),$inp + lea `$inp_elm_size*4`($inp),$inp shr \$1,$num .Lenc8x_loop_grande: @@ -718,14 +734,20 @@ $code.=<<___; ___ for($i=0;$i<8;$i++) { my $temp = $i ? 
$offload : $offset; + $ptr_reg=&pointer_register($flavour,@ptr[$i]); + $temp_reg=&pointer_register($flavour,$temp); $code.=<<___; - mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks - mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer + # borrow $one for number of blocks + mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one + # input pointer + mov `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg cmp $num,$one - mov `40*$i+8-40*4`($inp),$temp # output pointer + # output pointer + mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg cmovg $one,$num # find maximum test $one,$one - vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV + # load IV + vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i] mov $one,`32+4*$i`(%rsp) # initialize counters cmovle %rsp,@ptr[$i] # cancel input sub @ptr[$i],$temp # distance between input and output @@ -910,7 +932,7 @@ $code.=<<___; mov 16(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 #mov 24(%rsp),$num - #lea `40*8`($inp),$inp + #lea `$inp_elm_size*8`($inp),$inp #dec $num #jnz .Lenc8x_loop_grande @@ -1002,7 +1024,7 @@ $code.=<<___; vzeroupper vmovdqu ($key),$zero # 0-round key lea 0x78($key),$key # size optimization - lea 40*4($inp),$inp + lea `$inp_elm_size*4`($inp),$inp shr \$1,$num .Ldec8x_loop_grande: @@ -1011,14 +1033,20 @@ $code.=<<___; ___ for($i=0;$i<8;$i++) { my $temp = $i ? 
$offload : $offset; + $ptr_reg=&pointer_register($flavour,@ptr[$i]); + $temp_reg=&pointer_register($flavour,$temp); $code.=<<___; - mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks - mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer + # borrow $one for number of blocks + mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one + # input pointer + mov `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg cmp $num,$one - mov `40*$i+8-40*4`($inp),$temp # output pointer + # output pointer + mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg cmovg $one,$num # find maximum test $one,$one - vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV + # load IV + vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i] mov $one,`32+4*$i`(%rsp) # initialize counters cmovle %rsp,@ptr[$i] # cancel input sub @ptr[$i],$temp # distance between input and output @@ -1234,7 +1262,7 @@ $code.=<<___; mov 16(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 #mov 24(%rsp),$num - #lea `40*8`($inp),$inp + #lea `$inp_elm_size*8`($inp),$inp #dec $num #jnz .Ldec8x_loop_grande diff --git a/crypto/perlasm/x86_64-support.pl b/crypto/perlasm/x86_64-support.pl new file mode 100644 index 0000000000..66aeaedab4 --- /dev/null +++ b/crypto/perlasm/x86_64-support.pl @@ -0,0 +1,51 @@ +#! /usr/bin/env perl +# Copyright 2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +package x86_64support; + +# require "x86_64-support.pl"; +# $ptr_size=&pointer_size($flavour); +# $ptr_reg=&pointer_register($flavour,$reg); + +sub ::pointer_size +{ + my($flavour)=@_; + my $ptr_size=8; $ptr_size=4 if ($flavour eq "elf32"); + return $ptr_size; +} + +sub ::pointer_register +{ + my($flavour,$reg)=@_; + if ($flavour eq "elf32") { + if ($reg eq "%rax") { + return "%eax"; + } elsif ($reg eq "%rbx") { + return "%ebx"; + } elsif ($reg eq "%rcx") { + return "%ecx"; + } elsif ($reg eq "%rdx") { + return "%edx"; + } elsif ($reg eq "%rdi") { + return "%edi"; + } elsif ($reg eq "%rsi") { + return "%esi"; + } elsif ($reg eq "%rbp") { + return "%ebp"; + } elsif ($reg eq "%rsp") { + return "%esp"; + } else { + return $reg."d"; + } + } else { + return $reg; + } +} + +1; diff --git a/crypto/sha/asm/sha1-mb-x86_64.pl b/crypto/sha/asm/sha1-mb-x86_64.pl index 0873bd96dc..ef1228786f 100644 --- a/crypto/sha/asm/sha1-mb-x86_64.pl +++ b/crypto/sha/asm/sha1-mb-x86_64.pl @@ -50,6 +50,11 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86_64-support.pl"; + +$ptr_size=&pointer_size($flavour); + $avx=0; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` @@ -89,6 +94,7 @@ $inp="%rsi"; # 2nd arg $num="%edx"; @ptr=map("%r$_",(8..11)); $Tbl="%rbp"; +$inp_elm_size=2*$ptr_size; @V=($A,$B,$C,$D,$E)=map("%xmm$_",(0..4)); ($t0,$t1,$t2,$t3,$tx)=map("%xmm$_",(5..9)); @@ -409,9 +415,12 @@ $code.=<<___; xor $num,$num ___ for($i=0;$i<4;$i++) { + $ptr_reg=&pointer_register($flavour,@ptr[$i]); $code.=<<___; - mov `16*$i+0`($inp),@ptr[$i] # input pointer - mov `16*$i+8`($inp),%ecx # number of blocks + # input pointer + mov `$inp_elm_size*$i+0`($inp),$ptr_reg + # number of blocks + mov 
`$inp_elm_size*$i+$ptr_size`($inp),%ecx cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx @@ -488,7 +497,7 @@ $code.=<<___; mov `$REG_SZ*17+8`(%rsp),$num lea $REG_SZ($ctx),$ctx - lea `16*$REG_SZ/4`($inp),$inp + lea `$inp_elm_size*$REG_SZ/4`($inp),$inp dec $num jnz .Loop_grande @@ -566,9 +575,12 @@ $code.=<<___; xor $num,$num ___ for($i=0;$i<2;$i++) { + $ptr_reg=&pointer_register($flavour,@ptr[$i]); $code.=<<___; - mov `16*$i+0`($inp),@ptr[$i] # input pointer - mov `16*$i+8`($inp),%ecx # number of blocks + # input pointer + mov `$inp_elm_size*$i+0`($inp),$ptr_reg + # number of blocks + mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx @@ -751,7 +763,7 @@ $code.=<<___; movq $E0,0x80-0x40($ctx) # e1.e0 lea `$REG_SZ/2`($ctx),$ctx - lea `16*2`($inp),$inp + lea `$inp_elm_size*2`($inp),$inp dec $num jnz .Loop_grande_shaext @@ -1071,9 +1083,12 @@ $code.=<<___; xor $num,$num ___ for($i=0;$i<4;$i++) { + $ptr_reg=&pointer_register($flavour,@ptr[$i]); $code.=<<___; - mov `16*$i+0`($inp),@ptr[$i] # input pointer - mov `16*$i+8`($inp),%ecx # number of blocks + # input pointer + mov `$inp_elm_size*$i+0`($inp),$ptr_reg + # number of blocks + mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx @@ -1144,7 +1159,7 @@ $code.=<<___; mov `$REG_SZ*17+8`(%rsp),$num lea $REG_SZ($ctx),$ctx - lea `16*$REG_SZ/4`($inp),$inp + lea `$inp_elm_size*$REG_SZ/4`($inp),$inp dec $num jnz .Loop_grande_avx @@ -1240,9 +1255,12 @@ $code.=<<___; lea `$REG_SZ*16`(%rsp),%rbx ___ for($i=0;$i<8;$i++) { + $ptr_reg=&pointer_register($flavour,@ptr[$i]); $code.=<<___; - mov `16*$i+0`($inp),@ptr[$i] # input pointer - mov `16*$i+8`($inp),%ecx # number of blocks + # input pointer + mov `$inp_elm_size*$i+0`($inp),$ptr_reg + # number of blocks + mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx @@ -1313,7 +1331,7 @@ $code.=<<___; #mov 
`$REG_SZ*17+8`(%rsp),$num #lea $REG_SZ($ctx),$ctx - #lea `16*$REG_SZ/4`($inp),$inp + #lea `$inp_elm_size*$REG_SZ/4`($inp),$inp #dec $num #jnz .Loop_grande_avx2 diff --git a/crypto/sha/asm/sha256-mb-x86_64.pl b/crypto/sha/asm/sha256-mb-x86_64.pl index 7e9f486028..500a581a26 100644 --- a/crypto/sha/asm/sha256-mb-x86_64.pl +++ b/crypto/sha/asm/sha256-mb-x86_64.pl @@ -51,6 +51,11 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86_64-support.pl"; + +$ptr_size=&pointer_size($flavour); + $avx=0; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` @@ -93,6 +98,7 @@ $inp="%rsi"; # 2nd arg $num="%edx"; # 3rd arg @ptr=map("%r$_",(8..11)); $Tbl="%rbp"; +$inp_elm_size=2*$ptr_size; @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15)); ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7)); @@ -291,9 +297,12 @@ $code.=<<___; xor $num,$num ___ for($i=0;$i<4;$i++) { + $ptr_reg=&pointer_register($flavour,@ptr[$i]); $code.=<<___; - mov `16*$i+0`($inp),@ptr[$i] # input pointer - mov `16*$i+8`($inp),%ecx # number of blocks + # input pointer + mov `$inp_elm_size*$i+0`($inp),$ptr_reg + # number of blocks + mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx @@ -392,7 +401,7 @@ $code.=<<___; mov `$REG_SZ*17+8`(%rsp),$num lea $REG_SZ($ctx),$ctx - lea `16*$REG_SZ/4`($inp),$inp + lea `$inp_elm_size*$REG_SZ/4`($inp),$inp dec $num jnz .Loop_grande @@ -470,9 +479,12 @@ $code.=<<___; xor $num,$num ___ for($i=0;$i<2;$i++) { + $ptr_reg=&pointer_register($flavour,@ptr[$i]); $code.=<<___; - mov `16*$i+0`($inp),@ptr[$i] # input pointer - mov `16*$i+8`($inp),%ecx # number of blocks + # input pointer + mov `$inp_elm_size*$i+0`($inp),$ptr_reg + # number of blocks + mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx @@ -753,7 +765,7 @@ 
$code.=<<___; movq @MSG0[1],0xe0-0x80($ctx) # H1.H0 lea `$REG_SZ/2`($ctx),$ctx - lea `16*2`($inp),$inp + lea `$inp_elm_size*2`($inp),$inp dec $num jnz .Loop_grande_shaext @@ -990,9 +1002,12 @@ $code.=<<___; xor $num,$num ___ for($i=0;$i<4;$i++) { + $ptr_reg=&pointer_register($flavour,@ptr[$i]); $code.=<<___; - mov `16*$i+0`($inp),@ptr[$i] # input pointer - mov `16*$i+8`($inp),%ecx # number of blocks + # input pointer + mov `$inp_elm_size*$i+0`($inp),$ptr_reg + # number of blocks + mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx @@ -1089,7 +1104,7 @@ $code.=<<___; mov `$REG_SZ*17+8`(%rsp),$num lea $REG_SZ($ctx),$ctx - lea `16*$REG_SZ/4`($inp),$inp + lea `$inp_elm_size*$REG_SZ/4`($inp),$inp dec $num jnz .Loop_grande_avx @@ -1180,9 +1195,12 @@ $code.=<<___; lea `$REG_SZ*16`(%rsp),%rbx ___ for($i=0;$i<8;$i++) { + $ptr_reg=&pointer_register($flavour,@ptr[$i]); $code.=<<___; - mov `16*$i+0`($inp),@ptr[$i] # input pointer - mov `16*$i+8`($inp),%ecx # number of blocks + # input pointer + mov `$inp_elm_size*$i+0`($inp),$ptr_reg + # number of blocks + mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx @@ -1279,7 +1297,7 @@ $code.=<<___; #mov `$REG_SZ*17+8`(%rsp),$num #lea $REG_SZ($ctx),$ctx - #lea `16*$REG_SZ/4`($inp),$inp + #lea `$inp_elm_size*$REG_SZ/4`($inp),$inp #dec $num #jnz .Loop_grande_avx2 -- 2.25.1