3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. Rights for redistribution and usage in source and binary
6 # forms are granted according to the OpenSSL license.
7 # ====================================================================
9 # sha1_block procedure for x86_64.
11 # It was brought to my attention that on EM64T compiler-generated code
12 # was far behind 32-bit assembler implementation. This is unlike on
13 # Opteron where compiler-generated code was only 15% behind 32-bit
14 # assembler, which originally made it hard to motivate the effort.
15 # There was a suggestion to mechanically translate 32-bit code, but I
16 # dismissed it, reasoning that x86_64 offers enough register bank
17 # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
18 # implementation:-) However! While 64-bit code does perform better
19 # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
20 # x86_64 does offer a larger *addressable* bank, but an out-of-order core
21 # reaches for even more registers through dynamic aliasing, and EM64T
22 # core must have managed to run-time optimize even 32-bit code just as
23 # well as 64-bit one. Performance improvement is summarized in the following table:
26 # gcc 3.4 32-bit asm cycles/byte
27 # Opteron +45% +20% 6.8
# Redirect everything subsequently printed to STDOUT into a pipe feeding the
# ../perlasm/x86_64-xlate.pl script, which is run under the same perl
# interpreter ($^X) and receives $output as its argument.
# A failed pipe open is fatal: without the check the generated code would be
# dropped with no diagnostic at all.
31 open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output"
	or die "can't call ../perlasm/x86_64-xlate.pl: $!";
# Names of the registers in which the three arguments arrive.  These are only
# the initial values: the generated prologue contains "mov %rdi,$ctx" (and
# likewise for $inp/$num), i.e. the arguments are copied into whatever
# registers these variables are reassigned to later in the script.
33 $ctx="%rdi"; # 1st arg
34 $inp="%rsi"; # 2nd arg
35 $num="%rdx"; # 3rd arg
37 # reassign arguments in order to produce more compact code
# Register list handed to the BODY_* round generators, which unpack it as
# ($a,$b,$c,$d,$e,$f).  The driver loops rotate the list one place to the
# right after every round with unshift(@V,pop(@V)), so each round sees the
# same registers under shifted roles.
52 @V=($A,$B,$C,$D,$E,$T);
58 .type $func,\@function,3
65 mov %rdi,$ctx # reassigned argument
67 mov %rsi,$inp # reassigned argument
69 mov %rdx,$num # reassigned argument
93 my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
95 $code.=<<___ if ($i==0);
97 `"bswap $xi" if(!defined($host))`
100 $code.=<<___ if ($i<15);
101 lea 0x5a827999($xi,$e),$f
106 `"bswap $xi" if(!defined($host))`
115 $code.=".Lshortcut:\n" if ($i==15);
116 $code.=<<___ if ($i>=15);
117 lea 0x5a827999($xi,$e),$f
118 mov `4*($j%16)`(%rsp),$xi
121 xor `4*(($j+2)%16)`(%rsp),$xi
124 xor `4*(($j+8)%16)`(%rsp),$xi
127 xor `4*(($j+13)%16)`(%rsp),$xi
132 mov $xi,`4*($j%16)`(%rsp)
137 my ($i,$a,$b,$c,$d,$e,$f)=@_;
139 my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
140 $code.=<<___ if ($i<79);
142 mov `4*($j%16)`(%rsp),$xi
145 xor `4*(($j+2)%16)`(%rsp),$xi
148 xor `4*(($j+8)%16)`(%rsp),$xi
151 xor `4*(($j+13)%16)`(%rsp),$xi
155 mov $xi,`4*($j%16)`(%rsp)
157 $code.=<<___ if ($i==79);
171 my ($i,$a,$b,$c,$d,$e,$f)=@_;
174 lea 0x8f1bbcdc($xi,$e),$f
175 mov `4*($j%16)`(%rsp),$xi
178 xor `4*(($j+2)%16)`(%rsp),$xi
181 xor `4*(($j+8)%16)`(%rsp),$xi
184 xor `4*(($j+13)%16)`(%rsp),$xi
190 mov $xi,`4*($j%16)`(%rsp)
# Generate the data-order entry point: prologue, the .Lloop label, and then
# one code fragment per round for all 80 rounds.  After each round @V is
# rotated one place to the right (last element moved to the front).
197 &PROLOGUE("sha1_block_asm_data_order");
198 $code.=".align 4\n.Lloop:\n";
# Rounds 0..19.
199 for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
# Rounds 20..39; $i is intentionally not reset and carries over between loops.
200 for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Rounds 40..59.
201 for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
# Rounds 60..79 reuse BODY_20_39, which picks a different constant $K
# once $i is 40 or above.
202 for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
215 xchg $E,$A # mov $E,$A
216 xchg $T,$B # mov $T,$B
217 xchg $E,$C # mov $A,$C
218 xchg $T,$D # mov $B,$D
220 lea `16*4`($inp),$inp
224 &EPILOGUE("sha1_block_asm_data_order");
226 ####################################################################
# Second entry point, sha1_block_asm_host_order.  The register list is reset
# to its initial order and only rounds 0..14 are generated here.  The extra
# trailing argument (1) sets $host in BODY_00_19, which makes it omit the
# bswap of each loaded word.
# NOTE(review): the remaining rounds are presumably shared with the data-order
# code through the .Lshortcut label that BODY_00_19 emits at round 15; the
# jump itself is not visible in this excerpt -- confirm.
228 @V=($A,$B,$C,$D,$E,$T);
230 &PROLOGUE("sha1_block_asm_host_order");
231 for($i=0;$i<15;$i++) { &BODY_00_19($i,@V,1); unshift(@V,pop(@V)); }
234 .size sha1_block_asm_host_order,.-sha1_block_asm_host_order
# Final pass over the accumulated $code: every backtick-delimited span is
# replaced by the result of evaluating its contents as a Perl expression
# (the /e modifier together with eval), for all occurrences (/g).  This is
# what turns the backtick-quoted offset arithmetic and the conditional bswap
# strings in the round templates into their final text.
237 $code =~ s/\`([^\`]*)\`/eval $1/gem;