&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
# The third asm_init argument is true only when the last command-line
# argument is the literal string "386".
# NOTE(review): presumably this restricts the generated code to plain 386
# instructions -- confirm against the asm_init implementation.
+$sse2=0;
# $sse2 becomes 1 when any command-line argument contains
# -DOPENSSL_IA32_SSE2; it gates the external_label call below.
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
# OPENSSL_ia32cap_P is declared as an external label only when $sse2 is set.
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
# DWP operands at fixed esp offsets: T at 0/4, A at 8/12, B at 16/20.
# NOTE(review): the lo/hi naming suggests the two 32-bit halves of 64-bit
# values -- confirm against the rest of the file.
$Tlo=&DWP(0,"esp"); $Thi=&DWP(4,"esp");
$Alo=&DWP(8,"esp"); $Ahi=&DWP(8+4,"esp");
$Blo=&DWP(16,"esp"); $Bhi=&DWP(16+4,"esp");
# mm5-mm7, but it's done on an on-demand basis...
sub BODY_00_15_sse2 {
# Generates the MMX-register instruction sequence for one round body.
# Per the inline comments, the visible steps accumulate T1 (+= Ch(e,f,g),
# += h) in mm3, build T2 = Sigma0_512(a) + Maj(a,b,c) in mm7, and finish
# with a = T1.
#
# Argument:
#   $prefetch - optional; when true, two extra loads into mm6 and mm2 are
#               interleaved near the end of the sequence so that the code
#               emitted after this call can use those registers without
#               loading them itself.
#
# The statement order is hand-scheduled; do not reorder the calls.
+ my $prefetch=shift;
+
 &movq ("mm5",$Fsse2); # load f
 &movq ("mm6",$Gsse2); # load g
 &movq ("mm7",$Hsse2); # load h
 &movq ("mm1",$E); # %mm1 is sliding right
 &movq ("mm2",$E); # %mm2 is sliding left
 &psrlq ("mm1",14);
- &movq ($Esse2,$E); # module-scheduled save e
+ &movq ($Esse2,$E); # modulo-scheduled save e
 &psllq ("mm2",23);
 &movq ("mm3","mm1"); # %mm3 is T1
 &psrlq ("mm1",4);
 &pxor ("mm5","mm6"); # f^=g
 &movq ($E,$Dsse2); # e = load d
 &paddq ("mm3","mm5"); # T1+=Ch(e,f,g)
-
# $A is stored to the quadword at [esp] while esp still has its
# pre-adjustment value (the sub below has not been emitted yet).
+ &movq (&QWP(0,"esp"),$A); # modulo-scheduled save a
 &paddq ("mm3","mm7"); # T1+=h
 &movq ("mm5",$A); # %mm5 is sliding right
 &pxor ("mm7","mm6");
 &psllq ("mm6",6);
 &pxor ("mm7","mm5");
- &movq (&QWP(0,"esp"),$A); # module-scheduled save a
# esp is lowered by 8 here, ahead of the optional prefetch loads, so the
# esp-relative offsets they use refer to the already-adjusted stack pointer.
+ &sub ("esp",8);
 &pxor ("mm7","mm6"); # T2=Sigma0_512(a)
 &movq ("mm5",$A); # %mm5=a
 &por ($A,"mm2"); # a=a|c
# Optional load into mm6, emitted only when $prefetch is true.
+ &movq ("mm6",&QWP(8*(9+16-14),"esp")) if ($prefetch);
 &pand ("mm5","mm2"); # %mm5=a&c
 &pand ($A,"mm1"); # a=(a|c)&b
# Optional load into mm2, emitted only when $prefetch is true; it comes after
# the last use of mm2 as an operand in the Maj computation above.
+ &movq ("mm2",&QWP(8*(9+16-1),"esp")) if ($prefetch);
 &por ("mm5",$A); # %mm5=(a&c)|((a|c)&b)
- &sub ("esp",8);
 &paddq ("mm7","mm5"); # T2+=Maj(a,b,c)
 &movq ($A,"mm3"); # a=T1
}
-&function_begin("sha512_block_data_order",16);
+&function_begin("sha512_block_data_order");
&mov ("esi",wparam(0)); # ctx
&mov ("edi",wparam(1)); # inp
&mov ("eax",wparam(2)); # num
&mov (&DWP(8,"esp"),"eax"); # inp+num*128
&mov (&DWP(12,"esp"),"ebx"); # saved sp
- &picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("pic_point"));
+if ($sse2) {
# SSE2 code path; this whole section is generated only when $sse2 is set.
+ &picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512"));
# bt copies bit 26 of the dword at [edx] into the carry flag; when the bit is
# clear, control transfers to loop_x86 instead of continuing here.
# NOTE(review): presumably bit 26 is the SSE2 capability flag in
# OPENSSL_ia32cap_P -- confirm.
 &bt (&DWP(0,"edx"),26);
 &jnc (&label("loop_x86"));
# NOTE(review): by this point edx has evidently been repurposed (its low byte
# is compared against 0x35); the code that sets it is not visible here.
 &cmp (&LB("edx"),0x35);
 &jne (&label("00_14_sse2"));
- &BODY_00_15_sse2();
# Passing 1 enables the loads into mm6 and mm2 inside BODY_00_15_sse2, which
# is why the two explicit loads under 16_79_sse2 are commented out.
+ &BODY_00_15_sse2(1);
 &set_label("16_79_sse2",16);
- &movq ("mm3",&QWP(8*(9+16-1),"esp"));
- &movq ("mm6",&QWP(8*(9+16-14),"esp"));
- &movq ("mm1","mm3");
# In the added lines mm2 holds the value being shifted and mm3 accumulates
# the XOR result; the roles of mm2 and mm3 are swapped relative to the
# removed lines so that the prefetched mm2 can be consumed directly.
+ #&movq ("mm2",&QWP(8*(9+16-1),"esp")); #prefetched in BODY_00_15
+ #&movq ("mm6",&QWP(8*(9+16-14),"esp"));
+ &movq ("mm1","mm2");
- &psrlq ("mm3",1);
+ &psrlq ("mm2",1);
 &movq ("mm7","mm6");
 &psrlq ("mm6",6);
- &movq ("mm2","mm3");
+ &movq ("mm3","mm2");
- &psrlq ("mm3",7-1);
+ &psrlq ("mm2",7-1);
 &movq ("mm5","mm6");
 &psrlq ("mm6",19-6);
- &pxor ("mm2","mm3");
+ &pxor ("mm3","mm2");
- &psrlq ("mm3",8-7);
+ &psrlq ("mm2",8-7);
 &pxor ("mm5","mm6");
 &psrlq ("mm6",61-19);
- &pxor ("mm2","mm3");
+ &pxor ("mm3","mm2");
# mm2 is free once its last shifted copy has been folded into mm3, so it is
# reloaded here with the quadword at esp+8*(9+16).
- &movq ("mm3",&QWP(8*(9+16),"esp"));
+ &movq ("mm2",&QWP(8*(9+16),"esp"));
 &psllq ("mm1",56);
 &pxor ("mm5","mm6");
 &psllq ("mm7",3);
- &pxor ("mm2","mm1");
+ &pxor ("mm3","mm1");
- &paddq ("mm3",&QWP(8*(9+16-9),"esp"));
+ &paddq ("mm2",&QWP(8*(9+16-9),"esp"));
 &psllq ("mm1",63-56);
 &pxor ("mm5","mm7");
 &psllq ("mm7",45-3);
- &pxor ("mm2","mm1");
+ &pxor ("mm3","mm1");
 &pxor ("mm5","mm7");
- &paddq ("mm2","mm5");
- &paddq ("mm2","mm3");
- &movq (&QWP(8*9,"esp"),"mm2");
# The three partial results (mm3, mm5, mm2) are summed into mm3 and stored
# to the quadword at esp+8*9.
+ &paddq ("mm3","mm5");
+ &paddq ("mm3","mm2");
+ &movq (&QWP(8*9,"esp"),"mm3");
- &BODY_00_15_sse2();
+ &BODY_00_15_sse2(1);
# Loop back to 16_79_sse2 until the low byte of edx equals 0x17.
# NOTE(review): the instruction that advances edx is not visible here.
 &cmp (&LB("edx"),0x17);
 &jne (&label("16_79_sse2"));
# emms clears the MMX state before the function epilogue.
 &emms ();
 &mov ("esp",&DWP(8*10+12,"esp")); # restore sp
 &function_end_A();
-
+}
&set_label("loop_x86",16);
# copy input block to stack reversing byte and qword order
for ($i=0;$i<8;$i++) {