# Patch hunk: one pass squares a 32-bit word and stores the 64-bit result.
# Per pass: load a dword from [$a] into mm0, square it, store the quadword
# to [$r], advance $a by 4 bytes and $r by 8 bytes, and decrement $c.
# The -/+ pair switches the store operand from the DWP helper to the QWP
# helper, matching the quadword that movq transfers.
&movd("mm0",&DWP(0,$a)); # mm0 = a[i]
&pmuludq("mm0","mm0"); # mm0 = a[i]*a[i] (64-bit product of the low dwords; a[] itself is untouched)
&lea($a,&DWP(4,$a)); # a++
- &movq(&DWP(0,$r),"mm0"); # r[i] = a[i]*a[i]
+ &movq(&QWP(0,$r),"mm0"); # r[i] = a[i]*a[i]
&sub($c,1); # sets ZF consumed by the jnz below
&lea($r,&DWP(8,$r)); # r += 2 (lea does not modify flags, so ZF from sub survives)
&jnz(&label("sqr_sse2_loop")); # branch to sqr_sse2_loop while $c is non-zero
# Stack-frame slots, addressed off esp in 4-byte steps (slots 2..7 are
# visible in this hunk). The patch adds $_n0q: the same offset as $_n0 but
# built with the QWP helper, for instructions that take a quadword memory
# operand.
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
-$_n0=&DWP(4*5,"esp");
+$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp"); # NOTE(review): a quadword at 4*5 also spans the $_sp slot at 4*6
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32; # size of above frame rounded up to 16n
# Patch hunk (fragment; the code before and after it is not visible here).
# Visible effect of the -/+ pairs: the memory operands of pmuludq and of the
# movq store are now built with the quadword helpers ($_n0q / QWP) instead
# of the dword ones ($_n0 / DWP). The instructions themselves are unchanged.
&movq ($acc0,$mul1); # I wish movd worked for
&pand ($acc0,$mask); # inter-register transfers
- &pmuludq($mul1,$_n0); # *=n0
+ &pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
&paddq ($car1,$acc0);
&psrlq ($car1,32); # keep only the upper 32 bits of the sum
&paddq ($car1,$car0);
- &movq (&DWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
+ &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
\f
&inc ($i); # i++
&set_label("outer");
# Patch hunk (fragment; surrounding code is not visible here). Same change
# as the earlier hunk: pmuludq now takes $_n0q and the movq store uses QWP,
# so both memory operands go through the quadword helper.
&movq ($car0,$mul1);
&pand ($acc0,$mask);
- &pmuludq($mul1,$_n0); # *=n0
+ &pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1);
&paddq ($car1,$acc0);
&movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
&paddq ($car1,$car0);
&paddq ($car1,$temp);
- &movq (&DWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
+ &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&lea ($i,&DWP(1,$i)); # i++
&cmp ($i,$num); # flags for a branch that lies outside this hunk
}
# Memory-operand helpers: each forwards its arguments to get_mem together
# with a size keyword. The patch makes ::QWP pass an empty size string
# instead of "QWORD".
# NOTE(review): presumably so that no explicit size qualifier is emitted for
# quadword operands - confirm against get_mem, which is not visible here.
sub ::BP { &get_mem("BYTE",@_); }
sub ::DWP { &get_mem("DWORD",@_); }
-sub ::QWP { &get_mem("QWORD",@_); }
+sub ::QWP { &get_mem("",@_); }
# Size-prefix helpers: return the arguments joined into one string, prefixed
# with "BYTE " / "DWORD " unless $::mwerks is set.
sub ::BC { (($::mwerks)?"":"BYTE ")."@_"; }
sub ::DWC { (($::mwerks)?"":"DWORD ")."@_"; }
segment .bss
common ${under}OPENSSL_ia32cap_P 4
___
# Rewrite @out in place: any element starting with
# "extern <ws>${under}OPENSSL_ia32cap_P" gets a ';' inserted in front.
# grep is used only for the s/// side effect on each element; its return
# value is discarded.
# NOTE(review): ';' presumably comments the extern line out in the generated
# assembly so it does not clash with the "common" declaration above - confirm.
+ grep {s/(^extern\s+${under}OPENSSL_ia32cap_P)/\;$1/} @out;
push (@out,$tmp);
}
}