sub main'shr { &out2("shrl",@_); }
sub main'xor { &out2("xorl",@_); }
sub main'xorb { &out2("xorb",@_); }
-sub main'add { &out2("addl",@_); }
+sub main'add { &out2($_[0]=~/%[a-d][lh]/?"addb":"addl",@_); } # addb when the first operand contains a byte-register name (%al..%dh pattern), otherwise addl
sub main'adc { &out2("adcl",@_); }
sub main'sub { &out2("subl",@_); }
sub main'sbb { &out2("sbbl",@_); }
sub main'rotl { &out2("roll",@_); }
sub main'rotr { &out2("rorl",@_); }
-sub main'exch { &out2("xchg",@_); }
+sub main'exch { &out2($_[0]=~/%[a-d][lh]/?"xchgb":"xchgl",@_); } # explicit size suffix: xchgb for a byte-register first operand, xchgl otherwise
sub main'cmp { &out2("cmpl",@_); }
sub main'lea { &out2("leal",@_); }
sub main'mul { &out1("mull",@_); }
+sub main'imul { &out2("imull",@_); } # unlike mul above, this goes through out2 with the imull mnemonic
sub main'div { &out1("divl",@_); }
sub main'jmp { &out1("jmp",@_); }
sub main'jmp_ptr { &out1p("jmp",@_); }
sub main'jnc { &out1("jnc",@_); }
sub main'jno { &out1("jno",@_); }
sub main'dec { &out1("decl",@_); }
-sub main'inc { &out1("incl",@_); }
+sub main'inc { &out1($_[0]=~/%[a-d][hl]/?"incb":"incl",@_); } # incb when the operand contains a byte-register name, otherwise incl
sub main'push { &out1("pushl",@_); $stack+=4; }
sub main'pop { &out1("popl",@_); $stack-=4; }
sub main'pushf { &out0("pushfl"); $stack+=4; }
{ if ($label{$i} eq $_[0]) { $pre=''; last; } }
&out1("call",$pre.$_[0]);
}
+sub main'call_ptr { &out1p("call",@_); } # call counterpart of jmp_ptr: same out1p helper, "call" mnemonic
sub main'ret { &out0("ret"); }
sub main'nop { &out0("nop"); }
sub main'test { &out2("testl",@_); }
sub main'bt { &out2("btl",@_); }
sub main'leave { &out0("leave"); }
-sub main'cpuid { &out0(".byte 0x0f; .byte 0xa2"); }
-sub main'rdtsc { &out0(".byte 0x0f; .byte 0x31"); }
+sub main'cpuid { &out0(".byte\t0x0f,0xa2"); } # raw opcode bytes, now a single comma-separated .byte directive
+sub main'rdtsc { &out0(".byte\t0x0f,0x31"); } # raw opcode bytes, now a single comma-separated .byte directive
sub main'halt { &out0("hlt"); }
+sub main'movz { &out2("movzbl",@_); } # always emits movzbl (byte source zero-extended into a 32-bit destination)
+sub main'neg { &out1("negl",@_); } # single-operand wrapper, emits negl
+sub main'cld { &out0("cld"); } # operand-less wrapper, emits cld
# SSE2
sub main'emms { &out0("emms"); }
sub main'movd { &out2("movd",@_); }
-sub main'movq { &out2("movq",@_); }
sub main'movdqu { &out2("movdqu",@_); }
sub main'movdqa { &out2("movdqa",@_); }
sub main'movdq2q{ &out2("movdq2q",@_); }
sub main'pxor { &out2("pxor",@_); }
sub main'por { &out2("por",@_); }
sub main'pand { &out2("pand",@_); }
+sub main'movq { # movq wrapper with an optional third argument that enables a register-to-register substitution
+ local($p1,$p2,$optimize)=@_; # $p1 = destination, $p2 = source (see operand order in the pshufw line), $optimize = opt-in flag
+ if ($optimize && $p1=~/^mm[0-7]$/ && $p2=~/^mm[0-7]$/) # only when both operands are bare mm0..mm7 names
+ # movq between mmx registers can sink Intel CPUs
+ { push(@out,"\tpshufw\t\$0xe4,%$p2,%$p1\n"); } # immediate 0xe4 keeps words 3,2,1,0 in place, so this is a plain copy of $p2 into $p1; written straight to @out, bypassing out2
+ else { &out2("movq",@_); } # every other case: ordinary movq; @_ (including any third argument) is handed to out2 unchanged
+ }
# The bswapl instruction is new for the 486. Emulate if i386.
sub main'bswap
}
}
+sub main'public_label # emits a .globl directive for the label named by the first argument
+ {
+ $label{$_[0]}="${under}${_[0]}" if (!defined($label{$_[0]})); # first use of this name: record it in %label with the $under prefix
+ push(@out,".globl\t$label{$_[0]}\n"); # uses whatever %label holds, so an earlier registration of the name wins
+ }
+
sub main'label
{
if (!defined($label{$_[0]}))
sub main'file_end
{
# try to detect if SSE2 or MMX extensions were used on ELF platform...
- if ($main'elf && grep {/%[x]*mm[0-7]/i} @out) {
+ if ($main'elf && grep {/\b%[x]*mm[0-7]\b|OPENSSL_ia32cap_P\b/i} @out) {
local($tmp);
- push (@out,"\n.comm\t${under}OPENSSL_ia32cap_P,4,4\n");
-
- push (@out,".section\t.init\n");
- # One can argue that it's wasteful to craft every
- # SSE/MMX module with this snippet... Well, it's 72
- # bytes long and for the moment we have two modules.
- # Let's argue when we have 7 modules or so...
- #
- # $1<<10 sets a reserved bit to signal that variable
- # was initialized already...
- &main'picmeup("edx","OPENSSL_ia32cap_P");
- $tmp=<<___;
- cmpl \$0,(%edx)
- jne 1f
- movl \$1<<10,(%edx)
- pushf
- popl %eax
- movl %eax,%ecx
- xorl \$1<<21,%eax
- pushl %eax
- popf
- pushf
- popl %eax
- xorl %ecx,%eax
- btl \$21,%eax
- jnc 1f
- pushl %edi
- pushl %ebx
- movl %edx,%edi
- movl \$1,%eax
- .byte 0x0f; .byte 0xa2
- orl \$1<<10,%edx
- movl %edx,0(%edi)
- popl %ebx
- popl %edi
- .align $align
- 1:
-___
- push (@out,$tmp);
+ push (@out,"\n.section\t.bss\n");
+ push (@out,".comm\t${under}OPENSSL_ia32cap_P,4,4\n");
+
+ return;
}
if ($const ne "")
}
}
+sub main'data_byte # byte-sized sibling of data_word: same shape, .byte instead of .long
+ {
+ push(@out,"\t.byte\t".join(',',@_)."\n"); # all arguments go into one .byte directive, comma-separated
+ }
+
sub main'data_word
{
push(@out,"\t.long\t".join(',',@_)."\n");
$tmp=<<___;
.section .init
call $under$f
- .align $align
+ jmp .Linitalign
+.align $align
+.Linitalign:
___
}
elsif ($main'coff)