+# &asm_init(<flavor>,"des-586.pl"[,$i386only]);
+# &function_begin("foo");
+# ...
+# &function_end("foo");
+# &asm_finish
+
+$out=();
+$i386=0;
+
+# AUTOLOAD is this context has quite unpleasant side effect, namely
+# that typos in function calls effectively go to assembler output,
+# but on the pros side we don't have to implement one subroutine per
+# each opcode...
+sub ::AUTOLOAD
+{ my $opcode = $AUTOLOAD;
+
+ die "more than 4 arguments passed to $opcode" if ($#_>3);
+
+ $opcode =~ s/.*:://;
+ if ($opcode =~ /^push/) { $stack+=4; }
+ elsif ($opcode =~ /^pop/) { $stack-=4; }
+
+ &generic($opcode,@_) or die "undefined subroutine \&$AUTOLOAD";
+}
+
+sub ::emit
+{ my $opcode=shift;
+
+ if ($#_==-1) { push(@out,"\t$opcode\n"); }
+ else { push(@out,"\t$opcode\t".join(',',@_)."\n"); }
+}
+
+sub ::LB
+{ $_[0] =~ m/^e?([a-d])x$/o or die "$_[0] does not have a 'low byte'";
+ $1."l";
+}
+sub ::HB
+{ $_[0] =~ m/^e?([a-d])x$/o or die "$_[0] does not have a 'high byte'";
+ $1."h";
+}
+sub ::stack_push{ my $num=$_[0]*4; $stack+=$num; &sub("esp",$num); }
+sub ::stack_pop { my $num=$_[0]*4; $stack-=$num; &add("esp",$num); }
+sub ::blindpop { &pop($_[0]); $stack+=4; }
+sub ::wparam { &DWP($stack+4*$_[0],"esp"); }
+sub ::swtmp { &DWP(4*$_[0],"esp"); }
+
+sub ::bswap
+{ if ($i386) # emulate bswap for i386
+ { &comment("bswap @_");
+ &xchg(&HB(@_),&LB(@_));
+ &ror (@_,16);
+ &xchg(&HB(@_),&LB(@_));
+ }
+ else
+ { &generic("bswap",@_); }
+}
+# These are made-up opcodes introduced over the years essentially
+# by ignorance, just alias them to real ones...
+sub ::movb { &mov(@_); }
+sub ::xorb { &xor(@_); }
+sub ::rotl { &rol(@_); }
+sub ::rotr { &ror(@_); }
+sub ::exch { &xchg(@_); }
+sub ::halt { &hlt; }
+sub ::movz { &movzx(@_); }
+sub ::pushf { &pushfd; }
+sub ::popf { &popfd; }
+
+# 3 argument instructions
+sub ::movq
+{ my($p1,$p2,$optimize)=@_;
+
+ if ($optimize && $p1=~/^mm[0-7]$/ && $p2=~/^mm[0-7]$/)
+ # movq between mmx registers can sink Intel CPUs
+ { &::pshufw($p1,$p2,0xe4); }
+ else
+ { &::generic("movq",@_); }
+}
+
+# SSE>2 instructions
+my %regrm = ( "eax"=>0, "ecx"=>1, "edx"=>2, "ebx"=>3,
+ "esp"=>4, "ebp"=>5, "esi"=>6, "edi"=>7 );
+sub ::pextrd
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /(e[a-dsd][ixp]):xmm([0-7])/)
+ { &::data_byte(0x66,0x0f,0x3a,0x16,0xc0|($2<<3)|$regrm{$1},$imm); }
+ else
+ { &::generic("pextrd",@_); }
+}
+
+sub ::pinsrd
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):(e[a-dsd][ixp])/)
+ { &::data_byte(0x66,0x0f,0x3a,0x22,0xc0|($1<<3)|$regrm{$2},$imm); }
+ else
+ { &::generic("pinsrd",@_); }
+}
+
+sub ::pshufb
+{ my($dst,$src)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &data_byte(0x66,0x0f,0x38,0x00,0xc0|($1<<3)|$2); }
+ else
+ { &::generic("pshufb",@_); }
+}
+
+sub ::palignr
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &::data_byte(0x66,0x0f,0x3a,0x0f,0xc0|($1<<3)|$2,$imm); }
+ else
+ { &::generic("palignr",@_); }
+}
+
+sub ::pclmulqdq
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &::data_byte(0x66,0x0f,0x3a,0x44,0xc0|($1<<3)|$2,$imm); }
+ else
+ { &::generic("pclmulqdq",@_); }
+}
+
+sub ::rdrand
+{ my ($dst)=@_;
+ if ($dst =~ /(e[a-dsd][ixp])/)
+ { &::data_byte(0x0f,0xc7,0xf0|$regrm{$dst}); }
+ else
+ { &::generic("rdrand",@_); }
+}
+
+sub ::rdseed
+{ my ($dst)=@_;
+ if ($dst =~ /(e[a-dsd][ixp])/)
+ { &::data_byte(0x0f,0xc7,0xf8|$regrm{$dst}); }
+ else
+ { &::generic("rdrand",@_); }
+}
+
+sub rxb {
+ local *opcode=shift;
+ my ($dst,$src1,$src2,$rxb)=@_;
+
+ $rxb|=0x7<<5;
+ $rxb&=~(0x04<<5) if($dst>=8);
+ $rxb&=~(0x01<<5) if($src1>=8);
+ $rxb&=~(0x02<<5) if($src2>=8);
+ push @opcode,$rxb;
+}