3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # SHA256/512 block procedure for PA-RISC.
14 # SHA256 performance is >75% better than gcc 3.2 generated code on
15 # PA-7100LC. Compared to code generated by vendor compiler this
16 # implementation is almost 70% faster in 64-bit build, but delivers
17 # virtually same performance in 32-bit build on PA-8600.
19 # SHA512 performance is >2.9x better than gcc 3.2 generated code on
20 # PA-7100LC, PA-RISC 1.1 processor. The implementation then detects if the
21 # code is executed on PA-RISC 2.0 processor and switches to 64-bit
22 # code path delivering adequate performance even in "blended" 32-bit
23 # build. Though 64-bit code is not any faster than code generated by
24 # vendor compiler on PA-8600...
26 # Special thanks to polarhome.com for providing an HP-UX account.
30 open STDOUT,">$output";
32 if ($flavour =~ /64/) {
52 if ($output =~ /512/) {
53 $func="sha512_block_data_order";
65 $func="sha256_block_data_order";
# Stack-frame geometry and register allocation used by the code generators
# below. $FRAME is the size of the register save area plus the frame marker;
# $XOFF is the offset (counted downward from %sp) of the local-variable area.
78 $FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
79 # [+ argument transfer]
80 $XOFF=16*$SZ+32; # local variables
82 $XOFF+=$FRAME_MARKER; # distance between %sp and local variables
# Incoming argument registers. The prologue spills all three to the frame
# ("save arguments") and they are reloaded after the rounds, because the
# registers themselves are reused as scratch, as the notes below indicate.
84 $ctx="%r26"; # zapped by $a0
85 $inp="%r25"; # zapped by $a1
86 $num="%r24"; # zapped by $t0
# The eight working variables. The round loops rotate @V by one position
# after each round (unshift/pop) instead of moving values between registers.
94 @V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
# Sixteen message-schedule words X[0..15]. The 17th entry aliases $inp: it
# is loaded with one extra input word that the _align step for X[15]
# combines with X[15]; that step is branched over when $inp is already aligned.
96 @X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
97 "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
100 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
102 _ror $e,$Sigma1[0],$a0
104 _ror $e,$Sigma1[1],$a1
108 _ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1
109 or $t0,$t1,$t1 ; Ch(e,f,g)
111 xor $a0,$a1,$a1 ; Sigma1(e)
113 _ror $a,$Sigma0[0],$a0
116 _ror $a,$Sigma0[1],$a1
120 _ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1
123 xor $a0,$a1,$a1 ; Sigma0(a)
125 xor $t1,$t0,$t0 ; Maj(a,b,c)
126 `"$LDM $SZ($Tbl),$t1" if ($i<15)`
134 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
137 _ror @X[($i+1)%16],$sigma0[0],$a0
138 _ror @X[($i+1)%16],$sigma0[1],$a1
139 addl @X[($i+9)%16],@X[$i],@X[$i]
140 _ror @X[($i+14)%16],$sigma1[0],$t0
141 _ror @X[($i+14)%16],$sigma1[1],$t1
143 _shr @X[($i+1)%16],$sigma0[2],$a1
145 _shr @X[($i+14)%16],$sigma1[2],$t1
146 xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f])
147 xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f])
149 addl $a0,@X[$i],@X[$i]
150 addl $t0,@X[$i],@X[$i]
152 $code.=<<___ if ($i==15);
154 comiclr,<> $LAST10BITS,$a1,%r0
155 ldo 1($Tbl),$Tbl ; signal end of $Tbl
157 &ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
163 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
168 $code.=<<___ if ($SZ==8);
169 .WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
170 .WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
171 .WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
172 .WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
173 .WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
174 .WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
175 .WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
176 .WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
177 .WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
178 .WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
179 .WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
180 .WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
181 .WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
182 .WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
183 .WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
184 .WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
185 .WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
186 .WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
187 .WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
188 .WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
189 .WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
190 .WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
191 .WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
192 .WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
193 .WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
194 .WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
195 .WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
196 .WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
197 .WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
198 .WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
199 .WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
200 .WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
201 .WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
202 .WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
203 .WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
204 .WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
205 .WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
206 .WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
207 .WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
208 .WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
210 $code.=<<___ if ($SZ==4);
211 .WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
212 .WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
213 .WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
214 .WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
215 .WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
216 .WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
217 .WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
218 .WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
219 .WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
220 .WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
221 .WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
222 .WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
223 .WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
224 .WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
225 .WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
226 .WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
230 .EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
234 .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
236 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
237 $PUSHMA %r3,$FRAME(%sp)
238 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
239 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
240 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
241 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
242 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
243 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
244 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
245 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
246 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
247 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
248 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
249 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
250 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
251 $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
252 $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
254 _shl $num,`log(16*$SZ)/log(2)`,$num
255 addl $inp,$num,$num ; $num to point at the end of $inp
257 $PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments
258 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
259 $PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
264 andcm $Tbl,$t1,$Tbl ; wipe privilege level
265 ldo L\$table-L\$pic($Tbl),$Tbl
267 $code.=<<___ if ($SZ==8 && $SIZE_T==4);
270 extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0
275 $LD `0*$SZ`($ctx),$A ; load context
284 extru $inp,31,`log($SZ)/log(2)`,$t0
287 mtctl $t0,%cr11 ; load %sar with align factor
292 andcm $inp,$t0,$t0 ; align $inp
294 for ($i=0;$i<15;$i++) { # load input block
295 $code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; }
297 cmpb,*= $inp,$t0,L\$aligned
298 $LD `$SZ*15`($t0),@X[15]
299 $LD `$SZ*16`($t0),@X[16]
301 for ($i=0;$i<16;$i++) { # align data
302 $code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; }
305 nop ; otherwise /usr/ccs/bin/as is confused by below .WORD
308 for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
311 nop ; otherwise /usr/ccs/bin/as is confused by below .WORD
313 for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
315 bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled?
318 $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
319 $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
320 $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
321 ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl
323 $LD `0*$SZ`($ctx),@X[0] ; load context
324 $LD `1*$SZ`($ctx),@X[1]
325 $LD `2*$SZ`($ctx),@X[2]
326 $LD `3*$SZ`($ctx),@X[3]
327 $LD `4*$SZ`($ctx),@X[4]
328 $LD `5*$SZ`($ctx),@X[5]
330 $LD `6*$SZ`($ctx),@X[6]
332 $LD `7*$SZ`($ctx),@X[7]
333 ldo `16*$SZ`($inp),$inp ; advance $inp
335 $ST $A,`0*$SZ`($ctx) ; save context
350 cmpb,*<>,n $inp,$num,L\$oop
351 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
353 if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0
363 @V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo,
364 $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
365 ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
366 "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
377 @X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx
379 sub ROUND_00_15_pa1 {
380 my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
381 $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
382 my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
384 $code.=<<___ if (!$flag);
385 ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
386 ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
389 shd $ehi,$elo,$Sigma1[0],$t0
391 shd $elo,$ehi,$Sigma1[0],$t1
392 addc $Xhi,$hhi,$hhi ; h += X[i]
393 shd $ehi,$elo,$Sigma1[1],$t2
395 shd $elo,$ehi,$Sigma1[1],$t3
396 ldw -4($Tbl),$Xlo ; load K[i]
401 shd $ehi,$elo,$Sigma1[2],$t2
403 shd $elo,$ehi,$Sigma1[2],$t3
406 xor $t3,$t1,$t1 ; Sigma1(e)
409 addc $Xhi,$hhi,$hhi ; h += K[i]
410 xor $a3,$a1,$a1 ; Ch(e,f,g)
413 shd $ahi,$alo,$Sigma0[0],$t0
414 addc $t1,$hhi,$hhi ; h += Sigma1(e)
415 shd $alo,$ahi,$Sigma0[0],$t1
417 shd $ahi,$alo,$Sigma0[1],$t2
418 addc $a1,$hhi,$hhi ; h += Ch(e,f,g)
419 shd $alo,$ahi,$Sigma0[1],$t3
423 shd $ahi,$alo,$Sigma0[2],$t2
425 shd $alo,$ahi,$Sigma0[2],$t3
428 xor $t3,$t1,$t1 ; Sigma0(a)
435 addc $hhi,$dhi,$dhi ; d += h
439 addc $t1,$hhi,$hhi ; h += Sigma0(a)
442 xor $a3,$a1,$a1 ; Maj(a,b,c)
443 addc $a1,$hhi,$hhi ; h += Maj(a,b,c)
446 $code.=<<___ if ($i==15 && $flag);
447 extru $Xlo,31,10,$Xlo
448 comiclr,= $LAST10BITS,$Xlo,%r0
452 push(@X,shift(@X)); push(@X,shift(@X));
455 sub ROUND_16_xx_pa1 {
456 my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
460 ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
461 ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
462 ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1
463 ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9]
464 ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3
465 ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14]
466 shd $Xnhi,$Xnlo,$sigma0[0],$t0
467 shd $Xnlo,$Xnhi,$sigma0[0],$t1
469 shd $Xnhi,$Xnlo,$sigma0[1],$t2
471 shd $Xnlo,$Xnhi,$sigma0[1],$t3
473 shd $Xnhi,$Xnlo,$sigma0[2],$t2
475 extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
477 shd $a3,$a2,$sigma1[0],$a0
478 xor $t3,$t1,$t1 ; sigma0(X[i+1)&0x0f])
479 shd $a2,$a3,$sigma1[0],$a1
481 shd $a3,$a2,$sigma1[1],$t2
483 shd $a2,$a3,$sigma1[1],$t3
485 shd $a3,$a2,$sigma1[2],$t2
487 extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
489 xor $t3,$a1,$a1 ; sigma0(X[i+14)&0x0f])
493 stw $Xhi,`-$XOFF+8*($i%16)`(%sp)
494 stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp)
496 &ROUND_00_15_pa1($i,@_,1);
499 ldw `0*4`($ctx),$Ahi ; load context
509 ldw `10*4`($ctx),$Fhi
510 ldw `11*4`($ctx),$Flo
511 ldw `12*4`($ctx),$Ghi
512 ldw `13*4`($ctx),$Glo
513 ldw `14*4`($ctx),$Hhi
514 ldw `15*4`($ctx),$Hlo
519 mtctl $t0,%cr11 ; load %sar with align factor
523 comib,= 0,$a3,L\$aligned_pa1
526 ldw `0*4`($inp),$X[0]
527 ldw `1*4`($inp),$X[1]
534 vshd $X[0],$X[1],$X[0]
536 stw $X[0],`-$XOFF+0*4`(%sp)
539 stw $X[1],`-$XOFF+1*4`(%sp)
544 my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
545 for ($i=2;$i<=(128/4-8);$i++) {
547 stw $t[0],`-$XOFF+$i*4`(%sp)
548 ldw `(8+$i)*4`($inp),$t[0]
549 vshd $t[1],$t[2],$t[1]
553 for (;$i<(128/4-1);$i++) {
555 stw $t[0],`-$XOFF+$i*4`(%sp)
556 vshd $t[1],$t[2],$t[1]
562 stw $t[0],`-$XOFF+$i*4`(%sp)
568 ldw `0*4`($inp),$X[0]
569 ldw `1*4`($inp),$X[1]
576 stw $X[0],`-$XOFF+0*4`(%sp)
578 stw $X[1],`-$XOFF+1*4`(%sp)
582 my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
583 for ($i=2;$i<(128/4-8);$i++) {
585 stw $t[0],`-$XOFF+$i*4`(%sp)
586 ldw `(8+$i)*4`($inp),$t[0]
590 for (;$i<128/4;$i++) {
592 stw $t[0],`-$XOFF+$i*4`(%sp)
596 $code.="L\$collected_pa1\n";
599 for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
600 $code.="L\$rounds_pa1\n";
601 for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
604 $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
605 $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
606 $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
607 ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl
609 ldw `0*4`($ctx),$t1 ; update context
648 ldo `16*$SZ`($inp),$inp ; advance $inp
652 stw $Fhi,`10*4`($ctx)
653 stw $Flo,`11*4`($ctx)
654 stw $Ghi,`12*4`($ctx)
655 stw $Glo,`13*4`($ctx)
656 stw $Hhi,`14*4`($ctx)
657 comb,= $inp,$num,L\$done
658 stw $Hlo,`15*4`($ctx)
660 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
665 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
666 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
667 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
668 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
669 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
670 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
671 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
672 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
673 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
674 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
675 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
676 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
677 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
678 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
679 $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
680 $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
683 $POPMB -$FRAME(%sp),%r3
685 .STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
688 # Explicitly encode PA-RISC 2.0 instructions used in this module, so
689 # that it can be compiled with .LEVEL 1.0. It should be noted that I
690 # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
694 my ($mod,$args) = @_;
695 my $orig = "ldd$mod\t$args";
697 if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
698 { my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1);
699 $opcode|=(1<<3) if ($mod =~ /^,m/);
700 $opcode|=(1<<2) if ($mod =~ /^,mb/);
701 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
707 my ($mod,$args) = @_;
708 my $orig = "std$mod\t$args";
710 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
711 { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
712 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
718 my ($mod,$args) = @_;
719 my $orig = "extrd$mod\t$args";
721 # I only have ",u" completer, it's implicitly encoded...
722 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
723 { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
725 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
726 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
727 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
729 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
730 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
732 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
733 $opcode |= (1<<13) if ($mod =~ /,\**=/);
734 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
740 my ($mod,$args) = @_;
741 my $orig = "shrpd$mod\t$args";
743 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
744 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
746 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
747 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
749 elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
750 { sprintf "\t.WORD\t0x%08x\t; %s",
751 (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
757 my ($mnemonic,$mod,$args)=@_;
758 my $opcode = eval("\$$mnemonic");
760 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
763 foreach (split("\n",$code)) {
764 s/\`([^\`]*)\`/eval $1/ge;
766 s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
767 $3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32
768 : sprintf("shd\t%$1,%$2,%d",$3)/e or
769 # translate made up instructions: _ror, _shr, _align, _shl
770 s/_ror(\s+)(%r[0-9]+),/
771 ($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or
773 s/_shr(\s+%r[0-9]+),([0-9]+),/
774 $SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
775 : sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or
777 s/_align(\s+%r[0-9]+,%r[0-9]+),/
778 ($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or
780 s/_shl(\s+%r[0-9]+),([0-9]+),/
781 $SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
782 : sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;
784 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);
786 s/cmpb,\*/comb,/ if ($SIZE_T==4);
788 s/\bbv\b/bve/ if ($SIZE_T==8);