Add IALU-only bn_mul_mont for SPARCv9. See commentary section for details.
author Andy Polyakov <appro@openssl.org>
Thu, 15 Dec 2005 22:43:33 +0000 (22:43 +0000)
committer Andy Polyakov <appro@openssl.org>
Thu, 15 Dec 2005 22:43:33 +0000 (22:43 +0000)
crypto/bn/asm/sparcv9-mont.pl [new file with mode: 0644]

diff --git a/crypto/bn/asm/sparcv9-mont.pl b/crypto/bn/asm/sparcv9-mont.pl
new file mode 100644 (file)
index 0000000..0339bfe
--- /dev/null
@@ -0,0 +1,623 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. Rights for redistribution and usage in source and binary
+# forms are granted according to the OpenSSL license.
+# ====================================================================
+
+# December 2005
+#
+# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
+# for undertaking this effort are multiple. First of all, UltraSPARC is
+# not the whole SPARCv9 universe and other VIS-free implementations
+# deserve optimized code just as much. Secondly, the newly introduced
+# UltraSPARC T1, a.k.a. Niagara, has a shared FPU, and concurrent
+# FPU-intensive paths, such as sparcv9a-mont, will simply sink it.
+# Yes, T1 is equipped with several integrated RSA/DSA accelerator
+# circuits accessible through a kernel driver [only(*)], but having a
+# decent user-land software implementation is important too. Finally,
+# there was the desire to experiment with a dedicated squaring
+# procedure. This module implements one, because it was easiest to
+# draft in SPARCv9 instructions...
+
+# (*)  An ENGINE accessing the driver in question is on my TODO list.
+#      For reference, the accelerator is estimated to give a 6 to 10
+#      times improvement on single-threaded RSA sign. It should be
+#      noted that the 6-10x improvement coefficient does not actually
+#      mean anything extraordinary in terms of absolute [single-
+#      threaded] performance, as the SPARCv9 instruction set is by all
+#      means the least suitable for high-performance crypto among
+#      64-bit platforms. The 6-10x factor simply places T1 in the same
+#      performance domain as, say, AMD64 and IA-64. The improvement in
+#      RSA verify doesn't appear impressive at all, but it's the sign
+#      operation which is far more critical/interesting.
+
+# You might notice that the inner loops are modulo-scheduled:-) This
+# has essentially negligible impact on UltraSPARC performance; it's
+# Fujitsu SPARC64 V users who should notice and hopefully appreciate
+# the advantage... Currently this module surpasses sparcv9a-mont.pl
+# by ~20% on UltraSPARC-III and later cores, but recall that the
+# sparcv9a module still has hidden potential [see the TODO list
+# there], which is estimated to be larger than 20%...
+
+# int bn_mul_mont(
+$rp="%i0";     # BN_ULONG *rp,
+$ap="%i1";     # const BN_ULONG *ap,
+$bp="%i2";     # const BN_ULONG *bp,
+$np="%i3";     # const BN_ULONG *np,
+$n0="%i4";     # const BN_ULONG *n0,
+$num="%i5";    # int num);
+
+$bits=32;
+for (@ARGV)    { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+if ($bits==64) { $bias=2047; $frame=192; }
+else           { $bias=0;    $frame=128; }
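+# [the 64-bit ABI keeps %sp biased by 2047 bytes, which is what $bias
+#  above accounts for; $frame covers the register window save area and
+#  outgoing argument slots mandated by the corresponding ABI]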
+
+$car0="%o0";
+$car1="%o1";
+$car2="%o2";   # 1 bit
+$acc0="%o3";
+$acc1="%o4";
+$mask="%g1";   # 32 bits, what a waste...
+$tmp0="%g4";
+$tmp1="%g5";
+
+$i="%l0";
+$j="%l1";
+$mul0="%l2";
+$mul1="%l3";
+$tp="%l4";
+$apj="%l5";
+$npj="%l6";
+$tpj="%l7";
+
+$fname="bn_mul_mont";
+
+$code=<<___;
+.section       ".text",#alloc,#execinstr
+
+.global        $fname
+.align 32
+$fname:
+       cmp     %o5,4                   ! 128 bits minimum
+       bge,pt  %icc,.Lenter
+       sethi   %hi(0xffffffff),$mask
+       retl
+       clr     %o0                     ! return 0, i.e. "not handled"
+.align 32
+.Lenter:
+       save    %sp,-$frame,%sp
+       sll     $num,2,$num             ! num*=4
+       or      $mask,%lo(0xffffffff),$mask
+       ld      [$n0],$n0
+       cmp     $ap,$bp                 ! squaring, i.e. ap==bp?
+       and     $num,$mask,$num
+       ld      [$bp],$mul0             ! bp[0]
+       be,pt   `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
+       nop
+
+       add     %sp,$bias,%o7           ! real top of stack
+       ld      [$ap],$car0             ! ap[0]
+       sub     %o7,$num,%o7
+       ld      [$ap+4],$apj            ! ap[1]
+       and     %o7,-1024,%o7
+       ld      [$np],$car1             ! np[0]
+       sub     %o7,$bias,%sp           ! alloca
+       ld      [$np+4],$npj            ! np[1]
+       mov     12,$j
+
+       mulx    $car0,$mul0,$car0       ! ap[0]*bp[0]
+       mulx    $apj,$mul0,$tmp0        !prologue! ap[1]*bp[0]
+       and     $car0,$mask,$acc0
+       add     %sp,$bias+$frame,$tp
+       ld      [$ap+8],$apj            !prologue!
+
+       mulx    $n0,$acc0,$mul1         ! "t[0]"*n0
+       and     $mul1,$mask,$mul1
+
+       mulx    $car1,$mul1,$car1       ! np[0]*"t[0]"*n0
+       mulx    $npj,$mul1,$acc1        !prologue! np[1]*"t[0]"*n0
+       srlx    $car0,32,$car0
+       add     $acc0,$car1,$car1
+       ld      [$np+8],$npj            !prologue!
+       srlx    $car1,32,$car1
+       mov     $tmp0,$acc0             !prologue!
+
+.L1st:
+       mulx    $apj,$mul0,$tmp0
+       mulx    $npj,$mul1,$tmp1
+       add     $acc0,$car0,$car0
+       ld      [$ap+$j],$apj           ! ap[j]
+       and     $car0,$mask,$acc0
+       add     $acc1,$car1,$car1
+       ld      [$np+$j],$npj           ! np[j]
+       srlx    $car0,32,$car0
+       add     $acc0,$car1,$car1
+       add     $j,4,$j                 ! j++
+       mov     $tmp0,$acc0
+       st      $car1,[$tp]
+       cmp     $j,$num
+       mov     $tmp1,$acc1
+       srlx    $car1,32,$car1
+       bl      %icc,.L1st
+       add     $tp,4,$tp               ! tp++
+!.L1st
+
+       mulx    $apj,$mul0,$tmp0        !epilogue!
+       mulx    $npj,$mul1,$tmp1
+       add     $acc0,$car0,$car0
+       and     $car0,$mask,$acc0
+       add     $acc1,$car1,$car1
+       srlx    $car0,32,$car0
+       add     $acc0,$car1,$car1
+       st      $car1,[$tp]
+       srlx    $car1,32,$car1
+
+       add     $tmp0,$car0,$car0
+       and     $car0,$mask,$acc0
+       add     $tmp1,$car1,$car1
+       srlx    $car0,32,$car0
+       add     $acc0,$car1,$car1
+       st      $car1,[$tp+4]
+       srlx    $car1,32,$car1
+
+       add     $car0,$car1,$car1
+       st      $car1,[$tp+8]
+       srlx    $car1,32,$car2
+
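+       ! first pass, i=0, is complete; enter the common outer loop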
+       mov     4,$i                    ! i++
+       ld      [$bp+4],$mul0           ! bp[1]
+.Louter:
+       add     %sp,$bias+$frame,$tp
+       ld      [$ap],$car0             ! ap[0]
+       ld      [$ap+4],$apj            ! ap[1]
+       ld      [$np],$car1             ! np[0]
+       ld      [$np+4],$npj            ! np[1]
+       ld      [$tp],$tmp1             ! tp[0]
+       ld      [$tp+4],$tpj            ! tp[1]
+       mov     12,$j
+
+       mulx    $car0,$mul0,$car0
+       mulx    $apj,$mul0,$tmp0        !prologue!
+       add     $tmp1,$car0,$car0
+       ld      [$ap+8],$apj            !prologue!
+       and     $car0,$mask,$acc0
+
+       mulx    $n0,$acc0,$mul1
+       and     $mul1,$mask,$mul1
+
+       mulx    $car1,$mul1,$car1
+       mulx    $npj,$mul1,$acc1        !prologue!
+       srlx    $car0,32,$car0
+       add     $acc0,$car1,$car1
+       ld      [$np+8],$npj            !prologue!
+       srlx    $car1,32,$car1
+       mov     $tmp0,$acc0             !prologue!
+
+.Linner:
+       mulx    $apj,$mul0,$tmp0
+       mulx    $npj,$mul1,$tmp1
+       add     $tpj,$car0,$car0
+       ld      [$ap+$j],$apj           ! ap[j]
+       add     $acc0,$car0,$car0
+       add     $acc1,$car1,$car1
+       ld      [$np+$j],$npj           ! np[j]
+       and     $car0,$mask,$acc0
+       ld      [$tp+8],$tpj            ! tp[j]
+       srlx    $car0,32,$car0
+       add     $acc0,$car1,$car1
+       add     $j,4,$j                 ! j++
+       mov     $tmp0,$acc0
+       st      $car1,[$tp]             ! tp[j-1]
+       srlx    $car1,32,$car1
+       mov     $tmp1,$acc1
+       cmp     $j,$num
+       bl      %icc,.Linner
+       add     $tp,4,$tp               ! tp++
+!.Linner
+
+       mulx    $apj,$mul0,$tmp0        !epilogue!
+       mulx    $npj,$mul1,$tmp1
+       add     $tpj,$car0,$car0
+       add     $acc0,$car0,$car0
+       ld      [$tp+8],$tpj            ! tp[j]
+       and     $car0,$mask,$acc0
+       add     $acc1,$car1,$car1
+       srlx    $car0,32,$car0
+       add     $acc0,$car1,$car1
+       st      $car1,[$tp]             ! tp[j-1]
+       srlx    $car1,32,$car1
+
+       add     $tpj,$car0,$car0
+       add     $tmp0,$car0,$car0
+       and     $car0,$mask,$acc0
+       add     $tmp1,$car1,$car1
+       add     $acc0,$car1,$car1
+       st      $car1,[$tp+4]           ! tp[j-1]
+       srlx    $car0,32,$car0
+       add     $i,4,$i                 ! i++
+       srlx    $car1,32,$car1
+
+       add     $car0,$car1,$car1
+       cmp     $i,$num
+       add     $car2,$car1,$car1
+       st      $car1,[$tp+8]
+
+       srlx    $car1,32,$car2
+       bl,a    %icc,.Louter
+       ld      [$bp+$i],$mul0          ! bp[i]
+!.Louter
+
+       add     $tp,12,$tp
+
+.Ltail:
+       add     $np,$num,$np
+       add     $rp,$num,$rp
+
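+       ! Final reduction: if the result is not less than the modulus,
+       ! np[] has to be subtracted. A set top carry bit, $car2, settles
+       ! it right away; otherwise the top-most words give a fast verdict,
+       ! with .Lsub itself falling through to .Lcopy should the full
+       ! subtraction borrow.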
+       cmp     $car2,0                 ! clears %icc.c
+       bne,pn  %icc,.Lsub
+       sub     %g0,$num,%o7            ! k=-num
+
+       cmp     $car1,$npj              ! compare top-most $tp and $np words
+       bcs,pt  %icc,.Lcopy             ! %icc.c is clean if not taken
+       nop
+
+.align 16,0x1000000
+.Lsub:
+       ld      [$tp+%o7],%o0
+       ld      [$np+%o7],%o1
+       subccc  %o0,%o1,%o1
+       st      %o1,[$rp+%o7]
+       add     %o7,4,%o7
+       brnz    %o7,.Lsub
+       nop
+       subccc  $car2,0,$car2
+       bcc     %icc,.Lzap
+       sub     %g0,$num,%o7
+
+.align 16,0x1000000
+.Lcopy:
+       ld      [$tp+%o7],%o0
+       st      %o0,[$rp+%o7]
+       add     %o7,4,%o7
+       brnz    %o7,.Lcopy
+       nop
+       ba      .Lzap
+       sub     %g0,$num,%o7
+
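+! zero the on-stack temporary vector before returning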
+.align 32
+.Lzap:
+       st      %g0,[$tp+%o7]
+       add     %o7,4,%o7
+       brnz    %o7,.Lzap
+       nop
+       mov     1,%i0
+       ret
+       restore
+___
+
+########
+######## bn_sqr_mont gives up to 20% improvement over the above code
+########
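+# The improvement comes from the symmetry ap[i]*ap[j]==ap[j]*ap[i]:
+# every off-diagonal 32x32 product is computed only once and then
+# doubled, replacing almost half of the multiplications with one-bit
+# shifts. The doubling is performed on 32-bit chunks, as in the hedged
+# fragment below [illustrative names, not actual code]:
+#
+#	acc  = acc*2 | sbit;	/* double chunk, pull in bit from below */
+#	sbit = acc >> 32;	/* bit shifted out, for the next chunk   */
+#	acc &= 0xffffffff;
+#
+# Diagonal products ap[i]*ap[i] are not to be doubled, which is why
+# their high halves are pre-halved in the prologues [see 'srlx
+# $car0,1,$car0'], the bit shifted out seeding $sbit.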
+$sbit="%i2";           # re-use $bp!
+
+$code.=<<___;
+.align 32
+.Lbn_sqr_mont:
+       add     %sp,$bias,%o7                   ! real top of stack
+       ld      [$ap+4],$apj                    ! ap[1]
+       sub     %o7,$num,%o7
+       ld      [$np],$car1                     ! np[0]
+       and     %o7,-1024,%o7
+       ld      [$np+4],$npj                    ! np[1]
+       sub     %o7,$bias,%sp                   ! alloca
+       mov     12,$j
+
+       mulx    $mul0,$mul0,$car0               ! ap[0]*ap[0]
+       mulx    $apj,$mul0,$tmp0                !prologue!
+       and     $car0,$mask,$acc0
+       add     %sp,$bias+$frame,$tp
+       ld      [$ap+8],$apj                    !prologue!
+
+       mulx    $n0,$acc0,$mul1                 ! "t[0]"*n0
+       srlx    $car0,32,$car0
+       and     $mul1,$mask,$mul1
+
+       mulx    $car1,$mul1,$car1               ! np[0]*"t[0]"*n0
+       mulx    $npj,$mul1,$acc1                !prologue!
+       and     $car0,1,$sbit
+       ld      [$np+8],$npj                    !prologue!
+       srlx    $car0,1,$car0
+       add     $acc0,$car1,$car1
+       srlx    $car1,32,$car1
+       mov     $tmp0,$acc0                     !prologue!
+
+.Lsqr_1st:
+       mulx    $apj,$mul0,$tmp0
+       mulx    $npj,$mul1,$tmp1
+       add     $acc0,$car0,$car0               ! ap[j]*a0+c0
+       add     $acc1,$car1,$car1
+       ld      [$ap+$j],$apj                   ! ap[j]
+       and     $car0,$mask,$acc0
+       ld      [$np+$j],$npj                   ! np[j]
+       srlx    $car0,32,$car0
+       add     $acc0,$acc0,$acc0
+       or      $sbit,$acc0,$acc0
+       mov     $tmp1,$acc1
+       srlx    $acc0,32,$sbit
+       add     $j,4,$j                         ! j++
+       and     $acc0,$mask,$acc0
+       cmp     $j,$num
+       add     $acc0,$car1,$car1
+       st      $car1,[$tp]
+       mov     $tmp0,$acc0
+       srlx    $car1,32,$car1
+       bl      %icc,.Lsqr_1st
+       add     $tp,4,$tp                       ! tp++
+!.Lsqr_1st
+
+       mulx    $apj,$mul0,$tmp0                !epilogue!
+       mulx    $npj,$mul1,$tmp1
+       add     $acc0,$car0,$car0               ! ap[j]*a0+c0
+       add     $acc1,$car1,$car1
+       and     $car0,$mask,$acc0
+       srlx    $car0,32,$car0
+       add     $acc0,$acc0,$acc0
+       or      $sbit,$acc0,$acc0
+       srlx    $acc0,32,$sbit
+       and     $acc0,$mask,$acc0
+       add     $acc0,$car1,$car1
+       st      $car1,[$tp]
+       srlx    $car1,32,$car1
+
+       add     $tmp0,$car0,$car0               ! ap[j]*a0+c0
+       add     $tmp1,$car1,$car1
+       and     $car0,$mask,$acc0
+       srlx    $car0,32,$car0
+       add     $acc0,$acc0,$acc0
+       or      $sbit,$acc0,$acc0
+       srlx    $acc0,32,$sbit
+       and     $acc0,$mask,$acc0
+       add     $acc0,$car1,$car1
+       st      $car1,[$tp+4]
+       srlx    $car1,32,$car1
+
+       add     $car0,$car0,$car0
+       or      $sbit,$car0,$car0
+       add     $car0,$car1,$car1
+       st      $car1,[$tp+8]
+       srlx    $car1,32,$car2
+
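+       ! the i=1 pass is peeled off and laid out explicitly, with
+       ! .Lsqr_2nd as its inner loop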
+       ld      [%sp+$bias+$frame],$tmp0        ! tp[0]
+       ld      [%sp+$bias+$frame+4],$tmp1      ! tp[1]
+       ld      [%sp+$bias+$frame+8],$tpj       ! tp[2]
+       ld      [$ap+4],$mul0                   ! ap[1]
+       ld      [$ap+8],$apj                    ! ap[2]
+       ld      [$np],$car1                     ! np[0]
+       ld      [$np+4],$npj                    ! np[1]
+       mulx    $n0,$tmp0,$mul1
+
+       mulx    $mul0,$mul0,$car0
+       and     $mul1,$mask,$mul1
+
+       mulx    $car1,$mul1,$car1
+       mulx    $npj,$mul1,$acc1
+       add     $tmp0,$car1,$car1
+       and     $car0,$mask,$acc0
+       ld      [$np+8],$npj                    ! np[2]
+       srlx    $car1,32,$car1
+       add     $tmp1,$car1,$car1
+       srlx    $car0,32,$car0
+       add     $acc0,$car1,$car1
+       and     $car0,1,$sbit
+       add     $acc1,$car1,$car1
+       srlx    $car0,1,$car0
+       mov     12,$j
+       st      $car1,[%sp+$bias+$frame]        ! tp[0]=
+       srlx    $car1,32,$car1
+       add     %sp,$bias+$frame+4,$tp
+
+.Lsqr_2nd:
+       mulx    $apj,$mul0,$acc0
+       mulx    $npj,$mul1,$acc1
+       add     $acc0,$car0,$car0
+       add     $tpj,$car1,$car1
+       ld      [$ap+$j],$apj                   ! ap[j]
+       and     $car0,$mask,$acc0
+       ld      [$np+$j],$npj                   ! np[j]
+       srlx    $car0,32,$car0
+       add     $acc1,$car1,$car1
+       ld      [$tp+8],$tpj                    ! tp[j]
+       add     $acc0,$acc0,$acc0
+       add     $j,4,$j                         ! j++
+       or      $sbit,$acc0,$acc0
+       srlx    $acc0,32,$sbit
+       and     $acc0,$mask,$acc0
+       cmp     $j,$num
+       add     $acc0,$car1,$car1
+       st      $car1,[$tp]                     ! tp[j-1]
+       srlx    $car1,32,$car1
+       bl      %icc,.Lsqr_2nd
+       add     $tp,4,$tp                       ! tp++
+!.Lsqr_2nd
+
+       mulx    $apj,$mul0,$acc0
+       mulx    $npj,$mul1,$acc1
+       add     $acc0,$car0,$car0
+       add     $tpj,$car1,$car1
+       and     $car0,$mask,$acc0
+       srlx    $car0,32,$car0
+       add     $acc1,$car1,$car1
+       add     $acc0,$acc0,$acc0
+       or      $sbit,$acc0,$acc0
+       srlx    $acc0,32,$sbit
+       and     $acc0,$mask,$acc0
+       add     $acc0,$car1,$car1
+       st      $car1,[$tp]                     ! tp[j-1]
+       srlx    $car1,32,$car1
+
+       add     $car0,$car0,$car0
+       or      $sbit,$car0,$car0
+       add     $car0,$car1,$car1
+       add     $car2,$car1,$car1
+       st      $car1,[$tp+4]
+       srlx    $car1,32,$car2
+
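+       ! from here on, each pass i re-reads tp[0], derives a fresh
+       ! Montgomery multiplier from it, catches up on the pending
+       ! np[]*"tp[0]"*n0 additions below the diagonal [.Lsqr_inner1],
+       ! and only then resumes the doubled ap[j]*ap[i] products
+       ! [.Lsqr_inner2]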
+       ld      [%sp+$bias+$frame],$tmp1        ! tp[0]
+       ld      [%sp+$bias+$frame+4],$tpj       ! tp[1]
+       ld      [$ap+8],$mul0                   ! ap[2]
+       ld      [$np],$car1                     ! np[0]
+       ld      [$np+4],$npj                    ! np[1]
+       mulx    $n0,$tmp1,$mul1
+       and     $mul1,$mask,$mul1
+       mov     8,$i
+
+       mulx    $mul0,$mul0,$car0
+       mulx    $car1,$mul1,$car1
+       and     $car0,$mask,$acc0
+       add     $tmp1,$car1,$car1
+       srlx    $car0,32,$car0
+       add     %sp,$bias+$frame,$tp
+       srlx    $car1,32,$car1
+       and     $car0,1,$sbit
+       srlx    $car0,1,$car0
+       mov     4,$j
+
+.Lsqr_outer:
+.Lsqr_inner1:
+       mulx    $npj,$mul1,$acc1
+       add     $tpj,$car1,$car1
+       add     $j,4,$j
+       ld      [$tp+8],$tpj
+       cmp     $j,$i
+       add     $acc1,$car1,$car1
+       ld      [$np+$j],$npj
+       st      $car1,[$tp]
+       srlx    $car1,32,$car1
+       bl      %icc,.Lsqr_inner1
+       add     $tp,4,$tp
+!.Lsqr_inner1
+
+       add     $j,4,$j
+       ld      [$ap+$j],$apj                   ! ap[j]
+       mulx    $npj,$mul1,$acc1
+       add     $tpj,$car1,$car1
+       ld      [$np+$j],$npj                   ! np[j]
+       add     $acc0,$car1,$car1
+       ld      [$tp+8],$tpj                    ! tp[j]
+       add     $acc1,$car1,$car1
+       st      $car1,[$tp]
+       srlx    $car1,32,$car1
+
+       add     $j,4,$j
+       cmp     $j,$num
+       be,pn   %icc,.Lsqr_no_inner2
+       add     $tp,4,$tp
+
+.Lsqr_inner2:
+       mulx    $apj,$mul0,$acc0
+       mulx    $npj,$mul1,$acc1
+       add     $tpj,$car1,$car1
+       add     $acc0,$car0,$car0
+       ld      [$ap+$j],$apj                   ! ap[j]
+       and     $car0,$mask,$acc0
+       ld      [$np+$j],$npj                   ! np[j]
+       srlx    $car0,32,$car0
+       add     $acc0,$acc0,$acc0
+       ld      [$tp+8],$tpj                    ! tp[j]
+       or      $sbit,$acc0,$acc0
+       add     $j,4,$j                         ! j++
+       srlx    $acc0,32,$sbit
+       and     $acc0,$mask,$acc0
+       cmp     $j,$num
+       add     $acc0,$car1,$car1
+       add     $acc1,$car1,$car1
+       st      $car1,[$tp]                     ! tp[j-1]
+       srlx    $car1,32,$car1
+       bl      %icc,.Lsqr_inner2
+       add     $tp,4,$tp                       ! tp++
+
+.Lsqr_no_inner2:
+       mulx    $apj,$mul0,$acc0
+       mulx    $npj,$mul1,$acc1
+       add     $tpj,$car1,$car1
+       add     $acc0,$car0,$car0
+       and     $car0,$mask,$acc0
+       srlx    $car0,32,$car0
+       add     $acc0,$acc0,$acc0
+       or      $sbit,$acc0,$acc0
+       srlx    $acc0,32,$sbit
+       and     $acc0,$mask,$acc0
+       add     $acc0,$car1,$car1
+       add     $acc1,$car1,$car1
+       st      $car1,[$tp]                     ! tp[j-1]
+       srlx    $car1,32,$car1
+
+       add     $car0,$car0,$car0
+       or      $sbit,$car0,$car0
+       add     $car0,$car1,$car1
+       add     $car2,$car1,$car1
+       st      $car1,[$tp+4]
+       srlx    $car1,32,$car2
+
+       add     $i,4,$i                         ! i++
+       ld      [%sp+$bias+$frame],$tmp1        ! tp[0]
+       ld      [%sp+$bias+$frame+4],$tpj       ! tp[1]
+       ld      [$ap+$i],$mul0                  ! ap[j]
+       ld      [$np],$car1                     ! np[0]
+       ld      [$np+4],$npj                    ! np[1]
+       mulx    $n0,$tmp1,$mul1
+       and     $mul1,$mask,$mul1
+       add     $i,4,$tmp0
+
+       mulx    $mul0,$mul0,$car0
+       mulx    $car1,$mul1,$car1
+       and     $car0,$mask,$acc0
+       add     $tmp1,$car1,$car1
+       srlx    $car0,32,$car0
+       add     %sp,$bias+$frame,$tp
+       srlx    $car1,32,$car1
+       and     $car0,1,$sbit
+       srlx    $car0,1,$car0
+
+       cmp     $tmp0,$num                      ! i<num-1
+       bl      %icc,.Lsqr_outer
+       mov     4,$j
+
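+       ! the last pass, i=num-1, has no off-diagonal products left:
+       ! only the below-diagonal reduction terms and the diagonal square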
+.Lsqr_last:
+       mulx    $npj,$mul1,$acc1
+       add     $tpj,$car1,$car1
+       add     $j,4,$j
+       ld      [$tp+8],$tpj
+       cmp     $j,$i
+       add     $acc1,$car1,$car1
+       ld      [$np+$j],$npj
+       st      $car1,[$tp]
+       srlx    $car1,32,$car1
+       bl      %icc,.Lsqr_last
+       add     $tp,4,$tp
+!.Lsqr_last
+
+       mulx    $npj,$mul1,$acc1
+       add     $tpj,$car1,$car1
+       add     $acc0,$car1,$car1
+       add     $acc1,$car1,$car1
+       st      $car1,[$tp]
+       srlx    $car1,32,$car1
+
+       add     $car0,$car0,$car0               ! recover $car0
+       or      $sbit,$car0,$car0
+       add     $car0,$car1,$car1
+       add     $car2,$car1,$car1
+       st      $car1,[$tp+4]
+       srlx    $car1,32,$car2
+
+       ba      .Ltail
+       add     $tp,8,$tp
+.type  $fname,#function
+.size  $fname,(.-$fname)
+___
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;