crypto/bn/asm/parisc-mont.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16
  17 # On PA-7100LC this module performs ~90-50% better, less for longer
  18 # keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
  19 # that compiler utilized xmpyu instruction to perform 32x32=64-bit
  20 # multiplication, which in turn means that "baseline" performance was
  21 # optimal in respect to instruction set capabilities. Fair comparison
  22 # with vendor compiler is problematic, because OpenSSL doesn't define
  23 # BN_LLONG [presumably] for historical reasons, which drives compiler
  24 # toward 4 times 16x16=32-bit multiplications [plus complementary
  25 # shifts and additions] instead. This means that you should observe
  26 # several times improvement over code generated by vendor compiler
  27 # for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
  28 # improvement coefficient was never collected on PA-7100LC, or any
  29 # other 1.1 CPU, because I don't have access to such a machine with
  30 # vendor compiler. But to give you a taste, PA-RISC 1.1 code path
  31 # reportedly outperformed code generated by cc +DA1.1 +O3 by factor
  32 # of ~5x on PA-8600.
  33 #
  34 # On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
  35 # reportedly ~2x faster than vendor compiler generated code [according
  36 # to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
  37 # this implementation is actually 32-bit one, in the sense that it
  38 # operates on 32-bit values. But pa-risc2[W].s operates on arrays of
  39 # 64-bit BN_LONGs... How do they interoperate then? No problem. This
  40 # module picks halves of 64-bit values in reverse order and pretends
  41 # they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
  42 # 64-bit code such as pa-risc2[W].s then? Well, the thing is that
  43 # 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
  44 # i.e. there is no "wider" multiplication like on most other 64-bit
  45 # platforms. This means that even being effectively 32-bit, this
  46 # implementation performs "64-bit" computational task in same amount
  47 # of arithmetic operations, most notably multiplications. It requires
  48 # more memory references, most notably to tp[num], but this doesn't
  49 # seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
  50 # 2.0 code path provides virtually same performance as pa-risc2[W].s:
  51 # it's ~10% better for shortest key length and ~10% worse for longest
  52 # one.
  53 #
  54 # In case it wasn't clear, the module has two distinct code paths:
  55 # PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
  56 # additions and 64-bit integer loads, not to mention specific
  57 # instruction scheduling. In 64-bit build naturally only 2.0 code path
  58 # is assembled. In 32-bit application context both code paths are
  59 # assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
  60 # is taken automatically. Also, in 32-bit build the module imposes
  61 # a couple of limitations: vector lengths have to be even and vector
  62 # addresses have to be 64-bit aligned. Normally neither is a problem:
  63 # most common key lengths are even and vectors are commonly malloc-ed,
  64 # which ensures alignment.
  65 #
  66 # Special thanks to polarhome.com for providing HP-UX account on
  67 # PA-RISC 1.1 machine, and to correspondent who chose to remain
  68 # anonymous for testing the code on PA-RISC 2.0 machine.
  69 \f
  70 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;      # directory part of this script's path, trailing separator included
  71 # Command line: <flavour> [<output file>].
  72 $flavour = shift;
  73 $output = shift;
  74 # Redirect STDOUT only when an output file was named; abort instead of silently writing elsewhere if it cannot be opened.
  75 open STDOUT,">$output" or die "can't open $output: $!" if (defined($output) && length($output));
  76
  77 if ($flavour =~ /64/) {   # flavour string contains "64": 64-bit build
  78         $LEVEL          ="2.0W";   # operand of the .LEVEL directive emitted below
  79         $SIZE_T         =8;        # bytes per saved-register slot in the prologue
  80         $FRAME_MARKER   =80;       # frame marker size, part of $FRAME
  81         $SAVED_RP       =16;       # %r2 is stored at -$SAVED_RP(%sp) on entry
  82         $PUSH           ="std";    # store mnemonic used for register saves
  83         $PUSHMA         ="std,ma"; # store that also moves %sp by $FRAME
  84         $POP            ="ldd";
  85         $POPMB          ="ldd,mb";
  86         $BN_SZ          =$SIZE_T;  # limb size in bytes; selects the BN_SZ==4 / BN_SZ==8 snippets
  87 } else {                  # any other flavour: 32-bit build
  88         $LEVEL          ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
  89         $SIZE_T         =4;
  90         $FRAME_MARKER   =48;
  91         $SAVED_RP       =20;
  92         $PUSH           ="stw";
  93         $PUSHMA         ="stwm";
  94         $POP            ="ldw";
  95         $POPMB          ="ldwm";
  96         $BN_SZ          =$SIZE_T;
  97         if (open CONF,"<${dir}../../opensslconf.h") {     # optional: skipped silently when the header cannot be opened
  98             while(<CONF>) {
  99                 if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
 100                     $BN_SZ=8;          # 64-bit limbs inside a 32-bit build
 101                     $LEVEL="2.0";      # and assemble at PA-RISC 2.0 level
 102                     last;
 103                 }
 104             }
 105             close CONF;
 106         }
 107 }
 108
 109 $FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker
 110                                 #                [+ argument transfer]
 111 $LOCALS=$FRAME-$FRAME_MARKER;   # offset from $fp of the local area, just past the 8 saved-register slots
 112 $FRAME+=32;                     # local variables
 113 # Integer scratch: $tp walks tp[]; $ti0/$ti1 receive words loaded from tp[] (%r28 also carries the "unhandled" return code).
 114 $tp="%r31";
 115 $ti1="%r29";
 116 $ti0="%r28";
 117 # bn_mul_mont arguments and loop bookkeeping.
 118 $rp="%r26";     # result vector
 119 $ap="%r25";     # a[] operand of ap[j]*bp[i]
 120 $bp="%r24";     # b[] operand of ap[j]*bp[i]
 121 $np="%r23";     # n[] operand of np[j]*m
 122 $n0="%r22";     # passed through stack in 32-bit
 123 $num="%r21";    # passed through stack in 32-bit
 124 $idx="%r20";    # j, kept as a negative byte offset that counts up to zero
 125 $arrsz="%r19";  # vector size in bytes (4*$num, computed with sh2addl)
 126 # 64-bit products reloaded from the transfer area: np[j]*m in $nm0/$nm1, ap[j]*bp[i] in $ab0/$ab1.
 127 $nm1="%r7";
 128 $nm0="%r6";
 129 $ab1="%r5";
 130 $ab0="%r4";
 131 # $fp anchors the frame; $hi0/$hi1 hold the upper 32 bits extracted from the running sums.
 132 $fp="%r3";
 133 $hi1="%r2";
 134 $hi0="%r1";
 135 # $xfer shares n0's register; n0 is loaded into $fn0 before $xfer is pointed at the locals.
 136 $xfer=$n0;      # accommodates [-16..15] offset in fld[dw]s
 137 # Floating-point registers feeding xmpyu.
 138 $fm0="%fr4";    $fti=$fm0;      # m; $fti aliases the same register for tp[0]
 139 $fbi="%fr5L";   # bp[i]
 140 $fn0="%fr5R";   # n0
 141 $fai="%fr6";    $fab0="%fr7";   $fab1="%fr8";   # ap[j,j+1] and its two products
 142 $fni="%fr9";    $fnm0="%fr10";  $fnm1="%fr11";  # np[j,j+1] and its two products
 143
 144 $code=<<___;    # assembler header, bn_mul_mont entry point and standard prologue (saves %r2-%r10, sets $fp)
 145         .LEVEL  $LEVEL
 146         .SPACE  \$TEXT\$
 147         .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
 148
 149         .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
 150         .ALIGN  64
 151 bn_mul_mont
 152         .PROC
 153         .CALLINFO       FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
 154         .ENTRY
 155         $PUSH   %r2,-$SAVED_RP(%sp)             ; standard prologue
 156         $PUSHMA %r3,$FRAME(%sp)
 157         $PUSH   %r4,`-$FRAME+1*$SIZE_T`(%sp)
 158         $PUSH   %r5,`-$FRAME+2*$SIZE_T`(%sp)
 159         $PUSH   %r6,`-$FRAME+3*$SIZE_T`(%sp)
 160         $PUSH   %r7,`-$FRAME+4*$SIZE_T`(%sp)
 161         $PUSH   %r8,`-$FRAME+5*$SIZE_T`(%sp)
 162         $PUSH   %r9,`-$FRAME+6*$SIZE_T`(%sp)
 163         $PUSH   %r10,`-$FRAME+7*$SIZE_T`(%sp)
 164         ldo     -$FRAME(%sp),$fp
 165 ___
 166 $code.=<<___ if ($SIZE_T==4);   # 32-bit build only: n0 and num are fetched from the stack
 167         ldw     `-$FRAME_MARKER-4`($fp),$n0
 168         ldw     `-$FRAME_MARKER-8`($fp),$num
 169         nop
 170         nop                                     ; alignment
 171 ___
 172 $code.=<<___ if ($BN_SZ==4);    # 32-bit limbs: check num (length, evenness) and ap/np alignment, then load n0 and bp[0]
 173         comiclr,<=      6,$num,%r0              ; are vectors long enough?
 174         b               L\$abort
 175         ldi             0,%r28                  ; signal "unhandled"
 176         add,ev          %r0,$num,$num           ; is $num even?
 177         b               L\$abort
 178         nop
 179         or              $ap,$np,$ti1
 180         extru,=         $ti1,31,3,%r0           ; are ap and np 64-bit aligned?
 181         b               L\$abort
 182         nop
 183         nop                                     ; alignment
 184         nop
 185
 186         fldws           0($n0),${fn0}
 187         fldws,ma        4($bp),${fbi}           ; bp[0]
 188 ___
 189 $code.=<<___ if ($BN_SZ==8);    # 64-bit limbs: check num, double it to count 32-bit words, load low n0 and bp[0] halves
 190         comib,>         3,$num,L\$abort         ; are vectors long enough?
 191         ldi             0,%r28                  ; signal "unhandled"
 192         addl            $num,$num,$num          ; I operate on 32-bit values
 193
 194         fldws           4($n0),${fn0}           ; only low part of n0
 195         fldws           4($bp),${fbi}           ; bp[0] in flipped word order
 196 ___
 197 $code.=<<___;   # common setup: reserve tp[] on the stack, point $xfer/$tp at the locals, start ap[0,1]*bp[0] and np[0,1]*m
 198         fldds           0($ap),${fai}           ; ap[0,1]
 199         fldds           0($np),${fni}           ; np[0,1]
 200
 201         sh2addl         $num,%r0,$arrsz
 202         ldi             31,$hi0
 203         ldo             36($arrsz),$hi1         ; space for tp[num+1]
 204         andcm           $hi1,$hi0,$hi1          ; align
 205         addl            $hi1,%sp,%sp
 206         $PUSH           $fp,-$SIZE_T(%sp)
 207
 208         ldo             `$LOCALS+16`($fp),$xfer
 209         ldo             `$LOCALS+32+4`($fp),$tp
 210
 211         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[0]*bp[0]
 212         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[1]*bp[0]
 213         xmpyu           ${fn0},${fab0}R,${fm0}
 214
 215         addl            $arrsz,$ap,$ap          ; point at the end
 216         addl            $arrsz,$np,$np
 217         subi            0,$arrsz,$idx           ; j=0
 218         ldo             8($idx),$idx            ; j++++
 219
 220         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[0]*m
 221         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[1]*m
 222         fstds           ${fab0},-16($xfer)
 223         fstds           ${fnm0},-8($xfer)
 224         fstds           ${fab1},0($xfer)
 225         fstds           ${fnm1},8($xfer)
 226          flddx          $idx($ap),${fai}        ; ap[2,3]
 227          flddx          $idx($np),${fni}        ; np[2,3]
 228 ___
 229 $code.=<<___ if ($BN_SZ==4);    # 32-bit limbs: run-time test that may branch to the PA-RISC 1.1 path at L$parisc11
 230         mtctl           $hi0,%cr11              ; $hi0 still holds 31
 231         extrd,u,*=      $hi0,%sar,1,$hi0        ; executes on PA-RISC 1.0
 232         b               L\$parisc11
 233         nop
 234 ___
 235 $code.=<<___;                                   # PA-RISC 2.0 code-path
 236         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
 237         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 238         ldd             -16($xfer),$ab0
 239         fstds           ${fab0},-16($xfer)
 240
 241         extrd,u         $ab0,31,32,$hi0
 242         extrd,u         $ab0,63,32,$ab0
 243         ldd             -8($xfer),$nm0
 244         fstds           ${fnm0},-8($xfer)
 245          ldo            8($idx),$idx            ; j++++
 246          addl           $ab0,$nm0,$nm0          ; low part is discarded
 247          extrd,u        $nm0,31,32,$hi1
 248 \f
 249 L\$1st
 250         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[0]
 251         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
 252         ldd             0($xfer),$ab1
 253         fstds           ${fab1},0($xfer)
 254          addl           $hi0,$ab1,$ab1
 255          extrd,u        $ab1,31,32,$hi0
 256         ldd             8($xfer),$nm1
 257         fstds           ${fnm1},8($xfer)
 258          extrd,u        $ab1,63,32,$ab1
 259          addl           $hi1,$nm1,$nm1
 260         flddx           $idx($ap),${fai}        ; ap[j,j+1]
 261         flddx           $idx($np),${fni}        ; np[j,j+1]
 262          addl           $ab1,$nm1,$nm1
 263          extrd,u        $nm1,31,32,$hi1
 264
 265         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
 266         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 267         ldd             -16($xfer),$ab0
 268         fstds           ${fab0},-16($xfer)
 269          addl           $hi0,$ab0,$ab0
 270          extrd,u        $ab0,31,32,$hi0
 271         ldd             -8($xfer),$nm0
 272         fstds           ${fnm0},-8($xfer)
 273          extrd,u        $ab0,63,32,$ab0
 274          addl           $hi1,$nm0,$nm0
 275         stw             $nm1,-4($tp)            ; tp[j-1]
 276          addl           $ab0,$nm0,$nm0
 277          stw,ma         $nm0,8($tp)             ; tp[j-1]
 278         addib,<>        8,$idx,L\$1st           ; j++++
 279          extrd,u        $nm0,31,32,$hi1
 280
 281         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[0]
 282         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
 283         ldd             0($xfer),$ab1
 284         fstds           ${fab1},0($xfer)
 285          addl           $hi0,$ab1,$ab1
 286          extrd,u        $ab1,31,32,$hi0
 287         ldd             8($xfer),$nm1
 288         fstds           ${fnm1},8($xfer)
 289          extrd,u        $ab1,63,32,$ab1
 290          addl           $hi1,$nm1,$nm1
 291         ldd             -16($xfer),$ab0
 292          addl           $ab1,$nm1,$nm1
 293         ldd             -8($xfer),$nm0
 294          extrd,u        $nm1,31,32,$hi1
 295
 296          addl           $hi0,$ab0,$ab0
 297          extrd,u        $ab0,31,32,$hi0
 298         stw             $nm1,-4($tp)            ; tp[j-1]
 299          extrd,u        $ab0,63,32,$ab0
 300          addl           $hi1,$nm0,$nm0
 301         ldd             0($xfer),$ab1
 302          addl           $ab0,$nm0,$nm0
 303         ldd,mb          8($xfer),$nm1
 304          extrd,u        $nm0,31,32,$hi1
 305         stw,ma          $nm0,8($tp)             ; tp[j-1]
 306
 307         ldo             -1($num),$num           ; i--
 308         subi            0,$arrsz,$idx           ; j=0
 309 ___
 310 $code.=<<___ if ($BN_SZ==4);    # 32-bit limbs: load bp[1] and step bp by one word
 311         fldws,ma        4($bp),${fbi}           ; bp[1]
 312 ___
 313 $code.=<<___ if ($BN_SZ==8);    # 64-bit limbs: bp[1] is read from offset 0 (bp[0] was read from offset 4)
 314         fldws           0($bp),${fbi}           ; bp[1] in flipped word order
 315 ___
 316 $code.=<<___;   # finish the first pass, seed tp[0] and m for bp[1], then the L$outer / L$inner loops
 317          flddx          $idx($ap),${fai}        ; ap[0,1]
 318          flddx          $idx($np),${fni}        ; np[0,1]
 319          fldws          8($xfer),${fti}R        ; tp[0]
 320         addl            $hi0,$ab1,$ab1
 321          extrd,u        $ab1,31,32,$hi0
 322          extrd,u        $ab1,63,32,$ab1
 323          ldo            8($idx),$idx            ; j++++
 324          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[1]
 325          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[1]
 326         addl            $hi1,$nm1,$nm1
 327         addl            $ab1,$nm1,$nm1
 328         extrd,u         $nm1,31,32,$hi1
 329          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
 330         stw             $nm1,-4($tp)            ; tp[j-1]
 331
 332          fcpy,sgl       %fr0,${fti}L            ; zero high part
 333          fcpy,sgl       %fr0,${fab0}L
 334         addl            $hi1,$hi0,$hi0
 335         extrd,u         $hi0,31,32,$hi1
 336          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
 337          fcnvxf,dbl,dbl ${fab0},${fab0}
 338         stw             $hi0,0($tp)
 339         stw             $hi1,4($tp)
 340
 341         fadd,dbl        ${fti},${fab0},${fab0}  ; add tp[0]
 342         fcnvfx,dbl,dbl  ${fab0},${fab0}         ; double -> 33-bit unsigned int
 343         xmpyu           ${fn0},${fab0}R,${fm0}
 344         ldo             `$LOCALS+32+4`($fp),$tp
 345 L\$outer
 346         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[0]*m
 347         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[1]*m
 348         fstds           ${fab0},-16($xfer)      ; 33-bit value
 349         fstds           ${fnm0},-8($xfer)
 350          flddx          $idx($ap),${fai}        ; ap[2]
 351          flddx          $idx($np),${fni}        ; np[2]
 352          ldo            8($idx),$idx            ; j++++
 353         ldd             -16($xfer),$ab0         ; 33-bit value
 354         ldd             -8($xfer),$nm0
 355         ldw             0($xfer),$hi0           ; high part
 356
 357         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
 358         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 359          extrd,u        $ab0,31,32,$ti0         ; carry bit
 360          extrd,u        $ab0,63,32,$ab0
 361         fstds           ${fab1},0($xfer)
 362          addl           $ti0,$hi0,$hi0          ; account carry bit
 363         fstds           ${fnm1},8($xfer)
 364          addl           $ab0,$nm0,$nm0          ; low part is discarded
 365         ldw             0($tp),$ti1             ; tp[1]
 366          extrd,u        $nm0,31,32,$hi1
 367         fstds           ${fab0},-16($xfer)
 368         fstds           ${fnm0},-8($xfer)
 369 \f
 370 L\$inner
 371         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[i]
 372         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
 373         ldd             0($xfer),$ab1
 374         fstds           ${fab1},0($xfer)
 375          addl           $hi0,$ti1,$ti1
 376          addl           $ti1,$ab1,$ab1
 377         ldd             8($xfer),$nm1
 378         fstds           ${fnm1},8($xfer)
 379          extrd,u        $ab1,31,32,$hi0
 380          extrd,u        $ab1,63,32,$ab1
 381         flddx           $idx($ap),${fai}        ; ap[j,j+1]
 382         flddx           $idx($np),${fni}        ; np[j,j+1]
 383          addl           $hi1,$nm1,$nm1
 384          addl           $ab1,$nm1,$nm1
 385         ldw             4($tp),$ti0             ; tp[j]
 386         stw             $nm1,-4($tp)            ; tp[j-1]
 387
 388         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
 389         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 390         ldd             -16($xfer),$ab0
 391         fstds           ${fab0},-16($xfer)
 392          addl           $hi0,$ti0,$ti0
 393          addl           $ti0,$ab0,$ab0
 394         ldd             -8($xfer),$nm0
 395         fstds           ${fnm0},-8($xfer)
 396          extrd,u        $ab0,31,32,$hi0
 397          extrd,u        $nm1,31,32,$hi1
 398         ldw             8($tp),$ti1             ; tp[j]
 399          extrd,u        $ab0,63,32,$ab0
 400          addl           $hi1,$nm0,$nm0
 401          addl           $ab0,$nm0,$nm0
 402          stw,ma         $nm0,8($tp)             ; tp[j-1]
 403         addib,<>        8,$idx,L\$inner         ; j++++
 404          extrd,u        $nm0,31,32,$hi1
 405
 406         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[i]
 407         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
 408         ldd             0($xfer),$ab1
 409         fstds           ${fab1},0($xfer)
 410          addl           $hi0,$ti1,$ti1
 411          addl           $ti1,$ab1,$ab1
 412         ldd             8($xfer),$nm1
 413         fstds           ${fnm1},8($xfer)
 414          extrd,u        $ab1,31,32,$hi0
 415          extrd,u        $ab1,63,32,$ab1
 416         ldw             4($tp),$ti0             ; tp[j]
 417          addl           $hi1,$nm1,$nm1
 418          addl           $ab1,$nm1,$nm1
 419         ldd             -16($xfer),$ab0
 420         ldd             -8($xfer),$nm0
 421          extrd,u        $nm1,31,32,$hi1
 422
 423         addl            $hi0,$ab0,$ab0
 424          addl           $ti0,$ab0,$ab0
 425          stw            $nm1,-4($tp)            ; tp[j-1]
 426          extrd,u        $ab0,31,32,$hi0
 427         ldw             8($tp),$ti1             ; tp[j]
 428          extrd,u        $ab0,63,32,$ab0
 429          addl           $hi1,$nm0,$nm0
 430         ldd             0($xfer),$ab1
 431          addl           $ab0,$nm0,$nm0
 432         ldd,mb          8($xfer),$nm1
 433          extrd,u        $nm0,31,32,$hi1
 434          stw,ma         $nm0,8($tp)             ; tp[j-1]
 435
 436         addib,=         -1,$num,L\$outerdone    ; i--
 437         subi            0,$arrsz,$idx           ; j=0
 438 ___
 439 $code.=<<___ if ($BN_SZ==4);    # 32-bit limbs: load the next bp[i] and step bp by one word
 440         fldws,ma        4($bp),${fbi}           ; bp[i]
 441 ___
 442 $code.=<<___ if ($BN_SZ==8);    # 64-bit limbs: move bp by +12 or -4 (selected by addl,ev on num) before loading bp[i]
 443         ldi             12,$ti0                 ; bp[i] in flipped word order
 444         addl,ev         %r0,$num,$num
 445         ldi             -4,$ti0
 446         addl            $ti0,$bp,$bp
 447         fldws           0($bp),${fbi}
 448 ___
 449 $code.=<<___;   # rest of the outer loop (branches back to L$outer), followed by L$outerdone
 450          flddx          $idx($ap),${fai}        ; ap[0]
 451         addl            $hi0,$ab1,$ab1
 452          flddx          $idx($np),${fni}        ; np[0]
 453          fldws          8($xfer),${fti}R        ; tp[0]
 454         addl            $ti1,$ab1,$ab1
 455         extrd,u         $ab1,31,32,$hi0
 456         extrd,u         $ab1,63,32,$ab1
 457
 458          ldo            8($idx),$idx            ; j++++
 459          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[i]
 460          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[i]
 461         ldw             4($tp),$ti0             ; tp[j]
 462
 463         addl            $hi1,$nm1,$nm1
 464          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
 465         addl            $ab1,$nm1,$nm1
 466         extrd,u         $nm1,31,32,$hi1
 467          fcpy,sgl       %fr0,${fti}L            ; zero high part
 468          fcpy,sgl       %fr0,${fab0}L
 469         stw             $nm1,-4($tp)            ; tp[j-1]
 470
 471          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
 472          fcnvxf,dbl,dbl ${fab0},${fab0}
 473         addl            $hi1,$hi0,$hi0
 474          fadd,dbl       ${fti},${fab0},${fab0}  ; add tp[0]
 475         addl            $ti0,$hi0,$hi0
 476         extrd,u         $hi0,31,32,$hi1
 477          fcnvfx,dbl,dbl ${fab0},${fab0}         ; double -> 33-bit unsigned int
 478         stw             $hi0,0($tp)
 479         stw             $hi1,4($tp)
 480          xmpyu          ${fn0},${fab0}R,${fm0}
 481
 482         b               L\$outer
 483         ldo             `$LOCALS+32+4`($fp),$tp
 484 \f
 485 L\$outerdone
 486         addl            $hi0,$ab1,$ab1
 487         addl            $ti1,$ab1,$ab1
 488         extrd,u         $ab1,31,32,$hi0
 489         extrd,u         $ab1,63,32,$ab1
 490
 491         ldw             4($tp),$ti0             ; tp[j]
 492
 493         addl            $hi1,$nm1,$nm1
 494         addl            $ab1,$nm1,$nm1
 495         extrd,u         $nm1,31,32,$hi1
 496         stw             $nm1,-4($tp)            ; tp[j-1]
 497
 498         addl            $hi1,$hi0,$hi0
 499         addl            $ti0,$hi0,$hi0
 500         extrd,u         $hi0,31,32,$hi1
 501         stw             $hi0,0($tp)
 502         stw             $hi1,4($tp)
 503
 504         ldo             `$LOCALS+32`($fp),$tp
 505         sub             %r0,%r0,%r0             ; clear borrow
 506 ___
 507 $code.=<<___ if ($BN_SZ==4);    # 32-bit limbs: L$sub subtracts np from tp into rp; branches to L$sub_pa11 when rp is not 64-bit aligned
 508         ldws,ma         4($tp),$ti0
 509         extru,=         $rp,31,3,%r0            ; is rp 64-bit aligned?
 510         b               L\$sub_pa11
 511         addl            $tp,$arrsz,$tp
 512 L\$sub
 513         ldwx            $idx($np),$hi0
 514         subb            $ti0,$hi0,$hi1
 515         ldwx            $idx($tp),$ti0
 516         addib,<>        4,$idx,L\$sub
 517         stws,ma         $hi1,4($rp)
 518
 519         subb            $ti0,%r0,$hi1
 520         ldo             -4($tp),$tp
 521 ___
 522 $code.=<<___ if ($BN_SZ==8);    # 64-bit limbs: L$sub flips each tp word back in place, then subtracts np into rp
 523         ldd,ma          8($tp),$ti0
 524 L\$sub
 525         ldd             $idx($np),$hi0
 526         shrpd           $ti0,$ti0,32,$ti0       ; flip word order
 527         std             $ti0,-8($tp)            ; save flipped value
 528         sub,db          $ti0,$hi0,$hi1
 529         ldd,ma          8($tp),$ti0
 530         addib,<>        8,$idx,L\$sub
 531         std,ma          $hi1,8($rp)
 532
 533         extrd,u         $ti0,31,32,$ti0         ; carry in flipped word order
 534         sub,db          $ti0,%r0,$hi1
 535         ldo             -8($tp),$tp
 536 ___
 537 $code.=<<___;   # pick tp or rp as the copy source using the mask in $hi1, then L$copy writes it to rp and zeroes tp
 538         and             $tp,$hi1,$ap
 539         andcm           $rp,$hi1,$bp
 540         or              $ap,$bp,$np
 541
 542         sub             $rp,$arrsz,$rp          ; rewind rp
 543         subi            0,$arrsz,$idx
 544         ldo             `$LOCALS+32`($fp),$tp
 545 L\$copy
 546         ldd             $idx($np),$hi0
 547         std,ma          %r0,8($tp)
 548         addib,<>        8,$idx,.-8              ; L\$copy
 549         std,ma          $hi0,8($rp)
 550 ___
 551
 552 if ($BN_SZ==4) {                                # PA-RISC 1.1 code-path
 553 $ablo=$ab0;
 554 $abhi=$ab1;
 555 $nmlo0=$nm0;
 556 $nmhi0=$nm1;
 557 $nmlo1="%r9";
 558 $nmhi1="%r8";
 559
 560 $code.=<<___;
 561         b               L\$done
 562         nop
 563
 564         .ALIGN          8
 565 L\$parisc11
 566         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
 567         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 568         ldw             -12($xfer),$ablo
 569         ldw             -16($xfer),$hi0
 570         ldw             -4($xfer),$nmlo0
 571         ldw             -8($xfer),$nmhi0
 572         fstds           ${fab0},-16($xfer)
 573         fstds           ${fnm0},-8($xfer)
 574
 575          ldo            8($idx),$idx            ; j++++
 576          add            $ablo,$nmlo0,$nmlo0     ; discarded
 577          addc           %r0,$nmhi0,$hi1
 578         ldw             4($xfer),$ablo
 579         ldw             0($xfer),$abhi
 580         nop
 581 \f
 582 L\$1st_pa11
 583         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[0]
 584         flddx           $idx($ap),${fai}        ; ap[j,j+1]
 585         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
 586         flddx           $idx($np),${fni}        ; np[j,j+1]
 587          add            $hi0,$ablo,$ablo
 588         ldw             12($xfer),$nmlo1
 589          addc           %r0,$abhi,$hi0
 590         ldw             8($xfer),$nmhi1
 591          add            $ablo,$nmlo1,$nmlo1
 592         fstds           ${fab1},0($xfer)
 593          addc           %r0,$nmhi1,$nmhi1
 594         fstds           ${fnm1},8($xfer)
 595          add            $hi1,$nmlo1,$nmlo1
 596         ldw             -12($xfer),$ablo
 597          addc           %r0,$nmhi1,$hi1
 598         ldw             -16($xfer),$abhi
 599
 600         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
 601         ldw             -4($xfer),$nmlo0
 602         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 603         ldw             -8($xfer),$nmhi0
 604          add            $hi0,$ablo,$ablo
 605         stw             $nmlo1,-4($tp)          ; tp[j-1]
 606          addc           %r0,$abhi,$hi0
 607         fstds           ${fab0},-16($xfer)
 608          add            $ablo,$nmlo0,$nmlo0
 609         fstds           ${fnm0},-8($xfer)
 610          addc           %r0,$nmhi0,$nmhi0
 611         ldw             0($xfer),$abhi
 612          add            $hi1,$nmlo0,$nmlo0
 613         ldw             4($xfer),$ablo
 614          stws,ma        $nmlo0,8($tp)           ; tp[j-1]
 615         addib,<>        8,$idx,L\$1st_pa11      ; j++++
 616          addc           %r0,$nmhi0,$hi1
 617
 618          ldw            8($xfer),$nmhi1
 619          ldw            12($xfer),$nmlo1
 620         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[0]
 621         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
 622          add            $hi0,$ablo,$ablo
 623         fstds           ${fab1},0($xfer)
 624          addc           %r0,$abhi,$hi0
 625         fstds           ${fnm1},8($xfer)
 626          add            $ablo,$nmlo1,$nmlo1
 627         ldw             -16($xfer),$abhi
 628          addc           %r0,$nmhi1,$nmhi1
 629         ldw             -12($xfer),$ablo
 630          add            $hi1,$nmlo1,$nmlo1
 631         ldw             -8($xfer),$nmhi0
 632          addc           %r0,$nmhi1,$hi1
 633         ldw             -4($xfer),$nmlo0
 634
 635          add            $hi0,$ablo,$ablo
 636         stw             $nmlo1,-4($tp)          ; tp[j-1]
 637          addc           %r0,$abhi,$hi0
 638         ldw             0($xfer),$abhi
 639          add            $ablo,$nmlo0,$nmlo0
 640         ldw             4($xfer),$ablo
 641          addc           %r0,$nmhi0,$nmhi0
 642         ldws,mb         8($xfer),$nmhi1
 643          add            $hi1,$nmlo0,$nmlo0
 644         ldw             4($xfer),$nmlo1
 645          addc           %r0,$nmhi0,$hi1
 646         stws,ma         $nmlo0,8($tp)           ; tp[j-1]
 647
 648         ldo             -1($num),$num           ; i--
 649         subi            0,$arrsz,$idx           ; j=0
 650
 651          fldws,ma       4($bp),${fbi}           ; bp[1]
 652          flddx          $idx($ap),${fai}        ; ap[0,1]
 653          flddx          $idx($np),${fni}        ; np[0,1]
 654          fldws          8($xfer),${fti}R        ; tp[0]
 655         add             $hi0,$ablo,$ablo
 656         addc            %r0,$abhi,$hi0
 657          ldo            8($idx),$idx            ; j++++
 658          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[1]
 659          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[1]
 660         add             $hi1,$nmlo1,$nmlo1
 661         addc            %r0,$nmhi1,$nmhi1
 662         add             $ablo,$nmlo1,$nmlo1
 663         addc            %r0,$nmhi1,$hi1
 664          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
 665         stw             $nmlo1,-4($tp)          ; tp[j-1]
 666
 667          fcpy,sgl       %fr0,${fti}L            ; zero high part
 668          fcpy,sgl       %fr0,${fab0}L
 669         add             $hi1,$hi0,$hi0
 670         addc            %r0,%r0,$hi1
 671          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
 672          fcnvxf,dbl,dbl ${fab0},${fab0}
 673         stw             $hi0,0($tp)
 674         stw             $hi1,4($tp)
 675
 676         fadd,dbl        ${fti},${fab0},${fab0}  ; add tp[0]
 677         fcnvfx,dbl,dbl  ${fab0},${fab0}         ; double -> 33-bit unsigned int
 678         xmpyu           ${fn0},${fab0}R,${fm0}
 679         ldo             `$LOCALS+32+4`($fp),$tp
 680 L\$outer_pa11
 681         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[0]*m
 682         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[1]*m
 683         fstds           ${fab0},-16($xfer)      ; 33-bit value
 684         fstds           ${fnm0},-8($xfer)
 685          flddx          $idx($ap),${fai}        ; ap[2,3]
 686          flddx          $idx($np),${fni}        ; np[2,3]
 687         ldw             -16($xfer),$abhi        ; carry bit actually
 688          ldo            8($idx),$idx            ; j++++
 689         ldw             -12($xfer),$ablo
 690         ldw             -8($xfer),$nmhi0
 691         ldw             -4($xfer),$nmlo0
 692         ldw             0($xfer),$hi0           ; high part
 693
 694         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
 695         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 696         fstds           ${fab1},0($xfer)
 697          addl           $abhi,$hi0,$hi0         ; account carry bit
 698         fstds           ${fnm1},8($xfer)
 699          add            $ablo,$nmlo0,$nmlo0     ; discarded
 700         ldw             0($tp),$ti1             ; tp[1]
 701          addc           %r0,$nmhi0,$hi1
 702         fstds           ${fab0},-16($xfer)
 703         fstds           ${fnm0},-8($xfer)
 704         ldw             4($xfer),$ablo
 705         ldw             0($xfer),$abhi
 706 \f
 707 L\$inner_pa11
 708         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[i]
 709         flddx           $idx($ap),${fai}        ; ap[j,j+1]
 710         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
 711         flddx           $idx($np),${fni}        ; np[j,j+1]
 712          add            $hi0,$ablo,$ablo
 713         ldw             4($tp),$ti0             ; tp[j]
 714          addc           %r0,$abhi,$abhi
 715         ldw             12($xfer),$nmlo1
 716          add            $ti1,$ablo,$ablo
 717         ldw             8($xfer),$nmhi1
 718          addc           %r0,$abhi,$hi0
 719         fstds           ${fab1},0($xfer)
 720          add            $ablo,$nmlo1,$nmlo1
 721         fstds           ${fnm1},8($xfer)
 722          addc           %r0,$nmhi1,$nmhi1
 723         ldw             -12($xfer),$ablo
 724          add            $hi1,$nmlo1,$nmlo1
 725         ldw             -16($xfer),$abhi
 726          addc           %r0,$nmhi1,$hi1
 727
 728         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
 729         ldw             8($tp),$ti1             ; tp[j]
 730         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 731         ldw             -4($xfer),$nmlo0
 732          add            $hi0,$ablo,$ablo
 733         ldw             -8($xfer),$nmhi0
 734          addc           %r0,$abhi,$abhi
 735         stw             $nmlo1,-4($tp)          ; tp[j-1]
 736          add            $ti0,$ablo,$ablo
 737         fstds           ${fab0},-16($xfer)
 738          addc           %r0,$abhi,$hi0
 739         fstds           ${fnm0},-8($xfer)
 740          add            $ablo,$nmlo0,$nmlo0
 741         ldw             4($xfer),$ablo
 742          addc           %r0,$nmhi0,$nmhi0
 743         ldw             0($xfer),$abhi
 744          add            $hi1,$nmlo0,$nmlo0
 745          stws,ma        $nmlo0,8($tp)           ; tp[j-1]
 746         addib,<>        8,$idx,L\$inner_pa11    ; j++++
 747          addc           %r0,$nmhi0,$hi1
 748
 749         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[i]
 750         ldw             12($xfer),$nmlo1
 751         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
 752         ldw             8($xfer),$nmhi1
 753          add            $hi0,$ablo,$ablo
 754         ldw             4($tp),$ti0             ; tp[j]
 755          addc           %r0,$abhi,$abhi
 756         fstds           ${fab1},0($xfer)
 757          add            $ti1,$ablo,$ablo
 758         fstds           ${fnm1},8($xfer)
 759          addc           %r0,$abhi,$hi0
 760         ldw             -16($xfer),$abhi
 761          add            $ablo,$nmlo1,$nmlo1
 762         ldw             -12($xfer),$ablo
 763          addc           %r0,$nmhi1,$nmhi1
 764         ldw             -8($xfer),$nmhi0
 765          add            $hi1,$nmlo1,$nmlo1
 766         ldw             -4($xfer),$nmlo0
 767          addc           %r0,$nmhi1,$hi1
 768
 769         add             $hi0,$ablo,$ablo
 770          stw            $nmlo1,-4($tp)          ; tp[j-1]
 771         addc            %r0,$abhi,$abhi
 772          add            $ti0,$ablo,$ablo
 773         ldw             8($tp),$ti1             ; tp[j]
 774          addc           %r0,$abhi,$hi0
 775         ldw             0($xfer),$abhi
 776          add            $ablo,$nmlo0,$nmlo0
 777         ldw             4($xfer),$ablo
 778          addc           %r0,$nmhi0,$nmhi0
 779         ldws,mb         8($xfer),$nmhi1
 780          add            $hi1,$nmlo0,$nmlo0
 781         ldw             4($xfer),$nmlo1
 782          addc           %r0,$nmhi0,$hi1
 783          stws,ma        $nmlo0,8($tp)           ; tp[j-1]
 784
 785         addib,=         -1,$num,L\$outerdone_pa11; i--
 786         subi            0,$arrsz,$idx           ; j=0
 787
 788          fldws,ma       4($bp),${fbi}           ; bp[i]
 789          flddx          $idx($ap),${fai}        ; ap[0]
 790         add             $hi0,$ablo,$ablo
 791         addc            %r0,$abhi,$abhi
 792          flddx          $idx($np),${fni}        ; np[0]
 793          fldws          8($xfer),${fti}R        ; tp[0]
 794         add             $ti1,$ablo,$ablo
 795         addc            %r0,$abhi,$hi0
 796
 797          ldo            8($idx),$idx            ; j++++
 798          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[i]
 799          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[i]
 800         ldw             4($tp),$ti0             ; tp[j]
 801
 802         add             $hi1,$nmlo1,$nmlo1
 803         addc            %r0,$nmhi1,$nmhi1
 804          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
 805         add             $ablo,$nmlo1,$nmlo1
 806         addc            %r0,$nmhi1,$hi1
 807          fcpy,sgl       %fr0,${fti}L            ; zero high part
 808          fcpy,sgl       %fr0,${fab0}L
 809         stw             $nmlo1,-4($tp)          ; tp[j-1]
 810
 811          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
 812          fcnvxf,dbl,dbl ${fab0},${fab0}
 813         add             $hi1,$hi0,$hi0
 814         addc            %r0,%r0,$hi1
 815          fadd,dbl       ${fti},${fab0},${fab0}  ; add tp[0]
 816         add             $ti0,$hi0,$hi0
 817         addc            %r0,$hi1,$hi1
 818          fcnvfx,dbl,dbl ${fab0},${fab0}         ; double -> 33-bit unsigned int
 819         stw             $hi0,0($tp)
 820         stw             $hi1,4($tp)
 821          xmpyu          ${fn0},${fab0}R,${fm0}
 822
 823         b               L\$outer_pa11
 824         ldo             `$LOCALS+32+4`($fp),$tp
 825 \f
 826 L\$outerdone_pa11
 827         add             $hi0,$ablo,$ablo
 828         addc            %r0,$abhi,$abhi
 829         add             $ti1,$ablo,$ablo
 830         addc            %r0,$abhi,$hi0
 831
 832         ldw             4($tp),$ti0             ; tp[j]
 833
 834         add             $hi1,$nmlo1,$nmlo1
 835         addc            %r0,$nmhi1,$nmhi1
 836         add             $ablo,$nmlo1,$nmlo1
 837         addc            %r0,$nmhi1,$hi1
 838         stw             $nmlo1,-4($tp)          ; tp[j-1]
 839
 840         add             $hi1,$hi0,$hi0
 841         addc            %r0,%r0,$hi1
 842         add             $ti0,$hi0,$hi0
 843         addc            %r0,$hi1,$hi1
 844         stw             $hi0,0($tp)
 845         stw             $hi1,4($tp)
 846
 847         ldo             `$LOCALS+32+4`($fp),$tp
 848         sub             %r0,%r0,%r0             ; clear borrow
 849         ldw             -4($tp),$ti0
 850         addl            $tp,$arrsz,$tp
 851 L\$sub_pa11
 852         ldwx            $idx($np),$hi0
 853         subb            $ti0,$hi0,$hi1
 854         ldwx            $idx($tp),$ti0
 855         addib,<>        4,$idx,L\$sub_pa11
 856         stws,ma         $hi1,4($rp)
 857
 858         subb            $ti0,%r0,$hi1
 859         ldo             -4($tp),$tp
 860         and             $tp,$hi1,$ap
 861         andcm           $rp,$hi1,$bp
 862         or              $ap,$bp,$np
 863
 864         sub             $rp,$arrsz,$rp          ; rewind rp
 865         subi            0,$arrsz,$idx
 866         ldo             `$LOCALS+32`($fp),$tp
 867 L\$copy_pa11
 868         ldwx            $idx($np),$hi0
 869         stws,ma         %r0,4($tp)
 870         addib,<>        4,$idx,L\$copy_pa11
 871         stws,ma         $hi0,4($rp)
 872
 873         nop                                     ; alignment
 874 L\$done
 875 ___
 876 }
 877 \f
 # Append the closing part of the generated routine to $code: the text below
 # loads 1 into %r28, resets %sp from $fp, reloads %r2 and %r4-%r10 through
 # $POP, defines the L$abort label, and ends with the bv (%r2) return, the
 # $POPMB reload of %r3 and the .PROCEND/.STRINGZ trailer. The backtick
 # expressions are evaluated later by the output loop at the end of the file.
 878 $code.=<<___;
 879         ldi             1,%r28                  ; signal "handled"
 880         ldo             $FRAME($fp),%sp         ; destroy tp[num+1]
 881
 882         $POP    `-$FRAME-$SAVED_RP`(%sp),%r2    ; standard epilogue
 883         $POP    `-$FRAME+1*$SIZE_T`(%sp),%r4
 884         $POP    `-$FRAME+2*$SIZE_T`(%sp),%r5
 885         $POP    `-$FRAME+3*$SIZE_T`(%sp),%r6
 886         $POP    `-$FRAME+4*$SIZE_T`(%sp),%r7
 887         $POP    `-$FRAME+5*$SIZE_T`(%sp),%r8
 888         $POP    `-$FRAME+6*$SIZE_T`(%sp),%r9
 889         $POP    `-$FRAME+7*$SIZE_T`(%sp),%r10
 890 L\$abort
 891         bv      (%r2)
 892         .EXIT
 893         $POPMB  -$FRAME(%sp),%r3
 894         .PROCEND
 895         .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
 896 ___
 897 \f
 898 # Explicitly encode PA-RISC 2.0 instructions used in this module, so
 899 # that it can be compiled with .LEVEL 1.0. It should be noted that I
 900 # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
 901 # directive...
 902
 # Hand-encoder for "ldd". Takes the completer string ($mod, e.g. "" or
 # ",mb") and the operand string ($args) and returns one line of assembler
 # text: a ".WORD 0x........" directive with the original instruction kept as
 # a trailing comment, or the untouched instruction when the operands match
 # neither recognised form.
 my $ldd = sub {
     my ($mod, $args) = @_;
     my $text = "ldd$mod\t$args";

     # format 4: "%rX(%rB),%rT"
     if (my ($index, $base, $target) =
             $args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
         my $word = (0x03<<26) | ($base<<21) | ($index<<16) | (3<<6) | $target;
         return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
     }

     # format 5: "disp(%rB),%rT"
     if (my ($disp, $base, $target) =
             $args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
         my $word = (0x03<<26) | ($base<<21) | (1<<12) | (3<<6) | $target;
         # offset: low four bits go to 17..20, bit 4 of the value goes to 16
         $word |= (($disp & 0xF) << 17) | (($disp & 0x10) << 12);
         # any ",m*" completer sets bit 5; ",mb" additionally sets bit 13
         $word |= (1<<5)  if $mod =~ /^,m/;
         $word |= (1<<13) if $mod =~ /^,mb/;
         return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
     }

     # unrecognised operands: hand the instruction to the assembler as is
     return "\t" . $text;
 };
 920
 # Hand-encoder for "std". Takes the completer string ($mod) and the operand
 # string ($args); returns a ".WORD" line carrying the encoded instruction
 # (original text kept as a comment) for operands of the form
 # "%rS,disp(%rB)", otherwise the instruction text unchanged.
 my $std = sub {
     my ($mod, $args) = @_;
     my $text = "std$mod\t$args";

     # format 6: "%rS,disp(%rB)"
     my ($source, $disp, $base) =
         $args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/
         or return "\t" . $text;

     my $word = (0x03<<26) | ($base<<21) | ($source<<16) | (1<<12) | (0xB<<6);
     # offset: low four bits go to 1..4, bit 4 of the value goes to bit 0
     $word |= (($disp & 0xF) << 1) | (($disp & 0x10) >> 4);
     # any ",m*" completer sets bit 5; ",mb" additionally sets bit 13
     $word |= (1<<5)  if $mod =~ /^,m/;
     $word |= (1<<13) if $mod =~ /^,mb/;

     return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
 };
 934
 # Hand-encoder for "extrd". Takes the completer string ($mod) and the
 # operand string ($args); returns a ".WORD" line for the two operand shapes
 # handled below, or the instruction text unchanged for anything else.
 my $extrd = sub {
     my ($mod, $args) = @_;
     my $text = "extrd$mod\t$args";

     # I only have ",u" completer, it's implicitly encoded...

     # format 15: "%rS,pos,len,%rT"
     if (my ($source, $pos, $width, $target) =
             $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {
         my $len  = 32 - $width;
         my $word = (0x36<<26) | ($source<<21) | ($target<<16);
         $word |= (($pos & 0x20) << 6) | (($pos & 0x1f) << 5);   # encode pos
         $word |= (($len & 0x20) << 7) | ($len & 0x1f);          # encode len
         return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
     }

     # format 12: "%rS,%sar,len,%rT"
     if (my ($source, $width, $target) =
             $args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {
         my $len  = 32 - $width;
         my $word = (0x34<<26) | ($source<<21) | ($target<<16) | (2<<11) | (1<<9);
         $word |= (($len & 0x20) << 3) | ($len & 0x1f);          # encode len
         # a completer containing "=" (optionally preceded by "*") sets bit 13
         $word |= (1<<13) if $mod =~ /,\**=/;
         return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
     }

     return "\t" . $text;
 };
 956
 # Hand-encoder for "shrpd". Takes the completer string ($mod) and the
 # operand string ($args); returns a ".WORD" line for operands of the form
 # "%rA,%rB,sa,%rT", otherwise the instruction text unchanged.
 my $shrpd = sub {
     my ($mod, $args) = @_;
     my $text = "shrpd$mod\t$args";

     # format 14: "%rA,%rB,sa,%rT"
     my ($first, $second, $shift, $target) =
         $args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/
         or return "\t" . $text;

     # the shift amount is stored as 63-sa
     my $cpos = 63 - $shift;
     my $word = (0x34<<26) | ($second<<21) | ($first<<16) | (1<<10) | $target;
     $word |= (($cpos & 0x20) << 6) | (($cpos & 0x1f) << 5);     # encode sa

     return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
 };
 969
 # Hand-encoder for "sub". Only the ",db" completer with three register
 # operands is encoded into a ".WORD" line; every other "sub" variant is
 # returned as plain instruction text.
 my $sub = sub {
     my ($mod, $args) = @_;
     my $text = "sub$mod\t$args";

     return "\t" . $text unless $mod eq ",db";

     my ($first, $second, $target) =
         $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/
         or return "\t" . $text;

     my $word = (0x02<<26) | ($second<<21) | ($first<<16) | $target;
     $word |= (1<<10);       # e1
     $word |= (1<<8);        # e2
     $word |= (1<<5);        # d

     return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
 };
 983
 # Dispatch one instruction to its hand-encoder, if there is one.
 # A scalar variable named after the mnemonic is looked up with a string
 # eval; when it holds a code reference that encoder is called with
 # ($mod, $args), otherwise the instruction is re-emitted as plain text.
 sub assemble {
     my ($mnemonic, $mod, $args) = @_;

     # yields undef when no variable of that name exists
     my $encoder = eval("\$$mnemonic");

     if (ref($encoder) eq 'CODE') {
         return $encoder->($mod, $args);
     }
     return "\t$mnemonic$mod\t$args";
 }
 990
 # Post-process the accumulated template in $code line by line and write the
 # result to STDOUT:
 #   - every `...` span is replaced by the value of the Perl expression in it;
 #   - when $BN_SZ is 8, the L/R half selector on an xmpyu operand naming
 #     $fai or $fni is swapped;
 #   - when $BN_SZ is 4, each indented instruction line is passed through
 #     assemble(), which may replace it with a hand-encoded .WORD;
 #   - when $SIZE_T is 8, the word "bv" is rewritten to "bve".
 foreach (split("\n",$code)) {
         s/\`([^\`]*)\`/eval $1/ge;
         # flip word order in 64-bit mode...
         s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
         # assemble 2.0 instructions in 32-bit mode...
         s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);

         s/\bbv\b/bve/gm if ($SIZE_T==8);

         print $_,"\n";
 }
 # Output is buffered, so a failed write (full disk, closed pipe) may only be
 # reported by close. Die on it so the build does not silently continue with
 # a truncated assembler file.
 close STDOUT or die "error closing STDOUT: $!";