crypto/modes/asm/ghash-sparcv9.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16
  17 # March 2010
  18 #
  19 # The module implements "4-bit" GCM GHASH function and underlying
  20 # single multiplication operation in GF(2^128). "4-bit" means that it
  21 # uses 256 bytes per-key table [+128 bytes shared table]. Performance
  22 # results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
  23 # and are expressed in cycles per processed byte, less is better:
  24 #
  25 #               gcc 3.3.x       cc 5.2          this assembler
  26 #
  27 # 32-bit build  81.4            43.3            12.6    (+546%/+244%)
  28 # 64-bit build  20.2            21.2            12.6    (+60%/+68%)
  29 #
  30 # Here is data collected on UltraSPARC T1 system running Linux:
  31 #
  32 #               gcc 4.4.1                       this assembler
  33 #
  34 # 32-bit build  566                             50      (+1000%)
  35 # 64-bit build  56                              50      (+12%)
  36 #
  37 # I don't quite understand why the difference between 32-bit and 64-bit
  38 # compiler-generated code is so big. Compilers *were* instructed to
  39 # generate code for UltraSPARC and should have used 64-bit registers
  40 # for Z vector (see C code) even in 32-bit build... Oh well, it only
  41 # means more impressive improvement coefficients for this assembler
  42 # module;-) Loops are aggressively modulo-scheduled with respect to
  43 # references to input data and Z.hi updates to achieve 12 cycles
  44 # timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
  45 # cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
  46 #
  47 # October 2012
  48 #
  49 # Add VIS3 lookup-table-free implementation using polynomial
  50 # multiplication xmulx[hi] and extended addition addxc[cc]
  51 # instructions. 4.52/7.63x improvement on T3/T4 or in absolute
  52 # terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
  53 # saturates at ~15.5x single-process result on 8-core processor,
  54 # or ~20.5GBps per 2.85GHz socket.
  55
  # The last command-line argument, if there is one, names the output file;
  # without it the generated assembly goes to the inherited STDOUT.
  if ($output = pop) {
      # Three-argument open keeps characters in the file name from being read
      # as part of the open mode, and a failed open is fatal instead of being
      # silently ignored.
      open(STDOUT, ">", $output) or die "can't open $output: $!";
  }
  57
  # Symbolic stack-frame names; $frame is interpolated into the "save"
  # instructions of the generated assembly.
  ($frame, $bias) = qw(STACK_FRAME STACK_BIAS);

  # 64-bit values
  ($Zhi, $Zlo, $Thi, $Tlo, $rem, $tmp) = qw(%o0 %o1 %o2 %o3 %o4 %o5);

  # small values and pointers
  ($nhi, $nlo, $xi0, $xi1, $rem_4bit, $remi, $Htblo, $cnt)
      = qw(%l0 %l1 %l2 %l3 %l4 %l5 %l6 %l7);

  # input argument block
  ($Xi, $Htbl, $inp, $len) = qw(%i0 %i1 %i2 %i3);
  81
     # First chunk of generated assembly: the preamble, the shared rem_4bit
     # table and the gcm_ghash_4bit routine.
     #
     # The text is only accumulated in $code here.  The loop at the end of this
     # file evaluates every `...` expression (the rem_4bit entries) before the
     # line is printed.
     #
     # rem_4bit is indexed by the routine with (Zlo & 0xf) << 3, i.e. as 16
     # entries of 8 bytes.  Its address is formed PC-relatively by the
     # "call .+8" / "add %o7,rem_4bit-1b" pair.
     #
     # gcm_ghash_4bit reads its arguments from $Xi (%i0), $Htbl (%i1),
     # $inp (%i2) and $len (%i3).  $len is first turned into an end pointer
     # ($inp + $len); each pass of .Louter consumes 16 input bytes, and the
     # routine leaves through .Ldone once $inp reaches that pointer.  Every
     # input byte is xor-ed with the matching Xi byte and split into two
     # nibbles that index the table at $Htbl (16-byte entries, high half at
     # $Htbl, low half at $Htblo = $Htbl + 8).  The 128-bit result is stored
     # back to [$Xi] and [$Xi+8].
  82 $code.=<<___;
  83 #include "sparc_arch.h"
  84
  85 #ifdef  __arch64__
  86 .register       %g2,#scratch
  87 .register       %g3,#scratch
  88 #endif
  89
  90 .section        ".text",#alloc,#execinstr
  91
  92 .align  64
  93 rem_4bit:
  94         .long   `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
  95         .long   `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
  96         .long   `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
  97         .long   `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
  98 .type   rem_4bit,#object
  99 .size   rem_4bit,(.-rem_4bit)
 100
 101 .globl  gcm_ghash_4bit
 102 .align  32
 103 gcm_ghash_4bit:
 104         save    %sp,-$frame,%sp
 105         ldub    [$inp+15],$nlo
 106         ldub    [$Xi+15],$xi0
 107         ldub    [$Xi+14],$xi1
 108         add     $len,$inp,$len
 109         add     $Htbl,8,$Htblo
 110
 111 1:      call    .+8
 112         add     %o7,rem_4bit-1b,$rem_4bit
 113
 114 .Louter:
 115         xor     $xi0,$nlo,$nlo
 116         and     $nlo,0xf0,$nhi
 117         and     $nlo,0x0f,$nlo
 118         sll     $nlo,4,$nlo
 119         ldx     [$Htblo+$nlo],$Zlo
 120         ldx     [$Htbl+$nlo],$Zhi
 121
 122         ldub    [$inp+14],$nlo
 123
 124         ldx     [$Htblo+$nhi],$Tlo
 125         and     $Zlo,0xf,$remi
 126         ldx     [$Htbl+$nhi],$Thi
 127         sll     $remi,3,$remi
 128         ldx     [$rem_4bit+$remi],$rem
 129         srlx    $Zlo,4,$Zlo
 130         mov     13,$cnt
 131         sllx    $Zhi,60,$tmp
 132         xor     $Tlo,$Zlo,$Zlo
 133         srlx    $Zhi,4,$Zhi
 134         xor     $Zlo,$tmp,$Zlo
 135
 136         xor     $xi1,$nlo,$nlo
 137         and     $Zlo,0xf,$remi
 138         and     $nlo,0xf0,$nhi
 139         and     $nlo,0x0f,$nlo
 140         ba      .Lghash_inner
 141         sll     $nlo,4,$nlo
 142 .align  32
 143 .Lghash_inner:
 144         ldx     [$Htblo+$nlo],$Tlo
 145         sll     $remi,3,$remi
 146         xor     $Thi,$Zhi,$Zhi
 147         ldx     [$Htbl+$nlo],$Thi
 148         srlx    $Zlo,4,$Zlo
 149         xor     $rem,$Zhi,$Zhi
 150         ldx     [$rem_4bit+$remi],$rem
 151         sllx    $Zhi,60,$tmp
 152         xor     $Tlo,$Zlo,$Zlo
 153         ldub    [$inp+$cnt],$nlo
 154         srlx    $Zhi,4,$Zhi
 155         xor     $Zlo,$tmp,$Zlo
 156         ldub    [$Xi+$cnt],$xi1
 157         xor     $Thi,$Zhi,$Zhi
 158         and     $Zlo,0xf,$remi
 159
 160         ldx     [$Htblo+$nhi],$Tlo
 161         sll     $remi,3,$remi
 162         xor     $rem,$Zhi,$Zhi
 163         ldx     [$Htbl+$nhi],$Thi
 164         srlx    $Zlo,4,$Zlo
 165         ldx     [$rem_4bit+$remi],$rem
 166         sllx    $Zhi,60,$tmp
 167         xor     $xi1,$nlo,$nlo
 168         srlx    $Zhi,4,$Zhi
 169         and     $nlo,0xf0,$nhi
 170         addcc   $cnt,-1,$cnt
 171         xor     $Zlo,$tmp,$Zlo
 172         and     $nlo,0x0f,$nlo
 173         xor     $Tlo,$Zlo,$Zlo
 174         sll     $nlo,4,$nlo
 175         blu     .Lghash_inner
 176         and     $Zlo,0xf,$remi
 177
 178         ldx     [$Htblo+$nlo],$Tlo
 179         sll     $remi,3,$remi
 180         xor     $Thi,$Zhi,$Zhi
 181         ldx     [$Htbl+$nlo],$Thi
 182         srlx    $Zlo,4,$Zlo
 183         xor     $rem,$Zhi,$Zhi
 184         ldx     [$rem_4bit+$remi],$rem
 185         sllx    $Zhi,60,$tmp
 186         xor     $Tlo,$Zlo,$Zlo
 187         srlx    $Zhi,4,$Zhi
 188         xor     $Zlo,$tmp,$Zlo
 189         xor     $Thi,$Zhi,$Zhi
 190
 191         add     $inp,16,$inp
 192         cmp     $inp,$len
 193         be,pn   SIZE_T_CC,.Ldone
 194         and     $Zlo,0xf,$remi
 195
 196         ldx     [$Htblo+$nhi],$Tlo
 197         sll     $remi,3,$remi
 198         xor     $rem,$Zhi,$Zhi
 199         ldx     [$Htbl+$nhi],$Thi
 200         srlx    $Zlo,4,$Zlo
 201         ldx     [$rem_4bit+$remi],$rem
 202         sllx    $Zhi,60,$tmp
 203         xor     $Tlo,$Zlo,$Zlo
 204         ldub    [$inp+15],$nlo
 205         srlx    $Zhi,4,$Zhi
 206         xor     $Zlo,$tmp,$Zlo
 207         xor     $Thi,$Zhi,$Zhi
 208         stx     $Zlo,[$Xi+8]
 209         xor     $rem,$Zhi,$Zhi
 210         stx     $Zhi,[$Xi]
 211         srl     $Zlo,8,$xi1
 212         and     $Zlo,0xff,$xi0
 213         ba      .Louter
 214         and     $xi1,0xff,$xi1
 215 .align  32
 216 .Ldone:
 217         ldx     [$Htblo+$nhi],$Tlo
 218         sll     $remi,3,$remi
 219         xor     $rem,$Zhi,$Zhi
 220         ldx     [$Htbl+$nhi],$Thi
 221         srlx    $Zlo,4,$Zlo
 222         ldx     [$rem_4bit+$remi],$rem
 223         sllx    $Zhi,60,$tmp
 224         xor     $Tlo,$Zlo,$Zlo
 225         srlx    $Zhi,4,$Zhi
 226         xor     $Zlo,$tmp,$Zlo
 227         xor     $Thi,$Zhi,$Zhi
 228         stx     $Zlo,[$Xi+8]
 229         xor     $rem,$Zhi,$Zhi
 230         stx     $Zhi,[$Xi]
 231
 232         ret
 233         restore
 234 .type   gcm_ghash_4bit,#function
 235 .size   gcm_ghash_4bit,(.-gcm_ghash_4bit)
 236 ___
 237
 238 undef $inp;
 239 undef $len;
 240
     # Second chunk of generated assembly: gcm_gmult_4bit, which uses only
     # $Xi (%i0) and $Htbl (%i1).  $inp and $len were cleared just above and
     # are not referenced anywhere in this text.
     #
     # The routine reads the bytes of Xi starting at offset 15 and counting
     # down, uses each nibble to index the table at $Htbl in the same way as
     # gcm_ghash_4bit, and stores the 128-bit result back to [$Xi] and
     # [$Xi+8].  It reuses the rem_4bit table emitted earlier, again locating
     # it with a "call .+8" / "add %o7,rem_4bit-1b" pair.
 241 $code.=<<___;
 242 .globl  gcm_gmult_4bit
 243 .align  32
 244 gcm_gmult_4bit:
 245         save    %sp,-$frame,%sp
 246         ldub    [$Xi+15],$nlo
 247         add     $Htbl,8,$Htblo
 248
 249 1:      call    .+8
 250         add     %o7,rem_4bit-1b,$rem_4bit
 251
 252         and     $nlo,0xf0,$nhi
 253         and     $nlo,0x0f,$nlo
 254         sll     $nlo,4,$nlo
 255         ldx     [$Htblo+$nlo],$Zlo
 256         ldx     [$Htbl+$nlo],$Zhi
 257
 258         ldub    [$Xi+14],$nlo
 259
 260         ldx     [$Htblo+$nhi],$Tlo
 261         and     $Zlo,0xf,$remi
 262         ldx     [$Htbl+$nhi],$Thi
 263         sll     $remi,3,$remi
 264         ldx     [$rem_4bit+$remi],$rem
 265         srlx    $Zlo,4,$Zlo
 266         mov     13,$cnt
 267         sllx    $Zhi,60,$tmp
 268         xor     $Tlo,$Zlo,$Zlo
 269         srlx    $Zhi,4,$Zhi
 270         xor     $Zlo,$tmp,$Zlo
 271
 272         and     $Zlo,0xf,$remi
 273         and     $nlo,0xf0,$nhi
 274         and     $nlo,0x0f,$nlo
 275         ba      .Lgmult_inner
 276         sll     $nlo,4,$nlo
 277 .align  32
 278 .Lgmult_inner:
 279         ldx     [$Htblo+$nlo],$Tlo
 280         sll     $remi,3,$remi
 281         xor     $Thi,$Zhi,$Zhi
 282         ldx     [$Htbl+$nlo],$Thi
 283         srlx    $Zlo,4,$Zlo
 284         xor     $rem,$Zhi,$Zhi
 285         ldx     [$rem_4bit+$remi],$rem
 286         sllx    $Zhi,60,$tmp
 287         xor     $Tlo,$Zlo,$Zlo
 288         ldub    [$Xi+$cnt],$nlo
 289         srlx    $Zhi,4,$Zhi
 290         xor     $Zlo,$tmp,$Zlo
 291         xor     $Thi,$Zhi,$Zhi
 292         and     $Zlo,0xf,$remi
 293
 294         ldx     [$Htblo+$nhi],$Tlo
 295         sll     $remi,3,$remi
 296         xor     $rem,$Zhi,$Zhi
 297         ldx     [$Htbl+$nhi],$Thi
 298         srlx    $Zlo,4,$Zlo
 299         ldx     [$rem_4bit+$remi],$rem
 300         sllx    $Zhi,60,$tmp
 301         srlx    $Zhi,4,$Zhi
 302         and     $nlo,0xf0,$nhi
 303         addcc   $cnt,-1,$cnt
 304         xor     $Zlo,$tmp,$Zlo
 305         and     $nlo,0x0f,$nlo
 306         xor     $Tlo,$Zlo,$Zlo
 307         sll     $nlo,4,$nlo
 308         blu     .Lgmult_inner
 309         and     $Zlo,0xf,$remi
 310
 311         ldx     [$Htblo+$nlo],$Tlo
 312         sll     $remi,3,$remi
 313         xor     $Thi,$Zhi,$Zhi
 314         ldx     [$Htbl+$nlo],$Thi
 315         srlx    $Zlo,4,$Zlo
 316         xor     $rem,$Zhi,$Zhi
 317         ldx     [$rem_4bit+$remi],$rem
 318         sllx    $Zhi,60,$tmp
 319         xor     $Tlo,$Zlo,$Zlo
 320         srlx    $Zhi,4,$Zhi
 321         xor     $Zlo,$tmp,$Zlo
 322         xor     $Thi,$Zhi,$Zhi
 323         and     $Zlo,0xf,$remi
 324
 325         ldx     [$Htblo+$nhi],$Tlo
 326         sll     $remi,3,$remi
 327         xor     $rem,$Zhi,$Zhi
 328         ldx     [$Htbl+$nhi],$Thi
 329         srlx    $Zlo,4,$Zlo
 330         ldx     [$rem_4bit+$remi],$rem
 331         sllx    $Zhi,60,$tmp
 332         xor     $Tlo,$Zlo,$Zlo
 333         srlx    $Zhi,4,$Zhi
 334         xor     $Zlo,$tmp,$Zlo
 335         xor     $Thi,$Zhi,$Zhi
 336         stx     $Zlo,[$Xi+8]
 337         xor     $rem,$Zhi,$Zhi
 338         stx     $Zhi,[$Xi]
 339
 340         ret
 341         restore
 342 .type   gcm_gmult_4bit,#function
 343 .size   gcm_gmult_4bit,(.-gcm_gmult_4bit)
 344 ___
 345 \f
 346 {{{
 347 # Straightforward 128x128-bit multiplication using the Karatsuba algorithm,
 348 # followed by a pair of 64-bit reductions [with a shortcut in the first one,
 349 # which makes it possible to break the dependency between the reductions and
 350 # to remove one multiplication from the critical path]. While it might be
 351 # suboptimal with regard to the sheer number of multiplications, other methods
 352 # [such as aggregate reduction] would require more 64-bit registers, which
 353 # we don't have in 32-bit application context.
 354
     # Argument registers %i0..%i3.  Only gcm_ghash_vis3 reads $inp and $len;
     # gcm_gmult_vis3 uses $Xip and $Htable, and gcm_init_vis3 spells out
     # %i0/%i1 directly.
 355 ($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));
 356
     # Working registers: $Hhl..$xE1 are %o0..%o5, $sqr is %o7, and
     # $C0,$C1,$C2,$C3,$V are %g1..%g5.
 357 ($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
 358         (map("%o$_",(0..5,7)),map("%g$_",(1..5)));
 359
     # Shift counts used by gcm_ghash_vis3 to realign input: %l0 and %l1.
     # The map yields eight names, of which only the first two are assigned.
 360 ($shl,$shr)=map("%l$_",(0..7));
 361
     # Third chunk of generated assembly, three routines:
     #
     #   gcm_init_vis3   loads the 128-bit value at [%i1], shifts it left by one
     #                   bit, xors in 0xE1<<57 (high half) and 1 (low half) when
     #                   the bit shifted out was set, and stores the result at
     #                   [%i0+0]/[%i0+8].  It also stores the constant
     #                   0xA040608020C0E000 at [%i0+16].
     #   gcm_gmult_vis3  loads Xi from [$Xip] and the values stored by
     #                   gcm_init_vis3 from [$Htable], multiplies and reduces,
     #                   and stores the result back to [$Xip].
     #   gcm_ghash_vis3  does the same for every 16-byte block of input: the
     #                   block is xor-ed into the running value before the
     #                   multiplication, $len drops by 16 per pass and the loop
     #                   ends when it reaches zero.  $inp is rounded down to an
     #                   8-byte boundary; when it was misaligned each block is
     #                   reassembled from three 8-byte loads with $shl/$shr.
     #
     # The xmulx, xmulxhi and addxc mnemonics written here are passed through
     # unvis3() by the output loop at the end of this file.
 362 # For details regarding "twisted H" see ghash-x86.pl.
 363 $code.=<<___;
 364 .globl  gcm_init_vis3
 365 .align  32
 366 gcm_init_vis3:
 367         save    %sp,-$frame,%sp
 368
 369         ldx     [%i1+0],$Hhi
 370         ldx     [%i1+8],$Hlo
 371         mov     0xE1,$Xhi
 372         mov     1,$Xlo
 373         sllx    $Xhi,57,$Xhi
 374         srax    $Hhi,63,$C0             ! broadcast carry
 375         addcc   $Hlo,$Hlo,$Hlo          ! H<<=1
 376         addxc   $Hhi,$Hhi,$Hhi
 377         and     $C0,$Xlo,$Xlo
 378         and     $C0,$Xhi,$Xhi
 379         xor     $Xlo,$Hlo,$Hlo
 380         xor     $Xhi,$Hhi,$Hhi
 381         stx     $Hlo,[%i0+8]            ! save twisted H
 382         stx     $Hhi,[%i0+0]
 383
 384         sethi   %hi(0xA0406080),$V
 385         sethi   %hi(0x20C0E000),%l0
 386         or      $V,%lo(0xA0406080),$V
 387         or      %l0,%lo(0x20C0E000),%l0
 388         sllx    $V,32,$V
 389         or      %l0,$V,$V               ! (0xE0·i)&0xff=0xA040608020C0E000
 390         stx     $V,[%i0+16]
 391
 392         ret
 393         restore
 394 .type   gcm_init_vis3,#function
 395 .size   gcm_init_vis3,.-gcm_init_vis3
 396
 397 .globl  gcm_gmult_vis3
 398 .align  32
 399 gcm_gmult_vis3:
 400         save    %sp,-$frame,%sp
 401
 402         ldx     [$Xip+8],$Xlo           ! load Xi
 403         ldx     [$Xip+0],$Xhi
 404         ldx     [$Htable+8],$Hlo        ! load twisted H
 405         ldx     [$Htable+0],$Hhi
 406
 407         mov     0xE1,%l7
 408         sllx    %l7,57,$xE1             ! 57 is not a typo
 409         ldx     [$Htable+16],$V         ! (0xE0·i)&0xff=0xA040608020C0E000
 410
 411         xor     $Hhi,$Hlo,$Hhl          ! Karatsuba pre-processing
 412         xmulx   $Xlo,$Hlo,$C0
 413         xor     $Xlo,$Xhi,$C2           ! Karatsuba pre-processing
 414         xmulx   $C2,$Hhl,$C1
 415         xmulxhi $Xlo,$Hlo,$Xlo
 416         xmulxhi $C2,$Hhl,$C2
 417         xmulxhi $Xhi,$Hhi,$C3
 418         xmulx   $Xhi,$Hhi,$Xhi
 419
 420         sll     $C0,3,$sqr
 421         srlx    $V,$sqr,$sqr            ! ·0xE0 [implicit &(7<<3)]
 422         xor     $C0,$sqr,$sqr
 423         sllx    $sqr,57,$sqr            ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
 424
 425         xor     $C0,$C1,$C1             ! Karatsuba post-processing
 426         xor     $Xlo,$C2,$C2
 427          xor    $sqr,$Xlo,$Xlo          ! real destination is $C1
 428         xor     $C3,$C2,$C2
 429         xor     $Xlo,$C1,$C1
 430         xor     $Xhi,$C2,$C2
 431         xor     $Xhi,$C1,$C1
 432
 433         xmulxhi $C0,$xE1,$Xlo           ! ·0xE1<<1<<56
 434          xor    $C0,$C2,$C2
 435         xmulx   $C1,$xE1,$C0
 436          xor    $C1,$C3,$C3
 437         xmulxhi $C1,$xE1,$C1
 438
 439         xor     $Xlo,$C2,$C2
 440         xor     $C0,$C2,$C2
 441         xor     $C1,$C3,$C3
 442
 443         stx     $C2,[$Xip+8]            ! save Xi
 444         stx     $C3,[$Xip+0]
 445
 446         ret
 447         restore
 448 .type   gcm_gmult_vis3,#function
 449 .size   gcm_gmult_vis3,.-gcm_gmult_vis3
 450
 451 .globl  gcm_ghash_vis3
 452 .align  32
 453 gcm_ghash_vis3:
 454         save    %sp,-$frame,%sp
 455         nop
 456         srln    $len,0,$len             ! needed on v8+, "nop" on v9
 457
 458         ldx     [$Xip+8],$C2            ! load Xi
 459         ldx     [$Xip+0],$C3
 460         ldx     [$Htable+8],$Hlo        ! load twisted H
 461         ldx     [$Htable+0],$Hhi
 462
 463         mov     0xE1,%l7
 464         sllx    %l7,57,$xE1             ! 57 is not a typo
 465         ldx     [$Htable+16],$V         ! (0xE0·i)&0xff=0xA040608020C0E000
 466
 467         and     $inp,7,$shl
 468         andn    $inp,7,$inp
 469         sll     $shl,3,$shl
 470         prefetch [$inp+63], 20
 471         sub     %g0,$shl,$shr
 472
 473         xor     $Hhi,$Hlo,$Hhl          ! Karatsuba pre-processing
 474 .Loop:
 475         ldx     [$inp+8],$Xlo
 476         brz,pt  $shl,1f
 477         ldx     [$inp+0],$Xhi
 478
 479         ldx     [$inp+16],$C1           ! align data
 480         srlx    $Xlo,$shr,$C0
 481         sllx    $Xlo,$shl,$Xlo
 482         sllx    $Xhi,$shl,$Xhi
 483         srlx    $C1,$shr,$C1
 484         or      $C0,$Xhi,$Xhi
 485         or      $C1,$Xlo,$Xlo
 486 1:
 487         add     $inp,16,$inp
 488         sub     $len,16,$len
 489         xor     $C2,$Xlo,$Xlo
 490         xor     $C3,$Xhi,$Xhi
 491         prefetch [$inp+63], 20
 492
 493         xmulx   $Xlo,$Hlo,$C0
 494         xor     $Xlo,$Xhi,$C2           ! Karatsuba pre-processing
 495         xmulx   $C2,$Hhl,$C1
 496         xmulxhi $Xlo,$Hlo,$Xlo
 497         xmulxhi $C2,$Hhl,$C2
 498         xmulxhi $Xhi,$Hhi,$C3
 499         xmulx   $Xhi,$Hhi,$Xhi
 500
 501         sll     $C0,3,$sqr
 502         srlx    $V,$sqr,$sqr            ! ·0xE0 [implicit &(7<<3)]
 503         xor     $C0,$sqr,$sqr
 504         sllx    $sqr,57,$sqr            ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
 505
 506         xor     $C0,$C1,$C1             ! Karatsuba post-processing
 507         xor     $Xlo,$C2,$C2
 508          xor    $sqr,$Xlo,$Xlo          ! real destination is $C1
 509         xor     $C3,$C2,$C2
 510         xor     $Xlo,$C1,$C1
 511         xor     $Xhi,$C2,$C2
 512         xor     $Xhi,$C1,$C1
 513
 514         xmulxhi $C0,$xE1,$Xlo           ! ·0xE1<<1<<56
 515          xor    $C0,$C2,$C2
 516         xmulx   $C1,$xE1,$C0
 517          xor    $C1,$C3,$C3
 518         xmulxhi $C1,$xE1,$C1
 519
 520         xor     $Xlo,$C2,$C2
 521         xor     $C0,$C2,$C2
 522         brnz,pt $len,.Loop
 523         xor     $C1,$C3,$C3
 524
 525         stx     $C2,[$Xip+8]            ! save Xi
 526         stx     $C3,[$Xip+0]
 527
 528         ret
 529         restore
 530 .type   gcm_ghash_vis3,#function
 531 .size   gcm_ghash_vis3,.-gcm_ghash_vis3
 532 ___
 533 }}}
 534 $code.=<<___;     # last chunk: NUL-terminated identification string, then realign to 4 bytes
 535 .asciz  "GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
 536 .align  4
 537 ___
 538
 539 \f
 540 # Purpose of these subroutines is to explicitly encode VIS instructions,
 541 # so that one can compile the module without having to specify VIS
 542 # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
 543 # The idea is to keep the option of producing a "universal" binary and let
 544 # the programmer detect at run-time whether the current CPU is VIS capable.
 # unvis3(MNEMONIC, RS1, RS2, RD)
 #
 # Returns the text to emit for one three-register instruction.
 #
 # When MNEMONIC has an entry in %visopf and all three operands are exactly
 # one of %g0-%g7, %o0-%o7, %l0-%l7 or %i0-%i7, the result is a hand-encoded
 # ".word 0x........" directive followed by the original instruction as a
 # "!" comment.  In every other case the instruction is returned unchanged
 # as "MNEMONIC\tRS1,RS2,RD", leaving it to the assembler.
 sub unvis3 {
     my ($mnemonic, $rs1, $rs2, $rd) = @_;

     # Register number of the first register in each named group.
     my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );

     # Value placed at bit 5 of the instruction word for each mnemonic.
     my %visopf = ( "addxc"   => 0x011,
                    "addxccc" => 0x013,
                    "xmulx"   => 0x115,
                    "xmulxhi" => 0x116 );

     my $ref = "$mnemonic\t$rs1,$rs2,$rd";

     my $opf = $visopf{$mnemonic};
     return $ref unless $opf;

     my @regno;
     foreach my $reg ($rs1, $rs2, $rd) {
         # The pattern is anchored and limited to digits 0-7.  A looser match
         # would accept e.g. "%g9" and encode it as register 9, which is %o1.
         return $ref unless $reg =~ /\A%([goli])([0-7])\z/;
         push @regno, $bias{$1} + $2;
     }
     my ($n1, $n2, $nd) = @regno;

     return sprintf ".word\t0x%08x !%s",
                    0x81b00000 | ($nd << 25) | ($n1 << 14) | ($opf << 5) | $n2,
                    $ref;
 }
 569
 # Post-process the accumulated assembly text one line at a time and write
 # each finished line to STDOUT.
 my @asm_lines = split("\n",$code);
 foreach (@asm_lines) {
     # Replace every `...` span with the value of the Perl expression in it.
     s/\`([^\`]*)\`/eval $1/ge;

     # Hand each three-register xmulx*/addxc* instruction to unvis3(), which
     # returns the text to emit in its place.
     s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/unvis3($1,$2,$3,$4)/ge;

     print "$_\n";
 }

 # Buffered write errors surface when the handle is closed, so a failed
 # close is fatal.
 close(STDOUT) || die "error closing STDOUT";