crypto/modes/asm/ghash-sparcv9.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16
  17 # March 2010
  18 #
  19 # The module implements "4-bit" GCM GHASH function and underlying
  20 # single multiplication operation in GF(2^128). "4-bit" means that it
  21 # uses 256 bytes per-key table [+128 bytes shared table]. Performance
  22 # results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
  23 # and are expressed in cycles per processed byte, less is better:
  24 #
  25 #               gcc 3.3.x       cc 5.2          this assembler
  26 #
  27 # 32-bit build  81.4            43.3            12.6    (+546%/+244%)
  28 # 64-bit build  20.2            21.2            12.6    (+60%/+68%)
  29 #
  30 # Here is data collected on UltraSPARC T1 system running Linux:
  31 #
  32 #               gcc 4.4.1                       this assembler
  33 #
  34 # 32-bit build  566                             50      (+1000%)
  35 # 64-bit build  56                              50      (+12%)
  36 #
  37 # I don't quite understand why the difference between 32-bit and 64-bit
  38 # compiler-generated code is so big. Compilers *were* instructed to
  39 # generate code for UltraSPARC and should have used 64-bit registers
  40 # for Z vector (see C code) even in 32-bit build... Oh well, it only
  41 # means more impressive improvement coefficients for this assembler
  42 # module;-) Loops are aggressively modulo-scheduled with respect to
  43 # references to input data and Z.hi updates to achieve 12 cycles
  44 # timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
  45 # cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
  46 #
  47 # October 2012
  48 #
  49 # Add VIS3 lookup-table-free implementation using polynomial
  50 # multiplication xmulx[hi] and extended addition addxc[cc]
  51 # instructions. 4.52/7.63x improvement on T3/T4 or in absolute
  52 # terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
  53 # saturates at ~15.5x single-process result on 8-core processor,
  54 # or ~20.5GBps per 2.85GHz socket.
  55
  # The last command-line argument names the file that receives the
# generated assembly. When no argument is given the script keeps writing
# to the inherited STDOUT, as before. When a name is given, failing to
# open it is now fatal instead of being silently ignored, and the
# three-argument form of open takes the name literally (no mode or
# whitespace interpretation).
$output=pop;
if (defined($output) && $output ne "") {
    open STDOUT,">",$output or die "can't open $output: $!";
}
  58
  # Symbolic names interpolated into the 4-bit code templates.
# $frame is used as the stack frame size in the save instructions.
($frame,$bias)=("STACK_FRAME","STACK_BIAS");

# 64-bit values
($Zhi,$Zlo,$Thi,$Tlo,$rem,$tmp)=map("%o$_",(0..5));

# small values and pointers
($nhi,$nlo,$xi0,$xi1,$rem_4bit,$remi,$Htblo,$cnt)=map("%l$_",(0..7));

# input argument block
($Xi,$Htbl,$inp,$len)=map("%i$_",(0..3));
  82
  # First code template: the shared rem_4bit table and gcm_ghash_4bit.
  # Everything between the next line and the closing ___ is emitted text,
  # so Perl comments cannot be placed inside it.
  #
  # rem_4bit holds 16 entries of two .long words each (a 16-bit constant
  # shifted left by 16, then 0). The back-quoted expressions are evaluated
  # by the post-processing loop at the end of this script.
  #
  # gcm_ghash_4bit keeps its running value in $Zhi:$Zlo. It xors a byte of
  # $Xi with a byte of $inp, splits the result into two nibbles and uses
  # them to index $Htbl and $Htblo (= $Htbl+8). The 4 bits shifted out of
  # $Zlo select a rem_4bit entry that is xored into $Zhi. After each group
  # of 16 input bytes the result is stored to [$Xi] and [$Xi+8]. $len is
  # turned into an end pointer ($len+$inp) and the outer loop leaves
  # through .Ldone when $inp reaches it.
  # NOTE(review): the call .+8 / add %o7,rem_4bit-1b pair presumably
  # derives the run-time address of rem_4bit relative to the call -- confirm.
  83 $code.=<<___;
  84 #include "sparc_arch.h"
  85
  86 #ifdef  __arch64__
  87 .register       %g2,#scratch
  88 .register       %g3,#scratch
  89 #endif
  90
  91 .section        ".text",#alloc,#execinstr
  92
  93 .align  64
  94 rem_4bit:
  95         .long   `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
  96         .long   `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
  97         .long   `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
  98         .long   `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
  99 .type   rem_4bit,#object
 100 .size   rem_4bit,(.-rem_4bit)
 101
 102 .globl  gcm_ghash_4bit
 103 .align  32
 104 gcm_ghash_4bit:
 105         save    %sp,-$frame,%sp
 106         ldub    [$inp+15],$nlo
 107         ldub    [$Xi+15],$xi0
 108         ldub    [$Xi+14],$xi1
 109         add     $len,$inp,$len
 110         add     $Htbl,8,$Htblo
 111
 112 1:      call    .+8
 113         add     %o7,rem_4bit-1b,$rem_4bit
 114
 115 .Louter:
 116         xor     $xi0,$nlo,$nlo
 117         and     $nlo,0xf0,$nhi
 118         and     $nlo,0x0f,$nlo
 119         sll     $nlo,4,$nlo
 120         ldx     [$Htblo+$nlo],$Zlo
 121         ldx     [$Htbl+$nlo],$Zhi
 122
 123         ldub    [$inp+14],$nlo
 124
 125         ldx     [$Htblo+$nhi],$Tlo
 126         and     $Zlo,0xf,$remi
 127         ldx     [$Htbl+$nhi],$Thi
 128         sll     $remi,3,$remi
 129         ldx     [$rem_4bit+$remi],$rem
 130         srlx    $Zlo,4,$Zlo
 131         mov     13,$cnt
 132         sllx    $Zhi,60,$tmp
 133         xor     $Tlo,$Zlo,$Zlo
 134         srlx    $Zhi,4,$Zhi
 135         xor     $Zlo,$tmp,$Zlo
 136
 137         xor     $xi1,$nlo,$nlo
 138         and     $Zlo,0xf,$remi
 139         and     $nlo,0xf0,$nhi
 140         and     $nlo,0x0f,$nlo
 141         ba      .Lghash_inner
 142         sll     $nlo,4,$nlo
 143 .align  32
 144 .Lghash_inner:
 145         ldx     [$Htblo+$nlo],$Tlo
 146         sll     $remi,3,$remi
 147         xor     $Thi,$Zhi,$Zhi
 148         ldx     [$Htbl+$nlo],$Thi
 149         srlx    $Zlo,4,$Zlo
 150         xor     $rem,$Zhi,$Zhi
 151         ldx     [$rem_4bit+$remi],$rem
 152         sllx    $Zhi,60,$tmp
 153         xor     $Tlo,$Zlo,$Zlo
 154         ldub    [$inp+$cnt],$nlo
 155         srlx    $Zhi,4,$Zhi
 156         xor     $Zlo,$tmp,$Zlo
 157         ldub    [$Xi+$cnt],$xi1
 158         xor     $Thi,$Zhi,$Zhi
 159         and     $Zlo,0xf,$remi
 160
 161         ldx     [$Htblo+$nhi],$Tlo
 162         sll     $remi,3,$remi
 163         xor     $rem,$Zhi,$Zhi
 164         ldx     [$Htbl+$nhi],$Thi
 165         srlx    $Zlo,4,$Zlo
 166         ldx     [$rem_4bit+$remi],$rem
 167         sllx    $Zhi,60,$tmp
 168         xor     $xi1,$nlo,$nlo
 169         srlx    $Zhi,4,$Zhi
 170         and     $nlo,0xf0,$nhi
 171         addcc   $cnt,-1,$cnt
 172         xor     $Zlo,$tmp,$Zlo
 173         and     $nlo,0x0f,$nlo
 174         xor     $Tlo,$Zlo,$Zlo
 175         sll     $nlo,4,$nlo
 176         blu     .Lghash_inner
 177         and     $Zlo,0xf,$remi
 178
 179         ldx     [$Htblo+$nlo],$Tlo
 180         sll     $remi,3,$remi
 181         xor     $Thi,$Zhi,$Zhi
 182         ldx     [$Htbl+$nlo],$Thi
 183         srlx    $Zlo,4,$Zlo
 184         xor     $rem,$Zhi,$Zhi
 185         ldx     [$rem_4bit+$remi],$rem
 186         sllx    $Zhi,60,$tmp
 187         xor     $Tlo,$Zlo,$Zlo
 188         srlx    $Zhi,4,$Zhi
 189         xor     $Zlo,$tmp,$Zlo
 190         xor     $Thi,$Zhi,$Zhi
 191
 192         add     $inp,16,$inp
 193         cmp     $inp,$len
 194         be,pn   SIZE_T_CC,.Ldone
 195         and     $Zlo,0xf,$remi
 196
 197         ldx     [$Htblo+$nhi],$Tlo
 198         sll     $remi,3,$remi
 199         xor     $rem,$Zhi,$Zhi
 200         ldx     [$Htbl+$nhi],$Thi
 201         srlx    $Zlo,4,$Zlo
 202         ldx     [$rem_4bit+$remi],$rem
 203         sllx    $Zhi,60,$tmp
 204         xor     $Tlo,$Zlo,$Zlo
 205         ldub    [$inp+15],$nlo
 206         srlx    $Zhi,4,$Zhi
 207         xor     $Zlo,$tmp,$Zlo
 208         xor     $Thi,$Zhi,$Zhi
 209         stx     $Zlo,[$Xi+8]
 210         xor     $rem,$Zhi,$Zhi
 211         stx     $Zhi,[$Xi]
 212         srl     $Zlo,8,$xi1
 213         and     $Zlo,0xff,$xi0
 214         ba      .Louter
 215         and     $xi1,0xff,$xi1
 216 .align  32
 217 .Ldone:
 218         ldx     [$Htblo+$nhi],$Tlo
 219         sll     $remi,3,$remi
 220         xor     $rem,$Zhi,$Zhi
 221         ldx     [$Htbl+$nhi],$Thi
 222         srlx    $Zlo,4,$Zlo
 223         ldx     [$rem_4bit+$remi],$rem
 224         sllx    $Zhi,60,$tmp
 225         xor     $Tlo,$Zlo,$Zlo
 226         srlx    $Zhi,4,$Zhi
 227         xor     $Zlo,$tmp,$Zlo
 228         xor     $Thi,$Zhi,$Zhi
 229         stx     $Zlo,[$Xi+8]
 230         xor     $rem,$Zhi,$Zhi
 231         stx     $Zhi,[$Xi]
 232
 233         ret
 234         restore
 235 .type   gcm_ghash_4bit,#function
 236 .size   gcm_ghash_4bit,(.-gcm_ghash_4bit)
 237 ___
 238
 239 undef $inp;     # the gcm_gmult_4bit template below uses neither $inp nor $len
 240 undef $len;
 241
     # Second code template: gcm_gmult_4bit. It reads the 16 bytes at $Xi
     # one byte at a time (offsets 15 down to 0), uses each nibble to index
     # $Htbl and $Htblo (= $Htbl+8), xors in the rem_4bit entry selected by
     # the 4 bits shifted out of $Zlo, and stores the final $Zhi:$Zlo to
     # [$Xi] and [$Xi+8]. There is no input pointer and no outer loop.
     # Everything up to the closing ___ is emitted text, so Perl comments
     # cannot be placed inside it.
 242 $code.=<<___;
 243 .globl  gcm_gmult_4bit
 244 .align  32
 245 gcm_gmult_4bit:
 246         save    %sp,-$frame,%sp
 247         ldub    [$Xi+15],$nlo
 248         add     $Htbl,8,$Htblo
 249
 250 1:      call    .+8
 251         add     %o7,rem_4bit-1b,$rem_4bit
 252
 253         and     $nlo,0xf0,$nhi
 254         and     $nlo,0x0f,$nlo
 255         sll     $nlo,4,$nlo
 256         ldx     [$Htblo+$nlo],$Zlo
 257         ldx     [$Htbl+$nlo],$Zhi
 258
 259         ldub    [$Xi+14],$nlo
 260
 261         ldx     [$Htblo+$nhi],$Tlo
 262         and     $Zlo,0xf,$remi
 263         ldx     [$Htbl+$nhi],$Thi
 264         sll     $remi,3,$remi
 265         ldx     [$rem_4bit+$remi],$rem
 266         srlx    $Zlo,4,$Zlo
 267         mov     13,$cnt
 268         sllx    $Zhi,60,$tmp
 269         xor     $Tlo,$Zlo,$Zlo
 270         srlx    $Zhi,4,$Zhi
 271         xor     $Zlo,$tmp,$Zlo
 272
 273         and     $Zlo,0xf,$remi
 274         and     $nlo,0xf0,$nhi
 275         and     $nlo,0x0f,$nlo
 276         ba      .Lgmult_inner
 277         sll     $nlo,4,$nlo
 278 .align  32
 279 .Lgmult_inner:
 280         ldx     [$Htblo+$nlo],$Tlo
 281         sll     $remi,3,$remi
 282         xor     $Thi,$Zhi,$Zhi
 283         ldx     [$Htbl+$nlo],$Thi
 284         srlx    $Zlo,4,$Zlo
 285         xor     $rem,$Zhi,$Zhi
 286         ldx     [$rem_4bit+$remi],$rem
 287         sllx    $Zhi,60,$tmp
 288         xor     $Tlo,$Zlo,$Zlo
 289         ldub    [$Xi+$cnt],$nlo
 290         srlx    $Zhi,4,$Zhi
 291         xor     $Zlo,$tmp,$Zlo
 292         xor     $Thi,$Zhi,$Zhi
 293         and     $Zlo,0xf,$remi
 294
 295         ldx     [$Htblo+$nhi],$Tlo
 296         sll     $remi,3,$remi
 297         xor     $rem,$Zhi,$Zhi
 298         ldx     [$Htbl+$nhi],$Thi
 299         srlx    $Zlo,4,$Zlo
 300         ldx     [$rem_4bit+$remi],$rem
 301         sllx    $Zhi,60,$tmp
 302         srlx    $Zhi,4,$Zhi
 303         and     $nlo,0xf0,$nhi
 304         addcc   $cnt,-1,$cnt
 305         xor     $Zlo,$tmp,$Zlo
 306         and     $nlo,0x0f,$nlo
 307         xor     $Tlo,$Zlo,$Zlo
 308         sll     $nlo,4,$nlo
 309         blu     .Lgmult_inner
 310         and     $Zlo,0xf,$remi
 311
 312         ldx     [$Htblo+$nlo],$Tlo
 313         sll     $remi,3,$remi
 314         xor     $Thi,$Zhi,$Zhi
 315         ldx     [$Htbl+$nlo],$Thi
 316         srlx    $Zlo,4,$Zlo
 317         xor     $rem,$Zhi,$Zhi
 318         ldx     [$rem_4bit+$remi],$rem
 319         sllx    $Zhi,60,$tmp
 320         xor     $Tlo,$Zlo,$Zlo
 321         srlx    $Zhi,4,$Zhi
 322         xor     $Zlo,$tmp,$Zlo
 323         xor     $Thi,$Zhi,$Zhi
 324         and     $Zlo,0xf,$remi
 325
 326         ldx     [$Htblo+$nhi],$Tlo
 327         sll     $remi,3,$remi
 328         xor     $rem,$Zhi,$Zhi
 329         ldx     [$Htbl+$nhi],$Thi
 330         srlx    $Zlo,4,$Zlo
 331         ldx     [$rem_4bit+$remi],$rem
 332         sllx    $Zhi,60,$tmp
 333         xor     $Tlo,$Zlo,$Zlo
 334         srlx    $Zhi,4,$Zhi
 335         xor     $Zlo,$tmp,$Zlo
 336         xor     $Thi,$Zhi,$Zhi
 337         stx     $Zlo,[$Xi+8]
 338         xor     $rem,$Zhi,$Zhi
 339         stx     $Zhi,[$Xi]
 340
 341         ret
 342         restore
 343 .type   gcm_gmult_4bit,#function
 344 .size   gcm_gmult_4bit,(.-gcm_gmult_4bit)
 345 ___
 346 \f
 347 {{{
 348 # Straightforward 128x128-bit multiplication using Karatsuba algorithm
 349 # followed by pair of 64-bit reductions [with a shortcut in first one,
 350 # which made it possible to break the dependency between reductions and
 351 # remove one multiplication from the critical path]. While it might be
 352 # suboptimal with regard to sheer number of multiplications, other
 353 # methods [such as aggregate reduction] would require more 64-bit
 354 # registers, which we don't have in 32-bit application context.
 355
     # Incoming argument registers %i0..%i3. gcm_gmult_vis3 uses only $Xip
     # and $Htable; gcm_ghash_vis3 also reads from $inp and counts $len down
     # by 16 per iteration. gcm_init_vis3 addresses %i0 and %i1 directly.
 356 ($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));
 357
     # Twelve 64-bit working registers: %o0-%o5 and %o7, then %g1-%g5.
 358 ($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
 359         (map("%o$_",(0..5,7)),map("%g$_",(1..5)));
 360
     # Only the first two list elements are consumed: $shl=%l0, $shr=%l1.
 361 ($shl,$shr)=map("%l$_",(0..7));
 362
 363 # For details regarding "twisted H" see ghash-x86.pl.
     # The template below emits three routines:
     #   gcm_init_vis3  - loads H from [%i1], shifts it left by one bit,
     #                    xors in the constant (0xE1<<57 : 1) when the top
     #                    bit of H was set, stores the result at [%i0] and
     #                    the 64-bit constant 0xA040608020C0E000 at [%i0+16];
     #   gcm_gmult_vis3 - multiplies the 16 bytes at $Xip by the value
     #                    stored at $Htable and writes the result back;
     #   gcm_ghash_vis3 - xors each 16-byte block at $inp into the running
     #                    value and performs the same multiplication; input
     #                    that is not 8-byte aligned is assembled from
     #                    adjacent words with $shl/$shr shifts.
     # NOTE(review): the .Loop in gcm_ghash_vis3 ends only when $len reaches
     # exactly zero, so callers presumably pass a non-zero multiple of 16 --
     # confirm.
     # The xmulx/xmulxhi/addxc lines are rewritten into .word directives by
     # the post-processing loop at the end of this script.
 364 $code.=<<___;
 365 .globl  gcm_init_vis3
 366 .align  32
 367 gcm_init_vis3:
 368         save    %sp,-$frame,%sp
 369
 370         ldx     [%i1+0],$Hhi
 371         ldx     [%i1+8],$Hlo
 372         mov     0xE1,$Xhi
 373         mov     1,$Xlo
 374         sllx    $Xhi,57,$Xhi
 375         srax    $Hhi,63,$C0             ! broadcast carry
 376         addcc   $Hlo,$Hlo,$Hlo          ! H<<=1
 377         addxc   $Hhi,$Hhi,$Hhi
 378         and     $C0,$Xlo,$Xlo
 379         and     $C0,$Xhi,$Xhi
 380         xor     $Xlo,$Hlo,$Hlo
 381         xor     $Xhi,$Hhi,$Hhi
 382         stx     $Hlo,[%i0+8]            ! save twisted H
 383         stx     $Hhi,[%i0+0]
 384
 385         sethi   %hi(0xA0406080),$V
 386         sethi   %hi(0x20C0E000),%l0
 387         or      $V,%lo(0xA0406080),$V
 388         or      %l0,%lo(0x20C0E000),%l0
 389         sllx    $V,32,$V
 390         or      %l0,$V,$V               ! (0xE0·i)&0xff=0xA040608020C0E000
 391         stx     $V,[%i0+16]
 392
 393         ret
 394         restore
 395 .type   gcm_init_vis3,#function
 396 .size   gcm_init_vis3,.-gcm_init_vis3
 397
 398 .globl  gcm_gmult_vis3
 399 .align  32
 400 gcm_gmult_vis3:
 401         save    %sp,-$frame,%sp
 402
 403         ldx     [$Xip+8],$Xlo           ! load Xi
 404         ldx     [$Xip+0],$Xhi
 405         ldx     [$Htable+8],$Hlo        ! load twisted H
 406         ldx     [$Htable+0],$Hhi
 407
 408         mov     0xE1,%l7
 409         sllx    %l7,57,$xE1             ! 57 is not a typo
 410         ldx     [$Htable+16],$V         ! (0xE0·i)&0xff=0xA040608020C0E000
 411
 412         xor     $Hhi,$Hlo,$Hhl          ! Karatsuba pre-processing
 413         xmulx   $Xlo,$Hlo,$C0
 414         xor     $Xlo,$Xhi,$C2           ! Karatsuba pre-processing
 415         xmulx   $C2,$Hhl,$C1
 416         xmulxhi $Xlo,$Hlo,$Xlo
 417         xmulxhi $C2,$Hhl,$C2
 418         xmulxhi $Xhi,$Hhi,$C3
 419         xmulx   $Xhi,$Hhi,$Xhi
 420
 421         sll     $C0,3,$sqr
 422         srlx    $V,$sqr,$sqr            ! ·0xE0 [implicit &(7<<3)]
 423         xor     $C0,$sqr,$sqr
 424         sllx    $sqr,57,$sqr            ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
 425
 426         xor     $C0,$C1,$C1             ! Karatsuba post-processing
 427         xor     $Xlo,$C2,$C2
 428          xor    $sqr,$Xlo,$Xlo          ! real destination is $C1
 429         xor     $C3,$C2,$C2
 430         xor     $Xlo,$C1,$C1
 431         xor     $Xhi,$C2,$C2
 432         xor     $Xhi,$C1,$C1
 433
 434         xmulxhi $C0,$xE1,$Xlo           ! ·0xE1<<1<<56
 435          xor    $C0,$C2,$C2
 436         xmulx   $C1,$xE1,$C0
 437          xor    $C1,$C3,$C3
 438         xmulxhi $C1,$xE1,$C1
 439
 440         xor     $Xlo,$C2,$C2
 441         xor     $C0,$C2,$C2
 442         xor     $C1,$C3,$C3
 443
 444         stx     $C2,[$Xip+8]            ! save Xi
 445         stx     $C3,[$Xip+0]
 446
 447         ret
 448         restore
 449 .type   gcm_gmult_vis3,#function
 450 .size   gcm_gmult_vis3,.-gcm_gmult_vis3
 451
 452 .globl  gcm_ghash_vis3
 453 .align  32
 454 gcm_ghash_vis3:
 455         save    %sp,-$frame,%sp
 456         nop
 457         srln    $len,0,$len             ! needed on v8+, "nop" on v9
 458
 459         ldx     [$Xip+8],$C2            ! load Xi
 460         ldx     [$Xip+0],$C3
 461         ldx     [$Htable+8],$Hlo        ! load twisted H
 462         ldx     [$Htable+0],$Hhi
 463
 464         mov     0xE1,%l7
 465         sllx    %l7,57,$xE1             ! 57 is not a typo
 466         ldx     [$Htable+16],$V         ! (0xE0·i)&0xff=0xA040608020C0E000
 467
 468         and     $inp,7,$shl
 469         andn    $inp,7,$inp
 470         sll     $shl,3,$shl
 471         prefetch [$inp+63], 20
 472         sub     %g0,$shl,$shr
 473
 474         xor     $Hhi,$Hlo,$Hhl          ! Karatsuba pre-processing
 475 .Loop:
 476         ldx     [$inp+8],$Xlo
 477         brz,pt  $shl,1f
 478         ldx     [$inp+0],$Xhi
 479
 480         ldx     [$inp+16],$C1           ! align data
 481         srlx    $Xlo,$shr,$C0
 482         sllx    $Xlo,$shl,$Xlo
 483         sllx    $Xhi,$shl,$Xhi
 484         srlx    $C1,$shr,$C1
 485         or      $C0,$Xhi,$Xhi
 486         or      $C1,$Xlo,$Xlo
 487 1:
 488         add     $inp,16,$inp
 489         sub     $len,16,$len
 490         xor     $C2,$Xlo,$Xlo
 491         xor     $C3,$Xhi,$Xhi
 492         prefetch [$inp+63], 20
 493
 494         xmulx   $Xlo,$Hlo,$C0
 495         xor     $Xlo,$Xhi,$C2           ! Karatsuba pre-processing
 496         xmulx   $C2,$Hhl,$C1
 497         xmulxhi $Xlo,$Hlo,$Xlo
 498         xmulxhi $C2,$Hhl,$C2
 499         xmulxhi $Xhi,$Hhi,$C3
 500         xmulx   $Xhi,$Hhi,$Xhi
 501
 502         sll     $C0,3,$sqr
 503         srlx    $V,$sqr,$sqr            ! ·0xE0 [implicit &(7<<3)]
 504         xor     $C0,$sqr,$sqr
 505         sllx    $sqr,57,$sqr            ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
 506
 507         xor     $C0,$C1,$C1             ! Karatsuba post-processing
 508         xor     $Xlo,$C2,$C2
 509          xor    $sqr,$Xlo,$Xlo          ! real destination is $C1
 510         xor     $C3,$C2,$C2
 511         xor     $Xlo,$C1,$C1
 512         xor     $Xhi,$C2,$C2
 513         xor     $Xhi,$C1,$C1
 514
 515         xmulxhi $C0,$xE1,$Xlo           ! ·0xE1<<1<<56
 516          xor    $C0,$C2,$C2
 517         xmulx   $C1,$xE1,$C0
 518          xor    $C1,$C3,$C3
 519         xmulxhi $C1,$xE1,$C1
 520
 521         xor     $Xlo,$C2,$C2
 522         xor     $C0,$C2,$C2
 523         brnz,pt $len,.Loop
 524         xor     $C1,$C3,$C3
 525
 526         stx     $C2,[$Xip+8]            ! save Xi
 527         stx     $C3,[$Xip+0]
 528
 529         ret
 530         restore
 531 .type   gcm_ghash_vis3,#function
 532 .size   gcm_ghash_vis3,.-gcm_ghash_vis3
 533 ___
 534 }}}
 # Append an identification string to the generated output, then realign
 # to 4 bytes. The backslash before the at-sign keeps Perl from
 # interpolating an array inside the heredoc.
 535 $code.=<<___;
 536 .asciz  "GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
 537 .align  4
 538 ___
 539
 540 \f
 # Purpose of this subroutine is to explicitly encode VIS3 instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to keep open the option of producing a "universal" binary
# and to let the programmer detect at run-time whether the current CPU is
# VIS capable.
#
# Arguments: ($mnemonic,$rs1,$rs2,$rd), the instruction name and its
# three register operands as they appear in the assembly text.
#
# Returns a ".word 0x........ !<original text>" directive when $mnemonic
# is one of addxc, addxccc, xmulx, xmulxhi and every operand is exactly
# one register name %g0-%g7, %o0-%o7, %l0-%l7 or %i0-%i7. In every other
# case the instruction text "$mnemonic\t$rs1,$rs2,$rd" is returned
# unchanged, so the assembler gets to deal with it.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
# register bank letter -> number of the first register in that bank
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
# opf field values of the supported instructions
my %visopf = (  "addxc"         => 0x011,
                "addxccc"       => 0x013,
                "xmulx"         => 0x115,
                "xmulxhi"       => 0x116        );
my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $visopf{$mnemonic};
    return $ref if (!defined($opf));

    foreach ($rs1,$rs2,$rd) {
        # The operand must be a complete, valid register name. A digit
        # above 7 or trailing text would otherwise yield a number that
        # does not fit the 5-bit field and corrupt the encoded word.
        return $ref if (!/\A%([goli])([0-7])\z/);
        $_=$bias{$1}+$2;
    }

    return  sprintf ".word\t0x%08x !%s",
                    0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                    $ref;
}
 570
 # Post-process the accumulated templates line by line and print them.
foreach (split("\n",$code)) {
        # Replace each back-quoted constant expression with its value.
        s/\`([^\`]*)\`/eval $1/ge;

        # Hand VIS3 instructions with three plain register operands to
        # unvis3, which returns either a .word encoding or the text as is.
        s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
                unvis3($1,$2,$3,$4)
         /ge;

        print $_,"\n";
}

# Output is buffered, so a write error (e.g. a full disk) may only show up
# here. Fail loudly rather than leave a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";