crypto/aes/asm/aesfx-sparcv9.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 #
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16
  17 # March 2016
  18 #
  19 # Initial support for Fujitsu SPARC64 X/X+ comprises minimally
  20 # required key setup and single-block procedures.
  21 #
  22 # April 2016
  23 #
  24 # Add "teaser" CBC and CTR mode-specific subroutines. "Teaser" means
  25 # that parallelizeable nature of CBC decrypt and CTR is not utilized
  26 # yet. CBC encrypt on the other hand is as good as it can possibly
  27 # get processing one byte in 4.1 cycles with 128-bit key on SPARC64 X.
  28 # This is ~6x faster than pure software implementation...
  29
  30 $output = pop;
  31 open STDOUT,">$output";
  32
  33 {
  34 my ($inp,$out,$key,$rounds,$tmp,$mask) = map("%o$_",(0..5));
  35
  36 $code.=<<___;
  37 #include "sparc_arch.h"
  38
  39 #define LOCALS (STACK_BIAS+STACK_FRAME)
  40
  41 .text
  42
  43 .globl  aes_fx_encrypt
  44 .align  32
  45 aes_fx_encrypt:
  46         and             $inp, 7, $tmp           ! is input aligned?
  47         andn            $inp, 7, $inp
  48         ld              [$key + 240], $rounds
  49         ldd             [$key +  0], %f6        ! round[0]
  50         ldd             [$key +  8], %f8
  51
  52         ldd             [$inp + 0], %f0         ! load input
  53         brz,pt          $tmp, .Lenc_inp_aligned
  54         ldd             [$inp + 8], %f2
  55
  56         ldd             [$inp + 16], %f4
  57         alignaddr       $inp, $tmp, %g0
  58         faligndata      %f0, %f2, %f0
  59         faligndata      %f2, %f4, %f2
  60
  61 .Lenc_inp_aligned:
  62         ldd             [$key + 16], %f10       ! round[1]
  63         ldd             [$key + 24], %f12
  64
  65         fxor            %f0, %f6, %f0           ! ^=round[0]
  66         fxor            %f2, %f8, %f2
  67         ldd             [$key + 32], %f6        ! round[2]
  68         ldd             [$key + 40], %f8
  69         add             $key, 32, $key
  70         sub             $rounds, 4, $rounds
  71
  72 .Loop_enc:
  73         fmovd           %f0, %f4
  74         faesencx        %f2, %f10, %f0
  75         faesencx        %f4, %f12, %f2
  76         ldd             [$key + 16], %f10
  77         ldd             [$key + 24], %f12
  78         add             $key, 32, $key
  79
  80         fmovd           %f0, %f4
  81         faesencx        %f2, %f6, %f0
  82         faesencx        %f4, %f8, %f2
  83         ldd             [$key +  0], %f6
  84         ldd             [$key +  8], %f8
  85
  86         brnz,a          $rounds, .Loop_enc
  87         sub             $rounds, 2, $rounds
  88
  89         andcc           $out, 7, $tmp           ! is output aligned?
  90         mov             0xff, $mask
  91
  92         fmovd           %f0, %f4
  93         faesencx        %f2, %f10, %f0
  94         faesencx        %f4, %f12, %f2
  95         fmovd           %f0, %f4
  96         faesenclx       %f2, %f6, %f0
  97         faesenclx       %f4, %f8, %f2
  98
  99         bnz,a,pn        %icc, .Lenc_out_unaligned
 100         srl             $mask, $tmp, $mask
 101
 102         std             %f0, [$out + 0]
 103         retl
 104         std             %f2, [$out + 8]
 105
 106 .align  16
 107 .Lenc_out_unaligned:
 108         alignaddrl      $out, %g0, $out
 109         faligndata      %f0, %f0, %f4
 110         faligndata      %f0, %f2, %f6
 111         faligndata      %f2, %f2, %f8
 112
 113         stda            %f4, [$out + $mask]0xc0 ! partial store
 114         std             %f6, [$out + 8]
 115         add             $out, 16, $out
 116         orn             %g0, $mask, $mask
 117         stda            %f8, [$out + $mask]0xc0 ! partial store
 118         retl
 119         nop
 120 .type   aes_fx_encrypt,#function
 121 .size   aes_fx_encrypt,.-aes_fx_encrypt
 122
 123 .globl  aes_fx_decrypt
 124 .align  32
 125 aes_fx_decrypt:
 126         and             $inp, 7, $tmp           ! is input aligned?
 127         andn            $inp, 7, $inp
 128         ld              [$key + 240], $rounds
 129         ldd             [$key +  0], %f6        ! round[0]
 130         ldd             [$key +  8], %f8
 131
 132         ldd             [$inp + 0], %f0         ! load input
 133         brz,pt          $tmp, .Ldec_inp_aligned
 134         ldd             [$inp + 8], %f2
 135
 136         ldd             [$inp + 16], %f4
 137         alignaddr       $inp, $tmp, $inp
 138         faligndata      %f0, %f2, %f0
 139         faligndata      %f2, %f4, %f2
 140
 141 .Ldec_inp_aligned:
 142         ldd             [$key + 16], %f10       ! round[1]
 143         ldd             [$key + 24], %f12
 144
 145         fxor            %f0, %f6, %f0           ! ^=round[0]
 146         fxor            %f2, %f8, %f2
 147         ldd             [$key + 32], %f6        ! round[2]
 148         ldd             [$key + 40], %f8
 149         add             $key, 32, $key
 150         sub             $rounds, 4, $rounds
 151
 152 .Loop_dec:
 153         fmovd           %f0, %f4
 154         faesdecx        %f2, %f10, %f0
 155         faesdecx        %f4, %f12, %f2
 156         ldd             [$key + 16], %f10
 157         ldd             [$key + 24], %f12
 158         add             $key, 32, $key
 159
 160         fmovd           %f0, %f4
 161         faesdecx        %f2, %f6, %f0
 162         faesdecx        %f4, %f8, %f2
 163         ldd             [$key +  0], %f6
 164         ldd             [$key +  8], %f8
 165
 166         brnz,a          $rounds, .Loop_dec
 167         sub             $rounds, 2, $rounds
 168
 169         andcc           $out, 7, $tmp           ! is output aligned?
 170         mov             0xff, $mask
 171
 172         fmovd           %f0, %f4
 173         faesdecx        %f2, %f10, %f0
 174         faesdecx        %f4, %f12, %f2
 175         fmovd           %f0, %f4
 176         faesdeclx       %f2, %f6, %f0
 177         faesdeclx       %f4, %f8, %f2
 178
 179         bnz,a,pn        %icc, .Ldec_out_unaligned
 180         srl             $mask, $tmp, $mask
 181
 182         std             %f0, [$out + 0]
 183         retl
 184         std             %f2, [$out + 8]
 185
 186 .align  16
 187 .Ldec_out_unaligned:
 188         alignaddrl      $out, %g0, $out
 189         faligndata      %f0, %f0, %f4
 190         faligndata      %f0, %f2, %f6
 191         faligndata      %f2, %f2, %f8
 192
 193         stda            %f4, [$out + $mask]0xc0 ! partial store
 194         std             %f6, [$out + 8]
 195         add             $out, 16, $out
 196         orn             %g0, $mask, $mask
 197         stda            %f8, [$out + $mask]0xc0 ! partial store
 198         retl
 199         nop
 200 .type   aes_fx_decrypt,#function
 201 .size   aes_fx_decrypt,.-aes_fx_decrypt
 202 ___
 203 }
 204 {
 205 my ($inp,$bits,$out,$tmp,$inc) = map("%o$_",(0..5));
 206 $code.=<<___;
 207 .globl  aes_fx_set_decrypt_key
 208 .align  32
 209 aes_fx_set_decrypt_key:
 210         b               .Lset_encrypt_key
 211         mov             -1, $inc
 212         retl
 213         nop
 214 .type   aes_fx_set_decrypt_key,#function
 215 .size   aes_fx_set_decrypt_key,.-aes_fx_set_decrypt_key
 216
 217 .globl  aes_fx_set_encrypt_key
 218 .align  32
 219 aes_fx_set_encrypt_key:
 220         mov             1, $inc
 221         nop
 222 .Lset_encrypt_key:
 223         and             $inp, 7, $tmp
 224         andn            $inp, 7, $inp
 225
 226         cmp             $bits, 192
 227         ldd             [$inp + 0], %f0
 228         bl,pt           %icc, .L128
 229         ldd             [$inp + 8], %f2
 230
 231         be,pt           %icc, .L192
 232         ldd             [$inp + 16], %f4
 233         brz,pt          $tmp, .L256aligned
 234         ldd             [$inp + 24], %f6
 235
 236         ldd             [$inp + 32], %f8
 237         alignaddr       $inp, $tmp, %g0
 238         faligndata      %f0, %f2, %f0
 239         faligndata      %f2, %f4, %f2
 240         faligndata      %f4, %f6, %f4
 241         faligndata      %f6, %f8, %f6
 242
 243 .L256aligned:
 244         mov             14, $bits
 245         and             $inc, `14*16`, $tmp
 246         st              $bits, [$out + 240]     ! store rounds
 247         add             $out, $tmp, $out        ! start or end of key schedule
 248         sllx            $inc, 4, $inc           ! 16 or -16
 249 ___
 250 for ($i=0; $i<6; $i++) {
 251     $code.=<<___;
 252         std             %f0, [$out + 0]
 253         faeskeyx        %f6, `0x10+$i`, %f0
 254         std             %f2, [$out + 8]
 255         add             $out, $inc, $out
 256         faeskeyx        %f0, 0x00, %f2
 257         std             %f4, [$out + 0]
 258         faeskeyx        %f2, 0x01, %f4
 259         std             %f6, [$out + 8]
 260         add             $out, $inc, $out
 261         faeskeyx        %f4, 0x00, %f6
 262 ___
 263 }
 264 $code.=<<___;
 265         std             %f0, [$out + 0]
 266         faeskeyx        %f6, `0x10+$i`, %f0
 267         std             %f2, [$out + 8]
 268         add             $out, $inc, $out
 269         faeskeyx        %f0, 0x00, %f2
 270         std             %f4,[$out + 0]
 271         std             %f6,[$out + 8]
 272         add             $out, $inc, $out
 273         std             %f0,[$out + 0]
 274         std             %f2,[$out + 8]
 275         retl
 276         xor             %o0, %o0, %o0           ! return 0
 277
 278 .align  16
 279 .L192:
 280         brz,pt          $tmp, .L192aligned
 281         nop
 282
 283         ldd             [$inp + 24], %f6
 284         alignaddr       $inp, $tmp, %g0
 285         faligndata      %f0, %f2, %f0
 286         faligndata      %f2, %f4, %f2
 287         faligndata      %f4, %f6, %f4
 288
 289 .L192aligned:
 290         mov             12, $bits
 291         and             $inc, `12*16`, $tmp
 292         st              $bits, [$out + 240]     ! store rounds
 293         add             $out, $tmp, $out        ! start or end of key schedule
 294         sllx            $inc, 4, $inc           ! 16 or -16
 295 ___
 296 for ($i=0; $i<8; $i+=2) {
 297     $code.=<<___;
 298         std             %f0, [$out + 0]
 299         faeskeyx        %f4, `0x10+$i`, %f0
 300         std             %f2, [$out + 8]
 301         add             $out, $inc, $out
 302         faeskeyx        %f0, 0x00, %f2
 303         std             %f4, [$out + 0]
 304         faeskeyx        %f2, 0x00, %f4
 305         std             %f0, [$out + 8]
 306         add             $out, $inc, $out
 307         faeskeyx        %f4, `0x10+$i+1`, %f0
 308         std             %f2, [$out + 0]
 309         faeskeyx        %f0, 0x00, %f2
 310         std             %f4, [$out + 8]
 311         add             $out, $inc, $out
 312 ___
 313 $code.=<<___            if ($i<6);
 314         faeskeyx        %f2, 0x00, %f4
 315 ___
 316 }
 317 $code.=<<___;
 318         std             %f0, [$out + 0]
 319         std             %f2, [$out + 8]
 320         retl
 321         xor             %o0, %o0, %o0           ! return 0
 322
 323 .align  16
 324 .L128:
 325         brz,pt          $tmp, .L128aligned
 326         nop
 327
 328         ldd             [$inp + 16], %f4
 329         alignaddr       $inp, $tmp, %g0
 330         faligndata      %f0, %f2, %f0
 331         faligndata      %f2, %f4, %f2
 332
 333 .L128aligned:
 334         mov             10, $bits
 335         and             $inc, `10*16`, $tmp
 336         st              $bits, [$out + 240]     ! store rounds
 337         add             $out, $tmp, $out        ! start or end of key schedule
 338         sllx            $inc, 4, $inc           ! 16 or -16
 339 ___
 340 for ($i=0; $i<10; $i++) {
 341     $code.=<<___;
 342         std             %f0, [$out + 0]
 343         faeskeyx        %f2, `0x10+$i`, %f0
 344         std             %f2, [$out + 8]
 345         add             $out, $inc, $out
 346         faeskeyx        %f0, 0x00, %f2
 347 ___
 348 }
 349 $code.=<<___;
 350         std             %f0, [$out + 0]
 351         std             %f2, [$out + 8]
 352         retl
 353         xor             %o0, %o0, %o0           ! return 0
 354 .type   aes_fx_set_encrypt_key,#function
 355 .size   aes_fx_set_encrypt_key,.-aes_fx_set_encrypt_key
 356 ___
 357 }
 358 {
 359 my ($inp,$out,$len,$key,$ivp,$dir) = map("%i$_",(0..5));
 360 my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
 361 my ($out0,$out1,$iv0,$iv1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead)
 362    = map("%f$_",grep { !($_ & 1) } (16 .. 62));
 363 my ($ileft,$iright) = ($ialign,$oalign);
 364
 365 $code.=<<___;
 366 .globl  aes_fx_cbc_encrypt
 367 .align  32
 368 aes_fx_cbc_encrypt:
 369         save            %sp, -STACK_FRAME-16, %sp
 370         andncc          $len, 15, $len
 371         bz,pn           SIZE_T_CC, .Lcbc_no_data
 372         and             $inp, 7, $ialign
 373
 374         andn            $inp, 7, $inp
 375         ld              [$key + 240], $rounds
 376         and             $out, 7, $oalign
 377         ld              [$ivp + 0], %f0         ! load ivec
 378         ld              [$ivp + 4], %f1
 379         ld              [$ivp + 8], %f2
 380         ld              [$ivp + 12], %f3
 381
 382         sll             $rounds, 4, $rounds
 383         add             $rounds, $key, $end
 384         ldd             [$key + 0], $r0hi       ! round[0]
 385         ldd             [$key + 8], $r0lo
 386
 387         add             $inp, 16, $inp
 388         sub             $len, 16, $len
 389         ldd             [$end + 0], $rlhi       ! round[last]
 390         ldd             [$end + 8], $rllo
 391
 392         mov             16, $inc
 393         movrz           $len, 0, $inc
 394         ldd             [$key + 16], %f10       ! round[1]
 395         ldd             [$key + 24], %f12
 396
 397         ldd             [$inp - 16], $in0       ! load input
 398         ldd             [$inp -  8], $in1
 399         ldda            [$inp]0x82, $intail     ! non-faulting load
 400         brz             $dir, .Lcbc_decrypt
 401         add             $inp, $inc, $inp        ! inp+=16
 402
 403         fxor            $r0hi, %f0, %f0         ! ivec^=round[0]
 404         fxor            $r0lo, %f2, %f2
 405         alignaddr       $inp, $ialign, %g0
 406         faligndata      $in0, $in1, $in0
 407         faligndata      $in1, $intail, $in1
 408         fxor            $r0hi, $rlhi, $rlhi     ! round[last]^=round[0]
 409         fxor            $r0lo, $rllo, $rllo
 410
 411 .Loop_cbc_enc:
 412         fxor            $in0, %f0, %f0          ! inp^ivec^round[0]
 413         fxor            $in1, %f2, %f2
 414         ldd             [$key + 32], %f6        ! round[2]
 415         ldd             [$key + 40], %f8
 416         add             $key, 32, $end
 417         sub             $rounds, 16*6, $inner
 418
 419 .Lcbc_enc:
 420         fmovd           %f0, %f4
 421         faesencx        %f2, %f10, %f0
 422         faesencx        %f4, %f12, %f2
 423         ldd             [$end + 16], %f10
 424         ldd             [$end + 24], %f12
 425         add             $end, 32, $end
 426
 427         fmovd           %f0, %f4
 428         faesencx        %f2, %f6, %f0
 429         faesencx        %f4, %f8, %f2
 430         ldd             [$end + 0], %f6
 431         ldd             [$end + 8], %f8
 432
 433         brnz,a          $inner, .Lcbc_enc
 434         sub             $inner, 16*2, $inner
 435
 436         fmovd           %f0, %f4
 437         faesencx        %f2, %f10, %f0
 438         faesencx        %f4, %f12, %f2
 439         ldd             [$end + 16], %f10       ! round[last-1]
 440         ldd             [$end + 24], %f12
 441
 442         fmovd           %f0, %f4
 443         faesencx        %f2, %f6, %f0
 444         faesencx        %f4, %f8, %f2
 445
 446         movrz           $len, 0, $inc
 447         fmovd           $intail, $in0
 448         ldd             [$inp - 8], $in1        ! load next input block
 449         ldda            [$inp]0x82, $intail     ! non-faulting load
 450         add             $inp, $inc, $inp        ! inp+=16
 451
 452         fmovd           %f0, %f4
 453         faesencx        %f2, %f10, %f0
 454         faesencx        %f4, %f12, %f2
 455         ldd             [$key + 16], %f10       ! round[1]
 456         ldd             [$key + 24], %f12
 457
 458         faligndata      $in0, $in1, $in0
 459         faligndata      $in1, $intail, $in1
 460
 461         fmovd           %f0, %f4
 462         faesenclx       %f2, $rlhi, %f0         ! result is out^round[0]
 463         faesenclx       %f4, $rllo, %f2
 464
 465         fxor            %f0, $r0hi, $out0       ! out^round[0]^round[0]
 466         brnz,pn         $oalign, .Lcbc_enc_unaligned_out
 467         fxor            %f2, $r0lo, $out1
 468
 469         std             $out0, [$out + 0]
 470         std             $out1, [$out + 8]
 471         add             $out, 16, $out
 472
 473         brnz,a          $len, .Loop_cbc_enc
 474         sub             $len, 16, $len
 475
 476         st              $out0,    [$ivp + 0]    ! output ivec
 477         st              $out0#lo, [$ivp + 4]
 478         st              $out1,    [$ivp + 8]
 479         st              $out1#lo, [$ivp + 12]
 480
 481 .Lcbc_no_data:
 482         ret
 483         restore
 484
 485 .align  32
 486 .Lcbc_enc_unaligned_out:
 487         alignaddrl      $out, %g0, $out
 488         mov             0xff, $mask
 489         sll             $ialign, 3, $ileft
 490         srl             $mask, $oalign, $mask
 491         sub             %g0, $ileft, $iright
 492
 493         faligndata      $out0, $out0, %f6
 494         faligndata      $out0, $out1, %f8
 495
 496         stda            %f6, [$out + $mask]0xc0 ! partial store
 497         std             %f8, [$out + 8]
 498         add             $out, 16, $out
 499         brz             $len, .Lcbc_enc_unaligned_out_done
 500         orn             %g0, $mask, $mask
 501
 502 .Loop_cbc_enc_unaligned_out:
 503         fxor            $in0, %f0, %f0          ! inp^ivec^round[0]
 504         fxor            $in1, %f2, %f2
 505         ldd             [$key + 32], %f6        ! round[2]
 506         ldd             [$key + 40], %f8
 507
 508         fmovd           %f0, %f4
 509         faesencx        %f2, %f10, %f0
 510         faesencx        %f4, %f12, %f2
 511         ldd             [$key + 48], %f10       ! round[3]
 512         ldd             [$key + 56], %f12
 513
 514         ldx             [$inp - 16], %o0
 515         ldx             [$inp -  8], %o1
 516         brz             $ialign, .Lcbc_enc_aligned_inp
 517         movrz           $len, 0, $inc
 518
 519         ldx             [$inp], %o2
 520         sllx            %o0, $ileft, %o0
 521         srlx            %o1, $iright, %g1
 522         sllx            %o1, $ileft, %o1
 523         or              %g1, %o0, %o0
 524         srlx            %o2, $iright, %o2
 525         or              %o2, %o1, %o1
 526
 527 .Lcbc_enc_aligned_inp:
 528         fmovd           %f0, %f4
 529         faesencx        %f2, %f6, %f0
 530         faesencx        %f4, %f8, %f2
 531         ldd             [$key + 64], %f6        ! round[4]
 532         ldd             [$key + 72], %f8
 533         add             $key, 64, $end
 534         sub             $rounds, 16*8, $inner
 535
 536         stx             %o0, [%sp + LOCALS + 0]
 537         stx             %o1, [%sp + LOCALS + 8]
 538         add             $inp, $inc, $inp        ! inp+=16
 539
 540 .Lcbc_enc_unaligned:
 541         fmovd           %f0, %f4
 542         faesencx        %f2, %f10, %f0
 543         faesencx        %f4, %f12, %f2
 544         ldd             [$end + 16], %f10
 545         ldd             [$end + 24], %f12
 546         add             $end, 32, $end
 547
 548         fmovd           %f0, %f4
 549         faesencx        %f2, %f6, %f0
 550         faesencx        %f4, %f8, %f2
 551         ldd             [$end + 0], %f6
 552         ldd             [$end + 8], %f8
 553
 554         brnz,a          $inner, .Lcbc_enc_unaligned
 555         sub             $inner, 16*2, $inner
 556
 557         fmovd           %f0, %f4
 558         faesencx        %f2, %f10, %f0
 559         faesencx        %f4, %f12, %f2
 560         ldd             [$end + 16], %f10       ! round[last-1]
 561         ldd             [$end + 24], %f12
 562
 563         fmovd           %f0, %f4
 564         faesencx        %f2, %f6, %f0
 565         faesencx        %f4, %f8, %f2
 566         ldd             [%sp + LOCALS + 0], $in0
 567         ldd             [%sp + LOCALS + 8], $in1
 568
 569         fmovd           %f0, %f4
 570         faesencx        %f2, %f10, %f0
 571         faesencx        %f4, %f12, %f2
 572         ldd             [$key + 16], %f10       ! round[1]
 573         ldd             [$key + 24], %f12
 574
 575         fmovd           %f0, %f4
 576         faesenclx       %f2, $rlhi, %f0         ! result is out^round[0]
 577         faesenclx       %f4, $rllo, %f2
 578
 579         fmovd           $out1, $outhead
 580         fxor            %f0, $r0hi, $out0       ! out^round[0]^round[0]
 581         fxor            %f2, $r0lo, $out1
 582
 583         faligndata      $outhead, $out0, %f6
 584         faligndata      $out0, $out1, %f8
 585         std             %f6, [$out + 0]
 586         std             %f8, [$out + 8]
 587         add             $out, 16, $out
 588
 589         brnz,a          $len, .Loop_cbc_enc_unaligned_out
 590         sub             $len, 16, $len
 591
 592 .Lcbc_enc_unaligned_out_done:
 593         faligndata      $out1, $out1, %f8
 594         stda            %f8, [$out + $mask]0xc0 ! partial store
 595
 596         st              $out0,    [$ivp + 0]    ! output ivec
 597         st              $out0#lo, [$ivp + 4]
 598         st              $out1,    [$ivp + 8]
 599         st              $out1#lo, [$ivp + 12]
 600
 601         ret
 602         restore
 603
 604 .align  32
 605 .Lcbc_decrypt:
 606         alignaddr       $inp, $ialign, %g0
 607         faligndata      $in0, $in1, $in0
 608         faligndata      $in1, $intail, $in1
 609         fmovd           %f0, $iv0
 610         fmovd           %f2, $iv1
 611
 612 .Loop_cbc_dec:
 613         fxor            $in0, $r0hi, %f0        ! inp^round[0]
 614         fxor            $in1, $r0lo, %f2
 615         ldd             [$key + 32], %f6        ! round[2]
 616         ldd             [$key + 40], %f8
 617         add             $key, 32, $end
 618         sub             $rounds, 16*6, $inner
 619
 620 .Lcbc_dec:
 621         fmovd           %f0, %f4
 622         faesdecx        %f2, %f10, %f0
 623         faesdecx        %f4, %f12, %f2
 624         ldd             [$end + 16], %f10
 625         ldd             [$end + 24], %f12
 626         add             $end, 32, $end
 627
 628         fmovd           %f0, %f4
 629         faesdecx        %f2, %f6, %f0
 630         faesdecx        %f4, %f8, %f2
 631         ldd             [$end + 0], %f6
 632         ldd             [$end + 8], %f8
 633
 634         brnz,a          $inner, .Lcbc_dec
 635         sub             $inner, 16*2, $inner
 636
 637         fmovd           %f0, %f4
 638         faesdecx        %f2, %f10, %f0
 639         faesdecx        %f4, %f12, %f2
 640         ldd             [$end + 16], %f10       ! round[last-1]
 641         ldd             [$end + 24], %f12
 642
 643         fmovd           %f0, %f4
 644         faesdecx        %f2, %f6, %f0
 645         faesdecx        %f4, %f8, %f2
 646         fxor            $iv0, $rlhi, %f6        ! ivec^round[last]
 647         fxor            $iv1, $rllo, %f8
 648         fmovd           $in0, $iv0
 649         fmovd           $in1, $iv1
 650
 651         movrz           $len, 0, $inc
 652         fmovd           $intail, $in0
 653         ldd             [$inp - 8], $in1        ! load next input block
 654         ldda            [$inp]0x82, $intail     ! non-faulting load
 655         add             $inp, $inc, $inp        ! inp+=16
 656
 657         fmovd           %f0, %f4
 658         faesdecx        %f2, %f10, %f0
 659         faesdecx        %f4, %f12, %f2
 660         ldd             [$key + 16], %f10       ! round[1]
 661         ldd             [$key + 24], %f12
 662
 663         faligndata      $in0, $in1, $in0
 664         faligndata      $in1, $intail, $in1
 665
 666         fmovd           %f0, %f4
 667         faesdeclx       %f2, %f6, %f0
 668         faesdeclx       %f4, %f8, %f2
 669
 670         brnz,pn         $oalign, .Lcbc_dec_unaligned_out
 671         nop
 672
 673         std             %f0, [$out + 0]
 674         std             %f2, [$out + 8]
 675         add             $out, 16, $out
 676
 677         brnz,a          $len, .Loop_cbc_dec
 678         sub             $len, 16, $len
 679
 680         st              $iv0,    [$ivp + 0]     ! output ivec
 681         st              $iv0#lo, [$ivp + 4]
 682         st              $iv1,    [$ivp + 8]
 683         st              $iv1#lo, [$ivp + 12]
 684
 685         ret
 686         restore
 687
 688 .align  32
 689 .Lcbc_dec_unaligned_out:
 690         alignaddrl      $out, %g0, $out
 691         mov             0xff, $mask
 692         sll             $ialign, 3, $ileft
 693         srl             $mask, $oalign, $mask
 694         sub             %g0, $ileft, $iright
 695
 696         faligndata      %f0, %f0, $out0
 697         faligndata      %f0, %f2, $out1
 698
 699         stda            $out0, [$out + $mask]0xc0       ! partial store
 700         std             $out1, [$out + 8]
 701         add             $out, 16, $out
 702         brz             $len, .Lcbc_dec_unaligned_out_done
 703         orn             %g0, $mask, $mask
 704
 705 .Loop_cbc_dec_unaligned_out:
 706         fmovd           %f2, $outhead
 707         fxor            $in0, $r0hi, %f0        ! inp^round[0]
 708         fxor            $in1, $r0lo, %f2
 709         ldd             [$key + 32], %f6        ! round[2]
 710         ldd             [$key + 40], %f8
 711
 712         fmovd           %f0, %f4
 713         faesdecx        %f2, %f10, %f0
 714         faesdecx        %f4, %f12, %f2
 715         ldd             [$key + 48], %f10       ! round[3]
 716         ldd             [$key + 56], %f12
 717
 718         ldx             [$inp - 16], %o0
 719         ldx             [$inp - 8], %o1
 720         brz             $ialign, .Lcbc_dec_aligned_inp
 721         movrz           $len, 0, $inc
 722
 723         ldx             [$inp], %o2
 724         sllx            %o0, $ileft, %o0
 725         srlx            %o1, $iright, %g1
 726         sllx            %o1, $ileft, %o1
 727         or              %g1, %o0, %o0
 728         srlx            %o2, $iright, %o2
 729         or              %o2, %o1, %o1
 730
 731 .Lcbc_dec_aligned_inp:
 732         fmovd           %f0, %f4
 733         faesdecx        %f2, %f6, %f0
 734         faesdecx        %f4, %f8, %f2
 735         ldd             [$key + 64], %f6        ! round[4]
 736         ldd             [$key + 72], %f8
 737         add             $key, 64, $end
 738         sub             $rounds, 16*8, $inner
 739
 740         stx             %o0, [%sp + LOCALS + 0]
 741         stx             %o1, [%sp + LOCALS + 8]
 742         add             $inp, $inc, $inp        ! inp+=16
 743
 744 .Lcbc_dec_unaligned:
 745         fmovd           %f0, %f4
 746         faesdecx        %f2, %f10, %f0
 747         faesdecx        %f4, %f12, %f2
 748         ldd             [$end + 16], %f10
 749         ldd             [$end + 24], %f12
 750         add             $end, 32, $end
 751
 752         fmovd           %f0, %f4
 753         faesdecx        %f2, %f6, %f0
 754         faesdecx        %f4, %f8, %f2
 755         ldd             [$end + 0], %f6
 756         ldd             [$end + 8], %f8
 757
 758         brnz,a          $inner, .Lcbc_dec_unaligned
 759         sub             $inner, 16*2, $inner
 760
 761         fmovd           %f0, %f4
 762         faesdecx        %f2, %f10, %f0
 763         faesdecx        %f4, %f12, %f2
 764         ldd             [$end + 16], %f10       ! round[last-1]
 765         ldd             [$end + 24], %f12
 766
 767         fmovd           %f0, %f4
 768         faesdecx        %f2, %f6, %f0
 769         faesdecx        %f4, %f8, %f2
 770         fxor            $iv0, $rlhi, %f6        ! ivec^round[last]
 771         fxor            $iv1, $rllo, %f8
 772         fmovd           $in0, $iv0
 773         fmovd           $in1, $iv1
 774
 775         fmovd           %f0, %f4
 776         faesdecx        %f2, %f10, %f0
 777         faesdecx        %f4, %f12, %f2
 778         ldd             [$key + 16], %f10       ! round[1]
 779         ldd             [$key + 24], %f12
 780
 781         fmovd           %f0, %f4
 782         faesdeclx       %f2, %f6, %f0
 783         faesdeclx       %f4, %f8, %f2
 784         ldd             [%sp + LOCALS + 0], $in0
 785         ldd             [%sp + LOCALS + 8], $in1
 786
 787         faligndata      $outhead, %f0, $out0
 788         faligndata      %f0, %f2, $out1
 789         std             $out0, [$out + 0]
 790         std             $out1, [$out + 8]
 791         add             $out, 16, $out
 792
 793         brnz,a          $len, .Loop_cbc_dec_unaligned_out
 794         sub             $len, 16, $len
 795
 796 .Lcbc_dec_unaligned_out_done:
 797         faligndata      %f2, %f2, %f8
 798         stda            %f8, [$out + $mask]0xc0 ! partial store
 799
 800         st              $iv0,    [$ivp + 0]     ! output ivec
 801         st              $iv0#lo, [$ivp + 4]
 802         st              $iv1,    [$ivp + 8]
 803         st              $iv1#lo, [$ivp + 12]
 804
 805         ret
 806         restore
 807 .type   aes_fx_cbc_encrypt,#function
 808 .size   aes_fx_cbc_encrypt,.-aes_fx_cbc_encrypt
 809 ___
 810 }
 811 {
 812 my ($inp,$out,$len,$key,$ivp) = map("%i$_",(0..5));
 813 my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
 814 my ($out0,$out1,$ctr0,$ctr1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead)
 815    = map("%f$_",grep { !($_ & 1) } (16 .. 62));
 816 my ($ileft,$iright) = ($ialign, $oalign);
 817 my $one = "%f14";
 818
 819 $code.=<<___;
 820 .globl  aes_fx_ctr32_encrypt_blocks
 821 .align  32
 822 aes_fx_ctr32_encrypt_blocks:
 823         save            %sp, -STACK_FRAME-16, %sp
 824         and             $inp, 7, $ialign
 825         brz,pn          $len, .Lctr32_no_data
 826         andn            $inp, 7, $inp
 827
 828 .Lpic:  call            .+8
 829         add             %o7, .Lone - .Lpic, %o0
 830
 831         ld              [$key + 240], $rounds
 832         and             $out, 7, $oalign
 833         ld              [$ivp +  0], $ctr0      ! load counter
 834         ld              [$ivp +  4], $ctr0#lo
 835         ld              [$ivp +  8], $ctr1
 836         ld              [$ivp + 12], $ctr1#lo
 837         ldd             [%o0], $one
 838
 839         sll             $rounds, 4, $rounds
 840         add             $rounds, $key, $end
 841         ldd             [$key + 0], $r0hi       ! round[0]
 842         ldd             [$key + 8], $r0lo
 843
 844         add             $inp, 16, $inp
 845         sub             $len, 1, $len
 846         ldd             [$key + 16], %f10       ! round[1]
 847         ldd             [$key + 24], %f12
 848
 849         mov             16, $inc
 850         movrz           $len, 0, $inc
 851         ldd             [$end + 0], $rlhi       ! round[last]
 852         ldd             [$end + 8], $rllo
 853
 854         ldd             [$inp - 16], $in0       ! load input
 855         ldd             [$inp -  8], $in1
 856         ldda            [$inp]0x82, $intail     ! non-faulting load
 857         add             $inp, $inc, $inp        ! inp+=16
 858
 859         alignaddr       $inp, $ialign, %g0
 860         faligndata      $in0, $in1, $in0
 861         faligndata      $in1, $intail, $in1
 862
 863 .Loop_ctr32:
 864         fxor            $ctr0, $r0hi, %f0       ! counter^round[0]
 865         fxor            $ctr1, $r0lo, %f2
 866         ldd             [$key + 32], %f6        ! round[2]
 867         ldd             [$key + 40], %f8
 868         add             $key, 32, $end
 869         sub             $rounds, 16*6, $inner
 870
 871 .Lctr32_enc:
 872         fmovd           %f0, %f4
 873         faesencx        %f2, %f10, %f0
 874         faesencx        %f4, %f12, %f2
 875         ldd             [$end + 16], %f10
 876         ldd             [$end + 24], %f12
 877         add             $end, 32, $end
 878
 879         fmovd           %f0, %f4
 880         faesencx        %f2, %f6, %f0
 881         faesencx        %f4, %f8, %f2
 882         ldd             [$end + 0], %f6
 883         ldd             [$end + 8], %f8
 884
 885         brnz,a          $inner, .Lctr32_enc
 886         sub             $inner, 16*2, $inner
 887
 888         fmovd           %f0, %f4
 889         faesencx        %f2, %f10, %f0
 890         faesencx        %f4, %f12, %f2
 891         ldd             [$end + 16], %f10       ! round[last-1]
 892         ldd             [$end + 24], %f12
 893
 894         fmovd           %f0, %f4
 895         faesencx        %f2, %f6, %f0
 896         faesencx        %f4, %f8, %f2
 897         fxor            $in0, $rlhi, %f6        ! inp^round[last]
 898         fxor            $in1, $rllo, %f8
 899
 900         movrz           $len, 0, $inc
 901         fmovd           $intail, $in0
 902         ldd             [$inp - 8], $in1        ! load next input block
 903         ldda            [$inp]0x82, $intail     ! non-faulting load
 904         add             $inp, $inc, $inp        ! inp+=16
 905
 906         fmovd           %f0, %f4
 907         faesencx        %f2, %f10, %f0
 908         faesencx        %f4, %f12, %f2
 909         ldd             [$key + 16], %f10       ! round[1]
 910         ldd             [$key + 24], %f12
 911
 912         faligndata      $in0, $in1, $in0
 913         faligndata      $in1, $intail, $in1
 914         fpadd32         $ctr1, $one, $ctr1      ! increment counter
 915
 916         fmovd           %f0, %f4
 917         faesenclx       %f2, %f6, %f0
 918         faesenclx       %f4, %f8, %f2
 919
 920         brnz,pn         $oalign, .Lctr32_unaligned_out
 921         nop
 922
 923         std             %f0, [$out + 0]
 924         std             %f2, [$out + 8]
 925         add             $out, 16, $out
 926
 927         brnz,a          $len, .Loop_ctr32
 928         sub             $len, 1, $len
 929
 930 .Lctr32_no_data:
 931         ret
 932         restore
 933
 934 .align  32
 935 .Lctr32_unaligned_out:
 936         alignaddrl      $out, %g0, $out
 937         mov             0xff, $mask
 938         sll             $ialign, 3, $ileft
 939         srl             $mask, $oalign, $mask
 940         sub             %g0, $ileft, $iright
 941
 942         faligndata      %f0, %f0, $out0
 943         faligndata      %f0, %f2, $out1
 944
 945         stda            $out0, [$out + $mask]0xc0       ! partial store
 946         std             $out1, [$out + 8]
 947         add             $out, 16, $out
 948         brz             $len, .Lctr32_unaligned_out_done
 949         orn             %g0, $mask, $mask
 950
 951 .Loop_ctr32_unaligned_out:
 952         fmovd           %f2, $outhead
 953         fxor            $ctr0, $r0hi, %f0       ! counter^round[0]
 954         fxor            $ctr1, $r0lo, %f2
 955         ldd             [$key + 32], %f6        ! round[2]
 956         ldd             [$key + 40], %f8
 957
 958         fmovd           %f0, %f4
 959         faesencx        %f2, %f10, %f0
 960         faesencx        %f4, %f12, %f2
 961         ldd             [$key + 48], %f10       ! round[3]
 962         ldd             [$key + 56], %f12
 963
 964         ldx             [$inp - 16], %o0
 965         ldx             [$inp -  8], %o1
 966         brz             $ialign, .Lctr32_aligned_inp
 967         movrz           $len, 0, $inc
 968
 969         ldx             [$inp], %o2
 970         sllx            %o0, $ileft, %o0
 971         srlx            %o1, $iright, %g1
 972         sllx            %o1, $ileft, %o1
 973         or              %g1, %o0, %o0
 974         srlx            %o2, $iright, %o2
 975         or              %o2, %o1, %o1
 976
 977 .Lctr32_aligned_inp:
 978         fmovd           %f0, %f4
 979         faesencx        %f2, %f6, %f0
 980         faesencx        %f4, %f8, %f2
 981         ldd             [$key + 64], %f6        ! round[4]
 982         ldd             [$key + 72], %f8
 983         add             $key, 64, $end
 984         sub             $rounds, 16*8, $inner
 985
 986         stx             %o0, [%sp + LOCALS + 0]
 987         stx             %o1, [%sp + LOCALS + 8]
 988         add             $inp, $inc, $inp        ! inp+=16
 989
 990 .Lctr32_enc_unaligned:
 991         fmovd           %f0, %f4
 992         faesencx        %f2, %f10, %f0
 993         faesencx        %f4, %f12, %f2
 994         ldd             [$end + 16], %f10
 995         ldd             [$end + 24], %f12
 996         add             $end, 32, $end
 997
 998         fmovd           %f0, %f4
 999         faesencx        %f2, %f6, %f0
1000         faesencx        %f4, %f8, %f2
1001         ldd             [$end + 0], %f6
1002         ldd             [$end + 8], %f8
1003
1004         brnz,a          $inner, .Lctr32_enc_unaligned
1005         sub             $inner, 16*2, $inner
1006
1007         fmovd           %f0, %f4
1008         faesencx        %f2, %f10, %f0
1009         faesencx        %f4, %f12, %f2
1010         ldd             [$end + 16], %f10       ! round[last-1]
1011         ldd             [$end + 24], %f12
1012         fpadd32         $ctr1, $one, $ctr1      ! increment counter
1013
1014         fmovd           %f0, %f4
1015         faesencx        %f2, %f6, %f0
1016         faesencx        %f4, %f8, %f2
1017         fxor            $in0, $rlhi, %f6        ! inp^round[last]
1018         fxor            $in1, $rllo, %f8
1019         ldd             [%sp + LOCALS + 0], $in0
1020         ldd             [%sp + LOCALS + 8], $in1
1021
1022         fmovd           %f0, %f4
1023         faesencx        %f2, %f10, %f0
1024         faesencx        %f4, %f12, %f2
1025         ldd             [$key + 16], %f10       ! round[1]
1026         ldd             [$key + 24], %f12
1027
1028         fmovd           %f0, %f4
1029         faesenclx       %f2, %f6, %f0
1030         faesenclx       %f4, %f8, %f2
1031
1032         faligndata      $outhead, %f0, $out0
1033         faligndata      %f0, %f2, $out1
1034         std             $out0, [$out + 0]
1035         std             $out1, [$out + 8]
1036         add             $out, 16, $out
1037
1038         brnz,a          $len, .Loop_ctr32_unaligned_out
1039         sub             $len, 1, $len
1040
1041 .Lctr32_unaligned_out_done:
1042         faligndata      %f2, %f2, %f8
1043         stda            %f8, [$out + $mask]0xc0 ! partial store
1044
1045         ret
1046         restore
1047 .type   aes_fx_ctr32_encrypt_blocks,#function
1048 .size   aes_fx_ctr32_encrypt_blocks,.-aes_fx_ctr32_encrypt_blocks
1049 .align  32
1050 .Lone:
1051         .word   0, 1
1052 .asciz  "AES for Fujitsu SPARC64 X, CRYPTOGAMS by <appro\@openssl.org>"
1053 .align  4
1054 ___
1055 }
1056 # Purpose of these subroutines is to explicitly encode VIS instructions,
1057 # so that one can compile the module without having to specify VIS
1058 # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
1059 # Idea is to reserve for option to produce "universal" binary and let
1060 # programmer detect if current CPU is VIS capable at run-time.
1061 sub unvis {
1062 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1063 my ($ref,$opf);
1064 my %visopf = (  "faligndata"    => 0x048,
1065                 "bshuffle"      => 0x04c,
1066                 "fpadd32"       => 0x052,
1067                 "fxor"          => 0x06c,
1068                 "fsrc2"         => 0x078        );
1069
1070     $ref = "$mnemonic\t$rs1,$rs2,$rd";
1071
1072     if ($opf=$visopf{$mnemonic}) {
1073         foreach ($rs1,$rs2,$rd) {
1074             return $ref if (!/%f([0-9]{1,2})/);
1075             $_=$1;
1076             if ($1>=32) {
1077                 return $ref if ($1&1);
1078                 # re-encode for upper double register addressing
1079                 $_=($1|$1>>5)&31;
1080             }
1081         }
1082
1083         return  sprintf ".word\t0x%08x !%s",
1084                         0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
1085                         $ref;
1086     } else {
1087         return $ref;
1088     }
1089 }
1090
1091 sub unvis3 {
1092 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1093 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
1094 my ($ref,$opf);
1095 my %visopf = (  "alignaddr"     => 0x018,
1096                 "bmask"         => 0x019,
1097                 "alignaddrl"    => 0x01a        );
1098
1099     $ref = "$mnemonic\t$rs1,$rs2,$rd";
1100
1101     if ($opf=$visopf{$mnemonic}) {
1102         foreach ($rs1,$rs2,$rd) {
1103             return $ref if (!/%([goli])([0-9])/);
1104             $_=$bias{$1}+$2;
1105         }
1106
1107         return  sprintf ".word\t0x%08x !%s",
1108                         0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
1109                         $ref;
1110     } else {
1111         return $ref;
1112     }
1113 }
1114
1115 sub unfx {
1116 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1117 my ($ref,$opf);
1118 my %aesopf = (  "faesencx"      => 0x90,
1119                 "faesdecx"      => 0x91,
1120                 "faesenclx"     => 0x92,
1121                 "faesdeclx"     => 0x93,
1122                 "faeskeyx"      => 0x94 );
1123
1124     $ref = "$mnemonic\t$rs1,$rs2,$rd";
1125
1126     if (defined($opf=$aesopf{$mnemonic})) {
1127         $rs2 = ($rs2 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs2;
1128         $rs2 = oct($rs2) if ($rs2 =~ /^0/);
1129
1130         foreach ($rs1,$rd) {
1131             return $ref if (!/%f([0-9]{1,2})/);
1132             $_=$1;
1133             if ($1>=32) {
1134                 return $ref if ($1&1);
1135                 # re-encode for upper double register addressing
1136                 $_=($1|$1>>5)&31;
1137             }
1138         }
1139
1140         return  sprintf ".word\t0x%08x !%s",
1141                         2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
1142                         $ref;
1143     } else {
1144         return $ref;
1145     }
1146 }
1147
1148 foreach (split("\n",$code)) {
1149     s/\`([^\`]*)\`/eval $1/ge;
1150
1151     s/%f([0-9]+)#lo/sprintf "%%f%d",$1+1/ge;
1152
1153     s/\b(faes[^x]{3,4}x)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
1154                 &unfx($1,$2,$3,$4,$5)
1155      /ge or
1156     s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1157                 &unvis($1,$2,$3,$4)
1158      /ge or
1159     s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
1160                 &unvis3($1,$2,$3,$4)
1161      /ge;
1162     print $_,"\n";
1163 }
1164
1165 close STDOUT;