2 # Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # Specific modes implementations for SPARC Architecture 2011. There
11 # is T4 dependency though, an ASI value that is not specified in the
12 # Architecture Manual. But as SPARC universe is rather monocultural,
13 # we imply that processor capable of executing crypto instructions
14 # can handle the ASI in question as well. This means that we ought to
15 # keep eyes open when new processors emerge...
17 # As for above mentioned ASI. It's so called "block initializing
18 # store" which cancels "read" in "read-update-write" on cache lines.
19 # This is "cooperative" optimization, as it reduces overall pressure
20 # on memory interface. Benefits can't be observed/quantified with
21 # usual benchmarks, on the contrary you can notice that single-thread
22 # performance for parallelizable modes is ~1.5% worse for largest
23 # block sizes [though few percent better for not so long ones]. All
24 # this based on suggestions from David Miller.
# Register-window frame size and the condition-code register used for
# size_t-width comparisons.  These begin as textual placeholders and
# are replaced with ABI-specific values by asm_init() below.
27 $::frame="STACK_FRAME";
28 $::size_t_cc="SIZE_T_CC";
# Initialize global perlasm parameters from the compiler flags passed
# in @ARGV: sets $::abibits to 64 when -m64 or -xarch=v9 is present,
# then derives $::bias (SPARC V9 stack bias), $::frame (minimal
# register save-area size) and $::size_t_cc (%xcc vs %icc, the
# condition-code register matching size_t width).
30 sub asm_init { # to be called with @ARGV as argument
31 for (@_) { $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
# NOTE(review): $::abibits may be undefined here when no 64-bit flag was
# seen; the == comparison relies on undef != 64 -- confirm warnings-clean.
32 if ($::abibits==64) { $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
33 else { $::bias=0; $::frame=112; $::size_t_cc="%icc"; }
# Symbolic register names shared by the code generators below: the
# routine arguments arrive in input registers %i0..%i5, scratch values
# live in local registers %l0..%l7.  List assignment silently drops the
# surplus values produced by map() when there are fewer names.
37 my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
39 my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
# Generate a CBC-encrypt entry point, ${alg}${bits}_t4_cbc_encrypt, by
# appending SPARC T4 assembly text to $::code.  The emitted routine:
#  - loads the IV directly in the EVP flavour, or via alignaddr/
#    faligndata in the non-EVP flavour (the IV may be unaligned there);
#  - runs a one-block-per-iteration loop calling
#    _${alg}${bits}_encrypt_1x, XORing the previous ciphertext/IV in
#    %f12/%f14 into the input (`! ^= ivec');
#  - uses partial stores (ASI 0xc0) for unaligned output, and a
#    separate .L*cbc_enc_blk path with block-initializing stores
#    (ASI 0xe2, T4-specific) when the output is aligned, the length is
#    large enough and input/output do not overlap -- see the
#    movrnz/movleu/brnz predicate and its `! if (...)' comments.
# NOTE(review): heredoc terminators and some interleaved Perl lines are
# outside this excerpt; the assembly lines below are string data
# appended to $::code, not Perl code.
41 sub alg_cbc_encrypt_implement {
45 .globl ${alg}${bits}_t4_cbc_encrypt
47 ${alg}${bits}_t4_cbc_encrypt:
48 save %sp, -$::frame, %sp
50 be,pn $::size_t_cc, .L${bits}_cbc_enc_abort
51 sub $inp, $out, $blk_init ! $inp!=$out
53 $::code.=<<___ if (!$::evp);
54 andcc $ivec, 7, $ivoff
55 alignaddr $ivec, %g0, $ivec
57 ldd [$ivec + 0], %f0 ! load ivec
61 faligndata %f0, %f2, %f0
62 faligndata %f2, %f4, %f2
65 $::code.=<<___ if ($::evp);
73 prefetch [$inp + 63], 20
74 call _${alg}${bits}_load_enckey
80 sub $iright, $ileft, $iright
83 movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
84 movleu $::size_t_cc, 0, $blk_init ! $len<128 ||
85 brnz,pn $blk_init, .L${bits}cbc_enc_blk ! $inp==$out)
86 srl $omask, $ooff, $omask
88 alignaddrl $out, %g0, $out
92 .L${bits}_cbc_enc_loop:
99 srlx %o1, $iright, %g1
100 sllx %o1, $ileft, %o1
102 srlx %o2, $iright, %o2
105 xor %g4, %o0, %o0 ! ^= rk[0]
110 fxor %f12, %f0, %f0 ! ^= ivec
112 prefetch [$out + 63], 22
113 prefetch [$inp + 16+63], 20
114 call _${alg}${bits}_encrypt_1x
122 brnz,pt $len, .L${bits}_cbc_enc_loop
125 $::code.=<<___ if ($::evp);
131 $::code.=<<___ if (!$::evp);
135 std %f0, [$ivec + 0] ! write out ivec
139 .L${bits}_cbc_enc_abort:
144 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
145 ! and ~3x deterioration
147 faligndata %f0, %f0, %f4 ! handle unaligned output
148 faligndata %f0, %f2, %f6
149 faligndata %f2, %f2, %f8
151 stda %f4, [$out + $omask]0xc0 ! partial store
154 orn %g0, $omask, $omask
155 stda %f8, [$out + $omask]0xc0 ! partial store
157 brnz,pt $len, .L${bits}_cbc_enc_loop+4
158 orn %g0, $omask, $omask
160 $::code.=<<___ if ($::evp);
166 $::code.=<<___ if (!$::evp);
170 std %f0, [$ivec + 0] ! write out ivec
176 3: alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
178 srl $omask, $ivoff, $omask
179 faligndata %f0, %f0, %f4
180 faligndata %f0, %f2, %f6
181 faligndata %f2, %f2, %f8
182 stda %f4, [$ivec + $omask]0xc0
185 orn %g0, $omask, $omask
186 stda %f8, [$ivec + $omask]0xc0
192 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
194 .L${bits}cbc_enc_blk:
195 add $out, $len, $blk_init
196 and $blk_init, 63, $blk_init ! tail
197 sub $len, $blk_init, $len
198 add $blk_init, 15, $blk_init ! round up to 16n
200 srl $blk_init, 4, $blk_init
202 .L${bits}_cbc_enc_blk_loop:
208 sllx %o0, $ileft, %o0
209 srlx %o1, $iright, %g1
210 sllx %o1, $ileft, %o1
212 srlx %o2, $iright, %o2
215 xor %g4, %o0, %o0 ! ^= rk[0]
220 fxor %f12, %f0, %f0 ! ^= ivec
222 prefetch [$inp + 16+63], 20
223 call _${alg}${bits}_encrypt_1x
227 stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
229 stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
230 brnz,pt $len, .L${bits}_cbc_enc_blk_loop
233 membar #StoreLoad|#StoreStore
234 brnz,pt $blk_init, .L${bits}_cbc_enc_loop
237 $::code.=<<___ if ($::evp);
243 $::code.=<<___ if (!$::evp);
247 std %f0, [$ivec + 0] ! write out ivec
253 .type ${alg}${bits}_t4_cbc_encrypt,#function
254 .size ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
# Generate a CBC-decrypt entry point, ${alg}${bits}_t4_cbc_decrypt,
# appended to $::code.  Structure mirrors the encrypt generator but
# decryption is parallelizable, so when the block count is even the
# routine runs .L*_cbc_dec_loop2x processing two 16-byte blocks per
# iteration via _${alg}${bits}_decrypt_2x (single-block loop otherwise).
# The running IV/previous-ciphertext lives in %f12/%f14 (`! ^= ivec');
# unaligned output goes through faligndata + partial stores (ASI 0xc0),
# unaligned IV write-back through .L*_cbc_dec_unaligned_ivec, and a
# block-initializing-store path (ASI 0xe2, T4-specific) is taken per
# the movrnz/movleu/brnz predicate (`! if ( $out&7 || $len<256 ...').
# NOTE(review): heredoc terminators are outside this excerpt; assembly
# lines below are string data, not Perl.
258 sub alg_cbc_decrypt_implement {
259 my ($alg,$bits) = @_;
262 .globl ${alg}${bits}_t4_cbc_decrypt
264 ${alg}${bits}_t4_cbc_decrypt:
265 save %sp, -$::frame, %sp
267 be,pn $::size_t_cc, .L${bits}_cbc_dec_abort
268 sub $inp, $out, $blk_init ! $inp!=$out
270 $::code.=<<___ if (!$::evp);
271 andcc $ivec, 7, $ivoff
272 alignaddr $ivec, %g0, $ivec
274 ldd [$ivec + 0], %f12 ! load ivec
276 ldd [$ivec + 8], %f14
277 ldd [$ivec + 16], %f0
278 faligndata %f12, %f14, %f12
279 faligndata %f14, %f0, %f14
282 $::code.=<<___ if ($::evp);
283 ld [$ivec + 0], %f12 ! load ivec
286 ld [$ivec + 12], %f15
290 prefetch [$inp + 63], 20
291 call _${alg}${bits}_load_deckey
294 sll $ileft, 3, $ileft
297 sub $iright, $ileft, $iright
300 movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
301 movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
302 brnz,pn $blk_init, .L${bits}cbc_dec_blk ! $inp==$out)
303 srl $omask, $ooff, $omask
305 andcc $len, 16, %g0 ! is number of blocks even?
307 alignaddrl $out, %g0, $out
308 bz %icc, .L${bits}_cbc_dec_loop2x
310 .L${bits}_cbc_dec_loop:
316 sllx %o0, $ileft, %o0
317 srlx %o1, $iright, %g1
318 sllx %o1, $ileft, %o1
320 srlx %o2, $iright, %o2
323 xor %g4, %o0, %o2 ! ^= rk[0]
328 prefetch [$out + 63], 22
329 prefetch [$inp + 16+63], 20
330 call _${alg}${bits}_decrypt_1x
333 fxor %f12, %f0, %f0 ! ^= ivec
343 brnz,pt $len, .L${bits}_cbc_dec_loop2x
346 $::code.=<<___ if ($::evp);
350 st %f15, [$ivec + 12]
352 $::code.=<<___ if (!$::evp);
353 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
356 std %f12, [$ivec + 0] ! write out ivec
357 std %f14, [$ivec + 8]
360 .L${bits}_cbc_dec_abort:
365 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
366 ! and ~3x deterioration
368 faligndata %f0, %f0, %f4 ! handle unaligned output
369 faligndata %f0, %f2, %f6
370 faligndata %f2, %f2, %f8
372 stda %f4, [$out + $omask]0xc0 ! partial store
375 orn %g0, $omask, $omask
376 stda %f8, [$out + $omask]0xc0 ! partial store
378 brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
379 orn %g0, $omask, $omask
381 $::code.=<<___ if ($::evp);
385 st %f15, [$ivec + 12]
387 $::code.=<<___ if (!$::evp);
388 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
391 std %f12, [$ivec + 0] ! write out ivec
392 std %f14, [$ivec + 8]
398 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
400 .L${bits}_cbc_dec_loop2x:
408 sllx %o0, $ileft, %o0
409 srlx %o1, $iright, %g1
411 sllx %o1, $ileft, %o1
412 srlx %o2, $iright, %g1
414 sllx %o2, $ileft, %o2
415 srlx %o3, $iright, %g1
417 sllx %o3, $ileft, %o3
418 srlx %o4, $iright, %o4
421 xor %g4, %o0, %o4 ! ^= rk[0]
430 prefetch [$out + 63], 22
431 prefetch [$inp + 32+63], 20
432 call _${alg}${bits}_decrypt_2x
437 fxor %f12, %f0, %f0 ! ^= ivec
451 brnz,pt $len, .L${bits}_cbc_dec_loop2x
454 $::code.=<<___ if ($::evp);
458 st %f15, [$ivec + 12]
460 $::code.=<<___ if (!$::evp);
461 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
464 std %f12, [$ivec + 0] ! write out ivec
465 std %f14, [$ivec + 8]
472 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
473 ! and ~3x deterioration
475 faligndata %f0, %f0, %f8 ! handle unaligned output
476 faligndata %f0, %f2, %f0
477 faligndata %f2, %f4, %f2
478 faligndata %f4, %f6, %f4
479 faligndata %f6, %f6, %f6
480 stda %f8, [$out + $omask]0xc0 ! partial store
485 orn %g0, $omask, $omask
486 stda %f6, [$out + $omask]0xc0 ! partial store
488 brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
489 orn %g0, $omask, $omask
491 $::code.=<<___ if ($::evp);
495 st %f15, [$ivec + 12]
497 $::code.=<<___ if (!$::evp);
498 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
501 std %f12, [$ivec + 0] ! write out ivec
502 std %f14, [$ivec + 8]
507 .L${bits}_cbc_dec_unaligned_ivec:
508 alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
510 srl $omask, $ivoff, $omask
511 faligndata %f12, %f12, %f0
512 faligndata %f12, %f14, %f2
513 faligndata %f14, %f14, %f4
514 stda %f0, [$ivec + $omask]0xc0
517 orn %g0, $omask, $omask
518 stda %f4, [$ivec + $omask]0xc0
524 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
526 .L${bits}cbc_dec_blk:
527 add $out, $len, $blk_init
528 and $blk_init, 63, $blk_init ! tail
529 sub $len, $blk_init, $len
530 add $blk_init, 15, $blk_init ! round up to 16n
532 srl $blk_init, 4, $blk_init
534 add $blk_init, 1, $blk_init
536 .L${bits}_cbc_dec_blk_loop2x:
544 sllx %o0, $ileft, %o0
545 srlx %o1, $iright, %g1
547 sllx %o1, $ileft, %o1
548 srlx %o2, $iright, %g1
550 sllx %o2, $ileft, %o2
551 srlx %o3, $iright, %g1
553 sllx %o3, $ileft, %o3
554 srlx %o4, $iright, %o4
557 xor %g4, %o0, %o4 ! ^= rk[0]
566 prefetch [$inp + 32+63], 20
567 call _${alg}${bits}_decrypt_2x
573 fxor %f12, %f0, %f0 ! ^= ivec
580 stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
582 stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
584 stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
586 stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
587 bgu,pt $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
590 add $blk_init, $len, $len
591 andcc $len, 1, %g0 ! is number of blocks even?
592 membar #StoreLoad|#StoreStore
593 bnz,pt %icc, .L${bits}_cbc_dec_loop
595 brnz,pn $len, .L${bits}_cbc_dec_loop2x
598 $::code.=<<___ if ($::evp);
599 st %f12, [$ivec + 0] ! write out ivec
602 st %f15, [$ivec + 12]
604 $::code.=<<___ if (!$::evp);
608 std %f12, [$ivec + 0] ! write out ivec
609 std %f14, [$ivec + 8]
614 .type ${alg}${bits}_t4_cbc_decrypt,#function
615 .size ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
# Generate a 32-bit-counter CTR-mode entry point,
# ${alg}${bits}_t4_ctr32_encrypt, appended to $::code.  Key points
# visible in the emitted code:
#  - the counter word is loaded from the IV into %l4 (`! counter') and
#    kept with its upper bits cleared via `srl %l7, 0, %l7 ! clruw';
#  - the first cipher round is inlined per algorithm (aes_eround01/23
#    for "aes", camellia_f for "cmll") and the remainder entered at
#    _${alg}${bits}_encrypt_1x+8 / _encrypt_2x+16, skipping the round
#    already done;
#  - keystream is XORed with input (`! ^= inp'); two-block loop when
#    the block count is even, plus unaligned-output partial stores
#    (ASI 0xc0) and a block-initializing-store path (ASI 0xe2,
#    T4-specific) guarded by the movrnz/movleu/brnz predicate.
# NOTE(review): heredoc terminators are outside this excerpt; assembly
# lines below are string data, not Perl.
619 sub alg_ctr32_implement {
620 my ($alg,$bits) = @_;
623 .globl ${alg}${bits}_t4_ctr32_encrypt
625 ${alg}${bits}_t4_ctr32_encrypt:
626 save %sp, -$::frame, %sp
629 prefetch [$inp + 63], 20
630 call _${alg}${bits}_load_enckey
633 ld [$ivec + 0], %l4 ! counter
641 xor %o5, %g4, %g4 ! ^= rk[0]
643 movxtod %g4, %f14 ! most significant 64 bits
645 sub $inp, $out, $blk_init ! $inp!=$out
648 sll $ileft, 3, $ileft
651 sub $iright, $ileft, $iright
654 movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
655 movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
656 brnz,pn $blk_init, .L${bits}_ctr32_blk ! $inp==$out)
657 srl $omask, $ooff, $omask
659 andcc $len, 16, %g0 ! is number of blocks even?
660 alignaddrl $out, %g0, $out
661 bz %icc, .L${bits}_ctr32_loop2x
663 .L${bits}_ctr32_loop:
669 sllx %o0, $ileft, %o0
670 srlx %o1, $iright, %g1
671 sllx %o1, $ileft, %o1
673 srlx %o2, $iright, %o2
676 xor %g5, %l7, %g1 ! ^= rk[0]
679 srl %l7, 0, %l7 ! clruw
680 prefetch [$out + 63], 22
681 prefetch [$inp + 16+63], 20
683 $::code.=<<___ if ($alg eq "aes");
684 aes_eround01 %f16, %f14, %f2, %f4
685 aes_eround23 %f18, %f14, %f2, %f2
687 $::code.=<<___ if ($alg eq "cmll");
688 camellia_f %f16, %f2, %f14, %f2
689 camellia_f %f18, %f14, %f2, %f0
692 call _${alg}${bits}_encrypt_1x+8
697 fxor %f10, %f0, %f0 ! ^= inp
705 brnz,pt $len, .L${bits}_ctr32_loop2x
712 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
713 ! and ~3x deterioration
715 faligndata %f0, %f0, %f4 ! handle unaligned output
716 faligndata %f0, %f2, %f6
717 faligndata %f2, %f2, %f8
718 stda %f4, [$out + $omask]0xc0 ! partial store
721 orn %g0, $omask, $omask
722 stda %f8, [$out + $omask]0xc0 ! partial store
724 brnz,pt $len, .L${bits}_ctr32_loop2x+4
725 orn %g0, $omask, $omask
730 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
732 .L${bits}_ctr32_loop2x:
740 sllx %o0, $ileft, %o0
741 srlx %o1, $iright, %g1
743 sllx %o1, $ileft, %o1
744 srlx %o2, $iright, %g1
746 sllx %o2, $ileft, %o2
747 srlx %o3, $iright, %g1
749 sllx %o3, $ileft, %o3
750 srlx %o4, $iright, %o4
753 xor %g5, %l7, %g1 ! ^= rk[0]
756 srl %l7, 0, %l7 ! clruw
760 srl %l7, 0, %l7 ! clruw
761 prefetch [$out + 63], 22
762 prefetch [$inp + 32+63], 20
764 $::code.=<<___ if ($alg eq "aes");
765 aes_eround01 %f16, %f14, %f2, %f8
766 aes_eround23 %f18, %f14, %f2, %f2
767 aes_eround01 %f16, %f14, %f6, %f10
768 aes_eround23 %f18, %f14, %f6, %f6
770 $::code.=<<___ if ($alg eq "cmll");
771 camellia_f %f16, %f2, %f14, %f2
772 camellia_f %f16, %f6, %f14, %f6
773 camellia_f %f18, %f14, %f2, %f0
774 camellia_f %f18, %f14, %f6, %f4
777 call _${alg}${bits}_encrypt_2x+16
783 fxor %f8, %f0, %f0 ! ^= inp
796 brnz,pt $len, .L${bits}_ctr32_loop2x
803 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
804 ! and ~3x deterioration
806 faligndata %f0, %f0, %f8 ! handle unaligned output
807 faligndata %f0, %f2, %f0
808 faligndata %f2, %f4, %f2
809 faligndata %f4, %f6, %f4
810 faligndata %f6, %f6, %f6
812 stda %f8, [$out + $omask]0xc0 ! partial store
817 orn %g0, $omask, $omask
818 stda %f6, [$out + $omask]0xc0 ! partial store
820 brnz,pt $len, .L${bits}_ctr32_loop2x+4
821 orn %g0, $omask, $omask
826 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
829 add $out, $len, $blk_init
830 and $blk_init, 63, $blk_init ! tail
831 sub $len, $blk_init, $len
832 add $blk_init, 15, $blk_init ! round up to 16n
834 srl $blk_init, 4, $blk_init
836 add $blk_init, 1, $blk_init
838 .L${bits}_ctr32_blk_loop2x:
846 sllx %o0, $ileft, %o0
847 srlx %o1, $iright, %g1
849 sllx %o1, $ileft, %o1
850 srlx %o2, $iright, %g1
852 sllx %o2, $ileft, %o2
853 srlx %o3, $iright, %g1
855 sllx %o3, $ileft, %o3
856 srlx %o4, $iright, %o4
859 xor %g5, %l7, %g1 ! ^= rk[0]
862 srl %l7, 0, %l7 ! clruw
866 srl %l7, 0, %l7 ! clruw
867 prefetch [$inp + 32+63], 20
869 $::code.=<<___ if ($alg eq "aes");
870 aes_eround01 %f16, %f14, %f2, %f8
871 aes_eround23 %f18, %f14, %f2, %f2
872 aes_eround01 %f16, %f14, %f6, %f10
873 aes_eround23 %f18, %f14, %f6, %f6
875 $::code.=<<___ if ($alg eq "cmll");
876 camellia_f %f16, %f2, %f14, %f2
877 camellia_f %f16, %f6, %f14, %f6
878 camellia_f %f18, %f14, %f2, %f0
879 camellia_f %f18, %f14, %f6, %f4
882 call _${alg}${bits}_encrypt_2x+16
889 fxor %f8, %f0, %f0 ! ^= inp
895 stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
897 stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
899 stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
901 stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
902 bgu,pt $::size_t_cc, .L${bits}_ctr32_blk_loop2x
905 add $blk_init, $len, $len
906 andcc $len, 1, %g0 ! is number of blocks even?
907 membar #StoreLoad|#StoreStore
908 bnz,pt %icc, .L${bits}_ctr32_loop
910 brnz,pn $len, .L${bits}_ctr32_loop2x
915 .type ${alg}${bits}_t4_ctr32_encrypt,#function
916 .size ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
# Generate an XTS entry point, ${alg}${bits}_t4_xts_${dir}crypt, for
# $dir in ("en","de"), appended to the code buffer.  Visible structure:
#  - arguments are remapped: this mode takes two keys ($key1,$key2)
#    plus the tweak IV in %i0..%i5;
#  - the tweak is produced by encrypting the IV (call ${alg}_t4_encrypt
#    through a stack temporary at %fp+$::bias-16) and held in %g3:%g2
#    (`! %g3:%g2 is tweak'), byte-order fixed via a bmask/bshuffle
#    0x76543210 byte-swap mask; it is advanced per block with the
#    srax-based multiply-by-x sequence (`! next tweak value');
#  - one- and two-block loops call _${alg}${bits}_${dir}crypt_1x/_2x,
#    XORing the tweak before and after (`! ^= tweak[0]');
#  - .L*_xts_${dir}steal/.L*_xts_${dir}stealing implement ciphertext
#    stealing for a partial final block ($rem bytes; $rem is declared
#    outside this excerpt);
#  - same unaligned-output (ASI 0xc0) and block-initializing-store
#    (ASI 0xe2, T4-specific) machinery as the other modes.
# NOTE(review): this sub appends via `$code' while earlier generators
# use `$::code' -- verify both refer to the same buffer in the full
# file.  Heredoc terminators are outside this excerpt; the assembly
# lines below are string data, not Perl.
920 sub alg_xts_implement {
921 my ($alg,$bits,$dir) = @_;
922 my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
926 .globl ${alg}${bits}_t4_xts_${dir}crypt
928 ${alg}${bits}_t4_xts_${dir}crypt:
929 save %sp, -$::frame-16, %sp
932 add %fp, $::bias-16, %o1
933 call ${alg}_t4_encrypt
936 add %fp, $::bias-16, %l7
938 add %fp, $::bias-8, %l7
939 ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak
941 sethi %hi(0x76543210), %l7
942 or %l7, %lo(0x76543210), %l7
943 bmask %l7, %g0, %g0 ! byte swap mask
946 prefetch [$inp + 63], 20
947 call _${alg}${bits}_load_${dir}ckey
951 $code.=<<___ if ($dir eq "de");
958 sub $inp, $out, $blk_init ! $inp!=$out
961 sll $ileft, 3, $ileft
964 sub $iright, $ileft, $iright
967 movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
968 movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
969 brnz,pn $blk_init, .L${bits}_xts_${dir}blk ! $inp==$out)
970 srl $omask, $ooff, $omask
972 andcc $len, 16, %g0 ! is number of blocks even?
974 $code.=<<___ if ($dir eq "de");
975 brz,pn $len, .L${bits}_xts_${dir}steal
978 alignaddrl $out, %g0, $out
979 bz %icc, .L${bits}_xts_${dir}loop2x
981 .L${bits}_xts_${dir}loop:
987 sllx %o0, $ileft, %o0
988 srlx %o1, $iright, %g1
989 sllx %o1, $ileft, %o1
991 srlx %o2, $iright, %o2
996 bshuffle %f12, %f12, %f12
997 bshuffle %f14, %f14, %f14
999 xor %g4, %o0, %o0 ! ^= rk[0]
1004 fxor %f12, %f0, %f0 ! ^= tweak[0]
1007 prefetch [$out + 63], 22
1008 prefetch [$inp + 16+63], 20
1009 call _${alg}${bits}_${dir}crypt_1x
1012 fxor %f12, %f0, %f0 ! ^= tweak[0]
1015 srax %g3, 63, %l7 ! next tweak value
1026 brnz,pt $len, .L${bits}_xts_${dir}loop2x
1029 brnz,pn $rem, .L${bits}_xts_${dir}steal
1036 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
1037 ! and ~3x deterioration
1039 faligndata %f0, %f0, %f4 ! handle unaligned output
1040 faligndata %f0, %f2, %f6
1041 faligndata %f2, %f2, %f8
1042 stda %f4, [$out + $omask]0xc0 ! partial store
1045 orn %g0, $omask, $omask
1046 stda %f8, [$out + $omask]0xc0 ! partial store
1048 brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
1049 orn %g0, $omask, $omask
1051 brnz,pn $rem, .L${bits}_xts_${dir}steal
1057 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1059 .L${bits}_xts_${dir}loop2x:
1062 ldx [$inp + 16], %o2
1064 ldx [$inp + 24], %o3
1066 ldx [$inp + 32], %o4
1067 sllx %o0, $ileft, %o0
1068 srlx %o1, $iright, %g1
1070 sllx %o1, $ileft, %o1
1071 srlx %o2, $iright, %g1
1073 sllx %o2, $ileft, %o2
1074 srlx %o3, $iright, %g1
1076 sllx %o3, $ileft, %o3
1077 srlx %o4, $iright, %o4
1082 bshuffle %f12, %f12, %f12
1083 bshuffle %f14, %f14, %f14
1085 srax %g3, 63, %l7 ! next tweak value
1093 bshuffle %f8, %f8, %f8
1094 bshuffle %f10, %f10, %f10
1096 xor %g4, %o0, %o0 ! ^= rk[0]
1098 xor %g4, %o2, %o2 ! ^= rk[0]
1105 fxor %f12, %f0, %f0 ! ^= tweak[0]
1107 fxor %f8, %f4, %f4 ! ^= tweak[0]
1110 prefetch [$out + 63], 22
1111 prefetch [$inp + 32+63], 20
1112 call _${alg}${bits}_${dir}crypt_2x
1118 srax %g3, 63, %l7 ! next tweak value
1124 bshuffle %f8, %f8, %f8
1125 bshuffle %f10, %f10, %f10
1127 fxor %f12, %f0, %f0 ! ^= tweak[0]
1137 std %f4, [$out + 16]
1138 std %f6, [$out + 24]
1139 brnz,pt $len, .L${bits}_xts_${dir}loop2x
1144 brnz,pn $rem, .L${bits}_xts_${dir}steal
1151 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
1152 ! and ~3x deterioration
1154 faligndata %f0, %f0, %f8 ! handle unaligned output
1155 faligndata %f0, %f2, %f10
1156 faligndata %f2, %f4, %f12
1157 faligndata %f4, %f6, %f14
1158 faligndata %f6, %f6, %f0
1160 stda %f8, [$out + $omask]0xc0 ! partial store
1161 std %f10, [$out + 8]
1162 std %f12, [$out + 16]
1163 std %f14, [$out + 24]
1165 orn %g0, $omask, $omask
1166 stda %f0, [$out + $omask]0xc0 ! partial store
1168 brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
1169 orn %g0, $omask, $omask
1173 brnz,pn $rem, .L${bits}_xts_${dir}steal
1179 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1181 .L${bits}_xts_${dir}blk:
1182 add $out, $len, $blk_init
1183 and $blk_init, 63, $blk_init ! tail
1184 sub $len, $blk_init, $len
1185 add $blk_init, 15, $blk_init ! round up to 16n
1187 srl $blk_init, 4, $blk_init
1189 add $blk_init, 1, $blk_init
1191 .L${bits}_xts_${dir}blk2x:
1194 ldx [$inp + 16], %o2
1196 ldx [$inp + 24], %o3
1198 ldx [$inp + 32], %o4
1199 sllx %o0, $ileft, %o0
1200 srlx %o1, $iright, %g1
1202 sllx %o1, $ileft, %o1
1203 srlx %o2, $iright, %g1
1205 sllx %o2, $ileft, %o2
1206 srlx %o3, $iright, %g1
1208 sllx %o3, $ileft, %o3
1209 srlx %o4, $iright, %o4
1214 bshuffle %f12, %f12, %f12
1215 bshuffle %f14, %f14, %f14
1217 srax %g3, 63, %l7 ! next tweak value
1225 bshuffle %f8, %f8, %f8
1226 bshuffle %f10, %f10, %f10
1228 xor %g4, %o0, %o0 ! ^= rk[0]
1230 xor %g4, %o2, %o2 ! ^= rk[0]
1237 fxor %f12, %f0, %f0 ! ^= tweak[0]
1239 fxor %f8, %f4, %f4 ! ^= tweak[0]
1242 prefetch [$inp + 32+63], 20
1243 call _${alg}${bits}_${dir}crypt_2x
1249 srax %g3, 63, %l7 ! next tweak value
1255 bshuffle %f8, %f8, %f8
1256 bshuffle %f10, %f10, %f10
1258 fxor %f12, %f0, %f0 ! ^= tweak[0]
1264 stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
1266 stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
1268 stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
1270 stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
1271 bgu,pt $::size_t_cc, .L${bits}_xts_${dir}blk2x
1274 add $blk_init, $len, $len
1275 andcc $len, 1, %g0 ! is number of blocks even?
1276 membar #StoreLoad|#StoreStore
1277 bnz,pt %icc, .L${bits}_xts_${dir}loop
1279 brnz,pn $len, .L${bits}_xts_${dir}loop2x
1284 brnz,pn $rem, .L${bits}_xts_${dir}steal
1289 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1291 $code.=<<___ if ($dir eq "en");
1293 .L${bits}_xts_${dir}steal:
1294 std %f0, [%fp + $::bias-16] ! copy of output
1295 std %f2, [%fp + $::bias-8]
1297 srl $ileft, 3, $ileft
1298 add %fp, $::bias-16, %l7
1299 add $inp, $ileft, $inp ! original $inp+$len&-15
1300 add $out, $ooff, $out ! original $out+$len&-15
1304 .L${bits}_xts_${dir}stealing:
1305 ldub [$inp + $ileft], %o0
1306 ldub [%l7 + $ileft], %o1
1308 stb %o0, [%l7 + $ileft]
1309 stb %o1, [$out + $ileft]
1310 brnz $rem, .L${bits}_xts_${dir}stealing
1316 sub $out, $ooff, $out
1317 ba .L${bits}_xts_${dir}loop ! one more time
1318 mov 1, $len ! $rem is 0
1320 $code.=<<___ if ($dir eq "de");
1322 .L${bits}_xts_${dir}steal:
1327 ldx [$inp + 16], %o2
1328 sllx %o0, $ileft, %o0
1329 srlx %o1, $iright, %g1
1330 sllx %o1, $ileft, %o1
1332 srlx %o2, $iright, %o2
1335 srax %g3, 63, %l7 ! next tweak value
1343 bshuffle %f12, %f12, %f12
1344 bshuffle %f14, %f14, %f14
1346 xor %g4, %o0, %o0 ! ^= rk[0]
1351 fxor %f12, %f0, %f0 ! ^= tweak[0]
1354 call _${alg}${bits}_${dir}crypt_1x
1357 fxor %f12, %f0, %f0 ! ^= tweak[0]
1360 std %f0, [%fp + $::bias-16]
1361 std %f2, [%fp + $::bias-8]
1363 srl $ileft, 3, $ileft
1364 add %fp, $::bias-16, %l7
1365 add $inp, $ileft, $inp ! original $inp+$len&-15
1366 add $out, $ooff, $out ! original $out+$len&-15
1371 .L${bits}_xts_${dir}stealing:
1372 ldub [$inp + $ileft], %o0
1373 ldub [%l7 + $ileft], %o1
1375 stb %o0, [%l7 + $ileft]
1376 stb %o1, [$out + $ileft]
1377 brnz $rem, .L${bits}_xts_${dir}stealing
1383 sub $out, $ooff, $out
1384 ba .L${bits}_xts_${dir}loop ! one more time
1385 mov 1, $len ! $rem is 0
1390 .type ${alg}${bits}_t4_xts_${dir}crypt,#function
1391 .size ${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
1395 # Purpose of these subroutines is to explicitly encode VIS instructions,
1396 # so that one can compile the module without having to specify VIS
1397 # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
1398 # Idea is to reserve for option to produce "universal" binary and let
1399 # programmer detect if current CPU is VIS capable at run-time.
# Hand-encoder for 3-operand VIS1 instructions (faligndata, bshuffle):
# emits the instruction as a raw `.word' so the module assembles even
# when the assembler lacks VIS support; returns the plain textual form
# unchanged when it cannot encode.
# NOTE(review): the opening `sub' line (presumably `sub unvis') is
# outside this excerpt -- verify against the full file.
1401 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1403 my %visopf = ( "faligndata" => 0x048,
1404 "bshuffle" => 0x04c,
# $ref is the fallback: the instruction as plain assembler text.
1409 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1411 if ($opf=$visopf{$mnemonic}) {
# Every operand must be an %f register or we give up and emit $ref.
1412 foreach ($rs1,$rs2,$rd) {
1413 return $ref if (!/%f([0-9]{1,2})/);
# Bail out on odd register numbers (cannot be re-encoded) -- the
# guarding range check sits on lines outside this excerpt.
1416 return $ref if ($1&1);
1417 # re-encode for upper double register addressing
# 0x81b00000 is the VIS opcode template; opf selects the operation.
1422 return sprintf ".word\t0x%08x !%s",
1423 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
# Hand-encoder for 3-operand VIS3 integer instructions (addxc,
# alignaddr, alignaddrl, ...) operating on %g/%o/%l/%i registers:
# emits a raw `.word' or returns the textual form if it cannot encode.
# %bias maps the register-group letter to its base encoding number.
# NOTE(review): the opening `sub' line (presumably `sub unvis3') is
# outside this excerpt -- verify against the full file.
1431 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1432 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
1434 my %visopf = ( "addxc" => 0x011,
1437 "alignaddr" => 0x018,
1439 "alignaddrl" => 0x01a );
# Fallback: the instruction as plain assembler text.
1441 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1443 if ($opf=$visopf{$mnemonic}) {
# Each operand must be an integer register %g0-%i7.
1444 foreach ($rs1,$rs2,$rd) {
1445 return $ref if (!/%([goli])([0-9])/);
1449 return sprintf ".word\t0x%08x !%s",
1450 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
# Hand-encoder for 4-operand T4 AES round instructions
# (aes_eround*/aes_dround*/aes_kexpand1): emits a raw `.word' so the
# module assembles without crypto support in the assembler; returns the
# plain textual form when it cannot encode.
1457 sub unaes_round { # 4-argument instructions
1458 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
1460 my %aesopf = ( "aes_eround01" => 0,
1461 "aes_eround23" => 1,
1462 "aes_dround01" => 2,
1463 "aes_dround23" => 3,
1464 "aes_eround01_l"=> 4,
1465 "aes_eround23_l"=> 5,
1466 "aes_dround01_l"=> 6,
1467 "aes_dround23_l"=> 7,
1468 "aes_kexpand1" => 8 );
# Fallback: the instruction as plain assembler text.
1470 $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
1472 if (defined($opf=$aesopf{$mnemonic})) {
# rs3: even %f double register folded into its 5-bit field encoding
# (($1|$1>>5)&31); left untouched when it is not an %f operand.
1473 $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
1474 foreach ($rs1,$rs2,$rd) {
1475 return $ref if (!/%f([0-9]{1,2})/);
# Bail out on odd register numbers (cannot be re-encoded).
1478 return $ref if ($1&1);
1479 # re-encode for upper double register addressing
1484 return sprintf ".word\t0x%08x !%s",
1485 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
# Hand-encoder for 3-operand T4 AES key-expansion instructions
# (aes_kexpand0/aes_kexpand2): emits a raw `.word' or returns the plain
# textual form when it cannot encode.
1492 sub unaes_kexpand { # 3-argument instructions
1493 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1495 my %aesopf = ( "aes_kexpand0" => 0x130,
1496 "aes_kexpand2" => 0x131 );
# Fallback: the instruction as plain assembler text.
1498 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1500 if (defined($opf=$aesopf{$mnemonic})) {
# Every operand must be an %f register.
1501 foreach ($rs1,$rs2,$rd) {
1502 return $ref if (!/%f([0-9]{1,2})/);
# Bail out on odd register numbers (cannot be re-encoded).
1505 return $ref if ($1&1);
1506 # re-encode for upper double register addressing
1511 return sprintf ".word\t0x%08x !%s",
1512 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
# Hand-encoder for the 4-operand T4 camellia_f instruction: emits a raw
# `.word' (fixed sub-opcode 0xc) or returns the plain textual form when
# it cannot encode.
1519 sub uncamellia_f { # 4-argument instructions
1520 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
# Fallback: the instruction as plain assembler text.
1523 $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
# rs3: even %f double register folded into its 5-bit field encoding.
1526 $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
1527 foreach ($rs1,$rs2,$rd) {
1528 return $ref if (!/%f([0-9]{1,2})/);
# Bail out on odd register numbers (cannot be re-encoded).
1531 return $ref if ($1&1);
1532 # re-encode for upper double register addressing
1537 return sprintf ".word\t0x%08x !%s",
1538 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
# Hand-encoder for 3-operand T4 Camellia instructions
# (camellia_fl/camellia_fli): emits a raw `.word' or returns the plain
# textual form when it cannot encode.
1545 sub uncamellia3 { # 3-argument instructions
1546 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1548 my %cmllopf = ( "camellia_fl" => 0x13c,
1549 "camellia_fli" => 0x13d );
# Fallback: the instruction as plain assembler text.
1551 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1553 if (defined($opf=$cmllopf{$mnemonic})) {
# Every operand must be an %f register.
1554 foreach ($rs1,$rs2,$rd) {
1555 return $ref if (!/%f([0-9]{1,2})/);
# Bail out on odd register numbers (cannot be re-encoded).
1558 return $ref if ($1&1);
1559 # re-encode for upper double register addressing
1564 return sprintf ".word\t0x%08x !%s",
1565 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
# Hand-encoder for 2-operand VIS3 move instructions between integer and
# floating-point registers (movdtox, movstouw, movstosw, movwtos, ...):
# emits a raw `.word' or returns the plain textual form when it cannot
# encode.  %bias maps the register-group letter to its base encoding
# number; "f" registers need no bias.
1572 sub unmovxtox { # 2-argument instructions
1573 my ($mnemonic,$rs,$rd)=@_;
1574 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
1576 my %movxopf = ( "movdtox" => 0x110,
1577 "movstouw" => 0x111,
1578 "movstosw" => 0x113,
1580 "movwtos" => 0x119 );
# Fallback: the instruction as plain assembler text.
1582 $ref = "$mnemonic\t$rs,$rd";
1584 if (defined($opf=$movxopf{$mnemonic})) {
# Operand must be an %f or integer register; $2 is its number.
1586 return $ref if (!/%([fgoli])([0-9]{1,2})/);
# Bail out on odd register numbers (cannot be re-encoded).
1589 return $ref if ($2&1);
1590 # re-encode for upper double register addressing
1595 return sprintf ".word\t0x%08x !%s",
1596 2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
# Hand-encoder for T4 DES instructions: des_round takes four operands,
# des_kexpand three, des_ip/des_iip two; each branch emits the
# corresponding raw `.word' encoding or returns the plain textual form
# when it cannot encode.
# NOTE(review): the opening `sub' line (presumably `sub undes') and the
# line populating @args are outside this excerpt -- verify against the
# full file.
1604 my ($mnemonic)=shift;
1607 my %desopf = ( "des_round" => 0b1001,
1608 "des_ip" => 0b100110100,
1609 "des_iip" => 0b100110101,
1610 "des_kexpand" => 0b100110110 );
# Fallback: the instruction as plain assembler text.
1612 $ref = "$mnemonic\t".join(",",@_);
1614 if (defined($opf=$desopf{$mnemonic})) { # 4-arg
1615 if ($mnemonic eq "des_round") {
1616 foreach (@args[0..3]) {
1617 return $ref if (!/%f([0-9]{1,2})/);
# Bail out on odd register numbers (cannot be re-encoded).
1620 return $ref if ($1&1);
1621 # re-encode for upper double register addressing
1625 return sprintf ".word\t0x%08x !%s",
1626 2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
1628 } elsif ($mnemonic eq "des_kexpand") { # 3-arg
# Here the last operand may be a bare number, hence the optional (%f)?
# -- the register number lands in $2.
1629 foreach (@args[0..2]) {
1630 return $ref if (!/(%f)?([0-9]{1,2})/);
1633 return $ref if ($2&1);
1634 # re-encode for upper double register addressing
1638 return sprintf ".word\t0x%08x !%s",
1639 2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
# 2-arg branch (des_ip/des_iip).
1642 foreach (@args[0..1]) {
1643 return $ref if (!/%f([0-9]{1,2})/);
# NOTE(review): $2 is tested here but the regex visible above captures
# only $1 -- the intervening lines are outside this excerpt; verify the
# capture group against the full file.
1646 return $ref if ($2&1);
1647 # re-encode for upper double register addressing
1651 return sprintf ".word\t0x%08x !%s",
1652 2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
# Post-process the accumulated code buffer line by line: evaluate
# `...` interpolations, then rewrite VIS/AES/Camellia/DES mnemonics
# into `.word' encodings via the un* helpers above, so the output
# assembles with an assembler lacking those extensions.
# (Definition continues beyond this excerpt.)
1660 sub emit_assembler {
1661 foreach (split("\n",$::code)) {
# Evaluate backtick-delimited Perl expressions embedded in the text.
1662 s/\`([^\`]*)\`/eval $1/ge;
# Two-operand FP conversions get an explicit %f0 first source operand.
1664 s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;
1666 s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
1667 &unaes_round($1,$2,$3,$4,$5)
1669 s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1670 &unaes_kexpand($1,$2,$3,$4)
1672 s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
1673 &uncamellia_f($1,$2,$3,$4,$5)
1675 s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1676 &uncamellia3($1,$2,$3,$4)
1678 s/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
1679 &undes($1,$2,$3,$4,$5)
1681 s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
1682 &unmovxtox($1,$2,$3)
1684 s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
1685 &unmovxtox($1,$2,$3)
1687 s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1690 s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
1691 &unvis3($1,$2,$3,$4)