2 # Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # Specific modes implementations for SPARC Architecture 2011. There
11 # is T4 dependency though, an ASI value that is not specified in the
12 # Architecture Manual. But as SPARC universe is rather monocultural,
13 # we imply that processor capable of executing crypto instructions
14 # can handle the ASI in question as well. This means that we ought to
15 # keep eyes open when new processors emerge...
17 # As for above mentioned ASI. It's so called "block initializing
18 # store" which cancels "read" in "read-update-write" on cache lines.
19 # This is "cooperative" optimization, as it reduces overall pressure
20 # on memory interface. Benefits can't be observed/quantified with
21 # usual benchmarks, on the contrary you can notice that single-thread
22 # performance for parallelizable modes is ~1.5% worse for largest
23 # block sizes [though few percent better for not so long ones]. All
24 # this based on suggestions from David Miller.
# Placeholder values for the stack-frame size and the size_t condition-code
# register; asm_init() below overwrites both with concrete ABI-specific
# values once the target word size is known.
27 $::frame="STACK_FRAME";
28 $::size_t_cc="SIZE_T_CC";
30 sub asm_init { # to be called with @ARGV as argument
# Detect a 64-bit target from the compiler flags: -m64 (gcc-style) or
# -xarch=v9 (Solaris cc-style) both select the 64-bit SPARC ABI.
31 for (@_) { $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
# 64-bit ABI: 2047-byte stack bias, 192-byte minimum frame, and size_t
# comparisons branch on the 64-bit %xcc condition codes.
32 if ($::abibits==64) { $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
# 32-bit ABI: no stack bias, 112-byte minimum frame, 32-bit %icc codes.
33 else { $::bias=0; $::frame=112; $::size_t_cc="%icc"; }
# Symbolic names for the SPARC registers used by the generated code:
# function arguments live in input registers %i0..%i4, scratch state in
# local registers %l0..%l5.  Extra values produced by the 0..N map ranges
# beyond the declared names are simply discarded by the list assignment.
37 my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
39 my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
41 sub alg_cbc_encrypt_implement {
45 .globl ${alg}${bits}_t4_cbc_encrypt
47 ${alg}${bits}_t4_cbc_encrypt:
48 save %sp, -$::frame, %sp
50 be,pn $::size_t_cc, .L${bits}_cbc_enc_abort
51 srln $len, 0, $len ! needed on v8+, "nop" on v9
52 sub $inp, $out, $blk_init ! $inp!=$out
54 $::code.=<<___ if (!$::evp);
55 andcc $ivec, 7, $ivoff
56 alignaddr $ivec, %g0, $ivec
58 ldd [$ivec + 0], %f0 ! load ivec
62 faligndata %f0, %f2, %f0
63 faligndata %f2, %f4, %f2
66 $::code.=<<___ if ($::evp);
74 prefetch [$inp + 63], 20
75 call _${alg}${bits}_load_enckey
81 sub $iright, $ileft, $iright
84 movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
85 movleu $::size_t_cc, 0, $blk_init ! $len<128 ||
86 brnz,pn $blk_init, .L${bits}cbc_enc_blk ! $inp==$out)
87 srl $omask, $ooff, $omask
89 alignaddrl $out, %g0, $out
93 .L${bits}_cbc_enc_loop:
100 srlx %o1, $iright, %g1
101 sllx %o1, $ileft, %o1
103 srlx %o2, $iright, %o2
106 xor %g4, %o0, %o0 ! ^= rk[0]
111 fxor %f12, %f0, %f0 ! ^= ivec
113 prefetch [$out + 63], 22
114 prefetch [$inp + 16+63], 20
115 call _${alg}${bits}_encrypt_1x
123 brnz,pt $len, .L${bits}_cbc_enc_loop
126 $::code.=<<___ if ($::evp);
132 $::code.=<<___ if (!$::evp);
136 std %f0, [$ivec + 0] ! write out ivec
140 .L${bits}_cbc_enc_abort:
145 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
146 ! and ~3x deterioration
148 faligndata %f0, %f0, %f4 ! handle unaligned output
149 faligndata %f0, %f2, %f6
150 faligndata %f2, %f2, %f8
152 stda %f4, [$out + $omask]0xc0 ! partial store
155 orn %g0, $omask, $omask
156 stda %f8, [$out + $omask]0xc0 ! partial store
158 brnz,pt $len, .L${bits}_cbc_enc_loop+4
159 orn %g0, $omask, $omask
161 $::code.=<<___ if ($::evp);
167 $::code.=<<___ if (!$::evp);
171 std %f0, [$ivec + 0] ! write out ivec
177 3: alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
179 srl $omask, $ivoff, $omask
180 faligndata %f0, %f0, %f4
181 faligndata %f0, %f2, %f6
182 faligndata %f2, %f2, %f8
183 stda %f4, [$ivec + $omask]0xc0
186 orn %g0, $omask, $omask
187 stda %f8, [$ivec + $omask]0xc0
193 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
195 .L${bits}cbc_enc_blk:
196 add $out, $len, $blk_init
197 and $blk_init, 63, $blk_init ! tail
198 sub $len, $blk_init, $len
199 add $blk_init, 15, $blk_init ! round up to 16n
201 srl $blk_init, 4, $blk_init
203 .L${bits}_cbc_enc_blk_loop:
209 sllx %o0, $ileft, %o0
210 srlx %o1, $iright, %g1
211 sllx %o1, $ileft, %o1
213 srlx %o2, $iright, %o2
216 xor %g4, %o0, %o0 ! ^= rk[0]
221 fxor %f12, %f0, %f0 ! ^= ivec
223 prefetch [$inp + 16+63], 20
224 call _${alg}${bits}_encrypt_1x
228 stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
230 stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
231 brnz,pt $len, .L${bits}_cbc_enc_blk_loop
234 membar #StoreLoad|#StoreStore
235 brnz,pt $blk_init, .L${bits}_cbc_enc_loop
238 $::code.=<<___ if ($::evp);
244 $::code.=<<___ if (!$::evp);
248 std %f0, [$ivec + 0] ! write out ivec
254 .type ${alg}${bits}_t4_cbc_encrypt,#function
255 .size ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
259 sub alg_cbc_decrypt_implement {
260 my ($alg,$bits) = @_;
263 .globl ${alg}${bits}_t4_cbc_decrypt
265 ${alg}${bits}_t4_cbc_decrypt:
266 save %sp, -$::frame, %sp
268 be,pn $::size_t_cc, .L${bits}_cbc_dec_abort
269 srln $len, 0, $len ! needed on v8+, "nop" on v9
270 sub $inp, $out, $blk_init ! $inp!=$out
272 $::code.=<<___ if (!$::evp);
273 andcc $ivec, 7, $ivoff
274 alignaddr $ivec, %g0, $ivec
276 ldd [$ivec + 0], %f12 ! load ivec
278 ldd [$ivec + 8], %f14
279 ldd [$ivec + 16], %f0
280 faligndata %f12, %f14, %f12
281 faligndata %f14, %f0, %f14
284 $::code.=<<___ if ($::evp);
285 ld [$ivec + 0], %f12 ! load ivec
288 ld [$ivec + 12], %f15
292 prefetch [$inp + 63], 20
293 call _${alg}${bits}_load_deckey
296 sll $ileft, 3, $ileft
299 sub $iright, $ileft, $iright
302 movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
303 movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
304 brnz,pn $blk_init, .L${bits}cbc_dec_blk ! $inp==$out)
305 srl $omask, $ooff, $omask
307 andcc $len, 16, %g0 ! is number of blocks even?
309 alignaddrl $out, %g0, $out
310 bz %icc, .L${bits}_cbc_dec_loop2x
312 .L${bits}_cbc_dec_loop:
318 sllx %o0, $ileft, %o0
319 srlx %o1, $iright, %g1
320 sllx %o1, $ileft, %o1
322 srlx %o2, $iright, %o2
325 xor %g4, %o0, %o2 ! ^= rk[0]
330 prefetch [$out + 63], 22
331 prefetch [$inp + 16+63], 20
332 call _${alg}${bits}_decrypt_1x
335 fxor %f12, %f0, %f0 ! ^= ivec
345 brnz,pt $len, .L${bits}_cbc_dec_loop2x
348 $::code.=<<___ if ($::evp);
352 st %f15, [$ivec + 12]
354 $::code.=<<___ if (!$::evp);
355 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
358 std %f12, [$ivec + 0] ! write out ivec
359 std %f14, [$ivec + 8]
362 .L${bits}_cbc_dec_abort:
367 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
368 ! and ~3x deterioration
370 faligndata %f0, %f0, %f4 ! handle unaligned output
371 faligndata %f0, %f2, %f6
372 faligndata %f2, %f2, %f8
374 stda %f4, [$out + $omask]0xc0 ! partial store
377 orn %g0, $omask, $omask
378 stda %f8, [$out + $omask]0xc0 ! partial store
380 brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
381 orn %g0, $omask, $omask
383 $::code.=<<___ if ($::evp);
387 st %f15, [$ivec + 12]
389 $::code.=<<___ if (!$::evp);
390 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
393 std %f12, [$ivec + 0] ! write out ivec
394 std %f14, [$ivec + 8]
400 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
402 .L${bits}_cbc_dec_loop2x:
410 sllx %o0, $ileft, %o0
411 srlx %o1, $iright, %g1
413 sllx %o1, $ileft, %o1
414 srlx %o2, $iright, %g1
416 sllx %o2, $ileft, %o2
417 srlx %o3, $iright, %g1
419 sllx %o3, $ileft, %o3
420 srlx %o4, $iright, %o4
423 xor %g4, %o0, %o4 ! ^= rk[0]
432 prefetch [$out + 63], 22
433 prefetch [$inp + 32+63], 20
434 call _${alg}${bits}_decrypt_2x
439 fxor %f12, %f0, %f0 ! ^= ivec
453 brnz,pt $len, .L${bits}_cbc_dec_loop2x
456 $::code.=<<___ if ($::evp);
460 st %f15, [$ivec + 12]
462 $::code.=<<___ if (!$::evp);
463 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
466 std %f12, [$ivec + 0] ! write out ivec
467 std %f14, [$ivec + 8]
474 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
475 ! and ~3x deterioration
477 faligndata %f0, %f0, %f8 ! handle unaligned output
478 faligndata %f0, %f2, %f0
479 faligndata %f2, %f4, %f2
480 faligndata %f4, %f6, %f4
481 faligndata %f6, %f6, %f6
482 stda %f8, [$out + $omask]0xc0 ! partial store
487 orn %g0, $omask, $omask
488 stda %f6, [$out + $omask]0xc0 ! partial store
490 brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
491 orn %g0, $omask, $omask
493 $::code.=<<___ if ($::evp);
497 st %f15, [$ivec + 12]
499 $::code.=<<___ if (!$::evp);
500 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
503 std %f12, [$ivec + 0] ! write out ivec
504 std %f14, [$ivec + 8]
509 .L${bits}_cbc_dec_unaligned_ivec:
510 alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
512 srl $omask, $ivoff, $omask
513 faligndata %f12, %f12, %f0
514 faligndata %f12, %f14, %f2
515 faligndata %f14, %f14, %f4
516 stda %f0, [$ivec + $omask]0xc0
519 orn %g0, $omask, $omask
520 stda %f4, [$ivec + $omask]0xc0
526 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
528 .L${bits}cbc_dec_blk:
529 add $out, $len, $blk_init
530 and $blk_init, 63, $blk_init ! tail
531 sub $len, $blk_init, $len
532 add $blk_init, 15, $blk_init ! round up to 16n
534 srl $blk_init, 4, $blk_init
536 add $blk_init, 1, $blk_init
538 .L${bits}_cbc_dec_blk_loop2x:
546 sllx %o0, $ileft, %o0
547 srlx %o1, $iright, %g1
549 sllx %o1, $ileft, %o1
550 srlx %o2, $iright, %g1
552 sllx %o2, $ileft, %o2
553 srlx %o3, $iright, %g1
555 sllx %o3, $ileft, %o3
556 srlx %o4, $iright, %o4
559 xor %g4, %o0, %o4 ! ^= rk[0]
568 prefetch [$inp + 32+63], 20
569 call _${alg}${bits}_decrypt_2x
575 fxor %f12, %f0, %f0 ! ^= ivec
582 stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
584 stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
586 stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
588 stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
589 bgu,pt $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
592 add $blk_init, $len, $len
593 andcc $len, 1, %g0 ! is number of blocks even?
594 membar #StoreLoad|#StoreStore
595 bnz,pt %icc, .L${bits}_cbc_dec_loop
597 brnz,pn $len, .L${bits}_cbc_dec_loop2x
600 $::code.=<<___ if ($::evp);
601 st %f12, [$ivec + 0] ! write out ivec
604 st %f15, [$ivec + 12]
606 $::code.=<<___ if (!$::evp);
610 std %f12, [$ivec + 0] ! write out ivec
611 std %f14, [$ivec + 8]
616 .type ${alg}${bits}_t4_cbc_decrypt,#function
617 .size ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
621 sub alg_ctr32_implement {
622 my ($alg,$bits) = @_;
625 .globl ${alg}${bits}_t4_ctr32_encrypt
627 ${alg}${bits}_t4_ctr32_encrypt:
628 save %sp, -$::frame, %sp
629 srln $len, 0, $len ! needed on v8+, "nop" on v9
632 prefetch [$inp + 63], 20
633 call _${alg}${bits}_load_enckey
636 ld [$ivec + 0], %l4 ! counter
644 xor %o5, %g4, %g4 ! ^= rk[0]
646 movxtod %g4, %f14 ! most significant 64 bits
648 sub $inp, $out, $blk_init ! $inp!=$out
651 sll $ileft, 3, $ileft
654 sub $iright, $ileft, $iright
657 movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
658 movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
659 brnz,pn $blk_init, .L${bits}_ctr32_blk ! $inp==$out)
660 srl $omask, $ooff, $omask
662 andcc $len, 16, %g0 ! is number of blocks even?
663 alignaddrl $out, %g0, $out
664 bz %icc, .L${bits}_ctr32_loop2x
666 .L${bits}_ctr32_loop:
672 sllx %o0, $ileft, %o0
673 srlx %o1, $iright, %g1
674 sllx %o1, $ileft, %o1
676 srlx %o2, $iright, %o2
679 xor %g5, %l7, %g1 ! ^= rk[0]
682 srl %l7, 0, %l7 ! clruw
683 prefetch [$out + 63], 22
684 prefetch [$inp + 16+63], 20
686 $::code.=<<___ if ($alg eq "aes");
687 aes_eround01 %f16, %f14, %f2, %f4
688 aes_eround23 %f18, %f14, %f2, %f2
690 $::code.=<<___ if ($alg eq "cmll");
691 camellia_f %f16, %f2, %f14, %f2
692 camellia_f %f18, %f14, %f2, %f0
695 call _${alg}${bits}_encrypt_1x+8
700 fxor %f10, %f0, %f0 ! ^= inp
708 brnz,pt $len, .L${bits}_ctr32_loop2x
715 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
716 ! and ~3x deterioration
718 faligndata %f0, %f0, %f4 ! handle unaligned output
719 faligndata %f0, %f2, %f6
720 faligndata %f2, %f2, %f8
721 stda %f4, [$out + $omask]0xc0 ! partial store
724 orn %g0, $omask, $omask
725 stda %f8, [$out + $omask]0xc0 ! partial store
727 brnz,pt $len, .L${bits}_ctr32_loop2x+4
728 orn %g0, $omask, $omask
733 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
735 .L${bits}_ctr32_loop2x:
743 sllx %o0, $ileft, %o0
744 srlx %o1, $iright, %g1
746 sllx %o1, $ileft, %o1
747 srlx %o2, $iright, %g1
749 sllx %o2, $ileft, %o2
750 srlx %o3, $iright, %g1
752 sllx %o3, $ileft, %o3
753 srlx %o4, $iright, %o4
756 xor %g5, %l7, %g1 ! ^= rk[0]
759 srl %l7, 0, %l7 ! clruw
763 srl %l7, 0, %l7 ! clruw
764 prefetch [$out + 63], 22
765 prefetch [$inp + 32+63], 20
767 $::code.=<<___ if ($alg eq "aes");
768 aes_eround01 %f16, %f14, %f2, %f8
769 aes_eround23 %f18, %f14, %f2, %f2
770 aes_eround01 %f16, %f14, %f6, %f10
771 aes_eround23 %f18, %f14, %f6, %f6
773 $::code.=<<___ if ($alg eq "cmll");
774 camellia_f %f16, %f2, %f14, %f2
775 camellia_f %f16, %f6, %f14, %f6
776 camellia_f %f18, %f14, %f2, %f0
777 camellia_f %f18, %f14, %f6, %f4
780 call _${alg}${bits}_encrypt_2x+16
786 fxor %f8, %f0, %f0 ! ^= inp
799 brnz,pt $len, .L${bits}_ctr32_loop2x
806 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
807 ! and ~3x deterioration
809 faligndata %f0, %f0, %f8 ! handle unaligned output
810 faligndata %f0, %f2, %f0
811 faligndata %f2, %f4, %f2
812 faligndata %f4, %f6, %f4
813 faligndata %f6, %f6, %f6
815 stda %f8, [$out + $omask]0xc0 ! partial store
820 orn %g0, $omask, $omask
821 stda %f6, [$out + $omask]0xc0 ! partial store
823 brnz,pt $len, .L${bits}_ctr32_loop2x+4
824 orn %g0, $omask, $omask
829 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
832 add $out, $len, $blk_init
833 and $blk_init, 63, $blk_init ! tail
834 sub $len, $blk_init, $len
835 add $blk_init, 15, $blk_init ! round up to 16n
837 srl $blk_init, 4, $blk_init
839 add $blk_init, 1, $blk_init
841 .L${bits}_ctr32_blk_loop2x:
849 sllx %o0, $ileft, %o0
850 srlx %o1, $iright, %g1
852 sllx %o1, $ileft, %o1
853 srlx %o2, $iright, %g1
855 sllx %o2, $ileft, %o2
856 srlx %o3, $iright, %g1
858 sllx %o3, $ileft, %o3
859 srlx %o4, $iright, %o4
862 xor %g5, %l7, %g1 ! ^= rk[0]
865 srl %l7, 0, %l7 ! clruw
869 srl %l7, 0, %l7 ! clruw
870 prefetch [$inp + 32+63], 20
872 $::code.=<<___ if ($alg eq "aes");
873 aes_eround01 %f16, %f14, %f2, %f8
874 aes_eround23 %f18, %f14, %f2, %f2
875 aes_eround01 %f16, %f14, %f6, %f10
876 aes_eround23 %f18, %f14, %f6, %f6
878 $::code.=<<___ if ($alg eq "cmll");
879 camellia_f %f16, %f2, %f14, %f2
880 camellia_f %f16, %f6, %f14, %f6
881 camellia_f %f18, %f14, %f2, %f0
882 camellia_f %f18, %f14, %f6, %f4
885 call _${alg}${bits}_encrypt_2x+16
892 fxor %f8, %f0, %f0 ! ^= inp
898 stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
900 stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
902 stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
904 stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
905 bgu,pt $::size_t_cc, .L${bits}_ctr32_blk_loop2x
908 add $blk_init, $len, $len
909 andcc $len, 1, %g0 ! is number of blocks even?
910 membar #StoreLoad|#StoreStore
911 bnz,pt %icc, .L${bits}_ctr32_loop
913 brnz,pn $len, .L${bits}_ctr32_loop2x
918 .type ${alg}${bits}_t4_ctr32_encrypt,#function
919 .size ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
923 sub alg_xts_implement {
924 my ($alg,$bits,$dir) = @_;
925 my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
929 .globl ${alg}${bits}_t4_xts_${dir}crypt
931 ${alg}${bits}_t4_xts_${dir}crypt:
932 save %sp, -$::frame-16, %sp
933 srln $len, 0, $len ! needed on v8+, "nop" on v9
936 add %fp, $::bias-16, %o1
937 call ${alg}_t4_encrypt
940 add %fp, $::bias-16, %l7
942 add %fp, $::bias-8, %l7
943 ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak
945 sethi %hi(0x76543210), %l7
946 or %l7, %lo(0x76543210), %l7
947 bmask %l7, %g0, %g0 ! byte swap mask
950 prefetch [$inp + 63], 20
951 call _${alg}${bits}_load_${dir}ckey
955 $code.=<<___ if ($dir eq "de");
962 sub $inp, $out, $blk_init ! $inp!=$out
965 sll $ileft, 3, $ileft
968 sub $iright, $ileft, $iright
971 movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
972 movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
973 brnz,pn $blk_init, .L${bits}_xts_${dir}blk ! $inp==$out)
974 srl $omask, $ooff, $omask
976 andcc $len, 16, %g0 ! is number of blocks even?
978 $code.=<<___ if ($dir eq "de");
979 brz,pn $len, .L${bits}_xts_${dir}steal
982 alignaddrl $out, %g0, $out
983 bz %icc, .L${bits}_xts_${dir}loop2x
985 .L${bits}_xts_${dir}loop:
991 sllx %o0, $ileft, %o0
992 srlx %o1, $iright, %g1
993 sllx %o1, $ileft, %o1
995 srlx %o2, $iright, %o2
1000 bshuffle %f12, %f12, %f12
1001 bshuffle %f14, %f14, %f14
1003 xor %g4, %o0, %o0 ! ^= rk[0]
1008 fxor %f12, %f0, %f0 ! ^= tweak[0]
1011 prefetch [$out + 63], 22
1012 prefetch [$inp + 16+63], 20
1013 call _${alg}${bits}_${dir}crypt_1x
1016 fxor %f12, %f0, %f0 ! ^= tweak[0]
1019 srax %g3, 63, %l7 ! next tweak value
1030 brnz,pt $len, .L${bits}_xts_${dir}loop2x
1033 brnz,pn $rem, .L${bits}_xts_${dir}steal
1040 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
1041 ! and ~3x deterioration
1043 faligndata %f0, %f0, %f4 ! handle unaligned output
1044 faligndata %f0, %f2, %f6
1045 faligndata %f2, %f2, %f8
1046 stda %f4, [$out + $omask]0xc0 ! partial store
1049 orn %g0, $omask, $omask
1050 stda %f8, [$out + $omask]0xc0 ! partial store
1052 brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
1053 orn %g0, $omask, $omask
1055 brnz,pn $rem, .L${bits}_xts_${dir}steal
1061 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1063 .L${bits}_xts_${dir}loop2x:
1066 ldx [$inp + 16], %o2
1068 ldx [$inp + 24], %o3
1070 ldx [$inp + 32], %o4
1071 sllx %o0, $ileft, %o0
1072 srlx %o1, $iright, %g1
1074 sllx %o1, $ileft, %o1
1075 srlx %o2, $iright, %g1
1077 sllx %o2, $ileft, %o2
1078 srlx %o3, $iright, %g1
1080 sllx %o3, $ileft, %o3
1081 srlx %o4, $iright, %o4
1086 bshuffle %f12, %f12, %f12
1087 bshuffle %f14, %f14, %f14
1089 srax %g3, 63, %l7 ! next tweak value
1097 bshuffle %f8, %f8, %f8
1098 bshuffle %f10, %f10, %f10
1100 xor %g4, %o0, %o0 ! ^= rk[0]
1102 xor %g4, %o2, %o2 ! ^= rk[0]
1109 fxor %f12, %f0, %f0 ! ^= tweak[0]
1111 fxor %f8, %f4, %f4 ! ^= tweak[0]
1114 prefetch [$out + 63], 22
1115 prefetch [$inp + 32+63], 20
1116 call _${alg}${bits}_${dir}crypt_2x
1122 srax %g3, 63, %l7 ! next tweak value
1128 bshuffle %f8, %f8, %f8
1129 bshuffle %f10, %f10, %f10
1131 fxor %f12, %f0, %f0 ! ^= tweak[0]
1141 std %f4, [$out + 16]
1142 std %f6, [$out + 24]
1143 brnz,pt $len, .L${bits}_xts_${dir}loop2x
1148 brnz,pn $rem, .L${bits}_xts_${dir}steal
1155 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
1156 ! and ~3x deterioration
1158 faligndata %f0, %f0, %f8 ! handle unaligned output
1159 faligndata %f0, %f2, %f10
1160 faligndata %f2, %f4, %f12
1161 faligndata %f4, %f6, %f14
1162 faligndata %f6, %f6, %f0
1164 stda %f8, [$out + $omask]0xc0 ! partial store
1165 std %f10, [$out + 8]
1166 std %f12, [$out + 16]
1167 std %f14, [$out + 24]
1169 orn %g0, $omask, $omask
1170 stda %f0, [$out + $omask]0xc0 ! partial store
1172 brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
1173 orn %g0, $omask, $omask
1177 brnz,pn $rem, .L${bits}_xts_${dir}steal
1183 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1185 .L${bits}_xts_${dir}blk:
1186 add $out, $len, $blk_init
1187 and $blk_init, 63, $blk_init ! tail
1188 sub $len, $blk_init, $len
1189 add $blk_init, 15, $blk_init ! round up to 16n
1191 srl $blk_init, 4, $blk_init
1193 add $blk_init, 1, $blk_init
1195 .L${bits}_xts_${dir}blk2x:
1198 ldx [$inp + 16], %o2
1200 ldx [$inp + 24], %o3
1202 ldx [$inp + 32], %o4
1203 sllx %o0, $ileft, %o0
1204 srlx %o1, $iright, %g1
1206 sllx %o1, $ileft, %o1
1207 srlx %o2, $iright, %g1
1209 sllx %o2, $ileft, %o2
1210 srlx %o3, $iright, %g1
1212 sllx %o3, $ileft, %o3
1213 srlx %o4, $iright, %o4
1218 bshuffle %f12, %f12, %f12
1219 bshuffle %f14, %f14, %f14
1221 srax %g3, 63, %l7 ! next tweak value
1229 bshuffle %f8, %f8, %f8
1230 bshuffle %f10, %f10, %f10
1232 xor %g4, %o0, %o0 ! ^= rk[0]
1234 xor %g4, %o2, %o2 ! ^= rk[0]
1241 fxor %f12, %f0, %f0 ! ^= tweak[0]
1243 fxor %f8, %f4, %f4 ! ^= tweak[0]
1246 prefetch [$inp + 32+63], 20
1247 call _${alg}${bits}_${dir}crypt_2x
1253 srax %g3, 63, %l7 ! next tweak value
1259 bshuffle %f8, %f8, %f8
1260 bshuffle %f10, %f10, %f10
1262 fxor %f12, %f0, %f0 ! ^= tweak[0]
1268 stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
1270 stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
1272 stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
1274 stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
1275 bgu,pt $::size_t_cc, .L${bits}_xts_${dir}blk2x
1278 add $blk_init, $len, $len
1279 andcc $len, 1, %g0 ! is number of blocks even?
1280 membar #StoreLoad|#StoreStore
1281 bnz,pt %icc, .L${bits}_xts_${dir}loop
1283 brnz,pn $len, .L${bits}_xts_${dir}loop2x
1288 brnz,pn $rem, .L${bits}_xts_${dir}steal
1293 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1295 $code.=<<___ if ($dir eq "en");
1297 .L${bits}_xts_${dir}steal:
1298 std %f0, [%fp + $::bias-16] ! copy of output
1299 std %f2, [%fp + $::bias-8]
1301 srl $ileft, 3, $ileft
1302 add %fp, $::bias-16, %l7
1303 add $inp, $ileft, $inp ! original $inp+$len&-15
1304 add $out, $ooff, $out ! original $out+$len&-15
1308 .L${bits}_xts_${dir}stealing:
1309 ldub [$inp + $ileft], %o0
1310 ldub [%l7 + $ileft], %o1
1312 stb %o0, [%l7 + $ileft]
1313 stb %o1, [$out + $ileft]
1314 brnz $rem, .L${bits}_xts_${dir}stealing
1320 sub $out, $ooff, $out
1321 ba .L${bits}_xts_${dir}loop ! one more time
1322 mov 1, $len ! $rem is 0
1324 $code.=<<___ if ($dir eq "de");
1326 .L${bits}_xts_${dir}steal:
1331 ldx [$inp + 16], %o2
1332 sllx %o0, $ileft, %o0
1333 srlx %o1, $iright, %g1
1334 sllx %o1, $ileft, %o1
1336 srlx %o2, $iright, %o2
1339 srax %g3, 63, %l7 ! next tweak value
1347 bshuffle %f12, %f12, %f12
1348 bshuffle %f14, %f14, %f14
1350 xor %g4, %o0, %o0 ! ^= rk[0]
1355 fxor %f12, %f0, %f0 ! ^= tweak[0]
1358 call _${alg}${bits}_${dir}crypt_1x
1361 fxor %f12, %f0, %f0 ! ^= tweak[0]
1364 std %f0, [%fp + $::bias-16]
1365 std %f2, [%fp + $::bias-8]
1367 srl $ileft, 3, $ileft
1368 add %fp, $::bias-16, %l7
1369 add $inp, $ileft, $inp ! original $inp+$len&-15
1370 add $out, $ooff, $out ! original $out+$len&-15
1375 .L${bits}_xts_${dir}stealing:
1376 ldub [$inp + $ileft], %o0
1377 ldub [%l7 + $ileft], %o1
1379 stb %o0, [%l7 + $ileft]
1380 stb %o1, [$out + $ileft]
1381 brnz $rem, .L${bits}_xts_${dir}stealing
1387 sub $out, $ooff, $out
1388 ba .L${bits}_xts_${dir}loop ! one more time
1389 mov 1, $len ! $rem is 0
1394 .type ${alg}${bits}_t4_xts_${dir}crypt,#function
1395 .size ${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
1399 # Purpose of these subroutines is to explicitly encode VIS instructions,
1400 # so that one can compile the module without having to specify VIS
1401 # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
1402 # Idea is to reserve for option to produce "universal" binary and let
1403 # programmer detect if current CPU is VIS capable at run-time.
1405 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1407 my %visopf = ( "faligndata" => 0x048,
1408 "bshuffle" => 0x04c,
1413 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1415 if ($opf=$visopf{$mnemonic}) {
1416 foreach ($rs1,$rs2,$rd) {
1417 return $ref if (!/%f([0-9]{1,2})/);
1420 return $ref if ($1&1);
1421 # re-encode for upper double register addressing
1426 return sprintf ".word\t0x%08x !%s",
1427 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
1435 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1436 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
1438 my %visopf = ( "addxc" => 0x011,
1441 "alignaddr" => 0x018,
1443 "alignaddrl" => 0x01a );
1445 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1447 if ($opf=$visopf{$mnemonic}) {
1448 foreach ($rs1,$rs2,$rd) {
1449 return $ref if (!/%([goli])([0-9])/);
1453 return sprintf ".word\t0x%08x !%s",
1454 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
1461 sub unaes_round { # 4-argument instructions
1462 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
1464 my %aesopf = ( "aes_eround01" => 0,
1465 "aes_eround23" => 1,
1466 "aes_dround01" => 2,
1467 "aes_dround23" => 3,
1468 "aes_eround01_l"=> 4,
1469 "aes_eround23_l"=> 5,
1470 "aes_dround01_l"=> 6,
1471 "aes_dround23_l"=> 7,
1472 "aes_kexpand1" => 8 );
1474 $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
1476 if (defined($opf=$aesopf{$mnemonic})) {
1477 $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
1478 foreach ($rs1,$rs2,$rd) {
1479 return $ref if (!/%f([0-9]{1,2})/);
1482 return $ref if ($1&1);
1483 # re-encode for upper double register addressing
1488 return sprintf ".word\t0x%08x !%s",
1489 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
1496 sub unaes_kexpand { # 3-argument instructions
1497 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1499 my %aesopf = ( "aes_kexpand0" => 0x130,
1500 "aes_kexpand2" => 0x131 );
1502 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1504 if (defined($opf=$aesopf{$mnemonic})) {
1505 foreach ($rs1,$rs2,$rd) {
1506 return $ref if (!/%f([0-9]{1,2})/);
1509 return $ref if ($1&1);
1510 # re-encode for upper double register addressing
1515 return sprintf ".word\t0x%08x !%s",
1516 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
1523 sub uncamellia_f { # 4-argument instructions
1524 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
1527 $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
1530 $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
1531 foreach ($rs1,$rs2,$rd) {
1532 return $ref if (!/%f([0-9]{1,2})/);
1535 return $ref if ($1&1);
1536 # re-encode for upper double register addressing
1541 return sprintf ".word\t0x%08x !%s",
1542 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
1549 sub uncamellia3 { # 3-argument instructions
1550 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1552 my %cmllopf = ( "camellia_fl" => 0x13c,
1553 "camellia_fli" => 0x13d );
1555 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1557 if (defined($opf=$cmllopf{$mnemonic})) {
1558 foreach ($rs1,$rs2,$rd) {
1559 return $ref if (!/%f([0-9]{1,2})/);
1562 return $ref if ($1&1);
1563 # re-encode for upper double register addressing
1568 return sprintf ".word\t0x%08x !%s",
1569 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
1576 sub unmovxtox { # 2-argument instructions
1577 my ($mnemonic,$rs,$rd)=@_;
1578 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
1580 my %movxopf = ( "movdtox" => 0x110,
1581 "movstouw" => 0x111,
1582 "movstosw" => 0x113,
1584 "movwtos" => 0x119 );
1586 $ref = "$mnemonic\t$rs,$rd";
1588 if (defined($opf=$movxopf{$mnemonic})) {
1590 return $ref if (!/%([fgoli])([0-9]{1,2})/);
1593 return $ref if ($2&1);
1594 # re-encode for upper double register addressing
1599 return sprintf ".word\t0x%08x !%s",
1600 2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
1608 my ($mnemonic)=shift;
1611 my %desopf = ( "des_round" => 0b1001,
1612 "des_ip" => 0b100110100,
1613 "des_iip" => 0b100110101,
1614 "des_kexpand" => 0b100110110 );
1616 $ref = "$mnemonic\t".join(",",@_);
1618 if (defined($opf=$desopf{$mnemonic})) { # 4-arg
1619 if ($mnemonic eq "des_round") {
1620 foreach (@args[0..3]) {
1621 return $ref if (!/%f([0-9]{1,2})/);
1624 return $ref if ($1&1);
1625 # re-encode for upper double register addressing
1629 return sprintf ".word\t0x%08x !%s",
1630 2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
1632 } elsif ($mnemonic eq "des_kexpand") { # 3-arg
1633 foreach (@args[0..2]) {
1634 return $ref if (!/(%f)?([0-9]{1,2})/);
1637 return $ref if ($2&1);
1638 # re-encode for upper double register addressing
1642 return sprintf ".word\t0x%08x !%s",
1643 2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
1646 foreach (@args[0..1]) {
1647 return $ref if (!/%f([0-9]{1,2})/);
1650 return $ref if ($2&1);
1651 # re-encode for upper double register addressing
1655 return sprintf ".word\t0x%08x !%s",
1656 2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
1664 sub emit_assembler {
1665 foreach (split("\n",$::code)) {
1666 s/\`([^\`]*)\`/eval $1/ge;
1668 s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;
1670 s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
1671 &unaes_round($1,$2,$3,$4,$5)
1673 s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1674 &unaes_kexpand($1,$2,$3,$4)
1676 s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
1677 &uncamellia_f($1,$2,$3,$4,$5)
1679 s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1680 &uncamellia3($1,$2,$3,$4)
1682 s/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
1683 &undes($1,$2,$3,$4,$5)
1685 s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
1686 &unmovxtox($1,$2,$3)
1688 s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
1689 &unmovxtox($1,$2,$3)
1691 s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1694 s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
1695 &unvis3($1,$2,$3,$4)