2 # Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # Specific modes implementations for SPARC Architecture 2011. There
11 # is T4 dependency though, an ASI value that is not specified in the
12 # Architecture Manual. But as SPARC universe is rather monocultural,
13 # we imply that processor capable of executing crypto instructions
14 # can handle the ASI in question as well. This means that we ought to
15 # keep eyes open when new processors emerge...
17 # As for above mentioned ASI. It's so called "block initializing
18 # store" which cancels "read" in "read-update-write" on cache lines.
19 # This is "cooperative" optimization, as it reduces overall pressure
20 # on memory interface. Benefits can't be observed/quantified with
21 # usual benchmarks, on the contrary you can notice that single-thread
22 # performance for parallelizable modes is ~1.5% worse for largest
23 # block sizes [though few percent better for not so long ones]. All
24 # this based on suggestions from David Miller.
# Register-window frame size and the condition-code register used for
# size_t-width comparisons.  These begin as textual placeholders and
# are replaced with ABI-specific values by asm_init() below.
27 $::frame="STACK_FRAME";
28 $::size_t_cc="SIZE_T_CC";
# Initialize global perlasm parameters from the compiler flags passed
# in @ARGV: sets $::abibits to 64 when -m64 or -xarch=v9 is present,
# then derives $::bias (SPARC V9 stack bias), $::frame (minimal
# register save-area size) and $::size_t_cc (%xcc vs %icc, the
# condition-code register matching size_t width).
30 sub asm_init { # to be called with @ARGV as argument
31 for (@_) { $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
# NOTE(review): $::abibits may be undefined here when no 64-bit flag was
# seen; the == comparison relies on undef != 64 -- confirm warnings-clean.
32 if ($::abibits==64) { $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
33 else { $::bias=0; $::frame=112; $::size_t_cc="%icc"; }
# Symbolic register names shared by the code generators below: the
# routine arguments arrive in input registers %i0..%i5, scratch values
# live in local registers %l0..%l7.  List assignment silently drops the
# surplus values produced by map() when there are fewer names.
37 my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
39 my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
# Generate a CBC-encrypt entry point, ${alg}${bits}_t4_cbc_encrypt, by
# appending SPARC T4 assembly text to $::code.  The emitted routine:
#  - loads the IV directly in the EVP flavour, or via alignaddr/
#    faligndata in the non-EVP flavour (the IV may be unaligned there);
#  - runs a one-block-per-iteration loop calling
#    _${alg}${bits}_encrypt_1x, XORing the previous ciphertext/IV in
#    %f12/%f14 into the input (`! ^= ivec');
#  - uses partial stores (ASI 0xc0) for unaligned output, and a
#    separate .L*cbc_enc_blk path with block-initializing stores
#    (ASI 0xe2, T4-specific) when the output is aligned, the length is
#    large enough and input/output do not overlap -- see the
#    movrnz/movleu/brnz predicate and its `! if (...)' comments.
# NOTE(review): heredoc terminators and some interleaved Perl lines are
# outside this excerpt; the assembly lines below are string data
# appended to $::code, not Perl code.
41 sub alg_cbc_encrypt_implement {
45 .globl ${alg}${bits}_t4_cbc_encrypt
47 ${alg}${bits}_t4_cbc_encrypt:
48 save %sp, -$::frame, %sp
50 be,pn $::size_t_cc, .L${bits}_cbc_enc_abort
51 sub $inp, $out, $blk_init ! $inp!=$out
53 $::code.=<<___ if (!$::evp);
54 andcc $ivec, 7, $ivoff
55 alignaddr $ivec, %g0, $ivec
57 ldd [$ivec + 0], %f0 ! load ivec
61 faligndata %f0, %f2, %f0
62 faligndata %f2, %f4, %f2
65 $::code.=<<___ if ($::evp);
73 prefetch [$inp + 63], 20
74 call _${alg}${bits}_load_enckey
80 sub $iright, $ileft, $iright
83 movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
84 movleu $::size_t_cc, 0, $blk_init ! $len<128 ||
85 brnz,pn $blk_init, .L${bits}cbc_enc_blk ! $inp==$out)
86 srl $omask, $ooff, $omask
88 alignaddrl $out, %g0, $out
92 .L${bits}_cbc_enc_loop:
99 srlx %o1, $iright, %g1
100 sllx %o1, $ileft, %o1
102 srlx %o2, $iright, %o2
105 xor %g4, %o0, %o0 ! ^= rk[0]
110 fxor %f12, %f0, %f0 ! ^= ivec
112 prefetch [$out + 63], 22
113 prefetch [$inp + 16+63], 20
114 call _${alg}${bits}_encrypt_1x
122 brnz,pt $len, .L${bits}_cbc_enc_loop
125 $::code.=<<___ if ($::evp);
131 $::code.=<<___ if (!$::evp);
135 std %f0, [$ivec + 0] ! write out ivec
139 .L${bits}_cbc_enc_abort:
144 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
145 ! and ~3x deterioration
147 faligndata %f0, %f0, %f4 ! handle unaligned output
148 faligndata %f0, %f2, %f6
149 faligndata %f2, %f2, %f8
151 stda %f4, [$out + $omask]0xc0 ! partial store
154 orn %g0, $omask, $omask
155 stda %f8, [$out + $omask]0xc0 ! partial store
157 brnz,pt $len, .L${bits}_cbc_enc_loop+4
158 orn %g0, $omask, $omask
160 $::code.=<<___ if ($::evp);
166 $::code.=<<___ if (!$::evp);
170 std %f0, [$ivec + 0] ! write out ivec
176 3: alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
178 srl $omask, $ivoff, $omask
179 faligndata %f0, %f0, %f4
180 faligndata %f0, %f2, %f6
181 faligndata %f2, %f2, %f8
182 stda %f4, [$ivec + $omask]0xc0
185 orn %g0, $omask, $omask
186 stda %f8, [$ivec + $omask]0xc0
192 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
194 .L${bits}cbc_enc_blk:
195 add $out, $len, $blk_init
196 and $blk_init, 63, $blk_init ! tail
197 sub $len, $blk_init, $len
198 add $blk_init, 15, $blk_init ! round up to 16n
200 srl $blk_init, 4, $blk_init
202 .L${bits}_cbc_enc_blk_loop:
208 sllx %o0, $ileft, %o0
209 srlx %o1, $iright, %g1
210 sllx %o1, $ileft, %o1
212 srlx %o2, $iright, %o2
215 xor %g4, %o0, %o0 ! ^= rk[0]
220 fxor %f12, %f0, %f0 ! ^= ivec
222 prefetch [$inp + 16+63], 20
223 call _${alg}${bits}_encrypt_1x
227 stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
229 stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
230 brnz,pt $len, .L${bits}_cbc_enc_blk_loop
233 membar #StoreLoad|#StoreStore
234 brnz,pt $blk_init, .L${bits}_cbc_enc_loop
237 $::code.=<<___ if ($::evp);
243 $::code.=<<___ if (!$::evp);
247 std %f0, [$ivec + 0] ! write out ivec
253 .type ${alg}${bits}_t4_cbc_encrypt,#function
254 .size ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
# Generate a CBC-decrypt entry point, ${alg}${bits}_t4_cbc_decrypt,
# appended to $::code.  Structure mirrors the encrypt generator but
# decryption is parallelizable, so when the block count is even the
# routine runs .L*_cbc_dec_loop2x processing two 16-byte blocks per
# iteration via _${alg}${bits}_decrypt_2x (single-block loop otherwise).
# The running IV/previous-ciphertext lives in %f12/%f14 (`! ^= ivec');
# unaligned output goes through faligndata + partial stores (ASI 0xc0),
# unaligned IV write-back through .L*_cbc_dec_unaligned_ivec, and a
# block-initializing-store path (ASI 0xe2, T4-specific) is taken per
# the movrnz/movleu/brnz predicate (`! if ( $out&7 || $len<256 ...').
# NOTE(review): heredoc terminators are outside this excerpt; assembly
# lines below are string data, not Perl.
258 sub alg_cbc_decrypt_implement {
259 my ($alg,$bits) = @_;
262 .globl ${alg}${bits}_t4_cbc_decrypt
264 ${alg}${bits}_t4_cbc_decrypt:
265 save %sp, -$::frame, %sp
267 be,pn $::size_t_cc, .L${bits}_cbc_dec_abort
268 sub $inp, $out, $blk_init ! $inp!=$out
270 $::code.=<<___ if (!$::evp);
271 andcc $ivec, 7, $ivoff
272 alignaddr $ivec, %g0, $ivec
274 ldd [$ivec + 0], %f12 ! load ivec
276 ldd [$ivec + 8], %f14
277 ldd [$ivec + 16], %f0
278 faligndata %f12, %f14, %f12
279 faligndata %f14, %f0, %f14
282 $::code.=<<___ if ($::evp);
283 ld [$ivec + 0], %f12 ! load ivec
286 ld [$ivec + 12], %f15
290 prefetch [$inp + 63], 20
291 call _${alg}${bits}_load_deckey
294 sll $ileft, 3, $ileft
297 sub $iright, $ileft, $iright
300 movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
301 movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
302 brnz,pn $blk_init, .L${bits}cbc_dec_blk ! $inp==$out)
303 srl $omask, $ooff, $omask
305 andcc $len, 16, %g0 ! is number of blocks even?
307 alignaddrl $out, %g0, $out
308 bz %icc, .L${bits}_cbc_dec_loop2x
310 .L${bits}_cbc_dec_loop:
316 sllx %o0, $ileft, %o0
317 srlx %o1, $iright, %g1
318 sllx %o1, $ileft, %o1
320 srlx %o2, $iright, %o2
323 xor %g4, %o0, %o2 ! ^= rk[0]
328 prefetch [$out + 63], 22
329 prefetch [$inp + 16+63], 20
330 call _${alg}${bits}_decrypt_1x
333 fxor %f12, %f0, %f0 ! ^= ivec
343 brnz,pt $len, .L${bits}_cbc_dec_loop2x
346 $::code.=<<___ if ($::evp);
350 st %f15, [$ivec + 12]
352 $::code.=<<___ if (!$::evp);
353 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
356 std %f12, [$ivec + 0] ! write out ivec
357 std %f14, [$ivec + 8]
360 .L${bits}_cbc_dec_abort:
365 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
366 ! and ~3x deterioration
368 faligndata %f0, %f0, %f4 ! handle unaligned output
369 faligndata %f0, %f2, %f6
370 faligndata %f2, %f2, %f8
372 stda %f4, [$out + $omask]0xc0 ! partial store
375 orn %g0, $omask, $omask
376 stda %f8, [$out + $omask]0xc0 ! partial store
378 brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
379 orn %g0, $omask, $omask
381 $::code.=<<___ if ($::evp);
385 st %f15, [$ivec + 12]
387 $::code.=<<___ if (!$::evp);
388 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
391 std %f12, [$ivec + 0] ! write out ivec
392 std %f14, [$ivec + 8]
398 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
400 .L${bits}_cbc_dec_loop2x:
408 sllx %o0, $ileft, %o0
409 srlx %o1, $iright, %g1
411 sllx %o1, $ileft, %o1
412 srlx %o2, $iright, %g1
414 sllx %o2, $ileft, %o2
415 srlx %o3, $iright, %g1
417 sllx %o3, $ileft, %o3
418 srlx %o4, $iright, %o4
421 xor %g4, %o0, %o4 ! ^= rk[0]
430 prefetch [$out + 63], 22
431 prefetch [$inp + 32+63], 20
432 call _${alg}${bits}_decrypt_2x
437 fxor %f12, %f0, %f0 ! ^= ivec
451 brnz,pt $len, .L${bits}_cbc_dec_loop2x
454 $::code.=<<___ if ($::evp);
458 st %f15, [$ivec + 12]
460 $::code.=<<___ if (!$::evp);
461 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
464 std %f12, [$ivec + 0] ! write out ivec
465 std %f14, [$ivec + 8]
472 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
473 ! and ~3x deterioration
475 faligndata %f0, %f0, %f8 ! handle unaligned output
476 faligndata %f0, %f2, %f0
477 faligndata %f2, %f4, %f2
478 faligndata %f4, %f6, %f4
479 faligndata %f6, %f6, %f6
480 stda %f8, [$out + $omask]0xc0 ! partial store
485 orn %g0, $omask, $omask
486 stda %f6, [$out + $omask]0xc0 ! partial store
488 brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
489 orn %g0, $omask, $omask
491 $::code.=<<___ if ($::evp);
495 st %f15, [$ivec + 12]
497 $::code.=<<___ if (!$::evp);
498 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
501 std %f12, [$ivec + 0] ! write out ivec
502 std %f14, [$ivec + 8]
507 .L${bits}_cbc_dec_unaligned_ivec:
508 alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
510 srl $omask, $ivoff, $omask
511 faligndata %f12, %f12, %f0
512 faligndata %f12, %f14, %f2
513 faligndata %f14, %f14, %f4
514 stda %f0, [$ivec + $omask]0xc0
517 orn %g0, $omask, $omask
518 stda %f4, [$ivec + $omask]0xc0
524 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
526 .L${bits}cbc_dec_blk:
527 add $out, $len, $blk_init
528 and $blk_init, 63, $blk_init ! tail
529 sub $len, $blk_init, $len
530 add $blk_init, 15, $blk_init ! round up to 16n
532 srl $blk_init, 4, $blk_init
534 add $blk_init, 1, $blk_init
536 .L${bits}_cbc_dec_blk_loop2x:
544 sllx %o0, $ileft, %o0
545 srlx %o1, $iright, %g1
547 sllx %o1, $ileft, %o1
548 srlx %o2, $iright, %g1
550 sllx %o2, $ileft, %o2
551 srlx %o3, $iright, %g1
553 sllx %o3, $ileft, %o3
554 srlx %o4, $iright, %o4
557 xor %g4, %o0, %o4 ! ^= rk[0]
566 prefetch [$inp + 32+63], 20
567 call _${alg}${bits}_decrypt_2x
573 fxor %f12, %f0, %f0 ! ^= ivec
580 stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
582 stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
584 stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
586 stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
587 bgu,pt $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
590 add $blk_init, $len, $len
591 andcc $len, 1, %g0 ! is number of blocks even?
592 membar #StoreLoad|#StoreStore
593 bnz,pt %icc, .L${bits}_cbc_dec_loop
595 brnz,pn $len, .L${bits}_cbc_dec_loop2x
598 $::code.=<<___ if ($::evp);
599 st %f12, [$ivec + 0] ! write out ivec
602 st %f15, [$ivec + 12]
604 $::code.=<<___ if (!$::evp);
608 std %f12, [$ivec + 0] ! write out ivec
609 std %f14, [$ivec + 8]
614 .type ${alg}${bits}_t4_cbc_decrypt,#function
615 .size ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
# Generate a 32-bit-counter CTR-mode entry point,
# ${alg}${bits}_t4_ctr32_encrypt, appended to $::code.  Key points
# visible in the emitted code:
#  - the counter word is loaded from the IV into %l4 (`! counter') and
#    kept with its upper bits cleared via `srl %l7, 0, %l7 ! clruw';
#  - the first cipher round is inlined per algorithm (aes_eround01/23
#    for "aes", camellia_f for "cmll") and the remainder entered at
#    _${alg}${bits}_encrypt_1x+8 / _encrypt_2x+16, skipping the round
#    already done;
#  - keystream is XORed with input (`! ^= inp'); two-block loop when
#    the block count is even, plus unaligned-output partial stores
#    (ASI 0xc0) and a block-initializing-store path (ASI 0xe2,
#    T4-specific) guarded by the movrnz/movleu/brnz predicate.
# NOTE(review): heredoc terminators are outside this excerpt; assembly
# lines below are string data, not Perl.
619 sub alg_ctr32_implement {
620 my ($alg,$bits) = @_;
623 .globl ${alg}${bits}_t4_ctr32_encrypt
625 ${alg}${bits}_t4_ctr32_encrypt:
626 save %sp, -$::frame, %sp
629 prefetch [$inp + 63], 20
630 call _${alg}${bits}_load_enckey
633 ld [$ivec + 0], %l4 ! counter
641 xor %o5, %g4, %g4 ! ^= rk[0]
643 movxtod %g4, %f14 ! most significant 64 bits
645 sub $inp, $out, $blk_init ! $inp!=$out
648 sll $ileft, 3, $ileft
651 sub $iright, $ileft, $iright
654 movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
655 movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
656 brnz,pn $blk_init, .L${bits}_ctr32_blk ! $inp==$out)
657 srl $omask, $ooff, $omask
659 andcc $len, 16, %g0 ! is number of blocks even?
660 alignaddrl $out, %g0, $out
661 bz %icc, .L${bits}_ctr32_loop2x
663 .L${bits}_ctr32_loop:
669 sllx %o0, $ileft, %o0
670 srlx %o1, $iright, %g1
671 sllx %o1, $ileft, %o1
673 srlx %o2, $iright, %o2
676 xor %g5, %l7, %g1 ! ^= rk[0]
679 srl %l7, 0, %l7 ! clruw
680 prefetch [$out + 63], 22
681 prefetch [$inp + 16+63], 20
683 $::code.=<<___ if ($alg eq "aes");
684 aes_eround01 %f16, %f14, %f2, %f4
685 aes_eround23 %f18, %f14, %f2, %f2
687 $::code.=<<___ if ($alg eq "cmll");
688 camellia_f %f16, %f2, %f14, %f2
689 camellia_f %f18, %f14, %f2, %f0
692 call _${alg}${bits}_encrypt_1x+8
697 fxor %f10, %f0, %f0 ! ^= inp
705 brnz,pt $len, .L${bits}_ctr32_loop2x
712 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
713 ! and ~3x deterioration
715 faligndata %f0, %f0, %f4 ! handle unaligned output
716 faligndata %f0, %f2, %f6
717 faligndata %f2, %f2, %f8
718 stda %f4, [$out + $omask]0xc0 ! partial store
721 orn %g0, $omask, $omask
722 stda %f8, [$out + $omask]0xc0 ! partial store
724 brnz,pt $len, .L${bits}_ctr32_loop2x+4
725 orn %g0, $omask, $omask
730 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
732 .L${bits}_ctr32_loop2x:
740 sllx %o0, $ileft, %o0
741 srlx %o1, $iright, %g1
743 sllx %o1, $ileft, %o1
744 srlx %o2, $iright, %g1
746 sllx %o2, $ileft, %o2
747 srlx %o3, $iright, %g1
749 sllx %o3, $ileft, %o3
750 srlx %o4, $iright, %o4
753 xor %g5, %l7, %g1 ! ^= rk[0]
756 srl %l7, 0, %l7 ! clruw
760 srl %l7, 0, %l7 ! clruw
761 prefetch [$out + 63], 22
762 prefetch [$inp + 32+63], 20
764 $::code.=<<___ if ($alg eq "aes");
765 aes_eround01 %f16, %f14, %f2, %f8
766 aes_eround23 %f18, %f14, %f2, %f2
767 aes_eround01 %f16, %f14, %f6, %f10
768 aes_eround23 %f18, %f14, %f6, %f6
770 $::code.=<<___ if ($alg eq "cmll");
771 camellia_f %f16, %f2, %f14, %f2
772 camellia_f %f16, %f6, %f14, %f6
773 camellia_f %f18, %f14, %f2, %f0
774 camellia_f %f18, %f14, %f6, %f4
777 call _${alg}${bits}_encrypt_2x+16
783 fxor %f8, %f0, %f0 ! ^= inp
796 brnz,pt $len, .L${bits}_ctr32_loop2x
803 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
804 ! and ~3x deterioration
806 faligndata %f0, %f0, %f8 ! handle unaligned output
807 faligndata %f0, %f2, %f0
808 faligndata %f2, %f4, %f2
809 faligndata %f4, %f6, %f4
810 faligndata %f6, %f6, %f6
812 stda %f8, [$out + $omask]0xc0 ! partial store
817 orn %g0, $omask, $omask
818 stda %f6, [$out + $omask]0xc0 ! partial store
820 brnz,pt $len, .L${bits}_ctr32_loop2x+4
821 orn %g0, $omask, $omask
826 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
829 add $out, $len, $blk_init
830 and $blk_init, 63, $blk_init ! tail
831 sub $len, $blk_init, $len
832 add $blk_init, 15, $blk_init ! round up to 16n
834 srl $blk_init, 4, $blk_init
836 add $blk_init, 1, $blk_init
838 .L${bits}_ctr32_blk_loop2x:
846 sllx %o0, $ileft, %o0
847 srlx %o1, $iright, %g1
849 sllx %o1, $ileft, %o1
850 srlx %o2, $iright, %g1
852 sllx %o2, $ileft, %o2
853 srlx %o3, $iright, %g1
855 sllx %o3, $ileft, %o3
856 srlx %o4, $iright, %o4
859 xor %g5, %l7, %g1 ! ^= rk[0]
862 srl %l7, 0, %l7 ! clruw
866 srl %l7, 0, %l7 ! clruw
867 prefetch [$inp + 32+63], 20
869 $::code.=<<___ if ($alg eq "aes");
870 aes_eround01 %f16, %f14, %f2, %f8
871 aes_eround23 %f18, %f14, %f2, %f2
872 aes_eround01 %f16, %f14, %f6, %f10
873 aes_eround23 %f18, %f14, %f6, %f6
875 $::code.=<<___ if ($alg eq "cmll");
876 camellia_f %f16, %f2, %f14, %f2
877 camellia_f %f16, %f6, %f14, %f6
878 camellia_f %f18, %f14, %f2, %f0
879 camellia_f %f18, %f14, %f6, %f4
882 call _${alg}${bits}_encrypt_2x+16
889 fxor %f8, %f0, %f0 ! ^= inp
895 stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
897 stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
899 stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
901 stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
902 bgu,pt $::size_t_cc, .L${bits}_ctr32_blk_loop2x
905 add $blk_init, $len, $len
906 andcc $len, 1, %g0 ! is number of blocks even?
907 membar #StoreLoad|#StoreStore
908 bnz,pt %icc, .L${bits}_ctr32_loop
910 brnz,pn $len, .L${bits}_ctr32_loop2x
915 .type ${alg}${bits}_t4_ctr32_encrypt,#function
916 .size ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
# Generate an XTS entry point, ${alg}${bits}_t4_xts_${dir}crypt, for
# $dir in ("en","de"), appended to the code buffer.  Visible structure:
#  - arguments are remapped: this mode takes two keys ($key1,$key2)
#    plus the tweak IV in %i0..%i5;
#  - the tweak is produced by encrypting the IV (call ${alg}_t4_encrypt
#    through a stack temporary at %fp+$::bias-16) and held in %g3:%g2
#    (`! %g3:%g2 is tweak'), byte-order fixed via a bmask/bshuffle
#    0x76543210 byte-swap mask; it is advanced per block with the
#    srax-based multiply-by-x sequence (`! next tweak value');
#  - one- and two-block loops call _${alg}${bits}_${dir}crypt_1x/_2x,
#    XORing the tweak before and after (`! ^= tweak[0]');
#  - .L*_xts_${dir}steal/.L*_xts_${dir}stealing implement ciphertext
#    stealing for a partial final block ($rem bytes; $rem is declared
#    outside this excerpt);
#  - same unaligned-output (ASI 0xc0) and block-initializing-store
#    (ASI 0xe2, T4-specific) machinery as the other modes.
# NOTE(review): this sub appends via `$code' while earlier generators
# use `$::code' -- verify both refer to the same buffer in the full
# file.  Heredoc terminators are outside this excerpt; the assembly
# lines below are string data, not Perl.
920 sub alg_xts_implement {
921 my ($alg,$bits,$dir) = @_;
922 my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
926 .globl ${alg}${bits}_t4_xts_${dir}crypt
928 ${alg}${bits}_t4_xts_${dir}crypt:
929 save %sp, -$::frame-16, %sp
932 add %fp, $::bias-16, %o1
933 call ${alg}_t4_encrypt
936 add %fp, $::bias-16, %l7
938 add %fp, $::bias-8, %l7
939 ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak
941 sethi %hi(0x76543210), %l7
942 or %l7, %lo(0x76543210), %l7
943 bmask %l7, %g0, %g0 ! byte swap mask
946 prefetch [$inp + 63], 20
947 call _${alg}${bits}_load_${dir}ckey
951 $code.=<<___ if ($dir eq "de");
958 sub $inp, $out, $blk_init ! $inp!=$out
961 sll $ileft, 3, $ileft
964 sub $iright, $ileft, $iright
967 movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
968 movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
969 brnz,pn $blk_init, .L${bits}_xts_${dir}blk ! $inp==$out)
970 srl $omask, $ooff, $omask
972 andcc $len, 16, %g0 ! is number of blocks even?
974 $code.=<<___ if ($dir eq "de");
975 brz,pn $len, .L${bits}_xts_${dir}steal
978 alignaddrl $out, %g0, $out
979 bz %icc, .L${bits}_xts_${dir}loop2x
981 .L${bits}_xts_${dir}loop:
987 sllx %o0, $ileft, %o0
988 srlx %o1, $iright, %g1
989 sllx %o1, $ileft, %o1
991 srlx %o2, $iright, %o2
996 bshuffle %f12, %f12, %f12
997 bshuffle %f14, %f14, %f14
999 xor %g4, %o0, %o0 ! ^= rk[0]
1004 fxor %f12, %f0, %f0 ! ^= tweak[0]
1007 prefetch [$out + 63], 22
1008 prefetch [$inp + 16+63], 20
1009 call _${alg}${bits}_${dir}crypt_1x
1012 fxor %f12, %f0, %f0 ! ^= tweak[0]
1015 srax %g3, 63, %l7 ! next tweak value
1026 brnz,pt $len, .L${bits}_xts_${dir}loop2x
1029 brnz,pn $rem, .L${bits}_xts_${dir}steal
1036 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
1037 ! and ~3x deterioration
1039 faligndata %f0, %f0, %f4 ! handle unaligned output
1040 faligndata %f0, %f2, %f6
1041 faligndata %f2, %f2, %f8
1042 stda %f4, [$out + $omask]0xc0 ! partial store
1045 orn %g0, $omask, $omask
1046 stda %f8, [$out + $omask]0xc0 ! partial store
1048 brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
1049 orn %g0, $omask, $omask
1051 brnz,pn $rem, .L${bits}_xts_${dir}steal
1057 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1059 .L${bits}_xts_${dir}loop2x:
1062 ldx [$inp + 16], %o2
1064 ldx [$inp + 24], %o3
1066 ldx [$inp + 32], %o4
1067 sllx %o0, $ileft, %o0
1068 srlx %o1, $iright, %g1
1070 sllx %o1, $ileft, %o1
1071 srlx %o2, $iright, %g1
1073 sllx %o2, $ileft, %o2
1074 srlx %o3, $iright, %g1
1076 sllx %o3, $ileft, %o3
1077 srlx %o4, $iright, %o4
1082 bshuffle %f12, %f12, %f12
1083 bshuffle %f14, %f14, %f14
1085 srax %g3, 63, %l7 ! next tweak value
1093 bshuffle %f8, %f8, %f8
1094 bshuffle %f10, %f10, %f10
1096 xor %g4, %o0, %o0 ! ^= rk[0]
1098 xor %g4, %o2, %o2 ! ^= rk[0]
1105 fxor %f12, %f0, %f0 ! ^= tweak[0]
1107 fxor %f8, %f4, %f4 ! ^= tweak[0]
1110 prefetch [$out + 63], 22
1111 prefetch [$inp + 32+63], 20
1112 call _${alg}${bits}_${dir}crypt_2x
1118 srax %g3, 63, %l7 ! next tweak value
1124 bshuffle %f8, %f8, %f8
1125 bshuffle %f10, %f10, %f10
1127 fxor %f12, %f0, %f0 ! ^= tweak[0]
1137 std %f4, [$out + 16]
1138 std %f6, [$out + 24]
1139 brnz,pt $len, .L${bits}_xts_${dir}loop2x
1144 brnz,pn $rem, .L${bits}_xts_${dir}steal
1151 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
1152 ! and ~3x deterioration
1154 faligndata %f0, %f0, %f8 ! handle unaligned output
1155 faligndata %f0, %f2, %f10
1156 faligndata %f2, %f4, %f12
1157 faligndata %f4, %f6, %f14
1158 faligndata %f6, %f6, %f0
1160 stda %f8, [$out + $omask]0xc0 ! partial store
1161 std %f10, [$out + 8]
1162 std %f12, [$out + 16]
1163 std %f14, [$out + 24]
1165 orn %g0, $omask, $omask
1166 stda %f0, [$out + $omask]0xc0 ! partial store
1168 brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
1169 orn %g0, $omask, $omask
1173 brnz,pn $rem, .L${bits}_xts_${dir}steal
1179 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1181 .L${bits}_xts_${dir}blk:
1182 add $out, $len, $blk_init
1183 and $blk_init, 63, $blk_init ! tail
1184 sub $len, $blk_init, $len
1185 add $blk_init, 15, $blk_init ! round up to 16n
1187 srl $blk_init, 4, $blk_init
1189 add $blk_init, 1, $blk_init
1191 .L${bits}_xts_${dir}blk2x:
1194 ldx [$inp + 16], %o2
1196 ldx [$inp + 24], %o3
1198 ldx [$inp + 32], %o4
1199 sllx %o0, $ileft, %o0
1200 srlx %o1, $iright, %g1
1202 sllx %o1, $ileft, %o1
1203 srlx %o2, $iright, %g1
1205 sllx %o2, $ileft, %o2
1206 srlx %o3, $iright, %g1
1208 sllx %o3, $ileft, %o3
1209 srlx %o4, $iright, %o4
1214 bshuffle %f12, %f12, %f12
1215 bshuffle %f14, %f14, %f14
1217 srax %g3, 63, %l7 ! next tweak value
1225 bshuffle %f8, %f8, %f8
1226 bshuffle %f10, %f10, %f10
1228 xor %g4, %o0, %o0 ! ^= rk[0]
1230 xor %g4, %o2, %o2 ! ^= rk[0]
1237 fxor %f12, %f0, %f0 ! ^= tweak[0]
1239 fxor %f8, %f4, %f4 ! ^= tweak[0]
1242 prefetch [$inp + 32+63], 20
1243 call _${alg}${bits}_${dir}crypt_2x
1249 srax %g3, 63, %l7 ! next tweak value
1255 bshuffle %f8, %f8, %f8
1256 bshuffle %f10, %f10, %f10
1258 fxor %f12, %f0, %f0 ! ^= tweak[0]
1264 stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
1266 stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
1268 stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
1270 stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
1271 bgu,pt $::size_t_cc, .L${bits}_xts_${dir}blk2x
1274 add $blk_init, $len, $len
1275 andcc $len, 1, %g0 ! is number of blocks even?
1276 membar #StoreLoad|#StoreStore
1277 bnz,pt %icc, .L${bits}_xts_${dir}loop
1279 brnz,pn $len, .L${bits}_xts_${dir}loop2x
1284 brnz,pn $rem, .L${bits}_xts_${dir}steal
1289 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1291 $code.=<<___ if ($dir eq "en");
1293 .L${bits}_xts_${dir}steal:
1294 std %f0, [%fp + $::bias-16] ! copy of output
1295 std %f2, [%fp + $::bias-8]
1297 srl $ileft, 3, $ileft
1298 add %fp, $::bias-16, %l7
1299 add $inp, $ileft, $inp ! original $inp+$len&-15
1300 add $out, $ooff, $out ! original $out+$len&-15
1304 .L${bits}_xts_${dir}stealing:
1305 ldub [$inp + $ileft], %o0
1306 ldub [%l7 + $ileft], %o1
1308 stb %o0, [%l7 + $ileft]
1309 stb %o1, [$out + $ileft]
1310 brnz $rem, .L${bits}_xts_${dir}stealing
1316 sub $out, $ooff, $out
1317 ba .L${bits}_xts_${dir}loop ! one more time
1318 mov 1, $len ! $rem is 0
1320 $code.=<<___ if ($dir eq "de");
1322 .L${bits}_xts_${dir}steal:
1327 ldx [$inp + 16], %o2
1328 sllx %o0, $ileft, %o0
1329 srlx %o1, $iright, %g1
1330 sllx %o1, $ileft, %o1
1332 srlx %o2, $iright, %o2
1335 srax %g3, 63, %l7 ! next tweak value
1343 bshuffle %f12, %f12, %f12
1344 bshuffle %f14, %f14, %f14
1346 xor %g4, %o0, %o0 ! ^= rk[0]
1351 fxor %f12, %f0, %f0 ! ^= tweak[0]
1354 call _${alg}${bits}_${dir}crypt_1x
1357 fxor %f12, %f0, %f0 ! ^= tweak[0]
1360 std %f0, [%fp + $::bias-16]
1361 std %f2, [%fp + $::bias-8]
1363 srl $ileft, 3, $ileft
1364 add %fp, $::bias-16, %l7
1365 add $inp, $ileft, $inp ! original $inp+$len&-15
1366 add $out, $ooff, $out ! original $out+$len&-15
1371 .L${bits}_xts_${dir}stealing:
1372 ldub [$inp + $ileft], %o0
1373 ldub [%l7 + $ileft], %o1
1375 stb %o0, [%l7 + $ileft]
1376 stb %o1, [$out + $ileft]
1377 brnz $rem, .L${bits}_xts_${dir}stealing
1383 sub $out, $ooff, $out
1384 ba .L${bits}_xts_${dir}loop ! one more time
1385 mov 1, $len ! $rem is 0
1390 .type ${alg}${bits}_t4_xts_${dir}crypt,#function
1391 .size ${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
1395 # Purpose of these subroutines is to explicitly encode VIS instructions,
1396 # so that one can compile the module without having to specify VIS
1397 # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
1398 # Idea is to reserve for option to produce "universal" binary and let
1399 # programmer detect if current CPU is VIS capable at run-time.
# Hand-encoder for 3-operand VIS1 instructions (faligndata, bshuffle):
# emits the instruction as a raw `.word' so the module assembles even
# when the assembler lacks VIS support; returns the plain textual form
# unchanged when it cannot encode.
# NOTE(review): the opening `sub' line (presumably `sub unvis') is
# outside this excerpt -- verify against the full file.
1401 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1403 my %visopf = ( "faligndata" => 0x048,
1404 "bshuffle" => 0x04c,
# $ref is the fallback: the instruction as plain assembler text.
1409 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1411 if ($opf=$visopf{$mnemonic}) {
# Every operand must be an %f register or we give up and emit $ref.
1412 foreach ($rs1,$rs2,$rd) {
1413 return $ref if (!/%f([0-9]{1,2})/);
# Bail out on odd register numbers (cannot be re-encoded) -- the
# guarding range check sits on lines outside this excerpt.
1416 return $ref if ($1&1);
1417 # re-encode for upper double register addressing
# 0x81b00000 is the VIS opcode template; opf selects the operation.
1422 return sprintf ".word\t0x%08x !%s",
1423 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
# Hand-encoder for 3-operand VIS3 integer instructions (addxc,
# alignaddr, alignaddrl, ...) operating on %g/%o/%l/%i registers:
# emits a raw `.word' or returns the textual form if it cannot encode.
# %bias maps the register-group letter to its base encoding number.
# NOTE(review): the opening `sub' line (presumably `sub unvis3') is
# outside this excerpt -- verify against the full file.
1431 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1432 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
1434 my %visopf = ( "addxc" => 0x011,
1437 "alignaddr" => 0x018,
1439 "alignaddrl" => 0x01a );
# Fallback: the instruction as plain assembler text.
1441 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1443 if ($opf=$visopf{$mnemonic}) {
# Each operand must be an integer register %g0-%i7.
1444 foreach ($rs1,$rs2,$rd) {
1445 return $ref if (!/%([goli])([0-9])/);
1449 return sprintf ".word\t0x%08x !%s",
1450 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
# Hand-encoder for 4-operand T4 AES round instructions
# (aes_eround*/aes_dround*/aes_kexpand1): emits a raw `.word' so the
# module assembles without crypto support in the assembler; returns the
# plain textual form when it cannot encode.
1457 sub unaes_round { # 4-argument instructions
1458 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
1460 my %aesopf = ( "aes_eround01" => 0,
1461 "aes_eround23" => 1,
1462 "aes_dround01" => 2,
1463 "aes_dround23" => 3,
1464 "aes_eround01_l"=> 4,
1465 "aes_eround23_l"=> 5,
1466 "aes_dround01_l"=> 6,
1467 "aes_dround23_l"=> 7,
1468 "aes_kexpand1" => 8 );
# Fallback: the instruction as plain assembler text.
1470 $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
1472 if (defined($opf=$aesopf{$mnemonic})) {
# rs3: even %f double register folded into its 5-bit field encoding
# (($1|$1>>5)&31); left untouched when it is not an %f operand.
1473 $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
1474 foreach ($rs1,$rs2,$rd) {
1475 return $ref if (!/%f([0-9]{1,2})/);
# Bail out on odd register numbers (cannot be re-encoded).
1478 return $ref if ($1&1);
1479 # re-encode for upper double register addressing
1484 return sprintf ".word\t0x%08x !%s",
1485 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
# Hand-encoder for 3-operand T4 AES key-expansion instructions
# (aes_kexpand0/aes_kexpand2): emits a raw `.word' or returns the plain
# textual form when it cannot encode.
1492 sub unaes_kexpand { # 3-argument instructions
1493 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1495 my %aesopf = ( "aes_kexpand0" => 0x130,
1496 "aes_kexpand2" => 0x131 );
# Fallback: the instruction as plain assembler text.
1498 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1500 if (defined($opf=$aesopf{$mnemonic})) {
# Every operand must be an %f register.
1501 foreach ($rs1,$rs2,$rd) {
1502 return $ref if (!/%f([0-9]{1,2})/);
# Bail out on odd register numbers (cannot be re-encoded).
1505 return $ref if ($1&1);
1506 # re-encode for upper double register addressing
1511 return sprintf ".word\t0x%08x !%s",
1512 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
# Hand-encoder for the 4-operand T4 camellia_f instruction: emits a raw
# `.word' (fixed sub-opcode 0xc) or returns the plain textual form when
# it cannot encode.
1519 sub uncamellia_f { # 4-argument instructions
1520 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
# Fallback: the instruction as plain assembler text.
1523 $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
# rs3: even %f double register folded into its 5-bit field encoding.
1526 $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
1527 foreach ($rs1,$rs2,$rd) {
1528 return $ref if (!/%f([0-9]{1,2})/);
# Bail out on odd register numbers (cannot be re-encoded).
1531 return $ref if ($1&1);
1532 # re-encode for upper double register addressing
1537 return sprintf ".word\t0x%08x !%s",
1538 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
# Hand-encoder for 3-operand T4 Camellia instructions
# (camellia_fl/camellia_fli): emits a raw `.word' or returns the plain
# textual form when it cannot encode.
1545 sub uncamellia3 { # 3-argument instructions
1546 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1548 my %cmllopf = ( "camellia_fl" => 0x13c,
1549 "camellia_fli" => 0x13d );
# Fallback: the instruction as plain assembler text.
1551 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1553 if (defined($opf=$cmllopf{$mnemonic})) {
# Every operand must be an %f register.
1554 foreach ($rs1,$rs2,$rd) {
1555 return $ref if (!/%f([0-9]{1,2})/);
# Bail out on odd register numbers (cannot be re-encoded).
1558 return $ref if ($1&1);
1559 # re-encode for upper double register addressing
1564 return sprintf ".word\t0x%08x !%s",
1565 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
# Hand-encoder for 2-operand VIS3 move instructions between integer and
# floating-point registers (movdtox, movstouw, movstosw, movwtos, ...):
# emits a raw `.word' or returns the plain textual form when it cannot
# encode.  %bias maps the register-group letter to its base encoding
# number; "f" registers need no bias.
1572 sub unmovxtox { # 2-argument instructions
1573 my ($mnemonic,$rs,$rd)=@_;
1574 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
1576 my %movxopf = ( "movdtox" => 0x110,
1577 "movstouw" => 0x111,
1578 "movstosw" => 0x113,
1580 "movwtos" => 0x119 );
# Fallback: the instruction as plain assembler text.
1582 $ref = "$mnemonic\t$rs,$rd";
1584 if (defined($opf=$movxopf{$mnemonic})) {
# Operand must be an %f or integer register; $2 is its number.
1586 return $ref if (!/%([fgoli])([0-9]{1,2})/);
# Bail out on odd register numbers (cannot be re-encoded).
1589 return $ref if ($2&1);
1590 # re-encode for upper double register addressing
1595 return sprintf ".word\t0x%08x !%s",
1596 2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
# Hand-encoder for T4 DES instructions: des_round takes four operands,
# des_kexpand three, des_ip/des_iip two; each branch emits the
# corresponding raw `.word' encoding or returns the plain textual form
# when it cannot encode.
# NOTE(review): the opening `sub' line (presumably `sub undes') and the
# line populating @args are outside this excerpt -- verify against the
# full file.
1604 my ($mnemonic)=shift;
1607 my %desopf = ( "des_round" => 0b1001,
1608 "des_ip" => 0b100110100,
1609 "des_iip" => 0b100110101,
1610 "des_kexpand" => 0b100110110 );
# Fallback: the instruction as plain assembler text.
1612 $ref = "$mnemonic\t".join(",",@_);
1614 if (defined($opf=$desopf{$mnemonic})) { # 4-arg
1615 if ($mnemonic eq "des_round") {
1616 foreach (@args[0..3]) {
1617 return $ref if (!/%f([0-9]{1,2})/);
# Bail out on odd register numbers (cannot be re-encoded).
1620 return $ref if ($1&1);
1621 # re-encode for upper double register addressing
1625 return sprintf ".word\t0x%08x !%s",
1626 2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
1628 } elsif ($mnemonic eq "des_kexpand") { # 3-arg
# Here the last operand may be a bare number, hence the optional (%f)?
# -- the register number lands in $2.
1629 foreach (@args[0..2]) {
1630 return $ref if (!/(%f)?([0-9]{1,2})/);
1633 return $ref if ($2&1);
1634 # re-encode for upper double register addressing
1638 return sprintf ".word\t0x%08x !%s",
1639 2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
# 2-arg branch (des_ip/des_iip).
1642 foreach (@args[0..1]) {
1643 return $ref if (!/%f([0-9]{1,2})/);
# NOTE(review): $2 is tested here but the regex visible above captures
# only $1 -- the intervening lines are outside this excerpt; verify the
# capture group against the full file.
1646 return $ref if ($2&1);
1647 # re-encode for upper double register addressing
1651 return sprintf ".word\t0x%08x !%s",
1652 2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
# Post-process the accumulated code buffer line by line: evaluate
# `...` interpolations, then rewrite VIS/AES/Camellia/DES mnemonics
# into `.word' encodings via the un* helpers above, so the output
# assembles with an assembler lacking those extensions.
# (Definition continues beyond this excerpt.)
1660 sub emit_assembler {
1661 foreach (split("\n",$::code)) {
# Evaluate backtick-delimited Perl expressions embedded in the text.
1662 s/\`([^\`]*)\`/eval $1/ge;
# Two-operand FP conversions get an explicit %f0 first source operand.
1664 s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;
1666 s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
1667 &unaes_round($1,$2,$3,$4,$5)
1669 s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1670 &unaes_kexpand($1,$2,$3,$4)
1672 s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
1673 &uncamellia_f($1,$2,$3,$4,$5)
1675 s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1676 &uncamellia3($1,$2,$3,$4)
1678 s/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
1679 &undes($1,$2,$3,$4,$5)
1681 s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
1682 &unmovxtox($1,$2,$3)
1684 s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
1685 &unmovxtox($1,$2,$3)
1687 s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1690 s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
1691 &unvis3($1,$2,$3,$4)