2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements Poly1305 hash for SPARCv9, vanilla, as well
18 # as VIS3 and FMA extensions.
22 # Numbers are cycles per processed byte with poly1305_blocks alone.
26 # UltraSPARC III 12.3(**)
28 # SPARC T4 1.70(***) 6.55
31 # (*) Comparison to compiler-generated code is really problematic,
32 # because the latter's performance varies too much depending on too
33 # many variables. For example, one can measure from 5x to 15x
34 # improvement on T4 for gcc-4.6. Admittedly, in the T4 case it's a
35 # bit unfair a comparison, because the compiler doesn't use VIS3, but
36 # given the same initial conditions the coefficient varies from 3x to 9x.
37 # (**) Pre-III performance should be even worse; floating-point
38 # performance for UltraSPARC I-IV on the other hand is reported
39 # to be 4.25 for hand-coded assembly, but they are just too old to care about.
41 # (***) Multi-process benchmark saturates at ~12.5x single-process
42 # result on 8-core processor, or ~21GBps per 2.85GHz socket.
44 # $output is the last argument if it looks like a file (it has an extension)
45 my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
# Redirect STDOUT to the requested output file so that everything the script
# prints ends up there.  The three-argument form of open keeps the mode
# separate from the file name (a name starting with '>' or ending in '|'
# cannot change how the file is opened), and the result is checked so that a
# failed open aborts instead of silently sending the assembly to the
# original STDOUT.  When no output file is given, STDOUT is left untouched.
47 open(STDOUT, '>', $output) or die "can't open $output: $!" if $output;
# Symbolic register names for the base SPARCv9 code path.  Each scalar holds
# the textual name of a register and is interpolated into the assembly text
# generated further down:
#   %i0..%i5      -> $ctx, $inp, $len, $padbit, $shl, $shr
#   %l0..%l7      -> $r0..$r3, $s1..$s3, $h4
#   %o0..%o5, %o7 -> $h0..$h3, $t0..$t2  (%o6 is skipped; presumably because
#                    it is the stack pointer %sp -- confirm against the ABI)
#   %g1..%g4      -> $d0..$d3
49 my ($ctx,$inp,$len,$padbit,$shl,$shr) = map("%i$_",(0..5));
50 my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4) = map("%l$_",(0..7));
51 my ($h0,$h1,$h2,$h3, $t0,$t1,$t2) = map("%o$_",(0..5,7));
52 my ($d0,$d1,$d2,$d3) = map("%g$_",(1..4));
55 #include "sparc_arch.h"
58 .register %g2,#scratch
59 .register %g3,#scratch
66 #define LOCALS (STACK_BIAS+STACK_FRAME)
68 .section ".text",#alloc,#execinstr
77 save %sp,-STACK_FRAME-16,%sp
80 SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
83 and %g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1
85 be .Lpoly1305_init_fma
89 stx %g0,[$ctx+8] ! zero hash value
93 and $inp,7,$shr ! alignment factor
98 sethi %hi(0x0ffffffc),$t0
100 or $t0,%lo(0x0ffffffc),$t0
103 or $t0,$t1,$t1 ! 0x0ffffffc0ffffffc
104 or $t1,3,$t0 ! 0x0ffffffc0fffffff
106 ldxa [$inp+%g0]0x88,$h0 ! load little-endian key
107 brz,pt $shr,.Lkey_aligned
108 ldxa [$inp+$h1]0x88,$h1
110 ldxa [$inp+$h2]0x88,$h2
121 stx $h0,[$ctx+32+0] ! store key
124 andcc %g1,SPARCV9_VIS3,%g0
129 add %o7,poly1305_blocks_vis3-1b,%o7
131 add %o7,poly1305_emit-poly1305_blocks_vis3,%o5
133 STPTR %o5,[%i2+SIZE_T]
136 restore %g0,1,%o0 ! return 1
140 restore %g0,%g0,%o0 ! return 0
141 .type poly1305_init,#function
142 .size poly1305_init,.-poly1305_init
144 .globl poly1305_blocks
147 save %sp,-STACK_FRAME,%sp
150 brz,pn $len,.Lno_data
153 ld [$ctx+32+0],$r1 ! load key
158 ld [$ctx+0],$h1 ! load hash value
164 and $inp,7,$shr ! alignment factor
179 ldxa [$inp+%g0]0x88,$d0 ! load little-endian input
180 brz,pt $shr,.Linp_aligned
181 ldxa [$inp+$d1]0x88,$d1
183 ldxa [$inp+$d2]0x88,$d2
193 addcc $d0,$h0,$h0 ! accumulate input
253 srl $h4,2,$t0 ! final reduction step
265 st $h1,[$ctx+0] ! store hash value
274 .type poly1305_blocks,#function
275 .size poly1305_blocks,.-poly1305_blocks
277 ########################################################################
278 # VIS3 has umulxhi and addxc...
# Symbolic register names for the VIS3 code path.  They map onto the same
# physical registers as the base path's %o and %g names declared earlier:
#   %o0..%o5, %o7 -> $H0, $H1, $H2, $R0, $R1, $S1, $T1  (%o6 is skipped)
#   %g1..%g4      -> $D0, $D1, $D2, $T0
# The upper-case names keep them distinct from the lower-case names of the
# base path, some of which ($ctx, $inp, $len, $padbit, $shr, $r1, $r2) are
# still referenced by the VIS3 assembly text.
280 my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7));
281 my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4));
285 poly1305_blocks_vis3:
286 save %sp,-STACK_FRAME,%sp
289 brz,pn $len,.Lno_data
292 ldx [$ctx+32+0],$R0 ! load key
295 ldx [$ctx+0],$H0 ! load hash value
299 and $inp,7,$shr ! alignment factor
311 ldxa [$inp+%g0]0x88,$D0 ! load little-endian input
312 brz,pt $shr,.Linp_aligned_vis3
313 ldxa [$inp+$r1]0x88,$D1
315 ldxa [$inp+$r2]0x88,$D2
324 addcc $D0,$H0,$H0 ! accumulate input
329 mulx $R0,$H0,$D0 ! r0*h0
330 addxc $padbit,$H2,$H2
332 mulx $S1,$H1,$T0 ! s1*h1
335 mulx $R1,$H0,$T0 ! r1*h0
339 mulx $R0,$H1,$T0 ! r0*h1
343 mulx $S1,$H2,$T0 ! s1*h2
345 mulx $R0,$H2,$T1 ! r0*h2
349 srlx $D2,2,$T0 ! final reduction step
356 brnz,pt $len,.Loop_vis3
359 stx $H0,[$ctx+0] ! store hash value
365 .type poly1305_blocks_vis3,#function
366 .size poly1305_blocks_vis3,.-poly1305_blocks_vis3
# Aliases used by the poly1305_emit assembly text: $mac and $nonce are simply
# other names for the registers held in $inp and $len, i.e. the same register
# slots, named after what they carry in this routine (the result is stored
# through $mac and the nonce is loaded through $nonce).
369 my ($mac,$nonce) = ($inp,$len);
375 save %sp,-STACK_FRAME,%sp
377 ld [$ctx+0],$h1 ! load hash value
383 addcc $h0,5,$r0 ! compare to modulus
388 andcc $h4,4,%g0 ! did it carry/borrow?
391 ld [$nonce+0],$r0 ! load nonce
399 addcc $r0,$h0,$h0 ! accumulate nonce
405 stb $h0,[$mac+0] ! store little-endian result
438 .type poly1305_emit,#function
439 .size poly1305_emit,.-poly1305_emit
# Symbolic register names for the FMA code path.
#   %i0..%i3 -> $ctx, $inp, $len, $padbit (re-declared here)
#   %o0..%o4 -> $in0..$in4
#   %l0..%l3 -> $i1, $step, $shr, $shl (map() yields %l0..%l7, but only the
#               first four values are kept by the list assignment)
443 my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3));
444 my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4));
445 my ($i1,$step,$shr,$shl) = map("%l$_",(0..7));
# Floating-point names: "%f".2*$_ for 0..31 yields the 32 even-numbered
# names %f0, %f2, ..., %f62, assigned in order to the 32 scalars below
# (hash limbs, power-of-two constants, key limbs, pre-multiplied key limbs
# and carry temporaries, each as a lo/hi pair).
448 my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
449 $two0,$two32,$two64,$two96,$two130,$five_two130,
450 $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
451 $s2lo,$s2hi,$s3lo,$s3hi,
452 $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31));
# There are not enough registers for everything, so the names below share
# registers with the carry temporaries:
#   $r3lo/$r3hi and $s1lo/$s1hi live in the $c0*/$c1* registers (this is why
#   the assembly reloads them from the context after the carry step);
#   $x0..$x3 live in $c2lo,$c2hi,$c3lo,$c3hi;
#   $y0..$y3 live in $c1lo,$c1hi,$c3hi,$c3lo (note the hi/lo order of the
#   last two).
454 my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
455 my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
456 my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo);
461 save %sp,-STACK_FRAME-16,%sp
466 add %o7,.Lconsts_fma-1b,%o7
468 ldd [%o7+8*0],$two0 ! load constants
472 ldd [%o7+8*5],$five_two130
474 std $two0,[$ctx+8*0] ! initial hash value, biased 0
475 std $two32,[$ctx+8*1]
476 std $two64,[$ctx+8*2]
477 std $two96,[$ctx+8*3]
479 brz,pn $inp,.Lno_key_fma
482 stx %fsr,[%sp+LOCALS] ! save original %fsr
483 ldx [%o7+8*6],%fsr ! load new %fsr
485 std $two0,[$ctx+8*4] ! key "template"
486 std $two32,[$ctx+8*5]
487 std $two64,[$ctx+8*6]
488 std $two96,[$ctx+8*7]
491 andn $inp,7,$inp ! align pointer
497 ldxa [$inp+%g0]0x88,$in0 ! load little-endian key
498 ldxa [$inp+$i1]0x88,$in2
500 brz $shr,.Lkey_aligned_fma
501 sethi %hi(0xf0000000),$i1 ! 0xf0000000
503 ldxa [$inp+$i2]0x88,$in4
505 srlx $in0,$shr,$in0 ! align data
513 or $i1,3,$i2 ! 0xf0000003
515 andn $in0,$i1,$in0 ! &=0x0fffffff
516 andn $in1,$i2,$in1 ! &=0x0ffffffc
521 st $in0,[$ctx+`8*4+4`] ! fill "template"
522 st $in1,[$ctx+`8*5+4`]
523 st $in2,[$ctx+`8*6+4`]
524 st $in3,[$ctx+`8*7+4`]
526 ldd [$ctx+8*4],$h0lo ! load [biased] key
531 fsubd $h0lo,$two0, $h0lo ! r0
532 ldd [%o7+8*7],$two0 ! more constants
533 fsubd $h1lo,$two32,$h1lo ! r1
535 fsubd $h2lo,$two64,$h2lo ! r2
537 fsubd $h3lo,$two96,$h3lo ! r3
538 ldd [%o7+8*10],$two96
540 fmuld $five_two130,$h1lo,$s1lo ! s1
541 fmuld $five_two130,$h2lo,$s2lo ! s2
542 fmuld $five_two130,$h3lo,$s3lo ! s3
544 faddd $h0lo,$two0, $h0hi
545 faddd $h1lo,$two32,$h1hi
546 faddd $h2lo,$two64,$h2hi
547 faddd $h3lo,$two96,$h3hi
549 fsubd $h0hi,$two0, $h0hi
550 ldd [%o7+8*11],$two0 ! more constants
551 fsubd $h1hi,$two32,$h1hi
552 ldd [%o7+8*12],$two32
553 fsubd $h2hi,$two64,$h2hi
554 ldd [%o7+8*13],$two64
555 fsubd $h3hi,$two96,$h3hi
557 fsubd $h0lo,$h0hi,$h0lo
558 std $h0hi,[$ctx+8*5] ! r0hi
559 fsubd $h1lo,$h1hi,$h1lo
560 std $h1hi,[$ctx+8*7] ! r1hi
561 fsubd $h2lo,$h2hi,$h2lo
562 std $h2hi,[$ctx+8*9] ! r2hi
563 fsubd $h3lo,$h3hi,$h3lo
564 std $h3hi,[$ctx+8*11] ! r3hi
566 faddd $s1lo,$two0, $s1hi
567 faddd $s2lo,$two32,$s2hi
568 faddd $s3lo,$two64,$s3hi
570 fsubd $s1hi,$two0, $s1hi
571 fsubd $s2hi,$two32,$s2hi
572 fsubd $s3hi,$two64,$s3hi
574 fsubd $s1lo,$s1hi,$s1lo
575 fsubd $s2lo,$s2hi,$s2lo
576 fsubd $s3lo,$s3hi,$s3lo
578 ldx [%sp+LOCALS],%fsr ! restore %fsr
580 std $h0lo,[$ctx+8*4] ! r0lo
581 std $h1lo,[$ctx+8*6] ! r1lo
582 std $h2lo,[$ctx+8*8] ! r2lo
583 std $h3lo,[$ctx+8*10] ! r3lo
585 std $s1hi,[$ctx+8*13]
586 std $s2hi,[$ctx+8*15]
587 std $s3hi,[$ctx+8*17]
589 std $s1lo,[$ctx+8*12]
590 std $s2lo,[$ctx+8*14]
591 std $s3lo,[$ctx+8*16]
593 add %o7,poly1305_blocks_fma-.Lconsts_fma,%o0
594 add %o7,poly1305_emit_fma-.Lconsts_fma,%o1
596 STPTR %o1,[%i2+SIZE_T]
599 restore %g0,1,%o0 ! return 1
603 restore %g0,%g0,%o0 ! return 0
604 .type poly1305_init_fma,#function
605 .size poly1305_init_fma,.-poly1305_init_fma
609 save %sp,-STACK_FRAME-48,%sp
616 add %o7,.Lconsts_fma-1b,%o7
618 ldd [%o7+8*0],$two0 ! load constants
622 ldd [%o7+8*4],$two130
623 ldd [%o7+8*5],$five_two130
625 ldd [$ctx+8*0],$h0lo ! load [biased] hash value
630 std $two0,[%sp+LOCALS+8*0] ! input "template"
631 sethi %hi((1023+52+96)<<20),$in3
632 std $two32,[%sp+LOCALS+8*1]
634 std $two64,[%sp+LOCALS+8*2]
635 st $in3,[%sp+LOCALS+8*3]
638 andn $inp,7,$inp ! align pointer
644 ldxa [$inp+%g0]0x88,$in0 ! load little-endian input
645 brz $shr,.Linp_aligned_fma
646 ldxa [$inp+$i1]0x88,$in2
648 ldxa [$inp+$step]0x88,$in4
651 srlx $in0,$shr,$in0 ! align data
656 srlx $in4,$shr,$in4 ! pre-shift
663 add $step,$inp,$inp ! conditional advance
665 st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
666 st $in1,[%sp+LOCALS+8*1+4]
667 st $in2,[%sp+LOCALS+8*2+4]
668 st $in3,[%sp+LOCALS+8*3+4]
670 ldd [$ctx+8*4],$r0lo ! load key
676 ldd [$ctx+8*10],$r3lo
677 ldd [$ctx+8*11],$r3hi
678 ldd [$ctx+8*12],$s1lo
679 ldd [$ctx+8*13],$s1hi
680 ldd [$ctx+8*14],$s2lo
681 ldd [$ctx+8*15],$s2hi
682 ldd [$ctx+8*16],$s3lo
683 ldd [$ctx+8*17],$s3hi
685 stx %fsr,[%sp+LOCALS+8*4] ! save original %fsr
686 ldx [%o7+8*6],%fsr ! load new %fsr
691 ldd [%sp+LOCALS+8*0],$x0 ! load biased input
692 ldd [%sp+LOCALS+8*1],$x1
693 ldd [%sp+LOCALS+8*2],$x2
694 ldd [%sp+LOCALS+8*3],$x3
696 fsubd $h0lo,$two0, $h0lo ! de-bias hash value
697 fsubd $h1lo,$two32,$h1lo
698 ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
699 fsubd $h2lo,$two64,$h2lo
700 fsubd $h3lo,$two96,$h3lo
701 ldxa [$inp+$i1]0x88,$in2
703 fsubd $x0,$two0, $x0 ! de-bias input
708 brz $shr,.Linp_aligned_fma2
709 add $step,$inp,$inp ! conditional advance
711 sllx $in0,$shl,$in1 ! align data
715 srlx $in2,$shr,$in4 ! pre-shift
721 faddd $h0lo,$x0,$x0 ! accumulate input
722 stw $in0,[%sp+LOCALS+8*0+4]
724 stw $in1,[%sp+LOCALS+8*1+4]
726 stw $in2,[%sp+LOCALS+8*2+4]
728 stw $in3,[%sp+LOCALS+8*3+4]
735 ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
736 ldxa [$inp+$i1]0x88,$in2
739 faddd $y0,$h0lo,$h0lo ! accumulate input
740 faddd $y1,$h0hi,$h0hi
741 faddd $y2,$h2lo,$h2lo
742 faddd $y3,$h2hi,$h2hi
744 brz,pn $shr,.Linp_aligned_fma3
745 add $step,$inp,$inp ! conditional advance
747 sllx $in0,$shl,$in1 ! align data
751 srlx $in2,$shr,$in4 ! pre-shift
755 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
756 faddd $two64,$h1lo,$c1lo
758 faddd $two64,$h1hi,$c1hi
760 faddd $two130,$h3lo,$c3lo
761 st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
762 faddd $two130,$h3hi,$c3hi
763 st $in1,[%sp+LOCALS+8*1+4]
764 faddd $two32,$h0lo,$c0lo
765 st $in2,[%sp+LOCALS+8*2+4]
766 faddd $two32,$h0hi,$c0hi
767 st $in3,[%sp+LOCALS+8*3+4]
768 faddd $two96,$h2lo,$c2lo
769 faddd $two96,$h2hi,$c2hi
771 fsubd $c1lo,$two64,$c1lo
772 fsubd $c1hi,$two64,$c1hi
773 fsubd $c3lo,$two130,$c3lo
774 fsubd $c3hi,$two130,$c3hi
775 fsubd $c0lo,$two32,$c0lo
776 fsubd $c0hi,$two32,$c0hi
777 fsubd $c2lo,$two96,$c2lo
778 fsubd $c2hi,$two96,$c2hi
780 fsubd $h1lo,$c1lo,$h1lo
781 fsubd $h1hi,$c1hi,$h1hi
782 fsubd $h3lo,$c3lo,$h3lo
783 fsubd $h3hi,$c3hi,$h3hi
784 fsubd $h2lo,$c2lo,$h2lo
785 fsubd $h2hi,$c2hi,$h2hi
786 fsubd $h0lo,$c0lo,$h0lo
787 fsubd $h0hi,$c0hi,$h0hi
789 faddd $h1lo,$c0lo,$h1lo
790 faddd $h1hi,$c0hi,$h1hi
791 faddd $h3lo,$c2lo,$h3lo
792 faddd $h3hi,$c2hi,$h3hi
793 faddd $h2lo,$c1lo,$h2lo
794 faddd $h2hi,$c1hi,$h2hi
795 fmaddd $five_two130,$c3lo,$h0lo,$h0lo
796 fmaddd $five_two130,$c3hi,$h0hi,$h0hi
798 faddd $h1lo,$h1hi,$x1
799 ldd [$ctx+8*12],$s1lo ! reload constants
800 faddd $h3lo,$h3hi,$x3
801 ldd [$ctx+8*13],$s1hi
802 faddd $h2lo,$h2hi,$x2
803 ldd [$ctx+8*10],$r3lo
804 faddd $h0lo,$h0hi,$x0
805 ldd [$ctx+8*11],$r3hi
808 fmuld $x1,$s3lo,$h0lo
809 fmuld $x1,$s3hi,$h0hi
810 fmuld $x1,$r1lo,$h2lo
811 fmuld $x1,$r1hi,$h2hi
812 fmuld $x1,$r0lo,$h1lo
813 fmuld $x1,$r0hi,$h1hi
814 fmuld $x1,$r2lo,$h3lo
815 fmuld $x1,$r2hi,$h3hi
817 fmaddd $x3,$s1lo,$h0lo,$h0lo
818 fmaddd $x3,$s1hi,$h0hi,$h0hi
819 fmaddd $x3,$s3lo,$h2lo,$h2lo
820 fmaddd $x3,$s3hi,$h2hi,$h2hi
821 fmaddd $x3,$s2lo,$h1lo,$h1lo
822 fmaddd $x3,$s2hi,$h1hi,$h1hi
823 fmaddd $x3,$r0lo,$h3lo,$h3lo
824 fmaddd $x3,$r0hi,$h3hi,$h3hi
826 fmaddd $x2,$s2lo,$h0lo,$h0lo
827 fmaddd $x2,$s2hi,$h0hi,$h0hi
828 fmaddd $x2,$r0lo,$h2lo,$h2lo
829 fmaddd $x2,$r0hi,$h2hi,$h2hi
830 fmaddd $x2,$s3lo,$h1lo,$h1lo
831 ldd [%sp+LOCALS+8*0],$y0 ! load [biased] input
832 fmaddd $x2,$s3hi,$h1hi,$h1hi
833 ldd [%sp+LOCALS+8*1],$y1
834 fmaddd $x2,$r1lo,$h3lo,$h3lo
835 ldd [%sp+LOCALS+8*2],$y2
836 fmaddd $x2,$r1hi,$h3hi,$h3hi
837 ldd [%sp+LOCALS+8*3],$y3
839 fmaddd $x0,$r0lo,$h0lo,$h0lo
840 fsubd $y0,$two0, $y0 ! de-bias input
841 fmaddd $x0,$r0hi,$h0hi,$h0hi
843 fmaddd $x0,$r2lo,$h2lo,$h2lo
845 fmaddd $x0,$r2hi,$h2hi,$h2hi
847 fmaddd $x0,$r1lo,$h1lo,$h1lo
848 fmaddd $x0,$r1hi,$h1hi,$h1hi
849 fmaddd $x0,$r3lo,$h3lo,$h3lo
850 fmaddd $x0,$r3hi,$h3hi,$h3hi
852 bcc SIZE_T_CC,.Loop_fma
855 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
856 faddd $h0lo,$two32,$c0lo
857 faddd $h0hi,$two32,$c0hi
858 faddd $h2lo,$two96,$c2lo
859 faddd $h2hi,$two96,$c2hi
860 faddd $h1lo,$two64,$c1lo
861 faddd $h1hi,$two64,$c1hi
862 faddd $h3lo,$two130,$c3lo
863 faddd $h3hi,$two130,$c3hi
865 fsubd $c0lo,$two32,$c0lo
866 fsubd $c0hi,$two32,$c0hi
867 fsubd $c2lo,$two96,$c2lo
868 fsubd $c2hi,$two96,$c2hi
869 fsubd $c1lo,$two64,$c1lo
870 fsubd $c1hi,$two64,$c1hi
871 fsubd $c3lo,$two130,$c3lo
872 fsubd $c3hi,$two130,$c3hi
874 fsubd $h1lo,$c1lo,$h1lo
875 fsubd $h1hi,$c1hi,$h1hi
876 fsubd $h3lo,$c3lo,$h3lo
877 fsubd $h3hi,$c3hi,$h3hi
878 fsubd $h2lo,$c2lo,$h2lo
879 fsubd $h2hi,$c2hi,$h2hi
880 fsubd $h0lo,$c0lo,$h0lo
881 fsubd $h0hi,$c0hi,$h0hi
883 faddd $h1lo,$c0lo,$h1lo
884 faddd $h1hi,$c0hi,$h1hi
885 faddd $h3lo,$c2lo,$h3lo
886 faddd $h3hi,$c2hi,$h3hi
887 faddd $h2lo,$c1lo,$h2lo
888 faddd $h2hi,$c1hi,$h2hi
889 fmaddd $five_two130,$c3lo,$h0lo,$h0lo
890 fmaddd $five_two130,$c3hi,$h0hi,$h0hi
892 faddd $h1lo,$h1hi,$x1
893 faddd $h3lo,$h3hi,$x3
894 faddd $h2lo,$h2hi,$x2
895 faddd $h0lo,$h0hi,$x0
897 faddd $x1,$two32,$x1 ! bias
902 ldx [%sp+LOCALS+8*4],%fsr ! restore saved %fsr
904 std $x1,[$ctx+8*1] ! store [biased] hash value
912 .type poly1305_blocks_fma,#function
913 .size poly1305_blocks_fma,.-poly1305_blocks_fma
# Register names for the poly1305_emit_fma assembly text.
# $mac and $nonce are other names for the registers held in $inp and $len.
# The ten integer names are filled from %l0..%l5 followed by %o0..%o4:
#   $h0..$h4 -> %l0..%l4,  $d0 -> %l5,
#   $d1..$d3 -> %o0..%o2,  $mask -> %o3
# (the eleventh value, %o4, is dropped by the list assignment).
916 my ($mac,$nonce)=($inp,$len);
918 my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask
919 ) = (map("%l$_",(0..5)),map("%o$_",(0..4)));
924 save %sp,-STACK_FRAME,%sp
926 ld [$ctx+8*0+0],$d0 ! load hash
935 sethi %hi(0xfff00000),$mask
936 andn $d0,$mask,$d0 ! mask exponent
939 andn $d3,$mask,$d3 ! can be partially reduced...
942 srl $d3,2,$padbit ! ... so reduce
953 addcc $h0,5,$d0 ! compare to modulus
959 srl $mask,2,$mask ! did it carry/borrow?
961 sra $mask,31,$mask ! mask
968 ld [$nonce+0],$d0 ! load nonce
980 addcc $d0,$h0,$h0 ! accumulate nonce
985 stb $h0,[$mac+0] ! write little-endian result
1019 .type poly1305_emit_fma,#function
1020 .size poly1305_emit_fma,.-poly1305_emit_fma
1027 .word 0x43300000,0x00000000 ! 2^(52+0)
1028 .word 0x45300000,0x00000000 ! 2^(52+32)
1029 .word 0x47300000,0x00000000 ! 2^(52+64)
1030 .word 0x49300000,0x00000000 ! 2^(52+96)
1031 .word 0x4b500000,0x00000000 ! 2^(52+130)
1033 .word 0x37f40000,0x00000000 ! 5/2^130
1034 .word 0,1<<30 ! fsr: truncate, no exceptions
1036 .word 0x44300000,0x00000000 ! 2^(52+16+0)
1037 .word 0x46300000,0x00000000 ! 2^(52+16+32)
1038 .word 0x48300000,0x00000000 ! 2^(52+16+64)
1039 .word 0x4a300000,0x00000000 ! 2^(52+16+96)
1040 .word 0x3e300000,0x00000000 ! 2^(52+16+0-96)
1041 .word 0x40300000,0x00000000 ! 2^(52+16+32-96)
1042 .word 0x42300000,0x00000000 ! 2^(52+16+64-96)
1043 .asciz "Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>"
1048 # The purpose of these subroutines is to explicitly encode VIS instructions,
1049 # so that one can compile the module without having to specify VIS
1050 # extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
1051 # The idea is to preserve the option of producing a "universal" binary and to
1052 # let the programmer detect at run-time whether the current CPU is VIS capable.
# unvis3 (called as &unvis3($1,$2,$3,$4) by the post-processing loop): turns
# one VIS3 instruction into a raw ".word" directive.
#   $mnemonic  - instruction name, looked up in %visopf for its opf value
#   $rs1,$rs2  - source registers, $rd - destination register
# $ref is the plain textual form "mnemonic rs1,rs2,rd".  If the mnemonic has
# an entry in %visopf, every operand must look like %g/%o/%l/%i followed by a
# digit; otherwise $ref is returned unchanged.  On success the result is a
# ".word 0x%08x" line built from 0x81b00000 with rd at bit 25, rs1 at bit 14,
# opf at bit 5 and rs2 in the low bits, followed by a '!' comment.
# %bias holds a base number per register class (g=0, o=8, l=16, i=24);
# presumably it converts register names to numbers 0..31 -- the conversion
# statement itself is not visible in this part of the file.
1054 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1055 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
1057 my %visopf = ( "addxc" => 0x011,
1059 "umulxhi" => 0x016 );
1061 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1063 if ($opf=$visopf{$mnemonic}) {
1064 foreach ($rs1,$rs2,$rd) {
1065 return $ref if (!/%([goli])([0-9])/);
1069 return sprintf ".word\t0x%08x !%s",
1070 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
# unfma (called as &unfma($1,$2,$3,$4,$5) by the post-processing loop): turns
# one fused multiply-add instruction into a raw ".word" directive.
#   $mnemonic      - instruction name, looked up in %fmaopf for its opf value
#   $rs1,$rs2,$rs3 - source registers, $rd - destination register
# $ref is the plain textual form "mnemonic rs1,rs2,rs3,rd".  If the mnemonic
# has an entry in %fmaopf, every operand must look like %f followed by one or
# two digits; otherwise $ref is returned unchanged.  $ref is also returned
# unchanged when the captured register number is odd.  On success the result
# is a ".word 0x%08x" line built from 0x81b80000 with rd at bit 25, rs1 at
# bit 14, rs3 at bit 9, opf at bit 5 and rs2 in the low bits, followed by a
# '!' comment.
1078 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
1080 my %fmaopf = ( "fmadds" => 0x1,
1085 $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
1087 if ($opf=$fmaopf{$mnemonic}) {
1088 foreach ($rs1,$rs2,$rs3,$rd) {
1089 return $ref if (!/%f([0-9]{1,2})/);
1092 return $ref if ($1&1);
1093 # re-encode for upper double register addressing
1098 return sprintf ".word\t0x%08x !%s",
1099 0x81b80000|$rd<<25|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
# Post-processing pass over the generated assembly, one line of $code at a
# time:
#   1. every `...` span is replaced by the result of eval'ing its contents
#      (this is how offsets written as arithmetic, e.g. `8*4+4`, are folded);
#   2. umulxhi / addxc / addxcc / addxccc with three integer-register
#      operands are handed to unvis3 for manual encoding;
#   3. fmadds / fmaddd with four %f operands are handed to unfma.
# Finally STDOUT is closed and a failure to close is fatal, so write errors
# that only surface when the buffer is flushed are not lost.
1106 foreach (split("\n",$code)) {
1107 s/\`([^\`]*)\`/eval $1/ge;
1109 s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
1110 &unvis3($1,$2,$3,$4)
1112 s/\b(fmadd[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/
1113 &unfma($1,$2,$3,$4,$5)
1119 close STDOUT or die "error closing STDOUT";