2 # Copyright 2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # X25519 lower-level primitives for PPC64.
20 # Base 2^64 is faster than base 2^51 on pre-POWER8, most notably ~15%
21 # faster on PPC970/G5. POWER8, on the other hand, seems to trip over its
22 # own shoelaces when handling longer carry chains. As base 2^51 has just
23 # single-carry pairs, it's 25% faster than base 2^64. Since PPC970 is
24 # pretty old, the base 2^64 implementation is not engaged. Comparison to
25 # compiler-generated code is complicated by the fact that not all
26 # compilers support 128-bit integers. When the compiler doesn't, like xlc,
27 # this module delivers more than 2x improvement, and when it does,
28 # from 12% to 30% improvement was measured...
# Command-line handling and output plumbing: pick the output file and the
# assembler flavour out of @ARGV, locate the ppc-xlate.pl translator, and
# open a pipe to it.
30 # $output is the last argument if it looks like a file (it has an extension)
31 # $flavour is the first argument if it doesn't look like a file
32 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
33 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Capture the directory part of this script's own path (everything up to and
# including the last '/' or '\') so the translator can be found relative to it.
35 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Look for the translator next to this script first, then under
# ../../perlasm/ relative to it; give up if neither file exists.
36 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
37 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
38 die "can't locate ppc-xlate.pl";
# Run the translator under the currently executing perl ($^X) with a write
# pipe (OUT) attached to its standard input. $flavour and $output are passed
# through as its arguments; an undefined $output interpolates as an empty
# quoted argument.
40 open OUT,"| \"$^X\" $xlate $flavour \"$output\""
41 or die "can't call $xlate: $!";
# Symbolic names for r3, r4 and r5.
# NOTE(review): presumably the result and two operand pointers - confirm
# against the callers' prototypes.
45 my ($rp,$ap,$bp) = map("r$_",3..5);
47 ####################################################### base 2^64
# Register names used by the base 2^64 code. The list assignment pairs names
# and registers positionally: $bi,$a0..$a3,$t0,$t1 take r6..r12, then
# $t2,$t3 take r22,r23 and $acc0..$acc7 take r24..r31. r22..r31 are the
# registers the fe64 mul/sqr prologues store to the stack frame.
49 my ($bi,$a0,$a1,$a2,$a3,$t0,$t1, $t2,$t3,
50 $acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) =
51 map("r$_",(6..12,22..31));
58 .globl x25519_fe64_mul
59 .type x25519_fe64_mul,\@function
63 std r22,`$FRAME-8*10`($sp)
64 std r23,`$FRAME-8*9`($sp)
65 std r24,`$FRAME-8*8`($sp)
66 std r25,`$FRAME-8*7`($sp)
67 std r26,`$FRAME-8*6`($sp)
68 std r27,`$FRAME-8*5`($sp)
69 std r28,`$FRAME-8*4`($sp)
70 std r29,`$FRAME-8*3`($sp)
71 std r30,`$FRAME-8*2`($sp)
72 std r31,`$FRAME-8*1`($sp)
81 mulld $acc0,$a0,$bi # a[0]*b[0]
83 mulld $acc1,$a1,$bi # a[1]*b[0]
85 mulld $acc2,$a2,$bi # a[2]*b[0]
87 mulld $acc3,$a3,$bi # a[3]*b[0]
90 for(my @acc=($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7),
91 my $i=1; $i<4; shift(@acc), $i++) {
92 my $acc4 = $i==1? $zero : @acc[4];
96 addc @acc[1],@acc[1],$t0 # accumulate high parts
98 adde @acc[2],@acc[2],$t1
100 adde @acc[3],@acc[3],$t2
102 adde @acc[4],$acc4,$t3
104 addc @acc[1],@acc[1],$t0 # accumulate low parts
106 adde @acc[2],@acc[2],$t1
108 adde @acc[3],@acc[3],$t2
110 adde @acc[4],@acc[4],$t3
112 adde @acc[5],$zero,$zero
134 adde $acc4,$zero,$zero
141 mulld $acc4,$acc4,$bi
143 addc $acc0,$acc0,$acc4
148 subfe $acc4,$acc4,$acc4 # carry -> ~mask
152 add $acc0,$acc0,$acc4
156 ld r22,`$FRAME-8*10`($sp)
157 ld r23,`$FRAME-8*9`($sp)
158 ld r24,`$FRAME-8*8`($sp)
159 ld r25,`$FRAME-8*7`($sp)
160 ld r26,`$FRAME-8*6`($sp)
161 ld r27,`$FRAME-8*5`($sp)
162 ld r28,`$FRAME-8*4`($sp)
163 ld r29,`$FRAME-8*3`($sp)
164 ld r30,`$FRAME-8*2`($sp)
165 ld r31,`$FRAME-8*1`($sp)
169 .byte 0,12,4,0,0x80,10,3,0
171 .size x25519_fe64_mul,.-x25519_fe64_mul
173 .globl x25519_fe64_sqr
174 .type x25519_fe64_sqr,\@function
177 stdu $sp,-$FRAME($sp)
178 std r22,`$FRAME-8*10`($sp)
179 std r23,`$FRAME-8*9`($sp)
180 std r24,`$FRAME-8*8`($sp)
181 std r25,`$FRAME-8*7`($sp)
182 std r26,`$FRAME-8*6`($sp)
183 std r27,`$FRAME-8*5`($sp)
184 std r28,`$FRAME-8*4`($sp)
185 std r29,`$FRAME-8*3`($sp)
186 std r30,`$FRAME-8*2`($sp)
187 std r31,`$FRAME-8*1`($sp)
190 xor $zero,$zero,$zero
195 ################################
196 # | | | | | |a1*a0| |
197 # | | | | |a2*a0| | |
198 # | |a3*a2|a3*a0| | | |
199 # | | | |a2*a1| | | |
200 # | | |a3*a1| | | | |
201 # *| | | | | | | | 2|
202 # +|a3*a3|a2*a2|a1*a1|a0*a0|
203 # |--+--+--+--+--+--+--+--|
204 # |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
206 # "can't overflow" below mark carrying into high part of
207 # multiplication result, which can't overflow, because it
208 # can never be all ones.
210 mulld $acc1,$a1,$a0 # a[1]*a[0]
212 mulld $acc2,$a2,$a0 # a[2]*a[0]
214 mulld $acc3,$a3,$a0 # a[3]*a[0]
217 addc $acc2,$acc2,$t1 # accumulate high parts of multiplication
218 mulld $t0,$a2,$a1 # a[2]*a[1]
221 mulld $t2,$a3,$a1 # a[3]*a[1]
223 addze $acc4,$acc4 # can't overflow
225 mulld $acc5,$a3,$a2 # a[3]*a[2]
228 addc $t1,$t1,$t2 # accumulate high parts of multiplication
229 mulld $acc0,$a0,$a0 # a[0]*a[0]
230 addze $t2,$t3 # can't overflow
232 addc $acc3,$acc3,$t0 # accumulate low parts of multiplication
235 mulld $t1,$a1,$a1 # a[1]*a[1]
238 addze $acc6,$acc6 # can't overflow
240 addc $acc1,$acc1,$acc1 # acc[1-6]*=2
241 mulld $t2,$a2,$a2 # a[2]*a[2]
242 adde $acc2,$acc2,$acc2
244 adde $acc3,$acc3,$acc3
245 mulld $t3,$a3,$a3 # a[3]*a[3]
246 adde $acc4,$acc4,$acc4
248 adde $acc5,$acc5,$acc5
249 adde $acc6,$acc6,$acc6
252 addc $acc1,$acc1,$a0 # +a[i]*a[i]
281 mulld $acc4,$acc4,$bi
283 addc $acc0,$acc0,$acc4
288 subfe $acc4,$acc4,$acc4 # carry -> ~mask
292 add $acc0,$acc0,$acc4
296 ld r22,`$FRAME-8*10`($sp)
297 ld r23,`$FRAME-8*9`($sp)
298 ld r24,`$FRAME-8*8`($sp)
299 ld r25,`$FRAME-8*7`($sp)
300 ld r26,`$FRAME-8*6`($sp)
301 ld r27,`$FRAME-8*5`($sp)
302 ld r28,`$FRAME-8*4`($sp)
303 ld r29,`$FRAME-8*3`($sp)
304 ld r30,`$FRAME-8*2`($sp)
305 ld r31,`$FRAME-8*1`($sp)
309 .byte 0,12,4,0,0x80,10,2,0
311 .size x25519_fe64_sqr,.-x25519_fe64_sqr
313 .globl x25519_fe64_mul121666
314 .type x25519_fe64_mul121666,\@function
316 x25519_fe64_mul121666:
318 ori $bi,$bi,`121666-65536`
346 subfe $t1,$t1,$t1 # carry -> ~mask
356 .byte 0,12,0x14,0,0,0,2,0
358 .size x25519_fe64_mul121666,.-x25519_fe64_mul121666
360 .globl x25519_fe64_add
361 .type x25519_fe64_add,\@function
379 subfe $t1,$t1,$t1 # carry -> ~mask
387 subfe $t1,$t1,$t1 # carry -> ~mask
397 .byte 0,12,0x14,0,0,0,3,0
399 .size x25519_fe64_add,.-x25519_fe64_add
401 .globl x25519_fe64_sub
402 .type x25519_fe64_sub,\@function
420 subfe $t1,$t1,$t1 # borrow -> mask
421 xor $zero,$zero,$zero
429 subfe $t1,$t1,$t1 # borrow -> mask
439 .byte 0,12,0x14,0,0,0,3,0
441 .size x25519_fe64_sub,.-x25519_fe64_sub
443 .globl x25519_fe64_tobytes
444 .type x25519_fe64_tobytes,\@function
452 sradi $t0,$a3,63 # most significant bit -> mask
456 add $t0,$t0,$t1 # compare to modulus in the same go
457 srdi $a3,$a3,1 # most significant bit cleared
464 xor $zero,$zero,$zero
465 sradi $t0,$a3,63 # most significant bit -> mask
468 srdi $a3,$a3,1 # most significant bit cleared
477 for (my @a=($a0,$a1,$a2,$a3), my $i=0; $i<4; shift(@a), $i++) {
499 .byte 0,12,0x14,0,0,0,2,0
501 .size x25519_fe64_tobytes,.-x25519_fe64_tobytes
504 ####################################################### base 2^51
# Register names used by the base 2^51 code. The list assignment pairs names
# and registers positionally: $bi,$a0..$a4,$t0 take r6..r12, then $t1 takes
# r21 and the lo/hi halves $h0lo,$h0hi .. $h4lo,$h4hi take r22..r31.
# r21..r31 are the registers the fe51 prologues store to the stack frame.
506 my ($bi,$a0,$a1,$a2,$a3,$a4,$t0, $t1,
507 $h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,$h4lo,$h4hi) =
508 map("r$_",(6..12,21..31));
515 .globl x25519_fe51_mul
516 .type x25519_fe51_mul,\@function
519 stdu $sp,-$FRAME($sp)
520 std r21,`$FRAME-8*11`($sp)
521 std r22,`$FRAME-8*10`($sp)
522 std r23,`$FRAME-8*9`($sp)
523 std r24,`$FRAME-8*8`($sp)
524 std r25,`$FRAME-8*7`($sp)
525 std r26,`$FRAME-8*6`($sp)
526 std r27,`$FRAME-8*5`($sp)
527 std r28,`$FRAME-8*4`($sp)
528 std r29,`$FRAME-8*3`($sp)
529 std r30,`$FRAME-8*2`($sp)
530 std r31,`$FRAME-8*1`($sp)
539 mulld $h0lo,$a0,$bi # a[0]*b[0]
542 mulld $h1lo,$a1,$bi # a[1]*b[0]
545 mulld $h4lo,$a4,$bi # a[4]*b[0]
550 mulld $h2lo,$a2,$bi # a[2]*b[0]
553 mulld $h3lo,$a3,$bi # a[3]*b[0]
556 for(my @a=($a0,$a1,$a2,$a3,$a4),
557 my $i=1; $i<4; $i++) {
558 ($ap,$bi) = ($bi,$ap);
572 ld $ap,`8*($i+1)`($bp)
589 ($ap,$bi) = ($bi,$ap);
618 srdi $mask,$mask,13 # 0x7ffffffffffff
622 insrdi $t0,$h2hi,51,0 # h2>>51
625 insrdi $t1,$h0hi,51,0 # h0>>51
633 insrdi $t0,$h3hi,51,0 # h3>>51
636 insrdi $t1,$h1hi,51,0 # h1>>51
643 insrdi $t0,$h4hi,51,0
644 mulli $t0,$t0,19 # (h4 >> 51) * 19
662 ld r21,`$FRAME-8*11`($sp)
663 ld r22,`$FRAME-8*10`($sp)
664 ld r23,`$FRAME-8*9`($sp)
665 ld r24,`$FRAME-8*8`($sp)
666 ld r25,`$FRAME-8*7`($sp)
667 ld r26,`$FRAME-8*6`($sp)
668 ld r27,`$FRAME-8*5`($sp)
669 ld r28,`$FRAME-8*4`($sp)
670 ld r29,`$FRAME-8*3`($sp)
671 ld r30,`$FRAME-8*2`($sp)
672 ld r31,`$FRAME-8*1`($sp)
676 .byte 0,12,4,0,0x80,11,3,0
678 .size x25519_fe51_mul,.-x25519_fe51_mul
# Re-declare the operand and temporary register names as fresh lexicals
# initialised from the current ones.
# NOTE(review): presumably this keeps the name swaps made while generating
# x25519_fe51_sqr (e.g. ($a4,$t1) = ($t1,$a4)) from leaking into code
# generated afterwards - confirm the enclosing scope, which is not visible here.
681 my ($a0,$a1,$a2,$a3,$a4,$t0,$t1) = ($a0,$a1,$a2,$a3,$a4,$t0,$t1);
683 .globl x25519_fe51_sqr
684 .type x25519_fe51_sqr,\@function
687 stdu $sp,-$FRAME($sp)
688 std r21,`$FRAME-8*11`($sp)
689 std r22,`$FRAME-8*10`($sp)
690 std r23,`$FRAME-8*9`($sp)
691 std r24,`$FRAME-8*8`($sp)
692 std r25,`$FRAME-8*7`($sp)
693 std r26,`$FRAME-8*6`($sp)
694 std r27,`$FRAME-8*5`($sp)
695 std r28,`$FRAME-8*4`($sp)
696 std r29,`$FRAME-8*3`($sp)
697 std r30,`$FRAME-8*2`($sp)
698 std r31,`$FRAME-8*1`($sp)
706 add $bi,$a0,$a0 # a[0]*2
707 mulli $t1,$a4,19 # a[4]*19
719 add $bi,$a1,$a1 # a[1]*2
721 ($a4,$t1) = ($t1,$a4);
728 mulli $bp,$a3,19 # a[3]*19
744 add $bi,$a3,$a3 # a[3]*2
748 ($a3,$t1) = ($bp,$a3);
756 add $bi,$a2,$a2 # a[2]*2
775 .byte 0,12,4,0,0x80,11,2,0
777 .size x25519_fe51_sqr,.-x25519_fe51_sqr
781 .globl x25519_fe51_mul121666
782 .type x25519_fe51_mul121666,\@function
784 x25519_fe51_mul121666:
785 stdu $sp,-$FRAME($sp)
786 std r21,`$FRAME-8*11`($sp)
787 std r22,`$FRAME-8*10`($sp)
788 std r23,`$FRAME-8*9`($sp)
789 std r24,`$FRAME-8*8`($sp)
790 std r25,`$FRAME-8*7`($sp)
791 std r26,`$FRAME-8*6`($sp)
792 std r27,`$FRAME-8*5`($sp)
793 std r28,`$FRAME-8*4`($sp)
794 std r29,`$FRAME-8*3`($sp)
795 std r30,`$FRAME-8*2`($sp)
796 std r31,`$FRAME-8*1`($sp)
799 ori $bi,$bi,`121666-65536`
806 mulld $h0lo,$a0,$bi # a[0]*121666
808 mulld $h1lo,$a1,$bi # a[1]*121666
810 mulld $h2lo,$a2,$bi # a[2]*121666
812 mulld $h3lo,$a3,$bi # a[3]*121666
814 mulld $h4lo,$a4,$bi # a[4]*121666
819 .byte 0,12,4,0,0x80,11,2,0
821 .size x25519_fe51_mul121666,.-x25519_fe51_mul121666
# Replace every `...` span in $code with the result of evaluating its contents
# as a Perl expression, so constant arithmetic such as `$FRAME-8*10` or
# `121666-65536` is resolved to a number in the generated text.
825 $code =~ s/\`([^\`]*)\`/eval $1/gem;
# Fail loudly if closing STDOUT reports an error, rather than ignoring it.
# NOTE(review): STDOUT is presumably aliased to the translator pipe (OUT) on a
# line not visible here - confirm.
827 close STDOUT or die "error closing STDOUT";