2 # Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements Poly1305 hash for PowerPC FPU.
21 # Numbers are cycles per processed byte with poly1305_blocks alone,
22 # and improvement coefficients relative to gcc-generated code.
24 # Freescale e300 9.78/+30%
# Dispatch on $flavour: one branch for names containing "64", one for
# names containing "32"; anything else aborts generation.  The branch
# bodies are not visible in this excerpt -- NOTE(review): presumably they
# set the word-size dependent helpers ($SIZE_T, $STU, $PUSH, ...) used
# further down; confirm against the full file.
33 if ($flavour =~ /64/) {
40 } elsif ($flavour =~ /32/) {
47 } else { die "nonsense $flavour"; }
# A flavour ending in "le" selects little-endian.  The flag is 4 rather
# than 1 because it is XORed into byte offsets (the "4^$LITTLE_ENDIAN"
# and "0^$LITTLE_ENDIAN" expressions below) to pick the low-order or
# high-order 32-bit half of an 8-byte slot on either endianness.
49 $LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;
# Indexed 32-bit load used for key and input words: a plain load on
# little-endian targets and a byte-reversed load on big-endian ones, so
# the data is read as little-endian words in both cases.
51 $LWXLE = $LITTLE_ENDIAN ? "lwzx" : "lwbrx";
# Locate the ppc-xlate.pl translator: first in the directory this script
# was started from, then in ../../perlasm relative to it.  Give up if
# neither candidate is a plain file.
53 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
54 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
55 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
56 die "can't locate ppc-xlate.pl";
# Pipe everything printed to STDOUT through the translator, handing it
# the flavour and the next command-line argument.  The call is
# parenthesised and checked with low-precedence "or": with "||" the test
# applied to the (always true) command string, so a failed open was
# silently ignored.
58 open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!";
# Stack frame size for poly1305_blocks_fpu:
#   $LOCALS  - area below the scratch slots (defined outside this excerpt);
#   6*8      - six 8-byte scratch slots addressed as $LOCALS+8*n; slots
#              0-3 hold the input "template" and slot 4 the FPSCR image
#              (a sixth slot is reserved but not used in the visible code);
#   18*8     - save area for f14-f31, addressed as $FRAME-8*18 .. $FRAME-8*1.
61 $FRAME=$LOCALS+6*8+18*8;
# Integer register names.  $ctx, $inp, $len and $padbit are r3-r6.
# The scratch names take r7-r12 plus r6, so $i3 is the same register as
# $padbit and the two must not be live at the same time.
65 my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
66 my ($in0,$in1,$in2,$in3,$i1,$i2,$i3) = map("r$_",(7..12,6));
# Floating-point register names, covering all of f0-f31 in list order:
# hash limbs as lo/hi pairs (f0-f7), power-of-two constants and 5/2^130
# (f8-f13), key limbs r0-r2 (f14-f19), scaled key limbs s2-s3 (f20-f23)
# and the c0-c3 temporaries used by the base conversion steps (f24-f31).
68 my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
69 $two0,$two32,$two64,$two96,$two130,$five_two130,
70 $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
71 $s2lo,$s2hi,$s3lo,$s3hi,
72 $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("f$_",(0..31));
# There are no spare FP registers, so the remaining names are aliases of
# the c0-c3 temporaries: r3 shares c0 and s1 shares c1.
74 my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
# Input limbs x0-x3 live in the c2/c3 registers.
75 my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
# The second input set overlaps others: y0/y1 are the same registers as
# x2/x3, and y2/y3 are the same registers as s1lo/s1hi.
76 my ($y0,$y1,$y2,$y3) = ($c3lo,$c3hi,$c1lo,$c1hi);
82 .globl .poly1305_init_fpu
85 $STU $sp,-$LOCALS($sp) # minimal frame
87 $PUSH $padbit,`$LOCALS+$LRSAVE`($sp)
92 mtlr $padbit # restore lr
94 lfd $two0,8*0($len) # load constants
99 lfd $five_two130,8*5($len)
101 stfd $two0,8*0($ctx) # initial hash value, biased 0
102 stfd $two32,8*1($ctx)
103 stfd $two64,8*2($ctx)
104 stfd $two96,8*3($ctx)
109 lfd $h3lo,8*13($len) # new fpscr
110 mffs $h3hi # old fpscr
112 stfd $two0,8*4($ctx) # key "template"
113 stfd $two32,8*5($ctx)
114 stfd $two64,8*6($ctx)
115 stfd $two96,8*7($ctx)
120 $LWXLE $in0,0,$inp # load key
121 $LWXLE $in1,$in1,$inp
122 $LWXLE $in2,$in2,$inp
123 $LWXLE $in3,$in3,$inp
125 lis $i1,0xf000 # 0xf0000000
126 ori $i2,$i1,3 # 0xf0000003
127 andc $in0,$in0,$i1 # &=0x0fffffff
128 andc $in1,$in1,$i2 # &=0x0ffffffc
132 stw $in0,`8*4+(4^$LITTLE_ENDIAN)`($ctx) # fill "template"
133 stw $in1,`8*5+(4^$LITTLE_ENDIAN)`($ctx)
134 stw $in2,`8*6+(4^$LITTLE_ENDIAN)`($ctx)
135 stw $in3,`8*7+(4^$LITTLE_ENDIAN)`($ctx)
137 mtfsf 255,$h3lo # fpscr
138 stfd $two0,8*18($ctx) # copy constants to context
139 stfd $two32,8*19($ctx)
140 stfd $two64,8*20($ctx)
141 stfd $two96,8*21($ctx)
142 stfd $two130,8*22($ctx)
143 stfd $five_two130,8*23($ctx)
145 lfd $h0lo,8*4($ctx) # load [biased] key
150 fsub $h0lo,$h0lo,$two0 # r0
151 fsub $h1lo,$h1lo,$two32 # r1
152 fsub $h2lo,$h2lo,$two64 # r2
153 fsub $h3lo,$h3lo,$two96 # r3
155 lfd $two0,8*6($len) # more constants
160 fmul $h1hi,$h1lo,$five_two130 # s1
161 fmul $h2hi,$h2lo,$five_two130 # s2
162 stfd $h3hi,8*15($ctx) # borrow slot for original fpscr
163 fmul $h3hi,$h3lo,$five_two130 # s3
165 fadd $h0hi,$h0lo,$two0
166 stfd $h1hi,8*12($ctx) # put aside for now
167 fadd $h1hi,$h1lo,$two32
168 stfd $h2hi,8*13($ctx)
169 fadd $h2hi,$h2lo,$two64
170 stfd $h3hi,8*14($ctx)
171 fadd $h3hi,$h3lo,$two96
173 fsub $h0hi,$h0hi,$two0
174 fsub $h1hi,$h1hi,$two32
175 fsub $h2hi,$h2hi,$two64
176 fsub $h3hi,$h3hi,$two96
178 lfd $two0,8*10($len) # more constants
179 lfd $two32,8*11($len)
180 lfd $two64,8*12($len)
182 fsub $h0lo,$h0lo,$h0hi
183 fsub $h1lo,$h1lo,$h1hi
184 fsub $h2lo,$h2lo,$h2hi
185 fsub $h3lo,$h3lo,$h3hi
187 stfd $h0hi,8*5($ctx) # r0hi
188 stfd $h1hi,8*7($ctx) # r1hi
189 stfd $h2hi,8*9($ctx) # r2hi
190 stfd $h3hi,8*11($ctx) # r3hi
192 stfd $h0lo,8*4($ctx) # r0lo
193 stfd $h1lo,8*6($ctx) # r1lo
194 stfd $h2lo,8*8($ctx) # r2lo
195 stfd $h3lo,8*10($ctx) # r3lo
197 lfd $h1lo,8*12($ctx) # s1
198 lfd $h2lo,8*13($ctx) # s2
199 lfd $h3lo,8*14($ctx) # s3
200 lfd $h0lo,8*15($ctx) # pull original fpscr
202 fadd $h1hi,$h1lo,$two0
203 fadd $h2hi,$h2lo,$two32
204 fadd $h3hi,$h3lo,$two64
206 fsub $h1hi,$h1hi,$two0
207 fsub $h2hi,$h2hi,$two32
208 fsub $h3hi,$h3hi,$two64
210 fsub $h1lo,$h1lo,$h1hi
211 fsub $h2lo,$h2lo,$h2hi
212 fsub $h3lo,$h3lo,$h3hi
214 stfd $h1hi,8*13($ctx) # s1hi
215 stfd $h2hi,8*15($ctx) # s2hi
216 stfd $h3hi,8*17($ctx) # s3hi
218 stfd $h1lo,8*12($ctx) # s1lo
219 stfd $h2lo,8*14($ctx) # s2lo
220 stfd $h3lo,8*16($ctx) # s3lo
222 mtfsf 255,$h0lo # restore fpscr
228 .byte 0,12,4,1,0x80,0,2,0
229 .size .poly1305_init_fpu,.-.poly1305_init_fpu
231 .globl .poly1305_blocks_fpu
233 .poly1305_blocks_fpu:
237 $STU $sp,-$FRAME($sp)
239 stfd f14,`$FRAME-8*18`($sp)
240 stfd f15,`$FRAME-8*17`($sp)
241 stfd f16,`$FRAME-8*16`($sp)
242 stfd f17,`$FRAME-8*15`($sp)
243 stfd f18,`$FRAME-8*14`($sp)
244 stfd f19,`$FRAME-8*13`($sp)
245 stfd f20,`$FRAME-8*12`($sp)
246 stfd f21,`$FRAME-8*11`($sp)
247 stfd f22,`$FRAME-8*10`($sp)
248 stfd f23,`$FRAME-8*9`($sp)
249 stfd f24,`$FRAME-8*8`($sp)
250 stfd f25,`$FRAME-8*7`($sp)
251 stfd f26,`$FRAME-8*6`($sp)
252 stfd f27,`$FRAME-8*5`($sp)
253 stfd f28,`$FRAME-8*4`($sp)
254 stfd f29,`$FRAME-8*3`($sp)
255 stfd f30,`$FRAME-8*2`($sp)
256 stfd f31,`$FRAME-8*1`($sp)
257 $PUSH r0,`$FRAME+$LRSAVE`($sp)
263 stw r0,`$LOCALS+8*4+(0^$LITTLE_ENDIAN)`($sp)
264 stw $in3,`$LOCALS+8*4+(4^$LITTLE_ENDIAN)`($sp)
266 lfd $two0,8*18($ctx) # load constants
267 lfd $two32,8*19($ctx)
268 lfd $two64,8*20($ctx)
269 lfd $two96,8*21($ctx)
270 lfd $two130,8*22($ctx)
271 lfd $five_two130,8*23($ctx)
273 lfd $h0lo,8*0($ctx) # load [biased] hash value
278 stfd $two0,`$LOCALS+8*0`($sp) # input "template"
279 oris $in3,$padbit,`(1023+52+96)<<4`
280 stfd $two32,`$LOCALS+8*1`($sp)
281 stfd $two64,`$LOCALS+8*2`($sp)
282 stw $in3,`$LOCALS+8*3+(0^$LITTLE_ENDIAN)`($sp)
287 $LWXLE $in0,0,$inp # load input
293 stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template"
294 stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
295 stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
296 stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
298 mffs $x0 # original fpscr
299 lfd $x1,`$LOCALS+8*4`($sp) # new fpscr
300 lfd $r0lo,8*4($ctx) # load key
315 stfd $x0,`$LOCALS+8*4`($sp) # save original fpscr
321 sub $inp,$inp,r0 # conditional rewind
323 lfd $x0,`$LOCALS+8*0`($sp)
324 lfd $x1,`$LOCALS+8*1`($sp)
325 lfd $x2,`$LOCALS+8*2`($sp)
326 lfd $x3,`$LOCALS+8*3`($sp)
328 fsub $h0lo,$h0lo,$two0 # de-bias hash value
329 $LWXLE $in0,0,$inp # modulo-scheduled input load
330 fsub $h1lo,$h1lo,$two32
332 fsub $h2lo,$h2lo,$two64
334 fsub $h3lo,$h3lo,$two96
337 fsub $x0,$x0,$two0 # de-bias input
343 fadd $x0,$x0,$h0lo # accumulate input
344 stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)
346 stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
348 stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
350 stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
356 fsub $y0,$y0,$two0 # de-bias input
363 sub $inp,$inp,r0 # conditional rewind
365 fadd $h0lo,$h0lo,$y0 # accumulate input
370 ######################################### base 2^48 -> base 2^32
371 fadd $c1lo,$h1lo,$two64
372 $LWXLE $in0,0,$inp # modulo-scheduled input load
373 fadd $c1hi,$h1hi,$two64
375 fadd $c3lo,$h3lo,$two130
377 fadd $c3hi,$h3hi,$two130
379 fadd $c0lo,$h0lo,$two32
381 fadd $c0hi,$h0hi,$two32
382 fadd $c2lo,$h2lo,$two96
383 fadd $c2hi,$h2hi,$two96
385 fsub $c1lo,$c1lo,$two64
386 stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template"
387 fsub $c1hi,$c1hi,$two64
388 stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
389 fsub $c3lo,$c3lo,$two130
390 stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
391 fsub $c3hi,$c3hi,$two130
392 stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
393 fsub $c0lo,$c0lo,$two32
394 fsub $c0hi,$c0hi,$two32
395 fsub $c2lo,$c2lo,$two96
396 fsub $c2hi,$c2hi,$two96
398 fsub $h1lo,$h1lo,$c1lo
399 fsub $h1hi,$h1hi,$c1hi
400 fsub $h3lo,$h3lo,$c3lo
401 fsub $h3hi,$h3hi,$c3hi
402 fsub $h2lo,$h2lo,$c2lo
403 fsub $h2hi,$h2hi,$c2hi
404 fsub $h0lo,$h0lo,$c0lo
405 fsub $h0hi,$h0hi,$c0hi
407 fadd $h1lo,$h1lo,$c0lo
408 fadd $h1hi,$h1hi,$c0hi
409 fadd $h3lo,$h3lo,$c2lo
410 fadd $h3hi,$h3hi,$c2hi
411 fadd $h2lo,$h2lo,$c1lo
412 fadd $h2hi,$h2hi,$c1hi
413 fmadd $h0lo,$c3lo,$five_two130,$h0lo
414 fmadd $h0hi,$c3hi,$five_two130,$h0hi
417 lfd $s1lo,8*12($ctx) # reload constants
434 fmadd $h0lo,$s1lo,$x3,$h0lo
435 fmadd $h0hi,$s1hi,$x3,$h0hi
436 fmadd $h2lo,$s3lo,$x3,$h2lo
437 fmadd $h2hi,$s3hi,$x3,$h2hi
438 fmadd $h1lo,$s2lo,$x3,$h1lo
439 fmadd $h1hi,$s2hi,$x3,$h1hi
440 fmadd $h3lo,$r0lo,$x3,$h3lo
441 fmadd $h3hi,$r0hi,$x3,$h3hi
443 fmadd $h0lo,$s2lo,$x2,$h0lo
444 fmadd $h0hi,$s2hi,$x2,$h0hi
445 fmadd $h2lo,$r0lo,$x2,$h2lo
446 fmadd $h2hi,$r0hi,$x2,$h2hi
447 fmadd $h1lo,$s3lo,$x2,$h1lo
448 fmadd $h1hi,$s3hi,$x2,$h1hi
449 fmadd $h3lo,$r1lo,$x2,$h3lo
450 fmadd $h3hi,$r1hi,$x2,$h3hi
452 fmadd $h0lo,$r0lo,$x0,$h0lo
453 lfd $y0,`$LOCALS+8*0`($sp) # load [biased] input
454 fmadd $h0hi,$r0hi,$x0,$h0hi
455 lfd $y1,`$LOCALS+8*1`($sp)
456 fmadd $h2lo,$r2lo,$x0,$h2lo
457 lfd $y2,`$LOCALS+8*2`($sp)
458 fmadd $h2hi,$r2hi,$x0,$h2hi
459 lfd $y3,`$LOCALS+8*3`($sp)
460 fmadd $h1lo,$r1lo,$x0,$h1lo
461 fmadd $h1hi,$r1hi,$x0,$h1hi
462 fmadd $h3lo,$r3lo,$x0,$h3lo
463 fmadd $h3hi,$r3hi,$x0,$h3hi
467 ######################################### base 2^48 -> base 2^32
468 fadd $c0lo,$h0lo,$two32
469 fadd $c0hi,$h0hi,$two32
470 fadd $c2lo,$h2lo,$two96
471 fadd $c2hi,$h2hi,$two96
472 fadd $c1lo,$h1lo,$two64
473 fadd $c1hi,$h1hi,$two64
474 fadd $c3lo,$h3lo,$two130
475 fadd $c3hi,$h3hi,$two130
477 fsub $c0lo,$c0lo,$two32
478 fsub $c0hi,$c0hi,$two32
479 fsub $c2lo,$c2lo,$two96
480 fsub $c2hi,$c2hi,$two96
481 fsub $c1lo,$c1lo,$two64
482 fsub $c1hi,$c1hi,$two64
483 fsub $c3lo,$c3lo,$two130
484 fsub $c3hi,$c3hi,$two130
486 fsub $h1lo,$h1lo,$c1lo
487 fsub $h1hi,$h1hi,$c1hi
488 fsub $h3lo,$h3lo,$c3lo
489 fsub $h3hi,$h3hi,$c3hi
490 fsub $h2lo,$h2lo,$c2lo
491 fsub $h2hi,$h2hi,$c2hi
492 fsub $h0lo,$h0lo,$c0lo
493 fsub $h0hi,$h0hi,$c0hi
495 fadd $h1lo,$h1lo,$c0lo
496 fadd $h1hi,$h1hi,$c0hi
497 fadd $h3lo,$h3lo,$c2lo
498 fadd $h3hi,$h3hi,$c2hi
499 fadd $h2lo,$h2lo,$c1lo
500 fadd $h2hi,$h2hi,$c1hi
501 fmadd $h0lo,$c3lo,$five_two130,$h0lo
502 fmadd $h0hi,$c3hi,$five_two130,$h0hi
509 lfd $h0lo,`$LOCALS+8*4`($sp) # pull saved fpscr
510 fadd $x1,$x1,$two32 # bias
515 stfd $x1,8*1($ctx) # store [biased] hash value
520 mtfsf 255,$h0lo # restore original fpscr
521 lfd f14,`$FRAME-8*18`($sp)
522 lfd f15,`$FRAME-8*17`($sp)
523 lfd f16,`$FRAME-8*16`($sp)
524 lfd f17,`$FRAME-8*15`($sp)
525 lfd f18,`$FRAME-8*14`($sp)
526 lfd f19,`$FRAME-8*13`($sp)
527 lfd f20,`$FRAME-8*12`($sp)
528 lfd f21,`$FRAME-8*11`($sp)
529 lfd f22,`$FRAME-8*10`($sp)
530 lfd f23,`$FRAME-8*9`($sp)
531 lfd f24,`$FRAME-8*8`($sp)
532 lfd f25,`$FRAME-8*7`($sp)
533 lfd f26,`$FRAME-8*6`($sp)
534 lfd f27,`$FRAME-8*5`($sp)
535 lfd f28,`$FRAME-8*4`($sp)
536 lfd f29,`$FRAME-8*3`($sp)
537 lfd f30,`$FRAME-8*2`($sp)
538 lfd f31,`$FRAME-8*1`($sp)
543 .byte 0,12,4,1,0x80,0,4,0
544 .size .poly1305_blocks_fpu,.-.poly1305_blocks_fpu
# Names for the emit routine: the MAC output pointer and the nonce pointer
# reuse the registers previously named $inp and $len.
547 my ($mac,$nonce)=($inp,$len);
# Hash words h0-h4 are r7-r11; the d0-d3 temporaries are r28-r31, which
# the emit prologue saves and the epilogue restores.
549 my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3
550 ) = map("r$_",(7..11,28..31));
# Frame size for the emit routine, held in a new lexical $FRAME that is
# distinct from the one used by the blocks routine.  Four $SIZE_T slots
# hold r28-r31; NOTE(review): the other six are presumably the ABI
# linkage area -- confirm.
552 my $FRAME = (6+4)*$SIZE_T;
555 .globl .poly1305_emit_fpu
558 $STU $sp,-$FRAME($sp)
560 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
561 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
562 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
563 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
564 $PUSH r0,`$FRAME+$LRSAVE`($sp)
566 lwz $d0,`8*0+(0^$LITTLE_ENDIAN)`($ctx) # load hash
567 lwz $h0,`8*0+(4^$LITTLE_ENDIAN)`($ctx)
568 lwz $d1,`8*1+(0^$LITTLE_ENDIAN)`($ctx)
569 lwz $h1,`8*1+(4^$LITTLE_ENDIAN)`($ctx)
570 lwz $d2,`8*2+(0^$LITTLE_ENDIAN)`($ctx)
571 lwz $h2,`8*2+(4^$LITTLE_ENDIAN)`($ctx)
572 lwz $d3,`8*3+(0^$LITTLE_ENDIAN)`($ctx)
573 lwz $h3,`8*3+(4^$LITTLE_ENDIAN)`($ctx)
576 andc $d0,$d0,$mask # mask exponent
579 andc $d3,$d3,$mask # can be partially reduced...
582 srwi $padbit,$d3,2 # ... so reduce
595 addic $d0,$h0,5 # compare to modulus
601 srwi $mask,$mask,2 # did it carry/borrow?
603 srawi $mask,$mask,31 # mask
610 lwz $d0,0($nonce) # load nonce
622 addc $h0,$h0,$d0 # accumulate nonce
646 addic $d0,$h0,5 # compare to modulus
650 srdi $mask,$d2,2 # did it carry/borrow?
652 sradi $mask,$mask,63 # mask
653 ld $d2,0($nonce) # load nonce
663 $code.=<<___ if (!$LITTLE_ENDIAN);
664 rotldi $d2,$d2,32 # flip nonce words
668 addc $h0,$h0,$d2 # accumulate nonce
675 $code.=<<___ if ($LITTLE_ENDIAN);
676 stw $h0,0($mac) # write result
681 $code.=<<___ if (!$LITTLE_ENDIAN);
683 stwbrx $h0,0,$mac # write result
691 $POP r28,`$FRAME-$SIZE_T*4`($sp)
692 $POP r29,`$FRAME-$SIZE_T*3`($sp)
693 $POP r30,`$FRAME-$SIZE_T*2`($sp)
694 $POP r31,`$FRAME-$SIZE_T*1`($sp)
698 .byte 0,12,4,1,0x80,4,3,0
699 .size .poly1305_emit_fpu,.-.poly1305_emit_fpu
702 # Ugly hack here, because PPC assembler syntax seems to vary too
703 # much from platform to platform...
709 mflr $len # vvvvvv "distance" between . and 1st data entry
710 addi $len,$len,`64-8` # borrow $len
714 .byte 0,12,0x14,0,0,0,0,0
717 .quad 0x4330000000000000 # 2^(52+0)
718 .quad 0x4530000000000000 # 2^(52+32)
719 .quad 0x4730000000000000 # 2^(52+64)
720 .quad 0x4930000000000000 # 2^(52+96)
721 .quad 0x4b50000000000000 # 2^(52+130)
723 .quad 0x37f4000000000000 # 5/2^130
725 .quad 0x4430000000000000 # 2^(52+16+0)
726 .quad 0x4630000000000000 # 2^(52+16+32)
727 .quad 0x4830000000000000 # 2^(52+16+64)
728 .quad 0x4a30000000000000 # 2^(52+16+96)
729 .quad 0x3e30000000000000 # 2^(52+16+0-96)
730 .quad 0x4030000000000000 # 2^(52+16+32-96)
731 .quad 0x4230000000000000 # 2^(52+16+64-96)
733 .quad 0x0000000000000001 # fpscr: truncate, no exceptions
734 .asciz "Poly1305 for PPC FPU, CRYPTOGAMS by <appro\@openssl.org>"
# Post-process the generated text: every span enclosed in backticks is
# replaced by the result of evaluating its contents as a Perl expression
# (/e), for all occurrences (/g).  This is what turns the offset
# arithmetic written in backticks into plain numbers.
738 $code =~ s/\`([^\`]*)\`/eval $1/gem;