Update copyright year
[oweals/openssl.git] / crypto / poly1305 / asm / poly1305-ppcfp.pl
1 #! /usr/bin/env perl
2 # Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # This module implements Poly1305 hash for PowerPC FPU.
18 #
19 # June 2015
20 #
21 # Numbers are cycles per processed byte with poly1305_blocks alone,
22 # and improvement coefficients relative to gcc-generated code.
23 #
24 # Freescale e300        9.78/+30%
25 # PPC74x0               6.92/+50%
26 # PPC970                6.03/+80%
27 # POWER7                3.50/+30%
28 # POWER8                3.75/+10%
29 # POWER9                2.80/+12%
30
# Target selection.  The first command-line argument is the "flavour"
# string; it decides the ABI word size and the matching mnemonics:
#   $SIZE_T  - size of a pointer/size_t in bytes
#   $LRSAVE  - offset of the link-register save slot in the caller frame
#   $UCMP    - unsigned compare, $STU - store-with-update,
#   $POP/$PUSH - register-sized load/store
# A flavour naming neither 64 nor 32 is rejected.  64 is tested first,
# so a flavour containing both digits pairs is treated as 64-bit.
31 $flavour = shift;
32
33 unless ($flavour =~ /64/ or $flavour =~ /32/) {
34         die "nonsense $flavour";
35 }
36
37 if ($flavour =~ /64/) {
38         ($SIZE_T,$LRSAVE) = (8,16);
39         ($UCMP,$STU)      = ("cmpld","stdu");
40         ($POP,$PUSH)      = ("ld","std");
41 } else {
42         ($SIZE_T,$LRSAVE) = (4,4);
43         ($UCMP,$STU)      = ("cmplw","stwu");
44         ($POP,$PUSH)      = ("lwz","stw");
45 }
46
# $LITTLE_ENDIAN is 4 (not 1) for flavours ending in "le": it is XOR-ed
# into byte offsets to pick one 32-bit half of an 8-byte slot.
47 $LITTLE_ENDIAN = 0;
48 $LITTLE_ENDIAN = 4 if ($flavour =~ /le$/);
49
# Word load that yields little-endian data: plain on LE, byte-reversed on BE.
50 $LWXLE = "lwbrx";
51 $LWXLE = "lwzx" if ($LITTLE_ENDIAN);
52
# Locate the ppc-xlate.pl translator: first in the directory of this
# script, then in ../../perlasm/ relative to it; give up if neither
# candidate is a plain file.
53 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
54 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
55 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
56 die "can't locate ppc-xlate.pl";
57
# Re-open STDOUT as a pipe into the translator, handing it the flavour
# and the next command-line argument (presumably the output file name).
# The failure check has to use low-precedence "or": with "||" the die()
# was attached to the always-true command string, so a failed open was
# never reported.
58 open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
59
# Stack frame geometry and symbolic register names used by the assembly
# templates below.
#   $LOCALS - size of the minimal frame (six register-sized slots)
#   $FRAME  - $LOCALS plus 6 eight-byte scratch slots plus 18 eight-byte
#             slots (the blocks routine saves f14-f31 in the latter)
60 $LOCALS = $SIZE_T*6;
61 $FRAME  = $LOCALS + 8*(6+18);
62
63 my $sp = "r1";
64
# Integer registers: r3..r6 carry the arguments, r7..r12 are scratch;
# note that $i3 shares r6 with $padbit.
65 my ($ctx,$inp,$len,$padbit) = map { "r$_" } 3..6;
66 my ($in0,$in1,$in2,$in3,$i1,$i2,$i3) = map { "r$_" } (7..12, 6);
67
# Floating-point registers: the 32 names below map onto f0..f31 in order.
68 my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
69     $two0,$two32,$two64,$two96,$two130,$five_two130,
70     $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
71     $s2lo,$s2hi,$s3lo,$s3hi,
72     $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map { "f$_" } 0..31;
73 # borrowings: f0..f31 are all taken above, so these names alias c-registers
74 my ($r3lo,$r3hi) = ($c0lo,$c0hi); my ($s1lo,$s1hi) = ($c1lo,$c1hi);
75 my ($x0,$x1) = ($c2lo,$c2hi); my ($x2,$x3) = ($c3lo,$c3hi);
76 my ($y0,$y1) = ($c3lo,$c3hi); my ($y2,$y3) = ($c1lo,$c1hi);
77
# Assembly text for the first two entry points, appended to $code.  The
# heredoc interpolates the register/mnemonic variables defined above;
# expressions in `backticks` are left as text here and evaluated by the
# s///e pass at the end of this script.
#
# Context layout as written/read by the code below (8-byte slots):
#   8*0  .. 8*3    hash value h0..h3, stored biased by 2^(52+32*i)
#   8*4  .. 8*11   key r0..r3, each split into lo/hi halves
#   8*12 .. 8*17   s1..s3 (r1..r3 times 5/2^130), split into lo/hi
#   8*18 .. 8*23   copies of the first six table constants
#
# .poly1305_init_fpu ($ctx=r3, $inp=r4 key pointer)
#   Opens a minimal frame, calls LPICmeup to get the constant table
#   address in $len and stores the biased zero hash.  If the key pointer
#   is zero it skips to Lno_key.  Otherwise it loads four key words with
#   $LWXLE, clamps them with the 0x0fffffff/0x0ffffffc masks, converts
#   them to doubles through the "template" slots, derives the lo/hi
#   split of r and s, and copies the constants into the context.  fpscr
#   is replaced by the table's value during the computation and restored
#   afterwards.  r3 is zeroed on exit.
#
# .poly1305_blocks_fpu ($ctx=r3, $inp=r4, $len=r5, $padbit=r6)
#   Returns immediately (Labort) when $len>>4 is zero.  Otherwise saves
#   f14-f31, installs an fpscr image of {0,1} built on the stack, and
#   runs the loop $len>>4 times via CTR.  Input words are loaded one
#   block ahead of their use ("modulo-scheduled"); the addic/addze/slwi/
#   sub sequence rewinds $inp by 16 when the negated counter reaches
#   zero - presumably so that the look-ahead load on the final block
#   does not read past the input.  On exit the re-biased hash is stored
#   back to 8*0..8*3($ctx), and fpscr and f14-f31 are restored.
78 $code.=<<___;
79 .machine        "any"
80 .text
81
82 .globl  .poly1305_init_fpu
83 .align  6
84 .poly1305_init_fpu:
85         $STU    $sp,-$LOCALS($sp)               # minimal frame
86         mflr    $padbit
87         $PUSH   $padbit,`$LOCALS+$LRSAVE`($sp)
88
89         bl      LPICmeup
90
91         xor     r0,r0,r0
92         mtlr    $padbit                         # restore lr
93
94         lfd     $two0,8*0($len)                 # load constants
95         lfd     $two32,8*1($len)
96         lfd     $two64,8*2($len)
97         lfd     $two96,8*3($len)
98         lfd     $two130,8*4($len)
99         lfd     $five_two130,8*5($len)
100
101         stfd    $two0,8*0($ctx)                 # initial hash value, biased 0
102         stfd    $two32,8*1($ctx)
103         stfd    $two64,8*2($ctx)
104         stfd    $two96,8*3($ctx)
105
106         $UCMP   $inp,r0
107         beq-    Lno_key
108
109         lfd     $h3lo,8*13($len)                # new fpscr
110         mffs    $h3hi                           # old fpscr
111
112         stfd    $two0,8*4($ctx)                 # key "template"
113         stfd    $two32,8*5($ctx)
114         stfd    $two64,8*6($ctx)
115         stfd    $two96,8*7($ctx)
116
117         li      $in1,4
118         li      $in2,8
119         li      $in3,12
120         $LWXLE  $in0,0,$inp                     # load key
121         $LWXLE  $in1,$in1,$inp
122         $LWXLE  $in2,$in2,$inp
123         $LWXLE  $in3,$in3,$inp
124
125         lis     $i1,0xf000                      #   0xf0000000
126         ori     $i2,$i1,3                       #   0xf0000003
127         andc    $in0,$in0,$i1                   # &=0x0fffffff
128         andc    $in1,$in1,$i2                   # &=0x0ffffffc
129         andc    $in2,$in2,$i2
130         andc    $in3,$in3,$i2
131
132         stw     $in0,`8*4+(4^$LITTLE_ENDIAN)`($ctx)     # fill "template"
133         stw     $in1,`8*5+(4^$LITTLE_ENDIAN)`($ctx)
134         stw     $in2,`8*6+(4^$LITTLE_ENDIAN)`($ctx)
135         stw     $in3,`8*7+(4^$LITTLE_ENDIAN)`($ctx)
136
137         mtfsf   255,$h3lo                       # fpscr
138         stfd    $two0,8*18($ctx)                # copy constants to context
139         stfd    $two32,8*19($ctx)
140         stfd    $two64,8*20($ctx)
141         stfd    $two96,8*21($ctx)
142         stfd    $two130,8*22($ctx)
143         stfd    $five_two130,8*23($ctx)
144
145         lfd     $h0lo,8*4($ctx)                 # load [biased] key
146         lfd     $h1lo,8*5($ctx)
147         lfd     $h2lo,8*6($ctx)
148         lfd     $h3lo,8*7($ctx)
149
150         fsub    $h0lo,$h0lo,$two0               # r0
151         fsub    $h1lo,$h1lo,$two32              # r1
152         fsub    $h2lo,$h2lo,$two64              # r2
153         fsub    $h3lo,$h3lo,$two96              # r3
154
155         lfd     $two0,8*6($len)                 # more constants
156         lfd     $two32,8*7($len)
157         lfd     $two64,8*8($len)
158         lfd     $two96,8*9($len)
159
160         fmul    $h1hi,$h1lo,$five_two130        # s1
161         fmul    $h2hi,$h2lo,$five_two130        # s2
162          stfd   $h3hi,8*15($ctx)                # borrow slot for original fpscr
163         fmul    $h3hi,$h3lo,$five_two130        # s3
164
165         fadd    $h0hi,$h0lo,$two0
166          stfd   $h1hi,8*12($ctx)                # put aside for now
167         fadd    $h1hi,$h1lo,$two32
168          stfd   $h2hi,8*13($ctx)
169         fadd    $h2hi,$h2lo,$two64
170          stfd   $h3hi,8*14($ctx)
171         fadd    $h3hi,$h3lo,$two96
172
173         fsub    $h0hi,$h0hi,$two0
174         fsub    $h1hi,$h1hi,$two32
175         fsub    $h2hi,$h2hi,$two64
176         fsub    $h3hi,$h3hi,$two96
177
178         lfd     $two0,8*10($len)                # more constants
179         lfd     $two32,8*11($len)
180         lfd     $two64,8*12($len)
181
182         fsub    $h0lo,$h0lo,$h0hi
183         fsub    $h1lo,$h1lo,$h1hi
184         fsub    $h2lo,$h2lo,$h2hi
185         fsub    $h3lo,$h3lo,$h3hi
186
187         stfd    $h0hi,8*5($ctx)                 # r0hi
188         stfd    $h1hi,8*7($ctx)                 # r1hi
189         stfd    $h2hi,8*9($ctx)                 # r2hi
190         stfd    $h3hi,8*11($ctx)                # r3hi
191
192         stfd    $h0lo,8*4($ctx)                 # r0lo
193         stfd    $h1lo,8*6($ctx)                 # r1lo
194         stfd    $h2lo,8*8($ctx)                 # r2lo
195         stfd    $h3lo,8*10($ctx)                # r3lo
196
197         lfd     $h1lo,8*12($ctx)                # s1
198         lfd     $h2lo,8*13($ctx)                # s2
199         lfd     $h3lo,8*14($ctx)                # s3
200         lfd     $h0lo,8*15($ctx)                # pull original fpscr
201
202         fadd    $h1hi,$h1lo,$two0
203         fadd    $h2hi,$h2lo,$two32
204         fadd    $h3hi,$h3lo,$two64
205
206         fsub    $h1hi,$h1hi,$two0
207         fsub    $h2hi,$h2hi,$two32
208         fsub    $h3hi,$h3hi,$two64
209
210         fsub    $h1lo,$h1lo,$h1hi
211         fsub    $h2lo,$h2lo,$h2hi
212         fsub    $h3lo,$h3lo,$h3hi
213
214         stfd    $h1hi,8*13($ctx)                # s1hi
215         stfd    $h2hi,8*15($ctx)                # s2hi
216         stfd    $h3hi,8*17($ctx)                # s3hi
217
218         stfd    $h1lo,8*12($ctx)                # s1lo
219         stfd    $h2lo,8*14($ctx)                # s2lo
220         stfd    $h3lo,8*16($ctx)                # s3lo
221
222         mtfsf   255,$h0lo                       # restore fpscr
223 Lno_key:
224         xor     r3,r3,r3
225         addi    $sp,$sp,$LOCALS
226         blr
227         .long   0
228         .byte   0,12,4,1,0x80,0,2,0
229 .size   .poly1305_init_fpu,.-.poly1305_init_fpu
230
231 .globl  .poly1305_blocks_fpu
232 .align  4
233 .poly1305_blocks_fpu:
234         srwi.   $len,$len,4
235         beq-    Labort
236
237         $STU    $sp,-$FRAME($sp)
238         mflr    r0
239         stfd    f14,`$FRAME-8*18`($sp)
240         stfd    f15,`$FRAME-8*17`($sp)
241         stfd    f16,`$FRAME-8*16`($sp)
242         stfd    f17,`$FRAME-8*15`($sp)
243         stfd    f18,`$FRAME-8*14`($sp)
244         stfd    f19,`$FRAME-8*13`($sp)
245         stfd    f20,`$FRAME-8*12`($sp)
246         stfd    f21,`$FRAME-8*11`($sp)
247         stfd    f22,`$FRAME-8*10`($sp)
248         stfd    f23,`$FRAME-8*9`($sp)
249         stfd    f24,`$FRAME-8*8`($sp)
250         stfd    f25,`$FRAME-8*7`($sp)
251         stfd    f26,`$FRAME-8*6`($sp)
252         stfd    f27,`$FRAME-8*5`($sp)
253         stfd    f28,`$FRAME-8*4`($sp)
254         stfd    f29,`$FRAME-8*3`($sp)
255         stfd    f30,`$FRAME-8*2`($sp)
256         stfd    f31,`$FRAME-8*1`($sp)
257         $PUSH   r0,`$FRAME+$LRSAVE`($sp)
258
259         xor     r0,r0,r0
260         li      $in3,1
261         mtctr   $len
262         neg     $len,$len
263         stw     r0,`$LOCALS+8*4+(0^$LITTLE_ENDIAN)`($sp)
264         stw     $in3,`$LOCALS+8*4+(4^$LITTLE_ENDIAN)`($sp)
265
266         lfd     $two0,8*18($ctx)                # load constants
267         lfd     $two32,8*19($ctx)
268         lfd     $two64,8*20($ctx)
269         lfd     $two96,8*21($ctx)
270         lfd     $two130,8*22($ctx)
271         lfd     $five_two130,8*23($ctx)
272
273         lfd     $h0lo,8*0($ctx)                 # load [biased] hash value
274         lfd     $h1lo,8*1($ctx)
275         lfd     $h2lo,8*2($ctx)
276         lfd     $h3lo,8*3($ctx)
277
278         stfd    $two0,`$LOCALS+8*0`($sp)        # input "template"
279         oris    $in3,$padbit,`(1023+52+96)<<4`
280         stfd    $two32,`$LOCALS+8*1`($sp)
281         stfd    $two64,`$LOCALS+8*2`($sp)
282         stw     $in3,`$LOCALS+8*3+(0^$LITTLE_ENDIAN)`($sp)
283
284         li      $i1,4
285         li      $i2,8
286         li      $i3,12
287         $LWXLE  $in0,0,$inp                     # load input
288         $LWXLE  $in1,$i1,$inp
289         $LWXLE  $in2,$i2,$inp
290         $LWXLE  $in3,$i3,$inp
291         addi    $inp,$inp,16
292
293         stw     $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)      # fill "template"
294         stw     $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
295         stw     $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
296         stw     $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
297
298         mffs    $x0                             # original fpscr
299         lfd     $x1,`$LOCALS+8*4`($sp)          # new fpscr
300         lfd     $r0lo,8*4($ctx)                 # load key
301         lfd     $r0hi,8*5($ctx)
302         lfd     $r1lo,8*6($ctx)
303         lfd     $r1hi,8*7($ctx)
304         lfd     $r2lo,8*8($ctx)
305         lfd     $r2hi,8*9($ctx)
306         lfd     $r3lo,8*10($ctx)
307         lfd     $r3hi,8*11($ctx)
308         lfd     $s1lo,8*12($ctx)
309         lfd     $s1hi,8*13($ctx)
310         lfd     $s2lo,8*14($ctx)
311         lfd     $s2hi,8*15($ctx)
312         lfd     $s3lo,8*16($ctx)
313         lfd     $s3hi,8*17($ctx)
314
315         stfd    $x0,`$LOCALS+8*4`($sp)          # save original fpscr
316         mtfsf   255,$x1
317
318         addic   $len,$len,1
319         addze   r0,r0
320         slwi.   r0,r0,4
321         sub     $inp,$inp,r0                    # conditional rewind
322
323         lfd     $x0,`$LOCALS+8*0`($sp)
324         lfd     $x1,`$LOCALS+8*1`($sp)
325         lfd     $x2,`$LOCALS+8*2`($sp)
326         lfd     $x3,`$LOCALS+8*3`($sp)
327
328         fsub    $h0lo,$h0lo,$two0               # de-bias hash value
329          $LWXLE $in0,0,$inp                     # modulo-scheduled input load
330         fsub    $h1lo,$h1lo,$two32
331          $LWXLE $in1,$i1,$inp
332         fsub    $h2lo,$h2lo,$two64
333          $LWXLE $in2,$i2,$inp
334         fsub    $h3lo,$h3lo,$two96
335          $LWXLE $in3,$i3,$inp
336
337         fsub    $x0,$x0,$two0                   # de-bias input
338          addi   $inp,$inp,16
339         fsub    $x1,$x1,$two32
340         fsub    $x2,$x2,$two64
341         fsub    $x3,$x3,$two96
342
343         fadd    $x0,$x0,$h0lo                   # accumulate input
344          stw    $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)
345         fadd    $x1,$x1,$h1lo
346          stw    $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
347         fadd    $x2,$x2,$h2lo
348          stw    $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
349         fadd    $x3,$x3,$h3lo
350          stw    $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
351
352         b       Lentry
353
354 .align  4
355 Loop:
356         fsub    $y0,$y0,$two0                   # de-bias input
357          addic  $len,$len,1
358         fsub    $y1,$y1,$two32
359          addze  r0,r0
360         fsub    $y2,$y2,$two64
361          slwi.  r0,r0,4
362         fsub    $y3,$y3,$two96
363          sub    $inp,$inp,r0                    # conditional rewind
364
365         fadd    $h0lo,$h0lo,$y0                 # accumulate input
366         fadd    $h0hi,$h0hi,$y1
367         fadd    $h2lo,$h2lo,$y2
368         fadd    $h2hi,$h2hi,$y3
369
370         ######################################### base 2^48 -> base 2^32
371         fadd    $c1lo,$h1lo,$two64
372          $LWXLE $in0,0,$inp                     # modulo-scheduled input load
373         fadd    $c1hi,$h1hi,$two64
374          $LWXLE $in1,$i1,$inp
375         fadd    $c3lo,$h3lo,$two130
376          $LWXLE $in2,$i2,$inp
377         fadd    $c3hi,$h3hi,$two130
378          $LWXLE $in3,$i3,$inp
379         fadd    $c0lo,$h0lo,$two32
380          addi   $inp,$inp,16
381         fadd    $c0hi,$h0hi,$two32
382         fadd    $c2lo,$h2lo,$two96
383         fadd    $c2hi,$h2hi,$two96
384
385         fsub    $c1lo,$c1lo,$two64
386          stw    $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)      # fill "template"
387         fsub    $c1hi,$c1hi,$two64
388          stw    $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
389         fsub    $c3lo,$c3lo,$two130
390          stw    $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
391         fsub    $c3hi,$c3hi,$two130
392          stw    $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
393         fsub    $c0lo,$c0lo,$two32
394         fsub    $c0hi,$c0hi,$two32
395         fsub    $c2lo,$c2lo,$two96
396         fsub    $c2hi,$c2hi,$two96
397
398         fsub    $h1lo,$h1lo,$c1lo
399         fsub    $h1hi,$h1hi,$c1hi
400         fsub    $h3lo,$h3lo,$c3lo
401         fsub    $h3hi,$h3hi,$c3hi
402         fsub    $h2lo,$h2lo,$c2lo
403         fsub    $h2hi,$h2hi,$c2hi
404         fsub    $h0lo,$h0lo,$c0lo
405         fsub    $h0hi,$h0hi,$c0hi
406
407         fadd    $h1lo,$h1lo,$c0lo
408         fadd    $h1hi,$h1hi,$c0hi
409         fadd    $h3lo,$h3lo,$c2lo
410         fadd    $h3hi,$h3hi,$c2hi
411         fadd    $h2lo,$h2lo,$c1lo
412         fadd    $h2hi,$h2hi,$c1hi
413         fmadd   $h0lo,$c3lo,$five_two130,$h0lo
414         fmadd   $h0hi,$c3hi,$five_two130,$h0hi
415
416         fadd    $x1,$h1lo,$h1hi
417          lfd    $s1lo,8*12($ctx)                # reload constants
418         fadd    $x3,$h3lo,$h3hi
419          lfd    $s1hi,8*13($ctx)
420         fadd    $x2,$h2lo,$h2hi
421          lfd    $r3lo,8*10($ctx)
422         fadd    $x0,$h0lo,$h0hi
423          lfd    $r3hi,8*11($ctx)
424 Lentry:
425         fmul    $h0lo,$s3lo,$x1
426         fmul    $h0hi,$s3hi,$x1
427         fmul    $h2lo,$r1lo,$x1
428         fmul    $h2hi,$r1hi,$x1
429         fmul    $h1lo,$r0lo,$x1
430         fmul    $h1hi,$r0hi,$x1
431         fmul    $h3lo,$r2lo,$x1
432         fmul    $h3hi,$r2hi,$x1
433
434         fmadd   $h0lo,$s1lo,$x3,$h0lo
435         fmadd   $h0hi,$s1hi,$x3,$h0hi
436         fmadd   $h2lo,$s3lo,$x3,$h2lo
437         fmadd   $h2hi,$s3hi,$x3,$h2hi
438         fmadd   $h1lo,$s2lo,$x3,$h1lo
439         fmadd   $h1hi,$s2hi,$x3,$h1hi
440         fmadd   $h3lo,$r0lo,$x3,$h3lo
441         fmadd   $h3hi,$r0hi,$x3,$h3hi
442
443         fmadd   $h0lo,$s2lo,$x2,$h0lo
444         fmadd   $h0hi,$s2hi,$x2,$h0hi
445         fmadd   $h2lo,$r0lo,$x2,$h2lo
446         fmadd   $h2hi,$r0hi,$x2,$h2hi
447         fmadd   $h1lo,$s3lo,$x2,$h1lo
448         fmadd   $h1hi,$s3hi,$x2,$h1hi
449         fmadd   $h3lo,$r1lo,$x2,$h3lo
450         fmadd   $h3hi,$r1hi,$x2,$h3hi
451
452         fmadd   $h0lo,$r0lo,$x0,$h0lo
453          lfd    $y0,`$LOCALS+8*0`($sp)          # load [biased] input
454         fmadd   $h0hi,$r0hi,$x0,$h0hi
455          lfd    $y1,`$LOCALS+8*1`($sp)
456         fmadd   $h2lo,$r2lo,$x0,$h2lo
457          lfd    $y2,`$LOCALS+8*2`($sp)
458         fmadd   $h2hi,$r2hi,$x0,$h2hi
459          lfd    $y3,`$LOCALS+8*3`($sp)
460         fmadd   $h1lo,$r1lo,$x0,$h1lo
461         fmadd   $h1hi,$r1hi,$x0,$h1hi
462         fmadd   $h3lo,$r3lo,$x0,$h3lo
463         fmadd   $h3hi,$r3hi,$x0,$h3hi
464
465         bdnz    Loop
466
467         ######################################### base 2^48 -> base 2^32
468         fadd    $c0lo,$h0lo,$two32
469         fadd    $c0hi,$h0hi,$two32
470         fadd    $c2lo,$h2lo,$two96
471         fadd    $c2hi,$h2hi,$two96
472         fadd    $c1lo,$h1lo,$two64
473         fadd    $c1hi,$h1hi,$two64
474         fadd    $c3lo,$h3lo,$two130
475         fadd    $c3hi,$h3hi,$two130
476
477         fsub    $c0lo,$c0lo,$two32
478         fsub    $c0hi,$c0hi,$two32
479         fsub    $c2lo,$c2lo,$two96
480         fsub    $c2hi,$c2hi,$two96
481         fsub    $c1lo,$c1lo,$two64
482         fsub    $c1hi,$c1hi,$two64
483         fsub    $c3lo,$c3lo,$two130
484         fsub    $c3hi,$c3hi,$two130
485
486         fsub    $h1lo,$h1lo,$c1lo
487         fsub    $h1hi,$h1hi,$c1hi
488         fsub    $h3lo,$h3lo,$c3lo
489         fsub    $h3hi,$h3hi,$c3hi
490         fsub    $h2lo,$h2lo,$c2lo
491         fsub    $h2hi,$h2hi,$c2hi
492         fsub    $h0lo,$h0lo,$c0lo
493         fsub    $h0hi,$h0hi,$c0hi
494
495         fadd    $h1lo,$h1lo,$c0lo
496         fadd    $h1hi,$h1hi,$c0hi
497         fadd    $h3lo,$h3lo,$c2lo
498         fadd    $h3hi,$h3hi,$c2hi
499         fadd    $h2lo,$h2lo,$c1lo
500         fadd    $h2hi,$h2hi,$c1hi
501         fmadd   $h0lo,$c3lo,$five_two130,$h0lo
502         fmadd   $h0hi,$c3hi,$five_two130,$h0hi
503
504         fadd    $x1,$h1lo,$h1hi
505         fadd    $x3,$h3lo,$h3hi
506         fadd    $x2,$h2lo,$h2hi
507         fadd    $x0,$h0lo,$h0hi
508
509         lfd     $h0lo,`$LOCALS+8*4`($sp)        # pull saved fpscr
510         fadd    $x1,$x1,$two32                  # bias
511         fadd    $x3,$x3,$two96
512         fadd    $x2,$x2,$two64
513         fadd    $x0,$x0,$two0
514
515         stfd    $x1,8*1($ctx)                   # store [biased] hash value
516         stfd    $x3,8*3($ctx)
517         stfd    $x2,8*2($ctx)
518         stfd    $x0,8*0($ctx)
519
520         mtfsf   255,$h0lo                       # restore original fpscr
521         lfd     f14,`$FRAME-8*18`($sp)
522         lfd     f15,`$FRAME-8*17`($sp)
523         lfd     f16,`$FRAME-8*16`($sp)
524         lfd     f17,`$FRAME-8*15`($sp)
525         lfd     f18,`$FRAME-8*14`($sp)
526         lfd     f19,`$FRAME-8*13`($sp)
527         lfd     f20,`$FRAME-8*12`($sp)
528         lfd     f21,`$FRAME-8*11`($sp)
529         lfd     f22,`$FRAME-8*10`($sp)
530         lfd     f23,`$FRAME-8*9`($sp)
531         lfd     f24,`$FRAME-8*8`($sp)
532         lfd     f25,`$FRAME-8*7`($sp)
533         lfd     f26,`$FRAME-8*6`($sp)
534         lfd     f27,`$FRAME-8*5`($sp)
535         lfd     f28,`$FRAME-8*4`($sp)
536         lfd     f29,`$FRAME-8*3`($sp)
537         lfd     f30,`$FRAME-8*2`($sp)
538         lfd     f31,`$FRAME-8*1`($sp)
539         addi    $sp,$sp,$FRAME
540 Labort:
541         blr
542         .long   0
543         .byte   0,12,4,1,0x80,0,4,0
544 .size   .poly1305_blocks_fpu,.-.poly1305_blocks_fpu
545 ___
# .poly1305_emit_fpu ($ctx=r3, $mac=r4, $nonce=r5)
# Integer-only finalisation: reads the four biased hash doubles from the
# context as raw 32-bit words, reduces them, adds the nonce and writes
# the 16-byte result to $mac.  The whole routine lives in a bare block
# so that its register names and its own $FRAME stay local to it.
546 {
# $mac/$nonce reuse the registers that the other routines call $inp/$len.
547 my ($mac,$nonce)=($inp,$len);
548
# $h0..$h4 live in r7..r11; $d0..$d3 use r28..r31, which the prologue
# below saves and the epilogue restores.
549 my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3
550    ) = map("r$_",(7..11,28..31));
551 my $mask = "r0";
# Shadows the file-level $FRAME: six base slots plus four for r28..r31.
552 my $FRAME = (6+4)*$SIZE_T;
553
# Prologue and hash load.  For every 8-byte hash slot, $dN receives the
# word at offset 0^$LITTLE_ENDIAN (the half holding sign and exponent)
# and $hN the word at offset 4^$LITTLE_ENDIAN.  The top 12 bits of each
# $dN are cleared ("mask exponent"), then $d3 is folded: its low two
# bits go to $h4 and the rest becomes ($d3 & ~3) + ($d3 >> 2).
554 $code.=<<___;
555 .globl  .poly1305_emit_fpu
556 .align  4
557 .poly1305_emit_fpu:
558         $STU    $sp,-$FRAME($sp)
559         mflr    r0
560         $PUSH   r28,`$FRAME-$SIZE_T*4`($sp)
561         $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
562         $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
563         $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
564         $PUSH   r0,`$FRAME+$LRSAVE`($sp)
565
566         lwz     $d0,`8*0+(0^$LITTLE_ENDIAN)`($ctx)      # load hash
567         lwz     $h0,`8*0+(4^$LITTLE_ENDIAN)`($ctx)
568         lwz     $d1,`8*1+(0^$LITTLE_ENDIAN)`($ctx)
569         lwz     $h1,`8*1+(4^$LITTLE_ENDIAN)`($ctx)
570         lwz     $d2,`8*2+(0^$LITTLE_ENDIAN)`($ctx)
571         lwz     $h2,`8*2+(4^$LITTLE_ENDIAN)`($ctx)
572         lwz     $d3,`8*3+(0^$LITTLE_ENDIAN)`($ctx)
573         lwz     $h3,`8*3+(4^$LITTLE_ENDIAN)`($ctx)
574
575         lis     $mask,0xfff0
576         andc    $d0,$d0,$mask                   # mask exponent
577         andc    $d1,$d1,$mask
578         andc    $d2,$d2,$mask
579         andc    $d3,$d3,$mask                   # can be partially reduced...
580         li      $mask,3
581
582         srwi    $padbit,$d3,2                   # ... so reduce
583         and     $h4,$d3,$mask
584         andc    $d3,$d3,$mask
585         add     $d3,$d3,$padbit
586 ___
# 32-bit flavour: five 32-bit limbs chained through the carry bit
# (addc/adde/addze).  Adding 5 and inspecting bit 2 of the top limb
# yields an all-ones/all-zero mask that selects either the incremented
# or the original value; the four nonce words are then added.
587                                                 if ($SIZE_T==4) {
588 $code.=<<___;
589         addc    $h0,$h0,$d3
590         adde    $h1,$h1,$d0
591         adde    $h2,$h2,$d1
592         adde    $h3,$h3,$d2
593         addze   $h4,$h4
594
595         addic   $d0,$h0,5                       # compare to modulus
596         addze   $d1,$h1
597         addze   $d2,$h2
598         addze   $d3,$h3
599         addze   $mask,$h4
600
601         srwi    $mask,$mask,2                   # did it carry/borrow?
602         neg     $mask,$mask
603         srawi   $mask,$mask,31                  # mask
604
605         andc    $h0,$h0,$mask
606         and     $d0,$d0,$mask
607         andc    $h1,$h1,$mask
608         and     $d1,$d1,$mask
609         or      $h0,$h0,$d0
610         lwz     $d0,0($nonce)                   # load nonce
611         andc    $h2,$h2,$mask
612         and     $d2,$d2,$mask
613         or      $h1,$h1,$d1
614         lwz     $d1,4($nonce)
615         andc    $h3,$h3,$mask
616         and     $d3,$d3,$mask
617         or      $h2,$h2,$d2
618         lwz     $d2,8($nonce)
619         or      $h3,$h3,$d3
620         lwz     $d3,12($nonce)
621
622         addc    $h0,$h0,$d0                     # accumulate nonce
623         adde    $h1,$h1,$d1
624         adde    $h2,$h2,$d2
625         adde    $h3,$h3,$d3
626 ___
627                                                 } else {
# 64-bit flavour: limbs are summed in 64-bit registers, carries are
# moved up with srdi, and insrdi packs the limbs pairwise into $h0 and
# $h2 before the same add-5/mask selection is applied.
628 $code.=<<___;
629         add     $h0,$h0,$d3
630         add     $h1,$h1,$d0
631         add     $h2,$h2,$d1
632         add     $h3,$h3,$d2
633
634         srdi    $d0,$h0,32
635         add     $h1,$h1,$d0
636         srdi    $d1,$h1,32
637         add     $h2,$h2,$d1
638         srdi    $d2,$h2,32
639         add     $h3,$h3,$d2
640         srdi    $d3,$h3,32
641         add     $h4,$h4,$d3
642
643         insrdi  $h0,$h1,32,0
644         insrdi  $h2,$h3,32,0
645
646         addic   $d0,$h0,5                       # compare to modulus
647         addze   $d1,$h2
648         addze   $d2,$h4
649
650         srdi    $mask,$d2,2                     # did it carry/borrow?
651         neg     $mask,$mask
652         sradi   $mask,$mask,63                  # mask
653         ld      $d2,0($nonce)                   # load nonce
654         ld      $d3,8($nonce)
655
656         andc    $h0,$h0,$mask
657         and     $d0,$d0,$mask
658         andc    $h2,$h2,$mask
659         and     $d1,$d1,$mask
660         or      $h0,$h0,$d0
661         or      $h2,$h2,$d1
662 ___
# Big-endian only: swap the 32-bit halves of the two nonce doublewords
# that were fetched with ld above.
663 $code.=<<___    if (!$LITTLE_ENDIAN);
664         rotldi  $d2,$d2,32                      # flip nonce words
665         rotldi  $d3,$d3,32
666 ___
# Add the nonce and split the two 64-bit results back into 32-bit words
# ($h1 and $h3 receive the upper halves) for the common store code.
667 $code.=<<___;
668         addc    $h0,$h0,$d2                     # accumulate nonce
669         adde    $h2,$h2,$d3
670
671         srdi    $h1,$h0,32
672         srdi    $h3,$h2,32
673 ___
674                                                 }
# Result store, common to both word sizes: plain word stores on
# little-endian targets, byte-reversed stores (stwbrx) on big-endian.
675 $code.=<<___    if ($LITTLE_ENDIAN);
676         stw     $h0,0($mac)                     # write result
677         stw     $h1,4($mac)
678         stw     $h2,8($mac)
679         stw     $h3,12($mac)
680 ___
681 $code.=<<___    if (!$LITTLE_ENDIAN);
682         li      $d1,4
683         stwbrx  $h0,0,$mac                      # write result
684         li      $d2,8
685         stwbrx  $h1,$d1,$mac
686         li      $d3,12
687         stwbrx  $h2,$d2,$mac
688         stwbrx  $h3,$d3,$mac
689 ___
# Epilogue: restore r28..r31 and release the frame.
690 $code.=<<___;
691         $POP    r28,`$FRAME-$SIZE_T*4`($sp)
692         $POP    r29,`$FRAME-$SIZE_T*3`($sp)
693         $POP    r30,`$FRAME-$SIZE_T*2`($sp)
694         $POP    r31,`$FRAME-$SIZE_T*1`($sp)
695         addi    $sp,$sp,$FRAME
696         blr
697         .long   0
698         .byte   0,12,4,1,0x80,4,3,0
699 .size   .poly1305_emit_fpu,.-.poly1305_emit_fpu
700 ___
701 }
702 # Ugly hack here, because PPC assembler syntax seems to vary too
703 # much from platform to platform...
#
# LPICmeup is the position-independent way to find the constant table
# that follows it; poly1305_init_fpu reaches it with "bl LPICmeup".
# It parks the caller's return address in r0, uses "bcl 20,31,$+4" to
# put the address of the next instruction into LR, copies that into
# $len and adds 64-8, then restores LR from r0 and returns.  The mflr
# that reads LR sits 8 bytes after the label, and the six instructions
# plus the 12 bytes of .long/.byte data occupy 9*4 bytes, which
# ".space 64-9*4" pads to 64 - so $len ends up pointing at the first
# .quad.  "\$" and "\@" are escaped only to stop Perl from interpolating
# them inside the heredoc.
#
# Table slots as indexed from $len by poly1305_init_fpu: 8*0..8*5 are
# the bias constants and 5/2^130, 8*6..8*12 the 2^(52+16+...) constants
# used while splitting the key, and 8*13 the fpscr image.
704 $code.=<<___;
705 .align  6
706 LPICmeup:
707         mflr    r0
708         bcl     20,31,\$+4
709         mflr    $len    # vvvvvv "distance" between . and 1st data entry
710         addi    $len,$len,`64-8`        # borrow $len
711         mtlr    r0
712         blr
713         .long   0
714         .byte   0,12,0x14,0,0,0,0,0
715         .space  `64-9*4`
716
717 .quad   0x4330000000000000              # 2^(52+0)
718 .quad   0x4530000000000000              # 2^(52+32)
719 .quad   0x4730000000000000              # 2^(52+64)
720 .quad   0x4930000000000000              # 2^(52+96)
721 .quad   0x4b50000000000000              # 2^(52+130)
722
723 .quad   0x37f4000000000000              # 5/2^130
724
725 .quad   0x4430000000000000              # 2^(52+16+0)
726 .quad   0x4630000000000000              # 2^(52+16+32)
727 .quad   0x4830000000000000              # 2^(52+16+64)
728 .quad   0x4a30000000000000              # 2^(52+16+96)
729 .quad   0x3e30000000000000              # 2^(52+16+0-96)
730 .quad   0x4030000000000000              # 2^(52+16+32-96)
731 .quad   0x4230000000000000              # 2^(52+16+64-96)
732
733 .quad   0x0000000000000001              # fpscr: truncate, no exceptions
734 .asciz  "Poly1305 for PPC FPU, CRYPTOGAMS by <appro\@openssl.org>"
735 .align  4
736 ___
737
# Final pass: replace every `expr` in the accumulated assembly text with
# the value of evaluating expr as Perl (frame offsets and the like), then
# write the text to STDOUT, which is a pipe into the translator.
738 $code =~ s/\`([^\`]*)\`/eval $1/gem;
739 print $code;
# close() on a pipe waits for the child and returns false if it exited
# with a non-zero status or the buffered output could not be flushed.
# Fail loudly in that case instead of leaving a truncated or missing
# output file behind; $? carries the child status when $! is empty.
740 close STDOUT or die "error closing STDOUT: $! (child status $?)";