1 #! /usr/bin/env perl
2 # Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # ECP_NISTZ256 module for SPARCv9.
18 #
19 # February 2015.
20 #
21 # Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22 # http://eprint.iacr.org/2013/816. In the process of adaptation
23 # original .c module was made 32-bit savvy in order to make this
24 # implementation possible.
25 #
26 #                       with/without -DECP_NISTZ256_ASM
27 # UltraSPARC III        +12-18%
28 # SPARC T4              +99-550% (+66-150% on 32-bit Solaris)
29 #
30 # Ranges denote minimum and maximum improvement coefficients depending
31 # on benchmark. Lower coefficients are for ECDSA sign, server-side
32 # operation. Keep in mind that +200% means 3x improvement.
33
34 $output = pop;
35 open STDOUT,">$output";
36
37 $code.=<<___;
38 #include "sparc_arch.h"
39
40 #define LOCALS  (STACK_BIAS+STACK_FRAME)
41 #ifdef  __arch64__
42 .register       %g2,#scratch
43 .register       %g3,#scratch
44 # define STACK64_FRAME  STACK_FRAME
45 # define LOCALS64       LOCALS
46 #else
47 # define STACK64_FRAME  (2047+192)
48 # define LOCALS64       STACK64_FRAME
49 #endif
50
51 .section        ".text",#alloc,#execinstr
52 ___
53 ########################################################################
54 # Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
55 #
56 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
57 open TABLE,"<ecp_nistz256_table.c"              or
58 open TABLE,"<${dir}../ecp_nistz256_table.c"     or
59 die "failed to open ecp_nistz256_table.c:",$!;
60
61 use integer;
62
63 foreach(<TABLE>) {
64         s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
65 }
66 close TABLE;
67
68 # See ecp_nistz256_table.c for an explanation of why it's 64*16*37.
69 # 64*16*37-1 is because $#arr returns the last valid index of @arr, not
70 # the number of elements.
71 die "insane number of elements" if ($#arr != 64*16*37-1);
72
73 $code.=<<___;
74 .globl  ecp_nistz256_precomputed
75 .align  4096
76 ecp_nistz256_precomputed:
77 ___
78 ########################################################################
79 # this conversion scatters each P256_POINT_AFFINE into individual bytes
80 # at 64-byte intervals, similar to
81 #       1111222233334444
82 #       1234123412341234
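#
# In other words, within each 4096-byte block (64 points of 64 bytes each)
# byte k of point number idx lands at offset k*64+idx. A hypothetical Perl
# helper (illustrative only, not used by this module) reading one point
# back out of such a block would be:
#
#       sub ref_gather_point {
#           my ($block, $idx) = @_;     # $block - 4096-byte string, $idx - 0..63
#           join('', map { substr($block, $_*64 + $idx, 1) } (0..63));
#       }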
83 for(1..37) {
84         @tbl = splice(@arr,0,64*16);
85         for($i=0;$i<64;$i++) {
86                 undef @line;
87                 for($j=0;$j<64;$j++) {
88                         push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
89                 }
90                 $code.=".byte\t";
91                 $code.=join(',',map { sprintf "0x%02x",$_} @line);
92                 $code.="\n";
93         }
94 }
95
96 {{{
97 my ($rp,$ap,$bp)=map("%i$_",(0..2));
98 my @acc=map("%l$_",(0..7));
99 my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7)=(map("%o$_",(0..5)),"%g4","%g5");
100 my ($bi,$a0,$mask,$carry)=(map("%i$_",(3..5)),"%g1");
101 my ($rp_real,$ap_real)=("%g2","%g3");
102
103 $code.=<<___;
104 .type   ecp_nistz256_precomputed,#object
105 .size   ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
106 .align  64
107 .LRR:   ! 2^512 mod P precomputed for NIST P256 polynomial
108 .long   0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
109 .long   0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
110 .Lone:
111 .long   1,0,0,0,0,0,0,0
112 .asciz  "ECP_NISTZ256 for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
113
114 ! void  ecp_nistz256_to_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
115 .globl  ecp_nistz256_to_mont
116 .align  64
117 ecp_nistz256_to_mont:
118         save    %sp,-STACK_FRAME,%sp
119         nop
120 1:      call    .+8
121         add     %o7,.LRR-1b,$bp
122         call    __ecp_nistz256_mul_mont
123         nop
124         ret
125         restore
126 .type   ecp_nistz256_to_mont,#function
127 .size   ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
128
129 ! void  ecp_nistz256_from_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
130 .globl  ecp_nistz256_from_mont
131 .align  32
132 ecp_nistz256_from_mont:
133         save    %sp,-STACK_FRAME,%sp
134         nop
135 1:      call    .+8
136         add     %o7,.Lone-1b,$bp
137         call    __ecp_nistz256_mul_mont
138         nop
139         ret
140         restore
141 .type   ecp_nistz256_from_mont,#function
142 .size   ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
143
144 ! void  ecp_nistz256_mul_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8],
145 !                                             const BN_ULONG %i2[8]);
146 .globl  ecp_nistz256_mul_mont
147 .align  32
148 ecp_nistz256_mul_mont:
149         save    %sp,-STACK_FRAME,%sp
150         nop
151         call    __ecp_nistz256_mul_mont
152         nop
153         ret
154         restore
155 .type   ecp_nistz256_mul_mont,#function
156 .size   ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
157
158 ! void  ecp_nistz256_sqr_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
159 .globl  ecp_nistz256_sqr_mont
160 .align  32
161 ecp_nistz256_sqr_mont:
162         save    %sp,-STACK_FRAME,%sp
163         mov     $ap,$bp
164         call    __ecp_nistz256_mul_mont
165         nop
166         ret
167         restore
168 .type   ecp_nistz256_sqr_mont,#function
169 .size   ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
170 ___
171
172 ########################################################################
173 # A special thing to keep in mind is that $t0-$t7 hold 64-bit values,
174 # while all others are meant to hold 32-bit ones. "Meant to" means that
175 # additions to @acc[0-7] do "contaminate" the upper bits, but they are
176 # cleared before they can affect the outcome (follow 'and' with $mask).
177 # Also keep in mind that addition with carry is addition with a 32-bit
178 # carry, even though the CPU is 64-bit. [Addition with 64-bit carry was
179 # introduced in T3, see below for VIS3 code paths.]
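#
# A rough reference model (illustrative Perl/Math::BigInt sketch, not
# executed by this module) of what __ecp_nistz256_mul_mont computes, with
# $a and $b assumed to be 256-bit Math::BigInt inputs and $P the P-256
# prime. Because the least significant word of P is 0xffffffff (P == -1
# mod 2^32), the Montgomery "magic" digit is simply the least significant
# word of the accumulator:
#
#       my $acc = Math::BigInt->bzero();
#       for my $i (0..7) {
#           $acc += $a * (($b >> (32*$i)) & 0xffffffff);    # acc += a*b[i]
#           $acc += ($acc & 0xffffffff) * $P;               # low word -> 0
#           $acc >>= 32;                                    # and drop it
#       }
#       $acc -= $P if $acc >= $P;       # result = a*b*2^-256 mod P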
180
181 $code.=<<___;
182 .align  32
183 __ecp_nistz256_mul_mont:
184         ld      [$bp+0],$bi             ! b[0]
185         mov     -1,$mask
186         ld      [$ap+0],$a0
187         srl     $mask,0,$mask           ! 0xffffffff
188         ld      [$ap+4],$t1
189         ld      [$ap+8],$t2
190         ld      [$ap+12],$t3
191         ld      [$ap+16],$t4
192         ld      [$ap+20],$t5
193         ld      [$ap+24],$t6
194         ld      [$ap+28],$t7
195         mulx    $a0,$bi,$t0             ! a[0-7]*b[0], 64-bit results
196         mulx    $t1,$bi,$t1
197         mulx    $t2,$bi,$t2
198         mulx    $t3,$bi,$t3
199         mulx    $t4,$bi,$t4
200         mulx    $t5,$bi,$t5
201         mulx    $t6,$bi,$t6
202         mulx    $t7,$bi,$t7
203         srlx    $t0,32,@acc[1]          ! extract high parts
204         srlx    $t1,32,@acc[2]
205         srlx    $t2,32,@acc[3]
206         srlx    $t3,32,@acc[4]
207         srlx    $t4,32,@acc[5]
208         srlx    $t5,32,@acc[6]
209         srlx    $t6,32,@acc[7]
210         srlx    $t7,32,@acc[0]          ! "@acc[8]"
211         mov     0,$carry
212 ___
213 for($i=1;$i<8;$i++) {
214 $code.=<<___;
215         addcc   @acc[1],$t1,@acc[1]     ! accumulate high parts
216         ld      [$bp+4*$i],$bi          ! b[$i]
217         ld      [$ap+4],$t1             ! re-load a[1-7]
218         addccc  @acc[2],$t2,@acc[2]
219         addccc  @acc[3],$t3,@acc[3]
220         ld      [$ap+8],$t2
221         ld      [$ap+12],$t3
222         addccc  @acc[4],$t4,@acc[4]
223         addccc  @acc[5],$t5,@acc[5]
224         ld      [$ap+16],$t4
225         ld      [$ap+20],$t5
226         addccc  @acc[6],$t6,@acc[6]
227         addccc  @acc[7],$t7,@acc[7]
228         ld      [$ap+24],$t6
229         ld      [$ap+28],$t7
230         addccc  @acc[0],$carry,@acc[0]  ! "@acc[8]"
231         addc    %g0,%g0,$carry
232 ___
233         # Reduction iteration is normally performed by accumulating
234         # result of multiplication of modulus by "magic" digit [and
235         # omitting least significant word, which is guaranteed to
236         # be 0], but thanks to special form of modulus and "magic"
237         # digit being equal to least significant word, it can be
238         # performed with additions and subtractions alone. Indeed:
239         #
240         #        ffff.0001.0000.0000.0000.ffff.ffff.ffff
241         # *                                         abcd
242         # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
243         #
244         # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
245         # rewrite above as:
246         #
247         #   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
248         # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
249         # -      abcd.0000.0000.0000.0000.0000.0000.abcd
250         #
251         # or marking redundant operations:
252         #
253         #   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
254         # + abcd.0000.abcd.0000.0000.abcd.----.----.----
255         # -      abcd.----.----.----.----.----.----.----
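        #
        # A minimal Math::BigInt sketch (illustrative only, not executed by
        # this module) of the identity above: adding the least significant
        # word at bit positions 96, 192 and 256 and subtracting it at
        # positions 0 and 224 is the same as adding lsw*P, so the value
        # stays congruent mod P while its least significant word becomes
        # zero and can be dropped:
        #
        #       use Math::BigInt;
        #       my $P = (Math::BigInt->new(1) << 256) - (Math::BigInt->new(1) << 224)
        #             + (Math::BigInt->new(1) << 192) + (Math::BigInt->new(1) << 96) - 1;
        #       my $acc = Math::BigInt->from_hex("0123456789abcdef" x 4 . "deadbeef");
        #       my $lsw = $acc & 0xffffffff;
        #       my $red = $acc + ($lsw << 96) + ($lsw << 192) + ($lsw << 256)
        #                      - $lsw - ($lsw << 224);
        #       die unless ($red - $acc) % $P == 0;     # still congruent mod P
        #       die unless ($red & 0xffffffff) == 0;    # low word is now zero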
256
257 $code.=<<___;
258         ! multiplication-less reduction
259         addcc   @acc[3],$t0,@acc[3]     ! r[3]+=r[0]
260         addccc  @acc[4],%g0,@acc[4]     ! r[4]+=0
261          and    @acc[1],$mask,@acc[1]
262          and    @acc[2],$mask,@acc[2]
263         addccc  @acc[5],%g0,@acc[5]     ! r[5]+=0
264         addccc  @acc[6],$t0,@acc[6]     ! r[6]+=r[0]
265          and    @acc[3],$mask,@acc[3]
266          and    @acc[4],$mask,@acc[4]
267         addccc  @acc[7],%g0,@acc[7]     ! r[7]+=0
268         addccc  @acc[0],$t0,@acc[0]     ! r[8]+=r[0]    "@acc[8]"
269          and    @acc[5],$mask,@acc[5]
270          and    @acc[6],$mask,@acc[6]
271         addc    $carry,%g0,$carry       ! top-most carry
272         subcc   @acc[7],$t0,@acc[7]     ! r[7]-=r[0]
273         subccc  @acc[0],%g0,@acc[0]     ! r[8]-=0       "@acc[8]"
274         subc    $carry,%g0,$carry       ! top-most carry
275          and    @acc[7],$mask,@acc[7]
276          and    @acc[0],$mask,@acc[0]   ! "@acc[8]"
277 ___
278         push(@acc,shift(@acc));         # rotate registers to "omit" acc[0]
279 $code.=<<___;
280         mulx    $a0,$bi,$t0             ! a[0-7]*b[$i], 64-bit results
281         mulx    $t1,$bi,$t1
282         mulx    $t2,$bi,$t2
283         mulx    $t3,$bi,$t3
284         mulx    $t4,$bi,$t4
285         mulx    $t5,$bi,$t5
286         mulx    $t6,$bi,$t6
287         mulx    $t7,$bi,$t7
288         add     @acc[0],$t0,$t0         ! accumulate low parts, can't overflow
289         add     @acc[1],$t1,$t1
290         srlx    $t0,32,@acc[1]          ! extract high parts
291         add     @acc[2],$t2,$t2
292         srlx    $t1,32,@acc[2]
293         add     @acc[3],$t3,$t3
294         srlx    $t2,32,@acc[3]
295         add     @acc[4],$t4,$t4
296         srlx    $t3,32,@acc[4]
297         add     @acc[5],$t5,$t5
298         srlx    $t4,32,@acc[5]
299         add     @acc[6],$t6,$t6
300         srlx    $t5,32,@acc[6]
301         add     @acc[7],$t7,$t7
302         srlx    $t6,32,@acc[7]
303         srlx    $t7,32,@acc[0]          ! "@acc[8]"
304 ___
305 }
306 $code.=<<___;
307         addcc   @acc[1],$t1,@acc[1]     ! accumulate high parts
308         addccc  @acc[2],$t2,@acc[2]
309         addccc  @acc[3],$t3,@acc[3]
310         addccc  @acc[4],$t4,@acc[4]
311         addccc  @acc[5],$t5,@acc[5]
312         addccc  @acc[6],$t6,@acc[6]
313         addccc  @acc[7],$t7,@acc[7]
314         addccc  @acc[0],$carry,@acc[0]  ! "@acc[8]"
315         addc    %g0,%g0,$carry
316
317         addcc   @acc[3],$t0,@acc[3]     ! multiplication-less reduction
318         addccc  @acc[4],%g0,@acc[4]
319         addccc  @acc[5],%g0,@acc[5]
320         addccc  @acc[6],$t0,@acc[6]
321         addccc  @acc[7],%g0,@acc[7]
322         addccc  @acc[0],$t0,@acc[0]     ! "@acc[8]"
323         addc    $carry,%g0,$carry
324         subcc   @acc[7],$t0,@acc[7]
325         subccc  @acc[0],%g0,@acc[0]     ! "@acc[8]"
326         subc    $carry,%g0,$carry       ! top-most carry
327 ___
328         push(@acc,shift(@acc));         # rotate registers to omit acc[0]
329 $code.=<<___;
330         ! Final step is "if result > mod, subtract mod", but we do it
331         ! "other way around", namely subtract modulus from result
332         ! and if it borrowed, add modulus back.
333
334         subcc   @acc[0],-1,@acc[0]      ! subtract modulus
335         subccc  @acc[1],-1,@acc[1]
336         subccc  @acc[2],-1,@acc[2]
337         subccc  @acc[3],0,@acc[3]
338         subccc  @acc[4],0,@acc[4]
339         subccc  @acc[5],0,@acc[5]
340         subccc  @acc[6],1,@acc[6]
341         subccc  @acc[7],-1,@acc[7]
342         subc    $carry,0,$carry         ! broadcast borrow bit
343
344         ! Note that because mod has special form, i.e. consists of
345         ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
346         ! using value of broadcasted borrow and the borrow bit itself.
347         ! To minimize dependency chain we first broadcast and then
348         ! extract the bit by negating (follow $bi).
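        !
        ! To illustrate (sketch only): with borrow being either 0 or 1,
        !       mask = 0 - borrow       ! broadcast borrow, kept in $carry
        !       bit  = borrow           ! kept in $bi
        ! and the value added below is (mask,mask,mask,0,0,0,bit,mask),
        ! i.e. the modulus if there was a borrow and zero otherwise.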
349
350         addcc   @acc[0],$carry,@acc[0]  ! add modulus or zero
351         addccc  @acc[1],$carry,@acc[1]
352         neg     $carry,$bi
353         st      @acc[0],[$rp]
354         addccc  @acc[2],$carry,@acc[2]
355         st      @acc[1],[$rp+4]
356         addccc  @acc[3],0,@acc[3]
357         st      @acc[2],[$rp+8]
358         addccc  @acc[4],0,@acc[4]
359         st      @acc[3],[$rp+12]
360         addccc  @acc[5],0,@acc[5]
361         st      @acc[4],[$rp+16]
362         addccc  @acc[6],$bi,@acc[6]
363         st      @acc[5],[$rp+20]
364         addc    @acc[7],$carry,@acc[7]
365         st      @acc[6],[$rp+24]
366         retl
367         st      @acc[7],[$rp+28]
368 .type   __ecp_nistz256_mul_mont,#function
369 .size   __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
370
371 ! void  ecp_nistz256_add(BN_ULONG %i0[8],const BN_ULONG %i1[8],
372 !                                        const BN_ULONG %i2[8]);
373 .globl  ecp_nistz256_add
374 .align  32
375 ecp_nistz256_add:
376         save    %sp,-STACK_FRAME,%sp
377         ld      [$ap],@acc[0]
378         ld      [$ap+4],@acc[1]
379         ld      [$ap+8],@acc[2]
380         ld      [$ap+12],@acc[3]
381         ld      [$ap+16],@acc[4]
382         ld      [$ap+20],@acc[5]
383         ld      [$ap+24],@acc[6]
384         call    __ecp_nistz256_add
385         ld      [$ap+28],@acc[7]
386         ret
387         restore
388 .type   ecp_nistz256_add,#function
389 .size   ecp_nistz256_add,.-ecp_nistz256_add
390
391 .align  32
392 __ecp_nistz256_add:
393         ld      [$bp+0],$t0             ! b[0]
394         ld      [$bp+4],$t1
395         ld      [$bp+8],$t2
396         ld      [$bp+12],$t3
397         addcc   @acc[0],$t0,@acc[0]
398         ld      [$bp+16],$t4
399         ld      [$bp+20],$t5
400         addccc  @acc[1],$t1,@acc[1]
401         ld      [$bp+24],$t6
402         ld      [$bp+28],$t7
403         addccc  @acc[2],$t2,@acc[2]
404         addccc  @acc[3],$t3,@acc[3]
405         addccc  @acc[4],$t4,@acc[4]
406         addccc  @acc[5],$t5,@acc[5]
407         addccc  @acc[6],$t6,@acc[6]
408         addccc  @acc[7],$t7,@acc[7]
409         addc    %g0,%g0,$carry
410
411 .Lreduce_by_sub:
412
413         ! if a+b >= modulus, subtract modulus.
414         !
415         ! But since comparison implies subtraction, we subtract
416         ! modulus and then add it back if subraction borrowed.
417
418         subcc   @acc[0],-1,@acc[0]
419         subccc  @acc[1],-1,@acc[1]
420         subccc  @acc[2],-1,@acc[2]
421         subccc  @acc[3], 0,@acc[3]
422         subccc  @acc[4], 0,@acc[4]
423         subccc  @acc[5], 0,@acc[5]
424         subccc  @acc[6], 1,@acc[6]
425         subccc  @acc[7],-1,@acc[7]
426         subc    $carry,0,$carry
427
428         ! Note that because mod has special form, i.e. consists of
429         ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
430         ! using value of borrow and its negative.
431
432         addcc   @acc[0],$carry,@acc[0]  ! add synthesized modulus
433         addccc  @acc[1],$carry,@acc[1]
434         neg     $carry,$bi
435         st      @acc[0],[$rp]
436         addccc  @acc[2],$carry,@acc[2]
437         st      @acc[1],[$rp+4]
438         addccc  @acc[3],0,@acc[3]
439         st      @acc[2],[$rp+8]
440         addccc  @acc[4],0,@acc[4]
441         st      @acc[3],[$rp+12]
442         addccc  @acc[5],0,@acc[5]
443         st      @acc[4],[$rp+16]
444         addccc  @acc[6],$bi,@acc[6]
445         st      @acc[5],[$rp+20]
446         addc    @acc[7],$carry,@acc[7]
447         st      @acc[6],[$rp+24]
448         retl
449         st      @acc[7],[$rp+28]
450 .type   __ecp_nistz256_add,#function
451 .size   __ecp_nistz256_add,.-__ecp_nistz256_add
452
453 ! void  ecp_nistz256_mul_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
454 .globl  ecp_nistz256_mul_by_2
455 .align  32
456 ecp_nistz256_mul_by_2:
457         save    %sp,-STACK_FRAME,%sp
458         ld      [$ap],@acc[0]
459         ld      [$ap+4],@acc[1]
460         ld      [$ap+8],@acc[2]
461         ld      [$ap+12],@acc[3]
462         ld      [$ap+16],@acc[4]
463         ld      [$ap+20],@acc[5]
464         ld      [$ap+24],@acc[6]
465         call    __ecp_nistz256_mul_by_2
466         ld      [$ap+28],@acc[7]
467         ret
468         restore
469 .type   ecp_nistz256_mul_by_2,#function
470 .size   ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
471
472 .align  32
473 __ecp_nistz256_mul_by_2:
474         addcc   @acc[0],@acc[0],@acc[0] ! a+a=2*a
475         addccc  @acc[1],@acc[1],@acc[1]
476         addccc  @acc[2],@acc[2],@acc[2]
477         addccc  @acc[3],@acc[3],@acc[3]
478         addccc  @acc[4],@acc[4],@acc[4]
479         addccc  @acc[5],@acc[5],@acc[5]
480         addccc  @acc[6],@acc[6],@acc[6]
481         addccc  @acc[7],@acc[7],@acc[7]
482         b       .Lreduce_by_sub
483         addc    %g0,%g0,$carry
484 .type   __ecp_nistz256_mul_by_2,#function
485 .size   __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
486
487 ! void  ecp_nistz256_mul_by_3(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
488 .globl  ecp_nistz256_mul_by_3
489 .align  32
490 ecp_nistz256_mul_by_3:
491         save    %sp,-STACK_FRAME,%sp
492         ld      [$ap],@acc[0]
493         ld      [$ap+4],@acc[1]
494         ld      [$ap+8],@acc[2]
495         ld      [$ap+12],@acc[3]
496         ld      [$ap+16],@acc[4]
497         ld      [$ap+20],@acc[5]
498         ld      [$ap+24],@acc[6]
499         call    __ecp_nistz256_mul_by_3
500         ld      [$ap+28],@acc[7]
501         ret
502         restore
503 .type   ecp_nistz256_mul_by_3,#function
504 .size   ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
505
506 .align  32
507 __ecp_nistz256_mul_by_3:
508         addcc   @acc[0],@acc[0],$t0     ! a+a=2*a
509         addccc  @acc[1],@acc[1],$t1
510         addccc  @acc[2],@acc[2],$t2
511         addccc  @acc[3],@acc[3],$t3
512         addccc  @acc[4],@acc[4],$t4
513         addccc  @acc[5],@acc[5],$t5
514         addccc  @acc[6],@acc[6],$t6
515         addccc  @acc[7],@acc[7],$t7
516         addc    %g0,%g0,$carry
517
518         subcc   $t0,-1,$t0              ! .Lreduce_by_sub but without stores
519         subccc  $t1,-1,$t1
520         subccc  $t2,-1,$t2
521         subccc  $t3, 0,$t3
522         subccc  $t4, 0,$t4
523         subccc  $t5, 0,$t5
524         subccc  $t6, 1,$t6
525         subccc  $t7,-1,$t7
526         subc    $carry,0,$carry
527
528         addcc   $t0,$carry,$t0          ! add synthesized modulus
529         addccc  $t1,$carry,$t1
530         neg     $carry,$bi
531         addccc  $t2,$carry,$t2
532         addccc  $t3,0,$t3
533         addccc  $t4,0,$t4
534         addccc  $t5,0,$t5
535         addccc  $t6,$bi,$t6
536         addc    $t7,$carry,$t7
537
538         addcc   $t0,@acc[0],@acc[0]     ! 2*a+a=3*a
539         addccc  $t1,@acc[1],@acc[1]
540         addccc  $t2,@acc[2],@acc[2]
541         addccc  $t3,@acc[3],@acc[3]
542         addccc  $t4,@acc[4],@acc[4]
543         addccc  $t5,@acc[5],@acc[5]
544         addccc  $t6,@acc[6],@acc[6]
545         addccc  $t7,@acc[7],@acc[7]
546         b       .Lreduce_by_sub
547         addc    %g0,%g0,$carry
548 .type   __ecp_nistz256_mul_by_3,#function
549 .size   __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
550
551 ! void  ecp_nistz256_sub(BN_ULONG %i0[8],const BN_ULONG %i1[8],
552 !                                        const BN_ULONG %i2[8]);
553 .globl  ecp_nistz256_sub
554 .align  32
555 ecp_nistz256_sub:
556         save    %sp,-STACK_FRAME,%sp
557         ld      [$ap],@acc[0]
558         ld      [$ap+4],@acc[1]
559         ld      [$ap+8],@acc[2]
560         ld      [$ap+12],@acc[3]
561         ld      [$ap+16],@acc[4]
562         ld      [$ap+20],@acc[5]
563         ld      [$ap+24],@acc[6]
564         call    __ecp_nistz256_sub_from
565         ld      [$ap+28],@acc[7]
566         ret
567         restore
568 .type   ecp_nistz256_sub,#function
569 .size   ecp_nistz256_sub,.-ecp_nistz256_sub
570
571 ! void  ecp_nistz256_neg(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
572 .globl  ecp_nistz256_neg
573 .align  32
574 ecp_nistz256_neg:
575         save    %sp,-STACK_FRAME,%sp
576         mov     $ap,$bp
577         mov     0,@acc[0]
578         mov     0,@acc[1]
579         mov     0,@acc[2]
580         mov     0,@acc[3]
581         mov     0,@acc[4]
582         mov     0,@acc[5]
583         mov     0,@acc[6]
584         call    __ecp_nistz256_sub_from
585         mov     0,@acc[7]
586         ret
587         restore
588 .type   ecp_nistz256_neg,#function
589 .size   ecp_nistz256_neg,.-ecp_nistz256_neg
590
591 .align  32
592 __ecp_nistz256_sub_from:
593         ld      [$bp+0],$t0             ! b[0]
594         ld      [$bp+4],$t1
595         ld      [$bp+8],$t2
596         ld      [$bp+12],$t3
597         subcc   @acc[0],$t0,@acc[0]
598         ld      [$bp+16],$t4
599         ld      [$bp+20],$t5
600         subccc  @acc[1],$t1,@acc[1]
601         subccc  @acc[2],$t2,@acc[2]
602         ld      [$bp+24],$t6
603         ld      [$bp+28],$t7
604         subccc  @acc[3],$t3,@acc[3]
605         subccc  @acc[4],$t4,@acc[4]
606         subccc  @acc[5],$t5,@acc[5]
607         subccc  @acc[6],$t6,@acc[6]
608         subccc  @acc[7],$t7,@acc[7]
609         subc    %g0,%g0,$carry          ! broadcast borrow bit
610
611 .Lreduce_by_add:
612
613         ! if a-b borrows, add modulus.
614         !
615         ! Note that because mod has special form, i.e. consists of
616         ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
617         ! using value of broadcasted borrow and the borrow bit itself.
618         ! To minimize dependency chain we first broadcast and then
619         ! extract the bit by negating (follow $bi).
620
621         addcc   @acc[0],$carry,@acc[0]  ! add synthesized modulus
622         addccc  @acc[1],$carry,@acc[1]
623         neg     $carry,$bi
624         st      @acc[0],[$rp]
625         addccc  @acc[2],$carry,@acc[2]
626         st      @acc[1],[$rp+4]
627         addccc  @acc[3],0,@acc[3]
628         st      @acc[2],[$rp+8]
629         addccc  @acc[4],0,@acc[4]
630         st      @acc[3],[$rp+12]
631         addccc  @acc[5],0,@acc[5]
632         st      @acc[4],[$rp+16]
633         addccc  @acc[6],$bi,@acc[6]
634         st      @acc[5],[$rp+20]
635         addc    @acc[7],$carry,@acc[7]
636         st      @acc[6],[$rp+24]
637         retl
638         st      @acc[7],[$rp+28]
639 .type   __ecp_nistz256_sub_from,#function
640 .size   __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
641
642 .align  32
643 __ecp_nistz256_sub_morf:
644         ld      [$bp+0],$t0             ! b[0]
645         ld      [$bp+4],$t1
646         ld      [$bp+8],$t2
647         ld      [$bp+12],$t3
648         subcc   $t0,@acc[0],@acc[0]
649         ld      [$bp+16],$t4
650         ld      [$bp+20],$t5
651         subccc  $t1,@acc[1],@acc[1]
652         subccc  $t2,@acc[2],@acc[2]
653         ld      [$bp+24],$t6
654         ld      [$bp+28],$t7
655         subccc  $t3,@acc[3],@acc[3]
656         subccc  $t4,@acc[4],@acc[4]
657         subccc  $t5,@acc[5],@acc[5]
658         subccc  $t6,@acc[6],@acc[6]
659         subccc  $t7,@acc[7],@acc[7]
660         b       .Lreduce_by_add
661         subc    %g0,%g0,$carry          ! broadcast borrow bit
662 .type   __ecp_nistz256_sub_morf,#function
663 .size   __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
664
665 ! void  ecp_nistz256_div_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
666 .globl  ecp_nistz256_div_by_2
667 .align  32
668 ecp_nistz256_div_by_2:
669         save    %sp,-STACK_FRAME,%sp
670         ld      [$ap],@acc[0]
671         ld      [$ap+4],@acc[1]
672         ld      [$ap+8],@acc[2]
673         ld      [$ap+12],@acc[3]
674         ld      [$ap+16],@acc[4]
675         ld      [$ap+20],@acc[5]
676         ld      [$ap+24],@acc[6]
677         call    __ecp_nistz256_div_by_2
678         ld      [$ap+28],@acc[7]
679         ret
680         restore
681 .type   ecp_nistz256_div_by_2,#function
682 .size   ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
683
684 .align  32
685 __ecp_nistz256_div_by_2:
686         ! ret = (a is odd ? a+mod : a) >> 1
687
688         and     @acc[0],1,$bi
689         neg     $bi,$carry
690         addcc   @acc[0],$carry,@acc[0]
691         addccc  @acc[1],$carry,@acc[1]
692         addccc  @acc[2],$carry,@acc[2]
693         addccc  @acc[3],0,@acc[3]
694         addccc  @acc[4],0,@acc[4]
695         addccc  @acc[5],0,@acc[5]
696         addccc  @acc[6],$bi,@acc[6]
697         addccc  @acc[7],$carry,@acc[7]
698         addc    %g0,%g0,$carry
699
700         ! ret >>= 1
701
702         srl     @acc[0],1,@acc[0]
703         sll     @acc[1],31,$t0
704         srl     @acc[1],1,@acc[1]
705         or      @acc[0],$t0,@acc[0]
706         sll     @acc[2],31,$t1
707         srl     @acc[2],1,@acc[2]
708         or      @acc[1],$t1,@acc[1]
709         sll     @acc[3],31,$t2
710         st      @acc[0],[$rp]
711         srl     @acc[3],1,@acc[3]
712         or      @acc[2],$t2,@acc[2]
713         sll     @acc[4],31,$t3
714         st      @acc[1],[$rp+4]
715         srl     @acc[4],1,@acc[4]
716         or      @acc[3],$t3,@acc[3]
717         sll     @acc[5],31,$t4
718         st      @acc[2],[$rp+8]
719         srl     @acc[5],1,@acc[5]
720         or      @acc[4],$t4,@acc[4]
721         sll     @acc[6],31,$t5
722         st      @acc[3],[$rp+12]
723         srl     @acc[6],1,@acc[6]
724         or      @acc[5],$t5,@acc[5]
725         sll     @acc[7],31,$t6
726         st      @acc[4],[$rp+16]
727         srl     @acc[7],1,@acc[7]
728         or      @acc[6],$t6,@acc[6]
729         sll     $carry,31,$t7
730         st      @acc[5],[$rp+20]
731         or      @acc[7],$t7,@acc[7]
732         st      @acc[6],[$rp+24]
733         retl
734         st      @acc[7],[$rp+28]
735 .type   __ecp_nistz256_div_by_2,#function
736 .size   __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
737 ___
738
739 ########################################################################
740 # The following subroutines are "literal" implementations of those found
741 # in ecp_nistz256.c
742 #
743 ########################################################################
744 # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
745 #
746 {
747 my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
748 # above map() describes stack layout with 4 temporary
749 # 256-bit vectors on top.
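# With LOCALS defined above, this places S, M, Zsqr and tmp0 at
# %sp+LOCALS+0, +32, +64 and +96 respectively.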
750
751 $code.=<<___;
752 #ifdef __PIC__
753 SPARC_PIC_THUNK(%g1)
754 #endif
755
756 .globl  ecp_nistz256_point_double
757 .align  32
758 ecp_nistz256_point_double:
759         SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
760         ld      [%g1],%g1               ! OPENSSL_sparcv9cap_P[0]
761         and     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
762         cmp     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
763         be      ecp_nistz256_point_double_vis3
764         nop
765
766         save    %sp,-STACK_FRAME-32*4,%sp
767
768         mov     $rp,$rp_real
769         mov     $ap,$ap_real
770
771 .Lpoint_double_shortcut:
772         ld      [$ap+32],@acc[0]
773         ld      [$ap+32+4],@acc[1]
774         ld      [$ap+32+8],@acc[2]
775         ld      [$ap+32+12],@acc[3]
776         ld      [$ap+32+16],@acc[4]
777         ld      [$ap+32+20],@acc[5]
778         ld      [$ap+32+24],@acc[6]
779         ld      [$ap+32+28],@acc[7]
780         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(S, in_y);
781         add     %sp,LOCALS+$S,$rp
782
783         add     $ap_real,64,$bp
784         add     $ap_real,64,$ap
785         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Zsqr, in_z);
786         add     %sp,LOCALS+$Zsqr,$rp
787
788         add     $ap_real,0,$bp
789         call    __ecp_nistz256_add      ! p256_add(M, Zsqr, in_x);
790         add     %sp,LOCALS+$M,$rp
791
792         add     %sp,LOCALS+$S,$bp
793         add     %sp,LOCALS+$S,$ap
794         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(S, S);
795         add     %sp,LOCALS+$S,$rp
796
797         ld      [$ap_real],@acc[0]
798         add     %sp,LOCALS+$Zsqr,$bp
799         ld      [$ap_real+4],@acc[1]
800         ld      [$ap_real+8],@acc[2]
801         ld      [$ap_real+12],@acc[3]
802         ld      [$ap_real+16],@acc[4]
803         ld      [$ap_real+20],@acc[5]
804         ld      [$ap_real+24],@acc[6]
805         ld      [$ap_real+28],@acc[7]
806         call    __ecp_nistz256_sub_from ! p256_sub(Zsqr, in_x, Zsqr);
807         add     %sp,LOCALS+$Zsqr,$rp
808
809         add     $ap_real,32,$bp
810         add     $ap_real,64,$ap
811         call    __ecp_nistz256_mul_mont ! p256_mul_mont(tmp0, in_z, in_y);
812         add     %sp,LOCALS+$tmp0,$rp
813
814         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(res_z, tmp0);
815         add     $rp_real,64,$rp
816
817         add     %sp,LOCALS+$Zsqr,$bp
818         add     %sp,LOCALS+$M,$ap
819         call    __ecp_nistz256_mul_mont ! p256_mul_mont(M, M, Zsqr);
820         add     %sp,LOCALS+$M,$rp
821
822         call    __ecp_nistz256_mul_by_3 ! p256_mul_by_3(M, M);
823         add     %sp,LOCALS+$M,$rp
824
825         add     %sp,LOCALS+$S,$bp
826         add     %sp,LOCALS+$S,$ap
827         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(tmp0, S);
828         add     %sp,LOCALS+$tmp0,$rp
829
830         call    __ecp_nistz256_div_by_2 ! p256_div_by_2(res_y, tmp0);
831         add     $rp_real,32,$rp
832
833         add     $ap_real,0,$bp
834         add     %sp,LOCALS+$S,$ap
835         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, in_x);
836         add     %sp,LOCALS+$S,$rp
837
838         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(tmp0, S);
839         add     %sp,LOCALS+$tmp0,$rp
840
841         add     %sp,LOCALS+$M,$bp
842         add     %sp,LOCALS+$M,$ap
843         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(res_x, M);
844         add     $rp_real,0,$rp
845
846         add     %sp,LOCALS+$tmp0,$bp
847         call    __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, tmp0);
848         add     $rp_real,0,$rp
849
850         add     %sp,LOCALS+$S,$bp
851         call    __ecp_nistz256_sub_morf ! p256_sub(S, S, res_x);
852         add     %sp,LOCALS+$S,$rp
853
854         add     %sp,LOCALS+$M,$bp
855         add     %sp,LOCALS+$S,$ap
856         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, M);
857         add     %sp,LOCALS+$S,$rp
858
859         add     $rp_real,32,$bp
860         call    __ecp_nistz256_sub_from ! p256_sub(res_y, S, res_y);
861         add     $rp_real,32,$rp
862
863         ret
864         restore
865 .type   ecp_nistz256_point_double,#function
866 .size   ecp_nistz256_point_double,.-ecp_nistz256_point_double
867 ___
868 }
869
870 ########################################################################
871 # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
872 #                             const P256_POINT *in2);
873 {
874 my ($res_x,$res_y,$res_z,
875     $H,$Hsqr,$R,$Rsqr,$Hcub,
876     $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
877 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
878
879 # above map() describes stack layout with 12 temporary
880 # 256-bit vectors on top. Then we reserve some space for
881 # !in1infty, !in2infty, result of check for zero and return pointer.
882
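# With LOCALS defined above, the temporaries live at %sp+LOCALS+32*n for
# n=0..11 (res_x through S2), while !in1infty, !in2infty, the zero-check
# result and the saved result pointer are kept at %fp+STACK_BIAS-16, -12,
# -20 and -8 respectively (see the stores below).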
883 my $bp_real=$rp_real;
884
885 $code.=<<___;
886 .globl  ecp_nistz256_point_add
887 .align  32
888 ecp_nistz256_point_add:
889         SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
890         ld      [%g1],%g1               ! OPENSSL_sparcv9cap_P[0]
891         and     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
892         cmp     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
893         be      ecp_nistz256_point_add_vis3
894         nop
895
896         save    %sp,-STACK_FRAME-32*12-32,%sp
897
898         stx     $rp,[%fp+STACK_BIAS-8]  ! off-load $rp
899         mov     $ap,$ap_real
900         mov     $bp,$bp_real
901
902         ld      [$bp],@acc[0]           ! in2_x
903         ld      [$bp+4],@acc[1]
904         ld      [$bp+8],@acc[2]
905         ld      [$bp+12],@acc[3]
906         ld      [$bp+16],@acc[4]
907         ld      [$bp+20],@acc[5]
908         ld      [$bp+24],@acc[6]
909         ld      [$bp+28],@acc[7]
910         ld      [$bp+32],$t0            ! in2_y
911         ld      [$bp+32+4],$t1
912         ld      [$bp+32+8],$t2
913         ld      [$bp+32+12],$t3
914         ld      [$bp+32+16],$t4
915         ld      [$bp+32+20],$t5
916         ld      [$bp+32+24],$t6
917         ld      [$bp+32+28],$t7
918         or      @acc[1],@acc[0],@acc[0]
919         or      @acc[3],@acc[2],@acc[2]
920         or      @acc[5],@acc[4],@acc[4]
921         or      @acc[7],@acc[6],@acc[6]
922         or      @acc[2],@acc[0],@acc[0]
923         or      @acc[6],@acc[4],@acc[4]
924         or      @acc[4],@acc[0],@acc[0]
925         or      $t1,$t0,$t0
926         or      $t3,$t2,$t2
927         or      $t5,$t4,$t4
928         or      $t7,$t6,$t6
929         or      $t2,$t0,$t0
930         or      $t6,$t4,$t4
931         or      $t4,$t0,$t0
932         or      @acc[0],$t0,$t0         ! !in2infty
933         movrnz  $t0,-1,$t0
934         st      $t0,[%fp+STACK_BIAS-12]
935
936         ld      [$ap],@acc[0]           ! in1_x
937         ld      [$ap+4],@acc[1]
938         ld      [$ap+8],@acc[2]
939         ld      [$ap+12],@acc[3]
940         ld      [$ap+16],@acc[4]
941         ld      [$ap+20],@acc[5]
942         ld      [$ap+24],@acc[6]
943         ld      [$ap+28],@acc[7]
944         ld      [$ap+32],$t0            ! in1_y
945         ld      [$ap+32+4],$t1
946         ld      [$ap+32+8],$t2
947         ld      [$ap+32+12],$t3
948         ld      [$ap+32+16],$t4
949         ld      [$ap+32+20],$t5
950         ld      [$ap+32+24],$t6
951         ld      [$ap+32+28],$t7
952         or      @acc[1],@acc[0],@acc[0]
953         or      @acc[3],@acc[2],@acc[2]
954         or      @acc[5],@acc[4],@acc[4]
955         or      @acc[7],@acc[6],@acc[6]
956         or      @acc[2],@acc[0],@acc[0]
957         or      @acc[6],@acc[4],@acc[4]
958         or      @acc[4],@acc[0],@acc[0]
959         or      $t1,$t0,$t0
960         or      $t3,$t2,$t2
961         or      $t5,$t4,$t4
962         or      $t7,$t6,$t6
963         or      $t2,$t0,$t0
964         or      $t6,$t4,$t4
965         or      $t4,$t0,$t0
966         or      @acc[0],$t0,$t0         ! !in1infty
967         movrnz  $t0,-1,$t0
968         st      $t0,[%fp+STACK_BIAS-16]
969
970         add     $bp_real,64,$bp
971         add     $bp_real,64,$ap
972         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Z2sqr, in2_z);
973         add     %sp,LOCALS+$Z2sqr,$rp
974
975         add     $ap_real,64,$bp
976         add     $ap_real,64,$ap
977         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z);
978         add     %sp,LOCALS+$Z1sqr,$rp
979
980         add     $bp_real,64,$bp
981         add     %sp,LOCALS+$Z2sqr,$ap
982         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S1, Z2sqr, in2_z);
983         add     %sp,LOCALS+$S1,$rp
984
985         add     $ap_real,64,$bp
986         add     %sp,LOCALS+$Z1sqr,$ap
987         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z);
988         add     %sp,LOCALS+$S2,$rp
989
990         add     $ap_real,32,$bp
991         add     %sp,LOCALS+$S1,$ap
992         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S1, S1, in1_y);
993         add     %sp,LOCALS+$S1,$rp
994
995         add     $bp_real,32,$bp
996         add     %sp,LOCALS+$S2,$ap
997         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y);
998         add     %sp,LOCALS+$S2,$rp
999
1000         add     %sp,LOCALS+$S1,$bp
1001         call    __ecp_nistz256_sub_from ! p256_sub(R, S2, S1);
1002         add     %sp,LOCALS+$R,$rp
1003
1004         or      @acc[1],@acc[0],@acc[0] ! see if result is zero
1005         or      @acc[3],@acc[2],@acc[2]
1006         or      @acc[5],@acc[4],@acc[4]
1007         or      @acc[7],@acc[6],@acc[6]
1008         or      @acc[2],@acc[0],@acc[0]
1009         or      @acc[6],@acc[4],@acc[4]
1010         or      @acc[4],@acc[0],@acc[0]
1011         st      @acc[0],[%fp+STACK_BIAS-20]
1012
1013         add     $ap_real,0,$bp
1014         add     %sp,LOCALS+$Z2sqr,$ap
1015         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U1, in1_x, Z2sqr);
1016         add     %sp,LOCALS+$U1,$rp
1017
1018         add     $bp_real,0,$bp
1019         add     %sp,LOCALS+$Z1sqr,$ap
1020         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in2_x, Z1sqr);
1021         add     %sp,LOCALS+$U2,$rp
1022
1023         add     %sp,LOCALS+$U1,$bp
1024         call    __ecp_nistz256_sub_from ! p256_sub(H, U2, U1);
1025         add     %sp,LOCALS+$H,$rp
1026
1027         or      @acc[1],@acc[0],@acc[0] ! see if result is zero
1028         or      @acc[3],@acc[2],@acc[2]
1029         or      @acc[5],@acc[4],@acc[4]
1030         or      @acc[7],@acc[6],@acc[6]
1031         or      @acc[2],@acc[0],@acc[0]
1032         or      @acc[6],@acc[4],@acc[4]
1033         orcc    @acc[4],@acc[0],@acc[0]
1034
1035         bne,pt  %icc,.Ladd_proceed      ! is_equal(U1,U2)?
1036         nop
1037
1038         ld      [%fp+STACK_BIAS-12],$t0
1039         ld      [%fp+STACK_BIAS-16],$t1
1040         ld      [%fp+STACK_BIAS-20],$t2
1041         andcc   $t0,$t1,%g0
1042         be,pt   %icc,.Ladd_proceed      ! (in1infty || in2infty)?
1043         nop
1044         andcc   $t2,$t2,%g0
1045         be,pt   %icc,.Ladd_double       ! is_equal(S1,S2)?
1046         nop
1047
1048         ldx     [%fp+STACK_BIAS-8],$rp
1049         st      %g0,[$rp]
1050         st      %g0,[$rp+4]
1051         st      %g0,[$rp+8]
1052         st      %g0,[$rp+12]
1053         st      %g0,[$rp+16]
1054         st      %g0,[$rp+20]
1055         st      %g0,[$rp+24]
1056         st      %g0,[$rp+28]
1057         st      %g0,[$rp+32]
1058         st      %g0,[$rp+32+4]
1059         st      %g0,[$rp+32+8]
1060         st      %g0,[$rp+32+12]
1061         st      %g0,[$rp+32+16]
1062         st      %g0,[$rp+32+20]
1063         st      %g0,[$rp+32+24]
1064         st      %g0,[$rp+32+28]
1065         st      %g0,[$rp+64]
1066         st      %g0,[$rp+64+4]
1067         st      %g0,[$rp+64+8]
1068         st      %g0,[$rp+64+12]
1069         st      %g0,[$rp+64+16]
1070         st      %g0,[$rp+64+20]
1071         st      %g0,[$rp+64+24]
1072         st      %g0,[$rp+64+28]
1073         b       .Ladd_done
1074         nop
1075
1076 .align  16
1077 .Ladd_double:
1078         ldx     [%fp+STACK_BIAS-8],$rp_real
1079         mov     $ap_real,$ap
1080         b       .Lpoint_double_shortcut
1081         add     %sp,32*(12-4)+32,%sp    ! difference in frame sizes
1082
1083 .align  16
1084 .Ladd_proceed:
1085         add     %sp,LOCALS+$R,$bp
1086         add     %sp,LOCALS+$R,$ap
1087         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R);
1088         add     %sp,LOCALS+$Rsqr,$rp
1089
1090         add     $ap_real,64,$bp
1091         add     %sp,LOCALS+$H,$ap
1092         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z);
1093         add     %sp,LOCALS+$res_z,$rp
1094
1095         add     %sp,LOCALS+$H,$bp
1096         add     %sp,LOCALS+$H,$ap
1097         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H);
1098         add     %sp,LOCALS+$Hsqr,$rp
1099
1100         add     $bp_real,64,$bp
1101         add     %sp,LOCALS+$res_z,$ap
1102         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, res_z, in2_z);
1103         add     %sp,LOCALS+$res_z,$rp
1104
1105         add     %sp,LOCALS+$H,$bp
1106         add     %sp,LOCALS+$Hsqr,$ap
1107         call    __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H);
1108         add     %sp,LOCALS+$Hcub,$rp
1109
1110         add     %sp,LOCALS+$U1,$bp
1111         add     %sp,LOCALS+$Hsqr,$ap
1112         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U2, U1, Hsqr);
1113         add     %sp,LOCALS+$U2,$rp
1114
1115         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2);
1116         add     %sp,LOCALS+$Hsqr,$rp
1117
1118         add     %sp,LOCALS+$Rsqr,$bp
1119         call    __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr);
1120         add     %sp,LOCALS+$res_x,$rp
1121
1122         add     %sp,LOCALS+$Hcub,$bp
1123         call    __ecp_nistz256_sub_from !  p256_sub(res_x, res_x, Hcub);
1124         add     %sp,LOCALS+$res_x,$rp
1125
1126         add     %sp,LOCALS+$U2,$bp
1127         call    __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x);
1128         add     %sp,LOCALS+$res_y,$rp
1129
1130         add     %sp,LOCALS+$Hcub,$bp
1131         add     %sp,LOCALS+$S1,$ap
1132         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S1, Hcub);
1133         add     %sp,LOCALS+$S2,$rp
1134
1135         add     %sp,LOCALS+$R,$bp
1136         add     %sp,LOCALS+$res_y,$ap
1137         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R);
1138         add     %sp,LOCALS+$res_y,$rp
1139
1140         add     %sp,LOCALS+$S2,$bp
1141         call    __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2);
1142         add     %sp,LOCALS+$res_y,$rp
1143
1144         ld      [%fp+STACK_BIAS-16],$t1 ! !in1infty
1145         ld      [%fp+STACK_BIAS-12],$t2 ! !in2infty
1146         ldx     [%fp+STACK_BIAS-8],$rp
1147 ___
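# The loop below emits conditional moves selecting the final result. A
# hedged sketch of the selection logic (pseudocode, not the literal moves):
#
#       $res = $computed;
#       $res = $in2 if $in1infty;       # in1 is infinity => return in2
#       $res = $in1 if $in2infty;       # in2 is infinity => return in1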
1148 for($i=0;$i<96;$i+=8) {                 # conditional moves
1149 $code.=<<___;
1150         ld      [%sp+LOCALS+$i],@acc[0]         ! res
1151         ld      [%sp+LOCALS+$i+4],@acc[1]
1152         ld      [$bp_real+$i],@acc[2]           ! in2
1153         ld      [$bp_real+$i+4],@acc[3]
1154         ld      [$ap_real+$i],@acc[4]           ! in1
1155         ld      [$ap_real+$i+4],@acc[5]
1156         movrz   $t1,@acc[2],@acc[0]
1157         movrz   $t1,@acc[3],@acc[1]
1158         movrz   $t2,@acc[4],@acc[0]
1159         movrz   $t2,@acc[5],@acc[1]
1160         st      @acc[0],[$rp+$i]
1161         st      @acc[1],[$rp+$i+4]
1162 ___
1163 }
1164 $code.=<<___;
1165 .Ladd_done:
1166         ret
1167         restore
1168 .type   ecp_nistz256_point_add,#function
1169 .size   ecp_nistz256_point_add,.-ecp_nistz256_point_add
1170 ___
1171 }
1172
1173 ########################################################################
1174 # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1175 #                                    const P256_POINT_AFFINE *in2);
1176 {
1177 my ($res_x,$res_y,$res_z,
1178     $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
1179 my $Z1sqr = $S2;
1180 # above map() describes stack layout with 10 temporary
1181 # 256-bit vectors on top. Then we reserve some space for
1182 # !in1infty, !in2infty, result of check for zero and return pointer.
1183
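# @ONE_mont below is 1 in Montgomery representation, i.e. 2^256 mod P,
# with -1 and -2 standing for the 32-bit words 0xffffffff and 0xfffffffe.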
1184 my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
1185 my $bp_real=$rp_real;
1186
1187 $code.=<<___;
1188 .globl  ecp_nistz256_point_add_affine
1189 .align  32
1190 ecp_nistz256_point_add_affine:
1191         SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
1192         ld      [%g1],%g1               ! OPENSSL_sparcv9cap_P[0]
1193         and     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
1194         cmp     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
1195         be      ecp_nistz256_point_add_affine_vis3
1196         nop
1197
1198         save    %sp,-STACK_FRAME-32*10-32,%sp
1199
1200         stx     $rp,[%fp+STACK_BIAS-8]  ! off-load $rp
1201         mov     $ap,$ap_real
1202         mov     $bp,$bp_real
1203
1204         ld      [$ap],@acc[0]           ! in1_x
1205         ld      [$ap+4],@acc[1]
1206         ld      [$ap+8],@acc[2]
1207         ld      [$ap+12],@acc[3]
1208         ld      [$ap+16],@acc[4]
1209         ld      [$ap+20],@acc[5]
1210         ld      [$ap+24],@acc[6]
1211         ld      [$ap+28],@acc[7]
1212         ld      [$ap+32],$t0            ! in1_y
1213         ld      [$ap+32+4],$t1
1214         ld      [$ap+32+8],$t2
1215         ld      [$ap+32+12],$t3
1216         ld      [$ap+32+16],$t4
1217         ld      [$ap+32+20],$t5
1218         ld      [$ap+32+24],$t6
1219         ld      [$ap+32+28],$t7
1220         or      @acc[1],@acc[0],@acc[0]
1221         or      @acc[3],@acc[2],@acc[2]
1222         or      @acc[5],@acc[4],@acc[4]
1223         or      @acc[7],@acc[6],@acc[6]
1224         or      @acc[2],@acc[0],@acc[0]
1225         or      @acc[6],@acc[4],@acc[4]
1226         or      @acc[4],@acc[0],@acc[0]
1227         or      $t1,$t0,$t0
1228         or      $t3,$t2,$t2
1229         or      $t5,$t4,$t4
1230         or      $t7,$t6,$t6
1231         or      $t2,$t0,$t0
1232         or      $t6,$t4,$t4
1233         or      $t4,$t0,$t0
1234         or      @acc[0],$t0,$t0         ! !in1infty
1235         movrnz  $t0,-1,$t0
1236         st      $t0,[%fp+STACK_BIAS-16]
1237
1238         ld      [$bp],@acc[0]           ! in2_x
1239         ld      [$bp+4],@acc[1]
1240         ld      [$bp+8],@acc[2]
1241         ld      [$bp+12],@acc[3]
1242         ld      [$bp+16],@acc[4]
1243         ld      [$bp+20],@acc[5]
1244         ld      [$bp+24],@acc[6]
1245         ld      [$bp+28],@acc[7]
1246         ld      [$bp+32],$t0            ! in2_y
1247         ld      [$bp+32+4],$t1
1248         ld      [$bp+32+8],$t2
1249         ld      [$bp+32+12],$t3
1250         ld      [$bp+32+16],$t4
1251         ld      [$bp+32+20],$t5
1252         ld      [$bp+32+24],$t6
1253         ld      [$bp+32+28],$t7
1254         or      @acc[1],@acc[0],@acc[0]
1255         or      @acc[3],@acc[2],@acc[2]
1256         or      @acc[5],@acc[4],@acc[4]
1257         or      @acc[7],@acc[6],@acc[6]
1258         or      @acc[2],@acc[0],@acc[0]
1259         or      @acc[6],@acc[4],@acc[4]
1260         or      @acc[4],@acc[0],@acc[0]
1261         or      $t1,$t0,$t0
1262         or      $t3,$t2,$t2
1263         or      $t5,$t4,$t4
1264         or      $t7,$t6,$t6
1265         or      $t2,$t0,$t0
1266         or      $t6,$t4,$t4
1267         or      $t4,$t0,$t0
1268         or      @acc[0],$t0,$t0         ! !in2infty
1269         movrnz  $t0,-1,$t0
1270         st      $t0,[%fp+STACK_BIAS-12]
1271
1272         add     $ap_real,64,$bp
1273         add     $ap_real,64,$ap
1274         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z);
1275         add     %sp,LOCALS+$Z1sqr,$rp
1276
1277         add     $bp_real,0,$bp
1278         add     %sp,LOCALS+$Z1sqr,$ap
1279         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U2, Z1sqr, in2_x);
1280         add     %sp,LOCALS+$U2,$rp
1281
1282         add     $ap_real,0,$bp
1283         call    __ecp_nistz256_sub_from ! p256_sub(H, U2, in1_x);
1284         add     %sp,LOCALS+$H,$rp
1285
1286         add     $ap_real,64,$bp
1287         add     %sp,LOCALS+$Z1sqr,$ap
1288         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z);
1289         add     %sp,LOCALS+$S2,$rp
1290
1291         add     $ap_real,64,$bp
1292         add     %sp,LOCALS+$H,$ap
1293         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z);
1294         add     %sp,LOCALS+$res_z,$rp
1295
1296         add     $bp_real,32,$bp
1297         add     %sp,LOCALS+$S2,$ap
1298         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y);
1299         add     %sp,LOCALS+$S2,$rp
1300
1301         add     $ap_real,32,$bp
1302         call    __ecp_nistz256_sub_from ! p256_sub(R, S2, in1_y);
1303         add     %sp,LOCALS+$R,$rp
1304
1305         add     %sp,LOCALS+$H,$bp
1306         add     %sp,LOCALS+$H,$ap
1307         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H);
1308         add     %sp,LOCALS+$Hsqr,$rp
1309
1310         add     %sp,LOCALS+$R,$bp
1311         add     %sp,LOCALS+$R,$ap
1312         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R);
1313         add     %sp,LOCALS+$Rsqr,$rp
1314
1315         add     %sp,LOCALS+$H,$bp
1316         add     %sp,LOCALS+$Hsqr,$ap
1317         call    __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H);
1318         add     %sp,LOCALS+$Hcub,$rp
1319
1320         add     $ap_real,0,$bp
1321         add     %sp,LOCALS+$Hsqr,$ap
1322         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in1_x, Hsqr);
1323         add     %sp,LOCALS+$U2,$rp
1324
1325         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2);
1326         add     %sp,LOCALS+$Hsqr,$rp
1327
1328         add     %sp,LOCALS+$Rsqr,$bp
1329         call    __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr);
1330         add     %sp,LOCALS+$res_x,$rp
1331
1332         add     %sp,LOCALS+$Hcub,$bp
1333         call    __ecp_nistz256_sub_from !  p256_sub(res_x, res_x, Hcub);
1334         add     %sp,LOCALS+$res_x,$rp
1335
1336         add     %sp,LOCALS+$U2,$bp
1337         call    __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x);
1338         add     %sp,LOCALS+$res_y,$rp
1339
1340         add     $ap_real,32,$bp
1341         add     %sp,LOCALS+$Hcub,$ap
1342         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, in1_y, Hcub);
1343         add     %sp,LOCALS+$S2,$rp
1344
1345         add     %sp,LOCALS+$R,$bp
1346         add     %sp,LOCALS+$res_y,$ap
1347         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R);
1348         add     %sp,LOCALS+$res_y,$rp
1349
1350         add     %sp,LOCALS+$S2,$bp
1351         call    __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2);
1352         add     %sp,LOCALS+$res_y,$rp
1353
1354         ld      [%fp+STACK_BIAS-16],$t1 ! !in1infty
1355         ld      [%fp+STACK_BIAS-12],$t2 ! !in2infty
1356         ldx     [%fp+STACK_BIAS-8],$rp
1357 ___
1358 for($i=0;$i<64;$i+=8) {                 # conditional moves
1359 $code.=<<___;
1360         ld      [%sp+LOCALS+$i],@acc[0]         ! res
1361         ld      [%sp+LOCALS+$i+4],@acc[1]
1362         ld      [$bp_real+$i],@acc[2]           ! in2
1363         ld      [$bp_real+$i+4],@acc[3]
1364         ld      [$ap_real+$i],@acc[4]           ! in1
1365         ld      [$ap_real+$i+4],@acc[5]
1366         movrz   $t1,@acc[2],@acc[0]
1367         movrz   $t1,@acc[3],@acc[1]
1368         movrz   $t2,@acc[4],@acc[0]
1369         movrz   $t2,@acc[5],@acc[1]
1370         st      @acc[0],[$rp+$i]
1371         st      @acc[1],[$rp+$i+4]
1372 ___
1373 }
1374 for(;$i<96;$i+=8) {
1375 my $j=($i-64)/4;
1376 $code.=<<___;
1377         ld      [%sp+LOCALS+$i],@acc[0]         ! res
1378         ld      [%sp+LOCALS+$i+4],@acc[1]
1379         ld      [$ap_real+$i],@acc[4]           ! in1
1380         ld      [$ap_real+$i+4],@acc[5]
1381         movrz   $t1,@ONE_mont[$j],@acc[0]
1382         movrz   $t1,@ONE_mont[$j+1],@acc[1]
1383         movrz   $t2,@acc[4],@acc[0]
1384         movrz   $t2,@acc[5],@acc[1]
1385         st      @acc[0],[$rp+$i]
1386         st      @acc[1],[$rp+$i+4]
1387 ___
1388 }
1389 $code.=<<___;
1390         ret
1391         restore
1392 .type   ecp_nistz256_point_add_affine,#function
1393 .size   ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1394 ___
1395 }                                                               }}}
1396 {{{
1397 my ($out,$inp,$index)=map("%i$_",(0..2));
1398 my $mask="%o0";
1399
1400 $code.=<<___;
1401 ! void  ecp_nistz256_scatter_w5(void *%i0,const P256_POINT *%i1,
1402 !                                         int %i2);
1403 .globl  ecp_nistz256_scatter_w5
1404 .align  32
1405 ecp_nistz256_scatter_w5:
1406         save    %sp,-STACK_FRAME,%sp
1407
1408         sll     $index,2,$index
1409         add     $out,$index,$out
1410
1411         ld      [$inp],%l0              ! X
1412         ld      [$inp+4],%l1
1413         ld      [$inp+8],%l2
1414         ld      [$inp+12],%l3
1415         ld      [$inp+16],%l4
1416         ld      [$inp+20],%l5
1417         ld      [$inp+24],%l6
1418         ld      [$inp+28],%l7
1419         add     $inp,32,$inp
1420         st      %l0,[$out+64*0-4]
1421         st      %l1,[$out+64*1-4]
1422         st      %l2,[$out+64*2-4]
1423         st      %l3,[$out+64*3-4]
1424         st      %l4,[$out+64*4-4]
1425         st      %l5,[$out+64*5-4]
1426         st      %l6,[$out+64*6-4]
1427         st      %l7,[$out+64*7-4]
1428         add     $out,64*8,$out
1429
1430         ld      [$inp],%l0              ! Y
1431         ld      [$inp+4],%l1
1432         ld      [$inp+8],%l2
1433         ld      [$inp+12],%l3
1434         ld      [$inp+16],%l4
1435         ld      [$inp+20],%l5
1436         ld      [$inp+24],%l6
1437         ld      [$inp+28],%l7
1438         add     $inp,32,$inp
1439         st      %l0,[$out+64*0-4]
1440         st      %l1,[$out+64*1-4]
1441         st      %l2,[$out+64*2-4]
1442         st      %l3,[$out+64*3-4]
1443         st      %l4,[$out+64*4-4]
1444         st      %l5,[$out+64*5-4]
1445         st      %l6,[$out+64*6-4]
1446         st      %l7,[$out+64*7-4]
1447         add     $out,64*8,$out
1448
1449         ld      [$inp],%l0              ! Z
1450         ld      [$inp+4],%l1
1451         ld      [$inp+8],%l2
1452         ld      [$inp+12],%l3
1453         ld      [$inp+16],%l4
1454         ld      [$inp+20],%l5
1455         ld      [$inp+24],%l6
1456         ld      [$inp+28],%l7
1457         st      %l0,[$out+64*0-4]
1458         st      %l1,[$out+64*1-4]
1459         st      %l2,[$out+64*2-4]
1460         st      %l3,[$out+64*3-4]
1461         st      %l4,[$out+64*4-4]
1462         st      %l5,[$out+64*5-4]
1463         st      %l6,[$out+64*6-4]
1464         st      %l7,[$out+64*7-4]
1465
1466         ret
1467         restore
1468 .type   ecp_nistz256_scatter_w5,#function
1469 .size   ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
1470
1471 ! void  ecp_nistz256_gather_w5(P256_POINT *%i0,const void *%i1,
1472 !                                              int %i2);
1473 .globl  ecp_nistz256_gather_w5
1474 .align  32
1475 ecp_nistz256_gather_w5:
1476         save    %sp,-STACK_FRAME,%sp
1477
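        ! $mask is all ones if $index is non-zero and zero otherwise, and
        ! $index is biased by -1 below, so that $index==0 gathers an
        ! all-zero point; the -1 bias matches the off-by-one store offsets
        ! used by ecp_nistz256_scatter_w5 above.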
1478         neg     $index,$mask
1479         srax    $mask,63,$mask
1480
1481         add     $index,$mask,$index
1482         sll     $index,2,$index
1483         add     $inp,$index,$inp
1484
1485         ld      [$inp+64*0],%l0
1486         ld      [$inp+64*1],%l1
1487         ld      [$inp+64*2],%l2
1488         ld      [$inp+64*3],%l3
1489         ld      [$inp+64*4],%l4
1490         ld      [$inp+64*5],%l5
1491         ld      [$inp+64*6],%l6
1492         ld      [$inp+64*7],%l7
1493         add     $inp,64*8,$inp
1494         and     %l0,$mask,%l0
1495         and     %l1,$mask,%l1
1496         st      %l0,[$out]              ! X
1497         and     %l2,$mask,%l2
1498         st      %l1,[$out+4]
1499         and     %l3,$mask,%l3
1500         st      %l2,[$out+8]
1501         and     %l4,$mask,%l4
1502         st      %l3,[$out+12]
1503         and     %l5,$mask,%l5
1504         st      %l4,[$out+16]
1505         and     %l6,$mask,%l6
1506         st      %l5,[$out+20]
1507         and     %l7,$mask,%l7
1508         st      %l6,[$out+24]
1509         st      %l7,[$out+28]
1510         add     $out,32,$out
1511
1512         ld      [$inp+64*0],%l0
1513         ld      [$inp+64*1],%l1
1514         ld      [$inp+64*2],%l2
1515         ld      [$inp+64*3],%l3
1516         ld      [$inp+64*4],%l4
1517         ld      [$inp+64*5],%l5
1518         ld      [$inp+64*6],%l6
1519         ld      [$inp+64*7],%l7
1520         add     $inp,64*8,$inp
1521         and     %l0,$mask,%l0
1522         and     %l1,$mask,%l1
1523         st      %l0,[$out]              ! Y
1524         and     %l2,$mask,%l2
1525         st      %l1,[$out+4]
1526         and     %l3,$mask,%l3
1527         st      %l2,[$out+8]
1528         and     %l4,$mask,%l4
1529         st      %l3,[$out+12]
1530         and     %l5,$mask,%l5
1531         st      %l4,[$out+16]
1532         and     %l6,$mask,%l6
1533         st      %l5,[$out+20]
1534         and     %l7,$mask,%l7
1535         st      %l6,[$out+24]
1536         st      %l7,[$out+28]
1537         add     $out,32,$out
1538
1539         ld      [$inp+64*0],%l0
1540         ld      [$inp+64*1],%l1
1541         ld      [$inp+64*2],%l2
1542         ld      [$inp+64*3],%l3
1543         ld      [$inp+64*4],%l4
1544         ld      [$inp+64*5],%l5
1545         ld      [$inp+64*6],%l6
1546         ld      [$inp+64*7],%l7
1547         and     %l0,$mask,%l0
1548         and     %l1,$mask,%l1
1549         st      %l0,[$out]              ! Z
1550         and     %l2,$mask,%l2
1551         st      %l1,[$out+4]
1552         and     %l3,$mask,%l3
1553         st      %l2,[$out+8]
1554         and     %l4,$mask,%l4
1555         st      %l3,[$out+12]
1556         and     %l5,$mask,%l5
1557         st      %l4,[$out+16]
1558         and     %l6,$mask,%l6
1559         st      %l5,[$out+20]
1560         and     %l7,$mask,%l7
1561         st      %l6,[$out+24]
1562         st      %l7,[$out+28]
1563
1564         ret
1565         restore
1566 .type   ecp_nistz256_gather_w5,#function
1567 .size   ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
1568
1569 ! void  ecp_nistz256_scatter_w7(void *%i0,const P256_POINT_AFFINE *%i1,
1570 !                                         int %i2);
1571 .globl  ecp_nistz256_scatter_w7
1572 .align  32
1573 ecp_nistz256_scatter_w7:
1574         save    %sp,-STACK_FRAME,%sp
1575         nop
1576         add     $out,$index,$out
1577         mov     64/4,$index
1578 .Loop_scatter_w7:
1579         ld      [$inp],%l0
1580         add     $inp,4,$inp
1581         subcc   $index,1,$index
1582         stb     %l0,[$out+64*0-1]
1583         srl     %l0,8,%l1
1584         stb     %l1,[$out+64*1-1]
1585         srl     %l0,16,%l2
1586         stb     %l2,[$out+64*2-1]
1587         srl     %l0,24,%l3
1588         stb     %l3,[$out+64*3-1]
1589         bne     .Loop_scatter_w7
1590         add     $out,64*4,$out
1591
1592         ret
1593         restore
1594 .type   ecp_nistz256_scatter_w7,#function
1595 .size   ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
1596
1597 ! void  ecp_nistz256_gather_w7(P256_POINT_AFFINE *%i0,const void *%i1,
1598 !                                                     int %i2);
1599 .globl  ecp_nistz256_gather_w7
1600 .align  32
1601 ecp_nistz256_gather_w7:
1602         save    %sp,-STACK_FRAME,%sp
1603
1604         neg     $index,$mask
1605         srax    $mask,63,$mask
1606
1607         add     $index,$mask,$index
1608         add     $inp,$index,$inp
1609         mov     64/4,$index
1610
1611 .Loop_gather_w7:
1612         ldub    [$inp+64*0],%l0
1613         prefetch [$inp+3840+64*0],1
1614         subcc   $index,1,$index
1615         ldub    [$inp+64*1],%l1
1616         prefetch [$inp+3840+64*1],1
1617         ldub    [$inp+64*2],%l2
1618         prefetch [$inp+3840+64*2],1
1619         ldub    [$inp+64*3],%l3
1620         prefetch [$inp+3840+64*3],1
1621         add     $inp,64*4,$inp
1622         sll     %l1,8,%l1
1623         sll     %l2,16,%l2
1624         or      %l0,%l1,%l0
1625         sll     %l3,24,%l3
1626         or      %l0,%l2,%l0
1627         or      %l0,%l3,%l0
1628         and     %l0,$mask,%l0
1629         st      %l0,[$out]
1630         bne     .Loop_gather_w7
1631         add     $out,4,$out
1632
1633         ret
1634         restore
1635 .type   ecp_nistz256_gather_w7,#function
1636 .size   ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
1637 ___
1638 }}}
1639 {{{
1640 ########################################################################
1641 # Following subroutines are VIS3 counterparts of those above, which in
1642 # turn implement the ones found in ecp_nistz256.c. Key difference is
1643 # that they use 128-bit multiplication and addition with 64-bit carry,
1644 # and in order to do that they convert from uint32_t[8] to uint64_t[4]
1645 # on entry and back on return.
1646 #
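#
# As a rough illustration only (not part of the generated code), the entry
# conversion corresponds to the following C-style sketch, with a32[] holding
# the eight 32-bit words exactly as stored in memory:
#
#	for (i = 0; i < 4; i++)
#		a64[i] = (uint64_t)a32[2*i] | ((uint64_t)a32[2*i+1] << 32);
#
# and the inverse split is applied to results before they are stored.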
1647 my ($rp,$ap,$bp)=map("%i$_",(0..2));
1648 my ($t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("%l$_",(0..7));
1649 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5)=map("%o$_",(0..5));
1650 my ($bi,$poly1,$poly3,$minus1)=(map("%i$_",(3..5)),"%g1");
1651 my ($rp_real,$ap_real)=("%g2","%g3");
1652 my ($acc6,$acc7)=($bp,$bi);     # used in squaring
1653
1654 $code.=<<___;
1655 .align  32
1656 __ecp_nistz256_mul_by_2_vis3:
1657         addcc   $acc0,$acc0,$acc0
1658         addxccc $acc1,$acc1,$acc1
1659         addxccc $acc2,$acc2,$acc2
1660         addxccc $acc3,$acc3,$acc3
1661         b       .Lreduce_by_sub_vis3
1662         addxc   %g0,%g0,$acc4           ! did it carry?
1663 .type   __ecp_nistz256_mul_by_2_vis3,#function
1664 .size   __ecp_nistz256_mul_by_2_vis3,.-__ecp_nistz256_mul_by_2_vis3
1665
1666 .align  32
1667 __ecp_nistz256_add_vis3:
1668         ldx     [$bp+0],$t0
1669         ldx     [$bp+8],$t1
1670         ldx     [$bp+16],$t2
1671         ldx     [$bp+24],$t3
1672
1673 __ecp_nistz256_add_noload_vis3:
1674
1675         addcc   $t0,$acc0,$acc0
1676         addxccc $t1,$acc1,$acc1
1677         addxccc $t2,$acc2,$acc2
1678         addxccc $t3,$acc3,$acc3
1679         addxc   %g0,%g0,$acc4           ! did it carry?
1680
1681 .Lreduce_by_sub_vis3:
1682
1683         addcc   $acc0,1,$t0             ! add -modulus, i.e. subtract
1684         addxccc $acc1,$poly1,$t1
1685         addxccc $acc2,$minus1,$t2
1686         addxccc $acc3,$poly3,$t3
1687         addxc   $acc4,$minus1,$acc4
1688
1689         movrz   $acc4,$t0,$acc0         ! ret = borrow ? ret : ret-modulus
1690         movrz   $acc4,$t1,$acc1
1691         stx     $acc0,[$rp]
1692         movrz   $acc4,$t2,$acc2
1693         stx     $acc1,[$rp+8]
1694         movrz   $acc4,$t3,$acc3
1695         stx     $acc2,[$rp+16]
1696         retl
1697         stx     $acc3,[$rp+24]
1698 .type   __ecp_nistz256_add_vis3,#function
1699 .size   __ecp_nistz256_add_vis3,.-__ecp_nistz256_add_vis3
1700
1701 ! Trouble with subtraction is that there is no subtraction with 64-bit
1702 ! borrow, only with 32-bit one. For this reason we "decompose" 64-bit
1703 ! $acc0-$acc3 to 32-bit values and pick b[4] in 32-bit pieces. But
1704 ! recall that SPARC is big-endian, which is why you'll observe that
1705 ! b[4] is accessed as 4-0-12-8-20-16-28-24. And prior to reduction we
1706 ! "collect" the result back into 64-bit $acc0-$acc3.
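!
! Roughly (a sketch only, not the exact instruction schedule below), each
! 64-bit limb difference is assembled from two 32-bit subtractions that
! share one borrow chain:
!
!	lo   = (uint32_t)a[i]         - b32[2*i+1] - borrow;
!	hi   = (uint32_t)(a[i] >> 32) - b32[2*i]   - borrow;
!	r[i] = ((uint64_t)hi << 32) | lo;
!
! where b32[] are the 32-bit words of b as laid out in big-endian memory.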
1707 .align  32
1708 __ecp_nistz256_sub_from_vis3:
1709         ld      [$bp+4],$t0
1710         ld      [$bp+0],$t1
1711         ld      [$bp+12],$t2
1712         ld      [$bp+8],$t3
1713
1714         srlx    $acc0,32,$acc4
1715         not     $poly1,$poly1
1716         srlx    $acc1,32,$acc5
1717         subcc   $acc0,$t0,$acc0
1718         ld      [$bp+20],$t0
1719         subccc  $acc4,$t1,$acc4
1720         ld      [$bp+16],$t1
1721         subccc  $acc1,$t2,$acc1
1722         ld      [$bp+28],$t2
1723         and     $acc0,$poly1,$acc0
1724         subccc  $acc5,$t3,$acc5
1725         ld      [$bp+24],$t3
1726         sllx    $acc4,32,$acc4
1727         and     $acc1,$poly1,$acc1
1728         sllx    $acc5,32,$acc5
1729         or      $acc0,$acc4,$acc0
1730         srlx    $acc2,32,$acc4
1731         or      $acc1,$acc5,$acc1
1732         srlx    $acc3,32,$acc5
1733         subccc  $acc2,$t0,$acc2
1734         subccc  $acc4,$t1,$acc4
1735         subccc  $acc3,$t2,$acc3
1736         and     $acc2,$poly1,$acc2
1737         subccc  $acc5,$t3,$acc5
1738         sllx    $acc4,32,$acc4
1739         and     $acc3,$poly1,$acc3
1740         sllx    $acc5,32,$acc5
1741         or      $acc2,$acc4,$acc2
1742         subc    %g0,%g0,$acc4           ! did it borrow?
1743         b       .Lreduce_by_add_vis3
1744         or      $acc3,$acc5,$acc3
1745 .type   __ecp_nistz256_sub_from_vis3,#function
1746 .size   __ecp_nistz256_sub_from_vis3,.-__ecp_nistz256_sub_from_vis3
1747
1748 .align  32
1749 __ecp_nistz256_sub_morf_vis3:
1750         ld      [$bp+4],$t0
1751         ld      [$bp+0],$t1
1752         ld      [$bp+12],$t2
1753         ld      [$bp+8],$t3
1754
1755         srlx    $acc0,32,$acc4
1756         not     $poly1,$poly1
1757         srlx    $acc1,32,$acc5
1758         subcc   $t0,$acc0,$acc0
1759         ld      [$bp+20],$t0
1760         subccc  $t1,$acc4,$acc4
1761         ld      [$bp+16],$t1
1762         subccc  $t2,$acc1,$acc1
1763         ld      [$bp+28],$t2
1764         and     $acc0,$poly1,$acc0
1765         subccc  $t3,$acc5,$acc5
1766         ld      [$bp+24],$t3
1767         sllx    $acc4,32,$acc4
1768         and     $acc1,$poly1,$acc1
1769         sllx    $acc5,32,$acc5
1770         or      $acc0,$acc4,$acc0
1771         srlx    $acc2,32,$acc4
1772         or      $acc1,$acc5,$acc1
1773         srlx    $acc3,32,$acc5
1774         subccc  $t0,$acc2,$acc2
1775         subccc  $t1,$acc4,$acc4
1776         subccc  $t2,$acc3,$acc3
1777         and     $acc2,$poly1,$acc2
1778         subccc  $t3,$acc5,$acc5
1779         sllx    $acc4,32,$acc4
1780         and     $acc3,$poly1,$acc3
1781         sllx    $acc5,32,$acc5
1782         or      $acc2,$acc4,$acc2
1783         subc    %g0,%g0,$acc4           ! did it borrow?
1784         or      $acc3,$acc5,$acc3
1785
1786 .Lreduce_by_add_vis3:
1787
1788         addcc   $acc0,-1,$t0            ! add modulus
1789         not     $poly3,$t3
1790         addxccc $acc1,$poly1,$t1
1791         not     $poly1,$poly1           ! restore $poly1
1792         addxccc $acc2,%g0,$t2
1793         addxc   $acc3,$t3,$t3
1794
1795         movrnz  $acc4,$t0,$acc0         ! if a-b borrowed, ret = ret+mod
1796         movrnz  $acc4,$t1,$acc1
1797         stx     $acc0,[$rp]
1798         movrnz  $acc4,$t2,$acc2
1799         stx     $acc1,[$rp+8]
1800         movrnz  $acc4,$t3,$acc3
1801         stx     $acc2,[$rp+16]
1802         retl
1803         stx     $acc3,[$rp+24]
1804 .type   __ecp_nistz256_sub_morf_vis3,#function
1805 .size   __ecp_nistz256_sub_morf_vis3,.-__ecp_nistz256_sub_morf_vis3
1806
1807 .align  32
1808 __ecp_nistz256_div_by_2_vis3:
1809         ! ret = (a is odd ? a+mod : a) >> 1
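        ! (mod is odd, so a+mod is even whenever a is odd; the bit shifted
        ! out below is therefore always zero, and the carry out of a+mod
        ! supplies bit 255 of the result.)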
1810
1811         not     $poly1,$t1
1812         not     $poly3,$t3
1813         and     $acc0,1,$acc5
1814         addcc   $acc0,-1,$t0            ! add modulus
1815         addxccc $acc1,$t1,$t1
1816         addxccc $acc2,%g0,$t2
1817         addxccc $acc3,$t3,$t3
1818         addxc   %g0,%g0,$acc4           ! carry bit
1819
1820         movrnz  $acc5,$t0,$acc0
1821         movrnz  $acc5,$t1,$acc1
1822         movrnz  $acc5,$t2,$acc2
1823         movrnz  $acc5,$t3,$acc3
1824         movrz   $acc5,%g0,$acc4
1825
1826         ! ret >>= 1
1827
1828         srlx    $acc0,1,$acc0
1829         sllx    $acc1,63,$t0
1830         srlx    $acc1,1,$acc1
1831         or      $acc0,$t0,$acc0
1832         sllx    $acc2,63,$t1
1833         srlx    $acc2,1,$acc2
1834         or      $acc1,$t1,$acc1
1835         sllx    $acc3,63,$t2
1836         stx     $acc0,[$rp]
1837         srlx    $acc3,1,$acc3
1838         or      $acc2,$t2,$acc2
1839         sllx    $acc4,63,$t3            ! don't forget carry bit
1840         stx     $acc1,[$rp+8]
1841         or      $acc3,$t3,$acc3
1842         stx     $acc2,[$rp+16]
1843         retl
1844         stx     $acc3,[$rp+24]
1845 .type   __ecp_nistz256_div_by_2_vis3,#function
1846 .size   __ecp_nistz256_div_by_2_vis3,.-__ecp_nistz256_div_by_2_vis3
1847
1848 ! compared to __ecp_nistz256_mul_mont it's almost 4x smaller and
1849 ! 4x faster [on T4]...
1850 .align  32
1851 __ecp_nistz256_mul_mont_vis3:
1852         mulx    $a0,$bi,$acc0
1853         not     $poly3,$poly3           ! 0xFFFFFFFF00000001
1854         umulxhi $a0,$bi,$t0
1855         mulx    $a1,$bi,$acc1
1856         umulxhi $a1,$bi,$t1
1857         mulx    $a2,$bi,$acc2
1858         umulxhi $a2,$bi,$t2
1859         mulx    $a3,$bi,$acc3
1860         umulxhi $a3,$bi,$t3
1861         ldx     [$bp+8],$bi             ! b[1]
1862
1863         addcc   $acc1,$t0,$acc1         ! accumulate high parts of multiplication
1864          sllx   $acc0,32,$t0
1865         addxccc $acc2,$t1,$acc2
1866          srlx   $acc0,32,$t1
1867         addxccc $acc3,$t2,$acc3
1868         addxc   %g0,$t3,$acc4
1869         mov     0,$acc5
1870 ___
1871 for($i=1;$i<4;$i++) {
1872         # Reduction iteration is normally performed by accumulating
1873         # result of multiplication of modulus by "magic" digit [and
1874         # omitting least significant word, which is guaranteed to
1875         # be 0], but thanks to special form of modulus and "magic"
1876         # digit being equal to least significant word, it can be
1877         # performed with additions and subtractions alone. Indeed:
1878         #
1879         #            ffff0001.00000000.0000ffff.ffffffff
1880         # *                                     abcdefgh
1881         # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
1882         #
1883         # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
1884         # rewrite above as:
1885         #
1886         #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
1887         # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
1888         # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
1889         #
1890         # or marking redundant operations:
1891         #
1892         #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
1893         # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
1894         # - 0000abcd.efgh0000.--------.--------.--------
1895         #   ^^^^^^^^ but this word is calculated with umulxhi, because
1896         #            there is no subtract with 64-bit borrow:-(
1897
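	# Put differently: since p = 2^256 - 2^224 + 2^192 + 2^96 - 1, the
	# quantity added and subtracted in the last two rows is exactly
	# abcdefgh*p, so the accumulator is unchanged modulo p while its
	# least significant word becomes zero and can be dropped.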
1898 $code.=<<___;
1899         sub     $acc0,$t0,$t2           ! acc0*0xFFFFFFFF00000001, low part
1900         umulxhi $acc0,$poly3,$t3        ! acc0*0xFFFFFFFF00000001, high part
1901         addcc   $acc1,$t0,$acc0         ! +=acc[0]<<96 and omit acc[0]
1902         mulx    $a0,$bi,$t0
1903         addxccc $acc2,$t1,$acc1
1904         mulx    $a1,$bi,$t1
1905         addxccc $acc3,$t2,$acc2         ! +=acc[0]*0xFFFFFFFF00000001
1906         mulx    $a2,$bi,$t2
1907         addxccc $acc4,$t3,$acc3
1908         mulx    $a3,$bi,$t3
1909         addxc   $acc5,%g0,$acc4
1910
1911         addcc   $acc0,$t0,$acc0         ! accumulate low parts of multiplication
1912         umulxhi $a0,$bi,$t0
1913         addxccc $acc1,$t1,$acc1
1914         umulxhi $a1,$bi,$t1
1915         addxccc $acc2,$t2,$acc2
1916         umulxhi $a2,$bi,$t2
1917         addxccc $acc3,$t3,$acc3
1918         umulxhi $a3,$bi,$t3
1919         addxc   $acc4,%g0,$acc4
1920 ___
1921 $code.=<<___    if ($i<3);
1922         ldx     [$bp+8*($i+1)],$bi      ! bp[$i+1]
1923 ___
1924 $code.=<<___;
1925         addcc   $acc1,$t0,$acc1         ! accumulate high parts of multiplication 
1926          sllx   $acc0,32,$t0
1927         addxccc $acc2,$t1,$acc2
1928          srlx   $acc0,32,$t1
1929         addxccc $acc3,$t2,$acc3
1930         addxccc $acc4,$t3,$acc4
1931         addxc   %g0,%g0,$acc5
1932 ___
1933 }
1934 $code.=<<___;
1935         sub     $acc0,$t0,$t2           ! acc0*0xFFFFFFFF00000001, low part
1936         umulxhi $acc0,$poly3,$t3        ! acc0*0xFFFFFFFF00000001, high part
1937         addcc   $acc1,$t0,$acc0         ! +=acc[0]<<96 and omit acc[0]
1938         addxccc $acc2,$t1,$acc1
1939         addxccc $acc3,$t2,$acc2         ! +=acc[0]*0xFFFFFFFF00000001
1940         addxccc $acc4,$t3,$acc3
1941         b       .Lmul_final_vis3        ! see below
1942         addxc   $acc5,%g0,$acc4
1943 .type   __ecp_nistz256_mul_mont_vis3,#function
1944 .size   __ecp_nistz256_mul_mont_vis3,.-__ecp_nistz256_mul_mont_vis3
1945
1946 ! compared to above __ecp_nistz256_mul_mont_vis3 it's 21% fewer
1947 ! instructions, but only 14% faster [on T4]...
1948 .align  32
1949 __ecp_nistz256_sqr_mont_vis3:
1950         !  |  |  |  |  |  |a1*a0|  |
1951         !  |  |  |  |  |a2*a0|  |  |
1952         !  |  |a3*a2|a3*a0|  |  |  |
1953         !  |  |  |  |a2*a1|  |  |  |
1954         !  |  |  |a3*a1|  |  |  |  |
1955         ! *|  |  |  |  |  |  |  | 2|
1956         ! +|a3*a3|a2*a2|a1*a1|a0*a0|
1957         !  |--+--+--+--+--+--+--+--|
1958         !  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. the result ends up in $accx
1959         !
1960         !  "can't overflow" remarks below mark carries into the high part
1961         !  of a multiplication result, which cannot overflow because the
1962         !  high part can never be all ones.
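        !
        !  In rough terms the schedule below is (a sketch, not literal code):
        !     acc[1..6]  = sum of the cross products a[i]*a[j], i < j
        !     acc[1..6] *= 2 (carry into acc[7])
        !     acc[0..7] += the diagonal squares a[0]^2 .. a[3]^2
        !  followed by the same word-by-word reduction as in multiplication.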
1963
1964         mulx    $a1,$a0,$acc1           ! a[1]*a[0]
1965         umulxhi $a1,$a0,$t1
1966         mulx    $a2,$a0,$acc2           ! a[2]*a[0]
1967         umulxhi $a2,$a0,$t2
1968         mulx    $a3,$a0,$acc3           ! a[3]*a[0]
1969         umulxhi $a3,$a0,$acc4
1970
1971         addcc   $acc2,$t1,$acc2         ! accumulate high parts of multiplication
1972         mulx    $a2,$a1,$t0             ! a[2]*a[1]
1973         umulxhi $a2,$a1,$t1
1974         addxccc $acc3,$t2,$acc3
1975         mulx    $a3,$a1,$t2             ! a[3]*a[1]
1976         umulxhi $a3,$a1,$t3
1977         addxc   $acc4,%g0,$acc4         ! can't overflow
1978
1979         mulx    $a3,$a2,$acc5           ! a[3]*a[2]
1980         not     $poly3,$poly3           ! 0xFFFFFFFF00000001
1981         umulxhi $a3,$a2,$acc6
1982
1983         addcc   $t2,$t1,$t1             ! accumulate high parts of multiplication
1984         mulx    $a0,$a0,$acc0           ! a[0]*a[0]
1985         addxc   $t3,%g0,$t2             ! can't overflow
1986
1987         addcc   $acc3,$t0,$acc3         ! accumulate low parts of multiplication
1988         umulxhi $a0,$a0,$a0
1989         addxccc $acc4,$t1,$acc4
1990         mulx    $a1,$a1,$t1             ! a[1]*a[1]
1991         addxccc $acc5,$t2,$acc5
1992         umulxhi $a1,$a1,$a1
1993         addxc   $acc6,%g0,$acc6         ! can't overflow
1994
1995         addcc   $acc1,$acc1,$acc1       ! acc[1-6]*=2
1996         mulx    $a2,$a2,$t2             ! a[2]*a[2]
1997         addxccc $acc2,$acc2,$acc2
1998         umulxhi $a2,$a2,$a2
1999         addxccc $acc3,$acc3,$acc3
2000         mulx    $a3,$a3,$t3             ! a[3]*a[3]
2001         addxccc $acc4,$acc4,$acc4
2002         umulxhi $a3,$a3,$a3
2003         addxccc $acc5,$acc5,$acc5
2004         addxccc $acc6,$acc6,$acc6
2005         addxc   %g0,%g0,$acc7
2006
2007         addcc   $acc1,$a0,$acc1         ! +a[i]*a[i]
2008         addxccc $acc2,$t1,$acc2
2009         addxccc $acc3,$a1,$acc3
2010         addxccc $acc4,$t2,$acc4
2011          sllx   $acc0,32,$t0
2012         addxccc $acc5,$a2,$acc5
2013          srlx   $acc0,32,$t1
2014         addxccc $acc6,$t3,$acc6
2015          sub    $acc0,$t0,$t2           ! acc0*0xFFFFFFFF00000001, low part
2016         addxc   $acc7,$a3,$acc7
2017 ___
2018 for($i=0;$i<3;$i++) {                   # reductions, see commentary
2019                                         # in multiplication for details
2020 $code.=<<___;
2021         umulxhi $acc0,$poly3,$t3        ! acc0*0xFFFFFFFF00000001, high part
2022         addcc   $acc1,$t0,$acc0         ! +=acc[0]<<96 and omit acc[0]
2023          sllx   $acc0,32,$t0
2024         addxccc $acc2,$t1,$acc1
2025          srlx   $acc0,32,$t1
2026         addxccc $acc3,$t2,$acc2         ! +=acc[0]*0xFFFFFFFF00000001
2027          sub    $acc0,$t0,$t2           ! acc0*0xFFFFFFFF00000001, low part
2028         addxc   %g0,$t3,$acc3           ! can't overflow
2029 ___
2030 }
2031 $code.=<<___;
2032         umulxhi $acc0,$poly3,$t3        ! acc0*0xFFFFFFFF00000001, high part
2033         addcc   $acc1,$t0,$acc0         ! +=acc[0]<<96 and omit acc[0]
2034         addxccc $acc2,$t1,$acc1
2035         addxccc $acc3,$t2,$acc2         ! +=acc[0]*0xFFFFFFFF00000001
2036         addxc   %g0,$t3,$acc3           ! can't overflow
2037
2038         addcc   $acc0,$acc4,$acc0       ! accumulate upper half
2039         addxccc $acc1,$acc5,$acc1
2040         addxccc $acc2,$acc6,$acc2
2041         addxccc $acc3,$acc7,$acc3
2042         addxc   %g0,%g0,$acc4
2043
2044 .Lmul_final_vis3:
2045
2046         ! Final step is "if result >= mod, subtract mod", but since comparison
2047         ! means subtraction, we do the subtraction and then copy the outcome
2048         ! if it didn't borrow. Note that because we [have to] replace the
2049         ! subtraction with an addition of the negated modulus, the carry/borrow
2050         ! logic is inverted.
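        ! The constants added below (1, $poly1, -1 and the restored $poly3)
        ! are the 64-bit words of 2^256-p, so the sequence computes acc-p;
        ! the carry out (meaning acc >= p) selects the reduced value via movcs.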
2051
2052         addcc   $acc0,1,$t0             ! add -modulus, i.e. subtract
2053         not     $poly3,$poly3           ! restore 0x00000000FFFFFFFE
2054         addxccc $acc1,$poly1,$t1
2055         addxccc $acc2,$minus1,$t2
2056         addxccc $acc3,$poly3,$t3
2057         addxccc $acc4,$minus1,%g0       ! did it carry?
2058
2059         movcs   %xcc,$t0,$acc0
2060         movcs   %xcc,$t1,$acc1
2061         stx     $acc0,[$rp]
2062         movcs   %xcc,$t2,$acc2
2063         stx     $acc1,[$rp+8]
2064         movcs   %xcc,$t3,$acc3
2065         stx     $acc2,[$rp+16]
2066         retl
2067         stx     $acc3,[$rp+24]
2068 .type   __ecp_nistz256_sqr_mont_vis3,#function
2069 .size   __ecp_nistz256_sqr_mont_vis3,.-__ecp_nistz256_sqr_mont_vis3
2070 ___
2071
2072 ########################################################################
2073 # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
2074 #
2075 {
2076 my ($res_x,$res_y,$res_z,
2077     $in_x,$in_y,$in_z,
2078     $S,$M,$Zsqr,$tmp0)=map(32*$_,(0..9));
2079 # above map() describes stack layout with 10 temporary
2080 # 256-bit vectors on top.
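#
# For reference (a sketch, not generated code), the sequence below follows
# the usual Jacobian doubling formulas:
#	M  = 3*(X - Z^2)*(X + Z^2),	S  = 4*X*Y^2 (computed as X*(2*Y)^2)
#	X' = M^2 - 2*S,			Y' = M*(S - X') - 8*Y^4,	Z' = 2*Y*Z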
2081
2082 $code.=<<___;
2083 .align  32
2084 ecp_nistz256_point_double_vis3:
2085         save    %sp,-STACK64_FRAME-32*10,%sp
2086
2087         mov     $rp,$rp_real
2088 .Ldouble_shortcut_vis3:
2089         mov     -1,$minus1
2090         mov     -2,$poly3
2091         sllx    $minus1,32,$poly1               ! 0xFFFFFFFF00000000
2092         srl     $poly3,0,$poly3                 ! 0x00000000FFFFFFFE
2093
2094         ! convert input to uint64_t[4]
2095         ld      [$ap],$a0                       ! in_x
2096         ld      [$ap+4],$t0
2097         ld      [$ap+8],$a1
2098         ld      [$ap+12],$t1
2099         ld      [$ap+16],$a2
2100         ld      [$ap+20],$t2
2101         ld      [$ap+24],$a3
2102         ld      [$ap+28],$t3
2103         sllx    $t0,32,$t0
2104         sllx    $t1,32,$t1
2105         ld      [$ap+32],$acc0                  ! in_y
2106         or      $a0,$t0,$a0
2107         ld      [$ap+32+4],$t0
2108         sllx    $t2,32,$t2
2109         ld      [$ap+32+8],$acc1
2110         or      $a1,$t1,$a1
2111         ld      [$ap+32+12],$t1
2112         sllx    $t3,32,$t3
2113         ld      [$ap+32+16],$acc2
2114         or      $a2,$t2,$a2
2115         ld      [$ap+32+20],$t2
2116         or      $a3,$t3,$a3
2117         ld      [$ap+32+24],$acc3
2118         sllx    $t0,32,$t0
2119         ld      [$ap+32+28],$t3
2120         sllx    $t1,32,$t1
2121         stx     $a0,[%sp+LOCALS64+$in_x]
2122         sllx    $t2,32,$t2
2123         stx     $a1,[%sp+LOCALS64+$in_x+8]
2124         sllx    $t3,32,$t3
2125         stx     $a2,[%sp+LOCALS64+$in_x+16]
2126         or      $acc0,$t0,$acc0
2127         stx     $a3,[%sp+LOCALS64+$in_x+24]
2128         or      $acc1,$t1,$acc1
2129         stx     $acc0,[%sp+LOCALS64+$in_y]
2130         or      $acc2,$t2,$acc2
2131         stx     $acc1,[%sp+LOCALS64+$in_y+8]
2132         or      $acc3,$t3,$acc3
2133         stx     $acc2,[%sp+LOCALS64+$in_y+16]
2134         stx     $acc3,[%sp+LOCALS64+$in_y+24]
2135
2136         ld      [$ap+64],$a0                    ! in_z
2137         ld      [$ap+64+4],$t0
2138         ld      [$ap+64+8],$a1
2139         ld      [$ap+64+12],$t1
2140         ld      [$ap+64+16],$a2
2141         ld      [$ap+64+20],$t2
2142         ld      [$ap+64+24],$a3
2143         ld      [$ap+64+28],$t3
2144         sllx    $t0,32,$t0
2145         sllx    $t1,32,$t1
2146         or      $a0,$t0,$a0
2147         sllx    $t2,32,$t2
2148         or      $a1,$t1,$a1
2149         sllx    $t3,32,$t3
2150         or      $a2,$t2,$a2
2151         or      $a3,$t3,$a3
2152         sllx    $t0,32,$t0
2153         sllx    $t1,32,$t1
2154         stx     $a0,[%sp+LOCALS64+$in_z]
2155         sllx    $t2,32,$t2
2156         stx     $a1,[%sp+LOCALS64+$in_z+8]
2157         sllx    $t3,32,$t3
2158         stx     $a2,[%sp+LOCALS64+$in_z+16]
2159         stx     $a3,[%sp+LOCALS64+$in_z+24]
2160
2161         ! in_y is still in $acc0-$acc3
2162         call    __ecp_nistz256_mul_by_2_vis3    ! p256_mul_by_2(S, in_y);
2163         add     %sp,LOCALS64+$S,$rp
2164
2165         ! in_z is still in $a0-$a3
2166         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Zsqr, in_z);
2167         add     %sp,LOCALS64+$Zsqr,$rp
2168
2169         mov     $acc0,$a0                       ! put Zsqr aside
2170         mov     $acc1,$a1
2171         mov     $acc2,$a2
2172         mov     $acc3,$a3
2173
2174         add     %sp,LOCALS64+$in_x,$bp
2175         call    __ecp_nistz256_add_vis3         ! p256_add(M, Zsqr, in_x);
2176         add     %sp,LOCALS64+$M,$rp
2177
2178         mov     $a0,$acc0                       ! restore Zsqr
2179         ldx     [%sp+LOCALS64+$S],$a0           ! forward load
2180         mov     $a1,$acc1
2181         ldx     [%sp+LOCALS64+$S+8],$a1
2182         mov     $a2,$acc2
2183         ldx     [%sp+LOCALS64+$S+16],$a2
2184         mov     $a3,$acc3
2185         ldx     [%sp+LOCALS64+$S+24],$a3
2186
2187         add     %sp,LOCALS64+$in_x,$bp
2188         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(Zsqr, in_x, Zsqr);
2189         add     %sp,LOCALS64+$Zsqr,$rp
2190
2191         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(S, S);
2192         add     %sp,LOCALS64+$S,$rp
2193
2194         ldx     [%sp+LOCALS64+$in_z],$bi
2195         ldx     [%sp+LOCALS64+$in_y],$a0
2196         ldx     [%sp+LOCALS64+$in_y+8],$a1
2197         ldx     [%sp+LOCALS64+$in_y+16],$a2
2198         ldx     [%sp+LOCALS64+$in_y+24],$a3
2199         add     %sp,LOCALS64+$in_z,$bp
2200         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(tmp0, in_z, in_y);
2201         add     %sp,LOCALS64+$tmp0,$rp
2202
2203         ldx     [%sp+LOCALS64+$M],$bi           ! forward load
2204         ldx     [%sp+LOCALS64+$Zsqr],$a0
2205         ldx     [%sp+LOCALS64+$Zsqr+8],$a1
2206         ldx     [%sp+LOCALS64+$Zsqr+16],$a2
2207         ldx     [%sp+LOCALS64+$Zsqr+24],$a3
2208
2209         call    __ecp_nistz256_mul_by_2_vis3    ! p256_mul_by_2(res_z, tmp0);
2210         add     %sp,LOCALS64+$res_z,$rp
2211
2212         add     %sp,LOCALS64+$M,$bp
2213         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(M, M, Zsqr);
2214         add     %sp,LOCALS64+$M,$rp
2215
2216         mov     $acc0,$a0                       ! put aside M
2217         mov     $acc1,$a1
2218         mov     $acc2,$a2
2219         mov     $acc3,$a3
2220         call    __ecp_nistz256_mul_by_2_vis3
2221         add     %sp,LOCALS64+$M,$rp
2222         mov     $a0,$t0                         ! copy M
2223         ldx     [%sp+LOCALS64+$S],$a0           ! forward load
2224         mov     $a1,$t1
2225         ldx     [%sp+LOCALS64+$S+8],$a1
2226         mov     $a2,$t2
2227         ldx     [%sp+LOCALS64+$S+16],$a2
2228         mov     $a3,$t3
2229         ldx     [%sp+LOCALS64+$S+24],$a3
2230         call    __ecp_nistz256_add_noload_vis3  ! p256_mul_by_3(M, M);
2231         add     %sp,LOCALS64+$M,$rp
2232
2233         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(tmp0, S);
2234         add     %sp,LOCALS64+$tmp0,$rp
2235
2236         ldx     [%sp+LOCALS64+$S],$bi           ! forward load
2237         ldx     [%sp+LOCALS64+$in_x],$a0
2238         ldx     [%sp+LOCALS64+$in_x+8],$a1
2239         ldx     [%sp+LOCALS64+$in_x+16],$a2
2240         ldx     [%sp+LOCALS64+$in_x+24],$a3
2241
2242         call    __ecp_nistz256_div_by_2_vis3    ! p256_div_by_2(res_y, tmp0);
2243         add     %sp,LOCALS64+$res_y,$rp
2244
2245         add     %sp,LOCALS64+$S,$bp
2246         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S, S, in_x);
2247         add     %sp,LOCALS64+$S,$rp
2248
2249         ldx     [%sp+LOCALS64+$M],$a0           ! forward load
2250         ldx     [%sp+LOCALS64+$M+8],$a1
2251         ldx     [%sp+LOCALS64+$M+16],$a2
2252         ldx     [%sp+LOCALS64+$M+24],$a3
2253
2254         call    __ecp_nistz256_mul_by_2_vis3    ! p256_mul_by_2(tmp0, S);
2255         add     %sp,LOCALS64+$tmp0,$rp
2256
2257         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(res_x, M);
2258         add     %sp,LOCALS64+$res_x,$rp
2259
2260         add     %sp,LOCALS64+$tmp0,$bp
2261         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(res_x, res_x, tmp0);
2262         add     %sp,LOCALS64+$res_x,$rp
2263
2264         ldx     [%sp+LOCALS64+$M],$a0           ! forward load
2265         ldx     [%sp+LOCALS64+$M+8],$a1
2266         ldx     [%sp+LOCALS64+$M+16],$a2
2267         ldx     [%sp+LOCALS64+$M+24],$a3
2268
2269         add     %sp,LOCALS64+$S,$bp
2270         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(S, S, res_x);
2271         add     %sp,LOCALS64+$S,$rp
2272
2273         mov     $acc0,$bi
2274         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S, S, M);
2275         add     %sp,LOCALS64+$S,$rp
2276
2277         ldx     [%sp+LOCALS64+$res_x],$a0       ! forward load
2278         ldx     [%sp+LOCALS64+$res_x+8],$a1
2279         ldx     [%sp+LOCALS64+$res_x+16],$a2
2280         ldx     [%sp+LOCALS64+$res_x+24],$a3
2281
2282         add     %sp,LOCALS64+$res_y,$bp
2283         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(res_y, S, res_y);
2284         add     %sp,LOCALS64+$res_y,$rp
2285
2286         ! convert output to uint32_t[8]
2287         srlx    $a0,32,$t0
2288         srlx    $a1,32,$t1
2289         st      $a0,[$rp_real]                  ! res_x
2290         srlx    $a2,32,$t2
2291         st      $t0,[$rp_real+4]
2292         srlx    $a3,32,$t3
2293         st      $a1,[$rp_real+8]
2294         st      $t1,[$rp_real+12]
2295         st      $a2,[$rp_real+16]
2296         st      $t2,[$rp_real+20]
2297         st      $a3,[$rp_real+24]
2298         st      $t3,[$rp_real+28]
2299
2300         ldx     [%sp+LOCALS64+$res_z],$a0       ! forward load
2301         srlx    $acc0,32,$t0
2302         ldx     [%sp+LOCALS64+$res_z+8],$a1
2303         srlx    $acc1,32,$t1
2304         ldx     [%sp+LOCALS64+$res_z+16],$a2
2305         srlx    $acc2,32,$t2
2306         ldx     [%sp+LOCALS64+$res_z+24],$a3
2307         srlx    $acc3,32,$t3
2308         st      $acc0,[$rp_real+32]             ! res_y
2309         st      $t0,  [$rp_real+32+4]
2310         st      $acc1,[$rp_real+32+8]
2311         st      $t1,  [$rp_real+32+12]
2312         st      $acc2,[$rp_real+32+16]
2313         st      $t2,  [$rp_real+32+20]
2314         st      $acc3,[$rp_real+32+24]
2315         st      $t3,  [$rp_real+32+28]
2316
2317         srlx    $a0,32,$t0
2318         srlx    $a1,32,$t1
2319         st      $a0,[$rp_real+64]               ! res_z
2320         srlx    $a2,32,$t2
2321         st      $t0,[$rp_real+64+4]
2322         srlx    $a3,32,$t3
2323         st      $a1,[$rp_real+64+8]
2324         st      $t1,[$rp_real+64+12]
2325         st      $a2,[$rp_real+64+16]
2326         st      $t2,[$rp_real+64+20]
2327         st      $a3,[$rp_real+64+24]
2328         st      $t3,[$rp_real+64+28]
2329
2330         ret
2331         restore
2332 .type   ecp_nistz256_point_double_vis3,#function
2333 .size   ecp_nistz256_point_double_vis3,.-ecp_nistz256_point_double_vis3
2334 ___
2335 }
2336 ########################################################################
2337 # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
2338 #                             const P256_POINT *in2);
2339 {
2340 my ($res_x,$res_y,$res_z,
2341     $in1_x,$in1_y,$in1_z,
2342     $in2_x,$in2_y,$in2_z,
2343     $H,$Hsqr,$R,$Rsqr,$Hcub,
2344     $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
2345 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
2346
2347 # above map() describes stack layout with 18 temporary
2348 # 256-bit vectors on top. Then we reserve some space for
2349 # !in1infty, !in2infty and result of check for zero.
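#
# For reference (a sketch, not generated code), the sequence below follows
# the usual Jacobian addition formulas:
#	U1 = X1*Z2^2,  U2 = X2*Z1^2,  S1 = Y1*Z2^3,  S2 = Y2*Z1^3
#	H  = U2 - U1,  R  = S2 - S1
#	X3 = R^2 - H^3 - 2*U1*H^2,  Y3 = R*(U1*H^2 - X3) - S1*H^3,  Z3 = Z1*Z2*H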
2350
2351 $code.=<<___;
2352 .globl  ecp_nistz256_point_add_vis3
2353 .align  32
2354 ecp_nistz256_point_add_vis3:
2355         save    %sp,-STACK64_FRAME-32*18-32,%sp
2356
2357         mov     $rp,$rp_real
2358         mov     -1,$minus1
2359         mov     -2,$poly3
2360         sllx    $minus1,32,$poly1               ! 0xFFFFFFFF00000000
2361         srl     $poly3,0,$poly3                 ! 0x00000000FFFFFFFE
2362
2363         ! convert input to uint64_t[4]
2364         ld      [$bp],$a0                       ! in2_x
2365         ld      [$bp+4],$t0
2366         ld      [$bp+8],$a1
2367         ld      [$bp+12],$t1
2368         ld      [$bp+16],$a2
2369         ld      [$bp+20],$t2
2370         ld      [$bp+24],$a3
2371         ld      [$bp+28],$t3
2372         sllx    $t0,32,$t0
2373         sllx    $t1,32,$t1
2374         ld      [$bp+32],$acc0                  ! in2_y
2375         or      $a0,$t0,$a0
2376         ld      [$bp+32+4],$t0
2377         sllx    $t2,32,$t2
2378         ld      [$bp+32+8],$acc1
2379         or      $a1,$t1,$a1
2380         ld      [$bp+32+12],$t1
2381         sllx    $t3,32,$t3
2382         ld      [$bp+32+16],$acc2
2383         or      $a2,$t2,$a2
2384         ld      [$bp+32+20],$t2
2385         or      $a3,$t3,$a3
2386         ld      [$bp+32+24],$acc3
2387         sllx    $t0,32,$t0
2388         ld      [$bp+32+28],$t3
2389         sllx    $t1,32,$t1
2390         stx     $a0,[%sp+LOCALS64+$in2_x]
2391         sllx    $t2,32,$t2
2392         stx     $a1,[%sp+LOCALS64+$in2_x+8]
2393         sllx    $t3,32,$t3
2394         stx     $a2,[%sp+LOCALS64+$in2_x+16]
2395         or      $acc0,$t0,$acc0
2396         stx     $a3,[%sp+LOCALS64+$in2_x+24]
2397         or      $acc1,$t1,$acc1
2398         stx     $acc0,[%sp+LOCALS64+$in2_y]
2399         or      $acc2,$t2,$acc2
2400         stx     $acc1,[%sp+LOCALS64+$in2_y+8]
2401         or      $acc3,$t3,$acc3
2402         stx     $acc2,[%sp+LOCALS64+$in2_y+16]
2403         stx     $acc3,[%sp+LOCALS64+$in2_y+24]
2404
2405         or      $a1,$a0,$a0
2406         or      $a3,$a2,$a2
2407         or      $acc1,$acc0,$acc0
2408         or      $acc3,$acc2,$acc2
2409         or      $a2,$a0,$a0
2410         or      $acc2,$acc0,$acc0
2411         or      $acc0,$a0,$a0
2412         movrnz  $a0,-1,$a0                      ! !in2infty
2413         stx     $a0,[%fp+STACK_BIAS-8]
2414
2415         ld      [$bp+64],$acc0                  ! in2_z
2416         ld      [$bp+64+4],$t0
2417         ld      [$bp+64+8],$acc1
2418         ld      [$bp+64+12],$t1
2419         ld      [$bp+64+16],$acc2
2420         ld      [$bp+64+20],$t2
2421         ld      [$bp+64+24],$acc3
2422         ld      [$bp+64+28],$t3
2423         sllx    $t0,32,$t0
2424         sllx    $t1,32,$t1
2425         ld      [$ap],$a0                       ! in1_x
2426         or      $acc0,$t0,$acc0
2427         ld      [$ap+4],$t0
2428         sllx    $t2,32,$t2
2429         ld      [$ap+8],$a1
2430         or      $acc1,$t1,$acc1
2431         ld      [$ap+12],$t1
2432         sllx    $t3,32,$t3
2433         ld      [$ap+16],$a2
2434         or      $acc2,$t2,$acc2
2435         ld      [$ap+20],$t2
2436         or      $acc3,$t3,$acc3
2437         ld      [$ap+24],$a3
2438         sllx    $t0,32,$t0
2439         ld      [$ap+28],$t3
2440         sllx    $t1,32,$t1
2441         stx     $acc0,[%sp+LOCALS64+$in2_z]
2442         sllx    $t2,32,$t2
2443         stx     $acc1,[%sp+LOCALS64+$in2_z+8]
2444         sllx    $t3,32,$t3
2445         stx     $acc2,[%sp+LOCALS64+$in2_z+16]
2446         stx     $acc3,[%sp+LOCALS64+$in2_z+24]
2447
2448         or      $a0,$t0,$a0
2449         ld      [$ap+32],$acc0                  ! in1_y
2450         or      $a1,$t1,$a1
2451         ld      [$ap+32+4],$t0
2452         or      $a2,$t2,$a2
2453         ld      [$ap+32+8],$acc1
2454         or      $a3,$t3,$a3
2455         ld      [$ap+32+12],$t1
2456         ld      [$ap+32+16],$acc2
2457         ld      [$ap+32+20],$t2
2458         ld      [$ap+32+24],$acc3
2459         sllx    $t0,32,$t0
2460         ld      [$ap+32+28],$t3
2461         sllx    $t1,32,$t1
2462         stx     $a0,[%sp+LOCALS64+$in1_x]
2463         sllx    $t2,32,$t2
2464         stx     $a1,[%sp+LOCALS64+$in1_x+8]
2465         sllx    $t3,32,$t3
2466         stx     $a2,[%sp+LOCALS64+$in1_x+16]
2467         or      $acc0,$t0,$acc0
2468         stx     $a3,[%sp+LOCALS64+$in1_x+24]
2469         or      $acc1,$t1,$acc1
2470         stx     $acc0,[%sp+LOCALS64+$in1_y]
2471         or      $acc2,$t2,$acc2
2472         stx     $acc1,[%sp+LOCALS64+$in1_y+8]
2473         or      $acc3,$t3,$acc3
2474         stx     $acc2,[%sp+LOCALS64+$in1_y+16]
2475         stx     $acc3,[%sp+LOCALS64+$in1_y+24]
2476
2477         or      $a1,$a0,$a0
2478         or      $a3,$a2,$a2
2479         or      $acc1,$acc0,$acc0
2480         or      $acc3,$acc2,$acc2
2481         or      $a2,$a0,$a0
2482         or      $acc2,$acc0,$acc0
2483         or      $acc0,$a0,$a0
2484         movrnz  $a0,-1,$a0                      ! !in1infty
2485         stx     $a0,[%fp+STACK_BIAS-16]
2486
2487         ldx     [%sp+LOCALS64+$in2_z],$a0       ! forward load
2488         ldx     [%sp+LOCALS64+$in2_z+8],$a1
2489         ldx     [%sp+LOCALS64+$in2_z+16],$a2
2490         ldx     [%sp+LOCALS64+$in2_z+24],$a3
2491
2492         ld      [$ap+64],$acc0                  ! in1_z
2493         ld      [$ap+64+4],$t0
2494         ld      [$ap+64+8],$acc1
2495         ld      [$ap+64+12],$t1
2496         ld      [$ap+64+16],$acc2
2497         ld      [$ap+64+20],$t2
2498         ld      [$ap+64+24],$acc3
2499         ld      [$ap+64+28],$t3
2500         sllx    $t0,32,$t0
2501         sllx    $t1,32,$t1
2502         or      $acc0,$t0,$acc0
2503         sllx    $t2,32,$t2
2504         or      $acc1,$t1,$acc1
2505         sllx    $t3,32,$t3
2506         stx     $acc0,[%sp+LOCALS64+$in1_z]
2507         or      $acc2,$t2,$acc2
2508         stx     $acc1,[%sp+LOCALS64+$in1_z+8]
2509         or      $acc3,$t3,$acc3
2510         stx     $acc2,[%sp+LOCALS64+$in1_z+16]
2511         stx     $acc3,[%sp+LOCALS64+$in1_z+24]
2512
2513         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Z2sqr, in2_z);
2514         add     %sp,LOCALS64+$Z2sqr,$rp
2515
2516         ldx     [%sp+LOCALS64+$in1_z],$a0
2517         ldx     [%sp+LOCALS64+$in1_z+8],$a1
2518         ldx     [%sp+LOCALS64+$in1_z+16],$a2
2519         ldx     [%sp+LOCALS64+$in1_z+24],$a3
2520         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Z1sqr, in1_z);
2521         add     %sp,LOCALS64+$Z1sqr,$rp
2522
2523         ldx     [%sp+LOCALS64+$Z2sqr],$bi
2524         ldx     [%sp+LOCALS64+$in2_z],$a0
2525         ldx     [%sp+LOCALS64+$in2_z+8],$a1
2526         ldx     [%sp+LOCALS64+$in2_z+16],$a2
2527         ldx     [%sp+LOCALS64+$in2_z+24],$a3
2528         add     %sp,LOCALS64+$Z2sqr,$bp
2529         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S1, Z2sqr, in2_z);
2530         add     %sp,LOCALS64+$S1,$rp
2531
2532         ldx     [%sp+LOCALS64+$Z1sqr],$bi
2533         ldx     [%sp+LOCALS64+$in1_z],$a0
2534         ldx     [%sp+LOCALS64+$in1_z+8],$a1
2535         ldx     [%sp+LOCALS64+$in1_z+16],$a2
2536         ldx     [%sp+LOCALS64+$in1_z+24],$a3
2537         add     %sp,LOCALS64+$Z1sqr,$bp
2538         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, Z1sqr, in1_z);
2539         add     %sp,LOCALS64+$S2,$rp
2540
2541         ldx     [%sp+LOCALS64+$S1],$bi
2542         ldx     [%sp+LOCALS64+$in1_y],$a0
2543         ldx     [%sp+LOCALS64+$in1_y+8],$a1
2544         ldx     [%sp+LOCALS64+$in1_y+16],$a2
2545         ldx     [%sp+LOCALS64+$in1_y+24],$a3
2546         add     %sp,LOCALS64+$S1,$bp
2547         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S1, S1, in1_y);
2548         add     %sp,LOCALS64+$S1,$rp
2549
2550         ldx     [%sp+LOCALS64+$S2],$bi
2551         ldx     [%sp+LOCALS64+$in2_y],$a0
2552         ldx     [%sp+LOCALS64+$in2_y+8],$a1
2553         ldx     [%sp+LOCALS64+$in2_y+16],$a2
2554         ldx     [%sp+LOCALS64+$in2_y+24],$a3
2555         add     %sp,LOCALS64+$S2,$bp
2556         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, S2, in2_y);
2557         add     %sp,LOCALS64+$S2,$rp
2558
2559         ldx     [%sp+LOCALS64+$Z2sqr],$bi       ! forward load
2560         ldx     [%sp+LOCALS64+$in1_x],$a0
2561         ldx     [%sp+LOCALS64+$in1_x+8],$a1
2562         ldx     [%sp+LOCALS64+$in1_x+16],$a2
2563         ldx     [%sp+LOCALS64+$in1_x+24],$a3
2564
2565         add     %sp,LOCALS64+$S1,$bp
2566         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(R, S2, S1);
2567         add     %sp,LOCALS64+$R,$rp
2568
2569         or      $acc1,$acc0,$acc0               ! see if result is zero
2570         or      $acc3,$acc2,$acc2
2571         or      $acc2,$acc0,$acc0
2572         stx     $acc0,[%fp+STACK_BIAS-24]
2573
2574         add     %sp,LOCALS64+$Z2sqr,$bp
2575         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(U1, in1_x, Z2sqr);
2576         add     %sp,LOCALS64+$U1,$rp
2577
2578         ldx     [%sp+LOCALS64+$Z1sqr],$bi
2579         ldx     [%sp+LOCALS64+$in2_x],$a0
2580         ldx     [%sp+LOCALS64+$in2_x+8],$a1
2581         ldx     [%sp+LOCALS64+$in2_x+16],$a2
2582         ldx     [%sp+LOCALS64+$in2_x+24],$a3
2583         add     %sp,LOCALS64+$Z1sqr,$bp
2584         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(U2, in2_x, Z1sqr);
2585         add     %sp,LOCALS64+$U2,$rp
2586
2587         ldx     [%sp+LOCALS64+$R],$a0           ! forward load
2588         ldx     [%sp+LOCALS64+$R+8],$a1
2589         ldx     [%sp+LOCALS64+$R+16],$a2
2590         ldx     [%sp+LOCALS64+$R+24],$a3
2591
2592         add     %sp,LOCALS64+$U1,$bp
2593         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(H, U2, U1);
2594         add     %sp,LOCALS64+$H,$rp
2595
2596         or      $acc1,$acc0,$acc0               ! see if result is zero
2597         or      $acc3,$acc2,$acc2
2598         orcc    $acc2,$acc0,$acc0
2599
2600         bne,pt  %xcc,.Ladd_proceed_vis3         ! is_equal(U1,U2)?
2601         nop
2602
2603         ldx     [%fp+STACK_BIAS-8],$t0
2604         ldx     [%fp+STACK_BIAS-16],$t1
2605         ldx     [%fp+STACK_BIAS-24],$t2
2606         andcc   $t0,$t1,%g0
2607         be,pt   %xcc,.Ladd_proceed_vis3         ! (in1infty || in2infty)?
2608         nop
2609         andcc   $t2,$t2,%g0
2610         be,a,pt %xcc,.Ldouble_shortcut_vis3     ! is_equal(S1,S2)?
2611         add     %sp,32*(12-10)+32,%sp           ! difference in frame sizes
2612
2613         st      %g0,[$rp_real]
2614         st      %g0,[$rp_real+4]
2615         st      %g0,[$rp_real+8]
2616         st      %g0,[$rp_real+12]
2617         st      %g0,[$rp_real+16]
2618         st      %g0,[$rp_real+20]
2619         st      %g0,[$rp_real+24]
2620         st      %g0,[$rp_real+28]
2621         st      %g0,[$rp_real+32]
2622         st      %g0,[$rp_real+32+4]
2623         st      %g0,[$rp_real+32+8]
2624         st      %g0,[$rp_real+32+12]
2625         st      %g0,[$rp_real+32+16]
2626         st      %g0,[$rp_real+32+20]
2627         st      %g0,[$rp_real+32+24]
2628         st      %g0,[$rp_real+32+28]
2629         st      %g0,[$rp_real+64]
2630         st      %g0,[$rp_real+64+4]
2631         st      %g0,[$rp_real+64+8]
2632         st      %g0,[$rp_real+64+12]
2633         st      %g0,[$rp_real+64+16]
2634         st      %g0,[$rp_real+64+20]
2635         st      %g0,[$rp_real+64+24]
2636         st      %g0,[$rp_real+64+28]
2637         b       .Ladd_done_vis3
2638         nop
2639
2640 .align  16
2641 .Ladd_proceed_vis3:
2642         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Rsqr, R);
2643         add     %sp,LOCALS64+$Rsqr,$rp
2644
2645         ldx     [%sp+LOCALS64+$H],$bi
2646         ldx     [%sp+LOCALS64+$in1_z],$a0
2647         ldx     [%sp+LOCALS64+$in1_z+8],$a1
2648         ldx     [%sp+LOCALS64+$in1_z+16],$a2
2649         ldx     [%sp+LOCALS64+$in1_z+24],$a3
2650         add     %sp,LOCALS64+$H,$bp
2651         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(res_z, H, in1_z);
2652         add     %sp,LOCALS64+$res_z,$rp
2653
2654         ldx     [%sp+LOCALS64+$H],$a0
2655         ldx     [%sp+LOCALS64+$H+8],$a1
2656         ldx     [%sp+LOCALS64+$H+16],$a2
2657         ldx     [%sp+LOCALS64+$H+24],$a3
2658         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Hsqr, H);
2659         add     %sp,LOCALS64+$Hsqr,$rp
2660
2661         ldx     [%sp+LOCALS64+$res_z],$bi
2662         ldx     [%sp+LOCALS64+$in2_z],$a0
2663         ldx     [%sp+LOCALS64+$in2_z+8],$a1
2664         ldx     [%sp+LOCALS64+$in2_z+16],$a2
2665         ldx     [%sp+LOCALS64+$in2_z+24],$a3
2666         add     %sp,LOCALS64+$res_z,$bp
2667         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(res_z, res_z, in2_z);
2668         add     %sp,LOCALS64+$res_z,$rp
2669
2670         ldx     [%sp+LOCALS64+$H],$bi
2671         ldx     [%sp+LOCALS64+$Hsqr],$a0
2672         ldx     [%sp+LOCALS64+$Hsqr+8],$a1
2673         ldx     [%sp+LOCALS64+$Hsqr+16],$a2
2674         ldx     [%sp+LOCALS64+$Hsqr+24],$a3
2675         add     %sp,LOCALS64+$H,$bp
2676         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(Hcub, Hsqr, H);
2677         add     %sp,LOCALS64+$Hcub,$rp
2678
2679         ldx     [%sp+LOCALS64+$U1],$bi
2680         ldx     [%sp+LOCALS64+$Hsqr],$a0
2681         ldx     [%sp+LOCALS64+$Hsqr+8],$a1
2682         ldx     [%sp+LOCALS64+$Hsqr+16],$a2
2683         ldx     [%sp+LOCALS64+$Hsqr+24],$a3
2684         add     %sp,LOCALS64+$U1,$bp
2685         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(U2, U1, Hsqr);
2686         add     %sp,LOCALS64+$U2,$rp
2687
2688         call    __ecp_nistz256_mul_by_2_vis3    ! p256_mul_by_2(Hsqr, U2);
2689         add     %sp,LOCALS64+$Hsqr,$rp
2690
2691         add     %sp,LOCALS64+$Rsqr,$bp
2692         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(res_x, Rsqr, Hsqr);
2693         add     %sp,LOCALS64+$res_x,$rp
2694
2695         add     %sp,LOCALS64+$Hcub,$bp
2696         call    __ecp_nistz256_sub_from_vis3    !  p256_sub(res_x, res_x, Hcub);
2697         add     %sp,LOCALS64+$res_x,$rp
2698
2699         ldx     [%sp+LOCALS64+$S1],$bi          ! forward load
2700         ldx     [%sp+LOCALS64+$Hcub],$a0
2701         ldx     [%sp+LOCALS64+$Hcub+8],$a1
2702         ldx     [%sp+LOCALS64+$Hcub+16],$a2
2703         ldx     [%sp+LOCALS64+$Hcub+24],$a3
2704
2705         add     %sp,LOCALS64+$U2,$bp
2706         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(res_y, U2, res_x);
2707         add     %sp,LOCALS64+$res_y,$rp
2708
2709         add     %sp,LOCALS64+$S1,$bp
2710         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, S1, Hcub);
2711         add     %sp,LOCALS64+$S2,$rp
2712
2713         ldx     [%sp+LOCALS64+$R],$bi
2714         ldx     [%sp+LOCALS64+$res_y],$a0
2715         ldx     [%sp+LOCALS64+$res_y+8],$a1
2716         ldx     [%sp+LOCALS64+$res_y+16],$a2
2717         ldx     [%sp+LOCALS64+$res_y+24],$a3
2718         add     %sp,LOCALS64+$R,$bp
2719         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(res_y, res_y, R);
2720         add     %sp,LOCALS64+$res_y,$rp
2721
2722         add     %sp,LOCALS64+$S2,$bp
2723         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(res_y, res_y, S2);
2724         add     %sp,LOCALS64+$res_y,$rp
2725
2726         ldx     [%fp+STACK_BIAS-16],$t1         ! !in1infty
2727         ldx     [%fp+STACK_BIAS-8],$t2          ! !in2infty
2728 ___
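# The conditional moves below select, limb by limb, in2 when in1 is the
# point at infinity, in1 when in2 is, and the computed result otherwise.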
2729 for($i=0;$i<96;$i+=16) {                        # conditional moves
2730 $code.=<<___;
2731         ldx     [%sp+LOCALS64+$res_x+$i],$acc0  ! res
2732         ldx     [%sp+LOCALS64+$res_x+$i+8],$acc1
2733         ldx     [%sp+LOCALS64+$in2_x+$i],$acc2  ! in2
2734         ldx     [%sp+LOCALS64+$in2_x+$i+8],$acc3
2735         ldx     [%sp+LOCALS64+$in1_x+$i],$acc4  ! in1
2736         ldx     [%sp+LOCALS64+$in1_x+$i+8],$acc5
2737         movrz   $t1,$acc2,$acc0
2738         movrz   $t1,$acc3,$acc1
2739         movrz   $t2,$acc4,$acc0
2740         movrz   $t2,$acc5,$acc1
2741         srlx    $acc0,32,$acc2
2742         srlx    $acc1,32,$acc3
2743         st      $acc0,[$rp_real+$i]
2744         st      $acc2,[$rp_real+$i+4]
2745         st      $acc1,[$rp_real+$i+8]
2746         st      $acc3,[$rp_real+$i+12]
2747 ___
2748 }
2749 $code.=<<___;
2750 .Ladd_done_vis3:
2751         ret
2752         restore
2753 .type   ecp_nistz256_point_add_vis3,#function
2754 .size   ecp_nistz256_point_add_vis3,.-ecp_nistz256_point_add_vis3
2755 ___
2756 }
2757 ########################################################################
2758 # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
2759 #                                    const P256_POINT_AFFINE *in2);
2760 {
2761 my ($res_x,$res_y,$res_z,
2762     $in1_x,$in1_y,$in1_z,
2763     $in2_x,$in2_y,
2764     $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
2765 my $Z1sqr = $S2;
2766 # above map() describes stack layout with 15 temporary
2767 # 256-bit vectors on top. Then we reserve some space for
2768 # !in1infty and !in2infty.
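#
# For reference (a sketch, not generated code): with the affine input's Z2
# implicitly 1, the addition formulas simplify to
#	U2 = X2*Z1^2,  S2 = Y2*Z1^3,  H = U2 - X1,  R = S2 - Y1
#	X3 = R^2 - H^3 - 2*X1*H^2,  Y3 = R*(X1*H^2 - X3) - Y1*H^3,  Z3 = Z1*H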
2769
2770 $code.=<<___;
2771 .align  32
2772 ecp_nistz256_point_add_affine_vis3:
2773         save    %sp,-STACK64_FRAME-32*15-32,%sp
2774
2775         mov     $rp,$rp_real
2776         mov     -1,$minus1
2777         mov     -2,$poly3
2778         sllx    $minus1,32,$poly1               ! 0xFFFFFFFF00000000
2779         srl     $poly3,0,$poly3                 ! 0x00000000FFFFFFFE
2780
2781         ! convert input to uint64_t[4]
2782         ld      [$bp],$a0                       ! in2_x
2783         ld      [$bp+4],$t0
2784         ld      [$bp+8],$a1
2785         ld      [$bp+12],$t1
2786         ld      [$bp+16],$a2
2787         ld      [$bp+20],$t2
2788         ld      [$bp+24],$a3
2789         ld      [$bp+28],$t3
2790         sllx    $t0,32,$t0
2791         sllx    $t1,32,$t1
2792         ld      [$bp+32],$acc0                  ! in2_y
2793         or      $a0,$t0,$a0
2794         ld      [$bp+32+4],$t0
2795         sllx    $t2,32,$t2
2796         ld      [$bp+32+8],$acc1
2797         or      $a1,$t1,$a1
2798         ld      [$bp+32+12],$t1
2799         sllx    $t3,32,$t3
2800         ld      [$bp+32+16],$acc2
2801         or      $a2,$t2,$a2
2802         ld      [$bp+32+20],$t2
2803         or      $a3,$t3,$a3
2804         ld      [$bp+32+24],$acc3
2805         sllx    $t0,32,$t0
2806         ld      [$bp+32+28],$t3
2807         sllx    $t1,32,$t1
2808         stx     $a0,[%sp+LOCALS64+$in2_x]
2809         sllx    $t2,32,$t2
2810         stx     $a1,[%sp+LOCALS64+$in2_x+8]
2811         sllx    $t3,32,$t3
2812         stx     $a2,[%sp+LOCALS64+$in2_x+16]
2813         or      $acc0,$t0,$acc0
2814         stx     $a3,[%sp+LOCALS64+$in2_x+24]
2815         or      $acc1,$t1,$acc1
2816         stx     $acc0,[%sp+LOCALS64+$in2_y]
2817         or      $acc2,$t2,$acc2
2818         stx     $acc1,[%sp+LOCALS64+$in2_y+8]
2819         or      $acc3,$t3,$acc3
2820         stx     $acc2,[%sp+LOCALS64+$in2_y+16]
2821         stx     $acc3,[%sp+LOCALS64+$in2_y+24]
2822
2823         or      $a1,$a0,$a0
2824         or      $a3,$a2,$a2
2825         or      $acc1,$acc0,$acc0
2826         or      $acc3,$acc2,$acc2
2827         or      $a2,$a0,$a0
2828         or      $acc2,$acc0,$acc0
2829         or      $acc0,$a0,$a0
2830         movrnz  $a0,-1,$a0                      ! !in2infty
2831         stx     $a0,[%fp+STACK_BIAS-8]
2832
2833         ld      [$ap],$a0                       ! in1_x
2834         ld      [$ap+4],$t0
2835         ld      [$ap+8],$a1
2836         ld      [$ap+12],$t1
2837         ld      [$ap+16],$a2
2838         ld      [$ap+20],$t2
2839         ld      [$ap+24],$a3
2840         ld      [$ap+28],$t3
2841         sllx    $t0,32,$t0
2842         sllx    $t1,32,$t1
2843         ld      [$ap+32],$acc0                  ! in1_y
2844         or      $a0,$t0,$a0
2845         ld      [$ap+32+4],$t0
2846         sllx    $t2,32,$t2
2847         ld      [$ap+32+8],$acc1
2848         or      $a1,$t1,$a1
2849         ld      [$ap+32+12],$t1
2850         sllx    $t3,32,$t3
2851         ld      [$ap+32+16],$acc2
2852         or      $a2,$t2,$a2
2853         ld      [$ap+32+20],$t2
2854         or      $a3,$t3,$a3
2855         ld      [$ap+32+24],$acc3
2856         sllx    $t0,32,$t0
2857         ld      [$ap+32+28],$t3
2858         sllx    $t1,32,$t1
2859         stx     $a0,[%sp+LOCALS64+$in1_x]
2860         sllx    $t2,32,$t2
2861         stx     $a1,[%sp+LOCALS64+$in1_x+8]
2862         sllx    $t3,32,$t3
2863         stx     $a2,[%sp+LOCALS64+$in1_x+16]
2864         or      $acc0,$t0,$acc0
2865         stx     $a3,[%sp+LOCALS64+$in1_x+24]
2866         or      $acc1,$t1,$acc1
2867         stx     $acc0,[%sp+LOCALS64+$in1_y]
2868         or      $acc2,$t2,$acc2
2869         stx     $acc1,[%sp+LOCALS64+$in1_y+8]
2870         or      $acc3,$t3,$acc3
2871         stx     $acc2,[%sp+LOCALS64+$in1_y+16]
2872         stx     $acc3,[%sp+LOCALS64+$in1_y+24]
2873
2874         or      $a1,$a0,$a0
2875         or      $a3,$a2,$a2
2876         or      $acc1,$acc0,$acc0
2877         or      $acc3,$acc2,$acc2
2878         or      $a2,$a0,$a0
2879         or      $acc2,$acc0,$acc0
2880         or      $acc0,$a0,$a0
2881         movrnz  $a0,-1,$a0                      ! !in1infty
2882         stx     $a0,[%fp+STACK_BIAS-16]
2883
2884         ld      [$ap+64],$a0                    ! in1_z
2885         ld      [$ap+64+4],$t0
2886         ld      [$ap+64+8],$a1
2887         ld      [$ap+64+12],$t1
2888         ld      [$ap+64+16],$a2
2889         ld      [$ap+64+20],$t2
2890         ld      [$ap+64+24],$a3
2891         ld      [$ap+64+28],$t3
2892         sllx    $t0,32,$t0
2893         sllx    $t1,32,$t1
2894         or      $a0,$t0,$a0
2895         sllx    $t2,32,$t2
2896         or      $a1,$t1,$a1
2897         sllx    $t3,32,$t3
2898         stx     $a0,[%sp+LOCALS64+$in1_z]
2899         or      $a2,$t2,$a2
2900         stx     $a1,[%sp+LOCALS64+$in1_z+8]
2901         or      $a3,$t3,$a3
2902         stx     $a2,[%sp+LOCALS64+$in1_z+16]
2903         stx     $a3,[%sp+LOCALS64+$in1_z+24]
2904
	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Z1sqr, in1_z);
	add	%sp,LOCALS64+$Z1sqr,$rp

	ldx	[%sp+LOCALS64+$in2_x],$bi
	mov	$acc0,$a0
	mov	$acc1,$a1
	mov	$acc2,$a2
	mov	$acc3,$a3
	add	%sp,LOCALS64+$in2_x,$bp
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(U2, Z1sqr, in2_x);
	add	%sp,LOCALS64+$U2,$rp

	ldx	[%sp+LOCALS64+$Z1sqr],$bi	! forward load
	ldx	[%sp+LOCALS64+$in1_z],$a0
	ldx	[%sp+LOCALS64+$in1_z+8],$a1
	ldx	[%sp+LOCALS64+$in1_z+16],$a2
	ldx	[%sp+LOCALS64+$in1_z+24],$a3

	add	%sp,LOCALS64+$in1_x,$bp
	call	__ecp_nistz256_sub_from_vis3	! p256_sub(H, U2, in1_x);
	add	%sp,LOCALS64+$H,$rp

	add	%sp,LOCALS64+$Z1sqr,$bp
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S2, Z1sqr, in1_z);
	add	%sp,LOCALS64+$S2,$rp

	ldx	[%sp+LOCALS64+$H],$bi
	ldx	[%sp+LOCALS64+$in1_z],$a0
	ldx	[%sp+LOCALS64+$in1_z+8],$a1
	ldx	[%sp+LOCALS64+$in1_z+16],$a2
	ldx	[%sp+LOCALS64+$in1_z+24],$a3
	add	%sp,LOCALS64+$H,$bp
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(res_z, H, in1_z);
	add	%sp,LOCALS64+$res_z,$rp

	ldx	[%sp+LOCALS64+$S2],$bi
	ldx	[%sp+LOCALS64+$in2_y],$a0
	ldx	[%sp+LOCALS64+$in2_y+8],$a1
	ldx	[%sp+LOCALS64+$in2_y+16],$a2
	ldx	[%sp+LOCALS64+$in2_y+24],$a3
	add	%sp,LOCALS64+$S2,$bp
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S2, S2, in2_y);
	add	%sp,LOCALS64+$S2,$rp

	ldx	[%sp+LOCALS64+$H],$a0		! forward load
	ldx	[%sp+LOCALS64+$H+8],$a1
	ldx	[%sp+LOCALS64+$H+16],$a2
	ldx	[%sp+LOCALS64+$H+24],$a3

	add	%sp,LOCALS64+$in1_y,$bp
	call	__ecp_nistz256_sub_from_vis3	! p256_sub(R, S2, in1_y);
	add	%sp,LOCALS64+$R,$rp

	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Hsqr, H);
	add	%sp,LOCALS64+$Hsqr,$rp

	ldx	[%sp+LOCALS64+$R],$a0
	ldx	[%sp+LOCALS64+$R+8],$a1
	ldx	[%sp+LOCALS64+$R+16],$a2
	ldx	[%sp+LOCALS64+$R+24],$a3
	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Rsqr, R);
	add	%sp,LOCALS64+$Rsqr,$rp

	ldx	[%sp+LOCALS64+$H],$bi
	ldx	[%sp+LOCALS64+$Hsqr],$a0
	ldx	[%sp+LOCALS64+$Hsqr+8],$a1
	ldx	[%sp+LOCALS64+$Hsqr+16],$a2
	ldx	[%sp+LOCALS64+$Hsqr+24],$a3
	add	%sp,LOCALS64+$H,$bp
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(Hcub, Hsqr, H);
	add	%sp,LOCALS64+$Hcub,$rp

	ldx	[%sp+LOCALS64+$Hsqr],$bi
	ldx	[%sp+LOCALS64+$in1_x],$a0
	ldx	[%sp+LOCALS64+$in1_x+8],$a1
	ldx	[%sp+LOCALS64+$in1_x+16],$a2
	ldx	[%sp+LOCALS64+$in1_x+24],$a3
	add	%sp,LOCALS64+$Hsqr,$bp
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(U2, in1_x, Hsqr);
	add	%sp,LOCALS64+$U2,$rp

	call	__ecp_nistz256_mul_by_2_vis3	! p256_mul_by_2(Hsqr, U2);
	add	%sp,LOCALS64+$Hsqr,$rp

	add	%sp,LOCALS64+$Rsqr,$bp
	call	__ecp_nistz256_sub_morf_vis3	! p256_sub(res_x, Rsqr, Hsqr);
	add	%sp,LOCALS64+$res_x,$rp

	add	%sp,LOCALS64+$Hcub,$bp
	call	__ecp_nistz256_sub_from_vis3	! p256_sub(res_x, res_x, Hcub);
	add	%sp,LOCALS64+$res_x,$rp

	ldx	[%sp+LOCALS64+$Hcub],$bi	! forward load
	ldx	[%sp+LOCALS64+$in1_y],$a0
	ldx	[%sp+LOCALS64+$in1_y+8],$a1
	ldx	[%sp+LOCALS64+$in1_y+16],$a2
	ldx	[%sp+LOCALS64+$in1_y+24],$a3

	add	%sp,LOCALS64+$U2,$bp
	call	__ecp_nistz256_sub_morf_vis3	! p256_sub(res_y, U2, res_x);
	add	%sp,LOCALS64+$res_y,$rp

	add	%sp,LOCALS64+$Hcub,$bp
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S2, in1_y, Hcub);
	add	%sp,LOCALS64+$S2,$rp

	ldx	[%sp+LOCALS64+$R],$bi
	ldx	[%sp+LOCALS64+$res_y],$a0
	ldx	[%sp+LOCALS64+$res_y+8],$a1
	ldx	[%sp+LOCALS64+$res_y+16],$a2
	ldx	[%sp+LOCALS64+$res_y+24],$a3
	add	%sp,LOCALS64+$R,$bp
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(res_y, res_y, R);
	add	%sp,LOCALS64+$res_y,$rp

	add	%sp,LOCALS64+$S2,$bp
	call	__ecp_nistz256_sub_from_vis3	! p256_sub(res_y, res_y, S2);
	add	%sp,LOCALS64+$res_y,$rp

	ldx	[%fp+STACK_BIAS-16],$t1		! !in1infty
	ldx	[%fp+STACK_BIAS-8],$t2		! !in2infty
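	! pick up the address of .Lone_mont_vis3, the value 1 in the
	! Montgomery domain, PC-relatively; it stands in for in2_z in the
	! selection loop below, since in2 is an affine point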
1:	call	.+8
	add	%o7,.Lone_mont_vis3-1b,$bp
___
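# Branch-free selection of the result. Each chunk of the output is taken
# from res_* by default, replaced with in2 (x and y from the caller, z from
# .Lone_mont_vis3) when in1 was the point at infinity ($t1 == 0), or with
# in1 when in2 was the point at infinity ($t2 == 0). As an illustrative
# sketch only (this is not emitted anywhere):
#
#	out = in2infty ? in1 : (in1infty ? in2 : res);
#
# The selected 64-bit limbs are then split back into 32-bit words before
# being stored to the caller's buffer.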
for($i=0;$i<64;$i+=16) {			# conditional moves
$code.=<<___;
	ldx	[%sp+LOCALS64+$res_x+$i],$acc0	! res
	ldx	[%sp+LOCALS64+$res_x+$i+8],$acc1
	ldx	[%sp+LOCALS64+$in2_x+$i],$acc2	! in2
	ldx	[%sp+LOCALS64+$in2_x+$i+8],$acc3
	ldx	[%sp+LOCALS64+$in1_x+$i],$acc4	! in1
	ldx	[%sp+LOCALS64+$in1_x+$i+8],$acc5
	movrz	$t1,$acc2,$acc0
	movrz	$t1,$acc3,$acc1
	movrz	$t2,$acc4,$acc0
	movrz	$t2,$acc5,$acc1
	srlx	$acc0,32,$acc2
	srlx	$acc1,32,$acc3
	st	$acc0,[$rp_real+$i]
	st	$acc2,[$rp_real+$i+4]
	st	$acc1,[$rp_real+$i+8]
	st	$acc3,[$rp_real+$i+12]
___
}
for(;$i<96;$i+=16) {
$code.=<<___;
	ldx	[%sp+LOCALS64+$res_x+$i],$acc0	! res
	ldx	[%sp+LOCALS64+$res_x+$i+8],$acc1
	ldx	[$bp+$i-64],$acc2		! "in2"
	ldx	[$bp+$i-64+8],$acc3
	ldx	[%sp+LOCALS64+$in1_x+$i],$acc4	! in1
	ldx	[%sp+LOCALS64+$in1_x+$i+8],$acc5
	movrz	$t1,$acc2,$acc0
	movrz	$t1,$acc3,$acc1
	movrz	$t2,$acc4,$acc0
	movrz	$t2,$acc5,$acc1
	srlx	$acc0,32,$acc2
	srlx	$acc1,32,$acc3
	st	$acc0,[$rp_real+$i]
	st	$acc2,[$rp_real+$i+4]
	st	$acc1,[$rp_real+$i+8]
	st	$acc3,[$rp_real+$i+12]
___
}
$code.=<<___;
	ret
	restore
.type	ecp_nistz256_point_add_affine_vis3,#function
.size	ecp_nistz256_point_add_affine_vis3,.-ecp_nistz256_point_add_affine_vis3
.align	64
.Lone_mont_vis3:
.long	0x00000000,0x00000001, 0xffffffff,0x00000000
.long	0xffffffff,0xffffffff, 0x00000000,0xfffffffe
.align	64
___
}								}}}
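# .Lone_mont_vis3 above is 1 in the Montgomery domain, i.e. 2**256 mod P,
# laid out as four 64-bit words, least significant first, each word split
# into big-endian .long halves. The helper below is a minimal self-check
# sketch recomputing the constant with the core Math::BigInt module; it is
# illustrative only and is never called during the build.
sub one_mont_selfcheck {
	use Math::BigInt;
	my $p = Math::BigInt->new(
	    "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
	my $one_mont = Math::BigInt->new(1)->blsft(256)->bmod($p);
	return $one_mont->as_hex();	# 0xfffffffeffffffffffffffffffffffff000000000000000000000001
}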
\f
# The purpose of these subroutines is to encode VIS instructions explicitly,
# so that the module can be compiled without specifying VIS extensions on
# the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. The idea is to
# keep open the option of producing a "universal" binary and to let the
# programmer detect at run-time whether the current CPU is VIS-capable.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"umulxhi"	=> 0x016	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
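# For instance, fed through unvis3() above, "addxc %o1,%o2,%o3" maps to
# rs1=9, rs2=10, rd=11 (the "o" registers carry a bias of 8) with opf 0x011,
# and is emitted as
#	.word	0x97b2422a !addxc	%o1,%o2,%o3
# i.e. 0x81b00000|11<<25|9<<14|0x011<<5|10.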

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

close STDOUT;