1 #!/usr/bin/env perl
2
3 #******************************************************************************
4 #* Copyright(c) 2012, Intel Corp.                                             
5 #* Developers and authors:                                                    
6 #* Shay Gueron (1, 2), and Vlad Krasnov (1)                                   
7 #* (1) Intel Corporation, Israel Development Center, Haifa, Israel
8 #* (2) University of Haifa, Israel                                              
9 #******************************************************************************
10 #* LICENSE:                                                                
11 #* This submission to OpenSSL is to be made available under the OpenSSL  
12 #* license, and only to the OpenSSL project, in order to allow integration    
13 #* into the publicly distributed code. 
14 #* The use of this code, or portions of this code, or concepts embedded in
15 #* this code, or modification of this code and/or algorithm(s) in it, or the
16 #* use of this code for any other purpose than stated above, requires special
17 #* licensing.                                                                  
18 #******************************************************************************
19 #* DISCLAIMER:                                                                
20 #* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS     
21 #* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 
22 #* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
23 #* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT
24 #* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, 
25 #* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF    
26 #* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS   
27 #* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN    
28 #* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)    
29 #* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
30 #* POSSIBILITY OF SUCH DAMAGE.                                                
31 #******************************************************************************
32 #* Reference:                                                                 
33 #* [1]  S. Gueron, V. Krasnov: "Software Implementation of Modular
34 #*      Exponentiation,  Using Advanced Vector Instructions Architectures",
35 #*      F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,
36 #*      pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012
37 #* [2]  S. Gueron: "Efficient Software Implementations of Modular
38 #*      Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
39 #* [3]  S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring", IEEE
40 #*      Proceedings of the 9th International Conference on Information Technology:
41 #*      New Generations (ITNG 2012), pp. 821-823 (2012)
42 #* [4]  S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
43 #*      resistant 1024-bit modular exponentiation, for optimizing RSA2048
44 #*      on AVX2 capable x86_64 platforms",
45 #*      http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest
46 #******************************************************************************
47
48 # +10% improvement by <appro@openssl.org>
49 #
50 # rsa2048 sign/sec      OpenSSL 1.0.1   scalar(*)       this
51 # 2GHz Haswell          544             632/+16%        947/+74%
52 #
53 # (*)   scalar column shows what is used if the system doesn't support AVX2; given for reference purposes.
54
55 $flavour = shift;
56 $output  = shift;
57 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
58
59 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
60
61 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
62 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
63 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
64 die "can't locate x86_64-xlate.pl";
65
66 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
67                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
68         $avx = ($1>=2.19) + ($1>=2.22);
69 }
70
71 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
72             `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
73         $avx = ($1>=2.09) + ($1>=2.10);
74 }
75
76 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
77             `ml64 2>&1` =~ /Version ([0-9]+)\./) {
78         $avx = ($1>=10) + ($1>=11);
79 }
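# $avx ends up as 2 only when the assembler looks recent enough to grok the
# AVX2 instructions used here; the AVX2 code below is generated only under
# the "$avx>1" guard further down.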
80
81 open OUT,"| $^X $xlate $flavour $output";
82 *STDOUT = *OUT;
83
84 if ($avx>1) {{{
85 { # void AMS_WW(
86 my $rp="%rdi";  # BN_ULONG *rp,
87 my $ap="%rsi";  # const BN_ULONG *ap,
88 my $np="%rdx";  # const BN_ULONG *np,
89 my $n0="%ecx";  # const BN_ULONG n0,
90 my $rep="%r8d"; # int repeat);
91
92 # The registers that hold the accumulated redundant result
93 # The AMM works on 1024-bit operands, and the redundant word size is 29
94 # Therefore: ceil(1024/29) = 36 digits, and 36/4 = 9 registers
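# A minimal illustrative sketch (not used by this module, name is made up)
# of the redundant representation described above: the 1024-bit operand is
# split into ceil(1024/29) = 36 digits of 29 bits each, and four digits go
# into every 256-bit register, hence the 9 $ACC registers below.
sub __redundant_digits_sketch {
	use Math::BigInt;
	my $x = Math::BigInt->new(shift);	# operand, 0 <= x < 2^1024
	my @digits;
	for (1..36) {				# ceil(1024/29) == 36 digits
		push @digits, $x->copy()->band(0x1fffffff);	# low 29 bits
		$x->brsft(29);			# shift to the next digit
	}
	return \@digits;			# 36 digits == 9 registers x 4 lanes
}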
95 my $ACC0="%ymm0";
96 my $ACC1="%ymm1";
97 my $ACC2="%ymm2";
98 my $ACC3="%ymm3";
99 my $ACC4="%ymm4";
100 my $ACC5="%ymm5";
101 my $ACC6="%ymm6";
102 my $ACC7="%ymm7";
103 my $ACC8="%ymm8";
104 my $ACC9="%ymm9";
105 # Registers that hold the broadcasted words of bp, currently used
106 my $B1="%ymm10";
107 my $B2="%ymm11";
108 # Registers that hold the broadcasted words of Y, currently used
109 my $Y1="%ymm12";
110 my $Y2="%ymm13";
111 # Helper registers
112 my $TEMP1="%ymm14";
113 my $AND_MASK="%ymm15";
114 # alu registers that hold the first words of the ACC
115 my $r0="%r9";
116 my $r1="%r10";
117 my $r2="%r11";
118 my $r3="%r12";
119
120 my $i="%r14d";                  # loop counter
121 my $tmp = "%r15";
122
123 my $FrameSize=32*18+32*8;       # place for A^2 and 2*A
124
125 my $aap=$r0;
126 my $tp0="%rbx";
127 my $tp1=$r3;
128
129 $np="%r13";                     # reassigned argument
130
131 $code.=<<___;
132 .text
133
134 .globl  rsaz_1024_sqr_avx2
135 .type   rsaz_1024_sqr_avx2,\@function,5
136 .align  64
137 rsaz_1024_sqr_avx2:             # 702 cycles, 14% faster than rsaz_1024_mul_avx2
138         lea     (%rsp), %rax
139         push    %rbx
140         push    %rbp
141         push    %r12
142         push    %r13
143         push    %r14
144         push    %r15
145 ___
146 $code.=<<___ if ($win64);
147         lea     -0xa8(%rsp),%rsp
148         movaps  %xmm6,-0xd8(%rax)
149         movaps  %xmm7,-0xc8(%rax)
150         movaps  %xmm8,-0xb8(%rax)
151         movaps  %xmm9,-0xa8(%rax)
152         movaps  %xmm10,-0x98(%rax)
153         movaps  %xmm11,-0x88(%rax)
154         movaps  %xmm12,-0x78(%rax)
155         movaps  %xmm13,-0x68(%rax)
156         movaps  %xmm14,-0x58(%rax)
157         movaps  %xmm15,-0x48(%rax)
158 .Lsqr_1024_body:
159 ___
160 $code.=<<___;
161         mov     %rax,%rbp
162         vzeroall
163         mov     %rdx, $np                       # reassigned argument
164         sub     \$$FrameSize, %rsp
165         mov     $np, $tmp
166         sub     \$-128, $rp                     # size optimization
167         sub     \$-128, $ap
168         sub     \$-128, $np
169
170         and     \$4095, $tmp                    # see if $np crosses page
171         add     \$32*10, $tmp
172         shr     \$12, $tmp
173         jz      .Lsqr_1024_no_n_copy
174
175         # an unaligned 256-bit load that crosses a page boundary can
176         # cause >2x performance degradation here, so if $np does
177         # cross a page boundary, copy it to the stack and make sure
178         # the stack frame doesn't...
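        # (the check above computes (($np & 4095) + 32*10) >> 12, which is
        #  non-zero whenever the 32*10 bytes at $np extend up to or past the
        #  next 4KiB boundary)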
179         sub             \$32*10,%rsp
180         vmovdqu         32*0-128($np), $ACC0
181         and             \$-2048, %rsp
182         vmovdqu         32*1-128($np), $ACC1
183         vmovdqu         32*2-128($np), $ACC2
184         vmovdqu         32*3-128($np), $ACC3
185         vmovdqu         32*4-128($np), $ACC4
186         vmovdqu         32*5-128($np), $ACC5
187         vmovdqu         32*6-128($np), $ACC6
188         vmovdqu         32*7-128($np), $ACC7
189         vmovdqu         32*8-128($np), $ACC8
190         lea             $FrameSize+128(%rsp),$np
191         vmovdqu         $ACC0, 32*0-128($np)
192         vmovdqu         $ACC1, 32*1-128($np)
193         vmovdqu         $ACC2, 32*2-128($np)
194         vmovdqu         $ACC3, 32*3-128($np)
195         vmovdqu         $ACC4, 32*4-128($np)
196         vmovdqu         $ACC5, 32*5-128($np)
197         vmovdqu         $ACC6, 32*6-128($np)
198         vmovdqu         $ACC7, 32*7-128($np)
199         vmovdqu         $ACC8, 32*8-128($np)
200         vmovdqu         $ACC9, 32*9-128($np)    # $ACC9 is zero after vzeroall
201
202 .Lsqr_1024_no_n_copy:
203         and             \$-1024, %rsp
204
205         vmovdqu         32*1-128($ap), $ACC1
206         vmovdqu         32*2-128($ap), $ACC2
207         vmovdqu         32*3-128($ap), $ACC3
208         vmovdqu         32*4-128($ap), $ACC4
209         vmovdqu         32*5-128($ap), $ACC5
210         vmovdqu         32*6-128($ap), $ACC6
211         vmovdqu         32*7-128($ap), $ACC7
212         vmovdqu         32*8-128($ap), $ACC8
213
214         lea     192(%rsp), $tp0                 # 64+128=192
215         vpbroadcastq    .Land_mask(%rip), $AND_MASK
216         jmp     .LOOP_GRANDE_SQR_1024
217
218 .align  32
219 .LOOP_GRANDE_SQR_1024:
220         lea     32*18+128(%rsp), $aap           # size optimization
221         lea     448(%rsp), $tp1                 # 64+128+256=448
222
223         # the squaring is performed as described in Variant B of
224         # "Speeding up Big-numbers Squaring" [3], so start by calculating
225         # the A*2=A+A vector
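        # (sketch of the identity being used:
        #     A^2 = sum_i a_i^2 * b^(2*i) + sum_{i<j} (2*a_i)*a_j * b^(i+j),
        #  so the doubled copy in $aap supplies the factor 2 of every cross
        #  product, while the diagonal terms are taken from $ap itself)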
226         vpaddq          $ACC1, $ACC1, $ACC1
227          vpbroadcastq   32*0-128($ap), $B1
228         vpaddq          $ACC2, $ACC2, $ACC2
229         vmovdqa         $ACC1, 32*0-128($aap)
230         vpaddq          $ACC3, $ACC3, $ACC3
231         vmovdqa         $ACC2, 32*1-128($aap)
232         vpaddq          $ACC4, $ACC4, $ACC4
233         vmovdqa         $ACC3, 32*2-128($aap)
234         vpaddq          $ACC5, $ACC5, $ACC5
235         vmovdqa         $ACC4, 32*3-128($aap)
236         vpaddq          $ACC6, $ACC6, $ACC6
237         vmovdqa         $ACC5, 32*4-128($aap)
238         vpaddq          $ACC7, $ACC7, $ACC7
239         vmovdqa         $ACC6, 32*5-128($aap)
240         vpaddq          $ACC8, $ACC8, $ACC8
241         vmovdqa         $ACC7, 32*6-128($aap)
242         vpxor           $ACC9, $ACC9, $ACC9
243         vmovdqa         $ACC8, 32*7-128($aap)
244
245         vpmuludq        32*0-128($ap), $B1, $ACC0
246          vpbroadcastq   32*1-128($ap), $B2
247          vmovdqu        $ACC9, 32*9-192($tp0)   # zero upper half
248         vpmuludq        $B1, $ACC1, $ACC1
249          vmovdqu        $ACC9, 32*10-448($tp1)
250         vpmuludq        $B1, $ACC2, $ACC2
251          vmovdqu        $ACC9, 32*11-448($tp1)
252         vpmuludq        $B1, $ACC3, $ACC3
253          vmovdqu        $ACC9, 32*12-448($tp1)
254         vpmuludq        $B1, $ACC4, $ACC4
255          vmovdqu        $ACC9, 32*13-448($tp1)
256         vpmuludq        $B1, $ACC5, $ACC5
257          vmovdqu        $ACC9, 32*14-448($tp1)
258         vpmuludq        $B1, $ACC6, $ACC6
259          vmovdqu        $ACC9, 32*15-448($tp1)
260         vpmuludq        $B1, $ACC7, $ACC7
261          vmovdqu        $ACC9, 32*16-448($tp1)
262         vpmuludq        $B1, $ACC8, $ACC8
263          vpbroadcastq   32*2-128($ap), $B1
264          vmovdqu        $ACC9, 32*17-448($tp1)
265
266         xor     $tmp, $tmp
267         mov     \$4, $i
268         jmp     .Lsqr_entry_1024
269 ___
270 $TEMP0=$Y1;
271 $TEMP2=$Y2;
272 $code.=<<___;
273 .align  32
274 .LOOP_SQR_1024:
275         vmovdqu         32*0(%rsp,$tmp), $TEMP0 # 32*0-192($tp0,$tmp)
276         vmovdqu         32*1(%rsp,$tmp), $TEMP1 # 32*1-192($tp0,$tmp)
277          vpbroadcastq   32*1-128($ap,$tmp), $B2
278         vpmuludq        32*0-128($ap), $B1, $ACC0
279         vmovdqu         32*2-192($tp0,$tmp), $TEMP2
280         vpaddq          $TEMP0, $ACC0, $ACC0
281         vpmuludq        32*0-128($aap), $B1, $ACC1
282         vmovdqu         32*3-192($tp0,$tmp), $TEMP0
283         vpaddq          $TEMP1, $ACC1, $ACC1
284         vpmuludq        32*1-128($aap), $B1, $ACC2
285         vmovdqu         32*4-192($tp0,$tmp), $TEMP1
286         vpaddq          $TEMP2, $ACC2, $ACC2
287         vpmuludq        32*2-128($aap), $B1, $ACC3
288         vmovdqu         32*5-192($tp0,$tmp), $TEMP2
289         vpaddq          $TEMP0, $ACC3, $ACC3
290         vpmuludq        32*3-128($aap), $B1, $ACC4
291         vmovdqu         32*6-192($tp0,$tmp), $TEMP0
292         vpaddq          $TEMP1, $ACC4, $ACC4
293         vpmuludq        32*4-128($aap), $B1, $ACC5
294         vmovdqu         32*7-192($tp0,$tmp), $TEMP1
295         vpaddq          $TEMP2, $ACC5, $ACC5
296         vpmuludq        32*5-128($aap), $B1, $ACC6
297         vmovdqu         32*8-192($tp0,$tmp), $TEMP2
298         vpaddq          $TEMP0, $ACC6, $ACC6
299         vpmuludq        32*6-128($aap), $B1, $ACC7
300         vpaddq          $TEMP1, $ACC7, $ACC7
301         vpmuludq        32*7-128($aap), $B1, $ACC8
302          vpbroadcastq   32*2-128($ap,$tmp), $B1
303         vpaddq          $TEMP2, $ACC8, $ACC8
304 .Lsqr_entry_1024:
305         vmovdqu         $ACC0, 32*0(%rsp,$tmp)  # 32*0-192($tp0,$tmp)
306         vmovdqu         $ACC1, 32*1(%rsp,$tmp)  # 32*1-192($tp0,$tmp)
307
308         vpmuludq        32*1-128($ap), $B2, $TEMP0
309         vpaddq          $TEMP0, $ACC2, $ACC2
310         vpmuludq        32*1-128($aap), $B2, $TEMP1
311         vpaddq          $TEMP1, $ACC3, $ACC3
312         vpmuludq        32*2-128($aap), $B2, $TEMP2
313         vpaddq          $TEMP2, $ACC4, $ACC4
314         vpmuludq        32*3-128($aap), $B2, $TEMP0
315         vpaddq          $TEMP0, $ACC5, $ACC5
316         vpmuludq        32*4-128($aap), $B2, $TEMP1
317         vpaddq          $TEMP1, $ACC6, $ACC6
318         vpmuludq        32*5-128($aap), $B2, $TEMP2
319         vmovdqu         32*9-192($tp0,$tmp), $TEMP1
320         vpaddq          $TEMP2, $ACC7, $ACC7
321         vpmuludq        32*6-128($aap), $B2, $TEMP0
322         vpaddq          $TEMP0, $ACC8, $ACC8
323         vpmuludq        32*7-128($aap), $B2, $ACC0
324          vpbroadcastq   32*3-128($ap,$tmp), $B2
325         vpaddq          $TEMP1, $ACC0, $ACC0
326
327         vmovdqu         $ACC2, 32*2-192($tp0,$tmp)
328         vmovdqu         $ACC3, 32*3-192($tp0,$tmp)
329
330         vpmuludq        32*2-128($ap), $B1, $TEMP2
331         vpaddq          $TEMP2, $ACC4, $ACC4
332         vpmuludq        32*2-128($aap), $B1, $TEMP0
333         vpaddq          $TEMP0, $ACC5, $ACC5
334         vpmuludq        32*3-128($aap), $B1, $TEMP1
335         vpaddq          $TEMP1, $ACC6, $ACC6
336         vpmuludq        32*4-128($aap), $B1, $TEMP2
337         vpaddq          $TEMP2, $ACC7, $ACC7
338         vpmuludq        32*5-128($aap), $B1, $TEMP0
339         vmovdqu         32*10-448($tp1,$tmp), $TEMP2
340         vpaddq          $TEMP0, $ACC8, $ACC8
341         vpmuludq        32*6-128($aap), $B1, $TEMP1
342         vpaddq          $TEMP1, $ACC0, $ACC0
343         vpmuludq        32*7-128($aap), $B1, $ACC1
344          vpbroadcastq   32*4-128($ap,$tmp), $B1
345         vpaddq          $TEMP2, $ACC1, $ACC1
346
347         vmovdqu         $ACC4, 32*4-192($tp0,$tmp)
348         vmovdqu         $ACC5, 32*5-192($tp0,$tmp)
349
350         vpmuludq        32*3-128($ap), $B2, $TEMP0
351         vpaddq          $TEMP0, $ACC6, $ACC6
352         vpmuludq        32*3-128($aap), $B2, $TEMP1
353         vpaddq          $TEMP1, $ACC7, $ACC7
354         vpmuludq        32*4-128($aap), $B2, $TEMP2
355         vpaddq          $TEMP2, $ACC8, $ACC8
356         vpmuludq        32*5-128($aap), $B2, $TEMP0
357         vmovdqu         32*11-448($tp1,$tmp), $TEMP2
358         vpaddq          $TEMP0, $ACC0, $ACC0
359         vpmuludq        32*6-128($aap), $B2, $TEMP1
360         vpaddq          $TEMP1, $ACC1, $ACC1
361         vpmuludq        32*7-128($aap), $B2, $ACC2
362          vpbroadcastq   32*5-128($ap,$tmp), $B2
363         vpaddq          $TEMP2, $ACC2, $ACC2    
364
365         vmovdqu         $ACC6, 32*6-192($tp0,$tmp)
366         vmovdqu         $ACC7, 32*7-192($tp0,$tmp)
367
368         vpmuludq        32*4-128($ap), $B1, $TEMP0
369         vpaddq          $TEMP0, $ACC8, $ACC8
370         vpmuludq        32*4-128($aap), $B1, $TEMP1
371         vpaddq          $TEMP1, $ACC0, $ACC0
372         vpmuludq        32*5-128($aap), $B1, $TEMP2
373         vmovdqu         32*12-448($tp1,$tmp), $TEMP1
374         vpaddq          $TEMP2, $ACC1, $ACC1
375         vpmuludq        32*6-128($aap), $B1, $TEMP0
376         vpaddq          $TEMP0, $ACC2, $ACC2
377         vpmuludq        32*7-128($aap), $B1, $ACC3
378          vpbroadcastq   32*6-128($ap,$tmp), $B1
379         vpaddq          $TEMP1, $ACC3, $ACC3
380
381         vmovdqu         $ACC8, 32*8-192($tp0,$tmp)
382         vmovdqu         $ACC0, 32*9-192($tp0,$tmp)
383
384         vpmuludq        32*5-128($ap), $B2, $TEMP2
385         vpaddq          $TEMP2, $ACC1, $ACC1
386         vpmuludq        32*5-128($aap), $B2, $TEMP0
387         vmovdqu         32*13-448($tp1,$tmp), $TEMP2
388         vpaddq          $TEMP0, $ACC2, $ACC2
389         vpmuludq        32*6-128($aap), $B2, $TEMP1
390         vpaddq          $TEMP1, $ACC3, $ACC3
391         vpmuludq        32*7-128($aap), $B2, $ACC4
392          vpbroadcastq   32*7-128($ap,$tmp), $B2
393         vpaddq          $TEMP2, $ACC4, $ACC4
394
395         vmovdqu         $ACC1, 32*10-448($tp1,$tmp)
396         vmovdqu         $ACC2, 32*11-448($tp1,$tmp)
397
398         vpmuludq        32*6-128($ap), $B1, $TEMP0
399         vmovdqu         32*14-448($tp1,$tmp), $TEMP2
400         vpaddq          $TEMP0, $ACC3, $ACC3
401         vpmuludq        32*6-128($aap), $B1, $TEMP1
402          vpbroadcastq   32*8-128($ap,$tmp), $ACC0       # borrow $ACC0 for $B1
403         vpaddq          $TEMP1, $ACC4, $ACC4
404         vpmuludq        32*7-128($aap), $B1, $ACC5
405          vpbroadcastq   32*0+8-128($ap,$tmp), $B1       # for next iteration
406         vpaddq          $TEMP2, $ACC5, $ACC5
407         vmovdqu         32*15-448($tp1,$tmp), $TEMP1
408
409         vmovdqu         $ACC3, 32*12-448($tp1,$tmp)
410         vmovdqu         $ACC4, 32*13-448($tp1,$tmp)
411
412         vpmuludq        32*7-128($ap), $B2, $TEMP0
413         vmovdqu         32*16-448($tp1,$tmp), $TEMP2
414         vpaddq          $TEMP0, $ACC5, $ACC5
415         vpmuludq        32*7-128($aap), $B2, $ACC6
416         vpaddq          $TEMP1, $ACC6, $ACC6
417
418         vpmuludq        32*8-128($ap), $ACC0, $ACC7
419         vmovdqu         $ACC5, 32*14-448($tp1,$tmp)
420         vpaddq          $TEMP2, $ACC7, $ACC7
421         vmovdqu         $ACC6, 32*15-448($tp1,$tmp)
422         vmovdqu         $ACC7, 32*16-448($tp1,$tmp)
423
424         lea     8($tmp), $tmp
425         dec     $i        
426         jnz     .LOOP_SQR_1024
427 ___
428 $ZERO = $ACC9;
429 $TEMP0 = $B1;
430 $TEMP2 = $B2;
431 $TEMP3 = $Y1;
432 $TEMP4 = $Y2;
433 $code.=<<___;
434         # we need to fix (carry-propagate) digits 32-39 to avoid overflow
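        # (each 64-bit lane holds a digit that may have grown past 29 bits:
        #  vpsrlq by 29 extracts the carry, vpand keeps the low 29 bits, and
        #  the vpermq 0x93 / vpblendd 3 pair rotates the carries up by one
        #  lane so that each carry is added to the next-higher digit, with
        #  the carry out of a register's top lane landing in the bottom lane
        #  of the following register)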
435         vmovdqu         32*8-192($tp0), $ACC8
436         vmovdqu         32*9-192($tp0), $ACC1
437         vmovdqu         32*10-448($tp1), $ACC2
438
439         vpsrlq          \$29, $ACC8, $TEMP1
440         vpand           $AND_MASK, $ACC8, $ACC8
441         vpsrlq          \$29, $ACC1, $TEMP2
442         vpand           $AND_MASK, $ACC1, $ACC1
443
444         vpermq          \$0x93, $TEMP1, $TEMP1
445         vpxor           $ZERO, $ZERO, $ZERO
446         vpermq          \$0x93, $TEMP2, $TEMP2
447
448         vpblendd        \$3, $ZERO, $TEMP1, $TEMP0
449         vpblendd        \$3, $TEMP1, $TEMP2, $TEMP1
450         vpaddq          $TEMP0, $ACC8, $ACC8
451         vpblendd        \$3, $TEMP2, $ZERO, $TEMP2
452         vpaddq          $TEMP1, $ACC1, $ACC1
453         vpaddq          $TEMP2, $ACC2, $ACC2
454         vmovdqu         $ACC1, 32*9-192($tp0)
455         vmovdqu         $ACC2, 32*10-448($tp1)
456
457         mov     (%rsp), %rax
458         mov     8(%rsp), $r1
459         mov     16(%rsp), $r2
460         mov     24(%rsp), $r3
461         vmovdqu 32*1(%rsp), $ACC1
462         vmovdqu 32*2-192($tp0), $ACC2
463         vmovdqu 32*3-192($tp0), $ACC3
464         vmovdqu 32*4-192($tp0), $ACC4
465         vmovdqu 32*5-192($tp0), $ACC5
466         vmovdqu 32*6-192($tp0), $ACC6
467         vmovdqu 32*7-192($tp0), $ACC7
468
469         mov     %rax, $r0
470         imull   $n0, %eax
471         and     \$0x1fffffff, %eax
472         vmovd   %eax, $Y1
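        # (%eax now holds y = (acc[0]*n0) mod 2^29, the Montgomery multiplier
        #  for this digit: assuming the usual n0 == -1/N mod b, adding y*N
        #  below makes the lowest 29-bit digit of the accumulator vanish so
        #  it can be retired)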
473
474         mov     %rax, %rdx
475         imulq   -128($np), %rax
476          vpbroadcastq   $Y1, $Y1
477         add     %rax, $r0
478         mov     %rdx, %rax
479         imulq   8-128($np), %rax
480         shr     \$29, $r0
481         add     %rax, $r1
482         mov     %rdx, %rax
483         imulq   16-128($np), %rax
484         add     $r0, $r1
485         add     %rax, $r2
486         imulq   24-128($np), %rdx
487         add     %rdx, $r3
488
489         mov     $r1, %rax
490         imull   $n0, %eax
491         and     \$0x1fffffff, %eax
492
493         mov \$9, $i
494         jmp .LOOP_REDUCE_1024
495
496 .align  32
497 .LOOP_REDUCE_1024:
498         vmovd   %eax, $Y2
499         vpbroadcastq    $Y2, $Y2
500
501         vpmuludq        32*1-128($np), $Y1, $TEMP0
502          mov    %rax, %rdx
503          imulq  -128($np), %rax
504         vpaddq          $TEMP0, $ACC1, $ACC1
505         vpmuludq        32*2-128($np), $Y1, $TEMP1
506          add    %rax, $r1
507          mov    %rdx, %rax
508          imulq  8-128($np), %rax
509         vpaddq          $TEMP1, $ACC2, $ACC2
510         vpmuludq        32*3-128($np), $Y1, $TEMP2
511          add    %rax, $r2
512          mov    %rdx, %rax
513          imulq  16-128($np), %rax
514          shr    \$29, $r1
515         vpaddq          $TEMP2, $ACC3, $ACC3
516         vpmuludq        32*4-128($np), $Y1, $TEMP0
517          add    %rax, $r3
518          add    $r1, $r2
519         vpaddq          $TEMP0, $ACC4, $ACC4
520         vpmuludq        32*5-128($np), $Y1, $TEMP1
521          mov    $r2, %rax
522          imull  $n0, %eax
523         vpaddq          $TEMP1, $ACC5, $ACC5
524         vpmuludq        32*6-128($np), $Y1, $TEMP2
525          and    \$0x1fffffff, %eax
526         vpaddq          $TEMP2, $ACC6, $ACC6
527         vpmuludq        32*7-128($np), $Y1, $TEMP0
528         vpaddq          $TEMP0, $ACC7, $ACC7
529         vpmuludq        32*8-128($np), $Y1, $TEMP1
530          vmovd  %eax, $Y1
531          vmovdqu        32*1-8-128($np), $TEMP2
532         vpaddq          $TEMP1, $ACC8, $ACC8
533          vmovdqu        32*2-8-128($np), $TEMP0
534          vpbroadcastq   $Y1, $Y1
535
536         vpmuludq        $Y2, $TEMP2, $TEMP2
537         vmovdqu         32*3-8-128($np), $TEMP1
538          mov    %rax, %rdx
539          imulq  -128($np), %rax
540         vpaddq          $TEMP2, $ACC1, $ACC1
541         vpmuludq        $Y2, $TEMP0, $TEMP0
542         vmovdqu         32*4-8-128($np), $TEMP2
543          add    %rax, $r2
544          mov    %rdx, %rax
545          imulq  8-128($np), %rax
546         vpaddq          $TEMP0, $ACC2, $ACC2
547          add    $r3, %rax
548          shr    \$29, $r2
549         vpmuludq        $Y2, $TEMP1, $TEMP1
550         vmovdqu         32*5-8-128($np), $TEMP0
551          add    $r2, %rax
552         vpaddq          $TEMP1, $ACC3, $ACC3
553         vpmuludq        $Y2, $TEMP2, $TEMP2
554         vmovdqu         32*6-8-128($np), $TEMP1
555          mov    %rax, $r3
556          imull  $n0, %eax
557         vpaddq          $TEMP2, $ACC4, $ACC4
558         vpmuludq        $Y2, $TEMP0, $TEMP0
559         vmovdqu         32*7-8-128($np), $TEMP2
560          and    \$0x1fffffff, %eax
561         vpaddq          $TEMP0, $ACC5, $ACC5
562         vpmuludq        $Y2, $TEMP1, $TEMP1
563         vmovdqu         32*8-8-128($np), $TEMP0
564         vpaddq          $TEMP1, $ACC6, $ACC6
565         vpmuludq        $Y2, $TEMP2, $TEMP2
566         vmovdqu         32*9-8-128($np), $ACC9
567          vmovd  %eax, $ACC0                     # borrow ACC0 for Y2
568          imulq  -128($np), %rax
569         vpaddq          $TEMP2, $ACC7, $ACC7
570         vpmuludq        $Y2, $TEMP0, $TEMP0
571          vmovdqu        32*1-16-128($np), $TEMP1
572          vpbroadcastq   $ACC0, $ACC0
573         vpaddq          $TEMP0, $ACC8, $ACC8
574         vpmuludq        $Y2, $ACC9, $ACC9
575          vmovdqu        32*2-16-128($np), $TEMP2
576          add    %rax, $r3
577
578 ___
579 ($ACC0,$Y2)=($Y2,$ACC0);
580 $code.=<<___;
581          vmovdqu        32*1-24-128($np), $ACC0
582         vpmuludq        $Y1, $TEMP1, $TEMP1
583         vmovdqu         32*3-16-128($np), $TEMP0
584         vpaddq          $TEMP1, $ACC1, $ACC1
585          vpmuludq       $Y2, $ACC0, $ACC0
586         vpmuludq        $Y1, $TEMP2, $TEMP2
587         vmovdqu         32*4-16-128($np), $TEMP1
588          vpaddq         $ACC1, $ACC0, $ACC0
589         vpaddq          $TEMP2, $ACC2, $ACC2
590         vpmuludq        $Y1, $TEMP0, $TEMP0
591         vmovdqu         32*5-16-128($np), $TEMP2
592          vmovq          $ACC0, %rax
593          vmovdqu        $ACC0, (%rsp)           # transfer $r0-$r3
594         vpaddq          $TEMP0, $ACC3, $ACC3
595         vpmuludq        $Y1, $TEMP1, $TEMP1
596         vmovdqu         32*6-16-128($np), $TEMP0
597         vpaddq          $TEMP1, $ACC4, $ACC4
598         vpmuludq        $Y1, $TEMP2, $TEMP2
599         vmovdqu         32*7-16-128($np), $TEMP1
600         vpaddq          $TEMP2, $ACC5, $ACC5
601         vpmuludq        $Y1, $TEMP0, $TEMP0
602         vmovdqu         32*8-16-128($np), $TEMP2
603         vpaddq          $TEMP0, $ACC6, $ACC6
604         vpmuludq        $Y1, $TEMP1, $TEMP1
605         vmovdqu         32*9-16-128($np), $TEMP0
606          shr    \$29, $r3
607         vpaddq          $TEMP1, $ACC7, $ACC7
608         vpmuludq        $Y1, $TEMP2, $TEMP2
609          vmovdqu        32*2-24-128($np), $TEMP1
610          add    $r3, %rax
611          mov    %rax, $r0
612          imull  $n0, %eax
613         vpaddq          $TEMP2, $ACC8, $ACC8
614         vpmuludq        $Y1, $TEMP0, $TEMP0
615          and    \$0x1fffffff, %eax
616          vmovd  %eax, $Y1
617          vmovdqu        32*3-24-128($np), $TEMP2
618         vpaddq          $TEMP0, $ACC9, $ACC9
619          vpbroadcastq   $Y1, $Y1
620
621         vpmuludq        $Y2, $TEMP1, $TEMP1
622         vmovdqu         32*4-24-128($np), $TEMP0
623          mov    %rax, %rdx
624          imulq  -128($np), %rax
625          mov    8(%rsp), $r1
626         vpaddq          $TEMP1, $ACC2, $ACC1
627         vpmuludq        $Y2, $TEMP2, $TEMP2
628         vmovdqu         32*5-24-128($np), $TEMP1
629          add    %rax, $r0
630          mov    %rdx, %rax
631          imulq  8-128($np), %rax
632          shr    \$29, $r0
633          mov    16(%rsp), $r2
634         vpaddq          $TEMP2, $ACC3, $ACC2
635         vpmuludq        $Y2, $TEMP0, $TEMP0
636         vmovdqu         32*6-24-128($np), $TEMP2
637          add    %rax, $r1
638          mov    %rdx, %rax
639          imulq  16-128($np), %rax
640         vpaddq          $TEMP0, $ACC4, $ACC3
641         vpmuludq        $Y2, $TEMP1, $TEMP1
642         vmovdqu         32*7-24-128($np), $TEMP0
643          imulq  24-128($np), %rdx               # future $r3
644          add    %rax, $r2
645          lea    ($r0,$r1), %rax
646         vpaddq          $TEMP1, $ACC5, $ACC4
647         vpmuludq        $Y2, $TEMP2, $TEMP2
648         vmovdqu         32*8-24-128($np), $TEMP1
649          mov    %rax, $r1
650          imull  $n0, %eax
651         vpaddq          $TEMP2, $ACC6, $ACC5
652         vpmuludq        $Y2, $TEMP0, $TEMP0
653         vmovdqu         32*9-24-128($np), $TEMP2
654          and    \$0x1fffffff, %eax
655         vpaddq          $TEMP0, $ACC7, $ACC6
656         vpmuludq        $Y2, $TEMP1, $TEMP1
657          add    24(%rsp), %rdx
658         vpaddq          $TEMP1, $ACC8, $ACC7
659         vpmuludq        $Y2, $TEMP2, $TEMP2
660         vpaddq          $TEMP2, $ACC9, $ACC8
661          vmovq  $r3, $ACC9
662          mov    %rdx, $r3
663
664         dec     $i
665         jnz     .LOOP_REDUCE_1024
666 ___
667 ($ACC0,$Y2)=($Y2,$ACC0);
668 $code.=<<___;
669         lea     448(%rsp), $tp1                 # size optimization
670         vpaddq  $ACC9, $Y2, $ACC0
671         vpxor   $ZERO, $ZERO, $ZERO
672
673         vpaddq          32*9-192($tp0), $ACC0, $ACC0
674         vpaddq          32*10-448($tp1), $ACC1, $ACC1
675         vpaddq          32*11-448($tp1), $ACC2, $ACC2
676         vpaddq          32*12-448($tp1), $ACC3, $ACC3
677         vpaddq          32*13-448($tp1), $ACC4, $ACC4
678         vpaddq          32*14-448($tp1), $ACC5, $ACC5
679         vpaddq          32*15-448($tp1), $ACC6, $ACC6
680         vpaddq          32*16-448($tp1), $ACC7, $ACC7
681         vpaddq          32*17-448($tp1), $ACC8, $ACC8
682
683         vpsrlq          \$29, $ACC0, $TEMP1
684         vpand           $AND_MASK, $ACC0, $ACC0
685         vpsrlq          \$29, $ACC1, $TEMP2
686         vpand           $AND_MASK, $ACC1, $ACC1
687         vpsrlq          \$29, $ACC2, $TEMP3
688         vpermq          \$0x93, $TEMP1, $TEMP1
689         vpand           $AND_MASK, $ACC2, $ACC2
690         vpsrlq          \$29, $ACC3, $TEMP4
691         vpermq          \$0x93, $TEMP2, $TEMP2
692         vpand           $AND_MASK, $ACC3, $ACC3
693         vpermq          \$0x93, $TEMP3, $TEMP3
694
695         vpblendd        \$3, $ZERO, $TEMP1, $TEMP0
696         vpermq          \$0x93, $TEMP4, $TEMP4
697         vpblendd        \$3, $TEMP1, $TEMP2, $TEMP1
698         vpaddq          $TEMP0, $ACC0, $ACC0
699         vpblendd        \$3, $TEMP2, $TEMP3, $TEMP2
700         vpaddq          $TEMP1, $ACC1, $ACC1
701         vpblendd        \$3, $TEMP3, $TEMP4, $TEMP3
702         vpaddq          $TEMP2, $ACC2, $ACC2
703         vpblendd        \$3, $TEMP4, $ZERO, $TEMP4
704         vpaddq          $TEMP3, $ACC3, $ACC3
705         vpaddq          $TEMP4, $ACC4, $ACC4
706
707         vpsrlq          \$29, $ACC0, $TEMP1
708         vpand           $AND_MASK, $ACC0, $ACC0
709         vpsrlq          \$29, $ACC1, $TEMP2
710         vpand           $AND_MASK, $ACC1, $ACC1
711         vpsrlq          \$29, $ACC2, $TEMP3
712         vpermq          \$0x93, $TEMP1, $TEMP1
713         vpand           $AND_MASK, $ACC2, $ACC2
714         vpsrlq          \$29, $ACC3, $TEMP4
715         vpermq          \$0x93, $TEMP2, $TEMP2
716         vpand           $AND_MASK, $ACC3, $ACC3
717         vpermq          \$0x93, $TEMP3, $TEMP3
718
719         vpblendd        \$3, $ZERO, $TEMP1, $TEMP0
720         vpermq          \$0x93, $TEMP4, $TEMP4
721         vpblendd        \$3, $TEMP1, $TEMP2, $TEMP1
722         vpaddq          $TEMP0, $ACC0, $ACC0
723         vpblendd        \$3, $TEMP2, $TEMP3, $TEMP2
724         vpaddq          $TEMP1, $ACC1, $ACC1
725         vmovdqu         $ACC0, 32*0-128($rp)
726         vpblendd        \$3, $TEMP3, $TEMP4, $TEMP3
727         vpaddq          $TEMP2, $ACC2, $ACC2
728         vmovdqu         $ACC1, 32*1-128($rp)
729         vpblendd        \$3, $TEMP4, $ZERO, $TEMP4
730         vpaddq          $TEMP3, $ACC3, $ACC3
731         vmovdqu         $ACC2, 32*2-128($rp)
732         vpaddq          $TEMP4, $ACC4, $ACC4
733         vmovdqu         $ACC3, 32*3-128($rp)
734 ___
735 $TEMP5=$ACC0;
736 $code.=<<___;
737         vpsrlq          \$29, $ACC4, $TEMP1
738         vpand           $AND_MASK, $ACC4, $ACC4
739         vpsrlq          \$29, $ACC5, $TEMP2
740         vpand           $AND_MASK, $ACC5, $ACC5
741         vpsrlq          \$29, $ACC6, $TEMP3
742         vpermq          \$0x93, $TEMP1, $TEMP1
743         vpand           $AND_MASK, $ACC6, $ACC6
744         vpsrlq          \$29, $ACC7, $TEMP4
745         vpermq          \$0x93, $TEMP2, $TEMP2
746         vpand           $AND_MASK, $ACC7, $ACC7
747         vpsrlq          \$29, $ACC8, $TEMP5
748         vpermq          \$0x93, $TEMP3, $TEMP3
749         vpand           $AND_MASK, $ACC8, $ACC8
750         vpermq          \$0x93, $TEMP4, $TEMP4
751
752         vpblendd        \$3, $ZERO, $TEMP1, $TEMP0
753         vpermq          \$0x93, $TEMP5, $TEMP5
754         vpblendd        \$3, $TEMP1, $TEMP2, $TEMP1
755         vpaddq          $TEMP0, $ACC4, $ACC4
756         vpblendd        \$3, $TEMP2, $TEMP3, $TEMP2
757         vpaddq          $TEMP1, $ACC5, $ACC5
758         vpblendd        \$3, $TEMP3, $TEMP4, $TEMP3
759         vpaddq          $TEMP2, $ACC6, $ACC6
760         vpblendd        \$3, $TEMP4, $TEMP5, $TEMP4
761         vpaddq          $TEMP3, $ACC7, $ACC7
762         vpaddq          $TEMP4, $ACC8, $ACC8
763      
764         vpsrlq          \$29, $ACC4, $TEMP1
765         vpand           $AND_MASK, $ACC4, $ACC4
766         vpsrlq          \$29, $ACC5, $TEMP2
767         vpand           $AND_MASK, $ACC5, $ACC5
768         vpsrlq          \$29, $ACC6, $TEMP3
769         vpermq          \$0x93, $TEMP1, $TEMP1
770         vpand           $AND_MASK, $ACC6, $ACC6
771         vpsrlq          \$29, $ACC7, $TEMP4
772         vpermq          \$0x93, $TEMP2, $TEMP2
773         vpand           $AND_MASK, $ACC7, $ACC7
774         vpsrlq          \$29, $ACC8, $TEMP5
775         vpermq          \$0x93, $TEMP3, $TEMP3
776         vpand           $AND_MASK, $ACC8, $ACC8
777         vpermq          \$0x93, $TEMP4, $TEMP4
778
779         vpblendd        \$3, $ZERO, $TEMP1, $TEMP0
780         vpermq          \$0x93, $TEMP5, $TEMP5
781         vpblendd        \$3, $TEMP1, $TEMP2, $TEMP1
782         vpaddq          $TEMP0, $ACC4, $ACC4
783         vpblendd        \$3, $TEMP2, $TEMP3, $TEMP2
784         vpaddq          $TEMP1, $ACC5, $ACC5
785         vmovdqu         $ACC4, 32*4-128($rp)
786         vpblendd        \$3, $TEMP3, $TEMP4, $TEMP3
787         vpaddq          $TEMP2, $ACC6, $ACC6
788         vmovdqu         $ACC5, 32*5-128($rp)
789         vpblendd        \$3, $TEMP4, $TEMP5, $TEMP4
790         vpaddq          $TEMP3, $ACC7, $ACC7
791         vmovdqu         $ACC6, 32*6-128($rp)
792         vpaddq          $TEMP4, $ACC8, $ACC8
793         vmovdqu         $ACC7, 32*7-128($rp)
794         vmovdqu         $ACC8, 32*8-128($rp)
795
796         mov     $rp, $ap
797         dec     $rep
798         jne     .LOOP_GRANDE_SQR_1024
799
800         vzeroall
801         mov     %rbp, %rax
802 ___
803 $code.=<<___ if ($win64);
804         movaps  -0xd8(%rax),%xmm6
805         movaps  -0xc8(%rax),%xmm7
806         movaps  -0xb8(%rax),%xmm8
807         movaps  -0xa8(%rax),%xmm9
808         movaps  -0x98(%rax),%xmm10
809         movaps  -0x88(%rax),%xmm11
810         movaps  -0x78(%rax),%xmm12
811         movaps  -0x68(%rax),%xmm13
812         movaps  -0x58(%rax),%xmm14
813         movaps  -0x48(%rax),%xmm15
814 ___
815 $code.=<<___;
816         mov     -48(%rax),%r15
817         mov     -40(%rax),%r14
818         mov     -32(%rax),%r13
819         mov     -24(%rax),%r12
820         mov     -16(%rax),%rbp
821         mov     -8(%rax),%rbx
822         lea     (%rax),%rsp             # restore %rsp
823 .Lsqr_1024_epilogue:
824         ret
825 .size   rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
826 ___
827 }
828
829 { # void AMM_WW(
830 my $rp="%rdi";  # BN_ULONG *rp,
831 my $ap="%rsi";  # const BN_ULONG *ap,
832 my $bp="%rdx";  # const BN_ULONG *bp,
833 my $np="%rcx";  # const BN_ULONG *np,
834 my $n0="%r8d";  # unsigned int n0);
835
836 # The registers that hold the accumulated redundant result
837 # The AMM works on 1024-bit operands, and the redundant word size is 29
838 # Therefore: ceil(1024/29) = 36 digits, and 36/4 = 9 registers
839 my $ACC0="%ymm0";
840 my $ACC1="%ymm1";
841 my $ACC2="%ymm2";
842 my $ACC3="%ymm3";
843 my $ACC4="%ymm4";
844 my $ACC5="%ymm5";
845 my $ACC6="%ymm6";
846 my $ACC7="%ymm7";
847 my $ACC8="%ymm8";
848 my $ACC9="%ymm9";
849
850 # Registers that hold the broadcasted words of multiplier, currently used
851 my $Bi="%ymm10";
852 my $Yi="%ymm11";
853
854 # Helper registers
855 my $TEMP0=$ACC0;
856 my $TEMP1="%ymm12";
857 my $TEMP2="%ymm13";
858 my $ZERO="%ymm14";
859 my $AND_MASK="%ymm15";
860
861 # alu registers that hold the first words of the ACC
862 my $r0="%r9";
863 my $r1="%r10";
864 my $r2="%r11";
865 my $r3="%r12";
866
867 my $i="%r14d";
868 my $tmp="%r15";
869
870 $bp="%r13";     # reassigned argument
871
872 $code.=<<___;
873 .globl  rsaz_1024_mul_avx2
874 .type   rsaz_1024_mul_avx2,\@function,5
875 .align  64
876 rsaz_1024_mul_avx2:
877         lea     (%rsp), %rax
878         push    %rbx
879         push    %rbp
880         push    %r12
881         push    %r13
882         push    %r14
883         push    %r15
884 ___
885 $code.=<<___ if ($win64);
886         lea     -0xa8(%rsp),%rsp
887         movaps  %xmm6,-0xd8(%rax)
888         movaps  %xmm7,-0xc8(%rax)
889         movaps  %xmm8,-0xb8(%rax)
890         movaps  %xmm9,-0xa8(%rax)
891         movaps  %xmm10,-0x98(%rax)
892         movaps  %xmm11,-0x88(%rax)
893         movaps  %xmm12,-0x78(%rax)
894         movaps  %xmm13,-0x68(%rax)
895         movaps  %xmm14,-0x58(%rax)
896         movaps  %xmm15,-0x48(%rax)
897 .Lmul_1024_body:
898 ___
899 $code.=<<___;
900         mov     %rax,%rbp
901         vzeroall
902         mov     %rdx, $bp       # reassigned argument
903         sub     \$64,%rsp
904
905         # an unaligned 256-bit load that crosses a page boundary can
906         # cause severe performance degradation here, so if $ap does
907         # cross a page boundary, swap it with $bp [meaning that the
908         # caller is advised to lay $ap and $bp down next to each other,
909         # so that at most one of them can cross a page boundary].
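        # (the swap is harmless because multiplication is commutative; the
        #  wide 256-bit loads below always go through $ap and $np, while $bp
        #  is only ever read 64 bits at a time, so a page-split $bp is cheap)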
910         mov     $ap, $tmp
911         and     \$4095, $tmp
912         add     \$32*10, $tmp
913         shr     \$12, $tmp
914         mov     $ap, $tmp
915         cmovnz  $bp, $ap
916         cmovnz  $tmp, $bp
917
918         mov     $np, $tmp
919         sub     \$-128,$ap      # size optimization
920         sub     \$-128,$np
921         sub     \$-128,$rp
922
923         and     \$4095, $tmp    # see if $np crosses page
924         add     \$32*10, $tmp
925         shr     \$12, $tmp
926         jz      .Lmul_1024_no_n_copy
927
928         # an unaligned 256-bit load that crosses a page boundary can
929         # cause severe performance degradation here, so if $np does
930         # cross a page boundary, copy it to the stack and make sure
931         # the stack frame doesn't...
932         sub             \$32*10,%rsp
933         vmovdqu         32*0-128($np), $ACC0
934         and             \$-512, %rsp
935         vmovdqu         32*1-128($np), $ACC1
936         vmovdqu         32*2-128($np), $ACC2
937         vmovdqu         32*3-128($np), $ACC3
938         vmovdqu         32*4-128($np), $ACC4
939         vmovdqu         32*5-128($np), $ACC5
940         vmovdqu         32*6-128($np), $ACC6
941         vmovdqu         32*7-128($np), $ACC7
942         vmovdqu         32*8-128($np), $ACC8
943         lea             64+128(%rsp),$np
944         vmovdqu         $ACC0, 32*0-128($np)
945         vpxor           $ACC0, $ACC0, $ACC0
946         vmovdqu         $ACC1, 32*1-128($np)
947         vpxor           $ACC1, $ACC1, $ACC1
948         vmovdqu         $ACC2, 32*2-128($np)
949         vpxor           $ACC2, $ACC2, $ACC2
950         vmovdqu         $ACC3, 32*3-128($np)
951         vpxor           $ACC3, $ACC3, $ACC3
952         vmovdqu         $ACC4, 32*4-128($np)
953         vpxor           $ACC4, $ACC4, $ACC4
954         vmovdqu         $ACC5, 32*5-128($np)
955         vpxor           $ACC5, $ACC5, $ACC5
956         vmovdqu         $ACC6, 32*6-128($np)
957         vpxor           $ACC6, $ACC6, $ACC6
958         vmovdqu         $ACC7, 32*7-128($np)
959         vpxor           $ACC7, $ACC7, $ACC7
960         vmovdqu         $ACC8, 32*8-128($np)
961         vmovdqa         $ACC0, $ACC8
962         vmovdqu         $ACC9, 32*9-128($np)    # $ACC9 is zero after vzeroall
963 .Lmul_1024_no_n_copy:
964         and     \$-64,%rsp
965
966         mov     ($bp), %rbx
967         vpbroadcastq ($bp), $Bi
968         vmovdqu $ACC0, (%rsp)                   # clear top of stack
969         xor     $r0, $r0
970         xor     $r1, $r1
971         xor     $r2, $r2
972         xor     $r3, $r3
973
974         vmovdqu .Land_mask(%rip), $AND_MASK
975         mov     \$9, $i
976         jmp     .Loop_mul_1024
977
978 .align  32
979 .Loop_mul_1024:
980          vpsrlq         \$29, $ACC3, $ACC9              # correct $ACC3(*)
981         mov     %rbx, %rax
982         imulq   -128($ap), %rax
983         add     $r0, %rax
984         mov     %rbx, $r1
985         imulq   8-128($ap), $r1
986         add     8(%rsp), $r1
987
988         mov     %rax, $r0
989         imull   $n0, %eax
990         and     \$0x1fffffff, %eax
991
992          mov    %rbx, $r2
993          imulq  16-128($ap), $r2
994          add    16(%rsp), $r2
995
996          mov    %rbx, $r3
997          imulq  24-128($ap), $r3
998          add    24(%rsp), $r3
999         vpmuludq        32*1-128($ap),$Bi,$TEMP0
1000          vmovd          %eax, $Yi
1001         vpaddq          $TEMP0,$ACC1,$ACC1
1002         vpmuludq        32*2-128($ap),$Bi,$TEMP1
1003          vpbroadcastq   $Yi, $Yi
1004         vpaddq          $TEMP1,$ACC2,$ACC2
1005         vpmuludq        32*3-128($ap),$Bi,$TEMP2
1006          vpand          $AND_MASK, $ACC3, $ACC3         # correct $ACC3
1007         vpaddq          $TEMP2,$ACC3,$ACC3
1008         vpmuludq        32*4-128($ap),$Bi,$TEMP0
1009         vpaddq          $TEMP0,$ACC4,$ACC4
1010         vpmuludq        32*5-128($ap),$Bi,$TEMP1
1011         vpaddq          $TEMP1,$ACC5,$ACC5
1012         vpmuludq        32*6-128($ap),$Bi,$TEMP2
1013         vpaddq          $TEMP2,$ACC6,$ACC6
1014         vpmuludq        32*7-128($ap),$Bi,$TEMP0
1015          vpermq         \$0x93, $ACC9, $ACC9            # correct $ACC3
1016         vpaddq          $TEMP0,$ACC7,$ACC7
1017         vpmuludq        32*8-128($ap),$Bi,$TEMP1
1018          vpbroadcastq   8($bp), $Bi
1019         vpaddq          $TEMP1,$ACC8,$ACC8
1020
1021         mov     %rax,%rdx
1022         imulq   -128($np),%rax
1023         add     %rax,$r0
1024         mov     %rdx,%rax
1025         imulq   8-128($np),%rax
1026         add     %rax,$r1
1027         mov     %rdx,%rax
1028         imulq   16-128($np),%rax
1029         add     %rax,$r2
1030         shr     \$29, $r0
1031         imulq   24-128($np),%rdx
1032         add     %rdx,$r3
1033         add     $r0, $r1
1034
1035         vpmuludq        32*1-128($np),$Yi,$TEMP2
1036          vmovq          $Bi, %rbx
1037         vpaddq          $TEMP2,$ACC1,$ACC1
1038         vpmuludq        32*2-128($np),$Yi,$TEMP0
1039         vpaddq          $TEMP0,$ACC2,$ACC2
1040         vpmuludq        32*3-128($np),$Yi,$TEMP1
1041         vpaddq          $TEMP1,$ACC3,$ACC3
1042         vpmuludq        32*4-128($np),$Yi,$TEMP2
1043         vpaddq          $TEMP2,$ACC4,$ACC4
1044         vpmuludq        32*5-128($np),$Yi,$TEMP0
1045         vpaddq          $TEMP0,$ACC5,$ACC5
1046         vpmuludq        32*6-128($np),$Yi,$TEMP1
1047         vpaddq          $TEMP1,$ACC6,$ACC6
1048         vpmuludq        32*7-128($np),$Yi,$TEMP2
1049          vpblendd       \$3, $ZERO, $ACC9, $ACC9        # correct $ACC3
1050         vpaddq          $TEMP2,$ACC7,$ACC7
1051         vpmuludq        32*8-128($np),$Yi,$TEMP0
1052          vpaddq         $ACC9, $ACC3, $ACC3             # correct $ACC3
1053         vpaddq          $TEMP0,$ACC8,$ACC8
1054
1055         mov     %rbx, %rax
1056         imulq   -128($ap),%rax
1057         add     %rax,$r1
1058          vmovdqu        -8+32*1-128($ap),$TEMP1
1059         mov     %rbx, %rax
1060         imulq   8-128($ap),%rax
1061         add     %rax,$r2
1062          vmovdqu        -8+32*2-128($ap),$TEMP2
1063
1064         mov     $r1, %rax
1065         imull   $n0, %eax
1066         and     \$0x1fffffff, %eax
1067
1068          imulq  16-128($ap),%rbx
1069          add    %rbx,$r3
1070         vpmuludq        $Bi,$TEMP1,$TEMP1
1071          vmovd          %eax, $Yi
1072         vmovdqu         -8+32*3-128($ap),$TEMP0
1073         vpaddq          $TEMP1,$ACC1,$ACC1
1074         vpmuludq        $Bi,$TEMP2,$TEMP2
1075          vpbroadcastq   $Yi, $Yi
1076         vmovdqu         -8+32*4-128($ap),$TEMP1
1077         vpaddq          $TEMP2,$ACC2,$ACC2
1078         vpmuludq        $Bi,$TEMP0,$TEMP0
1079         vmovdqu         -8+32*5-128($ap),$TEMP2
1080         vpaddq          $TEMP0,$ACC3,$ACC3
1081         vpmuludq        $Bi,$TEMP1,$TEMP1
1082         vmovdqu         -8+32*6-128($ap),$TEMP0
1083         vpaddq          $TEMP1,$ACC4,$ACC4
1084         vpmuludq        $Bi,$TEMP2,$TEMP2
1085         vmovdqu         -8+32*7-128($ap),$TEMP1
1086         vpaddq          $TEMP2,$ACC5,$ACC5
1087         vpmuludq        $Bi,$TEMP0,$TEMP0
1088         vmovdqu         -8+32*8-128($ap),$TEMP2
1089         vpaddq          $TEMP0,$ACC6,$ACC6
1090         vpmuludq        $Bi,$TEMP1,$TEMP1
1091         vmovdqu         -8+32*9-128($ap),$ACC9
1092         vpaddq          $TEMP1,$ACC7,$ACC7
1093         vpmuludq        $Bi,$TEMP2,$TEMP2
1094         vpaddq          $TEMP2,$ACC8,$ACC8
1095         vpmuludq        $Bi,$ACC9,$ACC9
1096          vpbroadcastq   16($bp), $Bi
1097
1098         mov     %rax,%rdx
1099         imulq   -128($np),%rax
1100         add     %rax,$r1
1101          vmovdqu        -8+32*1-128($np),$TEMP0
1102         mov     %rdx,%rax
1103         imulq   8-128($np),%rax
1104         add     %rax,$r2
1105          vmovdqu        -8+32*2-128($np),$TEMP1
1106         shr     \$29, $r1
1107         imulq   16-128($np),%rdx
1108         add     %rdx,$r3
1109         add     $r1, $r2
1110
1111         vpmuludq        $Yi,$TEMP0,$TEMP0
1112          vmovq          $Bi, %rbx
1113         vmovdqu         -8+32*3-128($np),$TEMP2
1114         vpaddq          $TEMP0,$ACC1,$ACC1
1115         vpmuludq        $Yi,$TEMP1,$TEMP1
1116         vmovdqu         -8+32*4-128($np),$TEMP0
1117         vpaddq          $TEMP1,$ACC2,$ACC2
1118         vpmuludq        $Yi,$TEMP2,$TEMP2
1119         vmovdqu         -8+32*5-128($np),$TEMP1
1120         vpaddq          $TEMP2,$ACC3,$ACC3
1121         vpmuludq        $Yi,$TEMP0,$TEMP0
1122         vmovdqu         -8+32*6-128($np),$TEMP2
1123         vpaddq          $TEMP0,$ACC4,$ACC4
1124         vpmuludq        $Yi,$TEMP1,$TEMP1
1125         vmovdqu         -8+32*7-128($np),$TEMP0
1126         vpaddq          $TEMP1,$ACC5,$ACC5
1127         vpmuludq        $Yi,$TEMP2,$TEMP2
1128         vmovdqu         -8+32*8-128($np),$TEMP1
1129         vpaddq          $TEMP2,$ACC6,$ACC6
1130         vpmuludq        $Yi,$TEMP0,$TEMP0
1131         vmovdqu         -8+32*9-128($np),$TEMP2
1132         vpaddq          $TEMP0,$ACC7,$ACC7
1133         vpmuludq        $Yi,$TEMP1,$TEMP1
1134         vpaddq          $TEMP1,$ACC8,$ACC8
1135         vpmuludq        $Yi,$TEMP2,$TEMP2
1136         vpaddq          $TEMP2,$ACC9,$ACC9
1137
1138          vmovdqu        -16+32*1-128($ap),$TEMP0
1139         mov     %rbx,%rax
1140         imulq   -128($ap),%rax
1141         add     $r2,%rax
1142
1143          vmovdqu        -16+32*2-128($ap),$TEMP1
1144         mov     %rax,$r2
1145         imull   $n0, %eax
1146         and     \$0x1fffffff, %eax
1147
1148          imulq  8-128($ap),%rbx
1149          add    %rbx,$r3
1150         vpmuludq        $Bi,$TEMP0,$TEMP0
1151          vmovd          %eax, $Yi
1152         vmovdqu         -16+32*3-128($ap),$TEMP2
1153         vpaddq          $TEMP0,$ACC1,$ACC1
1154         vpmuludq        $Bi,$TEMP1,$TEMP1
1155          vpbroadcastq   $Yi, $Yi
1156         vmovdqu         -16+32*4-128($ap),$TEMP0
1157         vpaddq          $TEMP1,$ACC2,$ACC2
1158         vpmuludq        $Bi,$TEMP2,$TEMP2
1159         vmovdqu         -16+32*5-128($ap),$TEMP1
1160         vpaddq          $TEMP2,$ACC3,$ACC3
1161         vpmuludq        $Bi,$TEMP0,$TEMP0
1162         vmovdqu         -16+32*6-128($ap),$TEMP2
1163         vpaddq          $TEMP0,$ACC4,$ACC4
1164         vpmuludq        $Bi,$TEMP1,$TEMP1
1165         vmovdqu         -16+32*7-128($ap),$TEMP0
1166         vpaddq          $TEMP1,$ACC5,$ACC5
1167         vpmuludq        $Bi,$TEMP2,$TEMP2
1168         vmovdqu         -16+32*8-128($ap),$TEMP1
1169         vpaddq          $TEMP2,$ACC6,$ACC6
1170         vpmuludq        $Bi,$TEMP0,$TEMP0
1171         vmovdqu         -16+32*9-128($ap),$TEMP2
1172         vpaddq          $TEMP0,$ACC7,$ACC7
1173         vpmuludq        $Bi,$TEMP1,$TEMP1
1174         vpaddq          $TEMP1,$ACC8,$ACC8
1175         vpmuludq        $Bi,$TEMP2,$TEMP2
1176          vpbroadcastq   24($bp), $Bi
1177         vpaddq          $TEMP2,$ACC9,$ACC9
1178
1179          vmovdqu        -16+32*1-128($np),$TEMP0
1180         mov     %rax,%rdx
1181         imulq   -128($np),%rax
1182         add     %rax,$r2
1183          vmovdqu        -16+32*2-128($np),$TEMP1
1184         imulq   8-128($np),%rdx
1185         add     %rdx,$r3
1186         shr     \$29, $r2
1187
1188         vpmuludq        $Yi,$TEMP0,$TEMP0
1189          vmovq          $Bi, %rbx
1190         vmovdqu         -16+32*3-128($np),$TEMP2
1191         vpaddq          $TEMP0,$ACC1,$ACC1
1192         vpmuludq        $Yi,$TEMP1,$TEMP1
1193         vmovdqu         -16+32*4-128($np),$TEMP0
1194         vpaddq          $TEMP1,$ACC2,$ACC2
1195         vpmuludq        $Yi,$TEMP2,$TEMP2
1196         vmovdqu         -16+32*5-128($np),$TEMP1
1197         vpaddq          $TEMP2,$ACC3,$ACC3
1198         vpmuludq        $Yi,$TEMP0,$TEMP0
1199         vmovdqu         -16+32*6-128($np),$TEMP2
1200         vpaddq          $TEMP0,$ACC4,$ACC4
1201         vpmuludq        $Yi,$TEMP1,$TEMP1
1202         vmovdqu         -16+32*7-128($np),$TEMP0
1203         vpaddq          $TEMP1,$ACC5,$ACC5
1204         vpmuludq        $Yi,$TEMP2,$TEMP2
1205         vmovdqu         -16+32*8-128($np),$TEMP1
1206         vpaddq          $TEMP2,$ACC6,$ACC6
1207         vpmuludq        $Yi,$TEMP0,$TEMP0
1208         vmovdqu         -16+32*9-128($np),$TEMP2
1209         vpaddq          $TEMP0,$ACC7,$ACC7
1210         vpmuludq        $Yi,$TEMP1,$TEMP1
1211          vmovdqu        -24+32*1-128($ap),$TEMP0
1212         vpaddq          $TEMP1,$ACC8,$ACC8
1213         vpmuludq        $Yi,$TEMP2,$TEMP2
1214          vmovdqu        -24+32*2-128($ap),$TEMP1
1215         vpaddq          $TEMP2,$ACC9,$ACC9
1216
1217         add     $r2, $r3
1218         imulq   -128($ap),%rbx
1219         add     %rbx,$r3
1220
1221         mov     $r3, %rax
1222         imull   $n0, %eax
1223         and     \$0x1fffffff, %eax
1224
1225         vpmuludq        $Bi,$TEMP0,$TEMP0
1226          vmovd          %eax, $Yi
1227         vmovdqu         -24+32*3-128($ap),$TEMP2
1228         vpaddq          $TEMP0,$ACC1,$ACC1
1229         vpmuludq        $Bi,$TEMP1,$TEMP1
1230          vpbroadcastq   $Yi, $Yi
1231         vmovdqu         -24+32*4-128($ap),$TEMP0
1232         vpaddq          $TEMP1,$ACC2,$ACC2
1233         vpmuludq        $Bi,$TEMP2,$TEMP2
1234         vmovdqu         -24+32*5-128($ap),$TEMP1
1235         vpaddq          $TEMP2,$ACC3,$ACC3
1236         vpmuludq        $Bi,$TEMP0,$TEMP0
1237         vmovdqu         -24+32*6-128($ap),$TEMP2
1238         vpaddq          $TEMP0,$ACC4,$ACC4
1239         vpmuludq        $Bi,$TEMP1,$TEMP1
1240         vmovdqu         -24+32*7-128($ap),$TEMP0
1241         vpaddq          $TEMP1,$ACC5,$ACC5
1242         vpmuludq        $Bi,$TEMP2,$TEMP2
1243         vmovdqu         -24+32*8-128($ap),$TEMP1
1244         vpaddq          $TEMP2,$ACC6,$ACC6
1245         vpmuludq        $Bi,$TEMP0,$TEMP0
1246         vmovdqu         -24+32*9-128($ap),$TEMP2
1247         vpaddq          $TEMP0,$ACC7,$ACC7
1248         vpmuludq        $Bi,$TEMP1,$TEMP1
1249         vpaddq          $TEMP1,$ACC8,$ACC8
1250         vpmuludq        $Bi,$TEMP2,$TEMP2
1251          vpbroadcastq   32($bp), $Bi
1252         vpaddq          $TEMP2,$ACC9,$ACC9
1253          add            \$32, $bp                       # $bp++
1254
1255         vmovdqu         -24+32*1-128($np),$TEMP0
1256         imulq   -128($np),%rax
1257         add     %rax,$r3
1258         shr     \$29, $r3
1259
1260         vmovdqu         -24+32*2-128($np),$TEMP1
1261         vpmuludq        $Yi,$TEMP0,$TEMP0
1262          vmovq          $Bi, %rbx
1263         vmovdqu         -24+32*3-128($np),$TEMP2
1264         vpaddq          $TEMP0,$ACC1,$ACC0              # $ACC0==$TEMP0
1265         vpmuludq        $Yi,$TEMP1,$TEMP1
1266          vmovdqu        $ACC0, (%rsp)                   # transfer $r0-$r3
1267         vpaddq          $TEMP1,$ACC2,$ACC1
1268         vmovdqu         -24+32*4-128($np),$TEMP0
1269         vpmuludq        $Yi,$TEMP2,$TEMP2
1270         vmovdqu         -24+32*5-128($np),$TEMP1
1271         vpaddq          $TEMP2,$ACC3,$ACC2
1272         vpmuludq        $Yi,$TEMP0,$TEMP0
1273         vmovdqu         -24+32*6-128($np),$TEMP2
1274         vpaddq          $TEMP0,$ACC4,$ACC3
1275         vpmuludq        $Yi,$TEMP1,$TEMP1
1276         vmovdqu         -24+32*7-128($np),$TEMP0
1277         vpaddq          $TEMP1,$ACC5,$ACC4
1278         vpmuludq        $Yi,$TEMP2,$TEMP2
1279         vmovdqu         -24+32*8-128($np),$TEMP1
1280         vpaddq          $TEMP2,$ACC6,$ACC5
1281         vpmuludq        $Yi,$TEMP0,$TEMP0
1282         vmovdqu         -24+32*9-128($np),$TEMP2
1283          mov    $r3, $r0
1284         vpaddq          $TEMP0,$ACC7,$ACC6
1285         vpmuludq        $Yi,$TEMP1,$TEMP1
1286          add    (%rsp), $r0
1287         vpaddq          $TEMP1,$ACC8,$ACC7
1288         vpmuludq        $Yi,$TEMP2,$TEMP2
1289          vmovq  $r3, $TEMP1
1290         vpaddq          $TEMP2,$ACC9,$ACC8
1291
1292         dec     $i
1293         jnz     .Loop_mul_1024
1294 ___
1295
1296 # (*)   The original implementation corrected ACC1-ACC3 for overflow
1297 #       after 7 loop runs, i.e. after 28 iterations, or 56 additions.
1298 #       But as we underutilize resources, it's possible to correct in
1299 #       each iteration with marginal performance loss. And since we
1300 #       do it in each iteration, we can correct fewer digits and
1301 #       avoid performance penalties completely. Also note that we
1302 #       correct only three digits out of four. This works because
1303 #       the most significant digit is subjected to fewer additions.
1304
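# A quick sanity check of the headroom argument above (illustrative only,
# not used by the module, name is made up): every vpmuludq product of two
# 29-bit digits is below 2^58, so a 64-bit lane can absorb roughly
# 2^64/2^58 = 64 such additions before it can wrap, which is comfortably
# above the 56 additions mentioned in the note.
sub __lane_headroom_sketch {
	use Math::BigInt;
	my $digit_max = Math::BigInt->new(2)->bpow(29)->bsub(1);	# 2^29-1
	my $prod_max  = $digit_max->copy()->bmul($digit_max);		# < 2^58
	my $lane_max  = Math::BigInt->new(2)->bpow(64)->bsub(1);	# full lane
	return $lane_max->bdiv($prod_max);	# ~64 products fit in a lane
}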
1305 $TEMP0 = $ACC9;
1306 $TEMP3 = $Bi;
1307 $TEMP4 = $Yi;
1308 $code.=<<___;
1309         vpermq          \$0, $AND_MASK, $AND_MASK
1310         vpaddq          (%rsp), $TEMP1, $ACC0
1311
1312         vpsrlq          \$29, $ACC0, $TEMP1
1313         vpand           $AND_MASK, $ACC0, $ACC0
1314         vpsrlq          \$29, $ACC1, $TEMP2
1315         vpand           $AND_MASK, $ACC1, $ACC1
1316         vpsrlq          \$29, $ACC2, $TEMP3
1317         vpermq          \$0x93, $TEMP1, $TEMP1
1318         vpand           $AND_MASK, $ACC2, $ACC2
1319         vpsrlq          \$29, $ACC3, $TEMP4
1320         vpermq          \$0x93, $TEMP2, $TEMP2
1321         vpand           $AND_MASK, $ACC3, $ACC3
1322
1323         vpblendd        \$3, $ZERO, $TEMP1, $TEMP0
1324         vpermq          \$0x93, $TEMP3, $TEMP3
1325         vpblendd        \$3, $TEMP1, $TEMP2, $TEMP1
1326         vpermq          \$0x93, $TEMP4, $TEMP4
1327         vpaddq          $TEMP0, $ACC0, $ACC0
1328         vpblendd        \$3, $TEMP2, $TEMP3, $TEMP2
1329         vpaddq          $TEMP1, $ACC1, $ACC1
1330         vpblendd        \$3, $TEMP3, $TEMP4, $TEMP3
1331         vpaddq          $TEMP2, $ACC2, $ACC2
1332         vpblendd        \$3, $TEMP4, $ZERO, $TEMP4
1333         vpaddq          $TEMP3, $ACC3, $ACC3
1334         vpaddq          $TEMP4, $ACC4, $ACC4
1335
1336         vpsrlq          \$29, $ACC0, $TEMP1
1337         vpand           $AND_MASK, $ACC0, $ACC0
1338         vpsrlq          \$29, $ACC1, $TEMP2
1339         vpand           $AND_MASK, $ACC1, $ACC1
1340         vpsrlq          \$29, $ACC2, $TEMP3
1341         vpermq          \$0x93, $TEMP1, $TEMP1
1342         vpand           $AND_MASK, $ACC2, $ACC2
1343         vpsrlq          \$29, $ACC3, $TEMP4
1344         vpermq          \$0x93, $TEMP2, $TEMP2
1345         vpand           $AND_MASK, $ACC3, $ACC3
1346         vpermq          \$0x93, $TEMP3, $TEMP3
1347
1348         vpblendd        \$3, $ZERO, $TEMP1, $TEMP0
1349         vpermq          \$0x93, $TEMP4, $TEMP4
1350         vpblendd        \$3, $TEMP1, $TEMP2, $TEMP1
1351         vpaddq          $TEMP0, $ACC0, $ACC0
1352         vpblendd        \$3, $TEMP2, $TEMP3, $TEMP2
1353         vpaddq          $TEMP1, $ACC1, $ACC1
1354         vpblendd        \$3, $TEMP3, $TEMP4, $TEMP3
1355         vpaddq          $TEMP2, $ACC2, $ACC2
1356         vpblendd        \$3, $TEMP4, $ZERO, $TEMP4
1357         vpaddq          $TEMP3, $ACC3, $ACC3
1358         vpaddq          $TEMP4, $ACC4, $ACC4
1359
1360         vmovdqu         $ACC0, 0-128($rp)
1361         vmovdqu         $ACC1, 32-128($rp)
1362         vmovdqu         $ACC2, 64-128($rp)
1363         vmovdqu         $ACC3, 96-128($rp)
1364 ___
1365
1366 $TEMP5=$ACC0;
1367 $code.=<<___;
1368         vpsrlq          \$29, $ACC4, $TEMP1
1369         vpand           $AND_MASK, $ACC4, $ACC4
1370         vpsrlq          \$29, $ACC5, $TEMP2
1371         vpand           $AND_MASK, $ACC5, $ACC5
1372         vpsrlq          \$29, $ACC6, $TEMP3
1373         vpermq          \$0x93, $TEMP1, $TEMP1
1374         vpand           $AND_MASK, $ACC6, $ACC6
1375         vpsrlq          \$29, $ACC7, $TEMP4
1376         vpermq          \$0x93, $TEMP2, $TEMP2
1377         vpand           $AND_MASK, $ACC7, $ACC7
1378         vpsrlq          \$29, $ACC8, $TEMP5
1379         vpermq          \$0x93, $TEMP3, $TEMP3
1380         vpand           $AND_MASK, $ACC8, $ACC8
1381         vpermq          \$0x93, $TEMP4, $TEMP4
1382
1383         vpblendd        \$3, $ZERO, $TEMP1, $TEMP0
1384         vpermq          \$0x93, $TEMP5, $TEMP5
1385         vpblendd        \$3, $TEMP1, $TEMP2, $TEMP1
1386         vpaddq          $TEMP0, $ACC4, $ACC4
1387         vpblendd        \$3, $TEMP2, $TEMP3, $TEMP2
1388         vpaddq          $TEMP1, $ACC5, $ACC5
1389         vpblendd        \$3, $TEMP3, $TEMP4, $TEMP3
1390         vpaddq          $TEMP2, $ACC6, $ACC6
1391         vpblendd        \$3, $TEMP4, $TEMP5, $TEMP4
1392         vpaddq          $TEMP3, $ACC7, $ACC7
1393         vpaddq          $TEMP4, $ACC8, $ACC8
1394
1395         vpsrlq          \$29, $ACC4, $TEMP1
1396         vpand           $AND_MASK, $ACC4, $ACC4
1397         vpsrlq          \$29, $ACC5, $TEMP2
1398         vpand           $AND_MASK, $ACC5, $ACC5
1399         vpsrlq          \$29, $ACC6, $TEMP3
1400         vpermq          \$0x93, $TEMP1, $TEMP1
1401         vpand           $AND_MASK, $ACC6, $ACC6
1402         vpsrlq          \$29, $ACC7, $TEMP4
1403         vpermq          \$0x93, $TEMP2, $TEMP2
1404         vpand           $AND_MASK, $ACC7, $ACC7
1405         vpsrlq          \$29, $ACC8, $TEMP5
1406         vpermq          \$0x93, $TEMP3, $TEMP3
1407         vpand           $AND_MASK, $ACC8, $ACC8
1408         vpermq          \$0x93, $TEMP4, $TEMP4
1409
1410         vpblendd        \$3, $ZERO, $TEMP1, $TEMP0
1411         vpermq          \$0x93, $TEMP5, $TEMP5
1412         vpblendd        \$3, $TEMP1, $TEMP2, $TEMP1
1413         vpaddq          $TEMP0, $ACC4, $ACC4
1414         vpblendd        \$3, $TEMP2, $TEMP3, $TEMP2
1415         vpaddq          $TEMP1, $ACC5, $ACC5
1416         vpblendd        \$3, $TEMP3, $TEMP4, $TEMP3
1417         vpaddq          $TEMP2, $ACC6, $ACC6
1418         vpblendd        \$3, $TEMP4, $TEMP5, $TEMP4
1419         vpaddq          $TEMP3, $ACC7, $ACC7
1420         vpaddq          $TEMP4, $ACC8, $ACC8
1421
1422         vmovdqu         $ACC4, 128-128($rp)
1423         vmovdqu         $ACC5, 160-128($rp)
1424         vmovdqu         $ACC6, 192-128($rp)
1425         vmovdqu         $ACC7, 224-128($rp)
1426         vmovdqu         $ACC8, 256-128($rp)
1427         vzeroupper
1428
1429         mov     %rbp, %rax
1430 ___
1431 $code.=<<___ if ($win64);
1432         movaps  -0xd8(%rax),%xmm6
1433         movaps  -0xc8(%rax),%xmm7
1434         movaps  -0xb8(%rax),%xmm8
1435         movaps  -0xa8(%rax),%xmm9
1436         movaps  -0x98(%rax),%xmm10
1437         movaps  -0x88(%rax),%xmm11
1438         movaps  -0x78(%rax),%xmm12
1439         movaps  -0x68(%rax),%xmm13
1440         movaps  -0x58(%rax),%xmm14
1441         movaps  -0x48(%rax),%xmm15
1442 ___
1443 $code.=<<___;
1444         mov     -48(%rax),%r15
1445         mov     -40(%rax),%r14
1446         mov     -32(%rax),%r13
1447         mov     -24(%rax),%r12
1448         mov     -16(%rax),%rbp
1449         mov     -8(%rax),%rbx
1450         lea     (%rax),%rsp             # restore %rsp
1451 .Lmul_1024_epilogue:
1452         ret
1453 .size   rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
1454 ___
1455 }
1456 {
1457 my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
1458 my @T = map("%r$_",(8..11));
1459
1460 $code.=<<___;
1461 .globl  rsaz_1024_red2norm_avx2
1462 .type   rsaz_1024_red2norm_avx2,\@abi-omnipotent
1463 .align  32
1464 rsaz_1024_red2norm_avx2:
1465         sub     \$-128,$inp     # size optimization
1466         xor     %rax,%rax
1467 ___
1468
1469 for ($j=0,$i=0; $i<16; $i++) {
1470     my $k=0;
1471     while (29*$j<64*($i+1)) {   # load data till boundary
1472         $code.="        mov     `8*$j-128`($inp), @T[0]\n";
1473         $j++; $k++; push(@T,shift(@T));
1474     }
1475     $l=$k;
1476     while ($k>1) {              # shift loaded data but last value
1477         $code.="        shl     \$`29*($j-$k)`,@T[-$k]\n";
1478         $k--;
1479     }
1480     $code.=<<___;               # shift last value
1481         mov     @T[-1], @T[0]
1482         shl     \$`29*($j-1)`, @T[-1]
1483         shr     \$`-29*($j-1)`, @T[0]
1484 ___
1485     while ($l) {                # accumulate all values
1486         $code.="        add     @T[-$l], %rax\n";
1487         $l--;
1488     }
1489         $code.=<<___;
1490         adc     \$0, @T[0]      # consume eventual carry
1491         mov     %rax, 8*$i($out)
1492         mov     @T[0], %rax
1493 ___
1494     push(@T,shift(@T));
1495 }
1496 $code.=<<___;
1497         ret
1498 .size   rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
1499
1500 .globl  rsaz_1024_norm2red_avx2
1501 .type   rsaz_1024_norm2red_avx2,\@abi-omnipotent
1502 .align  32
1503 rsaz_1024_norm2red_avx2:
1504         sub     \$-128,$out     # size optimization
1505         mov     ($inp),@T[0]
1506         mov     \$0x1fffffff,%eax
1507 ___
1508 for ($j=0,$i=0; $i<16; $i++) {
1509     $code.="    mov     `8*($i+1)`($inp),@T[1]\n"       if ($i<15);
1510     $code.="    xor     @T[1],@T[1]\n"                  if ($i==15);
1511     my $k=1;
1512     while (29*($j+1)<64*($i+1)) {
1513         $code.=<<___;
1514         mov     @T[0],@T[-$k]
1515         shr     \$`29*$j`,@T[-$k]
1516         and     %rax,@T[-$k]                            # &0x1fffffff
1517         mov     @T[-$k],`8*$j-128`($out)
1518 ___
1519         $j++; $k++;
1520     }
1521     $code.=<<___;
1522         shrd    \$`29*$j`,@T[1],@T[0]
1523         and     %rax,@T[0]
1524         mov     @T[0],`8*$j-128`($out)
1525 ___
1526     $j++;
1527     push(@T,shift(@T));
1528 }
1529 $code.=<<___;
1530         mov     @T[0],`8*$j-128`($out)                  # zero
1531         mov     @T[0],`8*($j+1)-128`($out)
1532         mov     @T[0],`8*($j+2)-128`($out)
1533         mov     @T[0],`8*($j+3)-128`($out)
1534         ret
1535 .size   rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
1536 ___
1537 }
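
# Hypothetical reference models of the two conversions generated above, kept
# only as documentation and never called (the ref_* names are invented here).
# They use Math::BigInt, a core module, to check the arithmetic, whereas the
# real routines work directly on 64-bit limbs with shl/shr/shrd.  Limbs and
# digits are least significant first; each digit slot is 64 bits wide and
# holds one base-2^29 digit.
sub ref_norm2red {                      # sixteen 64-bit limbs -> forty digit slots
    require Math::BigInt;
    my $v = Math::BigInt->bzero();
    $v->blsft(64)->badd($_) for reverse @_;     # assemble the 1024-bit value
    my @digits;
    for (1 .. 40) {                     # 36 digits cover 1024 bits, 4 slots of padding
        push @digits, $v->copy()->band(0x1fffffff)->numify();
        $v->brsft(29);
    }
    return @digits;
}
sub ref_red2norm {                      # digit slots (possibly >29 bits) -> 16 limbs
    require Math::BigInt;
    my $v = Math::BigInt->bzero();
    $v->blsft(29)->badd($_) for reverse @_;     # sum of digit_j * 2^(29*j)
    my $m64 = Math::BigInt->new(2)->bpow(64)->bdec();
    my @limbs;
    for (1 .. 16) {                     # low 1024 bits of the sum
        push @limbs, $v->copy()->band($m64)->bstr();
        $v->brsft(64);
    }
    return @limbs;
}
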
1538 {
1539 my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
1540
1541 $code.=<<___;
1542 .globl  rsaz_1024_scatter5_avx2
1543 .type   rsaz_1024_scatter5_avx2,\@abi-omnipotent
1544 .align  32
1545 rsaz_1024_scatter5_avx2:
1546         vzeroupper
1547         vmovdqu .Lscatter_permd(%rip),%ymm5
1548         shl     \$4,$power
1549         lea     ($out,$power),$out
1550         mov     \$9,%eax
1551         jmp     .Loop_scatter_1024
1552
1553 .align  32
1554 .Loop_scatter_1024:
1555         vmovdqu         ($inp),%ymm0
1556         lea             32($inp),$inp
1557         vpermd          %ymm0,%ymm5,%ymm0
1558         vmovdqu         %xmm0,($out)
1559         lea             16*32($out),$out
1560         dec     %eax
1561         jnz     .Loop_scatter_1024
1562
1563         vzeroupper
1564         ret
1565 .size   rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
1566
1567 .globl  rsaz_1024_gather5_avx2
1568 .type   rsaz_1024_gather5_avx2,\@abi-omnipotent
1569 .align  32
1570 rsaz_1024_gather5_avx2:
1571 ___
1572 $code.=<<___ if ($win64);
1573         lea     -0x88(%rsp),%rax
1574 .LSEH_begin_rsaz_1024_gather5:
1575         # I can't trust assembler to use specific encoding:-(
1576         .byte   0x48,0x8d,0x60,0xe0             #lea    -0x20(%rax),%rsp
1577         .byte   0x0f,0x29,0x70,0xe0             #movaps %xmm6,-0x20(%rax)
1578         .byte   0x0f,0x29,0x78,0xf0             #movaps %xmm7,-0x10(%rax)
1579         .byte   0x44,0x0f,0x29,0x00             #movaps %xmm8,0(%rax)
1580         .byte   0x44,0x0f,0x29,0x48,0x10        #movaps %xmm9,0x10(%rax)
1581         .byte   0x44,0x0f,0x29,0x50,0x20        #movaps %xmm10,0x20(%rax)
1582         .byte   0x44,0x0f,0x29,0x58,0x30        #movaps %xmm11,0x30(%rax)
1583         .byte   0x44,0x0f,0x29,0x60,0x40        #movaps %xmm12,0x40(%rax)
1584         .byte   0x44,0x0f,0x29,0x68,0x50        #movaps %xmm13,0x50(%rax)
1585         .byte   0x44,0x0f,0x29,0x70,0x60        #movaps %xmm14,0x60(%rax)
1586         .byte   0x44,0x0f,0x29,0x78,0x70        #movaps %xmm15,0x70(%rax)
1587 ___
1588 $code.=<<___;
1589         vzeroupper
1590         lea     .Lgather_table(%rip),%r11
1591         mov     $power,%eax
1592         and     \$3,$power
1593         shr     \$2,%eax                        # cache line number
1594         shl     \$4,$power                      # offset within cache line
1595
1596         vmovdqu         -32(%r11),%ymm7         # .Lgather_permd
1597         vpbroadcastb    8(%r11,%rax), %xmm8
1598         vpbroadcastb    7(%r11,%rax), %xmm9
1599         vpbroadcastb    6(%r11,%rax), %xmm10
1600         vpbroadcastb    5(%r11,%rax), %xmm11
1601         vpbroadcastb    4(%r11,%rax), %xmm12
1602         vpbroadcastb    3(%r11,%rax), %xmm13
1603         vpbroadcastb    2(%r11,%rax), %xmm14
1604         vpbroadcastb    1(%r11,%rax), %xmm15
1605
1606         lea     ($inp,$power),$inp
1607         mov     \$64,%r11                       # size optimization
1608         mov     \$9,%eax
1609         jmp     .Loop_gather_1024
1610
1611 .align  32
1612 .Loop_gather_1024:
1613         vpand           ($inp),                 %xmm8,%xmm0
1614         vpand           ($inp,%r11),            %xmm9,%xmm1
1615         vpand           ($inp,%r11,2),          %xmm10,%xmm2
1616         vpand           64($inp,%r11,2),        %xmm11,%xmm3
1617          vpor                                   %xmm0,%xmm1,%xmm1
1618         vpand           ($inp,%r11,4),          %xmm12,%xmm4
1619          vpor                                   %xmm2,%xmm3,%xmm3
1620         vpand           64($inp,%r11,4),        %xmm13,%xmm5
1621          vpor                                   %xmm1,%xmm3,%xmm3
1622         vpand           -128($inp,%r11,8),      %xmm14,%xmm6
1623          vpor                                   %xmm4,%xmm5,%xmm5
1624         vpand           -64($inp,%r11,8),       %xmm15,%xmm2
1625         lea             ($inp,%r11,8),$inp
1626          vpor                                   %xmm3,%xmm5,%xmm5
1627          vpor                                   %xmm2,%xmm6,%xmm6
1628          vpor                                   %xmm5,%xmm6,%xmm6
1629         vpermd          %ymm6,%ymm7,%ymm6
1630         vmovdqu         %ymm6,($out)
1631         lea             32($out),$out
1632         dec     %eax
1633         jnz     .Loop_gather_1024
1634
1635         vpxor   %ymm0,%ymm0,%ymm0
1636         vmovdqu %ymm0,($out)
1637         vzeroupper
1638 ___
1639 $code.=<<___ if ($win64);
1640         movaps  (%rsp),%xmm6
1641         movaps  0x10(%rsp),%xmm7
1642         movaps  0x20(%rsp),%xmm8
1643         movaps  0x30(%rsp),%xmm9
1644         movaps  0x40(%rsp),%xmm10
1645         movaps  0x50(%rsp),%xmm11
1646         movaps  0x60(%rsp),%xmm12
1647         movaps  0x70(%rsp),%xmm13
1648         movaps  0x80(%rsp),%xmm14
1649         movaps  0x90(%rsp),%xmm15
1650         lea     0xa8(%rsp),%rsp
1651 .LSEH_end_rsaz_1024_gather5:
1652 ___
1653 $code.=<<___;
1654         ret
1655 .size   rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
1656 ___
1657 }
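
# A hypothetical scalar model of the constant-time table lookup done by
# rsaz_1024_gather5_avx2 above (illustration only, never called, and Perl
# itself gives no timing guarantees -- this only mirrors the data flow).
# Rather than loading the selected slot directly, every candidate is read,
# AND-ed with an all-ones or all-zeroes byte mask, and OR-ed into the result,
# just as the assembly combines vpand/vpor over masks broadcast from
# .Lgather_table.  The stride set up by rsaz_1024_scatter5_avx2 keeps all 32
# entries of a window inside the same eight cache lines, and the gather loads
# from every one of those lines on each call, so the cache footprint does not
# depend on the secret index.
sub ref_ct_select {
    my ($idx, @table) = @_;             # @table: equal-length byte strings
    my $acc = "\0" x length($table[0]);
    for my $j (0 .. $#table) {
        my $mask = chr($j == $idx ? 0xff : 0x00) x length($table[$j]);
        $acc |= $table[$j] & $mask;     # bytewise string AND / OR
    }
    return $acc;
}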
1658
1659 $code.=<<___;
1660 .extern OPENSSL_ia32cap_P
1661 .globl  rsaz_avx2_eligible
1662 .type   rsaz_avx2_eligible,\@abi-omnipotent
1663 .align  32
1664 rsaz_avx2_eligible:
1665         mov     OPENSSL_ia32cap_P+8(%rip),%eax  # third dword: CPUID(7,0).EBX
1666         and     \$`1<<5`,%eax                   # bit 5 = AVX2
1667         shr     \$5,%eax
1668         ret
1669 .size   rsaz_avx2_eligible,.-rsaz_avx2_eligible
1670
1671 .align  64
1672 .Land_mask:
1673         .quad   0x1fffffff,0x1fffffff,0x1fffffff,-1
1674 .Lscatter_permd:
1675         .long   0,2,4,6,7,7,7,7
1676 .Lgather_permd:
1677         .long   0,7,1,7,2,7,3,7
1678 .Lgather_table:
1679         .byte   0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
1680 .align  64
1681 ___
1682
1683 if ($win64) {
1684 $rec="%rcx";
1685 $frame="%rdx";
1686 $context="%r8";
1687 $disp="%r9";
1688
1689 $code.=<<___;
1690 .extern __imp_RtlVirtualUnwind
1691 .type   rsaz_se_handler,\@abi-omnipotent
1692 .align  16
1693 rsaz_se_handler:
1694         push    %rsi
1695         push    %rdi
1696         push    %rbx
1697         push    %rbp
1698         push    %r12
1699         push    %r13
1700         push    %r14
1701         push    %r15
1702         pushfq
1703         sub     \$64,%rsp
1704
1705         mov     120($context),%rax      # pull context->Rax
1706         mov     248($context),%rbx      # pull context->Rip
1707
1708         mov     8($disp),%rsi           # disp->ImageBase
1709         mov     56($disp),%r11          # disp->HandlerData
1710
1711         mov     0(%r11),%r10d           # HandlerData[0]
1712         lea     (%rsi,%r10),%r10        # prologue label
1713         cmp     %r10,%rbx               # context->Rip<prologue label
1714         jb      .Lcommon_seh_tail
1715
1716         mov     152($context),%rax      # pull context->Rsp
1717
1718         mov     4(%r11),%r10d           # HandlerData[1]
1719         lea     (%rsi,%r10),%r10        # epilogue label
1720         cmp     %r10,%rbx               # context->Rip>=epilogue label
1721         jae     .Lcommon_seh_tail
1722
1723         mov     160($context),%rax      # pull context->Rbp
1724
1725         mov     -48(%rax),%r15
1726         mov     -40(%rax),%r14
1727         mov     -32(%rax),%r13
1728         mov     -24(%rax),%r12
1729         mov     -16(%rax),%rbp
1730         mov     -8(%rax),%rbx
1731         mov     %r15,240($context)
1732         mov     %r14,232($context)
1733         mov     %r13,224($context)
1734         mov     %r12,216($context)
1735         mov     %rbp,160($context)
1736         mov     %rbx,144($context)
1737
1738         lea     -0xd8(%rax),%rsi        # %xmm save area
1739         lea     512($context),%rdi      # & context.Xmm6
1740         mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
1741         .long   0xa548f3fc              # cld; rep movsq
1742
1743 .Lcommon_seh_tail:
1744         mov     8(%rax),%rdi
1745         mov     16(%rax),%rsi
1746         mov     %rax,152($context)      # restore context->Rsp
1747         mov     %rsi,168($context)      # restore context->Rsi
1748         mov     %rdi,176($context)      # restore context->Rdi
1749
1750         mov     40($disp),%rdi          # disp->ContextRecord
1751         mov     $context,%rsi           # context
1752         mov     \$154,%ecx              # sizeof(CONTEXT)/sizeof(%rax)
1753         .long   0xa548f3fc              # cld; rep movsq
1754
1755         mov     $disp,%rsi
1756         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1757         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1758         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1759         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1760         mov     40(%rsi),%r10           # disp->ContextRecord
1761         lea     56(%rsi),%r11           # &disp->HandlerData
1762         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1763         mov     %r10,32(%rsp)           # arg5
1764         mov     %r11,40(%rsp)           # arg6
1765         mov     %r12,48(%rsp)           # arg7
1766         mov     %rcx,56(%rsp)           # arg8, (NULL)
1767         call    *__imp_RtlVirtualUnwind(%rip)
1768
1769         mov     \$1,%eax                # ExceptionContinueSearch
1770         add     \$64,%rsp
1771         popfq
1772         pop     %r15
1773         pop     %r14
1774         pop     %r13
1775         pop     %r12
1776         pop     %rbp
1777         pop     %rbx
1778         pop     %rdi
1779         pop     %rsi
1780         ret
1781 .size   rsaz_se_handler,.-rsaz_se_handler
1782
1783 .section        .pdata
1784 .align  4
1785         .rva    .LSEH_begin_rsaz_1024_sqr_avx2
1786         .rva    .LSEH_end_rsaz_1024_sqr_avx2
1787         .rva    .LSEH_info_rsaz_1024_sqr_avx2
1788
1789         .rva    .LSEH_begin_rsaz_1024_mul_avx2
1790         .rva    .LSEH_end_rsaz_1024_mul_avx2
1791         .rva    .LSEH_info_rsaz_1024_mul_avx2
1792
1793         .rva    .LSEH_begin_rsaz_1024_gather5
1794         .rva    .LSEH_end_rsaz_1024_gather5
1795         .rva    .LSEH_info_rsaz_1024_gather5
1796 .section        .xdata
1797 .align  8
1798 .LSEH_info_rsaz_1024_sqr_avx2:
1799         .byte   9,0,0,0
1800         .rva    rsaz_se_handler
1801         .rva    .Lsqr_1024_body,.Lsqr_1024_epilogue
1802 .LSEH_info_rsaz_1024_mul_avx2:
1803         .byte   9,0,0,0
1804         .rva    rsaz_se_handler
1805         .rva    .Lmul_1024_body,.Lmul_1024_epilogue
1806 .LSEH_info_rsaz_1024_gather5:
1807         .byte   0x01,0x33,0x16,0x00
1808         .byte   0x33,0xf8,0x09,0x00     #movaps 0x90(rsp),xmm15
1809         .byte   0x2e,0xe8,0x08,0x00     #movaps 0x80(rsp),xmm14
1810         .byte   0x29,0xd8,0x07,0x00     #movaps 0x70(rsp),xmm13
1811         .byte   0x24,0xc8,0x06,0x00     #movaps 0x60(rsp),xmm12
1812         .byte   0x1f,0xb8,0x05,0x00     #movaps 0x50(rsp),xmm11
1813         .byte   0x1a,0xa8,0x04,0x00     #movaps 0x40(rsp),xmm10
1814         .byte   0x15,0x98,0x03,0x00     #movaps 0x30(rsp),xmm9
1815         .byte   0x10,0x88,0x02,0x00     #movaps 0x20(rsp),xmm8
1816         .byte   0x0c,0x78,0x01,0x00     #movaps 0x10(rsp),xmm7
1817         .byte   0x08,0x68,0x00,0x00     #movaps 0x00(rsp),xmm6
1818         .byte   0x04,0x01,0x15,0x00     #sub    rsp,0xa8
1819 ___
1820 }
1821
1822 foreach (split("\n",$code)) {
1823         s/\`([^\`]*)\`/eval($1)/ge;
1824
1825         s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge               or
1826
1827         s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go          or
1828         s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go         or
1829         s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go        or
1830         s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go        or
1831         s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
1832         print $_,"\n";
1833 }
1834
1835 }}} else {{{
1836 print <<___;    # assembler is too old
1837 .text
1838
1839 .globl  rsaz_avx2_eligible
1840 .type   rsaz_avx2_eligible,\@abi-omnipotent
1841 rsaz_avx2_eligible:
1842         xor     %eax,%eax
1843         ret
1844 .size   rsaz_avx2_eligible,.-rsaz_avx2_eligible
1845
1846 .globl  rsaz_1024_sqr_avx2
1847 .globl  rsaz_1024_mul_avx2
1848 .globl  rsaz_1024_norm2red_avx2
1849 .globl  rsaz_1024_red2norm_avx2
1850 .globl  rsaz_1024_scatter5_avx2
1851 .globl  rsaz_1024_gather5_avx2
1852 .type   rsaz_1024_sqr_avx2,\@abi-omnipotent
1853 rsaz_1024_sqr_avx2:
1854 rsaz_1024_mul_avx2:
1855 rsaz_1024_norm2red_avx2:
1856 rsaz_1024_red2norm_avx2:
1857 rsaz_1024_scatter5_avx2:
1858 rsaz_1024_gather5_avx2:
1859         .byte   0x0f,0x0b       # ud2
1860         ret
1861 .size   rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
1862 ___
1863 }}}
1864
1865 close STDOUT;