3 ##############################################################################
5 # Copyright (c) 2012, Intel Corporation #
7 # All rights reserved. #
9 # Redistribution and use in source and binary forms, with or without #
10 # modification, are permitted provided that the following conditions are #
13 # * Redistributions of source code must retain the above copyright #
14 # notice, this list of conditions and the following disclaimer. #
16 # * Redistributions in binary form must reproduce the above copyright #
17 # notice, this list of conditions and the following disclaimer in the #
18 # documentation and/or other materials provided with the #
21 # * Neither the name of the Intel Corporation nor the names of its #
22 # contributors may be used to endorse or promote products derived from #
23 # this software without specific prior written permission. #
26 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY #
27 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
28 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
29 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
30 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
31 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
32 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
33 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
34 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
35 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
36 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
38 ##############################################################################
39 # Developers and authors: #
40 # Shay Gueron (1, 2), and Vlad Krasnov (1) #
41 # (1) Intel Corporation, Israel Development Center, Haifa, Israel #
42 # (2) University of Haifa, Israel #
43 ##############################################################################
45 # [1] S. Gueron, V. Krasnov: "Software Implementation of Modular #
46 # Exponentiation, Using Advanced Vector Instructions Architectures", #
47 # F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369, #
48 # pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012 #
49 # [2] S. Gueron: "Efficient Software Implementations of Modular #
50 # Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). #
51 # [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring", IEEE #
52 # Proceedings of 9th International Conference on Information Technology: #
53 # New Generations (ITNG 2012), pp.821-823 (2012) #
54 # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
55 # resistant 1024-bit modular exponentiation, for optimizing RSA2048 #
56 # on AVX2 capable x86_64 platforms", #
57 # http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
58 ##############################################################################
60 # +13% improvement over original submission by <appro@openssl.org>
62 # rsa2048 sign/sec OpenSSL 1.0.1 scalar(*) this
63 # 2.3GHz Haswell 621 765/+23% 1113/+79%
65 # (*) if system doesn't support AVX2, for reference purposes;
69 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
71 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
73 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
74 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
75 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
76 die "can't locate x86_64-xlate.pl";
78 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
79 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
80 $avx = ($1>=2.19) + ($1>=2.22);
83 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
84 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
85 $avx = ($1>=2.09) + ($1>=2.10);
88 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
89 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
90 $avx = ($1>=10) + ($1>=11);
93 if (!$avx && `$ENV{CC} -v 2>&1` =~ /LLVM ([3-9]\.[0-9]+)/) {
94 $avx = ($1>=3.0) + ($1>=3.1);
97 open OUT,"| $^X $xlate $flavour $output";
102 my $rp="%rdi"; # BN_ULONG *rp,
103 my $ap="%rsi"; # const BN_ULONG *ap,
104 my $np="%rdx"; # const BN_ULONG *np,
105 my $n0="%ecx"; # const BN_ULONG n0,
106 my $rep="%r8d"; # int repeat);
108 # The registers that hold the accumulated redundant result
109 # The AMM works on 1024-bit operands, and the redundant word size is 29 bits.
110 # Therefore: ceil(1024/29)/4 = 9
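# In other words: 1024 bits split into 29-bit digits gives ceil(1024/29) = 36
# digits, and with four 64-bit lanes per %ymm register that is 36/4 = 9
# registers ($ACC0..$ACC8, with $ACC9 used as the zero/top word). Keeping the
# digits at 29 bits leaves 64-2*29 = 6 bits of headroom per lane for
# accumulating 29x29-bit products. A one-liner check of the arithmetic:
#
#	perl -e 'use POSIX; print ceil(1024/29)/4'	# prints 9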
121 # Registers that hold the broadcasted words of bp, currently used
124 # Registers that hold the broadcasted words of Y, currently used
129 my $AND_MASK="%ymm15";
130 # alu registers that hold the first words of the ACC
136 my $i="%r14d"; # loop counter
139 my $FrameSize=32*18+32*8; # place for A^2 and 2*A
146 $np="%r13"; # reassigned argument
151 .globl rsaz_1024_sqr_avx2
152 .type rsaz_1024_sqr_avx2,\@function,5
154 rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2
164 $code.=<<___ if ($win64);
166 vmovaps %xmm6,-0xd8(%rax)
167 vmovaps %xmm7,-0xc8(%rax)
168 vmovaps %xmm8,-0xb8(%rax)
169 vmovaps %xmm9,-0xa8(%rax)
170 vmovaps %xmm10,-0x98(%rax)
171 vmovaps %xmm11,-0x88(%rax)
172 vmovaps %xmm12,-0x78(%rax)
173 vmovaps %xmm13,-0x68(%rax)
174 vmovaps %xmm14,-0x58(%rax)
175 vmovaps %xmm15,-0x48(%rax)
180 mov %rdx, $np # reassigned argument
181 sub \$$FrameSize, %rsp
183 sub \$-128, $rp # size optimization
187 and \$4095, $tmp # see if $np crosses page
190 vpxor $ACC9,$ACC9,$ACC9
191 jz .Lsqr_1024_no_n_copy
193 # unaligned 256-bit load that crosses page boundary can
194 # cause >2x performance degradation here, so if $np does
195 # cross page boundary, copy it to stack and make sure stack
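	# (The idea of the test above: the low 12 bits of $np give its offset
	#  within a 4KB page; if that offset leaves less room than the nine
	#  32-byte vector loads below need, at least one vmovdqu would straddle
	#  two pages, so the modulus is copied into the stack frame instead.)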
198 vmovdqu 32*0-128($np), $ACC0
200 vmovdqu 32*1-128($np), $ACC1
201 vmovdqu 32*2-128($np), $ACC2
202 vmovdqu 32*3-128($np), $ACC3
203 vmovdqu 32*4-128($np), $ACC4
204 vmovdqu 32*5-128($np), $ACC5
205 vmovdqu 32*6-128($np), $ACC6
206 vmovdqu 32*7-128($np), $ACC7
207 vmovdqu 32*8-128($np), $ACC8
208 lea $FrameSize+128(%rsp),$np
209 vmovdqu $ACC0, 32*0-128($np)
210 vmovdqu $ACC1, 32*1-128($np)
211 vmovdqu $ACC2, 32*2-128($np)
212 vmovdqu $ACC3, 32*3-128($np)
213 vmovdqu $ACC4, 32*4-128($np)
214 vmovdqu $ACC5, 32*5-128($np)
215 vmovdqu $ACC6, 32*6-128($np)
216 vmovdqu $ACC7, 32*7-128($np)
217 vmovdqu $ACC8, 32*8-128($np)
218 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero
220 .Lsqr_1024_no_n_copy:
223 vmovdqu 32*1-128($ap), $ACC1
224 vmovdqu 32*2-128($ap), $ACC2
225 vmovdqu 32*3-128($ap), $ACC3
226 vmovdqu 32*4-128($ap), $ACC4
227 vmovdqu 32*5-128($ap), $ACC5
228 vmovdqu 32*6-128($ap), $ACC6
229 vmovdqu 32*7-128($ap), $ACC7
230 vmovdqu 32*8-128($ap), $ACC8
232 lea 192(%rsp), $tp0 # 64+128=192
233 vpbroadcastq .Land_mask(%rip), $AND_MASK
234 jmp .LOOP_GRANDE_SQR_1024
237 .LOOP_GRANDE_SQR_1024:
238 lea 32*18+128(%rsp), $aap # size optimization
239 lea 448(%rsp), $tp1 # 64+128+256=448
241 # the squaring is performed as described in Variant B of
242 # "Speeding up Big-Number Squaring", so start by calculating the doubled operand.
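	# Variant B writes A^2 = sum_i a_i^2*B^(2i) + sum_{i<j} (2*a_j)*a_i*B^(i+j),
	# i.e. the operand is doubled once up front (kept at $aap) instead of
	# doubling every cross product. A scalar sketch of the idea (illustrative
	# only, not part of the generator; $aa[$j] holds 2*$a[$j]):
	#
	#	for my $i (0 .. $n-1) {
	#	    $r[2*$i] += $a[$i]*$a[$i];		# diagonal term from $ap
	#	    for my $j ($i+1 .. $n-1) {
	#		$r[$i+$j] += $aa[$j]*$a[$i];	# cross term from $aap
	#	    }
	#	}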
244 vpaddq $ACC1, $ACC1, $ACC1
245 vpbroadcastq 32*0-128($ap), $B1
246 vpaddq $ACC2, $ACC2, $ACC2
247 vmovdqa $ACC1, 32*0-128($aap)
248 vpaddq $ACC3, $ACC3, $ACC3
249 vmovdqa $ACC2, 32*1-128($aap)
250 vpaddq $ACC4, $ACC4, $ACC4
251 vmovdqa $ACC3, 32*2-128($aap)
252 vpaddq $ACC5, $ACC5, $ACC5
253 vmovdqa $ACC4, 32*3-128($aap)
254 vpaddq $ACC6, $ACC6, $ACC6
255 vmovdqa $ACC5, 32*4-128($aap)
256 vpaddq $ACC7, $ACC7, $ACC7
257 vmovdqa $ACC6, 32*5-128($aap)
258 vpaddq $ACC8, $ACC8, $ACC8
259 vmovdqa $ACC7, 32*6-128($aap)
260 vpxor $ACC9, $ACC9, $ACC9
261 vmovdqa $ACC8, 32*7-128($aap)
263 vpmuludq 32*0-128($ap), $B1, $ACC0
264 vpbroadcastq 32*1-128($ap), $B2
265 vmovdqu $ACC9, 32*9-192($tp0) # zero upper half
266 vpmuludq $B1, $ACC1, $ACC1
267 vmovdqu $ACC9, 32*10-448($tp1)
268 vpmuludq $B1, $ACC2, $ACC2
269 vmovdqu $ACC9, 32*11-448($tp1)
270 vpmuludq $B1, $ACC3, $ACC3
271 vmovdqu $ACC9, 32*12-448($tp1)
272 vpmuludq $B1, $ACC4, $ACC4
273 vmovdqu $ACC9, 32*13-448($tp1)
274 vpmuludq $B1, $ACC5, $ACC5
275 vmovdqu $ACC9, 32*14-448($tp1)
276 vpmuludq $B1, $ACC6, $ACC6
277 vmovdqu $ACC9, 32*15-448($tp1)
278 vpmuludq $B1, $ACC7, $ACC7
279 vmovdqu $ACC9, 32*16-448($tp1)
280 vpmuludq $B1, $ACC8, $ACC8
281 vpbroadcastq 32*2-128($ap), $B1
282 vmovdqu $ACC9, 32*17-448($tp1)
293 vpbroadcastq 32*1-128($tpa), $B2
294 vpmuludq 32*0-128($ap), $B1, $ACC0
295 vpaddq 32*0-192($tp0), $ACC0, $ACC0
296 vpmuludq 32*0-128($aap), $B1, $ACC1
297 vpaddq 32*1-192($tp0), $ACC1, $ACC1
298 vpmuludq 32*1-128($aap), $B1, $ACC2
299 vpaddq 32*2-192($tp0), $ACC2, $ACC2
300 vpmuludq 32*2-128($aap), $B1, $ACC3
301 vpaddq 32*3-192($tp0), $ACC3, $ACC3
302 vpmuludq 32*3-128($aap), $B1, $ACC4
303 vpaddq 32*4-192($tp0), $ACC4, $ACC4
304 vpmuludq 32*4-128($aap), $B1, $ACC5
305 vpaddq 32*5-192($tp0), $ACC5, $ACC5
306 vpmuludq 32*5-128($aap), $B1, $ACC6
307 vpaddq 32*6-192($tp0), $ACC6, $ACC6
308 vpmuludq 32*6-128($aap), $B1, $ACC7
309 vpaddq 32*7-192($tp0), $ACC7, $ACC7
310 vpmuludq 32*7-128($aap), $B1, $ACC8
311 vpbroadcastq 32*2-128($tpa), $B1
312 vpaddq 32*8-192($tp0), $ACC8, $ACC8
314 vmovdqu $ACC0, 32*0-192($tp0)
315 vmovdqu $ACC1, 32*1-192($tp0)
317 vpmuludq 32*1-128($ap), $B2, $TEMP0
318 vpaddq $TEMP0, $ACC2, $ACC2
319 vpmuludq 32*1-128($aap), $B2, $TEMP1
320 vpaddq $TEMP1, $ACC3, $ACC3
321 vpmuludq 32*2-128($aap), $B2, $TEMP2
322 vpaddq $TEMP2, $ACC4, $ACC4
323 vpmuludq 32*3-128($aap), $B2, $TEMP0
324 vpaddq $TEMP0, $ACC5, $ACC5
325 vpmuludq 32*4-128($aap), $B2, $TEMP1
326 vpaddq $TEMP1, $ACC6, $ACC6
327 vpmuludq 32*5-128($aap), $B2, $TEMP2
328 vpaddq $TEMP2, $ACC7, $ACC7
329 vpmuludq 32*6-128($aap), $B2, $TEMP0
330 vpaddq $TEMP0, $ACC8, $ACC8
331 vpmuludq 32*7-128($aap), $B2, $ACC0
332 vpbroadcastq 32*3-128($tpa), $B2
333 vpaddq 32*9-192($tp0), $ACC0, $ACC0
335 vmovdqu $ACC2, 32*2-192($tp0)
336 vmovdqu $ACC3, 32*3-192($tp0)
338 vpmuludq 32*2-128($ap), $B1, $TEMP2
339 vpaddq $TEMP2, $ACC4, $ACC4
340 vpmuludq 32*2-128($aap), $B1, $TEMP0
341 vpaddq $TEMP0, $ACC5, $ACC5
342 vpmuludq 32*3-128($aap), $B1, $TEMP1
343 vpaddq $TEMP1, $ACC6, $ACC6
344 vpmuludq 32*4-128($aap), $B1, $TEMP2
345 vpaddq $TEMP2, $ACC7, $ACC7
346 vpmuludq 32*5-128($aap), $B1, $TEMP0
347 vpaddq $TEMP0, $ACC8, $ACC8
348 vpmuludq 32*6-128($aap), $B1, $TEMP1
349 vpaddq $TEMP1, $ACC0, $ACC0
350 vpmuludq 32*7-128($aap), $B1, $ACC1
351 vpbroadcastq 32*4-128($tpa), $B1
352 vpaddq 32*10-448($tp1), $ACC1, $ACC1
354 vmovdqu $ACC4, 32*4-192($tp0)
355 vmovdqu $ACC5, 32*5-192($tp0)
357 vpmuludq 32*3-128($ap), $B2, $TEMP0
358 vpaddq $TEMP0, $ACC6, $ACC6
359 vpmuludq 32*3-128($aap), $B2, $TEMP1
360 vpaddq $TEMP1, $ACC7, $ACC7
361 vpmuludq 32*4-128($aap), $B2, $TEMP2
362 vpaddq $TEMP2, $ACC8, $ACC8
363 vpmuludq 32*5-128($aap), $B2, $TEMP0
364 vpaddq $TEMP0, $ACC0, $ACC0
365 vpmuludq 32*6-128($aap), $B2, $TEMP1
366 vpaddq $TEMP1, $ACC1, $ACC1
367 vpmuludq 32*7-128($aap), $B2, $ACC2
368 vpbroadcastq 32*5-128($tpa), $B2
369 vpaddq 32*11-448($tp1), $ACC2, $ACC2
371 vmovdqu $ACC6, 32*6-192($tp0)
372 vmovdqu $ACC7, 32*7-192($tp0)
374 vpmuludq 32*4-128($ap), $B1, $TEMP0
375 vpaddq $TEMP0, $ACC8, $ACC8
376 vpmuludq 32*4-128($aap), $B1, $TEMP1
377 vpaddq $TEMP1, $ACC0, $ACC0
378 vpmuludq 32*5-128($aap), $B1, $TEMP2
379 vpaddq $TEMP2, $ACC1, $ACC1
380 vpmuludq 32*6-128($aap), $B1, $TEMP0
381 vpaddq $TEMP0, $ACC2, $ACC2
382 vpmuludq 32*7-128($aap), $B1, $ACC3
383 vpbroadcastq 32*6-128($tpa), $B1
384 vpaddq 32*12-448($tp1), $ACC3, $ACC3
386 vmovdqu $ACC8, 32*8-192($tp0)
387 vmovdqu $ACC0, 32*9-192($tp0)
390 vpmuludq 32*5-128($ap), $B2, $TEMP2
391 vpaddq $TEMP2, $ACC1, $ACC1
392 vpmuludq 32*5-128($aap), $B2, $TEMP0
393 vpaddq $TEMP0, $ACC2, $ACC2
394 vpmuludq 32*6-128($aap), $B2, $TEMP1
395 vpaddq $TEMP1, $ACC3, $ACC3
396 vpmuludq 32*7-128($aap), $B2, $ACC4
397 vpbroadcastq 32*7-128($tpa), $B2
398 vpaddq 32*13-448($tp1), $ACC4, $ACC4
400 vmovdqu $ACC1, 32*10-448($tp1)
401 vmovdqu $ACC2, 32*11-448($tp1)
403 vpmuludq 32*6-128($ap), $B1, $TEMP0
404 vpaddq $TEMP0, $ACC3, $ACC3
405 vpmuludq 32*6-128($aap), $B1, $TEMP1
406 vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1
407 vpaddq $TEMP1, $ACC4, $ACC4
408 vpmuludq 32*7-128($aap), $B1, $ACC5
409 vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration
410 vpaddq 32*14-448($tp1), $ACC5, $ACC5
412 vmovdqu $ACC3, 32*12-448($tp1)
413 vmovdqu $ACC4, 32*13-448($tp1)
416 vpmuludq 32*7-128($ap), $B2, $TEMP0
417 vpaddq $TEMP0, $ACC5, $ACC5
418 vpmuludq 32*7-128($aap), $B2, $ACC6
419 vpaddq 32*15-448($tp1), $ACC6, $ACC6
421 vpmuludq 32*8-128($ap), $ACC0, $ACC7
422 vmovdqu $ACC5, 32*14-448($tp1)
423 vpaddq 32*16-448($tp1), $ACC7, $ACC7
424 vmovdqu $ACC6, 32*15-448($tp1)
425 vmovdqu $ACC7, 32*16-448($tp1)
437 # we need to fix indexes 32-39 to avoid overflow
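	# Each 64-bit lane may have grown past 29 bits by now, so the block
	# below renormalizes: the low 29 bits stay put (vpand with $AND_MASK)
	# while the carry (vpsrlq by 29) is rotated one lane up (vpermq 0x93,
	# with vpblendd handling the lane that wraps between registers) and
	# added into the following digit; per digit this is simply
	#
	#	$carry        = $digit[$k] >> 29;
	#	$digit[$k]   &= (1<<29) - 1;
	#	$digit[$k+1] += $carry;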
438 vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
439 vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
440 vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
441 lea 192(%rsp), $tp0 # 64+128=192
443 vpsrlq \$29, $ACC8, $TEMP1
444 vpand $AND_MASK, $ACC8, $ACC8
445 vpsrlq \$29, $ACC1, $TEMP2
446 vpand $AND_MASK, $ACC1, $ACC1
448 vpermq \$0x93, $TEMP1, $TEMP1
449 vpxor $ZERO, $ZERO, $ZERO
450 vpermq \$0x93, $TEMP2, $TEMP2
452 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
453 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
454 vpaddq $TEMP0, $ACC8, $ACC8
455 vpblendd \$3, $TEMP2, $ZERO, $TEMP2
456 vpaddq $TEMP1, $ACC1, $ACC1
457 vpaddq $TEMP2, $ACC2, $ACC2
458 vmovdqu $ACC1, 32*9-192($tp0)
459 vmovdqu $ACC2, 32*10-192($tp0)
465 vmovdqu 32*1(%rsp), $ACC1
466 vmovdqu 32*2-192($tp0), $ACC2
467 vmovdqu 32*3-192($tp0), $ACC3
468 vmovdqu 32*4-192($tp0), $ACC4
469 vmovdqu 32*5-192($tp0), $ACC5
470 vmovdqu 32*6-192($tp0), $ACC6
471 vmovdqu 32*7-192($tp0), $ACC7
475 and \$0x1fffffff, %eax
479 imulq -128($np), %rax
480 vpbroadcastq $Y1, $Y1
483 imulq 8-128($np), %rax
487 imulq 16-128($np), %rax
490 imulq 24-128($np), %rdx
495 and \$0x1fffffff, %eax
498 jmp .LOOP_REDUCE_1024
503 vpbroadcastq $Y2, $Y2
505 vpmuludq 32*1-128($np), $Y1, $TEMP0
507 imulq -128($np), %rax
508 vpaddq $TEMP0, $ACC1, $ACC1
510 vpmuludq 32*2-128($np), $Y1, $TEMP1
512 imulq 8-128($np), %rax
513 vpaddq $TEMP1, $ACC2, $ACC2
514 vpmuludq 32*3-128($np), $Y1, $TEMP2
519 imulq 16-128($np), %rax
521 vpaddq $TEMP2, $ACC3, $ACC3
522 vpmuludq 32*4-128($np), $Y1, $TEMP0
525 vpaddq $TEMP0, $ACC4, $ACC4
526 vpmuludq 32*5-128($np), $Y1, $TEMP1
529 vpaddq $TEMP1, $ACC5, $ACC5
530 vpmuludq 32*6-128($np), $Y1, $TEMP2
531 and \$0x1fffffff, %eax
532 vpaddq $TEMP2, $ACC6, $ACC6
533 vpmuludq 32*7-128($np), $Y1, $TEMP0
534 vpaddq $TEMP0, $ACC7, $ACC7
535 vpmuludq 32*8-128($np), $Y1, $TEMP1
537 #vmovdqu 32*1-8-128($np), $TEMP2 # moved below
538 vpaddq $TEMP1, $ACC8, $ACC8
539 #vmovdqu 32*2-8-128($np), $TEMP0 # moved below
540 vpbroadcastq $Y1, $Y1
542 vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above
543 vmovdqu 32*3-8-128($np), $TEMP1
545 imulq -128($np), %rax
546 vpaddq $TEMP2, $ACC1, $ACC1
547 vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above
548 vmovdqu 32*4-8-128($np), $TEMP2
551 imulq 8-128($np), %rax
552 vpaddq $TEMP0, $ACC2, $ACC2
555 vpmuludq $Y2, $TEMP1, $TEMP1
556 vmovdqu 32*5-8-128($np), $TEMP0
558 vpaddq $TEMP1, $ACC3, $ACC3
559 vpmuludq $Y2, $TEMP2, $TEMP2
560 vmovdqu 32*6-8-128($np), $TEMP1
564 vpaddq $TEMP2, $ACC4, $ACC4
565 vpmuludq $Y2, $TEMP0, $TEMP0
566 .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2
567 and \$0x1fffffff, %eax
568 vpaddq $TEMP0, $ACC5, $ACC5
569 vpmuludq $Y2, $TEMP1, $TEMP1
570 vmovdqu 32*8-8-128($np), $TEMP0
571 vpaddq $TEMP1, $ACC6, $ACC6
572 vpmuludq $Y2, $TEMP2, $TEMP2
573 vmovdqu 32*9-8-128($np), $ACC9
574 vmovd %eax, $ACC0 # borrow ACC0 for Y2
575 imulq -128($np), %rax
576 vpaddq $TEMP2, $ACC7, $ACC7
577 vpmuludq $Y2, $TEMP0, $TEMP0
578 vmovdqu 32*1-16-128($np), $TEMP1
579 vpbroadcastq $ACC0, $ACC0
580 vpaddq $TEMP0, $ACC8, $ACC8
581 vpmuludq $Y2, $ACC9, $ACC9
582 vmovdqu 32*2-16-128($np), $TEMP2
586 ($ACC0,$Y2)=($Y2,$ACC0);
588 vmovdqu 32*1-24-128($np), $ACC0
589 vpmuludq $Y1, $TEMP1, $TEMP1
590 vmovdqu 32*3-16-128($np), $TEMP0
591 vpaddq $TEMP1, $ACC1, $ACC1
592 vpmuludq $Y2, $ACC0, $ACC0
593 vpmuludq $Y1, $TEMP2, $TEMP2
594 .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1
595 vpaddq $ACC1, $ACC0, $ACC0
596 vpaddq $TEMP2, $ACC2, $ACC2
597 vpmuludq $Y1, $TEMP0, $TEMP0
598 vmovdqu 32*5-16-128($np), $TEMP2
601 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
602 vpaddq $TEMP0, $ACC3, $ACC3
603 vpmuludq $Y1, $TEMP1, $TEMP1
604 vmovdqu 32*6-16-128($np), $TEMP0
605 vpaddq $TEMP1, $ACC4, $ACC4
606 vpmuludq $Y1, $TEMP2, $TEMP2
607 vmovdqu 32*7-16-128($np), $TEMP1
608 vpaddq $TEMP2, $ACC5, $ACC5
609 vpmuludq $Y1, $TEMP0, $TEMP0
610 vmovdqu 32*8-16-128($np), $TEMP2
611 vpaddq $TEMP0, $ACC6, $ACC6
612 vpmuludq $Y1, $TEMP1, $TEMP1
614 vmovdqu 32*9-16-128($np), $TEMP0
616 vpaddq $TEMP1, $ACC7, $ACC7
617 vpmuludq $Y1, $TEMP2, $TEMP2
618 #vmovdqu 32*2-24-128($np), $TEMP1 # moved below
621 vpaddq $TEMP2, $ACC8, $ACC8
622 vpmuludq $Y1, $TEMP0, $TEMP0
623 and \$0x1fffffff, %eax
625 vmovdqu 32*3-24-128($np), $TEMP2
627 vpaddq $TEMP0, $ACC9, $ACC9
628 vpbroadcastq $Y1, $Y1
630 vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above
631 vmovdqu 32*4-24-128($np), $TEMP0
633 imulq -128($np), %rax
635 vpaddq $TEMP1, $ACC2, $ACC1
636 vpmuludq $Y2, $TEMP2, $TEMP2
637 vmovdqu 32*5-24-128($np), $TEMP1
640 imulq 8-128($np), %rax
644 vpaddq $TEMP2, $ACC3, $ACC2
645 vpmuludq $Y2, $TEMP0, $TEMP0
646 vmovdqu 32*6-24-128($np), $TEMP2
649 imulq 16-128($np), %rax
650 vpaddq $TEMP0, $ACC4, $ACC3
651 vpmuludq $Y2, $TEMP1, $TEMP1
652 vmovdqu 32*7-24-128($np), $TEMP0
653 imulq 24-128($np), %rdx # future $r3
656 vpaddq $TEMP1, $ACC5, $ACC4
657 vpmuludq $Y2, $TEMP2, $TEMP2
658 vmovdqu 32*8-24-128($np), $TEMP1
661 vpmuludq $Y2, $TEMP0, $TEMP0
662 vpaddq $TEMP2, $ACC6, $ACC5
663 vmovdqu 32*9-24-128($np), $TEMP2
664 and \$0x1fffffff, %eax
665 vpaddq $TEMP0, $ACC7, $ACC6
666 vpmuludq $Y2, $TEMP1, $TEMP1
668 vpaddq $TEMP1, $ACC8, $ACC7
669 vpmuludq $Y2, $TEMP2, $TEMP2
670 vpaddq $TEMP2, $ACC9, $ACC8
675 jnz .LOOP_REDUCE_1024
677 ($ACC0,$Y2)=($Y2,$ACC0);
679 lea 448(%rsp), $tp1 # size optimization
680 vpaddq $ACC9, $Y2, $ACC0
681 vpxor $ZERO, $ZERO, $ZERO
683 vpaddq 32*9-192($tp0), $ACC0, $ACC0
684 vpaddq 32*10-448($tp1), $ACC1, $ACC1
685 vpaddq 32*11-448($tp1), $ACC2, $ACC2
686 vpaddq 32*12-448($tp1), $ACC3, $ACC3
687 vpaddq 32*13-448($tp1), $ACC4, $ACC4
688 vpaddq 32*14-448($tp1), $ACC5, $ACC5
689 vpaddq 32*15-448($tp1), $ACC6, $ACC6
690 vpaddq 32*16-448($tp1), $ACC7, $ACC7
691 vpaddq 32*17-448($tp1), $ACC8, $ACC8
693 vpsrlq \$29, $ACC0, $TEMP1
694 vpand $AND_MASK, $ACC0, $ACC0
695 vpsrlq \$29, $ACC1, $TEMP2
696 vpand $AND_MASK, $ACC1, $ACC1
697 vpsrlq \$29, $ACC2, $TEMP3
698 vpermq \$0x93, $TEMP1, $TEMP1
699 vpand $AND_MASK, $ACC2, $ACC2
700 vpsrlq \$29, $ACC3, $TEMP4
701 vpermq \$0x93, $TEMP2, $TEMP2
702 vpand $AND_MASK, $ACC3, $ACC3
703 vpermq \$0x93, $TEMP3, $TEMP3
705 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
706 vpermq \$0x93, $TEMP4, $TEMP4
707 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
708 vpaddq $TEMP0, $ACC0, $ACC0
709 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
710 vpaddq $TEMP1, $ACC1, $ACC1
711 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
712 vpaddq $TEMP2, $ACC2, $ACC2
713 vpblendd \$3, $TEMP4, $ZERO, $TEMP4
714 vpaddq $TEMP3, $ACC3, $ACC3
715 vpaddq $TEMP4, $ACC4, $ACC4
717 vpsrlq \$29, $ACC0, $TEMP1
718 vpand $AND_MASK, $ACC0, $ACC0
719 vpsrlq \$29, $ACC1, $TEMP2
720 vpand $AND_MASK, $ACC1, $ACC1
721 vpsrlq \$29, $ACC2, $TEMP3
722 vpermq \$0x93, $TEMP1, $TEMP1
723 vpand $AND_MASK, $ACC2, $ACC2
724 vpsrlq \$29, $ACC3, $TEMP4
725 vpermq \$0x93, $TEMP2, $TEMP2
726 vpand $AND_MASK, $ACC3, $ACC3
727 vpermq \$0x93, $TEMP3, $TEMP3
729 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
730 vpermq \$0x93, $TEMP4, $TEMP4
731 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
732 vpaddq $TEMP0, $ACC0, $ACC0
733 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
734 vpaddq $TEMP1, $ACC1, $ACC1
735 vmovdqu $ACC0, 32*0-128($rp)
736 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
737 vpaddq $TEMP2, $ACC2, $ACC2
738 vmovdqu $ACC1, 32*1-128($rp)
739 vpblendd \$3, $TEMP4, $ZERO, $TEMP4
740 vpaddq $TEMP3, $ACC3, $ACC3
741 vmovdqu $ACC2, 32*2-128($rp)
742 vpaddq $TEMP4, $ACC4, $ACC4
743 vmovdqu $ACC3, 32*3-128($rp)
747 vpsrlq \$29, $ACC4, $TEMP1
748 vpand $AND_MASK, $ACC4, $ACC4
749 vpsrlq \$29, $ACC5, $TEMP2
750 vpand $AND_MASK, $ACC5, $ACC5
751 vpsrlq \$29, $ACC6, $TEMP3
752 vpermq \$0x93, $TEMP1, $TEMP1
753 vpand $AND_MASK, $ACC6, $ACC6
754 vpsrlq \$29, $ACC7, $TEMP4
755 vpermq \$0x93, $TEMP2, $TEMP2
756 vpand $AND_MASK, $ACC7, $ACC7
757 vpsrlq \$29, $ACC8, $TEMP5
758 vpermq \$0x93, $TEMP3, $TEMP3
759 vpand $AND_MASK, $ACC8, $ACC8
760 vpermq \$0x93, $TEMP4, $TEMP4
762 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
763 vpermq \$0x93, $TEMP5, $TEMP5
764 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
765 vpaddq $TEMP0, $ACC4, $ACC4
766 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
767 vpaddq $TEMP1, $ACC5, $ACC5
768 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
769 vpaddq $TEMP2, $ACC6, $ACC6
770 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
771 vpaddq $TEMP3, $ACC7, $ACC7
772 vpaddq $TEMP4, $ACC8, $ACC8
774 vpsrlq \$29, $ACC4, $TEMP1
775 vpand $AND_MASK, $ACC4, $ACC4
776 vpsrlq \$29, $ACC5, $TEMP2
777 vpand $AND_MASK, $ACC5, $ACC5
778 vpsrlq \$29, $ACC6, $TEMP3
779 vpermq \$0x93, $TEMP1, $TEMP1
780 vpand $AND_MASK, $ACC6, $ACC6
781 vpsrlq \$29, $ACC7, $TEMP4
782 vpermq \$0x93, $TEMP2, $TEMP2
783 vpand $AND_MASK, $ACC7, $ACC7
784 vpsrlq \$29, $ACC8, $TEMP5
785 vpermq \$0x93, $TEMP3, $TEMP3
786 vpand $AND_MASK, $ACC8, $ACC8
787 vpermq \$0x93, $TEMP4, $TEMP4
789 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
790 vpermq \$0x93, $TEMP5, $TEMP5
791 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
792 vpaddq $TEMP0, $ACC4, $ACC4
793 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
794 vpaddq $TEMP1, $ACC5, $ACC5
795 vmovdqu $ACC4, 32*4-128($rp)
796 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
797 vpaddq $TEMP2, $ACC6, $ACC6
798 vmovdqu $ACC5, 32*5-128($rp)
799 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
800 vpaddq $TEMP3, $ACC7, $ACC7
801 vmovdqu $ACC6, 32*6-128($rp)
802 vpaddq $TEMP4, $ACC8, $ACC8
803 vmovdqu $ACC7, 32*7-128($rp)
804 vmovdqu $ACC8, 32*8-128($rp)
808 jne .LOOP_GRANDE_SQR_1024
813 $code.=<<___ if ($win64);
814 movaps -0xd8(%rax),%xmm6
815 movaps -0xc8(%rax),%xmm7
816 movaps -0xb8(%rax),%xmm8
817 movaps -0xa8(%rax),%xmm9
818 movaps -0x98(%rax),%xmm10
819 movaps -0x88(%rax),%xmm11
820 movaps -0x78(%rax),%xmm12
821 movaps -0x68(%rax),%xmm13
822 movaps -0x58(%rax),%xmm14
823 movaps -0x48(%rax),%xmm15
832 lea (%rax),%rsp # restore %rsp
835 .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
840 my $rp="%rdi"; # BN_ULONG *rp,
841 my $ap="%rsi"; # const BN_ULONG *ap,
842 my $bp="%rdx"; # const BN_ULONG *bp,
843 my $np="%rcx"; # const BN_ULONG *np,
844 my $n0="%r8d"; # unsigned int n0);
846 # The registers that hold the accumulated redundant result
847 # The AMM works on 1024-bit operands, and the redundant word size is 29 bits.
848 # Therefore: ceil(1024/29)/4 = 9
860 # Registers that hold the broadcasted words of multiplier, currently used
869 my $AND_MASK="%ymm15";
871 # alu registers that hold the first words of the ACC
880 $bp="%r13"; # reassigned argument
883 .globl rsaz_1024_mul_avx2
884 .type rsaz_1024_mul_avx2,\@function,5
895 $code.=<<___ if ($win64);
898 vmovaps %xmm6,-0xd8(%rax)
899 vmovaps %xmm7,-0xc8(%rax)
900 vmovaps %xmm8,-0xb8(%rax)
901 vmovaps %xmm9,-0xa8(%rax)
902 vmovaps %xmm10,-0x98(%rax)
903 vmovaps %xmm11,-0x88(%rax)
904 vmovaps %xmm12,-0x78(%rax)
905 vmovaps %xmm13,-0x68(%rax)
906 vmovaps %xmm14,-0x58(%rax)
907 vmovaps %xmm15,-0x48(%rax)
913 mov %rdx, $bp # reassigned argument
916 # unaligned 256-bit load that crosses page boundary can
917 # cause severe performance degradation here, so if $ap does
918 # cross page boundary, swap it with $bp [meaning that caller
919 # is advised to lay down $ap and $bp next to each other, so
920 # that only one can cross page boundary].
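	# Swapping is legal because the product is symmetric in $ap and $bp:
	# $ap is streamed with 256-bit loads below, while $bp is only ever read
	# one 64-bit word at a time via vpbroadcastq, so only $ap is sensitive
	# to a page split.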
931 sub \$-128,$ap # size optimization
935 and \$4095, $tmp # see if $np crosses page
939 jz .Lmul_1024_no_n_copy
941 # unaligned 256-bit load that crosses page boundary can
942 # cause severe performance degradation here, so if $np does
943 # cross page boundary, copy it to stack and make sure stack
946 vmovdqu 32*0-128($np), $ACC0
948 vmovdqu 32*1-128($np), $ACC1
949 vmovdqu 32*2-128($np), $ACC2
950 vmovdqu 32*3-128($np), $ACC3
951 vmovdqu 32*4-128($np), $ACC4
952 vmovdqu 32*5-128($np), $ACC5
953 vmovdqu 32*6-128($np), $ACC6
954 vmovdqu 32*7-128($np), $ACC7
955 vmovdqu 32*8-128($np), $ACC8
957 vmovdqu $ACC0, 32*0-128($np)
958 vpxor $ACC0, $ACC0, $ACC0
959 vmovdqu $ACC1, 32*1-128($np)
960 vpxor $ACC1, $ACC1, $ACC1
961 vmovdqu $ACC2, 32*2-128($np)
962 vpxor $ACC2, $ACC2, $ACC2
963 vmovdqu $ACC3, 32*3-128($np)
964 vpxor $ACC3, $ACC3, $ACC3
965 vmovdqu $ACC4, 32*4-128($np)
966 vpxor $ACC4, $ACC4, $ACC4
967 vmovdqu $ACC5, 32*5-128($np)
968 vpxor $ACC5, $ACC5, $ACC5
969 vmovdqu $ACC6, 32*6-128($np)
970 vpxor $ACC6, $ACC6, $ACC6
971 vmovdqu $ACC7, 32*7-128($np)
972 vpxor $ACC7, $ACC7, $ACC7
973 vmovdqu $ACC8, 32*8-128($np)
975 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall
976 .Lmul_1024_no_n_copy:
980 vpbroadcastq ($bp), $Bi
981 vmovdqu $ACC0, (%rsp) # clear top of stack
988 vmovdqu .Land_mask(%rip), $AND_MASK
994 vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*)
996 imulq -128($ap), %rax
999 imulq 8-128($ap), $r1
1004 and \$0x1fffffff, %eax
1007 imulq 16-128($ap), $r2
1011 imulq 24-128($ap), $r3
1013 vpmuludq 32*1-128($ap),$Bi,$TEMP0
1015 vpaddq $TEMP0,$ACC1,$ACC1
1016 vpmuludq 32*2-128($ap),$Bi,$TEMP1
1017 vpbroadcastq $Yi, $Yi
1018 vpaddq $TEMP1,$ACC2,$ACC2
1019 vpmuludq 32*3-128($ap),$Bi,$TEMP2
1020 vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3
1021 vpaddq $TEMP2,$ACC3,$ACC3
1022 vpmuludq 32*4-128($ap),$Bi,$TEMP0
1023 vpaddq $TEMP0,$ACC4,$ACC4
1024 vpmuludq 32*5-128($ap),$Bi,$TEMP1
1025 vpaddq $TEMP1,$ACC5,$ACC5
1026 vpmuludq 32*6-128($ap),$Bi,$TEMP2
1027 vpaddq $TEMP2,$ACC6,$ACC6
1028 vpmuludq 32*7-128($ap),$Bi,$TEMP0
1029 vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3
1030 vpaddq $TEMP0,$ACC7,$ACC7
1031 vpmuludq 32*8-128($ap),$Bi,$TEMP1
1032 vpbroadcastq 8($bp), $Bi
1033 vpaddq $TEMP1,$ACC8,$ACC8
1036 imulq -128($np),%rax
1039 imulq 8-128($np),%rax
1042 imulq 16-128($np),%rax
1045 imulq 24-128($np),%rdx
1049 vpmuludq 32*1-128($np),$Yi,$TEMP2
1051 vpaddq $TEMP2,$ACC1,$ACC1
1052 vpmuludq 32*2-128($np),$Yi,$TEMP0
1053 vpaddq $TEMP0,$ACC2,$ACC2
1054 vpmuludq 32*3-128($np),$Yi,$TEMP1
1055 vpaddq $TEMP1,$ACC3,$ACC3
1056 vpmuludq 32*4-128($np),$Yi,$TEMP2
1057 vpaddq $TEMP2,$ACC4,$ACC4
1058 vpmuludq 32*5-128($np),$Yi,$TEMP0
1059 vpaddq $TEMP0,$ACC5,$ACC5
1060 vpmuludq 32*6-128($np),$Yi,$TEMP1
1061 vpaddq $TEMP1,$ACC6,$ACC6
1062 vpmuludq 32*7-128($np),$Yi,$TEMP2
1063 vpblendd \$3, $ZERO, $ACC9, $ACC9 # correct $ACC3
1064 vpaddq $TEMP2,$ACC7,$ACC7
1065 vpmuludq 32*8-128($np),$Yi,$TEMP0
1066 vpaddq $ACC9, $ACC3, $ACC3 # correct $ACC3
1067 vpaddq $TEMP0,$ACC8,$ACC8
1070 imulq -128($ap),%rax
1072 vmovdqu -8+32*1-128($ap),$TEMP1
1074 imulq 8-128($ap),%rax
1076 vmovdqu -8+32*2-128($ap),$TEMP2
1080 and \$0x1fffffff, %eax
1082 imulq 16-128($ap),%rbx
1084 vpmuludq $Bi,$TEMP1,$TEMP1
1086 vmovdqu -8+32*3-128($ap),$TEMP0
1087 vpaddq $TEMP1,$ACC1,$ACC1
1088 vpmuludq $Bi,$TEMP2,$TEMP2
1089 vpbroadcastq $Yi, $Yi
1090 vmovdqu -8+32*4-128($ap),$TEMP1
1091 vpaddq $TEMP2,$ACC2,$ACC2
1092 vpmuludq $Bi,$TEMP0,$TEMP0
1093 vmovdqu -8+32*5-128($ap),$TEMP2
1094 vpaddq $TEMP0,$ACC3,$ACC3
1095 vpmuludq $Bi,$TEMP1,$TEMP1
1096 vmovdqu -8+32*6-128($ap),$TEMP0
1097 vpaddq $TEMP1,$ACC4,$ACC4
1098 vpmuludq $Bi,$TEMP2,$TEMP2
1099 vmovdqu -8+32*7-128($ap),$TEMP1
1100 vpaddq $TEMP2,$ACC5,$ACC5
1101 vpmuludq $Bi,$TEMP0,$TEMP0
1102 vmovdqu -8+32*8-128($ap),$TEMP2
1103 vpaddq $TEMP0,$ACC6,$ACC6
1104 vpmuludq $Bi,$TEMP1,$TEMP1
1105 vmovdqu -8+32*9-128($ap),$ACC9
1106 vpaddq $TEMP1,$ACC7,$ACC7
1107 vpmuludq $Bi,$TEMP2,$TEMP2
1108 vpaddq $TEMP2,$ACC8,$ACC8
1109 vpmuludq $Bi,$ACC9,$ACC9
1110 vpbroadcastq 16($bp), $Bi
1113 imulq -128($np),%rax
1115 vmovdqu -8+32*1-128($np),$TEMP0
1117 imulq 8-128($np),%rax
1119 vmovdqu -8+32*2-128($np),$TEMP1
1121 imulq 16-128($np),%rdx
1125 vpmuludq $Yi,$TEMP0,$TEMP0
1127 vmovdqu -8+32*3-128($np),$TEMP2
1128 vpaddq $TEMP0,$ACC1,$ACC1
1129 vpmuludq $Yi,$TEMP1,$TEMP1
1130 vmovdqu -8+32*4-128($np),$TEMP0
1131 vpaddq $TEMP1,$ACC2,$ACC2
1132 vpmuludq $Yi,$TEMP2,$TEMP2
1133 vmovdqu -8+32*5-128($np),$TEMP1
1134 vpaddq $TEMP2,$ACC3,$ACC3
1135 vpmuludq $Yi,$TEMP0,$TEMP0
1136 vmovdqu -8+32*6-128($np),$TEMP2
1137 vpaddq $TEMP0,$ACC4,$ACC4
1138 vpmuludq $Yi,$TEMP1,$TEMP1
1139 vmovdqu -8+32*7-128($np),$TEMP0
1140 vpaddq $TEMP1,$ACC5,$ACC5
1141 vpmuludq $Yi,$TEMP2,$TEMP2
1142 vmovdqu -8+32*8-128($np),$TEMP1
1143 vpaddq $TEMP2,$ACC6,$ACC6
1144 vpmuludq $Yi,$TEMP0,$TEMP0
1145 vmovdqu -8+32*9-128($np),$TEMP2
1146 vpaddq $TEMP0,$ACC7,$ACC7
1147 vpmuludq $Yi,$TEMP1,$TEMP1
1148 vpaddq $TEMP1,$ACC8,$ACC8
1149 vpmuludq $Yi,$TEMP2,$TEMP2
1150 vpaddq $TEMP2,$ACC9,$ACC9
1152 vmovdqu -16+32*1-128($ap),$TEMP0
1154 imulq -128($ap),%rax
1157 vmovdqu -16+32*2-128($ap),$TEMP1
1160 and \$0x1fffffff, %eax
1162 imulq 8-128($ap),%rbx
1164 vpmuludq $Bi,$TEMP0,$TEMP0
1166 vmovdqu -16+32*3-128($ap),$TEMP2
1167 vpaddq $TEMP0,$ACC1,$ACC1
1168 vpmuludq $Bi,$TEMP1,$TEMP1
1169 vpbroadcastq $Yi, $Yi
1170 vmovdqu -16+32*4-128($ap),$TEMP0
1171 vpaddq $TEMP1,$ACC2,$ACC2
1172 vpmuludq $Bi,$TEMP2,$TEMP2
1173 vmovdqu -16+32*5-128($ap),$TEMP1
1174 vpaddq $TEMP2,$ACC3,$ACC3
1175 vpmuludq $Bi,$TEMP0,$TEMP0
1176 vmovdqu -16+32*6-128($ap),$TEMP2
1177 vpaddq $TEMP0,$ACC4,$ACC4
1178 vpmuludq $Bi,$TEMP1,$TEMP1
1179 vmovdqu -16+32*7-128($ap),$TEMP0
1180 vpaddq $TEMP1,$ACC5,$ACC5
1181 vpmuludq $Bi,$TEMP2,$TEMP2
1182 vmovdqu -16+32*8-128($ap),$TEMP1
1183 vpaddq $TEMP2,$ACC6,$ACC6
1184 vpmuludq $Bi,$TEMP0,$TEMP0
1185 vmovdqu -16+32*9-128($ap),$TEMP2
1186 vpaddq $TEMP0,$ACC7,$ACC7
1187 vpmuludq $Bi,$TEMP1,$TEMP1
1188 vpaddq $TEMP1,$ACC8,$ACC8
1189 vpmuludq $Bi,$TEMP2,$TEMP2
1190 vpbroadcastq 24($bp), $Bi
1191 vpaddq $TEMP2,$ACC9,$ACC9
1193 vmovdqu -16+32*1-128($np),$TEMP0
1195 imulq -128($np),%rax
1197 vmovdqu -16+32*2-128($np),$TEMP1
1198 imulq 8-128($np),%rdx
1202 vpmuludq $Yi,$TEMP0,$TEMP0
1204 vmovdqu -16+32*3-128($np),$TEMP2
1205 vpaddq $TEMP0,$ACC1,$ACC1
1206 vpmuludq $Yi,$TEMP1,$TEMP1
1207 vmovdqu -16+32*4-128($np),$TEMP0
1208 vpaddq $TEMP1,$ACC2,$ACC2
1209 vpmuludq $Yi,$TEMP2,$TEMP2
1210 vmovdqu -16+32*5-128($np),$TEMP1
1211 vpaddq $TEMP2,$ACC3,$ACC3
1212 vpmuludq $Yi,$TEMP0,$TEMP0
1213 vmovdqu -16+32*6-128($np),$TEMP2
1214 vpaddq $TEMP0,$ACC4,$ACC4
1215 vpmuludq $Yi,$TEMP1,$TEMP1
1216 vmovdqu -16+32*7-128($np),$TEMP0
1217 vpaddq $TEMP1,$ACC5,$ACC5
1218 vpmuludq $Yi,$TEMP2,$TEMP2
1219 vmovdqu -16+32*8-128($np),$TEMP1
1220 vpaddq $TEMP2,$ACC6,$ACC6
1221 vpmuludq $Yi,$TEMP0,$TEMP0
1222 vmovdqu -16+32*9-128($np),$TEMP2
1223 vpaddq $TEMP0,$ACC7,$ACC7
1224 vpmuludq $Yi,$TEMP1,$TEMP1
1225 vmovdqu -24+32*1-128($ap),$TEMP0
1226 vpaddq $TEMP1,$ACC8,$ACC8
1227 vpmuludq $Yi,$TEMP2,$TEMP2
1228 vmovdqu -24+32*2-128($ap),$TEMP1
1229 vpaddq $TEMP2,$ACC9,$ACC9
1232 imulq -128($ap),%rbx
1237 and \$0x1fffffff, %eax
1239 vpmuludq $Bi,$TEMP0,$TEMP0
1241 vmovdqu -24+32*3-128($ap),$TEMP2
1242 vpaddq $TEMP0,$ACC1,$ACC1
1243 vpmuludq $Bi,$TEMP1,$TEMP1
1244 vpbroadcastq $Yi, $Yi
1245 vmovdqu -24+32*4-128($ap),$TEMP0
1246 vpaddq $TEMP1,$ACC2,$ACC2
1247 vpmuludq $Bi,$TEMP2,$TEMP2
1248 vmovdqu -24+32*5-128($ap),$TEMP1
1249 vpaddq $TEMP2,$ACC3,$ACC3
1250 vpmuludq $Bi,$TEMP0,$TEMP0
1251 vmovdqu -24+32*6-128($ap),$TEMP2
1252 vpaddq $TEMP0,$ACC4,$ACC4
1253 vpmuludq $Bi,$TEMP1,$TEMP1
1254 vmovdqu -24+32*7-128($ap),$TEMP0
1255 vpaddq $TEMP1,$ACC5,$ACC5
1256 vpmuludq $Bi,$TEMP2,$TEMP2
1257 vmovdqu -24+32*8-128($ap),$TEMP1
1258 vpaddq $TEMP2,$ACC6,$ACC6
1259 vpmuludq $Bi,$TEMP0,$TEMP0
1260 vmovdqu -24+32*9-128($ap),$TEMP2
1261 vpaddq $TEMP0,$ACC7,$ACC7
1262 vpmuludq $Bi,$TEMP1,$TEMP1
1263 vpaddq $TEMP1,$ACC8,$ACC8
1264 vpmuludq $Bi,$TEMP2,$TEMP2
1265 vpbroadcastq 32($bp), $Bi
1266 vpaddq $TEMP2,$ACC9,$ACC9
1267 add \$32, $bp # $bp++
1269 vmovdqu -24+32*1-128($np),$TEMP0
1270 imulq -128($np),%rax
1274 vmovdqu -24+32*2-128($np),$TEMP1
1275 vpmuludq $Yi,$TEMP0,$TEMP0
1277 vmovdqu -24+32*3-128($np),$TEMP2
1278 vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0
1279 vpmuludq $Yi,$TEMP1,$TEMP1
1280 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
1281 vpaddq $TEMP1,$ACC2,$ACC1
1282 vmovdqu -24+32*4-128($np),$TEMP0
1283 vpmuludq $Yi,$TEMP2,$TEMP2
1284 vmovdqu -24+32*5-128($np),$TEMP1
1285 vpaddq $TEMP2,$ACC3,$ACC2
1286 vpmuludq $Yi,$TEMP0,$TEMP0
1287 vmovdqu -24+32*6-128($np),$TEMP2
1288 vpaddq $TEMP0,$ACC4,$ACC3
1289 vpmuludq $Yi,$TEMP1,$TEMP1
1290 vmovdqu -24+32*7-128($np),$TEMP0
1291 vpaddq $TEMP1,$ACC5,$ACC4
1292 vpmuludq $Yi,$TEMP2,$TEMP2
1293 vmovdqu -24+32*8-128($np),$TEMP1
1294 vpaddq $TEMP2,$ACC6,$ACC5
1295 vpmuludq $Yi,$TEMP0,$TEMP0
1296 vmovdqu -24+32*9-128($np),$TEMP2
1298 vpaddq $TEMP0,$ACC7,$ACC6
1299 vpmuludq $Yi,$TEMP1,$TEMP1
1301 vpaddq $TEMP1,$ACC8,$ACC7
1302 vpmuludq $Yi,$TEMP2,$TEMP2
1304 vpaddq $TEMP2,$ACC9,$ACC8
1310 # (*) Original implementation was correcting ACC1-ACC3 for overflow
1311 # after 7 loop runs, or after 28 iterations, or 56 additions.
1312 # But as we underutilize resources, it's possible to correct in
1313 # each iteration with marginal performance loss. And because the
1314 # correction then runs every iteration, fewer digits need correcting,
1315 # which avoids the performance penalty altogether. Also note that we
1316 # correct only three digits out of four. This works because the
1317 # most significant digit is subjected to fewer additions.
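	# A rough bound behind this: every digit is at most 29 bits, so a
	# single product fits in 58 bits, and a 64-bit lane can absorb on the
	# order of 2^(64-58) = 64 such products before it can overflow --
	# which is why the original schedule of 56 additions was still safe.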
1323 vpermq \$0, $AND_MASK, $AND_MASK
1324 vpaddq (%rsp), $TEMP1, $ACC0
1326 vpsrlq \$29, $ACC0, $TEMP1
1327 vpand $AND_MASK, $ACC0, $ACC0
1328 vpsrlq \$29, $ACC1, $TEMP2
1329 vpand $AND_MASK, $ACC1, $ACC1
1330 vpsrlq \$29, $ACC2, $TEMP3
1331 vpermq \$0x93, $TEMP1, $TEMP1
1332 vpand $AND_MASK, $ACC2, $ACC2
1333 vpsrlq \$29, $ACC3, $TEMP4
1334 vpermq \$0x93, $TEMP2, $TEMP2
1335 vpand $AND_MASK, $ACC3, $ACC3
1337 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1338 vpermq \$0x93, $TEMP3, $TEMP3
1339 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1340 vpermq \$0x93, $TEMP4, $TEMP4
1341 vpaddq $TEMP0, $ACC0, $ACC0
1342 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1343 vpaddq $TEMP1, $ACC1, $ACC1
1344 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1345 vpaddq $TEMP2, $ACC2, $ACC2
1346 vpblendd \$3, $TEMP4, $ZERO, $TEMP4
1347 vpaddq $TEMP3, $ACC3, $ACC3
1348 vpaddq $TEMP4, $ACC4, $ACC4
1350 vpsrlq \$29, $ACC0, $TEMP1
1351 vpand $AND_MASK, $ACC0, $ACC0
1352 vpsrlq \$29, $ACC1, $TEMP2
1353 vpand $AND_MASK, $ACC1, $ACC1
1354 vpsrlq \$29, $ACC2, $TEMP3
1355 vpermq \$0x93, $TEMP1, $TEMP1
1356 vpand $AND_MASK, $ACC2, $ACC2
1357 vpsrlq \$29, $ACC3, $TEMP4
1358 vpermq \$0x93, $TEMP2, $TEMP2
1359 vpand $AND_MASK, $ACC3, $ACC3
1360 vpermq \$0x93, $TEMP3, $TEMP3
1362 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1363 vpermq \$0x93, $TEMP4, $TEMP4
1364 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1365 vpaddq $TEMP0, $ACC0, $ACC0
1366 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1367 vpaddq $TEMP1, $ACC1, $ACC1
1368 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1369 vpaddq $TEMP2, $ACC2, $ACC2
1370 vpblendd \$3, $TEMP4, $ZERO, $TEMP4
1371 vpaddq $TEMP3, $ACC3, $ACC3
1372 vpaddq $TEMP4, $ACC4, $ACC4
1374 vmovdqu $ACC0, 0-128($rp)
1375 vmovdqu $ACC1, 32-128($rp)
1376 vmovdqu $ACC2, 64-128($rp)
1377 vmovdqu $ACC3, 96-128($rp)
1382 vpsrlq \$29, $ACC4, $TEMP1
1383 vpand $AND_MASK, $ACC4, $ACC4
1384 vpsrlq \$29, $ACC5, $TEMP2
1385 vpand $AND_MASK, $ACC5, $ACC5
1386 vpsrlq \$29, $ACC6, $TEMP3
1387 vpermq \$0x93, $TEMP1, $TEMP1
1388 vpand $AND_MASK, $ACC6, $ACC6
1389 vpsrlq \$29, $ACC7, $TEMP4
1390 vpermq \$0x93, $TEMP2, $TEMP2
1391 vpand $AND_MASK, $ACC7, $ACC7
1392 vpsrlq \$29, $ACC8, $TEMP5
1393 vpermq \$0x93, $TEMP3, $TEMP3
1394 vpand $AND_MASK, $ACC8, $ACC8
1395 vpermq \$0x93, $TEMP4, $TEMP4
1397 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1398 vpermq \$0x93, $TEMP5, $TEMP5
1399 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1400 vpaddq $TEMP0, $ACC4, $ACC4
1401 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1402 vpaddq $TEMP1, $ACC5, $ACC5
1403 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1404 vpaddq $TEMP2, $ACC6, $ACC6
1405 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
1406 vpaddq $TEMP3, $ACC7, $ACC7
1407 vpaddq $TEMP4, $ACC8, $ACC8
1409 vpsrlq \$29, $ACC4, $TEMP1
1410 vpand $AND_MASK, $ACC4, $ACC4
1411 vpsrlq \$29, $ACC5, $TEMP2
1412 vpand $AND_MASK, $ACC5, $ACC5
1413 vpsrlq \$29, $ACC6, $TEMP3
1414 vpermq \$0x93, $TEMP1, $TEMP1
1415 vpand $AND_MASK, $ACC6, $ACC6
1416 vpsrlq \$29, $ACC7, $TEMP4
1417 vpermq \$0x93, $TEMP2, $TEMP2
1418 vpand $AND_MASK, $ACC7, $ACC7
1419 vpsrlq \$29, $ACC8, $TEMP5
1420 vpermq \$0x93, $TEMP3, $TEMP3
1421 vpand $AND_MASK, $ACC8, $ACC8
1422 vpermq \$0x93, $TEMP4, $TEMP4
1424 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1425 vpermq \$0x93, $TEMP5, $TEMP5
1426 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1427 vpaddq $TEMP0, $ACC4, $ACC4
1428 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1429 vpaddq $TEMP1, $ACC5, $ACC5
1430 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1431 vpaddq $TEMP2, $ACC6, $ACC6
1432 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
1433 vpaddq $TEMP3, $ACC7, $ACC7
1434 vpaddq $TEMP4, $ACC8, $ACC8
1436 vmovdqu $ACC4, 128-128($rp)
1437 vmovdqu $ACC5, 160-128($rp)
1438 vmovdqu $ACC6, 192-128($rp)
1439 vmovdqu $ACC7, 224-128($rp)
1440 vmovdqu $ACC8, 256-128($rp)
1445 $code.=<<___ if ($win64);
1446 movaps -0xd8(%rax),%xmm6
1447 movaps -0xc8(%rax),%xmm7
1448 movaps -0xb8(%rax),%xmm8
1449 movaps -0xa8(%rax),%xmm9
1450 movaps -0x98(%rax),%xmm10
1451 movaps -0x88(%rax),%xmm11
1452 movaps -0x78(%rax),%xmm12
1453 movaps -0x68(%rax),%xmm13
1454 movaps -0x58(%rax),%xmm14
1455 movaps -0x48(%rax),%xmm15
1464 lea (%rax),%rsp # restore %rsp
1465 .Lmul_1024_epilogue:
1467 .size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
1471 my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
1472 my @T = map("%r$_",(8..11));
1475 .globl rsaz_1024_red2norm_avx2
1476 .type rsaz_1024_red2norm_avx2,\@abi-omnipotent
1478 rsaz_1024_red2norm_avx2:
1479 sub \$-128,$inp # size optimization
1483 for ($j=0,$i=0; $i<16; $i++) {
1485 while (29*$j<64*($i+1)) { # load data till boundary
1486 $code.=" mov `8*$j-128`($inp), @T[0]\n";
1487 $j++; $k++; push(@T,shift(@T));
1490 while ($k>1) { # shift loaded data but last value
1491 $code.=" shl \$`29*($j-$k)`,@T[-$k]\n";
1494 $code.=<<___; # shift last value
1496 shl \$`29*($j-1)`, @T[-1]
1497 shr \$`-29*($j-1)`, @T[0]
1499 while ($l) { # accumulate all values
1500 $code.=" add @T[-$l], %rax\n";
1504 adc \$0, @T[0] # consume eventual carry
1505 mov %rax, 8*$i($out)
1512 .size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
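# For reference, the conversion above amounts to re-evaluating the number from
# its 29-bit digits and slicing it into 64-bit words (illustrative bignum
# sketch, not used by the generator; @d holds the digits):
#
#	use Math::BigInt;
#	my $v = Math::BigInt->bzero();
#	$v += Math::BigInt->new($d[$_]) << (29*$_) for (0..35);
#	my @w = map { ($v >> (64*$_)) & Math::BigInt->new("0xffffffffffffffff") } (0..15);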
1514 .globl rsaz_1024_norm2red_avx2
1515 .type rsaz_1024_norm2red_avx2,\@abi-omnipotent
1517 rsaz_1024_norm2red_avx2:
1518 sub \$-128,$out # size optimization
1520 mov \$0x1fffffff,%eax
1522 for ($j=0,$i=0; $i<16; $i++) {
1523 $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15);
1524 $code.=" xor @T[1],@T[1]\n" if ($i==15);
1526 while (29*($j+1)<64*($i+1)) {
1529 shr \$`29*$j`,@T[-$k]
1530 and %rax,@T[-$k] # &0x1fffffff
1531 mov @T[-$k],`8*$j-128`($out)
1536 shrd \$`29*$j`,@T[1],@T[0]
1538 mov @T[0],`8*$j-128`($out)
1544 mov @T[0],`8*$j-128`($out) # zero
1545 mov @T[0],`8*($j+1)-128`($out)
1546 mov @T[0],`8*($j+2)-128`($out)
1547 mov @T[0],`8*($j+3)-128`($out)
1549 .size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
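# The inverse direction above is simply a 29-bit windowing of the value
# (illustrative sketch): $d[$_] = ($v >> (29*$_)) & 0x1fffffff for (0..35);
# the shrd/and sequence extracts those windows 64 input bits at a time, and
# the four extra quadwords stored at the end are zero padding so that the
# redundant form fills whole 256-bit registers.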
1553 my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
1556 .globl rsaz_1024_scatter5_avx2
1557 .type rsaz_1024_scatter5_avx2,\@abi-omnipotent
1559 rsaz_1024_scatter5_avx2:
1561 vmovdqu .Lscatter_permd(%rip),%ymm5
1563 lea ($out,$power),$out
1565 jmp .Loop_scatter_1024
1569 vmovdqu ($inp),%ymm0
1571 vpermd %ymm0,%ymm5,%ymm0
1572 vmovdqu %xmm0,($out)
1573 lea 16*32($out),$out
1575 jnz .Loop_scatter_1024
1579 .size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
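# Note the table layout: each iteration stores one 16-byte piece of the
# current power and then skips 16*32 bytes, so the pieces of all 32 powers
# are interleaved and share cache lines; vpermd with .Lscatter_permd packs
# the four digits of a %ymm word into 32-bit slots, which is why 16 bytes
# per store suffice. The interleaving lets the gather below fetch a power
# while touching the same cache lines regardless of the index.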
1581 .globl rsaz_1024_gather5_avx2
1582 .type rsaz_1024_gather5_avx2,\@abi-omnipotent
1584 rsaz_1024_gather5_avx2:
1586 $code.=<<___ if ($win64);
1587 lea -0x88(%rsp),%rax
1589 .LSEH_begin_rsaz_1024_gather5:
1590 # I can't trust assembler to use specific encoding:-(
1591 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
1592 .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax)
1593 .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax)
1594 .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax)
1595 .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax)
1596 .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax)
1597 .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax)
1598 .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax)
1599 .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax)
1600 .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax)
1601 .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax)
1604 lea .Lgather_table(%rip),%r11
1607 shr \$2,%eax # cache line number
1608 shl \$4,$power # offset within cache line
1610 vmovdqu -32(%r11),%ymm7 # .Lgather_permd
1611 vpbroadcastb 8(%r11,%rax), %xmm8
1612 vpbroadcastb 7(%r11,%rax), %xmm9
1613 vpbroadcastb 6(%r11,%rax), %xmm10
1614 vpbroadcastb 5(%r11,%rax), %xmm11
1615 vpbroadcastb 4(%r11,%rax), %xmm12
1616 vpbroadcastb 3(%r11,%rax), %xmm13
1617 vpbroadcastb 2(%r11,%rax), %xmm14
1618 vpbroadcastb 1(%r11,%rax), %xmm15
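	# Exactly one of %xmm8..%xmm15 is all-ones here (built by broadcasting
	# single 0x00/0xff bytes of .Lgather_table), namely the one matching
	# the cache line, within each 512-byte group, that holds the requested
	# $power; the low bits of $power, scaled by 16 and folded into $inp,
	# select the 16-byte column inside that line. The vpand/vpor chain in
	# the loop below therefore reads every cache line of the group and
	# keeps only the selected column, avoiding secret-dependent addresses.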
1620 lea 64($inp,$power),$inp
1621 mov \$64,%r11 # size optimization
1623 jmp .Loop_gather_1024
1627 vpand -64($inp), %xmm8,%xmm0
1628 vpand ($inp), %xmm9,%xmm1
1629 vpand 64($inp), %xmm10,%xmm2
1630 vpand ($inp,%r11,2), %xmm11,%xmm3
1631 vpor %xmm0,%xmm1,%xmm1
1632 vpand 64($inp,%r11,2), %xmm12,%xmm4
1633 vpor %xmm2,%xmm3,%xmm3
1634 vpand ($inp,%r11,4), %xmm13,%xmm5
1635 vpor %xmm1,%xmm3,%xmm3
1636 vpand 64($inp,%r11,4), %xmm14,%xmm6
1637 vpor %xmm4,%xmm5,%xmm5
1638 vpand -128($inp,%r11,8), %xmm15,%xmm2
1639 lea ($inp,%r11,8),$inp
1640 vpor %xmm3,%xmm5,%xmm5
1641 vpor %xmm2,%xmm6,%xmm6
1642 vpor %xmm5,%xmm6,%xmm6
1643 vpermd %ymm6,%ymm7,%ymm6
1644 vmovdqu %ymm6,($out)
1647 jnz .Loop_gather_1024
1649 vpxor %ymm0,%ymm0,%ymm0
1650 vmovdqu %ymm0,($out)
1653 $code.=<<___ if ($win64);
1655 movaps 0x10(%rsp),%xmm7
1656 movaps 0x20(%rsp),%xmm8
1657 movaps 0x30(%rsp),%xmm9
1658 movaps 0x40(%rsp),%xmm10
1659 movaps 0x50(%rsp),%xmm11
1660 movaps 0x60(%rsp),%xmm12
1661 movaps 0x70(%rsp),%xmm13
1662 movaps 0x80(%rsp),%xmm14
1663 movaps 0x90(%rsp),%xmm15
1665 .LSEH_end_rsaz_1024_gather5:
1669 .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
1674 .extern OPENSSL_ia32cap_P
1675 .globl rsaz_avx2_eligible
1676 .type rsaz_avx2_eligible,\@abi-omnipotent
1679 mov OPENSSL_ia32cap_P+8(%rip),%eax
1683 .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
1687 .quad 0x1fffffff,0x1fffffff,0x1fffffff,-1
1689 .long 0,2,4,6,7,7,7,7
1691 .long 0,7,1,7,2,7,3,7
1693 .byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
1704 .extern __imp_RtlVirtualUnwind
1705 .type rsaz_se_handler,\@abi-omnipotent
1719 mov 120($context),%rax # pull context->Rax
1720 mov 248($context),%rbx # pull context->Rip
1722 mov 8($disp),%rsi # disp->ImageBase
1723 mov 56($disp),%r11 # disp->HandlerData
1725 mov 0(%r11),%r10d # HandlerData[0]
1726 lea (%rsi,%r10),%r10 # prologue label
1727 cmp %r10,%rbx # context->Rip<prologue label
1728 jb .Lcommon_seh_tail
1730 mov 152($context),%rax # pull context->Rsp
1732 mov 4(%r11),%r10d # HandlerData[1]
1733 lea (%rsi,%r10),%r10 # epilogue label
1734 cmp %r10,%rbx # context->Rip>=epilogue label
1735 jae .Lcommon_seh_tail
1737 mov 160($context),%rax # pull context->Rbp
1745 mov %r15,240($context)
1746 mov %r14,232($context)
1747 mov %r13,224($context)
1748 mov %r12,216($context)
1749 mov %rbp,160($context)
1750 mov %rbx,144($context)
1752 lea -0xd8(%rax),%rsi # %xmm save area
1753 lea 512($context),%rdi # & context.Xmm6
1754 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
1755 .long 0xa548f3fc # cld; rep movsq
1760 mov %rax,152($context) # restore context->Rsp
1761 mov %rsi,168($context) # restore context->Rsi
1762 mov %rdi,176($context) # restore context->Rdi
1764 mov 40($disp),%rdi # disp->ContextRecord
1765 mov $context,%rsi # context
1766 mov \$154,%ecx # sizeof(CONTEXT)
1767 .long 0xa548f3fc # cld; rep movsq
1770 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1771 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1772 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1773 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1774 mov 40(%rsi),%r10 # disp->ContextRecord
1775 lea 56(%rsi),%r11 # &disp->HandlerData
1776 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1777 mov %r10,32(%rsp) # arg5
1778 mov %r11,40(%rsp) # arg6
1779 mov %r12,48(%rsp) # arg7
1780 mov %rcx,56(%rsp) # arg8, (NULL)
1781 call *__imp_RtlVirtualUnwind(%rip)
1783 mov \$1,%eax # ExceptionContinueSearch
1795 .size rsaz_se_handler,.-rsaz_se_handler
1799 .rva .LSEH_begin_rsaz_1024_sqr_avx2
1800 .rva .LSEH_end_rsaz_1024_sqr_avx2
1801 .rva .LSEH_info_rsaz_1024_sqr_avx2
1803 .rva .LSEH_begin_rsaz_1024_mul_avx2
1804 .rva .LSEH_end_rsaz_1024_mul_avx2
1805 .rva .LSEH_info_rsaz_1024_mul_avx2
1807 .rva .LSEH_begin_rsaz_1024_gather5
1808 .rva .LSEH_end_rsaz_1024_gather5
1809 .rva .LSEH_info_rsaz_1024_gather5
1812 .LSEH_info_rsaz_1024_sqr_avx2:
1814 .rva rsaz_se_handler
1815 .rva .Lsqr_1024_body,.Lsqr_1024_epilogue
1816 .LSEH_info_rsaz_1024_mul_avx2:
1818 .rva rsaz_se_handler
1819 .rva .Lmul_1024_body,.Lmul_1024_epilogue
1820 .LSEH_info_rsaz_1024_gather5:
1821 .byte 0x01,0x33,0x16,0x00
1822 .byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15
1823 .byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14
1824 .byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13
1825 .byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12
1826 .byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11
1827 .byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10
1828 .byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9
1829 .byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8
1830 .byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7
1831 .byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6
1832 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
1836 foreach (split("\n",$code)) {
1837 s/\`([^\`]*)\`/eval($1)/ge;
1839 s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or
1841 s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1842 s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
1843 s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1844 s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1845 s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
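	# the substitutions above normalize shift counts mod 64 (the red2norm
	# code emits negative counts) and rewrite operands of instructions that
	# only accept %xmm registers (vmovd/vmovq, vpinsr/vpextr, the
	# vpbroadcast source) from the %ymm names used in the Perl variables
	# to their %xmm aliases.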
1850 print <<___; # assembler is too old
1853 .globl rsaz_avx2_eligible
1854 .type rsaz_avx2_eligible,\@abi-omnipotent
1858 .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
1860 .globl rsaz_1024_sqr_avx2
1861 .globl rsaz_1024_mul_avx2
1862 .globl rsaz_1024_norm2red_avx2
1863 .globl rsaz_1024_red2norm_avx2
1864 .globl rsaz_1024_scatter5_avx2
1865 .globl rsaz_1024_gather5_avx2
1866 .type rsaz_1024_sqr_avx2,\@abi-omnipotent
1869 rsaz_1024_norm2red_avx2:
1870 rsaz_1024_red2norm_avx2:
1871 rsaz_1024_scatter5_avx2:
1872 rsaz_1024_gather5_avx2:
1873 .byte 0x0f,0x0b # ud2
1875 .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2