2 # Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
3 # Copyright (c) 2014, Intel Corporation. All Rights Reserved.
5 # Licensed under the OpenSSL license (the "License"). You may not use
6 # this file except in compliance with the License. You can obtain a copy
7 # in the file LICENSE in the source distribution or at
8 # https://www.openssl.org/source/license.html
10 # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
11 # (1) Intel Corporation, Israel Development Center, Haifa, Israel
12 # (2) University of Haifa, Israel
15 # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
# ---------------------------------------------------------------------------
# Toolchain capability probing.
# Determines the output flavour (unix/nasm/masm/mingw64), locates the
# perlasm translator x86_64-xlate.pl, and sets $avx (0/1/2) according to
# the assembler's AVX/AVX2 support level.
# NOTE(review): this chunk is line-sampled — the closing braces of the
# if-blocks below and the initialisation of $addx are on elided lines.
# ---------------------------------------------------------------------------
# A bare "perl script.pl foo.s" puts the output file in $flavour.
20 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
22 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the perlasm translator next to this script or in ../../perlasm.
24 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
25 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
26 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
27 die "can't locate x86_64-xlate.pl";
# Pipe generated code through the translator into the requested output.
29 open OUT,"| \"$^X\" $xlate $flavour $output";
# GNU as: 2.19 understands AVX, 2.22 understands AVX2.
32 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
33 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
34 $avx = ($1>=2.19) + ($1>=2.22);
# nasm on Windows: 2.09 -> AVX, 2.10 -> AVX2.
38 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
39 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
40 $avx = ($1>=2.09) + ($1>=2.10);
# MS ml64: version 10 -> AVX, 11 -> AVX2.
44 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
45 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
46 $avx = ($1>=10) + ($1>=11);
# clang: encode major.minor as major + minor/100 so 3.1 sorts before 3.10.
50 if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
51 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
52 $avx = ($ver>=3.0) + ($ver>=3.01);
# ---------------------------------------------------------------------------
# Constant pools for the 4-way AVX2 code, each value broadcast across the
# four 64-bit lanes of a ymm register.  Field elements are kept in a
# redundant base-2^29 representation: 9 digits of 29 bits each (2^261 > p).
# NOTE(review): the labels that name these tables (.LAVX2_AND_MASK,
# .LAVX2_POLY, .LAVX2_POLY_x2/_x8, .LONE, .LTO_MONT_AVX2, ...) are on
# elided lines of this sampled chunk — boundaries below are inferred.
# ---------------------------------------------------------------------------
# Presumably the tail of the 29-bit digit AND mask / P-256 prime p in
# base-2^29 digit form (0x1fffffff = 2^29-1).
66 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
67 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
68 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
69 .quad 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff
70 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
71 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
72 .quad 0x00040000, 0x00040000, 0x00040000, 0x00040000
73 .quad 0x1fe00000, 0x1fe00000, 0x1fe00000, 0x1fe00000
74 .quad 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff
# Presumably 2*p in digit form (used to keep subtraction results positive).
77 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
78 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
79 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
80 .quad 0x400007FC, 0x400007FC, 0x400007FC, 0x400007FC
81 .quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
82 .quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
83 .quad 0x400FFFFE, 0x400FFFFE, 0x400FFFFE, 0x400FFFFE
84 .quad 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE
85 .quad 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC
# Presumably 8*p in digit form (added before vpsubq in avx2_sub_x4).
88 .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
89 .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
90 .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
91 .quad 0x80000FF8, 0x80000FF8, 0x80000FF8, 0x80000FF8
92 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
93 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
94 .quad 0x801FFFFC, 0x801FFFFC, 0x801FFFFC, 0x801FFFFC
95 .quad 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC
96 .quad 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8
# Presumably .LONE: the value 1 in Montgomery (*2^261) digit form.
99 .quad 0x00000020, 0x00000020, 0x00000020, 0x00000020
100 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
101 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
102 .quad 0x1fffc000, 0x1fffc000, 0x1fffc000, 0x1fffc000
103 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
104 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
105 .quad 0x1f7fffff, 0x1f7fffff, 0x1f7fffff, 0x1f7fffff
106 .quad 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff
107 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
109 # RR = 2^266 mod p in AVX2 format, to transform from the native OpenSSL
110 # Montgomery form (*2^256) to our format (*2^261)
113 .quad 0x00000400, 0x00000400, 0x00000400, 0x00000400
114 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
115 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
116 .quad 0x1ff80000, 0x1ff80000, 0x1ff80000, 0x1ff80000
117 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
118 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
119 .quad 0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff
120 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
121 .quad 0x00000003, 0x00000003, 0x00000003, 0x00000003
# Presumably the constant to convert back out of the *2^261 form.
124 .quad 0x00000001, 0x00000001, 0x00000001, 0x00000001
125 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
126 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
127 .quad 0x1ffffe00, 0x1ffffe00, 0x1ffffe00, 0x1ffffe00
128 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
129 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
130 .quad 0x1ffbffff, 0x1ffbffff, 0x1ffbffff, 0x1ffbffff
131 .quad 0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff
132 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
# Eight 32-bit ones (broadcast constant).
135 .long 1,1,1,1,1,1,1,1
# ---------------------------------------------------------------------------
# ecp_nistz256_avx2_transpose_convert(rdi = out, rsi = in):
# takes four affine points laid out point-by-point, transposes them into
# lane-parallel (AoS -> SoA) form, and re-encodes each 256-bit coordinate
# from 4x64-bit limbs into 9x29-bit redundant digits.
# NOTE(review): chunk is line-sampled — heredoc markers, the prologue that
# sets %rax (frame pointer for the xmm save area), several shift/xor lines
# and the epilogue/ret are elided here.
# ---------------------------------------------------------------------------
139 # This function receives a pointer to an array of four affine points
140 # (X, Y, <1>) and rearranges the data for AVX2 execution, while
141 # converting it to 2^29 radix redundant form
# Register allocation: all 16 ymm registers, split into inputs and temps.
143 my ($X0,$X1,$X2,$X3, $Y0,$Y1,$Y2,$Y3,
144 $T0,$T1,$T2,$T3, $T4,$T5,$T6,$T7)=map("%ymm$_",(0..15));
147 .globl ecp_nistz256_avx2_transpose_convert
148 .type ecp_nistz256_avx2_transpose_convert,\@function,2
150 ecp_nistz256_avx2_transpose_convert:
# Win64 ABI: xmm6-xmm15 are callee-saved, spill them below the frame.
# NOTE(review): stores are relative to %rax, which must be set to the old
# %rsp on an elided line — confirm against the full source.
153 $code.=<<___ if ($win64);
154 lea -8-16*10(%rsp), %rsp
155 vmovaps %xmm6, -8-16*10(%rax)
156 vmovaps %xmm7, -8-16*9(%rax)
157 vmovaps %xmm8, -8-16*8(%rax)
158 vmovaps %xmm9, -8-16*7(%rax)
159 vmovaps %xmm10, -8-16*6(%rax)
160 vmovaps %xmm11, -8-16*5(%rax)
161 vmovaps %xmm12, -8-16*4(%rax)
162 vmovaps %xmm13, -8-16*3(%rax)
163 vmovaps %xmm14, -8-16*2(%rax)
164 vmovaps %xmm15, -8-16*1(%rax)
# Load the four points' X and Y coordinates (32 bytes each).
168 vmovdqa 32*0(%rsi), $X0
169 lea 112(%rsi), %rax # size optimization
170 vmovdqa 32*1(%rsi), $Y0
171 lea .LAVX2_AND_MASK(%rip), %rdx
172 vmovdqa 32*2(%rsi), $X1
173 vmovdqa 32*3(%rsi), $Y1
174 vmovdqa 32*4-112(%rax), $X2
175 vmovdqa 32*5-112(%rax), $Y2
176 vmovdqa 32*6-112(%rax), $X3
177 vmovdqa 32*7-112(%rax), $Y3
179 # Transpose X and Y independently
# 4x4 transpose of 64-bit limbs: interleave pairs, then swap 128-bit halves.
180 vpunpcklqdq $X1, $X0, $T0 # T0 = [B2 A2 B0 A0]
181 vpunpcklqdq $X3, $X2, $T1 # T1 = [D2 C2 D0 C0]
182 vpunpckhqdq $X1, $X0, $T2 # T2 = [B3 A3 B1 A1]
183 vpunpckhqdq $X3, $X2, $T3 # T3 = [D3 C3 D1 C1]
185 vpunpcklqdq $Y1, $Y0, $T4
186 vpunpcklqdq $Y3, $Y2, $T5
187 vpunpckhqdq $Y1, $Y0, $T6
188 vpunpckhqdq $Y3, $Y2, $T7
190 vperm2i128 \$0x20, $T1, $T0, $X0 # X0 = [D0 C0 B0 A0]
191 vperm2i128 \$0x20, $T3, $T2, $X1 # X1 = [D1 C1 B1 A1]
192 vperm2i128 \$0x31, $T1, $T0, $X2 # X2 = [D2 C2 B2 A2]
193 vperm2i128 \$0x31, $T3, $T2, $X3 # X3 = [D3 C3 B3 A3]
195 vperm2i128 \$0x20, $T5, $T4, $Y0
196 vperm2i128 \$0x20, $T7, $T6, $Y1
197 vperm2i128 \$0x31, $T5, $T4, $Y2
198 vperm2i128 \$0x31, $T7, $T6, $Y3
# Split X (4x64-bit limbs) into nine 29-bit digits.
# NOTE(review): $T7 is used as the AND mask below; the load of the mask
# into $T7, and the vpsllq/vpxor lines that splice digits spanning limb
# boundaries, are on elided lines of this chunk — confirm in full source.
201 vpand (%rdx), $X0, $T0 # out[0] = in[0] & mask;
202 vpsrlq \$29, $X0, $X0
203 vpand $T7, $X0, $T1 # out[1] = (in[0] >> shift) & mask;
204 vpsrlq \$29, $X0, $X0
207 vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
208 vpsrlq \$23, $X1, $X1
209 vpand $T7, $X1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask;
210 vpsrlq \$29, $X1, $X1
211 vpsllq \$12, $X2, $T4
213 vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
214 vpsrlq \$17, $X2, $X2
215 vpand $T7, $X2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask;
216 vpsrlq \$29, $X2, $X2
217 vpsllq \$18, $X3, $T6
219 vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
220 vpsrlq \$11, $X3, $X3
221 vmovdqa $T0, 32*0(%rdi)
222 lea 112(%rdi), %rax # size optimization
223 vpand $T7, $X3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask;
224 vpsrlq \$29, $X3, $X3 # out[8] = (in[3] >> ((shift*8)%64)) & mask;
# Store the nine X digits.
226 vmovdqa $T1, 32*1(%rdi)
227 vmovdqa $T2, 32*2(%rdi)
228 vmovdqa $T3, 32*3(%rdi)
229 vmovdqa $T4, 32*4-112(%rax)
230 vmovdqa $T5, 32*5-112(%rax)
231 vmovdqa $T6, 32*6-112(%rax)
232 vmovdqa $T0, 32*7-112(%rax)
233 vmovdqa $X3, 32*8-112(%rax)
234 lea 448(%rdi), %rax # size optimization
# Same digit split for Y, stored at out + 32*9.
236 vpand $T7, $Y0, $T0 # out[0] = in[0] & mask;
237 vpsrlq \$29, $Y0, $Y0
238 vpand $T7, $Y0, $T1 # out[1] = (in[0] >> shift) & mask;
239 vpsrlq \$29, $Y0, $Y0
242 vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
243 vpsrlq \$23, $Y1, $Y1
244 vpand $T7, $Y1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask;
245 vpsrlq \$29, $Y1, $Y1
246 vpsllq \$12, $Y2, $T4
248 vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
249 vpsrlq \$17, $Y2, $Y2
250 vpand $T7, $Y2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask;
251 vpsrlq \$29, $Y2, $Y2
252 vpsllq \$18, $Y3, $T6
254 vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
255 vpsrlq \$11, $Y3, $Y3
256 vmovdqa $T0, 32*9-448(%rax)
257 vpand $T7, $Y3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask;
258 vpsrlq \$29, $Y3, $Y3 # out[8] = (in[3] >> ((shift*8)%64)) & mask;
260 vmovdqa $T1, 32*10-448(%rax)
261 vmovdqa $T2, 32*11-448(%rax)
262 vmovdqa $T3, 32*12-448(%rax)
263 vmovdqa $T4, 32*13-448(%rax)
264 vmovdqa $T5, 32*14-448(%rax)
265 vmovdqa $T6, 32*15-448(%rax)
266 vmovdqa $T0, 32*16-448(%rax)
267 vmovdqa $Y3, 32*17-448(%rax)
# Win64 epilogue: restore callee-saved xmm registers and the stack.
271 $code.=<<___ if ($win64);
272 movaps 16*0(%rsp), %xmm6
273 movaps 16*1(%rsp), %xmm7
274 movaps 16*2(%rsp), %xmm8
275 movaps 16*3(%rsp), %xmm9
276 movaps 16*4(%rsp), %xmm10
277 movaps 16*5(%rsp), %xmm11
278 movaps 16*6(%rsp), %xmm12
279 movaps 16*7(%rsp), %xmm13
280 movaps 16*8(%rsp), %xmm14
281 movaps 16*9(%rsp), %xmm15
282 lea 8+16*10(%rsp), %rsp
286 .size ecp_nistz256_avx2_transpose_convert,.-ecp_nistz256_avx2_transpose_convert
# ---------------------------------------------------------------------------
# ecp_nistz256_avx2_convert_transpose_back(rdi = out, rsi = in):
# the inverse of transpose_convert — recombines 9x29-bit digits into
# 4x64-bit limbs and transposes back from lane-parallel to per-point
# layout.  Only the first coordinate's reassembly is visible here; the
# vpsrlq "carry-in" lines between the vpsllq/vpaddq groups, the remaining
# coordinates, and the ret are on elided lines of this sampled chunk.
# ---------------------------------------------------------------------------
290 ################################################################################
291 # This function receives a pointer to an array of four AVX2 formatted points
292 # (X, Y, Z) convert the data to normal representation, and rearranges the data
# Nine digit registers plus temporaries.
294 my ($D0,$D1,$D2,$D3, $D4,$D5,$D6,$D7, $D8)=map("%ymm$_",(0..8));
295 my ($T0,$T1,$T2,$T3, $T4,$T5,$T6)=map("%ymm$_",(9..15));
299 .globl ecp_nistz256_avx2_convert_transpose_back
300 .type ecp_nistz256_avx2_convert_transpose_back,\@function,2
302 ecp_nistz256_avx2_convert_transpose_back:
# Win64 ABI: spill xmm6-xmm15 (see NOTE on %rax in transpose_convert).
305 $code.=<<___ if ($win64);
306 lea -8-16*10(%rsp), %rsp
307 vmovaps %xmm6, -8-16*10(%rax)
308 vmovaps %xmm7, -8-16*9(%rax)
309 vmovaps %xmm8, -8-16*8(%rax)
310 vmovaps %xmm9, -8-16*7(%rax)
311 vmovaps %xmm10, -8-16*6(%rax)
312 vmovaps %xmm11, -8-16*5(%rax)
313 vmovaps %xmm12, -8-16*4(%rax)
314 vmovaps %xmm13, -8-16*3(%rax)
315 vmovaps %xmm14, -8-16*2(%rax)
316 vmovaps %xmm15, -8-16*1(%rax)
# Load the nine 29-bit digits of one coordinate.
322 vmovdqa 32*0(%rsi), $D0
323 lea 160(%rsi), %rax # size optimization
324 vmovdqa 32*1(%rsi), $D1
325 vmovdqa 32*2(%rsi), $D2
326 vmovdqa 32*3(%rsi), $D3
327 vmovdqa 32*4-160(%rax), $D4
328 vmovdqa 32*5-160(%rax), $D5
329 vmovdqa 32*6-160(%rax), $D6
330 vmovdqa 32*7-160(%rax), $D7
331 vmovdqa 32*8-160(%rax), $D8
# Pack digits back into 64-bit limbs (shift each digit to its bit offset
# and accumulate; intermediate vpaddq lines are partially elided).
333 vpsllq \$29, $D1, $D1
334 vpsllq \$58, $D2, $T0
336 vpaddq $T0, $D0, $D0 # out[0] = (in[0]) ^ (in[1] << shift*1) ^ (in[2] << shift*2);
339 vpsllq \$23, $D3, $D3
340 vpsllq \$52, $D4, $T1
342 vpaddq $D3, $T1, $D1 # out[1] = (in[2] >> (64*1-shift*2)) ^ (in[3] << shift*3%64) ^ (in[4] << shift*4%64);
344 vpsrlq \$12, $D4, $D4
345 vpsllq \$17, $D5, $D5
346 vpsllq \$46, $D6, $T2
348 vpaddq $D5, $T2, $D2 # out[2] = (in[4] >> (64*2-shift*4)) ^ (in[5] << shift*5%64) ^ (in[6] << shift*6%64);
350 vpsrlq \$18, $D6, $D6
351 vpsllq \$11, $D7, $D7
352 vpsllq \$40, $D8, $T3
354 vpaddq $D7, $T3, $D3 # out[3] = (in[6] >> (64*3-shift*6)) ^ (in[7] << shift*7%64) ^ (in[8] << shift*8%64);
# Transpose back: same unpack/permute pattern as the forward routine.
356 vpunpcklqdq $D1, $D0, $T0 # T0 = [B2 A2 B0 A0]
357 vpunpcklqdq $D3, $D2, $T1 # T1 = [D2 C2 D0 C0]
358 vpunpckhqdq $D1, $D0, $T2 # T2 = [B3 A3 B1 A1]
359 vpunpckhqdq $D3, $D2, $T3 # T3 = [D3 C3 D1 C1]
361 vperm2i128 \$0x20, $T1, $T0, $D0 # X0 = [D0 C0 B0 A0]
362 vperm2i128 \$0x20, $T3, $T2, $D1 # X1 = [D1 C1 B1 A1]
363 vperm2i128 \$0x31, $T1, $T0, $D2 # X2 = [D2 C2 B2 A2]
364 vperm2i128 \$0x31, $T3, $T2, $D3 # X3 = [D3 C3 B3 A3]
# Scatter the four points' coordinate back to per-point layout
# (stride 32*3 = one coordinate per point record).
366 vmovdqa $D0, 32*0(%rdi)
367 vmovdqa $D1, 32*3(%rdi)
368 vmovdqa $D2, 32*6(%rdi)
369 vmovdqa $D3, 32*9(%rdi)
# Win64 epilogue.
379 $code.=<<___ if ($win64);
380 movaps 16*0(%rsp), %xmm6
381 movaps 16*1(%rsp), %xmm7
382 movaps 16*2(%rsp), %xmm8
383 movaps 16*3(%rsp), %xmm9
384 movaps 16*4(%rsp), %xmm10
385 movaps 16*5(%rsp), %xmm11
386 movaps 16*6(%rsp), %xmm12
387 movaps 16*7(%rsp), %xmm13
388 movaps 16*8(%rsp), %xmm14
389 movaps 16*9(%rsp), %xmm15
390 lea 8+16*10(%rsp), %rsp
394 .size ecp_nistz256_avx2_convert_transpose_back,.-ecp_nistz256_avx2_convert_transpose_back
# ---------------------------------------------------------------------------
# Register aliases for the 4-way field arithmetic, followed by a
# carry-propagation ("normalize") chain and a store sequence.
# NOTE(review): the label/heredoc that encloses this chain is on elided
# lines, so it is unclear which routine these instructions belong to.
# ---------------------------------------------------------------------------
398 my ($r_ptr,$a_ptr,$b_ptr,$itr)=("%rdi","%rsi","%rdx","%ecx");
399 my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4,$ACC5,$ACC6,$ACC7,$ACC8)=map("%ymm$_",(0..8));
400 my ($B,$Y,$T0,$AND_MASK,$OVERFLOW)=map("%ymm$_",(9..13));
# Ripple each digit's excess (bits >= $digit_size) into the next digit.
404 vpsrlq $digit_size, $ACC0, $T0
405 vpand $AND_MASK, $ACC0, $ACC0
406 vpaddq $T0, $ACC1, $ACC1
408 vpsrlq $digit_size, $ACC1, $T0
409 vpand $AND_MASK, $ACC1, $ACC1
410 vpaddq $T0, $ACC2, $ACC2
412 vpsrlq $digit_size, $ACC2, $T0
413 vpand $AND_MASK, $ACC2, $ACC2
414 vpaddq $T0, $ACC3, $ACC3
416 vpsrlq $digit_size, $ACC3, $T0
417 vpand $AND_MASK, $ACC3, $ACC3
418 vpaddq $T0, $ACC4, $ACC4
420 vpsrlq $digit_size, $ACC4, $T0
421 vpand $AND_MASK, $ACC4, $ACC4
422 vpaddq $T0, $ACC5, $ACC5
424 vpsrlq $digit_size, $ACC5, $T0
425 vpand $AND_MASK, $ACC5, $ACC5
426 vpaddq $T0, $ACC6, $ACC6
428 vpsrlq $digit_size, $ACC6, $T0
429 vpand $AND_MASK, $ACC6, $ACC6
430 vpaddq $T0, $ACC7, $ACC7
432 vpsrlq $digit_size, $ACC7, $T0
433 vpand $AND_MASK, $ACC7, $ACC7
434 vpaddq $T0, $ACC8, $ACC8
# Top digit is deliberately left unmasked (redundant representation).
435 #vpand $AND_MASK, $ACC8, $ACC8
# Store all nine digits to the result pointer.
442 vmovdqa $ACC0, 32*0(%rdi)
443 lea 160(%rdi), %rax # size optimization
444 vmovdqa $ACC1, 32*1(%rdi)
445 vmovdqa $ACC2, 32*2(%rdi)
446 vmovdqa $ACC3, 32*3(%rdi)
447 vmovdqa $ACC4, 32*4-160(%rax)
448 vmovdqa $ACC5, 32*5-160(%rax)
449 vmovdqa $ACC6, 32*6-160(%rax)
450 vmovdqa $ACC7, 32*7-160(%rax)
451 vmovdqa $ACC8, 32*8-160(%rax)
# ---------------------------------------------------------------------------
# avx2_normalize: in-register carry propagation for 4 field elements held
# in $ACC0..$ACC8 (9 digits of 29 bits).  Each step moves a digit's
# overflow bits into the next digit; the top digit stays unmasked.
# (Label line and ret are elided in this sampled chunk.)
# ---------------------------------------------------------------------------
457 .type avx2_normalize,\@abi-omnipotent
460 vpsrlq $digit_size, $ACC0, $T0
461 vpand $AND_MASK, $ACC0, $ACC0
462 vpaddq $T0, $ACC1, $ACC1
464 vpsrlq $digit_size, $ACC1, $T0
465 vpand $AND_MASK, $ACC1, $ACC1
466 vpaddq $T0, $ACC2, $ACC2
468 vpsrlq $digit_size, $ACC2, $T0
469 vpand $AND_MASK, $ACC2, $ACC2
470 vpaddq $T0, $ACC3, $ACC3
472 vpsrlq $digit_size, $ACC3, $T0
473 vpand $AND_MASK, $ACC3, $ACC3
474 vpaddq $T0, $ACC4, $ACC4
476 vpsrlq $digit_size, $ACC4, $T0
477 vpand $AND_MASK, $ACC4, $ACC4
478 vpaddq $T0, $ACC5, $ACC5
480 vpsrlq $digit_size, $ACC5, $T0
481 vpand $AND_MASK, $ACC5, $ACC5
482 vpaddq $T0, $ACC6, $ACC6
484 vpsrlq $digit_size, $ACC6, $T0
485 vpand $AND_MASK, $ACC6, $ACC6
486 vpaddq $T0, $ACC7, $ACC7
488 vpsrlq $digit_size, $ACC7, $T0
489 vpand $AND_MASK, $ACC7, $ACC7
490 vpaddq $T0, $ACC8, $ACC8
# Top digit kept unmasked on purpose.
491 #vpand $AND_MASK, $ACC8, $ACC8
494 .size avx2_normalize,.-avx2_normalize
# ---------------------------------------------------------------------------
# avx2_normalize_n_store: same carry chain as avx2_normalize, but each
# digit is stored to (%rdi) as soon as it is finalized, interleaving the
# stores with the remaining carry work to hide latency.
# ---------------------------------------------------------------------------
496 .type avx2_normalize_n_store,\@abi-omnipotent
498 avx2_normalize_n_store:
499 vpsrlq $digit_size, $ACC0, $T0
500 vpand $AND_MASK, $ACC0, $ACC0
501 vpaddq $T0, $ACC1, $ACC1
503 vpsrlq $digit_size, $ACC1, $T0
504 vpand $AND_MASK, $ACC1, $ACC1
# Digit 0 is final once its carry has been passed on — store it.
505 vmovdqa $ACC0, 32*0(%rdi)
506 lea 160(%rdi), %rax # size optimization
507 vpaddq $T0, $ACC2, $ACC2
509 vpsrlq $digit_size, $ACC2, $T0
510 vpand $AND_MASK, $ACC2, $ACC2
511 vmovdqa $ACC1, 32*1(%rdi)
512 vpaddq $T0, $ACC3, $ACC3
514 vpsrlq $digit_size, $ACC3, $T0
515 vpand $AND_MASK, $ACC3, $ACC3
516 vmovdqa $ACC2, 32*2(%rdi)
517 vpaddq $T0, $ACC4, $ACC4
519 vpsrlq $digit_size, $ACC4, $T0
520 vpand $AND_MASK, $ACC4, $ACC4
521 vmovdqa $ACC3, 32*3(%rdi)
522 vpaddq $T0, $ACC5, $ACC5
524 vpsrlq $digit_size, $ACC5, $T0
525 vpand $AND_MASK, $ACC5, $ACC5
526 vmovdqa $ACC4, 32*4-160(%rax)
527 vpaddq $T0, $ACC6, $ACC6
529 vpsrlq $digit_size, $ACC6, $T0
530 vpand $AND_MASK, $ACC6, $ACC6
531 vmovdqa $ACC5, 32*5-160(%rax)
532 vpaddq $T0, $ACC7, $ACC7
534 vpsrlq $digit_size, $ACC7, $T0
535 vpand $AND_MASK, $ACC7, $ACC7
536 vmovdqa $ACC6, 32*6-160(%rax)
537 vpaddq $T0, $ACC8, $ACC8
# Top digit stays unmasked (redundant form).
538 #vpand $AND_MASK, $ACC8, $ACC8
539 vmovdqa $ACC7, 32*7-160(%rax)
540 vmovdqa $ACC8, 32*8-160(%rax)
543 .size avx2_normalize_n_store,.-avx2_normalize_n_store
# ---------------------------------------------------------------------------
# avx2_mul_x4: 4-way Montgomery multiplication in the 9x29-bit form.
# Each loop iteration multiplies one digit of B into the accumulator and
# folds in one Montgomery reduction step against .LAVX2_POLY.
# NOTE(review): the loop counter setup, the .Lavx2_mul_x4_loop label, the
# decrement before jnz, and several reduction lines (orig 601-605) are on
# elided lines of this sampled chunk.
# ---------------------------------------------------------------------------
545 ################################################################################
546 # void avx2_mul_x4(void* RESULTx4, void *Ax4, void *Bx4);
547 .type avx2_mul_x4,\@abi-omnipotent
550 lea .LAVX2_POLY(%rip), %rax
# Clear the accumulator digits.
552 vpxor $ACC0, $ACC0, $ACC0
553 vpxor $ACC1, $ACC1, $ACC1
554 vpxor $ACC2, $ACC2, $ACC2
555 vpxor $ACC3, $ACC3, $ACC3
556 vpxor $ACC4, $ACC4, $ACC4
557 vpxor $ACC5, $ACC5, $ACC5
558 vpxor $ACC6, $ACC6, $ACC6
559 vpxor $ACC7, $ACC7, $ACC7
# Cache the two highest poly digits in registers.
561 vmovdqa 32*7(%rax), %ymm14
562 vmovdqa 32*8(%rax), %ymm15
565 lea -512($a_ptr), $a_ptr # strategic bias to control u-op density
566 jmp .Lavx2_mul_x4_loop
# One iteration: broadcast-multiply the next B digit across all A digits.
570 vmovdqa 32*0($b_ptr), $B
571 lea 32*1($b_ptr), $b_ptr
573 vpmuludq 32*0+512($a_ptr), $B, $T0
574 vpmuludq 32*1+512($a_ptr), $B, $OVERFLOW # borrow $OVERFLOW
575 vpaddq $T0, $ACC0, $ACC0
576 vpmuludq 32*2+512($a_ptr), $B, $T0
577 vpaddq $OVERFLOW, $ACC1, $ACC1
# $Y = low digit mod 2^29: the Montgomery quotient digit for this round.
578 vpand $AND_MASK, $ACC0, $Y
579 vpmuludq 32*3+512($a_ptr), $B, $OVERFLOW
580 vpaddq $T0, $ACC2, $ACC2
581 vpmuludq 32*4+512($a_ptr), $B, $T0
582 vpaddq $OVERFLOW, $ACC3, $ACC3
583 vpmuludq 32*5+512($a_ptr), $B, $OVERFLOW
584 vpaddq $T0, $ACC4, $ACC4
585 vpmuludq 32*6+512($a_ptr), $B, $T0
586 vpaddq $OVERFLOW, $ACC5, $ACC5
587 vpmuludq 32*7+512($a_ptr), $B, $OVERFLOW
588 vpaddq $T0, $ACC6, $ACC6
590 # Skip some multiplications, optimizing for the constant poly
# Reduction: add Y*poly and shift the accumulator down one digit.
591 vpmuludq $AND_MASK, $Y, $T0
592 vpaddq $OVERFLOW, $ACC7, $ACC7
593 vpmuludq 32*8+512($a_ptr), $B, $ACC8
594 vpaddq $T0, $ACC0, $OVERFLOW
595 vpaddq $T0, $ACC1, $ACC0
596 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
597 vpaddq $T0, $ACC2, $ACC1
598 vpmuludq 32*3(%rax), $Y, $T0
599 vpaddq $OVERFLOW, $ACC0, $ACC0
600 vpaddq $T0, $ACC3, $ACC2
# poly digit 6 is 2^18, so a shift replaces the multiply.
603 vpsllq \$18, $Y, $OVERFLOW
606 vpmuludq %ymm14, $Y, $T0
607 vpaddq $OVERFLOW, $ACC6, $ACC5
608 vpmuludq %ymm15, $Y, $OVERFLOW
609 vpaddq $T0, $ACC7, $ACC6
610 vpaddq $OVERFLOW, $ACC8, $ACC7
613 jnz .Lavx2_mul_x4_loop
615 vpxor $ACC8, $ACC8, $ACC8
618 .size avx2_mul_x4,.-avx2_mul_x4
# ---------------------------------------------------------------------------
# avx2_mul_by1_x4: Montgomery multiplication by the constant 1 (.LONE),
# i.e. conversion out of Montgomery form.  Multiplications by the sparse
# digits of .LONE are replaced with shifts/adds/subs where possible.
# NOTE(review): loop setup, label prologue lines and the loop-counter
# decrement are partially elided in this sampled chunk.
# ---------------------------------------------------------------------------
620 # Function optimized for the constant 1
621 ################################################################################
622 # void avx2_mul_by1_x4(void* RESULTx4, void *Ax4);
623 .type avx2_mul_by1_x4,\@abi-omnipotent
626 lea .LAVX2_POLY(%rip), %rax
628 vpxor $ACC0, $ACC0, $ACC0
629 vpxor $ACC1, $ACC1, $ACC1
630 vpxor $ACC2, $ACC2, $ACC2
631 vpxor $ACC3, $ACC3, $ACC3
632 vpxor $ACC4, $ACC4, $ACC4
633 vpxor $ACC5, $ACC5, $ACC5
634 vpxor $ACC6, $ACC6, $ACC6
635 vpxor $ACC7, $ACC7, $ACC7
636 vpxor $ACC8, $ACC8, $ACC8
# Cache two non-trivial digits of .LONE.
638 vmovdqa 32*3+.LONE(%rip), %ymm14
639 vmovdqa 32*7+.LONE(%rip), %ymm15
642 jmp .Lavx2_mul_by1_x4_loop
645 .Lavx2_mul_by1_x4_loop:
646 vmovdqa 32*0($a_ptr), $B
# Hand-encoded lea to fix instruction length (alignment padding control).
647 .byte 0x48,0x8d,0xb6,0x20,0,0,0 # lea 32*1($a_ptr), $a_ptr
# .LONE digit 0 is 2^5 — multiply becomes a shift.
649 vpsllq \$5, $B, $OVERFLOW
650 vpmuludq %ymm14, $B, $T0
651 vpaddq $OVERFLOW, $ACC0, $ACC0
652 vpaddq $T0, $ACC3, $ACC3
# Digits 4..6 of .LONE are near the mask value — use mask-multiply
# and a corrective subtract.
654 vpmuludq $AND_MASK, $B, $T0
655 vpand $AND_MASK, $ACC0, $Y
656 vpaddq $T0, $ACC4, $ACC4
657 vpaddq $T0, $ACC5, $ACC5
658 vpaddq $T0, $ACC6, $ACC6
662 vpmuludq %ymm15, $B, $OVERFLOW
663 vpsubq $T0, $ACC6, $ACC6
# Montgomery reduction step (same pattern as avx2_mul_x4).
665 vpmuludq $AND_MASK, $Y, $T0
666 vpaddq $OVERFLOW, $ACC7, $ACC7
667 vpaddq $T0, $ACC0, $OVERFLOW
668 vpaddq $T0, $ACC1, $ACC0
670 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
671 vpaddq $T0, $ACC2, $ACC1
672 vpmuludq 32*3(%rax), $Y, $T0
673 vpaddq $OVERFLOW, $ACC0, $ACC0
674 vpaddq $T0, $ACC3, $ACC2
676 vpsllq \$18, $Y, $OVERFLOW
678 vpmuludq 32*7(%rax), $Y, $T0
679 vpaddq $OVERFLOW, $ACC6, $ACC5
680 vpaddq $T0, $ACC7, $ACC6
681 vpmuludq 32*8(%rax), $Y, $ACC7
684 jnz .Lavx2_mul_by1_x4_loop
687 .size avx2_mul_by1_x4,.-avx2_mul_by1_x4
# ---------------------------------------------------------------------------
# avx2_sqr_x4: 4-way Montgomery squaring.  Doubled digits 2*a[1..8] are
# precomputed once to a scratch area at (%rcx), then the nine rounds are
# fully unrolled; each round multiplies a[i] into the doubled tail,
# squares the diagonal term, and performs one reduction step.
# NOTE(review): this sampled chunk elides several lines inside each round
# (notably the vpmuludq against poly digits between the visible groups),
# plus the ret — the per-round pattern below is therefore incomplete.
# ---------------------------------------------------------------------------
689 ################################################################################
690 # void avx2_sqr_x4(void* RESULTx4, void *Ax4, void *Bx4);
691 .type avx2_sqr_x4,\@abi-omnipotent
694 lea .LAVX2_POLY(%rip), %rax
696 vmovdqa 32*7(%rax), %ymm14
697 vmovdqa 32*8(%rax), %ymm15
# Load a[0..8]; a[0] goes to $B (the running multiplier).
699 vmovdqa 32*0($a_ptr), $B
700 vmovdqa 32*1($a_ptr), $ACC1
701 vmovdqa 32*2($a_ptr), $ACC2
702 vmovdqa 32*3($a_ptr), $ACC3
703 vmovdqa 32*4($a_ptr), $ACC4
704 vmovdqa 32*5($a_ptr), $ACC5
705 vmovdqa 32*6($a_ptr), $ACC6
706 vmovdqa 32*7($a_ptr), $ACC7
# Double a[1..8] and stash to scratch for the off-diagonal products.
707 vpaddq $ACC1, $ACC1, $ACC1 # 2*$ACC0..7
708 vmovdqa 32*8($a_ptr), $ACC8
709 vpaddq $ACC2, $ACC2, $ACC2
710 vmovdqa $ACC1, 32*0(%rcx)
711 vpaddq $ACC3, $ACC3, $ACC3
712 vmovdqa $ACC2, 32*1(%rcx)
713 vpaddq $ACC4, $ACC4, $ACC4
714 vmovdqa $ACC3, 32*2(%rcx)
715 vpaddq $ACC5, $ACC5, $ACC5
716 vmovdqa $ACC4, 32*3(%rcx)
717 vpaddq $ACC6, $ACC6, $ACC6
718 vmovdqa $ACC5, 32*4(%rcx)
719 vpaddq $ACC7, $ACC7, $ACC7
720 vmovdqa $ACC6, 32*5(%rcx)
721 vpaddq $ACC8, $ACC8, $ACC8
722 vmovdqa $ACC7, 32*6(%rcx)
723 vmovdqa $ACC8, 32*7(%rcx)
# Round 0: a[0] * (a[0], 2a[1..8]).
726 vpmuludq $B, $B, $ACC0
727 vpmuludq $B, $ACC1, $ACC1
728 vpand $AND_MASK, $ACC0, $Y
729 vpmuludq $B, $ACC2, $ACC2
730 vpmuludq $B, $ACC3, $ACC3
731 vpmuludq $B, $ACC4, $ACC4
732 vpmuludq $B, $ACC5, $ACC5
733 vpmuludq $B, $ACC6, $ACC6
734 vpmuludq $AND_MASK, $Y, $T0
735 vpmuludq $B, $ACC7, $ACC7
736 vpmuludq $B, $ACC8, $ACC8
737 vmovdqa 32*1($a_ptr), $B
# Reduction step: fold $Y*poly, shift accumulator down one digit.
739 vpaddq $T0, $ACC0, $OVERFLOW
740 vpaddq $T0, $ACC1, $ACC0
741 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
742 vpaddq $T0, $ACC2, $ACC1
743 vpmuludq 32*3(%rax), $Y, $T0
744 vpaddq $OVERFLOW, $ACC0, $ACC0
745 vpaddq $T0, $ACC3, $ACC2
749 vpmuludq %ymm14, $Y, $OVERFLOW
750 vpaddq $T0, $ACC6, $ACC5
751 vpmuludq %ymm15, $Y, $T0
752 vpaddq $OVERFLOW, $ACC7, $ACC6
753 vpaddq $T0, $ACC8, $ACC7
# Round 1: diagonal a[1]^2 plus a[1]*2a[2..8] from scratch.
756 vpmuludq $B, $B, $OVERFLOW
757 vpand $AND_MASK, $ACC0, $Y
758 vpmuludq 32*1(%rcx), $B, $T0
759 vpaddq $OVERFLOW, $ACC1, $ACC1
760 vpmuludq 32*2(%rcx), $B, $OVERFLOW
761 vpaddq $T0, $ACC2, $ACC2
762 vpmuludq 32*3(%rcx), $B, $T0
763 vpaddq $OVERFLOW, $ACC3, $ACC3
764 vpmuludq 32*4(%rcx), $B, $OVERFLOW
765 vpaddq $T0, $ACC4, $ACC4
766 vpmuludq 32*5(%rcx), $B, $T0
767 vpaddq $OVERFLOW, $ACC5, $ACC5
768 vpmuludq 32*6(%rcx), $B, $OVERFLOW
769 vpaddq $T0, $ACC6, $ACC6
771 vpmuludq $AND_MASK, $Y, $T0
772 vpaddq $OVERFLOW, $ACC7, $ACC7
773 vpmuludq 32*7(%rcx), $B, $ACC8
774 vmovdqa 32*2($a_ptr), $B
775 vpaddq $T0, $ACC0, $OVERFLOW
776 vpaddq $T0, $ACC1, $ACC0
777 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
778 vpaddq $T0, $ACC2, $ACC1
779 vpmuludq 32*3(%rax), $Y, $T0
780 vpaddq $OVERFLOW, $ACC0, $ACC0
781 vpaddq $T0, $ACC3, $ACC2
785 vpmuludq %ymm14, $Y, $OVERFLOW
786 vpaddq $T0, $ACC6, $ACC5
787 vpmuludq %ymm15, $Y, $T0
788 vpaddq $OVERFLOW, $ACC7, $ACC6
789 vpaddq $T0, $ACC8, $ACC7
# Round 2 (diagonal product line elided in this chunk).
793 vpand $AND_MASK, $ACC0, $Y
794 vpmuludq 32*2(%rcx), $B, $OVERFLOW
795 vpaddq $T0, $ACC2, $ACC2
796 vpmuludq 32*3(%rcx), $B, $T0
797 vpaddq $OVERFLOW, $ACC3, $ACC3
798 vpmuludq 32*4(%rcx), $B, $OVERFLOW
799 vpaddq $T0, $ACC4, $ACC4
800 vpmuludq 32*5(%rcx), $B, $T0
801 vpaddq $OVERFLOW, $ACC5, $ACC5
802 vpmuludq 32*6(%rcx), $B, $OVERFLOW
803 vpaddq $T0, $ACC6, $ACC6
805 vpmuludq $AND_MASK, $Y, $T0
806 vpaddq $OVERFLOW, $ACC7, $ACC7
807 vpmuludq 32*7(%rcx), $B, $ACC8
808 vmovdqa 32*3($a_ptr), $B
809 vpaddq $T0, $ACC0, $OVERFLOW
810 vpaddq $T0, $ACC1, $ACC0
811 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
812 vpaddq $T0, $ACC2, $ACC1
813 vpmuludq 32*3(%rax), $Y, $T0
814 vpaddq $OVERFLOW, $ACC0, $ACC0
815 vpaddq $T0, $ACC3, $ACC2
819 vpmuludq %ymm14, $Y, $OVERFLOW
820 vpaddq $T0, $ACC6, $ACC5
821 vpmuludq %ymm15, $Y, $T0
822 vpand $AND_MASK, $ACC0, $Y
823 vpaddq $OVERFLOW, $ACC7, $ACC6
824 vpaddq $T0, $ACC8, $ACC7
# Round 3.
827 vpmuludq $B, $B, $OVERFLOW
828 vpmuludq 32*3(%rcx), $B, $T0
829 vpaddq $OVERFLOW, $ACC3, $ACC3
830 vpmuludq 32*4(%rcx), $B, $OVERFLOW
831 vpaddq $T0, $ACC4, $ACC4
832 vpmuludq 32*5(%rcx), $B, $T0
833 vpaddq $OVERFLOW, $ACC5, $ACC5
834 vpmuludq 32*6(%rcx), $B, $OVERFLOW
835 vpaddq $T0, $ACC6, $ACC6
837 vpmuludq $AND_MASK, $Y, $T0
838 vpaddq $OVERFLOW, $ACC7, $ACC7
839 vpmuludq 32*7(%rcx), $B, $ACC8
840 vmovdqa 32*4($a_ptr), $B
841 vpaddq $T0, $ACC0, $OVERFLOW
842 vpaddq $T0, $ACC1, $ACC0
843 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
844 vpaddq $T0, $ACC2, $ACC1
845 vpmuludq 32*3(%rax), $Y, $T0
846 vpaddq $OVERFLOW, $ACC0, $ACC0
847 vpaddq $T0, $ACC3, $ACC2
851 vpmuludq %ymm14, $Y, $OVERFLOW
852 vpaddq $T0, $ACC6, $ACC5
853 vpmuludq %ymm15, $Y, $T0
854 vpand $AND_MASK, $ACC0, $Y
855 vpaddq $OVERFLOW, $ACC7, $ACC6
856 vpaddq $T0, $ACC8, $ACC7
# Round 4 (diagonal product line elided in this chunk).
860 vpmuludq 32*4(%rcx), $B, $OVERFLOW
861 vpaddq $T0, $ACC4, $ACC4
862 vpmuludq 32*5(%rcx), $B, $T0
863 vpaddq $OVERFLOW, $ACC5, $ACC5
864 vpmuludq 32*6(%rcx), $B, $OVERFLOW
865 vpaddq $T0, $ACC6, $ACC6
867 vpmuludq $AND_MASK, $Y, $T0
868 vpaddq $OVERFLOW, $ACC7, $ACC7
869 vpmuludq 32*7(%rcx), $B, $ACC8
870 vmovdqa 32*5($a_ptr), $B
871 vpaddq $T0, $ACC0, $OVERFLOW
872 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
873 vpaddq $T0, $ACC1, $ACC0
874 vpaddq $T0, $ACC2, $ACC1
875 vpmuludq 32*3+.LAVX2_POLY(%rip), $Y, $T0
876 vpaddq $OVERFLOW, $ACC0, $ACC0
877 vpaddq $T0, $ACC3, $ACC2
881 vpmuludq %ymm14, $Y, $OVERFLOW
882 vpaddq $T0, $ACC6, $ACC5
883 vpmuludq %ymm15, $Y, $T0
884 vpand $AND_MASK, $ACC0, $Y
885 vpaddq $OVERFLOW, $ACC7, $ACC6
886 vpaddq $T0, $ACC8, $ACC7
# Round 5.
889 vpmuludq $B, $B, $OVERFLOW
890 vpmuludq 32*5(%rcx), $B, $T0
891 vpaddq $OVERFLOW, $ACC5, $ACC5
892 vpmuludq 32*6(%rcx), $B, $OVERFLOW
893 vpaddq $T0, $ACC6, $ACC6
895 vpmuludq $AND_MASK, $Y, $T0
896 vpaddq $OVERFLOW, $ACC7, $ACC7
897 vpmuludq 32*7(%rcx), $B, $ACC8
898 vmovdqa 32*6($a_ptr), $B
899 vpaddq $T0, $ACC0, $OVERFLOW
900 vpaddq $T0, $ACC1, $ACC0
901 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
902 vpaddq $T0, $ACC2, $ACC1
903 vpmuludq 32*3(%rax), $Y, $T0
904 vpaddq $OVERFLOW, $ACC0, $ACC0
905 vpaddq $T0, $ACC3, $ACC2
909 vpmuludq %ymm14, $Y, $OVERFLOW
910 vpaddq $T0, $ACC6, $ACC5
911 vpmuludq %ymm15, $Y, $T0
912 vpand $AND_MASK, $ACC0, $Y
913 vpaddq $OVERFLOW, $ACC7, $ACC6
914 vpaddq $T0, $ACC8, $ACC7
# Round 6 (diagonal product line elided in this chunk).
918 vpmuludq 32*6(%rcx), $B, $OVERFLOW
919 vpaddq $T0, $ACC6, $ACC6
921 vpmuludq $AND_MASK, $Y, $T0
922 vpaddq $OVERFLOW, $ACC7, $ACC7
923 vpmuludq 32*7(%rcx), $B, $ACC8
924 vmovdqa 32*7($a_ptr), $B
925 vpaddq $T0, $ACC0, $OVERFLOW
926 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
927 vpaddq $T0, $ACC1, $ACC0
928 vpaddq $T0, $ACC2, $ACC1
929 vpmuludq 32*3(%rax), $Y, $T0
930 vpaddq $OVERFLOW, $ACC0, $ACC0
931 vpaddq $T0, $ACC3, $ACC2
935 vpmuludq %ymm14, $Y, $OVERFLOW
936 vpaddq $T0, $ACC6, $ACC5
937 vpmuludq %ymm15, $Y, $T0
938 vpand $AND_MASK, $ACC0, $Y
939 vpaddq $OVERFLOW, $ACC7, $ACC6
940 vpaddq $T0, $ACC8, $ACC7
# Round 7.
943 vpmuludq $B, $B, $OVERFLOW
945 vpmuludq $AND_MASK, $Y, $T0
946 vpaddq $OVERFLOW, $ACC7, $ACC7
947 vpmuludq 32*7(%rcx), $B, $ACC8
948 vmovdqa 32*8($a_ptr), $B
949 vpaddq $T0, $ACC0, $OVERFLOW
950 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
951 vpaddq $T0, $ACC1, $ACC0
952 vpaddq $T0, $ACC2, $ACC1
953 vpmuludq 32*3(%rax), $Y, $T0
954 vpaddq $OVERFLOW, $ACC0, $ACC0
955 vpaddq $T0, $ACC3, $ACC2
959 vpmuludq %ymm14, $Y, $OVERFLOW
960 vpaddq $T0, $ACC6, $ACC5
961 vpmuludq %ymm15, $Y, $T0
962 vpand $AND_MASK, $ACC0, $Y
963 vpaddq $OVERFLOW, $ACC7, $ACC6
964 vpaddq $T0, $ACC8, $ACC7
# Round 8: final diagonal a[8]^2 and last reduction step.
967 vpmuludq $B, $B, $ACC8
969 vpmuludq $AND_MASK, $Y, $T0
970 vpaddq $T0, $ACC0, $OVERFLOW
971 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
972 vpaddq $T0, $ACC1, $ACC0
973 vpaddq $T0, $ACC2, $ACC1
974 vpmuludq 32*3(%rax), $Y, $T0
975 vpaddq $OVERFLOW, $ACC0, $ACC0
976 vpaddq $T0, $ACC3, $ACC2
980 vpmuludq %ymm14, $Y, $OVERFLOW
981 vpaddq $T0, $ACC6, $ACC5
982 vpmuludq %ymm15, $Y, $T0
983 vpaddq $OVERFLOW, $ACC7, $ACC6
984 vpaddq $T0, $ACC8, $ACC7
986 vpxor $ACC8, $ACC8, $ACC8
989 .size avx2_sqr_x4,.-avx2_sqr_x4
# ---------------------------------------------------------------------------
# avx2_sub_x4: 4-way modular subtraction.  Computes A + 8*p - B per digit;
# adding 8*p first keeps every digit non-negative without a conditional,
# leaving the result in redundant (non-normalized) form.
# (Label line and ret are elided in this sampled chunk.)
# ---------------------------------------------------------------------------
991 ################################################################################
992 # void avx2_sub_x4(void* RESULTx4, void *Ax4, void *Bx4);
993 .type avx2_sub_x4,\@abi-omnipotent
996 vmovdqa 32*0($a_ptr), $ACC0
997 lea 160($a_ptr), $a_ptr
998 lea .LAVX2_POLY_x8+128(%rip), %rax
999 lea 128($b_ptr), $b_ptr
1000 vmovdqa 32*1-160($a_ptr), $ACC1
1001 vmovdqa 32*2-160($a_ptr), $ACC2
1002 vmovdqa 32*3-160($a_ptr), $ACC3
1003 vmovdqa 32*4-160($a_ptr), $ACC4
1004 vmovdqa 32*5-160($a_ptr), $ACC5
1005 vmovdqa 32*6-160($a_ptr), $ACC6
1006 vmovdqa 32*7-160($a_ptr), $ACC7
1007 vmovdqa 32*8-160($a_ptr), $ACC8
# Add 8*p digit-wise ...
1009 vpaddq 32*0-128(%rax), $ACC0, $ACC0
1010 vpaddq 32*1-128(%rax), $ACC1, $ACC1
1011 vpaddq 32*2-128(%rax), $ACC2, $ACC2
1012 vpaddq 32*3-128(%rax), $ACC3, $ACC3
1013 vpaddq 32*4-128(%rax), $ACC4, $ACC4
1014 vpaddq 32*5-128(%rax), $ACC5, $ACC5
1015 vpaddq 32*6-128(%rax), $ACC6, $ACC6
1016 vpaddq 32*7-128(%rax), $ACC7, $ACC7
1017 vpaddq 32*8-128(%rax), $ACC8, $ACC8
# ... then subtract B digit-wise.
1019 vpsubq 32*0-128($b_ptr), $ACC0, $ACC0
1020 vpsubq 32*1-128($b_ptr), $ACC1, $ACC1
1021 vpsubq 32*2-128($b_ptr), $ACC2, $ACC2
1022 vpsubq 32*3-128($b_ptr), $ACC3, $ACC3
1023 vpsubq 32*4-128($b_ptr), $ACC4, $ACC4
1024 vpsubq 32*5-128($b_ptr), $ACC5, $ACC5
1025 vpsubq 32*6-128($b_ptr), $ACC6, $ACC6
1026 vpsubq 32*7-128($b_ptr), $ACC7, $ACC7
1027 vpsubq 32*8-128($b_ptr), $ACC8, $ACC8
1030 .size avx2_sub_x4,.-avx2_sub_x4
# ---------------------------------------------------------------------------
# avx2_select_n_store: constant-time select between the computed result
# ($ACC0..8), the value at (%rsi) and the value at (%rdx), driven by two
# all-ones/all-zeros masks kept on the stack at 8+32*9*8(%rsp) (infinity
# flags for the two inputs).  Branch-free: mask/andn/xor blending only.
# NOTE(review): the store of the blended result and ret are on elided
# lines of this sampled chunk.
# ---------------------------------------------------------------------------
1032 .type avx2_select_n_store,\@abi-omnipotent
1034 avx2_select_n_store:
# $Y = maskA | maskB: non-zero lanes mean "don't keep the computed value".
1035 vmovdqa `8+32*9*8`(%rsp), $Y
1036 vpor `8+32*9*8+32`(%rsp), $Y, $Y
# Keep computed digits only where both masks are clear.
1038 vpandn $ACC0, $Y, $ACC0
1039 vpandn $ACC1, $Y, $ACC1
1040 vpandn $ACC2, $Y, $ACC2
1041 vpandn $ACC3, $Y, $ACC3
1042 vpandn $ACC4, $Y, $ACC4
1043 vpandn $ACC5, $Y, $ACC5
1044 vpandn $ACC6, $Y, $ACC6
# $B = maskB & ~maskA: lanes where only input A was infinity.
1045 vmovdqa `8+32*9*8+32`(%rsp), $B
1046 vpandn $ACC7, $Y, $ACC7
1047 vpandn `8+32*9*8`(%rsp), $B, $B
1048 vpandn $ACC8, $Y, $ACC8
# Blend in the (%rsi) operand under $B.
1050 vpand 32*0(%rsi), $B, $T0
1052 vpand 32*1(%rsi), $B, $Y
1053 vpxor $T0, $ACC0, $ACC0
1054 vpand 32*2(%rsi), $B, $T0
1055 vpxor $Y, $ACC1, $ACC1
1056 vpand 32*3(%rsi), $B, $Y
1057 vpxor $T0, $ACC2, $ACC2
1058 vpand 32*4-160(%rax), $B, $T0
1059 vpxor $Y, $ACC3, $ACC3
1060 vpand 32*5-160(%rax), $B, $Y
1061 vpxor $T0, $ACC4, $ACC4
1062 vpand 32*6-160(%rax), $B, $T0
1063 vpxor $Y, $ACC5, $ACC5
1064 vpand 32*7-160(%rax), $B, $Y
1065 vpxor $T0, $ACC6, $ACC6
1066 vpand 32*8-160(%rax), $B, $T0
# Reload maskB and blend in the (%rdx) operand under it.
1067 vmovdqa `8+32*9*8+32`(%rsp), $B
1068 vpxor $Y, $ACC7, $ACC7
1070 vpand 32*0(%rdx), $B, $Y
1072 vpxor $T0, $ACC8, $ACC8
1073 vpand 32*1(%rdx), $B, $T0
1074 vpxor $Y, $ACC0, $ACC0
1075 vpand 32*2(%rdx), $B, $Y
1076 vpxor $T0, $ACC1, $ACC1
1077 vpand 32*3(%rdx), $B, $T0
1078 vpxor $Y, $ACC2, $ACC2
1079 vpand 32*4-160(%rax), $B, $Y
1080 vpxor $T0, $ACC3, $ACC3
1081 vpand 32*5-160(%rax), $B, $T0
1082 vpxor $Y, $ACC4, $ACC4
1083 vpand 32*6-160(%rax), $B, $Y
1084 vpxor $T0, $ACC5, $ACC5
1085 vpand 32*7-160(%rax), $B, $T0
1086 vpxor $Y, $ACC6, $ACC6
1087 vpand 32*8-160(%rax), $B, $Y
1088 vpxor $T0, $ACC7, $ACC7
1089 vpxor $Y, $ACC8, $ACC8
1093 .size avx2_select_n_store,.-avx2_select_n_store
# ---------------------------------------------------------------------------
# avx2_mul_by2_x4: digit-wise doubling of four field elements.  The whole
# routine is compiled out via `if (0)` — the doubling is inlined at its
# call sites instead.  Result is left in redundant form (no reduction).
# ---------------------------------------------------------------------------
1095 $code.=<<___ if (0); # inlined
1096 ################################################################################
1097 # void avx2_mul_by2_x4(void* RESULTx4, void *Ax4);
1098 .type avx2_mul_by2_x4,\@abi-omnipotent
1101 vmovdqa 32*0($a_ptr), $ACC0
1102 lea 160($a_ptr), %rax
1103 vmovdqa 32*1($a_ptr), $ACC1
1104 vmovdqa 32*2($a_ptr), $ACC2
1105 vmovdqa 32*3($a_ptr), $ACC3
1106 vmovdqa 32*4-160(%rax), $ACC4
1107 vmovdqa 32*5-160(%rax), $ACC5
1108 vmovdqa 32*6-160(%rax), $ACC6
1109 vmovdqa 32*7-160(%rax), $ACC7
1110 vmovdqa 32*8-160(%rax), $ACC8
# Double each digit; no carry propagation here.
1112 vpaddq $ACC0, $ACC0, $ACC0
1113 vpaddq $ACC1, $ACC1, $ACC1
1114 vpaddq $ACC2, $ACC2, $ACC2
1115 vpaddq $ACC3, $ACC3, $ACC3
1116 vpaddq $ACC4, $ACC4, $ACC4
1117 vpaddq $ACC5, $ACC5, $ACC5
1118 vpaddq $ACC6, $ACC6, $ACC6
1119 vpaddq $ACC7, $ACC7, $ACC7
1120 vpaddq $ACC8, $ACC8, $ACC8
1123 .size avx2_mul_by2_x4,.-avx2_mul_by2_x4
# Pointer register aliases for the point-add routines: incoming ABI
# argument registers, and the callee registers they are copied into
# (rdi/rsi/rdx get clobbered by the helper calls above).
1125 my ($r_ptr_in,$a_ptr_in,$b_ptr_in)=("%rdi","%rsi","%rdx");
1126 my ($r_ptr,$a_ptr,$b_ptr)=("%r8","%r9","%r10");
1129 ################################################################################
1130 # void ecp_nistz256_avx2_point_add_affine_x4(void* RESULTx4, void *Ax4, void *Bx4);
# Four-way vectorized P-256 point addition: Jacobian point(s) A plus affine
# point(s) B, four independent additions in parallel.  Coordinates are stored
# as 9 limb rows of 32 bytes each (one 64-bit lane per point), so each
# coordinate occupies 32*9 bytes: X at +32*9*0, Y at +32*9*1, Z at +32*9*2.
1131 .globl ecp_nistz256_avx2_point_add_affine_x4
1132 .type ecp_nistz256_avx2_point_add_affine_x4,\@function,3
1134 ecp_nistz256_avx2_point_add_affine_x4:
# Win64 ABI: XMM6-XMM15 are callee-saved.  They are spilled relative to %rax,
# which is presumably the pre-adjustment %rsp set on an elided prologue line —
# TODO confirm against the full file.  NOTE(review): the -8 bias here pairs
# with a `lea -16*10(%rsp)` adjustment, unlike to_mont/from_mont which use
# `lea -8-16*10(%rsp)`; verify the two layouts are both correct.
1139 $code.=<<___ if ($win64);
1140 lea -16*10(%rsp), %rsp
1141 vmovaps %xmm6, -8-16*10(%rax)
1142 vmovaps %xmm7, -8-16*9(%rax)
1143 vmovaps %xmm8, -8-16*8(%rax)
1144 vmovaps %xmm9, -8-16*7(%rax)
1145 vmovaps %xmm10, -8-16*6(%rax)
1146 vmovaps %xmm11, -8-16*5(%rax)
1147 vmovaps %xmm12, -8-16*4(%rax)
1148 vmovaps %xmm13, -8-16*3(%rax)
1149 vmovaps %xmm14, -8-16*2(%rax)
1150 vmovaps %xmm15, -8-16*1(%rax)
1155 # Result + 32*0 = Result.X
1156 # Result + 32*9 = Result.Y
1157 # Result + 32*18 = Result.Z
# Frame: 8 temporary 9-limb vectors (32*9*8), two 32-byte infinity masks
# (32*2), plus 32*8 scratch.
1166 sub \$`32*9*8+32*2+32*8`, %rsp
# Park the argument pointers in %r8/%r9/%r10 (see aliases above).
1169 mov $r_ptr_in, $r_ptr
1170 mov $a_ptr_in, $a_ptr
1171 mov $b_ptr_in, $b_ptr
# Infinity detection for A: OR together limb rows 0..17 (X and Y, 9 rows
# each), then compare each lane against zero.  The resulting per-lane
# all-ones/all-zeros mask is saved at 32*9*8(%rsp).
1173 vmovdqa 32*0($a_ptr_in), %ymm0
1174 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
1175 vpxor %ymm1, %ymm1, %ymm1
1176 lea 256($a_ptr_in), %rax # size optimization
1177 vpor 32*1($a_ptr_in), %ymm0, %ymm0
1178 vpor 32*2($a_ptr_in), %ymm0, %ymm0
1179 vpor 32*3($a_ptr_in), %ymm0, %ymm0
1180 vpor 32*4-256(%rax), %ymm0, %ymm0
1181 lea 256(%rax), %rcx # size optimization
1182 vpor 32*5-256(%rax), %ymm0, %ymm0
1183 vpor 32*6-256(%rax), %ymm0, %ymm0
1184 vpor 32*7-256(%rax), %ymm0, %ymm0
1185 vpor 32*8-256(%rax), %ymm0, %ymm0
1186 vpor 32*9-256(%rax), %ymm0, %ymm0
1187 vpor 32*10-256(%rax), %ymm0, %ymm0
1188 vpor 32*11-256(%rax), %ymm0, %ymm0
1189 vpor 32*12-512(%rcx), %ymm0, %ymm0
1190 vpor 32*13-512(%rcx), %ymm0, %ymm0
1191 vpor 32*14-512(%rcx), %ymm0, %ymm0
1192 vpor 32*15-512(%rcx), %ymm0, %ymm0
1193 vpor 32*16-512(%rcx), %ymm0, %ymm0
1194 vpor 32*17-512(%rcx), %ymm0, %ymm0
1195 vpcmpeqq %ymm1, %ymm0, %ymm0
1196 vmovdqa %ymm0, `32*9*8`(%rsp)
# Same infinity mask for B, saved 32 bytes above A's mask.
1198 vpxor %ymm1, %ymm1, %ymm1
1199 vmovdqa 32*0($b_ptr), %ymm0
1200 lea 256($b_ptr), %rax # size optimization
1201 vpor 32*1($b_ptr), %ymm0, %ymm0
1202 vpor 32*2($b_ptr), %ymm0, %ymm0
1203 vpor 32*3($b_ptr), %ymm0, %ymm0
1204 vpor 32*4-256(%rax), %ymm0, %ymm0
1205 lea 256(%rax), %rcx # size optimization
1206 vpor 32*5-256(%rax), %ymm0, %ymm0
1207 vpor 32*6-256(%rax), %ymm0, %ymm0
1208 vpor 32*7-256(%rax), %ymm0, %ymm0
1209 vpor 32*8-256(%rax), %ymm0, %ymm0
1210 vpor 32*9-256(%rax), %ymm0, %ymm0
1211 vpor 32*10-256(%rax), %ymm0, %ymm0
1212 vpor 32*11-256(%rax), %ymm0, %ymm0
1213 vpor 32*12-512(%rcx), %ymm0, %ymm0
1214 vpor 32*13-512(%rcx), %ymm0, %ymm0
1215 vpor 32*14-512(%rcx), %ymm0, %ymm0
1216 vpor 32*15-512(%rcx), %ymm0, %ymm0
1217 vpor 32*16-512(%rcx), %ymm0, %ymm0
1218 vpor 32*17-512(%rcx), %ymm0, %ymm0
1219 vpcmpeqq %ymm1, %ymm0, %ymm0
1220 vmovdqa %ymm0, `32*9*8+32`(%rsp)
# Each lea triple below sets up %rsi/%rdx/%rdi (src, src2, dst) for a helper
# whose `call` line is elided in this view (presumably avx2_sqr_x4 /
# avx2_mul_x4 / avx2_sub_x4 — TODO confirm); avx2_normalize_n_store then
# carry-propagates and writes the destination.
1223 lea `32*9*2`($a_ptr), %rsi
1224 lea `32*9*2`(%rsp), %rdi
1225 lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
1227 call avx2_normalize_n_store
1230 lea `32*9*0`($b_ptr), %rsi
1231 lea `32*9*2`(%rsp), %rdx
1232 lea `32*9*0`(%rsp), %rdi
1234 #call avx2_normalize
1237 # S2 = Z1*Z1^2 = Z1^3
1238 lea `32*9*2`($a_ptr), %rsi
1239 lea `32*9*2`(%rsp), %rdx
1240 lea `32*9*1`(%rsp), %rdi
1242 call avx2_normalize_n_store
1244 # S2 = S2*Y2 = Y2*Z1^3
1245 lea `32*9*1`($b_ptr), %rsi
1246 lea `32*9*1`(%rsp), %rdx
1247 lea `32*9*1`(%rsp), %rdi
1249 call avx2_normalize_n_store
1251 # H = U2 - U1 = U2 - X1
1252 lea `32*9*0`(%rsp), %rsi
1253 lea `32*9*0`($a_ptr), %rdx
1254 lea `32*9*3`(%rsp), %rdi
1256 call avx2_normalize_n_store
1258 # R = S2 - S1 = S2 - Y1
1259 lea `32*9*1`(%rsp), %rsi
1260 lea `32*9*1`($a_ptr), %rdx
1261 lea `32*9*4`(%rsp), %rdi
1263 call avx2_normalize_n_store
# Result.Z = H*Z1; then blend via avx2_select_n_store: lanes where an input
# was at infinity get .LONE substituted (driven by the masks saved above).
1266 lea `32*9*3`(%rsp), %rsi
1267 lea `32*9*2`($a_ptr), %rdx
1268 lea `32*9*2`($r_ptr), %rdi
1272 lea .LONE(%rip), %rsi
1273 lea `32*9*2`($a_ptr), %rdx
1274 call avx2_select_n_store
1277 lea `32*9*4`(%rsp), %rsi
1278 lea `32*9*6`(%rsp), %rdi
1279 lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
1281 call avx2_normalize_n_store
1284 lea `32*9*3`(%rsp), %rsi
1285 lea `32*9*5`(%rsp), %rdi
1287 call avx2_normalize_n_store
1290 lea `32*9*3`(%rsp), %rsi
1291 lea `32*9*5`(%rsp), %rdx
1292 lea `32*9*7`(%rsp), %rdi
1294 call avx2_normalize_n_store
1297 lea `32*9*0`($a_ptr), %rsi
1298 lea `32*9*5`(%rsp), %rdx
1299 lea `32*9*0`(%rsp), %rdi
1301 #call avx2_normalize
1305 #lea 32*9*0(%rsp), %rsi
1306 #lea 32*9*5(%rsp), %rdi
1307 #call avx2_mul_by2_x4
# Inline doubling of the value currently held in $ACC0..$ACC8 (replaces the
# disabled avx2_mul_by2_x4 helper above); destination is slot 5.
1309 vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4
1310 lea `32*9*5`(%rsp), %rdi
1311 vpaddq $ACC1, $ACC1, $ACC1
1312 vpaddq $ACC2, $ACC2, $ACC2
1313 vpaddq $ACC3, $ACC3, $ACC3
1314 vpaddq $ACC4, $ACC4, $ACC4
1315 vpaddq $ACC5, $ACC5, $ACC5
1316 vpaddq $ACC6, $ACC6, $ACC6
1317 vpaddq $ACC7, $ACC7, $ACC7
1318 vpaddq $ACC8, $ACC8, $ACC8
1319 call avx2_normalize_n_store
1322 #lea 32*9*6(%rsp), %rsi
1323 #lea 32*9*7(%rsp), %rdx
1324 #lea 32*9*5(%rsp), %rcx
1325 #lea 32*9*0($r_ptr), %rdi
1331 #lea 32*9*0($r_ptr), %rsi
1332 #lea 32*9*0($r_ptr), %rdi
# Result.X = R^2 + 2*poly - H^3 - 2*X1*H^2, computed inline with lazy
# reduction: load slot 6 (+128 bias shortens displacements), add
# .LAVX2_POLY_x2 (2p keeps the lazy subtractions non-negative), subtract
# slot 7 and slot 5.
1337 lea `32*9*6+128`(%rsp), %rsi
1338 lea .LAVX2_POLY_x2+128(%rip), %rax
1339 lea `32*9*7+128`(%rsp), %rdx
1340 lea `32*9*5+128`(%rsp), %rcx
1341 lea `32*9*0`($r_ptr), %rdi
1343 vmovdqa 32*0-128(%rsi), $ACC0
1344 vmovdqa 32*1-128(%rsi), $ACC1
1345 vmovdqa 32*2-128(%rsi), $ACC2
1346 vmovdqa 32*3-128(%rsi), $ACC3
1347 vmovdqa 32*4-128(%rsi), $ACC4
1348 vmovdqa 32*5-128(%rsi), $ACC5
1349 vmovdqa 32*6-128(%rsi), $ACC6
1350 vmovdqa 32*7-128(%rsi), $ACC7
1351 vmovdqa 32*8-128(%rsi), $ACC8
1353 vpaddq 32*0-128(%rax), $ACC0, $ACC0
1354 vpaddq 32*1-128(%rax), $ACC1, $ACC1
1355 vpaddq 32*2-128(%rax), $ACC2, $ACC2
1356 vpaddq 32*3-128(%rax), $ACC3, $ACC3
1357 vpaddq 32*4-128(%rax), $ACC4, $ACC4
1358 vpaddq 32*5-128(%rax), $ACC5, $ACC5
1359 vpaddq 32*6-128(%rax), $ACC6, $ACC6
1360 vpaddq 32*7-128(%rax), $ACC7, $ACC7
1361 vpaddq 32*8-128(%rax), $ACC8, $ACC8
1363 vpsubq 32*0-128(%rdx), $ACC0, $ACC0
1364 vpsubq 32*1-128(%rdx), $ACC1, $ACC1
1365 vpsubq 32*2-128(%rdx), $ACC2, $ACC2
1366 vpsubq 32*3-128(%rdx), $ACC3, $ACC3
1367 vpsubq 32*4-128(%rdx), $ACC4, $ACC4
1368 vpsubq 32*5-128(%rdx), $ACC5, $ACC5
1369 vpsubq 32*6-128(%rdx), $ACC6, $ACC6
1370 vpsubq 32*7-128(%rdx), $ACC7, $ACC7
1371 vpsubq 32*8-128(%rdx), $ACC8, $ACC8
1373 vpsubq 32*0-128(%rcx), $ACC0, $ACC0
1374 vpsubq 32*1-128(%rcx), $ACC1, $ACC1
1375 vpsubq 32*2-128(%rcx), $ACC2, $ACC2
1376 vpsubq 32*3-128(%rcx), $ACC3, $ACC3
1377 vpsubq 32*4-128(%rcx), $ACC4, $ACC4
1378 vpsubq 32*5-128(%rcx), $ACC5, $ACC5
1379 vpsubq 32*6-128(%rcx), $ACC6, $ACC6
1380 vpsubq 32*7-128(%rcx), $ACC7, $ACC7
1381 vpsubq 32*8-128(%rcx), $ACC8, $ACC8
# Blend X: where an input was infinity, pass through the other point's X.
1384 lea 32*0($b_ptr), %rsi
1385 lea 32*0($a_ptr), %rdx
1386 call avx2_select_n_store
1389 lea `32*9*0`(%rsp), %rsi
1390 lea `32*9*0`($r_ptr), %rdx
1391 lea `32*9*3`(%rsp), %rdi
1393 call avx2_normalize_n_store
1396 lea `32*9*3`(%rsp), %rsi
1397 lea `32*9*4`(%rsp), %rdx
1398 lea `32*9*3`(%rsp), %rdi
1400 call avx2_normalize_n_store
1403 lea `32*9*7`(%rsp), %rsi
1404 lea `32*9*1`($a_ptr), %rdx
1405 lea `32*9*1`(%rsp), %rdi
1407 call avx2_normalize_n_store
1410 lea `32*9*3`(%rsp), %rsi
1411 lea `32*9*1`(%rsp), %rdx
1412 lea `32*9*1`($r_ptr), %rdi
# Blend Y likewise.
1416 lea 32*9($b_ptr), %rsi
1417 lea 32*9($a_ptr), %rdx
1418 call avx2_select_n_store
1420 #lea 32*9*0($r_ptr), %rsi
1421 #lea 32*9*0($r_ptr), %rdi
1422 #call avx2_mul_by1_x4
# Final reduction of Result.Y in place.
1426 lea `32*9*1`($r_ptr), %rsi
1427 lea `32*9*1`($r_ptr), %rdi
1428 call avx2_mul_by1_x4
1429 call avx2_normalize_n_store
# NOTE(review): this Win64 epilogue *stores* xmm6-15 to -16*N(%rbp) rather
# than loading them back — the callee-saved registers are never restored.
# Compare ecp_nistz256_avx2_to_mont below, which restores with
# "movaps 16*N(%rsp), %xmmN".  A restore here would read
# "movaps -16*10(%rbp), %xmm6" etc. — verify against the full file before
# relying on this path.
1433 $code.=<<___ if ($win64);
1434 movaps %xmm6, -16*10(%rbp)
1435 movaps %xmm7, -16*9(%rbp)
1436 movaps %xmm8, -16*8(%rbp)
1437 movaps %xmm9, -16*7(%rbp)
1438 movaps %xmm10, -16*6(%rbp)
1439 movaps %xmm11, -16*5(%rbp)
1440 movaps %xmm12, -16*4(%rbp)
1441 movaps %xmm13, -16*3(%rbp)
1442 movaps %xmm14, -16*2(%rbp)
1443 movaps %xmm15, -16*1(%rbp)
1449 .size ecp_nistz256_avx2_point_add_affine_x4,.-ecp_nistz256_avx2_point_add_affine_x4
1451 ################################################################################
1452 # void ecp_nistz256_avx2_point_add_affines_x4(void* RESULTx4, void *Ax4, void *Bx4);
# Four-way vectorized P-256 addition of two *affine* points (Z implicit = 1),
# producing a Jacobian result.  Same 9-limb x 4-lane layout and frame as
# ecp_nistz256_avx2_point_add_affine_x4 above; the Z1 powers drop out, so
# H = X2-X1 and R = Y2-Y1 directly.
1453 .globl ecp_nistz256_avx2_point_add_affines_x4
1454 .type ecp_nistz256_avx2_point_add_affines_x4,\@function,3
1456 ecp_nistz256_avx2_point_add_affines_x4:
# Win64 ABI: spill callee-saved XMM6-XMM15 relative to %rax (presumably the
# pre-adjustment %rsp from an elided prologue line — TODO confirm).
1461 $code.=<<___ if ($win64);
1462 lea -16*10(%rsp), %rsp
1463 vmovaps %xmm6, -8-16*10(%rax)
1464 vmovaps %xmm7, -8-16*9(%rax)
1465 vmovaps %xmm8, -8-16*8(%rax)
1466 vmovaps %xmm9, -8-16*7(%rax)
1467 vmovaps %xmm10, -8-16*6(%rax)
1468 vmovaps %xmm11, -8-16*5(%rax)
1469 vmovaps %xmm12, -8-16*4(%rax)
1470 vmovaps %xmm13, -8-16*3(%rax)
1471 vmovaps %xmm14, -8-16*2(%rax)
1472 vmovaps %xmm15, -8-16*1(%rax)
1477 # Result + 32*0 = Result.X
1478 # Result + 32*9 = Result.Y
1479 # Result + 32*18 = Result.Z
# Frame: 8 temporary 9-limb vectors, two infinity masks, scratch.
1487 sub \$`32*9*8+32*2+32*8`, %rsp
1490 mov $r_ptr_in, $r_ptr
1491 mov $a_ptr_in, $a_ptr
1492 mov $b_ptr_in, $b_ptr
# Infinity detection for A: OR limb rows 0..17 (X and Y) and compare each
# lane to zero; mask saved at 32*9*8(%rsp).
1494 vmovdqa 32*0($a_ptr_in), %ymm0
1495 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
1496 vpxor %ymm1, %ymm1, %ymm1
1497 lea 256($a_ptr_in), %rax # size optimization
1498 vpor 32*1($a_ptr_in), %ymm0, %ymm0
1499 vpor 32*2($a_ptr_in), %ymm0, %ymm0
1500 vpor 32*3($a_ptr_in), %ymm0, %ymm0
1501 vpor 32*4-256(%rax), %ymm0, %ymm0
1502 lea 256(%rax), %rcx # size optimization
1503 vpor 32*5-256(%rax), %ymm0, %ymm0
1504 vpor 32*6-256(%rax), %ymm0, %ymm0
1505 vpor 32*7-256(%rax), %ymm0, %ymm0
1506 vpor 32*8-256(%rax), %ymm0, %ymm0
1507 vpor 32*9-256(%rax), %ymm0, %ymm0
1508 vpor 32*10-256(%rax), %ymm0, %ymm0
1509 vpor 32*11-256(%rax), %ymm0, %ymm0
1510 vpor 32*12-512(%rcx), %ymm0, %ymm0
1511 vpor 32*13-512(%rcx), %ymm0, %ymm0
1512 vpor 32*14-512(%rcx), %ymm0, %ymm0
1513 vpor 32*15-512(%rcx), %ymm0, %ymm0
1514 vpor 32*16-512(%rcx), %ymm0, %ymm0
1515 vpor 32*17-512(%rcx), %ymm0, %ymm0
1516 vpcmpeqq %ymm1, %ymm0, %ymm0
1517 vmovdqa %ymm0, `32*9*8`(%rsp)
# Infinity mask for B, saved 32 bytes above A's.
1519 vpxor %ymm1, %ymm1, %ymm1
1520 vmovdqa 32*0($b_ptr), %ymm0
1521 lea 256($b_ptr), %rax # size optimization
1522 vpor 32*1($b_ptr), %ymm0, %ymm0
1523 vpor 32*2($b_ptr), %ymm0, %ymm0
1524 vpor 32*3($b_ptr), %ymm0, %ymm0
1525 vpor 32*4-256(%rax), %ymm0, %ymm0
1526 lea 256(%rax), %rcx # size optimization
1527 vpor 32*5-256(%rax), %ymm0, %ymm0
1528 vpor 32*6-256(%rax), %ymm0, %ymm0
1529 vpor 32*7-256(%rax), %ymm0, %ymm0
1530 vpor 32*8-256(%rax), %ymm0, %ymm0
1531 vpor 32*9-256(%rax), %ymm0, %ymm0
1532 vpor 32*10-256(%rax), %ymm0, %ymm0
1533 vpor 32*11-256(%rax), %ymm0, %ymm0
1534 vpor 32*12-512(%rcx), %ymm0, %ymm0
1535 vpor 32*13-512(%rcx), %ymm0, %ymm0
1536 vpor 32*14-512(%rcx), %ymm0, %ymm0
1537 vpor 32*15-512(%rcx), %ymm0, %ymm0
1538 vpor 32*16-512(%rcx), %ymm0, %ymm0
1539 vpor 32*17-512(%rcx), %ymm0, %ymm0
1540 vpcmpeqq %ymm1, %ymm0, %ymm0
1541 vmovdqa %ymm0, `32*9*8+32`(%rsp)
1543 # H = U2 - U1 = X2 - X1
1544 lea `32*9*0`($b_ptr), %rsi
1545 lea `32*9*0`($a_ptr), %rdx
1546 lea `32*9*3`(%rsp), %rdi
1548 call avx2_normalize_n_store
1550 # R = S2 - S1 = Y2 - Y1
1551 lea `32*9*1`($b_ptr), %rsi
1552 lea `32*9*1`($a_ptr), %rdx
1553 lea `32*9*4`(%rsp), %rdi
1555 call avx2_normalize_n_store
# Result.Z = H (reduced in place into the result's Z slot).
1558 lea `32*9*3`(%rsp), %rsi
1559 lea `32*9*2`($r_ptr), %rdi
1560 call avx2_mul_by1_x4
# $B = (A at infinity) OR (B at infinity) mask.  vpandn zeroes Z in the
# affected lanes, then the vpand/vpxor chain against .LONE substitutes 1
# there, so infinity lanes get Z = 1 (the surviving input is affine).
1563 vmovdqa `32*9*8`(%rsp), $B
1564 vpor `32*9*8+32`(%rsp), $B, $B
1566 vpandn $ACC0, $B, $ACC0
1567 lea .LONE+128(%rip), %rax
1568 vpandn $ACC1, $B, $ACC1
1569 vpandn $ACC2, $B, $ACC2
1570 vpandn $ACC3, $B, $ACC3
1571 vpandn $ACC4, $B, $ACC4
1572 vpandn $ACC5, $B, $ACC5
1573 vpandn $ACC6, $B, $ACC6
1574 vpandn $ACC7, $B, $ACC7
1576 vpand 32*0-128(%rax), $B, $T0
1577 vpandn $ACC8, $B, $ACC8
1578 vpand 32*1-128(%rax), $B, $Y
1579 vpxor $T0, $ACC0, $ACC0
1580 vpand 32*2-128(%rax), $B, $T0
1581 vpxor $Y, $ACC1, $ACC1
1582 vpand 32*3-128(%rax), $B, $Y
1583 vpxor $T0, $ACC2, $ACC2
1584 vpand 32*4-128(%rax), $B, $T0
1585 vpxor $Y, $ACC3, $ACC3
1586 vpand 32*5-128(%rax), $B, $Y
1587 vpxor $T0, $ACC4, $ACC4
1588 vpand 32*6-128(%rax), $B, $T0
1589 vpxor $Y, $ACC5, $ACC5
1590 vpand 32*7-128(%rax), $B, $Y
1591 vpxor $T0, $ACC6, $ACC6
1592 vpand 32*8-128(%rax), $B, $T0
1593 vpxor $Y, $ACC7, $ACC7
1594 vpxor $T0, $ACC8, $ACC8
# From here the flow mirrors point_add_affine_x4: each lea run sets up
# operands for a multiply/square helper whose `call` line is elided in this
# view — TODO confirm against the full file.
1598 lea `32*9*4`(%rsp), %rsi
1599 lea `32*9*6`(%rsp), %rdi
1600 lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
1602 call avx2_normalize_n_store
1605 lea `32*9*3`(%rsp), %rsi
1606 lea `32*9*5`(%rsp), %rdi
1608 call avx2_normalize_n_store
1611 lea `32*9*3`(%rsp), %rsi
1612 lea `32*9*5`(%rsp), %rdx
1613 lea `32*9*7`(%rsp), %rdi
1615 call avx2_normalize_n_store
1618 lea `32*9*0`($a_ptr), %rsi
1619 lea `32*9*5`(%rsp), %rdx
1620 lea `32*9*0`(%rsp), %rdi
1622 #call avx2_normalize
1626 #lea 32*9*0(%rsp), %rsi
1627 #lea 32*9*5(%rsp), %rdi
1628 #call avx2_mul_by2_x4
# Inline doubling of $ACC0..$ACC8 (replaces the disabled helper); dst slot 5.
1630 vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4
1631 lea `32*9*5`(%rsp), %rdi
1632 vpaddq $ACC1, $ACC1, $ACC1
1633 vpaddq $ACC2, $ACC2, $ACC2
1634 vpaddq $ACC3, $ACC3, $ACC3
1635 vpaddq $ACC4, $ACC4, $ACC4
1636 vpaddq $ACC5, $ACC5, $ACC5
1637 vpaddq $ACC6, $ACC6, $ACC6
1638 vpaddq $ACC7, $ACC7, $ACC7
1639 vpaddq $ACC8, $ACC8, $ACC8
1640 call avx2_normalize_n_store
1643 #lea 32*9*6(%rsp), %rsi
1644 #lea 32*9*7(%rsp), %rdx
1645 #lea 32*9*5(%rsp), %rcx
1646 #lea 32*9*0($r_ptr), %rdi
1652 #lea 32*9*0($r_ptr), %rsi
1653 #lea 32*9*0($r_ptr), %rdi
# Result.X: load slot 6, add .LAVX2_POLY_x2 (2p, keeps lazy subtraction
# non-negative), subtract slot 7 and slot 5; +128 bias shortens encodings.
1658 lea `32*9*6+128`(%rsp), %rsi
1659 lea .LAVX2_POLY_x2+128(%rip), %rax
1660 lea `32*9*7+128`(%rsp), %rdx
1661 lea `32*9*5+128`(%rsp), %rcx
1662 lea `32*9*0`($r_ptr), %rdi
1664 vmovdqa 32*0-128(%rsi), $ACC0
1665 vmovdqa 32*1-128(%rsi), $ACC1
1666 vmovdqa 32*2-128(%rsi), $ACC2
1667 vmovdqa 32*3-128(%rsi), $ACC3
1668 vmovdqa 32*4-128(%rsi), $ACC4
1669 vmovdqa 32*5-128(%rsi), $ACC5
1670 vmovdqa 32*6-128(%rsi), $ACC6
1671 vmovdqa 32*7-128(%rsi), $ACC7
1672 vmovdqa 32*8-128(%rsi), $ACC8
1674 vpaddq 32*0-128(%rax), $ACC0, $ACC0
1675 vpaddq 32*1-128(%rax), $ACC1, $ACC1
1676 vpaddq 32*2-128(%rax), $ACC2, $ACC2
1677 vpaddq 32*3-128(%rax), $ACC3, $ACC3
1678 vpaddq 32*4-128(%rax), $ACC4, $ACC4
1679 vpaddq 32*5-128(%rax), $ACC5, $ACC5
1680 vpaddq 32*6-128(%rax), $ACC6, $ACC6
1681 vpaddq 32*7-128(%rax), $ACC7, $ACC7
1682 vpaddq 32*8-128(%rax), $ACC8, $ACC8
1684 vpsubq 32*0-128(%rdx), $ACC0, $ACC0
1685 vpsubq 32*1-128(%rdx), $ACC1, $ACC1
1686 vpsubq 32*2-128(%rdx), $ACC2, $ACC2
1687 vpsubq 32*3-128(%rdx), $ACC3, $ACC3
1688 vpsubq 32*4-128(%rdx), $ACC4, $ACC4
1689 vpsubq 32*5-128(%rdx), $ACC5, $ACC5
1690 vpsubq 32*6-128(%rdx), $ACC6, $ACC6
1691 vpsubq 32*7-128(%rdx), $ACC7, $ACC7
1692 vpsubq 32*8-128(%rdx), $ACC8, $ACC8
1694 vpsubq 32*0-128(%rcx), $ACC0, $ACC0
1695 vpsubq 32*1-128(%rcx), $ACC1, $ACC1
1696 vpsubq 32*2-128(%rcx), $ACC2, $ACC2
1697 vpsubq 32*3-128(%rcx), $ACC3, $ACC3
1698 vpsubq 32*4-128(%rcx), $ACC4, $ACC4
1699 vpsubq 32*5-128(%rcx), $ACC5, $ACC5
1700 vpsubq 32*6-128(%rcx), $ACC6, $ACC6
1701 vpsubq 32*7-128(%rcx), $ACC7, $ACC7
1702 vpsubq 32*8-128(%rcx), $ACC8, $ACC8
# Blend X: infinity lanes pass through the other point's X.
1705 lea 32*0($b_ptr), %rsi
1706 lea 32*0($a_ptr), %rdx
1707 call avx2_select_n_store
1710 lea `32*9*0`(%rsp), %rsi
1711 lea `32*9*0`($r_ptr), %rdx
1712 lea `32*9*3`(%rsp), %rdi
1714 call avx2_normalize_n_store
1717 lea `32*9*3`(%rsp), %rsi
1718 lea `32*9*4`(%rsp), %rdx
1719 lea `32*9*3`(%rsp), %rdi
1721 call avx2_normalize_n_store
1724 lea `32*9*7`(%rsp), %rsi
1725 lea `32*9*1`($a_ptr), %rdx
1726 lea `32*9*1`(%rsp), %rdi
1728 call avx2_normalize_n_store
1731 lea `32*9*3`(%rsp), %rsi
1732 lea `32*9*1`(%rsp), %rdx
1733 lea `32*9*1`($r_ptr), %rdi
# Blend Y likewise.
1737 lea 32*9($b_ptr), %rsi
1738 lea 32*9($a_ptr), %rdx
1739 call avx2_select_n_store
1741 #lea 32*9*0($r_ptr), %rsi
1742 #lea 32*9*0($r_ptr), %rdi
1743 #call avx2_mul_by1_x4
# Final in-place reduction of Result.Y.
1747 lea `32*9*1`($r_ptr), %rsi
1748 lea `32*9*1`($r_ptr), %rdi
1749 call avx2_mul_by1_x4
1750 call avx2_normalize_n_store
# NOTE(review): as in point_add_affine_x4 above, this epilogue *stores*
# xmm6-15 instead of restoring them (operands reversed relative to a load;
# compare the to_mont/from_mont epilogues).  Verify before relying on the
# Win64 path.
1754 $code.=<<___ if ($win64);
1755 movaps %xmm6, -16*10(%rbp)
1756 movaps %xmm7, -16*9(%rbp)
1757 movaps %xmm8, -16*8(%rbp)
1758 movaps %xmm9, -16*7(%rbp)
1759 movaps %xmm10, -16*6(%rbp)
1760 movaps %xmm11, -16*5(%rbp)
1761 movaps %xmm12, -16*4(%rbp)
1762 movaps %xmm13, -16*3(%rbp)
1763 movaps %xmm14, -16*2(%rbp)
1764 movaps %xmm15, -16*1(%rbp)
1770 .size ecp_nistz256_avx2_point_add_affines_x4,.-ecp_nistz256_avx2_point_add_affines_x4
1772 ################################################################################
1773 # void ecp_nistz256_avx2_to_mont(void* RESULTx4, void *Ax4);
# Convert four vectorized field elements into Montgomery form by multiplying
# with the .LTO_MONT_AVX2 constant (the multiply helper's `call` line is
# elided in this view — TODO confirm it sits between lines 1795 and 1797).
1774 .globl ecp_nistz256_avx2_to_mont
1775 .type ecp_nistz256_avx2_to_mont,\@function,2
1777 ecp_nistz256_avx2_to_mont:
# Win64: save callee-saved XMM6-XMM15.  Here the rsp adjustment (-8-16*10)
# matches the restore (loads from 16*N(%rsp), then +8+16*10) — this is the
# consistent save/restore pattern the point-add epilogues above lack.
1780 $code.=<<___ if ($win64);
1781 lea -8-16*10(%rsp), %rsp
1782 vmovaps %xmm6, -8-16*10(%rax)
1783 vmovaps %xmm7, -8-16*9(%rax)
1784 vmovaps %xmm8, -8-16*8(%rax)
1785 vmovaps %xmm9, -8-16*7(%rax)
1786 vmovaps %xmm10, -8-16*6(%rax)
1787 vmovaps %xmm11, -8-16*5(%rax)
1788 vmovaps %xmm12, -8-16*4(%rax)
1789 vmovaps %xmm13, -8-16*3(%rax)
1790 vmovaps %xmm14, -8-16*2(%rax)
1791 vmovaps %xmm15, -8-16*1(%rax)
1794 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
# %rdx = multiplier operand for the (elided) multiply call.
1795 lea .LTO_MONT_AVX2(%rip), %rdx
1797 call avx2_normalize_n_store
# Restore XMM6-XMM15 and undo the stack adjustment.
1801 $code.=<<___ if ($win64);
1802 movaps 16*0(%rsp), %xmm6
1803 movaps 16*1(%rsp), %xmm7
1804 movaps 16*2(%rsp), %xmm8
1805 movaps 16*3(%rsp), %xmm9
1806 movaps 16*4(%rsp), %xmm10
1807 movaps 16*5(%rsp), %xmm11
1808 movaps 16*6(%rsp), %xmm12
1809 movaps 16*7(%rsp), %xmm13
1810 movaps 16*8(%rsp), %xmm14
1811 movaps 16*9(%rsp), %xmm15
1812 lea 8+16*10(%rsp), %rsp
1816 .size ecp_nistz256_avx2_to_mont,.-ecp_nistz256_avx2_to_mont
1818 ################################################################################
1819 # void ecp_nistz256_avx2_from_mont(void* RESULTx4, void *Ax4);
# Convert four vectorized field elements out of Montgomery form by
# multiplying with .LFROM_MONT_AVX2 (multiply call elided in this view —
# TODO confirm).  Mirrors ecp_nistz256_avx2_to_mont above.
1820 .globl ecp_nistz256_avx2_from_mont
1821 .type ecp_nistz256_avx2_from_mont,\@function,2
1823 ecp_nistz256_avx2_from_mont:
# Win64 prologue: save XMM6-XMM15 (matched by the loads in the epilogue).
1826 $code.=<<___ if ($win64);
1827 lea -8-16*10(%rsp), %rsp
1828 vmovaps %xmm6, -8-16*10(%rax)
1829 vmovaps %xmm7, -8-16*9(%rax)
1830 vmovaps %xmm8, -8-16*8(%rax)
1831 vmovaps %xmm9, -8-16*7(%rax)
1832 vmovaps %xmm10, -8-16*6(%rax)
1833 vmovaps %xmm11, -8-16*5(%rax)
1834 vmovaps %xmm12, -8-16*4(%rax)
1835 vmovaps %xmm13, -8-16*3(%rax)
1836 vmovaps %xmm14, -8-16*2(%rax)
1837 vmovaps %xmm15, -8-16*1(%rax)
1840 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
# %rdx = multiplier operand for the (elided) multiply call.
1841 lea .LFROM_MONT_AVX2(%rip), %rdx
1843 call avx2_normalize_n_store
# Restore XMM6-XMM15 and undo the stack adjustment.
1847 $code.=<<___ if ($win64);
1848 movaps 16*0(%rsp), %xmm6
1849 movaps 16*1(%rsp), %xmm7
1850 movaps 16*2(%rsp), %xmm8
1851 movaps 16*3(%rsp), %xmm9
1852 movaps 16*4(%rsp), %xmm10
1853 movaps 16*5(%rsp), %xmm11
1854 movaps 16*6(%rsp), %xmm12
1855 movaps 16*7(%rsp), %xmm13
1856 movaps 16*8(%rsp), %xmm14
1857 movaps 16*9(%rsp), %xmm15
1858 lea 8+16*10(%rsp), %rsp
1862 .size ecp_nistz256_avx2_from_mont,.-ecp_nistz256_avx2_from_mont
1864 ################################################################################
1865 # void ecp_nistz256_avx2_set1(void* RESULTx4);
# Copy the 9 limb rows of the .LONE constant (the value 1 in this
# representation) into the output, setting all four lanes to 1.  Loads and
# stores are interleaved to overlap latency; the +128 bias on %rax/%rdi
# keeps displacements within disp8 range (size optimization).
1866 .globl ecp_nistz256_avx2_set1
1867 .type ecp_nistz256_avx2_set1,\@function,1
1869 ecp_nistz256_avx2_set1:
1870 lea .LONE+128(%rip), %rax
1873 vmovdqa 32*0-128(%rax), %ymm0
1874 vmovdqa 32*1-128(%rax), %ymm1
1875 vmovdqa 32*2-128(%rax), %ymm2
1876 vmovdqa 32*3-128(%rax), %ymm3
1877 vmovdqa 32*4-128(%rax), %ymm4
1878 vmovdqa 32*5-128(%rax), %ymm5
1879 vmovdqa %ymm0, 32*0-128(%rdi)
# %ymm0-2 are reused for rows 6-8 once their first values are stored.
1880 vmovdqa 32*6-128(%rax), %ymm0
1881 vmovdqa %ymm1, 32*1-128(%rdi)
1882 vmovdqa 32*7-128(%rax), %ymm1
1883 vmovdqa %ymm2, 32*2-128(%rdi)
1884 vmovdqa 32*8-128(%rax), %ymm2
1885 vmovdqa %ymm3, 32*3-128(%rdi)
1886 vmovdqa %ymm4, 32*4-128(%rdi)
1887 vmovdqa %ymm5, 32*5-128(%rdi)
1888 vmovdqa %ymm0, 32*6-128(%rdi)
1889 vmovdqa %ymm1, 32*7-128(%rdi)
1890 vmovdqa %ymm2, 32*8-128(%rdi)
1894 .size ecp_nistz256_avx2_set1,.-ecp_nistz256_avx2_set1
1898 ################################################################################
1899 # void ecp_nistz256_avx2_multi_gather_w7(void* RESULT, void *in,
1900 # int index0, int index1, int index2, int index3);
1901 ################################################################################
# Constant-time table gather for four independent window-7 lookups: the loop
# scans the table linearly, compares each entry's position against the wanted
# index, and mask-XORs matching entries into the accumulators — so memory
# access pattern is independent of the secret indices.
1903 my ($val,$in_t,$index0,$index1,$index2,$index3)=("%rdi","%rsi","%edx","%ecx","%r8d","%r9d");
1904 my ($INDEX0,$INDEX1,$INDEX2,$INDEX3)=map("%ymm$_",(0..3));
1905 my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11));
1906 my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15));
1909 .globl ecp_nistz256_avx2_multi_gather_w7
1910 .type ecp_nistz256_avx2_multi_gather_w7,\@function,6
1912 ecp_nistz256_avx2_multi_gather_w7:
# Win64: save callee-saved XMM6-XMM15 (restored by the loads at the end).
1915 $code.=<<___ if ($win64);
1916 lea -8-16*10(%rsp), %rsp
1917 vmovaps %xmm6, -8-16*10(%rax)
1918 vmovaps %xmm7, -8-16*9(%rax)
1919 vmovaps %xmm8, -8-16*8(%rax)
1920 vmovaps %xmm9, -8-16*7(%rax)
1921 vmovaps %xmm10, -8-16*6(%rax)
1922 vmovaps %xmm11, -8-16*5(%rax)
1923 vmovaps %xmm12, -8-16*4(%rax)
1924 vmovaps %xmm13, -8-16*3(%rax)
1925 vmovaps %xmm14, -8-16*2(%rax)
1926 vmovaps %xmm15, -8-16*1(%rax)
# %rax -> .LIntOne, the per-iteration increment for the counter $M0.
1929 lea .LIntOne(%rip), %rax
# Move the four scalar indices into vector registers.
1931 vmovd $index0, %xmm0
1932 vmovd $index1, %xmm1
1933 vmovd $index2, %xmm2
1934 vmovd $index3, %xmm3
# Zero the eight accumulators (two ymm halves per lookup result).
1936 vpxor $R0a, $R0a, $R0a
1937 vpxor $R0b, $R0b, $R0b
1938 vpxor $R1a, $R1a, $R1a
1939 vpxor $R1b, $R1b, $R1b
1940 vpxor $R2a, $R2a, $R2a
1941 vpxor $R2b, $R2b, $R2b
1942 vpxor $R3a, $R3a, $R3a
1943 vpxor $R3b, $R3b, $R3b
# With $R0a all-zero as the permutation selector, each vpermd replicates
# dword 0 — i.e. broadcasts the index across all lanes (presumably; the
# counter $M0 initialization is on an elided line — TODO confirm).
1946 vpermd $INDEX0, $R0a, $INDEX0
1947 vpermd $INDEX1, $R0a, $INDEX1
1948 vpermd $INDEX2, $R0a, $INDEX2
1949 vpermd $INDEX3, $R0a, $INDEX3
1952 lea 112($val), $val # size optimization
1953 jmp .Lmulti_select_loop_avx2
1955 # INDEX=0, corresponds to the point at infty (0,0)
1957 .Lmulti_select_loop_avx2:
# For each of the four tables (stride 32*64*2 bytes apart): build an
# all-ones mask where counter == wanted index, mask the 64-byte entry,
# and XOR it into the accumulator pair.
1958 vpcmpeqd $INDEX0, $M0, $TMP0
1960 vmovdqa `32*0+32*64*2*0`($in_t), $T0
1961 vmovdqa `32*1+32*64*2*0`($in_t), $T1
1962 vpand $TMP0, $T0, $T0
1963 vpand $TMP0, $T1, $T1
1964 vpxor $T0, $R0a, $R0a
1965 vpxor $T1, $R0b, $R0b
1967 vpcmpeqd $INDEX1, $M0, $TMP0
1969 vmovdqa `32*0+32*64*2*1`($in_t), $T0
1970 vmovdqa `32*1+32*64*2*1`($in_t), $T1
1971 vpand $TMP0, $T0, $T0
1972 vpand $TMP0, $T1, $T1
1973 vpxor $T0, $R1a, $R1a
1974 vpxor $T1, $R1b, $R1b
1976 vpcmpeqd $INDEX2, $M0, $TMP0
1978 vmovdqa `32*0+32*64*2*2`($in_t), $T0
1979 vmovdqa `32*1+32*64*2*2`($in_t), $T1
1980 vpand $TMP0, $T0, $T0
1981 vpand $TMP0, $T1, $T1
1982 vpxor $T0, $R2a, $R2a
1983 vpxor $T1, $R2b, $R2b
1985 vpcmpeqd $INDEX3, $M0, $TMP0
1987 vmovdqa `32*0+32*64*2*3`($in_t), $T0
1988 vmovdqa `32*1+32*64*2*3`($in_t), $T1
1989 vpand $TMP0, $T0, $T0
1990 vpand $TMP0, $T1, $T1
1991 vpxor $T0, $R3a, $R3a
1992 vpxor $T1, $R3b, $R3b
# Advance counter and table pointer; loop condition (dec of a hidden loop
# counter, presumably — line elided) drives the jnz.
1994 vpaddd (%rax), $M0, $M0 # increment
1995 lea 32*2($in_t), $in_t
1998 jnz .Lmulti_select_loop_avx2
# Store the four gathered 64-byte results (unaligned stores; $val was
# biased by +112 above).
2000 vmovdqu $R0a, 32*0-112($val)
2001 vmovdqu $R0b, 32*1-112($val)
2002 vmovdqu $R1a, 32*2-112($val)
2003 vmovdqu $R1b, 32*3-112($val)
2004 vmovdqu $R2a, 32*4-112($val)
2005 vmovdqu $R2b, 32*5-112($val)
2006 vmovdqu $R3a, 32*6-112($val)
2007 vmovdqu $R3b, 32*7-112($val)
# Win64 epilogue: restore XMM6-XMM15 and the stack pointer.
2011 $code.=<<___ if ($win64);
2012 movaps 16*0(%rsp), %xmm6
2013 movaps 16*1(%rsp), %xmm7
2014 movaps 16*2(%rsp), %xmm8
2015 movaps 16*3(%rsp), %xmm9
2016 movaps 16*4(%rsp), %xmm10
2017 movaps 16*5(%rsp), %xmm11
2018 movaps 16*6(%rsp), %xmm12
2019 movaps 16*7(%rsp), %xmm13
2020 movaps 16*8(%rsp), %xmm14
2021 movaps 16*9(%rsp), %xmm15
2022 lea 8+16*10(%rsp), %rsp
2026 .size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7
# Runtime capability probe: reads the third 32-bit word of
# OPENSSL_ia32cap_P (extended feature flags, which include the AVX2 bit).
# The bit-extraction and return lines are elided in this view — presumably
# an and/shr reducing %eax to 0/1 — TODO confirm.
2028 .extern OPENSSL_ia32cap_P
2029 .globl ecp_nistz_avx2_eligible
2030 .type ecp_nistz_avx2_eligible,\@abi-omnipotent
2032 ecp_nistz_avx2_eligible:
2033 mov OPENSSL_ia32cap_P+8(%rip),%eax
2037 .size ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
# Fallback branch taken when the assembler cannot emit AVX2 ($avx too low):
# every public entry point is emitted as an alias of a single ud2 trap, so
# calling any of them aborts.  Callers are expected to gate on
# ecp_nistz_avx2_eligible, which here returns "not eligible" (its body is
# elided in this view — presumably xor %eax,%eax; ret — TODO confirm).
2040 }} else {{ # assembler is too old
2044 .globl ecp_nistz256_avx2_transpose_convert
2045 .globl ecp_nistz256_avx2_convert_transpose_back
2046 .globl ecp_nistz256_avx2_point_add_affine_x4
2047 .globl ecp_nistz256_avx2_point_add_affines_x4
2048 .globl ecp_nistz256_avx2_to_mont
2049 .globl ecp_nistz256_avx2_from_mont
2050 .globl ecp_nistz256_avx2_set1
2051 .globl ecp_nistz256_avx2_multi_gather_w7
2052 .type ecp_nistz256_avx2_multi_gather_w7,\@abi-omnipotent
2053 ecp_nistz256_avx2_transpose_convert:
2054 ecp_nistz256_avx2_convert_transpose_back:
2055 ecp_nistz256_avx2_point_add_affine_x4:
2056 ecp_nistz256_avx2_point_add_affines_x4:
2057 ecp_nistz256_avx2_to_mont:
2058 ecp_nistz256_avx2_from_mont:
2059 ecp_nistz256_avx2_set1:
2060 ecp_nistz256_avx2_multi_gather_w7:
2061 .byte 0x0f,0x0b # ud2
2063 .size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7
2065 .globl ecp_nistz_avx2_eligible
2066 .type ecp_nistz_avx2_eligible,\@abi-omnipotent
2067 ecp_nistz_avx2_eligible:
2070 .size ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
2074 foreach (split("\n",$code)) {
2075 s/\`([^\`]*)\`/eval($1)/geo;