##############################################################################
# Copyright 2014 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
# Developers and authors:
# Shay Gueron (1, 2), and Vlad Krasnov (1)
# (1) Intel Corporation, Israel Development Center
# (2) University of Haifa
# Reference:
# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
# 256 Bit Primes"
##############################################################################
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$avx = ($ver>=3.0) + ($ver>=3.01);
	$addx = ($ver>=3.03);
}
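# $avx: 0 = scalar only, 1 = AVX1 is usable, 2 = the AVX2 code below may be
# emitted.  $addx is probed alongside: once a newer-assembler test has set
# it, the remaining (weaker) probes are short-circuited by the !$addx guards
# above.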
if ($avx>1) {{
my $digit_size = "\$29";	# each digit carries 29 bits, one per 64-bit lane

$code.=<<___;
.text

.align	64
.LAVX2_AND_MASK:
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff

# p = 2^256 - 2^224 + 2^192 + 2^96 - 1, as nine 29-bit digits
.LAVX2_POLY:
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000
.quad	0x00040000, 0x00040000, 0x00040000, 0x00040000
.quad	0x1fe00000, 0x1fe00000, 0x1fe00000, 0x1fe00000
.quad	0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff

# 2*p, with digits pre-padded so that per-digit subtraction cannot underflow
.LAVX2_POLY_x2:
.quad	0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
.quad	0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
.quad	0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
.quad	0x400007FC, 0x400007FC, 0x400007FC, 0x400007FC
.quad	0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
.quad	0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
.quad	0x400FFFFE, 0x400FFFFE, 0x400FFFFE, 0x400FFFFE
.quad	0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE
.quad	0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC

# 8*p, likewise padded
.LAVX2_POLY_x8:
.quad	0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
.quad	0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
.quad	0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
.quad	0x80000FF8, 0x80000FF8, 0x80000FF8, 0x80000FF8
.quad	0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
.quad	0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
.quad	0x801FFFFC, 0x801FFFFC, 0x801FFFFC, 0x801FFFFC
.quad	0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC
.quad	0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8

# 1 in our format, i.e. 2^261 mod p
.LONE:
.quad	0x00000020, 0x00000020, 0x00000020, 0x00000020
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000
.quad	0x1fffc000, 0x1fffc000, 0x1fffc000, 0x1fffc000
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x1f7fffff, 0x1f7fffff, 0x1f7fffff, 0x1f7fffff
.quad	0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000

# RR = 2^266 mod p in AVX2 format, to transform from the native OpenSSL
# Montgomery form (*2^256) to our format (*2^261)
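# Note: the multiplier below reduces by one 29-bit digit per step, nine
# digits in total, i.e. it computes a*b*2^-261 mod p.  Converting from the
# *2^256 form therefore uses RR = 2^266, since
#	(a*2^256) * 2^266 * 2^-261 = a*2^261 (mod p).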
.LTO_MONT_AVX2:
.quad	0x00000400, 0x00000400, 0x00000400, 0x00000400
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000
.quad	0x1ff80000, 0x1ff80000, 0x1ff80000, 0x1ff80000
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x00000003, 0x00000003, 0x00000003, 0x00000003

# 2^256 mod p in AVX2 format, to transform back from our format (*2^261)
# to the native OpenSSL Montgomery form (*2^256)
.LFROM_MONT_AVX2:
.quad	0x00000001, 0x00000001, 0x00000001, 0x00000001
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000
.quad	0x1ffffe00, 0x1ffffe00, 0x1ffffe00, 0x1ffffe00
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
.quad	0x1ffbffff, 0x1ffbffff, 0x1ffbffff, 0x1ffbffff
.quad	0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff
.quad	0x00000000, 0x00000000, 0x00000000, 0x00000000

# eight 32-bit ones, the index increment for the gather loop below
.LIntOne:
.long	1,1,1,1,1,1,1,1
___

################################################################################
# This function receives a pointer to an array of four affine points
# (X, Y, <1>) and rearranges the data for AVX2 execution, while
# converting it to 2^29-radix redundant form.
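# With shift = 29 and mask = 2^29-1, each 256-bit coordinate in[0..3]
# (four 64-bit words) is split into nine redundant digits out[0..8]:
#	out[0] =  in[0]                         & mask;
#	out[1] = (in[0] >> 29)                  & mask;
#	out[2] = ((in[0] >> 58) ^ (in[1] << 6)) & mask;
#	...
#	out[8] =  in[3] >> 40;			# 24 bits, no mask needed
# Four points are processed at once, one per 64-bit lane of every ymm.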
my ($X0,$X1,$X2,$X3, $Y0,$Y1,$Y2,$Y3,
    $T0,$T1,$T2,$T3, $T4,$T5,$T6,$T7)=map("%ymm$_",(0..15));

$code.=<<___;
.globl	ecp_nistz256_avx2_transpose_convert
.type	ecp_nistz256_avx2_transpose_convert,\@function,2
.align	64
ecp_nistz256_avx2_transpose_convert:
	mov	%rsp, %rax
___
$code.=<<___ if ($win64);
	lea	-8-16*10(%rsp), %rsp
	vmovaps	%xmm6, -8-16*10(%rax)
	vmovaps	%xmm7, -8-16*9(%rax)
	vmovaps	%xmm8, -8-16*8(%rax)
	vmovaps	%xmm9, -8-16*7(%rax)
	vmovaps	%xmm10, -8-16*6(%rax)
	vmovaps	%xmm11, -8-16*5(%rax)
	vmovaps	%xmm12, -8-16*4(%rax)
	vmovaps	%xmm13, -8-16*3(%rax)
	vmovaps	%xmm14, -8-16*2(%rax)
	vmovaps	%xmm15, -8-16*1(%rax)
___
$code.=<<___;
	vmovdqa	32*0(%rsi), $X0
	lea	112(%rsi), %rax		# size optimization
	vmovdqa	32*1(%rsi), $Y0
	lea	.LAVX2_AND_MASK(%rip), %rdx
	vmovdqa	32*2(%rsi), $X1
	vmovdqa	32*3(%rsi), $Y1
	vmovdqa	32*4-112(%rax), $X2
	vmovdqa	32*5-112(%rax), $Y2
	vmovdqa	32*6-112(%rax), $X3
	vmovdqa	32*7-112(%rax), $Y3

	# Transpose X and Y independently
	vpunpcklqdq	$X1, $X0, $T0	# T0 = [B2 A2 B0 A0]
	vpunpcklqdq	$X3, $X2, $T1	# T1 = [D2 C2 D0 C0]
	vpunpckhqdq	$X1, $X0, $T2	# T2 = [B3 A3 B1 A1]
	vpunpckhqdq	$X3, $X2, $T3	# T3 = [D3 C3 D1 C1]

	vpunpcklqdq	$Y1, $Y0, $T4
	vpunpcklqdq	$Y3, $Y2, $T5
	vpunpckhqdq	$Y1, $Y0, $T6
	vpunpckhqdq	$Y3, $Y2, $T7

	vperm2i128	\$0x20, $T1, $T0, $X0	# X0 = [D0 C0 B0 A0]
	vperm2i128	\$0x20, $T3, $T2, $X1	# X1 = [D1 C1 B1 A1]
	vperm2i128	\$0x31, $T1, $T0, $X2	# X2 = [D2 C2 B2 A2]
	vperm2i128	\$0x31, $T3, $T2, $X3	# X3 = [D3 C3 B3 A3]

	vperm2i128	\$0x20, $T5, $T4, $Y0
	vperm2i128	\$0x20, $T7, $T6, $Y1
	vperm2i128	\$0x31, $T5, $T4, $Y2
	vperm2i128	\$0x31, $T7, $T6, $Y3
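	# vpunpck{l,h}qdq interleave qwords only within each 128-bit half,
	# so a second pass of vperm2i128 is needed to complete the 4x4
	# qword transpose: every ymm now holds the same limb of all four
	# points.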
	vmovdqa	(%rdx), $T7		# AND mask, for the rest of the conversion
	vpand	(%rdx), $X0, $T0	# out[0] = in[0] & mask;
	vpsrlq	\$29, $X0, $X0
	vpand	$T7, $X0, $T1		# out[1] = (in[0] >> shift) & mask;
	vpsrlq	\$29, $X0, $X0
	vpsllq	\$6, $X1, $T2
	vpxor	$X0, $T2, $T2
	vpand	$T7, $T2, $T2		# out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
	vpsrlq	\$23, $X1, $X1
	vpand	$T7, $X1, $T3		# out[3] = (in[1] >> ((shift*3)%64)) & mask;
	vpsrlq	\$29, $X1, $X1
	vpsllq	\$12, $X2, $T4
	vpxor	$X1, $T4, $T4
	vpand	$T7, $T4, $T4		# out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
	vpsrlq	\$17, $X2, $X2
	vpand	$T7, $X2, $T5		# out[5] = (in[2] >> ((shift*5)%64)) & mask;
	vpsrlq	\$29, $X2, $X2
	vpsllq	\$18, $X3, $T6
	vpxor	$X2, $T6, $T6
	vpand	$T7, $T6, $T6		# out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
	vpsrlq	\$11, $X3, $X3
	vmovdqa	$T0, 32*0(%rdi)
	lea	112(%rdi), %rax		# size optimization
	vpand	$T7, $X3, $T0		# out[7] = (in[3] >> ((shift*7)%64)) & mask;
	vpsrlq	\$29, $X3, $X3		# out[8] = (in[3] >> ((shift*8)%64)); top digit, no mask needed

	vmovdqa	$T1, 32*1(%rdi)
	vmovdqa	$T2, 32*2(%rdi)
	vmovdqa	$T3, 32*3(%rdi)
	vmovdqa	$T4, 32*4-112(%rax)
	vmovdqa	$T5, 32*5-112(%rax)
	vmovdqa	$T6, 32*6-112(%rax)
	vmovdqa	$T0, 32*7-112(%rax)
	vmovdqa	$X3, 32*8-112(%rax)
	lea	448(%rdi), %rax		# size optimization

	vpand	$T7, $Y0, $T0		# out[0] = in[0] & mask;
	vpsrlq	\$29, $Y0, $Y0
	vpand	$T7, $Y0, $T1		# out[1] = (in[0] >> shift) & mask;
	vpsrlq	\$29, $Y0, $Y0
	vpsllq	\$6, $Y1, $T2
	vpxor	$Y0, $T2, $T2
	vpand	$T7, $T2, $T2		# out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
	vpsrlq	\$23, $Y1, $Y1
	vpand	$T7, $Y1, $T3		# out[3] = (in[1] >> ((shift*3)%64)) & mask;
	vpsrlq	\$29, $Y1, $Y1
	vpsllq	\$12, $Y2, $T4
	vpxor	$Y1, $T4, $T4
	vpand	$T7, $T4, $T4		# out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
	vpsrlq	\$17, $Y2, $Y2
	vpand	$T7, $Y2, $T5		# out[5] = (in[2] >> ((shift*5)%64)) & mask;
	vpsrlq	\$29, $Y2, $Y2
	vpsllq	\$18, $Y3, $T6
	vpxor	$Y2, $T6, $T6
	vpand	$T7, $T6, $T6		# out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
	vpsrlq	\$11, $Y3, $Y3
	vmovdqa	$T0, 32*9-448(%rax)
	vpand	$T7, $Y3, $T0		# out[7] = (in[3] >> ((shift*7)%64)) & mask;
	vpsrlq	\$29, $Y3, $Y3		# out[8] = (in[3] >> ((shift*8)%64)); top digit, no mask needed

	vmovdqa	$T1, 32*10-448(%rax)
	vmovdqa	$T2, 32*11-448(%rax)
	vmovdqa	$T3, 32*12-448(%rax)
	vmovdqa	$T4, 32*13-448(%rax)
	vmovdqa	$T5, 32*14-448(%rax)
	vmovdqa	$T6, 32*15-448(%rax)
	vmovdqa	$T0, 32*16-448(%rax)
	vmovdqa	$Y3, 32*17-448(%rax)
___
$code.=<<___ if ($win64);
	movaps	16*0(%rsp), %xmm6
	movaps	16*1(%rsp), %xmm7
	movaps	16*2(%rsp), %xmm8
	movaps	16*3(%rsp), %xmm9
	movaps	16*4(%rsp), %xmm10
	movaps	16*5(%rsp), %xmm11
	movaps	16*6(%rsp), %xmm12
	movaps	16*7(%rsp), %xmm13
	movaps	16*8(%rsp), %xmm14
	movaps	16*9(%rsp), %xmm15
	lea	8+16*10(%rsp), %rsp
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_transpose_convert,.-ecp_nistz256_avx2_transpose_convert
___

################################################################################
# This function receives a pointer to an array of four AVX2-formatted points
# (X, Y, Z), converts the data back to the normal representation, and
# rearranges it for standard point processing.
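# Inverse of the split above: with shift = 29 and fully normalized digits,
#	out[0] = in[0] + (in[1] << 29) + (in[2] << 58);
#	out[1] = (in[2] >>  6) + (in[3] << 23) + (in[4] << 52);
#	out[2] = (in[4] >> 12) + (in[5] << 17) + (in[6] << 46);
#	out[3] = (in[6] >> 18) + (in[7] << 11) + (in[8] << 40);
# Normalization guarantees these additions cannot carry out of 64 bits.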
my ($D0,$D1,$D2,$D3, $D4,$D5,$D6,$D7, $D8)=map("%ymm$_",(0..8));
my ($T0,$T1,$T2,$T3, $T4,$T5,$T6)=map("%ymm$_",(9..15));

$code.=<<___;
.globl	ecp_nistz256_avx2_convert_transpose_back
.type	ecp_nistz256_avx2_convert_transpose_back,\@function,2
.align	32
ecp_nistz256_avx2_convert_transpose_back:
	mov	%rsp, %rax
___
$code.=<<___ if ($win64);
	lea	-8-16*10(%rsp), %rsp
	vmovaps	%xmm6, -8-16*10(%rax)
	vmovaps	%xmm7, -8-16*9(%rax)
	vmovaps	%xmm8, -8-16*8(%rax)
	vmovaps	%xmm9, -8-16*7(%rax)
	vmovaps	%xmm10, -8-16*6(%rax)
	vmovaps	%xmm11, -8-16*5(%rax)
	vmovaps	%xmm12, -8-16*4(%rax)
	vmovaps	%xmm13, -8-16*3(%rax)
	vmovaps	%xmm14, -8-16*2(%rax)
	vmovaps	%xmm15, -8-16*1(%rax)
___
$code.=<<___;
	vmovdqa	32*0(%rsi), $D0
	lea	160(%rsi), %rax		# size optimization
	vmovdqa	32*1(%rsi), $D1
	vmovdqa	32*2(%rsi), $D2
	vmovdqa	32*3(%rsi), $D3
	vmovdqa	32*4-160(%rax), $D4
	vmovdqa	32*5-160(%rax), $D5
	vmovdqa	32*6-160(%rax), $D6
	vmovdqa	32*7-160(%rax), $D7
	vmovdqa	32*8-160(%rax), $D8

	vpsllq	\$29, $D1, $D1
	vpsllq	\$58, $D2, $T0
	vpaddq	$D1, $D0, $D0
	vpaddq	$T0, $D0, $D0		# out[0] = (in[0]) ^ (in[1] << shift*1) ^ (in[2] << shift*2);

	vpsrlq	\$6, $D2, $D2
	vpsllq	\$23, $D3, $D3
	vpsllq	\$52, $D4, $T1
	vpaddq	$D2, $D3, $D3
	vpaddq	$D3, $T1, $D1		# out[1] = (in[2] >> (64*1-shift*2)) ^ (in[3] << shift*3%64) ^ (in[4] << shift*4%64);

	vpsrlq	\$12, $D4, $D4
	vpsllq	\$17, $D5, $D5
	vpsllq	\$46, $D6, $T2
	vpaddq	$D4, $D5, $D5
	vpaddq	$D5, $T2, $D2		# out[2] = (in[4] >> (64*2-shift*4)) ^ (in[5] << shift*5%64) ^ (in[6] << shift*6%64);

	vpsrlq	\$18, $D6, $D6
	vpsllq	\$11, $D7, $D7
	vpsllq	\$40, $D8, $T3
	vpaddq	$D6, $D7, $D7
	vpaddq	$D7, $T3, $D3		# out[3] = (in[6] >> (64*3-shift*6)) ^ (in[7] << shift*7%64) ^ (in[8] << shift*8%64);
	vpunpcklqdq	$D1, $D0, $T0	# T0 = [B2 A2 B0 A0]
	vpunpcklqdq	$D3, $D2, $T1	# T1 = [D2 C2 D0 C0]
	vpunpckhqdq	$D1, $D0, $T2	# T2 = [B3 A3 B1 A1]
	vpunpckhqdq	$D3, $D2, $T3	# T3 = [D3 C3 D1 C1]

	vperm2i128	\$0x20, $T1, $T0, $D0	# X0 = [D0 C0 B0 A0]
	vperm2i128	\$0x20, $T3, $T2, $D1	# X1 = [D1 C1 B1 A1]
	vperm2i128	\$0x31, $T1, $T0, $D2	# X2 = [D2 C2 B2 A2]
	vperm2i128	\$0x31, $T3, $T2, $D3	# X3 = [D3 C3 B3 A3]

	vmovdqa	$D0, 32*0(%rdi)
	vmovdqa	$D1, 32*3(%rdi)
	vmovdqa	$D2, 32*6(%rdi)
	vmovdqa	$D3, 32*9(%rdi)
___
$code.=<<___ if ($win64);
	movaps	16*0(%rsp), %xmm6
	movaps	16*1(%rsp), %xmm7
	movaps	16*2(%rsp), %xmm8
	movaps	16*3(%rsp), %xmm9
	movaps	16*4(%rsp), %xmm10
	movaps	16*5(%rsp), %xmm11
	movaps	16*6(%rsp), %xmm12
	movaps	16*7(%rsp), %xmm13
	movaps	16*8(%rsp), %xmm14
	movaps	16*9(%rsp), %xmm15
	lea	8+16*10(%rsp), %rsp
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_convert_transpose_back,.-ecp_nistz256_avx2_convert_transpose_back
___

my ($r_ptr,$a_ptr,$b_ptr,$itr)=("%rdi","%rsi","%rdx","%ecx");
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4,$ACC5,$ACC6,$ACC7,$ACC8)=map("%ymm$_",(0..8));
my ($B,$Y,$T0,$AND_MASK,$OVERFLOW)=map("%ymm$_",(9..13));

$code.=<<___ if (0);	# inlined
	vpsrlq	$digit_size, $ACC0, $T0
	vpand	$AND_MASK, $ACC0, $ACC0
	vpaddq	$T0, $ACC1, $ACC1

	vpsrlq	$digit_size, $ACC1, $T0
	vpand	$AND_MASK, $ACC1, $ACC1
	vpaddq	$T0, $ACC2, $ACC2

	vpsrlq	$digit_size, $ACC2, $T0
	vpand	$AND_MASK, $ACC2, $ACC2
	vpaddq	$T0, $ACC3, $ACC3

	vpsrlq	$digit_size, $ACC3, $T0
	vpand	$AND_MASK, $ACC3, $ACC3
	vpaddq	$T0, $ACC4, $ACC4

	vpsrlq	$digit_size, $ACC4, $T0
	vpand	$AND_MASK, $ACC4, $ACC4
	vpaddq	$T0, $ACC5, $ACC5

	vpsrlq	$digit_size, $ACC5, $T0
	vpand	$AND_MASK, $ACC5, $ACC5
	vpaddq	$T0, $ACC6, $ACC6

	vpsrlq	$digit_size, $ACC6, $T0
	vpand	$AND_MASK, $ACC6, $ACC6
	vpaddq	$T0, $ACC7, $ACC7

	vpsrlq	$digit_size, $ACC7, $T0
	vpand	$AND_MASK, $ACC7, $ACC7
	vpaddq	$T0, $ACC8, $ACC8
	#vpand	$AND_MASK, $ACC8, $ACC8

	vmovdqa	$ACC0, 32*0(%rdi)
	lea	160(%rdi), %rax		# size optimization
	vmovdqa	$ACC1, 32*1(%rdi)
	vmovdqa	$ACC2, 32*2(%rdi)
	vmovdqa	$ACC3, 32*3(%rdi)
	vmovdqa	$ACC4, 32*4-160(%rax)
	vmovdqa	$ACC5, 32*5-160(%rax)
	vmovdqa	$ACC6, 32*6-160(%rax)
	vmovdqa	$ACC7, 32*7-160(%rax)
	vmovdqa	$ACC8, 32*8-160(%rax)
___
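# avx2_normalize: one carry-propagation pass over the nine digits.  After
# additions/multiplications a digit may exceed 29 bits; every step moves the
# high part of digit i up into digit i+1:
#	T0 = ACC_i >> 29; ACC_i &= mask; ACC_{i+1} += T0;
# The top digit is left unmasked, which the redundant form tolerates.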
$code.=<<___;
.type	avx2_normalize,\@abi-omnipotent
.align	32
avx2_normalize:
	vpsrlq	$digit_size, $ACC0, $T0
	vpand	$AND_MASK, $ACC0, $ACC0
	vpaddq	$T0, $ACC1, $ACC1

	vpsrlq	$digit_size, $ACC1, $T0
	vpand	$AND_MASK, $ACC1, $ACC1
	vpaddq	$T0, $ACC2, $ACC2

	vpsrlq	$digit_size, $ACC2, $T0
	vpand	$AND_MASK, $ACC2, $ACC2
	vpaddq	$T0, $ACC3, $ACC3

	vpsrlq	$digit_size, $ACC3, $T0
	vpand	$AND_MASK, $ACC3, $ACC3
	vpaddq	$T0, $ACC4, $ACC4

	vpsrlq	$digit_size, $ACC4, $T0
	vpand	$AND_MASK, $ACC4, $ACC4
	vpaddq	$T0, $ACC5, $ACC5

	vpsrlq	$digit_size, $ACC5, $T0
	vpand	$AND_MASK, $ACC5, $ACC5
	vpaddq	$T0, $ACC6, $ACC6

	vpsrlq	$digit_size, $ACC6, $T0
	vpand	$AND_MASK, $ACC6, $ACC6
	vpaddq	$T0, $ACC7, $ACC7

	vpsrlq	$digit_size, $ACC7, $T0
	vpand	$AND_MASK, $ACC7, $ACC7
	vpaddq	$T0, $ACC8, $ACC8
	#vpand	$AND_MASK, $ACC8, $ACC8

	ret
.size	avx2_normalize,.-avx2_normalize

.type	avx2_normalize_n_store,\@abi-omnipotent
.align	32
avx2_normalize_n_store:
	vpsrlq	$digit_size, $ACC0, $T0
	vpand	$AND_MASK, $ACC0, $ACC0
	vpaddq	$T0, $ACC1, $ACC1

	vpsrlq	$digit_size, $ACC1, $T0
	vpand	$AND_MASK, $ACC1, $ACC1
	vmovdqa	$ACC0, 32*0(%rdi)
	lea	160(%rdi), %rax		# size optimization
	vpaddq	$T0, $ACC2, $ACC2

	vpsrlq	$digit_size, $ACC2, $T0
	vpand	$AND_MASK, $ACC2, $ACC2
	vmovdqa	$ACC1, 32*1(%rdi)
	vpaddq	$T0, $ACC3, $ACC3

	vpsrlq	$digit_size, $ACC3, $T0
	vpand	$AND_MASK, $ACC3, $ACC3
	vmovdqa	$ACC2, 32*2(%rdi)
	vpaddq	$T0, $ACC4, $ACC4

	vpsrlq	$digit_size, $ACC4, $T0
	vpand	$AND_MASK, $ACC4, $ACC4
	vmovdqa	$ACC3, 32*3(%rdi)
	vpaddq	$T0, $ACC5, $ACC5

	vpsrlq	$digit_size, $ACC5, $T0
	vpand	$AND_MASK, $ACC5, $ACC5
	vmovdqa	$ACC4, 32*4-160(%rax)
	vpaddq	$T0, $ACC6, $ACC6

	vpsrlq	$digit_size, $ACC6, $T0
	vpand	$AND_MASK, $ACC6, $ACC6
	vmovdqa	$ACC5, 32*5-160(%rax)
	vpaddq	$T0, $ACC7, $ACC7

	vpsrlq	$digit_size, $ACC7, $T0
	vpand	$AND_MASK, $ACC7, $ACC7
	vmovdqa	$ACC6, 32*6-160(%rax)
	vpaddq	$T0, $ACC8, $ACC8
	#vpand	$AND_MASK, $ACC8, $ACC8
	vmovdqa	$ACC7, 32*7-160(%rax)
	vmovdqa	$ACC8, 32*8-160(%rax)

	ret
.size	avx2_normalize_n_store,.-avx2_normalize_n_store
################################################################################
# void avx2_mul_x4(void* RESULTx4, void *Ax4, void *Bx4);
.type	avx2_mul_x4,\@abi-omnipotent
.align	32
avx2_mul_x4:
	lea	.LAVX2_POLY(%rip), %rax

	vpxor	$ACC0, $ACC0, $ACC0
	vpxor	$ACC1, $ACC1, $ACC1
	vpxor	$ACC2, $ACC2, $ACC2
	vpxor	$ACC3, $ACC3, $ACC3
	vpxor	$ACC4, $ACC4, $ACC4
	vpxor	$ACC5, $ACC5, $ACC5
	vpxor	$ACC6, $ACC6, $ACC6
	vpxor	$ACC7, $ACC7, $ACC7

	vmovdqa	32*7(%rax), %ymm14
	vmovdqa	32*8(%rax), %ymm15

	mov	\$9, $itr
	lea	-512($a_ptr), $a_ptr	# strategic bias to control u-op density
	jmp	.Lavx2_mul_x4_loop

.align	32
.Lavx2_mul_x4_loop:
	vmovdqa	32*0($b_ptr), $B
	lea	32*1($b_ptr), $b_ptr

	vpmuludq	32*0+512($a_ptr), $B, $T0
	vpmuludq	32*1+512($a_ptr), $B, $OVERFLOW	# borrow $OVERFLOW
	vpaddq	$T0, $ACC0, $ACC0
	vpmuludq	32*2+512($a_ptr), $B, $T0
	vpaddq	$OVERFLOW, $ACC1, $ACC1
	vpand	$AND_MASK, $ACC0, $Y
	vpmuludq	32*3+512($a_ptr), $B, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC2
	vpmuludq	32*4+512($a_ptr), $B, $T0
	vpaddq	$OVERFLOW, $ACC3, $ACC3
	vpmuludq	32*5+512($a_ptr), $B, $OVERFLOW
	vpaddq	$T0, $ACC4, $ACC4
	vpmuludq	32*6+512($a_ptr), $B, $T0
	vpaddq	$OVERFLOW, $ACC5, $ACC5
	vpmuludq	32*7+512($a_ptr), $B, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC6

	# Skip some multiplications, optimizing for the constant poly
	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC7
	vpmuludq	32*8+512($a_ptr), $B, $ACC8
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vmovdqa	$ACC4, $ACC3		# poly digits 4 and 5 are zero
	vmovdqa	$ACC5, $ACC4

	vpsllq	\$18, $Y, $OVERFLOW
	vpmuludq	%ymm14, $Y, $T0
	vpaddq	$OVERFLOW, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $OVERFLOW
	vpaddq	$T0, $ACC7, $ACC6
	vpaddq	$OVERFLOW, $ACC8, $ACC7

	dec	$itr
	jnz	.Lavx2_mul_x4_loop

	vpxor	$ACC8, $ACC8, $ACC8

	ret
.size	avx2_mul_x4,.-avx2_mul_x4
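# Each .Lavx2_mul_x4_loop iteration multiplies the next digit of B into the
# accumulator (schoolbook), then retires one digit: with Y = ACC0 & mask,
# adding Y*p makes the low 29 bits cancel and the whole accumulator shifts
# down one slot.  Since p's digits are mostly 2^29-1 (and two are zero),
# Y*p costs two real multiplications (digits 7 and 8, kept in ymm14/ymm15)
# plus shifts and adds.  After nine iterations the result is a*b*2^-261 mod p.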
# Function optimized for the constant 1
################################################################################
# void avx2_mul_by1_x4(void* RESULTx4, void *Ax4);
.type	avx2_mul_by1_x4,\@abi-omnipotent
.align	32
avx2_mul_by1_x4:
	lea	.LAVX2_POLY(%rip), %rax

	vpxor	$ACC0, $ACC0, $ACC0
	vpxor	$ACC1, $ACC1, $ACC1
	vpxor	$ACC2, $ACC2, $ACC2
	vpxor	$ACC3, $ACC3, $ACC3
	vpxor	$ACC4, $ACC4, $ACC4
	vpxor	$ACC5, $ACC5, $ACC5
	vpxor	$ACC6, $ACC6, $ACC6
	vpxor	$ACC7, $ACC7, $ACC7
	vpxor	$ACC8, $ACC8, $ACC8

	vmovdqa	32*3+.LONE(%rip), %ymm14
	vmovdqa	32*7+.LONE(%rip), %ymm15

	mov	\$9, $itr
	jmp	.Lavx2_mul_by1_x4_loop

.align	32
.Lavx2_mul_by1_x4_loop:
	vmovdqa	32*0($a_ptr), $B
	.byte	0x48,0x8d,0xb6,0x20,0,0,0	# lea 32*1($a_ptr), $a_ptr

	vpsllq	\$5, $B, $OVERFLOW
	vpmuludq	%ymm14, $B, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC3

	vpmuludq	$AND_MASK, $B, $T0
	vpand	$AND_MASK, $ACC0, $Y
	vpaddq	$T0, $ACC4, $ACC4
	vpaddq	$T0, $ACC5, $ACC5
	vpaddq	$T0, $ACC6, $ACC6
	vpsllq	\$23, $B, $T0

	vpmuludq	%ymm15, $B, $OVERFLOW
	vpsubq	$T0, $ACC6, $ACC6

	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC7
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0

	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vmovdqa	$ACC4, $ACC3

	vpsllq	\$18, $Y, $OVERFLOW
	vmovdqa	$ACC5, $ACC4
	vpmuludq	32*7(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC6, $ACC5
	vpaddq	$T0, $ACC7, $ACC6
	vpmuludq	32*8(%rax), $Y, $ACC7

	dec	$itr
	jnz	.Lavx2_mul_by1_x4_loop

	ret
.size	avx2_mul_by1_x4,.-avx2_mul_by1_x4
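# .LONE is 1 in this format (2^261 mod p), so this is simply a reduction
# pass: a * 1 * 2^-261 * 2^261 = a (mod p).  ONE's digits are powers of two
# or near-mask values, so most per-digit products collapse into shifts, adds
# and one subtract, with only the two "real" digits kept in ymm14/ymm15.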
################################################################################
# void avx2_sqr_x4(void* RESULTx4, void *Ax4);
.type	avx2_sqr_x4,\@abi-omnipotent
.align	32
avx2_sqr_x4:
	lea	.LAVX2_POLY(%rip), %rax

	vmovdqa	32*7(%rax), %ymm14
	vmovdqa	32*8(%rax), %ymm15

	vmovdqa	32*0($a_ptr), $B
	vmovdqa	32*1($a_ptr), $ACC1
	vmovdqa	32*2($a_ptr), $ACC2
	vmovdqa	32*3($a_ptr), $ACC3
	vmovdqa	32*4($a_ptr), $ACC4
	vmovdqa	32*5($a_ptr), $ACC5
	vmovdqa	32*6($a_ptr), $ACC6
	vmovdqa	32*7($a_ptr), $ACC7
	vpaddq	$ACC1, $ACC1, $ACC1	# 2*a[1..8], the doubled cross-term operands
	vmovdqa	32*8($a_ptr), $ACC8
	vpaddq	$ACC2, $ACC2, $ACC2
	vmovdqa	$ACC1, 32*0(%rcx)
	vpaddq	$ACC3, $ACC3, $ACC3
	vmovdqa	$ACC2, 32*1(%rcx)
	vpaddq	$ACC4, $ACC4, $ACC4
	vmovdqa	$ACC3, 32*2(%rcx)
	vpaddq	$ACC5, $ACC5, $ACC5
	vmovdqa	$ACC4, 32*3(%rcx)
	vpaddq	$ACC6, $ACC6, $ACC6
	vmovdqa	$ACC5, 32*4(%rcx)
	vpaddq	$ACC7, $ACC7, $ACC7
	vmovdqa	$ACC6, 32*5(%rcx)
	vpaddq	$ACC8, $ACC8, $ACC8
	vmovdqa	$ACC7, 32*6(%rcx)
	vmovdqa	$ACC8, 32*7(%rcx)

	vpmuludq	$B, $B, $ACC0
	vpmuludq	$B, $ACC1, $ACC1
	vpand	$AND_MASK, $ACC0, $Y
	vpmuludq	$B, $ACC2, $ACC2
	vpmuludq	$B, $ACC3, $ACC3
	vpmuludq	$B, $ACC4, $ACC4
	vpmuludq	$B, $ACC5, $ACC5
	vpmuludq	$B, $ACC6, $ACC6
	vpmuludq	$AND_MASK, $Y, $T0
	vpmuludq	$B, $ACC7, $ACC7
	vpmuludq	$B, $ACC8, $ACC8
	vmovdqa	32*1($a_ptr), $B

	vpaddq	$T0, $ACC0, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vpsllq	\$18, $Y, $T0
	vmovdqa	$ACC4, $ACC3
	vmovdqa	$ACC5, $ACC4

	vpmuludq	%ymm14, $Y, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC6
	vpaddq	$T0, $ACC8, $ACC7

	vpmuludq	$B, $B, $OVERFLOW
	vpand	$AND_MASK, $ACC0, $Y
	vpmuludq	32*1(%rcx), $B, $T0
	vpaddq	$OVERFLOW, $ACC1, $ACC1
	vpmuludq	32*2(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC2
	vpmuludq	32*3(%rcx), $B, $T0
	vpaddq	$OVERFLOW, $ACC3, $ACC3
	vpmuludq	32*4(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC4, $ACC4
	vpmuludq	32*5(%rcx), $B, $T0
	vpaddq	$OVERFLOW, $ACC5, $ACC5
	vpmuludq	32*6(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC6

	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC7
	vpmuludq	32*7(%rcx), $B, $ACC8
	vmovdqa	32*2($a_ptr), $B
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vpsllq	\$18, $Y, $T0
	vmovdqa	$ACC4, $ACC3
	vmovdqa	$ACC5, $ACC4

	vpmuludq	%ymm14, $Y, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC6
	vpaddq	$T0, $ACC8, $ACC7

	vpmuludq	$B, $B, $T0
	vpand	$AND_MASK, $ACC0, $Y
	vpmuludq	32*2(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC2
	vpmuludq	32*3(%rcx), $B, $T0
	vpaddq	$OVERFLOW, $ACC3, $ACC3
	vpmuludq	32*4(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC4, $ACC4
	vpmuludq	32*5(%rcx), $B, $T0
	vpaddq	$OVERFLOW, $ACC5, $ACC5
	vpmuludq	32*6(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC6

	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC7
	vpmuludq	32*7(%rcx), $B, $ACC8
	vmovdqa	32*3($a_ptr), $B
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vpsllq	\$18, $Y, $T0
	vmovdqa	$ACC4, $ACC3
	vmovdqa	$ACC5, $ACC4

	vpmuludq	%ymm14, $Y, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $T0
	vpand	$AND_MASK, $ACC0, $Y
	vpaddq	$OVERFLOW, $ACC7, $ACC6
	vpaddq	$T0, $ACC8, $ACC7

	vpmuludq	$B, $B, $OVERFLOW
	vpmuludq	32*3(%rcx), $B, $T0
	vpaddq	$OVERFLOW, $ACC3, $ACC3
	vpmuludq	32*4(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC4, $ACC4
	vpmuludq	32*5(%rcx), $B, $T0
	vpaddq	$OVERFLOW, $ACC5, $ACC5
	vpmuludq	32*6(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC6

	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC7
	vpmuludq	32*7(%rcx), $B, $ACC8
	vmovdqa	32*4($a_ptr), $B
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vpsllq	\$18, $Y, $T0
	vmovdqa	$ACC4, $ACC3
	vmovdqa	$ACC5, $ACC4

	vpmuludq	%ymm14, $Y, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $T0
	vpand	$AND_MASK, $ACC0, $Y
	vpaddq	$OVERFLOW, $ACC7, $ACC6
	vpaddq	$T0, $ACC8, $ACC7

	vpmuludq	$B, $B, $T0
	vpmuludq	32*4(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC4, $ACC4
	vpmuludq	32*5(%rcx), $B, $T0
	vpaddq	$OVERFLOW, $ACC5, $ACC5
	vpmuludq	32*6(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC6

	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC7
	vpmuludq	32*7(%rcx), $B, $ACC8
	vmovdqa	32*5($a_ptr), $B
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3+.LAVX2_POLY(%rip), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vpsllq	\$18, $Y, $T0
	vmovdqa	$ACC4, $ACC3
	vmovdqa	$ACC5, $ACC4

	vpmuludq	%ymm14, $Y, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $T0
	vpand	$AND_MASK, $ACC0, $Y
	vpaddq	$OVERFLOW, $ACC7, $ACC6
	vpaddq	$T0, $ACC8, $ACC7

	vpmuludq	$B, $B, $OVERFLOW
	vpmuludq	32*5(%rcx), $B, $T0
	vpaddq	$OVERFLOW, $ACC5, $ACC5
	vpmuludq	32*6(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC6

	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC7
	vpmuludq	32*7(%rcx), $B, $ACC8
	vmovdqa	32*6($a_ptr), $B
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vpsllq	\$18, $Y, $T0
	vmovdqa	$ACC4, $ACC3
	vmovdqa	$ACC5, $ACC4

	vpmuludq	%ymm14, $Y, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $T0
	vpand	$AND_MASK, $ACC0, $Y
	vpaddq	$OVERFLOW, $ACC7, $ACC6
	vpaddq	$T0, $ACC8, $ACC7

	vpmuludq	$B, $B, $T0
	vpmuludq	32*6(%rcx), $B, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC6

	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC7
	vpmuludq	32*7(%rcx), $B, $ACC8
	vmovdqa	32*7($a_ptr), $B
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vpsllq	\$18, $Y, $T0
	vmovdqa	$ACC4, $ACC3
	vmovdqa	$ACC5, $ACC4

	vpmuludq	%ymm14, $Y, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $T0
	vpand	$AND_MASK, $ACC0, $Y
	vpaddq	$OVERFLOW, $ACC7, $ACC6
	vpaddq	$T0, $ACC8, $ACC7

	vpmuludq	$B, $B, $OVERFLOW

	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC7
	vpmuludq	32*7(%rcx), $B, $ACC8
	vmovdqa	32*8($a_ptr), $B
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vpsllq	\$18, $Y, $T0
	vmovdqa	$ACC4, $ACC3
	vmovdqa	$ACC5, $ACC4

	vpmuludq	%ymm14, $Y, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $T0
	vpand	$AND_MASK, $ACC0, $Y
	vpaddq	$OVERFLOW, $ACC7, $ACC6
	vpaddq	$T0, $ACC8, $ACC7

	vpmuludq	$B, $B, $ACC8

	vpmuludq	$AND_MASK, $Y, $T0
	vpaddq	$T0, $ACC0, $OVERFLOW
	vpsrlq	$digit_size, $OVERFLOW, $OVERFLOW
	vpaddq	$T0, $ACC1, $ACC0
	vpaddq	$T0, $ACC2, $ACC1
	vpmuludq	32*3(%rax), $Y, $T0
	vpaddq	$OVERFLOW, $ACC0, $ACC0
	vpaddq	$T0, $ACC3, $ACC2
	vpsllq	\$18, $Y, $T0
	vmovdqa	$ACC4, $ACC3
	vmovdqa	$ACC5, $ACC4

	vpmuludq	%ymm14, $Y, $OVERFLOW
	vpaddq	$T0, $ACC6, $ACC5
	vpmuludq	%ymm15, $Y, $T0
	vpaddq	$OVERFLOW, $ACC7, $ACC6
	vpaddq	$T0, $ACC8, $ACC7

	vpxor	$ACC8, $ACC8, $ACC8

	ret
.size	avx2_sqr_x4,.-avx2_sqr_x4
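# Squaring reuses the multiplication structure: the doubled operands
# 2*a[1..8] stored at (%rcx) supply each cross product a[i]*a[j] (i < j)
# exactly once, the diagonal a[i]^2 terms are added separately, and every
# iteration folds one digit of reduction exactly as in avx2_mul_x4.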
################################################################################
# void avx2_sub_x4(void* RESULTx4, void *Ax4, void *Bx4);
.type	avx2_sub_x4,\@abi-omnipotent
.align	32
avx2_sub_x4:
	vmovdqa	32*0($a_ptr), $ACC0
	lea	160($a_ptr), $a_ptr
	lea	.LAVX2_POLY_x8+128(%rip), %rax
	lea	128($b_ptr), $b_ptr
	vmovdqa	32*1-160($a_ptr), $ACC1
	vmovdqa	32*2-160($a_ptr), $ACC2
	vmovdqa	32*3-160($a_ptr), $ACC3
	vmovdqa	32*4-160($a_ptr), $ACC4
	vmovdqa	32*5-160($a_ptr), $ACC5
	vmovdqa	32*6-160($a_ptr), $ACC6
	vmovdqa	32*7-160($a_ptr), $ACC7
	vmovdqa	32*8-160($a_ptr), $ACC8

	vpaddq	32*0-128(%rax), $ACC0, $ACC0
	vpaddq	32*1-128(%rax), $ACC1, $ACC1
	vpaddq	32*2-128(%rax), $ACC2, $ACC2
	vpaddq	32*3-128(%rax), $ACC3, $ACC3
	vpaddq	32*4-128(%rax), $ACC4, $ACC4
	vpaddq	32*5-128(%rax), $ACC5, $ACC5
	vpaddq	32*6-128(%rax), $ACC6, $ACC6
	vpaddq	32*7-128(%rax), $ACC7, $ACC7
	vpaddq	32*8-128(%rax), $ACC8, $ACC8

	vpsubq	32*0-128($b_ptr), $ACC0, $ACC0
	vpsubq	32*1-128($b_ptr), $ACC1, $ACC1
	vpsubq	32*2-128($b_ptr), $ACC2, $ACC2
	vpsubq	32*3-128($b_ptr), $ACC3, $ACC3
	vpsubq	32*4-128($b_ptr), $ACC4, $ACC4
	vpsubq	32*5-128($b_ptr), $ACC5, $ACC5
	vpsubq	32*6-128($b_ptr), $ACC6, $ACC6
	vpsubq	32*7-128($b_ptr), $ACC7, $ACC7
	vpsubq	32*8-128($b_ptr), $ACC8, $ACC8

	ret
.size	avx2_sub_x4,.-avx2_sub_x4
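# Subtraction is computed as a + 8*p - b: adding the digit-wise padded
# .LAVX2_POLY_x8 first keeps every 64-bit lane non-negative (the inputs may
# sit a few bits above normalized) without changing the value mod p.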
.type	avx2_select_n_store,\@abi-omnipotent
.align	32
avx2_select_n_store:
	vmovdqa	`8+32*9*8`(%rsp), $Y
	vpor	`8+32*9*8+32`(%rsp), $Y, $Y

	vpandn	$ACC0, $Y, $ACC0
	vpandn	$ACC1, $Y, $ACC1
	vpandn	$ACC2, $Y, $ACC2
	vpandn	$ACC3, $Y, $ACC3
	vpandn	$ACC4, $Y, $ACC4
	vpandn	$ACC5, $Y, $ACC5
	vpandn	$ACC6, $Y, $ACC6
	vmovdqa	`8+32*9*8+32`(%rsp), $B
	vpandn	$ACC7, $Y, $ACC7
	vpandn	`8+32*9*8`(%rsp), $B, $B
	vpandn	$ACC8, $Y, $ACC8

	vpand	32*0(%rsi), $B, $T0
	lea	160(%rsi), %rax		# size optimization
	vpand	32*1(%rsi), $B, $Y
	vpxor	$T0, $ACC0, $ACC0
	vpand	32*2(%rsi), $B, $T0
	vpxor	$Y, $ACC1, $ACC1
	vpand	32*3(%rsi), $B, $Y
	vpxor	$T0, $ACC2, $ACC2
	vpand	32*4-160(%rax), $B, $T0
	vpxor	$Y, $ACC3, $ACC3
	vpand	32*5-160(%rax), $B, $Y
	vpxor	$T0, $ACC4, $ACC4
	vpand	32*6-160(%rax), $B, $T0
	vpxor	$Y, $ACC5, $ACC5
	vpand	32*7-160(%rax), $B, $Y
	vpxor	$T0, $ACC6, $ACC6
	vpand	32*8-160(%rax), $B, $T0
	vmovdqa	`8+32*9*8+32`(%rsp), $B
	vpxor	$Y, $ACC7, $ACC7

	vpand	32*0(%rdx), $B, $Y
	lea	160(%rdx), %rax		# size optimization
	vpxor	$T0, $ACC8, $ACC8
	vpand	32*1(%rdx), $B, $T0
	vpxor	$Y, $ACC0, $ACC0
	vpand	32*2(%rdx), $B, $Y
	vpxor	$T0, $ACC1, $ACC1
	vpand	32*3(%rdx), $B, $T0
	vpxor	$Y, $ACC2, $ACC2
	vpand	32*4-160(%rax), $B, $Y
	vpxor	$T0, $ACC3, $ACC3
	vpand	32*5-160(%rax), $B, $T0
	vpxor	$Y, $ACC4, $ACC4
	vpand	32*6-160(%rax), $B, $Y
	vpxor	$T0, $ACC5, $ACC5
	vpand	32*7-160(%rax), $B, $T0
	vpxor	$Y, $ACC6, $ACC6
	vpand	32*8-160(%rax), $B, $Y
	vpxor	$T0, $ACC7, $ACC7
	vpxor	$Y, $ACC8, $ACC8

	jmp	avx2_normalize_n_store	# normalize and store the selection
.size	avx2_select_n_store,.-avx2_select_n_store
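# Branch-free selection using the infinity masks prepared by the callers:
# $Y = maskA | maskB clears the computed result unless both inputs were
# finite; the (%rsi) alternative is masked in when only A was at infinity,
# the (%rdx) alternative when B was.  Everything is AND/ANDN/XOR, so the
# timing is independent of the point values.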
$code.=<<___ if (0);	# inlined
################################################################################
# void avx2_mul_by2_x4(void* RESULTx4, void *Ax4);
.type	avx2_mul_by2_x4,\@abi-omnipotent
.align	32
avx2_mul_by2_x4:
	vmovdqa	32*0($a_ptr), $ACC0
	lea	160($a_ptr), %rax	# size optimization
	vmovdqa	32*1($a_ptr), $ACC1
	vmovdqa	32*2($a_ptr), $ACC2
	vmovdqa	32*3($a_ptr), $ACC3
	vmovdqa	32*4-160(%rax), $ACC4
	vmovdqa	32*5-160(%rax), $ACC5
	vmovdqa	32*6-160(%rax), $ACC6
	vmovdqa	32*7-160(%rax), $ACC7
	vmovdqa	32*8-160(%rax), $ACC8

	vpaddq	$ACC0, $ACC0, $ACC0
	vpaddq	$ACC1, $ACC1, $ACC1
	vpaddq	$ACC2, $ACC2, $ACC2
	vpaddq	$ACC3, $ACC3, $ACC3
	vpaddq	$ACC4, $ACC4, $ACC4
	vpaddq	$ACC5, $ACC5, $ACC5
	vpaddq	$ACC6, $ACC6, $ACC6
	vpaddq	$ACC7, $ACC7, $ACC7
	vpaddq	$ACC8, $ACC8, $ACC8

	ret
.size	avx2_mul_by2_x4,.-avx2_mul_by2_x4
___

my ($r_ptr_in,$a_ptr_in,$b_ptr_in)=("%rdi","%rsi","%rdx");
my ($r_ptr,$a_ptr,$b_ptr)=("%r8","%r9","%r10");
$code.=<<___;
################################################################################
# void ecp_nistz256_avx2_point_add_affine_x4(void* RESULTx4, void *Ax4, void *Bx4);
.globl	ecp_nistz256_avx2_point_add_affine_x4
.type	ecp_nistz256_avx2_point_add_affine_x4,\@function,3
.align	32
ecp_nistz256_avx2_point_add_affine_x4:
	mov	%rsp, %rax
	push	%rbp
	mov	%rsp, %rbp
___
$code.=<<___ if ($win64);
	lea	-16*10(%rsp), %rsp
	vmovaps	%xmm6, -8-16*10(%rax)
	vmovaps	%xmm7, -8-16*9(%rax)
	vmovaps	%xmm8, -8-16*8(%rax)
	vmovaps	%xmm9, -8-16*7(%rax)
	vmovaps	%xmm10, -8-16*6(%rax)
	vmovaps	%xmm11, -8-16*5(%rax)
	vmovaps	%xmm12, -8-16*4(%rax)
	vmovaps	%xmm13, -8-16*3(%rax)
	vmovaps	%xmm14, -8-16*2(%rax)
	vmovaps	%xmm15, -8-16*1(%rax)
___
$code.=<<___;
	# Result + 32*0  = Result.X
	# Result + 32*9  = Result.Y
	# Result + 32*18 = Result.Z

	# Stack temporaries (32*9 bytes each):
	# rsp+32*9*0 = U2, rsp+32*9*1 = S2, rsp+32*9*2 = Z1^2,
	# rsp+32*9*3 = H, rsp+32*9*4 = R, rsp+32*9*5 = H^2 (later 2*U2),
	# rsp+32*9*6 = R^2, rsp+32*9*7 = H^3,
	# rsp+32*9*8 = infinity masks, rsp+32*9*8+32*2 = scratch for sqr
	sub	\$`32*9*8+32*2+32*8`, %rsp
	mov	$r_ptr_in, $r_ptr
	mov	$a_ptr_in, $a_ptr
	mov	$b_ptr_in, $b_ptr

	vmovdqa	32*0($a_ptr_in), %ymm0
	vmovdqa	.LAVX2_AND_MASK(%rip), $AND_MASK
	vpxor	%ymm1, %ymm1, %ymm1
	lea	256($a_ptr_in), %rax	# size optimization
	vpor	32*1($a_ptr_in), %ymm0, %ymm0
	vpor	32*2($a_ptr_in), %ymm0, %ymm0
	vpor	32*3($a_ptr_in), %ymm0, %ymm0
	vpor	32*4-256(%rax), %ymm0, %ymm0
	lea	256(%rax), %rcx		# size optimization
	vpor	32*5-256(%rax), %ymm0, %ymm0
	vpor	32*6-256(%rax), %ymm0, %ymm0
	vpor	32*7-256(%rax), %ymm0, %ymm0
	vpor	32*8-256(%rax), %ymm0, %ymm0
	vpor	32*9-256(%rax), %ymm0, %ymm0
	vpor	32*10-256(%rax), %ymm0, %ymm0
	vpor	32*11-256(%rax), %ymm0, %ymm0
	vpor	32*12-512(%rcx), %ymm0, %ymm0
	vpor	32*13-512(%rcx), %ymm0, %ymm0
	vpor	32*14-512(%rcx), %ymm0, %ymm0
	vpor	32*15-512(%rcx), %ymm0, %ymm0
	vpor	32*16-512(%rcx), %ymm0, %ymm0
	vpor	32*17-512(%rcx), %ymm0, %ymm0
	vpcmpeqq %ymm1, %ymm0, %ymm0
	vmovdqa	%ymm0, `32*9*8`(%rsp)

	vpxor	%ymm1, %ymm1, %ymm1
	vmovdqa	32*0($b_ptr), %ymm0
	lea	256($b_ptr), %rax	# size optimization
	vpor	32*1($b_ptr), %ymm0, %ymm0
	vpor	32*2($b_ptr), %ymm0, %ymm0
	vpor	32*3($b_ptr), %ymm0, %ymm0
	vpor	32*4-256(%rax), %ymm0, %ymm0
	lea	256(%rax), %rcx		# size optimization
	vpor	32*5-256(%rax), %ymm0, %ymm0
	vpor	32*6-256(%rax), %ymm0, %ymm0
	vpor	32*7-256(%rax), %ymm0, %ymm0
	vpor	32*8-256(%rax), %ymm0, %ymm0
	vpor	32*9-256(%rax), %ymm0, %ymm0
	vpor	32*10-256(%rax), %ymm0, %ymm0
	vpor	32*11-256(%rax), %ymm0, %ymm0
	vpor	32*12-512(%rcx), %ymm0, %ymm0
	vpor	32*13-512(%rcx), %ymm0, %ymm0
	vpor	32*14-512(%rcx), %ymm0, %ymm0
	vpor	32*15-512(%rcx), %ymm0, %ymm0
	vpor	32*16-512(%rcx), %ymm0, %ymm0
	vpor	32*17-512(%rcx), %ymm0, %ymm0
	vpcmpeqq %ymm1, %ymm0, %ymm0
	vmovdqa	%ymm0, `32*9*8+32`(%rsp)
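	# A point of the batch is at infinity iff all digits of its X and Y
	# are zero.  OR-ing the 18 digit vectors and comparing with zero
	# yields one all-ones/all-zeros qword mask per lane; the masks for
	# input A and input B are parked at the top of the frame for the
	# constant-time selects below.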
	# Z1^2
	lea	`32*9*2`($a_ptr), %rsi
	lea	`32*9*2`(%rsp), %rdi
	lea	`32*9*8+32*2`(%rsp), %rcx	# temporary vector
	call	avx2_sqr_x4
	call	avx2_normalize_n_store

	# U2 = X2*Z1^2
	lea	`32*9*0`($b_ptr), %rsi
	lea	`32*9*2`(%rsp), %rdx
	lea	`32*9*0`(%rsp), %rdi
	call	avx2_mul_x4
	#call	avx2_normalize
	call	avx2_normalize_n_store

	# S2 = Z1*Z1^2 = Z1^3
	lea	`32*9*2`($a_ptr), %rsi
	lea	`32*9*2`(%rsp), %rdx
	lea	`32*9*1`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	# S2 = S2*Y2 = Y2*Z1^3
	lea	`32*9*1`($b_ptr), %rsi
	lea	`32*9*1`(%rsp), %rdx
	lea	`32*9*1`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	# H = U2 - U1 = U2 - X1
	lea	`32*9*0`(%rsp), %rsi
	lea	`32*9*0`($a_ptr), %rdx
	lea	`32*9*3`(%rsp), %rdi
	call	avx2_sub_x4
	call	avx2_normalize_n_store

	# R = S2 - S1 = S2 - Y1
	lea	`32*9*1`(%rsp), %rsi
	lea	`32*9*1`($a_ptr), %rdx
	lea	`32*9*4`(%rsp), %rdi
	call	avx2_sub_x4
	call	avx2_normalize_n_store

	# Z3 = H*Z1
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*2`($a_ptr), %rdx
	lea	`32*9*2`($r_ptr), %rdi
	call	avx2_mul_x4

	# Z3 select: A at infinity => Z3 = 1, B at infinity => Z3 = Z1
	lea	.LONE(%rip), %rsi
	lea	`32*9*2`($a_ptr), %rdx
	call	avx2_select_n_store

	# R^2
	lea	`32*9*4`(%rsp), %rsi
	lea	`32*9*6`(%rsp), %rdi
	lea	`32*9*8+32*2`(%rsp), %rcx	# temporary vector
	call	avx2_sqr_x4
	call	avx2_normalize_n_store

	# H^2
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*5`(%rsp), %rdi
	call	avx2_sqr_x4
	call	avx2_normalize_n_store

	# H^3 = H*H^2
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*5`(%rsp), %rdx
	lea	`32*9*7`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	# U2 = X1*H^2
	lea	`32*9*0`($a_ptr), %rsi
	lea	`32*9*5`(%rsp), %rdx
	lea	`32*9*0`(%rsp), %rdi
	call	avx2_mul_x4
	#call	avx2_normalize
	call	avx2_normalize_n_store

	# 2*U2
	#lea	32*9*0(%rsp), %rsi
	#lea	32*9*5(%rsp), %rdi
	#call	avx2_mul_by2_x4

	vpaddq	$ACC0, $ACC0, $ACC0	# inlined avx2_mul_by2_x4
	lea	`32*9*5`(%rsp), %rdi
	vpaddq	$ACC1, $ACC1, $ACC1
	vpaddq	$ACC2, $ACC2, $ACC2
	vpaddq	$ACC3, $ACC3, $ACC3
	vpaddq	$ACC4, $ACC4, $ACC4
	vpaddq	$ACC5, $ACC5, $ACC5
	vpaddq	$ACC6, $ACC6, $ACC6
	vpaddq	$ACC7, $ACC7, $ACC7
	vpaddq	$ACC8, $ACC8, $ACC8
	call	avx2_normalize_n_store

	# X3 = R^2 - H^3 - 2*U2
	#lea	32*9*6(%rsp), %rsi
	#lea	32*9*7(%rsp), %rdx
	#lea	32*9*5(%rsp), %rcx
	#lea	32*9*0($r_ptr), %rdi
	#call	avx2_sub_x4

	#lea	32*9*0($r_ptr), %rsi
	#lea	32*9*0($r_ptr), %rdi
	#call	avx2_mul_by1_x4
	#call	avx2_normalize_n_store

	# inlined: R^2 + 2*p - H^3 - 2*U2
	lea	`32*9*6+128`(%rsp), %rsi
	lea	.LAVX2_POLY_x2+128(%rip), %rax
	lea	`32*9*7+128`(%rsp), %rdx
	lea	`32*9*5+128`(%rsp), %rcx
	lea	`32*9*0`($r_ptr), %rdi

	vmovdqa	32*0-128(%rsi), $ACC0
	vmovdqa	32*1-128(%rsi), $ACC1
	vmovdqa	32*2-128(%rsi), $ACC2
	vmovdqa	32*3-128(%rsi), $ACC3
	vmovdqa	32*4-128(%rsi), $ACC4
	vmovdqa	32*5-128(%rsi), $ACC5
	vmovdqa	32*6-128(%rsi), $ACC6
	vmovdqa	32*7-128(%rsi), $ACC7
	vmovdqa	32*8-128(%rsi), $ACC8

	vpaddq	32*0-128(%rax), $ACC0, $ACC0
	vpaddq	32*1-128(%rax), $ACC1, $ACC1
	vpaddq	32*2-128(%rax), $ACC2, $ACC2
	vpaddq	32*3-128(%rax), $ACC3, $ACC3
	vpaddq	32*4-128(%rax), $ACC4, $ACC4
	vpaddq	32*5-128(%rax), $ACC5, $ACC5
	vpaddq	32*6-128(%rax), $ACC6, $ACC6
	vpaddq	32*7-128(%rax), $ACC7, $ACC7
	vpaddq	32*8-128(%rax), $ACC8, $ACC8

	vpsubq	32*0-128(%rdx), $ACC0, $ACC0
	vpsubq	32*1-128(%rdx), $ACC1, $ACC1
	vpsubq	32*2-128(%rdx), $ACC2, $ACC2
	vpsubq	32*3-128(%rdx), $ACC3, $ACC3
	vpsubq	32*4-128(%rdx), $ACC4, $ACC4
	vpsubq	32*5-128(%rdx), $ACC5, $ACC5
	vpsubq	32*6-128(%rdx), $ACC6, $ACC6
	vpsubq	32*7-128(%rdx), $ACC7, $ACC7
	vpsubq	32*8-128(%rdx), $ACC8, $ACC8

	vpsubq	32*0-128(%rcx), $ACC0, $ACC0
	vpsubq	32*1-128(%rcx), $ACC1, $ACC1
	vpsubq	32*2-128(%rcx), $ACC2, $ACC2
	vpsubq	32*3-128(%rcx), $ACC3, $ACC3
	vpsubq	32*4-128(%rcx), $ACC4, $ACC4
	vpsubq	32*5-128(%rcx), $ACC5, $ACC5
	vpsubq	32*6-128(%rcx), $ACC6, $ACC6
	vpsubq	32*7-128(%rcx), $ACC7, $ACC7
	vpsubq	32*8-128(%rcx), $ACC8, $ACC8
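	# Digit-wise this is safe: R^2 was just normalized (digits < 2^29),
	# the H^3 and 2*U2 digits are at most a bit or two wider, and the
	# padded 2*p digits are about 2^31, so no lane underflows before the
	# final normalization.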
	# X3 select: A at infinity => X2, B at infinity => X1
	lea	32*0($b_ptr), %rsi
	lea	32*0($a_ptr), %rdx
	call	avx2_select_n_store

	# U2 - X3
	lea	`32*9*0`(%rsp), %rsi
	lea	`32*9*0`($r_ptr), %rdx
	lea	`32*9*3`(%rsp), %rdi
	call	avx2_sub_x4
	call	avx2_normalize_n_store

	# R*(U2 - X3)
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*4`(%rsp), %rdx
	lea	`32*9*3`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	# S1*H^3 = Y1*H^3
	lea	`32*9*7`(%rsp), %rsi
	lea	`32*9*1`($a_ptr), %rdx
	lea	`32*9*1`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	# Y3 = R*(U2 - X3) - S1*H^3
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*1`(%rsp), %rdx
	lea	`32*9*1`($r_ptr), %rdi
	call	avx2_sub_x4

	# Y3 select: A at infinity => Y2, B at infinity => Y1
	lea	32*9($b_ptr), %rsi
	lea	32*9($a_ptr), %rdx
	call	avx2_select_n_store

	#lea	32*9*0($r_ptr), %rsi
	#lea	32*9*0($r_ptr), %rdi
	#call	avx2_mul_by1_x4
	#call	avx2_normalize_n_store

	lea	`32*9*1`($r_ptr), %rsi
	lea	`32*9*1`($r_ptr), %rdi
	call	avx2_mul_by1_x4
	call	avx2_normalize_n_store
___
$code.=<<___ if ($win64);
	movaps	-16*10(%rbp), %xmm6
	movaps	-16*9(%rbp), %xmm7
	movaps	-16*8(%rbp), %xmm8
	movaps	-16*7(%rbp), %xmm9
	movaps	-16*6(%rbp), %xmm10
	movaps	-16*5(%rbp), %xmm11
	movaps	-16*4(%rbp), %xmm12
	movaps	-16*3(%rbp), %xmm13
	movaps	-16*2(%rbp), %xmm14
	movaps	-16*1(%rbp), %xmm15
___
$code.=<<___;
	mov	%rbp, %rsp
	pop	%rbp
	ret
.size	ecp_nistz256_avx2_point_add_affine_x4,.-ecp_nistz256_avx2_point_add_affine_x4
___
$code.=<<___;
################################################################################
# void ecp_nistz256_avx2_point_add_affines_x4(void* RESULTx4, void *Ax4, void *Bx4);
.globl	ecp_nistz256_avx2_point_add_affines_x4
.type	ecp_nistz256_avx2_point_add_affines_x4,\@function,3
.align	32
ecp_nistz256_avx2_point_add_affines_x4:
	mov	%rsp, %rax
	push	%rbp
	mov	%rsp, %rbp
___
$code.=<<___ if ($win64);
	lea	-16*10(%rsp), %rsp
	vmovaps	%xmm6, -8-16*10(%rax)
	vmovaps	%xmm7, -8-16*9(%rax)
	vmovaps	%xmm8, -8-16*8(%rax)
	vmovaps	%xmm9, -8-16*7(%rax)
	vmovaps	%xmm10, -8-16*6(%rax)
	vmovaps	%xmm11, -8-16*5(%rax)
	vmovaps	%xmm12, -8-16*4(%rax)
	vmovaps	%xmm13, -8-16*3(%rax)
	vmovaps	%xmm14, -8-16*2(%rax)
	vmovaps	%xmm15, -8-16*1(%rax)
___
$code.=<<___;
	# Result + 32*0  = Result.X
	# Result + 32*9  = Result.Y
	# Result + 32*18 = Result.Z

	# Stack temporaries (32*9 bytes each):
	# rsp+32*9*0 = U1*H^2, rsp+32*9*3 = H, rsp+32*9*4 = R,
	# rsp+32*9*5 = H^2 (later 2*U1*H^2), rsp+32*9*6 = R^2,
	# rsp+32*9*7 = H^3, rsp+32*9*8 = infinity masks
	sub	\$`32*9*8+32*2+32*8`, %rsp
	mov	$r_ptr_in, $r_ptr
	mov	$a_ptr_in, $a_ptr
	mov	$b_ptr_in, $b_ptr

	vmovdqa	32*0($a_ptr_in), %ymm0
	vmovdqa	.LAVX2_AND_MASK(%rip), $AND_MASK
	vpxor	%ymm1, %ymm1, %ymm1
	lea	256($a_ptr_in), %rax	# size optimization
	vpor	32*1($a_ptr_in), %ymm0, %ymm0
	vpor	32*2($a_ptr_in), %ymm0, %ymm0
	vpor	32*3($a_ptr_in), %ymm0, %ymm0
	vpor	32*4-256(%rax), %ymm0, %ymm0
	lea	256(%rax), %rcx		# size optimization
	vpor	32*5-256(%rax), %ymm0, %ymm0
	vpor	32*6-256(%rax), %ymm0, %ymm0
	vpor	32*7-256(%rax), %ymm0, %ymm0
	vpor	32*8-256(%rax), %ymm0, %ymm0
	vpor	32*9-256(%rax), %ymm0, %ymm0
	vpor	32*10-256(%rax), %ymm0, %ymm0
	vpor	32*11-256(%rax), %ymm0, %ymm0
	vpor	32*12-512(%rcx), %ymm0, %ymm0
	vpor	32*13-512(%rcx), %ymm0, %ymm0
	vpor	32*14-512(%rcx), %ymm0, %ymm0
	vpor	32*15-512(%rcx), %ymm0, %ymm0
	vpor	32*16-512(%rcx), %ymm0, %ymm0
	vpor	32*17-512(%rcx), %ymm0, %ymm0
	vpcmpeqq %ymm1, %ymm0, %ymm0
	vmovdqa	%ymm0, `32*9*8`(%rsp)

	vpxor	%ymm1, %ymm1, %ymm1
	vmovdqa	32*0($b_ptr), %ymm0
	lea	256($b_ptr), %rax	# size optimization
	vpor	32*1($b_ptr), %ymm0, %ymm0
	vpor	32*2($b_ptr), %ymm0, %ymm0
	vpor	32*3($b_ptr), %ymm0, %ymm0
	vpor	32*4-256(%rax), %ymm0, %ymm0
	lea	256(%rax), %rcx		# size optimization
	vpor	32*5-256(%rax), %ymm0, %ymm0
	vpor	32*6-256(%rax), %ymm0, %ymm0
	vpor	32*7-256(%rax), %ymm0, %ymm0
	vpor	32*8-256(%rax), %ymm0, %ymm0
	vpor	32*9-256(%rax), %ymm0, %ymm0
	vpor	32*10-256(%rax), %ymm0, %ymm0
	vpor	32*11-256(%rax), %ymm0, %ymm0
	vpor	32*12-512(%rcx), %ymm0, %ymm0
	vpor	32*13-512(%rcx), %ymm0, %ymm0
	vpor	32*14-512(%rcx), %ymm0, %ymm0
	vpor	32*15-512(%rcx), %ymm0, %ymm0
	vpor	32*16-512(%rcx), %ymm0, %ymm0
	vpor	32*17-512(%rcx), %ymm0, %ymm0
	vpcmpeqq %ymm1, %ymm0, %ymm0
	vmovdqa	%ymm0, `32*9*8+32`(%rsp)
	# H = U2 - U1 = X2 - X1
	lea	`32*9*0`($b_ptr), %rsi
	lea	`32*9*0`($a_ptr), %rdx
	lea	`32*9*3`(%rsp), %rdi
	call	avx2_sub_x4
	call	avx2_normalize_n_store

	# R = S2 - S1 = Y2 - Y1
	lea	`32*9*1`($b_ptr), %rsi
	lea	`32*9*1`($a_ptr), %rdx
	lea	`32*9*4`(%rsp), %rdi
	call	avx2_sub_x4
	call	avx2_normalize_n_store

	# Z3 = H (Z1 = Z2 = 1)
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*2`($r_ptr), %rdi
	call	avx2_mul_by1_x4

	# Z3 select, inlined: either input at infinity => Z3 = 1
	vmovdqa	`32*9*8`(%rsp), $B
	vpor	`32*9*8+32`(%rsp), $B, $B

	vpandn	$ACC0, $B, $ACC0
	lea	.LONE+128(%rip), %rax
	vpandn	$ACC1, $B, $ACC1
	vpandn	$ACC2, $B, $ACC2
	vpandn	$ACC3, $B, $ACC3
	vpandn	$ACC4, $B, $ACC4
	vpandn	$ACC5, $B, $ACC5
	vpandn	$ACC6, $B, $ACC6
	vpandn	$ACC7, $B, $ACC7

	vpand	32*0-128(%rax), $B, $T0
	vpandn	$ACC8, $B, $ACC8
	vpand	32*1-128(%rax), $B, $Y
	vpxor	$T0, $ACC0, $ACC0
	vpand	32*2-128(%rax), $B, $T0
	vpxor	$Y, $ACC1, $ACC1
	vpand	32*3-128(%rax), $B, $Y
	vpxor	$T0, $ACC2, $ACC2
	vpand	32*4-128(%rax), $B, $T0
	vpxor	$Y, $ACC3, $ACC3
	vpand	32*5-128(%rax), $B, $Y
	vpxor	$T0, $ACC4, $ACC4
	vpand	32*6-128(%rax), $B, $T0
	vpxor	$Y, $ACC5, $ACC5
	vpand	32*7-128(%rax), $B, $Y
	vpxor	$T0, $ACC6, $ACC6
	vpand	32*8-128(%rax), $B, $T0
	vpxor	$Y, $ACC7, $ACC7
	vpxor	$T0, $ACC8, $ACC8
	call	avx2_normalize_n_store

	# R^2
	lea	`32*9*4`(%rsp), %rsi
	lea	`32*9*6`(%rsp), %rdi
	lea	`32*9*8+32*2`(%rsp), %rcx	# temporary vector
	call	avx2_sqr_x4
	call	avx2_normalize_n_store

	# H^2
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*5`(%rsp), %rdi
	call	avx2_sqr_x4
	call	avx2_normalize_n_store

	# H^3 = H*H^2
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*5`(%rsp), %rdx
	lea	`32*9*7`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	# U1*H^2
	lea	`32*9*0`($a_ptr), %rsi
	lea	`32*9*5`(%rsp), %rdx
	lea	`32*9*0`(%rsp), %rdi
	call	avx2_mul_x4
	#call	avx2_normalize
	call	avx2_normalize_n_store

	# 2*U1*H^2
	#lea	32*9*0(%rsp), %rsi
	#lea	32*9*5(%rsp), %rdi
	#call	avx2_mul_by2_x4

	vpaddq	$ACC0, $ACC0, $ACC0	# inlined avx2_mul_by2_x4
	lea	`32*9*5`(%rsp), %rdi
	vpaddq	$ACC1, $ACC1, $ACC1
	vpaddq	$ACC2, $ACC2, $ACC2
	vpaddq	$ACC3, $ACC3, $ACC3
	vpaddq	$ACC4, $ACC4, $ACC4
	vpaddq	$ACC5, $ACC5, $ACC5
	vpaddq	$ACC6, $ACC6, $ACC6
	vpaddq	$ACC7, $ACC7, $ACC7
	vpaddq	$ACC8, $ACC8, $ACC8
	call	avx2_normalize_n_store

	# X3 = R^2 - H^3 - 2*U1*H^2
	#lea	32*9*6(%rsp), %rsi
	#lea	32*9*7(%rsp), %rdx
	#lea	32*9*5(%rsp), %rcx
	#lea	32*9*0($r_ptr), %rdi
	#call	avx2_sub_x4

	#lea	32*9*0($r_ptr), %rsi
	#lea	32*9*0($r_ptr), %rdi
	#call	avx2_mul_by1_x4
	#call	avx2_normalize_n_store

	# inlined: R^2 + 2*p - H^3 - 2*U1*H^2
	lea	`32*9*6+128`(%rsp), %rsi
	lea	.LAVX2_POLY_x2+128(%rip), %rax
	lea	`32*9*7+128`(%rsp), %rdx
	lea	`32*9*5+128`(%rsp), %rcx
	lea	`32*9*0`($r_ptr), %rdi

	vmovdqa	32*0-128(%rsi), $ACC0
	vmovdqa	32*1-128(%rsi), $ACC1
	vmovdqa	32*2-128(%rsi), $ACC2
	vmovdqa	32*3-128(%rsi), $ACC3
	vmovdqa	32*4-128(%rsi), $ACC4
	vmovdqa	32*5-128(%rsi), $ACC5
	vmovdqa	32*6-128(%rsi), $ACC6
	vmovdqa	32*7-128(%rsi), $ACC7
	vmovdqa	32*8-128(%rsi), $ACC8

	vpaddq	32*0-128(%rax), $ACC0, $ACC0
	vpaddq	32*1-128(%rax), $ACC1, $ACC1
	vpaddq	32*2-128(%rax), $ACC2, $ACC2
	vpaddq	32*3-128(%rax), $ACC3, $ACC3
	vpaddq	32*4-128(%rax), $ACC4, $ACC4
	vpaddq	32*5-128(%rax), $ACC5, $ACC5
	vpaddq	32*6-128(%rax), $ACC6, $ACC6
	vpaddq	32*7-128(%rax), $ACC7, $ACC7
	vpaddq	32*8-128(%rax), $ACC8, $ACC8

	vpsubq	32*0-128(%rdx), $ACC0, $ACC0
	vpsubq	32*1-128(%rdx), $ACC1, $ACC1
	vpsubq	32*2-128(%rdx), $ACC2, $ACC2
	vpsubq	32*3-128(%rdx), $ACC3, $ACC3
	vpsubq	32*4-128(%rdx), $ACC4, $ACC4
	vpsubq	32*5-128(%rdx), $ACC5, $ACC5
	vpsubq	32*6-128(%rdx), $ACC6, $ACC6
	vpsubq	32*7-128(%rdx), $ACC7, $ACC7
	vpsubq	32*8-128(%rdx), $ACC8, $ACC8

	vpsubq	32*0-128(%rcx), $ACC0, $ACC0
	vpsubq	32*1-128(%rcx), $ACC1, $ACC1
	vpsubq	32*2-128(%rcx), $ACC2, $ACC2
	vpsubq	32*3-128(%rcx), $ACC3, $ACC3
	vpsubq	32*4-128(%rcx), $ACC4, $ACC4
	vpsubq	32*5-128(%rcx), $ACC5, $ACC5
	vpsubq	32*6-128(%rcx), $ACC6, $ACC6
	vpsubq	32*7-128(%rcx), $ACC7, $ACC7
	vpsubq	32*8-128(%rcx), $ACC8, $ACC8

	# X3 select: A at infinity => X2, B at infinity => X1
	lea	32*0($b_ptr), %rsi
	lea	32*0($a_ptr), %rdx
	call	avx2_select_n_store

	# U1*H^2 - X3
	lea	`32*9*0`(%rsp), %rsi
	lea	`32*9*0`($r_ptr), %rdx
	lea	`32*9*3`(%rsp), %rdi
	call	avx2_sub_x4
	call	avx2_normalize_n_store

	# R*(U1*H^2 - X3)
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*4`(%rsp), %rdx
	lea	`32*9*3`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	# S1*H^3 = Y1*H^3
	lea	`32*9*7`(%rsp), %rsi
	lea	`32*9*1`($a_ptr), %rdx
	lea	`32*9*1`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	# Y3 = R*(U1*H^2 - X3) - S1*H^3
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*1`(%rsp), %rdx
	lea	`32*9*1`($r_ptr), %rdi
	call	avx2_sub_x4

	# Y3 select: A at infinity => Y2, B at infinity => Y1
	lea	32*9($b_ptr), %rsi
	lea	32*9($a_ptr), %rdx
	call	avx2_select_n_store

	#lea	32*9*0($r_ptr), %rsi
	#lea	32*9*0($r_ptr), %rdi
	#call	avx2_mul_by1_x4
	#call	avx2_normalize_n_store

	lea	`32*9*1`($r_ptr), %rsi
	lea	`32*9*1`($r_ptr), %rdi
	call	avx2_mul_by1_x4
	call	avx2_normalize_n_store
___
$code.=<<___ if ($win64);
	movaps	-16*10(%rbp), %xmm6
	movaps	-16*9(%rbp), %xmm7
	movaps	-16*8(%rbp), %xmm8
	movaps	-16*7(%rbp), %xmm9
	movaps	-16*6(%rbp), %xmm10
	movaps	-16*5(%rbp), %xmm11
	movaps	-16*4(%rbp), %xmm12
	movaps	-16*3(%rbp), %xmm13
	movaps	-16*2(%rbp), %xmm14
	movaps	-16*1(%rbp), %xmm15
___
$code.=<<___;
	mov	%rbp, %rsp
	pop	%rbp
	ret
.size	ecp_nistz256_avx2_point_add_affines_x4,.-ecp_nistz256_avx2_point_add_affines_x4
___
################################################################################
# void ecp_nistz256_avx2_to_mont(void* RESULTx4, void *Ax4);
$code.=<<___;
.globl	ecp_nistz256_avx2_to_mont
.type	ecp_nistz256_avx2_to_mont,\@function,2
.align	32
ecp_nistz256_avx2_to_mont:
	mov	%rsp, %rax
___
$code.=<<___ if ($win64);
	lea	-8-16*10(%rsp), %rsp
	vmovaps	%xmm6, -8-16*10(%rax)
	vmovaps	%xmm7, -8-16*9(%rax)
	vmovaps	%xmm8, -8-16*8(%rax)
	vmovaps	%xmm9, -8-16*7(%rax)
	vmovaps	%xmm10, -8-16*6(%rax)
	vmovaps	%xmm11, -8-16*5(%rax)
	vmovaps	%xmm12, -8-16*4(%rax)
	vmovaps	%xmm13, -8-16*3(%rax)
	vmovaps	%xmm14, -8-16*2(%rax)
	vmovaps	%xmm15, -8-16*1(%rax)
___
$code.=<<___;
	vmovdqa	.LAVX2_AND_MASK(%rip), $AND_MASK
	lea	.LTO_MONT_AVX2(%rip), %rdx
	call	avx2_mul_x4
	call	avx2_normalize_n_store
___
$code.=<<___ if ($win64);
	movaps	16*0(%rsp), %xmm6
	movaps	16*1(%rsp), %xmm7
	movaps	16*2(%rsp), %xmm8
	movaps	16*3(%rsp), %xmm9
	movaps	16*4(%rsp), %xmm10
	movaps	16*5(%rsp), %xmm11
	movaps	16*6(%rsp), %xmm12
	movaps	16*7(%rsp), %xmm13
	movaps	16*8(%rsp), %xmm14
	movaps	16*9(%rsp), %xmm15
	lea	8+16*10(%rsp), %rsp
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_to_mont,.-ecp_nistz256_avx2_to_mont
___

################################################################################
# void ecp_nistz256_avx2_from_mont(void* RESULTx4, void *Ax4);
$code.=<<___;
.globl	ecp_nistz256_avx2_from_mont
.type	ecp_nistz256_avx2_from_mont,\@function,2
.align	32
ecp_nistz256_avx2_from_mont:
	mov	%rsp, %rax
___
$code.=<<___ if ($win64);
	lea	-8-16*10(%rsp), %rsp
	vmovaps	%xmm6, -8-16*10(%rax)
	vmovaps	%xmm7, -8-16*9(%rax)
	vmovaps	%xmm8, -8-16*8(%rax)
	vmovaps	%xmm9, -8-16*7(%rax)
	vmovaps	%xmm10, -8-16*6(%rax)
	vmovaps	%xmm11, -8-16*5(%rax)
	vmovaps	%xmm12, -8-16*4(%rax)
	vmovaps	%xmm13, -8-16*3(%rax)
	vmovaps	%xmm14, -8-16*2(%rax)
	vmovaps	%xmm15, -8-16*1(%rax)
___
$code.=<<___;
	vmovdqa	.LAVX2_AND_MASK(%rip), $AND_MASK
	lea	.LFROM_MONT_AVX2(%rip), %rdx
	call	avx2_mul_x4
	call	avx2_normalize_n_store
___
$code.=<<___ if ($win64);
	movaps	16*0(%rsp), %xmm6
	movaps	16*1(%rsp), %xmm7
	movaps	16*2(%rsp), %xmm8
	movaps	16*3(%rsp), %xmm9
	movaps	16*4(%rsp), %xmm10
	movaps	16*5(%rsp), %xmm11
	movaps	16*6(%rsp), %xmm12
	movaps	16*7(%rsp), %xmm13
	movaps	16*8(%rsp), %xmm14
	movaps	16*9(%rsp), %xmm15
	lea	8+16*10(%rsp), %rsp
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_from_mont,.-ecp_nistz256_avx2_from_mont
___
################################################################################
# void ecp_nistz256_avx2_set1(void* RESULTx4);
$code.=<<___;
.globl	ecp_nistz256_avx2_set1
.type	ecp_nistz256_avx2_set1,\@function,1
.align	32
ecp_nistz256_avx2_set1:
	lea	.LONE+128(%rip), %rax
	lea	128(%rdi), %rdi

	vmovdqa	32*0-128(%rax), %ymm0
	vmovdqa	32*1-128(%rax), %ymm1
	vmovdqa	32*2-128(%rax), %ymm2
	vmovdqa	32*3-128(%rax), %ymm3
	vmovdqa	32*4-128(%rax), %ymm4
	vmovdqa	32*5-128(%rax), %ymm5
	vmovdqa	%ymm0, 32*0-128(%rdi)
	vmovdqa	32*6-128(%rax), %ymm0
	vmovdqa	%ymm1, 32*1-128(%rdi)
	vmovdqa	32*7-128(%rax), %ymm1
	vmovdqa	%ymm2, 32*2-128(%rdi)
	vmovdqa	32*8-128(%rax), %ymm2
	vmovdqa	%ymm3, 32*3-128(%rdi)
	vmovdqa	%ymm4, 32*4-128(%rdi)
	vmovdqa	%ymm5, 32*5-128(%rdi)
	vmovdqa	%ymm0, 32*6-128(%rdi)
	vmovdqa	%ymm1, 32*7-128(%rdi)
	vmovdqa	%ymm2, 32*8-128(%rdi)

	ret
.size	ecp_nistz256_avx2_set1,.-ecp_nistz256_avx2_set1
___
################################################################################
# void ecp_nistz256_avx2_multi_gather_w7(void* RESULT, void *in,
#			int index0, int index1, int index2, int index3);
################################################################################

my ($val,$in_t,$index0,$index1,$index2,$index3)=("%rdi","%rsi","%edx","%ecx","%r8d","%r9d");
my ($INDEX0,$INDEX1,$INDEX2,$INDEX3)=map("%ymm$_",(0..3));
my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11));
my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15));

$code.=<<___;
.globl	ecp_nistz256_avx2_multi_gather_w7
.type	ecp_nistz256_avx2_multi_gather_w7,\@function,6
.align	32
ecp_nistz256_avx2_multi_gather_w7:
	mov	%rsp, %rax
___
$code.=<<___ if ($win64);
	lea	-8-16*10(%rsp), %rsp
	vmovaps	%xmm6, -8-16*10(%rax)
	vmovaps	%xmm7, -8-16*9(%rax)
	vmovaps	%xmm8, -8-16*8(%rax)
	vmovaps	%xmm9, -8-16*7(%rax)
	vmovaps	%xmm10, -8-16*6(%rax)
	vmovaps	%xmm11, -8-16*5(%rax)
	vmovaps	%xmm12, -8-16*4(%rax)
	vmovaps	%xmm13, -8-16*3(%rax)
	vmovaps	%xmm14, -8-16*2(%rax)
	vmovaps	%xmm15, -8-16*1(%rax)
___
$code.=<<___;
	lea	.LIntOne(%rip), %rax

	vmovd	$index0, %xmm0
	vmovd	$index1, %xmm1
	vmovd	$index2, %xmm2
	vmovd	$index3, %xmm3

	vpxor	$R0a, $R0a, $R0a
	vpxor	$R0b, $R0b, $R0b
	vpxor	$R1a, $R1a, $R1a
	vpxor	$R1b, $R1b, $R1b
	vpxor	$R2a, $R2a, $R2a
	vpxor	$R2b, $R2b, $R2b
	vpxor	$R3a, $R3a, $R3a
	vpxor	$R3b, $R3b, $R3b
	mov	\$64, %ecx		# 64 entries per table

	vpermd	$INDEX0, $R0a, $INDEX0	# splat each index across all lanes
	vpermd	$INDEX1, $R0a, $INDEX1
	vpermd	$INDEX2, $R0a, $INDEX2
	vpermd	$INDEX3, $R0a, $INDEX3
	vmovdqa	(%rax), $M0		# current index = 1

	lea	112($val), $val		# size optimization
	jmp	.Lmulti_select_loop_avx2

# INDEX=0 corresponds to the point at infinity (0,0)
.align	32
.Lmulti_select_loop_avx2:
	vpcmpeqd $INDEX0, $M0, $TMP0

	vmovdqa	`32*0+32*64*2*0`($in_t), $T0
	vmovdqa	`32*1+32*64*2*0`($in_t), $T1
	vpand	$TMP0, $T0, $T0
	vpand	$TMP0, $T1, $T1
	vpxor	$T0, $R0a, $R0a
	vpxor	$T1, $R0b, $R0b

	vpcmpeqd $INDEX1, $M0, $TMP0

	vmovdqa	`32*0+32*64*2*1`($in_t), $T0
	vmovdqa	`32*1+32*64*2*1`($in_t), $T1
	vpand	$TMP0, $T0, $T0
	vpand	$TMP0, $T1, $T1
	vpxor	$T0, $R1a, $R1a
	vpxor	$T1, $R1b, $R1b

	vpcmpeqd $INDEX2, $M0, $TMP0

	vmovdqa	`32*0+32*64*2*2`($in_t), $T0
	vmovdqa	`32*1+32*64*2*2`($in_t), $T1
	vpand	$TMP0, $T0, $T0
	vpand	$TMP0, $T1, $T1
	vpxor	$T0, $R2a, $R2a
	vpxor	$T1, $R2b, $R2b

	vpcmpeqd $INDEX3, $M0, $TMP0

	vmovdqa	`32*0+32*64*2*3`($in_t), $T0
	vmovdqa	`32*1+32*64*2*3`($in_t), $T1
	vpand	$TMP0, $T0, $T0
	vpand	$TMP0, $T1, $T1
	vpxor	$T0, $R3a, $R3a
	vpxor	$T1, $R3b, $R3b

	vpaddd	(%rax), $M0, $M0	# increment
	lea	32*2($in_t), $in_t

	dec	%ecx
	jnz	.Lmulti_select_loop_avx2

	vmovdqu	$R0a, 32*0-112($val)
	vmovdqu	$R0b, 32*1-112($val)
	vmovdqu	$R1a, 32*2-112($val)
	vmovdqu	$R1b, 32*3-112($val)
	vmovdqu	$R2a, 32*4-112($val)
	vmovdqu	$R2b, 32*5-112($val)
	vmovdqu	$R3a, 32*6-112($val)
	vmovdqu	$R3b, 32*7-112($val)
___
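# The loop always touches all 64 table entries; an entry contributes (via
# AND/XOR) only when its one-based index matches the requested one, so the
# memory access pattern is independent of the secret indices.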
$code.=<<___ if ($win64);
	movaps	16*0(%rsp), %xmm6
	movaps	16*1(%rsp), %xmm7
	movaps	16*2(%rsp), %xmm8
	movaps	16*3(%rsp), %xmm9
	movaps	16*4(%rsp), %xmm10
	movaps	16*5(%rsp), %xmm11
	movaps	16*6(%rsp), %xmm12
	movaps	16*7(%rsp), %xmm13
	movaps	16*8(%rsp), %xmm14
	movaps	16*9(%rsp), %xmm15
	lea	8+16*10(%rsp), %rsp
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7

.extern	OPENSSL_ia32cap_P
.globl	ecp_nistz_avx2_eligible
.type	ecp_nistz_avx2_eligible,\@abi-omnipotent
.align	32
ecp_nistz_avx2_eligible:
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
	and	\$`1<<5`,%eax
	shr	\$5,%eax
	ret
.size	ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
___
}} else {{	# assembler is too old
$code.=<<___;
.text

.globl	ecp_nistz256_avx2_transpose_convert
.globl	ecp_nistz256_avx2_convert_transpose_back
.globl	ecp_nistz256_avx2_point_add_affine_x4
.globl	ecp_nistz256_avx2_point_add_affines_x4
.globl	ecp_nistz256_avx2_to_mont
.globl	ecp_nistz256_avx2_from_mont
.globl	ecp_nistz256_avx2_set1
.globl	ecp_nistz256_avx2_multi_gather_w7
.type	ecp_nistz256_avx2_multi_gather_w7,\@abi-omnipotent
ecp_nistz256_avx2_transpose_convert:
ecp_nistz256_avx2_convert_transpose_back:
ecp_nistz256_avx2_point_add_affine_x4:
ecp_nistz256_avx2_point_add_affines_x4:
ecp_nistz256_avx2_to_mont:
ecp_nistz256_avx2_from_mont:
ecp_nistz256_avx2_set1:
ecp_nistz256_avx2_multi_gather_w7:
	.byte	0x0f,0x0b	# ud2
	ret
.size	ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7

.globl	ecp_nistz_avx2_eligible
.type	ecp_nistz_avx2_eligible,\@abi-omnipotent
ecp_nistz_avx2_eligible:
	xor	%eax,%eax
	ret
.size	ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
___
}}
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	print $_,"\n";
}

close STDOUT;