1 #! /usr/bin/env perl
2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # This module implements Poly1305 hash for x86_64.
18 #
19 # March 2015
20 #
21 # Numbers are cycles per processed byte with poly1305_blocks alone,
22 # measured with rdtsc at fixed clock frequency.
23 #
24 #               IALU/gcc-4.8(*) AVX(**)         AVX2
25 # P4            4.46/+120%      -
26 # Core 2        2.41/+90%       -
27 # Westmere      1.88/+120%      -
28 # Sandy Bridge  1.39/+140%      1.10
29 # Haswell       1.14/+175%      1.11            0.65
30 # Skylake       1.13/+120%      0.96            0.51
31 # Silvermont    2.83/+95%       -
32 # Goldmont      1.70/+180%      -
33 # VIA Nano      1.82/+150%      -
34 # Sledgehammer  1.38/+160%      -
35 # Bulldozer     2.30/+130%      0.97
36 #
37 # (*)   improvement coefficients relative to clang are more modest and
38 #       are ~50% on most processors, in both cases we are comparing to
39 #       __int128 code;
40 # (**)  an SSE2 implementation was attempted, but among non-AVX processors
41 #       it was faster than the integer-only code only on older Intel P4 and
42 #       Core processors, by 30-50% (less so the newer the processor), while
43 #       being slower on contemporary ones, e.g. almost 2x slower on Atom;
44 #       as the former are naturally disappearing, SSE2 is deemed unnecessary;
45
46 $flavour = shift;
47 $output  = shift;
48 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
49
50 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
51
52 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
53 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
54 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
55 die "can't locate x86_64-xlate.pl";
56
57 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
58                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
59         $avx = ($1>=2.19) + ($1>=2.22);
60 }
61
62 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
63            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
64         $avx = ($1>=2.09) + ($1>=2.10);
65 }
66
67 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
68            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
69         $avx = ($1>=10) + ($1>=12);
70 }
71
72 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
73         $avx = ($2>=3.0) + ($2>3.0);
74 }
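# $avx ends up as 0, 1 or 2 depending on what the assembler at hand can
# encode: 0 keeps only the integer code path, 1 additionally emits
# poly1305_blocks_avx/poly1305_emit_avx, and 2 also emits
# poly1305_blocks_avx2 (see the $avx and $avx>1 guards below).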
75
76 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
77 *STDOUT=*OUT;
78
79 my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
80 my ($mac,$nonce)=($inp,$len);   # *_emit arguments
81 my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
82 my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
83
84 sub poly1305_iteration {
85 # input:        copy of $r1 in %rax, $h0-$h2, $r0-$r1
86 # output:       $h0-$h2 *= $r0-$r1
87 $code.=<<___;
88         mulq    $h0                     # h0*r1
89         mov     %rax,$d2
90          mov    $r0,%rax
91         mov     %rdx,$d3
92
93         mulq    $h0                     # h0*r0
94         mov     %rax,$h0                # future $h0
95          mov    $r0,%rax
96         mov     %rdx,$d1
97
98         mulq    $h1                     # h1*r0
99         add     %rax,$d2
100          mov    $s1,%rax
101         adc     %rdx,$d3
102
103         mulq    $h1                     # h1*s1
104          mov    $h2,$h1                 # borrow $h1
105         add     %rax,$h0
106         adc     %rdx,$d1
107
108         imulq   $s1,$h1                 # h2*s1
109         add     $h1,$d2
110          mov    $d1,$h1
111         adc     \$0,$d3
112
113         imulq   $r0,$h2                 # h2*r0
114         add     $d2,$h1
115         mov     \$-4,%rax               # mask value
116         adc     $h2,$d3
117
118         and     $d3,%rax                # last reduction step
119         mov     $d3,$h2
120         shr     \$2,$d3
121         and     \$3,$h2
122         add     $d3,%rax
123         add     %rax,$h0
124         adc     \$0,$h1
125         adc     \$0,$h2
126 ___
127 }
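# A reference sketch (not generated code) of what poly1305_iteration above
# computes, with p = 2^130-5, h = $h0 + $h1*2^64 + $h2*2^128 (where $h2 is
# only a few bits wide) and r = $r0 + $r1*2^64:
#
#       d0:d1  = h0*r0 + h1*s1          # s1 = r1 + (r1>>2) = 5*r1/4, exact
#                                       # because clamping keeps r1 = 0 mod 4,
#                                       # so that 2^128*r1 == s1 (mod p)
#       d2:d3  = h0*r1 + h1*r0 + h2*s1
#       d3    += h2*r0
#       h0:h1  = d0:(d1+d2), top = d3   # weights 2^0, 2^64, 2^128
#       h2     = top & 3
#       h0:h1:h2 += 5*(top>>2)          # fold weight >= 2^130 back, 2^130 == 5
#
# The caller adds each 16-byte block (plus padbit*2^128) into h before the
# call, so overall h = (h + m + padbit*2^128) * r mod p, left only partially
# reduced between blocks.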
128
129 ########################################################################
130 # The layout of the opaque area is as follows.
131 #
132 #       unsigned __int64 h[3];          # current hash value base 2^64
133 #       unsigned __int64 r[2];          # key value base 2^64
134
135 $code.=<<___;
136 .text
137
138 .extern OPENSSL_ia32cap_P
139
140 .globl  poly1305_init
141 .hidden poly1305_init
142 .globl  poly1305_blocks
143 .hidden poly1305_blocks
144 .globl  poly1305_emit
145 .hidden poly1305_emit
146
147 .type   poly1305_init,\@function,3
148 .align  32
149 poly1305_init:
150         xor     %rax,%rax
151         mov     %rax,0($ctx)            # initialize hash value
152         mov     %rax,8($ctx)
153         mov     %rax,16($ctx)
154
155         cmp     \$0,$inp
156         je      .Lno_key
157
158         lea     poly1305_blocks(%rip),%r10
159         lea     poly1305_emit(%rip),%r11
160 ___
161 $code.=<<___    if ($avx);
162         mov     OPENSSL_ia32cap_P+4(%rip),%r9
163         lea     poly1305_blocks_avx(%rip),%rax
164         lea     poly1305_emit_avx(%rip),%rcx
165         bt      \$`60-32`,%r9           # AVX?
166         cmovc   %rax,%r10
167         cmovc   %rcx,%r11
168 ___
169 $code.=<<___    if ($avx>1);
170         lea     poly1305_blocks_avx2(%rip),%rax
171         bt      \$`5+32`,%r9            # AVX2?
172         cmovc   %rax,%r10
173 ___
174 $code.=<<___;
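	# The two constants below implement the standard Poly1305 "clamp"
	# on the key r (RFC 7539): in the little-endian 64-bit key words,
	#       r[0] &= 0x0ffffffc0fffffff
	#       r[1] &= 0x0ffffffc0ffffffc
	# i.e. the top 4 bits of bytes 3,7,11,15 and the low 2 bits of
	# bytes 4,8,12 are cleared; keeping r1 a multiple of 4 is what makes
	# the s1 = r1 + (r1>>2) trick in the block functions exact.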
175         mov     \$0x0ffffffc0fffffff,%rax
176         mov     \$0x0ffffffc0ffffffc,%rcx
177         and     0($inp),%rax
178         and     8($inp),%rcx
179         mov     %rax,24($ctx)
180         mov     %rcx,32($ctx)
181 ___
182 $code.=<<___    if ($flavour !~ /elf32/);
183         mov     %r10,0(%rdx)
184         mov     %r11,8(%rdx)
185 ___
186 $code.=<<___    if ($flavour =~ /elf32/);
187         mov     %r10d,0(%rdx)
188         mov     %r11d,4(%rdx)
189 ___
190 $code.=<<___;
191         mov     \$1,%eax
192 .Lno_key:
193         ret
194 .size   poly1305_init,.-poly1305_init
195
196 .type   poly1305_blocks,\@function,4
197 .align  32
198 poly1305_blocks:
199 .Lblocks:
200         shr     \$4,$len
201         jz      .Lno_data               # too short
202
203         push    %rbx
204         push    %rbp
205         push    %r12
206         push    %r13
207         push    %r14
208         push    %r15
209 .Lblocks_body:
210
211         mov     $len,%r15               # reassign $len
212
213         mov     24($ctx),$r0            # load r
214         mov     32($ctx),$s1
215
216         mov     0($ctx),$h0             # load hash value
217         mov     8($ctx),$h1
218         mov     16($ctx),$h2
219
220         mov     $s1,$r1
221         shr     \$2,$s1
222         mov     $r1,%rax
223         add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
224         jmp     .Loop
225
226 .align  32
227 .Loop:
228         add     0($inp),$h0             # accumulate input
229         adc     8($inp),$h1
230         lea     16($inp),$inp
231         adc     $padbit,$h2
232 ___
233         &poly1305_iteration();
234 $code.=<<___;
235         mov     $r1,%rax
236         dec     %r15                    # len-=16
237         jnz     .Loop
238
239         mov     $h0,0($ctx)             # store hash value
240         mov     $h1,8($ctx)
241         mov     $h2,16($ctx)
242
243         mov     0(%rsp),%r15
244         mov     8(%rsp),%r14
245         mov     16(%rsp),%r13
246         mov     24(%rsp),%r12
247         mov     32(%rsp),%rbp
248         mov     40(%rsp),%rbx
249         lea     48(%rsp),%rsp
250 .Lno_data:
251 .Lblocks_epilogue:
252         ret
253 .size   poly1305_blocks,.-poly1305_blocks
254
255 .type   poly1305_emit,\@function,3
256 .align  32
257 poly1305_emit:
258 .Lemit:
259         mov     0($ctx),%r8     # load hash value
260         mov     8($ctx),%r9
261         mov     16($ctx),%r10
262
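	# Final-reduction sketch: h is only partially reduced (h < 2*p,
	# p = 2^130-5), so adding 5 and examining bits 130 and up of the
	# sum tells whether h >= p; in that case the low 128 bits of h+5
	# equal (h - p) mod 2^128 and are selected by the cmovnz pair below:
	#
	#       t0:t1:t2 = h0:h1:h2 + 5
	#       if (t2 >> 2) h0:h1 = t0:t1      # h >= p, take h - p
	#       tag = (h0:h1 + nonce) mod 2^128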
263         mov     %r8,%rax
264         add     \$5,%r8         # compare to modulus
265         mov     %r9,%rcx
266         adc     \$0,%r9
267         adc     \$0,%r10
268         shr     \$2,%r10        # did 130-bit value overflow?
269         cmovnz  %r8,%rax
270         cmovnz  %r9,%rcx
271
272         add     0($nonce),%rax  # accumulate nonce
273         adc     8($nonce),%rcx
274         mov     %rax,0($mac)    # write result
275         mov     %rcx,8($mac)
276
277         ret
278 .size   poly1305_emit,.-poly1305_emit
279 ___
280 if ($avx) {
281
282 ########################################################################
283 # The layout of the opaque area is as follows.
284 #
285 #       unsigned __int32 h[5];          # current hash value base 2^26
286 #       unsigned __int32 is_base2_26;
287 #       unsigned __int64 r[2];          # key value base 2^64
288 #       unsigned __int64 pad;
289 #       struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
290 #
291 # where r^n are the base 2^26 digits of the powers of the multiplier key.
292 # There are 5 digits, but the last four are interleaved with their multiples
293 # of 5, for a total of 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
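#
# A reference sketch of the radix-2^26 representation used from here on:
# a 130-bit value x is kept as five 26-bit digits,
#
#       x = x0 + x1*2^26 + x2*2^52 + x3*2^78 + x4*2^104,  0 <= xi < 2^26,
#
# so that any 26x26-bit digit product fits in a 64-bit lane with headroom
# for the additions below. Since 2^130 == 5 (mod 2^130-5), a cross term
# hi*rj whose weight reaches 2^130 is folded back as hi*(5*rj), which is
# why every r digit is stored next to its quintuple.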
294
295 my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
296     map("%xmm$_",(0..15));
297
298 $code.=<<___;
299 .type   __poly1305_block,\@abi-omnipotent
300 .align  32
301 __poly1305_block:
302 ___
303         &poly1305_iteration();
304 $code.=<<___;
305         ret
306 .size   __poly1305_block,.-__poly1305_block
307
308 .type   __poly1305_init_avx,\@abi-omnipotent
309 .align  32
310 __poly1305_init_avx:
311         mov     $r0,$h0
312         mov     $r1,$h1
313         xor     $h2,$h2
314
315         lea     48+64($ctx),$ctx        # size optimization
316
317         mov     $r1,%rax
318         call    __poly1305_block        # r^2
319
320         mov     \$0x3ffffff,%eax        # save interleaved r^2 and r base 2^26
321         mov     \$0x3ffffff,%edx
322         mov     $h0,$d1
323         and     $h0#d,%eax
324         mov     $r0,$d2
325         and     $r0#d,%edx
326         mov     %eax,`16*0+0-64`($ctx)
327         shr     \$26,$d1
328         mov     %edx,`16*0+4-64`($ctx)
329         shr     \$26,$d2
330
331         mov     \$0x3ffffff,%eax
332         mov     \$0x3ffffff,%edx
333         and     $d1#d,%eax
334         and     $d2#d,%edx
335         mov     %eax,`16*1+0-64`($ctx)
336         lea     (%rax,%rax,4),%eax      # *5
337         mov     %edx,`16*1+4-64`($ctx)
338         lea     (%rdx,%rdx,4),%edx      # *5
339         mov     %eax,`16*2+0-64`($ctx)
340         shr     \$26,$d1
341         mov     %edx,`16*2+4-64`($ctx)
342         shr     \$26,$d2
343
344         mov     $h1,%rax
345         mov     $r1,%rdx
346         shl     \$12,%rax
347         shl     \$12,%rdx
348         or      $d1,%rax
349         or      $d2,%rdx
350         and     \$0x3ffffff,%eax
351         and     \$0x3ffffff,%edx
352         mov     %eax,`16*3+0-64`($ctx)
353         lea     (%rax,%rax,4),%eax      # *5
354         mov     %edx,`16*3+4-64`($ctx)
355         lea     (%rdx,%rdx,4),%edx      # *5
356         mov     %eax,`16*4+0-64`($ctx)
357         mov     $h1,$d1
358         mov     %edx,`16*4+4-64`($ctx)
359         mov     $r1,$d2
360
361         mov     \$0x3ffffff,%eax
362         mov     \$0x3ffffff,%edx
363         shr     \$14,$d1
364         shr     \$14,$d2
365         and     $d1#d,%eax
366         and     $d2#d,%edx
367         mov     %eax,`16*5+0-64`($ctx)
368         lea     (%rax,%rax,4),%eax      # *5
369         mov     %edx,`16*5+4-64`($ctx)
370         lea     (%rdx,%rdx,4),%edx      # *5
371         mov     %eax,`16*6+0-64`($ctx)
372         shr     \$26,$d1
373         mov     %edx,`16*6+4-64`($ctx)
374         shr     \$26,$d2
375
376         mov     $h2,%rax
377         shl     \$24,%rax
378         or      %rax,$d1
379         mov     $d1#d,`16*7+0-64`($ctx)
380         lea     ($d1,$d1,4),$d1         # *5
381         mov     $d2#d,`16*7+4-64`($ctx)
382         lea     ($d2,$d2,4),$d2         # *5
383         mov     $d1#d,`16*8+0-64`($ctx)
384         mov     $d2#d,`16*8+4-64`($ctx)
385
386         mov     $r1,%rax
387         call    __poly1305_block        # r^3
388
389         mov     \$0x3ffffff,%eax        # save r^3 base 2^26
390         mov     $h0,$d1
391         and     $h0#d,%eax
392         shr     \$26,$d1
393         mov     %eax,`16*0+12-64`($ctx)
394
395         mov     \$0x3ffffff,%edx
396         and     $d1#d,%edx
397         mov     %edx,`16*1+12-64`($ctx)
398         lea     (%rdx,%rdx,4),%edx      # *5
399         shr     \$26,$d1
400         mov     %edx,`16*2+12-64`($ctx)
401
402         mov     $h1,%rax
403         shl     \$12,%rax
404         or      $d1,%rax
405         and     \$0x3ffffff,%eax
406         mov     %eax,`16*3+12-64`($ctx)
407         lea     (%rax,%rax,4),%eax      # *5
408         mov     $h1,$d1
409         mov     %eax,`16*4+12-64`($ctx)
410
411         mov     \$0x3ffffff,%edx
412         shr     \$14,$d1
413         and     $d1#d,%edx
414         mov     %edx,`16*5+12-64`($ctx)
415         lea     (%rdx,%rdx,4),%edx      # *5
416         shr     \$26,$d1
417         mov     %edx,`16*6+12-64`($ctx)
418
419         mov     $h2,%rax
420         shl     \$24,%rax
421         or      %rax,$d1
422         mov     $d1#d,`16*7+12-64`($ctx)
423         lea     ($d1,$d1,4),$d1         # *5
424         mov     $d1#d,`16*8+12-64`($ctx)
425
426         mov     $r1,%rax
427         call    __poly1305_block        # r^4
428
429         mov     \$0x3ffffff,%eax        # save r^4 base 2^26
430         mov     $h0,$d1
431         and     $h0#d,%eax
432         shr     \$26,$d1
433         mov     %eax,`16*0+8-64`($ctx)
434
435         mov     \$0x3ffffff,%edx
436         and     $d1#d,%edx
437         mov     %edx,`16*1+8-64`($ctx)
438         lea     (%rdx,%rdx,4),%edx      # *5
439         shr     \$26,$d1
440         mov     %edx,`16*2+8-64`($ctx)
441
442         mov     $h1,%rax
443         shl     \$12,%rax
444         or      $d1,%rax
445         and     \$0x3ffffff,%eax
446         mov     %eax,`16*3+8-64`($ctx)
447         lea     (%rax,%rax,4),%eax      # *5
448         mov     $h1,$d1
449         mov     %eax,`16*4+8-64`($ctx)
450
451         mov     \$0x3ffffff,%edx
452         shr     \$14,$d1
453         and     $d1#d,%edx
454         mov     %edx,`16*5+8-64`($ctx)
455         lea     (%rdx,%rdx,4),%edx      # *5
456         shr     \$26,$d1
457         mov     %edx,`16*6+8-64`($ctx)
458
459         mov     $h2,%rax
460         shl     \$24,%rax
461         or      %rax,$d1
462         mov     $d1#d,`16*7+8-64`($ctx)
463         lea     ($d1,$d1,4),$d1         # *5
464         mov     $d1#d,`16*8+8-64`($ctx)
465
466         lea     -48-64($ctx),$ctx       # size [de-]optimization
467         ret
468 .size   __poly1305_init_avx,.-__poly1305_init_avx
469
470 .type   poly1305_blocks_avx,\@function,4
471 .align  32
472 poly1305_blocks_avx:
473         mov     20($ctx),%r8d           # is_base2_26
474         cmp     \$128,$len
475         jae     .Lblocks_avx
476         test    %r8d,%r8d
477         jz      .Lblocks
478
479 .Lblocks_avx:
480         and     \$-16,$len
481         jz      .Lno_data_avx
482
483         vzeroupper
484
485         test    %r8d,%r8d
486         jz      .Lbase2_64_avx
487
488         test    \$31,$len
489         jz      .Leven_avx
490
491         push    %rbx
492         push    %rbp
493         push    %r12
494         push    %r13
495         push    %r14
496         push    %r15
497 .Lblocks_avx_body:
498
499         mov     $len,%r15               # reassign $len
500
501         mov     0($ctx),$d1             # load hash value
502         mov     8($ctx),$d2
503         mov     16($ctx),$h2#d
504
505         mov     24($ctx),$r0            # load r
506         mov     32($ctx),$s1
507
508         ################################# base 2^26 -> base 2^64
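	# Repacking sketch (values only, carries between words omitted):
	# with h = h[0] + h[1]*2^26 + h[2]*2^52 + h[3]*2^78 + h[4]*2^104,
	# the three 64-bit words recovered below are
	#       h0 = h[0] + (h[1]<<26) + (h[2]<<52 mod 2^64)
	#       h1 = (h[2]>>12) + (h[3]<<14) + (h[4]<<40 mod 2^64)
	#       h2 = h[4]>>24
	# followed by one extra reduction, because the stored digits are
	# themselves only partially carried.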
509         mov     $d1#d,$h0#d
510         and     \$`-1*(1<<31)`,$d1
511         mov     $d2,$r1                 # borrow $r1
512         mov     $d2#d,$h1#d
513         and     \$`-1*(1<<31)`,$d2
514
515         shr     \$6,$d1
516         shl     \$52,$r1
517         add     $d1,$h0
518         shr     \$12,$h1
519         shr     \$18,$d2
520         add     $r1,$h0
521         adc     $d2,$h1
522
523         mov     $h2,$d1
524         shl     \$40,$d1
525         shr     \$24,$h2
526         add     $d1,$h1
527         adc     \$0,$h2                 # can be partially reduced...
528
529         mov     \$-4,$d2                # ... so reduce
530         mov     $h2,$d1
531         and     $h2,$d2
532         shr     \$2,$d1
533         and     \$3,$h2
534         add     $d2,$d1                 # =*5
535         add     $d1,$h0
536         adc     \$0,$h1
537         adc     \$0,$h2
538
539         mov     $s1,$r1
540         mov     $s1,%rax
541         shr     \$2,$s1
542         add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
543
544         add     0($inp),$h0             # accumulate input
545         adc     8($inp),$h1
546         lea     16($inp),$inp
547         adc     $padbit,$h2
548
549         call    __poly1305_block
550
551         test    $padbit,$padbit         # if $padbit is zero,
552         jz      .Lstore_base2_64_avx    # store hash in base 2^64 format
553
554         ################################# base 2^64 -> base 2^26
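	# Splitting sketch: the value h0 + h1*2^64 + h2*2^128 is cut back
	# into 26-bit digits,
	#       h[0] =  h0      & 0x3ffffff
	#       h[1] = (h0>>26) & 0x3ffffff
	#       h[2] = (h0>>52 | h1<<12) & 0x3ffffff
	#       h[3] = (h1>>14) & 0x3ffffff
	#       h[4] =  h1>>40 | h2<<24
	# which is what the shifts and masks below compute.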
555         mov     $h0,%rax
556         mov     $h0,%rdx
557         shr     \$52,$h0
558         mov     $h1,$r0
559         mov     $h1,$r1
560         shr     \$26,%rdx
561         and     \$0x3ffffff,%rax        # h[0]
562         shl     \$12,$r0
563         and     \$0x3ffffff,%rdx        # h[1]
564         shr     \$14,$h1
565         or      $r0,$h0
566         shl     \$24,$h2
567         and     \$0x3ffffff,$h0         # h[2]
568         shr     \$40,$r1
569         and     \$0x3ffffff,$h1         # h[3]
570         or      $r1,$h2                 # h[4]
571
572         sub     \$16,%r15
573         jz      .Lstore_base2_26_avx
574
575         vmovd   %rax#d,$H0
576         vmovd   %rdx#d,$H1
577         vmovd   $h0#d,$H2
578         vmovd   $h1#d,$H3
579         vmovd   $h2#d,$H4
580         jmp     .Lproceed_avx
581
582 .align  32
583 .Lstore_base2_64_avx:
584         mov     $h0,0($ctx)
585         mov     $h1,8($ctx)
586         mov     $h2,16($ctx)            # note that is_base2_26 is zeroed
587         jmp     .Ldone_avx
588
589 .align  16
590 .Lstore_base2_26_avx:
591         mov     %rax#d,0($ctx)          # store hash value base 2^26
592         mov     %rdx#d,4($ctx)
593         mov     $h0#d,8($ctx)
594         mov     $h1#d,12($ctx)
595         mov     $h2#d,16($ctx)
596 .align  16
597 .Ldone_avx:
598         mov     0(%rsp),%r15
599         mov     8(%rsp),%r14
600         mov     16(%rsp),%r13
601         mov     24(%rsp),%r12
602         mov     32(%rsp),%rbp
603         mov     40(%rsp),%rbx
604         lea     48(%rsp),%rsp
605 .Lno_data_avx:
606 .Lblocks_avx_epilogue:
607         ret
608
609 .align  32
610 .Lbase2_64_avx:
611         push    %rbx
612         push    %rbp
613         push    %r12
614         push    %r13
615         push    %r14
616         push    %r15
617 .Lbase2_64_avx_body:
618
619         mov     $len,%r15               # reassign $len
620
621         mov     24($ctx),$r0            # load r
622         mov     32($ctx),$s1
623
624         mov     0($ctx),$h0             # load hash value
625         mov     8($ctx),$h1
626         mov     16($ctx),$h2#d
627
628         mov     $s1,$r1
629         mov     $s1,%rax
630         shr     \$2,$s1
631         add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
632
633         test    \$31,$len
634         jz      .Linit_avx
635
636         add     0($inp),$h0             # accumulate input
637         adc     8($inp),$h1
638         lea     16($inp),$inp
639         adc     $padbit,$h2
640         sub     \$16,%r15
641
642         call    __poly1305_block
643
644 .Linit_avx:
645         ################################# base 2^64 -> base 2^26
646         mov     $h0,%rax
647         mov     $h0,%rdx
648         shr     \$52,$h0
649         mov     $h1,$d1
650         mov     $h1,$d2
651         shr     \$26,%rdx
652         and     \$0x3ffffff,%rax        # h[0]
653         shl     \$12,$d1
654         and     \$0x3ffffff,%rdx        # h[1]
655         shr     \$14,$h1
656         or      $d1,$h0
657         shl     \$24,$h2
658         and     \$0x3ffffff,$h0         # h[2]
659         shr     \$40,$d2
660         and     \$0x3ffffff,$h1         # h[3]
661         or      $d2,$h2                 # h[4]
662
663         vmovd   %rax#d,$H0
664         vmovd   %rdx#d,$H1
665         vmovd   $h0#d,$H2
666         vmovd   $h1#d,$H3
667         vmovd   $h2#d,$H4
668         movl    \$1,20($ctx)            # set is_base2_26
669
670         call    __poly1305_init_avx
671
672 .Lproceed_avx:
673         mov     %r15,$len
674
675         mov     0(%rsp),%r15
676         mov     8(%rsp),%r14
677         mov     16(%rsp),%r13
678         mov     24(%rsp),%r12
679         mov     32(%rsp),%rbp
680         mov     40(%rsp),%rbx
681         lea     48(%rsp),%rax
682         lea     48(%rsp),%rsp
683 .Lbase2_64_avx_epilogue:
684         jmp     .Ldo_avx
685
686 .align  32
687 .Leven_avx:
688         vmovd           4*0($ctx),$H0           # load hash value
689         vmovd           4*1($ctx),$H1
690         vmovd           4*2($ctx),$H2
691         vmovd           4*3($ctx),$H3
692         vmovd           4*4($ctx),$H4
693
694 .Ldo_avx:
695 ___
696 $code.=<<___    if (!$win64);
697         lea             -0x58(%rsp),%r11
698         sub             \$0x178,%rsp
699 ___
700 $code.=<<___    if ($win64);
701         lea             -0xf8(%rsp),%r11
702         sub             \$0x218,%rsp
703         vmovdqa         %xmm6,0x50(%r11)
704         vmovdqa         %xmm7,0x60(%r11)
705         vmovdqa         %xmm8,0x70(%r11)
706         vmovdqa         %xmm9,0x80(%r11)
707         vmovdqa         %xmm10,0x90(%r11)
708         vmovdqa         %xmm11,0xa0(%r11)
709         vmovdqa         %xmm12,0xb0(%r11)
710         vmovdqa         %xmm13,0xc0(%r11)
711         vmovdqa         %xmm14,0xd0(%r11)
712         vmovdqa         %xmm15,0xe0(%r11)
713 .Ldo_avx_body:
714 ___
715 $code.=<<___;
716         sub             \$64,$len
717         lea             -32($inp),%rax
718         cmovc           %rax,$inp
719
720         vmovdqu         `16*3`($ctx),$D4        # preload r0^2
721         lea             `16*3+64`($ctx),$ctx    # size optimization
722         lea             .Lconst(%rip),%rcx
723
724         ################################################################
725         # load input
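	# Splat sketch: the two 16-byte blocks loaded below are transposed
	# so that each 64-bit lane follows one block m, and the shift/mask
	# sequence leaves digit k of both blocks in Tk:
	#       T0 = m[ 25:  0]         T1 = m[ 51: 26]
	#       T2 = m[ 77: 52]         T3 = m[103: 78]
	#       T4 = m[127:104] | padbit<<24            # the 2^128 pad bit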
726         vmovdqu         16*2($inp),$T0
727         vmovdqu         16*3($inp),$T1
728         vmovdqa         64(%rcx),$MASK          # .Lmask26
729
730         vpsrldq         \$6,$T0,$T2             # splat input
731         vpsrldq         \$6,$T1,$T3
732         vpunpckhqdq     $T1,$T0,$T4             # 4
733         vpunpcklqdq     $T1,$T0,$T0             # 0:1
734         vpunpcklqdq     $T3,$T2,$T3             # 2:3
735
736         vpsrlq          \$40,$T4,$T4            # 4
737         vpsrlq          \$26,$T0,$T1
738         vpand           $MASK,$T0,$T0           # 0
739         vpsrlq          \$4,$T3,$T2
740         vpand           $MASK,$T1,$T1           # 1
741         vpsrlq          \$30,$T3,$T3
742         vpand           $MASK,$T2,$T2           # 2
743         vpand           $MASK,$T3,$T3           # 3
744         vpor            32(%rcx),$T4,$T4        # padbit, yes, always
745
746         jbe             .Lskip_loop_avx
747
748         # expand and copy pre-calculated table to stack
749         vmovdqu         `16*1-64`($ctx),$D1
750         vmovdqu         `16*2-64`($ctx),$D2
751         vpshufd         \$0xEE,$D4,$D3          # 34xx -> 3434
752         vpshufd         \$0x44,$D4,$D0          # xx12 -> 1212
753         vmovdqa         $D3,-0x90(%r11)
754         vmovdqa         $D0,0x00(%rsp)
755         vpshufd         \$0xEE,$D1,$D4
756         vmovdqu         `16*3-64`($ctx),$D0
757         vpshufd         \$0x44,$D1,$D1
758         vmovdqa         $D4,-0x80(%r11)
759         vmovdqa         $D1,0x10(%rsp)
760         vpshufd         \$0xEE,$D2,$D3
761         vmovdqu         `16*4-64`($ctx),$D1
762         vpshufd         \$0x44,$D2,$D2
763         vmovdqa         $D3,-0x70(%r11)
764         vmovdqa         $D2,0x20(%rsp)
765         vpshufd         \$0xEE,$D0,$D4
766         vmovdqu         `16*5-64`($ctx),$D2
767         vpshufd         \$0x44,$D0,$D0
768         vmovdqa         $D4,-0x60(%r11)
769         vmovdqa         $D0,0x30(%rsp)
770         vpshufd         \$0xEE,$D1,$D3
771         vmovdqu         `16*6-64`($ctx),$D0
772         vpshufd         \$0x44,$D1,$D1
773         vmovdqa         $D3,-0x50(%r11)
774         vmovdqa         $D1,0x40(%rsp)
775         vpshufd         \$0xEE,$D2,$D4
776         vmovdqu         `16*7-64`($ctx),$D1
777         vpshufd         \$0x44,$D2,$D2
778         vmovdqa         $D4,-0x40(%r11)
779         vmovdqa         $D2,0x50(%rsp)
780         vpshufd         \$0xEE,$D0,$D3
781         vmovdqu         `16*8-64`($ctx),$D2
782         vpshufd         \$0x44,$D0,$D0
783         vmovdqa         $D3,-0x30(%r11)
784         vmovdqa         $D0,0x60(%rsp)
785         vpshufd         \$0xEE,$D1,$D4
786         vpshufd         \$0x44,$D1,$D1
787         vmovdqa         $D4,-0x20(%r11)
788         vmovdqa         $D1,0x70(%rsp)
789         vpshufd         \$0xEE,$D2,$D3
790          vmovdqa        0x00(%rsp),$D4          # preload r0^2
791         vpshufd         \$0x44,$D2,$D2
792         vmovdqa         $D3,-0x10(%r11)
793         vmovdqa         $D2,0x80(%rsp)
794
795         jmp             .Loop_avx
796
797 .align  32
798 .Loop_avx:
799         ################################################################
800         # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
801         # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
802         #   \___________________/
803         # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
804         # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
805         #   \___________________/ \____________________/
806         #
807         # Note that we start with inp[2:3]*r^2. This is because it
808         # doesn't depend on the reduction in the previous iteration.
809         ################################################################
810         # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
811         # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
812         # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
813         # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
814         # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
815         #
816         # though note that $Tx and $Hx are "reversed" in this section,
817         # and $D4 is preloaded with r0^2...
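	# In the d0..d4 formulas above, sj is shorthand for 5*rj: these are
	# the rows of the radix-2^26 schoolbook product h*r with every digit
	# product of weight >= 2^130 folded back times 5 (2^130 == 5 mod p).
	# Each 64-bit lane carries one of the two interleaved block streams;
	# once the loop ends, the even stream still has to be multiplied by
	# r^2 and the odd one by r, which is the r^2:r^1 tail starting at
	# .Lskip_loop_avx.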
818
819         vpmuludq        $T0,$D4,$D0             # d0 = h0*r0
820         vpmuludq        $T1,$D4,$D1             # d1 = h1*r0
821           vmovdqa       $H2,0x20(%r11)                          # offload hash
822         vpmuludq        $T2,$D4,$D2             # d3 = h2*r0
823          vmovdqa        0x10(%rsp),$H2          # r1^2
824         vpmuludq        $T3,$D4,$D3             # d3 = h3*r0
825         vpmuludq        $T4,$D4,$D4             # d4 = h4*r0
826
827           vmovdqa       $H0,0x00(%r11)                          #
828         vpmuludq        0x20(%rsp),$T4,$H0      # h4*s1
829           vmovdqa       $H1,0x10(%r11)                          #
830         vpmuludq        $T3,$H2,$H1             # h3*r1
831         vpaddq          $H0,$D0,$D0             # d0 += h4*s1
832         vpaddq          $H1,$D4,$D4             # d4 += h3*r1
833           vmovdqa       $H3,0x30(%r11)                          #
834         vpmuludq        $T2,$H2,$H0             # h2*r1
835         vpmuludq        $T1,$H2,$H1             # h1*r1
836         vpaddq          $H0,$D3,$D3             # d3 += h2*r1
837          vmovdqa        0x30(%rsp),$H3          # r2^2
838         vpaddq          $H1,$D2,$D2             # d2 += h1*r1
839           vmovdqa       $H4,0x40(%r11)                          #
840         vpmuludq        $T0,$H2,$H2             # h0*r1
841          vpmuludq       $T2,$H3,$H0             # h2*r2
842         vpaddq          $H2,$D1,$D1             # d1 += h0*r1
843
844          vmovdqa        0x40(%rsp),$H4          # s2^2
845         vpaddq          $H0,$D4,$D4             # d4 += h2*r2
846         vpmuludq        $T1,$H3,$H1             # h1*r2
847         vpmuludq        $T0,$H3,$H3             # h0*r2
848         vpaddq          $H1,$D3,$D3             # d3 += h1*r2
849          vmovdqa        0x50(%rsp),$H2          # r3^2
850         vpaddq          $H3,$D2,$D2             # d2 += h0*r2
851         vpmuludq        $T4,$H4,$H0             # h4*s2
852         vpmuludq        $T3,$H4,$H4             # h3*s2
853         vpaddq          $H0,$D1,$D1             # d1 += h4*s2
854          vmovdqa        0x60(%rsp),$H3          # s3^2
855         vpaddq          $H4,$D0,$D0             # d0 += h3*s2
856
857          vmovdqa        0x80(%rsp),$H4          # s4^2
858         vpmuludq        $T1,$H2,$H1             # h1*r3
859         vpmuludq        $T0,$H2,$H2             # h0*r3
860         vpaddq          $H1,$D4,$D4             # d4 += h1*r3
861         vpaddq          $H2,$D3,$D3             # d3 += h0*r3
862         vpmuludq        $T4,$H3,$H0             # h4*s3
863         vpmuludq        $T3,$H3,$H1             # h3*s3
864         vpaddq          $H0,$D2,$D2             # d2 += h4*s3
865          vmovdqu        16*0($inp),$H0                          # load input
866         vpaddq          $H1,$D1,$D1             # d1 += h3*s3
867         vpmuludq        $T2,$H3,$H3             # h2*s3
868          vpmuludq       $T2,$H4,$T2             # h2*s4
869         vpaddq          $H3,$D0,$D0             # d0 += h2*s3
870
871          vmovdqu        16*1($inp),$H1                          #
872         vpaddq          $T2,$D1,$D1             # d1 += h2*s4
873         vpmuludq        $T3,$H4,$T3             # h3*s4
874         vpmuludq        $T4,$H4,$T4             # h4*s4
875          vpsrldq        \$6,$H0,$H2                             # splat input
876         vpaddq          $T3,$D2,$D2             # d2 += h3*s4
877         vpaddq          $T4,$D3,$D3             # d3 += h4*s4
878          vpsrldq        \$6,$H1,$H3                             #
879         vpmuludq        0x70(%rsp),$T0,$T4      # h0*r4
880         vpmuludq        $T1,$H4,$T0             # h1*s4
881          vpunpckhqdq    $H1,$H0,$H4             # 4
882         vpaddq          $T4,$D4,$D4             # d4 += h0*r4
883          vmovdqa        -0x90(%r11),$T4         # r0^4
884         vpaddq          $T0,$D0,$D0             # d0 += h1*s4
885
886         vpunpcklqdq     $H1,$H0,$H0             # 0:1
887         vpunpcklqdq     $H3,$H2,$H3             # 2:3
888
889         #vpsrlq         \$40,$H4,$H4            # 4
890         vpsrldq         \$`40/8`,$H4,$H4        # 4
891         vpsrlq          \$26,$H0,$H1
892         vpand           $MASK,$H0,$H0           # 0
893         vpsrlq          \$4,$H3,$H2
894         vpand           $MASK,$H1,$H1           # 1
895         vpand           0(%rcx),$H4,$H4         # .Lmask24
896         vpsrlq          \$30,$H3,$H3
897         vpand           $MASK,$H2,$H2           # 2
898         vpand           $MASK,$H3,$H3           # 3
899         vpor            32(%rcx),$H4,$H4        # padbit, yes, always
900
901         vpaddq          0x00(%r11),$H0,$H0      # add hash value
902         vpaddq          0x10(%r11),$H1,$H1
903         vpaddq          0x20(%r11),$H2,$H2
904         vpaddq          0x30(%r11),$H3,$H3
905         vpaddq          0x40(%r11),$H4,$H4
906
907         lea             16*2($inp),%rax
908         lea             16*4($inp),$inp
909         sub             \$64,$len
910         cmovc           %rax,$inp
911
912         ################################################################
913         # Now we accumulate (inp[0:1]+hash)*r^4
914         ################################################################
915         # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
916         # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
917         # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
918         # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
919         # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
920
921         vpmuludq        $H0,$T4,$T0             # h0*r0
922         vpmuludq        $H1,$T4,$T1             # h1*r0
923         vpaddq          $T0,$D0,$D0
924         vpaddq          $T1,$D1,$D1
925          vmovdqa        -0x80(%r11),$T2         # r1^4
926         vpmuludq        $H2,$T4,$T0             # h2*r0
927         vpmuludq        $H3,$T4,$T1             # h3*r0
928         vpaddq          $T0,$D2,$D2
929         vpaddq          $T1,$D3,$D3
930         vpmuludq        $H4,$T4,$T4             # h4*r0
931          vpmuludq       -0x70(%r11),$H4,$T0     # h4*s1
932         vpaddq          $T4,$D4,$D4
933
934         vpaddq          $T0,$D0,$D0             # d0 += h4*s1
935         vpmuludq        $H2,$T2,$T1             # h2*r1
936         vpmuludq        $H3,$T2,$T0             # h3*r1
937         vpaddq          $T1,$D3,$D3             # d3 += h2*r1
938          vmovdqa        -0x60(%r11),$T3         # r2^4
939         vpaddq          $T0,$D4,$D4             # d4 += h3*r1
940         vpmuludq        $H1,$T2,$T1             # h1*r1
941         vpmuludq        $H0,$T2,$T2             # h0*r1
942         vpaddq          $T1,$D2,$D2             # d2 += h1*r1
943         vpaddq          $T2,$D1,$D1             # d1 += h0*r1
944
945          vmovdqa        -0x50(%r11),$T4         # s2^4
946         vpmuludq        $H2,$T3,$T0             # h2*r2
947         vpmuludq        $H1,$T3,$T1             # h1*r2
948         vpaddq          $T0,$D4,$D4             # d4 += h2*r2
949         vpaddq          $T1,$D3,$D3             # d3 += h1*r2
950          vmovdqa        -0x40(%r11),$T2         # r3^4
951         vpmuludq        $H0,$T3,$T3             # h0*r2
952         vpmuludq        $H4,$T4,$T0             # h4*s2
953         vpaddq          $T3,$D2,$D2             # d2 += h0*r2
954         vpaddq          $T0,$D1,$D1             # d1 += h4*s2
955          vmovdqa        -0x30(%r11),$T3         # s3^4
956         vpmuludq        $H3,$T4,$T4             # h3*s2
957          vpmuludq       $H1,$T2,$T1             # h1*r3
958         vpaddq          $T4,$D0,$D0             # d0 += h3*s2
959
960          vmovdqa        -0x10(%r11),$T4         # s4^4
961         vpaddq          $T1,$D4,$D4             # d4 += h1*r3
962         vpmuludq        $H0,$T2,$T2             # h0*r3
963         vpmuludq        $H4,$T3,$T0             # h4*s3
964         vpaddq          $T2,$D3,$D3             # d3 += h0*r3
965         vpaddq          $T0,$D2,$D2             # d2 += h4*s3
966          vmovdqu        16*2($inp),$T0                          # load input
967         vpmuludq        $H3,$T3,$T2             # h3*s3
968         vpmuludq        $H2,$T3,$T3             # h2*s3
969         vpaddq          $T2,$D1,$D1             # d1 += h3*s3
970          vmovdqu        16*3($inp),$T1                          #
971         vpaddq          $T3,$D0,$D0             # d0 += h2*s3
972
973         vpmuludq        $H2,$T4,$H2             # h2*s4
974         vpmuludq        $H3,$T4,$H3             # h3*s4
975          vpsrldq        \$6,$T0,$T2                             # splat input
976         vpaddq          $H2,$D1,$D1             # d1 += h2*s4
977         vpmuludq        $H4,$T4,$H4             # h4*s4
978          vpsrldq        \$6,$T1,$T3                             #
979         vpaddq          $H3,$D2,$H2             # h2 = d2 + h3*s4
980         vpaddq          $H4,$D3,$H3             # h3 = d3 + h4*s4
981         vpmuludq        -0x20(%r11),$H0,$H4     # h0*r4
982         vpmuludq        $H1,$T4,$H0
983          vpunpckhqdq    $T1,$T0,$T4             # 4
984         vpaddq          $H4,$D4,$H4             # h4 = d4 + h0*r4
985         vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
986
987         vpunpcklqdq     $T1,$T0,$T0             # 0:1
988         vpunpcklqdq     $T3,$T2,$T3             # 2:3
989
990         #vpsrlq         \$40,$T4,$T4            # 4
991         vpsrldq         \$`40/8`,$T4,$T4        # 4
992         vpsrlq          \$26,$T0,$T1
993          vmovdqa        0x00(%rsp),$D4          # preload r0^2
994         vpand           $MASK,$T0,$T0           # 0
995         vpsrlq          \$4,$T3,$T2
996         vpand           $MASK,$T1,$T1           # 1
997         vpand           0(%rcx),$T4,$T4         # .Lmask24
998         vpsrlq          \$30,$T3,$T3
999         vpand           $MASK,$T2,$T2           # 2
1000         vpand           $MASK,$T3,$T3           # 3
1001         vpor            32(%rcx),$T4,$T4        # padbit, yes, always
1002
1003         ################################################################
1004         # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1005         # and P. Schwabe
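	# Carry sketch: after the multiply-accumulate each 26-bit digit has
	# grown by a few bits, so carries are passed along two interleaved
	# chains, roughly h0->h1->h2->h3->h4->h0->h1, where the h4->h0 step
	# adds carry*5 (computed as carry + carry<<2) because 2^130 == 5
	# (mod p). Digits are not made canonical here, merely kept small
	# enough for the next round of 26x26-bit multiplications.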
1006
1007         vpsrlq          \$26,$H3,$D3
1008         vpand           $MASK,$H3,$H3
1009         vpaddq          $D3,$H4,$H4             # h3 -> h4
1010
1011         vpsrlq          \$26,$H0,$D0
1012         vpand           $MASK,$H0,$H0
1013         vpaddq          $D0,$D1,$H1             # h0 -> h1
1014
1015         vpsrlq          \$26,$H4,$D0
1016         vpand           $MASK,$H4,$H4
1017
1018         vpsrlq          \$26,$H1,$D1
1019         vpand           $MASK,$H1,$H1
1020         vpaddq          $D1,$H2,$H2             # h1 -> h2
1021
1022         vpaddq          $D0,$H0,$H0
1023         vpsllq          \$2,$D0,$D0
1024         vpaddq          $D0,$H0,$H0             # h4 -> h0
1025
1026         vpsrlq          \$26,$H2,$D2
1027         vpand           $MASK,$H2,$H2
1028         vpaddq          $D2,$H3,$H3             # h2 -> h3
1029
1030         vpsrlq          \$26,$H0,$D0
1031         vpand           $MASK,$H0,$H0
1032         vpaddq          $D0,$H1,$H1             # h0 -> h1
1033
1034         vpsrlq          \$26,$H3,$D3
1035         vpand           $MASK,$H3,$H3
1036         vpaddq          $D3,$H4,$H4             # h3 -> h4
1037
1038         ja              .Loop_avx
1039
1040 .Lskip_loop_avx:
1041         ################################################################
1042         # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1043
1044         vpshufd         \$0x10,$D4,$D4          # r0^n, xx12 -> x1x2
1045         add             \$32,$len
1046         jnz             .Long_tail_avx
1047
1048         vpaddq          $H2,$T2,$T2
1049         vpaddq          $H0,$T0,$T0
1050         vpaddq          $H1,$T1,$T1
1051         vpaddq          $H3,$T3,$T3
1052         vpaddq          $H4,$T4,$T4
1053
1054 .Long_tail_avx:
1055         vmovdqa         $H2,0x20(%r11)
1056         vmovdqa         $H0,0x00(%r11)
1057         vmovdqa         $H1,0x10(%r11)
1058         vmovdqa         $H3,0x30(%r11)
1059         vmovdqa         $H4,0x40(%r11)
1060
1061         # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
1062         # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
1063         # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1064         # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
1065         # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1066
1067         vpmuludq        $T2,$D4,$D2             # d2 = h2*r0
1068         vpmuludq        $T0,$D4,$D0             # d0 = h0*r0
1069          vpshufd        \$0x10,`16*1-64`($ctx),$H2              # r1^n
1070         vpmuludq        $T1,$D4,$D1             # d1 = h1*r0
1071         vpmuludq        $T3,$D4,$D3             # d3 = h3*r0
1072         vpmuludq        $T4,$D4,$D4             # d4 = h4*r0
1073
1074         vpmuludq        $T3,$H2,$H0             # h3*r1
1075         vpaddq          $H0,$D4,$D4             # d4 += h3*r1
1076          vpshufd        \$0x10,`16*2-64`($ctx),$H3              # s1^n
1077         vpmuludq        $T2,$H2,$H1             # h2*r1
1078         vpaddq          $H1,$D3,$D3             # d3 += h2*r1
1079          vpshufd        \$0x10,`16*3-64`($ctx),$H4              # r2^n
1080         vpmuludq        $T1,$H2,$H0             # h1*r1
1081         vpaddq          $H0,$D2,$D2             # d2 += h1*r1
1082         vpmuludq        $T0,$H2,$H2             # h0*r1
1083         vpaddq          $H2,$D1,$D1             # d1 += h0*r1
1084         vpmuludq        $T4,$H3,$H3             # h4*s1
1085         vpaddq          $H3,$D0,$D0             # d0 += h4*s1
1086
1087          vpshufd        \$0x10,`16*4-64`($ctx),$H2              # s2^n
1088         vpmuludq        $T2,$H4,$H1             # h2*r2
1089         vpaddq          $H1,$D4,$D4             # d4 += h2*r2
1090         vpmuludq        $T1,$H4,$H0             # h1*r2
1091         vpaddq          $H0,$D3,$D3             # d3 += h1*r2
1092          vpshufd        \$0x10,`16*5-64`($ctx),$H3              # r3^n
1093         vpmuludq        $T0,$H4,$H4             # h0*r2
1094         vpaddq          $H4,$D2,$D2             # d2 += h0*r2
1095         vpmuludq        $T4,$H2,$H1             # h4*s2
1096         vpaddq          $H1,$D1,$D1             # d1 += h4*s2
1097          vpshufd        \$0x10,`16*6-64`($ctx),$H4              # s3^n
1098         vpmuludq        $T3,$H2,$H2             # h3*s2
1099         vpaddq          $H2,$D0,$D0             # d0 += h3*s2
1100
1101         vpmuludq        $T1,$H3,$H0             # h1*r3
1102         vpaddq          $H0,$D4,$D4             # d4 += h1*r3
1103         vpmuludq        $T0,$H3,$H3             # h0*r3
1104         vpaddq          $H3,$D3,$D3             # d3 += h0*r3
1105          vpshufd        \$0x10,`16*7-64`($ctx),$H2              # r4^n
1106         vpmuludq        $T4,$H4,$H1             # h4*s3
1107         vpaddq          $H1,$D2,$D2             # d2 += h4*s3
1108          vpshufd        \$0x10,`16*8-64`($ctx),$H3              # s4^n
1109         vpmuludq        $T3,$H4,$H0             # h3*s3
1110         vpaddq          $H0,$D1,$D1             # d1 += h3*s3
1111         vpmuludq        $T2,$H4,$H4             # h2*s3
1112         vpaddq          $H4,$D0,$D0             # d0 += h2*s3
1113
1114         vpmuludq        $T0,$H2,$H2             # h0*r4
1115         vpaddq          $H2,$D4,$D4             # h4 = d4 + h0*r4
1116         vpmuludq        $T4,$H3,$H1             # h4*s4
1117         vpaddq          $H1,$D3,$D3             # h3 = d3 + h4*s4
1118         vpmuludq        $T3,$H3,$H0             # h3*s4
1119         vpaddq          $H0,$D2,$D2             # h2 = d2 + h3*s4
1120         vpmuludq        $T2,$H3,$H1             # h2*s4
1121         vpaddq          $H1,$D1,$D1             # h1 = d1 + h2*s4
1122         vpmuludq        $T1,$H3,$H3             # h1*s4
1123         vpaddq          $H3,$D0,$D0             # h0 = d0 + h1*s4
1124
1125         jz              .Lshort_tail_avx
1126
1127         vmovdqu         16*0($inp),$H0          # load input
1128         vmovdqu         16*1($inp),$H1
1129
1130         vpsrldq         \$6,$H0,$H2             # splat input
1131         vpsrldq         \$6,$H1,$H3
1132         vpunpckhqdq     $H1,$H0,$H4             # 4
1133         vpunpcklqdq     $H1,$H0,$H0             # 0:1
1134         vpunpcklqdq     $H3,$H2,$H3             # 2:3
1135
1136         vpsrlq          \$40,$H4,$H4            # 4
1137         vpsrlq          \$26,$H0,$H1
1138         vpand           $MASK,$H0,$H0           # 0
1139         vpsrlq          \$4,$H3,$H2
1140         vpand           $MASK,$H1,$H1           # 1
1141         vpsrlq          \$30,$H3,$H3
1142         vpand           $MASK,$H2,$H2           # 2
1143         vpand           $MASK,$H3,$H3           # 3
1144         vpor            32(%rcx),$H4,$H4        # padbit, yes, always
1145
1146         vpshufd         \$0x32,`16*0-64`($ctx),$T4      # r0^n, 34xx -> x3x4
1147         vpaddq          0x00(%r11),$H0,$H0
1148         vpaddq          0x10(%r11),$H1,$H1
1149         vpaddq          0x20(%r11),$H2,$H2
1150         vpaddq          0x30(%r11),$H3,$H3
1151         vpaddq          0x40(%r11),$H4,$H4
1152
1153         ################################################################
1154         # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1155
1156         vpmuludq        $H0,$T4,$T0             # h0*r0
1157         vpaddq          $T0,$D0,$D0             # d0 += h0*r0
1158         vpmuludq        $H1,$T4,$T1             # h1*r0
1159         vpaddq          $T1,$D1,$D1             # d1 += h1*r0
1160         vpmuludq        $H2,$T4,$T0             # h2*r0
1161         vpaddq          $T0,$D2,$D2             # d2 += h2*r0
1162          vpshufd        \$0x32,`16*1-64`($ctx),$T2              # r1^n
1163         vpmuludq        $H3,$T4,$T1             # h3*r0
1164         vpaddq          $T1,$D3,$D3             # d3 += h3*r0
1165         vpmuludq        $H4,$T4,$T4             # h4*r0
1166         vpaddq          $T4,$D4,$D4             # d4 += h4*r0
1167
1168         vpmuludq        $H3,$T2,$T0             # h3*r1
1169         vpaddq          $T0,$D4,$D4             # d4 += h3*r1
1170          vpshufd        \$0x32,`16*2-64`($ctx),$T3              # s1
1171         vpmuludq        $H2,$T2,$T1             # h2*r1
1172         vpaddq          $T1,$D3,$D3             # d3 += h2*r1
1173          vpshufd        \$0x32,`16*3-64`($ctx),$T4              # r2
1174         vpmuludq        $H1,$T2,$T0             # h1*r1
1175         vpaddq          $T0,$D2,$D2             # d2 += h1*r1
1176         vpmuludq        $H0,$T2,$T2             # h0*r1
1177         vpaddq          $T2,$D1,$D1             # d1 += h0*r1
1178         vpmuludq        $H4,$T3,$T3             # h4*s1
1179         vpaddq          $T3,$D0,$D0             # d0 += h4*s1
1180
1181          vpshufd        \$0x32,`16*4-64`($ctx),$T2              # s2
1182         vpmuludq        $H2,$T4,$T1             # h2*r2
1183         vpaddq          $T1,$D4,$D4             # d4 += h2*r2
1184         vpmuludq        $H1,$T4,$T0             # h1*r2
1185         vpaddq          $T0,$D3,$D3             # d3 += h1*r2
1186          vpshufd        \$0x32,`16*5-64`($ctx),$T3              # r3
1187         vpmuludq        $H0,$T4,$T4             # h0*r2
1188         vpaddq          $T4,$D2,$D2             # d2 += h0*r2
1189         vpmuludq        $H4,$T2,$T1             # h4*s2
1190         vpaddq          $T1,$D1,$D1             # d1 += h4*s2
1191          vpshufd        \$0x32,`16*6-64`($ctx),$T4              # s3
1192         vpmuludq        $H3,$T2,$T2             # h3*s2
1193         vpaddq          $T2,$D0,$D0             # d0 += h3*s2
1194
1195         vpmuludq        $H1,$T3,$T0             # h1*r3
1196         vpaddq          $T0,$D4,$D4             # d4 += h1*r3
1197         vpmuludq        $H0,$T3,$T3             # h0*r3
1198         vpaddq          $T3,$D3,$D3             # d3 += h0*r3
1199          vpshufd        \$0x32,`16*7-64`($ctx),$T2              # r4
1200         vpmuludq        $H4,$T4,$T1             # h4*s3
1201         vpaddq          $T1,$D2,$D2             # d2 += h4*s3
1202          vpshufd        \$0x32,`16*8-64`($ctx),$T3              # s4
1203         vpmuludq        $H3,$T4,$T0             # h3*s3
1204         vpaddq          $T0,$D1,$D1             # d1 += h3*s3
1205         vpmuludq        $H2,$T4,$T4             # h2*s3
1206         vpaddq          $T4,$D0,$D0             # d0 += h2*s3
1207
1208         vpmuludq        $H0,$T2,$T2             # h0*r4
1209         vpaddq          $T2,$D4,$D4             # d4 += h0*r4
1210         vpmuludq        $H4,$T3,$T1             # h4*s4
1211         vpaddq          $T1,$D3,$D3             # d3 += h4*s4
1212         vpmuludq        $H3,$T3,$T0             # h3*s4
1213         vpaddq          $T0,$D2,$D2             # d2 += h3*s4
1214         vpmuludq        $H2,$T3,$T1             # h2*s4
1215         vpaddq          $T1,$D1,$D1             # d1 += h2*s4
1216         vpmuludq        $H1,$T3,$T3             # h1*s4
1217         vpaddq          $T3,$D0,$D0             # d0 += h1*s4
1218
1219 .Lshort_tail_avx:
1220         ################################################################
1221         # horizontal addition
1222
1223         vpsrldq         \$8,$D4,$T4
1224         vpsrldq         \$8,$D3,$T3
1225         vpsrldq         \$8,$D1,$T1
1226         vpsrldq         \$8,$D0,$T0
1227         vpsrldq         \$8,$D2,$T2
1228         vpaddq          $T3,$D3,$D3
1229         vpaddq          $T4,$D4,$D4
1230         vpaddq          $T0,$D0,$D0
1231         vpaddq          $T1,$D1,$D1
1232         vpaddq          $T2,$D2,$D2
1233
1234         ################################################################
1235         # lazy reduction
1236
1237         vpsrlq          \$26,$D3,$H3
1238         vpand           $MASK,$D3,$D3
1239         vpaddq          $H3,$D4,$D4             # h3 -> h4
1240
1241         vpsrlq          \$26,$D0,$H0
1242         vpand           $MASK,$D0,$D0
1243         vpaddq          $H0,$D1,$D1             # h0 -> h1
1244
1245         vpsrlq          \$26,$D4,$H4
1246         vpand           $MASK,$D4,$D4
1247
1248         vpsrlq          \$26,$D1,$H1
1249         vpand           $MASK,$D1,$D1
1250         vpaddq          $H1,$D2,$D2             # h1 -> h2
1251
1252         vpaddq          $H4,$D0,$D0
1253         vpsllq          \$2,$H4,$H4
1254         vpaddq          $H4,$D0,$D0             # h4 -> h0
1255
1256         vpsrlq          \$26,$D2,$H2
1257         vpand           $MASK,$D2,$D2
1258         vpaddq          $H2,$D3,$D3             # h2 -> h3
1259
1260         vpsrlq          \$26,$D0,$H0
1261         vpand           $MASK,$D0,$D0
1262         vpaddq          $H0,$D1,$D1             # h0 -> h1
1263
1264         vpsrlq          \$26,$D3,$H3
1265         vpand           $MASK,$D3,$D3
1266         vpaddq          $H3,$D4,$D4             # h3 -> h4
1267
1268         vmovd           $D0,`4*0-48-64`($ctx)   # save partially reduced
1269         vmovd           $D1,`4*1-48-64`($ctx)
1270         vmovd           $D2,`4*2-48-64`($ctx)
1271         vmovd           $D3,`4*3-48-64`($ctx)
1272         vmovd           $D4,`4*4-48-64`($ctx)
1273 ___
1274 $code.=<<___    if ($win64);
1275         vmovdqa         0x50(%r11),%xmm6
1276         vmovdqa         0x60(%r11),%xmm7
1277         vmovdqa         0x70(%r11),%xmm8
1278         vmovdqa         0x80(%r11),%xmm9
1279         vmovdqa         0x90(%r11),%xmm10
1280         vmovdqa         0xa0(%r11),%xmm11
1281         vmovdqa         0xb0(%r11),%xmm12
1282         vmovdqa         0xc0(%r11),%xmm13
1283         vmovdqa         0xd0(%r11),%xmm14
1284         vmovdqa         0xe0(%r11),%xmm15
1285         lea             0xf8(%r11),%rsp
1286 .Ldo_avx_epilogue:
1287 ___
1288 $code.=<<___    if (!$win64);
1289         lea             0x58(%r11),%rsp
1290 ___
1291 $code.=<<___;
1292         vzeroupper
1293         ret
1294 .size   poly1305_blocks_avx,.-poly1305_blocks_avx
1295
1296 .type   poly1305_emit_avx,\@function,3
1297 .align  32
1298 poly1305_emit_avx:
1299         cmpl    \$0,20($ctx)    # is_base2_26?
1300         je      .Lemit
1301
1302         mov     0($ctx),%eax    # load hash value base 2^26
1303         mov     4($ctx),%ecx
1304         mov     8($ctx),%r8d
1305         mov     12($ctx),%r11d
1306         mov     16($ctx),%r10d
1307
1308         shl     \$26,%rcx       # base 2^26 -> base 2^64
1309         mov     %r8,%r9
1310         shl     \$52,%r8
1311         add     %rcx,%rax
1312         shr     \$12,%r9
1313         add     %rax,%r8        # h0
1314         adc     \$0,%r9
1315
1316         shl     \$14,%r11
1317         mov     %r10,%rax
1318         shr     \$24,%r10
1319         add     %r11,%r9
1320         shl     \$40,%rax
1321         add     %rax,%r9        # h1
1322         adc     \$0,%r10        # h2
1323
1324         mov     %r10,%rax       # could be partially reduced, so reduce
1325         mov     %r10,%rcx
1326         and     \$3,%r10
1327         shr     \$2,%rax
1328         and     \$-4,%rcx
1329         add     %rcx,%rax
1330         add     %rax,%r8
1331         adc     \$0,%r9
1332         adc     \$0,%r10
1333
1334         mov     %r8,%rax
1335         add     \$5,%r8         # compare to modulus
1336         mov     %r9,%rcx
1337         adc     \$0,%r9
1338         adc     \$0,%r10
1339         shr     \$2,%r10        # did 130-bit value overflow?
1340         cmovnz  %r8,%rax
1341         cmovnz  %r9,%rcx
1342
1343         add     0($nonce),%rax  # accumulate nonce
1344         adc     8($nonce),%rcx
1345         mov     %rax,0($mac)    # write result
1346         mov     %rcx,8($mac)
1347
1348         ret
1349 .size   poly1305_emit_avx,.-poly1305_emit_avx
1350 ___
1351
1352 if ($avx>1) {
1353 my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1354     map("%ymm$_",(0..15));
1355 my $S4=$MASK;
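# The AVX2 path reuses the radix-2^26 scheme, but in 256-bit %ymm
# registers, i.e. four 64-bit lanes and hence four interleaved block
# streams per iteration; that is why lengths below are examined in
# 64-byte (four-block) units.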
1356
1357 $code.=<<___;
1358 .type   poly1305_blocks_avx2,\@function,4
1359 .align  32
1360 poly1305_blocks_avx2:
1361         mov     20($ctx),%r8d           # is_base2_26
1362         cmp     \$128,$len
1363         jae     .Lblocks_avx2
1364         test    %r8d,%r8d
1365         jz      .Lblocks
1366
1367 .Lblocks_avx2:
1368         and     \$-16,$len
1369         jz      .Lno_data_avx2
1370
1371         vzeroupper
1372
1373         test    %r8d,%r8d
1374         jz      .Lbase2_64_avx2
1375
1376         test    \$63,$len
1377         jz      .Leven_avx2
1378
1379         push    %rbx
1380         push    %rbp
1381         push    %r12
1382         push    %r13
1383         push    %r14
1384         push    %r15
1385 .Lblocks_avx2_body:
1386
1387         mov     $len,%r15               # reassign $len
1388
1389         mov     0($ctx),$d1             # load hash value
1390         mov     8($ctx),$d2
1391         mov     16($ctx),$h2#d
1392
1393         mov     24($ctx),$r0            # load r
1394         mov     32($ctx),$s1
1395
1396         ################################# base 2^26 -> base 2^64
1397         mov     $d1#d,$h0#d
1398         and     \$`-1*(1<<31)`,$d1
1399         mov     $d2,$r1                 # borrow $r1
1400         mov     $d2#d,$h1#d
1401         and     \$`-1*(1<<31)`,$d2
1402
1403         shr     \$6,$d1
1404         shl     \$52,$r1
1405         add     $d1,$h0
1406         shr     \$12,$h1
1407         shr     \$18,$d2
1408         add     $r1,$h0
1409         adc     $d2,$h1
1410
1411         mov     $h2,$d1
1412         shl     \$40,$d1
1413         shr     \$24,$h2
1414         add     $d1,$h1
1415         adc     \$0,$h2                 # can be partially reduced...
1416
1417         mov     \$-4,$d2                # ... so reduce
1418         mov     $h2,$d1
1419         and     $h2,$d2
1420         shr     \$2,$d1
1421         and     \$3,$h2
1422         add     $d2,$d1                 # =*5
1423         add     $d1,$h0
1424         adc     \$0,$h1
1425         adc     \$0,$h2
1426
1427         mov     $s1,$r1
1428         mov     $s1,%rax
1429         shr     \$2,$s1
1430         add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
1431
1432 .Lbase2_26_pre_avx2:
1433         add     0($inp),$h0             # accumulate input
1434         adc     8($inp),$h1
1435         lea     16($inp),$inp
1436         adc     $padbit,$h2
1437         sub     \$16,%r15
1438
1439         call    __poly1305_block
1440         mov     $r1,%rax
1441
1442         test    \$63,%r15
1443         jnz     .Lbase2_26_pre_avx2
1444
1445         test    $padbit,$padbit         # if $padbit is zero,
1446         jz      .Lstore_base2_64_avx2   # store hash in base 2^64 format
1447
1448         ################################# base 2^64 -> base 2^26
1449         mov     $h0,%rax
1450         mov     $h0,%rdx
1451         shr     \$52,$h0
1452         mov     $h1,$r0
1453         mov     $h1,$r1
1454         shr     \$26,%rdx
1455         and     \$0x3ffffff,%rax        # h[0]
1456         shl     \$12,$r0
1457         and     \$0x3ffffff,%rdx        # h[1]
1458         shr     \$14,$h1
1459         or      $r0,$h0
1460         shl     \$24,$h2
1461         and     \$0x3ffffff,$h0         # h[2]
1462         shr     \$40,$r1
1463         and     \$0x3ffffff,$h1         # h[3]
1464         or      $r1,$h2                 # h[4]
1465
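        # The split above, in rough C terms (illustration only; h0:h1:h2 is
        # the 130-bit value in base 2^64, t[5] the resulting 26-bit limbs):
        #
        #	t[0] =   h0                     & 0x3ffffff;
        #	t[1] =  (h0 >> 26)              & 0x3ffffff;
        #	t[2] = ((h0 >> 52)|(h1 << 12))  & 0x3ffffff;
        #	t[3] =  (h1 >> 14)              & 0x3ffffff;
        #	t[4] =  (h1 >> 40)|(h2 << 24);
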
1466         test    %r15,%r15
1467         jz      .Lstore_base2_26_avx2
1468
1469         vmovd   %rax#d,%x#$H0
1470         vmovd   %rdx#d,%x#$H1
1471         vmovd   $h0#d,%x#$H2
1472         vmovd   $h1#d,%x#$H3
1473         vmovd   $h2#d,%x#$H4
1474         jmp     .Lproceed_avx2
1475
1476 .align  32
1477 .Lstore_base2_64_avx2:
1478         mov     $h0,0($ctx)
1479         mov     $h1,8($ctx)
1480         mov     $h2,16($ctx)            # note that is_base2_26 is zeroed
1481         jmp     .Ldone_avx2
1482
1483 .align  16
1484 .Lstore_base2_26_avx2:
1485         mov     %rax#d,0($ctx)          # store hash value base 2^26
1486         mov     %rdx#d,4($ctx)
1487         mov     $h0#d,8($ctx)
1488         mov     $h1#d,12($ctx)
1489         mov     $h2#d,16($ctx)
1490 .align  16
1491 .Ldone_avx2:
1492         mov     0(%rsp),%r15
1493         mov     8(%rsp),%r14
1494         mov     16(%rsp),%r13
1495         mov     24(%rsp),%r12
1496         mov     32(%rsp),%rbp
1497         mov     40(%rsp),%rbx
1498         lea     48(%rsp),%rsp
1499 .Lno_data_avx2:
1500 .Lblocks_avx2_epilogue:
1501         ret
1502
1503 .align  32
1504 .Lbase2_64_avx2:
1505         push    %rbx
1506         push    %rbp
1507         push    %r12
1508         push    %r13
1509         push    %r14
1510         push    %r15
1511 .Lbase2_64_avx2_body:
1512
1513         mov     $len,%r15               # reassign $len
1514
1515         mov     24($ctx),$r0            # load r
1516         mov     32($ctx),$s1
1517
1518         mov     0($ctx),$h0             # load hash value
1519         mov     8($ctx),$h1
1520         mov     16($ctx),$h2#d
1521
1522         mov     $s1,$r1
1523         mov     $s1,%rax
1524         shr     \$2,$s1
1525         add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
1526
1527         test    \$63,$len
1528         jz      .Linit_avx2
1529
1530 .Lbase2_64_pre_avx2:
1531         add     0($inp),$h0             # accumulate input
1532         adc     8($inp),$h1
1533         lea     16($inp),$inp
1534         adc     $padbit,$h2
1535         sub     \$16,%r15
1536
1537         call    __poly1305_block
1538         mov     $r1,%rax
1539
1540         test    \$63,%r15
1541         jnz     .Lbase2_64_pre_avx2
1542
1543 .Linit_avx2:
1544         ################################# base 2^64 -> base 2^26
1545         mov     $h0,%rax
1546         mov     $h0,%rdx
1547         shr     \$52,$h0
1548         mov     $h1,$d1
1549         mov     $h1,$d2
1550         shr     \$26,%rdx
1551         and     \$0x3ffffff,%rax        # h[0]
1552         shl     \$12,$d1
1553         and     \$0x3ffffff,%rdx        # h[1]
1554         shr     \$14,$h1
1555         or      $d1,$h0
1556         shl     \$24,$h2
1557         and     \$0x3ffffff,$h0         # h[2]
1558         shr     \$40,$d2
1559         and     \$0x3ffffff,$h1         # h[3]
1560         or      $d2,$h2                 # h[4]
1561
1562         vmovd   %rax#d,%x#$H0
1563         vmovd   %rdx#d,%x#$H1
1564         vmovd   $h0#d,%x#$H2
1565         vmovd   $h1#d,%x#$H3
1566         vmovd   $h2#d,%x#$H4
1567         movl    \$1,20($ctx)            # set is_base2_26
1568
1569         call    __poly1305_init_avx
1570
1571 .Lproceed_avx2:
1572         mov     %r15,$len
1573
1574         mov     0(%rsp),%r15
1575         mov     8(%rsp),%r14
1576         mov     16(%rsp),%r13
1577         mov     24(%rsp),%r12
1578         mov     32(%rsp),%rbp
1579         mov     40(%rsp),%rbx
1580         lea     48(%rsp),%rax
1581         lea     48(%rsp),%rsp
1582 .Lbase2_64_avx2_epilogue:
1583         jmp     .Ldo_avx2
1584
1585 .align  32
1586 .Leven_avx2:
1587         vmovd           4*0($ctx),%x#$H0        # load hash value base 2^26
1588         vmovd           4*1($ctx),%x#$H1
1589         vmovd           4*2($ctx),%x#$H2
1590         vmovd           4*3($ctx),%x#$H3
1591         vmovd           4*4($ctx),%x#$H4
1592
1593 .Ldo_avx2:
1594 ___
1595 $code.=<<___    if (!$win64);
1596         lea             -8(%rsp),%r11
1597         sub             \$0x128,%rsp
1598 ___
1599 $code.=<<___    if ($win64);
1600         lea             -0xf8(%rsp),%r11
1601         sub             \$0x1c8,%rsp
1602         vmovdqa         %xmm6,0x50(%r11)
1603         vmovdqa         %xmm7,0x60(%r11)
1604         vmovdqa         %xmm8,0x70(%r11)
1605         vmovdqa         %xmm9,0x80(%r11)
1606         vmovdqa         %xmm10,0x90(%r11)
1607         vmovdqa         %xmm11,0xa0(%r11)
1608         vmovdqa         %xmm12,0xb0(%r11)
1609         vmovdqa         %xmm13,0xc0(%r11)
1610         vmovdqa         %xmm14,0xd0(%r11)
1611         vmovdqa         %xmm15,0xe0(%r11)
1612 .Ldo_avx2_body:
1613 ___
1614 $code.=<<___;
1615         lea             48+64($ctx),$ctx        # size optimization
1616         lea             .Lconst(%rip),%rcx
1617
1618         # expand and copy pre-calculated table to stack
1619         vmovdqu         `16*0-64`($ctx),%x#$T2
1620         and             \$-512,%rsp
1621         vmovdqu         `16*1-64`($ctx),%x#$T3
1622         vmovdqu         `16*2-64`($ctx),%x#$T4
1623         vmovdqu         `16*3-64`($ctx),%x#$D0
1624         vmovdqu         `16*4-64`($ctx),%x#$D1
1625         vmovdqu         `16*5-64`($ctx),%x#$D2
1626         vmovdqu         `16*6-64`($ctx),%x#$D3
1627         vpermq          \$0x15,$T2,$T2          # 00003412 -> 12343434
1628         vmovdqu         `16*7-64`($ctx),%x#$D4
1629         vpermq          \$0x15,$T3,$T3
1630         vpshufd         \$0xc8,$T2,$T2          # 12343434 -> 14243444
1631         vmovdqu         `16*8-64`($ctx),%x#$MASK
1632         vpermq          \$0x15,$T4,$T4
1633         vpshufd         \$0xc8,$T3,$T3
1634         vmovdqa         $T2,0x00(%rsp)
1635         vpermq          \$0x15,$D0,$D0
1636         vpshufd         \$0xc8,$T4,$T4
1637         vmovdqa         $T3,0x20(%rsp)
1638         vpermq          \$0x15,$D1,$D1
1639         vpshufd         \$0xc8,$D0,$D0
1640         vmovdqa         $T4,0x40(%rsp)
1641         vpermq          \$0x15,$D2,$D2
1642         vpshufd         \$0xc8,$D1,$D1
1643         vmovdqa         $D0,0x60(%rsp)
1644         vpermq          \$0x15,$D3,$D3
1645         vpshufd         \$0xc8,$D2,$D2
1646         vmovdqa         $D1,0x80(%rsp)
1647         vpermq          \$0x15,$D4,$D4
1648         vpshufd         \$0xc8,$D3,$D3
1649         vmovdqa         $D2,0xa0(%rsp)
1650         vpermq          \$0x15,$MASK,$MASK
1651         vpshufd         \$0xc8,$D4,$D4
1652         vmovdqa         $D3,0xc0(%rsp)
1653         vpshufd         \$0xc8,$MASK,$MASK
1654         vmovdqa         $D4,0xe0(%rsp)
1655         vmovdqa         $MASK,0x100(%rsp)
1656         vmovdqa         64(%rcx),$MASK          # .Lmask26
1657
1658         ################################################################
1659         # load input
1660         vmovdqu         16*0($inp),%x#$T0
1661         vmovdqu         16*1($inp),%x#$T1
1662         vinserti128     \$1,16*2($inp),$T0,$T0
1663         vinserti128     \$1,16*3($inp),$T1,$T1
1664         lea             16*4($inp),$inp
1665
1666         vpsrldq         \$6,$T0,$T2             # splat input
1667         vpsrldq         \$6,$T1,$T3
1668         vpunpckhqdq     $T1,$T0,$T4             # 4
1669         vpunpcklqdq     $T3,$T2,$T2             # 2:3
1670         vpunpcklqdq     $T1,$T0,$T0             # 0:1
1671
1672         vpsrlq          \$30,$T2,$T3
1673         vpsrlq          \$4,$T2,$T2
1674         vpsrlq          \$26,$T0,$T1
1675         vpsrlq          \$40,$T4,$T4            # 4
1676         vpand           $MASK,$T2,$T2           # 2
1677         vpand           $MASK,$T0,$T0           # 0
1678         vpand           $MASK,$T1,$T1           # 1
1679         vpand           $MASK,$T3,$T3           # 3
1680         vpor            32(%rcx),$T4,$T4        # padbit, yes, always
1681
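        # The splat above is the same 64 -> 26-bit split as in the hash
        # conversion earlier, done for four blocks at once, plus one extra
        # step for the top limb (rough C, illustration only):
        #
        #	in[4] = (t1 >> 40) | (padbit << 24);	# 2^128 = 2^(4*26+24),
        #						# hence .L129 = 1<<24
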
1682         lea             0x90(%rsp),%rax         # size optimization
1683         vpaddq          $H2,$T2,$H2             # accumulate input
1684         sub             \$64,$len
1685         jz              .Ltail_avx2
1686         jmp             .Loop_avx2
1687
1688 .align  32
1689 .Loop_avx2:
1690         ################################################################
1691         # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1692         # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1693         # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
1694         # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1695         #   \________/\________/
1696         ################################################################
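        # In scalar terms the 4-lane schedule is, conceptually (C-style model,
        # illustration only; it ignores the 2^26 radix and the pad bit;
        # p = 2^130-5, r4 = r^4 mod p, and the previously accumulated hash is
        # added into the lane holding inp[0]):
        #
        #	for (i = 0; i < n-4; i += 4)		# main loop
        #	    for (k = 0; k < 4; k++)
        #		lane[k] = (lane[k] + inp[i+k]) * r4 % p;
        #	for (k = 0; k < 4; k++)			# tail: r^4,r^3,r^2,r^1
        #	    lane[k] = (lane[k] + inp[n-4+k]) * rpow[4-k] % p;
        #	h = (lane[0]+lane[1]+lane[2]+lane[3]) % p;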
1697         #vpaddq         $H2,$T2,$H2             # accumulate input
1698         vpaddq          $H0,$T0,$H0
1699         vmovdqa         `32*0`(%rsp),$T0        # r0^4
1700         vpaddq          $H1,$T1,$H1
1701         vmovdqa         `32*1`(%rsp),$T1        # r1^4
1702         vpaddq          $H3,$T3,$H3
1703         vmovdqa         `32*3`(%rsp),$T2        # r2^4
1704         vpaddq          $H4,$T4,$H4
1705         vmovdqa         `32*6-0x90`(%rax),$T3   # s3^4
1706         vmovdqa         `32*8-0x90`(%rax),$S4   # s4^4
1707
1708         # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
1709         # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
1710         # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1711         # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
1712         # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1713         #
1714         # however, as h2 is "chronologically" the first one available, pull
1715         # the corresponding operations up, so the schedule becomes
1716         #
1717         # d4 = h2*r2   + h4*r0 + h3*r1             + h1*r3   + h0*r4
1718         # d3 = h2*r1   + h3*r0           + h1*r2   + h0*r3   + h4*5*r4
1719         # d2 = h2*r0           + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1720         # d1 = h2*5*r4 + h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3
1721         # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2           + h1*5*r4
1722
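        # (The 5*r terms come from the radix: limb k carries weight 2^(26*k),
        # so a product h_i*r_j lands at weight 2^(26*(i+j)), and for i+j >= 5
        # 2^(26*(i+j)) = 2^130 * 2^(26*(i+j-5)) = 5 * 2^(26*(i+j-5)) mod p,
        # p = 2^130-5; precomputing s_k = 5*r_k folds that factor into the
        # table.)
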
1723         vpmuludq        $H2,$T0,$D2             # d2 = h2*r0
1724         vpmuludq        $H2,$T1,$D3             # d3 = h2*r1
1725         vpmuludq        $H2,$T2,$D4             # d4 = h2*r2
1726         vpmuludq        $H2,$T3,$D0             # d0 = h2*s3
1727         vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
1728
1729         vpmuludq        $H0,$T1,$T4             # h0*r1
1730         vpmuludq        $H1,$T1,$H2             # h1*r1, borrow $H2 as temp
1731         vpaddq          $T4,$D1,$D1             # d1 += h0*r1
1732         vpaddq          $H2,$D2,$D2             # d2 += h1*r1
1733         vpmuludq        $H3,$T1,$T4             # h3*r1
1734         vpmuludq        `32*2`(%rsp),$H4,$H2    # h4*s1
1735         vpaddq          $T4,$D4,$D4             # d4 += h3*r1
1736         vpaddq          $H2,$D0,$D0             # d0 += h4*s1
1737          vmovdqa        `32*4-0x90`(%rax),$T1   # s2
1738
1739         vpmuludq        $H0,$T0,$T4             # h0*r0
1740         vpmuludq        $H1,$T0,$H2             # h1*r0
1741         vpaddq          $T4,$D0,$D0             # d0 += h0*r0
1742         vpaddq          $H2,$D1,$D1             # d1 += h1*r0
1743         vpmuludq        $H3,$T0,$T4             # h3*r0
1744         vpmuludq        $H4,$T0,$H2             # h4*r0
1745          vmovdqu        16*0($inp),%x#$T0       # load input
1746         vpaddq          $T4,$D3,$D3             # d3 += h3*r0
1747         vpaddq          $H2,$D4,$D4             # d4 += h4*r0
1748          vinserti128    \$1,16*2($inp),$T0,$T0
1749
1750         vpmuludq        $H3,$T1,$T4             # h3*s2
1751         vpmuludq        $H4,$T1,$H2             # h4*s2
1752          vmovdqu        16*1($inp),%x#$T1
1753         vpaddq          $T4,$D0,$D0             # d0 += h3*s2
1754         vpaddq          $H2,$D1,$D1             # d1 += h4*s2
1755          vmovdqa        `32*5-0x90`(%rax),$H2   # r3
1756         vpmuludq        $H1,$T2,$T4             # h1*r2
1757         vpmuludq        $H0,$T2,$T2             # h0*r2
1758         vpaddq          $T4,$D3,$D3             # d3 += h1*r2
1759         vpaddq          $T2,$D2,$D2             # d2 += h0*r2
1760          vinserti128    \$1,16*3($inp),$T1,$T1
1761          lea            16*4($inp),$inp
1762
1763         vpmuludq        $H1,$H2,$T4             # h1*r3
1764         vpmuludq        $H0,$H2,$H2             # h0*r3
1765          vpsrldq        \$6,$T0,$T2             # splat input
1766         vpaddq          $T4,$D4,$D4             # d4 += h1*r3
1767         vpaddq          $H2,$D3,$D3             # d3 += h0*r3
1768         vpmuludq        $H3,$T3,$T4             # h3*s3
1769         vpmuludq        $H4,$T3,$H2             # h4*s3
1770          vpsrldq        \$6,$T1,$T3
1771         vpaddq          $T4,$D1,$D1             # d1 += h3*s3
1772         vpaddq          $H2,$D2,$D2             # d2 += h4*s3
1773          vpunpckhqdq    $T1,$T0,$T4             # 4
1774
1775         vpmuludq        $H3,$S4,$H3             # h3*s4
1776         vpmuludq        $H4,$S4,$H4             # h4*s4
1777          vpunpcklqdq    $T1,$T0,$T0             # 0:1
1778         vpaddq          $H3,$D2,$H2             # h2 = d2 + h3*r4
1779         vpaddq          $H4,$D3,$H3             # h3 = d3 + h4*r4
1780          vpunpcklqdq    $T3,$T2,$T3             # 2:3
1781         vpmuludq        `32*7-0x90`(%rax),$H0,$H4       # h0*r4
1782         vpmuludq        $H1,$S4,$H0             # h1*s4
1783         vmovdqa         64(%rcx),$MASK          # .Lmask26
1784         vpaddq          $H4,$D4,$H4             # h4 = d4 + h0*r4
1785         vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
1786
1787         ################################################################
1788         # lazy reduction (interleaved with tail of input splat)
1789
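        # Rough scalar model of the carry pass (illustration only; 0x3ffffff
        # masks a limb to 26 bits, and the h4 -> h0 carry picks up the factor
        # of 5 as c + 4*c, i.e. the vpaddq/vpsllq/vpaddq triplet below):
        #
        #	c = h3 >> 26;  h3 &= 0x3ffffff;  h4 += c;
        #	c = h4 >> 26;  h4 &= 0x3ffffff;  h0 += c + (c << 2);
        #	c = h0 >> 26;  h0 &= 0x3ffffff;  h1 += c;
        #	c = h1 >> 26;  h1 &= 0x3ffffff;  h2 += c;
        #	c = h2 >> 26;  h2 &= 0x3ffffff;  h3 += c;
        #	c = h3 >> 26;  h3 &= 0x3ffffff;  h4 += c;
        #
        # Limbs are left slightly above 26 bits ("lazy"), which the next
        # iteration tolerates; the actual instruction order below is
        # interleaved with the input splat for throughput.
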
1790         vpsrlq          \$26,$H3,$D3
1791         vpand           $MASK,$H3,$H3
1792         vpaddq          $D3,$H4,$H4             # h3 -> h4
1793
1794         vpsrlq          \$26,$H0,$D0
1795         vpand           $MASK,$H0,$H0
1796         vpaddq          $D0,$D1,$H1             # h0 -> h1
1797
1798         vpsrlq          \$26,$H4,$D4
1799         vpand           $MASK,$H4,$H4
1800
1801          vpsrlq         \$4,$T3,$T2
1802
1803         vpsrlq          \$26,$H1,$D1
1804         vpand           $MASK,$H1,$H1
1805         vpaddq          $D1,$H2,$H2             # h1 -> h2
1806
1807         vpaddq          $D4,$H0,$H0
1808         vpsllq          \$2,$D4,$D4
1809         vpaddq          $D4,$H0,$H0             # h4 -> h0
1810
1811          vpand          $MASK,$T2,$T2           # 2
1812          vpsrlq         \$26,$T0,$T1
1813
1814         vpsrlq          \$26,$H2,$D2
1815         vpand           $MASK,$H2,$H2
1816         vpaddq          $D2,$H3,$H3             # h2 -> h3
1817
1818          vpaddq         $T2,$H2,$H2             # modulo-scheduled
1819          vpsrlq         \$30,$T3,$T3
1820
1821         vpsrlq          \$26,$H0,$D0
1822         vpand           $MASK,$H0,$H0
1823         vpaddq          $D0,$H1,$H1             # h0 -> h1
1824
1825          vpsrlq         \$40,$T4,$T4            # 4
1826
1827         vpsrlq          \$26,$H3,$D3
1828         vpand           $MASK,$H3,$H3
1829         vpaddq          $D3,$H4,$H4             # h3 -> h4
1830
1831          vpand          $MASK,$T0,$T0           # 0
1832          vpand          $MASK,$T1,$T1           # 1
1833          vpand          $MASK,$T3,$T3           # 3
1834          vpor           32(%rcx),$T4,$T4        # padbit, yes, always
1835
1836         sub             \$64,$len
1837         jnz             .Loop_avx2
1838
1839         .byte           0x66,0x90               # 2-byte nop
1840 .Ltail_avx2:
1841         ################################################################
1842         # while the multiplications above were by r^4 in all lanes, in the
1843         # last iteration we multiply the least significant lane by r^4 and
1844         # the most significant one by r, so this is a copy of the above,
1845         # except that references to the precomputed table are displaced by 4...
1846
1847         #vpaddq         $H2,$T2,$H2             # accumulate input
1848         vpaddq          $H0,$T0,$H0
1849         vmovdqu         `32*0+4`(%rsp),$T0      # r0^4
1850         vpaddq          $H1,$T1,$H1
1851         vmovdqu         `32*1+4`(%rsp),$T1      # r1^4
1852         vpaddq          $H3,$T3,$H3
1853         vmovdqu         `32*3+4`(%rsp),$T2      # r2^4
1854         vpaddq          $H4,$T4,$H4
1855         vmovdqu         `32*6+4-0x90`(%rax),$T3 # s3^4
1856         vmovdqu         `32*8+4-0x90`(%rax),$S4 # s4^4
1857
1858         vpmuludq        $H2,$T0,$D2             # d2 = h2*r0
1859         vpmuludq        $H2,$T1,$D3             # d3 = h2*r1
1860         vpmuludq        $H2,$T2,$D4             # d4 = h2*r2
1861         vpmuludq        $H2,$T3,$D0             # d0 = h2*s3
1862         vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
1863
1864         vpmuludq        $H0,$T1,$T4             # h0*r1
1865         vpmuludq        $H1,$T1,$H2             # h1*r1
1866         vpaddq          $T4,$D1,$D1             # d1 += h0*r1
1867         vpaddq          $H2,$D2,$D2             # d2 += h1*r1
1868         vpmuludq        $H3,$T1,$T4             # h3*r1
1869         vpmuludq        `32*2+4`(%rsp),$H4,$H2  # h4*s1
1870         vpaddq          $T4,$D4,$D4             # d4 += h3*r1
1871         vpaddq          $H2,$D0,$D0             # d0 += h4*s1
1872
1873         vpmuludq        $H0,$T0,$T4             # h0*r0
1874         vpmuludq        $H1,$T0,$H2             # h1*r0
1875         vpaddq          $T4,$D0,$D0             # d0 += h0*r0
1876          vmovdqu        `32*4+4-0x90`(%rax),$T1 # s2
1877         vpaddq          $H2,$D1,$D1             # d1 += h1*r0
1878         vpmuludq        $H3,$T0,$T4             # h3*r0
1879         vpmuludq        $H4,$T0,$H2             # h4*r0
1880         vpaddq          $T4,$D3,$D3             # d3 += h3*r0
1881         vpaddq          $H2,$D4,$D4             # d4 += h4*r0
1882
1883         vpmuludq        $H3,$T1,$T4             # h3*s2
1884         vpmuludq        $H4,$T1,$H2             # h4*s2
1885         vpaddq          $T4,$D0,$D0             # d0 += h3*s2
1886         vpaddq          $H2,$D1,$D1             # d1 += h4*s2
1887          vmovdqu        `32*5+4-0x90`(%rax),$H2 # r3
1888         vpmuludq        $H1,$T2,$T4             # h1*r2
1889         vpmuludq        $H0,$T2,$T2             # h0*r2
1890         vpaddq          $T4,$D3,$D3             # d3 += h1*r2
1891         vpaddq          $T2,$D2,$D2             # d2 += h0*r2
1892
1893         vpmuludq        $H1,$H2,$T4             # h1*r3
1894         vpmuludq        $H0,$H2,$H2             # h0*r3
1895         vpaddq          $T4,$D4,$D4             # d4 += h1*r3
1896         vpaddq          $H2,$D3,$D3             # d3 += h0*r3
1897         vpmuludq        $H3,$T3,$T4             # h3*s3
1898         vpmuludq        $H4,$T3,$H2             # h4*s3
1899         vpaddq          $T4,$D1,$D1             # d1 += h3*s3
1900         vpaddq          $H2,$D2,$D2             # d2 += h4*s3
1901
1902         vpmuludq        $H3,$S4,$H3             # h3*s4
1903         vpmuludq        $H4,$S4,$H4             # h4*s4
1904         vpaddq          $H3,$D2,$H2             # h2 = d2 + h3*r4
1905         vpaddq          $H4,$D3,$H3             # h3 = d3 + h4*r4
1906         vpmuludq        `32*7+4-0x90`(%rax),$H0,$H4             # h0*r4
1907         vpmuludq        $H1,$S4,$H0             # h1*s4
1908         vmovdqa         64(%rcx),$MASK          # .Lmask26
1909         vpaddq          $H4,$D4,$H4             # h4 = d4 + h0*r4
1910         vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
1911
1912         ################################################################
1913         # horizontal addition
1914
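        # After the tail multiplication each register holds four 64-bit
        # partial sums of one hash limb, one per lane; they are folded
        # 4 -> 2 -> 1 by adding the high half of each 128-bit lane (vpsrldq)
        # and then the high 128-bit half of the register (vpermq), i.e.
        # per limb h = lane0 + lane1 + lane2 + lane3 (illustration only).
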
1915         vpsrldq         \$8,$D1,$T1
1916         vpsrldq         \$8,$H2,$T2
1917         vpsrldq         \$8,$H3,$T3
1918         vpsrldq         \$8,$H4,$T4
1919         vpsrldq         \$8,$H0,$T0
1920         vpaddq          $T1,$D1,$D1
1921         vpaddq          $T2,$H2,$H2
1922         vpaddq          $T3,$H3,$H3
1923         vpaddq          $T4,$H4,$H4
1924         vpaddq          $T0,$H0,$H0
1925
1926         vpermq          \$0x2,$H3,$T3
1927         vpermq          \$0x2,$H4,$T4
1928         vpermq          \$0x2,$H0,$T0
1929         vpermq          \$0x2,$D1,$T1
1930         vpermq          \$0x2,$H2,$T2
1931         vpaddq          $T3,$H3,$H3
1932         vpaddq          $T4,$H4,$H4
1933         vpaddq          $T0,$H0,$H0
1934         vpaddq          $T1,$D1,$D1
1935         vpaddq          $T2,$H2,$H2
1936
1937         ################################################################
1938         # lazy reduction
1939
1940         vpsrlq          \$26,$H3,$D3
1941         vpand           $MASK,$H3,$H3
1942         vpaddq          $D3,$H4,$H4             # h3 -> h4
1943
1944         vpsrlq          \$26,$H0,$D0
1945         vpand           $MASK,$H0,$H0
1946         vpaddq          $D0,$D1,$H1             # h0 -> h1
1947
1948         vpsrlq          \$26,$H4,$D4
1949         vpand           $MASK,$H4,$H4
1950
1951         vpsrlq          \$26,$H1,$D1
1952         vpand           $MASK,$H1,$H1
1953         vpaddq          $D1,$H2,$H2             # h1 -> h2
1954
1955         vpaddq          $D4,$H0,$H0
1956         vpsllq          \$2,$D4,$D4
1957         vpaddq          $D4,$H0,$H0             # h4 -> h0
1958
1959         vpsrlq          \$26,$H2,$D2
1960         vpand           $MASK,$H2,$H2
1961         vpaddq          $D2,$H3,$H3             # h2 -> h3
1962
1963         vpsrlq          \$26,$H0,$D0
1964         vpand           $MASK,$H0,$H0
1965         vpaddq          $D0,$H1,$H1             # h0 -> h1
1966
1967         vpsrlq          \$26,$H3,$D3
1968         vpand           $MASK,$H3,$H3
1969         vpaddq          $D3,$H4,$H4             # h3 -> h4
1970
1971         vmovd           %x#$H0,`4*0-48-64`($ctx)# save partially reduced
1972         vmovd           %x#$H1,`4*1-48-64`($ctx)
1973         vmovd           %x#$H2,`4*2-48-64`($ctx)
1974         vmovd           %x#$H3,`4*3-48-64`($ctx)
1975         vmovd           %x#$H4,`4*4-48-64`($ctx)
1976 ___
1977 $code.=<<___    if ($win64);
1978         vmovdqa         0x50(%r11),%xmm6
1979         vmovdqa         0x60(%r11),%xmm7
1980         vmovdqa         0x70(%r11),%xmm8
1981         vmovdqa         0x80(%r11),%xmm9
1982         vmovdqa         0x90(%r11),%xmm10
1983         vmovdqa         0xa0(%r11),%xmm11
1984         vmovdqa         0xb0(%r11),%xmm12
1985         vmovdqa         0xc0(%r11),%xmm13
1986         vmovdqa         0xd0(%r11),%xmm14
1987         vmovdqa         0xe0(%r11),%xmm15
1988         lea             0xf8(%r11),%rsp
1989 .Ldo_avx2_epilogue:
1990 ___
1991 $code.=<<___    if (!$win64);
1992         lea             8(%r11),%rsp
1993 ___
1994 $code.=<<___;
1995         vzeroupper
1996         ret
1997 .size   poly1305_blocks_avx2,.-poly1305_blocks_avx2
1998 ___
1999 }
2000 $code.=<<___;
2001 .align  64
2002 .Lconst:
2003 .Lmask24:
2004 .long   0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
2005 .L129:
2006 .long   `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
2007 .Lmask26:
2008 .long   0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
2009 .Lfive:
2010 .long   5,0,5,0,5,0,5,0
2011 ___
2012 }
2013
2014 $code.=<<___;
2015 .asciz  "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2016 .align  16
2017 ___
2018
2019 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2020 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
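# Both handlers below follow the usual perlasm pattern: if context->Rip lies
# between the prologue and epilogue labels recorded in HandlerData[], the
# non-volatile GPRs (se_handler) or xmm6-xmm15 located via context->R11
# (avx_handler) are recovered from the frame and written back into *context,
# after which RtlVirtualUnwind is invoked and ExceptionContinueSearch is
# returned.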
2021 if ($win64) {
2022 $rec="%rcx";
2023 $frame="%rdx";
2024 $context="%r8";
2025 $disp="%r9";
2026
2027 $code.=<<___;
2028 .extern __imp_RtlVirtualUnwind
2029 .type   se_handler,\@abi-omnipotent
2030 .align  16
2031 se_handler:
2032         push    %rsi
2033         push    %rdi
2034         push    %rbx
2035         push    %rbp
2036         push    %r12
2037         push    %r13
2038         push    %r14
2039         push    %r15
2040         pushfq
2041         sub     \$64,%rsp
2042
2043         mov     120($context),%rax      # pull context->Rax
2044         mov     248($context),%rbx      # pull context->Rip
2045
2046         mov     8($disp),%rsi           # disp->ImageBase
2047         mov     56($disp),%r11          # disp->HandlerData
2048
2049         mov     0(%r11),%r10d           # HandlerData[0]
2050         lea     (%rsi,%r10),%r10        # prologue label
2051         cmp     %r10,%rbx               # context->Rip<.Lprologue
2052         jb      .Lcommon_seh_tail
2053
2054         mov     152($context),%rax      # pull context->Rsp
2055
2056         mov     4(%r11),%r10d           # HandlerData[1]
2057         lea     (%rsi,%r10),%r10        # epilogue label
2058         cmp     %r10,%rbx               # context->Rip>=.Lepilogue
2059         jae     .Lcommon_seh_tail
2060
2061         lea     48(%rax),%rax
2062
2063         mov     -8(%rax),%rbx
2064         mov     -16(%rax),%rbp
2065         mov     -24(%rax),%r12
2066         mov     -32(%rax),%r13
2067         mov     -40(%rax),%r14
2068         mov     -48(%rax),%r15
2069         mov     %rbx,144($context)      # restore context->Rbx
2070         mov     %rbp,160($context)      # restore context->Rbp
2071         mov     %r12,216($context)      # restore context->R12
2072         mov     %r13,224($context)      # restore context->R13
2073         mov     %r14,232($context)      # restore context->R14
2074         mov     %r15,240($context)      # restore context->R15
2075
2076         jmp     .Lcommon_seh_tail
2077 .size   se_handler,.-se_handler
2078
2079 .type   avx_handler,\@abi-omnipotent
2080 .align  16
2081 avx_handler:
2082         push    %rsi
2083         push    %rdi
2084         push    %rbx
2085         push    %rbp
2086         push    %r12
2087         push    %r13
2088         push    %r14
2089         push    %r15
2090         pushfq
2091         sub     \$64,%rsp
2092
2093         mov     120($context),%rax      # pull context->Rax
2094         mov     248($context),%rbx      # pull context->Rip
2095
2096         mov     8($disp),%rsi           # disp->ImageBase
2097         mov     56($disp),%r11          # disp->HandlerData
2098
2099         mov     0(%r11),%r10d           # HandlerData[0]
2100         lea     (%rsi,%r10),%r10        # prologue label
2101         cmp     %r10,%rbx               # context->Rip<prologue label
2102         jb      .Lcommon_seh_tail
2103
2104         mov     152($context),%rax      # pull context->Rsp
2105
2106         mov     4(%r11),%r10d           # HandlerData[1]
2107         lea     (%rsi,%r10),%r10        # epilogue label
2108         cmp     %r10,%rbx               # context->Rip>=epilogue label
2109         jae     .Lcommon_seh_tail
2110
2111         mov     208($context),%rax      # pull context->R11
2112
2113         lea     0x50(%rax),%rsi
2114         lea     0xf8(%rax),%rax
2115         lea     512($context),%rdi      # &context.Xmm6
2116         mov     \$20,%ecx               # xmm6-xmm15: 20 quadwords
2117         .long   0xa548f3fc              # cld; rep movsq
2118
2119 .Lcommon_seh_tail:
2120         mov     8(%rax),%rdi
2121         mov     16(%rax),%rsi
2122         mov     %rax,152($context)      # restore context->Rsp
2123         mov     %rsi,168($context)      # restore context->Rsi
2124         mov     %rdi,176($context)      # restore context->Rdi
2125
2126         mov     40($disp),%rdi          # disp->ContextRecord
2127         mov     $context,%rsi           # context
2128         mov     \$154,%ecx              # sizeof(CONTEXT) in quadwords
2129         .long   0xa548f3fc              # cld; rep movsq
2130
2131         mov     $disp,%rsi
2132         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
2133         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
2134         mov     0(%rsi),%r8             # arg3, disp->ControlPc
2135         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
2136         mov     40(%rsi),%r10           # disp->ContextRecord
2137         lea     56(%rsi),%r11           # &disp->HandlerData
2138         lea     24(%rsi),%r12           # &disp->EstablisherFrame
2139         mov     %r10,32(%rsp)           # arg5
2140         mov     %r11,40(%rsp)           # arg6
2141         mov     %r12,48(%rsp)           # arg7
2142         mov     %rcx,56(%rsp)           # arg8, (NULL)
2143         call    *__imp_RtlVirtualUnwind(%rip)
2144
2145         mov     \$1,%eax                # ExceptionContinueSearch
2146         add     \$64,%rsp
2147         popfq
2148         pop     %r15
2149         pop     %r14
2150         pop     %r13
2151         pop     %r12
2152         pop     %rbp
2153         pop     %rbx
2154         pop     %rdi
2155         pop     %rsi
2156         ret
2157 .size   avx_handler,.-avx_handler
2158
2159 .section        .pdata
2160 .align  4
2161         .rva    .LSEH_begin_poly1305_init
2162         .rva    .LSEH_end_poly1305_init
2163         .rva    .LSEH_info_poly1305_init
2164
2165         .rva    .LSEH_begin_poly1305_blocks
2166         .rva    .LSEH_end_poly1305_blocks
2167         .rva    .LSEH_info_poly1305_blocks
2168
2169         .rva    .LSEH_begin_poly1305_emit
2170         .rva    .LSEH_end_poly1305_emit
2171         .rva    .LSEH_info_poly1305_emit
2172 ___
2173 $code.=<<___ if ($avx);
2174         .rva    .LSEH_begin_poly1305_blocks_avx
2175         .rva    .Lbase2_64_avx
2176         .rva    .LSEH_info_poly1305_blocks_avx_1
2177
2178         .rva    .Lbase2_64_avx
2179         .rva    .Leven_avx
2180         .rva    .LSEH_info_poly1305_blocks_avx_2
2181
2182         .rva    .Leven_avx
2183         .rva    .LSEH_end_poly1305_blocks_avx
2184         .rva    .LSEH_info_poly1305_blocks_avx_3
2185
2186         .rva    .LSEH_begin_poly1305_emit_avx
2187         .rva    .LSEH_end_poly1305_emit_avx
2188         .rva    .LSEH_info_poly1305_emit_avx
2189 ___
2190 $code.=<<___ if ($avx>1);
2191         .rva    .LSEH_begin_poly1305_blocks_avx2
2192         .rva    .Lbase2_64_avx2
2193         .rva    .LSEH_info_poly1305_blocks_avx2_1
2194
2195         .rva    .Lbase2_64_avx2
2196         .rva    .Leven_avx2
2197         .rva    .LSEH_info_poly1305_blocks_avx2_2
2198
2199         .rva    .Leven_avx2
2200         .rva    .LSEH_end_poly1305_blocks_avx2
2201         .rva    .LSEH_info_poly1305_blocks_avx2_3
2202 ___
2203 $code.=<<___;
2204 .section        .xdata
2205 .align  8
2206 .LSEH_info_poly1305_init:
2207         .byte   9,0,0,0
2208         .rva    se_handler
2209         .rva    .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
2210
2211 .LSEH_info_poly1305_blocks:
2212         .byte   9,0,0,0
2213         .rva    se_handler
2214         .rva    .Lblocks_body,.Lblocks_epilogue
2215
2216 .LSEH_info_poly1305_emit:
2217         .byte   9,0,0,0
2218         .rva    se_handler
2219         .rva    .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
2220 ___
2221 $code.=<<___ if ($avx);
2222 .LSEH_info_poly1305_blocks_avx_1:
2223         .byte   9,0,0,0
2224         .rva    se_handler
2225         .rva    .Lblocks_avx_body,.Lblocks_avx_epilogue         # HandlerData[]
2226
2227 .LSEH_info_poly1305_blocks_avx_2:
2228         .byte   9,0,0,0
2229         .rva    se_handler
2230         .rva    .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue     # HandlerData[]
2231
2232 .LSEH_info_poly1305_blocks_avx_3:
2233         .byte   9,0,0,0
2234         .rva    avx_handler
2235         .rva    .Ldo_avx_body,.Ldo_avx_epilogue                 # HandlerData[]
2236
2237 .LSEH_info_poly1305_emit_avx:
2238         .byte   9,0,0,0
2239         .rva    se_handler
2240         .rva    .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
2241 ___
2242 $code.=<<___ if ($avx>1);
2243 .LSEH_info_poly1305_blocks_avx2_1:
2244         .byte   9,0,0,0
2245         .rva    se_handler
2246         .rva    .Lblocks_avx2_body,.Lblocks_avx2_epilogue       # HandlerData[]
2247
2248 .LSEH_info_poly1305_blocks_avx2_2:
2249         .byte   9,0,0,0
2250         .rva    se_handler
2251         .rva    .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue   # HandlerData[]
2252
2253 .LSEH_info_poly1305_blocks_avx2_3:
2254         .byte   9,0,0,0
2255         .rva    avx_handler
2256         .rva    .Ldo_avx2_body,.Ldo_avx2_epilogue               # HandlerData[]
2257 ___
2258 }
2259
2260 foreach (split('\n',$code)) {
2261         s/\`([^\`]*)\`/eval($1)/ge;     # interpolate constant expressions
2262         s/%r([a-z]+)#d/%e$1/g;          # %rax#d -> %eax etc.
2263         s/%r([0-9]+)#d/%r$1d/g;         # %r8#d  -> %r8d etc.
2264         s/%x#%y/%x/g;                   # %x#%ymmN -> %xmmN
2265
2266         print $_,"\n";
2267 }
2268 close STDOUT;