Optimize sha/asm/keccak1600-avx2.pl.
#!/usr/bin/env perl
# Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX2.
#
# July 2017.
#
# To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
# 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
# other than A[0][0] in magic order into 6 [256-bit] registers, *each
# dedicated to one axis*, the Pi permutation is reduced to intra-register
# shuffles...
#
# It makes other steps more intricate, but overall, is it a win? To be
# more specific, the index permutations, organized in quadruples, are:
#
#       [4][4] [3][3] [2][2] [1][1]<-+
#       [0][4] [0][3] [0][2] [0][1]<-+
#       [3][0] [1][0] [4][0] [2][0]  |
#       [4][3] [3][1] [2][4] [1][2]  |
#       [3][4] [1][3] [4][2] [2][1]  |
#       [2][3] [4][1] [1][4] [3][2]  |
#       [2][2] [4][4] [1][1] [3][3] -+
#
# This, however, is highly impractical for Theta and Chi. What would help
# Theta is having the x indices aligned column-wise, in other words:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#       [2][4] [4][3] [1][2] [3][1]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#       [1][4] [2][3] [3][2] [4][1]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#       [4][4] [3][3] [2][2] [1][1]
#
# So here we have it: lines not marked with vpermq() represent the magic
# order in which data is to be loaded and maintained. [Lines marked with
# vpermq() show the Pi circular permutation in the chosen layout. Note
# that the first step is permutation-free.] A[0][0] is loaded into a
# register of its own, broadcast to all lanes. [A[0][0] is not part of
# the Pi permutation or Rho.] Digits in variables' names denote the
# right-most coordinates:

my ($A00,       # [0][0] [0][0] [0][0] [0][0]           # %ymm0
    $A01,       # [0][4] [0][3] [0][2] [0][1]           # %ymm1
    $A20,       # [3][0] [1][0] [4][0] [2][0]           # %ymm2
    $A31,       # [2][4] [4][3] [1][2] [3][1]           # %ymm3
    $A21,       # [3][4] [1][3] [4][2] [2][1]           # %ymm4
    $A41,       # [1][4] [2][3] [3][2] [4][1]           # %ymm5
    $A11) =     # [4][4] [3][3] [2][2] [1][1]           # %ymm6
    map("%ymm$_",(0..6));
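
# For illustration only (not referenced by the generator below): a tiny model
# of vpermq's qword shuffle, handy for checking the Pi/layout claims in the
# comments above. Lanes are passed lane0-first, i.e. the right-most qword of
# the diagrams is the first element.
sub vpermq_model {
    my ($imm, @src) = @_;
    # destination lane j takes source lane ((imm >> 2*j) & 3)
    return map { $src[($imm >> 2*$_) & 3] } (0..3);
}
# E.g. vpermq_model(0b01110010, "[1][2]","[2][4]","[3][1]","[4][3]") returns
# ("[3][1]","[1][2]","[4][3]","[2][4]"), which read right-to-left is
# "[2][4] [4][3] [1][2] [3][1]", matching the diagram above.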

# We also need to map the magic order into offsets within the structure:

my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],      # [0][0..4]
                [2,2], [6,0], [3,1], [4,2], [5,3],      # [1][0..4]
                [2,0], [4,0], [6,1], [5,2], [3,3],      # [2][0..4]
                [2,3], [3,0], [5,1], [6,2], [4,3],      # [3][0..4]
                [2,1], [5,0], [4,1], [3,2], [6,3]);     # [4][0..4]
   @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged);     # ... and now linear
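
# For illustration only: element [i][j] of the diagrams above is looked up at
# linear index 5*i+j, and each entry is a byte offset (32*row + 8*lane) into
# the 7x32-byte jagged area, e.g.
#
#       $A_jagged[1*5+2]        # [1][2] -> row 3, lane 1 -> 8*(3*4+1) == 104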

# But on the other hand Chi is much better off if y indices were aligned
# column-wise, not x. For this reason we have to shuffle data prior to
# Chi and revert it afterwards. The pre-Chi shuffle is naturally merged
# with Pi itself:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101
#       [3][1] [1][2] [4][3] [2][4]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011
#       [3][2] [1][4] [4][1] [2][3]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010
#       [3][3] [1][1] [4][4] [2][2]
#
# And the reverse, post-Chi permutation:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011)
#       [2][4] [4][3] [1][2] [3][1]
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-)
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101)
#       [1][4] [2][3] [3][2] [4][1]
#vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010)
#       [4][4] [3][3] [2][2] [1][1]
#
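
# For illustration only (not referenced below): folding two back-to-back
# vpermq immediates into a single one, which is how the constants on the
# "=" lines above were derived.
sub vpermq_compose {
    my ($first, $second) = @_;  # $second is applied to the result of $first
    my $imm = 0;
    for my $j (0..3) {
        my $mid = ($second >> 2*$j) & 3;                # lane picked by the 2nd shuffle
        $imm |= (($first >> 2*$mid) & 3) << 2*$j;       # ... traced through the 1st
    }
    return $imm;
}
# E.g. vpermq_compose(0b01110010, 0b00011011) == 0b10001101, as claimed above.
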
########################################################################
# Numbers are cycles per processed byte out of a large message.
#
#                       r=1088(*)
#
# Haswell               8.9/+8%
# Skylake               7.9/+19%
# Ryzen                 17(**)
#
# (*)   Corresponds to SHA3-256. The percentage after the slash is the
#       improvement over the scalar keccak1600-x86_64.pl.
# (**)  Ryzen is expected to perform poorly here, because its instruction
#       issue rate is limited to two AVX2 instructions per cycle and, in
#       addition, vpblendd is reportedly bound to a specific port.
#       Obviously this code path should not be executed on Ryzen.

my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];
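# $C00/$C14 accumulate the Theta column parities C[0] and C[1..4]; $D00/$D14
# hold the corresponding D values that get applied back to the state.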

$code.=<<___;
.text

.type   __KeccakF1600,\@function
.align  32
__KeccakF1600:
        lea             rhotates_left+96(%rip),%r8
        lea             rhotates_right+96(%rip),%r9
        lea             iotas(%rip),%r10
        mov             \$24,%eax
        jmp             .Loop_avx2

.align  32
.Loop_avx2:
        ######################################### Theta
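        # C[x] = A[0][x]^A[1][x]^A[2][x]^A[3][x]^A[4][x],
        # D[x] = C[x-1] ^ ROL64(C[x+1],1), then A[y][x] ^= D[x];
        # C[1..4] are computed in one register, C[0] separately (broadcast).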
        vpshufd         \$0b01001110,$A20,$C00
        vpxor           $A31,$A01,$C14
        vpxor           $A41,$A21,@T[0]
        vpxor           $A11,$C14,$C14
        vpxor           @T[0],$C14,$C14         # C[1..4]

        vpermq          \$0b11111111,$C14,@T[3]
        vpermq          \$0b10010011,$C14,@T[4]

        vpxor           $A20,$C00,$C00
        vpermq          \$0b01001110,$C00,@T[0]

        vpsrlq          \$63,$C14,@T[1]
        vpaddq          $C14,$C14,@T[2]
        vpor            @T[2],@T[1],@T[1]       # ROL64(C[1..4],1)

        vpermq          \$0b00111001,@T[1],$D14
        vpxor           @T[3],@T[1],$D00

        vpxor           $A00,$C00,$C00
        vpxor           @T[0],$C00,$C00         # C[0..0]

        vpsrlq          \$63,$C00,@T[0]
        vpaddq          $C00,$C00,@T[1]
        vpor            @T[0],@T[1],@T[1]       # ROL64(C[0..0],1)

        vpermq          \$0b00000000,$D00,$D00  # D[0..0] = ROL64(C[1],1) ^ C[4]
        vpxor           $D00,$A20,$A20          # ^= D[0..0]
        vpxor           $D00,$A00,$A00          # ^= D[0..0]

        vpblendd        \$0b11000000,@T[1],$D14,$D14
        vpblendd        \$0b00000011,$C00,@T[4],@T[4]
        vpxor           @T[4],$D14,$D14         # D[1..4] = ROL64(C[2..4,0],1) ^ C[0..3]

        ######################################### Rho + Pi + pre-Chi shuffle
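        # ROL64 by the per-lane Rho offsets is done as a variable-shift pair
        # (vpsllvq by n, vpsrlvq by 64-n, from the rhotates tables below),
        # and vpermq then moves lanes into the pre-Chi layout derived above.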
        vpsllvq         0*32-96(%r8),$A20,@T[0]
        vpsrlvq         0*32-96(%r9),$A20,$A20
        vpor            $A20,@T[0],@T[0]        # $A20

         vpxor          $D14,$A31,$A31          # ^= D[1..4]
        vpsllvq         2*32-96(%r8),$A31,@T[2]
        vpsrlvq         2*32-96(%r9),$A31,$A31
        vpor            $A31,@T[2],@T[2]        # $A31

         vpxor          $D14,$A21,$A21          # ^= D[1..4]
        vpsllvq         3*32-96(%r8),$A21,@T[3]
        vpsrlvq         3*32-96(%r9),$A21,$A21
        vpor            $A21,@T[3],@T[3]        # $A21

         vpermq         \$0b10001101,@T[0],$A31 # $A20 -> $A31
         vpermq         \$0b10001101,@T[2],$A21 # $A31 -> $A21
         vpxor          $D14,$A41,$A41          # ^= D[1..4]
        vpsllvq         4*32-96(%r8),$A41,@T[4]
        vpsrlvq         4*32-96(%r9),$A41,$A41

         vpxor          $D14,$A01,$A01          # ^= D[1..4]
         vpxor          $D14,$A11,@T[6]         # ^= D[1..4]
        vpsllvq         1*32-96(%r8),$A01,@T[1]
        vpsrlvq         1*32-96(%r9),$A01,$A01
        vpor            $A41,@T[4],@T[4]        # $A41
        vpor            @T[1],$A01,$A20         # $A01 -> $A20

         vpermq         \$0b00011011,@T[3],$A41 # $A21 -> $A41
         vpermq         \$0b01110010,@T[4],$A11 # $A41 -> $A11
        vpsllvq         5*32-96(%r8),@T[6],@T[5]
        vpsrlvq         5*32-96(%r9),@T[6],@T[6]
        vpor            @T[5],@T[6],$A01        # $A11 -> $A01

        ######################################### Chi
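        # A[y][x] ^= ~A[y][x+1] & A[y][x+2]; the vpblendd sequences below
        # gather the "x+1" and "x+2" neighbours of each quadruple so that a
        # single vpandn per row performs the non-linear part.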
        vpsrldq         \$8,$A01,@T[0]
        vpandn          @T[0],$A01,@T[0]        # targeting [0][0]

        vpermq          \$0b00111001,$A01,@T[1]         # [0][1] [0][4] [0][3] [0][2]
        vpermq          \$0b00011110,$A01,@T[8]         # [0][1] [0][2] [0][4] [0][3]
        vpblendd        \$0b11000000,$A00,@T[1],@T[1]   # [0][0] [0][4] [0][3] [0][2]
        vpblendd        \$0b00110000,$A00,@T[8],@T[8]   # [0][1] [0][0] [0][4] [0][3]
        vpxor           @T[0],$A00,$A00         # broadcasted below
        vpandn          @T[8],@T[1],@T[1]       # targeting [0][4] [0][3] [0][2] [0][1]

        vpblendd        \$0b00001100,$A41,$A21, @T[2]   #               [4][1] [2][1]
        vpblendd        \$0b00001100,$A21,$A11, @T[4]   #               [4][2] [2][2]
        vpblendd        \$0b00110000,$A11,@T[2],@T[2]   #        [1][1] [4][1] [2][1]
        vpblendd        \$0b00110000,$A31,@T[4],@T[4]   #        [1][2] [4][2] [2][2]
        vpblendd        \$0b11000000,$A31,@T[2],@T[2]   # [3][1] [1][1] [4][1] [2][1]
        vpblendd        \$0b11000000,$A41,@T[4],@T[4]   # [3][2] [1][2] [4][2] [2][2]
        vpandn          @T[4],@T[2],@T[2]       # targeting [3][0] [1][0] [4][0] [2][0]

        vpblendd        \$0b00001100,$A11,$A20, @T[3]   #               [4][4] [2][0]
        vpblendd        \$0b00001100,$A20,$A21, @T[5]   #               [4][0] [2][1]
        vpblendd        \$0b00110000,$A21,@T[3],@T[3]   #        [1][3] [4][4] [2][0]
        vpblendd        \$0b00110000,$A41,@T[5],@T[5]   #        [1][4] [4][0] [2][1]
        vpblendd        \$0b11000000,$A41,@T[3],@T[3]   # [3][2] [1][3] [4][4] [2][0]
        vpblendd        \$0b11000000,$A11,@T[5],@T[5]   # [3][3] [1][4] [4][0] [2][1]
        vpandn          @T[5],@T[3],@T[3]       # targeting [3][1] [1][2] [4][3] [2][4]
        vpxor           $A31,@T[3],@T[3]

        vpblendd        \$0b00001100,$A21,$A31, @T[5]   #               [4][2] [2][4]
        vpblendd        \$0b00001100,$A31,$A20, @T[6]   #               [4][3] [2][0]
        vpblendd        \$0b00110000,$A20,@T[5],@T[5]   #        [1][0] [4][2] [2][4]
        vpblendd        \$0b00110000,$A11,@T[6],@T[6]   #        [1][1] [4][3] [2][0]
        vpblendd        \$0b11000000,$A11,@T[5],@T[5]   # [3][3] [1][0] [4][2] [2][4]
        vpblendd        \$0b11000000,$A21,@T[6],@T[6]   # [3][4] [1][1] [4][3] [2][0]
        vpandn          @T[6],@T[5],@T[5]       # targeting [3][2] [1][4] [4][1] [2][3]
        vpxor           $A41,@T[5],@T[5]

        vpblendd        \$0b00001100,$A20,$A41, @T[6]   #               [4][0] [2][3]
        vpblendd        \$0b00001100,$A41,$A31, @T[7]   #               [4][1] [2][4]
        vpblendd        \$0b00110000,$A31,@T[6],@T[6]   #        [1][2] [4][0] [2][3]
        vpblendd        \$0b00110000,$A21,@T[7],@T[7]   #        [1][3] [4][1] [2][4]
        vpblendd        \$0b11000000,$A21,@T[6],@T[6]   # [3][4] [1][2] [4][0] [2][3]
        vpblendd        \$0b11000000,$A20,@T[7],@T[7]   # [3][0] [1][3] [4][1] [2][4]
        vpblendd        \$0b00001100,$A31,$A41, @T[4]   #        [1][4] [4][3]
        vpblendd        \$0b11000000,$A31,$A41, @T[8]   # [3][1]               [2][3]
        vpandn          @T[7],@T[6],@T[6]       # targeting [3][3] [1][1] [4][4] [2][2]
         vpermq         \$0b00011011,@T[3],$A31 ######### post-Chi shuffle
         vpermq         \$0b10001101,@T[5],$A41
        vpxor           $A11,@T[6],@T[6]
         vpermq         \$0b00000000,$A00,$A00  # broadcast A[0][0]

        vpblendd        \$0b00000011,$A11,@T[4],@T[4]   #        [1][4] [4][3] [2][2]
        vpblendd        \$0b00001100,$A11,@T[8],@T[8]   # [3][1]        [4][4] [2][3]
         vpermq         \$0b01110010,@T[6],$A11
        vpblendd        \$0b11000000,$A20,@T[4],@T[4]   # [3][0] [1][4] [4][3] [2][2]
        vpblendd        \$0b00110000,$A20,@T[8],@T[8]   # [3][1] [1][0] [4][4] [2][3]
        vpandn          @T[8],@T[4],@T[4]       # targeting [3][4] [1][3] [4][2] [2][1]

        vpxor           @T[2],$A20,$A20
        vpxor           @T[1],$A01,$A01
        vpxor           @T[4],$A21,$A21

        ######################################### Iota
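        # Only A[0][0] absorbs the round constant; the iotas table below has
        # each constant replicated across all four lanes, so a plain vpxor
        # works on the broadcast $A00.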
        vpxor           (%r10),$A00,$A00
        lea             32(%r10),%r10

        dec             %eax
        jnz             .Loop_avx2

        ret
.size   __KeccakF1600,.-__KeccakF1600
___
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my  $out = $inp;        # in squeeze
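
# SysV AMD64 arguments: $A_flat points at the 25-lane (5x5) state, $inp
# (aliased by $out in squeeze) at the data, $len is the byte count and $bsz
# the rate in bytes. SHA3_absorb returns, in %rax, the number of trailing
# input bytes that did not fill a whole block; SHA3_squeeze writes $len
# output bytes through $out.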

$code.=<<___;
.globl  SHA3_absorb
.type   SHA3_absorb,\@function
.align  32
SHA3_absorb:
        mov     %rsp,%r11

        lea     -240(%rsp),%rsp
        and     \$-32,%rsp

        lea     96($A_flat),$A_flat
        lea     96($inp),$inp
        lea     96(%rsp),%r10

        vzeroupper

        vpbroadcastq    -96($A_flat),$A00       # load and broadcast A[0][0]
        vmovdqu         8+32*0-96($A_flat),$A01
        vmovdqu         8+32*1-96($A_flat),$A20
        vmovdqu         8+32*2-96($A_flat),$A31
        vmovdqu         8+32*3-96($A_flat),$A21
        vmovdqu         8+32*4-96($A_flat),$A41
        vmovdqu         8+32*5-96($A_flat),$A11

        vpxor           @T[0],@T[0],@T[0]
        vmovdqa         @T[0],32*2-96(%r10)     # zero transfer area on stack
        vmovdqa         @T[0],32*3-96(%r10)
        vmovdqa         @T[0],32*4-96(%r10)
        vmovdqa         @T[0],32*5-96(%r10)
        vmovdqa         @T[0],32*6-96(%r10)
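        # Only the lanes covered by the rate are scattered into this area by
        # .Loop_absorb_avx2; the remainder must stay zero so the full-width
        # vpxor of each row below leaves the remaining state words unchanged.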

.Loop_absorb_avx2:
        mov             $bsz,%rax
        sub             $bsz,$len
        jc              .Ldone_absorb_avx2

        shr             \$3,%eax
        vpbroadcastq    0-96($inp),@T[0]
        vmovdqu         8-96($inp),@T[1]
        sub             \$4,%eax
___
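# Scatter the remaining input lanes (5 .. bsz/8-1) of the block into the
# jagged transfer area. The loop below emits all 20 possible moves unrolled;
# at run time the countdown in %eax stops the sequence at the actual rate.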
for(my $i=5; $i<25; $i++) {
$code.=<<___
        dec     %eax
        jz      .Labsorved_avx2
        mov     8*$i-96($inp),%r8
        mov     %r8,$A_jagged[$i]-96(%r10)
___
}
$code.=<<___;
.Labsorved_avx2:
        lea     ($inp,$bsz),$inp

        vpxor   @T[0],$A00,$A00
        vpxor   @T[1],$A01,$A01
        vpxor   32*2-96(%r10),$A20,$A20
        vpxor   32*3-96(%r10),$A31,$A31
        vpxor   32*4-96(%r10),$A21,$A21
        vpxor   32*5-96(%r10),$A41,$A41
        vpxor   32*6-96(%r10),$A11,$A11

        call    __KeccakF1600

        lea     96(%rsp),%r10
        jmp     .Loop_absorb_avx2

.Ldone_absorb_avx2:
        vmovq   %xmm0,-96($A_flat)
        vmovdqu $A01,8+32*0-96($A_flat)
        vmovdqu $A20,8+32*1-96($A_flat)
        vmovdqu $A31,8+32*2-96($A_flat)
        vmovdqu $A21,8+32*3-96($A_flat)
        vmovdqu $A41,8+32*4-96($A_flat)
        vmovdqu $A11,8+32*5-96($A_flat)

        vzeroupper

        lea     (%r11),%rsp
        lea     ($len,$bsz),%rax                # return value
        ret
.size   SHA3_absorb,.-SHA3_absorb

.globl  SHA3_squeeze
.type   SHA3_squeeze,\@function
.align  32
SHA3_squeeze:
        mov     %rsp,%r11

        lea     96($A_flat),$A_flat
        shr     \$3,$bsz

        vzeroupper

        vpbroadcastq    -96($A_flat),$A00
        vpxor           @T[0],@T[0],@T[0]
        vmovdqu         8+32*0-96($A_flat),$A01
        vmovdqu         8+32*1-96($A_flat),$A20
        vmovdqu         8+32*2-96($A_flat),$A31
        vmovdqu         8+32*3-96($A_flat),$A21
        vmovdqu         8+32*4-96($A_flat),$A41
        vmovdqu         8+32*5-96($A_flat),$A11

        mov     $bsz,%rax

.Loop_squeeze_avx2:
        mov     @A_jagged[0]-96($A_flat),%r8
___
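# Unrolled gather of up to 25 lanes from the state kept in register order:
# rows 1..6 are stored at 8+32*(row-1)-96, i.e. at their $A_jagged[] offset
# minus 120, while A[0][0] alone sits at -96, hence the different bias on
# the initial load above. The sequence stops when either the requested
# output length or the rate (%eax, in lanes) runs out.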
for (my $i=0; $i<25; $i++) {
$code.=<<___;
        sub     \$8,$len
        jc      .Ltail_squeeze_avx2
        mov     %r8,($out)
        lea     8($out),$out
        je      .Ldone_squeeze_avx2
        dec     %eax
        je      .Lextend_output_avx2
        mov     @A_jagged[$i+1]-120($A_flat),%r8
___
}
$code.=<<___;
.Lextend_output_avx2:
        call    __KeccakF1600

        vmovq   %xmm0,-96($A_flat)
        vmovdqu $A01,8+32*0-96($A_flat)
        vmovdqu $A20,8+32*1-96($A_flat)
        vmovdqu $A31,8+32*2-96($A_flat)
        vmovdqu $A21,8+32*3-96($A_flat)
        vmovdqu $A41,8+32*4-96($A_flat)
        vmovdqu $A11,8+32*5-96($A_flat)

        mov     $bsz,%rax
        jmp     .Loop_squeeze_avx2

.Ltail_squeeze_avx2:
        add     \$8,$len
.Loop_tail_avx2:
        mov     %r8b,($out)
        lea     1($out),$out
        shr     \$8,%r8
        dec     $len
        jnz     .Loop_tail_avx2

.Ldone_squeeze_avx2:
        vzeroupper

        lea     (%r11),%rsp
        ret
.size   SHA3_squeeze,.-SHA3_squeeze

.align  64
rhotates_left:
        .quad   3,      18,     36,     41      # [2][0] [4][0] [1][0] [3][0]
        .quad   1,      62,     28,     27      # [0][1] [0][2] [0][3] [0][4]
        .quad   45,     6,      56,     39      # [3][1] [1][2] [4][3] [2][4]
        .quad   10,     61,     55,     8       # [2][1] [4][2] [1][3] [3][4]
        .quad   2,      15,     25,     20      # [4][1] [3][2] [2][3] [1][4]
        .quad   44,     43,     21,     14      # [1][1] [2][2] [3][3] [4][4]
rhotates_right:
        .quad   64-3,   64-18,  64-36,  64-41
        .quad   64-1,   64-62,  64-28,  64-27
        .quad   64-45,  64-6,   64-56,  64-39
        .quad   64-10,  64-61,  64-55,  64-8
        .quad   64-2,   64-15,  64-25,  64-20
        .quad   64-44,  64-43,  64-21,  64-14
iotas:
        .quad   0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
        .quad   0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
        .quad   0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
        .quad   0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
        .quad   0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
        .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
        .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
        .quad   0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
        .quad   0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
        .quad   0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
        .quad   0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
        .quad   0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
        .quad   0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
        .quad   0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
        .quad   0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
        .quad   0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
        .quad   0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
        .quad   0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
        .quad   0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
        .quad   0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
        .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
        .quad   0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
        .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
        .quad   0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz  "Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>"
___

print $code;
close STDOUT;