# Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX2.
#
# To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
# 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
# other than A[0][0] in magic order into 6 [256-bit] registers, *each
# dedicated to one axis*, the Pi permutation is reduced to intra-register
# shuffles.
#
# It makes other steps more intricate, but overall, is it a win? To be
# more specific, the index permutations organized by quadruples are:
#
#	[4][4] [3][3] [2][2] [1][1]<-+
#	[0][4] [0][3] [0][2] [0][1]<-+
#	[3][0] [1][0] [4][0] [2][0]  |
#	[4][3] [3][1] [2][4] [1][2]  |
#	[3][4] [1][3] [4][2] [2][1]  |
#	[2][3] [4][1] [1][4] [3][2]  |
#	[2][2] [4][4] [1][1] [3][3] -+
#
# This, however, is highly impractical for Theta and Chi. What would help
# Theta is if the x indices were aligned column-wise, or in other words:
#
#	[0][4] [0][3] [0][2] [0][1]
#	[3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#	[2][4] [4][3] [1][2] [3][1]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#	[3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#	[1][4] [2][3] [3][2] [4][1]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#	[4][4] [3][3] [2][2] [1][1]
#
# So here we have it, lines not marked with vpermq() represent the magic
# order in which data is to be loaded and maintained. [And lines marked
# with vpermq() represent the Pi circular permutation in the chosen
# layout. Note that the first step is permutation-free.] A[0][0] is
# loaded to a register of its own, to all lanes. [A[0][0] is not part of
# the Pi permutation or Rho.]
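#
# For orientation: vpermq's 8-bit immediate picks source lanes two bits
# at a time, lowest pair first, and in the diagrams above lane 0 is the
# right-most coordinate pair. Below is a minimal, purely illustrative
# Perl model of that lane shuffle; it is not referenced by the generated
# code.

sub vpermq_lanes {			# illustration only
    my ($imm, @src) = @_;		# @src = (lane0, lane1, lane2, lane3)
    return map { $src[($imm >> 2*$_) & 3] } (0..3);
}
# e.g. vpermq_lanes(0b01110010, "[1][2]","[2][4]","[3][1]","[4][3]")
# gives ("[3][1]","[1][2]","[4][3]","[2][4]"), i.e. the
# [2][4] [4][3] [1][2] [3][1] line above read right to left.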
#
# Digits in variables' names denote right-most coordinates:

my ($A00,	# [0][0] [0][0] [0][0] [0][0]		# %ymm0
    $A01,	# [0][4] [0][3] [0][2] [0][1]		# %ymm1
    $A20,	# [3][0] [1][0] [4][0] [2][0]		# %ymm2
    $A31,	# [2][4] [4][3] [1][2] [3][1]		# %ymm3
    $A21,	# [3][4] [1][3] [4][2] [2][1]		# %ymm4
    $A41,	# [1][4] [2][3] [3][2] [4][1]		# %ymm5
    $A11) =	# [4][4] [3][3] [2][2] [1][1]		# %ymm6
    map("%ymm$_",(0..6));
# We also need to map the magic order into offsets within the structure:

my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],	# [0][0..4]
		[2,2], [6,0], [3,1], [4,2], [5,3],	# [1][0..4]
		[2,0], [4,0], [6,1], [5,2], [3,3],	# [2][0..4]
		[2,3], [3,0], [5,1], [6,2], [4,3],	# [3][0..4]
		[2,1], [5,0], [4,1], [3,2], [6,3]);	# [4][0..4]
   @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged);	# ... and now linear
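
# A hypothetical lookup (illustration only, not used by the generated
# code): the byte offset of logical A[$i][$j] within the above 7x4-lane
# image is $A_jagged[5*$i+$j]. For example A[2][3] sits in register 5,
# lane 2, i.e. at byte offset 8*(5*4+2) = 176.
sub A_jagged_offset { my ($i,$j) = @_; return $A_jagged[5*$i+$j]; }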

# But on the other hand Chi is much better off if the y indices were
# aligned column-wise, not x. For this reason we have to shuffle the data
# prior to Chi and revert it afterwards. The pre-Chi shuffle is naturally
# merged with the Pi permutation:
#
#	[0][4] [0][3] [0][2] [0][1]
#	[3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101
#	[3][1] [1][2] [4][3] [2][4]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101
#	[3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011
#	[3][2] [1][4] [4][1] [2][3]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010
#	[3][3] [1][1] [4][4] [2][2]
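#
# The paired vpermq lines above denote two successive lane permutations
# collapsed into the single immediate after the "=" sign. A hypothetical
# helper (illustration only, not used below) that performs that
# composition, with $first applied first:

sub vpermq_compose {
    my ($first, $second) = @_;
    my $imm = 0;
    for my $i (0..3) {
	my $j = ($second >> 2*$i) & 3;			# lane picked by the 2nd shuffle
	$imm |= (($first >> 2*$j) & 3) << 2*$i;		# chase it through the 1st
    }
    return $imm;
}
# e.g. vpermq_compose(0b01110010, 0b00011011) == 0b10001101, matching the
# first pair of vpermq lines above.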
#
# And the reverse post-Chi permutation:
#
#	[0][4] [0][3] [0][2] [0][1]
#	[3][0] [1][0] [4][0] [2][0]
#vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011)
#	[2][4] [4][3] [1][2] [3][1]
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-)
#	[3][4] [1][3] [4][2] [2][1]
#vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101)
#	[1][4] [2][3] [3][2] [4][1]
#vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010)
#	[4][4] [3][3] [2][2] [1][1]
########################################################################
# Numbers are cycles per processed byte out of a large message.
#
# (*)	Corresponds to SHA3-256. The percentage after the slash is the
#	improvement over the scalar keccak1600-x86_64.pl.
# (**)	Ryzen is expected to perform poorly here, because its issue rate
#	is limited to two AVX2 instructions per cycle and, in addition,
#	vpblendd is reportedly bound to a specific port. Obviously this
#	code path should not be executed on Ryzen.
my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];

.type	__KeccakF1600,\@function

	lea	rhotates_left+96(%rip),%r8
	lea	rhotates_right+96(%rip),%r9
	######################################### Theta
	vpshufd		\$0b01001110,$A20,$C00
	vpxor		$A41,$A21,@T[0]
	vpxor		@T[0],$C14,$C14		# C[1..4]
	vpermq		\$0b11111111,$C14,@T[3]
	vpermq		\$0b10010011,$C14,@T[4]
	vpermq		\$0b01001110,$C00,@T[0]

	vpsrlq		\$63,$C14,@T[1]
	vpaddq		$C14,$C14,@T[2]
	vpor		@T[2],@T[1],@T[1]	# ROL64(C[1..4],1)

	vpermq		\$0b00111001,@T[1],$D14
	vpxor		@T[3],@T[1],$D00
	vpxor		@T[0],$C00,$C00		# C[0..0]

	vpsrlq		\$63,$C00,@T[0]
	vpaddq		$C00,$C00,@T[1]
	vpor		@T[0],@T[1],@T[1]	# ROL64(C[0..0],1)

	vpermq		\$0b00000000,$D00,$D00	# D[0..0] = ROL64(C[1],1) ^ C[4]
	vpxor		$D00,$A20,$A20		# ^= D[0..0]
	vpxor		$D00,$A00,$A00		# ^= D[0..0]

	vpblendd	\$0b11000000,@T[1],$D14,$D14
	vpblendd	\$0b00000011,$C00,@T[4],@T[4]
	vpxor		@T[4],$D14,$D14		# D[1..4] = ROL64(C[2..4,0],1) ^ C[0..3]
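
	# For reference: at this point $C00 holds the column parity C[0]
	# (xor of the five lanes of column 0) in all lanes, $C14 holds
	# C[1..4], and $D00/$D14 hold D[x] = ROL64(C[x+1],1) ^ C[x-1], as
	# the annotations above spell out. D[0] has already been folded
	# into $A00 and $A20; D[1..4] is folded into the remaining
	# registers as part of the Rho step below.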
	######################################### Rho + Pi + pre-Chi shuffle
	vpsllvq		0*32-96(%r8),$A20,@T[0]
	vpsrlvq		0*32-96(%r9),$A20,$A20
	vpor		$A20,@T[0],@T[0]	# $A20

	vpxor		$D14,$A31,$A31		# ^= D[1..4]
	vpsllvq		2*32-96(%r8),$A31,@T[2]
	vpsrlvq		2*32-96(%r9),$A31,$A31
	vpor		$A31,@T[2],@T[2]	# $A31

	vpxor		$D14,$A21,$A21		# ^= D[1..4]
	vpsllvq		3*32-96(%r8),$A21,@T[3]
	vpsrlvq		3*32-96(%r9),$A21,$A21
	vpor		$A21,@T[3],@T[3]	# $A21

	vpermq		\$0b10001101,@T[0],$A31	# $A20 -> $A31
	vpermq		\$0b10001101,@T[2],$A21	# $A31 -> $A21
	vpxor		$D14,$A41,$A41		# ^= D[1..4]
	vpsllvq		4*32-96(%r8),$A41,@T[4]
	vpsrlvq		4*32-96(%r9),$A41,$A41

	vpxor		$D14,$A01,$A01		# ^= D[1..4]
	vpxor		$D14,$A11,@T[6]		# ^= D[1..4]
	vpsllvq		1*32-96(%r8),$A01,@T[1]
	vpsrlvq		1*32-96(%r9),$A01,$A01
	vpor		$A41,@T[4],@T[4]	# $A41
	vpor		@T[1],$A01,$A20		# $A01 -> $A20

	vpermq		\$0b00011011,@T[3],$A41	# $A21 -> $A41
	vpermq		\$0b01110010,@T[4],$A11	# $A41 -> $A11
	vpsllvq		5*32-96(%r8),@T[6],@T[5]
	vpsrlvq		5*32-96(%r9),@T[6],@T[6]
	vpor		@T[5],@T[6],$A01	# $A11 -> $A01

	######################################### Chi
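	# Chi combines every lane with its two neighbours along the second
	# coordinate: A[i][j] ^= ~A[i][j+1] & A[i][j+2] (indices mod 5).
	# The vpblendd sequences below gather the [j+1] and [j+2] operands
	# for four target lanes at a time, and vpandn supplies the "~x & y"
	# part, as spelled out by the "tgting" annotations.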
	vpsrldq		\$8,$A01,@T[0]
	vpandn		@T[0],$A01,@T[0]	# tgting  [0][0]

	vpermq		\$0b00111001,$A01,@T[1]	# [0][1] [0][4] [0][3] [0][2]
	vpermq		\$0b00011110,$A01,@T[8]	# [0][1] [0][2] [0][4] [0][3]
	vpblendd	\$0b11000000,$A00,@T[1],@T[1]	# [0][0] [0][4] [0][3] [0][2]
	vpblendd	\$0b00110000,$A00,@T[8],@T[8]	# [0][1] [0][0] [0][4] [0][3]
	vpxor		@T[0],$A00,$A00		# broadcasted below
	vpandn		@T[8],@T[1],@T[1]	# tgting  [0][4] [0][3] [0][2] [0][1]

	vpblendd	\$0b00001100,$A41,$A21, @T[2]	# [4][1] [2][1]
	vpblendd	\$0b00001100,$A21,$A11, @T[4]	# [4][2] [2][2]
	vpblendd	\$0b00110000,$A11,@T[2],@T[2]	# [1][1] [4][1] [2][1]
	vpblendd	\$0b00110000,$A31,@T[4],@T[4]	# [1][2] [4][2] [2][2]
	vpblendd	\$0b11000000,$A31,@T[2],@T[2]	# [3][1] [1][1] [4][1] [2][1]
	vpblendd	\$0b11000000,$A41,@T[4],@T[4]	# [3][2] [1][2] [4][2] [2][2]
	vpandn		@T[4],@T[2],@T[2]	# tgting  [3][0] [1][0] [4][0] [2][0]

	vpblendd	\$0b00001100,$A11,$A20, @T[3]	# [4][4] [2][0]
	vpblendd	\$0b00001100,$A20,$A21, @T[5]	# [4][0] [2][1]
	vpblendd	\$0b00110000,$A21,@T[3],@T[3]	# [1][3] [4][4] [2][0]
	vpblendd	\$0b00110000,$A41,@T[5],@T[5]	# [1][4] [4][0] [2][1]
	vpblendd	\$0b11000000,$A41,@T[3],@T[3]	# [3][2] [1][3] [4][4] [2][0]
	vpblendd	\$0b11000000,$A11,@T[5],@T[5]	# [3][3] [1][4] [4][0] [2][1]
	vpandn		@T[5],@T[3],@T[3]	# tgting  [3][1] [1][2] [4][3] [2][4]
	vpxor		$A31,@T[3],@T[3]

	vpblendd	\$0b00001100,$A21,$A31, @T[5]	# [4][2] [2][4]
	vpblendd	\$0b00001100,$A31,$A20, @T[6]	# [4][3] [2][0]
	vpblendd	\$0b00110000,$A20,@T[5],@T[5]	# [1][0] [4][2] [2][4]
	vpblendd	\$0b00110000,$A11,@T[6],@T[6]	# [1][1] [4][3] [2][0]
	vpblendd	\$0b11000000,$A11,@T[5],@T[5]	# [3][3] [1][0] [4][2] [2][4]
	vpblendd	\$0b11000000,$A21,@T[6],@T[6]	# [3][4] [1][1] [4][3] [2][0]
	vpandn		@T[6],@T[5],@T[5]	# tgting  [3][2] [1][4] [4][1] [2][3]
	vpxor		$A41,@T[5],@T[5]

	vpblendd	\$0b00001100,$A20,$A41, @T[6]	# [4][0] [2][3]
	vpblendd	\$0b00001100,$A41,$A31, @T[7]	# [4][1] [2][4]
	vpblendd	\$0b00110000,$A31,@T[6],@T[6]	# [1][2] [4][0] [2][3]
	vpblendd	\$0b00110000,$A21,@T[7],@T[7]	# [1][3] [4][1] [2][4]
	vpblendd	\$0b11000000,$A21,@T[6],@T[6]	# [3][4] [1][2] [4][0] [2][3]
	vpblendd	\$0b11000000,$A20,@T[7],@T[7]	# [3][0] [1][3] [4][1] [2][4]
	vpblendd	\$0b00001100,$A31,$A41, @T[4]	# [1][4] [4][3]
	vpblendd	\$0b11000000,$A31,$A41, @T[8]	# [3][1] [2][3]
	vpandn		@T[7],@T[6],@T[6]	# tgting  [3][3] [1][1] [4][4] [2][2]
	vpermq		\$0b00011011,@T[3],$A31		######### post-Chi shuffle
	vpermq		\$0b10001101,@T[5],$A41
	vpxor		$A11,@T[6],@T[6]
	vpermq		\$0b00000000,$A00,$A00	# broadcast A[0][0]

	vpblendd	\$0b00000011,$A11,@T[4],@T[4]	# [1][4] [4][3] [2][2]
	vpblendd	\$0b00001100,$A11,@T[8],@T[8]	# [3][1] [4][4] [2][3]
	vpermq		\$0b01110010,@T[6],$A11
	vpblendd	\$0b11000000,$A20,@T[4],@T[4]	# [3][0] [1][4] [4][3] [2][2]
	vpblendd	\$0b00110000,$A20,@T[8],@T[8]	# [3][1] [1][0] [4][4] [2][3]
	vpandn		@T[8],@T[4],@T[4]	# tgting  [3][4] [1][3] [4][2] [2][1]

	vpxor		@T[2],$A20,$A20
	vpxor		@T[1],$A01,$A01
	vpxor		@T[4],$A21,$A21

	######################################### Iota
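	# Iota: xor the round constant into A[0][0]. %r10 is assumed to
	# point at the current entry of the iotas table at the end of this
	# file, where every constant is replicated to all four lanes.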
	vpxor		(%r10),$A00,$A00

.size	__KeccakF1600,.-__KeccakF1600

my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my  $out = $inp;	# in squeeze
.type	SHA3_absorb,\@function

	lea	96($A_flat),$A_flat

	vpbroadcastq	-96($A_flat),$A00	# load A[0][0]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	vpxor		@T[0],@T[0],@T[0]
	vmovdqa		@T[0],32*2-96(%r10)	# zero transfer area on stack
	vmovdqa		@T[0],32*3-96(%r10)
	vmovdqa		@T[0],32*4-96(%r10)
	vmovdqa		@T[0],32*5-96(%r10)
	vmovdqa		@T[0],32*6-96(%r10)

	jc	.Ldone_absorb_avx2
	vpbroadcastq	0-96($inp),@T[0]
	vmovdqu		8-96($inp),@T[1]

for(my $i=5; $i<25; $i++) {
	mov	8*$i-96($inp),%r8
	mov	%r8,$A_jagged[$i]-96(%r10)

	vpxor		@T[0],$A00,$A00
	vpxor		@T[1],$A01,$A01
	vpxor		32*2-96(%r10),$A20,$A20
	vpxor		32*3-96(%r10),$A31,$A31
	vpxor		32*4-96(%r10),$A21,$A21
	vpxor		32*5-96(%r10),$A41,$A41
	vpxor		32*6-96(%r10),$A11,$A11

	jmp	.Loop_absorb_avx2
	vmovq		%xmm0,-96($A_flat)
	vmovdqu		$A01,8+32*0-96($A_flat)
	vmovdqu		$A20,8+32*1-96($A_flat)
	vmovdqu		$A31,8+32*2-96($A_flat)
	vmovdqu		$A21,8+32*3-96($A_flat)
	vmovdqu		$A41,8+32*4-96($A_flat)
	vmovdqu		$A11,8+32*5-96($A_flat)

	lea	($len,$bsz),%rax	# return value
.size	SHA3_absorb,.-SHA3_absorb
.type	SHA3_squeeze,\@function

	lea	96($A_flat),$A_flat

	vpbroadcastq	-96($A_flat),$A00
	vpxor		@T[0],@T[0],@T[0]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	mov	@A_jagged[$i]-96($A_flat),%r8

for (my $i=0; $i<25; $i++) {
	jc	.Ltail_squeeze_avx2
	je	.Ldone_squeeze_avx2
	je	.Lextend_output_avx2
	mov	@A_jagged[$i+1]-120($A_flat),%r8

.Lextend_output_avx2:
	vmovq		%xmm0,-96($A_flat)
	vmovdqu		$A01,8+32*0-96($A_flat)
	vmovdqu		$A20,8+32*1-96($A_flat)
	vmovdqu		$A31,8+32*2-96($A_flat)
	vmovdqu		$A21,8+32*3-96($A_flat)
	vmovdqu		$A41,8+32*4-96($A_flat)
	vmovdqu		$A11,8+32*5-96($A_flat)

	jmp	.Loop_squeeze_avx2
.size	SHA3_squeeze,.-SHA3_squeeze
	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]

	.quad	64-3,	64-18,	64-36,	64-41
	.quad	64-1,	64-62,	64-28,	64-27
	.quad	64-45,	64-6,	64-56,	64-39
	.quad	64-10,	64-61,	64-55,	64-8
	.quad	64-2,	64-15,	64-25,	64-20
	.quad	64-44,	64-43,	64-21,	64-14
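	# The right-rotate counts are simply 64 minus the corresponding
	# left-rotate counts, so each vpsllvq/vpsrlvq/vpor triplet above
	# amounts to a per-lane 64-bit rotate by the Rho offsets.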
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008
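	# The 24 standard Keccak-f[1600] round constants, each replicated
	# to all four 64-bit lanes so that Iota is a single 256-bit vpxor.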
.asciz	"Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>"