2 # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for x86_64.
20 # Below code is [lane complementing] KECCAK_2X implementation (see
21 # sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
22 # instead of actually unrolling the loop pair-wise I simply flip
23 # pointers to T[][] and A[][] at the end of round. Since number of
24 # rounds is even, last round writes to A[][] and everything works out.
25 # How does it compare to x86_64 assembly module in Keccak Code Package?
26 # Depending on processor it's either as fast or faster by up to 15%...
28 ########################################################################
29 # Numbers are cycles per processed byte out of large message.
36 # Sandy Bridge 12.9(**)
46 # (*) Corresponds to SHA3-256. Improvement over compiler-generated
47 # code varies a lot; the most common coefficient is 15% in comparison to
48 # gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
49 # (**) Sandy Bridge has broken rotate instruction. Performance can be
50 # improved by 14% by replacing rotates with double-precision
51 # shift with same register as source and destination.
# If the "flavour" argument actually carries the output filename (it
# contains a dot), treat it as $output instead.  NOTE(review): $flavour
# and $output are presumably shifted off @ARGV before this point — confirm
# against the full file.
55 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 calling convention is selected by a nasm/masm/mingw64 flavour or
# by an .asm output filename.
57 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the x86_64-xlate.pl translator either next to this script or in
# the sibling perlasm directory; it converts the perlasm emitted below
# into the requested assembler dialect.
59 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
60 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
61 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
62 die "can't locate x86_64-xlate.pl";
# Pipe everything printed to OUT through the translator into $output.
64 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
# @A[y][x] holds the byte offset of Keccak lane A[y][x] (25 x 64-bit
# lanes, row-major) relative to a state pointer biased by +100 — the bias
# lets the whole 200-byte state be addressed with short single-byte
# displacements (see the "lea 100(%rdi),%rdi # size optimization" below).
67 my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
68 8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
# C[5] and D[5] of the theta step are kept entirely in registers;
# @T holds two scratch registers.
70 my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
71 my @D = map("%r$_",(8..12));
72 my @T = map("%r$_",(13..14));
# Per-lane left-rotation amounts for the rho step of Keccak-f[1600],
# indexed as $rhotates[y][x] to match @A above.
75 my @rhotates = ([ 0, 1, 62, 28, 27 ],
76 [ 36, 44, 6, 55, 20 ],
77 [ 3, 10, 43, 25, 39 ],
78 [ 41, 45, 15, 21, 8 ],
79 [ 18, 2, 61, 56, 14 ]);
84 .type __KeccakF1600,\@abi-omnipotent
87 mov $A[4][0](%rdi),@C[0]
88 mov $A[4][1](%rdi),@C[1]
89 mov $A[4][2](%rdi),@C[2]
90 mov $A[4][3](%rdi),@C[3]
91 mov $A[4][4](%rdi),@C[4]
96 mov $A[0][0](%rdi),@D[0]
97 mov $A[1][1](%rdi),@D[1]
98 mov $A[2][2](%rdi),@D[2]
99 mov $A[3][3](%rdi),@D[3]
101 xor $A[0][2](%rdi),@C[2]
102 xor $A[0][3](%rdi),@C[3]
104 xor $A[0][1](%rdi),@C[1]
105 xor $A[1][2](%rdi),@C[2]
106 xor $A[1][0](%rdi),@C[0]
108 xor $A[0][4](%rdi),@C[4]
111 xor $A[2][0](%rdi),@C[0]
112 xor $A[1][3](%rdi),@C[3]
114 xor $A[1][4](%rdi),@C[4]
116 xor $A[3][2](%rdi),@C[2]
117 xor $A[3][0](%rdi),@C[0]
118 xor $A[2][3](%rdi),@C[3]
119 xor $A[2][1](%rdi),@C[1]
120 xor $A[2][4](%rdi),@C[4]
124 xor @C[0],@C[2] # D[1] = ROL64(C[2], 1) ^ C[0]
128 xor @C[3],@C[0] # D[4] = ROL64(C[0], 1) ^ C[3]
129 xor $A[3][1](%rdi),@C[1]
132 xor @C[1],@C[3] # D[2] = ROL64(C[3], 1) ^ C[1]
133 xor $A[3][4](%rdi),@C[4]
136 xor @C[4],@C[1] # D[0] = ROL64(C[1], 1) ^ C[4]
139 xor @T[0],@C[4] # D[3] = ROL64(C[4], 1) ^ C[2]
141 (@D[0..4], @C) = (@C[1..4,0], @D);
145 rol \$$rhotates[1][1],@C[1]
148 rol \$$rhotates[2][2],@C[2]
151 rol \$$rhotates[3][3],@C[3]
153 xor @C[0],@C[1] # C[0] ^ ( C[1] | C[2])
154 rol \$$rhotates[4][4],@C[4]
161 mov @C[1],$A[0][0](%rsi) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
162 xor @C[2],@C[4] # C[2] ^ ( C[4] & C[3])
164 mov @C[4],$A[0][2](%rsi) # R[0][2] = C[2] ^ ( C[4] & C[3])
167 mov $A[4][2](%rdi),@C[4]
168 xor @T[0],@C[2] # C[1] ^ (~C[2] | C[3])
169 mov @C[2],$A[0][1](%rsi) # R[0][1] = C[1] ^ (~C[2] | C[3])
172 mov $A[1][4](%rdi),@C[1]
173 xor @T[1],@T[0] # C[4] ^ ( C[1] & C[0])
174 mov $A[2][0](%rdi),@C[2]
175 mov @T[0],$A[0][4](%rsi) # R[0][4] = C[4] ^ ( C[1] & C[0])
178 mov $A[0][3](%rdi),@C[0]
179 xor @C[3],@T[1] # C[3] ^ ( C[4] | C[0])
180 mov $A[3][1](%rdi),@C[3]
181 mov @T[1],$A[0][3](%rsi) # R[0][3] = C[3] ^ ( C[4] | C[0])
186 rol \$$rhotates[0][3],@C[0]
189 rol \$$rhotates[4][2],@C[4]
190 rol \$$rhotates[3][1],@C[3]
192 rol \$$rhotates[1][4],@C[1]
195 rol \$$rhotates[2][0],@C[2]
197 xor @C[3],@C[0] # C[3] ^ (C[0] | C[4])
198 mov @C[0],$A[1][3](%rsi) # R[1][3] = C[3] ^ (C[0] | C[4])
202 mov $A[0][1](%rdi),@C[0]
203 xor @C[4],@C[1] # C[4] ^ (C[1] & C[0])
205 mov @C[1],$A[1][4](%rsi) # R[1][4] = C[4] ^ (C[1] & C[0])
208 mov $A[1][2](%rdi),@C[1]
209 xor @C[2],@C[4] # C[2] ^ (~C[4] | C[3])
210 mov @C[4],$A[1][2](%rsi) # R[1][2] = C[2] ^ (~C[4] | C[3])
213 mov $A[4][0](%rdi),@C[4]
214 xor @T[1],@C[3] # C[1] ^ (C[3] & C[2])
215 mov @C[3],$A[1][1](%rsi) # R[1][1] = C[1] ^ (C[3] & C[2])
218 mov $A[2][3](%rdi),@C[2]
219 xor @T[0],@T[1] # C[0] ^ (C[1] | C[2])
220 mov $A[3][4](%rdi),@C[3]
221 mov @T[1],$A[1][0](%rsi) # R[1][0] = C[0] ^ (C[1] | C[2])
226 rol \$$rhotates[2][3],@C[2]
228 rol \$$rhotates[3][4],@C[3]
230 rol \$$rhotates[1][2],@C[1]
232 rol \$$rhotates[4][0],@C[4]
235 rol \$$rhotates[0][1],@C[0]
238 xor @C[1],@C[2] # C[1] ^ ( C[2] & C[3])
239 mov @C[2],$A[2][1](%rsi) # R[2][1] = C[1] ^ ( C[2] & C[3])
243 mov $A[2][1](%rdi),@C[2]
244 xor @T[0],@C[4] # C[2] ^ ( C[4] & ~C[3])
245 mov @C[4],$A[2][2](%rsi) # R[2][2] = C[2] ^ ( C[4] & ~C[3])
248 mov $A[4][3](%rdi),@C[4]
249 xor @C[0],@T[0] # C[0] ^ ( C[2] | C[1])
250 mov @T[0],$A[2][0](%rsi) # R[2][0] = C[0] ^ ( C[2] | C[1])
253 xor @T[1],@C[1] # C[4] ^ ( C[1] & C[0])
254 mov @C[1],$A[2][4](%rsi) # R[2][4] = C[4] ^ ( C[1] & C[0])
257 mov $A[1][0](%rdi),@C[1]
258 xor @C[3],@T[1] # ~C[3] ^ ( C[0] | C[4])
259 mov $A[3][2](%rdi),@C[3]
260 mov @T[1],$A[2][3](%rsi) # R[2][3] = ~C[3] ^ ( C[0] | C[4])
263 mov $A[0][4](%rdi),@C[0]
267 rol \$$rhotates[2][1],@C[2]
269 rol \$$rhotates[3][2],@C[3]
271 rol \$$rhotates[1][0],@C[1]
273 rol \$$rhotates[4][3],@C[4]
276 rol \$$rhotates[0][4],@C[0]
279 xor @C[1],@C[2] # C[1] ^ ( C[2] | C[3])
280 mov @C[2],$A[3][1](%rsi) # R[3][1] = C[1] ^ ( C[2] | C[3])
284 xor @T[0],@C[4] # C[2] ^ ( C[4] | ~C[3])
285 mov @C[4],$A[3][2](%rsi) # R[3][2] = C[2] ^ ( C[4] | ~C[3])
288 xor @C[0],@T[0] # C[0] ^ ( C[2] & C[1])
289 mov @T[0],$A[3][0](%rsi) # R[3][0] = C[0] ^ ( C[2] & C[1])
292 xor @T[1],@C[1] # C[4] ^ ( C[1] | C[0])
293 mov @C[1],$A[3][4](%rsi) # R[3][4] = C[4] ^ ( C[1] | C[0])
296 xor @C[3],@C[0] # ~C[3] ^ ( C[0] & C[4])
297 mov @C[0],$A[3][3](%rsi) # R[3][3] = ~C[3] ^ ( C[0] & C[4])
300 xor $A[0][2](%rdi),@D[2]
301 xor $A[1][3](%rdi),@D[3]
302 rol \$$rhotates[0][2],@D[2]
303 xor $A[4][1](%rdi),@D[1]
304 rol \$$rhotates[1][3],@D[3]
305 xor $A[2][4](%rdi),@D[4]
306 rol \$$rhotates[4][1],@D[1]
307 xor $A[3][0](%rdi),@D[0]
309 rol \$$rhotates[2][4],@D[4]
310 rol \$$rhotates[3][0],@D[0]
317 xor @C[4],@C[0] # C[4] ^ ( C[0] & C[1])
318 mov @C[0],$A[4][4](%rdi) # R[4][4] = C[4] ^ ( C[0] & C[1])
322 xor @T[0],@C[2] # C[0] ^ ( C[2] & ~C[1])
323 mov @C[2],$A[4][0](%rdi) # R[4][0] = C[0] ^ ( C[2] & ~C[1])
326 xor @C[3],@T[0] # C[3] ^ ( C[0] | C[4])
327 mov @T[0],$A[4][3](%rdi) # R[4][3] = C[3] ^ ( C[0] | C[4])
330 xor @T[1],@C[4] # C[2] ^ ( C[4] & C[3])
331 mov @C[4],$A[4][2](%rdi) # R[4][2] = C[2] ^ ( C[4] & C[3])
334 xor @C[1],@C[3] # ~C[1] ^ ( C[2] | C[3])
335 mov @C[3],$A[4][1](%rdi) # R[4][1] = ~C[1] ^ ( C[2] | C[3])
337 mov @C[0],@C[1] # harmonize with the loop top
343 lea -192($iotas),$iotas # rewind iotas
345 .size __KeccakF1600,.-__KeccakF1600
347 .type KeccakF1600,\@abi-omnipotent
364 lea 100(%rdi),%rdi # size optimization
366 .cfi_adjust_cfa_offset 200
375 lea iotas(%rip),$iotas
376 lea 100(%rsp),%rsi # size optimization
386 lea -100(%rdi),%rdi # preserve A[][]
389 .cfi_adjust_cfa_offset -200
405 .size KeccakF1600,.-KeccakF1600
408 { my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
409 ($A_flat,$inp) = ("%r8","%r9");
412 .type SHA3_absorb,\@function,4
429 lea 100(%rdi),%rdi # size optimization
431 .cfi_adjust_cfa_offset 232
434 lea 100(%rsp),%rsi # size optimization
442 lea iotas(%rip),$iotas
444 mov $bsz,216-100(%rsi) # save bsz
451 lea -100(%rdi),$A_flat
457 lea 8($A_flat),$A_flat
463 mov $inp,200-100(%rsi) # save inp
464 mov $len,208-100(%rsi) # save len
466 mov 200-100(%rsi),$inp # pull inp
467 mov 208-100(%rsi),$len # pull len
468 mov 216-100(%rsi),$bsz # pull bsz
473 mov $len,%rax # return value
483 .cfi_adjust_cfa_offset -232
499 .size SHA3_absorb,.-SHA3_absorb
502 { my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
503 ($out,$len,$bsz) = ("%r12","%r13","%r14");
507 .type SHA3_squeeze,\@function,4
534 sub \$8,$len # len -= 8
549 .byte 0xf3,0xa4 # rep movsb
560 .size SHA3_squeeze,.-SHA3_squeeze
565 .quad 0,0,0,0,0,0,0,0
568 .quad 0x0000000000000001
569 .quad 0x0000000000008082
570 .quad 0x800000000000808a
571 .quad 0x8000000080008000
572 .quad 0x000000000000808b
573 .quad 0x0000000080000001
574 .quad 0x8000000080008081
575 .quad 0x8000000000008009
576 .quad 0x000000000000008a
577 .quad 0x0000000000000088
578 .quad 0x0000000080008009
579 .quad 0x000000008000000a
580 .quad 0x000000008000808b
581 .quad 0x800000000000008b
582 .quad 0x8000000000008089
583 .quad 0x8000000000008003
584 .quad 0x8000000000008002
585 .quad 0x8000000000000080
586 .quad 0x000000000000800a
587 .quad 0x800000008000000a
588 .quad 0x8000000080008081
589 .quad 0x8000000000008080
590 .quad 0x0000000080000001
591 .quad 0x8000000080008008
593 .asciz "Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
596 foreach (split("\n",$code)) {
597 # Below replacement results in 11.2 on Sandy Bridge, 9.4 on
598 # Haswell, but it hurts other processors by up to 2-3-4x...
599 #s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
600 # Below replacement results in 9.3 on Haswell [as well as
601 # on Ryzen, i.e. it *hurts* Ryzen]...
602 #s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;