2 # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for x86_64.
20 # Below code is [lane complementing] KECCAK_2X implementation (see
21 # sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
22 # instead of actually unrolling the loop pair-wise I simply flip
23 # pointers to T[][] and A[][] at the end of round. Since number of
24 # rounds is even, last round writes to A[][] and everything works out.
25 # How does it compare to x86_64 assembly module in Keccak Code Package?
26 # Depending on processor it's either as fast or faster by up to 15%...
28 ########################################################################
29 # Numbers are cycles per processed byte out of large message.
36 # Sandy Bridge 12.9(**)
46 # (*) Corresponds to SHA3-256. Improvement over compiler-generated code
47 # varies a lot, most common coefficient is 15% in comparison to
48 # gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
49 # (**) Sandy Bridge has broken rotate instruction. Performance can be
50 # improved by 14% by replacing rotates with double-precision
51 # shift with same register as source and destination.
53 # $output is the last argument if it looks like a file (it has an extension)
54 # $flavour is the first argument if it doesn't look like a file
# At file scope, bare `pop` and `shift` operate on @ARGV, so each statement
# below both tests and consumes the matching command-line argument.
55 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
56 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# $win64 is set for nasm/masm/mingw64 flavours, or when the output file
# name ends in ".asm".
58 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Take the directory part of this script's own path ($0) and look for
# x86_64-xlate.pl first beside the script, then under ../../perlasm/
# relative to it; give up if neither file exists.
60 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
61 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
62 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
63 die "can't locate x86_64-xlate.pl";
# Run the translator under the same Perl interpreter ($^X) and open a pipe
# to its standard input; $flavour and $output are passed on as its arguments.
65 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
66 or die "can't call $xlate: $!";
# $A[i][j] is the byte displacement of 64-bit lane (i,j) in the 5x5 state:
# 8*(5*i+j)-100. The -100 bias pairs with the "lea 100(...)" pointer
# adjustments marked "size optimization" in the assembly; with it every
# displacement lies in -100..92.
69 my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
70 8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
# Register bank: @C and @D hold the five C[] and five D[] values
# (D[] in %r8..%r12); @T names two scratch registers, %r13 and %r14.
72 my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
73 my @D = map("%r$_",(8..12));
74 my @T = map("%r$_",(13..14));
# Rotate counts, indexed [i][j] like @A; they are interpolated as the
# immediate operand of the `rol` instructions in the round code.
77 my @rhotates = ([ 0, 1, 62, 28, 27 ],
78 [ 36, 44, 6, 55, 20 ],
79 [ 3, 10, 43, 25, 39 ],
80 [ 41, 45, 15, 21, 8 ],
81 [ 18, 2, 61, 56, 14 ]);
86 .type __KeccakF1600,\@abi-omnipotent
90 mov $A[4][0](%rdi),@C[0]
91 mov $A[4][1](%rdi),@C[1]
92 mov $A[4][2](%rdi),@C[2]
93 mov $A[4][3](%rdi),@C[3]
94 mov $A[4][4](%rdi),@C[4]
99 mov $A[0][0](%rdi),@D[0]
100 mov $A[1][1](%rdi),@D[1]
101 mov $A[2][2](%rdi),@D[2]
102 mov $A[3][3](%rdi),@D[3]
104 xor $A[0][2](%rdi),@C[2]
105 xor $A[0][3](%rdi),@C[3]
107 xor $A[0][1](%rdi),@C[1]
108 xor $A[1][2](%rdi),@C[2]
109 xor $A[1][0](%rdi),@C[0]
111 xor $A[0][4](%rdi),@C[4]
114 xor $A[2][0](%rdi),@C[0]
115 xor $A[1][3](%rdi),@C[3]
117 xor $A[1][4](%rdi),@C[4]
119 xor $A[3][2](%rdi),@C[2]
120 xor $A[3][0](%rdi),@C[0]
121 xor $A[2][3](%rdi),@C[3]
122 xor $A[2][1](%rdi),@C[1]
123 xor $A[2][4](%rdi),@C[4]
127 xor @C[0],@C[2] # D[1] = ROL64(C[2], 1) ^ C[0]
131 xor @C[3],@C[0] # D[4] = ROL64(C[0], 1) ^ C[3]
132 xor $A[3][1](%rdi),@C[1]
135 xor @C[1],@C[3] # D[2] = ROL64(C[3], 1) ^ C[1]
136 xor $A[3][4](%rdi),@C[4]
139 xor @C[4],@C[1] # D[0] = ROL64(C[1], 1) ^ C[4]
142 xor @T[0],@C[4] # D[3] = ROL64(C[4], 1) ^ C[2]
144 (@D[0..4], @C) = (@C[1..4,0], @D);
148 rol \$$rhotates[1][1],@C[1]
151 rol \$$rhotates[2][2],@C[2]
154 rol \$$rhotates[3][3],@C[3]
156 xor @C[0],@C[1] # C[0] ^ ( C[1] | C[2])
157 rol \$$rhotates[4][4],@C[4]
164 mov @C[1],$A[0][0](%rsi) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
165 xor @C[2],@C[4] # C[2] ^ ( C[4] & C[3])
167 mov @C[4],$A[0][2](%rsi) # R[0][2] = C[2] ^ ( C[4] & C[3])
170 mov $A[4][2](%rdi),@C[4]
171 xor @T[0],@C[2] # C[1] ^ (~C[2] | C[3])
172 mov @C[2],$A[0][1](%rsi) # R[0][1] = C[1] ^ (~C[2] | C[3])
175 mov $A[1][4](%rdi),@C[1]
176 xor @T[1],@T[0] # C[4] ^ ( C[1] & C[0])
177 mov $A[2][0](%rdi),@C[2]
178 mov @T[0],$A[0][4](%rsi) # R[0][4] = C[4] ^ ( C[1] & C[0])
181 mov $A[0][3](%rdi),@C[0]
182 xor @C[3],@T[1] # C[3] ^ ( C[4] | C[0])
183 mov $A[3][1](%rdi),@C[3]
184 mov @T[1],$A[0][3](%rsi) # R[0][3] = C[3] ^ ( C[4] | C[0])
189 rol \$$rhotates[0][3],@C[0]
192 rol \$$rhotates[4][2],@C[4]
193 rol \$$rhotates[3][1],@C[3]
195 rol \$$rhotates[1][4],@C[1]
198 rol \$$rhotates[2][0],@C[2]
200 xor @C[3],@C[0] # C[3] ^ (C[0] | C[4])
201 mov @C[0],$A[1][3](%rsi) # R[1][3] = C[3] ^ (C[0] | C[4])
205 mov $A[0][1](%rdi),@C[0]
206 xor @C[4],@C[1] # C[4] ^ (C[1] & C[0])
208 mov @C[1],$A[1][4](%rsi) # R[1][4] = C[4] ^ (C[1] & C[0])
211 mov $A[1][2](%rdi),@C[1]
212 xor @C[2],@C[4] # C[2] ^ (~C[4] | C[3])
213 mov @C[4],$A[1][2](%rsi) # R[1][2] = C[2] ^ (~C[4] | C[3])
216 mov $A[4][0](%rdi),@C[4]
217 xor @T[1],@C[3] # C[1] ^ (C[3] & C[2])
218 mov @C[3],$A[1][1](%rsi) # R[1][1] = C[1] ^ (C[3] & C[2])
221 mov $A[2][3](%rdi),@C[2]
222 xor @T[0],@T[1] # C[0] ^ (C[1] | C[2])
223 mov $A[3][4](%rdi),@C[3]
224 mov @T[1],$A[1][0](%rsi) # R[1][0] = C[0] ^ (C[1] | C[2])
229 rol \$$rhotates[2][3],@C[2]
231 rol \$$rhotates[3][4],@C[3]
233 rol \$$rhotates[1][2],@C[1]
235 rol \$$rhotates[4][0],@C[4]
238 rol \$$rhotates[0][1],@C[0]
241 xor @C[1],@C[2] # C[1] ^ ( C[2] & C[3])
242 mov @C[2],$A[2][1](%rsi) # R[2][1] = C[1] ^ ( C[2] & C[3])
246 mov $A[2][1](%rdi),@C[2]
247 xor @T[0],@C[4] # C[2] ^ ( C[4] & ~C[3])
248 mov @C[4],$A[2][2](%rsi) # R[2][2] = C[2] ^ ( C[4] & ~C[3])
251 mov $A[4][3](%rdi),@C[4]
252 xor @C[0],@T[0] # C[0] ^ ( C[2] | C[1])
253 mov @T[0],$A[2][0](%rsi) # R[2][0] = C[0] ^ ( C[2] | C[1])
256 xor @T[1],@C[1] # C[4] ^ ( C[1] & C[0])
257 mov @C[1],$A[2][4](%rsi) # R[2][4] = C[4] ^ ( C[1] & C[0])
260 mov $A[1][0](%rdi),@C[1]
261 xor @C[3],@T[1] # ~C[3] ^ ( C[0] | C[4])
262 mov $A[3][2](%rdi),@C[3]
263 mov @T[1],$A[2][3](%rsi) # R[2][3] = ~C[3] ^ ( C[0] | C[4])
266 mov $A[0][4](%rdi),@C[0]
270 rol \$$rhotates[2][1],@C[2]
272 rol \$$rhotates[3][2],@C[3]
274 rol \$$rhotates[1][0],@C[1]
276 rol \$$rhotates[4][3],@C[4]
279 rol \$$rhotates[0][4],@C[0]
282 xor @C[1],@C[2] # C[1] ^ ( C[2] | C[3])
283 mov @C[2],$A[3][1](%rsi) # R[3][1] = C[1] ^ ( C[2] | C[3])
287 xor @T[0],@C[4] # C[2] ^ ( C[4] | ~C[3])
288 mov @C[4],$A[3][2](%rsi) # R[3][2] = C[2] ^ ( C[4] | ~C[3])
291 xor @C[0],@T[0] # C[0] ^ ( C[2] & C[1])
292 mov @T[0],$A[3][0](%rsi) # R[3][0] = C[0] ^ ( C[2] & C[1])
295 xor @T[1],@C[1] # C[4] ^ ( C[1] | C[0])
296 mov @C[1],$A[3][4](%rsi) # R[3][4] = C[4] ^ ( C[1] | C[0])
299 xor @C[3],@C[0] # ~C[3] ^ ( C[0] & C[4])
300 mov @C[0],$A[3][3](%rsi) # R[3][3] = ~C[3] ^ ( C[0] & C[4])
303 xor $A[0][2](%rdi),@D[2]
304 xor $A[1][3](%rdi),@D[3]
305 rol \$$rhotates[0][2],@D[2]
306 xor $A[4][1](%rdi),@D[1]
307 rol \$$rhotates[1][3],@D[3]
308 xor $A[2][4](%rdi),@D[4]
309 rol \$$rhotates[4][1],@D[1]
310 xor $A[3][0](%rdi),@D[0]
312 rol \$$rhotates[2][4],@D[4]
313 rol \$$rhotates[3][0],@D[0]
320 xor @C[4],@C[0] # C[4] ^ ( C[0] & C[1])
321 mov @C[0],$A[4][4](%rdi) # R[4][4] = C[4] ^ ( C[0] & C[1])
325 xor @T[0],@C[2] # C[0] ^ ( C[2] & ~C[1])
326 mov @C[2],$A[4][0](%rdi) # R[4][0] = C[0] ^ ( C[2] & ~C[1])
329 xor @C[3],@T[0] # C[3] ^ ( C[0] | C[4])
330 mov @T[0],$A[4][3](%rdi) # R[4][3] = C[3] ^ ( C[0] | C[4])
333 xor @T[1],@C[4] # C[2] ^ ( C[4] & C[3])
334 mov @C[4],$A[4][2](%rdi) # R[4][2] = C[2] ^ ( C[4] & C[3])
337 xor @C[1],@C[3] # ~C[1] ^ ( C[2] | C[3])
338 mov @C[3],$A[4][1](%rdi) # R[4][1] = ~C[1] ^ ( C[2] | C[3])
340 mov @C[0],@C[1] # harmonize with the loop top
346 lea -192($iotas),$iotas # rewind iotas
349 .size __KeccakF1600,.-__KeccakF1600
351 .type KeccakF1600,\@abi-omnipotent
368 lea 100(%rdi),%rdi # size optimization
370 .cfi_adjust_cfa_offset 200
379 lea iotas(%rip),$iotas
380 lea 100(%rsp),%rsi # size optimization
390 lea -100(%rdi),%rdi # preserve A[][]
393 .cfi_adjust_cfa_offset -200
409 .size KeccakF1600,.-KeccakF1600
# Register names for SHA3_absorb's four parameters. $A_flat and $inp are
# immediately remapped to %r8/%r9: the assembly that follows repoints
# %rdi ("lea 100(%rdi),%rdi") and %rsi ("lea 100(%rsp),%rsi"), so those two
# registers cannot keep holding the original argument values.
412 { my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
413 ($A_flat,$inp) = ("%r8","%r9");
416 .type SHA3_absorb,\@function,4
433 lea 100(%rdi),%rdi # size optimization
435 .cfi_adjust_cfa_offset 232
438 lea 100(%rsp),%rsi # size optimization
446 lea iotas(%rip),$iotas
448 mov $bsz,216-100(%rsi) # save bsz
455 lea -100(%rdi),$A_flat
461 lea 8($A_flat),$A_flat
467 mov $inp,200-100(%rsi) # save inp
468 mov $len,208-100(%rsi) # save len
470 mov 200-100(%rsi),$inp # pull inp
471 mov 208-100(%rsi),$len # pull len
472 mov 216-100(%rsi),$bsz # pull bsz
477 mov $len,%rax # return value
487 .cfi_adjust_cfa_offset -232
503 .size SHA3_absorb,.-SHA3_absorb
# Register names for SHA3_squeeze's four parameters. $out, $len and $bsz are
# remapped to %r12..%r14, while $A_flat stays in %rdi.
# NOTE(review): presumably these registers are chosen so the values survive
# a call into the permutation -- confirm against the full function body.
506 { my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
507 ($out,$len,$bsz) = ("%r12","%r13","%r14");
511 .type SHA3_squeeze,\@function,4
538 sub \$8,$len # len -= 8
553 .byte 0xf3,0xa4 # rep movsb
564 .size SHA3_squeeze,.-SHA3_squeeze
569 .quad 0,0,0,0,0,0,0,0
572 .quad 0x0000000000000001
573 .quad 0x0000000000008082
574 .quad 0x800000000000808a
575 .quad 0x8000000080008000
576 .quad 0x000000000000808b
577 .quad 0x0000000080000001
578 .quad 0x8000000080008081
579 .quad 0x8000000000008009
580 .quad 0x000000000000008a
581 .quad 0x0000000000000088
582 .quad 0x0000000080008009
583 .quad 0x000000008000000a
584 .quad 0x000000008000808b
585 .quad 0x800000000000008b
586 .quad 0x8000000000008089
587 .quad 0x8000000000008003
588 .quad 0x8000000000008002
589 .quad 0x8000000000000080
590 .quad 0x000000000000800a
591 .quad 0x800000008000000a
592 .quad 0x8000000080008081
593 .quad 0x8000000000008080
594 .quad 0x0000000080000001
595 .quad 0x8000000080008008
597 .asciz "Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
# Walk the accumulated $code text one line at a time. Both substitutions
# below are commented out; they record experiments that rewrite `rol`
# as `shld` or `rorx`, together with the measured trade-offs.
600 foreach (split("\n",$code)) {
601 # Below replacement results in 11.2 on Sandy Bridge, 9.4 on
602 # Haswell, but it hurts other processors by up to 2-3-4x...
603 #s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
604 # Below replacement results in 9.3 on Haswell [as well as
605 # on Ryzen, i.e. it *hurts* Ryzen]...
606 #s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;
# The result of close is checked so that a failure while closing STDOUT
# aborts the script with an error message.
611 close STDOUT or die "error closing STDOUT";