2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for ARMv8.
20 # This is a straightforward KECCAK_1X_ALT implementation. It makes no
21 # sense to attempt a SIMD/NEON implementation for the following reason.
22 # 64-bit lanes of vector registers can't be addressed as easily as in
23 # 32-bit mode. This means that 64-bit NEON is bound to be slower than
24 # 32-bit NEON, and this implementation is faster than 32-bit NEON on
25 # same processor. Even though it takes more scalar xor's and andn's,
26 # it gets compensated by availability of rotate. Not to forget that
27 # most processors achieve higher issue rate with scalar instructions.
29 ######################################################################
30 # Numbers are cycles per processed byte.
41 # (*) Corresponds to SHA3-256. No improvement coefficients are listed
42 # because they vary too much from compiler to compiler. A newer
43 # compiler does much better, and the improvement varies from 5% on
44 # Cortex-A57 to 25% on Cortex-A53, while in comparison to an older
45 # compiler this code is at least 2x faster...
# Locate the arm-xlate.pl perlasm translator: first in the directory this
# script was invoked from, then under ../../perlasm/ relative to it.
50 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
51 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
52 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
53 die "can't locate arm-xlate.pl";
# Open a pipe to the translator, run with the same perl binary ($^X);
# $flavour and $output are passed through as its arguments. Fail loudly if
# the pipe cannot be set up instead of carrying on with a dead handle.
55 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
58 my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
60 $A[3][3] = "x25"; # x18 is reserved
# Scratch register names used for temporaries. Only four are listed here,
# although the round code also refers to $C[4] and $C[5]; those entries are
# presumably assigned elsewhere in the file -- TODO confirm.
62 my @C = map("x$_", (26,27,28,30));
# Per-lane rotation counts, indexed [row][column] like @A. The Rho+Pi step
# emits "ror dst,src,#64-$rhotates[i][j]", which on a 64-bit register is a
# left-rotate by the tabulated count.
64 my @rhotates = ([ 0, 1, 62, 28, 27 ],
65 [ 36, 44, 6, 55, 20 ],
66 [ 3, 10, 43, 25, 39 ],
67 [ 41, 45, 15, 21, 8 ],
68 [ 18, 2, 61, 56, 14 ]);
73 .align 8 // strategic alignment and padding that allows to use
74 // address value as loop termination condition...
78 .quad 0x0000000000000001
79 .quad 0x0000000000008082
80 .quad 0x800000000000808a
81 .quad 0x8000000080008000
82 .quad 0x000000000000808b
83 .quad 0x0000000080000001
84 .quad 0x8000000080008081
85 .quad 0x8000000000008009
86 .quad 0x000000000000008a
87 .quad 0x0000000000000088
88 .quad 0x0000000080008009
89 .quad 0x000000008000000a
90 .quad 0x000000008000808b
91 .quad 0x800000000000008b
92 .quad 0x8000000000008089
93 .quad 0x8000000000008003
94 .quad 0x8000000000008002
95 .quad 0x8000000000000080
96 .quad 0x000000000000800a
97 .quad 0x800000008000000a
98 .quad 0x8000000080008081
99 .quad 0x8000000000008080
100 .quad 0x0000000080000001
101 .quad 0x8000000080008008
104 .type KeccakF1600_int,%function
108 stp $C[2],x30,[sp,#16] // 32 bytes on top are mine
112 ////////////////////////////////////////// Theta
113 eor $C[0],$A[0][0],$A[1][0]
114 stp $A[0][4],$A[1][4],[sp,#0] // offload pair...
115 eor $C[1],$A[0][1],$A[1][1]
116 eor $C[2],$A[0][2],$A[1][2]
117 eor $C[3],$A[0][3],$A[1][3]
122 eor $C[4],$A[0][4],$A[1][4]
123 eor $C[0],$C[0],$A[2][0]
124 eor $C[1],$C[1],$A[2][1]
125 eor $C[2],$C[2],$A[2][2]
126 eor $C[3],$C[3],$A[2][3]
127 eor $C[4],$C[4],$A[2][4]
128 eor $C[0],$C[0],$A[3][0]
129 eor $C[1],$C[1],$A[3][1]
130 eor $C[2],$C[2],$A[3][2]
131 eor $C[3],$C[3],$A[3][3]
132 eor $C[4],$C[4],$A[3][4]
133 eor $C[0],$C[0],$A[4][0]
134 eor $C[2],$C[2],$A[4][2]
135 eor $C[1],$C[1],$A[4][1]
136 eor $C[3],$C[3],$A[4][3]
137 eor $C[4],$C[4],$A[4][4]
139 eor $C[5],$C[0],$C[2],ror#63
141 eor $A[0][1],$A[0][1],$C[5]
142 eor $A[1][1],$A[1][1],$C[5]
143 eor $A[2][1],$A[2][1],$C[5]
144 eor $A[3][1],$A[3][1],$C[5]
145 eor $A[4][1],$A[4][1],$C[5]
147 eor $C[5],$C[1],$C[3],ror#63
148 eor $C[2],$C[2],$C[4],ror#63
149 eor $C[3],$C[3],$C[0],ror#63
150 eor $C[4],$C[4],$C[1],ror#63
152 eor $C[1], $A[0][2],$C[5] // mov $C[1],$A[0][2]
153 eor $A[1][2],$A[1][2],$C[5]
154 eor $A[2][2],$A[2][2],$C[5]
155 eor $A[3][2],$A[3][2],$C[5]
156 eor $A[4][2],$A[4][2],$C[5]
158 eor $A[0][0],$A[0][0],$C[4]
159 eor $A[1][0],$A[1][0],$C[4]
160 eor $A[2][0],$A[2][0],$C[4]
161 eor $A[3][0],$A[3][0],$C[4]
162 eor $A[4][0],$A[4][0],$C[4]
167 ldp $A[0][4],$A[1][4],[sp,#0] // re-load offloaded data
168 eor $C[0], $A[0][3],$C[2] // mov $C[0],$A[0][3]
169 eor $A[1][3],$A[1][3],$C[2]
170 eor $A[2][3],$A[2][3],$C[2]
171 eor $A[3][3],$A[3][3],$C[2]
172 eor $A[4][3],$A[4][3],$C[2]
174 eor $C[2], $A[0][4],$C[3] // mov $C[2],$A[0][4]
175 eor $A[1][4],$A[1][4],$C[3]
176 eor $A[2][4],$A[2][4],$C[3]
177 eor $A[3][4],$A[3][4],$C[3]
178 eor $A[4][4],$A[4][4],$C[3]
180 ////////////////////////////////////////// Rho+Pi
182 ror $A[0][1],$A[1][1],#64-$rhotates[1][1]
184 ror $A[0][2],$A[2][2],#64-$rhotates[2][2]
186 ror $A[0][3],$A[3][3],#64-$rhotates[3][3]
188 ror $A[0][4],$A[4][4],#64-$rhotates[4][4]
190 ror $A[1][1],$A[1][4],#64-$rhotates[1][4]
191 ror $A[2][2],$A[2][3],#64-$rhotates[2][3]
192 ror $A[3][3],$A[3][2],#64-$rhotates[3][2]
193 ror $A[4][4],$A[4][1],#64-$rhotates[4][1]
195 ror $A[1][4],$A[4][2],#64-$rhotates[4][2]
196 ror $A[2][3],$A[3][4],#64-$rhotates[3][4]
197 ror $A[3][2],$A[2][1],#64-$rhotates[2][1]
198 ror $A[4][1],$A[1][3],#64-$rhotates[1][3]
200 ror $A[4][2],$A[2][4],#64-$rhotates[2][4]
201 ror $A[3][4],$A[4][3],#64-$rhotates[4][3]
202 ror $A[2][1],$A[1][2],#64-$rhotates[1][2]
203 ror $A[1][3],$A[3][1],#64-$rhotates[3][1]
205 ror $A[2][4],$A[4][0],#64-$rhotates[4][0]
206 ror $A[4][3],$A[3][0],#64-$rhotates[3][0]
207 ror $A[1][2],$A[2][0],#64-$rhotates[2][0]
208 ror $A[3][1],$A[1][0],#64-$rhotates[1][0]
210 ror $A[1][0],$C[0],#64-$rhotates[0][3]
211 ror $A[2][0],$C[3],#64-$rhotates[0][1]
212 ror $A[3][0],$C[2],#64-$rhotates[0][4]
213 ror $A[4][0],$C[1],#64-$rhotates[0][2]
215 ////////////////////////////////////////// Chi+Iota
216 bic $C[0],$A[0][2],$A[0][1]
217 bic $C[1],$A[0][3],$A[0][2]
218 bic $C[2],$A[0][0],$A[0][4]
219 bic $C[3],$A[0][1],$A[0][0]
220 eor $A[0][0],$A[0][0],$C[0]
221 bic $C[0],$A[0][4],$A[0][3]
222 eor $A[0][1],$A[0][1],$C[1]
224 eor $A[0][3],$A[0][3],$C[2]
225 eor $A[0][4],$A[0][4],$C[3]
226 eor $A[0][2],$A[0][2],$C[0]
227 ldr $C[3],[$C[1]],#8 // Iota[i++]
229 bic $C[0],$A[1][2],$A[1][1]
230 tst $C[1],#255 // are we done?
232 bic $C[1],$A[1][3],$A[1][2]
233 bic $C[2],$A[1][0],$A[1][4]
234 eor $A[0][0],$A[0][0],$C[3] // A[0][0] ^= Iota
235 bic $C[3],$A[1][1],$A[1][0]
236 eor $A[1][0],$A[1][0],$C[0]
237 bic $C[0],$A[1][4],$A[1][3]
238 eor $A[1][1],$A[1][1],$C[1]
239 eor $A[1][3],$A[1][3],$C[2]
240 eor $A[1][4],$A[1][4],$C[3]
241 eor $A[1][2],$A[1][2],$C[0]
243 bic $C[0],$A[2][2],$A[2][1]
244 bic $C[1],$A[2][3],$A[2][2]
245 bic $C[2],$A[2][0],$A[2][4]
246 bic $C[3],$A[2][1],$A[2][0]
247 eor $A[2][0],$A[2][0],$C[0]
248 bic $C[0],$A[2][4],$A[2][3]
249 eor $A[2][1],$A[2][1],$C[1]
250 eor $A[2][3],$A[2][3],$C[2]
251 eor $A[2][4],$A[2][4],$C[3]
252 eor $A[2][2],$A[2][2],$C[0]
254 bic $C[0],$A[3][2],$A[3][1]
255 bic $C[1],$A[3][3],$A[3][2]
256 bic $C[2],$A[3][0],$A[3][4]
257 bic $C[3],$A[3][1],$A[3][0]
258 eor $A[3][0],$A[3][0],$C[0]
259 bic $C[0],$A[3][4],$A[3][3]
260 eor $A[3][1],$A[3][1],$C[1]
261 eor $A[3][3],$A[3][3],$C[2]
262 eor $A[3][4],$A[3][4],$C[3]
263 eor $A[3][2],$A[3][2],$C[0]
265 bic $C[0],$A[4][2],$A[4][1]
266 bic $C[1],$A[4][3],$A[4][2]
267 bic $C[2],$A[4][0],$A[4][4]
268 bic $C[3],$A[4][1],$A[4][0]
269 eor $A[4][0],$A[4][0],$C[0]
270 bic $C[0],$A[4][4],$A[4][3]
271 eor $A[4][1],$A[4][1],$C[1]
272 eor $A[4][3],$A[4][3],$C[2]
273 eor $A[4][4],$A[4][4],$C[3]
274 eor $A[4][2],$A[4][2],$C[0]
280 .size KeccakF1600_int,.-KeccakF1600_int
282 .type KeccakF1600,%function
285 stp x29,x30,[sp,#-128]!
294 str x0,[sp,#32] // offload argument
296 ldp $A[0][0],$A[0][1],[x0,#16*0]
297 ldp $A[0][2],$A[0][3],[$C[0],#16*1]
298 ldp $A[0][4],$A[1][0],[$C[0],#16*2]
299 ldp $A[1][1],$A[1][2],[$C[0],#16*3]
300 ldp $A[1][3],$A[1][4],[$C[0],#16*4]
301 ldp $A[2][0],$A[2][1],[$C[0],#16*5]
302 ldp $A[2][2],$A[2][3],[$C[0],#16*6]
303 ldp $A[2][4],$A[3][0],[$C[0],#16*7]
304 ldp $A[3][1],$A[3][2],[$C[0],#16*8]
305 ldp $A[3][3],$A[3][4],[$C[0],#16*9]
306 ldp $A[4][0],$A[4][1],[$C[0],#16*10]
307 ldp $A[4][2],$A[4][3],[$C[0],#16*11]
308 ldr $A[4][4],[$C[0],#16*12]
313 stp $A[0][0],$A[0][1],[$C[0],#16*0]
314 stp $A[0][2],$A[0][3],[$C[0],#16*1]
315 stp $A[0][4],$A[1][0],[$C[0],#16*2]
316 stp $A[1][1],$A[1][2],[$C[0],#16*3]
317 stp $A[1][3],$A[1][4],[$C[0],#16*4]
318 stp $A[2][0],$A[2][1],[$C[0],#16*5]
319 stp $A[2][2],$A[2][3],[$C[0],#16*6]
320 stp $A[2][4],$A[3][0],[$C[0],#16*7]
321 stp $A[3][1],$A[3][2],[$C[0],#16*8]
322 stp $A[3][3],$A[3][4],[$C[0],#16*9]
323 stp $A[4][0],$A[4][1],[$C[0],#16*10]
324 stp $A[4][2],$A[4][3],[$C[0],#16*11]
325 str $A[4][4],[$C[0],#16*12]
327 ldp x19,x20,[x29,#16]
329 ldp x21,x22,[x29,#32]
330 ldp x23,x24,[x29,#48]
331 ldp x25,x26,[x29,#64]
332 ldp x27,x28,[x29,#80]
333 ldp x29,x30,[sp],#128
335 .size KeccakF1600,.-KeccakF1600
338 .type SHA3_absorb,%function
341 stp x29,x30,[sp,#-128]!
350 stp x0,x1,[sp,#32] // offload arguments
353 mov $C[0],x0 // uint64_t A[5][5]
354 mov $C[1],x1 // const void *inp
355 mov $C[2],x2 // size_t len
356 mov $C[3],x3 // size_t bsz
357 ldp $A[0][0],$A[0][1],[$C[0],#16*0]
358 ldp $A[0][2],$A[0][3],[$C[0],#16*1]
359 ldp $A[0][4],$A[1][0],[$C[0],#16*2]
360 ldp $A[1][1],$A[1][2],[$C[0],#16*3]
361 ldp $A[1][3],$A[1][4],[$C[0],#16*4]
362 ldp $A[2][0],$A[2][1],[$C[0],#16*5]
363 ldp $A[2][2],$A[2][3],[$C[0],#16*6]
364 ldp $A[2][4],$A[3][0],[$C[0],#16*7]
365 ldp $A[3][1],$A[3][2],[$C[0],#16*8]
366 ldp $A[3][3],$A[3][4],[$C[0],#16*9]
367 ldp $A[4][0],$A[4][1],[$C[0],#16*10]
368 ldp $A[4][2],$A[4][3],[$C[0],#16*11]
369 ldr $A[4][4],[$C[0],#16*12]
374 subs $C[0],$C[2],$C[3] // len - bsz
377 str $C[0],[sp,#48] // save len - bsz
378 ldr $C[0],[$C[1]],#8 // *inp++
382 eor $A[0][0],$A[0][0],$C[0]
385 ldr $C[0],[$C[1]],#8 // *inp++
389 eor $A[0][1],$A[0][1],$C[0]
391 ldr $C[0],[$C[1]],#8 // *inp++
395 eor $A[0][2],$A[0][2],$C[0]
398 ldr $C[0],[$C[1]],#8 // *inp++
402 eor $A[0][3],$A[0][3],$C[0]
404 ldr $C[0],[$C[1]],#8 // *inp++
408 eor $A[0][4],$A[0][4],$C[0]
411 ldr $C[0],[$C[1]],#8 // *inp++
415 eor $A[1][0],$A[1][0],$C[0]
417 ldr $C[0],[$C[1]],#8 // *inp++
421 eor $A[1][1],$A[1][1],$C[0]
424 ldr $C[0],[$C[1]],#8 // *inp++
428 eor $A[1][2],$A[1][2],$C[0]
430 ldr $C[0],[$C[1]],#8 // *inp++
434 eor $A[1][3],$A[1][3],$C[0]
437 ldr $C[0],[$C[1]],#8 // *inp++
441 eor $A[1][4],$A[1][4],$C[0]
443 ldr $C[0],[$C[1]],#8 // *inp++
447 eor $A[2][0],$A[2][0],$C[0]
450 ldr $C[0],[$C[1]],#8 // *inp++
454 eor $A[2][1],$A[2][1],$C[0]
456 ldr $C[0],[$C[1]],#8 // *inp++
460 eor $A[2][2],$A[2][2],$C[0]
463 ldr $C[0],[$C[1]],#8 // *inp++
467 eor $A[2][3],$A[2][3],$C[0]
469 ldr $C[0],[$C[1]],#8 // *inp++
473 eor $A[2][4],$A[2][4],$C[0]
476 ldr $C[0],[$C[1]],#8 // *inp++
480 eor $A[3][0],$A[3][0],$C[0]
482 ldr $C[0],[$C[1]],#8 // *inp++
486 eor $A[3][1],$A[3][1],$C[0]
489 ldr $C[0],[$C[1]],#8 // *inp++
493 eor $A[3][2],$A[3][2],$C[0]
495 ldr $C[0],[$C[1]],#8 // *inp++
499 eor $A[3][3],$A[3][3],$C[0]
502 ldr $C[0],[$C[1]],#8 // *inp++
506 eor $A[3][4],$A[3][4],$C[0]
508 ldr $C[0],[$C[1]],#8 // *inp++
512 eor $A[4][0],$A[4][0],$C[0]
515 ldr $C[0],[$C[1]],#8 // *inp++
519 eor $A[4][1],$A[4][1],$C[0]
521 ldr $C[0],[$C[1]],#8 // *inp++
525 eor $A[4][2],$A[4][2],$C[0]
528 ldr $C[0],[$C[1]],#8 // *inp++
532 eor $A[4][3],$A[4][3],$C[0]
534 ldr $C[0],[$C[1]],#8 // *inp++
538 eor $A[4][4],$A[4][4],$C[0]
541 str $C[1],[sp,#40] // save inp
545 ldr $C[1],[sp,#40] // restore arguments
546 ldp $C[2],$C[3],[sp,#48]
552 stp $A[0][0],$A[0][1],[$C[1],#16*0]
553 stp $A[0][2],$A[0][3],[$C[1],#16*1]
554 stp $A[0][4],$A[1][0],[$C[1],#16*2]
555 stp $A[1][1],$A[1][2],[$C[1],#16*3]
556 stp $A[1][3],$A[1][4],[$C[1],#16*4]
557 stp $A[2][0],$A[2][1],[$C[1],#16*5]
558 stp $A[2][2],$A[2][3],[$C[1],#16*6]
559 stp $A[2][4],$A[3][0],[$C[1],#16*7]
560 stp $A[3][1],$A[3][2],[$C[1],#16*8]
561 stp $A[3][3],$A[3][4],[$C[1],#16*9]
562 stp $A[4][0],$A[4][1],[$C[1],#16*10]
563 stp $A[4][2],$A[4][3],[$C[1],#16*11]
564 str $A[4][4],[$C[1],#16*12]
// Return the count of input bytes left unabsorbed. The remaining length
// is tracked in $C[2]; $C[0] is scratch (len - bsz, then input words) and
// must not be returned.
566 mov x0,$C[2] // return value
567 ldp x19,x20,[x29,#16]
569 ldp x21,x22,[x29,#32]
570 ldp x23,x24,[x29,#48]
571 ldp x25,x26,[x29,#64]
572 ldp x27,x28,[x29,#80]
573 ldp x29,x30,[sp],#128
575 .size SHA3_absorb,.-SHA3_absorb
578 my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
581 .type SHA3_squeeze,%function
584 stp x29,x30,[sp,#-48]!
589 mov $A_flat,x0 // put aside arguments
647 .size SHA3_squeeze,.-SHA3_squeeze
648 .asciz "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"