2 # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for ARMv8.
20 # This is a straightforward KECCAK_1X_ALT implementation. It makes no
21 # sense to attempt a SIMD/NEON implementation for the following reason.
22 # 64-bit lanes of vector registers can't be addressed as easily as in
23 # 32-bit mode. This means that 64-bit NEON is bound to be slower than
24 # 32-bit NEON, and this implementation is faster than 32-bit NEON on
25 # the same processor. Even though it takes more scalar xor's and andn's,
26 # that is compensated by the availability of rotate. Not to forget that
27 # most processors achieve a higher issue rate with scalar instructions.
31 # Add hardware-assisted ARMv8.2 implementation. It's a KECCAK_1X_ALT
32 # variant with a register permutation/rotation twist that makes it
33 # possible to eliminate copies to temporary registers. If you look closely
34 # you'll notice that it uses only one lane of vector registers. The new
35 # instructions effectively facilitate parallel hashing, which we don't
36 # support [yet?]. But the lowest-level core procedure is prepared for it.
37 # The inner round is 67 [vector] instructions, so it's not actually
38 # obvious that it will provide a performance improvement [in serial
39 # hash] as long as the vector instruction issue rate is limited to 1 per
42 ######################################################################
43 # Numbers are cycles per processed byte.
56 # (*) Corresponds to SHA3-256. No improvement coefficients are listed
57 # because they vary too much from compiler to compiler. A newer
58 # compiler does much better, and the improvement varies from 5% on
59 # Cortex-A57 to 25% on Cortex-A53, while in comparison to an older
60 # compiler this code is at least 2x faster...
# Derive the directory this script lives in from $0, then look for the
# arm-xlate.pl translator either next to the script or under
# ../../perlasm/ relative to it; give up if neither file exists.
65 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
66 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
67 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
68 die "can't locate arm-xlate.pl";
# Open OUT as a pipe into the translator, run with the same perl binary
# ($^X). Check the result so that a failure to set up the pipe is reported
# immediately instead of leaving an unusable handle behind.
70 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
# Per-lane rotation amounts, indexed as $rhotates[i][j] in parallel with the
# state lane A[i][j]. The round code consumes them as
# "#64-$rhotates[i][j]" operands of ror/xar, i.e. a left-rotate by the
# listed amount written as a right-rotate of a 64-bit register.
73 my @rhotates = ([ 0, 1, 62, 28, 27 ],
74 [ 36, 44, 6, 55, 20 ],
75 [ 3, 10, 43, 25, 39 ],
76 [ 41, 45, 15, 21, 8 ],
77 [ 18, 2, 61, 56, 14 ]);
82 .align 8 // strategic alignment and padding that allows to use
83 // address value as loop termination condition...
87 .quad 0x0000000000000001
88 .quad 0x0000000000008082
89 .quad 0x800000000000808a
90 .quad 0x8000000080008000
91 .quad 0x000000000000808b
92 .quad 0x0000000080000001
93 .quad 0x8000000080008081
94 .quad 0x8000000000008009
95 .quad 0x000000000000008a
96 .quad 0x0000000000000088
97 .quad 0x0000000080008009
98 .quad 0x000000008000000a
99 .quad 0x000000008000808b
100 .quad 0x800000000000008b
101 .quad 0x8000000000008089
102 .quad 0x8000000000008003
103 .quad 0x8000000000008002
104 .quad 0x8000000000000080
105 .quad 0x000000000000800a
106 .quad 0x800000008000000a
107 .quad 0x8000000080008081
108 .quad 0x8000000000008080
109 .quad 0x0000000080000001
110 .quad 0x8000000080008008
# Register names for the 5x5 state in the scalar code: each row of @A lists
# five consecutive x-registers starting at that row's base number.
# NOTE(review): the list of base numbers is on a continuation line that is
# not visible here.
114 my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
116 $A[3][3] = "x25"; # x18 is reserved
# Temporaries x26, x27, x28 and x30; the Theta code accumulates the column
# XORs in them. NOTE(review): $C[4] and $C[5] are used further down but are
# not assigned on any visible line -- confirm where they are set.
118 my @C = map("x$_", (26,27,28,30));
121 .type KeccakF1600_int,%function
125 .inst 0xd503233f // paciasp
126 stp $C[2],x30,[sp,#16] // 32 bytes on top are mine
130 ////////////////////////////////////////// Theta
131 eor $C[0],$A[0][0],$A[1][0]
132 stp $A[0][4],$A[1][4],[sp,#0] // offload pair...
133 eor $C[1],$A[0][1],$A[1][1]
134 eor $C[2],$A[0][2],$A[1][2]
135 eor $C[3],$A[0][3],$A[1][3]
140 eor $C[4],$A[0][4],$A[1][4]
141 eor $C[0],$C[0],$A[2][0]
142 eor $C[1],$C[1],$A[2][1]
143 eor $C[2],$C[2],$A[2][2]
144 eor $C[3],$C[3],$A[2][3]
145 eor $C[4],$C[4],$A[2][4]
146 eor $C[0],$C[0],$A[3][0]
147 eor $C[1],$C[1],$A[3][1]
148 eor $C[2],$C[2],$A[3][2]
149 eor $C[3],$C[3],$A[3][3]
150 eor $C[4],$C[4],$A[3][4]
151 eor $C[0],$C[0],$A[4][0]
152 eor $C[2],$C[2],$A[4][2]
153 eor $C[1],$C[1],$A[4][1]
154 eor $C[3],$C[3],$A[4][3]
155 eor $C[4],$C[4],$A[4][4]
157 eor $C[5],$C[0],$C[2],ror#63
159 eor $A[0][1],$A[0][1],$C[5]
160 eor $A[1][1],$A[1][1],$C[5]
161 eor $A[2][1],$A[2][1],$C[5]
162 eor $A[3][1],$A[3][1],$C[5]
163 eor $A[4][1],$A[4][1],$C[5]
165 eor $C[5],$C[1],$C[3],ror#63
166 eor $C[2],$C[2],$C[4],ror#63
167 eor $C[3],$C[3],$C[0],ror#63
168 eor $C[4],$C[4],$C[1],ror#63
170 eor $C[1], $A[0][2],$C[5] // mov $C[1],$A[0][2]
171 eor $A[1][2],$A[1][2],$C[5]
172 eor $A[2][2],$A[2][2],$C[5]
173 eor $A[3][2],$A[3][2],$C[5]
174 eor $A[4][2],$A[4][2],$C[5]
176 eor $A[0][0],$A[0][0],$C[4]
177 eor $A[1][0],$A[1][0],$C[4]
178 eor $A[2][0],$A[2][0],$C[4]
179 eor $A[3][0],$A[3][0],$C[4]
180 eor $A[4][0],$A[4][0],$C[4]
185 ldp $A[0][4],$A[1][4],[sp,#0] // re-load offloaded data
186 eor $C[0], $A[0][3],$C[2] // mov $C[0],$A[0][3]
187 eor $A[1][3],$A[1][3],$C[2]
188 eor $A[2][3],$A[2][3],$C[2]
189 eor $A[3][3],$A[3][3],$C[2]
190 eor $A[4][3],$A[4][3],$C[2]
192 eor $C[2], $A[0][4],$C[3] // mov $C[2],$A[0][4]
193 eor $A[1][4],$A[1][4],$C[3]
194 eor $A[2][4],$A[2][4],$C[3]
195 eor $A[3][4],$A[3][4],$C[3]
196 eor $A[4][4],$A[4][4],$C[3]
198 ////////////////////////////////////////// Rho+Pi
200 ror $A[0][1],$A[1][1],#64-$rhotates[1][1]
202 ror $A[0][2],$A[2][2],#64-$rhotates[2][2]
204 ror $A[0][3],$A[3][3],#64-$rhotates[3][3]
206 ror $A[0][4],$A[4][4],#64-$rhotates[4][4]
208 ror $A[1][1],$A[1][4],#64-$rhotates[1][4]
209 ror $A[2][2],$A[2][3],#64-$rhotates[2][3]
210 ror $A[3][3],$A[3][2],#64-$rhotates[3][2]
211 ror $A[4][4],$A[4][1],#64-$rhotates[4][1]
213 ror $A[1][4],$A[4][2],#64-$rhotates[4][2]
214 ror $A[2][3],$A[3][4],#64-$rhotates[3][4]
215 ror $A[3][2],$A[2][1],#64-$rhotates[2][1]
216 ror $A[4][1],$A[1][3],#64-$rhotates[1][3]
218 ror $A[4][2],$A[2][4],#64-$rhotates[2][4]
219 ror $A[3][4],$A[4][3],#64-$rhotates[4][3]
220 ror $A[2][1],$A[1][2],#64-$rhotates[1][2]
221 ror $A[1][3],$A[3][1],#64-$rhotates[3][1]
223 ror $A[2][4],$A[4][0],#64-$rhotates[4][0]
224 ror $A[4][3],$A[3][0],#64-$rhotates[3][0]
225 ror $A[1][2],$A[2][0],#64-$rhotates[2][0]
226 ror $A[3][1],$A[1][0],#64-$rhotates[1][0]
228 ror $A[1][0],$C[0],#64-$rhotates[0][3]
229 ror $A[2][0],$C[3],#64-$rhotates[0][1]
230 ror $A[3][0],$C[2],#64-$rhotates[0][4]
231 ror $A[4][0],$C[1],#64-$rhotates[0][2]
233 ////////////////////////////////////////// Chi+Iota
234 bic $C[0],$A[0][2],$A[0][1]
235 bic $C[1],$A[0][3],$A[0][2]
236 bic $C[2],$A[0][0],$A[0][4]
237 bic $C[3],$A[0][1],$A[0][0]
238 eor $A[0][0],$A[0][0],$C[0]
239 bic $C[0],$A[0][4],$A[0][3]
240 eor $A[0][1],$A[0][1],$C[1]
242 eor $A[0][3],$A[0][3],$C[2]
243 eor $A[0][4],$A[0][4],$C[3]
244 eor $A[0][2],$A[0][2],$C[0]
245 ldr $C[3],[$C[1]],#8 // Iota[i++]
247 bic $C[0],$A[1][2],$A[1][1]
248 tst $C[1],#255 // are we done?
250 bic $C[1],$A[1][3],$A[1][2]
251 bic $C[2],$A[1][0],$A[1][4]
252 eor $A[0][0],$A[0][0],$C[3] // A[0][0] ^= Iota
253 bic $C[3],$A[1][1],$A[1][0]
254 eor $A[1][0],$A[1][0],$C[0]
255 bic $C[0],$A[1][4],$A[1][3]
256 eor $A[1][1],$A[1][1],$C[1]
257 eor $A[1][3],$A[1][3],$C[2]
258 eor $A[1][4],$A[1][4],$C[3]
259 eor $A[1][2],$A[1][2],$C[0]
261 bic $C[0],$A[2][2],$A[2][1]
262 bic $C[1],$A[2][3],$A[2][2]
263 bic $C[2],$A[2][0],$A[2][4]
264 bic $C[3],$A[2][1],$A[2][0]
265 eor $A[2][0],$A[2][0],$C[0]
266 bic $C[0],$A[2][4],$A[2][3]
267 eor $A[2][1],$A[2][1],$C[1]
268 eor $A[2][3],$A[2][3],$C[2]
269 eor $A[2][4],$A[2][4],$C[3]
270 eor $A[2][2],$A[2][2],$C[0]
272 bic $C[0],$A[3][2],$A[3][1]
273 bic $C[1],$A[3][3],$A[3][2]
274 bic $C[2],$A[3][0],$A[3][4]
275 bic $C[3],$A[3][1],$A[3][0]
276 eor $A[3][0],$A[3][0],$C[0]
277 bic $C[0],$A[3][4],$A[3][3]
278 eor $A[3][1],$A[3][1],$C[1]
279 eor $A[3][3],$A[3][3],$C[2]
280 eor $A[3][4],$A[3][4],$C[3]
281 eor $A[3][2],$A[3][2],$C[0]
283 bic $C[0],$A[4][2],$A[4][1]
284 bic $C[1],$A[4][3],$A[4][2]
285 bic $C[2],$A[4][0],$A[4][4]
286 bic $C[3],$A[4][1],$A[4][0]
287 eor $A[4][0],$A[4][0],$C[0]
288 bic $C[0],$A[4][4],$A[4][3]
289 eor $A[4][1],$A[4][1],$C[1]
290 eor $A[4][3],$A[4][3],$C[2]
291 eor $A[4][4],$A[4][4],$C[3]
292 eor $A[4][2],$A[4][2],$C[0]
297 .inst 0xd50323bf // autiasp
299 .size KeccakF1600_int,.-KeccakF1600_int
301 .type KeccakF1600,%function
304 .inst 0xd503233f // paciasp
305 stp x29,x30,[sp,#-128]!
314 str x0,[sp,#32] // offload argument
316 ldp $A[0][0],$A[0][1],[x0,#16*0]
317 ldp $A[0][2],$A[0][3],[$C[0],#16*1]
318 ldp $A[0][4],$A[1][0],[$C[0],#16*2]
319 ldp $A[1][1],$A[1][2],[$C[0],#16*3]
320 ldp $A[1][3],$A[1][4],[$C[0],#16*4]
321 ldp $A[2][0],$A[2][1],[$C[0],#16*5]
322 ldp $A[2][2],$A[2][3],[$C[0],#16*6]
323 ldp $A[2][4],$A[3][0],[$C[0],#16*7]
324 ldp $A[3][1],$A[3][2],[$C[0],#16*8]
325 ldp $A[3][3],$A[3][4],[$C[0],#16*9]
326 ldp $A[4][0],$A[4][1],[$C[0],#16*10]
327 ldp $A[4][2],$A[4][3],[$C[0],#16*11]
328 ldr $A[4][4],[$C[0],#16*12]
333 stp $A[0][0],$A[0][1],[$C[0],#16*0]
334 stp $A[0][2],$A[0][3],[$C[0],#16*1]
335 stp $A[0][4],$A[1][0],[$C[0],#16*2]
336 stp $A[1][1],$A[1][2],[$C[0],#16*3]
337 stp $A[1][3],$A[1][4],[$C[0],#16*4]
338 stp $A[2][0],$A[2][1],[$C[0],#16*5]
339 stp $A[2][2],$A[2][3],[$C[0],#16*6]
340 stp $A[2][4],$A[3][0],[$C[0],#16*7]
341 stp $A[3][1],$A[3][2],[$C[0],#16*8]
342 stp $A[3][3],$A[3][4],[$C[0],#16*9]
343 stp $A[4][0],$A[4][1],[$C[0],#16*10]
344 stp $A[4][2],$A[4][3],[$C[0],#16*11]
345 str $A[4][4],[$C[0],#16*12]
347 ldp x19,x20,[x29,#16]
349 ldp x21,x22,[x29,#32]
350 ldp x23,x24,[x29,#48]
351 ldp x25,x26,[x29,#64]
352 ldp x27,x28,[x29,#80]
353 ldp x29,x30,[sp],#128
354 .inst 0xd50323bf // autiasp
356 .size KeccakF1600,.-KeccakF1600
359 .type SHA3_absorb,%function
362 .inst 0xd503233f // paciasp
363 stp x29,x30,[sp,#-128]!
372 stp x0,x1,[sp,#32] // offload arguments
375 mov $C[0],x0 // uint64_t A[5][5]
376 mov $C[1],x1 // const void *inp
377 mov $C[2],x2 // size_t len
378 mov $C[3],x3 // size_t bsz
379 ldp $A[0][0],$A[0][1],[$C[0],#16*0]
380 ldp $A[0][2],$A[0][3],[$C[0],#16*1]
381 ldp $A[0][4],$A[1][0],[$C[0],#16*2]
382 ldp $A[1][1],$A[1][2],[$C[0],#16*3]
383 ldp $A[1][3],$A[1][4],[$C[0],#16*4]
384 ldp $A[2][0],$A[2][1],[$C[0],#16*5]
385 ldp $A[2][2],$A[2][3],[$C[0],#16*6]
386 ldp $A[2][4],$A[3][0],[$C[0],#16*7]
387 ldp $A[3][1],$A[3][2],[$C[0],#16*8]
388 ldp $A[3][3],$A[3][4],[$C[0],#16*9]
389 ldp $A[4][0],$A[4][1],[$C[0],#16*10]
390 ldp $A[4][2],$A[4][3],[$C[0],#16*11]
391 ldr $A[4][4],[$C[0],#16*12]
396 subs $C[0],$C[2],$C[3] // len - bsz
399 str $C[0],[sp,#48] // save len - bsz
401 for (my $i=0; $i<24; $i+=2) {
404 ldr $C[0],[$C[1]],#8 // *inp++
408 eor $A[$i/5][$i%5],$A[$i/5][$i%5],$C[0]
411 ldr $C[0],[$C[1]],#8 // *inp++
415 eor $A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
420 ldr $C[0],[$C[1]],#8 // *inp++
424 eor $A[4][4],$A[4][4],$C[0]
427 str $C[1],[sp,#40] // save inp
431 ldr $C[1],[sp,#40] // restore arguments
432 ldp $C[2],$C[3],[sp,#48]
438 stp $A[0][0],$A[0][1],[$C[1],#16*0]
439 stp $A[0][2],$A[0][3],[$C[1],#16*1]
440 stp $A[0][4],$A[1][0],[$C[1],#16*2]
441 stp $A[1][1],$A[1][2],[$C[1],#16*3]
442 stp $A[1][3],$A[1][4],[$C[1],#16*4]
443 stp $A[2][0],$A[2][1],[$C[1],#16*5]
444 stp $A[2][2],$A[2][3],[$C[1],#16*6]
445 stp $A[2][4],$A[3][0],[$C[1],#16*7]
446 stp $A[3][1],$A[3][2],[$C[1],#16*8]
447 stp $A[3][3],$A[3][4],[$C[1],#16*9]
448 stp $A[4][0],$A[4][1],[$C[1],#16*10]
449 stp $A[4][2],$A[4][3],[$C[1],#16*11]
450 str $A[4][4],[$C[1],#16*12]
452 mov x0,$C[2] // return value
453 ldp x19,x20,[x29,#16]
455 ldp x21,x22,[x29,#32]
456 ldp x23,x24,[x29,#48]
457 ldp x25,x26,[x29,#64]
458 ldp x27,x28,[x29,#80]
459 ldp x29,x30,[sp],#128
460 .inst 0xd50323bf // autiasp
462 .size SHA3_absorb,.-SHA3_absorb
# Symbolic names for x19..x22 as used by SHA3_squeeze; x0 is moved into
# $A_flat on entry ("put aside arguments"). Presumably the other three
# receive the remaining arguments the same way -- those lines are not
# visible here.
465 my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
468 .type SHA3_squeeze,%function
471 .inst 0xd503233f // paciasp
472 stp x29,x30,[sp,#-48]!
477 mov $A_flat,x0 // put aside arguments
534 .inst 0xd50323bf // autiasp
536 .size SHA3_squeeze,.-SHA3_squeeze
# Register names for the 5x5 state in the vector (_ce) code: each row of @A
# lists five consecutive v-registers in .16b arrangement.
# NOTE(review): the list of row base numbers is on a continuation line that
# is not visible here.
540 my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
541 "v".($_+3).".16b", "v".($_+4).".16b" ],
544 my @C = map("v$_.16b", (25..31)); # temporaries v25..v31
# @D names the same registers as five of the @C entries:
# D[0]=C[4], D[1]=C[5], D[2]=C[6], D[3]=C[2], D[4]=C[3].
# This matches the destinations of the rax1 instructions that produce the
# Theta D values.
545 my @D = @C[4,5,6,2,3];
548 .type KeccakF1600_ce,%function
556 ////////////////////////////////////////////////// Theta
557 eor3 $C[0],$A[4][0],$A[3][0],$A[2][0]
558 eor3 $C[1],$A[4][1],$A[3][1],$A[2][1]
559 eor3 $C[2],$A[4][2],$A[3][2],$A[2][2]
560 eor3 $C[3],$A[4][3],$A[3][3],$A[2][3]
561 eor3 $C[4],$A[4][4],$A[3][4],$A[2][4]
562 eor3 $C[0],$C[0], $A[1][0],$A[0][0]
563 eor3 $C[1],$C[1], $A[1][1],$A[0][1]
564 eor3 $C[2],$C[2], $A[1][2],$A[0][2]
565 eor3 $C[3],$C[3], $A[1][3],$A[0][3]
566 eor3 $C[4],$C[4], $A[1][4],$A[0][4]
568 rax1 $C[5],$C[0],$C[2] // D[1]
569 rax1 $C[6],$C[1],$C[3] // D[2]
570 rax1 $C[2],$C[2],$C[4] // D[3]
571 rax1 $C[3],$C[3],$C[0] // D[4]
572 rax1 $C[4],$C[4],$C[1] // D[0]
574 ////////////////////////////////////////////////// Theta+Rho+Pi
575 xar $C[0], $A[0][1],$D[1],#64-$rhotates[0][1] // C[0]=A[2][0]
577 xar $A[0][1],$A[1][1],$D[1],#64-$rhotates[1][1]
578 xar $A[1][1],$A[1][4],$D[4],#64-$rhotates[1][4]
579 xar $A[1][4],$A[4][2],$D[2],#64-$rhotates[4][2]
580 xar $A[4][2],$A[2][4],$D[4],#64-$rhotates[2][4]
581 xar $A[2][4],$A[4][0],$D[0],#64-$rhotates[4][0]
583 xar $C[1], $A[0][2],$D[2],#64-$rhotates[0][2] // C[1]=A[4][0]
585 xar $A[0][2],$A[2][2],$D[2],#64-$rhotates[2][2]
586 xar $A[2][2],$A[2][3],$D[3],#64-$rhotates[2][3]
587 xar $A[2][3],$A[3][4],$D[4],#64-$rhotates[3][4]
588 xar $A[3][4],$A[4][3],$D[3],#64-$rhotates[4][3]
589 xar $A[4][3],$A[3][0],$D[0],#64-$rhotates[3][0]
591 xar $A[3][0],$A[0][4],$D[4],#64-$rhotates[0][4]
593 xar $D[4], $A[4][4],$D[4],#64-$rhotates[4][4] // D[4]=A[0][4]
594 xar $A[4][4],$A[4][1],$D[1],#64-$rhotates[4][1]
595 xar $A[1][3],$A[1][3],$D[3],#64-$rhotates[1][3] // A[1][3]=A[4][1]
596 xar $A[0][4],$A[3][1],$D[1],#64-$rhotates[3][1] // A[0][4]=A[1][3]
597 xar $A[3][1],$A[1][0],$D[0],#64-$rhotates[1][0]
599 xar $A[1][0],$A[0][3],$D[3],#64-$rhotates[0][3]
601 eor $A[0][0],$A[0][0],$D[0]
603 xar $D[3], $A[3][3],$D[3],#64-$rhotates[3][3] // D[3]=A[0][3]
604 xar $A[0][3],$A[3][2],$D[2],#64-$rhotates[3][2] // A[0][3]=A[3][3]
605 xar $D[1], $A[2][1],$D[1],#64-$rhotates[2][1] // D[1]=A[3][2]
606 xar $D[2], $A[1][2],$D[2],#64-$rhotates[1][2] // D[2]=A[2][1]
607 xar $D[0], $A[2][0],$D[0],#64-$rhotates[2][0] // D[0]=A[1][2]
609 ////////////////////////////////////////////////// Chi+Iota
610 bcax $A[4][0],$C[1], $A[4][2],$A[1][3] // A[1][3]=A[4][1]
611 bcax $A[4][1],$A[1][3],$A[4][3],$A[4][2] // A[1][3]=A[4][1]
612 bcax $A[4][2],$A[4][2],$A[4][4],$A[4][3]
613 bcax $A[4][3],$A[4][3],$C[1], $A[4][4]
614 bcax $A[4][4],$A[4][4],$A[1][3],$C[1] // A[1][3]=A[4][1]
616 ld1r {$C[1]},[x10],#8
618 bcax $A[3][2],$D[1], $A[3][4],$A[0][3] // A[0][3]=A[3][3]
619 bcax $A[3][3],$A[0][3],$A[3][0],$A[3][4] // A[0][3]=A[3][3]
620 bcax $A[3][4],$A[3][4],$A[3][1],$A[3][0]
621 bcax $A[3][0],$A[3][0],$D[1], $A[3][1]
622 bcax $A[3][1],$A[3][1],$A[0][3],$D[1] // A[0][3]=A[3][3]
624 bcax $A[2][0],$C[0], $A[2][2],$D[2]
625 bcax $A[2][1],$D[2], $A[2][3],$A[2][2]
626 bcax $A[2][2],$A[2][2],$A[2][4],$A[2][3]
627 bcax $A[2][3],$A[2][3],$C[0], $A[2][4]
628 bcax $A[2][4],$A[2][4],$D[2], $C[0]
630 bcax $A[1][2],$D[0], $A[1][4],$A[0][4] // A[0][4]=A[1][3]
631 bcax $A[1][3],$A[0][4],$A[1][0],$A[1][4] // A[0][4]=A[1][3]
632 bcax $A[1][4],$A[1][4],$A[1][1],$A[1][0]
633 bcax $A[1][0],$A[1][0],$D[0], $A[1][1]
634 bcax $A[1][1],$A[1][1],$A[0][4],$D[0] // A[0][4]=A[1][3]
636 bcax $A[0][3],$D[3], $A[0][0],$D[4]
637 bcax $A[0][4],$D[4], $A[0][1],$A[0][0]
638 bcax $A[0][0],$A[0][0],$A[0][2],$A[0][1]
639 bcax $A[0][1],$A[0][1],$D[3], $A[0][2]
640 bcax $A[0][2],$A[0][2],$D[4], $D[3]
642 eor $A[0][0],$A[0][0],$C[1]
648 .size KeccakF1600_ce,.-KeccakF1600_ce
650 .type KeccakF1600_cext,%function
653 .inst 0xd503233f // paciasp
654 stp x29,x30,[sp,#-80]!
656 stp d8,d9,[sp,#16] // per ABI requirement
661 for($i=0; $i<24; $i+=2) { # load A[5][5]
664 ldp d$i,d$j,[x0,#8*$i]
672 for($i=0; $i<24; $i+=2) { # store A[5][5]
675 stp d$i,d$j,[x0,#8*$i]
686 .inst 0xd50323bf // autiasp
688 .size KeccakF1600_cext,.-KeccakF1600_cext
# Symbolic names for x0..x3 as used by SHA3_absorb_cext: state pointer,
# input pointer, remaining length and block size.
692 my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));
695 .globl SHA3_absorb_cext
696 .type SHA3_absorb_cext,%function
699 .inst 0xd503233f // paciasp
700 stp x29,x30,[sp,#-80]!
702 stp d8,d9,[sp,#16] // per ABI requirement
707 for($i=0; $i<24; $i+=2) { # load A[5][5]
710 ldp d$i,d$j,[x0,#8*$i]
719 subs $len,$len,$bsz // len - bsz
722 for (my $i=0; $i<24; $i+=2) {
725 ldr d31,[$inp],#8 // *inp++
727 rev64 v31.16b,v31.16b
729 eor $A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b
731 blo .Lprocess_block_ce
732 ldr d31,[$inp],#8 // *inp++
734 rev64 v31.16b,v31.16b
736 eor $A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b
737 beq .Lprocess_block_ce
741 ldr d31,[$inp],#8 // *inp++
743 rev64 v31.16b,v31.16b
745 eor $A[4][4],$A[4][4],v31.16b
756 for($i=0; $i<24; $i+=2) { # store A[5][5]
759 stp d$i,d$j,[x0,#8*$i]
764 add x0,$len,$bsz // return value
771 .inst 0xd50323bf // autiasp
773 .size SHA3_absorb_cext,.-SHA3_absorb_cext
# Symbolic names for x0..x3 as used by SHA3_squeeze_cext.
# NOTE(review): by their names these are state pointer, output pointer,
# length and block size; the body that uses them is not visible here.
777 my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
779 .globl SHA3_squeeze_cext
780 .type SHA3_squeeze_cext,%function
783 .inst 0xd503233f // paciasp
784 stp x29,x30,[sp,#-16]!
792 blo .Lsqueeze_tail_ce
797 beq .Lsqueeze_done_ce
814 beq .Lsqueeze_done_ce
818 beq .Lsqueeze_done_ce
822 beq .Lsqueeze_done_ce
826 beq .Lsqueeze_done_ce
830 beq .Lsqueeze_done_ce
834 beq .Lsqueeze_done_ce
839 .inst 0xd50323bf // autiasp
841 .size SHA3_squeeze_cext,.-SHA3_squeeze_cext
845 .asciz "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
# Base instruction words for the four mnemonics that are hand-encoded;
# the encoder below ORs the operand fields into these values.
849 "rax1" => 0xce608c00, "eor3" => 0xce000000,
850 "bcax" => 0xce200000, "xar" => 0xce800000 );
# Arguments: the instruction mnemonic and its operand string.
853 my ($mnemonic,$arg)=@_;
# Pull up to four operands out of the operand string: the numbers of the
# first two q/v registers ($1, $2), optionally a third register ($3), and
# optionally a fourth operand ($4) that is either a register number or a
# "#"-prefixed arithmetic expression such as "#64-44".
855 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
# Build a raw ".inst" word: the base opcode for the mnemonic, with $1 in the
# low bits, $2 shifted left by 5, $3 by 16 and the evaluated $4 by 10. The
# format string also appends a "//" comment with two string fields.
# NOTE(review): the remaining sprintf arguments are on a line not visible
# here -- presumably the mnemonic and operands.
857 sprintf ".inst\t0x%08x\t//%s %s",
858 $opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),
# Post-process the accumulated $code one line at a time.
863 foreach(split("\n",$code)) {
# Replace every `...` span with the result of evaluating its contents as
# Perl (this is how expressions such as 64-N become plain numbers).
865 s/\`([^\`]*)\`/eval($1)/ge;
# If the line mentions ld1r, rewrite the .16b arrangement to .2d. When that
# does not apply (no ld1r, or nothing was replaced), hand-encode an
# eor3/rax1/xar/bcax instruction on v-registers via unsha3().
867 m/\bld1r\b/ and s/\.16b/.2d/g or
868 s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;