2 # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for ARMv4.
20 # Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit
21 # interleaving. How does it compare to Keccak Code Package? It's as
22 # fast, but several times smaller, and is endian- and ISA-neutral. ISA
23 # neutrality means that minimum ISA requirement is ARMv4, yet it can
24 # be assembled even as Thumb-2. NEON code path is KECCAK_1X_ALT with
25 # register layout taken from Keccak Code Package. It's also as fast,
26 # in fact faster by 10-15% on some processors, and endian-neutral.
30 # Switch to KECCAK_2X variant for non-NEON code and merge almost 1/2
31 # of rotate instructions with logical ones. This resulted in ~10%
32 # improvement on most processors. Switch to KECCAK_2X effectively
33 # minimizes re-loads from temporary storage, and merged rotates just
# eliminate corresponding instructions. As for the latter: when examining
35 # code you'll notice commented ror instructions. These are eliminated
36 # ones, and you should trace destination register below to see what's
# going on. In case you wonder why not all rotates are eliminated: trouble
38 # is that you have operations that require both inputs to be rotated,
39 # e.g. 'eor a,b>>>x,c>>>y'. This conundrum is resolved by using
40 # 'eor a,b,c>>>(x-y)' and then merge-rotating 'a' in next operation
# that takes 'a' as input. And the thing is that this next operation can
42 # be in next round. It's totally possible to "carry" rotate "factors"
43 # to the next round, but it makes code more complex. And the last word
44 # is the keyword, i.e. "almost 1/2" is kind of complexity cap [for the
47 # Reduce per-round instruction count in Thumb-2 case by 16%. This is
48 # achieved by folding ldr/str pairs to their double-word counterparts.
49 # Theoretically this should have improved performance on single-issue
50 # cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as
53 ########################################################################
54 # Numbers are cycles per processed byte. Non-NEON results account even
55 # for input bit interleaving.
57 # r=1088(*) Thumb-2(**) NEON
60 # Cortex-A5 88/+160%, 86, 36
61 # Cortex-A7 78/+160%, 68, 34
62 # Cortex-A8 51/+230%, 57, 30
63 # Cortex-A9 53/+210%, 51, 26
64 # Cortex-A15 42/+160%, 38, 18
65 # Snapdragon S4 43/+210%, 38, 24
67 # (*) Corresponds to SHA3-256. Percentage after slash is improvement
68 # over compiler-generated KECCAK_2X reference code.
69 # (**) Thumb-2 results for Cortex-A5/A7 are likely to apply even to
70 # Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
71 # processors are presented mostly for reference purposes.
73 # $output is the last argument if it looks like a file (it has an extension)
74 # $flavour is the first argument if it doesn't look like a file
# Pull the translator arguments off @ARGV: the trailing argument is the
# output file iff it carries an extension; the leading argument is the
# flavour iff it contains no dot (and thus cannot be a file name).
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : undef;
78 if ($flavour && $flavour ne "void") {
79 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
80 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
81 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
82 die "can't locate arm-xlate.pl";
84 open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
85 or die "can't call $xlate: $!";
87 $output and open STDOUT,">$output";
# Scratch register allocation for the non-NEON path: @C holds the ten
# 32-bit halves of the five interleaved C[] lanes (r0-r9), while @E is
# the spill/temp set (r10-r12 plus r14, skipping r13/sp).
my @C = map { "r$_" } 0 .. 9;
my @E = map { "r$_" } 10 .. 12, 14;
93 ########################################################################
95 # ----->+-----------------------+
96 # | uint64_t A[5][5] |
98 # +200->+-----------------------+
101 # +240->+-----------------------+
102 # | uint64_t T[5][5] |
104 # +440->+-----------------------+
106 # +444->+-----------------------+
108 # +448->+-----------------------+
# Byte offsets into the stack frame laid out above: the 5x5 uint64_t
# state A starts at sp+0 (rows 0,5,10,15,20 lanes apart), the five
# 64-bit D[] words follow at sp+200, and the 5x5 temporary T occupies
# sp+240 onward. Each lane is 8 bytes.
my @A = map { my $row = $_; [ map { 8*($row + $_) } 0 .. 4 ] } (0, 5, 10, 15, 20);
my @D = map { 8*$_ } 25 .. 29;
my @T = map { my $row = $_; [ map { 8*($row + $_) } 0 .. 4 ] } (30, 35, 40, 45, 50);
116 #include "arm_arch.h"
118 #if defined(__thumb2__)
127 .type iotas32, %object
130 .long 0x00000001, 0x00000000
131 .long 0x00000000, 0x00000089
132 .long 0x00000000, 0x8000008b
133 .long 0x00000000, 0x80008080
134 .long 0x00000001, 0x0000008b
135 .long 0x00000001, 0x00008000
136 .long 0x00000001, 0x80008088
137 .long 0x00000001, 0x80000082
138 .long 0x00000000, 0x0000000b
139 .long 0x00000000, 0x0000000a
140 .long 0x00000001, 0x00008082
141 .long 0x00000000, 0x00008003
142 .long 0x00000001, 0x0000808b
143 .long 0x00000001, 0x8000000b
144 .long 0x00000001, 0x8000008a
145 .long 0x00000001, 0x80000081
146 .long 0x00000000, 0x80000081
147 .long 0x00000000, 0x80000008
148 .long 0x00000000, 0x00000083
149 .long 0x00000000, 0x80008003
150 .long 0x00000001, 0x80008088
151 .long 0x00000000, 0x80000088
152 .long 0x00000001, 0x00008000
153 .long 0x00000000, 0x80008082
154 .size iotas32,.-iotas32
156 .type KeccakF1600_int, %function
159 add @C[9],sp,#$A[4][2]
160 add @E[2],sp,#$A[0][0]
161 add @E[0],sp,#$A[1][0]
162 ldmia @C[9],{@C[4]-@C[9]} @ A[4][2..4]
165 eor @E[1],@E[1],@E[1]
173 my (@A,@R); (@A[0..4],@R) = @_;
176 ldmia @E[2],{@C[0]-@C[3]} @ A[0][0..1]
177 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][0..1]
179 eor @C[0],@C[0],@E[0]
180 eor @C[1],@C[1],@E[1]
181 eor @C[2],@C[2],@E[2]
182 ldrd @E[0],@E[1],[sp,#$A[1][2]]
183 eor @C[3],@C[3],@E[3]
184 ldrd @E[2],@E[3],[sp,#$A[1][3]]
185 eor @C[4],@C[4],@E[0]
186 eor @C[5],@C[5],@E[1]
187 eor @C[6],@C[6],@E[2]
188 ldrd @E[0],@E[1],[sp,#$A[1][4]]
189 eor @C[7],@C[7],@E[3]
190 ldrd @E[2],@E[3],[sp,#$A[2][0]]
191 eor @C[8],@C[8],@E[0]
192 eor @C[9],@C[9],@E[1]
193 eor @C[0],@C[0],@E[2]
194 ldrd @E[0],@E[1],[sp,#$A[2][1]]
195 eor @C[1],@C[1],@E[3]
196 ldrd @E[2],@E[3],[sp,#$A[2][2]]
197 eor @C[2],@C[2],@E[0]
198 eor @C[3],@C[3],@E[1]
199 eor @C[4],@C[4],@E[2]
200 ldrd @E[0],@E[1],[sp,#$A[2][3]]
201 eor @C[5],@C[5],@E[3]
202 ldrd @E[2],@E[3],[sp,#$A[2][4]]
203 eor @C[6],@C[6],@E[0]
204 eor @C[7],@C[7],@E[1]
205 eor @C[8],@C[8],@E[2]
206 ldrd @E[0],@E[1],[sp,#$A[3][0]]
207 eor @C[9],@C[9],@E[3]
208 ldrd @E[2],@E[3],[sp,#$A[3][1]]
209 eor @C[0],@C[0],@E[0]
210 eor @C[1],@C[1],@E[1]
211 eor @C[2],@C[2],@E[2]
212 ldrd @E[0],@E[1],[sp,#$A[3][2]]
213 eor @C[3],@C[3],@E[3]
214 ldrd @E[2],@E[3],[sp,#$A[3][3]]
215 eor @C[4],@C[4],@E[0]
216 eor @C[5],@C[5],@E[1]
217 eor @C[6],@C[6],@E[2]
218 ldrd @E[0],@E[1],[sp,#$A[3][4]]
219 eor @C[7],@C[7],@E[3]
220 ldrd @E[2],@E[3],[sp,#$A[4][0]]
221 eor @C[8],@C[8],@E[0]
222 eor @C[9],@C[9],@E[1]
223 eor @C[0],@C[0],@E[2]
224 ldrd @E[0],@E[1],[sp,#$A[4][1]]
225 eor @C[1],@C[1],@E[3]
226 ldrd @E[2],@E[3],[sp,#$A[0][2]]
227 eor @C[2],@C[2],@E[0]
228 eor @C[3],@C[3],@E[1]
229 eor @C[4],@C[4],@E[2]
230 ldrd @E[0],@E[1],[sp,#$A[0][3]]
231 eor @C[5],@C[5],@E[3]
232 ldrd @E[2],@E[3],[sp,#$A[0][4]]
234 eor @C[0],@C[0],@E[0]
235 add @E[0],sp,#$A[1][2]
236 eor @C[1],@C[1],@E[1]
237 eor @C[2],@C[2],@E[2]
238 eor @C[3],@C[3],@E[3]
239 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][2..3]
240 eor @C[4],@C[4],@E[0]
241 add @E[0],sp,#$A[1][4]
242 eor @C[5],@C[5],@E[1]
243 eor @C[6],@C[6],@E[2]
244 eor @C[7],@C[7],@E[3]
245 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][4]..A[2][0]
246 eor @C[8],@C[8],@E[0]
247 add @E[0],sp,#$A[2][1]
248 eor @C[9],@C[9],@E[1]
249 eor @C[0],@C[0],@E[2]
250 eor @C[1],@C[1],@E[3]
251 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][1..2]
252 eor @C[2],@C[2],@E[0]
253 add @E[0],sp,#$A[2][3]
254 eor @C[3],@C[3],@E[1]
255 eor @C[4],@C[4],@E[2]
256 eor @C[5],@C[5],@E[3]
257 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][3..4]
258 eor @C[6],@C[6],@E[0]
259 add @E[0],sp,#$A[3][0]
260 eor @C[7],@C[7],@E[1]
261 eor @C[8],@C[8],@E[2]
262 eor @C[9],@C[9],@E[3]
263 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][0..1]
264 eor @C[0],@C[0],@E[0]
265 add @E[0],sp,#$A[3][2]
266 eor @C[1],@C[1],@E[1]
267 eor @C[2],@C[2],@E[2]
268 eor @C[3],@C[3],@E[3]
269 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][2..3]
270 eor @C[4],@C[4],@E[0]
271 add @E[0],sp,#$A[3][4]
272 eor @C[5],@C[5],@E[1]
273 eor @C[6],@C[6],@E[2]
274 eor @C[7],@C[7],@E[3]
275 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][4]..A[4][0]
276 eor @C[8],@C[8],@E[0]
277 ldr @E[0],[sp,#$A[4][1]] @ A[4][1]
278 eor @C[9],@C[9],@E[1]
279 ldr @E[1],[sp,#$A[4][1]+4]
280 eor @C[0],@C[0],@E[2]
281 ldr @E[2],[sp,#$A[0][2]] @ A[0][2]
282 eor @C[1],@C[1],@E[3]
283 ldr @E[3],[sp,#$A[0][2]+4]
284 eor @C[2],@C[2],@E[0]
285 add @E[0],sp,#$A[0][3]
286 eor @C[3],@C[3],@E[1]
287 eor @C[4],@C[4],@E[2]
288 eor @C[5],@C[5],@E[3]
289 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[0][3..4]
291 eor @C[6],@C[6],@E[0]
292 eor @C[7],@C[7],@E[1]
293 eor @C[8],@C[8],@E[2]
294 eor @C[9],@C[9],@E[3]
296 eor @E[0],@C[0],@C[5],ror#32-1 @ E[0] = ROL64(C[2], 1) ^ C[0];
297 str.l @E[0],[sp,#$D[1]] @ D[1] = E[0]
298 eor @E[1],@C[1],@C[4]
299 str.h @E[1],[sp,#$D[1]+4]
300 eor @E[2],@C[6],@C[1],ror#32-1 @ E[1] = ROL64(C[0], 1) ^ C[3];
301 eor @E[3],@C[7],@C[0]
302 str.l @E[2],[sp,#$D[4]] @ D[4] = E[1]
303 eor @C[0],@C[8],@C[3],ror#32-1 @ C[0] = ROL64(C[1], 1) ^ C[4];
304 str.h @E[3],[sp,#$D[4]+4]
305 eor @C[1],@C[9],@C[2]
306 str.l @C[0],[sp,#$D[0]] @ D[0] = C[0]
307 eor @C[2],@C[2],@C[7],ror#32-1 @ C[1] = ROL64(C[3], 1) ^ C[1];
308 ldr.l @C[7],[sp,#$A[3][3]]
309 eor @C[3],@C[3],@C[6]
310 str.h @C[1],[sp,#$D[0]+4]
311 ldr.h @C[6],[sp,#$A[3][3]+4]
312 str.l @C[2],[sp,#$D[2]] @ D[2] = C[1]
313 eor @C[4],@C[4],@C[9],ror#32-1 @ C[2] = ROL64(C[4], 1) ^ C[2];
314 str.h @C[3],[sp,#$D[2]+4]
315 eor @C[5],@C[5],@C[8]
317 ldr.l @C[8],[sp,#$A[4][4]]
318 ldr.h @C[9],[sp,#$A[4][4]+4]
319 str.l @C[4],[sp,#$D[3]] @ D[3] = C[2]
320 eor @C[7],@C[7],@C[4]
321 str.h @C[5],[sp,#$D[3]+4]
322 eor @C[6],@C[6],@C[5]
323 ldr.l @C[4],[sp,#$A[0][0]]
324 @ ror @C[7],@C[7],#32-10 @ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
325 @ ror @C[6],@C[6],#32-11
326 ldr.h @C[5],[sp,#$A[0][0]+4]
327 eor @C[8],@C[8],@E[2]
328 eor @C[9],@C[9],@E[3]
329 ldr.l @E[2],[sp,#$A[2][2]]
330 eor @C[0],@C[0],@C[4]
331 ldr.h @E[3],[sp,#$A[2][2]+4]
332 @ ror @C[8],@C[8],#32-7 @ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
333 @ ror @C[9],@C[9],#32-7
334 eor @C[1],@C[1],@C[5] @ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
335 eor @E[2],@E[2],@C[2]
336 ldr.l @C[2],[sp,#$A[1][1]]
337 eor @E[3],@E[3],@C[3]
338 ldr.h @C[3],[sp,#$A[1][1]+4]
339 ror @C[5],@E[2],#32-21 @ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
340 ldr @E[2],[sp,#444] @ load counter
341 eor @C[2],@C[2],@E[0]
343 ror @C[4],@E[3],#32-22
344 add @E[3],@E[0],@E[2]
345 eor @C[3],@C[3],@E[1]
347 $code.=<<___ if ($A[0][0] != $T[0][0]);
348 ldmia @E[3],{@E[0],@E[1]} @ iotas[i]
350 $code.=<<___ if ($A[0][0] == $T[0][0]);
351 ldr.l @E[0],[@E[3],#8] @ iotas[i].lo
353 ldr.h @E[1],[@E[3],#12] @ iotas[i].hi
355 str @E[2],[sp,#444] @ store counter
358 bic @E[2],@C[4],@C[2],ror#32-22
359 bic @E[3],@C[5],@C[3],ror#32-22
360 ror @C[2],@C[2],#32-22 @ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
361 ror @C[3],@C[3],#32-22
362 eor @E[2],@E[2],@C[0]
363 eor @E[3],@E[3],@C[1]
364 eor @E[0],@E[0],@E[2]
365 eor @E[1],@E[1],@E[3]
366 str.l @E[0],[sp,#$R[0][0]] @ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
367 bic @E[2],@C[6],@C[4],ror#11
368 str.h @E[1],[sp,#$R[0][0]+4]
369 bic @E[3],@C[7],@C[5],ror#10
370 bic @E[0],@C[8],@C[6],ror#32-(11-7)
371 bic @E[1],@C[9],@C[7],ror#32-(10-7)
372 eor @E[2],@C[2],@E[2],ror#32-11
373 str.l @E[2],[sp,#$R[0][1]] @ R[0][1] = C[1] ^ (~C[2] & C[3]);
374 eor @E[3],@C[3],@E[3],ror#32-10
375 str.h @E[3],[sp,#$R[0][1]+4]
376 eor @E[0],@C[4],@E[0],ror#32-7
377 eor @E[1],@C[5],@E[1],ror#32-7
378 str.l @E[0],[sp,#$R[0][2]] @ R[0][2] = C[2] ^ (~C[3] & C[4]);
379 bic @E[2],@C[0],@C[8],ror#32-7
380 str.h @E[1],[sp,#$R[0][2]+4]
381 bic @E[3],@C[1],@C[9],ror#32-7
382 eor @E[2],@E[2],@C[6],ror#32-11
383 str.l @E[2],[sp,#$R[0][3]] @ R[0][3] = C[3] ^ (~C[4] & C[0]);
384 eor @E[3],@E[3],@C[7],ror#32-10
385 str.h @E[3],[sp,#$R[0][3]+4]
386 bic @E[0],@C[2],@C[0]
388 ldr.l @C[0],[sp,#$A[0][3]] @ A[0][3]
389 bic @E[1],@C[3],@C[1]
390 ldr.h @C[1],[sp,#$A[0][3]+4]
391 eor @E[0],@E[0],@C[8],ror#32-7
392 eor @E[1],@E[1],@C[9],ror#32-7
393 str.l @E[0],[sp,#$R[0][4]] @ R[0][4] = C[4] ^ (~C[0] & C[1]);
395 str.h @E[1],[sp,#$R[0][4]+4]
397 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[3..4]
398 ldmia @C[9],{@C[6]-@C[9]} @ D[0..1]
400 ldr.l @C[2],[sp,#$A[1][4]] @ A[1][4]
401 eor @C[0],@C[0],@E[0]
402 ldr.h @C[3],[sp,#$A[1][4]+4]
403 eor @C[1],@C[1],@E[1]
404 @ ror @C[0],@C[0],#32-14 @ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
405 ldr.l @E[0],[sp,#$A[3][1]] @ A[3][1]
406 @ ror @C[1],@C[1],#32-14
407 ldr.h @E[1],[sp,#$A[3][1]+4]
409 eor @C[2],@C[2],@E[2]
410 ldr.l @C[4],[sp,#$A[2][0]] @ A[2][0]
411 eor @C[3],@C[3],@E[3]
412 ldr.h @C[5],[sp,#$A[2][0]+4]
413 @ ror @C[2],@C[2],#32-10 @ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
414 @ ror @C[3],@C[3],#32-10
416 eor @C[6],@C[6],@C[4]
417 ldr.l @E[2],[sp,#$D[2]] @ D[2]
418 eor @C[7],@C[7],@C[5]
419 ldr.h @E[3],[sp,#$D[2]+4]
420 ror @C[5],@C[6],#32-1 @ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
421 ror @C[4],@C[7],#32-2
423 eor @E[0],@E[0],@C[8]
424 ldr.l @C[8],[sp,#$A[4][2]] @ A[4][2]
425 eor @E[1],@E[1],@C[9]
426 ldr.h @C[9],[sp,#$A[4][2]+4]
427 ror @C[7],@E[0],#32-22 @ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
428 ror @C[6],@E[1],#32-23
430 bic @E[0],@C[4],@C[2],ror#32-10
431 bic @E[1],@C[5],@C[3],ror#32-10
432 eor @E[2],@E[2],@C[8]
433 eor @E[3],@E[3],@C[9]
434 ror @C[9],@E[2],#32-30 @ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
435 ror @C[8],@E[3],#32-31
436 eor @E[0],@E[0],@C[0],ror#32-14
437 eor @E[1],@E[1],@C[1],ror#32-14
438 str.l @E[0],[sp,#$R[1][0]] @ R[1][0] = C[0] ^ (~C[1] & C[2])
439 bic @E[2],@C[6],@C[4]
440 str.h @E[1],[sp,#$R[1][0]+4]
441 bic @E[3],@C[7],@C[5]
442 eor @E[2],@E[2],@C[2],ror#32-10
443 str.l @E[2],[sp,#$R[1][1]] @ R[1][1] = C[1] ^ (~C[2] & C[3]);
444 eor @E[3],@E[3],@C[3],ror#32-10
445 str.h @E[3],[sp,#$R[1][1]+4]
446 bic @E[0],@C[8],@C[6]
447 bic @E[1],@C[9],@C[7]
448 bic @E[2],@C[0],@C[8],ror#14
449 bic @E[3],@C[1],@C[9],ror#14
450 eor @E[0],@E[0],@C[4]
451 eor @E[1],@E[1],@C[5]
452 str.l @E[0],[sp,#$R[1][2]] @ R[1][2] = C[2] ^ (~C[3] & C[4]);
453 bic @C[2],@C[2],@C[0],ror#32-(14-10)
454 str.h @E[1],[sp,#$R[1][2]+4]
455 eor @E[2],@C[6],@E[2],ror#32-14
456 bic @E[1],@C[3],@C[1],ror#32-(14-10)
457 str.l @E[2],[sp,#$R[1][3]] @ R[1][3] = C[3] ^ (~C[4] & C[0]);
458 eor @E[3],@C[7],@E[3],ror#32-14
459 str.h @E[3],[sp,#$R[1][3]+4]
461 ldr.l @C[1],[sp,#$A[0][1]] @ A[0][1]
462 eor @E[0],@C[8],@C[2],ror#32-10
463 ldr.h @C[0],[sp,#$A[0][1]+4]
464 eor @E[1],@C[9],@E[1],ror#32-10
465 str.l @E[0],[sp,#$R[1][4]] @ R[1][4] = C[4] ^ (~C[0] & C[1]);
466 str.h @E[1],[sp,#$R[1][4]+4]
469 ldmia @E[2],{@E[0]-@E[2],@E[3]} @ D[1..2]
470 ldr.l @C[2],[sp,#$A[1][2]] @ A[1][2]
471 ldr.h @C[3],[sp,#$A[1][2]+4]
472 ldmia @C[9],{@C[6]-@C[9]} @ D[3..4]
474 eor @C[1],@C[1],@E[0]
475 ldr.l @C[4],[sp,#$A[2][3]] @ A[2][3]
476 eor @C[0],@C[0],@E[1]
477 ldr.h @C[5],[sp,#$A[2][3]+4]
478 ror @C[0],@C[0],#32-1 @ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
480 eor @C[2],@C[2],@E[2]
481 ldr.l @E[0],[sp,#$A[3][4]] @ A[3][4]
482 eor @C[3],@C[3],@E[3]
483 ldr.h @E[1],[sp,#$A[3][4]+4]
484 @ ror @C[2],@C[2],#32-3 @ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
485 ldr.l @E[2],[sp,#$D[0]] @ D[0]
486 @ ror @C[3],@C[3],#32-3
487 ldr.h @E[3],[sp,#$D[0]+4]
489 eor @C[4],@C[4],@C[6]
490 eor @C[5],@C[5],@C[7]
491 @ ror @C[5],@C[6],#32-12 @ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
492 @ ror @C[4],@C[7],#32-13 @ [track reverse order below]
494 eor @E[0],@E[0],@C[8]
495 ldr.l @C[8],[sp,#$A[4][0]] @ A[4][0]
496 eor @E[1],@E[1],@C[9]
497 ldr.h @C[9],[sp,#$A[4][0]+4]
498 ror @C[6],@E[0],#32-4 @ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
499 ror @C[7],@E[1],#32-4
501 eor @E[2],@E[2],@C[8]
502 eor @E[3],@E[3],@C[9]
503 ror @C[8],@E[2],#32-9 @ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
504 ror @C[9],@E[3],#32-9
506 bic @E[0],@C[5],@C[2],ror#13-3
507 bic @E[1],@C[4],@C[3],ror#12-3
508 bic @E[2],@C[6],@C[5],ror#32-13
509 bic @E[3],@C[7],@C[4],ror#32-12
510 eor @E[0],@C[0],@E[0],ror#32-13
511 eor @E[1],@C[1],@E[1],ror#32-12
512 str.l @E[0],[sp,#$R[2][0]] @ R[2][0] = C[0] ^ (~C[1] & C[2])
513 eor @E[2],@E[2],@C[2],ror#32-3
514 str.h @E[1],[sp,#$R[2][0]+4]
515 eor @E[3],@E[3],@C[3],ror#32-3
516 str.l @E[2],[sp,#$R[2][1]] @ R[2][1] = C[1] ^ (~C[2] & C[3]);
517 bic @E[0],@C[8],@C[6]
518 bic @E[1],@C[9],@C[7]
519 str.h @E[3],[sp,#$R[2][1]+4]
520 eor @E[0],@E[0],@C[5],ror#32-13
521 eor @E[1],@E[1],@C[4],ror#32-12
522 str.l @E[0],[sp,#$R[2][2]] @ R[2][2] = C[2] ^ (~C[3] & C[4]);
523 bic @E[2],@C[0],@C[8]
524 str.h @E[1],[sp,#$R[2][2]+4]
525 bic @E[3],@C[1],@C[9]
526 eor @E[2],@E[2],@C[6]
527 eor @E[3],@E[3],@C[7]
528 str.l @E[2],[sp,#$R[2][3]] @ R[2][3] = C[3] ^ (~C[4] & C[0]);
529 bic @E[0],@C[2],@C[0],ror#3
530 str.h @E[3],[sp,#$R[2][3]+4]
531 bic @E[1],@C[3],@C[1],ror#3
532 ldr.l @C[1],[sp,#$A[0][4]] @ A[0][4] [in reverse order]
533 eor @E[0],@C[8],@E[0],ror#32-3
534 ldr.h @C[0],[sp,#$A[0][4]+4]
535 eor @E[1],@C[9],@E[1],ror#32-3
536 str.l @E[0],[sp,#$R[2][4]] @ R[2][4] = C[4] ^ (~C[0] & C[1]);
538 str.h @E[1],[sp,#$R[2][4]+4]
540 ldr.l @E[0],[sp,#$D[4]] @ D[4]
541 ldr.h @E[1],[sp,#$D[4]+4]
542 ldr.l @E[2],[sp,#$D[0]] @ D[0]
543 ldr.h @E[3],[sp,#$D[0]+4]
545 ldmia @C[9],{@C[6]-@C[9]} @ D[1..2]
547 eor @C[1],@C[1],@E[0]
548 ldr.l @C[2],[sp,#$A[1][0]] @ A[1][0]
549 eor @C[0],@C[0],@E[1]
550 ldr.h @C[3],[sp,#$A[1][0]+4]
551 @ ror @C[1],@E[0],#32-13 @ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
552 ldr.l @C[4],[sp,#$A[2][1]] @ A[2][1]
553 @ ror @C[0],@E[1],#32-14 @ [was loaded in reverse order]
554 ldr.h @C[5],[sp,#$A[2][1]+4]
556 eor @C[2],@C[2],@E[2]
557 ldr.l @E[0],[sp,#$A[3][2]] @ A[3][2]
558 eor @C[3],@C[3],@E[3]
559 ldr.h @E[1],[sp,#$A[3][2]+4]
560 @ ror @C[2],@C[2],#32-18 @ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
561 ldr.l @E[2],[sp,#$D[3]] @ D[3]
562 @ ror @C[3],@C[3],#32-18
563 ldr.h @E[3],[sp,#$D[3]+4]
565 eor @C[6],@C[6],@C[4]
566 eor @C[7],@C[7],@C[5]
567 ror @C[4],@C[6],#32-5 @ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
568 ror @C[5],@C[7],#32-5
570 eor @E[0],@E[0],@C[8]
571 ldr.l @C[8],[sp,#$A[4][3]] @ A[4][3]
572 eor @E[1],@E[1],@C[9]
573 ldr.h @C[9],[sp,#$A[4][3]+4]
574 ror @C[7],@E[0],#32-7 @ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
575 ror @C[6],@E[1],#32-8
577 eor @E[2],@E[2],@C[8]
578 eor @E[3],@E[3],@C[9]
579 ror @C[8],@E[2],#32-28 @ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
580 ror @C[9],@E[3],#32-28
582 bic @E[0],@C[4],@C[2],ror#32-18
583 bic @E[1],@C[5],@C[3],ror#32-18
584 eor @E[0],@E[0],@C[0],ror#32-14
585 eor @E[1],@E[1],@C[1],ror#32-13
586 str.l @E[0],[sp,#$R[3][0]] @ R[3][0] = C[0] ^ (~C[1] & C[2])
587 bic @E[2],@C[6],@C[4]
588 str.h @E[1],[sp,#$R[3][0]+4]
589 bic @E[3],@C[7],@C[5]
590 eor @E[2],@E[2],@C[2],ror#32-18
591 str.l @E[2],[sp,#$R[3][1]] @ R[3][1] = C[1] ^ (~C[2] & C[3]);
592 eor @E[3],@E[3],@C[3],ror#32-18
593 str.h @E[3],[sp,#$R[3][1]+4]
594 bic @E[0],@C[8],@C[6]
595 bic @E[1],@C[9],@C[7]
596 bic @E[2],@C[0],@C[8],ror#14
597 bic @E[3],@C[1],@C[9],ror#13
598 eor @E[0],@E[0],@C[4]
599 eor @E[1],@E[1],@C[5]
600 str.l @E[0],[sp,#$R[3][2]] @ R[3][2] = C[2] ^ (~C[3] & C[4]);
601 bic @C[2],@C[2],@C[0],ror#18-14
602 str.h @E[1],[sp,#$R[3][2]+4]
603 eor @E[2],@C[6],@E[2],ror#32-14
604 bic @E[1],@C[3],@C[1],ror#18-13
605 eor @E[3],@C[7],@E[3],ror#32-13
606 str.l @E[2],[sp,#$R[3][3]] @ R[3][3] = C[3] ^ (~C[4] & C[0]);
607 str.h @E[3],[sp,#$R[3][3]+4]
609 ldr.l @C[0],[sp,#$A[0][2]] @ A[0][2]
610 eor @E[0],@C[8],@C[2],ror#32-18
611 ldr.h @C[1],[sp,#$A[0][2]+4]
612 eor @E[1],@C[9],@E[1],ror#32-18
613 str.l @E[0],[sp,#$R[3][4]] @ R[3][4] = C[4] ^ (~C[0] & C[1]);
614 str.h @E[1],[sp,#$R[3][4]+4]
616 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[2..3]
617 ldr.l @C[2],[sp,#$A[1][3]] @ A[1][3]
618 ldr.h @C[3],[sp,#$A[1][3]+4]
619 ldr.l @C[6],[sp,#$D[4]] @ D[4]
620 ldr.h @C[7],[sp,#$D[4]+4]
622 eor @C[0],@C[0],@E[0]
623 ldr.l @C[4],[sp,#$A[2][4]] @ A[2][4]
624 eor @C[1],@C[1],@E[1]
625 ldr.h @C[5],[sp,#$A[2][4]+4]
626 @ ror @C[0],@C[0],#32-31 @ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
627 ldr.l @C[8],[sp,#$D[0]] @ D[0]
628 @ ror @C[1],@C[1],#32-31
629 ldr.h @C[9],[sp,#$D[0]+4]
631 eor @E[2],@E[2],@C[2]
632 ldr.l @E[0],[sp,#$A[3][0]] @ A[3][0]
633 eor @E[3],@E[3],@C[3]
634 ldr.h @E[1],[sp,#$A[3][0]+4]
635 ror @C[3],@E[2],#32-27 @ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
636 ldr.l @E[2],[sp,#$D[1]] @ D[1]
637 ror @C[2],@E[3],#32-28
638 ldr.h @E[3],[sp,#$D[1]+4]
640 eor @C[6],@C[6],@C[4]
641 eor @C[7],@C[7],@C[5]
642 ror @C[5],@C[6],#32-19 @ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
643 ror @C[4],@C[7],#32-20
645 eor @E[0],@E[0],@C[8]
646 ldr.l @C[8],[sp,#$A[4][1]] @ A[4][1]
647 eor @E[1],@E[1],@C[9]
648 ldr.h @C[9],[sp,#$A[4][1]+4]
649 ror @C[7],@E[0],#32-20 @ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
650 ror @C[6],@E[1],#32-21
652 eor @C[8],@C[8],@E[2]
653 eor @C[9],@C[9],@E[3]
654 @ ror @C[8],@C[2],#32-1 @ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
655 @ ror @C[9],@C[3],#32-1
657 bic @E[0],@C[4],@C[2]
658 bic @E[1],@C[5],@C[3]
659 eor @E[0],@E[0],@C[0],ror#32-31
660 str.l @E[0],[sp,#$R[4][0]] @ R[4][0] = C[0] ^ (~C[1] & C[2])
661 eor @E[1],@E[1],@C[1],ror#32-31
662 str.h @E[1],[sp,#$R[4][0]+4]
663 bic @E[2],@C[6],@C[4]
664 bic @E[3],@C[7],@C[5]
665 eor @E[2],@E[2],@C[2]
666 eor @E[3],@E[3],@C[3]
667 str.l @E[2],[sp,#$R[4][1]] @ R[4][1] = C[1] ^ (~C[2] & C[3]);
668 bic @E[0],@C[8],@C[6],ror#1
669 str.h @E[3],[sp,#$R[4][1]+4]
670 bic @E[1],@C[9],@C[7],ror#1
671 bic @E[2],@C[0],@C[8],ror#31-1
672 bic @E[3],@C[1],@C[9],ror#31-1
673 eor @C[4],@C[4],@E[0],ror#32-1
674 str.l @C[4],[sp,#$R[4][2]] @ R[4][2] = C[2] ^= (~C[3] & C[4]);
675 eor @C[5],@C[5],@E[1],ror#32-1
676 str.h @C[5],[sp,#$R[4][2]+4]
677 eor @C[6],@C[6],@E[2],ror#32-31
678 eor @C[7],@C[7],@E[3],ror#32-31
679 str.l @C[6],[sp,#$R[4][3]] @ R[4][3] = C[3] ^= (~C[4] & C[0]);
680 bic @E[0],@C[2],@C[0],ror#32-31
681 str.h @C[7],[sp,#$R[4][3]+4]
682 bic @E[1],@C[3],@C[1],ror#32-31
683 add @E[2],sp,#$R[0][0]
684 eor @C[8],@E[0],@C[8],ror#32-1
685 add @E[0],sp,#$R[1][0]
686 eor @C[9],@E[1],@C[9],ror#32-1
687 str.l @C[8],[sp,#$R[4][4]] @ R[4][4] = C[4] ^= (~C[0] & C[1]);
688 str.h @C[9],[sp,#$R[4][4]+4]
701 moveq pc,lr @ be binary compatible with V4, yet
702 bx lr @ interoperable with Thumb ISA:-)
704 .size KeccakF1600_int,.-KeccakF1600_int
706 .type KeccakF1600, %function
709 stmdb sp!,{r0,r4-r11,lr}
710 sub sp,sp,#440+16 @ space for A[5][5],D[5],T[5][5],...
712 add @E[0],r0,#$A[1][0]
713 add @E[1],sp,#$A[1][0]
714 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
715 stmia sp, {@C[0]-@C[9]}
716 ldmia @E[0]!,{@C[0]-@C[9]}
717 stmia @E[1]!,{@C[0]-@C[9]}
718 ldmia @E[0]!,{@C[0]-@C[9]}
719 stmia @E[1]!,{@C[0]-@C[9]}
720 ldmia @E[0]!,{@C[0]-@C[9]}
721 stmia @E[1]!,{@C[0]-@C[9]}
722 ldmia @E[0], {@C[0]-@C[9]}
723 add @E[2],sp,#$A[0][0]
724 add @E[0],sp,#$A[1][0]
725 stmia @E[1], {@C[0]-@C[9]}
729 ldr @E[1], [sp,#440+16] @ restore pointer to A
730 ldmia sp, {@C[0]-@C[9]}
731 stmia @E[1]!,{@C[0]-@C[9]} @ return A[5][5]
732 ldmia @E[0]!,{@C[0]-@C[9]}
733 stmia @E[1]!,{@C[0]-@C[9]}
734 ldmia @E[0]!,{@C[0]-@C[9]}
735 stmia @E[1]!,{@C[0]-@C[9]}
736 ldmia @E[0]!,{@C[0]-@C[9]}
737 stmia @E[1]!,{@C[0]-@C[9]}
738 ldmia @E[0], {@C[0]-@C[9]}
739 stmia @E[1], {@C[0]-@C[9]}
743 ldmia sp!,{r4-r11,pc}
745 ldmia sp!,{r4-r11,lr}
747 moveq pc,lr @ be binary compatible with V4, yet
748 bx lr @ interoperable with Thumb ISA:-)
750 .size KeccakF1600,.-KeccakF1600
752 { my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));
754 ########################################################################
756 # ----->+-----------------------+
757 # | uint64_t A[5][5] |
760 # +456->+-----------------------+
762 # +460->+-----------------------+
764 # +464->+-----------------------+
766 # +468->+-----------------------+
768 # +472->+-----------------------+
770 # +476->+-----------------------+
771 # | const void *inp |
772 # +480->+-----------------------+
774 # +484->+-----------------------+
776 # +488->+-----------------------+
781 .type SHA3_absorb,%function
784 stmdb sp!,{r0-r12,lr}
787 add $A_flat,r0,#$A[1][0]
795 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
796 stmia $inp!, {@C[0]-@C[9]}
797 ldmia $A_flat!,{@C[0]-@C[9]}
798 stmia $inp!, {@C[0]-@C[9]}
799 ldmia $A_flat!,{@C[0]-@C[9]}
800 stmia $inp!, {@C[0]-@C[9]}
801 ldmia $A_flat!,{@C[0]-@C[9]}
802 stmia $inp!, {@C[0]-@C[9]}
803 ldmia $A_flat!,{@C[0]-@C[9]}
804 stmia $inp, {@C[0]-@C[9]}
806 ldr $inp,[sp,#476] @ restore $inp
813 mov r6,#0x11 @ compose constants
818 orr r6,r6,r6,lsl#16 @ 0x11111111
819 orr r9,r9,r9,lsl#16 @ 0x00ff00ff
820 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
821 orr r7,r6,r6,lsl#1 @ 0x33333333
822 orr r6,r6,r6,lsl#2 @ 0x55555555
835 str r0,[sp,#480] @ save len - bsz
848 orr r0,r0,r3,lsl#24 @ lo
852 orr r1,r1,r3,lsl#24 @ hi
854 and r2,r0,r6 @ &=0x55555555
855 and r0,r0,r6,lsl#1 @ &=0xaaaaaaaa
856 and r3,r1,r6 @ &=0x55555555
857 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
862 and r2,r2,r7 @ &=0x33333333
863 and r0,r0,r7,lsl#2 @ &=0xcccccccc
864 and r3,r3,r7 @ &=0x33333333
865 and r1,r1,r7,lsl#2 @ &=0xcccccccc
870 and r2,r2,r8 @ &=0x0f0f0f0f
871 and r0,r0,r8,lsl#4 @ &=0xf0f0f0f0
872 and r3,r3,r8 @ &=0x0f0f0f0f
873 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
874 ldmia $A_flat,{r4-r5} @ A_flat[i]
879 and r2,r2,r9 @ &=0x00ff00ff
880 and r0,r0,r9,lsl#8 @ &=0xff00ff00
881 and r3,r3,r9 @ &=0x00ff00ff
882 and r1,r1,r9,lsl#8 @ &=0xff00ff00
894 stmia $A_flat!,{r4-r5} @ A_flat[i++] ^= BitInterleave(inp[0..7])
904 ldmia r14,{r6-r12,r14} @ restore constants and variables
909 add $inp,sp,#$A[1][0]
910 ldmia sp, {@C[0]-@C[9]}
911 stmia $A_flat!,{@C[0]-@C[9]} @ return A[5][5]
912 ldmia $inp!, {@C[0]-@C[9]}
913 stmia $A_flat!,{@C[0]-@C[9]}
914 ldmia $inp!, {@C[0]-@C[9]}
915 stmia $A_flat!,{@C[0]-@C[9]}
916 ldmia $inp!, {@C[0]-@C[9]}
917 stmia $A_flat!,{@C[0]-@C[9]}
918 ldmia $inp, {@C[0]-@C[9]}
919 stmia $A_flat, {@C[0]-@C[9]}
923 mov r0,$len @ return value
925 ldmia sp!,{r4-r12,pc}
927 ldmia sp!,{r4-r12,lr}
929 moveq pc,lr @ be binary compatible with V4, yet
930 bx lr @ interoperable with Thumb ISA:-)
932 .size SHA3_absorb,.-SHA3_absorb
935 { my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12));
939 .type SHA3_squeeze,%function
942 stmdb sp!,{r0,r3-r10,lr}
955 mov r6,#0x11 @ compose constants
960 orr r6,r6,r6,lsl#16 @ 0x11111111
961 orr r9,r9,r9,lsl#16 @ 0x00ff00ff
962 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
963 orr r7,r6,r6,lsl#1 @ 0x33333333
964 orr r6,r6,r6,lsl#2 @ 0x55555555
973 ldmia $A_flat!,{r0,r1} @ A_flat[i++]
976 lsl r3,r1,#16 @ r3 = r1 << 16
977 lsr r2,r2,#16 @ r2 = r0 & 0x0000ffff
979 lsr r0,r0,#16 @ r0 = r0 >> 16
980 lsl r1,r1,#16 @ r1 = r1 & 0xffff0000
986 and r2,r2,r9 @ &=0x00ff00ff
987 and r3,r3,r9,lsl#8 @ &=0xff00ff00
988 and r0,r0,r9 @ &=0x00ff00ff
989 and r1,r1,r9,lsl#8 @ &=0xff00ff00
994 and r2,r2,r8 @ &=0x0f0f0f0f
995 and r3,r3,r8,lsl#4 @ &=0xf0f0f0f0
996 and r0,r0,r8 @ &=0x0f0f0f0f
997 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
1002 and r2,r2,r7 @ &=0x33333333
1003 and r3,r3,r7,lsl#2 @ &=0xcccccccc
1004 and r0,r0,r7 @ &=0x33333333
1005 and r1,r1,r7,lsl#2 @ &=0xcccccccc
1010 and r2,r2,r6 @ &=0x55555555
1011 and r3,r3,r6,lsl#1 @ &=0xaaaaaaaa
1012 and r0,r0,r6 @ &=0x55555555
1013 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
1038 subs $bsz,$bsz,#8 @ bsz -= 8
1041 mov r0,r14 @ original $A_flat
1045 ldmia sp,{r6-r10,r12} @ restore constants and variables
1082 ldmia sp!,{r4-r10,pc}
1084 ldmia sp!,{r4-r10,lr}
1086 moveq pc,lr @ be binary compatible with V4, yet
1087 bx lr @ interoperable with Thumb ISA:-)
1089 .size SHA3_squeeze,.-SHA3_squeeze
1094 #if __ARM_MAX_ARCH__>=7
1097 .type iotas64, %object
1100 .quad 0x0000000000000001
1101 .quad 0x0000000000008082
1102 .quad 0x800000000000808a
1103 .quad 0x8000000080008000
1104 .quad 0x000000000000808b
1105 .quad 0x0000000080000001
1106 .quad 0x8000000080008081
1107 .quad 0x8000000000008009
1108 .quad 0x000000000000008a
1109 .quad 0x0000000000000088
1110 .quad 0x0000000080008009
1111 .quad 0x000000008000000a
1112 .quad 0x000000008000808b
1113 .quad 0x800000000000008b
1114 .quad 0x8000000000008089
1115 .quad 0x8000000000008003
1116 .quad 0x8000000000008002
1117 .quad 0x8000000000000080
1118 .quad 0x000000000000800a
1119 .quad 0x800000008000000a
1120 .quad 0x8000000080008081
1121 .quad 0x8000000000008080
1122 .quad 0x0000000080000001
1123 .quad 0x8000000080008008
1124 .size iotas64,.-iotas64
1126 .type KeccakF1600_neon, %function
1131 mov r3, #24 @ loop counter
1137 vst1.64 {q4}, [r0,:64] @ offload A[0..1][4]
1138 veor q13, q0, q5 @ A[0..1][0]^A[2..3][0]
1139 vst1.64 {d18}, [r1,:64] @ offload A[2][4]
1140 veor q14, q1, q6 @ A[0..1][1]^A[2..3][1]
1141 veor q15, q2, q7 @ A[0..1][2]^A[2..3][2]
1142 veor d26, d26, d27 @ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
1143 veor d27, d28, d29 @ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
1144 veor q14, q3, q8 @ A[0..1][3]^A[2..3][3]
1145 veor q4, q4, q9 @ A[0..1][4]^A[2..3][4]
1146 veor d30, d30, d31 @ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
1147 veor d31, d28, d29 @ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
1148 veor d25, d8, d9 @ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
1149 veor q13, q13, q10 @ C[0..1]^=A[4][0..1]
1150 veor q14, q15, q11 @ C[2..3]^=A[4][2..3]
1151 veor d25, d25, d24 @ C[4]^=A[4][4]
1153 vadd.u64 q4, q13, q13 @ C[0..1]<<1
1154 vadd.u64 q15, q14, q14 @ C[2..3]<<1
1155 vadd.u64 d18, d25, d25 @ C[4]<<1
1156 vsri.u64 q4, q13, #63 @ ROL64(C[0..1],1)
1157 vsri.u64 q15, q14, #63 @ ROL64(C[2..3],1)
1158 vsri.u64 d18, d25, #63 @ ROL64(C[4],1)
1159 veor d25, d25, d9 @ D[0] = C[4] ^= ROL64(C[1],1)
1160 veor q13, q13, q15 @ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
1161 veor d28, d28, d18 @ D[3] = C[2] ^= ROL64(C[4],1)
1162 veor d29, d29, d8 @ D[4] = C[3] ^= ROL64(C[0],1)
1164 veor d0, d0, d25 @ A[0][0] ^= C[4]
1165 veor d1, d1, d25 @ A[1][0] ^= C[4]
1166 veor d10, d10, d25 @ A[2][0] ^= C[4]
1167 veor d11, d11, d25 @ A[3][0] ^= C[4]
1168 veor d20, d20, d25 @ A[4][0] ^= C[4]
1170 veor d2, d2, d26 @ A[0][1] ^= D[1]
1171 veor d3, d3, d26 @ A[1][1] ^= D[1]
1172 veor d12, d12, d26 @ A[2][1] ^= D[1]
1173 veor d13, d13, d26 @ A[3][1] ^= D[1]
1174 veor d21, d21, d26 @ A[4][1] ^= D[1]
1177 veor d6, d6, d28 @ A[0][3] ^= C[2]
1178 veor d7, d7, d28 @ A[1][3] ^= C[2]
1179 veor d16, d16, d28 @ A[2][3] ^= C[2]
1180 veor d17, d17, d28 @ A[3][3] ^= C[2]
1181 veor d23, d23, d28 @ A[4][3] ^= C[2]
1182 vld1.64 {q4}, [r0,:64] @ restore A[0..1][4]
1185 vld1.64 {d18}, [r1,:64] @ restore A[2][4]
1186 veor q2, q2, q13 @ A[0..1][2] ^= D[2]
1187 veor q7, q7, q13 @ A[2..3][2] ^= D[2]
1188 veor d22, d22, d27 @ A[4][2] ^= D[2]
1190 veor q4, q4, q14 @ A[0..1][4] ^= C[3]
1191 veor q9, q9, q14 @ A[2..3][4] ^= C[3]
1192 veor d24, d24, d29 @ A[4][4] ^= C[3]
1195 vmov d26, d2 @ C[1] = A[0][1]
1196 vshl.u64 d2, d3, #44
1197 vmov d27, d4 @ C[2] = A[0][2]
1198 vshl.u64 d4, d14, #43
1199 vmov d28, d6 @ C[3] = A[0][3]
1200 vshl.u64 d6, d17, #21
1201 vmov d29, d8 @ C[4] = A[0][4]
1202 vshl.u64 d8, d24, #14
1203 vsri.u64 d2, d3, #64-44 @ A[0][1] = ROL64(A[1][1], rhotates[1][1])
1204 vsri.u64 d4, d14, #64-43 @ A[0][2] = ROL64(A[2][2], rhotates[2][2])
1205 vsri.u64 d6, d17, #64-21 @ A[0][3] = ROL64(A[3][3], rhotates[3][3])
1206 vsri.u64 d8, d24, #64-14 @ A[0][4] = ROL64(A[4][4], rhotates[4][4])
1208 vshl.u64 d3, d9, #20
1209 vshl.u64 d14, d16, #25
1210 vshl.u64 d17, d15, #15
1211 vshl.u64 d24, d21, #2
1212 vsri.u64 d3, d9, #64-20 @ A[1][1] = ROL64(A[1][4], rhotates[1][4])
1213 vsri.u64 d14, d16, #64-25 @ A[2][2] = ROL64(A[2][3], rhotates[2][3])
1214 vsri.u64 d17, d15, #64-15 @ A[3][3] = ROL64(A[3][2], rhotates[3][2])
1215 vsri.u64 d24, d21, #64-2 @ A[4][4] = ROL64(A[4][1], rhotates[4][1])
1217 vshl.u64 d9, d22, #61
1218 @ vshl.u64 d16, d19, #8
1219 vshl.u64 d15, d12, #10
1220 vshl.u64 d21, d7, #55
1221 vsri.u64 d9, d22, #64-61 @ A[1][4] = ROL64(A[4][2], rhotates[4][2])
1222 vext.8 d16, d19, d19, #8-1 @ A[2][3] = ROL64(A[3][4], rhotates[3][4])
1223 vsri.u64 d15, d12, #64-10 @ A[3][2] = ROL64(A[2][1], rhotates[2][1])
1224 vsri.u64 d21, d7, #64-55 @ A[4][1] = ROL64(A[1][3], rhotates[1][3])
1226 vshl.u64 d22, d18, #39
1227 @ vshl.u64 d19, d23, #56
1228 vshl.u64 d12, d5, #6
1229 vshl.u64 d7, d13, #45
1230 vsri.u64 d22, d18, #64-39 @ A[4][2] = ROL64(A[2][4], rhotates[2][4])
1231 vext.8 d19, d23, d23, #8-7 @ A[3][4] = ROL64(A[4][3], rhotates[4][3])
1232 vsri.u64 d12, d5, #64-6 @ A[2][1] = ROL64(A[1][2], rhotates[1][2])
1233 vsri.u64 d7, d13, #64-45 @ A[1][3] = ROL64(A[3][1], rhotates[3][1])
1235 vshl.u64 d18, d20, #18
1236 vshl.u64 d23, d11, #41
1237 vshl.u64 d5, d10, #3
1238 vshl.u64 d13, d1, #36
1239 vsri.u64 d18, d20, #64-18 @ A[2][4] = ROL64(A[4][0], rhotates[4][0])
1240 vsri.u64 d23, d11, #64-41 @ A[4][3] = ROL64(A[3][0], rhotates[3][0])
1241 vsri.u64 d5, d10, #64-3 @ A[1][2] = ROL64(A[2][0], rhotates[2][0])
1242 vsri.u64 d13, d1, #64-36 @ A[3][1] = ROL64(A[1][0], rhotates[1][0])
1244 vshl.u64 d1, d28, #28
1245 vshl.u64 d10, d26, #1
1246 vshl.u64 d11, d29, #27
1247 vshl.u64 d20, d27, #62
1248 vsri.u64 d1, d28, #64-28 @ A[1][0] = ROL64(C[3], rhotates[0][3])
1249 vsri.u64 d10, d26, #64-1 @ A[2][0] = ROL64(C[1], rhotates[0][1])
1250 vsri.u64 d11, d29, #64-27 @ A[3][0] = ROL64(C[4], rhotates[0][4])
1251 vsri.u64 d20, d27, #64-62 @ A[4][0] = ROL64(C[2], rhotates[0][2])
1257 veor q13, q13, q0 @ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
1258 veor q14, q14, q1 @ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
1259 veor q2, q2, q15 @ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
1260 vst1.64 {q13}, [r0,:64] @ offload A[0..1][0]
1263 vmov q1, q14 @ A[0..1][1]
1264 veor q3, q3, q13 @ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
1265 veor q4, q4, q15 @ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])
1268 vmov q0, q5 @ A[2..3][0]
1270 vmov q15, q6 @ A[2..3][1]
1271 veor q5, q5, q13 @ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
1273 veor q6, q6, q14 @ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
1275 veor q7, q7, q13 @ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
1277 veor q8, q8, q14 @ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
1278 vmov q14, q10 @ A[4][0..1]
1279 veor q9, q9, q13 @ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])
1281 vld1.64 d25, [r2,:64]! @ Iota[i++]
1284 vld1.64 {q0}, [r0,:64] @ restore A[0..1][0]
1285 veor d20, d20, d26 @ A[4][0] ^= (~A[4][1] & A[4][2])
1287 veor d21, d21, d27 @ A[4][1] ^= (~A[4][2] & A[4][3])
1289 veor d22, d22, d26 @ A[4][2] ^= (~A[4][3] & A[4][4])
1291 veor d23, d23, d27 @ A[4][3] ^= (~A[4][4] & A[4][0])
1292 veor d0, d0, d25 @ A[0][0] ^= Iota[i]
1293 veor d24, d24, d26 @ A[4][4] ^= (~A[4][0] & A[4][1])
1299 .size KeccakF1600_neon,.-KeccakF1600_neon
@ ----------------------------------------------------------------------------
@ SHA3_absorb_neon(A_flat, inp, len, bsz)
@ Loads the 25-lane Keccak state from A_flat into d0-d24 (rows interleaved
@ so q-registers pair two rows of one column), absorbs full blocks of input
@ by XOR, and stores the state back. Several interior lines (entry label,
@ argument shuffling, the permutation call, per-lane vld1.8 loads and the
@ block/length bookkeeping branches) are elided from this excerpt.
1301 .global SHA3_absorb_neon
1302 .type SHA3_absorb_neon, %function
@ Save callee-saved core regs and d8-d15 (callee-saved per AAPCS).
1305 stmdb sp!, {r4-r6,lr}
1306 vstmdb sp!, {d8-d15}
@ Load state: note lane d-register numbering interleaves rows 0/1 and 2/3
@ so that e.g. q0 = {d0,d1} = A[0..1][0].
1312 vld1.32 {d0}, [r0,:64]! @ A[0][0]
1313 vld1.32 {d2}, [r0,:64]! @ A[0][1]
1314 vld1.32 {d4}, [r0,:64]! @ A[0][2]
1315 vld1.32 {d6}, [r0,:64]! @ A[0][3]
1316 vld1.32 {d8}, [r0,:64]! @ A[0][4]
1318 vld1.32 {d1}, [r0,:64]! @ A[1][0]
1319 vld1.32 {d3}, [r0,:64]! @ A[1][1]
1320 vld1.32 {d5}, [r0,:64]! @ A[1][2]
1321 vld1.32 {d7}, [r0,:64]! @ A[1][3]
1322 vld1.32 {d9}, [r0,:64]! @ A[1][4]
1324 vld1.32 {d10}, [r0,:64]! @ A[2][0]
1325 vld1.32 {d12}, [r0,:64]! @ A[2][1]
1326 vld1.32 {d14}, [r0,:64]! @ A[2][2]
1327 vld1.32 {d16}, [r0,:64]! @ A[2][3]
1328 vld1.32 {d18}, [r0,:64]! @ A[2][4]
1330 vld1.32 {d11}, [r0,:64]! @ A[3][0]
1331 vld1.32 {d13}, [r0,:64]! @ A[3][1]
1332 vld1.32 {d15}, [r0,:64]! @ A[3][2]
1333 vld1.32 {d17}, [r0,:64]! @ A[3][3]
1334 vld1.32 {d19}, [r0,:64]! @ A[3][4]
1336 vld1.32 {d20-d23}, [r0,:64]! @ A[4][0..3]
1337 vld1.32 {d24}, [r0,:64] @ A[4][4]
1338 sub r0, r0, #24*8 @ rewind
@ If len (r5) < bsz (r6) there is no full block left; the branch consuming
@ this comparison is in the elided lines.
1343 subs r12, r5, r6 @ len - bsz
@ Absorb one block: each lane is fetched with vld1.8 (byte loads, hence
@ endian-neutral) into d31 and XORed in. The repeated vld1.8 / rate checks
@ between the veors below are elided from this excerpt; only the first
@ load is shown.
1347 vld1.8 {d31}, [r4]! @ endian-neutral loads...
1349 veor d0, d0, d31 @ A[0][0] ^= *inp++
1352 veor d2, d2, d31 @ A[0][1] ^= *inp++
1356 veor d4, d4, d31 @ A[0][2] ^= *inp++
1359 veor d6, d6, d31 @ A[0][3] ^= *inp++
1363 veor d8, d8, d31 @ A[0][4] ^= *inp++
1367 veor d1, d1, d31 @ A[1][0] ^= *inp++
1371 veor d3, d3, d31 @ A[1][1] ^= *inp++
1374 veor d5, d5, d31 @ A[1][2] ^= *inp++
1378 veor d7, d7, d31 @ A[1][3] ^= *inp++
1381 veor d9, d9, d31 @ A[1][4] ^= *inp++
1386 veor d10, d10, d31 @ A[2][0] ^= *inp++
1389 veor d12, d12, d31 @ A[2][1] ^= *inp++
1393 veor d14, d14, d31 @ A[2][2] ^= *inp++
1396 veor d16, d16, d31 @ A[2][3] ^= *inp++
1400 veor d18, d18, d31 @ A[2][4] ^= *inp++
1404 veor d11, d11, d31 @ A[3][0] ^= *inp++
1408 veor d13, d13, d31 @ A[3][1] ^= *inp++
1411 veor d15, d15, d31 @ A[3][2] ^= *inp++
1415 veor d17, d17, d31 @ A[3][3] ^= *inp++
1418 veor d19, d19, d31 @ A[3][4] ^= *inp++
1423 veor d20, d20, d31 @ A[4][0] ^= *inp++
1426 veor d21, d21, d31 @ A[4][1] ^= *inp++
1430 veor d22, d22, d31 @ A[4][2] ^= *inp++
1433 veor d23, d23, d31 @ A[4][3] ^= *inp++
1436 veor d24, d24, d31 @ A[4][4] ^= *inp++
@ Write the state back to A_flat in the same interleaved order it was read.
1444 vst1.32 {d0}, [r0,:64]! @ A[0][0..4]
1445 vst1.32 {d2}, [r0,:64]!
1446 vst1.32 {d4}, [r0,:64]!
1447 vst1.32 {d6}, [r0,:64]!
1448 vst1.32 {d8}, [r0,:64]!
1450 vst1.32 {d1}, [r0,:64]! @ A[1][0..4]
1451 vst1.32 {d3}, [r0,:64]!
1452 vst1.32 {d5}, [r0,:64]!
1453 vst1.32 {d7}, [r0,:64]!
1454 vst1.32 {d9}, [r0,:64]!
1456 vst1.32 {d10}, [r0,:64]! @ A[2][0..4]
1457 vst1.32 {d12}, [r0,:64]!
1458 vst1.32 {d14}, [r0,:64]!
1459 vst1.32 {d16}, [r0,:64]!
1460 vst1.32 {d18}, [r0,:64]!
1462 vst1.32 {d11}, [r0,:64]! @ A[3][0..4]
1463 vst1.32 {d13}, [r0,:64]!
1464 vst1.32 {d15}, [r0,:64]!
1465 vst1.32 {d17}, [r0,:64]!
1466 vst1.32 {d19}, [r0,:64]!
1468 vst1.32 {d20-d23}, [r0,:64]! @ A[4][0..4]
1469 vst1.32 {d24}, [r0,:64]
@ Return the remaining (unabsorbed) byte count held in r5, restore
@ callee-saved registers and return.
1471 mov r0, r5 @ return value
1472 vldmia sp!, {d8-d15}
1473 ldmia sp!, {r4-r6,pc}
1474 .size SHA3_absorb_neon,.-SHA3_absorb_neon
@ ----------------------------------------------------------------------------
@ SHA3_squeeze_neon(A_flat, out, len, bsz)
@ Emits output by copying lanes from the state, re-running the permutation
@ (call and several labels/branches elided from this excerpt) whenever a
@ full rate block has been consumed, and byte-wise handling any sub-lane
@ tail. Loads/stores of the state use the same interleaved layout as
@ SHA3_absorb_neon above.
1476 .global SHA3_squeeze_neon
1477 .type SHA3_squeeze_neon, %function
1480 stmdb sp!, {r4-r6,lr}
1485 mov r12, r0 @ A_flat
1487 b .Loop_squeeze_neon
@ Main loop body (loop label elided): copy one 8-byte lane per iteration.
@ Fewer than 8 bytes wanted -> byte-wise tail path.
1492 blo .Lsqueeze_neon_tail
1493 vld1.32 {d0}, [r12]!
1494 vst1.8 {d0}, [r4]! @ endian-neutral store
1496 subs r5, r5, #8 @ len -= 8
1497 beq .Lsqueeze_neon_done
1499 subs r14, r14, #8 @ bsz -= 8
1500 bhi .Loop_squeeze_neon
@ Rate exhausted: load the full state, permute (KeccakF1600_neon call is in
@ the elided lines), store it back, and continue squeezing. d8-d15 are
@ callee-saved, so they are preserved around the permutation.
1502 vstmdb sp!, {d8-d15}
1504 vld1.32 {d0}, [r0,:64]! @ A[0][0..4]
1505 vld1.32 {d2}, [r0,:64]!
1506 vld1.32 {d4}, [r0,:64]!
1507 vld1.32 {d6}, [r0,:64]!
1508 vld1.32 {d8}, [r0,:64]!
1510 vld1.32 {d1}, [r0,:64]! @ A[1][0..4]
1511 vld1.32 {d3}, [r0,:64]!
1512 vld1.32 {d5}, [r0,:64]!
1513 vld1.32 {d7}, [r0,:64]!
1514 vld1.32 {d9}, [r0,:64]!
1516 vld1.32 {d10}, [r0,:64]! @ A[2][0..4]
1517 vld1.32 {d12}, [r0,:64]!
1518 vld1.32 {d14}, [r0,:64]!
1519 vld1.32 {d16}, [r0,:64]!
1520 vld1.32 {d18}, [r0,:64]!
1522 vld1.32 {d11}, [r0,:64]! @ A[3][0..4]
1523 vld1.32 {d13}, [r0,:64]!
1524 vld1.32 {d15}, [r0,:64]!
1525 vld1.32 {d17}, [r0,:64]!
1526 vld1.32 {d19}, [r0,:64]!
1528 vld1.32 {d20-d23}, [r0,:64]! @ A[4][0..4]
1529 vld1.32 {d24}, [r0,:64]
1530 sub r0, r0, #24*8 @ rewind
@ Store the permuted state back and reset r12 to re-read from the start of
@ the freshly squeezed block.
1534 mov r12, r0 @ A_flat
1535 vst1.32 {d0}, [r0,:64]! @ A[0][0..4]
1536 vst1.32 {d2}, [r0,:64]!
1537 vst1.32 {d4}, [r0,:64]!
1538 vst1.32 {d6}, [r0,:64]!
1539 vst1.32 {d8}, [r0,:64]!
1541 vst1.32 {d1}, [r0,:64]! @ A[1][0..4]
1542 vst1.32 {d3}, [r0,:64]!
1543 vst1.32 {d5}, [r0,:64]!
1544 vst1.32 {d7}, [r0,:64]!
1545 vst1.32 {d9}, [r0,:64]!
1547 vst1.32 {d10}, [r0,:64]! @ A[2][0..4]
1548 vst1.32 {d12}, [r0,:64]!
1549 vst1.32 {d14}, [r0,:64]!
1550 vst1.32 {d16}, [r0,:64]!
1551 vst1.32 {d18}, [r0,:64]!
1553 vst1.32 {d11}, [r0,:64]! @ A[3][0..4]
1554 vst1.32 {d13}, [r0,:64]!
1555 vst1.32 {d15}, [r0,:64]!
1556 vst1.32 {d17}, [r0,:64]!
1557 vst1.32 {d19}, [r0,:64]!
1559 vst1.32 {d20-d23}, [r0,:64]! @ A[4][0..4]
1561 vst1.32 {d24}, [r0,:64]
1562 mov r0, r12 @ rewind
1564 vldmia sp!, {d8-d15}
1565 b .Loop_squeeze_neon
@ Tail: fewer than 8 output bytes remain in the current lane; emit them one
@ byte at a time (the intervening shifts/length checks are elided here).
1568 .Lsqueeze_neon_tail:
1571 strb r2, [r4],#1 @ endian-neutral store
1573 blo .Lsqueeze_neon_done
1576 beq .Lsqueeze_neon_done
1580 blo .Lsqueeze_neon_done
1582 beq .Lsqueeze_neon_done
1587 blo .Lsqueeze_neon_done
1590 beq .Lsqueeze_neon_done
1593 .Lsqueeze_neon_done:
1594 ldmia sp!, {r4-r6,pc}
1595 .size SHA3_squeeze_neon,.-SHA3_squeeze_neon
@ Embedded identification string (kept in the object file).
1597 .asciz "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
1605 my ($mnemonic,$half,$reg,$ea) = @_;
1606 my $op = $mnemonic eq "ldr" ? \%ldr : \%str;
1611 sprintf "#ifndef __thumb2__\n" .
1613 "#endif", $mnemonic,$reg,$ea;
1615 sprintf "#ifndef __thumb2__\n" .
1618 " %sd\t%s,%s,%s\n" .
1619 "#endif", $mnemonic,$reg,$ea,
1620 $mnemonic,$$op{reg},$reg,$$op{ea};
1625 foreach (split($/,$code)) {
1626 s/\`([^\`]*)\`/eval $1/ge;
1628 s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge or
1629 s/\b(ror|ls[rl])\s+(r[0-9]+.*)#/mov $2$1#/g or
1630 s/\bret\b/bx lr/g or
1631 s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4
1636 close STDOUT or die "error closing STDOUT"; # enforce flush