2 # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for ARMv4.
20 # Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit
21 # interleaving. How does it compare to Keccak Code Package? It's as
22 # fast, but several times smaller, and is endian- and ISA-neutral. ISA
23 # neutrality means that minimum ISA requirement is ARMv4, yet it can
24 # be assembled even as Thumb-2. NEON code path is KECCAK_1X_ALT with
25 # register layout taken from Keccak Code Package. It's also as fast,
26 # in fact faster by 10-15% on some processors, and endian-neutral.
30 # Switch to KECCAK_2X variant for non-NEON code and merge almost 1/2
31 # of rotate instructions with logical ones. This resulted in ~10%
32 # improvement on most processors. Switch to KECCAK_2X effectively
33 # minimizes re-loads from temporary storage, and merged rotates just
34 # eliminate corresponding instructions. As for the latter: when examining
35 # code you'll notice commented ror instructions. These are eliminated
36 # ones, and you should trace destination register below to see what's
37 # going on. In case you wonder why not all rotates are eliminated: trouble
38 # is that you have operations that require both inputs to be rotated,
39 # e.g. 'eor a,b>>>x,c>>>y'. This conundrum is resolved by using
40 # 'eor a,b,c>>>(x-y)' and then merge-rotating 'a' in next operation
41 # that takes 'a' as input. And thing is that this next operation can
42 # be in next round. It's totally possible to "carry" rotate "factors"
43 # to the next round, but it makes code more complex. And the last word
44 # is the keyword, i.e. "almost 1/2" is kind of complexity cap [for the
47 # Reduce per-round instruction count in Thumb-2 case by 16%. This is
48 # achieved by folding ldr/str pairs to their double-word counterparts.
49 # Theoretically this should have improved performance on single-issue
50 # cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as
53 ########################################################################
54 # Numbers are cycles per processed byte. Non-NEON results account even
55 # for input bit interleaving.
57 # r=1088(*) Thumb-2(**) NEON
60 # Cortex-A5 88/+160%, 86, 36
61 # Cortex-A7 78/+160%, 68, 34
62 # Cortex-A8 51/+230%, 57, 30
63 # Cortex-A9 53/+210%, 51, 26
64 # Cortex-A15 42/+160%, 38, 18
65 # Snapdragon S4 43/+210%, 38, 24
67 # (*) Corresponds to SHA3-256. Percentage after slash is improvement
68 # over compiler-generated KECCAK_2X reference code.
69 # (**) Thumb-2 results for Cortex-A5/A7 are likely to apply even to
70 # Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
71 # processors are presented mostly for reference purposes.
# Command-line handling: if the first argument looks like a file name
# (word characters ending in ".ext") it is the output file and there is
# no flavour; otherwise the first argument is the perlasm "flavour"
# (target/ABI selector) and we scan the remaining arguments for the
# output file name.
74 if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
75 else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
77 if ($flavour && $flavour ne "void") {
78 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
79 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
80 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
81 die "can't locate arm-xlate.pl";
83 open STDOUT,"| \"$^X\" $xlate $flavour $output";
85 open STDOUT,">$output";
# Register allocation for the integer (non-NEON) code path:
# @C = r0-r9 carry the ten 32-bit halves of five bit-interleaved 64-bit
#      lanes; @E = r10-r12,r14 are scratch/temporary registers
#      (r13 is sp, which addresses the on-stack state below).
88 my @C = map("r$_",(0..9));
89 my @E = map("r$_",(10..12,14));
91 ########################################################################
93 # ----->+-----------------------+
94 # | uint64_t A[5][5] |
96 # +200->+-----------------------+
99 # +240->+-----------------------+
100 # | uint64_t T[5][5] |
102 # +440->+-----------------------+
104 # +444->+-----------------------+
106 # +448->+-----------------------+
# Byte offsets into the stack frame sketched in the diagram above.
# Each 64-bit lane occupies 8 bytes (two interleaved 32-bit words):
#   @A[i][j] - state A[5][5] at sp+0..199
#   @D[i]    - theta-step D[5] at sp+200..239
#   @T[i][j] - T[5][5], the KECCAK_2X shadow state (a.k.a. R in the
#              round comments), at sp+240..439
109 my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
110 my @D = map(8*$_, (25..29));
111 my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35,40,45,50));
114 #include "arm_arch.h"
118 #if defined(__thumb2__)
125 .type iotas32, %object
128 .long 0x00000001, 0x00000000
129 .long 0x00000000, 0x00000089
130 .long 0x00000000, 0x8000008b
131 .long 0x00000000, 0x80008080
132 .long 0x00000001, 0x0000008b
133 .long 0x00000001, 0x00008000
134 .long 0x00000001, 0x80008088
135 .long 0x00000001, 0x80000082
136 .long 0x00000000, 0x0000000b
137 .long 0x00000000, 0x0000000a
138 .long 0x00000001, 0x00008082
139 .long 0x00000000, 0x00008003
140 .long 0x00000001, 0x0000808b
141 .long 0x00000001, 0x8000000b
142 .long 0x00000001, 0x8000008a
143 .long 0x00000001, 0x80000081
144 .long 0x00000000, 0x80000081
145 .long 0x00000000, 0x80000008
146 .long 0x00000000, 0x00000083
147 .long 0x00000000, 0x80008003
148 .long 0x00000001, 0x80008088
149 .long 0x00000000, 0x80000088
150 .long 0x00000001, 0x00008000
151 .long 0x00000000, 0x80008082
152 .size iotas32,.-iotas32
154 .type KeccakF1600_int, %function
157 add @C[9],sp,#$A[4][2]
158 add @E[2],sp,#$A[0][0]
159 add @E[0],sp,#$A[1][0]
160 ldmia @C[9],{@C[4]-@C[9]} @ A[4][2..4]
163 eor @E[1],@E[1],@E[1]
171 my (@A,@R); (@A[0..4],@R) = @_;
174 ldmia @E[2],{@C[0]-@C[3]} @ A[0][0..1]
175 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][0..1]
177 eor @C[0],@C[0],@E[0]
178 eor @C[1],@C[1],@E[1]
179 eor @C[2],@C[2],@E[2]
180 ldrd @E[0],@E[1],[sp,#$A[1][2]]
181 eor @C[3],@C[3],@E[3]
182 ldrd @E[2],@E[3],[sp,#$A[1][3]]
183 eor @C[4],@C[4],@E[0]
184 eor @C[5],@C[5],@E[1]
185 eor @C[6],@C[6],@E[2]
186 ldrd @E[0],@E[1],[sp,#$A[1][4]]
187 eor @C[7],@C[7],@E[3]
188 ldrd @E[2],@E[3],[sp,#$A[2][0]]
189 eor @C[8],@C[8],@E[0]
190 eor @C[9],@C[9],@E[1]
191 eor @C[0],@C[0],@E[2]
192 ldrd @E[0],@E[1],[sp,#$A[2][1]]
193 eor @C[1],@C[1],@E[3]
194 ldrd @E[2],@E[3],[sp,#$A[2][2]]
195 eor @C[2],@C[2],@E[0]
196 eor @C[3],@C[3],@E[1]
197 eor @C[4],@C[4],@E[2]
198 ldrd @E[0],@E[1],[sp,#$A[2][3]]
199 eor @C[5],@C[5],@E[3]
200 ldrd @E[2],@E[3],[sp,#$A[2][4]]
201 eor @C[6],@C[6],@E[0]
202 eor @C[7],@C[7],@E[1]
203 eor @C[8],@C[8],@E[2]
204 ldrd @E[0],@E[1],[sp,#$A[3][0]]
205 eor @C[9],@C[9],@E[3]
206 ldrd @E[2],@E[3],[sp,#$A[3][1]]
207 eor @C[0],@C[0],@E[0]
208 eor @C[1],@C[1],@E[1]
209 eor @C[2],@C[2],@E[2]
210 ldrd @E[0],@E[1],[sp,#$A[3][2]]
211 eor @C[3],@C[3],@E[3]
212 ldrd @E[2],@E[3],[sp,#$A[3][3]]
213 eor @C[4],@C[4],@E[0]
214 eor @C[5],@C[5],@E[1]
215 eor @C[6],@C[6],@E[2]
216 ldrd @E[0],@E[1],[sp,#$A[3][4]]
217 eor @C[7],@C[7],@E[3]
218 ldrd @E[2],@E[3],[sp,#$A[4][0]]
219 eor @C[8],@C[8],@E[0]
220 eor @C[9],@C[9],@E[1]
221 eor @C[0],@C[0],@E[2]
222 ldrd @E[0],@E[1],[sp,#$A[4][1]]
223 eor @C[1],@C[1],@E[3]
224 ldrd @E[2],@E[3],[sp,#$A[0][2]]
225 eor @C[2],@C[2],@E[0]
226 eor @C[3],@C[3],@E[1]
227 eor @C[4],@C[4],@E[2]
228 ldrd @E[0],@E[1],[sp,#$A[0][3]]
229 eor @C[5],@C[5],@E[3]
230 ldrd @E[2],@E[3],[sp,#$A[0][4]]
232 eor @C[0],@C[0],@E[0]
233 add @E[0],sp,#$A[1][2]
234 eor @C[1],@C[1],@E[1]
235 eor @C[2],@C[2],@E[2]
236 eor @C[3],@C[3],@E[3]
237 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][2..3]
238 eor @C[4],@C[4],@E[0]
239 add @E[0],sp,#$A[1][4]
240 eor @C[5],@C[5],@E[1]
241 eor @C[6],@C[6],@E[2]
242 eor @C[7],@C[7],@E[3]
243 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][4]..A[2][0]
244 eor @C[8],@C[8],@E[0]
245 add @E[0],sp,#$A[2][1]
246 eor @C[9],@C[9],@E[1]
247 eor @C[0],@C[0],@E[2]
248 eor @C[1],@C[1],@E[3]
249 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][1..2]
250 eor @C[2],@C[2],@E[0]
251 add @E[0],sp,#$A[2][3]
252 eor @C[3],@C[3],@E[1]
253 eor @C[4],@C[4],@E[2]
254 eor @C[5],@C[5],@E[3]
255 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][3..4]
256 eor @C[6],@C[6],@E[0]
257 add @E[0],sp,#$A[3][0]
258 eor @C[7],@C[7],@E[1]
259 eor @C[8],@C[8],@E[2]
260 eor @C[9],@C[9],@E[3]
261 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][0..1]
262 eor @C[0],@C[0],@E[0]
263 add @E[0],sp,#$A[3][2]
264 eor @C[1],@C[1],@E[1]
265 eor @C[2],@C[2],@E[2]
266 eor @C[3],@C[3],@E[3]
267 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][2..3]
268 eor @C[4],@C[4],@E[0]
269 add @E[0],sp,#$A[3][4]
270 eor @C[5],@C[5],@E[1]
271 eor @C[6],@C[6],@E[2]
272 eor @C[7],@C[7],@E[3]
273 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][4]..A[4][0]
274 eor @C[8],@C[8],@E[0]
275 ldr @E[0],[sp,#$A[4][1]] @ A[4][1]
276 eor @C[9],@C[9],@E[1]
277 ldr @E[1],[sp,#$A[4][1]+4]
278 eor @C[0],@C[0],@E[2]
279 ldr @E[2],[sp,#$A[0][2]] @ A[0][2]
280 eor @C[1],@C[1],@E[3]
281 ldr @E[3],[sp,#$A[0][2]+4]
282 eor @C[2],@C[2],@E[0]
283 add @E[0],sp,#$A[0][3]
284 eor @C[3],@C[3],@E[1]
285 eor @C[4],@C[4],@E[2]
286 eor @C[5],@C[5],@E[3]
287 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[0][3..4]
289 eor @C[6],@C[6],@E[0]
290 eor @C[7],@C[7],@E[1]
291 eor @C[8],@C[8],@E[2]
292 eor @C[9],@C[9],@E[3]
294 eor @E[0],@C[0],@C[5],ror#32-1 @ E[0] = ROL64(C[2], 1) ^ C[0];
295 str.l @E[0],[sp,#$D[1]] @ D[1] = E[0]
296 eor @E[1],@C[1],@C[4]
297 str.h @E[1],[sp,#$D[1]+4]
298 eor @E[2],@C[6],@C[1],ror#32-1 @ E[1] = ROL64(C[0], 1) ^ C[3];
299 eor @E[3],@C[7],@C[0]
300 str.l @E[2],[sp,#$D[4]] @ D[4] = E[1]
301 eor @C[0],@C[8],@C[3],ror#32-1 @ C[0] = ROL64(C[1], 1) ^ C[4];
302 str.h @E[3],[sp,#$D[4]+4]
303 eor @C[1],@C[9],@C[2]
304 str.l @C[0],[sp,#$D[0]] @ D[0] = C[0]
305 eor @C[2],@C[2],@C[7],ror#32-1 @ C[1] = ROL64(C[3], 1) ^ C[1];
306 ldr.l @C[7],[sp,#$A[3][3]]
307 eor @C[3],@C[3],@C[6]
308 str.h @C[1],[sp,#$D[0]+4]
309 ldr.h @C[6],[sp,#$A[3][3]+4]
310 str.l @C[2],[sp,#$D[2]] @ D[2] = C[1]
311 eor @C[4],@C[4],@C[9],ror#32-1 @ C[2] = ROL64(C[4], 1) ^ C[2];
312 str.h @C[3],[sp,#$D[2]+4]
313 eor @C[5],@C[5],@C[8]
315 ldr.l @C[8],[sp,#$A[4][4]]
316 ldr.h @C[9],[sp,#$A[4][4]+4]
317 str.l @C[4],[sp,#$D[3]] @ D[3] = C[2]
318 eor @C[7],@C[7],@C[4]
319 str.h @C[5],[sp,#$D[3]+4]
320 eor @C[6],@C[6],@C[5]
321 ldr.l @C[4],[sp,#$A[0][0]]
322 @ ror @C[7],@C[7],#32-10 @ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
323 @ ror @C[6],@C[6],#32-11
324 ldr.h @C[5],[sp,#$A[0][0]+4]
325 eor @C[8],@C[8],@E[2]
326 eor @C[9],@C[9],@E[3]
327 ldr.l @E[2],[sp,#$A[2][2]]
328 eor @C[0],@C[0],@C[4]
329 ldr.h @E[3],[sp,#$A[2][2]+4]
330 @ ror @C[8],@C[8],#32-7 @ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
331 @ ror @C[9],@C[9],#32-7
332 eor @C[1],@C[1],@C[5] @ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
333 eor @E[2],@E[2],@C[2]
334 ldr.l @C[2],[sp,#$A[1][1]]
335 eor @E[3],@E[3],@C[3]
336 ldr.h @C[3],[sp,#$A[1][1]+4]
337 ror @C[5],@E[2],#32-21 @ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
338 ldr @E[2],[sp,#444] @ load counter
339 eor @C[2],@C[2],@E[0]
341 ror @C[4],@E[3],#32-22
342 add @E[3],@E[0],@E[2]
343 eor @C[3],@C[3],@E[1]
345 $code.=<<___ if ($A[0][0] != $T[0][0]);
346 ldmia @E[3],{@E[0],@E[1]} @ iotas[i]
348 $code.=<<___ if ($A[0][0] == $T[0][0]);
349 ldr.l @E[0],[@E[3],#8] @ iotas[i].lo
351 ldr.h @E[1],[@E[3],#12] @ iotas[i].hi
353 str @E[2],[sp,#444] @ store counter
356 bic @E[2],@C[4],@C[2],ror#32-22
357 bic @E[3],@C[5],@C[3],ror#32-22
358 ror @C[2],@C[2],#32-22 @ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
359 ror @C[3],@C[3],#32-22
360 eor @E[2],@E[2],@C[0]
361 eor @E[3],@E[3],@C[1]
362 eor @E[0],@E[0],@E[2]
363 eor @E[1],@E[1],@E[3]
364 str.l @E[0],[sp,#$R[0][0]] @ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
365 bic @E[2],@C[6],@C[4],ror#11
366 str.h @E[1],[sp,#$R[0][0]+4]
367 bic @E[3],@C[7],@C[5],ror#10
368 bic @E[0],@C[8],@C[6],ror#32-(11-7)
369 bic @E[1],@C[9],@C[7],ror#32-(10-7)
370 eor @E[2],@C[2],@E[2],ror#32-11
371 str.l @E[2],[sp,#$R[0][1]] @ R[0][1] = C[1] ^ (~C[2] & C[3]);
372 eor @E[3],@C[3],@E[3],ror#32-10
373 str.h @E[3],[sp,#$R[0][1]+4]
374 eor @E[0],@C[4],@E[0],ror#32-7
375 eor @E[1],@C[5],@E[1],ror#32-7
376 str.l @E[0],[sp,#$R[0][2]] @ R[0][2] = C[2] ^ (~C[3] & C[4]);
377 bic @E[2],@C[0],@C[8],ror#32-7
378 str.h @E[1],[sp,#$R[0][2]+4]
379 bic @E[3],@C[1],@C[9],ror#32-7
380 eor @E[2],@E[2],@C[6],ror#32-11
381 str.l @E[2],[sp,#$R[0][3]] @ R[0][3] = C[3] ^ (~C[4] & C[0]);
382 eor @E[3],@E[3],@C[7],ror#32-10
383 str.h @E[3],[sp,#$R[0][3]+4]
384 bic @E[0],@C[2],@C[0]
386 ldr.l @C[0],[sp,#$A[0][3]] @ A[0][3]
387 bic @E[1],@C[3],@C[1]
388 ldr.h @C[1],[sp,#$A[0][3]+4]
389 eor @E[0],@E[0],@C[8],ror#32-7
390 eor @E[1],@E[1],@C[9],ror#32-7
391 str.l @E[0],[sp,#$R[0][4]] @ R[0][4] = C[4] ^ (~C[0] & C[1]);
393 str.h @E[1],[sp,#$R[0][4]+4]
395 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[3..4]
396 ldmia @C[9],{@C[6]-@C[9]} @ D[0..1]
398 ldr.l @C[2],[sp,#$A[1][4]] @ A[1][4]
399 eor @C[0],@C[0],@E[0]
400 ldr.h @C[3],[sp,#$A[1][4]+4]
401 eor @C[1],@C[1],@E[1]
402 @ ror @C[0],@C[0],#32-14 @ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
403 ldr.l @E[0],[sp,#$A[3][1]] @ A[3][1]
404 @ ror @C[1],@C[1],#32-14
405 ldr.h @E[1],[sp,#$A[3][1]+4]
407 eor @C[2],@C[2],@E[2]
408 ldr.l @C[4],[sp,#$A[2][0]] @ A[2][0]
409 eor @C[3],@C[3],@E[3]
410 ldr.h @C[5],[sp,#$A[2][0]+4]
411 @ ror @C[2],@C[2],#32-10 @ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
412 @ ror @C[3],@C[3],#32-10
414 eor @C[6],@C[6],@C[4]
415 ldr.l @E[2],[sp,#$D[2]] @ D[2]
416 eor @C[7],@C[7],@C[5]
417 ldr.h @E[3],[sp,#$D[2]+4]
418 ror @C[5],@C[6],#32-1 @ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
419 ror @C[4],@C[7],#32-2
421 eor @E[0],@E[0],@C[8]
422 ldr.l @C[8],[sp,#$A[4][2]] @ A[4][2]
423 eor @E[1],@E[1],@C[9]
424 ldr.h @C[9],[sp,#$A[4][2]+4]
425 ror @C[7],@E[0],#32-22 @ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
426 ror @C[6],@E[1],#32-23
428 bic @E[0],@C[4],@C[2],ror#32-10
429 bic @E[1],@C[5],@C[3],ror#32-10
430 eor @E[2],@E[2],@C[8]
431 eor @E[3],@E[3],@C[9]
432 ror @C[9],@E[2],#32-30 @ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
433 ror @C[8],@E[3],#32-31
434 eor @E[0],@E[0],@C[0],ror#32-14
435 eor @E[1],@E[1],@C[1],ror#32-14
436 str.l @E[0],[sp,#$R[1][0]] @ R[1][0] = C[0] ^ (~C[1] & C[2])
437 bic @E[2],@C[6],@C[4]
438 str.h @E[1],[sp,#$R[1][0]+4]
439 bic @E[3],@C[7],@C[5]
440 eor @E[2],@E[2],@C[2],ror#32-10
441 str.l @E[2],[sp,#$R[1][1]] @ R[1][1] = C[1] ^ (~C[2] & C[3]);
442 eor @E[3],@E[3],@C[3],ror#32-10
443 str.h @E[3],[sp,#$R[1][1]+4]
444 bic @E[0],@C[8],@C[6]
445 bic @E[1],@C[9],@C[7]
446 bic @E[2],@C[0],@C[8],ror#14
447 bic @E[3],@C[1],@C[9],ror#14
448 eor @E[0],@E[0],@C[4]
449 eor @E[1],@E[1],@C[5]
450 str.l @E[0],[sp,#$R[1][2]] @ R[1][2] = C[2] ^ (~C[3] & C[4]);
451 bic @C[2],@C[2],@C[0],ror#32-(14-10)
452 str.h @E[1],[sp,#$R[1][2]+4]
453 eor @E[2],@C[6],@E[2],ror#32-14
454 bic @E[1],@C[3],@C[1],ror#32-(14-10)
455 str.l @E[2],[sp,#$R[1][3]] @ R[1][3] = C[3] ^ (~C[4] & C[0]);
456 eor @E[3],@C[7],@E[3],ror#32-14
457 str.h @E[3],[sp,#$R[1][3]+4]
459 ldr.l @C[1],[sp,#$A[0][1]] @ A[0][1]
460 eor @E[0],@C[8],@C[2],ror#32-10
461 ldr.h @C[0],[sp,#$A[0][1]+4]
462 eor @E[1],@C[9],@E[1],ror#32-10
463 str.l @E[0],[sp,#$R[1][4]] @ R[1][4] = C[4] ^ (~C[0] & C[1]);
464 str.h @E[1],[sp,#$R[1][4]+4]
467 ldmia @E[2],{@E[0]-@E[2],@E[3]} @ D[1..2]
468 ldr.l @C[2],[sp,#$A[1][2]] @ A[1][2]
469 ldr.h @C[3],[sp,#$A[1][2]+4]
470 ldmia @C[9],{@C[6]-@C[9]} @ D[3..4]
472 eor @C[1],@C[1],@E[0]
473 ldr.l @C[4],[sp,#$A[2][3]] @ A[2][3]
474 eor @C[0],@C[0],@E[1]
475 ldr.h @C[5],[sp,#$A[2][3]+4]
476 ror @C[0],@C[0],#32-1 @ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
478 eor @C[2],@C[2],@E[2]
479 ldr.l @E[0],[sp,#$A[3][4]] @ A[3][4]
480 eor @C[3],@C[3],@E[3]
481 ldr.h @E[1],[sp,#$A[3][4]+4]
482 @ ror @C[2],@C[2],#32-3 @ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
483 ldr.l @E[2],[sp,#$D[0]] @ D[0]
484 @ ror @C[3],@C[3],#32-3
485 ldr.h @E[3],[sp,#$D[0]+4]
487 eor @C[4],@C[4],@C[6]
488 eor @C[5],@C[5],@C[7]
489 @ ror @C[5],@C[6],#32-12 @ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
490 @ ror @C[4],@C[7],#32-13 @ [track reverse order below]
492 eor @E[0],@E[0],@C[8]
493 ldr.l @C[8],[sp,#$A[4][0]] @ A[4][0]
494 eor @E[1],@E[1],@C[9]
495 ldr.h @C[9],[sp,#$A[4][0]+4]
496 ror @C[6],@E[0],#32-4 @ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
497 ror @C[7],@E[1],#32-4
499 eor @E[2],@E[2],@C[8]
500 eor @E[3],@E[3],@C[9]
501 ror @C[8],@E[2],#32-9 @ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
502 ror @C[9],@E[3],#32-9
504 bic @E[0],@C[5],@C[2],ror#13-3
505 bic @E[1],@C[4],@C[3],ror#12-3
506 bic @E[2],@C[6],@C[5],ror#32-13
507 bic @E[3],@C[7],@C[4],ror#32-12
508 eor @E[0],@C[0],@E[0],ror#32-13
509 eor @E[1],@C[1],@E[1],ror#32-12
510 str.l @E[0],[sp,#$R[2][0]] @ R[2][0] = C[0] ^ (~C[1] & C[2])
511 eor @E[2],@E[2],@C[2],ror#32-3
512 str.h @E[1],[sp,#$R[2][0]+4]
513 eor @E[3],@E[3],@C[3],ror#32-3
514 str.l @E[2],[sp,#$R[2][1]] @ R[2][1] = C[1] ^ (~C[2] & C[3]);
515 bic @E[0],@C[8],@C[6]
516 bic @E[1],@C[9],@C[7]
517 str.h @E[3],[sp,#$R[2][1]+4]
518 eor @E[0],@E[0],@C[5],ror#32-13
519 eor @E[1],@E[1],@C[4],ror#32-12
520 str.l @E[0],[sp,#$R[2][2]] @ R[2][2] = C[2] ^ (~C[3] & C[4]);
521 bic @E[2],@C[0],@C[8]
522 str.h @E[1],[sp,#$R[2][2]+4]
523 bic @E[3],@C[1],@C[9]
524 eor @E[2],@E[2],@C[6]
525 eor @E[3],@E[3],@C[7]
526 str.l @E[2],[sp,#$R[2][3]] @ R[2][3] = C[3] ^ (~C[4] & C[0]);
527 bic @E[0],@C[2],@C[0],ror#3
528 str.h @E[3],[sp,#$R[2][3]+4]
529 bic @E[1],@C[3],@C[1],ror#3
530 ldr.l @C[1],[sp,#$A[0][4]] @ A[0][4] [in reverse order]
531 eor @E[0],@C[8],@E[0],ror#32-3
532 ldr.h @C[0],[sp,#$A[0][4]+4]
533 eor @E[1],@C[9],@E[1],ror#32-3
534 str.l @E[0],[sp,#$R[2][4]] @ R[2][4] = C[4] ^ (~C[0] & C[1]);
536 str.h @E[1],[sp,#$R[2][4]+4]
538 ldr.l @E[0],[sp,#$D[4]] @ D[4]
539 ldr.h @E[1],[sp,#$D[4]+4]
540 ldr.l @E[2],[sp,#$D[0]] @ D[0]
541 ldr.h @E[3],[sp,#$D[0]+4]
543 ldmia @C[9],{@C[6]-@C[9]} @ D[1..2]
545 eor @C[1],@C[1],@E[0]
546 ldr.l @C[2],[sp,#$A[1][0]] @ A[1][0]
547 eor @C[0],@C[0],@E[1]
548 ldr.h @C[3],[sp,#$A[1][0]+4]
549 @ ror @C[1],@E[0],#32-13 @ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
550 ldr.l @C[4],[sp,#$A[2][1]] @ A[2][1]
551 @ ror @C[0],@E[1],#32-14 @ [was loaded in reverse order]
552 ldr.h @C[5],[sp,#$A[2][1]+4]
554 eor @C[2],@C[2],@E[2]
555 ldr.l @E[0],[sp,#$A[3][2]] @ A[3][2]
556 eor @C[3],@C[3],@E[3]
557 ldr.h @E[1],[sp,#$A[3][2]+4]
558 @ ror @C[2],@C[2],#32-18 @ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
559 ldr.l @E[2],[sp,#$D[3]] @ D[3]
560 @ ror @C[3],@C[3],#32-18
561 ldr.h @E[3],[sp,#$D[3]+4]
563 eor @C[6],@C[6],@C[4]
564 eor @C[7],@C[7],@C[5]
565 ror @C[4],@C[6],#32-5 @ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
566 ror @C[5],@C[7],#32-5
568 eor @E[0],@E[0],@C[8]
569 ldr.l @C[8],[sp,#$A[4][3]] @ A[4][3]
570 eor @E[1],@E[1],@C[9]
571 ldr.h @C[9],[sp,#$A[4][3]+4]
572 ror @C[7],@E[0],#32-7 @ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
573 ror @C[6],@E[1],#32-8
575 eor @E[2],@E[2],@C[8]
576 eor @E[3],@E[3],@C[9]
577 ror @C[8],@E[2],#32-28 @ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
578 ror @C[9],@E[3],#32-28
580 bic @E[0],@C[4],@C[2],ror#32-18
581 bic @E[1],@C[5],@C[3],ror#32-18
582 eor @E[0],@E[0],@C[0],ror#32-14
583 eor @E[1],@E[1],@C[1],ror#32-13
584 str.l @E[0],[sp,#$R[3][0]] @ R[3][0] = C[0] ^ (~C[1] & C[2])
585 bic @E[2],@C[6],@C[4]
586 str.h @E[1],[sp,#$R[3][0]+4]
587 bic @E[3],@C[7],@C[5]
588 eor @E[2],@E[2],@C[2],ror#32-18
589 str.l @E[2],[sp,#$R[3][1]] @ R[3][1] = C[1] ^ (~C[2] & C[3]);
590 eor @E[3],@E[3],@C[3],ror#32-18
591 str.h @E[3],[sp,#$R[3][1]+4]
592 bic @E[0],@C[8],@C[6]
593 bic @E[1],@C[9],@C[7]
594 bic @E[2],@C[0],@C[8],ror#14
595 bic @E[3],@C[1],@C[9],ror#13
596 eor @E[0],@E[0],@C[4]
597 eor @E[1],@E[1],@C[5]
598 str.l @E[0],[sp,#$R[3][2]] @ R[3][2] = C[2] ^ (~C[3] & C[4]);
599 bic @C[2],@C[2],@C[0],ror#18-14
600 str.h @E[1],[sp,#$R[3][2]+4]
601 eor @E[2],@C[6],@E[2],ror#32-14
602 bic @E[1],@C[3],@C[1],ror#18-13
603 eor @E[3],@C[7],@E[3],ror#32-13
604 str.l @E[2],[sp,#$R[3][3]] @ R[3][3] = C[3] ^ (~C[4] & C[0]);
605 str.h @E[3],[sp,#$R[3][3]+4]
607 ldr.l @C[0],[sp,#$A[0][2]] @ A[0][2]
608 eor @E[0],@C[8],@C[2],ror#32-18
609 ldr.h @C[1],[sp,#$A[0][2]+4]
610 eor @E[1],@C[9],@E[1],ror#32-18
611 str.l @E[0],[sp,#$R[3][4]] @ R[3][4] = C[4] ^ (~C[0] & C[1]);
612 str.h @E[1],[sp,#$R[3][4]+4]
614 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[2..3]
615 ldr.l @C[2],[sp,#$A[1][3]] @ A[1][3]
616 ldr.h @C[3],[sp,#$A[1][3]+4]
617 ldr.l @C[6],[sp,#$D[4]] @ D[4]
618 ldr.h @C[7],[sp,#$D[4]+4]
620 eor @C[0],@C[0],@E[0]
621 ldr.l @C[4],[sp,#$A[2][4]] @ A[2][4]
622 eor @C[1],@C[1],@E[1]
623 ldr.h @C[5],[sp,#$A[2][4]+4]
624 @ ror @C[0],@C[0],#32-31 @ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
625 ldr.l @C[8],[sp,#$D[0]] @ D[0]
626 @ ror @C[1],@C[1],#32-31
627 ldr.h @C[9],[sp,#$D[0]+4]
629 eor @E[2],@E[2],@C[2]
630 ldr.l @E[0],[sp,#$A[3][0]] @ A[3][0]
631 eor @E[3],@E[3],@C[3]
632 ldr.h @E[1],[sp,#$A[3][0]+4]
633 ror @C[3],@E[2],#32-27 @ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
634 ldr.l @E[2],[sp,#$D[1]] @ D[1]
635 ror @C[2],@E[3],#32-28
636 ldr.h @E[3],[sp,#$D[1]+4]
638 eor @C[6],@C[6],@C[4]
639 eor @C[7],@C[7],@C[5]
640 ror @C[5],@C[6],#32-19 @ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
641 ror @C[4],@C[7],#32-20
643 eor @E[0],@E[0],@C[8]
644 ldr.l @C[8],[sp,#$A[4][1]] @ A[4][1]
645 eor @E[1],@E[1],@C[9]
646 ldr.h @C[9],[sp,#$A[4][1]+4]
647 ror @C[7],@E[0],#32-20 @ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
648 ror @C[6],@E[1],#32-21
650 eor @C[8],@C[8],@E[2]
651 eor @C[9],@C[9],@E[3]
652 @ ror @C[8],@C[2],#32-1 @ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
653 @ ror @C[9],@C[3],#32-1
655 bic @E[0],@C[4],@C[2]
656 bic @E[1],@C[5],@C[3]
657 eor @E[0],@E[0],@C[0],ror#32-31
658 str.l @E[0],[sp,#$R[4][0]] @ R[4][0] = C[0] ^ (~C[1] & C[2])
659 eor @E[1],@E[1],@C[1],ror#32-31
660 str.h @E[1],[sp,#$R[4][0]+4]
661 bic @E[2],@C[6],@C[4]
662 bic @E[3],@C[7],@C[5]
663 eor @E[2],@E[2],@C[2]
664 eor @E[3],@E[3],@C[3]
665 str.l @E[2],[sp,#$R[4][1]] @ R[4][1] = C[1] ^ (~C[2] & C[3]);
666 bic @E[0],@C[8],@C[6],ror#1
667 str.h @E[3],[sp,#$R[4][1]+4]
668 bic @E[1],@C[9],@C[7],ror#1
669 bic @E[2],@C[0],@C[8],ror#31-1
670 bic @E[3],@C[1],@C[9],ror#31-1
671 eor @C[4],@C[4],@E[0],ror#32-1
672 str.l @C[4],[sp,#$R[4][2]] @ R[4][2] = C[2] ^= (~C[3] & C[4]);
673 eor @C[5],@C[5],@E[1],ror#32-1
674 str.h @C[5],[sp,#$R[4][2]+4]
675 eor @C[6],@C[6],@E[2],ror#32-31
676 eor @C[7],@C[7],@E[3],ror#32-31
677 str.l @C[6],[sp,#$R[4][3]] @ R[4][3] = C[3] ^= (~C[4] & C[0]);
678 bic @E[0],@C[2],@C[0],ror#32-31
679 str.h @C[7],[sp,#$R[4][3]+4]
680 bic @E[1],@C[3],@C[1],ror#32-31
681 add @E[2],sp,#$R[0][0]
682 eor @C[8],@E[0],@C[8],ror#32-1
683 add @E[0],sp,#$R[1][0]
684 eor @C[9],@E[1],@C[9],ror#32-1
685 str.l @C[8],[sp,#$R[4][4]] @ R[4][4] = C[4] ^= (~C[0] & C[1]);
686 str.h @C[9],[sp,#$R[4][4]+4]
695 .size KeccakF1600_int,.-KeccakF1600_int
697 .type KeccakF1600, %function
700 stmdb sp!,{r0,r4-r11,lr}
701 sub sp,sp,#440+16 @ space for A[5][5],D[5],T[5][5],...
703 add @E[0],r0,#$A[1][0]
704 add @E[1],sp,#$A[1][0]
705 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
706 stmia sp, {@C[0]-@C[9]}
707 ldmia @E[0]!,{@C[0]-@C[9]}
708 stmia @E[1]!,{@C[0]-@C[9]}
709 ldmia @E[0]!,{@C[0]-@C[9]}
710 stmia @E[1]!,{@C[0]-@C[9]}
711 ldmia @E[0]!,{@C[0]-@C[9]}
712 stmia @E[1]!,{@C[0]-@C[9]}
713 ldmia @E[0], {@C[0]-@C[9]}
714 add @E[2],sp,#$A[0][0]
715 add @E[0],sp,#$A[1][0]
716 stmia @E[1], {@C[0]-@C[9]}
720 ldr @E[1], [sp,#440+16] @ restore pointer to A
721 ldmia sp, {@C[0]-@C[9]}
722 stmia @E[1]!,{@C[0]-@C[9]} @ return A[5][5]
723 ldmia @E[0]!,{@C[0]-@C[9]}
724 stmia @E[1]!,{@C[0]-@C[9]}
725 ldmia @E[0]!,{@C[0]-@C[9]}
726 stmia @E[1]!,{@C[0]-@C[9]}
727 ldmia @E[0]!,{@C[0]-@C[9]}
728 stmia @E[1]!,{@C[0]-@C[9]}
729 ldmia @E[0], {@C[0]-@C[9]}
730 stmia @E[1], {@C[0]-@C[9]}
733 ldmia sp!,{r4-r11,pc}
734 .size KeccakF1600,.-KeccakF1600
736 { my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));
738 ########################################################################
740 # ----->+-----------------------+
741 # | uint64_t A[5][5] |
744 # +456->+-----------------------+
746 # +460->+-----------------------+
748 # +464->+-----------------------+
750 # +468->+-----------------------+
752 # +472->+-----------------------+
754 # +476->+-----------------------+
755 # | const void *inp |
756 # +480->+-----------------------+
758 # +484->+-----------------------+
760 # +488->+-----------------------+
765 .type SHA3_absorb,%function
768 stmdb sp!,{r0-r12,lr}
771 add $A_flat,r0,#$A[1][0]
779 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
780 stmia $inp!, {@C[0]-@C[9]}
781 ldmia $A_flat!,{@C[0]-@C[9]}
782 stmia $inp!, {@C[0]-@C[9]}
783 ldmia $A_flat!,{@C[0]-@C[9]}
784 stmia $inp!, {@C[0]-@C[9]}
785 ldmia $A_flat!,{@C[0]-@C[9]}
786 stmia $inp!, {@C[0]-@C[9]}
787 ldmia $A_flat!,{@C[0]-@C[9]}
788 stmia $inp, {@C[0]-@C[9]}
790 ldr $inp,[sp,#476] @ restore $inp
797 mov r6,#0x11 @ compose constants
802 orr r6,r6,r6,lsl#16 @ 0x11111111
803 orr r9,r9,r9,lsl#16 @ 0x00ff00ff
804 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
805 orr r7,r6,r6,lsl#1 @ 0x33333333
806 orr r6,r6,r6,lsl#2 @ 0x55555555
819 str r0,[sp,#480] @ save len - bsz
832 orr r0,r0,r3,lsl#24 @ lo
836 orr r1,r1,r3,lsl#24 @ hi
838 and r2,r0,r6 @ &=0x55555555
839 and r0,r0,r6,lsl#1 @ &=0xaaaaaaaa
840 and r3,r1,r6 @ &=0x55555555
841 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
846 and r2,r2,r7 @ &=0x33333333
847 and r0,r0,r7,lsl#2 @ &=0xcccccccc
848 and r3,r3,r7 @ &=0x33333333
849 and r1,r1,r7,lsl#2 @ &=0xcccccccc
854 and r2,r2,r8 @ &=0x0f0f0f0f
855 and r0,r0,r8,lsl#4 @ &=0xf0f0f0f0
856 and r3,r3,r8 @ &=0x0f0f0f0f
857 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
858 ldmia $A_flat,{r4-r5} @ A_flat[i]
863 and r2,r2,r9 @ &=0x00ff00ff
864 and r0,r0,r9,lsl#8 @ &=0xff00ff00
865 and r3,r3,r9 @ &=0x00ff00ff
866 and r1,r1,r9,lsl#8 @ &=0xff00ff00
878 stmia $A_flat!,{r4-r5} @ A_flat[i++] ^= BitInterleave(inp[0..7])
888 ldmia r14,{r6-r12,r14} @ restore constants and variables
893 add $inp,sp,#$A[1][0]
894 ldmia sp, {@C[0]-@C[9]}
895 stmia $A_flat!,{@C[0]-@C[9]} @ return A[5][5]
896 ldmia $inp!, {@C[0]-@C[9]}
897 stmia $A_flat!,{@C[0]-@C[9]}
898 ldmia $inp!, {@C[0]-@C[9]}
899 stmia $A_flat!,{@C[0]-@C[9]}
900 ldmia $inp!, {@C[0]-@C[9]}
901 stmia $A_flat!,{@C[0]-@C[9]}
902 ldmia $inp, {@C[0]-@C[9]}
903 stmia $A_flat, {@C[0]-@C[9]}
907 mov r0,$len @ return value
908 ldmia sp!,{r4-r12,pc}
909 .size SHA3_absorb,.-SHA3_absorb
912 { my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12));
916 .type SHA3_squeeze,%function
919 stmdb sp!,{r0,r3-r10,lr}
932 mov r6,#0x11 @ compose constants
937 orr r6,r6,r6,lsl#16 @ 0x11111111
938 orr r9,r9,r9,lsl#16 @ 0x00ff00ff
939 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
940 orr r7,r6,r6,lsl#1 @ 0x33333333
941 orr r6,r6,r6,lsl#2 @ 0x55555555
950 ldmia $A_flat!,{r0,r1} @ A_flat[i++]
953 lsl r3,r1,#16 @ r3 = r1 << 16
954 lsr r2,r2,#16 @ r2 = r0 & 0x0000ffff
956 lsr r0,r0,#16 @ r0 = r0 >> 16
957 lsl r1,r1,#16 @ r1 = r1 & 0xffff0000
963 and r2,r2,r9 @ &=0x00ff00ff
964 and r3,r3,r9,lsl#8 @ &=0xff00ff00
965 and r0,r0,r9 @ &=0x00ff00ff
966 and r1,r1,r9,lsl#8 @ &=0xff00ff00
971 and r2,r2,r8 @ &=0x0f0f0f0f
972 and r3,r3,r8,lsl#4 @ &=0xf0f0f0f0
973 and r0,r0,r8 @ &=0x0f0f0f0f
974 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
979 and r2,r2,r7 @ &=0x33333333
980 and r3,r3,r7,lsl#2 @ &=0xcccccccc
981 and r0,r0,r7 @ &=0x33333333
982 and r1,r1,r7,lsl#2 @ &=0xcccccccc
987 and r2,r2,r6 @ &=0x55555555
988 and r3,r3,r6,lsl#1 @ &=0xaaaaaaaa
989 and r0,r0,r6 @ &=0x55555555
990 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
1015 subs $bsz,$bsz,#8 @ bsz -= 8
1018 mov r0,r14 @ original $A_flat
1022 ldmia sp,{r6-r10,r12} @ restore constants and variables
1058 ldmia sp!,{r4-r10,pc}
1059 .size SHA3_squeeze,.-SHA3_squeeze
1064 #if __ARM_MAX_ARCH__>=7
1067 .type iotas64, %object
1070 .quad 0x0000000000000001
1071 .quad 0x0000000000008082
1072 .quad 0x800000000000808a
1073 .quad 0x8000000080008000
1074 .quad 0x000000000000808b
1075 .quad 0x0000000080000001
1076 .quad 0x8000000080008081
1077 .quad 0x8000000000008009
1078 .quad 0x000000000000008a
1079 .quad 0x0000000000000088
1080 .quad 0x0000000080008009
1081 .quad 0x000000008000000a
1082 .quad 0x000000008000808b
1083 .quad 0x800000000000008b
1084 .quad 0x8000000000008089
1085 .quad 0x8000000000008003
1086 .quad 0x8000000000008002
1087 .quad 0x8000000000000080
1088 .quad 0x000000000000800a
1089 .quad 0x800000008000000a
1090 .quad 0x8000000080008081
1091 .quad 0x8000000000008080
1092 .quad 0x0000000080000001
1093 .quad 0x8000000080008008
1094 .size iotas64,.-iotas64
1096 .type KeccakF1600_neon, %function
1101 mov r3, #24 @ loop counter
1107 vst1.64 {q4}, [r0:64] @ offload A[0..1][4]
1108 veor q13, q0, q5 @ A[0..1][0]^A[2..3][0]
1109 vst1.64 {d18}, [r1:64] @ offload A[2][4]
1110 veor q14, q1, q6 @ A[0..1][1]^A[2..3][1]
1111 veor q15, q2, q7 @ A[0..1][2]^A[2..3][2]
1112 veor d26, d26, d27 @ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
1113 veor d27, d28, d29 @ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
1114 veor q14, q3, q8 @ A[0..1][3]^A[2..3][3]
1115 veor q4, q4, q9 @ A[0..1][4]^A[2..3][4]
1116 veor d30, d30, d31 @ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
1117 veor d31, d28, d29 @ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
1118 veor d25, d8, d9 @ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
1119 veor q13, q13, q10 @ C[0..1]^=A[4][0..1]
1120 veor q14, q15, q11 @ C[2..3]^=A[4][2..3]
1121 veor d25, d25, d24 @ C[4]^=A[4][4]
1123 vadd.u64 q4, q13, q13 @ C[0..1]<<1
1124 vadd.u64 q15, q14, q14 @ C[2..3]<<1
1125 vadd.u64 d18, d25, d25 @ C[4]<<1
1126 vsri.u64 q4, q13, #63 @ ROL64(C[0..1],1)
1127 vsri.u64 q15, q14, #63 @ ROL64(C[2..3],1)
1128 vsri.u64 d18, d25, #63 @ ROL64(C[4],1)
1129 veor d25, d25, d9 @ D[0] = C[4] ^= ROL64(C[1],1)
1130 veor q13, q13, q15 @ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
1131 veor d28, d28, d18 @ D[3] = C[2] ^= ROL64(C[4],1)
1132 veor d29, d29, d8 @ D[4] = C[3] ^= ROL64(C[0],1)
1134 veor d0, d0, d25 @ A[0][0] ^= C[4]
1135 veor d1, d1, d25 @ A[1][0] ^= C[4]
1136 veor d10, d10, d25 @ A[2][0] ^= C[4]
1137 veor d11, d11, d25 @ A[3][0] ^= C[4]
1138 veor d20, d20, d25 @ A[4][0] ^= C[4]
1140 veor d2, d2, d26 @ A[0][1] ^= D[1]
1141 veor d3, d3, d26 @ A[1][1] ^= D[1]
1142 veor d12, d12, d26 @ A[2][1] ^= D[1]
1143 veor d13, d13, d26 @ A[3][1] ^= D[1]
1144 veor d21, d21, d26 @ A[4][1] ^= D[1]
1147 veor d6, d6, d28 @ A[0][3] ^= C[2]
1148 veor d7, d7, d28 @ A[1][3] ^= C[2]
@ (continuation of KeccakF1600_neon) Theta, final part: XOR the
@ remaining column parities C[]/D[] into the state lanes.
1149 veor d16, d16, d28 @ A[2][3] ^= C[2]
1150 veor d17, d17, d28 @ A[3][3] ^= C[2]
1151 veor d23, d23, d28 @ A[4][3] ^= C[2]
1152 vld1.64 {q4}, [r0:64] @ restore A[0..1][4]
1155 vld1.64 {d18}, [r1:64] @ restore A[2][4]
1156 veor q2, q2, q13 @ A[0..1][2] ^= D[2]
1157 veor q7, q7, q13 @ A[2..3][2] ^= D[2]
1158 veor d22, d22, d27 @ A[4][2] ^= D[2]
1160 veor q4, q4, q14 @ A[0..1][4] ^= C[3]
1161 veor q9, q9, q14 @ A[2..3][4] ^= C[3]
1162 veor d24, d24, d29 @ A[4][4] ^= C[3]
@ Rho + Pi, combined: each destination lane receives a rotated source
@ lane. ROL64 is synthesized as vshl of the low bits followed by
@ vsri (shift-right-insert) of the high bits; a rotation whose count
@ is a multiple of 8 is done instead with one byte-wise vext (see the
@ commented-out vshl lines kept for reference). The first rank of
@ A[0][y] lanes must be saved into C[1..4] before being overwritten.
1165 vmov d26, d2 @ C[1] = A[0][1]
1166 vshl.u64 d2, d3, #44
1167 vmov d27, d4 @ C[2] = A[0][2]
1168 vshl.u64 d4, d14, #43
1169 vmov d28, d6 @ C[3] = A[0][3]
1170 vshl.u64 d6, d17, #21
1171 vmov d29, d8 @ C[4] = A[0][4]
1172 vshl.u64 d8, d24, #14
1173 vsri.u64 d2, d3, #64-44 @ A[0][1] = ROL64(A[1][1], rhotates[1][1])
1174 vsri.u64 d4, d14, #64-43 @ A[0][2] = ROL64(A[2][2], rhotates[2][2])
1175 vsri.u64 d6, d17, #64-21 @ A[0][3] = ROL64(A[3][3], rhotates[3][3])
1176 vsri.u64 d8, d24, #64-14 @ A[0][4] = ROL64(A[4][4], rhotates[4][4])
1178 vshl.u64 d3, d9, #20
1179 vshl.u64 d14, d16, #25
1180 vshl.u64 d17, d15, #15
1181 vshl.u64 d24, d21, #2
1182 vsri.u64 d3, d9, #64-20 @ A[1][1] = ROL64(A[1][4], rhotates[1][4])
1183 vsri.u64 d14, d16, #64-25 @ A[2][2] = ROL64(A[2][3], rhotates[2][3])
1184 vsri.u64 d17, d15, #64-15 @ A[3][3] = ROL64(A[3][2], rhotates[3][2])
1185 vsri.u64 d24, d21, #64-2 @ A[4][4] = ROL64(A[4][1], rhotates[4][1])
1187 vshl.u64 d9, d22, #61
1188 @ vshl.u64 d16, d19, #8
1189 vshl.u64 d15, d12, #10
1190 vshl.u64 d21, d7, #55
1191 vsri.u64 d9, d22, #64-61 @ A[1][4] = ROL64(A[4][2], rhotates[4][2])
1192 vext.8 d16, d19, d19, #8-1 @ A[2][3] = ROL64(A[3][4], rhotates[3][4])
1193 vsri.u64 d15, d12, #64-10 @ A[3][2] = ROL64(A[2][1], rhotates[2][1])
1194 vsri.u64 d21, d7, #64-55 @ A[4][1] = ROL64(A[1][3], rhotates[1][3])
1196 vshl.u64 d22, d18, #39
1197 @ vshl.u64 d19, d23, #56
1198 vshl.u64 d12, d5, #6
1199 vshl.u64 d7, d13, #45
1200 vsri.u64 d22, d18, #64-39 @ A[4][2] = ROL64(A[2][4], rhotates[2][4])
1201 vext.8 d19, d23, d23, #8-7 @ A[3][4] = ROL64(A[4][3], rhotates[4][3])
1202 vsri.u64 d12, d5, #64-6 @ A[2][1] = ROL64(A[1][2], rhotates[1][2])
1203 vsri.u64 d7, d13, #64-45 @ A[1][3] = ROL64(A[3][1], rhotates[3][1])
1205 vshl.u64 d18, d20, #18
1206 vshl.u64 d23, d11, #41
1207 vshl.u64 d5, d10, #3
1208 vshl.u64 d13, d1, #36
1209 vsri.u64 d18, d20, #64-18 @ A[2][4] = ROL64(A[4][0], rhotates[4][0])
1210 vsri.u64 d23, d11, #64-41 @ A[4][3] = ROL64(A[3][0], rhotates[3][0])
1211 vsri.u64 d5, d10, #64-3 @ A[1][2] = ROL64(A[2][0], rhotates[2][0])
1212 vsri.u64 d13, d1, #64-36 @ A[3][1] = ROL64(A[1][0], rhotates[1][0])
@ The saved C[1..4] copies of the original A[0][y] row are rotated
@ last, into the first column.
1214 vshl.u64 d1, d28, #28
1215 vshl.u64 d10, d26, #1
1216 vshl.u64 d11, d29, #27
1217 vshl.u64 d20, d27, #62
1218 vsri.u64 d1, d28, #64-28 @ A[1][0] = ROL64(C[3], rhotates[0][3])
1219 vsri.u64 d10, d26, #64-1 @ A[2][0] = ROL64(C[1], rhotates[0][1])
1220 vsri.u64 d11, d29, #64-27 @ A[3][0] = ROL64(C[4], rhotates[0][4])
1221 vsri.u64 d20, d27, #64-62 @ A[4][0] = ROL64(C[2], rhotates[0][2])
@ Chi: each lane is combined with the complement-AND of the next two
@ lanes of its row. Rows 0..3 are processed two at a time on
@ q-registers (the complement-AND products land in q13..q15 on lines
@ not shown in this fragment); row 4 follows on d-registers below.
1227 veor q13, q13, q0 @ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
1228 veor q14, q14, q1 @ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
1229 veor q2, q2, q15 @ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
1230 vst1.64 {q13}, [r0:64] @ offload A[0..1][0]
1233 vmov q1, q14 @ A[0..1][1]
1234 veor q3, q3, q13 @ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
1235 veor q4, q4, q15 @ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])
1238 vmov q0, q5 @ A[2..3][0]
1240 vmov q15, q6 @ A[2..3][1]
1241 veor q5, q5, q13 @ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
1243 veor q6, q6, q14 @ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
1245 veor q7, q7, q13 @ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
1247 veor q8, q8, q14 @ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
1248 vmov q14, q10 @ A[4][0..1]
1249 veor q9, q9, q13 @ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])
@ Chi on row 4, interleaved with Iota: the next round constant is
@ loaded (r2 advances through the Iota table) and folded into A[0][0].
1251 vld1.64 d25, [r2:64]! @ Iota[i++]
1254 vld1.64 {q0}, [r0:64] @ restore A[0..1][0]
1255 veor d20, d20, d26 @ A[4][0] ^= (~A[4][1] & A[4][2])
1257 veor d21, d21, d27 @ A[4][1] ^= (~A[4][2] & A[4][3])
1259 veor d22, d22, d26 @ A[4][2] ^= (~A[4][3] & A[4][4])
1261 veor d23, d23, d27 @ A[4][3] ^= (~A[4][4] & A[4][0])
1262 veor d0, d0, d25 @ A[0][0] ^= Iota[i]
1263 veor d24, d24, d26 @ A[4][4] ^= (~A[4][0] & A[4][1])
1269 .size KeccakF1600_neon,.-KeccakF1600_neon
@ SHA3_absorb_neon(A_flat, inp, len, bsz): absorb full bsz-byte blocks
@ of inp into the Keccak state, permuting after each block; the value
@ left in r0 on exit is the number of unabsorbed bytes (len % bsz).
@ NOTE(review): the prologue that shuffles the C arguments into the
@ working registers is not shown in this fragment; from the comments
@ below, r0 = A_flat, r4 = inp, r5 = len, r6 = bsz — confirm upstream.
1271 .global SHA3_absorb_neon
1272 .type SHA3_absorb_neon, %function
1275 stmdb sp!, {r4-r6,lr}
1276 vstmdb sp!, {d8-d15}
@ Load the 5x5 state. Rows 0/1 and rows 2/3 are interleaved into
@ even/odd d-registers so the permutation can work on two rows at a
@ time in q-registers; row 4 occupies d20-d24 on its own.
1282 vld1.32 {d0}, [r0:64]! @ A[0][0]
1283 vld1.32 {d2}, [r0:64]! @ A[0][1]
1284 vld1.32 {d4}, [r0:64]! @ A[0][2]
1285 vld1.32 {d6}, [r0:64]! @ A[0][3]
1286 vld1.32 {d8}, [r0:64]! @ A[0][4]
1288 vld1.32 {d1}, [r0:64]! @ A[1][0]
1289 vld1.32 {d3}, [r0:64]! @ A[1][1]
1290 vld1.32 {d5}, [r0:64]! @ A[1][2]
1291 vld1.32 {d7}, [r0:64]! @ A[1][3]
1292 vld1.32 {d9}, [r0:64]! @ A[1][4]
1294 vld1.32 {d10}, [r0:64]! @ A[2][0]
1295 vld1.32 {d12}, [r0:64]! @ A[2][1]
1296 vld1.32 {d14}, [r0:64]! @ A[2][2]
1297 vld1.32 {d16}, [r0:64]! @ A[2][3]
1298 vld1.32 {d18}, [r0:64]! @ A[2][4]
1300 vld1.32 {d11}, [r0:64]! @ A[3][0]
1301 vld1.32 {d13}, [r0:64]! @ A[3][1]
1302 vld1.32 {d15}, [r0:64]! @ A[3][2]
1303 vld1.32 {d17}, [r0:64]! @ A[3][3]
1304 vld1.32 {d19}, [r0:64]! @ A[3][4]
1306 vld1.32 {d20-d23}, [r0:64]! @ A[4][0..3]
1307 vld1.32 {d24}, [r0:64] @ A[4][4]
1308 sub r0, r0, #24*8 @ rewind
@ Absorb loop: if at least one full block remains, XOR it into the
@ state one 8-byte lane at a time. vld1.8 loads bytes in memory
@ order, which keeps the code endian-neutral. Lanes are consumed in
@ block order; the branches that stop after bsz bytes are not shown.
1313 subs r12, r5, r6 @ len - bsz
1317 vld1.8 {d31}, [r4]! @ endian-neutral loads...
1319 veor d0, d0, d31 @ A[0][0] ^= *inp++
1322 veor d2, d2, d31 @ A[0][1] ^= *inp++
1326 veor d4, d4, d31 @ A[0][2] ^= *inp++
1329 veor d6, d6, d31 @ A[0][3] ^= *inp++
1333 veor d8, d8, d31 @ A[0][4] ^= *inp++
1337 veor d1, d1, d31 @ A[1][0] ^= *inp++
1341 veor d3, d3, d31 @ A[1][1] ^= *inp++
1344 veor d5, d5, d31 @ A[1][2] ^= *inp++
1348 veor d7, d7, d31 @ A[1][3] ^= *inp++
1351 veor d9, d9, d31 @ A[1][4] ^= *inp++
1356 veor d10, d10, d31 @ A[2][0] ^= *inp++
1359 veor d12, d12, d31 @ A[2][1] ^= *inp++
1363 veor d14, d14, d31 @ A[2][2] ^= *inp++
1366 veor d16, d16, d31 @ A[2][3] ^= *inp++
1370 veor d18, d18, d31 @ A[2][4] ^= *inp++
1374 veor d11, d11, d31 @ A[3][0] ^= *inp++
1378 veor d13, d13, d31 @ A[3][1] ^= *inp++
1381 veor d15, d15, d31 @ A[3][2] ^= *inp++
1385 veor d17, d17, d31 @ A[3][3] ^= *inp++
1388 veor d19, d19, d31 @ A[3][4] ^= *inp++
1393 veor d20, d20, d31 @ A[4][0] ^= *inp++
1396 veor d21, d21, d31 @ A[4][1] ^= *inp++
1400 veor d22, d22, d31 @ A[4][2] ^= *inp++
1403 veor d23, d23, d31 @ A[4][3] ^= *inp++
1406 veor d24, d24, d31 @ A[4][4] ^= *inp++
@ Input exhausted (the permutation call and loop branch are not shown
@ in this fragment): flush the state from registers back to memory in
@ the same interleaved layout it was loaded in.
1414 vst1.32 {d0}, [r0:64]! @ A[0][0..4]
1415 vst1.32 {d2}, [r0:64]!
1416 vst1.32 {d4}, [r0:64]!
1417 vst1.32 {d6}, [r0:64]!
1418 vst1.32 {d8}, [r0:64]!
1420 vst1.32 {d1}, [r0:64]! @ A[1][0..4]
1421 vst1.32 {d3}, [r0:64]!
1422 vst1.32 {d5}, [r0:64]!
1423 vst1.32 {d7}, [r0:64]!
1424 vst1.32 {d9}, [r0:64]!
1426 vst1.32 {d10}, [r0:64]! @ A[2][0..4]
1427 vst1.32 {d12}, [r0:64]!
1428 vst1.32 {d14}, [r0:64]!
1429 vst1.32 {d16}, [r0:64]!
1430 vst1.32 {d18}, [r0:64]!
1432 vst1.32 {d11}, [r0:64]! @ A[3][0..4]
1433 vst1.32 {d13}, [r0:64]!
1434 vst1.32 {d15}, [r0:64]!
1435 vst1.32 {d17}, [r0:64]!
1436 vst1.32 {d19}, [r0:64]!
1438 vst1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1439 vst1.32 {d24}, [r0:64]
@ Epilogue: hand the leftover byte count back to the caller and
@ restore the callee-saved d8-d15 per the AAPCS.
1441 mov r0, r5 @ return value
1442 vldmia sp!, {d8-d15}
1443 ldmia sp!, {r4-r6,pc}
1444 .size SHA3_absorb_neon,.-SHA3_absorb_neon
@ SHA3_squeeze_neon(A_flat, out, len, bsz): copy len output bytes from
@ the state, re-permuting whenever a bsz-byte block is exhausted.
@ NOTE(review): the prologue is not shown in this fragment; from the
@ comments below, r0 = A_flat, r12 scans the flat state, r4 = out,
@ r5 = remaining len, r14 = bytes left in the current block — confirm
@ against the elided setup code.
1446 .global SHA3_squeeze_neon
1447 .type SHA3_squeeze_neon, %function
1450 stmdb sp!, {r4-r6,lr}
1455 mov r12, r0 @ A_flat
1457 b .Loop_squeeze_neon
@ Main loop: one 8-byte lane per iteration; divert to the tail code
@ when fewer than 8 output bytes remain. vst1.8 stores bytes in
@ memory order, keeping the output endian-neutral.
1462 blo .Lsqueeze_neon_tail
1463 vld1.32 {d0}, [r12]!
1464 vst1.8 {d0}, [r4]! @ endian-neutral store
1466 subs r5, r5, #8 @ len -= 8
1467 beq .Lsqueeze_neon_done
1469 subs r14, r14, #8 @ bsz -= 8
1470 bhi .Loop_squeeze_neon
@ Block exhausted: preserve d8-d15, load the whole state into
@ registers (same interleaved row layout as the absorb path), run the
@ permutation (the call itself is not shown in this fragment), store
@ the state back, and resume squeezing from the start of A_flat.
1472 vstmdb sp!, {d8-d15}
1474 vld1.32 {d0}, [r0:64]! @ A[0][0..4]
1475 vld1.32 {d2}, [r0:64]!
1476 vld1.32 {d4}, [r0:64]!
1477 vld1.32 {d6}, [r0:64]!
1478 vld1.32 {d8}, [r0:64]!
1480 vld1.32 {d1}, [r0:64]! @ A[1][0..4]
1481 vld1.32 {d3}, [r0:64]!
1482 vld1.32 {d5}, [r0:64]!
1483 vld1.32 {d7}, [r0:64]!
1484 vld1.32 {d9}, [r0:64]!
1486 vld1.32 {d10}, [r0:64]! @ A[2][0..4]
1487 vld1.32 {d12}, [r0:64]!
1488 vld1.32 {d14}, [r0:64]!
1489 vld1.32 {d16}, [r0:64]!
1490 vld1.32 {d18}, [r0:64]!
1492 vld1.32 {d11}, [r0:64]! @ A[3][0..4]
1493 vld1.32 {d13}, [r0:64]!
1494 vld1.32 {d15}, [r0:64]!
1495 vld1.32 {d17}, [r0:64]!
1496 vld1.32 {d19}, [r0:64]!
1498 vld1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1499 vld1.32 {d24}, [r0:64]
1500 sub r0, r0, #24*8 @ rewind
1504 mov r12, r0 @ A_flat
1505 vst1.32 {d0}, [r0:64]! @ A[0][0..4]
1506 vst1.32 {d2}, [r0:64]!
1507 vst1.32 {d4}, [r0:64]!
1508 vst1.32 {d6}, [r0:64]!
1509 vst1.32 {d8}, [r0:64]!
1511 vst1.32 {d1}, [r0:64]! @ A[1][0..4]
1512 vst1.32 {d3}, [r0:64]!
1513 vst1.32 {d5}, [r0:64]!
1514 vst1.32 {d7}, [r0:64]!
1515 vst1.32 {d9}, [r0:64]!
1517 vst1.32 {d10}, [r0:64]! @ A[2][0..4]
1518 vst1.32 {d12}, [r0:64]!
1519 vst1.32 {d14}, [r0:64]!
1520 vst1.32 {d16}, [r0:64]!
1521 vst1.32 {d18}, [r0:64]!
1523 vst1.32 {d11}, [r0:64]! @ A[3][0..4]
1524 vst1.32 {d13}, [r0:64]!
1525 vst1.32 {d15}, [r0:64]!
1526 vst1.32 {d17}, [r0:64]!
1527 vst1.32 {d19}, [r0:64]!
1529 vst1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1531 vst1.32 {d24}, [r0:64]
1532 mov r0, r12 @ rewind
1534 vldmia sp!, {d8-d15}
1535 b .Loop_squeeze_neon
@ Tail: emit the final 1..7 bytes one at a time so the output stays
@ endian-neutral; the per-byte extraction into r2 and the remaining
@ length checks are on lines not shown in this fragment.
1538 .Lsqueeze_neon_tail:
1541 strb r2, [r4],#1 @ endian-neutral store
1543 blo .Lsqueeze_neon_done
1546 beq .Lsqueeze_neon_done
1550 blo .Lsqueeze_neon_done
1552 beq .Lsqueeze_neon_done
1557 blo .Lsqueeze_neon_done
1560 beq .Lsqueeze_neon_done
1563 .Lsqueeze_neon_done:
1564 ldmia sp!, {r4-r6,pc}
1565 .size SHA3_squeeze_neon,.-SHA3_squeeze_neon
1567 .asciz "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
1575 my ($mnemonic,$half,$reg,$ea) = @_;
1576 my $op = $mnemonic eq "ldr" ? \%ldr : \%str;
1581 sprintf "#ifndef __thumb2__\n" .
1583 "#endif", $mnemonic,$reg,$ea;
1585 sprintf "#ifndef __thumb2__\n" .
1588 " %sd\t%s,%s,%s\n" .
1589 "#endif", $mnemonic,$reg,$ea,
1590 $mnemonic,$$op{reg},$reg,$$op{ea};
# Post-process the generated assembly line by line before it is
# printed: expand template constructs and translate mnemonics the
# target ISA level cannot accept. The 'or' chain applies at most one
# of the mutually exclusive rewrites to any given line.
1595 foreach (split($/,$code)) {
# evaluate backquoted Perl snippets embedded in the code template
1596 s/\`([^\`]*)\`/eval $1/ge;
# expand pseudo "ldr.X"/"str.X" (X = l or h, a half of a 64-bit
# value) through the ldrd() helper defined above
1598 s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge or
# rewrite stand-alone shift/rotate mnemonics into the equivalent
# "mov rd,rs,shift #n" form
1599 s/\b(ror|ls[rl])\s+(r[0-9]+.*)#/mov $2$1#/g or
# lower "ret" to "bx lr", then (separate lines, hence the chain)
# encode "bx lr" as a literal word so ARMv4 assemblers accept it
1600 s/\bret\b/bx lr/g or
1601 s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4
1606 close STDOUT; # enforce flush