2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for ARMv4.
20 # Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit
21 # interleaving. How does it compare to Keccak Code Package? It's as
22 # fast, but several times smaller, and is endian- and ISA-neutral. ISA
23 # neutrality means that minimum ISA requirement is ARMv4, yet it can
24 # be assembled even as Thumb-2. NEON code path is KECCAK_1X_ALT with
25 # register layout taken from Keccak Code Package. It's also as fast,
26 # in fact faster by 10-15% on some processors, and endian-neutral.
30 # Switch to KECCAK_2X variant for non-NEON code and merge almost 1/2
31 # of rotate instructions with logical ones. This resulted in ~10%
32 # improvement on most processors. Switch to KECCAK_2X effectively
33 # minimizes re-loads from temporary storage, and merged rotates just
34 # eliminate corresponding instructions. As for the latter: when examining
35 # code you'll notice commented ror instructions. These are eliminated
36 # ones, and you should trace destination register below to see what's
37 # going on. In case you wonder why not all rotates are eliminated, the trouble
38 # is that you have operations that require both inputs to be rotated,
39 # e.g. 'eor a,b>>>x,c>>>y'. This conundrum is resolved by using
40 # 'eor a,b,c>>>(x-y)' and then merge-rotating 'a' in next operation
41 # that takes 'a' as input. And thing is that this next operation can
42 # be in next round. It's totally possible to "carry" rotate "factors"
43 # to the next round, but it makes code more complex. And the last word
44 # is the keyword, i.e. "almost 1/2" is kind of complexity cap [for the
47 # Reduce per-round instruction count in Thumb-2 case by 16%. This is
48 # achieved by folding ldr/str pairs to their double-word counterparts.
49 # Theoretically this should have improved performance on single-issue
50 # cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as
53 ########################################################################
54 # Numbers are cycles per processed byte. Non-NEON results account even
55 # for input bit interleaving.
57 # r=1088(*) Thumb-2(**) NEON
60 # Cortex-A5 88/+160%, 86, 36
61 # Cortex-A7 78/+160%, 68, 34
62 # Cortex-A8 51/+230%, 57, 30
63 # Cortex-A9 53/+210%, 51, 26
64 # Cortex-A15 42/+160%, 38, 18
65 # Snapdragon S4 43/+210%, 38, 24
67 # (*) Corresponds to SHA3-256. Percentage after slash is improvement
68 # over compiler-generated KECCAK_2X reference code.
69 # (**) Thumb-2 results for Cortex-A5/A7 are likely to apply even to
70 # Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
71 # processors are presented mostly for reference purposes.
# Register allocation for the integer (non-NEON) code path: @C names the
# ten GPRs r0-r9 that carry the 32-bit halves of the five 64-bit lanes
# currently being combined, and @E names the scratch quartet r10-r12
# plus r14 (lr).  r13 (sp) is deliberately excluded.
73 my @C = map("r$_",(0..9));
74 my @E = map("r$_",(10..12,14));
76 ########################################################################
78 # ----->+-----------------------+
79 # | uint64_t A[5][5] |
81 # +200->+-----------------------+
84 # +240->+-----------------------+
85 # | uint64_t T[5][5] |
87 # +440->+-----------------------+
89 # +444->+-----------------------+
91 # +448->+-----------------------+
# Byte offsets into the stack frame sketched in the comment above:
# @A and @T are 5x5 tables of 8-byte (uint64_t) slots — A[][] starts at
# sp+0, T[][] at sp+240 — and @D is the five-element D[] row at sp+200
# (indices 25..29 scaled by 8).  These interpolate into the assembly as
# literal [sp,#offset] displacements.
94 my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
95 my @D = map(8*$_, (25..29));
96 my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35,40,45,50));
101 #if defined(__thumb2__)
108 .type iotas32, %object
111 .long 0x00000001, 0x00000000
112 .long 0x00000000, 0x00000089
113 .long 0x00000000, 0x8000008b
114 .long 0x00000000, 0x80008080
115 .long 0x00000001, 0x0000008b
116 .long 0x00000001, 0x00008000
117 .long 0x00000001, 0x80008088
118 .long 0x00000001, 0x80000082
119 .long 0x00000000, 0x0000000b
120 .long 0x00000000, 0x0000000a
121 .long 0x00000001, 0x00008082
122 .long 0x00000000, 0x00008003
123 .long 0x00000001, 0x0000808b
124 .long 0x00000001, 0x8000000b
125 .long 0x00000001, 0x8000008a
126 .long 0x00000001, 0x80000081
127 .long 0x00000000, 0x80000081
128 .long 0x00000000, 0x80000008
129 .long 0x00000000, 0x00000083
130 .long 0x00000000, 0x80008003
131 .long 0x00000001, 0x80008088
132 .long 0x00000000, 0x80000088
133 .long 0x00000001, 0x00008000
134 .long 0x00000000, 0x80008082
135 .size iotas32,.-iotas32
137 .type KeccakF1600_int, %function
140 add @C[9],sp,#$A[4][2]
141 add @E[2],sp,#$A[0][0]
142 add @E[0],sp,#$A[1][0]
143 ldmia @C[9],{@C[4]-@C[9]} @ A[4][2..4]
146 eor @E[1],@E[1],@E[1]
154 my (@A,@R); (@A[0..4],@R) = @_;
157 ldmia @E[2],{@C[0]-@C[3]} @ A[0][0..1]
158 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][0..1]
160 eor @C[0],@C[0],@E[0]
161 eor @C[1],@C[1],@E[1]
162 eor @C[2],@C[2],@E[2]
163 ldrd @E[0],@E[1],[sp,#$A[1][2]]
164 eor @C[3],@C[3],@E[3]
165 ldrd @E[2],@E[3],[sp,#$A[1][3]]
166 eor @C[4],@C[4],@E[0]
167 eor @C[5],@C[5],@E[1]
168 eor @C[6],@C[6],@E[2]
169 ldrd @E[0],@E[1],[sp,#$A[1][4]]
170 eor @C[7],@C[7],@E[3]
171 ldrd @E[2],@E[3],[sp,#$A[2][0]]
172 eor @C[8],@C[8],@E[0]
173 eor @C[9],@C[9],@E[1]
174 eor @C[0],@C[0],@E[2]
175 ldrd @E[0],@E[1],[sp,#$A[2][1]]
176 eor @C[1],@C[1],@E[3]
177 ldrd @E[2],@E[3],[sp,#$A[2][2]]
178 eor @C[2],@C[2],@E[0]
179 eor @C[3],@C[3],@E[1]
180 eor @C[4],@C[4],@E[2]
181 ldrd @E[0],@E[1],[sp,#$A[2][3]]
182 eor @C[5],@C[5],@E[3]
183 ldrd @E[2],@E[3],[sp,#$A[2][4]]
184 eor @C[6],@C[6],@E[0]
185 eor @C[7],@C[7],@E[1]
186 eor @C[8],@C[8],@E[2]
187 ldrd @E[0],@E[1],[sp,#$A[3][0]]
188 eor @C[9],@C[9],@E[3]
189 ldrd @E[2],@E[3],[sp,#$A[3][1]]
190 eor @C[0],@C[0],@E[0]
191 eor @C[1],@C[1],@E[1]
192 eor @C[2],@C[2],@E[2]
193 ldrd @E[0],@E[1],[sp,#$A[3][2]]
194 eor @C[3],@C[3],@E[3]
195 ldrd @E[2],@E[3],[sp,#$A[3][3]]
196 eor @C[4],@C[4],@E[0]
197 eor @C[5],@C[5],@E[1]
198 eor @C[6],@C[6],@E[2]
199 ldrd @E[0],@E[1],[sp,#$A[3][4]]
200 eor @C[7],@C[7],@E[3]
201 ldrd @E[2],@E[3],[sp,#$A[4][0]]
202 eor @C[8],@C[8],@E[0]
203 eor @C[9],@C[9],@E[1]
204 eor @C[0],@C[0],@E[2]
205 ldrd @E[0],@E[1],[sp,#$A[4][1]]
206 eor @C[1],@C[1],@E[3]
207 ldrd @E[2],@E[3],[sp,#$A[0][2]]
208 eor @C[2],@C[2],@E[0]
209 eor @C[3],@C[3],@E[1]
210 eor @C[4],@C[4],@E[2]
211 ldrd @E[0],@E[1],[sp,#$A[0][3]]
212 eor @C[5],@C[5],@E[3]
213 ldrd @E[2],@E[3],[sp,#$A[0][4]]
215 eor @C[0],@C[0],@E[0]
216 add @E[0],sp,#$A[1][2]
217 eor @C[1],@C[1],@E[1]
218 eor @C[2],@C[2],@E[2]
219 eor @C[3],@C[3],@E[3]
220 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][2..3]
221 eor @C[4],@C[4],@E[0]
222 add @E[0],sp,#$A[1][4]
223 eor @C[5],@C[5],@E[1]
224 eor @C[6],@C[6],@E[2]
225 eor @C[7],@C[7],@E[3]
226 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][4]..A[2][0]
227 eor @C[8],@C[8],@E[0]
228 add @E[0],sp,#$A[2][1]
229 eor @C[9],@C[9],@E[1]
230 eor @C[0],@C[0],@E[2]
231 eor @C[1],@C[1],@E[3]
232 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][1..2]
233 eor @C[2],@C[2],@E[0]
234 add @E[0],sp,#$A[2][3]
235 eor @C[3],@C[3],@E[1]
236 eor @C[4],@C[4],@E[2]
237 eor @C[5],@C[5],@E[3]
238 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][3..4]
239 eor @C[6],@C[6],@E[0]
240 add @E[0],sp,#$A[3][0]
241 eor @C[7],@C[7],@E[1]
242 eor @C[8],@C[8],@E[2]
243 eor @C[9],@C[9],@E[3]
244 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][0..1]
245 eor @C[0],@C[0],@E[0]
246 add @E[0],sp,#$A[3][2]
247 eor @C[1],@C[1],@E[1]
248 eor @C[2],@C[2],@E[2]
249 eor @C[3],@C[3],@E[3]
250 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][2..3]
251 eor @C[4],@C[4],@E[0]
252 add @E[0],sp,#$A[3][4]
253 eor @C[5],@C[5],@E[1]
254 eor @C[6],@C[6],@E[2]
255 eor @C[7],@C[7],@E[3]
256 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][4]..A[4][0]
257 eor @C[8],@C[8],@E[0]
258 ldr @E[0],[sp,#$A[4][1]] @ A[4][1]
259 eor @C[9],@C[9],@E[1]
260 ldr @E[1],[sp,#$A[4][1]+4]
261 eor @C[0],@C[0],@E[2]
262 ldr @E[2],[sp,#$A[0][2]] @ A[0][2]
263 eor @C[1],@C[1],@E[3]
264 ldr @E[3],[sp,#$A[0][2]+4]
265 eor @C[2],@C[2],@E[0]
266 add @E[0],sp,#$A[0][3]
267 eor @C[3],@C[3],@E[1]
268 eor @C[4],@C[4],@E[2]
269 eor @C[5],@C[5],@E[3]
270 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[0][3..4]
272 eor @C[6],@C[6],@E[0]
273 eor @C[7],@C[7],@E[1]
274 eor @C[8],@C[8],@E[2]
275 eor @C[9],@C[9],@E[3]
277 eor @E[0],@C[0],@C[5],ror#32-1 @ E[0] = ROL64(C[2], 1) ^ C[0];
278 str.l @E[0],[sp,#$D[1]] @ D[1] = E[0]
279 eor @E[1],@C[1],@C[4]
280 str.h @E[1],[sp,#$D[1]+4]
281 eor @E[2],@C[6],@C[1],ror#32-1 @ E[1] = ROL64(C[0], 1) ^ C[3];
282 eor @E[3],@C[7],@C[0]
283 str.l @E[2],[sp,#$D[4]] @ D[4] = E[1]
284 eor @C[0],@C[8],@C[3],ror#32-1 @ C[0] = ROL64(C[1], 1) ^ C[4];
285 str.h @E[3],[sp,#$D[4]+4]
286 eor @C[1],@C[9],@C[2]
287 str.l @C[0],[sp,#$D[0]] @ D[0] = C[0]
288 eor @C[2],@C[2],@C[7],ror#32-1 @ C[1] = ROL64(C[3], 1) ^ C[1];
289 ldr.l @C[7],[sp,#$A[3][3]]
290 eor @C[3],@C[3],@C[6]
291 str.h @C[1],[sp,#$D[0]+4]
292 ldr.h @C[6],[sp,#$A[3][3]+4]
293 str.l @C[2],[sp,#$D[2]] @ D[2] = C[1]
294 eor @C[4],@C[4],@C[9],ror#32-1 @ C[2] = ROL64(C[4], 1) ^ C[2];
295 str.h @C[3],[sp,#$D[2]+4]
296 eor @C[5],@C[5],@C[8]
298 ldr.l @C[8],[sp,#$A[4][4]]
299 ldr.h @C[9],[sp,#$A[4][4]+4]
300 str.l @C[4],[sp,#$D[3]] @ D[3] = C[2]
301 eor @C[7],@C[7],@C[4]
302 str.h @C[5],[sp,#$D[3]+4]
303 eor @C[6],@C[6],@C[5]
304 ldr.l @C[4],[sp,#$A[0][0]]
305 @ ror @C[7],@C[7],#32-10 @ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
306 @ ror @C[6],@C[6],#32-11
307 ldr.h @C[5],[sp,#$A[0][0]+4]
308 eor @C[8],@C[8],@E[2]
309 eor @C[9],@C[9],@E[3]
310 ldr.l @E[2],[sp,#$A[2][2]]
311 eor @C[0],@C[0],@C[4]
312 ldr.h @E[3],[sp,#$A[2][2]+4]
313 @ ror @C[8],@C[8],#32-7 @ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
314 @ ror @C[9],@C[9],#32-7
315 eor @C[1],@C[1],@C[5] @ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
316 eor @E[2],@E[2],@C[2]
317 ldr.l @C[2],[sp,#$A[1][1]]
318 eor @E[3],@E[3],@C[3]
319 ldr.h @C[3],[sp,#$A[1][1]+4]
320 ror @C[5],@E[2],#32-21 @ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
321 ldr @E[2],[sp,#444] @ load counter
322 eor @C[2],@C[2],@E[0]
324 ror @C[4],@E[3],#32-22
325 add @E[3],@E[0],@E[2]
326 eor @C[3],@C[3],@E[1]
328 $code.=<<___ if ($A[0][0] != $T[0][0]);
329 ldmia @E[3],{@E[0],@E[1]} @ iotas[i]
331 $code.=<<___ if ($A[0][0] == $T[0][0]);
332 ldr.l @E[0],[@E[3],#8] @ iotas[i].lo
334 ldr.h @E[1],[@E[3],#12] @ iotas[i].hi
336 str @E[2],[sp,#444] @ store counter
339 bic @E[2],@C[4],@C[2],ror#32-22
340 bic @E[3],@C[5],@C[3],ror#32-22
341 ror @C[2],@C[2],#32-22 @ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
342 ror @C[3],@C[3],#32-22
343 eor @E[2],@E[2],@C[0]
344 eor @E[3],@E[3],@C[1]
345 eor @E[0],@E[0],@E[2]
346 eor @E[1],@E[1],@E[3]
347 str.l @E[0],[sp,#$R[0][0]] @ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
348 bic @E[2],@C[6],@C[4],ror#11
349 str.h @E[1],[sp,#$R[0][0]+4]
350 bic @E[3],@C[7],@C[5],ror#10
351 bic @E[0],@C[8],@C[6],ror#32-(11-7)
352 bic @E[1],@C[9],@C[7],ror#32-(10-7)
353 eor @E[2],@C[2],@E[2],ror#32-11
354 str.l @E[2],[sp,#$R[0][1]] @ R[0][1] = C[1] ^ (~C[2] & C[3]);
355 eor @E[3],@C[3],@E[3],ror#32-10
356 str.h @E[3],[sp,#$R[0][1]+4]
357 eor @E[0],@C[4],@E[0],ror#32-7
358 eor @E[1],@C[5],@E[1],ror#32-7
359 str.l @E[0],[sp,#$R[0][2]] @ R[0][2] = C[2] ^ (~C[3] & C[4]);
360 bic @E[2],@C[0],@C[8],ror#32-7
361 str.h @E[1],[sp,#$R[0][2]+4]
362 bic @E[3],@C[1],@C[9],ror#32-7
363 eor @E[2],@E[2],@C[6],ror#32-11
364 str.l @E[2],[sp,#$R[0][3]] @ R[0][3] = C[3] ^ (~C[4] & C[0]);
365 eor @E[3],@E[3],@C[7],ror#32-10
366 str.h @E[3],[sp,#$R[0][3]+4]
367 bic @E[0],@C[2],@C[0]
369 ldr.l @C[0],[sp,#$A[0][3]] @ A[0][3]
370 bic @E[1],@C[3],@C[1]
371 ldr.h @C[1],[sp,#$A[0][3]+4]
372 eor @E[0],@E[0],@C[8],ror#32-7
373 eor @E[1],@E[1],@C[9],ror#32-7
374 str.l @E[0],[sp,#$R[0][4]] @ R[0][4] = C[4] ^ (~C[0] & C[1]);
376 str.h @E[1],[sp,#$R[0][4]+4]
378 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[3..4]
379 ldmia @C[9],{@C[6]-@C[9]} @ D[0..1]
381 ldr.l @C[2],[sp,#$A[1][4]] @ A[1][4]
382 eor @C[0],@C[0],@E[0]
383 ldr.h @C[3],[sp,#$A[1][4]+4]
384 eor @C[1],@C[1],@E[1]
385 @ ror @C[0],@C[0],#32-14 @ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
386 ldr.l @E[0],[sp,#$A[3][1]] @ A[3][1]
387 @ ror @C[1],@C[1],#32-14
388 ldr.h @E[1],[sp,#$A[3][1]+4]
390 eor @C[2],@C[2],@E[2]
391 ldr.l @C[4],[sp,#$A[2][0]] @ A[2][0]
392 eor @C[3],@C[3],@E[3]
393 ldr.h @C[5],[sp,#$A[2][0]+4]
394 @ ror @C[2],@C[2],#32-10 @ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
395 @ ror @C[3],@C[3],#32-10
397 eor @C[6],@C[6],@C[4]
398 ldr.l @E[2],[sp,#$D[2]] @ D[2]
399 eor @C[7],@C[7],@C[5]
400 ldr.h @E[3],[sp,#$D[2]+4]
401 ror @C[5],@C[6],#32-1 @ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
402 ror @C[4],@C[7],#32-2
404 eor @E[0],@E[0],@C[8]
405 ldr.l @C[8],[sp,#$A[4][2]] @ A[4][2]
406 eor @E[1],@E[1],@C[9]
407 ldr.h @C[9],[sp,#$A[4][2]+4]
408 ror @C[7],@E[0],#32-22 @ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
409 ror @C[6],@E[1],#32-23
411 bic @E[0],@C[4],@C[2],ror#32-10
412 bic @E[1],@C[5],@C[3],ror#32-10
413 eor @E[2],@E[2],@C[8]
414 eor @E[3],@E[3],@C[9]
415 ror @C[9],@E[2],#32-30 @ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
416 ror @C[8],@E[3],#32-31
417 eor @E[0],@E[0],@C[0],ror#32-14
418 eor @E[1],@E[1],@C[1],ror#32-14
419 str.l @E[0],[sp,#$R[1][0]] @ R[1][0] = C[0] ^ (~C[1] & C[2])
420 bic @E[2],@C[6],@C[4]
421 str.h @E[1],[sp,#$R[1][0]+4]
422 bic @E[3],@C[7],@C[5]
423 eor @E[2],@E[2],@C[2],ror#32-10
424 str.l @E[2],[sp,#$R[1][1]] @ R[1][1] = C[1] ^ (~C[2] & C[3]);
425 eor @E[3],@E[3],@C[3],ror#32-10
426 str.h @E[3],[sp,#$R[1][1]+4]
427 bic @E[0],@C[8],@C[6]
428 bic @E[1],@C[9],@C[7]
429 bic @E[2],@C[0],@C[8],ror#14
430 bic @E[3],@C[1],@C[9],ror#14
431 eor @E[0],@E[0],@C[4]
432 eor @E[1],@E[1],@C[5]
433 str.l @E[0],[sp,#$R[1][2]] @ R[1][2] = C[2] ^ (~C[3] & C[4]);
434 bic @C[2],@C[2],@C[0],ror#32-(14-10)
435 str.h @E[1],[sp,#$R[1][2]+4]
436 eor @E[2],@C[6],@E[2],ror#32-14
437 bic @E[1],@C[3],@C[1],ror#32-(14-10)
438 str.l @E[2],[sp,#$R[1][3]] @ R[1][3] = C[3] ^ (~C[4] & C[0]);
439 eor @E[3],@C[7],@E[3],ror#32-14
440 str.h @E[3],[sp,#$R[1][3]+4]
442 ldr.l @C[1],[sp,#$A[0][1]] @ A[0][1]
443 eor @E[0],@C[8],@C[2],ror#32-10
444 ldr.h @C[0],[sp,#$A[0][1]+4]
445 eor @E[1],@C[9],@E[1],ror#32-10
446 str.l @E[0],[sp,#$R[1][4]] @ R[1][4] = C[4] ^ (~C[0] & C[1]);
447 str.h @E[1],[sp,#$R[1][4]+4]
450 ldmia @E[2],{@E[0]-@E[2],@E[3]} @ D[1..2]
451 ldr.l @C[2],[sp,#$A[1][2]] @ A[1][2]
452 ldr.h @C[3],[sp,#$A[1][2]+4]
453 ldmia @C[9],{@C[6]-@C[9]} @ D[3..4]
455 eor @C[1],@C[1],@E[0]
456 ldr.l @C[4],[sp,#$A[2][3]] @ A[2][3]
457 eor @C[0],@C[0],@E[1]
458 ldr.h @C[5],[sp,#$A[2][3]+4]
459 ror @C[0],@C[0],#32-1 @ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
461 eor @C[2],@C[2],@E[2]
462 ldr.l @E[0],[sp,#$A[3][4]] @ A[3][4]
463 eor @C[3],@C[3],@E[3]
464 ldr.h @E[1],[sp,#$A[3][4]+4]
465 @ ror @C[2],@C[2],#32-3 @ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
466 ldr.l @E[2],[sp,#$D[0]] @ D[0]
467 @ ror @C[3],@C[3],#32-3
468 ldr.h @E[3],[sp,#$D[0]+4]
470 eor @C[4],@C[4],@C[6]
471 eor @C[5],@C[5],@C[7]
472 @ ror @C[5],@C[6],#32-12 @ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
473 @ ror @C[4],@C[7],#32-13 @ [track reverse order below]
475 eor @E[0],@E[0],@C[8]
476 ldr.l @C[8],[sp,#$A[4][0]] @ A[4][0]
477 eor @E[1],@E[1],@C[9]
478 ldr.h @C[9],[sp,#$A[4][0]+4]
479 ror @C[6],@E[0],#32-4 @ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
480 ror @C[7],@E[1],#32-4
482 eor @E[2],@E[2],@C[8]
483 eor @E[3],@E[3],@C[9]
484 ror @C[8],@E[2],#32-9 @ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
485 ror @C[9],@E[3],#32-9
487 bic @E[0],@C[5],@C[2],ror#13-3
488 bic @E[1],@C[4],@C[3],ror#12-3
489 bic @E[2],@C[6],@C[5],ror#32-13
490 bic @E[3],@C[7],@C[4],ror#32-12
491 eor @E[0],@C[0],@E[0],ror#32-13
492 eor @E[1],@C[1],@E[1],ror#32-12
493 str.l @E[0],[sp,#$R[2][0]] @ R[2][0] = C[0] ^ (~C[1] & C[2])
494 eor @E[2],@E[2],@C[2],ror#32-3
495 str.h @E[1],[sp,#$R[2][0]+4]
496 eor @E[3],@E[3],@C[3],ror#32-3
497 str.l @E[2],[sp,#$R[2][1]] @ R[2][1] = C[1] ^ (~C[2] & C[3]);
498 bic @E[0],@C[8],@C[6]
499 bic @E[1],@C[9],@C[7]
500 str.h @E[3],[sp,#$R[2][1]+4]
501 eor @E[0],@E[0],@C[5],ror#32-13
502 eor @E[1],@E[1],@C[4],ror#32-12
503 str.l @E[0],[sp,#$R[2][2]] @ R[2][2] = C[2] ^ (~C[3] & C[4]);
504 bic @E[2],@C[0],@C[8]
505 str.h @E[1],[sp,#$R[2][2]+4]
506 bic @E[3],@C[1],@C[9]
507 eor @E[2],@E[2],@C[6]
508 eor @E[3],@E[3],@C[7]
509 str.l @E[2],[sp,#$R[2][3]] @ R[2][3] = C[3] ^ (~C[4] & C[0]);
510 bic @E[0],@C[2],@C[0],ror#3
511 str.h @E[3],[sp,#$R[2][3]+4]
512 bic @E[1],@C[3],@C[1],ror#3
513 ldr.l @C[1],[sp,#$A[0][4]] @ A[0][4] [in reverse order]
514 eor @E[0],@C[8],@E[0],ror#32-3
515 ldr.h @C[0],[sp,#$A[0][4]+4]
516 eor @E[1],@C[9],@E[1],ror#32-3
517 str.l @E[0],[sp,#$R[2][4]] @ R[2][4] = C[4] ^ (~C[0] & C[1]);
519 str.h @E[1],[sp,#$R[2][4]+4]
521 ldr.l @E[0],[sp,#$D[4]] @ D[4]
522 ldr.h @E[1],[sp,#$D[4]+4]
523 ldr.l @E[2],[sp,#$D[0]] @ D[0]
524 ldr.h @E[3],[sp,#$D[0]+4]
526 ldmia @C[9],{@C[6]-@C[9]} @ D[1..2]
528 eor @C[1],@C[1],@E[0]
529 ldr.l @C[2],[sp,#$A[1][0]] @ A[1][0]
530 eor @C[0],@C[0],@E[1]
531 ldr.h @C[3],[sp,#$A[1][0]+4]
532 @ ror @C[1],@E[0],#32-13 @ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
533 ldr.l @C[4],[sp,#$A[2][1]] @ A[2][1]
534 @ ror @C[0],@E[1],#32-14 @ [was loaded in reverse order]
535 ldr.h @C[5],[sp,#$A[2][1]+4]
537 eor @C[2],@C[2],@E[2]
538 ldr.l @E[0],[sp,#$A[3][2]] @ A[3][2]
539 eor @C[3],@C[3],@E[3]
540 ldr.h @E[1],[sp,#$A[3][2]+4]
541 @ ror @C[2],@C[2],#32-18 @ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
542 ldr.l @E[2],[sp,#$D[3]] @ D[3]
543 @ ror @C[3],@C[3],#32-18
544 ldr.h @E[3],[sp,#$D[3]+4]
546 eor @C[6],@C[6],@C[4]
547 eor @C[7],@C[7],@C[5]
548 ror @C[4],@C[6],#32-5 @ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
549 ror @C[5],@C[7],#32-5
551 eor @E[0],@E[0],@C[8]
552 ldr.l @C[8],[sp,#$A[4][3]] @ A[4][3]
553 eor @E[1],@E[1],@C[9]
554 ldr.h @C[9],[sp,#$A[4][3]+4]
555 ror @C[7],@E[0],#32-7 @ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
556 ror @C[6],@E[1],#32-8
558 eor @E[2],@E[2],@C[8]
559 eor @E[3],@E[3],@C[9]
560 ror @C[8],@E[2],#32-28 @ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
561 ror @C[9],@E[3],#32-28
563 bic @E[0],@C[4],@C[2],ror#32-18
564 bic @E[1],@C[5],@C[3],ror#32-18
565 eor @E[0],@E[0],@C[0],ror#32-14
566 eor @E[1],@E[1],@C[1],ror#32-13
567 str.l @E[0],[sp,#$R[3][0]] @ R[3][0] = C[0] ^ (~C[1] & C[2])
568 bic @E[2],@C[6],@C[4]
569 str.h @E[1],[sp,#$R[3][0]+4]
570 bic @E[3],@C[7],@C[5]
571 eor @E[2],@E[2],@C[2],ror#32-18
572 str.l @E[2],[sp,#$R[3][1]] @ R[3][1] = C[1] ^ (~C[2] & C[3]);
573 eor @E[3],@E[3],@C[3],ror#32-18
574 str.h @E[3],[sp,#$R[3][1]+4]
575 bic @E[0],@C[8],@C[6]
576 bic @E[1],@C[9],@C[7]
577 bic @E[2],@C[0],@C[8],ror#14
578 bic @E[3],@C[1],@C[9],ror#13
579 eor @E[0],@E[0],@C[4]
580 eor @E[1],@E[1],@C[5]
581 str.l @E[0],[sp,#$R[3][2]] @ R[3][2] = C[2] ^ (~C[3] & C[4]);
582 bic @C[2],@C[2],@C[0],ror#18-14
583 str.h @E[1],[sp,#$R[3][2]+4]
584 eor @E[2],@C[6],@E[2],ror#32-14
585 bic @E[1],@C[3],@C[1],ror#18-13
586 eor @E[3],@C[7],@E[3],ror#32-13
587 str.l @E[2],[sp,#$R[3][3]] @ R[3][3] = C[3] ^ (~C[4] & C[0]);
588 str.h @E[3],[sp,#$R[3][3]+4]
590 ldr.l @C[0],[sp,#$A[0][2]] @ A[0][2]
591 eor @E[0],@C[8],@C[2],ror#32-18
592 ldr.h @C[1],[sp,#$A[0][2]+4]
593 eor @E[1],@C[9],@E[1],ror#32-18
594 str.l @E[0],[sp,#$R[3][4]] @ R[3][4] = C[4] ^ (~C[0] & C[1]);
595 str.h @E[1],[sp,#$R[3][4]+4]
597 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[2..3]
598 ldr.l @C[2],[sp,#$A[1][3]] @ A[1][3]
599 ldr.h @C[3],[sp,#$A[1][3]+4]
600 ldr.l @C[6],[sp,#$D[4]] @ D[4]
601 ldr.h @C[7],[sp,#$D[4]+4]
603 eor @C[0],@C[0],@E[0]
604 ldr.l @C[4],[sp,#$A[2][4]] @ A[2][4]
605 eor @C[1],@C[1],@E[1]
606 ldr.h @C[5],[sp,#$A[2][4]+4]
607 @ ror @C[0],@C[0],#32-31 @ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
608 ldr.l @C[8],[sp,#$D[0]] @ D[0]
609 @ ror @C[1],@C[1],#32-31
610 ldr.h @C[9],[sp,#$D[0]+4]
612 eor @E[2],@E[2],@C[2]
613 ldr.l @E[0],[sp,#$A[3][0]] @ A[3][0]
614 eor @E[3],@E[3],@C[3]
615 ldr.h @E[1],[sp,#$A[3][0]+4]
616 ror @C[3],@E[2],#32-27 @ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
617 ldr.l @E[2],[sp,#$D[1]] @ D[1]
618 ror @C[2],@E[3],#32-28
619 ldr.h @E[3],[sp,#$D[1]+4]
621 eor @C[6],@C[6],@C[4]
622 eor @C[7],@C[7],@C[5]
623 ror @C[5],@C[6],#32-19 @ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
624 ror @C[4],@C[7],#32-20
626 eor @E[0],@E[0],@C[8]
627 ldr.l @C[8],[sp,#$A[4][1]] @ A[4][1]
628 eor @E[1],@E[1],@C[9]
629 ldr.h @C[9],[sp,#$A[4][1]+4]
630 ror @C[7],@E[0],#32-20 @ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
631 ror @C[6],@E[1],#32-21
633 eor @C[8],@C[8],@E[2]
634 eor @C[9],@C[9],@E[3]
635 @ ror @C[8],@C[2],#32-1 @ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
636 @ ror @C[9],@C[3],#32-1
638 bic @E[0],@C[4],@C[2]
639 bic @E[1],@C[5],@C[3]
640 eor @E[0],@E[0],@C[0],ror#32-31
641 str.l @E[0],[sp,#$R[4][0]] @ R[4][0] = C[0] ^ (~C[1] & C[2])
642 eor @E[1],@E[1],@C[1],ror#32-31
643 str.h @E[1],[sp,#$R[4][0]+4]
644 bic @E[2],@C[6],@C[4]
645 bic @E[3],@C[7],@C[5]
646 eor @E[2],@E[2],@C[2]
647 eor @E[3],@E[3],@C[3]
648 str.l @E[2],[sp,#$R[4][1]] @ R[4][1] = C[1] ^ (~C[2] & C[3]);
649 bic @E[0],@C[8],@C[6],ror#1
650 str.h @E[3],[sp,#$R[4][1]+4]
651 bic @E[1],@C[9],@C[7],ror#1
652 bic @E[2],@C[0],@C[8],ror#31-1
653 bic @E[3],@C[1],@C[9],ror#31-1
654 eor @C[4],@C[4],@E[0],ror#32-1
655 str.l @C[4],[sp,#$R[4][2]] @ R[4][2] = C[2] ^= (~C[3] & C[4]);
656 eor @C[5],@C[5],@E[1],ror#32-1
657 str.h @C[5],[sp,#$R[4][2]+4]
658 eor @C[6],@C[6],@E[2],ror#32-31
659 eor @C[7],@C[7],@E[3],ror#32-31
660 str.l @C[6],[sp,#$R[4][3]] @ R[4][3] = C[3] ^= (~C[4] & C[0]);
661 bic @E[0],@C[2],@C[0],ror#32-31
662 str.h @C[7],[sp,#$R[4][3]+4]
663 bic @E[1],@C[3],@C[1],ror#32-31
664 add @E[2],sp,#$R[0][0]
665 eor @C[8],@E[0],@C[8],ror#32-1
666 add @E[0],sp,#$R[1][0]
667 eor @C[9],@E[1],@C[9],ror#32-1
668 str.l @C[8],[sp,#$R[4][4]] @ R[4][4] = C[4] ^= (~C[0] & C[1]);
669 str.h @C[9],[sp,#$R[4][4]+4]
678 .size KeccakF1600_int,.-KeccakF1600_int
680 .type KeccakF1600, %function
683 stmdb sp!,{r0,r4-r11,lr}
684 sub sp,sp,#440+16 @ space for A[5][5],D[5],T[5][5],...
686 add @E[0],r0,#$A[1][0]
687 add @E[1],sp,#$A[1][0]
688 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
689 stmia sp, {@C[0]-@C[9]}
690 ldmia @E[0]!,{@C[0]-@C[9]}
691 stmia @E[1]!,{@C[0]-@C[9]}
692 ldmia @E[0]!,{@C[0]-@C[9]}
693 stmia @E[1]!,{@C[0]-@C[9]}
694 ldmia @E[0]!,{@C[0]-@C[9]}
695 stmia @E[1]!,{@C[0]-@C[9]}
696 ldmia @E[0], {@C[0]-@C[9]}
697 add @E[2],sp,#$A[0][0]
698 add @E[0],sp,#$A[1][0]
699 stmia @E[1], {@C[0]-@C[9]}
703 ldr @E[1], [sp,#440+16] @ restore pointer to A
704 ldmia sp, {@C[0]-@C[9]}
705 stmia @E[1]!,{@C[0]-@C[9]} @ return A[5][5]
706 ldmia @E[0]!,{@C[0]-@C[9]}
707 stmia @E[1]!,{@C[0]-@C[9]}
708 ldmia @E[0]!,{@C[0]-@C[9]}
709 stmia @E[1]!,{@C[0]-@C[9]}
710 ldmia @E[0]!,{@C[0]-@C[9]}
711 stmia @E[1]!,{@C[0]-@C[9]}
712 ldmia @E[0], {@C[0]-@C[9]}
713 stmia @E[1], {@C[0]-@C[9]}
716 ldmia sp!,{r4-r11,pc}
717 .size KeccakF1600,.-KeccakF1600
719 { my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));
721 ########################################################################
723 # ----->+-----------------------+
724 # | uint64_t A[5][5] |
727 # +456->+-----------------------+
729 # +460->+-----------------------+
731 # +464->+-----------------------+
733 # +468->+-----------------------+
735 # +472->+-----------------------+
737 # +476->+-----------------------+
738 # | const void *inp |
739 # +480->+-----------------------+
741 # +484->+-----------------------+
743 # +488->+-----------------------+
748 .type SHA3_absorb,%function
751 stmdb sp!,{r0-r12,lr}
754 add $A_flat,r0,#$A[1][0]
762 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
763 stmia $inp!, {@C[0]-@C[9]}
764 ldmia $A_flat!,{@C[0]-@C[9]}
765 stmia $inp!, {@C[0]-@C[9]}
766 ldmia $A_flat!,{@C[0]-@C[9]}
767 stmia $inp!, {@C[0]-@C[9]}
768 ldmia $A_flat!,{@C[0]-@C[9]}
769 stmia $inp!, {@C[0]-@C[9]}
770 ldmia $A_flat!,{@C[0]-@C[9]}
771 stmia $inp, {@C[0]-@C[9]}
773 ldr $inp,[sp,#476] @ restore $inp
780 mov r6,#0x11 @ compose constants
785 orr r6,r6,r6,lsl#16 @ 0x11111111
786 orr r9,r9,r9,lsl#16 @ 0x00ff00ff
787 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
788 orr r7,r6,r6,lsl#1 @ 0x33333333
789 orr r6,r6,r6,lsl#2 @ 0x55555555
802 str r0,[sp,#480] @ save len - bsz
815 orr r0,r0,r3,lsl#24 @ lo
819 orr r1,r1,r3,lsl#24 @ hi
821 and r2,r0,r6 @ &=0x55555555
822 and r0,r0,r6,lsl#1 @ &=0xaaaaaaaa
823 and r3,r1,r6 @ &=0x55555555
824 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
829 and r2,r2,r7 @ &=0x33333333
830 and r0,r0,r7,lsl#2 @ &=0xcccccccc
831 and r3,r3,r7 @ &=0x33333333
832 and r1,r1,r7,lsl#2 @ &=0xcccccccc
837 and r2,r2,r8 @ &=0x0f0f0f0f
838 and r0,r0,r8,lsl#4 @ &=0xf0f0f0f0
839 and r3,r3,r8 @ &=0x0f0f0f0f
840 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
841 ldmia $A_flat,{r4-r5} @ A_flat[i]
846 and r2,r2,r9 @ &=0x00ff00ff
847 and r0,r0,r9,lsl#8 @ &=0xff00ff00
848 and r3,r3,r9 @ &=0x00ff00ff
849 and r1,r1,r9,lsl#8 @ &=0xff00ff00
861 stmia $A_flat!,{r4-r5} @ A_flat[i++] ^= BitInterleave(inp[0..7])
871 ldmia r14,{r6-r12,r14} @ restore constants and variables
876 add $inp,sp,#$A[1][0]
877 ldmia sp, {@C[0]-@C[9]}
878 stmia $A_flat!,{@C[0]-@C[9]} @ return A[5][5]
879 ldmia $inp!, {@C[0]-@C[9]}
880 stmia $A_flat!,{@C[0]-@C[9]}
881 ldmia $inp!, {@C[0]-@C[9]}
882 stmia $A_flat!,{@C[0]-@C[9]}
883 ldmia $inp!, {@C[0]-@C[9]}
884 stmia $A_flat!,{@C[0]-@C[9]}
885 ldmia $inp, {@C[0]-@C[9]}
886 stmia $A_flat, {@C[0]-@C[9]}
890 mov r0,$len @ return value
891 ldmia sp!,{r4-r12,pc}
892 .size SHA3_absorb,.-SHA3_absorb
895 { my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12));
899 .type SHA3_squeeze,%function
902 stmdb sp!,{r0,r3-r10,lr}
915 mov r6,#0x11 @ compose constants
920 orr r6,r6,r6,lsl#16 @ 0x11111111
921 orr r9,r9,r9,lsl#16 @ 0x00ff00ff
922 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
923 orr r7,r6,r6,lsl#1 @ 0x33333333
924 orr r6,r6,r6,lsl#2 @ 0x55555555
933 ldmia $A_flat!,{r0,r1} @ A_flat[i++]
936 lsl r3,r1,#16 @ r3 = r1 << 16
937 lsr r2,r2,#16 @ r2 = r0 & 0x0000ffff
939 lsr r0,r0,#16 @ r0 = r0 >> 16
940 lsl r1,r1,#16 @ r1 = r1 & 0xffff0000
946 and r2,r2,r9 @ &=0x00ff00ff
947 and r3,r3,r9,lsl#8 @ &=0xff00ff00
948 and r0,r0,r9 @ &=0x00ff00ff
949 and r1,r1,r9,lsl#8 @ &=0xff00ff00
954 and r2,r2,r8 @ &=0x0f0f0f0f
955 and r3,r3,r8,lsl#4 @ &=0xf0f0f0f0
956 and r0,r0,r8 @ &=0x0f0f0f0f
957 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
962 and r2,r2,r7 @ &=0x33333333
963 and r3,r3,r7,lsl#2 @ &=0xcccccccc
964 and r0,r0,r7 @ &=0x33333333
965 and r1,r1,r7,lsl#2 @ &=0xcccccccc
970 and r2,r2,r6 @ &=0x55555555
971 and r3,r3,r6,lsl#1 @ &=0xaaaaaaaa
972 and r0,r0,r6 @ &=0x55555555
973 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
998 subs $bsz,$bsz,#8 @ bsz -= 8
1001 mov r0,r14 @ original $A_flat
1005 ldmia sp,{r6-r10,r12} @ restore constants and variables
1041 ldmia sp!,{r4-r10,pc}
1042 .size SHA3_squeeze,.-SHA3_squeeze
1049 .type iotas64, %object
1052 .quad 0x0000000000000001
1053 .quad 0x0000000000008082
1054 .quad 0x800000000000808a
1055 .quad 0x8000000080008000
1056 .quad 0x000000000000808b
1057 .quad 0x0000000080000001
1058 .quad 0x8000000080008081
1059 .quad 0x8000000000008009
1060 .quad 0x000000000000008a
1061 .quad 0x0000000000000088
1062 .quad 0x0000000080008009
1063 .quad 0x000000008000000a
1064 .quad 0x000000008000808b
1065 .quad 0x800000000000008b
1066 .quad 0x8000000000008089
1067 .quad 0x8000000000008003
1068 .quad 0x8000000000008002
1069 .quad 0x8000000000000080
1070 .quad 0x000000000000800a
1071 .quad 0x800000008000000a
1072 .quad 0x8000000080008081
1073 .quad 0x8000000000008080
1074 .quad 0x0000000080000001
1075 .quad 0x8000000080008008
1076 .size iotas64,.-iotas64
1078 .type KeccakF1600_neon, %function
1083 mov r3, #24 @ loop counter
1089 vst1.64 {q4}, [r0:64] @ offload A[0..1][4]
1090 veor q13, q0, q5 @ A[0..1][0]^A[2..3][0]
1091 vst1.64 {d18}, [r1:64] @ offload A[2][4]
1092 veor q14, q1, q6 @ A[0..1][1]^A[2..3][1]
1093 veor q15, q2, q7 @ A[0..1][2]^A[2..3][2]
1094 veor d26, d26, d27 @ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
1095 veor d27, d28, d29 @ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
1096 veor q14, q3, q8 @ A[0..1][3]^A[2..3][3]
1097 veor q4, q4, q9 @ A[0..1][4]^A[2..3][4]
1098 veor d30, d30, d31 @ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
1099 veor d31, d28, d29 @ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
1100 veor d25, d8, d9 @ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
1101 veor q13, q13, q10 @ C[0..1]^=A[4][0..1]
1102 veor q14, q15, q11 @ C[2..3]^=A[4][2..3]
1103 veor d25, d25, d24 @ C[4]^=A[4][4]
1105 vadd.u64 q4, q13, q13 @ C[0..1]<<1
1106 vadd.u64 q15, q14, q14 @ C[2..3]<<1
1107 vadd.u64 d18, d25, d25 @ C[4]<<1
1108 vsri.u64 q4, q13, #63 @ ROL64(C[0..1],1)
1109 vsri.u64 q15, q14, #63 @ ROL64(C[2..3],1)
1110 vsri.u64 d18, d25, #63 @ ROL64(C[4],1)
1111 veor d25, d25, d9 @ D[0] = C[4] ^= ROL64(C[1],1)
1112 veor q13, q13, q15 @ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
1113 veor d28, d28, d18 @ D[3] = C[2] ^= ROL64(C[4],1)
1114 veor d29, d29, d8 @ D[4] = C[3] ^= ROL64(C[0],1)
1116 veor d0, d0, d25 @ A[0][0] ^= C[4]
1117 veor d1, d1, d25 @ A[1][0] ^= C[4]
1118 veor d10, d10, d25 @ A[2][0] ^= C[4]
1119 veor d11, d11, d25 @ A[3][0] ^= C[4]
1120 veor d20, d20, d25 @ A[4][0] ^= C[4]
1122 veor d2, d2, d26 @ A[0][1] ^= D[1]
1123 veor d3, d3, d26 @ A[1][1] ^= D[1]
1124 veor d12, d12, d26 @ A[2][1] ^= D[1]
1125 veor d13, d13, d26 @ A[3][1] ^= D[1]
1126 veor d21, d21, d26 @ A[4][1] ^= D[1]
1129 veor d6, d6, d28 @ A[0][3] ^= C[2]
1130 veor d7, d7, d28 @ A[1][3] ^= C[2]
1131 veor d16, d16, d28 @ A[2][3] ^= C[2]
1132 veor d17, d17, d28 @ A[3][3] ^= C[2]
1133 veor d23, d23, d28 @ A[4][3] ^= C[2]
1134 vld1.64 {q4}, [r0:64] @ restore A[0..1][4]
1137 vld1.64 {d18}, [r1:64] @ restore A[2][4]
1138 veor q2, q2, q13 @ A[0..1][2] ^= D[2]
1139 veor q7, q7, q13 @ A[2..3][2] ^= D[2]
1140 veor d22, d22, d27 @ A[4][2] ^= D[2]
1142 veor q4, q4, q14 @ A[0..1][4] ^= C[3]
1143 veor q9, q9, q14 @ A[2..3][4] ^= C[3]
1144 veor d24, d24, d29 @ A[4][4] ^= C[3]
@ Rho + Pi steps of Keccak-f[1600], NEON path (KECCAK_1X_ALT register
@ layout, see file header).  A 64-bit lane rotation ROL64(x,n) is
@ synthesized from two instructions: vshl.u64 by n into the destination
@ followed by vsri.u64 (shift-right-insert) by 64-n, which fills in the
@ vacated low bits.  Where the rotation amount is a multiple of 8 bits,
@ a single vext.8 byte-rotate is used instead -- the replaced vshl is
@ left in place, commented out, to document the rotation amount.
@ NOTE(review): this is the middle of KeccakF1600_neon; the Theta step
@ that precedes it is outside this view.
@
@ Row A[0] is parked in C[1..4] (d26-d29) first, because its registers
@ are overwritten as destinations before the originals are consumed.
1147 vmov d26, d2 @ C[1] = A[0][1]
1148 vshl.u64 d2, d3, #44
1149 vmov d27, d4 @ C[2] = A[0][2]
1150 vshl.u64 d4, d14, #43
1151 vmov d28, d6 @ C[3] = A[0][3]
1152 vshl.u64 d6, d17, #21
1153 vmov d29, d8 @ C[4] = A[0][4]
1154 vshl.u64 d8, d24, #14
1155 vsri.u64 d2, d3, #64-44 @ A[0][1] = ROL64(A[1][1], rhotates[1][1])
1156 vsri.u64 d4, d14, #64-43 @ A[0][2] = ROL64(A[2][2], rhotates[2][2])
1157 vsri.u64 d6, d17, #64-21 @ A[0][3] = ROL64(A[3][3], rhotates[3][3])
1158 vsri.u64 d8, d24, #64-14 @ A[0][4] = ROL64(A[4][4], rhotates[4][4])
1160 vshl.u64 d3, d9, #20
1161 vshl.u64 d14, d16, #25
1162 vshl.u64 d17, d15, #15
1163 vshl.u64 d24, d21, #2
1164 vsri.u64 d3, d9, #64-20 @ A[1][1] = ROL64(A[1][4], rhotates[1][4])
1165 vsri.u64 d14, d16, #64-25 @ A[2][2] = ROL64(A[2][3], rhotates[2][3])
1166 vsri.u64 d17, d15, #64-15 @ A[3][3] = ROL64(A[3][2], rhotates[3][2])
1167 vsri.u64 d24, d21, #64-2 @ A[4][4] = ROL64(A[4][1], rhotates[4][1])
1169 vshl.u64 d9, d22, #61
@ rotation by 8 is a whole byte: done below with one vext.8 instead
1170 @ vshl.u64 d16, d19, #8
1171 vshl.u64 d15, d12, #10
1172 vshl.u64 d21, d7, #55
1173 vsri.u64 d9, d22, #64-61 @ A[1][4] = ROL64(A[4][2], rhotates[4][2])
1174 vext.8 d16, d19, d19, #8-1 @ A[2][3] = ROL64(A[3][4], rhotates[3][4])
1175 vsri.u64 d15, d12, #64-10 @ A[3][2] = ROL64(A[2][1], rhotates[2][1])
1176 vsri.u64 d21, d7, #64-55 @ A[4][1] = ROL64(A[1][3], rhotates[1][3])
1178 vshl.u64 d22, d18, #39
@ rotation by 56 is 7 whole bytes: done below with one vext.8 instead
1179 @ vshl.u64 d19, d23, #56
1180 vshl.u64 d12, d5, #6
1181 vshl.u64 d7, d13, #45
1182 vsri.u64 d22, d18, #64-39 @ A[4][2] = ROL64(A[2][4], rhotates[2][4])
1183 vext.8 d19, d23, d23, #8-7 @ A[3][4] = ROL64(A[4][3], rhotates[4][3])
1184 vsri.u64 d12, d5, #64-6 @ A[2][1] = ROL64(A[1][2], rhotates[1][2])
1185 vsri.u64 d7, d13, #64-45 @ A[1][3] = ROL64(A[3][1], rhotates[3][1])
1187 vshl.u64 d18, d20, #18
1188 vshl.u64 d23, d11, #41
1189 vshl.u64 d5, d10, #3
1190 vshl.u64 d13, d1, #36
1191 vsri.u64 d18, d20, #64-18 @ A[2][4] = ROL64(A[4][0], rhotates[4][0])
1192 vsri.u64 d23, d11, #64-41 @ A[4][3] = ROL64(A[3][0], rhotates[3][0])
1193 vsri.u64 d5, d10, #64-3 @ A[1][2] = ROL64(A[2][0], rhotates[2][0])
1194 vsri.u64 d13, d1, #64-36 @ A[3][1] = ROL64(A[1][0], rhotates[1][0])
@ finally consume the parked copies of row A[0] held in C[1..4]
1196 vshl.u64 d1, d28, #28
1197 vshl.u64 d10, d26, #1
1198 vshl.u64 d11, d29, #27
1199 vshl.u64 d20, d27, #62
1200 vsri.u64 d1, d28, #64-28 @ A[1][0] = ROL64(C[3], rhotates[0][3])
1201 vsri.u64 d10, d26, #64-1 @ A[2][0] = ROL64(C[1], rhotates[0][1])
1202 vsri.u64 d11, d29, #64-27 @ A[3][0] = ROL64(C[4], rhotates[0][4])
1203 vsri.u64 d20, d27, #64-62 @ A[4][0] = ROL64(C[2], rhotates[0][2])
@ Chi + Iota steps.  Rows A[0..1] and A[2..3] are processed two at a
@ time in 128-bit q registers (each q holds the same column of two
@ rows); the odd fifth row A[4] is handled separately in d registers.
@ NOTE(review): the vbic instructions that compute the (not x) AND y
@ terms into q13/q14/q15 (and d25-d27 for row 4) are elided from this
@ view -- only the final veor merges are visible here.
1209 veor q13, q13, q0 @ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
1210 veor q14, q14, q1 @ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
1211 veor q2, q2, q15 @ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
@ column 0 result is spilled to scratch memory at r0 and reloaded
@ below, freeing q13 for further temporaries
1212 vst1.64 {q13}, [r0:64] @ offload A[0..1][0]
1215 vmov q1, q14 @ A[0..1][1]
1216 veor q3, q3, q13 @ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
1217 veor q4, q4, q15 @ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])
1220 vmov q0, q5 @ A[2..3][0]
1222 vmov q15, q6 @ A[2..3][1]
1223 veor q5, q5, q13 @ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
1225 veor q6, q6, q14 @ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
1227 veor q7, q7, q13 @ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
1229 veor q8, q8, q14 @ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
1230 vmov q14, q10 @ A[4][0..1]
1231 veor q9, q9, q13 @ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])
@ r2 walks the round-constant table; post-increment advances to the
@ next round's constant
1233 vld1.64 d25, [r2:64]! @ Iota[i++]
1236 vld1.64 {q0}, [r0:64] @ restore A[0..1][0]
1237 veor d20, d20, d26 @ A[4][0] ^= (~A[4][1] & A[4][2])
1239 veor d21, d21, d27 @ A[4][1] ^= (~A[4][2] & A[4][3])
1241 veor d22, d22, d26 @ A[4][2] ^= (~A[4][3] & A[4][4])
1243 veor d23, d23, d27 @ A[4][3] ^= (~A[4][4] & A[4][0])
@ Iota: only lane A[0][0] absorbs the round constant
1244 veor d0, d0, d25 @ A[0][0] ^= Iota[i]
1245 veor d24, d24, d26 @ A[4][4] ^= (~A[4][0] & A[4][1])
1251 .size KeccakF1600_neon,.-KeccakF1600_neon
@ SHA3_absorb_neon: absorb input into the Keccak state, NEON path.
@ NOTE(review): register roles inferred from visible uses only -- r0 is
@ the flat 25x8-byte state A_flat, r4 walks the input, r5 holds the
@ remaining length and r6 the block size bsz (TODO confirm against the
@ elided prologue moves).  The permutation calls, branch targets and
@ per-lane "enough input left?" guards between the veor lines are
@ elided from this view.
1253 .global SHA3_absorb_neon
1254 .type SHA3_absorb_neon, %function
@ AAPCS: preserve callee-saved core regs and NEON d8-d15
1257 stmdb sp!, {r4-r6,lr}
1258 vstmdb sp!, {d8-d15}
@ Load the 25 lanes.  Layout (KECCAK_1X_ALT): rows 0 and 1 interleave
@ into d0-d9 (row 0 in even, row 1 in odd regs), rows 2 and 3 into
@ d10-d19 likewise, row 4 contiguously in d20-d24.
1264 vld1.32 {d0}, [r0:64]! @ A[0][0]
1265 vld1.32 {d2}, [r0:64]! @ A[0][1]
1266 vld1.32 {d4}, [r0:64]! @ A[0][2]
1267 vld1.32 {d6}, [r0:64]! @ A[0][3]
1268 vld1.32 {d8}, [r0:64]! @ A[0][4]
1270 vld1.32 {d1}, [r0:64]! @ A[1][0]
1271 vld1.32 {d3}, [r0:64]! @ A[1][1]
1272 vld1.32 {d5}, [r0:64]! @ A[1][2]
1273 vld1.32 {d7}, [r0:64]! @ A[1][3]
1274 vld1.32 {d9}, [r0:64]! @ A[1][4]
1276 vld1.32 {d10}, [r0:64]! @ A[2][0]
1277 vld1.32 {d12}, [r0:64]! @ A[2][1]
1278 vld1.32 {d14}, [r0:64]! @ A[2][2]
1279 vld1.32 {d16}, [r0:64]! @ A[2][3]
1280 vld1.32 {d18}, [r0:64]! @ A[2][4]
1282 vld1.32 {d11}, [r0:64]! @ A[3][0]
1283 vld1.32 {d13}, [r0:64]! @ A[3][1]
1284 vld1.32 {d15}, [r0:64]! @ A[3][2]
1285 vld1.32 {d17}, [r0:64]! @ A[3][3]
1286 vld1.32 {d19}, [r0:64]! @ A[3][4]
1288 vld1.32 {d20-d23}, [r0:64]! @ A[4][0..3]
1289 vld1.32 {d24}, [r0:64] @ A[4][4]
1290 sub r0, r0, #24*8 @ rewind
@ absorb loop head: stop when less than one full block remains
1295 subs r12, r5, r6 @ len - bsz
@ vld1.8 reads bytes in memory order regardless of CPU endianness,
@ hence "endian-neutral"; each 8-byte chunk is xored into one lane
1299 vld1.8 {d31}, [r4]! @ endian-neutral loads...
1301 veor d0, d0, d31 @ A[0][0] ^= *inp++
1304 veor d2, d2, d31 @ A[0][1] ^= *inp++
1308 veor d4, d4, d31 @ A[0][2] ^= *inp++
1311 veor d6, d6, d31 @ A[0][3] ^= *inp++
1315 veor d8, d8, d31 @ A[0][4] ^= *inp++
1319 veor d1, d1, d31 @ A[1][0] ^= *inp++
1323 veor d3, d3, d31 @ A[1][1] ^= *inp++
1326 veor d5, d5, d31 @ A[1][2] ^= *inp++
1330 veor d7, d7, d31 @ A[1][3] ^= *inp++
1333 veor d9, d9, d31 @ A[1][4] ^= *inp++
1338 veor d10, d10, d31 @ A[2][0] ^= *inp++
1341 veor d12, d12, d31 @ A[2][1] ^= *inp++
1345 veor d14, d14, d31 @ A[2][2] ^= *inp++
1348 veor d16, d16, d31 @ A[2][3] ^= *inp++
1352 veor d18, d18, d31 @ A[2][4] ^= *inp++
1356 veor d11, d11, d31 @ A[3][0] ^= *inp++
1360 veor d13, d13, d31 @ A[3][1] ^= *inp++
1363 veor d15, d15, d31 @ A[3][2] ^= *inp++
1367 veor d17, d17, d31 @ A[3][3] ^= *inp++
1370 veor d19, d19, d31 @ A[3][4] ^= *inp++
1375 veor d20, d20, d31 @ A[4][0] ^= *inp++
1378 veor d21, d21, d31 @ A[4][1] ^= *inp++
1382 veor d22, d22, d31 @ A[4][2] ^= *inp++
1385 veor d23, d23, d31 @ A[4][3] ^= *inp++
1388 veor d24, d24, d31 @ A[4][4] ^= *inp++
@ write the (permuted) state back to memory in the same lane order it
@ was loaded in
1396 vst1.32 {d0}, [r0:64]! @ A[0][0..4]
1397 vst1.32 {d2}, [r0:64]!
1398 vst1.32 {d4}, [r0:64]!
1399 vst1.32 {d6}, [r0:64]!
1400 vst1.32 {d8}, [r0:64]!
1402 vst1.32 {d1}, [r0:64]! @ A[1][0..4]
1403 vst1.32 {d3}, [r0:64]!
1404 vst1.32 {d5}, [r0:64]!
1405 vst1.32 {d7}, [r0:64]!
1406 vst1.32 {d9}, [r0:64]!
1408 vst1.32 {d10}, [r0:64]! @ A[2][0..4]
1409 vst1.32 {d12}, [r0:64]!
1410 vst1.32 {d14}, [r0:64]!
1411 vst1.32 {d16}, [r0:64]!
1412 vst1.32 {d18}, [r0:64]!
1414 vst1.32 {d11}, [r0:64]! @ A[3][0..4]
1415 vst1.32 {d13}, [r0:64]!
1416 vst1.32 {d15}, [r0:64]!
1417 vst1.32 {d17}, [r0:64]!
1418 vst1.32 {d19}, [r0:64]!
1420 vst1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1421 vst1.32 {d24}, [r0:64]
@ return the residual byte count left in r5 (bytes not absorbed)
1423 mov r0, r5 @ return value
1424 vldmia sp!, {d8-d15}
1425 ldmia sp!, {r4-r6,pc}
1426 .size SHA3_absorb_neon,.-SHA3_absorb_neon
@ SHA3_squeeze_neon: emit output from the Keccak state, NEON path.
@ NOTE(review): register roles inferred from visible uses only -- r0 is
@ A_flat, r4 the output pointer, r5 the remaining output length, r14 a
@ countdown of bytes left in the current block (TODO confirm against
@ the elided prologue).  The call to KeccakF1600_neon between the
@ state reload and the state writeback, the loop labels, and the tail
@ byte-shuffling loads/shifts are elided from this view.
1428 .global SHA3_squeeze_neon
1429 .type SHA3_squeeze_neon, %function
1432 stmdb sp!, {r4-r6,lr}
1437 mov r12, r0 @ A_flat
1439 b .Loop_squeeze_neon
@ main loop: copy one 8-byte lane per iteration; fewer than 8 bytes
@ wanted falls through to the byte-at-a-time tail
1444 blo .Lsqueeze_neon_tail
1445 vld1.32 {d0}, [r12]!
@ vst1.8 writes bytes in memory order regardless of CPU endianness
1446 vst1.8 {d0}, [r4]! @ endian-neutral store
1448 subs r5, r5, #8 @ len -= 8
1449 beq .Lsqueeze_neon_done
1451 subs r14, r14, #8 @ bsz -= 8
1452 bhi .Loop_squeeze_neon
@ block exhausted: bring the state back into d0-d24 and run the
@ permutation before squeezing more (d8-d15 are callee-saved, hence
@ the extra spill around this stretch)
1454 vstmdb sp!, {d8-d15}
1456 vld1.32 {d0}, [r0:64]! @ A[0][0..4]
1457 vld1.32 {d2}, [r0:64]!
1458 vld1.32 {d4}, [r0:64]!
1459 vld1.32 {d6}, [r0:64]!
1460 vld1.32 {d8}, [r0:64]!
1462 vld1.32 {d1}, [r0:64]! @ A[1][0..4]
1463 vld1.32 {d3}, [r0:64]!
1464 vld1.32 {d5}, [r0:64]!
1465 vld1.32 {d7}, [r0:64]!
1466 vld1.32 {d9}, [r0:64]!
1468 vld1.32 {d10}, [r0:64]! @ A[2][0..4]
1469 vld1.32 {d12}, [r0:64]!
1470 vld1.32 {d14}, [r0:64]!
1471 vld1.32 {d16}, [r0:64]!
1472 vld1.32 {d18}, [r0:64]!
1474 vld1.32 {d11}, [r0:64]! @ A[3][0..4]
1475 vld1.32 {d13}, [r0:64]!
1476 vld1.32 {d15}, [r0:64]!
1477 vld1.32 {d17}, [r0:64]!
1478 vld1.32 {d19}, [r0:64]!
1480 vld1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1481 vld1.32 {d24}, [r0:64]
1482 sub r0, r0, #24*8 @ rewind
@ reset the read cursor to the start of the refreshed state and write
@ the permuted lanes back out
1486 mov r12, r0 @ A_flat
1487 vst1.32 {d0}, [r0:64]! @ A[0][0..4]
1488 vst1.32 {d2}, [r0:64]!
1489 vst1.32 {d4}, [r0:64]!
1490 vst1.32 {d6}, [r0:64]!
1491 vst1.32 {d8}, [r0:64]!
1493 vst1.32 {d1}, [r0:64]! @ A[1][0..4]
1494 vst1.32 {d3}, [r0:64]!
1495 vst1.32 {d5}, [r0:64]!
1496 vst1.32 {d7}, [r0:64]!
1497 vst1.32 {d9}, [r0:64]!
1499 vst1.32 {d10}, [r0:64]! @ A[2][0..4]
1500 vst1.32 {d12}, [r0:64]!
1501 vst1.32 {d14}, [r0:64]!
1502 vst1.32 {d16}, [r0:64]!
1503 vst1.32 {d18}, [r0:64]!
1505 vst1.32 {d11}, [r0:64]! @ A[3][0..4]
1506 vst1.32 {d13}, [r0:64]!
1507 vst1.32 {d15}, [r0:64]!
1508 vst1.32 {d17}, [r0:64]!
1509 vst1.32 {d19}, [r0:64]!
1511 vst1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1513 vst1.32 {d24}, [r0:64]
1514 mov r0, r12 @ rewind
1516 vldmia sp!, {d8-d15}
1517 b .Loop_squeeze_neon
@ tail: 1..7 remaining bytes are emitted one strb at a time; the loads
@ and shifts feeding r2 are elided from this view
1520 .Lsqueeze_neon_tail:
1523 strb r2, [r4],#1 @ endian-neutral store
1525 blo .Lsqueeze_neon_done
1528 beq .Lsqueeze_neon_done
1532 blo .Lsqueeze_neon_done
1534 beq .Lsqueeze_neon_done
1539 blo .Lsqueeze_neon_done
1542 beq .Lsqueeze_neon_done
1545 .Lsqueeze_neon_done:
1546 ldmia sp!, {r4-r6,pc}
1547 .size SHA3_squeeze_neon,.-SHA3_squeeze_neon
1548 .asciz "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
# Body of the ldrd() post-processing helper (the "sub ldrd {" header
# and the surrounding if/else are outside this view -- incomplete).
# It rewrites the generator's "ldr.l/.h" / "str.l/.h" pseudo-ops:
# on classic ARM they pair into a single ldrd/strd (guarded out of
# Thumb-2 builds by the #ifndef), otherwise they stay as two plain
# ldr/str instructions.
1556 my ($mnemonic,$half,$reg,$ea) = @_;
# pick the lookup table matching the mnemonic; %ldr / %str presumably
# record the partner register and effective address of the pending
# half -- defined outside this view, TODO confirm
1557 my $op = $mnemonic eq "ldr" ? \%ldr : \%str;
# branch 1 (condition elided): emit only the non-Thumb-2 guarded form
1562 sprintf "#ifndef __thumb2__\n" .
1564 "#endif", $mnemonic,$reg,$ea;
# branch 2 (condition elided): emit the guarded single instruction
# plus the paired ldrd/strd built from the recorded partner half
1566 sprintf "#ifndef __thumb2__\n" .
1569 " %sd\t%s,%s,%s\n" .
1570 "#endif", $mnemonic,$reg,$ea,
1571 $mnemonic,$$op{reg},$reg,$$op{ea};
# Final emit stage: route STDOUT to the requested output file, then
# post-process the generated assembly in $code line by line.
# NOTE(review): 2-arg unchecked open; 3-arg "open STDOUT, '>', $output
# or die" would be safer -- left as-is because this span is elided
# (the loop's closing brace and the print are outside this view).
1577 open STDOUT,">$output";
1579 foreach (split($/,$code)) {
# expand `...` escapes by evaluating them as Perl (constant folding)
1580 s/\`([^\`]*)\`/eval $1/ge;
# rewrite ldr.l/ldr.h (and str.*) pseudo-ops via the ldrd() helper;
# the "or" chain applies at most one of the three rewrites per line
1582 s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge or
1583 s/\bret\b/bx lr/g or
1584 s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4
1589 close STDOUT; # enforce flush