2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for ARMv4.
20 # Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit
21 # interleaving. How does it compare to Keccak Code Package? It's as
22 # fast, but several times smaller, and is endian- and ISA-neutral. ISA
23 # neutrality means that minimum ISA requirement is ARMv4, yet it can
24 # be assembled even as Thumb-2. NEON code path is KECCAK_1X_ALT with
25 # register layout taken from Keccak Code Package. It's also as fast,
26 # in fact faster by 10-15% on some processors, and endian-neutral.
28 ########################################################################
29 # Numbers are cycles per processed byte. Non-NEON results account even
30 # for input bit interleaving.
32 # r=1600(*),NEON r=1088(**),NEON
34 # Cortex-A5 67/+130%, 24 96, 36
35 # Cortex-A7 60/+90%, 23 87, 34
36 # Cortex-A8 39/+220%, 20 56, 30
37 # Cortex-A9 41/+160%, 17 58, 26
38 # Cortex-A15 30/+65%, 12 41, 18
39 # Snapdragon S4 35/+120%, 16 50, 24
41 # (*) Not used in real life, meaningful as estimate for single absorb
42 # operation performance. Percentage after slash is improvement
43 # over compiler-generated KECCAK_1X reference code.
44 # (**) Corresponds to SHA3-256, 8KB message size.
# Register allocation for the integer (non-NEON) code path:
# @C = r0-r9 hold ten 32-bit words, i.e. five bit-interleaved 64-bit lanes
#      (each 64-bit quantity is kept as a lo/hi register pair — see the
#      "bit interleaving" note in the header);
# @E = r10-r12 and r14 (lr) are scratch; sp anchors the A/D/T stack frame
#      laid out by @A/@D/@T below.
46 my @C = map("r$_",(0..9));
47 my @E = map("r$_",(10..12,14));
49 ########################################################################
51 # ----->+-----------------------+
52 # | uint64_t A[5][5] |
54 # +200->+-----------------------+
57 # +240->+-----------------------+
58 # | uint64_t T[2][5] |
60 # +320->+-----------------------+
62 # +324->+-----------------------+
64 # +328->+-----------------------+
# Byte offsets into the stack frame sketched in the diagram above:
# @A[$i][$j] -> uint64_t A[5][5] at sp+0   (rows start at 8*{0,5,10,15,20},
#               so A occupies bytes 0..199);
# @D[$i]     -> uint64_t D[5]    at sp+200 (8*25 .. 8*29, bytes 200..239);
# @T[$i][$j] -> uint64_t T[2][5] at sp+240 (rows at 8*30 and 8*35,
#               bytes 240..319, matching the "+240"/"+320" marks above).
67 my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
68 my @D = map(8*$_, (25..29));
69 my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35));
74 #if defined(__thumb2__)
81 .type iotas32, %object
84 .long 0x00000001, 0x00000000
85 .long 0x00000000, 0x00000089
86 .long 0x00000000, 0x8000008b
87 .long 0x00000000, 0x80008080
88 .long 0x00000001, 0x0000008b
89 .long 0x00000001, 0x00008000
90 .long 0x00000001, 0x80008088
91 .long 0x00000001, 0x80000082
92 .long 0x00000000, 0x0000000b
93 .long 0x00000000, 0x0000000a
94 .long 0x00000001, 0x00008082
95 .long 0x00000000, 0x00008003
96 .long 0x00000001, 0x0000808b
97 .long 0x00000001, 0x8000000b
98 .long 0x00000001, 0x8000008a
99 .long 0x00000001, 0x80000081
100 .long 0x00000000, 0x80000081
101 .long 0x00000000, 0x80000008
102 .long 0x00000000, 0x00000083
103 .long 0x00000000, 0x80008003
104 .long 0x00000001, 0x80008088
105 .long 0x00000000, 0x80000088
106 .long 0x00000001, 0x00008000
107 .long 0x00000000, 0x80008082
108 .size iotas32,.-iotas32
110 .type KeccakF1600_int, %function
113 ldmia sp,{@C[0]-@C[9]} @ A[0][0..4]
114 add @E[0],sp,#$A[1][0]
117 eor @E[1],@E[1],@E[1]
123 ldmia sp,{@C[0]-@C[9]} @ A[0][0..4]
125 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][0..1]
126 eor @C[0],@C[0],@E[0]
127 add @E[0],sp,#$A[1][2]
128 eor @C[1],@C[1],@E[1]
129 eor @C[2],@C[2],@E[2]
130 eor @C[3],@C[3],@E[3]
131 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][2..3]
132 eor @C[4],@C[4],@E[0]
133 add @E[0],sp,#$A[1][4]
134 eor @C[5],@C[5],@E[1]
135 eor @C[6],@C[6],@E[2]
136 eor @C[7],@C[7],@E[3]
137 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][4]..A[2][0]
138 eor @C[8],@C[8],@E[0]
139 add @E[0],sp,#$A[2][1]
140 eor @C[9],@C[9],@E[1]
141 eor @C[0],@C[0],@E[2]
142 eor @C[1],@C[1],@E[3]
143 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][1..2]
144 eor @C[2],@C[2],@E[0]
145 add @E[0],sp,#$A[2][3]
146 eor @C[3],@C[3],@E[1]
147 eor @C[4],@C[4],@E[2]
148 eor @C[5],@C[5],@E[3]
149 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][3..4]
150 eor @C[6],@C[6],@E[0]
151 add @E[0],sp,#$A[3][0]
152 eor @C[7],@C[7],@E[1]
153 eor @C[8],@C[8],@E[2]
154 eor @C[9],@C[9],@E[3]
155 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][0..1]
156 eor @C[0],@C[0],@E[0]
157 add @E[0],sp,#$A[3][2]
158 eor @C[1],@C[1],@E[1]
159 eor @C[2],@C[2],@E[2]
160 eor @C[3],@C[3],@E[3]
161 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][2..3]
162 eor @C[4],@C[4],@E[0]
163 add @E[0],sp,#$A[3][4]
164 eor @C[5],@C[5],@E[1]
165 eor @C[6],@C[6],@E[2]
166 eor @C[7],@C[7],@E[3]
167 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][4]..A[4][0]
168 eor @C[8],@C[8],@E[0]
169 add @E[0],sp,#$A[4][1]
170 eor @C[9],@C[9],@E[1]
171 eor @C[0],@C[0],@E[2]
172 eor @C[1],@C[1],@E[3]
173 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[4][1..2]
174 eor @C[2],@C[2],@E[0]
175 add @E[0],sp,#$A[4][3]
176 eor @C[3],@C[3],@E[1]
177 eor @C[4],@C[4],@E[2]
178 eor @C[5],@C[5],@E[3]
179 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[4][3..4]
180 eor @C[6],@C[6],@E[0]
181 eor @C[7],@C[7],@E[1]
182 eor @C[8],@C[8],@E[2]
183 eor @C[9],@C[9],@E[3]
185 eor @E[0],@C[0],@C[5],ror#32-1 @ E[0] = ROL64(C[2], 1) ^ C[0];
186 eor @E[1],@C[1],@C[4]
187 str @E[0],[sp,#$D[1]] @ D[1] = E[0]
188 eor @E[2],@C[6],@C[1],ror#32-1 @ E[1] = ROL64(C[0], 1) ^ C[3];
189 str @E[1],[sp,#$D[1]+4]
190 eor @E[3],@C[7],@C[0]
191 str @E[2],[sp,#$D[4]] @ D[4] = E[1]
192 eor @C[0],@C[8],@C[3],ror#32-1 @ C[0] = ROL64(C[1], 1) ^ C[4];
193 str @E[3],[sp,#$D[4]+4]
194 eor @C[1],@C[9],@C[2]
195 str @C[0],[sp,#$D[0]] @ D[0] = C[0]
196 eor @C[2],@C[2],@C[7],ror#32-1 @ C[1] = ROL64(C[3], 1) ^ C[1];
197 str @C[1],[sp,#$D[0]+4]
198 eor @C[3],@C[3],@C[6]
199 str @C[2],[sp,#$D[2]] @ D[2] = C[1]
200 eor @C[4],@C[4],@C[9],ror#32-1 @ C[2] = ROL64(C[4], 1) ^ C[2];
201 str @C[3],[sp,#$D[2]+4]
202 eor @C[5],@C[5],@C[8]
203 ldr @C[8],[sp,#$A[3][0]]
204 ldr @C[9],[sp,#$A[3][0]+4]
205 str @C[4],[sp,#$D[3]] @ D[3] = C[2]
206 str @C[5],[sp,#$D[3]+4]
208 ldr @C[6],[sp,#$A[0][1]]
209 eor @C[8],@C[8],@C[0]
210 ldr @C[7],[sp,#$A[0][1]+4]
211 eor @C[9],@C[9],@C[1]
212 str @C[8],[sp,#$T[0][0]] @ T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */
213 ldr @C[8],[sp,#$A[0][2]]
214 str @C[9],[sp,#$T[0][0]+4]
215 ldr @C[9],[sp,#$A[0][2]+4]
216 eor @C[6],@C[6],@E[0]
217 eor @C[7],@C[7],@E[1]
218 str @C[6],[sp,#$T[0][1]] @ T[0][1] = A[0][1] ^ E[0]; /* D[1] */
219 ldr @C[6],[sp,#$A[0][3]]
220 str @C[7],[sp,#$T[0][1]+4]
221 ldr @C[7],[sp,#$A[0][3]+4]
222 eor @C[8],@C[8],@C[2]
223 eor @C[9],@C[9],@C[3]
224 str @C[8],[sp,#$T[0][2]] @ T[0][2] = A[0][2] ^ C[1]; /* D[2] */
225 ldr @C[8],[sp,#$A[0][4]]
226 str @C[9],[sp,#$T[0][2]+4]
227 ldr @C[9],[sp,#$A[0][4]+4]
228 eor @C[6],@C[6],@C[4]
229 eor @C[7],@C[7],@C[5]
230 str @C[6],[sp,#$T[0][3]] @ T[0][3] = A[0][3] ^ C[2]; /* D[3] */
231 eor @C[8],@C[8],@E[2]
232 str @C[7],[sp,#$T[0][3]+4]
233 eor @C[9],@C[9],@E[3]
234 ldr @C[6],[sp,#$A[3][3]]
235 ldr @C[7],[sp,#$A[3][3]+4]
236 str @C[8],[sp,#$T[0][4]] @ T[0][4] = A[0][4] ^ E[1]; /* D[4] */
237 str @C[9],[sp,#$T[0][4]+4]
239 ldr @C[8],[sp,#$A[4][4]]
240 eor @C[4],@C[4],@C[6]
241 ldr @C[9],[sp,#$A[4][4]+4]
242 eor @C[5],@C[5],@C[7]
243 ror @C[7],@C[4],#32-10 @ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
244 ldr @C[4],[sp,#$A[0][0]]
245 ror @C[6],@C[5],#32-11
246 ldr @C[5],[sp,#$A[0][0]+4]
247 eor @C[8],@C[8],@E[2]
248 eor @C[9],@C[9],@E[3]
249 ror @C[8],@C[8],#32-7 @ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
250 ldr @E[2],[sp,#$A[2][2]]
251 ror @C[9],@C[9],#32-7
252 ldr @E[3],[sp,#$A[2][2]+4]
253 eor @C[0],@C[0],@C[4]
254 eor @C[1],@C[1],@C[5] @ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
255 eor @E[2],@E[2],@C[2]
256 ldr @C[2],[sp,#$A[1][1]]
257 eor @E[3],@E[3],@C[3]
258 ldr @C[3],[sp,#$A[1][1]+4]
259 ror @C[5],@E[2],#32-21 @ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
260 ldr @E[2],[sp,#324] @ load counter
261 eor @C[2],@C[2],@E[0]
262 ror @C[4],@E[3],#32-22
264 eor @C[3],@C[3],@E[1]
265 ror @C[2],@C[2],#32-22 @ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
266 add @E[3],@E[3],@E[2]
267 ror @C[3],@C[3],#32-22
269 ldr @E[0],[@E[3],#0] @ iotas[i].lo
271 ldr @E[1],[@E[3],#4] @ iotas[i].hi
273 str @E[2],[sp,#324] @ store counter
275 bic @E[2],@C[4],@C[2]
276 bic @E[3],@C[5],@C[3]
277 eor @E[2],@E[2],@C[0]
278 eor @E[3],@E[3],@C[1]
279 eor @E[0],@E[0],@E[2]
280 eor @E[1],@E[1],@E[3]
281 str @E[0],[sp,#$A[0][0]] @ A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
282 bic @E[2],@C[6],@C[4]
283 str @E[1],[sp,#$A[0][0]+4]
284 bic @E[3],@C[7],@C[5]
285 eor @E[2],@E[2],@C[2]
286 eor @E[3],@E[3],@C[3]
287 str @E[2],[sp,#$A[0][1]] @ A[0][1] = C[1] ^ (~C[2] & C[3]);
288 bic @E[0],@C[8],@C[6]
289 str @E[3],[sp,#$A[0][1]+4]
290 bic @E[1],@C[9],@C[7]
291 eor @E[0],@E[0],@C[4]
292 eor @E[1],@E[1],@C[5]
293 str @E[0],[sp,#$A[0][2]] @ A[0][2] = C[2] ^ (~C[3] & C[4]);
294 bic @E[2],@C[0],@C[8]
295 str @E[1],[sp,#$A[0][2]+4]
296 bic @E[3],@C[1],@C[9]
297 eor @E[2],@E[2],@C[6]
298 eor @E[3],@E[3],@C[7]
299 str @E[2],[sp,#$A[0][3]] @ A[0][3] = C[3] ^ (~C[4] & C[0]);
300 bic @E[0],@C[2],@C[0]
301 str @E[3],[sp,#$A[0][3]+4]
303 bic @E[1],@C[3],@C[1]
304 eor @E[0],@E[0],@C[8]
305 eor @E[1],@E[1],@C[9]
306 str @E[0],[sp,#$A[0][4]] @ A[0][4] = C[4] ^ (~C[0] & C[1]);
307 str @E[1],[sp,#$A[0][4]+4]
309 ldmia @E[3],{@C[6]-@C[9],@E[0],@E[1],@E[2],@E[3]} @ D[0..3]
310 ldr @C[0],[sp,#$A[1][0]]
311 ldr @C[1],[sp,#$A[1][0]+4]
312 ldr @C[2],[sp,#$A[2][1]]
313 ldr @C[3],[sp,#$A[2][1]+4]
314 ldr @C[4],[sp,#$D[4]]
315 eor @C[0],@C[0],@C[6]
316 ldr @C[5],[sp,#$D[4]+4]
317 eor @C[1],@C[1],@C[7]
318 str @C[0],[sp,#$T[1][0]] @ T[1][0] = A[1][0] ^ (C[3] = D[0]);
319 add @C[0],sp,#$A[1][2]
320 str @C[1],[sp,#$T[1][0]+4]
321 eor @C[2],@C[2],@C[8]
322 eor @C[3],@C[3],@C[9]
323 str @C[2],[sp,#$T[1][1]] @ T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */
324 str @C[3],[sp,#$T[1][1]+4]
325 ldmia @C[0],{@C[0]-@C[3]} @ A[1][2..3]
326 eor @C[0],@C[0],@E[0]
327 eor @C[1],@C[1],@E[1]
328 str @C[0],[sp,#$T[1][2]] @ T[1][2] = A[1][2] ^ (E[0] = D[2]);
329 ldr @C[0],[sp,#$A[2][4]]
330 str @C[1],[sp,#$T[1][2]+4]
331 ldr @C[1],[sp,#$A[2][4]+4]
332 eor @C[2],@C[2],@E[2]
333 eor @C[3],@C[3],@E[3]
334 str @C[2],[sp,#$T[1][3]] @ T[1][3] = A[1][3] ^ (E[1] = D[3]);
335 ldr @C[2],[sp,#$T[0][3]]
336 str @C[3],[sp,#$T[1][3]+4]
337 ldr @C[3],[sp,#$T[0][3]+4]
338 eor @C[0],@C[0],@C[4]
339 ldr @E[2],[sp,#$A[1][4]]
340 eor @C[1],@C[1],@C[5]
341 ldr @E[3],[sp,#$A[1][4]+4]
342 str @C[0],[sp,#$T[1][4]] @ T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */
344 ror @C[0],@C[2],#32-14 @ C[0] = ROL64(T[0][3], rhotates[0][3]);
345 str @C[1],[sp,#$T[1][4]+4]
346 ror @C[1],@C[3],#32-14
347 eor @C[2],@E[2],@C[4]
348 ldr @C[4],[sp,#$A[2][0]]
349 eor @C[3],@E[3],@C[5]
350 ldr @C[5],[sp,#$A[2][0]+4]
351 ror @C[2],@C[2],#32-10 @ C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]); /* D[4] */
352 ldr @E[2],[sp,#$A[3][1]]
353 ror @C[3],@C[3],#32-10
354 ldr @E[3],[sp,#$A[3][1]+4]
355 eor @C[6],@C[6],@C[4]
356 eor @C[7],@C[7],@C[5]
357 ror @C[5],@C[6],#32-1 @ C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]); /* D[0] */
358 eor @E[2],@E[2],@C[8]
359 ror @C[4],@C[7],#32-2
360 ldr @C[8],[sp,#$A[4][2]]
361 eor @E[3],@E[3],@C[9]
362 ldr @C[9],[sp,#$A[4][2]+4]
363 ror @C[7],@E[2],#32-22 @ C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]); /* D[1] */
364 eor @E[0],@E[0],@C[8]
365 ror @C[6],@E[3],#32-23
366 eor @E[1],@E[1],@C[9]
367 ror @C[9],@E[0],#32-30 @ C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]); /* D[2] */
369 bic @E[0],@C[4],@C[2]
370 ror @C[8],@E[1],#32-31
371 bic @E[1],@C[5],@C[3]
372 eor @E[0],@E[0],@C[0]
373 eor @E[1],@E[1],@C[1]
374 str @E[0],[sp,#$A[1][0]] @ A[1][0] = C[0] ^ (~C[1] & C[2])
375 bic @E[2],@C[6],@C[4]
376 str @E[1],[sp,#$A[1][0]+4]
377 bic @E[3],@C[7],@C[5]
378 eor @E[2],@E[2],@C[2]
379 eor @E[3],@E[3],@C[3]
380 str @E[2],[sp,#$A[1][1]] @ A[1][1] = C[1] ^ (~C[2] & C[3]);
381 bic @E[0],@C[8],@C[6]
382 str @E[3],[sp,#$A[1][1]+4]
383 bic @E[1],@C[9],@C[7]
384 eor @E[0],@E[0],@C[4]
385 eor @E[1],@E[1],@C[5]
386 str @E[0],[sp,#$A[1][2]] @ A[1][2] = C[2] ^ (~C[3] & C[4]);
387 bic @E[2],@C[0],@C[8]
388 str @E[1],[sp,#$A[1][2]+4]
389 bic @E[3],@C[1],@C[9]
390 eor @E[2],@E[2],@C[6]
391 eor @E[3],@E[3],@C[7]
392 str @E[2],[sp,#$A[1][3]] @ A[1][3] = C[3] ^ (~C[4] & C[0]);
393 bic @E[0],@C[2],@C[0]
394 str @E[3],[sp,#$A[1][3]+4]
396 bic @E[1],@C[3],@C[1]
397 ldr @C[1],[sp,#$T[0][1]]
398 eor @E[0],@E[0],@C[8]
399 ldr @C[0],[sp,#$T[0][1]+4]
400 eor @E[1],@E[1],@C[9]
401 str @E[0],[sp,#$A[1][4]] @ A[1][4] = C[4] ^ (~C[0] & C[1]);
402 str @E[1],[sp,#$A[1][4]+4]
404 ldr @C[2],[sp,#$T[1][2]]
405 ldr @C[3],[sp,#$T[1][2]+4]
406 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[3..4]
407 ldr @C[4],[sp,#$A[2][3]]
408 ror @C[0],@C[0],#32-1 @ C[0] = ROL64(T[0][1], rhotates[0][1]);
409 ldr @C[5],[sp,#$A[2][3]+4]
410 ror @C[2],@C[2],#32-3 @ C[1] = ROL64(T[1][2], rhotates[1][2]);
411 ldr @C[6],[sp,#$A[3][4]]
412 ror @C[3],@C[3],#32-3
413 ldr @C[7],[sp,#$A[3][4]+4]
414 eor @E[0],@E[0],@C[4]
415 ldr @C[8],[sp,#$A[4][0]]
416 eor @E[1],@E[1],@C[5]
417 ldr @C[9],[sp,#$A[4][0]+4]
418 ror @C[5],@E[0],#32-12 @ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
419 ldr @E[0],[sp,#$D[0]]
420 ror @C[4],@E[1],#32-13
421 ldr @E[1],[sp,#$D[0]+4]
422 eor @C[6],@C[6],@E[2]
423 eor @C[7],@C[7],@E[3]
424 ror @C[6],@C[6],#32-4 @ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
425 eor @C[8],@C[8],@E[0]
426 ror @C[7],@C[7],#32-4
427 eor @C[9],@C[9],@E[1]
428 ror @C[8],@C[8],#32-9 @ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
430 bic @E[0],@C[4],@C[2]
431 ror @C[9],@C[9],#32-9
432 bic @E[1],@C[5],@C[3]
433 eor @E[0],@E[0],@C[0]
434 eor @E[1],@E[1],@C[1]
435 str @E[0],[sp,#$A[2][0]] @ A[2][0] = C[0] ^ (~C[1] & C[2])
436 bic @E[2],@C[6],@C[4]
437 str @E[1],[sp,#$A[2][0]+4]
438 bic @E[3],@C[7],@C[5]
439 eor @E[2],@E[2],@C[2]
440 eor @E[3],@E[3],@C[3]
441 str @E[2],[sp,#$A[2][1]] @ A[2][1] = C[1] ^ (~C[2] & C[3]);
442 bic @E[0],@C[8],@C[6]
443 str @E[3],[sp,#$A[2][1]+4]
444 bic @E[1],@C[9],@C[7]
445 eor @E[0],@E[0],@C[4]
446 eor @E[1],@E[1],@C[5]
447 str @E[0],[sp,#$A[2][2]] @ A[2][2] = C[2] ^ (~C[3] & C[4]);
448 bic @E[2],@C[0],@C[8]
449 str @E[1],[sp,#$A[2][2]+4]
450 bic @E[3],@C[1],@C[9]
451 eor @E[2],@E[2],@C[6]
452 eor @E[3],@E[3],@C[7]
453 str @E[2],[sp,#$A[2][3]] @ A[2][3] = C[3] ^ (~C[4] & C[0]);
454 bic @E[0],@C[2],@C[0]
455 str @E[3],[sp,#$A[2][3]+4]
456 bic @E[1],@C[3],@C[1]
457 eor @E[0],@E[0],@C[8]
458 eor @E[1],@E[1],@C[9]
459 str @E[0],[sp,#$A[2][4]] @ A[2][4] = C[4] ^ (~C[0] & C[1]);
460 add @C[2],sp,#$T[1][0]
461 str @E[1],[sp,#$A[2][4]+4]
464 ldr @C[1],[sp,#$T[0][4]]
465 ldr @C[0],[sp,#$T[0][4]+4]
466 ldmia @C[2],{@C[2]-@C[5]} @ T[1][0..1]
467 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[2..3]
468 ror @C[1],@C[1],#32-13 @ C[0] = ROL64(T[0][4], rhotates[0][4]);
469 ldr @C[6],[sp,#$A[3][2]]
470 ror @C[0],@C[0],#32-14
471 ldr @C[7],[sp,#$A[3][2]+4]
472 ror @C[2],@C[2],#32-18 @ C[1] = ROL64(T[1][0], rhotates[1][0]);
473 ldr @C[8],[sp,#$A[4][3]]
474 ror @C[3],@C[3],#32-18
475 ldr @C[9],[sp,#$A[4][3]+4]
476 ror @C[4],@C[4],#32-5 @ C[2] = ROL64(T[1][1], rhotates[2][1]); /* originally A[2][1] */
477 eor @E[0],@E[0],@C[6]
478 ror @C[5],@C[5],#32-5
479 eor @E[1],@E[1],@C[7]
480 ror @C[7],@E[0],#32-7 @ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
481 eor @C[8],@C[8],@E[2]
482 ror @C[6],@E[1],#32-8
483 eor @C[9],@C[9],@E[3]
484 ror @C[8],@C[8],#32-28 @ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
486 bic @E[0],@C[4],@C[2]
487 ror @C[9],@C[9],#32-28
488 bic @E[1],@C[5],@C[3]
489 eor @E[0],@E[0],@C[0]
490 eor @E[1],@E[1],@C[1]
491 str @E[0],[sp,#$A[3][0]] @ A[3][0] = C[0] ^ (~C[1] & C[2])
492 bic @E[2],@C[6],@C[4]
493 str @E[1],[sp,#$A[3][0]+4]
494 bic @E[3],@C[7],@C[5]
495 eor @E[2],@E[2],@C[2]
496 eor @E[3],@E[3],@C[3]
497 str @E[2],[sp,#$A[3][1]] @ A[3][1] = C[1] ^ (~C[2] & C[3]);
498 bic @E[0],@C[8],@C[6]
499 str @E[3],[sp,#$A[3][1]+4]
500 bic @E[1],@C[9],@C[7]
501 eor @E[0],@E[0],@C[4]
502 eor @E[1],@E[1],@C[5]
503 str @E[0],[sp,#$A[3][2]] @ A[3][2] = C[2] ^ (~C[3] & C[4]);
504 bic @E[2],@C[0],@C[8]
505 str @E[1],[sp,#$A[3][2]+4]
506 bic @E[3],@C[1],@C[9]
507 eor @E[2],@E[2],@C[6]
508 eor @E[3],@E[3],@C[7]
509 str @E[2],[sp,#$A[3][3]] @ A[3][3] = C[3] ^ (~C[4] & C[0]);
510 bic @E[0],@C[2],@C[0]
511 str @E[3],[sp,#$A[3][3]+4]
512 bic @E[1],@C[3],@C[1]
513 eor @E[0],@E[0],@C[8]
514 eor @E[1],@E[1],@C[9]
515 str @E[0],[sp,#$A[3][4]] @ A[3][4] = C[4] ^ (~C[0] & C[1]);
516 add @E[3],sp,#$T[1][3]
517 str @E[1],[sp,#$A[3][4]+4]
519 ldr @C[0],[sp,#$T[0][2]]
520 ldr @C[1],[sp,#$T[0][2]+4]
521 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ T[1][3..4]
522 ldr @C[7],[sp,#$T[0][0]]
523 ror @C[0],@C[0],#32-31 @ C[0] = ROL64(T[0][2], rhotates[0][2]);
524 ldr @C[6],[sp,#$T[0][0]+4]
525 ror @C[1],@C[1],#32-31
526 ldr @C[8],[sp,#$A[4][1]]
527 ror @C[3],@E[0],#32-27 @ C[1] = ROL64(T[1][3], rhotates[1][3]);
528 ldr @E[0],[sp,#$D[1]]
529 ror @C[2],@E[1],#32-28
530 ldr @C[9],[sp,#$A[4][1]+4]
531 ror @C[5],@E[2],#32-19 @ C[2] = ROL64(T[1][4], rhotates[2][4]); /* originally A[2][4] */
532 ldr @E[1],[sp,#$D[1]+4]
533 ror @C[4],@E[3],#32-20
534 eor @C[8],@C[8],@E[0]
535 ror @C[7],@C[7],#32-20 @ C[3] = ROL64(T[0][0], rhotates[3][0]); /* originally A[3][0] */
536 eor @C[9],@C[9],@E[1]
537 ror @C[6],@C[6],#32-21
539 bic @E[0],@C[4],@C[2]
540 ror @C[8],@C[8],#32-1 @ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
541 bic @E[1],@C[5],@C[3]
542 ror @C[9],@C[9],#32-1
543 eor @E[0],@E[0],@C[0]
544 eor @E[1],@E[1],@C[1]
545 str @E[0],[sp,#$A[4][0]] @ A[4][0] = C[0] ^ (~C[1] & C[2])
546 bic @E[2],@C[6],@C[4]
547 str @E[1],[sp,#$A[4][0]+4]
548 bic @E[3],@C[7],@C[5]
549 eor @E[2],@E[2],@C[2]
550 eor @E[3],@E[3],@C[3]
551 str @E[2],[sp,#$A[4][1]] @ A[4][1] = C[1] ^ (~C[2] & C[3]);
552 bic @E[0],@C[8],@C[6]
553 str @E[3],[sp,#$A[4][1]+4]
554 bic @E[1],@C[9],@C[7]
555 eor @E[0],@E[0],@C[4]
556 eor @E[1],@E[1],@C[5]
557 str @E[0],[sp,#$A[4][2]] @ A[4][2] = C[2] ^ (~C[3] & C[4]);
558 bic @E[2],@C[0],@C[8]
559 str @E[1],[sp,#$A[4][2]+4]
560 bic @E[3],@C[1],@C[9]
561 eor @E[2],@E[2],@C[6]
562 eor @E[3],@E[3],@C[7]
563 str @E[2],[sp,#$A[4][3]] @ A[4][3] = C[3] ^ (~C[4] & C[0]);
564 bic @E[0],@C[2],@C[0]
565 str @E[3],[sp,#$A[4][3]+4]
566 bic @E[1],@C[3],@C[1]
567 eor @E[2],@E[0],@C[8]
568 eor @E[3],@E[1],@C[9]
569 str @E[2],[sp,#$A[4][4]] @ A[4][4] = C[4] ^ (~C[0] & C[1]);
570 add @E[0],sp,#$A[1][0]
571 str @E[3],[sp,#$A[4][4]+4]
576 .size KeccakF1600_int,.-KeccakF1600_int
578 .type KeccakF1600, %function
581 stmdb sp!,{r0,r4-r11,lr}
582 sub sp,sp,#320+16 @ space for A[5][5],D[5],T[2][5],...
584 add @E[0],r0,#$A[1][0]
585 add @E[1],sp,#$A[1][0]
587 ldmia @E[0]!,{@C[0]-@C[9]} @ copy A[5][5] to stack
588 stmia @E[1]!,{@C[0]-@C[9]}
589 ldmia @E[0]!,{@C[0]-@C[9]}
590 stmia @E[1]!,{@C[0]-@C[9]}
591 ldmia @E[0]!,{@C[0]-@C[9]}
592 stmia @E[1]!,{@C[0]-@C[9]}
593 ldmia @E[0], {@C[0]-@C[9]}
594 stmia @E[1], {@C[0]-@C[9]}
595 ldmia @E[2], {@C[0]-@C[9]} @ A[0][0..4]
596 add @E[0],sp,#$A[1][0]
597 stmia sp, {@C[0]-@C[9]}
601 ldr @E[1], [sp,#320+16] @ restore pointer to A
602 ldmia sp, {@C[0]-@C[9]}
603 stmia @E[1]!,{@C[0]-@C[9]} @ return A[5][5]
604 ldmia @E[0]!,{@C[0]-@C[9]}
605 stmia @E[1]!,{@C[0]-@C[9]}
606 ldmia @E[0]!,{@C[0]-@C[9]}
607 stmia @E[1]!,{@C[0]-@C[9]}
608 ldmia @E[0]!,{@C[0]-@C[9]}
609 stmia @E[1]!,{@C[0]-@C[9]}
610 ldmia @E[0], {@C[0]-@C[9]}
611 stmia @E[1], {@C[0]-@C[9]}
614 ldmia sp!,{r4-r11,pc}
615 .size KeccakF1600,.-KeccakF1600
617 { my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));
619 ########################################################################
621 # ----->+-----------------------+
622 # | uint64_t A[5][5] |
625 # +336->+-----------------------+
627 # +340->+-----------------------+
629 # +344->+-----------------------+
631 # +348->+-----------------------+
633 # +352->+-----------------------+
635 # +356->+-----------------------+
636 # | const void *inp |
637 # +360->+-----------------------+
639 # +364->+-----------------------+
641 # +368->+-----------------------+
646 .type SHA3_absorb,%function
649 stmdb sp!,{r0-r12,lr}
652 add $A_flat,r0,#$A[1][0]
660 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
661 stmia $inp!, {@C[0]-@C[9]}
662 ldmia $A_flat!,{@C[0]-@C[9]}
663 stmia $inp!, {@C[0]-@C[9]}
664 ldmia $A_flat!,{@C[0]-@C[9]}
665 stmia $inp!, {@C[0]-@C[9]}
666 ldmia $A_flat!,{@C[0]-@C[9]}
667 stmia $inp!, {@C[0]-@C[9]}
668 ldmia $A_flat!,{@C[0]-@C[9]}
669 stmia $inp, {@C[0]-@C[9]}
671 ldr $inp,[sp,#356] @ restore $inp
678 mov r6,#0x11 @ compose constants
683 orr r6,r6,r6,lsl#16 @ 0x11111111
684 orr r9,r9,r9,lsl#16 @ 0x00ff00ff
685 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
686 orr r7,r6,r6,lsl#1 @ 0x33333333
687 orr r6,r6,r6,lsl#2 @ 0x55555555
700 str r0,[sp,#360] @ save len - bsz
713 orr r0,r0,r3,lsl#24 @ lo
717 orr r1,r1,r3,lsl#24 @ hi
719 and r2,r0,r6 @ &=0x55555555
720 and r0,r0,r6,lsl#1 @ &=0xaaaaaaaa
721 and r3,r1,r6 @ &=0x55555555
722 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
727 and r2,r2,r7 @ &=0x33333333
728 and r0,r0,r7,lsl#2 @ &=0xcccccccc
729 and r3,r3,r7 @ &=0x33333333
730 and r1,r1,r7,lsl#2 @ &=0xcccccccc
735 and r2,r2,r8 @ &=0x0f0f0f0f
736 and r0,r0,r8,lsl#4 @ &=0xf0f0f0f0
737 and r3,r3,r8 @ &=0x0f0f0f0f
738 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
739 ldmia $A_flat,{r4-r5} @ A_flat[i]
744 and r2,r2,r9 @ &=0x00ff00ff
745 and r0,r0,r9,lsl#8 @ &=0xff00ff00
746 and r3,r3,r9 @ &=0x00ff00ff
747 and r1,r1,r9,lsl#8 @ &=0xff00ff00
759 stmia $A_flat!,{r4-r5} @ A_flat[i++] ^= BitInterleave(inp[0..7])
769 ldmia r14,{r6-r12,r14} @ restore constants and variables
774 add $inp,sp,#$A[1][0]
775 ldmia sp, {@C[0]-@C[9]}
776 stmia $A_flat!,{@C[0]-@C[9]} @ return A[5][5]
777 ldmia $inp!, {@C[0]-@C[9]}
778 stmia $A_flat!,{@C[0]-@C[9]}
779 ldmia $inp!, {@C[0]-@C[9]}
780 stmia $A_flat!,{@C[0]-@C[9]}
781 ldmia $inp!, {@C[0]-@C[9]}
782 stmia $A_flat!,{@C[0]-@C[9]}
783 ldmia $inp, {@C[0]-@C[9]}
784 stmia $A_flat, {@C[0]-@C[9]}
788 mov r0,$len @ return value
789 ldmia sp!,{r4-r12,pc}
790 .size SHA3_absorb,.-SHA3_absorb
793 { my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12));
797 .type SHA3_squeeze,%function
800 stmdb sp!,{r0,r3-r10,lr}
813 mov r6,#0x11 @ compose constants
818 orr r6,r6,r6,lsl#16 @ 0x11111111
819 orr r9,r9,r9,lsl#16 @ 0x00ff00ff
820 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
821 orr r7,r6,r6,lsl#1 @ 0x33333333
822 orr r6,r6,r6,lsl#2 @ 0x55555555
831 ldmia $A_flat!,{r0,r1} @ A_flat[i++]
834 lsl r3,r1,#16 @ r3 = r1 << 16
835 lsr r2,r2,#16 @ r2 = r0 & 0x0000ffff
837 lsr r0,r0,#16 @ r0 = r0 >> 16
838 lsl r1,r1,#16 @ r1 = r1 & 0xffff0000
844 and r2,r2,r9 @ &=0x00ff00ff
845 and r3,r3,r9,lsl#8 @ &=0xff00ff00
846 and r0,r0,r9 @ &=0x00ff00ff
847 and r1,r1,r9,lsl#8 @ &=0xff00ff00
852 and r2,r2,r8 @ &=0x0f0f0f0f
853 and r3,r3,r8,lsl#4 @ &=0xf0f0f0f0
854 and r0,r0,r8 @ &=0x0f0f0f0f
855 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
860 and r2,r2,r7 @ &=0x33333333
861 and r3,r3,r7,lsl#2 @ &=0xcccccccc
862 and r0,r0,r7 @ &=0x33333333
863 and r1,r1,r7,lsl#2 @ &=0xcccccccc
868 and r2,r2,r6 @ &=0x55555555
869 and r3,r3,r6,lsl#1 @ &=0xaaaaaaaa
870 and r0,r0,r6 @ &=0x55555555
871 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
896 subs $bsz,$bsz,#8 @ bsz -= 8
899 mov r0,r14 @ original $A_flat
903 ldmia sp,{r6-r10,r12} @ restore constants and variables
939 ldmia sp!,{r4-r10,pc}
940 .size SHA3_squeeze,.-SHA3_squeeze
947 .type iotas64, %object
950 .quad 0x0000000000000001
951 .quad 0x0000000000008082
952 .quad 0x800000000000808a
953 .quad 0x8000000080008000
954 .quad 0x000000000000808b
955 .quad 0x0000000080000001
956 .quad 0x8000000080008081
957 .quad 0x8000000000008009
958 .quad 0x000000000000008a
959 .quad 0x0000000000000088
960 .quad 0x0000000080008009
961 .quad 0x000000008000000a
962 .quad 0x000000008000808b
963 .quad 0x800000000000008b
964 .quad 0x8000000000008089
965 .quad 0x8000000000008003
966 .quad 0x8000000000008002
967 .quad 0x8000000000000080
968 .quad 0x000000000000800a
969 .quad 0x800000008000000a
970 .quad 0x8000000080008081
971 .quad 0x8000000000008080
972 .quad 0x0000000080000001
973 .quad 0x8000000080008008
974 .size iotas64,.-iotas64
976 .type KeccakF1600_neon, %function
981 mov r3, #24 @ loop counter
987 vst1.64 {q4}, [r0:64] @ offload A[0..1][4]
988 veor q13, q0, q5 @ A[0..1][0]^A[2..3][0]
989 vst1.64 {d18}, [r1:64] @ offload A[2][4]
990 veor q14, q1, q6 @ A[0..1][1]^A[2..3][1]
991 veor q15, q2, q7 @ A[0..1][2]^A[2..3][2]
992 veor d26, d26, d27 @ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
993 veor d27, d28, d29 @ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
994 veor q14, q3, q8 @ A[0..1][3]^A[2..3][3]
995 veor q4, q4, q9 @ A[0..1][4]^A[2..3][4]
996 veor d30, d30, d31 @ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
997 veor d31, d28, d29 @ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
998 veor d25, d8, d9 @ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
999 veor q13, q13, q10 @ C[0..1]^=A[4][0..1]
1000 veor q14, q15, q11 @ C[2..3]^=A[4][2..3]
1001 veor d25, d25, d24 @ C[4]^=A[4][4]
1003 vadd.u64 q4, q13, q13 @ C[0..1]<<1
1004 vadd.u64 q15, q14, q14 @ C[2..3]<<1
1005 vadd.u64 d18, d25, d25 @ C[4]<<1
1006 vsri.u64 q4, q13, #63 @ ROL64(C[0..1],1)
1007 vsri.u64 q15, q14, #63 @ ROL64(C[2..3],1)
1008 vsri.u64 d18, d25, #63 @ ROL64(C[4],1)
1009 veor d25, d25, d9 @ D[0] = C[4] ^= ROL64(C[1],1)
1010 veor q13, q13, q15 @ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
1011 veor d28, d28, d18 @ D[3] = C[2] ^= ROL64(C[4],1)
1012 veor d29, d29, d8 @ D[4] = C[3] ^= ROL64(C[0],1)
1014 veor d0, d0, d25 @ A[0][0] ^= C[4]
1015 veor d1, d1, d25 @ A[1][0] ^= C[4]
1016 veor d10, d10, d25 @ A[2][0] ^= C[4]
1017 veor d11, d11, d25 @ A[3][0] ^= C[4]
1018 veor d20, d20, d25 @ A[4][0] ^= C[4]
1020 veor d2, d2, d26 @ A[0][1] ^= D[1]
1021 veor d3, d3, d26 @ A[1][1] ^= D[1]
1022 veor d12, d12, d26 @ A[2][1] ^= D[1]
1023 veor d13, d13, d26 @ A[3][1] ^= D[1]
1024 veor d21, d21, d26 @ A[4][1] ^= D[1]
1027 veor d6, d6, d28 @ A[0][3] ^= C[2]
1028 veor d7, d7, d28 @ A[1][3] ^= C[2]
1029 veor d16, d16, d28 @ A[2][3] ^= C[2]
1030 veor d17, d17, d28 @ A[3][3] ^= C[2]
1031 veor d23, d23, d28 @ A[4][3] ^= C[2]
1032 vld1.64 {q4}, [r0:64] @ restore A[0..1][4]
1035 vld1.64 {d18}, [r1:64] @ restore A[2][4]
1036 veor q2, q2, q13 @ A[0..1][2] ^= D[2]
1037 veor q7, q7, q13 @ A[2..3][2] ^= D[2]
1038 veor d22, d22, d27 @ A[4][2] ^= D[2]
1040 veor q4, q4, q14 @ A[0..1][4] ^= C[3]
1041 veor q9, q9, q14 @ A[2..3][4] ^= C[3]
1042 veor d24, d24, d29 @ A[4][4] ^= C[3]
1045 vmov d26, d2 @ C[1] = A[0][1]
1046 vshl.u64 d2, d3, #44
1047 vmov d27, d4 @ C[2] = A[0][2]
1048 vshl.u64 d4, d14, #43
1049 vmov d28, d6 @ C[3] = A[0][3]
1050 vshl.u64 d6, d17, #21
1051 vmov d29, d8 @ C[4] = A[0][4]
1052 vshl.u64 d8, d24, #14
1053 vsri.u64 d2, d3, #64-44 @ A[0][1] = ROL64(A[1][1], rhotates[1][1])
1054 vsri.u64 d4, d14, #64-43 @ A[0][2] = ROL64(A[2][2], rhotates[2][2])
1055 vsri.u64 d6, d17, #64-21 @ A[0][3] = ROL64(A[3][3], rhotates[3][3])
1056 vsri.u64 d8, d24, #64-14 @ A[0][4] = ROL64(A[4][4], rhotates[4][4])
1058 vshl.u64 d3, d9, #20
1059 vshl.u64 d14, d16, #25
1060 vshl.u64 d17, d15, #15
1061 vshl.u64 d24, d21, #2
1062 vsri.u64 d3, d9, #64-20 @ A[1][1] = ROL64(A[1][4], rhotates[1][4])
1063 vsri.u64 d14, d16, #64-25 @ A[2][2] = ROL64(A[2][3], rhotates[2][3])
1064 vsri.u64 d17, d15, #64-15 @ A[3][3] = ROL64(A[3][2], rhotates[3][2])
1065 vsri.u64 d24, d21, #64-2 @ A[4][4] = ROL64(A[4][1], rhotates[4][1])
1067 vshl.u64 d9, d22, #61
1068 @ vshl.u64 d16, d19, #8
1069 vshl.u64 d15, d12, #10
1070 vshl.u64 d21, d7, #55
1071 vsri.u64 d9, d22, #64-61 @ A[1][4] = ROL64(A[4][2], rhotates[4][2])
1072 vext.8 d16, d19, d19, #8-1 @ A[2][3] = ROL64(A[3][4], rhotates[3][4])
1073 vsri.u64 d15, d12, #64-10 @ A[3][2] = ROL64(A[2][1], rhotates[2][1])
1074 vsri.u64 d21, d7, #64-55 @ A[4][1] = ROL64(A[1][3], rhotates[1][3])
1076 vshl.u64 d22, d18, #39
1077 @ vshl.u64 d19, d23, #56
1078 vshl.u64 d12, d5, #6
1079 vshl.u64 d7, d13, #45
1080 vsri.u64 d22, d18, #64-39 @ A[4][2] = ROL64(A[2][4], rhotates[2][4])
1081 vext.8 d19, d23, d23, #8-7 @ A[3][4] = ROL64(A[4][3], rhotates[4][3])
1082 vsri.u64 d12, d5, #64-6 @ A[2][1] = ROL64(A[1][2], rhotates[1][2])
1083 vsri.u64 d7, d13, #64-45 @ A[1][3] = ROL64(A[3][1], rhotates[3][1])
1085 vshl.u64 d18, d20, #18
1086 vshl.u64 d23, d11, #41
1087 vshl.u64 d5, d10, #3
1088 vshl.u64 d13, d1, #36
1089 vsri.u64 d18, d20, #64-18 @ A[2][4] = ROL64(A[4][0], rhotates[4][0])
1090 vsri.u64 d23, d11, #64-41 @ A[4][3] = ROL64(A[3][0], rhotates[3][0])
1091 vsri.u64 d5, d10, #64-3 @ A[1][2] = ROL64(A[2][0], rhotates[2][0])
1092 vsri.u64 d13, d1, #64-36 @ A[3][1] = ROL64(A[1][0], rhotates[1][0])
1094 vshl.u64 d1, d28, #28
1095 vshl.u64 d10, d26, #1
1096 vshl.u64 d11, d29, #27
1097 vshl.u64 d20, d27, #62
1098 vsri.u64 d1, d28, #64-28 @ A[1][0] = ROL64(C[3], rhotates[0][3])
1099 vsri.u64 d10, d26, #64-1 @ A[2][0] = ROL64(C[1], rhotates[0][1])
1100 vsri.u64 d11, d29, #64-27 @ A[3][0] = ROL64(C[4], rhotates[0][4])
1101 vsri.u64 d20, d27, #64-62 @ A[4][0] = ROL64(C[2], rhotates[0][2])
1107 veor q13, q13, q0 @ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
1108 veor q14, q14, q1 @ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
1109 veor q2, q2, q15 @ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
1110 vst1.64 {q13}, [r0:64] @ offload A[0..1][0]
1113 vmov q1, q14 @ A[0..1][1]
1114 veor q3, q3, q13 @ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
1115 veor q4, q4, q15 @ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])
1118 vmov q0, q5 @ A[2..3][0]
1120 vmov q15, q6 @ A[2..3][1]
1121 veor q5, q5, q13 @ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
1123 veor q6, q6, q14 @ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
1125 veor q7, q7, q13 @ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
1127 veor q8, q8, q14 @ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
1128 vmov q14, q10 @ A[4][0..1]
1129 veor q9, q9, q13 @ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])
1131 vld1.64 d25, [r2:64]! @ Iota[i++]
1134 vld1.64 {q0}, [r0:64] @ restore A[0..1][0]
1135 veor d20, d20, d26 @ A[4][0] ^= (~A[4][1] & A[4][2])
1137 veor d21, d21, d27 @ A[4][1] ^= (~A[4][2] & A[4][3])
1139 veor d22, d22, d26 @ A[4][2] ^= (~A[4][3] & A[4][4])
1141 veor d23, d23, d27 @ A[4][3] ^= (~A[4][4] & A[4][0])
1142 veor d0, d0, d25 @ A[0][0] ^= Iota[i]
1143 veor d24, d24, d26 @ A[4][4] ^= (~A[4][0] & A[4][1])
1149 .size KeccakF1600_neon,.-KeccakF1600_neon
1151 .global SHA3_absorb_neon
1152 .type SHA3_absorb_neon, %function
1155 stmdb sp!, {r4-r6,lr}
1156 vstmdb sp!, {d8-d15}
1162 vld1.32 {d0}, [r0:64]! @ A[0][0]
1163 vld1.32 {d2}, [r0:64]! @ A[0][1]
1164 vld1.32 {d4}, [r0:64]! @ A[0][2]
1165 vld1.32 {d6}, [r0:64]! @ A[0][3]
1166 vld1.32 {d8}, [r0:64]! @ A[0][4]
1168 vld1.32 {d1}, [r0:64]! @ A[1][0]
1169 vld1.32 {d3}, [r0:64]! @ A[1][1]
1170 vld1.32 {d5}, [r0:64]! @ A[1][2]
1171 vld1.32 {d7}, [r0:64]! @ A[1][3]
1172 vld1.32 {d9}, [r0:64]! @ A[1][4]
1174 vld1.32 {d10}, [r0:64]! @ A[2][0]
1175 vld1.32 {d12}, [r0:64]! @ A[2][1]
1176 vld1.32 {d14}, [r0:64]! @ A[2][2]
1177 vld1.32 {d16}, [r0:64]! @ A[2][3]
1178 vld1.32 {d18}, [r0:64]! @ A[2][4]
1180 vld1.32 {d11}, [r0:64]! @ A[3][0]
1181 vld1.32 {d13}, [r0:64]! @ A[3][1]
1182 vld1.32 {d15}, [r0:64]! @ A[3][2]
1183 vld1.32 {d17}, [r0:64]! @ A[3][3]
1184 vld1.32 {d19}, [r0:64]! @ A[3][4]
1186 vld1.32 {d20-d23}, [r0:64]! @ A[4][0..3]
1187 vld1.32 {d24}, [r0:64] @ A[4][4]
1188 sub r0, r0, #24*8 @ rewind
1193 subs r12, r5, r6 @ len - bsz
1197 vld1.8 {d31}, [r4]! @ endian-neutral loads...
1199 veor d0, d0, d31 @ A[0][0] ^= *inp++
1202 veor d2, d2, d31 @ A[0][1] ^= *inp++
1206 veor d4, d4, d31 @ A[0][2] ^= *inp++
1209 veor d6, d6, d31 @ A[0][3] ^= *inp++
1213 veor d8, d8, d31 @ A[0][4] ^= *inp++
1217 veor d1, d1, d31 @ A[1][0] ^= *inp++
1221 veor d3, d3, d31 @ A[1][1] ^= *inp++
1224 veor d5, d5, d31 @ A[1][2] ^= *inp++
1228 veor d7, d7, d31 @ A[1][3] ^= *inp++
1231 veor d9, d9, d31 @ A[1][4] ^= *inp++
1236 veor d10, d10, d31 @ A[2][0] ^= *inp++
1239 veor d12, d12, d31 @ A[2][1] ^= *inp++
1243 veor d14, d14, d31 @ A[2][2] ^= *inp++
1246 veor d16, d16, d31 @ A[2][3] ^= *inp++
1250 veor d18, d18, d31 @ A[2][4] ^= *inp++
1254 veor d11, d11, d31 @ A[3][0] ^= *inp++
1258 veor d13, d13, d31 @ A[3][1] ^= *inp++
1261 veor d15, d15, d31 @ A[3][2] ^= *inp++
1265 veor d17, d17, d31 @ A[3][3] ^= *inp++
1268 veor d19, d19, d31 @ A[3][4] ^= *inp++
1273 veor d20, d20, d31 @ A[4][0] ^= *inp++
1276 veor d21, d21, d31 @ A[4][1] ^= *inp++
1280 veor d22, d22, d31 @ A[4][2] ^= *inp++
1283 veor d23, d23, d31 @ A[4][3] ^= *inp++
1286 veor d24, d24, d31 @ A[4][4] ^= *inp++
1294 vst1.32 {d0}, [r0:64]! @ A[0][0..4]
1295 vst1.32 {d2}, [r0:64]!
1296 vst1.32 {d4}, [r0:64]!
1297 vst1.32 {d6}, [r0:64]!
1298 vst1.32 {d8}, [r0:64]!
1300 vst1.32 {d1}, [r0:64]! @ A[1][0..4]
1301 vst1.32 {d3}, [r0:64]!
1302 vst1.32 {d5}, [r0:64]!
1303 vst1.32 {d7}, [r0:64]!
1304 vst1.32 {d9}, [r0:64]!
1306 vst1.32 {d10}, [r0:64]! @ A[2][0..4]
1307 vst1.32 {d12}, [r0:64]!
1308 vst1.32 {d14}, [r0:64]!
1309 vst1.32 {d16}, [r0:64]!
1310 vst1.32 {d18}, [r0:64]!
1312 vst1.32 {d11}, [r0:64]! @ A[3][0..4]
1313 vst1.32 {d13}, [r0:64]!
1314 vst1.32 {d15}, [r0:64]!
1315 vst1.32 {d17}, [r0:64]!
1316 vst1.32 {d19}, [r0:64]!
1318 vst1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1319 vst1.32 {d24}, [r0:64]
1321 mov r0, r5 @ return value
1322 vldmia sp!, {d8-d15}
1323 ldmia sp!, {r4-r6,pc}
1324 .size SHA3_absorb_neon,.-SHA3_absorb_neon
1326 .global SHA3_squeeze_neon
1327 .type SHA3_squeeze_neon, %function
1330 stmdb sp!, {r4-r6,lr}
1335 mov r12, r0 @ A_flat
1337 b .Loop_squeeze_neon
1342 blo .Lsqueeze_neon_tail
1343 vld1.32 {d0}, [r12]!
1344 vst1.8 {d0}, [r4]! @ endian-neutral store
1346 subs r5, r5, #8 @ len -= 8
1347 beq .Lsqueeze_neon_done
1349 subs r14, r14, #8 @ bsz -= 8
1350 bhi .Loop_squeeze_neon
1352 vstmdb sp!, {d8-d15}
1354 vld1.32 {d0}, [r0:64]! @ A[0][0..4]
1355 vld1.32 {d2}, [r0:64]!
1356 vld1.32 {d4}, [r0:64]!
1357 vld1.32 {d6}, [r0:64]!
1358 vld1.32 {d8}, [r0:64]!
1360 vld1.32 {d1}, [r0:64]! @ A[1][0..4]
1361 vld1.32 {d3}, [r0:64]!
1362 vld1.32 {d5}, [r0:64]!
1363 vld1.32 {d7}, [r0:64]!
1364 vld1.32 {d9}, [r0:64]!
1366 vld1.32 {d10}, [r0:64]! @ A[2][0..4]
1367 vld1.32 {d12}, [r0:64]!
1368 vld1.32 {d14}, [r0:64]!
1369 vld1.32 {d16}, [r0:64]!
1370 vld1.32 {d18}, [r0:64]!
1372 vld1.32 {d11}, [r0:64]! @ A[3][0..4]
1373 vld1.32 {d13}, [r0:64]!
1374 vld1.32 {d15}, [r0:64]!
1375 vld1.32 {d17}, [r0:64]!
1376 vld1.32 {d19}, [r0:64]!
1378 vld1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1379 vld1.32 {d24}, [r0:64]
1380 sub r0, r0, #24*8 @ rewind
1384 mov r12, r0 @ A_flat
1385 vst1.32 {d0}, [r0:64]! @ A[0][0..4]
1386 vst1.32 {d2}, [r0:64]!
1387 vst1.32 {d4}, [r0:64]!
1388 vst1.32 {d6}, [r0:64]!
1389 vst1.32 {d8}, [r0:64]!
1391 vst1.32 {d1}, [r0:64]! @ A[1][0..4]
1392 vst1.32 {d3}, [r0:64]!
1393 vst1.32 {d5}, [r0:64]!
1394 vst1.32 {d7}, [r0:64]!
1395 vst1.32 {d9}, [r0:64]!
1397 vst1.32 {d10}, [r0:64]! @ A[2][0..4]
1398 vst1.32 {d12}, [r0:64]!
1399 vst1.32 {d14}, [r0:64]!
1400 vst1.32 {d16}, [r0:64]!
1401 vst1.32 {d18}, [r0:64]!
1403 vst1.32 {d11}, [r0:64]! @ A[3][0..4]
1404 vst1.32 {d13}, [r0:64]!
1405 vst1.32 {d15}, [r0:64]!
1406 vst1.32 {d17}, [r0:64]!
1407 vst1.32 {d19}, [r0:64]!
1409 vst1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1411 vst1.32 {d24}, [r0:64]
1412 mov r0, r12 @ rewind
1414 vldmia sp!, {d8-d15}
1415 b .Loop_squeeze_neon
1418 .Lsqueeze_neon_tail:
1421 strb r2, [r4],#1 @ endian-neutral store
1423 blo .Lsqueeze_neon_done
1426 beq .Lsqueeze_neon_done
1430 blo .Lsqueeze_neon_done
1432 beq .Lsqueeze_neon_done
1437 blo .Lsqueeze_neon_done
1440 beq .Lsqueeze_neon_done
1443 .Lsqueeze_neon_done:
1444 ldmia sp!, {r4-r6,pc}
1445 .size SHA3_squeeze_neon,.-SHA3_squeeze_neon
1446 .asciz "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
# Flush and close STDOUT, and *check* the result: buffered-write errors
# (e.g. a full disk or broken pipe while emitting the generated assembly)
# only surface at close time, so an unchecked close could silently
# truncate the output file.
1452 close STDOUT or die "error closing STDOUT: $!"; # enforce flush