2 # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # [ABI- and endian-neutral] Keccak-1600 for C64x.
20 # This is a straightforward KECCAK_1X_ALT variant (see sha/keccak1600.c)
21 # with bit interleaving. 64-bit values are simply split between A- and
22 # B-files, with A-file holding least significant halves. This works
23 # out perfectly, because all operations including cross-communications
24 # [in rotate operations] are always complementary. Performance is
25 # [incredible for a 32-bit processor] 10.9 cycles per processed byte
26 # for r=1088, which corresponds to SHA3-256. This is >15x faster than
27 # compiler-generated KECCAK_1X_ALT code, and >10x faster than other variants.
28 # On average processor ends up issuing ~4.5 instructions per cycle...
# Register allocation for the 5x5 Keccak state.  $A[y][x] is a register
# number; the assembly templates refer to lane [y][x] as "A$A[y][x]" and
# "B$A[y][x]", i.e. the same-numbered register in each register file.
# Rows start at 5, 10, 16, 21 and 26 and take five consecutive numbers each.
30 my @A = map([ $_, ($_+1), ($_+2), ($_+3), ($_+4) ], (5,10,16,21,26));
# Row 1 would otherwise end at 14; lane [1][4] is moved to 31 instead.
31 $A[1][4] = 31; # B14 is reserved, A14 is used as iota[]
# Exchange the numbers of lanes [3][0] and [4][1] (21 <-> 27), so that
# $A[3][0]:$A[4][0] becomes 27:26.  The Theta code names exactly that
# combination as a single register pair in its STDW/LDDW offload/restore.
32 ($A[3][0],$A[4][1]) = ($A[4][1],$A[3][0]);
# Temporaries: C[0..4] use registers 0..4, while C[5] and C[6] reuse the
# registers of lanes [3][0] and [4][0] -- which is why Theta stores those two
# lanes to the stack before using C[5]/C[6] and reloads them afterwards.
33 my @C = (0..4,$A[3][0],$A[4][0]);
# Rho rotation amounts, indexed [y][x] in the same way as @A.  The Rho+Pi
# code passes $rhotates[y][x] to ROL64 together with the source lane
# $A[y][x] (or, for row 0, with the C[] temporary holding a copy of it).
36 my @rhotates = ([ 0, 1, 62, 28, 27 ],
37 [ 36, 44, 6, 55, 20 ],
38 [ 3, 10, 43, 25, 39 ],
39 [ 41, 45, 15, 21, 8 ],
40 [ 18, 2, 61, 56, 14 ]);
# ROL64 arguments: $src and $dst are register numbers (callers pass entries
# of @A or @C), $rot is the rotation amount (callers pass entries of
# @rhotates), and $p is text placed in front of the first emitted
# instruction -- callers pass "||" to issue it in parallel with the
# preceding instruction, or omit the argument.
# Two instruction pairs are visible below: a crossing one (B$src -> A$dst
# rotated by $rot/2+1, A$src -> B$dst rotated by $rot/2) and a straight one
# (A -> A and B -> B, both rotated by $rot/2).
# NOTE(review): the code that chooses between the two forms is not visible
# here; presumably odd vs. even $rot, matching the bit-interleaved layout
# described in the file header -- confirm.
# NOTE(review): "$rot/2" looks like template text in which only $rot is
# interpolated, leaving the division to the assembler -- confirm.
43 my ($src,$rot,$dst,$p) = @_;
47 $p ROTL B$src,$rot/2+1,A$dst
48 || ROTL A$src,$rot/2, B$dst
52 $p ROTL A$src,$rot/2,A$dst
53 || ROTL B$src,$rot/2,B$dst
58 ########################################################################
61 # SP--->+------+------+
63 # +1--->+------+------+<- -9 below 4 slots are used by KeccakF1600_int
65 # +2--->+------+------+<- -8
67 # +3--->+------+------+<- -7
68 # | A2 | A3 | A3:A2 are preserved by KeccakF1600_int
69 # +4--->+------+------+<- -6
70 # | B2 | B3 | B3:B2 are preserved by KeccakF1600_int
71 # +5--->+------+------+<- -5 below is ABI-compliant layout
73 # +6--->+------+------+<- -4
75 # +7--->+------+------+<- -3
77 # +8--->+------+------+<- -2
79 # +9--->+------+------+<- -1
81 # +------+------+<---FP
88 .if .ASSEMBLER_VERSION<7000000
93 .asg KeccakF1600,_KeccakF1600
94 .asg SHA3_absorb,_SHA3_absorb
95 .asg SHA3_squeeze,_SHA3_squeeze
109 ADDKPC _KeccakF1600_int,B0
110 || MVKL \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas
111 MVKH \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas
113 ADDKPC _KeccakF1600_int,B0
114 || MVKL (iotas-_KeccakF1600_int),$iotas
115 MVKH (iotas-_KeccakF1600_int),$iotas
119 XOR A$A[0][2],A$A[1][2],A$C[2] ; Theta
120 || XOR B$A[0][2],B$A[1][2],B$C[2]
121 || XOR A$A[0][3],A$A[1][3],A$C[3]
122 || XOR B$A[0][3],B$A[1][3],B$C[3]
123 || XOR A$A[0][0],A$A[1][0],A$C[0]
124 || XOR B$A[0][0],B$A[1][0],B$C[0]
125 XOR A$A[2][2],A$C[2],A$C[2]
126 || XOR B$A[2][2],B$C[2],B$C[2]
127 || XOR A$A[2][3],A$C[3],A$C[3]
128 || XOR B$A[2][3],B$C[3],B$C[3]
129 || XOR A$A[2][0],A$C[0],A$C[0]
130 || XOR B$A[2][0],B$C[0],B$C[0]
131 XOR A$A[3][2],A$C[2],A$C[2]
132 || XOR B$A[3][2],B$C[2],B$C[2]
133 || XOR A$A[3][3],A$C[3],A$C[3]
134 || XOR B$A[3][3],B$C[3],B$C[3]
135 || XOR A$A[3][0],A$C[0],A$C[0]
136 || XOR B$A[3][0],B$C[0],B$C[0]
137 XOR A$A[4][2],A$C[2],A$C[2]
138 || XOR B$A[4][2],B$C[2],B$C[2]
139 || XOR A$A[4][3],A$C[3],A$C[3]
140 || XOR B$A[4][3],B$C[3],B$C[3]
141 || XOR A$A[4][0],A$C[0],A$C[0]
142 || XOR B$A[4][0],B$C[0],B$C[0]
143 XOR A$A[0][4],A$A[1][4],A$C[4]
144 || XOR B$A[0][4],B$A[1][4],B$C[4]
145 || XOR A$A[0][1],A$A[1][1],A$C[1]
146 || XOR B$A[0][1],B$A[1][1],B$C[1]
147 || STDW A$A[3][0]:A$A[4][0],*SP[1] ; offload some data
148 STDW B$A[3][0]:B$A[4][0],*SP[2]
149 || XOR A$A[2][4],A$C[4],A$C[4]
150 || XOR B$A[2][4],B$C[4],B$C[4]
151 || XOR A$A[2][1],A$C[1],A$C[1]
152 || XOR B$A[2][1],B$C[1],B$C[1]
153 || ROTL B$C[2],1,A$C[5] ; ROL64(C[2],1)
154 || ROTL A$C[2],0,B$C[5]
155 XOR A$A[3][4],A$C[4],A$C[4]
156 || XOR B$A[3][4],B$C[4],B$C[4]
157 || XOR A$A[3][1],A$C[1],A$C[1]
158 || XOR B$A[3][1],B$C[1],B$C[1]
159 || ROTL B$C[3],1,A$C[6] ; ROL64(C[3],1)
160 || ROTL A$C[3],0,B$C[6]
161 XOR A$A[4][4],A$C[4],A$C[4]
162 || XOR B$A[4][4],B$C[4],B$C[4]
163 || XOR A$A[4][1],A$C[1],A$C[1]
164 || XOR B$A[4][1],B$C[1],B$C[1]
165 || XOR A$C[0],A$C[5],A$C[5] ; C[0] ^ ROL64(C[2],1)
166 || XOR B$C[0],B$C[5],B$C[5]
167 XOR A$C[5],A$A[0][1],A$A[0][1]
168 || XOR B$C[5],B$A[0][1],B$A[0][1]
169 || XOR A$C[5],A$A[1][1],A$A[1][1]
170 || XOR B$C[5],B$A[1][1],B$A[1][1]
171 || XOR A$C[5],A$A[2][1],A$A[2][1]
172 || XOR B$C[5],B$A[2][1],B$A[2][1]
173 XOR A$C[5],A$A[3][1],A$A[3][1]
174 || XOR B$C[5],B$A[3][1],B$A[3][1]
175 || XOR A$C[5],A$A[4][1],A$A[4][1]
176 || XOR B$C[5],B$A[4][1],B$A[4][1]
177 || ROTL B$C[4],1,A$C[5] ; ROL64(C[4],1)
178 || ROTL A$C[4],0,B$C[5]
179 || XOR A$C[1],A$C[6],A$C[6] ; C[1] ^ ROL64(C[3],1)
180 || XOR B$C[1],B$C[6],B$C[6]
181 XOR A$C[6],A$A[0][2],A$A[0][2]
182 || XOR B$C[6],B$A[0][2],B$A[0][2]
183 || XOR A$C[6],A$A[1][2],A$A[1][2]
184 || XOR B$C[6],B$A[1][2],B$A[1][2]
185 || XOR A$C[6],A$A[2][2],A$A[2][2]
186 || XOR B$C[6],B$A[2][2],B$A[2][2]
187 || ROTL B$C[1],1,A$C[1] ; ROL64(C[1],1)
188 || ROTL A$C[1],0,B$C[1]
189 XOR A$C[6],A$A[3][2],A$A[3][2]
190 || XOR B$C[6],B$A[3][2],B$A[3][2]
191 || XOR A$C[6],A$A[4][2],A$A[4][2]
192 || XOR B$C[6],B$A[4][2],B$A[4][2]
193 || ROTL B$C[0],1,A$C[6] ; ROL64(C[0],1)
194 || ROTL A$C[0],0,B$C[6]
195 || XOR A$C[5],A$C[2],A$C[2] ; C[2] ^= ROL64(C[4],1)
196 || XOR B$C[5],B$C[2],B$C[2]
197 XOR A$C[2],A$A[0][3],A$A[0][3]
198 || XOR B$C[2],B$A[0][3],B$A[0][3]
199 || XOR A$C[2],A$A[1][3],A$A[1][3]
200 || XOR B$C[2],B$A[1][3],B$A[1][3]
201 || XOR A$C[2],A$A[2][3],A$A[2][3]
202 || XOR B$C[2],B$A[2][3],B$A[2][3]
203 XOR A$C[6],A$C[3],A$C[3] ; C[3] ^= ROL64(C[0],1)
204 || XOR B$C[6],B$C[3],B$C[3]
205 || LDDW *FP[-9],A$A[3][0]:A$A[4][0] ; restore offloaded data
206 || LDDW *SP[2],B$A[3][0]:B$A[4][0]
207 || XOR A$C[2],A$A[3][3],A$A[3][3]
208 || XOR B$C[2],B$A[3][3],B$A[3][3]
209 XOR A$C[2],A$A[4][3],A$A[4][3]
210 || XOR B$C[2],B$A[4][3],B$A[4][3]
211 || XOR A$C[3],A$A[0][4],A$A[0][4]
212 || XOR B$C[3],B$A[0][4],B$A[0][4]
213 || XOR A$C[3],A$A[1][4],A$A[1][4]
214 || XOR B$C[3],B$A[1][4],B$A[1][4]
215 XOR A$C[3],A$A[2][4],A$A[2][4]
216 || XOR B$C[3],B$A[2][4],B$A[2][4]
217 || XOR A$C[3],A$A[3][4],A$A[3][4]
218 || XOR B$C[3],B$A[3][4],B$A[3][4]
219 || XOR A$C[3],A$A[4][4],A$A[4][4]
220 || XOR B$C[3],B$A[4][4],B$A[4][4]
221 XOR A$C[1],A$C[4],A$C[4] ; C[4] ^= ROL64(C[1],1)
222 || XOR B$C[1],B$C[4],B$C[4]
223 || MV A$A[0][1],A$C[1] ; Rho+Pi, "early start"
224 || MV B$A[0][1],B$C[1]
226 &ROL64 ($A[1][1],$rhotates[1][1],$A[0][1],"||");
228 XOR A$C[4],A$A[0][0],A$A[0][0]
229 || XOR B$C[4],B$A[0][0],B$A[0][0]
230 || XOR A$C[4],A$A[1][0],A$A[1][0]
231 || XOR B$C[4],B$A[1][0],B$A[1][0]
232 || MV A$A[0][3],A$C[3]
233 || MV B$A[0][3],B$C[3]
235 &ROL64 ($A[3][3],$rhotates[3][3],$A[0][3],"||");
237 XOR A$C[4],A$A[2][0],A$A[2][0]
238 || XOR B$C[4],B$A[2][0],B$A[2][0]
239 || XOR A$C[4],A$A[3][0],A$A[3][0]
240 || XOR B$C[4],B$A[3][0],B$A[3][0]
241 || MV A$A[0][2],A$C[2]
242 || MV B$A[0][2],B$C[2]
244 &ROL64 ($A[2][2],$rhotates[2][2],$A[0][2],"||");
246 XOR A$C[4],A$A[4][0],A$A[4][0]
247 || XOR B$C[4],B$A[4][0],B$A[4][0]
248 || MV A$A[0][4],A$C[4]
249 || MV B$A[0][4],B$C[4]
251 &ROL64 ($A[4][4],$rhotates[4][4],$A[0][4],"||");
253 &ROL64 ($A[1][4],$rhotates[1][4],$A[1][1]);
255 || LDW *${iotas}++[2],A$C[0]
257 &ROL64 ($A[2][3],$rhotates[2][3],$A[2][2]);
259 || LDW *${iotas}[-1],B$C[0]
# Remainder of Rho+Pi.  Each call emits a rotation of the lane given as 1st
# argument by the amount given as 2nd argument into the lane given as 3rd
# argument; no 4th argument is passed, so ROL64 gets no prefix text for the
# first instruction it emits.
# The order matters: every destination written here was already consumed as
# a source by an earlier ROL64 call, so no lane is overwritten before it has
# been read.
# The last three calls rotate C[1], C[4] and C[2] -- which hold the copies of
# A[0][1], A[0][4] and A[0][2] made by the earlier "MV" instructions -- into
# column 0.  The matching call for C[3] is commented out here and issued
# further down.
261 &ROL64 ($A[3][2],$rhotates[3][2],$A[3][3]);
262 &ROL64 ($A[4][1],$rhotates[4][1],$A[4][4]);
264 &ROL64 ($A[4][2],$rhotates[4][2],$A[1][4]);
265 &ROL64 ($A[3][4],$rhotates[3][4],$A[2][3]);
266 &ROL64 ($A[2][1],$rhotates[2][1],$A[3][2]);
267 &ROL64 ($A[1][3],$rhotates[1][3],$A[4][1]);
269 &ROL64 ($A[2][4],$rhotates[2][4],$A[4][2]);
270 &ROL64 ($A[4][3],$rhotates[4][3],$A[3][4]);
271 &ROL64 ($A[1][2],$rhotates[1][2],$A[2][1]);
272 &ROL64 ($A[3][1],$rhotates[3][1],$A[1][3]);
274 &ROL64 ($A[4][0],$rhotates[4][0],$A[2][4]);
275 &ROL64 ($A[3][0],$rhotates[3][0],$A[4][3]);
276 &ROL64 ($A[2][0],$rhotates[2][0],$A[1][2]);
277 &ROL64 ($A[1][0],$rhotates[1][0],$A[3][1]);
279 #&ROL64 ($C[3], $rhotates[0][3],$A[1][0]); # moved below
280 &ROL64 ($C[1], $rhotates[0][1],$A[2][0]);
281 &ROL64 ($C[4], $rhotates[0][4],$A[3][0]);
282 &ROL64 ($C[2], $rhotates[0][2],$A[4][0]);
284 || ANDN A$A[0][2],A$A[0][1],A$C[4] ; Chi+Iota
285 || ANDN B$A[0][2],B$A[0][1],B$C[4]
286 || ANDN A$A[0][3],A$A[0][2],A$C[1]
287 || ANDN B$A[0][3],B$A[0][2],B$C[1]
288 || ANDN A$A[0][4],A$A[0][3],A$C[2]
289 || ANDN B$A[0][4],B$A[0][3],B$C[2]
291 &ROL64 ($C[3], $rhotates[0][3],$A[1][0]);
293 || ANDN A$A[0][0],A$A[0][4],A$C[3]
294 || ANDN B$A[0][0],B$A[0][4],B$C[3]
295 || XOR A$C[4],A$A[0][0],A$A[0][0]
296 || XOR B$C[4],B$A[0][0],B$A[0][0]
297 || ANDN A$A[0][1],A$A[0][0],A$C[4]
298 || ANDN B$A[0][1],B$A[0][0],B$C[4]
299 XOR A$C[1],A$A[0][1],A$A[0][1]
300 || XOR B$C[1],B$A[0][1],B$A[0][1]
301 || XOR A$C[2],A$A[0][2],A$A[0][2]
302 || XOR B$C[2],B$A[0][2],B$A[0][2]
303 || XOR A$C[3],A$A[0][3],A$A[0][3]
304 || XOR B$C[3],B$A[0][3],B$A[0][3]
305 XOR A$C[4],A$A[0][4],A$A[0][4]
306 || XOR B$C[4],B$A[0][4],B$A[0][4]
307 || XOR A$C[0],A$A[0][0],A$A[0][0] ; A[0][0] ^= iotas[i++];
308 || XOR B$C[0],B$A[0][0],B$A[0][0]
309 || EXTU $iotas,24,24,A0 ; A0 is A$C[0], as we done?
311 ANDN A$A[1][2],A$A[1][1],A$C[4]
312 || ANDN B$A[1][2],B$A[1][1],B$C[4]
313 || ANDN A$A[1][3],A$A[1][2],A$C[1]
314 || ANDN B$A[1][3],B$A[1][2],B$C[1]
315 || ANDN A$A[1][4],A$A[1][3],A$C[2]
316 || ANDN B$A[1][4],B$A[1][3],B$C[2]
317 ANDN A$A[1][0],A$A[1][4],A$C[3]
318 || ANDN B$A[1][0],B$A[1][4],B$C[3]
319 || XOR A$C[4],A$A[1][0],A$A[1][0]
320 || XOR B$C[4],B$A[1][0],B$A[1][0]
321 || ANDN A$A[1][1],A$A[1][0],A$C[4]
322 || ANDN B$A[1][1],B$A[1][0],B$C[4]
323 XOR A$C[1],A$A[1][1],A$A[1][1]
324 || XOR B$C[1],B$A[1][1],B$A[1][1]
325 || XOR A$C[2],A$A[1][2],A$A[1][2]
326 || XOR B$C[2],B$A[1][2],B$A[1][2]
327 || XOR A$C[3],A$A[1][3],A$A[1][3]
328 || XOR B$C[3],B$A[1][3],B$A[1][3]
329 XOR A$C[4],A$A[1][4],A$A[1][4]
330 || XOR B$C[4],B$A[1][4],B$A[1][4]
332 || ANDN A$A[2][2],A$A[2][1],A$C[4]
333 || ANDN B$A[2][2],B$A[2][1],B$C[4]
334 || ANDN A$A[2][3],A$A[2][2],A$C[1]
335 || ANDN B$A[2][3],B$A[2][2],B$C[1]
336 ANDN A$A[2][4],A$A[2][3],A$C[2]
337 || ANDN B$A[2][4],B$A[2][3],B$C[2]
338 || ANDN A$A[2][0],A$A[2][4],A$C[3]
339 || ANDN B$A[2][0],B$A[2][4],B$C[3]
340 || XOR A$C[4],A$A[2][0],A$A[2][0]
341 || XOR B$C[4],B$A[2][0],B$A[2][0]
342 ANDN A$A[2][1],A$A[2][0],A$C[4]
343 || ANDN B$A[2][1],B$A[2][0],B$C[4]
344 || XOR A$C[1],A$A[2][1],A$A[2][1]
345 || XOR B$C[1],B$A[2][1],B$A[2][1]
346 || XOR A$C[2],A$A[2][2],A$A[2][2]
347 || XOR B$C[2],B$A[2][2],B$A[2][2]
348 XOR A$C[3],A$A[2][3],A$A[2][3]
349 || XOR B$C[3],B$A[2][3],B$A[2][3]
350 || XOR A$C[4],A$A[2][4],A$A[2][4]
351 || XOR B$C[4],B$A[2][4],B$A[2][4]
353 ANDN A$A[3][2],A$A[3][1],A$C[4]
354 || ANDN B$A[3][2],B$A[3][1],B$C[4]
355 || ANDN A$A[3][3],A$A[3][2],A$C[1]
356 || ANDN B$A[3][3],B$A[3][2],B$C[1]
357 || ANDN A$A[3][4],A$A[3][3],A$C[2]
358 || ANDN B$A[3][4],B$A[3][3],B$C[2]
359 ANDN A$A[3][0],A$A[3][4],A$C[3]
360 || ANDN B$A[3][0],B$A[3][4],B$C[3]
361 || XOR A$C[4],A$A[3][0],A$A[3][0]
362 || XOR B$C[4],B$A[3][0],B$A[3][0]
363 || ANDN A$A[3][1],A$A[3][0],A$C[4]
364 || ANDN B$A[3][1],B$A[3][0],B$C[4]
365 XOR A$C[1],A$A[3][1],A$A[3][1]
366 || XOR B$C[1],B$A[3][1],B$A[3][1]
367 || XOR A$C[2],A$A[3][2],A$A[3][2]
368 || XOR B$C[2],B$A[3][2],B$A[3][2]
369 || XOR A$C[3],A$A[3][3],A$A[3][3]
371 XOR B$C[3],B$A[3][3],B$A[3][3]
372 || XOR A$C[4],A$A[3][4],A$A[3][4]
373 || XOR B$C[4],B$A[3][4],B$A[3][4]
374 ||[!A0] LDDW *FP[-7],A3:A2
375 ||[!A0] LDDW *SP[4], RA:B2
377 ANDN A$A[4][2],A$A[4][1],A$C[4]
378 || ANDN B$A[4][2],B$A[4][1],B$C[4]
379 || ANDN A$A[4][3],A$A[4][2],A$C[1]
380 || ANDN B$A[4][3],B$A[4][2],B$C[1]
381 || ANDN A$A[4][4],A$A[4][3],A$C[2]
382 || ANDN B$A[4][4],B$A[4][3],B$C[2]
383 ANDN A$A[4][0],A$A[4][4],A$C[3]
384 || ANDN B$A[4][0],B$A[4][4],B$C[3]
385 || XOR A$C[4],A$A[4][0],A$A[4][0]
386 || XOR B$C[4],B$A[4][0],B$A[4][0]
387 || ANDN A$A[4][1],A$A[4][0],A$C[4]
388 || ANDN B$A[4][1],B$A[4][0],B$C[4]
389 XOR A$C[1],A$A[4][1],A$A[4][1]
390 || XOR B$C[1],B$A[4][1],B$A[4][1]
391 || XOR A$C[2],A$A[4][2],A$A[4][2]
392 || XOR B$C[2],B$A[4][2],B$A[4][2]
393 || XOR A$C[3],A$A[4][3],A$A[4][3]
394 || XOR B$C[3],B$A[4][3],B$A[4][3]
395 XOR A$C[4],A$A[4][4],A$A[4][4]
396 || XOR B$C[4],B$A[4][4],B$A[4][4]
397 ;;===== branch to loop? is taken here
406 .asmfunc stack_usage(80)
407 STW FP,*SP--(80) ; save frame pointer
410 || STDW A13:A12,*FP[-4]
412 || STDW A11:A10,*FP[-5]
418 LDW *A2++[2],A$A[0][0] ; load A[5][5]
419 || LDW *B2++[2],B$A[0][0]
420 LDW *A2++[2],A$A[0][1]
421 || LDW *B2++[2],B$A[0][1]
422 LDW *A2++[2],A$A[0][2]
423 || LDW *B2++[2],B$A[0][2]
424 LDW *A2++[2],A$A[0][3]
425 || LDW *B2++[2],B$A[0][3]
426 LDW *A2++[2],A$A[0][4]
427 || LDW *B2++[2],B$A[0][4]
429 LDW *A2++[2],A$A[1][0]
430 || LDW *B2++[2],B$A[1][0]
431 LDW *A2++[2],A$A[1][1]
432 || LDW *B2++[2],B$A[1][1]
433 LDW *A2++[2],A$A[1][2]
434 || LDW *B2++[2],B$A[1][2]
435 LDW *A2++[2],A$A[1][3]
436 || LDW *B2++[2],B$A[1][3]
437 LDW *A2++[2],A$A[1][4]
438 || LDW *B2++[2],B$A[1][4]
440 LDW *A2++[2],A$A[2][0]
441 || LDW *B2++[2],B$A[2][0]
442 LDW *A2++[2],A$A[2][1]
443 || LDW *B2++[2],B$A[2][1]
444 LDW *A2++[2],A$A[2][2]
445 || LDW *B2++[2],B$A[2][2]
446 LDW *A2++[2],A$A[2][3]
447 || LDW *B2++[2],B$A[2][3]
448 LDW *A2++[2],A$A[2][4]
449 || LDW *B2++[2],B$A[2][4]
451 LDW *A2++[2],A$A[3][0]
452 || LDW *B2++[2],B$A[3][0]
453 LDW *A2++[2],A$A[3][1]
454 || LDW *B2++[2],B$A[3][1]
455 LDW *A2++[2],A$A[3][2]
456 || LDW *B2++[2],B$A[3][2]
457 LDW *A2++[2],A$A[3][3]
458 || LDW *B2++[2],B$A[3][3]
459 LDW *A2++[2],A$A[3][4]
460 || LDW *B2++[2],B$A[3][4]
461 || BNOP _KeccakF1600_int
464 || LDW *A2++[2],A$A[4][0]
465 || LDW *B2++[2],B$A[4][0]
466 LDW *A2++[2],A$A[4][1]
467 || LDW *B2++[2],B$A[4][1]
468 LDW *A2++[2],A$A[4][2]
469 || LDW *B2++[2],B$A[4][2]
470 LDW *A2++[2],A$A[4][3]
471 || LDW *B2++[2],B$A[4][3]
474 || ADDK -192,A2 ; rewind
479 STW A$A[0][0],*A2++[2] ; store A[5][5]
480 || STW B$A[0][0],*B2++[2]
481 STW A$A[0][1],*A2++[2]
482 || STW B$A[0][1],*B2++[2]
483 STW A$A[0][2],*A2++[2]
484 || STW B$A[0][2],*B2++[2]
485 STW A$A[0][3],*A2++[2]
486 || STW B$A[0][3],*B2++[2]
487 STW A$A[0][4],*A2++[2]
488 || STW B$A[0][4],*B2++[2]
490 STW A$A[1][0],*A2++[2]
491 || STW B$A[1][0],*B2++[2]
492 STW A$A[1][1],*A2++[2]
493 || STW B$A[1][1],*B2++[2]
494 STW A$A[1][2],*A2++[2]
495 || STW B$A[1][2],*B2++[2]
496 STW A$A[1][3],*A2++[2]
497 || STW B$A[1][3],*B2++[2]
498 STW A$A[1][4],*A2++[2]
499 || STW B$A[1][4],*B2++[2]
501 STW A$A[2][0],*A2++[2]
502 || STW B$A[2][0],*B2++[2]
503 STW A$A[2][1],*A2++[2]
504 || STW B$A[2][1],*B2++[2]
505 STW A$A[2][2],*A2++[2]
506 || STW B$A[2][2],*B2++[2]
507 STW A$A[2][3],*A2++[2]
508 || STW B$A[2][3],*B2++[2]
509 STW A$A[2][4],*A2++[2]
510 || STW B$A[2][4],*B2++[2]
512 STW A$A[3][0],*A2++[2]
513 || STW B$A[3][0],*B2++[2]
514 STW A$A[3][1],*A2++[2]
515 || STW B$A[3][1],*B2++[2]
516 STW A$A[3][2],*A2++[2]
517 || STW B$A[3][2],*B2++[2]
518 STW A$A[3][3],*A2++[2]
519 || STW B$A[3][3],*B2++[2]
520 STW A$A[3][4],*A2++[2]
521 || STW B$A[3][4],*B2++[2]
526 STW A$A[4][0],*A2++[2]
527 || STW B$A[4][0],*B2++[2]
528 STW A$A[4][1],*A2++[2]
529 || STW B$A[4][1],*B2++[2]
530 STW A$A[4][2],*A2++[2]
531 || STW B$A[4][2],*B2++[2]
532 STW A$A[4][3],*A2++[2]
533 || STW B$A[4][3],*B2++[2]
536 || ADDK -192,A2 ; rewind
538 MV A2,A4 ; return original A4
539 || LDDW *SP[8], B11:B10
540 || LDDW *FP[-5],A11:A10
542 || LDDW *FP[-4],A13:A12
544 LDW *++SP(80),FP ; restore frame pointer
545 NOP 4 ; wait till FP is committed
555 .asmfunc stack_usage(80)
556 STW FP,*SP--(80) ; save frame pointer
559 || STDW A13:A12,*FP[-4]
561 || STDW A11:A10,*FP[-5]
565 STW A4,*SP[1] ; save A[][]
566 || MV B4,INP ; reassign arguments
571 LDW *A4++[2],A$A[0][0] ; load A[5][5]
572 || LDW *B4++[2],B$A[0][0]
573 LDW *A4++[2],A$A[0][1]
574 || LDW *B4++[2],B$A[0][1]
575 LDW *A4++[2],A$A[0][2]
576 || LDW *B4++[2],B$A[0][2]
577 LDW *A4++[2],A$A[0][3]
578 || LDW *B4++[2],B$A[0][3]
579 LDW *A4++[2],A$A[0][4]
580 || LDW *B4++[2],B$A[0][4]
582 LDW *A4++[2],A$A[1][0]
583 || LDW *B4++[2],B$A[1][0]
584 LDW *A4++[2],A$A[1][1]
585 || LDW *B4++[2],B$A[1][1]
586 LDW *A4++[2],A$A[1][2]
587 || LDW *B4++[2],B$A[1][2]
588 LDW *A4++[2],A$A[1][3]
589 || LDW *B4++[2],B$A[1][3]
590 LDW *A4++[2],A$A[1][4]
591 || LDW *B4++[2],B$A[1][4]
593 LDW *A4++[2],A$A[2][0]
594 || LDW *B4++[2],B$A[2][0]
595 LDW *A4++[2],A$A[2][1]
596 || LDW *B4++[2],B$A[2][1]
597 LDW *A4++[2],A$A[2][2]
598 || LDW *B4++[2],B$A[2][2]
599 LDW *A4++[2],A$A[2][3]
600 || LDW *B4++[2],B$A[2][3]
601 LDW *A4++[2],A$A[2][4]
602 || LDW *B4++[2],B$A[2][4]
604 LDW *A4++[2],A$A[3][0]
605 || LDW *B4++[2],B$A[3][0]
606 LDW *A4++[2],A$A[3][1]
607 || LDW *B4++[2],B$A[3][1]
608 LDW *A4++[2],A$A[3][2]
609 || LDW *B4++[2],B$A[3][2]
610 LDW *A4++[2],A$A[3][3]
611 || LDW *B4++[2],B$A[3][3]
612 LDW *A4++[2],A$A[3][4]
613 || LDW *B4++[2],B$A[3][4]
615 LDW *A4++[2],A$A[4][0]
616 || LDW *B4++[2],B$A[4][0]
617 LDW *A4++[2],A$A[4][1]
618 || LDW *B4++[2],B$A[4][1]
619 LDW *A4++[2],A$A[4][2]
620 || LDW *B4++[2],B$A[4][2]
621 LDW *A4++[2],A$A[4][3]
622 || LDW *B4++[2],B$A[4][3]
629 CMPLTU LEN,BSZ,A0 ; len < bsz?
633 ||[A0] LDW *SP[1],A2 ; pull A[][]
634 [BSZ] LDNDW *INP++,A1:A0
635 ||[BSZ] SUB LEN,8,LEN
636 ||[BSZ] SUB BSZ,1,BSZ
# Unrolled absorb step: one copy of the template per state lane.  The loops
# visit [y][x] for y = 0..4 and x = 0..4, except that row 4 stops at x = 3,
# i.e. 24 lanes; lane [4][4] gets its own XOR pair after the loop.
# Per lane, the visible template lines: branch to _KeccakF1600_cheat and save
# LEN:INP at SP[3] once BSZ has reached zero; otherwise load the next 8 input
# bytes into A1:A0 and decrement BSZ; then XOR A0 into the A-file register
# and A1 into the B-file register of lane [y][x].
# NOTE(review): the BNOP/STDW pair appears twice, once with a leading "||"
# and once without; presumably these are alternatives under an assembler
# conditional that is not visible here -- confirm.
639 for ($y = 0; $y < 5; $y++) {
640 for ($x = 0; $x < ($y<4 ? 5 : 4); $x++) {
647 ||[!BSZ]BNOP _KeccakF1600_cheat
648 ||[!BSZ]STDW LEN:INP,*SP[3]
651 [!BSZ]BNOP _KeccakF1600_cheat
652 ||[!BSZ]STDW LEN:INP,*SP[3]
655 [BSZ] LDNDW *INP++,A1:A0
658 ||[BSZ] SUB BSZ,1,BSZ
661 XOR A0,A$A[$y][$x],A$A[$y][$x]
662 XOR A1,B$A[$y][$x],B$A[$y][$x]
673 BNOP _KeccakF1600_cheat
674 || STDW LEN:INP,*SP[3]
680 XOR A0,A$A[4][4],A$A[4][4]
681 XOR A1,B$A[4][4],B$A[4][4]
685 MV LEN,A4 ; return value
688 STW A$A[0][0],*A2++[2] ; store A[5][5]
689 || STW B$A[0][0],*B2++[2]
690 STW A$A[0][1],*A2++[2]
691 || STW B$A[0][1],*B2++[2]
692 STW A$A[0][2],*A2++[2]
693 || STW B$A[0][2],*B2++[2]
694 STW A$A[0][3],*A2++[2]
695 || STW B$A[0][3],*B2++[2]
696 STW A$A[0][4],*A2++[2]
697 || STW B$A[0][4],*B2++[2]
699 STW A$A[1][0],*A2++[2]
700 || STW B$A[1][0],*B2++[2]
701 STW A$A[1][1],*A2++[2]
702 || STW B$A[1][1],*B2++[2]
703 STW A$A[1][2],*A2++[2]
704 || STW B$A[1][2],*B2++[2]
705 STW A$A[1][3],*A2++[2]
706 || STW B$A[1][3],*B2++[2]
707 STW A$A[1][4],*A2++[2]
708 || STW B$A[1][4],*B2++[2]
710 STW A$A[2][0],*A2++[2]
711 || STW B$A[2][0],*B2++[2]
712 STW A$A[2][1],*A2++[2]
713 || STW B$A[2][1],*B2++[2]
714 STW A$A[2][2],*A2++[2]
715 || STW B$A[2][2],*B2++[2]
716 STW A$A[2][3],*A2++[2]
717 || STW B$A[2][3],*B2++[2]
718 STW A$A[2][4],*A2++[2]
719 || STW B$A[2][4],*B2++[2]
724 STW A$A[3][0],*A2++[2]
725 || STW B$A[3][0],*B2++[2]
726 STW A$A[3][1],*A2++[2]
727 || STW B$A[3][1],*B2++[2]
728 STW A$A[3][2],*A2++[2]
729 || STW B$A[3][2],*B2++[2]
730 STW A$A[3][3],*A2++[2]
731 || STW B$A[3][3],*B2++[2]
732 STW A$A[3][4],*A2++[2]
733 || STW B$A[3][4],*B2++[2]
736 || LDDW *FP[-5],A11:A10
738 || LDDW *FP[-4],A13:A12
740 || LDW *++SP(80),FP ; restore frame pointer
742 STW A$A[4][0],*A2++[2]
743 || STW B$A[4][0],*B2++[2]
744 STW A$A[4][1],*A2++[2]
745 || STW B$A[4][1],*B2++[2]
746 STW A$A[4][2],*A2++[2]
747 || STW B$A[4][2],*B2++[2]
748 STW A$A[4][3],*A2++[2]
749 || STW B$A[4][3],*B2++[2]
750 STW A$A[4][4],*A2++[2]
751 || STW B$A[4][4],*B2++[2]
755 .global _SHA3_squeeze
761 .asmfunc stack_usage(24)
762 STW FP,*SP--(24) ; save frame pointer
767 || MV B4,OUT ; reassign arguments
772 LDW *SP[5],RA ; reload RA
777 CMPLTU LEN,8,A0 ; len < 8?
781 || SUB LEN,8,LEN ; len -= 8
783 || SUB A1,1,A1 ; bsz--
841 LDW *++SP(24),FP ; restore frame pointer
842 NOP 4 ; wait till FP is committed
846 .sect ".text:sha_asm.const"
848 .sect ".const:sha_asm"
851 .uword 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
853 .uword 0x00000001, 0x00000000
854 .uword 0x00000000, 0x00000089
855 .uword 0x00000000, 0x8000008b
856 .uword 0x00000000, 0x80008080
857 .uword 0x00000001, 0x0000008b
858 .uword 0x00000001, 0x00008000
859 .uword 0x00000001, 0x80008088
860 .uword 0x00000001, 0x80000082
861 .uword 0x00000000, 0x0000000b
862 .uword 0x00000000, 0x0000000a
863 .uword 0x00000001, 0x00008082
864 .uword 0x00000000, 0x00008003
865 .uword 0x00000001, 0x0000808b
866 .uword 0x00000001, 0x8000000b
867 .uword 0x00000001, 0x8000008a
868 .uword 0x00000001, 0x80000081
869 .uword 0x00000000, 0x80000081
870 .uword 0x00000000, 0x80000008
871 .uword 0x00000000, 0x00000083
872 .uword 0x00000000, 0x80008003
873 .uword 0x00000001, 0x80008088
874 .uword 0x00000000, 0x80000088
875 .uword 0x00000001, 0x00008000
876 .uword 0x00000000, 0x80008082
878 .cstring "Keccak-1600 absorb and squeeze for C64x, CRYPTOGAMS by <appro\@openssl.org>"
# Take the last command-line argument, if any, as the output file name and
# redirect STDOUT to it; without an argument the output stays on STDOUT.
# The three-argument form of open keeps characters in the file name from
# being interpreted as part of the open mode, and a failed open is now fatal
# instead of being silently ignored (which would lose the generated output).
$output = pop;
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}