/*
 * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the OpenSSL license (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */
10 #include <openssl/e_os2.h>
14 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
16 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r);
18 #ifndef KECCAK1600_ASM
20 #if defined(__x86_64__) || defined(__aarch64__) || \
21 defined(__mips64) || defined(__ia64) || \
22 (defined(__VMS) && !defined(__vax))
24 * These are available even in ILP32 flavours, but even then they are
25 * capable of performing 64-bit operations as efficiently as in *P64.
26 * Since it's not given that we can use sizeof(void *), just shunt it.
28 # define BIT_INTERLEAVE (0)
30 # define BIT_INTERLEAVE (sizeof(void *) < 8)
33 #define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31)))
35 static uint64_t ROL64(uint64_t val, int offset)
39 } else if (!BIT_INTERLEAVE) {
40 return (val << offset) | (val >> (64-offset));
42 uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;
48 hi = ROL32(lo, offset);
49 lo = ROL32(tmp, offset + 1);
52 lo = ROL32(lo, offset);
53 hi = ROL32(hi, offset);
56 return ((uint64_t)hi << 32) | lo;
60 static const unsigned char rhotates[5][5] = {
62 { 36, 44, 6, 55, 20 },
63 { 3, 10, 43, 25, 39 },
64 { 41, 45, 15, 21, 8 },
68 static const uint64_t iotas[] = {
69 BIT_INTERLEAVE ? 0x0000000000000001U : 0x0000000000000001U,
70 BIT_INTERLEAVE ? 0x0000008900000000U : 0x0000000000008082U,
71 BIT_INTERLEAVE ? 0x8000008b00000000U : 0x800000000000808aU,
72 BIT_INTERLEAVE ? 0x8000808000000000U : 0x8000000080008000U,
73 BIT_INTERLEAVE ? 0x0000008b00000001U : 0x000000000000808bU,
74 BIT_INTERLEAVE ? 0x0000800000000001U : 0x0000000080000001U,
75 BIT_INTERLEAVE ? 0x8000808800000001U : 0x8000000080008081U,
76 BIT_INTERLEAVE ? 0x8000008200000001U : 0x8000000000008009U,
77 BIT_INTERLEAVE ? 0x0000000b00000000U : 0x000000000000008aU,
78 BIT_INTERLEAVE ? 0x0000000a00000000U : 0x0000000000000088U,
79 BIT_INTERLEAVE ? 0x0000808200000001U : 0x0000000080008009U,
80 BIT_INTERLEAVE ? 0x0000800300000000U : 0x000000008000000aU,
81 BIT_INTERLEAVE ? 0x0000808b00000001U : 0x000000008000808bU,
82 BIT_INTERLEAVE ? 0x8000000b00000001U : 0x800000000000008bU,
83 BIT_INTERLEAVE ? 0x8000008a00000001U : 0x8000000000008089U,
84 BIT_INTERLEAVE ? 0x8000008100000001U : 0x8000000000008003U,
85 BIT_INTERLEAVE ? 0x8000008100000000U : 0x8000000000008002U,
86 BIT_INTERLEAVE ? 0x8000000800000000U : 0x8000000000000080U,
87 BIT_INTERLEAVE ? 0x0000008300000000U : 0x000000000000800aU,
88 BIT_INTERLEAVE ? 0x8000800300000000U : 0x800000008000000aU,
89 BIT_INTERLEAVE ? 0x8000808800000001U : 0x8000000080008081U,
90 BIT_INTERLEAVE ? 0x8000008800000000U : 0x8000000000008080U,
91 BIT_INTERLEAVE ? 0x0000800000000001U : 0x0000000080000001U,
92 BIT_INTERLEAVE ? 0x8000808200000000U : 0x8000000080008008U
#if defined(KECCAK_REF)
/*
 * This is straightforward or "maximum clarity" implementation aiming
 * to resemble section 3.2 of the FIPS PUB 202 "SHA-3 Standard:
 * Permutation-Based Hash and Extendible-Output Functions" as much as
 * possible. With one caveat. Because of the way C stores matrices,
 * references to A[x,y] in the specification are presented as A[y][x].
 * Implementation unrolls inner x-loops so that modulo 5 operations are
 * explicitly pre-computed.
 */
105 static void Theta(uint64_t A[5][5])
116 for (y = 1; y < 5; y++) {
124 D[0] = ROL64(C[1], 1) ^ C[4];
125 D[1] = ROL64(C[2], 1) ^ C[0];
126 D[2] = ROL64(C[3], 1) ^ C[1];
127 D[3] = ROL64(C[4], 1) ^ C[2];
128 D[4] = ROL64(C[0], 1) ^ C[3];
130 for (y = 0; y < 5; y++) {
139 static void Rho(uint64_t A[5][5])
143 for (y = 0; y < 5; y++) {
144 A[y][0] = ROL64(A[y][0], rhotates[y][0]);
145 A[y][1] = ROL64(A[y][1], rhotates[y][1]);
146 A[y][2] = ROL64(A[y][2], rhotates[y][2]);
147 A[y][3] = ROL64(A[y][3], rhotates[y][3]);
148 A[y][4] = ROL64(A[y][4], rhotates[y][4]);
152 static void Pi(uint64_t A[5][5])
158 * A[y][x] = T[x][(3*y+x)%5]
160 memcpy(T, A, sizeof(T));
193 static void Chi(uint64_t A[5][5])
198 for (y = 0; y < 5; y++) {
199 C[0] = A[y][0] ^ (~A[y][1] & A[y][2]);
200 C[1] = A[y][1] ^ (~A[y][2] & A[y][3]);
201 C[2] = A[y][2] ^ (~A[y][3] & A[y][4]);
202 C[3] = A[y][3] ^ (~A[y][4] & A[y][0]);
203 C[4] = A[y][4] ^ (~A[y][0] & A[y][1]);
213 static void Iota(uint64_t A[5][5], size_t i)
215 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
219 static void KeccakF1600(uint64_t A[5][5])
223 for (i = 0; i < 24; i++) {
#elif defined(KECCAK_1X)
/*
 * This implementation is optimization of above code featuring unroll
 * of even y-loops, their fusion and code motion. It also minimizes
 * temporary storage. Compiler would normally do all these things for
 * you, purpose of manual optimization is to provide "unobscured"
 * reference for assembly implementation [in case this approach is
 * chosen for implementation on some platform]. In the nutshell it's
 * equivalent of "plane-per-plane processing" approach discussed in
 * section 2.4 of "Keccak implementation overview".
 */
243 static void Round(uint64_t A[5][5], size_t i)
245 uint64_t C[5], E[2]; /* registers */
246 uint64_t D[5], T[2][5]; /* memory */
248 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
250 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
251 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
252 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
253 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
254 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
257 D[1] = E[0] = ROL64(C[2], 1) ^ C[0];
258 D[4] = E[1] = ROL64(C[0], 1) ^ C[3];
259 D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
260 D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
261 D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
263 T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */
264 T[0][1] = A[0][1] ^ E[0]; /* D[1] */
265 T[0][2] = A[0][2] ^ C[1]; /* D[2] */
266 T[0][3] = A[0][3] ^ C[2]; /* D[3] */
267 T[0][4] = A[0][4] ^ E[1]; /* D[4] */
269 C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
270 C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
271 C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
272 C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
273 C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
275 D[0] = ROL64(C[1], 1) ^ C[4];
276 D[1] = ROL64(C[2], 1) ^ C[0];
277 D[2] = ROL64(C[3], 1) ^ C[1];
278 D[3] = ROL64(C[4], 1) ^ C[2];
279 D[4] = ROL64(C[0], 1) ^ C[3];
281 T[0][0] = A[3][0] ^ D[0]; /* borrow T[0][0] */
282 T[0][1] = A[0][1] ^ D[1];
283 T[0][2] = A[0][2] ^ D[2];
284 T[0][3] = A[0][3] ^ D[3];
285 T[0][4] = A[0][4] ^ D[4];
287 C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
288 C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
289 C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
290 C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
291 C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
293 A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
294 A[0][1] = C[1] ^ (~C[2] & C[3]);
295 A[0][2] = C[2] ^ (~C[3] & C[4]);
296 A[0][3] = C[3] ^ (~C[4] & C[0]);
297 A[0][4] = C[4] ^ (~C[0] & C[1]);
299 T[1][0] = A[1][0] ^ (C[3] = D[0]);
300 T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */
301 T[1][2] = A[1][2] ^ (E[0] = D[2]);
302 T[1][3] = A[1][3] ^ (E[1] = D[3]);
303 T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */
305 C[0] = ROL64(T[0][3], rhotates[0][3]);
306 C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]); /* D[4] */
307 C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]); /* D[0] */
308 C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]); /* D[1] */
309 C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]); /* D[2] */
311 A[1][0] = C[0] ^ (~C[1] & C[2]);
312 A[1][1] = C[1] ^ (~C[2] & C[3]);
313 A[1][2] = C[2] ^ (~C[3] & C[4]);
314 A[1][3] = C[3] ^ (~C[4] & C[0]);
315 A[1][4] = C[4] ^ (~C[0] & C[1]);
317 C[0] = ROL64(T[0][1], rhotates[0][1]);
318 C[1] = ROL64(T[1][2], rhotates[1][2]);
319 C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
320 C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
321 C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
323 A[2][0] = C[0] ^ (~C[1] & C[2]);
324 A[2][1] = C[1] ^ (~C[2] & C[3]);
325 A[2][2] = C[2] ^ (~C[3] & C[4]);
326 A[2][3] = C[3] ^ (~C[4] & C[0]);
327 A[2][4] = C[4] ^ (~C[0] & C[1]);
329 C[0] = ROL64(T[0][4], rhotates[0][4]);
330 C[1] = ROL64(T[1][0], rhotates[1][0]);
331 C[2] = ROL64(T[1][1], rhotates[2][1]); /* originally A[2][1] */
332 C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
333 C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
335 A[3][0] = C[0] ^ (~C[1] & C[2]);
336 A[3][1] = C[1] ^ (~C[2] & C[3]);
337 A[3][2] = C[2] ^ (~C[3] & C[4]);
338 A[3][3] = C[3] ^ (~C[4] & C[0]);
339 A[3][4] = C[4] ^ (~C[0] & C[1]);
341 C[0] = ROL64(T[0][2], rhotates[0][2]);
342 C[1] = ROL64(T[1][3], rhotates[1][3]);
343 C[2] = ROL64(T[1][4], rhotates[2][4]); /* originally A[2][4] */
344 C[3] = ROL64(T[0][0], rhotates[3][0]); /* originally A[3][0] */
345 C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
347 A[4][0] = C[0] ^ (~C[1] & C[2]);
348 A[4][1] = C[1] ^ (~C[2] & C[3]);
349 A[4][2] = C[2] ^ (~C[3] & C[4]);
350 A[4][3] = C[3] ^ (~C[4] & C[0]);
351 A[4][4] = C[4] ^ (~C[0] & C[1]);
354 static void KeccakF1600(uint64_t A[5][5])
358 for (i = 0; i < 24; i++) {
#elif defined(KECCAK_1X_ALT)
/*
 * This is variant of above KECCAK_1X that reduces requirement for
 * temporary storage even further, but at cost of more updates to A[][].
 * It's less suitable if A[][] is memory bound, but better if it's
 * register bound.
 */
371 static void Round(uint64_t A[5][5], size_t i)
375 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
377 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
378 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
379 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
380 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
381 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
383 D[1] = C[0] ^ ROL64(C[2], 1);
384 D[2] = C[1] ^ ROL64(C[3], 1);
385 D[3] = C[2] ^= ROL64(C[4], 1);
386 D[4] = C[3] ^= ROL64(C[0], 1);
387 D[0] = C[4] ^= ROL64(C[1], 1);
424 A[0][1] = ROL64(A[1][1], rhotates[1][1]);
425 A[0][2] = ROL64(A[2][2], rhotates[2][2]);
426 A[0][3] = ROL64(A[3][3], rhotates[3][3]);
427 A[0][4] = ROL64(A[4][4], rhotates[4][4]);
429 A[1][1] = ROL64(A[1][4], rhotates[1][4]);
430 A[2][2] = ROL64(A[2][3], rhotates[2][3]);
431 A[3][3] = ROL64(A[3][2], rhotates[3][2]);
432 A[4][4] = ROL64(A[4][1], rhotates[4][1]);
434 A[1][4] = ROL64(A[4][2], rhotates[4][2]);
435 A[2][3] = ROL64(A[3][4], rhotates[3][4]);
436 A[3][2] = ROL64(A[2][1], rhotates[2][1]);
437 A[4][1] = ROL64(A[1][3], rhotates[1][3]);
439 A[4][2] = ROL64(A[2][4], rhotates[2][4]);
440 A[3][4] = ROL64(A[4][3], rhotates[4][3]);
441 A[2][1] = ROL64(A[1][2], rhotates[1][2]);
442 A[1][3] = ROL64(A[3][1], rhotates[3][1]);
444 A[2][4] = ROL64(A[4][0], rhotates[4][0]);
445 A[4][3] = ROL64(A[3][0], rhotates[3][0]);
446 A[1][2] = ROL64(A[2][0], rhotates[2][0]);
447 A[3][1] = ROL64(A[1][0], rhotates[1][0]);
449 A[1][0] = ROL64(C[3], rhotates[0][3]);
450 A[2][0] = ROL64(C[1], rhotates[0][1]);
451 A[3][0] = ROL64(C[4], rhotates[0][4]);
452 A[4][0] = ROL64(C[2], rhotates[0][2]);
459 A[0][0] ^= (~A[0][1] & A[0][2]);
460 A[1][0] ^= (~A[1][1] & A[1][2]);
461 A[0][1] ^= (~A[0][2] & A[0][3]);
462 A[1][1] ^= (~A[1][2] & A[1][3]);
463 A[0][2] ^= (~A[0][3] & A[0][4]);
464 A[1][2] ^= (~A[1][3] & A[1][4]);
465 A[0][3] ^= (~A[0][4] & C[0]);
466 A[1][3] ^= (~A[1][4] & C[1]);
467 A[0][4] ^= (~C[0] & D[0]);
468 A[1][4] ^= (~C[1] & D[1]);
475 A[2][0] ^= (~A[2][1] & A[2][2]);
476 A[3][0] ^= (~A[3][1] & A[3][2]);
477 A[2][1] ^= (~A[2][2] & A[2][3]);
478 A[3][1] ^= (~A[3][2] & A[3][3]);
479 A[2][2] ^= (~A[2][3] & A[2][4]);
480 A[3][2] ^= (~A[3][3] & A[3][4]);
481 A[2][3] ^= (~A[2][4] & C[2]);
482 A[3][3] ^= (~A[3][4] & C[3]);
483 A[2][4] ^= (~C[2] & D[2]);
484 A[3][4] ^= (~C[3] & D[3]);
489 A[4][0] ^= (~A[4][1] & A[4][2]);
490 A[4][1] ^= (~A[4][2] & A[4][3]);
491 A[4][2] ^= (~A[4][3] & A[4][4]);
492 A[4][3] ^= (~A[4][4] & C[4]);
493 A[4][4] ^= (~C[4] & D[4]);
497 static void KeccakF1600(uint64_t A[5][5])
501 for (i = 0; i < 24; i++) {
#elif defined(KECCAK_2X)
/*
 * This implementation is variant of KECCAK_1X above with outer-most
 * round loop unrolled twice. This allows to take temporary storage
 * out of round procedure and simplify references to it by alternating
 * it with actual data (see round loop below). Just like original, it's
 * rather meant as reference for an assembly implementation. It's likely
 * to provide best instruction per processed byte ratio at minimal
 * round unroll factor...
 */
516 static void Round(uint64_t R[5][5], uint64_t A[5][5], size_t i)
520 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
522 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
523 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
524 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
525 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
526 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
528 D[0] = ROL64(C[1], 1) ^ C[4];
529 D[1] = ROL64(C[2], 1) ^ C[0];
530 D[2] = ROL64(C[3], 1) ^ C[1];
531 D[3] = ROL64(C[4], 1) ^ C[2];
532 D[4] = ROL64(C[0], 1) ^ C[3];
534 C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
535 C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
536 C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
537 C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
538 C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
540 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
541 R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i];
542 R[0][1] = C[1] ^ (~C[2] | C[3]);
543 R[0][2] = C[2] ^ ( C[3] & C[4]);
544 R[0][3] = C[3] ^ ( C[4] | C[0]);
545 R[0][4] = C[4] ^ ( C[0] & C[1]);
547 R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
548 R[0][1] = C[1] ^ (~C[2] & C[3]);
549 R[0][2] = C[2] ^ (~C[3] & C[4]);
550 R[0][3] = C[3] ^ (~C[4] & C[0]);
551 R[0][4] = C[4] ^ (~C[0] & C[1]);
554 C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
555 C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
556 C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
557 C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
558 C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
560 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
561 R[1][0] = C[0] ^ (C[1] | C[2]);
562 R[1][1] = C[1] ^ (C[2] & C[3]);
563 R[1][2] = C[2] ^ (C[3] | ~C[4]);
564 R[1][3] = C[3] ^ (C[4] | C[0]);
565 R[1][4] = C[4] ^ (C[0] & C[1]);
567 R[1][0] = C[0] ^ (~C[1] & C[2]);
568 R[1][1] = C[1] ^ (~C[2] & C[3]);
569 R[1][2] = C[2] ^ (~C[3] & C[4]);
570 R[1][3] = C[3] ^ (~C[4] & C[0]);
571 R[1][4] = C[4] ^ (~C[0] & C[1]);
574 C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
575 C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
576 C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
577 C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
578 C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
580 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
581 R[2][0] = C[0] ^ ( C[1] | C[2]);
582 R[2][1] = C[1] ^ ( C[2] & C[3]);
583 R[2][2] = C[2] ^ (~C[3] & C[4]);
584 R[2][3] = ~C[3] ^ ( C[4] | C[0]);
585 R[2][4] = C[4] ^ ( C[0] & C[1]);
587 R[2][0] = C[0] ^ (~C[1] & C[2]);
588 R[2][1] = C[1] ^ (~C[2] & C[3]);
589 R[2][2] = C[2] ^ (~C[3] & C[4]);
590 R[2][3] = C[3] ^ (~C[4] & C[0]);
591 R[2][4] = C[4] ^ (~C[0] & C[1]);
594 C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
595 C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
596 C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
597 C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
598 C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
600 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
601 R[3][0] = C[0] ^ ( C[1] & C[2]);
602 R[3][1] = C[1] ^ ( C[2] | C[3]);
603 R[3][2] = C[2] ^ (~C[3] | C[4]);
604 R[3][3] = ~C[3] ^ ( C[4] & C[0]);
605 R[3][4] = C[4] ^ ( C[0] | C[1]);
607 R[3][0] = C[0] ^ (~C[1] & C[2]);
608 R[3][1] = C[1] ^ (~C[2] & C[3]);
609 R[3][2] = C[2] ^ (~C[3] & C[4]);
610 R[3][3] = C[3] ^ (~C[4] & C[0]);
611 R[3][4] = C[4] ^ (~C[0] & C[1]);
614 C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
615 C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
616 C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
617 C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
618 C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
620 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
621 R[4][0] = C[0] ^ (~C[1] & C[2]);
622 R[4][1] = ~C[1] ^ ( C[2] | C[3]);
623 R[4][2] = C[2] ^ ( C[3] & C[4]);
624 R[4][3] = C[3] ^ ( C[4] | C[0]);
625 R[4][4] = C[4] ^ ( C[0] & C[1]);
627 R[4][0] = C[0] ^ (~C[1] & C[2]);
628 R[4][1] = C[1] ^ (~C[2] & C[3]);
629 R[4][2] = C[2] ^ (~C[3] & C[4]);
630 R[4][3] = C[3] ^ (~C[4] & C[0]);
631 R[4][4] = C[4] ^ (~C[0] & C[1]);
635 static void KeccakF1600(uint64_t A[5][5])
640 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
649 for (i = 0; i < 24; i += 2) {
654 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
#else   /* default code path */
/*
 * This implementation is KECCAK_1X from above combined 4 times with
 * a twist that allows to omit temporary storage and perform in-place
 * processing. It's discussed in section 2.5 of "Keccak implementation
 * overview". It's likely to be best suited for processors with large
 * register bank...
 */
672 static void FourRounds(uint64_t A[5][5], size_t i)
674 uint64_t B[5], C[5], D[5];
676 assert(i <= (sizeof(iotas) / sizeof(iotas[0]) - 4));
679 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
680 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
681 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
682 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
683 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
685 D[0] = ROL64(C[1], 1) ^ C[4];
686 D[1] = ROL64(C[2], 1) ^ C[0];
687 D[2] = ROL64(C[3], 1) ^ C[1];
688 D[3] = ROL64(C[4], 1) ^ C[2];
689 D[4] = ROL64(C[0], 1) ^ C[3];
691 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
692 B[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
693 B[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
694 B[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
695 B[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
697 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i];
698 C[1] = A[1][1] = B[1] ^ (~B[2] & B[3]);
699 C[2] = A[2][2] = B[2] ^ (~B[3] & B[4]);
700 C[3] = A[3][3] = B[3] ^ (~B[4] & B[0]);
701 C[4] = A[4][4] = B[4] ^ (~B[0] & B[1]);
703 B[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
704 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
705 B[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
706 B[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
707 B[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
709 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
710 C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
711 C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
712 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
713 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
715 B[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
716 B[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
717 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
718 B[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
719 B[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
721 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
722 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
723 C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
724 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
725 C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
727 B[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
728 B[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
729 B[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
730 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
731 B[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
733 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
734 C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
735 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
736 C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
737 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
739 B[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
740 B[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
741 B[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
742 B[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
743 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
745 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
746 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
747 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
748 C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
749 C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
752 D[0] = ROL64(C[1], 1) ^ C[4];
753 D[1] = ROL64(C[2], 1) ^ C[0];
754 D[2] = ROL64(C[3], 1) ^ C[1];
755 D[3] = ROL64(C[4], 1) ^ C[2];
756 D[4] = ROL64(C[0], 1) ^ C[3];
758 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
759 B[1] = ROL64(A[3][1] ^ D[1], rhotates[1][1]);
760 B[2] = ROL64(A[1][2] ^ D[2], rhotates[2][2]);
761 B[3] = ROL64(A[4][3] ^ D[3], rhotates[3][3]);
762 B[4] = ROL64(A[2][4] ^ D[4], rhotates[4][4]);
764 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 1];
765 C[1] = A[3][1] = B[1] ^ (~B[2] & B[3]);
766 C[2] = A[1][2] = B[2] ^ (~B[3] & B[4]);
767 C[3] = A[4][3] = B[3] ^ (~B[4] & B[0]);
768 C[4] = A[2][4] = B[4] ^ (~B[0] & B[1]);
770 B[0] = ROL64(A[3][3] ^ D[3], rhotates[0][3]);
771 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
772 B[2] = ROL64(A[4][0] ^ D[0], rhotates[2][0]);
773 B[3] = ROL64(A[2][1] ^ D[1], rhotates[3][1]);
774 B[4] = ROL64(A[0][2] ^ D[2], rhotates[4][2]);
776 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
777 C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
778 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
779 C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
780 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
782 B[0] = ROL64(A[1][1] ^ D[1], rhotates[0][1]);
783 B[1] = ROL64(A[4][2] ^ D[2], rhotates[1][2]);
784 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
785 B[3] = ROL64(A[0][4] ^ D[4], rhotates[3][4]);
786 B[4] = ROL64(A[3][0] ^ D[0], rhotates[4][0]);
788 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
789 C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
790 C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
791 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
792 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
794 B[0] = ROL64(A[4][4] ^ D[4], rhotates[0][4]);
795 B[1] = ROL64(A[2][0] ^ D[0], rhotates[1][0]);
796 B[2] = ROL64(A[0][1] ^ D[1], rhotates[2][1]);
797 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
798 B[4] = ROL64(A[1][3] ^ D[3], rhotates[4][3]);
800 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
801 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
802 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
803 C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
804 C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
806 B[0] = ROL64(A[2][2] ^ D[2], rhotates[0][2]);
807 B[1] = ROL64(A[0][3] ^ D[3], rhotates[1][3]);
808 B[2] = ROL64(A[3][4] ^ D[4], rhotates[2][4]);
809 B[3] = ROL64(A[1][0] ^ D[0], rhotates[3][0]);
810 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
812 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
813 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
814 C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
815 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
816 C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
819 D[0] = ROL64(C[1], 1) ^ C[4];
820 D[1] = ROL64(C[2], 1) ^ C[0];
821 D[2] = ROL64(C[3], 1) ^ C[1];
822 D[3] = ROL64(C[4], 1) ^ C[2];
823 D[4] = ROL64(C[0], 1) ^ C[3];
825 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
826 B[1] = ROL64(A[2][1] ^ D[1], rhotates[1][1]);
827 B[2] = ROL64(A[4][2] ^ D[2], rhotates[2][2]);
828 B[3] = ROL64(A[1][3] ^ D[3], rhotates[3][3]);
829 B[4] = ROL64(A[3][4] ^ D[4], rhotates[4][4]);
831 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 2];
832 C[1] = A[2][1] = B[1] ^ (~B[2] & B[3]);
833 C[2] = A[4][2] = B[2] ^ (~B[3] & B[4]);
834 C[3] = A[1][3] = B[3] ^ (~B[4] & B[0]);
835 C[4] = A[3][4] = B[4] ^ (~B[0] & B[1]);
837 B[0] = ROL64(A[4][3] ^ D[3], rhotates[0][3]);
838 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
839 B[2] = ROL64(A[3][0] ^ D[0], rhotates[2][0]);
840 B[3] = ROL64(A[0][1] ^ D[1], rhotates[3][1]);
841 B[4] = ROL64(A[2][2] ^ D[2], rhotates[4][2]);
843 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
844 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
845 C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
846 C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
847 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
849 B[0] = ROL64(A[3][1] ^ D[1], rhotates[0][1]);
850 B[1] = ROL64(A[0][2] ^ D[2], rhotates[1][2]);
851 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
852 B[3] = ROL64(A[4][4] ^ D[4], rhotates[3][4]);
853 B[4] = ROL64(A[1][0] ^ D[0], rhotates[4][0]);
855 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
856 C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
857 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
858 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
859 C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
861 B[0] = ROL64(A[2][4] ^ D[4], rhotates[0][4]);
862 B[1] = ROL64(A[4][0] ^ D[0], rhotates[1][0]);
863 B[2] = ROL64(A[1][1] ^ D[1], rhotates[2][1]);
864 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
865 B[4] = ROL64(A[0][3] ^ D[3], rhotates[4][3]);
867 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
868 C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
869 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
870 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
871 C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
873 B[0] = ROL64(A[1][2] ^ D[2], rhotates[0][2]);
874 B[1] = ROL64(A[3][3] ^ D[3], rhotates[1][3]);
875 B[2] = ROL64(A[0][4] ^ D[4], rhotates[2][4]);
876 B[3] = ROL64(A[2][0] ^ D[0], rhotates[3][0]);
877 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
879 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
880 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
881 C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
882 C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
883 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
886 D[0] = ROL64(C[1], 1) ^ C[4];
887 D[1] = ROL64(C[2], 1) ^ C[0];
888 D[2] = ROL64(C[3], 1) ^ C[1];
889 D[3] = ROL64(C[4], 1) ^ C[2];
890 D[4] = ROL64(C[0], 1) ^ C[3];
892 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
893 B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]);
894 B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]);
895 B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]);
896 B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]);
898 /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3];
899 /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]);
900 /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]);
901 /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]);
902 /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]);
904 B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]);
905 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
906 B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]);
907 B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]);
908 B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]);
910 /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]);
911 /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]);
912 /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]);
913 /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]);
914 /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]);
916 B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]);
917 B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]);
918 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
919 B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]);
920 B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]);
922 /* C[0] ^= */ A[2][0] = B[0] ^ (~B[1] & B[2]);
923 /* C[1] ^= */ A[2][1] = B[1] ^ (~B[2] & B[3]);
924 /* C[2] ^= */ A[2][2] = B[2] ^ (~B[3] & B[4]);
925 /* C[3] ^= */ A[2][3] = B[3] ^ (~B[4] & B[0]);
926 /* C[4] ^= */ A[2][4] = B[4] ^ (~B[0] & B[1]);
928 B[0] = ROL64(A[3][4] ^ D[4], rhotates[0][4]);
929 B[1] = ROL64(A[3][0] ^ D[0], rhotates[1][0]);
930 B[2] = ROL64(A[3][1] ^ D[1], rhotates[2][1]);
931 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
932 B[4] = ROL64(A[3][3] ^ D[3], rhotates[4][3]);
934 /* C[0] ^= */ A[3][0] = B[0] ^ (~B[1] & B[2]);
935 /* C[1] ^= */ A[3][1] = B[1] ^ (~B[2] & B[3]);
936 /* C[2] ^= */ A[3][2] = B[2] ^ (~B[3] & B[4]);
937 /* C[3] ^= */ A[3][3] = B[3] ^ (~B[4] & B[0]);
938 /* C[4] ^= */ A[3][4] = B[4] ^ (~B[0] & B[1]);
940 B[0] = ROL64(A[4][2] ^ D[2], rhotates[0][2]);
941 B[1] = ROL64(A[4][3] ^ D[3], rhotates[1][3]);
942 B[2] = ROL64(A[4][4] ^ D[4], rhotates[2][4]);
943 B[3] = ROL64(A[4][0] ^ D[0], rhotates[3][0]);
944 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
946 /* C[0] ^= */ A[4][0] = B[0] ^ (~B[1] & B[2]);
947 /* C[1] ^= */ A[4][1] = B[1] ^ (~B[2] & B[3]);
948 /* C[2] ^= */ A[4][2] = B[2] ^ (~B[3] & B[4]);
949 /* C[3] ^= */ A[4][3] = B[3] ^ (~B[4] & B[0]);
950 /* C[4] ^= */ A[4][4] = B[4] ^ (~B[0] & B[1]);
953 static void KeccakF1600(uint64_t A[5][5])
957 for (i = 0; i < 24; i += 4) {
964 static uint64_t BitInterleave(uint64_t Ai)
966 if (BIT_INTERLEAVE) {
967 uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
970 t0 = lo & 0x55555555;
971 t0 |= t0 >> 1; t0 &= 0x33333333;
972 t0 |= t0 >> 2; t0 &= 0x0f0f0f0f;
973 t0 |= t0 >> 4; t0 &= 0x00ff00ff;
974 t0 |= t0 >> 8; t0 &= 0x0000ffff;
976 t1 = hi & 0x55555555;
977 t1 |= t1 >> 1; t1 &= 0x33333333;
978 t1 |= t1 >> 2; t1 &= 0x0f0f0f0f;
979 t1 |= t1 >> 4; t1 &= 0x00ff00ff;
980 t1 |= t1 >> 8; t1 <<= 16;
983 lo |= lo << 1; lo &= 0xcccccccc;
984 lo |= lo << 2; lo &= 0xf0f0f0f0;
985 lo |= lo << 4; lo &= 0xff00ff00;
986 lo |= lo << 8; lo >>= 16;
989 hi |= hi << 1; hi &= 0xcccccccc;
990 hi |= hi << 2; hi &= 0xf0f0f0f0;
991 hi |= hi << 4; hi &= 0xff00ff00;
992 hi |= hi << 8; hi &= 0xffff0000;
994 Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1000 static uint64_t BitDeinterleave(uint64_t Ai)
1002 if (BIT_INTERLEAVE) {
1003 uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
1006 t0 = lo & 0x0000ffff;
1007 t0 |= t0 << 8; t0 &= 0x00ff00ff;
1008 t0 |= t0 << 4; t0 &= 0x0f0f0f0f;
1009 t0 |= t0 << 2; t0 &= 0x33333333;
1010 t0 |= t0 << 1; t0 &= 0x55555555;
1013 t1 |= t1 >> 8; t1 &= 0xff00ff00;
1014 t1 |= t1 >> 4; t1 &= 0xf0f0f0f0;
1015 t1 |= t1 >> 2; t1 &= 0xcccccccc;
1016 t1 |= t1 >> 1; t1 &= 0xaaaaaaaa;
1019 lo |= lo << 8; lo &= 0x00ff00ff;
1020 lo |= lo << 4; lo &= 0x0f0f0f0f;
1021 lo |= lo << 2; lo &= 0x33333333;
1022 lo |= lo << 1; lo &= 0x55555555;
1025 hi |= hi >> 8; hi &= 0xff00ff00;
1026 hi |= hi >> 4; hi &= 0xf0f0f0f0;
1027 hi |= hi >> 2; hi &= 0xcccccccc;
1028 hi |= hi >> 1; hi &= 0xaaaaaaaa;
1030 Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
/*
 * SHA3_absorb can be called multiple times, but at each invocation
 * largest multiple of |r| out of |len| bytes are processed. Then
 * remaining amount of bytes is returned. This is done to spare caller
 * trouble of calculating the largest multiple of |r|. |r| can be viewed
 * as blocksize. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104,
 * 72, but can also be (1600 - 448)/8 = 144. All this means that message
 * padding and intermediate sub-block buffering, byte- or bitwise, is
 * caller's responsibility.
 */
1046 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
1049 uint64_t *A_flat = (uint64_t *)A;
1050 size_t i, w = r / 8;
1052 assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1055 for (i = 0; i < w; i++) {
1056 uint64_t Ai = (uint64_t)inp[0] | (uint64_t)inp[1] << 8 |
1057 (uint64_t)inp[2] << 16 | (uint64_t)inp[3] << 24 |
1058 (uint64_t)inp[4] << 32 | (uint64_t)inp[5] << 40 |
1059 (uint64_t)inp[6] << 48 | (uint64_t)inp[7] << 56;
1062 A_flat[i] ^= BitInterleave(Ai);
/*
 * SHA3_squeeze is called once at the end to generate |out| hash value
 * of |len| bytes.
 */
1075 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r)
1077 uint64_t *A_flat = (uint64_t *)A;
1078 size_t i, w = r / 8;
1080 assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1083 for (i = 0; i < w && len != 0; i++) {
1084 uint64_t Ai = BitDeinterleave(A_flat[i]);
1087 for (i = 0; i < len; i++) {
1088 *out++ = (unsigned char)Ai;
1094 out[0] = (unsigned char)(Ai);
1095 out[1] = (unsigned char)(Ai >> 8);
1096 out[2] = (unsigned char)(Ai >> 16);
1097 out[3] = (unsigned char)(Ai >> 24);
1098 out[4] = (unsigned char)(Ai >> 32);
1099 out[5] = (unsigned char)(Ai >> 40);
1100 out[6] = (unsigned char)(Ai >> 48);
1101 out[7] = (unsigned char)(Ai >> 56);
1110 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
1112 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r);
/*
 * Post-padding one-shot implementations would look as following:
 *
 * SHA3_224     SHA3_sponge(inp, len, out, 224/8, (1600-448)/8);
 * SHA3_256     SHA3_sponge(inp, len, out, 256/8, (1600-512)/8);
 * SHA3_384     SHA3_sponge(inp, len, out, 384/8, (1600-768)/8);
 * SHA3_512     SHA3_sponge(inp, len, out, 512/8, (1600-1024)/8);
 * SHAKE_128    SHA3_sponge(inp, len, out, d, (1600-256)/8);
 * SHAKE_256    SHA3_sponge(inp, len, out, d, (1600-512)/8);
 */
/*
 * One-shot sponge: absorb |len| bytes of pre-padded input |inp| at rate
 * |r|, then squeeze |d| bytes of digest into |out|. The caller must have
 * applied the appropriate domain-separation/padding bits already.
 */
void SHA3_sponge(const unsigned char *inp, size_t len,
                 unsigned char *out, size_t d, size_t r)
{
    uint64_t A[5][5];

    memset(A, 0, sizeof(A));            /* Keccak state starts all-zero */
    SHA3_absorb(A, inp, len, r);
    SHA3_squeeze(A, out, d, r);
}
#include <stdio.h>
#include <string.h>

/*
 * Self-test driver: 5-bit SHAKE128 example from
 * http://csrc.nist.gov/groups/ST/toolkit/examples.html#aHashing
 * Returns 0 on success, 1 on mismatch.
 */
int main(void)
{
    /*
     * Pre-padded input block: 0xf3, 0x03 carry the 5-bit message plus
     * the SHAKE128 domain-separation/padding bits; the rest of the
     * 168-byte rate block is zero except for the final padding bit
     * set below. (NOTE(review): values taken as-is from the official
     * example; the padding is applied by the caller, per SHA3_absorb's
     * contract.)
     */
    unsigned char test[168] = { '\xf3', '\x3' };
    unsigned char out[512];
    size_t i;
    static const unsigned char result[512] = {
        0x2E, 0x0A, 0xBF, 0xBA, 0x83, 0xE6, 0x72, 0x0B,
        0xFB, 0xC2, 0x25, 0xFF, 0x6B, 0x7A, 0xB9, 0xFF,
        0xCE, 0x58, 0xBA, 0x02, 0x7E, 0xE3, 0xD8, 0x98,
        0x76, 0x4F, 0xEF, 0x28, 0x7D, 0xDE, 0xCC, 0xCA,
        0x3E, 0x6E, 0x59, 0x98, 0x41, 0x1E, 0x7D, 0xDB,
        0x32, 0xF6, 0x75, 0x38, 0xF5, 0x00, 0xB1, 0x8C,
        0x8C, 0x97, 0xC4, 0x52, 0xC3, 0x70, 0xEA, 0x2C,
        0xF0, 0xAF, 0xCA, 0x3E, 0x05, 0xDE, 0x7E, 0x4D,
        0xE2, 0x7F, 0xA4, 0x41, 0xA9, 0xCB, 0x34, 0xFD,
        0x17, 0xC9, 0x78, 0xB4, 0x2D, 0x5B, 0x7E, 0x7F,
        0x9A, 0xB1, 0x8F, 0xFE, 0xFF, 0xC3, 0xC5, 0xAC,
        0x2F, 0x3A, 0x45, 0x5E, 0xEB, 0xFD, 0xC7, 0x6C,
        0xEA, 0xEB, 0x0A, 0x2C, 0xCA, 0x22, 0xEE, 0xF6,
        0xE6, 0x37, 0xF4, 0xCA, 0xBE, 0x5C, 0x51, 0xDE,
        0xD2, 0xE3, 0xFA, 0xD8, 0xB9, 0x52, 0x70, 0xA3,
        0x21, 0x84, 0x56, 0x64, 0xF1, 0x07, 0xD1, 0x64,
        0x96, 0xBB, 0x7A, 0xBF, 0xBE, 0x75, 0x04, 0xB6,
        0xED, 0xE2, 0xE8, 0x9E, 0x4B, 0x99, 0x6F, 0xB5,
        0x8E, 0xFD, 0xC4, 0x18, 0x1F, 0x91, 0x63, 0x38,
        0x1C, 0xBE, 0x7B, 0xC0, 0x06, 0xA7, 0xA2, 0x05,
        0x98, 0x9C, 0x52, 0x6C, 0xD1, 0xBD, 0x68, 0x98,
        0x36, 0x93, 0xB4, 0xBD, 0xC5, 0x37, 0x28, 0xB2,
        0x41, 0xC1, 0xCF, 0xF4, 0x2B, 0xB6, 0x11, 0x50,
        0x2C, 0x35, 0x20, 0x5C, 0xAB, 0xB2, 0x88, 0x75,
        0x56, 0x55, 0xD6, 0x20, 0xC6, 0x79, 0x94, 0xF0,
        0x64, 0x51, 0x18, 0x7F, 0x6F, 0xD1, 0x7E, 0x04,
        0x66, 0x82, 0xBA, 0x12, 0x86, 0x06, 0x3F, 0xF8,
        0x8F, 0xE2, 0x50, 0x8D, 0x1F, 0xCA, 0xF9, 0x03,
        0x5A, 0x12, 0x31, 0xAD, 0x41, 0x50, 0xA9, 0xC9,
        0xB2, 0x4C, 0x9B, 0x2D, 0x66, 0xB2, 0xAD, 0x1B,
        0xDE, 0x0B, 0xD0, 0xBB, 0xCB, 0x8B, 0xE0, 0x5B,
        0x83, 0x52, 0x29, 0xEF, 0x79, 0x19, 0x73, 0x73,
        0x23, 0x42, 0x44, 0x01, 0xE1, 0xD8, 0x37, 0xB6,
        0x6E, 0xB4, 0xE6, 0x30, 0xFF, 0x1D, 0xE7, 0x0C,
        0xB3, 0x17, 0xC2, 0xBA, 0xCB, 0x08, 0x00, 0x1D,
        0x34, 0x77, 0xB7, 0xA7, 0x0A, 0x57, 0x6D, 0x20,
        0x86, 0x90, 0x33, 0x58, 0x9D, 0x85, 0xA0, 0x1D,
        0xDB, 0x2B, 0x66, 0x46, 0xC0, 0x43, 0xB5, 0x9F,
        0xC0, 0x11, 0x31, 0x1D, 0xA6, 0x66, 0xFA, 0x5A,
        0xD1, 0xD6, 0x38, 0x7F, 0xA9, 0xBC, 0x40, 0x15,
        0xA3, 0x8A, 0x51, 0xD1, 0xDA, 0x1E, 0xA6, 0x1D,
        0x64, 0x8D, 0xC8, 0xE3, 0x9A, 0x88, 0xB9, 0xD6,
        0x22, 0xBD, 0xE2, 0x07, 0xFD, 0xAB, 0xC6, 0xF2,
        0x82, 0x7A, 0x88, 0x0C, 0x33, 0x0B, 0xBF, 0x6D,
        0xF7, 0x33, 0x77, 0x4B, 0x65, 0x3E, 0x57, 0x30,
        0x5D, 0x78, 0xDC, 0xE1, 0x12, 0xF1, 0x0A, 0x2C,
        0x71, 0xF4, 0xCD, 0xAD, 0x92, 0xED, 0x11, 0x3E,
        0x1C, 0xEA, 0x63, 0xB9, 0x19, 0x25, 0xED, 0x28,
        0x19, 0x1E, 0x6D, 0xBB, 0xB5, 0xAA, 0x5A, 0x2A,
        0xFD, 0xA5, 0x1F, 0xC0, 0x5A, 0x3A, 0xF5, 0x25,
        0x8B, 0x87, 0x66, 0x52, 0x43, 0x55, 0x0F, 0x28,
        0x94, 0x8A, 0xE2, 0xB8, 0xBE, 0xB6, 0xBC, 0x9C,
        0x77, 0x0B, 0x35, 0xF0, 0x67, 0xEA, 0xA6, 0x41,
        0xEF, 0xE6, 0x5B, 0x1A, 0x44, 0x90, 0x9D, 0x1B,
        0x14, 0x9F, 0x97, 0xEE, 0xA6, 0x01, 0x39, 0x1C,
        0x60, 0x9E, 0xC8, 0x1D, 0x19, 0x30, 0xF5, 0x7C,
        0x18, 0xA4, 0xE0, 0xFA, 0xB4, 0x91, 0xD1, 0xCA,
        0xDF, 0xD5, 0x04, 0x83, 0x44, 0x9E, 0xDC, 0x0F,
        0x07, 0xFF, 0xB2, 0x4D, 0x2C, 0x6F, 0x9A, 0x9A,
        0x3B, 0xFF, 0x39, 0xAE, 0x3D, 0x57, 0xF5, 0x60,
        0x65, 0x4D, 0x7D, 0x75, 0xC9, 0x08, 0xAB, 0xE6,
        0x25, 0x64, 0x75, 0x3E, 0xAC, 0x39, 0xD7, 0x50,
        0x3D, 0xA6, 0xD3, 0x7C, 0x2E, 0x32, 0xE1, 0xAF,
        0x3B, 0x8A, 0xEC, 0x8A, 0xE3, 0x06, 0x9C, 0xD9
    };

    test[167] = '\x80';         /* final bit of the block padding */

    SHA3_sponge(test, sizeof(test), out, sizeof(out), sizeof(test));

    /*
     * Rationale behind keeping output [formatted as below] is that
     * one should be able to redirect it to a file, then copy-n-paste
     * final "output val" from official example to another file, and
     * compare the two with diff(1).
     */
    for (i = 0; i < sizeof(out);) {
        printf("%02X", out[i]);
        printf(++i % 16 && i != sizeof(out) ? " " : "\n");
    }

    if (memcmp(out, result, sizeof(out))) {
        fprintf(stderr, "failure\n");
        return 1;
    } else {
        fprintf(stderr, "success\n");
        return 0;
    }
}