2 * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
4 * Licensed under the Apache License 2.0 (the "License"). You may not use
5 * this file except in compliance with the License. You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
10 #include <openssl/e_os2.h>
14 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
16 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r);
18 #if !defined(KECCAK1600_ASM) || !defined(SELFTEST)
21 * Choose some sensible defaults
23 #if !defined(KECCAK_REF) && !defined(KECCAK_1X) && !defined(KECCAK_1X_ALT) && \
24 !defined(KECCAK_2X) && !defined(KECCAK_INPLACE)
25 # define KECCAK_2X /* default to KECCAK_2X variant */
28 #if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
29 (defined(__x86_64) && !defined(__BMI__)) || defined(_M_X64) || \
30 defined(__mips) || defined(__riscv) || defined(__s390__) || \
31 defined(__EMSCRIPTEN__)
33 * These don't have "and with complement" instruction, so minimize amount
34 * of "not"-s. Implemented only in the [default] KECCAK_2X variant.
36 # define KECCAK_COMPLEMENTING_TRANSFORM
39 #if defined(__x86_64__) || defined(__aarch64__) || \
40 defined(__mips64) || defined(__ia64) || \
41 (defined(__VMS) && !defined(__vax))
43 * These are available even in ILP32 flavours, but even then they are
44 * capable of performing 64-bit operations as efficiently as in *P64.
45 * Since it's not given that we can use sizeof(void *), just shunt it.
47 # define BIT_INTERLEAVE (0)
49 # define BIT_INTERLEAVE (sizeof(void *) < 8)
52 #define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31)))
54 static uint64_t ROL64(uint64_t val, int offset)
58 } else if (!BIT_INTERLEAVE) {
59 return (val << offset) | (val >> (64-offset));
61 uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;
67 hi = ROL32(lo, offset);
68 lo = ROL32(tmp, offset + 1);
71 lo = ROL32(lo, offset);
72 hi = ROL32(hi, offset);
75 return ((uint64_t)hi << 32) | lo;
79 static const unsigned char rhotates[5][5] = {
81 { 36, 44, 6, 55, 20 },
82 { 3, 10, 43, 25, 39 },
83 { 41, 45, 15, 21, 8 },
87 static const uint64_t iotas[] = {
88 BIT_INTERLEAVE ? 0x0000000000000001ULL : 0x0000000000000001ULL,
89 BIT_INTERLEAVE ? 0x0000008900000000ULL : 0x0000000000008082ULL,
90 BIT_INTERLEAVE ? 0x8000008b00000000ULL : 0x800000000000808aULL,
91 BIT_INTERLEAVE ? 0x8000808000000000ULL : 0x8000000080008000ULL,
92 BIT_INTERLEAVE ? 0x0000008b00000001ULL : 0x000000000000808bULL,
93 BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
94 BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
95 BIT_INTERLEAVE ? 0x8000008200000001ULL : 0x8000000000008009ULL,
96 BIT_INTERLEAVE ? 0x0000000b00000000ULL : 0x000000000000008aULL,
97 BIT_INTERLEAVE ? 0x0000000a00000000ULL : 0x0000000000000088ULL,
98 BIT_INTERLEAVE ? 0x0000808200000001ULL : 0x0000000080008009ULL,
99 BIT_INTERLEAVE ? 0x0000800300000000ULL : 0x000000008000000aULL,
100 BIT_INTERLEAVE ? 0x0000808b00000001ULL : 0x000000008000808bULL,
101 BIT_INTERLEAVE ? 0x8000000b00000001ULL : 0x800000000000008bULL,
102 BIT_INTERLEAVE ? 0x8000008a00000001ULL : 0x8000000000008089ULL,
103 BIT_INTERLEAVE ? 0x8000008100000001ULL : 0x8000000000008003ULL,
104 BIT_INTERLEAVE ? 0x8000008100000000ULL : 0x8000000000008002ULL,
105 BIT_INTERLEAVE ? 0x8000000800000000ULL : 0x8000000000000080ULL,
106 BIT_INTERLEAVE ? 0x0000008300000000ULL : 0x000000000000800aULL,
107 BIT_INTERLEAVE ? 0x8000800300000000ULL : 0x800000008000000aULL,
108 BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
109 BIT_INTERLEAVE ? 0x8000008800000000ULL : 0x8000000000008080ULL,
110 BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
111 BIT_INTERLEAVE ? 0x8000808200000000ULL : 0x8000000080008008ULL
114 #if defined(KECCAK_REF)
116 * This is straightforward or "maximum clarity" implementation aiming
117 * to resemble section 3.2 of the FIPS PUB 202 "SHA-3 Standard:
118 * Permutation-Based Hash and Extendible-Output Functions" as much as
119 * possible. With one caveat. Because of the way C stores matrices,
120 * references to A[x,y] in the specification are presented as A[y][x].
121 * Implementation unrolls inner x-loops so that modulo 5 operations are
122 * explicitly pre-computed.
124 static void Theta(uint64_t A[5][5])
135 for (y = 1; y < 5; y++) {
143 D[0] = ROL64(C[1], 1) ^ C[4];
144 D[1] = ROL64(C[2], 1) ^ C[0];
145 D[2] = ROL64(C[3], 1) ^ C[1];
146 D[3] = ROL64(C[4], 1) ^ C[2];
147 D[4] = ROL64(C[0], 1) ^ C[3];
149 for (y = 0; y < 5; y++) {
158 static void Rho(uint64_t A[5][5])
162 for (y = 0; y < 5; y++) {
163 A[y][0] = ROL64(A[y][0], rhotates[y][0]);
164 A[y][1] = ROL64(A[y][1], rhotates[y][1]);
165 A[y][2] = ROL64(A[y][2], rhotates[y][2]);
166 A[y][3] = ROL64(A[y][3], rhotates[y][3]);
167 A[y][4] = ROL64(A[y][4], rhotates[y][4]);
171 static void Pi(uint64_t A[5][5])
177 * A[y][x] = T[x][(3*y+x)%5]
179 memcpy(T, A, sizeof(T));
212 static void Chi(uint64_t A[5][5])
217 for (y = 0; y < 5; y++) {
218 C[0] = A[y][0] ^ (~A[y][1] & A[y][2]);
219 C[1] = A[y][1] ^ (~A[y][2] & A[y][3]);
220 C[2] = A[y][2] ^ (~A[y][3] & A[y][4]);
221 C[3] = A[y][3] ^ (~A[y][4] & A[y][0]);
222 C[4] = A[y][4] ^ (~A[y][0] & A[y][1]);
232 static void Iota(uint64_t A[5][5], size_t i)
234 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
238 static void KeccakF1600(uint64_t A[5][5])
242 for (i = 0; i < 24; i++) {
251 #elif defined(KECCAK_1X)
253 * This implementation is optimization of above code featuring unroll
254 * of even y-loops, their fusion and code motion. It also minimizes
255 * temporary storage. Compiler would normally do all these things for
256 * you, purpose of manual optimization is to provide "unobscured"
257 * reference for assembly implementation [in case this approach is
258 * chosen for implementation on some platform]. In the nutshell it's
259 * equivalent of "plane-per-plane processing" approach discussed in
260 * section 2.4 of "Keccak implementation overview".
262 static void Round(uint64_t A[5][5], size_t i)
264 uint64_t C[5], E[2]; /* registers */
265 uint64_t D[5], T[2][5]; /* memory */
267 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
269 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
270 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
271 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
272 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
273 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
276 D[1] = E[0] = ROL64(C[2], 1) ^ C[0];
277 D[4] = E[1] = ROL64(C[0], 1) ^ C[3];
278 D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
279 D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
280 D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
282 T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */
283 T[0][1] = A[0][1] ^ E[0]; /* D[1] */
284 T[0][2] = A[0][2] ^ C[1]; /* D[2] */
285 T[0][3] = A[0][3] ^ C[2]; /* D[3] */
286 T[0][4] = A[0][4] ^ E[1]; /* D[4] */
288 C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
289 C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
290 C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
291 C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
292 C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
294 D[0] = ROL64(C[1], 1) ^ C[4];
295 D[1] = ROL64(C[2], 1) ^ C[0];
296 D[2] = ROL64(C[3], 1) ^ C[1];
297 D[3] = ROL64(C[4], 1) ^ C[2];
298 D[4] = ROL64(C[0], 1) ^ C[3];
300 T[0][0] = A[3][0] ^ D[0]; /* borrow T[0][0] */
301 T[0][1] = A[0][1] ^ D[1];
302 T[0][2] = A[0][2] ^ D[2];
303 T[0][3] = A[0][3] ^ D[3];
304 T[0][4] = A[0][4] ^ D[4];
306 C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
307 C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
308 C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
309 C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
310 C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
312 A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
313 A[0][1] = C[1] ^ (~C[2] & C[3]);
314 A[0][2] = C[2] ^ (~C[3] & C[4]);
315 A[0][3] = C[3] ^ (~C[4] & C[0]);
316 A[0][4] = C[4] ^ (~C[0] & C[1]);
318 T[1][0] = A[1][0] ^ (C[3] = D[0]);
319 T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */
320 T[1][2] = A[1][2] ^ (E[0] = D[2]);
321 T[1][3] = A[1][3] ^ (E[1] = D[3]);
322 T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */
324 C[0] = ROL64(T[0][3], rhotates[0][3]);
325 C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]); /* D[4] */
326 C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]); /* D[0] */
327 C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]); /* D[1] */
328 C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]); /* D[2] */
330 A[1][0] = C[0] ^ (~C[1] & C[2]);
331 A[1][1] = C[1] ^ (~C[2] & C[3]);
332 A[1][2] = C[2] ^ (~C[3] & C[4]);
333 A[1][3] = C[3] ^ (~C[4] & C[0]);
334 A[1][4] = C[4] ^ (~C[0] & C[1]);
336 C[0] = ROL64(T[0][1], rhotates[0][1]);
337 C[1] = ROL64(T[1][2], rhotates[1][2]);
338 C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
339 C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
340 C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
342 A[2][0] = C[0] ^ (~C[1] & C[2]);
343 A[2][1] = C[1] ^ (~C[2] & C[3]);
344 A[2][2] = C[2] ^ (~C[3] & C[4]);
345 A[2][3] = C[3] ^ (~C[4] & C[0]);
346 A[2][4] = C[4] ^ (~C[0] & C[1]);
348 C[0] = ROL64(T[0][4], rhotates[0][4]);
349 C[1] = ROL64(T[1][0], rhotates[1][0]);
350 C[2] = ROL64(T[1][1], rhotates[2][1]); /* originally A[2][1] */
351 C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
352 C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
354 A[3][0] = C[0] ^ (~C[1] & C[2]);
355 A[3][1] = C[1] ^ (~C[2] & C[3]);
356 A[3][2] = C[2] ^ (~C[3] & C[4]);
357 A[3][3] = C[3] ^ (~C[4] & C[0]);
358 A[3][4] = C[4] ^ (~C[0] & C[1]);
360 C[0] = ROL64(T[0][2], rhotates[0][2]);
361 C[1] = ROL64(T[1][3], rhotates[1][3]);
362 C[2] = ROL64(T[1][4], rhotates[2][4]); /* originally A[2][4] */
363 C[3] = ROL64(T[0][0], rhotates[3][0]); /* originally A[3][0] */
364 C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
366 A[4][0] = C[0] ^ (~C[1] & C[2]);
367 A[4][1] = C[1] ^ (~C[2] & C[3]);
368 A[4][2] = C[2] ^ (~C[3] & C[4]);
369 A[4][3] = C[3] ^ (~C[4] & C[0]);
370 A[4][4] = C[4] ^ (~C[0] & C[1]);
373 static void KeccakF1600(uint64_t A[5][5])
377 for (i = 0; i < 24; i++) {
382 #elif defined(KECCAK_1X_ALT)
384 * This is variant of above KECCAK_1X that reduces requirement for
385 * temporary storage even further, but at cost of more updates to A[][].
386 * It's less suitable if A[][] is memory bound, but better if it's
390 static void Round(uint64_t A[5][5], size_t i)
394 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
396 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
397 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
398 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
399 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
400 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
402 D[1] = C[0] ^ ROL64(C[2], 1);
403 D[2] = C[1] ^ ROL64(C[3], 1);
404 D[3] = C[2] ^= ROL64(C[4], 1);
405 D[4] = C[3] ^= ROL64(C[0], 1);
406 D[0] = C[4] ^= ROL64(C[1], 1);
443 A[0][1] = ROL64(A[1][1], rhotates[1][1]);
444 A[0][2] = ROL64(A[2][2], rhotates[2][2]);
445 A[0][3] = ROL64(A[3][3], rhotates[3][3]);
446 A[0][4] = ROL64(A[4][4], rhotates[4][4]);
448 A[1][1] = ROL64(A[1][4], rhotates[1][4]);
449 A[2][2] = ROL64(A[2][3], rhotates[2][3]);
450 A[3][3] = ROL64(A[3][2], rhotates[3][2]);
451 A[4][4] = ROL64(A[4][1], rhotates[4][1]);
453 A[1][4] = ROL64(A[4][2], rhotates[4][2]);
454 A[2][3] = ROL64(A[3][4], rhotates[3][4]);
455 A[3][2] = ROL64(A[2][1], rhotates[2][1]);
456 A[4][1] = ROL64(A[1][3], rhotates[1][3]);
458 A[4][2] = ROL64(A[2][4], rhotates[2][4]);
459 A[3][4] = ROL64(A[4][3], rhotates[4][3]);
460 A[2][1] = ROL64(A[1][2], rhotates[1][2]);
461 A[1][3] = ROL64(A[3][1], rhotates[3][1]);
463 A[2][4] = ROL64(A[4][0], rhotates[4][0]);
464 A[4][3] = ROL64(A[3][0], rhotates[3][0]);
465 A[1][2] = ROL64(A[2][0], rhotates[2][0]);
466 A[3][1] = ROL64(A[1][0], rhotates[1][0]);
468 A[1][0] = ROL64(C[3], rhotates[0][3]);
469 A[2][0] = ROL64(C[1], rhotates[0][1]);
470 A[3][0] = ROL64(C[4], rhotates[0][4]);
471 A[4][0] = ROL64(C[2], rhotates[0][2]);
478 A[0][0] ^= (~A[0][1] & A[0][2]);
479 A[1][0] ^= (~A[1][1] & A[1][2]);
480 A[0][1] ^= (~A[0][2] & A[0][3]);
481 A[1][1] ^= (~A[1][2] & A[1][3]);
482 A[0][2] ^= (~A[0][3] & A[0][4]);
483 A[1][2] ^= (~A[1][3] & A[1][4]);
484 A[0][3] ^= (~A[0][4] & C[0]);
485 A[1][3] ^= (~A[1][4] & C[1]);
486 A[0][4] ^= (~C[0] & D[0]);
487 A[1][4] ^= (~C[1] & D[1]);
494 A[2][0] ^= (~A[2][1] & A[2][2]);
495 A[3][0] ^= (~A[3][1] & A[3][2]);
496 A[2][1] ^= (~A[2][2] & A[2][3]);
497 A[3][1] ^= (~A[3][2] & A[3][3]);
498 A[2][2] ^= (~A[2][3] & A[2][4]);
499 A[3][2] ^= (~A[3][3] & A[3][4]);
500 A[2][3] ^= (~A[2][4] & C[2]);
501 A[3][3] ^= (~A[3][4] & C[3]);
502 A[2][4] ^= (~C[2] & D[2]);
503 A[3][4] ^= (~C[3] & D[3]);
508 A[4][0] ^= (~A[4][1] & A[4][2]);
509 A[4][1] ^= (~A[4][2] & A[4][3]);
510 A[4][2] ^= (~A[4][3] & A[4][4]);
511 A[4][3] ^= (~A[4][4] & C[4]);
512 A[4][4] ^= (~C[4] & D[4]);
516 static void KeccakF1600(uint64_t A[5][5])
520 for (i = 0; i < 24; i++) {
525 #elif defined(KECCAK_2X)
527 * This implementation is variant of KECCAK_1X above with outer-most
528 * round loop unrolled twice. This allows to take temporary storage
529 * out of round procedure and simplify references to it by alternating
530 * it with actual data (see round loop below). Originally it was meant
531 * rather as reference for an assembly implementation, but it seems to
532 * play best with compilers [as well as provide best instruction per
533 * processed byte ratio at minimal round unroll factor]...
535 static void Round(uint64_t R[5][5], uint64_t A[5][5], size_t i)
539 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
541 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
542 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
543 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
544 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
545 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
547 D[0] = ROL64(C[1], 1) ^ C[4];
548 D[1] = ROL64(C[2], 1) ^ C[0];
549 D[2] = ROL64(C[3], 1) ^ C[1];
550 D[3] = ROL64(C[4], 1) ^ C[2];
551 D[4] = ROL64(C[0], 1) ^ C[3];
553 C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
554 C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
555 C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
556 C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
557 C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
559 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
560 R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i];
561 R[0][1] = C[1] ^ (~C[2] | C[3]);
562 R[0][2] = C[2] ^ ( C[3] & C[4]);
563 R[0][3] = C[3] ^ ( C[4] | C[0]);
564 R[0][4] = C[4] ^ ( C[0] & C[1]);
566 R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
567 R[0][1] = C[1] ^ (~C[2] & C[3]);
568 R[0][2] = C[2] ^ (~C[3] & C[4]);
569 R[0][3] = C[3] ^ (~C[4] & C[0]);
570 R[0][4] = C[4] ^ (~C[0] & C[1]);
573 C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
574 C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
575 C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
576 C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
577 C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
579 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
580 R[1][0] = C[0] ^ (C[1] | C[2]);
581 R[1][1] = C[1] ^ (C[2] & C[3]);
582 R[1][2] = C[2] ^ (C[3] | ~C[4]);
583 R[1][3] = C[3] ^ (C[4] | C[0]);
584 R[1][4] = C[4] ^ (C[0] & C[1]);
586 R[1][0] = C[0] ^ (~C[1] & C[2]);
587 R[1][1] = C[1] ^ (~C[2] & C[3]);
588 R[1][2] = C[2] ^ (~C[3] & C[4]);
589 R[1][3] = C[3] ^ (~C[4] & C[0]);
590 R[1][4] = C[4] ^ (~C[0] & C[1]);
593 C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
594 C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
595 C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
596 C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
597 C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
599 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
600 R[2][0] = C[0] ^ ( C[1] | C[2]);
601 R[2][1] = C[1] ^ ( C[2] & C[3]);
602 R[2][2] = C[2] ^ (~C[3] & C[4]);
603 R[2][3] = ~C[3] ^ ( C[4] | C[0]);
604 R[2][4] = C[4] ^ ( C[0] & C[1]);
606 R[2][0] = C[0] ^ (~C[1] & C[2]);
607 R[2][1] = C[1] ^ (~C[2] & C[3]);
608 R[2][2] = C[2] ^ (~C[3] & C[4]);
609 R[2][3] = C[3] ^ (~C[4] & C[0]);
610 R[2][4] = C[4] ^ (~C[0] & C[1]);
613 C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
614 C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
615 C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
616 C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
617 C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
619 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
620 R[3][0] = C[0] ^ ( C[1] & C[2]);
621 R[3][1] = C[1] ^ ( C[2] | C[3]);
622 R[3][2] = C[2] ^ (~C[3] | C[4]);
623 R[3][3] = ~C[3] ^ ( C[4] & C[0]);
624 R[3][4] = C[4] ^ ( C[0] | C[1]);
626 R[3][0] = C[0] ^ (~C[1] & C[2]);
627 R[3][1] = C[1] ^ (~C[2] & C[3]);
628 R[3][2] = C[2] ^ (~C[3] & C[4]);
629 R[3][3] = C[3] ^ (~C[4] & C[0]);
630 R[3][4] = C[4] ^ (~C[0] & C[1]);
633 C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
634 C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
635 C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
636 C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
637 C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
639 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
640 R[4][0] = C[0] ^ (~C[1] & C[2]);
641 R[4][1] = ~C[1] ^ ( C[2] | C[3]);
642 R[4][2] = C[2] ^ ( C[3] & C[4]);
643 R[4][3] = C[3] ^ ( C[4] | C[0]);
644 R[4][4] = C[4] ^ ( C[0] & C[1]);
646 R[4][0] = C[0] ^ (~C[1] & C[2]);
647 R[4][1] = C[1] ^ (~C[2] & C[3]);
648 R[4][2] = C[2] ^ (~C[3] & C[4]);
649 R[4][3] = C[3] ^ (~C[4] & C[0]);
650 R[4][4] = C[4] ^ (~C[0] & C[1]);
654 static void KeccakF1600(uint64_t A[5][5])
659 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
668 for (i = 0; i < 24; i += 2) {
673 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
683 #else /* define KECCAK_INPLACE to compile this code path */
685 * This implementation is KECCAK_1X from above combined 4 times with
686 * a twist that allows to omit temporary storage and perform in-place
687 * processing. It's discussed in section 2.5 of "Keccak implementation
688 * overview". It's likely to be best suited for processors with large
689 * register bank... On the other hand processor with large register
690 * bank can as well use KECCAK_1X_ALT, it would be as fast but much
693 static void FourRounds(uint64_t A[5][5], size_t i)
695 uint64_t B[5], C[5], D[5];
697 assert(i <= (sizeof(iotas) / sizeof(iotas[0]) - 4));
700 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
701 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
702 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
703 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
704 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
706 D[0] = ROL64(C[1], 1) ^ C[4];
707 D[1] = ROL64(C[2], 1) ^ C[0];
708 D[2] = ROL64(C[3], 1) ^ C[1];
709 D[3] = ROL64(C[4], 1) ^ C[2];
710 D[4] = ROL64(C[0], 1) ^ C[3];
712 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
713 B[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
714 B[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
715 B[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
716 B[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
718 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i];
719 C[1] = A[1][1] = B[1] ^ (~B[2] & B[3]);
720 C[2] = A[2][2] = B[2] ^ (~B[3] & B[4]);
721 C[3] = A[3][3] = B[3] ^ (~B[4] & B[0]);
722 C[4] = A[4][4] = B[4] ^ (~B[0] & B[1]);
724 B[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
725 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
726 B[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
727 B[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
728 B[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
730 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
731 C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
732 C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
733 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
734 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
736 B[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
737 B[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
738 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
739 B[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
740 B[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
742 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
743 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
744 C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
745 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
746 C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
748 B[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
749 B[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
750 B[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
751 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
752 B[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
754 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
755 C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
756 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
757 C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
758 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
760 B[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
761 B[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
762 B[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
763 B[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
764 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
766 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
767 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
768 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
769 C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
770 C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
773 D[0] = ROL64(C[1], 1) ^ C[4];
774 D[1] = ROL64(C[2], 1) ^ C[0];
775 D[2] = ROL64(C[3], 1) ^ C[1];
776 D[3] = ROL64(C[4], 1) ^ C[2];
777 D[4] = ROL64(C[0], 1) ^ C[3];
779 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
780 B[1] = ROL64(A[3][1] ^ D[1], rhotates[1][1]);
781 B[2] = ROL64(A[1][2] ^ D[2], rhotates[2][2]);
782 B[3] = ROL64(A[4][3] ^ D[3], rhotates[3][3]);
783 B[4] = ROL64(A[2][4] ^ D[4], rhotates[4][4]);
785 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 1];
786 C[1] = A[3][1] = B[1] ^ (~B[2] & B[3]);
787 C[2] = A[1][2] = B[2] ^ (~B[3] & B[4]);
788 C[3] = A[4][3] = B[3] ^ (~B[4] & B[0]);
789 C[4] = A[2][4] = B[4] ^ (~B[0] & B[1]);
791 B[0] = ROL64(A[3][3] ^ D[3], rhotates[0][3]);
792 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
793 B[2] = ROL64(A[4][0] ^ D[0], rhotates[2][0]);
794 B[3] = ROL64(A[2][1] ^ D[1], rhotates[3][1]);
795 B[4] = ROL64(A[0][2] ^ D[2], rhotates[4][2]);
797 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
798 C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
799 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
800 C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
801 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
803 B[0] = ROL64(A[1][1] ^ D[1], rhotates[0][1]);
804 B[1] = ROL64(A[4][2] ^ D[2], rhotates[1][2]);
805 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
806 B[3] = ROL64(A[0][4] ^ D[4], rhotates[3][4]);
807 B[4] = ROL64(A[3][0] ^ D[0], rhotates[4][0]);
809 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
810 C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
811 C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
812 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
813 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
815 B[0] = ROL64(A[4][4] ^ D[4], rhotates[0][4]);
816 B[1] = ROL64(A[2][0] ^ D[0], rhotates[1][0]);
817 B[2] = ROL64(A[0][1] ^ D[1], rhotates[2][1]);
818 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
819 B[4] = ROL64(A[1][3] ^ D[3], rhotates[4][3]);
821 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
822 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
823 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
824 C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
825 C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
827 B[0] = ROL64(A[2][2] ^ D[2], rhotates[0][2]);
828 B[1] = ROL64(A[0][3] ^ D[3], rhotates[1][3]);
829 B[2] = ROL64(A[3][4] ^ D[4], rhotates[2][4]);
830 B[3] = ROL64(A[1][0] ^ D[0], rhotates[3][0]);
831 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
833 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
834 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
835 C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
836 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
837 C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
840 D[0] = ROL64(C[1], 1) ^ C[4];
841 D[1] = ROL64(C[2], 1) ^ C[0];
842 D[2] = ROL64(C[3], 1) ^ C[1];
843 D[3] = ROL64(C[4], 1) ^ C[2];
844 D[4] = ROL64(C[0], 1) ^ C[3];
846 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
847 B[1] = ROL64(A[2][1] ^ D[1], rhotates[1][1]);
848 B[2] = ROL64(A[4][2] ^ D[2], rhotates[2][2]);
849 B[3] = ROL64(A[1][3] ^ D[3], rhotates[3][3]);
850 B[4] = ROL64(A[3][4] ^ D[4], rhotates[4][4]);
852 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 2];
853 C[1] = A[2][1] = B[1] ^ (~B[2] & B[3]);
854 C[2] = A[4][2] = B[2] ^ (~B[3] & B[4]);
855 C[3] = A[1][3] = B[3] ^ (~B[4] & B[0]);
856 C[4] = A[3][4] = B[4] ^ (~B[0] & B[1]);
858 B[0] = ROL64(A[4][3] ^ D[3], rhotates[0][3]);
859 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
860 B[2] = ROL64(A[3][0] ^ D[0], rhotates[2][0]);
861 B[3] = ROL64(A[0][1] ^ D[1], rhotates[3][1]);
862 B[4] = ROL64(A[2][2] ^ D[2], rhotates[4][2]);
864 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
865 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
866 C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
867 C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
868 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
870 B[0] = ROL64(A[3][1] ^ D[1], rhotates[0][1]);
871 B[1] = ROL64(A[0][2] ^ D[2], rhotates[1][2]);
872 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
873 B[3] = ROL64(A[4][4] ^ D[4], rhotates[3][4]);
874 B[4] = ROL64(A[1][0] ^ D[0], rhotates[4][0]);
876 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
877 C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
878 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
879 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
880 C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
882 B[0] = ROL64(A[2][4] ^ D[4], rhotates[0][4]);
883 B[1] = ROL64(A[4][0] ^ D[0], rhotates[1][0]);
884 B[2] = ROL64(A[1][1] ^ D[1], rhotates[2][1]);
885 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
886 B[4] = ROL64(A[0][3] ^ D[3], rhotates[4][3]);
888 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
889 C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
890 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
891 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
892 C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
894 B[0] = ROL64(A[1][2] ^ D[2], rhotates[0][2]);
895 B[1] = ROL64(A[3][3] ^ D[3], rhotates[1][3]);
896 B[2] = ROL64(A[0][4] ^ D[4], rhotates[2][4]);
897 B[3] = ROL64(A[2][0] ^ D[0], rhotates[3][0]);
898 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
900 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
901 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
902 C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
903 C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
904 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
907 D[0] = ROL64(C[1], 1) ^ C[4];
908 D[1] = ROL64(C[2], 1) ^ C[0];
909 D[2] = ROL64(C[3], 1) ^ C[1];
910 D[3] = ROL64(C[4], 1) ^ C[2];
911 D[4] = ROL64(C[0], 1) ^ C[3];
913 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
914 B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]);
915 B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]);
916 B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]);
917 B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]);
919 /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3];
920 /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]);
921 /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]);
922 /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]);
923 /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]);
925 B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]);
926 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
927 B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]);
928 B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]);
929 B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]);
931 /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]);
932 /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]);
933 /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]);
934 /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]);
935 /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]);
937 B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]);
938 B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]);
939 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
940 B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]);
941 B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]);
943 /* C[0] ^= */ A[2][0] = B[0] ^ (~B[1] & B[2]);
944 /* C[1] ^= */ A[2][1] = B[1] ^ (~B[2] & B[3]);
945 /* C[2] ^= */ A[2][2] = B[2] ^ (~B[3] & B[4]);
946 /* C[3] ^= */ A[2][3] = B[3] ^ (~B[4] & B[0]);
947 /* C[4] ^= */ A[2][4] = B[4] ^ (~B[0] & B[1]);
949 B[0] = ROL64(A[3][4] ^ D[4], rhotates[0][4]);
950 B[1] = ROL64(A[3][0] ^ D[0], rhotates[1][0]);
951 B[2] = ROL64(A[3][1] ^ D[1], rhotates[2][1]);
952 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
953 B[4] = ROL64(A[3][3] ^ D[3], rhotates[4][3]);
955 /* C[0] ^= */ A[3][0] = B[0] ^ (~B[1] & B[2]);
956 /* C[1] ^= */ A[3][1] = B[1] ^ (~B[2] & B[3]);
957 /* C[2] ^= */ A[3][2] = B[2] ^ (~B[3] & B[4]);
958 /* C[3] ^= */ A[3][3] = B[3] ^ (~B[4] & B[0]);
959 /* C[4] ^= */ A[3][4] = B[4] ^ (~B[0] & B[1]);
961 B[0] = ROL64(A[4][2] ^ D[2], rhotates[0][2]);
962 B[1] = ROL64(A[4][3] ^ D[3], rhotates[1][3]);
963 B[2] = ROL64(A[4][4] ^ D[4], rhotates[2][4]);
964 B[3] = ROL64(A[4][0] ^ D[0], rhotates[3][0]);
965 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
967 /* C[0] ^= */ A[4][0] = B[0] ^ (~B[1] & B[2]);
968 /* C[1] ^= */ A[4][1] = B[1] ^ (~B[2] & B[3]);
969 /* C[2] ^= */ A[4][2] = B[2] ^ (~B[3] & B[4]);
970 /* C[3] ^= */ A[4][3] = B[3] ^ (~B[4] & B[0]);
971 /* C[4] ^= */ A[4][4] = B[4] ^ (~B[0] & B[1]);
974 static void KeccakF1600(uint64_t A[5][5])
978 for (i = 0; i < 24; i += 4) {
985 static uint64_t BitInterleave(uint64_t Ai)
987 if (BIT_INTERLEAVE) {
988 uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
991 t0 = lo & 0x55555555;
992 t0 |= t0 >> 1; t0 &= 0x33333333;
993 t0 |= t0 >> 2; t0 &= 0x0f0f0f0f;
994 t0 |= t0 >> 4; t0 &= 0x00ff00ff;
995 t0 |= t0 >> 8; t0 &= 0x0000ffff;
997 t1 = hi & 0x55555555;
998 t1 |= t1 >> 1; t1 &= 0x33333333;
999 t1 |= t1 >> 2; t1 &= 0x0f0f0f0f;
1000 t1 |= t1 >> 4; t1 &= 0x00ff00ff;
1001 t1 |= t1 >> 8; t1 <<= 16;
1004 lo |= lo << 1; lo &= 0xcccccccc;
1005 lo |= lo << 2; lo &= 0xf0f0f0f0;
1006 lo |= lo << 4; lo &= 0xff00ff00;
1007 lo |= lo << 8; lo >>= 16;
1010 hi |= hi << 1; hi &= 0xcccccccc;
1011 hi |= hi << 2; hi &= 0xf0f0f0f0;
1012 hi |= hi << 4; hi &= 0xff00ff00;
1013 hi |= hi << 8; hi &= 0xffff0000;
1015 Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1021 static uint64_t BitDeinterleave(uint64_t Ai)
1023 if (BIT_INTERLEAVE) {
1024 uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
1027 t0 = lo & 0x0000ffff;
1028 t0 |= t0 << 8; t0 &= 0x00ff00ff;
1029 t0 |= t0 << 4; t0 &= 0x0f0f0f0f;
1030 t0 |= t0 << 2; t0 &= 0x33333333;
1031 t0 |= t0 << 1; t0 &= 0x55555555;
1034 t1 |= t1 >> 8; t1 &= 0xff00ff00;
1035 t1 |= t1 >> 4; t1 &= 0xf0f0f0f0;
1036 t1 |= t1 >> 2; t1 &= 0xcccccccc;
1037 t1 |= t1 >> 1; t1 &= 0xaaaaaaaa;
1040 lo |= lo << 8; lo &= 0x00ff00ff;
1041 lo |= lo << 4; lo &= 0x0f0f0f0f;
1042 lo |= lo << 2; lo &= 0x33333333;
1043 lo |= lo << 1; lo &= 0x55555555;
1046 hi |= hi >> 8; hi &= 0xff00ff00;
1047 hi |= hi >> 4; hi &= 0xf0f0f0f0;
1048 hi |= hi >> 2; hi &= 0xcccccccc;
1049 hi |= hi >> 1; hi &= 0xaaaaaaaa;
1051 Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1058 * SHA3_absorb can be called multiple times, but at each invocation
1059 * largest multiple of |r| out of |len| bytes are processed. Then
1060 * remaining amount of bytes is returned. This is done to spare caller
1061 * trouble of calculating the largest multiple of |r|. |r| can be viewed
1062 * as blocksize. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104,
1063 * 72, but can also be (1600 - 448)/8 = 144. All this means that message
1064 * padding and intermediate sub-block buffering, byte- or bitwise, is
1065 * caller's responsibility.
1067 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
1070 uint64_t *A_flat = (uint64_t *)A;
1071 size_t i, w = r / 8;
1073 assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1076 for (i = 0; i < w; i++) {
1077 uint64_t Ai = (uint64_t)inp[0] | (uint64_t)inp[1] << 8 |
1078 (uint64_t)inp[2] << 16 | (uint64_t)inp[3] << 24 |
1079 (uint64_t)inp[4] << 32 | (uint64_t)inp[5] << 40 |
1080 (uint64_t)inp[6] << 48 | (uint64_t)inp[7] << 56;
1083 A_flat[i] ^= BitInterleave(Ai);
1093 * sha3_squeeze is called once at the end to generate |out| hash value
1096 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r)
1098 uint64_t *A_flat = (uint64_t *)A;
1099 size_t i, w = r / 8;
1101 assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1104 for (i = 0; i < w && len != 0; i++) {
1105 uint64_t Ai = BitDeinterleave(A_flat[i]);
1108 for (i = 0; i < len; i++) {
1109 *out++ = (unsigned char)Ai;
1115 out[0] = (unsigned char)(Ai);
1116 out[1] = (unsigned char)(Ai >> 8);
1117 out[2] = (unsigned char)(Ai >> 16);
1118 out[3] = (unsigned char)(Ai >> 24);
1119 out[4] = (unsigned char)(Ai >> 32);
1120 out[5] = (unsigned char)(Ai >> 40);
1121 out[6] = (unsigned char)(Ai >> 48);
1122 out[7] = (unsigned char)(Ai >> 56);
1134 * Post-padding one-shot implementations would look as following:
1136 * SHA3_224 SHA3_sponge(inp, len, out, 224/8, (1600-448)/8);
1137 * SHA3_256 SHA3_sponge(inp, len, out, 256/8, (1600-512)/8);
1138 * SHA3_384 SHA3_sponge(inp, len, out, 384/8, (1600-768)/8);
1139 * SHA3_512 SHA3_sponge(inp, len, out, 512/8, (1600-1024)/8);
1140 * SHAKE_128 SHA3_sponge(inp, len, out, d, (1600-256)/8);
1141 * SHAKE_256 SHA3_sponge(inp, len, out, d, (1600-512)/8);
1144 void SHA3_sponge(const unsigned char *inp, size_t len,
1145 unsigned char *out, size_t d, size_t r)
1149 memset(A, 0, sizeof(A));
1150 SHA3_absorb(A, inp, len, r);
1151 SHA3_squeeze(A, out, d, r);
1159 * This is 5-bit SHAKE128 test from http://csrc.nist.gov/groups/ST/toolkit/examples.html#aHashing
1161 unsigned char test[168] = { '\xf3', '\x3' };
1162 unsigned char out[512];
1164 static const unsigned char result[512] = {
1165 0x2E, 0x0A, 0xBF, 0xBA, 0x83, 0xE6, 0x72, 0x0B,
1166 0xFB, 0xC2, 0x25, 0xFF, 0x6B, 0x7A, 0xB9, 0xFF,
1167 0xCE, 0x58, 0xBA, 0x02, 0x7E, 0xE3, 0xD8, 0x98,
1168 0x76, 0x4F, 0xEF, 0x28, 0x7D, 0xDE, 0xCC, 0xCA,
1169 0x3E, 0x6E, 0x59, 0x98, 0x41, 0x1E, 0x7D, 0xDB,
1170 0x32, 0xF6, 0x75, 0x38, 0xF5, 0x00, 0xB1, 0x8C,
1171 0x8C, 0x97, 0xC4, 0x52, 0xC3, 0x70, 0xEA, 0x2C,
1172 0xF0, 0xAF, 0xCA, 0x3E, 0x05, 0xDE, 0x7E, 0x4D,
1173 0xE2, 0x7F, 0xA4, 0x41, 0xA9, 0xCB, 0x34, 0xFD,
1174 0x17, 0xC9, 0x78, 0xB4, 0x2D, 0x5B, 0x7E, 0x7F,
1175 0x9A, 0xB1, 0x8F, 0xFE, 0xFF, 0xC3, 0xC5, 0xAC,
1176 0x2F, 0x3A, 0x45, 0x5E, 0xEB, 0xFD, 0xC7, 0x6C,
1177 0xEA, 0xEB, 0x0A, 0x2C, 0xCA, 0x22, 0xEE, 0xF6,
1178 0xE6, 0x37, 0xF4, 0xCA, 0xBE, 0x5C, 0x51, 0xDE,
1179 0xD2, 0xE3, 0xFA, 0xD8, 0xB9, 0x52, 0x70, 0xA3,
1180 0x21, 0x84, 0x56, 0x64, 0xF1, 0x07, 0xD1, 0x64,
1181 0x96, 0xBB, 0x7A, 0xBF, 0xBE, 0x75, 0x04, 0xB6,
1182 0xED, 0xE2, 0xE8, 0x9E, 0x4B, 0x99, 0x6F, 0xB5,
1183 0x8E, 0xFD, 0xC4, 0x18, 0x1F, 0x91, 0x63, 0x38,
1184 0x1C, 0xBE, 0x7B, 0xC0, 0x06, 0xA7, 0xA2, 0x05,
1185 0x98, 0x9C, 0x52, 0x6C, 0xD1, 0xBD, 0x68, 0x98,
1186 0x36, 0x93, 0xB4, 0xBD, 0xC5, 0x37, 0x28, 0xB2,
1187 0x41, 0xC1, 0xCF, 0xF4, 0x2B, 0xB6, 0x11, 0x50,
1188 0x2C, 0x35, 0x20, 0x5C, 0xAB, 0xB2, 0x88, 0x75,
1189 0x56, 0x55, 0xD6, 0x20, 0xC6, 0x79, 0x94, 0xF0,
1190 0x64, 0x51, 0x18, 0x7F, 0x6F, 0xD1, 0x7E, 0x04,
1191 0x66, 0x82, 0xBA, 0x12, 0x86, 0x06, 0x3F, 0xF8,
1192 0x8F, 0xE2, 0x50, 0x8D, 0x1F, 0xCA, 0xF9, 0x03,
1193 0x5A, 0x12, 0x31, 0xAD, 0x41, 0x50, 0xA9, 0xC9,
1194 0xB2, 0x4C, 0x9B, 0x2D, 0x66, 0xB2, 0xAD, 0x1B,
1195 0xDE, 0x0B, 0xD0, 0xBB, 0xCB, 0x8B, 0xE0, 0x5B,
1196 0x83, 0x52, 0x29, 0xEF, 0x79, 0x19, 0x73, 0x73,
1197 0x23, 0x42, 0x44, 0x01, 0xE1, 0xD8, 0x37, 0xB6,
1198 0x6E, 0xB4, 0xE6, 0x30, 0xFF, 0x1D, 0xE7, 0x0C,
1199 0xB3, 0x17, 0xC2, 0xBA, 0xCB, 0x08, 0x00, 0x1D,
1200 0x34, 0x77, 0xB7, 0xA7, 0x0A, 0x57, 0x6D, 0x20,
1201 0x86, 0x90, 0x33, 0x58, 0x9D, 0x85, 0xA0, 0x1D,
1202 0xDB, 0x2B, 0x66, 0x46, 0xC0, 0x43, 0xB5, 0x9F,
1203 0xC0, 0x11, 0x31, 0x1D, 0xA6, 0x66, 0xFA, 0x5A,
1204 0xD1, 0xD6, 0x38, 0x7F, 0xA9, 0xBC, 0x40, 0x15,
1205 0xA3, 0x8A, 0x51, 0xD1, 0xDA, 0x1E, 0xA6, 0x1D,
1206 0x64, 0x8D, 0xC8, 0xE3, 0x9A, 0x88, 0xB9, 0xD6,
1207 0x22, 0xBD, 0xE2, 0x07, 0xFD, 0xAB, 0xC6, 0xF2,
1208 0x82, 0x7A, 0x88, 0x0C, 0x33, 0x0B, 0xBF, 0x6D,
1209 0xF7, 0x33, 0x77, 0x4B, 0x65, 0x3E, 0x57, 0x30,
1210 0x5D, 0x78, 0xDC, 0xE1, 0x12, 0xF1, 0x0A, 0x2C,
1211 0x71, 0xF4, 0xCD, 0xAD, 0x92, 0xED, 0x11, 0x3E,
1212 0x1C, 0xEA, 0x63, 0xB9, 0x19, 0x25, 0xED, 0x28,
1213 0x19, 0x1E, 0x6D, 0xBB, 0xB5, 0xAA, 0x5A, 0x2A,
1214 0xFD, 0xA5, 0x1F, 0xC0, 0x5A, 0x3A, 0xF5, 0x25,
1215 0x8B, 0x87, 0x66, 0x52, 0x43, 0x55, 0x0F, 0x28,
1216 0x94, 0x8A, 0xE2, 0xB8, 0xBE, 0xB6, 0xBC, 0x9C,
1217 0x77, 0x0B, 0x35, 0xF0, 0x67, 0xEA, 0xA6, 0x41,
1218 0xEF, 0xE6, 0x5B, 0x1A, 0x44, 0x90, 0x9D, 0x1B,
1219 0x14, 0x9F, 0x97, 0xEE, 0xA6, 0x01, 0x39, 0x1C,
1220 0x60, 0x9E, 0xC8, 0x1D, 0x19, 0x30, 0xF5, 0x7C,
1221 0x18, 0xA4, 0xE0, 0xFA, 0xB4, 0x91, 0xD1, 0xCA,
1222 0xDF, 0xD5, 0x04, 0x83, 0x44, 0x9E, 0xDC, 0x0F,
1223 0x07, 0xFF, 0xB2, 0x4D, 0x2C, 0x6F, 0x9A, 0x9A,
1224 0x3B, 0xFF, 0x39, 0xAE, 0x3D, 0x57, 0xF5, 0x60,
1225 0x65, 0x4D, 0x7D, 0x75, 0xC9, 0x08, 0xAB, 0xE6,
1226 0x25, 0x64, 0x75, 0x3E, 0xAC, 0x39, 0xD7, 0x50,
1227 0x3D, 0xA6, 0xD3, 0x7C, 0x2E, 0x32, 0xE1, 0xAF,
1228 0x3B, 0x8A, 0xEC, 0x8A, 0xE3, 0x06, 0x9C, 0xD9
1232 SHA3_sponge(test, sizeof(test), out, sizeof(out), sizeof(test));
1235 * Rationale behind keeping output [formatted as below] is that
1236 * one should be able to redirect it to a file, then copy-n-paste
1237 * final "output val" from official example to another file, and
1238 * compare the two with diff(1).
1240 for (i = 0; i < sizeof(out);) {
1241 printf("%02X", out[i]);
1242 printf(++i % 16 && i != sizeof(out) ? " " : "\n");
1245 if (memcmp(out,result,sizeof(out))) {
1246 fprintf(stderr,"failure\n");
1249 fprintf(stderr,"success\n");