1 /* vi: set sw=4 ts=4: */
5 * Copyright (C) 2010 Denys Vlasenko
7 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
12 /* gcc 4.2.1 optimizes rotr64 better with inline than with macro
13 * (for rotX32, there is no difference). Why? My guess is that
14 * macro requires clever common subexpression elimination heuristics
15 * in gcc, while inline basically forces it to happen.
17 //#define rotl32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
18 static ALWAYS_INLINE uint32_t rotl32(uint32_t x, unsigned n)
20 return (x << n) | (x >> (32 - n));
22 //#define rotr32(x,n) (((x) >> (n)) | ((x) << (32 - (n))))
23 static ALWAYS_INLINE uint32_t rotr32(uint32_t x, unsigned n)
25 return (x >> n) | (x << (32 - n));
27 /* rotr64 in needed for sha512 only: */
28 //#define rotr64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))
29 static ALWAYS_INLINE uint64_t rotr64(uint64_t x, unsigned n)
31 return (x >> n) | (x << (64 - n));
34 /* rotl64 only used for sha3 currently */
35 static ALWAYS_INLINE uint64_t rotl64(uint64_t x, unsigned n)
37 return (x << n) | (x >> (64 - n));
40 /* Feed data through a temporary buffer.
41 * The internal buffer remembers previous data until it has 64
42 * bytes worth to pass on.
44 static void FAST_FUNC common64_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
46 unsigned bufpos = ctx->total64 & 63;
51 unsigned remaining = 64 - bufpos;
54 /* Copy data into aligned buffer */
55 memcpy(ctx->wbuffer + bufpos, buffer, remaining);
57 buffer = (const char *)buffer + remaining;
59 /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
63 /* Buffer is filled up, process it */
64 ctx->process_block(ctx);
65 /*bufpos = 0; - already is */
69 /* Process the remaining bytes in the buffer */
70 static void FAST_FUNC common64_end(md5_ctx_t *ctx, int swap_needed)
72 unsigned bufpos = ctx->total64 & 63;
73 /* Pad the buffer to the next 64-byte boundary with 0x80,0,0,0... */
74 ctx->wbuffer[bufpos++] = 0x80;
76 /* This loop iterates either once or twice, no more, no less */
78 unsigned remaining = 64 - bufpos;
79 memset(ctx->wbuffer + bufpos, 0, remaining);
80 /* Do we have enough space for the length count? */
82 /* Store the 64-bit counter of bits in the buffer */
83 uint64_t t = ctx->total64 << 3;
86 /* wbuffer is suitably aligned for this */
87 *(bb__aliased_uint64_t *) (&ctx->wbuffer[64 - 8]) = t;
89 ctx->process_block(ctx);
98 * Compute MD5 checksum of strings according to the
99 * definition of MD5 in RFC 1321 from April 1992.
101 * Written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
103 * Copyright (C) 1995-1999 Free Software Foundation, Inc.
104 * Copyright (C) 2001 Manuel Novoa III
105 * Copyright (C) 2003 Glenn L. McGrath
106 * Copyright (C) 2003 Erik Andersen
108 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
111 /* 0: fastest, 3: smallest */
112 #if CONFIG_MD5_SMALL < 0
114 #elif CONFIG_MD5_SMALL > 3
117 # define MD5_SMALL CONFIG_MD5_SMALL
120 /* These are the four functions used in the four steps of the MD5 algorithm
121 * and defined in the RFC 1321. The first function is a little bit optimized
122 * (as found in Colin Plumbs public domain implementation).
123 * #define FF(b, c, d) ((b & c) | (~b & d))
129 #define FF(b, c, d) (d ^ (b & (c ^ d)))
130 #define FG(b, c, d) FF(d, b, c)
131 #define FH(b, c, d) (b ^ c ^ d)
132 #define FI(b, c, d) (c ^ (b | ~d))
134 /* Hash a single block, 64 bytes long and 4-byte aligned */
135 static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
138 /* Before we start, one word to the strange constants.
139 They are defined in RFC 1321 as
140 T[i] = (int)(2^32 * fabs(sin(i))), i=1..64
142 static const uint32_t C_array[] = {
144 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
145 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
146 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
147 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
149 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
150 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
151 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
152 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
154 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
155 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
156 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x4881d05,
157 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
159 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
160 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
161 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
162 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
164 static const char P_array[] ALIGN1 = {
166 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /* 1 */
168 1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, /* 2 */
169 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, /* 3 */
170 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9 /* 4 */
173 uint32_t *words = (void*) ctx->wbuffer;
174 uint32_t A = ctx->hash[0];
175 uint32_t B = ctx->hash[1];
176 uint32_t C = ctx->hash[2];
177 uint32_t D = ctx->hash[3];
179 #if MD5_SMALL >= 2 /* 2 or 3 */
181 static const char S_array[] ALIGN1 = {
194 for (i = 0; i < 16; i++)
195 words[i] = SWAP_LE32(words[i]);
202 for (i = 0; i < 64; i++) {
216 default: /* case 3 */
219 temp += words[(int) (*pp++)] + *pc++;
220 temp = rotl32(temp, ps[i & 3]);
227 # else /* MD5_SMALL == 2 */
232 for (i = 0; i < 16; i++) {
233 temp = A + FF(B, C, D) + words[(int) (*pp++)] + *pc++;
234 temp = rotl32(temp, ps[i & 3]);
242 for (i = 0; i < 16; i++) {
243 temp = A + FG(B, C, D) + words[(int) (*pp++)] + *pc++;
244 temp = rotl32(temp, ps[i & 3]);
252 for (i = 0; i < 16; i++) {
253 temp = A + FH(B, C, D) + words[(int) (*pp++)] + *pc++;
254 temp = rotl32(temp, ps[i & 3]);
262 for (i = 0; i < 16; i++) {
263 temp = A + FI(B, C, D) + words[(int) (*pp++)] + *pc++;
264 temp = rotl32(temp, ps[i & 3]);
272 /* Add checksum to the starting values */
278 #else /* MD5_SMALL == 0 or 1 */
286 /* First round: using the given function, the context and a constant
287 the next context is computed. Because the algorithm's processing
288 unit is a 32-bit word and it is determined to work on words in
289 little endian byte order we perhaps have to change the byte order
290 before the computation. To reduce the work for the next steps
291 we save swapped words in WORDS array. */
293 # define OP(a, b, c, d, s, T) \
295 a += FF(b, c, d) + (*words IF_BIG_ENDIAN(= SWAP_LE32(*words))) + T; \
304 for (i = 0; i < 4; i++) {
305 OP(A, B, C, D, 7, *pc++);
306 OP(D, A, B, C, 12, *pc++);
307 OP(C, D, A, B, 17, *pc++);
308 OP(B, C, D, A, 22, *pc++);
311 OP(A, B, C, D, 7, 0xd76aa478);
312 OP(D, A, B, C, 12, 0xe8c7b756);
313 OP(C, D, A, B, 17, 0x242070db);
314 OP(B, C, D, A, 22, 0xc1bdceee);
315 OP(A, B, C, D, 7, 0xf57c0faf);
316 OP(D, A, B, C, 12, 0x4787c62a);
317 OP(C, D, A, B, 17, 0xa8304613);
318 OP(B, C, D, A, 22, 0xfd469501);
319 OP(A, B, C, D, 7, 0x698098d8);
320 OP(D, A, B, C, 12, 0x8b44f7af);
321 OP(C, D, A, B, 17, 0xffff5bb1);
322 OP(B, C, D, A, 22, 0x895cd7be);
323 OP(A, B, C, D, 7, 0x6b901122);
324 OP(D, A, B, C, 12, 0xfd987193);
325 OP(C, D, A, B, 17, 0xa679438e);
326 OP(B, C, D, A, 22, 0x49b40821);
330 /* For the second to fourth round we have the possibly swapped words
331 in WORDS. Redefine the macro to take an additional first
332 argument specifying the function to use. */
334 # define OP(f, a, b, c, d, k, s, T) \
336 a += f(b, c, d) + words[k] + T; \
344 for (i = 0; i < 4; i++) {
345 OP(FG, A, B, C, D, (int) (*pp++), 5, *pc++);
346 OP(FG, D, A, B, C, (int) (*pp++), 9, *pc++);
347 OP(FG, C, D, A, B, (int) (*pp++), 14, *pc++);
348 OP(FG, B, C, D, A, (int) (*pp++), 20, *pc++);
351 OP(FG, A, B, C, D, 1, 5, 0xf61e2562);
352 OP(FG, D, A, B, C, 6, 9, 0xc040b340);
353 OP(FG, C, D, A, B, 11, 14, 0x265e5a51);
354 OP(FG, B, C, D, A, 0, 20, 0xe9b6c7aa);
355 OP(FG, A, B, C, D, 5, 5, 0xd62f105d);
356 OP(FG, D, A, B, C, 10, 9, 0x02441453);
357 OP(FG, C, D, A, B, 15, 14, 0xd8a1e681);
358 OP(FG, B, C, D, A, 4, 20, 0xe7d3fbc8);
359 OP(FG, A, B, C, D, 9, 5, 0x21e1cde6);
360 OP(FG, D, A, B, C, 14, 9, 0xc33707d6);
361 OP(FG, C, D, A, B, 3, 14, 0xf4d50d87);
362 OP(FG, B, C, D, A, 8, 20, 0x455a14ed);
363 OP(FG, A, B, C, D, 13, 5, 0xa9e3e905);
364 OP(FG, D, A, B, C, 2, 9, 0xfcefa3f8);
365 OP(FG, C, D, A, B, 7, 14, 0x676f02d9);
366 OP(FG, B, C, D, A, 12, 20, 0x8d2a4c8a);
371 for (i = 0; i < 4; i++) {
372 OP(FH, A, B, C, D, (int) (*pp++), 4, *pc++);
373 OP(FH, D, A, B, C, (int) (*pp++), 11, *pc++);
374 OP(FH, C, D, A, B, (int) (*pp++), 16, *pc++);
375 OP(FH, B, C, D, A, (int) (*pp++), 23, *pc++);
378 OP(FH, A, B, C, D, 5, 4, 0xfffa3942);
379 OP(FH, D, A, B, C, 8, 11, 0x8771f681);
380 OP(FH, C, D, A, B, 11, 16, 0x6d9d6122);
381 OP(FH, B, C, D, A, 14, 23, 0xfde5380c);
382 OP(FH, A, B, C, D, 1, 4, 0xa4beea44);
383 OP(FH, D, A, B, C, 4, 11, 0x4bdecfa9);
384 OP(FH, C, D, A, B, 7, 16, 0xf6bb4b60);
385 OP(FH, B, C, D, A, 10, 23, 0xbebfbc70);
386 OP(FH, A, B, C, D, 13, 4, 0x289b7ec6);
387 OP(FH, D, A, B, C, 0, 11, 0xeaa127fa);
388 OP(FH, C, D, A, B, 3, 16, 0xd4ef3085);
389 OP(FH, B, C, D, A, 6, 23, 0x04881d05);
390 OP(FH, A, B, C, D, 9, 4, 0xd9d4d039);
391 OP(FH, D, A, B, C, 12, 11, 0xe6db99e5);
392 OP(FH, C, D, A, B, 15, 16, 0x1fa27cf8);
393 OP(FH, B, C, D, A, 2, 23, 0xc4ac5665);
398 for (i = 0; i < 4; i++) {
399 OP(FI, A, B, C, D, (int) (*pp++), 6, *pc++);
400 OP(FI, D, A, B, C, (int) (*pp++), 10, *pc++);
401 OP(FI, C, D, A, B, (int) (*pp++), 15, *pc++);
402 OP(FI, B, C, D, A, (int) (*pp++), 21, *pc++);
405 OP(FI, A, B, C, D, 0, 6, 0xf4292244);
406 OP(FI, D, A, B, C, 7, 10, 0x432aff97);
407 OP(FI, C, D, A, B, 14, 15, 0xab9423a7);
408 OP(FI, B, C, D, A, 5, 21, 0xfc93a039);
409 OP(FI, A, B, C, D, 12, 6, 0x655b59c3);
410 OP(FI, D, A, B, C, 3, 10, 0x8f0ccc92);
411 OP(FI, C, D, A, B, 10, 15, 0xffeff47d);
412 OP(FI, B, C, D, A, 1, 21, 0x85845dd1);
413 OP(FI, A, B, C, D, 8, 6, 0x6fa87e4f);
414 OP(FI, D, A, B, C, 15, 10, 0xfe2ce6e0);
415 OP(FI, C, D, A, B, 6, 15, 0xa3014314);
416 OP(FI, B, C, D, A, 13, 21, 0x4e0811a1);
417 OP(FI, A, B, C, D, 4, 6, 0xf7537e82);
418 OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
419 OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
420 OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
423 /* Add checksum to the starting values */
435 /* Initialize structure containing state of computation.
436 * (RFC 1321, 3.3: Step 3)
438 void FAST_FUNC md5_begin(md5_ctx_t *ctx)
440 ctx->hash[0] = 0x67452301;
441 ctx->hash[1] = 0xefcdab89;
442 ctx->hash[2] = 0x98badcfe;
443 ctx->hash[3] = 0x10325476;
445 ctx->process_block = md5_process_block64;
448 /* Used also for sha1 and sha256 */
449 void FAST_FUNC md5_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
451 common64_hash(ctx, buffer, len);
454 /* Process the remaining bytes in the buffer and put result from CTX
455 * in first 16 bytes following RESBUF. The result is always in little
456 * endian byte order, so that a byte-wise output yields to the wanted
457 * ASCII representation of the message digest.
459 void FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
461 /* MD5 stores total in LE, need to swap on BE arches: */
462 common64_end(ctx, /*swap_needed:*/ BB_BIG_ENDIAN);
464 /* The MD5 result is in little endian byte order */
466 ctx->hash[0] = SWAP_LE32(ctx->hash[0]);
467 ctx->hash[1] = SWAP_LE32(ctx->hash[1]);
468 ctx->hash[2] = SWAP_LE32(ctx->hash[2]);
469 ctx->hash[3] = SWAP_LE32(ctx->hash[3]);
472 memcpy(resbuf, ctx->hash, sizeof(ctx->hash[0]) * 4);
478 * Copyright 2007 Rob Landley <rob@landley.net>
480 * Based on the public domain SHA-1 in C by Steve Reid <steve@edmweb.com>
481 * from http://www.mirrors.wiretapped.net/security/cryptography/hashes/sha1/
483 * Licensed under GPLv2, see file LICENSE in this source tree.
485 * ---------------------------------------------------------------------------
487 * SHA256 and SHA512 parts are:
488 * Released into the Public Domain by Ulrich Drepper <drepper@redhat.com>.
489 * Shrank by Denys Vlasenko.
491 * ---------------------------------------------------------------------------
493 * The best way to test random blocksizes is to go to coreutils/md5_sha1_sum.c
494 * and replace "4096" with something like "2000 + time(NULL) % 2097",
495 * then rebuild and compare "shaNNNsum bigfile" results.
498 static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
500 static const uint32_t rconsts[] = {
501 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
506 uint32_t a, b, c, d, e;
508 /* On-stack work buffer frees up one register in the main loop
509 * which otherwise will be needed to hold ctx pointer */
510 for (i = 0; i < 16; i++)
511 W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
519 /* 4 rounds of 20 operations each */
521 for (i = 0; i < 4; i++) {
528 work = (work & b) ^ d;
531 /* Used to do SWAP_BE32 here, but this
532 * requires ctx (see comment above) */
536 work = ((b | c) & d) | (b & c);
537 else /* i = 1 or 3 */
540 W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);
543 work += e + rotl32(a, 5) + rconsts[i];
545 /* Rotate by one for next time */
548 c = /* b = */ rotl32(b, 30);
551 cnt = (cnt + 1) & 15;
562 /* Constants for SHA512 from FIPS 180-2:4.2.3.
563 * SHA256 constants from FIPS 180-2:4.2.2
564 * are the most significant half of first 64 elements
567 static const uint64_t sha_K[80] = {
568 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
569 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
570 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
571 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
572 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
573 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
574 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
575 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
576 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
577 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
578 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
579 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
580 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
581 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
582 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
583 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
584 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
585 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
586 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
587 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
588 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
589 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
590 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
591 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
592 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
593 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
594 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
595 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
596 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
597 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
598 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
599 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
600 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, /* [64]+ are used for sha512 only */
601 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
602 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
603 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
604 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
605 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
606 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
607 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
617 static void FAST_FUNC sha256_process_block64(sha256_ctx_t *ctx)
620 uint32_t W[64], a, b, c, d, e, f, g, h;
621 const uint32_t *words = (uint32_t*) ctx->wbuffer;
623 /* Operators defined in FIPS 180-2:4.1.2. */
624 #define Ch(x, y, z) ((x & y) ^ (~x & z))
625 #define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
626 #define S0(x) (rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22))
627 #define S1(x) (rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25))
628 #define R0(x) (rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3))
629 #define R1(x) (rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10))
631 /* Compute the message schedule according to FIPS 180-2:6.2.2 step 2. */
632 for (t = 0; t < 16; ++t)
633 W[t] = SWAP_BE32(words[t]);
634 for (/*t = 16*/; t < 64; ++t)
635 W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];
646 /* The actual computation according to FIPS 180-2:6.2.2 step 3. */
647 for (t = 0; t < 64; ++t) {
648 /* Need to fetch upper half of sha_K[t]
649 * (I hope compiler is clever enough to just fetch
652 uint32_t K_t = sha_K[t] >> 32;
653 uint32_t T1 = h + S1(e) + Ch(e, f, g) + K_t + W[t];
654 uint32_t T2 = S0(a) + Maj(a, b, c);
670 /* Add the starting values of the context according to FIPS 180-2:6.2.2
682 static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
686 /* On i386, having assignments here (not later as sha256 does)
687 * produces 99 bytes smaller code with gcc 4.3.1
689 uint64_t a = ctx->hash[0];
690 uint64_t b = ctx->hash[1];
691 uint64_t c = ctx->hash[2];
692 uint64_t d = ctx->hash[3];
693 uint64_t e = ctx->hash[4];
694 uint64_t f = ctx->hash[5];
695 uint64_t g = ctx->hash[6];
696 uint64_t h = ctx->hash[7];
697 const uint64_t *words = (uint64_t*) ctx->wbuffer;
699 /* Operators defined in FIPS 180-2:4.1.2. */
700 #define Ch(x, y, z) ((x & y) ^ (~x & z))
701 #define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
702 #define S0(x) (rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39))
703 #define S1(x) (rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41))
704 #define R0(x) (rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7))
705 #define R1(x) (rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6))
707 /* Compute the message schedule according to FIPS 180-2:6.3.2 step 2. */
708 for (t = 0; t < 16; ++t)
709 W[t] = SWAP_BE64(words[t]);
710 for (/*t = 16*/; t < 80; ++t)
711 W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];
713 /* The actual computation according to FIPS 180-2:6.3.2 step 3. */
714 for (t = 0; t < 80; ++t) {
715 uint64_t T1 = h + S1(e) + Ch(e, f, g) + sha_K[t] + W[t];
716 uint64_t T2 = S0(a) + Maj(a, b, c);
732 /* Add the starting values of the context according to FIPS 180-2:6.3.2
745 void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
747 ctx->hash[0] = 0x67452301;
748 ctx->hash[1] = 0xefcdab89;
749 ctx->hash[2] = 0x98badcfe;
750 ctx->hash[3] = 0x10325476;
751 ctx->hash[4] = 0xc3d2e1f0;
753 ctx->process_block = sha1_process_block64;
756 static const uint32_t init256[] = {
768 static const uint32_t init512_lo[] = {
781 /* Initialize structure containing state of computation.
782 (FIPS 180-2:5.3.2) */
783 void FAST_FUNC sha256_begin(sha256_ctx_t *ctx)
785 memcpy(&ctx->total64, init256, sizeof(init256));
786 /*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */
787 ctx->process_block = sha256_process_block64;
790 /* Initialize structure containing state of computation.
791 (FIPS 180-2:5.3.3) */
792 void FAST_FUNC sha512_begin(sha512_ctx_t *ctx)
795 /* Two extra iterations zero out ctx->total64[2] */
796 uint64_t *tp = ctx->total64;
797 for (i = 0; i < 2+8; i++)
798 tp[i] = ((uint64_t)(init256[i]) << 32) + init512_lo[i];
799 /*ctx->total64[0] = ctx->total64[1] = 0; - already done */
802 void FAST_FUNC sha512_hash(sha512_ctx_t *ctx, const void *buffer, size_t len)
804 unsigned bufpos = ctx->total64[0] & 127;
807 /* First increment the byte count. FIPS 180-2 specifies the possible
808 length of the file up to 2^128 _bits_.
809 We compute the number of _bytes_ and convert to bits later. */
810 ctx->total64[0] += len;
811 if (ctx->total64[0] < len)
814 remaining = 128 - bufpos;
816 /* Hash whole blocks */
817 while (len >= remaining) {
818 memcpy(ctx->wbuffer + bufpos, buffer, remaining);
819 buffer = (const char *)buffer + remaining;
823 sha512_process_block128(ctx);
826 /* Save last, partial blosk */
827 memcpy(ctx->wbuffer + bufpos, buffer, len);
830 remaining = 128 - bufpos;
833 /* Copy data into aligned buffer */
834 memcpy(ctx->wbuffer + bufpos, buffer, remaining);
836 buffer = (const char *)buffer + remaining;
838 /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
842 /* Buffer is filled up, process it */
843 sha512_process_block128(ctx);
844 /*bufpos = 0; - already is */
849 /* Used also for sha256 */
850 void FAST_FUNC sha1_end(sha1_ctx_t *ctx, void *resbuf)
854 /* SHA stores total in BE, need to swap on LE arches: */
855 common64_end(ctx, /*swap_needed:*/ BB_LITTLE_ENDIAN);
857 hash_size = (ctx->process_block == sha1_process_block64) ? 5 : 8;
858 /* This way we do not impose alignment constraints on resbuf: */
859 if (BB_LITTLE_ENDIAN) {
861 for (i = 0; i < hash_size; ++i)
862 ctx->hash[i] = SWAP_BE32(ctx->hash[i]);
864 memcpy(resbuf, ctx->hash, sizeof(ctx->hash[0]) * hash_size);
867 void FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
869 unsigned bufpos = ctx->total64[0] & 127;
871 /* Pad the buffer to the next 128-byte boundary with 0x80,0,0,0... */
872 ctx->wbuffer[bufpos++] = 0x80;
875 unsigned remaining = 128 - bufpos;
876 memset(ctx->wbuffer + bufpos, 0, remaining);
877 if (remaining >= 16) {
878 /* Store the 128-bit counter of bits in the buffer in BE format */
880 t = ctx->total64[0] << 3;
882 *(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 8]) = t;
883 t = (ctx->total64[1] << 3) | (ctx->total64[0] >> 61);
885 *(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 16]) = t;
887 sha512_process_block128(ctx);
893 if (BB_LITTLE_ENDIAN) {
895 for (i = 0; i < ARRAY_SIZE(ctx->hash); ++i)
896 ctx->hash[i] = SWAP_BE64(ctx->hash[i]);
898 memcpy(resbuf, ctx->hash, sizeof(ctx->hash));
903 * The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
904 * Michael Peeters and Gilles Van Assche. For more information, feedback or
905 * questions, please refer to our website: http://keccak.noekeon.org/
907 * Implementation by Ronny Van Keer,
908 * hereby denoted as "the implementer".
910 * To the extent possible under law, the implementer has waived all copyright
911 * and related or neighboring rights to the source code in this file.
912 * http://creativecommons.org/publicdomain/zero/1.0/
914 * Busybox modifications (C) Lauri Kasanen, under the GPLv2.
917 #if CONFIG_SHA3_SMALL < 0
918 # define SHA3_SMALL 0
919 #elif CONFIG_SHA3_SMALL > 1
920 # define SHA3_SMALL 1
922 # define SHA3_SMALL CONFIG_SHA3_SMALL
925 #define OPTIMIZE_SHA3_FOR_32 0
927 * SHA3 can be optimized for 32-bit CPUs with bit-slicing:
928 * every 64-bit word of state[] can be split into two 32-bit words
929 * by even/odd bits. In this form, all rotations of sha3 round
930 * are 32-bit - and there are lots of them.
931 * However, it requires either splitting/combining state words
932 * before/after sha3 round (code does this now)
933 * or shuffling bits before xor'ing them into state and in sha3_end.
934 * Without shuffling, bit-slicing results in -130 bytes of code
935 * and marginal speedup (but of course it gives wrong result).
936 * With shuffling it works, but +260 code bytes, and slower.
939 #if 0 /* LONG_MAX == 0x7fffffff */
940 # undef OPTIMIZE_SHA3_FOR_32
941 # define OPTIMIZE_SHA3_FOR_32 1
944 #if OPTIMIZE_SHA3_FOR_32
945 /* This splits every 64-bit word into a pair of 32-bit words,
946 * even bits go into first word, odd bits go to second one.
947 * The conversion is done in-place.
949 static void split_halves(uint64_t *state)
951 /* Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
952 uint32_t *s32 = (uint32_t*)state;
955 for (i = 24; i >= 0; --i) {
957 t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
958 t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
959 t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
960 t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
962 t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
963 t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
964 t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
965 t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
966 *s32++ = (x0 & 0x0000FFFF) | (x1 << 16);
967 *s32++ = (x0 >> 16) | (x1 & 0xFFFF0000);
970 /* The reverse operation */
971 static void combine_halves(uint64_t *state)
973 uint32_t *s32 = (uint32_t*)state;
976 for (i = 24; i >= 0; --i) {
979 t = (x0 & 0x0000FFFF) | (x1 << 16);
980 x1 = (x0 >> 16) | (x1 & 0xFFFF0000);
982 t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
983 t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
984 t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
985 t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
987 t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
988 t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
989 t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
990 t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
997 * In the crypto literature this function is usually called Keccak-f().
999 static void sha3_process_block72(uint64_t *state)
1001 enum { NROUNDS = 24 };
1003 #if OPTIMIZE_SHA3_FOR_32
1005 static const uint32_t IOTA_CONST_0[NROUNDS] = {
1031 ** bits are in lsb: 0101 0000 1111 0100 1111 0001
1033 uint32_t IOTA_CONST_0bits = (uint32_t)(0x0050f4f1);
1034 static const uint32_t IOTA_CONST_1[NROUNDS] = {
1061 uint32_t *const s32 = (uint32_t*)state;
1064 split_halves(state);
1066 for (round = 0; round < NROUNDS; round++) {
1072 for (x = 0; x < 10; ++x) {
1073 BC[x+10] = BC[x] = s32[x]^s32[x+10]^s32[x+20]^s32[x+30]^s32[x+40];
1075 for (x = 0; x < 10; x += 2) {
1077 ta = BC[x+8] ^ rotl32(BC[x+3], 1);
1078 tb = BC[x+9] ^ BC[x+2];
1093 uint32_t t0a,t0b, t1a,t1b;
1097 #define RhoPi(PI_LANE, ROT_CONST) \
1098 t0a = s32[PI_LANE*2+0];\
1099 t0b = s32[PI_LANE*2+1];\
1100 if (ROT_CONST & 1) {\
1101 s32[PI_LANE*2+0] = rotl32(t1b, ROT_CONST/2+1);\
1102 s32[PI_LANE*2+1] = ROT_CONST == 1 ? t1a : rotl32(t1a, ROT_CONST/2+0);\
1104 s32[PI_LANE*2+0] = rotl32(t1a, ROT_CONST/2);\
1105 s32[PI_LANE*2+1] = rotl32(t1b, ROT_CONST/2);\
1107 t1a = t0a; t1b = t0b;
1136 for (x = 0; x <= 40;) {
1137 uint32_t BC0, BC1, BC2, BC3, BC4;
1141 s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1143 s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1145 s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1146 s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1147 s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1152 s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1154 s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1156 s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1157 s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1158 s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1162 s32[0] ^= IOTA_CONST_0bits & 1;
1163 IOTA_CONST_0bits >>= 1;
1164 s32[1] ^= IOTA_CONST_1[round];
1167 combine_halves(state);
1169 /* Native 64-bit algorithm */
1170 static const uint16_t IOTA_CONST[NROUNDS] = {
1171 /* Elements should be 64-bit, but top half is always zero
1172 * or 0x80000000. We encode 63rd bits in a separate word below.
1173 * Same is true for 31th bits, which lets us use 16-bit table
1174 * instead of 64-bit. The speed penalty is lost in the noise.
1201 /* bit for CONST[0] is in msb: 0011 0011 0000 0111 1101 1101 */
1202 const uint32_t IOTA_CONST_bit63 = (uint32_t)(0x3307dd00);
1203 /* bit for CONST[0] is in msb: 0001 0110 0011 1000 0001 1011 */
1204 const uint32_t IOTA_CONST_bit31 = (uint32_t)(0x16381b00);
1206 static const uint8_t ROT_CONST[24] = {
1207 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
1208 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44,
1210 static const uint8_t PI_LANE[24] = {
1211 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
1212 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1,
1214 /*static const uint8_t MOD5[10] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/
1219 if (BB_BIG_ENDIAN) {
1220 for (x = 0; x < 25; x++) {
1221 state[x] = SWAP_LE64(state[x]);
1225 for (round = 0; round < NROUNDS; ++round) {
1229 for (x = 0; x < 5; ++x) {
1230 BC[x + 5] = BC[x] = state[x]
1231 ^ state[x + 5] ^ state[x + 10]
1232 ^ state[x + 15] ^ state[x + 20];
1234 /* Using 2x5 vector above eliminates the need to use
1235 * BC[MOD5[x+N]] trick below to fetch BC[(x+N) % 5],
1236 * and the code is a bit _smaller_.
1238 for (x = 0; x < 5; ++x) {
1239 uint64_t temp = BC[x + 4] ^ rotl64(BC[x + 1], 1);
1241 state[x + 5] ^= temp;
1242 state[x + 10] ^= temp;
1243 state[x + 15] ^= temp;
1244 state[x + 20] ^= temp;
1250 uint64_t t1 = state[1];
1251 for (x = 0; x < 24; ++x) {
1252 uint64_t t0 = state[PI_LANE[x]];
1253 state[PI_LANE[x]] = rotl64(t1, ROT_CONST[x]);
1257 /* Especially large benefit for 32-bit arch (75% faster):
1258 * 64-bit rotations by non-constant usually are SLOW on those.
1259 * We resort to unrolling here.
1260 * This optimizes out PI_LANE[] and ROT_CONST[],
1261 * but generates 300-500 more bytes of code.
1264 uint64_t t1 = state[1];
1265 #define RhoPi_twice(x) \
1266 t0 = state[PI_LANE[x ]]; \
1267 state[PI_LANE[x ]] = rotl64(t1, ROT_CONST[x ]); \
1268 t1 = state[PI_LANE[x+1]]; \
1269 state[PI_LANE[x+1]] = rotl64(t0, ROT_CONST[x+1]);
1270 RhoPi_twice(0); RhoPi_twice(2);
1271 RhoPi_twice(4); RhoPi_twice(6);
1272 RhoPi_twice(8); RhoPi_twice(10);
1273 RhoPi_twice(12); RhoPi_twice(14);
1274 RhoPi_twice(16); RhoPi_twice(18);
1275 RhoPi_twice(20); RhoPi_twice(22);
1279 # if LONG_MAX > 0x7fffffff
1280 for (x = 0; x <= 20; x += 5) {
1281 uint64_t BC0, BC1, BC2, BC3, BC4;
1285 state[x + 0] = BC0 ^ ((~BC1) & BC2);
1287 state[x + 1] = BC1 ^ ((~BC2) & BC3);
1289 state[x + 2] = BC2 ^ ((~BC3) & BC4);
1290 state[x + 3] = BC3 ^ ((~BC4) & BC0);
1291 state[x + 4] = BC4 ^ ((~BC0) & BC1);
1294 /* Reduced register pressure version
1295 * for register-starved 32-bit arches
1296 * (i386: -95 bytes, and it is _faster_)
1298 for (x = 0; x <= 40;) {
1299 uint32_t BC0, BC1, BC2, BC3, BC4;
1300 uint32_t *const s32 = (uint32_t*)state;
1307 s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1309 s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1311 s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1312 s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1313 s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1323 s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1325 s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1327 s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1328 s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1329 s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1333 # endif /* long is 32-bit */
1335 state[0] ^= IOTA_CONST[round]
1336 | (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000)
1337 | (uint64_t)((IOTA_CONST_bit63 << round) & 0x80000000) << 32;
1340 if (BB_BIG_ENDIAN) {
1341 for (x = 0; x < 25; x++) {
1342 state[x] = SWAP_LE64(state[x]);
1348 void FAST_FUNC sha3_begin(sha3_ctx_t *ctx)
1350 memset(ctx, 0, sizeof(*ctx));
1351 /* SHA3-512, user can override */
1352 ctx->input_block_bytes = (1600 - 512*2) / 8; /* 72 bytes */
1355 void FAST_FUNC sha3_hash(sha3_ctx_t *ctx, const void *buffer, size_t len)
1358 const uint8_t *data = buffer;
1359 unsigned bufpos = ctx->bytes_queued;
1362 unsigned remaining = ctx->input_block_bytes - bufpos;
1363 if (remaining > len)
1366 /* XOR data into buffer */
1367 while (remaining != 0) {
1368 uint8_t *buf = (uint8_t*)ctx->state;
1369 buf[bufpos] ^= *data++;
1373 /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
1374 bufpos -= ctx->input_block_bytes;
1377 /* Buffer is filled up, process it */
1378 sha3_process_block72(ctx->state);
1379 /*bufpos = 0; - already is */
1381 ctx->bytes_queued = bufpos + ctx->input_block_bytes;
1383 /* +50 bytes code size, but a bit faster because of long-sized XORs */
1384 const uint8_t *data = buffer;
1385 unsigned bufpos = ctx->bytes_queued;
1386 unsigned iblk_bytes = ctx->input_block_bytes;
1388 /* If already data in queue, continue queuing first */
1391 uint8_t *buf = (uint8_t*)ctx->state;
1392 buf[bufpos] ^= *data++;
1395 if (bufpos == iblk_bytes) {
1402 /* Absorb complete blocks */
1403 while (len >= iblk_bytes) {
1404 /* XOR data onto beginning of state[].
1405 * We try to be efficient - operate one word at a time, not byte.
1406 * Careful wrt unaligned access: can't just use "*(long*)data"!
1408 unsigned count = iblk_bytes / sizeof(long);
1409 long *buf = (long*)ctx->state;
1412 move_from_unaligned_long(v, (long*)data);
1414 data += sizeof(long);
1418 sha3_process_block72(ctx->state);
1421 /* Queue remaining data bytes */
1423 uint8_t *buf = (uint8_t*)ctx->state;
1424 buf[bufpos] ^= *data++;
1429 ctx->bytes_queued = bufpos;
1433 void FAST_FUNC sha3_end(sha3_ctx_t *ctx, void *resbuf)
1436 uint8_t *buf = (uint8_t*)ctx->state;
1438 * Keccak block padding is: add 1 bit after last bit of input,
1439 * then add zero bits until the end of block, and add the last 1 bit
1440 * (the last bit in the block) - the "10*1" pattern.
1441 * SHA3 standard appends additional two bits, 01, before that padding:
1443 * SHA3-224(M) = KECCAK[448](M||01, 224)
1444 * SHA3-256(M) = KECCAK[512](M||01, 256)
1445 * SHA3-384(M) = KECCAK[768](M||01, 384)
1446 * SHA3-512(M) = KECCAK[1024](M||01, 512)
1447 * (M is the input, || is bit concatenation)
1449 * The 6 below contains 01 "SHA3" bits and the first 1 "Keccak" bit:
1451 buf[ctx->bytes_queued] ^= 6; /* bit pattern 00000110 */
1452 buf[ctx->input_block_bytes - 1] ^= 0x80;
1454 sha3_process_block72(ctx->state);
1457 memcpy(resbuf, ctx->state, 64);