1 /* vi: set sw=4 ts=4: */
5 * Copyright (C) 2010 Denys Vlasenko
7 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
12 #define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
14 /* gcc 4.2.1 optimizes rotr64 better with inline than with macro
15 * (for rotX32, there is no difference). Why? My guess is that
16 * macro requires clever common subexpression elimination heuristics
17 * in gcc, while inline basically forces it to happen.
19 //#define rotl32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
20 static ALWAYS_INLINE uint32_t rotl32(uint32_t x, unsigned n)
22 return (x << n) | (x >> (32 - n));
24 //#define rotr32(x,n) (((x) >> (n)) | ((x) << (32 - (n))))
25 static ALWAYS_INLINE uint32_t rotr32(uint32_t x, unsigned n)
27 return (x >> n) | (x << (32 - n));
29 /* rotr64 in needed for sha512 only: */
30 //#define rotr64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))
31 static ALWAYS_INLINE uint64_t rotr64(uint64_t x, unsigned n)
33 return (x >> n) | (x << (64 - n));
36 /* rotl64 only used for sha3 currently */
37 static ALWAYS_INLINE uint64_t rotl64(uint64_t x, unsigned n)
39 return (x << n) | (x >> (64 - n));
42 /* Feed data through a temporary buffer.
43 * The internal buffer remembers previous data until it has 64
44 * bytes worth to pass on.
/* NOTE(review): this listing is incomplete - the byte-count update,
 * the "while (len >= remaining)" style loop structure, the tail copy
 * and the closing braces are missing. Comments only; do not edit
 * code without the complete source. */
/* Shared block-buffering helper for MD5/SHA1/SHA256 (64-byte blocks). */
46 static void FAST_FUNC common64_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
/* Offset of first free byte in wbuffer: low 6 bits of total count */
48 unsigned bufpos = ctx->total64 & 63;
/* Bytes still needed to complete the current 64-byte block */
53 unsigned remaining = 64 - bufpos;
56 /* Copy data into aligned buffer */
57 memcpy(ctx->wbuffer + bufpos, buffer, remaining);
59 buffer = (const char *)buffer + remaining;
61 /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
65 /* Buffer is filled up, process it */
/* Dispatch through the per-algorithm block function (md5/sha1/sha256) */
66 ctx->process_block(ctx);
67 /*bufpos = 0; - already is */
71 /* Process the remaining bytes in the buffer */
/* NOTE(review): listing is incomplete - the loop header, the
 * "remaining >= 8" length-fits check, the swap_needed handling of t,
 * and closing braces are missing. Comments only. */
/* Shared finalization: append 0x80 padding, zero-fill, store the
 * 64-bit bit count, and process the final one or two blocks.
 * swap_needed selects byte order of the length field
 * (MD5 stores it LE, SHA stores it BE). */
72 static void FAST_FUNC common64_end(md5_ctx_t *ctx, int swap_needed)
74 unsigned bufpos = ctx->total64 & 63;
75 /* Pad the buffer to the next 64-byte boundary with 0x80,0,0,0... */
76 ctx->wbuffer[bufpos++] = 0x80;
78 /* This loop iterates either once or twice, no more, no less */
80 unsigned remaining = 64 - bufpos;
81 memset(ctx->wbuffer + bufpos, 0, remaining);
82 /* Do we have enough space for the length count? */
84 /* Store the 64-bit counter of bits in the buffer */
/* Byte count -> bit count (RFC 1321 / FIPS 180-2 append the
 * message length in bits) */
85 uint64_t t = ctx->total64 << 3;
88 /* wbuffer is suitably aligned for this */
89 *(bb__aliased_uint64_t *) (&ctx->wbuffer[64 - 8]) = t;
91 ctx->process_block(ctx);
100 * Compute MD5 checksum of strings according to the
101 * definition of MD5 in RFC 1321 from April 1992.
103 * Written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
105 * Copyright (C) 1995-1999 Free Software Foundation, Inc.
106 * Copyright (C) 2001 Manuel Novoa III
107 * Copyright (C) 2003 Glenn L. McGrath
108 * Copyright (C) 2003 Erik Andersen
110 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
113 /* 0: fastest, 3: smallest */
114 #if CONFIG_MD5_SMALL < 0
116 #elif CONFIG_MD5_SMALL > 3
119 # define MD5_SMALL CONFIG_MD5_SMALL
122 /* These are the four functions used in the four steps of the MD5 algorithm
123 * and defined in the RFC 1321. The first function is a little bit optimized
124 * (as found in Colin Plumbs public domain implementation).
125 * #define FF(b, c, d) ((b & c) | (~b & d))
/* FF is the select function: d where b is 0, c where b is 1 -
 * rewritten with one fewer operation than the RFC form above. */
131 #define FF(b, c, d) (d ^ (b & (c ^ d)))
/* FG is FF with arguments permuted (RFC's G(b,c,d) == F(d,b,c)) */
132 #define FG(b, c, d) FF(d, b, c)
133 #define FH(b, c, d) (b ^ c ^ d)
134 #define FI(b, c, d) (c ^ (b | ~d))
136 /* Hash a single block, 64 bytes long and 4-byte aligned */
/* NOTE(review): this listing is incomplete - declarations of i, temp,
 * pp, pc, ps, the MD5_SMALL==3 switch body, the A/B/C/D rotation
 * between steps, the final "hash[i] +=" updates and all closing
 * braces are missing. Comments only; do not edit code without the
 * complete source. */
137 static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
140 /* Before we start, one word to the strange constants.
141 They are defined in RFC 1321 as
142 T[i] = (int)(2^32 * fabs(sin(i))), i=1..64
/* Per-step additive constants T[1..64] from RFC 1321, grouped by round */
144 static const uint32_t C_array[] = {
146 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
147 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
148 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
149 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
151 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
152 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
153 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
154 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
156 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
157 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
158 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x4881d05,
159 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
161 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
162 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
163 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
164 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
/* Message-word access order for each of the four rounds */
166 static const char P_array[] ALIGN1 = {
168 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /* 1 */
170 1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, /* 2 */
171 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, /* 3 */
172 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9 /* 4 */
/* wbuffer is 4-byte aligned (see function comment), so this
 * cast to a 32-bit word view is valid */
175 uint32_t *words = (void*) ctx->wbuffer;
176 uint32_t A = ctx->hash[0];
177 uint32_t B = ctx->hash[1];
178 uint32_t C = ctx->hash[2];
179 uint32_t D = ctx->hash[3];
/* --- Size-optimized variants: table-driven rounds --- */
181 #if MD5_SMALL >= 2 /* 2 or 3 */
183 static const char S_array[] ALIGN1 = {
/* Convert input words to little-endian host order up front */
196 for (i = 0; i < 16; i++)
197 words[i] = SWAP_LE32(words[i]);
/* MD5_SMALL == 3: single 64-iteration loop, round function
 * chosen per iteration (switch body missing from this listing) */
204 for (i = 0; i < 64; i++) {
218 default: /* case 3 */
221 temp += words[(int) (*pp++)] + *pc++;
222 temp = rotl32(temp, ps[i & 3]);
229 # else /* MD5_SMALL == 2 */
/* MD5_SMALL == 2: one 16-iteration loop per round */
234 for (i = 0; i < 16; i++) {
235 temp = A + FF(B, C, D) + words[(int) (*pp++)] + *pc++;
236 temp = rotl32(temp, ps[i & 3]);
244 for (i = 0; i < 16; i++) {
245 temp = A + FG(B, C, D) + words[(int) (*pp++)] + *pc++;
246 temp = rotl32(temp, ps[i & 3]);
254 for (i = 0; i < 16; i++) {
255 temp = A + FH(B, C, D) + words[(int) (*pp++)] + *pc++;
256 temp = rotl32(temp, ps[i & 3]);
264 for (i = 0; i < 16; i++) {
265 temp = A + FI(B, C, D) + words[(int) (*pp++)] + *pc++;
266 temp = rotl32(temp, ps[i & 3]);
274 /* Add checksum to the starting values */
/* --- Speed-optimized variants: macro-expanded rounds --- */
280 #else /* MD5_SMALL == 0 or 1 */
288 /* First round: using the given function, the context and a constant
289 the next context is computed. Because the algorithm's processing
290 unit is a 32-bit word and it is determined to work on words in
291 little endian byte order we perhaps have to change the byte order
292 before the computation. To reduce the work for the next steps
293 we save swapped words in WORDS array. */
295 # define OP(a, b, c, d, s, T) \
297 a += FF(b, c, d) + (*words IF_BIG_ENDIAN(= SWAP_LE32(*words))) + T; \
/* MD5_SMALL == 1: round 1 as a compact loop over the constant table */
306 for (i = 0; i < 4; i++) {
307 OP(A, B, C, D, 7, *pc++);
308 OP(D, A, B, C, 12, *pc++);
309 OP(C, D, A, B, 17, *pc++);
310 OP(B, C, D, A, 22, *pc++);
/* MD5_SMALL == 0: round 1 fully unrolled with literal constants */
313 OP(A, B, C, D, 7, 0xd76aa478);
314 OP(D, A, B, C, 12, 0xe8c7b756);
315 OP(C, D, A, B, 17, 0x242070db);
316 OP(B, C, D, A, 22, 0xc1bdceee);
317 OP(A, B, C, D, 7, 0xf57c0faf);
318 OP(D, A, B, C, 12, 0x4787c62a);
319 OP(C, D, A, B, 17, 0xa8304613);
320 OP(B, C, D, A, 22, 0xfd469501);
321 OP(A, B, C, D, 7, 0x698098d8);
322 OP(D, A, B, C, 12, 0x8b44f7af);
323 OP(C, D, A, B, 17, 0xffff5bb1);
324 OP(B, C, D, A, 22, 0x895cd7be);
325 OP(A, B, C, D, 7, 0x6b901122);
326 OP(D, A, B, C, 12, 0xfd987193);
327 OP(C, D, A, B, 17, 0xa679438e);
328 OP(B, C, D, A, 22, 0x49b40821);
332 /* For the second to fourth round we have the possibly swapped words
333 in WORDS. Redefine the macro to take an additional first
334 argument specifying the function to use. */
336 # define OP(f, a, b, c, d, k, s, T) \
338 a += f(b, c, d) + words[k] + T; \
/* Round 2 (FG), looped or unrolled as above */
346 for (i = 0; i < 4; i++) {
347 OP(FG, A, B, C, D, (int) (*pp++), 5, *pc++);
348 OP(FG, D, A, B, C, (int) (*pp++), 9, *pc++);
349 OP(FG, C, D, A, B, (int) (*pp++), 14, *pc++);
350 OP(FG, B, C, D, A, (int) (*pp++), 20, *pc++);
353 OP(FG, A, B, C, D, 1, 5, 0xf61e2562);
354 OP(FG, D, A, B, C, 6, 9, 0xc040b340);
355 OP(FG, C, D, A, B, 11, 14, 0x265e5a51);
356 OP(FG, B, C, D, A, 0, 20, 0xe9b6c7aa);
357 OP(FG, A, B, C, D, 5, 5, 0xd62f105d);
358 OP(FG, D, A, B, C, 10, 9, 0x02441453);
359 OP(FG, C, D, A, B, 15, 14, 0xd8a1e681);
360 OP(FG, B, C, D, A, 4, 20, 0xe7d3fbc8);
361 OP(FG, A, B, C, D, 9, 5, 0x21e1cde6);
362 OP(FG, D, A, B, C, 14, 9, 0xc33707d6);
363 OP(FG, C, D, A, B, 3, 14, 0xf4d50d87);
364 OP(FG, B, C, D, A, 8, 20, 0x455a14ed);
365 OP(FG, A, B, C, D, 13, 5, 0xa9e3e905);
366 OP(FG, D, A, B, C, 2, 9, 0xfcefa3f8);
367 OP(FG, C, D, A, B, 7, 14, 0x676f02d9);
368 OP(FG, B, C, D, A, 12, 20, 0x8d2a4c8a);
/* Round 3 (FH) */
373 for (i = 0; i < 4; i++) {
374 OP(FH, A, B, C, D, (int) (*pp++), 4, *pc++);
375 OP(FH, D, A, B, C, (int) (*pp++), 11, *pc++);
376 OP(FH, C, D, A, B, (int) (*pp++), 16, *pc++);
377 OP(FH, B, C, D, A, (int) (*pp++), 23, *pc++);
380 OP(FH, A, B, C, D, 5, 4, 0xfffa3942);
381 OP(FH, D, A, B, C, 8, 11, 0x8771f681);
382 OP(FH, C, D, A, B, 11, 16, 0x6d9d6122);
383 OP(FH, B, C, D, A, 14, 23, 0xfde5380c);
384 OP(FH, A, B, C, D, 1, 4, 0xa4beea44);
385 OP(FH, D, A, B, C, 4, 11, 0x4bdecfa9);
386 OP(FH, C, D, A, B, 7, 16, 0xf6bb4b60);
387 OP(FH, B, C, D, A, 10, 23, 0xbebfbc70);
388 OP(FH, A, B, C, D, 13, 4, 0x289b7ec6);
389 OP(FH, D, A, B, C, 0, 11, 0xeaa127fa);
390 OP(FH, C, D, A, B, 3, 16, 0xd4ef3085);
391 OP(FH, B, C, D, A, 6, 23, 0x04881d05);
392 OP(FH, A, B, C, D, 9, 4, 0xd9d4d039);
393 OP(FH, D, A, B, C, 12, 11, 0xe6db99e5);
394 OP(FH, C, D, A, B, 15, 16, 0x1fa27cf8);
395 OP(FH, B, C, D, A, 2, 23, 0xc4ac5665);
/* Round 4 (FI) */
400 for (i = 0; i < 4; i++) {
401 OP(FI, A, B, C, D, (int) (*pp++), 6, *pc++);
402 OP(FI, D, A, B, C, (int) (*pp++), 10, *pc++);
403 OP(FI, C, D, A, B, (int) (*pp++), 15, *pc++);
404 OP(FI, B, C, D, A, (int) (*pp++), 21, *pc++);
407 OP(FI, A, B, C, D, 0, 6, 0xf4292244);
408 OP(FI, D, A, B, C, 7, 10, 0x432aff97);
409 OP(FI, C, D, A, B, 14, 15, 0xab9423a7);
410 OP(FI, B, C, D, A, 5, 21, 0xfc93a039);
411 OP(FI, A, B, C, D, 12, 6, 0x655b59c3);
412 OP(FI, D, A, B, C, 3, 10, 0x8f0ccc92);
413 OP(FI, C, D, A, B, 10, 15, 0xffeff47d);
414 OP(FI, B, C, D, A, 1, 21, 0x85845dd1);
415 OP(FI, A, B, C, D, 8, 6, 0x6fa87e4f);
416 OP(FI, D, A, B, C, 15, 10, 0xfe2ce6e0);
417 OP(FI, C, D, A, B, 6, 15, 0xa3014314);
418 OP(FI, B, C, D, A, 13, 21, 0x4e0811a1);
419 OP(FI, A, B, C, D, 4, 6, 0xf7537e82);
420 OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
421 OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
422 OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
425 /* Add checksum to the starting values */
437 /* Initialize structure containing state of computation.
438 * (RFC 1321, 3.3: Step 3)
/* NOTE(review): listing is incomplete - braces and the
 * total64 = 0 reset are missing. Comments only. */
440 void FAST_FUNC md5_begin(md5_ctx_t *ctx)
/* RFC 1321 initial chaining values A, B, C, D */
442 ctx->hash[0] = 0x67452301;
443 ctx->hash[1] = 0xefcdab89;
444 ctx->hash[2] = 0x98badcfe;
445 ctx->hash[3] = 0x10325476;
/* Hook the MD5 compressor into the shared common64_* machinery */
447 ctx->process_block = md5_process_block64;
450 /* Used also for sha1 and sha256 */
/* Thin wrapper: all three 64-byte-block hashes share one
 * buffering routine; ctx->process_block selects the algorithm.
 * (NOTE(review): function braces missing from this listing.) */
451 void FAST_FUNC md5_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
453 common64_hash(ctx, buffer, len);
456 /* Process the remaining bytes in the buffer and put result from CTX
457 * in first 16 bytes following RESBUF. The result is always in little
458 * endian byte order, so that a byte-wise output yields to the wanted
459 * ASCII representation of the message digest.
/* Returns the digest length in bytes (16).
 * NOTE(review): braces and the BB_BIG_ENDIAN guard around the
 * swaps are missing from this listing. Comments only. */
461 unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
463 /* MD5 stores total in LE, need to swap on BE arches: */
464 common64_end(ctx, /*swap_needed:*/ BB_BIG_ENDIAN);
466 /* The MD5 result is in little endian byte order */
/* No-ops on little-endian hosts; byte-swaps on big-endian */
468 ctx->hash[0] = SWAP_LE32(ctx->hash[0]);
469 ctx->hash[1] = SWAP_LE32(ctx->hash[1]);
470 ctx->hash[2] = SWAP_LE32(ctx->hash[2]);
471 ctx->hash[3] = SWAP_LE32(ctx->hash[3]);
/* memcpy avoids imposing alignment requirements on resbuf */
474 memcpy(resbuf, ctx->hash, sizeof(ctx->hash[0]) * 4);
475 return sizeof(ctx->hash[0]) * 4;
481 * Copyright 2007 Rob Landley <rob@landley.net>
483 * Based on the public domain SHA-1 in C by Steve Reid <steve@edmweb.com>
484 * from http://www.mirrors.wiretapped.net/security/cryptography/hashes/sha1/
486 * Licensed under GPLv2, see file LICENSE in this source tree.
488 * ---------------------------------------------------------------------------
490 * SHA256 and SHA512 parts are:
491 * Released into the Public Domain by Ulrich Drepper <drepper@redhat.com>.
492 * Shrank by Denys Vlasenko.
494 * ---------------------------------------------------------------------------
496 * The best way to test random blocksizes is to go to coreutils/md5_sha1_sum.c
497 * and replace "4096" with something like "2000 + time(NULL) % 2097",
498 * then rebuild and compare "shaNNNsum bigfile" results.
/* SHA-1 compression function: hash one 64-byte block from
 * ctx->wbuffer into ctx->hash[0..4].
 * NOTE(review): listing is incomplete - W[] declaration, loader
 * loop details, the inner 20-step loop, variable rotation of
 * a..e, final hash[i] += updates and braces are missing.
 * Comments only; do not edit code without the complete source. */
501 static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
/* Round constants K for the four groups of 20 steps (FIPS 180-2) */
503 static const uint32_t rconsts[] = {
504 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
509 uint32_t a, b, c, d, e;
511 /* On-stack work buffer frees up one register in the main loop
512 * which otherwise will be needed to hold ctx pointer */
/* Load big-endian message words; W[] is mirrored at +16 so the
 * schedule can be indexed without % 16 */
513 for (i = 0; i < 16; i++)
514 W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
522 /* 4 rounds of 20 operations each */
524 for (i = 0; i < 4; i++) {
/* Round 0: Ch(b,c,d) select function */
531 work = (work & b) ^ d;
534 /* Used to do SWAP_BE32 here, but this
535 * requires ctx (see comment above) */
/* Round 2: Maj(b,c,d) majority function */
539 work = ((b | c) & d) | (b & c);
540 else /* i = 1 or 3 */
/* Message schedule expansion: W[t] = ROTL1(W[t-3]^W[t-8]^W[t-14]^W[t-16]),
 * expressed with the mirrored +16 offsets */
543 W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);
546 work += e + rotl32(a, 5) + rconsts[i];
548 /* Rotate by one for next time */
551 c = /* b = */ rotl32(b, 30);
554 cnt = (cnt + 1) & 15;
565 /* Constants for SHA512 from FIPS 180-2:4.2.3.
566 * SHA256 constants from FIPS 180-2:4.2.2
567 * are the most significant half of first 64 elements
/* When SHA512 is enabled the table is stored as full 64-bit words;
 * otherwise only the upper halves are kept as 32-bit words (the K()
 * macro below selects the representation at compile time). */
572 typedef uint64_t sha_K_int;
575 typedef uint32_t sha_K_int;
576 # define K(v) (uint32_t)(v >> 32)
578 static const sha_K_int sha_K[] = {
579 K(0x428a2f98d728ae22ULL), K(0x7137449123ef65cdULL),
580 K(0xb5c0fbcfec4d3b2fULL), K(0xe9b5dba58189dbbcULL),
581 K(0x3956c25bf348b538ULL), K(0x59f111f1b605d019ULL),
582 K(0x923f82a4af194f9bULL), K(0xab1c5ed5da6d8118ULL),
583 K(0xd807aa98a3030242ULL), K(0x12835b0145706fbeULL),
584 K(0x243185be4ee4b28cULL), K(0x550c7dc3d5ffb4e2ULL),
585 K(0x72be5d74f27b896fULL), K(0x80deb1fe3b1696b1ULL),
586 K(0x9bdc06a725c71235ULL), K(0xc19bf174cf692694ULL),
587 K(0xe49b69c19ef14ad2ULL), K(0xefbe4786384f25e3ULL),
588 K(0x0fc19dc68b8cd5b5ULL), K(0x240ca1cc77ac9c65ULL),
589 K(0x2de92c6f592b0275ULL), K(0x4a7484aa6ea6e483ULL),
590 K(0x5cb0a9dcbd41fbd4ULL), K(0x76f988da831153b5ULL),
591 K(0x983e5152ee66dfabULL), K(0xa831c66d2db43210ULL),
592 K(0xb00327c898fb213fULL), K(0xbf597fc7beef0ee4ULL),
593 K(0xc6e00bf33da88fc2ULL), K(0xd5a79147930aa725ULL),
594 K(0x06ca6351e003826fULL), K(0x142929670a0e6e70ULL),
595 K(0x27b70a8546d22ffcULL), K(0x2e1b21385c26c926ULL),
596 K(0x4d2c6dfc5ac42aedULL), K(0x53380d139d95b3dfULL),
597 K(0x650a73548baf63deULL), K(0x766a0abb3c77b2a8ULL),
598 K(0x81c2c92e47edaee6ULL), K(0x92722c851482353bULL),
599 K(0xa2bfe8a14cf10364ULL), K(0xa81a664bbc423001ULL),
600 K(0xc24b8b70d0f89791ULL), K(0xc76c51a30654be30ULL),
601 K(0xd192e819d6ef5218ULL), K(0xd69906245565a910ULL),
602 K(0xf40e35855771202aULL), K(0x106aa07032bbd1b8ULL),
603 K(0x19a4c116b8d2d0c8ULL), K(0x1e376c085141ab53ULL),
604 K(0x2748774cdf8eeb99ULL), K(0x34b0bcb5e19b48a8ULL),
605 K(0x391c0cb3c5c95a63ULL), K(0x4ed8aa4ae3418acbULL),
606 K(0x5b9cca4f7763e373ULL), K(0x682e6ff3d6b2b8a3ULL),
607 K(0x748f82ee5defb2fcULL), K(0x78a5636f43172f60ULL),
608 K(0x84c87814a1f0ab72ULL), K(0x8cc702081a6439ecULL),
609 K(0x90befffa23631e28ULL), K(0xa4506cebde82bde9ULL),
610 K(0xbef9a3f7b2c67915ULL), K(0xc67178f2e372532bULL),
611 #if NEED_SHA512 /* [64]+ are used for sha512 only */
612 K(0xca273eceea26619cULL), K(0xd186b8c721c0c207ULL),
613 K(0xeada7dd6cde0eb1eULL), K(0xf57d4f7fee6ed178ULL),
614 K(0x06f067aa72176fbaULL), K(0x0a637dc5a2c898a6ULL),
615 K(0x113f9804bef90daeULL), K(0x1b710b35131c471bULL),
616 K(0x28db77f523047d84ULL), K(0x32caab7b40c72493ULL),
617 K(0x3c9ebe0a15c9bebcULL), K(0x431d67c49c100d4cULL),
618 K(0x4cc5d4becb3e42b6ULL), K(0x597f299cfc657e2aULL),
619 K(0x5fcb6fab3ad6faecULL), K(0x6c44198c4a475817ULL),
/* SHA-256 compression function: hash one 64-byte block from
 * ctx->wbuffer into ctx->hash[0..7].
 * NOTE(review): listing is incomplete - loading of a..h from
 * ctx->hash, the per-step variable rotation, #undef lines, the
 * final hash[i] += updates and braces are missing. Comments only. */
631 static void FAST_FUNC sha256_process_block64(sha256_ctx_t *ctx)
634 uint32_t W[64], a, b, c, d, e, f, g, h;
635 const uint32_t *words = (uint32_t*) ctx->wbuffer;
637 /* Operators defined in FIPS 180-2:4.1.2. */
638 #define Ch(x, y, z) ((x & y) ^ (~x & z))
639 #define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
640 #define S0(x) (rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22))
641 #define S1(x) (rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25))
642 #define R0(x) (rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3))
643 #define R1(x) (rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10))
645 /* Compute the message schedule according to FIPS 180-2:6.2.2 step 2. */
646 for (t = 0; t < 16; ++t)
647 W[t] = SWAP_BE32(words[t]);
648 for (/*t = 16*/; t < 64; ++t)
649 W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];
660 /* The actual computation according to FIPS 180-2:6.2.2 step 3. */
661 for (t = 0; t < 64; ++t) {
662 /* Need to fetch upper half of sha_K[t]
663 * (I hope compiler is clever enough to just fetch
/* sha_K holds 64-bit entries when SHA512 is compiled in; SHA256
 * uses their upper 32 bits (see K() macro at the table) */
666 uint32_t K_t = NEED_SHA512 ? (sha_K[t] >> 32) : sha_K[t];
667 uint32_t T1 = h + S1(e) + Ch(e, f, g) + K_t + W[t];
668 uint32_t T2 = S0(a) + Maj(a, b, c);
684 /* Add the starting values of the context according to FIPS 180-2:6.2.2
/* SHA-512 compression function: hash one 128-byte block from
 * ctx->wbuffer into ctx->hash[0..7].
 * NOTE(review): listing is incomplete - W[80] declaration, the
 * per-step variable rotation, #undef lines, final hash[i] +=
 * updates and braces are missing. Comments only. */
697 static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
701 /* On i386, having assignments here (not later as sha256 does)
702 * produces 99 bytes smaller code with gcc 4.3.1
704 uint64_t a = ctx->hash[0];
705 uint64_t b = ctx->hash[1];
706 uint64_t c = ctx->hash[2];
707 uint64_t d = ctx->hash[3];
708 uint64_t e = ctx->hash[4];
709 uint64_t f = ctx->hash[5];
710 uint64_t g = ctx->hash[6];
711 uint64_t h = ctx->hash[7];
712 const uint64_t *words = (uint64_t*) ctx->wbuffer;
714 /* Operators defined in FIPS 180-2:4.1.2. */
715 #define Ch(x, y, z) ((x & y) ^ (~x & z))
716 #define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
717 #define S0(x) (rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39))
718 #define S1(x) (rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41))
719 #define R0(x) (rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7))
720 #define R1(x) (rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6))
722 /* Compute the message schedule according to FIPS 180-2:6.3.2 step 2. */
723 for (t = 0; t < 16; ++t)
724 W[t] = SWAP_BE64(words[t]);
725 for (/*t = 16*/; t < 80; ++t)
726 W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];
728 /* The actual computation according to FIPS 180-2:6.3.2 step 3. */
729 for (t = 0; t < 80; ++t) {
730 uint64_t T1 = h + S1(e) + Ch(e, f, g) + sha_K[t] + W[t];
731 uint64_t T2 = S0(a) + Maj(a, b, c);
747 /* Add the starting values of the context according to FIPS 180-2:6.3.2
758 #endif /* NEED_SHA512 */
/* Initialize SHA-1 state (FIPS 180-2:5.3.1 initial hash values).
 * NOTE(review): braces and the total64 = 0 reset are missing
 * from this listing. Comments only. */
760 void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
762 ctx->hash[0] = 0x67452301;
763 ctx->hash[1] = 0xefcdab89;
764 ctx->hash[2] = 0x98badcfe;
765 ctx->hash[3] = 0x10325476;
766 ctx->hash[4] = 0xc3d2e1f0;
/* Hook the SHA-1 compressor into the shared common64_* machinery */
768 ctx->process_block = sha1_process_block64;
/* Initial hash values for sha256_begin()/sha512_begin().
 * NOTE(review): the table contents are missing from this listing.
 * Per sha256_begin below, init256 is prefixed with zeros so that
 * memcpy also clears ctx->total64. */
771 static const uint32_t init256[] = {
/* Lower 32-bit halves of the SHA-512 initial values; upper halves
 * are taken from init256 (see sha512_begin below). */
784 static const uint32_t init512_lo[] = {
796 #endif /* NEED_SHA512 */
798 /* Initialize structure containing state of computation.
799 (FIPS 180-2:5.3.2) */
/* NOTE(review): function braces are missing from this listing. */
800 void FAST_FUNC sha256_begin(sha256_ctx_t *ctx)
/* One memcpy initializes both the byte counter and hash[0..7] */
802 memcpy(&ctx->total64, init256, sizeof(init256));
803 /*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */
804 ctx->process_block = sha256_process_block64;
808 /* Initialize structure containing state of computation.
809 (FIPS 180-2:5.3.3) */
/* NOTE(review): braces and the loop-variable declaration are
 * missing from this listing. Comments only. */
810 void FAST_FUNC sha512_begin(sha512_ctx_t *ctx)
813 /* Two extra iterations zero out ctx->total64[2] */
814 uint64_t *tp = ctx->total64;
/* Build 64-bit initial values: high half from init256[],
 * low half from init512_lo[] */
815 for (i = 0; i < 2+8; i++)
816 tp[i] = ((uint64_t)(init256[i]) << 32) + init512_lo[i];
817 /*ctx->total64[0] = ctx->total64[1] = 0; - already done */
/* SHA-512 uses 128-byte blocks and a 128-bit byte counter, so it
 * cannot share common64_hash(); this is the 128-byte analogue.
 * NOTE(review): listing is incomplete - the remaining-variable
 * declaration, len updates, early-return path, loop structure and
 * braces are missing. Comments only. */
820 void FAST_FUNC sha512_hash(sha512_ctx_t *ctx, const void *buffer, size_t len)
822 unsigned bufpos = ctx->total64[0] & 127;
825 /* First increment the byte count. FIPS 180-2 specifies the possible
826 length of the file up to 2^128 _bits_.
827 We compute the number of _bytes_ and convert to bits later. */
828 ctx->total64[0] += len;
/* Carry into the high word on wrap-around of the low word */
829 if (ctx->total64[0] < len)
832 remaining = 128 - bufpos;
834 /* Hash whole blocks */
835 while (len >= remaining) {
836 memcpy(ctx->wbuffer + bufpos, buffer, remaining);
837 buffer = (const char *)buffer + remaining;
841 sha512_process_block128(ctx);
844 /* Save last, partial block */
845 memcpy(ctx->wbuffer + bufpos, buffer, len);
848 remaining = 128 - bufpos;
851 /* Copy data into aligned buffer */
852 memcpy(ctx->wbuffer + bufpos, buffer, remaining);
854 buffer = (const char *)buffer + remaining;
856 /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
860 /* Buffer is filled up, process it */
861 sha512_process_block128(ctx);
862 /*bufpos = 0; - already is */
866 #endif /* NEED_SHA512 */
868 /* Used also for sha256 */
/* Finalize SHA-1/SHA-256 and copy the digest to resbuf; returns
 * the digest size in bytes (20 for sha1, 32 for sha256).
 * NOTE(review): braces, hash_size declaration and the return
 * statement are missing from this listing. Comments only. */
869 unsigned FAST_FUNC sha1_end(sha1_ctx_t *ctx, void *resbuf)
873 /* SHA stores total in BE, need to swap on LE arches: */
874 common64_end(ctx, /*swap_needed:*/ BB_LITTLE_ENDIAN);
/* Distinguish sha1 (5 words) from sha256 (8 words) by the
 * compressor installed in the context */
876 hash_size = (ctx->process_block == sha1_process_block64) ? 5 : 8;
877 /* This way we do not impose alignment constraints on resbuf: */
878 if (BB_LITTLE_ENDIAN) {
880 for (i = 0; i < hash_size; ++i)
881 ctx->hash[i] = SWAP_BE32(ctx->hash[i]);
/* Convert word count to byte count before the copy */
883 hash_size *= sizeof(ctx->hash[0]);
884 memcpy(resbuf, ctx->hash, hash_size);
/* Finalize SHA-512: pad, store the 128-bit bit counter, process the
 * final block(s) and copy the 64-byte digest to resbuf.
 * NOTE(review): listing is incomplete - the padding loop structure,
 * t declaration, SWAP_BE64 of t, memset of the 16-byte length area
 * and braces are missing. Comments only. */
889 unsigned FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
891 unsigned bufpos = ctx->total64[0] & 127;
893 /* Pad the buffer to the next 128-byte boundary with 0x80,0,0,0... */
894 ctx->wbuffer[bufpos++] = 0x80;
897 unsigned remaining = 128 - bufpos;
898 memset(ctx->wbuffer + bufpos, 0, remaining);
/* The length field needs the last 16 bytes of the block */
899 if (remaining >= 16) {
900 /* Store the 128-bit counter of bits in the buffer in BE format */
/* Low 64 bits of the bit count (byte count << 3) */
902 t = ctx->total64[0] << 3;
904 *(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 8]) = t;
/* High 64 bits, including the 3 bits shifted out of the low word */
905 t = (ctx->total64[1] << 3) | (ctx->total64[0] >> 61);
907 *(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 16]) = t;
909 sha512_process_block128(ctx);
915 if (BB_LITTLE_ENDIAN) {
917 for (i = 0; i < ARRAY_SIZE(ctx->hash); ++i)
918 ctx->hash[i] = SWAP_BE64(ctx->hash[i]);
920 memcpy(resbuf, ctx->hash, sizeof(ctx->hash));
921 return sizeof(ctx->hash);
923 #endif /* NEED_SHA512 */
927 * The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
928 * Michael Peeters and Gilles Van Assche. For more information, feedback or
929 * questions, please refer to our website: http://keccak.noekeon.org/
931 * Implementation by Ronny Van Keer,
932 * hereby denoted as "the implementer".
934 * To the extent possible under law, the implementer has waived all copyright
935 * and related or neighboring rights to the source code in this file.
936 * http://creativecommons.org/publicdomain/zero/1.0/
938 * Busybox modifications (C) Lauri Kasanen, under the GPLv2.
941 #if CONFIG_SHA3_SMALL < 0
942 # define SHA3_SMALL 0
943 #elif CONFIG_SHA3_SMALL > 1
944 # define SHA3_SMALL 1
946 # define SHA3_SMALL CONFIG_SHA3_SMALL
949 #define OPTIMIZE_SHA3_FOR_32 0
951 * SHA3 can be optimized for 32-bit CPUs with bit-slicing:
952 * every 64-bit word of state[] can be split into two 32-bit words
953 * by even/odd bits. In this form, all rotations of sha3 round
954 * are 32-bit - and there are lots of them.
955 * However, it requires either splitting/combining state words
956 * before/after sha3 round (code does this now)
957 * or shuffling bits before xor'ing them into state and in sha3_end.
958 * Without shuffling, bit-slicing results in -130 bytes of code
959 * and marginal speedup (but of course it gives wrong result).
960 * With shuffling it works, but +260 code bytes, and slower.
963 #if 0 /* LONG_MAX == 0x7fffffff */
964 # undef OPTIMIZE_SHA3_FOR_32
965 # define OPTIMIZE_SHA3_FOR_32 1
968 #if OPTIMIZE_SHA3_FOR_32
969 /* This splits every 64-bit word into a pair of 32-bit words,
970 * even bits go into first word, odd bits go to second one.
971 * The conversion is done in-place.
/* NOTE(review): listing is incomplete - declarations of i, t, x0, x1,
 * the loads of x0/x1 from s32, and braces are missing. Comments only. */
973 static void split_halves(uint64_t *state)
975 /* Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
976 uint32_t *s32 = (uint32_t*)state;
/* 25 state lanes -> 25 iterations, counting down from 24 */
979 for (i = 24; i >= 0; --i) {
/* Perfect-unshuffle of the low 32-bit half (bit-group swaps) */
981 t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
982 t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
983 t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
984 t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
/* Same unshuffle for the high 32-bit half */
986 t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
987 t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
988 t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
989 t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
/* Recombine: even bits of the lane, then odd bits */
990 *s32++ = (x0 & 0x0000FFFF) | (x1 << 16);
991 *s32++ = (x0 >> 16) | (x1 & 0xFFFF0000);
994 /* The reverse operation */
/* Re-interleaves each (even-bits, odd-bits) 32-bit pair back into a
 * 64-bit lane, in-place; exact inverse of split_halves().
 * NOTE(review): listing is incomplete - declarations of i, t, x0, x1,
 * the loads/stores at the loop edges, and braces are missing. */
995 static void combine_halves(uint64_t *state)
997 uint32_t *s32 = (uint32_t*)state;
1000 for (i = 24; i >= 0; --i) {
/* Undo the halfword recombination of split_halves */
1003 t = (x0 & 0x0000FFFF) | (x1 << 16);
1004 x1 = (x0 >> 16) | (x1 & 0xFFFF0000);
/* Perfect-shuffle in reverse order of the split */
1006 t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
1007 t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
1008 t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
1009 t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
1011 t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
1012 t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
1013 t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
1014 t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
1021 * In the crypto literature this function is usually called Keccak-f().
/* NOTE(review): this listing is incomplete - table contents,
 * round/x/BC declarations, Theta/Chi bodies, closing braces and
 * the #else/#endif structure are partially missing. Comments only;
 * do not edit code without the complete source. */
/* Keccak-f[1600] permutation over the 25-lane (1600-bit) state. */
1023 static void sha3_process_block72(uint64_t *state)
1025 enum { NROUNDS = 24 };
/* --- Bit-sliced 32-bit variant (see OPTIMIZE_SHA3_FOR_32 note) --- */
1027 #if OPTIMIZE_SHA3_FOR_32
1029 static const uint32_t IOTA_CONST_0[NROUNDS] = {
1055 ** bits are in lsb: 0101 0000 1111 0100 1111 0001
1057 uint32_t IOTA_CONST_0bits = (uint32_t)(0x0050f4f1);
1058 static const uint32_t IOTA_CONST_1[NROUNDS] = {
1085 uint32_t *const s32 = (uint32_t*)state;
/* Convert lanes to even/odd bit-sliced form for 32-bit rotations */
1088 split_halves(state);
1090 for (round = 0; round < NROUNDS; round++) {
/* Theta step: column parities (each lane is a 32-bit pair) */
1096 for (x = 0; x < 10; ++x) {
1097 BC[x+10] = BC[x] = s32[x]^s32[x+10]^s32[x+20]^s32[x+30]^s32[x+40];
1099 for (x = 0; x < 10; x += 2) {
1101 ta = BC[x+8] ^ rotl32(BC[x+3], 1);
1102 tb = BC[x+9] ^ BC[x+2];
/* Rho+Pi steps: rotate lanes and permute their positions.
 * In bit-sliced form a 64-bit rotate by ROT_CONST becomes two
 * 32-bit rotates by ROT_CONST/2 (odd counts also swap halves). */
1117 uint32_t t0a,t0b, t1a,t1b;
1121 #define RhoPi(PI_LANE, ROT_CONST) \
1122 t0a = s32[PI_LANE*2+0];\
1123 t0b = s32[PI_LANE*2+1];\
1124 if (ROT_CONST & 1) {\
1125 s32[PI_LANE*2+0] = rotl32(t1b, ROT_CONST/2+1);\
1126 s32[PI_LANE*2+1] = ROT_CONST == 1 ? t1a : rotl32(t1a, ROT_CONST/2+0);\
1128 s32[PI_LANE*2+0] = rotl32(t1a, ROT_CONST/2);\
1129 s32[PI_LANE*2+1] = rotl32(t1b, ROT_CONST/2);\
1131 t1a = t0a; t1b = t0b;
/* Chi step: s[x] ^= ~s[x+1] & s[x+2] along each row, on 32-bit halves */
1160 for (x = 0; x <= 40;) {
1161 uint32_t BC0, BC1, BC2, BC3, BC4;
1165 s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1167 s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1169 s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1170 s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1171 s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1176 s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1178 s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1180 s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1181 s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1182 s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
/* Iota step: inject the round constant into lane 0 */
1186 s32[0] ^= IOTA_CONST_0bits & 1;
1187 IOTA_CONST_0bits >>= 1;
1188 s32[1] ^= IOTA_CONST_1[round];
/* Convert back from bit-sliced form */
1191 combine_halves(state);
1193 /* Native 64-bit algorithm */
1194 static const uint16_t IOTA_CONST[NROUNDS] = {
1195 /* Elements should be 64-bit, but top half is always zero
1196 * or 0x80000000. We encode 63rd bits in a separate word below.
1197 * Same is true for 31th bits, which lets us use 16-bit table
1198 * instead of 64-bit. The speed penalty is lost in the noise.
1225 /* bit for CONST[0] is in msb: 0011 0011 0000 0111 1101 1101 */
1226 const uint32_t IOTA_CONST_bit63 = (uint32_t)(0x3307dd00);
1227 /* bit for CONST[0] is in msb: 0001 0110 0011 1000 0001 1011 */
1228 const uint32_t IOTA_CONST_bit31 = (uint32_t)(0x16381b00);
/* Rho rotation offsets and Pi lane permutation, in traversal order */
1230 static const uint8_t ROT_CONST[24] = {
1231 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
1232 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44,
1234 static const uint8_t PI_LANE[24] = {
1235 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
1236 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1,
1238 /*static const uint8_t MOD5[10] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/
/* Keccak state is defined little-endian; swap on BE hosts */
1243 if (BB_BIG_ENDIAN) {
1244 for (x = 0; x < 25; x++) {
1245 state[x] = SWAP_LE64(state[x]);
1249 for (round = 0; round < NROUNDS; ++round) {
/* Theta step: column parities, duplicated at +5 to avoid mod-5 */
1253 for (x = 0; x < 5; ++x) {
1254 BC[x + 5] = BC[x] = state[x]
1255 ^ state[x + 5] ^ state[x + 10]
1256 ^ state[x + 15] ^ state[x + 20];
1258 /* Using 2x5 vector above eliminates the need to use
1259 * BC[MOD5[x+N]] trick below to fetch BC[(x+N) % 5],
1260 * and the code is a bit _smaller_.
1262 for (x = 0; x < 5; ++x) {
1263 uint64_t temp = BC[x + 4] ^ rotl64(BC[x + 1], 1);
1265 state[x + 5] ^= temp;
1266 state[x + 10] ^= temp;
1267 state[x + 15] ^= temp;
1268 state[x + 20] ^= temp;
/* Rho+Pi steps, table-driven variant */
1274 uint64_t t1 = state[1];
1275 for (x = 0; x < 24; ++x) {
1276 uint64_t t0 = state[PI_LANE[x]];
1277 state[PI_LANE[x]] = rotl64(t1, ROT_CONST[x]);
1281 /* Especially large benefit for 32-bit arch (75% faster):
1282 * 64-bit rotations by non-constant usually are SLOW on those.
1283 * We resort to unrolling here.
1284 * This optimizes out PI_LANE[] and ROT_CONST[],
1285 * but generates 300-500 more bytes of code.
1288 uint64_t t1 = state[1];
1289 #define RhoPi_twice(x) \
1290 t0 = state[PI_LANE[x ]]; \
1291 state[PI_LANE[x ]] = rotl64(t1, ROT_CONST[x ]); \
1292 t1 = state[PI_LANE[x+1]]; \
1293 state[PI_LANE[x+1]] = rotl64(t0, ROT_CONST[x+1]);
1294 RhoPi_twice(0); RhoPi_twice(2);
1295 RhoPi_twice(4); RhoPi_twice(6);
1296 RhoPi_twice(8); RhoPi_twice(10);
1297 RhoPi_twice(12); RhoPi_twice(14);
1298 RhoPi_twice(16); RhoPi_twice(18);
1299 RhoPi_twice(20); RhoPi_twice(22);
/* Chi step: 64-bit version on 64-bit longs, 32-bit-halves
 * version otherwise */
1303 # if LONG_MAX > 0x7fffffff
1304 for (x = 0; x <= 20; x += 5) {
1305 uint64_t BC0, BC1, BC2, BC3, BC4;
1309 state[x + 0] = BC0 ^ ((~BC1) & BC2);
1311 state[x + 1] = BC1 ^ ((~BC2) & BC3);
1313 state[x + 2] = BC2 ^ ((~BC3) & BC4);
1314 state[x + 3] = BC3 ^ ((~BC4) & BC0);
1315 state[x + 4] = BC4 ^ ((~BC0) & BC1);
1318 /* Reduced register pressure version
1319 * for register-starved 32-bit arches
1320 * (i386: -95 bytes, and it is _faster_)
1322 for (x = 0; x <= 40;) {
1323 uint32_t BC0, BC1, BC2, BC3, BC4;
1324 uint32_t *const s32 = (uint32_t*)state;
1331 s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1333 s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1335 s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1336 s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1337 s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1347 s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1349 s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1351 s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1352 s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1353 s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1357 # endif /* long is 32-bit */
/* Iota step: reassemble the 64-bit round constant from the
 * 16-bit table plus the separately-encoded bits 31 and 63 */
1359 state[0] ^= IOTA_CONST[round]
1360 | (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000)
1361 | (uint64_t)((IOTA_CONST_bit63 << round) & 0x80000000) << 32;
1364 if (BB_BIG_ENDIAN) {
1365 for (x = 0; x < 25; x++) {
1366 state[x] = SWAP_LE64(state[x]);
/* Initialize SHA-3 context: zero the sponge state and default the
 * rate to that of SHA3-512.
 * NOTE(review): function braces are missing from this listing. */
1372 void FAST_FUNC sha3_begin(sha3_ctx_t *ctx)
1374 memset(ctx, 0, sizeof(*ctx));
1375 /* SHA3-512, user can override */
/* rate = (width - capacity) / 8; capacity = 2 * digest bits */
1376 ctx->input_block_bytes = (1600 - 512*2) / 8; /* 72 bytes */
/* Absorb len bytes into the Keccak sponge by XOR-ing them into the
 * state and permuting after each full rate-sized block. Two
 * size/speed variants selected by SHA3_SMALL (the #if/#else and
 * several loop-control lines are missing from this listing;
 * comments only - do not edit code without the complete source). */
1379 void FAST_FUNC sha3_hash(sha3_ctx_t *ctx, const void *buffer, size_t len)
/* --- Small variant: plain byte-at-a-time XOR absorb --- */
1382 const uint8_t *data = buffer;
1383 unsigned bufpos = ctx->bytes_queued;
1386 unsigned remaining = ctx->input_block_bytes - bufpos;
1387 if (remaining > len)
1390 /* XOR data into buffer */
1391 while (remaining != 0) {
1392 uint8_t *buf = (uint8_t*)ctx->state;
1393 buf[bufpos] ^= *data++;
1397 /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
1398 bufpos -= ctx->input_block_bytes;
1401 /* Buffer is filled up, process it */
1402 sha3_process_block72(ctx->state);
1403 /*bufpos = 0; - already is */
/* bufpos went negative-relative above; undo the offset when queuing */
1405 ctx->bytes_queued = bufpos + ctx->input_block_bytes;
1407 /* +50 bytes code size, but a bit faster because of long-sized XORs */
/* --- Fast variant: word-at-a-time absorb for full blocks --- */
1408 const uint8_t *data = buffer;
1409 unsigned bufpos = ctx->bytes_queued;
1410 unsigned iblk_bytes = ctx->input_block_bytes;
1412 /* If already data in queue, continue queuing first */
1415 uint8_t *buf = (uint8_t*)ctx->state;
1416 buf[bufpos] ^= *data++;
1419 if (bufpos == iblk_bytes) {
1426 /* Absorb complete blocks */
1427 while (len >= iblk_bytes) {
1428 /* XOR data onto beginning of state[].
1429 * We try to be efficient - operate one word at a time, not byte.
1430 * Careful wrt unaligned access: can't just use "*(long*)data"!
1432 unsigned count = iblk_bytes / sizeof(long);
1433 long *buf = (long*)ctx->state;
/* Unaligned-safe load of one long from the input stream */
1436 move_from_unaligned_long(v, (long*)data);
1438 data += sizeof(long);
1442 sha3_process_block72(ctx->state);
1445 /* Queue remaining data bytes */
1447 uint8_t *buf = (uint8_t*)ctx->state;
1448 buf[bufpos] ^= *data++;
1453 ctx->bytes_queued = bufpos;
/* Finalize SHA-3: apply the 0x06...0x80 domain-separation/padding,
 * permute once more, and copy out the digest.
 * NOTE(review): braces and the return statement are missing from
 * this listing; the fixed 64-byte copy suggests callers take the
 * needed digest-length prefix - confirm against full source. */
1457 unsigned FAST_FUNC sha3_end(sha3_ctx_t *ctx, void *resbuf)
1460 uint8_t *buf = (uint8_t*)ctx->state;
1462 * Keccak block padding is: add 1 bit after last bit of input,
1463 * then add zero bits until the end of block, and add the last 1 bit
1464 * (the last bit in the block) - the "10*1" pattern.
1465 * SHA3 standard appends additional two bits, 01, before that padding:
1467 * SHA3-224(M) = KECCAK[448](M||01, 224)
1468 * SHA3-256(M) = KECCAK[512](M||01, 256)
1469 * SHA3-384(M) = KECCAK[768](M||01, 384)
1470 * SHA3-512(M) = KECCAK[1024](M||01, 512)
1471 * (M is the input, || is bit concatenation)
1473 * The 6 below contains 01 "SHA3" bits and the first 1 "Keccak" bit:
1475 buf[ctx->bytes_queued] ^= 6; /* bit pattern 00000110 */
/* Final "1" bit of the 10*1 pad goes into the last byte of the rate */
1476 buf[ctx->input_block_bytes - 1] ^= 0x80;
1478 sha3_process_block72(ctx->state);
1481 memcpy(resbuf, ctx->state, 64);