/* vi: set sw=4 ts=4: */
/*
 * Utility routines.
 *
 * Copyright (C) 2010 Denys Vlasenko
 *
 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
 */
#include "libbb.h"

#define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
/* gcc 4.2.1 optimizes rotr64 better with inline than with macro
 * (for rotX32, there is no difference). Why? My guess is that
 * macro requires clever common subexpression elimination heuristics
 * in gcc, while inline basically forces it to happen.
 */
//#define rotl32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
static ALWAYS_INLINE uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}
//#define rotr32(x,n) (((x) >> (n)) | ((x) << (32 - (n))))
static ALWAYS_INLINE uint32_t rotr32(uint32_t x, unsigned n)
{
	return (x >> n) | (x << (32 - n));
}
/* rotr64 is needed for sha512 only: */
//#define rotr64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))
static ALWAYS_INLINE uint64_t rotr64(uint64_t x, unsigned n)
{
	return (x >> n) | (x << (64 - n));
}
/* rotl64 is only used by sha3 currently */
static ALWAYS_INLINE uint64_t rotl64(uint64_t x, unsigned n)
{
	return (x << n) | (x >> (64 - n));
}
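#if 0
/* Illustrative sketch, not part of the original file: a quick sanity
 * check of the rotate helpers. Note n must stay in 1..31 (1..63 for
 * the 64-bit variants): a rotate by 0 would shift by the full word
 * width, which is undefined behavior in C. */
#include <assert.h>
static void rot_selftest(void)
{
	uint32_t x = 0xdeadbeef;
	unsigned n;

	for (n = 1; n < 32; n++) {
		assert(rotr32(rotl32(x, n), n) == x);
		assert(rotl32(x, n) == rotr32(x, 32 - n));
	}
}
#endif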
/* Feed data through a temporary buffer.
 * The internal buffer remembers previous data until it has 64
 * bytes worth to pass on.
 */
static void FAST_FUNC common64_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
{
	unsigned bufpos = ctx->total64 & 63;

	ctx->total64 += len;

	while (1) {
		unsigned remaining = 64 - bufpos;
		if (remaining > len)
			remaining = len;
		/* Copy data into aligned buffer */
		memcpy(ctx->wbuffer + bufpos, buffer, remaining);
		len -= remaining;
		buffer = (const char *)buffer + remaining;
		bufpos += remaining;
		/* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
		bufpos -= 64;
		if (bufpos != 0)
			break;
		/* Buffer is filled up, process it */
		ctx->process_block(ctx);
		/*bufpos = 0; - already is */
	}
}
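#if 0
/* Illustrative sketch, not part of the original file: the
 * "bufpos -= 64" trick above folds the obvious "test for a full
 * buffer, then reset" into one subtraction. Both helpers below agree
 * for every bufpos in 1..64: */
static unsigned buffer_state_obvious(unsigned bufpos, int *full)
{
	*full = (bufpos == 64);
	return *full ? 0 : bufpos;
}
static unsigned buffer_state_trick(unsigned bufpos, int *full)
{
	bufpos -= 64; /* 64 maps to 0; 1..63 wrap around to huge values */
	*full = (bufpos == 0);
	return *full ? 0 : bufpos + 64;
}
#endif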
/* Process the remaining bytes in the buffer */
static void FAST_FUNC common64_end(md5_ctx_t *ctx, int swap_needed)
{
	unsigned bufpos = ctx->total64 & 63;
	/* Pad the buffer to the next 64-byte boundary with 0x80,0,0,0... */
	ctx->wbuffer[bufpos++] = 0x80;

	/* This loop iterates either once or twice, no more, no less */
	while (1) {
		unsigned remaining = 64 - bufpos;
		memset(ctx->wbuffer + bufpos, 0, remaining);
		/* Do we have enough space for the length count? */
		if (remaining >= 8) {
			/* Store the 64-bit counter of bits in the buffer */
			uint64_t t = ctx->total64 << 3;
			if (swap_needed)
				t = bb_bswap_64(t);
			/* wbuffer is suitably aligned for this */
			*(bb__aliased_uint64_t *) (&ctx->wbuffer[64 - 8]) = t;
		}
		ctx->process_block(ctx);
		if (remaining >= 8)
			break;
		bufpos = 0;
	}
}
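/* Added worked example: for a 3-byte message, bufpos is 3 when
 * common64_end() starts, so the single processed block is
 *   msg0 msg1 msg2 0x80 00 ... 00 <8-byte bit count = 24>
 * and the "once or twice" loop runs once. Only when the message ends
 * 56..63 bytes into a block (leaving no room for the 8-byte counter
 * after the 0x80 byte) is a second, counter-only block processed. */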
/*
 * Compute MD5 checksum of strings according to the
 * definition of MD5 in RFC 1321 from April 1992.
 *
 * Written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
 *
 * Copyright (C) 1995-1999 Free Software Foundation, Inc.
 * Copyright (C) 2001 Manuel Novoa III
 * Copyright (C) 2003 Glenn L. McGrath
 * Copyright (C) 2003 Erik Andersen
 *
 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
 */
/* 0: fastest, 3: smallest */
#if CONFIG_MD5_SMALL < 0
# define MD5_SMALL 0
#elif CONFIG_MD5_SMALL > 3
# define MD5_SMALL 3
#else
# define MD5_SMALL CONFIG_MD5_SMALL
#endif
/* These are the four functions used in the four steps of the MD5 algorithm
 * and defined in the RFC 1321. The first function is a little bit optimized
 * (as found in Colin Plumb's public domain implementation).
 * #define FF(b, c, d) ((b & c) | (~b & d))
 */
#define FF(b, c, d) (d ^ (b & (c ^ d)))
#define FG(b, c, d) FF(d, b, c)
#define FH(b, c, d) (b ^ c ^ d)
#define FI(b, c, d) (c ^ (b | ~d))
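#if 0
/* Illustrative sketch, not part of the original file: the optimized FF
 * really is the RFC 1321 selector ((b & c) | (~b & d)) - per bit,
 * "if b then c else d". Checking one bit position covers all 32 in
 * parallel: */
#include <assert.h>
static void ff_equiv_check(void)
{
	unsigned b, c, d;

	for (b = 0; b <= 1; b++)
	for (c = 0; c <= 1; c++)
	for (d = 0; d <= 1; d++)
		assert(((b & c) | (~b & d & 1)) == (d ^ (b & (c ^ d))));
}
#endif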
/* Hash a single block, 64 bytes long and 4-byte aligned */
static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
{
#if MD5_SMALL > 0
	/* Before we start, one word to the strange constants.
	   They are defined in RFC 1321 as
	   T[i] = (int)(2^32 * fabs(sin(i))), i=1..64
	 */
	static const uint32_t C_array[] = {
		/* round 1 */
		0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
		0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
		0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
		0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
		/* round 2 */
		0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
		0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
		0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
		0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
		/* round 3 */
		0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
		0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
		0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
		0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
		/* round 4 */
		0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
		0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
		0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
		0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
	};
	static const char P_array[] ALIGN1 = {
# if MD5_SMALL > 1
		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /* 1 */
# endif
		1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, /* 2 */
		5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, /* 3 */
		0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9  /* 4 */
	};
#endif
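#if 0
/* Illustrative sketch, not part of the original file: C_array is not
 * magic, it can be regenerated from the RFC 1321 formula above with a
 * small standalone program (assumes a hosted libc with libm): */
#include <math.h>
#include <stdio.h>
int main(void)
{
	int i;
	for (i = 1; i <= 64; i++)
		printf("0x%08x,\n", (unsigned)(4294967296.0 * fabs(sin(i))));
	return 0;
}
#endif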
	uint32_t *words = (void*) ctx->wbuffer;
	uint32_t A = ctx->hash[0];
	uint32_t B = ctx->hash[1];
	uint32_t C = ctx->hash[2];
	uint32_t D = ctx->hash[3];
#if MD5_SMALL >= 2  /* 2 or 3 */

	static const char S_array[] ALIGN1 = {
		7, 12, 17, 22,
		5, 9, 14, 20,
		4, 11, 16, 23,
		6, 10, 15, 21
	};
	const uint32_t *pc;
	const char *pp;
	const char *ps;
	int i;
	uint32_t temp;

	for (i = 0; i < 16; i++)
		words[i] = SWAP_LE32(words[i]);

# if MD5_SMALL == 3
	for (i = 0; i < 64; i++) {

		default: /* case 3 */

		temp += words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D; D = C; C = B; B = temp;
	}
# else /* MD5_SMALL == 2 */
	pc = C_array; pp = P_array; ps = S_array;

	for (i = 0; i < 16; i++) {
		temp = A + FF(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D; D = C; C = B; B = temp;
	}
	ps += 4;
	for (i = 0; i < 16; i++) {
		temp = A + FG(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D; D = C; C = B; B = temp;
	}
	ps += 4;
	for (i = 0; i < 16; i++) {
		temp = A + FH(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D; D = C; C = B; B = temp;
	}
	ps += 4;
	for (i = 0; i < 16; i++) {
		temp = A + FI(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D; D = C; C = B; B = temp;
	}
# endif

	/* Add checksum to the starting values */
	ctx->hash[0] += A; ctx->hash[1] += B;
	ctx->hash[2] += C; ctx->hash[3] += D;
#else /* MD5_SMALL == 0 or 1 */

# if MD5_SMALL == 1
	const uint32_t *pc;
	const char *pp;
	int i;
# endif

	/* First round: using the given function, the context and a constant
	   the next context is computed. Because the algorithm's processing
	   unit is a 32-bit word and it is determined to work on words in
	   little endian byte order we perhaps have to change the byte order
	   before the computation. To reduce the work for the next steps
	   we save swapped words in WORDS array. */

# define OP(a, b, c, d, s, T) \
	do { \
		a += FF(b, c, d) + (*words IF_BIG_ENDIAN(= SWAP_LE32(*words))) + T; \
		words++; \
		a = rotl32(a, s); \
		a += b; \
	} while (0)
# if MD5_SMALL == 1
	pc = C_array;
	for (i = 0; i < 4; i++) {
		OP(A, B, C, D, 7, *pc++);
		OP(D, A, B, C, 12, *pc++);
		OP(C, D, A, B, 17, *pc++);
		OP(B, C, D, A, 22, *pc++);
	}
# else
	OP(A, B, C, D, 7, 0xd76aa478);
	OP(D, A, B, C, 12, 0xe8c7b756);
	OP(C, D, A, B, 17, 0x242070db);
	OP(B, C, D, A, 22, 0xc1bdceee);
	OP(A, B, C, D, 7, 0xf57c0faf);
	OP(D, A, B, C, 12, 0x4787c62a);
	OP(C, D, A, B, 17, 0xa8304613);
	OP(B, C, D, A, 22, 0xfd469501);
	OP(A, B, C, D, 7, 0x698098d8);
	OP(D, A, B, C, 12, 0x8b44f7af);
	OP(C, D, A, B, 17, 0xffff5bb1);
	OP(B, C, D, A, 22, 0x895cd7be);
	OP(A, B, C, D, 7, 0x6b901122);
	OP(D, A, B, C, 12, 0xfd987193);
	OP(C, D, A, B, 17, 0xa679438e);
	OP(B, C, D, A, 22, 0x49b40821);
# endif
	/* For the second to fourth round we have the possibly swapped words
	   in WORDS. Redefine the macro to take an additional first
	   argument specifying the function to use. */
# undef OP
# define OP(f, a, b, c, d, k, s, T) \
	do { \
		a += f(b, c, d) + words[k] + T; \
		a = rotl32(a, s); \
		a += b; \
	} while (0)
# if MD5_SMALL == 1
	pc = C_array + 16;
	pp = P_array;
	for (i = 0; i < 4; i++) {
		OP(FG, A, B, C, D, (int) (*pp++), 5, *pc++);
		OP(FG, D, A, B, C, (int) (*pp++), 9, *pc++);
		OP(FG, C, D, A, B, (int) (*pp++), 14, *pc++);
		OP(FG, B, C, D, A, (int) (*pp++), 20, *pc++);
	}
# else
	OP(FG, A, B, C, D, 1, 5, 0xf61e2562);
	OP(FG, D, A, B, C, 6, 9, 0xc040b340);
	OP(FG, C, D, A, B, 11, 14, 0x265e5a51);
	OP(FG, B, C, D, A, 0, 20, 0xe9b6c7aa);
	OP(FG, A, B, C, D, 5, 5, 0xd62f105d);
	OP(FG, D, A, B, C, 10, 9, 0x02441453);
	OP(FG, C, D, A, B, 15, 14, 0xd8a1e681);
	OP(FG, B, C, D, A, 4, 20, 0xe7d3fbc8);
	OP(FG, A, B, C, D, 9, 5, 0x21e1cde6);
	OP(FG, D, A, B, C, 14, 9, 0xc33707d6);
	OP(FG, C, D, A, B, 3, 14, 0xf4d50d87);
	OP(FG, B, C, D, A, 8, 20, 0x455a14ed);
	OP(FG, A, B, C, D, 13, 5, 0xa9e3e905);
	OP(FG, D, A, B, C, 2, 9, 0xfcefa3f8);
	OP(FG, C, D, A, B, 7, 14, 0x676f02d9);
	OP(FG, B, C, D, A, 12, 20, 0x8d2a4c8a);
# endif
# if MD5_SMALL == 1
	for (i = 0; i < 4; i++) {
		OP(FH, A, B, C, D, (int) (*pp++), 4, *pc++);
		OP(FH, D, A, B, C, (int) (*pp++), 11, *pc++);
		OP(FH, C, D, A, B, (int) (*pp++), 16, *pc++);
		OP(FH, B, C, D, A, (int) (*pp++), 23, *pc++);
	}
# else
	OP(FH, A, B, C, D, 5, 4, 0xfffa3942);
	OP(FH, D, A, B, C, 8, 11, 0x8771f681);
	OP(FH, C, D, A, B, 11, 16, 0x6d9d6122);
	OP(FH, B, C, D, A, 14, 23, 0xfde5380c);
	OP(FH, A, B, C, D, 1, 4, 0xa4beea44);
	OP(FH, D, A, B, C, 4, 11, 0x4bdecfa9);
	OP(FH, C, D, A, B, 7, 16, 0xf6bb4b60);
	OP(FH, B, C, D, A, 10, 23, 0xbebfbc70);
	OP(FH, A, B, C, D, 13, 4, 0x289b7ec6);
	OP(FH, D, A, B, C, 0, 11, 0xeaa127fa);
	OP(FH, C, D, A, B, 3, 16, 0xd4ef3085);
	OP(FH, B, C, D, A, 6, 23, 0x04881d05);
	OP(FH, A, B, C, D, 9, 4, 0xd9d4d039);
	OP(FH, D, A, B, C, 12, 11, 0xe6db99e5);
	OP(FH, C, D, A, B, 15, 16, 0x1fa27cf8);
	OP(FH, B, C, D, A, 2, 23, 0xc4ac5665);
# endif
# if MD5_SMALL == 1
	for (i = 0; i < 4; i++) {
		OP(FI, A, B, C, D, (int) (*pp++), 6, *pc++);
		OP(FI, D, A, B, C, (int) (*pp++), 10, *pc++);
		OP(FI, C, D, A, B, (int) (*pp++), 15, *pc++);
		OP(FI, B, C, D, A, (int) (*pp++), 21, *pc++);
	}
# else
	OP(FI, A, B, C, D, 0, 6, 0xf4292244);
	OP(FI, D, A, B, C, 7, 10, 0x432aff97);
	OP(FI, C, D, A, B, 14, 15, 0xab9423a7);
	OP(FI, B, C, D, A, 5, 21, 0xfc93a039);
	OP(FI, A, B, C, D, 12, 6, 0x655b59c3);
	OP(FI, D, A, B, C, 3, 10, 0x8f0ccc92);
	OP(FI, C, D, A, B, 10, 15, 0xffeff47d);
	OP(FI, B, C, D, A, 1, 21, 0x85845dd1);
	OP(FI, A, B, C, D, 8, 6, 0x6fa87e4f);
	OP(FI, D, A, B, C, 15, 10, 0xfe2ce6e0);
	OP(FI, C, D, A, B, 6, 15, 0xa3014314);
	OP(FI, B, C, D, A, 13, 21, 0x4e0811a1);
	OP(FI, A, B, C, D, 4, 6, 0xf7537e82);
	OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
	OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
	OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
# endif
	/* Add checksum to the starting values */
	ctx->hash[0] += A; ctx->hash[1] += B;
	ctx->hash[2] += C; ctx->hash[3] += D;
#endif
}

/* Initialize structure containing state of computation.
 * (RFC 1321, 3.3: Step 3)
 */
void FAST_FUNC md5_begin(md5_ctx_t *ctx)
{
	ctx->hash[0] = 0x67452301;
	ctx->hash[1] = 0xefcdab89;
	ctx->hash[2] = 0x98badcfe;
	ctx->hash[3] = 0x10325476;
	ctx->total64 = 0;
	ctx->process_block = md5_process_block64;
}
/* Used also for sha1 and sha256 */
void FAST_FUNC md5_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
{
	common64_hash(ctx, buffer, len);
}
/* Process the remaining bytes in the buffer and put result from CTX
 * in first 16 bytes following RESBUF. The result is always in little
 * endian byte order, so that a byte-wise output yields the wanted
 * ASCII representation of the message digest.
 */
unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
{
	/* MD5 stores total in LE, need to swap on BE arches: */
	common64_end(ctx, /*swap_needed:*/ BB_BIG_ENDIAN);

	/* The MD5 result is in little endian byte order */
	if (BB_BIG_ENDIAN) {
		ctx->hash[0] = SWAP_LE32(ctx->hash[0]);
		ctx->hash[1] = SWAP_LE32(ctx->hash[1]);
		ctx->hash[2] = SWAP_LE32(ctx->hash[2]);
		ctx->hash[3] = SWAP_LE32(ctx->hash[3]);
	}

	memcpy(resbuf, ctx->hash, sizeof(ctx->hash[0]) * 4);
	return sizeof(ctx->hash[0]) * 4;
}
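#if 0
/* Illustrative sketch, not part of the original file: typical use of
 * the streaming md5 API above; data may arrive in any chunking. */
#include <stdio.h>
static void md5_demo(void)
{
	md5_ctx_t ctx;
	uint8_t digest[16];
	unsigned i, n;

	md5_begin(&ctx);
	md5_hash(&ctx, "hello ", 6);
	md5_hash(&ctx, "world", 5);
	n = md5_end(&ctx, digest);
	for (i = 0; i < n; i++)
		printf("%02x", digest[i]);
	printf("\n");
}
#endif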
/*
 * Copyright 2007 Rob Landley <rob@landley.net>
 *
 * Based on the public domain SHA-1 in C by Steve Reid <steve@edmweb.com>
 * from http://www.mirrors.wiretapped.net/security/cryptography/hashes/sha1/
 *
 * Licensed under GPLv2, see file LICENSE in this source tree.
 *
 * ---------------------------------------------------------------------------
 *
 * SHA256 and SHA512 parts are:
 * Released into the Public Domain by Ulrich Drepper <drepper@redhat.com>.
 * Shrank by Denys Vlasenko.
 *
 * ---------------------------------------------------------------------------
 *
 * The best way to test random blocksizes is to go to coreutils/md5_sha1_sum.c
 * and replace "4096" with something like "2000 + time(NULL) % 2097",
 * then rebuild and compare "shaNNNsum bigfile" results.
 */
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
{
	static const uint32_t rconsts[] = {
		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
	};
	int i, cnt;
	uint32_t W[16+16];
	uint32_t a, b, c, d, e;

	/* On-stack work buffer frees up one register in the main loop
	 * which otherwise will be needed to hold ctx pointer */
	for (i = 0; i < 16; i++)
		W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);

	a = ctx->hash[0]; b = ctx->hash[1]; c = ctx->hash[2];
	d = ctx->hash[3]; e = ctx->hash[4];

	/* 4 rounds of 20 operations each */
	cnt = 0;
	for (i = 0; i < 4; i++) {
		int j;
		for (j = 0; j < 20; j++) {
			uint32_t work;

			work = c ^ d;
			if (i == 0) {
				work = (work & b) ^ d;
				if (j <= 15) {
					/* Used to do SWAP_BE32 here, but this
					 * requires ctx (see comment above) */
					work += W[cnt];
				}
			} else if (i == 2)
				work = ((b | c) & d) | (b & c);
			else /* i = 1 or 3 */
				work ^= b;
			if (i != 0 || j > 15) {
				W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);
				work += W[cnt];
			}
			work += e + rotl32(a, 5) + rconsts[i];

			/* Rotate by one for next time */
			e = d; d = c;
			c = /* b = */ rotl32(b, 30);
			b = a; a = work;
			cnt = (cnt + 1) & 15;
		}
	}

	ctx->hash[0] += a; ctx->hash[1] += b; ctx->hash[2] += c;
	ctx->hash[3] += d; ctx->hash[4] += e;
}
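#if 0
/* Illustrative sketch, not part of the original file: the rolling
 * 16-word window above implements the plain FIPS 180-2 schedule
 *   W[t] = rotl1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]), 16 <= t < 80
 * which in its direct form would look like this: */
static void sha1_schedule_reference(uint32_t W[80])
{
	int t;
	for (t = 16; t < 80; t++)
		W[t] = rotl32(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
}
#endif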
/* Constants for SHA512 from FIPS 180-2:4.2.3.
 * SHA256 constants from FIPS 180-2:4.2.2
 * are the most significant half of first 64 elements
 * of these constants:
 */
#if NEED_SHA512
typedef uint64_t sha_K_int;
# define K(v) v
#else
typedef uint32_t sha_K_int;
# define K(v) (uint32_t)(v >> 32)
#endif
static const sha_K_int sha_K[] = {
	K(0x428a2f98d728ae22ULL), K(0x7137449123ef65cdULL),
	K(0xb5c0fbcfec4d3b2fULL), K(0xe9b5dba58189dbbcULL),
	K(0x3956c25bf348b538ULL), K(0x59f111f1b605d019ULL),
	K(0x923f82a4af194f9bULL), K(0xab1c5ed5da6d8118ULL),
	K(0xd807aa98a3030242ULL), K(0x12835b0145706fbeULL),
	K(0x243185be4ee4b28cULL), K(0x550c7dc3d5ffb4e2ULL),
	K(0x72be5d74f27b896fULL), K(0x80deb1fe3b1696b1ULL),
	K(0x9bdc06a725c71235ULL), K(0xc19bf174cf692694ULL),
	K(0xe49b69c19ef14ad2ULL), K(0xefbe4786384f25e3ULL),
	K(0x0fc19dc68b8cd5b5ULL), K(0x240ca1cc77ac9c65ULL),
	K(0x2de92c6f592b0275ULL), K(0x4a7484aa6ea6e483ULL),
	K(0x5cb0a9dcbd41fbd4ULL), K(0x76f988da831153b5ULL),
	K(0x983e5152ee66dfabULL), K(0xa831c66d2db43210ULL),
	K(0xb00327c898fb213fULL), K(0xbf597fc7beef0ee4ULL),
	K(0xc6e00bf33da88fc2ULL), K(0xd5a79147930aa725ULL),
	K(0x06ca6351e003826fULL), K(0x142929670a0e6e70ULL),
	K(0x27b70a8546d22ffcULL), K(0x2e1b21385c26c926ULL),
	K(0x4d2c6dfc5ac42aedULL), K(0x53380d139d95b3dfULL),
	K(0x650a73548baf63deULL), K(0x766a0abb3c77b2a8ULL),
	K(0x81c2c92e47edaee6ULL), K(0x92722c851482353bULL),
	K(0xa2bfe8a14cf10364ULL), K(0xa81a664bbc423001ULL),
	K(0xc24b8b70d0f89791ULL), K(0xc76c51a30654be30ULL),
	K(0xd192e819d6ef5218ULL), K(0xd69906245565a910ULL),
	K(0xf40e35855771202aULL), K(0x106aa07032bbd1b8ULL),
	K(0x19a4c116b8d2d0c8ULL), K(0x1e376c085141ab53ULL),
	K(0x2748774cdf8eeb99ULL), K(0x34b0bcb5e19b48a8ULL),
	K(0x391c0cb3c5c95a63ULL), K(0x4ed8aa4ae3418acbULL),
	K(0x5b9cca4f7763e373ULL), K(0x682e6ff3d6b2b8a3ULL),
	K(0x748f82ee5defb2fcULL), K(0x78a5636f43172f60ULL),
	K(0x84c87814a1f0ab72ULL), K(0x8cc702081a6439ecULL),
	K(0x90befffa23631e28ULL), K(0xa4506cebde82bde9ULL),
	K(0xbef9a3f7b2c67915ULL), K(0xc67178f2e372532bULL),
#if NEED_SHA512 /* [64]+ are used for sha512 only */
	K(0xca273eceea26619cULL), K(0xd186b8c721c0c207ULL),
	K(0xeada7dd6cde0eb1eULL), K(0xf57d4f7fee6ed178ULL),
	K(0x06f067aa72176fbaULL), K(0x0a637dc5a2c898a6ULL),
	K(0x113f9804bef90daeULL), K(0x1b710b35131c471bULL),
	K(0x28db77f523047d84ULL), K(0x32caab7b40c72493ULL),
	K(0x3c9ebe0a15c9bebcULL), K(0x431d67c49c100d4cULL),
	K(0x4cc5d4becb3e42b6ULL), K(0x597f299cfc657e2aULL),
	K(0x5fcb6fab3ad6faecULL), K(0x6c44198c4a475817ULL),
#endif
};
#undef K
static void FAST_FUNC sha256_process_block64(sha256_ctx_t *ctx)
{
	unsigned t;
	uint32_t W[64], a, b, c, d, e, f, g, h;
	const uint32_t *words = (uint32_t*) ctx->wbuffer;

	/* Operators defined in FIPS 180-2:4.1.2. */
#define Ch(x, y, z) ((x & y) ^ (~x & z))
#define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
#define S0(x) (rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22))
#define S1(x) (rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25))
#define R0(x) (rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3))
#define R1(x) (rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10))
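/* Added naming note: S0/S1 are FIPS 180-2's "big sigma" functions used in
 * every round below, while R0/R1 are the "small sigma" functions used only
 * in the message schedule; Ch is "choose" and Maj is "majority". */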
	/* Compute the message schedule according to FIPS 180-2:6.2.2 step 2. */
	for (t = 0; t < 16; ++t)
		W[t] = SWAP_BE32(words[t]);
	for (/*t = 16*/; t < 64; ++t)
		W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];

	a = ctx->hash[0]; b = ctx->hash[1]; c = ctx->hash[2]; d = ctx->hash[3];
	e = ctx->hash[4]; f = ctx->hash[5]; g = ctx->hash[6]; h = ctx->hash[7];

	/* The actual computation according to FIPS 180-2:6.2.2 step 3. */
	for (t = 0; t < 64; ++t) {
		/* Need to fetch upper half of sha_K[t]
		 * (I hope compiler is clever enough to just fetch
		 * upper half) */
		uint32_t K_t = NEED_SHA512 ? (sha_K[t] >> 32) : sha_K[t];
		uint32_t T1 = h + S1(e) + Ch(e, f, g) + K_t + W[t];
		uint32_t T2 = S0(a) + Maj(a, b, c);
		h = g; g = f; f = e; e = d + T1;
		d = c; c = b; b = a; a = T1 + T2;
	}
#undef Ch
#undef Maj
#undef S0
#undef S1
#undef R0
#undef R1

	/* Add the starting values of the context according to FIPS 180-2:6.2.2
	 * step 4. */
	ctx->hash[0] += a; ctx->hash[1] += b; ctx->hash[2] += c; ctx->hash[3] += d;
	ctx->hash[4] += e; ctx->hash[5] += f; ctx->hash[6] += g; ctx->hash[7] += h;
}
#if NEED_SHA512
static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
{
	unsigned t;
	uint64_t W[80];
	/* On i386, having assignments here (not later as sha256 does)
	 * produces 99 bytes smaller code with gcc 4.3.1
	 */
	uint64_t a = ctx->hash[0];
	uint64_t b = ctx->hash[1];
	uint64_t c = ctx->hash[2];
	uint64_t d = ctx->hash[3];
	uint64_t e = ctx->hash[4];
	uint64_t f = ctx->hash[5];
	uint64_t g = ctx->hash[6];
	uint64_t h = ctx->hash[7];
	const uint64_t *words = (uint64_t*) ctx->wbuffer;

	/* Operators defined in FIPS 180-2:4.1.2. */
#define Ch(x, y, z) ((x & y) ^ (~x & z))
#define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
#define S0(x) (rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39))
#define S1(x) (rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41))
#define R0(x) (rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7))
#define R1(x) (rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6))

	/* Compute the message schedule according to FIPS 180-2:6.3.2 step 2. */
	for (t = 0; t < 16; ++t)
		W[t] = SWAP_BE64(words[t]);
	for (/*t = 16*/; t < 80; ++t)
		W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];

	/* The actual computation according to FIPS 180-2:6.3.2 step 3. */
	for (t = 0; t < 80; ++t) {
		uint64_t T1 = h + S1(e) + Ch(e, f, g) + sha_K[t] + W[t];
		uint64_t T2 = S0(a) + Maj(a, b, c);
		h = g; g = f; f = e; e = d + T1;
		d = c; c = b; b = a; a = T1 + T2;
	}
#undef Ch
#undef Maj
#undef S0
#undef S1
#undef R0
#undef R1

	/* Add the starting values of the context according to FIPS 180-2:6.3.2
	 * step 4. */
	ctx->hash[0] += a; ctx->hash[1] += b; ctx->hash[2] += c; ctx->hash[3] += d;
	ctx->hash[4] += e; ctx->hash[5] += f; ctx->hash[6] += g; ctx->hash[7] += h;
}
#endif /* NEED_SHA512 */
void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
{
	ctx->hash[0] = 0x67452301;
	ctx->hash[1] = 0xefcdab89;
	ctx->hash[2] = 0x98badcfe;
	ctx->hash[3] = 0x10325476;
	ctx->hash[4] = 0xc3d2e1f0;
	ctx->total64 = 0;
	ctx->process_block = sha1_process_block64;
}
static const uint32_t init256[] = {
	0, 0, /* the two zeros also zero out ctx->total64 (see sha256_begin) */
	0x6a09e667, 0xbb67ae85,
	0x3c6ef372, 0xa54ff53a,
	0x510e527f, 0x9b05688c,
	0x1f83d9ab, 0x5be0cd19,
};
#if NEED_SHA512
static const uint32_t init512_lo[] = {
	0, 0,
	0xf3bcc908, 0x84caa73b,
	0xfe94f82b, 0x5f1d36f1,
	0xade682d1, 0x2b3e6c1f,
	0xfb41bd6b, 0x137e2179,
};
#endif /* NEED_SHA512 */
// Note: SHA-384 is identical to SHA-512, except that initial hash values are
// 0xcbbb9d5dc1059ed8, 0x629a292a367cd507, 0x9159015a3070dd17, 0x152fecd8f70e5939,
// 0x67332667ffc00b31, 0x8eb44a8768581511, 0xdb0c2e0d64f98fa7, 0x47b5481dbefa4fa4,
// and the output is constructed by omitting the last two 64-bit words of it.
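#if 0
/* Illustrative sketch, not part of the original file: a sha384_begin
 * along the lines of the note above might look like this (hypothetical
 * helper, not provided by busybox; the caller would use only the first
 * 48 result bytes): */
static void sha384_begin(sha512_ctx_t *ctx)
{
	static const uint64_t init384[8] = {
		0xcbbb9d5dc1059ed8ULL, 0x629a292a367cd507ULL,
		0x9159015a3070dd17ULL, 0x152fecd8f70e5939ULL,
		0x67332667ffc00b31ULL, 0x8eb44a8768581511ULL,
		0xdb0c2e0d64f98fa7ULL, 0x47b5481dbefa4fa4ULL,
	};
	memcpy(ctx->hash, init384, sizeof(init384));
	ctx->total64[0] = ctx->total64[1] = 0;
	ctx->process_block = sha512_process_block128;
}
#endif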
/* Initialize structure containing state of computation.
   (FIPS 180-2:5.3.2) */
void FAST_FUNC sha256_begin(sha256_ctx_t *ctx)
{
	memcpy(&ctx->total64, init256, sizeof(init256));
	/*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */
	ctx->process_block = sha256_process_block64;
}
#if NEED_SHA512
/* Initialize structure containing state of computation.
   (FIPS 180-2:5.3.3) */
void FAST_FUNC sha512_begin(sha512_ctx_t *ctx)
{
	unsigned i;
	/* Two extra iterations zero out ctx->total64[2] */
	uint64_t *tp = ctx->total64;
	for (i = 0; i < 2+8; i++)
		tp[i] = ((uint64_t)(init256[i]) << 32) + init512_lo[i];
	/*ctx->total64[0] = ctx->total64[1] = 0; - already done */
	ctx->process_block = sha512_process_block128;
}
void FAST_FUNC sha512_hash(sha512_ctx_t *ctx, const void *buffer, size_t len)
{
	unsigned bufpos = ctx->total64[0] & 127;
	unsigned remaining;

	/* First increment the byte count. FIPS 180-2 specifies the possible
	   length of the file up to 2^128 _bits_.
	   We compute the number of _bytes_ and convert to bits later. */
	ctx->total64[0] += len;
	if (ctx->total64[0] < len)
		ctx->total64[1]++;
#if 0
	remaining = 128 - bufpos;

	/* Hash whole blocks */
	while (len >= remaining) {
		memcpy(ctx->wbuffer + bufpos, buffer, remaining);
		buffer = (const char *)buffer + remaining;
		len -= remaining;
		remaining = 128;
		bufpos = 0;
		sha512_process_block128(ctx);
	}

	/* Save last, partial block */
	memcpy(ctx->wbuffer + bufpos, buffer, len);
#else
	while (1) {
		remaining = 128 - bufpos;
		if (remaining > len)
			remaining = len;
		/* Copy data into aligned buffer */
		memcpy(ctx->wbuffer + bufpos, buffer, remaining);
		len -= remaining;
		buffer = (const char *)buffer + remaining;
		bufpos += remaining;
		/* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
		bufpos -= 128;
		if (bufpos != 0)
			break;
		/* Buffer is filled up, process it */
		sha512_process_block128(ctx);
		/*bufpos = 0; - already is */
	}
#endif
}
#endif /* NEED_SHA512 */

/* Used also for sha256 */
unsigned FAST_FUNC sha1_end(sha1_ctx_t *ctx, void *resbuf)
{
	unsigned hash_size;

	/* SHA stores total in BE, need to swap on LE arches: */
	common64_end(ctx, /*swap_needed:*/ BB_LITTLE_ENDIAN);

	hash_size = (ctx->process_block == sha1_process_block64) ? 5 : 8;
	/* This way we do not impose alignment constraints on resbuf: */
	if (BB_LITTLE_ENDIAN) {
		unsigned i;
		for (i = 0; i < hash_size; ++i)
			ctx->hash[i] = SWAP_BE32(ctx->hash[i]);
	}
	hash_size *= sizeof(ctx->hash[0]);
	memcpy(resbuf, ctx->hash, hash_size);
	return hash_size;
}
#if NEED_SHA512
unsigned FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
{
	unsigned bufpos = ctx->total64[0] & 127;

	/* Pad the buffer to the next 128-byte boundary with 0x80,0,0,0... */
	ctx->wbuffer[bufpos++] = 0x80;

	while (1) {
		unsigned remaining = 128 - bufpos;
		memset(ctx->wbuffer + bufpos, 0, remaining);
		if (remaining >= 16) {
			/* Store the 128-bit counter of bits in the buffer in BE format */
			uint64_t t;
			t = ctx->total64[0] << 3;
			t = SWAP_BE64(t);
			*(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 8]) = t;
			t = (ctx->total64[1] << 3) | (ctx->total64[0] >> 61);
			t = SWAP_BE64(t);
			*(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 16]) = t;
		}
		sha512_process_block128(ctx);
		if (remaining >= 16)
			break;
		bufpos = 0;
	}

	if (BB_LITTLE_ENDIAN) {
		unsigned i;
		for (i = 0; i < ARRAY_SIZE(ctx->hash); ++i)
			ctx->hash[i] = SWAP_BE64(ctx->hash[i]);
	}
	memcpy(resbuf, ctx->hash, sizeof(ctx->hash));
	return sizeof(ctx->hash);
}
#endif /* NEED_SHA512 */

/*
 * The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
 * Michael Peeters and Gilles Van Assche. For more information, feedback or
 * questions, please refer to our website: http://keccak.noekeon.org/
 *
 * Implementation by Ronny Van Keer,
 * hereby denoted as "the implementer".
 *
 * To the extent possible under law, the implementer has waived all copyright
 * and related or neighboring rights to the source code in this file.
 * http://creativecommons.org/publicdomain/zero/1.0/
 *
 * Busybox modifications (C) Lauri Kasanen, under the GPLv2.
 */
#if CONFIG_SHA3_SMALL < 0
# define SHA3_SMALL 0
#elif CONFIG_SHA3_SMALL > 1
# define SHA3_SMALL 1
#else
# define SHA3_SMALL CONFIG_SHA3_SMALL
#endif

#define OPTIMIZE_SHA3_FOR_32 0
/*
 * SHA3 can be optimized for 32-bit CPUs with bit-slicing:
 * every 64-bit word of state[] can be split into two 32-bit words
 * by even/odd bits. In this form, all rotations of sha3 round
 * are 32-bit - and there are lots of them.
 * However, it requires either splitting/combining state words
 * before/after sha3 round (code does this now)
 * or shuffling bits before xor'ing them into state and in sha3_end.
 * Without shuffling, bit-slicing results in -130 bytes of code
 * and marginal speedup (but of course it gives wrong result).
 * With shuffling it works, but +260 code bytes, and slower.
 */
#if 0 /* LONG_MAX == 0x7fffffff */
# undef OPTIMIZE_SHA3_FOR_32
# define OPTIMIZE_SHA3_FOR_32 1
#endif
#if OPTIMIZE_SHA3_FOR_32
/* This splits every 64-bit word into a pair of 32-bit words,
 * even bits go into first word, odd bits go to second one.
 * The conversion is done in-place.
 */
static void split_halves(uint64_t *state)
{
	/* Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
	uint32_t *s32 = (uint32_t*)state;
	uint32_t t, x0, x1;
	int i;

	for (i = 24; i >= 0; --i) {
		x0 = s32[0];
		t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
		t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
		t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
		t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
		x1 = s32[1];
		t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
		t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
		t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
		t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
		*s32++ = (x0 & 0x0000FFFF) | (x1 << 16);
		*s32++ = (x0 >> 16) | (x1 & 0xFFFF0000);
	}
}
/* The reverse operation */
static void combine_halves(uint64_t *state)
{
	uint32_t *s32 = (uint32_t*)state;
	uint32_t t, x0, x1;
	int i;

	for (i = 24; i >= 0; --i) {
		x0 = s32[0];
		x1 = s32[1];
		t = (x0 & 0x0000FFFF) | (x1 << 16);
		x1 = (x0 >> 16) | (x1 & 0xFFFF0000);
		x0 = t;

		t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
		t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
		t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
		t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);

		t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
		t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
		t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
		t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
		*s32++ = x0;
		*s32++ = x1;
	}
}
#endif /* OPTIMIZE_SHA3_FOR_32 */
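#if 0
/* Illustrative sketch, not part of the original file: split_halves()
 * and combine_halves() are exact inverses (both exist only when
 * OPTIMIZE_SHA3_FOR_32 is enabled), which is easy to spot-check: */
#include <assert.h>
static void split_combine_roundtrip(void)
{
	uint64_t st[25], copy[25];
	unsigned i;

	for (i = 0; i < 25; i++)
		copy[i] = st[i] = 0x0123456789abcdefULL + i;
	split_halves(st);
	combine_halves(st);
	for (i = 0; i < 25; i++)
		assert(st[i] == copy[i]);
}
#endif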
/*
 * In the crypto literature this function is usually called Keccak-f().
 */
static void sha3_process_block72(uint64_t *state)
{
	enum { NROUNDS = 24 };
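	/* Added orientation note: each round below applies the five standard
	 * Keccak step mappings in order - theta (the BC[] column parities),
	 * rho and pi (the RhoPi rotations), chi (the "BC0 ^ ((~BC1) & BC2)"
	 * lines) and iota (the IOTA_CONST xor into lane [0,0]). */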
#if OPTIMIZE_SHA3_FOR_32
	/*
	static const uint32_t IOTA_CONST_0[NROUNDS] = {

	};
	** bits are in lsb: 0101 0000 1111 0100 1111 0001
	*/
	uint32_t IOTA_CONST_0bits = (uint32_t)(0x0050f4f1);
	static const uint32_t IOTA_CONST_1[NROUNDS] = {

	};
	uint32_t *const s32 = (uint32_t*)state;
	unsigned round;

	split_halves(state);
	for (round = 0; round < NROUNDS; round++) {
		unsigned x;

		/* Theta */
		{
			uint32_t BC[20];
			for (x = 0; x < 10; ++x) {
				BC[x+10] = BC[x] = s32[x]^s32[x+10]^s32[x+20]^s32[x+30]^s32[x+40];
			}
			for (x = 0; x < 10; x += 2) {
				uint32_t ta, tb;
				ta = BC[x+8] ^ rotl32(BC[x+3], 1);
				tb = BC[x+9] ^ BC[x+2];
				s32[x+0] ^= ta; s32[x+1] ^= tb;
				s32[x+10] ^= ta; s32[x+11] ^= tb;
				s32[x+20] ^= ta; s32[x+21] ^= tb;
				s32[x+30] ^= ta; s32[x+31] ^= tb;
				s32[x+40] ^= ta; s32[x+41] ^= tb;
			}
		}
		/* RhoPi */
		{
			uint32_t t0a,t0b, t1a,t1b;
			t1a = s32[1*2+0];
			t1b = s32[1*2+1];

#define RhoPi(PI_LANE, ROT_CONST) \
	t0a = s32[PI_LANE*2+0];\
	t0b = s32[PI_LANE*2+1];\
	if (ROT_CONST & 1) {\
		s32[PI_LANE*2+0] = rotl32(t1b, ROT_CONST/2+1);\
		s32[PI_LANE*2+1] = ROT_CONST == 1 ? t1a : rotl32(t1a, ROT_CONST/2+0);\
	} else {\
		s32[PI_LANE*2+0] = rotl32(t1a, ROT_CONST/2);\
		s32[PI_LANE*2+1] = rotl32(t1b, ROT_CONST/2);\
	}\
	t1a = t0a; t1b = t0b;

			RhoPi(10, 1); RhoPi( 7, 3); RhoPi(11, 6); RhoPi(17, 10);
			RhoPi(18, 15); RhoPi( 3, 21); RhoPi( 5, 28); RhoPi(16, 36);
			RhoPi( 8, 45); RhoPi(21, 55); RhoPi(24, 2); RhoPi( 4, 14);
			RhoPi(15, 27); RhoPi(23, 41); RhoPi(19, 56); RhoPi(13, 8);
			RhoPi(12, 25); RhoPi(20, 43); RhoPi( 2, 62); RhoPi(14, 18);
			RhoPi(22, 39); RhoPi( 9, 61); RhoPi( 6, 20); RhoPi( 1, 44);
#undef RhoPi
		}
		/* Chi */
		for (x = 0; x <= 40;) {
			uint32_t BC0, BC1, BC2, BC3, BC4;

			BC0 = s32[x + 0*2]; BC1 = s32[x + 1*2]; BC2 = s32[x + 2*2];
			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
			BC3 = s32[x + 3*2];
			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
			BC4 = s32[x + 4*2];
			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
			x++;
			BC0 = s32[x + 0*2]; BC1 = s32[x + 1*2]; BC2 = s32[x + 2*2];
			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
			BC3 = s32[x + 3*2];
			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
			BC4 = s32[x + 4*2];
			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
			x += 9;
		}
		/* Iota */
		s32[0] ^= IOTA_CONST_0bits & 1;
		IOTA_CONST_0bits >>= 1;
		s32[1] ^= IOTA_CONST_1[round];
	}

	combine_halves(state);
#else
	/* Native 64-bit algorithm */
	static const uint16_t IOTA_CONST[NROUNDS] = {
		/* Elements should be 64-bit, but top half is always zero
		 * or 0x80000000. We encode 63rd bits in a separate word below.
		 * Same is true for 31st bits, which lets us use 16-bit table
		 * instead of 64-bit. The speed penalty is lost in the noise.
		 */
		0x0001, 0x8082, 0x808a, 0x8000, 0x808b, 0x0001,
		0x8081, 0x8009, 0x008a, 0x0088, 0x8009, 0x000a,
		0x808b, 0x008b, 0x8089, 0x8003, 0x8002, 0x0080,
		0x800a, 0x000a, 0x8081, 0x8080, 0x0001, 0x8008,
	};
	/* bit for CONST[0] is in msb: 0011 0011 0000 0111 1101 1101 */
	const uint32_t IOTA_CONST_bit63 = (uint32_t)(0x3307dd00);
	/* bit for CONST[0] is in msb: 0001 0110 0011 1000 0001 1011 */
	const uint32_t IOTA_CONST_bit31 = (uint32_t)(0x16381b00);

	static const uint8_t ROT_CONST[24] = {
		1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
		27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44,
	};
	static const uint8_t PI_LANE[24] = {
		10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
		15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1,
	};
	/*static const uint8_t MOD5[10] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/
	unsigned x;
	unsigned round;
	if (BB_BIG_ENDIAN) {
		for (x = 0; x < 25; x++) {
			state[x] = SWAP_LE64(state[x]);
		}
	}
	for (round = 0; round < NROUNDS; ++round) {
		/* Theta */
		{
			uint64_t BC[10];
			for (x = 0; x < 5; ++x) {
				BC[x + 5] = BC[x] = state[x]
					^ state[x + 5] ^ state[x + 10]
					^ state[x + 15] ^ state[x + 20];
			}
			/* Using 2x5 vector above eliminates the need to use
			 * BC[MOD5[x+N]] trick below to fetch BC[(x+N) % 5],
			 * and the code is a bit _smaller_.
			 */
			for (x = 0; x < 5; ++x) {
				uint64_t temp = BC[x + 4] ^ rotl64(BC[x + 1], 1);
				state[x] ^= temp;
				state[x + 5] ^= temp;
				state[x + 10] ^= temp;
				state[x + 15] ^= temp;
				state[x + 20] ^= temp;
			}
		}
		/* RhoPi */
# if SHA3_SMALL
		{
			uint64_t t1 = state[1];
			for (x = 0; x < 24; ++x) {
				uint64_t t0 = state[PI_LANE[x]];
				state[PI_LANE[x]] = rotl64(t1, ROT_CONST[x]);
				t1 = t0;
			}
		}
# else
		/* Especially large benefit for 32-bit arch (75% faster):
		 * 64-bit rotations by non-constant usually are SLOW on those.
		 * We resort to unrolling here.
		 * This optimizes out PI_LANE[] and ROT_CONST[],
		 * but generates 300-500 more bytes of code.
		 */
		{
			uint64_t t0;
			uint64_t t1 = state[1];
#define RhoPi_twice(x) \
	t0 = state[PI_LANE[x  ]]; \
	state[PI_LANE[x  ]] = rotl64(t1, ROT_CONST[x  ]); \
	t1 = state[PI_LANE[x+1]]; \
	state[PI_LANE[x+1]] = rotl64(t0, ROT_CONST[x+1]);
			RhoPi_twice(0); RhoPi_twice(2);
			RhoPi_twice(4); RhoPi_twice(6);
			RhoPi_twice(8); RhoPi_twice(10);
			RhoPi_twice(12); RhoPi_twice(14);
			RhoPi_twice(16); RhoPi_twice(18);
			RhoPi_twice(20); RhoPi_twice(22);
#undef RhoPi_twice
		}
# endif
		/* Chi */
# if LONG_MAX > 0x7fffffff
		for (x = 0; x <= 20; x += 5) {
			uint64_t BC0, BC1, BC2, BC3, BC4;

			BC0 = state[x + 0]; BC1 = state[x + 1]; BC2 = state[x + 2];
			state[x + 0] = BC0 ^ ((~BC1) & BC2);
			BC3 = state[x + 3];
			state[x + 1] = BC1 ^ ((~BC2) & BC3);
			BC4 = state[x + 4];
			state[x + 2] = BC2 ^ ((~BC3) & BC4);
			state[x + 3] = BC3 ^ ((~BC4) & BC0);
			state[x + 4] = BC4 ^ ((~BC0) & BC1);
		}
# else
		/* Reduced register pressure version
		 * for register-starved 32-bit arches
		 * (i386: -95 bytes, and it is _faster_)
		 */
		for (x = 0; x <= 40;) {
			uint32_t BC0, BC1, BC2, BC3, BC4;
			uint32_t *const s32 = (uint32_t*)state;

			BC0 = s32[x + 0*2]; BC1 = s32[x + 1*2]; BC2 = s32[x + 2*2];
			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
			BC3 = s32[x + 3*2];
			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
			BC4 = s32[x + 4*2];
			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
			x++;
			BC0 = s32[x + 0*2]; BC1 = s32[x + 1*2]; BC2 = s32[x + 2*2];
			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
			BC3 = s32[x + 3*2];
			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
			BC4 = s32[x + 4*2];
			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
			x += 9;
		}
# endif /* long is 32-bit */
		/* Iota */
		state[0] ^= IOTA_CONST[round]
			| (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000)
			| (uint64_t)((IOTA_CONST_bit63 << round) & 0x80000000) << 32;
	}

	if (BB_BIG_ENDIAN) {
		for (x = 0; x < 25; x++) {
			state[x] = SWAP_LE64(state[x]);
		}
	}
#endif /* !OPTIMIZE_SHA3_FOR_32 */
}
void FAST_FUNC sha3_begin(sha3_ctx_t *ctx)
{
	memset(ctx, 0, sizeof(*ctx));
	/* SHA3-512, user can override */
	ctx->input_block_bytes = (1600 - 512*2) / 8; /* 72 bytes */
}
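#if 0
/* Illustrative sketch, not part of the original file: the sponge
 * "rate" for other SHA3 widths follows the same (1600 - 2*bits)/8
 * formula, so a hypothetical caller wanting SHA3-256 would override
 * the default set above like this: */
static void sha3_256_begin(sha3_ctx_t *ctx)
{
	sha3_begin(ctx);
	ctx->input_block_bytes = (1600 - 256*2) / 8; /* 136 bytes */
}
#endif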
void FAST_FUNC sha3_hash(sha3_ctx_t *ctx, const void *buffer, size_t len)
{
#if SHA3_SMALL
	const uint8_t *data = buffer;
	unsigned bufpos = ctx->bytes_queued;

	while (1) {
		unsigned remaining = ctx->input_block_bytes - bufpos;
		if (remaining > len)
			remaining = len;
		len -= remaining;
		/* XOR data into buffer */
		while (remaining != 0) {
			uint8_t *buf = (uint8_t*)ctx->state;
			buf[bufpos] ^= *data++;
			bufpos++;
			remaining--;
		}
		/* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
		bufpos -= ctx->input_block_bytes;
		if (bufpos != 0)
			break;
		/* Buffer is filled up, process it */
		sha3_process_block72(ctx->state);
		/*bufpos = 0; - already is */
	}
	ctx->bytes_queued = bufpos + ctx->input_block_bytes;
#else
	/* +50 bytes code size, but a bit faster because of long-sized XORs */
	const uint8_t *data = buffer;
	unsigned bufpos = ctx->bytes_queued;
	unsigned iblk_bytes = ctx->input_block_bytes;

	/* If already data in queue, continue queuing first */
	if (bufpos != 0) {
		while (len != 0) {
			uint8_t *buf = (uint8_t*)ctx->state;
			buf[bufpos] ^= *data++;
			len--;
			bufpos++;
			if (bufpos == iblk_bytes) {
				bufpos = 0;
				sha3_process_block72(ctx->state);
				break;
			}
		}
	}

	/* Absorb complete blocks */
	while (len >= iblk_bytes) {
		/* XOR data onto beginning of state[].
		 * We try to be efficient - operate one word at a time, not byte.
		 * Careful wrt unaligned access: can't just use "*(long*)data"!
		 */
		unsigned count = iblk_bytes / sizeof(long);
		long *buf = (long*)ctx->state;
		do {
			long v;
			move_from_unaligned_long(v, (long*)data);
			*buf++ ^= v;
			data += sizeof(long);
		} while (--count != 0);
		len -= iblk_bytes;
		sha3_process_block72(ctx->state);
	}

	/* Queue remaining data bytes */
	while (len != 0) {
		uint8_t *buf = (uint8_t*)ctx->state;
		buf[bufpos] ^= *data++;
		bufpos++;
		len--;
	}

	ctx->bytes_queued = bufpos;
#endif
}
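#if 0
/* Illustrative sketch, not part of the original file: a portable
 * unaligned load in the spirit of move_from_unaligned_long can be
 * written with memcpy; compilers typically turn it into a single
 * load where the architecture allows one. */
static long load_unaligned_long(const void *p)
{
	long v;
	memcpy(&v, p, sizeof(v));
	return v;
}
#endif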
unsigned FAST_FUNC sha3_end(sha3_ctx_t *ctx, void *resbuf)
{
	/* Padding */
	uint8_t *buf = (uint8_t*)ctx->state;
	/*
	 * Keccak block padding is: add 1 bit after last bit of input,
	 * then add zero bits until the end of block, and add the last 1 bit
	 * (the last bit in the block) - the "10*1" pattern.
	 * SHA3 standard appends additional two bits, 01, before that padding:
	 *
	 * SHA3-224(M) = KECCAK[448](M||01, 224)
	 * SHA3-256(M) = KECCAK[512](M||01, 256)
	 * SHA3-384(M) = KECCAK[768](M||01, 384)
	 * SHA3-512(M) = KECCAK[1024](M||01, 512)
	 * (M is the input, || is bit concatenation)
	 *
	 * The 6 below contains 01 "SHA3" bits and the first 1 "Keccak" bit:
	 */
	buf[ctx->bytes_queued] ^= 6; /* bit pattern 00000110 */
	buf[ctx->input_block_bytes - 1] ^= 0x80;

	sha3_process_block72(ctx->state);

	memcpy(resbuf, ctx->state, 64);
	return 64;
}
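#if 0
/* Illustrative sketch, not part of the original file: one-shot
 * hashing with the sha3 API above (SHA3-512, the default rate). */
static void sha3_demo(const void *msg, size_t msglen, uint8_t digest[64])
{
	sha3_ctx_t ctx;

	sha3_begin(&ctx);
	sha3_hash(&ctx, msg, msglen);
	sha3_end(&ctx, digest);
	/* digest[0..63] now holds SHA3-512(msg) */
}
#endif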