libbb/hash_md5_sha.c

   1 /* vi: set sw=4 ts=4: */
   2 /*
   3  * Utility routines.
   4  *
   5  * Copyright (C) 2010 Denys Vlasenko
   6  *
   7  * Licensed under GPLv2 or later, see file LICENSE in this source tree.
   8  */
   9 #include "libbb.h"
  10
  11 #define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
  12
  13 /* gcc 4.2.1 optimizes rotr64 better with inline than with macro
  14  * (for rotX32, there is no difference). Why? My guess is that
  15  * macro requires clever common subexpression elimination heuristics
  16  * in gcc, while inline basically forces it to happen.
  17  */
  18 //#define rotl32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
  19 static ALWAYS_INLINE uint32_t rotl32(uint32_t x, unsigned n)
  20 {
  21         return (x << n) | (x >> (32 - n));
  22 }
  23 //#define rotr32(x,n) (((x) >> (n)) | ((x) << (32 - (n))))
  24 static ALWAYS_INLINE uint32_t rotr32(uint32_t x, unsigned n)
  25 {
  26         return (x >> n) | (x << (32 - n));
  27 }
  28 /* rotr64 in needed for sha512 only: */
  29 //#define rotr64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))
  30 static ALWAYS_INLINE uint64_t rotr64(uint64_t x, unsigned n)
  31 {
  32         return (x >> n) | (x << (64 - n));
  33 }
  34
  35 /* rotl64 only used for sha3 currently */
  36 static ALWAYS_INLINE uint64_t rotl64(uint64_t x, unsigned n)
  37 {
  38         return (x << n) | (x >> (64 - n));
  39 }
  40
  41 /* Feed data through a temporary buffer.
  42  * The internal buffer remembers previous data until it has 64
  43  * bytes worth to pass on.
  44  */
  45 static void FAST_FUNC common64_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
  46 {
  47         unsigned bufpos = ctx->total64 & 63;
  48
  49         ctx->total64 += len;
  50
  51         while (1) {
  52                 unsigned remaining = 64 - bufpos;
  53                 if (remaining > len)
  54                         remaining = len;
  55                 /* Copy data into aligned buffer */
  56                 memcpy(ctx->wbuffer + bufpos, buffer, remaining);
  57                 len -= remaining;
  58                 buffer = (const char *)buffer + remaining;
  59                 bufpos += remaining;
  60                 /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
  61                 bufpos -= 64;
  62                 if (bufpos != 0)
  63                         break;
  64                 /* Buffer is filled up, process it */
  65                 ctx->process_block(ctx);
  66                 /*bufpos = 0; - already is */
  67         }
  68 }
  69
  70 /* Process the remaining bytes in the buffer */
  71 static void FAST_FUNC common64_end(md5_ctx_t *ctx, int swap_needed)
  72 {
  73         unsigned bufpos = ctx->total64 & 63;
  74         /* Pad the buffer to the next 64-byte boundary with 0x80,0,0,0... */
  75         ctx->wbuffer[bufpos++] = 0x80;
  76
  77         /* This loop iterates either once or twice, no more, no less */
  78         while (1) {
  79                 unsigned remaining = 64 - bufpos;
  80                 memset(ctx->wbuffer + bufpos, 0, remaining);
  81                 /* Do we have enough space for the length count? */
  82                 if (remaining >= 8) {
  83                         /* Store the 64-bit counter of bits in the buffer */
  84                         uint64_t t = ctx->total64 << 3;
  85                         if (swap_needed)
  86                                 t = bb_bswap_64(t);
  87                         /* wbuffer is suitably aligned for this */
  88                         *(bb__aliased_uint64_t *) (&ctx->wbuffer[64 - 8]) = t;
  89                 }
  90                 ctx->process_block(ctx);
  91                 if (remaining >= 8)
  92                         break;
  93                 bufpos = 0;
  94         }
  95 }
  96
  97
  98 /*
  99  * Compute MD5 checksum of strings according to the
 100  * definition of MD5 in RFC 1321 from April 1992.
 101  *
 102  * Written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
 103  *
 104  * Copyright (C) 1995-1999 Free Software Foundation, Inc.
 105  * Copyright (C) 2001 Manuel Novoa III
 106  * Copyright (C) 2003 Glenn L. McGrath
 107  * Copyright (C) 2003 Erik Andersen
 108  *
 109  * Licensed under GPLv2 or later, see file LICENSE in this source tree.
 110  */
 111
 112 /* 0: fastest, 3: smallest */
 113 #if CONFIG_MD5_SMALL < 0
 114 # define MD5_SMALL 0
 115 #elif CONFIG_MD5_SMALL > 3
 116 # define MD5_SMALL 3
 117 #else
 118 # define MD5_SMALL CONFIG_MD5_SMALL
 119 #endif
 120
 121 /* These are the four functions used in the four steps of the MD5 algorithm
 122  * and defined in the RFC 1321.  The first function is a little bit optimized
 123  * (as found in Colin Plumbs public domain implementation).
 124  * #define FF(b, c, d) ((b & c) | (~b & d))
 125  */
 126 #undef FF
 127 #undef FG
 128 #undef FH
 129 #undef FI
 130 #define FF(b, c, d) (d ^ (b & (c ^ d)))
 131 #define FG(b, c, d) FF(d, b, c)
 132 #define FH(b, c, d) (b ^ c ^ d)
 133 #define FI(b, c, d) (c ^ (b | ~d))
 134
 135 /* Hash a single block, 64 bytes long and 4-byte aligned */
 136 static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
 137 {
 138 #if MD5_SMALL > 0
 139         /* Before we start, one word to the strange constants.
 140            They are defined in RFC 1321 as
 141            T[i] = (int)(2^32 * fabs(sin(i))), i=1..64
 142          */
 143         static const uint32_t C_array[] = {
 144                 /* round 1 */
 145                 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
 146                 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
 147                 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
 148                 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
 149                 /* round 2 */
 150                 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
 151                 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
 152                 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
 153                 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
 154                 /* round 3 */
 155                 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
 156                 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
 157                 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x4881d05,
 158                 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
 159                 /* round 4 */
 160                 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
 161                 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
 162                 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
 163                 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
 164         };
 165         static const char P_array[] ALIGN1 = {
 166 # if MD5_SMALL > 1
 167                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /* 1 */
 168 # endif
 169                 1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, /* 2 */
 170                 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, /* 3 */
 171                 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9  /* 4 */
 172         };
 173 #endif
 174         uint32_t *words = (void*) ctx->wbuffer;
 175         uint32_t A = ctx->hash[0];
 176         uint32_t B = ctx->hash[1];
 177         uint32_t C = ctx->hash[2];
 178         uint32_t D = ctx->hash[3];
 179
 180 #if MD5_SMALL >= 2  /* 2 or 3 */
 181
 182         static const char S_array[] ALIGN1 = {
 183                 7, 12, 17, 22,
 184                 5, 9, 14, 20,
 185                 4, 11, 16, 23,
 186                 6, 10, 15, 21
 187         };
 188         const uint32_t *pc;
 189         const char *pp;
 190         const char *ps;
 191         int i;
 192         uint32_t temp;
 193
 194         if (BB_BIG_ENDIAN)
 195                 for (i = 0; i < 16; i++)
 196                         words[i] = SWAP_LE32(words[i]);
 197
 198 # if MD5_SMALL == 3
 199         pc = C_array;
 200         pp = P_array;
 201         ps = S_array - 4;
 202
 203         for (i = 0; i < 64; i++) {
 204                 if ((i & 0x0f) == 0)
 205                         ps += 4;
 206                 temp = A;
 207                 switch (i >> 4) {
 208                 case 0:
 209                         temp += FF(B, C, D);
 210                         break;
 211                 case 1:
 212                         temp += FG(B, C, D);
 213                         break;
 214                 case 2:
 215                         temp += FH(B, C, D);
 216                         break;
 217                 default: /* case 3 */
 218                         temp += FI(B, C, D);
 219                 }
 220                 temp += words[(int) (*pp++)] + *pc++;
 221                 temp = rotl32(temp, ps[i & 3]);
 222                 temp += B;
 223                 A = D;
 224                 D = C;
 225                 C = B;
 226                 B = temp;
 227         }
 228 # else  /* MD5_SMALL == 2 */
 229         pc = C_array;
 230         pp = P_array;
 231         ps = S_array;
 232
 233         for (i = 0; i < 16; i++) {
 234                 temp = A + FF(B, C, D) + words[(int) (*pp++)] + *pc++;
 235                 temp = rotl32(temp, ps[i & 3]);
 236                 temp += B;
 237                 A = D;
 238                 D = C;
 239                 C = B;
 240                 B = temp;
 241         }
 242         ps += 4;
 243         for (i = 0; i < 16; i++) {
 244                 temp = A + FG(B, C, D) + words[(int) (*pp++)] + *pc++;
 245                 temp = rotl32(temp, ps[i & 3]);
 246                 temp += B;
 247                 A = D;
 248                 D = C;
 249                 C = B;
 250                 B = temp;
 251         }
 252         ps += 4;
 253         for (i = 0; i < 16; i++) {
 254                 temp = A + FH(B, C, D) + words[(int) (*pp++)] + *pc++;
 255                 temp = rotl32(temp, ps[i & 3]);
 256                 temp += B;
 257                 A = D;
 258                 D = C;
 259                 C = B;
 260                 B = temp;
 261         }
 262         ps += 4;
 263         for (i = 0; i < 16; i++) {
 264                 temp = A + FI(B, C, D) + words[(int) (*pp++)] + *pc++;
 265                 temp = rotl32(temp, ps[i & 3]);
 266                 temp += B;
 267                 A = D;
 268                 D = C;
 269                 C = B;
 270                 B = temp;
 271         }
 272 # endif
 273         /* Add checksum to the starting values */
 274         ctx->hash[0] += A;
 275         ctx->hash[1] += B;
 276         ctx->hash[2] += C;
 277         ctx->hash[3] += D;
 278
 279 #else  /* MD5_SMALL == 0 or 1 */
 280
 281 # if MD5_SMALL == 1
 282         const uint32_t *pc;
 283         const char *pp;
 284         int i;
 285 # endif
 286
 287         /* First round: using the given function, the context and a constant
 288            the next context is computed.  Because the algorithm's processing
 289            unit is a 32-bit word and it is determined to work on words in
 290            little endian byte order we perhaps have to change the byte order
 291            before the computation.  To reduce the work for the next steps
 292            we save swapped words in WORDS array.  */
 293 # undef OP
 294 # define OP(a, b, c, d, s, T) \
 295         do { \
 296                 a += FF(b, c, d) + (*words IF_BIG_ENDIAN(= SWAP_LE32(*words))) + T; \
 297                 words++; \
 298                 a = rotl32(a, s); \
 299                 a += b; \
 300         } while (0)
 301
 302         /* Round 1 */
 303 # if MD5_SMALL == 1
 304         pc = C_array;
 305         for (i = 0; i < 4; i++) {
 306                 OP(A, B, C, D, 7, *pc++);
 307                 OP(D, A, B, C, 12, *pc++);
 308                 OP(C, D, A, B, 17, *pc++);
 309                 OP(B, C, D, A, 22, *pc++);
 310         }
 311 # else
 312         OP(A, B, C, D, 7, 0xd76aa478);
 313         OP(D, A, B, C, 12, 0xe8c7b756);
 314         OP(C, D, A, B, 17, 0x242070db);
 315         OP(B, C, D, A, 22, 0xc1bdceee);
 316         OP(A, B, C, D, 7, 0xf57c0faf);
 317         OP(D, A, B, C, 12, 0x4787c62a);
 318         OP(C, D, A, B, 17, 0xa8304613);
 319         OP(B, C, D, A, 22, 0xfd469501);
 320         OP(A, B, C, D, 7, 0x698098d8);
 321         OP(D, A, B, C, 12, 0x8b44f7af);
 322         OP(C, D, A, B, 17, 0xffff5bb1);
 323         OP(B, C, D, A, 22, 0x895cd7be);
 324         OP(A, B, C, D, 7, 0x6b901122);
 325         OP(D, A, B, C, 12, 0xfd987193);
 326         OP(C, D, A, B, 17, 0xa679438e);
 327         OP(B, C, D, A, 22, 0x49b40821);
 328 # endif
 329         words -= 16;
 330
 331         /* For the second to fourth round we have the possibly swapped words
 332            in WORDS.  Redefine the macro to take an additional first
 333            argument specifying the function to use.  */
 334 # undef OP
 335 # define OP(f, a, b, c, d, k, s, T) \
 336         do { \
 337                 a += f(b, c, d) + words[k] + T; \
 338                 a = rotl32(a, s); \
 339                 a += b; \
 340         } while (0)
 341
 342         /* Round 2 */
 343 # if MD5_SMALL == 1
 344         pp = P_array;
 345         for (i = 0; i < 4; i++) {
 346                 OP(FG, A, B, C, D, (int) (*pp++), 5, *pc++);
 347                 OP(FG, D, A, B, C, (int) (*pp++), 9, *pc++);
 348                 OP(FG, C, D, A, B, (int) (*pp++), 14, *pc++);
 349                 OP(FG, B, C, D, A, (int) (*pp++), 20, *pc++);
 350         }
 351 # else
 352         OP(FG, A, B, C, D, 1, 5, 0xf61e2562);
 353         OP(FG, D, A, B, C, 6, 9, 0xc040b340);
 354         OP(FG, C, D, A, B, 11, 14, 0x265e5a51);
 355         OP(FG, B, C, D, A, 0, 20, 0xe9b6c7aa);
 356         OP(FG, A, B, C, D, 5, 5, 0xd62f105d);
 357         OP(FG, D, A, B, C, 10, 9, 0x02441453);
 358         OP(FG, C, D, A, B, 15, 14, 0xd8a1e681);
 359         OP(FG, B, C, D, A, 4, 20, 0xe7d3fbc8);
 360         OP(FG, A, B, C, D, 9, 5, 0x21e1cde6);
 361         OP(FG, D, A, B, C, 14, 9, 0xc33707d6);
 362         OP(FG, C, D, A, B, 3, 14, 0xf4d50d87);
 363         OP(FG, B, C, D, A, 8, 20, 0x455a14ed);
 364         OP(FG, A, B, C, D, 13, 5, 0xa9e3e905);
 365         OP(FG, D, A, B, C, 2, 9, 0xfcefa3f8);
 366         OP(FG, C, D, A, B, 7, 14, 0x676f02d9);
 367         OP(FG, B, C, D, A, 12, 20, 0x8d2a4c8a);
 368 # endif
 369
 370         /* Round 3 */
 371 # if MD5_SMALL == 1
 372         for (i = 0; i < 4; i++) {
 373                 OP(FH, A, B, C, D, (int) (*pp++), 4, *pc++);
 374                 OP(FH, D, A, B, C, (int) (*pp++), 11, *pc++);
 375                 OP(FH, C, D, A, B, (int) (*pp++), 16, *pc++);
 376                 OP(FH, B, C, D, A, (int) (*pp++), 23, *pc++);
 377         }
 378 # else
 379         OP(FH, A, B, C, D, 5, 4, 0xfffa3942);
 380         OP(FH, D, A, B, C, 8, 11, 0x8771f681);
 381         OP(FH, C, D, A, B, 11, 16, 0x6d9d6122);
 382         OP(FH, B, C, D, A, 14, 23, 0xfde5380c);
 383         OP(FH, A, B, C, D, 1, 4, 0xa4beea44);
 384         OP(FH, D, A, B, C, 4, 11, 0x4bdecfa9);
 385         OP(FH, C, D, A, B, 7, 16, 0xf6bb4b60);
 386         OP(FH, B, C, D, A, 10, 23, 0xbebfbc70);
 387         OP(FH, A, B, C, D, 13, 4, 0x289b7ec6);
 388         OP(FH, D, A, B, C, 0, 11, 0xeaa127fa);
 389         OP(FH, C, D, A, B, 3, 16, 0xd4ef3085);
 390         OP(FH, B, C, D, A, 6, 23, 0x04881d05);
 391         OP(FH, A, B, C, D, 9, 4, 0xd9d4d039);
 392         OP(FH, D, A, B, C, 12, 11, 0xe6db99e5);
 393         OP(FH, C, D, A, B, 15, 16, 0x1fa27cf8);
 394         OP(FH, B, C, D, A, 2, 23, 0xc4ac5665);
 395 # endif
 396
 397         /* Round 4 */
 398 # if MD5_SMALL == 1
 399         for (i = 0; i < 4; i++) {
 400                 OP(FI, A, B, C, D, (int) (*pp++), 6, *pc++);
 401                 OP(FI, D, A, B, C, (int) (*pp++), 10, *pc++);
 402                 OP(FI, C, D, A, B, (int) (*pp++), 15, *pc++);
 403                 OP(FI, B, C, D, A, (int) (*pp++), 21, *pc++);
 404         }
 405 # else
 406         OP(FI, A, B, C, D, 0, 6, 0xf4292244);
 407         OP(FI, D, A, B, C, 7, 10, 0x432aff97);
 408         OP(FI, C, D, A, B, 14, 15, 0xab9423a7);
 409         OP(FI, B, C, D, A, 5, 21, 0xfc93a039);
 410         OP(FI, A, B, C, D, 12, 6, 0x655b59c3);
 411         OP(FI, D, A, B, C, 3, 10, 0x8f0ccc92);
 412         OP(FI, C, D, A, B, 10, 15, 0xffeff47d);
 413         OP(FI, B, C, D, A, 1, 21, 0x85845dd1);
 414         OP(FI, A, B, C, D, 8, 6, 0x6fa87e4f);
 415         OP(FI, D, A, B, C, 15, 10, 0xfe2ce6e0);
 416         OP(FI, C, D, A, B, 6, 15, 0xa3014314);
 417         OP(FI, B, C, D, A, 13, 21, 0x4e0811a1);
 418         OP(FI, A, B, C, D, 4, 6, 0xf7537e82);
 419         OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
 420         OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
 421         OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
 422 # undef OP
 423 # endif
 424         /* Add checksum to the starting values */
 425         ctx->hash[0] += A;
 426         ctx->hash[1] += B;
 427         ctx->hash[2] += C;
 428         ctx->hash[3] += D;
 429 #endif
 430 }
 431 #undef FF
 432 #undef FG
 433 #undef FH
 434 #undef FI
 435
 436 /* Initialize structure containing state of computation.
 437  * (RFC 1321, 3.3: Step 3)
 438  */
 439 void FAST_FUNC md5_begin(md5_ctx_t *ctx)
 440 {
 441         ctx->hash[0] = 0x67452301;
 442         ctx->hash[1] = 0xefcdab89;
 443         ctx->hash[2] = 0x98badcfe;
 444         ctx->hash[3] = 0x10325476;
 445         ctx->total64 = 0;
 446         ctx->process_block = md5_process_block64;
 447 }
 448
 449 /* Used also for sha1 and sha256 */
 450 void FAST_FUNC md5_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
 451 {
 452         common64_hash(ctx, buffer, len);
 453 }
 454
 455 /* Process the remaining bytes in the buffer and put result from CTX
 456  * in first 16 bytes following RESBUF.  The result is always in little
 457  * endian byte order, so that a byte-wise output yields to the wanted
 458  * ASCII representation of the message digest.
 459  */
 460 unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
 461 {
 462         /* MD5 stores total in LE, need to swap on BE arches: */
 463         common64_end(ctx, /*swap_needed:*/ BB_BIG_ENDIAN);
 464
 465         /* The MD5 result is in little endian byte order */
 466         if (BB_BIG_ENDIAN) {
 467                 ctx->hash[0] = SWAP_LE32(ctx->hash[0]);
 468                 ctx->hash[1] = SWAP_LE32(ctx->hash[1]);
 469                 ctx->hash[2] = SWAP_LE32(ctx->hash[2]);
 470                 ctx->hash[3] = SWAP_LE32(ctx->hash[3]);
 471         }
 472
 473         memcpy(resbuf, ctx->hash, sizeof(ctx->hash[0]) * 4);
 474         return sizeof(ctx->hash[0]) * 4;
 475 }
 476
 477
 478 /*
 479  * SHA1 part is:
 480  * Copyright 2007 Rob Landley <rob@landley.net>
 481  *
 482  * Based on the public domain SHA-1 in C by Steve Reid <steve@edmweb.com>
 483  * from http://www.mirrors.wiretapped.net/security/cryptography/hashes/sha1/
 484  *
 485  * Licensed under GPLv2, see file LICENSE in this source tree.
 486  *
 487  * ---------------------------------------------------------------------------
 488  *
 489  * SHA256 and SHA512 parts are:
 490  * Released into the Public Domain by Ulrich Drepper <drepper@redhat.com>.
 491  * Shrank by Denys Vlasenko.
 492  *
 493  * ---------------------------------------------------------------------------
 494  *
 495  * The best way to test random blocksizes is to go to coreutils/md5_sha1_sum.c
 496  * and replace "4096" with something like "2000 + time(NULL) % 2097",
 497  * then rebuild and compare "shaNNNsum bigfile" results.
 498  */
 499
 500 static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 501 {
 502         static const uint32_t rconsts[] = {
 503                 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
 504         };
 505         int i, j;
 506         int cnt;
 507         uint32_t W[16+16];
 508         uint32_t a, b, c, d, e;
 509
 510         /* On-stack work buffer frees up one register in the main loop
 511          * which otherwise will be needed to hold ctx pointer */
 512         for (i = 0; i < 16; i++)
 513                 W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
 514
 515         a = ctx->hash[0];
 516         b = ctx->hash[1];
 517         c = ctx->hash[2];
 518         d = ctx->hash[3];
 519         e = ctx->hash[4];
 520
 521         /* 4 rounds of 20 operations each */
 522         cnt = 0;
 523         for (i = 0; i < 4; i++) {
 524                 j = 19;
 525                 do {
 526                         uint32_t work;
 527
 528                         work = c ^ d;
 529                         if (i == 0) {
 530                                 work = (work & b) ^ d;
 531                                 if (j <= 3)
 532                                         goto ge16;
 533                                 /* Used to do SWAP_BE32 here, but this
 534                                  * requires ctx (see comment above) */
 535                                 work += W[cnt];
 536                         } else {
 537                                 if (i == 2)
 538                                         work = ((b | c) & d) | (b & c);
 539                                 else /* i = 1 or 3 */
 540                                         work ^= b;
 541  ge16:
 542                                 W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);
 543                                 work += W[cnt];
 544                         }
 545                         work += e + rotl32(a, 5) + rconsts[i];
 546
 547                         /* Rotate by one for next time */
 548                         e = d;
 549                         d = c;
 550                         c = /* b = */ rotl32(b, 30);
 551                         b = a;
 552                         a = work;
 553                         cnt = (cnt + 1) & 15;
 554                 } while (--j >= 0);
 555         }
 556
 557         ctx->hash[0] += a;
 558         ctx->hash[1] += b;
 559         ctx->hash[2] += c;
 560         ctx->hash[3] += d;
 561         ctx->hash[4] += e;
 562 }
 563
 564 /* Constants for SHA512 from FIPS 180-2:4.2.3.
 565  * SHA256 constants from FIPS 180-2:4.2.2
 566  * are the most significant half of first 64 elements
 567  * of the same array.
 568  */
 569 #undef K
 570 #if NEED_SHA512
 571 typedef uint64_t sha_K_int;
 572 # define K(v) v
 573 #else
 574 typedef uint32_t sha_K_int;
 575 # define K(v) (uint32_t)(v >> 32)
 576 #endif
 577 static const sha_K_int sha_K[] = {
 578         K(0x428a2f98d728ae22ULL), K(0x7137449123ef65cdULL),
 579         K(0xb5c0fbcfec4d3b2fULL), K(0xe9b5dba58189dbbcULL),
 580         K(0x3956c25bf348b538ULL), K(0x59f111f1b605d019ULL),
 581         K(0x923f82a4af194f9bULL), K(0xab1c5ed5da6d8118ULL),
 582         K(0xd807aa98a3030242ULL), K(0x12835b0145706fbeULL),
 583         K(0x243185be4ee4b28cULL), K(0x550c7dc3d5ffb4e2ULL),
 584         K(0x72be5d74f27b896fULL), K(0x80deb1fe3b1696b1ULL),
 585         K(0x9bdc06a725c71235ULL), K(0xc19bf174cf692694ULL),
 586         K(0xe49b69c19ef14ad2ULL), K(0xefbe4786384f25e3ULL),
 587         K(0x0fc19dc68b8cd5b5ULL), K(0x240ca1cc77ac9c65ULL),
 588         K(0x2de92c6f592b0275ULL), K(0x4a7484aa6ea6e483ULL),
 589         K(0x5cb0a9dcbd41fbd4ULL), K(0x76f988da831153b5ULL),
 590         K(0x983e5152ee66dfabULL), K(0xa831c66d2db43210ULL),
 591         K(0xb00327c898fb213fULL), K(0xbf597fc7beef0ee4ULL),
 592         K(0xc6e00bf33da88fc2ULL), K(0xd5a79147930aa725ULL),
 593         K(0x06ca6351e003826fULL), K(0x142929670a0e6e70ULL),
 594         K(0x27b70a8546d22ffcULL), K(0x2e1b21385c26c926ULL),
 595         K(0x4d2c6dfc5ac42aedULL), K(0x53380d139d95b3dfULL),
 596         K(0x650a73548baf63deULL), K(0x766a0abb3c77b2a8ULL),
 597         K(0x81c2c92e47edaee6ULL), K(0x92722c851482353bULL),
 598         K(0xa2bfe8a14cf10364ULL), K(0xa81a664bbc423001ULL),
 599         K(0xc24b8b70d0f89791ULL), K(0xc76c51a30654be30ULL),
 600         K(0xd192e819d6ef5218ULL), K(0xd69906245565a910ULL),
 601         K(0xf40e35855771202aULL), K(0x106aa07032bbd1b8ULL),
 602         K(0x19a4c116b8d2d0c8ULL), K(0x1e376c085141ab53ULL),
 603         K(0x2748774cdf8eeb99ULL), K(0x34b0bcb5e19b48a8ULL),
 604         K(0x391c0cb3c5c95a63ULL), K(0x4ed8aa4ae3418acbULL),
 605         K(0x5b9cca4f7763e373ULL), K(0x682e6ff3d6b2b8a3ULL),
 606         K(0x748f82ee5defb2fcULL), K(0x78a5636f43172f60ULL),
 607         K(0x84c87814a1f0ab72ULL), K(0x8cc702081a6439ecULL),
 608         K(0x90befffa23631e28ULL), K(0xa4506cebde82bde9ULL),
 609         K(0xbef9a3f7b2c67915ULL), K(0xc67178f2e372532bULL),
 610 #if NEED_SHA512  /* [64]+ are used for sha512 only */
 611         K(0xca273eceea26619cULL), K(0xd186b8c721c0c207ULL),
 612         K(0xeada7dd6cde0eb1eULL), K(0xf57d4f7fee6ed178ULL),
 613         K(0x06f067aa72176fbaULL), K(0x0a637dc5a2c898a6ULL),
 614         K(0x113f9804bef90daeULL), K(0x1b710b35131c471bULL),
 615         K(0x28db77f523047d84ULL), K(0x32caab7b40c72493ULL),
 616         K(0x3c9ebe0a15c9bebcULL), K(0x431d67c49c100d4cULL),
 617         K(0x4cc5d4becb3e42b6ULL), K(0x597f299cfc657e2aULL),
 618         K(0x5fcb6fab3ad6faecULL), K(0x6c44198c4a475817ULL),
 619 #endif
 620 };
 621 #undef K
 622
 623 #undef Ch
 624 #undef Maj
 625 #undef S0
 626 #undef S1
 627 #undef R0
 628 #undef R1
 629
 630 static void FAST_FUNC sha256_process_block64(sha256_ctx_t *ctx)
 631 {
 632         unsigned t;
 633         uint32_t W[64], a, b, c, d, e, f, g, h;
 634         const uint32_t *words = (uint32_t*) ctx->wbuffer;
 635
 636         /* Operators defined in FIPS 180-2:4.1.2.  */
 637 #define Ch(x, y, z) ((x & y) ^ (~x & z))
 638 #define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
 639 #define S0(x) (rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22))
 640 #define S1(x) (rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25))
 641 #define R0(x) (rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3))
 642 #define R1(x) (rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10))
 643
 644         /* Compute the message schedule according to FIPS 180-2:6.2.2 step 2.  */
 645         for (t = 0; t < 16; ++t)
 646                 W[t] = SWAP_BE32(words[t]);
 647         for (/*t = 16*/; t < 64; ++t)
 648                 W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];
 649
 650         a = ctx->hash[0];
 651         b = ctx->hash[1];
 652         c = ctx->hash[2];
 653         d = ctx->hash[3];
 654         e = ctx->hash[4];
 655         f = ctx->hash[5];
 656         g = ctx->hash[6];
 657         h = ctx->hash[7];
 658
 659         /* The actual computation according to FIPS 180-2:6.2.2 step 3.  */
 660         for (t = 0; t < 64; ++t) {
 661                 /* Need to fetch upper half of sha_K[t]
 662                  * (I hope compiler is clever enough to just fetch
 663                  * upper half)
 664                  */
 665                 uint32_t K_t = NEED_SHA512 ? (sha_K[t] >> 32) : sha_K[t];
 666                 uint32_t T1 = h + S1(e) + Ch(e, f, g) + K_t + W[t];
 667                 uint32_t T2 = S0(a) + Maj(a, b, c);
 668                 h = g;
 669                 g = f;
 670                 f = e;
 671                 e = d + T1;
 672                 d = c;
 673                 c = b;
 674                 b = a;
 675                 a = T1 + T2;
 676         }
 677 #undef Ch
 678 #undef Maj
 679 #undef S0
 680 #undef S1
 681 #undef R0
 682 #undef R1
 683         /* Add the starting values of the context according to FIPS 180-2:6.2.2
 684            step 4.  */
 685         ctx->hash[0] += a;
 686         ctx->hash[1] += b;
 687         ctx->hash[2] += c;
 688         ctx->hash[3] += d;
 689         ctx->hash[4] += e;
 690         ctx->hash[5] += f;
 691         ctx->hash[6] += g;
 692         ctx->hash[7] += h;
 693 }
 694
 695 #if NEED_SHA512
 696 static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
 697 {
 698         unsigned t;
 699         uint64_t W[80];
 700         /* On i386, having assignments here (not later as sha256 does)
 701          * produces 99 bytes smaller code with gcc 4.3.1
 702          */
 703         uint64_t a = ctx->hash[0];
 704         uint64_t b = ctx->hash[1];
 705         uint64_t c = ctx->hash[2];
 706         uint64_t d = ctx->hash[3];
 707         uint64_t e = ctx->hash[4];
 708         uint64_t f = ctx->hash[5];
 709         uint64_t g = ctx->hash[6];
 710         uint64_t h = ctx->hash[7];
 711         const uint64_t *words = (uint64_t*) ctx->wbuffer;
 712
 713         /* Operators defined in FIPS 180-2:4.1.2.  */
 714 #define Ch(x, y, z) ((x & y) ^ (~x & z))
 715 #define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
 716 #define S0(x) (rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39))
 717 #define S1(x) (rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41))
 718 #define R0(x) (rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7))
 719 #define R1(x) (rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6))
 720
 721         /* Compute the message schedule according to FIPS 180-2:6.3.2 step 2.  */
 722         for (t = 0; t < 16; ++t)
 723                 W[t] = SWAP_BE64(words[t]);
 724         for (/*t = 16*/; t < 80; ++t)
 725                 W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];
 726
 727         /* The actual computation according to FIPS 180-2:6.3.2 step 3.  */
 728         for (t = 0; t < 80; ++t) {
 729                 uint64_t T1 = h + S1(e) + Ch(e, f, g) + sha_K[t] + W[t];
 730                 uint64_t T2 = S0(a) + Maj(a, b, c);
 731                 h = g;
 732                 g = f;
 733                 f = e;
 734                 e = d + T1;
 735                 d = c;
 736                 c = b;
 737                 b = a;
 738                 a = T1 + T2;
 739         }
 740 #undef Ch
 741 #undef Maj
 742 #undef S0
 743 #undef S1
 744 #undef R0
 745 #undef R1
 746         /* Add the starting values of the context according to FIPS 180-2:6.3.2
 747            step 4.  */
 748         ctx->hash[0] += a;
 749         ctx->hash[1] += b;
 750         ctx->hash[2] += c;
 751         ctx->hash[3] += d;
 752         ctx->hash[4] += e;
 753         ctx->hash[5] += f;
 754         ctx->hash[6] += g;
 755         ctx->hash[7] += h;
 756 }
 757 #endif /* NEED_SHA512 */
 758
 759 void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
 760 {
 761         ctx->hash[0] = 0x67452301;
 762         ctx->hash[1] = 0xefcdab89;
 763         ctx->hash[2] = 0x98badcfe;
 764         ctx->hash[3] = 0x10325476;
 765         ctx->hash[4] = 0xc3d2e1f0;
 766         ctx->total64 = 0;
 767         ctx->process_block = sha1_process_block64;
 768 }
 769
 770 static const uint32_t init256[] = {
 771         0,
 772         0,
 773         0x6a09e667,
 774         0xbb67ae85,
 775         0x3c6ef372,
 776         0xa54ff53a,
 777         0x510e527f,
 778         0x9b05688c,
 779         0x1f83d9ab,
 780         0x5be0cd19,
 781 };
 782 #if NEED_SHA512
 783 static const uint32_t init512_lo[] = {
 784         0,
 785         0,
 786         0xf3bcc908,
 787         0x84caa73b,
 788         0xfe94f82b,
 789         0x5f1d36f1,
 790         0xade682d1,
 791         0x2b3e6c1f,
 792         0xfb41bd6b,
 793         0x137e2179,
 794 };
 795 #endif /* NEED_SHA512 */
 796
 797 // Note: SHA-384 is identical to SHA-512, except that initial hash values are
 798 // 0xcbbb9d5dc1059ed8, 0x629a292a367cd507, 0x9159015a3070dd17, 0x152fecd8f70e5939,
 799 // 0x67332667ffc00b31, 0x8eb44a8768581511, 0xdb0c2e0d64f98fa7, 0x47b5481dbefa4fa4,
 800 // and the output is constructed by omitting last two 64-bit words of it.
 801
 802 /* Initialize structure containing state of computation.
 803    (FIPS 180-2:5.3.2)  */
 804 void FAST_FUNC sha256_begin(sha256_ctx_t *ctx)
 805 {
 806         memcpy(&ctx->total64, init256, sizeof(init256));
 807         /*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */
 808         ctx->process_block = sha256_process_block64;
 809 }
 810
 811 #if NEED_SHA512
 812 /* Initialize structure containing state of computation.
 813    (FIPS 180-2:5.3.3)  */
 814 void FAST_FUNC sha512_begin(sha512_ctx_t *ctx)
 815 {
 816         int i;
 817         /* Two extra iterations zero out ctx->total64[2] */
 818         uint64_t *tp = ctx->total64;
 819         for (i = 0; i < 2+8; i++)
 820                 tp[i] = ((uint64_t)(init256[i]) << 32) + init512_lo[i];
 821         /*ctx->total64[0] = ctx->total64[1] = 0; - already done */
 822 }
 823
 824 void FAST_FUNC sha512_hash(sha512_ctx_t *ctx, const void *buffer, size_t len)
 825 {
 826         unsigned bufpos = ctx->total64[0] & 127;
 827         unsigned remaining;
 828
 829         /* First increment the byte count.  FIPS 180-2 specifies the possible
 830            length of the file up to 2^128 _bits_.
 831            We compute the number of _bytes_ and convert to bits later.  */
 832         ctx->total64[0] += len;
 833         if (ctx->total64[0] < len)
 834                 ctx->total64[1]++;
 835 # if 0
 836         remaining = 128 - bufpos;
 837
 838         /* Hash whole blocks */
 839         while (len >= remaining) {
 840                 memcpy(ctx->wbuffer + bufpos, buffer, remaining);
 841                 buffer = (const char *)buffer + remaining;
 842                 len -= remaining;
 843                 remaining = 128;
 844                 bufpos = 0;
 845                 sha512_process_block128(ctx);
 846         }
 847
 848         /* Save last, partial blosk */
 849         memcpy(ctx->wbuffer + bufpos, buffer, len);
 850 # else
 851         while (1) {
 852                 remaining = 128 - bufpos;
 853                 if (remaining > len)
 854                         remaining = len;
 855                 /* Copy data into aligned buffer */
 856                 memcpy(ctx->wbuffer + bufpos, buffer, remaining);
 857                 len -= remaining;
 858                 buffer = (const char *)buffer + remaining;
 859                 bufpos += remaining;
 860                 /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
 861                 bufpos -= 128;
 862                 if (bufpos != 0)
 863                         break;
 864                 /* Buffer is filled up, process it */
 865                 sha512_process_block128(ctx);
 866                 /*bufpos = 0; - already is */
 867         }
 868 # endif
 869 }
 870 #endif /* NEED_SHA512 */
 871
 872 /* Used also for sha256 */
 873 unsigned FAST_FUNC sha1_end(sha1_ctx_t *ctx, void *resbuf)
 874 {
 875         unsigned hash_size;
 876
 877         /* SHA stores total in BE, need to swap on LE arches: */
 878         common64_end(ctx, /*swap_needed:*/ BB_LITTLE_ENDIAN);
 879
 880         hash_size = (ctx->process_block == sha1_process_block64) ? 5 : 8;
 881         /* This way we do not impose alignment constraints on resbuf: */
 882         if (BB_LITTLE_ENDIAN) {
 883                 unsigned i;
 884                 for (i = 0; i < hash_size; ++i)
 885                         ctx->hash[i] = SWAP_BE32(ctx->hash[i]);
 886         }
 887         hash_size *= sizeof(ctx->hash[0]);
 888         memcpy(resbuf, ctx->hash, hash_size);
 889         return hash_size;
 890 }
 891
 892 #if NEED_SHA512
 893 unsigned FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
 894 {
 895         unsigned bufpos = ctx->total64[0] & 127;
 896
 897         /* Pad the buffer to the next 128-byte boundary with 0x80,0,0,0... */
 898         ctx->wbuffer[bufpos++] = 0x80;
 899
 900         while (1) {
 901                 unsigned remaining = 128 - bufpos;
 902                 memset(ctx->wbuffer + bufpos, 0, remaining);
 903                 if (remaining >= 16) {
 904                         /* Store the 128-bit counter of bits in the buffer in BE format */
 905                         uint64_t t;
 906                         t = ctx->total64[0] << 3;
 907                         t = SWAP_BE64(t);
 908                         *(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 8]) = t;
 909                         t = (ctx->total64[1] << 3) | (ctx->total64[0] >> 61);
 910                         t = SWAP_BE64(t);
 911                         *(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 16]) = t;
 912                 }
 913                 sha512_process_block128(ctx);
 914                 if (remaining >= 16)
 915                         break;
 916                 bufpos = 0;
 917         }
 918
 919         if (BB_LITTLE_ENDIAN) {
 920                 unsigned i;
 921                 for (i = 0; i < ARRAY_SIZE(ctx->hash); ++i)
 922                         ctx->hash[i] = SWAP_BE64(ctx->hash[i]);
 923         }
 924         memcpy(resbuf, ctx->hash, sizeof(ctx->hash));
 925         return sizeof(ctx->hash);
 926 }
 927 #endif /* NEED_SHA512 */
 928
 929
 930 /*
 931  * The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
 932  * Michael Peeters and Gilles Van Assche. For more information, feedback or
 933  * questions, please refer to our website: http://keccak.noekeon.org/
 934  *
 935  * Implementation by Ronny Van Keer,
 936  * hereby denoted as "the implementer".
 937  *
 938  * To the extent possible under law, the implementer has waived all copyright
 939  * and related or neighboring rights to the source code in this file.
 940  * http://creativecommons.org/publicdomain/zero/1.0/
 941  *
 942  * Busybox modifications (C) Lauri Kasanen, under the GPLv2.
 943  */
 944
 945 #if CONFIG_SHA3_SMALL < 0
 946 # define SHA3_SMALL 0
 947 #elif CONFIG_SHA3_SMALL > 1
 948 # define SHA3_SMALL 1
 949 #else
 950 # define SHA3_SMALL CONFIG_SHA3_SMALL
 951 #endif
 952
 953 #define OPTIMIZE_SHA3_FOR_32 0
 954 /*
 955  * SHA3 can be optimized for 32-bit CPUs with bit-slicing:
 956  * every 64-bit word of state[] can be split into two 32-bit words
 957  * by even/odd bits. In this form, all rotations of sha3 round
 958  * are 32-bit - and there are lots of them.
 959  * However, it requires either splitting/combining state words
 960  * before/after sha3 round (code does this now)
 961  * or shuffling bits before xor'ing them into state and in sha3_end.
 962  * Without shuffling, bit-slicing results in -130 bytes of code
 963  * and marginal speedup (but of course it gives wrong result).
 964  * With shuffling it works, but +260 code bytes, and slower.
 965  * Disabled for now:
 966  */
 967 #if 0 /* LONG_MAX == 0x7fffffff */
 968 # undef OPTIMIZE_SHA3_FOR_32
 969 # define OPTIMIZE_SHA3_FOR_32 1
 970 #endif
 971
 972 #if OPTIMIZE_SHA3_FOR_32
 973 /* This splits every 64-bit word into a pair of 32-bit words,
 974  * even bits go into first word, odd bits go to second one.
 975  * The conversion is done in-place.
 976  */
 977 static void split_halves(uint64_t *state)
 978 {
 979         /* Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
 980         uint32_t *s32 = (uint32_t*)state;
 981         uint32_t t, x0, x1;
 982         int i;
 983         for (i = 24; i >= 0; --i) {
 984                 x0 = s32[0];
 985                 t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
 986                 t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
 987                 t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
 988                 t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
 989                 x1 = s32[1];
 990                 t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
 991                 t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
 992                 t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
 993                 t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
 994                 *s32++ = (x0 & 0x0000FFFF) | (x1 << 16);
 995                 *s32++ = (x0 >> 16) | (x1 & 0xFFFF0000);
 996         }
 997 }
 998 /* The reverse operation */
 999 static void combine_halves(uint64_t *state)
1000 {
1001         uint32_t *s32 = (uint32_t*)state;
1002         uint32_t t, x0, x1;
1003         int i;
1004         for (i = 24; i >= 0; --i) {
1005                 x0 = s32[0];
1006                 x1 = s32[1];
1007                 t = (x0 & 0x0000FFFF) | (x1 << 16);
1008                 x1 = (x0 >> 16) | (x1 & 0xFFFF0000);
1009                 x0 = t;
1010                 t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
1011                 t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
1012                 t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
1013                 t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
1014                 *s32++ = x0;
1015                 t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
1016                 t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
1017                 t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
1018                 t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
1019                 *s32++ = x1;
1020         }
1021 }
1022 #endif
1023
1024 /*
1025  * In the crypto literature this function is usually called Keccak-f().
1026  */
1027 static void sha3_process_block72(uint64_t *state)
1028 {
1029         enum { NROUNDS = 24 };
1030
1031 #if OPTIMIZE_SHA3_FOR_32
1032         /*
1033         static const uint32_t IOTA_CONST_0[NROUNDS] = {
1034                 0x00000001UL,
1035                 0x00000000UL,
1036                 0x00000000UL,
1037                 0x00000000UL,
1038                 0x00000001UL,
1039                 0x00000001UL,
1040                 0x00000001UL,
1041                 0x00000001UL,
1042                 0x00000000UL,
1043                 0x00000000UL,
1044                 0x00000001UL,
1045                 0x00000000UL,
1046                 0x00000001UL,
1047                 0x00000001UL,
1048                 0x00000001UL,
1049                 0x00000001UL,
1050                 0x00000000UL,
1051                 0x00000000UL,
1052                 0x00000000UL,
1053                 0x00000000UL,
1054                 0x00000001UL,
1055                 0x00000000UL,
1056                 0x00000001UL,
1057                 0x00000000UL,
1058         };
1059         ** bits are in lsb: 0101 0000 1111 0100 1111 0001
1060         */
1061         uint32_t IOTA_CONST_0bits = (uint32_t)(0x0050f4f1);
1062         static const uint32_t IOTA_CONST_1[NROUNDS] = {
1063                 0x00000000UL,
1064                 0x00000089UL,
1065                 0x8000008bUL,
1066                 0x80008080UL,
1067                 0x0000008bUL,
1068                 0x00008000UL,
1069                 0x80008088UL,
1070                 0x80000082UL,
1071                 0x0000000bUL,
1072                 0x0000000aUL,
1073                 0x00008082UL,
1074                 0x00008003UL,
1075                 0x0000808bUL,
1076                 0x8000000bUL,
1077                 0x8000008aUL,
1078                 0x80000081UL,
1079                 0x80000081UL,
1080                 0x80000008UL,
1081                 0x00000083UL,
1082                 0x80008003UL,
1083                 0x80008088UL,
1084                 0x80000088UL,
1085                 0x00008000UL,
1086                 0x80008082UL,
1087         };
1088
1089         uint32_t *const s32 = (uint32_t*)state;
1090         unsigned round;
1091
1092         split_halves(state);
1093
1094         for (round = 0; round < NROUNDS; round++) {
1095                 unsigned x;
1096
1097                 /* Theta */
1098                 {
1099                         uint32_t BC[20];
1100                         for (x = 0; x < 10; ++x) {
1101                                 BC[x+10] = BC[x] = s32[x]^s32[x+10]^s32[x+20]^s32[x+30]^s32[x+40];
1102                         }
1103                         for (x = 0; x < 10; x += 2) {
1104                                 uint32_t ta, tb;
1105                                 ta = BC[x+8] ^ rotl32(BC[x+3], 1);
1106                                 tb = BC[x+9] ^ BC[x+2];
1107                                 s32[x+0] ^= ta;
1108                                 s32[x+1] ^= tb;
1109                                 s32[x+10] ^= ta;
1110                                 s32[x+11] ^= tb;
1111                                 s32[x+20] ^= ta;
1112                                 s32[x+21] ^= tb;
1113                                 s32[x+30] ^= ta;
1114                                 s32[x+31] ^= tb;
1115                                 s32[x+40] ^= ta;
1116                                 s32[x+41] ^= tb;
1117                         }
1118                 }
1119                 /* RhoPi */
1120                 {
1121                         uint32_t t0a,t0b, t1a,t1b;
1122                         t1a = s32[1*2+0];
1123                         t1b = s32[1*2+1];
1124
1125 #define RhoPi(PI_LANE, ROT_CONST) \
1126         t0a = s32[PI_LANE*2+0];\
1127         t0b = s32[PI_LANE*2+1];\
1128         if (ROT_CONST & 1) {\
1129                 s32[PI_LANE*2+0] = rotl32(t1b, ROT_CONST/2+1);\
1130                 s32[PI_LANE*2+1] = ROT_CONST == 1 ? t1a : rotl32(t1a, ROT_CONST/2+0);\
1131         } else {\
1132                 s32[PI_LANE*2+0] = rotl32(t1a, ROT_CONST/2);\
1133                 s32[PI_LANE*2+1] = rotl32(t1b, ROT_CONST/2);\
1134         }\
1135         t1a = t0a; t1b = t0b;
1136
1137                         RhoPi(10, 1)
1138                         RhoPi( 7, 3)
1139                         RhoPi(11, 6)
1140                         RhoPi(17,10)
1141                         RhoPi(18,15)
1142                         RhoPi( 3,21)
1143                         RhoPi( 5,28)
1144                         RhoPi(16,36)
1145                         RhoPi( 8,45)
1146                         RhoPi(21,55)
1147                         RhoPi(24, 2)
1148                         RhoPi( 4,14)
1149                         RhoPi(15,27)
1150                         RhoPi(23,41)
1151                         RhoPi(19,56)
1152                         RhoPi(13, 8)
1153                         RhoPi(12,25)
1154                         RhoPi( 2,43)
1155                         RhoPi(20,62)
1156                         RhoPi(14,18)
1157                         RhoPi(22,39)
1158                         RhoPi( 9,61)
1159                         RhoPi( 6,20)
1160                         RhoPi( 1,44)
1161 #undef RhoPi
1162                 }
1163                 /* Chi */
1164                 for (x = 0; x <= 40;) {
1165                         uint32_t BC0, BC1, BC2, BC3, BC4;
1166                         BC0 = s32[x + 0*2];
1167                         BC1 = s32[x + 1*2];
1168                         BC2 = s32[x + 2*2];
1169                         s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1170                         BC3 = s32[x + 3*2];
1171                         s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1172                         BC4 = s32[x + 4*2];
1173                         s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1174                         s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1175                         s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1176                         x++;
1177                         BC0 = s32[x + 0*2];
1178                         BC1 = s32[x + 1*2];
1179                         BC2 = s32[x + 2*2];
1180                         s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1181                         BC3 = s32[x + 3*2];
1182                         s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1183                         BC4 = s32[x + 4*2];
1184                         s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1185                         s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1186                         s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1187                         x += 9;
1188                 }
1189                 /* Iota */
1190                 s32[0] ^= IOTA_CONST_0bits & 1;
1191                 IOTA_CONST_0bits >>= 1;
1192                 s32[1] ^= IOTA_CONST_1[round];
1193         }
1194
1195         combine_halves(state);
1196 #else
1197         /* Native 64-bit algorithm */
1198         static const uint16_t IOTA_CONST[NROUNDS] = {
1199                 /* Elements should be 64-bit, but top half is always zero
1200                  * or 0x80000000. We encode 63rd bits in a separate word below.
1201                  * Same is true for 31th bits, which lets us use 16-bit table
1202                  * instead of 64-bit. The speed penalty is lost in the noise.
1203                  */
1204                 0x0001,
1205                 0x8082,
1206                 0x808a,
1207                 0x8000,
1208                 0x808b,
1209                 0x0001,
1210                 0x8081,
1211                 0x8009,
1212                 0x008a,
1213                 0x0088,
1214                 0x8009,
1215                 0x000a,
1216                 0x808b,
1217                 0x008b,
1218                 0x8089,
1219                 0x8003,
1220                 0x8002,
1221                 0x0080,
1222                 0x800a,
1223                 0x000a,
1224                 0x8081,
1225                 0x8080,
1226                 0x0001,
1227                 0x8008,
1228         };
1229         /* bit for CONST[0] is in msb: 0011 0011 0000 0111 1101 1101 */
1230         const uint32_t IOTA_CONST_bit63 = (uint32_t)(0x3307dd00);
1231         /* bit for CONST[0] is in msb: 0001 0110 0011 1000 0001 1011 */
1232         const uint32_t IOTA_CONST_bit31 = (uint32_t)(0x16381b00);
1233
1234         static const uint8_t ROT_CONST[24] = {
1235                 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
1236                 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44,
1237         };
1238         static const uint8_t PI_LANE[24] = {
1239                 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
1240                 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1,
1241         };
1242         /*static const uint8_t MOD5[10] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/
1243
1244         unsigned x;
1245         unsigned round;
1246
1247         if (BB_BIG_ENDIAN) {
1248                 for (x = 0; x < 25; x++) {
1249                         state[x] = SWAP_LE64(state[x]);
1250                 }
1251         }
1252
1253         for (round = 0; round < NROUNDS; ++round) {
1254                 /* Theta */
1255                 {
1256                         uint64_t BC[10];
1257                         for (x = 0; x < 5; ++x) {
1258                                 BC[x + 5] = BC[x] = state[x]
1259                                         ^ state[x + 5] ^ state[x + 10]
1260                                         ^ state[x + 15] ^ state[x + 20];
1261                         }
1262                         /* Using 2x5 vector above eliminates the need to use
1263                          * BC[MOD5[x+N]] trick below to fetch BC[(x+N) % 5],
1264                          * and the code is a bit _smaller_.
1265                          */
1266                         for (x = 0; x < 5; ++x) {
1267                                 uint64_t temp = BC[x + 4] ^ rotl64(BC[x + 1], 1);
1268                                 state[x] ^= temp;
1269                                 state[x + 5] ^= temp;
1270                                 state[x + 10] ^= temp;
1271                                 state[x + 15] ^= temp;
1272                                 state[x + 20] ^= temp;
1273                         }
1274                 }
1275
1276                 /* Rho Pi */
1277                 if (SHA3_SMALL) {
1278                         uint64_t t1 = state[1];
1279                         for (x = 0; x < 24; ++x) {
1280                                 uint64_t t0 = state[PI_LANE[x]];
1281                                 state[PI_LANE[x]] = rotl64(t1, ROT_CONST[x]);
1282                                 t1 = t0;
1283                         }
1284                 } else {
1285                         /* Especially large benefit for 32-bit arch (75% faster):
1286                          * 64-bit rotations by non-constant usually are SLOW on those.
1287                          * We resort to unrolling here.
1288                          * This optimizes out PI_LANE[] and ROT_CONST[],
1289                          * but generates 300-500 more bytes of code.
1290                          */
1291                         uint64_t t0;
1292                         uint64_t t1 = state[1];
1293 #define RhoPi_twice(x) \
1294         t0 = state[PI_LANE[x  ]]; \
1295         state[PI_LANE[x  ]] = rotl64(t1, ROT_CONST[x  ]); \
1296         t1 = state[PI_LANE[x+1]]; \
1297         state[PI_LANE[x+1]] = rotl64(t0, ROT_CONST[x+1]);
1298                         RhoPi_twice(0); RhoPi_twice(2);
1299                         RhoPi_twice(4); RhoPi_twice(6);
1300                         RhoPi_twice(8); RhoPi_twice(10);
1301                         RhoPi_twice(12); RhoPi_twice(14);
1302                         RhoPi_twice(16); RhoPi_twice(18);
1303                         RhoPi_twice(20); RhoPi_twice(22);
1304 #undef RhoPi_twice
1305                 }
1306                 /* Chi */
1307 # if LONG_MAX > 0x7fffffff
1308                 for (x = 0; x <= 20; x += 5) {
1309                         uint64_t BC0, BC1, BC2, BC3, BC4;
1310                         BC0 = state[x + 0];
1311                         BC1 = state[x + 1];
1312                         BC2 = state[x + 2];
1313                         state[x + 0] = BC0 ^ ((~BC1) & BC2);
1314                         BC3 = state[x + 3];
1315                         state[x + 1] = BC1 ^ ((~BC2) & BC3);
1316                         BC4 = state[x + 4];
1317                         state[x + 2] = BC2 ^ ((~BC3) & BC4);
1318                         state[x + 3] = BC3 ^ ((~BC4) & BC0);
1319                         state[x + 4] = BC4 ^ ((~BC0) & BC1);
1320                 }
1321 # else
1322                 /* Reduced register pressure version
1323                  * for register-starved 32-bit arches
1324                  * (i386: -95 bytes, and it is _faster_)
1325                  */
1326                 for (x = 0; x <= 40;) {
1327                         uint32_t BC0, BC1, BC2, BC3, BC4;
1328                         uint32_t *const s32 = (uint32_t*)state;
1329 #  if SHA3_SMALL
1330  do_half:
1331 #  endif
1332                         BC0 = s32[x + 0*2];
1333                         BC1 = s32[x + 1*2];
1334                         BC2 = s32[x + 2*2];
1335                         s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1336                         BC3 = s32[x + 3*2];
1337                         s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1338                         BC4 = s32[x + 4*2];
1339                         s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1340                         s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1341                         s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1342                         x++;
1343 #  if SHA3_SMALL
1344                         if (x & 1)
1345                                 goto do_half;
1346                         x += 8;
1347 #  else
1348                         BC0 = s32[x + 0*2];
1349                         BC1 = s32[x + 1*2];
1350                         BC2 = s32[x + 2*2];
1351                         s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1352                         BC3 = s32[x + 3*2];
1353                         s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1354                         BC4 = s32[x + 4*2];
1355                         s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1356                         s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1357                         s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1358                         x += 9;
1359 #  endif
1360                 }
1361 # endif /* long is 32-bit */
1362                 /* Iota */
1363                 state[0] ^= IOTA_CONST[round]
1364                         | (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000)
1365                         | (uint64_t)((IOTA_CONST_bit63 << round) & 0x80000000) << 32;
1366         }
1367
1368         if (BB_BIG_ENDIAN) {
1369                 for (x = 0; x < 25; x++) {
1370                         state[x] = SWAP_LE64(state[x]);
1371                 }
1372         }
1373 #endif
1374 }
1375
1376 void FAST_FUNC sha3_begin(sha3_ctx_t *ctx)
1377 {
1378         memset(ctx, 0, sizeof(*ctx));
1379         /* SHA3-512, user can override */
1380         ctx->input_block_bytes = (1600 - 512*2) / 8; /* 72 bytes */
1381 }
1382
1383 void FAST_FUNC sha3_hash(sha3_ctx_t *ctx, const void *buffer, size_t len)
1384 {
1385 #if SHA3_SMALL
1386         const uint8_t *data = buffer;
1387         unsigned bufpos = ctx->bytes_queued;
1388
1389         while (1) {
1390                 unsigned remaining = ctx->input_block_bytes - bufpos;
1391                 if (remaining > len)
1392                         remaining = len;
1393                 len -= remaining;
1394                 /* XOR data into buffer */
1395                 while (remaining != 0) {
1396                         uint8_t *buf = (uint8_t*)ctx->state;
1397                         buf[bufpos] ^= *data++;
1398                         bufpos++;
1399                         remaining--;
1400                 }
1401                 /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
1402                 bufpos -= ctx->input_block_bytes;
1403                 if (bufpos != 0)
1404                         break;
1405                 /* Buffer is filled up, process it */
1406                 sha3_process_block72(ctx->state);
1407                 /*bufpos = 0; - already is */
1408         }
1409         ctx->bytes_queued = bufpos + ctx->input_block_bytes;
1410 #else
1411         /* +50 bytes code size, but a bit faster because of long-sized XORs */
1412         const uint8_t *data = buffer;
1413         unsigned bufpos = ctx->bytes_queued;
1414         unsigned iblk_bytes = ctx->input_block_bytes;
1415
1416         /* If already data in queue, continue queuing first */
1417         if (bufpos != 0) {
1418                 while (len != 0) {
1419                         uint8_t *buf = (uint8_t*)ctx->state;
1420                         buf[bufpos] ^= *data++;
1421                         len--;
1422                         bufpos++;
1423                         if (bufpos == iblk_bytes) {
1424                                 bufpos = 0;
1425                                 goto do_block;
1426                         }
1427                 }
1428         }
1429
1430         /* Absorb complete blocks */
1431         while (len >= iblk_bytes) {
1432                 /* XOR data onto beginning of state[].
1433                  * We try to be efficient - operate one word at a time, not byte.
1434                  * Careful wrt unaligned access: can't just use "*(long*)data"!
1435                  */
1436                 unsigned count = iblk_bytes / sizeof(long);
1437                 long *buf = (long*)ctx->state;
1438                 do {
1439                         long v;
1440                         move_from_unaligned_long(v, (long*)data);
1441                         *buf++ ^= v;
1442                         data += sizeof(long);
1443                 } while (--count);
1444                 len -= iblk_bytes;
1445  do_block:
1446                 sha3_process_block72(ctx->state);
1447         }
1448
1449         /* Queue remaining data bytes */
1450         while (len != 0) {
1451                 uint8_t *buf = (uint8_t*)ctx->state;
1452                 buf[bufpos] ^= *data++;
1453                 bufpos++;
1454                 len--;
1455         }
1456
1457         ctx->bytes_queued = bufpos;
1458 #endif
1459 }
1460
1461 unsigned FAST_FUNC sha3_end(sha3_ctx_t *ctx, void *resbuf)
1462 {
1463         /* Padding */
1464         uint8_t *buf = (uint8_t*)ctx->state;
1465         /*
1466          * Keccak block padding is: add 1 bit after last bit of input,
1467          * then add zero bits until the end of block, and add the last 1 bit
1468          * (the last bit in the block) - the "10*1" pattern.
1469          * SHA3 standard appends additional two bits, 01,  before that padding:
1470          *
1471          * SHA3-224(M) = KECCAK[448](M||01, 224)
1472          * SHA3-256(M) = KECCAK[512](M||01, 256)
1473          * SHA3-384(M) = KECCAK[768](M||01, 384)
1474          * SHA3-512(M) = KECCAK[1024](M||01, 512)
1475          * (M is the input, || is bit concatenation)
1476          *
1477          * The 6 below contains 01 "SHA3" bits and the first 1 "Keccak" bit:
1478          */
1479         buf[ctx->bytes_queued]          ^= 6; /* bit pattern 00000110 */
1480         buf[ctx->input_block_bytes - 1] ^= 0x80;
1481
1482         sha3_process_block72(ctx->state);
1483
1484         /* Output */
1485         memcpy(resbuf, ctx->state, 64);
1486         return 64;
1487 }