libbb/hash_md5_sha.c

   1 /* vi: set sw=4 ts=4: */
   2 /*
   3  * Utility routines.
   4  *
   5  * Copyright (C) 2010 Denys Vlasenko
   6  *
   7  * Licensed under GPLv2 or later, see file LICENSE in this source tree.
   8  */
   9
  10 #include "libbb.h"
  11
  12 /* gcc 4.2.1 optimizes rotr64 better with inline than with macro
  13  * (for rotX32, there is no difference). Why? My guess is that
  14  * macro requires clever common subexpression elimination heuristics
  15  * in gcc, while inline basically forces it to happen.
  16  */
  17 //#define rotl32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
  18 static ALWAYS_INLINE uint32_t rotl32(uint32_t x, unsigned n)
  19 {
  20         return (x << n) | (x >> (32 - n));
  21 }
  22 //#define rotr32(x,n) (((x) >> (n)) | ((x) << (32 - (n))))
  23 static ALWAYS_INLINE uint32_t rotr32(uint32_t x, unsigned n)
  24 {
  25         return (x >> n) | (x << (32 - n));
  26 }
  27 /* rotr64 in needed for sha512 only: */
  28 //#define rotr64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))
  29 static ALWAYS_INLINE uint64_t rotr64(uint64_t x, unsigned n)
  30 {
  31         return (x >> n) | (x << (64 - n));
  32 }
  33
  34 /* rotl64 only used for sha3 currently */
  35 static ALWAYS_INLINE uint64_t rotl64(uint64_t x, unsigned n)
  36 {
  37         return (x << n) | (x >> (64 - n));
  38 }
  39
  40 /* Feed data through a temporary buffer.
  41  * The internal buffer remembers previous data until it has 64
  42  * bytes worth to pass on.
  43  */
  44 static void FAST_FUNC common64_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
  45 {
  46         unsigned bufpos = ctx->total64 & 63;
  47
  48         ctx->total64 += len;
  49
  50         while (1) {
  51                 unsigned remaining = 64 - bufpos;
  52                 if (remaining > len)
  53                         remaining = len;
  54                 /* Copy data into aligned buffer */
  55                 memcpy(ctx->wbuffer + bufpos, buffer, remaining);
  56                 len -= remaining;
  57                 buffer = (const char *)buffer + remaining;
  58                 bufpos += remaining;
  59                 /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
  60                 bufpos -= 64;
  61                 if (bufpos != 0)
  62                         break;
  63                 /* Buffer is filled up, process it */
  64                 ctx->process_block(ctx);
  65                 /*bufpos = 0; - already is */
  66         }
  67 }
  68
  69 /* Process the remaining bytes in the buffer */
  70 static void FAST_FUNC common64_end(md5_ctx_t *ctx, int swap_needed)
  71 {
  72         unsigned bufpos = ctx->total64 & 63;
  73         /* Pad the buffer to the next 64-byte boundary with 0x80,0,0,0... */
  74         ctx->wbuffer[bufpos++] = 0x80;
  75
  76         /* This loop iterates either once or twice, no more, no less */
  77         while (1) {
  78                 unsigned remaining = 64 - bufpos;
  79                 memset(ctx->wbuffer + bufpos, 0, remaining);
  80                 /* Do we have enough space for the length count? */
  81                 if (remaining >= 8) {
  82                         /* Store the 64-bit counter of bits in the buffer */
  83                         uint64_t t = ctx->total64 << 3;
  84                         if (swap_needed)
  85                                 t = bb_bswap_64(t);
  86                         /* wbuffer is suitably aligned for this */
  87                         *(bb__aliased_uint64_t *) (&ctx->wbuffer[64 - 8]) = t;
  88                 }
  89                 ctx->process_block(ctx);
  90                 if (remaining >= 8)
  91                         break;
  92                 bufpos = 0;
  93         }
  94 }
  95
  96
  97 /*
  98  * Compute MD5 checksum of strings according to the
  99  * definition of MD5 in RFC 1321 from April 1992.
 100  *
 101  * Written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
 102  *
 103  * Copyright (C) 1995-1999 Free Software Foundation, Inc.
 104  * Copyright (C) 2001 Manuel Novoa III
 105  * Copyright (C) 2003 Glenn L. McGrath
 106  * Copyright (C) 2003 Erik Andersen
 107  *
 108  * Licensed under GPLv2 or later, see file LICENSE in this source tree.
 109  */
 110
 111 /* 0: fastest, 3: smallest */
 112 #if CONFIG_MD5_SMALL < 0
 113 # define MD5_SMALL 0
 114 #elif CONFIG_MD5_SMALL > 3
 115 # define MD5_SMALL 3
 116 #else
 117 # define MD5_SMALL CONFIG_MD5_SMALL
 118 #endif
 119
 120 /* These are the four functions used in the four steps of the MD5 algorithm
 121  * and defined in the RFC 1321.  The first function is a little bit optimized
 122  * (as found in Colin Plumbs public domain implementation).
 123  * #define FF(b, c, d) ((b & c) | (~b & d))
 124  */
 125 #undef FF
 126 #undef FG
 127 #undef FH
 128 #undef FI
 129 #define FF(b, c, d) (d ^ (b & (c ^ d)))
 130 #define FG(b, c, d) FF(d, b, c)
 131 #define FH(b, c, d) (b ^ c ^ d)
 132 #define FI(b, c, d) (c ^ (b | ~d))
 133
 134 /* Hash a single block, 64 bytes long and 4-byte aligned */
 135 static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
 136 {
 137 #if MD5_SMALL > 0
 138         /* Before we start, one word to the strange constants.
 139            They are defined in RFC 1321 as
 140            T[i] = (int)(2^32 * fabs(sin(i))), i=1..64
 141          */
 142         static const uint32_t C_array[] = {
 143                 /* round 1 */
 144                 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
 145                 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
 146                 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
 147                 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
 148                 /* round 2 */
 149                 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
 150                 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
 151                 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
 152                 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
 153                 /* round 3 */
 154                 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
 155                 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
 156                 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x4881d05,
 157                 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
 158                 /* round 4 */
 159                 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
 160                 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
 161                 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
 162                 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
 163         };
 164         static const char P_array[] ALIGN1 = {
 165 # if MD5_SMALL > 1
 166                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /* 1 */
 167 # endif
 168                 1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, /* 2 */
 169                 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, /* 3 */
 170                 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9  /* 4 */
 171         };
 172 #endif
 173         uint32_t *words = (void*) ctx->wbuffer;
 174         uint32_t A = ctx->hash[0];
 175         uint32_t B = ctx->hash[1];
 176         uint32_t C = ctx->hash[2];
 177         uint32_t D = ctx->hash[3];
 178
 179 #if MD5_SMALL >= 2  /* 2 or 3 */
 180
 181         static const char S_array[] ALIGN1 = {
 182                 7, 12, 17, 22,
 183                 5, 9, 14, 20,
 184                 4, 11, 16, 23,
 185                 6, 10, 15, 21
 186         };
 187         const uint32_t *pc;
 188         const char *pp;
 189         const char *ps;
 190         int i;
 191         uint32_t temp;
 192
 193         if (BB_BIG_ENDIAN)
 194                 for (i = 0; i < 16; i++)
 195                         words[i] = SWAP_LE32(words[i]);
 196
 197 # if MD5_SMALL == 3
 198         pc = C_array;
 199         pp = P_array;
 200         ps = S_array - 4;
 201
 202         for (i = 0; i < 64; i++) {
 203                 if ((i & 0x0f) == 0)
 204                         ps += 4;
 205                 temp = A;
 206                 switch (i >> 4) {
 207                 case 0:
 208                         temp += FF(B, C, D);
 209                         break;
 210                 case 1:
 211                         temp += FG(B, C, D);
 212                         break;
 213                 case 2:
 214                         temp += FH(B, C, D);
 215                         break;
 216                 default: /* case 3 */
 217                         temp += FI(B, C, D);
 218                 }
 219                 temp += words[(int) (*pp++)] + *pc++;
 220                 temp = rotl32(temp, ps[i & 3]);
 221                 temp += B;
 222                 A = D;
 223                 D = C;
 224                 C = B;
 225                 B = temp;
 226         }
 227 # else  /* MD5_SMALL == 2 */
 228         pc = C_array;
 229         pp = P_array;
 230         ps = S_array;
 231
 232         for (i = 0; i < 16; i++) {
 233                 temp = A + FF(B, C, D) + words[(int) (*pp++)] + *pc++;
 234                 temp = rotl32(temp, ps[i & 3]);
 235                 temp += B;
 236                 A = D;
 237                 D = C;
 238                 C = B;
 239                 B = temp;
 240         }
 241         ps += 4;
 242         for (i = 0; i < 16; i++) {
 243                 temp = A + FG(B, C, D) + words[(int) (*pp++)] + *pc++;
 244                 temp = rotl32(temp, ps[i & 3]);
 245                 temp += B;
 246                 A = D;
 247                 D = C;
 248                 C = B;
 249                 B = temp;
 250         }
 251         ps += 4;
 252         for (i = 0; i < 16; i++) {
 253                 temp = A + FH(B, C, D) + words[(int) (*pp++)] + *pc++;
 254                 temp = rotl32(temp, ps[i & 3]);
 255                 temp += B;
 256                 A = D;
 257                 D = C;
 258                 C = B;
 259                 B = temp;
 260         }
 261         ps += 4;
 262         for (i = 0; i < 16; i++) {
 263                 temp = A + FI(B, C, D) + words[(int) (*pp++)] + *pc++;
 264                 temp = rotl32(temp, ps[i & 3]);
 265                 temp += B;
 266                 A = D;
 267                 D = C;
 268                 C = B;
 269                 B = temp;
 270         }
 271 # endif
 272         /* Add checksum to the starting values */
 273         ctx->hash[0] += A;
 274         ctx->hash[1] += B;
 275         ctx->hash[2] += C;
 276         ctx->hash[3] += D;
 277
 278 #else  /* MD5_SMALL == 0 or 1 */
 279
 280 # if MD5_SMALL == 1
 281         const uint32_t *pc;
 282         const char *pp;
 283         int i;
 284 # endif
 285
 286         /* First round: using the given function, the context and a constant
 287            the next context is computed.  Because the algorithm's processing
 288            unit is a 32-bit word and it is determined to work on words in
 289            little endian byte order we perhaps have to change the byte order
 290            before the computation.  To reduce the work for the next steps
 291            we save swapped words in WORDS array.  */
 292 # undef OP
 293 # define OP(a, b, c, d, s, T) \
 294         do { \
 295                 a += FF(b, c, d) + (*words IF_BIG_ENDIAN(= SWAP_LE32(*words))) + T; \
 296                 words++; \
 297                 a = rotl32(a, s); \
 298                 a += b; \
 299         } while (0)
 300
 301         /* Round 1 */
 302 # if MD5_SMALL == 1
 303         pc = C_array;
 304         for (i = 0; i < 4; i++) {
 305                 OP(A, B, C, D, 7, *pc++);
 306                 OP(D, A, B, C, 12, *pc++);
 307                 OP(C, D, A, B, 17, *pc++);
 308                 OP(B, C, D, A, 22, *pc++);
 309         }
 310 # else
 311         OP(A, B, C, D, 7, 0xd76aa478);
 312         OP(D, A, B, C, 12, 0xe8c7b756);
 313         OP(C, D, A, B, 17, 0x242070db);
 314         OP(B, C, D, A, 22, 0xc1bdceee);
 315         OP(A, B, C, D, 7, 0xf57c0faf);
 316         OP(D, A, B, C, 12, 0x4787c62a);
 317         OP(C, D, A, B, 17, 0xa8304613);
 318         OP(B, C, D, A, 22, 0xfd469501);
 319         OP(A, B, C, D, 7, 0x698098d8);
 320         OP(D, A, B, C, 12, 0x8b44f7af);
 321         OP(C, D, A, B, 17, 0xffff5bb1);
 322         OP(B, C, D, A, 22, 0x895cd7be);
 323         OP(A, B, C, D, 7, 0x6b901122);
 324         OP(D, A, B, C, 12, 0xfd987193);
 325         OP(C, D, A, B, 17, 0xa679438e);
 326         OP(B, C, D, A, 22, 0x49b40821);
 327 # endif
 328         words -= 16;
 329
 330         /* For the second to fourth round we have the possibly swapped words
 331            in WORDS.  Redefine the macro to take an additional first
 332            argument specifying the function to use.  */
 333 # undef OP
 334 # define OP(f, a, b, c, d, k, s, T) \
 335         do { \
 336                 a += f(b, c, d) + words[k] + T; \
 337                 a = rotl32(a, s); \
 338                 a += b; \
 339         } while (0)
 340
 341         /* Round 2 */
 342 # if MD5_SMALL == 1
 343         pp = P_array;
 344         for (i = 0; i < 4; i++) {
 345                 OP(FG, A, B, C, D, (int) (*pp++), 5, *pc++);
 346                 OP(FG, D, A, B, C, (int) (*pp++), 9, *pc++);
 347                 OP(FG, C, D, A, B, (int) (*pp++), 14, *pc++);
 348                 OP(FG, B, C, D, A, (int) (*pp++), 20, *pc++);
 349         }
 350 # else
 351         OP(FG, A, B, C, D, 1, 5, 0xf61e2562);
 352         OP(FG, D, A, B, C, 6, 9, 0xc040b340);
 353         OP(FG, C, D, A, B, 11, 14, 0x265e5a51);
 354         OP(FG, B, C, D, A, 0, 20, 0xe9b6c7aa);
 355         OP(FG, A, B, C, D, 5, 5, 0xd62f105d);
 356         OP(FG, D, A, B, C, 10, 9, 0x02441453);
 357         OP(FG, C, D, A, B, 15, 14, 0xd8a1e681);
 358         OP(FG, B, C, D, A, 4, 20, 0xe7d3fbc8);
 359         OP(FG, A, B, C, D, 9, 5, 0x21e1cde6);
 360         OP(FG, D, A, B, C, 14, 9, 0xc33707d6);
 361         OP(FG, C, D, A, B, 3, 14, 0xf4d50d87);
 362         OP(FG, B, C, D, A, 8, 20, 0x455a14ed);
 363         OP(FG, A, B, C, D, 13, 5, 0xa9e3e905);
 364         OP(FG, D, A, B, C, 2, 9, 0xfcefa3f8);
 365         OP(FG, C, D, A, B, 7, 14, 0x676f02d9);
 366         OP(FG, B, C, D, A, 12, 20, 0x8d2a4c8a);
 367 # endif
 368
 369         /* Round 3 */
 370 # if MD5_SMALL == 1
 371         for (i = 0; i < 4; i++) {
 372                 OP(FH, A, B, C, D, (int) (*pp++), 4, *pc++);
 373                 OP(FH, D, A, B, C, (int) (*pp++), 11, *pc++);
 374                 OP(FH, C, D, A, B, (int) (*pp++), 16, *pc++);
 375                 OP(FH, B, C, D, A, (int) (*pp++), 23, *pc++);
 376         }
 377 # else
 378         OP(FH, A, B, C, D, 5, 4, 0xfffa3942);
 379         OP(FH, D, A, B, C, 8, 11, 0x8771f681);
 380         OP(FH, C, D, A, B, 11, 16, 0x6d9d6122);
 381         OP(FH, B, C, D, A, 14, 23, 0xfde5380c);
 382         OP(FH, A, B, C, D, 1, 4, 0xa4beea44);
 383         OP(FH, D, A, B, C, 4, 11, 0x4bdecfa9);
 384         OP(FH, C, D, A, B, 7, 16, 0xf6bb4b60);
 385         OP(FH, B, C, D, A, 10, 23, 0xbebfbc70);
 386         OP(FH, A, B, C, D, 13, 4, 0x289b7ec6);
 387         OP(FH, D, A, B, C, 0, 11, 0xeaa127fa);
 388         OP(FH, C, D, A, B, 3, 16, 0xd4ef3085);
 389         OP(FH, B, C, D, A, 6, 23, 0x04881d05);
 390         OP(FH, A, B, C, D, 9, 4, 0xd9d4d039);
 391         OP(FH, D, A, B, C, 12, 11, 0xe6db99e5);
 392         OP(FH, C, D, A, B, 15, 16, 0x1fa27cf8);
 393         OP(FH, B, C, D, A, 2, 23, 0xc4ac5665);
 394 # endif
 395
 396         /* Round 4 */
 397 # if MD5_SMALL == 1
 398         for (i = 0; i < 4; i++) {
 399                 OP(FI, A, B, C, D, (int) (*pp++), 6, *pc++);
 400                 OP(FI, D, A, B, C, (int) (*pp++), 10, *pc++);
 401                 OP(FI, C, D, A, B, (int) (*pp++), 15, *pc++);
 402                 OP(FI, B, C, D, A, (int) (*pp++), 21, *pc++);
 403         }
 404 # else
 405         OP(FI, A, B, C, D, 0, 6, 0xf4292244);
 406         OP(FI, D, A, B, C, 7, 10, 0x432aff97);
 407         OP(FI, C, D, A, B, 14, 15, 0xab9423a7);
 408         OP(FI, B, C, D, A, 5, 21, 0xfc93a039);
 409         OP(FI, A, B, C, D, 12, 6, 0x655b59c3);
 410         OP(FI, D, A, B, C, 3, 10, 0x8f0ccc92);
 411         OP(FI, C, D, A, B, 10, 15, 0xffeff47d);
 412         OP(FI, B, C, D, A, 1, 21, 0x85845dd1);
 413         OP(FI, A, B, C, D, 8, 6, 0x6fa87e4f);
 414         OP(FI, D, A, B, C, 15, 10, 0xfe2ce6e0);
 415         OP(FI, C, D, A, B, 6, 15, 0xa3014314);
 416         OP(FI, B, C, D, A, 13, 21, 0x4e0811a1);
 417         OP(FI, A, B, C, D, 4, 6, 0xf7537e82);
 418         OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
 419         OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
 420         OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
 421 # undef OP
 422 # endif
 423         /* Add checksum to the starting values */
 424         ctx->hash[0] += A;
 425         ctx->hash[1] += B;
 426         ctx->hash[2] += C;
 427         ctx->hash[3] += D;
 428 #endif
 429 }
 430 #undef FF
 431 #undef FG
 432 #undef FH
 433 #undef FI
 434
 435 /* Initialize structure containing state of computation.
 436  * (RFC 1321, 3.3: Step 3)
 437  */
 438 void FAST_FUNC md5_begin(md5_ctx_t *ctx)
 439 {
 440         ctx->hash[0] = 0x67452301;
 441         ctx->hash[1] = 0xefcdab89;
 442         ctx->hash[2] = 0x98badcfe;
 443         ctx->hash[3] = 0x10325476;
 444         ctx->total64 = 0;
 445         ctx->process_block = md5_process_block64;
 446 }
 447
 448 /* Used also for sha1 and sha256 */
 449 void FAST_FUNC md5_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
 450 {
 451         common64_hash(ctx, buffer, len);
 452 }
 453
 454 /* Process the remaining bytes in the buffer and put result from CTX
 455  * in first 16 bytes following RESBUF.  The result is always in little
 456  * endian byte order, so that a byte-wise output yields to the wanted
 457  * ASCII representation of the message digest.
 458  */
 459 void FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
 460 {
 461         /* MD5 stores total in LE, need to swap on BE arches: */
 462         common64_end(ctx, /*swap_needed:*/ BB_BIG_ENDIAN);
 463
 464         /* The MD5 result is in little endian byte order */
 465         if (BB_BIG_ENDIAN) {
 466                 ctx->hash[0] = SWAP_LE32(ctx->hash[0]);
 467                 ctx->hash[1] = SWAP_LE32(ctx->hash[1]);
 468                 ctx->hash[2] = SWAP_LE32(ctx->hash[2]);
 469                 ctx->hash[3] = SWAP_LE32(ctx->hash[3]);
 470         }
 471
 472         memcpy(resbuf, ctx->hash, sizeof(ctx->hash[0]) * 4);
 473 }
 474
 475
 476 /*
 477  * SHA1 part is:
 478  * Copyright 2007 Rob Landley <rob@landley.net>
 479  *
 480  * Based on the public domain SHA-1 in C by Steve Reid <steve@edmweb.com>
 481  * from http://www.mirrors.wiretapped.net/security/cryptography/hashes/sha1/
 482  *
 483  * Licensed under GPLv2, see file LICENSE in this source tree.
 484  *
 485  * ---------------------------------------------------------------------------
 486  *
 487  * SHA256 and SHA512 parts are:
 488  * Released into the Public Domain by Ulrich Drepper <drepper@redhat.com>.
 489  * Shrank by Denys Vlasenko.
 490  *
 491  * ---------------------------------------------------------------------------
 492  *
 493  * The best way to test random blocksizes is to go to coreutils/md5_sha1_sum.c
 494  * and replace "4096" with something like "2000 + time(NULL) % 2097",
 495  * then rebuild and compare "shaNNNsum bigfile" results.
 496  */
 497
 498 static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 499 {
 500         static const uint32_t rconsts[] = {
 501                 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
 502         };
 503         int i, j;
 504         int cnt;
 505         uint32_t W[16+16];
 506         uint32_t a, b, c, d, e;
 507
 508         /* On-stack work buffer frees up one register in the main loop
 509          * which otherwise will be needed to hold ctx pointer */
 510         for (i = 0; i < 16; i++)
 511                 W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
 512
 513         a = ctx->hash[0];
 514         b = ctx->hash[1];
 515         c = ctx->hash[2];
 516         d = ctx->hash[3];
 517         e = ctx->hash[4];
 518
 519         /* 4 rounds of 20 operations each */
 520         cnt = 0;
 521         for (i = 0; i < 4; i++) {
 522                 j = 19;
 523                 do {
 524                         uint32_t work;
 525
 526                         work = c ^ d;
 527                         if (i == 0) {
 528                                 work = (work & b) ^ d;
 529                                 if (j <= 3)
 530                                         goto ge16;
 531                                 /* Used to do SWAP_BE32 here, but this
 532                                  * requires ctx (see comment above) */
 533                                 work += W[cnt];
 534                         } else {
 535                                 if (i == 2)
 536                                         work = ((b | c) & d) | (b & c);
 537                                 else /* i = 1 or 3 */
 538                                         work ^= b;
 539  ge16:
 540                                 W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);
 541                                 work += W[cnt];
 542                         }
 543                         work += e + rotl32(a, 5) + rconsts[i];
 544
 545                         /* Rotate by one for next time */
 546                         e = d;
 547                         d = c;
 548                         c = /* b = */ rotl32(b, 30);
 549                         b = a;
 550                         a = work;
 551                         cnt = (cnt + 1) & 15;
 552                 } while (--j >= 0);
 553         }
 554
 555         ctx->hash[0] += a;
 556         ctx->hash[1] += b;
 557         ctx->hash[2] += c;
 558         ctx->hash[3] += d;
 559         ctx->hash[4] += e;
 560 }
 561
 562 /* Constants for SHA512 from FIPS 180-2:4.2.3.
 563  * SHA256 constants from FIPS 180-2:4.2.2
 564  * are the most significant half of first 64 elements
 565  * of the same array.
 566  */
 567 static const uint64_t sha_K[80] = {
 568         0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
 569         0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
 570         0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
 571         0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
 572         0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
 573         0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
 574         0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
 575         0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
 576         0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
 577         0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
 578         0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
 579         0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
 580         0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
 581         0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
 582         0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
 583         0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
 584         0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
 585         0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
 586         0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
 587         0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
 588         0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
 589         0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
 590         0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
 591         0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
 592         0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
 593         0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
 594         0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
 595         0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
 596         0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
 597         0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
 598         0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
 599         0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
 600         0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, /* [64]+ are used for sha512 only */
 601         0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
 602         0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
 603         0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
 604         0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
 605         0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
 606         0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
 607         0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
 608 };
 609
 610 #undef Ch
 611 #undef Maj
 612 #undef S0
 613 #undef S1
 614 #undef R0
 615 #undef R1
 616
 617 static void FAST_FUNC sha256_process_block64(sha256_ctx_t *ctx)
 618 {
 619         unsigned t;
 620         uint32_t W[64], a, b, c, d, e, f, g, h;
 621         const uint32_t *words = (uint32_t*) ctx->wbuffer;
 622
 623         /* Operators defined in FIPS 180-2:4.1.2.  */
 624 #define Ch(x, y, z) ((x & y) ^ (~x & z))
 625 #define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
 626 #define S0(x) (rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22))
 627 #define S1(x) (rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25))
 628 #define R0(x) (rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3))
 629 #define R1(x) (rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10))
 630
 631         /* Compute the message schedule according to FIPS 180-2:6.2.2 step 2.  */
 632         for (t = 0; t < 16; ++t)
 633                 W[t] = SWAP_BE32(words[t]);
 634         for (/*t = 16*/; t < 64; ++t)
 635                 W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];
 636
 637         a = ctx->hash[0];
 638         b = ctx->hash[1];
 639         c = ctx->hash[2];
 640         d = ctx->hash[3];
 641         e = ctx->hash[4];
 642         f = ctx->hash[5];
 643         g = ctx->hash[6];
 644         h = ctx->hash[7];
 645
 646         /* The actual computation according to FIPS 180-2:6.2.2 step 3.  */
 647         for (t = 0; t < 64; ++t) {
 648                 /* Need to fetch upper half of sha_K[t]
 649                  * (I hope compiler is clever enough to just fetch
 650                  * upper half)
 651                  */
 652                 uint32_t K_t = sha_K[t] >> 32;
 653                 uint32_t T1 = h + S1(e) + Ch(e, f, g) + K_t + W[t];
 654                 uint32_t T2 = S0(a) + Maj(a, b, c);
 655                 h = g;
 656                 g = f;
 657                 f = e;
 658                 e = d + T1;
 659                 d = c;
 660                 c = b;
 661                 b = a;
 662                 a = T1 + T2;
 663         }
 664 #undef Ch
 665 #undef Maj
 666 #undef S0
 667 #undef S1
 668 #undef R0
 669 #undef R1
 670         /* Add the starting values of the context according to FIPS 180-2:6.2.2
 671            step 4.  */
 672         ctx->hash[0] += a;
 673         ctx->hash[1] += b;
 674         ctx->hash[2] += c;
 675         ctx->hash[3] += d;
 676         ctx->hash[4] += e;
 677         ctx->hash[5] += f;
 678         ctx->hash[6] += g;
 679         ctx->hash[7] += h;
 680 }
 681
 682 static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
 683 {
 684         unsigned t;
 685         uint64_t W[80];
 686         /* On i386, having assignments here (not later as sha256 does)
 687          * produces 99 bytes smaller code with gcc 4.3.1
 688          */
 689         uint64_t a = ctx->hash[0];
 690         uint64_t b = ctx->hash[1];
 691         uint64_t c = ctx->hash[2];
 692         uint64_t d = ctx->hash[3];
 693         uint64_t e = ctx->hash[4];
 694         uint64_t f = ctx->hash[5];
 695         uint64_t g = ctx->hash[6];
 696         uint64_t h = ctx->hash[7];
 697         const uint64_t *words = (uint64_t*) ctx->wbuffer;
 698
 699         /* Operators defined in FIPS 180-2:4.1.2.  */
 700 #define Ch(x, y, z) ((x & y) ^ (~x & z))
 701 #define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
 702 #define S0(x) (rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39))
 703 #define S1(x) (rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41))
 704 #define R0(x) (rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7))
 705 #define R1(x) (rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6))
 706
 707         /* Compute the message schedule according to FIPS 180-2:6.3.2 step 2.  */
 708         for (t = 0; t < 16; ++t)
 709                 W[t] = SWAP_BE64(words[t]);
 710         for (/*t = 16*/; t < 80; ++t)
 711                 W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];
 712
 713         /* The actual computation according to FIPS 180-2:6.3.2 step 3.  */
 714         for (t = 0; t < 80; ++t) {
 715                 uint64_t T1 = h + S1(e) + Ch(e, f, g) + sha_K[t] + W[t];
 716                 uint64_t T2 = S0(a) + Maj(a, b, c);
 717                 h = g;
 718                 g = f;
 719                 f = e;
 720                 e = d + T1;
 721                 d = c;
 722                 c = b;
 723                 b = a;
 724                 a = T1 + T2;
 725         }
 726 #undef Ch
 727 #undef Maj
 728 #undef S0
 729 #undef S1
 730 #undef R0
 731 #undef R1
 732         /* Add the starting values of the context according to FIPS 180-2:6.3.2
 733            step 4.  */
 734         ctx->hash[0] += a;
 735         ctx->hash[1] += b;
 736         ctx->hash[2] += c;
 737         ctx->hash[3] += d;
 738         ctx->hash[4] += e;
 739         ctx->hash[5] += f;
 740         ctx->hash[6] += g;
 741         ctx->hash[7] += h;
 742 }
 743
 744
 745 void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
 746 {
 747         ctx->hash[0] = 0x67452301;
 748         ctx->hash[1] = 0xefcdab89;
 749         ctx->hash[2] = 0x98badcfe;
 750         ctx->hash[3] = 0x10325476;
 751         ctx->hash[4] = 0xc3d2e1f0;
 752         ctx->total64 = 0;
 753         ctx->process_block = sha1_process_block64;
 754 }
 755
 756 static const uint32_t init256[] = {
 757         0,
 758         0,
 759         0x6a09e667,
 760         0xbb67ae85,
 761         0x3c6ef372,
 762         0xa54ff53a,
 763         0x510e527f,
 764         0x9b05688c,
 765         0x1f83d9ab,
 766         0x5be0cd19,
 767 };
 768 static const uint32_t init512_lo[] = {
 769         0,
 770         0,
 771         0xf3bcc908,
 772         0x84caa73b,
 773         0xfe94f82b,
 774         0x5f1d36f1,
 775         0xade682d1,
 776         0x2b3e6c1f,
 777         0xfb41bd6b,
 778         0x137e2179,
 779 };
 780
 781 /* Initialize structure containing state of computation.
 782    (FIPS 180-2:5.3.2)  */
 783 void FAST_FUNC sha256_begin(sha256_ctx_t *ctx)
 784 {
 785         memcpy(&ctx->total64, init256, sizeof(init256));
 786         /*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */
 787         ctx->process_block = sha256_process_block64;
 788 }
 789
 790 /* Initialize structure containing state of computation.
 791    (FIPS 180-2:5.3.3)  */
 792 void FAST_FUNC sha512_begin(sha512_ctx_t *ctx)
 793 {
 794         int i;
 795         /* Two extra iterations zero out ctx->total64[2] */
 796         uint64_t *tp = ctx->total64;
 797         for (i = 0; i < 2+8; i++)
 798                 tp[i] = ((uint64_t)(init256[i]) << 32) + init512_lo[i];
 799         /*ctx->total64[0] = ctx->total64[1] = 0; - already done */
 800 }
 801
 802 void FAST_FUNC sha512_hash(sha512_ctx_t *ctx, const void *buffer, size_t len)
 803 {
 804         unsigned bufpos = ctx->total64[0] & 127;
 805         unsigned remaining;
 806
 807         /* First increment the byte count.  FIPS 180-2 specifies the possible
 808            length of the file up to 2^128 _bits_.
 809            We compute the number of _bytes_ and convert to bits later.  */
 810         ctx->total64[0] += len;
 811         if (ctx->total64[0] < len)
 812                 ctx->total64[1]++;
 813 #if 0
 814         remaining = 128 - bufpos;
 815
 816         /* Hash whole blocks */
 817         while (len >= remaining) {
 818                 memcpy(ctx->wbuffer + bufpos, buffer, remaining);
 819                 buffer = (const char *)buffer + remaining;
 820                 len -= remaining;
 821                 remaining = 128;
 822                 bufpos = 0;
 823                 sha512_process_block128(ctx);
 824         }
 825
 826         /* Save last, partial blosk */
 827         memcpy(ctx->wbuffer + bufpos, buffer, len);
 828 #else
 829         while (1) {
 830                 remaining = 128 - bufpos;
 831                 if (remaining > len)
 832                         remaining = len;
 833                 /* Copy data into aligned buffer */
 834                 memcpy(ctx->wbuffer + bufpos, buffer, remaining);
 835                 len -= remaining;
 836                 buffer = (const char *)buffer + remaining;
 837                 bufpos += remaining;
 838                 /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
 839                 bufpos -= 128;
 840                 if (bufpos != 0)
 841                         break;
 842                 /* Buffer is filled up, process it */
 843                 sha512_process_block128(ctx);
 844                 /*bufpos = 0; - already is */
 845         }
 846 #endif
 847 }
 848
 849 /* Used also for sha256 */
 850 void FAST_FUNC sha1_end(sha1_ctx_t *ctx, void *resbuf)
 851 {
 852         unsigned hash_size;
 853
 854         /* SHA stores total in BE, need to swap on LE arches: */
 855         common64_end(ctx, /*swap_needed:*/ BB_LITTLE_ENDIAN);
 856
 857         hash_size = (ctx->process_block == sha1_process_block64) ? 5 : 8;
 858         /* This way we do not impose alignment constraints on resbuf: */
 859         if (BB_LITTLE_ENDIAN) {
 860                 unsigned i;
 861                 for (i = 0; i < hash_size; ++i)
 862                         ctx->hash[i] = SWAP_BE32(ctx->hash[i]);
 863         }
 864         memcpy(resbuf, ctx->hash, sizeof(ctx->hash[0]) * hash_size);
 865 }
 866
 867 void FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
 868 {
 869         unsigned bufpos = ctx->total64[0] & 127;
 870
 871         /* Pad the buffer to the next 128-byte boundary with 0x80,0,0,0... */
 872         ctx->wbuffer[bufpos++] = 0x80;
 873
 874         while (1) {
 875                 unsigned remaining = 128 - bufpos;
 876                 memset(ctx->wbuffer + bufpos, 0, remaining);
 877                 if (remaining >= 16) {
 878                         /* Store the 128-bit counter of bits in the buffer in BE format */
 879                         uint64_t t;
 880                         t = ctx->total64[0] << 3;
 881                         t = SWAP_BE64(t);
 882                         *(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 8]) = t;
 883                         t = (ctx->total64[1] << 3) | (ctx->total64[0] >> 61);
 884                         t = SWAP_BE64(t);
 885                         *(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 16]) = t;
 886                 }
 887                 sha512_process_block128(ctx);
 888                 if (remaining >= 16)
 889                         break;
 890                 bufpos = 0;
 891         }
 892
 893         if (BB_LITTLE_ENDIAN) {
 894                 unsigned i;
 895                 for (i = 0; i < ARRAY_SIZE(ctx->hash); ++i)
 896                         ctx->hash[i] = SWAP_BE64(ctx->hash[i]);
 897         }
 898         memcpy(resbuf, ctx->hash, sizeof(ctx->hash));
 899 }
 900
 901
 902 /*
 903  * The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
 904  * Michael Peeters and Gilles Van Assche. For more information, feedback or
 905  * questions, please refer to our website: http://keccak.noekeon.org/
 906  *
 907  * Implementation by Ronny Van Keer,
 908  * hereby denoted as "the implementer".
 909  *
 910  * To the extent possible under law, the implementer has waived all copyright
 911  * and related or neighboring rights to the source code in this file.
 912  * http://creativecommons.org/publicdomain/zero/1.0/
 913  *
 914  * Busybox modifications (C) Lauri Kasanen, under the GPLv2.
 915  */
 916
 917 #if CONFIG_SHA3_SMALL < 0
 918 # define SHA3_SMALL 0
 919 #elif CONFIG_SHA3_SMALL > 1
 920 # define SHA3_SMALL 1
 921 #else
 922 # define SHA3_SMALL CONFIG_SHA3_SMALL
 923 #endif
 924
 925 #define OPTIMIZE_SHA3_FOR_32 0
 926 /*
 927  * SHA3 can be optimized for 32-bit CPUs with bit-slicing:
 928  * every 64-bit word of state[] can be split into two 32-bit words
 929  * by even/odd bits. In this form, all rotations of sha3 round
 930  * are 32-bit - and there are lots of them.
 931  * However, it requires either splitting/combining state words
 932  * before/after sha3 round (code does this now)
 933  * or shuffling bits before xor'ing them into state and in sha3_end.
 934  * Without shuffling, bit-slicing results in -130 bytes of code
 935  * and marginal speedup (but of course it gives wrong result).
 936  * With shuffling it works, but +260 code bytes, and slower.
 937  * Disabled for now:
 938  */
 939 #if 0 /* LONG_MAX == 0x7fffffff */
 940 # undef OPTIMIZE_SHA3_FOR_32
 941 # define OPTIMIZE_SHA3_FOR_32 1
 942 #endif
 943
 944 #if OPTIMIZE_SHA3_FOR_32
 945 /* This splits every 64-bit word into a pair of 32-bit words,
 946  * even bits go into first word, odd bits go to second one.
 947  * The conversion is done in-place.
 948  */
 949 static void split_halves(uint64_t *state)
 950 {
 951         /* Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
 952         uint32_t *s32 = (uint32_t*)state;
 953         uint32_t t, x0, x1;
 954         int i;
 955         for (i = 24; i >= 0; --i) {
 956                 x0 = s32[0];
 957                 t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
 958                 t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
 959                 t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
 960                 t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
 961                 x1 = s32[1];
 962                 t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
 963                 t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
 964                 t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
 965                 t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
 966                 *s32++ = (x0 & 0x0000FFFF) | (x1 << 16);
 967                 *s32++ = (x0 >> 16) | (x1 & 0xFFFF0000);
 968         }
 969 }
 970 /* The reverse operation */
 971 static void combine_halves(uint64_t *state)
 972 {
 973         uint32_t *s32 = (uint32_t*)state;
 974         uint32_t t, x0, x1;
 975         int i;
 976         for (i = 24; i >= 0; --i) {
 977                 x0 = s32[0];
 978                 x1 = s32[1];
 979                 t = (x0 & 0x0000FFFF) | (x1 << 16);
 980                 x1 = (x0 >> 16) | (x1 & 0xFFFF0000);
 981                 x0 = t;
 982                 t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
 983                 t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
 984                 t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
 985                 t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
 986                 *s32++ = x0;
 987                 t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
 988                 t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
 989                 t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
 990                 t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
 991                 *s32++ = x1;
 992         }
 993 }
 994 #endif
 995
 996 /*
 997  * In the crypto literature this function is usually called Keccak-f().
 998  */
 999 static void sha3_process_block72(uint64_t *state)
1000 {
1001         enum { NROUNDS = 24 };
1002
1003 #if OPTIMIZE_SHA3_FOR_32
1004         /*
1005         static const uint32_t IOTA_CONST_0[NROUNDS] = {
1006                 0x00000001UL,
1007                 0x00000000UL,
1008                 0x00000000UL,
1009                 0x00000000UL,
1010                 0x00000001UL,
1011                 0x00000001UL,
1012                 0x00000001UL,
1013                 0x00000001UL,
1014                 0x00000000UL,
1015                 0x00000000UL,
1016                 0x00000001UL,
1017                 0x00000000UL,
1018                 0x00000001UL,
1019                 0x00000001UL,
1020                 0x00000001UL,
1021                 0x00000001UL,
1022                 0x00000000UL,
1023                 0x00000000UL,
1024                 0x00000000UL,
1025                 0x00000000UL,
1026                 0x00000001UL,
1027                 0x00000000UL,
1028                 0x00000001UL,
1029                 0x00000000UL,
1030         };
1031         ** bits are in lsb: 0101 0000 1111 0100 1111 0001
1032         */
1033         uint32_t IOTA_CONST_0bits = (uint32_t)(0x0050f4f1);
1034         static const uint32_t IOTA_CONST_1[NROUNDS] = {
1035                 0x00000000UL,
1036                 0x00000089UL,
1037                 0x8000008bUL,
1038                 0x80008080UL,
1039                 0x0000008bUL,
1040                 0x00008000UL,
1041                 0x80008088UL,
1042                 0x80000082UL,
1043                 0x0000000bUL,
1044                 0x0000000aUL,
1045                 0x00008082UL,
1046                 0x00008003UL,
1047                 0x0000808bUL,
1048                 0x8000000bUL,
1049                 0x8000008aUL,
1050                 0x80000081UL,
1051                 0x80000081UL,
1052                 0x80000008UL,
1053                 0x00000083UL,
1054                 0x80008003UL,
1055                 0x80008088UL,
1056                 0x80000088UL,
1057                 0x00008000UL,
1058                 0x80008082UL,
1059         };
1060
1061         uint32_t *const s32 = (uint32_t*)state;
1062         unsigned round;
1063
1064         split_halves(state);
1065
1066         for (round = 0; round < NROUNDS; round++) {
1067                 unsigned x;
1068
1069                 /* Theta */
1070                 {
1071                         uint32_t BC[20];
1072                         for (x = 0; x < 10; ++x) {
1073                                 BC[x+10] = BC[x] = s32[x]^s32[x+10]^s32[x+20]^s32[x+30]^s32[x+40];
1074                         }
1075                         for (x = 0; x < 10; x += 2) {
1076                                 uint32_t ta, tb;
1077                                 ta = BC[x+8] ^ rotl32(BC[x+3], 1);
1078                                 tb = BC[x+9] ^ BC[x+2];
1079                                 s32[x+0] ^= ta;
1080                                 s32[x+1] ^= tb;
1081                                 s32[x+10] ^= ta;
1082                                 s32[x+11] ^= tb;
1083                                 s32[x+20] ^= ta;
1084                                 s32[x+21] ^= tb;
1085                                 s32[x+30] ^= ta;
1086                                 s32[x+31] ^= tb;
1087                                 s32[x+40] ^= ta;
1088                                 s32[x+41] ^= tb;
1089                         }
1090                 }
1091                 /* RhoPi */
1092                 {
1093                         uint32_t t0a,t0b, t1a,t1b;
1094                         t1a = s32[1*2+0];
1095                         t1b = s32[1*2+1];
1096
1097 #define RhoPi(PI_LANE, ROT_CONST) \
1098         t0a = s32[PI_LANE*2+0];\
1099         t0b = s32[PI_LANE*2+1];\
1100         if (ROT_CONST & 1) {\
1101                 s32[PI_LANE*2+0] = rotl32(t1b, ROT_CONST/2+1);\
1102                 s32[PI_LANE*2+1] = ROT_CONST == 1 ? t1a : rotl32(t1a, ROT_CONST/2+0);\
1103         } else {\
1104                 s32[PI_LANE*2+0] = rotl32(t1a, ROT_CONST/2);\
1105                 s32[PI_LANE*2+1] = rotl32(t1b, ROT_CONST/2);\
1106         }\
1107         t1a = t0a; t1b = t0b;
1108
1109                         RhoPi(10, 1)
1110                         RhoPi( 7, 3)
1111                         RhoPi(11, 6)
1112                         RhoPi(17,10)
1113                         RhoPi(18,15)
1114                         RhoPi( 3,21)
1115                         RhoPi( 5,28)
1116                         RhoPi(16,36)
1117                         RhoPi( 8,45)
1118                         RhoPi(21,55)
1119                         RhoPi(24, 2)
1120                         RhoPi( 4,14)
1121                         RhoPi(15,27)
1122                         RhoPi(23,41)
1123                         RhoPi(19,56)
1124                         RhoPi(13, 8)
1125                         RhoPi(12,25)
1126                         RhoPi( 2,43)
1127                         RhoPi(20,62)
1128                         RhoPi(14,18)
1129                         RhoPi(22,39)
1130                         RhoPi( 9,61)
1131                         RhoPi( 6,20)
1132                         RhoPi( 1,44)
1133 #undef RhoPi
1134                 }
1135                 /* Chi */
1136                 for (x = 0; x <= 40;) {
1137                         uint32_t BC0, BC1, BC2, BC3, BC4;
1138                         BC0 = s32[x + 0*2];
1139                         BC1 = s32[x + 1*2];
1140                         BC2 = s32[x + 2*2];
1141                         s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1142                         BC3 = s32[x + 3*2];
1143                         s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1144                         BC4 = s32[x + 4*2];
1145                         s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1146                         s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1147                         s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1148                         x++;
1149                         BC0 = s32[x + 0*2];
1150                         BC1 = s32[x + 1*2];
1151                         BC2 = s32[x + 2*2];
1152                         s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1153                         BC3 = s32[x + 3*2];
1154                         s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1155                         BC4 = s32[x + 4*2];
1156                         s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1157                         s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1158                         s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1159                         x += 9;
1160                 }
1161                 /* Iota */
1162                 s32[0] ^= IOTA_CONST_0bits & 1;
1163                 IOTA_CONST_0bits >>= 1;
1164                 s32[1] ^= IOTA_CONST_1[round];
1165         }
1166
1167         combine_halves(state);
1168 #else
1169         /* Native 64-bit algorithm */
1170         static const uint16_t IOTA_CONST[NROUNDS] = {
1171                 /* Elements should be 64-bit, but top half is always zero
1172                  * or 0x80000000. We encode 63rd bits in a separate word below.
1173                  * Same is true for 31th bits, which lets us use 16-bit table
1174                  * instead of 64-bit. The speed penalty is lost in the noise.
1175                  */
1176                 0x0001,
1177                 0x8082,
1178                 0x808a,
1179                 0x8000,
1180                 0x808b,
1181                 0x0001,
1182                 0x8081,
1183                 0x8009,
1184                 0x008a,
1185                 0x0088,
1186                 0x8009,
1187                 0x000a,
1188                 0x808b,
1189                 0x008b,
1190                 0x8089,
1191                 0x8003,
1192                 0x8002,
1193                 0x0080,
1194                 0x800a,
1195                 0x000a,
1196                 0x8081,
1197                 0x8080,
1198                 0x0001,
1199                 0x8008,
1200         };
1201         /* bit for CONST[0] is in msb: 0011 0011 0000 0111 1101 1101 */
1202         const uint32_t IOTA_CONST_bit63 = (uint32_t)(0x3307dd00);
1203         /* bit for CONST[0] is in msb: 0001 0110 0011 1000 0001 1011 */
1204         const uint32_t IOTA_CONST_bit31 = (uint32_t)(0x16381b00);
1205
1206         static const uint8_t ROT_CONST[24] = {
1207                 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
1208                 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44,
1209         };
1210         static const uint8_t PI_LANE[24] = {
1211                 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
1212                 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1,
1213         };
1214         /*static const uint8_t MOD5[10] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/
1215
1216         unsigned x;
1217         unsigned round;
1218
1219         if (BB_BIG_ENDIAN) {
1220                 for (x = 0; x < 25; x++) {
1221                         state[x] = SWAP_LE64(state[x]);
1222                 }
1223         }
1224
1225         for (round = 0; round < NROUNDS; ++round) {
1226                 /* Theta */
1227                 {
1228                         uint64_t BC[10];
1229                         for (x = 0; x < 5; ++x) {
1230                                 BC[x + 5] = BC[x] = state[x]
1231                                         ^ state[x + 5] ^ state[x + 10]
1232                                         ^ state[x + 15] ^ state[x + 20];
1233                         }
1234                         /* Using 2x5 vector above eliminates the need to use
1235                          * BC[MOD5[x+N]] trick below to fetch BC[(x+N) % 5],
1236                          * and the code is a bit _smaller_.
1237                          */
1238                         for (x = 0; x < 5; ++x) {
1239                                 uint64_t temp = BC[x + 4] ^ rotl64(BC[x + 1], 1);
1240                                 state[x] ^= temp;
1241                                 state[x + 5] ^= temp;
1242                                 state[x + 10] ^= temp;
1243                                 state[x + 15] ^= temp;
1244                                 state[x + 20] ^= temp;
1245                         }
1246                 }
1247
1248                 /* Rho Pi */
1249                 if (SHA3_SMALL) {
1250                         uint64_t t1 = state[1];
1251                         for (x = 0; x < 24; ++x) {
1252                                 uint64_t t0 = state[PI_LANE[x]];
1253                                 state[PI_LANE[x]] = rotl64(t1, ROT_CONST[x]);
1254                                 t1 = t0;
1255                         }
1256                 } else {
1257                         /* Especially large benefit for 32-bit arch (75% faster):
1258                          * 64-bit rotations by non-constant usually are SLOW on those.
1259                          * We resort to unrolling here.
1260                          * This optimizes out PI_LANE[] and ROT_CONST[],
1261                          * but generates 300-500 more bytes of code.
1262                          */
1263                         uint64_t t0;
1264                         uint64_t t1 = state[1];
1265 #define RhoPi_twice(x) \
1266         t0 = state[PI_LANE[x  ]]; \
1267         state[PI_LANE[x  ]] = rotl64(t1, ROT_CONST[x  ]); \
1268         t1 = state[PI_LANE[x+1]]; \
1269         state[PI_LANE[x+1]] = rotl64(t0, ROT_CONST[x+1]);
1270                         RhoPi_twice(0); RhoPi_twice(2);
1271                         RhoPi_twice(4); RhoPi_twice(6);
1272                         RhoPi_twice(8); RhoPi_twice(10);
1273                         RhoPi_twice(12); RhoPi_twice(14);
1274                         RhoPi_twice(16); RhoPi_twice(18);
1275                         RhoPi_twice(20); RhoPi_twice(22);
1276 #undef RhoPi_twice
1277                 }
1278                 /* Chi */
1279 # if LONG_MAX > 0x7fffffff
1280                 for (x = 0; x <= 20; x += 5) {
1281                         uint64_t BC0, BC1, BC2, BC3, BC4;
1282                         BC0 = state[x + 0];
1283                         BC1 = state[x + 1];
1284                         BC2 = state[x + 2];
1285                         state[x + 0] = BC0 ^ ((~BC1) & BC2);
1286                         BC3 = state[x + 3];
1287                         state[x + 1] = BC1 ^ ((~BC2) & BC3);
1288                         BC4 = state[x + 4];
1289                         state[x + 2] = BC2 ^ ((~BC3) & BC4);
1290                         state[x + 3] = BC3 ^ ((~BC4) & BC0);
1291                         state[x + 4] = BC4 ^ ((~BC0) & BC1);
1292                 }
1293 # else
1294                 /* Reduced register pressure version
1295                  * for register-starved 32-bit arches
1296                  * (i386: -95 bytes, and it is _faster_)
1297                  */
1298                 for (x = 0; x <= 40;) {
1299                         uint32_t BC0, BC1, BC2, BC3, BC4;
1300                         uint32_t *const s32 = (uint32_t*)state;
1301 #  if SHA3_SMALL
1302  do_half:
1303 #  endif
1304                         BC0 = s32[x + 0*2];
1305                         BC1 = s32[x + 1*2];
1306                         BC2 = s32[x + 2*2];
1307                         s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1308                         BC3 = s32[x + 3*2];
1309                         s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1310                         BC4 = s32[x + 4*2];
1311                         s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1312                         s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1313                         s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1314                         x++;
1315 #  if SHA3_SMALL
1316                         if (x & 1)
1317                                 goto do_half;
1318                         x += 8;
1319 #  else
1320                         BC0 = s32[x + 0*2];
1321                         BC1 = s32[x + 1*2];
1322                         BC2 = s32[x + 2*2];
1323                         s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1324                         BC3 = s32[x + 3*2];
1325                         s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1326                         BC4 = s32[x + 4*2];
1327                         s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1328                         s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1329                         s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1330                         x += 9;
1331 #  endif
1332                 }
1333 # endif /* long is 32-bit */
1334                 /* Iota */
1335                 state[0] ^= IOTA_CONST[round]
1336                         | (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000)
1337                         | (uint64_t)((IOTA_CONST_bit63 << round) & 0x80000000) << 32;
1338         }
1339
1340         if (BB_BIG_ENDIAN) {
1341                 for (x = 0; x < 25; x++) {
1342                         state[x] = SWAP_LE64(state[x]);
1343                 }
1344         }
1345 #endif
1346 }
1347
1348 void FAST_FUNC sha3_begin(sha3_ctx_t *ctx)
1349 {
1350         memset(ctx, 0, sizeof(*ctx));
1351         /* SHA3-512, user can override */
1352         ctx->input_block_bytes = (1600 - 512*2) / 8; /* 72 bytes */
1353 }
1354
1355 void FAST_FUNC sha3_hash(sha3_ctx_t *ctx, const void *buffer, size_t len)
1356 {
1357 #if SHA3_SMALL
1358         const uint8_t *data = buffer;
1359         unsigned bufpos = ctx->bytes_queued;
1360
1361         while (1) {
1362                 unsigned remaining = ctx->input_block_bytes - bufpos;
1363                 if (remaining > len)
1364                         remaining = len;
1365                 len -= remaining;
1366                 /* XOR data into buffer */
1367                 while (remaining != 0) {
1368                         uint8_t *buf = (uint8_t*)ctx->state;
1369                         buf[bufpos] ^= *data++;
1370                         bufpos++;
1371                         remaining--;
1372                 }
1373                 /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
1374                 bufpos -= ctx->input_block_bytes;
1375                 if (bufpos != 0)
1376                         break;
1377                 /* Buffer is filled up, process it */
1378                 sha3_process_block72(ctx->state);
1379                 /*bufpos = 0; - already is */
1380         }
1381         ctx->bytes_queued = bufpos + ctx->input_block_bytes;
1382 #else
1383         /* +50 bytes code size, but a bit faster because of long-sized XORs */
1384         const uint8_t *data = buffer;
1385         unsigned bufpos = ctx->bytes_queued;
1386         unsigned iblk_bytes = ctx->input_block_bytes;
1387
1388         /* If already data in queue, continue queuing first */
1389         if (bufpos != 0) {
1390                 while (len != 0) {
1391                         uint8_t *buf = (uint8_t*)ctx->state;
1392                         buf[bufpos] ^= *data++;
1393                         len--;
1394                         bufpos++;
1395                         if (bufpos == iblk_bytes) {
1396                                 bufpos = 0;
1397                                 goto do_block;
1398                         }
1399                 }
1400         }
1401
1402         /* Absorb complete blocks */
1403         while (len >= iblk_bytes) {
1404                 /* XOR data onto beginning of state[].
1405                  * We try to be efficient - operate one word at a time, not byte.
1406                  * Careful wrt unaligned access: can't just use "*(long*)data"!
1407                  */
1408                 unsigned count = iblk_bytes / sizeof(long);
1409                 long *buf = (long*)ctx->state;
1410                 do {
1411                         long v;
1412                         move_from_unaligned_long(v, (long*)data);
1413                         *buf++ ^= v;
1414                         data += sizeof(long);
1415                 } while (--count);
1416                 len -= iblk_bytes;
1417  do_block:
1418                 sha3_process_block72(ctx->state);
1419         }
1420
1421         /* Queue remaining data bytes */
1422         while (len != 0) {
1423                 uint8_t *buf = (uint8_t*)ctx->state;
1424                 buf[bufpos] ^= *data++;
1425                 bufpos++;
1426                 len--;
1427         }
1428
1429         ctx->bytes_queued = bufpos;
1430 #endif
1431 }
1432
1433 void FAST_FUNC sha3_end(sha3_ctx_t *ctx, void *resbuf)
1434 {
1435         /* Padding */
1436         uint8_t *buf = (uint8_t*)ctx->state;
1437         /*
1438          * Keccak block padding is: add 1 bit after last bit of input,
1439          * then add zero bits until the end of block, and add the last 1 bit
1440          * (the last bit in the block) - the "10*1" pattern.
1441          * SHA3 standard appends additional two bits, 01,  before that padding:
1442          *
1443          * SHA3-224(M) = KECCAK[448](M||01, 224)
1444          * SHA3-256(M) = KECCAK[512](M||01, 256)
1445          * SHA3-384(M) = KECCAK[768](M||01, 384)
1446          * SHA3-512(M) = KECCAK[1024](M||01, 512)
1447          * (M is the input, || is bit concatenation)
1448          *
1449          * The 6 below contains 01 "SHA3" bits and the first 1 "Keccak" bit:
1450          */
1451         buf[ctx->bytes_queued]          ^= 6; /* bit pattern 00000110 */
1452         buf[ctx->input_block_bytes - 1] ^= 0x80;
1453
1454         sha3_process_block72(ctx->state);
1455
1456         /* Output */
1457         memcpy(resbuf, ctx->state, 64);
1458 }