libbb/hash_md5_sha.c

   1 /* vi: set sw=4 ts=4: */
   2 /*
   3  * Utility routines.
   4  *
   5  * Copyright (C) 2010 Denys Vlasenko
   6  *
   7  * Licensed under GPLv2 or later, see file LICENSE in this source tree.
   8  */
   9
  10 #include "libbb.h"
  11
  12 /* gcc 4.2.1 optimizes rotr64 better with inline than with macro
  13  * (for rotX32, there is no difference). Why? My guess is that
  14  * macro requires clever common subexpression elimination heuristics
  15  * in gcc, while inline basically forces it to happen.
  16  */
  17 //#define rotl32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
  18 static ALWAYS_INLINE uint32_t rotl32(uint32_t x, unsigned n)
  19 {
  20         return (x << n) | (x >> (32 - n));
  21 }
  22 //#define rotr32(x,n) (((x) >> (n)) | ((x) << (32 - (n))))
  23 static ALWAYS_INLINE uint32_t rotr32(uint32_t x, unsigned n)
  24 {
  25         return (x >> n) | (x << (32 - n));
  26 }
  27 /* rotr64 in needed for sha512 only: */
  28 //#define rotr64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))
  29 static ALWAYS_INLINE uint64_t rotr64(uint64_t x, unsigned n)
  30 {
  31         return (x >> n) | (x << (64 - n));
  32 }
  33
  34 /* rotl64 only used for sha3 currently */
  35 static ALWAYS_INLINE uint64_t rotl64(uint64_t x, unsigned n)
  36 {
  37         return (x << n) | (x >> (64 - n));
  38 }
  39
  40 /* Feed data through a temporary buffer.
  41  * The internal buffer remembers previous data until it has 64
  42  * bytes worth to pass on.
  43  */
  44 static void FAST_FUNC common64_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
  45 {
  46         unsigned bufpos = ctx->total64 & 63;
  47
  48         ctx->total64 += len;
  49
  50         while (1) {
  51                 unsigned remaining = 64 - bufpos;
  52                 if (remaining > len)
  53                         remaining = len;
  54                 /* Copy data into aligned buffer */
  55                 memcpy(ctx->wbuffer + bufpos, buffer, remaining);
  56                 len -= remaining;
  57                 buffer = (const char *)buffer + remaining;
  58                 bufpos += remaining;
  59                 /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
  60                 bufpos -= 64;
  61                 if (bufpos != 0)
  62                         break;
  63                 /* Buffer is filled up, process it */
  64                 ctx->process_block(ctx);
  65                 /*bufpos = 0; - already is */
  66         }
  67 }
  68
  69 /* Process the remaining bytes in the buffer */
  70 static void FAST_FUNC common64_end(md5_ctx_t *ctx, int swap_needed)
  71 {
  72         unsigned bufpos = ctx->total64 & 63;
  73         /* Pad the buffer to the next 64-byte boundary with 0x80,0,0,0... */
  74         ctx->wbuffer[bufpos++] = 0x80;
  75
  76         /* This loop iterates either once or twice, no more, no less */
  77         while (1) {
  78                 unsigned remaining = 64 - bufpos;
  79                 memset(ctx->wbuffer + bufpos, 0, remaining);
  80                 /* Do we have enough space for the length count? */
  81                 if (remaining >= 8) {
  82                         /* Store the 64-bit counter of bits in the buffer */
  83                         uint64_t t = ctx->total64 << 3;
  84                         if (swap_needed)
  85                                 t = bb_bswap_64(t);
  86                         /* wbuffer is suitably aligned for this */
  87                         *(bb__aliased_uint64_t *) (&ctx->wbuffer[64 - 8]) = t;
  88                 }
  89                 ctx->process_block(ctx);
  90                 if (remaining >= 8)
  91                         break;
  92                 bufpos = 0;
  93         }
  94 }
  95
  96
  97 /*
  98  * Compute MD5 checksum of strings according to the
  99  * definition of MD5 in RFC 1321 from April 1992.
 100  *
 101  * Written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
 102  *
 103  * Copyright (C) 1995-1999 Free Software Foundation, Inc.
 104  * Copyright (C) 2001 Manuel Novoa III
 105  * Copyright (C) 2003 Glenn L. McGrath
 106  * Copyright (C) 2003 Erik Andersen
 107  *
 108  * Licensed under GPLv2 or later, see file LICENSE in this source tree.
 109  */
 110
 111 /* 0: fastest, 3: smallest */
 112 #if CONFIG_MD5_SMALL < 0
 113 # define MD5_SMALL 0
 114 #elif CONFIG_MD5_SMALL > 3
 115 # define MD5_SMALL 3
 116 #else
 117 # define MD5_SMALL CONFIG_MD5_SMALL
 118 #endif
 119
 120 /* These are the four functions used in the four steps of the MD5 algorithm
 121  * and defined in the RFC 1321.  The first function is a little bit optimized
 122  * (as found in Colin Plumbs public domain implementation).
 123  * #define FF(b, c, d) ((b & c) | (~b & d))
 124  */
 125 #undef FF
 126 #undef FG
 127 #undef FH
 128 #undef FI
 129 #define FF(b, c, d) (d ^ (b & (c ^ d)))
 130 #define FG(b, c, d) FF(d, b, c)
 131 #define FH(b, c, d) (b ^ c ^ d)
 132 #define FI(b, c, d) (c ^ (b | ~d))
 133
 134 /* Hash a single block, 64 bytes long and 4-byte aligned */
 135 static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
 136 {
 137 #if MD5_SMALL > 0
 138         /* Before we start, one word to the strange constants.
 139            They are defined in RFC 1321 as
 140            T[i] = (int)(4294967296.0 * fabs(sin(i))), i=1..64
 141          */
 142         static const uint32_t C_array[] = {
 143                 /* round 1 */
 144                 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
 145                 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
 146                 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
 147                 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
 148                 /* round 2 */
 149                 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
 150                 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
 151                 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
 152                 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
 153                 /* round 3 */
 154                 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
 155                 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
 156                 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x4881d05,
 157                 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
 158                 /* round 4 */
 159                 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
 160                 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
 161                 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
 162                 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
 163         };
 164         static const char P_array[] ALIGN1 = {
 165 # if MD5_SMALL > 1
 166                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /* 1 */
 167 # endif
 168                 1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, /* 2 */
 169                 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, /* 3 */
 170                 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9  /* 4 */
 171         };
 172 #endif
 173         uint32_t *words = (void*) ctx->wbuffer;
 174         uint32_t A = ctx->hash[0];
 175         uint32_t B = ctx->hash[1];
 176         uint32_t C = ctx->hash[2];
 177         uint32_t D = ctx->hash[3];
 178
 179 #if MD5_SMALL >= 2  /* 2 or 3 */
 180
 181         static const char S_array[] ALIGN1 = {
 182                 7, 12, 17, 22,
 183                 5, 9, 14, 20,
 184                 4, 11, 16, 23,
 185                 6, 10, 15, 21
 186         };
 187         const uint32_t *pc;
 188         const char *pp;
 189         const char *ps;
 190         int i;
 191         uint32_t temp;
 192
 193         if (BB_BIG_ENDIAN)
 194                 for (i = 0; i < 16; i++)
 195                         words[i] = SWAP_LE32(words[i]);
 196
 197 # if MD5_SMALL == 3
 198         pc = C_array;
 199         pp = P_array;
 200         ps = S_array - 4;
 201
 202         for (i = 0; i < 64; i++) {
 203                 if ((i & 0x0f) == 0)
 204                         ps += 4;
 205                 temp = A;
 206                 switch (i >> 4) {
 207                 case 0:
 208                         temp += FF(B, C, D);
 209                         break;
 210                 case 1:
 211                         temp += FG(B, C, D);
 212                         break;
 213                 case 2:
 214                         temp += FH(B, C, D);
 215                         break;
 216                 case 3:
 217                         temp += FI(B, C, D);
 218                 }
 219                 temp += words[(int) (*pp++)] + *pc++;
 220                 temp = rotl32(temp, ps[i & 3]);
 221                 temp += B;
 222                 A = D;
 223                 D = C;
 224                 C = B;
 225                 B = temp;
 226         }
 227 # else  /* MD5_SMALL == 2 */
 228         pc = C_array;
 229         pp = P_array;
 230         ps = S_array;
 231
 232         for (i = 0; i < 16; i++) {
 233                 temp = A + FF(B, C, D) + words[(int) (*pp++)] + *pc++;
 234                 temp = rotl32(temp, ps[i & 3]);
 235                 temp += B;
 236                 A = D;
 237                 D = C;
 238                 C = B;
 239                 B = temp;
 240         }
 241         ps += 4;
 242         for (i = 0; i < 16; i++) {
 243                 temp = A + FG(B, C, D) + words[(int) (*pp++)] + *pc++;
 244                 temp = rotl32(temp, ps[i & 3]);
 245                 temp += B;
 246                 A = D;
 247                 D = C;
 248                 C = B;
 249                 B = temp;
 250         }
 251         ps += 4;
 252         for (i = 0; i < 16; i++) {
 253                 temp = A + FH(B, C, D) + words[(int) (*pp++)] + *pc++;
 254                 temp = rotl32(temp, ps[i & 3]);
 255                 temp += B;
 256                 A = D;
 257                 D = C;
 258                 C = B;
 259                 B = temp;
 260         }
 261         ps += 4;
 262         for (i = 0; i < 16; i++) {
 263                 temp = A + FI(B, C, D) + words[(int) (*pp++)] + *pc++;
 264                 temp = rotl32(temp, ps[i & 3]);
 265                 temp += B;
 266                 A = D;
 267                 D = C;
 268                 C = B;
 269                 B = temp;
 270         }
 271 # endif
 272         /* Add checksum to the starting values */
 273         ctx->hash[0] += A;
 274         ctx->hash[1] += B;
 275         ctx->hash[2] += C;
 276         ctx->hash[3] += D;
 277
 278 #else  /* MD5_SMALL == 0 or 1 */
 279
 280         uint32_t A_save = A;
 281         uint32_t B_save = B;
 282         uint32_t C_save = C;
 283         uint32_t D_save = D;
 284 # if MD5_SMALL == 1
 285         const uint32_t *pc;
 286         const char *pp;
 287         int i;
 288 # endif
 289
 290         /* First round: using the given function, the context and a constant
 291            the next context is computed.  Because the algorithm's processing
 292            unit is a 32-bit word and it is determined to work on words in
 293            little endian byte order we perhaps have to change the byte order
 294            before the computation.  To reduce the work for the next steps
 295            we save swapped words in WORDS array.  */
 296 # undef OP
 297 # define OP(a, b, c, d, s, T) \
 298         do { \
 299                 a += FF(b, c, d) + (*words IF_BIG_ENDIAN(= SWAP_LE32(*words))) + T; \
 300                 words++; \
 301                 a = rotl32(a, s); \
 302                 a += b; \
 303         } while (0)
 304
 305         /* Round 1 */
 306 # if MD5_SMALL == 1
 307         pc = C_array;
 308         for (i = 0; i < 4; i++) {
 309                 OP(A, B, C, D, 7, *pc++);
 310                 OP(D, A, B, C, 12, *pc++);
 311                 OP(C, D, A, B, 17, *pc++);
 312                 OP(B, C, D, A, 22, *pc++);
 313         }
 314 # else
 315         OP(A, B, C, D, 7, 0xd76aa478);
 316         OP(D, A, B, C, 12, 0xe8c7b756);
 317         OP(C, D, A, B, 17, 0x242070db);
 318         OP(B, C, D, A, 22, 0xc1bdceee);
 319         OP(A, B, C, D, 7, 0xf57c0faf);
 320         OP(D, A, B, C, 12, 0x4787c62a);
 321         OP(C, D, A, B, 17, 0xa8304613);
 322         OP(B, C, D, A, 22, 0xfd469501);
 323         OP(A, B, C, D, 7, 0x698098d8);
 324         OP(D, A, B, C, 12, 0x8b44f7af);
 325         OP(C, D, A, B, 17, 0xffff5bb1);
 326         OP(B, C, D, A, 22, 0x895cd7be);
 327         OP(A, B, C, D, 7, 0x6b901122);
 328         OP(D, A, B, C, 12, 0xfd987193);
 329         OP(C, D, A, B, 17, 0xa679438e);
 330         OP(B, C, D, A, 22, 0x49b40821);
 331 # endif
 332         words -= 16;
 333
 334         /* For the second to fourth round we have the possibly swapped words
 335            in WORDS.  Redefine the macro to take an additional first
 336            argument specifying the function to use.  */
 337 # undef OP
 338 # define OP(f, a, b, c, d, k, s, T) \
 339         do { \
 340                 a += f(b, c, d) + words[k] + T; \
 341                 a = rotl32(a, s); \
 342                 a += b; \
 343         } while (0)
 344
 345         /* Round 2 */
 346 # if MD5_SMALL == 1
 347         pp = P_array;
 348         for (i = 0; i < 4; i++) {
 349                 OP(FG, A, B, C, D, (int) (*pp++), 5, *pc++);
 350                 OP(FG, D, A, B, C, (int) (*pp++), 9, *pc++);
 351                 OP(FG, C, D, A, B, (int) (*pp++), 14, *pc++);
 352                 OP(FG, B, C, D, A, (int) (*pp++), 20, *pc++);
 353         }
 354 # else
 355         OP(FG, A, B, C, D, 1, 5, 0xf61e2562);
 356         OP(FG, D, A, B, C, 6, 9, 0xc040b340);
 357         OP(FG, C, D, A, B, 11, 14, 0x265e5a51);
 358         OP(FG, B, C, D, A, 0, 20, 0xe9b6c7aa);
 359         OP(FG, A, B, C, D, 5, 5, 0xd62f105d);
 360         OP(FG, D, A, B, C, 10, 9, 0x02441453);
 361         OP(FG, C, D, A, B, 15, 14, 0xd8a1e681);
 362         OP(FG, B, C, D, A, 4, 20, 0xe7d3fbc8);
 363         OP(FG, A, B, C, D, 9, 5, 0x21e1cde6);
 364         OP(FG, D, A, B, C, 14, 9, 0xc33707d6);
 365         OP(FG, C, D, A, B, 3, 14, 0xf4d50d87);
 366         OP(FG, B, C, D, A, 8, 20, 0x455a14ed);
 367         OP(FG, A, B, C, D, 13, 5, 0xa9e3e905);
 368         OP(FG, D, A, B, C, 2, 9, 0xfcefa3f8);
 369         OP(FG, C, D, A, B, 7, 14, 0x676f02d9);
 370         OP(FG, B, C, D, A, 12, 20, 0x8d2a4c8a);
 371 # endif
 372
 373         /* Round 3 */
 374 # if MD5_SMALL == 1
 375         for (i = 0; i < 4; i++) {
 376                 OP(FH, A, B, C, D, (int) (*pp++), 4, *pc++);
 377                 OP(FH, D, A, B, C, (int) (*pp++), 11, *pc++);
 378                 OP(FH, C, D, A, B, (int) (*pp++), 16, *pc++);
 379                 OP(FH, B, C, D, A, (int) (*pp++), 23, *pc++);
 380         }
 381 # else
 382         OP(FH, A, B, C, D, 5, 4, 0xfffa3942);
 383         OP(FH, D, A, B, C, 8, 11, 0x8771f681);
 384         OP(FH, C, D, A, B, 11, 16, 0x6d9d6122);
 385         OP(FH, B, C, D, A, 14, 23, 0xfde5380c);
 386         OP(FH, A, B, C, D, 1, 4, 0xa4beea44);
 387         OP(FH, D, A, B, C, 4, 11, 0x4bdecfa9);
 388         OP(FH, C, D, A, B, 7, 16, 0xf6bb4b60);
 389         OP(FH, B, C, D, A, 10, 23, 0xbebfbc70);
 390         OP(FH, A, B, C, D, 13, 4, 0x289b7ec6);
 391         OP(FH, D, A, B, C, 0, 11, 0xeaa127fa);
 392         OP(FH, C, D, A, B, 3, 16, 0xd4ef3085);
 393         OP(FH, B, C, D, A, 6, 23, 0x04881d05);
 394         OP(FH, A, B, C, D, 9, 4, 0xd9d4d039);
 395         OP(FH, D, A, B, C, 12, 11, 0xe6db99e5);
 396         OP(FH, C, D, A, B, 15, 16, 0x1fa27cf8);
 397         OP(FH, B, C, D, A, 2, 23, 0xc4ac5665);
 398 # endif
 399
 400         /* Round 4 */
 401 # if MD5_SMALL == 1
 402         for (i = 0; i < 4; i++) {
 403                 OP(FI, A, B, C, D, (int) (*pp++), 6, *pc++);
 404                 OP(FI, D, A, B, C, (int) (*pp++), 10, *pc++);
 405                 OP(FI, C, D, A, B, (int) (*pp++), 15, *pc++);
 406                 OP(FI, B, C, D, A, (int) (*pp++), 21, *pc++);
 407         }
 408 # else
 409         OP(FI, A, B, C, D, 0, 6, 0xf4292244);
 410         OP(FI, D, A, B, C, 7, 10, 0x432aff97);
 411         OP(FI, C, D, A, B, 14, 15, 0xab9423a7);
 412         OP(FI, B, C, D, A, 5, 21, 0xfc93a039);
 413         OP(FI, A, B, C, D, 12, 6, 0x655b59c3);
 414         OP(FI, D, A, B, C, 3, 10, 0x8f0ccc92);
 415         OP(FI, C, D, A, B, 10, 15, 0xffeff47d);
 416         OP(FI, B, C, D, A, 1, 21, 0x85845dd1);
 417         OP(FI, A, B, C, D, 8, 6, 0x6fa87e4f);
 418         OP(FI, D, A, B, C, 15, 10, 0xfe2ce6e0);
 419         OP(FI, C, D, A, B, 6, 15, 0xa3014314);
 420         OP(FI, B, C, D, A, 13, 21, 0x4e0811a1);
 421         OP(FI, A, B, C, D, 4, 6, 0xf7537e82);
 422         OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
 423         OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
 424         OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
 425 # undef OP
 426 # endif
 427         /* Add checksum to the starting values */
 428         ctx->hash[0] = A_save + A;
 429         ctx->hash[1] = B_save + B;
 430         ctx->hash[2] = C_save + C;
 431         ctx->hash[3] = D_save + D;
 432 #endif
 433 }
 434 #undef FF
 435 #undef FG
 436 #undef FH
 437 #undef FI
 438
 439 /* Initialize structure containing state of computation.
 440  * (RFC 1321, 3.3: Step 3)
 441  */
 442 void FAST_FUNC md5_begin(md5_ctx_t *ctx)
 443 {
 444         ctx->hash[0] = 0x67452301;
 445         ctx->hash[1] = 0xefcdab89;
 446         ctx->hash[2] = 0x98badcfe;
 447         ctx->hash[3] = 0x10325476;
 448         ctx->total64 = 0;
 449         ctx->process_block = md5_process_block64;
 450 }
 451
 452 /* Used also for sha1 and sha256 */
 453 void FAST_FUNC md5_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
 454 {
 455         common64_hash(ctx, buffer, len);
 456 }
 457
 458 /* Process the remaining bytes in the buffer and put result from CTX
 459  * in first 16 bytes following RESBUF.  The result is always in little
 460  * endian byte order, so that a byte-wise output yields to the wanted
 461  * ASCII representation of the message digest.
 462  */
 463 void FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
 464 {
 465         /* MD5 stores total in LE, need to swap on BE arches: */
 466         common64_end(ctx, /*swap_needed:*/ BB_BIG_ENDIAN);
 467
 468         /* The MD5 result is in little endian byte order */
 469         if (BB_BIG_ENDIAN) {
 470                 ctx->hash[0] = SWAP_LE32(ctx->hash[0]);
 471                 ctx->hash[1] = SWAP_LE32(ctx->hash[1]);
 472                 ctx->hash[2] = SWAP_LE32(ctx->hash[2]);
 473                 ctx->hash[3] = SWAP_LE32(ctx->hash[3]);
 474         }
 475
 476         memcpy(resbuf, ctx->hash, sizeof(ctx->hash[0]) * 4);
 477 }
 478
 479
 480 /*
 481  * SHA1 part is:
 482  * Copyright 2007 Rob Landley <rob@landley.net>
 483  *
 484  * Based on the public domain SHA-1 in C by Steve Reid <steve@edmweb.com>
 485  * from http://www.mirrors.wiretapped.net/security/cryptography/hashes/sha1/
 486  *
 487  * Licensed under GPLv2, see file LICENSE in this source tree.
 488  *
 489  * ---------------------------------------------------------------------------
 490  *
 491  * SHA256 and SHA512 parts are:
 492  * Released into the Public Domain by Ulrich Drepper <drepper@redhat.com>.
 493  * Shrank by Denys Vlasenko.
 494  *
 495  * ---------------------------------------------------------------------------
 496  *
 497  * The best way to test random blocksizes is to go to coreutils/md5_sha1_sum.c
 498  * and replace "4096" with something like "2000 + time(NULL) % 2097",
 499  * then rebuild and compare "shaNNNsum bigfile" results.
 500  */
 501
 502 static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 503 {
 504         static const uint32_t rconsts[] = {
 505                 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
 506         };
 507         int i, j;
 508         int cnt;
 509         uint32_t W[16+16];
 510         uint32_t a, b, c, d, e;
 511
 512         /* On-stack work buffer frees up one register in the main loop
 513          * which otherwise will be needed to hold ctx pointer */
 514         for (i = 0; i < 16; i++)
 515                 W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
 516
 517         a = ctx->hash[0];
 518         b = ctx->hash[1];
 519         c = ctx->hash[2];
 520         d = ctx->hash[3];
 521         e = ctx->hash[4];
 522
 523         /* 4 rounds of 20 operations each */
 524         cnt = 0;
 525         for (i = 0; i < 4; i++) {
 526                 j = 19;
 527                 do {
 528                         uint32_t work;
 529
 530                         work = c ^ d;
 531                         if (i == 0) {
 532                                 work = (work & b) ^ d;
 533                                 if (j <= 3)
 534                                         goto ge16;
 535                                 /* Used to do SWAP_BE32 here, but this
 536                                  * requires ctx (see comment above) */
 537                                 work += W[cnt];
 538                         } else {
 539                                 if (i == 2)
 540                                         work = ((b | c) & d) | (b & c);
 541                                 else /* i = 1 or 3 */
 542                                         work ^= b;
 543  ge16:
 544                                 W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);
 545                                 work += W[cnt];
 546                         }
 547                         work += e + rotl32(a, 5) + rconsts[i];
 548
 549                         /* Rotate by one for next time */
 550                         e = d;
 551                         d = c;
 552                         c = /* b = */ rotl32(b, 30);
 553                         b = a;
 554                         a = work;
 555                         cnt = (cnt + 1) & 15;
 556                 } while (--j >= 0);
 557         }
 558
 559         ctx->hash[0] += a;
 560         ctx->hash[1] += b;
 561         ctx->hash[2] += c;
 562         ctx->hash[3] += d;
 563         ctx->hash[4] += e;
 564 }
 565
 566 /* Constants for SHA512 from FIPS 180-2:4.2.3.
 567  * SHA256 constants from FIPS 180-2:4.2.2
 568  * are the most significant half of first 64 elements
 569  * of the same array.
 570  */
 571 static const uint64_t sha_K[80] = {
 572         0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
 573         0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
 574         0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
 575         0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
 576         0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
 577         0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
 578         0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
 579         0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
 580         0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
 581         0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
 582         0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
 583         0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
 584         0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
 585         0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
 586         0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
 587         0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
 588         0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
 589         0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
 590         0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
 591         0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
 592         0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
 593         0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
 594         0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
 595         0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
 596         0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
 597         0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
 598         0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
 599         0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
 600         0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
 601         0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
 602         0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
 603         0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
 604         0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, /* [64]+ are used for sha512 only */
 605         0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
 606         0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
 607         0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
 608         0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
 609         0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
 610         0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
 611         0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
 612 };
 613
 614 #undef Ch
 615 #undef Maj
 616 #undef S0
 617 #undef S1
 618 #undef R0
 619 #undef R1
 620
 621 static void FAST_FUNC sha256_process_block64(sha256_ctx_t *ctx)
 622 {
 623         unsigned t;
 624         uint32_t W[64], a, b, c, d, e, f, g, h;
 625         const uint32_t *words = (uint32_t*) ctx->wbuffer;
 626
 627         /* Operators defined in FIPS 180-2:4.1.2.  */
 628 #define Ch(x, y, z) ((x & y) ^ (~x & z))
 629 #define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
 630 #define S0(x) (rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22))
 631 #define S1(x) (rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25))
 632 #define R0(x) (rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3))
 633 #define R1(x) (rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10))
 634
 635         /* Compute the message schedule according to FIPS 180-2:6.2.2 step 2.  */
 636         for (t = 0; t < 16; ++t)
 637                 W[t] = SWAP_BE32(words[t]);
 638         for (/*t = 16*/; t < 64; ++t)
 639                 W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];
 640
 641         a = ctx->hash[0];
 642         b = ctx->hash[1];
 643         c = ctx->hash[2];
 644         d = ctx->hash[3];
 645         e = ctx->hash[4];
 646         f = ctx->hash[5];
 647         g = ctx->hash[6];
 648         h = ctx->hash[7];
 649
 650         /* The actual computation according to FIPS 180-2:6.2.2 step 3.  */
 651         for (t = 0; t < 64; ++t) {
 652                 /* Need to fetch upper half of sha_K[t]
 653                  * (I hope compiler is clever enough to just fetch
 654                  * upper half)
 655                  */
 656                 uint32_t K_t = sha_K[t] >> 32;
 657                 uint32_t T1 = h + S1(e) + Ch(e, f, g) + K_t + W[t];
 658                 uint32_t T2 = S0(a) + Maj(a, b, c);
 659                 h = g;
 660                 g = f;
 661                 f = e;
 662                 e = d + T1;
 663                 d = c;
 664                 c = b;
 665                 b = a;
 666                 a = T1 + T2;
 667         }
 668 #undef Ch
 669 #undef Maj
 670 #undef S0
 671 #undef S1
 672 #undef R0
 673 #undef R1
 674         /* Add the starting values of the context according to FIPS 180-2:6.2.2
 675            step 4.  */
 676         ctx->hash[0] += a;
 677         ctx->hash[1] += b;
 678         ctx->hash[2] += c;
 679         ctx->hash[3] += d;
 680         ctx->hash[4] += e;
 681         ctx->hash[5] += f;
 682         ctx->hash[6] += g;
 683         ctx->hash[7] += h;
 684 }
 685
 686 static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
 687 {
 688         unsigned t;
 689         uint64_t W[80];
 690         /* On i386, having assignments here (not later as sha256 does)
 691          * produces 99 bytes smaller code with gcc 4.3.1
 692          */
 693         uint64_t a = ctx->hash[0];
 694         uint64_t b = ctx->hash[1];
 695         uint64_t c = ctx->hash[2];
 696         uint64_t d = ctx->hash[3];
 697         uint64_t e = ctx->hash[4];
 698         uint64_t f = ctx->hash[5];
 699         uint64_t g = ctx->hash[6];
 700         uint64_t h = ctx->hash[7];
 701         const uint64_t *words = (uint64_t*) ctx->wbuffer;
 702
 703         /* Operators defined in FIPS 180-2:4.1.2.  */
 704 #define Ch(x, y, z) ((x & y) ^ (~x & z))
 705 #define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
 706 #define S0(x) (rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39))
 707 #define S1(x) (rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41))
 708 #define R0(x) (rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7))
 709 #define R1(x) (rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6))
 710
 711         /* Compute the message schedule according to FIPS 180-2:6.3.2 step 2.  */
 712         for (t = 0; t < 16; ++t)
 713                 W[t] = SWAP_BE64(words[t]);
 714         for (/*t = 16*/; t < 80; ++t)
 715                 W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];
 716
 717         /* The actual computation according to FIPS 180-2:6.3.2 step 3.  */
 718         for (t = 0; t < 80; ++t) {
 719                 uint64_t T1 = h + S1(e) + Ch(e, f, g) + sha_K[t] + W[t];
 720                 uint64_t T2 = S0(a) + Maj(a, b, c);
 721                 h = g;
 722                 g = f;
 723                 f = e;
 724                 e = d + T1;
 725                 d = c;
 726                 c = b;
 727                 b = a;
 728                 a = T1 + T2;
 729         }
 730 #undef Ch
 731 #undef Maj
 732 #undef S0
 733 #undef S1
 734 #undef R0
 735 #undef R1
 736         /* Add the starting values of the context according to FIPS 180-2:6.3.2
 737            step 4.  */
 738         ctx->hash[0] += a;
 739         ctx->hash[1] += b;
 740         ctx->hash[2] += c;
 741         ctx->hash[3] += d;
 742         ctx->hash[4] += e;
 743         ctx->hash[5] += f;
 744         ctx->hash[6] += g;
 745         ctx->hash[7] += h;
 746 }
 747
 748
 749 void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
 750 {
 751         ctx->hash[0] = 0x67452301;
 752         ctx->hash[1] = 0xefcdab89;
 753         ctx->hash[2] = 0x98badcfe;
 754         ctx->hash[3] = 0x10325476;
 755         ctx->hash[4] = 0xc3d2e1f0;
 756         ctx->total64 = 0;
 757         ctx->process_block = sha1_process_block64;
 758 }
 759
 760 static const uint32_t init256[] = {
 761         0,
 762         0,
 763         0x6a09e667,
 764         0xbb67ae85,
 765         0x3c6ef372,
 766         0xa54ff53a,
 767         0x510e527f,
 768         0x9b05688c,
 769         0x1f83d9ab,
 770         0x5be0cd19,
 771 };
 772 static const uint32_t init512_lo[] = {
 773         0,
 774         0,
 775         0xf3bcc908,
 776         0x84caa73b,
 777         0xfe94f82b,
 778         0x5f1d36f1,
 779         0xade682d1,
 780         0x2b3e6c1f,
 781         0xfb41bd6b,
 782         0x137e2179,
 783 };
 784
 785 /* Initialize structure containing state of computation.
 786    (FIPS 180-2:5.3.2)  */
 787 void FAST_FUNC sha256_begin(sha256_ctx_t *ctx)
 788 {
 789         memcpy(&ctx->total64, init256, sizeof(init256));
 790         /*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */
 791         ctx->process_block = sha256_process_block64;
 792 }
 793
 794 /* Initialize structure containing state of computation.
 795    (FIPS 180-2:5.3.3)  */
 796 void FAST_FUNC sha512_begin(sha512_ctx_t *ctx)
 797 {
 798         int i;
 799         /* Two extra iterations zero out ctx->total64[2] */
 800         uint64_t *tp = ctx->total64;
 801         for (i = 0; i < 2+8; i++)
 802                 tp[i] = ((uint64_t)(init256[i]) << 32) + init512_lo[i];
 803         /*ctx->total64[0] = ctx->total64[1] = 0; - already done */
 804 }
 805
 806 void FAST_FUNC sha512_hash(sha512_ctx_t *ctx, const void *buffer, size_t len)
 807 {
 808         unsigned bufpos = ctx->total64[0] & 127;
 809         unsigned remaining;
 810
 811         /* First increment the byte count.  FIPS 180-2 specifies the possible
 812            length of the file up to 2^128 _bits_.
 813            We compute the number of _bytes_ and convert to bits later.  */
 814         ctx->total64[0] += len;
 815         if (ctx->total64[0] < len)
 816                 ctx->total64[1]++;
 817 #if 0
 818         remaining = 128 - bufpos;
 819
 820         /* Hash whole blocks */
 821         while (len >= remaining) {
 822                 memcpy(ctx->wbuffer + bufpos, buffer, remaining);
 823                 buffer = (const char *)buffer + remaining;
 824                 len -= remaining;
 825                 remaining = 128;
 826                 bufpos = 0;
 827                 sha512_process_block128(ctx);
 828         }
 829
 830         /* Save last, partial blosk */
 831         memcpy(ctx->wbuffer + bufpos, buffer, len);
 832 #else
 833         while (1) {
 834                 remaining = 128 - bufpos;
 835                 if (remaining > len)
 836                         remaining = len;
 837                 /* Copy data into aligned buffer */
 838                 memcpy(ctx->wbuffer + bufpos, buffer, remaining);
 839                 len -= remaining;
 840                 buffer = (const char *)buffer + remaining;
 841                 bufpos += remaining;
 842                 /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
 843                 bufpos -= 128;
 844                 if (bufpos != 0)
 845                         break;
 846                 /* Buffer is filled up, process it */
 847                 sha512_process_block128(ctx);
 848                 /*bufpos = 0; - already is */
 849         }
 850 #endif
 851 }
 852
 853 /* Used also for sha256 */
 854 void FAST_FUNC sha1_end(sha1_ctx_t *ctx, void *resbuf)
 855 {
 856         unsigned hash_size;
 857
 858         /* SHA stores total in BE, need to swap on LE arches: */
 859         common64_end(ctx, /*swap_needed:*/ BB_LITTLE_ENDIAN);
 860
 861         hash_size = (ctx->process_block == sha1_process_block64) ? 5 : 8;
 862         /* This way we do not impose alignment constraints on resbuf: */
 863         if (BB_LITTLE_ENDIAN) {
 864                 unsigned i;
 865                 for (i = 0; i < hash_size; ++i)
 866                         ctx->hash[i] = SWAP_BE32(ctx->hash[i]);
 867         }
 868         memcpy(resbuf, ctx->hash, sizeof(ctx->hash[0]) * hash_size);
 869 }
 870
 871 void FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
 872 {
 873         unsigned bufpos = ctx->total64[0] & 127;
 874
 875         /* Pad the buffer to the next 128-byte boundary with 0x80,0,0,0... */
 876         ctx->wbuffer[bufpos++] = 0x80;
 877
 878         while (1) {
 879                 unsigned remaining = 128 - bufpos;
 880                 memset(ctx->wbuffer + bufpos, 0, remaining);
 881                 if (remaining >= 16) {
 882                         /* Store the 128-bit counter of bits in the buffer in BE format */
 883                         uint64_t t;
 884                         t = ctx->total64[0] << 3;
 885                         t = SWAP_BE64(t);
 886                         *(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 8]) = t;
 887                         t = (ctx->total64[1] << 3) | (ctx->total64[0] >> 61);
 888                         t = SWAP_BE64(t);
 889                         *(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 16]) = t;
 890                 }
 891                 sha512_process_block128(ctx);
 892                 if (remaining >= 16)
 893                         break;
 894                 bufpos = 0;
 895         }
 896
 897         if (BB_LITTLE_ENDIAN) {
 898                 unsigned i;
 899                 for (i = 0; i < ARRAY_SIZE(ctx->hash); ++i)
 900                         ctx->hash[i] = SWAP_BE64(ctx->hash[i]);
 901         }
 902         memcpy(resbuf, ctx->hash, sizeof(ctx->hash));
 903 }
 904
 905
 906 /*
 907  * The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
 908  * Michael Peeters and Gilles Van Assche. For more information, feedback or
 909  * questions, please refer to our website: http://keccak.noekeon.org/
 910  *
 911  * Implementation by Ronny Van Keer,
 912  * hereby denoted as "the implementer".
 913  *
 914  * To the extent possible under law, the implementer has waived all copyright
 915  * and related or neighboring rights to the source code in this file.
 916  * http://creativecommons.org/publicdomain/zero/1.0/
 917  *
 918  * Busybox modifications (C) Lauri Kasanen, under the GPLv2.
 919  */
 920
 921 #if CONFIG_SHA3_SMALL < 0
 922 # define SHA3_SMALL 0
 923 #elif CONFIG_SHA3_SMALL > 1
 924 # define SHA3_SMALL 1
 925 #else
 926 # define SHA3_SMALL CONFIG_SHA3_SMALL
 927 #endif
 928
 929 #define OPTIMIZE_SHA3_FOR_32 0
 930 /*
 931  * SHA3 can be optimized for 32-bit CPUs with bit-slicing:
 932  * every 64-bit word of state[] can be split into two 32-bit words
 933  * by even/odd bits. In this form, all rotations of sha3 round
 934  * are 32-bit - and there are lots of them.
 935  * However, it requires either splitting/combining state words
 936  * before/after sha3 round (code does this now)
 937  * or shuffling bits before xor'ing them into state and in sha3_end.
 938  * Without shuffling, bit-slicing results in -130 bytes of code
 939  * and marginal speedup (but of course it gives wrong result).
 940  * With shuffling it works, but +260 code bytes, and slower.
 941  * Disabled for now:
 942  */
 943 #if 0 /* LONG_MAX == 0x7fffffff */
 944 # undef OPTIMIZE_SHA3_FOR_32
 945 # define OPTIMIZE_SHA3_FOR_32 1
 946 #endif
 947
 948 enum {
 949         SHA3_IBLK_BYTES = 72, /* 576 bits / 8 */
 950 };
 951
 952 #if OPTIMIZE_SHA3_FOR_32
 953 /* This splits every 64-bit word into a pair of 32-bit words,
 954  * even bits go into first word, odd bits go to second one.
 955  * The conversion is done in-place.
 956  */
 957 static void split_halves(uint64_t *state)
 958 {
 959         /* Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
 960         uint32_t *s32 = (uint32_t*)state;
 961         uint32_t t, x0, x1;
 962         int i;
 963         for (i = 24; i >= 0; --i) {
 964                 x0 = s32[0];
 965                 t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
 966                 t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
 967                 t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
 968                 t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
 969                 x1 = s32[1];
 970                 t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
 971                 t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
 972                 t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
 973                 t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
 974                 *s32++ = (x0 & 0x0000FFFF) | (x1 << 16);
 975                 *s32++ = (x0 >> 16) | (x1 & 0xFFFF0000);
 976         }
 977 }
 978 /* The reverse operation */
 979 static void combine_halves(uint64_t *state)
 980 {
 981         uint32_t *s32 = (uint32_t*)state;
 982         uint32_t t, x0, x1;
 983         int i;
 984         for (i = 24; i >= 0; --i) {
 985                 x0 = s32[0];
 986                 x1 = s32[1];
 987                 t = (x0 & 0x0000FFFF) | (x1 << 16);
 988                 x1 = (x0 >> 16) | (x1 & 0xFFFF0000);
 989                 x0 = t;
 990                 t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
 991                 t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
 992                 t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
 993                 t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
 994                 *s32++ = x0;
 995                 t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
 996                 t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
 997                 t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
 998                 t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
 999                 *s32++ = x1;
1000         }
1001 }
1002 #endif
1003
1004 /*
1005  * In the crypto literature this function is usually called Keccak-f().
1006  */
1007 static void sha3_process_block72(uint64_t *state)
1008 {
1009         enum { NROUNDS = 24 };
1010
1011 #if OPTIMIZE_SHA3_FOR_32
1012         /*
1013         static const uint32_t IOTA_CONST_0[NROUNDS] = {
1014                 0x00000001UL,
1015                 0x00000000UL,
1016                 0x00000000UL,
1017                 0x00000000UL,
1018                 0x00000001UL,
1019                 0x00000001UL,
1020                 0x00000001UL,
1021                 0x00000001UL,
1022                 0x00000000UL,
1023                 0x00000000UL,
1024                 0x00000001UL,
1025                 0x00000000UL,
1026                 0x00000001UL,
1027                 0x00000001UL,
1028                 0x00000001UL,
1029                 0x00000001UL,
1030                 0x00000000UL,
1031                 0x00000000UL,
1032                 0x00000000UL,
1033                 0x00000000UL,
1034                 0x00000001UL,
1035                 0x00000000UL,
1036                 0x00000001UL,
1037                 0x00000000UL,
1038         };
1039         ** bits are in lsb: 0101 0000 1111 0100 1111 0001
1040         */
1041         uint32_t IOTA_CONST_0bits = (uint32_t)(0x0050f4f1);
1042         static const uint32_t IOTA_CONST_1[NROUNDS] = {
1043                 0x00000000UL,
1044                 0x00000089UL,
1045                 0x8000008bUL,
1046                 0x80008080UL,
1047                 0x0000008bUL,
1048                 0x00008000UL,
1049                 0x80008088UL,
1050                 0x80000082UL,
1051                 0x0000000bUL,
1052                 0x0000000aUL,
1053                 0x00008082UL,
1054                 0x00008003UL,
1055                 0x0000808bUL,
1056                 0x8000000bUL,
1057                 0x8000008aUL,
1058                 0x80000081UL,
1059                 0x80000081UL,
1060                 0x80000008UL,
1061                 0x00000083UL,
1062                 0x80008003UL,
1063                 0x80008088UL,
1064                 0x80000088UL,
1065                 0x00008000UL,
1066                 0x80008082UL,
1067         };
1068
1069         uint32_t *const s32 = (uint32_t*)state;
1070         unsigned round;
1071
1072         split_halves(state);
1073
1074         for (round = 0; round < NROUNDS; round++) {
1075                 unsigned x;
1076
1077                 /* Theta */
1078                 {
1079                         uint32_t BC[20];
1080                         for (x = 0; x < 10; ++x) {
1081                                 BC[x+10] = BC[x] = s32[x]^s32[x+10]^s32[x+20]^s32[x+30]^s32[x+40];
1082                         }
1083                         for (x = 0; x < 10; x += 2) {
1084                                 uint32_t ta, tb;
1085                                 ta = BC[x+8] ^ rotl32(BC[x+3], 1);
1086                                 tb = BC[x+9] ^ BC[x+2];
1087                                 s32[x+0] ^= ta;
1088                                 s32[x+1] ^= tb;
1089                                 s32[x+10] ^= ta;
1090                                 s32[x+11] ^= tb;
1091                                 s32[x+20] ^= ta;
1092                                 s32[x+21] ^= tb;
1093                                 s32[x+30] ^= ta;
1094                                 s32[x+31] ^= tb;
1095                                 s32[x+40] ^= ta;
1096                                 s32[x+41] ^= tb;
1097                         }
1098                 }
1099                 /* RhoPi */
1100                 {
1101                         uint32_t t0a,t0b, t1a,t1b;
1102                         t1a = s32[1*2+0];
1103                         t1b = s32[1*2+1];
1104
1105 #define RhoPi(PI_LANE, ROT_CONST) \
1106         t0a = s32[PI_LANE*2+0];\
1107         t0b = s32[PI_LANE*2+1];\
1108         if (ROT_CONST & 1) {\
1109                 s32[PI_LANE*2+0] = rotl32(t1b, ROT_CONST/2+1);\
1110                 s32[PI_LANE*2+1] = ROT_CONST == 1 ? t1a : rotl32(t1a, ROT_CONST/2+0);\
1111         } else {\
1112                 s32[PI_LANE*2+0] = rotl32(t1a, ROT_CONST/2);\
1113                 s32[PI_LANE*2+1] = rotl32(t1b, ROT_CONST/2);\
1114         }\
1115         t1a = t0a; t1b = t0b;
1116
1117                         RhoPi(10, 1)
1118                         RhoPi( 7, 3)
1119                         RhoPi(11, 6)
1120                         RhoPi(17,10)
1121                         RhoPi(18,15)
1122                         RhoPi( 3,21)
1123                         RhoPi( 5,28)
1124                         RhoPi(16,36)
1125                         RhoPi( 8,45)
1126                         RhoPi(21,55)
1127                         RhoPi(24, 2)
1128                         RhoPi( 4,14)
1129                         RhoPi(15,27)
1130                         RhoPi(23,41)
1131                         RhoPi(19,56)
1132                         RhoPi(13, 8)
1133                         RhoPi(12,25)
1134                         RhoPi( 2,43)
1135                         RhoPi(20,62)
1136                         RhoPi(14,18)
1137                         RhoPi(22,39)
1138                         RhoPi( 9,61)
1139                         RhoPi( 6,20)
1140                         RhoPi( 1,44)
1141 #undef RhoPi
1142                 }
1143                 /* Chi */
1144                 for (x = 0; x <= 40;) {
1145                         uint32_t BC0, BC1, BC2, BC3, BC4;
1146                         BC0 = s32[x + 0*2];
1147                         BC1 = s32[x + 1*2];
1148                         BC2 = s32[x + 2*2];
1149                         s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1150                         BC3 = s32[x + 3*2];
1151                         s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1152                         BC4 = s32[x + 4*2];
1153                         s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1154                         s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1155                         s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1156                         x++;
1157                         BC0 = s32[x + 0*2];
1158                         BC1 = s32[x + 1*2];
1159                         BC2 = s32[x + 2*2];
1160                         s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1161                         BC3 = s32[x + 3*2];
1162                         s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1163                         BC4 = s32[x + 4*2];
1164                         s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1165                         s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1166                         s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1167                         x += 9;
1168                 }
1169                 /* Iota */
1170                 s32[0] ^= IOTA_CONST_0bits & 1;
1171                 IOTA_CONST_0bits >>= 1;
1172                 s32[1] ^= IOTA_CONST_1[round];
1173         }
1174
1175         combine_halves(state);
1176 #else
1177         /* Native 64-bit algorithm */
1178         static const uint16_t IOTA_CONST[NROUNDS] = {
1179                 /* Elements should be 64-bit, but top half is always zero
1180                  * or 0x80000000. We encode 63rd bits in a separate word below.
1181                  * Same is true for 31th bits, which lets us use 16-bit table
1182                  * instead of 64-bit. The speed penalty is lost in the noise.
1183                  */
1184                 0x0001,
1185                 0x8082,
1186                 0x808a,
1187                 0x8000,
1188                 0x808b,
1189                 0x0001,
1190                 0x8081,
1191                 0x8009,
1192                 0x008a,
1193                 0x0088,
1194                 0x8009,
1195                 0x000a,
1196                 0x808b,
1197                 0x008b,
1198                 0x8089,
1199                 0x8003,
1200                 0x8002,
1201                 0x0080,
1202                 0x800a,
1203                 0x000a,
1204                 0x8081,
1205                 0x8080,
1206                 0x0001,
1207                 0x8008,
1208         };
1209         /* bit for CONST[0] is in msb: 0011 0011 0000 0111 1101 1101 */
1210         const uint32_t IOTA_CONST_bit63 = (uint32_t)(0x3307dd00);
1211         /* bit for CONST[0] is in msb: 0001 0110 0011 1000 0001 1011 */
1212         const uint32_t IOTA_CONST_bit31 = (uint32_t)(0x16381b00);
1213
1214         static const uint8_t ROT_CONST[24] = {
1215                 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
1216                 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44,
1217         };
1218         static const uint8_t PI_LANE[24] = {
1219                 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
1220                 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1,
1221         };
1222         /*static const uint8_t MOD5[10] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/
1223
1224         unsigned x;
1225         unsigned round;
1226
1227         if (BB_BIG_ENDIAN) {
1228                 for (x = 0; x < 25; x++) {
1229                         state[x] = SWAP_LE64(state[x]);
1230                 }
1231         }
1232
1233         for (round = 0; round < NROUNDS; ++round) {
1234                 /* Theta */
1235                 {
1236                         uint64_t BC[10];
1237                         for (x = 0; x < 5; ++x) {
1238                                 BC[x + 5] = BC[x] = state[x]
1239                                         ^ state[x + 5] ^ state[x + 10]
1240                                         ^ state[x + 15] ^ state[x + 20];
1241                         }
1242                         /* Using 2x5 vector above eliminates the need to use
1243                          * BC[MOD5[x+N]] trick below to fetch BC[(x+N) % 5],
1244                          * and the code is a bit _smaller_.
1245                          */
1246                         for (x = 0; x < 5; ++x) {
1247                                 uint64_t temp = BC[x + 4] ^ rotl64(BC[x + 1], 1);
1248                                 state[x] ^= temp;
1249                                 state[x + 5] ^= temp;
1250                                 state[x + 10] ^= temp;
1251                                 state[x + 15] ^= temp;
1252                                 state[x + 20] ^= temp;
1253                         }
1254                 }
1255
1256                 /* Rho Pi */
1257                 if (SHA3_SMALL) {
1258                         uint64_t t1 = state[1];
1259                         for (x = 0; x < 24; ++x) {
1260                                 uint64_t t0 = state[PI_LANE[x]];
1261                                 state[PI_LANE[x]] = rotl64(t1, ROT_CONST[x]);
1262                                 t1 = t0;
1263                         }
1264                 } else {
1265                         /* Especially large benefit for 32-bit arch (75% faster):
1266                          * 64-bit rotations by non-constant usually are SLOW on those.
1267                          * We resort to unrolling here.
1268                          * This optimizes out PI_LANE[] and ROT_CONST[],
1269                          * but generates 300-500 more bytes of code.
1270                          */
1271                         uint64_t t0;
1272                         uint64_t t1 = state[1];
1273 #define RhoPi_twice(x) \
1274         t0 = state[PI_LANE[x  ]]; \
1275         state[PI_LANE[x  ]] = rotl64(t1, ROT_CONST[x  ]); \
1276         t1 = state[PI_LANE[x+1]]; \
1277         state[PI_LANE[x+1]] = rotl64(t0, ROT_CONST[x+1]);
1278                         RhoPi_twice(0); RhoPi_twice(2);
1279                         RhoPi_twice(4); RhoPi_twice(6);
1280                         RhoPi_twice(8); RhoPi_twice(10);
1281                         RhoPi_twice(12); RhoPi_twice(14);
1282                         RhoPi_twice(16); RhoPi_twice(18);
1283                         RhoPi_twice(20); RhoPi_twice(22);
1284 #undef RhoPi_twice
1285                 }
1286                 /* Chi */
1287 # if LONG_MAX > 0x7fffffff
1288                 for (x = 0; x <= 20; x += 5) {
1289                         uint64_t BC0, BC1, BC2, BC3, BC4;
1290                         BC0 = state[x + 0];
1291                         BC1 = state[x + 1];
1292                         BC2 = state[x + 2];
1293                         state[x + 0] = BC0 ^ ((~BC1) & BC2);
1294                         BC3 = state[x + 3];
1295                         state[x + 1] = BC1 ^ ((~BC2) & BC3);
1296                         BC4 = state[x + 4];
1297                         state[x + 2] = BC2 ^ ((~BC3) & BC4);
1298                         state[x + 3] = BC3 ^ ((~BC4) & BC0);
1299                         state[x + 4] = BC4 ^ ((~BC0) & BC1);
1300                 }
1301 # else
1302                 /* Reduced register pressure version
1303                  * for register-starved 32-bit arches
1304                  * (i386: -95 bytes, and it is _faster_)
1305                  */
1306                 for (x = 0; x <= 40;) {
1307                         uint32_t BC0, BC1, BC2, BC3, BC4;
1308                         uint32_t *const s32 = (uint32_t*)state;
1309 #  if SHA3_SMALL
1310  do_half:
1311 #  endif
1312                         BC0 = s32[x + 0*2];
1313                         BC1 = s32[x + 1*2];
1314                         BC2 = s32[x + 2*2];
1315                         s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1316                         BC3 = s32[x + 3*2];
1317                         s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1318                         BC4 = s32[x + 4*2];
1319                         s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1320                         s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1321                         s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1322                         x++;
1323 #  if SHA3_SMALL
1324                         if (x & 1)
1325                                 goto do_half;
1326                         x += 8;
1327 #  else
1328                         BC0 = s32[x + 0*2];
1329                         BC1 = s32[x + 1*2];
1330                         BC2 = s32[x + 2*2];
1331                         s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1332                         BC3 = s32[x + 3*2];
1333                         s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1334                         BC4 = s32[x + 4*2];
1335                         s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1336                         s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1337                         s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1338                         x += 9;
1339 #  endif
1340                 }
1341 # endif /* long is 32-bit */
1342                 /* Iota */
1343                 state[0] ^= IOTA_CONST[round]
1344                         | (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000)
1345                         | (uint64_t)((IOTA_CONST_bit63 << round) & 0x80000000) << 32;
1346         }
1347
1348         if (BB_BIG_ENDIAN) {
1349                 for (x = 0; x < 25; x++) {
1350                         state[x] = SWAP_LE64(state[x]);
1351                 }
1352         }
1353 #endif
1354 }
1355
1356 void FAST_FUNC sha3_begin(sha3_ctx_t *ctx)
1357 {
1358         memset(ctx, 0, sizeof(*ctx));
1359 }
1360
1361 void FAST_FUNC sha3_hash(sha3_ctx_t *ctx, const void *buffer, size_t len)
1362 {
1363 #if SHA3_SMALL
1364         const uint8_t *data = buffer;
1365         unsigned bufpos = ctx->bytes_queued;
1366
1367         while (1) {
1368                 unsigned remaining = SHA3_IBLK_BYTES - bufpos;
1369                 if (remaining > len)
1370                         remaining = len;
1371                 len -= remaining;
1372                 /* XOR data into buffer */
1373                 while (remaining != 0) {
1374                         uint8_t *buf = (uint8_t*)ctx->state;
1375                         buf[bufpos] ^= *data++;
1376                         bufpos++;
1377                         remaining--;
1378                 }
1379                 /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
1380                 bufpos -= SHA3_IBLK_BYTES;
1381                 if (bufpos != 0)
1382                         break;
1383                 /* Buffer is filled up, process it */
1384                 sha3_process_block72(ctx->state);
1385                 /*bufpos = 0; - already is */
1386         }
1387         ctx->bytes_queued = bufpos + SHA3_IBLK_BYTES;
1388 #else
1389         /* +50 bytes code size, but a bit faster because of long-sized XORs */
1390         const uint8_t *data = buffer;
1391         unsigned bufpos = ctx->bytes_queued;
1392
1393         /* If already data in queue, continue queuing first */
1394         while (len != 0 && bufpos != 0) {
1395                 uint8_t *buf = (uint8_t*)ctx->state;
1396                 buf[bufpos] ^= *data++;
1397                 len--;
1398                 bufpos++;
1399                 if (bufpos == SHA3_IBLK_BYTES) {
1400                         bufpos = 0;
1401                         goto do_block;
1402                 }
1403         }
1404
1405         /* Absorb complete blocks */
1406         while (len >= SHA3_IBLK_BYTES) {
1407                 /* XOR data onto beginning of state[].
1408                  * We try to be efficient - operate one word at a time, not byte.
1409                  * Careful wrt unaligned access: can't just use "*(long*)data"!
1410                  */
1411                 unsigned count = SHA3_IBLK_BYTES / sizeof(long);
1412                 long *buf = (long*)ctx->state;
1413                 do {
1414                         long v;
1415                         move_from_unaligned_long(v, (long*)data);
1416                         *buf++ ^= v;
1417                         data += sizeof(long);
1418                 } while (--count);
1419                 len -= SHA3_IBLK_BYTES;
1420  do_block:
1421                 sha3_process_block72(ctx->state);
1422         }
1423
1424         /* Queue remaining data bytes */
1425         while (len != 0) {
1426                 uint8_t *buf = (uint8_t*)ctx->state;
1427                 buf[bufpos] ^= *data++;
1428                 bufpos++;
1429                 len--;
1430         }
1431
1432         ctx->bytes_queued = bufpos;
1433 #endif
1434 }
1435
1436 void FAST_FUNC sha3_end(sha3_ctx_t *ctx, void *resbuf)
1437 {
1438         /* Padding */
1439         uint8_t *buf = (uint8_t*)ctx->state;
1440         buf[ctx->bytes_queued]   ^= 1;
1441         buf[SHA3_IBLK_BYTES - 1] ^= 0x80;
1442
1443         sha3_process_block72(ctx->state);
1444
1445         /* Output */
1446         memcpy(resbuf, ctx->state, 64);
1447 }