 * Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License 2.0 (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
#include <openssl/crypto.h>
#include "internal/cryptlib.h"
#include "crypto/modes.h"
#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
# define GETU32(p)      BSWAP4(*(const u32 *)(p))
# define PUTU32(p,v)    *(u32 *)(p) = BSWAP4(v)
/*
 * PACK places a 16-bit constant in the top 16 bits of a size_t, so that
 * the rem_4bit/rem_8bit reduction tables below work for both 32- and
 * 64-bit size_t.
 */
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * REDUCE1BIT multiplies V by x in GF(2^128) (GCM's bit-reflected
 * convention): shift the 128-bit value right by one bit and, if a set
 * bit was shifted out, fold in the reduction polynomial 0xE1 || 0^120.
 */
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^T; \
                u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^((u64)T<<32); \
 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
 * should never be set to 8: 8 is effectively reserved for testing
 * purposes. TABLE_BITS>1 selects the lookup-table-driven
 * implementations referred to as "Shoup's" in the GCM specification,
 * so OpenSSL does not cover the whole spectrum of possible
 * table-driven implementations. Why? In the non-"Shoup's" case the
 * memory access pattern is segmented in such a manner that it's
 * trivial to see that cache-timing information can reveal a fair
 * portion of the intermediate hash value. Given that the ciphertext is
 * always available to an attacker, it's possible to attempt to deduce
 * the secret parameter H and, if successful, to tamper with messages
 * [which is trivial in CTR mode]. In the "Shoup's" case this is not as
 * easy, but there is no reason to believe that it's resistant to
 * cache-timing attacks either. As for the "8-bit" implementation: it
 * consumes 16 (sixteen) times more memory, 4KB per individual key +
 * 1KB shared. On the pro side, it should be twice as fast as the
 * "4-bit" version, and for gcc-generated x86[_64] code the "8-bit"
 * version was observed to run ~75% faster, closer to 100% for
 * commercial compilers... Yet the "4-bit" procedure is preferred,
 * because it's believed to provide a better security-performance
 * balance and adequate all-round performance. "All-round" refers to
 * things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example, on Windows freeing a large
 *   enough block results in VM working-set trimming, meaning that a
 *   subsequent malloc would immediately incur working-set expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even in the same
 *   thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate, for performance reasons.
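/*
 * For reference, below is a table-free, bit-at-a-time GHASH multiply,
 * kept under #if 0 and not built: a minimal sketch of what the lookup
 * tables accelerate, mirroring the TABLE_BITS==1 path (gcm_gmult_1bit)
 * later in this file. Unlike that path, this hypothetical helper
 * assumes Xi and H are already in host byte order.
 */
#if 0
static void gcm_gmult_ref(u64 Xi[2], const u64 H[2])
{
    u128 Z = { 0, 0 };          /* accumulator */
    u128 V;                     /* running H * x^bit, advanced by REDUCE1BIT */
    int i, j;

    V.hi = H[0];
    V.lo = H[1];

    for (i = 0; i < 2; ++i) {
        u64 X = Xi[i];

        for (j = 0; j < 64; ++j, X <<= 1) {
            /* all-ones mask if the current (top) bit of X is set */
            u64 M = 0 - (X >> 63);

            Z.hi ^= V.hi & M;   /* constant-time conditional XOR */
            Z.lo ^= V.lo & M;
            REDUCE1BIT(V);      /* V *= x modulo the GCM polynomial */
        }
    }
    Xi[0] = Z.hi;
    Xi[1] = Z.lo;
}
#endif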
static void gcm_init_8bit(u128 Htable[256], u64 H[2])
    for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
    for (i = 2; i < 256; i <<= 1) {
        u128 *Hi = Htable + i, H0 = *Hi;
        for (j = 1; j < i; ++j) {
            Hi[j].hi = H0.hi ^ Htable[j].hi;
            Hi[j].lo = H0.lo ^ Htable[j].lo;
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
    const u8 *xi = (const u8 *)Xi + 15;
    /*
     * rem_8bit[n] is the reduction of the byte shifted out of Z:
     * its product with the GCM polynomial, PACK()-ed into the top 16
     * bits of a size_t.
     */
    static const size_t rem_8bit[256] = {
        PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
        PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
        PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
        PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
        PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
        PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
        PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
        PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
        PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
        PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
        PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
        PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
        PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
        PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
        PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
        PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
        PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
        PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
        PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
        PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
        PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
        PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
        PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
        PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
        PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
        PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
        PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
        PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
        PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
        PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
        PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
        PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
        PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
        PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
        PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
        PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
        PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
        PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
        PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
        PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
        PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
        PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
        PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
        PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
        PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
        PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
        PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
        PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
        PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
        PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
        PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
        PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
        PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
        PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
        PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
        PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
        PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
        PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
        PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
        PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
        PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
        PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
        PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
        PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
        Z.hi ^= Htable[n].hi;
        Z.lo ^= Htable[n].lo;
        rem = (size_t)Z.lo & 0xff;
        Z.lo = (Z.hi << 56) | (Z.lo >> 8);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_8bit[rem];
            Z.hi ^= (u64)rem_8bit[rem] << 32;
    if (is_endian.little) {
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
        v = (u32)(Z.hi >> 32);
        v = (u32)(Z.lo >> 32);
# define GCM_MUL(ctx)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
# if defined(OPENSSL_SMALL_FOOTPRINT)
# if defined(OPENSSL_SMALL_FOOTPRINT)
    for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
    for (i = 2; i < 16; i <<= 1) {
        u128 *Hi = Htable + i;
        for (V = *Hi, j = 1; j < i; ++j) {
            Hi[j].hi = V.hi ^ Htable[j].hi;
            Hi[j].lo = V.lo ^ Htable[j].lo;
    Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
    Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
    Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
    Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
    Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
    Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
    Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
    Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
    Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
    Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
    Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
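    /*
     * At this point Htable[i] holds i*H in GF(2^128): Htable[8] is H
     * itself, Htable[4], Htable[2] and Htable[1] are successive
     * REDUCE1BIT steps of it, and composite indices are XOR
     * combinations of those, as the unrolled assignments above spell
     * out (e.g. Htable[5] = Htable[4] ^ Htable[1]).
     */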
# if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
    /*
     * The ARM assembler expects a specific dword order in Htable.
     */
    if (is_endian.little)
        for (j = 0; j < 16; ++j) {
        for (j = 0; j < 16; ++j) {
            Htable[j].hi = V.lo << 32 | V.lo >> 32;
            Htable[j].lo = V.hi << 32 | V.hi >> 32;
/*
 * rem_4bit[n] is the reduction of the nibble shifted out of Z:
 * its product with the GCM polynomial, PACK()-ed into the top 16 bits
 * of a size_t.
 */
static const size_t rem_4bit[16] = {
    PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
    PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
    PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
    PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
/*
 * Multiply Xi by H, consuming Xi one nibble at a time (low nibble of
 * the last byte first) and folding the shifted-out bits back in via
 * rem_4bit.
 */
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
    size_t rem, nlo, nhi;
    nlo = ((const u8 *)Xi)[15];
    Z.hi = Htable[nlo].hi;
    Z.lo = Htable[nlo].lo;
        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
            Z.hi ^= (u64)rem_4bit[rem] << 32;
        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        nlo = ((const u8 *)Xi)[cnt];
        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
            Z.hi ^= (u64)rem_4bit[rem] << 32;
        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
    if (is_endian.little) {
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
        v = (u32)(Z.hi >> 32);
        v = (u32)(Z.lo >> 32);
# if !defined(OPENSSL_SMALL_FOOTPRINT)
 * Streamed version of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt
 * for details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here mostly
 * as a reference and a placeholder for possible future non-trivial
 * optimization[s]...
static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
                           const u8 *inp, size_t len)
    size_t rem, nlo, nhi;
        nlo = ((const u8 *)Xi)[15];
        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;
            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
                Z.hi ^= (u64)rem_4bit[rem] << 32;
            Z.hi ^= Htable[nhi].hi;
            Z.lo ^= Htable[nhi].lo;
            nlo = ((const u8 *)Xi)[cnt];
            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
                Z.hi ^= (u64)rem_4bit[rem] << 32;
            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;
         * An extra 256+16 bytes per key plus 512 bytes of shared tables
         * [should] give a ~50% improvement... One could have PACK()-ed
         * rem_8bit even here, but the priority is to minimize
        u128 Hshr4[16];         /* Htable shifted right by 4 bits */
        u8 Hshl4[16];           /* Htable shifted left by 4 bits */
        static const unsigned short rem_8bit[256] = {
            0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
            0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
            0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
            0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
            0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
            0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
            0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
            0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
            0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
            0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
            0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
            0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
            0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
            0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
            0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
            0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
            0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
            0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
            0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
            0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
            0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
            0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
            0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
            0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
            0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
            0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
            0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
            0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
            0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
            0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
            0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
            0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
         * This pre-processing phase slows the procedure down by
         * approximately the same amount of time as it makes each loop
         * spin faster. In other words, single-block performance is
         * approximately the same as for the straightforward "4-bit"
         * implementation, and from there it only gets faster...
        for (cnt = 0; cnt < 16; ++cnt) {
            Z.hi = Htable[cnt].hi;
            Z.lo = Htable[cnt].lo;
            Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
            Hshr4[cnt].hi = (Z.hi >> 4);
            Hshl4[cnt] = (u8)(Z.lo << 4);
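        /*
         * With Hshr4/Hshl4 precomputed, the loop below consumes a full
         * byte of Xi per iteration: the low nibble is looked up in
         * Htable, the high nibble in the pre-shifted Hshr4, and one
         * rem_8bit lookup folds the shifted-out byte back in.
         */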
        for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
            nlo = ((const u8 *)Xi)[cnt];
            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;
            rem = (size_t)Z.lo & 0xff;
            Z.lo = (Z.hi << 56) | (Z.lo >> 8);
            Z.hi ^= Hshr4[nhi].hi;
            Z.lo ^= Hshr4[nhi].lo;
            Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
        nlo = ((const u8 *)Xi)[0];
        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
        if (is_endian.little) {
            Xi[0] = BSWAP8(Z.hi);
            Xi[1] = BSWAP8(Z.lo);
            v = (u32)(Z.hi >> 32);
            v = (u32)(Z.lo >> 32);
    } while (inp += 16, len -= 16);
void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
# define GCM_MUL(ctx)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
# if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#  define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
 * GHASH_CHUNK is a "stride parameter" meant to mitigate cache
 * thrashing. In other words, the idea is to hash the data while it's
 * still in the L1 cache after the encryption pass...
#  define GHASH_CHUNK       (3*1024)
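/*
 * Schematic illustration (kept under #if 0, not built) of the
 * GHASH_CHUNK idea as used by CRYPTO_gcm128_encrypt() below: encrypt
 * one chunk, then hash that same chunk while it is still in L1.
 * ctr_encrypt() is a hypothetical stand-in for the inlined CTR pass.
 */
#  if 0
static void gcm_chunked_encrypt(GCM128_CONTEXT *ctx, const u8 *in,
                                u8 *out, size_t len)
{
    while (len >= GHASH_CHUNK) {
        ctr_encrypt(ctx, in, out, GHASH_CHUNK); /* hypothetical CTR pass */
        GHASH(ctx, out, GHASH_CHUNK);           /* out[] is still cache-hot */
        in += GHASH_CHUNK;
        out += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
}
#  endif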
#else                           /* TABLE_BITS */

static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
    u128 V, Z = { 0, 0 };
    const long *xi = (const long *)Xi;
    V.hi = H[0];                /* H is in host byte order, no byte swapping */
    for (j = 0; j < 16 / sizeof(long); ++j) {
        if (is_endian.little) {
            if (sizeof(long) == 8) {
                X = (long)(BSWAP8(xi[j]));
                const u8 *p = (const u8 *)(xi + j);
                X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
            const u8 *p = (const u8 *)(xi + j);
        for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
            u64 M = (u64)(X >> (8 * sizeof(long) - 1));
    if (is_endian.little) {
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
        v = (u32)(Z.hi >> 32);
        v = (u32)(Z.lo >> 32);
# define GCM_MUL(ctx)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
#if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
# if !defined(I386_ONLY) && \
     (defined(__i386) || defined(__i386__) || \
      defined(__x86_64) || defined(__x86_64__) || \
      defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define gcm_init_avx   gcm_init_clmul
#   define gcm_gmult_avx  gcm_gmult_clmul
#   define gcm_ghash_avx  gcm_ghash_clmul
void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
#  include "arm_arch.h"
#  if __ARM_MAX_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
#   define PMULL_CAPABLE        (OPENSSL_armcap_P & ARMV8_PMULL)
#   if defined(__arm__) || defined(__arm)
#    define NEON_CAPABLE        (OPENSSL_armcap_P & ARMV7_NEON)
void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
# elif defined(__sparc__) || defined(__sparc)
#  include "sparc_arch.h"
#  define GHASH_ASM_SPARC
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_sparcv9cap_P[];
void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
#  include "ppc_arch.h"
#  define GHASH_ASM_PPC
#  define GCM_FUNCREF_4BIT
void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
#ifdef GCM_FUNCREF_4BIT
# define GCM_MUL(ctx)           (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# define GHASH(ctx,in,len)      (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
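/*
 * With GCM_FUNCREF_4BIT, GCM_MUL() and GHASH() dispatch through the
 * gmult/ghash function pointers stored in the context, which lets
 * CRYPTO_gcm128_init() below pick a CPU-specific implementation
 * (CLMUL/AVX, NEON/PMULL, VIS3, POWER8, ...) at run time.
 */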
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
    memset(ctx, 0, sizeof(*ctx));
    /* H = E(K, 0^128); ctx->H.c is all-zero after the memset above */
    (*block) (ctx->H.c, ctx->H.c, key);
    if (is_endian.little) {
        /* H is stored in host byte order */
        ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
        ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
        hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
        lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
    gcm_init_8bit(ctx->Htable, ctx->H.u);
# if defined(GHASH)
#  define CTX__GHASH(f) (ctx->ghash = (f))
#  define CTX__GHASH(f) (ctx->ghash = NULL)
# if defined(GHASH_ASM_X86_OR_64)
#  if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
        if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
            gcm_init_avx(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_avx;
            CTX__GHASH(gcm_ghash_avx);
            gcm_init_clmul(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_clmul;
            CTX__GHASH(gcm_ghash_clmul);
    gcm_init_4bit(ctx->Htable, ctx->H.u);
#  if defined(GHASH_ASM_X86)    /* x86 only */
#   if defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
    if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
        ctx->gmult = gcm_gmult_4bit_mmx;
        CTX__GHASH(gcm_ghash_4bit_mmx);
        ctx->gmult = gcm_gmult_4bit_x86;
        CTX__GHASH(gcm_ghash_4bit_x86);
    ctx->gmult = gcm_gmult_4bit;
    CTX__GHASH(gcm_ghash_4bit);
# elif defined(GHASH_ASM_ARM)
#  ifdef PMULL_CAPABLE
        gcm_init_v8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_v8;
        CTX__GHASH(gcm_ghash_v8);
        gcm_init_neon(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_neon;
        CTX__GHASH(gcm_ghash_neon);
    gcm_init_4bit(ctx->Htable, ctx->H.u);
    ctx->gmult = gcm_gmult_4bit;
    CTX__GHASH(gcm_ghash_4bit);
# elif defined(GHASH_ASM_SPARC)
    if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
        gcm_init_vis3(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_vis3;
        CTX__GHASH(gcm_ghash_vis3);
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
# elif defined(GHASH_ASM_PPC)
    if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
        gcm_init_p8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_p8;
        CTX__GHASH(gcm_ghash_p8);
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    gcm_init_4bit(ctx->Htable, ctx->H.u);
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
    ctx->len.u[0] = 0;          /* AAD length */
    ctx->len.u[1] = 0;          /* message length */
        memcpy(ctx->Yi.c, iv, 12);
        /* Borrow ctx->Xi to calculate initial Yi */
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= iv[i];
            for (i = 0; i < len; ++i)
                ctx->Xi.c[i] ^= iv[i];
        if (is_endian.little) {
            ctx->Xi.u[1] ^= BSWAP8(len0);
            ctx->Xi.c[8] ^= (u8)(len0 >> 56);
            ctx->Xi.c[9] ^= (u8)(len0 >> 48);
            ctx->Xi.c[10] ^= (u8)(len0 >> 40);
            ctx->Xi.c[11] ^= (u8)(len0 >> 32);
            ctx->Xi.c[12] ^= (u8)(len0 >> 24);
            ctx->Xi.c[13] ^= (u8)(len0 >> 16);
            ctx->Xi.c[14] ^= (u8)(len0 >> 8);
            ctx->Xi.c[15] ^= (u8)(len0);
            ctx->Xi.u[1] ^= len0;
        if (is_endian.little)
            ctr = BSWAP4(ctx->Xi.d[3]);
            ctr = GETU32(ctx->Xi.c + 12);
        /* Copy borrowed Xi to Yi */
        ctx->Yi.u[0] = ctx->Xi.u[0];
        ctx->Yi.u[1] = ctx->Xi.u[1];
    (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
    if (is_endian.little)
        ctx->Yi.d[3] = BSWAP4(ctr);
        PUTU32(ctx->Yi.c + 12, ctr);
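    /*
     * Per NIST SP 800-38D: a 96-bit IV is used as Y0 directly, with the
     * 32-bit counter set to 1 (and then incremented for the first data
     * block, as above); any other IV length is first GHASHed together
     * with a 128-bit length block to derive Y0.
     */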
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
    u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
    /* the AAD is limited to 2^64 bits, i.e. 2^61 bytes */
    if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
    ctx->len.u[0] = alen;
            ctx->Xi.c[n] ^= *(aad++);
    if ((i = (len & (size_t)-16))) {
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= aad[i];
        n = (unsigned int)len;
        for (i = 0; i < len; ++i)
            ctx->Xi.c[i] ^= aad[i];
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
    unsigned int n, ctr, mres;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
    /* the message is limited to 2^39-256 bits, i.e. 2^36-32 bytes */
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
    ctx->len.u[1] = mlen;
        /* First call to encrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        mres = sizeof(ctx->Xi);
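        /*
         * ctx->Xn queues not-yet-hashed input so that whole runs can be
         * hashed with a single streamed GHASH() call: the pending
         * Xi^AAD block is moved into the queue here, and mres counts
         * the queued bytes from now on.
         */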
    if (is_endian.little)
        ctr = BSWAP4(ctx->Yi.d[3]);
        ctr = GETU32(ctx->Yi.c + 12);
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
                ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
            GHASH(ctx, ctx->Xn, mres);
                ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
# if defined(STRICT_ALIGNMENT)
        if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
        if (len >= 16 && mres) {
            GHASH(ctx, ctx->Xn, mres);
# if defined(GHASH_CHUNK)
        while (len >= GHASH_CHUNK) {
            size_t j = GHASH_CHUNK;
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                if (is_endian.little)
                    ctx->Yi.d[3] = BSWAP4(ctr);
                    PUTU32(ctx->Yi.c + 12, ctr);
                for (i = 0; i < 16 / sizeof(size_t); ++i)
                    out_t[i] = in_t[i] ^ ctx->EKi.t[i];
            GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
        if ((i = (len & (size_t)-16))) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                if (is_endian.little)
                    ctx->Yi.d[3] = BSWAP4(ctr);
                    PUTU32(ctx->Yi.c + 12, ctr);
                for (i = 0; i < 16 / sizeof(size_t); ++i)
                    out_t[i] = in_t[i] ^ ctx->EKi.t[i];
            GHASH(ctx, out - j, j);
            size_t *out_t = (size_t *)out;
            const size_t *in_t = (const size_t *)in;
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            if (is_endian.little)
                ctx->Yi.d[3] = BSWAP4(ctr);
                PUTU32(ctx->Yi.c + 12, ctr);
            for (i = 0; i < 16 / sizeof(size_t); ++i)
                ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            if (is_endian.little)
                ctx->Yi.d[3] = BSWAP4(ctr);
                PUTU32(ctx->Yi.c + 12, ctr);
                ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
                ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
    for (i = 0; i < len; ++i) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            if (is_endian.little)
                ctx->Yi.d[3] = BSWAP4(ctr);
                PUTU32(ctx->Yi.c + 12, ctr);
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        ctx->Xn[mres++] = out[i] = in[i] ^ ctx->EKi.c[n];
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, sizeof(ctx->Xn));
        ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
        mres = n = (n + 1) % 16;
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
    ctx->len.u[1] = mlen;
        /* First call to decrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        mres = sizeof(ctx->Xi);
    if (is_endian.little)
        ctr = BSWAP4(ctx->Yi.d[3]);
        ctr = GETU32(ctx->Yi.c + 12);
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
                *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
            GHASH(ctx, ctx->Xn, mres);
                *(out++) = c ^ ctx->EKi.c[n];
# if defined(STRICT_ALIGNMENT)
        if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
        if (len >= 16 && mres) {
            GHASH(ctx, ctx->Xn, mres);
# if defined(GHASH_CHUNK)
        while (len >= GHASH_CHUNK) {
            size_t j = GHASH_CHUNK;
            GHASH(ctx, in, GHASH_CHUNK);
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                if (is_endian.little)
                    ctx->Yi.d[3] = BSWAP4(ctr);
                    PUTU32(ctx->Yi.c + 12, ctr);
                for (i = 0; i < 16 / sizeof(size_t); ++i)
                    out_t[i] = in_t[i] ^ ctx->EKi.t[i];
        if ((i = (len & (size_t)-16))) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                if (is_endian.little)
                    ctx->Yi.d[3] = BSWAP4(ctr);
                    PUTU32(ctx->Yi.c + 12, ctr);
                for (i = 0; i < 16 / sizeof(size_t); ++i)
                    out_t[i] = in_t[i] ^ ctx->EKi.t[i];
            size_t *out_t = (size_t *)out;
            const size_t *in_t = (const size_t *)in;
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            if (is_endian.little)
                ctx->Yi.d[3] = BSWAP4(ctr);
                PUTU32(ctx->Yi.c + 12, ctr);
            for (i = 0; i < 16 / sizeof(size_t); ++i) {
                out_t[i] = c ^ ctx->EKi.t[i];
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            if (is_endian.little)
                ctx->Yi.d[3] = BSWAP4(ctr);
                PUTU32(ctx->Yi.c + 12, ctr);
                out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
                out[n] = c ^ ctx->EKi.c[n];
    for (i = 0; i < len; ++i) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            if (is_endian.little)
                ctx->Yi.d[3] = BSWAP4(ctr);
                PUTU32(ctx->Yi.c + 12, ctr);
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        out[i] = (ctx->Xn[mres++] = c = in[i]) ^ ctx->EKi.c[n];
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, sizeof(ctx->Xn));
        out[i] = c ^ ctx->EKi.c[n];
        mres = n = (n + 1) % 16;
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
#if defined(OPENSSL_SMALL_FOOTPRINT)
    return CRYPTO_gcm128_encrypt(ctx, in, out, len);
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
    ctx->len.u[1] = mlen;
        /* First call to encrypt finalizes GHASH(AAD) */
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        mres = sizeof(ctx->Xi);
    if (is_endian.little)
        ctr = BSWAP4(ctx->Yi.d[3]);
        ctr = GETU32(ctx->Yi.c + 12);
            ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
        GHASH(ctx, ctx->Xn, mres);
            ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
    if (len >= 16 && mres) {
        GHASH(ctx, ctx->Xn, mres);
# if defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (is_endian.little)
            ctx->Yi.d[3] = BSWAP4(ctr);
            PUTU32(ctx->Yi.c + 12, ctr);
        GHASH(ctx, out, GHASH_CHUNK);
    if ((i = (len & (size_t)-16))) {
        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
            ctx->Yi.d[3] = BSWAP4(ctr);
            PUTU32(ctx->Yi.c + 12, ctr);
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= out[i];
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        if (is_endian.little)
            ctx->Yi.d[3] = BSWAP4(ctr);
            PUTU32(ctx->Yi.c + 12, ctr);
            ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
            ctx->Xi.c[mres++] ^= out[n] = in[n] ^ ctx->EKi.c[n];
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
#if defined(OPENSSL_SMALL_FOOTPRINT)
    return CRYPTO_gcm128_decrypt(ctx, in, out, len);
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
    ctx->len.u[1] = mlen;
        /* First call to decrypt finalizes GHASH(AAD) */
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        mres = sizeof(ctx->Xi);
    if (is_endian.little)
        ctr = BSWAP4(ctx->Yi.d[3]);
        ctr = GETU32(ctx->Yi.c + 12);
            *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
        GHASH(ctx, ctx->Xn, mres);
            *(out++) = c ^ ctx->EKi.c[n];
    if (len >= 16 && mres) {
        GHASH(ctx, ctx->Xn, mres);
# if defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        GHASH(ctx, in, GHASH_CHUNK);
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (is_endian.little)
            ctx->Yi.d[3] = BSWAP4(ctr);
            PUTU32(ctx->Yi.c + 12, ctr);
    if ((i = (len & (size_t)-16))) {
            for (k = 0; k < 16; ++k)
                ctx->Xi.c[k] ^= in[k];
        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
            ctx->Yi.d[3] = BSWAP4(ctr);
            PUTU32(ctx->Yi.c + 12, ctr);
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        if (is_endian.little)
            ctx->Yi.d[3] = BSWAP4(ctr);
            PUTU32(ctx->Yi.c + 12, ctr);
            out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
            ctx->Xi.c[mres++] ^= c;
            out[n] = c ^ ctx->EKi.c[n];
int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
    } is_endian = { 1 };
    u64 alen = ctx->len.u[0] << 3;      /* AAD length in bits */
    u64 clen = ctx->len.u[1] << 3;      /* message length in bits */
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    unsigned int mres = ctx->mres;
        /* round the queued byte count up to a whole number of blocks */
        unsigned blocks = (mres + 15) & -16;
        memset(ctx->Xn + mres, 0, blocks - mres);
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, mres);
    } else if (ctx->ares) {
    if (ctx->mres || ctx->ares)
    if (is_endian.little) {
        alen = BSWAP8(alen);
        clen = BSWAP8(clen);
        ctx->len.u[0] = alen;
        ctx->len.u[1] = clen;
        alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
        clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        memcpy(ctx->Xn + mres, &bitlen, sizeof(bitlen));
        mres += sizeof(bitlen);
        GHASH(ctx, ctx->Xn, mres);
    ctx->Xi.u[0] ^= alen;
    ctx->Xi.u[1] ^= clen;
    /* T = GHASH(A || C || len(A) || len(C)) ^ E(K, Y0) */
    ctx->Xi.u[0] ^= ctx->EK0.u[0];
    ctx->Xi.u[1] ^= ctx->EK0.u[1];
    if (tag && len <= sizeof(ctx->Xi))
        return CRYPTO_memcmp(ctx->Xi.c, tag, len);
void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
    CRYPTO_gcm128_finish(ctx, NULL, 0);
    memcpy(tag, ctx->Xi.c,
           len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
    GCM128_CONTEXT *ret;
    if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
        CRYPTO_gcm128_init(ret, key, block);
void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
    OPENSSL_clear_free(ctx, sizeof(*ctx));
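/*
 * Minimal usage sketch, kept under #if 0 and not built: one-shot
 * AES-128-GCM encryption through this module's API. The key/IV/buffer
 * parameters are caller-supplied placeholders and error handling is
 * reduced to the bare minimum; this illustrates the call sequence, not
 * a hardened example.
 */
#if 0
# include <openssl/aes.h>

static int gcm128_encrypt_example(const unsigned char key[16],
                                  const unsigned char iv[12],
                                  const unsigned char *aad, size_t aadlen,
                                  const unsigned char *pt, unsigned char *ct,
                                  size_t len, unsigned char tag[16])
{
    AES_KEY aes;
    GCM128_CONTEXT *gcm;

    AES_set_encrypt_key(key, 128, &aes);
    if ((gcm = CRYPTO_gcm128_new(&aes, (block128_f)AES_encrypt)) == NULL)
        return -1;

    CRYPTO_gcm128_setiv(gcm, iv, 12);
    if (CRYPTO_gcm128_aad(gcm, aad, aadlen)
        || CRYPTO_gcm128_encrypt(gcm, pt, ct, len)) {
        CRYPTO_gcm128_release(gcm);
        return -1;
    }
    CRYPTO_gcm128_tag(gcm, tag, 16);    /* tag = GHASH output ^ E(K, Y0) */
    CRYPTO_gcm128_release(gcm);
    return 0;
}
#endif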