 * Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License 2.0 (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
#include <openssl/crypto.h>
#include "internal/cryptlib.h"
#include "crypto/modes.h"
#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
# define GETU32(p)      BSWAP4(*(const u32 *)(p))
# define PUTU32(p,v)    *(u32 *)(p) = BSWAP4(v)
/*
 * PACK places a 16-bit constant in the top 16 bits of a size_t, so that
 * the rem_4bit/rem_8bit reduction tables below work for both 32- and
 * 64-bit size_t.
 */
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * REDUCE1BIT multiplies V by x in GF(2^128) (GCM's bit-reflected
 * convention): shift the 128-bit value right by one bit and, if a set
 * bit was shifted out, fold in the reduction polynomial 0xE1 || 0^120.
 */
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^T; \
                u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^((u64)T<<32); \
 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
 * should never be set to 8: 8 is effectively reserved for testing
 * purposes. TABLE_BITS>1 selects the lookup-table-driven
 * implementations referred to as "Shoup's" in the GCM specification,
 * so OpenSSL does not cover the whole spectrum of possible
 * table-driven implementations. Why? In the non-"Shoup's" case the
 * memory access pattern is segmented in such a manner that it's
 * trivial to see that cache-timing information can reveal a fair
 * portion of the intermediate hash value. Given that the ciphertext is
 * always available to an attacker, it's possible to attempt to deduce
 * the secret parameter H and, if successful, to tamper with messages
 * [which is trivial in CTR mode]. In the "Shoup's" case this is not as
 * easy, but there is no reason to believe that it's resistant to
 * cache-timing attacks either. As for the "8-bit" implementation: it
 * consumes 16 (sixteen) times more memory, 4KB per individual key +
 * 1KB shared. On the pro side, it should be twice as fast as the
 * "4-bit" version, and for gcc-generated x86[_64] code the "8-bit"
 * version was observed to run ~75% faster, closer to 100% for
 * commercial compilers... Yet the "4-bit" procedure is preferred,
 * because it's believed to provide a better security-performance
 * balance and adequate all-round performance. "All-round" refers to
 * things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example, on Windows freeing a large
 *   enough block results in VM working-set trimming, meaning that a
 *   subsequent malloc would immediately incur working-set expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even in the same
 *   thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate, for performance reasons.
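/*
 * For reference, below is a table-free, bit-at-a-time GHASH multiply,
 * kept under #if 0 and not built: a minimal sketch of what the lookup
 * tables accelerate, mirroring the TABLE_BITS==1 path (gcm_gmult_1bit)
 * later in this file. Unlike that path, this hypothetical helper
 * assumes Xi and H are already in host byte order.
 */
#if 0
static void gcm_gmult_ref(u64 Xi[2], const u64 H[2])
{
    u128 Z = { 0, 0 };          /* accumulator */
    u128 V;                     /* running H * x^bit, advanced by REDUCE1BIT */
    int i, j;

    V.hi = H[0];
    V.lo = H[1];

    for (i = 0; i < 2; ++i) {
        u64 X = Xi[i];

        for (j = 0; j < 64; ++j, X <<= 1) {
            /* all-ones mask if the current (top) bit of X is set */
            u64 M = 0 - (X >> 63);

            Z.hi ^= V.hi & M;   /* constant-time conditional XOR */
            Z.lo ^= V.lo & M;
            REDUCE1BIT(V);      /* V *= x modulo the GCM polynomial */
        }
    }
    Xi[0] = Z.hi;
    Xi[1] = Z.lo;
}
#endif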
static void gcm_init_8bit(u128 Htable[256], u64 H[2])
    for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
    for (i = 2; i < 256; i <<= 1) {
        u128 *Hi = Htable + i, H0 = *Hi;
        for (j = 1; j < i; ++j) {
            Hi[j].hi = H0.hi ^ Htable[j].hi;
            Hi[j].lo = H0.lo ^ Htable[j].lo;
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
    const u8 *xi = (const u8 *)Xi + 15;
    /*
     * rem_8bit[n] is the reduction of the byte shifted out of Z:
     * its product with the GCM polynomial, PACK()-ed into the top 16
     * bits of a size_t.
     */
    static const size_t rem_8bit[256] = {
        PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
        PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
        PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
        PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
        PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
        PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
        PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
        PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
        PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
        PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
        PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
        PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
        PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
        PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
        PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
        PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
        PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
        PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
        PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
        PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
        PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
        PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
        PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
        PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
        PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
        PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
        PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
        PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
        PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
        PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
        PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
        PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
        PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
        PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
        PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
        PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
        PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
        PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
        PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
        PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
        PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
        PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
        PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
        PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
        PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
        PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
        PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
        PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
        PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
        PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
        PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
        PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
        PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
        PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
        PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
        PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
        PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
        PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
        PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
        PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
        PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
        PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
        PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
        PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
        Z.hi ^= Htable[n].hi;
        Z.lo ^= Htable[n].lo;
        rem = (size_t)Z.lo & 0xff;
        Z.lo = (Z.hi << 56) | (Z.lo >> 8);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_8bit[rem];
            Z.hi ^= (u64)rem_8bit[rem] << 32;
    if (is_endian.little) {
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
        v = (u32)(Z.hi >> 32);
        v = (u32)(Z.lo >> 32);
# define GCM_MUL(ctx)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
# if defined(OPENSSL_SMALL_FOOTPRINT)
# if defined(OPENSSL_SMALL_FOOTPRINT)
    for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
    for (i = 2; i < 16; i <<= 1) {
        u128 *Hi = Htable + i;
        for (V = *Hi, j = 1; j < i; ++j) {
            Hi[j].hi = V.hi ^ Htable[j].hi;
            Hi[j].lo = V.lo ^ Htable[j].lo;
    Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
    Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
    Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
    Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
    Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
    Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
    Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
    Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
    Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
    Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
    Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
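    /*
     * At this point Htable[i] holds i*H in GF(2^128): Htable[8] is H
     * itself, Htable[4], Htable[2] and Htable[1] are successive
     * REDUCE1BIT steps of it, and composite indices are XOR
     * combinations of those, as the unrolled assignments above spell
     * out (e.g. Htable[5] = Htable[4] ^ Htable[1]).
     */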
# if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
    /*
     * The ARM assembler expects a specific dword order in Htable.
     */
    if (is_endian.little)
        for (j = 0; j < 16; ++j) {
        for (j = 0; j < 16; ++j) {
            Htable[j].hi = V.lo << 32 | V.lo >> 32;
            Htable[j].lo = V.hi << 32 | V.hi >> 32;
/*
 * rem_4bit[n] is the reduction of the nibble shifted out of Z:
 * its product with the GCM polynomial, PACK()-ed into the top 16 bits
 * of a size_t.
 */
static const size_t rem_4bit[16] = {
    PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
    PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
    PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
    PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
/*
 * Multiply Xi by H, consuming Xi one nibble at a time (low nibble of
 * the last byte first) and folding the shifted-out bits back in via
 * rem_4bit.
 */
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
    size_t rem, nlo, nhi;
    nlo = ((const u8 *)Xi)[15];
    Z.hi = Htable[nlo].hi;
    Z.lo = Htable[nlo].lo;
        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
            Z.hi ^= (u64)rem_4bit[rem] << 32;
        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        nlo = ((const u8 *)Xi)[cnt];
        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
            Z.hi ^= (u64)rem_4bit[rem] << 32;
        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
    if (is_endian.little) {
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
        v = (u32)(Z.hi >> 32);
        v = (u32)(Z.lo >> 32);
# if !defined(OPENSSL_SMALL_FOOTPRINT)
 * Streamed version of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt
 * for details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here mostly
 * as a reference and a placeholder for possible future non-trivial
 * optimization[s]...
static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
                           const u8 *inp, size_t len)
    size_t rem, nlo, nhi;
        nlo = ((const u8 *)Xi)[15];
        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;
            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
                Z.hi ^= (u64)rem_4bit[rem] << 32;
            Z.hi ^= Htable[nhi].hi;
            Z.lo ^= Htable[nhi].lo;
            nlo = ((const u8 *)Xi)[cnt];
            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
                Z.hi ^= (u64)rem_4bit[rem] << 32;
            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;
         * An extra 256+16 bytes per key plus 512 bytes of shared tables
         * [should] give a ~50% improvement... One could have PACK()-ed
         * rem_8bit even here, but the priority is to minimize
        u128 Hshr4[16];         /* Htable shifted right by 4 bits */
        u8 Hshl4[16];           /* Htable shifted left by 4 bits */
        static const unsigned short rem_8bit[256] = {
            0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
            0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
            0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
            0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
            0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
            0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
            0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
            0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
            0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
            0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
            0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
            0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
            0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
            0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
            0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
            0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
            0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
            0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
            0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
            0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
            0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
            0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
            0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
            0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
            0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
            0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
            0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
            0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
            0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
            0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
            0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
            0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
         * This pre-processing phase slows the procedure down by
         * approximately the same amount of time as it makes each loop
         * spin faster. In other words, single-block performance is
         * approximately the same as for the straightforward "4-bit"
         * implementation, and from there it only gets faster...
        for (cnt = 0; cnt < 16; ++cnt) {
            Z.hi = Htable[cnt].hi;
            Z.lo = Htable[cnt].lo;
            Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
            Hshr4[cnt].hi = (Z.hi >> 4);
            Hshl4[cnt] = (u8)(Z.lo << 4);
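        /*
         * With Hshr4/Hshl4 precomputed, the loop below consumes a full
         * byte of Xi per iteration: the low nibble is looked up in
         * Htable, the high nibble in the pre-shifted Hshr4, and one
         * rem_8bit lookup folds the shifted-out byte back in.
         */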
        for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
            nlo = ((const u8 *)Xi)[cnt];
            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;
            rem = (size_t)Z.lo & 0xff;
            Z.lo = (Z.hi << 56) | (Z.lo >> 8);
            Z.hi ^= Hshr4[nhi].hi;
            Z.lo ^= Hshr4[nhi].lo;
            Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
        nlo = ((const u8 *)Xi)[0];
        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
        if (is_endian.little) {
            Xi[0] = BSWAP8(Z.hi);
            Xi[1] = BSWAP8(Z.lo);
            v = (u32)(Z.hi >> 32);
            v = (u32)(Z.lo >> 32);
    } while (inp += 16, len -= 16);
void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
# define GCM_MUL(ctx)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
# if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#  define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
 * GHASH_CHUNK is a "stride parameter" meant to mitigate cache
 * thrashing. In other words, the idea is to hash the data while it's
 * still in the L1 cache after the encryption pass...
#  define GHASH_CHUNK       (3*1024)
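/*
 * Schematic illustration (kept under #if 0, not built) of the
 * GHASH_CHUNK idea as used by CRYPTO_gcm128_encrypt() below: encrypt
 * one chunk, then hash that same chunk while it is still in L1.
 * ctr_encrypt() is a hypothetical stand-in for the inlined CTR pass.
 */
#  if 0
static void gcm_chunked_encrypt(GCM128_CONTEXT *ctx, const u8 *in,
                                u8 *out, size_t len)
{
    while (len >= GHASH_CHUNK) {
        ctr_encrypt(ctx, in, out, GHASH_CHUNK); /* hypothetical CTR pass */
        GHASH(ctx, out, GHASH_CHUNK);           /* out[] is still cache-hot */
        in += GHASH_CHUNK;
        out += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
}
#  endif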
#else                           /* TABLE_BITS */

static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
    u128 V, Z = { 0, 0 };
    const long *xi = (const long *)Xi;
    V.hi = H[0];                /* H is in host byte order, no byte swapping */
    for (j = 0; j < 16 / sizeof(long); ++j) {
        if (is_endian.little) {
            if (sizeof(long) == 8) {
                X = (long)(BSWAP8(xi[j]));
                const u8 *p = (const u8 *)(xi + j);
                X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
            const u8 *p = (const u8 *)(xi + j);
        for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
            u64 M = (u64)(X >> (8 * sizeof(long) - 1));
    if (is_endian.little) {
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
        v = (u32)(Z.hi >> 32);
        v = (u32)(Z.lo >> 32);
# define GCM_MUL(ctx)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
#if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
# if !defined(I386_ONLY) && \
     (defined(__i386) || defined(__i386__) || \
      defined(__x86_64) || defined(__x86_64__) || \
      defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define gcm_init_avx   gcm_init_clmul
#   define gcm_gmult_avx  gcm_gmult_clmul
#   define gcm_ghash_avx  gcm_ghash_clmul
void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
#  include "arm_arch.h"
#  if __ARM_MAX_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
#   define PMULL_CAPABLE        (OPENSSL_armcap_P & ARMV8_PMULL)
#   if defined(__arm__) || defined(__arm)
#    define NEON_CAPABLE        (OPENSSL_armcap_P & ARMV7_NEON)
void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
# elif defined(__sparc__) || defined(__sparc)
#  include "sparc_arch.h"
#  define GHASH_ASM_SPARC
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_sparcv9cap_P[];
void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
#  include "ppc_arch.h"
#  define GHASH_ASM_PPC
#  define GCM_FUNCREF_4BIT
void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
#ifdef GCM_FUNCREF_4BIT
# define GCM_MUL(ctx)           (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# define GHASH(ctx,in,len)      (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
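/*
 * With GCM_FUNCREF_4BIT, GCM_MUL() and GHASH() dispatch through the
 * gmult/ghash function pointers stored in the context, which lets
 * CRYPTO_gcm128_init() below pick a CPU-specific implementation
 * (CLMUL/AVX, NEON/PMULL, VIS3, POWER8, ...) at run time.
 */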
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
    memset(ctx, 0, sizeof(*ctx));
    /* H = E(K, 0^128); ctx->H.c is all-zero after the memset above */
    (*block) (ctx->H.c, ctx->H.c, key);
    if (is_endian.little) {
        /* H is stored in host byte order */
        ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
        ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
        hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
        lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
    gcm_init_8bit(ctx->Htable, ctx->H.u);
# if defined(GHASH)
#  define CTX__GHASH(f) (ctx->ghash = (f))
#  define CTX__GHASH(f) (ctx->ghash = NULL)
# if defined(GHASH_ASM_X86_OR_64)
#  if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
        if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
            gcm_init_avx(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_avx;
            CTX__GHASH(gcm_ghash_avx);
            gcm_init_clmul(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_clmul;
            CTX__GHASH(gcm_ghash_clmul);
    gcm_init_4bit(ctx->Htable, ctx->H.u);
#  if defined(GHASH_ASM_X86)    /* x86 only */
#   if defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
    if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
        ctx->gmult = gcm_gmult_4bit_mmx;
        CTX__GHASH(gcm_ghash_4bit_mmx);
        ctx->gmult = gcm_gmult_4bit_x86;
        CTX__GHASH(gcm_ghash_4bit_x86);
    ctx->gmult = gcm_gmult_4bit;
    CTX__GHASH(gcm_ghash_4bit);
# elif defined(GHASH_ASM_ARM)
#  ifdef PMULL_CAPABLE
        gcm_init_v8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_v8;
        CTX__GHASH(gcm_ghash_v8);
        gcm_init_neon(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_neon;
        CTX__GHASH(gcm_ghash_neon);
    gcm_init_4bit(ctx->Htable, ctx->H.u);
    ctx->gmult = gcm_gmult_4bit;
    CTX__GHASH(gcm_ghash_4bit);
# elif defined(GHASH_ASM_SPARC)
    if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
        gcm_init_vis3(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_vis3;
        CTX__GHASH(gcm_ghash_vis3);
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
# elif defined(GHASH_ASM_PPC)
    if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
        gcm_init_p8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_p8;
        CTX__GHASH(gcm_ghash_p8);
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    gcm_init_4bit(ctx->Htable, ctx->H.u);
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
    ctx->len.u[0] = 0;          /* AAD length */
    ctx->len.u[1] = 0;          /* message length */
        memcpy(ctx->Yi.c, iv, 12);
        /* Borrow ctx->Xi to calculate initial Yi */
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= iv[i];
            for (i = 0; i < len; ++i)
                ctx->Xi.c[i] ^= iv[i];
        if (is_endian.little) {
            ctx->Xi.u[1] ^= BSWAP8(len0);
            ctx->Xi.c[8] ^= (u8)(len0 >> 56);
            ctx->Xi.c[9] ^= (u8)(len0 >> 48);
            ctx->Xi.c[10] ^= (u8)(len0 >> 40);
            ctx->Xi.c[11] ^= (u8)(len0 >> 32);
            ctx->Xi.c[12] ^= (u8)(len0 >> 24);
            ctx->Xi.c[13] ^= (u8)(len0 >> 16);
            ctx->Xi.c[14] ^= (u8)(len0 >> 8);
            ctx->Xi.c[15] ^= (u8)(len0);
            ctx->Xi.u[1] ^= len0;
        if (is_endian.little)
            ctr = BSWAP4(ctx->Xi.d[3]);
            ctr = GETU32(ctx->Xi.c + 12);
        /* Copy borrowed Xi to Yi */
        ctx->Yi.u[0] = ctx->Xi.u[0];
        ctx->Yi.u[1] = ctx->Xi.u[1];
    (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
    if (is_endian.little)
        ctx->Yi.d[3] = BSWAP4(ctr);
        PUTU32(ctx->Yi.c + 12, ctr);
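    /*
     * Per NIST SP 800-38D: a 96-bit IV is used as Y0 directly, with the
     * 32-bit counter set to 1 (and then incremented for the first data
     * block, as above); any other IV length is first GHASHed together
     * with a 128-bit length block to derive Y0.
     */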
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
    u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
    /* the AAD is limited to 2^64 bits, i.e. 2^61 bytes */
    if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
    ctx->len.u[0] = alen;
            ctx->Xi.c[n] ^= *(aad++);
    if ((i = (len & (size_t)-16))) {
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= aad[i];
        n = (unsigned int)len;
        for (i = 0; i < len; ++i)
            ctx->Xi.c[i] ^= aad[i];
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
    unsigned int n, ctr, mres;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
    /* the message is limited to 2^39-256 bits, i.e. 2^36-32 bytes */
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
    ctx->len.u[1] = mlen;
        /* First call to encrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        mres = sizeof(ctx->Xi);
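        /*
         * ctx->Xn queues not-yet-hashed input so that whole runs can be
         * hashed with a single streamed GHASH() call: the pending
         * Xi^AAD block is moved into the queue here, and mres counts
         * the queued bytes from now on.
         */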
    if (is_endian.little)
        ctr = BSWAP4(ctx->Yi.d[3]);
        ctr = GETU32(ctx->Yi.c + 12);
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
                ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
            GHASH(ctx, ctx->Xn, mres);
                ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
# if defined(STRICT_ALIGNMENT)
        if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
        if (len >= 16 && mres) {
            GHASH(ctx, ctx->Xn, mres);
# if defined(GHASH_CHUNK)
        while (len >= GHASH_CHUNK) {
            size_t j = GHASH_CHUNK;
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                if (is_endian.little)
                    ctx->Yi.d[3] = BSWAP4(ctr);
                    PUTU32(ctx->Yi.c + 12, ctr);
                for (i = 0; i < 16 / sizeof(size_t); ++i)
                    out_t[i] = in_t[i] ^ ctx->EKi.t[i];
            GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
        if ((i = (len & (size_t)-16))) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                if (is_endian.little)
                    ctx->Yi.d[3] = BSWAP4(ctr);
                    PUTU32(ctx->Yi.c + 12, ctr);
                for (i = 0; i < 16 / sizeof(size_t); ++i)
                    out_t[i] = in_t[i] ^ ctx->EKi.t[i];
            GHASH(ctx, out - j, j);
            size_t *out_t = (size_t *)out;
            const size_t *in_t = (const size_t *)in;
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            if (is_endian.little)
                ctx->Yi.d[3] = BSWAP4(ctr);
                PUTU32(ctx->Yi.c + 12, ctr);
            for (i = 0; i < 16 / sizeof(size_t); ++i)
                ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            if (is_endian.little)
                ctx->Yi.d[3] = BSWAP4(ctr);
                PUTU32(ctx->Yi.c + 12, ctr);
                ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
                ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
    for (i = 0; i < len; ++i) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            if (is_endian.little)
                ctx->Yi.d[3] = BSWAP4(ctr);
                PUTU32(ctx->Yi.c + 12, ctr);
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        ctx->Xn[mres++] = out[i] = in[i] ^ ctx->EKi.c[n];
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, sizeof(ctx->Xn));
        ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
        mres = n = (n + 1) % 16;
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
    ctx->len.u[1] = mlen;
        /* First call to decrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        mres = sizeof(ctx->Xi);
    if (is_endian.little)
        ctr = BSWAP4(ctx->Yi.d[3]);
        ctr = GETU32(ctx->Yi.c + 12);
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
                *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
            GHASH(ctx, ctx->Xn, mres);
                *(out++) = c ^ ctx->EKi.c[n];
# if defined(STRICT_ALIGNMENT)
        if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
        if (len >= 16 && mres) {
            GHASH(ctx, ctx->Xn, mres);
# if defined(GHASH_CHUNK)
        while (len >= GHASH_CHUNK) {
            size_t j = GHASH_CHUNK;
            GHASH(ctx, in, GHASH_CHUNK);
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                if (is_endian.little)
                    ctx->Yi.d[3] = BSWAP4(ctr);
                    PUTU32(ctx->Yi.c + 12, ctr);
                for (i = 0; i < 16 / sizeof(size_t); ++i)
                    out_t[i] = in_t[i] ^ ctx->EKi.t[i];
        if ((i = (len & (size_t)-16))) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                if (is_endian.little)
                    ctx->Yi.d[3] = BSWAP4(ctr);
                    PUTU32(ctx->Yi.c + 12, ctr);
                for (i = 0; i < 16 / sizeof(size_t); ++i)
                    out_t[i] = in_t[i] ^ ctx->EKi.t[i];
            size_t *out_t = (size_t *)out;
            const size_t *in_t = (const size_t *)in;
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            if (is_endian.little)
                ctx->Yi.d[3] = BSWAP4(ctr);
                PUTU32(ctx->Yi.c + 12, ctr);
            for (i = 0; i < 16 / sizeof(size_t); ++i) {
                out_t[i] = c ^ ctx->EKi.t[i];
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            if (is_endian.little)
                ctx->Yi.d[3] = BSWAP4(ctr);
                PUTU32(ctx->Yi.c + 12, ctr);
                out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
                out[n] = c ^ ctx->EKi.c[n];
    for (i = 0; i < len; ++i) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            if (is_endian.little)
                ctx->Yi.d[3] = BSWAP4(ctr);
                PUTU32(ctx->Yi.c + 12, ctr);
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        out[i] = (ctx->Xn[mres++] = c = in[i]) ^ ctx->EKi.c[n];
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, sizeof(ctx->Xn));
        out[i] = c ^ ctx->EKi.c[n];
        mres = n = (n + 1) % 16;
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
#if defined(OPENSSL_SMALL_FOOTPRINT)
    return CRYPTO_gcm128_encrypt(ctx, in, out, len);
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
    ctx->len.u[1] = mlen;
        /* First call to encrypt finalizes GHASH(AAD) */
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        mres = sizeof(ctx->Xi);
    if (is_endian.little)
        ctr = BSWAP4(ctx->Yi.d[3]);
        ctr = GETU32(ctx->Yi.c + 12);
            ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
        GHASH(ctx, ctx->Xn, mres);
            ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
    if (len >= 16 && mres) {
        GHASH(ctx, ctx->Xn, mres);
# if defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (is_endian.little)
            ctx->Yi.d[3] = BSWAP4(ctr);
            PUTU32(ctx->Yi.c + 12, ctr);
        GHASH(ctx, out, GHASH_CHUNK);
    if ((i = (len & (size_t)-16))) {
        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
            ctx->Yi.d[3] = BSWAP4(ctr);
            PUTU32(ctx->Yi.c + 12, ctr);
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= out[i];
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        if (is_endian.little)
            ctx->Yi.d[3] = BSWAP4(ctr);
            PUTU32(ctx->Yi.c + 12, ctr);
            ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
            ctx->Xi.c[mres++] ^= out[n] = in[n] ^ ctx->EKi.c[n];
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
#if defined(OPENSSL_SMALL_FOOTPRINT)
    return CRYPTO_gcm128_decrypt(ctx, in, out, len);
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
    ctx->len.u[1] = mlen;
        /* First call to decrypt finalizes GHASH(AAD) */
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        mres = sizeof(ctx->Xi);
    if (is_endian.little)
        ctr = BSWAP4(ctx->Yi.d[3]);
        ctr = GETU32(ctx->Yi.c + 12);
            *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
        GHASH(ctx, ctx->Xn, mres);
            *(out++) = c ^ ctx->EKi.c[n];
    if (len >= 16 && mres) {
        GHASH(ctx, ctx->Xn, mres);
# if defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        GHASH(ctx, in, GHASH_CHUNK);
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (is_endian.little)
            ctx->Yi.d[3] = BSWAP4(ctr);
            PUTU32(ctx->Yi.c + 12, ctr);
    if ((i = (len & (size_t)-16))) {
            for (k = 0; k < 16; ++k)
                ctx->Xi.c[k] ^= in[k];
        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
            ctx->Yi.d[3] = BSWAP4(ctr);
            PUTU32(ctx->Yi.c + 12, ctr);
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        if (is_endian.little)
            ctx->Yi.d[3] = BSWAP4(ctr);
            PUTU32(ctx->Yi.c + 12, ctr);
            out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
            ctx->Xi.c[mres++] ^= c;
            out[n] = c ^ ctx->EKi.c[n];
int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
    } is_endian = { 1 };
    u64 alen = ctx->len.u[0] << 3;      /* AAD length in bits */
    u64 clen = ctx->len.u[1] << 3;      /* message length in bits */
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    unsigned int mres = ctx->mres;
        /* round the queued byte count up to a whole number of blocks */
        unsigned blocks = (mres + 15) & -16;
        memset(ctx->Xn + mres, 0, blocks - mres);
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, mres);
    } else if (ctx->ares) {
    if (ctx->mres || ctx->ares)
    if (is_endian.little) {
        alen = BSWAP8(alen);
        clen = BSWAP8(clen);
        ctx->len.u[0] = alen;
        ctx->len.u[1] = clen;
        alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
        clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        memcpy(ctx->Xn + mres, &bitlen, sizeof(bitlen));
        mres += sizeof(bitlen);
        GHASH(ctx, ctx->Xn, mres);
    ctx->Xi.u[0] ^= alen;
    ctx->Xi.u[1] ^= clen;
    /* T = GHASH(A || C || len(A) || len(C)) ^ E(K, Y0) */
    ctx->Xi.u[0] ^= ctx->EK0.u[0];
    ctx->Xi.u[1] ^= ctx->EK0.u[1];
    if (tag && len <= sizeof(ctx->Xi))
        return CRYPTO_memcmp(ctx->Xi.c, tag, len);
void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
    CRYPTO_gcm128_finish(ctx, NULL, 0);
    memcpy(tag, ctx->Xi.c,
           len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
    GCM128_CONTEXT *ret;
    if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
        CRYPTO_gcm128_init(ret, key, block);
void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
    OPENSSL_clear_free(ctx, sizeof(*ctx));
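/*
 * Minimal usage sketch, kept under #if 0 and not built: one-shot
 * AES-128-GCM encryption through this module's API. The key/IV/buffer
 * parameters are caller-supplied placeholders and error handling is
 * reduced to the bare minimum; this illustrates the call sequence, not
 * a hardened example.
 */
#if 0
# include <openssl/aes.h>

static int gcm128_encrypt_example(const unsigned char key[16],
                                  const unsigned char iv[12],
                                  const unsigned char *aad, size_t aadlen,
                                  const unsigned char *pt, unsigned char *ct,
                                  size_t len, unsigned char tag[16])
{
    AES_KEY aes;
    GCM128_CONTEXT *gcm;

    AES_set_encrypt_key(key, 128, &aes);
    if ((gcm = CRYPTO_gcm128_new(&aes, (block128_f)AES_encrypt)) == NULL)
        return -1;

    CRYPTO_gcm128_setiv(gcm, iv, 12);
    if (CRYPTO_gcm128_aad(gcm, aad, aadlen)
        || CRYPTO_gcm128_encrypt(gcm, pt, ct, len)) {
        CRYPTO_gcm128_release(gcm);
        return -1;
    }
    CRYPTO_gcm128_tag(gcm, tag, 16);    /* tag = GHASH output ^ E(K, Y0) */
    CRYPTO_gcm128_release(gcm);
    return 0;
}
#endif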