2 * Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
4 * Licensed under the OpenSSL license (the "License"). You may not use
5 * this file except in compliance with the License. You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
10 #include <openssl/crypto.h>
11 #include "modes_lcl.h"
14 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
15 /* redefine, because alignment is ensured */
17 # define GETU32(p) BSWAP4(*(const u32 *)(p))
19 # define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
/*
 * PACK(s): convert |s| to size_t and shift it left by (bits in size_t - 16),
 * so that a 16-bit constant ends up in the topmost 16 bits of a size_t
 * (the bit width is computed as sizeof(size_t)*8).
 * The rem_4bit/rem_8bit tables built with it are XORed into Z.hi directly
 * when sizeof(size_t)==8, and after a further "(u64)... << 32" otherwise.
 */
22 #define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
23 #define REDUCE1BIT(V) do { \
24 if (sizeof(size_t)==8) { \
25 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
26 V.lo = (V.hi<<63)|(V.lo>>1); \
27 V.hi = (V.hi>>1 )^T; \
30 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
31 V.lo = (V.hi<<63)|(V.lo>>1); \
32 V.hi = (V.hi>>1 )^((u64)T<<32); \
37 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
38 * never be set to 8. 8 is effectively reserved for testing purposes.
39 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
40 * "Shoup's" in GCM specification. In other words OpenSSL does not cover
41 * whole spectrum of possible table driven implementations. Why? In
42 * non-"Shoup's" case memory access pattern is segmented in such manner,
43 * that it's trivial to see that cache timing information can reveal
44 * a fair portion of the intermediate hash value. Given that ciphertext is
45 * always available to an attacker, it's possible for them to attempt to
46 * deduce the secret parameter H and, if successful, tamper with messages
47 * [which is trivial in CTR mode]. In the "Shoup's" case it's
48 * not as trivial, but there is no reason to believe that it's resistant
49 * to cache-timing attacks. And the thing about the "8-bit" implementation is
50 * that it consumes 16 (sixteen) times more memory, 4KB per individual
51 * key + 1KB shared. Well, on pros side it should be twice as fast as
52 * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
53 * was observed to run ~75% faster, closer to 100% for commercial
54 * compilers... Yet "4-bit" procedure is preferred, because it's
55 * believed to provide better security-performance balance and adequate
56 * all-round performance. "All-round" refers to things like:
58 * - shorter setup time effectively improves overall timing for
59 * handling short messages;
60 * - larger table allocation can become unbearable because of VM
61 * subsystem penalties (for example, on Windows a large enough free
62 * results in VM working set trimming, meaning that a subsequent
63 * malloc would immediately incur working set expansion);
64 * - larger table has larger cache footprint, which can affect
65 * performance of other code paths (not necessarily even from same
66 * thread in Hyper-Threading world);
68 * Value of 1 is not appropriate for performance reasons.
72 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
82 for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
87 for (i = 2; i < 256; i <<= 1) {
88 u128 *Hi = Htable + i, H0 = *Hi;
89 for (j = 1; j < i; ++j) {
90 Hi[j].hi = H0.hi ^ Htable[j].hi;
91 Hi[j].lo = H0.lo ^ Htable[j].lo;
96 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
99 const u8 *xi = (const u8 *)Xi + 15;
105 static const size_t rem_8bit[256] = {
106 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
107 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
108 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
109 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
110 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
111 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
112 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
113 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
114 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
115 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
116 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
117 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
118 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
119 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
120 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
121 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
122 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
123 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
124 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
125 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
126 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
127 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
128 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
129 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
130 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
131 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
132 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
133 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
134 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
135 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
136 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
137 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
138 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
139 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
140 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
141 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
142 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
143 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
144 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
145 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
146 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
147 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
148 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
149 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
150 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
151 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
152 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
153 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
154 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
155 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
156 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
157 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
158 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
159 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
160 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
161 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
162 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
163 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
164 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
165 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
166 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
167 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
168 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
169 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
173 Z.hi ^= Htable[n].hi;
174 Z.lo ^= Htable[n].lo;
181 rem = (size_t)Z.lo & 0xff;
182 Z.lo = (Z.hi << 56) | (Z.lo >> 8);
184 if (sizeof(size_t) == 8)
185 Z.hi ^= rem_8bit[rem];
187 Z.hi ^= (u64)rem_8bit[rem] << 32;
190 if (is_endian.little) {
192 Xi[0] = BSWAP8(Z.hi);
193 Xi[1] = BSWAP8(Z.lo);
197 v = (u32)(Z.hi >> 32);
201 v = (u32)(Z.lo >> 32);
212 # define GCM_MUL(ctx) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
216 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
219 # if defined(OPENSSL_SMALL_FOOTPRINT)
228 # if defined(OPENSSL_SMALL_FOOTPRINT)
229 for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
234 for (i = 2; i < 16; i <<= 1) {
235 u128 *Hi = Htable + i;
237 for (V = *Hi, j = 1; j < i; ++j) {
238 Hi[j].hi = V.hi ^ Htable[j].hi;
239 Hi[j].lo = V.lo ^ Htable[j].lo;
250 Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
252 Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
253 Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
254 Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
256 Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
257 Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
258 Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
259 Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
260 Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
261 Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
262 Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
264 # if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
266 * ARM assembler expects specific dword order in Htable.
275 if (is_endian.little)
276 for (j = 0; j < 16; ++j) {
281 for (j = 0; j < 16; ++j) {
283 Htable[j].hi = V.lo << 32 | V.lo >> 32;
284 Htable[j].lo = V.hi << 32 | V.hi >> 32;
291 static const size_t rem_4bit[16] = {
292 PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
293 PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
294 PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
295 PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
298 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
302 size_t rem, nlo, nhi;
308 nlo = ((const u8 *)Xi)[15];
312 Z.hi = Htable[nlo].hi;
313 Z.lo = Htable[nlo].lo;
316 rem = (size_t)Z.lo & 0xf;
317 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
319 if (sizeof(size_t) == 8)
320 Z.hi ^= rem_4bit[rem];
322 Z.hi ^= (u64)rem_4bit[rem] << 32;
324 Z.hi ^= Htable[nhi].hi;
325 Z.lo ^= Htable[nhi].lo;
330 nlo = ((const u8 *)Xi)[cnt];
334 rem = (size_t)Z.lo & 0xf;
335 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
337 if (sizeof(size_t) == 8)
338 Z.hi ^= rem_4bit[rem];
340 Z.hi ^= (u64)rem_4bit[rem] << 32;
342 Z.hi ^= Htable[nlo].hi;
343 Z.lo ^= Htable[nlo].lo;
346 if (is_endian.little) {
348 Xi[0] = BSWAP8(Z.hi);
349 Xi[1] = BSWAP8(Z.lo);
353 v = (u32)(Z.hi >> 32);
357 v = (u32)(Z.lo >> 32);
368 # if !defined(OPENSSL_SMALL_FOOTPRINT)
370 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
371 * details... Compiler-generated code doesn't seem to give any
372 * performance improvement, at least not on x86[_64]. It's here
373 * mostly as reference and a placeholder for possible future
374 * non-trivial optimization[s]...
376 static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
377 const u8 *inp, size_t len)
381 size_t rem, nlo, nhi;
390 nlo = ((const u8 *)Xi)[15];
395 Z.hi = Htable[nlo].hi;
396 Z.lo = Htable[nlo].lo;
399 rem = (size_t)Z.lo & 0xf;
400 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
402 if (sizeof(size_t) == 8)
403 Z.hi ^= rem_4bit[rem];
405 Z.hi ^= (u64)rem_4bit[rem] << 32;
407 Z.hi ^= Htable[nhi].hi;
408 Z.lo ^= Htable[nhi].lo;
413 nlo = ((const u8 *)Xi)[cnt];
418 rem = (size_t)Z.lo & 0xf;
419 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
421 if (sizeof(size_t) == 8)
422 Z.hi ^= rem_4bit[rem];
424 Z.hi ^= (u64)rem_4bit[rem] << 32;
426 Z.hi ^= Htable[nlo].hi;
427 Z.lo ^= Htable[nlo].lo;
431 * Extra 256+16 bytes per-key plus 512 bytes shared tables
432 * [should] give ~50% improvement... One could have PACK()-ed
433 * the rem_8bit even here, but the priority is to minimize
436 u128 Hshr4[16]; /* Htable shifted right by 4 bits */
437 u8 Hshl4[16]; /* Htable shifted left by 4 bits */
438 static const unsigned short rem_8bit[256] = {
439 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
440 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
441 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
442 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
443 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
444 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
445 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
446 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
447 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
448 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
449 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
450 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
451 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
452 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
453 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
454 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
455 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
456 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
457 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
458 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
459 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
460 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
461 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
462 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
463 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
464 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
465 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
466 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
467 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
468 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
469 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
470 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
473 * This pre-processing phase slows down procedure by approximately
474 * same time as it makes each loop spin faster. In other words
475 * single block performance is approximately same as straightforward
476 * "4-bit" implementation, and then it goes only faster...
478 for (cnt = 0; cnt < 16; ++cnt) {
479 Z.hi = Htable[cnt].hi;
480 Z.lo = Htable[cnt].lo;
481 Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
482 Hshr4[cnt].hi = (Z.hi >> 4);
483 Hshl4[cnt] = (u8)(Z.lo << 4);
487 for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
488 nlo = ((const u8 *)Xi)[cnt];
493 Z.hi ^= Htable[nlo].hi;
494 Z.lo ^= Htable[nlo].lo;
496 rem = (size_t)Z.lo & 0xff;
498 Z.lo = (Z.hi << 56) | (Z.lo >> 8);
501 Z.hi ^= Hshr4[nhi].hi;
502 Z.lo ^= Hshr4[nhi].lo;
503 Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
506 nlo = ((const u8 *)Xi)[0];
511 Z.hi ^= Htable[nlo].hi;
512 Z.lo ^= Htable[nlo].lo;
514 rem = (size_t)Z.lo & 0xf;
516 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
519 Z.hi ^= Htable[nhi].hi;
520 Z.lo ^= Htable[nhi].lo;
521 Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
524 if (is_endian.little) {
526 Xi[0] = BSWAP8(Z.hi);
527 Xi[1] = BSWAP8(Z.lo);
531 v = (u32)(Z.hi >> 32);
535 v = (u32)(Z.lo >> 32);
544 } while (inp += 16, len -= 16);
548 void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
549 void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
553 # define GCM_MUL(ctx) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
554 # if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
555 # define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
557 * GHASH_CHUNK is a "stride parameter" intended to mitigate the cache thrashing
558 * effect. In other words, the idea is to hash data while it's still in L1 cache
559 * after the encryption pass...
561 # define GHASH_CHUNK (3*1024)
564 #else /* TABLE_BITS */
566 static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
568 u128 V, Z = { 0, 0 };
571 const long *xi = (const long *)Xi;
577 V.hi = H[0]; /* H is in host byte order, no byte swapping */
580 for (j = 0; j < 16 / sizeof(long); ++j) {
581 if (is_endian.little) {
582 if (sizeof(long) == 8) {
584 X = (long)(BSWAP8(xi[j]));
586 const u8 *p = (const u8 *)(xi + j);
587 X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
590 const u8 *p = (const u8 *)(xi + j);
596 for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
597 u64 M = (u64)(X >> (8 * sizeof(long) - 1));
605 if (is_endian.little) {
607 Xi[0] = BSWAP8(Z.hi);
608 Xi[1] = BSWAP8(Z.lo);
612 v = (u32)(Z.hi >> 32);
616 v = (u32)(Z.lo >> 32);
627 # define GCM_MUL(ctx) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
631 #if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
632 # if !defined(I386_ONLY) && \
633 (defined(__i386) || defined(__i386__) || \
634 defined(__x86_64) || defined(__x86_64__) || \
635 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
636 # define GHASH_ASM_X86_OR_64
637 # define GCM_FUNCREF_4BIT
638 extern unsigned int OPENSSL_ia32cap_P[];
640 void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
641 void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
642 void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
645 # if defined(__i386) || defined(__i386__) || defined(_M_IX86)
646 # define gcm_init_avx gcm_init_clmul
647 # define gcm_gmult_avx gcm_gmult_clmul
648 # define gcm_ghash_avx gcm_ghash_clmul
650 void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
651 void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
652 void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
656 # if defined(__i386) || defined(__i386__) || defined(_M_IX86)
657 # define GHASH_ASM_X86
658 void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
659 void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
662 void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
663 void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
666 # elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
667 # include "arm_arch.h"
668 # if __ARM_MAX_ARCH__>=7
669 # define GHASH_ASM_ARM
670 # define GCM_FUNCREF_4BIT
671 # define PMULL_CAPABLE (OPENSSL_armcap_P & ARMV8_PMULL)
672 # if defined(__arm__) || defined(__arm)
673 # define NEON_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
675 void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
676 void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
677 void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
679 void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
680 void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
681 void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
684 # elif defined(__sparc__) || defined(__sparc)
685 # include "sparc_arch.h"
686 # define GHASH_ASM_SPARC
687 # define GCM_FUNCREF_4BIT
688 extern unsigned int OPENSSL_sparcv9cap_P[];
689 void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
690 void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
691 void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
693 # elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
694 # include "ppc_arch.h"
695 # define GHASH_ASM_PPC
696 # define GCM_FUNCREF_4BIT
697 void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
698 void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
699 void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
704 #ifdef GCM_FUNCREF_4BIT
706 # define GCM_MUL(ctx) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
709 # define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
713 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
720 memset(ctx, 0, sizeof(*ctx));
724 (*block) (ctx->H.c, ctx->H.c, key);
726 if (is_endian.little) {
727 /* H is stored in host byte order */
729 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
730 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
734 hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
735 lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
741 gcm_init_8bit(ctx->Htable, ctx->H.u);
744 # define CTX__GHASH(f) (ctx->ghash = (f))
746 # define CTX__GHASH(f) (ctx->ghash = NULL)
748 # if defined(GHASH_ASM_X86_OR_64)
749 # if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
750 if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
751 if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
752 gcm_init_avx(ctx->Htable, ctx->H.u);
753 ctx->gmult = gcm_gmult_avx;
754 CTX__GHASH(gcm_ghash_avx);
756 gcm_init_clmul(ctx->Htable, ctx->H.u);
757 ctx->gmult = gcm_gmult_clmul;
758 CTX__GHASH(gcm_ghash_clmul);
763 gcm_init_4bit(ctx->Htable, ctx->H.u);
764 # if defined(GHASH_ASM_X86) /* x86 only */
765 # if defined(OPENSSL_IA32_SSE2)
766 if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
768 if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
770 ctx->gmult = gcm_gmult_4bit_mmx;
771 CTX__GHASH(gcm_ghash_4bit_mmx);
773 ctx->gmult = gcm_gmult_4bit_x86;
774 CTX__GHASH(gcm_ghash_4bit_x86);
777 ctx->gmult = gcm_gmult_4bit;
778 CTX__GHASH(gcm_ghash_4bit);
780 # elif defined(GHASH_ASM_ARM)
781 # ifdef PMULL_CAPABLE
783 gcm_init_v8(ctx->Htable, ctx->H.u);
784 ctx->gmult = gcm_gmult_v8;
785 CTX__GHASH(gcm_ghash_v8);
790 gcm_init_neon(ctx->Htable, ctx->H.u);
791 ctx->gmult = gcm_gmult_neon;
792 CTX__GHASH(gcm_ghash_neon);
796 gcm_init_4bit(ctx->Htable, ctx->H.u);
797 ctx->gmult = gcm_gmult_4bit;
798 CTX__GHASH(gcm_ghash_4bit);
800 # elif defined(GHASH_ASM_SPARC)
801 if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
802 gcm_init_vis3(ctx->Htable, ctx->H.u);
803 ctx->gmult = gcm_gmult_vis3;
804 CTX__GHASH(gcm_ghash_vis3);
806 gcm_init_4bit(ctx->Htable, ctx->H.u);
807 ctx->gmult = gcm_gmult_4bit;
808 CTX__GHASH(gcm_ghash_4bit);
810 # elif defined(GHASH_ASM_PPC)
811 if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
812 gcm_init_p8(ctx->Htable, ctx->H.u);
813 ctx->gmult = gcm_gmult_p8;
814 CTX__GHASH(gcm_ghash_p8);
816 gcm_init_4bit(ctx->Htable, ctx->H.u);
817 ctx->gmult = gcm_gmult_4bit;
818 CTX__GHASH(gcm_ghash_4bit);
821 gcm_init_4bit(ctx->Htable, ctx->H.u);
827 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
835 #ifdef GCM_FUNCREF_4BIT
836 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
839 ctx->len.u[0] = 0; /* AAD length */
840 ctx->len.u[1] = 0; /* message length */
845 memcpy(ctx->Yi.c, iv, 12);
855 /* Borrow ctx->Xi to calculate initial Yi */
860 for (i = 0; i < 16; ++i)
861 ctx->Xi.c[i] ^= iv[i];
867 for (i = 0; i < len; ++i)
868 ctx->Xi.c[i] ^= iv[i];
872 if (is_endian.little) {
874 ctx->Xi.u[1] ^= BSWAP8(len0);
876 ctx->Xi.c[8] ^= (u8)(len0 >> 56);
877 ctx->Xi.c[9] ^= (u8)(len0 >> 48);
878 ctx->Xi.c[10] ^= (u8)(len0 >> 40);
879 ctx->Xi.c[11] ^= (u8)(len0 >> 32);
880 ctx->Xi.c[12] ^= (u8)(len0 >> 24);
881 ctx->Xi.c[13] ^= (u8)(len0 >> 16);
882 ctx->Xi.c[14] ^= (u8)(len0 >> 8);
883 ctx->Xi.c[15] ^= (u8)(len0);
886 ctx->Xi.u[1] ^= len0;
891 if (is_endian.little)
893 ctr = BSWAP4(ctx->Xi.d[3]);
895 ctr = GETU32(ctx->Xi.c + 12);
900 /* Copy borrowed Xi to Yi */
901 ctx->Yi.u[0] = ctx->Xi.u[0];
902 ctx->Yi.u[1] = ctx->Xi.u[1];
908 (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
910 if (is_endian.little)
912 ctx->Yi.d[3] = BSWAP4(ctr);
914 PUTU32(ctx->Yi.c + 12, ctr);
920 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
925 u64 alen = ctx->len.u[0];
926 #ifdef GCM_FUNCREF_4BIT
927 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
929 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
930 const u8 *inp, size_t len) = ctx->ghash;
938 if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
940 ctx->len.u[0] = alen;
945 ctx->Xi.c[n] ^= *(aad++);
957 if ((i = (len & (size_t)-16))) {
964 for (i = 0; i < 16; ++i)
965 ctx->Xi.c[i] ^= aad[i];
972 n = (unsigned int)len;
973 for (i = 0; i < len; ++i)
974 ctx->Xi.c[i] ^= aad[i];
981 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
982 const unsigned char *in, unsigned char *out,
991 u64 mlen = ctx->len.u[1];
992 block128_f block = ctx->block;
993 void *key = ctx->key;
994 #ifdef GCM_FUNCREF_4BIT
995 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
996 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
997 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
998 const u8 *inp, size_t len) = ctx->ghash;
1003 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1005 ctx->len.u[1] = mlen;
1008 /* First call to encrypt finalizes GHASH(AAD) */
1013 if (is_endian.little)
1015 ctr = BSWAP4(ctx->Yi.d[3]);
1017 ctr = GETU32(ctx->Yi.c + 12);
1023 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1024 if (16 % sizeof(size_t) == 0) { /* always true actually */
1028 ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1039 # if defined(STRICT_ALIGNMENT)
1040 if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1044 # if defined(GHASH_CHUNK)
1045 while (len >= GHASH_CHUNK) {
1046 size_t j = GHASH_CHUNK;
1049 size_t *out_t = (size_t *)out;
1050 const size_t *in_t = (const size_t *)in;
1052 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1054 if (is_endian.little)
1056 ctx->Yi.d[3] = BSWAP4(ctr);
1058 PUTU32(ctx->Yi.c + 12, ctr);
1062 for (i = 0; i < 16 / sizeof(size_t); ++i)
1063 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1068 GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
1072 if ((i = (len & (size_t)-16))) {
1076 size_t *out_t = (size_t *)out;
1077 const size_t *in_t = (const size_t *)in;
1079 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1081 if (is_endian.little)
1083 ctx->Yi.d[3] = BSWAP4(ctr);
1085 PUTU32(ctx->Yi.c + 12, ctr);
1089 for (i = 0; i < 16 / sizeof(size_t); ++i)
1090 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1095 GHASH(ctx, out - j, j);
1099 size_t *out_t = (size_t *)out;
1100 const size_t *in_t = (const size_t *)in;
1102 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1104 if (is_endian.little)
1106 ctx->Yi.d[3] = BSWAP4(ctr);
1108 PUTU32(ctx->Yi.c + 12, ctr);
1112 for (i = 0; i < 16 / sizeof(size_t); ++i)
1113 ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1121 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1123 if (is_endian.little)
1125 ctx->Yi.d[3] = BSWAP4(ctr);
1127 PUTU32(ctx->Yi.c + 12, ctr);
1132 ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1142 for (i = 0; i < len; ++i) {
1144 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1146 if (is_endian.little)
1148 ctx->Yi.d[3] = BSWAP4(ctr);
1150 PUTU32(ctx->Yi.c + 12, ctr);
1155 ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
1165 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1166 const unsigned char *in, unsigned char *out,
1172 } is_endian = { 1 };
1173 unsigned int n, ctr;
1175 u64 mlen = ctx->len.u[1];
1176 block128_f block = ctx->block;
1177 void *key = ctx->key;
1178 #ifdef GCM_FUNCREF_4BIT
1179 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1180 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1181 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1182 const u8 *inp, size_t len) = ctx->ghash;
1187 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1189 ctx->len.u[1] = mlen;
1192 /* First call to decrypt finalizes GHASH(AAD) */
1197 if (is_endian.little)
1199 ctr = BSWAP4(ctx->Yi.d[3]);
1201 ctr = GETU32(ctx->Yi.c + 12);
1207 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1208 if (16 % sizeof(size_t) == 0) { /* always true actually */
1213 *(out++) = c ^ ctx->EKi.c[n];
1225 # if defined(STRICT_ALIGNMENT)
1226 if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1230 # if defined(GHASH_CHUNK)
1231 while (len >= GHASH_CHUNK) {
1232 size_t j = GHASH_CHUNK;
1234 GHASH(ctx, in, GHASH_CHUNK);
1236 size_t *out_t = (size_t *)out;
1237 const size_t *in_t = (const size_t *)in;
1239 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1241 if (is_endian.little)
1243 ctx->Yi.d[3] = BSWAP4(ctr);
1245 PUTU32(ctx->Yi.c + 12, ctr);
1249 for (i = 0; i < 16 / sizeof(size_t); ++i)
1250 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1258 if ((i = (len & (size_t)-16))) {
1261 size_t *out_t = (size_t *)out;
1262 const size_t *in_t = (const size_t *)in;
1264 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1266 if (is_endian.little)
1268 ctx->Yi.d[3] = BSWAP4(ctr);
1270 PUTU32(ctx->Yi.c + 12, ctr);
1274 for (i = 0; i < 16 / sizeof(size_t); ++i)
1275 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1283 size_t *out_t = (size_t *)out;
1284 const size_t *in_t = (const size_t *)in;
1286 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1288 if (is_endian.little)
1290 ctx->Yi.d[3] = BSWAP4(ctr);
1292 PUTU32(ctx->Yi.c + 12, ctr);
1296 for (i = 0; i < 16 / sizeof(size_t); ++i) {
1298 out[i] = c ^ ctx->EKi.t[i];
1308 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1310 if (is_endian.little)
1312 ctx->Yi.d[3] = BSWAP4(ctr);
1314 PUTU32(ctx->Yi.c + 12, ctr);
1321 out[n] = c ^ ctx->EKi.c[n];
1331 for (i = 0; i < len; ++i) {
1334 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1336 if (is_endian.little)
1338 ctx->Yi.d[3] = BSWAP4(ctr);
1340 PUTU32(ctx->Yi.c + 12, ctr);
1346 out[i] = c ^ ctx->EKi.c[n];
1357 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1358 const unsigned char *in, unsigned char *out,
1359 size_t len, ctr128_f stream)
1361 #if defined(OPENSSL_SMALL_FOOTPRINT)
1362 return CRYPTO_gcm128_encrypt(ctx, in, out, len);
1367 } is_endian = { 1 };
1368 unsigned int n, ctr;
1370 u64 mlen = ctx->len.u[1];
1371 void *key = ctx->key;
1372 # ifdef GCM_FUNCREF_4BIT
1373 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1375 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1376 const u8 *inp, size_t len) = ctx->ghash;
1381 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1383 ctx->len.u[1] = mlen;
1386 /* First call to encrypt finalizes GHASH(AAD) */
1391 if (is_endian.little)
1393 ctr = BSWAP4(ctx->Yi.d[3]);
1395 ctr = GETU32(ctx->Yi.c + 12);
1403 ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1414 # if defined(GHASH) && defined(GHASH_CHUNK)
1415 while (len >= GHASH_CHUNK) {
1416 (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1417 ctr += GHASH_CHUNK / 16;
1418 if (is_endian.little)
1420 ctx->Yi.d[3] = BSWAP4(ctr);
1422 PUTU32(ctx->Yi.c + 12, ctr);
1426 GHASH(ctx, out, GHASH_CHUNK);
1432 if ((i = (len & (size_t)-16))) {
1435 (*stream) (in, out, j, key, ctx->Yi.c);
1436 ctr += (unsigned int)j;
1437 if (is_endian.little)
1439 ctx->Yi.d[3] = BSWAP4(ctr);
1441 PUTU32(ctx->Yi.c + 12, ctr);
1452 for (i = 0; i < 16; ++i)
1453 ctx->Xi.c[i] ^= out[i];
1460 (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1462 if (is_endian.little)
1464 ctx->Yi.d[3] = BSWAP4(ctr);
1466 PUTU32(ctx->Yi.c + 12, ctr);
1471 ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1481 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1482 const unsigned char *in, unsigned char *out,
1483 size_t len, ctr128_f stream)
1485 #if defined(OPENSSL_SMALL_FOOTPRINT)
1486 return CRYPTO_gcm128_decrypt(ctx, in, out, len);
1491 } is_endian = { 1 };
1492 unsigned int n, ctr;
1494 u64 mlen = ctx->len.u[1];
1495 void *key = ctx->key;
1496 # ifdef GCM_FUNCREF_4BIT
1497 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1499 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1500 const u8 *inp, size_t len) = ctx->ghash;
1505 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1507 ctx->len.u[1] = mlen;
1510 /* First call to decrypt finalizes GHASH(AAD) */
1515 if (is_endian.little)
1517 ctr = BSWAP4(ctx->Yi.d[3]);
1519 ctr = GETU32(ctx->Yi.c + 12);
1528 *(out++) = c ^ ctx->EKi.c[n];
1540 # if defined(GHASH) && defined(GHASH_CHUNK)
1541 while (len >= GHASH_CHUNK) {
1542 GHASH(ctx, in, GHASH_CHUNK);
1543 (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1544 ctr += GHASH_CHUNK / 16;
1545 if (is_endian.little)
1547 ctx->Yi.d[3] = BSWAP4(ctr);
1549 PUTU32(ctx->Yi.c + 12, ctr);
1558 if ((i = (len & (size_t)-16))) {
1566 for (k = 0; k < 16; ++k)
1567 ctx->Xi.c[k] ^= in[k];
1574 (*stream) (in, out, j, key, ctx->Yi.c);
1575 ctr += (unsigned int)j;
1576 if (is_endian.little)
1578 ctx->Yi.d[3] = BSWAP4(ctr);
1580 PUTU32(ctx->Yi.c + 12, ctr);
1589 (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1591 if (is_endian.little)
1593 ctx->Yi.d[3] = BSWAP4(ctr);
1595 PUTU32(ctx->Yi.c + 12, ctr);
1602 out[n] = c ^ ctx->EKi.c[n];
1612 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
1618 } is_endian = { 1 };
1619 u64 alen = ctx->len.u[0] << 3;
1620 u64 clen = ctx->len.u[1] << 3;
1621 #ifdef GCM_FUNCREF_4BIT
1622 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1625 if (ctx->mres || ctx->ares)
1628 if (is_endian.little) {
1630 alen = BSWAP8(alen);
1631 clen = BSWAP8(clen);
1635 ctx->len.u[0] = alen;
1636 ctx->len.u[1] = clen;
1638 alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
1639 clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
1643 ctx->Xi.u[0] ^= alen;
1644 ctx->Xi.u[1] ^= clen;
1647 ctx->Xi.u[0] ^= ctx->EK0.u[0];
1648 ctx->Xi.u[1] ^= ctx->EK0.u[1];
1650 if (tag && len <= sizeof(ctx->Xi))
1651 return CRYPTO_memcmp(ctx->Xi.c, tag, len);
/*
 * CRYPTO_gcm128_tag: finalise the authentication value and copy it out.
 *
 * Calls CRYPTO_gcm128_finish() with a NULL tag and zero length (its return
 * value is ignored here), then copies the result from ctx->Xi.c into |tag|.
 * At most sizeof(ctx->Xi.c) bytes are written: a larger |len| is clamped,
 * a smaller |len| yields a truncated tag.
 */
1656 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1658 CRYPTO_gcm128_finish(ctx, NULL, 0);
1659 memcpy(tag, ctx->Xi.c,
/* clamp the copy length to the size of the Xi buffer */
1660 len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
/*
 * CRYPTO_gcm128_new: allocate a GCM128_CONTEXT with OPENSSL_malloc() and,
 * only if the allocation succeeded, set it up via CRYPTO_gcm128_init()
 * with the supplied |key| and |block| function.
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably |ret| is returned (NULL on allocation failure) - confirm.
 */
1663 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1665 GCM128_CONTEXT *ret;
/* initialise only when the allocation did not fail */
1667 if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
1668 CRYPTO_gcm128_init(ret, key, block);
/*
 * CRYPTO_gcm128_release: dispose of a context by passing |ctx| and
 * sizeof(*ctx) to OPENSSL_clear_free().
 * NOTE(review): presumably this wipes the key-dependent state before the
 * memory is freed - confirm against the OPENSSL_clear_free() documentation.
 */
1673 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1675 OPENSSL_clear_free(ctx, sizeof(*ctx));