1 /* ====================================================================
2 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in
13 * the documentation and/or other materials provided with the
16 * 3. All advertising materials mentioning features or use of this
17 * software must display the following acknowledgment:
18 * "This product includes software developed by the OpenSSL Project
19 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22 * endorse or promote products derived from this software without
23 * prior written permission. For written permission, please contact
24 * openssl-core@openssl.org.
26 * 5. Products derived from this software may not be called "OpenSSL"
27 * nor may "OpenSSL" appear in their names without prior written
28 * permission of the OpenSSL Project.
30 * 6. Redistributions of any form whatsoever must retain the following
32 * "This product includes software developed by the OpenSSL Project
33 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46 * OF THE POSSIBILITY OF SUCH DAMAGE.
47 * ====================================================================
51 #include "modes_lcl.h"
/* Two u64 words, hi and lo, treated by the code below as one wide value
 * (bits shifted out of hi are carried into lo, see REDUCE1BIT). */
61 typedef struct { u64 hi,lo; } u128;
/* Only when both BSWAP4 and STRICT_ALIGNMENT are defined: GETU32/PUTU32 are
 * expressed as a single u32 access through a cast pointer plus BSWAP4. */
63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
/* GETU32(p): load a u32 from p and pass it through BSWAP4. */
66 #define GETU32(p) BSWAP4(*(const u32 *)(p))
/* PUTU32(p,v): store BSWAP4(v) at p.
 * NOTE(review): the expansion is not wrapped in parentheses, so it is only
 * safe when used as a complete statement, which is how the visible call
 * sites in this file use it. */
68 #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
/* PACK(s): place s in the top 16 bits of a size_t, whatever size_t's width. */
71 #define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * REDUCE1BIT(V): shift the two-word value V right by one bit and, when the
 * bit shifted out of V.lo was set, XOR the constant 0xe1 into the top byte
 * of V.hi.  The mask T is 0-(V.lo&1), i.e. all-ones or zero, ANDed with the
 * constant, so the data bit is not branched on.  Two variants are visible:
 * one under sizeof(size_t)==8 that builds T as a u64, and one that builds T
 * as a u32 and shifts it into the upper half.  Some lines of the macro are
 * not present in this listing.
 */
72 #define REDUCE1BIT(V) do { \
73 if (sizeof(size_t)==8) { \
74 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
75 V.lo = (V.hi<<63)|(V.lo>>1); \
76 V.hi = (V.hi>>1 )^T; \
79 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80 V.lo = (V.hi<<63)|(V.lo>>1); \
81 V.hi = (V.hi>>1 )^((u64)T<<32); \
89 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
90 * never be set to 8. 8 is effectively reserved for testing purposes.
91 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
92 * "Shoup's" in GCM specification. In other words OpenSSL does not cover
93 * whole spectrum of possible table driven implementations. Why? In
94 * non-"Shoup's" case memory access pattern is segmented in such manner,
95 * that it's trivial to see that cache timing information can reveal
96 * a fair portion of the intermediate hash value. Given that ciphertext is
97 * always available to an attacker, it's possible for them to attempt to
98 * deduce the secret parameter H and, if successful, tamper with messages
99 * [which is trivial in CTR mode]. In "Shoup's" case it's
100 * not as trivial, but there is no reason to believe that it's resistant
101 * to cache-timing attack. And the thing about "8-bit" implementation is
102 * that it consumes 16 (sixteen) times more memory, 4KB per individual
103 * key + 1KB shared. Well, on pros side it should be twice as fast as
104 * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
105 * was observed to run ~75% faster, closer to 100% for commercial
106 * compilers... Yet "4-bit" procedure is preferred, because it's
107 * believed to provide better security-performance balance and adequate
108 * all-round performance. "All-round" refers to things like:
110 * - shorter setup time effectively improves overall timing for
111 * handling short messages;
112 * - larger table allocation can become unbearable because of VM
113 * subsystem penalties (for example, on Windows a large enough free()
114 * results in VM working set trimming, meaning that a subsequent
115 * malloc() would immediately incur working set expansion);
116 * - larger table has larger cache footprint, which can affect
117 * performance of other code paths (not necessarily even from same
118 * thread in Hyper-Threading world);
/*
 * gcm_init_8bit: fill the 256-entry table Htable.
 * The setup of V is on lines not present in this listing (presumably it is
 * loaded from H -- confirm).  The first visible loop stores V in Htable[128]
 * and walks i = 64, 32, ..., 1, i.e. the power-of-two slots.  The second loop
 * then fills every remaining slot: for each power of two i, entry i+j
 * (1 <= j < i) is the XOR of Htable[i] and the already computed Htable[j].
 */
124 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
134 for (Htable[128]=V, i=64; i>0; i>>=1) {
139 for (i=2; i<256; i<<=1) {
/* Hi points at slot i; H0 is a copy of that power-of-two entry */
140 u128 *Hi = Htable+i, H0 = *Hi;
141 for (j=1; j<i; ++j) {
/* Hi[j] is Htable[i+j] */
142 Hi[j].hi = H0.hi^Htable[j].hi;
143 Hi[j].lo = H0.lo^Htable[j].lo;
/*
 * gcm_gmult_8bit: recompute Xi in place using the 256-entry Htable,
 * consuming Xi one byte at a time.  xi starts at the last byte (offset 15)
 * and the loop ends once xi equals the address of Xi itself; the pointer
 * update is on a line not present in this listing.  For every byte the
 * accumulator Z is XORed with the table entry Htable[n]; between bytes Z is
 * shifted right by 8 bits and the byte that falls off selects a correction
 * from rem_8bit.  The result is written back to Xi at the end.
 */
148 static void gcm_gmult_8bit(u64 Xi[2], u128 Htable[256])
151 const u8 *xi = (const u8 *)Xi+15;
/* run-time endianness probe: .little aliases the first byte of a long set to 1 */
153 const union { long one; char little; } is_endian = {1};
/* correction constants indexed by the byte shifted out of Z.lo; each 16-bit
 * value is moved to the top of size_t by PACK */
154 static const size_t rem_8bit[256] = {
155 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
156 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
157 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
158 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
159 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
160 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
161 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
162 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
163 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
164 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
165 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
166 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
167 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
168 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
169 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
170 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
171 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
172 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
173 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
174 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
175 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
176 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
177 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
178 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
179 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
180 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
181 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
182 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
183 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
184 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
185 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
186 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
187 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
188 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
189 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
190 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
191 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
192 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
193 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
194 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
195 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
196 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
197 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
198 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
199 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
200 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
201 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
202 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
203 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
204 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
205 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
206 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
207 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
208 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
209 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
210 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
211 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
212 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
213 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
214 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
215 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
216 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
217 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
218 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
/* accumulate the table entry for the current byte n */
221 Z.hi ^= Htable[n].hi;
222 Z.lo ^= Htable[n].lo;
/* stop after the first byte of Xi has been processed */
224 if ((u8 *)Xi==xi) break;
/* shift Z right by one byte, remembering the byte that falls off */
228 rem = (size_t)Z.lo&0xff;
229 Z.lo = (Z.hi<<56)|(Z.lo>>8);
/* PACKed constants sit at the top of size_t: usable directly when size_t is
 * 8 bytes wide, otherwise widened and shifted up by another 32 bits */
231 if (sizeof(size_t)==8)
232 Z.hi ^= rem_8bit[rem];
234 Z.hi ^= (u64)rem_8bit[rem]<<32;
/* write Z back into Xi; on little-endian hosts the words are byte-swapped
 * (BSWAP8) or written as four 32-bit pieces through PUTU32 */
237 if (is_endian.little) {
239 Xi[0] = BSWAP8(Z.hi);
240 Xi[1] = BSWAP8(Z.lo);
244 v = (u32)(Z.hi>>32); PUTU32(p,v);
245 v = (u32)(Z.hi); PUTU32(p+4,v);
246 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
247 v = (u32)(Z.lo); PUTU32(p+12,v);
/* GCM_MUL for the 8-bit table build: calls gcm_gmult_8bit on the u member of
 * the context field named by the second argument (pasted as a member name),
 * together with ctx->Htable. */
255 #define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
/*
 * gcm_init_4bit: fill the 16-entry table Htable.
 * Two constructions are visible.  Under OPENSSL_SMALL_FOOTPRINT the table is
 * built with loops: V is stored in Htable[8] and i walks 4, 2, 1, then every
 * remaining slot i+j is Htable[i] XOR Htable[j].  The other construction
 * spells the same XOR combinations out entry by entry (e.g. Htable[3] is
 * V XOR Htable[2], Htable[15] is V XOR Htable[7]); the lines that load V
 * between groups are not present in this listing.
 * For ARM assembler builds the table layout is adjusted afterwards.
 */
259 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
262 #if defined(OPENSSL_SMALL_FOOTPRINT)
271 #if defined(OPENSSL_SMALL_FOOTPRINT)
/* power-of-two slots: 8, 4, 2, 1 */
272 for (Htable[8]=V, i=4; i>0; i>>=1) {
/* remaining slots: Hi[j] (== Htable[i+j]) = Htable[i] ^ Htable[j] */
277 for (i=2; i<16; i<<=1) {
280 for (V=*Hi, j=1; j<i; ++j) {
281 Hi[j].hi = V.hi^Htable[j].hi;
282 Hi[j].lo = V.lo^Htable[j].lo;
/* unrolled form of the same combinations */
293 Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
295 Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo;
296 Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo;
297 Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo;
299 Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo;
300 Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
301 Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
302 Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
303 Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
304 Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
305 Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
307 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
309 * ARM assembler expects specific dword order in Htable.
313 const union { long one; char little; } is_endian = {1};
315 if (is_endian.little)
/* the visible assignments exchange hi and lo and swap the 32-bit halves of
 * each word; which endianness branch they belong to is not visible here */
324 Htable[j].hi = V.lo<<32|V.lo>>32;
325 Htable[j].lo = V.hi<<32|V.hi>>32;
/* Correction constants for the 4-bit routines, indexed by the nibble shifted
 * out of Z.lo (rem = Z.lo&0xf in the users).  Each 16-bit value is placed in
 * the top bits of size_t by PACK. */
332 static const size_t rem_4bit[16] = {
333 PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
334 PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
335 PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
336 PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
/*
 * gcm_gmult_4bit: recompute Xi in place using the 16-entry Htable, handling
 * Xi four bits at a time.  Processing starts with byte 15 of Xi.  Z is seeded
 * from Htable[nlo]; each step shifts Z right by 4 bits, folds in the
 * rem_4bit correction for the nibble that fell off, and XORs the next table
 * entry.  The lines that split each byte into nlo/nhi and drive the loop
 * counter are not present in this listing.  The result is stored back to Xi.
 */
338 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
342 size_t rem, nlo, nhi;
/* run-time endianness probe */
343 const union { long one; char little; } is_endian = {1};
345 nlo = ((const u8 *)Xi)[15];
/* seed Z with the first table entry */
349 Z.hi = Htable[nlo].hi;
350 Z.lo = Htable[nlo].lo;
/* shift Z right by 4 bits; rem is the nibble shifted out */
353 rem = (size_t)Z.lo&0xf;
354 Z.lo = (Z.hi<<60)|(Z.lo>>4);
/* rem_4bit entries sit at the top of size_t: used directly when size_t is
 * 8 bytes, otherwise widened and shifted up by 32 */
356 if (sizeof(size_t)==8)
357 Z.hi ^= rem_4bit[rem];
359 Z.hi ^= (u64)rem_4bit[rem]<<32;
361 Z.hi ^= Htable[nhi].hi;
362 Z.lo ^= Htable[nhi].lo;
/* fetch the next byte of Xi */
366 nlo = ((const u8 *)Xi)[cnt];
/* same shift-and-correct step, followed by the nlo table entry */
370 rem = (size_t)Z.lo&0xf;
371 Z.lo = (Z.hi<<60)|(Z.lo>>4);
373 if (sizeof(size_t)==8)
374 Z.hi ^= rem_4bit[rem];
376 Z.hi ^= (u64)rem_4bit[rem]<<32;
378 Z.hi ^= Htable[nlo].hi;
379 Z.lo ^= Htable[nlo].lo;
/* write Z back into Xi; on little-endian hosts via BSWAP8 or four PUTU32
 * stores */
382 if (is_endian.little) {
384 Xi[0] = BSWAP8(Z.hi);
385 Xi[1] = BSWAP8(Z.lo);
389 v = (u32)(Z.hi>>32); PUTU32(p,v);
390 v = (u32)(Z.hi); PUTU32(p+4,v);
391 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
392 v = (u32)(Z.lo); PUTU32(p+12,v);
401 #if !defined(OPENSSL_SMALL_FOOTPRINT)
403 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
404 * details... Compiler-generated code doesn't seem to give any
405 * performance improvement, at least not on x86[_64]. It's here
406 * mostly as reference and a placeholder for possible future
407 * non-trivial optimization[s]...
/*
 * gcm_ghash_4bit: streamed form of gcm_gmult_4bit over a buffer.
 *   Xi     - running value, updated in place
 *   Htable - 16-entry table
 *   inp    - input bytes
 *   len    - input length in bytes
 * The closing loop condition advances inp by 16 and subtracts 16 from len on
 * every pass and stops when len reaches zero, so len is expected to be a
 * non-zero multiple of 16 -- NOTE(review): confirm at the call sites.
 * Two bodies are visible in this listing: a nibble-at-a-time one using
 * rem_4bit, and one that precomputes shifted copies of Htable (Hshr4/Hshl4)
 * and processes a byte per step using a 16-bit rem_8bit table.  The
 * preprocessor lines choosing between them, and the lines that mix inp into
 * the bytes read from Xi, are not present here.
 */
409 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
410 const u8 *inp,size_t len)
414 size_t rem, nlo, nhi;
/* run-time endianness probe */
415 const union { long one; char little; } is_endian = {1};
420 nlo = ((const u8 *)Xi)[15];
/* seed Z with the first table entry */
425 Z.hi = Htable[nlo].hi;
426 Z.lo = Htable[nlo].lo;
/* shift Z right by 4 bits and apply the rem_4bit correction */
429 rem = (size_t)Z.lo&0xf;
430 Z.lo = (Z.hi<<60)|(Z.lo>>4);
432 if (sizeof(size_t)==8)
433 Z.hi ^= rem_4bit[rem];
435 Z.hi ^= (u64)rem_4bit[rem]<<32;
437 Z.hi ^= Htable[nhi].hi;
438 Z.lo ^= Htable[nhi].lo;
442 nlo = ((const u8 *)Xi)[cnt];
447 rem = (size_t)Z.lo&0xf;
448 Z.lo = (Z.hi<<60)|(Z.lo>>4);
450 if (sizeof(size_t)==8)
451 Z.hi ^= rem_4bit[rem];
453 Z.hi ^= (u64)rem_4bit[rem]<<32;
455 Z.hi ^= Htable[nlo].hi;
456 Z.lo ^= Htable[nlo].lo;
460 * Extra 256+16 bytes per-key plus 512 bytes shared tables
461 * [should] give ~50% improvement... One could have PACK()-ed
462 * the rem_8bit even here, but the priority is to minimize
465 u128 Hshr4[16]; /* Htable shifted right by 4 bits */
466 u8 Hshl4[16]; /* Htable shifted left by 4 bits */
/* 16-bit correction constants (not PACKed); applied below with <<48 */
467 static const unsigned short rem_8bit[256] = {
468 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
469 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
470 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
471 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
472 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
473 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
474 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
475 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
476 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
477 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
478 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
479 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
480 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
481 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
482 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
483 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
484 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
485 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
486 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
487 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
488 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
489 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
490 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
491 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
492 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
493 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
494 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
495 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
496 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
497 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
498 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
499 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
501 * This pre-processing phase slows down procedure by approximately
502 * same time as it makes each loop spin faster. In other words
503 * single block performance is approximately same as straightforward
504 * "4-bit" implementation, and then it goes only faster...
/* precompute, for every table entry, the entry shifted right by 4 bits
 * (Hshr4) and the low byte of its lo word shifted left by 4 (Hshl4) */
506 for (cnt=0; cnt<16; ++cnt) {
507 Z.hi = Htable[cnt].hi;
508 Z.lo = Htable[cnt].lo;
509 Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
510 Hshr4[cnt].hi = (Z.hi>>4);
511 Hshl4[cnt] = (u8)(Z.lo<<4);
/* bytes 15 down to 1: one table XOR, one 8-bit shift, one Hshr4 XOR and one
 * combined rem_8bit correction per byte */
515 for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
516 nlo = ((const u8 *)Xi)[cnt];
521 Z.hi ^= Htable[nlo].hi;
522 Z.lo ^= Htable[nlo].lo;
524 rem = (size_t)Z.lo&0xff;
526 Z.lo = (Z.hi<<56)|(Z.lo>>8);
529 Z.hi ^= Hshr4[nhi].hi;
530 Z.lo ^= Hshr4[nhi].lo;
/* the byte shifted out is combined with Hshl4[nhi] to index rem_8bit */
531 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
/* byte 0 is handled separately with a single 4-bit shift */
534 nlo = ((const u8 *)Xi)[0];
539 Z.hi ^= Htable[nlo].hi;
540 Z.lo ^= Htable[nlo].lo;
542 rem = (size_t)Z.lo&0xf;
544 Z.lo = (Z.hi<<60)|(Z.lo>>4);
547 Z.hi ^= Htable[nhi].hi;
548 Z.lo ^= Htable[nhi].lo;
549 Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
/* write Z back into Xi; on little-endian hosts via BSWAP8 or four PUTU32
 * stores */
552 if (is_endian.little) {
554 Xi[0] = BSWAP8(Z.hi);
555 Xi[1] = BSWAP8(Z.lo);
559 v = (u32)(Z.hi>>32); PUTU32(p,v);
560 v = (u32)(Z.hi); PUTU32(p+4,v);
561 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
562 v = (u32)(Z.lo); PUTU32(p+12,v);
/* next 16-byte block; terminates when len reaches zero */
569 } while (inp+=16, len-=16);
/* Non-static declarations of the two 4-bit routines, with the same
 * parameter lists as the static C versions above.  Presumably the branch
 * taken when the definitions come from an assembler module -- the selecting
 * preprocessor line is not present in this listing. */
573 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
574 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
/* GCM_MUL for the 4-bit table build: calls gcm_gmult_4bit on the u member of
 * the named context field together with ctx->Htable. */
577 #define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
/* GHASH (bulk hashing of a buffer) is only defined when an assembler module
 * is used or the build is not a small-footprint one. */
578 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
579 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
580 /* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
581 * thrashing effect. In other words, the idea is to hash data while it's
582 * still in L1 cache after the encryption pass... */
583 #define GHASH_CHUNK 1024
586 #else /* TABLE_BITS */
/*
 * gcm_gmult_1bit: table-free variant that recomputes Xi in place from H one
 * bit at a time.  Xi is read as 16/sizeof(long) words of type long; each
 * word is brought into big-endian order on little-endian hosts (BSWAP8 or
 * two GETU32 loads) and then scanned for 8*sizeof(long) iterations, shifting
 * X left by one each time.  The loop body that combines M with V and Z is
 * not present in this listing.  The result is stored back to Xi.
 */
588 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
593 const long *xi = (const long *)Xi;
/* run-time endianness probe */
594 const union { long one; char little; } is_endian = {1};
596 V.hi = H[0]; /* H is in host byte order, no byte swapping */
599 for (j=0; j<16/sizeof(long); ++j) {
600 if (is_endian.little) {
601 if (sizeof(long)==8) {
603 X = (long)(BSWAP8(xi[j]));
605 const u8 *p = (const u8 *)(xi+j);
606 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
610 const u8 *p = (const u8 *)(xi+j);
617 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
/* M is taken from the top bit of X by shifting right by width-1.
 * NOTE(review): the casts above suggest X is a signed long, in which case
 * this relies on the shift replicating the sign bit -- confirm. */
618 u64 M = (u64)(X>>(8*sizeof(long)-1));
/* write Z back into Xi; on little-endian hosts via BSWAP8 or four PUTU32
 * stores */
626 if (is_endian.little) {
628 Xi[0] = BSWAP8(Z.hi);
629 Xi[1] = BSWAP8(Z.lo);
633 v = (u32)(Z.hi>>32); PUTU32(p,v);
634 v = (u32)(Z.hi); PUTU32(p+4,v);
635 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
636 v = (u32)(Z.lo); PUTU32(p+12,v);
/* GCM_MUL for the table-free build: calls gcm_gmult_1bit with ctx->H.u in
 * place of a precomputed table. */
644 #define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
/*
 * Per-operation GCM state.  Only part of the member list is present in this
 * listing; other members used elsewhere in the file (Xi, H, len, Htable,
 * block, key) are declared on lines not shown.
 */
648 struct gcm128_context {
649 /* Following 6 names follow names in GCM specification */
/* each value can be accessed as two u64, four u32 or sixteen bytes */
650 union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,
652 /* Pre-computed table used by gcm_gmult_* */
/* routines selected at init time; invoked through the GCM_MUL / GHASH
 * macros when run-time dispatch is compiled in */
657 void (*gmult)(u64 Xi[2],const u128 Htable[16]);
658 void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
/* NOTE(review): res and pad are not referenced in the visible lines;
 * presumably res is the offset within the current keystream block -- confirm */
660 unsigned int res, pad;
/*
 * x86 / x86_64 assembler support with run-time dispatch.  Enabled for the
 * 4-bit table build when GHASH_ASM is defined, I386_ONLY is not, and one of
 * the listed x86 compiler macros is present.
 */
665 #if TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
666 (defined(__i386) || defined(__i386__) || \
667 defined(__x86_64) || defined(__x86_64__) || \
668 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
669 # define GHASH_ASM_IAX
/* capability words tested by CRYPTO_gcm128_init to pick an implementation */
670 extern unsigned int OPENSSL_ia32cap_P[2];
/* "clmul" routines: table setup, single multiplication, bulk hashing */
672 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
673 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
674 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
/* 32-bit x86 only: "mmx" and "x86" flavours of the 4-bit routines */
676 # if defined(__i386) || defined(__i386__) || defined(_M_IX86)
677 # define GHASH_ASM_X86
678 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
679 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
681 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
682 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
/* with run-time dispatch, GCM_MUL and GHASH call through the function
 * pointers stored in the context */
686 # define GCM_MUL(ctx,Xi) (*((ctx)->gmult))(ctx->Xi.u,ctx->Htable)
688 # define GHASH(ctx,in,len) (*((ctx)->ghash))((ctx)->Xi.u,(ctx)->Htable,in,len)
/*
 * CRYPTO_gcm128_init: prepare ctx for use with the given block cipher.
 *   ctx   - context to initialize; it is zeroed first
 *   key   - opaque key passed through to block
 *   block - block-cipher callback
 * H is produced by running block over ctx->H in place; since ctx has just
 * been zeroed, the input is the all-zero block (assuming the lines not
 * present in this listing do not write H first).  H is then converted to
 * host byte order and the multiplication table for the compiled-in
 * TABLE_BITS variant is built.  With x86 run-time dispatch the gmult/ghash
 * pointers are chosen from OPENSSL_ia32cap_P.
 */
691 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
/* run-time endianness probe */
693 const union { long one; char little; } is_endian = {1};
695 memset(ctx,0,sizeof(*ctx));
699 (*block)(ctx->H.c,ctx->H.c,key);
701 if (is_endian.little) {
702 /* H is stored in host byte order */
704 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
705 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
/* alternative without BSWAP8: assemble each word from two GETU32 loads */
709 hi = (u64)GETU32(p) <<32|GETU32(p+4);
710 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
717 gcm_init_8bit(ctx->Htable,ctx->H.u);
719 # if defined(GHASH_ASM_IAX) /* both x86 and x86_64 */
/* capability bit 1 of the second word selects the clmul routines */
720 if (OPENSSL_ia32cap_P[1]&(1<<1)) {
721 gcm_init_clmul(ctx->Htable,ctx->H.u);
722 ctx->gmult = gcm_gmult_clmul;
723 ctx->ghash = gcm_ghash_clmul;
726 gcm_init_4bit(ctx->Htable,ctx->H.u);
727 # if defined(GHASH_ASM_X86) /* x86 only */
/* capability bit 23 of the first word selects the mmx routines, otherwise
 * the plain x86 ones */
728 if (OPENSSL_ia32cap_P[0]&(1<<23)) {
729 ctx->gmult = gcm_gmult_4bit_mmx;
730 ctx->ghash = gcm_ghash_4bit_mmx;
732 ctx->gmult = gcm_gmult_4bit_x86;
733 ctx->ghash = gcm_ghash_4bit_x86;
736 ctx->gmult = gcm_gmult_4bit;
737 ctx->ghash = gcm_ghash_4bit;
/* no run-time dispatch: just build the 4-bit table */
740 gcm_init_4bit(ctx->Htable,ctx->H.u);
/*
 * CRYPTO_gcm128_setiv: derive the initial counter block Yi from the IV and
 * compute EK0 from it.
 *   ctx - initialized context
 *   iv  - IV bytes
 *   len - IV length in bytes
 * One path copies exactly 12 IV bytes straight into Yi (the condition is on
 * a line not present here; presumably len==12).  The other path folds the IV
 * into Yi 16 bytes at a time, then the remaining tail, then XORs len0 into
 * the last eight bytes of Yi in big-endian order.  Finally the 32-bit counter
 * is read from bytes 12..15 of Yi, block is run over Yi to produce EK0, and
 * the counter is written back.
 */
745 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
/* run-time endianness probe */
747 const union { long one; char little; } is_endian = {1};
759 memcpy(ctx->Yi.c,iv,12);
/* full 16-byte IV blocks */
768 for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
/* trailing partial block */
774 for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
/* fold len0 into the second half of Yi, most significant byte first
 * (len0 is set on a line not shown; presumably the IV length in bits) */
778 if (is_endian.little) {
780 ctx->Yi.u[1] ^= BSWAP8(len0);
782 ctx->Yi.c[8] ^= (u8)(len0>>56);
783 ctx->Yi.c[9] ^= (u8)(len0>>48);
784 ctx->Yi.c[10] ^= (u8)(len0>>40);
785 ctx->Yi.c[11] ^= (u8)(len0>>32);
786 ctx->Yi.c[12] ^= (u8)(len0>>24);
787 ctx->Yi.c[13] ^= (u8)(len0>>16);
788 ctx->Yi.c[14] ^= (u8)(len0>>8);
789 ctx->Yi.c[15] ^= (u8)(len0);
793 ctx->Yi.u[1] ^= len0;
/* the counter occupies the last four bytes of Yi */
797 if (is_endian.little)
798 ctr = GETU32(ctx->Yi.c+12);
/* EK0 = block(Yi) for the initial counter value; used by finish */
803 (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
805 if (is_endian.little)
806 PUTU32(ctx->Yi.c+12,ctr);
/*
 * CRYPTO_gcm128_aad: fold additional authenticated data into Xi.
 *   ctx - context
 *   aad - data bytes
 *   len - number of bytes
 * Adds len to the running total in ctx->len.u[0].  The whole-block part of
 * the input (len rounded down to a multiple of 16) is handled first; one
 * visible path XORs 16 bytes at a time into Xi, and a final loop XORs the
 * remaining tail.  The multiplication calls between these steps are on
 * lines not present in this listing.
 */
811 void CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
815 ctx->len.u[0] += len;
/* i = len with the low four bits cleared */
818 if ((i = (len&(size_t)-16))) {
825 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
832 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
/*
 * CRYPTO_gcm128_encrypt: encrypt in to out in counter mode and fold the
 * produced ciphertext into Xi.
 *   ctx - context with IV already set
 *   in  - plaintext
 *   out - ciphertext destination
 * (the length parameter is declared on a line not present in this listing)
 * Adds len to the running total in ctx->len.u[1].  Keystream blocks are
 * produced by running block over Yi into EKi; the 32-bit counter kept in the
 * last four bytes of Yi is written back with PUTU32 around each block call.
 * Several paths are visible: a byte-wise loop, word-at-a-time loops for
 * whole blocks (with bulk GHASH when available), a tail, and a byte-wise
 * fallback loop at the end.
 */
837 void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
838 const unsigned char *in, unsigned char *out,
/* run-time endianness probe */
841 const union { long one; char little; } is_endian = {1};
845 ctx->len.u[1] += len;
847 if (is_endian.little)
848 ctr = GETU32(ctx->Yi.c+12);
852 #if !defined(OPENSSL_SMALL_FOOTPRINT)
853 if (16%sizeof(size_t) == 0) do { /* always true actually */
/* byte-wise step: out byte = in byte ^ keystream byte, and the same output
 * byte is XORed into Xi */
856 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
860 if (n==0) GCM_MUL(ctx,Xi);
/* word-sized accesses below need in and out aligned for size_t; the action
 * taken on a mismatch is on a line not shown */
866 #if defined(STRICT_ALIGNMENT)
867 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
/* bulk path: encrypt GHASH_CHUNK bytes block by block, then hash the chunk
 * just written in a single GHASH call */
870 #if defined(GHASH) && defined(GHASH_CHUNK)
871 while (len>=GHASH_CHUNK) {
872 size_t j=GHASH_CHUNK;
875 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
877 if (is_endian.little)
878 PUTU32(ctx->Yi.c+12,ctr);
/* XOR one block with the keystream, sizeof(size_t) bytes at a time */
881 for (i=0; i<16; i+=sizeof(size_t))
883 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
888 GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
/* remaining whole blocks (len rounded down to a multiple of 16) */
891 if ((i = (len&(size_t)-16))) {
895 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
897 if (is_endian.little)
898 PUTU32(ctx->Yi.c+12,ctr);
901 for (i=0; i<16; i+=sizeof(size_t))
903 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
/* variant that XORs each produced word directly into Xi */
912 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
914 if (is_endian.little)
915 PUTU32(ctx->Yi.c+12,ctr);
918 for (i=0; i<16; i+=sizeof(size_t))
919 *(size_t *)(ctx->Xi.c+i) ^=
921 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
/* tail: one more keystream block, consumed byte by byte */
929 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
931 if (is_endian.little)
932 PUTU32(ctx->Yi.c+12,ctr);
936 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
/* byte-wise fallback over the whole input */
945 for (i=0;i<len;++i) {
947 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
949 if (is_endian.little)
950 PUTU32(ctx->Yi.c+12,ctr);
954 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
/*
 * CRYPTO_gcm128_decrypt: decrypt in to out in counter mode and fold the
 * ciphertext (the input) into Xi.
 *   ctx - context with IV already set
 *   in  - ciphertext
 *   out - plaintext destination
 * (the length parameter is declared on a line not present in this listing)
 * Mirrors the encrypt routine: adds len to ctx->len.u[1], produces keystream
 * by running block over Yi into EKi, and writes the counter back with
 * PUTU32.  In the bulk path the input chunk is hashed with GHASH before it
 * is decrypted.
 */
963 void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
964 const unsigned char *in, unsigned char *out,
/* run-time endianness probe */
967 const union { long one; char little; } is_endian = {1};
971 ctx->len.u[1] += len;
973 if (is_endian.little)
974 ctr = GETU32(ctx->Yi.c+12);
978 #if !defined(OPENSSL_SMALL_FOOTPRINT)
979 if (16%sizeof(size_t) == 0) do { /* always true actually */
/* byte-wise step: c holds the input byte (loaded on a line not shown) */
983 *(out++) = c^ctx->EKi.c[n];
988 if (n==0) GCM_MUL (ctx,Xi);
/* word-sized accesses below need in and out aligned for size_t; the action
 * taken on a mismatch is on a line not shown */
994 #if defined(STRICT_ALIGNMENT)
995 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
/* bulk path: hash GHASH_CHUNK bytes of input first, then decrypt them */
998 #if defined(GHASH) && defined(GHASH_CHUNK)
999 while (len>=GHASH_CHUNK) {
1000 size_t j=GHASH_CHUNK;
1002 GHASH(ctx,in,GHASH_CHUNK);
1004 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1006 if (is_endian.little)
1007 PUTU32(ctx->Yi.c+12,ctr);
/* XOR one block with the keystream, sizeof(size_t) bytes at a time */
1010 for (i=0; i<16; i+=sizeof(size_t))
1011 *(size_t *)(out+i) =
1012 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
/* remaining whole blocks (len rounded down to a multiple of 16) */
1019 if ((i = (len&(size_t)-16))) {
1022 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1024 if (is_endian.little)
1025 PUTU32(ctx->Yi.c+12,ctr);
1028 for (i=0; i<16; i+=sizeof(size_t))
1029 *(size_t *)(out+i) =
1030 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
/* variant that reads each input word once, writes the plaintext word and
 * XORs the input word into Xi */
1038 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1040 if (is_endian.little)
1041 PUTU32(ctx->Yi.c+12,ctr);
1044 for (i=0; i<16; i+=sizeof(size_t)) {
1045 size_t c = *(size_t *)(in+i);
1046 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1047 *(size_t *)(ctx->Xi.c+i) ^= c;
/* tail: one more keystream block, consumed byte by byte */
1056 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1058 if (is_endian.little)
1059 PUTU32(ctx->Yi.c+12,ctr);
1065 out[n] = c^ctx->EKi.c[n];
/* byte-wise fallback over the whole input */
1074 for (i=0;i<len;++i) {
1077 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1079 if (is_endian.little)
1080 PUTU32(ctx->Yi.c+12,ctr);
/* NOTE(review): this XORs the keystream into whatever out[i] already holds,
 * whereas every other path assigns out from the input byte.  That is only
 * equivalent when out and in are the same buffer -- confirm whether this
 * should read out[i] = c^ctx->EKi.c[n]. */
1085 out[i] ^= ctx->EKi.c[n];
/*
 * CRYPTO_gcm128_finish: complete the computation of the tag in ctx->Xi and
 * optionally compare it with a caller-supplied tag.
 *   ctx - context
 *   tag - expected tag, or NULL
 * (the length parameter is declared on a line not present in this listing)
 * The two byte counters are converted to bit counts (<<3), put into
 * big-endian order, and XORed into Xi; Xi is then XORed with EK0.  Any
 * multiplication between those two steps is on a line not shown.
 * When tag is non-NULL and len does not exceed sizeof(ctx->Xi), the result
 * of memcmp over len bytes is returned (0 on match); the return value for
 * the other case is on a line not shown.
 */
1095 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
/* run-time endianness probe */
1098 const union { long one; char little; } is_endian = {1};
/* byte counts to bit counts */
1099 u64 alen = ctx->len.u[0]<<3;
1100 u64 clen = ctx->len.u[1]<<3;
1105 if (is_endian.little) {
1107 alen = BSWAP8(alen);
1108 clen = BSWAP8(clen);
/* alternative without BSWAP8: store the counts and re-read them with GETU32 */
1112 ctx->len.u[0] = alen;
1113 ctx->len.u[1] = clen;
1115 alen = (u64)GETU32(p) <<32|GETU32(p+4);
1116 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1120 ctx->Xi.u[0] ^= alen;
1121 ctx->Xi.u[1] ^= clen;
1124 ctx->Xi.u[0] ^= ctx->EK0.u[0];
1125 ctx->Xi.u[1] ^= ctx->EK0.u[1];
/* NOTE(review): memcmp is not guaranteed to run in time independent of where
 * the buffers first differ; tag verification normally uses a constant-time
 * comparison -- confirm and consider replacing. */
1127 if (tag && len<=sizeof(ctx->Xi))
1128 return memcmp(ctx->Xi.c,tag,len);
/*
 * CRYPTO_gcm128_new: allocate a context with OPENSSL_malloc and, if the
 * allocation succeeded, initialize it with CRYPTO_gcm128_init.
 *   key, block - passed through to CRYPTO_gcm128_init
 * The return statement is on a line not present in this listing; presumably
 * ret is returned, i.e. NULL on allocation failure.
 */
1133 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1135 GCM128_CONTEXT *ret;
1137 if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1138 CRYPTO_gcm128_init(ret,key,block);
/*
 * CRYPTO_gcm128_release: wipe the whole context with OPENSSL_cleanse so key
 * material does not linger in memory.  Freeing the memory, if done, is on a
 * line not present in this listing.
 */
1143 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1146 OPENSSL_cleanse(ctx,sizeof(*ctx));
1151 #if defined(SELFTEST)
1153 #include <openssl/aes.h>
1156 static const u8 K1[16],
1161 T1[]= {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1167 static const u8 P2[16],
1168 C2[]= {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1169 T2[]= {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1173 static const u8 K3[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1174 P3[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1175 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1176 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1177 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1178 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1179 C3[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1180 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1181 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1182 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1183 T3[]= {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1188 static const u8 P4[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1189 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1190 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1191 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1192 A4[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1193 0xab,0xad,0xda,0xd2},
1194 C4[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1195 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1196 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1197 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1198 T4[]= {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1203 static const u8 A5[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1204 0xab,0xad,0xda,0xd2},
1205 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1206 C5[]= {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1207 0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1208 0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1209 0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1210 T5[]= {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1216 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1217 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1218 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1219 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1220 C6[]= {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1221 0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1222 0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1223 0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1224 T6[]= {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1227 static const u8 K7[24],
1232 T7[]= {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1238 static const u8 P8[16],
1239 C8[]= {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1240 T8[]= {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1244 static const u8 K9[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1245 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1246 P9[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1247 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1248 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1249 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1250 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1251 C9[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1252 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1253 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1254 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1255 T9[]= {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
/*
 * Vectors for TEST_CASE(10).
 * P10: 60 bytes of plaintext (not a multiple of 16).
 * A10: 20 bytes of additional authenticated data fed to CRYPTO_gcm128_aad().
 * C10: 60 bytes of ciphertext compared against the encrypt output.
 * T10: 16-byte tag handed to CRYPTO_gcm128_finish().
 * NOTE(review): K10 and IV10 are not declared in the visible lines;
 * presumably they alias an earlier case - confirm.
 */
1260 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1261 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1262 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1263 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1264 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1265 0xab,0xad,0xda,0xd2},
1266 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1267 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1268 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1269 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1270 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
/*
 * Vectors for TEST_CASE(11).
 * IV11: 8-byte IV (shorter than the 12-byte IVs used by cases 9 and 15).
 * C11:  60 bytes of ciphertext compared against the encrypt output.
 * T11:  16-byte tag handed to CRYPTO_gcm128_finish().
 * NOTE(review): K11, P11 and A11 are not declared in the visible lines;
 * presumably they alias an earlier case - confirm.
 */
1276 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1277 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1278 0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1279 0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1280 0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1281 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
/*
 * Vectors for TEST_CASE(12).
 * IV12: 60-byte IV (longer than the 12-byte IVs used by cases 9 and 15).
 * C12:  60 bytes of ciphertext compared against the encrypt output.
 * T12:  16-byte tag handed to CRYPTO_gcm128_finish().
 * NOTE(review): K12, P12 and A12 are not declared in the visible lines;
 * presumably they alias an earlier case - confirm.
 */
1287 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1288 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1289 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1290 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1291 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1292 0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1293 0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1294 0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1295 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
/*
 * Vectors for TEST_CASE(13).
 * K13 has no initializer, so as a static object it is 32 zero bytes;
 * TEST_CASE passes sizeof(K13)*8 (256) as the AES key length in bits.
 * T13 is the 16-byte tag handed to CRYPTO_gcm128_finish().
 * NOTE(review): P13, C13, IV13 and A13 are not declared in the visible
 * lines - confirm where they are defined.
 */
1298 static const u8 K13[32],
1303 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
/*
 * Vectors for TEST_CASE(14).
 * P14: 16 zero bytes of plaintext (static object without initializer).
 * C14: the 16-byte ciphertext the encrypt pass is compared against.
 * T14: the 16-byte tag handed to CRYPTO_gcm128_finish().
 * NOTE(review): K14, IV14 and A14 are not declared in the visible lines;
 * presumably they are aliases of earlier vectors - confirm.
 */
1308 static const u8 P14[16],
1310 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1311 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
/*
 * Vectors for TEST_CASE(15).
 * K15:  32-byte key (TEST_CASE passes sizeof(K15)*8 = 256 as the bit length).
 * P15:  64 bytes of plaintext.
 * IV15: 12-byte IV.
 * C15:  64 bytes of ciphertext compared against the encrypt output.
 * T15:  16-byte tag handed to CRYPTO_gcm128_finish().
 * NOTE(review): A15 is not declared in the visible lines - confirm.
 */
1315 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1316 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1317 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1318 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1319 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1320 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1321 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1322 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1323 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1324 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1325 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1326 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
/*
 * Vectors for TEST_CASE(16).
 * P16: 60 bytes of plaintext (not a multiple of 16).
 * A16: 20 bytes of additional authenticated data fed to CRYPTO_gcm128_aad().
 * C16: 60 bytes of ciphertext compared against the encrypt output.
 * T16: 16-byte tag handed to CRYPTO_gcm128_finish().
 * NOTE(review): K16 and IV16 are not declared in the visible lines;
 * presumably they alias an earlier case - confirm.
 */
1331 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1332 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1333 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1334 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1335 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1336 0xab,0xad,0xda,0xd2},
1337 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1338 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1339 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1340 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1341 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
/*
 * Vectors for TEST_CASE(17).
 * IV17: 8-byte IV (shorter than the 12-byte IVs used by cases 9 and 15).
 * C17:  60 bytes of ciphertext compared against the encrypt output.
 * T17:  16-byte tag handed to CRYPTO_gcm128_finish().
 * NOTE(review): K17, P17 and A17 are not declared in the visible lines;
 * presumably they alias an earlier case - confirm.
 */
1347 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1348 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1349 0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1350 0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1351 0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1352 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
/*
 * Vectors for TEST_CASE(18).
 * IV18: 60-byte IV (longer than the 12-byte IVs used by cases 9 and 15).
 * C18:  60 bytes of ciphertext compared against the encrypt output.
 * T18:  16-byte tag handed to CRYPTO_gcm128_finish().
 * NOTE(review): K18, P18 and A18 are not declared in the visible lines;
 * presumably they alias an earlier case - confirm.
 */
1358 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1359 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1360 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1361 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1362 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1363 0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1364 0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1365 0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1366 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
/*
 * TEST_CASE(n): run vector set n through one encrypt pass and one
 * decrypt pass, bumping `ret` and printing a message for each pass that
 * fails.  The names K<n>, IV<n>, A<n>, P<n>, C<n> and T<n> are formed by
 * token pasting; `key`, `ctx` and `ret` must already exist in the
 * expanding scope.
 *
 * Encrypt pass: key the AES schedule from K<n> (sizeof(K<n>)*8 bits),
 * initialise the GCM context with AES_encrypt as the block function, set
 * the IV, feed A<n> as AAD, encrypt P<n> into the local `out` buffer
 * (sized sizeof(P<n>)), then fail if CRYPTO_gcm128_finish() against the
 * 16-byte T<n> returns nonzero or `out` differs from C<n>.
 *
 * Decrypt pass: set the IV again on the same context (the key is not
 * re-initialised), feed the AAD, decrypt C<n> into `out`, then fail if
 * CRYPTO_gcm128_finish() returns nonzero or `out` differs from P<n>.
 *
 * The `if (A<n>)`, `if (P<n>)` and `if (C<n>)` guards skip a step, and
 * `C<n> &&` / `P<n> &&` skip a comparison, when the name evaluates to a
 * null pointer.
 *
 * NOTE(review): the last line below still ends in a line continuation
 * and no closing line for the `do {` is visible after it - confirm the
 * macro's terminator against the upstream file.
 */
1368 #define TEST_CASE(n) do { \
1369 u8 out[sizeof(P##n)]; \
1370 AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \
1371 CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \
1372 CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
1373 if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
1374 if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \
1375 if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
1376 (C##n && memcmp(out,C##n,sizeof(out)))) \
1377 ret++, printf ("encrypt test#%d failed.\n",n);\
1378 CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
1379 if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
1380 if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \
1381 if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
1382 (P##n && memcmp(out,P##n,sizeof(out)))) \
1383 ret++, printf ("decrypt test#%d failed.\n",n); \
1411 #ifdef OPENSSL_CPUID_OBJ
1413 size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1414 union { u64 u; u8 c[1024]; } buf;
1417 AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1418 CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1419 CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1421 CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1422 start = OPENSSL_rdtsc();
1423 CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1424 gcm_t = OPENSSL_rdtsc() - start;
1426 CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1427 &key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
1428 (block128_f)AES_encrypt);
1429 start = OPENSSL_rdtsc();
1430 CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1431 &key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
1432 (block128_f)AES_encrypt);
1433 ctr_t = OPENSSL_rdtsc() - start;
1435 printf("%.2f-%.2f=%.2f\n",
1436 gcm_t/(double)sizeof(buf),
1437 ctr_t/(double)sizeof(buf),
1438 (gcm_t-ctr_t)/(double)sizeof(buf));
1440 GHASH(&ctx,buf.c,sizeof(buf));
1441 start = OPENSSL_rdtsc();
1442 for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1443 gcm_t = OPENSSL_rdtsc() - start;
1444 printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);