1 /* ====================================================================
2 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in
13 * the documentation and/or other materials provided with the
16 * 3. All advertising materials mentioning features or use of this
17 * software must display the following acknowledgment:
18 * "This product includes software developed by the OpenSSL Project
19 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22 * endorse or promote products derived from this software without
23 * prior written permission. For written permission, please contact
24 * openssl-core@openssl.org.
26 * 5. Products derived from this software may not be called "OpenSSL"
27 * nor may "OpenSSL" appear in their names without prior written
28 * permission of the OpenSSL Project.
30 * 6. Redistributions of any form whatsoever must retain the following
32 * "This product includes software developed by the OpenSSL Project
33 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46 * OF THE POSSIBILITY OF SUCH DAMAGE.
47 * ====================================================================
50 #include "modes_lcl.h"
/* 128-bit quantity stored as two 64-bit halves; hi holds the most
 * significant 64 bits.  Used for GF(2^128) field elements throughout. */
60 typedef struct { u64 hi,lo; } u128;
62 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
63 /* redefine, because alignment is ensured */
/* Big-endian 32-bit load/store done as a single byte-swapped word access.
 * Only valid because this translation unit guarantees 4-byte alignment of
 * the pointers it passes in (see the comment on line 63 above). */
65 #define GETU32(p) BSWAP4(*(const u32 *)(p))
67 #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
/* Place a 16-bit constant into the top 16 bits of a size_t; used to build
 * the rem_4bit/rem_8bit reduction tables below so that a single XOR into
 * the high word performs the polynomial reduction step. */
70 #define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
/* Divide V by x in GF(2^128): shift the 128-bit value right by one bit and,
 * if a bit fell off the low end, fold in the GCM reduction polynomial
 * constant 0xE1 (here as 0xe1000000...).  The sizeof(size_t) test selects a
 * 64-bit or 32-bit-friendly variant at compile time (it is a constant, so
 * the dead branch is removed).
 * NOTE(review): this listing elides lines (the embedded numbering jumps
 * 75 -> 78); the `} else {` line and the macro's closing `} while(0)` are
 * not visible here — confirm against the full source. */
71 #define REDUCE1BIT(V) do { \
72 if (sizeof(size_t)==8) { \
73 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
74 V.lo = (V.hi<<63)|(V.lo>>1); \
75 V.hi = (V.hi>>1 )^T; \
78 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
79 V.lo = (V.hi<<63)|(V.lo>>1); \
80 V.hi = (V.hi>>1 )^((u64)T<<32); \
88 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
89 * never be set to 8. 8 is effectively reserved for testing purposes.
90 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
91 * "Shoup's" in GCM specification. In other words OpenSSL does not cover
92 * whole spectrum of possible table driven implementations. Why? In
93 * non-"Shoup's" case memory access pattern is segmented in such manner,
94 * that it's trivial to see that cache timing information can reveal
95 * fair portion of intermediate hash value. Given that ciphertext is
96 * always available to attacker, it's possible for him to attempt to
97 * deduce secret parameter H and if successful, tamper with messages
98 * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
99 * not as trivial, but there is no reason to believe that it's resistant
100 * to cache-timing attack. And the thing about "8-bit" implementation is
101 * that it consumes 16 (sixteen) times more memory, 4KB per individual
102 * key + 1KB shared. Well, on pros side it should be twice as fast as
103 * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
104 * was observed to run ~75% faster, closer to 100% for commercial
105 * compilers... Yet "4-bit" procedure is preferred, because it's
106 * believed to provide better security-performance balance and adequate
107 * all-round performance. "All-round" refers to things like:
109 * - shorter setup time effectively improves overall timing for
110 * handling short messages;
111 * - larger table allocation can become unbearable because of VM
112 * subsystem penalties (for example on Windows large enough free
113 * results in VM working set trimming, meaning that consequent
114 * malloc would immediately incur working set expansion);
115 * - larger table has larger cache footprint, which can affect
116 * performance of other code paths (not necessarily even from same
117 * thread in Hyper-Threading world);
/* Precompute the 256-entry multiplication table for hash key H:
 * Htable[i] = i*H in GF(2^128).  Power-of-two entries are produced by
 * repeated REDUCE1BIT halving starting from Htable[128]=H (first loop);
 * all remaining indices are filled by XOR-combining the power-of-two
 * entry with every smaller index (second loop), since multiplication by
 * a sum is the XOR of the products in GF(2).
 * NOTE(review): interior lines are elided in this listing (embedded
 * numbering jumps 123 -> 133); variable declarations and loop bodies are
 * partially missing. */
123 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
133 	for (Htable[128]=V, i=64; i>0; i>>=1) {
138 	for (i=2; i<256; i<<=1) {
139 		u128 *Hi = Htable+i, H0 = *Hi;
140 		for (j=1; j<i; ++j) {
141 			Hi[j].hi = H0.hi^Htable[j].hi;
142 			Hi[j].lo = H0.lo^Htable[j].lo;
/* Multiply Xi by H in GF(2^128) using the 8-bit table, in place:
 * Xi = Xi * H.  Processes Xi one byte at a time from the last byte
 * (xi = Xi+15) backwards, looking each byte up in Htable and reducing
 * the 8 bits shifted out through the precomputed rem_8bit constants.
 * NOTE(review): this listing elides lines (numbering jumps 150 -> 152,
 * 221 -> 223, etc.); declarations of Z/n and the enclosing loop header
 * are not visible. */
147 static void gcm_gmult_8bit(u64 Xi[2], u128 Htable[256])
150 	const u8 *xi = (const u8 *)Xi+15;
152 	const union { long one; char little; } is_endian = {1};
	/* rem_8bit[b] is the 16-bit remainder of b*x^? reduced by the GCM
	 * polynomial, pre-shifted into the top of a size_t via PACK() so it
	 * can be XORed straight into Z.hi (64-bit) or its upper half
	 * (32-bit size_t case, see the sizeof test below). */
153 	static const size_t rem_8bit[256] = {
154 		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
155 		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
156 		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
157 		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
158 		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
159 		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
160 		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
161 		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
162 		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
163 		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
164 		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
165 		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
166 		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
167 		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
168 		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
169 		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
170 		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
171 		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
172 		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
173 		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
174 		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
175 		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
176 		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
177 		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
178 		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
179 		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
180 		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
181 		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
182 		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
183 		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
184 		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
185 		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
186 		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
187 		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
188 		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
189 		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
190 		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
191 		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
192 		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
193 		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
194 		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
195 		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
196 		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
197 		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
198 		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
199 		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
200 		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
201 		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
202 		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
203 		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
204 		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
205 		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
206 		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
207 		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
208 		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
209 		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
210 		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
211 		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
212 		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
213 		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
214 		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
215 		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
216 		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
217 		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
	/* Per-byte accumulation: fold the table entry for the current byte
	 * into Z, then shift Z right by 8 bits and reduce the spilled byte
	 * via rem_8bit.  (Loop header elided in this listing.) */
220 		Z.hi ^= Htable[n].hi;
221 		Z.lo ^= Htable[n].lo;
223 		if ((u8 *)Xi==xi)	break;
227 		rem  = (size_t)Z.lo&0xff;
228 		Z.lo = (Z.hi<<56)|(Z.lo>>8);
	/* sizeof(size_t) is a compile-time constant: on 64-bit the PACKed
	 * remainder already sits in the top 16 bits of Z.hi; on 32-bit it
	 * must be shifted up by 32 first. */
230 		if (sizeof(size_t)==8)
231 			Z.hi ^= rem_8bit[rem];
233 			Z.hi ^= (u64)rem_8bit[rem]<<32;
	/* Store Z back into Xi in big-endian byte order: single BSWAP8 on
	 * little-endian hosts when available, else 32-bit PUTU32 stores. */
236 	if (is_endian.little) {
238 		Xi[0] = BSWAP8(Z.hi);
239 		Xi[1] = BSWAP8(Z.lo);
243 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
244 		v = (u32)(Z.hi);	PUTU32(p+4,v);
245 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
246 		v = (u32)(Z.lo);	PUTU32(p+12,v);
/* TABLE_BITS==8 binding: GCM_MUL(ctx,Xi) performs Xi *= H via the 8-bit
 * table stored in ctx->Htable. */
254 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
/* Precompute the 16-entry 4-bit table: Htable[i] = i*H in GF(2^128).
 * Two strategies: the OPENSSL_SMALL_FOOTPRINT path uses the same
 * loop structure as gcm_init_8bit (halving from Htable[8] then XOR
 * combination), while the default path unrolls the XOR combinations
 * explicitly for indices 3..15.
 * NOTE(review): interior lines elided in this listing (numbering jumps
 * 261 -> 270, 281 -> 292, etc.); declarations and the halving loop body
 * are partially missing. */
258 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
261 #if defined(OPENSSL_SMALL_FOOTPRINT)
270 #if defined(OPENSSL_SMALL_FOOTPRINT)
271 	for (Htable[8]=V, i=4; i>0; i>>=1) {
276 	for (i=2; i<16; i<<=1) {
279 		for (V=*Hi, j=1; j<i; ++j) {
280 			Hi[j].hi = V.hi^Htable[j].hi;
281 			Hi[j].lo = V.lo^Htable[j].lo;
	/* Unrolled variant: each composite index is the XOR of a power of
	 * two (held in V at that point) and a smaller, already-computed
	 * entry — linearity of multiplication over GF(2). */
292 	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
294 	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
295 	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
296 	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
298 	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
299 	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
300 	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
301 	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
302 	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
303 	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
304 	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
306 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
308 	 * ARM assembler expects specific dword order in Htable.
312 	const union { long one; char little; } is_endian = {1};
314 	if (is_endian.little)
	/* Swap the 32-bit halves of each dword for the ARM assembler's
	 * expected layout (little-endian hosts only). */
323 			Htable[j].hi = V.lo<<32|V.lo>>32;
324 			Htable[j].lo = V.hi<<32|V.hi>>32;
/* Reduction constants for the 4-bit path: rem_4bit[b] is the 16-bit
 * remainder for the nibble b shifted out during Z >>= 4, pre-positioned
 * in the top bits of a size_t by PACK() (see PACK above). */
331 static const size_t rem_4bit[16] = {
332 	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
333 	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
334 	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
335 	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
/* Multiply Xi by H in place using the 16-entry table: Xi = Xi * H.
 * Processes Xi one nibble at a time, starting from the low nibble of
 * byte 15; after each table lookup, Z is shifted right 4 bits and the
 * spilled nibble is reduced via rem_4bit.
 * NOTE(review): interior lines elided in this listing (numbering jumps
 * 344 -> 348, 361 -> 365, etc.); the loop header, nhi extraction and
 * several braces are not visible. */
337 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
341 	size_t rem, nlo, nhi;
342 	const union { long one; char little; } is_endian = {1};
344 	nlo  = ((const u8 *)Xi)[15];
348 	Z.hi = Htable[nlo].hi;
349 	Z.lo = Htable[nlo].lo;
	/* Shift-and-reduce step, repeated per nibble (64-/32-bit size_t
	 * variants as in gcm_gmult_8bit). */
352 		rem  = (size_t)Z.lo&0xf;
353 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
355 		if (sizeof(size_t)==8)
356 			Z.hi ^= rem_4bit[rem];
358 			Z.hi ^= (u64)rem_4bit[rem]<<32;
360 		Z.hi ^= Htable[nhi].hi;
361 		Z.lo ^= Htable[nhi].lo;
365 		nlo  = ((const u8 *)Xi)[cnt];
369 		rem  = (size_t)Z.lo&0xf;
370 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
372 		if (sizeof(size_t)==8)
373 			Z.hi ^= rem_4bit[rem];
375 			Z.hi ^= (u64)rem_4bit[rem]<<32;
377 		Z.hi ^= Htable[nlo].hi;
378 		Z.lo ^= Htable[nlo].lo;
	/* Write Z back to Xi in big-endian order (BSWAP8 fast path on
	 * little-endian hosts, PUTU32 otherwise). */
381 	if (is_endian.little) {
383 		Xi[0] = BSWAP8(Z.hi);
384 		Xi[1] = BSWAP8(Z.lo);
388 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
389 		v = (u32)(Z.hi);	PUTU32(p+4,v);
390 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
391 		v = (u32)(Z.lo);	PUTU32(p+12,v);
400 #if !defined(OPENSSL_SMALL_FOOTPRINT)
402 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
403 * details... Compiler-generated code doesn't seem to give any
404 * performance improvement, at least not on x86[_64]. It's here
405 * mostly as reference and a placeholder for possible future
406 * non-trivial optimization[s]...
/* GHASH over a whole buffer: for each 16-byte block of inp, Xi ^= block,
 * then Xi *= H, using the 4-bit table.  One code path mirrors
 * gcm_gmult_4bit per block; the other pre-computes Hshr4/Hshl4 so each
 * iteration handles a full byte (two nibbles) per lookup, trading
 * per-call setup for a faster inner loop (see comment at line 501).
 * len is assumed to be a multiple of 16 (the final do/while on line 586
 * subtracts 16 per iteration).
 * NOTE(review): interior lines elided in this listing (numbering jumps
 * 414 -> 419, 455 -> 459, etc.); loop headers, the #if/#else structure
 * selecting between the two paths, and several braces are not visible. */
408 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
409 				const u8 *inp,size_t len)
413     size_t rem, nlo, nhi;
414     const union { long one; char little; } is_endian = {1};
	/* Path 1: straightforward per-nibble processing, same structure as
	 * gcm_gmult_4bit but with the input block XORed into the state. */
419 	nlo  = ((const u8 *)Xi)[15];
424 	Z.hi = Htable[nlo].hi;
425 	Z.lo = Htable[nlo].lo;
428 		rem  = (size_t)Z.lo&0xf;
429 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
431 		if (sizeof(size_t)==8)
432 			Z.hi ^= rem_4bit[rem];
434 			Z.hi ^= (u64)rem_4bit[rem]<<32;
436 		Z.hi ^= Htable[nhi].hi;
437 		Z.lo ^= Htable[nhi].lo;
441 		nlo  = ((const u8 *)Xi)[cnt];
446 		rem  = (size_t)Z.lo&0xf;
447 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
449 		if (sizeof(size_t)==8)
450 			Z.hi ^= rem_4bit[rem];
452 			Z.hi ^= (u64)rem_4bit[rem]<<32;
454 		Z.hi ^= Htable[nlo].hi;
455 		Z.lo ^= Htable[nlo].lo;
459     * Extra 256+16 bytes per-key plus 512 bytes shared tables
460     * [should] give ~50% improvement... One could have PACK()-ed
461     * the rem_8bit even here, but priority is to minimize memory
464     u128 Hshr4[16];	/* Htable shifted right by 4 bits */
465     u8   Hshl4[16];	/* Htable shifted left by 4 bits */
	/* Same remainder table as in gcm_gmult_8bit, but as plain unsigned
	 * short (not PACKed) — shifted into place explicitly at use sites. */
466     static const unsigned short rem_8bit[256] = {
467 	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
468 	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
469 	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
470 	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
471 	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
472 	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
473 	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
474 	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
475 	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
476 	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
477 	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
478 	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
479 	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
480 	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
481 	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
482 	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
483 	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
484 	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
485 	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
486 	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
487 	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
488 	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
489 	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
490 	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
491 	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
492 	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
493 	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
494 	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
495 	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
496 	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
497 	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
498 	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
501  * This pre-processing phase slows down procedure by approximately
502  * same time as it makes each loop spin faster. In other words
503  * single block performance is approximately same as straightforward
504  * "4-bit" implementation, and then it goes only faster...
	/* Derive the shifted companion tables: Hshr4[i] = Htable[i] >> 4
	 * (128-bit shift), Hshl4[i] = low nibble of Htable[i] shifted up —
	 * the bits that spill out of the 128-bit window, reduced through
	 * rem_8bit below. */
506     for (cnt=0; cnt<16; ++cnt) {
507 	Z.hi = Htable[cnt].hi;
508 	Z.lo = Htable[cnt].lo;
509 	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
510 	Hshr4[cnt].hi = (Z.hi>>4);
511 	Hshl4[cnt]    = (u8)(Z.lo<<4);
	/* Per-block loop: one byte (two nibbles) per iteration, walking
	 * from byte 15 down to byte 0. */
515 	nlo  = ((const u8 *)Xi)[15];
520 	Z.hi = Htable[nlo].hi;
521 	Z.lo = Htable[nlo].lo;
523 		rem  = (size_t)Z.lo&0xff;
525 		Z.lo = (Z.hi<<56)|(Z.lo>>8);
528 		Z.hi ^= Hshr4[nhi].hi;
529 		Z.lo ^= Hshr4[nhi].lo;
530 		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
532 	for (cnt=14; cnt>0; --cnt) {
533 		nlo  = ((const u8 *)Xi)[cnt];
538 		Z.hi ^= Htable[nlo].hi;
539 		Z.lo ^= Htable[nlo].lo;
541 		rem  = (size_t)Z.lo&0xff;
543 		Z.lo = (Z.hi<<56)|(Z.lo>>8);
546 		Z.hi ^= Hshr4[nhi].hi;
547 		Z.lo ^= Hshr4[nhi].lo;
548 		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
	/* Final byte (index 0): only a 4-bit shift remains. */
551 	nlo  = ((const u8 *)Xi)[0];
556 	Z.hi ^= Htable[nlo].hi;
557 	Z.lo ^= Htable[nlo].lo;
559 	rem  = (size_t)Z.lo&0xf;
561 	Z.lo = (Z.hi<<60)|(Z.lo>>4);
564 	Z.hi ^= Htable[nhi].hi;
565 	Z.lo ^= Htable[nhi].lo;
566 	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
	/* Store Z back to Xi big-endian, then advance to the next block. */
569 	if (is_endian.little) {
571 		Xi[0] = BSWAP8(Z.hi);
572 		Xi[1] = BSWAP8(Z.lo);
576 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
577 		v = (u32)(Z.hi);	PUTU32(p+4,v);
578 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
579 		v = (u32)(Z.lo);	PUTU32(p+12,v);
586     } while (inp+=16, len-=16);
/* When GHASH_ASM is defined the 4-bit routines above are provided by
 * platform assembler instead; these are their prototypes. */
590 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
591 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
/* TABLE_BITS==4 bindings: GCM_MUL multiplies the running hash by H;
 * GHASH folds a whole len-byte buffer into ctx->Xi. */
594 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
595 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
596 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
597 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
598  * trashing effect. In other words idea is to hash data while it's
599  * still in L1 cache after encryption pass... */
600 #define GHASH_CHUNK       1024
603 #else /* TABLE_BITS */
/* TABLE_BITS==1 fallback: bit-serial GF(2^128) multiplication, no
 * precomputed table beyond H itself.  Walks Xi one machine word at a
 * time (loaded big-endian), then one bit at a time; M is an all-ones or
 * all-zeros mask derived from the current top bit of X, used to
 * conditionally XOR V into Z without branching.
 * NOTE(review): interior lines elided in this listing (numbering jumps
 * 618 -> 620, 635 -> 643, etc.); Z/V declarations, the big-endian
 * fallback load and the REDUCE1BIT call sites are not visible. */
605 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
610 	const long *xi = (const long *)Xi;
611 	const union { long one; char little; } is_endian = {1};
613 	V.hi = H[0];	/* H is in host byte order, no byte swapping */
616 	for (j=0; j<16/sizeof(long); ++j) {
617 		if (is_endian.little) {
618 			if (sizeof(long)==8) {
620 				X = (long)(BSWAP8(xi[j]));
622 				const u8 *p = (const u8 *)(xi+j);
623 				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
627 			const u8 *p = (const u8 *)(xi+j);
	/* Process each bit of X from most significant down. */
634 		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
635 			u64 M = (u64)(X>>(8*sizeof(long)-1));
	/* Store the product back into Xi big-endian, as in the table paths. */
643 	if (is_endian.little) {
645 		Xi[0] = BSWAP8(Z.hi);
646 		Xi[1] = BSWAP8(Z.lo);
650 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
651 		v = (u32)(Z.hi);	PUTU32(p+4,v);
652 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
653 		v = (u32)(Z.lo);	PUTU32(p+12,v);
/* TABLE_BITS==1 binding: multiply by ctx->H directly, no table. */
661 #define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
/* GCM state.  The anonymous unions allow each 16-byte quantity to be
 * viewed as bytes, 32-bit or 64-bit words depending on the code path.
 * Yi = counter block, EKi = encrypted counter, EK0 = E(K, Y0) used to
 * mask the final tag (see CRYPTO_gcm128_finish).
 * NOTE(review): this listing elides lines 668, 670-673, 676, 678+ —
 * additional members (Xi, H, len, Htable, block, key, ...) are implied
 * by their use elsewhere in this file but not visible here. */
665 struct gcm128_context {
666 	/* Following 6 names follow names in GCM specification */
667 	union { u64 u[2]; u32 d[4]; u8 c[16]; }	Yi,EKi,EK0,
669 	/* Pre-computed table used by gcm_gmult_* */
	/* Function-pointer dispatch filled in by CRYPTO_gcm128_init so the
	 * hot path can use CPU-specific implementations (see GHASH_ASM_IAX). */
674 	void (*gmult)(u64 Xi[2],const u128 Htable[16]);
675 	void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
677 	unsigned int res, pad;
/* x86/x86_64 assembler dispatch: when 4-bit tables and GHASH_ASM are in
 * effect on IA-32/AMD64, declare the carry-less-multiply (PCLMULQDQ)
 * routines and — on 32-bit x86 — MMX and plain-x86 4-bit variants.  The
 * actual selection happens at runtime in CRYPTO_gcm128_init based on
 * OPENSSL_ia32cap_P CPUID bits. */
682 #if	TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
683 	(defined(__i386)	|| defined(__i386__)	|| \
684 	 defined(__x86_64)	|| defined(__x86_64__)	|| \
685 	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
686 # define GHASH_ASM_IAX
687 extern unsigned int OPENSSL_ia32cap_P[2];
689 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
690 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
691 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
693 # if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
694 #  define GHASH_ASM_X86
695 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
696 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
698 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
699 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
/* Route GCM_MUL/GHASH through the function pointers chosen at init time. */
703 #  define GCM_MUL(ctx,Xi)   (*((ctx)->gmult))(ctx->Xi.u,ctx->Htable)
705 #  define GHASH(ctx,in,len) (*((ctx)->ghash))((ctx)->Xi.u,(ctx)->Htable,in,len)
/* Initialize a GCM context: zero the state, compute the hash key
 * H = E(K, 0^128) with the supplied block cipher, convert H to host
 * byte order, then build the multiplication table and (on IA-32/AMD64)
 * pick the fastest available gmult/ghash implementation from CPUID bits.
 * NOTE(review): interior lines elided in this listing (numbering jumps
 * 712 -> 716, 727 -> 734, etc.); ctx->block/ctx->key assignments and
 * several #if/#elif lines are not visible. */
708 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
710 	const union { long one; char little; } is_endian = {1};
712 	memset(ctx,0,sizeof(*ctx));
	/* H = E(K, 0): ctx->H.c is all-zero after the memset above. */
716 	(*block)(ctx->H.c,ctx->H.c,key);
718 	if (is_endian.little) {
719 		/* H is stored in host byte order */
721 		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
722 		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
726 		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
727 		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
734 	gcm_init_8bit(ctx->Htable,ctx->H.u);
736 # if	defined(GHASH_ASM_IAX)
	/* CPUID word 1 bit 1 selects the PCLMULQDQ (carry-less multiply)
	 * path — NOTE(review): confirm bit meaning against OPENSSL_ia32cap
	 * documentation. */
737 	if (OPENSSL_ia32cap_P[1]&(1<<1)) {
738 		gcm_init_clmul(ctx->Htable,ctx->H.u);
739 		ctx->gmult = gcm_gmult_clmul;
740 		ctx->ghash = gcm_ghash_clmul;
743 	gcm_init_4bit(ctx->Htable,ctx->H.u);
744 #  if	defined(GHASH_ASM_X86)
	/* CPUID word 0 bit 23 gates the MMX variant on 32-bit x86. */
745 	if (OPENSSL_ia32cap_P[0]&(1<<23)) {
746 		ctx->gmult = gcm_gmult_4bit_mmx;
747 		ctx->ghash = gcm_ghash_4bit_mmx;
749 		ctx->gmult = gcm_gmult_4bit_x86;
750 		ctx->ghash = gcm_ghash_4bit_x86;
753 	ctx->gmult = gcm_gmult_4bit;
754 	ctx->ghash = gcm_ghash_4bit;
757 	gcm_init_4bit(ctx->Htable,ctx->H.u);
/* Set the IV and derive Y0 and EK0.  A 96-bit IV is used directly as
 * the top of the counter block (per the GCM spec); any other length is
 * GHASHed: full 16-byte chunks then the remainder are XORed into Yi
 * with a GCM_MUL after each, and finally the bit length len0 is folded
 * in.  The low 32 bits of Yi become the running counter ctr.
 * NOTE(review): interior lines elided in this listing (numbering jumps
 * 764 -> 776, 815 -> 820, etc.); len0 computation, the GCM_MUL call
 * sites, counter increment and big-endian fallback branches are not
 * visible. */
762 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
764 	const union { long one; char little; } is_endian = {1};
	/* Fast path: 96-bit IV copied verbatim into Yi[0..11]. */
776 		memcpy(ctx->Yi.c,iv,12);
	/* Non-96-bit IV: GHASH the IV material into Yi. */
785 			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
791 			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
	/* Fold the IV bit length (len0) into the last GHASH block;
	 * byte-wise path covers hosts without BSWAP8. */
795 		if (is_endian.little) {
797 			ctx->Yi.u[1]  ^= BSWAP8(len0);
799 			ctx->Yi.c[8]  ^= (u8)(len0>>56);
800 			ctx->Yi.c[9]  ^= (u8)(len0>>48);
801 			ctx->Yi.c[10] ^= (u8)(len0>>40);
802 			ctx->Yi.c[11] ^= (u8)(len0>>32);
803 			ctx->Yi.c[12] ^= (u8)(len0>>24);
804 			ctx->Yi.c[13] ^= (u8)(len0>>16);
805 			ctx->Yi.c[14] ^= (u8)(len0>>8);
806 			ctx->Yi.c[15] ^= (u8)(len0);
810 			ctx->Yi.u[1]  ^= len0;
	/* Extract the 32-bit counter from the tail of Yi (big-endian). */
814 	if (is_endian.little)
815 		ctr = GETU32(ctx->Yi.c+12);
	/* EK0 = E(K, Y0) — retained to mask the tag in _finish. */
820 	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
822 	if (is_endian.little)
823 		PUTU32(ctx->Yi.c+12,ctr);
/* Absorb additional authenticated data: XOR aad into the running hash
 * Xi in 16-byte blocks (with a GCM_MUL after each — call sites elided
 * in this listing), tracking the AAD byte count in len.u[0].
 * NOTE(review): interior lines elided (numbering jumps 835 -> 842,
 * 849 -> 854); the GHASH bulk path and partial-block bookkeeping are
 * not visible. */
828 void CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
832 	ctx->len.u[0] += len;
	/* i = len rounded down to a multiple of 16 (bulk portion). */
835 	if ((i = (len&(size_t)-16))) {
842 			for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
	/* Trailing partial block. */
849 		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
/* CTR-mode encrypt len bytes from in to out while folding the resulting
 * ciphertext into the GHASH state Xi.  Several paths: a byte-wise loop
 * for a leftover partial block (n != 0), a GHASH_CHUNK-strided bulk path
 * that encrypts then hashes while data is still cache-hot, a word-wide
 * (size_t) path, and a plain byte loop fallback.  The 32-bit counter in
 * the tail of Yi is kept in host form in ctr and written back big-endian
 * via PUTU32 after each increment.
 * NOTE(review): interior lines elided in this listing (numbering jumps
 * 858 -> 862, 900 -> 905, etc.); counter increments (++ctr), the
 * GCM_MUL/GHASH call sites after each block, and the STRICT_ALIGNMENT
 * fallback target are not visible. */
854 void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
855 		const unsigned char *in, unsigned char *out,
858 	const union { long one; char little; } is_endian = {1};
862 	ctx->len.u[1] += len;
864 	if (is_endian.little)
865 		ctr = GETU32(ctx->Yi.c+12);
869 #if !defined(OPENSSL_SMALL_FOOTPRINT)
870 	if (16%sizeof(size_t) == 0) do {	/* always true actually */
	/* Finish a previously-started partial block: ciphertext byte is
	 * XORed into Xi as it is produced. */
873 				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
877 			if (n==0) GCM_MUL(ctx,Xi);
883 #if defined(STRICT_ALIGNMENT)
	/* Word-wide paths require aligned in/out; otherwise fall through
	 * to the byte-wise code (branch target elided here). */
884 		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
887 #if defined(GHASH) && defined(GHASH_CHUNK)
	/* Bulk path: encrypt GHASH_CHUNK bytes, then GHASH them in one
	 * call while still in L1 (see GHASH_CHUNK comment above). */
888 		while (len>=GHASH_CHUNK) {
889 		    size_t j=GHASH_CHUNK;
892 			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
894 			if (is_endian.little)
895 				PUTU32(ctx->Yi.c+12,ctr);
898 			for (i=0; i<16; i+=sizeof(size_t))
900 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
905 		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
	/* Remaining whole blocks (i = len rounded down to 16). */
908 		if ((i = (len&(size_t)-16))) {
912 			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
914 			if (is_endian.little)
915 				PUTU32(ctx->Yi.c+12,ctr);
918 			for (i=0; i<16; i+=sizeof(size_t))
920 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
	/* Variant without bulk GHASH: hash each block into Xi inline. */
929 			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
931 			if (is_endian.little)
932 				PUTU32(ctx->Yi.c+12,ctr);
935 			for (i=0; i<16; i+=sizeof(size_t))
936 				*(size_t *)(ctx->Xi.c+i) ^=
938 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
	/* Final partial block: start a new EKi and consume byte-wise. */
946 			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
948 			if (is_endian.little)
949 				PUTU32(ctx->Yi.c+12,ctr);
953 				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
	/* Small-footprint fallback: strictly byte-at-a-time. */
962 	for (i=0;i<len;++i) {
964 			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
966 			if (is_endian.little)
967 				PUTU32(ctx->Yi.c+12,ctr);
971 		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
/* CTR-mode decrypt len bytes from in to out.  Mirror image of
 * CRYPTO_gcm128_encrypt, except the GHASH input is the *ciphertext*
 * (i.e. in), so the hash absorbs data before/independently of the
 * keystream XOR — note GHASH(ctx,in,...) here vs. GHASH(ctx,out,...)
 * in the encrypt path, and the temporary c holding the ciphertext
 * word/byte before it is overwritten when in == out (in-place use).
 * NOTE(review): interior lines elided in this listing (numbering jumps
 * 984 -> 988, 1029 -> 1036, etc.); counter increments, GCM_MUL call
 * sites and loop headers are not visible. */
980 void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
981 		const unsigned char *in, unsigned char *out,
984 	const union { long one; char little; } is_endian = {1};
988 	ctx->len.u[1] += len;
990 	if (is_endian.little)
991 		ctr = GETU32(ctx->Yi.c+12);
995 #if !defined(OPENSSL_SMALL_FOOTPRINT)
996 	if (16%sizeof(size_t) == 0) do {	/* always true actually */
	/* Finish a pending partial block; c (the ciphertext byte, saved
	 * before overwrite — assignment elided here) feeds the hash. */
1000 				*(out++) = c^ctx->EKi.c[n];
1005 			if (n==0) GCM_MUL (ctx,Xi);
1011 #if defined(STRICT_ALIGNMENT)
1012 		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1015 #if defined(GHASH) && defined(GHASH_CHUNK)
	/* Bulk path: hash the ciphertext chunk first, then decrypt it. */
1016 		while (len>=GHASH_CHUNK) {
1017 		    size_t j=GHASH_CHUNK;
1019 		    GHASH(ctx,in,GHASH_CHUNK);
1021 			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1023 			if (is_endian.little)
1024 				PUTU32(ctx->Yi.c+12,ctr);
1027 			for (i=0; i<16; i+=sizeof(size_t))
1028 				*(size_t *)(out+i) =
1029 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
	/* Remaining whole blocks. */
1036 		if ((i = (len&(size_t)-16))) {
1039 			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1041 			if (is_endian.little)
1042 				PUTU32(ctx->Yi.c+12,ctr);
1045 			for (i=0; i<16; i+=sizeof(size_t))
1046 				*(size_t *)(out+i) =
1047 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
	/* Variant without bulk GHASH: save ciphertext word in c so the
	 * hash still sees it after an in-place overwrite. */
1055 			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1057 			if (is_endian.little)
1058 				PUTU32(ctx->Yi.c+12,ctr);
1061 			for (i=0; i<16; i+=sizeof(size_t)) {
1062 				size_t c = *(size_t *)(in+i);
1063 				*(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1064 				*(size_t *)(ctx->Xi.c+i) ^= c;
	/* Final partial block, byte-wise. */
1073 			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1075 			if (is_endian.little)
1076 				PUTU32(ctx->Yi.c+12,ctr);
1082 				out[n] = c^ctx->EKi.c[n];
	/* Small-footprint fallback: strictly byte-at-a-time. */
1091 	for (i=0;i<len;++i) {
1094 			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1096 			if (is_endian.little)
1097 				PUTU32(ctx->Yi.c+12,ctr);
1102 		out[i] ^= ctx->EKi.c[n];
/* Finalize the tag: fold the AAD bit length (alen) and ciphertext bit
 * length (clen) into Xi as the last GHASH block, then XOR in
 * EK0 = E(K, Y0) to produce the authentication tag in ctx->Xi.
 * NOTE(review): interior lines elided in this listing (numbering jumps
 * 1116 -> 1121, 1132 -> 1136); the flush of any pending partial block
 * (GCM_MUL when res/n != 0) and the final GCM_MUL are not visible. */
1112 void CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx)
1114 	const union { long one; char little; } is_endian = {1};
	/* Byte counts -> bit counts (<<3), as required by the length block. */
1115 	u64 alen = ctx->len.u[0]<<3;
1116 	u64 clen = ctx->len.u[1]<<3;
	/* Convert lengths to big-endian; BSWAP8 fast path, else rebuild
	 * from 32-bit big-endian loads of a byte-serialized copy. */
1121 	if (is_endian.little) {
1123 		alen = BSWAP8(alen);
1124 		clen = BSWAP8(clen);
1128 		ctx->len.u[0] = alen;
1129 		ctx->len.u[1] = clen;
1131 		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1132 		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
	/* len(A) || len(C) block into the hash... */
1136 	ctx->Xi.u[0] ^= alen;
1137 	ctx->Xi.u[1] ^= clen;
	/* ...then mask with EK0 to form the tag. */
1140 	ctx->Xi.u[0] ^= ctx->EK0.u[0];
1141 	ctx->Xi.u[1] ^= ctx->EK0.u[1];
/* Built-in self test: the K*/P*/A*/IV*/C*/T* arrays below appear to be
 * the standard AES-GCM test vectors (key/plaintext/AAD/IV/ciphertext/tag
 * for AES-128/192/256 cases) — NOTE(review): confirm against the GCM
 * specification's test-case appendix; the driver that runs them is not
 * visible in this listing. */
1144 #if defined(SELFTEST)
1146 #include <openssl/aes.h>
1149 static const u8 K1[16],
1154 T1[]= {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1160 static const u8 P2[16],
1161 C2[]= {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1162 T2[]= {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1166 static const u8 K3[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1167 P3[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1168 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1169 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1170 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1171 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1172 C3[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1173 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1174 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1175 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1176 T3[]= {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1181 static const u8 P4[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1182 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1183 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1184 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1185 A4[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1186 0xab,0xad,0xda,0xd2},
1187 C4[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1188 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1189 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1190 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1191 T4[]= {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1196 static const u8 A5[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1197 0xab,0xad,0xda,0xd2},
1198 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1199 C5[]= {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1200 0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1201 0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1202 0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1203 T5[]= {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1209 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1210 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1211 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1212 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1213 C6[]= {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1214 0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1215 0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1216 0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1217 T6[]= {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1220 static const u8 K7[24],
1225 T7[]= {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1231 static const u8 P8[16],
1232 C8[]= {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1233 T8[]= {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1237 static const u8 K9[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1238 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1239 P9[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1240 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1241 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1242 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1243 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1244 C9[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1245 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1246 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1247 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1248 T9[]= {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
/* Test Case 10: 60-byte plaintext with 20 bytes of AAD; key and IV are
 * presumably aliased to case 9's via #defines not shown in this excerpt. */
static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
0xab,0xad,0xda,0xd2},
C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
/* Test Case 11: short (8-byte) IV variant — exercises the non-96-bit IV
 * path; 60-byte expected ciphertext and 16-byte tag. */
static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
/* Test Case 12: long (60-byte) IV variant — exercises IV hashing through
 * GHASH; 60-byte expected ciphertext and 16-byte tag. */
static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
/* Test Case 13: all-zero 256-bit key (K13 is zero-filled) and its expected
 * 16-byte authentication tag. */
static const u8 K13[32],
T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
/* Test Case 14: all-zero 16-byte plaintext (P14 is zero-filled), expected
 * ciphertext block C14 and 16-byte tag T14. */
static const u8 P14[16],
C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
/* Test Case 15: 32-byte (256-bit) key, 64-byte plaintext, 12-byte IV,
 * expected 64-byte ciphertext and 16-byte tag. */
static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
/* Test Case 16: 60-byte plaintext with 20 bytes of AAD; key and IV are
 * presumably aliased to case 15's via #defines not shown in this excerpt. */
static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
0xab,0xad,0xda,0xd2},
C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
/* Test Case 17: short (8-byte) IV variant; 60-byte expected ciphertext and
 * 16-byte tag. */
static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
/* Test Case 18: long (60-byte) IV variant; 60-byte expected ciphertext and
 * 16-byte tag. */
static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
/*
 * Run self-test vector #n in both directions.  Token pasting selects the
 * K##n/IV##n/A##n/P##n/C##n/T##n arrays declared above; sizeof() on them
 * therefore yields the true vector lengths at the expansion site.
 *
 * Encrypt pass: key setup, IV, optional AAD, encrypt P##n into out, then
 * compare the computed tag (read directly from ctx.Xi.c) against T##n and
 * the ciphertext against C##n.  Decrypt pass: same, decrypting C##n and
 * comparing against P##n.  Each failure bumps ret and names the case.
 *
 * NOTE(review): the if (A##n)/if (P##n)/if (C##n) guards only make sense
 * if vectors missing from a test case are declared as null pointers in
 * the (not shown) declarations — confirm against the definitions above.
 * The return value of CRYPTO_gcm128_finish() is deliberately ignored;
 * the tag comparison below replaces it.  Requires ret, ctx and key to be
 * in scope at the expansion site.
 */
#define TEST_CASE(n) do { \
u8 out[sizeof(P##n)]; \
AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \
CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \
CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \
CRYPTO_gcm128_finish(&ctx); \
if (memcmp(ctx.Xi.c,T##n,16) || (C##n && memcmp(out,C##n,sizeof(out)))) \
ret++, printf ("encrypt test#%d failed.\n",n);\
CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \
CRYPTO_gcm128_finish(&ctx); \
if (memcmp(ctx.Xi.c,T##n,16) || (P##n && memcmp(out,P##n,sizeof(out)))) \
ret++, printf ("decrypt test#%d failed.\n",n);\
#ifdef OPENSSL_CPUID_OBJ
/*
 * Rough cycles-per-byte benchmark, compiled only when the rdtsc shim is
 * available.  Times one GCM encrypt of a 1KB buffer, then a bare CTR
 * encrypt of the same buffer, and prints both figures plus their
 * difference (i.e. the GHASH overhead); finally times 100 GHASH passes.
 * Each operation gets one unmeasured warm-up call before the timed one.
 * NOTE(review): `stop` is declared but unused in the visible code, and
 * `key`, `ctx`, `i` and GHASH are declared outside this excerpt.
 */
size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
union { u64 u; u8 c[1024]; } buf; /* u64 member aligns the byte buffer */
AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
/* warm-up pass, then timed GCM pass */
CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
start = OPENSSL_rdtsc();
CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
gcm_t = OPENSSL_rdtsc() - start;
/* warm-up pass, then timed CTR-only pass (no GHASH) */
CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
&key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
(block128_f)AES_encrypt);
start = OPENSSL_rdtsc();
CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
&key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
(block128_f)AES_encrypt);
ctr_t = OPENSSL_rdtsc() - start;
/* cycles/byte: GCM, CTR, and GCM-CTR (the authentication cost) */
printf("%.2f-%.2f=%.2f\n",
gcm_t/(double)sizeof(buf),
ctr_t/(double)sizeof(buf),
(gcm_t-ctr_t)/(double)sizeof(buf));
/* time 100 GHASH passes over the buffer (one warm-up call first) */
GHASH(&ctx,buf.c,sizeof(buf));
start = OPENSSL_rdtsc();
for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
gcm_t = OPENSSL_rdtsc() - start;
printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);