1 /* ====================================================================
2 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in
13 * the documentation and/or other materials provided with the
16 * 3. All advertising materials mentioning features or use of this
17 * software must display the following acknowledgment:
18 * "This product includes software developed by the OpenSSL Project
19 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22 * endorse or promote products derived from this software without
23 * prior written permission. For written permission, please contact
24 * openssl-core@openssl.org.
26 * 5. Products derived from this software may not be called "OpenSSL"
27 * nor may "OpenSSL" appear in their names without prior written
28 * permission of the OpenSSL Project.
30 * 6. Redistributions of any form whatsoever must retain the following
32 * "This product includes software developed by the OpenSSL Project
33 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46 * OF THE POSSIBILITY OF SUCH DAMAGE.
47 * ====================================================================
50 #define OPENSSL_FIPSAPI
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
/* 128-bit quantity as two 64-bit halves; hi carries the more significant bits. */
63 typedef struct { u64 hi,lo; } u128;
65 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
66 /* redefine, because alignment is ensured */
/*
 * Big-endian 32-bit load/store done as one byte-swapped word access.
 * Valid only because, as the comment above says, callers in this file
 * guarantee the pointer is suitably aligned.
 */
68 #define GETU32(p) BSWAP4(*(const u32 *)(p))
70 #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
/* Park a 16-bit reduction constant in the top 16 bits of a size_t
 * (used to build the rem_4bit/rem_8bit tables below). */
73 #define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * Divide V by x in GF(2^128): shift the 128-bit value right by one bit
 * and, if a 1 was shifted out of V.lo, fold in the GCM reduction
 * polynomial constant 0xE1...  The sizeof(size_t) test is resolved at
 * compile time and selects a 64-bit or 32-bit constant path.
 * NOTE(review): the "else" line and the do/while terminator of this
 * macro are elided in this view of the file.
 */
74 #define REDUCE1BIT(V) do { \
75 if (sizeof(size_t)==8) { \
76 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
77 V.lo = (V.hi<<63)|(V.lo>>1); \
78 V.hi = (V.hi>>1 )^T; \
81 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
82 V.lo = (V.hi<<63)|(V.lo>>1); \
83 V.hi = (V.hi>>1 )^((u64)T<<32); \
91 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
92 * never be set to 8. 8 is effectively reserved for testing purposes.
93 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
94 * "Shoup's" in GCM specification. In other words OpenSSL does not cover
95 * whole spectrum of possible table driven implementations. Why? In
96 * non-"Shoup's" case memory access pattern is segmented in such manner,
97 * that it's trivial to see that cache timing information can reveal
98 * fair portion of intermediate hash value. Given that ciphertext is
99 * always available to attacker, it's possible for him to attempt to
100 * deduce secret parameter H and if successful, tamper with messages
101 * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
102 * not as trivial, but there is no reason to believe that it's resistant
103 * to cache-timing attack. And the thing about "8-bit" implementation is
104 * that it consumes 16 (sixteen) times more memory, 4KB per individual
105 * key + 1KB shared. Well, on pros side it should be twice as fast as
106 * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
107 * was observed to run ~75% faster, closer to 100% for commercial
108 * compilers... Yet "4-bit" procedure is preferred, because it's
109 * believed to provide better security-performance balance and adequate
110 * all-round performance. "All-round" refers to things like:
112 * - shorter setup time effectively improves overall timing for
113 * handling short messages;
114 * - larger table allocation can become unbearable because of VM
115 * subsystem penalties (for example on Windows a sufficiently large
116 * free results in VM working set trimming, meaning that a subsequent
117 * malloc would immediately incur working set expansion);
118 * - larger table has larger cache footprint, which can affect
119 * performance of other code paths (not necessarily even from same
120 * thread in Hyper-Threading world);
/*
 * Build the 256-entry (4KB per key) lookup table for the "8-bit"
 * (TABLE_BITS==8) GHASH variant from the hash subkey H[2].
 * Htable[128] is seeded with H and the power-of-two slots are filled by
 * successive halving; remaining slots are XOR combinations of those.
 * NOTE(review): several interior lines (local declarations, the halving
 * loop body, closing braces) are elided in this view.
 */
126 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
136 for (Htable[128]=V, i=64; i>0; i>>=1) {
/* Htable[i+j] = Htable[i] ^ Htable[j] for every j < i. */
141 for (i=2; i<256; i<<=1) {
142 u128 *Hi = Htable+i, H0 = *Hi;
143 for (j=1; j<i; ++j) {
144 Hi[j].hi = H0.hi^Htable[j].hi;
145 Hi[j].lo = H0.lo^Htable[j].lo;
/*
 * GHASH multiplication, "8-bit" table-driven variant: Xi = Xi * H,
 * processing one input byte per iteration against the 256-entry Htable.
 * NOTE(review): the loop structure and several declarations are elided
 * in this view; only loop-body fragments are visible.
 */
150 static void gcm_gmult_8bit(u64 Xi[2], u128 Htable[256])
153 const u8 *xi = (const u8 *)Xi+15;
/* Runtime little-endian detection via type punning on a union. */
155 const union { long one; char little; } is_endian = {1};
/* Reduction constants applied when 8 bits are shifted out of Z;
 * pre-shifted into the top 16 bits of a size_t via PACK(). */
156 static const size_t rem_8bit[256] = {
157 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
158 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
159 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
160 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
161 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
162 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
163 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
164 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
165 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
166 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
167 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
168 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
169 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
170 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
171 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
172 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
173 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
174 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
175 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
176 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
177 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
178 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
179 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
180 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
181 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
182 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
183 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
184 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
185 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
186 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
187 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
188 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
189 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
190 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
191 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
192 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
193 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
194 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
195 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
196 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
197 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
198 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
199 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
200 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
201 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
202 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
203 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
204 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
205 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
206 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
207 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
208 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
209 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
210 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
211 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
212 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
213 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
214 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
215 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
216 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
217 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
218 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
219 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
220 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
/* Accumulate the table entry for the current byte, then shift Z right
 * by 8 bits and fold the shifted-out byte back in via rem_8bit.
 * NOTE(review): the enclosing loop header is elided in this view. */
223 Z.hi ^= Htable[n].hi;
224 Z.lo ^= Htable[n].lo;
226 if ((u8 *)Xi==xi) break;
230 rem = (size_t)Z.lo&0xff;
231 Z.lo = (Z.hi<<56)|(Z.lo>>8);
233 if (sizeof(size_t)==8)
234 Z.hi ^= rem_8bit[rem];
236 Z.hi ^= (u64)rem_8bit[rem]<<32;
/* Store Z back into Xi in big-endian byte order. */
239 if (is_endian.little) {
241 Xi[0] = BSWAP8(Z.hi);
242 Xi[1] = BSWAP8(Z.lo);
246 v = (u32)(Z.hi>>32); PUTU32(p,v);
247 v = (u32)(Z.hi); PUTU32(p+4,v);
248 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
249 v = (u32)(Z.lo); PUTU32(p+12,v);
/* TABLE_BITS==8: GCM_MUL dispatches to the 8-bit routine. */
257 #define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
/*
 * Build the 16-entry (256-byte) lookup table for the "4-bit" ("Shoup's")
 * GHASH variant from hash subkey H[2].  The small-footprint path fills
 * the table with loops; the default path unrolls the XOR combinations.
 * NOTE(review): many interior lines (halving loop body, #else/#endif
 * lines, closing braces) are elided in this view.
 */
261 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
264 #if defined(OPENSSL_SMALL_FOOTPRINT)
273 #if defined(OPENSSL_SMALL_FOOTPRINT)
/* Seed Htable[8]=H and derive Htable[4],[2],[1] by halving (i counts down). */
274 for (Htable[8]=V, i=4; i>0; i>>=1) {
/* Htable[i+j] = Htable[i] ^ Htable[j] for every j < i. */
279 for (i=2; i<16; i<<=1) {
282 for (V=*Hi, j=1; j<i; ++j) {
283 Hi[j].hi = V.hi^Htable[j].hi;
284 Hi[j].lo = V.lo^Htable[j].lo;
/* Unrolled equivalent of the loops above (default, non-small build). */
295 Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
297 Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo;
298 Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo;
299 Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo;
301 Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo;
302 Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
303 Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
304 Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
305 Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
306 Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
307 Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
309 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
311 * ARM assembler expects specific dword order in Htable.
315 const union { long one; char little; } is_endian = {1};
317 if (is_endian.little)
/* Swap the 32-bit halves of each dword for the ARM assembly routines. */
326 Htable[j].hi = V.lo<<32|V.lo>>32;
327 Htable[j].lo = V.hi<<32|V.hi>>32;
/* Reduction constants folded into Z.hi when 4 bits are shifted out of Z
 * (see gcm_gmult_4bit/gcm_ghash_4bit); pre-shifted via PACK(). */
334 static const size_t rem_4bit[16] = {
335 PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
336 PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
337 PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
338 PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
/*
 * GHASH multiplication, "4-bit" table-driven variant: Xi = Xi * H,
 * consuming one nibble of Xi per step (low nibble nlo, high nibble nhi)
 * against the 16-entry Htable.  NOTE(review): the loop headers and
 * several declarations are elided in this view.
 */
340 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
344 size_t rem, nlo, nhi;
345 const union { long one; char little; } is_endian = {1};
/* Start from the last (least significant in GHASH bit order) byte. */
347 nlo = ((const u8 *)Xi)[15];
351 Z.hi = Htable[nlo].hi;
352 Z.lo = Htable[nlo].lo;
/* Shift Z right by 4 bits and fold the shifted-out nibble via rem_4bit. */
355 rem = (size_t)Z.lo&0xf;
356 Z.lo = (Z.hi<<60)|(Z.lo>>4);
358 if (sizeof(size_t)==8)
359 Z.hi ^= rem_4bit[rem];
361 Z.hi ^= (u64)rem_4bit[rem]<<32;
363 Z.hi ^= Htable[nhi].hi;
364 Z.lo ^= Htable[nhi].lo;
368 nlo = ((const u8 *)Xi)[cnt];
372 rem = (size_t)Z.lo&0xf;
373 Z.lo = (Z.hi<<60)|(Z.lo>>4);
375 if (sizeof(size_t)==8)
376 Z.hi ^= rem_4bit[rem];
378 Z.hi ^= (u64)rem_4bit[rem]<<32;
380 Z.hi ^= Htable[nlo].hi;
381 Z.lo ^= Htable[nlo].lo;
/* Store Z back into Xi in big-endian byte order. */
384 if (is_endian.little) {
386 Xi[0] = BSWAP8(Z.hi);
387 Xi[1] = BSWAP8(Z.lo);
391 v = (u32)(Z.hi>>32); PUTU32(p,v);
392 v = (u32)(Z.hi); PUTU32(p+4,v);
393 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
394 v = (u32)(Z.lo); PUTU32(p+12,v);
403 #if !defined(OPENSSL_SMALL_FOOTPRINT)
405 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
406 * details... Compiler-generated code doesn't seem to give any
407 * performance improvement, at least not on x86[_64]. It's here
408 * mostly as reference and a placeholder for possible future
409 * non-trivial optimization[s]...
/*
 * Streamed 4-bit GHASH over len bytes of inp, folding each 16-byte block
 * into Xi.  Uses auxiliary tables Hshr4/Hshl4 built per call, plus a
 * shared 512-byte rem_8bit table.  NOTE(review): loop headers, several
 * declarations, and the closing braces are elided in this view.
 */
411 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
412 const u8 *inp,size_t len)
416 size_t rem, nlo, nhi;
417 const union { long one; char little; } is_endian = {1};
422 nlo = ((const u8 *)Xi)[15];
427 Z.hi = Htable[nlo].hi;
428 Z.lo = Htable[nlo].lo;
431 rem = (size_t)Z.lo&0xf;
432 Z.lo = (Z.hi<<60)|(Z.lo>>4);
434 if (sizeof(size_t)==8)
435 Z.hi ^= rem_4bit[rem];
437 Z.hi ^= (u64)rem_4bit[rem]<<32;
439 Z.hi ^= Htable[nhi].hi;
440 Z.lo ^= Htable[nhi].lo;
444 nlo = ((const u8 *)Xi)[cnt];
449 rem = (size_t)Z.lo&0xf;
450 Z.lo = (Z.hi<<60)|(Z.lo>>4);
452 if (sizeof(size_t)==8)
453 Z.hi ^= rem_4bit[rem];
455 Z.hi ^= (u64)rem_4bit[rem]<<32;
457 Z.hi ^= Htable[nlo].hi;
458 Z.lo ^= Htable[nlo].lo;
462 * Extra 256+16 bytes per-key plus 512 bytes shared tables
463 * [should] give ~50% improvement... One could have PACK()-ed
464 * the rem_8bit even here, but the priority is to minimize
467 u128 Hshr4[16]; /* Htable shifted right by 4 bits */
468 u8 Hshl4[16]; /* Htable shifted left by 4 bits */
/* Unpacked (unsigned short) reduction table, indexed by a whole byte. */
469 static const unsigned short rem_8bit[256] = {
470 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
471 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
472 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
473 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
474 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
475 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
476 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
477 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
478 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
479 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
480 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
481 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
482 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
483 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
484 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
485 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
486 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
487 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
488 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
489 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
490 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
491 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
492 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
493 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
494 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
495 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
496 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
497 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
498 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
499 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
500 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
501 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
503 * This pre-processing phase slows down procedure by approximately
504 * same time as it makes each loop spin faster. In other words
505 * single block performance is approximately same as straightforward
506 * "4-bit" implementation, and then it goes only faster...
/* Derive per-call shifted copies of Htable used by the inner loop. */
508 for (cnt=0; cnt<16; ++cnt) {
509 Z.hi = Htable[cnt].hi;
510 Z.lo = Htable[cnt].lo;
511 Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
512 Hshr4[cnt].hi = (Z.hi>>4);
513 Hshl4[cnt] = (u8)(Z.lo<<4);
/* Walk bytes 15..1; byte 0 gets special handling below. */
517 for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
518 nlo = ((const u8 *)Xi)[cnt];
523 Z.hi ^= Htable[nlo].hi;
524 Z.lo ^= Htable[nlo].lo;
526 rem = (size_t)Z.lo&0xff;
528 Z.lo = (Z.hi<<56)|(Z.lo>>8);
531 Z.hi ^= Hshr4[nhi].hi;
532 Z.lo ^= Hshr4[nhi].lo;
533 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
/* Final byte: only a 4-bit shift remains. */
536 nlo = ((const u8 *)Xi)[0];
541 Z.hi ^= Htable[nlo].hi;
542 Z.lo ^= Htable[nlo].lo;
544 rem = (size_t)Z.lo&0xf;
546 Z.lo = (Z.hi<<60)|(Z.lo>>4);
549 Z.hi ^= Htable[nhi].hi;
550 Z.lo ^= Htable[nhi].lo;
551 Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
/* Store Z back into Xi in big-endian byte order. */
554 if (is_endian.little) {
556 Xi[0] = BSWAP8(Z.hi);
557 Xi[1] = BSWAP8(Z.lo);
561 v = (u32)(Z.hi>>32); PUTU32(p,v);
562 v = (u32)(Z.hi); PUTU32(p+4,v);
563 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
564 v = (u32)(Z.lo); PUTU32(p+12,v);
571 } while (inp+=16, len-=16);
/* Small-footprint builds get these from assembly instead. */
575 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
576 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
579 #define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
580 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
581 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
582 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
583 * trashing effect. In other words idea is to hash data while it's
584 * still in L1 cache after encryption pass... */
585 #define GHASH_CHUNK (3*1024)
588 #else /* TABLE_BITS */
/*
 * Table-free, bit-at-a-time GHASH multiplication (TABLE_BITS==1):
 * Xi = Xi * H, processing one bit of Xi per REDUCE1BIT-style step.
 * H is kept in host byte order (no swap on load); Xi is big-endian.
 * NOTE(review): several interior lines (Z/V handling in the inner loop,
 * braces) are elided in this view.
 */
590 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
595 const long *xi = (const long *)Xi;
596 const union { long one; char little; } is_endian = {1};
598 V.hi = H[0]; /* H is in host byte order, no byte swapping */
/* Load Xi one long at a time, converting to host order on LE machines. */
601 for (j=0; j<16/sizeof(long); ++j) {
602 if (is_endian.little) {
603 if (sizeof(long)==8) {
605 X = (long)(BSWAP8(xi[j]));
607 const u8 *p = (const u8 *)(xi+j);
608 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
612 const u8 *p = (const u8 *)(xi+j);
/* M is an all-ones/all-zeros mask from the current top bit of X. */
619 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
620 u64 M = (u64)(X>>(8*sizeof(long)-1));
/* Store Z back into Xi in big-endian byte order. */
628 if (is_endian.little) {
630 Xi[0] = BSWAP8(Z.hi);
631 Xi[1] = BSWAP8(Z.lo);
635 v = (u32)(Z.hi>>32); PUTU32(p,v);
636 v = (u32)(Z.hi); PUTU32(p+4,v);
637 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
638 v = (u32)(Z.lo); PUTU32(p+12,v);
646 #define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
/*
 * GCM-128 state.  The unions allow each 16-byte quantity to be viewed
 * as two u64, four u32, or sixteen bytes.  NOTE(review): several member
 * declarations (key, block, Xi, H, len, Htable sizing) are elided in
 * this view.
 */
650 struct gcm128_context {
651 /* Following 6 names follow names in GCM specification */
652 union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,
654 /* Pre-computed table used by gcm_gmult_* */
/* Function pointers let init() select scalar vs. assembly GHASH. */
659 void (*gmult)(u64 Xi[2],const u128 Htable[16]);
660 void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
/* mres/ares: bytes of an incomplete message/AAD block carried over. */
662 unsigned int mres, ares;
/*
 * x86/x86_64 assembly glue: declare the CLMUL (PCLMULQDQ) and 4-bit
 * MMX/x86 assembly implementations; actual selection happens at run
 * time in CRYPTO_gcm128_init via OPENSSL_ia32cap_P.
 */
667 #if TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
668 (defined(__i386) || defined(__i386__) || \
669 defined(__x86_64) || defined(__x86_64__) || \
670 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
671 # define GHASH_ASM_IAX
672 extern unsigned int OPENSSL_ia32cap_P[2];
674 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
675 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
676 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
678 # if defined(__i386) || defined(__i386__) || defined(_M_IX86)
679 # define GHASH_ASM_X86
680 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
681 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
683 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
684 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
/* With asm glue, GCM_MUL/GHASH dispatch through the ctx function pointers. */
688 # define GCM_MUL(ctx,Xi) (*((ctx)->gmult))(ctx->Xi.u,ctx->Htable)
690 # define GHASH(ctx,in,len) (*((ctx)->ghash))((ctx)->Xi.u,(ctx)->Htable,in,len)
/*
 * Initialize a GCM context: zero the state, derive the hash subkey
 * H = E_K(0^128) using the supplied block cipher, convert H to host
 * byte order, precompute the GHASH table, and (when asm glue is built)
 * pick the best gmult/ghash implementation for this CPU.
 * NOTE(review): some lines (key/block assignment, braces, #endif) are
 * elided in this view.
 */
693 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
695 const union { long one; char little; } is_endian = {1};
697 memset(ctx,0,sizeof(*ctx));
/* H = E_K(0^128): ctx->H.c is all-zero after the memset above. */
701 (*block)(ctx->H.c,ctx->H.c,key);
703 if (is_endian.little) {
704 /* H is stored in host byte order */
706 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
707 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
711 hi = (u64)GETU32(p) <<32|GETU32(p+4);
712 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
719 gcm_init_8bit(ctx->Htable,ctx->H.u);
721 # if defined(GHASH_ASM_IAX) /* both x86 and x86_64 */
/* ECX bit 1 of cpuid => PCLMULQDQ available: use the CLMUL routines. */
722 if (OPENSSL_ia32cap_P[1]&(1<<1)) {
723 gcm_init_clmul(ctx->Htable,ctx->H.u);
724 ctx->gmult = gcm_gmult_clmul;
725 ctx->ghash = gcm_ghash_clmul;
728 gcm_init_4bit(ctx->Htable,ctx->H.u);
729 # if defined(GHASH_ASM_X86) /* x86 only */
/* EDX bit 23 of cpuid => MMX available. */
730 if (OPENSSL_ia32cap_P[0]&(1<<23)) {
731 ctx->gmult = gcm_gmult_4bit_mmx;
732 ctx->ghash = gcm_ghash_4bit_mmx;
734 ctx->gmult = gcm_gmult_4bit_x86;
735 ctx->ghash = gcm_ghash_4bit_x86;
738 ctx->gmult = gcm_gmult_4bit;
739 ctx->ghash = gcm_ghash_4bit;
742 gcm_init_4bit(ctx->Htable,ctx->H.u);
/*
 * Set the IV and derive the initial counter block Yi and E_K(Y0).
 * A 96-bit IV is used directly (Yi = IV || 0^31 || 1, per the GCM spec);
 * any other length is GHASHed together with the IV bit length len0.
 * NOTE(review): several lines (len0 setup, the GCM_MUL calls, counter
 * increment, braces) are elided in this view.
 */
747 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
749 const union { long one; char little; } is_endian = {1};
756 ctx->len.u[0] = 0; /* AAD length */
757 ctx->len.u[1] = 0; /* message length */
/* Fast path: 96-bit IV is copied verbatim into the counter block. */
762 memcpy(ctx->Yi.c,iv,12);
/* General path: fold full and partial IV blocks into Yi via GHASH. */
771 for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
777 for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
/* XOR in the 64-bit IV bit-length (len0) as the final GHASH block. */
781 if (is_endian.little) {
783 ctx->Yi.u[1] ^= BSWAP8(len0);
785 ctx->Yi.c[8] ^= (u8)(len0>>56);
786 ctx->Yi.c[9] ^= (u8)(len0>>48);
787 ctx->Yi.c[10] ^= (u8)(len0>>40);
788 ctx->Yi.c[11] ^= (u8)(len0>>32);
789 ctx->Yi.c[12] ^= (u8)(len0>>24);
790 ctx->Yi.c[13] ^= (u8)(len0>>16);
791 ctx->Yi.c[14] ^= (u8)(len0>>8);
792 ctx->Yi.c[15] ^= (u8)(len0);
796 ctx->Yi.u[1] ^= len0;
/* Extract the 32-bit counter from the last 4 bytes of Yi. */
800 if (is_endian.little)
801 ctr = GETU32(ctx->Yi.c+12);
/* EK0 = E_K(Y0); consumed later when the tag is finalized. */
806 (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
808 if (is_endian.little)
809 PUTU32(ctx->Yi.c+12,ctr);
/*
 * Feed additional authenticated data (AAD) into the GHASH accumulator.
 * Must be called before any encrypt/decrypt (returns -2 if message data
 * was already processed); returns -1 if the accumulated AAD length
 * exceeds the GCM limit of 2^61 bytes (2^64 bits).
 * NOTE(review): several lines (the GCM_MUL/GHASH calls, partial-block
 * bookkeeping of ctx->ares, braces) are elided in this view.
 */
814 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
818 u64 alen = ctx->len.u[0];
820 if (ctx->len.u[1]) return -2;
823 if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
825 ctx->len.u[0] = alen;
/* Finish a partial block carried over from a previous call. */
830 ctx->Xi.c[n] ^= *(aad++);
834 if (n==0) GCM_MUL(ctx,Xi);
/* Bulk path: process all whole 16-byte blocks. */
842 if ((i = (len&(size_t)-16))) {
849 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
/* Stash trailing partial block; completed on the next call or at finish. */
856 n = (unsigned int)len;
857 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
/*
 * CTR-encrypt len bytes from in to out and fold the ciphertext into the
 * GHASH accumulator.  Returns -1 if the total message length exceeds the
 * GCM limit of 2^36-32 bytes.  The fast path XORs whole machine words
 * against the keystream; GHASH_CHUNK-sized strides keep hashed data in
 * L1 cache.  NOTE(review): many lines (loop headers, ctr increments,
 * #else branches, braces) are elided in this view.
 */
864 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
865 const unsigned char *in, unsigned char *out,
868 const union { long one; char little; } is_endian = {1};
871 u64 mlen = ctx->len.u[1];
874 n = (unsigned int)mlen%16; /* alternative to ctx->mres */
877 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
879 ctx->len.u[1] = mlen;
882 /* First call to encrypt finalizes GHASH(AAD) */
887 if (is_endian.little)
888 ctr = GETU32(ctx->Yi.c+12);
893 #if !defined(OPENSSL_SMALL_FOOTPRINT)
894 if (16%sizeof(size_t) == 0) do { /* always true actually */
/* Finish a partial keystream block left from a previous call. */
897 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
901 if (n==0) GCM_MUL(ctx,Xi);
/* Word-at-a-time path requires aligned in/out on strict platforms. */
907 #if defined(STRICT_ALIGNMENT)
908 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
911 #if defined(GHASH) && defined(GHASH_CHUNK)
/* Encrypt a chunk, then hash it while it is still in L1 cache. */
912 while (len>=GHASH_CHUNK) {
913 size_t j=GHASH_CHUNK;
916 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
918 if (is_endian.little)
919 PUTU32(ctx->Yi.c+12,ctr);
922 for (i=0; i<16; i+=sizeof(size_t))
924 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
929 GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
/* Remaining whole blocks. */
932 if ((i = (len&(size_t)-16))) {
936 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
938 if (is_endian.little)
939 PUTU32(ctx->Yi.c+12,ctr);
942 for (i=0; i<16; i+=sizeof(size_t))
944 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
/* No-GHASH-macro variant: fold each block into Xi inline. */
953 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
955 if (is_endian.little)
956 PUTU32(ctx->Yi.c+12,ctr);
959 for (i=0; i<16; i+=sizeof(size_t))
960 *(size_t *)(ctx->Xi.c+i) ^=
962 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
/* Trailing partial block: byte-at-a-time. */
970 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
972 if (is_endian.little)
973 PUTU32(ctx->Yi.c+12,ctr);
977 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
/* Small-footprint fallback: strictly byte-by-byte. */
986 for (i=0;i<len;++i) {
988 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
990 if (is_endian.little)
991 PUTU32(ctx->Yi.c+12,ctr);
995 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
/*
 * Mirror of CRYPTO_gcm128_encrypt for decryption: the *ciphertext* is
 * folded into GHASH (before/as it is decrypted), then XORed with the
 * keystream to produce plaintext.  Returns -1 on exceeding the 2^36-32
 * byte message limit.  NOTE(review): many lines (loop headers, ctr
 * increments, #else branches, braces) are elided in this view.
 */
1005 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1006 const unsigned char *in, unsigned char *out,
1009 const union { long one; char little; } is_endian = {1};
1010 unsigned int n, ctr;
1012 u64 mlen = ctx->len.u[1];
1015 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1017 ctx->len.u[1] = mlen;
1020 /* First call to decrypt finalizes GHASH(AAD) */
1025 if (is_endian.little)
1026 ctr = GETU32(ctx->Yi.c+12);
1031 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1032 if (16%sizeof(size_t) == 0) do { /* always true actually */
/* Finish a partial keystream block; c is the ciphertext byte. */
1036 *(out++) = c^ctx->EKi.c[n];
1041 if (n==0) GCM_MUL (ctx,Xi);
1047 #if defined(STRICT_ALIGNMENT)
1048 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1051 #if defined(GHASH) && defined(GHASH_CHUNK)
/* Hash the ciphertext chunk first, then decrypt it. */
1052 while (len>=GHASH_CHUNK) {
1053 size_t j=GHASH_CHUNK;
1055 GHASH(ctx,in,GHASH_CHUNK);
1057 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1059 if (is_endian.little)
1060 PUTU32(ctx->Yi.c+12,ctr);
1063 for (i=0; i<16; i+=sizeof(size_t))
1064 *(size_t *)(out+i) =
1065 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
/* Remaining whole blocks. */
1072 if ((i = (len&(size_t)-16))) {
1075 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1077 if (is_endian.little)
1078 PUTU32(ctx->Yi.c+12,ctr);
1081 for (i=0; i<16; i+=sizeof(size_t))
1082 *(size_t *)(out+i) =
1083 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
/* No-GHASH-macro variant: fold ciphertext words into Xi inline. */
1091 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1093 if (is_endian.little)
1094 PUTU32(ctx->Yi.c+12,ctr);
1097 for (i=0; i<16; i+=sizeof(size_t)) {
1098 size_t c = *(size_t *)(in+i);
1099 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1100 *(size_t *)(ctx->Xi.c+i) ^= c;
/* Trailing partial block: byte-at-a-time. */
1109 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1111 if (is_endian.little)
1112 PUTU32(ctx->Yi.c+12,ctr);
1118 out[n] = c^ctx->EKi.c[n];
/* Small-footprint fallback: strictly byte-by-byte. */
1127 for (i=0;i<len;++i) {
1130 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1132 if (is_endian.little)
1133 PUTU32(ctx->Yi.c+12,ctr);
1138 out[i] = c^ctx->EKi.c[n];
/*
 * Like CRYPTO_gcm128_encrypt, but the bulk CTR keystream is produced by
 * the caller-supplied ctr128_f 'stream' routine (e.g. hardware/asm CTR),
 * processing GHASH_CHUNK/16 blocks per call.  Returns -1 on exceeding
 * the 2^36-32 byte message limit.  NOTE(review): many lines are elided
 * in this view.
 */
1149 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1150 const unsigned char *in, unsigned char *out,
1151 size_t len, ctr128_f stream)
1153 const union { long one; char little; } is_endian = {1};
1154 unsigned int n, ctr;
1156 u64 mlen = ctx->len.u[1];
1159 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1161 ctx->len.u[1] = mlen;
1164 /* First call to encrypt finalizes GHASH(AAD) */
1169 if (is_endian.little)
1170 ctr = GETU32(ctx->Yi.c+12);
/* Finish a partial keystream block from a previous call. */
1177 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1181 if (n==0) GCM_MUL(ctx,Xi);
1187 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
/* Stream-encrypt a chunk, then hash the produced ciphertext. */
1188 while (len>=GHASH_CHUNK) {
1189 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1190 ctr += GHASH_CHUNK/16;
1191 if (is_endian.little)
1192 PUTU32(ctx->Yi.c+12,ctr);
1195 GHASH(ctx,out,GHASH_CHUNK);
/* Remaining whole blocks (j = i/16 block count, presumably set in
 * elided lines -- TODO confirm against full source). */
1201 if ((i = (len&(size_t)-16))) {
1204 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1205 ctr += (unsigned int)j;
1206 if (is_endian.little)
1207 PUTU32(ctx->Yi.c+12,ctr);
1217 for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
/* Trailing partial block via the scalar block cipher. */
1224 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1226 if (is_endian.little)
1227 PUTU32(ctx->Yi.c+12,ctr);
1231 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
/*
 * Like CRYPTO_gcm128_decrypt, but the bulk CTR keystream is produced by
 * the caller-supplied ctr128_f 'stream' routine; ciphertext is hashed
 * before being decrypted.  Returns -1 on exceeding the 2^36-32 byte
 * message limit.  NOTE(review): many lines are elided in this view.
 */
1240 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1241 const unsigned char *in, unsigned char *out,
1242 size_t len,ctr128_f stream)
1244 const union { long one; char little; } is_endian = {1};
1245 unsigned int n, ctr;
1247 u64 mlen = ctx->len.u[1];
1250 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1252 ctx->len.u[1] = mlen;
1255 /* First call to decrypt finalizes GHASH(AAD) */
1260 if (is_endian.little)
1261 ctr = GETU32(ctx->Yi.c+12);
/* Finish a partial keystream block; c is the ciphertext byte. */
1269 *(out++) = c^ctx->EKi.c[n];
1274 if (n==0) GCM_MUL (ctx,Xi);
1280 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
/* Hash the ciphertext chunk, then stream-decrypt it. */
1281 while (len>=GHASH_CHUNK) {
1282 GHASH(ctx,in,GHASH_CHUNK);
1283 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1284 ctr += GHASH_CHUNK/16;
1285 if (is_endian.little)
1286 PUTU32(ctx->Yi.c+12,ctr);
/* Remaining whole blocks: fold ciphertext into Xi, then decrypt. */
1294 if ((i = (len&(size_t)-16))) {
1302 for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1309 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1310 ctr += (unsigned int)j;
1311 if (is_endian.little)
1312 PUTU32(ctx->Yi.c+12,ctr);
/* Trailing partial block via the scalar block cipher. */
1320 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1322 if (is_endian.little)
1323 PUTU32(ctx->Yi.c+12,ctr);
1329 out[n] = c^ctx->EKi.c[n];
/*
 * Finalize the authentication tag: fold the 64-bit bit-lengths of AAD
 * (alen) and ciphertext (clen) into GHASH, then XOR with EK0 = E_K(Y0).
 * If 'tag' is supplied (len <= 16), compare it against the computed tag
 * and return the memcmp result (0 on match).
 * NOTE(review): memcmp is not a constant-time comparison, so the tag
 * check may leak timing information -- consider CRYPTO_memcmp.
 * NOTE(review): some lines (final GCM_MUL calls, partial-block flush,
 * braces) are elided in this view.
 */
1338 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1341 const union { long one; char little; } is_endian = {1};
1342 u64 alen = ctx->len.u[0]<<3;
1343 u64 clen = ctx->len.u[1]<<3;
/* Lengths must enter GHASH in big-endian order. */
1348 if (is_endian.little) {
1350 alen = BSWAP8(alen);
1351 clen = BSWAP8(clen);
1355 ctx->len.u[0] = alen;
1356 ctx->len.u[1] = clen;
1358 alen = (u64)GETU32(p) <<32|GETU32(p+4);
1359 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1363 ctx->Xi.u[0] ^= alen;
1364 ctx->Xi.u[1] ^= clen;
/* Tag = GHASH result XOR E_K(Y0). */
1367 ctx->Xi.u[0] ^= ctx->EK0.u[0];
1368 ctx->Xi.u[1] ^= ctx->EK0.u[1];
1370 if (tag && len<=sizeof(ctx->Xi))
1371 return memcmp(ctx->Xi.c,tag,len);
/*
 * Finalize and copy out up to 16 bytes of the authentication tag.
 * NOTE(review): the function braces are elided in this view.
 */
1376 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1378 CRYPTO_gcm128_finish(ctx, NULL, 0);
1379 memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
/*
 * Allocate and initialize a GCM context; caller owns the result and
 * must release it with CRYPTO_gcm128_release.  On allocation failure
 * init is skipped (ret is NULL).  NOTE(review): the return statement
 * and braces are elided in this view.
 */
1382 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1384 GCM128_CONTEXT *ret;
1386 if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1387 CRYPTO_gcm128_init(ret,key,block);
/*
 * Scrub key material from the context before freeing it; OPENSSL_cleanse
 * is used (not plain memset) so the wipe is not optimized away.
 * NOTE(review): the free call and braces are elided in this view.
 */
1392 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1395 OPENSSL_cleanse(ctx,sizeof(*ctx));
1400 #if defined(SELFTEST)
1402 #include <openssl/aes.h>
1405 static const u8 K1[16],
1410 T1[]= {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1416 static const u8 P2[16],
1417 C2[]= {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1418 T2[]= {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1422 static const u8 K3[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1423 P3[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1424 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1425 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1426 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1427 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1428 C3[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1429 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1430 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1431 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1432 T3[]= {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
/*
 * GCM self-test vector set #4: 60-byte plaintext, 20-byte AAD (A4),
 * expected ciphertext C4 and 16-byte tag T4.
 * NOTE(review): key/IV for this case are not declared in this excerpt
 * (presumably reused from set #3 via #defines) — confirm against full file.
 */
1437 static const u8 P4[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1438 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1439 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1440 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1441 A4[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1442 0xab,0xad,0xda,0xd2},
1443 C4[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1444 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1445 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1446 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1447 T4[]= {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
/*
 * GCM self-test vector set #5: 20-byte AAD, short 8-byte (64-bit) IV,
 * expected 60-byte ciphertext C5 and 16-byte tag T5.
 */
1452 static const u8 A5[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1453 0xab,0xad,0xda,0xd2},
1454 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1455 C5[]= {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1456 0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1457 0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1458 0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1459 T5[]= {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
/*
 * GCM self-test vector set #6: long 60-byte IV (exercises the non-96-bit
 * IV path, where the IV itself is GHASHed), expected ciphertext C6 and
 * 16-byte tag T6.
 */
1465 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1466 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1467 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1468 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1469 C6[]= {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1470 0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1471 0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1472 0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1473 T6[]= {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
/*
 * GCM self-test vector set #7: 24-byte all-zero key (AES-192, implicitly
 * zero-initialized) and expected 16-byte tag T7.
 */
1476 static const u8 K7[24],
1481 T7[]= {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
/*
 * GCM self-test vector set #8: 16-byte all-zero plaintext (implicitly
 * zero-initialized), expected ciphertext C8 and 16-byte tag T8.
 */
1487 static const u8 P8[16],
1488 C8[]= {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1489 T8[]= {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
/*
 * GCM self-test vector set #9: 24-byte key (AES-192), 64-byte plaintext,
 * 96-bit IV, expected ciphertext C9 and 16-byte tag T9.
 */
1493 static const u8 K9[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1494 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1495 P9[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1496 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1497 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1498 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1499 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1500 C9[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1501 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1502 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1503 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1504 T9[]= {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
/*
 * GCM self-test vector set #10: 60-byte plaintext, 20-byte AAD, expected
 * ciphertext C10 and 16-byte tag T10 (AES-192 series).
 */
1509 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1510 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1511 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1512 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1513 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1514 0xab,0xad,0xda,0xd2},
1515 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1516 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1517 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1518 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1519 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
/*
 * GCM self-test vector set #11: short 8-byte IV, expected 60-byte
 * ciphertext C11 and 16-byte tag T11 (AES-192 series).
 */
1525 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1526 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1527 0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1528 0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1529 0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1530 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
/*
 * GCM self-test vector set #12: long 60-byte IV (non-96-bit IV path),
 * expected ciphertext C12 and 16-byte tag T12 (AES-192 series).
 */
1536 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1537 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1538 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1539 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1540 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1541 0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1542 0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1543 0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1544 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
/*
 * GCM self-test vector set #13: 32-byte all-zero key (AES-256, implicitly
 * zero-initialized) and expected 16-byte tag T13.
 */
1547 static const u8 K13[32],
1552 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
/*
 * GCM self-test vector set #14: 16-byte all-zero plaintext (implicitly
 * zero-initialized), expected ciphertext C14 and 16-byte tag T14.
 */
1557 static const u8 P14[16],
1559 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1560 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
/*
 * GCM self-test vector set #15: 32-byte key (AES-256), 64-byte plaintext,
 * 96-bit IV, expected ciphertext C15 and 16-byte tag T15.
 */
1564 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1565 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1566 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1567 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1568 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1569 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1570 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1571 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1572 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1573 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1574 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1575 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
/*
 * GCM self-test vector set #16: 60-byte plaintext, 20-byte AAD, expected
 * ciphertext C16 and 16-byte tag T16 (AES-256 series).
 */
1580 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1581 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1582 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1583 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1584 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1585 0xab,0xad,0xda,0xd2},
1586 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1587 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1588 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1589 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1590 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
/*
 * GCM self-test vector set #17: short 8-byte IV, expected 60-byte
 * ciphertext C17 and 16-byte tag T17 (AES-256 series).
 */
1596 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1597 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1598 0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1599 0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1600 0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1601 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
/*
 * GCM self-test vector set #18: long 60-byte IV (non-96-bit IV path),
 * expected ciphertext C18 and 16-byte tag T18 (AES-256 series).
 */
1607 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1608 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1609 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1610 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1611 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1612 0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1613 0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1614 0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1615 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
/*
 * TEST_CASE(n) — runs one encrypt and one decrypt pass over test vector
 * set n (K##n key, IV##n IV, A##n AAD, P##n plaintext, C##n expected
 * ciphertext, T##n expected 16-byte tag).  Each pass re-keys, re-IVs,
 * feeds the AAD, processes the data, then verifies the tag with
 * CRYPTO_gcm128_finish() and compares the produced output against the
 * expected bytes, bumping `ret` and printing a diagnostic on mismatch.
 * NOTE(review): the last line below ends in a '\' continuation — the
 * macro body (presumably the closing `} while(0)`) continues past this
 * excerpt; confirm against the full file before editing.
 */
1617 #define TEST_CASE(n) do { \
1618 u8 out[sizeof(P##n)]; \
1619 AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \
1620 CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \
1621 CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
1622 memset(out,0,sizeof(out)); \
1623 if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
1624 if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \
1625 if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
1626 (C##n && memcmp(out,C##n,sizeof(out)))) \
1627 ret++, printf ("encrypt test#%d failed.\n",n); \
1628 CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
1629 memset(out,0,sizeof(out)); \
1630 if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
1631 if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \
1632 if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
1633 (P##n && memcmp(out,P##n,sizeof(out)))) \
1634 ret++, printf ("decrypt test#%d failed.\n",n); \
/*
 * Optional micro-benchmark (statements belonging to a function whose
 * definition begins and ends outside this excerpt; `key`, `ctx` and `i`
 * are declared elsewhere).  Measures AES-GCM encryption versus the raw
 * underlying CTR mode using the OPENSSL_rdtsc cycle counter, printing
 * cycles-per-byte for each and their difference (the GHASH overhead),
 * then times 100 GHASH passes over the 1KB buffer.
 */
1662 #ifdef OPENSSL_CPUID_OBJ
1664 size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1665 union { u64 u; u8 c[1024]; } buf;
1668 AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1669 CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1670 CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
/* untimed first pass — presumably to warm caches before measurement */
1672 CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1673 start = OPENSSL_rdtsc();
1674 CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1675 gcm_t = OPENSSL_rdtsc() - start;
/* same measurement for bare CTR mode, reusing the GCM context's counter state */
1677 CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1678 &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1679 (block128_f)AES_encrypt);
1680 start = OPENSSL_rdtsc();
1681 CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1682 &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1683 (block128_f)AES_encrypt);
1684 ctr_t = OPENSSL_rdtsc() - start;
/* report cycles/byte: GCM, CTR, and their difference (GHASH cost) */
1686 printf("%.2f-%.2f=%.2f\n",
1687 gcm_t/(double)sizeof(buf),
1688 ctr_t/(double)sizeof(buf),
1689 (gcm_t-ctr_t)/(double)sizeof(buf));
/* isolate GHASH throughput: one warm-up pass, then 100 timed iterations */
1691 GHASH(&ctx,buf.c,sizeof(buf));
1692 start = OPENSSL_rdtsc();
1693 for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1694 gcm_t = OPENSSL_rdtsc() - start;
1695 printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);