1 /* ====================================================================
2 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in
13 * the documentation and/or other materials provided with the
16 * 3. All advertising materials mentioning features or use of this
17 * software must display the following acknowledgment:
18 * "This product includes software developed by the OpenSSL Project
19 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22 * endorse or promote products derived from this software without
23 * prior written permission. For written permission, please contact
24 * openssl-core@openssl.org.
26 * 5. Products derived from this software may not be called "OpenSSL"
27 * nor may "OpenSSL" appear in their names without prior written
28 * permission of the OpenSSL Project.
30 * 6. Redistributions of any form whatsoever must retain the following
32 * "This product includes software developed by the OpenSSL Project
33 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46 * OF THE POSSIBILITY OF SUCH DAMAGE.
47 * ====================================================================
50 #define OPENSSL_FIPSAPI
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
/*
 * GETU32(p) reads one 32-bit word straight through a u32 pointer and
 * byte-swaps it with BSWAP4.  Because p is dereferenced as a u32, the
 * caller must pass a pointer that is suitably aligned for u32.
 */
66 #define GETU32(p) BSWAP4(*(const u32 *)(p))
/*
 * PUTU32(p,v) byte-swaps v with BSWAP4 and stores it through a u32
 * pointer.  The expansion is a bare assignment expression (it is not
 * wrapped in parentheses), so use it only as a full statement.
 */
68 #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
/*
 * PACK(s) moves the 16-bit constant s into the 16 most significant bits
 * of a size_t, whatever the width of size_t is.
 */
71 #define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * REDUCE1BIT(V) shifts the 128-bit value V (64-bit fields hi and lo) right
 * by one bit.  If the bit shifted out of V.lo was set, the constant 0xe1 is
 * XORed into the most significant byte of V.hi.  The mask T is derived from
 * the low bit of V.lo *before* V.lo is shifted, so statement order matters.
 * Two bodies are visible, chosen by sizeof(size_t): one builds the mask
 * directly as a u64, the other builds a 32-bit mask and widens it with
 * (u64)T<<32.  V is evaluated several times; pass a plain lvalue.
 */
72 #define REDUCE1BIT(V) do { \
73 if (sizeof(size_t)==8) { \
74 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
75 V.lo = (V.hi<<63)|(V.lo>>1); \
76 V.hi = (V.hi>>1 )^T; \
79 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80 V.lo = (V.hi<<63)|(V.lo>>1); \
81 V.hi = (V.hi>>1 )^((u64)T<<32); \
86 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87 * never be set to 8. 8 is effectively reserved for testing purposes.
88 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89 * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90 * whole spectrum of possible table driven implementations. Why? In
91 * non-"Shoup's" case memory access pattern is segmented in such manner,
92 * that it's trivial to see that cache timing information can reveal
93 * fair portion of intermediate hash value. Given that ciphertext is
94 * always available to attacker, it's possible for him to attempt to
95 * deduce secret parameter H and if successful, tamper with messages
96 * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
97 * not as trivial, but there is no reason to believe that it's resistant
98 * to cache-timing attack. And the thing about "8-bit" implementation is
99 * that it consumes 16 (sixteen) times more memory, 4KB per individual
100 * key + 1KB shared. Well, on the plus side it should be twice as fast as
101 * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
102 * was observed to run ~75% faster, closer to 100% for commercial
103 * compilers... Yet "4-bit" procedure is preferred, because it's
104 * believed to provide better security-performance balance and adequate
105 * all-round performance. "All-round" refers to things like:
107 * - shorter setup time effectively improves overall timing for
108 * handling short messages;
109 * - larger table allocation can become unbearable because of VM
110 * subsystem penalties (for example on Windows large enough free
111 * results in VM working set trimming, meaning that subsequent
112 * malloc would immediately incur working set expansion);
113 * - larger table has larger cache footprint, which can affect
114 * performance of other code paths (not necessarily even from same
115 * thread in Hyper-Threading world);
117 * Value of 1 is not appropriate for performance reasons.
/*
 * gcm_init_8bit - build the 256-entry multiplication table used by
 * gcm_gmult_8bit.
 *
 * Htable: output table of 256 u128 entries.
 * H:      hash key as two 64-bit words (how V is seeded from H is on
 *         lines not visible in this excerpt).
 *
 * Htable[128] is seeded with V, and the first loop then walks the
 * power-of-two indices 64, 32, ..., 1 (its body is not visible here;
 * presumably it applies REDUCE1BIT and stores V -- confirm).  The second
 * loop fills every remaining index i+j (i a power of two, 0 < j < i) with
 * Htable[i] XOR Htable[j].
 */
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
131 for (Htable[128]=V, i=64; i>0; i>>=1) {
136 for (i=2; i<256; i<<=1) {
/* Hi points at the power-of-two entry; H0 is a copy of its value. */
137 u128 *Hi = Htable+i, H0 = *Hi;
138 for (j=1; j<i; ++j) {
139 Hi[j].hi = H0.hi^Htable[j].hi;
140 Hi[j].lo = H0.lo^Htable[j].lo;
/*
 * gcm_gmult_8bit - multiply Xi by the hash key using the 8-bit table.
 *
 * Xi:     16-byte hash state, read byte by byte starting from the last
 *         byte (offset 15) and overwritten with the product.
 * Htable: 256-entry table prepared by gcm_init_8bit.
 *
 * Each step XORs the table entry selected by the current byte into the
 * accumulator Z; unless this was the first byte of Xi, Z is then shifted
 * right by 8 bits and the byte shifted out selects a correction constant
 * from rem_8bit that is XORed into the top of Z.hi.  The result is written
 * back to Xi most-significant-byte first.
 */
145 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
/* Start at the last byte of Xi. */
148 const u8 *xi = (const u8 *)Xi+15;
/* Run-time endianness probe: .little is non-zero on little-endian hosts. */
150 const union { long one; char little; } is_endian = {1};
/*
 * Correction constants indexed by the 8 bits shifted out of Z.lo.  Each
 * 16-bit value is PACKed into the top 16 bits of a size_t.
 */
151 static const size_t rem_8bit[256] = {
152 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
153 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
154 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
155 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
156 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
157 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
158 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
159 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
160 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
161 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
162 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
163 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
164 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
165 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
166 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
167 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
168 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
169 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
170 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
171 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
172 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
173 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
174 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
175 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
176 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
177 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
178 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
179 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
180 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
181 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
182 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
183 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
184 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
185 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
186 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
187 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
188 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
189 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
190 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
191 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
192 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
193 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
194 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
195 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
196 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
197 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
198 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
199 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
200 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
201 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
202 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
203 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
204 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
205 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
206 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
207 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
208 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
209 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
210 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
211 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
212 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
213 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
214 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
215 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
/* Fold in the table entry selected by the current byte n of Xi. */
218 Z.hi ^= Htable[n].hi;
219 Z.lo ^= Htable[n].lo;
/* Stop once the first byte of Xi has been consumed (no trailing shift). */
221 if ((u8 *)Xi==xi) break;
/* Shift Z right by one byte, remembering the byte that falls off. */
225 rem = (size_t)Z.lo&0xff;
226 Z.lo = (Z.hi<<56)|(Z.lo>>8);
/*
 * Apply the correction for the dropped byte.  When size_t is narrower
 * than 64 bits the PACKed constant is moved up by a further 32 bits so it
 * still lands in the top 16 bits of Z.hi.
 */
228 if (sizeof(size_t)==8)
229 Z.hi ^= rem_8bit[rem];
231 Z.hi ^= (u64)rem_8bit[rem]<<32;
/* Store Z into Xi most-significant-byte first. */
234 if (is_endian.little) {
236 Xi[0] = BSWAP8(Z.hi);
237 Xi[1] = BSWAP8(Z.lo);
/* Alternative store, one 32-bit word at a time through PUTU32. */
241 v = (u32)(Z.hi>>32); PUTU32(p,v);
242 v = (u32)(Z.hi); PUTU32(p+4,v);
243 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
244 v = (u32)(Z.lo); PUTU32(p+12,v);
/* In the 8-bit configuration GCM_MUL multiplies ctx->Xi using ctx->Htable. */
252 #define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
/*
 * gcm_init_4bit - build the 16-entry multiplication table used by the
 * 4-bit GHASH routines.
 *
 * Htable: output table of 16 u128 entries.
 * H:      hash key as two 64-bit words (the seeding of V from H is on
 *         lines not visible in this excerpt).
 *
 * Two equivalent constructions are selected by OPENSSL_SMALL_FOOTPRINT: a
 * compact pair of loops, and a fully unrolled sequence of assignments.
 * Both fill each non-power-of-two entry with the XOR of a power-of-two
 * entry and a lower entry.
 */
256 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
259 #if defined(OPENSSL_SMALL_FOOTPRINT)
268 #if defined(OPENSSL_SMALL_FOOTPRINT)
/* Compact form: seed Htable[8] with V, then visit indices 4, 2, 1. */
269 for (Htable[8]=V, i=4; i>0; i>>=1) {
/* Fill Htable[i+j] = Htable[i] ^ Htable[j] for each power of two i. */
274 for (i=2; i<16; i<<=1) {
277 for (V=*Hi, j=1; j<i; ++j) {
278 Hi[j].hi = V.hi^Htable[j].hi;
279 Hi[j].lo = V.lo^Htable[j].lo;
/*
 * Unrolled form.  V is reassigned between the groups below on lines not
 * visible here; presumably it holds Htable[2], Htable[4] and Htable[8]
 * respectively -- confirm against the complete source.
 */
290 Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
292 Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo;
293 Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo;
294 Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo;
296 Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo;
297 Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
298 Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
299 Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
300 Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
301 Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
302 Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
304 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
306 * ARM assembler expects specific dword order in Htable.
310 const union { long one; char little; } is_endian = {1};
312 if (is_endian.little)
/*
 * Visible re-ordering step: the two 64-bit halves of an entry are swapped
 * and each half is rotated by 32 bits.  Which endianness branch these two
 * statements belong to cannot be determined from this excerpt.
 */
321 Htable[j].hi = V.lo<<32|V.lo>>32;
322 Htable[j].lo = V.hi<<32|V.hi>>32;
/*
 * Correction constants for the 4-bit GHASH routines, indexed by the four
 * bits shifted out of Z.lo (rem = Z.lo & 0xf).  Each 16-bit value is PACKed
 * into the top 16 bits of a size_t; users shift it a further 32 bits when
 * size_t is not 8 bytes wide.
 */
329 static const size_t rem_4bit[16] = {
330 PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
331 PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
332 PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
333 PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
/*
 * gcm_gmult_4bit - multiply Xi by the hash key using the 4-bit table.
 *
 * Xi:     16-byte hash state; read starting from byte 15 and overwritten
 *         with the product.
 * Htable: 16-entry table prepared by gcm_init_4bit.
 *
 * Every byte of Xi is processed as two nibbles (nlo, nhi).  For each
 * nibble the accumulator Z is shifted right by 4 bits, the four bits
 * shifted out select a correction from rem_4bit, and the table entry for
 * the nibble is XORed in.  The splitting of a byte into nlo/nhi and the
 * loop control are on lines not visible in this excerpt.
 */
335 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
339 size_t rem, nlo, nhi;
/* Run-time endianness probe: .little is non-zero on little-endian hosts. */
340 const union { long one; char little; } is_endian = {1};
/* Begin with the last byte of Xi. */
342 nlo = ((const u8 *)Xi)[15];
/* Z starts as the table entry for the first nibble. */
346 Z.hi = Htable[nlo].hi;
347 Z.lo = Htable[nlo].lo;
/* Shift Z right by 4 bits and correct for the bits that fell off. */
350 rem = (size_t)Z.lo&0xf;
351 Z.lo = (Z.hi<<60)|(Z.lo>>4);
353 if (sizeof(size_t)==8)
354 Z.hi ^= rem_4bit[rem];
356 Z.hi ^= (u64)rem_4bit[rem]<<32;
/* Fold in the entry for the high nibble of the current byte. */
358 Z.hi ^= Htable[nhi].hi;
359 Z.lo ^= Htable[nhi].lo;
/* Fetch the next byte of Xi (cnt is updated on a line not shown). */
363 nlo = ((const u8 *)Xi)[cnt];
367 rem = (size_t)Z.lo&0xf;
368 Z.lo = (Z.hi<<60)|(Z.lo>>4);
370 if (sizeof(size_t)==8)
371 Z.hi ^= rem_4bit[rem];
373 Z.hi ^= (u64)rem_4bit[rem]<<32;
/* Fold in the entry for the low nibble of the new byte. */
375 Z.hi ^= Htable[nlo].hi;
376 Z.lo ^= Htable[nlo].lo;
/* Store Z into Xi most-significant-byte first. */
379 if (is_endian.little) {
381 Xi[0] = BSWAP8(Z.hi);
382 Xi[1] = BSWAP8(Z.lo);
/* Alternative store, one 32-bit word at a time through PUTU32. */
386 v = (u32)(Z.hi>>32); PUTU32(p,v);
387 v = (u32)(Z.hi); PUTU32(p+4,v);
388 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
389 v = (u32)(Z.lo); PUTU32(p+12,v);
398 #if !defined(OPENSSL_SMALL_FOOTPRINT)
400 * Streamed gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt for
401 * details... Compiler-generated code doesn't seem to give any
402 * performance improvement, at least not on x86[_64]. It's here
403 * mostly as reference and a placeholder for possible future
404 * non-trivial optimization[s]...
/*
 * gcm_ghash_4bit - absorb len bytes from inp into the hash state Xi.
 *
 * Xi:     16-byte hash state, updated in place after every 16-byte block.
 * Htable: 16-entry table prepared by gcm_init_4bit.
 * inp:    input data.
 * len:    number of input bytes.  The loop ends with
 *         "while (inp+=16, len-=16)", so it stops only when len reaches
 *         exactly zero: len must be a non-zero multiple of 16.
 *
 * Two bodies are visible.  The first repeats the nibble-at-a-time scheme of
 * gcm_gmult_4bit.  The second precomputes two per-call tables (Hshr4 and
 * Hshl4) so that a whole byte can be handled per step, using the 16-bit
 * rem_8bit constants.  The preprocessor lines selecting between the two
 * bodies, and the lines mixing inp into the state, are not visible in this
 * excerpt.
 */
406 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
407 const u8 *inp,size_t len)
411 size_t rem, nlo, nhi;
/* Run-time endianness probe: .little is non-zero on little-endian hosts. */
412 const union { long one; char little; } is_endian = {1};
/* First variant: start from the last byte of the state. */
417 nlo = ((const u8 *)Xi)[15];
422 Z.hi = Htable[nlo].hi;
423 Z.lo = Htable[nlo].lo;
/* Shift Z right by 4 bits and correct for the bits that fell off. */
426 rem = (size_t)Z.lo&0xf;
427 Z.lo = (Z.hi<<60)|(Z.lo>>4);
429 if (sizeof(size_t)==8)
430 Z.hi ^= rem_4bit[rem];
432 Z.hi ^= (u64)rem_4bit[rem]<<32;
434 Z.hi ^= Htable[nhi].hi;
435 Z.lo ^= Htable[nhi].lo;
439 nlo = ((const u8 *)Xi)[cnt];
444 rem = (size_t)Z.lo&0xf;
445 Z.lo = (Z.hi<<60)|(Z.lo>>4);
447 if (sizeof(size_t)==8)
448 Z.hi ^= rem_4bit[rem];
450 Z.hi ^= (u64)rem_4bit[rem]<<32;
452 Z.hi ^= Htable[nlo].hi;
453 Z.lo ^= Htable[nlo].lo;
457 * Extra 256+16 bytes per-key plus 512 bytes shared tables
458 * [should] give ~50% improvement... One could have PACK()-ed
459 * the rem_8bit even here, but the priority is to minimize
462 u128 Hshr4[16]; /* Htable shifted right by 4 bits */
463 u8 Hshl4[16]; /* Htable shifted left by 4 bits */
/* Unpacked 16-bit correction constants; users shift them left by 48. */
464 static const unsigned short rem_8bit[256] = {
465 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
466 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
467 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
468 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
469 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
470 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
471 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
472 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
473 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
474 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
475 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
476 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
477 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
478 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
479 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
480 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
481 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
482 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
483 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
484 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
485 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
486 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
487 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
488 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
489 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
490 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
491 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
492 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
493 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
494 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
495 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
496 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
498 * This pre-processing phase slows down procedure by approximately
499 * same time as it makes each loop spin faster. In other words
500 * single block performance is approximately same as straightforward
501 * "4-bit" implementation, and then it goes only faster...
/*
 * Build the per-call helper tables: Hshr4[i] is Htable[i] shifted right by
 * 4 bits, and Hshl4[i] keeps the low byte of Htable[i].lo shifted left by 4
 * (i.e. the four bits that the right shift drops, moved to the high nibble).
 */
503 for (cnt=0; cnt<16; ++cnt) {
504 Z.hi = Htable[cnt].hi;
505 Z.lo = Htable[cnt].lo;
506 Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
507 Hshr4[cnt].hi = (Z.hi>>4);
508 Hshl4[cnt] = (u8)(Z.lo<<4);
/* Process bytes 15 down to 1 of the state, one whole byte per iteration. */
512 for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
513 nlo = ((const u8 *)Xi)[cnt];
518 Z.hi ^= Htable[nlo].hi;
519 Z.lo ^= Htable[nlo].lo;
/* Shift Z right by a full byte, remembering the byte that falls off. */
521 rem = (size_t)Z.lo&0xff;
523 Z.lo = (Z.hi<<56)|(Z.lo>>8);
/*
 * Fold in the pre-shifted entry for the high nibble; the bits that the
 * pre-shift dropped (Hshl4) are merged into rem before the lookup.
 */
526 Z.hi ^= Hshr4[nhi].hi;
527 Z.lo ^= Hshr4[nhi].lo;
528 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
/* Byte 0 is handled separately with a single 4-bit shift. */
531 nlo = ((const u8 *)Xi)[0];
536 Z.hi ^= Htable[nlo].hi;
537 Z.lo ^= Htable[nlo].lo;
539 rem = (size_t)Z.lo&0xf;
541 Z.lo = (Z.hi<<60)|(Z.lo>>4);
544 Z.hi ^= Htable[nhi].hi;
545 Z.lo ^= Htable[nhi].lo;
/* rem<<4 selects the rem_8bit entry matching a 4-bit remainder. */
546 Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
/* Store Z into Xi most-significant-byte first. */
549 if (is_endian.little) {
551 Xi[0] = BSWAP8(Z.hi);
552 Xi[1] = BSWAP8(Z.lo);
/* Alternative store, one 32-bit word at a time through PUTU32. */
556 v = (u32)(Z.hi>>32); PUTU32(p,v);
557 v = (u32)(Z.hi); PUTU32(p+4,v);
558 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
559 v = (u32)(Z.lo); PUTU32(p+12,v);
/* Advance one block; the loop ends only when len hits exactly zero. */
566 } while (inp+=16, len-=16);
/*
 * Prototypes without definitions: in this configuration gcm_gmult_4bit and
 * gcm_ghash_4bit are supplied from outside this translation unit
 * (presumably by an assembler module -- the guarding #if is not visible).
 */
570 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
571 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
/* GCM_MUL multiplies ctx->Xi by the hash key using the 4-bit table. */
574 #define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
575 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
/* GHASH absorbs len bytes at in into ctx->Xi in a single call. */
576 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
577 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
578 * thrashing effect. In other words idea is to hash data while it's
579 * still in L1 cache after encryption pass... */
580 #define GHASH_CHUNK (3*1024)
583 #else /* TABLE_BITS */
/*
 * gcm_gmult_1bit - table-free multiplication of Xi by the hash key H.
 *
 * Xi: 16-byte hash state, read one "long" at a time and overwritten with
 *     the product.
 * H:  hash key in host byte order.
 *
 * Each long-sized word of Xi is brought into most-significant-bit-first
 * order in X, then scanned one bit at a time from the top.  M is derived
 * from the top bit of X by a right shift of a signed long; the code relies
 * on that shift replicating the sign bit.  How M, V and Z are combined is
 * on lines not visible in this excerpt.
 */
585 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
590 const long *xi = (const long *)Xi;
/* Run-time endianness probe: .little is non-zero on little-endian hosts. */
591 const union { long one; char little; } is_endian = {1};
593 V.hi = H[0]; /* H is in host byte order, no byte swapping */
/* Visit Xi in long-sized words: two when long is 64-bit, four when 32-bit. */
596 for (j=0; j<16/sizeof(long); ++j) {
597 if (is_endian.little) {
598 if (sizeof(long)==8) {
/* Byte-swap a whole 64-bit word at once. */
600 X = (long)(BSWAP8(xi[j]));
/* Otherwise assemble the word from big-endian 32-bit loads. */
602 const u8 *p = (const u8 *)(xi+j);
603 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
607 const u8 *p = (const u8 *)(xi+j);
/* Scan X from its most significant bit downward. */
614 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
615 u64 M = (u64)(X>>(8*sizeof(long)-1));
/* Store Z into Xi most-significant-byte first. */
623 if (is_endian.little) {
625 Xi[0] = BSWAP8(Z.hi);
626 Xi[1] = BSWAP8(Z.lo);
/* Alternative store, one 32-bit word at a time through PUTU32. */
630 v = (u32)(Z.hi>>32); PUTU32(p,v);
631 v = (u32)(Z.hi); PUTU32(p+4,v);
632 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
633 v = (u32)(Z.lo); PUTU32(p+12,v);
/* In the 1-bit configuration GCM_MUL works directly from ctx->H. */
641 #define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
/*
 * Declarations for optional assembler back-ends of the 4-bit GHASH.
 * When a platform section defines GCM_FUNCREF_4BIT, GCM_MUL and GHASH are
 * redefined further down to call through the local function pointers
 * gcm_gmult_p / gcm_ghash_p instead of a fixed routine.
 */
645 #if TABLE_BITS==4 && defined(GHASH_ASM)
/* x86 and x86-64 (skipped when I386_ONLY is defined). */
646 # if !defined(I386_ONLY) && \
647 (defined(__i386) || defined(__i386__) || \
648 defined(__x86_64) || defined(__x86_64__) || \
649 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
650 # define GHASH_ASM_X86_OR_64
651 # define GCM_FUNCREF_4BIT
/* Capability words consulted by CRYPTO_gcm128_init to pick a back-end. */
652 extern unsigned int OPENSSL_ia32cap_P[2];
/* Carry-less-multiply based routines. */
654 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
655 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
656 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
/* 32-bit x86 only: MMX and plain-integer variants of the 4-bit routines. */
658 # if defined(__i386) || defined(__i386__) || defined(_M_IX86)
659 # define GHASH_ASM_X86
660 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
661 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
663 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
664 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
/* ARM: NEON routines, selected at run time through OPENSSL_armcap. */
666 # elif defined(__arm__) || defined(__arm)
667 # include "arm_arch.h"
669 # define GHASH_ASM_ARM
670 # define GCM_FUNCREF_4BIT
671 extern unsigned int OPENSSL_armcap;
673 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
674 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
/*
 * With run-time dispatch the macros call through gcm_gmult_p / gcm_ghash_p,
 * which each user function declares locally and loads from ctx->gmult /
 * ctx->ghash.
 */
679 #ifdef GCM_FUNCREF_4BIT
681 # define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
684 # define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
/*
 * CRYPTO_gcm128_init - initialise a GCM context for a given block cipher key.
 *
 * ctx:   context to initialise; it is zeroed first.
 * key:   opaque key schedule handed unchanged to block.
 * block: single-block encryption callback.
 *
 * The hash key H is obtained by encrypting ctx->H (all zero after the
 * memset) in place, then converted to host byte order.  The multiplication
 * table is built by the routine matching the compiled configuration, and on
 * builds with run-time dispatch ctx->gmult / ctx->ghash are pointed at the
 * implementation chosen from the CPU capability bits.
 */
688 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
/* Run-time endianness probe: .little is non-zero on little-endian hosts. */
690 const union { long one; char little; } is_endian = {1};
692 memset(ctx,0,sizeof(*ctx));
/* H = E_K(0^128): encrypt the zeroed H buffer in place. */
696 (*block)(ctx->H.c,ctx->H.c,key);
698 if (is_endian.little) {
699 /* H is stored in host byte order */
701 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
702 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
/* Alternative conversion, assembling each half from 32-bit loads. */
706 hi = (u64)GETU32(p) <<32|GETU32(p+4);
707 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
/* 8-bit table configuration. */
714 gcm_init_8bit(ctx->Htable,ctx->H.u);
716 # if defined(GHASH_ASM_X86_OR_64)
717 # if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
718 if (OPENSSL_ia32cap_P[1]&(1<<1)) { /* check PCLMULQDQ bit */
719 gcm_init_clmul(ctx->Htable,ctx->H.u);
720 ctx->gmult = gcm_gmult_clmul;
721 ctx->ghash = gcm_ghash_clmul;
/* Otherwise build the generic 4-bit table and pick a 4-bit routine. */
725 gcm_init_4bit(ctx->Htable,ctx->H.u);
726 # if defined(GHASH_ASM_X86) /* x86 only */
727 if (OPENSSL_ia32cap_P[0]&(1<<23)) { /* check MMX bit */
728 ctx->gmult = gcm_gmult_4bit_mmx;
729 ctx->ghash = gcm_ghash_4bit_mmx;
731 ctx->gmult = gcm_gmult_4bit_x86;
732 ctx->ghash = gcm_ghash_4bit_x86;
735 ctx->gmult = gcm_gmult_4bit;
736 ctx->ghash = gcm_ghash_4bit;
738 # elif defined(GHASH_ASM_ARM)
/* Bit 0 of OPENSSL_armcap selects the NEON routines. */
739 if (OPENSSL_armcap & 1) {
740 ctx->gmult = gcm_gmult_neon;
741 ctx->ghash = gcm_ghash_neon;
743 gcm_init_4bit(ctx->Htable,ctx->H.u);
744 ctx->gmult = gcm_gmult_4bit;
745 ctx->ghash = gcm_ghash_4bit;
/* No run-time dispatch: plain 4-bit table. */
748 gcm_init_4bit(ctx->Htable,ctx->H.u);
/*
 * CRYPTO_gcm128_setiv - load an IV and reset per-message state.
 *
 * ctx: initialised GCM context.
 * iv:  initialisation vector.
 * len: IV length in bytes.
 *
 * Resets the AAD and message length counters, derives the initial counter
 * block Yi from the IV, encrypts it into ctx->EK0 and writes the 32-bit
 * counter back into the last four bytes of Yi.  Two derivations are
 * visible: a direct 12-byte copy, and a hashed path that XORs the IV into
 * Yi block by block followed by the length value len0.  The conditions
 * selecting between them are on lines not visible in this excerpt.
 */
753 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
/* Run-time endianness probe: .little is non-zero on little-endian hosts. */
755 const union { long one; char little; } is_endian = {1};
757 #ifdef GCM_FUNCREF_4BIT
/* Local pointer used by the GCM_MUL macro on run-time-dispatch builds. */
758 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
765 ctx->len.u[0] = 0; /* AAD length */
766 ctx->len.u[1] = 0; /* message length */
/* Direct path: the 12 IV bytes become the first 12 bytes of Yi. */
771 memcpy(ctx->Yi.c,iv,12);
/* Hashed path: absorb full 16-byte IV blocks ... */
780 for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
/* ... then the trailing partial block, if any ... */
786 for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
/* ... and finally len0, XORed big-endian into the last 8 bytes of Yi. */
790 if (is_endian.little) {
792 ctx->Yi.u[1] ^= BSWAP8(len0);
794 ctx->Yi.c[8] ^= (u8)(len0>>56);
795 ctx->Yi.c[9] ^= (u8)(len0>>48);
796 ctx->Yi.c[10] ^= (u8)(len0>>40);
797 ctx->Yi.c[11] ^= (u8)(len0>>32);
798 ctx->Yi.c[12] ^= (u8)(len0>>24);
799 ctx->Yi.c[13] ^= (u8)(len0>>16);
800 ctx->Yi.c[14] ^= (u8)(len0>>8);
801 ctx->Yi.c[15] ^= (u8)(len0);
805 ctx->Yi.u[1] ^= len0;
/* Read the 32-bit counter from the last four bytes of Yi. */
809 if (is_endian.little)
810 ctr = GETU32(ctx->Yi.c+12);
/* EK0 = E_K(Yi); CRYPTO_gcm128_finish later XORs it into Xi. */
815 (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
/* Write the counter back (its update is on a line not shown). */
817 if (is_endian.little)
818 PUTU32(ctx->Yi.c+12,ctr);
/*
 * CRYPTO_gcm128_aad - absorb additional authenticated data into the hash.
 *
 * ctx: GCM context after CRYPTO_gcm128_setiv.
 * aad: additional authenticated data.
 * len: number of AAD bytes.
 *
 * Returns -2 if message data has already been processed
 * (ctx->len.u[1] != 0).  The running AAD length is checked against a
 * 2^61-byte limit and against wrap-around; the return statements for that
 * check and for success are on lines not visible in this excerpt.
 * May be called repeatedly; a partial block is carried between calls in n.
 */
823 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
827 u64 alen = ctx->len.u[0];
828 #ifdef GCM_FUNCREF_4BIT
/* Local pointers used by GCM_MUL / GHASH on run-time-dispatch builds. */
829 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
831 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
832 const u8 *inp,size_t len) = ctx->ghash;
/* AAD must be supplied before any plaintext or ciphertext. */
836 if (ctx->len.u[1]) return -2;
/* Enforce the AAD length limit and detect wrap-around of alen. */
839 if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
841 ctx->len.u[0] = alen;
/* Top up a partial block left over from a previous call. */
846 ctx->Xi.c[n] ^= *(aad++);
/* n wrapped to zero: the block is complete, so multiply by H. */
850 if (n==0) GCM_MUL(ctx,Xi);
/* Bulk path: i is len rounded down to a multiple of 16. */
858 if ((i = (len&(size_t)-16))) {
/* Block-at-a-time path: XOR one full block into Xi. */
865 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
/* Trailing partial block: XOR it in and remember how many bytes were used. */
872 n = (unsigned int)len;
873 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
/*
 * CRYPTO_gcm128_encrypt - encrypt data in counter mode and fold the
 * resulting ciphertext into the authentication hash.
 *
 * ctx: GCM context after CRYPTO_gcm128_setiv (and any AAD).
 * in:  plaintext.
 * out: ciphertext buffer.
 * (The length parameter is declared on a line not visible in this excerpt.)
 *
 * The running message length is checked against the 2^36-32 byte limit and
 * against wrap-around before any data is processed; the return values are
 * on lines not visible here.  Several code paths are present: completion of
 * a partial block from an earlier call, a word-at-a-time bulk path (taken
 * in GHASH_CHUNK-sized strides when GHASH is available), a tail for the
 * final partial block, and a byte-at-a-time loop.
 */
880 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
881 const unsigned char *in, unsigned char *out,
/* Run-time endianness probe: .little is non-zero on little-endian hosts. */
884 const union { long one; char little; } is_endian = {1};
887 u64 mlen = ctx->len.u[1];
888 #ifdef GCM_FUNCREF_4BIT
/* Local pointers used by GCM_MUL / GHASH on run-time-dispatch builds. */
889 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
891 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
892 const u8 *inp,size_t len) = ctx->ghash;
897 n = (unsigned int)mlen%16; /* alternative to ctx->mres */
/* Enforce the message length limit and detect wrap-around of mlen. */
900 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
902 ctx->len.u[1] = mlen;
905 /* First call to encrypt finalizes GHASH(AAD) */
/* Load the 32-bit block counter from the tail of Yi. */
910 if (is_endian.little)
911 ctr = GETU32(ctx->Yi.c+12);
916 #if !defined(OPENSSL_SMALL_FOOTPRINT)
917 if (16%sizeof(size_t) == 0) do { /* always true actually */
/*
 * Use up key stream left from a previous call: produce a ciphertext byte
 * and XOR that same byte into Xi in one expression.
 */
920 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
/* n wrapped to zero: the block is complete, so multiply by H. */
924 if (n==0) GCM_MUL(ctx,Xi);
930 #if defined(STRICT_ALIGNMENT)
/* Word-sized accesses below need both buffers aligned to size_t. */
931 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
934 #if defined(GHASH) && defined(GHASH_CHUNK)
/* Bulk path: encrypt a whole chunk first, then hash it in one call. */
935 while (len>=GHASH_CHUNK) {
936 size_t j=GHASH_CHUNK;
/* Generate the next key stream block and store the counter back. */
939 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
941 if (is_endian.little)
942 PUTU32(ctx->Yi.c+12,ctr);
/* XOR input with key stream one size_t at a time. */
945 for (i=0; i<16; i+=sizeof(size_t))
947 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
/* Hash the ciphertext chunk that was just written. */
952 GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
/* Remaining whole blocks: i is len rounded down to a multiple of 16. */
955 if ((i = (len&(size_t)-16))) {
959 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
961 if (is_endian.little)
962 PUTU32(ctx->Yi.c+12,ctr);
965 for (i=0; i<16; i+=sizeof(size_t))
967 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
/* Variant without GHASH: XOR each ciphertext word straight into Xi. */
976 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
978 if (is_endian.little)
979 PUTU32(ctx->Yi.c+12,ctr);
982 for (i=0; i<16; i+=sizeof(size_t))
983 *(size_t *)(ctx->Xi.c+i) ^=
985 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
/* Final partial block: one more key stream block, consumed byte-wise. */
993 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
995 if (is_endian.little)
996 PUTU32(ctx->Yi.c+12,ctr);
1000 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
/* Byte-at-a-time loop. */
1009 for (i=0;i<len;++i) {
1011 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1013 if (is_endian.little)
1014 PUTU32(ctx->Yi.c+12,ctr);
1018 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
/*
 * CRYPTO_gcm128_decrypt - decrypt data in counter mode and fold the
 * incoming ciphertext into the authentication hash.
 *
 * ctx: GCM context after CRYPTO_gcm128_setiv (and any AAD).
 * in:  ciphertext.
 * out: plaintext buffer.
 * (The length parameter is declared on a line not visible in this excerpt.)
 *
 * Mirrors CRYPTO_gcm128_encrypt, except that the hash is always fed from
 * the input: in the chunked path GHASH runs over "in" before the chunk is
 * decrypted, and in the other paths each ciphertext value is read into a
 * temporary c that is used both for the output and for Xi.  The length
 * limit check matches the encrypt side; the return values are on lines not
 * visible in this excerpt.
 */
1028 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1029 const unsigned char *in, unsigned char *out,
/* Run-time endianness probe: .little is non-zero on little-endian hosts. */
1032 const union { long one; char little; } is_endian = {1};
1033 unsigned int n, ctr;
1035 u64 mlen = ctx->len.u[1];
1036 #ifdef GCM_FUNCREF_4BIT
/* Local pointers used by GCM_MUL / GHASH on run-time-dispatch builds. */
1037 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1039 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1040 const u8 *inp,size_t len) = ctx->ghash;
/* Enforce the message length limit and detect wrap-around of mlen. */
1045 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1047 ctx->len.u[1] = mlen;
1050 /* First call to decrypt finalizes GHASH(AAD) */
/* Load the 32-bit block counter from the tail of Yi. */
1055 if (is_endian.little)
1056 ctr = GETU32(ctx->Yi.c+12);
1061 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1062 if (16%sizeof(size_t) == 0) do { /* always true actually */
/* Use up key stream left from a previous call. */
1066 *(out++) = c^ctx->EKi.c[n];
/* n wrapped to zero: the block is complete, so multiply by H. */
1071 if (n==0) GCM_MUL (ctx,Xi);
1077 #if defined(STRICT_ALIGNMENT)
/* Word-sized accesses below need both buffers aligned to size_t. */
1078 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1081 #if defined(GHASH) && defined(GHASH_CHUNK)
1082 while (len>=GHASH_CHUNK) {
1083 size_t j=GHASH_CHUNK;
/* Hash the ciphertext chunk before it is decrypted. */
1085 GHASH(ctx,in,GHASH_CHUNK);
/* Generate the next key stream block and store the counter back. */
1087 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1089 if (is_endian.little)
1090 PUTU32(ctx->Yi.c+12,ctr);
/* XOR input with key stream one size_t at a time. */
1093 for (i=0; i<16; i+=sizeof(size_t))
1094 *(size_t *)(out+i) =
1095 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
/* Remaining whole blocks: i is len rounded down to a multiple of 16. */
1102 if ((i = (len&(size_t)-16))) {
1105 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1107 if (is_endian.little)
1108 PUTU32(ctx->Yi.c+12,ctr);
1111 for (i=0; i<16; i+=sizeof(size_t))
1112 *(size_t *)(out+i) =
1113 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
/* Variant without GHASH: fold each ciphertext word straight into Xi. */
1121 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1123 if (is_endian.little)
1124 PUTU32(ctx->Yi.c+12,ctr);
1127 for (i=0; i<16; i+=sizeof(size_t)) {
/* Read the ciphertext word once; it feeds both the output and Xi. */
1128 size_t c = *(size_t *)(in+i);
1129 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1130 *(size_t *)(ctx->Xi.c+i) ^= c;
/* Final partial block: one more key stream block, consumed byte-wise. */
1139 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1141 if (is_endian.little)
1142 PUTU32(ctx->Yi.c+12,ctr);
1148 out[n] = c^ctx->EKi.c[n];
/* Byte-at-a-time loop. */
1157 for (i=0;i<len;++i) {
1160 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1162 if (is_endian.little)
1163 PUTU32(ctx->Yi.c+12,ctr);
1168 out[i] = c^ctx->EKi.c[n];
/*
 * CRYPTO_gcm128_encrypt_ctr32 - encrypt using a caller-supplied multi-block
 * counter-mode routine and fold the ciphertext into the authentication hash.
 *
 * ctx:    GCM context after CRYPTO_gcm128_setiv (and any AAD).
 * in:     plaintext.
 * out:    ciphertext buffer.
 * len:    number of bytes to process.
 * stream: callback invoked as stream(in, out, blocks, key, counter_block)
 *         to process a run of whole 16-byte blocks.  The local counter is
 *         advanced by the block count after each call and written back to
 *         Yi, so the callback is presumably not expected to update Yi
 *         itself -- confirm against the ctr128_f contract.
 *
 * Whole blocks go through stream and are then hashed; partial blocks at
 * either end fall back to the single-block ctx->block callback.  The length
 * limit check matches CRYPTO_gcm128_encrypt; the return values are on lines
 * not visible in this excerpt.
 */
1179 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1180 const unsigned char *in, unsigned char *out,
1181 size_t len, ctr128_f stream)
/* Run-time endianness probe: .little is non-zero on little-endian hosts. */
1183 const union { long one; char little; } is_endian = {1};
1184 unsigned int n, ctr;
1186 u64 mlen = ctx->len.u[1];
1187 #ifdef GCM_FUNCREF_4BIT
/* Local pointers used by GCM_MUL / GHASH on run-time-dispatch builds. */
1188 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1190 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1191 const u8 *inp,size_t len) = ctx->ghash;
/* Enforce the message length limit and detect wrap-around of mlen. */
1196 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1198 ctx->len.u[1] = mlen;
1201 /* First call to encrypt finalizes GHASH(AAD) */
/* Load the 32-bit block counter from the tail of Yi. */
1206 if (is_endian.little)
1207 ctr = GETU32(ctx->Yi.c+12);
/* Use up key stream left from a previous call, hashing the ciphertext. */
1214 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
/* n wrapped to zero: the block is complete, so multiply by H. */
1218 if (n==0) GCM_MUL(ctx,Xi);
1224 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
/* Bulk path: encrypt one chunk through stream, then hash the output. */
1225 while (len>=GHASH_CHUNK) {
1226 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1227 ctr += GHASH_CHUNK/16;
1228 if (is_endian.little)
1229 PUTU32(ctx->Yi.c+12,ctr);
1232 GHASH(ctx,out,GHASH_CHUNK);
/* Remaining whole blocks: i is len rounded down to a multiple of 16. */
1238 if ((i = (len&(size_t)-16))) {
1241 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1242 ctr += (unsigned int)j;
1243 if (is_endian.little)
1244 PUTU32(ctx->Yi.c+12,ctr);
/* Variant without GHASH: XOR each ciphertext block into Xi byte-wise. */
1254 for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
/* Final partial block: one key stream block from the single-block cipher. */
1261 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1263 if (is_endian.little)
1264 PUTU32(ctx->Yi.c+12,ctr);
1268 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
/*
 * CRYPTO_gcm128_decrypt_ctr32 - decrypt using a caller-supplied multi-block
 * counter-mode routine and fold the incoming ciphertext into the
 * authentication hash.
 *
 * ctx:    GCM context after CRYPTO_gcm128_setiv (and any AAD).
 * in:     ciphertext.
 * out:    plaintext buffer.
 * len:    number of bytes to process.
 * stream: callback invoked as stream(in, out, blocks, key, counter_block)
 *         to process a run of whole 16-byte blocks.
 *
 * Mirrors CRYPTO_gcm128_encrypt_ctr32 with the order reversed: the
 * ciphertext is hashed first and only then passed to stream.  The length
 * limit check matches the other entry points; the return values are on
 * lines not visible in this excerpt.
 */
1277 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1278 const unsigned char *in, unsigned char *out,
1279 size_t len,ctr128_f stream)
/* Run-time endianness probe: .little is non-zero on little-endian hosts. */
1281 const union { long one; char little; } is_endian = {1};
1282 unsigned int n, ctr;
1284 u64 mlen = ctx->len.u[1];
1285 #ifdef GCM_FUNCREF_4BIT
/* Local pointers used by GCM_MUL / GHASH on run-time-dispatch builds. */
1286 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1288 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1289 const u8 *inp,size_t len) = ctx->ghash;
/* Enforce the message length limit and detect wrap-around of mlen. */
1294 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1296 ctx->len.u[1] = mlen;
1299 /* First call to decrypt finalizes GHASH(AAD) */
/* Load the 32-bit block counter from the tail of Yi. */
1304 if (is_endian.little)
1305 ctr = GETU32(ctx->Yi.c+12);
/* Use up key stream left from a previous call. */
1313 *(out++) = c^ctx->EKi.c[n];
/* n wrapped to zero: the block is complete, so multiply by H. */
1318 if (n==0) GCM_MUL (ctx,Xi);
1324 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
/* Bulk path: hash one ciphertext chunk, then decrypt it through stream. */
1325 while (len>=GHASH_CHUNK) {
1326 GHASH(ctx,in,GHASH_CHUNK);
1327 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1328 ctr += GHASH_CHUNK/16;
1329 if (is_endian.little)
1330 PUTU32(ctx->Yi.c+12,ctr);
/* Remaining whole blocks: i is len rounded down to a multiple of 16. */
1338 if ((i = (len&(size_t)-16))) {
/* Variant without GHASH: XOR each ciphertext block into Xi byte-wise. */
1346 for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1353 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1354 ctr += (unsigned int)j;
1355 if (is_endian.little)
1356 PUTU32(ctx->Yi.c+12,ctr);
/* Final partial block: one key stream block from the single-block cipher. */
1364 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1366 if (is_endian.little)
1367 PUTU32(ctx->Yi.c+12,ctr);
1373 out[n] = c^ctx->EKi.c[n];
/*
 * CRYPTO_gcm128_finish - complete the authentication tag and optionally
 * compare it with an expected value.
 *
 * ctx: GCM context after all AAD and message data have been processed.
 * tag: expected tag to compare against, or NULL.
 * (The tag length parameter is declared on a line not visible here.)
 *
 * The AAD and message lengths are converted from bytes to bits, arranged
 * most-significant-byte first, XORed into Xi, and Xi is then XORed with
 * EK0.  After the call ctx->Xi holds the tag.  When tag is non-NULL and
 * len does not exceed sizeof(ctx->Xi), the memcmp() result is returned
 * (zero on match); the value returned otherwise is on a line not visible
 * in this excerpt.
 *
 * NOTE(review): memcmp() is not guaranteed to run in constant time, so
 * the comparison may leak how many leading tag bytes matched.  A
 * timing-independent comparison is normally preferred for authentication
 * tags.
 */
1382 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
/* Run-time endianness probe: .little is non-zero on little-endian hosts. */
1385 const union { long one; char little; } is_endian = {1};
/* Lengths are tracked in bytes; the tag covers them in bits. */
1386 u64 alen = ctx->len.u[0]<<3;
1387 u64 clen = ctx->len.u[1]<<3;
1388 #ifdef GCM_FUNCREF_4BIT
/* Local pointer used by the GCM_MUL macro on run-time-dispatch builds. */
1389 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
/* Put the bit lengths into most-significant-byte-first order. */
1395 if (is_endian.little) {
1397 alen = BSWAP8(alen);
1398 clen = BSWAP8(clen);
/* Alternative conversion: store into ctx->len and reload via GETU32. */
1402 ctx->len.u[0] = alen;
1403 ctx->len.u[1] = clen;
1405 alen = (u64)GETU32(p) <<32|GETU32(p+4);
1406 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
/* Fold the length block into the hash state. */
1410 ctx->Xi.u[0] ^= alen;
1411 ctx->Xi.u[1] ^= clen;
/* Mask with the encrypted initial counter block to form the tag. */
1414 ctx->Xi.u[0] ^= ctx->EK0.u[0];
1415 ctx->Xi.u[1] ^= ctx->EK0.u[1];
/* Compare only when a tag was supplied and fits within Xi. */
1417 if (tag && len<=sizeof(ctx->Xi))
1418 return memcmp(ctx->Xi.c,tag,len);
/*
 * CRYPTO_gcm128_tag - finalise the computation and copy out the tag.
 *
 * ctx: GCM context after all AAD and message data have been processed.
 * tag: destination buffer.
 * len: number of tag bytes wanted; at most sizeof(ctx->Xi.c) bytes are
 *      copied, so larger requests are silently truncated.
 *
 * Calls CRYPTO_gcm128_finish with a NULL tag purely for its side effect of
 * completing ctx->Xi, then copies the leading bytes of Xi.
 */
1423 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1425 CRYPTO_gcm128_finish(ctx, NULL, 0);
1426 memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
/*
 * CRYPTO_gcm128_new - allocate a GCM context and initialise it.
 *
 * key:   opaque key schedule passed through to CRYPTO_gcm128_init.
 * block: single-block encryption callback.
 *
 * The context is allocated with OPENSSL_malloc and initialised only when
 * the allocation succeeded.  The return statement is not visible in this
 * excerpt; presumably ret is returned, NULL on allocation failure.
 */
1429 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1431 GCM128_CONTEXT *ret;
1433 if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1434 CRYPTO_gcm128_init(ret,key,block);
/*
 * CRYPTO_gcm128_release - dispose of a context created by CRYPTO_gcm128_new.
 *
 * ctx: context to wipe.  The whole structure, which holds key-derived
 *      material, is overwritten with OPENSSL_cleanse.  Any NULL check and
 *      the deallocation itself are on lines not visible in this excerpt.
 */
1439 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1442 OPENSSL_cleanse(ctx,sizeof(*ctx));
1447 #if defined(SELFTEST)
1449 #include <openssl/aes.h>
1452 static const u8 K1[16],
1457 T1[]= {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1463 static const u8 P2[16],
1464 C2[]= {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1465 T2[]= {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
/*
 * Vector set 3. K3: 16-byte key. P3: 64-byte plaintext. IV3: 12-byte IV.
 * C3: 64-byte expected ciphertext. T3: 16-byte expected tag.
 */
1469 static const u8 K3[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1470 P3[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1471 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1472 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1473 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1474 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1475 C3[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1476 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1477 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1478 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1479 T3[]= {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
/*
 * Vector set 4. P4: 60-byte plaintext (not a multiple of 16 bytes).
 * A4: 20 bytes of additional authenticated data. C4: 60-byte expected
 * ciphertext. T4: 16-byte expected tag.
 */
1484 static const u8 P4[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1485 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1486 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1487 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1488 A4[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1489 0xab,0xad,0xda,0xd2},
1490 C4[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1491 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1492 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1493 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1494 T4[]= {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
/*
 * Vector set 5. IV5: 8-byte IV. C5: 60-byte expected ciphertext.
 * T5: 16-byte expected tag.
 */
1500 static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1501 C5[]= {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1502 0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1503 0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1504 0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1505 T5[]= {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
/*
 * Vector set 6. IV6: 60-byte IV. C6: 60-byte expected ciphertext.
 * T6: 16-byte expected tag.
 */
1511 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1512 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1513 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1514 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1515 C6[]= {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1516 0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1517 0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1518 0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1519 T6[]= {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
/*
 * Vector set 7. K7: 24-byte key with no initialiser, so as a static object
 * it is all zeroes. T7: 16-byte expected tag.
 */
1522 static const u8 K7[24],
1527 T7[]= {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
/*
 * Vector set 8. P8: 16-byte all-zero plaintext (static, no initialiser).
 * C8: 16-byte expected ciphertext. T8: 16-byte expected tag.
 */
1533 static const u8 P8[16],
1534 C8[]= {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1535 T8[]= {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
/*
 * Vector set 9. K9: 24-byte key. P9: 64-byte plaintext. IV9: 12-byte IV.
 * C9: 64-byte expected ciphertext. T9: 16-byte expected tag.
 */
1539 static const u8 K9[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1540 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1541 P9[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1542 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1543 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1544 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1545 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1546 C9[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1547 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1548 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1549 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1550 T9[]= {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
/*
 * Vector set 10. P10: 60-byte plaintext. A10: 20 bytes of additional
 * authenticated data. C10: 60-byte expected ciphertext. T10: 16-byte
 * expected tag.
 */
1555 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1556 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1557 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1558 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1559 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1560 0xab,0xad,0xda,0xd2},
1561 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1562 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1563 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1564 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1565 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
/*
 * Vector set 11. IV11: 8-byte IV. C11: 60-byte expected ciphertext.
 * T11: 16-byte expected tag.
 */
1571 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1572 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1573 0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1574 0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1575 0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1576 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
/*
 * Vector set 12. IV12: 60-byte IV. C12: 60-byte expected ciphertext.
 * T12: 16-byte expected tag.
 */
1582 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1583 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1584 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1585 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1586 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1587 0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1588 0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1589 0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1590 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
/*
 * Vector set 13. K13: 32-byte key with no initialiser, so as a static
 * object it is all zeroes. T13: 16-byte expected tag.
 */
1593 static const u8 K13[32],
1598 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
/*
 * Vector set 14. P14: 16-byte all-zero plaintext (static, no initialiser).
 * C14: 16-byte expected ciphertext. T14: 16-byte expected tag.
 */
1603 static const u8 P14[16],
1605 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1606 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
/*
 * Vector set 15. K15: 32-byte key. P15: 64-byte plaintext. IV15: 12-byte
 * IV. C15: 64-byte expected ciphertext. T15: 16-byte expected tag.
 */
1610 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1611 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1612 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1613 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1614 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1615 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1616 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1617 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1618 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1619 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1620 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1621 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
/*
 * Vector set 16. P16: 60-byte plaintext. A16: 20 bytes of additional
 * authenticated data. C16: 60-byte expected ciphertext. T16: 16-byte
 * expected tag.
 */
1626 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1627 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1628 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1629 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1630 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1631 0xab,0xad,0xda,0xd2},
1632 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1633 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1634 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1635 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1636 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
/*
 * Vector set 17. IV17: 8-byte IV. C17: 60-byte expected ciphertext.
 * T17: 16-byte expected tag.
 */
1642 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1643 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1644 0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1645 0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1646 0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1647 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
/*
 * Vector set 18. IV18: 60-byte IV. C18: 60-byte expected ciphertext.
 * T18: 16-byte expected tag.
 */
1653 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1654 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1655 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1656 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1657 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1658 0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1659 0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1660 0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1661 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
/*
 * TEST_CASE(n): runs vector set n through an encrypt pass and a decrypt
 * pass, using the surrounding scope's key, ctx and ret variables.
 *
 * Token pasting selects K##n (AES key, bit length = sizeof*8), IV##n,
 * A##n (additional authenticated data), P##n (plaintext), C##n (expected
 * ciphertext) and T##n (expected 16-byte tag).
 *
 * Encrypt pass: set key, init context, set IV, zero out[], feed A##n and
 * P##n when they are non-null, then fail if CRYPTO_gcm128_finish() returns
 * non-zero for T##n or out[] differs from C##n.
 * Decrypt pass: set the IV again, zero out[], feed A##n and C##n, then fail
 * if the tag check is non-zero or out[] differs from P##n.
 * Each failure increments ret and prints a message naming the pass and n.
 *
 * out[] is sized by sizeof(P##n), and that same size is used for every
 * encrypt/decrypt/memcmp call.
 */
1663 #define TEST_CASE(n) do { \
1664 u8 out[sizeof(P##n)]; \
1665 AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \
1666 CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \
1667 CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
1668 memset(out,0,sizeof(out)); \
1669 if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
1670 if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \
1671 if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
1672 (C##n && memcmp(out,C##n,sizeof(out)))) \
1673 ret++, printf ("encrypt test#%d failed.\n",n); \
1674 CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
1675 memset(out,0,sizeof(out)); \
1676 if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
1677 if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \
1678 if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
1679 (P##n && memcmp(out,P##n,sizeof(out)))) \
1680 ret++, printf ("decrypt test#%d failed.\n",n); \
1708 #ifdef OPENSSL_CPUID_OBJ
/*
 * Timing section, compiled only when OPENSSL_CPUID_OBJ is defined.
 * NOTE(review): the enclosing function's opening and closing lines, and
 * the declarations of key, ctx and i, are not visible in this listing.
 */
/* OPENSSL_rdtsc is declared here as a function returning size_t;
 * presumably it reads a cycle/time-stamp counter -- confirm. stop is not
 * used by any visible statement. */
1710 size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
/* 1024-byte work buffer, processed in place by the calls below. No visible
 * line initialises it before the first use. */
1711 union { u64 u; u8 c[1024]; } buf;
/* Key, context and IV are taken from vector set 1. */
1714 AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1715 CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1716 CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
/* One untimed GCM encrypt call precedes the timed one. */
1718 CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1719 start = OPENSSL_rdtsc();
1720 CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1721 gcm_t = OPENSSL_rdtsc() - start;
/* Same pattern for CRYPTO_ctr128_encrypt alone, reusing the GCM context's
 * Yi, EKi and mres fields as its counter, key-stream buffer and offset
 * arguments. */
1723 CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1724 &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1725 (block128_f)AES_encrypt);
1726 start = OPENSSL_rdtsc();
1727 CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1728 &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1729 (block128_f)AES_encrypt);
1730 ctr_t = OPENSSL_rdtsc() - start;
/* Prints counter ticks per byte for GCM, for CTR, and their difference.
 * NOTE(review): gcm_t and ctr_t are size_t, so gcm_t-ctr_t wraps to a huge
 * value whenever the CTR measurement happens to be the larger one. */
1732 printf("%.2f-%.2f=%.2f\n",
1733 gcm_t/(double)sizeof(buf),
1734 ctr_t/(double)sizeof(buf),
1735 (gcm_t-ctr_t)/(double)sizeof(buf));
/* GHASH alone: one untimed call, then 100 timed calls over the buffer. */
1737 GHASH(&ctx,buf.c,sizeof(buf));
1738 start = OPENSSL_rdtsc();
1739 for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1740 gcm_t = OPENSSL_rdtsc() - start;
/* i equals 100 after the loop, so this is ticks per byte per iteration. */
1741 printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);