[oweals/openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #define OPENSSL_FIPSAPI
51
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62
63 typedef struct { u64 hi,lo; } u128;
64
65 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
66 /* redefine, because alignment is ensured */
67 #undef  GETU32
68 #define GETU32(p)       BSWAP4(*(const u32 *)(p))
69 #undef  PUTU32
70 #define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
71 #endif
72
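/*
 * PACK() places a 16-bit constant in the most significant 16 bits of a
 * size_t, matching the position at which the rem_4bit/rem_8bit reduction
 * constants are XOR-ed into Z.hi below. REDUCE1BIT() performs one
 * right-shift step of the bit-reflected GHASH multiplication: V is shifted
 * right by one bit and, if the bit shifted out was set, the reduction
 * polynomial x^128 + x^7 + x^2 + x + 1 (represented by the 0xe1 constant
 * in the top byte) is folded back in.
 */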
73 #define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
74 #define REDUCE1BIT(V)   do { \
75         if (sizeof(size_t)==8) { \
76                 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
77                 V.lo  = (V.hi<<63)|(V.lo>>1); \
78                 V.hi  = (V.hi>>1 )^T; \
79         } \
80         else { \
81                 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
82                 V.lo  = (V.hi<<63)|(V.lo>>1); \
83                 V.hi  = (V.hi>>1 )^((u64)T<<32); \
84         } \
85 } while(0)
86
87 #ifdef  TABLE_BITS
88 #undef  TABLE_BITS
89 #endif
90 /*
91  * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
92  * should never be set to 8: 8 is effectively reserved for testing purposes.
93  * TABLE_BITS>1 selects the lookup-table-driven implementations referred to
94  * as "Shoup's" in the GCM specification. In other words OpenSSL does not
95  * cover the whole spectrum of possible table-driven implementations. Why?
96  * In the non-"Shoup's" case the memory access pattern is segmented in such
97  * a manner that cache-timing information can reveal a fair portion of the
98  * intermediate hash value. Given that the ciphertext is always available
99  * to an attacker, the attacker can attempt to deduce the secret parameter
100  * H and, if successful, tamper with messages [which is trivial in CTR
101  * mode]. In the "Shoup's" case it is not as easy, but there is no reason
102  * to believe that it is resistant to cache-timing attacks either. The
103  * thing about the "8-bit" implementation is that it consumes 16 (sixteen)
104  * times more memory, 4KB per individual key + 1KB shared. On the plus
105  * side it should be about twice as fast as the "4-bit" version, and for
106  * gcc-generated x86[_64] code the "8-bit" version was observed to run
107  * ~75% faster, closer to 100% for commercial compilers... Yet the "4-bit"
108  * procedure is preferred, because it is believed to provide a better
109  * security-performance balance and adequate all-round performance.
110  * "All-round" refers to things like:
111  *
112  * - shorter setup time effectively improves overall timing for
113  *   handling short messages;
114  * - larger table allocation can become unbearable because of VM
115  *   subsystem penalties (for example on Windows, freeing a large enough
116  *   block results in VM working set trimming, meaning that a subsequent
117  *   malloc would immediately incur working set expansion);
118  * - a larger table has a larger cache footprint, which can affect the
119  *   performance of other code paths (not necessarily even from the same
120  *   thread in a Hyper-Threading world);
121  */
122 #define TABLE_BITS 4
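/*
 * For reference, the memory figures above work out as follows: a "4-bit"
 * Shoup table is 16 u128 entries, i.e. 16*16 = 256 bytes per key (plus the
 * small shared rem_4bit table), while an "8-bit" table is 256 u128 entries,
 * i.e. 256*16 = 4096 bytes per key plus the shared 256-entry rem_8bit table.
 */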
123
124 #if     TABLE_BITS==8
125
126 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
127 {
128         int  i, j;
129         u128 V;
130
131         Htable[0].hi = 0;
132         Htable[0].lo = 0;
133         V.hi = H[0];
134         V.lo = H[1];
135
136         for (Htable[128]=V, i=64; i>0; i>>=1) {
137                 REDUCE1BIT(V);
138                 Htable[i] = V;
139         }
140
141         for (i=2; i<256; i<<=1) {
142                 u128 *Hi = Htable+i, H0 = *Hi;
143                 for (j=1; j<i; ++j) {
144                         Hi[j].hi = H0.hi^Htable[j].hi;
145                         Hi[j].lo = H0.lo^Htable[j].lo;
146                 }
147         }
148 }
149
150 static void gcm_gmult_8bit(u64 Xi[2], u128 Htable[256])
151 {
152         u128 Z = { 0, 0};
153         const u8 *xi = (const u8 *)Xi+15;
154         size_t rem, n = *xi;
155         const union { long one; char little; } is_endian = {1};
156         static const size_t rem_8bit[256] = {
157                 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
158                 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
159                 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
160                 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
161                 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
162                 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
163                 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
164                 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
165                 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
166                 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
167                 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
168                 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
169                 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
170                 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
171                 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
172                 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
173                 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
174                 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
175                 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
176                 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
177                 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
178                 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
179                 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
180                 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
181                 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
182                 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
183                 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
184                 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
185                 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
186                 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
187                 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
188                 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
189                 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
190                 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
191                 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
192                 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
193                 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
194                 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
195                 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
196                 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
197                 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
198                 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
199                 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
200                 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
201                 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
202                 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
203                 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
204                 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
205                 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
206                 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
207                 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
208                 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
209                 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
210                 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
211                 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
212                 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
213                 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
214                 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
215                 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
216                 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
217                 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
218                 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
219                 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
220                 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
221
222         while (1) {
223                 Z.hi ^= Htable[n].hi;
224                 Z.lo ^= Htable[n].lo;
225
226                 if ((u8 *)Xi==xi)       break;
227
228                 n = *(--xi);
229
230                 rem  = (size_t)Z.lo&0xff;
231                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
232                 Z.hi = (Z.hi>>8);
233                 if (sizeof(size_t)==8)
234                         Z.hi ^= rem_8bit[rem];
235                 else
236                         Z.hi ^= (u64)rem_8bit[rem]<<32;
237         }
238
239         if (is_endian.little) {
240 #ifdef BSWAP8
241                 Xi[0] = BSWAP8(Z.hi);
242                 Xi[1] = BSWAP8(Z.lo);
243 #else
244                 u8 *p = (u8 *)Xi;
245                 u32 v;
246                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
247                 v = (u32)(Z.hi);        PUTU32(p+4,v);
248                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
249                 v = (u32)(Z.lo);        PUTU32(p+12,v);
250 #endif
251         }
252         else {
253                 Xi[0] = Z.hi;
254                 Xi[1] = Z.lo;
255         }
256 }
257 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
258
259 #elif   TABLE_BITS==4
260
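/*
 * gcm_init_4bit() precomputes the GF(2^128) product of H with every 4-bit
 * multiplier (indexed in GHASH's bit-reflected order): the single-bit
 * entries are derived from H with REDUCE1BIT(), and the remaining entries
 * follow by XOR, since multiplication by a fixed element is linear over
 * GF(2).
 */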
261 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
262 {
263         u128 V;
264 #if defined(OPENSSL_SMALL_FOOTPRINT)
265         int  i;
266 #endif
267
268         Htable[0].hi = 0;
269         Htable[0].lo = 0;
270         V.hi = H[0];
271         V.lo = H[1];
272
273 #if defined(OPENSSL_SMALL_FOOTPRINT)
274         for (Htable[8]=V, i=4; i>0; i>>=1) {
275                 REDUCE1BIT(V);
276                 Htable[i] = V;
277         }
278
279         for (i=2; i<16; i<<=1) {
280                 u128 *Hi = Htable+i;
281                 int   j;
282                 for (V=*Hi, j=1; j<i; ++j) {
283                         Hi[j].hi = V.hi^Htable[j].hi;
284                         Hi[j].lo = V.lo^Htable[j].lo;
285                 }
286         }
287 #else
288         Htable[8] = V;
289         REDUCE1BIT(V);
290         Htable[4] = V;
291         REDUCE1BIT(V);
292         Htable[2] = V;
293         REDUCE1BIT(V);
294         Htable[1] = V;
295         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
296         V=Htable[4];
297         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
298         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
299         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
300         V=Htable[8];
301         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
302         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
303         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
304         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
305         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
306         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
307         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
308 #endif
309 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
310         /*
311          * ARM assembler expects specific dword order in Htable.
312          */
313         {
314         int j;
315         const union { long one; char little; } is_endian = {1};
316
317         if (is_endian.little)
318                 for (j=0;j<16;++j) {
319                         V = Htable[j];
320                         Htable[j].hi = V.lo;
321                         Htable[j].lo = V.hi;
322                 }
323         else
324                 for (j=0;j<16;++j) {
325                         V = Htable[j];
326                         Htable[j].hi = V.lo<<32|V.lo>>32;
327                         Htable[j].lo = V.hi<<32|V.hi>>32;
328                 }
329         }
330 #endif
331 }
332
333 #ifndef GHASH_ASM
334 static const size_t rem_4bit[16] = {
335         PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
336         PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
337         PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
338         PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
339
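/*
 * gcm_gmult_4bit() computes Xi = Xi * H using the 4-bit Shoup table:
 * Xi is consumed one nibble at a time starting from its last byte, each
 * step XOR-ing the corresponding Htable entry into Z and shifting Z right
 * by four bits, with the bits shifted out folded back in via rem_4bit.
 */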
340 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
341 {
342         u128 Z;
343         int cnt = 15;
344         size_t rem, nlo, nhi;
345         const union { long one; char little; } is_endian = {1};
346
347         nlo  = ((const u8 *)Xi)[15];
348         nhi  = nlo>>4;
349         nlo &= 0xf;
350
351         Z.hi = Htable[nlo].hi;
352         Z.lo = Htable[nlo].lo;
353
354         while (1) {
355                 rem  = (size_t)Z.lo&0xf;
356                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
357                 Z.hi = (Z.hi>>4);
358                 if (sizeof(size_t)==8)
359                         Z.hi ^= rem_4bit[rem];
360                 else
361                         Z.hi ^= (u64)rem_4bit[rem]<<32;
362
363                 Z.hi ^= Htable[nhi].hi;
364                 Z.lo ^= Htable[nhi].lo;
365
366                 if (--cnt<0)            break;
367
368                 nlo  = ((const u8 *)Xi)[cnt];
369                 nhi  = nlo>>4;
370                 nlo &= 0xf;
371
372                 rem  = (size_t)Z.lo&0xf;
373                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
374                 Z.hi = (Z.hi>>4);
375                 if (sizeof(size_t)==8)
376                         Z.hi ^= rem_4bit[rem];
377                 else
378                         Z.hi ^= (u64)rem_4bit[rem]<<32;
379
380                 Z.hi ^= Htable[nlo].hi;
381                 Z.lo ^= Htable[nlo].lo;
382         }
383
384         if (is_endian.little) {
385 #ifdef BSWAP8
386                 Xi[0] = BSWAP8(Z.hi);
387                 Xi[1] = BSWAP8(Z.lo);
388 #else
389                 u8 *p = (u8 *)Xi;
390                 u32 v;
391                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
392                 v = (u32)(Z.hi);        PUTU32(p+4,v);
393                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
394                 v = (u32)(Z.lo);        PUTU32(p+12,v);
395 #endif
396         }
397         else {
398                 Xi[0] = Z.hi;
399                 Xi[1] = Z.lo;
400         }
401 }
402
403 #if !defined(OPENSSL_SMALL_FOOTPRINT)
404 /*
405  * Streamed version of gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt
406  * for details... Compiler-generated code doesn't seem to give any
407  * performance improvement, at least not on x86[_64]. It's here
408  * mostly as a reference and a placeholder for possible future
409  * non-trivial optimization[s]...
410  */
411 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
412                                 const u8 *inp,size_t len)
413 {
414     u128 Z;
415     int cnt;
416     size_t rem, nlo, nhi;
417     const union { long one; char little; } is_endian = {1};
418
419 #if 1
420     do {
421         cnt  = 15;
422         nlo  = ((const u8 *)Xi)[15];
423         nlo ^= inp[15];
424         nhi  = nlo>>4;
425         nlo &= 0xf;
426
427         Z.hi = Htable[nlo].hi;
428         Z.lo = Htable[nlo].lo;
429
430         while (1) {
431                 rem  = (size_t)Z.lo&0xf;
432                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
433                 Z.hi = (Z.hi>>4);
434                 if (sizeof(size_t)==8)
435                         Z.hi ^= rem_4bit[rem];
436                 else
437                         Z.hi ^= (u64)rem_4bit[rem]<<32;
438
439                 Z.hi ^= Htable[nhi].hi;
440                 Z.lo ^= Htable[nhi].lo;
441
442                 if (--cnt<0)            break;
443
444                 nlo  = ((const u8 *)Xi)[cnt];
445                 nlo ^= inp[cnt];
446                 nhi  = nlo>>4;
447                 nlo &= 0xf;
448
449                 rem  = (size_t)Z.lo&0xf;
450                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
451                 Z.hi = (Z.hi>>4);
452                 if (sizeof(size_t)==8)
453                         Z.hi ^= rem_4bit[rem];
454                 else
455                         Z.hi ^= (u64)rem_4bit[rem]<<32;
456
457                 Z.hi ^= Htable[nlo].hi;
458                 Z.lo ^= Htable[nlo].lo;
459         }
460 #else
461     /*
462      * Extra 256+16 bytes per-key plus 512 bytes shared tables
463      * [should] give ~50% improvement... One could have PACK()-ed
464      * the rem_8bit even here, but the priority is to minimize
465      * cache footprint...
466      */ 
467     u128 Hshr4[16];     /* Htable shifted right by 4 bits */
468     u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
469     static const unsigned short rem_8bit[256] = {
470         0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
471         0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
472         0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
473         0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
474         0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
475         0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
476         0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
477         0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
478         0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
479         0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
480         0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
481         0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
482         0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
483         0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
484         0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
485         0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
486         0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
487         0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
488         0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
489         0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
490         0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
491         0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
492         0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
493         0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
494         0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
495         0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
496         0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
497         0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
498         0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
499         0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
500         0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
501         0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
502     /*
503      * This pre-processing phase slows the procedure down by approximately
504      * as much time as it makes each loop spin faster. In other words,
505      * single-block performance is approximately the same as with the
506      * straightforward "4-bit" implementation, and from there it only gets faster...
507      */
508     for (cnt=0; cnt<16; ++cnt) {
509         Z.hi = Htable[cnt].hi;
510         Z.lo = Htable[cnt].lo;
511         Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
512         Hshr4[cnt].hi = (Z.hi>>4);
513         Hshl4[cnt]    = (u8)(Z.lo<<4);
514     }
515
516     do {
517         for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
518                 nlo  = ((const u8 *)Xi)[cnt];
519                 nlo ^= inp[cnt];
520                 nhi  = nlo>>4;
521                 nlo &= 0xf;
522
523                 Z.hi ^= Htable[nlo].hi;
524                 Z.lo ^= Htable[nlo].lo;
525
526                 rem = (size_t)Z.lo&0xff;
527
528                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
529                 Z.hi = (Z.hi>>8);
530
531                 Z.hi ^= Hshr4[nhi].hi;
532                 Z.lo ^= Hshr4[nhi].lo;
533                 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
534         }
535
536         nlo  = ((const u8 *)Xi)[0];
537         nlo ^= inp[0];
538         nhi  = nlo>>4;
539         nlo &= 0xf;
540
541         Z.hi ^= Htable[nlo].hi;
542         Z.lo ^= Htable[nlo].lo;
543
544         rem = (size_t)Z.lo&0xf;
545
546         Z.lo = (Z.hi<<60)|(Z.lo>>4);
547         Z.hi = (Z.hi>>4);
548
549         Z.hi ^= Htable[nhi].hi;
550         Z.lo ^= Htable[nhi].lo;
551         Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
552 #endif
553
554         if (is_endian.little) {
555 #ifdef BSWAP8
556                 Xi[0] = BSWAP8(Z.hi);
557                 Xi[1] = BSWAP8(Z.lo);
558 #else
559                 u8 *p = (u8 *)Xi;
560                 u32 v;
561                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
562                 v = (u32)(Z.hi);        PUTU32(p+4,v);
563                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
564                 v = (u32)(Z.lo);        PUTU32(p+12,v);
565 #endif
566         }
567         else {
568                 Xi[0] = Z.hi;
569                 Xi[1] = Z.lo;
570         }
571     } while (inp+=16, len-=16);
572 }
573 #endif
574 #else
575 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
576 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
577 #endif
578
579 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
580 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
581 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
582 /* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
583  * thrashing effect. In other words the idea is to hash data while it's
584  * still in L1 cache after the encryption pass... */
585 #define GHASH_CHUNK       (3*1024)
586 #endif
587
588 #else   /* TABLE_BITS */
589
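/*
 * Table-less fallback: plain bit-serial GF(2^128) multiplication. For each
 * bit of Xi, the running value V (initialized to H and advanced with
 * REDUCE1BIT()) is conditionally XOR-ed into the accumulator Z.
 */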
590 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
591 {
592         u128 V,Z = { 0,0 };
593         long X;
594         int  i,j;
595         const long *xi = (const long *)Xi;
596         const union { long one; char little; } is_endian = {1};
597
598         V.hi = H[0];    /* H is in host byte order, no byte swapping */
599         V.lo = H[1];
600
601         for (j=0; j<16/sizeof(long); ++j) {
602                 if (is_endian.little) {
603                         if (sizeof(long)==8) {
604 #ifdef BSWAP8
605                                 X = (long)(BSWAP8(xi[j]));
606 #else
607                                 const u8 *p = (const u8 *)(xi+j);
608                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
609 #endif
610                         }
611                         else {
612                                 const u8 *p = (const u8 *)(xi+j);
613                                 X = (long)GETU32(p);
614                         }
615                 }
616                 else
617                         X = xi[j];
618
619                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
620                         u64 M = (u64)(X>>(8*sizeof(long)-1));
621                         Z.hi ^= V.hi&M;
622                         Z.lo ^= V.lo&M;
623
624                         REDUCE1BIT(V);
625                 }
626         }
627
628         if (is_endian.little) {
629 #ifdef BSWAP8
630                 Xi[0] = BSWAP8(Z.hi);
631                 Xi[1] = BSWAP8(Z.lo);
632 #else
633                 u8 *p = (u8 *)Xi;
634                 u32 v;
635                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
636                 v = (u32)(Z.hi);        PUTU32(p+4,v);
637                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
638                 v = (u32)(Z.lo);        PUTU32(p+12,v);
639 #endif
640         }
641         else {
642                 Xi[0] = Z.hi;
643                 Xi[1] = Z.lo;
644         }
645 }
646 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
647
648 #endif
649
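/*
 * Field summary (names as in the GCM specification):
 *   Yi   - current counter block, Xi - GHASH accumulator, H - hash subkey,
 *   EKi  - E(K,Yi) keystream block, EK0 - E(K,Y0) used to mask the tag,
 *   len  - AAD length (u[0]) and message length (u[1]) in bytes,
 *   ares/mres - number of AAD/message bytes in the current partial block.
 */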
650 struct gcm128_context {
651         /* The following 6 names match the names used in the GCM specification */
652         union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,
653                                                 Xi,H,len;
654         /* Pre-computed table used by gcm_gmult_* */
655 #if TABLE_BITS==8
656         u128 Htable[256];
657 #else
658         u128 Htable[16];
659         void (*gmult)(u64 Xi[2],const u128 Htable[16]);
660         void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
661 #endif
662         unsigned int mres, ares;
663         block128_f block;
664         void *key;
665 };
666
667 #if     TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
668         (defined(__i386)        || defined(__i386__)    || \
669          defined(__x86_64)      || defined(__x86_64__)  || \
670          defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
671 # define GHASH_ASM_IAX
672 extern unsigned int OPENSSL_ia32cap_P[2];
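/*
 * OPENSSL_ia32cap_P[0] and [1] carry (essentially) the EDX and ECX feature
 * words from CPUID leaf 1; bit 1 of ECX, checked in CRYPTO_gcm128_init
 * below, signals PCLMULQDQ support and bit 23 of EDX signals MMX.
 */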
673
674 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
675 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
676 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
677
678 # if    defined(__i386) || defined(__i386__) || defined(_M_IX86)
679 #  define GHASH_ASM_X86
680 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
681 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
682
683 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
684 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
685 # endif
686
687 # undef  GCM_MUL
688 # define GCM_MUL(ctx,Xi)   (*((ctx)->gmult))(ctx->Xi.u,ctx->Htable)
689 # undef  GHASH
690 # define GHASH(ctx,in,len) (*((ctx)->ghash))((ctx)->Xi.u,(ctx)->Htable,in,len)
691 #endif
692
693 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
694 {
695         const union { long one; char little; } is_endian = {1};
696
697         memset(ctx,0,sizeof(*ctx));
698         ctx->block = block;
699         ctx->key   = key;
700
701         (*block)(ctx->H.c,ctx->H.c,key);
702
703         if (is_endian.little) {
704                 /* H is stored in host byte order */
705 #ifdef BSWAP8
706                 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
707                 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
708 #else
709                 u8 *p = ctx->H.c;
710                 u64 hi,lo;
711                 hi = (u64)GETU32(p)  <<32|GETU32(p+4);
712                 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
713                 ctx->H.u[0] = hi;
714                 ctx->H.u[1] = lo;
715 #endif
716         }
717
718 #if     TABLE_BITS==8
719         gcm_init_8bit(ctx->Htable,ctx->H.u);
720 #elif   TABLE_BITS==4
721 # if    defined(GHASH_ASM_IAX)                  /* both x86 and x86_64 */
722         if (OPENSSL_ia32cap_P[1]&(1<<1)) {
723                 gcm_init_clmul(ctx->Htable,ctx->H.u);
724                 ctx->gmult = gcm_gmult_clmul;
725                 ctx->ghash = gcm_ghash_clmul;
726                 return;
727         }
728         gcm_init_4bit(ctx->Htable,ctx->H.u);
729 #  if   defined(GHASH_ASM_X86)                  /* x86 only */
730         if (OPENSSL_ia32cap_P[0]&(1<<23)) {
731                 ctx->gmult = gcm_gmult_4bit_mmx;
732                 ctx->ghash = gcm_ghash_4bit_mmx;
733         } else {
734                 ctx->gmult = gcm_gmult_4bit_x86;
735                 ctx->ghash = gcm_ghash_4bit_x86;
736         }
737 #  else
738         ctx->gmult = gcm_gmult_4bit;
739         ctx->ghash = gcm_ghash_4bit;
740 #  endif
741 # else
742         gcm_init_4bit(ctx->Htable,ctx->H.u);
743 # endif
744 #endif
745 }
746
747 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
748 {
749         const union { long one; char little; } is_endian = {1};
750         unsigned int ctr;
751
752         ctx->Yi.u[0]  = 0;
753         ctx->Yi.u[1]  = 0;
754         ctx->Xi.u[0]  = 0;
755         ctx->Xi.u[1]  = 0;
756         ctx->len.u[0] = 0;      /* AAD length */
757         ctx->len.u[1] = 0;      /* message length */
758         ctx->ares = 0;
759         ctx->mres = 0;
760
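        /*
         * Per the GCM specification a 96-bit IV is used directly as
         * Y0 = IV || 0^31 || 1, while any other length is first hashed:
         * Y0 = GHASH(IV || padding || [len(IV)]_64).
         */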
761         if (len==12) {
762                 memcpy(ctx->Yi.c,iv,12);
763                 ctx->Yi.c[15]=1;
764                 ctr=1;
765         }
766         else {
767                 size_t i;
768                 u64 len0 = len;
769
770                 while (len>=16) {
771                         for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
772                         GCM_MUL(ctx,Yi);
773                         iv += 16;
774                         len -= 16;
775                 }
776                 if (len) {
777                         for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
778                         GCM_MUL(ctx,Yi);
779                 }
780                 len0 <<= 3;
781                 if (is_endian.little) {
782 #ifdef BSWAP8
783                         ctx->Yi.u[1]  ^= BSWAP8(len0);
784 #else
785                         ctx->Yi.c[8]  ^= (u8)(len0>>56);
786                         ctx->Yi.c[9]  ^= (u8)(len0>>48);
787                         ctx->Yi.c[10] ^= (u8)(len0>>40);
788                         ctx->Yi.c[11] ^= (u8)(len0>>32);
789                         ctx->Yi.c[12] ^= (u8)(len0>>24);
790                         ctx->Yi.c[13] ^= (u8)(len0>>16);
791                         ctx->Yi.c[14] ^= (u8)(len0>>8);
792                         ctx->Yi.c[15] ^= (u8)(len0);
793 #endif
794                 }
795                 else
796                         ctx->Yi.u[1]  ^= len0;
797
798                 GCM_MUL(ctx,Yi);
799
800                 if (is_endian.little)
801                         ctr = GETU32(ctx->Yi.c+12);
802                 else
803                         ctr = ctx->Yi.d[3];
804         }
805
806         (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
807         ++ctr;
808         if (is_endian.little)
809                 PUTU32(ctx->Yi.c+12,ctr);
810         else
811                 ctx->Yi.d[3] = ctr;
812 }
813
814 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
815 {
816         size_t i;
817         unsigned int n;
818         u64 alen = ctx->len.u[0];
819
820         if (ctx->len.u[1]) return -2;
821
822         alen += len;
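        /* NIST SP 800-38D limits the AAD to 2^64-1 bits, i.e. ~2^61 bytes */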
823         if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
824                 return -1;
825         ctx->len.u[0] = alen;
826
827         n = ctx->ares;
828         if (n) {
829                 while (n && len) {
830                         ctx->Xi.c[n] ^= *(aad++);
831                         --len;
832                         n = (n+1)%16;
833                 }
834                 if (n==0) GCM_MUL(ctx,Xi);
835                 else {
836                         ctx->ares = n;
837                         return 0;
838                 }
839         }
840
841 #ifdef GHASH
842         if ((i = (len&(size_t)-16))) {
843                 GHASH(ctx,aad,i);
844                 aad += i;
845                 len -= i;
846         }
847 #else
848         while (len>=16) {
849                 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
850                 GCM_MUL(ctx,Xi);
851                 aad += 16;
852                 len -= 16;
853         }
854 #endif
855         if (len) {
856                 n = (unsigned int)len;
857                 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
858         }
859
860         ctx->ares = n;
861         return 0;
862 }
863
864 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
865                 const unsigned char *in, unsigned char *out,
866                 size_t len)
867 {
868         const union { long one; char little; } is_endian = {1};
869         unsigned int n, ctr;
870         size_t i;
871         u64 mlen = ctx->len.u[1];
872
873 #if 0
874         n = (unsigned int)mlen%16; /* alternative to ctx->mres */
875 #endif
876         mlen += len;
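        /* NIST SP 800-38D limits the plaintext to 2^39-256 bits, i.e. 2^36-32 bytes */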
877         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
878                 return -1;
879         ctx->len.u[1] = mlen;
880
881         if (ctx->ares) {
882                 /* First call to encrypt finalizes GHASH(AAD) */
883                 GCM_MUL(ctx,Xi);
884                 ctx->ares = 0;
885         }
886
887         if (is_endian.little)
888                 ctr = GETU32(ctx->Yi.c+12);
889         else
890                 ctr = ctx->Yi.d[3];
891
892         n = ctx->mres;
893 #if !defined(OPENSSL_SMALL_FOOTPRINT)
894         if (16%sizeof(size_t) == 0) do {        /* always true actually */
895                 if (n) {
896                         while (n && len) {
897                                 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
898                                 --len;
899                                 n = (n+1)%16;
900                         }
901                         if (n==0) GCM_MUL(ctx,Xi);
902                         else {
903                                 ctx->mres = n;
904                                 return 0;
905                         }
906                 }
907 #if defined(STRICT_ALIGNMENT)
908                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
909                         break;
910 #endif
911 #if defined(GHASH) && defined(GHASH_CHUNK)
912                 while (len>=GHASH_CHUNK) {
913                     size_t j=GHASH_CHUNK;
914
915                     while (j) {
916                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
917                         ++ctr;
918                         if (is_endian.little)
919                                 PUTU32(ctx->Yi.c+12,ctr);
920                         else
921                                 ctx->Yi.d[3] = ctr;
922                         for (i=0; i<16; i+=sizeof(size_t))
923                                 *(size_t *)(out+i) =
924                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
925                         out += 16;
926                         in  += 16;
927                         j   -= 16;
928                     }
929                     GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
930                     len -= GHASH_CHUNK;
931                 }
932                 if ((i = (len&(size_t)-16))) {
933                     size_t j=i;
934
935                     while (len>=16) {
936                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
937                         ++ctr;
938                         if (is_endian.little)
939                                 PUTU32(ctx->Yi.c+12,ctr);
940                         else
941                                 ctx->Yi.d[3] = ctr;
942                         for (i=0; i<16; i+=sizeof(size_t))
943                                 *(size_t *)(out+i) =
944                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
945                         out += 16;
946                         in  += 16;
947                         len -= 16;
948                     }
949                     GHASH(ctx,out-j,j);
950                 }
951 #else
952                 while (len>=16) {
953                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
954                         ++ctr;
955                         if (is_endian.little)
956                                 PUTU32(ctx->Yi.c+12,ctr);
957                         else
958                                 ctx->Yi.d[3] = ctr;
959                         for (i=0; i<16; i+=sizeof(size_t))
960                                 *(size_t *)(ctx->Xi.c+i) ^=
961                                 *(size_t *)(out+i) =
962                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
963                         GCM_MUL(ctx,Xi);
964                         out += 16;
965                         in  += 16;
966                         len -= 16;
967                 }
968 #endif
969                 if (len) {
970                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
971                         ++ctr;
972                         if (is_endian.little)
973                                 PUTU32(ctx->Yi.c+12,ctr);
974                         else
975                                 ctx->Yi.d[3] = ctr;
976                         while (len--) {
977                                 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
978                                 ++n;
979                         }
980                 }
981
982                 ctx->mres = n;
983                 return 0;
984         } while(0);
985 #endif
986         for (i=0;i<len;++i) {
987                 if (n==0) {
988                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
989                         ++ctr;
990                         if (is_endian.little)
991                                 PUTU32(ctx->Yi.c+12,ctr);
992                         else
993                                 ctx->Yi.d[3] = ctr;
994                 }
995                 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
996                 n = (n+1)%16;
997                 if (n==0)
998                         GCM_MUL(ctx,Xi);
999         }
1000
1001         ctx->mres = n;
1002         return 0;
1003 }
1004
1005 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1006                 const unsigned char *in, unsigned char *out,
1007                 size_t len)
1008 {
1009         const union { long one; char little; } is_endian = {1};
1010         unsigned int n, ctr;
1011         size_t i;
1012         u64 mlen = ctx->len.u[1];
1013
1014         mlen += len;
1015         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1016                 return -1;
1017         ctx->len.u[1] = mlen;
1018
1019         if (ctx->ares) {
1020                 /* First call to decrypt finalizes GHASH(AAD) */
1021                 GCM_MUL(ctx,Xi);
1022                 ctx->ares = 0;
1023         }
1024
1025         if (is_endian.little)
1026                 ctr = GETU32(ctx->Yi.c+12);
1027         else
1028                 ctr = ctx->Yi.d[3];
1029
1030         n = ctx->mres;
1031 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1032         if (16%sizeof(size_t) == 0) do {        /* always true actually */
1033                 if (n) {
1034                         while (n && len) {
1035                                 u8 c = *(in++);
1036                                 *(out++) = c^ctx->EKi.c[n];
1037                                 ctx->Xi.c[n] ^= c;
1038                                 --len;
1039                                 n = (n+1)%16;
1040                         }
1041                         if (n==0) GCM_MUL (ctx,Xi);
1042                         else {
1043                                 ctx->mres = n;
1044                                 return 0;
1045                         }
1046                 }
1047 #if defined(STRICT_ALIGNMENT)
1048                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1049                         break;
1050 #endif
1051 #if defined(GHASH) && defined(GHASH_CHUNK)
1052                 while (len>=GHASH_CHUNK) {
1053                     size_t j=GHASH_CHUNK;
1054
1055                     GHASH(ctx,in,GHASH_CHUNK);
1056                     while (j) {
1057                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1058                         ++ctr;
1059                         if (is_endian.little)
1060                                 PUTU32(ctx->Yi.c+12,ctr);
1061                         else
1062                                 ctx->Yi.d[3] = ctr;
1063                         for (i=0; i<16; i+=sizeof(size_t))
1064                                 *(size_t *)(out+i) =
1065                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1066                         out += 16;
1067                         in  += 16;
1068                         j   -= 16;
1069                     }
1070                     len -= GHASH_CHUNK;
1071                 }
1072                 if ((i = (len&(size_t)-16))) {
1073                     GHASH(ctx,in,i);
1074                     while (len>=16) {
1075                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1076                         ++ctr;
1077                         if (is_endian.little)
1078                                 PUTU32(ctx->Yi.c+12,ctr);
1079                         else
1080                                 ctx->Yi.d[3] = ctr;
1081                         for (i=0; i<16; i+=sizeof(size_t))
1082                                 *(size_t *)(out+i) =
1083                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1084                         out += 16;
1085                         in  += 16;
1086                         len -= 16;
1087                     }
1088                 }
1089 #else
1090                 while (len>=16) {
1091                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1092                         ++ctr;
1093                         if (is_endian.little)
1094                                 PUTU32(ctx->Yi.c+12,ctr);
1095                         else
1096                                 ctx->Yi.d[3] = ctr;
1097                         for (i=0; i<16; i+=sizeof(size_t)) {
1098                                 size_t c = *(size_t *)(in+i);
1099                                 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1100                                 *(size_t *)(ctx->Xi.c+i) ^= c;
1101                         }
1102                         GCM_MUL(ctx,Xi);
1103                         out += 16;
1104                         in  += 16;
1105                         len -= 16;
1106                 }
1107 #endif
1108                 if (len) {
1109                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1110                         ++ctr;
1111                         if (is_endian.little)
1112                                 PUTU32(ctx->Yi.c+12,ctr);
1113                         else
1114                                 ctx->Yi.d[3] = ctr;
1115                         while (len--) {
1116                                 u8 c = in[n];
1117                                 ctx->Xi.c[n] ^= c;
1118                                 out[n] = c^ctx->EKi.c[n];
1119                                 ++n;
1120                         }
1121                 }
1122
1123                 ctx->mres = n;
1124                 return 0;
1125         } while(0);
1126 #endif
1127         for (i=0;i<len;++i) {
1128                 u8 c;
1129                 if (n==0) {
1130                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1131                         ++ctr;
1132                         if (is_endian.little)
1133                                 PUTU32(ctx->Yi.c+12,ctr);
1134                         else
1135                                 ctx->Yi.d[3] = ctr;
1136                 }
1137                 c = in[i];
1138                 out[i] = c^ctx->EKi.c[n];
1139                 ctx->Xi.c[n] ^= c;
1140                 n = (n+1)%16;
1141                 if (n==0)
1142                         GCM_MUL(ctx,Xi);
1143         }
1144
1145         ctx->mres = n;
1146         return 0;
1147 }
1148
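/*
 * The *_ctr32 variants delegate bulk counter-mode encryption to a
 * caller-supplied ctr128_f routine (typically an accelerated AES-CTR
 * implementation), feeding it whole 16-byte blocks while this code keeps
 * Yi, the GHASH state and any partial-block residue consistent with the
 * block-at-a-time paths above.
 */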
1149 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1150                 const unsigned char *in, unsigned char *out,
1151                 size_t len, ctr128_f stream)
1152 {
1153         const union { long one; char little; } is_endian = {1};
1154         unsigned int n, ctr;
1155         size_t i;
1156         u64 mlen = ctx->len.u[1];
1157
1158         mlen += len;
1159         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1160                 return -1;
1161         ctx->len.u[1] = mlen;
1162
1163         if (ctx->ares) {
1164                 /* First call to encrypt finalizes GHASH(AAD) */
1165                 GCM_MUL(ctx,Xi);
1166                 ctx->ares = 0;
1167         }
1168
1169         if (is_endian.little)
1170                 ctr = GETU32(ctx->Yi.c+12);
1171         else
1172                 ctr = ctx->Yi.d[3];
1173
1174         n = ctx->mres;
1175         if (n) {
1176                 while (n && len) {
1177                         ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1178                         --len;
1179                         n = (n+1)%16;
1180                 }
1181                 if (n==0) GCM_MUL(ctx,Xi);
1182                 else {
1183                         ctx->mres = n;
1184                         return 0;
1185                 }
1186         }
1187 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1188         while (len>=GHASH_CHUNK) {
1189                 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1190                 ctr += GHASH_CHUNK/16;
1191                 if (is_endian.little)
1192                         PUTU32(ctx->Yi.c+12,ctr);
1193                 else
1194                         ctx->Yi.d[3] = ctr;
1195                 GHASH(ctx,out,GHASH_CHUNK);
1196                 out += GHASH_CHUNK;
1197                 in  += GHASH_CHUNK;
1198                 len -= GHASH_CHUNK;
1199         }
1200 #endif
1201         if ((i = (len&(size_t)-16))) {
1202                 size_t j=i/16;
1203
1204                 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1205                 ctr += (unsigned int)j;
1206                 if (is_endian.little)
1207                         PUTU32(ctx->Yi.c+12,ctr);
1208                 else
1209                         ctx->Yi.d[3] = ctr;
1210                 in  += i;
1211                 len -= i;
1212 #if defined(GHASH)
1213                 GHASH(ctx,out,i);
1214                 out += i;
1215 #else
1216                 while (j--) {
1217                         for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1218                         GCM_MUL(ctx,Xi);
1219                         out += 16;
1220                 }
1221 #endif
1222         }
1223         if (len) {
1224                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1225                 ++ctr;
1226                 if (is_endian.little)
1227                         PUTU32(ctx->Yi.c+12,ctr);
1228                 else
1229                         ctx->Yi.d[3] = ctr;
1230                 while (len--) {
1231                         ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1232                         ++n;
1233                 }
1234         }
1235
1236         ctx->mres = n;
1237         return 0;
1238 }
1239
1240 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1241                 const unsigned char *in, unsigned char *out,
1242                 size_t len,ctr128_f stream)
1243 {
1244         const union { long one; char little; } is_endian = {1};
1245         unsigned int n, ctr;
1246         size_t i;
1247         u64 mlen = ctx->len.u[1];
1248
1249         mlen += len;
1250         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1251                 return -1;
1252         ctx->len.u[1] = mlen;
1253
1254         if (ctx->ares) {
1255                 /* First call to decrypt finalizes GHASH(AAD) */
1256                 GCM_MUL(ctx,Xi);
1257                 ctx->ares = 0;
1258         }
1259
1260         if (is_endian.little)
1261                 ctr = GETU32(ctx->Yi.c+12);
1262         else
1263                 ctr = ctx->Yi.d[3];
1264
1265         n = ctx->mres;
1266         if (n) {
1267                 while (n && len) {
1268                         u8 c = *(in++);
1269                         *(out++) = c^ctx->EKi.c[n];
1270                         ctx->Xi.c[n] ^= c;
1271                         --len;
1272                         n = (n+1)%16;
1273                 }
1274                 if (n==0) GCM_MUL (ctx,Xi);
1275                 else {
1276                         ctx->mres = n;
1277                         return 0;
1278                 }
1279         }
1280 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1281         while (len>=GHASH_CHUNK) {
1282                 GHASH(ctx,in,GHASH_CHUNK);
1283                 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1284                 ctr += GHASH_CHUNK/16;
1285                 if (is_endian.little)
1286                         PUTU32(ctx->Yi.c+12,ctr);
1287                 else
1288                         ctx->Yi.d[3] = ctr;
1289                 out += GHASH_CHUNK;
1290                 in  += GHASH_CHUNK;
1291                 len -= GHASH_CHUNK;
1292         }
1293 #endif
1294         if ((i = (len&(size_t)-16))) {
1295                 size_t j=i/16;
1296
1297 #if defined(GHASH)
1298                 GHASH(ctx,in,i);
1299 #else
1300                 while (j--) {
1301                         size_t k;
1302                         for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1303                         GCM_MUL(ctx,Xi);
1304                         in += 16;
1305                 }
1306                 j   = i/16;
1307                 in -= i;
1308 #endif
1309                 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1310                 ctr += (unsigned int)j;
1311                 if (is_endian.little)
1312                         PUTU32(ctx->Yi.c+12,ctr);
1313                 else
1314                         ctx->Yi.d[3] = ctr;
1315                 out += i;
1316                 in  += i;
1317                 len -= i;
1318         }
1319         if (len) {
1320                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1321                 ++ctr;
1322                 if (is_endian.little)
1323                         PUTU32(ctx->Yi.c+12,ctr);
1324                 else
1325                         ctx->Yi.d[3] = ctr;
1326                 while (len--) {
1327                         u8 c = in[n];
1328                         ctx->Xi.c[n] ^= c;
1329                         out[n] = c^ctx->EKi.c[n];
1330                         ++n;
1331                 }
1332         }
1333
1334         ctx->mres = n;
1335         return 0;
1336 }
1337
1338 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1339                         size_t len)
1340 {
1341         const union { long one; char little; } is_endian = {1};
1342         u64 alen = ctx->len.u[0]<<3;
1343         u64 clen = ctx->len.u[1]<<3;
1344
1345         if (ctx->mres)
1346                 GCM_MUL(ctx,Xi);
1347
1348         if (is_endian.little) {
1349 #ifdef BSWAP8
1350                 alen = BSWAP8(alen);
1351                 clen = BSWAP8(clen);
1352 #else
1353                 u8 *p = ctx->len.c;
1354
1355                 ctx->len.u[0] = alen;
1356                 ctx->len.u[1] = clen;
1357
1358                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1359                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1360 #endif
1361         }
1362
1363         ctx->Xi.u[0] ^= alen;
1364         ctx->Xi.u[1] ^= clen;
1365         GCM_MUL(ctx,Xi);
1366
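        /* the final tag is GHASH(H, A || C || lengths) masked with EK0 = E(K, Y0) */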
1367         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1368         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1369
1370         if (tag && len<=sizeof(ctx->Xi))
1371                 return memcmp(ctx->Xi.c,tag,len);
1372         else
1373                 return -1;
1374 }
1375
1376 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1377 {
1378         CRYPTO_gcm128_finish(ctx, NULL, 0);
1379         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1380 }
1381
1382 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1383 {
1384         GCM128_CONTEXT *ret;
1385
1386         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1387                 CRYPTO_gcm128_init(ret,key,block);
1388
1389         return ret;
1390 }
1391
1392 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1393 {
1394         if (ctx) {
1395                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1396                 OPENSSL_free(ctx);
1397         }
1398 }
1399
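/*
 * Minimal usage sketch (an assumption for illustration: AES from
 * <openssl/aes.h> supplies the block cipher, as in the SELFTEST code
 * below; error checking omitted):
 *
 *      AES_KEY aes;
 *      GCM128_CONTEXT ctx;
 *      unsigned char tag[16];
 *
 *      AES_set_encrypt_key(key, 128, &aes);
 *      CRYPTO_gcm128_init(&ctx, &aes, (block128_f)AES_encrypt);
 *      CRYPTO_gcm128_setiv(&ctx, iv, sizeof(iv));
 *      CRYPTO_gcm128_aad(&ctx, aad, sizeof(aad));
 *      CRYPTO_gcm128_encrypt(&ctx, plaintext, ciphertext, sizeof(plaintext));
 *      CRYPTO_gcm128_tag(&ctx, tag, sizeof(tag));
 *
 * Decryption follows the same pattern with CRYPTO_gcm128_decrypt(), ending
 * with CRYPTO_gcm128_finish(&ctx, expected_tag, 16), which returns 0 only
 * if the computed tag matches.
 */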
1400 #if defined(SELFTEST)
1401 #include <stdio.h>
1402 #include <openssl/aes.h>
1403
1404 /* Test Case 1 */
1405 static const u8 K1[16],
1406                 *P1=NULL,
1407                 *A1=NULL,
1408                 IV1[12],
1409                 *C1=NULL,
1410                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1411
1412 /* Test Case 2 */
1413 #define K2 K1
1414 #define A2 A1
1415 #define IV2 IV1
1416 static const u8 P2[16],
1417                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1418                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1419
1420 /* Test Case 3 */
1421 #define A3 A2
1422 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1423                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1424                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1425                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1426                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1427                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1428                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1429                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1430                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1431                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1432                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1433
1434 /* Test Case 4 */
1435 #define K4 K3
1436 #define IV4 IV3
1437 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1438                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1439                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1440                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1441                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1442                         0xab,0xad,0xda,0xd2},
1443                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1444                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1445                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1446                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1447                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1448
1449 /* Test Case 5 */
1450 #define K5 K4
1451 #define P5 P4
1452 static const u8 A5[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1453                         0xab,0xad,0xda,0xd2},
1454                 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1455                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1456                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1457                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1458                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1459                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1460
1461 /* Test Case 6 */
1462 #define K6 K5
1463 #define P6 P5
1464 #define A6 A5
1465 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1466                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1467                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1468                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1469                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1470                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1471                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1472                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1473                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1474
1475 /* Test Case 7 */
1476 static const u8 K7[24],
1477                 *P7=NULL,
1478                 *A7=NULL,
1479                 IV7[12],
1480                 *C7=NULL,
1481                 T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1482
1483 /* Test Case 8 */
1484 #define K8 K7
1485 #define IV8 IV7
1486 #define A8 A7
1487 static const u8 P8[16],
1488                 C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1489                 T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1490
1491 /* Test Case 9 */
1492 #define A9 A8
1493 static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1494                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1495                 P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1496                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1497                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1498                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1499                 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1500                 C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1501                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1502                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1503                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1504                 T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1505
1506 /* Test Case 10 */
1507 #define K10 K9
1508 #define IV10 IV9
1509 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1510                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1511                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1512                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1513                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1514                         0xab,0xad,0xda,0xd2},
1515                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1516                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1517                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1518                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1519                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1520
1521 /* Test Case 11 */
1522 #define K11 K10
1523 #define P11 P10
1524 #define A11 A10
1525 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1526                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1527                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1528                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1529                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1530                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1531
1532 /* Test Case 12 */
1533 #define K12 K11
1534 #define P12 P11
1535 #define A12 A11
1536 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1537                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1538                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1539                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1540                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1541                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1542                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1543                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1544                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1545
1546 /* Test Case 13 */
1547 static const u8 K13[32],
1548                 *P13=NULL,
1549                 *A13=NULL,
1550                 IV13[12],
1551                 *C13=NULL,
1552                 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1553
1554 /* Test Case 14 */
1555 #define K14 K13
1556 #define A14 A13
1557 static const u8 P14[16],
1558                 IV14[12],
1559                 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1560                 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1561
1562 /* Test Case 15 */
1563 #define A15 A14
1564 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1565                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1566                 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1567                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1568                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1569                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1570                 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1571                 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1572                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1573                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1574                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1575                 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1576
1577 /* Test Case 16 */
1578 #define K16 K15
1579 #define IV16 IV15
1580 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1581                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1582                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1583                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1584                 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1585                         0xab,0xad,0xda,0xd2},
1586                 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1587                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1588                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1589                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1590                 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1591
1592 /* Test Case 17 */
1593 #define K17 K16
1594 #define P17 P16
1595 #define A17 A16
1596 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1597                 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1598                         0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1599                         0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1600                         0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1601                 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1602
1603 /* Test Case 18 */
1604 #define K18 K17
1605 #define P18 P17
1606 #define A18 A17
1607 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1608                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1609                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1610                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1611                 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1612                         0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1613                         0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1614                         0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1615                 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1616
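/*
 * TEST_CASE(n) runs vector n in both directions: it encrypts P##n and checks
 * the ciphertext and tag, then decrypts C##n and checks the recovered
 * plaintext and tag, bumping ret on any mismatch.
 */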
1617 #define TEST_CASE(n)    do {                                    \
1618         u8 out[sizeof(P##n)];                                   \
1619         AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
1620         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
1621         CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
1622         memset(out,0,sizeof(out));                              \
1623         if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
1624         if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
1625         if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
1626             (C##n && memcmp(out,C##n,sizeof(out))))             \
1627                 ret++, printf ("encrypt test#%d failed.\n",n);  \
1628         CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
1629         memset(out,0,sizeof(out));                              \
1630         if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
1631         if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
1632         if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
1633             (P##n && memcmp(out,P##n,sizeof(out))))             \
1634                 ret++, printf ("decrypt test#%d failed.\n",n);  \
1635         } while(0)
1636
1637 int main()
1638 {
1639         GCM128_CONTEXT ctx;
1640         AES_KEY key;
1641         int ret=0;
1642
1643         TEST_CASE(1);
1644         TEST_CASE(2);
1645         TEST_CASE(3);
1646         TEST_CASE(4);
1647         TEST_CASE(5);
1648         TEST_CASE(6);
1649         TEST_CASE(7);
1650         TEST_CASE(8);
1651         TEST_CASE(9);
1652         TEST_CASE(10);
1653         TEST_CASE(11);
1654         TEST_CASE(12);
1655         TEST_CASE(13);
1656         TEST_CASE(14);
1657         TEST_CASE(15);
1658         TEST_CASE(16);
1659         TEST_CASE(17);
1660         TEST_CASE(18);
1661
1662 #ifdef OPENSSL_CPUID_OBJ
1663         {
1664         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc(); /* OPENSSL_rdtsc() reads the CPU cycle counter */
1665         union { u64 u; u8 c[1024]; } buf;
1666         int i;
1667
1668         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1669         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1670         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1671
1672         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1673         start = OPENSSL_rdtsc();
1674         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1675         gcm_t = OPENSSL_rdtsc() - start;
1676
1677         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1678                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1679                         (block128_f)AES_encrypt);
1680         start = OPENSSL_rdtsc();
1681         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1682                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1683                         (block128_f)AES_encrypt);
1684         ctr_t = OPENSSL_rdtsc() - start;
1685
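        /* cycles per byte for GCM, for plain CTR, and their difference (the GHASH cost) */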
1686         printf("%.2f-%.2f=%.2f\n",
1687                         gcm_t/(double)sizeof(buf),
1688                         ctr_t/(double)sizeof(buf),
1689                         (gcm_t-ctr_t)/(double)sizeof(buf));
1690 #ifdef GHASH
1691         GHASH(&ctx,buf.c,sizeof(buf));
1692         start = OPENSSL_rdtsc();
1693         for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1694         gcm_t = OPENSSL_rdtsc() - start;
1695         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1696 #endif
1697         }
1698 #endif
1699
1700         return ret;
1701 }
1702 #endif