/* crypto/modes/gcm128.c — GHASH/GCM-mode implementation (OpenSSL) */
/* ====================================================================
 * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */

50 #define OPENSSL_FIPSAPI
51
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62
#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
#undef  GETU32
#define GETU32(p)       BSWAP4(*(const u32 *)(p))
#undef  PUTU32
#define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
#endif

/*
 * PACK places a 16-bit reduction constant in the top 16 bits of a
 * size_t, so the same tables serve both 32- and 64-bit builds.
 */
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * REDUCE1BIT shifts V right by one bit in GF(2^128), folding the bit
 * shifted out of V.lo back in via the GCM polynomial (0xE1 in the top
 * byte). The sizeof(size_t) test is a compile-time constant, picking
 * the branch that avoids a 64-bit literal on 32-bit targets.
 */
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^T; \
        } \
        else { \
                u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^((u64)T<<32); \
        } \
} while(0)

/*
 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
 * never be set to 8. 8 is effectively reserved for testing purposes.
 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
 * "Shoup's" in GCM specification. In other words OpenSSL does not cover
 * whole spectrum of possible table driven implementations. Why? In
 * non-"Shoup's" case memory access pattern is segmented in such manner,
 * that it's trivial to see that cache timing information can reveal
 * fair portion of intermediate hash value. Given that ciphertext is
 * always available to attacker, it's possible for him to attempt to
 * deduce secret parameter H and if successful, tamper with messages
 * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
 * not as trivial, but there is no reason to believe that it's resistant
 * to cache-timing attack. And the thing about "8-bit" implementation is
 * that it consumes 16 (sixteen) times more memory, 4KB per individual
 * key + 1KB shared. Well, on pros side it should be twice as fast as
 * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
 * was observed to run ~75% faster, closer to 100% for commercial
 * compilers... Yet "4-bit" procedure is preferred, because it's
 * believed to provide better security-performance balance and adequate
 * all-round performance. "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows large enough free
 *   results in VM working set trimming, meaning that consequent
 *   malloc would immediately incur working set expansion);
 * - larger table has larger cache footprint, which can affect
 *   performance of other code paths (not necessarily even from same
 *   thread in Hyper-Threading world);
 *
 * Value of 1 is not appropriate for performance reasons.
 */
119 #if     TABLE_BITS==8
120
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123         int  i, j;
124         u128 V;
125
126         Htable[0].hi = 0;
127         Htable[0].lo = 0;
128         V.hi = H[0];
129         V.lo = H[1];
130
131         for (Htable[128]=V, i=64; i>0; i>>=1) {
132                 REDUCE1BIT(V);
133                 Htable[i] = V;
134         }
135
136         for (i=2; i<256; i<<=1) {
137                 u128 *Hi = Htable+i, H0 = *Hi;
138                 for (j=1; j<i; ++j) {
139                         Hi[j].hi = H0.hi^Htable[j].hi;
140                         Hi[j].lo = H0.lo^Htable[j].lo;
141                 }
142         }
143 }
144
145 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146 {
147         u128 Z = { 0, 0};
148         const u8 *xi = (const u8 *)Xi+15;
149         size_t rem, n = *xi;
150         const union { long one; char little; } is_endian = {1};
151         static const size_t rem_8bit[256] = {
152                 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
153                 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
154                 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
155                 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
156                 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
157                 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
158                 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
159                 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
160                 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
161                 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
162                 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
163                 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
164                 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
165                 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
166                 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
167                 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
168                 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
169                 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
170                 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
171                 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
172                 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
173                 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
174                 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
175                 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
176                 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
177                 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
178                 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
179                 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
180                 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
181                 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
182                 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
183                 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
184                 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
185                 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
186                 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
187                 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
188                 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
189                 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
190                 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
191                 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
192                 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
193                 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
194                 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
195                 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
196                 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
197                 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
198                 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
199                 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
200                 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
201                 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
202                 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
203                 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
204                 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
205                 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
206                 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
207                 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
208                 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
209                 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
210                 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
211                 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
212                 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
213                 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
214                 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
215                 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
216
217         while (1) {
218                 Z.hi ^= Htable[n].hi;
219                 Z.lo ^= Htable[n].lo;
220
221                 if ((u8 *)Xi==xi)       break;
222
223                 n = *(--xi);
224
225                 rem  = (size_t)Z.lo&0xff;
226                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
227                 Z.hi = (Z.hi>>8);
228                 if (sizeof(size_t)==8)
229                         Z.hi ^= rem_8bit[rem];
230                 else
231                         Z.hi ^= (u64)rem_8bit[rem]<<32;
232         }
233
234         if (is_endian.little) {
235 #ifdef BSWAP8
236                 Xi[0] = BSWAP8(Z.hi);
237                 Xi[1] = BSWAP8(Z.lo);
238 #else
239                 u8 *p = (u8 *)Xi;
240                 u32 v;
241                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
242                 v = (u32)(Z.hi);        PUTU32(p+4,v);
243                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
244                 v = (u32)(Z.lo);        PUTU32(p+12,v);
245 #endif
246         }
247         else {
248                 Xi[0] = Z.hi;
249                 Xi[1] = Z.lo;
250         }
251 }
252 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
253
254 #elif   TABLE_BITS==4
255
256 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
257 {
258         u128 V;
259 #if defined(OPENSSL_SMALL_FOOTPRINT)
260         int  i;
261 #endif
262
263         Htable[0].hi = 0;
264         Htable[0].lo = 0;
265         V.hi = H[0];
266         V.lo = H[1];
267
268 #if defined(OPENSSL_SMALL_FOOTPRINT)
269         for (Htable[8]=V, i=4; i>0; i>>=1) {
270                 REDUCE1BIT(V);
271                 Htable[i] = V;
272         }
273
274         for (i=2; i<16; i<<=1) {
275                 u128 *Hi = Htable+i;
276                 int   j;
277                 for (V=*Hi, j=1; j<i; ++j) {
278                         Hi[j].hi = V.hi^Htable[j].hi;
279                         Hi[j].lo = V.lo^Htable[j].lo;
280                 }
281         }
282 #else
283         Htable[8] = V;
284         REDUCE1BIT(V);
285         Htable[4] = V;
286         REDUCE1BIT(V);
287         Htable[2] = V;
288         REDUCE1BIT(V);
289         Htable[1] = V;
290         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
291         V=Htable[4];
292         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
293         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
294         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
295         V=Htable[8];
296         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
297         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
298         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
299         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
300         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
301         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
302         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
303 #endif
304 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
305         /*
306          * ARM assembler expects specific dword order in Htable.
307          */
308         {
309         int j;
310         const union { long one; char little; } is_endian = {1};
311
312         if (is_endian.little)
313                 for (j=0;j<16;++j) {
314                         V = Htable[j];
315                         Htable[j].hi = V.lo;
316                         Htable[j].lo = V.hi;
317                 }
318         else
319                 for (j=0;j<16;++j) {
320                         V = Htable[j];
321                         Htable[j].hi = V.lo<<32|V.lo>>32;
322                         Htable[j].lo = V.hi<<32|V.hi>>32;
323                 }
324         }
325 #endif
326 }
327
328 #ifndef GHASH_ASM
329 static const size_t rem_4bit[16] = {
330         PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
331         PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
332         PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
333         PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
334
335 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
336 {
337         u128 Z;
338         int cnt = 15;
339         size_t rem, nlo, nhi;
340         const union { long one; char little; } is_endian = {1};
341
342         nlo  = ((const u8 *)Xi)[15];
343         nhi  = nlo>>4;
344         nlo &= 0xf;
345
346         Z.hi = Htable[nlo].hi;
347         Z.lo = Htable[nlo].lo;
348
349         while (1) {
350                 rem  = (size_t)Z.lo&0xf;
351                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
352                 Z.hi = (Z.hi>>4);
353                 if (sizeof(size_t)==8)
354                         Z.hi ^= rem_4bit[rem];
355                 else
356                         Z.hi ^= (u64)rem_4bit[rem]<<32;
357
358                 Z.hi ^= Htable[nhi].hi;
359                 Z.lo ^= Htable[nhi].lo;
360
361                 if (--cnt<0)            break;
362
363                 nlo  = ((const u8 *)Xi)[cnt];
364                 nhi  = nlo>>4;
365                 nlo &= 0xf;
366
367                 rem  = (size_t)Z.lo&0xf;
368                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
369                 Z.hi = (Z.hi>>4);
370                 if (sizeof(size_t)==8)
371                         Z.hi ^= rem_4bit[rem];
372                 else
373                         Z.hi ^= (u64)rem_4bit[rem]<<32;
374
375                 Z.hi ^= Htable[nlo].hi;
376                 Z.lo ^= Htable[nlo].lo;
377         }
378
379         if (is_endian.little) {
380 #ifdef BSWAP8
381                 Xi[0] = BSWAP8(Z.hi);
382                 Xi[1] = BSWAP8(Z.lo);
383 #else
384                 u8 *p = (u8 *)Xi;
385                 u32 v;
386                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
387                 v = (u32)(Z.hi);        PUTU32(p+4,v);
388                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
389                 v = (u32)(Z.lo);        PUTU32(p+12,v);
390 #endif
391         }
392         else {
393                 Xi[0] = Z.hi;
394                 Xi[1] = Z.lo;
395         }
396 }
397
398 #if !defined(OPENSSL_SMALL_FOOTPRINT)
399 /*
400  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
401  * details... Compiler-generated code doesn't seem to give any
402  * performance improvement, at least not on x86[_64]. It's here
403  * mostly as reference and a placeholder for possible future
404  * non-trivial optimization[s]...
405  */
406 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
407                                 const u8 *inp,size_t len)
408 {
409     u128 Z;
410     int cnt;
411     size_t rem, nlo, nhi;
412     const union { long one; char little; } is_endian = {1};
413
414 #if 1
415     do {
416         cnt  = 15;
417         nlo  = ((const u8 *)Xi)[15];
418         nlo ^= inp[15];
419         nhi  = nlo>>4;
420         nlo &= 0xf;
421
422         Z.hi = Htable[nlo].hi;
423         Z.lo = Htable[nlo].lo;
424
425         while (1) {
426                 rem  = (size_t)Z.lo&0xf;
427                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
428                 Z.hi = (Z.hi>>4);
429                 if (sizeof(size_t)==8)
430                         Z.hi ^= rem_4bit[rem];
431                 else
432                         Z.hi ^= (u64)rem_4bit[rem]<<32;
433
434                 Z.hi ^= Htable[nhi].hi;
435                 Z.lo ^= Htable[nhi].lo;
436
437                 if (--cnt<0)            break;
438
439                 nlo  = ((const u8 *)Xi)[cnt];
440                 nlo ^= inp[cnt];
441                 nhi  = nlo>>4;
442                 nlo &= 0xf;
443
444                 rem  = (size_t)Z.lo&0xf;
445                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
446                 Z.hi = (Z.hi>>4);
447                 if (sizeof(size_t)==8)
448                         Z.hi ^= rem_4bit[rem];
449                 else
450                         Z.hi ^= (u64)rem_4bit[rem]<<32;
451
452                 Z.hi ^= Htable[nlo].hi;
453                 Z.lo ^= Htable[nlo].lo;
454         }
455 #else
456     /*
457      * Extra 256+16 bytes per-key plus 512 bytes shared tables
458      * [should] give ~50% improvement... One could have PACK()-ed
459      * the rem_8bit even here, but the priority is to minimize
460      * cache footprint...
461      */ 
462     u128 Hshr4[16];     /* Htable shifted right by 4 bits */
463     u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
464     static const unsigned short rem_8bit[256] = {
465         0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
466         0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
467         0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
468         0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
469         0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
470         0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
471         0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
472         0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
473         0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
474         0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
475         0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
476         0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
477         0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
478         0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
479         0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
480         0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
481         0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
482         0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
483         0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
484         0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
485         0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
486         0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
487         0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
488         0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
489         0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
490         0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
491         0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
492         0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
493         0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
494         0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
495         0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
496         0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
497     /*
498      * This pre-processing phase slows down procedure by approximately
499      * same time as it makes each loop spin faster. In other words
500      * single block performance is approximately same as straightforward
501      * "4-bit" implementation, and then it goes only faster...
502      */
503     for (cnt=0; cnt<16; ++cnt) {
504         Z.hi = Htable[cnt].hi;
505         Z.lo = Htable[cnt].lo;
506         Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
507         Hshr4[cnt].hi = (Z.hi>>4);
508         Hshl4[cnt]    = (u8)(Z.lo<<4);
509     }
510
511     do {
512         for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
513                 nlo  = ((const u8 *)Xi)[cnt];
514                 nlo ^= inp[cnt];
515                 nhi  = nlo>>4;
516                 nlo &= 0xf;
517
518                 Z.hi ^= Htable[nlo].hi;
519                 Z.lo ^= Htable[nlo].lo;
520
521                 rem = (size_t)Z.lo&0xff;
522
523                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
524                 Z.hi = (Z.hi>>8);
525
526                 Z.hi ^= Hshr4[nhi].hi;
527                 Z.lo ^= Hshr4[nhi].lo;
528                 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
529         }
530
531         nlo  = ((const u8 *)Xi)[0];
532         nlo ^= inp[0];
533         nhi  = nlo>>4;
534         nlo &= 0xf;
535
536         Z.hi ^= Htable[nlo].hi;
537         Z.lo ^= Htable[nlo].lo;
538
539         rem = (size_t)Z.lo&0xf;
540
541         Z.lo = (Z.hi<<60)|(Z.lo>>4);
542         Z.hi = (Z.hi>>4);
543
544         Z.hi ^= Htable[nhi].hi;
545         Z.lo ^= Htable[nhi].lo;
546         Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
547 #endif
548
549         if (is_endian.little) {
550 #ifdef BSWAP8
551                 Xi[0] = BSWAP8(Z.hi);
552                 Xi[1] = BSWAP8(Z.lo);
553 #else
554                 u8 *p = (u8 *)Xi;
555                 u32 v;
556                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
557                 v = (u32)(Z.hi);        PUTU32(p+4,v);
558                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
559                 v = (u32)(Z.lo);        PUTU32(p+12,v);
560 #endif
561         }
562         else {
563                 Xi[0] = Z.hi;
564                 Xi[1] = Z.lo;
565         }
566     } while (inp+=16, len-=16);
567 }
568 #endif
569 #else
570 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
571 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
572 #endif
573
574 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
575 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
576 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
577 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
578  * trashing effect. In other words idea is to hash data while it's
579  * still in L1 cache after encryption pass... */
580 #define GHASH_CHUNK       (3*1024)
581 #endif
582
583 #else   /* TABLE_BITS */
584
585 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
586 {
587         u128 V,Z = { 0,0 };
588         long X;
589         int  i,j;
590         const long *xi = (const long *)Xi;
591         const union { long one; char little; } is_endian = {1};
592
593         V.hi = H[0];    /* H is in host byte order, no byte swapping */
594         V.lo = H[1];
595
596         for (j=0; j<16/sizeof(long); ++j) {
597                 if (is_endian.little) {
598                         if (sizeof(long)==8) {
599 #ifdef BSWAP8
600                                 X = (long)(BSWAP8(xi[j]));
601 #else
602                                 const u8 *p = (const u8 *)(xi+j);
603                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
604 #endif
605                         }
606                         else {
607                                 const u8 *p = (const u8 *)(xi+j);
608                                 X = (long)GETU32(p);
609                         }
610                 }
611                 else
612                         X = xi[j];
613
614                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
615                         u64 M = (u64)(X>>(8*sizeof(long)-1));
616                         Z.hi ^= V.hi&M;
617                         Z.lo ^= V.lo&M;
618
619                         REDUCE1BIT(V);
620                 }
621         }
622
623         if (is_endian.little) {
624 #ifdef BSWAP8
625                 Xi[0] = BSWAP8(Z.hi);
626                 Xi[1] = BSWAP8(Z.lo);
627 #else
628                 u8 *p = (u8 *)Xi;
629                 u32 v;
630                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
631                 v = (u32)(Z.hi);        PUTU32(p+4,v);
632                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
633                 v = (u32)(Z.lo);        PUTU32(p+12,v);
634 #endif
635         }
636         else {
637                 Xi[0] = Z.hi;
638                 Xi[1] = Z.lo;
639         }
640 }
641 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
642
643 #endif
644
/*
 * Prototypes for the platform-specific GHASH assembler routines and
 * the capability flags used to select them at runtime. When
 * GCM_FUNCREF_4BIT is defined, GCM_MUL/GHASH dispatch through the
 * gcm_gmult_p/gcm_ghash_p function-pointer locals that the callers
 * load from ctx->gmult/ctx->ghash.
 */
#if     TABLE_BITS==4 && defined(GHASH_ASM)
# if    !defined(I386_ONLY) && \
        (defined(__i386)        || defined(__i386__)    || \
         defined(__x86_64)      || defined(__x86_64__)  || \
         defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[2];

void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

#if defined(__i386) || defined(__i386__) || defined(_M_IX86)
# define gcm_init_avx   gcm_init_clmul
# define gcm_gmult_avx  gcm_gmult_clmul
# define gcm_ghash_avx  gcm_ghash_clmul
#else
void gcm_init_avx(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_avx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_avx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif

#  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# elif defined(__arm__) || defined(__arm)
#  include "arm_arch.h"
#  if __ARM_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# elif defined(__sparc__) || defined(__sparc)
#  include "sparc_arch.h"
#  define GHASH_ASM_SPARC
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_sparcv9cap_P[];
void gcm_init_vis3(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_vis3(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_vis3(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif

/*
 * Initialize a GCM context for |key|/|block|: derive the hash subkey
 * H = E_K(0^128), store it in host byte order, and precompute the
 * GHASH multiplication table, selecting a platform-optimized
 * gmult/ghash implementation where one is compiled in.
 */
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
	/* run-time endianness probe: .little is non-zero on little-endian */
	const union { long one; char little; } is_endian = {1};

	memset(ctx,0,sizeof(*ctx));
	ctx->block = block;
	ctx->key   = key;

	/* H = E_K(0^128); ctx->H.c is all-zero thanks to the memset above */
	(*block)(ctx->H.c,ctx->H.c,key);

	if (is_endian.little) {
		/* H is stored in host byte order */
#ifdef BSWAP8
		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
		/* no 64-bit byte swap available: assemble from 32-bit loads */
		u8 *p = ctx->H.c;
		u64 hi,lo;
		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
		ctx->H.u[0] = hi;
		ctx->H.u[1] = lo;
#endif
	}

#if	TABLE_BITS==8
	gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif	TABLE_BITS==4
# if	defined(GHASH_ASM_X86_OR_64)
#  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
		if (((OPENSSL_ia32cap_P[1]>>22)&0x41)==0x41) {	/* AVX+MOVBE */
			gcm_init_avx(ctx->Htable,ctx->H.u);
			ctx->gmult = gcm_gmult_avx;
			ctx->ghash = gcm_ghash_avx;
		} else {
			gcm_init_clmul(ctx->Htable,ctx->H.u);
			ctx->gmult = gcm_gmult_clmul;
			ctx->ghash = gcm_ghash_clmul;
		}
		return;
	}
#  endif
	/* generic 4-bit table as fallback for non-PCLMULQDQ hardware */
	gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if	defined(GHASH_ASM_X86)			/* x86 only */
#   if	defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */
#   else
	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */
#   endif
		ctx->gmult = gcm_gmult_4bit_mmx;
		ctx->ghash = gcm_ghash_4bit_mmx;
	} else {
		ctx->gmult = gcm_gmult_4bit_x86;
		ctx->ghash = gcm_ghash_4bit_x86;
	}
#  else
	ctx->gmult = gcm_gmult_4bit;
	ctx->ghash = gcm_ghash_4bit;
#  endif
# elif	defined(GHASH_ASM_ARM)
	if (OPENSSL_armcap_P & ARMV7_NEON) {
		ctx->gmult = gcm_gmult_neon;
		ctx->ghash = gcm_ghash_neon;
	} else {
		gcm_init_4bit(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_4bit;
		ctx->ghash = gcm_ghash_4bit;
	}
# elif	defined(GHASH_ASM_SPARC)
	if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
		gcm_init_vis3(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_vis3;
		ctx->ghash = gcm_ghash_vis3;
	} else {
		gcm_init_4bit(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_4bit;
		ctx->ghash = gcm_ghash_4bit;
	}
# else
	gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}
789
/*
 * Set/reset the IV and (re)initialize per-message state.  For the
 * 96-bit IV the initial counter block is simply Y0 = IV || 0^31 || 1;
 * any other IV length is absorbed through GHASH together with its bit
 * length.  Also precomputes EK0 = E_K(Y0), which is XORed into the
 * tag at finalization, and leaves the counter pointing at the first
 * data block (Y0 + 1).
 */
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	/* reset per-message state: counter, GHASH accumulator, lengths, residues */
	ctx->Yi.u[0]  = 0;
	ctx->Yi.u[1]  = 0;
	ctx->Xi.u[0]  = 0;
	ctx->Xi.u[1]  = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;
	ctx->mres = 0;

	if (len==12) {
		/* fast path: Y0 = IV || 0^31 || 1 */
		memcpy(ctx->Yi.c,iv,12);
		ctx->Yi.c[15]=1;
		ctr=1;
	}
	else {
		/* general path: Y0 = GHASH(zero-padded IV || len(IV) in bits) */
		size_t i;
		u64 len0 = len;

		while (len>=16) {
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			/* trailing partial block, implicitly zero-padded */
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
		}
		len0 <<= 3;	/* IV length in bits */
		if (is_endian.little) {
#ifdef BSWAP8
			ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
			/* fold in big-endian length byte by byte */
			ctx->Yi.c[8]  ^= (u8)(len0>>56);
			ctx->Yi.c[9]  ^= (u8)(len0>>48);
			ctx->Yi.c[10] ^= (u8)(len0>>40);
			ctx->Yi.c[11] ^= (u8)(len0>>32);
			ctx->Yi.c[12] ^= (u8)(len0>>24);
			ctx->Yi.c[13] ^= (u8)(len0>>16);
			ctx->Yi.c[14] ^= (u8)(len0>>8);
			ctx->Yi.c[15] ^= (u8)(len0);
#endif
		}
		else
			ctx->Yi.u[1]  ^= len0;

		GCM_MUL(ctx,Yi);

		/* extract the 32-bit counter portion of Y0 */
		if (is_endian.little)
			ctr = GETU32(ctx->Yi.c+12);
		else
			ctr = ctx->Yi.d[3];
	}

	/* EK0 = E_K(Y0): consumed by CRYPTO_gcm128_finish for the tag */
	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
	++ctr;
	if (is_endian.little)
		PUTU32(ctx->Yi.c+12,ctr);
	else
		ctx->Yi.d[3] = ctr;
}
859
/*
 * Feed additional authenticated data (AAD) into GHASH.  May be called
 * several times, but only before any encrypt/decrypt call (enforced
 * via the message-length counter).  Returns 0 on success, -1 if the
 * accumulated AAD exceeds 2^61 bytes (2^64 bits, the GCM limit), or
 * -2 if message data has already been processed.
 */
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	if (ctx->len.u[1]) return -2;	/* too late: message data already seen */

	alen += len;
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
		return -1;
	ctx->len.u[0] = alen;

	/* first complete any partial block left from a previous call */
	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	/* bulk-process all whole 16-byte blocks in one ghash call */
	if ((i = (len&(size_t)-16))) {
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
#else
	while (len>=16) {
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx,Xi);
		aad += 16;
		len -= 16;
	}
#endif
	/* stash trailing partial block; hashed on the next call or at finalization */
	if (len) {
		n = (unsigned int)len;
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}
916
/*
 * Encrypt |len| bytes from |in| to |out| (may be called repeatedly to
 * process a message in pieces).  CTR-mode encryption with the context's
 * block cipher, while GHASH accumulates the resulting ciphertext.
 * Returns 0 on success, -1 if the total message would exceed the GCM
 * limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

#if 0
	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
#endif
	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;	/* bytes already consumed from the current keystream block */
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		/* finish a partial block left over from a previous call */
		if (n) {
			while (n && len) {
				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* word-at-a-time path needs aligned pointers; fall back otherwise */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* encrypt a large chunk, then ghash the produced ciphertext at once */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    while (j) {
			size_t *out_t=(size_t *)out;
			const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
		    len -= GHASH_CHUNK;
		}
		/* remaining whole blocks, ghashed in one call afterwards */
		if ((i = (len&(size_t)-16))) {
		    size_t j=i;

		    while (len>=16) {
			size_t *out_t=(size_t *)out;
			const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
			out += 16;
			in  += 16;
			len -= 16;
		    }
		    GHASH(ctx,out-j,j);
		}
#else
		/* no bulk ghash: multiply into Xi block by block */
		while (len>=16) {
			size_t *out_t=(size_t *)out;
			const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				ctx->Xi.t[i] ^=
				out_t[i] = in_t[i]^ctx->EKi.t[i];
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		/* trailing partial block: keystream cached in EKi, residue in mres */
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* byte-at-a-time fallback (small footprint or unaligned pointers) */
	for (i=0;i<len;++i) {
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
		}
		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
1072
1073 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1074                 const unsigned char *in, unsigned char *out,
1075                 size_t len)
1076 {
1077         const union { long one; char little; } is_endian = {1};
1078         unsigned int n, ctr;
1079         size_t i;
1080         u64        mlen  = ctx->len.u[1];
1081         block128_f block = ctx->block;
1082         void      *key   = ctx->key;
1083 #ifdef GCM_FUNCREF_4BIT
1084         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1085 # ifdef GHASH
1086         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1087                                 const u8 *inp,size_t len)       = ctx->ghash;
1088 # endif
1089 #endif
1090
1091         mlen += len;
1092         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1093                 return -1;
1094         ctx->len.u[1] = mlen;
1095
1096         if (ctx->ares) {
1097                 /* First call to decrypt finalizes GHASH(AAD) */
1098                 GCM_MUL(ctx,Xi);
1099                 ctx->ares = 0;
1100         }
1101
1102         if (is_endian.little)
1103                 ctr = GETU32(ctx->Yi.c+12);
1104         else
1105                 ctr = ctx->Yi.d[3];
1106
1107         n = ctx->mres;
1108 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1109         if (16%sizeof(size_t) == 0) do {        /* always true actually */
1110                 if (n) {
1111                         while (n && len) {
1112                                 u8 c = *(in++);
1113                                 *(out++) = c^ctx->EKi.c[n];
1114                                 ctx->Xi.c[n] ^= c;
1115                                 --len;
1116                                 n = (n+1)%16;
1117                         }
1118                         if (n==0) GCM_MUL (ctx,Xi);
1119                         else {
1120                                 ctx->mres = n;
1121                                 return 0;
1122                         }
1123                 }
1124 #if defined(STRICT_ALIGNMENT)
1125                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1126                         break;
1127 #endif
1128 #if defined(GHASH) && defined(GHASH_CHUNK)
1129                 while (len>=GHASH_CHUNK) {
1130                     size_t j=GHASH_CHUNK;
1131
1132                     GHASH(ctx,in,GHASH_CHUNK);
1133                     while (j) {
1134                         size_t *out_t=(size_t *)out;
1135                         const size_t *in_t=(const size_t *)in;
1136
1137                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1138                         ++ctr;
1139                         if (is_endian.little)
1140                                 PUTU32(ctx->Yi.c+12,ctr);
1141                         else
1142                                 ctx->Yi.d[3] = ctr;
1143                         for (i=0; i<16/sizeof(size_t); ++i)
1144                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1145                         out += 16;
1146                         in  += 16;
1147                         j   -= 16;
1148                     }
1149                     len -= GHASH_CHUNK;
1150                 }
1151                 if ((i = (len&(size_t)-16))) {
1152                     GHASH(ctx,in,i);
1153                     while (len>=16) {
1154                         size_t *out_t=(size_t *)out;
1155                         const size_t *in_t=(const size_t *)in;
1156
1157                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1158                         ++ctr;
1159                         if (is_endian.little)
1160                                 PUTU32(ctx->Yi.c+12,ctr);
1161                         else
1162                                 ctx->Yi.d[3] = ctr;
1163                         for (i=0; i<16/sizeof(size_t); ++i)
1164                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1165                         out += 16;
1166                         in  += 16;
1167                         len -= 16;
1168                     }
1169                 }
1170 #else
1171                 while (len>=16) {
1172                         size_t *out_t=(size_t *)out;
1173                         const size_t *in_t=(const size_t *)in;
1174
1175                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1176                         ++ctr;
1177                         if (is_endian.little)
1178                                 PUTU32(ctx->Yi.c+12,ctr);
1179                         else
1180                                 ctx->Yi.d[3] = ctr;
1181                         for (i=0; i<16/sizeof(size_t); ++i) {
1182                                 size_t c = in[i];
1183                                 out[i] = c^ctx->EKi.t[i];
1184                                 ctx->Xi.t[i] ^= c;
1185                         }
1186                         GCM_MUL(ctx,Xi);
1187                         out += 16;
1188                         in  += 16;
1189                         len -= 16;
1190                 }
1191 #endif
1192                 if (len) {
1193                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1194                         ++ctr;
1195                         if (is_endian.little)
1196                                 PUTU32(ctx->Yi.c+12,ctr);
1197                         else
1198                                 ctx->Yi.d[3] = ctr;
1199                         while (len--) {
1200                                 u8 c = in[n];
1201                                 ctx->Xi.c[n] ^= c;
1202                                 out[n] = c^ctx->EKi.c[n];
1203                                 ++n;
1204                         }
1205                 }
1206
1207                 ctx->mres = n;
1208                 return 0;
1209         } while(0);
1210 #endif
1211         for (i=0;i<len;++i) {
1212                 u8 c;
1213                 if (n==0) {
1214                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1215                         ++ctr;
1216                         if (is_endian.little)
1217                                 PUTU32(ctx->Yi.c+12,ctr);
1218                         else
1219                                 ctx->Yi.d[3] = ctr;
1220                 }
1221                 c = in[i];
1222                 out[i] = c^ctx->EKi.c[n];
1223                 ctx->Xi.c[n] ^= c;
1224                 n = (n+1)%16;
1225                 if (n==0)
1226                         GCM_MUL(ctx,Xi);
1227         }
1228
1229         ctx->mres = n;
1230         return 0;
1231 }
1232
/*
 * Encrypt |len| bytes using a caller-supplied 32-bit-counter stream
 * cipher |stream| (e.g. hardware-accelerated AES-CTR) instead of the
 * context's single-block routine; GHASH accumulates the ciphertext.
 * Returns 0 on success, -1 if the total message would exceed the GCM
 * limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len, ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	/* finish a partial block left over from a previous call */
	n = ctx->mres;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* bulk path: stream-encrypt a chunk, then ghash the ciphertext */
	while (len>=GHASH_CHUNK) {
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		GHASH(ctx,out,GHASH_CHUNK);
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* remaining whole blocks */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		in  += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx,out,i);
		out += i;
#else
		while (j--) {
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx,Xi);
			out += 16;
		}
#endif
	}
	/* trailing partial block: single-block encrypt, residue in mres */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
1331
/*
 * Decrypt |len| bytes using a caller-supplied 32-bit-counter stream
 * cipher |stream|; GHASH accumulates the ciphertext before it is
 * decrypted, mirroring CRYPTO_gcm128_encrypt_ctr32.  Returns 0 on
 * success, -1 if the total message would exceed the GCM limit of
 * 2^36-32 bytes.
 */
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len,ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	/* finish a partial block left over from a previous call */
	n = ctx->mres;
	if (n) {
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c^ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL (ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* bulk path: ghash the ciphertext chunk, then stream-decrypt it */
	while (len>=GHASH_CHUNK) {
		GHASH(ctx,in,GHASH_CHUNK);
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* remaining whole blocks */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

#if defined(GHASH)
		GHASH(ctx,in,i);
#else
		/* no bulk ghash: fold ciphertext into Xi block by block,
		 * then rewind |in| and |j| for the stream call below */
		while (j--) {
			size_t k;
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx,Xi);
			in += 16;
		}
		j   = i/16;
		in -= i;
#endif
		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += i;
		in  += i;
		len -= i;
	}
	/* trailing partial block: single-block encrypt, residue in mres */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
1437
/*
 * Finalize the GCM computation: flush any pending partial block,
 * GHASH the AAD/message bit lengths, and XOR with EK0 to form the
 * tag in ctx->Xi.  If |tag| is non-NULL and |len| fits the 16-byte
 * tag, returns the memcmp() result against the expected tag (0 on
 * match, non-zero on mismatch); otherwise returns -1.
 */
int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
			size_t len)
{
	const union { long one; char little; } is_endian = {1};
	u64 alen = ctx->len.u[0]<<3;	/* AAD length in bits */
	u64 clen = ctx->len.u[1]<<3;	/* message length in bits */
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	/* flush a pending partial AAD or message block into GHASH */
	if (ctx->mres || ctx->ares)
		GCM_MUL(ctx,Xi);

	if (is_endian.little) {
#ifdef BSWAP8
		alen = BSWAP8(alen);
		clen = BSWAP8(clen);
#else
		/* no 64-bit byte swap: reuse ctx->len as scratch space to
		 * convert both lengths to big-endian via 32-bit loads */
		u8 *p = ctx->len.c;

		ctx->len.u[0] = alen;
		ctx->len.u[1] = clen;

		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
#endif
	}

	/* GHASH the length block: len(A) || len(C) */
	ctx->Xi.u[0] ^= alen;
	ctx->Xi.u[1] ^= clen;
	GCM_MUL(ctx,Xi);

	/* tag = GHASH result XOR E_K(Y0) */
	ctx->Xi.u[0] ^= ctx->EK0.u[0];
	ctx->Xi.u[1] ^= ctx->EK0.u[1];

	if (tag && len<=sizeof(ctx->Xi))
		return memcmp(ctx->Xi.c,tag,len);
	else
		return -1;
}
1478
1479 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1480 {
1481         CRYPTO_gcm128_finish(ctx, NULL, 0);
1482         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1483 }
1484
1485 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1486 {
1487         GCM128_CONTEXT *ret;
1488
1489         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1490                 CRYPTO_gcm128_init(ret,key,block);
1491
1492         return ret;
1493 }
1494
1495 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1496 {
1497         if (ctx) {
1498                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1499                 OPENSSL_free(ctx);
1500         }
1501 }
1502
1503 #if defined(SELFTEST)
1504 #include <stdio.h>
1505 #include <openssl/aes.h>
1506
1507 /* Test Case 1 */
1508 static const u8 K1[16],
1509                 *P1=NULL,
1510                 *A1=NULL,
1511                 IV1[12],
1512                 *C1=NULL,
1513                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1514
1515 /* Test Case 2 */
1516 #define K2 K1
1517 #define A2 A1
1518 #define IV2 IV1
1519 static const u8 P2[16],
1520                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1521                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1522
1523 /* Test Case 3 */
1524 #define A3 A2
1525 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1526                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1527                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1528                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1529                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1530                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1531                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1532                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1533                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1534                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1535                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1536
1537 /* Test Case 4 */
1538 #define K4 K3
1539 #define IV4 IV3
1540 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1541                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1542                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1543                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1544                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1545                         0xab,0xad,0xda,0xd2},
1546                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1547                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1548                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1549                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1550                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1551
/* Test Case 5: same key/plaintext/AAD as #4, but a short 8-byte (64-bit) IV */
#define K5 K4
#define P5 P4
#define A5 A4
static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
                C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
                        0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
                        0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
                        0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
                T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};

/* Test Case 6: same key/plaintext/AAD as #4, but a long 60-byte IV
 * (exercises the non-96-bit-IV GHASH-based IV processing path) */
#define K6 K5
#define P6 P5
#define A6 A5
static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                        0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                        0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                        0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
                C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
                        0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
                        0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
                        0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
                T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1576
/* Test Case 7: all-zero 192-bit key and 96-bit IV (static arrays are
 * implicitly zero-initialized), empty plaintext and AAD -- tag-only check.
 * The NULL pointers are skipped by the guards in TEST_CASE(). */
static const u8 K7[24],
                *P7=NULL,
                *A7=NULL,
                IV7[12],
                *C7=NULL,
                T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};

/* Test Case 8: as #7 but with a single all-zero 16-byte plaintext block */
#define K8 K7
#define IV8 IV7
#define A8 A7
static const u8 P8[16],
                C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
                T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};

/* Test Case 9: 192-bit (24-byte) key, 96-bit IV, 64-byte plaintext, no AAD
 * (A9 inherits NULL via A8/A7) */
#define A9 A8
static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
                        0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
                P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
                IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
                C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
                        0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
                        0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
                        0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
                T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1607
/* Test Case 10: key/IV of #9, 60-byte plaintext plus 20 bytes of AAD */
#define K10 K9
#define IV10 IV9
static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
                A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                        0xab,0xad,0xda,0xd2},
                C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
                        0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
                        0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
                        0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
                T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};

/* Test Case 11: as #10 but with an 8-byte (64-bit) IV */
#define K11 K10
#define P11 P10
#define A11 A10
static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
                C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
                        0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
                        0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
                        0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
                T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};

/* Test Case 12: as #10 but with a 60-byte IV */
#define K12 K11
#define P12 P11
#define A12 A11
static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                        0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                        0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                        0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
                C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
                        0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
                        0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
                        0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
                T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1647
/* Test Case 13: all-zero 256-bit (32-byte) key and 96-bit IV, empty
 * plaintext and AAD (NULL pointers) -- tag-only check */
static const u8 K13[32],
                *P13=NULL,
                *A13=NULL,
                IV13[12],
                *C13=NULL,
                T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};

/* Test Case 14: as #13 but with a single all-zero 16-byte plaintext block */
#define K14 K13
#define A14 A13
static const u8 P14[16],
                IV14[12],
                C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
                T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};

/* Test Case 15: 256-bit (32-byte) key, 96-bit IV, 64-byte plaintext, no AAD */
#define A15 A14
static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
                        0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
                P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
                IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
                C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
                T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1678
/* Test Case 16: key/IV of #15, 60-byte plaintext plus 20 bytes of AAD */
#define K16 K15
#define IV16 IV15
static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
                A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                        0xab,0xad,0xda,0xd2},
                C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
                T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};

/* Test Case 17: as #16 but with an 8-byte (64-bit) IV */
#define K17 K16
#define P17 P16
#define A17 A16
static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
                C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
                        0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
                        0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
                        0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
                T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};

/* Test Case 18: as #16 but with a 60-byte IV */
#define K18 K17
#define P18 P17
#define A18 A17
static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                        0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                        0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                        0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
                C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
                        0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
                        0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
                        0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
                T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1718
/* Test Case 19: reuses Test Case 1's key/plaintext/IV/ciphertext (defined
 * earlier in the file) and adds 128 bytes of AAD */
#define K19 K1
#define P19 P1
#define IV19 IV1
#define C19 C1
static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
                        0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
                T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};

/* Test Case 20: 64-byte IV whose first four bytes are 0xff (rest
 * zero-initialized) and a 288-byte all-zero plaintext -- exercises the
 * counter wrap behaviour noted in the inline comment below */
#define K20 K1
#define A20 A1
static const u8 IV20[64]={0xff,0xff,0xff,0xff}, /* this results in 0xff in counter LSB */
                P20[288],
                C20[]= {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
                        0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
                        0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
                        0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
                        0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
                        0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
                        0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
                        0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
                        0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
                        0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
                        0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
                        0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
                        0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
                        0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
                        0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
                        0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
                        0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
                        0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c},
                T20[]= {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f};
1758
/*
 * Run one self-test: encrypt P<n> under K<n>/IV<n> with AAD A<n> and check
 * the result against the expected ciphertext C<n> and 16-byte tag T<n>,
 * then repeat in the decrypt direction and check the recovered plaintext.
 * The token-pasted names pick up the static vectors defined above; P/A/C
 * may be NULL pointers for the empty-input vectors (e.g. test cases 7 and
 * 13), hence the `if (X##n)' guards.  Relies on GCM128_CONTEXT ctx,
 * AES_KEY key and int ret being in scope at the point of expansion;
 * failures increment ret and print a diagnostic.
 */
#define TEST_CASE(n)    do {                                    \
        u8 out[sizeof(P##n)];                                   \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (C##n && memcmp(out,C##n,sizeof(out))))             \
                ret++, printf ("encrypt test#%d failed.\n",n);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (P##n && memcmp(out,P##n,sizeof(out))))             \
                ret++, printf ("decrypt test#%d failed.\n",n);  \
        } while(0)
1778
/*
 * Self-test driver: runs the 20 GCM test-vector sets defined above and
 * returns the number of failures (0 on success).  When built with
 * OPENSSL_CPUID_OBJ it additionally prints rough per-byte cycle counts
 * for GCM encryption vs. raw CTR encryption (their difference
 * approximates the GHASH overhead), and for stand-alone GHASH when the
 * GHASH macro is defined.
 */
int main()
{
        GCM128_CONTEXT ctx;
        AES_KEY key;
        int ret=0;

        /* functional correctness against the published vectors */
        TEST_CASE(1);
        TEST_CASE(2);
        TEST_CASE(3);
        TEST_CASE(4);
        TEST_CASE(5);
        TEST_CASE(6);
        TEST_CASE(7);
        TEST_CASE(8);
        TEST_CASE(9);
        TEST_CASE(10);
        TEST_CASE(11);
        TEST_CASE(12);
        TEST_CASE(13);
        TEST_CASE(14);
        TEST_CASE(15);
        TEST_CASE(16);
        TEST_CASE(17);
        TEST_CASE(18);
        TEST_CASE(19);
        TEST_CASE(20);

#ifdef OPENSSL_CPUID_OBJ
        {
        /* OPENSSL_rdtsc is declared inline here (returns a cycle counter
         * as size_t); `stop' is declared but never used */
        size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
        union { u64 u; u8 c[1024]; } buf;
        int i;

        AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
        CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));

        /* untimed pass first, then the timed one */
        CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
        start = OPENSSL_rdtsc();
        CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
        gcm_t = OPENSSL_rdtsc() - start;

        /* same measurement for plain CTR mode over the same 1KB buffer */
        CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
                        &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
                        (block128_f)AES_encrypt);
        start = OPENSSL_rdtsc();
        CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
                        &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
                        (block128_f)AES_encrypt);
        ctr_t = OPENSSL_rdtsc() - start;

        /* prints "gcm - ctr = difference" in cycles per byte */
        printf("%.2f-%.2f=%.2f\n",
                        gcm_t/(double)sizeof(buf),
                        ctr_t/(double)sizeof(buf),
                        (gcm_t-ctr_t)/(double)sizeof(buf));
#ifdef GHASH
        {
        /* NOTE(review): gcm_ghash_p only captures ctx.ghash and is not
         * called directly -- presumably kept so the selected GHASH
         * implementation is visible in a debugger; confirm before removing */
        void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
                                const u8 *inp,size_t len)       = ctx.ghash;

        /* untimed warm-up, then 100 timed iterations of raw GHASH */
        GHASH((&ctx),buf.c,sizeof(buf));
        start = OPENSSL_rdtsc();
        for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
        gcm_t = OPENSSL_rdtsc() - start;
        printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
        }
#endif
        }
#endif

        return ret;
}
1851 #endif