Engage GHASH for ARMv8.
oweals/openssl.git: crypto/modes/gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #define OPENSSL_FIPSAPI
51
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62
63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef  GETU32
66 #define GETU32(p)       BSWAP4(*(const u32 *)(p))
67 #undef  PUTU32
68 #define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
69 #endif
70
71 #define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
72 #define REDUCE1BIT(V)   do { \
73         if (sizeof(size_t)==8) { \
74                 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
75                 V.lo  = (V.hi<<63)|(V.lo>>1); \
76                 V.hi  = (V.hi>>1 )^T; \
77         } \
78         else { \
79                 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80                 V.lo  = (V.hi<<63)|(V.lo>>1); \
81                 V.hi  = (V.hi>>1 )^((u64)T<<32); \
82         } \
83 } while(0)
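/*
 * [Editor's note, not part of the original source] REDUCE1BIT implements one
 * "multiply by x" step in GF(2^128) using GCM's reflected bit order: V is
 * shifted right by one bit, and if the bit shifted out was 1 the value is
 * reduced by XORing in the field polynomial x^128 + x^7 + x^2 + x + 1, whose
 * reflected representation is the constant 0xE1 followed by 120 zero bits.
 * The 32-bit branch builds the same 64-bit constant from a 32-bit value.
 */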
84
85 /*
86  * Although the permitted values for TABLE_BITS are 8, 4 and 1, it should
87  * never be set to 8 in production; 8 is effectively reserved for testing.
88  * TABLE_BITS>1 selects the lookup-table-driven implementations referred
89  * to as "Shoup's" in the GCM specification. In other words OpenSSL does
90  * not cover the whole spectrum of possible table-driven implementations.
91  * Why? In the non-"Shoup's" case the memory access pattern is segmented
92  * in such a manner that cache-timing information can trivially reveal a
93  * fair portion of the intermediate hash value. Given that the ciphertext
94  * is always available to an attacker, the attacker could attempt to
95  * deduce the secret parameter H and, if successful, tamper with messages
96  * [which is entirely trivial in CTR mode]. In the "Shoup's" case it is
97  * not as easy, but there is no reason to believe that it is resistant
98  * to cache-timing attacks either. As for the "8-bit" implementation, it
99  * consumes 16 (sixteen) times more memory, 4KB per individual key plus
100  * 1KB shared. On the plus side it should be about twice as fast as the
101  * "4-bit" version, and gcc-generated x86[_64] "8-bit" code was observed
102  * to run ~75% faster, closer to 100% with commercial compilers...
103  * Yet the "4-bit" procedure is preferred, because it is believed to
104  * provide a better security-performance balance and adequate all-round
105  * performance. "All-round" refers to things like:
106  *
107  * - shorter setup time effectively improves overall timing for
108  *   handling short messages;
109  * - larger table allocations can become unbearable because of VM
110  *   subsystem penalties (for example on Windows a large enough free()
111  *   results in VM working-set trimming, meaning that a subsequent
112  *   malloc() would immediately incur working-set expansion);
113  * - a larger table has a larger cache footprint, which can affect the
114  *   performance of other code paths (not necessarily even in the same
115  *   thread in a Hyper-Threading world);
116  *
117  * A value of 1 is not appropriate, for performance reasons.
118  */
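/*
 * [Editor's note] The memory figures above follow directly from the table
 * sizes: the "8-bit" variant stores 256 u128 entries (256 * 16 = 4KB) per
 * key plus the shared rem_8bit[256] remainder table, while the "4-bit"
 * variant stores only 16 u128 entries (16 * 16 = 256 bytes) per key plus
 * the small shared rem_4bit[16] table, hence the factor of sixteen.
 */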
119 #if     TABLE_BITS==8
120
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123         int  i, j;
124         u128 V;
125
126         Htable[0].hi = 0;
127         Htable[0].lo = 0;
128         V.hi = H[0];
129         V.lo = H[1];
130
131         for (Htable[128]=V, i=64; i>0; i>>=1) {
132                 REDUCE1BIT(V);
133                 Htable[i] = V;
134         }
135
136         for (i=2; i<256; i<<=1) {
137                 u128 *Hi = Htable+i, H0 = *Hi;
138                 for (j=1; j<i; ++j) {
139                         Hi[j].hi = H0.hi^Htable[j].hi;
140                         Hi[j].lo = H0.lo^Htable[j].lo;
141                 }
142         }
143 }
144
145 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146 {
147         u128 Z = { 0, 0};
148         const u8 *xi = (const u8 *)Xi+15;
149         size_t rem, n = *xi;
150         const union { long one; char little; } is_endian = {1};
151         static const size_t rem_8bit[256] = {
152                 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
153                 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
154                 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
155                 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
156                 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
157                 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
158                 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
159                 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
160                 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
161                 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
162                 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
163                 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
164                 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
165                 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
166                 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
167                 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
168                 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
169                 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
170                 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
171                 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
172                 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
173                 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
174                 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
175                 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
176                 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
177                 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
178                 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
179                 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
180                 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
181                 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
182                 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
183                 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
184                 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
185                 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
186                 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
187                 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
188                 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
189                 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
190                 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
191                 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
192                 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
193                 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
194                 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
195                 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
196                 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
197                 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
198                 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
199                 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
200                 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
201                 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
202                 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
203                 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
204                 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
205                 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
206                 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
207                 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
208                 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
209                 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
210                 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
211                 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
212                 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
213                 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
214                 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
215                 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
216
217         while (1) {
218                 Z.hi ^= Htable[n].hi;
219                 Z.lo ^= Htable[n].lo;
220
221                 if ((u8 *)Xi==xi)       break;
222
223                 n = *(--xi);
224
225                 rem  = (size_t)Z.lo&0xff;
226                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
227                 Z.hi = (Z.hi>>8);
228                 if (sizeof(size_t)==8)
229                         Z.hi ^= rem_8bit[rem];
230                 else
231                         Z.hi ^= (u64)rem_8bit[rem]<<32;
232         }
233
234         if (is_endian.little) {
235 #ifdef BSWAP8
236                 Xi[0] = BSWAP8(Z.hi);
237                 Xi[1] = BSWAP8(Z.lo);
238 #else
239                 u8 *p = (u8 *)Xi;
240                 u32 v;
241                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
242                 v = (u32)(Z.hi);        PUTU32(p+4,v);
243                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
244                 v = (u32)(Z.lo);        PUTU32(p+12,v);
245 #endif
246         }
247         else {
248                 Xi[0] = Z.hi;
249                 Xi[1] = Z.lo;
250         }
251 }
252 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
253
254 #elif   TABLE_BITS==4
255
256 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
257 {
258         u128 V;
259 #if defined(OPENSSL_SMALL_FOOTPRINT)
260         int  i;
261 #endif
262
263         Htable[0].hi = 0;
264         Htable[0].lo = 0;
265         V.hi = H[0];
266         V.lo = H[1];
267
268 #if defined(OPENSSL_SMALL_FOOTPRINT)
269         for (Htable[8]=V, i=4; i>0; i>>=1) {
270                 REDUCE1BIT(V);
271                 Htable[i] = V;
272         }
273
274         for (i=2; i<16; i<<=1) {
275                 u128 *Hi = Htable+i;
276                 int   j;
277                 for (V=*Hi, j=1; j<i; ++j) {
278                         Hi[j].hi = V.hi^Htable[j].hi;
279                         Hi[j].lo = V.lo^Htable[j].lo;
280                 }
281         }
282 #else
283         Htable[8] = V;
284         REDUCE1BIT(V);
285         Htable[4] = V;
286         REDUCE1BIT(V);
287         Htable[2] = V;
288         REDUCE1BIT(V);
289         Htable[1] = V;
290         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
291         V=Htable[4];
292         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
293         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
294         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
295         V=Htable[8];
296         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
297         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
298         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
299         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
300         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
301         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
302         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
303 #endif
304 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
305         /*
306          * ARM assembler expects specific dword order in Htable.
307          */
308         {
309         int j;
310         const union { long one; char little; } is_endian = {1};
311
312         if (is_endian.little)
313                 for (j=0;j<16;++j) {
314                         V = Htable[j];
315                         Htable[j].hi = V.lo;
316                         Htable[j].lo = V.hi;
317                 }
318         else
319                 for (j=0;j<16;++j) {
320                         V = Htable[j];
321                         Htable[j].hi = V.lo<<32|V.lo>>32;
322                         Htable[j].lo = V.hi<<32|V.hi>>32;
323                 }
324         }
325 #endif
326 }
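/*
 * [Editor's note] Ignoring the ARM-specific dword reordering at the end,
 * after gcm_init_4bit() the table satisfies Htable[n] = n*H in GF(2^128)
 * for every 4-bit index n, with the bits of n taken in GCM's reflected
 * order (index 8 holds H itself, 4 holds H*x, 2 holds H*x^2, 1 holds
 * H*x^3, and composite indices are XOR combinations).  gcm_gmult_4bit
 * below can therefore multiply by an arbitrary nibble with a single lookup.
 */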
327
328 #ifndef GHASH_ASM
329 static const size_t rem_4bit[16] = {
330         PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
331         PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
332         PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
333         PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
334
335 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
336 {
337         u128 Z;
338         int cnt = 15;
339         size_t rem, nlo, nhi;
340         const union { long one; char little; } is_endian = {1};
341
342         nlo  = ((const u8 *)Xi)[15];
343         nhi  = nlo>>4;
344         nlo &= 0xf;
345
346         Z.hi = Htable[nlo].hi;
347         Z.lo = Htable[nlo].lo;
348
349         while (1) {
350                 rem  = (size_t)Z.lo&0xf;
351                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
352                 Z.hi = (Z.hi>>4);
353                 if (sizeof(size_t)==8)
354                         Z.hi ^= rem_4bit[rem];
355                 else
356                         Z.hi ^= (u64)rem_4bit[rem]<<32;
357
358                 Z.hi ^= Htable[nhi].hi;
359                 Z.lo ^= Htable[nhi].lo;
360
361                 if (--cnt<0)            break;
362
363                 nlo  = ((const u8 *)Xi)[cnt];
364                 nhi  = nlo>>4;
365                 nlo &= 0xf;
366
367                 rem  = (size_t)Z.lo&0xf;
368                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
369                 Z.hi = (Z.hi>>4);
370                 if (sizeof(size_t)==8)
371                         Z.hi ^= rem_4bit[rem];
372                 else
373                         Z.hi ^= (u64)rem_4bit[rem]<<32;
374
375                 Z.hi ^= Htable[nlo].hi;
376                 Z.lo ^= Htable[nlo].lo;
377         }
378
379         if (is_endian.little) {
380 #ifdef BSWAP8
381                 Xi[0] = BSWAP8(Z.hi);
382                 Xi[1] = BSWAP8(Z.lo);
383 #else
384                 u8 *p = (u8 *)Xi;
385                 u32 v;
386                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
387                 v = (u32)(Z.hi);        PUTU32(p+4,v);
388                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
389                 v = (u32)(Z.lo);        PUTU32(p+12,v);
390 #endif
391         }
392         else {
393                 Xi[0] = Z.hi;
394                 Xi[1] = Z.lo;
395         }
396 }
397
398 #if !defined(OPENSSL_SMALL_FOOTPRINT)
399 /*
400  * Streamed variant of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt
401  * for details... Compiler-generated code doesn't seem to give any
402  * performance improvement, at least not on x86[_64]. It's here
403  * mostly as a reference and a placeholder for possible future
404  * non-trivial optimization[s]...
405  */
406 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
407                                 const u8 *inp,size_t len)
408 {
409     u128 Z;
410     int cnt;
411     size_t rem, nlo, nhi;
412     const union { long one; char little; } is_endian = {1};
413
414 #if 1
415     do {
416         cnt  = 15;
417         nlo  = ((const u8 *)Xi)[15];
418         nlo ^= inp[15];
419         nhi  = nlo>>4;
420         nlo &= 0xf;
421
422         Z.hi = Htable[nlo].hi;
423         Z.lo = Htable[nlo].lo;
424
425         while (1) {
426                 rem  = (size_t)Z.lo&0xf;
427                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
428                 Z.hi = (Z.hi>>4);
429                 if (sizeof(size_t)==8)
430                         Z.hi ^= rem_4bit[rem];
431                 else
432                         Z.hi ^= (u64)rem_4bit[rem]<<32;
433
434                 Z.hi ^= Htable[nhi].hi;
435                 Z.lo ^= Htable[nhi].lo;
436
437                 if (--cnt<0)            break;
438
439                 nlo  = ((const u8 *)Xi)[cnt];
440                 nlo ^= inp[cnt];
441                 nhi  = nlo>>4;
442                 nlo &= 0xf;
443
444                 rem  = (size_t)Z.lo&0xf;
445                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
446                 Z.hi = (Z.hi>>4);
447                 if (sizeof(size_t)==8)
448                         Z.hi ^= rem_4bit[rem];
449                 else
450                         Z.hi ^= (u64)rem_4bit[rem]<<32;
451
452                 Z.hi ^= Htable[nlo].hi;
453                 Z.lo ^= Htable[nlo].lo;
454         }
455 #else
456     /*
457      * An extra 256+16 bytes per key plus a 512-byte shared table
458      * [should] give a ~50% improvement... One could have PACK()-ed
459      * rem_8bit even here, but the priority is to minimize the
460      * cache footprint...
461      */
462     u128 Hshr4[16];     /* Htable shifted right by 4 bits */
463     u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
464     static const unsigned short rem_8bit[256] = {
465         0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
466         0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
467         0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
468         0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
469         0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
470         0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
471         0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
472         0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
473         0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
474         0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
475         0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
476         0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
477         0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
478         0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
479         0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
480         0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
481         0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
482         0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
483         0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
484         0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
485         0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
486         0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
487         0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
488         0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
489         0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
490         0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
491         0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
492         0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
493         0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
494         0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
495         0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
496         0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
497     /*
498      * This pre-processing phase slows the procedure down by roughly as
499      * much as it speeds up each loop iteration. In other words, single-
500      * block performance is about the same as with the straightforward
501      * "4-bit" implementation, and from there it only gets faster...
502      */
503     for (cnt=0; cnt<16; ++cnt) {
504         Z.hi = Htable[cnt].hi;
505         Z.lo = Htable[cnt].lo;
506         Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
507         Hshr4[cnt].hi = (Z.hi>>4);
508         Hshl4[cnt]    = (u8)(Z.lo<<4);
509     }
510
511     do {
512         for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
513                 nlo  = ((const u8 *)Xi)[cnt];
514                 nlo ^= inp[cnt];
515                 nhi  = nlo>>4;
516                 nlo &= 0xf;
517
518                 Z.hi ^= Htable[nlo].hi;
519                 Z.lo ^= Htable[nlo].lo;
520
521                 rem = (size_t)Z.lo&0xff;
522
523                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
524                 Z.hi = (Z.hi>>8);
525
526                 Z.hi ^= Hshr4[nhi].hi;
527                 Z.lo ^= Hshr4[nhi].lo;
528                 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
529         }
530
531         nlo  = ((const u8 *)Xi)[0];
532         nlo ^= inp[0];
533         nhi  = nlo>>4;
534         nlo &= 0xf;
535
536         Z.hi ^= Htable[nlo].hi;
537         Z.lo ^= Htable[nlo].lo;
538
539         rem = (size_t)Z.lo&0xf;
540
541         Z.lo = (Z.hi<<60)|(Z.lo>>4);
542         Z.hi = (Z.hi>>4);
543
544         Z.hi ^= Htable[nhi].hi;
545         Z.lo ^= Htable[nhi].lo;
546         Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
547 #endif
548
549         if (is_endian.little) {
550 #ifdef BSWAP8
551                 Xi[0] = BSWAP8(Z.hi);
552                 Xi[1] = BSWAP8(Z.lo);
553 #else
554                 u8 *p = (u8 *)Xi;
555                 u32 v;
556                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
557                 v = (u32)(Z.hi);        PUTU32(p+4,v);
558                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
559                 v = (u32)(Z.lo);        PUTU32(p+12,v);
560 #endif
561         }
562         else {
563                 Xi[0] = Z.hi;
564                 Xi[1] = Z.lo;
565         }
566     } while (inp+=16, len-=16);
567 }
568 #endif
569 #else
570 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
571 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
572 #endif
573
574 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
575 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
576 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
577 /* GHASH_CHUNK is a "stride parameter" intended to mitigate cache-
578  * thrashing effects. In other words, the idea is to hash data while
579  * it is still in the L1 cache after the encryption pass... */
580 #define GHASH_CHUNK       (3*1024)
581 #endif
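/*
 * [Editor's note] The 3KB figure is presumably a tuning choice rather than
 * a hard requirement: a chunk that small, together with the 256-byte Htable,
 * fits comfortably in a typical L1 data cache, so the GHASH pass over a
 * just-encrypted chunk operates on cache-resident data.
 */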
582
583 #else   /* TABLE_BITS */
584
585 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
586 {
587         u128 V,Z = { 0,0 };
588         long X;
589         int  i,j;
590         const long *xi = (const long *)Xi;
591         const union { long one; char little; } is_endian = {1};
592
593         V.hi = H[0];    /* H is in host byte order, no byte swapping */
594         V.lo = H[1];
595
596         for (j=0; j<16/sizeof(long); ++j) {
597                 if (is_endian.little) {
598                         if (sizeof(long)==8) {
599 #ifdef BSWAP8
600                                 X = (long)(BSWAP8(xi[j]));
601 #else
602                                 const u8 *p = (const u8 *)(xi+j);
603                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
604 #endif
605                         }
606                         else {
607                                 const u8 *p = (const u8 *)(xi+j);
608                                 X = (long)GETU32(p);
609                         }
610                 }
611                 else
612                         X = xi[j];
613
614                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
615                         u64 M = (u64)(X>>(8*sizeof(long)-1));
616                         Z.hi ^= V.hi&M;
617                         Z.lo ^= V.lo&M;
618
619                         REDUCE1BIT(V);
620                 }
621         }
622
623         if (is_endian.little) {
624 #ifdef BSWAP8
625                 Xi[0] = BSWAP8(Z.hi);
626                 Xi[1] = BSWAP8(Z.lo);
627 #else
628                 u8 *p = (u8 *)Xi;
629                 u32 v;
630                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
631                 v = (u32)(Z.hi);        PUTU32(p+4,v);
632                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
633                 v = (u32)(Z.lo);        PUTU32(p+12,v);
634 #endif
635         }
636         else {
637                 Xi[0] = Z.hi;
638                 Xi[1] = Z.lo;
639         }
640 }
641 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
642
643 #endif
644
645 #if     TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
646 # if    !defined(I386_ONLY) && \
647         (defined(__i386)        || defined(__i386__)    || \
648          defined(__x86_64)      || defined(__x86_64__)  || \
649          defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
650 #  define GHASH_ASM_X86_OR_64
651 #  define GCM_FUNCREF_4BIT
652 extern unsigned int OPENSSL_ia32cap_P[2];
653
654 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
655 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
656 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
657
658 #if defined(__i386) || defined(__i386__) || defined(_M_IX86)
659 # define gcm_init_avx   gcm_init_clmul
660 # define gcm_gmult_avx  gcm_gmult_clmul
661 # define gcm_ghash_avx  gcm_ghash_clmul
662 #else
663 void gcm_init_avx(u128 Htable[16],const u64 Xi[2]);
664 void gcm_gmult_avx(u64 Xi[2],const u128 Htable[16]);
665 void gcm_ghash_avx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
666 #endif
667
668 #  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
669 #   define GHASH_ASM_X86
670 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
671 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
672
673 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
674 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
675 #  endif
676 # elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
677 #  include "arm_arch.h"
678 #  if __ARM_ARCH__>=7
679 #   define GHASH_ASM_ARM
680 #   define GCM_FUNCREF_4BIT
681 #   define PMULL_CAPABLE        (OPENSSL_armcap_P & ARMV8_PMULL)
682 #   if defined(__arm__) || defined(__arm)
683 #    define NEON_CAPABLE        (OPENSSL_armcap_P & ARMV7_NEON)
684 #   endif
685 void gcm_init_neon(u128 Htable[16],const u64 Xi[2]);
686 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
687 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
688 void gcm_init_v8(u128 Htable[16],const u64 Xi[2]);
689 void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
690 void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
691 #  endif
692 # elif defined(__sparc__) || defined(__sparc)
693 #  include "sparc_arch.h"
694 #  define GHASH_ASM_SPARC
695 #  define GCM_FUNCREF_4BIT
696 extern unsigned int OPENSSL_sparcv9cap_P[];
697 void gcm_init_vis3(u128 Htable[16],const u64 Xi[2]);
698 void gcm_gmult_vis3(u64 Xi[2],const u128 Htable[16]);
699 void gcm_ghash_vis3(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
700 # endif
701 #endif
702
703 #ifdef GCM_FUNCREF_4BIT
704 # undef  GCM_MUL
705 # define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
706 # ifdef GHASH
707 #  undef  GHASH
708 #  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
709 # endif
710 #endif
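/*
 * [Editor's note] With GCM_FUNCREF_4BIT defined, GCM_MUL and GHASH dispatch
 * through the local function pointers gcm_gmult_p / gcm_ghash_p, which each
 * public entry point below copies out of ctx->gmult / ctx->ghash.  This lets
 * CRYPTO_gcm128_init() select a CPU-specific implementation once, at key
 * setup, without per-block capability checks.
 */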
711
712 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
713 {
714         const union { long one; char little; } is_endian = {1};
715
716         memset(ctx,0,sizeof(*ctx));
717         ctx->block = block;
718         ctx->key   = key;
719
720         (*block)(ctx->H.c,ctx->H.c,key);
721
722         if (is_endian.little) {
723                 /* H is stored in host byte order */
724 #ifdef BSWAP8
725                 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
726                 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
727 #else
728                 u8 *p = ctx->H.c;
729                 u64 hi,lo;
730                 hi = (u64)GETU32(p)  <<32|GETU32(p+4);
731                 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
732                 ctx->H.u[0] = hi;
733                 ctx->H.u[1] = lo;
734 #endif
735         }
736
737 #if     TABLE_BITS==8
738         gcm_init_8bit(ctx->Htable,ctx->H.u);
739 #elif   TABLE_BITS==4
740 # if    defined(GHASH_ASM_X86_OR_64)
741 #  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
742         if (OPENSSL_ia32cap_P[0]&(1<<24) &&     /* check FXSR bit */
743             OPENSSL_ia32cap_P[1]&(1<<1) ) {     /* check PCLMULQDQ bit */
744                 if (((OPENSSL_ia32cap_P[1]>>22)&0x41)==0x41) {  /* AVX+MOVBE */
745                         gcm_init_avx(ctx->Htable,ctx->H.u);
746                         ctx->gmult = gcm_gmult_avx;
747                         ctx->ghash = gcm_ghash_avx;
748                 } else {
749                         gcm_init_clmul(ctx->Htable,ctx->H.u);
750                         ctx->gmult = gcm_gmult_clmul;
751                         ctx->ghash = gcm_ghash_clmul;
752                 }
753                 return;
754         }
755 #  endif
756         gcm_init_4bit(ctx->Htable,ctx->H.u);
757 #  if   defined(GHASH_ASM_X86)                  /* x86 only */
758 #   if  defined(OPENSSL_IA32_SSE2)
759         if (OPENSSL_ia32cap_P[0]&(1<<25)) {     /* check SSE bit */
760 #   else
761         if (OPENSSL_ia32cap_P[0]&(1<<23)) {     /* check MMX bit */
762 #   endif
763                 ctx->gmult = gcm_gmult_4bit_mmx;
764                 ctx->ghash = gcm_ghash_4bit_mmx;
765         } else {
766                 ctx->gmult = gcm_gmult_4bit_x86;
767                 ctx->ghash = gcm_ghash_4bit_x86;
768         }
769 #  else
770         ctx->gmult = gcm_gmult_4bit;
771         ctx->ghash = gcm_ghash_4bit;
772 #  endif
773 # elif  defined(GHASH_ASM_ARM)
774 #  ifdef PMULL_CAPABLE
775         if (PMULL_CAPABLE) {
776                 gcm_init_v8(ctx->Htable,ctx->H.u);
777                 ctx->gmult = gcm_gmult_v8;
778                 ctx->ghash = gcm_ghash_v8;
779         } else
780 #  endif
781 #  ifdef NEON_CAPABLE
782         if (NEON_CAPABLE) {
783                 gcm_init_neon(ctx->Htable,ctx->H.u);
784                 ctx->gmult = gcm_gmult_neon;
785                 ctx->ghash = gcm_ghash_neon;
786         } else
787 #  endif
788         {
789                 gcm_init_4bit(ctx->Htable,ctx->H.u);
790                 ctx->gmult = gcm_gmult_4bit;
791                 ctx->ghash = gcm_ghash_4bit;
792         }
793 # elif  defined(GHASH_ASM_SPARC)
794         if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
795                 gcm_init_vis3(ctx->Htable,ctx->H.u);
796                 ctx->gmult = gcm_gmult_vis3;
797                 ctx->ghash = gcm_ghash_vis3;
798         } else {
799                 gcm_init_4bit(ctx->Htable,ctx->H.u);
800                 ctx->gmult = gcm_gmult_4bit;
801                 ctx->ghash = gcm_ghash_4bit;
802         }
803 # else
804         gcm_init_4bit(ctx->Htable,ctx->H.u);
805 # endif
806 #endif
807 }
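/*
 * [Editor's note] An illustrative caller sketch, not part of the original
 * file.  It assumes AES from <openssl/aes.h> as the block cipher and that
 * the tag routines declared in modes.h (CRYPTO_gcm128_tag,
 * CRYPTO_gcm128_finish) are available; key/iv/aad/pt/ct and their lengths
 * are placeholders, and error handling is omitted.
 *
 *      AES_KEY ks;
 *      GCM128_CONTEXT gcm;
 *      unsigned char tag[16];
 *
 *      AES_set_encrypt_key(key, 128, &ks);
 *      CRYPTO_gcm128_init(&gcm, &ks, (block128_f)AES_encrypt);
 *      CRYPTO_gcm128_setiv(&gcm, iv, 12);
 *      CRYPTO_gcm128_aad(&gcm, aad, aad_len);
 *      CRYPTO_gcm128_encrypt(&gcm, pt, ct, pt_len);
 *      CRYPTO_gcm128_tag(&gcm, tag, sizeof(tag));
 */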
808
809 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
810 {
811         const union { long one; char little; } is_endian = {1};
812         unsigned int ctr;
813 #ifdef GCM_FUNCREF_4BIT
814         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
815 #endif
816
817         ctx->Yi.u[0]  = 0;
818         ctx->Yi.u[1]  = 0;
819         ctx->Xi.u[0]  = 0;
820         ctx->Xi.u[1]  = 0;
821         ctx->len.u[0] = 0;      /* AAD length */
822         ctx->len.u[1] = 0;      /* message length */
823         ctx->ares = 0;
824         ctx->mres = 0;
825
826         if (len==12) {
827                 memcpy(ctx->Yi.c,iv,12);
828                 ctx->Yi.c[15]=1;
829                 ctr=1;
830         }
831         else {
832                 size_t i;
833                 u64 len0 = len;
834
835                 while (len>=16) {
836                         for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
837                         GCM_MUL(ctx,Yi);
838                         iv += 16;
839                         len -= 16;
840                 }
841                 if (len) {
842                         for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
843                         GCM_MUL(ctx,Yi);
844                 }
845                 len0 <<= 3;
846                 if (is_endian.little) {
847 #ifdef BSWAP8
848                         ctx->Yi.u[1]  ^= BSWAP8(len0);
849 #else
850                         ctx->Yi.c[8]  ^= (u8)(len0>>56);
851                         ctx->Yi.c[9]  ^= (u8)(len0>>48);
852                         ctx->Yi.c[10] ^= (u8)(len0>>40);
853                         ctx->Yi.c[11] ^= (u8)(len0>>32);
854                         ctx->Yi.c[12] ^= (u8)(len0>>24);
855                         ctx->Yi.c[13] ^= (u8)(len0>>16);
856                         ctx->Yi.c[14] ^= (u8)(len0>>8);
857                         ctx->Yi.c[15] ^= (u8)(len0);
858 #endif
859                 }
860                 else
861                         ctx->Yi.u[1]  ^= len0;
862
863                 GCM_MUL(ctx,Yi);
864
865                 if (is_endian.little)
866 #ifdef BSWAP4
867                         ctr = BSWAP4(ctx->Yi.d[3]);
868 #else
869                         ctr = GETU32(ctx->Yi.c+12);
870 #endif
871                 else
872                         ctr = ctx->Yi.d[3];
873         }
874
875         (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
876         ++ctr;
877         if (is_endian.little)
878 #ifdef BSWAP4
879                 ctx->Yi.d[3] = BSWAP4(ctr);
880 #else
881                 PUTU32(ctx->Yi.c+12,ctr);
882 #endif
883         else
884                 ctx->Yi.d[3] = ctr;
885 }
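/*
 * [Editor's note] This follows the two IV cases of the GCM specification:
 * a 96-bit IV is used directly as Y0 = IV || 0^31 || 1, while any other
 * length is absorbed through GHASH, zero-padded and finished with the
 * 64-bit IV bit-length, before the initial counter is read back out of Yi.
 */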
886
887 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
888 {
889         size_t i;
890         unsigned int n;
891         u64 alen = ctx->len.u[0];
892 #ifdef GCM_FUNCREF_4BIT
893         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
894 # ifdef GHASH
895         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
896                                 const u8 *inp,size_t len)       = ctx->ghash;
897 # endif
898 #endif
899
900         if (ctx->len.u[1]) return -2;
901
902         alen += len;
903         if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
904                 return -1;
905         ctx->len.u[0] = alen;
906
907         n = ctx->ares;
908         if (n) {
909                 while (n && len) {
910                         ctx->Xi.c[n] ^= *(aad++);
911                         --len;
912                         n = (n+1)%16;
913                 }
914                 if (n==0) GCM_MUL(ctx,Xi);
915                 else {
916                         ctx->ares = n;
917                         return 0;
918                 }
919         }
920
921 #ifdef GHASH
922         if ((i = (len&(size_t)-16))) {
923                 GHASH(ctx,aad,i);
924                 aad += i;
925                 len -= i;
926         }
927 #else
928         while (len>=16) {
929                 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
930                 GCM_MUL(ctx,Xi);
931                 aad += 16;
932                 len -= 16;
933         }
934 #endif
935         if (len) {
936                 n = (unsigned int)len;
937                 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
938         }
939
940         ctx->ares = n;
941         return 0;
942 }
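/*
 * [Editor's note] The -2 return enforces GCM's ordering requirement: all AAD
 * must be supplied before the first encrypt/decrypt call (ctx->len.u[1] is
 * non-zero once message data has been processed).  The -1 return enforces
 * the 2^61-byte (2^64-bit) AAD length limit, with the second clause catching
 * 64-bit overflow of alen.
 */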
943
944 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
945                 const unsigned char *in, unsigned char *out,
946                 size_t len)
947 {
948         const union { long one; char little; } is_endian = {1};
949         unsigned int n, ctr;
950         size_t i;
951         u64        mlen  = ctx->len.u[1];
952         block128_f block = ctx->block;
953         void      *key   = ctx->key;
954 #ifdef GCM_FUNCREF_4BIT
955         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
956 # ifdef GHASH
957         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
958                                 const u8 *inp,size_t len)       = ctx->ghash;
959 # endif
960 #endif
961
962 #if 0
963         n = (unsigned int)mlen%16; /* alternative to ctx->mres */
964 #endif
965         mlen += len;
966         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
967                 return -1;
968         ctx->len.u[1] = mlen;
969
970         if (ctx->ares) {
971                 /* First call to encrypt finalizes GHASH(AAD) */
972                 GCM_MUL(ctx,Xi);
973                 ctx->ares = 0;
974         }
975
976         if (is_endian.little)
977 #ifdef BSWAP4
978                 ctr = BSWAP4(ctx->Yi.d[3]);
979 #else
980                 ctr = GETU32(ctx->Yi.c+12);
981 #endif
982         else
983                 ctr = ctx->Yi.d[3];
984
985         n = ctx->mres;
986 #if !defined(OPENSSL_SMALL_FOOTPRINT)
987         if (16%sizeof(size_t) == 0) do {        /* always true actually */
988                 if (n) {
989                         while (n && len) {
990                                 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
991                                 --len;
992                                 n = (n+1)%16;
993                         }
994                         if (n==0) GCM_MUL(ctx,Xi);
995                         else {
996                                 ctx->mres = n;
997                                 return 0;
998                         }
999                 }
1000 #if defined(STRICT_ALIGNMENT)
1001                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1002                         break;
1003 #endif
1004 #if defined(GHASH) && defined(GHASH_CHUNK)
1005                 while (len>=GHASH_CHUNK) {
1006                     size_t j=GHASH_CHUNK;
1007
1008                     while (j) {
1009                         size_t *out_t=(size_t *)out;
1010                         const size_t *in_t=(const size_t *)in;
1011
1012                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1013                         ++ctr;
1014                         if (is_endian.little)
1015 #ifdef BSWAP4
1016                                 ctx->Yi.d[3] = BSWAP4(ctr);
1017 #else
1018                                 PUTU32(ctx->Yi.c+12,ctr);
1019 #endif
1020                         else
1021                                 ctx->Yi.d[3] = ctr;
1022                         for (i=0; i<16/sizeof(size_t); ++i)
1023                                 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1024                         out += 16;
1025                         in  += 16;
1026                         j   -= 16;
1027                     }
1028                     GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
1029                     len -= GHASH_CHUNK;
1030                 }
1031                 if ((i = (len&(size_t)-16))) {
1032                     size_t j=i;
1033
1034                     while (len>=16) {
1035                         size_t *out_t=(size_t *)out;
1036                         const size_t *in_t=(const size_t *)in;
1037
1038                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1039                         ++ctr;
1040                         if (is_endian.little)
1041 #ifdef BSWAP4
1042                                 ctx->Yi.d[3] = BSWAP4(ctr);
1043 #else
1044                                 PUTU32(ctx->Yi.c+12,ctr);
1045 #endif
1046                         else
1047                                 ctx->Yi.d[3] = ctr;
1048                         for (i=0; i<16/sizeof(size_t); ++i)
1049                                 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1050                         out += 16;
1051                         in  += 16;
1052                         len -= 16;
1053                     }
1054                     GHASH(ctx,out-j,j);
1055                 }
1056 #else
1057                 while (len>=16) {
1058                         size_t *out_t=(size_t *)out;
1059                         const size_t *in_t=(const size_t *)in;
1060
1061                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1062                         ++ctr;
1063                         if (is_endian.little)
1064 #ifdef BSWAP4
1065                                 ctx->Yi.d[3] = BSWAP4(ctr);
1066 #else
1067                                 PUTU32(ctx->Yi.c+12,ctr);
1068 #endif
1069                         else
1070                                 ctx->Yi.d[3] = ctr;
1071                         for (i=0; i<16/sizeof(size_t); ++i)
1072                                 ctx->Xi.t[i] ^=
1073                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1074                         GCM_MUL(ctx,Xi);
1075                         out += 16;
1076                         in  += 16;
1077                         len -= 16;
1078                 }
1079 #endif
1080                 if (len) {
1081                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1082                         ++ctr;
1083                         if (is_endian.little)
1084 #ifdef BSWAP4
1085                                 ctx->Yi.d[3] = BSWAP4(ctr);
1086 #else
1087                                 PUTU32(ctx->Yi.c+12,ctr);
1088 #endif
1089                         else
1090                                 ctx->Yi.d[3] = ctr;
1091                         while (len--) {
1092                                 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1093                                 ++n;
1094                         }
1095                 }
1096
1097                 ctx->mres = n;
1098                 return 0;
1099         } while(0);
1100 #endif
1101         for (i=0;i<len;++i) {
1102                 if (n==0) {
1103                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1104                         ++ctr;
1105                         if (is_endian.little)
1106 #ifdef BSWAP4
1107                                 ctx->Yi.d[3] = BSWAP4(ctr);
1108 #else
1109                                 PUTU32(ctx->Yi.c+12,ctr);
1110 #endif
1111                         else
1112                                 ctx->Yi.d[3] = ctr;
1113                 }
1114                 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1115                 n = (n+1)%16;
1116                 if (n==0)
1117                         GCM_MUL(ctx,Xi);
1118         }
1119
1120         ctx->mres = n;
1121         return 0;
1122 }
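/*
 * [Editor's note] The (2^36 - 32)-byte cap corresponds to the GCM limit of
 * 2^39 - 256 plaintext bits per invocation.  Because ctx->mres carries the
 * partial-block state across calls, a caller may stream a long message
 * through repeated CRYPTO_gcm128_encrypt() calls of arbitrary size; with a
 * hypothetical data source next_chunk(), for instance:
 *
 *      while ((n = next_chunk(buf, sizeof(buf))) > 0)
 *              CRYPTO_gcm128_encrypt(&gcm, buf, buf, n);
 */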
1123
1124 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1125                 const unsigned char *in, unsigned char *out,
1126                 size_t len)
1127 {
1128         const union { long one; char little; } is_endian = {1};
1129         unsigned int n, ctr;
1130         size_t i;
1131         u64        mlen  = ctx->len.u[1];
1132         block128_f block = ctx->block;
1133         void      *key   = ctx->key;
1134 #ifdef GCM_FUNCREF_4BIT
1135         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1136 # ifdef GHASH
1137         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1138                                 const u8 *inp,size_t len)       = ctx->ghash;
1139 # endif
1140 #endif
1141
1142         mlen += len;
1143         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1144                 return -1;
1145         ctx->len.u[1] = mlen;
1146
1147         if (ctx->ares) {
1148                 /* First call to decrypt finalizes GHASH(AAD) */
1149                 GCM_MUL(ctx,Xi);
1150                 ctx->ares = 0;
1151         }
1152
1153         if (is_endian.little)
1154 #ifdef BSWAP4
1155                 ctr = BSWAP4(ctx->Yi.d[3]);
1156 #else
1157                 ctr = GETU32(ctx->Yi.c+12);
1158 #endif
1159         else
1160                 ctr = ctx->Yi.d[3];
1161
1162         n = ctx->mres;
1163 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1164         if (16%sizeof(size_t) == 0) do {        /* always true actually */
1165                 if (n) {
1166                         while (n && len) {
1167                                 u8 c = *(in++);
1168                                 *(out++) = c^ctx->EKi.c[n];
1169                                 ctx->Xi.c[n] ^= c;
1170                                 --len;
1171                                 n = (n+1)%16;
1172                         }
1173                         if (n==0) GCM_MUL (ctx,Xi);
1174                         else {
1175                                 ctx->mres = n;
1176                                 return 0;
1177                         }
1178                 }
1179 #if defined(STRICT_ALIGNMENT)
1180                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1181                         break;
1182 #endif
1183 #if defined(GHASH) && defined(GHASH_CHUNK)
1184                 while (len>=GHASH_CHUNK) {
1185                     size_t j=GHASH_CHUNK;
1186
1187                     GHASH(ctx,in,GHASH_CHUNK);
1188                     while (j) {
1189                         size_t *out_t=(size_t *)out;
1190                         const size_t *in_t=(const size_t *)in;
1191
1192                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1193                         ++ctr;
1194                         if (is_endian.little)
1195 #ifdef BSWAP4
1196                                 ctx->Yi.d[3] = BSWAP4(ctr);
1197 #else
1198                                 PUTU32(ctx->Yi.c+12,ctr);
1199 #endif
1200                         else
1201                                 ctx->Yi.d[3] = ctr;
1202                         for (i=0; i<16/sizeof(size_t); ++i)
1203                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1204                         out += 16;
1205                         in  += 16;
1206                         j   -= 16;
1207                     }
1208                     len -= GHASH_CHUNK;
1209                 }
1210                 if ((i = (len&(size_t)-16))) {
1211                     GHASH(ctx,in,i);
1212                     while (len>=16) {
1213                         size_t *out_t=(size_t *)out;
1214                         const size_t *in_t=(const size_t *)in;
1215
1216                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1217                         ++ctr;
1218                         if (is_endian.little)
1219 #ifdef BSWAP4
1220                                 ctx->Yi.d[3] = BSWAP4(ctr);
1221 #else
1222                                 PUTU32(ctx->Yi.c+12,ctr);
1223 #endif
1224                         else
1225                                 ctx->Yi.d[3] = ctr;
1226                         for (i=0; i<16/sizeof(size_t); ++i)
1227                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1228                         out += 16;
1229                         in  += 16;
1230                         len -= 16;
1231                     }
1232                 }
1233 #else
1234                 while (len>=16) {
1235                         size_t *out_t=(size_t *)out;
1236                         const size_t *in_t=(const size_t *)in;
1237
1238                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1239                         ++ctr;
1240                         if (is_endian.little)
1241 #ifdef BSWAP4
1242                                 ctx->Yi.d[3] = BSWAP4(ctr);
1243 #else
1244                                 PUTU32(ctx->Yi.c+12,ctr);
1245 #endif
1246                         else
1247                                 ctx->Yi.d[3] = ctr;
1248                         for (i=0; i<16/sizeof(size_t); ++i) {
1249                                 size_t c = in[i];
1250                                 out[i] = c^ctx->EKi.t[i];
1251                                 ctx->Xi.t[i] ^= c;
1252                         }
1253                         GCM_MUL(ctx,Xi);
1254                         out += 16;
1255                         in  += 16;
1256                         len -= 16;
1257                 }
1258 #endif
1259                 if (len) {
1260                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1261                         ++ctr;
1262                         if (is_endian.little)
1263 #ifdef BSWAP4
1264                                 ctx->Yi.d[3] = BSWAP4(ctr);
1265 #else
1266                                 PUTU32(ctx->Yi.c+12,ctr);
1267 #endif
1268                         else
1269                                 ctx->Yi.d[3] = ctr;
1270                         while (len--) {
1271                                 u8 c = in[n];
1272                                 ctx->Xi.c[n] ^= c;
1273                                 out[n] = c^ctx->EKi.c[n];
1274                                 ++n;
1275                         }
1276                 }
1277
1278                 ctx->mres = n;
1279                 return 0;
1280         } while(0);
1281 #endif
1282         for (i=0;i<len;++i) {
1283                 u8 c;
1284                 if (n==0) {
1285                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1286                         ++ctr;
1287                         if (is_endian.little)
1288 #ifdef BSWAP4
1289                                 ctx->Yi.d[3] = BSWAP4(ctr);
1290 #else
1291                                 PUTU32(ctx->Yi.c+12,ctr);
1292 #endif
1293                         else
1294                                 ctx->Yi.d[3] = ctr;
1295                 }
1296                 c = in[i];
1297                 out[i] = c^ctx->EKi.c[n];
1298                 ctx->Xi.c[n] ^= c;
1299                 n = (n+1)%16;
1300                 if (n==0)
1301                         GCM_MUL(ctx,Xi);
1302         }
1303
1304         ctx->mres = n;
1305         return 0;
1306 }
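/*
 * [Editor's note] Decryption mirrors encryption except for the GHASH input:
 * here the ciphertext (in) is hashed before or as it is decrypted, whereas
 * the encrypt path hashes its output.  Either way the authenticator is
 * always computed over ciphertext, as GCM requires.
 */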
1307
1308 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1309                 const unsigned char *in, unsigned char *out,
1310                 size_t len, ctr128_f stream)
1311 {
1312         const union { long one; char little; } is_endian = {1};
1313         unsigned int n, ctr;
1314         size_t i;
1315         u64   mlen = ctx->len.u[1];
1316         void *key  = ctx->key;
1317 #ifdef GCM_FUNCREF_4BIT
1318         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1319 # ifdef GHASH
1320         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1321                                 const u8 *inp,size_t len)       = ctx->ghash;
1322 # endif
1323 #endif
1324
1325         mlen += len;
1326         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1327                 return -1;
1328         ctx->len.u[1] = mlen;
1329
1330         if (ctx->ares) {
1331                 /* First call to encrypt finalizes GHASH(AAD) */
1332                 GCM_MUL(ctx,Xi);
1333                 ctx->ares = 0;
1334         }
1335
1336         if (is_endian.little)
1337 #ifdef BSWAP4
1338                 ctr = BSWAP4(ctx->Yi.d[3]);
1339 #else
1340                 ctr = GETU32(ctx->Yi.c+12);
1341 #endif
1342         else
1343                 ctr = ctx->Yi.d[3];
1344
1345         n = ctx->mres;
1346         if (n) {
1347                 while (n && len) {
1348                         ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1349                         --len;
1350                         n = (n+1)%16;
1351                 }
1352                 if (n==0) GCM_MUL(ctx,Xi);
1353                 else {
1354                         ctx->mres = n;
1355                         return 0;
1356                 }
1357         }
1358 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1359         while (len>=GHASH_CHUNK) {
1360                 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1361                 ctr += GHASH_CHUNK/16;
1362                 if (is_endian.little)
1363 #ifdef BSWAP4
1364                         ctx->Yi.d[3] = BSWAP4(ctr);
1365 #else
1366                         PUTU32(ctx->Yi.c+12,ctr);
1367 #endif
1368                 else
1369                         ctx->Yi.d[3] = ctr;
1370                 GHASH(ctx,out,GHASH_CHUNK);
1371                 out += GHASH_CHUNK;
1372                 in  += GHASH_CHUNK;
1373                 len -= GHASH_CHUNK;
1374         }
1375 #endif
1376         if ((i = (len&(size_t)-16))) {
1377                 size_t j=i/16;
1378
1379                 (*stream)(in,out,j,key,ctx->Yi.c);
1380                 ctr += (unsigned int)j;
1381                 if (is_endian.little)
1382 #ifdef BSWAP4
1383                         ctx->Yi.d[3] = BSWAP4(ctr);
1384 #else
1385                         PUTU32(ctx->Yi.c+12,ctr);
1386 #endif
1387                 else
1388                         ctx->Yi.d[3] = ctr;
1389                 in  += i;
1390                 len -= i;
1391 #if defined(GHASH)
1392                 GHASH(ctx,out,i);
1393                 out += i;
1394 #else
1395                 while (j--) {
1396                         for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1397                         GCM_MUL(ctx,Xi);
1398                         out += 16;
1399                 }
1400 #endif
1401         }
1402         if (len) {
1403                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1404                 ++ctr;
1405                 if (is_endian.little)
1406 #ifdef BSWAP4
1407                         ctx->Yi.d[3] = BSWAP4(ctr);
1408 #else
1409                         PUTU32(ctx->Yi.c+12,ctr);
1410 #endif
1411                 else
1412                         ctx->Yi.d[3] = ctr;
1413                 while (len--) {
1414                         ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1415                         ++n;
1416                 }
1417         }
1418
1419         ctx->mres = n;
1420         return 0;
1421 }
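/*
 * [Editor's note] The ctr128_f "stream" argument lets a platform-specific
 * counter-mode routine (for example AES-NI's aesni_ctr32_encrypt_blocks, as
 * wired up in e_aes.c) process whole 16-byte blocks in bulk; this wrapper
 * only manages the counter in Yi, the GHASH accumulation and any trailing
 * partial block.
 */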
1422
1423 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1424                 const unsigned char *in, unsigned char *out,
1425                 size_t len,ctr128_f stream)
1426 {
1427         const union { long one; char little; } is_endian = {1};
1428         unsigned int n, ctr;
1429         size_t i;
1430         u64   mlen = ctx->len.u[1];
1431         void *key  = ctx->key;
1432 #ifdef GCM_FUNCREF_4BIT
1433         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1434 # ifdef GHASH
1435         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1436                                 const u8 *inp,size_t len)       = ctx->ghash;
1437 # endif
1438 #endif
1439
1440         mlen += len;
1441         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1442                 return -1;
1443         ctx->len.u[1] = mlen;
1444
1445         if (ctx->ares) {
1446                 /* First call to decrypt finalizes GHASH(AAD) */
1447                 GCM_MUL(ctx,Xi);
1448                 ctx->ares = 0;
1449         }
1450
1451         if (is_endian.little)
1452 #ifdef BSWAP4
1453                 ctr = BSWAP4(ctx->Yi.d[3]);
1454 #else
1455                 ctr = GETU32(ctx->Yi.c+12);
1456 #endif
1457         else
1458                 ctr = ctx->Yi.d[3];
1459
1460         n = ctx->mres;
1461         if (n) {
1462                 while (n && len) {
1463                         u8 c = *(in++);
1464                         *(out++) = c^ctx->EKi.c[n];
1465                         ctx->Xi.c[n] ^= c;
1466                         --len;
1467                         n = (n+1)%16;
1468                 }
1469                 if (n==0) GCM_MUL(ctx,Xi);
1470                 else {
1471                         ctx->mres = n;
1472                         return 0;
1473                 }
1474         }
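        /*
         * Decrypt bulk path: hash the incoming ciphertext first, then decrypt
         * it with the ctr32 stream routine (GHASH is always computed over
         * ciphertext, so the order is reversed relative to encryption).
         */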
1475 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1476         while (len>=GHASH_CHUNK) {
1477                 GHASH(ctx,in,GHASH_CHUNK);
1478                 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1479                 ctr += GHASH_CHUNK/16;
1480                 if (is_endian.little)
1481 #ifdef BSWAP4
1482                         ctx->Yi.d[3] = BSWAP4(ctr);
1483 #else
1484                         PUTU32(ctx->Yi.c+12,ctr);
1485 #endif
1486                 else
1487                         ctx->Yi.d[3] = ctr;
1488                 out += GHASH_CHUNK;
1489                 in  += GHASH_CHUNK;
1490                 len -= GHASH_CHUNK;
1491         }
1492 #endif
1493         if ((i = (len&(size_t)-16))) {
1494                 size_t j=i/16;
1495
1496 #if defined(GHASH)
1497                 GHASH(ctx,in,i);
1498 #else
1499                 while (j--) {
1500                         size_t k;
1501                         for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1502                         GCM_MUL(ctx,Xi);
1503                         in += 16;
1504                 }
1505                 j   = i/16;
1506                 in -= i;
1507 #endif
1508                 (*stream)(in,out,j,key,ctx->Yi.c);
1509                 ctr += (unsigned int)j;
1510                 if (is_endian.little)
1511 #ifdef BSWAP4
1512                         ctx->Yi.d[3] = BSWAP4(ctr);
1513 #else
1514                         PUTU32(ctx->Yi.c+12,ctr);
1515 #endif
1516                 else
1517                         ctx->Yi.d[3] = ctr;
1518                 out += i;
1519                 in  += i;
1520                 len -= i;
1521         }
1522         if (len) {
1523                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1524                 ++ctr;
1525                 if (is_endian.little)
1526 #ifdef BSWAP4
1527                         ctx->Yi.d[3] = BSWAP4(ctr);
1528 #else
1529                         PUTU32(ctx->Yi.c+12,ctr);
1530 #endif
1531                 else
1532                         ctx->Yi.d[3] = ctr;
1533                 while (len--) {
1534                         u8 c = in[n];
1535                         ctx->Xi.c[n] ^= c;
1536                         out[n] = c^ctx->EKi.c[n];
1537                         ++n;
1538                 }
1539         }
1540
1541         ctx->mres = n;
1542         return 0;
1543 }
1544
1545 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1546                         size_t len)
1547 {
1548         const union { long one; char little; } is_endian = {1};
1549         u64 alen = ctx->len.u[0]<<3;
1550         u64 clen = ctx->len.u[1]<<3;
1551 #ifdef GCM_FUNCREF_4BIT
1552         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1553 #endif
1554
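        /*
         * Flush any pending partial block, fold the bit lengths of the AAD
         * and ciphertext into the hash, then mask with E(K,Y0) to form the tag.
         */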
1555         if (ctx->mres || ctx->ares)
1556                 GCM_MUL(ctx,Xi);
1557
1558         if (is_endian.little) {
1559 #ifdef BSWAP8
1560                 alen = BSWAP8(alen);
1561                 clen = BSWAP8(clen);
1562 #else
1563                 u8 *p = ctx->len.c;
1564
1565                 ctx->len.u[0] = alen;
1566                 ctx->len.u[1] = clen;
1567
1568                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1569                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1570 #endif
1571         }
1572
1573         ctx->Xi.u[0] ^= alen;
1574         ctx->Xi.u[1] ^= clen;
1575         GCM_MUL(ctx,Xi);
1576
1577         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1578         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1579
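        /*
         * Note: the comparison below uses plain memcmp(), which is not
         * guaranteed to run in constant time.
         */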
1580         if (tag && len<=sizeof(ctx->Xi))
1581                 return memcmp(ctx->Xi.c,tag,len);
1582         else
1583                 return -1;
1584 }
1585
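/* Same finalization as CRYPTO_gcm128_finish(), but the computed tag is
 * copied out (at most 16 bytes) instead of being compared. */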
1586 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1587 {
1588         CRYPTO_gcm128_finish(ctx, NULL, 0);
1589         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1590 }
1591
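/* Allocate a GCM128_CONTEXT and initialize it for the given key and block
 * cipher; returns NULL on allocation failure. */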
1592 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1593 {
1594         GCM128_CONTEXT *ret;
1595
1596         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1597                 CRYPTO_gcm128_init(ret,key,block);
1598
1599         return ret;
1600 }
1601
1602 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1603 {
1604         if (ctx) {
1605                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1606                 OPENSSL_free(ctx);
1607         }
1608 }
1609
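/*
 * A minimal usage sketch of the functions above (assumptions: a caller-prepared
 * AES_KEY named "aes", a 12-byte IV "iv", buffers "aad", "pt", "ct", "tag" with
 * lengths "aad_len"/"pt_len"; <openssl/aes.h> and error handling omitted):
 *
 *      GCM128_CONTEXT *gcm = CRYPTO_gcm128_new(&aes, (block128_f)AES_encrypt);
 *      CRYPTO_gcm128_setiv(gcm, iv, 12);
 *      CRYPTO_gcm128_aad(gcm, aad, aad_len);
 *      CRYPTO_gcm128_encrypt(gcm, pt, ct, pt_len);
 *      CRYPTO_gcm128_tag(gcm, tag, 16);
 *      CRYPTO_gcm128_release(gcm);
 */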
1610 #if defined(SELFTEST)
1611 #include <stdio.h>
1612 #include <openssl/aes.h>
1613
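/*
 * The known-answer vectors below appear to be test cases 1-20 from the
 * original GCM specification: AES-128 (1-6), AES-192 (7-12), AES-256 (13-18),
 * plus long-AAD and non-standard-IV cases (19-20).
 */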
1614 /* Test Case 1 */
1615 static const u8 K1[16],
1616                 *P1=NULL,
1617                 *A1=NULL,
1618                 IV1[12],
1619                 *C1=NULL,
1620                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1621
1622 /* Test Case 2 */
1623 #define K2 K1
1624 #define A2 A1
1625 #define IV2 IV1
1626 static const u8 P2[16],
1627                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1628                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1629
1630 /* Test Case 3 */
1631 #define A3 A2
1632 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1633                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1634                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1635                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1636                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1637                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1638                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1639                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1640                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1641                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1642                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1643
1644 /* Test Case 4 */
1645 #define K4 K3
1646 #define IV4 IV3
1647 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1648                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1649                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1650                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1651                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1652                         0xab,0xad,0xda,0xd2},
1653                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1654                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1655                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1656                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1657                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1658
1659 /* Test Case 5 */
1660 #define K5 K4
1661 #define P5 P4
1662 #define A5 A4
1663 static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1664                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1665                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1666                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1667                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1668                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1669
1670 /* Test Case 6 */
1671 #define K6 K5
1672 #define P6 P5
1673 #define A6 A5
1674 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1675                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1676                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1677                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1678                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1679                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1680                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1681                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1682                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1683
1684 /* Test Case 7 */
1685 static const u8 K7[24],
1686                 *P7=NULL,
1687                 *A7=NULL,
1688                 IV7[12],
1689                 *C7=NULL,
1690                 T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1691
1692 /* Test Case 8 */
1693 #define K8 K7
1694 #define IV8 IV7
1695 #define A8 A7
1696 static const u8 P8[16],
1697                 C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1698                 T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1699
1700 /* Test Case 9 */
1701 #define A9 A8
1702 static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1703                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1704                 P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1705                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1706                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1707                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1708                 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1709                 C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1710                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1711                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1712                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1713                 T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1714
1715 /* Test Case 10 */
1716 #define K10 K9
1717 #define IV10 IV9
1718 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1719                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1720                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1721                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1722                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1723                         0xab,0xad,0xda,0xd2},
1724                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1725                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1726                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1727                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1728                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1729
1730 /* Test Case 11 */
1731 #define K11 K10
1732 #define P11 P10
1733 #define A11 A10
1734 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1735                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1736                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1737                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1738                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1739                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1740
1741 /* Test Case 12 */
1742 #define K12 K11
1743 #define P12 P11
1744 #define A12 A11
1745 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1746                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1747                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1748                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1749                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1750                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1751                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1752                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1753                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1754
1755 /* Test Case 13 */
1756 static const u8 K13[32],
1757                 *P13=NULL,
1758                 *A13=NULL,
1759                 IV13[12],
1760                 *C13=NULL,
1761                 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1762
1763 /* Test Case 14 */
1764 #define K14 K13
1765 #define A14 A13
1766 static const u8 P14[16],
1767                 IV14[12],
1768                 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1769                 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1770
1771 /* Test Case 15 */
1772 #define A15 A14
1773 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1774                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1775                 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1776                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1777                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1778                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1779                 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1780                 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1781                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1782                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1783                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1784                 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1785
1786 /* Test Case 16 */
1787 #define K16 K15
1788 #define IV16 IV15
1789 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1790                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1791                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1792                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1793                 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1794                         0xab,0xad,0xda,0xd2},
1795                 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1796                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1797                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1798                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1799                 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1800
1801 /* Test Case 17 */
1802 #define K17 K16
1803 #define P17 P16
1804 #define A17 A16
1805 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1806                 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1807                         0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1808                         0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1809                         0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1810                 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1811
1812 /* Test Case 18 */
1813 #define K18 K17
1814 #define P18 P17
1815 #define A18 A17
1816 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1817                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1818                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1819                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1820                 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1821                         0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1822                         0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1823                         0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1824                 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1825
1826 /* Test Case 19 */
1827 #define K19 K1
1828 #define P19 P1
1829 #define IV19 IV1
1830 #define C19 C1
1831 static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1832                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1833                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1834                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
1835                         0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1836                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1837                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1838                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1839                 T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};
1840
1841 /* Test Case 20 */
1842 #define K20 K1
1843 #define A20 A1
1844 static const u8 IV20[64]={0xff,0xff,0xff,0xff}, /* this results in 0xff in counter LSB */
1845                 P20[288],
1846                 C20[]= {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
1847                         0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
1848                         0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
1849                         0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
1850                         0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
1851                         0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
1852                         0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
1853                         0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
1854                         0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
1855                         0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
1856                         0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
1857                         0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
1858                         0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
1859                         0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
1860                         0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
1861                         0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
1862                         0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
1863                         0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c},
1864                 T20[]= {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f};
1865
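/*
 * TEST_CASE(n) encrypts P<n> and checks the tag and ciphertext against T<n>
 * and C<n>, then decrypts C<n> and checks the tag and recovered plaintext.
 */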
1866 #define TEST_CASE(n)    do {                                    \
1867         u8 out[sizeof(P##n)];                                   \
1868         AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
1869         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
1870         CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
1871         memset(out,0,sizeof(out));                              \
1872         if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
1873         if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
1874         if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
1875             (C##n && memcmp(out,C##n,sizeof(out))))             \
1876                 ret++, printf ("encrypt test#%d failed.\n",n);  \
1877         CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
1878         memset(out,0,sizeof(out));                              \
1879         if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
1880         if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
1881         if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
1882             (P##n && memcmp(out,P##n,sizeof(out))))             \
1883                 ret++, printf ("decrypt test#%d failed.\n",n);  \
1884         } while(0)
1885
1886 int main()
1887 {
1888         GCM128_CONTEXT ctx;
1889         AES_KEY key;
1890         int ret=0;
1891
1892         TEST_CASE(1);
1893         TEST_CASE(2);
1894         TEST_CASE(3);
1895         TEST_CASE(4);
1896         TEST_CASE(5);
1897         TEST_CASE(6);
1898         TEST_CASE(7);
1899         TEST_CASE(8);
1900         TEST_CASE(9);
1901         TEST_CASE(10);
1902         TEST_CASE(11);
1903         TEST_CASE(12);
1904         TEST_CASE(13);
1905         TEST_CASE(14);
1906         TEST_CASE(15);
1907         TEST_CASE(16);
1908         TEST_CASE(17);
1909         TEST_CASE(18);
1910         TEST_CASE(19);
1911         TEST_CASE(20);
1912
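/*
 * Optional micro-benchmark: compare cycles per byte of GCM encryption against
 * plain CTR mode, and time the standalone GHASH routine, using OPENSSL_rdtsc().
 */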
1913 #ifdef OPENSSL_CPUID_OBJ
1914         {
1915         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1916         union { u64 u; u8 c[1024]; } buf;
1917         int i;
1918
1919         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1920         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1921         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1922
1923         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1924         start = OPENSSL_rdtsc();
1925         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1926         gcm_t = OPENSSL_rdtsc() - start;
1927
1928         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1929                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1930                         (block128_f)AES_encrypt);
1931         start = OPENSSL_rdtsc();
1932         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1933                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1934                         (block128_f)AES_encrypt);
1935         ctr_t = OPENSSL_rdtsc() - start;
1936
1937         printf("%.2f-%.2f=%.2f\n",
1938                         gcm_t/(double)sizeof(buf),
1939                         ctr_t/(double)sizeof(buf),
1940                         (gcm_t-ctr_t)/(double)sizeof(buf));
1941 #ifdef GHASH
1942         {
1943         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1944                                 const u8 *inp,size_t len)       = ctx.ghash;
1945
1946         GHASH((&ctx),buf.c,sizeof(buf));
1947         start = OPENSSL_rdtsc();
1948         for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
1949         gcm_t = OPENSSL_rdtsc() - start;
1950         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1951         }
1952 #endif
1953         }
1954 #endif
1955
1956         return ret;
1957 }
1958 #endif