From ab8a4e54dbada9bdc7f4786fb54e1f6e7ae52bc7 Mon Sep 17 00:00:00 2001
From: "Dr. Stephen Henson" <steve@openssl.org>
Date: Sat, 19 Feb 2011 22:16:52 +0000
Subject: [PATCH] Move gcm128_context definition to modes_lcl.h (along with
 some related definitions) so we can use it in the EVP GCM code, avoiding
 the need to allocate it.

---
 crypto/Makefile          |  2 +-
 crypto/evp/e_aes.c       | 35 +++++++++----------------
 crypto/modes/gcm128.c    | 56 --------------------------------------
 crypto/modes/modes_lcl.h | 58 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 71 insertions(+), 80 deletions(-)

diff --git a/crypto/Makefile b/crypto/Makefile
index 7595cb4578..4147d2d638 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -7,7 +7,7 @@ TOP= ..
 CC= cc
 INCLUDE= -I. -I$(TOP) -I../include $(ZLIB_INCLUDE)
 # INCLUDES targets subdirs!
-INCLUDES= -I.. -I../.. -I../asn1 -I../evp -I../../include $(ZLIB_INCLUDE)
+INCLUDES= -I.. -I../.. -I../modes -I../asn1 -I../evp -I../../include $(ZLIB_INCLUDE)
 CFLAG= -g
 MAKEDEPPROG= makedepend
 MAKEDEPEND= $(TOP)/util/domd $(TOP) -MD $(MAKEDEPPROG)

diff --git a/crypto/evp/e_aes.c b/crypto/evp/e_aes.c
index 2f937af0ba..b1a701b65d 100644
--- a/crypto/evp/e_aes.c
+++ b/crypto/evp/e_aes.c
@@ -58,7 +58,7 @@
 #include <assert.h>
 #include <openssl/aes.h>
 #include "evp_locl.h"
-#include <openssl/modes.h>
+#include "modes_lcl.h"
 #include <openssl/rand.h>
 
 static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
@@ -196,8 +196,7 @@ typedef struct
 	int key_set;
 	/* Set if an iv is set */
 	int iv_set;
-	/* Pointer to GCM128_CTX: FIXME actual structure later */
-	GCM128_CONTEXT *gcm;
+	GCM128_CONTEXT gcm;
 	/* Temporary IV store */
 	unsigned char *iv;
 	/* IV length */
@@ -212,8 +211,7 @@ static int aes_gcm_cleanup(EVP_CIPHER_CTX *c)
 	{
 	EVP_AES_GCM_CTX *gctx = c->cipher_data;
-	if (gctx->gcm)
-		CRYPTO_gcm128_release(gctx->gcm);
+	OPENSSL_cleanse(&gctx->gcm, sizeof(gctx->gcm));
 	if (gctx->iv != c->iv)
 		OPENSSL_free(gctx->iv);
 	return 1;
 	}
@@ -239,7 +237,6 @@ static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
 	switch (type)
 		{
 	case EVP_CTRL_INIT:
-		gctx->gcm = NULL;
 		gctx->key_set = 0;
 		gctx->iv_set = 0;
 		gctx->ivlen = c->cipher->iv_len;
@@ -304,7 +301,7 @@
 	case EVP_CTRL_GCM_IV_GEN:
 		if (gctx->iv_gen == 0 || gctx->key_set == 0)
			return 0;
-		CRYPTO_gcm128_setiv(gctx->gcm, gctx->iv, gctx->ivlen);
+		CRYPTO_gcm128_setiv(&gctx->gcm, gctx->iv, gctx->ivlen);
 		memcpy(ptr, gctx->iv, gctx->ivlen);
 		/* Invocation field will be at least 8 bytes in size and
 		 * so no need to check wrap around or increment more than
@@ -329,15 +326,7 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 	if (key)
 		{
 		AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
-		if (!gctx->gcm)
-			{
-			gctx->gcm =
-				CRYPTO_gcm128_new(&gctx->ks, (block128_f)AES_encrypt);
-			if (!gctx->gcm)
-				return 0;
-			}
-		else
-			CRYPTO_gcm128_init(gctx->gcm, &gctx->ks, (block128_f)AES_encrypt);
+		CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f)AES_encrypt);
 		/* If we have an iv we can set it directly, otherwise use
 		 * saved IV.
 		 */
@@ -345,7 +334,7 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 		iv = gctx->iv;
 	if (iv)
 		{
-		CRYPTO_gcm128_setiv(gctx->gcm, iv, gctx->ivlen);
+		CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
 		gctx->iv_set = 1;
 		}
 	gctx->key_set = 1;
@@ -354,7 +343,7 @@
 		{
 		/* If key set use IV, otherwise copy */
 		if (gctx->key_set)
-			CRYPTO_gcm128_setiv(gctx->gcm, iv, gctx->ivlen);
+			CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
 		else
 			memcpy(gctx->iv, iv, gctx->ivlen);
 		gctx->iv_set = 1;
@@ -376,17 +365,17 @@ static int aes_gcm(EVP_CIPHER_CTX *ctx, unsigned char *out,
 	{
 	if (out == NULL)
 		{
-		if (CRYPTO_gcm128_aad(gctx->gcm, in, len))
+		if (CRYPTO_gcm128_aad(&gctx->gcm, in, len))
 			return -1;
 		}
 	else if (ctx->encrypt)
 		{
-		if (CRYPTO_gcm128_encrypt(gctx->gcm, in, out, len))
+		if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len))
 			return -1;
 		}
 	else
 		{
-		if (CRYPTO_gcm128_decrypt(gctx->gcm, in, out, len))
+		if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len))
 			return -1;
 		}
 	return len;
@@ -395,13 +384,13 @@ static int aes_gcm(EVP_CIPHER_CTX *ctx, unsigned char *out,
 		{
 		if (!ctx->encrypt)
 			{
-			if (CRYPTO_gcm128_finish(gctx->gcm,
+			if (CRYPTO_gcm128_finish(&gctx->gcm,
 					gctx->tag, gctx->taglen) != 0)
 				return -1;
 			gctx->iv_set = 0;
 			return 0;
 			}
-		CRYPTO_gcm128_tag(gctx->gcm, gctx->tag, 16);
+		CRYPTO_gcm128_tag(&gctx->gcm, gctx->tag, 16);
 		gctx->taglen = 16;
 		/* Don't reuse the IV */
 		gctx->iv_set = 0;
diff --git a/crypto/modes/gcm128.c b/crypto/modes/gcm128.c
index cdb6c847a9..8a48e90ac5 100644
--- a/crypto/modes/gcm128.c
+++ b/crypto/modes/gcm128.c
@@ -60,8 +60,6 @@
 #endif
 #include <assert.h>
 
-typedef struct { u64 hi,lo; } u128;
-
 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
 /* redefine, because alignment is ensured */
 #undef GETU32
@@ -84,43 +82,6 @@ typedef struct { u64 hi,lo; } u128;
 	} \
 } while(0)
 
-#ifdef TABLE_BITS
-#undef TABLE_BITS
-#endif
-/*
- * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
- * never be set to 8. 8 is effectively reserved for testing purposes.
- * TABLE_BITS>1 are lookup-table-driven implementations referred to as
- * "Shoup's" in GCM specification. In other words OpenSSL does not cover
- * whole spectrum of possible table driven implementations. Why? In
- * non-"Shoup's" case memory access pattern is segmented in such manner,
- * that it's trivial to see that cache timing information can reveal
- * fair portion of intermediate hash value. Given that ciphertext is
- * always available to attacker, it's possible for him to attempt to
- * deduce secret parameter H and if successful, tamper with messages
- * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
- * not as trivial, but there is no reason to believe that it's resistant
- * to cache-timing attack. And the thing about "8-bit" implementation is
- * that it consumes 16 (sixteen) times more memory, 4KB per individual
- * key + 1KB shared. Well, on pros side it should be twice as fast as
- * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
- * was observed to run ~75% faster, closer to 100% for commercial
- * compilers... Yet "4-bit" procedure is preferred, because it's
- * believed to provide better security-performance balance and adequate
- * all-round performance. "All-round" refers to things like:
"All-round" refers to things like: - * - * - shorter setup time effectively improves overall timing for - * handling short messages; - * - larger table allocation can become unbearable because of VM - * subsystem penalties (for example on Windows large enough free - * results in VM working set trimming, meaning that consequent - * malloc would immediately incur working set expansion); - * - larger table has larger cache footprint, which can affect - * performance of other code paths (not necessarily even from same - * thread in Hyper-Threading world); - */ -#define TABLE_BITS 4 - #if TABLE_BITS==8 static void gcm_init_8bit(u128 Htable[256], u64 H[2]) @@ -647,23 +608,6 @@ static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2]) #endif -struct gcm128_context { - /* Following 6 names follow names in GCM specification */ - union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0, - Xi,H,len; - /* Pre-computed table used by gcm_gmult_* */ -#if TABLE_BITS==8 - u128 Htable[256]; -#else - u128 Htable[16]; - void (*gmult)(u64 Xi[2],const u128 Htable[16]); - void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); -#endif - unsigned int mres, ares; - block128_f block; - void *key; -}; - #if TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \ (defined(__i386) || defined(__i386__) || \ defined(__x86_64) || defined(__x86_64__) || \ diff --git a/crypto/modes/modes_lcl.h b/crypto/modes/modes_lcl.h index 12368fb039..201a69115e 100644 --- a/crypto/modes/modes_lcl.h +++ b/crypto/modes/modes_lcl.h @@ -73,3 +73,61 @@ typedef unsigned char u8; #define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3]) #define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v)) #endif + +/* GCM definitions */ + +typedef struct { u64 hi,lo; } u128; + +#ifdef TABLE_BITS +#undef TABLE_BITS +#endif +/* + * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should + * never be set to 8. 8 is effectively reserved for testing purposes. + * TABLE_BITS>1 are lookup-table-driven implementations referred to as + * "Shoup's" in GCM specification. In other words OpenSSL does not cover + * whole spectrum of possible table driven implementations. Why? In + * non-"Shoup's" case memory access pattern is segmented in such manner, + * that it's trivial to see that cache timing information can reveal + * fair portion of intermediate hash value. Given that ciphertext is + * always available to attacker, it's possible for him to attempt to + * deduce secret parameter H and if successful, tamper with messages + * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's + * not as trivial, but there is no reason to believe that it's resistant + * to cache-timing attack. And the thing about "8-bit" implementation is + * that it consumes 16 (sixteen) times more memory, 4KB per individual + * key + 1KB shared. Well, on pros side it should be twice as fast as + * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version + * was observed to run ~75% faster, closer to 100% for commercial + * compilers... Yet "4-bit" procedure is preferred, because it's + * believed to provide better security-performance balance and adequate + * all-round performance. 
"All-round" refers to things like: + * + * - shorter setup time effectively improves overall timing for + * handling short messages; + * - larger table allocation can become unbearable because of VM + * subsystem penalties (for example on Windows large enough free + * results in VM working set trimming, meaning that consequent + * malloc would immediately incur working set expansion); + * - larger table has larger cache footprint, which can affect + * performance of other code paths (not necessarily even from same + * thread in Hyper-Threading world); + */ +#define TABLE_BITS 4 + +struct gcm128_context { + /* Following 6 names follow names in GCM specification */ + union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0, + Xi,H,len; + /* Pre-computed table used by gcm_gmult_* */ +#if TABLE_BITS==8 + u128 Htable[256]; +#else + u128 Htable[16]; + void (*gmult)(u64 Xi[2],const u128 Htable[16]); + void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); +#endif + unsigned int mres, ares; + block128_f block; + void *key; +}; -- 2.25.1