From 81f3d6323dcda6a18b06c718600d6a4739e83263 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@openssl.org>
Date: Mon, 30 Nov 2015 23:07:38 +0100
Subject: [PATCH] modes/ocb128.c: split fixed block xors to aligned and
 misaligned.

The main goal was to improve performance on RISC platforms, e.g. a 10%
improvement was measured on MIPS, POWER8...

Reviewed-by: Matt Caswell <matt@openssl.org>
---
 crypto/modes/modes_lcl.h | 25 +++++++++--------
 crypto/modes/ocb128.c    | 58 ++++++++++------------------------------
 2 files changed, 26 insertions(+), 57 deletions(-)

diff --git a/crypto/modes/modes_lcl.h b/crypto/modes/modes_lcl.h
index 0fd11ce6c4..2f61afe5dc 100644
--- a/crypto/modes/modes_lcl.h
+++ b/crypto/modes/modes_lcl.h
@@ -144,20 +144,19 @@ struct ccm128_context {
 
 #ifndef OPENSSL_NO_OCB
 
-# ifdef STRICT_ALIGNMENT
-typedef struct {
-    unsigned char a[16];
+typedef union {
+    u64 a[2];
+    unsigned char c[16];
 } OCB_BLOCK;
-#  define ocb_block16_xor(in1,in2,out) \
-    ocb_block_xor((in1)->a,(in2)->a,16,(out)->a)
-# else                          /* STRICT_ALIGNMENT */
-typedef struct {
-    u64 a;
-    u64 b;
-} OCB_BLOCK;
-#  define ocb_block16_xor(in1,in2,out) \
-    (out)->a=(in1)->a^(in2)->a; (out)->b=(in1)->b^(in2)->b;
-# endif                         /* STRICT_ALIGNMENT */
+# define ocb_block16_xor(in1,in2,out) \
+    ( (out)->a[0]=(in1)->a[0]^(in2)->a[0], \
+      (out)->a[1]=(in1)->a[1]^(in2)->a[1] )
+# if STRICT_ALIGNMENT
+#  define ocb_block16_xor_misaligned(in1,in2,out) \
+    ocb_block_xor((in1)->c,(in2)->c,16,(out)->c)
+# else
+#  define ocb_block16_xor_misaligned ocb_block16_xor
+# endif
 
 struct ocb128_context {
     /* Need both encrypt and decrypt key schedules for decryption */
diff --git a/crypto/modes/ocb128.c b/crypto/modes/ocb128.c
index 5408d50df1..d49aa6ede9 100644
--- a/crypto/modes/ocb128.c
+++ b/crypto/modes/ocb128.c
@@ -53,11 +53,6 @@
 
 #ifndef OPENSSL_NO_OCB
 
-union ublock {
-    unsigned char *chrblk;
-    OCB_BLOCK *ocbblk;
-};
-
 /*
  * Calculate the number of binary trailing zero's in any given number
  */
@@ -88,23 +83,18 @@ static void ocb_block_lshift(OCB_BLOCK *in, size_t shift, OCB_BLOCK *out)
     unsigned char shift_mask;
     int i;
     unsigned char mask[15];
-    union ublock locin;
-    union ublock locout;
-
-    locin.ocbblk = in;
-    locout.ocbblk = out;
 
     shift_mask = 0xff;
     shift_mask <<= (8 - shift);
     for (i = 15; i >= 0; i--) {
         if (i > 0) {
-            mask[i - 1] = locin.chrblk[i] & shift_mask;
+            mask[i - 1] = in->c[i] & shift_mask;
             mask[i - 1] >>= 8 - shift;
         }
-        locout.chrblk[i] = locin.chrblk[i] << shift;
+        out->c[i] = in->c[i] << shift;
 
         if (i != 15) {
-            locout.chrblk[i] ^= mask[i];
+            out->c[i] ^= mask[i];
         }
     }
 }
@@ -115,23 +105,18 @@ static void ocb_block_lshift(OCB_BLOCK *in, size_t shift, OCB_BLOCK *out)
 static void ocb_double(OCB_BLOCK *in, OCB_BLOCK *out)
 {
     unsigned char mask;
-    union ublock locin;
-    union ublock locout;
-
-    locin.ocbblk = in;
-    locout.ocbblk = out;
 
     /*
      * Calculate the mask based on the most significant bit. There are more
      * efficient ways to do this - but this way is constant time
      */
-    mask = locin.chrblk[0] & 0x80;
+    mask = in->c[0] & 0x80;
     mask >>= 7;
     mask *= 135;
 
     ocb_block_lshift(in, 1, out);
 
-    locout.chrblk[15] ^= mask;
+    out->c[15] ^= mask;
 }
 
 /*
@@ -191,13 +176,7 @@ static OCB_BLOCK *ocb_lookup_l(OCB128_CONTEXT *ctx, size_t idx)
 static void ocb_encrypt(OCB128_CONTEXT *ctx, OCB_BLOCK *in, OCB_BLOCK *out,
                         void *keyenc)
 {
-    union ublock locin;
-    union ublock locout;
-
-    locin.ocbblk = in;
-    locout.ocbblk = out;
-
-    ctx->encrypt(locin.chrblk, locout.chrblk, keyenc);
+    ctx->encrypt(in->c, out->c, keyenc);
 }
 
 /*
@@ -206,13 +185,7 @@ static void ocb_encrypt(OCB128_CONTEXT *ctx, OCB_BLOCK *in, OCB_BLOCK *out,
 static void ocb_decrypt(OCB128_CONTEXT *ctx, OCB_BLOCK *in, OCB_BLOCK *out,
                         void *keydec)
 {
-    union ublock locin;
-    union ublock locout;
-
-    locin.ocbblk = in;
-    locout.ocbblk = out;
-
-    ctx->decrypt(locin.chrblk, locout.chrblk, keydec);
+    ctx->decrypt(in->c, out->c, keydec);
 }
 
 /*
@@ -305,9 +278,6 @@ int CRYPTO_ocb128_setiv(OCB128_CONTEXT *ctx, const unsigned char *iv,
     unsigned char ktop[16], tmp[16], mask;
     unsigned char stretch[24], nonce[16];
     size_t bottom, shift;
-    union ublock offset;
-
-    offset.ocbblk = &ctx->offset;
 
     /*
      * Spec says IV is 120 bits or fewer - it allows non byte aligned lengths.
@@ -341,7 +311,7 @@ int CRYPTO_ocb128_setiv(OCB128_CONTEXT *ctx, const unsigned char *iv,
                      &ctx->offset);
     mask = 0xff;
     mask <<= 8 - shift;
-    offset.chrblk[15] |=
+    ctx->offset.c[15] |=
         (*(stretch + (bottom / 8) + 16) & mask) >> (8 - shift);
 
     return 1;
@@ -444,13 +414,13 @@ int CRYPTO_ocb128_encrypt(OCB128_CONTEXT *ctx,
 
         /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
         inblock = (OCB_BLOCK *)(in + ((i - ctx->blocks_processed - 1) * 16));
-        ocb_block16_xor(&ctx->offset, inblock, &tmp1);
+        ocb_block16_xor_misaligned(&ctx->offset, inblock, &tmp1);
         /* Checksum_i = Checksum_{i-1} xor P_i */
-        ocb_block16_xor(&ctx->checksum, inblock, &ctx->checksum);
+        ocb_block16_xor_misaligned(&ctx->checksum, inblock, &ctx->checksum);
         ocb_encrypt(ctx, &tmp1, &tmp2, ctx->keyenc);
         outblock =
             (OCB_BLOCK *)(out + ((i - ctx->blocks_processed - 1) * 16));
-        ocb_block16_xor(&ctx->offset, &tmp2, outblock);
+        ocb_block16_xor_misaligned(&ctx->offset, &tmp2, outblock);
 
     }
 
@@ -517,14 +487,14 @@ int CRYPTO_ocb128_decrypt(OCB128_CONTEXT *ctx,
 
         /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
         inblock = (OCB_BLOCK *)(in + ((i - ctx->blocks_processed - 1) * 16));
-        ocb_block16_xor(&ctx->offset, inblock, &tmp1);
+        ocb_block16_xor_misaligned(&ctx->offset, inblock, &tmp1);
         ocb_decrypt(ctx, &tmp1, &tmp2, ctx->keydec);
         outblock =
             (OCB_BLOCK *)(out + ((i - ctx->blocks_processed - 1) * 16));
-        ocb_block16_xor(&ctx->offset, &tmp2, outblock);
+        ocb_block16_xor_misaligned(&ctx->offset, &tmp2, outblock);
 
         /* Checksum_i = Checksum_{i-1} xor P_i */
-        ocb_block16_xor(&ctx->checksum, outblock, &ctx->checksum);
+        ocb_block16_xor_misaligned(&ctx->checksum, outblock, &ctx->checksum);
     }
 
     /*
-- 
2.25.1