tls: speed up xor'ing of aligned 16-byte buffers
authorDenys Vlasenko <vda.linux@googlemail.com>
Sat, 24 Nov 2018 13:08:29 +0000 (14:08 +0100)
committerDenys Vlasenko <vda.linux@googlemail.com>
Sat, 24 Nov 2018 13:08:29 +0000 (14:08 +0100)
function                                             old     new   delta
xorbuf_aligned_AES_BLOCK_SIZE                          -      23     +23
xwrite_encrypted                                     585     580      -5
aesgcm_GHASH                                         233     228      -5
GMULT                                                192     187      -5
------------------------------------------------------------------------------
(add/remove: 1/0 grow/shrink: 0/3 up/down: 23/-15)              Total: 8 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
networking/tls.c
networking/tls.h
networking/tls_aesgcm.c

index 1f8c21f8bab6b2555d9ed8a1eacd98094041d4e1..b774340aee90909c11497eaf729ff626099b138a 100644 (file)
@@ -357,6 +357,20 @@ void FAST_FUNC xorbuf(void *dst, const void *src, unsigned count)
        xorbuf3(dst, dst, src, count);
 }
 
+/* XOR one AES block (16 bytes) of src into dst, word-at-a-time.
+ * Both pointers must be long-aligned (callers use ALIGNED_long buffers).
+ * The word count is picked at preprocess time from sizeof(long):
+ *   - 32-bit long: outer and inner #if both true  -> d[0..3] (4*4  = 16 bytes)
+ *   - 64-bit long: outer true, inner false        -> d[0..1] (2*8  = 16 bytes)
+ *   - wider long (hypothetical): outer false      -> d[0]    (1*16 = 16 bytes)
+ * NOTE(review): the uint8_t->unsigned long access technically breaks strict
+ * aliasing; presumably acceptable under busybox's build flags — confirm.
+ */
+void FAST_FUNC xorbuf_aligned_AES_BLOCK_SIZE(void *dst, const void *src)
+{
+       unsigned long *d = dst;
+       const unsigned long *s = src;
+       d[0] ^= s[0];
+#if ULONG_MAX <= 0xffffffffffffffff
+       d[1] ^= s[1];
+ #if ULONG_MAX == 0xffffffff
+       d[2] ^= s[2];
+       d[3] ^= s[3];
+ #endif
+#endif
+}
+
 /* Nondestructively see the current hash value */
 static unsigned sha_peek(md5sha_ctx_t *ctx, void *buffer)
 {
@@ -802,10 +816,10 @@ static void xwrite_encrypted_aesgcm(tls_state_t *tls, unsigned size, unsigned ty
 {
 #define COUNTER(v) (*(uint32_t*)(v + 12))
 
-       uint8_t aad[13 + 3] ALIGNED(4);   /* +3 creates [16] buffer, simplifying GHASH() */
-       uint8_t nonce[12 + 4] ALIGNED(4); /* +4 creates space for AES block counter */
-       uint8_t scratch[AES_BLOCK_SIZE] ALIGNED(4); //[16]
-       uint8_t authtag[AES_BLOCK_SIZE] ALIGNED(4); //[16]
+       uint8_t aad[13 + 3] ALIGNED_long;   /* +3 creates [16] buffer, simplifying GHASH() */
+       uint8_t nonce[12 + 4] ALIGNED_long; /* +4 creates space for AES block counter */
+       uint8_t scratch[AES_BLOCK_SIZE] ALIGNED_long; //[16]
+       uint8_t authtag[AES_BLOCK_SIZE] ALIGNED_long; //[16]
        uint8_t *buf;
        struct record_hdr *xhdr;
        unsigned remaining;
@@ -850,7 +864,7 @@ static void xwrite_encrypted_aesgcm(tls_state_t *tls, unsigned size, unsigned ty
        aesgcm_GHASH(tls->H, aad, /*sizeof(aad),*/ tls->outbuf + OUTBUF_PFX, size, authtag /*, sizeof(authtag)*/);
        COUNTER(nonce) = htonl(1);
        aes_encrypt_one_block(&tls->aes_encrypt, nonce, scratch);
-       xorbuf(authtag, scratch, sizeof(authtag));
+       xorbuf_aligned_AES_BLOCK_SIZE(authtag, scratch);
 
        memcpy(buf, authtag, sizeof(authtag));
 #undef COUNTER
@@ -938,10 +952,10 @@ static void tls_aesgcm_decrypt(tls_state_t *tls, uint8_t *buf, int size)
 {
 #define COUNTER(v) (*(uint32_t*)(v + 12))
 
-       //uint8_t aad[13 + 3] ALIGNED(4); /* +3 creates [16] buffer, simplifying GHASH() */
-       uint8_t nonce[12 + 4] ALIGNED(4); /* +4 creates space for AES block counter */
-       uint8_t scratch[AES_BLOCK_SIZE] ALIGNED(4); //[16]
-       //uint8_t authtag[AES_BLOCK_SIZE] ALIGNED(4); //[16]
+       //uint8_t aad[13 + 3] ALIGNED_long; /* +3 creates [16] buffer, simplifying GHASH() */
+       uint8_t nonce[12 + 4] ALIGNED_long; /* +4 creates space for AES block counter */
+       uint8_t scratch[AES_BLOCK_SIZE] ALIGNED_long; //[16]
+       //uint8_t authtag[AES_BLOCK_SIZE] ALIGNED_long; //[16]
        unsigned remaining;
        unsigned cnt;
 
@@ -973,7 +987,7 @@ static void tls_aesgcm_decrypt(tls_state_t *tls, uint8_t *buf, int size)
        //aesgcm_GHASH(tls->H, aad, tls->inbuf + RECHDR_LEN, size, authtag);
        //COUNTER(nonce) = htonl(1);
        //aes_encrypt_one_block(&tls->aes_encrypt, nonce, scratch);
-       //xorbuf(authtag, scratch, sizeof(authtag));
+       //xorbuf_aligned_AES_BLOCK_SIZE(authtag, scratch);
 
        //memcmp(buf, authtag, sizeof(authtag)) || DIE("HASH DOES NOT MATCH!");
 #undef COUNTER
index 4b0dc7459f3a6adaca679677942ed99bfe540d0d..494ed78c467ff5329719da9054227ae2ecf2da05 100644 (file)
@@ -81,8 +81,12 @@ typedef  int16_t  int16;
 #define AES_BLOCK_SIZE  16
 
 void tls_get_random(void *buf, unsigned len) FAST_FUNC;
+
 void xorbuf(void* buf, const void* mask, unsigned count) FAST_FUNC;
 
+#define ALIGNED_long ALIGNED(sizeof(long))
+void xorbuf_aligned_AES_BLOCK_SIZE(void* buf, const void* mask) FAST_FUNC;
+
 #define matrixCryptoGetPrngData(buf, len, userPtr) (tls_get_random(buf, len), PS_SUCCESS)
 
 #define psFree(p, pool)    free(p)
index db720e5f676a010bc68999282e1c4255ff9bd426..fd72540c4091b7c2180d2b4f665087b5d6ecff89 100644 (file)
@@ -50,8 +50,8 @@ static void RIGHTSHIFTX(byte* x)
 
 static void GMULT(byte* X, byte* Y)
 {
-    byte Z[AES_BLOCK_SIZE];
-    byte V[AES_BLOCK_SIZE];
+    byte Z[AES_BLOCK_SIZE] ALIGNED_long;
+    byte V[AES_BLOCK_SIZE] ALIGNED_long;
     int i, j;
 
     XMEMSET(Z, 0, AES_BLOCK_SIZE);
@@ -62,7 +62,7 @@ static void GMULT(byte* X, byte* Y)
         for (j = 0; j < 8; j++)
         {
             if (y & 0x80) {
-                xorbuf(Z, V, AES_BLOCK_SIZE);
+                xorbuf_aligned_AES_BLOCK_SIZE(Z, V);
             }
 
             RIGHTSHIFTX(V);
@@ -86,8 +86,8 @@ void FAST_FUNC aesgcm_GHASH(byte* h,
     byte* s //, unsigned sSz
 )
 {
-    byte x[AES_BLOCK_SIZE] ALIGNED(4);
-    byte scratch[AES_BLOCK_SIZE] ALIGNED(4);
+    byte x[AES_BLOCK_SIZE] ALIGNED_long;
+    byte scratch[AES_BLOCK_SIZE] ALIGNED_long;
     word32 blocks, partial;
     //was: byte* h = aes->H;
 
@@ -116,6 +116,7 @@ void FAST_FUNC aesgcm_GHASH(byte* h,
         blocks = cSz / AES_BLOCK_SIZE;
         partial = cSz % AES_BLOCK_SIZE;
         while (blocks--) {
+            //xorbuf_aligned_AES_BLOCK_SIZE(x, c); - c is not guaranteed to be aligned
             xorbuf(x, c, AES_BLOCK_SIZE);
             GMULT(x, h);
             c += AES_BLOCK_SIZE;
@@ -124,7 +125,7 @@ void FAST_FUNC aesgcm_GHASH(byte* h,
             //XMEMSET(scratch, 0, AES_BLOCK_SIZE);
             //XMEMCPY(scratch, c, partial);
             //xorbuf(x, scratch, AES_BLOCK_SIZE);
-            xorbuf(x, c, partial);
+            xorbuf(x, c, partial);//same result as above
             GMULT(x, h);
         }
     }
@@ -132,7 +133,7 @@ void FAST_FUNC aesgcm_GHASH(byte* h,
     /* Hash in the lengths of A and C in bits */
     FlattenSzInBits(&scratch[0], aSz);
     FlattenSzInBits(&scratch[8], cSz);
-    xorbuf(x, scratch, AES_BLOCK_SIZE);
+    xorbuf_aligned_AES_BLOCK_SIZE(x, scratch);
     GMULT(x, h);
 
     /* Copy the result into s. */