From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Thu, 19 Jan 2017 15:45:41 +0000 (+0100)
Subject: tls: fix ROL/ROR x86 optimization
X-Git-Tag: 1_27_0~202
X-Git-Url: https://git.librecmc.org/?a=commitdiff_plain;h=f7806f9d8fc889f1d6cd365b69d9d99a4a5a6e26;p=oweals%2Fbusybox.git

tls: fix ROL/ROR x86 optimization

ALWAYS_INLINE:

function                                             old     new   delta
psAesInitKey                                         825     824      -1
ROR                                                    5       -      -5
setup_mix2                                           148     134     -14
psAesDecryptBlock                                   1184    1139     -45
psAesEncryptBlock                                   1193    1102     -91
------------------------------------------------------------------------------
(add/remove: 0/1 grow/shrink: 0/4 up/down: 0/-156)           Total: -156 bytes

ALWAYS_INLINE + __builtin_constant_p(shift_cnt):

function                                             old     new   delta
ROR                                                    5       -      -5
psAesInitKey                                         825     818      -7
setup_mix2                                           148     123     -25
psAesDecryptBlock                                   1184    1078    -106
psAesEncryptBlock                                   1193    1017    -176
------------------------------------------------------------------------------
(add/remove: 0/1 grow/shrink: 0/4 up/down: 0/-319)           Total: -319 bytes
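
For reference, a minimal standalone sketch of the dispatch (not the busybox
header: ALWAYS_INLINE is spelled out here instead of coming from libbb.h, and
rol32 is a made-up name). x86/x86-64 with gcc or clang only; build with
optimization (-O2 or -Os) so __builtin_constant_p() can resolve and the
untaken branch is discarded:

#include <stdio.h>

#define ALWAYS_INLINE inline __attribute__((always_inline))

static ALWAYS_INLINE unsigned rol32(unsigned word, int i)
{
	if (__builtin_constant_p(i))
		/* constant count: the "i" constraint emits "roll $IMM,%reg"
		 * and leaves %ecx free for other uses */
		asm ("roll %2,%0" : "=r" (word) : "0" (word), "i" (i));
	else
		/* variable count: x86 requires the count in %cl */
		asm ("roll %%cl,%0" : "=r" (word) : "0" (word), "c" (i));
	return word;
}

int main(void)
{
	volatile int n = 8; /* volatile defeats constant propagation */
	printf("%08x\n", rol32(0x12345678, 8)); /* constant path: 34567812 */
	printf("%08x\n", rol32(0x12345678, n)); /* variable path: 34567812 */
	return 0;
}

The delta between the two tables comes from this constant-count path: the
fixed-count rotates in the AES code no longer need their count loaded into
%ecx first.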

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---

diff --git a/networking/tls_symmetric.h b/networking/tls_symmetric.h
index b6b55c78c..8488b437e 100644
--- a/networking/tls_symmetric.h
+++ b/networking/tls_symmetric.h
@@ -7,9 +7,6 @@
 
 /* The part below is a section of matrixssl-3-7-2b-open/crypto/cryptolib.h
  * Changes are flagged with //bbox
- * TODO:
- * Take a look at "roll %%cl" part... rotates by constant use fewer registers,
- * and on many Intel CPUs rotates by %cl are slower: they take 2 cycles, not 1.
  */
 
 /******************************************************************************/
@@ -28,16 +25,28 @@
 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && \
 		!defined(INTEL_CC) && !defined(PS_NO_ASM)
 
-static inline unsigned ROL(unsigned word, int i)
+static ALWAYS_INLINE unsigned ROL(unsigned word, int i)
 {
+ if (__builtin_constant_p(i)) //bbox
+   // Rotates by constant use fewer registers,
+   // and on many Intel CPUs rotates by %cl take 2 cycles, not 1.
+   asm ("roll %2,%0" //box
+	  :"=r" (word)
+	  :"0" (word),"i" (i));
+ else  //bbox
    asm ("roll %%cl,%0"
 	  :"=r" (word)
 	  :"0" (word),"c" (i));
    return word;
 }
 
-static inline unsigned ROR(unsigned word, int i)
+static ALWAYS_INLINE unsigned ROR(unsigned word, int i)
 {
+ if (__builtin_constant_p(i)) //bbox
+   asm ("rorl %2,%0" //box
+	  :"=r" (word)
+	  :"0" (word),"i" (i));
+ else //bbox
    asm ("rorl %%cl,%0"
 	  :"=r" (word)
 	  :"0" (word),"c" (i));