tls: fix ROL/ROR x86 optimization
author     Denys Vlasenko <vda.linux@googlemail.com>
           Thu, 19 Jan 2017 15:45:41 +0000 (16:45 +0100)
committer  Denys Vlasenko <vda.linux@googlemail.com>
           Thu, 19 Jan 2017 15:45:41 +0000 (16:45 +0100)
ALWAYS_INLINE:

function                                             old     new   delta
psAesInitKey                                         825     824      -1
ROR                                                    5       -      -5
setup_mix2                                           148     134     -14
psAesDecryptBlock                                   1184    1139     -45
psAesEncryptBlock                                   1193    1102     -91
------------------------------------------------------------------------------
(add/remove: 0/1 grow/shrink: 0/4 up/down: 0/-156)           Total: -156 bytes
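
For reference (a sketch, not part of this patch): busybox's ALWAYS_INLINE is
roughly a wrapper around GCC's always_inline attribute, something like

    /* rough equivalent of busybox's ALWAYS_INLINE (platform header) */
    #define ALWAYS_INLINE __attribute__ ((always_inline)) inline

Forcing inlining matters because a non-inlined ROL/ROR never sees a constant
rotate count and always goes through the generic %cl path, plus call overhead.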

ALWAYS_INLINE + __builtin_constant_p(shift_cnt):

function                                             old     new   delta
ROR                                                    5       -      -5
psAesInitKey                                         825     818      -7
setup_mix2                                           148     123     -25
psAesDecryptBlock                                   1184    1078    -106
psAesEncryptBlock                                   1193    1017    -176
------------------------------------------------------------------------------
(add/remove: 0/1 grow/shrink: 0/4 up/down: 0/-319)           Total: -319 bytes
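
As an illustration (a minimal standalone sketch, not the matrixssl/busybox
source; the names rol_demo/rot_by_3/rot_by_n are invented for this example,
and it assumes GCC or Clang on i386/x86_64, mirroring the #elif guard in the
file), the pattern added by the diff below looks like this in isolation:

    /* rol_demo: uses an immediate-count rotate when the count is a
     * compile-time constant, otherwise falls back to the %cl form. */
    static __attribute__ ((always_inline)) inline unsigned rol_demo(unsigned word, int i)
    {
            if (__builtin_constant_p(i))
                    asm ("roll %2,%0" : "=r" (word) : "0" (word), "i" (i));
            else
                    asm ("roll %%cl,%0" : "=r" (word) : "0" (word), "c" (i));
            return word;
    }

    unsigned rot_by_3(unsigned w)        { return rol_demo(w, 3); } /* rotate by immediate */
    unsigned rot_by_n(unsigned w, int n) { return rol_demo(w, n); } /* rotate via %cl */

With a constant count the compiler can emit e.g. "roll $3,<reg>" and does not
have to dedicate %ecx to the rotate count.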

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
networking/tls_symmetric.h

diff --git a/networking/tls_symmetric.h b/networking/tls_symmetric.h
index b6b55c78c8f6e552a5f0795e7f15efc39dd99d52..8488b437e69b9aff09378383bf002aafd71fecb8 100644
--- a/networking/tls_symmetric.h
+++ b/networking/tls_symmetric.h
@@ -7,9 +7,6 @@
 
 /* The part below is a section of matrixssl-3-7-2b-open/crypto/cryptolib.h
  * Changes are flagged with //bbox
- * TODO:
- * Take a look at "roll %%cl" part... rotates by constant use fewer registers,
- * and on many Intel CPUs rotates by %cl are slower: they take 2 cycles, not 1.
  */
 
 /******************************************************************************/
 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && \
                !defined(INTEL_CC) && !defined(PS_NO_ASM)
 
-static inline unsigned ROL(unsigned word, int i)
+static ALWAYS_INLINE unsigned ROL(unsigned word, int i)
 {
+ if (__builtin_constant_p(i)) //bbox
+   // Rotates by a constant use fewer registers,
+   // and on many Intel CPUs rotates by %cl take 2 cycles, not 1.
+   asm ("roll %2,%0" //bbox
+         :"=r" (word)
+         :"0" (word),"i" (i));
+ else //bbox
    asm ("roll %%cl,%0"
          :"=r" (word)
          :"0" (word),"c" (i));
    return word;
 }
 
-static inline unsigned ROR(unsigned word, int i)
+static ALWAYS_INLINE unsigned ROR(unsigned word, int i)
 {
+ if (__builtin_constant_p(i)) //bbox
+   asm ("rorl %2,%0" //bbox
+         :"=r" (word)
+         :"0" (word),"i" (i));
+ else //bbox
    asm ("rorl %%cl,%0"
          :"=r" (word)
          :"0" (word),"c" (i));