$lflags =
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
-$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
$bf_obj =
$lflags =
$bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
$cpuid_obj = x86_64cpuid.o
-$bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
$bf_obj =
$lflags = -Wl,-search_paths_first%
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
-$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
$bf_obj =
$lflags =
$bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
$cpuid_obj = x86_64cpuid.o
-$bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
$bf_obj =
$lflags = -Wl,-search_paths_first%
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
-$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
$bf_obj =
$lflags =
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
-$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
$bf_obj =
$lflags = -ldl
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
-$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
$bf_obj =
$lflags = -ldl
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
-$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
$bf_obj =
$lflags = -ldl
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
-$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
$bf_obj =
$lflags = -ldl
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
-$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
$bf_obj =
$lflags = -ldl
$bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
-$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
$bf_obj =
$lflags = -ldl
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
-$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
$bf_obj =
$lflags = -ldl -no_cpprt
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
-$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
$bf_obj =
$lflags = -lws2_32 -lgdi32 -lcrypt32
$bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
$cpuid_obj = x86_64cpuid.o
-$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
$bf_obj =
$lflags = -lsocket -lnsl -ldl
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
-$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
$bf_obj =
$lflags = -lsocket -lnsl -ldl
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = x86_64cpuid.o
-$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
$des_obj =
$aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o
$bf_obj =
--- /dev/null
+/******************************************************************************\r
+* Copyright(c) 2012, Intel Corp. \r
+* Developers and authors: \r
+* Shay Gueron (1, 2), and Vlad Krasnov (1) \r
+* (1) Intel Corporation, Israel Development Center, Haifa, Israel \r
+* (2) University of Haifa, Israel \r
+******************************************************************************\r
+* LICENSE: \r
+* This submission to OpenSSL is to be made available under the OpenSSL \r
+* license, and only to the OpenSSL project, in order to allow integration \r
+* into the publicly distributed code. \r
+* The use of this code, or portions of this code, or concepts embedded in\r
+* this code, or modification of this code and/or algorithm(s) in it, or the\r
+* use of this code for any other purpose than stated above, requires special\r
+* licensing. \r
+******************************************************************************\r
+* DISCLAIMER: \r
+* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS \r
+* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED \r
+* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR \r
+* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT\r
+* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, \r
+* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF \r
+* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS \r
+* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN \r
+* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) \r
+* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE \r
+* POSSIBILITY OF SUCH DAMAGE. \r
+******************************************************************************/\r
+\r
+#include "rsaz_exp.h"\r
+\r
+/*\r
+ * Primitives for the 1024-bit AVX2 path. They are declared here and\r
+ * defined outside this file; see crypto/bn/asm/rsaz-avx2.pl for further\r
+ * details.\r
+ *\r
+ * The Montgomery constant k is typed BN_ULONG, matching the k0 argument\r
+ * that RSAZ_1024_mod_exp_avx2() forwards to these routines. It must not\r
+ * be declared unsigned long: that type is only 32 bits wide on LLP64\r
+ * targets (e.g. Win64), which would silently truncate the constant.\r
+ */\r
+void rsaz_1024_norm2red_avx2(void *red,const void *norm);\r
+void rsaz_1024_mul_avx2(void *ret,const void *a,const void *b,const void *n,BN_ULONG k);\r
+void rsaz_1024_sqr_avx2(void *ret,const void *a,const void *n,BN_ULONG k,int cnt);\r
+void rsaz_1024_scatter5_avx2(void *tbl,const void *val,int i);\r
+void rsaz_1024_gather5_avx2(void *val,const void *tbl,int i);\r
+void rsaz_1024_red2norm_avx2(void *norm,const void *red);\r
+\r
+/*\r
+ * ALIGN64 requests 64-byte alignment for the constants below. Sun C is\r
+ * handled with a #pragma that names the two objects instead of a\r
+ * declaration attribute; on unknown compilers the macro expands to\r
+ * nothing.\r
+ */\r
+#if defined(__GNUC__)\r
+# define ALIGN64 __attribute__((aligned(64)))\r
+#elif defined(_MSC_VER)\r
+# define ALIGN64 __declspec(align(64))\r
+#elif defined(__SUNPRO_C)\r
+# define ALIGN64\r
+# pragma align 64(one,two80)\r
+#else\r
+# define ALIGN64 /* not fatal, might hurt performance a little */\r
+#endif\r
+\r
+/*\r
+ * Constant operands handed to rsaz_1024_mul_avx2(): 40 words each, with\r
+ * every element not listed in the initializer zero.\r
+ *\r
+ * The element type is BN_ULONG rather than unsigned long so that the\r
+ * word width does not shrink to 32 bits on LLP64 targets (e.g. Win64),\r
+ * which would halve the size of these operands.\r
+ *\r
+ *   one   : word 0 = 1\r
+ *   two80 : word 2 = 1<<22\r
+ */\r
+ALIGN64 static const BN_ULONG one[40] =\r
+    {1};\r
+ALIGN64 static const BN_ULONG two80[40] =\r
+    {0,0,1<<22};\r
+\r
+void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16],\r
+ const BN_ULONG base_norm[16], const BN_ULONG exponent[16],\r
+ const BN_ULONG m_norm[16], const BN_ULONG RR[16], BN_ULONG k0)\r
+{\r
+ unsigned char storage[320*3+32*9*16+64]; /* 5.5KB */\r
+ unsigned char *p_str = storage + (64-((size_t)storage%64));\r
+ unsigned char *a_inv, *m, *result,\r
+ *table_s = p_str+320*3,\r
+ *R2 = table_s; /* borrow */\r
+ int index;\r
+ int wvalue;\r
+\r
+ if ((((size_t)p_str&4095)+320)>>12) {\r
+ result = p_str;\r
+ a_inv = p_str + 320;\r
+ m = p_str + 320*2; /* should not cross page */\r
+ } else {\r
+ m = p_str; /* should not cross page */\r
+ result = p_str + 320;\r
+ a_inv = p_str + 320*2;\r
+ }\r
+\r
+ rsaz_1024_norm2red_avx2(m, m_norm);\r
+ rsaz_1024_norm2red_avx2(a_inv, base_norm);\r
+ rsaz_1024_norm2red_avx2(R2, RR);\r
+\r
+ rsaz_1024_mul_avx2(R2, R2, R2, m, k0);\r
+ rsaz_1024_mul_avx2(R2, R2, two80, m, k0);\r
+\r
+ /* table[0] = 1 */\r
+ rsaz_1024_mul_avx2(result, R2, one, m, k0);\r
+ /* table[1] = a_inv^1 */\r
+ rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);\r
+\r
+ rsaz_1024_scatter5_avx2(table_s,result,0);\r
+ rsaz_1024_scatter5_avx2(table_s,a_inv,1);\r
+\r
+ /* table[2] = a_inv^2 */\r
+ rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);\r
+ rsaz_1024_scatter5_avx2(table_s,result,2);\r
+#if 0\r
+ /* this is almost 2x smaller and less than 1% slower */\r
+ for (index=3; index<32; index++) {\r
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);\r
+ rsaz_1024_scatter5_avx2(table_s,result,index);\r
+ }\r
+#else\r
+ /* table[4] = a_inv^4 */\r
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);\r
+ rsaz_1024_scatter5_avx2(table_s,result,4);\r
+ /* table[8] = a_inv^8 */\r
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);\r
+ rsaz_1024_scatter5_avx2(table_s,result,8);\r
+ /* table[16] = a_inv^16 */\r
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);\r
+ rsaz_1024_scatter5_avx2(table_s,result,16);\r
+ /* table[17] = a_inv^17 */\r
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);\r
+ rsaz_1024_scatter5_avx2(table_s,result,17);\r
+\r
+ /* table[3] */\r
+ rsaz_1024_gather5_avx2(result,table_s,2);\r
+ rsaz_1024_mul_avx2(result,result,a_inv,m,k0);\r
+ rsaz_1024_scatter5_avx2(table_s,result,3);\r
+ /* table[6] */\r
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);\r
+ rsaz_1024_scatter5_avx2(table_s,result,6);\r
+ /* table[12] */\r
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);\r
+ rsaz_1024_scatter5_avx2(table_s,result,12);\r
+ /* table[24] */\r
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);\r
+ rsaz_1024_scatter5_avx2(table_s,result,24);\r
+ /* table[25] */\r
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);\r
+ rsaz_1024_scatter5_avx2(table_s,result,25);\r
+\r
+ /* table[5] */\r
+ rsaz_1024_gather5_avx2(result,table_s,4);\r
+ rsaz_1024_mul_avx2(result,result,a_inv,m,k0);\r
+ rsaz_1024_scatter5_avx2(table_s,result,5);\r
+ /* table[10] */\r
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);\r
+ rsaz_1024_scatter5_avx2(table_s,result,10);\r
+ /* table[20] */\r
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);\r
+ rsaz_1024_scatter5_avx2(table_s,result,20);\r
+ /* table[21] */\r
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);\r
+ rsaz_1024_scatter5_avx2(table_s,result,21);\r
+\r
+ /* table[7] */\r
+ rsaz_1024_gather5_avx2(result,table_s,6);\r
+ rsaz_1024_mul_avx2(result,result,a_inv,m,k0);\r
+ rsaz_1024_scatter5_avx2(table_s,result,7);\r
+ /* table[14] */\r
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);\r
+ rsaz_1024_scatter5_avx2(table_s,result,14);\r
+ /* table[28] */\r
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);\r
+ rsaz_1024_scatter5_avx2(table_s,result,28);\r
+ /* table[29] */\r
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);\r
+ rsaz_1024_scatter5_avx2(table_s,result,29);\r
+\r
+ /* table[9] */\r
+ rsaz_1024_gather5_avx2(result,table_s,8);\r
+ rsaz_1024_mul_avx2(result,result,a_inv,m,k0);\r
+ rsaz_1024_scatter5_avx2(table_s,result,9);\r
+ /* table[18] */\r
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);\r
+ rsaz_1024_scatter5_avx2(table_s,result,18);\r
+ /* table[19] */\r
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);\r
+ rsaz_1024_scatter5_avx2(table_s,result,19);\r
+\r
+ /* table[11] */\r
+ rsaz_1024_gather5_avx2(result,table_s,10);\r
+ rsaz_1024_mul_avx2(result,result,a_inv,m,k0);\r
+ rsaz_1024_scatter5_avx2(table_s,result,11);\r
+ /* table[22] */\r
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);\r
+ rsaz_1024_scatter5_avx2(table_s,result,22);\r
+ /* table[23] */\r
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);\r
+ rsaz_1024_scatter5_avx2(table_s,result,23);\r
+\r
+ /* table[13] */\r
+ rsaz_1024_gather5_avx2(result,table_s,12);\r
+ rsaz_1024_mul_avx2(result,result,a_inv,m,k0);\r
+ rsaz_1024_scatter5_avx2(table_s,result,13);\r
+ /* table[26] */\r
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);\r
+ rsaz_1024_scatter5_avx2(table_s,result,26);\r
+ /* table[27] */\r
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);\r
+ rsaz_1024_scatter5_avx2(table_s,result,27);\r
+\r
+ /* table[15] */\r
+ rsaz_1024_gather5_avx2(result,table_s,14);\r
+ rsaz_1024_mul_avx2(result,result,a_inv,m,k0);\r
+ rsaz_1024_scatter5_avx2(table_s,result,15);\r
+ /* table[30] */\r
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);\r
+ rsaz_1024_scatter5_avx2(table_s,result,30);\r
+ /* table[31] */\r
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);\r
+ rsaz_1024_scatter5_avx2(table_s,result,31);\r
+#endif\r
+\r
+ /* load first window */\r
+ p_str = (unsigned char*)exponent;\r
+ wvalue = p_str[127] >> 3;\r
+ rsaz_1024_gather5_avx2(result,table_s,wvalue);\r
+\r
+ index = 1014;\r
+\r
+ while(index > -1) { /* loop for the remaining 127 windows */\r
+\r
+ rsaz_1024_sqr_avx2(result, result, m, k0, 5);\r
+\r
+ wvalue = *((unsigned short*)&p_str[index/8]);\r
+ wvalue = (wvalue>> (index%8)) & 31;\r
+ index-=5;\r
+\r
+ rsaz_1024_gather5_avx2(a_inv,table_s,wvalue); /* borrow a_inv */\r
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);\r
+ }\r
+\r
+ /* square four times */\r
+ rsaz_1024_sqr_avx2(result, result, m, k0, 4);\r
+\r
+ wvalue = p_str[0] & 15;\r
+\r
+ rsaz_1024_gather5_avx2(a_inv,table_s,wvalue); /* borrow a_inv */\r
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);\r
+\r
+ /* from Montgomery */\r
+ rsaz_1024_mul_avx2(result, result, one, m, k0);\r
+\r
+ rsaz_1024_red2norm_avx2(result_norm, result);\r
+\r
+ OPENSSL_cleanse(storage,sizeof(storage));\r
+}\r
+\r
+/*\r
+ * Primitives for the 512-bit path. They are declared here and defined\r
+ * outside this file; see crypto/bn/asm/rsaz-x86_64.pl for further\r
+ * details.\r
+ *\r
+ * Word-sized arguments use BN_ULONG rather than unsigned long: the\r
+ * caller's operands and k0 are BN_ULONG, and unsigned long is only 32\r
+ * bits wide on LLP64 targets (e.g. Win64).\r
+ */\r
+void rsaz_512_mul(void *ret,const void *a,const void *b,const void *n,BN_ULONG k);\r
+void rsaz_512_mul_scatter4(void *ret,const void *a,const void *n,BN_ULONG k,const void *tbl,unsigned int power);\r
+void rsaz_512_mul_gather4(void *ret,const void *a,const void *tbl,const void *n,BN_ULONG k,unsigned int power);\r
+void rsaz_512_mul_by_one(void *ret,const void *a,const void *n,BN_ULONG k);\r
+void rsaz_512_sqr(void *ret,const void *a,const void *n,BN_ULONG k,int cnt);\r
+void rsaz_512_scatter4(void *tbl, const BN_ULONG *val, int power);\r
+void rsaz_512_gather4(BN_ULONG *val, const void *tbl, int power);\r
+\r
+/*\r
+ * RSAZ_512_mod_exp - fixed-window (4-bit) modular exponentiation on\r
+ * 512-bit operands, built on the rsaz_512_* primitives.\r
+ *\r
+ *   result    output, 8 words, written by rsaz_512_mul_by_one()\r
+ *   base      base, 8 words\r
+ *   exponent  exponent, 8 words; read here as 64 bytes with byte 63\r
+ *             processed first (most significant for little-endian words)\r
+ *   m         modulus, 8 words\r
+ *   k0        Montgomery constant forwarded to the primitives\r
+ *   RR        Montgomery conversion constant supplied by the caller\r
+ *             (presumably R^2 mod m -- confirm against the caller)\r
+ *\r
+ * A 16-entry table of powers of the base is built with scatter4, then\r
+ * every exponent byte is consumed as two 4-bit windows, high nibble\r
+ * first. The scratch buffer is wiped with OPENSSL_cleanse() on exit.\r
+ */\r
+void RSAZ_512_mod_exp(BN_ULONG result[8],\r
+    const BN_ULONG base[8], const BN_ULONG exponent[8],\r
+    const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8])\r
+{\r
+    unsigned char storage[16*8*8+64*2+64]; /* 1.2KB */\r
+    /* first 64-byte aligned address strictly inside storage */\r
+    unsigned char *table = storage + (64-((size_t)storage%64));\r
+    /* two 64-byte operands placed right after the 16*8*8-byte table */\r
+    BN_ULONG *a_inv = (BN_ULONG *)(table+16*8*8),\r
+             *temp = (BN_ULONG *)(table+16*8*8+8*8);\r
+    const unsigned char *p_str = (const unsigned char*)exponent;\r
+    int index;\r
+    unsigned int wvalue;\r
+\r
+    /*\r
+     * table[0] = 1_inv\r
+     * temp = 2^512 - m as a two's complement negation: negate the low\r
+     * word and complement the others. This is exact as long as m[0] is\r
+     * non-zero, which holds for any odd modulus.\r
+     */\r
+    temp[0] = 0-m[0]; temp[1] = ~m[1];\r
+    temp[2] = ~m[2]; temp[3] = ~m[3];\r
+    temp[4] = ~m[4]; temp[5] = ~m[5];\r
+    temp[6] = ~m[6]; temp[7] = ~m[7];\r
+    rsaz_512_scatter4(table, temp, 0);\r
+\r
+    /* table [1] = a_inv^1 */\r
+    rsaz_512_mul(a_inv, base, RR, m, k0);\r
+    rsaz_512_scatter4(table, a_inv, 1);\r
+\r
+    /* table [2] = a_inv^2 */\r
+    rsaz_512_sqr(temp, a_inv, m, k0, 1);\r
+    rsaz_512_scatter4(table, temp, 2);\r
+\r
+    /* table[3..15]: multiply the running value in temp by a_inv and store */\r
+    for (index=3; index<16; index++)\r
+        rsaz_512_mul_scatter4(temp, a_inv, m, k0, table, index);\r
+\r
+    /* load first window */\r
+    wvalue = p_str[63];\r
+\r
+    rsaz_512_gather4(temp, table, wvalue>>4);\r
+    rsaz_512_sqr(temp, temp, m, k0, 4);\r
+    rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue&0xf);\r
+\r
+    /* remaining 63 bytes, two 4-bit windows each, high nibble first */\r
+    for (index=62; index>=0; index--) {\r
+        wvalue = p_str[index];\r
+\r
+        rsaz_512_sqr(temp, temp, m, k0, 4);\r
+        rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue>>4);\r
+\r
+        rsaz_512_sqr(temp, temp, m, k0, 4);\r
+        rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue&0x0f);\r
+    }\r
+\r
+    /* from Montgomery */\r
+    rsaz_512_mul_by_one(result, temp, m, k0);\r
+\r
+    /* the scratch area held the table and intermediates; wipe it */\r
+    OPENSSL_cleanse(storage,sizeof(storage));\r
+}\r