# forms are granted according to the OpenSSL license.
# ====================================================================
#
-# Version 4.1.
+# Version 4.2.
#
# You might fail to appreciate this module performance from the first
# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
# aggressively pre-fetched.
#
# Version 4.0 effectively rolls back to 3.6 and instead implements
-# additional set of functions, _[x86|mmx]_AES_[en|de]crypt_compact,
+# additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact,
# which use exclusively 256 byte S-box. These functions are to be
# called in modes not concealing plain text, such as ECB, or when
# we're asked to process smaller amount of data [or unconditionally
# Pentium 120 160 77
#
# Version 4.1 switches to compact S-box even in key schedule setup.
+#
+# Version 4.2 prefetches compact S-box in every SSE round or in other
+# words every cache-line is *guaranteed* to be accessed within ~50
+# cycles window. Why just SSE? Because it's needed on hyper-threading
+# CPU! Which is also why it's prefetched with 64 byte stride. Best
+# part is that it has no negative effect on performance:-)
push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";
&mov ($s2,$v1); # s[2]=t[2]
}
-# More experimental code... MMX one... Even though this one eliminates
+# More experimental code... SSE one... Even though this one eliminates
# *all* references to stack, it's not faster...
-sub mmx_encbody()
+sub sse_encbody()
{
&movz ($acc,&LB("eax")); # 0
&mov ("ecx",&DWP(0,$tbl,$acc,8)); # 0
&function_end_B("_x86_AES_encrypt_compact");
######################################################################
-# "Compact" MMX block function.
+# "Compact" SSE block function.
######################################################################
#
# Performance is not actually extraordinary in comparison to pure
# instructions are twice as many, they can be scheduled every cycle
# and not every second one when they are operating on xmm register,
# so that "arithmetic throughput" remains virtually the same. And
-# finally the code can be executed even on elder MMX-only CPUs:-)
+# finally the code can be executed even on elder SSE-only CPUs:-)
-sub mmx_enccompact()
+sub sse_enccompact()
{
&pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
&pshufw ("mm5","mm4",0x0d); # 15,14,11,10
}
&public_label("AES_Te");
-&function_begin_B("_mmx_AES_encrypt_compact");
+&function_begin_B("_sse_AES_encrypt_compact");
&pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
&pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
&mov ($s3,&DWP(224-128,$tbl));
&set_label("loop",16);
- &mmx_enccompact();
+ &sse_enccompact();
&add ($key,16);
&cmp ($key,&DWP(24,"esp"));
&ja (&label("out"));
&movq ("mm3","mm1"); &movq ("mm7","mm5");
&movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
&psrld ("mm1",8); &psrld ("mm5",8);
+ &mov ($s0,&DWP(0-128,$tbl));
&pslld ("mm3",24); &pslld ("mm7",24);
+ &mov ($s1,&DWP(64-128,$tbl));
&pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= (r2^r0)<<8
+ &mov ($s2,&DWP(128-128,$tbl));
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= (r2^r0)>>24
+ &mov ($s3,&DWP(192-128,$tbl));
&pxor ("mm0","mm2"); &pxor ("mm4","mm6");
&jmp (&label("loop"));
&pxor ("mm4",&QWP(8,$key));
&ret ();
-&function_end_B("_mmx_AES_encrypt_compact");
+&function_end_B("_sse_AES_encrypt_compact");
######################################################################
# Vanilla block function.
&ret ();
-&set_label("AES_Te",1024); # Yes! I keep it in the code segment!
+&set_label("AES_Te",64); # Yes! I keep it in the code segment!
&_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
&_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
&_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
#rcon:
&data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
&data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
- &data_word(0x0000001b, 0x00000036, 0, 0, 0, 0, 0, 0);
+ &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000);
+ &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000);
&function_end_B("_x86_AES_encrypt");
# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
&picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point"));
&lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
# pick Te4 copy which can't "overlap" with stack frame or key schedule
- &lea ($s1,&DWP(768,"esp"));
+ &lea ($s1,&DWP(768-4,"esp"));
+ &sub ($s1,$tbl);
&and ($s1,0x300);
&lea ($tbl,&DWP(2048+128,$tbl,$s1));
- &bt (&DWP(0,$s0),23); # check for MMX bit
- &jc (&label("mmx"));
+ &bt (&DWP(0,$s0),25); # check for SSE bit
+ &jnc (&label("x86"));
+
+ &movq ("mm0",&QWP(0,$acc));
+ &movq ("mm4",&QWP(8,$acc));
+ &call ("_sse_AES_encrypt_compact");
+ &mov ("esp",&DWP(28,"esp")); # restore stack pointer
+ &mov ($acc,&wparam(1)); # load out
+ &movq (&QWP(0,$acc),"mm0"); # write output data
+ &movq (&QWP(8,$acc),"mm4");
+ &emms ();
+ &function_end_A();
+ &set_label("x86",16);
&mov (&DWP(24,"esp"),$tbl);
&mov ($s0,&DWP(0,$acc)); # load input data
&mov ($s1,&DWP(4,$acc));
&mov (&DWP(4,$acc),$s1);
&mov (&DWP(8,$acc),$s2);
&mov (&DWP(12,$acc),$s3);
- &function_end_A();
-
- &set_label("mmx",16);
- &movq ("mm0",&QWP(0,$acc));
- &movq ("mm4",&QWP(8,$acc));
- &call ("_mmx_AES_encrypt_compact");
- &mov ("esp",&DWP(28,"esp")); # restore stack pointer
- &mov ($acc,&wparam(1)); # load out
- &movq (&QWP(0,$acc),"mm0"); # write output data
- &movq (&QWP(8,$acc),"mm4");
- &emms ();
&function_end("AES_encrypt");
#--------------------------------------------------------------------#
&function_end_B("_x86_AES_decrypt_compact");
######################################################################
-# "Compact" MMX block function.
+# "Compact" SSE block function.
######################################################################
-sub mmx_deccompact()
+sub sse_deccompact()
{
&pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
&movd ("eax","mm1"); # 7, 6, 1, 0
}
&public_label("AES_Td");
-&function_begin_B("_mmx_AES_decrypt_compact");
+&function_begin_B("_sse_AES_decrypt_compact");
&pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
&pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
&mov ($s3,&DWP(224-128,$tbl));
&set_label("loop",16);
- &mmx_deccompact();
+ &sse_deccompact();
&add ($key,16);
&cmp ($key,&DWP(24,"esp"));
&ja (&label("out"));
&pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(tp8,16)
&pslld ("mm1",8); &pslld ("mm5",8);
&psrld ("mm3",8); &psrld ("mm7",8);
- &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&DWP(8,$key));
+ &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
&pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<8
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>8
+ &mov ($s0,&DWP(0-128,$tbl));
&pslld ("mm1",16); &pslld ("mm5",16);
+ &mov ($s1,&DWP(64-128,$tbl));
&psrld ("mm3",16); &psrld ("mm7",16);
+ &mov ($s2,&DWP(128-128,$tbl));
&pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<24
+ &mov ($s3,&DWP(192-128,$tbl));
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>24
&pxor ("mm0","mm2"); &pxor ("mm4","mm6");
&pxor ("mm4",&QWP(8,$key));
&ret ();
-&function_end_B("_mmx_AES_decrypt_compact");
+&function_end_B("_sse_AES_decrypt_compact");
######################################################################
# Vanilla block function.
&ret ();
-&set_label("AES_Td",1024); # Yes! I keep it in the code segment!
+&set_label("AES_Td",64); # Yes! I keep it in the code segment!
&_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
&_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
&_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
&picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point"));
&lea ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl));
# pick Td4 copy which can't "overlap" with stack frame or key schedule
- &lea ($s1,&DWP(768,"esp"));
+ &lea ($s1,&DWP(768-4,"esp"));
+ &sub ($s1,$tbl);
&and ($s1,0x300);
&lea ($tbl,&DWP(2048+128,$tbl,$s1));
- &bt (&DWP(0,$s0),23); # check for MMX bit
- &jc (&label("mmx"));
+ &bt (&DWP(0,$s0),25); # check for SSE bit
+ &jnc (&label("x86"));
+
+ &movq ("mm0",&QWP(0,$acc));
+ &movq ("mm4",&QWP(8,$acc));
+ &call ("_sse_AES_decrypt_compact");
+ &mov ("esp",&DWP(28,"esp")); # restore stack pointer
+ &mov ($acc,&wparam(1)); # load out
+ &movq (&QWP(0,$acc),"mm0"); # write output data
+ &movq (&QWP(8,$acc),"mm4");
+ &emms ();
+ &function_end_A();
+ &set_label("x86",16);
&mov (&DWP(24,"esp"),$tbl);
&mov ($s0,&DWP(0,$acc)); # load input data
&mov ($s1,&DWP(4,$acc));
&mov (&DWP(4,$acc),$s1);
&mov (&DWP(8,$acc),$s2);
&mov (&DWP(12,$acc),$s3);
- &function_end_A();
-
- &set_label("mmx",16);
- &movq ("mm0",&QWP(0,$acc));
- &movq ("mm4",&QWP(8,$acc));
- &call ("_mmx_AES_decrypt_compact");
- &mov ("esp",&DWP(28,"esp")); # restore stack pointer
- &mov ($acc,&wparam(1)); # load out
- &movq (&QWP(0,$acc),"mm0"); # write output data
- &movq (&QWP(8,$acc),"mm4");
- &emms ();
&function_end("AES_decrypt");
# void AES_cbc_encrypt (const void char *inp, unsigned char *out,
&shl ("ebx",16);
&xor ("eax","ebx");
- &xor ("eax",&BP(1024-128,$tbl,"ecx",4)); # rcon
+ &xor ("eax",&DWP(1024-128,$tbl,"ecx",4)); # rcon
}
# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,