# forms are granted according to the OpenSSL license.
# ====================================================================
#
-# Version 4.0.
+# Version 4.1.
#
# You might fail to appreciate this module performance from the first
# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
# byte for 128-bit key.
#
# ECB encrypt ECB decrypt CBC large chunk
-# P4 57[60] 84[100] 23
+# P4 56[60] 84[100] 23
# AMD K8 48[44] 70[79] 18
# PIII 41[50] 61[91] 24
# Pentium 120 160 77
+#
+# Version 4.1 switches to compact S-box even in key schedule setup.
push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";
# *all* references to stack, it's not faster...
sub mmx_encbody()
{
- &movz ("esi",&LB("eax")); # 0
- &mov ("ecx",&DWP(0,$tbl,"esi",8)); # 0
+ &movz ($acc,&LB("eax")); # 0
+ &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 0
&pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
&movz ("edx",&HB("eax")); # 1
&mov ("edx",&DWP(3,$tbl,"edx",8)); # 1
&shr ("eax",16); # 5, 4
- &movz ("esi",&LB("ebx")); # 10
- &xor ("ecx",&DWP(2,$tbl,"esi",8)); # 10
+ &movz ($acc,&LB("ebx")); # 10
+ &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 10
&pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
- &movz ("esi",&HB("ebx")); # 11
- &xor ("edx",&DWP(1,$tbl,"esi",8)); # 11
+ &movz ($acc,&HB("ebx")); # 11
+ &xor ("edx",&DWP(1,$tbl,$acc,8)); # 11
&shr ("ebx",16); # 15,14
- &movz ("esi",&HB("eax")); # 5
- &xor ("ecx",&DWP(3,$tbl,"esi",8)); # 5
+ &movz ($acc,&HB("eax")); # 5
+ &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 5
&movq ("mm3",QWP(16,$key));
- &movz ("esi",&HB("ebx")); # 15
- &xor ("ecx",&DWP(1,$tbl,"esi",8)); # 15
+ &movz ($acc,&HB("ebx")); # 15
+ &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 15
&movd ("mm0","ecx"); # t[0] collected
- &movz ("esi",&LB("eax")); # 4
- &mov ("ecx",&DWP(0,$tbl,"esi",8)); # 4
+ &movz ($acc,&LB("eax")); # 4
+ &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 4
&movd ("eax","mm2"); # 7, 6, 3, 2
- &movz ("esi",&LB("ebx")); # 14
- &xor ("ecx",&DWP(2,$tbl,"esi",8)); # 14
+ &movz ($acc,&LB("ebx")); # 14
+ &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 14
&movd ("ebx","mm6"); # 13,12, 9, 8
- &movz ("esi",&HB("eax")); # 3
- &xor ("ecx",&DWP(1,$tbl,"esi",8)); # 3
- &movz ("esi",&HB("ebx")); # 9
- &xor ("ecx",&DWP(3,$tbl,"esi",8)); # 9
+ &movz ($acc,&HB("eax")); # 3
+ &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 3
+ &movz ($acc,&HB("ebx")); # 9
+ &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 9
&movd ("mm1","ecx"); # t[1] collected
- &movz ("esi",&LB("eax")); # 2
- &mov ("ecx",&DWP(2,$tbl,"esi",8)); # 2
+ &movz ($acc,&LB("eax")); # 2
+ &mov ("ecx",&DWP(2,$tbl,$acc,8)); # 2
&shr ("eax",16); # 7, 6
&punpckldq ("mm0","mm1"); # t[0,1] collected
- &movz ("esi",&LB("ebx")); # 8
- &xor ("ecx",&DWP(0,$tbl,"esi",8)); # 8
+ &movz ($acc,&LB("ebx")); # 8
+ &xor ("ecx",&DWP(0,$tbl,$acc,8)); # 8
&shr ("ebx",16); # 13,12
- &movz ("esi",&HB("eax")); # 7
- &xor ("ecx",&DWP(1,$tbl,"esi",8)); # 7
+ &movz ($acc,&HB("eax")); # 7
+ &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 7
&pxor ("mm0","mm3");
&movz ("eax",&LB("eax")); # 6
&xor ("edx",&DWP(2,$tbl,"eax",8)); # 6
&pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
- &movz ("esi",&HB("ebx")); # 13
- &xor ("ecx",&DWP(3,$tbl,"esi",8)); # 13
+ &movz ($acc,&HB("ebx")); # 13
+ &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 13
&xor ("ecx",&DWP(24,$key)); # t[2]
&movd ("mm4","ecx"); # t[2] collected
&movz ("ebx",&LB("ebx")); # 12
&and ($out,0xFF);
if ($i==1) { &shr ($s[0],16); }#%ebx[1]
if ($i==2) { &shr ($s[0],24); }#%ecx[2]
- &movz ($out,&DWP(-128,$te,$out,1));
+ &movz ($out,&BP(-128,$te,$out,1));
if ($i==3) { $tmp=$s[1]; }##%eax
&movz ($tmp,&HB($s[1]));
- &movz ($tmp,&DWP(-128,$te,$tmp,1));
+ &movz ($tmp,&BP(-128,$te,$tmp,1));
&shl ($tmp,8);
&xor ($out,$tmp);
&shr ($tmp,16); }
if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
&and ($tmp,0xFF);
- &movz ($tmp,&DWP(-128,$te,$tmp,1));
+ &movz ($tmp,&BP(-128,$te,$tmp,1));
&shl ($tmp,16);
&xor ($out,$tmp);
elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
else { &mov ($tmp,$s[3]);
&shr ($tmp,24); }
- &movz ($tmp,&DWP(-128,$te,$tmp,1));
+ &movz ($tmp,&BP(-128,$te,$tmp,1));
&shl ($tmp,24);
&xor ($out,$tmp);
if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
#
# Performance is not actually extraordinary in comparison to pure
# x86 code. In particular encrypt performance is virtually the same.
-# same. Decrypt performance on the other hand is 15-20% better on
-# newer µ-archs [but we're thankful for *any* improvement here], and
-# ~50% better on PIII:-) And additionally on the pros side this code
+# Decrypt performance on the other hand is 15-20% better on newer
+# µ-archs [but we're thankful for *any* improvement here], and ~50%
+# better on PIII:-) And additionally on the pros side this code
# eliminates redundant references to stack and thus relieves/
# minimizes the pressure on the memory bus.
#
&movd ("eax","mm1"); # 5, 4, 1, 0
&movd ("ebx","mm5"); # 15,14,11,10
- &movz ("esi",&LB("eax")); # 0
- &movz ("ecx",&DWP(-128,$tbl,"esi",1));# 0
+ &movz ($acc,&LB("eax")); # 0
+ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
&pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
&movz ("edx",&HB("eax")); # 1
- &movz ("edx",&DWP(-128,$tbl,"edx",1));# 1
+ &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
&shl ("edx",8); # 1
&shr ("eax",16); # 5, 4
- &movz ("esi",&LB("ebx")); # 10
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 10
- &shl ("esi",16); # 10
- &or ("ecx","esi"); # 10
+ &movz ($acc,&LB("ebx")); # 10
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
+ &shl ($acc,16); # 10
+ &or ("ecx",$acc); # 10
&pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
- &movz ("esi",&HB("ebx")); # 11
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 11
- &shl ("esi",24); # 11
- &or ("edx","esi"); # 11
+ &movz ($acc,&HB("ebx")); # 11
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
+ &shl ($acc,24); # 11
+ &or ("edx",$acc); # 11
&shr ("ebx",16); # 15,14
- &movz ("esi",&HB("eax")); # 5
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 5
- &shl ("esi",8); # 5
- &or ("ecx","esi"); # 5
- &movz ("esi",&HB("ebx")); # 15
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 15
- &shl ("esi",24); # 15
- &or ("ecx","esi"); # 15
+ &movz ($acc,&HB("eax")); # 5
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 5
+ &shl ($acc,8); # 5
+ &or ("ecx",$acc); # 5
+ &movz ($acc,&HB("ebx")); # 15
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
+ &shl ($acc,24); # 15
+ &or ("ecx",$acc); # 15
&movd ("mm0","ecx"); # t[0] collected
- &movz ("esi",&LB("eax")); # 4
- &movz ("ecx",&DWP(-128,$tbl,"esi",1));# 4
+ &movz ($acc,&LB("eax")); # 4
+ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4
&movd ("eax","mm2"); # 7, 6, 3, 2
- &movz ("esi",&LB("ebx")); # 14
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 14
- &shl ("esi",16); # 14
- &or ("ecx","esi"); # 14
+ &movz ($acc,&LB("ebx")); # 14
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
+ &shl ($acc,16); # 14
+ &or ("ecx",$acc); # 14
&movd ("ebx","mm6"); # 13,12, 9, 8
- &movz ("esi",&HB("eax")); # 3
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 3
- &shl ("esi",24); # 3
- &or ("ecx","esi"); # 3
- &movz ("esi",&HB("ebx")); # 9
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 9
- &shl ("esi",8); # 9
- &or ("ecx","esi"); # 9
+ &movz ($acc,&HB("eax")); # 3
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 3
+ &shl ($acc,24); # 3
+ &or ("ecx",$acc); # 3
+ &movz ($acc,&HB("ebx")); # 9
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
+ &shl ($acc,8); # 9
+ &or ("ecx",$acc); # 9
&movd ("mm1","ecx"); # t[1] collected
- &movz ("esi",&LB("ebx")); # 8
- &movz ("ecx",&DWP(-128,$tbl,"esi",1));# 8
+ &movz ($acc,&LB("ebx")); # 8
+ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8
&shr ("ebx",16); # 13,12
- &movz ("esi",&LB("eax")); # 2
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 2
- &shl ("esi",16); # 2
- &or ("ecx","esi"); # 2
+ &movz ($acc,&LB("eax")); # 2
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
+ &shl ($acc,16); # 2
+ &or ("ecx",$acc); # 2
&shr ("eax",16); # 7, 6
&punpckldq ("mm0","mm1"); # t[0,1] collected
- &movz ("esi",&HB("eax")); # 7
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 7
- &shl ("esi",24); # 7
- &or ("ecx","esi"); # 7
+ &movz ($acc,&HB("eax")); # 7
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
+ &shl ($acc,24); # 7
+ &or ("ecx",$acc); # 7
&and ("eax",0xff); # 6
- &movz ("eax",&DWP(-128,$tbl,"eax",1));# 6
+ &movz ("eax",&BP(-128,$tbl,"eax",1)); # 6
&shl ("eax",16); # 6
&or ("edx","eax"); # 6
- &movz ("esi",&HB("ebx")); # 13
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 13
- &shl ("esi",8); # 13
- &or ("ecx","esi"); # 13
+ &movz ($acc,&HB("ebx")); # 13
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
+ &shl ($acc,8); # 13
+ &or ("ecx",$acc); # 13
&movd ("mm4","ecx"); # t[2] collected
&and ("ebx",0xff); # 12
- &movz ("ebx",&DWP(-128,$tbl,"ebx",1));# 12
+ &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
&or ("edx","ebx"); # 12
&movd ("mm5","edx"); # t[3] collected
&movq ("mm1","mm0"); &movq ("mm5","mm4"); # r0
&pcmpgtb("mm3","mm0"); &pcmpgtb("mm7","mm4");
&pand ("mm3","mm2"); &pand ("mm7","mm2");
- &movq ("mm2","mm0"); &movq ("mm6","mm4"); # r0
+ &pshufw ("mm2","mm0",0xb1); &pshufw ("mm6","mm4",0xb1);# ROTATE(r0,16)
&paddb ("mm0","mm0"); &paddb ("mm4","mm4");
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # = r2
- &movq ("mm3","mm2"); &movq ("mm7","mm6");
- &pxor ("mm1","mm0"); &pxor ("mm5","mm4"); # r2^r0
+ &pshufw ("mm3","mm2",0xb1); &pshufw ("mm7","mm6",0xb1);# r0
+ &pxor ("mm1","mm0"); &pxor ("mm5","mm4"); # r0^r2
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(r0,16)
+ &movq ("mm2","mm3"); &movq ("mm6","mm7");
&pslld ("mm3",8); &pslld ("mm7",8);
- &psrld ("mm2",16); &psrld ("mm6",16);
+ &psrld ("mm2",24); &psrld ("mm6",24);
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= r0<<8
- &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= r0>>16
- &pslld ("mm3",8); &pslld ("mm7",8);
- &psrld ("mm2",8); &psrld ("mm6",8);
- &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= r0<<16
- &movq ("mm3","mm1"); &movq ("mm7","mm5");
&pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= r0>>24
- &psrld ("mm1",8); &psrld ("mm5",8);
+ &movq ("mm3","mm1"); &movq ("mm7","mm5");
&movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
+ &psrld ("mm1",8); &psrld ("mm5",8);
&pslld ("mm3",24); &pslld ("mm7",24);
&pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= (r2^r0)<<8
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= (r2^r0)>>24
&mov (&DWP(4,$acc),$s1);
&mov (&DWP(8,$acc),$s2);
&mov (&DWP(12,$acc),$s3);
- &jmp (&label("ret"));
+ &function_end_A();
&set_label("mmx",16);
&movq ("mm0",&QWP(0,$acc));
&movq (&QWP(0,$acc),"mm0"); # write output data
&movq (&QWP(8,$acc),"mm4");
&emms ();
-
-&set_label("ret",4);
&function_end("AES_encrypt");
#--------------------------------------------------------------------#
if($i==3) { &$Fn ($key,&DWP(20,"esp")); }
else { &mov ($out,$s[0]); }
&and ($out,0xFF);
- &movz ($out,&DWP(-128,$td,$out,1));
+ &movz ($out,&BP(-128,$td,$out,1));
if ($i==3) { $tmp=$s[1]; }
&movz ($tmp,&HB($s[1]));
- &movz ($tmp,&DWP(-128,$td,$tmp,1));
+ &movz ($tmp,&BP(-128,$td,$tmp,1));
&shl ($tmp,8);
&xor ($out,$tmp);
else { mov ($tmp,$s[2]); }
&shr ($tmp,16);
&and ($tmp,0xFF);
- &movz ($tmp,&DWP(-128,$td,$tmp,1));
+ &movz ($tmp,&BP(-128,$td,$tmp,1));
&shl ($tmp,16);
&xor ($out,$tmp);
if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],&DWP(8,"esp")); }
else { &mov ($tmp,$s[3]); }
&shr ($tmp,24);
- &movz ($tmp,&DWP(-128,$td,$tmp,1));
+ &movz ($tmp,&BP(-128,$td,$tmp,1));
&shl ($tmp,24);
&xor ($out,$tmp);
if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
&movd ("eax","mm1"); # 7, 6, 1, 0
&pshufw ("mm5","mm4",0x09); # 13,12,11,10
- &movz ("esi",&LB("eax")); # 0
- &movz ("ecx",&DWP(-128,$tbl,"esi",1));# 0
+ &movz ($acc,&LB("eax")); # 0
+ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
&movd ("ebx","mm5"); # 13,12,11,10
&movz ("edx",&HB("eax")); # 1
- &movz ("edx",&DWP(-128,$tbl,"edx",1));# 1
+ &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
&shl ("edx",8); # 1
&pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
- &movz ("esi",&LB("ebx")); # 10
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 10
- &shl ("esi",16); # 10
- &or ("ecx","esi"); # 10
+ &movz ($acc,&LB("ebx")); # 10
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
+ &shl ($acc,16); # 10
+ &or ("ecx",$acc); # 10
&shr ("eax",16); # 7, 6
- &movz ("esi",&HB("ebx")); # 11
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 11
- &shl ("esi",24); # 11
- &or ("edx","esi"); # 11
+ &movz ($acc,&HB("ebx")); # 11
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
+ &shl ($acc,24); # 11
+ &or ("edx",$acc); # 11
&shr ("ebx",16); # 13,12
&pshufw ("mm6","mm4",0x03); # 9, 8,15,14
- &movz ("esi",&HB("eax")); # 7
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 7
- &shl ("esi",24); # 7
- &or ("ecx","esi"); # 7
- &movz ("esi",&HB("ebx")); # 13
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 13
- &shl ("esi",8); # 13
- &or ("ecx","esi"); # 13
+ &movz ($acc,&HB("eax")); # 7
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
+ &shl ($acc,24); # 7
+ &or ("ecx",$acc); # 7
+ &movz ($acc,&HB("ebx")); # 13
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
+ &shl ($acc,8); # 13
+ &or ("ecx",$acc); # 13
&movd ("mm0","ecx"); # t[0] collected
- &movz ("esi",&LB("eax")); # 6
+ &movz ($acc,&LB("eax")); # 6
&movd ("eax","mm2"); # 3, 2, 5, 4
- &movz ("ecx",&DWP(-128,$tbl,"esi",1));# 6
+ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6
&shl ("ecx",16); # 6
- &movz ("esi",&LB("ebx")); # 12
+ &movz ($acc,&LB("ebx")); # 12
&movd ("ebx","mm6"); # 9, 8,15,14
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 12
- &or ("ecx","esi"); # 12
-
- &movz ("esi",&LB("eax")); # 4
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 4
- &or ("edx","esi"); # 4
- &movz ("esi",&LB("ebx")); # 14
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 14
- &shl ("esi",16); # 14
- &or ("edx","esi"); # 14
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12
+ &or ("ecx",$acc); # 12
+
+ &movz ($acc,&LB("eax")); # 4
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4
+ &or ("edx",$acc); # 4
+ &movz ($acc,&LB("ebx")); # 14
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
+ &shl ($acc,16); # 14
+ &or ("edx",$acc); # 14
&movd ("mm1","edx"); # t[1] collected
- &movz ("esi",&HB("eax")); # 5
- &movz ("edx",&DWP(-128,$tbl,"esi",1));# 5
+ &movz ($acc,&HB("eax")); # 5
+ &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5
&shl ("edx",8); # 5
- &movz ("esi",&HB("ebx")); # 15
+ &movz ($acc,&HB("ebx")); # 15
&shr ("eax",16); # 3, 2
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 15
- &shl ("esi",24); # 15
- &or ("edx","esi"); # 15
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
+ &shl ($acc,24); # 15
+ &or ("edx",$acc); # 15
&shr ("ebx",16); # 9, 8
&punpckldq ("mm0","mm1"); # t[0,1] collected
- &movz ("esi",&HB("ebx")); # 9
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 9
- &shl ("esi",8); # 9
- &or ("ecx","esi"); # 9
+ &movz ($acc,&HB("ebx")); # 9
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
+ &shl ($acc,8); # 9
+ &or ("ecx",$acc); # 9
&and ("ebx",0xff); # 8
- &movz ("ebx",&DWP(-128,$tbl,"ebx",1));# 8
+ &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
&or ("edx","ebx"); # 8
- &movz ("esi",&LB("eax")); # 2
- &movz ("esi",&DWP(-128,$tbl,"esi",1));# 2
- &shl ("esi",16); # 2
- &or ("edx","esi"); # 2
+ &movz ($acc,&LB("eax")); # 2
+ &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
+ &shl ($acc,16); # 2
+ &or ("edx",$acc); # 2
&movd ("mm4","edx"); # t[2] collected
&movz ("eax",&HB("eax")); # 3
- &movz ("eax",&DWP(-128,$tbl,"eax",1));# 3
+ &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
&shl ("eax",24); # 3
&or ("ecx","eax"); # 3
&movd ("mm5","ecx"); # t[3] collected
&mov ($s2,&DWP(192-128,$tbl));
&mov ($s3,&DWP(224-128,$tbl));
- &align (4);
&set_label("loop",16);
&mmx_deccompact();
&add ($key,16);
if($i==3) { &mov ($key,&DWP(20,"esp")); }
else { &mov ($out,$s[0]); }
&and ($out,0xFF);
- &movz ($out,&DWP(0,$td,$out,1));
+ &movz ($out,&BP(0,$td,$out,1));
if ($i==3) { $tmp=$s[1]; }
&movz ($tmp,&HB($s[1]));
- &movz ($tmp,&DWP(0,$td,$tmp,1));
+ &movz ($tmp,&BP(0,$td,$tmp,1));
&shl ($tmp,8);
&xor ($out,$tmp);
else { mov ($tmp,$s[2]); }
&shr ($tmp,16);
&and ($tmp,0xFF);
- &movz ($tmp,&DWP(0,$td,$tmp,1));
+ &movz ($tmp,&BP(0,$td,$tmp,1));
&shl ($tmp,16);
&xor ($out,$tmp);
if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }
else { &mov ($tmp,$s[3]); }
&shr ($tmp,24);
- &movz ($tmp,&DWP(0,$td,$tmp,1));
+ &movz ($tmp,&BP(0,$td,$tmp,1));
&shl ($tmp,24);
&xor ($out,$tmp);
if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
&mov (&DWP(4,$acc),$s1);
&mov (&DWP(8,$acc),$s2);
&mov (&DWP(12,$acc),$s3);
- &jmp (&label("ret"));
+ &function_end_A();
&set_label("mmx",16);
&movq ("mm0",&QWP(0,$acc));
&movq (&QWP(0,$acc),"mm0"); # write output data
&movq (&QWP(8,$acc),"mm4");
&emms ();
-
-&set_label("ret",4);
&function_end("AES_decrypt");
# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
# enckey: emit one key-expansion step that folds the substituted bytes of
# edx, plus a round constant, into eax.
#
# Registers as used by the emitted code:
#   edx  in: source word; shifted right by 16 on the way through (clobbered)
#   eax  in/out: accumulator that every looked-up byte is XORed into
#   ecx  in: rcon index, scaled by 4 (32-bit table entries)
#   esi, ebx  scratch (clobbered)
#
# The new (+) form looks each byte up in the byte-wide table at -128($tbl)
# and shifts it into place, instead of loading a dword and masking:
#   byte 0 of edx -> bits 24..31,  byte 1 -> bits 0..7,
#   byte 2 -> bits 8..15,          byte 3 -> bits 16..23
sub enckey()
{
&movz ("esi",&LB("edx")); # rk[i]>>0
- &mov ("ebx",&DWP(2,$tbl,"esi",8));
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
&movz ("esi",&HB("edx")); # rk[i]>>8
- &and ("ebx",0xFF000000);
+ &shl ("ebx",24);
&xor ("eax","ebx");
- &mov ("ebx",&DWP(2,$tbl,"esi",8));
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
&shr ("edx",16);
- &and ("ebx",0x000000FF);
&movz ("esi",&LB("edx")); # rk[i]>>16
&xor ("eax","ebx");
- &mov ("ebx",&DWP(0,$tbl,"esi",8));
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
&movz ("esi",&HB("edx")); # rk[i]>>24
- &and ("ebx",0x0000FF00);
+ &shl ("ebx",8);
&xor ("eax","ebx");
- &mov ("ebx",&DWP(0,$tbl,"esi",8));
- &and ("ebx",0x00FF0000);
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
+ &shl ("ebx",16);
&xor ("eax","ebx");
# rcon entries are 32-bit words (index scaled by 4, same address as the
# removed DWP form once $tbl carries its 2048+128 bias), so the operand
# must be a dword: a byte-sized operand does not match the 32-bit eax.
- &xor ("eax",&DWP(2048+1024,$tbl,"ecx",4)); # rcon
+ &xor ("eax",&DWP(1024-128,$tbl,"ecx",4)); # rcon
}
# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
&set_label("pic_point");
&blindpop($tbl);
&lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
+ &lea ($tbl,&DWP(2048+128,$tbl));
+
+ # prefetch Te4
+ &mov ("eax",&DWP(0-128,$tbl));
+ &mov ("ebx",&DWP(32-128,$tbl));
+ &mov ("ecx",&DWP(64-128,$tbl));
+ &mov ("edx",&DWP(96-128,$tbl));
+ &mov ("eax",&DWP(128-128,$tbl));
+ &mov ("ebx",&DWP(160-128,$tbl));
+ &mov ("ecx",&DWP(192-128,$tbl));
+ &mov ("edx",&DWP(224-128,$tbl));
&mov ("ecx",&wparam(1)); # number of bits in key
&cmp ("ecx",128);
&mov ("edx","eax");
&mov ("eax",&DWP(16,"edi")); # rk[4]
&movz ("esi",&LB("edx")); # rk[11]>>0
- &mov ("ebx",&DWP(2,$tbl,"esi",8));
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
&movz ("esi",&HB("edx")); # rk[11]>>8
- &and ("ebx",0x000000FF);
&xor ("eax","ebx");
- &mov ("ebx",&DWP(0,$tbl,"esi",8));
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
&shr ("edx",16);
- &and ("ebx",0x0000FF00);
+ &shl ("ebx",8);
&movz ("esi",&LB("edx")); # rk[11]>>16
&xor ("eax","ebx");
- &mov ("ebx",&DWP(0,$tbl,"esi",8));
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
&movz ("esi",&HB("edx")); # rk[11]>>24
- &and ("ebx",0x00FF0000);
+ &shl ("ebx",16);
&xor ("eax","ebx");
- &mov ("ebx",&DWP(2,$tbl,"esi",8));
- &and ("ebx",0xFF000000);
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
+ &shl ("ebx",24);
&xor ("eax","ebx");
&mov (&DWP(48,"edi"),"eax"); # rk[12]
&function_end("AES_set_encrypt_key");
# deckey: emit code that transforms one 32-bit key-schedule word in place.
#
# New (+) calling convention:
#   $i              word index within the current 16-byte chunk (0..3)
#   $key            register holding the chunk's base address
#   $tp1            register holding the word at 4*$i($key) on entry; it
#                   receives the result, which is stored back to 4*$i($key)
#   $tp2,$tp4,$tp8  scratch registers for tp1 multiplied by 2, 4 and 8,
#                   each byte lane doubled with the 0x1b reduction constant
#
# The value stored is
#   (tp8^tp4^tp2) ^ ROTATE(tp8^tp2^tp1,24) ^ ROTATE(tp8^tp4^tp1,16)
#                 ^ ROTATE(tp8^tp1,8)
# i.e. byte multipliers 0x0e/0x0b/0x0d/0x09 -- presumably AES
# InvMixColumns; confirm against the caller's intent.
#
# $tbl is reused as a temporary, so the table pointer is not preserved.
# On exit $tp2 holds the word at 4*($i+1)($key), ready to serve as $tp1
# of the next invocation (the callers rotate the register arguments).
sub deckey()
-{ my ($i,$ptr,$te,$td) = @_;
+{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
+ my $tmp = $tbl;
- &mov ("eax",&DWP($i,$ptr));
- &mov ("edx","eax");
- &movz ("ebx",&HB("eax"));
- &shr ("edx",16);
- &and ("eax",0xFF);
- &movz ("eax",&BP(2,$te,"eax",8));
- &movz ("ebx",&BP(2,$te,"ebx",8));
- &mov ("eax",&DWP(0,$td,"eax",8));
- &xor ("eax",&DWP(3,$td,"ebx",8));
- &movz ("ebx",&HB("edx"));
- &and ("edx",0xFF);
- &movz ("edx",&BP(2,$te,"edx",8));
- &movz ("ebx",&BP(2,$te,"ebx",8));
- &xor ("eax",&DWP(2,$td,"edx",8));
- &xor ("eax",&DWP(1,$td,"ebx",8));
- &mov (&DWP($i,$ptr),"eax");
# tp2 = tp1 doubled in each byte lane: the top bit of every byte is
# isolated, (x - (x>>7)) expands it to a 0x7f lane mask, which selects
# 0x1b for the lanes that overflowed; the low 7 bits are doubled with add.
+ &mov ($acc,$tp1);
+ &and ($acc,0x80808080);
+ &mov ($tmp,$acc);
+ &mov ($tp2,$tp1);
+ &shr ($tmp,7);
+ &and ($tp2,0x7f7f7f7f);
+ &sub ($acc,$tmp);
+ &add ($tp2,$tp2);
+ &and ($acc,0x1b1b1b1b);
+ &xor ($acc,$tp2);
+ &mov ($tp2,$acc);
+
# tp4 = tp2 doubled the same way; $tp2 is turned into tp2^tp1 meanwhile.
+ &and ($acc,0x80808080);
+ &mov ($tmp,$acc);
+ &mov ($tp4,$tp2);
+ &xor ($tp2,$tp1); # tp2^tp1
+ &shr ($tmp,7);
+ &and ($tp4,0x7f7f7f7f);
+ &sub ($acc,$tmp);
+ &add ($tp4,$tp4);
+ &and ($acc,0x1b1b1b1b);
+ &xor ($acc,$tp4);
+ &mov ($tp4,$acc);
+
# tp8 = tp4 doubled; $tp4 becomes tp4^tp1 and $tp1 is pre-rotated by 8.
+ &and ($acc,0x80808080);
+ &mov ($tmp,$acc);
+ &mov ($tp8,$tp4);
+ &xor ($tp4,$tp1); # tp4^tp1
+ &shr ($tmp,7);
+ &and ($tp8,0x7f7f7f7f);
+ &sub ($acc,$tmp);
+ &add ($tp8,$tp8);
+ &and ($acc,0x1b1b1b1b);
+ &rotl ($tp1,8); # = ROTATE(tp1,8)
+ &xor ($tp8,$acc);
+
# Fetch the following word early; it is parked in $tmp until $tp2 is free.
+ &mov ($tmp,&DWP(4*($i+1),$key)); # modulo-scheduled load
+
# Combine the partial products into $tp1, interleaved with the rotates.
+ &xor ($tp1,$tp2);
+ &xor ($tp2,$tp8);
+ &xor ($tp1,$tp4);
+ &rotl ($tp2,24);
+ &xor ($tp4,$tp8);
+ &xor ($tp1,$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
+ &rotl ($tp4,16);
+ &xor ($tp1,$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
+ &rotl ($tp8,8);
+ &xor ($tp1,$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
+ &mov ($tp2,$tmp);
+ &xor ($tp1,$tp8); # ^= ROTATE(tp8,8)
+
+ &mov (&DWP(4*$i,$key),$tp1);
}
# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
&lea ("ecx",&DWP(0,"","ecx",4));
&lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk
- &align (4);
- &set_label("invert"); # invert order of chunks
+ &set_label("invert",4); # invert order of chunks
&mov ("eax",&DWP(0,"esi"));
&mov ("ebx",&DWP(4,"esi"));
&mov ("ecx",&DWP(0,"edi"));
&cmp ("esi","edi");
&jne (&label("invert"));
- &call (&label("pic_point"));
- &set_label("pic_point");
- blindpop($tbl);
- &lea ("edi",&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl));
- &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
+ &mov ($key,&wparam(2));
+ &mov ($acc,&DWP(240,$key)); # pull number of rounds
+ &lea ($acc,&DWP(-2,$acc,$acc));
+ &lea ($acc,&DWP(0,$key,$acc,8));
+ &mov (&wparam(2),$acc);
- &mov ("esi",&wparam(2));
- &mov ("ecx",&DWP(240,"esi")); # pull number of rounds
- &dec ("ecx");
- &align (4);
- &set_label("permute"); # permute the key schedule
- &add ("esi",16);
- &deckey (0,"esi",$tbl,"edi");
- &deckey (4,"esi",$tbl,"edi");
- &deckey (8,"esi",$tbl,"edi");
- &deckey (12,"esi",$tbl,"edi");
- &dec ("ecx");
- &jnz (&label("permute"));
+ &mov ($s0,&DWP(16,$key)); # modulo-scheduled load
+ &set_label("permute",4); # permute the key schedule
+ &add ($key,16);
+ &deckey (0,$key,$s0,$s1,$s2,$s3);
+ &deckey (1,$key,$s1,$s2,$s3,$s0);
+ &deckey (2,$key,$s2,$s3,$s0,$s1);
+ &deckey (3,$key,$s3,$s0,$s1,$s2);
+ &cmp ($key,&wparam(2));
+ &jb (&label("permute"));
&xor ("eax","eax"); # return success
&function_end("AES_set_decrypt_key");