From d90bf2ab2179f363d1180fe0a5a26422cc5c6794 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Mon, 12 Nov 2012 18:11:17 +0000 Subject: [PATCH] [vp]aes-x86[_64].pl: update from HEAD. --- crypto/aes/asm/aes-586.pl | 283 +++++++++++++++++---------------- crypto/aes/asm/aes-x86_64.pl | 250 ++++++++++++++--------------- crypto/aes/asm/vpaes-x86.pl | 91 ++++++----- crypto/aes/asm/vpaes-x86_64.pl | 85 +++++----- 4 files changed, 354 insertions(+), 355 deletions(-) diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl index 687ed811be..451d0e0ed1 100755 --- a/crypto/aes/asm/aes-586.pl +++ b/crypto/aes/asm/aes-586.pl @@ -39,7 +39,7 @@ # but exhibits up to 10% improvement on other cores. # # Second version is "monolithic" replacement for aes_core.c, which in -# addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key. +# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key. # This made it possible to implement little-endian variant of the # algorithm without modifying the base C code. Motivating factor for # the undertaken effort was that it appeared that in tight IA-32 @@ -103,11 +103,12 @@ # byte for 128-bit key. # # ECB encrypt ECB decrypt CBC large chunk -# P4 56[60] 84[100] 23 -# AMD K8 48[44] 70[79] 18 -# PIII 41[50] 61[91] 24 -# Core 2 32[38] 45[70] 18.5 -# Pentium 120 160 77 +# P4 52[54] 83[95] 23 +# AMD K8 46[41] 66[70] 18 +# PIII 41[50] 60[77] 24 +# Core 2 31[36] 45[64] 18.5 +# Atom 76[100] 96[138] 60 +# Pentium 115 150 77 # # Version 4.1 switches to compact S-box even in key schedule setup. # @@ -242,7 +243,7 @@ $vertical_spin=0; # shift "verticaly" defaults to 0, because of sub encvert() { my ($te,@s) = @_; - my $v0 = $acc, $v1 = $key; + my ($v0,$v1) = ($acc,$key); &mov ($v0,$s[3]); # copy s3 &mov (&DWP(4,"esp"),$s[2]); # save s2 @@ -299,7 +300,7 @@ sub encvert() # Another experimental routine, which features "horizontal spin," but # eliminates one reference to stack. Strangely enough runs slower... 
sub enchoriz() -{ my $v0 = $key, $v1 = $acc; +{ my ($v0,$v1) = ($key,$acc); &movz ($v0,&LB($s0)); # 3, 2, 1, 0* &rotr ($s2,8); # 8,11,10, 9 @@ -427,7 +428,7 @@ sub sse_encbody() ###################################################################### sub enccompact() -{ my $Fn = mov; +{ my $Fn = \&mov; while ($#_>5) { pop(@_); $Fn=sub{}; } my ($i,$te,@s)=@_; my $tmp = $key; @@ -476,24 +477,25 @@ sub enctransform() my $tmp = $tbl; my $r2 = $key ; - &mov ($acc,$s[$i]); - &and ($acc,0x80808080); - &mov ($tmp,$acc); - &shr ($tmp,7); + &and ($tmp,$s[$i]); &lea ($r2,&DWP(0,$s[$i],$s[$i])); - &sub ($acc,$tmp); + &mov ($acc,$tmp); + &shr ($tmp,7); &and ($r2,0xfefefefe); - &and ($acc,0x1b1b1b1b); + &sub ($acc,$tmp); &mov ($tmp,$s[$i]); + &and ($acc,0x1b1b1b1b); + &rotr ($tmp,16); &xor ($acc,$r2); # r2 + &mov ($r2,$s[$i]); &xor ($s[$i],$acc); # r0 ^ r2 + &rotr ($r2,16+8); + &xor ($acc,$tmp); &rotl ($s[$i],24); - &xor ($s[$i],$acc) # ROTATE(r2^r0,24) ^ r2 - &rotr ($tmp,16); - &xor ($s[$i],$tmp); - &rotr ($tmp,8); - &xor ($s[$i],$tmp); + &xor ($acc,$r2); + &mov ($tmp,0x80808080) if ($i!=1); + &xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2 } &function_begin_B("_x86_AES_encrypt_compact"); @@ -526,6 +528,7 @@ sub enctransform() &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1); &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1); &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1); + &mov ($tbl,0x80808080); &enctransform(2); &enctransform(3); &enctransform(0); @@ -607,82 +610,84 @@ sub sse_enccompact() &pshufw ("mm5","mm4",0x0d); # 15,14,11,10 &movd ("eax","mm1"); # 5, 4, 1, 0 &movd ("ebx","mm5"); # 15,14,11,10 + &mov ($__key,$key); &movz ($acc,&LB("eax")); # 0 - &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 - &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2 &movz ("edx",&HB("eax")); # 1 + &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2 + &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 + &movz ($key,&LB("ebx")); # 10 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1 - &shl ("edx",8); # 1 &shr ("eax",16); # 5, 4 + &shl ("edx",8); # 1 - &movz ($acc,&LB("ebx")); # 10 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 10 + &movz ($key,&HB("ebx")); # 11 &shl ($acc,16); # 10 - &or ("ecx",$acc); # 10 &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8 - &movz ($acc,&HB("ebx")); # 11 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11 + &or ("ecx",$acc); # 10 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 11 + &movz ($key,&HB("eax")); # 5 &shl ($acc,24); # 11 - &or ("edx",$acc); # 11 &shr ("ebx",16); # 15,14 + &or ("edx",$acc); # 11 - &movz ($acc,&HB("eax")); # 5 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 5 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 5 + &movz ($key,&HB("ebx")); # 15 &shl ($acc,8); # 5 &or ("ecx",$acc); # 5 - &movz ($acc,&HB("ebx")); # 15 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 15 + &movz ($key,&LB("eax")); # 4 &shl ($acc,24); # 15 &or ("ecx",$acc); # 15 - &movd ("mm0","ecx"); # t[0] collected - &movz ($acc,&LB("eax")); # 4 - &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 4 + &movz ($key,&LB("ebx")); # 14 &movd ("eax","mm2"); # 7, 6, 3, 2 - &movz ($acc,&LB("ebx")); # 14 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14 - &shl ($acc,16); # 14 + &movd ("mm0","ecx"); # t[0] collected + &movz ("ecx",&BP(-128,$tbl,$key,1)); # 14 + &movz ($key,&HB("eax")); # 3 + &shl ("ecx",16); # 14 + &movd ("ebx","mm6"); # 13,12, 9, 8 &or ("ecx",$acc); # 14 - &movd ("ebx","mm6"); # 13,12, 9, 8 - &movz ($acc,&HB("eax")); # 3 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 3 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 3 + &movz 
($key,&HB("ebx")); # 9 &shl ($acc,24); # 3 &or ("ecx",$acc); # 3 - &movz ($acc,&HB("ebx")); # 9 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 9 + &movz ($key,&LB("ebx")); # 8 &shl ($acc,8); # 9 + &shr ("ebx",16); # 13,12 &or ("ecx",$acc); # 9 - &movd ("mm1","ecx"); # t[1] collected - &movz ($acc,&LB("ebx")); # 8 - &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8 - &shr ("ebx",16); # 13,12 - &movz ($acc,&LB("eax")); # 2 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2 - &shl ($acc,16); # 2 - &or ("ecx",$acc); # 2 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 8 + &movz ($key,&LB("eax")); # 2 &shr ("eax",16); # 7, 6 + &movd ("mm1","ecx"); # t[1] collected + &movz ("ecx",&BP(-128,$tbl,$key,1)); # 2 + &movz ($key,&HB("eax")); # 7 + &shl ("ecx",16); # 2 + &and ("eax",0xff); # 6 + &or ("ecx",$acc); # 2 &punpckldq ("mm0","mm1"); # t[0,1] collected - &movz ($acc,&HB("eax")); # 7 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 7 + &movz ($key,&HB("ebx")); # 13 &shl ($acc,24); # 7 - &or ("ecx",$acc); # 7 - &and ("eax",0xff); # 6 + &and ("ebx",0xff); # 12 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 6 + &or ("ecx",$acc); # 7 &shl ("eax",16); # 6 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 13 &or ("edx","eax"); # 6 - &movz ($acc,&HB("ebx")); # 13 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13 &shl ($acc,8); # 13 - &or ("ecx",$acc); # 13 - &movd ("mm4","ecx"); # t[2] collected - &and ("ebx",0xff); # 12 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12 + &or ("ecx",$acc); # 13 &or ("edx","ebx"); # 12 + &mov ($key,$__key); + &movd ("mm4","ecx"); # t[2] collected &movd ("mm5","edx"); # t[3] collected &punpckldq ("mm4","mm5"); # t[2,3] collected @@ -1222,7 +1227,7 @@ sub enclast() ###################################################################### sub deccompact() -{ my $Fn = mov; +{ my $Fn = \&mov; while ($#_>5) { pop(@_); $Fn=sub{}; } my ($i,$td,@s)=@_; my $tmp = $key; @@ -1270,30 +1275,30 @@ sub dectransform() my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1); my $tp8 = $tbl; - &mov ($acc,$s[$i]); - &and ($acc,0x80808080); - &mov ($tmp,$acc); + &mov ($tmp,0x80808080); + &and ($tmp,$s[$i]); + &mov ($acc,$tmp); &shr ($tmp,7); &lea ($tp2,&DWP(0,$s[$i],$s[$i])); &sub ($acc,$tmp); &and ($tp2,0xfefefefe); &and ($acc,0x1b1b1b1b); - &xor ($acc,$tp2); - &mov ($tp2,$acc); + &xor ($tp2,$acc); + &mov ($tmp,0x80808080); - &and ($acc,0x80808080); - &mov ($tmp,$acc); + &and ($tmp,$tp2); + &mov ($acc,$tmp); &shr ($tmp,7); &lea ($tp4,&DWP(0,$tp2,$tp2)); &sub ($acc,$tmp); &and ($tp4,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($tp2,$s[$i]); # tp2^tp1 - &xor ($acc,$tp4); - &mov ($tp4,$acc); + &xor ($tp4,$acc); + &mov ($tmp,0x80808080); - &and ($acc,0x80808080); - &mov ($tmp,$acc); + &and ($tmp,$tp4); + &mov ($acc,$tmp); &shr ($tmp,7); &lea ($tp8,&DWP(0,$tp4,$tp4)); &sub ($acc,$tmp); @@ -1305,13 +1310,13 @@ sub dectransform() &xor ($s[$i],$tp2); &xor ($tp2,$tp8); - &rotl ($tp2,24); &xor ($s[$i],$tp4); &xor ($tp4,$tp8); - &rotl ($tp4,16); + &rotl ($tp2,24); &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1) - &rotl ($tp8,8); + &rotl ($tp4,16); &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24) + &rotl ($tp8,8); &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16) &mov ($s[0],$__s0) if($i==2); #prefetch $s0 &mov ($s[1],$__s1) if($i==3); #prefetch $s1 @@ -1389,85 +1394,87 @@ sub dectransform() sub sse_deccompact() { &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0 + &pshufw ("mm5","mm4",0x09); # 13,12,11,10 &movd ("eax","mm1"); # 7, 6, 1, 0 + &movd ("ebx","mm5"); # 13,12,11,10 + &mov ($__key,$key); - &pshufw 
("mm5","mm4",0x09); # 13,12,11,10 &movz ($acc,&LB("eax")); # 0 - &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 - &movd ("ebx","mm5"); # 13,12,11,10 &movz ("edx",&HB("eax")); # 1 + &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4 + &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 + &movz ($key,&LB("ebx")); # 10 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1 + &shr ("eax",16); # 7, 6 &shl ("edx",8); # 1 - &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4 - &movz ($acc,&LB("ebx")); # 10 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 10 + &movz ($key,&HB("ebx")); # 11 &shl ($acc,16); # 10 + &pshufw ("mm6","mm4",0x03); # 9, 8,15,14 &or ("ecx",$acc); # 10 - &shr ("eax",16); # 7, 6 - &movz ($acc,&HB("ebx")); # 11 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 11 + &movz ($key,&HB("eax")); # 7 &shl ($acc,24); # 11 - &or ("edx",$acc); # 11 &shr ("ebx",16); # 13,12 + &or ("edx",$acc); # 11 - &pshufw ("mm6","mm4",0x03); # 9, 8,15,14 - &movz ($acc,&HB("eax")); # 7 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 7 + &movz ($key,&HB("ebx")); # 13 &shl ($acc,24); # 7 &or ("ecx",$acc); # 7 - &movz ($acc,&HB("ebx")); # 13 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 13 + &movz ($key,&LB("eax")); # 6 &shl ($acc,8); # 13 + &movd ("eax","mm2"); # 3, 2, 5, 4 &or ("ecx",$acc); # 13 - &movd ("mm0","ecx"); # t[0] collected - &movz ($acc,&LB("eax")); # 6 - &movd ("eax","mm2"); # 3, 2, 5, 4 - &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6 - &shl ("ecx",16); # 6 - &movz ($acc,&LB("ebx")); # 12 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 6 + &movz ($key,&LB("ebx")); # 12 + &shl ($acc,16); # 6 &movd ("ebx","mm6"); # 9, 8,15,14 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12 + &movd ("mm0","ecx"); # t[0] collected + &movz ("ecx",&BP(-128,$tbl,$key,1)); # 12 + &movz ($key,&LB("eax")); # 4 &or ("ecx",$acc); # 12 - &movz ($acc,&LB("eax")); # 4 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 4 + &movz ($key,&LB("ebx")); # 14 &or ("edx",$acc); # 4 - &movz ($acc,&LB("ebx")); # 14 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 14 + &movz ($key,&HB("eax")); # 5 &shl ($acc,16); # 14 + &shr ("eax",16); # 3, 2 &or ("edx",$acc); # 14 - &movd ("mm1","edx"); # t[1] collected - &movz ($acc,&HB("eax")); # 5 - &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5 - &shl ("edx",8); # 5 - &movz ($acc,&HB("ebx")); # 15 - &shr ("eax",16); # 3, 2 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15 - &shl ($acc,24); # 15 - &or ("edx",$acc); # 15 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 5 + &movz ($key,&HB("ebx")); # 15 &shr ("ebx",16); # 9, 8 + &shl ($acc,8); # 5 + &movd ("mm1","edx"); # t[1] collected + &movz ("edx",&BP(-128,$tbl,$key,1)); # 15 + &movz ($key,&HB("ebx")); # 9 + &shl ("edx",24); # 15 + &and ("ebx",0xff); # 8 + &or ("edx",$acc); # 15 &punpckldq ("mm0","mm1"); # t[0,1] collected - &movz ($acc,&HB("ebx")); # 9 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 9 + &movz ($key,&LB("eax")); # 2 &shl ($acc,8); # 9 - &or ("ecx",$acc); # 9 - &and ("ebx",0xff); # 8 + &movz ("eax",&HB("eax")); # 3 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8 + &or ("ecx",$acc); # 9 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 2 &or ("edx","ebx"); # 8 - &movz ($acc,&LB("eax")); # 2 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2 &shl ($acc,16); # 2 - &or ("edx",$acc); # 2 - &movd ("mm4","edx"); # t[2] collected - &movz ("eax",&HB("eax")); # 3 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3 + &or 
("edx",$acc); # 2 &shl ("eax",24); # 3 &or ("ecx","eax"); # 3 + &mov ($key,$__key); + &movd ("mm4","edx"); # t[2] collected &movd ("mm5","ecx"); # t[3] collected &punpckldq ("mm4","mm5"); # t[2,3] collected @@ -2181,8 +2188,8 @@ my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds &mov ("ecx",240/4); &xor ("eax","eax"); &align (4); - &data_word(0xABF3F689); # rep stosd - &set_label("skip_ezero") + &data_word(0xABF3F689); # rep stosd + &set_label("skip_ezero"); &mov ("esp",$_esp); &popf (); &set_label("drop_out"); @@ -2301,8 +2308,8 @@ my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds &mov ("ecx",240/4); &xor ("eax","eax"); &align (4); - &data_word(0xABF3F689); # rep stosd - &set_label("skip_dzero") + &data_word(0xABF3F689); # rep stosd + &set_label("skip_dzero"); &mov ("esp",$_esp); &popf (); &function_end_A(); @@ -2865,32 +2872,32 @@ sub deckey() { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_; my $tmp = $tbl; - &mov ($acc,$tp1); - &and ($acc,0x80808080); - &mov ($tmp,$acc); - &shr ($tmp,7); + &mov ($tmp,0x80808080); + &and ($tmp,$tp1); &lea ($tp2,&DWP(0,$tp1,$tp1)); + &mov ($acc,$tmp); + &shr ($tmp,7); &sub ($acc,$tmp); &and ($tp2,0xfefefefe); &and ($acc,0x1b1b1b1b); - &xor ($acc,$tp2); - &mov ($tp2,$acc); + &xor ($tp2,$acc); + &mov ($tmp,0x80808080); - &and ($acc,0x80808080); - &mov ($tmp,$acc); - &shr ($tmp,7); + &and ($tmp,$tp2); &lea ($tp4,&DWP(0,$tp2,$tp2)); + &mov ($acc,$tmp); + &shr ($tmp,7); &sub ($acc,$tmp); &and ($tp4,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($tp2,$tp1); # tp2^tp1 - &xor ($acc,$tp4); - &mov ($tp4,$acc); + &xor ($tp4,$acc); + &mov ($tmp,0x80808080); - &and ($acc,0x80808080); - &mov ($tmp,$acc); - &shr ($tmp,7); + &and ($tmp,$tp4); &lea ($tp8,&DWP(0,$tp4,$tp4)); + &mov ($acc,$tmp); + &shr ($tmp,7); &xor ($tp4,$tp1); # tp4^tp1 &sub ($acc,$tmp); &and ($tp8,0xfefefefe); diff --git a/crypto/aes/asm/aes-x86_64.pl b/crypto/aes/asm/aes-x86_64.pl index 91459287dd..98066a594f 100755 --- a/crypto/aes/asm/aes-x86_64.pl +++ b/crypto/aes/asm/aes-x86_64.pl @@ -19,9 +19,10 @@ # Performance in number of cycles per processed byte for 128-bit key: # # ECB encrypt ECB decrypt CBC large chunk -# AMD64 33 41 13.0 -# EM64T 38 59 18.6(*) -# Core 2 30 43 14.5(*) +# AMD64 33 43 13.0 +# EM64T 38 56 18.6(*) +# Core 2 30 42 14.5(*) +# Atom 65 86 32.1(*) # # (*) with hyper-threading off @@ -365,68 +366,66 @@ $code.=<<___; movzb `&lo("$s0")`,$t0 movzb `&lo("$s1")`,$t1 movzb `&lo("$s2")`,$t2 - movzb ($sbox,$t0,1),$t0 - movzb ($sbox,$t1,1),$t1 - movzb ($sbox,$t2,1),$t2 - movzb `&lo("$s3")`,$t3 movzb `&hi("$s1")`,$acc0 movzb `&hi("$s2")`,$acc1 + shr \$16,$s2 + movzb `&hi("$s3")`,$acc2 + movzb ($sbox,$t0,1),$t0 + movzb ($sbox,$t1,1),$t1 + movzb ($sbox,$t2,1),$t2 movzb ($sbox,$t3,1),$t3 - movzb ($sbox,$acc0,1),$t4 #$t0 - movzb ($sbox,$acc1,1),$t5 #$t1 - movzb `&hi("$s3")`,$acc2 + movzb ($sbox,$acc0,1),$t4 #$t0 movzb `&hi("$s0")`,$acc0 - shr \$16,$s2 + movzb ($sbox,$acc1,1),$t5 #$t1 + movzb `&lo("$s2")`,$acc1 movzb ($sbox,$acc2,1),$acc2 #$t2 movzb ($sbox,$acc0,1),$acc0 #$t3 - shr \$16,$s3 - movzb `&lo("$s2")`,$acc1 shl \$8,$t4 + shr \$16,$s3 shl \$8,$t5 - movzb ($sbox,$acc1,1),$acc1 #$t0 xor $t4,$t0 - xor $t5,$t1 - - movzb `&lo("$s3")`,$t4 shr \$16,$s0 + movzb `&lo("$s3")`,$t4 shr \$16,$s1 - movzb `&lo("$s0")`,$t5 + xor $t5,$t1 shl \$8,$acc2 - shl \$8,$acc0 - movzb ($sbox,$t4,1),$t4 #$t1 - movzb ($sbox,$t5,1),$t5 #$t2 + movzb `&lo("$s0")`,$t5 + movzb ($sbox,$acc1,1),$acc1 #$t0 xor $acc2,$t2 - xor $acc0,$t3 + shl \$8,$acc0 movzb `&lo("$s1")`,$acc2 - movzb `&hi("$s3")`,$acc0 shl \$16,$acc1 - movzb 
($sbox,$acc2,1),$acc2 #$t3 - movzb ($sbox,$acc0,1),$acc0 #$t0 + xor $acc0,$t3 + movzb ($sbox,$t4,1),$t4 #$t1 + movzb `&hi("$s3")`,$acc0 + movzb ($sbox,$t5,1),$t5 #$t2 xor $acc1,$t0 - movzb `&hi("$s0")`,$acc1 shr \$8,$s2 + movzb `&hi("$s0")`,$acc1 + shl \$16,$t4 shr \$8,$s1 + shl \$16,$t5 + xor $t4,$t1 + movzb ($sbox,$acc2,1),$acc2 #$t3 + movzb ($sbox,$acc0,1),$acc0 #$t0 movzb ($sbox,$acc1,1),$acc1 #$t1 movzb ($sbox,$s2,1),$s3 #$t3 movzb ($sbox,$s1,1),$s2 #$t2 - shl \$16,$t4 - shl \$16,$t5 + shl \$16,$acc2 - xor $t4,$t1 xor $t5,$t2 - xor $acc2,$t3 - shl \$24,$acc0 + xor $acc2,$t3 shl \$24,$acc1 - shl \$24,$s3 xor $acc0,$t0 - shl \$24,$s2 + shl \$24,$s3 xor $acc1,$t1 + shl \$24,$s2 mov $t0,$s0 mov $t1,$s1 xor $t2,$s2 @@ -465,12 +464,12 @@ sub enctransform() { my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d"); $code.=<<___; - mov $s0,$acc0 - mov $s1,$acc1 - and \$0x80808080,$acc0 - and \$0x80808080,$acc1 - mov $acc0,$t0 - mov $acc1,$t1 + mov \$0x80808080,$t0 + mov \$0x80808080,$t1 + and $s0,$t0 + and $s1,$t1 + mov $t0,$acc0 + mov $t1,$acc1 shr \$7,$t0 lea ($s0,$s0),$r20 shr \$7,$t1 @@ -488,25 +487,25 @@ $code.=<<___; xor $r20,$s0 xor $r21,$s1 - mov $s2,$acc0 - mov $s3,$acc1 + mov \$0x80808080,$t2 rol \$24,$s0 + mov \$0x80808080,$t3 rol \$24,$s1 - and \$0x80808080,$acc0 - and \$0x80808080,$acc1 + and $s2,$t2 + and $s3,$t3 xor $r20,$s0 xor $r21,$s1 - mov $acc0,$t2 - mov $acc1,$t3 + mov $t2,$acc0 ror \$16,$t0 + mov $t3,$acc1 ror \$16,$t1 - shr \$7,$t2 lea ($s2,$s2),$r20 + shr \$7,$t2 xor $t0,$s0 - xor $t1,$s1 shr \$7,$t3 - lea ($s3,$s3),$r21 + xor $t1,$s1 ror \$8,$t0 + lea ($s3,$s3),$r21 ror \$8,$t1 sub $t2,$acc0 sub $t3,$acc1 @@ -522,23 +521,23 @@ $code.=<<___; xor $acc0,$r20 xor $acc1,$r21 + ror \$16,$t2 xor $r20,$s2 + ror \$16,$t3 xor $r21,$s3 rol \$24,$s2 + mov 0($sbox),$acc0 # prefetch Te4 rol \$24,$s3 xor $r20,$s2 - xor $r21,$s3 - mov 0($sbox),$acc0 # prefetch Te4 - ror \$16,$t2 - ror \$16,$t3 mov 64($sbox),$acc1 - xor $t2,$s2 - xor $t3,$s3 + xor $r21,$s3 mov 128($sbox),$r20 + xor $t2,$s2 ror \$8,$t2 + xor $t3,$s3 ror \$8,$t3 - mov 192($sbox),$r21 xor $t2,$s2 + mov 192($sbox),$r21 xor $t3,$s3 ___ } @@ -935,70 +934,69 @@ $code.=<<___; movzb `&lo("$s0")`,$t0 movzb `&lo("$s1")`,$t1 movzb `&lo("$s2")`,$t2 - movzb ($sbox,$t0,1),$t0 - movzb ($sbox,$t1,1),$t1 - movzb ($sbox,$t2,1),$t2 - movzb `&lo("$s3")`,$t3 movzb `&hi("$s3")`,$acc0 movzb `&hi("$s0")`,$acc1 + shr \$16,$s3 + movzb `&hi("$s1")`,$acc2 + movzb ($sbox,$t0,1),$t0 + movzb ($sbox,$t1,1),$t1 + movzb ($sbox,$t2,1),$t2 movzb ($sbox,$t3,1),$t3 - movzb ($sbox,$acc0,1),$t4 #$t0 - movzb ($sbox,$acc1,1),$t5 #$t1 - movzb `&hi("$s1")`,$acc2 + movzb ($sbox,$acc0,1),$t4 #$t0 movzb `&hi("$s2")`,$acc0 - shr \$16,$s2 + movzb ($sbox,$acc1,1),$t5 #$t1 movzb ($sbox,$acc2,1),$acc2 #$t2 movzb ($sbox,$acc0,1),$acc0 #$t3 - shr \$16,$s3 - movzb `&lo("$s2")`,$acc1 - shl \$8,$t4 + shr \$16,$s2 shl \$8,$t5 - movzb ($sbox,$acc1,1),$acc1 #$t0 - xor $t4,$t0 - xor $t5,$t1 - - movzb `&lo("$s3")`,$t4 + shl \$8,$t4 + movzb `&lo("$s2")`,$acc1 shr \$16,$s0 + xor $t4,$t0 shr \$16,$s1 - movzb `&lo("$s0")`,$t5 + movzb `&lo("$s3")`,$t4 + shl \$8,$acc2 + xor $t5,$t1 shl \$8,$acc0 - movzb ($sbox,$t4,1),$t4 #$t1 - movzb ($sbox,$t5,1),$t5 #$t2 + movzb `&lo("$s0")`,$t5 + movzb ($sbox,$acc1,1),$acc1 #$t0 xor $acc2,$t2 - xor $acc0,$t3 - movzb `&lo("$s1")`,$acc2 - movzb `&hi("$s1")`,$acc0 + shl \$16,$acc1 + xor $acc0,$t3 + movzb ($sbox,$t4,1),$t4 #$t1 + movzb `&hi("$s1")`,$acc0 movzb ($sbox,$acc2,1),$acc2 #$t3 - movzb ($sbox,$acc0,1),$acc0 #$t0 xor $acc1,$t0 - + movzb ($sbox,$t5,1),$t5 #$t2 
movzb `&hi("$s2")`,$acc1 + + shl \$16,$acc2 shl \$16,$t4 shl \$16,$t5 - movzb ($sbox,$acc1,1),$s1 #$t1 + xor $acc2,$t3 + movzb `&hi("$s3")`,$acc2 xor $t4,$t1 + shr \$8,$s0 xor $t5,$t2 - movzb `&hi("$s3")`,$acc1 - shr \$8,$s0 - shl \$16,$acc2 - movzb ($sbox,$acc1,1),$s2 #$t2 + movzb ($sbox,$acc0,1),$acc0 #$t0 + movzb ($sbox,$acc1,1),$s1 #$t1 + movzb ($sbox,$acc2,1),$s2 #$t2 movzb ($sbox,$s0,1),$s3 #$t3 - xor $acc2,$t3 + mov $t0,$s0 shl \$24,$acc0 shl \$24,$s1 shl \$24,$s2 - xor $acc0,$t0 + xor $acc0,$s0 shl \$24,$s3 xor $t1,$s1 - mov $t0,$s0 xor $t2,$s2 xor $t3,$s3 ___ @@ -1013,12 +1011,12 @@ sub dectransform() my $prefetch = shift; $code.=<<___; - mov $tp10,$acc0 - mov $tp18,$acc8 - and $mask80,$acc0 - and $mask80,$acc8 - mov $acc0,$tp40 - mov $acc8,$tp48 + mov $mask80,$tp40 + mov $mask80,$tp48 + and $tp10,$tp40 + and $tp18,$tp48 + mov $tp40,$acc0 + mov $tp48,$acc8 shr \$7,$tp40 lea ($tp10,$tp10),$tp20 shr \$7,$tp48 @@ -1029,15 +1027,15 @@ $code.=<<___; and $maskfe,$tp28 and $mask1b,$acc0 and $mask1b,$acc8 - xor $tp20,$acc0 - xor $tp28,$acc8 - mov $acc0,$tp20 - mov $acc8,$tp28 - - and $mask80,$acc0 - and $mask80,$acc8 - mov $acc0,$tp80 - mov $acc8,$tp88 + xor $acc0,$tp20 + xor $acc8,$tp28 + mov $mask80,$tp80 + mov $mask80,$tp88 + + and $tp20,$tp80 + and $tp28,$tp88 + mov $tp80,$acc0 + mov $tp88,$acc8 shr \$7,$tp80 lea ($tp20,$tp20),$tp40 shr \$7,$tp88 @@ -1048,15 +1046,15 @@ $code.=<<___; and $maskfe,$tp48 and $mask1b,$acc0 and $mask1b,$acc8 - xor $tp40,$acc0 - xor $tp48,$acc8 - mov $acc0,$tp40 - mov $acc8,$tp48 - - and $mask80,$acc0 - and $mask80,$acc8 - mov $acc0,$tp80 - mov $acc8,$tp88 + xor $acc0,$tp40 + xor $acc8,$tp48 + mov $mask80,$tp80 + mov $mask80,$tp88 + + and $tp40,$tp80 + and $tp48,$tp88 + mov $tp80,$acc0 + mov $tp88,$acc8 shr \$7,$tp80 xor $tp10,$tp20 # tp2^=tp1 shr \$7,$tp88 @@ -1081,51 +1079,51 @@ $code.=<<___; mov $tp10,$acc0 mov $tp18,$acc8 xor $tp80,$tp40 # tp4^tp1^=tp8 - xor $tp88,$tp48 # tp4^tp1^=tp8 shr \$32,$acc0 + xor $tp88,$tp48 # tp4^tp1^=tp8 shr \$32,$acc8 xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1 - xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1 rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8) + xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1 rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8) xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 + rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8) xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 - rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8) rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8) xor `&LO("$tp80")`,`&LO("$tp10")` - xor `&LO("$tp88")`,`&LO("$tp18")` shr \$32,$tp80 + xor `&LO("$tp88")`,`&LO("$tp18")` shr \$32,$tp88 xor `&LO("$tp80")`,`&LO("$acc0")` xor `&LO("$tp88")`,`&LO("$acc8")` mov $tp20,$tp80 - mov $tp28,$tp88 - shr \$32,$tp80 - shr \$32,$tp88 rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24) + mov $tp28,$tp88 rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24) - rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24) - rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24) + shr \$32,$tp80 xor `&LO("$tp20")`,`&LO("$tp10")` + shr \$32,$tp88 xor `&LO("$tp28")`,`&LO("$tp18")` + rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24) mov $tp40,$tp20 + rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24) mov $tp48,$tp28 + shr \$32,$tp20 xor `&LO("$tp80")`,`&LO("$acc0")` + shr \$32,$tp28 xor `&LO("$tp88")`,`&LO("$acc8")` `"mov 0($sbox),$mask80" if ($prefetch)` - shr \$32,$tp20 - shr \$32,$tp28 - `"mov 64($sbox),$maskfe" if ($prefetch)` rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16) + `"mov 64($sbox),$maskfe" if ($prefetch)` rol \$16,`&LO("$tp48")` # 
ROTATE(tp4^tp1^tp8,16) `"mov 128($sbox),$mask1b" if ($prefetch)` rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16) - rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16) `"mov 192($sbox),$tp80" if ($prefetch)` xor `&LO("$tp40")`,`&LO("$tp10")` + rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16) xor `&LO("$tp48")`,`&LO("$tp18")` `"mov 256($sbox),$tp88" if ($prefetch)` xor `&LO("$tp20")`,`&LO("$acc0")` @@ -1301,10 +1299,6 @@ private_AES_set_encrypt_key: call _x86_64_AES_set_encrypt_key - mov 8(%rsp),%r15 - mov 16(%rsp),%r14 - mov 24(%rsp),%r13 - mov 32(%rsp),%r12 mov 40(%rsp),%rbp mov 48(%rsp),%rbx add \$56,%rsp diff --git a/crypto/aes/asm/vpaes-x86.pl b/crypto/aes/asm/vpaes-x86.pl index 1533e2c304..433912ff57 100644 --- a/crypto/aes/asm/vpaes-x86.pl +++ b/crypto/aes/asm/vpaes-x86.pl @@ -27,9 +27,9 @@ # # aes-586.pl vpaes-x86.pl # -# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***) -# Nehalem 27.9/40.4/18.1 10.3/12.0 -# Atom 102./119./60.1 64.5/85.3(***) +# Core 2(**) 28.1/41.4/18.3 21.9/25.2(***) +# Nehalem 27.9/40.4/18.1 10.2/11.9 +# Atom 70.7/92.1/60.1 61.1/81.0(***) # # (*) "Hyper-threading" in the context refers rather to cache shared # among multiple cores, than to specifically Intel HTT. As vast @@ -40,8 +40,8 @@ # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. # # (***) Less impressive improvement on Core 2 and Atom is due to slow -# pshufb, yet it's respectable +32%/65% improvement on Core 2 -# and +58%/40% on Atom (as implied, over "hyper-threading-safe" +# pshufb, yet it's respectable +28%/64% improvement on Core 2 +# and +15% on Atom (as implied, over "hyper-threading-safe" # code path). # # @@ -183,35 +183,35 @@ $k_dsbo=0x2c0; # decryption sbox final output &movdqa ("xmm1","xmm6") &movdqa ("xmm2",&QWP($k_ipt,$const)); &pandn ("xmm1","xmm0"); - &movdqu ("xmm5",&QWP(0,$key)); - &psrld ("xmm1",4); &pand ("xmm0","xmm6"); + &movdqu ("xmm5",&QWP(0,$key)); &pshufb ("xmm2","xmm0"); &movdqa ("xmm0",&QWP($k_ipt+16,$const)); - &pshufb ("xmm0","xmm1"); &pxor ("xmm2","xmm5"); - &pxor ("xmm0","xmm2"); + &psrld ("xmm1",4); &add ($key,16); + &pshufb ("xmm0","xmm1"); &lea ($base,&DWP($k_mc_backward,$const)); + &pxor ("xmm0","xmm2"); &jmp (&label("enc_entry")); &set_label("enc_loop",16); # middle of middle round &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u - &pshufb ("xmm4","xmm2"); # 4 = sb1u - &pxor ("xmm4","xmm5"); # 4 = sb1u + k &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t + &pshufb ("xmm4","xmm2"); # 4 = sb1u &pshufb ("xmm0","xmm3"); # 0 = sb1t - &pxor ("xmm0","xmm4"); # 0 = A + &pxor ("xmm4","xmm5"); # 4 = sb1u + k &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u - &pshufb ("xmm5","xmm2"); # 4 = sb2u + &pxor ("xmm0","xmm4"); # 0 = A &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[] + &pshufb ("xmm5","xmm2"); # 4 = sb2u &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t - &pshufb ("xmm2","xmm3"); # 2 = sb2t - &pxor ("xmm2","xmm5"); # 2 = 2A &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[] + &pshufb ("xmm2","xmm3"); # 2 = sb2t &movdqa ("xmm3","xmm0"); # 3 = A + &pxor ("xmm2","xmm5"); # 2 = 2A &pshufb ("xmm0","xmm1"); # 0 = B &add ($key,16); # next key &pxor ("xmm0","xmm2"); # 0 = 2A+B @@ -220,30 +220,30 @@ $k_dsbo=0x2c0; # decryption sbox final output &pxor ("xmm3","xmm0"); # 3 = 2A+B+D &pshufb ("xmm0","xmm1"); # 0 = 2B+C &and ($magic,0x30); # ... 
mod 4 - &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D &sub ($round,1); # nr-- + &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D &set_label("enc_entry"); # top of round &movdqa ("xmm1","xmm6"); # 1 : i + &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k &pandn ("xmm1","xmm0"); # 1 = i<<4 &psrld ("xmm1",4); # 1 = i &pand ("xmm0","xmm6"); # 0 = k - &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k &pshufb ("xmm5","xmm0"); # 2 = a/k - &pxor ("xmm0","xmm1"); # 0 = j &movdqa ("xmm3","xmm7"); # 3 : 1/i + &pxor ("xmm0","xmm1"); # 0 = j &pshufb ("xmm3","xmm1"); # 3 = 1/i - &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k &movdqa ("xmm4","xmm7"); # 4 : 1/j + &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k &pshufb ("xmm4","xmm0"); # 4 = 1/j - &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k &movdqa ("xmm2","xmm7"); # 2 : 1/iak + &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k &pshufb ("xmm2","xmm3"); # 2 = 1/iak - &pxor ("xmm2","xmm0"); # 2 = io &movdqa ("xmm3","xmm7"); # 3 : 1/jak - &movdqu ("xmm5",&QWP(0,$key)); + &pxor ("xmm2","xmm0"); # 2 = io &pshufb ("xmm3","xmm4"); # 3 = 1/jak + &movdqu ("xmm5",&QWP(0,$key)); &pxor ("xmm3","xmm1"); # 3 = jo &jnz (&label("enc_loop")); @@ -265,8 +265,8 @@ $k_dsbo=0x2c0; # decryption sbox final output ## Same API as encryption core. ## &function_begin_B("_vpaes_decrypt_core"); - &mov ($round,&DWP(240,$key)); &lea ($base,&DWP($k_dsbd,$const)); + &mov ($round,&DWP(240,$key)); &movdqa ("xmm1","xmm6"); &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base)); &pandn ("xmm1","xmm0"); @@ -292,62 +292,61 @@ $k_dsbo=0x2c0; # decryption sbox final output ## Inverse mix columns ## &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u + &movdqa ("xmm1",&QWP(-0x10,$base)); # 0 : sb9t &pshufb ("xmm4","xmm2"); # 4 = sb9u + &pshufb ("xmm1","xmm3"); # 0 = sb9t &pxor ("xmm4","xmm0"); - &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t - &pshufb ("xmm0","xmm3"); # 0 = sb9t - &pxor ("xmm0","xmm4"); # 0 = ch &add ($key,16); # next round key + &pxor ("xmm1","xmm4"); # 0 = ch - &pshufb ("xmm0","xmm5"); # MC ch &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu + &pshufb ("xmm1","xmm5"); # MC ch &pshufb ("xmm4","xmm2"); # 4 = sbdu - &pxor ("xmm4","xmm0"); # 4 = ch &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt + &pxor ("xmm4","xmm1"); # 4 = ch &pshufb ("xmm0","xmm3"); # 0 = sbdt - &pxor ("xmm0","xmm4"); # 0 = ch &sub ($round,1); # nr-- + &pxor ("xmm0","xmm4"); # 0 = ch - &pshufb ("xmm0","xmm5"); # MC ch &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu + &pshufb ("xmm0","xmm5"); # MC ch + &movdqa ("xmm1",&QWP(0x30,$base)); # 0 : sbbt &pshufb ("xmm4","xmm2"); # 4 = sbbu + &pshufb ("xmm1","xmm3"); # 0 = sbbt &pxor ("xmm4","xmm0"); # 4 = ch - &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt - &pshufb ("xmm0","xmm3"); # 0 = sbbt - &pxor ("xmm0","xmm4"); # 0 = ch + &pxor ("xmm1","xmm4"); # 0 = ch - &pshufb ("xmm0","xmm5"); # MC ch &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu - &pshufb ("xmm4","xmm2"); # 4 = sbeu - &pxor ("xmm4","xmm0"); # 4 = ch + &pshufb ("xmm1","xmm5"); # MC ch &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet + &pshufb ("xmm4","xmm2"); # 4 = sbeu &pshufb ("xmm0","xmm3"); # 0 = sbet - &pxor ("xmm0","xmm4"); # 0 = ch - &palignr("xmm5","xmm5",12); + &pxor ("xmm4","xmm1"); # 4 = ch + &pxor ("xmm0","xmm4"); # 0 = ch &set_label("dec_entry"); # top of round &movdqa ("xmm1","xmm6"); # 1 : i &pandn ("xmm1","xmm0"); # 1 = i<<4 + &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k &psrld ("xmm1",4); # 1 = i &pand ("xmm0","xmm6"); # 0 = k - &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k &pshufb ("xmm2","xmm0"); # 2 = a/k - &pxor ("xmm0","xmm1"); # 0 = 
j &movdqa ("xmm3","xmm7"); # 3 : 1/i + &pxor ("xmm0","xmm1"); # 0 = j &pshufb ("xmm3","xmm1"); # 3 = 1/i - &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k &movdqa ("xmm4","xmm7"); # 4 : 1/j + &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k &pshufb ("xmm4","xmm0"); # 4 = 1/j &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k &movdqa ("xmm2","xmm7"); # 2 : 1/iak &pshufb ("xmm2","xmm3"); # 2 = 1/iak - &pxor ("xmm2","xmm0"); # 2 = io &movdqa ("xmm3","xmm7"); # 3 : 1/jak + &pxor ("xmm2","xmm0"); # 2 = io &pshufb ("xmm3","xmm4"); # 3 = 1/jak - &pxor ("xmm3","xmm1"); # 3 = jo &movdqu ("xmm0",&QWP(0,$key)); + &pxor ("xmm3","xmm1"); # 3 = jo &jnz (&label("dec_loop")); # middle of last round @@ -542,12 +541,12 @@ $k_dsbo=0x2c0; # decryption sbox final output ## %xmm0: b+c+d b+c b a ## &function_begin_B("_vpaes_schedule_192_smear"); - &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0 - &pxor ("xmm6","xmm0"); # -> c+d c 0 0 + &pshufd ("xmm1","xmm6",0x80); # d c 0 0 -> c 0 0 0 &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a + &pxor ("xmm6","xmm1"); # -> c+d c 0 0 + &pxor ("xmm1","xmm1"); &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a &movdqa ("xmm0","xmm6"); - &pxor ("xmm1","xmm1"); &movhlps("xmm6","xmm1"); # clobber low side with zeros &ret (); &function_end_B("_vpaes_schedule_192_smear"); diff --git a/crypto/aes/asm/vpaes-x86_64.pl b/crypto/aes/asm/vpaes-x86_64.pl index 8455c90834..bca97fdcdf 100644 --- a/crypto/aes/asm/vpaes-x86_64.pl +++ b/crypto/aes/asm/vpaes-x86_64.pl @@ -27,9 +27,9 @@ # # aes-x86_64.pl vpaes-x86_64.pl # -# Core 2(**) 30.5/43.7/14.3 21.8/25.7(***) -# Nehalem 30.5/42.2/14.6 9.8/11.8 -# Atom 63.9/79.0/32.1 64.0/84.8(***) +# Core 2(**) 29.6/41.1/14.3 21.9/25.2(***) +# Nehalem 29.6/40.3/14.6 10.0/11.8 +# Atom 57.3/74.2/32.1 60.9/82.3(***) # # (*) "Hyper-threading" in the context refers rather to cache shared # among multiple cores, than to specifically Intel HTT. As vast @@ -40,7 +40,7 @@ # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. # # (***) Less impressive improvement on Core 2 and Atom is due to slow -# pshufb, yet it's respectable +40%/78% improvement on Core 2 +# pshufb, yet it's respectable +36%/62% improvement on Core 2 # (as implied, over "hyper-threading-safe" code path). # # @@ -94,8 +94,8 @@ _vpaes_encrypt_core: movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi pshufb %xmm1, %xmm0 pxor %xmm5, %xmm2 - pxor %xmm2, %xmm0 add \$16, %r9 + pxor %xmm2, %xmm0 lea .Lk_mc_backward(%rip),%r10 jmp .Lenc_entry @@ -103,19 +103,19 @@ _vpaes_encrypt_core: .Lenc_loop: # middle of middle round movdqa %xmm13, %xmm4 # 4 : sb1u - pshufb %xmm2, %xmm4 # 4 = sb1u - pxor %xmm5, %xmm4 # 4 = sb1u + k movdqa %xmm12, %xmm0 # 0 : sb1t + pshufb %xmm2, %xmm4 # 4 = sb1u pshufb %xmm3, %xmm0 # 0 = sb1t - pxor %xmm4, %xmm0 # 0 = A + pxor %xmm5, %xmm4 # 4 = sb1u + k movdqa %xmm15, %xmm5 # 4 : sb2u - pshufb %xmm2, %xmm5 # 4 = sb2u + pxor %xmm4, %xmm0 # 0 = A movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + pshufb %xmm2, %xmm5 # 4 = sb2u + movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] movdqa %xmm14, %xmm2 # 2 : sb2t pshufb %xmm3, %xmm2 # 2 = sb2t - pxor %xmm5, %xmm2 # 2 = 2A - movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] movdqa %xmm0, %xmm3 # 3 = A + pxor %xmm5, %xmm2 # 2 = 2A pshufb %xmm1, %xmm0 # 0 = B add \$16, %r9 # next key pxor %xmm2, %xmm0 # 0 = 2A+B @@ -124,30 +124,30 @@ _vpaes_encrypt_core: pxor %xmm0, %xmm3 # 3 = 2A+B+D pshufb %xmm1, %xmm0 # 0 = 2B+C and \$0x30, %r11 # ... 
mod 4 - pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D sub \$1,%rax # nr-- + pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D .Lenc_entry: # top of round movdqa %xmm9, %xmm1 # 1 : i + movdqa %xmm11, %xmm5 # 2 : a/k pandn %xmm0, %xmm1 # 1 = i<<4 psrld \$4, %xmm1 # 1 = i pand %xmm9, %xmm0 # 0 = k - movdqa %xmm11, %xmm5 # 2 : a/k pshufb %xmm0, %xmm5 # 2 = a/k - pxor %xmm1, %xmm0 # 0 = j movdqa %xmm10, %xmm3 # 3 : 1/i + pxor %xmm1, %xmm0 # 0 = j pshufb %xmm1, %xmm3 # 3 = 1/i - pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k movdqa %xmm10, %xmm4 # 4 : 1/j + pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k pshufb %xmm0, %xmm4 # 4 = 1/j - pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k movdqa %xmm10, %xmm2 # 2 : 1/iak + pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k pshufb %xmm3, %xmm2 # 2 = 1/iak - pxor %xmm0, %xmm2 # 2 = io movdqa %xmm10, %xmm3 # 3 : 1/jak - movdqu (%r9), %xmm5 + pxor %xmm0, %xmm2 # 2 = io pshufb %xmm4, %xmm3 # 3 = 1/jak + movdqu (%r9), %xmm5 pxor %xmm1, %xmm3 # 3 = jo jnz .Lenc_loop @@ -200,62 +200,61 @@ _vpaes_decrypt_core: ## Inverse mix columns ## movdqa -0x20(%r10),%xmm4 # 4 : sb9u + movdqa -0x10(%r10),%xmm1 # 0 : sb9t pshufb %xmm2, %xmm4 # 4 = sb9u + pshufb %xmm3, %xmm1 # 0 = sb9t pxor %xmm0, %xmm4 - movdqa -0x10(%r10),%xmm0 # 0 : sb9t - pshufb %xmm3, %xmm0 # 0 = sb9t - pxor %xmm4, %xmm0 # 0 = ch add \$16, %r9 # next round key + pxor %xmm4, %xmm1 # 0 = ch - pshufb %xmm5, %xmm0 # MC ch movdqa 0x00(%r10),%xmm4 # 4 : sbdu + pshufb %xmm5, %xmm1 # MC ch pshufb %xmm2, %xmm4 # 4 = sbdu - pxor %xmm0, %xmm4 # 4 = ch movdqa 0x10(%r10),%xmm0 # 0 : sbdt + pxor %xmm1, %xmm4 # 4 = ch pshufb %xmm3, %xmm0 # 0 = sbdt - pxor %xmm4, %xmm0 # 0 = ch sub \$1,%rax # nr-- - - pshufb %xmm5, %xmm0 # MC ch + pxor %xmm4, %xmm0 # 0 = ch + movdqa 0x20(%r10),%xmm4 # 4 : sbbu + pshufb %xmm5, %xmm0 # MC ch + movdqa 0x30(%r10),%xmm1 # 0 : sbbt pshufb %xmm2, %xmm4 # 4 = sbbu + pshufb %xmm3, %xmm1 # 0 = sbbt pxor %xmm0, %xmm4 # 4 = ch - movdqa 0x30(%r10),%xmm0 # 0 : sbbt - pshufb %xmm3, %xmm0 # 0 = sbbt - pxor %xmm4, %xmm0 # 0 = ch - - pshufb %xmm5, %xmm0 # MC ch + pxor %xmm4, %xmm1 # 0 = ch + movdqa 0x40(%r10),%xmm4 # 4 : sbeu - pshufb %xmm2, %xmm4 # 4 = sbeu - pxor %xmm0, %xmm4 # 4 = ch + pshufb %xmm5, %xmm1 # MC ch movdqa 0x50(%r10),%xmm0 # 0 : sbet + pshufb %xmm2, %xmm4 # 4 = sbeu pshufb %xmm3, %xmm0 # 0 = sbet + palignr \$12, %xmm5, %xmm5 + pxor %xmm1, %xmm4 # 4 = ch pxor %xmm4, %xmm0 # 0 = ch - palignr \$12, %xmm5, %xmm5 - .Ldec_entry: # top of round movdqa %xmm9, %xmm1 # 1 : i pandn %xmm0, %xmm1 # 1 = i<<4 + movdqa %xmm11, %xmm2 # 2 : a/k psrld \$4, %xmm1 # 1 = i pand %xmm9, %xmm0 # 0 = k - movdqa %xmm11, %xmm2 # 2 : a/k pshufb %xmm0, %xmm2 # 2 = a/k - pxor %xmm1, %xmm0 # 0 = j movdqa %xmm10, %xmm3 # 3 : 1/i + pxor %xmm1, %xmm0 # 0 = j pshufb %xmm1, %xmm3 # 3 = 1/i - pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k movdqa %xmm10, %xmm4 # 4 : 1/j + pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k pshufb %xmm0, %xmm4 # 4 = 1/j pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k movdqa %xmm10, %xmm2 # 2 : 1/iak pshufb %xmm3, %xmm2 # 2 = 1/iak - pxor %xmm0, %xmm2 # 2 = io movdqa %xmm10, %xmm3 # 3 : 1/jak + pxor %xmm0, %xmm2 # 2 = io pshufb %xmm4, %xmm3 # 3 = 1/jak - pxor %xmm1, %xmm3 # 3 = jo movdqu (%r9), %xmm0 + pxor %xmm1, %xmm3 # 3 = jo jnz .Ldec_loop # middle of last round @@ -463,12 +462,12 @@ _vpaes_schedule_core: .type _vpaes_schedule_192_smear,\@abi-omnipotent .align 16 _vpaes_schedule_192_smear: - pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0 - pxor %xmm0, %xmm6 # -> c+d c 0 0 + pshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + 
pxor %xmm1, %xmm6 # -> c+d c 0 0 + pxor %xmm1, %xmm1 pxor %xmm0, %xmm6 # -> b+c+d b+c b a movdqa %xmm6, %xmm0 - pxor %xmm1, %xmm1 movhlps %xmm1, %xmm6 # clobber low side with zeros ret .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear -- 2.25.1
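The reworked enctransform()/dectransform()/deckey() hunks above only reschedule instructions and hoist the 0x80808080 mask into a register ahead of time — presumably to suit in-order cores such as Atom, which the updated benchmark tables single out — while the underlying arithmetic stays the same. For reference, a minimal Perl sketch (not part of the patch) of that arithmetic: the branch-free GF(2^8) doubling ("xtime" on four packed state bytes) behind the 0x80808080/0xfefefefe/0x1b1b1b1b constants.

```perl
use strict;
use warnings;

# Double four AES state bytes packed into one 32-bit word in GF(2^8):
# the high bit of each byte marks an overflow, which is reduced by 0x1b
# (x^8 = x^4 + x^3 + x + 1) after the byte-wise left shift.
sub xtime4 {
    my ($w) = @_;
    my $hi = $w & 0x80808080;                    # carry bit of every byte
    my $rd = ($hi - ($hi >> 7)) & 0x1b1b1b1b;    # 0x1b where a byte overflowed
    return (($w << 1) & 0xfefefefe) ^ $rd;       # shift each byte, then reduce
}

printf "%08x\n", xtime4(0x80808080);   # 1b1b1b1b
printf "%08x\n", xtime4(0x57575757);   # aeaeaeae
```

The mask-subtract-and trick turns the per-byte carry bit into a full 0x1b reduction constant without any branch or per-byte loop, which is exactly what the interleaved `and`/`lea`/`shr`/`sub` sequences in the hunks compute.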
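Separately, the `-{ my $Fn = mov;` → `+{ my $Fn = \&mov;` change in enccompact()/deccompact() makes the helper hold a genuine code reference. A small self-contained sketch of that pattern (the sub and variable names here are mine, not from the perlasm modules): an extra trailing argument swaps the reference for a no-op so the final move is suppressed, mirroring the existing `while ($#_>5) { pop(@_); $Fn=sub{}; }` line.

```perl
use strict;
use warnings;

sub mov { print "mov\t$_[0],$_[1]\n"; }        # stand-in for the perlasm emitter

sub compact_step {
    my $Fn = \&mov;                            # real code ref, safe under strict
    while ($#_ > 2) { pop(@_); $Fn = sub {}; } # extra arg => skip the trailing mov
    my ($i, $dst, $src) = @_;
    print "# round step $i\n";
    $Fn->($dst, $src);                         # either emits the mov or does nothing
}

compact_step(0, "eax", "ebx");      # emits "mov eax,ebx"
compact_step(1, "ecx", "edx", 1);   # trailing flag suppresses it
```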