# forms are granted according to the OpenSSL license.
# ====================================================================
#
-# Version 4.2.
+# Version 4.3.
#
# You might fail to appreciate this module performance from the first
# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
# cycles window. Why just SSE? Because it's needed on hyper-threading
# CPU! Which is also why it's prefetched with 64 byte stride. Best
# part is that it has no negative effect on performance:-)
+#
+# Version 4.3 implements switch between compact and non-compact block
+# functions in AES_cbc_encrypt depending on how much data was asked
+# to process in one stroke.
+#
+# Timing attacks are classified in two classes: synchronous when
+# attacker consciously initiates cryptographic operation and collect
+# timing data of various character afterwards, and asynchronous when
+# malicious code is executed on same CPU simultaneously with AES,
+# instruments itself and performs statistical analysis of this data.
+#
+# As far as synchronous attacks go the root to the AES timing
+# vulnerability is twofold. Firstly, of 256 S-box elements at most 160
+# are referred to in single 128-bit block operation. Well, in C
+# implementation with 4 distinct tables it's actually as little as 40
+# references per 256 elements table, but anyway... Secondly, even
+# though S-box elements are clustered into smaller amount of cache-
+# lines, smaller than 160 and even 40, it turned out that for certain
+# plain-text pattern[s] or simply put chosen plain-text and given key
+# few cache-lines remain unaccessed during block operation. Now, if
+# attacker can figure out this access pattern, he can deduct the key
+# [or at least part of it]. The natural way to mitigate this kind of
+# attacks is to minimize the amount of cache-lines in S-box and/or
+# prefetch them to ensure that every one is accessed for more uniform
+# timing. But note that *if* plain-text was concealed in such way that
+# input to block function is distributed *uniformly*, then attack
+# wouldn't apply. Now note that some encryption modes, most notably
+# CBC, do masks the plain-text in this exact way [secure cipher output
+# is distributed uniformly]. Yes, one still might find input that
+# would reveal the information about given key, but if amount of
+# candidate inputs to be tried is larger than amount possible key
+# combinations then attack becomes infeasible. This is why revised
+# AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk
+# of data is to be processed in one stroke. The current size limit of
+# 512 bytes is chosen to provide same [diminishigly low] probability
+# for cache-line to remain untouched in large chunk operation with
+# large S-box as for single block operation with compact S-box and
+# surely needs more careful consideration...
+#
+# As for asynchronous attacks. There are two flavours: attacker code
+# being interleaved with AES on hyper-threading CPU at *instruction*
+# level, and two processes time sharing single core. As for latter.
+# Two vectors. 1. Given that attacker process has higher priority,
+# yield execution to process performing AES just before timer fires
+# off the scheduler, immediately regain control of CPU and analyze the
+# cache state. For this attack to be efficient attacker would have to
+# effectively slow down the operation by several *orders* of magnitute,
+# by ratio of time slice to duration of handful of AES rounds, which
+# unlikely to remain unnoticed. Not to mention that this also means
+# that he would spend correspondigly more time to collect enough
+# statistical data to mount the attack. It's probably appropriate to
+# say that if adeversary reckons that this attack is beneficial and
+# risks to be noticed, you probably have larger problems having him
+# mere opportunity. In other words suggested code design expects you
+# to preclude/mitigate this attack by overall system security design.
+# 2. Attacker manages to make his code interrupt driven. In order for
+# this kind of attack to be feasible, interrupt rate has to be high
+# enough, again comparable to duration of handful of AES rounds. But
+# is there interrupt source of such rate? Hardly, not even 1Gbps NIC
+# generates interrupts at such raging rate...
+#
+# And now back to the former, hyper-threading CPU or more specifically
+# Intel P4. Recall that asynchronous attack implies that malicious
+# code instruments itself. And naturally instrumentation granularity
+# has be noticeably lower than duration of codepath accessing S-box.
+# Given that all cache-lines are accessed during that time that is.
+# Current implementation accesses *all* cache-lines within ~50 cycles
+# window, which is actually *less* than RDTSC latency on Intel P4!
push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";
-&asm_init($ARGV[0],"aes-586.pl",$ARGV[$#ARGV] eq "386");
+&asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
$s0="eax";
$s1="ebx";
$acc="esi";
$tbl="ebp";
+# stack frame layout in _[x86|sse]_AES_* routines, frame is allocated
+# by caller
+$__ra=&DWP(0,"esp"); # return address
+$__s0=&DWP(4,"esp"); # s0 backing store
+$__s1=&DWP(8,"esp"); # s1 backing store
+$__s2=&DWP(12,"esp"); # s2 backing store
+$__s3=&DWP(16,"esp"); # s3 backing store
+$__key=&DWP(20,"esp"); # pointer to key schedule
+$__end=&DWP(24,"esp"); # pointer to end of key schedule
+$__tbl=&DWP(28,"esp"); # %ebp backing store
+
+# stack frame layout in AES_[en|crypt] routines, which differs from
+# above by 4 and overlaps by %ebp backing store
+$_tbl=&DWP(24,"esp");
+$_esp=&DWP(28,"esp");
+
sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
-$compromise=0; # $compromise=128 abstains from copying key
- # schedule to stack when encrypting inputs
- # shorter than 128 bytes at the cost of
- # risksing aliasing with S-boxes. In return
- # you get way better, up to +70%, small block
- # performance.
+$speed_limit=512; # chunks smaller than $speed_limit are
+ # processed with compact routine in CBC mode
$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
# recent ยต-archs], but ~5 times smaller!
# I favor compact code to minimize cache
&movz ($v0,&HB($v1));
&and ($v1,0xFF);
&xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16
- &mov ($key,&DWP(20,"esp")); # reincarnate v1 as key
+ &mov ($key,$__key); # reincarnate v1 as key
&xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24
}
&xor ($v1,&DWP(2,$te,$v0,8)); # 10
&movz ($v0,&HB($s3)); # 13,12,15*,14
&xor ($v1,&DWP(1,$te,$v0,8)); # 15, t[0] collected
- &mov (&DWP(4,"esp"),$v1); # t[0] saved
+ &mov ($__s0,$v1); # t[0] saved
&movz ($v0,&LB($s1)); # 7, 6, 5, 4*
&shr ($s1,16); # -, -, 7, 6
&shr ($s0,24); # 1*
&mov ($s2,&DWP(1,$te,$v0,8)); # 11
&xor ($s2,&DWP(3,$te,$s0,8)); # 1
- &mov ($s0,&DWP(4,"esp")); # s[0]=t[0]
+ &mov ($s0,$__s0); # s[0]=t[0]
&movz ($v0,&LB($s3)); # 13,12, 7, 6*
&shr ($s3,16); # , ,13,12
&xor ($s2,&DWP(2,$te,$v0,8)); # 6
- &mov ($key,&DWP(20,"esp")); # reincarnate v0 as key
+ &mov ($key,$__key); # reincarnate v0 as key
&and ($s3,0xff); # , ,13,12*
&mov ($s3,&DWP(0,$te,$s3,8)); # 12
&xor ($s3,$s2); # s[2]=t[3] collected
# $Fn is used in first compact round and its purpose is to
# void restoration of some values from stack, so that after
# 4xenccompact with extra argument $key value is left there...
- if ($i==3) { &$Fn ($key,&DWP(20,"esp")); }##%edx
+ if ($i==3) { &$Fn ($key,$__key); }##%edx
else { &mov ($out,$s[0]); }
&and ($out,0xFF);
if ($i==1) { &shr ($s[0],16); }#%ebx[1]
&shl ($tmp,8);
&xor ($out,$tmp);
- if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
else { &mov ($tmp,$s[2]);
&shr ($tmp,16); }
if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
&shl ($tmp,16);
&xor ($out,$tmp);
- if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
else { &mov ($tmp,$s[3]);
&shr ($tmp,24); }
&public_label("AES_Te");
&function_begin_B("_x86_AES_encrypt_compact");
# note that caller is expected to allocate stack frame for me!
- &mov (&DWP(20,"esp"),$key); # save key
+ &mov ($__key,$key); # save key
&xor ($s0,&DWP(0,$key)); # xor with key
&xor ($s1,&DWP(4,$key));
&mov ($acc,&DWP(240,$key)); # load key->rounds
&lea ($acc,&DWP(-2,$acc,$acc));
&lea ($acc,&DWP(0,$key,$acc,8));
- &mov (&DWP(24,"esp"),$acc); # end of key schedule
+ &mov ($__end,$acc); # end of key schedule
# prefetch Te4
&mov ($key,&DWP(0-128,$tbl));
&enctransform(3);
&enctransform(0);
&enctransform(1);
- &mov ($key,&DWP(20,"esp"));
- &mov ($tbl,&DWP(28,"esp"));
+ &mov ($key,$__key);
+ &mov ($tbl,$__tbl);
&add ($key,16); # advance rd_key
&xor ($s0,&DWP(0,$key));
&xor ($s1,&DWP(4,$key));
&xor ($s2,&DWP(8,$key));
&xor ($s3,&DWP(12,$key));
- &cmp ($key,&DWP(24,"esp"));
- &mov (&DWP(20,"esp"),$key);
+ &cmp ($key,$__end);
+ &mov ($__key,$key);
&jb (&label("loop"));
&enccompact(0,$tbl,$s0,$s1,$s2,$s3);
&punpckldq ("mm4","mm5"); # t[2,3] collected
}
+ if (!$x86only) {
&public_label("AES_Te");
&function_begin_B("_sse_AES_encrypt_compact");
&pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
&mov ($acc,&DWP(240,$key)); # load key->rounds
&lea ($acc,&DWP(-2,$acc,$acc));
&lea ($acc,&DWP(0,$key,$acc,8));
- &mov (&DWP(24,"esp"),$acc); # end of key schedule
+ &mov ($__end,$acc); # end of key schedule
&mov ($s0,0x1b1b1b1b); # magic constant
&mov (&DWP(8,"esp"),$s0);
&set_label("loop",16);
&sse_enccompact();
&add ($key,16);
- &cmp ($key,&DWP(24,"esp"));
+ &cmp ($key,$__end);
&ja (&label("out"));
&movq ("mm2",&QWP(8,"esp"));
&ret ();
&function_end_B("_sse_AES_encrypt_compact");
+ }
######################################################################
# Vanilla block function.
my $out = $i==3?$s[0]:$acc;
# lines marked with #%e?x[i] denote "reordered" instructions...
- if ($i==3) { &mov ($key,&DWP(20,"esp")); }##%edx
+ if ($i==3) { &mov ($key,$__key); }##%edx
else { &mov ($out,$s[0]);
&and ($out,0xFF); }
if ($i==1) { &shr ($s[0],16); }#%ebx[1]
&movz ($tmp,&HB($s[1]));
&xor ($out,&DWP(3,$te,$tmp,8));
- if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
else { &mov ($tmp,$s[2]);
&shr ($tmp,16); }
if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
&and ($tmp,0xFF);
&xor ($out,&DWP(2,$te,$tmp,8));
- if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
else { &mov ($tmp,$s[3]);
&shr ($tmp,24) }
my $tmp = $key;
my $out = $i==3?$s[0]:$acc;
- if ($i==3) { &mov ($key,&DWP(20,"esp")); }##%edx
+ if ($i==3) { &mov ($key,$__key); }##%edx
else { &mov ($out,$s[0]); }
&and ($out,0xFF);
if ($i==1) { &shr ($s[0],16); }#%ebx[1]
&and ($tmp,0x0000ff00);
&xor ($out,$tmp);
- if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
else { &mov ($tmp,$s[2]);
&shr ($tmp,16); }
if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
&and ($tmp,0x00ff0000);
&xor ($out,$tmp);
- if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
else { &mov ($tmp,$s[3]);
&shr ($tmp,24); }
}
# note that caller is expected to allocate stack frame for me!
- &mov (&DWP(20,"esp"),$key); # save key
+ &mov ($__key,$key); # save key
&xor ($s0,&DWP(0,$key)); # xor with key
&xor ($s1,&DWP(4,$key));
if ($small_footprint) {
&lea ($acc,&DWP(-2,$acc,$acc));
&lea ($acc,&DWP(0,$key,$acc,8));
- &mov (&DWP(24,"esp"),$acc); # end of key schedule
+ &mov ($__end,$acc); # end of key schedule
&set_label("loop",16);
if ($vertical_spin) {
&xor ($s1,&DWP(4,$key));
&xor ($s2,&DWP(8,$key));
&xor ($s3,&DWP(12,$key));
- &cmp ($key,&DWP(24,"esp"));
- &mov (&DWP(20,"esp"),$key);
+ &cmp ($key,$__end);
+ &mov ($__key,$key);
&jb (&label("loop"));
}
else {
&xor ($s3,&DWP(16*$i+12,$key));
}
&add ($key,32);
- &mov (&DWP(20,"esp"),$key); # advance rd_key
+ &mov ($__key,$key); # advance rd_key
&set_label("12rounds",4);
for ($i=1;$i<3;$i++) {
if ($vertical_spin) {
&xor ($s3,&DWP(16*$i+12,$key));
}
&add ($key,32);
- &mov (&DWP(20,"esp"),$key); # advance rd_key
+ &mov ($__key,$key); # advance rd_key
&set_label("10rounds",4);
for ($i=1;$i<10;$i++) {
if ($vertical_spin) {
&and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
&sub ("esp",$s1);
&add ("esp",4); # 4 is reserved for caller's return address
- &mov (&DWP(28,"esp"),$s0); # save stack pointer
+ &mov ($_esp,$s0); # save stack pointer
&call (&label("pic_point")); # make it PIC!
&set_label("pic_point");
&blindpop($tbl);
- &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point"));
+ &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only);
&lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
+
# pick Te4 copy which can't "overlap" with stack frame or key schedule
&lea ($s1,&DWP(768-4,"esp"));
&sub ($s1,$tbl);
&and ($s1,0x300);
&lea ($tbl,&DWP(2048+128,$tbl,$s1));
- &bt (&DWP(0,$s0),25); # check for SSE bit
+ if (!$x86only) {
+ &bt (&DWP(0,$s0),25); # check for SSE bit
&jnc (&label("x86"));
&movq ("mm0",&QWP(0,$acc));
&movq ("mm4",&QWP(8,$acc));
&call ("_sse_AES_encrypt_compact");
- &mov ("esp",&DWP(28,"esp")); # restore stack pointer
+ &mov ("esp",$_esp); # restore stack pointer
&mov ($acc,&wparam(1)); # load out
&movq (&QWP(0,$acc),"mm0"); # write output data
&movq (&QWP(8,$acc),"mm4");
&emms ();
&function_end_A();
-
+ }
&set_label("x86",16);
- &mov (&DWP(24,"esp"),$tbl);
+ &mov ($_tbl,$tbl);
&mov ($s0,&DWP(0,$acc)); # load input data
&mov ($s1,&DWP(4,$acc));
&mov ($s2,&DWP(8,$acc));
&mov ($s3,&DWP(12,$acc));
&call ("_x86_AES_encrypt_compact");
- &mov ("esp",&DWP(28,"esp")); # restore stack pointer
+ &mov ("esp",$_esp); # restore stack pointer
&mov ($acc,&wparam(1)); # load out
&mov (&DWP(0,$acc),$s0); # write output data
&mov (&DWP(4,$acc),$s1);
# void restoration of some values from stack, so that after
# 4xdeccompact with extra argument $key, $s0 and $s1 values
# are left there...
- if($i==3) { &$Fn ($key,&DWP(20,"esp")); }
+ if($i==3) { &$Fn ($key,$__key); }
else { &mov ($out,$s[0]); }
&and ($out,0xFF);
&movz ($out,&BP(-128,$td,$out,1));
&shl ($tmp,16);
&xor ($out,$tmp);
- if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],&DWP(8,"esp")); }
+ if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],$__s1); }
else { &mov ($tmp,$s[3]); }
&shr ($tmp,24);
&movz ($tmp,&BP(-128,$td,$tmp,1));
&shl ($tmp,24);
&xor ($out,$tmp);
if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
- if ($i==3) { &$Fn ($s[3],&DWP(4,"esp")); }
+ if ($i==3) { &$Fn ($s[3],$__s0); }
}
# must be called with 2,3,0,1 as argument sequence!!!
&xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
&xor ($s[$i],$tp8); # ^= ROTATE(tp8,8)
- &mov ($s[0],&DWP(4,"esp")) if($i==2); #prefetch $s0
- &mov ($s[1],&DWP(8,"esp")) if($i==3); #prefetch $s1
- &mov ($s[2],&DWP(12,"esp")) if($i==1);
- &mov ($s[3],&DWP(16,"esp")) if($i==1);
+ &mov ($s[0],$__s0) if($i==2); #prefetch $s0
+ &mov ($s[1],$__s1) if($i==3); #prefetch $s1
+ &mov ($s[2],$__s2) if($i==1);
+ &mov ($s[3],$__s3) if($i==1);
&mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2);
}
&public_label("AES_Td");
&function_begin_B("_x86_AES_decrypt_compact");
# note that caller is expected to allocate stack frame for me!
- &mov (&DWP(20,"esp"),$key); # save key
+ &mov ($__key,$key); # save key
&xor ($s0,&DWP(0,$key)); # xor with key
&xor ($s1,&DWP(4,$key));
&lea ($acc,&DWP(-2,$acc,$acc));
&lea ($acc,&DWP(0,$key,$acc,8));
- &mov (&DWP(24,"esp"),$acc); # end of key schedule
+ &mov ($__end,$acc); # end of key schedule
# prefetch Td4
&mov ($key,&DWP(0-128,$tbl));
&dectransform(3);
&dectransform(0);
&dectransform(1);
- &mov ($key,&DWP(20,"esp"));
- &mov ($tbl,&DWP(28,"esp"));
+ &mov ($key,$__key);
+ &mov ($tbl,$__tbl);
&add ($key,16); # advance rd_key
&xor ($s0,&DWP(0,$key));
&xor ($s1,&DWP(4,$key));
&xor ($s2,&DWP(8,$key));
&xor ($s3,&DWP(12,$key));
- &cmp ($key,&DWP(24,"esp"));
- &mov (&DWP(20,"esp"),$key);
+ &cmp ($key,$__end);
+ &mov ($__key,$key);
&jb (&label("loop"));
&deccompact(0,$tbl,$s0,$s3,$s2,$s1);
&punpckldq ("mm4","mm5"); # t[2,3] collected
}
+ if (!$x86only) {
&public_label("AES_Td");
&function_begin_B("_sse_AES_decrypt_compact");
&pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
&mov ($acc,&DWP(240,$key)); # load key->rounds
&lea ($acc,&DWP(-2,$acc,$acc));
&lea ($acc,&DWP(0,$key,$acc,8));
- &mov (&DWP(24,"esp"),$acc); # end of key schedule
+ &mov ($__end,$acc); # end of key schedule
&mov ($s0,0x1b1b1b1b); # magic constant
&mov (&DWP(8,"esp"),$s0);
&set_label("loop",16);
&sse_deccompact();
&add ($key,16);
- &cmp ($key,&DWP(24,"esp"));
+ &cmp ($key,$__end);
&ja (&label("out"));
# ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N)
&ret ();
&function_end_B("_sse_AES_decrypt_compact");
+ }
######################################################################
# Vanilla block function.
# optimal... or rather that all attempts to reorder didn't
# result in better performance [which by the way is not a
# bit lower than ecryption].
- if($i==3) { &mov ($key,&DWP(20,"esp")); }
+ if($i==3) { &mov ($key,$__key); }
else { &mov ($out,$s[0]); }
&and ($out,0xFF);
&mov ($out,&DWP(0,$td,$out,8));
&and ($tmp,0xFF);
&xor ($out,&DWP(2,$td,$tmp,8));
- if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
else { &mov ($tmp,$s[3]); }
&shr ($tmp,24);
&xor ($out,&DWP(1,$td,$tmp,8));
if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
- if ($i==3) { &mov ($s[3],&DWP(4,"esp")); }
+ if ($i==3) { &mov ($s[3],$__s0); }
&comment();
}
&mov ($tmp,&DWP(192-128,$td));
&mov ($acc,&DWP(224-128,$td));
&lea ($td,&DWP(-128,$td)); }
- if($i==3) { &mov ($key,&DWP(20,"esp")); }
+ if($i==3) { &mov ($key,$__key); }
else { &mov ($out,$s[0]); }
&and ($out,0xFF);
&movz ($out,&BP(0,$td,$out,1));
&shl ($tmp,16);
&xor ($out,$tmp);
- if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
else { &mov ($tmp,$s[3]); }
&shr ($tmp,24);
&movz ($tmp,&BP(0,$td,$tmp,1));
&shl ($tmp,24);
&xor ($out,$tmp);
if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
- if ($i==3) { &mov ($s[3],&DWP(4,"esp"));
+ if ($i==3) { &mov ($s[3],$__s0);
&lea ($td,&DWP(-2048,$td)); }
}
&public_label("AES_Td");
&function_begin_B("_x86_AES_decrypt");
# note that caller is expected to allocate stack frame for me!
- &mov (&DWP(20,"esp"),$key); # save key
+ &mov ($__key,$key); # save key
&xor ($s0,&DWP(0,$key)); # xor with key
&xor ($s1,&DWP(4,$key));
if ($small_footprint) {
&lea ($acc,&DWP(-2,$acc,$acc));
&lea ($acc,&DWP(0,$key,$acc,8));
- &mov (&DWP(24,"esp"),$acc); # end of key schedule
+ &mov ($__end,$acc); # end of key schedule
&set_label("loop",16);
&decstep(0,$tbl,$s0,$s3,$s2,$s1);
&decstep(1,$tbl,$s1,$s0,$s3,$s2);
&xor ($s1,&DWP(4,$key));
&xor ($s2,&DWP(8,$key));
&xor ($s3,&DWP(12,$key));
- &cmp ($key,&DWP(24,"esp"));
- &mov (&DWP(20,"esp"),$key);
+ &cmp ($key,$__end);
+ &mov ($__key,$key);
&jb (&label("loop"));
}
else {
&xor ($s3,&DWP(16*$i+12,$key));
}
&add ($key,32);
- &mov (&DWP(20,"esp"),$key); # advance rd_key
+ &mov ($__key,$key); # advance rd_key
&set_label("12rounds",4);
for ($i=1;$i<3;$i++) {
&decstep(0,$tbl,$s0,$s3,$s2,$s1);
&xor ($s3,&DWP(16*$i+12,$key));
}
&add ($key,32);
- &mov (&DWP(20,"esp"),$key); # advance rd_key
+ &mov ($__key,$key); # advance rd_key
&set_label("10rounds",4);
for ($i=1;$i<10;$i++) {
&decstep(0,$tbl,$s0,$s3,$s2,$s1);
&and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
&sub ("esp",$s1);
&add ("esp",4); # 4 is reserved for caller's return address
- &mov (&DWP(28,"esp"),$s0); # save stack pointer
+ &mov ($_esp,$s0); # save stack pointer
&call (&label("pic_point")); # make it PIC!
&set_label("pic_point");
&blindpop($tbl);
- &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point"));
+ &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
&lea ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl));
+
# pick Td4 copy which can't "overlap" with stack frame or key schedule
&lea ($s1,&DWP(768-4,"esp"));
&sub ($s1,$tbl);
&and ($s1,0x300);
&lea ($tbl,&DWP(2048+128,$tbl,$s1));
- &bt (&DWP(0,$s0),25); # check for SSE bit
+ if (!$x86only) {
+ &bt (&DWP(0,$s0),25); # check for SSE bit
&jnc (&label("x86"));
&movq ("mm0",&QWP(0,$acc));
&movq ("mm4",&QWP(8,$acc));
&call ("_sse_AES_decrypt_compact");
- &mov ("esp",&DWP(28,"esp")); # restore stack pointer
+ &mov ("esp",$_esp); # restore stack pointer
&mov ($acc,&wparam(1)); # load out
&movq (&QWP(0,$acc),"mm0"); # write output data
&movq (&QWP(8,$acc),"mm4");
&emms ();
&function_end_A();
-
+ }
&set_label("x86",16);
- &mov (&DWP(24,"esp"),$tbl);
+ &mov ($_tbl,$tbl);
&mov ($s0,&DWP(0,$acc)); # load input data
&mov ($s1,&DWP(4,$acc));
&mov ($s2,&DWP(8,$acc));
&mov ($s3,&DWP(12,$acc));
&call ("_x86_AES_decrypt_compact");
- &mov ("esp",&DWP(28,"esp")); # restore stack pointer
+ &mov ("esp",$_esp); # restore stack pointer
&mov ($acc,&wparam(1)); # load out
&mov (&DWP(0,$acc),$s0); # write output data
&mov (&DWP(4,$acc),$s1);
# unsigned char *ivp,const int enc);
{
# stack frame layout
-# -4(%esp) 0(%esp) return address
-# 0(%esp) 4(%esp) s0 backup
-# 4(%esp) 8(%esp) s1 backup
-# 8(%esp) 12(%esp) s2 backup
-# 12(%esp) 16(%esp) s3 backup
-# 16(%esp) 20(%esp) key backup
-# 20(%esp) 24(%esp) end of key schedule
-# 24(%esp) 28(%esp) ebp backup
-my $_esp=&DWP(28,"esp"); #saved %esp
-my $_inp=&DWP(32,"esp"); #copy of wparam(0)
-my $_out=&DWP(36,"esp"); #copy of wparam(1)
-my $_len=&DWP(40,"esp"); #copy of wparam(2)
-my $_key=&DWP(44,"esp"); #copy of wparam(3)
-my $_ivp=&DWP(48,"esp"); #copy of wparam(4)
-my $_tmp=&DWP(52,"esp"); #volatile variable
-my $ivec=&DWP(56,"esp"); #ivec[16]
-my $aes_key=&DWP(72,"esp"); #copy of aes_key
-my $mark=&DWP(72+240,"esp"); #copy of aes_key->rounds
+# -4(%esp) # return address 0(%esp)
+# 0(%esp) # s0 backing store 4(%esp)
+# 4(%esp) # s1 backing store 8(%esp)
+# 8(%esp) # s2 backing store 12(%esp)
+# 12(%esp) # s3 backing store 16(%esp)
+# 16(%esp) # key backup 20(%esp)
+# 20(%esp) # end of key schedule 24(%esp)
+# 24(%esp) # %ebp backup 28(%esp)
+# 28(%esp) # %esp backup
+my $_inp=&DWP(32,"esp"); # copy of wparam(0)
+my $_out=&DWP(36,"esp"); # copy of wparam(1)
+my $_len=&DWP(40,"esp"); # copy of wparam(2)
+my $_key=&DWP(44,"esp"); # copy of wparam(3)
+my $_ivp=&DWP(48,"esp"); # copy of wparam(4)
+my $_tmp=&DWP(52,"esp"); # volatile variable
+#
+my $ivec=&DWP(60,"esp"); # ivec[16]
+my $aes_key=&DWP(76,"esp"); # copy of aes_key
+my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
&public_label("AES_Te");
&public_label("AES_Td");
&function_begin("AES_cbc_encrypt");
&mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
&cmp ($s2,0);
- &je (&label("enc_out"));
+ &je (&label("drop_out"));
&call (&label("pic_point")); # make it PIC!
&set_label("pic_point");
&blindpop($tbl);
-
- &pushf ();
- &cld ();
+ &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
&cmp (&wparam(5),0);
- &je (&label("DECRYPT"));
-
&lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
+ &jne (&label("picked_te"));
+ &lea ($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl));
+ &set_label("picked_te");
- # allocate aligned stack frame...
- &lea ($key,&DWP(-76-244,"esp"));
- &and ($key,-64);
+ # one can argue if this is required
+ &pushf ();
+ &cld ();
- # ... and make sure it doesn't alias with AES_Te modulo 4096
+ &cmp ($s2,$speed_limit);
+ &jb (&label("slow_way"));
+ &test ($s2,15);
+ &jnz (&label("slow_way"));
+ if (!$x86only) {
+ &bt (&DWP(0,$s0),28); # check for hyper-threading bit
+ &jc (&label("slow_way"));
+ }
+ # pre-allocate aligned stack frame...
+ &lea ($acc,&DWP(-80-244,"esp"));
+ &and ($acc,-64);
+
+ # ... and make sure it doesn't alias with $tbl modulo 4096
&mov ($s0,$tbl);
- &lea ($s1,&DWP(2048,$tbl));
- &mov ($s3,$key);
+ &lea ($s1,&DWP(2048+256,$tbl));
+ &mov ($s3,$acc);
&and ($s0,0xfff); # s = %ebp&0xfff
- &and ($s1,0xfff); # e = (%ebp+2048)&0xfff
+ &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff
&and ($s3,0xfff); # p = %esp&0xfff
&cmp ($s3,$s1); # if (p>=e) %esp =- (p-e);
- &jb (&label("te_break_out"));
+ &jb (&label("tbl_break_out"));
&sub ($s3,$s1);
- &sub ($key,$s3);
- &jmp (&label("te_ok"));
- &set_label("te_break_out"); # else %esp -= (p-s)&0xfff + framesz;
+ &sub ($acc,$s3);
+ &jmp (&label("tbl_ok"));
+ &set_label("tbl_break_out",4); # else %esp -= (p-s)&0xfff + framesz;
&sub ($s3,$s0);
&and ($s3,0xfff);
- &add ($s3,72+256);
- &sub ($key,$s3);
- &align (4);
- &set_label("te_ok");
-
- &mov ($s0,&wparam(0)); # load inp
- &mov ($s1,&wparam(1)); # load out
- &mov ($s3,&wparam(3)); # load key
- &mov ($acc,&wparam(4)); # load ivp
+ &add ($s3,384);
+ &sub ($acc,$s3);
+ &set_label("tbl_ok",4);
- &exch ("esp",$key);
+ &lea ($s3,&wparam(0)); # obtain pointer to parameter block
+ &exch ("esp",$acc); # allocate stack frame
&add ("esp",4); # reserve for return address!
- &mov ($_esp,$key); # save %esp
+ &mov ($_tbl,$tbl); # save %ebp
+ &mov ($_esp,$acc); # save %esp
+
+ &mov ($s0,&DWP(0,$s3)); # load inp
+ &mov ($s1,&DWP(4,$s3)); # load out
+ #&mov ($s2,&DWP(8,$s3)); # load len
+ &mov ($key,&DWP(12,$s3)); # load key
+ &mov ($acc,&DWP(16,$s3)); # load ivp
+ &mov ($s3,&DWP(20,$s3)); # load enc flag
&mov ($_inp,$s0); # save copy of inp
&mov ($_out,$s1); # save copy of out
&mov ($_len,$s2); # save copy of len
- &mov ($_key,$s3); # save copy of key
+ &mov ($_key,$key); # save copy of key
&mov ($_ivp,$acc); # save copy of ivp
&mov ($mark,0); # copy of aes_key->rounds = 0;
- if ($compromise) {
- &cmp ($s2,$compromise);
- &jb (&label("skip_ecopy"));
- }
# do we copy key schedule to stack?
- &mov ($s1 eq "ebx" ? $s1 : "",$s3);
+ &mov ($s1 eq "ebx" ? $s1 : "",$key);
&mov ($s2 eq "ecx" ? $s2 : "",244/4);
&sub ($s1,$tbl);
- &mov ("esi",$s3);
+ &mov ("esi",$key);
&and ($s1,0xfff);
&lea ("edi",$aes_key);
- &cmp ($s1,2048);
- &jb (&label("do_ecopy"));
+ &cmp ($s1,2048+256);
+ &jb (&label("do_copy"));
&cmp ($s1,4096-244);
- &jb (&label("skip_ecopy"));
- &align (4);
- &set_label("do_ecopy");
+ &jb (&label("skip_copy"));
+ &set_label("do_copy",4);
&mov ($_key,"edi");
&data_word(0xA5F3F689); # rep movsd
- &set_label("skip_ecopy");
+ &set_label("skip_copy");
- &mov ($acc,$s0);
&mov ($key,16);
- &align (4);
- &set_label("prefetch_te");
+ &set_label("prefetch_tbl",4);
&mov ($s0,&DWP(0,$tbl));
&mov ($s1,&DWP(32,$tbl));
&mov ($s2,&DWP(64,$tbl));
- &mov ($s3,&DWP(96,$tbl));
+ &mov ($acc,&DWP(96,$tbl));
&lea ($tbl,&DWP(128,$tbl));
- &dec ($key);
- &jnz (&label("prefetch_te"));
+ &sub ($key,1);
+ &jnz (&label("prefetch_tbl"));
&sub ($tbl,2048);
- &mov (&DWP(24,"esp"),$tbl);
- &mov ($s2,$_len);
+ &mov ($acc,$_inp);
&mov ($key,$_ivp);
- &test ($s2,0xFFFFFFF0);
- &jz (&label("enc_tail")); # short input...
+ &cmp ($s3,0);
+ &je (&label("fast_decrypt"));
+
+#----------------------------- ENCRYPT -----------------------------#
&mov ($s0,&DWP(0,$key)); # load iv
&mov ($s1,&DWP(4,$key));
- &align (4);
- &set_label("enc_loop");
+ &set_label("fast_enc_loop",16);
&mov ($s2,&DWP(8,$key));
&mov ($s3,&DWP(12,$key));
&mov (&DWP(8,$key),$s2);
&mov (&DWP(12,$key),$s3);
+ &lea ($acc,&DWP(16,$acc)); # advance inp
&mov ($s2,$_len); # load len
-
- &lea ($acc,&DWP(16,$acc));
&mov ($_inp,$acc); # save inp
-
- &lea ($s3,&DWP(16,$key));
+ &lea ($s3,&DWP(16,$key)); # advance out
&mov ($_out,$s3); # save out
-
- &sub ($s2,16);
- &test ($s2,0xFFFFFFF0);
+ &sub ($s2,16); # decrease len
&mov ($_len,$s2); # save len
- &jnz (&label("enc_loop"));
- &test ($s2,15);
- &jnz (&label("enc_tail"));
+ &jnz (&label("fast_enc_loop"));
&mov ($acc,$_ivp); # load ivp
- &mov ($s2,&DWP(8,$key)); # restore last dwords
+ &mov ($s2,&DWP(8,$key)); # restore last 2 dwords
&mov ($s3,&DWP(12,$key));
&mov (&DWP(0,$acc),$s0); # save ivec
&mov (&DWP(4,$acc),$s1);
&cmp ($mark,0); # was the key schedule copied?
&mov ("edi",$_key);
- &mov ("esp",$_esp);
&je (&label("skip_ezero"));
# zero copy of key schedule
&mov ("ecx",240/4);
&align (4);
&data_word(0xABF3F689); # rep stosd
&set_label("skip_ezero")
+ &mov ("esp",$_esp);
&popf ();
- &set_label("enc_out");
+ &set_label("drop_out");
&function_end_A();
&pushf (); # kludge, never executed
- &align (4);
- &set_label("enc_tail");
- &push ($key eq "edi" ? $key : ""); # push ivp
- &mov ($key,$_out); # load out
- &mov ($s1,16);
- &sub ($s1,$s2);
- &cmp ($key,$acc); # compare with inp
- &je (&label("enc_in_place"));
- &align (4);
- &data_word(0xA4F3F689); # rep movsb # copy input
- &jmp (&label("enc_skip_in_place"));
- &set_label("enc_in_place");
- &lea ($key,&DWP(0,$key,$s2));
- &set_label("enc_skip_in_place");
- &mov ($s2,$s1);
- &xor ($s0,$s0);
- &align (4);
- &data_word(0xAAF3F689); # rep stosb # zero tail
- &pop ($key); # pop ivp
-
- &mov ($acc,$_out); # output as input
- &mov ($s0,&DWP(0,$key));
- &mov ($s1,&DWP(4,$key));
- &mov ($_len,16); # len=16
- &jmp (&label("enc_loop")); # one more spin...
-
#----------------------------- DECRYPT -----------------------------#
-&align (4);
-&set_label("DECRYPT");
- &lea ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl));
-
- # allocate aligned stack frame...
- &lea ($key,&DWP(-64-244,"esp"));
- &and ($key,-64);
-
- # ... and make sure it doesn't alias with AES_Td modulo 4096
- &mov ($s0,$tbl);
- &lea ($s1,&DWP(2048+256,$tbl));
- &mov ($s3,$key);
- &and ($s0,0xfff); # s = %ebp&0xfff
- &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff
- &and ($s3,0xfff); # p = %esp&0xfff
-
- &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e);
- &jb (&label("td_break_out"));
- &sub ($s3,$s1);
- &sub ($key,$s3);
- &jmp (&label("td_ok"));
- &set_label("td_break_out"); # else %esp -= (p-s)&0xfff + framesz;
- &sub ($s3,$s0);
- &and ($s3,0xfff);
- &add ($s3,72+256);
- &sub ($key,$s3);
- &align (4);
- &set_label("td_ok");
-
- &mov ($s0,&wparam(0)); # load inp
- &mov ($s1,&wparam(1)); # load out
- &mov ($s3,&wparam(3)); # load key
- &mov ($acc,&wparam(4)); # load ivp
-
- &exch ("esp",$key);
- &add ("esp",4); # reserve for return address!
- &mov ($_esp,$key); # save %esp
-
- &mov ($_inp,$s0); # save copy of inp
- &mov ($_out,$s1); # save copy of out
- &mov ($_len,$s2); # save copy of len
- &mov ($_key,$s3); # save copy of key
- &mov ($_ivp,$acc); # save copy of ivp
-
- &mov ($mark,0); # copy of aes_key->rounds = 0;
- if ($compromise) {
- &cmp ($s2,$compromise);
- &jb (&label("skip_dcopy"));
- }
- # do we copy key schedule to stack?
- &mov ($s1 eq "ebx" ? $s1 : "",$s3);
- &mov ($s2 eq "ecx" ? $s2 : "",244/4);
- &sub ($s1,$tbl);
- &mov ("esi",$s3);
- &and ($s1,0xfff);
- &lea ("edi",$aes_key);
- &cmp ($s1,2048+256);
- &jb (&label("do_dcopy"));
- &cmp ($s1,4096-244);
- &jb (&label("skip_dcopy"));
- &align (4);
- &set_label("do_dcopy");
- &mov ($_key,"edi");
- &data_word(0xA5F3F689); # rep movsd
- &set_label("skip_dcopy");
-
- &mov ($acc,$s0);
- &mov ($key,18);
- &align (4);
- &set_label("prefetch_td");
- &mov ($s0,&DWP(0,$tbl));
- &mov ($s1,&DWP(32,$tbl));
- &mov ($s2,&DWP(64,$tbl));
- &mov ($s3,&DWP(96,$tbl));
- &lea ($tbl,&DWP(128,$tbl));
- &dec ($key);
- &jnz (&label("prefetch_td"));
- &sub ($tbl,2048+256);
- &mov (&DWP(24,"esp"),$tbl);
+&set_label("fast_decrypt",16);
&cmp ($acc,$_out);
- &je (&label("dec_in_place")); # in-place processing...
+ &je (&label("fast_dec_in_place")); # in-place processing...
- &mov ($key,$_ivp); # load ivp
&mov ($_tmp,$key);
&align (4);
- &set_label("dec_loop");
+ &set_label("fast_dec_loop",16);
&mov ($s0,&DWP(0,$acc)); # read input
&mov ($s1,&DWP(4,$acc));
&mov ($s2,&DWP(8,$acc));
&xor ($s2,&DWP(8,$key));
&xor ($s3,&DWP(12,$key));
- &sub ($acc,16);
- &jc (&label("dec_partial"));
- &mov ($_len,$acc); # save len
- &mov ($acc,$_inp); # load inp
&mov ($key,$_out); # load out
+ &mov ($acc,$_inp); # load inp
&mov (&DWP(0,$key),$s0); # write output
&mov (&DWP(4,$key),$s1);
&mov (&DWP(8,$key),$s2);
&mov (&DWP(12,$key),$s3);
+ &mov ($s2,$_len); # load len
&mov ($_tmp,$acc); # save ivp
- &lea ($acc,&DWP(16,$acc));
+ &lea ($acc,&DWP(16,$acc)); # advance inp
&mov ($_inp,$acc); # save inp
-
- &lea ($key,&DWP(16,$key));
+ &lea ($key,&DWP(16,$key)); # advance out
&mov ($_out,$key); # save out
-
- &jnz (&label("dec_loop"));
+ &sub ($s2,16); # decrease len
+ &mov ($_len,$s2); # save len
+ &jnz (&label("fast_dec_loop"));
&mov ($key,$_tmp); # load temp ivp
- &set_label("dec_end");
&mov ($acc,$_ivp); # load user ivp
&mov ($s0,&DWP(0,$key)); # load iv
&mov ($s1,&DWP(4,$key));
&mov (&DWP(4,$acc),$s1);
&mov (&DWP(8,$acc),$s2);
&mov (&DWP(12,$acc),$s3);
- &jmp (&label("dec_out"));
+ &jmp (&label("fast_dec_out"));
- &align (4);
- &set_label("dec_partial");
- &lea ($key,$ivec);
- &mov (&DWP(0,$key),$s0); # dump output to stack
- &mov (&DWP(4,$key),$s1);
- &mov (&DWP(8,$key),$s2);
- &mov (&DWP(12,$key),$s3);
- &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc));
- &mov ($acc eq "esi" ? $acc : "",$key);
- &mov ($key eq "edi" ? $key : "",$_out); # load out
- &data_word(0xA4F3F689); # rep movsb # copy output
- &mov ($key,$_inp); # use inp as temp ivp
- &jmp (&label("dec_end"));
-
- &align (4);
- &set_label("dec_in_place");
- &set_label("dec_in_place_loop");
- &lea ($key,$ivec);
+ &set_label("fast_dec_in_place",16);
+ &set_label("fast_dec_in_place_loop");
&mov ($s0,&DWP(0,$acc)); # read input
&mov ($s1,&DWP(4,$acc));
&mov ($s2,&DWP(8,$acc));
&mov ($s3,&DWP(12,$acc));
+ &lea ($key,$ivec);
&mov (&DWP(0,$key),$s0); # copy to temp
&mov (&DWP(4,$key),$s1);
&mov (&DWP(8,$key),$s2);
&mov (&DWP(8,$acc),$s2);
&mov (&DWP(12,$acc),$s3);
- &lea ($acc,&DWP(16,$acc));
+ &lea ($acc,&DWP(16,$acc)); # advance out
&mov ($_out,$acc); # save out
&lea ($acc,$ivec);
&mov (&DWP(12,$key),$s3);
&mov ($acc,$_inp); # load inp
+ &mov ($s2,$_len); # load len
+ &lea ($acc,&DWP(16,$acc)); # advance inp
+ &mov ($_inp,$acc); # save inp
+ &sub ($s2,16); # decrease len
+ &mov ($_len,$s2); # save len
+ &jnz (&label("fast_dec_in_place_loop"));
- &lea ($acc,&DWP(16,$acc));
+ &set_label("fast_dec_out",4);
+ &cmp ($mark,0); # was the key schedule copied?
+ &mov ("edi",$_key);
+ &je (&label("skip_dzero"));
+ # zero copy of key schedule
+ &mov ("ecx",240/4);
+ &xor ("eax","eax");
+ &align (4);
+ &data_word(0xABF3F689); # rep stosd
+ &set_label("skip_dzero")
+ &mov ("esp",$_esp);
+ &popf ();
+ &function_end_A();
+ &pushf (); # kludge, never executed
+
+#--------------------------- SLOW ROUTINE ---------------------------#
+&set_label("slow_way",16);
+
+ &mov ($s0,&DWP(0,$s0)) if (!$x86only);# load OPENSSL_ia32cap
+ &mov ($key,&wparam(3)); # load key
+
+ # pre-allocate aligned stack frame...
+ &lea ($acc,&DWP(-80,"esp"));
+ &and ($acc,-64);
+
+ # ... and make sure it doesn't alias with $key modulo 1024
+ &lea ($s1,&DWP(-80-63,$key));
+ &sub ($s1,$acc);
+ &neg ($s1);
+ &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
+ &sub ($acc,$s1);
+
+ # pick S-box copy which can't overlap with stack frame or $key
+ &lea ($s1,&DWP(768,$acc));
+ &sub ($s1,$tbl);
+ &and ($s1,0x300);
+ &lea ($tbl,&DWP(2048+128,$tbl,$s1));
+
+ &lea ($s3,&wparam(0)); # pointer to parameter block
+
+ &exch ("esp",$acc);
+ &add ("esp",4); # reserve for return address!
+ &mov ($_tbl,$tbl); # save %ebp
+ &mov ($_esp,$acc); # save %esp
+ &mov ($_tmp,$s0); # save OPENSSL_ia32cap
+
+ &mov ($s0,&DWP(0,$s3)); # load inp
+ &mov ($s1,&DWP(4,$s3)); # load out
+ #&mov ($s2,&DWP(8,$s3)); # load len
+ #&mov ($key,&DWP(12,$s3)); # load key
+ &mov ($acc,&DWP(16,$s3)); # load ivp
+ &mov ($s3,&DWP(20,$s3)); # load enc flag
+
+ &mov ($_inp,$s0); # save copy of inp
+ &mov ($_out,$s1); # save copy of out
+ &mov ($_len,$s2); # save copy of len
+ &mov ($_key,$key); # save copy of key
+ &mov ($_ivp,$acc); # save copy of ivp
+
+ &mov ($key,$acc);
+ &mov ($acc,$s0);
+
+ &cmp ($s3,0);
+ &je (&label("slow_decrypt"));
+
+#--------------------------- SLOW ENCRYPT ---------------------------#
+ &cmp ($s2,16);
+ &jb (&label("slow_enc_tail"));
+
+ if (!$x86only) {
+ &bt ($_tmp,25); # check for SSE bit
+ &jnc (&label("slow_enc_x86"));
+
+ &movq ("mm0",&QWP(0,$key)); # load iv
+ &movq ("mm4",&QWP(8,$key));
+
+ &set_label("slow_enc_loop_sse",16);
+ &pxor ("mm0",&QWP(0,$acc)); # xor input data
+ &pxor ("mm4",&QWP(8,$acc));
+
+ &mov ($key,$_key);
+ &call ("_sse_AES_encrypt_compact");
+
+ &mov ($acc,$_inp); # load inp
+ &mov ($key,$_out); # load out
+ &mov ($s2,$_len); # load len
+
+ &movq (&QWP(0,$key),"mm0"); # save output data
+ &movq (&QWP(8,$key),"mm4");
+
+ &lea ($acc,&DWP(16,$acc)); # advance inp
&mov ($_inp,$acc); # save inp
+ &lea ($s3,&DWP(16,$key)); # advance out
+ &mov ($_out,$s3); # save out
+ &sub ($s2,16); # decrease len
+ &cmp ($s2,16);
+ &mov ($_len,$s2); # save len
+ &jae (&label("slow_enc_loop_sse"));
+ &test ($s2,15);
+ &jnz (&label("slow_enc_tail"));
+ &mov ($acc,$_ivp); # load ivp
+ &movq (&QWP(0,$acc),"mm0"); # save ivec
+ &movq (&QWP(8,$acc),"mm4");
+ &emms ();
+ &mov ("esp",$_esp);
+ &popf ();
+ &function_end_A();
+ &pushf (); # kludge, never executed
+ }
+ &set_label("slow_enc_x86",16);
+ &mov ($s0,&DWP(0,$key)); # load iv
+ &mov ($s1,&DWP(4,$key));
+
+ &set_label("slow_enc_loop_x86",4);
+ &mov ($s2,&DWP(8,$key));
+ &mov ($s3,&DWP(12,$key));
+
+ &xor ($s0,&DWP(0,$acc)); # xor input data
+ &xor ($s1,&DWP(4,$acc));
+ &xor ($s2,&DWP(8,$acc));
+ &xor ($s3,&DWP(12,$acc));
+
+ &mov ($key,$_key); # load key
+ &call ("_x86_AES_encrypt_compact");
+
+ &mov ($acc,$_inp); # load inp
+ &mov ($key,$_out); # load out
+
+ &mov (&DWP(0,$key),$s0); # save output data
+ &mov (&DWP(4,$key),$s1);
+ &mov (&DWP(8,$key),$s2);
+ &mov (&DWP(12,$key),$s3);
&mov ($s2,$_len); # load len
- &sub ($s2,16);
- &jc (&label("dec_in_place_partial"));
+ &lea ($acc,&DWP(16,$acc)); # advance inp
+ &mov ($_inp,$acc); # save inp
+ &lea ($s3,&DWP(16,$key)); # advance out
+ &mov ($_out,$s3); # save out
+ &sub ($s2,16); # decrease len
+ &cmp ($s2,16);
&mov ($_len,$s2); # save len
- &jnz (&label("dec_in_place_loop"));
- &jmp (&label("dec_out"));
-
- &align (4);
- &set_label("dec_in_place_partial");
- # one can argue if this is actually required...
- &mov ($key eq "edi" ? $key : "",$_out);
- &lea ($acc eq "esi" ? $acc : "",$ivec);
+ &jae (&label("slow_enc_loop_x86"));
+ &test ($s2,15);
+ &jnz (&label("slow_enc_tail"));
+ &mov ($acc,$_ivp); # load ivp
+ &mov ($s2,&DWP(8,$key)); # restore last dwords
+ &mov ($s3,&DWP(12,$key));
+ &mov (&DWP(0,$acc),$s0); # save ivec
+ &mov (&DWP(4,$acc),$s1);
+ &mov (&DWP(8,$acc),$s2);
+ &mov (&DWP(12,$acc),$s3);
+
+ &mov ("esp",$_esp);
+ &popf ();
+ &function_end_A();
+ &pushf (); # kludge, never executed
+
+ &set_label("slow_enc_tail",16);
+ &emms ();
+ &mov ($key eq "edi"? $key:"",$s3); # load out to edi
+ &mov ($s1,16);
+ &sub ($s1,$s2);
+ &cmp ($key,$acc eq "esi"? $acc:""); # compare with inp
+ &je (&label("enc_in_place"));
+ &align (4);
+ &data_word(0xA4F3F689); # rep movsb # copy input
+ &jmp (&label("enc_skip_in_place"));
+ &set_label("enc_in_place");
&lea ($key,&DWP(0,$key,$s2));
- &lea ($acc,&DWP(16,$acc,$s2));
- &neg ($s2 eq "ecx" ? $s2 : "");
- &data_word(0xA4F3F689); # rep movsb # restore tail
-
- &align (4);
- &set_label("dec_out");
- &cmp ($mark,0); # was the key schedule copied?
- &mov ("edi",$_key);
- &mov ("esp",$_esp);
- &je (&label("skip_dzero"));
- # zero copy of key schedule
- &mov ("ecx",240/4);
- &xor ("eax","eax");
- &align (4);
- &data_word(0xABF3F689); # rep stosd
- &set_label("skip_dzero")
- &popf ();
+ &set_label("enc_skip_in_place");
+ &mov ($s2,$s1);
+ &xor ($s0,$s0);
+ &align (4);
+ &data_word(0xAAF3F689); # rep stosb # zero tail
+
+ &lea ($key,&DWP(-16,$s3)); # restore ivp
+ &mov ($acc,$s3); # output as input
+ &mov ($s0,&DWP(0,$key));
+ &mov ($s1,&DWP(4,$key));
+ &mov ($_len,16); # len=16
+ &jmp (&label("slow_enc_loop_x86")); # one more spin...
+
+#--------------------------- SLOW DECRYPT ---------------------------#
+&set_label("slow_decrypt",16);
+ if (!$x86only) {
+ &bt ($_tmp,25); # check for SSE bit
+ &jnc (&label("slow_dec_loop_x86"));
+
+ &set_label("slow_dec_loop_sse",4);
+ &movq ("mm0",&QWP(0,$acc)); # read input
+ &movq ("mm4",&QWP(8,$acc));
+
+ &mov ($key,$_key);
+ &call ("_sse_AES_decrypt_compact");
+
+ &mov ($acc,$_inp); # load inp
+ &lea ($s0,$ivec);
+ &mov ($s1,$_out); # load out
+ &mov ($s2,$_len); # load len
+ &mov ($key,$_ivp); # load ivp
+
+ &movq ("mm1",&QWP(0,$acc)); # re-read input
+ &movq ("mm5",&QWP(8,$acc));
+
+ &pxor ("mm0",&QWP(0,$key)); # xor iv
+ &pxor ("mm4",&QWP(8,$key));
+
+ &movq (&QWP(0,$key),"mm1"); # copy input to iv
+ &movq (&QWP(8,$key),"mm5");
+
+ &sub ($s2,16); # decrease len
+ &jc (&label("slow_dec_partial_sse"));
+
+ &movq (&QWP(0,$s1),"mm0"); # write output
+ &movq (&QWP(8,$s1),"mm4");
+
+ &lea ($s1,&DWP(16,$s1)); # advance out
+ &mov ($_out,$s1); # save out
+ &lea ($acc,&DWP(16,$acc)); # advance inp
+ &mov ($_inp,$acc); # save inp
+ &mov ($_len,$s2); # save len
+ &jnz (&label("slow_dec_loop_sse"));
+ &emms ();
+ &mov ("esp",$_esp);
+ &popf ();
+ &function_end_A();
+ &pushf (); # kludge, never executed
+
+ &set_label("slow_dec_partial_sse",16);
+ &movq (&QWP(0,$s0),"mm0"); # save output to temp
+ &movq (&QWP(8,$s0),"mm4");
+ &emms ();
+
+ &add ($s2 eq "ecx" ? "ecx":"",16);
+ &mov ("edi",$s1); # out
+ &mov ("esi",$s0); # temp
+ &align (4);
+ &data_word(0xA4F3F689); # rep movsb # copy partial output
+
+ &mov ("esp",$_esp);
+ &popf ();
+ &function_end_A();
+ &pushf (); # kludge, never executed
+ }
+ &set_label("slow_dec_loop_x86",16);
+ &mov ($s0,&DWP(0,$acc)); # read input
+ &mov ($s1,&DWP(4,$acc));
+ &mov ($s2,&DWP(8,$acc));
+ &mov ($s3,&DWP(12,$acc));
+
+ &lea ($key,$ivec);
+ &mov (&DWP(0,$key),$s0); # copy to temp
+ &mov (&DWP(4,$key),$s1);
+ &mov (&DWP(8,$key),$s2);
+ &mov (&DWP(12,$key),$s3);
+
+ &mov ($key,$_key); # load key
+ &call ("_x86_AES_decrypt_compact");
+
+ &mov ($key,$_ivp); # load ivp
+ &mov ($acc,$_len); # load len
+ &xor ($s0,&DWP(0,$key)); # xor iv
+ &xor ($s1,&DWP(4,$key));
+ &xor ($s2,&DWP(8,$key));
+ &xor ($s3,&DWP(12,$key));
+
+ &sub ($acc,16);
+ &jc (&label("slow_dec_partial_x86"));
+
+ &mov ($_len,$acc); # save len
+ &mov ($acc,$_out); # load out
+
+ &mov (&DWP(0,$acc),$s0); # write output
+ &mov (&DWP(4,$acc),$s1);
+ &mov (&DWP(8,$acc),$s2);
+ &mov (&DWP(12,$acc),$s3);
+
+ &lea ($acc,&DWP(16,$acc)); # advance out
+ &mov ($_out,$acc); # save out
+
+ &lea ($acc,$ivec);
+ &mov ($s0,&DWP(0,$acc)); # read temp
+ &mov ($s1,&DWP(4,$acc));
+ &mov ($s2,&DWP(8,$acc));
+ &mov ($s3,&DWP(12,$acc));
+
+ &mov (&DWP(0,$key),$s0); # copy it to iv
+ &mov (&DWP(4,$key),$s1);
+ &mov (&DWP(8,$key),$s2);
+ &mov (&DWP(12,$key),$s3);
+
+ &mov ($acc,$_inp); # load inp
+ &lea ($acc,&DWP(16,$acc)); # advance inp
+ &mov ($_inp,$acc); # save inp
+ &mov ($_len,$s2); # save len
+ &jnz (&label("slow_dec_loop_x86"));
+ &mov ("esp",$_esp);
+ &popf ();
+ &function_end_A();
+ &pushf (); # kludge, never executed
+
+ &set_label("slow_dec_partial_x86",16);
+ &lea ($acc,$ivec);
+ &mov (&DWP(0,$acc),$s0); # save output to temp
+ &mov (&DWP(4,$acc),$s1);
+ &mov (&DWP(8,$acc),$s2);
+ &mov (&DWP(12,$acc),$s3);
+
+ &mov ($acc,$_inp);
+ &mov ($s0,&DWP(0,$acc)); # re-read input
+ &mov ($s1,&DWP(4,$acc));
+ &mov ($s2,&DWP(8,$acc));
+ &mov ($s3,&DWP(12,$acc));
+
+ &mov (&DWP(0,$key),$s0); # copy it to iv
+ &mov (&DWP(4,$key),$s1);
+ &mov (&DWP(8,$key),$s2);
+ &mov (&DWP(12,$key),$s3);
+
+ &mov ("ecx",$_len);
+ &mov ("edi",$_out);
+ &lea ("esi",$ivec);
+ &align (4);
+ &data_word(0xA4F3F689); # rep movsb # copy partial output
+
+ &mov ("esp",$_esp);
+ &popf ();
&function_end("AES_cbc_encrypt");
}