#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Version 4.3.
# P4       56[60]    84[100]   23
# AMD K8   48[44]    70[79]    18
# PIII     41[50]    61[91]    24
+# Core 2   32[38]    45[70]    18.5
# Pentium  120       160       77
#
# Version 4.1 switches to compact S-box even in key schedule setup.
# Current implementation accesses *all* S-box cache lines within a
# ~50-cycle window, which is actually *less* than the RDTSC latency
# on Intel P4!
-push(@INC,"perlasm","../../perlasm");
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
&mov ($acc,$s[$i]);
&and ($acc,0x80808080);
&mov ($tmp,$acc);
- &mov ($r2,$s[$i]);
&shr ($tmp,7);
- &and ($r2,0x7f7f7f7f);
+ &lea ($r2,&DWP(0,$s[$i],$s[$i]));
&sub ($acc,$tmp);
- &lea ($r2,&DWP(0,$r2,$r2));
+ &and ($r2,0xfefefefe);
&and ($acc,0x1b1b1b1b);
&mov ($tmp,$s[$i]);
&xor ($acc,$r2); # r2
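
The rewritten sequence computes xtime(x), i.e. multiplication by 2 in GF(2^8), on all four bytes of a 32-bit register at once: doubling with `lea` first and masking with 0xfefefefe afterwards replaces the old mask-with-0x7f7f7f7f-then-double form and saves one register-to-register `mov` per step. A minimal C sketch of the same SIMD-within-a-register trick (xtime4 is our name, not part of the module):

    #include <stdint.h>
    #include <stdio.h>

    /* Double each of the four bytes packed in x in GF(2^8), branchlessly,
     * mirroring the and/shr/sub/and and lea/and/xor sequence above. */
    static uint32_t xtime4(uint32_t x)
    {
        uint32_t acc = x & 0x80808080;          /* MSB of every byte */
        acc = (acc - (acc >> 7)) & 0x1b1b1b1b;  /* 0x1b where MSB was set */
        uint32_t r2 = (x + x) & 0xfefefefe;     /* lea r2,[x+x]; and r2,0xfefefefe */
        return r2 ^ acc;                        /* reduce mod x^8+x^4+x^3+x+1 */
    }

    int main(void)
    {
        printf("%08x\n", xtime4(0x80402010));   /* prints 1b804020 */
        return 0;
    }

The acc term turns the carried-out MSB of every byte into the reduction constant 0x1b without a branch or a table lookup, which is what keeps the sequence timing-neutral.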
&mov ($acc,$s[$i]);
&and ($acc,0x80808080);
&mov ($tmp,$acc);
- &mov ($tp2,$s[$i]);
&shr ($tmp,7);
- &and ($tp2,0x7f7f7f7f);
+ &lea ($tp2,&DWP(0,$s[$i],$s[$i]));
&sub ($acc,$tmp);
- &add ($tp2,$tp2);
+ &and ($tp2,0xfefefefe);
&and ($acc,0x1b1b1b1b);
&xor ($acc,$tp2);
&mov ($tp2,$acc);
&and ($acc,0x80808080);
&mov ($tmp,$acc);
- &mov ($tp4,$tp2);
- &xor ($tp2,$s[$i]); # tp2^tp1
&shr ($tmp,7);
- &and ($tp4,0x7f7f7f7f);
+ &lea ($tp4,&DWP(0,$tp2,$tp2));
&sub ($acc,$tmp);
- &add ($tp4,$tp4);
+ &and ($tp4,0xfefefefe);
&and ($acc,0x1b1b1b1b);
+ &xor ($tp2,$s[$i]); # tp2^tp1
&xor ($acc,$tp4);
&mov ($tp4,$acc);
&and ($acc,0x80808080);
&mov ($tmp,$acc);
- &mov ($tp8,$tp4);
- &xor ($tp4,$s[$i]); # tp4^tp1
&shr ($tmp,7);
- &and ($tp8,0x7f7f7f7f);
+ &lea ($tp8,&DWP(0,$tp4,$tp4));
&sub ($acc,$tmp);
- &add ($tp8,$tp8);
+ &and ($tp8,0xfefefefe);
&and ($acc,0x1b1b1b1b);
+ &xor ($tp4,$s[$i]); # tp4^tp1
&rotl ($s[$i],8); # = ROTATE(tp1,8)
&xor ($tp8,$acc);
&xor ($s[$i],$tp2);
&xor ($tp2,$tp8);
- &xor ($s[$i],$tp4);
&rotl ($tp2,24);
+ &xor ($s[$i],$tp4);
&xor ($tp4,$tp8);
- &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
&rotl ($tp4,16);
- &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
+ &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
&rotl ($tp8,8);
+ &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
&xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
+ &mov ($s[0],$__s0) if($i==2); #prefetch $s0
+ &mov ($s[1],$__s1) if($i==3); #prefetch $s1
+ &mov ($s[2],$__s2) if($i==1);
&xor ($s[$i],$tp8); # ^= ROTATE(tp8,8)
- &mov ($s[0],$__s0) if($i==2); #prefetch $s0
- &mov ($s[1],$__s1) if($i==3); #prefetch $s1
- &mov ($s[2],$__s2) if($i==1);
&mov ($s[3],$__s3) if($i==1);
&mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2);
}
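
Taken together, each loop iteration applies InvMixColumns to one column: tp2, tp4 and tp8 are successive xtime() doublings of tp1, and the rotations compose 0e*x ^ ROTATE(09*x,8) ^ ROTATE(0d*x,16) ^ ROTATE(0b*x,24), exactly as the trailing comments spell out. The same chain reappears below in AES_set_decrypt_key. A self-contained C restatement of that identity (function names are ours):

    #include <stdint.h>
    #include <stdio.h>

    /* xtime on four packed bytes, as in the previous sketch. */
    static uint32_t xtime4(uint32_t x)
    {
        uint32_t acc = x & 0x80808080;
        acc = (acc - (acc >> 7)) & 0x1b1b1b1b;
        return ((x + x) & 0xfefefefe) ^ acc;
    }

    /* Rotate a 32-bit word left by n, 0 < n < 32. */
    static uint32_t rot(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

    /* InvMixColumns on one column, composed as the comments above:
     * in GF(2^8), 09 = 8^1, 0b = 8^2^1, 0d = 8^4^1, 0e = 8^4^2. */
    static uint32_t inv_mix_column(uint32_t tp1)
    {
        uint32_t tp2 = xtime4(tp1);
        uint32_t tp4 = xtime4(tp2);
        uint32_t tp8 = xtime4(tp4);
        return (tp8 ^ tp4 ^ tp2)                /* 0e*x            */
             ^ rot(tp8 ^ tp1, 8)                /* ROTATE(09*x,8)  */
             ^ rot(tp8 ^ tp4 ^ tp1, 16)         /* ROTATE(0d*x,16) */
             ^ rot(tp8 ^ tp2 ^ tp1, 24);        /* ROTATE(0b*x,24) */
    }

    int main(void)
    {
        /* prints 01010101: per byte, 0e^09^0d^0b = 01 */
        printf("%08x\n", inv_mix_column(0x01010101));
        return 0;
    }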
&mov ($acc,$tp1);
&and ($acc,0x80808080);
&mov ($tmp,$acc);
- &mov ($tp2,$tp1);
&shr ($tmp,7);
- &and ($tp2,0x7f7f7f7f);
+ &lea ($tp2,&DWP(0,$tp1,$tp1));
&sub ($acc,$tmp);
- &add ($tp2,$tp2);
+ &and ($tp2,0xfefefefe);
&and ($acc,0x1b1b1b1b);
&xor ($acc,$tp2);
&mov ($tp2,$acc);
&and ($acc,0x80808080);
&mov ($tmp,$acc);
- &mov ($tp4,$tp2);
- &xor ($tp2,$tp1); # tp2^tp1
&shr ($tmp,7);
- &and ($tp4,0x7f7f7f7f);
+ &lea ($tp4,&DWP(0,$tp2,$tp2));
&sub ($acc,$tmp);
- &add ($tp4,$tp4);
+ &and ($tp4,0xfefefefe);
&and ($acc,0x1b1b1b1b);
+ &xor ($tp2,$tp1); # tp2^tp1
&xor ($acc,$tp4);
&mov ($tp4,$acc);
&and ($acc,0x80808080);
&mov ($tmp,$acc);
- &mov ($tp8,$tp4);
- &xor ($tp4,$tp1); # tp4^tp1
&shr ($tmp,7);
- &and ($tp8,0x7f7f7f7f);
+ &lea ($tp8,&DWP(0,$tp4,$tp4));
+ &xor ($tp4,$tp1); # tp4^tp1
&sub ($acc,$tmp);
- &add ($tp8,$tp8);
+ &and ($tp8,0xfefefefe);
&and ($acc,0x1b1b1b1b);
&rotl ($tp1,8); # = ROTATE(tp1,8)
&xor ($tp8,$acc);
&xor ("eax","eax"); # return success
&function_end("AES_set_decrypt_key");
+&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
tmhl %r0,`0x8000>>2`
jz .Lekey_internal
- l $t1,0($inp) # just copy 128 bits...
- l $t2,4($inp)
- l $bits,8($inp)
- l $inp,12($inp)
- st $t1,0($key)
- st $t2,4($key)
- st $bits,8($key)
- st $inp,12($key)
+ lmg $t1,$t2,0($inp) # just copy 128 bits...
+ stmg $t1,$t2,0($key)
lghi $t1,10
st $t1,236($key) # ... postpone key setup
st $t1,240($key)
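
This hunk replaces four 32-bit load/store pairs with one lmg/stmg pair, so the 128-bit user key travels through two 64-bit registers instead of four 32-bit ones. A C analogue of the same idea (copy128 is our name; memcpy keeps the wide accesses alignment-safe):

    #include <stdint.h>
    #include <string.h>

    /* Copy a 128-bit key as two 64-bit chunks instead of four 32-bit ones,
     * the C counterpart of lmg $t1,$t2,0($inp) / stmg $t1,$t2,0($key). */
    static void copy128(unsigned char *key, const unsigned char *inp)
    {
        uint64_t t1, t2;
        memcpy(&t1, inp,     sizeof t1);
        memcpy(&t2, inp + 8, sizeof t2);
        memcpy(key,     &t1, sizeof t1);
        memcpy(key + 8, &t2, sizeof t2);
    }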
.align 16
.Lekey_internal:
- stmg %r6,%r13,48($sp) # all volatile regs, but $ra!
+ stmg %r6,%r13,48($sp) # all non-volatile regs
bras $tbl,1f
1: aghi $tbl,AES_Te+2048-.
.align 16
AES_set_decrypt_key:
stg $key,32($sp) # I rely on AES_set_encrypt_key to
- stg $ra,112($sp) # save [other] volatile registers!
+ stg $ra,112($sp) # save non-volatile registers!
bras $ra,AES_set_encrypt_key
lg $key,32($sp)
lg $ra,112($sp)
c $t1,236($key)
je .Lgo
- l $t1,0($key) # just copy 128 bits otherwise
- l $t2,4($key)
- l $t3,8($key)
- l $bits,12($key)
- st $t1,160($key)
- st $t2,164($key)
- st $t3,168($key)
- st $bits,172($key)
+ lmg $t1,$t2,0($key) # just copy 128 bits otherwise
+ stmg $t1,$t2,160($key)
lghi %r2,0
br $ra
lg $ra,40($sp)
.Lgo: llgf $rounds,240($key)
- lghi $i1,0
+ la $i1,0($key)
sllg $i2,$rounds,4
+ la $i2,0($i2,$key)
srl $rounds,1
.align 8
-.Linv: l $s0,0($i1,$key)
- l $s1,4($i1,$key)
- l $s2,8($i1,$key)
- l $s3,12($i1,$key)
- l $t1,0($i2,$key)
- l $t2,4($i2,$key)
- l $t3,8($i2,$key)
- l $i3,12($i2,$key)
- st $s0,0($i2,$key)
- st $s1,4($i2,$key)
- st $s2,8($i2,$key)
- st $s3,12($i2,$key)
- st $t1,0($i1,$key)
- st $t2,4($i1,$key)
- st $t3,8($i1,$key)
- st $i3,12($i1,$key)
+.Linv: lmg $s0,$s1,0($i1)
+ lmg $s2,$s3,0($i2)
+ stmg $s0,$s1,0($i2)
+ stmg $s2,$s3,0($i1)
aghi $i1,16
aghi $i2,-16
brct $rounds,.Linv
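
With $i1 and $i2 now carried as pointers (the la instructions above) rather than byte offsets, the .Linv loop swaps 16-byte round keys from both ends of the schedule inwards, which is how the decryption key schedule reverses the encryption one. A C sketch of that reversal (names are ours; layout is the usual four 32-bit words per round key):

    #include <stdint.h>

    /* Reverse the round-key order in place: swap the first and last
     * 16-byte round keys, then the next pair, and so on inwards. */
    static void reverse_round_keys(uint32_t *rd_key, int rounds)
    {
        uint32_t *lo = rd_key;               /* la $i1,0($key)            */
        uint32_t *hi = rd_key + 4 * rounds;  /* sllg/la: $key + rounds*16 */
        for (int n = rounds / 2; n > 0; n--, lo += 4, hi -= 4)
            for (int j = 0; j < 4; j++) {    /* swap two 16-byte round keys */
                uint32_t t = lo[j];
                lo[j] = hi[j];
                hi[j] = t;
            }
    }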
la $key,4($key)
brct $rounds,.Lmix
- lmg %r6,%r13,48($sp)# this was saved by AES_set_encrypt_key!
+ lmg %r6,%r13,48($sp) # as was saved by AES_set_encrypt_key!
lghi %r2,0
br $ra
.size AES_set_decrypt_key,.-AES_set_decrypt_key