From ed998634cd91db50ec1c9dcab7f69310aad96256 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Mon, 19 Mar 2012 20:23:32 +0000 Subject: [PATCH] e_padlock-x86[_64].pl: better understanding of prefetch errata and proper workaround. --- engines/asm/e_padlock-x86.pl | 104 ++++++++++++++----- engines/asm/e_padlock-x86_64.pl | 178 ++++++++++++++++++++++---------- 2 files changed, 204 insertions(+), 78 deletions(-) diff --git a/engines/asm/e_padlock-x86.pl b/engines/asm/e_padlock-x86.pl index 1b2ba52253..4148468c41 100644 --- a/engines/asm/e_padlock-x86.pl +++ b/engines/asm/e_padlock-x86.pl @@ -37,7 +37,7 @@ require "x86asm.pl"; &asm_init($ARGV[0],$0); -%PADLOCK_MARGIN=(ecb=>128, cbc=>64); # prefetch errata +%PADLOCK_PREFETCH=(ecb=>128, cbc=>64); # prefetch errata $PADLOCK_CHUNK=512; # Must be a power of 2 larger than 16 $ctx="edx"; @@ -188,10 +188,6 @@ my ($mode,$opcode) = @_; &movq ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter } else { &xor ("ebx","ebx"); - if ($PADLOCK_MARGIN{$mode}) { - &cmp ($len,$PADLOCK_MARGIN{$mode}); - &jbe (&label("${mode}_short")); - } &test (&DWP(0,$ctx),1<<5); # align bit in control word &jnz (&label("${mode}_aligned")); &test ($out,0x0f); @@ -212,7 +208,27 @@ my ($mode,$opcode) = @_; &neg ("eax"); &and ($chunk,$PADLOCK_CHUNK-1); # chunk=len%PADLOCK_CHUNK &lea ("esp",&DWP(0,"eax","ebp")); # alloca + &mov ("eax",$PADLOCK_CHUNK); + &cmovz ($chunk,"eax"); # chunk=chunk?:PADLOCK_CHUNK + &mov ("eax","ebp"); + &and ("ebp",-16); &and ("esp",-16); + &mov (&DWP(16,"ebp"),"eax"); + if ($PADLOCK_PREFETCH{$mode}) { + &cmp ($len,$chunk); + &ja (&label("${mode}_loop")); + &mov ("eax",$inp); # check if prefetch crosses page + &cmp ("ebp","esp"); + &cmove ("eax",$out); + &add ("eax",$len); + &neg ("eax"); + &and ("eax",0xfff); # distance to page boundary + &cmp ("eax",$PADLOCK_PREFETCH{$mode}); + &mov ("eax",-$PADLOCK_PREFETCH{$mode}); + &cmovae ("eax",$chunk); # mask=distance128, cbc=>64, ctr32=>64); # prefetch errata +%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32); # prefetch errata $PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20 $ctx="%rdx"; @@ -285,17 +285,6 @@ padlock_${mode}_encrypt: lea 16($ctx),$ctx # control word xor %eax,%eax xor %ebx,%ebx -___ -# Formally speaking correct condtion is $len<=$margin and $inp+$margin -# crosses page boundary [and next page is unreadable]. But $inp can -# be unaligned in which case data can be copied to $out if latter is -# aligned, in which case $out+$margin has to be checked. Covering all -# cases appears more complicated than just copying short input... -$code.=<<___ if ($PADLOCK_MARGIN{$mode}); - cmp \$$PADLOCK_MARGIN{$mode},$len - jbe .L${mode}_short -___ -$code.=<<___; testl \$`1<<5`,($ctx) # align bit in control word jnz .L${mode}_aligned test \$0x0f,$out @@ -315,6 +304,8 @@ $code.=<<___; neg %rax and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK lea (%rax,%rbp),%rsp + mov \$$PADLOCK_CHUNK,%rax + cmovz %rax,$chunk # chunk=chunk?:PADLOCK_CHUNK ___ $code.=<<___ if ($mode eq "ctr32"); .L${mode}_reenter: @@ -322,10 +313,27 @@ $code.=<<___ if ($mode eq "ctr32"); bswap %eax neg %eax and \$`$PADLOCK_CHUNK/16-1`,%eax - jz .L${mode}_loop + mov \$$PADLOCK_CHUNK,$chunk shl \$4,%eax + cmovz $chunk,%rax cmp %rax,$len cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK + cmovbe $len,$chunk +___ +$code.=<<___ if ($PADLOCK_PREFETCH{$mode}); + cmp $chunk,$len + ja .L${mode}_loop + mov $inp,%rax # check if prefetch crosses page + cmp %rsp,%rbp + cmove $out,%rax + add $len,%rax + neg %rax + and \$0xfff,%rax # distance to page boundary + cmp \$$PADLOCK_PREFETCH{$mode},%rax + mov \$-$PADLOCK_PREFETCH{$mode},%rax + cmovae $chunk,%rax # mask=distance