&asm_init($ARGV[0],$0);
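+# VIA's xcrypt instructions are reported to prefetch past the end of
+# their input (the "prefetch errata" noted below); inputs of at most
+# PADLOCK_MARGIN bytes are therefore staged through an on-stack buffer
+# so the over-read cannot touch an unmapped page.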
+%PADLOCK_MARGIN=(ecb=>128, cbc=>64); # prefetch errata
$PADLOCK_CHUNK=512; # Must be a power of 2 larger than 16
$ctx="edx";
&movq ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter
} else {
&xor ("ebx","ebx");
+ if ($PADLOCK_MARGIN{$mode}) {
+ &cmp ($len,$PADLOCK_MARGIN{$mode});
+ &jbe (&label("${mode}_short"));
+ }
&test (&DWP(0,$ctx),1<<5); # align bit in control word
&jnz (&label("${mode}_aligned"));
&test ($out,0x0f);
&mov ($chunk,$PADLOCK_CHUNK);
&jnz (&label("${mode}_loop"));
if ($mode ne "ctr32") {
- &test ($out,0x0f); # out_misaligned
- &jz (&label("${mode}_done"));
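+	# a scratch area exists below the frame iff %esp!=%ebp
+	# (misaligned output or short input)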
+ &cmp ("esp","ebp");
+ &je (&label("${mode}_done"));
}
- &mov ($len,"ebp");
- &mov ($out,"esp");
- &sub ($len,"esp");
- &xor ("eax","eax");
- &shr ($len,2);
- &data_byte(0xf3,0xab); # rep stosl
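+	# wipe the scratch area with 16-byte SSE stores
+	# (replacing rep stosl)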
+ &pxor ("xmm0","xmm0");
+ &lea ("eax",&DWP(0,"esp"));
+&set_label("${mode}_bzero");
+ &movaps (&QWP(0,"eax"),"xmm0");
+ &lea ("eax",&DWP(16,"eax"));
+ &cmp ("ebp","eax");
+ &ja (&label("${mode}_bzero"));
+
&set_label("${mode}_done");
&lea ("esp",&DWP(24,"ebp"));
if ($mode ne "ctr32") {
&jmp (&label("${mode}_exit"));
+&set_label("${mode}_short",16);
+ &xor ("eax","eax");
+ &lea ("ebp",&DWP(-24,"esp"));
+ &sub ("eax",$len);
+ &lea ("esp",&DWP(0,"eax","ebp"));
+ &and ("esp",-16);
+ &xor ($chunk,$chunk);
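+	# copy 16 bytes per iteration: unaligned loads from $inp,
+	# aligned stores to the stack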
+&set_label("${mode}_short_copy");
+ &movups ("xmm0",&QWP(0,$inp,$chunk));
+ &lea ($chunk,&DWP(16,$chunk));
+ &cmp ($len,$chunk);
+ &movaps (&QWP(-16,"esp",$chunk),"xmm0");
+ &ja (&label("${mode}_short_copy"));
+ &mov ($inp,"esp");
+ &mov ($chunk,$len);
+ &jmp (&label("${mode}_loop"));
+
&set_label("${mode}_aligned",16);
&lea ("eax",&DWP(-16,$ctx)); # ivp
&lea ("ebx",&DWP(16,$ctx)); # key
$code=".text\n";
+%PADLOCK_MARGIN=(ecb=>128, cbc=>64, ctr32=>64); # prefetch errata
$PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20
$ctx="%rdx";
lea 16($ctx),$ctx # control word
xor %eax,%eax
xor %ebx,%ebx
+___
+# Formally speaking, the correct condition is $len<=$margin and
+# $inp+$margin crossing a page boundary [into an unreadable page].
+# But $inp can be unaligned, in which case the data may be copied
+# to $out if the latter is aligned, and then $out+$margin would
+# have to be checked instead. Covering all cases appears more
+# complicated than just copying short input...
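+# (E.g. for cbc: a read up to $inp+64 may fault if that range
+# extends into an unreadable page.)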
+$code.=<<___ if ($PADLOCK_MARGIN{$mode});
+ cmp \$$PADLOCK_MARGIN{$mode},$len
+ jbe .L${mode}_short
+___
+$code.=<<___;
testl \$`1<<5`,($ctx) # align bit in control word
jnz .L${mode}_aligned
test \$0x0f,$out
lea (%rax,%rbp),%rsp
___
$code.=<<___ if ($mode eq "ctr32");
+.L${mode}_reenter:
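+	# re-entry point for the short-input path: ctr32 must pull the
+	# counter below before joining the loop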
mov -4($ctx),%eax # pull 32-bit counter
bswap %eax
neg %eax
mov \$$PADLOCK_CHUNK,$chunk
jnz .L${mode}_loop
- test \$0x0f,$out
- jz .L${mode}_done
+ cmp %rsp,%rbp
+ je .L${mode}_done
+
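+	# wipe the on-stack copy with 16-byte SSE stores
+	# (replacing rep stosq)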
+ pxor %xmm0,%xmm0
+ lea (%rsp),%rax
+.L${mode}_bzero:
+ movaps %xmm0,(%rax)
+ lea 16(%rax),%rax
+ cmp %rax,%rbp
+ ja .L${mode}_bzero
- mov %rbp,$len
- mov %rsp,$out
- sub %rsp,$len
- xor %rax,%rax
- shr \$3,$len
- .byte 0xf3,0x48,0xab # rep stosq
.L${mode}_done:
lea (%rbp),%rsp
jmp .L${mode}_exit
-
+___
+$code.=<<___ if ($PADLOCK_MARGIN{$mode});
+.align 16
+.L${mode}_short:
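+	# stage the short input in a fresh stack buffer so the
+	# engine's over-read stays within our own (mapped) stack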
+ mov %rsp,%rbp
+ sub $len,%rsp
+ xor $chunk,$chunk
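+	# copy 16 bytes per iteration: unaligned loads, aligned stores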
+.L${mode}_short_copy:
+ movups ($inp,$chunk),%xmm0
+ lea 16($chunk),$chunk
+ cmp $chunk,$len
+ movaps %xmm0,-16(%rsp,$chunk)
+ ja .L${mode}_short_copy
+ mov %rsp,$inp
+ mov $len,$chunk
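+	# ctr32 re-enters where its counter is pulled; other modes
+	# join the main loop directly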
+ jmp .L${mode}_`${mode} eq "ctr32"?"reenter":"loop"`
+___
+$code.=<<___;
.align 16
.L${mode}_aligned:
___