&set_label("${mode}_pic_point");
&lea ($ctx,&DWP(16,$ctx)); # control word
&xor ("eax","eax");
- if ($mode eq "ctr16") {
+ if ($mode eq "ctr32") {
&movq ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter
} else {
&xor ("ebx","ebx");
&mov (&DWP(8,"ebp"),$len);
&mov ($len,$chunk);
&mov (&DWP(12,"ebp"),$chunk); # chunk
- if ($mode eq "ctr16") {
+ if ($mode eq "ctr32") {
&mov ("ecx",&DWP(-4,$ctx));
&xor ($out,$out);
&mov ("eax",&DWP(-8,$ctx)); # borrow $len
}
&mov ($out,&DWP(0,"ebp")); # restore parameters
&mov ($chunk,&DWP(12,"ebp"));
- if ($mode eq "ctr16") {
+ if ($mode eq "ctr32") {
&mov ($inp,&DWP(4,"ebp"));
&xor ($len,$len);
&set_label("${mode}_xor");
&sub ($len,$chunk);
&mov ($chunk,$PADLOCK_CHUNK);
&jnz (&label("${mode}_loop"));
- if ($mode ne "ctr16") {
+ if ($mode ne "ctr32") {
&test ($out,0x0f); # out_misaligned
&jz (&label("${mode}_done"));
}
&data_byte(0xf3,0xab); # rep stosl, wipe the on-stack bounce buffer
&set_label("${mode}_done");
&lea ("esp",&DWP(24,"ebp"));
- if ($mode ne "ctr16") {
+ if ($mode ne "ctr32") {
&jmp (&label("${mode}_exit"));
&set_label("${mode}_aligned",16);
&set_label("${mode}_exit"); }
&mov ("eax",1);
&lea ("esp",&DWP(4,"esp")); # popf
- &emms () if ($mode eq "ctr16");
+ &emms () if ($mode eq "ctr32");
&set_label("${mode}_abort");
&function_end("padlock_${mode}_encrypt");
}
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
-&generate_mode("ctr16",0xc8); # yes, it implements own ctr with ecb opcode,
- # because hardware ctr was introduced later
- # and even has errata on certain CPU stepping.
- # own implementation *always* works...
+&generate_mode("ctr32",0xc8); # yes, it implements own CTR with ECB opcode,
+ # because hardware CTR was introduced later
+ # and even has errata on certain C7 stepping.
+ # own implementation *always* works, though
+ # ~15% slower than dedicated hardware...
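+ # (the pattern: a batch of incrementing counter
+ # blocks is encrypted with xcrypt-ecb into a
+ # keystream, which the ctr32_xor path above
+ # then XORs into the data)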
&function_begin_B("padlock_xstore");
&push ("edi");
# September 2011
#
-# Assembler helpers for Padlock engine.
+# Assembler helpers for Padlock engine. See e_padlock-x86.pl for
+# details.
$flavour = shift;
$output = shift;
$code=".text\n";
-$PADLOCK_CHUNK=512; # Must be a power of 2 larger than 16
+$PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20
$ctx="%rdx";
$out="%rdi";
neg %rax
and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK
lea (%rax,%rbp),%rsp
+___
+$code.=<<___ if ($mode eq "ctr32");
+ mov -4($ctx),%eax # pull 32-bit counter
+ bswap %eax
+ neg %eax
+ and \$`$PADLOCK_CHUNK/16-1`,%eax
+ jz .L${mode}_loop
+ shl \$4,%eax
+ cmp %rax,$len
+ cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK
+___
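+# A note on the clamp above: the counter sits big-endian in the last 4
+# bytes of the IV (hence the bswap); neg+and computes how many blocks
+# remain until the counter's next PADLOCK_CHUNK/16-block boundary, the
+# shift by 4 turns blocks into bytes, and cmova shrinks the first chunk
+# so that no later chunk ever straddles such a boundary.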
+$code.=<<___;
jmp .L${mode}_loop
.align 16
.L${mode}_loop:
+ cmp $len,$chunk # ctr32 artefact
+ cmova $len,$chunk # ctr32 artefact
mov $out,%r8 # save parameters
mov $inp,%r9
mov $len,%r10
movdqa (%rax),%xmm0
movdqa %xmm0,-16($ctx) # copy [or refresh] iv
___
+$code.=<<___ if ($mode eq "ctr32");
+ mov -4($ctx),%eax # pull 32-bit counter
+ test \$0xffff0000,%eax
+ jnz .L${mode}_no_corr
+ bswap %eax
+ add \$0x10000,%eax
+ bswap %eax
+ mov %eax,-4($ctx)
+.L${mode}_no_corr:
+___
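+# %eax above is a little-endian load of the big-endian counter, so the
+# 0xffff0000 mask tests the counter's 16 least significant bits; when a
+# chunk ends with them wrapped to zero, the carry the hardware counter
+# leaves unpropagated (it appears to advance only the low 16 bits) is
+# added in software: bswap to native order, add 0x10000, bswap back,
+# store.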
$code.=<<___;
 mov %r8,$out # restore parameters
mov %r11,$chunk
.align 16
.L${mode}_aligned:
+___
+$code.=<<___ if ($mode eq "ctr32");
+ mov -4($ctx),%eax # pull 32-bit counter
+ mov \$`16*0x10000`,$chunk
+ bswap %eax
+ cmp $len,$chunk
+ cmova $len,$chunk
+ neg %eax
+ and \$0xffff,%eax
+ jz .L${mode}_aligned_loop
+ shl \$4,%eax
+ cmp %rax,$len
+ cmova %rax,$chunk # don't let counter cross 2^16
+ jmp .L${mode}_aligned_loop
+.align 16
+.L${mode}_aligned_loop:
+ cmp $len,$chunk
+ cmova $len,$chunk
+ mov $len,%r10 # save parameters
+ mov $chunk,$len
+ mov $chunk,%r11
+___
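+# Aligned path: $chunk is capped at 16*0x10000 bytes, i.e. exactly 2^16
+# blocks, and the first pass is additionally shortened so that it ends
+# where the counter's low 16 bits wrap to zero; every xcrypt call thus
+# stays within a single 16-bit counter epoch and needs no mid-chunk carry.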
+$code.=<<___;
lea -16($ctx),%rax # ivp
lea 16($ctx),%rbx # key
shr \$4,$len # len/=AES_BLOCK_SIZE
movdqa (%rax),%xmm0
movdqa %xmm0,-16($ctx) # copy [or refresh] iv
___
+$code.=<<___ if ($mode eq "ctr32");
+ mov -4($ctx),%eax # pull 32-bit counter
+ bswap %eax
+ add \$0x10000,%eax
+ bswap %eax
+ mov %eax,-4($ctx)
+
+ mov %r11,$chunk # restore parameters
+ mov %r10,$len
+ sub $chunk,$len
+ mov \$`16*0x10000`,$chunk
+ jnz .L${mode}_aligned_loop
+___
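+# Each full pass advances the counter by exactly 0x10000 blocks, so the
+# carry into bits 16 and up is applied here in software; after a final
+# short pass $len is zero, the loop falls through, and the stored counter
+# is not consumed again by this function.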
$code.=<<___;
.L${mode}_exit:
mov \$1,%eax
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
-&generate_mode("ctr16",0xd8);
+&generate_mode("ctr32",0xd8); # all 64-bit CPUs have working CTR...
$code.=<<___;
.asciz "VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
#endif
#include <openssl/rand.h>
#include <openssl/err.h>
+#include <openssl/modes.h>
#ifndef OPENSSL_NO_HW
#ifndef OPENSSL_NO_HW_PADLOCK
NID_aes_128_cbc,
NID_aes_128_cfb,
NID_aes_128_ofb,
+ NID_aes_128_ctr,
NID_aes_192_ecb,
NID_aes_192_cbc,
NID_aes_192_cfb,
NID_aes_192_ofb,
+ NID_aes_192_ctr,
NID_aes_256_ecb,
NID_aes_256_cbc,
NID_aes_256_cfb,
NID_aes_256_ofb,
+ NID_aes_256_ctr
};
static int padlock_cipher_nids_num = (sizeof(padlock_cipher_nids)/
sizeof(padlock_cipher_nids[0]));
return 1;
}
+static void padlock_ctr32_encrypt_glue(const unsigned char *in,
+ unsigned char *out, size_t blocks,
+ struct padlock_cipher_data *ctx,
+ const unsigned char *ivec)
+{
+ memcpy(ctx->iv,ivec,AES_BLOCK_SIZE);
+ padlock_ctr32_encrypt(out,in,ctx,AES_BLOCK_SIZE*blocks);
+}
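+/*
+ * The glue adapts padlock_ctr32_encrypt, which takes a byte count and
+ * expects the IV inside the context structure, to the ctr128_f callback
+ * shape (block count, caller-supplied ivec). The updated counter need
+ * not be copied back: CRYPTO_ctr128_encrypt_ctr32 maintains the 32-bit
+ * counter itself between callback invocations.
+ */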
+
+static int
+padlock_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg,
+ const unsigned char *in_arg, size_t nbytes)
+{
+ struct padlock_cipher_data *cdata = ALIGNED_CIPHER_DATA(ctx);
+ unsigned int num = ctx->num;
+
+ CRYPTO_ctr128_encrypt_ctr32(in_arg,out_arg,nbytes,
+ cdata,ctx->iv,ctx->buf,&num,
+ (ctr128_f)padlock_ctr32_encrypt_glue);
+
+ ctx->num = (size_t)num;
+ return 1;
+}
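+/*
+ * num is the byte offset into the keystream block buffered in ctx->buf,
+ * so requests that are not a multiple of AES_BLOCK_SIZE resume mid-block
+ * on the next call; this is also why CTR is declared with a block size
+ * of 1 below.
+ */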
+
#define EVP_CIPHER_block_size_ECB AES_BLOCK_SIZE
#define EVP_CIPHER_block_size_CBC AES_BLOCK_SIZE
#define EVP_CIPHER_block_size_OFB 1
#define EVP_CIPHER_block_size_CFB 1
+#define EVP_CIPHER_block_size_CTR 1
/* Declaring so many ciphers by hand would be a pain.
Instead introduce a bit of preprocessor magic :-) */
DECLARE_AES_EVP(128,cbc,CBC);
DECLARE_AES_EVP(128,cfb,CFB);
DECLARE_AES_EVP(128,ofb,OFB);
+DECLARE_AES_EVP(128,ctr,CTR);
DECLARE_AES_EVP(192,ecb,ECB);
DECLARE_AES_EVP(192,cbc,CBC);
DECLARE_AES_EVP(192,cfb,CFB);
DECLARE_AES_EVP(192,ofb,OFB);
+DECLARE_AES_EVP(192,ctr,CTR);
DECLARE_AES_EVP(256,ecb,ECB);
DECLARE_AES_EVP(256,cbc,CBC);
DECLARE_AES_EVP(256,cfb,CFB);
DECLARE_AES_EVP(256,ofb,OFB);
+DECLARE_AES_EVP(256,ctr,CTR);
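+/*
+ * Each DECLARE_AES_EVP(ksize,lmode,umode) instantiates an EVP_CIPHER
+ * named padlock_aes_<ksize>_<lmode>, taking its block size from the
+ * EVP_CIPHER_block_size_<umode> defines above; padlock_ctr_cipher
+ * follows the padlock_<lmode>_cipher naming the macro evidently
+ * resolves to its do_cipher handler.
+ */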
static int
padlock_ciphers (ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid)
case NID_aes_128_ofb:
*cipher = &padlock_aes_128_ofb;
break;
+ case NID_aes_128_ctr:
+ *cipher = &padlock_aes_128_ctr;
+ break;
case NID_aes_192_ecb:
*cipher = &padlock_aes_192_ecb;
case NID_aes_192_ofb:
*cipher = &padlock_aes_192_ofb;
break;
+ case NID_aes_192_ctr:
+ *cipher = &padlock_aes_192_ctr;
+ break;
case NID_aes_256_ecb:
*cipher = &padlock_aes_256_ecb;
case NID_aes_256_ofb:
*cipher = &padlock_aes_256_ofb;
break;
+ case NID_aes_256_ctr:
+ *cipher = &padlock_aes_256_ctr;
+ break;
default:
/* Sorry, we don't support this NID */
{
struct padlock_cipher_data *cdata;
int key_len = EVP_CIPHER_CTX_key_length(ctx) * 8;
+ unsigned long mode = EVP_CIPHER_CTX_mode(ctx);
if (key==NULL) return 0; /* ERROR */
memset(cdata, 0, sizeof(struct padlock_cipher_data));
/* Prepare Control word. */
- if (EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_OFB_MODE)
+ if (mode == EVP_CIPH_OFB_MODE || mode == EVP_CIPH_CTR_MODE)
cdata->cword.b.encdec = 0;
else
cdata->cword.b.encdec = (ctx->encrypt == 0);
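+ /*
+  * OFB and CTR only ever feed the IV/counter block through the
+  * cipher, so the engine is programmed for encryption in both
+  * directions; decryption is the same keystream XOR.
+  */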
and is listed as hardware errata. They most
likely will fix it at some point and then
a check for stepping would be due here. */
- if ((EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_ECB_MODE ||
- EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_CBC_MODE)
+ if ((mode == EVP_CIPH_ECB_MODE ||
+ mode == EVP_CIPH_CBC_MODE)
&& !enc)
AES_set_decrypt_key(key, key_len, &cdata->ks);
else