From: Andy Polyakov <appro@openssl.org>
Date: Sun, 7 Aug 2011 17:47:56 +0000 (+0000)
Subject: aes/asm/aesni-*.pl: fix CCM and further optimize it.
X-Git-Tag: OpenSSL-fips-2_0-rc1~232
X-Git-Url: https://git.librecmc.org/?a=commitdiff_plain;h=267b481c47a937d926aca4a9c866af7397fc040d;p=oweals%2Fopenssl.git

aes/asm/aesni-*.pl: fix CCM and further optimize it.
modes/ccm128.c: minor branch optimization.
---

diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
index b3c8d1f60a..f95bf520d3 100644
--- a/crypto/aes/asm/aesni-x86.pl
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -594,6 +594,7 @@ if ($PREFIX eq "aesni") {
 
 	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
 	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
+	&mov	($rounds,&DWP(240,$key));
 
 	# compose byte-swap control mask for pshufb on stack
 	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
@@ -602,34 +603,29 @@ if ($PREFIX eq "aesni") {
 	&mov	(&DWP(12,"esp"),0x00010203);
 
 	# compose counter increment vector on stack
-	&mov	($rounds,1);
+	&mov	($rounds_,1);
 	&xor	($key_,$key_);
-	&mov	(&DWP(16,"esp"),$rounds);
+	&mov	(&DWP(16,"esp"),$rounds_);
 	&mov	(&DWP(20,"esp"),$key_);
 	&mov	(&DWP(24,"esp"),$key_);
 	&mov	(&DWP(28,"esp"),$key_);
 
-	&movdqa	($inout3,&QWP(0,"esp"));
-	&pshufb	($ivec,$inout3);		# keep iv in reverse order
-
-	&mov	($rounds,&DWP(240,$key));
-	&mov	($key_,$key);
-	&mov	($rounds_,$rounds);
+	&shr	($rounds,1);
+	&lea	($key_,&DWP(0,$key));
 	&movdqa	($inout0,$ivec);
+	&mov	($rounds_,$rounds);
+	&movdqa	($inout3,&QWP(0,"esp"));
 
 &set_label("ccm64_enc_outer");
-	&movups		($in0,&QWP(0,$inp));
-	&pshufb		($inout0,$inout3);
-	&mov		($key,$key_);
+	&$movekey	($rndkey0,&QWP(0,$key_));
 	&mov		($rounds,$rounds_);
+	&movups		($in0,&QWP(0,$inp));
 
-	&$movekey	($rndkey0,&QWP(0,$key));
-	&shr		($rounds,1);
-	&$movekey	($rndkey1,&QWP(16,$key));
-	&xorps		($in0,$rndkey0);
-	&lea		($key,&DWP(32,$key));
 	&xorps		($inout0,$rndkey0);
-	&xorps		($cmac,$in0);		# cmac^=inp
+	&$movekey	($rndkey1,&QWP(16,$key_));
+	&xorps		($rndkey0,$in0);
+	&lea		($key,&DWP(32,$key_));
+	&xorps		($cmac,$rndkey0);		# cmac^=inp
 	&$movekey	($rndkey0,&QWP(0,$key));
 
 &set_label("ccm64_enc2_loop");
@@ -642,18 +638,20 @@ if ($PREFIX eq "aesni") {
 	&aesenc		($cmac,$rndkey0);
 	&$movekey	($rndkey0,&QWP(0,$key));
 	&jnz		(&label("ccm64_enc2_loop"));
+	&pshufb		($ivec,$inout3);
 	&aesenc		($inout0,$rndkey1);
 	&aesenc		($cmac,$rndkey1);
+	&paddq		($ivec,&QWP(16,"esp"));
 	&aesenclast	($inout0,$rndkey0);
 	&aesenclast	($cmac,$rndkey0);
 
-	&paddq	($ivec,&QWP(16,"esp"));
 	&dec	($len);
 	&lea	($inp,&DWP(16,$inp));
 	&xorps	($in0,$inout0);			# inp^=E(ivec)
 	&movdqa	($inout0,$ivec);
-	&movups	(&QWP(0,$out),$in0);
+	&movups	(&QWP(0,$out),$in0);		# save output
 	&lea	($out,&DWP(16,$out));
+	&pshufb	($ivec,$inout3);
 	&jnz	(&label("ccm64_enc_outer"));
 
 	&mov	("esp",&DWP(48,"esp"));
@@ -675,6 +673,7 @@ if ($PREFIX eq "aesni") {
 
 	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
 	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
+	&mov	($rounds,&DWP(240,$key));
 
 	# compose byte-swap control mask for pshufb on stack
 	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
@@ -683,46 +682,45 @@ if ($PREFIX eq "aesni") {
 	&mov	(&DWP(12,"esp"),0x00010203);
 
 	# compose counter increment vector on stack
-	&mov	($rounds,1);
+	&mov	($rounds_,1);
 	&xor	($key_,$key_);
-	&mov	(&DWP(16,"esp"),$rounds);
+	&mov	(&DWP(16,"esp"),$rounds_);
 	&mov	(&DWP(20,"esp"),$key_);
 	&mov	(&DWP(24,"esp"),$key_);
 	&mov	(&DWP(28,"esp"),$key_);
 
 	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
 	&movdqa	($inout0,$ivec);
-	&pshufb	($ivec,$inout3);		# keep iv in reverse order
 
-	&mov	($rounds,&DWP(240,$key));
 	&mov	($key_,$key);
 	&mov	($rounds_,$rounds);
 
+	&pshufb	($ivec,$inout3);
 	if ($inline)
 	{   &aesni_inline_generate1("enc");	}
 	else
 	{   &call	("_aesni_encrypt1");	}
-
-&set_label("ccm64_dec_outer");
-	&paddq	($ivec,&QWP(16,"esp"));
 	&movups	($in0,&QWP(0,$inp));		# load inp
-	&xorps	($in0,$inout0);
-	&movdqa	($inout0,$ivec);
+	&paddq	($ivec,&QWP(16,"esp"));
+	&pshufb	($ivec,$inout3);
 	&lea	($inp,&QWP(16,$inp));
-	&pshufb	($inout0,$inout3);
-	&mov	($key,$key_);
+	&jmp	(&label("ccm64_dec_outer"));
+
+&set_label("ccm64_dec_outer",16);
+	&xorps	($in0,$inout0);			# inp ^= E(ivec)
+	&movdqa	($inout0,$ivec);
 	&mov	($rounds,$rounds_);
-	&movups	(&QWP(0,$out),$in0);
+	&movups	(&QWP(0,$out),$in0);		# save output
 	&lea	($out,&DWP(16,$out));
 
 	&sub	($len,1);
 	&jz	(&label("ccm64_dec_break"));
 
-	&$movekey	($rndkey0,&QWP(0,$key));
+	&$movekey	($rndkey0,&QWP(0,$key_));
 	&shr		($rounds,1);
-	&$movekey	($rndkey1,&QWP(16,$key));
+	&$movekey	($rndkey1,&QWP(16,$key_));
 	&xorps		($in0,$rndkey0);
-	&lea		($key,&DWP(32,$key));
+	&lea		($key,&DWP(32,$key_));
 	&xorps		($inout0,$rndkey0);
 	&xorps		($cmac,$in0);		# cmac^=out
 	&$movekey	($rndkey0,&QWP(0,$key));
@@ -737,13 +735,19 @@ if ($PREFIX eq "aesni") {
 	&aesenc		($cmac,$rndkey0);
 	&$movekey	($rndkey0,&QWP(0,$key));
 	&jnz		(&label("ccm64_dec2_loop"));
+	&movups		($in0,&QWP(0,$inp));	# load inp
+	&pshufb		($ivec,$inout3);	# byte-swap ivec before the 64-bit add
+	&paddq		($ivec,&QWP(16,"esp"));
 	&aesenc		($inout0,$rndkey1);
 	&aesenc		($cmac,$rndkey1);
+	&pshufb		($ivec,$inout3);
+	&lea		($inp,&QWP(16,$inp));
 	&aesenclast	($inout0,$rndkey0);
 	&aesenclast	($cmac,$rndkey0);
 	&jmp	(&label("ccm64_dec_outer"));
 
 &set_label("ccm64_dec_break",16);
+	&mov	($key,$key_);
 	if ($inline)
 	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
 	else
diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
index ae0ad7f809..98c0dd55bf 100644
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -821,8 +821,8 @@ ___
 {
 my $cmac="%r9";	# 6th argument
 
-my $increment="%xmm8";
-my $bswap_mask="%xmm9";
+my $increment="%xmm6";
+my $bswap_mask="%xmm7";
 
 $code.=<<___;
 .globl	aesni_ccm64_encrypt_blocks
@@ -839,30 +839,28 @@ $code.=<<___ if ($win64);
 .Lccm64_enc_body:
 ___
 $code.=<<___;
+	mov	240($key),$rounds		# key->rounds
 	movdqu	($ivp),$iv
-	movdqu	($cmac),$inout1
 	movdqa	.Lincrement64(%rip),$increment
 	movdqa	.Lbswap_mask(%rip),$bswap_mask
-	pshufb	$bswap_mask,$iv			# keep iv in reverse order
 
-	mov	240($key),$rounds		# key->rounds
-	mov	$key,$key_
-	mov	$rounds,$rnds_
+	shr	\$1,$rounds
+	lea	0($key),$key_
+	movdqu	($cmac),$inout1
 	movdqa	$iv,$inout0
-
+	mov	$rounds,$rnds_
+	jmp	.Lccm64_enc_outer
+.align	16
 .Lccm64_enc_outer:
-	movups	($inp),$in0			# load inp
-	pshufb	$bswap_mask,$inout0
-	mov	$key_,$key
+	$movkey	($key_),$rndkey0
 	mov	$rnds_,$rounds
+	movups	($inp),$in0			# load inp
 
-	$movkey	($key),$rndkey0
-	shr	\$1,$rounds
-	$movkey	16($key),$rndkey1
-	xorps	$rndkey0,$in0
-	lea	32($key),$key
-	xorps	$rndkey0,$inout0
-	xorps	$inout1,$in0			# cmac^=inp
+	xorps	$rndkey0,$inout0		# counter
+	$movkey	16($key_),$rndkey1
+	xorps	$in0,$rndkey0
+	lea	32($key_),$key
+	xorps	$rndkey0,$inout1		# cmac^=inp
 	$movkey	($key),$rndkey0
 
 .Lccm64_enc2_loop:
@@ -875,18 +873,20 @@ $code.=<<___;
 	aesenc	$rndkey0,$inout1
 	$movkey	0($key),$rndkey0
 	jnz	.Lccm64_enc2_loop
+	pshufb	$bswap_mask,$iv
 	aesenc	$rndkey1,$inout0
 	aesenc	$rndkey1,$inout1
+	paddq	$increment,$iv
 	aesenclast	$rndkey0,$inout0
 	aesenclast	$rndkey0,$inout1
 
-	paddq	$increment,$iv
 	dec	$len
 	lea	16($inp),$inp
 	xorps	$inout0,$in0			# inp ^= E(iv)
 	movdqa	$iv,$inout0
 	movups	$in0,($out)			# save output
 	lea	16($out),$out
+	pshufb	$bswap_mask,$iv
 	jnz	.Lccm64_enc_outer
 
 	movups	$inout1,($cmac)
@@ -919,39 +919,40 @@ $code.=<<___ if ($win64);
 .Lccm64_dec_body:
 ___
 $code.=<<___;
-	movdqu	($ivp),$iv
+	mov	240($key),$rounds		# key->rounds
+	movups	($ivp),$iv
 	movdqu	($cmac),$inout1
 	movdqa	.Lincrement64(%rip),$increment
 	movdqa	.Lbswap_mask(%rip),$bswap_mask
 
-	mov	240($key),$rounds		# key->rounds
-	movdqa	$iv,$inout0
-	pshufb	$bswap_mask,$iv			# keep iv in reverse order
+	movaps	$iv,$inout0
 	mov	$rounds,$rnds_
 	mov	$key,$key_
+	pshufb	$bswap_mask,$iv
 ___
 	&aesni_generate1("enc",$key,$rounds);
 $code.=<<___;
-.Lccm64_dec_outer:
-	paddq	$increment,$iv
 	movups	($inp),$in0			# load inp
-	xorps	$inout0,$in0
-	movdqa	$iv,$inout0
+	paddq	$increment,$iv
+	pshufb	$bswap_mask,$iv
 	lea	16($inp),$inp
-	pshufb	$bswap_mask,$inout0
-	mov	$key_,$key
+	jmp	.Lccm64_dec_outer
+.align	16
+.Lccm64_dec_outer:
+	xorps	$inout0,$in0			# inp ^= E(iv)
+	movdqa	$iv,$inout0
 	mov	$rnds_,$rounds
-	movups	$in0,($out)
+	movups	$in0,($out)			# save output
 	lea	16($out),$out
 
 	sub	\$1,$len
 	jz	.Lccm64_dec_break
 
-	$movkey	($key),$rndkey0
+	$movkey	($key_),$rndkey0
 	shr	\$1,$rounds
-	$movkey	16($key),$rndkey1
+	$movkey	16($key_),$rndkey1
 	xorps	$rndkey0,$in0
-	lea	32($key),$key
+	lea	32($key_),$key
 	xorps	$rndkey0,$inout0
 	xorps	$in0,$inout1			# cmac^=out
 	$movkey	($key),$rndkey0
@@ -966,15 +967,22 @@ $code.=<<___;
 	aesenc	$rndkey0,$inout1
 	$movkey	0($key),$rndkey0
 	jnz	.Lccm64_dec2_loop
+	movups	($inp),$in0			# load inp
+	pshufb	$bswap_mask,$iv			# byte-swap iv before the 64-bit add
+	paddq	$increment,$iv
 	aesenc	$rndkey1,$inout0
 	aesenc	$rndkey1,$inout1
+	pshufb	$bswap_mask,$iv
+	lea	16($inp),$inp
 	aesenclast	$rndkey0,$inout0
+	aesenclast	$rndkey0,$inout1
 	jmp	.Lccm64_dec_outer
 
 .align	16
 .Lccm64_dec_break:
+	#xorps	$in0,$inout1			# cmac^=out
 ___
-	&aesni_generate1("enc",$key,$rounds,$inout1);
+	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
 $code.=<<___;
 	movups	$inout1,($cmac)
 ___
diff --git a/crypto/modes/ccm128.c b/crypto/modes/ccm128.c
index 001fdff658..c9b35e5b35 100644
--- a/crypto/modes/ccm128.c
+++ b/crypto/modes/ccm128.c
@@ -356,10 +356,10 @@ int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
 		inp += n;
 		out += n;
 		len -= n;
+		if (len) ctr64_add(ctx->nonce.c,n/16);
 	}
 
 	if (len) {
-		if (n) ctr64_add(ctx->nonce.c,n/16);
 		for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
 		(*block)(ctx->cmac.c,ctx->cmac.c,key);
 		(*block)(ctx->nonce.c,scratch.c,key);
@@ -409,10 +409,10 @@ int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
 		inp += n;
 		out += n;
 		len -= n;
+		if (len) ctr64_add(ctx->nonce.c,n/16);
 	}
 
 	if (len) {
-		if (n) ctr64_add(ctx->nonce.c,n/16);
 		(*block)(ctx->nonce.c,scratch.c,key);
 		for (i=0; i<len; ++i)
 			ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);