--- /dev/null
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements "4-bit" Galois field multiplication and a
+# streamed GHASH function. "4-bit" means that it uses a 256-byte
+# per-key table [+128-byte shared table]. Streamed GHASH performance
+# was measured to be 6.35 cycles per processed byte on Itanium 2,
+# which is >90% better than Microsoft compiler-generated code. Strictly
+# speaking the number should have been ~6.5; the deviation has
+# everything to do with the way performance is measured, namely as the
+# difference between GCM and straightforward 128-bit counter mode. To
+# anchor to something else, the sha1-ia64.pl module processes one byte
+# in 6.0 cycles. On Itanium GHASH should run at ~8.5 cycles per byte.
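+#
+# For reference, below is a compact C sketch of the algorithm this
+# module implements (a sketch only: names and the BSWAP8 store are
+# illustrative, not OpenSSL's exact code). Htable[] is the 256-byte
+# per-key table with Htable[n] = n*H in GF(2^128), and rem_4bit[] is
+# the 128-byte shared reduction table defined at the end of this file:
+#
+#	#include <stdint.h>
+#
+#	typedef struct { uint64_t hi, lo; } u128;
+#
+#	void gmult_4bit(uint64_t Xi[2], const u128 Htable[16],
+#	                const uint64_t rem_4bit[16])
+#	{
+#	    const uint8_t *xi = (const uint8_t *)Xi;
+#	    uint64_t rem, nlo = xi[15] & 0xf, nhi = xi[15] >> 4;
+#	    u128 Z = Htable[nlo];	/* fold in lowest nibble */
+#	    int cnt = 15;
+#
+#	    while (1) {
+#	        rem  = Z.lo & 0xf;	/* nibble shifted out of Zlo... */
+#	        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+#	        Z.hi = (Z.hi >> 4) ^ rem_4bit[rem]; /* ...reduced into Zhi */
+#	        Z.hi ^= Htable[nhi].hi;	/* fold in high nibble */
+#	        Z.lo ^= Htable[nhi].lo;
+#
+#	        if (--cnt < 0) break;
+#
+#	        nlo = xi[cnt] & 0xf; nhi = xi[cnt] >> 4;
+#	        rem  = Z.lo & 0xf;
+#	        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+#	        Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];
+#	        Z.hi ^= Htable[nlo].hi;	/* fold in low nibble */
+#	        Z.lo ^= Htable[nlo].lo;
+#	    }
+#	    Xi[0] = BSWAP8(Z.hi);	/* byte-swap on little-endian hosts */
+#	    Xi[1] = BSWAP8(Z.lo);
+#	}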
+
+$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
+
+if ($^O eq "hpux") {
+ $ADDP="addp4";
+	for (@ARGV) { $ADDP="add" if (/(\+DD|\-mlp)64/); }
+} else { $ADDP="add"; }
+for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
+ $big_endian=0 if (/\-DL_ENDIAN/); }
+if (!defined($big_endian))
+ { $big_endian=(unpack('L',pack('N',1))==1); }
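+# (pack('N',1) encodes 1 as big-endian bytes and unpack('L') re-reads
+# them in native order, so the comparison holds only on a big-endian
+# host)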
+
+sub loop() {
+my $label=shift;
+my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
+
+# The loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium,
+# i.e. in a scalable manner;-) Naturally assuming data in L1 cache...
+# Special note about the 'dep' instruction, which is used to construct
+# &rem_4bit[Zlo&0xf]. It works because rem_4bit is aligned at a
+# 128-byte boundary and the lower 7 bits of its address are guaranteed
+# to be zero.
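+# In C terms, 'dep rem=Zlo,rem_4bitp,3,4' computes
+#	rem = (rem_4bitp & ~(uint64_t)0x78) | ((Zlo & 0xf) << 3);
+# and because those low address bits are zero the result is exactly
+# &rem_4bit[Zlo&0xf], the index pre-scaled by the 8-byte entry size.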
+$code.=<<___;
+$label:
+{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
+ (p19) dep rem=Zlo,rem_4bitp,3,4 }
+{ .mfi; (p19) xor Zhi=Zhi,Hhi
+ ($p17) xor xi[1]=xi[1],in[1] };;
+{ .mfi; (p18) ld8 Hhi=[Hi[1]]
+ (p19) shrp Zlo=Zhi,Zlo,4 }
+{ .mfi; (p19) ld8 rem=[rem]
+ (p18) and Hi[1]=mask0xf0,xi[2] };;
+{ .mmi; ($p16) ld1 in[0]=[inp],-1
+ (p18) xor Zlo=Zlo,Hlo
+ (p19) shr.u Zhi=Zhi,4 }
+{ .mib; (p19) xor Hhi=Hhi,rem
+ (p18) add Hi[1]=Htbl,Hi[1] };;
+
+{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
+ (p18) dep rem=Zlo,rem_4bitp,3,4 }
+{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0
+ (p18) xor Zhi=Zhi,Hhi };;
+{ .mfi; (p18) ld8 Hhi=[Hi[1]]
+ (p18) shrp Zlo=Zhi,Zlo,4 }
+{ .mfi; (p18) ld8 rem=[rem]
+ (p17) and Hi[0]=mask0xf0,Hi[0] };;
+{ .mmi; (p16) ld1 xi[0]=[Xi],-1
+ (p18) xor Zlo=Zlo,Hlo
+ (p18) shr.u Zhi=Zhi,4 }
+{ .mib; (p18) xor Hhi=Hhi,rem
+ (p17) add Hi[0]=Htbl,Hi[0]
+ br.ctop.sptk $label };;
+___
+}
+
+$code=<<___;
+.explicit
+.text
+
+prevfs=r2; prevlc=r3; prevpr=r8;
+mask0xf0=r21;
+rem=r22; rem_4bitp=r23;
+Xi=r24; Htbl=r25;
+inp=r26; end=r27;
+Hhi=r28; Hlo=r29;
+Zhi=r30; Zlo=r31;
+
+.global gcm_gmult_4bit#
+.proc gcm_gmult_4bit#
+.align 128
+.skip 16;; // aligns loop body
+gcm_gmult_4bit:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,2,6,0,8
+ $ADDP Xi=15,in0 // &Xi[15]
+ mov rem_4bitp=ip }
+{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
+ .save ar.lc,prevlc
+ mov prevlc=ar.lc
+ .save pr,prevpr
+ mov prevpr=pr };;
+
+ .body
+ .rotr in[3],xi[3],Hi[2]
+
+{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
+ mov mask0xf0=0xf0
+ brp.loop.imp .Loop1,.Lend1-16};;
+{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
+ };;
+{ .mii; shladd Hi[1]=xi[2],4,r0
+ mov pr.rot=0x7<<16
+ mov ar.lc=13 };;
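+// pr.rot=0x7<<16 presets p16-p18, so the software pipeline enters
+// pre-filled with the two Xi bytes loaded above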
+{ .mii; and Hi[1]=mask0xf0,Hi[1]
+ mov ar.ec=3
+ xor Zlo=Zlo,Zlo };;
+{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
+ add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
+ xor Zhi=Zhi,Zhi };;
+___
+ &loop (".Loop1",1);
+$code.=<<___;
+.Lend1:
+{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
+{ .mib; mux1 Zlo=Zlo,\@rev };;
+{ .mib; mux1 Zhi=Zhi,\@rev };;
+{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent
+ add Hhi=1,Xi };; // pipeline flush on Itanium
+{ .mib; st8 [Hlo]=Zlo
+ mov pr=prevpr,-2 };;
+{ .mib; st8 [Hhi]=Zhi
+ mov ar.lc=prevlc
+ br.ret.sptk.many b0 };;
+.endp gcm_gmult_4bit#
+
+.global gcm_ghash_4bit#
+.proc gcm_ghash_4bit#
+.align 32;;
+gcm_ghash_4bit:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,4,4,0,8
+ $ADDP inp=15,in0 // &inp[15]
+ mov rem_4bitp=ip }
+{ .mmi; $ADDP end=in1,in0 // &inp[len]
+ $ADDP Xi=15,in2 // &Xi[15]
+ .save ar.lc,prevlc
+ mov prevlc=ar.lc };;
+{ .mmi; $ADDP Htbl=8,in3 // &Htbl[0].lo
+ mov mask0xf0=0xf0
+ .save pr,prevpr
+ mov prevpr=pr }
+
+ .body
+ .rotr in[3],xi[3],Hi[2]
+
+{ .mmi; ld1 in[2]=[inp],-1 // inp[15]
+ ld1 xi[2]=[Xi],-1 // Xi[15]
+ add end=-17,end };;
+{ .mmi; ld1 in[1]=[inp],-1 // inp[14]
+ ld1 xi[1]=[Xi],-1 // Xi[14]
+ xor xi[2]=xi[2],in[2] };;
+{ .mii; shladd Hi[1]=xi[2],4,r0
+ mov pr.rot=0x7<<16
+ mov ar.lc=13 };;
+{ .mii; and Hi[1]=mask0xf0,Hi[1]
+ mov ar.ec=3
+ xor Zlo=Zlo,Zlo };;
+{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
+ add rem_4bitp=rem_4bit#-gcm_ghash_4bit#,rem_4bitp
+ xor Zhi=Zhi,Zhi };;
+___
+ &loop (".LoopN");
+$code.=<<___;
+{ .mib; xor Zhi=Zhi,Hhi // modulo-scheduling artefact
+ extr.u xi[2]=Zlo,0,8 } // Xi[15]
+{ .mib; cmp.ltu p6,p0=inp,end // are we done?
+ add inp=32,inp // advance inp
+ clrrrb.pr };;
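+// clrrrb.pr resets the rotating-predicate register base before
+// the loop is re-entered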
+{ .mii;
+(p6) ld1 in[2]=[inp],-1 // inp[15]
+(p6) extr.u xi[1]=Zlo,8,8 // Xi[14]
+(p6) mov ar.lc=13 };;
+{ .mii;
+(p6) ld1 in[1]=[inp],-1 // inp[14]
+(p6) mov ar.ec=3
+ mux1 Zlo=Zlo,\@rev };;
+{ .mii;
+(p6) xor xi[2]=xi[2],in[2]
+ mux1 Zhi=Zhi,\@rev };;
+{ .mii;
+(p6) shladd Hi[1]=xi[2],4,r0
+ add Hlo=9,Xi // Xi is &Xi[-1]
+ add Hhi=1,Xi };;
+{ .mii;
+(p6) and Hi[1]=mask0xf0,Hi[1]
+(p6) add Xi=14,Xi // &Xi[13]
+(p6) mov pr.rot=0x7<<16 };;
+
+{ .mii; st8 [Hlo]=Zlo
+(p6) xor Zlo=Zlo,Zlo
+(p6) add Hi[1]=Htbl,Hi[1] };;
+{ .mib; st8 [Hhi]=Zhi
+(p6) xor Zhi=Zhi,Zhi
+(p6) br.cond.dptk.many .LoopN };;
+
+{ .mib; mov pr=prevpr,-2 }
+{ .mib; mov ar.lc=prevlc
+ br.ret.sptk.many b0 };;
+.endp gcm_ghash_4bit#
+
+.align 128;;
+.type rem_4bit#,\@object
+rem_4bit:
+ data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
+ data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
+ data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
+ data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
+.size rem_4bit#,128
+stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
+___
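+
+# The rem_4bit table holds the reduction constants folded into Zhi for
+# each possible nibble shifted out of Zlo. Entry n is linear in n, so
+# the whole table can be generated as in the following C sketch
+# (illustrative only; the table is of course hard-coded above):
+#
+#	uint64_t rem_4bit[16];
+#	for (int n = 0; n < 16; n++) {
+#	    uint64_t r = 0;
+#	    for (int i = 0; i < 4; i++)
+#	        if (n & (1 << i))
+#	            r ^= (uint64_t)0xE100 >> (3 - i);
+#	    rem_4bit[n] = r << 48;	/* e.g. rem_4bit[1] = 0x1C20<<48 */
+#	}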
+
+$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);
+
+print $code;
+close STDOUT;
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
+# March 2010
+#
# The module implements "4-bit" Galois field multiplication and
# streamed GHASH function. "4-bit" means that it uses 256 bytes
-# per-key table [+128/256 bytes fixed table]. It has two code paths:
+# per-key table [+64/128 bytes fixed table]. It has two code paths:
# vanilla x86 and vanilla MMX. Former will be executed on 486 and
# Pentium, latter on all others. Performance results are for streamed
# GHASH subroutine and are expressed in cycles per processed byte,
#		gcc 2.95.3(*)	MMX assembler	x86 assembler
#
# Pentium	100/112(**)	-		50
-# PIII		63 /77		17		24
-# P4		96 /122		33		84(***)
-# Opteron	50 /71		22		30
-# Core2	63 /102		21		28
+# PIII		63 /77		16		24
+# P4		96 /122		30		84(***)
+# Opteron	50 /71		21		30
+# Core2	63 /102		19		28
#
# (*) gcc 3.4.x was observed to generate a few percent slower code,
-# which is one of reasons why 2.95.3 result were chosen;
+# which is one of the reasons why 2.95.3 results were chosen,
# another reason is lack of 3.4.x results for older CPUs;
# (**) second number is result for code compiled with -fPIC flag,
# which is actually more relevant, because assembler code is
# position-independent;
# (***) see comment in non-MMX routine for further details;
#
# To summarize, it's 2-3 times faster than gcc-generated code. To
-# anchor it to something else SHA1 assembler processes single byte
-# in 11-13 cycles.
+# anchor it to something else, SHA1 assembler processes one byte in
+# 11-13 cycles on contemporary x86 cores.
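+#
+# For reference, the 256-byte per-key table that both code paths index
+# holds Htable[n] = n*H in GF(2^128) (bit-reflected representation).
+# Below is a C sketch of how such a table can be built by repeated
+# halving plus linearity; names are illustrative, and the real table
+# is built by the caller before these routines run:
+#
+#	#include <stdint.h>
+#
+#	typedef struct { uint64_t hi, lo; } u128;
+#
+#	static void mul_by_x(u128 *V)	/* reflected bit order */
+#	{
+#	    uint64_t T = 0xe100000000000000ULL & (0 - (V->lo & 1));
+#	    V->lo = (V->hi << 63) | (V->lo >> 1);
+#	    V->hi = (V->hi >> 1) ^ T;
+#	}
+#
+#	void init_4bit(u128 Htable[16], const uint64_t H[2])
+#	{
+#	    u128 V = { H[0], H[1] };
+#	    Htable[0].hi = Htable[0].lo = 0;
+#	    Htable[8] = V;			/* nibble 8 is H itself */
+#	    mul_by_x(&V); Htable[4] = V;
+#	    mul_by_x(&V); Htable[2] = V;
+#	    mul_by_x(&V); Htable[1] = V;
+#	    for (int i = 2; i < 16; i <<= 1)	/* the rest by linearity */
+#	        for (int j = 1; j < i; j++) {
+#	            Htable[i+j].hi = Htable[i].hi ^ Htable[j].hi;
+#	            Htable[i+j].lo = Htable[i].lo ^ Htable[j].lo;
+#	        }
+#	}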
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
$unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse
# than unrolled, which has to be weighted against
- # almost 2x code size reduction. Well, *overall*
- # code size. x86-specific code shrinks by 7.5x...
+ # 1.7x code size reduction. Well, *overall* 1.7x,
+ # x86-specific code itself shrinks by 2.5x...
sub mmx_loop() {
-# MMX version performs 2.5 times better on P4 (see comment in non-MMX
-# routine for further details), 35% better on Opteron and Core2, 40%
-# better on PIII... In other words effort is considered to be well
+# MMX version performs 2.8 times better on P4 (see comment in non-MMX
+# routine for further details), 40% better on Opteron, 50% better
+# on PIII and Core2... In other words effort is considered to be well
# spent...
my $inp = shift;
my $rem_4bit = shift;
&xor ($nlo,$nlo); # avoid partial register stalls on PIII
&mov ($nhi,$Zll);
&mov (&LB($nlo),&LB($nhi));
- &mov ($cnt,15);
+ &mov ($cnt,14);
&shl (&LB($nlo),4);
&and ($nhi,0xf0);
&movq ($Zlo,&QWP(8,$Htbl,$nlo));
&set_label("mmx_loop",16);
&psrlq ($Zlo,4);
&and ($rem,0xf);
+ &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
&movq ($tmp,$Zhi);
&psrlq ($Zhi,4);
+ &mov (&LB($nlo),&BP(0,$inp,$cnt));
&dec ($cnt);
- &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
&psllq ($tmp,60);
&pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
&movd ($rem,$Zlo);
&pxor ($Zhi,&QWP(0,$Htbl,$nhi));
+ &mov ($nhi,$nlo);
&pxor ($Zlo,$tmp);
&js (&label("mmx_break"));
- &movz ($nhi,&BP(0,$inp,$cnt));
+ &shl (&LB($nlo),4);
+ &and ($rem,0xf);
&psrlq ($Zlo,4);
- &mov (&LB($nlo),&LB($nhi));
+ &and ($nhi,0xf0);
&movq ($tmp,$Zhi);
- &shl (&LB($nlo),4);
&psrlq ($Zhi,4);
- &and ($rem,0xf);
&pxor ($Zlo,&QWP(8,$Htbl,$nlo));
&psllq ($tmp,60);
&pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
&movd ($rem,$Zlo);
&pxor ($Zhi,&QWP(0,$Htbl,$nlo));
&pxor ($Zlo,$tmp);
- &and ($nhi,0xf0);
&jmp (&label("mmx_loop"));
&set_label("mmx_break",16);
+ &shl (&LB($nlo),4);
+ &and ($rem,0xf);
+ &psrlq ($Zlo,4);
+ &and ($nhi,0xf0);
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
+ &psllq ($tmp,60);
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+ &movd ($rem,$Zlo);
+ &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
+ &pxor ($Zlo,$tmp);
+
+ &psrlq ($Zlo,4);
+ &and ($rem,0xf);
+ &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &psllq ($tmp,60);
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+ &movd ($rem,$Zlo);
+ &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
+ &mov ($nhi,$nlo);
+ &pxor ($Zlo,$tmp);
+
&psrlq ($Zlo,32); # lower part of Zlo is already there
&movd ($Zhl,$Zhi);
&psrlq ($Zhi,32);