3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # The module implements "4-bit" GCM GHASH function and underlying
13 # single multiplication operation in GF(2^128). "4-bit" means that it
14 # uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
15 # it processes one byte in 19 cycles, which is more than twice as fast
16 # as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for 8
17 # cycles, but measured performance on PA-8600 system is ~9 cycles per
18 # processed byte. This is ~2.2x faster than 64-bit code generated by
19 # vendor compiler (which used to be very hard to beat:-).
21 # Special thanks to polarhome.com for providing an HP-UX account.
25 open STDOUT,">$output";
27 if ($flavour =~ /64/) {
38 $LEVEL ="1.0"; #"\n\t.ALLOW\t2.0";
49 $FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
50 # [+ argument transfer]
52 ################# volatile registers
53 $Xi="%r26"; # argument block
57 $Hhh=$Htbl; # variables
66 ################# preserved registers
80 $rem2="%r6"; # used in PA-RISC 2.0 code
85 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
87 .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
91 .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
93 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
94 $PUSHMA %r3,$FRAME(%sp)
95 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
96 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
97 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
99 $code.=<<___ if ($SIZE_T==4);
100 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
101 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
102 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
103 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
104 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
110 andcm $rem_4bit,$rem,$rem_4bit
112 ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
115 $code.=<<___ if ($SIZE_T==4);
118 extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
127 and $mask0xf0,$nlo,$nhi
128 depd,z $nlo,59,4,$nlo
133 depd,z $Zll,60,4,$rem
134 shrpd $Zhh,$Zll,4,$Zll
135 extrd,u $Zhh,59,60,$Zhh
140 and $mask0xf0,$nlo,$nhi
141 depd,z $nlo,59,4,$nlo
145 ldd $rem($rem_4bit),$rem
151 xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
152 depd,z $Zll,60,4,$rem
154 shrpd $Zhh,$Zll,4,$Zll
155 extrd,u $Zhh,59,60,$Zhh
161 ldd $rem($rem_4bit),$rem
164 depd,z $Zll,60,4,$rem
167 shrpd $Zhh,$Zll,4,$Zll
168 extrd,u $Zhh,59,60,$Zhh
172 and $mask0xf0,$nlo,$nhi
173 depd,z $nlo,59,4,$nlo
174 ldd $rem($rem_4bit),$rem
177 addib,uv -1,$cnt,L\$oop_gmult_pa2
181 depd,z $Zll,60,4,$rem
183 shrpd $Zhh,$Zll,4,$Zll
184 extrd,u $Zhh,59,60,$Zhh
190 ldd $rem($rem_4bit),$rem
193 depd,z $Zll,60,4,$rem
195 shrpd $Zhh,$Zll,4,$Zll
196 extrd,u $Zhh,59,60,$Zhh
202 ldd $rem($rem_4bit),$rem
209 $code.=<<___ if ($SIZE_T==4);
219 and $mask0xf0,$nlo,$nhi
228 ldwx $rem($rem_4bit),$rem
229 shrpw $Zlh,$Zll,4,$Zll
231 shrpw $Zhl,$Zlh,4,$Zlh
233 shrpw $Zhh,$Zhl,4,$Zhl
235 extru $Zhh,27,28,$Zhh
238 and $mask0xf0,$nlo,$nhi
254 ldwx $rem($rem_4bit),$rem
255 shrpw $Zlh,$Zll,4,$Zll
257 shrpw $Zhl,$Zlh,4,$Zlh
261 shrpw $Zhh,$Zhl,4,$Zhl
264 extru $Zhh,27,28,$Zhh
271 shrpw $Zlh,$Zll,4,$Zll
272 ldwx $rem($rem_4bit),$rem
273 shrpw $Zhl,$Zlh,4,$Zlh
274 shrpw $Zhh,$Zhl,4,$Zhl
275 and $mask0xf0,$nlo,$nhi
276 extru $Zhh,27,28,$Zhh
283 addib,uv -1,$cnt,L\$oop_gmult_pa1
289 ldwx $rem($rem_4bit),$rem
290 shrpw $Zlh,$Zll,4,$Zll
292 shrpw $Zhl,$Zlh,4,$Zlh
295 shrpw $Zhh,$Zhl,4,$Zhl
298 extru $Zhh,27,28,$Zhh
305 ldwx $rem($rem_4bit),$rem
306 shrpw $Zlh,$Zll,4,$Zll
307 shrpw $Zhl,$Zlh,4,$Zlh
308 shrpw $Zhh,$Zhl,4,$Zhl
309 extru $Zhh,27,28,$Zhh
322 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
323 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
324 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
325 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
327 $code.=<<___ if ($SIZE_T==4);
328 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
329 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
330 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
331 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
332 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
337 $POPMB -$FRAME(%sp),%r3
340 .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
344 .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
346 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
347 $PUSHMA %r3,$FRAME(%sp)
348 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
349 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
350 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
352 $code.=<<___ if ($SIZE_T==4);
353 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
354 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
355 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
356 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
357 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
363 andcm $rem_4bit,$rem,$rem_4bit
365 ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
368 $code.=<<___ if ($SIZE_T==4);
371 extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
383 and $mask0xf0,$nlo,$nhi
384 depd,z $nlo,59,4,$nlo
389 depd,z $Zll,60,4,$rem
390 shrpd $Zhh,$Zll,4,$Zll
391 extrd,u $Zhh,59,60,$Zhh
398 and $mask0xf0,$nlo,$nhi
399 depd,z $nlo,59,4,$nlo
403 ldd $rem($rem_4bit),$rem
409 xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
410 depd,z $Zll,60,4,$rem2
412 shrpd $Zhh,$Zll,4,$Zll
413 extrd,u $Zhh,59,60,$Zhh
420 ldbx $cnt($inp),$byte
422 depd,z $Zll,60,4,$rem
423 shrpd $Zhh,$Zll,4,$Zll
424 ldd $rem2($rem_4bit),$rem2
431 and $mask0xf0,$nlo,$nhi
432 depd,z $nlo,59,4,$nlo
434 extrd,u $Zhh,59,60,$Zhh
437 ldd $rem($rem_4bit),$rem
438 addib,uv -1,$cnt,L\$oop_ghash_pa2
442 depd,z $Zll,60,4,$rem2
444 shrpd $Zhh,$Zll,4,$Zll
445 extrd,u $Zhh,59,60,$Zhh
452 depd,z $Zll,60,4,$rem
453 shrpd $Zhh,$Zll,4,$Zll
454 ldd $rem2($rem_4bit),$rem2
460 extrd,u $Zhh,59,60,$Zhh
463 ldd $rem($rem_4bit),$rem
469 cmpb,*<> $inp,$len,L\$outer_ghash_pa2
473 $code.=<<___ if ($SIZE_T==4);
486 and $mask0xf0,$nlo,$nhi
496 ldwx $rem($rem_4bit),$rem
497 shrpw $Zlh,$Zll,4,$Zll
499 shrpw $Zhl,$Zlh,4,$Zlh
501 shrpw $Zhh,$Zhl,4,$Zhl
503 extru $Zhh,27,28,$Zhh
507 and $mask0xf0,$nlo,$nhi
523 ldwx $rem($rem_4bit),$rem
524 shrpw $Zlh,$Zll,4,$Zll
526 shrpw $Zhl,$Zlh,4,$Zlh
530 shrpw $Zhh,$Zhl,4,$Zhl
531 ldbx $cnt($inp),$byte
534 extru $Zhh,27,28,$Zhh
541 shrpw $Zlh,$Zll,4,$Zll
542 ldwx $rem($rem_4bit),$rem
543 shrpw $Zhl,$Zlh,4,$Zlh
545 shrpw $Zhh,$Zhl,4,$Zhl
546 and $mask0xf0,$nlo,$nhi
547 extru $Zhh,27,28,$Zhh
554 addib,uv -1,$cnt,L\$oop_ghash_pa1
560 ldwx $rem($rem_4bit),$rem
561 shrpw $Zlh,$Zll,4,$Zll
563 shrpw $Zhl,$Zlh,4,$Zlh
566 shrpw $Zhh,$Zhl,4,$Zhl
569 extru $Zhh,27,28,$Zhh
576 ldwx $rem($rem_4bit),$rem
577 shrpw $Zlh,$Zll,4,$Zll
578 shrpw $Zhl,$Zlh,4,$Zlh
579 shrpw $Zhh,$Zhl,4,$Zhl
580 extru $Zhh,27,28,$Zhh
591 comb,<> $inp,$len,L\$outer_ghash_pa1
596 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
597 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
598 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
599 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
601 $code.=<<___ if ($SIZE_T==4);
602 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
603 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
604 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
605 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
606 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
611 $POPMB -$FRAME(%sp),%r3
616 .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
617 .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
618 .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
619 .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
620 .STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
624 # Explicitly encode PA-RISC 2.0 instructions used in this module, so
625 # that it can be compiled with .LEVEL 1.0. It should be noted that I
626 # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
630 my ($mod,$args) = @_;
631 my $orig = "ldd$mod\t$args";
633 if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
634 { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
635 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
637 elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
638 { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
639 $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
640 $opcode|=(1<<5) if ($mod =~ /^,m/);
641 $opcode|=(1<<13) if ($mod =~ /^,mb/);
642 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
648 my ($mod,$args) = @_;
649 my $orig = "std$mod\t$args";
651 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
652 { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
653 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
659 my ($mod,$args) = @_;
660 my $orig = "extrd$mod\t$args";
662 # I only have ",u" completer, it's implicitly encoded...
663 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
664 { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
666 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
667 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
668 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
670 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
671 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
673 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
674 $opcode |= (1<<13) if ($mod =~ /,\**=/);
675 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
681 my ($mod,$args) = @_;
682 my $orig = "shrpd$mod\t$args";
684 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
685 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
687 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
688 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
690 elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
691 { sprintf "\t.WORD\t0x%08x\t; %s",
692 (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
698 my ($mod,$args) = @_;
699 my $orig = "depd$mod\t$args";
701 # I only have ",z" completer, it's implicitly encoded...
702 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16
703 { my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
706 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos
707 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
708 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
714 my ($mnemonic,$mod,$args)=@_;
715 my $opcode = eval("\$$mnemonic");
717 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
720 foreach (split("\n",$code)) {
721 s/\`([^\`]*)\`/eval $1/ge;
723 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;