3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # The module implements "4-bit" GCM GHASH function and underlying
13 # single multiplication operation in GF(2^128). "4-bit" means that it
14 # uses a 256-byte per-key table [+128-byte shared table]. Even though
15 # loops are aggressively modulo-scheduled with respect to references to
16 # Htbl and Z.hi updates for 8 cycles per byte, measured performance is
17 # ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
18 # scheduling "glitch," because uprofile(1) indicates uniform sample
19 # distribution, as if all instruction bundles execute in 1.5 cycles.
20 # Meaning that it could have been even faster, yet 12 cycles is ~60%
21 # better than gcc-generated code and ~80% better than code generated by vendor
34 $Xi="a0"; # $16, input argument block
69 s8addq $remp,$rem_4bit,$remp
97 s8addq $remp,$rem_4bit,$remp
115 s8addq $remp,$rem_4bit,$remp
139 s8addq $remp,$rem_4bit,$remp
156 s8addq $remp,$rem_4bit,$remp
181 s8addq $remp,$rem_4bit,$remp
199 s8addq $remp,$rem_4bit,$remp
222 s8addq $remp,$rem_4bit,$remp
235 s8addq $remp,$rem_4bit,$remp
255 .globl gcm_gmult_4bit
266 .Lpic1: lda $rem_4bit,rem_4bit-.Lpic1($rem_4bit)
272 srl $Zlo,24,$t0 # byte swap
280 zapnot $Zlo,0x88,$Zlo
299 zapnot $Zhi,0x88,$Zhi
321 .globl gcm_ghash_4bit
341 .Lpic2: lda $rem_4bit,rem_4bit-.Lpic2($rem_4bit)
344 extql $inhi,$inp,$inhi
345 extqh $Thi0,$inp,$Thi0
349 extql $inlo,$inp,$inlo
350 extqh $Tlo0,$inp,$Tlo0
361 srl $Zlo,24,$t0 # byte swap
369 zapnot $Zlo,0x88,$Zlo
390 zapnot $Zhi,0x88,$Zhi
411 zapnot $Zhi,0x88,$Zhi
436 .quad 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
437 .quad 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
438 .quad 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
439 .quad 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
440 .asciiz "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
444 $output=shift and (open STDOUT,">",$output or die "can't open $output: $!"); # optional output file: 3-arg open keeps the name out of the mode string; die instead of silently losing the generated code