2 # Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # The module implements the GCM GHASH function and the underlying single
20 # multiplication operation in GF(2^128). Even though the subroutines
21 # have a _4bit suffix, they do not use any tables, but rely on
22 # hardware Galois Field Multiply support. Streamed GHASH processes
23 # a byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
24 # code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
25 # comparing apples vs. oranges, but compiler surely could have done
26 # better, because theoretical [though not necessarily achievable]
27 # estimate for "4-bit" table-driven implementation is ~12 cycles.
29 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
30 open STDOUT,">$output";
32 ($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6"); # arguments
34 ($Z0,$Z1,$Z2,$Z3, $H0, $H1, $H2, $H3,
35 $H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27));
36 ($H01u,$H01y,$H2u,$H3u, $H0y,$H1y,$H2y,$H3y,
37 $H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27));
38 ($FF000000,$E10000)=("B30","B31");
39 ($xip,$x0,$x1,$xib)=map("B$_",(6..9)); # $xip zaps $len
41 ($rem,$res)=("B4","B5"); # $rem zaps $Htable
46 .if .ASSEMBLER_VERSION<7000000
50 .asg gcm_gmult_1bit,_gcm_gmult_1bit
51 .asg gcm_gmult_4bit,_gcm_gmult_4bit
52 .asg gcm_ghash_4bit,_gcm_ghash_4bit
58 .global _gcm_gmult_1bit
60 ADDAD $Htable,2,$Htable
62 .global _gcm_gmult_4bit
65 LDDW *${Htable}[-1],$H1:$H0 ; H.lo
66 LDDW *${Htable}[-2],$H3:$H2 ; H.hi
67 || MV $Xip,${xip} ; reassign Xi
68 || MVK 15,B1 ; SPLOOPD constant
71 || LDBU *++${xip}[15],$x1 ; Xi[15]
73 || LDBU *--${xip},$x0 ; Xi[14]
74 SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
75 SHL $FF000000,24,$FF000000 ; upper byte mask
77 || MVK 1,B0 ; take a single spin
79 PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
80 AND $H2,$FF000000,$H2u ; H2's upper byte
81 AND $H3,$FF000000,$H3u ; H3's upper byte
89 .global _gcm_ghash_4bit
92 LDDW *${Htable}[-1],$H1:$H0 ; H.lo
93 || SHRU $len,4,B0 ; reassign len
94 LDDW *${Htable}[-2],$H3:$H2 ; H.hi
95 || MV $Xip,${xip} ; reassign Xi
96 || MVK 15,B1 ; SPLOOPD constant
99 || [B0] LDNDW *${inp}[1],$H1x:$H0x
101 || [B0] LDNDW *${inp}++[2],$H3x:$H2x
102 SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
103 || LDDW *${xip}[1],$Z1:$Z0
104 SHL $FF000000,24,$FF000000 ; upper byte mask
105 || LDDW *${xip}[0],$Z3:$Z2
107 PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
108 AND $H2,$FF000000,$H2u ; H2's upper byte
109 AND $H3,$FF000000,$H3u ; H3's upper byte
114 || [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
115 || [B0] XOR $H1x,$Z1,$Z1
117 [B0] XOR $H2x,$Z2,$Z2
118 || [B0] XOR $H3x,$Z3,$Z3
119 || [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
120 STDW $Z1:$Z0,*${xip}[1]
121 || [B0] SHRU $Z1,16,$x0 ; Xi[14]
124 [B0] XOR $H2x,$Z2,$Z2
125 || [B0] XOR $H3x,$Z3,$Z3
126 || [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
127 STDW $Z1:$Z0,*${xip}[1]
128 || [B0] SHRU $Z0,8,$x0 ; Xi[14]
131 STDW $Z3:$Z2,*${xip}[0]
141 || ADD $x1,$x1,$xib ; SHL $x1,1,$xib
145 ########____________________________
151 # 5 S2 S1x L1 D2 L2 |____________________________
152 # 6/0 L1 S1 L2 S2x |D2. M1 M2 |
153 # 7/1 L1 S1 D1x S2 M2 | M1 |
154 # 8/2 S1 L1x S2 | M1 M2 |
155 # 9/3 S1 L1x | D1. M1 M2 |
156 # 10/4 D1x | S1. L1 |
157 # 11/5 |S2 S1x L1 D2 L2 |____________
158 # 12/6/0 D1x __| L1 S1 L2 S2x |D2. ....
159 # 7/1 L1 S1 D1x S2 M2 | ....
160 # 8/2 S1 L1x S2 | ....
161 #####... ................|............
163 XORMPY $H0,$xia,$H0x ; 0 ; H·(Xi[i]<<1)
164 || XORMPY $H01u,$xib,$H01y
165 || [A0] LDBU *--${xip},$x0
166 XORMPY $H1,$xia,$H1x ; 1
167 XORMPY $H2,$xia,$H2x ; 2
168 || XORMPY $H2u,$xib,$H2y
169 XORMPY $H3,$xia,$H3x ; 3
170 || XORMPY $H3u,$xib,$H3y
171 ||[!A0] MVK.D 15,A0 ; *--${xip} counter
172 XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H·(Xi[i]<<1)
173 || [A0] SUB.S A0,1,A0
174 XOR.L $H1x,$Z1,$Z1 ; 5
175 || AND.D $H01y,$FF000000,$H0z
176 || SWAP2.L $H01y,$H1y ; ; SHL $H01y,16,$H1y
180 XOR.L $H2x,$Z2,$Z2 ; 6/0 ; [0,0] in epilogue
181 || SHL $Z0,1,$rem ; ; rem=Z<<1
182 || SHRMB.S $Z1,$Z0,$Z0 ; ; Z>>=8
183 || AND.L $H1y,$FF000000,$H1z
184 XOR.L $H3x,$Z3,$Z3 ; 7/1
185 || SHRMB.S $Z2,$Z1,$Z1
186 || XOR.D $H0z,$Z0,$Z0 ; merge upper byte products
187 || AND.S $H2y,$FF000000,$H2z
188 || XORMPY $E10000,$rem,$res ; ; implicit rem&0x1FE
189 XOR.L $H1z,$Z1,$Z1 ; 8/2
190 || SHRMB.S $Z3,$Z2,$Z2
191 || AND.S $H3y,$FF000000,$H3z
192 XOR.L $H2z,$Z2,$Z2 ; 9/3
194 XOR.D $H3z,$Z3,$Z3 ; 10/4
198 || XOR.D $res,$Z3,$Z3 ; 12/6/0; Z^=res
200 ; input pre-fetch is possible where D1 slot is available...
201 [B0] LDNDW *${inp}[1],$H1x:$H0x ; 8/-
202 [B0] LDNDW *${inp}++[2],$H3x:$H2x ; 9/-
214 || [B0] BNOP ghash_loop?
215 [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
216 || [B0] XOR $H1x,$Z1,$Z1
217 [B0] XOR $H2x,$Z2,$Z2
218 || [B0] XOR $H3x,$Z3,$Z3
219 || [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
220 STDW $Z1:$Z0,*${xip}[1]
221 || [B0] SHRU $Z1,16,$x0 ; Xi[14]
225 [B0] BNOP ghash_loop? ; 12/-
226 [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
227 || [B0] XOR $H1x,$Z1,$Z1
228 [B0] XOR $H2x,$Z2,$Z2
229 || [B0] XOR $H3x,$Z3,$Z3
230 || [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
231 STDW $Z1:$Z0,*${xip}[1]
232 || [B0] SHRU $Z0,8,$x0 ; Xi[14]
235 STDW $Z3:$Z2,*${xip}[0]
242 .cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"