3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # The module implements the "4-bit" GCM GHASH function and the underlying
13 # single multiplication operation in GF(2^128). "4-bit" means that it
14 # uses a 256-byte per-key table [+128-byte shared table]. Streamed
15 # GHASH performance was measured to be 6.67 cycles per processed byte
16 # on Itanium 2, which is >90% better than Microsoft compiler-generated
17 # code. For comparison, the sha1-ia64.pl module processes one
18 # byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
23 # It was originally thought that it made less sense to implement
24 # the "528B" variant on Itanium 2 for the following reason. Because the
25 # number of functional units is naturally limited, it appeared impossible
26 # to implement the "528B" loop in 4 cycles, only in 5. This would mean that
27 # theoretically the performance improvement couldn't be more than 20%.
28 # But occasionally you prove yourself wrong:-) I figured out a way to
29 # fold a couple of instructions and, having freed yet another instruction
30 # slot by unrolling the loop... The resulting performance is 4.45 cycles
31 # per processed byte and 50% better than the "256B" version. On the original
32 # Itanium, performance should remain the same as the "256B" version,
# Take the first command-line argument as the output file name; when it is a
# true value, reopen STDOUT onto that file and die if the open fails.
35 $output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
# Switch $ADDP to plain "add" when a "+DD64" or "-mlp64" flag appears among
# the arguments.  The flags are matched literally with an alternation; a
# bracketed character class here would instead accept any single character
# from the set followed by "64" (e.g. a stray "...D64" or "...p64").
39 for (@ARGV) { $ADDP="add" if (/\+DD64|\-mlp64/); }
40 } else { $ADDP="add"; }
# Byte order selection: an argument containing -DB_ENDIAN sets $big_endian
# to 1 and one containing -DL_ENDIAN sets it to 0.
41 for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
42 $big_endian=0 if (/\-DL_ENDIAN/); }
# Neither flag was seen: fall back to the byte order of the machine running
# this script.  Packing 1 as a big-endian 32-bit value ('N') and unpacking it
# in native order ('L') gives 1 only when the native order is big-endian.
43 if (!defined($big_endian))
44 { $big_endian=(unpack('L',pack('N',1))==1); }
48 my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
50 # The loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
51 # in a scalable manner;-) Naturally assuming data is in L1 cache...
52 # Special note about the 'dep' instruction, which is used to construct
53 # &rem_4bit[Zlo&0xf]. It works because rem_4bit is aligned on a 128-byte
54 # boundary and the lower 7 bits of its address are guaranteed to
58 { .mfi; (p18) ld8 Hlo=[Hi[1]],-8
59 (p19) dep rem=Zlo,rem_4bitp,3,4 }
60 { .mfi; (p19) xor Zhi=Zhi,Hhi
61 ($p17) xor xi[1]=xi[1],in[1] };;
62 { .mfi; (p18) ld8 Hhi=[Hi[1]]
63 (p19) shrp Zlo=Zhi,Zlo,4 }
64 { .mfi; (p19) ld8 rem=[rem]
65 (p18) and Hi[1]=mask0xf0,xi[2] };;
66 { .mmi; ($p16) ld1 in[0]=[inp],-1
68 (p19) shr.u Zhi=Zhi,4 }
69 { .mib; (p19) xor Hhi=Hhi,rem
70 (p18) add Hi[1]=Htbl,Hi[1] };;
72 { .mfi; (p18) ld8 Hlo=[Hi[1]],-8
73 (p18) dep rem=Zlo,rem_4bitp,3,4 }
74 { .mfi; (p17) shladd Hi[0]=xi[1],4,r0
75 (p18) xor Zhi=Zhi,Hhi };;
76 { .mfi; (p18) ld8 Hhi=[Hi[1]]
77 (p18) shrp Zlo=Zhi,Zlo,4 }
78 { .mfi; (p18) ld8 rem=[rem]
79 (p17) and Hi[0]=mask0xf0,Hi[0] };;
80 { .mmi; (p16) ld1 xi[0]=[Xi],-1
82 (p18) shr.u Zhi=Zhi,4 }
83 { .mib; (p18) xor Hhi=Hhi,rem
84 (p17) add Hi[0]=Htbl,Hi[0]
85 br.ctop.sptk $label };;
93 prevfs=r2; prevlc=r3; prevpr=r8;
95 rem=r22; rem_4bitp=r23;
102 .skip 16 // aligns loop body
103 .global gcm_gmult_4bit#
104 .proc gcm_gmult_4bit#
107 { .mmi; .save ar.pfs,prevfs
108 alloc prevfs=ar.pfs,2,6,0,8
109 $ADDP Xi=15,in0 // &Xi[15]
111 { .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
118 .rotr in[3],xi[3],Hi[2]
120 { .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
122 brp.loop.imp .Loop1,.Lend1-16};;
123 { .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
125 { .mii; shladd Hi[1]=xi[2],4,r0
128 { .mii; and Hi[1]=mask0xf0,Hi[1]
131 { .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
132 add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
138 { .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
139 { .mib; mux1 Zlo=Zlo,\@rev };;
140 { .mib; mux1 Zhi=Zhi,\@rev };;
141 { .mmi; add Hlo=9,Xi;; // ;; is here to prevent
142 add Hhi=1,Xi };; // pipeline flush on Itanium
143 { .mib; st8 [Hlo]=Zlo
144 mov pr=prevpr,0x1ffff };;
145 { .mib; st8 [Hhi]=Zhi
147 br.ret.sptk.many b0 };;
148 .endp gcm_gmult_4bit#
151 ######################################################################
152 # "528B" (well, "512B" actually) streamed GHASH
# Mnemonics interpolated into the generated code where it switches to
# big-endian ("$sum 1<<1") and back to little-endian ("$rum 1<<1").  For a
# big-endian build both are replaced by "nop.m", so no switch is emitted.
160 ($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
163 for (my $i=0;$i<8;$i++) {
165 { .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi
166 ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo
167 { .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi
168 ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo
170 $code.=shift if (($i+$#_)==7);
179 .skip 16 // aligns loop body
180 .global gcm_ghash_4bit#
181 .proc gcm_ghash_4bit#
184 { .mmi; .save ar.pfs,prevfs
185 alloc prevfs=ar.pfs,4,2,0,0
190 { .mfi; $ADDP r8=0+0,$Htbl
192 { .mfi; $ADDP r10=128+0,$Htbl
193 $ADDP r11=128+8,$Htbl };;
196 " $ADDP $Xip=15,$Xip", # &Xi[15]
197 " $ADDP $len=$len,$inp", # &inp[len]
198 " $ADDP $inp=15,$inp", # &inp[15]
199 " mov $mask0xff=0xff",
201 " andcm sp=sp,$mask0xff", # align stack frame
205 { .mmi; $sum 1<<1 // go big-endian
208 { .mmi; add r10=256+128+0,sp
210 add $len=-17,$len };;
212 for($i=0;$i<8;$i++) { # generate first half of Hshr4[]
213 my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
215 { .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo
216 st8 [r9]=$rhi,16 // Htable[$i].hi
217 shrp $rlo=$rhi,$rlo,4 }//;;
218 { .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo
219 stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi
220 shr.u $rhi=$rhi,4 };;
221 { .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4
222 st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4
226 { .mmi; ld8 r16=[r8],16 // Htable[8].lo
227 ld8 r17=[r9],16 };; // Htable[8].hi
228 { .mmi; ld8 r18=[r8],16 // Htable[9].lo
229 ld8 r19=[r9],16 } // Htable[9].hi
230 { .mmi; rum 1<<5 // clear um.mfh
231 shrp r16=r17,r16,4 };;
233 for($i=0;$i<6;$i++) { # generate second half of Hshr4[]
235 { .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo
236 ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi
237 shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
238 { .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
239 st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
240 shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
244 { .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
245 { .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
246 st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
247 shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
248 { .mmi; add $Htbl=256,sp // &Htable[0]
249 add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
250 shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };;
251 { .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4
252 st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4
# Register names used by the streamed-loop templates below:
#   $Alo/$Ahi - halves loaded through $Atbl (Htable[nlo] entry)
#   $Blo/$Bhi - halves loaded through $Btbl (Hshr4[nhi] entry; Htable[nhi]
#               in the final fragment)
#   $Zlo/$Zhi - the 128-bit accumulator Z
258 ($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
# Pointer registers holding the addresses of the table entries loaded above.
259 ($Atbl,$Btbl)=("r26","r27");
261 $code.=<<___; # (p16)
262 { .mmi; ld1 $in=[$inp],-1 //(p16) *inp--
263 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
264 cmp.eq p0,p6=r0,r0 };; // clear p6
266 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
268 $code.=<<___; # (p16),(p17)
269 { .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
270 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
271 { .mii; ld1 $in=[$inp],-1 //(p16) *inp--
272 dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo
273 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
277 (p6) st8 [$Xip]=$Zhi,13
279 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo
281 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
283 $code.=<<___; # (p16),(p17),(p18)
284 { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
285 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
286 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
287 { .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
288 dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
289 { .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
290 xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo
291 { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
292 ld1 $in=[$inp],-1 } //(p16) *inp--
293 { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
294 mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi
295 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
296 { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
297 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
298 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
299 { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
300 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
302 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
304 for ($i=1;$i<14;$i++) {
305 # Above and below fragments are derived from this one by removing
306 # unsuitable (p??) instructions.
307 $code.=<<___; # (p16),(p17),(p18),(p19)
308 { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
309 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
310 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
311 { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
312 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
313 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
314 { .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
315 ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
316 dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
317 { .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
318 xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
319 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
320 { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
321 ld1 $in=[$inp],-1 //(p16) *inp--
322 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
323 { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
324 xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
325 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
326 { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
327 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
328 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
329 { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
330 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
331 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
333 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
336 $code.=<<___; # (p17),(p18),(p19)
337 { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
338 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
339 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
340 { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
341 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
342 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
343 { .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
344 ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
345 dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo
346 { .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
347 xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
348 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
349 { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
350 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
351 { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
352 xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
353 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
354 { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
355 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
356 { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
357 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
358 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
360 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
362 $code.=<<___; # (p18),(p19)
363 { .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
364 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
365 { .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
366 xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo
367 { .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
368 xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo
369 { .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
370 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
371 { .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi
372 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
373 { .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4
374 xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi
375 { .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi
376 shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
377 { .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
378 xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48
380 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
382 $code.=<<___; # (p19)
383 { .mmi; cmp.ltu p6,p0=$inp,$len
385 shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
386 { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
387 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
388 add $Xip=9,$Xip };; // &Xi.lo
389 { .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
390 (p6) ld1 $in=[$inp],-1 //[p16] *inp--
391 (p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
392 { .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
393 (p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
394 { .mmi; st8 [$Xip]=$Zlo,-8
395 (p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
396 shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
398 (p6) ld1 $in=[$inp],-1 //[p16] *inp--
399 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
400 (p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
402 (p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
403 (p6) br.cond.dptk.many .LOOP };;
405 { .mib; st8 [$Xip]=$Zhi };;
406 { .mib; $rum 1<<1 // return to little-endian
409 br.ret.sptk.many b0 };;
410 .endp gcm_ghash_4bit#
414 .type rem_4bit#,\@object
416 data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
417 data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
418 data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
419 data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
421 .type rem_8bit#,\@object
423 data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
424 data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
425 data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
426 data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
427 data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
428 data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
429 data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
430 data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
431 data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
432 data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
433 data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
434 data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
435 data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
436 data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
437 data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
438 data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
439 data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
440 data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
441 data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
442 data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
443 data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
444 data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
445 data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
446 data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
447 data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
448 data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
449 data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
450 data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
451 data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
452 data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
453 data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
454 data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
456 stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
# For a big-endian build, rewrite every "mux1 <operands>@rev" instruction in
# the generated text to "nop.i 0x0", keeping the original whitespace after
# the mnemonic.
459 $code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);
# Expand each `...` span in the generated text: the text between the
# backticks is evaluated as a Perl expression and replaced by its value
# (used for computed register numbers and table indices in the templates).
460 $code =~ s/\`([^\`]*)\`/eval $1/gem;