2 # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # The module implements "4-bit" GCM GHASH function and underlying
20 # single multiplication operation in GF(2^128). "4-bit" means that it
21 # uses 256 bytes per-key table [+128 bytes shared table]. Streamed
22 # GHASH performance was measured to be 6.67 cycles per processed byte
23 # on Itanium 2, which is >90% better than Microsoft compiler-generated
24 # code. To anchor to something else, the sha1-ia64.pl module processes one
25 # byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
30 # It was originally thought that it made less sense to implement the
31 # "528B" variant on Itanium 2, for the following reason: because the
32 # number of functional units is naturally limited, it appeared impossible
33 # to implement the "528B" loop in 4 cycles, only in 5. This would mean
34 # that, theoretically, the performance improvement couldn't be more than
35 # 20%. But occasionally you prove yourself wrong:-) I figured out a way
36 # to fold a couple of instructions and free yet another instruction slot
37 # by unrolling the loop... The resulting performance is 4.45 cycles per
38 # processed byte, 50% better than the "256B" version. On the original
39 # Itanium performance should remain the same as the "256B" version,
42 $output=pop and (open STDOUT,'>',$output or die "can't open $output: $!"); # last argument names the output file; 3-arg open keeps the name from being parsed for mode characters
46 for (@ARGV) { $ADDP="add" if (/(?:\+DD|\-mlp)64/); } # "+DD64" or "-mlp64" among the arguments selects plain "add" (alternation, not a character class)
47 } else { $ADDP="add"; }
48 for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); # an argument containing -DB_ENDIAN selects big-endian output
49 $big_endian=0 if (/\-DL_ENDIAN/); } # -DL_ENDIAN selects little-endian; the last matching argument wins
50 if (!defined($big_endian)) # no value chosen yet: fall back to probing the host byte order
51 { $big_endian=(unpack('L',pack('N',1))==1); } # pack 'N' is big-endian; native 'L' reads it back as 1 only on a big-endian host
55 my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
57 # Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
58 # in a scalable manner;-) Naturally assuming data in L1 cache...
59 # Special note about the 'dep' instruction, which is used to construct
60 # &rem_4bit[Zlo&0xf]. It works because rem_4bit is aligned at a 128-byte
61 # boundary and the lower 7 bits of its address are guaranteed to
65 { .mfi; (p18) ld8 Hlo=[Hi[1]],-8
66 (p19) dep rem=Zlo,rem_4bitp,3,4 }
67 { .mfi; (p19) xor Zhi=Zhi,Hhi
68 ($p17) xor xi[1]=xi[1],in[1] };;
69 { .mfi; (p18) ld8 Hhi=[Hi[1]]
70 (p19) shrp Zlo=Zhi,Zlo,4 }
71 { .mfi; (p19) ld8 rem=[rem]
72 (p18) and Hi[1]=mask0xf0,xi[2] };;
73 { .mmi; ($p16) ld1 in[0]=[inp],-1
75 (p19) shr.u Zhi=Zhi,4 }
76 { .mib; (p19) xor Hhi=Hhi,rem
77 (p18) add Hi[1]=Htbl,Hi[1] };;
79 { .mfi; (p18) ld8 Hlo=[Hi[1]],-8
80 (p18) dep rem=Zlo,rem_4bitp,3,4 }
81 { .mfi; (p17) shladd Hi[0]=xi[1],4,r0
82 (p18) xor Zhi=Zhi,Hhi };;
83 { .mfi; (p18) ld8 Hhi=[Hi[1]]
84 (p18) shrp Zlo=Zhi,Zlo,4 }
85 { .mfi; (p18) ld8 rem=[rem]
86 (p17) and Hi[0]=mask0xf0,Hi[0] };;
87 { .mmi; (p16) ld1 xi[0]=[Xi],-1
89 (p18) shr.u Zhi=Zhi,4 }
90 { .mib; (p18) xor Hhi=Hhi,rem
91 (p17) add Hi[0]=Htbl,Hi[0]
92 br.ctop.sptk $label };;
100 prevfs=r2; prevlc=r3; prevpr=r8;
102 rem=r22; rem_4bitp=r23;
109 .skip 16 // aligns loop body
110 .global gcm_gmult_4bit#
111 .proc gcm_gmult_4bit#
114 { .mmi; .save ar.pfs,prevfs
115 alloc prevfs=ar.pfs,2,6,0,8
116 $ADDP Xi=15,in0 // &Xi[15]
118 { .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
125 .rotr in[3],xi[3],Hi[2]
127 { .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
129 brp.loop.imp .Loop1,.Lend1-16};;
130 { .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
132 { .mii; shladd Hi[1]=xi[2],4,r0
135 { .mii; and Hi[1]=mask0xf0,Hi[1]
138 { .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
139 add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
145 { .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
146 { .mib; mux1 Zlo=Zlo,\@rev };;
147 { .mib; mux1 Zhi=Zhi,\@rev };;
148 { .mmi; add Hlo=9,Xi;; // ;; is here to prevent
149 add Hhi=1,Xi };; // pipeline flush on Itanium
150 { .mib; st8 [Hlo]=Zlo
151 mov pr=prevpr,0x1ffff };;
152 { .mib; st8 [Hhi]=Zhi
154 br.ret.sptk.many b0 };;
155 .endp gcm_gmult_4bit#
158 ######################################################################
159 # "528B" (well, "512B" actually) streamed GHASH
167 ($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum"); # big-endian builds substitute nop.m wherever the templates use $sum/$rum
170 for (my $i=0;$i<8;$i++) {
172 { .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi
173 ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo
174 { .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi
175 ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo
177 $code.=shift if (($i+$#_)==7);
186 .skip 16 // aligns loop body
187 .global gcm_ghash_4bit#
188 .proc gcm_ghash_4bit#
191 { .mmi; .save ar.pfs,prevfs
192 alloc prevfs=ar.pfs,4,2,0,0
197 { .mfi; $ADDP r8=0+0,$Htbl
199 { .mfi; $ADDP r10=128+0,$Htbl
200 $ADDP r11=128+8,$Htbl };;
203 " $ADDP $Xip=15,$Xip", # &Xi[15]
204 " $ADDP $len=$len,$inp", # &inp[len]
205 " $ADDP $inp=15,$inp", # &inp[15]
206 " mov $mask0xff=0xff",
208 " andcm sp=sp,$mask0xff", # align stack frame
212 { .mmi; $sum 1<<1 // go big-endian
215 { .mmi; add r10=256+128+0,sp
217 add $len=-17,$len };;
219 for($i=0;$i<8;$i++) { # generate first half of Hshr4[]
220 my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
222 { .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo
223 st8 [r9]=$rhi,16 // Htable[$i].hi
224 shrp $rlo=$rhi,$rlo,4 }//;;
225 { .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo
226 stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi
227 shr.u $rhi=$rhi,4 };;
228 { .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4
229 st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4
233 { .mmi; ld8 r16=[r8],16 // Htable[8].lo
234 ld8 r17=[r9],16 };; // Htable[8].hi
235 { .mmi; ld8 r18=[r8],16 // Htable[9].lo
236 ld8 r19=[r9],16 } // Htable[9].hi
237 { .mmi; rum 1<<5 // clear um.mfh
238 shrp r16=r17,r16,4 };;
240 for($i=0;$i<6;$i++) { # generate second half of Hshr4[]
242 { .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo
243 ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi
244 shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
245 { .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
246 st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
247 shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
251 { .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
252 { .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
253 st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
254 shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
255 { .mmi; add $Htbl=256,sp // &Htable[0]
256 add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
257 shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };;
258 { .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4
259 st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4
265 ($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25"); # A: Htable[nlo] entry, B: entry selected by nhi, Z: the two halves of the accumulator
266 ($Atbl,$Btbl)=("r26","r27"); # address registers the A and B entries are loaded through
268 $code.=<<___; # (p16)
269 { .mmi; ld1 $in=[$inp],-1 //(p16) *inp--
270 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
271 cmp.eq p0,p6=r0,r0 };; // clear p6
273 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
275 $code.=<<___; # (p16),(p17)
276 { .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
277 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
278 { .mii; ld1 $in=[$inp],-1 //(p16) *inp--
279 dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo
280 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
284 (p6) st8 [$Xip]=$Zhi,13
286 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo
288 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
290 $code.=<<___; # (p16),(p17),(p18)
291 { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
292 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
293 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
294 { .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
295 dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
296 { .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
297 xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo
298 { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
299 ld1 $in=[$inp],-1 } //(p16) *inp--
300 { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
301 mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi
302 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
303 { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
304 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
305 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
306 { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
307 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
309 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
311 for ($i=1;$i<14;$i++) {
312 # Above and below fragments are derived from this one by removing
313 # unsuitable (p??) instructions.
314 $code.=<<___; # (p16),(p17),(p18),(p19)
315 { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
316 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
317 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
318 { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
319 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
320 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
321 { .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
322 ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
323 dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
324 { .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
325 xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
326 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
327 { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
328 ld1 $in=[$inp],-1 //(p16) *inp--
329 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
330 { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
331 xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
332 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
333 { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
334 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
335 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
336 { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
337 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
338 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
340 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
343 $code.=<<___; # (p17),(p18),(p19)
344 { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
345 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
346 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
347 { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
348 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
349 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
350 { .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
351 ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
352 dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo
353 { .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
354 xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
355 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
356 { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
357 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
358 { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
359 xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
360 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
361 { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
362 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
363 { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
364 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
365 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
367 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
369 $code.=<<___; # (p18),(p19)
370 { .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
371 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
372 { .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
373 xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo
374 { .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
375 xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo
376 { .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
377 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
378 { .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi
379 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
380 { .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4
381 xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi
382 { .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi
383 shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
384 { .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
385 xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48
387 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
389 $code.=<<___; # (p19)
390 { .mmi; cmp.ltu p6,p0=$inp,$len
392 shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
393 { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
394 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
395 add $Xip=9,$Xip };; // &Xi.lo
396 { .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
397 (p6) ld1 $in=[$inp],-1 //[p16] *inp--
398 (p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
399 { .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
400 (p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
401 { .mmi; st8 [$Xip]=$Zlo,-8
402 (p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
403 shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
405 (p6) ld1 $in=[$inp],-1 //[p16] *inp--
406 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
407 (p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
409 (p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
410 (p6) br.cond.dptk.many .LOOP };;
412 { .mib; st8 [$Xip]=$Zhi };;
413 { .mib; $rum 1<<1 // return to little-endian
416 br.ret.sptk.many b0 };;
417 .endp gcm_ghash_4bit#
421 .type rem_4bit#,\@object
423 data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
424 data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
425 data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
426 data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
428 .type rem_8bit#,\@object
430 data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
431 data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
432 data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
433 data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
434 data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
435 data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
436 data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
437 data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
438 data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
439 data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
440 data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
441 data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
442 data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
443 data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
444 data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
445 data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
446 data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
447 data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
448 data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
449 data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
450 data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
451 data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
452 data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
453 data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
454 data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
455 data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
456 data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
457 data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
458 data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
459 data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
460 data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
461 data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
463 stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
466 $code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian); # big-endian: turn every "mux1 ...@rev" into "nop.i 0x0"
467 $code =~ s/\`([^\`]*)\`/eval $1/gem; # evaluate each backtick-quoted expression in $code and splice in its result
470 close STDOUT or die "error closing STDOUT";