# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+64/128 bytes of fixed
# table]. It has two code paths: vanilla x86 and vanilla MMX. The
# former is executed on 486 and Pentium, the latter on all others.
# Performance results are for the streamed GHASH subroutine and are
# expressed in cycles per processed byte, less is better:
#               gcc 2.95.3(*)   MMX assembler   x86 assembler
#
# Pentium       100/112(**)     -               50
# P4            96/122          30              84(***)
# Opteron       50/71           21              30
# (*)   gcc 3.4.x was observed to generate code a few percent slower,
#       which is one reason why the 2.95.3 results were chosen;
#       another is the lack of 3.4.x results for older CPUs;
# (**)  the second number is for code compiled with the -fPIC flag,
#       which is actually more relevant, because the assembler code
#       is position-independent;
# (***) see the comment in the non-MMX routine for further details;
# To summarize, it's 2-3 times faster than gcc-generated code. To
# anchor it to something else: the SHA1 assembler processes one byte
# in 11-13 cycles on contemporary x86 cores.
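
# Purely illustrative sketch (never called by this module, which only
# *consumes* a ready-made table): how a 256-byte per-key table of the
# kind mentioned above can be built, mirroring the C reference code,
# i.e. entry i holds the product of H with the 4-bit value i so that
# the inner loop can process a whole nibble per lookup. The sub names
# are made up here and a perl with 64-bit integers is assumed.

sub _reduce1bit {			# advance bit-reversed (hi,lo) by one bit,
my ($hi,$lo)=@_;			# folding the bit that falls off with 0xE1...
my $carry = $lo & 1;
	$lo = ($lo>>1) | (($hi&1)<<63);
	$hi = ($hi>>1) ^ ($carry ? 0xe100000000000000 : 0);
	return ($hi,$lo);
}

sub _init_4bit {
my ($Hhi,$Hlo)=@_;			# H, split into two 64-bit halves
my @hi=(0)x16;
my @lo=(0)x16;

	($hi[8],$lo[8]) = ($Hhi,$Hlo);			# entry 8 is H itself
	($hi[4],$lo[4]) = _reduce1bit($hi[8],$lo[8]);	# H*x
	($hi[2],$lo[2]) = _reduce1bit($hi[4],$lo[4]);	# H*x^2
	($hi[1],$lo[1]) = _reduce1bit($hi[2],$lo[2]);	# H*x^3
	for my $i (2,4,8) {				# remaining entries by linearity
	    for my $j (1..$i-1) {
		$hi[$i|$j] = $hi[$i]^$hi[$j];
		$lo[$i|$j] = $lo[$i]^$lo[$j];
	    }
	}
	return (\@hi,\@lo);		# 16 entries x 16 bytes = 256 bytes
}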
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],"gcm-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");

&static_label("rem_4bit") if (!$x86only);
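
# For orientation: the routines generated below are assumed to follow
# the conventional 4-bit GHASH interfaces of crypto/modes; this file
# itself only documents the arguments through the wparam comments:
#
#	void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
#	void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
#			    const u8 *inp, size_t len);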
$unroll = 0;	# Affects x86 loop. The folded loop performs ~7% worse
		# than the unrolled one, which has to be weighed against
		# a 1.7x code size reduction. Well, *overall* 1.7x; the
		# x86-specific code itself shrinks by 2.5x...
# The MMX version performs 2.8 times better on P4 (see the comment in
# the non-MMX routine for further details), 40% better on Opteron, and
# 50% better on PIII and Core2... In other words the effort is
# considered to be well spent.
	&xor	($nlo,$nlo);			# avoid partial register stalls on PIII
	&mov	(&LB($nlo),&LB($nhi));
	&movq	($Zlo,&QWP(8,$Htbl,$nlo));	# Zlo = Htbl[nlo].lo
	&movq	($Zhi,&QWP(0,$Htbl,$nlo));	# Zhi = Htbl[nlo].hi
	&jmp	(&label("mmx_loop"));

&set_label("mmx_loop",16);
	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));	# Zlo ^= Htbl[nhi].lo
	&mov	(&LB($nlo),&BP(0,$inp,$cnt));	# load next byte of Xi
	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));# Zhi ^= rem_4bit[rem], reduction
	&pxor	($Zhi,&QWP(0,$Htbl,$nhi));	# Zhi ^= Htbl[nhi].hi
	&js	(&label("mmx_break"));
	&pxor	($Zlo,&QWP(8,$Htbl,$nlo));	# Zlo ^= Htbl[nlo].lo
	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));# Zhi ^= rem_4bit[rem], reduction
	&pxor	($Zhi,&QWP(0,$Htbl,$nlo));	# Zhi ^= Htbl[nlo].hi
	&jmp	(&label("mmx_loop"));

&set_label("mmx_break",16);
	&pxor	($Zlo,&QWP(8,$Htbl,$nlo));	# Zlo ^= Htbl[nlo].lo
	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));# Zhi ^= rem_4bit[rem], reduction
	&pxor	($Zhi,&QWP(0,$Htbl,$nlo));	# Zhi ^= Htbl[nlo].hi
	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));	# Zlo ^= Htbl[nhi].lo
	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));# Zhi ^= rem_4bit[rem], reduction
	&pxor	($Zhi,&QWP(0,$Htbl,$nhi));	# Zhi ^= Htbl[nhi].hi

	&psrlq	($Zlo,32);	# lower part of Zlo is already there
	&mov	($Zhh,&DWP(4,$Htbl,$Zll));
	&mov	($Zhl,&DWP(0,$Htbl,$Zll));
	&mov	($Zlh,&DWP(12,$Htbl,$Zll));
	&mov	($Zll,&DWP(8,$Htbl,$Zll));
	&xor	($rem,$rem);	# avoid partial register stalls on PIII
	# shrd practically kills P4, 2.5x deterioration, but P4 has
	# the MMX code path to execute instead. shrd runs a tad faster
	# [than twice the shifts, moves and ors] on pre-MMX Pentium
	# (as well as on PIII and Core2), *but* what matters more is
	# that it minimizes code size, spares a register and thus
	# allows the loop to be folded... (a small perl reference model
	# of shrd follows the _x86_gmult_4bit_inner wrapper below)
	&jmp	(&label("x86_loop"));

&set_label("x86_loop",16);
	for($i=1;$i<=2;$i++) {
	&mov	(&LB($rem),&LB($Zll));
	&and	(&LB($rem),0xf);
	&xor	($Zhh,&DWP($off+16,"esp",$rem,4));
	&mov	(&LB($rem),&BP($off,"esp",$cnt));
	&and	(&LB($rem),0xf0);
	&xor	($Zll,&DWP(8,$Htbl,$rem));
	&xor	($Zlh,&DWP(12,$Htbl,$rem));
	&xor	($Zhl,&DWP(0,$Htbl,$rem));
	&xor	($Zhh,&DWP(4,$Htbl,$rem));
	&js	(&label("x86_break"));
	&jmp	(&label("x86_loop"));

&set_label("x86_break",16);
	for($i=1;$i<32;$i++) {
	&mov	(&LB($rem),&LB($Zll));
	&and	(&LB($rem),0xf);
	&xor	($Zhh,&DWP($off+16,"esp",$rem,4));
	&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
	&and	(&LB($rem),0xf0);
	&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
	&xor	($Zll,&DWP(8,$Htbl,$rem));
	&xor	($Zlh,&DWP(12,$Htbl,$rem));
	&xor	($Zhl,&DWP(0,$Htbl,$rem));
	&xor	($Zhh,&DWP(4,$Htbl,$rem));
&function_begin_B("_x86_gmult_4bit_inner");
&function_end_B("_x86_gmult_4bit_inner");
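
# A perl-level reference model for the shrd remark in the x86 loop
# above (illustrative only, never called by the generator; the sub
# name is made up and a 64-bit perl is assumed so the intermediate
# shift does not overflow):
#
#	shrd imm, hi, lo   ==   lo = (lo >> imm) | (hi << (32 - imm))
#
sub _shrd_ref {
my ($lo,$hi,$n)=@_;			# 0 < $n < 32
	return (($lo>>$n) | ($hi<<(32-$n))) & 0xffffffff;
}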
&function_begin("gcm_gmult_4bit");
	&call	(&label("pic_point"));
&set_label("pic_point");
	&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
	&bt	(&DWP(0,"ebp"),23);		# check for MMX bit
	&jnc	(&label("x86"));

	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));

	&mov	($inp,&wparam(0));		# load Xi
	&mov	($Htbl,&wparam(1));		# load Htable

	&movz	($Zll,&BP(15,$inp));

	&mmx_loop($inp,"eax");

	&mov	(&DWP(12,$inp),$Zll);
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(0,$inp),$Zhh);

&set_label("x86",16);
	&stack_push(16+4+1);			# +1 for stack alignment
	&mov	($inp,&wparam(0));		# load Xi
	&mov	($Htbl,&wparam(1));		# load Htable

	&mov	($Zhh,&DWP(0,$inp));		# load Xi[16]
	&mov	($Zhl,&DWP(4,$inp));
	&mov	($Zlh,&DWP(8,$inp));
	&mov	($Zll,&DWP(12,$inp));

	&deposit_rem_4bit(16);

	&mov	(&DWP(0,"esp"),$Zhh);		# copy Xi[16] on stack
	&mov	(&DWP(4,"esp"),$Zhl);
	&mov	(&DWP(8,"esp"),$Zlh);
	&mov	(&DWP(12,"esp"),$Zll);

	&call	("_x86_gmult_4bit_inner");

	&mov	($inp,&wparam(0));

	&mov	(&DWP(12,$inp),$Zll);
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(0,$inp),$Zhh);
&function_end("gcm_gmult_4bit");
# The streamed version performs 20% better on P4, 7% on Opteron,
# and 10% on Core2 and PIII...
&function_begin("gcm_ghash_4bit");
	&call	(&label("pic_point"));
&set_label("pic_point");
	&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
	&bt	(&DWP(0,"ebp"),23);		# check for MMX bit
	&jnc	(&label("x86"));

	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));

	&mov	($Zhh,&wparam(0));		# load Xi
	&mov	($Htbl,&wparam(1));		# load Htable
	&mov	($inp,&wparam(2));		# load in
	&mov	($Zlh,&wparam(3));		# load len
	&mov	(&wparam(3),$Zlh);		# len to point at the end of input
	&stack_push(4+1);			# +1 for stack alignment
	&mov	($Zll,&DWP(12,$Zhh));		# load Xi[16]
	&mov	($Zhl,&DWP(4,$Zhh));
	&mov	($Zlh,&DWP(8,$Zhh));
	&mov	($Zhh,&DWP(0,$Zhh));

&set_label("mmx_outer_loop",16);
	&xor	($Zll,&DWP(12,$inp));
	&xor	($Zhl,&DWP(4,$inp));
	&xor	($Zlh,&DWP(8,$inp));
	&xor	($Zhh,&DWP(0,$inp));
	&mov	(&DWP(12,"esp"),$Zll);
	&mov	(&DWP(4,"esp"),$Zhl);
	&mov	(&DWP(8,"esp"),$Zlh);
	&mov	(&DWP(0,"esp"),$Zhh);

	&mmx_loop("esp","eax");

	&lea	($inp,&DWP(16,$inp));
	&cmp	($inp,&wparam(3));
	&jb	(&label("mmx_outer_loop"));

	&mov	($inp,&wparam(0));		# load Xi
	&mov	(&DWP(12,$inp),$Zll);
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(0,$inp),$Zhh);
&set_label("x86",16);
	&stack_push(16+4+1);			# +1 for 64-bit alignment
	&mov	($Zll,&wparam(0));		# load Xi
	&mov	($Htbl,&wparam(1));		# load Htable
	&mov	($inp,&wparam(2));		# load in
	&mov	("ecx",&wparam(3));		# load len
	&mov	(&wparam(3),"ecx");

	&mov	($Zhh,&DWP(0,$Zll));		# load Xi[16]
	&mov	($Zhl,&DWP(4,$Zll));
	&mov	($Zlh,&DWP(8,$Zll));
	&mov	($Zll,&DWP(12,$Zll));

	&deposit_rem_4bit(16);

&set_label("x86_outer_loop",16);
	&xor	($Zll,&DWP(12,$inp));		# xor with input
	&xor	($Zlh,&DWP(8,$inp));
	&xor	($Zhl,&DWP(4,$inp));
	&xor	($Zhh,&DWP(0,$inp));
	&mov	(&DWP(12,"esp"),$Zll);		# dump it on stack
	&mov	(&DWP(8,"esp"),$Zlh);
	&mov	(&DWP(4,"esp"),$Zhl);
	&mov	(&DWP(0,"esp"),$Zhh);

	&call	("_x86_gmult_4bit_inner");

	&mov	($inp,&wparam(2));
	&lea	($inp,&DWP(16,$inp));
	&cmp	($inp,&wparam(3));
	&mov	(&wparam(2),$inp)		if (!$unroll);
	&jb	(&label("x86_outer_loop"));

	&mov	($inp,&wparam(0));		# load Xi
	&mov	(&DWP(12,$inp),$Zll);
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(0,$inp),$Zhh);
&function_end("gcm_ghash_4bit");
sub deposit_rem_4bit {
my $bias = shift;

	&mov	(&DWP($bias+0, "esp"),0x0000<<16);
	&mov	(&DWP($bias+4, "esp"),0x1C20<<16);
	&mov	(&DWP($bias+8, "esp"),0x3840<<16);
	&mov	(&DWP($bias+12,"esp"),0x2460<<16);
	&mov	(&DWP($bias+16,"esp"),0x7080<<16);
	&mov	(&DWP($bias+20,"esp"),0x6CA0<<16);
	&mov	(&DWP($bias+24,"esp"),0x48C0<<16);
	&mov	(&DWP($bias+28,"esp"),0x54E0<<16);
	&mov	(&DWP($bias+32,"esp"),0xE100<<16);
	&mov	(&DWP($bias+36,"esp"),0xFD20<<16);
	&mov	(&DWP($bias+40,"esp"),0xD940<<16);
	&mov	(&DWP($bias+44,"esp"),0xC560<<16);
	&mov	(&DWP($bias+48,"esp"),0x9180<<16);
	&mov	(&DWP($bias+52,"esp"),0x8DA0<<16);
	&mov	(&DWP($bias+56,"esp"),0xA9C0<<16);
	&mov	(&DWP($bias+60,"esp"),0xB5E0<<16);
}
&set_label("rem_4bit",64);
	&data_word(0,0x0000<<16,0,0x1C20<<16,0,0x3840<<16,0,0x2460<<16);
	&data_word(0,0x7080<<16,0,0x6CA0<<16,0,0x48C0<<16,0,0x54E0<<16);
	&data_word(0,0xE100<<16,0,0xFD20<<16,0,0xD940<<16,0,0xC560<<16);
	&data_word(0,0x9180<<16,0,0x8DA0<<16,0,0xA9C0<<16,0,0xB5E0<<16);
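
# Where the rem_4bit constants come from: entry i is the reduction owed
# for the 4 bits shifted out of Z. The reduction polynomial
# x^128+x^7+x^2+x+1 appears as the byte 0xE1 in GHASH's bit-reversed
# representation, and each set bit of i contributes that byte aligned
# to the bit's position; the tables above keep the results pre-shifted
# by <<16. A throwaway generator, illustrative only (the sub name is
# made up and it is never called):
sub _calc_rem_4bit {
my @tab;
	for my $i (0..15) {
	    my $v = 0;
	    for my $b (0..3) {
		$v ^= (0xE1<<8)>>(3-$b)	if (($i>>$b)&1);
	    }
	    push @tab,$v;	# $tab[1]==0x1C20, $tab[8]==0xE100, ...
	}
	return @tab;
}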
&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();