# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+64/128 bytes of fixed
# table]. It has two code paths: vanilla x86 and vanilla MMX. The
# former is executed on 486 and Pentium, the latter on all others.
# Performance results are for the streamed GHASH subroutine and are
# expressed in cycles per processed byte, less is better:
#               gcc 2.95.3(*)   MMX assembler   x86 assembler
#
# Pentium       100/112(**)     -               50
# P4            96/122          30              84(***)
# Opteron       50/71           21              30
# (*)   gcc 3.4.x was observed to generate code a few percent slower,
#       which is one reason why the 2.95.3 results were chosen;
#       another is the lack of 3.4.x results for older CPUs;
# (**)  the second number is for code compiled with the -fPIC flag,
#       which is actually more relevant, because the assembler code
#       is position-independent;
# (***) see the comment in the non-MMX routine for further details;
# To summarize, it's 2-3 times faster than gcc-generated code. To
# anchor it to something else: the SHA1 assembler processes one byte
# in 11-13 cycles on contemporary x86 cores.
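
# Purely illustrative sketch (never called by this module, which only
# *consumes* a ready-made table): how a 256-byte per-key table of the
# kind mentioned above can be built, mirroring the C reference code,
# i.e. entry i holds the product of H with the 4-bit value i so that
# the inner loop can process a whole nibble per lookup. The sub names
# are made up here and a perl with 64-bit integers is assumed.

sub _reduce1bit {			# advance bit-reversed (hi,lo) by one bit,
my ($hi,$lo)=@_;			# folding the bit that falls off with 0xE1...
my $carry = $lo & 1;
	$lo = ($lo>>1) | (($hi&1)<<63);
	$hi = ($hi>>1) ^ ($carry ? 0xe100000000000000 : 0);
	return ($hi,$lo);
}

sub _init_4bit {
my ($Hhi,$Hlo)=@_;			# H, split into two 64-bit halves
my @hi=(0)x16;
my @lo=(0)x16;

	($hi[8],$lo[8]) = ($Hhi,$Hlo);			# entry 8 is H itself
	($hi[4],$lo[4]) = _reduce1bit($hi[8],$lo[8]);	# H*x
	($hi[2],$lo[2]) = _reduce1bit($hi[4],$lo[4]);	# H*x^2
	($hi[1],$lo[1]) = _reduce1bit($hi[2],$lo[2]);	# H*x^3
	for my $i (2,4,8) {				# remaining entries by linearity
	    for my $j (1..$i-1) {
		$hi[$i|$j] = $hi[$i]^$hi[$j];
		$lo[$i|$j] = $lo[$i]^$lo[$j];
	    }
	}
	return (\@hi,\@lo);		# 16 entries x 16 bytes = 256 bytes
}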
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],"gcm-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");

&static_label("rem_4bit") if (!$x86only);
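
# For orientation: the routines generated below are assumed to follow
# the conventional 4-bit GHASH interfaces of crypto/modes; this file
# itself only documents the arguments through the wparam comments:
#
#	void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
#	void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
#			    const u8 *inp, size_t len);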
$unroll = 0;	# Affects x86 loop. The folded loop performs ~7% worse
		# than the unrolled one, which has to be weighed against
		# a 1.7x code size reduction. Well, *overall* 1.7x; the
		# x86-specific code itself shrinks by 2.5x...
# The MMX version performs 2.8 times better on P4 (see the comment in
# the non-MMX routine for further details), 40% better on Opteron, and
# 50% better on PIII and Core2... In other words the effort is
# considered to be well spent.
	&xor	($nlo,$nlo);			# avoid partial register stalls on PIII
	&mov	(&LB($nlo),&LB($nhi));
	&movq	($Zlo,&QWP(8,$Htbl,$nlo));	# Zlo = Htbl[nlo].lo
	&movq	($Zhi,&QWP(0,$Htbl,$nlo));	# Zhi = Htbl[nlo].hi
	&jmp	(&label("mmx_loop"));

&set_label("mmx_loop",16);
	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));	# Zlo ^= Htbl[nhi].lo
	&mov	(&LB($nlo),&BP(0,$inp,$cnt));	# load next byte of Xi
	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));# Zhi ^= rem_4bit[rem], reduction
	&pxor	($Zhi,&QWP(0,$Htbl,$nhi));	# Zhi ^= Htbl[nhi].hi
	&js	(&label("mmx_break"));
	&pxor	($Zlo,&QWP(8,$Htbl,$nlo));	# Zlo ^= Htbl[nlo].lo
	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));# Zhi ^= rem_4bit[rem], reduction
	&pxor	($Zhi,&QWP(0,$Htbl,$nlo));	# Zhi ^= Htbl[nlo].hi
	&jmp	(&label("mmx_loop"));

&set_label("mmx_break",16);
	&pxor	($Zlo,&QWP(8,$Htbl,$nlo));	# Zlo ^= Htbl[nlo].lo
	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));# Zhi ^= rem_4bit[rem], reduction
	&pxor	($Zhi,&QWP(0,$Htbl,$nlo));	# Zhi ^= Htbl[nlo].hi
	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));	# Zlo ^= Htbl[nhi].lo
	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));# Zhi ^= rem_4bit[rem], reduction
	&pxor	($Zhi,&QWP(0,$Htbl,$nhi));	# Zhi ^= Htbl[nhi].hi

	&psrlq	($Zlo,32);	# lower part of Zlo is already there
	&mov	($Zhh,&DWP(4,$Htbl,$Zll));
	&mov	($Zhl,&DWP(0,$Htbl,$Zll));
	&mov	($Zlh,&DWP(12,$Htbl,$Zll));
	&mov	($Zll,&DWP(8,$Htbl,$Zll));
	&xor	($rem,$rem);	# avoid partial register stalls on PIII
	# shrd practically kills P4, 2.5x deterioration, but P4 has
	# the MMX code path to execute instead. shrd runs a tad faster
	# [than twice the shifts, moves and ors] on pre-MMX Pentium
	# (as well as on PIII and Core2), *but* what matters more is
	# that it minimizes code size, spares a register and thus
	# allows the loop to be folded... (a small perl reference model
	# of shrd follows the _x86_gmult_4bit_inner wrapper below)
	&jmp	(&label("x86_loop"));

&set_label("x86_loop",16);
	for($i=1;$i<=2;$i++) {
	&mov	(&LB($rem),&LB($Zll));
	&and	(&LB($rem),0xf);
	&xor	($Zhh,&DWP($off+16,"esp",$rem,4));
	&mov	(&LB($rem),&BP($off,"esp",$cnt));
	&and	(&LB($rem),0xf0);
	&xor	($Zll,&DWP(8,$Htbl,$rem));
	&xor	($Zlh,&DWP(12,$Htbl,$rem));
	&xor	($Zhl,&DWP(0,$Htbl,$rem));
	&xor	($Zhh,&DWP(4,$Htbl,$rem));
	&js	(&label("x86_break"));
	&jmp	(&label("x86_loop"));

&set_label("x86_break",16);
	for($i=1;$i<32;$i++) {
	&mov	(&LB($rem),&LB($Zll));
	&and	(&LB($rem),0xf);
	&xor	($Zhh,&DWP($off+16,"esp",$rem,4));
	&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
	&and	(&LB($rem),0xf0);
	&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
	&xor	($Zll,&DWP(8,$Htbl,$rem));
	&xor	($Zlh,&DWP(12,$Htbl,$rem));
	&xor	($Zhl,&DWP(0,$Htbl,$rem));
	&xor	($Zhh,&DWP(4,$Htbl,$rem));
&function_begin_B("_x86_gmult_4bit_inner");
&function_end_B("_x86_gmult_4bit_inner");
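
# A perl-level reference model for the shrd remark in the x86 loop
# above (illustrative only, never called by the generator; the sub
# name is made up and a 64-bit perl is assumed so the intermediate
# shift does not overflow):
#
#	shrd imm, hi, lo   ==   lo = (lo >> imm) | (hi << (32 - imm))
#
sub _shrd_ref {
my ($lo,$hi,$n)=@_;			# 0 < $n < 32
	return (($lo>>$n) | ($hi<<(32-$n))) & 0xffffffff;
}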
&function_begin("gcm_gmult_4bit");
	&call	(&label("pic_point"));
&set_label("pic_point");
	&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
	&bt	(&DWP(0,"ebp"),23);		# check for MMX bit
	&jnc	(&label("x86"));

	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));

	&mov	($inp,&wparam(0));		# load Xi
	&mov	($Htbl,&wparam(1));		# load Htable

	&movz	($Zll,&BP(15,$inp));

	&mmx_loop($inp,"eax");

	&mov	(&DWP(12,$inp),$Zll);
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(0,$inp),$Zhh);

&set_label("x86",16);
	&stack_push(16+4+1);			# +1 for stack alignment
	&mov	($inp,&wparam(0));		# load Xi
	&mov	($Htbl,&wparam(1));		# load Htable

	&mov	($Zhh,&DWP(0,$inp));		# load Xi[16]
	&mov	($Zhl,&DWP(4,$inp));
	&mov	($Zlh,&DWP(8,$inp));
	&mov	($Zll,&DWP(12,$inp));

	&deposit_rem_4bit(16);

	&mov	(&DWP(0,"esp"),$Zhh);		# copy Xi[16] on stack
	&mov	(&DWP(4,"esp"),$Zhl);
	&mov	(&DWP(8,"esp"),$Zlh);
	&mov	(&DWP(12,"esp"),$Zll);

	&call	("_x86_gmult_4bit_inner");

	&mov	($inp,&wparam(0));

	&mov	(&DWP(12,$inp),$Zll);
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(0,$inp),$Zhh);
&function_end("gcm_gmult_4bit");
# The streamed version performs 20% better on P4, 7% on Opteron,
# and 10% on Core2 and PIII...
&function_begin("gcm_ghash_4bit");
	&call	(&label("pic_point"));
&set_label("pic_point");
	&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
	&bt	(&DWP(0,"ebp"),23);		# check for MMX bit
	&jnc	(&label("x86"));

	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));

	&mov	($Zhh,&wparam(0));		# load Xi
	&mov	($Htbl,&wparam(1));		# load Htable
	&mov	($inp,&wparam(2));		# load in
	&mov	($Zlh,&wparam(3));		# load len
	&mov	(&wparam(3),$Zlh);		# len to point at the end of input
	&stack_push(4+1);			# +1 for stack alignment
	&mov	($Zll,&DWP(12,$Zhh));		# load Xi[16]
	&mov	($Zhl,&DWP(4,$Zhh));
	&mov	($Zlh,&DWP(8,$Zhh));
	&mov	($Zhh,&DWP(0,$Zhh));

&set_label("mmx_outer_loop",16);
	&xor	($Zll,&DWP(12,$inp));
	&xor	($Zhl,&DWP(4,$inp));
	&xor	($Zlh,&DWP(8,$inp));
	&xor	($Zhh,&DWP(0,$inp));
	&mov	(&DWP(12,"esp"),$Zll);
	&mov	(&DWP(4,"esp"),$Zhl);
	&mov	(&DWP(8,"esp"),$Zlh);
	&mov	(&DWP(0,"esp"),$Zhh);

	&mmx_loop("esp","eax");

	&lea	($inp,&DWP(16,$inp));
	&cmp	($inp,&wparam(3));
	&jb	(&label("mmx_outer_loop"));

	&mov	($inp,&wparam(0));		# load Xi
	&mov	(&DWP(12,$inp),$Zll);
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(0,$inp),$Zhh);
&set_label("x86",16);
	&stack_push(16+4+1);			# +1 for 64-bit alignment
	&mov	($Zll,&wparam(0));		# load Xi
	&mov	($Htbl,&wparam(1));		# load Htable
	&mov	($inp,&wparam(2));		# load in
	&mov	("ecx",&wparam(3));		# load len
	&mov	(&wparam(3),"ecx");

	&mov	($Zhh,&DWP(0,$Zll));		# load Xi[16]
	&mov	($Zhl,&DWP(4,$Zll));
	&mov	($Zlh,&DWP(8,$Zll));
	&mov	($Zll,&DWP(12,$Zll));

	&deposit_rem_4bit(16);

&set_label("x86_outer_loop",16);
	&xor	($Zll,&DWP(12,$inp));		# xor with input
	&xor	($Zlh,&DWP(8,$inp));
	&xor	($Zhl,&DWP(4,$inp));
	&xor	($Zhh,&DWP(0,$inp));
	&mov	(&DWP(12,"esp"),$Zll);		# dump it on stack
	&mov	(&DWP(8,"esp"),$Zlh);
	&mov	(&DWP(4,"esp"),$Zhl);
	&mov	(&DWP(0,"esp"),$Zhh);

	&call	("_x86_gmult_4bit_inner");

	&mov	($inp,&wparam(2));
	&lea	($inp,&DWP(16,$inp));
	&cmp	($inp,&wparam(3));
	&mov	(&wparam(2),$inp)		if (!$unroll);
	&jb	(&label("x86_outer_loop"));

	&mov	($inp,&wparam(0));		# load Xi
	&mov	(&DWP(12,$inp),$Zll);
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(0,$inp),$Zhh);
&function_end("gcm_ghash_4bit");
sub deposit_rem_4bit {
my $bias = shift;

	&mov	(&DWP($bias+0, "esp"),0x0000<<16);
	&mov	(&DWP($bias+4, "esp"),0x1C20<<16);
	&mov	(&DWP($bias+8, "esp"),0x3840<<16);
	&mov	(&DWP($bias+12,"esp"),0x2460<<16);
	&mov	(&DWP($bias+16,"esp"),0x7080<<16);
	&mov	(&DWP($bias+20,"esp"),0x6CA0<<16);
	&mov	(&DWP($bias+24,"esp"),0x48C0<<16);
	&mov	(&DWP($bias+28,"esp"),0x54E0<<16);
	&mov	(&DWP($bias+32,"esp"),0xE100<<16);
	&mov	(&DWP($bias+36,"esp"),0xFD20<<16);
	&mov	(&DWP($bias+40,"esp"),0xD940<<16);
	&mov	(&DWP($bias+44,"esp"),0xC560<<16);
	&mov	(&DWP($bias+48,"esp"),0x9180<<16);
	&mov	(&DWP($bias+52,"esp"),0x8DA0<<16);
	&mov	(&DWP($bias+56,"esp"),0xA9C0<<16);
	&mov	(&DWP($bias+60,"esp"),0xB5E0<<16);
}
&set_label("rem_4bit",64);
	&data_word(0,0x0000<<16,0,0x1C20<<16,0,0x3840<<16,0,0x2460<<16);
	&data_word(0,0x7080<<16,0,0x6CA0<<16,0,0x48C0<<16,0,0x54E0<<16);
	&data_word(0,0xE100<<16,0,0xFD20<<16,0,0xD940<<16,0,0xC560<<16);
	&data_word(0,0x9180<<16,0,0x8DA0<<16,0,0xA9C0<<16,0,0xB5E0<<16);
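
# Where the rem_4bit constants come from: entry i is the reduction owed
# for the 4 bits shifted out of Z. The reduction polynomial
# x^128+x^7+x^2+x+1 appears as the byte 0xE1 in GHASH's bit-reversed
# representation, and each set bit of i contributes that byte aligned
# to the bit's position; the tables above keep the results pre-shifted
# by <<16. A throwaway generator, illustrative only (the sub name is
# made up and it is never called):
sub _calc_rem_4bit {
my @tab;
	for my $i (0..15) {
	    my $v = 0;
	    for my $b (0..3) {
		$v ^= (0xE1<<8)>>(3-$b)	if (($i>>$b)&1);
	    }
	    push @tab,$v;	# $tab[1]==0x1C20, $tab[8]==0xE100, ...
	}
	return @tab;
}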
&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();