crypto/bn/asm/sparcv8plus.S

   1 .ident  "sparcv8plus.s, Version 1.4"
   2 .ident  "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
   3
   4 /*
   5  * ====================================================================
   6  * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
   7  * project.
   8  *
   9  * Rights for redistribution and usage in source and binary forms are
  10  * granted according to the OpenSSL license. Warranty of any kind is
  11  * disclaimed.
  12  * ====================================================================
  13  */
  14
  15 /*
  16  * This is my modest contributon to OpenSSL project (see
  17  * http://www.openssl.org/ for more information about it) and is
  18  * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c
  19  * module. For updates see http://fy.chalmers.se/~appro/hpe/.
  20  *
  21  * Questions-n-answers.
  22  *
  23  * Q. How to compile?
  24  * A. With SC4.x/SC5.x:
  25  *
  26  *      cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
  27  *
  28  *    and with gcc:
  29  *
  30  *      gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
  31  *
  32  *    or if above fails (it does if you have gas installed):
  33  *
  34  *      gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
  35  *
  36  *    Quick-n-dirty way to fuse the module into the library.
  37  *    Provided that the library is already configured and built
  38  *    (in 0.9.2 case with no-asm option):
  39  *
  40  *      # cd crypto/bn
  41  *      # cp /some/place/bn_asm.sparc.v8plus.S .
  42  *      # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
  43  *      # make
  44  *      # cd ../..
  45  *      # make; make test
  46  *
  47  *    Quick-n-dirty way to get rid of it:
  48  *
  49  *      # cd crypto/bn
  50  *      # touch bn_asm.c
  51  *      # make
  52  *      # cd ../..
  53  *      # make; make test
  54  *
  55  * Q. V8plus achitecture? What kind of beast is that?
  56  * A. Well, it's rather a programming model than an architecture...
  57  *    It's actually v9-compliant, i.e. *any* UltraSPARC, CPU under
  58  *    special conditions, namely when kernel doesn't preserve upper
  59  *    32 bits of otherwise 64-bit registers during a context switch.
  60  *
  61  * Q. Why just UltraSPARC? What about SuperSPARC?
  62  * A. Original release did target UltraSPARC only. Now SuperSPARC
  63  *    version is provided along. Both version share bn_*comba[48]
  64  *    implementations (see comment later in code for explanation).
  65  *    But what's so special about this UltraSPARC implementation?
  66  *    Why didn't I let compiler do the job? Trouble is that most of
  67  *    available compilers (well, SC5.0 is the only exception) don't
  68  *    attempt to take advantage of UltraSPARC's 64-bitness under
  69  *    32-bit kernels even though it's perfectly possible (see next
  70  *    question).
  71  *
  72  * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
  73  *    doesn't work?
  74  * A. You can't adress *all* registers as 64-bit wide:-( The catch is
  75  *    that you actually may rely upon %o0-%o5 and %g1-%g4 being fully
  76  *    preserved if you're in a leaf function, i.e. such never calling
  77  *    any other functions. All functions in this module are leaf and
  78  *    10 registers is a handful. And as a matter of fact none-"comba"
  79  *    routines don't require even that much and I could even afford to
  80  *    not allocate own stack frame for 'em:-)
  81  *
  82  * Q. What about 64-bit kernels?
  83  * A. What about 'em? Just kidding:-) Pure 64-bit version is currently
  84  *    under evaluation and development...
  85  *
  86  * Q. What about shared libraries?
  87  * A. What about 'em? Kidding again:-) Code does *not* contain any
  88  *    code position dependencies and it's safe to include it into
  89  *    shared library as is.
  90  *
  91  * Q. How much faster does it go?
  92  * A. Do you have a good benchmark? In either case below is what I
  93  *    experience with crypto/bn/expspeed.c test program:
  94  *
  95  *      v8plus module on U10/300MHz against bn_asm.c compiled with:
  96  *
  97  *      cc-5.0 -xarch=v8plus -xO5 -xdepend      +7-12%
  98  *      cc-4.2 -xarch=v8plus -xO5 -xdepend      +25-35%
  99  *      egcs-1.1.2 -mcpu=ultrasparc -O3         +35-45%
 100  *
 101  *      v8 module on SS10/60MHz against bn_asm.c compiled with:
 102  *
 103  *      cc-5.0 -xarch=v8 -xO5 -xdepend          +7-10%
 104  *      cc-4.2 -xarch=v8 -xO5 -xdepend          +10%
 105  *      egcs-1.1.2 -mv8 -O3                     +35-45%
 106  *
 107  *    As you can see it's damn hard to beat the new Sun C compiler
 108  *    and it's in first place GNU C users who will appreciate this
 109  *    assembler implementation:-)
 110  */
 111
 112 /*
 113  * Revision history.
 114  *
 115  * 1.0  - initial release;
 116  * 1.1  - new loop unrolling model(*);
 117  *      - some more fine tuning;
 118  * 1.2  - made gas friendly;
 119  *      - updates to documentation concerning v9;
 120  *      - new performance comparison matrix;
 121  * 1.3  - fixed problem with /usr/ccs/lib/cpp;
 122  * 1.4  - native V9 bn_*_comba[48] implementation (15% more efficient)
 123  *        resulting in slight overall performance kick;
 124  *      - some retunes;
 125  *      - support for GNU as added;
 126  *
 127  * (*)  Originally unrolled loop looked like this:
 128  *          for (;;) {
 129  *              op(p+0); if (--n==0) break;
 130  *              op(p+1); if (--n==0) break;
 131  *              op(p+2); if (--n==0) break;
 132  *              op(p+3); if (--n==0) break;
 133  *              p+=4;
 134  *          }
 135  *      I unroll according to following:
 136  *          while (n&~3) {
 137  *              op(p+0); op(p+1); op(p+2); op(p+3);
 138  *              p+=4; n=-4;
 139  *          }
 140  *          if (n) {
 141  *              op(p+0); if (--n==0) return;
 142  *              op(p+2); if (--n==0) return;
 143  *              op(p+3); return;
 144  *          }
 145  */
 146
 147 #ifdef OPENSSL_FIPSCANISTER
 148 #include <openssl/fipssyms.h>
 149 #endif
 150
 151 #if defined(__SUNPRO_C) && defined(__sparcv9)
 152   /* They've said -xarch=v9 at command line */
 153   .register     %g2,#scratch
 154   .register     %g3,#scratch
 155 # define        FRAME_SIZE      -192
 156 #elif defined(__GNUC__) && defined(__arch64__)
 157   /* They've said -m64 at command line */
 158   .register     %g2,#scratch
 159   .register     %g3,#scratch
 160 # define        FRAME_SIZE      -192
 161 #else
 162 # define        FRAME_SIZE      -96
 163 #endif
 164 /*
 165  * GNU assembler can't stand stuw:-(
 166  */
 167 #define stuw st
 168
 169 .section        ".text",#alloc,#execinstr
 170 .file           "bn_asm.sparc.v8plus.S"
 171
 172 .align  32
 173
 174 .global bn_mul_add_words
 175 /*
 176  * BN_ULONG bn_mul_add_words(rp,ap,num,w)
 177  * BN_ULONG *rp,*ap;
 178  * int num;
 179  * BN_ULONG w;
 180  */
 181 bn_mul_add_words:
 182         sra     %o2,%g0,%o2     ! signx %o2
 183         brgz,a  %o2,.L_bn_mul_add_words_proceed
 184         lduw    [%o1],%g2
 185         retl
 186         clr     %o0
 187         nop
 188         nop
 189         nop
 190
 191 .L_bn_mul_add_words_proceed:
 192         srl     %o3,%g0,%o3     ! clruw %o3
 193         andcc   %o2,-4,%g0
 194         bz,pn   %icc,.L_bn_mul_add_words_tail
 195         clr     %o5
 196
 197 .L_bn_mul_add_words_loop:       ! wow! 32 aligned!
 198         lduw    [%o0],%g1
 199         lduw    [%o1+4],%g3
 200         mulx    %o3,%g2,%g2
 201         add     %g1,%o5,%o4
 202         nop
 203         add     %o4,%g2,%o4
 204         stuw    %o4,[%o0]
 205         srlx    %o4,32,%o5
 206
 207         lduw    [%o0+4],%g1
 208         lduw    [%o1+8],%g2
 209         mulx    %o3,%g3,%g3
 210         add     %g1,%o5,%o4
 211         dec     4,%o2
 212         add     %o4,%g3,%o4
 213         stuw    %o4,[%o0+4]
 214         srlx    %o4,32,%o5
 215
 216         lduw    [%o0+8],%g1
 217         lduw    [%o1+12],%g3
 218         mulx    %o3,%g2,%g2
 219         add     %g1,%o5,%o4
 220         inc     16,%o1
 221         add     %o4,%g2,%o4
 222         stuw    %o4,[%o0+8]
 223         srlx    %o4,32,%o5
 224
 225         lduw    [%o0+12],%g1
 226         mulx    %o3,%g3,%g3
 227         add     %g1,%o5,%o4
 228         inc     16,%o0
 229         add     %o4,%g3,%o4
 230         andcc   %o2,-4,%g0
 231         stuw    %o4,[%o0-4]
 232         srlx    %o4,32,%o5
 233         bnz,a,pt        %icc,.L_bn_mul_add_words_loop
 234         lduw    [%o1],%g2
 235
 236         brnz,a,pn       %o2,.L_bn_mul_add_words_tail
 237         lduw    [%o1],%g2
 238 .L_bn_mul_add_words_return:
 239         retl
 240         mov     %o5,%o0
 241
 242 .L_bn_mul_add_words_tail:
 243         lduw    [%o0],%g1
 244         mulx    %o3,%g2,%g2
 245         add     %g1,%o5,%o4
 246         dec     %o2
 247         add     %o4,%g2,%o4
 248         srlx    %o4,32,%o5
 249         brz,pt  %o2,.L_bn_mul_add_words_return
 250         stuw    %o4,[%o0]
 251
 252         lduw    [%o1+4],%g2
 253         lduw    [%o0+4],%g1
 254         mulx    %o3,%g2,%g2
 255         add     %g1,%o5,%o4
 256         dec     %o2
 257         add     %o4,%g2,%o4
 258         srlx    %o4,32,%o5
 259         brz,pt  %o2,.L_bn_mul_add_words_return
 260         stuw    %o4,[%o0+4]
 261
 262         lduw    [%o1+8],%g2
 263         lduw    [%o0+8],%g1
 264         mulx    %o3,%g2,%g2
 265         add     %g1,%o5,%o4
 266         add     %o4,%g2,%o4
 267         stuw    %o4,[%o0+8]
 268         retl
 269         srlx    %o4,32,%o0
 270
 271 .type   bn_mul_add_words,#function
 272 .size   bn_mul_add_words,(.-bn_mul_add_words)
 273
 274 .align  32
 275
 276 .global bn_mul_words
 277 /*
 278  * BN_ULONG bn_mul_words(rp,ap,num,w)
 279  * BN_ULONG *rp,*ap;
 280  * int num;
 281  * BN_ULONG w;
 282  */
 283 bn_mul_words:
 284         sra     %o2,%g0,%o2     ! signx %o2
 285         brgz,a  %o2,.L_bn_mul_words_proceeed
 286         lduw    [%o1],%g2
 287         retl
 288         clr     %o0
 289         nop
 290         nop
 291         nop
 292
 293 .L_bn_mul_words_proceeed:
 294         srl     %o3,%g0,%o3     ! clruw %o3
 295         andcc   %o2,-4,%g0
 296         bz,pn   %icc,.L_bn_mul_words_tail
 297         clr     %o5
 298
 299 .L_bn_mul_words_loop:           ! wow! 32 aligned!
 300         lduw    [%o1+4],%g3
 301         mulx    %o3,%g2,%g2
 302         add     %g2,%o5,%o4
 303         nop
 304         stuw    %o4,[%o0]
 305         srlx    %o4,32,%o5
 306
 307         lduw    [%o1+8],%g2
 308         mulx    %o3,%g3,%g3
 309         add     %g3,%o5,%o4
 310         dec     4,%o2
 311         stuw    %o4,[%o0+4]
 312         srlx    %o4,32,%o5
 313
 314         lduw    [%o1+12],%g3
 315         mulx    %o3,%g2,%g2
 316         add     %g2,%o5,%o4
 317         inc     16,%o1
 318         stuw    %o4,[%o0+8]
 319         srlx    %o4,32,%o5
 320
 321         mulx    %o3,%g3,%g3
 322         add     %g3,%o5,%o4
 323         inc     16,%o0
 324         stuw    %o4,[%o0-4]
 325         srlx    %o4,32,%o5
 326         andcc   %o2,-4,%g0
 327         bnz,a,pt        %icc,.L_bn_mul_words_loop
 328         lduw    [%o1],%g2
 329         nop
 330         nop
 331
 332         brnz,a,pn       %o2,.L_bn_mul_words_tail
 333         lduw    [%o1],%g2
 334 .L_bn_mul_words_return:
 335         retl
 336         mov     %o5,%o0
 337
 338 .L_bn_mul_words_tail:
 339         mulx    %o3,%g2,%g2
 340         add     %g2,%o5,%o4
 341         dec     %o2
 342         srlx    %o4,32,%o5
 343         brz,pt  %o2,.L_bn_mul_words_return
 344         stuw    %o4,[%o0]
 345
 346         lduw    [%o1+4],%g2
 347         mulx    %o3,%g2,%g2
 348         add     %g2,%o5,%o4
 349         dec     %o2
 350         srlx    %o4,32,%o5
 351         brz,pt  %o2,.L_bn_mul_words_return
 352         stuw    %o4,[%o0+4]
 353
 354         lduw    [%o1+8],%g2
 355         mulx    %o3,%g2,%g2
 356         add     %g2,%o5,%o4
 357         stuw    %o4,[%o0+8]
 358         retl
 359         srlx    %o4,32,%o0
 360
 361 .type   bn_mul_words,#function
 362 .size   bn_mul_words,(.-bn_mul_words)
 363
 364 .align  32
 365 .global bn_sqr_words
 366 /*
 367  * void bn_sqr_words(r,a,n)
 368  * BN_ULONG *r,*a;
 369  * int n;
 370  */
 371 bn_sqr_words:
 372         sra     %o2,%g0,%o2     ! signx %o2
 373         brgz,a  %o2,.L_bn_sqr_words_proceeed
 374         lduw    [%o1],%g2
 375         retl
 376         clr     %o0
 377         nop
 378         nop
 379         nop
 380
 381 .L_bn_sqr_words_proceeed:
 382         andcc   %o2,-4,%g0
 383         nop
 384         bz,pn   %icc,.L_bn_sqr_words_tail
 385         nop
 386
 387 .L_bn_sqr_words_loop:           ! wow! 32 aligned!
 388         lduw    [%o1+4],%g3
 389         mulx    %g2,%g2,%o4
 390         stuw    %o4,[%o0]
 391         srlx    %o4,32,%o5
 392         stuw    %o5,[%o0+4]
 393         nop
 394
 395         lduw    [%o1+8],%g2
 396         mulx    %g3,%g3,%o4
 397         dec     4,%o2
 398         stuw    %o4,[%o0+8]
 399         srlx    %o4,32,%o5
 400         stuw    %o5,[%o0+12]
 401
 402         lduw    [%o1+12],%g3
 403         mulx    %g2,%g2,%o4
 404         srlx    %o4,32,%o5
 405         stuw    %o4,[%o0+16]
 406         inc     16,%o1
 407         stuw    %o5,[%o0+20]
 408
 409         mulx    %g3,%g3,%o4
 410         inc     32,%o0
 411         stuw    %o4,[%o0-8]
 412         srlx    %o4,32,%o5
 413         andcc   %o2,-4,%g2
 414         stuw    %o5,[%o0-4]
 415         bnz,a,pt        %icc,.L_bn_sqr_words_loop
 416         lduw    [%o1],%g2
 417         nop
 418
 419         brnz,a,pn       %o2,.L_bn_sqr_words_tail
 420         lduw    [%o1],%g2
 421 .L_bn_sqr_words_return:
 422         retl
 423         clr     %o0
 424
 425 .L_bn_sqr_words_tail:
 426         mulx    %g2,%g2,%o4
 427         dec     %o2
 428         stuw    %o4,[%o0]
 429         srlx    %o4,32,%o5
 430         brz,pt  %o2,.L_bn_sqr_words_return
 431         stuw    %o5,[%o0+4]
 432
 433         lduw    [%o1+4],%g2
 434         mulx    %g2,%g2,%o4
 435         dec     %o2
 436         stuw    %o4,[%o0+8]
 437         srlx    %o4,32,%o5
 438         brz,pt  %o2,.L_bn_sqr_words_return
 439         stuw    %o5,[%o0+12]
 440
 441         lduw    [%o1+8],%g2
 442         mulx    %g2,%g2,%o4
 443         srlx    %o4,32,%o5
 444         stuw    %o4,[%o0+16]
 445         stuw    %o5,[%o0+20]
 446         retl
 447         clr     %o0
 448
 449 .type   bn_sqr_words,#function
 450 .size   bn_sqr_words,(.-bn_sqr_words)
 451
 452 .align  32
 453 .global bn_div_words
 454 /*
 455  * BN_ULONG bn_div_words(h,l,d)
 456  * BN_ULONG h,l,d;
 457  */
 458 bn_div_words:
 459         sllx    %o0,32,%o0
 460         or      %o0,%o1,%o0
 461         udivx   %o0,%o2,%o0
 462         retl
 463         srl     %o0,%g0,%o0     ! clruw %o0
 464
 465 .type   bn_div_words,#function
 466 .size   bn_div_words,(.-bn_div_words)
 467
 468 .align  32
 469
 470 .global bn_add_words
 471 /*
 472  * BN_ULONG bn_add_words(rp,ap,bp,n)
 473  * BN_ULONG *rp,*ap,*bp;
 474  * int n;
 475  */
 476 bn_add_words:
 477         sra     %o3,%g0,%o3     ! signx %o3
 478         brgz,a  %o3,.L_bn_add_words_proceed
 479         lduw    [%o1],%o4
 480         retl
 481         clr     %o0
 482
 483 .L_bn_add_words_proceed:
 484         andcc   %o3,-4,%g0
 485         bz,pn   %icc,.L_bn_add_words_tail
 486         addcc   %g0,0,%g0       ! clear carry flag
 487
 488 .L_bn_add_words_loop:           ! wow! 32 aligned!
 489         dec     4,%o3
 490         lduw    [%o2],%o5
 491         lduw    [%o1+4],%g1
 492         lduw    [%o2+4],%g2
 493         lduw    [%o1+8],%g3
 494         lduw    [%o2+8],%g4
 495         addccc  %o5,%o4,%o5
 496         stuw    %o5,[%o0]
 497
 498         lduw    [%o1+12],%o4
 499         lduw    [%o2+12],%o5
 500         inc     16,%o1
 501         addccc  %g1,%g2,%g1
 502         stuw    %g1,[%o0+4]
 503
 504         inc     16,%o2
 505         addccc  %g3,%g4,%g3
 506         stuw    %g3,[%o0+8]
 507
 508         inc     16,%o0
 509         addccc  %o5,%o4,%o5
 510         stuw    %o5,[%o0-4]
 511         and     %o3,-4,%g1
 512         brnz,a,pt       %g1,.L_bn_add_words_loop
 513         lduw    [%o1],%o4
 514
 515         brnz,a,pn       %o3,.L_bn_add_words_tail
 516         lduw    [%o1],%o4
 517 .L_bn_add_words_return:
 518         clr     %o0
 519         retl
 520         movcs   %icc,1,%o0
 521         nop
 522
 523 .L_bn_add_words_tail:
 524         lduw    [%o2],%o5
 525         dec     %o3
 526         addccc  %o5,%o4,%o5
 527         brz,pt  %o3,.L_bn_add_words_return
 528         stuw    %o5,[%o0]
 529
 530         lduw    [%o1+4],%o4
 531         lduw    [%o2+4],%o5
 532         dec     %o3
 533         addccc  %o5,%o4,%o5
 534         brz,pt  %o3,.L_bn_add_words_return
 535         stuw    %o5,[%o0+4]
 536
 537         lduw    [%o1+8],%o4
 538         lduw    [%o2+8],%o5
 539         addccc  %o5,%o4,%o5
 540         stuw    %o5,[%o0+8]
 541         clr     %o0
 542         retl
 543         movcs   %icc,1,%o0
 544
 545 .type   bn_add_words,#function
 546 .size   bn_add_words,(.-bn_add_words)
 547
 548 .global bn_sub_words
 549 /*
 550  * BN_ULONG bn_sub_words(rp,ap,bp,n)
 551  * BN_ULONG *rp,*ap,*bp;
 552  * int n;
 553  */
 554 bn_sub_words:
 555         sra     %o3,%g0,%o3     ! signx %o3
 556         brgz,a  %o3,.L_bn_sub_words_proceed
 557         lduw    [%o1],%o4
 558         retl
 559         clr     %o0
 560
 561 .L_bn_sub_words_proceed:
 562         andcc   %o3,-4,%g0
 563         bz,pn   %icc,.L_bn_sub_words_tail
 564         addcc   %g0,0,%g0       ! clear carry flag
 565
 566 .L_bn_sub_words_loop:           ! wow! 32 aligned!
 567         dec     4,%o3
 568         lduw    [%o2],%o5
 569         lduw    [%o1+4],%g1
 570         lduw    [%o2+4],%g2
 571         lduw    [%o1+8],%g3
 572         lduw    [%o2+8],%g4
 573         subccc  %o4,%o5,%o5
 574         stuw    %o5,[%o0]
 575
 576         lduw    [%o1+12],%o4
 577         lduw    [%o2+12],%o5
 578         inc     16,%o1
 579         subccc  %g1,%g2,%g2
 580         stuw    %g2,[%o0+4]
 581
 582         inc     16,%o2
 583         subccc  %g3,%g4,%g4
 584         stuw    %g4,[%o0+8]
 585
 586         inc     16,%o0
 587         subccc  %o4,%o5,%o5
 588         stuw    %o5,[%o0-4]
 589         and     %o3,-4,%g1
 590         brnz,a,pt       %g1,.L_bn_sub_words_loop
 591         lduw    [%o1],%o4
 592
 593         brnz,a,pn       %o3,.L_bn_sub_words_tail
 594         lduw    [%o1],%o4
 595 .L_bn_sub_words_return:
 596         clr     %o0
 597         retl
 598         movcs   %icc,1,%o0
 599         nop
 600
 601 .L_bn_sub_words_tail:           ! wow! 32 aligned!
 602         lduw    [%o2],%o5
 603         dec     %o3
 604         subccc  %o4,%o5,%o5
 605         brz,pt  %o3,.L_bn_sub_words_return
 606         stuw    %o5,[%o0]
 607
 608         lduw    [%o1+4],%o4
 609         lduw    [%o2+4],%o5
 610         dec     %o3
 611         subccc  %o4,%o5,%o5
 612         brz,pt  %o3,.L_bn_sub_words_return
 613         stuw    %o5,[%o0+4]
 614
 615         lduw    [%o1+8],%o4
 616         lduw    [%o2+8],%o5
 617         subccc  %o4,%o5,%o5
 618         stuw    %o5,[%o0+8]
 619         clr     %o0
 620         retl
 621         movcs   %icc,1,%o0
 622
 623 .type   bn_sub_words,#function
 624 .size   bn_sub_words,(.-bn_sub_words)
 625
 626 /*
 627  * Code below depends on the fact that upper parts of the %l0-%l7
 628  * and %i0-%i7 are zeroed by kernel after context switch. In
 629  * previous versions this comment stated that "the trouble is that
 630  * it's not feasible to implement the mumbo-jumbo in less V9
 631  * instructions:-(" which apparently isn't true thanks to
 632  * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
 633  * results not from the shorter code, but from elimination of
 634  * multicycle none-pairable 'rd %y,%rd' instructions.
 635  *
 636  *                                                      Andy.
 637  */
 638
 639 /*
 640  * Here is register usage map for *all* routines below.
 641  */
 642 #define t_1     %o0
 643 #define t_2     %o1
 644 #define c_12    %o2
 645 #define c_3     %o3
 646
 647 #define ap(I)   [%i1+4*I]
 648 #define bp(I)   [%i2+4*I]
 649 #define rp(I)   [%i0+4*I]
 650
 651 #define a_0     %l0
 652 #define a_1     %l1
 653 #define a_2     %l2
 654 #define a_3     %l3
 655 #define a_4     %l4
 656 #define a_5     %l5
 657 #define a_6     %l6
 658 #define a_7     %l7
 659
 660 #define b_0     %i3
 661 #define b_1     %i4
 662 #define b_2     %i5
 663 #define b_3     %o4
 664 #define b_4     %o5
 665 #define b_5     %o7
 666 #define b_6     %g1
 667 #define b_7     %g4
 668
 669 .align  32
 670 .global bn_mul_comba8
 671 /*
 672  * void bn_mul_comba8(r,a,b)
 673  * BN_ULONG *r,*a,*b;
 674  */
 675 bn_mul_comba8:
 676         save    %sp,FRAME_SIZE,%sp
 677         mov     1,t_2
 678         lduw    ap(0),a_0
 679         sllx    t_2,32,t_2
 680         lduw    bp(0),b_0       !=
 681         lduw    bp(1),b_1
 682         mulx    a_0,b_0,t_1     !mul_add_c(a[0],b[0],c1,c2,c3);
 683         srlx    t_1,32,c_12
 684         stuw    t_1,rp(0)       !=!r[0]=c1;
 685
 686         lduw    ap(1),a_1
 687         mulx    a_0,b_1,t_1     !mul_add_c(a[0],b[1],c2,c3,c1);
 688         addcc   c_12,t_1,c_12
 689         clr     c_3             !=
 690         bcs,a   %xcc,.+8
 691         add     c_3,t_2,c_3
 692         lduw    ap(2),a_2
 693         mulx    a_1,b_0,t_1     !=!mul_add_c(a[1],b[0],c2,c3,c1);
 694         addcc   c_12,t_1,t_1
 695         bcs,a   %xcc,.+8
 696         add     c_3,t_2,c_3
 697         srlx    t_1,32,c_12     !=
 698         stuw    t_1,rp(1)       !r[1]=c2;
 699         or      c_12,c_3,c_12
 700
 701         mulx    a_2,b_0,t_1     !mul_add_c(a[2],b[0],c3,c1,c2);
 702         addcc   c_12,t_1,c_12   !=
 703         clr     c_3
 704         bcs,a   %xcc,.+8
 705         add     c_3,t_2,c_3
 706         lduw    bp(2),b_2       !=
 707         mulx    a_1,b_1,t_1     !mul_add_c(a[1],b[1],c3,c1,c2);
 708         addcc   c_12,t_1,c_12
 709         bcs,a   %xcc,.+8
 710         add     c_3,t_2,c_3     !=
 711         lduw    bp(3),b_3
 712         mulx    a_0,b_2,t_1     !mul_add_c(a[0],b[2],c3,c1,c2);
 713         addcc   c_12,t_1,t_1
 714         bcs,a   %xcc,.+8        !=
 715         add     c_3,t_2,c_3
 716         srlx    t_1,32,c_12
 717         stuw    t_1,rp(2)       !r[2]=c3;
 718         or      c_12,c_3,c_12   !=
 719
 720         mulx    a_0,b_3,t_1     !mul_add_c(a[0],b[3],c1,c2,c3);
 721         addcc   c_12,t_1,c_12
 722         clr     c_3
 723         bcs,a   %xcc,.+8        !=
 724         add     c_3,t_2,c_3
 725         mulx    a_1,b_2,t_1     !=!mul_add_c(a[1],b[2],c1,c2,c3);
 726         addcc   c_12,t_1,c_12
 727         bcs,a   %xcc,.+8        !=
 728         add     c_3,t_2,c_3
 729         lduw    ap(3),a_3
 730         mulx    a_2,b_1,t_1     !mul_add_c(a[2],b[1],c1,c2,c3);
 731         addcc   c_12,t_1,c_12   !=
 732         bcs,a   %xcc,.+8
 733         add     c_3,t_2,c_3
 734         lduw    ap(4),a_4
 735         mulx    a_3,b_0,t_1     !=!mul_add_c(a[3],b[0],c1,c2,c3);!=
 736         addcc   c_12,t_1,t_1
 737         bcs,a   %xcc,.+8
 738         add     c_3,t_2,c_3
 739         srlx    t_1,32,c_12     !=
 740         stuw    t_1,rp(3)       !r[3]=c1;
 741         or      c_12,c_3,c_12
 742
 743         mulx    a_4,b_0,t_1     !mul_add_c(a[4],b[0],c2,c3,c1);
 744         addcc   c_12,t_1,c_12   !=
 745         clr     c_3
 746         bcs,a   %xcc,.+8
 747         add     c_3,t_2,c_3
 748         mulx    a_3,b_1,t_1     !=!mul_add_c(a[3],b[1],c2,c3,c1);
 749         addcc   c_12,t_1,c_12
 750         bcs,a   %xcc,.+8
 751         add     c_3,t_2,c_3
 752         mulx    a_2,b_2,t_1     !=!mul_add_c(a[2],b[2],c2,c3,c1);
 753         addcc   c_12,t_1,c_12
 754         bcs,a   %xcc,.+8
 755         add     c_3,t_2,c_3
 756         lduw    bp(4),b_4       !=
 757         mulx    a_1,b_3,t_1     !mul_add_c(a[1],b[3],c2,c3,c1);
 758         addcc   c_12,t_1,c_12
 759         bcs,a   %xcc,.+8
 760         add     c_3,t_2,c_3     !=
 761         lduw    bp(5),b_5
 762         mulx    a_0,b_4,t_1     !mul_add_c(a[0],b[4],c2,c3,c1);
 763         addcc   c_12,t_1,t_1
 764         bcs,a   %xcc,.+8        !=
 765         add     c_3,t_2,c_3
 766         srlx    t_1,32,c_12
 767         stuw    t_1,rp(4)       !r[4]=c2;
 768         or      c_12,c_3,c_12   !=
 769
 770         mulx    a_0,b_5,t_1     !mul_add_c(a[0],b[5],c3,c1,c2);
 771         addcc   c_12,t_1,c_12
 772         clr     c_3
 773         bcs,a   %xcc,.+8        !=
 774         add     c_3,t_2,c_3
 775         mulx    a_1,b_4,t_1     !mul_add_c(a[1],b[4],c3,c1,c2);
 776         addcc   c_12,t_1,c_12
 777         bcs,a   %xcc,.+8        !=
 778         add     c_3,t_2,c_3
 779         mulx    a_2,b_3,t_1     !mul_add_c(a[2],b[3],c3,c1,c2);
 780         addcc   c_12,t_1,c_12
 781         bcs,a   %xcc,.+8        !=
 782         add     c_3,t_2,c_3
 783         mulx    a_3,b_2,t_1     !mul_add_c(a[3],b[2],c3,c1,c2);
 784         addcc   c_12,t_1,c_12
 785         bcs,a   %xcc,.+8        !=
 786         add     c_3,t_2,c_3
 787         lduw    ap(5),a_5
 788         mulx    a_4,b_1,t_1     !mul_add_c(a[4],b[1],c3,c1,c2);
 789         addcc   c_12,t_1,c_12   !=
 790         bcs,a   %xcc,.+8
 791         add     c_3,t_2,c_3
 792         lduw    ap(6),a_6
 793         mulx    a_5,b_0,t_1     !=!mul_add_c(a[5],b[0],c3,c1,c2);
 794         addcc   c_12,t_1,t_1
 795         bcs,a   %xcc,.+8
 796         add     c_3,t_2,c_3
 797         srlx    t_1,32,c_12     !=
 798         stuw    t_1,rp(5)       !r[5]=c3;
 799         or      c_12,c_3,c_12
 800
 801         mulx    a_6,b_0,t_1     !mul_add_c(a[6],b[0],c1,c2,c3);
 802         addcc   c_12,t_1,c_12   !=
 803         clr     c_3
 804         bcs,a   %xcc,.+8
 805         add     c_3,t_2,c_3
 806         mulx    a_5,b_1,t_1     !=!mul_add_c(a[5],b[1],c1,c2,c3);
 807         addcc   c_12,t_1,c_12
 808         bcs,a   %xcc,.+8
 809         add     c_3,t_2,c_3
 810         mulx    a_4,b_2,t_1     !=!mul_add_c(a[4],b[2],c1,c2,c3);
 811         addcc   c_12,t_1,c_12
 812         bcs,a   %xcc,.+8
 813         add     c_3,t_2,c_3
 814         mulx    a_3,b_3,t_1     !=!mul_add_c(a[3],b[3],c1,c2,c3);
 815         addcc   c_12,t_1,c_12
 816         bcs,a   %xcc,.+8
 817         add     c_3,t_2,c_3
 818         mulx    a_2,b_4,t_1     !=!mul_add_c(a[2],b[4],c1,c2,c3);
 819         addcc   c_12,t_1,c_12
 820         bcs,a   %xcc,.+8
 821         add     c_3,t_2,c_3
 822         lduw    bp(6),b_6       !=
 823         mulx    a_1,b_5,t_1     !mul_add_c(a[1],b[5],c1,c2,c3);
 824         addcc   c_12,t_1,c_12
 825         bcs,a   %xcc,.+8
 826         add     c_3,t_2,c_3     !=
 827         lduw    bp(7),b_7
 828         mulx    a_0,b_6,t_1     !mul_add_c(a[0],b[6],c1,c2,c3);
 829         addcc   c_12,t_1,t_1
 830         bcs,a   %xcc,.+8        !=
 831         add     c_3,t_2,c_3
 832         srlx    t_1,32,c_12
 833         stuw    t_1,rp(6)       !r[6]=c1;
 834         or      c_12,c_3,c_12   !=
 835
 836         mulx    a_0,b_7,t_1     !mul_add_c(a[0],b[7],c2,c3,c1);
 837         addcc   c_12,t_1,c_12
 838         clr     c_3
 839         bcs,a   %xcc,.+8        !=
 840         add     c_3,t_2,c_3
 841         mulx    a_1,b_6,t_1     !mul_add_c(a[1],b[6],c2,c3,c1);
 842         addcc   c_12,t_1,c_12
 843         bcs,a   %xcc,.+8        !=
 844         add     c_3,t_2,c_3
 845         mulx    a_2,b_5,t_1     !mul_add_c(a[2],b[5],c2,c3,c1);
 846         addcc   c_12,t_1,c_12
 847         bcs,a   %xcc,.+8        !=
 848         add     c_3,t_2,c_3
 849         mulx    a_3,b_4,t_1     !mul_add_c(a[3],b[4],c2,c3,c1);
 850         addcc   c_12,t_1,c_12
 851         bcs,a   %xcc,.+8        !=
 852         add     c_3,t_2,c_3
 853         mulx    a_4,b_3,t_1     !mul_add_c(a[4],b[3],c2,c3,c1);
 854         addcc   c_12,t_1,c_12
 855         bcs,a   %xcc,.+8        !=
 856         add     c_3,t_2,c_3
 857         mulx    a_5,b_2,t_1     !mul_add_c(a[5],b[2],c2,c3,c1);
 858         addcc   c_12,t_1,c_12
 859         bcs,a   %xcc,.+8        !=
 860         add     c_3,t_2,c_3
 861         lduw    ap(7),a_7
 862         mulx    a_6,b_1,t_1     !=!mul_add_c(a[6],b[1],c2,c3,c1);
 863         addcc   c_12,t_1,c_12
 864         bcs,a   %xcc,.+8
 865         add     c_3,t_2,c_3
 866         mulx    a_7,b_0,t_1     !=!mul_add_c(a[7],b[0],c2,c3,c1);
 867         addcc   c_12,t_1,t_1
 868         bcs,a   %xcc,.+8
 869         add     c_3,t_2,c_3
 870         srlx    t_1,32,c_12     !=
 871         stuw    t_1,rp(7)       !r[7]=c2;
 872         or      c_12,c_3,c_12
 873
 874         mulx    a_7,b_1,t_1     !=!mul_add_c(a[7],b[1],c3,c1,c2);
 875         addcc   c_12,t_1,c_12
 876         clr     c_3
 877         bcs,a   %xcc,.+8
 878         add     c_3,t_2,c_3     !=
 879         mulx    a_6,b_2,t_1     !mul_add_c(a[6],b[2],c3,c1,c2);
 880         addcc   c_12,t_1,c_12
 881         bcs,a   %xcc,.+8
 882         add     c_3,t_2,c_3     !=
 883         mulx    a_5,b_3,t_1     !mul_add_c(a[5],b[3],c3,c1,c2);
 884         addcc   c_12,t_1,c_12
 885         bcs,a   %xcc,.+8
 886         add     c_3,t_2,c_3     !=
 887         mulx    a_4,b_4,t_1     !mul_add_c(a[4],b[4],c3,c1,c2);
 888         addcc   c_12,t_1,c_12
 889         bcs,a   %xcc,.+8
 890         add     c_3,t_2,c_3     !=
 891         mulx    a_3,b_5,t_1     !mul_add_c(a[3],b[5],c3,c1,c2);
 892         addcc   c_12,t_1,c_12
 893         bcs,a   %xcc,.+8
 894         add     c_3,t_2,c_3     !=
 895         mulx    a_2,b_6,t_1     !mul_add_c(a[2],b[6],c3,c1,c2);
 896         addcc   c_12,t_1,c_12
 897         bcs,a   %xcc,.+8
 898         add     c_3,t_2,c_3     !=
 899         mulx    a_1,b_7,t_1     !mul_add_c(a[1],b[7],c3,c1,c2);
 900         addcc   c_12,t_1,t_1
 901         bcs,a   %xcc,.+8
 902         add     c_3,t_2,c_3     !=
 903         srlx    t_1,32,c_12
 904         stuw    t_1,rp(8)       !r[8]=c3;
 905         or      c_12,c_3,c_12
 906
 907         mulx    a_2,b_7,t_1     !=!mul_add_c(a[2],b[7],c1,c2,c3);
 908         addcc   c_12,t_1,c_12
 909         clr     c_3
 910         bcs,a   %xcc,.+8
 911         add     c_3,t_2,c_3     !=
 912         mulx    a_3,b_6,t_1     !mul_add_c(a[3],b[6],c1,c2,c3);
 913         addcc   c_12,t_1,c_12
 914         bcs,a   %xcc,.+8        !=
 915         add     c_3,t_2,c_3
 916         mulx    a_4,b_5,t_1     !mul_add_c(a[4],b[5],c1,c2,c3);
 917         addcc   c_12,t_1,c_12
 918         bcs,a   %xcc,.+8        !=
 919         add     c_3,t_2,c_3
 920         mulx    a_5,b_4,t_1     !mul_add_c(a[5],b[4],c1,c2,c3);
 921         addcc   c_12,t_1,c_12
 922         bcs,a   %xcc,.+8        !=
 923         add     c_3,t_2,c_3
 924         mulx    a_6,b_3,t_1     !mul_add_c(a[6],b[3],c1,c2,c3);
 925         addcc   c_12,t_1,c_12
 926         bcs,a   %xcc,.+8        !=
 927         add     c_3,t_2,c_3
 928         mulx    a_7,b_2,t_1     !mul_add_c(a[7],b[2],c1,c2,c3);
 929         addcc   c_12,t_1,t_1
 930         bcs,a   %xcc,.+8        !=
 931         add     c_3,t_2,c_3
 932         srlx    t_1,32,c_12
 933         stuw    t_1,rp(9)       !r[9]=c1;
 934         or      c_12,c_3,c_12   !=
 935
 936         mulx    a_7,b_3,t_1     !mul_add_c(a[7],b[3],c2,c3,c1);
 937         addcc   c_12,t_1,c_12
 938         clr     c_3
 939         bcs,a   %xcc,.+8        !=
 940         add     c_3,t_2,c_3
 941         mulx    a_6,b_4,t_1     !mul_add_c(a[6],b[4],c2,c3,c1);
 942         addcc   c_12,t_1,c_12
 943         bcs,a   %xcc,.+8        !=
 944         add     c_3,t_2,c_3
 945         mulx    a_5,b_5,t_1     !mul_add_c(a[5],b[5],c2,c3,c1);
 946         addcc   c_12,t_1,c_12
 947         bcs,a   %xcc,.+8        !=
 948         add     c_3,t_2,c_3
 949         mulx    a_4,b_6,t_1     !mul_add_c(a[4],b[6],c2,c3,c1);
 950         addcc   c_12,t_1,c_12
 951         bcs,a   %xcc,.+8        !=
 952         add     c_3,t_2,c_3
 953         mulx    a_3,b_7,t_1     !mul_add_c(a[3],b[7],c2,c3,c1);
 954         addcc   c_12,t_1,t_1
 955         bcs,a   %xcc,.+8        !=
 956         add     c_3,t_2,c_3
 957         srlx    t_1,32,c_12
 958         stuw    t_1,rp(10)      !r[10]=c2;
 959         or      c_12,c_3,c_12   !=
 960
 961         mulx    a_4,b_7,t_1     !mul_add_c(a[4],b[7],c3,c1,c2);
 962         addcc   c_12,t_1,c_12
 963         clr     c_3
 964         bcs,a   %xcc,.+8        !=
 965         add     c_3,t_2,c_3
 966         mulx    a_5,b_6,t_1     !mul_add_c(a[5],b[6],c3,c1,c2);
 967         addcc   c_12,t_1,c_12
 968         bcs,a   %xcc,.+8        !=
 969         add     c_3,t_2,c_3
 970         mulx    a_6,b_5,t_1     !mul_add_c(a[6],b[5],c3,c1,c2);
 971         addcc   c_12,t_1,c_12
 972         bcs,a   %xcc,.+8        !=
 973         add     c_3,t_2,c_3
 974         mulx    a_7,b_4,t_1     !mul_add_c(a[7],b[4],c3,c1,c2);
 975         addcc   c_12,t_1,t_1
 976         bcs,a   %xcc,.+8        !=
 977         add     c_3,t_2,c_3
 978         srlx    t_1,32,c_12
 979         stuw    t_1,rp(11)      !r[11]=c3;
 980         or      c_12,c_3,c_12   !=
 981
 982         mulx    a_7,b_5,t_1     !mul_add_c(a[7],b[5],c1,c2,c3);
 983         addcc   c_12,t_1,c_12
 984         clr     c_3
 985         bcs,a   %xcc,.+8        !=
 986         add     c_3,t_2,c_3
 987         mulx    a_6,b_6,t_1     !mul_add_c(a[6],b[6],c1,c2,c3);
 988         addcc   c_12,t_1,c_12
 989         bcs,a   %xcc,.+8        !=
 990         add     c_3,t_2,c_3
 991         mulx    a_5,b_7,t_1     !mul_add_c(a[5],b[7],c1,c2,c3);
 992         addcc   c_12,t_1,t_1
 993         bcs,a   %xcc,.+8        !=
 994         add     c_3,t_2,c_3
 995         srlx    t_1,32,c_12
 996         stuw    t_1,rp(12)      !r[12]=c1;
 997         or      c_12,c_3,c_12   !=
 998
 999         mulx    a_6,b_7,t_1     !mul_add_c(a[6],b[7],c2,c3,c1);
1000         addcc   c_12,t_1,c_12
1001         clr     c_3
1002         bcs,a   %xcc,.+8        !=
1003         add     c_3,t_2,c_3
1004         mulx    a_7,b_6,t_1     !mul_add_c(a[7],b[6],c2,c3,c1);
1005         addcc   c_12,t_1,t_1
1006         bcs,a   %xcc,.+8        !=
1007         add     c_3,t_2,c_3
1008         srlx    t_1,32,c_12
1009         st      t_1,rp(13)      !r[13]=c2;
1010         or      c_12,c_3,c_12   !=
1011
1012         mulx    a_7,b_7,t_1     !mul_add_c(a[7],b[7],c3,c1,c2);
1013         addcc   c_12,t_1,t_1
1014         srlx    t_1,32,c_12     !=
1015         stuw    t_1,rp(14)      !r[14]=c3;
1016         stuw    c_12,rp(15)     !r[15]=c1;
1017
1018         ret
1019         restore %g0,%g0,%o0     !=
1020
1021 .type   bn_mul_comba8,#function
1022 .size   bn_mul_comba8,(.-bn_mul_comba8)
1023
1024 .align  32
1025
1026 .global bn_mul_comba4
1027 /*
1028  * void bn_mul_comba4(r,a,b)
1029  * BN_ULONG *r,*a,*b;
1030  */
1031 bn_mul_comba4:
1032         save    %sp,FRAME_SIZE,%sp
1033         lduw    ap(0),a_0
1034         mov     1,t_2
1035         lduw    bp(0),b_0
1036         sllx    t_2,32,t_2      !=
1037         lduw    bp(1),b_1
1038         mulx    a_0,b_0,t_1     !mul_add_c(a[0],b[0],c1,c2,c3);
1039         srlx    t_1,32,c_12
1040         stuw    t_1,rp(0)       !=!r[0]=c1;
1041
1042         lduw    ap(1),a_1
1043         mulx    a_0,b_1,t_1     !mul_add_c(a[0],b[1],c2,c3,c1);
1044         addcc   c_12,t_1,c_12
1045         clr     c_3             !=
1046         bcs,a   %xcc,.+8
1047         add     c_3,t_2,c_3
1048         lduw    ap(2),a_2
1049         mulx    a_1,b_0,t_1     !=!mul_add_c(a[1],b[0],c2,c3,c1);
1050         addcc   c_12,t_1,t_1
1051         bcs,a   %xcc,.+8
1052         add     c_3,t_2,c_3
1053         srlx    t_1,32,c_12     !=
1054         stuw    t_1,rp(1)       !r[1]=c2;
1055         or      c_12,c_3,c_12
1056
1057         mulx    a_2,b_0,t_1     !mul_add_c(a[2],b[0],c3,c1,c2);
1058         addcc   c_12,t_1,c_12   !=
1059         clr     c_3
1060         bcs,a   %xcc,.+8
1061         add     c_3,t_2,c_3
1062         lduw    bp(2),b_2       !=
1063         mulx    a_1,b_1,t_1     !mul_add_c(a[1],b[1],c3,c1,c2);
1064         addcc   c_12,t_1,c_12
1065         bcs,a   %xcc,.+8
1066         add     c_3,t_2,c_3     !=
1067         lduw    bp(3),b_3
1068         mulx    a_0,b_2,t_1     !mul_add_c(a[0],b[2],c3,c1,c2);
1069         addcc   c_12,t_1,t_1
1070         bcs,a   %xcc,.+8        !=
1071         add     c_3,t_2,c_3
1072         srlx    t_1,32,c_12
1073         stuw    t_1,rp(2)       !r[2]=c3;
1074         or      c_12,c_3,c_12   !=
1075
1076         mulx    a_0,b_3,t_1     !mul_add_c(a[0],b[3],c1,c2,c3);
1077         addcc   c_12,t_1,c_12
1078         clr     c_3
1079         bcs,a   %xcc,.+8        !=
1080         add     c_3,t_2,c_3
1081         mulx    a_1,b_2,t_1     !mul_add_c(a[1],b[2],c1,c2,c3);
1082         addcc   c_12,t_1,c_12
1083         bcs,a   %xcc,.+8        !=
1084         add     c_3,t_2,c_3
1085         lduw    ap(3),a_3
1086         mulx    a_2,b_1,t_1     !mul_add_c(a[2],b[1],c1,c2,c3);
1087         addcc   c_12,t_1,c_12   !=
1088         bcs,a   %xcc,.+8
1089         add     c_3,t_2,c_3
1090         mulx    a_3,b_0,t_1     !mul_add_c(a[3],b[0],c1,c2,c3);!=
1091         addcc   c_12,t_1,t_1    !=
1092         bcs,a   %xcc,.+8
1093         add     c_3,t_2,c_3
1094         srlx    t_1,32,c_12
1095         stuw    t_1,rp(3)       !=!r[3]=c1;
1096         or      c_12,c_3,c_12
1097
1098         mulx    a_3,b_1,t_1     !mul_add_c(a[3],b[1],c2,c3,c1);
1099         addcc   c_12,t_1,c_12
1100         clr     c_3             !=
1101         bcs,a   %xcc,.+8
1102         add     c_3,t_2,c_3
1103         mulx    a_2,b_2,t_1     !mul_add_c(a[2],b[2],c2,c3,c1);
1104         addcc   c_12,t_1,c_12   !=
1105         bcs,a   %xcc,.+8
1106         add     c_3,t_2,c_3
1107         mulx    a_1,b_3,t_1     !mul_add_c(a[1],b[3],c2,c3,c1);
1108         addcc   c_12,t_1,t_1    !=
1109         bcs,a   %xcc,.+8
1110         add     c_3,t_2,c_3
1111         srlx    t_1,32,c_12
1112         stuw    t_1,rp(4)       !=!r[4]=c2;
1113         or      c_12,c_3,c_12
1114
1115         mulx    a_2,b_3,t_1     !mul_add_c(a[2],b[3],c3,c1,c2);
1116         addcc   c_12,t_1,c_12
1117         clr     c_3             !=
1118         bcs,a   %xcc,.+8
1119         add     c_3,t_2,c_3
1120         mulx    a_3,b_2,t_1     !mul_add_c(a[3],b[2],c3,c1,c2);
1121         addcc   c_12,t_1,t_1    !=
1122         bcs,a   %xcc,.+8
1123         add     c_3,t_2,c_3
1124         srlx    t_1,32,c_12
1125         stuw    t_1,rp(5)       !=!r[5]=c3;
1126         or      c_12,c_3,c_12
1127
1128         mulx    a_3,b_3,t_1     !mul_add_c(a[3],b[3],c1,c2,c3);
1129         addcc   c_12,t_1,t_1
1130         srlx    t_1,32,c_12     !=
1131         stuw    t_1,rp(6)       !r[6]=c1;
1132         stuw    c_12,rp(7)      !r[7]=c2;
1133
1134         ret
1135         restore %g0,%g0,%o0
1136
1137 .type   bn_mul_comba4,#function
1138 .size   bn_mul_comba4,(.-bn_mul_comba4)
1139
1140 .align  32
1141
1142 .global bn_sqr_comba8
1143 bn_sqr_comba8:
1144         save    %sp,FRAME_SIZE,%sp
1145         mov     1,t_2
1146         lduw    ap(0),a_0
1147         sllx    t_2,32,t_2
1148         lduw    ap(1),a_1
1149         mulx    a_0,a_0,t_1     !sqr_add_c(a,0,c1,c2,c3);
1150         srlx    t_1,32,c_12
1151         stuw    t_1,rp(0)       !r[0]=c1;
1152
1153         lduw    ap(2),a_2
1154         mulx    a_0,a_1,t_1     !=!sqr_add_c2(a,1,0,c2,c3,c1);
1155         addcc   c_12,t_1,c_12
1156         clr     c_3
1157         bcs,a   %xcc,.+8
1158         add     c_3,t_2,c_3
1159         addcc   c_12,t_1,t_1
1160         bcs,a   %xcc,.+8
1161         add     c_3,t_2,c_3
1162         srlx    t_1,32,c_12
1163         stuw    t_1,rp(1)       !r[1]=c2;
1164         or      c_12,c_3,c_12
1165
1166         mulx    a_2,a_0,t_1     !sqr_add_c2(a,2,0,c3,c1,c2);
1167         addcc   c_12,t_1,c_12
1168         clr     c_3
1169         bcs,a   %xcc,.+8
1170         add     c_3,t_2,c_3
1171         addcc   c_12,t_1,c_12
1172         bcs,a   %xcc,.+8
1173         add     c_3,t_2,c_3
1174         lduw    ap(3),a_3
1175         mulx    a_1,a_1,t_1     !sqr_add_c(a,1,c3,c1,c2);
1176         addcc   c_12,t_1,t_1
1177         bcs,a   %xcc,.+8
1178         add     c_3,t_2,c_3
1179         srlx    t_1,32,c_12
1180         stuw    t_1,rp(2)       !r[2]=c3;
1181         or      c_12,c_3,c_12
1182
1183         mulx    a_0,a_3,t_1     !sqr_add_c2(a,3,0,c1,c2,c3);
1184         addcc   c_12,t_1,c_12
1185         clr     c_3
1186         bcs,a   %xcc,.+8
1187         add     c_3,t_2,c_3
1188         addcc   c_12,t_1,c_12
1189         bcs,a   %xcc,.+8
1190         add     c_3,t_2,c_3
1191         lduw    ap(4),a_4
1192         mulx    a_1,a_2,t_1     !sqr_add_c2(a,2,1,c1,c2,c3);
1193         addcc   c_12,t_1,c_12
1194         bcs,a   %xcc,.+8
1195         add     c_3,t_2,c_3
1196         addcc   c_12,t_1,t_1
1197         bcs,a   %xcc,.+8
1198         add     c_3,t_2,c_3
1199         srlx    t_1,32,c_12
1200         st      t_1,rp(3)       !r[3]=c1;
1201         or      c_12,c_3,c_12
1202
1203         mulx    a_4,a_0,t_1     !sqr_add_c2(a,4,0,c2,c3,c1);
1204         addcc   c_12,t_1,c_12
1205         clr     c_3
1206         bcs,a   %xcc,.+8
1207         add     c_3,t_2,c_3
1208         addcc   c_12,t_1,c_12
1209         bcs,a   %xcc,.+8
1210         add     c_3,t_2,c_3
1211         mulx    a_3,a_1,t_1     !sqr_add_c2(a,3,1,c2,c3,c1);
1212         addcc   c_12,t_1,c_12
1213         bcs,a   %xcc,.+8
1214         add     c_3,t_2,c_3
1215         addcc   c_12,t_1,c_12
1216         bcs,a   %xcc,.+8
1217         add     c_3,t_2,c_3
1218         lduw    ap(5),a_5
1219         mulx    a_2,a_2,t_1     !sqr_add_c(a,2,c2,c3,c1);
1220         addcc   c_12,t_1,t_1
1221         bcs,a   %xcc,.+8
1222         add     c_3,t_2,c_3
1223         srlx    t_1,32,c_12
1224         stuw    t_1,rp(4)       !r[4]=c2;
1225         or      c_12,c_3,c_12
1226
1227         mulx    a_0,a_5,t_1     !sqr_add_c2(a,5,0,c3,c1,c2);
1228         addcc   c_12,t_1,c_12
1229         clr     c_3
1230         bcs,a   %xcc,.+8
1231         add     c_3,t_2,c_3
1232         addcc   c_12,t_1,c_12
1233         bcs,a   %xcc,.+8
1234         add     c_3,t_2,c_3
1235         mulx    a_1,a_4,t_1     !sqr_add_c2(a,4,1,c3,c1,c2);
1236         addcc   c_12,t_1,c_12
1237         bcs,a   %xcc,.+8
1238         add     c_3,t_2,c_3
1239         addcc   c_12,t_1,c_12
1240         bcs,a   %xcc,.+8
1241         add     c_3,t_2,c_3
1242         lduw    ap(6),a_6
1243         mulx    a_2,a_3,t_1     !sqr_add_c2(a,3,2,c3,c1,c2);
1244         addcc   c_12,t_1,c_12
1245         bcs,a   %xcc,.+8
1246         add     c_3,t_2,c_3
1247         addcc   c_12,t_1,t_1
1248         bcs,a   %xcc,.+8
1249         add     c_3,t_2,c_3
1250         srlx    t_1,32,c_12
1251         stuw    t_1,rp(5)       !r[5]=c3;
1252         or      c_12,c_3,c_12
1253
1254         mulx    a_6,a_0,t_1     !sqr_add_c2(a,6,0,c1,c2,c3);
1255         addcc   c_12,t_1,c_12
1256         clr     c_3
1257         bcs,a   %xcc,.+8
1258         add     c_3,t_2,c_3
1259         addcc   c_12,t_1,c_12
1260         bcs,a   %xcc,.+8
1261         add     c_3,t_2,c_3
1262         mulx    a_5,a_1,t_1     !sqr_add_c2(a,5,1,c1,c2,c3);
1263         addcc   c_12,t_1,c_12
1264         bcs,a   %xcc,.+8
1265         add     c_3,t_2,c_3
1266         addcc   c_12,t_1,c_12
1267         bcs,a   %xcc,.+8
1268         add     c_3,t_2,c_3
1269         mulx    a_4,a_2,t_1     !sqr_add_c2(a,4,2,c1,c2,c3);
1270         addcc   c_12,t_1,c_12
1271         bcs,a   %xcc,.+8
1272         add     c_3,t_2,c_3
1273         addcc   c_12,t_1,c_12
1274         bcs,a   %xcc,.+8
1275         add     c_3,t_2,c_3
1276         lduw    ap(7),a_7
1277         mulx    a_3,a_3,t_1     !=!sqr_add_c(a,3,c1,c2,c3);
1278         addcc   c_12,t_1,t_1
1279         bcs,a   %xcc,.+8
1280         add     c_3,t_2,c_3
1281         srlx    t_1,32,c_12
1282         stuw    t_1,rp(6)       !r[6]=c1;
1283         or      c_12,c_3,c_12
1284
1285         mulx    a_0,a_7,t_1     !sqr_add_c2(a,7,0,c2,c3,c1);
1286         addcc   c_12,t_1,c_12
1287         clr     c_3
1288         bcs,a   %xcc,.+8
1289         add     c_3,t_2,c_3
1290         addcc   c_12,t_1,c_12
1291         bcs,a   %xcc,.+8
1292         add     c_3,t_2,c_3
1293         mulx    a_1,a_6,t_1     !sqr_add_c2(a,6,1,c2,c3,c1);
1294         addcc   c_12,t_1,c_12
1295         bcs,a   %xcc,.+8
1296         add     c_3,t_2,c_3
1297         addcc   c_12,t_1,c_12
1298         bcs,a   %xcc,.+8
1299         add     c_3,t_2,c_3
1300         mulx    a_2,a_5,t_1     !sqr_add_c2(a,5,2,c2,c3,c1);
1301         addcc   c_12,t_1,c_12
1302         bcs,a   %xcc,.+8
1303         add     c_3,t_2,c_3
1304         addcc   c_12,t_1,c_12
1305         bcs,a   %xcc,.+8
1306         add     c_3,t_2,c_3
1307         mulx    a_3,a_4,t_1     !sqr_add_c2(a,4,3,c2,c3,c1);
1308         addcc   c_12,t_1,c_12
1309         bcs,a   %xcc,.+8
1310         add     c_3,t_2,c_3
1311         addcc   c_12,t_1,t_1
1312         bcs,a   %xcc,.+8
1313         add     c_3,t_2,c_3
1314         srlx    t_1,32,c_12
1315         stuw    t_1,rp(7)       !r[7]=c2;
1316         or      c_12,c_3,c_12
1317
1318         mulx    a_7,a_1,t_1     !sqr_add_c2(a,7,1,c3,c1,c2);
1319         addcc   c_12,t_1,c_12
1320         clr     c_3
1321         bcs,a   %xcc,.+8
1322         add     c_3,t_2,c_3
1323         addcc   c_12,t_1,c_12
1324         bcs,a   %xcc,.+8
1325         add     c_3,t_2,c_3
1326         mulx    a_6,a_2,t_1     !sqr_add_c2(a,6,2,c3,c1,c2);
1327         addcc   c_12,t_1,c_12
1328         bcs,a   %xcc,.+8
1329         add     c_3,t_2,c_3
1330         addcc   c_12,t_1,c_12
1331         bcs,a   %xcc,.+8
1332         add     c_3,t_2,c_3
1333         mulx    a_5,a_3,t_1     !sqr_add_c2(a,5,3,c3,c1,c2);
1334         addcc   c_12,t_1,c_12
1335         bcs,a   %xcc,.+8
1336         add     c_3,t_2,c_3
1337         addcc   c_12,t_1,c_12
1338         bcs,a   %xcc,.+8
1339         add     c_3,t_2,c_3
1340         mulx    a_4,a_4,t_1     !sqr_add_c(a,4,c3,c1,c2);
1341         addcc   c_12,t_1,t_1
1342         bcs,a   %xcc,.+8
1343         add     c_3,t_2,c_3
1344         srlx    t_1,32,c_12
1345         stuw    t_1,rp(8)       !r[8]=c3;
1346         or      c_12,c_3,c_12
1347
1348         mulx    a_2,a_7,t_1     !sqr_add_c2(a,7,2,c1,c2,c3);
1349         addcc   c_12,t_1,c_12
1350         clr     c_3
1351         bcs,a   %xcc,.+8
1352         add     c_3,t_2,c_3
1353         addcc   c_12,t_1,c_12
1354         bcs,a   %xcc,.+8
1355         add     c_3,t_2,c_3
1356         mulx    a_3,a_6,t_1     !sqr_add_c2(a,6,3,c1,c2,c3);
1357         addcc   c_12,t_1,c_12
1358         bcs,a   %xcc,.+8
1359         add     c_3,t_2,c_3
1360         addcc   c_12,t_1,c_12
1361         bcs,a   %xcc,.+8
1362         add     c_3,t_2,c_3
1363         mulx    a_4,a_5,t_1     !sqr_add_c2(a,5,4,c1,c2,c3);
1364         addcc   c_12,t_1,c_12
1365         bcs,a   %xcc,.+8
1366         add     c_3,t_2,c_3
1367         addcc   c_12,t_1,t_1
1368         bcs,a   %xcc,.+8
1369         add     c_3,t_2,c_3
1370         srlx    t_1,32,c_12
1371         stuw    t_1,rp(9)       !r[9]=c1;
1372         or      c_12,c_3,c_12
1373
1374         mulx    a_7,a_3,t_1     !sqr_add_c2(a,7,3,c2,c3,c1);
1375         addcc   c_12,t_1,c_12
1376         clr     c_3
1377         bcs,a   %xcc,.+8
1378         add     c_3,t_2,c_3
1379         addcc   c_12,t_1,c_12
1380         bcs,a   %xcc,.+8
1381         add     c_3,t_2,c_3
1382         mulx    a_6,a_4,t_1     !sqr_add_c2(a,6,4,c2,c3,c1);
1383         addcc   c_12,t_1,c_12
1384         bcs,a   %xcc,.+8
1385         add     c_3,t_2,c_3
1386         addcc   c_12,t_1,c_12
1387         bcs,a   %xcc,.+8
1388         add     c_3,t_2,c_3
1389         mulx    a_5,a_5,t_1     !sqr_add_c(a,5,c2,c3,c1);
1390         addcc   c_12,t_1,t_1
1391         bcs,a   %xcc,.+8
1392         add     c_3,t_2,c_3
1393         srlx    t_1,32,c_12
1394         stuw    t_1,rp(10)      !r[10]=c2;
1395         or      c_12,c_3,c_12
1396
1397         mulx    a_4,a_7,t_1     !sqr_add_c2(a,7,4,c3,c1,c2);
1398         addcc   c_12,t_1,c_12
1399         clr     c_3
1400         bcs,a   %xcc,.+8
1401         add     c_3,t_2,c_3
1402         addcc   c_12,t_1,c_12
1403         bcs,a   %xcc,.+8
1404         add     c_3,t_2,c_3
1405         mulx    a_5,a_6,t_1     !sqr_add_c2(a,6,5,c3,c1,c2);
1406         addcc   c_12,t_1,c_12
1407         bcs,a   %xcc,.+8
1408         add     c_3,t_2,c_3
1409         addcc   c_12,t_1,t_1
1410         bcs,a   %xcc,.+8
1411         add     c_3,t_2,c_3
1412         srlx    t_1,32,c_12
1413         stuw    t_1,rp(11)      !r[11]=c3;
1414         or      c_12,c_3,c_12
1415
1416         mulx    a_7,a_5,t_1     !sqr_add_c2(a,7,5,c1,c2,c3);
1417         addcc   c_12,t_1,c_12
1418         clr     c_3
1419         bcs,a   %xcc,.+8
1420         add     c_3,t_2,c_3
1421         addcc   c_12,t_1,c_12
1422         bcs,a   %xcc,.+8
1423         add     c_3,t_2,c_3
1424         mulx    a_6,a_6,t_1     !sqr_add_c(a,6,c1,c2,c3);
1425         addcc   c_12,t_1,t_1
1426         bcs,a   %xcc,.+8
1427         add     c_3,t_2,c_3
1428         srlx    t_1,32,c_12
1429         stuw    t_1,rp(12)      !r[12]=c1;
1430         or      c_12,c_3,c_12
1431
1432         mulx    a_6,a_7,t_1     !sqr_add_c2(a,7,6,c2,c3,c1);
1433         addcc   c_12,t_1,c_12
1434         clr     c_3
1435         bcs,a   %xcc,.+8
1436         add     c_3,t_2,c_3
1437         addcc   c_12,t_1,t_1
1438         bcs,a   %xcc,.+8
1439         add     c_3,t_2,c_3
1440         srlx    t_1,32,c_12
1441         stuw    t_1,rp(13)      !r[13]=c2;
1442         or      c_12,c_3,c_12
1443
1444         mulx    a_7,a_7,t_1     !sqr_add_c(a,7,c3,c1,c2);
1445         addcc   c_12,t_1,t_1
1446         srlx    t_1,32,c_12
1447         stuw    t_1,rp(14)      !r[14]=c3;
1448         stuw    c_12,rp(15)     !r[15]=c1;
1449
1450         ret
1451         restore %g0,%g0,%o0
1452
1453 .type   bn_sqr_comba8,#function
1454 .size   bn_sqr_comba8,(.-bn_sqr_comba8)
1455
1456 .align  32
1457
1458 .global bn_sqr_comba4
1459 /*
1460  * void bn_sqr_comba4(r,a)
1461  * BN_ULONG *r,*a;
1462  */
1463 bn_sqr_comba4:
1464         save    %sp,FRAME_SIZE,%sp
1465         mov     1,t_2
1466         lduw    ap(0),a_0
1467         sllx    t_2,32,t_2
1468         lduw    ap(1),a_1
1469         mulx    a_0,a_0,t_1     !sqr_add_c(a,0,c1,c2,c3);
1470         srlx    t_1,32,c_12
1471         stuw    t_1,rp(0)       !r[0]=c1;
1472
1473         lduw    ap(2),a_2
1474         mulx    a_0,a_1,t_1     !sqr_add_c2(a,1,0,c2,c3,c1);
1475         addcc   c_12,t_1,c_12
1476         clr     c_3
1477         bcs,a   %xcc,.+8
1478         add     c_3,t_2,c_3
1479         addcc   c_12,t_1,t_1
1480         bcs,a   %xcc,.+8
1481         add     c_3,t_2,c_3
1482         srlx    t_1,32,c_12
1483         stuw    t_1,rp(1)       !r[1]=c2;
1484         or      c_12,c_3,c_12
1485
1486         mulx    a_2,a_0,t_1     !sqr_add_c2(a,2,0,c3,c1,c2);
1487         addcc   c_12,t_1,c_12
1488         clr     c_3
1489         bcs,a   %xcc,.+8
1490         add     c_3,t_2,c_3
1491         addcc   c_12,t_1,c_12
1492         bcs,a   %xcc,.+8
1493         add     c_3,t_2,c_3
1494         lduw    ap(3),a_3
1495         mulx    a_1,a_1,t_1     !sqr_add_c(a,1,c3,c1,c2);
1496         addcc   c_12,t_1,t_1
1497         bcs,a   %xcc,.+8
1498         add     c_3,t_2,c_3
1499         srlx    t_1,32,c_12
1500         stuw    t_1,rp(2)       !r[2]=c3;
1501         or      c_12,c_3,c_12
1502
1503         mulx    a_0,a_3,t_1     !sqr_add_c2(a,3,0,c1,c2,c3);
1504         addcc   c_12,t_1,c_12
1505         clr     c_3
1506         bcs,a   %xcc,.+8
1507         add     c_3,t_2,c_3
1508         addcc   c_12,t_1,c_12
1509         bcs,a   %xcc,.+8
1510         add     c_3,t_2,c_3
1511         mulx    a_1,a_2,t_1     !sqr_add_c2(a,2,1,c1,c2,c3);
1512         addcc   c_12,t_1,c_12
1513         bcs,a   %xcc,.+8
1514         add     c_3,t_2,c_3
1515         addcc   c_12,t_1,t_1
1516         bcs,a   %xcc,.+8
1517         add     c_3,t_2,c_3
1518         srlx    t_1,32,c_12
1519         stuw    t_1,rp(3)       !r[3]=c1;
1520         or      c_12,c_3,c_12
1521
1522         mulx    a_3,a_1,t_1     !sqr_add_c2(a,3,1,c2,c3,c1);
1523         addcc   c_12,t_1,c_12
1524         clr     c_3
1525         bcs,a   %xcc,.+8
1526         add     c_3,t_2,c_3
1527         addcc   c_12,t_1,c_12
1528         bcs,a   %xcc,.+8
1529         add     c_3,t_2,c_3
1530         mulx    a_2,a_2,t_1     !sqr_add_c(a,2,c2,c3,c1);
1531         addcc   c_12,t_1,t_1
1532         bcs,a   %xcc,.+8
1533         add     c_3,t_2,c_3
1534         srlx    t_1,32,c_12
1535         stuw    t_1,rp(4)       !r[4]=c2;
1536         or      c_12,c_3,c_12
1537
1538         mulx    a_2,a_3,t_1     !sqr_add_c2(a,3,2,c3,c1,c2);
1539         addcc   c_12,t_1,c_12
1540         clr     c_3
1541         bcs,a   %xcc,.+8
1542         add     c_3,t_2,c_3
1543         addcc   c_12,t_1,t_1
1544         bcs,a   %xcc,.+8
1545         add     c_3,t_2,c_3
1546         srlx    t_1,32,c_12
1547         stuw    t_1,rp(5)       !r[5]=c3;
1548         or      c_12,c_3,c_12
1549
1550         mulx    a_3,a_3,t_1     !sqr_add_c(a,3,c1,c2,c3);
1551         addcc   c_12,t_1,t_1
1552         srlx    t_1,32,c_12
1553         stuw    t_1,rp(6)       !r[6]=c1;
1554         stuw    c_12,rp(7)      !r[7]=c2;
1555
1556         ret
1557         restore %g0,%g0,%o0
1558
1559 .type   bn_sqr_comba4,#function
1560 .size   bn_sqr_comba4,(.-bn_sqr_comba4)
1561
1562 .align  32