crypto/bn/asm/sparcv8plus.S

   1 .ident  "sparcv8plus.s, Version 1.4"
   2 .ident  "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
   3
   4 /*
   5  * ====================================================================
   6  * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
   7  * project.
   8  *
   9  * Rights for redistribution and usage in source and binary forms are
  10  * granted according to the OpenSSL license. Warranty of any kind is
  11  * disclaimed.
  12  * ====================================================================
  13  */
  14
  15 /*
  16  * This is my modest contribution to OpenSSL project (see
  17  * http://www.openssl.org/ for more information about it) and is
  18  * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c
  19  * module. For updates see http://fy.chalmers.se/~appro/hpe/.
  20  *
  21  * Questions-n-answers.
  22  *
  23  * Q. How to compile?
  24  * A. With SC4.x/SC5.x:
  25  *
  26  *      cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
  27  *
  28  *    and with gcc:
  29  *
  30  *      gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
  31  *
  32  *    or if above fails (it does if you have gas installed):
  33  *
  34  *      gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
  35  *
  36  *    Quick-n-dirty way to fuse the module into the library.
  37  *    Provided that the library is already configured and built
  38  *    (in 0.9.2 case with no-asm option):
  39  *
  40  *      # cd crypto/bn
  41  *      # cp /some/place/bn_asm.sparc.v8plus.S .
  42  *      # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
  43  *      # make
  44  *      # cd ../..
  45  *      # make; make test
  46  *
  47  *    Quick-n-dirty way to get rid of it:
  48  *
  49  *      # cd crypto/bn
  50  *      # touch bn_asm.c
  51  *      # make
  52  *      # cd ../..
  53  *      # make; make test
  54  *
  55  * Q. V8plus architecture? What kind of beast is that?
  56  * A. Well, it's rather a programming model than an architecture...
  57  *    It's actually v9-compliant, i.e. *any* UltraSPARC, CPU under
  58  *    special conditions, namely when kernel doesn't preserve upper
  59  *    32 bits of otherwise 64-bit registers during a context switch.
  60  *
  61  * Q. Why just UltraSPARC? What about SuperSPARC?
  62  * A. Original release did target UltraSPARC only. Now SuperSPARC
  63  *    version is provided along. Both versions share bn_*comba[48]
  64  *    implementations (see comment later in code for explanation).
  65  *    But what's so special about this UltraSPARC implementation?
  66  *    Why didn't I let compiler do the job? Trouble is that most of
  67  *    available compilers (well, SC5.0 is the only exception) don't
  68  *    attempt to take advantage of UltraSPARC's 64-bitness under
  69  *    32-bit kernels even though it's perfectly possible (see next
  70  *    question).
  71  *
  72  * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
  73  *    doesn't work?
  74  * A. You can't address *all* registers as 64-bit wide:-( The catch is
  75  *    that you actually may rely upon %o0-%o5 and %g1-%g4 being fully
  76  *    preserved if you're in a leaf function, i.e. such never calling
  77  *    any other functions. All functions in this module are leaf and
  78  *    10 registers is a handful. And as a matter of fact non-"comba"
  79  *    routines don't require even that much and I could even afford to
  80  *    not allocate own stack frame for 'em:-)
  81  *
  82  * Q. What about 64-bit kernels?
  83  * A. What about 'em? Just kidding:-) Pure 64-bit version is currently
  84  *    under evaluation and development...
  85  *
  86  * Q. What about shared libraries?
  87  * A. What about 'em? Kidding again:-) Code does *not* contain any
  88  *    code position dependencies and it's safe to include it into
  89  *    shared library as is.
  90  *
  91  * Q. How much faster does it go?
  92  * A. Do you have a good benchmark? In either case below is what I
  93  *    experience with crypto/bn/expspeed.c test program:
  94  *
  95  *      v8plus module on U10/300MHz against bn_asm.c compiled with:
  96  *
  97  *      cc-5.0 -xarch=v8plus -xO5 -xdepend      +7-12%
  98  *      cc-4.2 -xarch=v8plus -xO5 -xdepend      +25-35%
  99  *      egcs-1.1.2 -mcpu=ultrasparc -O3         +35-45%
 100  *
 101  *      v8 module on SS10/60MHz against bn_asm.c compiled with:
 102  *
 103  *      cc-5.0 -xarch=v8 -xO5 -xdepend          +7-10%
 104  *      cc-4.2 -xarch=v8 -xO5 -xdepend          +10%
 105  *      egcs-1.1.2 -mv8 -O3                     +35-45%
 106  *
 107  *    As you can see it's damn hard to beat the new Sun C compiler
 108  *    and it's in first place GNU C users who will appreciate this
 109  *    assembler implementation:-)
 110  */
 111
 112 /*
 113  * Revision history.
 114  *
 115  * 1.0  - initial release;
 116  * 1.1  - new loop unrolling model(*);
 117  *      - some more fine tuning;
 118  * 1.2  - made gas friendly;
 119  *      - updates to documentation concerning v9;
 120  *      - new performance comparison matrix;
 121  * 1.3  - fixed problem with /usr/ccs/lib/cpp;
 122  * 1.4  - native V9 bn_*_comba[48] implementation (15% more efficient)
 123  *        resulting in slight overall performance kick;
 124  *      - some retunes;
 125  *      - support for GNU as added;
 126  *
 127  * (*)  Originally unrolled loop looked like this:
 128  *          for (;;) {
 129  *              op(p+0); if (--n==0) break;
 130  *              op(p+1); if (--n==0) break;
 131  *              op(p+2); if (--n==0) break;
 132  *              op(p+3); if (--n==0) break;
 133  *              p+=4;
 134  *          }
 135  *      I unroll according to following:
 136  *          while (n&~3) {
 137  *              op(p+0); op(p+1); op(p+2); op(p+3);
 138  *              p+=4; n-=4;
 139  *          }
 140  *          if (n) {
 141  *              op(p+0); if (--n==0) return;
 142  *              op(p+1); if (--n==0) return;
 143  *              op(p+2); return;
 144  *          }
 145  */
 146
 147 #if defined(__SUNPRO_C) && defined(__sparcv9)
 148   /* They've said -xarch=v9 at command line: declare %g2/%g3 as scratch, the routines below use them freely */
 149   .register     %g2,#scratch
 150   .register     %g3,#scratch
 151 # define        FRAME_SIZE      -192    /* negative: it is the increment that save applies to %sp */
 152 #elif defined(__GNUC__) && defined(__arch64__)
 153   /* They've said -m64 at command line: same treatment as the Sun compiler case above */
 154   .register     %g2,#scratch
 155   .register     %g3,#scratch
 156 # define        FRAME_SIZE      -192    /* same frame size as the Sun -xarch=v9 case */
 157 #else
 158 # define        FRAME_SIZE      -96     /* all other builds */
 159 #endif
 160 /*
 161  * GNU assembler can't stand stuw:-( so every stuw below is rewritten to st by the preprocessor.
 162  */
 163 #define stuw st
 164
 165 .section        ".text",#alloc,#execinstr
 166 .file           "bn_asm.sparc.v8plus.S"
 167
 168 .align  32
 169
 170 .global bn_mul_add_words
 171 /* rp[i] += ap[i]*w for i in [0,num); returns the carry out of the last word.
 172  * BN_ULONG bn_mul_add_words(rp,ap,num,w)
 173  * BN_ULONG *rp,*ap;     arrive in %o0 (read and written) and %o1 (read only)
 174  * int num;              arrives in %o2; 0 is returned at once when num <= 0
 175  * BN_ULONG w;           arrives in %o3; only its low 32 bits are used
 176  */
 177 bn_mul_add_words:
 178         sra     %o2,%g0,%o2     ! signx %o2
 179         brgz,a  %o2,.L_bn_mul_add_words_proceed ! num > 0: go do the work
 180         lduw    [%o1],%g2       ! annulled delay slot: preload ap[0] only if the branch is taken
 181         retl                    ! num <= 0
 182         clr     %o0             ! delay slot: return 0
 183         nop                     ! padding, never executed
 184         nop
 185         nop
 186
 187 .L_bn_mul_add_words_proceed:
 188         srl     %o3,%g0,%o3     ! clruw %o3
 189         andcc   %o2,-4,%g0      ! Z is set when num < 4
 190         bz,pn   %icc,.L_bn_mul_add_words_tail   ! fewer than four words: tail only
 191         clr     %o5             ! delay slot (always executed): carry = 0
 192
 193 .L_bn_mul_add_words_loop:       ! wow! 32 aligned!
 194         lduw    [%o0],%g1       ! %g1 = rp[0]
 195         lduw    [%o1+4],%g3     ! %g3 = ap[1], loaded one step ahead of its use
 196         mulx    %o3,%g2,%g2     ! %g2 = w*ap[0] (%g2 was preloaded with ap[0])
 197         add     %g1,%o5,%o4
 198         nop
 199         add     %o4,%g2,%o4     ! %o4 = rp[0] + carry + w*ap[0]
 200         stuw    %o4,[%o0]       ! low 32 bits go back to rp[0]
 201         srlx    %o4,32,%o5      ! high 32 bits become the next carry
 202
 203         lduw    [%o0+4],%g1     ! second word of the group, same pattern
 204         lduw    [%o1+8],%g2
 205         mulx    %o3,%g3,%g3
 206         add     %g1,%o5,%o4
 207         dec     4,%o2           ! num -= 4
 208         add     %o4,%g3,%o4
 209         stuw    %o4,[%o0+4]
 210         srlx    %o4,32,%o5
 211
 212         lduw    [%o0+8],%g1     ! third word
 213         lduw    [%o1+12],%g3
 214         mulx    %o3,%g2,%g2
 215         add     %g1,%o5,%o4
 216         inc     16,%o1          ! ap += 4 words
 217         add     %o4,%g2,%o4
 218         stuw    %o4,[%o0+8]
 219         srlx    %o4,32,%o5
 220
 221         lduw    [%o0+12],%g1    ! fourth word
 222         mulx    %o3,%g3,%g3
 223         add     %g1,%o5,%o4
 224         inc     16,%o0          ! rp += 4 words, hence the -4 offset in the store below
 225         add     %o4,%g3,%o4
 226         andcc   %o2,-4,%g0      ! at least four words left?
 227         stuw    %o4,[%o0-4]
 228         srlx    %o4,32,%o5
 229         bnz,a,pt        %icc,.L_bn_mul_add_words_loop
 230         lduw    [%o1],%g2       ! annulled delay slot: preload the next ap[0] when looping
 231
 232         brnz,a,pn       %o2,.L_bn_mul_add_words_tail    ! 1..3 words left over
 233         lduw    [%o1],%g2       ! annulled delay slot: preload ap[0] for the tail
 234 .L_bn_mul_add_words_return:
 235         retl
 236         mov     %o5,%o0         ! delay slot: return the carry
 237
 238 .L_bn_mul_add_words_tail:       ! up to three words; entered with %g2 = ap[0], %o5 = carry
 239         lduw    [%o0],%g1
 240         mulx    %o3,%g2,%g2
 241         add     %g1,%o5,%o4
 242         dec     %o2             ! one word fewer to go
 243         add     %o4,%g2,%o4
 244         srlx    %o4,32,%o5
 245         brz,pt  %o2,.L_bn_mul_add_words_return  ! that was the last word
 246         stuw    %o4,[%o0]       ! delay slot (always executed): store rp[0]
 247
 248         lduw    [%o1+4],%g2     ! second leftover word
 249         lduw    [%o0+4],%g1
 250         mulx    %o3,%g2,%g2
 251         add     %g1,%o5,%o4
 252         dec     %o2
 253         add     %o4,%g2,%o4
 254         srlx    %o4,32,%o5
 255         brz,pt  %o2,.L_bn_mul_add_words_return
 256         stuw    %o4,[%o0+4]     ! delay slot (always executed): store rp[1]
 257
 258         lduw    [%o1+8],%g2     ! third and last leftover word, no count check needed
 259         lduw    [%o0+8],%g1
 260         mulx    %o3,%g2,%g2
 261         add     %g1,%o5,%o4
 262         add     %o4,%g2,%o4
 263         stuw    %o4,[%o0+8]
 264         retl
 265         srlx    %o4,32,%o0      ! delay slot: return the carry directly
 266
 267 .type   bn_mul_add_words,#function
 268 .size   bn_mul_add_words,(.-bn_mul_add_words)
 269
 270 .align  32
 271
 272 .global bn_mul_words
 273 /* rp[i] = low word of (ap[i]*w + carry) for i in [0,num); returns the last carry.
 274  * BN_ULONG bn_mul_words(rp,ap,num,w)
 275  * BN_ULONG *rp,*ap;     arrive in %o0 (written only) and %o1 (read only)
 276  * int num;              arrives in %o2; 0 is returned at once when num <= 0
 277  * BN_ULONG w;           arrives in %o3; only its low 32 bits are used
 278  */
 279 bn_mul_words:
 280         sra     %o2,%g0,%o2     ! signx %o2
 281         brgz,a  %o2,.L_bn_mul_words_proceeed    ! num > 0: go do the work
 282         lduw    [%o1],%g2       ! annulled delay slot: preload ap[0] only if the branch is taken
 283         retl                    ! num <= 0
 284         clr     %o0             ! delay slot: return 0
 285         nop                     ! padding, never executed
 286         nop
 287         nop
 288
 289 .L_bn_mul_words_proceeed:
 290         srl     %o3,%g0,%o3     ! clruw %o3
 291         andcc   %o2,-4,%g0      ! Z is set when num < 4
 292         bz,pn   %icc,.L_bn_mul_words_tail       ! fewer than four words: tail only
 293         clr     %o5             ! delay slot (always executed): carry = 0
 294
 295 .L_bn_mul_words_loop:           ! wow! 32 aligned!
 296         lduw    [%o1+4],%g3     ! %g3 = ap[1], loaded one step ahead of its use
 297         mulx    %o3,%g2,%g2     ! %g2 = w*ap[0] (%g2 was preloaded with ap[0])
 298         add     %g2,%o5,%o4     ! %o4 = product + carry
 299         nop
 300         stuw    %o4,[%o0]       ! low 32 bits go to rp[0]
 301         srlx    %o4,32,%o5      ! high 32 bits become the next carry
 302
 303         lduw    [%o1+8],%g2     ! second word of the group, same pattern
 304         mulx    %o3,%g3,%g3
 305         add     %g3,%o5,%o4
 306         dec     4,%o2           ! num -= 4
 307         stuw    %o4,[%o0+4]
 308         srlx    %o4,32,%o5
 309
 310         lduw    [%o1+12],%g3    ! third word
 311         mulx    %o3,%g2,%g2
 312         add     %g2,%o5,%o4
 313         inc     16,%o1          ! ap += 4 words
 314         stuw    %o4,[%o0+8]
 315         srlx    %o4,32,%o5
 316
 317         mulx    %o3,%g3,%g3     ! fourth word
 318         add     %g3,%o5,%o4
 319         inc     16,%o0          ! rp += 4 words, hence the -4 offset in the store below
 320         stuw    %o4,[%o0-4]
 321         srlx    %o4,32,%o5
 322         andcc   %o2,-4,%g0      ! at least four words left?
 323         bnz,a,pt        %icc,.L_bn_mul_words_loop
 324         lduw    [%o1],%g2       ! annulled delay slot: preload the next ap[0] when looping
 325         nop                     ! executed once on loop exit; no effect
 326         nop
 327
 328         brnz,a,pn       %o2,.L_bn_mul_words_tail        ! 1..3 words left over
 329         lduw    [%o1],%g2       ! annulled delay slot: preload ap[0] for the tail
 330 .L_bn_mul_words_return:
 331         retl
 332         mov     %o5,%o0         ! delay slot: return the carry
 333
 334 .L_bn_mul_words_tail:           ! up to three words; entered with %g2 = ap[0], %o5 = carry
 335         mulx    %o3,%g2,%g2
 336         add     %g2,%o5,%o4
 337         dec     %o2             ! one word fewer to go
 338         srlx    %o4,32,%o5
 339         brz,pt  %o2,.L_bn_mul_words_return      ! that was the last word
 340         stuw    %o4,[%o0]       ! delay slot (always executed): store rp[0]
 341
 342         lduw    [%o1+4],%g2     ! second leftover word
 343         mulx    %o3,%g2,%g2
 344         add     %g2,%o5,%o4
 345         dec     %o2
 346         srlx    %o4,32,%o5
 347         brz,pt  %o2,.L_bn_mul_words_return
 348         stuw    %o4,[%o0+4]     ! delay slot (always executed): store rp[1]
 349
 350         lduw    [%o1+8],%g2     ! third and last leftover word, no count check needed
 351         mulx    %o3,%g2,%g2
 352         add     %g2,%o5,%o4
 353         stuw    %o4,[%o0+8]
 354         retl
 355         srlx    %o4,32,%o0      ! delay slot: return the carry directly
 356
 357 .type   bn_mul_words,#function
 358 .size   bn_mul_words,(.-bn_mul_words)
 359
 360 .align  32
 361 .global bn_sqr_words
 362 /* r[2*i] = low word and r[2*i+1] = high word of a[i]*a[i], for i in [0,n).
 363  * void bn_sqr_words(r,a,n)
 364  * BN_ULONG *r,*a;       arrive in %o0 (written, two words per input word) and %o1 (read only)
 365  * int n;                arrives in %o2; nothing is stored when n <= 0. %o0 is 0 on every return path.
 366  */
 367 bn_sqr_words:
 368         sra     %o2,%g0,%o2     ! signx %o2
 369         brgz,a  %o2,.L_bn_sqr_words_proceeed    ! n > 0: go do the work
 370         lduw    [%o1],%g2       ! annulled delay slot: preload a[0] only if the branch is taken
 371         retl                    ! n <= 0
 372         clr     %o0             ! delay slot: %o0 = 0
 373         nop                     ! padding, never executed
 374         nop
 375         nop
 376
 377 .L_bn_sqr_words_proceeed:
 378         andcc   %o2,-4,%g0      ! Z is set when n < 4
 379         nop
 380         bz,pn   %icc,.L_bn_sqr_words_tail       ! fewer than four words: tail only
 381         nop                     ! delay slot, nothing to do
 382
 383 .L_bn_sqr_words_loop:           ! wow! 32 aligned!
 384         lduw    [%o1+4],%g3     ! %g3 = a[1], loaded one step ahead of its use
 385         mulx    %g2,%g2,%o4     ! %o4 = a[0]*a[0] (%g2 was preloaded with a[0])
 386         stuw    %o4,[%o0]       ! r[0] = low 32 bits
 387         srlx    %o4,32,%o5
 388         stuw    %o5,[%o0+4]     ! r[1] = high 32 bits
 389         nop
 390
 391         lduw    [%o1+8],%g2     ! second word of the group, same pattern
 392         mulx    %g3,%g3,%o4
 393         dec     4,%o2           ! n -= 4
 394         stuw    %o4,[%o0+8]
 395         srlx    %o4,32,%o5
 396         stuw    %o5,[%o0+12]
 397
 398         lduw    [%o1+12],%g3    ! third word
 399         mulx    %g2,%g2,%o4
 400         srlx    %o4,32,%o5
 401         stuw    %o4,[%o0+16]
 402         inc     16,%o1          ! a += 4 words
 403         stuw    %o5,[%o0+20]
 404
 405         mulx    %g3,%g3,%o4     ! fourth word
 406         inc     32,%o0          ! r += 8 words, hence the negative offsets in the stores below
 407         stuw    %o4,[%o0-8]
 408         srlx    %o4,32,%o5
 409         andcc   %o2,-4,%g2      ! at least four words left? %g2 is reloaded before its next use
 410         stuw    %o5,[%o0-4]
 411         bnz,a,pt        %icc,.L_bn_sqr_words_loop
 412         lduw    [%o1],%g2       ! annulled delay slot: preload the next a[0] when looping
 413         nop                     ! executed once on loop exit; no effect
 414
 415         brnz,a,pn       %o2,.L_bn_sqr_words_tail        ! 1..3 words left over
 416         lduw    [%o1],%g2       ! annulled delay slot: preload a[0] for the tail
 417 .L_bn_sqr_words_return:
 418         retl
 419         clr     %o0             ! delay slot: %o0 = 0
 420
 421 .L_bn_sqr_words_tail:           ! up to three words; entered with %g2 = a[0]
 422         mulx    %g2,%g2,%o4
 423         dec     %o2             ! one word fewer to go
 424         stuw    %o4,[%o0]
 425         srlx    %o4,32,%o5
 426         brz,pt  %o2,.L_bn_sqr_words_return      ! that was the last word
 427         stuw    %o5,[%o0+4]     ! delay slot (always executed): store the high half
 428
 429         lduw    [%o1+4],%g2     ! second leftover word
 430         mulx    %g2,%g2,%o4
 431         dec     %o2
 432         stuw    %o4,[%o0+8]
 433         srlx    %o4,32,%o5
 434         brz,pt  %o2,.L_bn_sqr_words_return
 435         stuw    %o5,[%o0+12]    ! delay slot (always executed): store the high half
 436
 437         lduw    [%o1+8],%g2     ! third and last leftover word, no count check needed
 438         mulx    %g2,%g2,%o4
 439         srlx    %o4,32,%o5
 440         stuw    %o4,[%o0+16]
 441         stuw    %o5,[%o0+20]
 442         retl
 443         clr     %o0             ! delay slot: %o0 = 0
 444
 445 .type   bn_sqr_words,#function
 446 .size   bn_sqr_words,(.-bn_sqr_words)
 447
 448 .align  32
 449 .global bn_div_words
 450 /* Returns the low 32 bits of the unsigned 64-bit quotient ((h << 32) | l) / d.
 451  * BN_ULONG bn_div_words(h,l,d)
 452  * BN_ULONG h,l,d;       arrive in %o0, %o1 and %o2; there is no check for d == 0 here
 453  */
 454 bn_div_words:
 455         sllx    %o0,32,%o0      ! h moves to the upper half, lower half becomes 0
 456         or      %o0,%o1,%o0     ! merge in l. NOTE(review): upper half of %o1 is not cleared first - confirm callers pass it clean
 457         udivx   %o0,%o2,%o0     ! 64-bit unsigned divide. NOTE(review): %o2 is likewise used without clruw - confirm
 458         retl
 459         srl     %o0,%g0,%o0     ! clruw %o0 (delay slot): quotient bits above bit 31 are dropped
 460
 461 .type   bn_div_words,#function
 462 .size   bn_div_words,(.-bn_div_words)
 463
 464 .align  32
 465
 466 .global bn_add_words
 467 /* rp[i] = ap[i] + bp[i] + carry for i in [0,n); returns the final carry (0 or 1).
 468  * BN_ULONG bn_add_words(rp,ap,bp,n)
 469  * BN_ULONG *rp,*ap,*bp; arrive in %o0 (written), %o1 and %o2 (both read only)
 470  * int n;                arrives in %o3; 0 is returned at once when n <= 0
 471  */
 472 bn_add_words:
 473         sra     %o3,%g0,%o3     ! signx %o3
 474         brgz,a  %o3,.L_bn_add_words_proceed     ! n > 0: go do the work
 475         lduw    [%o1],%o4       ! annulled delay slot: preload ap[0] only if the branch is taken
 476         retl                    ! n <= 0
 477         clr     %o0             ! delay slot: return 0
 478
 479 .L_bn_add_words_proceed:
 480         andcc   %o3,-4,%g0      ! Z is set when n < 4
 481         bz,pn   %icc,.L_bn_add_words_tail       ! fewer than four words: tail only
 482         addcc   %g0,0,%g0       ! clear carry flag (delay slot, runs on both paths)
 483
 484 .L_bn_add_words_loop:           ! wow! 32 aligned!
 485         dec     4,%o3           ! n -= 4
 486         lduw    [%o2],%o5       ! %o5 = bp[0]; %o4 already holds ap[0]
 487         lduw    [%o1+4],%g1     ! %g1 = ap[1]
 488         lduw    [%o2+4],%g2     ! %g2 = bp[1]
 489         lduw    [%o1+8],%g3     ! %g3 = ap[2]
 490         lduw    [%o2+8],%g4     ! %g4 = bp[2]
 491         addccc  %o5,%o4,%o5     ! ap[0] + bp[0] + carry, carry flag updated
 492         stuw    %o5,[%o0]
 493
 494         lduw    [%o1+12],%o4    ! %o4 = ap[3]
 495         lduw    [%o2+12],%o5    ! %o5 = bp[3]
 496         inc     16,%o1          ! ap += 4 words
 497         addccc  %g1,%g2,%g1     ! second word
 498         stuw    %g1,[%o0+4]
 499
 500         inc     16,%o2          ! bp += 4 words
 501         addccc  %g3,%g4,%g3     ! third word
 502         stuw    %g3,[%o0+8]
 503
 504         inc     16,%o0          ! rp += 4 words, hence the -4 offset in the store below
 505         addccc  %o5,%o4,%o5     ! fourth word
 506         stuw    %o5,[%o0-4]
 507         and     %o3,-4,%g1      ! plain and plus a register branch, so the carry flag is left alone
 508         brnz,a,pt       %g1,.L_bn_add_words_loop        ! at least four words left
 509         lduw    [%o1],%o4       ! annulled delay slot: preload the next ap[0] when looping
 510
 511         brnz,a,pn       %o3,.L_bn_add_words_tail        ! 1..3 words left over
 512         lduw    [%o1],%o4       ! annulled delay slot: preload ap[0] for the tail
 513 .L_bn_add_words_return:
 514         clr     %o0             ! assume no carry
 515         retl
 516         movcs   %icc,1,%o0      ! delay slot: return 1 if the carry flag is set
 517         nop                     ! padding, never executed
 518
 519 .L_bn_add_words_tail:           ! up to three words; entered with %o4 = ap[0] and the running carry in icc
 520         lduw    [%o2],%o5
 521         dec     %o3             ! one word fewer to go
 522         addccc  %o5,%o4,%o5
 523         brz,pt  %o3,.L_bn_add_words_return      ! that was the last word
 524         stuw    %o5,[%o0]       ! delay slot (always executed): store rp[0]
 525
 526         lduw    [%o1+4],%o4     ! second leftover word
 527         lduw    [%o2+4],%o5
 528         dec     %o3
 529         addccc  %o5,%o4,%o5
 530         brz,pt  %o3,.L_bn_add_words_return
 531         stuw    %o5,[%o0+4]     ! delay slot (always executed): store rp[1]
 532
 533         lduw    [%o1+8],%o4     ! third and last leftover word, no count check needed
 534         lduw    [%o2+8],%o5
 535         addccc  %o5,%o4,%o5
 536         stuw    %o5,[%o0+8]
 537         clr     %o0             ! assume no carry
 538         retl
 539         movcs   %icc,1,%o0      ! delay slot: return 1 if the carry flag is set
 540
 541 .type   bn_add_words,#function
 542 .size   bn_add_words,(.-bn_add_words)
 543 .align  32
 544 .global bn_sub_words
 545 /* rp[i] = ap[i] - bp[i] - borrow for i in [0,n); returns the final borrow (0 or 1).
 546  * BN_ULONG bn_sub_words(rp,ap,bp,n)
 547  * BN_ULONG *rp,*ap,*bp; arrive in %o0 (written), %o1 and %o2 (both read only)
 548  * int n;                arrives in %o3; 0 is returned at once when n <= 0
 549  */
 550 bn_sub_words:
 551         sra     %o3,%g0,%o3     ! signx %o3
 552         brgz,a  %o3,.L_bn_sub_words_proceed     ! n > 0: go do the work
 553         lduw    [%o1],%o4       ! annulled delay slot: preload ap[0] only if the branch is taken
 554         retl                    ! n <= 0
 555         clr     %o0             ! delay slot: return 0
 556
 557 .L_bn_sub_words_proceed:
 558         andcc   %o3,-4,%g0      ! Z is set when n < 4
 559         bz,pn   %icc,.L_bn_sub_words_tail       ! fewer than four words: tail only
 560         addcc   %g0,0,%g0       ! clear carry flag (delay slot, runs on both paths)
 561
 562 .L_bn_sub_words_loop:           ! wow! 32 aligned! (eight instructions precede it, so the .align above is what makes this true)
 563         dec     4,%o3           ! n -= 4
 564         lduw    [%o2],%o5       ! %o5 = bp[0]; %o4 already holds ap[0]
 565         lduw    [%o1+4],%g1     ! %g1 = ap[1]
 566         lduw    [%o2+4],%g2     ! %g2 = bp[1]
 567         lduw    [%o1+8],%g3     ! %g3 = ap[2]
 568         lduw    [%o2+8],%g4     ! %g4 = bp[2]
 569         subccc  %o4,%o5,%o5     ! ap[0] - bp[0] - borrow, carry flag updated
 570         stuw    %o5,[%o0]
 571
 572         lduw    [%o1+12],%o4    ! %o4 = ap[3]
 573         lduw    [%o2+12],%o5    ! %o5 = bp[3]
 574         inc     16,%o1          ! ap += 4 words
 575         subccc  %g1,%g2,%g2     ! second word
 576         stuw    %g2,[%o0+4]
 577
 578         inc     16,%o2          ! bp += 4 words
 579         subccc  %g3,%g4,%g4     ! third word
 580         stuw    %g4,[%o0+8]
 581
 582         inc     16,%o0          ! rp += 4 words, hence the -4 offset in the store below
 583         subccc  %o4,%o5,%o5     ! fourth word
 584         stuw    %o5,[%o0-4]
 585         and     %o3,-4,%g1      ! plain and plus a register branch, so the carry flag is left alone
 586         brnz,a,pt       %g1,.L_bn_sub_words_loop        ! at least four words left
 587         lduw    [%o1],%o4       ! annulled delay slot: preload the next ap[0] when looping
 588
 589         brnz,a,pn       %o3,.L_bn_sub_words_tail        ! 1..3 words left over
 590         lduw    [%o1],%o4       ! annulled delay slot: preload ap[0] for the tail
 591 .L_bn_sub_words_return:
 592         clr     %o0             ! assume no borrow
 593         retl
 594         movcs   %icc,1,%o0      ! delay slot: return 1 if the carry (borrow) flag is set
 595         nop                     ! padding, never executed
 596
 597 .L_bn_sub_words_tail:           ! up to three words; entered with %o4 = ap[0] and the running borrow in icc
 598         lduw    [%o2],%o5
 599         dec     %o3             ! one word fewer to go
 600         subccc  %o4,%o5,%o5
 601         brz,pt  %o3,.L_bn_sub_words_return      ! that was the last word
 602         stuw    %o5,[%o0]       ! delay slot (always executed): store rp[0]
 603
 604         lduw    [%o1+4],%o4     ! second leftover word
 605         lduw    [%o2+4],%o5
 606         dec     %o3
 607         subccc  %o4,%o5,%o5
 608         brz,pt  %o3,.L_bn_sub_words_return
 609         stuw    %o5,[%o0+4]     ! delay slot (always executed): store rp[1]
 610
 611         lduw    [%o1+8],%o4     ! third and last leftover word, no count check needed
 612         lduw    [%o2+8],%o5
 613         subccc  %o4,%o5,%o5
 614         stuw    %o5,[%o0+8]
 615         clr     %o0             ! assume no borrow
 616         retl
 617         movcs   %icc,1,%o0      ! delay slot: return 1 if the carry (borrow) flag is set
 618
 619 .type   bn_sub_words,#function
 620 .size   bn_sub_words,(.-bn_sub_words)
 621
 622 /*
 623  * Code below depends on the fact that upper parts of the %l0-%l7
 624  * and %i0-%i7 are zeroed by kernel after context switch. In
 625  * previous versions this comment stated that "the trouble is that
 626  * it's not feasible to implement the mumbo-jumbo in less V9
 627  * instructions:-(" which apparently isn't true thanks to
 628  * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
 629  * results not from the shorter code, but from elimination of
 630  * multicycle non-pairable 'rd %y,%rd' instructions.
 631  *
 632  *                                                      Andy.
 633  */
 634
 635 /*
 636  * Here is register usage map for *all* routines below. Roles noted per line are those seen in bn_mul_comba8.
 637  */
 638 #define t_1     %o0     /* 64-bit product of the current a_i*b_j; also the column sum just before it is stored */
 639 #define t_2     %o1     /* constant 1<<32, added to c_3 whenever a 64-bit add carries out */
 640 #define c_12    %o2     /* running 64-bit column accumulator */
 641 #define c_3     %o3     /* carries out of c_12, kept in units of 1<<32 and or-ed into the next column */
 642
 643 #define ap(I)   [%i1+4*I]       /* word I of the second argument (a) */
 644 #define bp(I)   [%i2+4*I]       /* word I of the third argument (b) */
 645 #define rp(I)   [%i0+4*I]       /* word I of the first argument (r) */
 646
 647 #define a_0     %l0     /* a_N holds a[N] once it has been loaded */
 648 #define a_1     %l1
 649 #define a_2     %l2
 650 #define a_3     %l3
 651 #define a_4     %l4
 652 #define a_5     %l5
 653 #define a_6     %l6
 654 #define a_7     %l7
 655
 656 #define b_0     %i3     /* b_N holds b[N] once it has been loaded */
 657 #define b_1     %i4
 658 #define b_2     %i5
 659 #define b_3     %o4
 660 #define b_4     %o5
 661 #define b_5     %o7
 662 #define b_6     %g1
 663 #define b_7     %g4
 664
 665 .align  32
 666 .global bn_mul_comba8
 667 /*
 668  * void bn_mul_comba8(r,a,b)
 669  * BN_ULONG *r,*a,*b;
 670  */
 671 bn_mul_comba8:
 672         save    %sp,FRAME_SIZE,%sp
 673         mov     1,t_2
 674         lduw    ap(0),a_0
 675         sllx    t_2,32,t_2
 676         lduw    bp(0),b_0       !=
 677         lduw    bp(1),b_1
 678         mulx    a_0,b_0,t_1     !mul_add_c(a[0],b[0],c1,c2,c3);
 679         srlx    t_1,32,c_12
 680         stuw    t_1,rp(0)       !=!r[0]=c1;
 681
 682         lduw    ap(1),a_1
 683         mulx    a_0,b_1,t_1     !mul_add_c(a[0],b[1],c2,c3,c1);
 684         addcc   c_12,t_1,c_12
 685         clr     c_3             !=
 686         bcs,a   %xcc,.+8
 687         add     c_3,t_2,c_3
 688         lduw    ap(2),a_2
 689         mulx    a_1,b_0,t_1     !=!mul_add_c(a[1],b[0],c2,c3,c1);
 690         addcc   c_12,t_1,t_1
 691         bcs,a   %xcc,.+8
 692         add     c_3,t_2,c_3
 693         srlx    t_1,32,c_12     !=
 694         stuw    t_1,rp(1)       !r[1]=c2;
 695         or      c_12,c_3,c_12
 696
 697         mulx    a_2,b_0,t_1     !mul_add_c(a[2],b[0],c3,c1,c2);
 698         addcc   c_12,t_1,c_12   !=
 699         clr     c_3
 700         bcs,a   %xcc,.+8
 701         add     c_3,t_2,c_3
 702         lduw    bp(2),b_2       !=
 703         mulx    a_1,b_1,t_1     !mul_add_c(a[1],b[1],c3,c1,c2);
 704         addcc   c_12,t_1,c_12
 705         bcs,a   %xcc,.+8
 706         add     c_3,t_2,c_3     !=
 707         lduw    bp(3),b_3
 708         mulx    a_0,b_2,t_1     !mul_add_c(a[0],b[2],c3,c1,c2);
 709         addcc   c_12,t_1,t_1
 710         bcs,a   %xcc,.+8        !=
 711         add     c_3,t_2,c_3
 712         srlx    t_1,32,c_12
 713         stuw    t_1,rp(2)       !r[2]=c3;
 714         or      c_12,c_3,c_12   !=
 715
 716         mulx    a_0,b_3,t_1     !mul_add_c(a[0],b[3],c1,c2,c3);
 717         addcc   c_12,t_1,c_12
 718         clr     c_3
 719         bcs,a   %xcc,.+8        !=
 720         add     c_3,t_2,c_3
 721         mulx    a_1,b_2,t_1     !=!mul_add_c(a[1],b[2],c1,c2,c3);
 722         addcc   c_12,t_1,c_12
 723         bcs,a   %xcc,.+8        !=
 724         add     c_3,t_2,c_3
 725         lduw    ap(3),a_3
 726         mulx    a_2,b_1,t_1     !mul_add_c(a[2],b[1],c1,c2,c3);
 727         addcc   c_12,t_1,c_12   !=
 728         bcs,a   %xcc,.+8
 729         add     c_3,t_2,c_3
 730         lduw    ap(4),a_4
 731         mulx    a_3,b_0,t_1     !=!mul_add_c(a[3],b[0],c1,c2,c3);!=
 732         addcc   c_12,t_1,t_1
 733         bcs,a   %xcc,.+8
 734         add     c_3,t_2,c_3
 735         srlx    t_1,32,c_12     !=
 736         stuw    t_1,rp(3)       !r[3]=c1;
 737         or      c_12,c_3,c_12
 738
 739         mulx    a_4,b_0,t_1     !mul_add_c(a[4],b[0],c2,c3,c1);
 740         addcc   c_12,t_1,c_12   !=
 741         clr     c_3
 742         bcs,a   %xcc,.+8
 743         add     c_3,t_2,c_3
 744         mulx    a_3,b_1,t_1     !=!mul_add_c(a[3],b[1],c2,c3,c1);
 745         addcc   c_12,t_1,c_12
 746         bcs,a   %xcc,.+8
 747         add     c_3,t_2,c_3
 748         mulx    a_2,b_2,t_1     !=!mul_add_c(a[2],b[2],c2,c3,c1);
 749         addcc   c_12,t_1,c_12
 750         bcs,a   %xcc,.+8
 751         add     c_3,t_2,c_3
 752         lduw    bp(4),b_4       !=
 753         mulx    a_1,b_3,t_1     !mul_add_c(a[1],b[3],c2,c3,c1);
 754         addcc   c_12,t_1,c_12
 755         bcs,a   %xcc,.+8
 756         add     c_3,t_2,c_3     !=
 757         lduw    bp(5),b_5
 758         mulx    a_0,b_4,t_1     !mul_add_c(a[0],b[4],c2,c3,c1);
 759         addcc   c_12,t_1,t_1
 760         bcs,a   %xcc,.+8        !=
 761         add     c_3,t_2,c_3
 762         srlx    t_1,32,c_12
 763         stuw    t_1,rp(4)       !r[4]=c2;
 764         or      c_12,c_3,c_12   !=
 765
 766         mulx    a_0,b_5,t_1     !mul_add_c(a[0],b[5],c3,c1,c2);
 767         addcc   c_12,t_1,c_12
 768         clr     c_3
 769         bcs,a   %xcc,.+8        !=
 770         add     c_3,t_2,c_3
 771         mulx    a_1,b_4,t_1     !mul_add_c(a[1],b[4],c3,c1,c2);
 772         addcc   c_12,t_1,c_12
 773         bcs,a   %xcc,.+8        !=
 774         add     c_3,t_2,c_3
 775         mulx    a_2,b_3,t_1     !mul_add_c(a[2],b[3],c3,c1,c2);
 776         addcc   c_12,t_1,c_12
 777         bcs,a   %xcc,.+8        !=
 778         add     c_3,t_2,c_3
 779         mulx    a_3,b_2,t_1     !mul_add_c(a[3],b[2],c3,c1,c2);
 780         addcc   c_12,t_1,c_12
 781         bcs,a   %xcc,.+8        !=
 782         add     c_3,t_2,c_3
 783         lduw    ap(5),a_5
 784         mulx    a_4,b_1,t_1     !mul_add_c(a[4],b[1],c3,c1,c2);
 785         addcc   c_12,t_1,c_12   !=
 786         bcs,a   %xcc,.+8
 787         add     c_3,t_2,c_3
 788         lduw    ap(6),a_6
 789         mulx    a_5,b_0,t_1     !=!mul_add_c(a[5],b[0],c3,c1,c2);
 790         addcc   c_12,t_1,t_1
 791         bcs,a   %xcc,.+8
 792         add     c_3,t_2,c_3
 793         srlx    t_1,32,c_12     !=
 794         stuw    t_1,rp(5)       !r[5]=c3;
 795         or      c_12,c_3,c_12
 796
 797         mulx    a_6,b_0,t_1     !mul_add_c(a[6],b[0],c1,c2,c3);
 798         addcc   c_12,t_1,c_12   !=
 799         clr     c_3
 800         bcs,a   %xcc,.+8
 801         add     c_3,t_2,c_3
 802         mulx    a_5,b_1,t_1     !=!mul_add_c(a[5],b[1],c1,c2,c3);
 803         addcc   c_12,t_1,c_12
 804         bcs,a   %xcc,.+8
 805         add     c_3,t_2,c_3
 806         mulx    a_4,b_2,t_1     !=!mul_add_c(a[4],b[2],c1,c2,c3);
 807         addcc   c_12,t_1,c_12
 808         bcs,a   %xcc,.+8
 809         add     c_3,t_2,c_3
 810         mulx    a_3,b_3,t_1     !=!mul_add_c(a[3],b[3],c1,c2,c3);
 811         addcc   c_12,t_1,c_12
 812         bcs,a   %xcc,.+8
 813         add     c_3,t_2,c_3
 814         mulx    a_2,b_4,t_1     !=!mul_add_c(a[2],b[4],c1,c2,c3);
 815         addcc   c_12,t_1,c_12
 816         bcs,a   %xcc,.+8
 817         add     c_3,t_2,c_3
 818         lduw    bp(6),b_6       !=
 819         mulx    a_1,b_5,t_1     !mul_add_c(a[1],b[5],c1,c2,c3);
 820         addcc   c_12,t_1,c_12
 821         bcs,a   %xcc,.+8
 822         add     c_3,t_2,c_3     !=
 823         lduw    bp(7),b_7
 824         mulx    a_0,b_6,t_1     !mul_add_c(a[0],b[6],c1,c2,c3);
 825         addcc   c_12,t_1,t_1
 826         bcs,a   %xcc,.+8        !=
 827         add     c_3,t_2,c_3
 828         srlx    t_1,32,c_12
 829         stuw    t_1,rp(6)       !r[6]=c1;
 830         or      c_12,c_3,c_12   !=
 831
 832         mulx    a_0,b_7,t_1     !mul_add_c(a[0],b[7],c2,c3,c1);
 833         addcc   c_12,t_1,c_12
 834         clr     c_3
 835         bcs,a   %xcc,.+8        !=
 836         add     c_3,t_2,c_3
 837         mulx    a_1,b_6,t_1     !mul_add_c(a[1],b[6],c2,c3,c1);
 838         addcc   c_12,t_1,c_12
 839         bcs,a   %xcc,.+8        !=
 840         add     c_3,t_2,c_3
 841         mulx    a_2,b_5,t_1     !mul_add_c(a[2],b[5],c2,c3,c1);
 842         addcc   c_12,t_1,c_12
 843         bcs,a   %xcc,.+8        !=
 844         add     c_3,t_2,c_3
 845         mulx    a_3,b_4,t_1     !mul_add_c(a[3],b[4],c2,c3,c1);
 846         addcc   c_12,t_1,c_12
 847         bcs,a   %xcc,.+8        !=
 848         add     c_3,t_2,c_3
 849         mulx    a_4,b_3,t_1     !mul_add_c(a[4],b[3],c2,c3,c1);
 850         addcc   c_12,t_1,c_12
 851         bcs,a   %xcc,.+8        !=
 852         add     c_3,t_2,c_3
 853         mulx    a_5,b_2,t_1     !mul_add_c(a[5],b[2],c2,c3,c1);
 854         addcc   c_12,t_1,c_12
 855         bcs,a   %xcc,.+8        !=
 856         add     c_3,t_2,c_3
 857         lduw    ap(7),a_7
 858         mulx    a_6,b_1,t_1     !=!mul_add_c(a[6],b[1],c2,c3,c1);
 859         addcc   c_12,t_1,c_12
 860         bcs,a   %xcc,.+8
 861         add     c_3,t_2,c_3
 862         mulx    a_7,b_0,t_1     !=!mul_add_c(a[7],b[0],c2,c3,c1);
 863         addcc   c_12,t_1,t_1
 864         bcs,a   %xcc,.+8
 865         add     c_3,t_2,c_3
 866         srlx    t_1,32,c_12     !=
 867         stuw    t_1,rp(7)       !r[7]=c2;
 868         or      c_12,c_3,c_12
 869
 870         mulx    a_7,b_1,t_1     !=!mul_add_c(a[7],b[1],c3,c1,c2);
 871         addcc   c_12,t_1,c_12
 872         clr     c_3
 873         bcs,a   %xcc,.+8
 874         add     c_3,t_2,c_3     !=
 875         mulx    a_6,b_2,t_1     !mul_add_c(a[6],b[2],c3,c1,c2);
 876         addcc   c_12,t_1,c_12
 877         bcs,a   %xcc,.+8
 878         add     c_3,t_2,c_3     !=
 879         mulx    a_5,b_3,t_1     !mul_add_c(a[5],b[3],c3,c1,c2);
 880         addcc   c_12,t_1,c_12
 881         bcs,a   %xcc,.+8
 882         add     c_3,t_2,c_3     !=
 883         mulx    a_4,b_4,t_1     !mul_add_c(a[4],b[4],c3,c1,c2);
 884         addcc   c_12,t_1,c_12
 885         bcs,a   %xcc,.+8
 886         add     c_3,t_2,c_3     !=
 887         mulx    a_3,b_5,t_1     !mul_add_c(a[3],b[5],c3,c1,c2);
 888         addcc   c_12,t_1,c_12
 889         bcs,a   %xcc,.+8
 890         add     c_3,t_2,c_3     !=
 891         mulx    a_2,b_6,t_1     !mul_add_c(a[2],b[6],c3,c1,c2);
 892         addcc   c_12,t_1,c_12
 893         bcs,a   %xcc,.+8
 894         add     c_3,t_2,c_3     !=
 895         mulx    a_1,b_7,t_1     !mul_add_c(a[1],b[7],c3,c1,c2);
 896         addcc   c_12,t_1,t_1
 897         bcs,a   %xcc,.+8
 898         add     c_3,t_2,c_3     !=
 899         srlx    t_1,32,c_12
 900         stuw    t_1,rp(8)       !r[8]=c3;
 901         or      c_12,c_3,c_12
 902
 903         mulx    a_2,b_7,t_1     !=!mul_add_c(a[2],b[7],c1,c2,c3);
 904         addcc   c_12,t_1,c_12
 905         clr     c_3
 906         bcs,a   %xcc,.+8
 907         add     c_3,t_2,c_3     !=
 908         mulx    a_3,b_6,t_1     !mul_add_c(a[3],b[6],c1,c2,c3);
 909         addcc   c_12,t_1,c_12
 910         bcs,a   %xcc,.+8        !=
 911         add     c_3,t_2,c_3
 912         mulx    a_4,b_5,t_1     !mul_add_c(a[4],b[5],c1,c2,c3);
 913         addcc   c_12,t_1,c_12
 914         bcs,a   %xcc,.+8        !=
 915         add     c_3,t_2,c_3
 916         mulx    a_5,b_4,t_1     !mul_add_c(a[5],b[4],c1,c2,c3);
 917         addcc   c_12,t_1,c_12
 918         bcs,a   %xcc,.+8        !=
 919         add     c_3,t_2,c_3
 920         mulx    a_6,b_3,t_1     !mul_add_c(a[6],b[3],c1,c2,c3);
 921         addcc   c_12,t_1,c_12
 922         bcs,a   %xcc,.+8        !=
 923         add     c_3,t_2,c_3
 924         mulx    a_7,b_2,t_1     !mul_add_c(a[7],b[2],c1,c2,c3);
 925         addcc   c_12,t_1,t_1
 926         bcs,a   %xcc,.+8        !=
 927         add     c_3,t_2,c_3
 928         srlx    t_1,32,c_12
 929         stuw    t_1,rp(9)       !r[9]=c1;
 930         or      c_12,c_3,c_12   !=
 931
 932         mulx    a_7,b_3,t_1     !mul_add_c(a[7],b[3],c2,c3,c1);
 933         addcc   c_12,t_1,c_12
 934         clr     c_3
 935         bcs,a   %xcc,.+8        !=
 936         add     c_3,t_2,c_3
 937         mulx    a_6,b_4,t_1     !mul_add_c(a[6],b[4],c2,c3,c1);
 938         addcc   c_12,t_1,c_12
 939         bcs,a   %xcc,.+8        !=
 940         add     c_3,t_2,c_3
 941         mulx    a_5,b_5,t_1     !mul_add_c(a[5],b[5],c2,c3,c1);
 942         addcc   c_12,t_1,c_12
 943         bcs,a   %xcc,.+8        !=
 944         add     c_3,t_2,c_3
 945         mulx    a_4,b_6,t_1     !mul_add_c(a[4],b[6],c2,c3,c1);
 946         addcc   c_12,t_1,c_12
 947         bcs,a   %xcc,.+8        !=
 948         add     c_3,t_2,c_3
 949         mulx    a_3,b_7,t_1     !mul_add_c(a[3],b[7],c2,c3,c1);
 950         addcc   c_12,t_1,t_1
 951         bcs,a   %xcc,.+8        !=
 952         add     c_3,t_2,c_3
 953         srlx    t_1,32,c_12
 954         stuw    t_1,rp(10)      !r[10]=c2;
 955         or      c_12,c_3,c_12   !=
 956
 957         mulx    a_4,b_7,t_1     !mul_add_c(a[4],b[7],c3,c1,c2);
 958         addcc   c_12,t_1,c_12
 959         clr     c_3
 960         bcs,a   %xcc,.+8        !=
 961         add     c_3,t_2,c_3
 962         mulx    a_5,b_6,t_1     !mul_add_c(a[5],b[6],c3,c1,c2);
 963         addcc   c_12,t_1,c_12
 964         bcs,a   %xcc,.+8        !=
 965         add     c_3,t_2,c_3
 966         mulx    a_6,b_5,t_1     !mul_add_c(a[6],b[5],c3,c1,c2);
 967         addcc   c_12,t_1,c_12
 968         bcs,a   %xcc,.+8        !=
 969         add     c_3,t_2,c_3
 970         mulx    a_7,b_4,t_1     !mul_add_c(a[7],b[4],c3,c1,c2);
 971         addcc   c_12,t_1,t_1
 972         bcs,a   %xcc,.+8        !=
 973         add     c_3,t_2,c_3
 974         srlx    t_1,32,c_12
 975         stuw    t_1,rp(11)      !r[11]=c3;
 976         or      c_12,c_3,c_12   !=
 977
 978         mulx    a_7,b_5,t_1     !mul_add_c(a[7],b[5],c1,c2,c3);
 979         addcc   c_12,t_1,c_12
 980         clr     c_3
 981         bcs,a   %xcc,.+8        !=
 982         add     c_3,t_2,c_3
 983         mulx    a_6,b_6,t_1     !mul_add_c(a[6],b[6],c1,c2,c3);
 984         addcc   c_12,t_1,c_12
 985         bcs,a   %xcc,.+8        !=
 986         add     c_3,t_2,c_3
 987         mulx    a_5,b_7,t_1     !mul_add_c(a[5],b[7],c1,c2,c3);
 988         addcc   c_12,t_1,t_1
 989         bcs,a   %xcc,.+8        !=
 990         add     c_3,t_2,c_3
 991         srlx    t_1,32,c_12
 992         stuw    t_1,rp(12)      !r[12]=c1;
 993         or      c_12,c_3,c_12   !=
 994
 995         mulx    a_6,b_7,t_1     !mul_add_c(a[6],b[7],c2,c3,c1);
 996         addcc   c_12,t_1,c_12
 997         clr     c_3
 998         bcs,a   %xcc,.+8        !=
 999         add     c_3,t_2,c_3
1000         mulx    a_7,b_6,t_1     !mul_add_c(a[7],b[6],c2,c3,c1);
1001         addcc   c_12,t_1,t_1
1002         bcs,a   %xcc,.+8        !=
1003         add     c_3,t_2,c_3
1004         srlx    t_1,32,c_12
1005         st      t_1,rp(13)      !r[13]=c2;
1006         or      c_12,c_3,c_12   !=
1007
1008         mulx    a_7,b_7,t_1     !mul_add_c(a[7],b[7],c3,c1,c2);
1009         addcc   c_12,t_1,t_1
1010         srlx    t_1,32,c_12     !=
1011         stuw    t_1,rp(14)      !r[14]=c3;
1012         stuw    c_12,rp(15)     !r[15]=c1;
1013
1014         ret
1015         restore %g0,%g0,%o0     !=
1016
1017 .type   bn_mul_comba8,#function
1018 .size   bn_mul_comba8,(.-bn_mul_comba8)
1019
1020 .align  32
1021
1022 .global bn_mul_comba4
1023 /*
1024  * void bn_mul_comba4(r,a,b): stores r[0..7], the product of a[0..3] and b[0..3], one unrolled column per r[k].
1025  * BN_ULONG *r,*a,*b;  Words are 32 bits (lduw/stuw). c_12 is the 64-bit running sum, c_3 counts its carry-outs.
1026  */
1027 bn_mul_comba4:
1028         save    %sp,FRAME_SIZE,%sp
1029         lduw    ap(0),a_0
1030         mov     1,t_2
1031         lduw    bp(0),b_0
1032         sllx    t_2,32,t_2      !=!t_2=1<<32, the unit added to c_3 per carry-out
1033         lduw    bp(1),b_1
1034         mulx    a_0,b_0,t_1     !mul_add_c(a[0],b[0],c1,c2,c3);
1035         srlx    t_1,32,c_12     !high word of the product starts the next column
1036         stuw    t_1,rp(0)       !=!r[0]=c1;
1037 ! column 1: a[0]*b[1] + a[1]*b[0]
1038         lduw    ap(1),a_1
1039         mulx    a_0,b_1,t_1     !mul_add_c(a[0],b[1],c2,c3,c1);
1040         addcc   c_12,t_1,c_12   !64-bit add; a carry out of bit 63 sets the xcc carry flag
1041         clr     c_3             !=!c_3 restarts at zero for every column
1042         bcs,a   %xcc,.+8        !annulled branch: the add below runs only when carry is set
1043         add     c_3,t_2,c_3
1044         lduw    ap(2),a_2       !fetched ahead of its first use in column 2
1045         mulx    a_1,b_0,t_1     !=!mul_add_c(a[1],b[0],c2,c3,c1);
1046         addcc   c_12,t_1,t_1    !last term of the column: the sum goes to t_1
1047         bcs,a   %xcc,.+8
1048         add     c_3,t_2,c_3
1049         srlx    t_1,32,c_12     !=!upper 32 bits move down for the next column
1050         stuw    t_1,rp(1)       !r[1]=c2;
1051         or      c_12,c_3,c_12   !merge the carries recorded at bit 32 and above
1052 ! column 2: a[2]*b[0] + a[1]*b[1] + a[0]*b[2]
1053         mulx    a_2,b_0,t_1     !mul_add_c(a[2],b[0],c3,c1,c2);
1054         addcc   c_12,t_1,c_12   !=
1055         clr     c_3
1056         bcs,a   %xcc,.+8
1057         add     c_3,t_2,c_3
1058         lduw    bp(2),b_2       !=
1059         mulx    a_1,b_1,t_1     !mul_add_c(a[1],b[1],c3,c1,c2);
1060         addcc   c_12,t_1,c_12
1061         bcs,a   %xcc,.+8
1062         add     c_3,t_2,c_3     !=
1063         lduw    bp(3),b_3
1064         mulx    a_0,b_2,t_1     !mul_add_c(a[0],b[2],c3,c1,c2);
1065         addcc   c_12,t_1,t_1
1066         bcs,a   %xcc,.+8        !=
1067         add     c_3,t_2,c_3
1068         srlx    t_1,32,c_12
1069         stuw    t_1,rp(2)       !r[2]=c3;
1070         or      c_12,c_3,c_12   !=
1071 ! column 3: a[0]*b[3] + a[1]*b[2] + a[2]*b[1] + a[3]*b[0]
1072         mulx    a_0,b_3,t_1     !mul_add_c(a[0],b[3],c1,c2,c3);
1073         addcc   c_12,t_1,c_12
1074         clr     c_3
1075         bcs,a   %xcc,.+8        !=
1076         add     c_3,t_2,c_3
1077         mulx    a_1,b_2,t_1     !mul_add_c(a[1],b[2],c1,c2,c3);
1078         addcc   c_12,t_1,c_12
1079         bcs,a   %xcc,.+8        !=
1080         add     c_3,t_2,c_3
1081         lduw    ap(3),a_3
1082         mulx    a_2,b_1,t_1     !mul_add_c(a[2],b[1],c1,c2,c3);
1083         addcc   c_12,t_1,c_12   !=
1084         bcs,a   %xcc,.+8
1085         add     c_3,t_2,c_3
1086         mulx    a_3,b_0,t_1     !mul_add_c(a[3],b[0],c1,c2,c3);!=
1087         addcc   c_12,t_1,t_1    !=
1088         bcs,a   %xcc,.+8
1089         add     c_3,t_2,c_3
1090         srlx    t_1,32,c_12
1091         stuw    t_1,rp(3)       !=!r[3]=c1;
1092         or      c_12,c_3,c_12
1093 ! column 4: a[3]*b[1] + a[2]*b[2] + a[1]*b[3]
1094         mulx    a_3,b_1,t_1     !mul_add_c(a[3],b[1],c2,c3,c1);
1095         addcc   c_12,t_1,c_12
1096         clr     c_3             !=
1097         bcs,a   %xcc,.+8
1098         add     c_3,t_2,c_3
1099         mulx    a_2,b_2,t_1     !mul_add_c(a[2],b[2],c2,c3,c1);
1100         addcc   c_12,t_1,c_12   !=
1101         bcs,a   %xcc,.+8
1102         add     c_3,t_2,c_3
1103         mulx    a_1,b_3,t_1     !mul_add_c(a[1],b[3],c2,c3,c1);
1104         addcc   c_12,t_1,t_1    !=
1105         bcs,a   %xcc,.+8
1106         add     c_3,t_2,c_3
1107         srlx    t_1,32,c_12
1108         stuw    t_1,rp(4)       !=!r[4]=c2;
1109         or      c_12,c_3,c_12
1110 ! column 5: a[2]*b[3] + a[3]*b[2]
1111         mulx    a_2,b_3,t_1     !mul_add_c(a[2],b[3],c3,c1,c2);
1112         addcc   c_12,t_1,c_12
1113         clr     c_3             !=
1114         bcs,a   %xcc,.+8
1115         add     c_3,t_2,c_3
1116         mulx    a_3,b_2,t_1     !mul_add_c(a[3],b[2],c3,c1,c2);
1117         addcc   c_12,t_1,t_1    !=
1118         bcs,a   %xcc,.+8
1119         add     c_3,t_2,c_3
1120         srlx    t_1,32,c_12
1121         stuw    t_1,rp(5)       !=!r[5]=c3;
1122         or      c_12,c_3,c_12
1123 ! column 6: a[3]*b[3]; its low word is r[6], its high word r[7]
1124         mulx    a_3,b_3,t_1     !mul_add_c(a[3],b[3],c1,c2,c3);
1125         addcc   c_12,t_1,t_1    !final term; no carry branch follows this add
1126         srlx    t_1,32,c_12     !=
1127         stuw    t_1,rp(6)       !r[6]=c1;
1128         stuw    c_12,rp(7)      !r[7]=c2;
1129
1130         ret
1131         restore %g0,%g0,%o0     !delay slot of ret: pops the window and writes 0 to the caller's %o0
1132
1133 .type   bn_mul_comba4,#function
1134 .size   bn_mul_comba4,(.-bn_mul_comba4)
1135
1136 .align  32
1137
1138 .global bn_sqr_comba8   !presumably void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a), mirroring bn_sqr_comba4
1139 bn_sqr_comba8:          !stores r[0..15], the square of a[0..7]; 32-bit words, one unrolled column per r[k]
1140         save    %sp,FRAME_SIZE,%sp
1141         mov     1,t_2
1142         lduw    ap(0),a_0
1143         sllx    t_2,32,t_2      !t_2=1<<32, the unit added to c_3 per carry-out
1144         lduw    ap(1),a_1
1145         mulx    a_0,a_0,t_1     !sqr_add_c(a,0,c1,c2,c3);
1146         srlx    t_1,32,c_12     !high word of the product starts the next column
1147         stuw    t_1,rp(0)       !r[0]=c1;
1148 ! column 1: 2*a[0]*a[1] (each cross product is added twice, never shifted)
1149         lduw    ap(2),a_2
1150         mulx    a_0,a_1,t_1     !=!sqr_add_c2(a,1,0,c2,c3,c1);
1151         addcc   c_12,t_1,c_12   !first addition; a carry out of bit 63 sets the xcc carry flag
1152         clr     c_3             !c_3 restarts at zero for every column
1153         bcs,a   %xcc,.+8        !annulled branch: the add below runs only when carry is set
1154         add     c_3,t_2,c_3
1155         addcc   c_12,t_1,t_1    !second addition; last term, so the sum goes to t_1
1156         bcs,a   %xcc,.+8
1157         add     c_3,t_2,c_3
1158         srlx    t_1,32,c_12     !upper 32 bits move down for the next column
1159         stuw    t_1,rp(1)       !r[1]=c2;
1160         or      c_12,c_3,c_12   !merge the carries recorded at bit 32 and above
1161 ! column 2: 2*a[2]*a[0] + a[1]^2
1162         mulx    a_2,a_0,t_1     !sqr_add_c2(a,2,0,c3,c1,c2);
1163         addcc   c_12,t_1,c_12
1164         clr     c_3
1165         bcs,a   %xcc,.+8
1166         add     c_3,t_2,c_3
1167         addcc   c_12,t_1,c_12
1168         bcs,a   %xcc,.+8
1169         add     c_3,t_2,c_3
1170         lduw    ap(3),a_3       !fetched ahead of its first use in column 3
1171         mulx    a_1,a_1,t_1     !sqr_add_c(a,1,c3,c1,c2);
1172         addcc   c_12,t_1,t_1    !square term is added once only
1173         bcs,a   %xcc,.+8
1174         add     c_3,t_2,c_3
1175         srlx    t_1,32,c_12
1176         stuw    t_1,rp(2)       !r[2]=c3;
1177         or      c_12,c_3,c_12
1178 ! column 3: 2*a[0]*a[3] + 2*a[1]*a[2]
1179         mulx    a_0,a_3,t_1     !sqr_add_c2(a,3,0,c1,c2,c3);
1180         addcc   c_12,t_1,c_12
1181         clr     c_3
1182         bcs,a   %xcc,.+8
1183         add     c_3,t_2,c_3
1184         addcc   c_12,t_1,c_12
1185         bcs,a   %xcc,.+8
1186         add     c_3,t_2,c_3
1187         lduw    ap(4),a_4
1188         mulx    a_1,a_2,t_1     !sqr_add_c2(a,2,1,c1,c2,c3);
1189         addcc   c_12,t_1,c_12
1190         bcs,a   %xcc,.+8
1191         add     c_3,t_2,c_3
1192         addcc   c_12,t_1,t_1
1193         bcs,a   %xcc,.+8
1194         add     c_3,t_2,c_3
1195         srlx    t_1,32,c_12
1196         stuw    t_1,rp(3)       !r[3]=c1;
1197         or      c_12,c_3,c_12
1198 ! column 4: 2*a[4]*a[0] + 2*a[3]*a[1] + a[2]^2
1199         mulx    a_4,a_0,t_1     !sqr_add_c2(a,4,0,c2,c3,c1);
1200         addcc   c_12,t_1,c_12
1201         clr     c_3
1202         bcs,a   %xcc,.+8
1203         add     c_3,t_2,c_3
1204         addcc   c_12,t_1,c_12
1205         bcs,a   %xcc,.+8
1206         add     c_3,t_2,c_3
1207         mulx    a_3,a_1,t_1     !sqr_add_c2(a,3,1,c2,c3,c1);
1208         addcc   c_12,t_1,c_12
1209         bcs,a   %xcc,.+8
1210         add     c_3,t_2,c_3
1211         addcc   c_12,t_1,c_12
1212         bcs,a   %xcc,.+8
1213         add     c_3,t_2,c_3
1214         lduw    ap(5),a_5
1215         mulx    a_2,a_2,t_1     !sqr_add_c(a,2,c2,c3,c1);
1216         addcc   c_12,t_1,t_1
1217         bcs,a   %xcc,.+8
1218         add     c_3,t_2,c_3
1219         srlx    t_1,32,c_12
1220         stuw    t_1,rp(4)       !r[4]=c2;
1221         or      c_12,c_3,c_12
1222 ! column 5: 2*a[0]*a[5] + 2*a[1]*a[4] + 2*a[2]*a[3]
1223         mulx    a_0,a_5,t_1     !sqr_add_c2(a,5,0,c3,c1,c2);
1224         addcc   c_12,t_1,c_12
1225         clr     c_3
1226         bcs,a   %xcc,.+8
1227         add     c_3,t_2,c_3
1228         addcc   c_12,t_1,c_12
1229         bcs,a   %xcc,.+8
1230         add     c_3,t_2,c_3
1231         mulx    a_1,a_4,t_1     !sqr_add_c2(a,4,1,c3,c1,c2);
1232         addcc   c_12,t_1,c_12
1233         bcs,a   %xcc,.+8
1234         add     c_3,t_2,c_3
1235         addcc   c_12,t_1,c_12
1236         bcs,a   %xcc,.+8
1237         add     c_3,t_2,c_3
1238         lduw    ap(6),a_6
1239         mulx    a_2,a_3,t_1     !sqr_add_c2(a,3,2,c3,c1,c2);
1240         addcc   c_12,t_1,c_12
1241         bcs,a   %xcc,.+8
1242         add     c_3,t_2,c_3
1243         addcc   c_12,t_1,t_1
1244         bcs,a   %xcc,.+8
1245         add     c_3,t_2,c_3
1246         srlx    t_1,32,c_12
1247         stuw    t_1,rp(5)       !r[5]=c3;
1248         or      c_12,c_3,c_12
1249 ! column 6: 2*a[6]*a[0] + 2*a[5]*a[1] + 2*a[4]*a[2] + a[3]^2
1250         mulx    a_6,a_0,t_1     !sqr_add_c2(a,6,0,c1,c2,c3);
1251         addcc   c_12,t_1,c_12
1252         clr     c_3
1253         bcs,a   %xcc,.+8
1254         add     c_3,t_2,c_3
1255         addcc   c_12,t_1,c_12
1256         bcs,a   %xcc,.+8
1257         add     c_3,t_2,c_3
1258         mulx    a_5,a_1,t_1     !sqr_add_c2(a,5,1,c1,c2,c3);
1259         addcc   c_12,t_1,c_12
1260         bcs,a   %xcc,.+8
1261         add     c_3,t_2,c_3
1262         addcc   c_12,t_1,c_12
1263         bcs,a   %xcc,.+8
1264         add     c_3,t_2,c_3
1265         mulx    a_4,a_2,t_1     !sqr_add_c2(a,4,2,c1,c2,c3);
1266         addcc   c_12,t_1,c_12
1267         bcs,a   %xcc,.+8
1268         add     c_3,t_2,c_3
1269         addcc   c_12,t_1,c_12
1270         bcs,a   %xcc,.+8
1271         add     c_3,t_2,c_3
1272         lduw    ap(7),a_7
1273         mulx    a_3,a_3,t_1     !=!sqr_add_c(a,3,c1,c2,c3);
1274         addcc   c_12,t_1,t_1
1275         bcs,a   %xcc,.+8
1276         add     c_3,t_2,c_3
1277         srlx    t_1,32,c_12
1278         stuw    t_1,rp(6)       !r[6]=c1;
1279         or      c_12,c_3,c_12
1280 ! column 7: 2*a[0]*a[7] + 2*a[1]*a[6] + 2*a[2]*a[5] + 2*a[3]*a[4]
1281         mulx    a_0,a_7,t_1     !sqr_add_c2(a,7,0,c2,c3,c1);
1282         addcc   c_12,t_1,c_12
1283         clr     c_3
1284         bcs,a   %xcc,.+8
1285         add     c_3,t_2,c_3
1286         addcc   c_12,t_1,c_12
1287         bcs,a   %xcc,.+8
1288         add     c_3,t_2,c_3
1289         mulx    a_1,a_6,t_1     !sqr_add_c2(a,6,1,c2,c3,c1);
1290         addcc   c_12,t_1,c_12
1291         bcs,a   %xcc,.+8
1292         add     c_3,t_2,c_3
1293         addcc   c_12,t_1,c_12
1294         bcs,a   %xcc,.+8
1295         add     c_3,t_2,c_3
1296         mulx    a_2,a_5,t_1     !sqr_add_c2(a,5,2,c2,c3,c1);
1297         addcc   c_12,t_1,c_12
1298         bcs,a   %xcc,.+8
1299         add     c_3,t_2,c_3
1300         addcc   c_12,t_1,c_12
1301         bcs,a   %xcc,.+8
1302         add     c_3,t_2,c_3
1303         mulx    a_3,a_4,t_1     !sqr_add_c2(a,4,3,c2,c3,c1);
1304         addcc   c_12,t_1,c_12
1305         bcs,a   %xcc,.+8
1306         add     c_3,t_2,c_3
1307         addcc   c_12,t_1,t_1
1308         bcs,a   %xcc,.+8
1309         add     c_3,t_2,c_3
1310         srlx    t_1,32,c_12
1311         stuw    t_1,rp(7)       !r[7]=c2;
1312         or      c_12,c_3,c_12
1313 ! column 8: 2*a[7]*a[1] + 2*a[6]*a[2] + 2*a[5]*a[3] + a[4]^2
1314         mulx    a_7,a_1,t_1     !sqr_add_c2(a,7,1,c3,c1,c2);
1315         addcc   c_12,t_1,c_12
1316         clr     c_3
1317         bcs,a   %xcc,.+8
1318         add     c_3,t_2,c_3
1319         addcc   c_12,t_1,c_12
1320         bcs,a   %xcc,.+8
1321         add     c_3,t_2,c_3
1322         mulx    a_6,a_2,t_1     !sqr_add_c2(a,6,2,c3,c1,c2);
1323         addcc   c_12,t_1,c_12
1324         bcs,a   %xcc,.+8
1325         add     c_3,t_2,c_3
1326         addcc   c_12,t_1,c_12
1327         bcs,a   %xcc,.+8
1328         add     c_3,t_2,c_3
1329         mulx    a_5,a_3,t_1     !sqr_add_c2(a,5,3,c3,c1,c2);
1330         addcc   c_12,t_1,c_12
1331         bcs,a   %xcc,.+8
1332         add     c_3,t_2,c_3
1333         addcc   c_12,t_1,c_12
1334         bcs,a   %xcc,.+8
1335         add     c_3,t_2,c_3
1336         mulx    a_4,a_4,t_1     !sqr_add_c(a,4,c3,c1,c2);
1337         addcc   c_12,t_1,t_1
1338         bcs,a   %xcc,.+8
1339         add     c_3,t_2,c_3
1340         srlx    t_1,32,c_12
1341         stuw    t_1,rp(8)       !r[8]=c3;
1342         or      c_12,c_3,c_12
1343 ! column 9: 2*a[2]*a[7] + 2*a[3]*a[6] + 2*a[4]*a[5]
1344         mulx    a_2,a_7,t_1     !sqr_add_c2(a,7,2,c1,c2,c3);
1345         addcc   c_12,t_1,c_12
1346         clr     c_3
1347         bcs,a   %xcc,.+8
1348         add     c_3,t_2,c_3
1349         addcc   c_12,t_1,c_12
1350         bcs,a   %xcc,.+8
1351         add     c_3,t_2,c_3
1352         mulx    a_3,a_6,t_1     !sqr_add_c2(a,6,3,c1,c2,c3);
1353         addcc   c_12,t_1,c_12
1354         bcs,a   %xcc,.+8
1355         add     c_3,t_2,c_3
1356         addcc   c_12,t_1,c_12
1357         bcs,a   %xcc,.+8
1358         add     c_3,t_2,c_3
1359         mulx    a_4,a_5,t_1     !sqr_add_c2(a,5,4,c1,c2,c3);
1360         addcc   c_12,t_1,c_12
1361         bcs,a   %xcc,.+8
1362         add     c_3,t_2,c_3
1363         addcc   c_12,t_1,t_1
1364         bcs,a   %xcc,.+8
1365         add     c_3,t_2,c_3
1366         srlx    t_1,32,c_12
1367         stuw    t_1,rp(9)       !r[9]=c1;
1368         or      c_12,c_3,c_12
1369 ! column 10: 2*a[7]*a[3] + 2*a[6]*a[4] + a[5]^2
1370         mulx    a_7,a_3,t_1     !sqr_add_c2(a,7,3,c2,c3,c1);
1371         addcc   c_12,t_1,c_12
1372         clr     c_3
1373         bcs,a   %xcc,.+8
1374         add     c_3,t_2,c_3
1375         addcc   c_12,t_1,c_12
1376         bcs,a   %xcc,.+8
1377         add     c_3,t_2,c_3
1378         mulx    a_6,a_4,t_1     !sqr_add_c2(a,6,4,c2,c3,c1);
1379         addcc   c_12,t_1,c_12
1380         bcs,a   %xcc,.+8
1381         add     c_3,t_2,c_3
1382         addcc   c_12,t_1,c_12
1383         bcs,a   %xcc,.+8
1384         add     c_3,t_2,c_3
1385         mulx    a_5,a_5,t_1     !sqr_add_c(a,5,c2,c3,c1);
1386         addcc   c_12,t_1,t_1
1387         bcs,a   %xcc,.+8
1388         add     c_3,t_2,c_3
1389         srlx    t_1,32,c_12
1390         stuw    t_1,rp(10)      !r[10]=c2;
1391         or      c_12,c_3,c_12
1392 ! column 11: 2*a[4]*a[7] + 2*a[5]*a[6]
1393         mulx    a_4,a_7,t_1     !sqr_add_c2(a,7,4,c3,c1,c2);
1394         addcc   c_12,t_1,c_12
1395         clr     c_3
1396         bcs,a   %xcc,.+8
1397         add     c_3,t_2,c_3
1398         addcc   c_12,t_1,c_12
1399         bcs,a   %xcc,.+8
1400         add     c_3,t_2,c_3
1401         mulx    a_5,a_6,t_1     !sqr_add_c2(a,6,5,c3,c1,c2);
1402         addcc   c_12,t_1,c_12
1403         bcs,a   %xcc,.+8
1404         add     c_3,t_2,c_3
1405         addcc   c_12,t_1,t_1
1406         bcs,a   %xcc,.+8
1407         add     c_3,t_2,c_3
1408         srlx    t_1,32,c_12
1409         stuw    t_1,rp(11)      !r[11]=c3;
1410         or      c_12,c_3,c_12
1411 ! column 12: 2*a[7]*a[5] + a[6]^2
1412         mulx    a_7,a_5,t_1     !sqr_add_c2(a,7,5,c1,c2,c3);
1413         addcc   c_12,t_1,c_12
1414         clr     c_3
1415         bcs,a   %xcc,.+8
1416         add     c_3,t_2,c_3
1417         addcc   c_12,t_1,c_12
1418         bcs,a   %xcc,.+8
1419         add     c_3,t_2,c_3
1420         mulx    a_6,a_6,t_1     !sqr_add_c(a,6,c1,c2,c3);
1421         addcc   c_12,t_1,t_1
1422         bcs,a   %xcc,.+8
1423         add     c_3,t_2,c_3
1424         srlx    t_1,32,c_12
1425         stuw    t_1,rp(12)      !r[12]=c1;
1426         or      c_12,c_3,c_12
1427 ! column 13: 2*a[6]*a[7]
1428         mulx    a_6,a_7,t_1     !sqr_add_c2(a,7,6,c2,c3,c1);
1429         addcc   c_12,t_1,c_12
1430         clr     c_3
1431         bcs,a   %xcc,.+8
1432         add     c_3,t_2,c_3
1433         addcc   c_12,t_1,t_1
1434         bcs,a   %xcc,.+8
1435         add     c_3,t_2,c_3
1436         srlx    t_1,32,c_12
1437         stuw    t_1,rp(13)      !r[13]=c2;
1438         or      c_12,c_3,c_12
1439 ! column 14: a[7]^2; its low word is r[14], its high word r[15]
1440         mulx    a_7,a_7,t_1     !sqr_add_c(a,7,c3,c1,c2);
1441         addcc   c_12,t_1,t_1    !final term; no carry branch follows this add
1442         srlx    t_1,32,c_12
1443         stuw    t_1,rp(14)      !r[14]=c3;
1444         stuw    c_12,rp(15)     !r[15]=c1;
1445
1446         ret
1447         restore %g0,%g0,%o0     !delay slot of ret: pops the window and writes 0 to the caller's %o0
1448
1449 .type   bn_sqr_comba8,#function
1450 .size   bn_sqr_comba8,(.-bn_sqr_comba8)
1451
1452 .align  32
1453
1454 .global bn_sqr_comba4
1455 /*
1456  * void bn_sqr_comba4(r,a): stores r[0..7], the square of a[0..3], one unrolled column per r[k].
1457  * BN_ULONG *r,*a;  Words are 32 bits (lduw/stuw). c_12 is the 64-bit running sum, c_3 counts its carry-outs.
1458  */
1459 bn_sqr_comba4:
1460         save    %sp,FRAME_SIZE,%sp
1461         mov     1,t_2
1462         lduw    ap(0),a_0
1463         sllx    t_2,32,t_2      !t_2=1<<32, the unit added to c_3 per carry-out
1464         lduw    ap(1),a_1
1465         mulx    a_0,a_0,t_1     !sqr_add_c(a,0,c1,c2,c3);
1466         srlx    t_1,32,c_12     !high word of the product starts the next column
1467         stuw    t_1,rp(0)       !r[0]=c1;
1468 ! column 1: 2*a[0]*a[1] (each cross product is added twice, never shifted)
1469         lduw    ap(2),a_2
1470         mulx    a_0,a_1,t_1     !sqr_add_c2(a,1,0,c2,c3,c1);
1471         addcc   c_12,t_1,c_12   !first addition; a carry out of bit 63 sets the xcc carry flag
1472         clr     c_3             !c_3 restarts at zero for every column
1473         bcs,a   %xcc,.+8        !annulled branch: the add below runs only when carry is set
1474         add     c_3,t_2,c_3
1475         addcc   c_12,t_1,t_1    !second addition; last term, so the sum goes to t_1
1476         bcs,a   %xcc,.+8
1477         add     c_3,t_2,c_3
1478         srlx    t_1,32,c_12     !upper 32 bits move down for the next column
1479         stuw    t_1,rp(1)       !r[1]=c2;
1480         or      c_12,c_3,c_12   !merge the carries recorded at bit 32 and above
1481 ! column 2: 2*a[2]*a[0] + a[1]^2
1482         mulx    a_2,a_0,t_1     !sqr_add_c2(a,2,0,c3,c1,c2);
1483         addcc   c_12,t_1,c_12
1484         clr     c_3
1485         bcs,a   %xcc,.+8
1486         add     c_3,t_2,c_3
1487         addcc   c_12,t_1,c_12
1488         bcs,a   %xcc,.+8
1489         add     c_3,t_2,c_3
1490         lduw    ap(3),a_3       !fetched ahead of its first use in column 3
1491         mulx    a_1,a_1,t_1     !sqr_add_c(a,1,c3,c1,c2);
1492         addcc   c_12,t_1,t_1    !square term is added once only
1493         bcs,a   %xcc,.+8
1494         add     c_3,t_2,c_3
1495         srlx    t_1,32,c_12
1496         stuw    t_1,rp(2)       !r[2]=c3;
1497         or      c_12,c_3,c_12
1498 ! column 3: 2*a[0]*a[3] + 2*a[1]*a[2]
1499         mulx    a_0,a_3,t_1     !sqr_add_c2(a,3,0,c1,c2,c3);
1500         addcc   c_12,t_1,c_12
1501         clr     c_3
1502         bcs,a   %xcc,.+8
1503         add     c_3,t_2,c_3
1504         addcc   c_12,t_1,c_12
1505         bcs,a   %xcc,.+8
1506         add     c_3,t_2,c_3
1507         mulx    a_1,a_2,t_1     !sqr_add_c2(a,2,1,c1,c2,c3);
1508         addcc   c_12,t_1,c_12
1509         bcs,a   %xcc,.+8
1510         add     c_3,t_2,c_3
1511         addcc   c_12,t_1,t_1
1512         bcs,a   %xcc,.+8
1513         add     c_3,t_2,c_3
1514         srlx    t_1,32,c_12
1515         stuw    t_1,rp(3)       !r[3]=c1;
1516         or      c_12,c_3,c_12
1517 ! column 4: 2*a[3]*a[1] + a[2]^2
1518         mulx    a_3,a_1,t_1     !sqr_add_c2(a,3,1,c2,c3,c1);
1519         addcc   c_12,t_1,c_12
1520         clr     c_3
1521         bcs,a   %xcc,.+8
1522         add     c_3,t_2,c_3
1523         addcc   c_12,t_1,c_12
1524         bcs,a   %xcc,.+8
1525         add     c_3,t_2,c_3
1526         mulx    a_2,a_2,t_1     !sqr_add_c(a,2,c2,c3,c1);
1527         addcc   c_12,t_1,t_1
1528         bcs,a   %xcc,.+8
1529         add     c_3,t_2,c_3
1530         srlx    t_1,32,c_12
1531         stuw    t_1,rp(4)       !r[4]=c2;
1532         or      c_12,c_3,c_12
1533 ! column 5: 2*a[2]*a[3]
1534         mulx    a_2,a_3,t_1     !sqr_add_c2(a,3,2,c3,c1,c2);
1535         addcc   c_12,t_1,c_12
1536         clr     c_3
1537         bcs,a   %xcc,.+8
1538         add     c_3,t_2,c_3
1539         addcc   c_12,t_1,t_1
1540         bcs,a   %xcc,.+8
1541         add     c_3,t_2,c_3
1542         srlx    t_1,32,c_12
1543         stuw    t_1,rp(5)       !r[5]=c3;
1544         or      c_12,c_3,c_12
1545 ! column 6: a[3]^2; its low word is r[6], its high word r[7]
1546         mulx    a_3,a_3,t_1     !sqr_add_c(a,3,c1,c2,c3);
1547         addcc   c_12,t_1,t_1    !final term; no carry branch follows this add
1548         srlx    t_1,32,c_12
1549         stuw    t_1,rp(6)       !r[6]=c1;
1550         stuw    c_12,rp(7)      !r[7]=c2;
1551
1552         ret
1553         restore %g0,%g0,%o0     !delay slot of ret: pops the window and writes 0 to the caller's %o0
1554
1555 .type   bn_sqr_comba4,#function
1556 .size   bn_sqr_comba4,(.-bn_sqr_comba4)
1557
1558 .align  32