crypto/bn/asm/sparcv8plus.S

   1 .ident  "sparcv8plus.s, Version 1.4"
   2 .ident  "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
   3
   4 /*
   5  * ====================================================================
   6  * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
   7  * project.
   8  *
   9  * Rights for redistribution and usage in source and binary forms are
  10  * granted according to the OpenSSL license. Warranty of any kind is
  11  * disclaimed.
  12  * ====================================================================
  13  */
  14
  15 /*
  16  * This is my modest contribution to OpenSSL project (see
  17  * http://www.openssl.org/ for more information about it) and is
  18  * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c
  19  * module. For updates see http://fy.chalmers.se/~appro/hpe/.
  20  *
  21  * Questions-n-answers.
  22  *
  23  * Q. How to compile?
  24  * A. With SC4.x/SC5.x:
  25  *
  26  *      cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
  27  *
  28  *    and with gcc:
  29  *
  30  *      gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
  31  *
  32  *    or if above fails (it does if you have gas installed):
  33  *
  34  *      gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
  35  *
  36  *    Quick-n-dirty way to fuse the module into the library.
  37  *    Provided that the library is already configured and built
  38  *    (in 0.9.2 case with no-asm option):
  39  *
  40  *      # cd crypto/bn
  41  *      # cp /some/place/bn_asm.sparc.v8plus.S .
  42  *      # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
  43  *      # make
  44  *      # cd ../..
  45  *      # make; make test
  46  *
  47  *    Quick-n-dirty way to get rid of it:
  48  *
  49  *      # cd crypto/bn
  50  *      # touch bn_asm.c
  51  *      # make
  52  *      # cd ../..
  53  *      # make; make test
  54  *
  55  * Q. V8plus architecture? What kind of beast is that?
  56  * A. Well, it's rather a programming model than an architecture...
  57  *    It's actually v9-compliant, i.e. *any* UltraSPARC, CPU under
  58  *    special conditions, namely when kernel doesn't preserve upper
  59  *    32 bits of otherwise 64-bit registers during a context switch.
  60  *
  61  * Q. Why just UltraSPARC? What about SuperSPARC?
  62  * A. Original release did target UltraSPARC only. Now SuperSPARC
  63  *    version is provided along. Both versions share bn_*comba[48]
  64  *    implementations (see comment later in code for explanation).
  65  *    But what's so special about this UltraSPARC implementation?
  66  *    Why didn't I let compiler do the job? Trouble is that most of
  67  *    available compilers (well, SC5.0 is the only exception) don't
  68  *    attempt to take advantage of UltraSPARC's 64-bitness under
  69  *    32-bit kernels even though it's perfectly possible (see next
  70  *    question).
  71  *
  72  * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
  73  *    doesn't work?
  74  * A. You can't address *all* registers as 64-bit wide:-( The catch is
  75  *    that you actually may rely upon %o0-%o5 and %g1-%g4 being fully
  76  *    preserved if you're in a leaf function, i.e. such never calling
  77  *    any other functions. All functions in this module are leaf and
  78  *    10 registers is a handful. And as a matter of fact non-"comba"
  79  *    routines don't require even that much and I could even afford to
  80  *    not allocate own stack frame for 'em:-)
  81  *
  82  * Q. What about 64-bit kernels?
  83  * A. What about 'em? Just kidding:-) Pure 64-bit version is currently
  84  *    under evaluation and development...
  85  *
  86  * Q. What about shared libraries?
  87  * A. What about 'em? Kidding again:-) Code does *not* contain any
  88  *    code position dependencies and it's safe to include it into
  89  *    shared library as is.
  90  *
  91  * Q. How much faster does it go?
  92  * A. Do you have a good benchmark? In either case below is what I
  93  *    experience with crypto/bn/expspeed.c test program:
  94  *
  95  *      v8plus module on U10/300MHz against bn_asm.c compiled with:
  96  *
  97  *      cc-5.0 -xarch=v8plus -xO5 -xdepend      +7-12%
  98  *      cc-4.2 -xarch=v8plus -xO5 -xdepend      +25-35%
  99  *      egcs-1.1.2 -mcpu=ultrasparc -O3         +35-45%
 100  *
 101  *      v8 module on SS10/60MHz against bn_asm.c compiled with:
 102  *
 103  *      cc-5.0 -xarch=v8 -xO5 -xdepend          +7-10%
 104  *      cc-4.2 -xarch=v8 -xO5 -xdepend          +10%
 105  *      egcs-1.1.2 -mv8 -O3                     +35-45%
 106  *
 107  *    As you can see it's damn hard to beat the new Sun C compiler
 108  *    and it's in first place GNU C users who will appreciate this
 109  *    assembler implementation:-)
 110  */
 111
 112 /*
 113  * Revision history.
 114  *
 115  * 1.0  - initial release;
 116  * 1.1  - new loop unrolling model(*);
 117  *      - some more fine tuning;
 118  * 1.2  - made gas friendly;
 119  *      - updates to documentation concerning v9;
 120  *      - new performance comparison matrix;
 121  * 1.3  - fixed problem with /usr/ccs/lib/cpp;
 122  * 1.4  - native V9 bn_*_comba[48] implementation (15% more efficient)
 123  *        resulting in slight overall performance kick;
 124  *      - some retunes;
 125  *      - support for GNU as added;
 126  *
 127  * (*)  Originally unrolled loop looked like this:
 128  *          for (;;) {
 129  *              op(p+0); if (--n==0) break;
 130  *              op(p+1); if (--n==0) break;
 131  *              op(p+2); if (--n==0) break;
 132  *              op(p+3); if (--n==0) break;
 133  *              p+=4;
 134  *          }
 135  *      I unroll according to following:
 136  *          while (n&~3) {
 137  *              op(p+0); op(p+1); op(p+2); op(p+3);
 138  *              p+=4; n-=4;
 139  *          }
 140  *          if (n) {
 141  *              op(p+0); if (--n==0) return;
 142  *              op(p+1); if (--n==0) return;
 143  *              op(p+2); return;
 144  *          }
 145  */
 146
 147 /*
 148  * GNU assembler can't stand stuw:-(
 149  */
     /* From here on the preprocessor rewrites every stuw to plain st. */
 150 #define stuw st
 151
     /* All code below is placed in an allocatable, executable .text section. */
 152 .section        ".text",#alloc,#execinstr
 153 .file           "bn_asm.sparc.v8plus.S"
 154
     /* The first routine starts on a 32-byte boundary. */
 155 .align  32
 156
 157 .global bn_mul_add_words
 158 /*
 159  * BN_ULONG bn_mul_add_words(rp,ap,num,w)
 160  * BN_ULONG *rp,*ap;
 161  * int num;
 162  * BN_ULONG w;
      *
      * For i in [0,num): rp[i] = low32(rp[i] + ap[i]*w + carry), where
      * carry is the upper 32 bits of the previous 64-bit sum (0 for i=0).
      * Returns the final carry, or 0 when num <= 0.
      *
      * Leaf routine, no stack frame.
      * In:      %o0=rp, %o1=ap, %o2=num, %o3=w
      * Out:     %o0=carry
      * Scratch: %o4 (64-bit sum), %o5 (running carry), %g1-%g3
      *
      * num is a 32-bit int, but brgz/brnz/brz look at all 64 bits of the
      * register, so num is sign-extended on entry. w gets the matching
      * zero-extension before it is fed to mulx.
 163  */
 164 bn_mul_add_words:
             sra     %o2,%g0,%o2     ! signx %o2
 165         brgz,a  %o2,.L_bn_mul_add_words_proceed
 166         lduw    [%o1],%g2       ! delay slot (taken only): %g2 = ap[0]
 167         retl
 168         clr     %o0             ! num <= 0: return 0
             nop                     ! never executed: the sra plus these
             nop                     ! 7 nops grow the code by exactly 32
             nop                     ! bytes, so every label below keeps
             nop                     ! its alignment
             nop
             nop
             nop
 169
 170 .L_bn_mul_add_words_proceed:
 171         srl     %o3,%g0,%o3     ! clruw %o3
 172         andcc   %o2,-4,%g0      ! fewer than 4 words?
 173         bz,pn   %icc,.L_bn_mul_add_words_tail
 174         clr     %o5             ! carry = 0
 175
 176 .L_bn_mul_add_words_loop:       ! wow! 32 aligned!
 177         lduw    [%o0],%g1
 178         lduw    [%o1+4],%g3
 179         mulx    %o3,%g2,%g2
 180         add     %g1,%o5,%o4
 181         nop
 182         add     %o4,%g2,%o4     ! %o4 = rp[0] + carry + w*ap[0]
 183         stuw    %o4,[%o0]
 184         srlx    %o4,32,%o5      ! carry = upper half of the sum
 185
 186         lduw    [%o0+4],%g1
 187         lduw    [%o1+8],%g2
 188         mulx    %o3,%g3,%g3
 189         add     %g1,%o5,%o4
 190         dec     4,%o2           ! num -= 4
 191         add     %o4,%g3,%o4
 192         stuw    %o4,[%o0+4]
 193         srlx    %o4,32,%o5
 194
 195         lduw    [%o0+8],%g1
 196         lduw    [%o1+12],%g3
 197         mulx    %o3,%g2,%g2
 198         add     %g1,%o5,%o4
 199         inc     16,%o1          ! ap += 4
 200         add     %o4,%g2,%o4
 201         stuw    %o4,[%o0+8]
 202         srlx    %o4,32,%o5
 203
 204         lduw    [%o0+12],%g1
 205         mulx    %o3,%g3,%g3
 206         add     %g1,%o5,%o4
 207         inc     16,%o0          ! rp += 4
 208         add     %o4,%g3,%o4
 209         andcc   %o2,-4,%g0      ! at least 4 words left?
 210         stuw    %o4,[%o0-4]
 211         srlx    %o4,32,%o5
 212         bnz,a,pt        %icc,.L_bn_mul_add_words_loop
 213         lduw    [%o1],%g2       ! preload next ap[0]
 214
 215         brnz,a,pn       %o2,.L_bn_mul_add_words_tail
 216         lduw    [%o1],%g2
 217 .L_bn_mul_add_words_return:
 218         retl
 219         mov     %o5,%o0         ! return carry
 220
 221 .L_bn_mul_add_words_tail:       ! 1 to 3 words left, %g2 = ap[0]
 222         lduw    [%o0],%g1
 223         mulx    %o3,%g2,%g2
 224         add     %g1,%o5,%o4
 225         dec     %o2
 226         add     %o4,%g2,%o4
 227         srlx    %o4,32,%o5
 228         brz,pt  %o2,.L_bn_mul_add_words_return
 229         stuw    %o4,[%o0]
 230
 231         lduw    [%o1+4],%g2
 232         lduw    [%o0+4],%g1
 233         mulx    %o3,%g2,%g2
 234         add     %g1,%o5,%o4
 235         dec     %o2
 236         add     %o4,%g2,%o4
 237         srlx    %o4,32,%o5
 238         brz,pt  %o2,.L_bn_mul_add_words_return
 239         stuw    %o4,[%o0+4]
 240
 241         lduw    [%o1+8],%g2
 242         lduw    [%o0+8],%g1
 243         mulx    %o3,%g2,%g2
 244         add     %g1,%o5,%o4
 245         add     %o4,%g2,%o4
 246         stuw    %o4,[%o0+8]
 247         retl
 248         srlx    %o4,32,%o0      ! return carry
 249
 250 .type   bn_mul_add_words,#function
 251 .size   bn_mul_add_words,(.-bn_mul_add_words)
 252
 253 .align  32
 254
 255 .global bn_mul_words
 256 /*
 257  * BN_ULONG bn_mul_words(rp,ap,num,w)
 258  * BN_ULONG *rp,*ap;
 259  * int num;
 260  * BN_ULONG w;
      *
      * For i in [0,num): rp[i] = low32(ap[i]*w + carry), where carry is
      * the upper 32 bits of the previous 64-bit sum (0 for i=0).
      * Returns the final carry, or 0 when num <= 0.
      *
      * Leaf routine, no stack frame.
      * In:      %o0=rp, %o1=ap, %o2=num, %o3=w
      * Out:     %o0=carry
      * Scratch: %o4 (64-bit sum), %o5 (running carry), %g2, %g3
      *
      * num is a 32-bit int, but brgz/brnz/brz look at all 64 bits of the
      * register, so num is sign-extended on entry. w gets the matching
      * zero-extension before it is fed to mulx.
 261  */
 262 bn_mul_words:
             sra     %o2,%g0,%o2     ! signx %o2
 263         brgz,a  %o2,.L_bn_mul_words_proceed
 264         lduw    [%o1],%g2       ! delay slot (taken only): %g2 = ap[0]
 265         retl
 266         clr     %o0             ! num <= 0: return 0
             nop                     ! never executed: the sra plus these
             nop                     ! 7 nops grow the code by exactly 32
             nop                     ! bytes, so every label below keeps
             nop                     ! its alignment
             nop
             nop
             nop
 267
 268 .L_bn_mul_words_proceed:
 269         srl     %o3,%g0,%o3     ! clruw %o3
 270         andcc   %o2,-4,%g0      ! fewer than 4 words?
 271         bz,pn   %icc,.L_bn_mul_words_tail
 272         clr     %o5             ! carry = 0
 273
 274 .L_bn_mul_words_loop:           ! wow! 32 aligned!
 275         lduw    [%o1+4],%g3
 276         mulx    %o3,%g2,%g2
 277         add     %g2,%o5,%o4     ! %o4 = w*ap[0] + carry
 278         nop
 279         stuw    %o4,[%o0]
 280         srlx    %o4,32,%o5      ! carry = upper half of the sum
 281
 282         lduw    [%o1+8],%g2
 283         mulx    %o3,%g3,%g3
 284         add     %g3,%o5,%o4
 285         dec     4,%o2           ! num -= 4
 286         stuw    %o4,[%o0+4]
 287         srlx    %o4,32,%o5
 288
 289         lduw    [%o1+12],%g3
 290         mulx    %o3,%g2,%g2
 291         add     %g2,%o5,%o4
 292         inc     16,%o1          ! ap += 4
 293         stuw    %o4,[%o0+8]
 294         srlx    %o4,32,%o5
 295
 296         mulx    %o3,%g3,%g3
 297         add     %g3,%o5,%o4
 298         inc     16,%o0          ! rp += 4
 299         stuw    %o4,[%o0-4]
 300         srlx    %o4,32,%o5
 301         andcc   %o2,-4,%g0      ! at least 4 words left?
 302         bnz,a,pt        %icc,.L_bn_mul_words_loop
 303         lduw    [%o1],%g2       ! preload next ap[0]
 304         nop
 305         nop
 306
 307         brnz,a,pn       %o2,.L_bn_mul_words_tail
 308         lduw    [%o1],%g2
 309 .L_bn_mul_words_return:
 310         retl
 311         mov     %o5,%o0         ! return carry
 312
 313 .L_bn_mul_words_tail:           ! 1 to 3 words left, %g2 = ap[0]
 314         mulx    %o3,%g2,%g2
 315         add     %g2,%o5,%o4
 316         dec     %o2
 317         srlx    %o4,32,%o5
 318         brz,pt  %o2,.L_bn_mul_words_return
 319         stuw    %o4,[%o0]
 320
 321         lduw    [%o1+4],%g2
 322         mulx    %o3,%g2,%g2
 323         add     %g2,%o5,%o4
 324         dec     %o2
 325         srlx    %o4,32,%o5
 326         brz,pt  %o2,.L_bn_mul_words_return
 327         stuw    %o4,[%o0+4]
 328
 329         lduw    [%o1+8],%g2
 330         mulx    %o3,%g2,%g2
 331         add     %g2,%o5,%o4
 332         stuw    %o4,[%o0+8]
 333         retl
 334         srlx    %o4,32,%o0      ! return carry
 335
 336 .type   bn_mul_words,#function
 337 .size   bn_mul_words,(.-bn_mul_words)
 338
 339 .align  32
 340 .global bn_sqr_words
 341 /*
 342  * void bn_sqr_words(r,a,n)
 343  * BN_ULONG *r,*a;
 344  * int n;
      *
      * For i in [0,n): r[2i] = low 32 bits and r[2i+1] = high 32 bits of
      * a[i]*a[i]. Nothing is stored when n <= 0. %o0 is cleared on
      * return even though the routine is declared void.
      *
      * Leaf routine, no stack frame.
      * In:      %o0=r, %o1=a, %o2=n
      * Scratch: %o4 (64-bit square), %o5 (its upper half), %g2, %g3
      *
      * n is a 32-bit int, but brgz/brnz/brz look at all 64 bits of the
      * register, so n is sign-extended on entry. The sra takes the slot
      * of a filler nop that used to sit between andcc and bz, so the
      * code size and the loop alignment are unchanged.
 345  */
 346 bn_sqr_words:
             sra     %o2,%g0,%o2     ! signx %o2
 347         brgz,a  %o2,.L_bn_sqr_words_proceed
 348         lduw    [%o1],%g2       ! delay slot (taken only): %g2 = a[0]
 349         retl
 350         clr     %o0
 351
 352 .L_bn_sqr_words_proceed:
 353         andcc   %o2,-4,%g0      ! fewer than 4 words?
 355         bz,pn   %icc,.L_bn_sqr_words_tail
 356         nop
 357
 358 .L_bn_sqr_words_loop:           ! wow! 32 aligned!
 359         lduw    [%o1+4],%g3
 360         mulx    %g2,%g2,%o4
 361         stuw    %o4,[%o0]       ! low word of a[0]^2
 362         srlx    %o4,32,%o5
 363         stuw    %o5,[%o0+4]     ! high word of a[0]^2
 364         nop
 365
 366         lduw    [%o1+8],%g2
 367         mulx    %g3,%g3,%o4
 368         dec     4,%o2           ! n -= 4
 369         stuw    %o4,[%o0+8]
 370         srlx    %o4,32,%o5
 371         stuw    %o5,[%o0+12]
 372
 373         lduw    [%o1+12],%g3
 374         mulx    %g2,%g2,%o4
 375         srlx    %o4,32,%o5
 376         stuw    %o4,[%o0+16]
 377         inc     16,%o1          ! a += 4
 378         stuw    %o5,[%o0+20]
 379
 380         mulx    %g3,%g3,%o4
 381         inc     32,%o0          ! r += 8
 382         stuw    %o4,[%o0-8]
 383         srlx    %o4,32,%o5
 384         andcc   %o2,-4,%g2      ! %g2 is reloaded before its next use
 385         stuw    %o5,[%o0-4]
 386         bnz,a,pt        %icc,.L_bn_sqr_words_loop
 387         lduw    [%o1],%g2       ! preload next a[0]
 388         nop
 389
 390         brnz,a,pn       %o2,.L_bn_sqr_words_tail
 391         lduw    [%o1],%g2
 392 .L_bn_sqr_words_return:
 393         retl
 394         clr     %o0
 395
 396 .L_bn_sqr_words_tail:           ! 1 to 3 words left, %g2 = a[0]
 397         mulx    %g2,%g2,%o4
 398         dec     %o2
 399         stuw    %o4,[%o0]
 400         srlx    %o4,32,%o5
 401         brz,pt  %o2,.L_bn_sqr_words_return
 402         stuw    %o5,[%o0+4]
 403
 404         lduw    [%o1+4],%g2
 405         mulx    %g2,%g2,%o4
 406         dec     %o2
 407         stuw    %o4,[%o0+8]
 408         srlx    %o4,32,%o5
 409         brz,pt  %o2,.L_bn_sqr_words_return
 410         stuw    %o5,[%o0+12]
 411
 412         lduw    [%o1+8],%g2
 413         mulx    %g2,%g2,%o4
 414         srlx    %o4,32,%o5
 415         stuw    %o4,[%o0+16]
 416         stuw    %o5,[%o0+20]
 417         retl
 418         clr     %o0
 419
 420 .type   bn_sqr_words,#function
 421 .size   bn_sqr_words,(.-bn_sqr_words)
 422
 423 .align  32
 424 .global bn_div_words
 425 /*
 426  * BN_ULONG bn_div_words(h,l,d)
 427  * BN_ULONG h,l,d;
      *
      * Returns the low 32 bits of the 64-bit quotient ((h<<32)|l) / d.
      *
      * Leaf routine, no stack frame.
      * In:  %o0=h, %o1=l, %o2=d
      * Out: %o0=quotient
      *
      * or and udivx work on all 64 bits, so the upper halves of l and d
      * are cleared first, the same way w is cleared in the bn_mul_*
      * routines of this file. h needs no clearing: the sllx shifts its
      * upper half out.
 428  */
 429 bn_div_words:
 430         sllx    %o0,32,%o0      ! h into the upper half
             srl     %o1,%g0,%o1     ! clruw %o1
 431         or      %o0,%o1,%o0     ! %o0 = (h<<32)|l
             srl     %o2,%g0,%o2     ! clruw %o2
 432         udivx   %o0,%o2,%o0
 433         retl
 434         srl     %o0,%g0,%o0     ! clruw %o0
 435
 436 .type   bn_div_words,#function
 437 .size   bn_div_words,(.-bn_div_words)
 438
 439 .align  32
 440
 441 .global bn_add_words
 442 /*
 443  * BN_ULONG bn_add_words(rp,ap,bp,n)
 444  * BN_ULONG *rp,*ap,*bp;
 445  * int n;
      *
      * For i in [0,n): rp[i] = ap[i] + bp[i] + carry (mod 2^32).
      * Returns the carry out of the last word (0 or 1), or 0 when n <= 0.
      *
      * Leaf routine, no stack frame.
      * In:      %o0=rp, %o1=ap, %o2=bp, %o3=n
      * Out:     %o0=carry
      * Scratch: %o4, %o5, %g1-%g4
      *
      * The carry lives in the icc carry flag for the whole routine: the
      * only instructions that write condition codes after it is cleared
      * are the addccc's, and the loop control uses register branches.
      *
      * n is a 32-bit int, but brgz/brnz/brz look at all 64 bits of the
      * register, so n is sign-extended on entry. The sra takes the slot
      * of a filler nop that used to precede the loop, so the code size
      * and the loop alignment are unchanged.
 446  */
 447 bn_add_words:
             sra     %o3,%g0,%o3     ! signx %o3
 448         brgz,a  %o3,.L_bn_add_words_proceed
 449         lduw    [%o1],%o4       ! delay slot (taken only): %o4 = ap[0]
 450         retl
 451         clr     %o0             ! n <= 0: return 0
 452
 453 .L_bn_add_words_proceed:
 454         andcc   %o3,-4,%g0      ! fewer than 4 words?
 455         bz,pn   %icc,.L_bn_add_words_tail
 456         addcc   %g0,0,%g0       ! clear carry flag
 458
 459 .L_bn_add_words_loop:           ! wow! 32 aligned!
 460         dec     4,%o3           ! n -= 4
 461         lduw    [%o2],%o5
 462         lduw    [%o1+4],%g1
 463         lduw    [%o2+4],%g2
 464         lduw    [%o1+8],%g3
 465         lduw    [%o2+8],%g4
 466         addccc  %o5,%o4,%o5
 467         stuw    %o5,[%o0]
 468
 469         lduw    [%o1+12],%o4
 470         lduw    [%o2+12],%o5
 471         inc     16,%o1          ! ap += 4
 472         addccc  %g1,%g2,%g1
 473         stuw    %g1,[%o0+4]
 474
 475         inc     16,%o2          ! bp += 4
 476         addccc  %g3,%g4,%g3
 477         stuw    %g3,[%o0+8]
 478
 479         inc     16,%o0          ! rp += 4
 480         addccc  %o5,%o4,%o5
 481         stuw    %o5,[%o0-4]
 482         and     %o3,-4,%g1      ! plain and: must not touch the carry
 483         brnz,a,pt       %g1,.L_bn_add_words_loop
 484         lduw    [%o1],%o4       ! preload next ap[0]
 485
 486         brnz,a,pn       %o3,.L_bn_add_words_tail
 487         lduw    [%o1],%o4
 488 .L_bn_add_words_return:
 489         clr     %o0
 490         retl
 491         movcs   %icc,1,%o0      ! return 1 if the last add carried
 492         nop
 493
 494 .L_bn_add_words_tail:           ! 1 to 3 words left, %o4 = ap[0]
 495         lduw    [%o2],%o5
 496         dec     %o3
 497         addccc  %o5,%o4,%o5
 498         brz,pt  %o3,.L_bn_add_words_return
 499         stuw    %o5,[%o0]
 500
 501         lduw    [%o1+4],%o4
 502         lduw    [%o2+4],%o5
 503         dec     %o3
 504         addccc  %o5,%o4,%o5
 505         brz,pt  %o3,.L_bn_add_words_return
 506         stuw    %o5,[%o0+4]
 507
 508         lduw    [%o1+8],%o4
 509         lduw    [%o2+8],%o5
 510         addccc  %o5,%o4,%o5
 511         stuw    %o5,[%o0+8]
 512         clr     %o0
 513         retl
 514         movcs   %icc,1,%o0
 515
 516 .type   bn_add_words,#function
 517 .size   bn_add_words,(.-bn_add_words)
 518
 519 .global bn_sub_words
 520 /*
 521  * BN_ULONG bn_sub_words(rp,ap,bp,n)
 522  * BN_ULONG *rp,*ap,*bp;
 523  * int n;
      *
      * For i in [0,n): rp[i] = ap[i] - bp[i] - borrow (mod 2^32).
      * Returns the borrow out of the last word (0 or 1), or 0 when n <= 0.
      *
      * Leaf routine, no stack frame.
      * In:      %o0=rp, %o1=ap, %o2=bp, %o3=n
      * Out:     %o0=borrow
      * Scratch: %o4, %o5, %g1-%g4
      *
      * The borrow lives in the icc carry flag for the whole routine: the
      * only instructions that write condition codes after it is cleared
      * are the subccc's, and the loop control uses register branches.
      *
      * n is a 32-bit int, but brgz/brnz/brz look at all 64 bits of the
      * register, so n is sign-extended on entry. The sra takes the slot
      * of a filler nop that used to precede the loop, so the code size
      * is unchanged.
 524  */
 525 bn_sub_words:
             sra     %o3,%g0,%o3     ! signx %o3
 526         brgz,a  %o3,.L_bn_sub_words_proceed
 527         lduw    [%o1],%o4       ! delay slot (taken only): %o4 = ap[0]
 528         retl
 529         clr     %o0             ! n <= 0: return 0
 530
 531 .L_bn_sub_words_proceed:
 532         andcc   %o3,-4,%g0      ! fewer than 4 words?
 533         bz,pn   %icc,.L_bn_sub_words_tail
 534         addcc   %g0,0,%g0       ! clear carry flag
 536
 537 .L_bn_sub_words_loop:           ! wow! 32 aligned!
 538         dec     4,%o3           ! n -= 4
 539         lduw    [%o2],%o5
 540         lduw    [%o1+4],%g1
 541         lduw    [%o2+4],%g2
 542         lduw    [%o1+8],%g3
 543         lduw    [%o2+8],%g4
 544         subccc  %o4,%o5,%o5
 545         stuw    %o5,[%o0]
 546
 547         lduw    [%o1+12],%o4
 548         lduw    [%o2+12],%o5
 549         inc     16,%o1          ! ap += 4
 550         subccc  %g1,%g2,%g2
 551         stuw    %g2,[%o0+4]
 552
 553         inc     16,%o2          ! bp += 4
 554         subccc  %g3,%g4,%g4
 555         stuw    %g4,[%o0+8]
 556
 557         inc     16,%o0          ! rp += 4
 558         subccc  %o4,%o5,%o5
 559         stuw    %o5,[%o0-4]
 560         and     %o3,-4,%g1      ! plain and: must not touch the carry
 561         brnz,a,pt       %g1,.L_bn_sub_words_loop
 562         lduw    [%o1],%o4       ! preload next ap[0]
 563
 564         brnz,a,pn       %o3,.L_bn_sub_words_tail
 565         lduw    [%o1],%o4
 566 .L_bn_sub_words_return:
 567         clr     %o0
 568         retl
 569         movcs   %icc,1,%o0      ! return 1 if the last subtract borrowed
 570         nop
 571
 572 .L_bn_sub_words_tail:           ! wow! 32 aligned!
 573         lduw    [%o2],%o5
 574         dec     %o3
 575         subccc  %o4,%o5,%o5
 576         brz,pt  %o3,.L_bn_sub_words_return
 577         stuw    %o5,[%o0]
 578
 579         lduw    [%o1+4],%o4
 580         lduw    [%o2+4],%o5
 581         dec     %o3
 582         subccc  %o4,%o5,%o5
 583         brz,pt  %o3,.L_bn_sub_words_return
 584         stuw    %o5,[%o0+4]
 585
 586         lduw    [%o1+8],%o4
 587         lduw    [%o2+8],%o5
 588         subccc  %o4,%o5,%o5
 589         stuw    %o5,[%o0+8]
 590         clr     %o0
 591         retl
 592         movcs   %icc,1,%o0
 593
 594 .type   bn_sub_words,#function
 595 .size   bn_sub_words,(.-bn_sub_words)
 596
 597 /*
 598  * Code below depends on the fact that upper parts of the %l0-%l7
 599  * and %i0-%i7 are zeroed by kernel after context switch. In
 600  * previous versions this comment stated that "the trouble is that
 601  * it's not feasible to implement the mumbo-jumbo in less V9
 602  * instructions:-(" which apparently isn't true thanks to
 603  * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
 604  * results not from the shorter code, but from elimination of
 605  * multicycle non-pairable 'rd %y,%rd' instructions.
 606  *
 607  *                                                      Andy.
 608  */
 609
     /* Passed to save in bn_mul_comba8: the stack pointer moves down by 96 bytes. */
 610 #define FRAME_SIZE      -96
 611
 612 /*
 613  * Here is register usage map for *all* routines below.
      *
      * As used by bn_mul_comba8:
      *   t_1   64-bit product of one a[i]*b[j] pair, and the column sum
      *         right before a result word is stored
      *   t_2   the constant 1<<32
      *   c_12  64-bit accumulator for the current result column
      *   c_3   carries out of c_12, kept as multiples of 1<<32
 614  */
 615 #define t_1     %o0
 616 #define t_2     %o1
 617 #define c_12    %o2
 618 #define c_3     %o3
 619
     /* Word I of a, b and r, addressed off the incoming pointer arguments. */
 620 #define ap(I)   [%i1+4*I]
 621 #define bp(I)   [%i2+4*I]
 622 #define rp(I)   [%i0+4*I]
 623
     /* Registers that hold the words of a once they are loaded. */
 624 #define a_0     %l0
 625 #define a_1     %l1
 626 #define a_2     %l2
 627 #define a_3     %l3
 628 #define a_4     %l4
 629 #define a_5     %l5
 630 #define a_6     %l6
 631 #define a_7     %l7
 632
     /* Registers that hold the words of b once they are loaded. */
 633 #define b_0     %i3
 634 #define b_1     %i4
 635 #define b_2     %i5
 636 #define b_3     %o4
 637 #define b_4     %o5
 638 #define b_5     %o7
 639 #define b_6     %g1
 640 #define b_7     %g4
 641
 642 .align  32
 643 .global bn_mul_comba8
 644 /*
 645  * void bn_mul_comba8(r,a,b)
 646  * BN_ULONG *r,*a,*b;
      *
      * Writes the 16-word product of the 8-word operands a and b to r,
      * one result word (column) at a time, in the order spelled out by
      * the mul_add_c() notes beside each mulx.
      *
      * c_12 is a 64-bit accumulator for the current column. Every carry
      * out of it adds t_2 (1<<32) to c_3. The last product of a column
      * is added into t_1 instead; the low word of t_1 is stored and
      * (t_1>>32)|c_3 becomes the starting value for the next column.
      *
      * 'bcs,a %xcc,.+8' followed by 'add c_3,t_2,c_3' executes the add
      * only when the preceding addcc produced a 64-bit carry: the
      * annulled delay slot is skipped whenever the branch is not taken.
      *
      * Unlike the routines above, this one opens its own register
      * window with save and leaves through ret/restore.
 647  */
 648 bn_mul_comba8:
 649         save    %sp,FRAME_SIZE,%sp
 650         mov     1,t_2
 651         lduw    ap(0),a_0
 652         sllx    t_2,32,t_2
 653         lduw    bp(0),b_0       !=
 654         lduw    bp(1),b_1
 655         mulx    a_0,b_0,t_1     !mul_add_c(a[0],b[0],c1,c2,c3);
 656         srlx    t_1,32,c_12
 657         stuw    t_1,rp(0)       !=!r[0]=c1;
 658
 659         lduw    ap(1),a_1
 660         mulx    a_0,b_1,t_1     !mul_add_c(a[0],b[1],c2,c3,c1);
 661         addcc   c_12,t_1,c_12
 662         clr     c_3             !=
 663         bcs,a   %xcc,.+8
 664         add     c_3,t_2,c_3
 665         lduw    ap(2),a_2
 666         mulx    a_1,b_0,t_1     !=!mul_add_c(a[1],b[0],c2,c3,c1);
 667         addcc   c_12,t_1,t_1
 668         bcs,a   %xcc,.+8
 669         add     c_3,t_2,c_3
 670         srlx    t_1,32,c_12     !=
 671         stuw    t_1,rp(1)       !r[1]=c2;
 672         or      c_12,c_3,c_12
 673
 674         mulx    a_2,b_0,t_1     !mul_add_c(a[2],b[0],c3,c1,c2);
 675         addcc   c_12,t_1,c_12   !=
 676         clr     c_3
 677         bcs,a   %xcc,.+8
 678         add     c_3,t_2,c_3
 679         lduw    bp(2),b_2       !=
 680         mulx    a_1,b_1,t_1     !mul_add_c(a[1],b[1],c3,c1,c2);
 681         addcc   c_12,t_1,c_12
 682         bcs,a   %xcc,.+8
 683         add     c_3,t_2,c_3     !=
 684         lduw    bp(3),b_3
 685         mulx    a_0,b_2,t_1     !mul_add_c(a[0],b[2],c3,c1,c2);
 686         addcc   c_12,t_1,t_1
 687         bcs,a   %xcc,.+8        !=
 688         add     c_3,t_2,c_3
 689         srlx    t_1,32,c_12
 690         stuw    t_1,rp(2)       !r[2]=c3;
 691         or      c_12,c_3,c_12   !=
 692
 693         mulx    a_0,b_3,t_1     !mul_add_c(a[0],b[3],c1,c2,c3);
 694         addcc   c_12,t_1,c_12
 695         clr     c_3
 696         bcs,a   %xcc,.+8        !=
 697         add     c_3,t_2,c_3
 698         mulx    a_1,b_2,t_1     !=!mul_add_c(a[1],b[2],c1,c2,c3);
 699         addcc   c_12,t_1,c_12
 700         bcs,a   %xcc,.+8        !=
 701         add     c_3,t_2,c_3
 702         lduw    ap(3),a_3
 703         mulx    a_2,b_1,t_1     !mul_add_c(a[2],b[1],c1,c2,c3);
 704         addcc   c_12,t_1,c_12   !=
 705         bcs,a   %xcc,.+8
 706         add     c_3,t_2,c_3
 707         lduw    ap(4),a_4
 708         mulx    a_3,b_0,t_1     !=!mul_add_c(a[3],b[0],c1,c2,c3);!=
 709         addcc   c_12,t_1,t_1
 710         bcs,a   %xcc,.+8
 711         add     c_3,t_2,c_3
 712         srlx    t_1,32,c_12     !=
 713         stuw    t_1,rp(3)       !r[3]=c1;
 714         or      c_12,c_3,c_12
 715
 716         mulx    a_4,b_0,t_1     !mul_add_c(a[4],b[0],c2,c3,c1);
 717         addcc   c_12,t_1,c_12   !=
 718         clr     c_3
 719         bcs,a   %xcc,.+8
 720         add     c_3,t_2,c_3
 721         mulx    a_3,b_1,t_1     !=!mul_add_c(a[3],b[1],c2,c3,c1);
 722         addcc   c_12,t_1,c_12
 723         bcs,a   %xcc,.+8
 724         add     c_3,t_2,c_3
 725         mulx    a_2,b_2,t_1     !=!mul_add_c(a[2],b[2],c2,c3,c1);
 726         addcc   c_12,t_1,c_12
 727         bcs,a   %xcc,.+8
 728         add     c_3,t_2,c_3
 729         lduw    bp(4),b_4       !=
 730         mulx    a_1,b_3,t_1     !mul_add_c(a[1],b[3],c2,c3,c1);
 731         addcc   c_12,t_1,c_12
 732         bcs,a   %xcc,.+8
 733         add     c_3,t_2,c_3     !=
 734         lduw    bp(5),b_5
 735         mulx    a_0,b_4,t_1     !mul_add_c(a[0],b[4],c2,c3,c1);
 736         addcc   c_12,t_1,t_1
 737         bcs,a   %xcc,.+8        !=
 738         add     c_3,t_2,c_3
 739         srlx    t_1,32,c_12
 740         stuw    t_1,rp(4)       !r[4]=c2;
 741         or      c_12,c_3,c_12   !=
 742
 743         mulx    a_0,b_5,t_1     !mul_add_c(a[0],b[5],c3,c1,c2);
 744         addcc   c_12,t_1,c_12
 745         clr     c_3
 746         bcs,a   %xcc,.+8        !=
 747         add     c_3,t_2,c_3
 748         mulx    a_1,b_4,t_1     !mul_add_c(a[1],b[4],c3,c1,c2);
 749         addcc   c_12,t_1,c_12
 750         bcs,a   %xcc,.+8        !=
 751         add     c_3,t_2,c_3
 752         mulx    a_2,b_3,t_1     !mul_add_c(a[2],b[3],c3,c1,c2);
 753         addcc   c_12,t_1,c_12
 754         bcs,a   %xcc,.+8        !=
 755         add     c_3,t_2,c_3
 756         mulx    a_3,b_2,t_1     !mul_add_c(a[3],b[2],c3,c1,c2);
 757         addcc   c_12,t_1,c_12
 758         bcs,a   %xcc,.+8        !=
 759         add     c_3,t_2,c_3
 760         lduw    ap(5),a_5
 761         mulx    a_4,b_1,t_1     !mul_add_c(a[4],b[1],c3,c1,c2);
 762         addcc   c_12,t_1,c_12   !=
 763         bcs,a   %xcc,.+8
 764         add     c_3,t_2,c_3
 765         lduw    ap(6),a_6
 766         mulx    a_5,b_0,t_1     !=!mul_add_c(a[5],b[0],c3,c1,c2);
 767         addcc   c_12,t_1,t_1
 768         bcs,a   %xcc,.+8
 769         add     c_3,t_2,c_3
 770         srlx    t_1,32,c_12     !=
 771         stuw    t_1,rp(5)       !r[5]=c3;
 772         or      c_12,c_3,c_12
 773
 774         mulx    a_6,b_0,t_1     !mul_add_c(a[6],b[0],c1,c2,c3);
 775         addcc   c_12,t_1,c_12   !=
 776         clr     c_3
 777         bcs,a   %xcc,.+8
 778         add     c_3,t_2,c_3
 779         mulx    a_5,b_1,t_1     !=!mul_add_c(a[5],b[1],c1,c2,c3);
 780         addcc   c_12,t_1,c_12
 781         bcs,a   %xcc,.+8
 782         add     c_3,t_2,c_3
 783         mulx    a_4,b_2,t_1     !=!mul_add_c(a[4],b[2],c1,c2,c3);
 784         addcc   c_12,t_1,c_12
 785         bcs,a   %xcc,.+8
 786         add     c_3,t_2,c_3
 787         mulx    a_3,b_3,t_1     !=!mul_add_c(a[3],b[3],c1,c2,c3);
 788         addcc   c_12,t_1,c_12
 789         bcs,a   %xcc,.+8
 790         add     c_3,t_2,c_3
 791         mulx    a_2,b_4,t_1     !=!mul_add_c(a[2],b[4],c1,c2,c3);
 792         addcc   c_12,t_1,c_12
 793         bcs,a   %xcc,.+8
 794         add     c_3,t_2,c_3
 795         lduw    bp(6),b_6       !=
 796         mulx    a_1,b_5,t_1     !mul_add_c(a[1],b[5],c1,c2,c3);
 797         addcc   c_12,t_1,c_12
 798         bcs,a   %xcc,.+8
 799         add     c_3,t_2,c_3     !=
 800         lduw    bp(7),b_7
 801         mulx    a_0,b_6,t_1     !mul_add_c(a[0],b[6],c1,c2,c3);
 802         addcc   c_12,t_1,t_1
 803         bcs,a   %xcc,.+8        !=
 804         add     c_3,t_2,c_3
 805         srlx    t_1,32,c_12
 806         stuw    t_1,rp(6)       !r[6]=c1;
 807         or      c_12,c_3,c_12   !=
 808
 809         mulx    a_0,b_7,t_1     !mul_add_c(a[0],b[7],c2,c3,c1);
 810         addcc   c_12,t_1,c_12
 811         clr     c_3
 812         bcs,a   %xcc,.+8        !=
 813         add     c_3,t_2,c_3
 814         mulx    a_1,b_6,t_1     !mul_add_c(a[1],b[6],c2,c3,c1);
 815         addcc   c_12,t_1,c_12
 816         bcs,a   %xcc,.+8        !=
 817         add     c_3,t_2,c_3
 818         mulx    a_2,b_5,t_1     !mul_add_c(a[2],b[5],c2,c3,c1);
 819         addcc   c_12,t_1,c_12
 820         bcs,a   %xcc,.+8        !=
 821         add     c_3,t_2,c_3
 822         mulx    a_3,b_4,t_1     !mul_add_c(a[3],b[4],c2,c3,c1);
 823         addcc   c_12,t_1,c_12
 824         bcs,a   %xcc,.+8        !=
 825         add     c_3,t_2,c_3
 826         mulx    a_4,b_3,t_1     !mul_add_c(a[4],b[3],c2,c3,c1);
 827         addcc   c_12,t_1,c_12
 828         bcs,a   %xcc,.+8        !=
 829         add     c_3,t_2,c_3
 830         mulx    a_5,b_2,t_1     !mul_add_c(a[5],b[2],c2,c3,c1);
 831         addcc   c_12,t_1,c_12
 832         bcs,a   %xcc,.+8        !=
 833         add     c_3,t_2,c_3
 834         lduw    ap(7),a_7
 835         mulx    a_6,b_1,t_1     !=!mul_add_c(a[6],b[1],c2,c3,c1);
 836         addcc   c_12,t_1,c_12
 837         bcs,a   %xcc,.+8
 838         add     c_3,t_2,c_3
 839         mulx    a_7,b_0,t_1     !=!mul_add_c(a[7],b[0],c2,c3,c1);
 840         addcc   c_12,t_1,t_1
 841         bcs,a   %xcc,.+8
 842         add     c_3,t_2,c_3
 843         srlx    t_1,32,c_12     !=
 844         stuw    t_1,rp(7)       !r[7]=c2;
 845         or      c_12,c_3,c_12
 846
 847         mulx    a_7,b_1,t_1     !=!mul_add_c(a[7],b[1],c3,c1,c2);
 848         addcc   c_12,t_1,c_12
 849         clr     c_3
 850         bcs,a   %xcc,.+8
 851         add     c_3,t_2,c_3     !=
 852         mulx    a_6,b_2,t_1     !mul_add_c(a[6],b[2],c3,c1,c2);
 853         addcc   c_12,t_1,c_12
 854         bcs,a   %xcc,.+8
 855         add     c_3,t_2,c_3     !=
 856         mulx    a_5,b_3,t_1     !mul_add_c(a[5],b[3],c3,c1,c2);
 857         addcc   c_12,t_1,c_12
 858         bcs,a   %xcc,.+8
 859         add     c_3,t_2,c_3     !=
 860         mulx    a_4,b_4,t_1     !mul_add_c(a[4],b[4],c3,c1,c2);
 861         addcc   c_12,t_1,c_12
 862         bcs,a   %xcc,.+8
 863         add     c_3,t_2,c_3     !=
 864         mulx    a_3,b_5,t_1     !mul_add_c(a[3],b[5],c3,c1,c2);
 865         addcc   c_12,t_1,c_12
 866         bcs,a   %xcc,.+8
 867         add     c_3,t_2,c_3     !=
 868         mulx    a_2,b_6,t_1     !mul_add_c(a[2],b[6],c3,c1,c2);
 869         addcc   c_12,t_1,c_12
 870         bcs,a   %xcc,.+8
 871         add     c_3,t_2,c_3     !=
 872         mulx    a_1,b_7,t_1     !mul_add_c(a[1],b[7],c3,c1,c2);
 873         addcc   c_12,t_1,t_1
 874         bcs,a   %xcc,.+8
 875         add     c_3,t_2,c_3     !=
 876         srlx    t_1,32,c_12
 877         stuw    t_1,rp(8)       !r[8]=c3;
 878         or      c_12,c_3,c_12
 879
 880         mulx    a_2,b_7,t_1     !=!mul_add_c(a[2],b[7],c1,c2,c3);
 881         addcc   c_12,t_1,c_12
 882         clr     c_3
 883         bcs,a   %xcc,.+8
 884         add     c_3,t_2,c_3     !=
 885         mulx    a_3,b_6,t_1     !mul_add_c(a[3],b[6],c1,c2,c3);
 886         addcc   c_12,t_1,c_12
 887         bcs,a   %xcc,.+8        !=
 888         add     c_3,t_2,c_3
 889         mulx    a_4,b_5,t_1     !mul_add_c(a[4],b[5],c1,c2,c3);
 890         addcc   c_12,t_1,c_12
 891         bcs,a   %xcc,.+8        !=
 892         add     c_3,t_2,c_3
 893         mulx    a_5,b_4,t_1     !mul_add_c(a[5],b[4],c1,c2,c3);
 894         addcc   c_12,t_1,c_12
 895         bcs,a   %xcc,.+8        !=
 896         add     c_3,t_2,c_3
 897         mulx    a_6,b_3,t_1     !mul_add_c(a[6],b[3],c1,c2,c3);
 898         addcc   c_12,t_1,c_12
 899         bcs,a   %xcc,.+8        !=
 900         add     c_3,t_2,c_3
 901         mulx    a_7,b_2,t_1     !mul_add_c(a[7],b[2],c1,c2,c3);
 902         addcc   c_12,t_1,t_1
 903         bcs,a   %xcc,.+8        !=
 904         add     c_3,t_2,c_3
 905         srlx    t_1,32,c_12
 906         stuw    t_1,rp(9)       !r[9]=c1;
 907         or      c_12,c_3,c_12   !=
 908
 909         mulx    a_7,b_3,t_1     !mul_add_c(a[7],b[3],c2,c3,c1);
 910         addcc   c_12,t_1,c_12
 911         clr     c_3
 912         bcs,a   %xcc,.+8        !=
 913         add     c_3,t_2,c_3
 914         mulx    a_6,b_4,t_1     !mul_add_c(a[6],b[4],c2,c3,c1);
 915         addcc   c_12,t_1,c_12
 916         bcs,a   %xcc,.+8        !=
 917         add     c_3,t_2,c_3
 918         mulx    a_5,b_5,t_1     !mul_add_c(a[5],b[5],c2,c3,c1);
 919         addcc   c_12,t_1,c_12
 920         bcs,a   %xcc,.+8        !=
 921         add     c_3,t_2,c_3
 922         mulx    a_4,b_6,t_1     !mul_add_c(a[4],b[6],c2,c3,c1);
 923         addcc   c_12,t_1,c_12
 924         bcs,a   %xcc,.+8        !=
 925         add     c_3,t_2,c_3
 926         mulx    a_3,b_7,t_1     !mul_add_c(a[3],b[7],c2,c3,c1);
 927         addcc   c_12,t_1,t_1
 928         bcs,a   %xcc,.+8        !=
 929         add     c_3,t_2,c_3
 930         srlx    t_1,32,c_12
 931         stuw    t_1,rp(10)      !r[10]=c2;
 932         or      c_12,c_3,c_12   !=
 933
 934         mulx    a_4,b_7,t_1     !mul_add_c(a[4],b[7],c3,c1,c2);
 935         addcc   c_12,t_1,c_12
 936         clr     c_3
 937         bcs,a   %xcc,.+8        !=
 938         add     c_3,t_2,c_3
 939         mulx    a_5,b_6,t_1     !mul_add_c(a[5],b[6],c3,c1,c2);
 940         addcc   c_12,t_1,c_12
 941         bcs,a   %xcc,.+8        !=
 942         add     c_3,t_2,c_3
 943         mulx    a_6,b_5,t_1     !mul_add_c(a[6],b[5],c3,c1,c2);
 944         addcc   c_12,t_1,c_12
 945         bcs,a   %xcc,.+8        !=
 946         add     c_3,t_2,c_3
 947         mulx    a_7,b_4,t_1     !mul_add_c(a[7],b[4],c3,c1,c2);
 948         addcc   c_12,t_1,t_1
 949         bcs,a   %xcc,.+8        !=
 950         add     c_3,t_2,c_3
 951         srlx    t_1,32,c_12
 952         stuw    t_1,rp(11)      !r[11]=c3;
 953         or      c_12,c_3,c_12   !=
 954
 955         mulx    a_7,b_5,t_1     !mul_add_c(a[7],b[5],c1,c2,c3);
 956         addcc   c_12,t_1,c_12
 957         clr     c_3
 958         bcs,a   %xcc,.+8        !=
 959         add     c_3,t_2,c_3
 960         mulx    a_6,b_6,t_1     !mul_add_c(a[6],b[6],c1,c2,c3);
 961         addcc   c_12,t_1,c_12
 962         bcs,a   %xcc,.+8        !=
 963         add     c_3,t_2,c_3
 964         mulx    a_5,b_7,t_1     !mul_add_c(a[5],b[7],c1,c2,c3);
 965         addcc   c_12,t_1,t_1
 966         bcs,a   %xcc,.+8        !=
 967         add     c_3,t_2,c_3
 968         srlx    t_1,32,c_12
 969         stuw    t_1,rp(12)      !r[12]=c1;
 970         or      c_12,c_3,c_12   !=
 971
 972         mulx    a_6,b_7,t_1     !mul_add_c(a[6],b[7],c2,c3,c1);
 973         addcc   c_12,t_1,c_12
 974         clr     c_3
 975         bcs,a   %xcc,.+8        !=
 976         add     c_3,t_2,c_3
 977         mulx    a_7,b_6,t_1     !mul_add_c(a[7],b[6],c2,c3,c1);
 978         addcc   c_12,t_1,t_1
 979         bcs,a   %xcc,.+8        !=
 980         add     c_3,t_2,c_3
 981         srlx    t_1,32,c_12
 982         st      t_1,rp(13)      !r[13]=c2;
 983         or      c_12,c_3,c_12   !=
 984
     /* Last column: the carry-out check is omitted here. */
 985         mulx    a_7,b_7,t_1     !mul_add_c(a[7],b[7],c3,c1,c2);
 986         addcc   c_12,t_1,t_1
 987         srlx    t_1,32,c_12     !=
 988         stuw    t_1,rp(14)      !r[14]=c3;
 989         stuw    c_12,rp(15)     !r[15]=c1;
 990
 991         ret
 992         restore %g0,%g0,%o0     !=
 993
 994 .type   bn_mul_comba8,#function
 995 .size   bn_mul_comba8,(.-bn_mul_comba8)
 996
 997 .align  32
 998
 999 .global bn_mul_comba4
1000 /*
1001  * void bn_mul_comba4(r,a,b)
1002  * BN_ULONG *r,*a,*b;
      *
      * Fully unrolled 4x4-word multiplication. The mul_add_c() notes on
      * the mulx instructions name each of the sixteen a[i]*b[j] terms,
      * grouped by result word; eight 32-bit words are stored through
      * rp(0)..rp(7).
      *
      * Names used below (ap/bp/rp, a_N/b_N, t_1, t_2, c_12, c_3 and
      * FRAME_SIZE are macros defined earlier in the file):
      *   t_1   64-bit product of the current mulx; for the last term
      *         of a column it also receives the column sum.
      *   t_2   the constant 1<<32 (mov 1, then sllx by 32).
      *   c_12  64-bit running sum; its low 32 bits become the next
      *         stored word.
      *   c_3   carries out of bit 63 of the running sum, counted in
      *         units of t_2; cleared at the start of each column.
      *
      * Carry idiom: "addcc; bcs,a %xcc,.+8; add c_3,t_2,c_3". The
      * branch target is the instruction after its own delay slot, and
      * the annul bit cancels the delay slot when the branch is not
      * taken, so the add executes only when the 64-bit addcc carried
      * out. A clr placed between addcc and bcs does not modify the
      * condition codes.
      *
      * End of a column: stuw writes the low word of t_1, srlx moves the
      * upper half of t_1 down into c_12, and "or" merges c_3 in as the
      * new upper half of c_12.
      *
      * Loads from a[] and b[] are interleaved with stores to r[].
      * NOTE(review): this presumably requires r not to overlap a or b;
      * confirm against the callers.
1003  */
1004 bn_mul_comba4:
1005         save    %sp,FRAME_SIZE,%sp
1006         lduw    ap(0),a_0
1007         mov     1,t_2
1008         lduw    bp(0),b_0
1009         sllx    t_2,32,t_2      !=
1010         lduw    bp(1),b_1
             /* Column 0: a single product; its upper half seeds c_12. */
1011         mulx    a_0,b_0,t_1     !mul_add_c(a[0],b[0],c1,c2,c3);
1012         srlx    t_1,32,c_12
1013         stuw    t_1,rp(0)       !=!r[0]=c1;
1014
             /* Column 1: first use of the carry idiom described in the header. */
1015         lduw    ap(1),a_1
1016         mulx    a_0,b_1,t_1     !mul_add_c(a[0],b[1],c2,c3,c1);
1017         addcc   c_12,t_1,c_12
1018         clr     c_3             !=
1019         bcs,a   %xcc,.+8
1020         add     c_3,t_2,c_3
1021         lduw    ap(2),a_2
1022         mulx    a_1,b_0,t_1     !=!mul_add_c(a[1],b[0],c2,c3,c1);
1023         addcc   c_12,t_1,t_1
1024         bcs,a   %xcc,.+8
1025         add     c_3,t_2,c_3
1026         srlx    t_1,32,c_12     !=
1027         stuw    t_1,rp(1)       !r[1]=c2;
1028         or      c_12,c_3,c_12
1029
1030         mulx    a_2,b_0,t_1     !mul_add_c(a[2],b[0],c3,c1,c2);
1031         addcc   c_12,t_1,c_12   !=
1032         clr     c_3
1033         bcs,a   %xcc,.+8
1034         add     c_3,t_2,c_3
1035         lduw    bp(2),b_2       !=
1036         mulx    a_1,b_1,t_1     !mul_add_c(a[1],b[1],c3,c1,c2);
1037         addcc   c_12,t_1,c_12
1038         bcs,a   %xcc,.+8
1039         add     c_3,t_2,c_3     !=
1040         lduw    bp(3),b_3
1041         mulx    a_0,b_2,t_1     !mul_add_c(a[0],b[2],c3,c1,c2);
1042         addcc   c_12,t_1,t_1
1043         bcs,a   %xcc,.+8        !=
1044         add     c_3,t_2,c_3
1045         srlx    t_1,32,c_12
1046         stuw    t_1,rp(2)       !r[2]=c3;
1047         or      c_12,c_3,c_12   !=
1048
1049         mulx    a_0,b_3,t_1     !mul_add_c(a[0],b[3],c1,c2,c3);
1050         addcc   c_12,t_1,c_12
1051         clr     c_3
1052         bcs,a   %xcc,.+8        !=
1053         add     c_3,t_2,c_3
1054         mulx    a_1,b_2,t_1     !mul_add_c(a[1],b[2],c1,c2,c3);
1055         addcc   c_12,t_1,c_12
1056         bcs,a   %xcc,.+8        !=
1057         add     c_3,t_2,c_3
1058         lduw    ap(3),a_3
1059         mulx    a_2,b_1,t_1     !mul_add_c(a[2],b[1],c1,c2,c3);
1060         addcc   c_12,t_1,c_12   !=
1061         bcs,a   %xcc,.+8
1062         add     c_3,t_2,c_3
1063         mulx    a_3,b_0,t_1     !mul_add_c(a[3],b[0],c1,c2,c3);!=
1064         addcc   c_12,t_1,t_1    !=
1065         bcs,a   %xcc,.+8
1066         add     c_3,t_2,c_3
1067         srlx    t_1,32,c_12
1068         stuw    t_1,rp(3)       !=!r[3]=c1;
1069         or      c_12,c_3,c_12
1070
1071         mulx    a_3,b_1,t_1     !mul_add_c(a[3],b[1],c2,c3,c1);
1072         addcc   c_12,t_1,c_12
1073         clr     c_3             !=
1074         bcs,a   %xcc,.+8
1075         add     c_3,t_2,c_3
1076         mulx    a_2,b_2,t_1     !mul_add_c(a[2],b[2],c2,c3,c1);
1077         addcc   c_12,t_1,c_12   !=
1078         bcs,a   %xcc,.+8
1079         add     c_3,t_2,c_3
1080         mulx    a_1,b_3,t_1     !mul_add_c(a[1],b[3],c2,c3,c1);
1081         addcc   c_12,t_1,t_1    !=
1082         bcs,a   %xcc,.+8
1083         add     c_3,t_2,c_3
1084         srlx    t_1,32,c_12
1085         stuw    t_1,rp(4)       !=!r[4]=c2;
1086         or      c_12,c_3,c_12
1087
1088         mulx    a_2,b_3,t_1     !mul_add_c(a[2],b[3],c3,c1,c2);
1089         addcc   c_12,t_1,c_12
1090         clr     c_3             !=
1091         bcs,a   %xcc,.+8
1092         add     c_3,t_2,c_3
1093         mulx    a_3,b_2,t_1     !mul_add_c(a[3],b[2],c3,c1,c2);
1094         addcc   c_12,t_1,t_1    !=
1095         bcs,a   %xcc,.+8
1096         add     c_3,t_2,c_3
1097         srlx    t_1,32,c_12
1098         stuw    t_1,rp(5)       !=!r[5]=c3;
1099         or      c_12,c_3,c_12
1100
             /* Last column: no carry check follows this addcc; the low word */
             /* of the sum is r[6] and its upper half is r[7].               */
1101         mulx    a_3,b_3,t_1     !mul_add_c(a[3],b[3],c1,c2,c3);
1102         addcc   c_12,t_1,t_1
1103         srlx    t_1,32,c_12     !=
1104         stuw    t_1,rp(6)       !r[6]=c1;
1105         stuw    c_12,rp(7)      !r[7]=c2;
1106
             /* restore sits in the delay slot of ret and writes %g0+%g0 (zero) */
             /* to the %o0 of the window being returned to.                     */
1107         ret
1108         restore %g0,%g0,%o0
1109
1110 .type   bn_mul_comba4,#function
1111 .size   bn_mul_comba4,(.-bn_mul_comba4)
1112
1113 .align  32
1114
1115 .global bn_sqr_comba8
     /*
      * void bn_sqr_comba8(r,a)
      * BN_ULONG *r,*a;
      *
      * (Prototype written to mirror the bn_sqr_comba4 header in this
      * file; only ap() and rp() are referenced below.)
      *
      * Fully unrolled square of the eight 32-bit words a[0]..a[7]; the
      * sixteen result words are stored through rp(0)..rp(15). The
      * sqr_add_c()/sqr_add_c2() notes on the mulx instructions name the
      * term handled at each step:
      *   sqr_add_c   a[i]*a[i], added once;
      *   sqr_add_c2  a[i]*a[j] with i != j, added twice by two
      *               successive addcc steps, each with its own carry
      *               check.
      *
      * Names used below (ap/rp, a_N, t_1, t_2, c_12, c_3 and FRAME_SIZE
      * are macros defined earlier in the file):
      *   t_1   64-bit product of the current mulx; on the last addcc of
      *         a column it also receives the column sum.
      *   t_2   the constant 1<<32 (mov 1, then sllx by 32).
      *   c_12  64-bit running sum; its low 32 bits become the next
      *         stored word.
      *   c_3   carries out of bit 63 of the running sum, counted in
      *         units of t_2; cleared at the start of each column.
      *
      * Carry idiom: "addcc; bcs,a %xcc,.+8; add c_3,t_2,c_3". The
      * branch target is the instruction after its own delay slot, and
      * the annul bit cancels the delay slot when the branch is not
      * taken, so the add executes only when the 64-bit addcc carried
      * out. A clr placed between addcc and bcs does not modify the
      * condition codes.
      *
      * End of a column: stuw writes the low word of t_1, srlx moves the
      * upper half of t_1 down into c_12, and "or" merges c_3 in as the
      * new upper half of c_12.
      *
      * Loads from a[] are interleaved with stores to r[].
      * NOTE(review): this presumably requires r not to overlap a;
      * confirm against the callers.
      */
1116 bn_sqr_comba8:
1117         save    %sp,FRAME_SIZE,%sp
1118         mov     1,t_2
1119         lduw    ap(0),a_0
1120         sllx    t_2,32,t_2
1121         lduw    ap(1),a_1
             /* Column 0: a single square; its upper half seeds c_12. */
1122         mulx    a_0,a_0,t_1     !sqr_add_c(a,0,c1,c2,c3);
1123         srlx    t_1,32,c_12
1124         stuw    t_1,rp(0)       !r[0]=c1;
1125
             /* Column 1: the same product is added twice (see header). */
1126         lduw    ap(2),a_2
1127         mulx    a_0,a_1,t_1     !=!sqr_add_c2(a,1,0,c2,c3,c1);
1128         addcc   c_12,t_1,c_12
1129         clr     c_3
1130         bcs,a   %xcc,.+8
1131         add     c_3,t_2,c_3
1132         addcc   c_12,t_1,t_1
1133         bcs,a   %xcc,.+8
1134         add     c_3,t_2,c_3
1135         srlx    t_1,32,c_12
1136         stuw    t_1,rp(1)       !r[1]=c2;
1137         or      c_12,c_3,c_12
1138
1139         mulx    a_2,a_0,t_1     !sqr_add_c2(a,2,0,c3,c1,c2);
1140         addcc   c_12,t_1,c_12
1141         clr     c_3
1142         bcs,a   %xcc,.+8
1143         add     c_3,t_2,c_3
1144         addcc   c_12,t_1,c_12
1145         bcs,a   %xcc,.+8
1146         add     c_3,t_2,c_3
1147         lduw    ap(3),a_3
1148         mulx    a_1,a_1,t_1     !sqr_add_c(a,1,c3,c1,c2);
1149         addcc   c_12,t_1,t_1
1150         bcs,a   %xcc,.+8
1151         add     c_3,t_2,c_3
1152         srlx    t_1,32,c_12
1153         stuw    t_1,rp(2)       !r[2]=c3;
1154         or      c_12,c_3,c_12
1155
1156         mulx    a_0,a_3,t_1     !sqr_add_c2(a,3,0,c1,c2,c3);
1157         addcc   c_12,t_1,c_12
1158         clr     c_3
1159         bcs,a   %xcc,.+8
1160         add     c_3,t_2,c_3
1161         addcc   c_12,t_1,c_12
1162         bcs,a   %xcc,.+8
1163         add     c_3,t_2,c_3
1164         lduw    ap(4),a_4
1165         mulx    a_1,a_2,t_1     !sqr_add_c2(a,2,1,c1,c2,c3);
1166         addcc   c_12,t_1,c_12
1167         bcs,a   %xcc,.+8
1168         add     c_3,t_2,c_3
1169         addcc   c_12,t_1,t_1
1170         bcs,a   %xcc,.+8
1171         add     c_3,t_2,c_3
1172         srlx    t_1,32,c_12
1173         stuw    t_1,rp(3)       !r[3]=c1;
1174         or      c_12,c_3,c_12
1175
1176         mulx    a_4,a_0,t_1     !sqr_add_c2(a,4,0,c2,c3,c1);
1177         addcc   c_12,t_1,c_12
1178         clr     c_3
1179         bcs,a   %xcc,.+8
1180         add     c_3,t_2,c_3
1181         addcc   c_12,t_1,c_12
1182         bcs,a   %xcc,.+8
1183         add     c_3,t_2,c_3
1184         mulx    a_3,a_1,t_1     !sqr_add_c2(a,3,1,c2,c3,c1);
1185         addcc   c_12,t_1,c_12
1186         bcs,a   %xcc,.+8
1187         add     c_3,t_2,c_3
1188         addcc   c_12,t_1,c_12
1189         bcs,a   %xcc,.+8
1190         add     c_3,t_2,c_3
1191         lduw    ap(5),a_5
1192         mulx    a_2,a_2,t_1     !sqr_add_c(a,2,c2,c3,c1);
1193         addcc   c_12,t_1,t_1
1194         bcs,a   %xcc,.+8
1195         add     c_3,t_2,c_3
1196         srlx    t_1,32,c_12
1197         stuw    t_1,rp(4)       !r[4]=c2;
1198         or      c_12,c_3,c_12
1199
1200         mulx    a_0,a_5,t_1     !sqr_add_c2(a,5,0,c3,c1,c2);
1201         addcc   c_12,t_1,c_12
1202         clr     c_3
1203         bcs,a   %xcc,.+8
1204         add     c_3,t_2,c_3
1205         addcc   c_12,t_1,c_12
1206         bcs,a   %xcc,.+8
1207         add     c_3,t_2,c_3
1208         mulx    a_1,a_4,t_1     !sqr_add_c2(a,4,1,c3,c1,c2);
1209         addcc   c_12,t_1,c_12
1210         bcs,a   %xcc,.+8
1211         add     c_3,t_2,c_3
1212         addcc   c_12,t_1,c_12
1213         bcs,a   %xcc,.+8
1214         add     c_3,t_2,c_3
1215         lduw    ap(6),a_6
1216         mulx    a_2,a_3,t_1     !sqr_add_c2(a,3,2,c3,c1,c2);
1217         addcc   c_12,t_1,c_12
1218         bcs,a   %xcc,.+8
1219         add     c_3,t_2,c_3
1220         addcc   c_12,t_1,t_1
1221         bcs,a   %xcc,.+8
1222         add     c_3,t_2,c_3
1223         srlx    t_1,32,c_12
1224         stuw    t_1,rp(5)       !r[5]=c3;
1225         or      c_12,c_3,c_12
1226
1227         mulx    a_6,a_0,t_1     !sqr_add_c2(a,6,0,c1,c2,c3);
1228         addcc   c_12,t_1,c_12
1229         clr     c_3
1230         bcs,a   %xcc,.+8
1231         add     c_3,t_2,c_3
1232         addcc   c_12,t_1,c_12
1233         bcs,a   %xcc,.+8
1234         add     c_3,t_2,c_3
1235         mulx    a_5,a_1,t_1     !sqr_add_c2(a,5,1,c1,c2,c3);
1236         addcc   c_12,t_1,c_12
1237         bcs,a   %xcc,.+8
1238         add     c_3,t_2,c_3
1239         addcc   c_12,t_1,c_12
1240         bcs,a   %xcc,.+8
1241         add     c_3,t_2,c_3
1242         mulx    a_4,a_2,t_1     !sqr_add_c2(a,4,2,c1,c2,c3);
1243         addcc   c_12,t_1,c_12
1244         bcs,a   %xcc,.+8
1245         add     c_3,t_2,c_3
1246         addcc   c_12,t_1,c_12
1247         bcs,a   %xcc,.+8
1248         add     c_3,t_2,c_3
1249         lduw    ap(7),a_7
1250         mulx    a_3,a_3,t_1     !=!sqr_add_c(a,3,c1,c2,c3);
1251         addcc   c_12,t_1,t_1
1252         bcs,a   %xcc,.+8
1253         add     c_3,t_2,c_3
1254         srlx    t_1,32,c_12
1255         stuw    t_1,rp(6)       !r[6]=c1;
1256         or      c_12,c_3,c_12
1257
1258         mulx    a_0,a_7,t_1     !sqr_add_c2(a,7,0,c2,c3,c1);
1259         addcc   c_12,t_1,c_12
1260         clr     c_3
1261         bcs,a   %xcc,.+8
1262         add     c_3,t_2,c_3
1263         addcc   c_12,t_1,c_12
1264         bcs,a   %xcc,.+8
1265         add     c_3,t_2,c_3
1266         mulx    a_1,a_6,t_1     !sqr_add_c2(a,6,1,c2,c3,c1);
1267         addcc   c_12,t_1,c_12
1268         bcs,a   %xcc,.+8
1269         add     c_3,t_2,c_3
1270         addcc   c_12,t_1,c_12
1271         bcs,a   %xcc,.+8
1272         add     c_3,t_2,c_3
1273         mulx    a_2,a_5,t_1     !sqr_add_c2(a,5,2,c2,c3,c1);
1274         addcc   c_12,t_1,c_12
1275         bcs,a   %xcc,.+8
1276         add     c_3,t_2,c_3
1277         addcc   c_12,t_1,c_12
1278         bcs,a   %xcc,.+8
1279         add     c_3,t_2,c_3
1280         mulx    a_3,a_4,t_1     !sqr_add_c2(a,4,3,c2,c3,c1);
1281         addcc   c_12,t_1,c_12
1282         bcs,a   %xcc,.+8
1283         add     c_3,t_2,c_3
1284         addcc   c_12,t_1,t_1
1285         bcs,a   %xcc,.+8
1286         add     c_3,t_2,c_3
1287         srlx    t_1,32,c_12
1288         stuw    t_1,rp(7)       !r[7]=c2;
1289         or      c_12,c_3,c_12
1290
1291         mulx    a_7,a_1,t_1     !sqr_add_c2(a,7,1,c3,c1,c2);
1292         addcc   c_12,t_1,c_12
1293         clr     c_3
1294         bcs,a   %xcc,.+8
1295         add     c_3,t_2,c_3
1296         addcc   c_12,t_1,c_12
1297         bcs,a   %xcc,.+8
1298         add     c_3,t_2,c_3
1299         mulx    a_6,a_2,t_1     !sqr_add_c2(a,6,2,c3,c1,c2);
1300         addcc   c_12,t_1,c_12
1301         bcs,a   %xcc,.+8
1302         add     c_3,t_2,c_3
1303         addcc   c_12,t_1,c_12
1304         bcs,a   %xcc,.+8
1305         add     c_3,t_2,c_3
1306         mulx    a_5,a_3,t_1     !sqr_add_c2(a,5,3,c3,c1,c2);
1307         addcc   c_12,t_1,c_12
1308         bcs,a   %xcc,.+8
1309         add     c_3,t_2,c_3
1310         addcc   c_12,t_1,c_12
1311         bcs,a   %xcc,.+8
1312         add     c_3,t_2,c_3
1313         mulx    a_4,a_4,t_1     !sqr_add_c(a,4,c3,c1,c2);
1314         addcc   c_12,t_1,t_1
1315         bcs,a   %xcc,.+8
1316         add     c_3,t_2,c_3
1317         srlx    t_1,32,c_12
1318         stuw    t_1,rp(8)       !r[8]=c3;
1319         or      c_12,c_3,c_12
1320
1321         mulx    a_2,a_7,t_1     !sqr_add_c2(a,7,2,c1,c2,c3);
1322         addcc   c_12,t_1,c_12
1323         clr     c_3
1324         bcs,a   %xcc,.+8
1325         add     c_3,t_2,c_3
1326         addcc   c_12,t_1,c_12
1327         bcs,a   %xcc,.+8
1328         add     c_3,t_2,c_3
1329         mulx    a_3,a_6,t_1     !sqr_add_c2(a,6,3,c1,c2,c3);
1330         addcc   c_12,t_1,c_12
1331         bcs,a   %xcc,.+8
1332         add     c_3,t_2,c_3
1333         addcc   c_12,t_1,c_12
1334         bcs,a   %xcc,.+8
1335         add     c_3,t_2,c_3
1336         mulx    a_4,a_5,t_1     !sqr_add_c2(a,5,4,c1,c2,c3);
1337         addcc   c_12,t_1,c_12
1338         bcs,a   %xcc,.+8
1339         add     c_3,t_2,c_3
1340         addcc   c_12,t_1,t_1
1341         bcs,a   %xcc,.+8
1342         add     c_3,t_2,c_3
1343         srlx    t_1,32,c_12
1344         stuw    t_1,rp(9)       !r[9]=c1;
1345         or      c_12,c_3,c_12
1346
1347         mulx    a_7,a_3,t_1     !sqr_add_c2(a,7,3,c2,c3,c1);
1348         addcc   c_12,t_1,c_12
1349         clr     c_3
1350         bcs,a   %xcc,.+8
1351         add     c_3,t_2,c_3
1352         addcc   c_12,t_1,c_12
1353         bcs,a   %xcc,.+8
1354         add     c_3,t_2,c_3
1355         mulx    a_6,a_4,t_1     !sqr_add_c2(a,6,4,c2,c3,c1);
1356         addcc   c_12,t_1,c_12
1357         bcs,a   %xcc,.+8
1358         add     c_3,t_2,c_3
1359         addcc   c_12,t_1,c_12
1360         bcs,a   %xcc,.+8
1361         add     c_3,t_2,c_3
1362         mulx    a_5,a_5,t_1     !sqr_add_c(a,5,c2,c3,c1);
1363         addcc   c_12,t_1,t_1
1364         bcs,a   %xcc,.+8
1365         add     c_3,t_2,c_3
1366         srlx    t_1,32,c_12
1367         stuw    t_1,rp(10)      !r[10]=c2;
1368         or      c_12,c_3,c_12
1369
1370         mulx    a_4,a_7,t_1     !sqr_add_c2(a,7,4,c3,c1,c2);
1371         addcc   c_12,t_1,c_12
1372         clr     c_3
1373         bcs,a   %xcc,.+8
1374         add     c_3,t_2,c_3
1375         addcc   c_12,t_1,c_12
1376         bcs,a   %xcc,.+8
1377         add     c_3,t_2,c_3
1378         mulx    a_5,a_6,t_1     !sqr_add_c2(a,6,5,c3,c1,c2);
1379         addcc   c_12,t_1,c_12
1380         bcs,a   %xcc,.+8
1381         add     c_3,t_2,c_3
1382         addcc   c_12,t_1,t_1
1383         bcs,a   %xcc,.+8
1384         add     c_3,t_2,c_3
1385         srlx    t_1,32,c_12
1386         stuw    t_1,rp(11)      !r[11]=c3;
1387         or      c_12,c_3,c_12
1388
1389         mulx    a_7,a_5,t_1     !sqr_add_c2(a,7,5,c1,c2,c3);
1390         addcc   c_12,t_1,c_12
1391         clr     c_3
1392         bcs,a   %xcc,.+8
1393         add     c_3,t_2,c_3
1394         addcc   c_12,t_1,c_12
1395         bcs,a   %xcc,.+8
1396         add     c_3,t_2,c_3
1397         mulx    a_6,a_6,t_1     !sqr_add_c(a,6,c1,c2,c3);
1398         addcc   c_12,t_1,t_1
1399         bcs,a   %xcc,.+8
1400         add     c_3,t_2,c_3
1401         srlx    t_1,32,c_12
1402         stuw    t_1,rp(12)      !r[12]=c1;
1403         or      c_12,c_3,c_12
1404
1405         mulx    a_6,a_7,t_1     !sqr_add_c2(a,7,6,c2,c3,c1);
1406         addcc   c_12,t_1,c_12
1407         clr     c_3
1408         bcs,a   %xcc,.+8
1409         add     c_3,t_2,c_3
1410         addcc   c_12,t_1,t_1
1411         bcs,a   %xcc,.+8
1412         add     c_3,t_2,c_3
1413         srlx    t_1,32,c_12
1414         stuw    t_1,rp(13)      !r[13]=c2;
1415         or      c_12,c_3,c_12
1416
             /* Last column: no carry check follows this addcc; the low word */
             /* of the sum is r[14] and its upper half is r[15].             */
1417         mulx    a_7,a_7,t_1     !sqr_add_c(a,7,c3,c1,c2);
1418         addcc   c_12,t_1,t_1
1419         srlx    t_1,32,c_12
1420         stuw    t_1,rp(14)      !r[14]=c3;
1421         stuw    c_12,rp(15)     !r[15]=c1;
1422
             /* restore sits in the delay slot of ret and writes %g0+%g0 (zero) */
             /* to the %o0 of the window being returned to.                     */
1423         ret
1424         restore %g0,%g0,%o0
1425
1426 .type   bn_sqr_comba8,#function
1427 .size   bn_sqr_comba8,(.-bn_sqr_comba8)
1428
1429 .align  32
1430
1431 .global bn_sqr_comba4
1432 /*
1433  * void bn_sqr_comba4(r,a)
1434  * BN_ULONG *r,*a;
      *
      * Fully unrolled square of the four 32-bit words a[0]..a[3]; the
      * eight result words are stored through rp(0)..rp(7). The
      * sqr_add_c()/sqr_add_c2() notes on the mulx instructions name the
      * term handled at each step:
      *   sqr_add_c   a[i]*a[i], added once;
      *   sqr_add_c2  a[i]*a[j] with i != j, added twice by two
      *               successive addcc steps, each with its own carry
      *               check.
      *
      * Names used below (ap/rp, a_N, t_1, t_2, c_12, c_3 and FRAME_SIZE
      * are macros defined earlier in the file):
      *   t_1   64-bit product of the current mulx; on the last addcc of
      *         a column it also receives the column sum.
      *   t_2   the constant 1<<32 (mov 1, then sllx by 32).
      *   c_12  64-bit running sum; its low 32 bits become the next
      *         stored word.
      *   c_3   carries out of bit 63 of the running sum, counted in
      *         units of t_2; cleared at the start of each column.
      *
      * Carry idiom: "addcc; bcs,a %xcc,.+8; add c_3,t_2,c_3". The
      * branch target is the instruction after its own delay slot, and
      * the annul bit cancels the delay slot when the branch is not
      * taken, so the add executes only when the 64-bit addcc carried
      * out. A clr placed between addcc and bcs does not modify the
      * condition codes.
      *
      * End of a column: stuw writes the low word of t_1, srlx moves the
      * upper half of t_1 down into c_12, and "or" merges c_3 in as the
      * new upper half of c_12.
      *
      * Loads from a[] are interleaved with stores to r[].
      * NOTE(review): this presumably requires r not to overlap a;
      * confirm against the callers.
1435  */
1436 bn_sqr_comba4:
1437         save    %sp,FRAME_SIZE,%sp
1438         mov     1,t_2
1439         lduw    ap(0),a_0
1440         sllx    t_2,32,t_2
1441         lduw    ap(1),a_1
             /* Column 0: a single square; its upper half seeds c_12. */
1442         mulx    a_0,a_0,t_1     !sqr_add_c(a,0,c1,c2,c3);
1443         srlx    t_1,32,c_12
1444         stuw    t_1,rp(0)       !r[0]=c1;
1445
             /* Column 1: the same product is added twice (see header). */
1446         lduw    ap(2),a_2
1447         mulx    a_0,a_1,t_1     !sqr_add_c2(a,1,0,c2,c3,c1);
1448         addcc   c_12,t_1,c_12
1449         clr     c_3
1450         bcs,a   %xcc,.+8
1451         add     c_3,t_2,c_3
1452         addcc   c_12,t_1,t_1
1453         bcs,a   %xcc,.+8
1454         add     c_3,t_2,c_3
1455         srlx    t_1,32,c_12
1456         stuw    t_1,rp(1)       !r[1]=c2;
1457         or      c_12,c_3,c_12
1458
1459         mulx    a_2,a_0,t_1     !sqr_add_c2(a,2,0,c3,c1,c2);
1460         addcc   c_12,t_1,c_12
1461         clr     c_3
1462         bcs,a   %xcc,.+8
1463         add     c_3,t_2,c_3
1464         addcc   c_12,t_1,c_12
1465         bcs,a   %xcc,.+8
1466         add     c_3,t_2,c_3
1467         lduw    ap(3),a_3
1468         mulx    a_1,a_1,t_1     !sqr_add_c(a,1,c3,c1,c2);
1469         addcc   c_12,t_1,t_1
1470         bcs,a   %xcc,.+8
1471         add     c_3,t_2,c_3
1472         srlx    t_1,32,c_12
1473         stuw    t_1,rp(2)       !r[2]=c3;
1474         or      c_12,c_3,c_12
1475
1476         mulx    a_0,a_3,t_1     !sqr_add_c2(a,3,0,c1,c2,c3);
1477         addcc   c_12,t_1,c_12
1478         clr     c_3
1479         bcs,a   %xcc,.+8
1480         add     c_3,t_2,c_3
1481         addcc   c_12,t_1,c_12
1482         bcs,a   %xcc,.+8
1483         add     c_3,t_2,c_3
1484         mulx    a_1,a_2,t_1     !sqr_add_c2(a,2,1,c1,c2,c3);
1485         addcc   c_12,t_1,c_12
1486         bcs,a   %xcc,.+8
1487         add     c_3,t_2,c_3
1488         addcc   c_12,t_1,t_1
1489         bcs,a   %xcc,.+8
1490         add     c_3,t_2,c_3
1491         srlx    t_1,32,c_12
1492         stuw    t_1,rp(3)       !r[3]=c1;
1493         or      c_12,c_3,c_12
1494
1495         mulx    a_3,a_1,t_1     !sqr_add_c2(a,3,1,c2,c3,c1);
1496         addcc   c_12,t_1,c_12
1497         clr     c_3
1498         bcs,a   %xcc,.+8
1499         add     c_3,t_2,c_3
1500         addcc   c_12,t_1,c_12
1501         bcs,a   %xcc,.+8
1502         add     c_3,t_2,c_3
1503         mulx    a_2,a_2,t_1     !sqr_add_c(a,2,c2,c3,c1);
1504         addcc   c_12,t_1,t_1
1505         bcs,a   %xcc,.+8
1506         add     c_3,t_2,c_3
1507         srlx    t_1,32,c_12
1508         stuw    t_1,rp(4)       !r[4]=c2;
1509         or      c_12,c_3,c_12
1510
1511         mulx    a_2,a_3,t_1     !sqr_add_c2(a,3,2,c3,c1,c2);
1512         addcc   c_12,t_1,c_12
1513         clr     c_3
1514         bcs,a   %xcc,.+8
1515         add     c_3,t_2,c_3
1516         addcc   c_12,t_1,t_1
1517         bcs,a   %xcc,.+8
1518         add     c_3,t_2,c_3
1519         srlx    t_1,32,c_12
1520         stuw    t_1,rp(5)       !r[5]=c3;
1521         or      c_12,c_3,c_12
1522
             /* Last column: no carry check follows this addcc; the low word */
             /* of the sum is r[6] and its upper half is r[7].               */
1523         mulx    a_3,a_3,t_1     !sqr_add_c(a,3,c1,c2,c3);
1524         addcc   c_12,t_1,t_1
1525         srlx    t_1,32,c_12
1526         stuw    t_1,rp(6)       !r[6]=c1;
1527         stuw    c_12,rp(7)      !r[7]=c2;
1528
             /* restore sits in the delay slot of ret and writes %g0+%g0 (zero) */
             /* to the %o0 of the window being returned to.                     */
1529         ret
1530         restore %g0,%g0,%o0
1531
1532 .type   bn_sqr_comba4,#function
1533 .size   bn_sqr_comba4,(.-bn_sqr_comba4)
1534
1535 .align  32