crypto/bn/asm/ppc.pl

   1 #!/usr/bin/env perl
   2 #
   3 # Implemented as a Perl wrapper as we want to support several different
   4 # architectures with single file. We pick up the target based on the
   5 # file name we are asked to generate.
   6 #
   7 # It should be noted though that this perl code is nothing like
   8 # <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
   9 # as pre-processor to cover for platform differences in name decoration,
  10 # linker tables, 32-/64-bit instruction sets...
  11 #
  12 # As you might know there're several PowerPC ABI in use. Most notably
  13 # Linux and AIX use different 32-bit ABIs. Good news are that these ABIs
  14 # are similar enough to implement leaf(!) functions, which would be ABI
  15 # neutral. And that's what you find here: ABI neutral leaf functions.
  16 # In case you wonder what that is...
  17 #
  18 #       AIX performance
  19 #
  20 #       MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
  21 #
  22 #       The following is the performance of 32-bit compiler
  23 #       generated code:
  24 #
  25 #       OpenSSL 0.9.6c 21 dec 2001
  26 #       built on: Tue Jun 11 11:06:51 EDT 2002
  27 #       options:bn(64,32) ...
  28 #compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
  29 #                  sign    verify    sign/s verify/s
  30 #rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
  31 #rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
  32 #rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
  33 #rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
  34 #dsa  512 bits   0.0087s   0.0106s    114.3     94.5
  35 #dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
  36 #
  37 #       Same benchmark with this assembler code:
  38 #
  39 #rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
  40 #rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
  41 #rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
  42 #rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
  43 #dsa  512 bits   0.0052s   0.0062s    191.6    162.0
  44 #dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
  45 #
  46 #       Number of operations increases by almost 75%
  47 #
  48 #       Here are performance numbers for 64-bit compiler
  49 #       generated code:
  50 #
  51 #       OpenSSL 0.9.6g [engine] 9 Aug 2002
  52 #       built on: Fri Apr 18 16:59:20 EDT 2003
  53 #       options:bn(64,64) ...
  54 #       compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
  55 #                  sign    verify    sign/s verify/s
  56 #rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
  57 #rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
  58 #rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
  59 #rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
  60 #dsa  512 bits   0.0026s   0.0032s    382.5    313.7
  61 #dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
  62 #
  63 #       Same benchmark with this assembler code:
  64 #
  65 #rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
  66 #rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
  67 #rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
  68 #rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
  69 #dsa  512 bits   0.0016s   0.0020s    610.7    507.1
  70 #dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
  71 #
  72 #       Again, performance increases by about 75%
  73 #
  74 #       Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
  75 #       OpenSSL 0.9.7c 30 Sep 2003
  76 #
  77 #       Original code.
  78 #
  79 #rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
  80 #rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
  81 #rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
  82 #rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
  83 #dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
  84 #dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
  85 #dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
  86 #
  87 #       Same benchmark with this assembler code:
  88 #
  89 #rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
  90 #rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
  91 #rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
  92 #rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
  93 #dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
  94 #dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
  95 #dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
  96 #
  97 #        Performance increase of ~60%
  98 #
  99 #       If you have comments or suggestions to improve code send
 100 #       me a note at schari@us.ibm.com
 101 #
 102
 103 $flavour = shift;           # target flavour: first command-line argument
 104 # Pick the word size and the matching PowerPC mnemonics for that flavour.
 105 if ($flavour =~ /32/) {
 106         $BITS=  32;
 107         $BNSZ=  $BITS/8;        # word size in bytes, used for load/store offsets
 108         $ISA=   "\"ppc\"";
 109
 110         $LD=    "lwz";          # load
 111         $LDU=   "lwzu";         # load and update
 112         $ST=    "stw";          # store
 113         $STU=   "stwu";         # store and update
 114         $UMULL= "mullw";        # unsigned multiply low
 115         $UMULH= "mulhwu";       # unsigned multiply high
 116         $UDIV=  "divwu";        # unsigned divide
 117         $UCMPI= "cmplwi";       # unsigned compare with immediate
 118         $UCMP=  "cmplw";        # unsigned compare
 119         $CNTLZ= "cntlzw";       # count leading zeros
 120         $SHL=   "slw";          # shift left
 121         $SHR=   "srw";          # unsigned shift right
 122         $SHRI=  "srwi";         # unsigned shift right by immediate
 123         $SHLI=  "slwi";         # shift left by immediate
 124         $CLRU=  "clrlwi";       # clear upper bits
 125         $INSR=  "insrwi";       # insert right
 126         $ROTL=  "rotlwi";       # rotate left by immediate
 127         $TR=    "tw";           # conditional trap
 128 } elsif ($flavour =~ /64/) {
 129         $BITS=  64;
 130         $BNSZ=  $BITS/8;        # word size in bytes, used for load/store offsets
 131         $ISA=   "\"ppc64\"";
 132
 133         # same as above, but 64-bit mnemonics...
 134         $LD=    "ld";           # load
 135         $LDU=   "ldu";          # load and update
 136         $ST=    "std";          # store
 137         $STU=   "stdu";         # store and update
 138         $UMULL= "mulld";        # unsigned multiply low
 139         $UMULH= "mulhdu";       # unsigned multiply high
 140         $UDIV=  "divdu";        # unsigned divide
 141         $UCMPI= "cmpldi";       # unsigned compare with immediate
 142         $UCMP=  "cmpld";        # unsigned compare
 143         $CNTLZ= "cntlzd";       # count leading zeros
 144         $SHL=   "sld";          # shift left
 145         $SHR=   "srd";          # unsigned shift right
 146         $SHRI=  "srdi";         # unsigned shift right by immediate
 147         $SHLI=  "sldi";         # shift left by immediate
 148         $CLRU=  "clrldi";       # clear upper bits
 149         $INSR=  "insrdi";       # insert right
 150         $ROTL=  "rotldi";       # rotate left by immediate
 151         $TR=    "td";           # conditional trap
 152 } else { die "nonsense $flavour"; }
 153 # Look for ppc-xlate.pl beside this script first, then under ../../perlasm/.
 154 $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";   # "" when $0 has no directory part
 155 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
 156 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
 157 die "can't locate ppc-xlate.pl";
 158 # Use "or", not "||": "||" bound to the command string, so open() was never checked.
 159 open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!";
 160
 161 $data=<<EOF;
 162 #--------------------------------------------------------------------
 163 #
 164 #
 165 #
 166 #
 167 #       File:           ppc32.s
 168 #
 169 #       Created by:     Suresh Chari
 170 #                       IBM Thomas J. Watson Research Library
 171 #                       Hawthorne, NY
 172 #
 173 #
 174 #       Description:    Optimized assembly routines for OpenSSL crypto
 175 #                       on the 32-bit PowerPC platform.
 176 #
 177 #
 178 #       Version History
 179 #
 180 #       2. Fixed bn_add,bn_sub and bn_div_words, added comments,
 181 #          cleaned up code. Also made a single version which can
 182 #          be used for both the AIX and Linux compilers. See NOTE
 183 #          below.
 184 #                               12/05/03                Suresh Chari
 185 #                       (with lots of help from)        Andy Polyakov
 186 ##
 187 #       1. Initial version      10/20/02                Suresh Chari
 188 #
 189 #
 190 #       The following file works for the xlc,cc
 191 #       and gcc compilers.
 192 #
 193 #       NOTE:   To get the file to link correctly with the gcc compiler
 194 #               you have to change the names of the routines and remove
 195 #               the first .(dot) character. This should automatically
 196 #               be done in the build process.
 197 #
 198 #       Hand optimized assembly code for the following routines
 199 #
 200 #       bn_sqr_comba4
 201 #       bn_sqr_comba8
 202 #       bn_mul_comba4
 203 #       bn_mul_comba8
 204 #       bn_sub_words
 205 #       bn_add_words
 206 #       bn_div_words
 207 #       bn_sqr_words
 208 #       bn_mul_words
 209 #       bn_mul_add_words
 210 #
 211 #       NOTE:   It is possible to optimize this code more for
 212 #       specific PowerPC or Power architectures. On the Northstar
 213 #       architecture the optimizations in this file do
 214 #        NOT provide much improvement.
 215 #
 216 #       If you have comments or suggestions to improve code send
 217 #       me a note at schari\@us.ibm.com
 218 #
 219 #--------------------------------------------------------------------------
 220 #
 221 #       Defines to be used in the assembly code.
 222 #
 223 #.set r0,0      # we use it as storage for value of 0
 224 #.set SP,1      # preserved
 225 #.set RTOC,2    # preserved
 226 #.set r3,3      # 1st argument/return value
 227 #.set r4,4      # 2nd argument/volatile register
 228 #.set r5,5      # 3rd argument/volatile register
 229 #.set r6,6      # ...
 230 #.set r7,7
 231 #.set r8,8
 232 #.set r9,9
 233 #.set r10,10
 234 #.set r11,11
 235 #.set r12,12
 236 #.set r13,13    # not used, nor any other "below" it...
 237
 238 #       Declare function names to be global
 239 #       NOTE:   For gcc these names MUST be changed to remove
 240 #               the first . i.e. for example change ".bn_sqr_comba4"
 241 #               to "bn_sqr_comba4". This should be automatically done
 242 #               in the build.
 243
 244         .globl  .bn_sqr_comba4
 245         .globl  .bn_sqr_comba8
 246         .globl  .bn_mul_comba4
 247         .globl  .bn_mul_comba8
 248         .globl  .bn_sub_words
 249         .globl  .bn_add_words
 250         .globl  .bn_div_words
 251         .globl  .bn_sqr_words
 252         .globl  .bn_mul_words
 253         .globl  .bn_mul_add_words
 254
 255 # .text section
 256
 257         .machine        "any"
 258
 259 #
 260 #       NOTE:   The following label name should be changed to
 261 #               "bn_sqr_comba4" i.e. remove the first dot
 262 #               for the gcc compiler. This should be automatically
 263 #               done in the build
 264 #
 265
 266 .align  4
 267 .bn_sqr_comba4:
 268 #
 269 # Optimized version of bn_sqr_comba4: r[0..7] = square of a[0..3].
 270 #
 271 # void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
 272 # r3 contains r
 273 # r4 contains a
 274 #
 275 # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
 276 #
 277 # r5,r6 are the two BN_ULONGs being multiplied.
 278 # r7,r8 are the low and high words of each double-width product.
 279 # r9,r10, r11 are the equivalents of c1,c2, c3.
 280 # Here's the assembly
 281 # Each sqr_add_c2 step doubles the product before accumulating it.
 282 #
 283         xor             r0,r0,r0                # set r0 = 0. Used in the addze
 284                                                 # instructions below
 285
 286                                                 #sqr_add_c(a,0,c1,c2,c3)
 287         $LD             r5,`0*$BNSZ`(r4)
 288         $UMULL          r9,r5,r5
 289         $UMULH          r10,r5,r5               #first product: nothing to
 290                                                 #add since c1=c2=c3=0.
 291                                                 # Note c3(r11) is NOT zeroed here;
 292                                                 # it is first written by addze below.
 293
 294         $ST             r9,`0*$BNSZ`(r3)        # r[0]=c1;
 295                                                 # sqr_add_c2(a,1,0,c2,c3,c1);
 296         $LD             r6,`1*$BNSZ`(r4)
 297         $UMULL          r7,r5,r6
 298         $UMULH          r8,r5,r6
 299
 300         addc            r7,r7,r7                # compute (r7,r8)=2*(r7,r8)
 301         adde            r8,r8,r8
 302         addze           r9,r0                   # catch carry if any.
 303                                                 # r9= r0(=0) and carry
 304
 305         addc            r10,r7,r10              # now add to temp result.
 306         addze           r11,r8                  # c3(r11) = r8 + carry; first write of r11
 307         addze           r9,r9                   # fold carry into c1(r9)
 308
 309         $ST             r10,`1*$BNSZ`(r3)       #r[1]=c2;
 310                                                 #sqr_add_c(a,1,c3,c1,c2)
 311         $UMULL          r7,r6,r6
 312         $UMULH          r8,r6,r6
 313         addc            r11,r7,r11
 314         adde            r9,r8,r9
 315         addze           r10,r0                  # c2(r10) restarts as just the carry
 316                                                 #sqr_add_c2(a,2,0,c3,c1,c2)
 317         $LD             r6,`2*$BNSZ`(r4)
 318         $UMULL          r7,r5,r6
 319         $UMULH          r8,r5,r6
 320
 321         addc            r7,r7,r7                # double (r8,r7) ...
 322         adde            r8,r8,r8
 323         addze           r10,r10                 # ... carry out goes into c2
 324
 325         addc            r11,r7,r11
 326         adde            r9,r8,r9
 327         addze           r10,r10
 328         $ST             r11,`2*$BNSZ`(r3)       #r[2]=c3
 329                                                 #sqr_add_c2(a,3,0,c1,c2,c3);
 330         $LD             r6,`3*$BNSZ`(r4)
 331         $UMULL          r7,r5,r6
 332         $UMULH          r8,r5,r6
 333         addc            r7,r7,r7
 334         adde            r8,r8,r8
 335         addze           r11,r0                  # c3(r11) restarts as just the carry
 336
 337         addc            r9,r7,r9
 338         adde            r10,r8,r10
 339         addze           r11,r11
 340                                                 #sqr_add_c2(a,2,1,c1,c2,c3);
 341         $LD             r5,`1*$BNSZ`(r4)
 342         $LD             r6,`2*$BNSZ`(r4)
 343         $UMULL          r7,r5,r6
 344         $UMULH          r8,r5,r6
 345
 346         addc            r7,r7,r7
 347         adde            r8,r8,r8
 348         addze           r11,r11
 349         addc            r9,r7,r9
 350         adde            r10,r8,r10
 351         addze           r11,r11
 352         $ST             r9,`3*$BNSZ`(r3)        #r[3]=c1
 353                                                 #sqr_add_c(a,2,c2,c3,c1);
 354         $UMULL          r7,r6,r6
 355         $UMULH          r8,r6,r6
 356         addc            r10,r7,r10
 357         adde            r11,r8,r11
 358         addze           r9,r0                   # c1(r9) restarts as just the carry
 359                                                 #sqr_add_c2(a,3,1,c2,c3,c1);
 360         $LD             r6,`3*$BNSZ`(r4)
 361         $UMULL          r7,r5,r6
 362         $UMULH          r8,r5,r6
 363         addc            r7,r7,r7
 364         adde            r8,r8,r8
 365         addze           r9,r9
 366
 367         addc            r10,r7,r10
 368         adde            r11,r8,r11
 369         addze           r9,r9
 370         $ST             r10,`4*$BNSZ`(r3)       #r[4]=c2
 371                                                 #sqr_add_c2(a,3,2,c3,c1,c2);
 372         $LD             r5,`2*$BNSZ`(r4)
 373         $UMULL          r7,r5,r6
 374         $UMULH          r8,r5,r6
 375         addc            r7,r7,r7
 376         adde            r8,r8,r8
 377         addze           r10,r0                  # c2(r10) restarts as just the carry
 378
 379         addc            r11,r7,r11
 380         adde            r9,r8,r9
 381         addze           r10,r10
 382         $ST             r11,`5*$BNSZ`(r3)       #r[5] = c3
 383                                                 #sqr_add_c(a,3,c1,c2,c3);
 384         $UMULL          r7,r6,r6
 385         $UMULH          r8,r6,r6
 386         addc            r9,r7,r9                # last column: only two words
 387         adde            r10,r8,r10              # are accumulated, no addze
 388
 389         $ST             r9,`6*$BNSZ`(r3)        #r[6]=c1
 390         $ST             r10,`7*$BNSZ`(r3)       #r[7]=c2
 391         blr
 392         .long   0                               # NOTE(review): this trailing data looks like a traceback table - confirm
 393         .byte   0,12,0x14,0,0,0,2,0
 394         .long   0
 395
 396 #
 397 #       NOTE:   The following label name should be changed to
 398 #               "bn_sqr_comba8" i.e. remove the first dot
 399 #               for the gcc compiler. This should be automatically
 400 #               done in the build
 401 #
 402
 403 .align  4
 404 .bn_sqr_comba8:
 405 #
 406 # This is an optimized version of the bn_sqr_comba8 routine.
 407 # Tightly uses the adde instruction
 408 #
 409 #
 410 # void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
 411 # r3 contains r
 412 # r4 contains a
 413 #
 414 # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
 415 #
 416 # r5,r6 are the two BN_ULONGs being multiplied.
 417 # r7,r8 are the results of the 32x32 giving 64 bit multiply.
 418 # r9,r10, r11 are the equivalents of c1,c2, c3.
 419 #
 420 # Possible optimization of loading all 8 words of a into registers
 421 # doesn't provide any speedup
 422 #
 423
 424         xor             r0,r0,r0                #set r0 = 0.Used in addze
 425                                                 #instructions below.
 426
 427                                                 #sqr_add_c(a,0,c1,c2,c3);
 428         $LD             r5,`0*$BNSZ`(r4)
 429         $UMULL          r9,r5,r5                #1st iteration: no carries.
 430         $UMULH          r10,r5,r5
 431         $ST             r9,`0*$BNSZ`(r3)        # r[0]=c1;
 432                                                 #sqr_add_c2(a,1,0,c2,c3,c1);
 433         $LD             r6,`1*$BNSZ`(r4)
 434         $UMULL          r7,r5,r6
 435         $UMULH          r8,r5,r6
 436
 437         addc            r10,r7,r10              #add the two register number
 438         adde            r11,r8,r0               # (r8,r7) to the three register
 439         addze           r9,r0                   # number (r9,r11,r10).NOTE:r0=0
 440
 441         addc            r10,r7,r10              #add the two register number
 442         adde            r11,r8,r11              # (r8,r7) to the three register
 443         addze           r9,r9                   # number (r9,r11,r10).
 444
 445         $ST             r10,`1*$BNSZ`(r3)       # r[1]=c2
 446
 447                                                 #sqr_add_c(a,1,c3,c1,c2);
 448         $UMULL          r7,r6,r6
 449         $UMULH          r8,r6,r6
 450         addc            r11,r7,r11
 451         adde            r9,r8,r9
 452         addze           r10,r0
 453                                                 #sqr_add_c2(a,2,0,c3,c1,c2);
 454         $LD             r6,`2*$BNSZ`(r4)
 455         $UMULL          r7,r5,r6
 456         $UMULH          r8,r5,r6
 457
 458         addc            r11,r7,r11
 459         adde            r9,r8,r9
 460         addze           r10,r10
 461
 462         addc            r11,r7,r11
 463         adde            r9,r8,r9
 464         addze           r10,r10
 465
 466         $ST             r11,`2*$BNSZ`(r3)       #r[2]=c3
 467                                                 #sqr_add_c2(a,3,0,c1,c2,c3);
 468         $LD             r6,`3*$BNSZ`(r4)        #r6 = a[3]. r5 is already a[0].
 469         $UMULL          r7,r5,r6
 470         $UMULH          r8,r5,r6
 471
 472         addc            r9,r7,r9
 473         adde            r10,r8,r10
 474         addze           r11,r0
 475
 476         addc            r9,r7,r9
 477         adde            r10,r8,r10
 478         addze           r11,r11
 479                                                 #sqr_add_c2(a,2,1,c1,c2,c3);
 480         $LD             r5,`1*$BNSZ`(r4)
 481         $LD             r6,`2*$BNSZ`(r4)
 482         $UMULL          r7,r5,r6
 483         $UMULH          r8,r5,r6
 484
 485         addc            r9,r7,r9
 486         adde            r10,r8,r10
 487         addze           r11,r11
 488
 489         addc            r9,r7,r9
 490         adde            r10,r8,r10
 491         addze           r11,r11
 492
 493         $ST             r9,`3*$BNSZ`(r3)        #r[3]=c1;
 494                                                 #sqr_add_c(a,2,c2,c3,c1);
 495         $UMULL          r7,r6,r6
 496         $UMULH          r8,r6,r6
 497
 498         addc            r10,r7,r10
 499         adde            r11,r8,r11
 500         addze           r9,r0
 501                                                 #sqr_add_c2(a,3,1,c2,c3,c1);
 502         $LD             r6,`3*$BNSZ`(r4)
 503         $UMULL          r7,r5,r6
 504         $UMULH          r8,r5,r6
 505
 506         addc            r10,r7,r10
 507         adde            r11,r8,r11
 508         addze           r9,r9
 509
 510         addc            r10,r7,r10
 511         adde            r11,r8,r11
 512         addze           r9,r9
 513                                                 #sqr_add_c2(a,4,0,c2,c3,c1);
 514         $LD             r5,`0*$BNSZ`(r4)
 515         $LD             r6,`4*$BNSZ`(r4)
 516         $UMULL          r7,r5,r6
 517         $UMULH          r8,r5,r6
 518
 519         addc            r10,r7,r10
 520         adde            r11,r8,r11
 521         addze           r9,r9
 522
 523         addc            r10,r7,r10
 524         adde            r11,r8,r11
 525         addze           r9,r9
 526         $ST             r10,`4*$BNSZ`(r3)       #r[4]=c2;
 527                                                 #sqr_add_c2(a,5,0,c3,c1,c2);
 528         $LD             r6,`5*$BNSZ`(r4)
 529         $UMULL          r7,r5,r6
 530         $UMULH          r8,r5,r6
 531
 532         addc            r11,r7,r11
 533         adde            r9,r8,r9
 534         addze           r10,r0
 535
 536         addc            r11,r7,r11
 537         adde            r9,r8,r9
 538         addze           r10,r10
 539                                                 #sqr_add_c2(a,4,1,c3,c1,c2);
 540         $LD             r5,`1*$BNSZ`(r4)
 541         $LD             r6,`4*$BNSZ`(r4)
 542         $UMULL          r7,r5,r6
 543         $UMULH          r8,r5,r6
 544
 545         addc            r11,r7,r11
 546         adde            r9,r8,r9
 547         addze           r10,r10
 548
 549         addc            r11,r7,r11
 550         adde            r9,r8,r9
 551         addze           r10,r10
 552                                                 #sqr_add_c2(a,3,2,c3,c1,c2);
 553         $LD             r5,`2*$BNSZ`(r4)
 554         $LD             r6,`3*$BNSZ`(r4)
 555         $UMULL          r7,r5,r6
 556         $UMULH          r8,r5,r6
 557
 558         addc            r11,r7,r11
 559         adde            r9,r8,r9
 560         addze           r10,r10
 561
 562         addc            r11,r7,r11
 563         adde            r9,r8,r9
 564         addze           r10,r10
 565         $ST             r11,`5*$BNSZ`(r3)       #r[5]=c3;
 566                                                 #sqr_add_c(a,3,c1,c2,c3);
 567         $UMULL          r7,r6,r6
 568         $UMULH          r8,r6,r6
 569         addc            r9,r7,r9
 570         adde            r10,r8,r10
 571         addze           r11,r0
 572                                                 #sqr_add_c2(a,4,2,c1,c2,c3);
 573         $LD             r6,`4*$BNSZ`(r4)
 574         $UMULL          r7,r5,r6
 575         $UMULH          r8,r5,r6
 576
 577         addc            r9,r7,r9
 578         adde            r10,r8,r10
 579         addze           r11,r11
 580
 581         addc            r9,r7,r9
 582         adde            r10,r8,r10
 583         addze           r11,r11
 584                                                 #sqr_add_c2(a,5,1,c1,c2,c3);
 585         $LD             r5,`1*$BNSZ`(r4)
 586         $LD             r6,`5*$BNSZ`(r4)
 587         $UMULL          r7,r5,r6
 588         $UMULH          r8,r5,r6
 589
 590         addc            r9,r7,r9
 591         adde            r10,r8,r10
 592         addze           r11,r11
 593
 594         addc            r9,r7,r9
 595         adde            r10,r8,r10
 596         addze           r11,r11
 597                                                 #sqr_add_c2(a,6,0,c1,c2,c3);
 598         $LD             r5,`0*$BNSZ`(r4)
 599         $LD             r6,`6*$BNSZ`(r4)
 600         $UMULL          r7,r5,r6
 601         $UMULH          r8,r5,r6
 602         addc            r9,r7,r9
 603         adde            r10,r8,r10
 604         addze           r11,r11
 605         addc            r9,r7,r9
 606         adde            r10,r8,r10
 607         addze           r11,r11
 608         $ST             r9,`6*$BNSZ`(r3)        #r[6]=c1;
 609                                                 #sqr_add_c2(a,7,0,c2,c3,c1);
 610         $LD             r6,`7*$BNSZ`(r4)
 611         $UMULL          r7,r5,r6
 612         $UMULH          r8,r5,r6
 613
 614         addc            r10,r7,r10
 615         adde            r11,r8,r11
 616         addze           r9,r0
 617         addc            r10,r7,r10
 618         adde            r11,r8,r11
 619         addze           r9,r9
 620                                                 #sqr_add_c2(a,6,1,c2,c3,c1);
 621         $LD             r5,`1*$BNSZ`(r4)
 622         $LD             r6,`6*$BNSZ`(r4)
 623         $UMULL          r7,r5,r6
 624         $UMULH          r8,r5,r6
 625
 626         addc            r10,r7,r10
 627         adde            r11,r8,r11
 628         addze           r9,r9
 629         addc            r10,r7,r10
 630         adde            r11,r8,r11
 631         addze           r9,r9
 632                                                 #sqr_add_c2(a,5,2,c2,c3,c1);
 633         $LD             r5,`2*$BNSZ`(r4)
 634         $LD             r6,`5*$BNSZ`(r4)
 635         $UMULL          r7,r5,r6
 636         $UMULH          r8,r5,r6
 637         addc            r10,r7,r10
 638         adde            r11,r8,r11
 639         addze           r9,r9
 640         addc            r10,r7,r10
 641         adde            r11,r8,r11
 642         addze           r9,r9
 643                                                 #sqr_add_c2(a,4,3,c2,c3,c1);
 644         $LD             r5,`3*$BNSZ`(r4)
 645         $LD             r6,`4*$BNSZ`(r4)
 646         $UMULL          r7,r5,r6
 647         $UMULH          r8,r5,r6
 648
 649         addc            r10,r7,r10
 650         adde            r11,r8,r11
 651         addze           r9,r9
 652         addc            r10,r7,r10
 653         adde            r11,r8,r11
 654         addze           r9,r9
 655         $ST             r10,`7*$BNSZ`(r3)       #r[7]=c2;
 656                                                 #sqr_add_c(a,4,c3,c1,c2);
 657         $UMULL          r7,r6,r6
 658         $UMULH          r8,r6,r6
 659         addc            r11,r7,r11
 660         adde            r9,r8,r9
 661         addze           r10,r0
 662                                                 #sqr_add_c2(a,5,3,c3,c1,c2);
 663         $LD             r6,`5*$BNSZ`(r4)
 664         $UMULL          r7,r5,r6
 665         $UMULH          r8,r5,r6
 666         addc            r11,r7,r11
 667         adde            r9,r8,r9
 668         addze           r10,r10
 669         addc            r11,r7,r11
 670         adde            r9,r8,r9
 671         addze           r10,r10
 672                                                 #sqr_add_c2(a,6,2,c3,c1,c2);
 673         $LD             r5,`2*$BNSZ`(r4)
 674         $LD             r6,`6*$BNSZ`(r4)
 675         $UMULL          r7,r5,r6
 676         $UMULH          r8,r5,r6
 677         addc            r11,r7,r11
 678         adde            r9,r8,r9
 679         addze           r10,r10
 680
 681         addc            r11,r7,r11
 682         adde            r9,r8,r9
 683         addze           r10,r10
 684                                                 #sqr_add_c2(a,7,1,c3,c1,c2);
 685         $LD             r5,`1*$BNSZ`(r4)
 686         $LD             r6,`7*$BNSZ`(r4)
 687         $UMULL          r7,r5,r6
 688         $UMULH          r8,r5,r6
 689         addc            r11,r7,r11
 690         adde            r9,r8,r9
 691         addze           r10,r10
 692         addc            r11,r7,r11
 693         adde            r9,r8,r9
 694         addze           r10,r10
 695         $ST             r11,`8*$BNSZ`(r3)       #r[8]=c3;
 696                                                 #sqr_add_c2(a,7,2,c1,c2,c3);
 697         $LD             r5,`2*$BNSZ`(r4)
 698         $UMULL          r7,r5,r6
 699         $UMULH          r8,r5,r6
 700
 701         addc            r9,r7,r9
 702         adde            r10,r8,r10
 703         addze           r11,r0
 704         addc            r9,r7,r9
 705         adde            r10,r8,r10
 706         addze           r11,r11
 707                                                 #sqr_add_c2(a,6,3,c1,c2,c3);
 708         $LD             r5,`3*$BNSZ`(r4)
 709         $LD             r6,`6*$BNSZ`(r4)
 710         $UMULL          r7,r5,r6
 711         $UMULH          r8,r5,r6
 712         addc            r9,r7,r9
 713         adde            r10,r8,r10
 714         addze           r11,r11
 715         addc            r9,r7,r9
 716         adde            r10,r8,r10
 717         addze           r11,r11
 718                                                 #sqr_add_c2(a,5,4,c1,c2,c3);
 719         $LD             r5,`4*$BNSZ`(r4)
 720         $LD             r6,`5*$BNSZ`(r4)
 721         $UMULL          r7,r5,r6
 722         $UMULH          r8,r5,r6
 723         addc            r9,r7,r9
 724         adde            r10,r8,r10
 725         addze           r11,r11
 726         addc            r9,r7,r9
 727         adde            r10,r8,r10
 728         addze           r11,r11
 729         $ST             r9,`9*$BNSZ`(r3)        #r[9]=c1;
 730                                                 #sqr_add_c(a,5,c2,c3,c1);
 731         $UMULL          r7,r6,r6
 732         $UMULH          r8,r6,r6
 733         addc            r10,r7,r10
 734         adde            r11,r8,r11
 735         addze           r9,r0
 736                                                 #sqr_add_c2(a,6,4,c2,c3,c1);
 737         $LD             r6,`6*$BNSZ`(r4)
 738         $UMULL          r7,r5,r6
 739         $UMULH          r8,r5,r6
 740         addc            r10,r7,r10
 741         adde            r11,r8,r11
 742         addze           r9,r9
 743         addc            r10,r7,r10
 744         adde            r11,r8,r11
 745         addze           r9,r9
 746                                                 #sqr_add_c2(a,7,3,c2,c3,c1);
 747         $LD             r5,`3*$BNSZ`(r4)
 748         $LD             r6,`7*$BNSZ`(r4)
 749         $UMULL          r7,r5,r6
 750         $UMULH          r8,r5,r6
 751         addc            r10,r7,r10
 752         adde            r11,r8,r11
 753         addze           r9,r9
 754         addc            r10,r7,r10
 755         adde            r11,r8,r11
 756         addze           r9,r9
 757         $ST             r10,`10*$BNSZ`(r3)      #r[10]=c2;
 758                                                 #sqr_add_c2(a,7,4,c3,c1,c2);
 759         $LD             r5,`4*$BNSZ`(r4)
 760         $UMULL          r7,r5,r6
 761         $UMULH          r8,r5,r6
 762         addc            r11,r7,r11
 763         adde            r9,r8,r9
 764         addze           r10,r0
 765         addc            r11,r7,r11
 766         adde            r9,r8,r9
 767         addze           r10,r10
 768                                                 #sqr_add_c2(a,6,5,c3,c1,c2);
 769         $LD             r5,`5*$BNSZ`(r4)
 770         $LD             r6,`6*$BNSZ`(r4)
 771         $UMULL          r7,r5,r6
 772         $UMULH          r8,r5,r6
 773         addc            r11,r7,r11
 774         adde            r9,r8,r9
 775         addze           r10,r10
 776         addc            r11,r7,r11
 777         adde            r9,r8,r9
 778         addze           r10,r10
 779         $ST             r11,`11*$BNSZ`(r3)      #r[11]=c3;
 780                                                 #sqr_add_c(a,6,c1,c2,c3);
 781         $UMULL          r7,r6,r6
 782         $UMULH          r8,r6,r6
 783         addc            r9,r7,r9
 784         adde            r10,r8,r10
 785         addze           r11,r0
 786                                                 #sqr_add_c2(a,7,5,c1,c2,c3)
 787         $LD             r6,`7*$BNSZ`(r4)
 788         $UMULL          r7,r5,r6
 789         $UMULH          r8,r5,r6
 790         addc            r9,r7,r9
 791         adde            r10,r8,r10
 792         addze           r11,r11
 793         addc            r9,r7,r9
 794         adde            r10,r8,r10
 795         addze           r11,r11
 796         $ST             r9,`12*$BNSZ`(r3)       #r[12]=c1;
 797
 798                                                 #sqr_add_c2(a,7,6,c2,c3,c1)
 799         $LD             r5,`6*$BNSZ`(r4)
 800         $UMULL          r7,r5,r6
 801         $UMULH          r8,r5,r6
 802         addc            r10,r7,r10
 803         adde            r11,r8,r11
 804         addze           r9,r0
 805         addc            r10,r7,r10
 806         adde            r11,r8,r11
 807         addze           r9,r9
 808         $ST             r10,`13*$BNSZ`(r3)      #r[13]=c2;
 809                                                 #sqr_add_c(a,7,c3,c1,c2);
 810         $UMULL          r7,r6,r6
 811         $UMULH          r8,r6,r6
 812         addc            r11,r7,r11
 813         adde            r9,r8,r9
 814         $ST             r11,`14*$BNSZ`(r3)      #r[14]=c3;
 815         $ST             r9, `15*$BNSZ`(r3)      #r[15]=c1;
 816
 817
 818         blr
 819         .long   0
 820         .byte   0,12,0x14,0,0,0,2,0
 821         .long   0
 822
 823 #
 824 #       NOTE:   The following label name should be changed to
 825 #               "bn_mul_comba4" i.e. remove the first dot
 826 #               for the gcc compiler. This should be automatically
 827 #               done in the build
 828 #
 829
 830 .align  4
 831 .bn_mul_comba4:
 832 # Optimized, fully unrolled bn_mul_comba4: r[0..7] = a[0..3] * b[0..3].
 833 # Column k accumulates every a[i]*b[j] with i+j == k, then stores r[k].
 834 #
 835 # void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 836 # r3 contains r (8 words are stored)
 837 # r4 contains a (words 0..3 are loaded)
 838 # r5 contains b (words 0..3 are loaded)
 839 # r6, r7 are the 2 BN_ULONGs being multiplied.
 840 # r8, r9 are the results of the 32x32 giving 64 multiply.
 841 # r10, r11, r12 take turns as c1, c2 and c3; r0 stays zero for addze.
 842 #
 843         xor     r0,r0,r0                #r0=0. Used in addze below.
 844                                         #mul_add_c(a[0],b[0],c1,c2,c3);
 845         $LD     r6,`0*$BNSZ`(r4)
 846         $LD     r7,`0*$BNSZ`(r5)
 847         $UMULL  r10,r6,r7
 848         $UMULH  r11,r6,r7
 849         $ST     r10,`0*$BNSZ`(r3)       #r[0]=c1
 850                                         #mul_add_c(a[0],b[1],c2,c3,c1);
 851         $LD     r7,`1*$BNSZ`(r5)
 852         $UMULL  r8,r6,r7
 853         $UMULH  r9,r6,r7
 854         addc    r11,r8,r11
 855         adde    r12,r9,r0               #c3 starts as hi word + carry
 856         addze   r10,r0                  #c1 restarts as the carry out
 857                                         #mul_add_c(a[1],b[0],c2,c3,c1);
 858         $LD     r6, `1*$BNSZ`(r4)
 859         $LD     r7, `0*$BNSZ`(r5)
 860         $UMULL  r8,r6,r7
 861         $UMULH  r9,r6,r7
 862         addc    r11,r8,r11
 863         adde    r12,r9,r12
 864         addze   r10,r10
 865         $ST     r11,`1*$BNSZ`(r3)       #r[1]=c2
 866                                         #mul_add_c(a[2],b[0],c3,c1,c2);
 867         $LD     r6,`2*$BNSZ`(r4)        #r7 still holds b[0]
 868         $UMULL  r8,r6,r7
 869         $UMULH  r9,r6,r7
 870         addc    r12,r8,r12
 871         adde    r10,r9,r10
 872         addze   r11,r0
 873                                         #mul_add_c(a[1],b[1],c3,c1,c2);
 874         $LD     r6,`1*$BNSZ`(r4)
 875         $LD     r7,`1*$BNSZ`(r5)
 876         $UMULL  r8,r6,r7
 877         $UMULH  r9,r6,r7
 878         addc    r12,r8,r12
 879         adde    r10,r9,r10
 880         addze   r11,r11
 881                                         #mul_add_c(a[0],b[2],c3,c1,c2);
 882         $LD     r6,`0*$BNSZ`(r4)
 883         $LD     r7,`2*$BNSZ`(r5)
 884         $UMULL  r8,r6,r7
 885         $UMULH  r9,r6,r7
 886         addc    r12,r8,r12
 887         adde    r10,r9,r10
 888         addze   r11,r11
 889         $ST     r12,`2*$BNSZ`(r3)       #r[2]=c3
 890                                         #mul_add_c(a[0],b[3],c1,c2,c3);
 891         $LD     r7,`3*$BNSZ`(r5)        #r6 still holds a[0]
 892         $UMULL  r8,r6,r7
 893         $UMULH  r9,r6,r7
 894         addc    r10,r8,r10
 895         adde    r11,r9,r11
 896         addze   r12,r0
 897                                         #mul_add_c(a[1],b[2],c1,c2,c3);
 898         $LD     r6,`1*$BNSZ`(r4)
 899         $LD     r7,`2*$BNSZ`(r5)
 900         $UMULL  r8,r6,r7
 901         $UMULH  r9,r6,r7
 902         addc    r10,r8,r10
 903         adde    r11,r9,r11
 904         addze   r12,r12
 905                                         #mul_add_c(a[2],b[1],c1,c2,c3);
 906         $LD     r6,`2*$BNSZ`(r4)
 907         $LD     r7,`1*$BNSZ`(r5)
 908         $UMULL  r8,r6,r7
 909         $UMULH  r9,r6,r7
 910         addc    r10,r8,r10
 911         adde    r11,r9,r11
 912         addze   r12,r12
 913                                         #mul_add_c(a[3],b[0],c1,c2,c3);
 914         $LD     r6,`3*$BNSZ`(r4)
 915         $LD     r7,`0*$BNSZ`(r5)
 916         $UMULL  r8,r6,r7
 917         $UMULH  r9,r6,r7
 918         addc    r10,r8,r10
 919         adde    r11,r9,r11
 920         addze   r12,r12
 921         $ST     r10,`3*$BNSZ`(r3)       #r[3]=c1
 922                                         #mul_add_c(a[3],b[1],c2,c3,c1);
 923         $LD     r7,`1*$BNSZ`(r5)        #r6 still holds a[3]
 924         $UMULL  r8,r6,r7
 925         $UMULH  r9,r6,r7
 926         addc    r11,r8,r11
 927         adde    r12,r9,r12
 928         addze   r10,r0
 929                                         #mul_add_c(a[2],b[2],c2,c3,c1);
 930         $LD     r6,`2*$BNSZ`(r4)
 931         $LD     r7,`2*$BNSZ`(r5)
 932         $UMULL  r8,r6,r7
 933         $UMULH  r9,r6,r7
 934         addc    r11,r8,r11
 935         adde    r12,r9,r12
 936         addze   r10,r10
 937                                         #mul_add_c(a[1],b[3],c2,c3,c1);
 938         $LD     r6,`1*$BNSZ`(r4)
 939         $LD     r7,`3*$BNSZ`(r5)
 940         $UMULL  r8,r6,r7
 941         $UMULH  r9,r6,r7
 942         addc    r11,r8,r11
 943         adde    r12,r9,r12
 944         addze   r10,r10
 945         $ST     r11,`4*$BNSZ`(r3)       #r[4]=c2
 946                                         #mul_add_c(a[2],b[3],c3,c1,c2);
 947         $LD     r6,`2*$BNSZ`(r4)        #r7 still holds b[3]
 948         $UMULL  r8,r6,r7
 949         $UMULH  r9,r6,r7
 950         addc    r12,r8,r12
 951         adde    r10,r9,r10
 952         addze   r11,r0
 953                                         #mul_add_c(a[3],b[2],c3,c1,c2);
 954         $LD     r6,`3*$BNSZ`(r4)
 955         $LD     r7,`2*$BNSZ`(r5)        #b[2] is read via r5 (b), not r4 (a)
 956         $UMULL  r8,r6,r7
 957         $UMULH  r9,r6,r7
 958         addc    r12,r8,r12
 959         adde    r10,r9,r10
 960         addze   r11,r11
 961         $ST     r12,`5*$BNSZ`(r3)       #r[5]=c3
 962                                         #mul_add_c(a[3],b[3],c1,c2,c3);
 963         $LD     r7,`3*$BNSZ`(r5)        #r6 still holds a[3]
 964         $UMULL  r8,r6,r7
 965         $UMULH  r9,r6,r7
 966         addc    r10,r8,r10
 967         adde    r11,r9,r11              #last column: no third word needed
 968
 969         $ST     r10,`6*$BNSZ`(r3)       #r[6]=c1
 970         $ST     r11,`7*$BNSZ`(r3)       #r[7]=c2
 971         blr
 972         .long   0
 973         .byte   0,12,0x14,0,0,0,3,0
 974         .long   0
 975
 976 #
 977 #       NOTE:   The following label name should be changed to
 978 #               "bn_mul_comba8" i.e. remove the first dot
 979 #               for the gcc compiler. This should be automatically
 980 #               done in the build
 981 #
 982
 983 .align  4
 984 .bn_mul_comba8:
 985 # Fully unrolled comba multiply: r[0..15] = a[0..7] * b[0..7].
 986 # Column k accumulates every a[i]*b[j] with i+j == k, then stores r[k].
 987 #
 988 # void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 989 # r3 contains r (16 words are stored)
 990 # r4 contains a (words 0..7 are loaded)
 991 # r5 contains b (words 0..7 are loaded)
 992 # r6, r7 are the 2 BN_ULONGs being multiplied.
 993 # r8, r9 are the results of the 32x32 giving 64 multiply.
 994 # r10, r11, r12 take turns as c1, c2 and c3; r0 stays zero for addze.
 995 # Only r0 and r6-r12 are written; a load is skipped when r6/r7 is reused.
 996         xor     r0,r0,r0                #r0=0. Used in addze below.
 997
 998                                         #mul_add_c(a[0],b[0],c1,c2,c3);
 999         $LD     r6,`0*$BNSZ`(r4)        #a[0]
1000         $LD     r7,`0*$BNSZ`(r5)        #b[0]
1001         $UMULL  r10,r6,r7
1002         $UMULH  r11,r6,r7
1003         $ST     r10,`0*$BNSZ`(r3)       #r[0]=c1;
1004                                         #mul_add_c(a[0],b[1],c2,c3,c1);
1005         $LD     r7,`1*$BNSZ`(r5)
1006         $UMULL  r8,r6,r7
1007         $UMULH  r9,r6,r7
1008         addc    r11,r11,r8
1009         addze   r12,r9                  # c3 = hi word + carry (r12 was never zeroed).
1010         addze   r10,r0                  # c1 = carry out of the previous addze.
1011                                         #mul_add_c(a[1],b[0],c2,c3,c1);
1012         $LD     r6,`1*$BNSZ`(r4)
1013         $LD     r7,`0*$BNSZ`(r5)
1014         $UMULL  r8,r6,r7
1015         $UMULH  r9,r6,r7
1016         addc    r11,r11,r8
1017         adde    r12,r12,r9
1018         addze   r10,r10
1019         $ST     r11,`1*$BNSZ`(r3)       #r[1]=c2;
1020                                         #mul_add_c(a[2],b[0],c3,c1,c2);
1021         $LD     r6,`2*$BNSZ`(r4)        #r7 still holds b[0]
1022         $UMULL  r8,r6,r7
1023         $UMULH  r9,r6,r7
1024         addc    r12,r12,r8
1025         adde    r10,r10,r9
1026         addze   r11,r0
1027                                         #mul_add_c(a[1],b[1],c3,c1,c2);
1028         $LD     r6,`1*$BNSZ`(r4)
1029         $LD     r7,`1*$BNSZ`(r5)
1030         $UMULL  r8,r6,r7
1031         $UMULH  r9,r6,r7
1032         addc    r12,r12,r8
1033         adde    r10,r10,r9
1034         addze   r11,r11
1035                                         #mul_add_c(a[0],b[2],c3,c1,c2);
1036         $LD     r6,`0*$BNSZ`(r4)
1037         $LD     r7,`2*$BNSZ`(r5)
1038         $UMULL  r8,r6,r7
1039         $UMULH  r9,r6,r7
1040         addc    r12,r12,r8
1041         adde    r10,r10,r9
1042         addze   r11,r11
1043         $ST     r12,`2*$BNSZ`(r3)       #r[2]=c3;
1044                                         #mul_add_c(a[0],b[3],c1,c2,c3);
1045         $LD     r7,`3*$BNSZ`(r5)        #r6 still holds a[0]
1046         $UMULL  r8,r6,r7
1047         $UMULH  r9,r6,r7
1048         addc    r10,r10,r8
1049         adde    r11,r11,r9
1050         addze   r12,r0
1051                                         #mul_add_c(a[1],b[2],c1,c2,c3);
1052         $LD     r6,`1*$BNSZ`(r4)
1053         $LD     r7,`2*$BNSZ`(r5)
1054         $UMULL  r8,r6,r7
1055         $UMULH  r9,r6,r7
1056         addc    r10,r10,r8
1057         adde    r11,r11,r9
1058         addze   r12,r12
1059
1060                                         #mul_add_c(a[2],b[1],c1,c2,c3);
1061         $LD     r6,`2*$BNSZ`(r4)
1062         $LD     r7,`1*$BNSZ`(r5)
1063         $UMULL  r8,r6,r7
1064         $UMULH  r9,r6,r7
1065         addc    r10,r10,r8
1066         adde    r11,r11,r9
1067         addze   r12,r12
1068                                         #mul_add_c(a[3],b[0],c1,c2,c3);
1069         $LD     r6,`3*$BNSZ`(r4)
1070         $LD     r7,`0*$BNSZ`(r5)
1071         $UMULL  r8,r6,r7
1072         $UMULH  r9,r6,r7
1073         addc    r10,r10,r8
1074         adde    r11,r11,r9
1075         addze   r12,r12
1076         $ST     r10,`3*$BNSZ`(r3)       #r[3]=c1;
1077                                         #mul_add_c(a[4],b[0],c2,c3,c1);
1078         $LD     r6,`4*$BNSZ`(r4)        #r7 still holds b[0]
1079         $UMULL  r8,r6,r7
1080         $UMULH  r9,r6,r7
1081         addc    r11,r11,r8
1082         adde    r12,r12,r9
1083         addze   r10,r0
1084                                         #mul_add_c(a[3],b[1],c2,c3,c1);
1085         $LD     r6,`3*$BNSZ`(r4)
1086         $LD     r7,`1*$BNSZ`(r5)
1087         $UMULL  r8,r6,r7
1088         $UMULH  r9,r6,r7
1089         addc    r11,r11,r8
1090         adde    r12,r12,r9
1091         addze   r10,r10
1092                                         #mul_add_c(a[2],b[2],c2,c3,c1);
1093         $LD     r6,`2*$BNSZ`(r4)
1094         $LD     r7,`2*$BNSZ`(r5)
1095         $UMULL  r8,r6,r7
1096         $UMULH  r9,r6,r7
1097         addc    r11,r11,r8
1098         adde    r12,r12,r9
1099         addze   r10,r10
1100                                         #mul_add_c(a[1],b[3],c2,c3,c1);
1101         $LD     r6,`1*$BNSZ`(r4)
1102         $LD     r7,`3*$BNSZ`(r5)
1103         $UMULL  r8,r6,r7
1104         $UMULH  r9,r6,r7
1105         addc    r11,r11,r8
1106         adde    r12,r12,r9
1107         addze   r10,r10
1108                                         #mul_add_c(a[0],b[4],c2,c3,c1);
1109         $LD     r6,`0*$BNSZ`(r4)
1110         $LD     r7,`4*$BNSZ`(r5)
1111         $UMULL  r8,r6,r7
1112         $UMULH  r9,r6,r7
1113         addc    r11,r11,r8
1114         adde    r12,r12,r9
1115         addze   r10,r10
1116         $ST     r11,`4*$BNSZ`(r3)       #r[4]=c2;
1117                                         #mul_add_c(a[0],b[5],c3,c1,c2);
1118         $LD     r7,`5*$BNSZ`(r5)        #r6 still holds a[0]
1119         $UMULL  r8,r6,r7
1120         $UMULH  r9,r6,r7
1121         addc    r12,r12,r8
1122         adde    r10,r10,r9
1123         addze   r11,r0
1124                                         #mul_add_c(a[1],b[4],c3,c1,c2);
1125         $LD     r6,`1*$BNSZ`(r4)
1126         $LD     r7,`4*$BNSZ`(r5)
1127         $UMULL  r8,r6,r7
1128         $UMULH  r9,r6,r7
1129         addc    r12,r12,r8
1130         adde    r10,r10,r9
1131         addze   r11,r11
1132                                         #mul_add_c(a[2],b[3],c3,c1,c2);
1133         $LD     r6,`2*$BNSZ`(r4)
1134         $LD     r7,`3*$BNSZ`(r5)
1135         $UMULL  r8,r6,r7
1136         $UMULH  r9,r6,r7
1137         addc    r12,r12,r8
1138         adde    r10,r10,r9
1139         addze   r11,r11
1140                                         #mul_add_c(a[3],b[2],c3,c1,c2);
1141         $LD     r6,`3*$BNSZ`(r4)
1142         $LD     r7,`2*$BNSZ`(r5)
1143         $UMULL  r8,r6,r7
1144         $UMULH  r9,r6,r7
1145         addc    r12,r12,r8
1146         adde    r10,r10,r9
1147         addze   r11,r11
1148                                         #mul_add_c(a[4],b[1],c3,c1,c2);
1149         $LD     r6,`4*$BNSZ`(r4)
1150         $LD     r7,`1*$BNSZ`(r5)
1151         $UMULL  r8,r6,r7
1152         $UMULH  r9,r6,r7
1153         addc    r12,r12,r8
1154         adde    r10,r10,r9
1155         addze   r11,r11
1156                                         #mul_add_c(a[5],b[0],c3,c1,c2);
1157         $LD     r6,`5*$BNSZ`(r4)
1158         $LD     r7,`0*$BNSZ`(r5)
1159         $UMULL  r8,r6,r7
1160         $UMULH  r9,r6,r7
1161         addc    r12,r12,r8
1162         adde    r10,r10,r9
1163         addze   r11,r11
1164         $ST     r12,`5*$BNSZ`(r3)       #r[5]=c3;
1165                                         #mul_add_c(a[6],b[0],c1,c2,c3);
1166         $LD     r6,`6*$BNSZ`(r4)        #r7 still holds b[0]
1167         $UMULL  r8,r6,r7
1168         $UMULH  r9,r6,r7
1169         addc    r10,r10,r8
1170         adde    r11,r11,r9
1171         addze   r12,r0
1172                                         #mul_add_c(a[5],b[1],c1,c2,c3);
1173         $LD     r6,`5*$BNSZ`(r4)
1174         $LD     r7,`1*$BNSZ`(r5)
1175         $UMULL  r8,r6,r7
1176         $UMULH  r9,r6,r7
1177         addc    r10,r10,r8
1178         adde    r11,r11,r9
1179         addze   r12,r12
1180                                         #mul_add_c(a[4],b[2],c1,c2,c3);
1181         $LD     r6,`4*$BNSZ`(r4)
1182         $LD     r7,`2*$BNSZ`(r5)
1183         $UMULL  r8,r6,r7
1184         $UMULH  r9,r6,r7
1185         addc    r10,r10,r8
1186         adde    r11,r11,r9
1187         addze   r12,r12
1188                                         #mul_add_c(a[3],b[3],c1,c2,c3);
1189         $LD     r6,`3*$BNSZ`(r4)
1190         $LD     r7,`3*$BNSZ`(r5)
1191         $UMULL  r8,r6,r7
1192         $UMULH  r9,r6,r7
1193         addc    r10,r10,r8
1194         adde    r11,r11,r9
1195         addze   r12,r12
1196                                         #mul_add_c(a[2],b[4],c1,c2,c3);
1197         $LD     r6,`2*$BNSZ`(r4)
1198         $LD     r7,`4*$BNSZ`(r5)
1199         $UMULL  r8,r6,r7
1200         $UMULH  r9,r6,r7
1201         addc    r10,r10,r8
1202         adde    r11,r11,r9
1203         addze   r12,r12
1204                                         #mul_add_c(a[1],b[5],c1,c2,c3);
1205         $LD     r6,`1*$BNSZ`(r4)
1206         $LD     r7,`5*$BNSZ`(r5)
1207         $UMULL  r8,r6,r7
1208         $UMULH  r9,r6,r7
1209         addc    r10,r10,r8
1210         adde    r11,r11,r9
1211         addze   r12,r12
1212                                         #mul_add_c(a[0],b[6],c1,c2,c3);
1213         $LD     r6,`0*$BNSZ`(r4)
1214         $LD     r7,`6*$BNSZ`(r5)
1215         $UMULL  r8,r6,r7
1216         $UMULH  r9,r6,r7
1217         addc    r10,r10,r8
1218         adde    r11,r11,r9
1219         addze   r12,r12
1220         $ST     r10,`6*$BNSZ`(r3)       #r[6]=c1;
1221                                         #mul_add_c(a[0],b[7],c2,c3,c1);
1222         $LD     r7,`7*$BNSZ`(r5)        #r6 still holds a[0]
1223         $UMULL  r8,r6,r7
1224         $UMULH  r9,r6,r7
1225         addc    r11,r11,r8
1226         adde    r12,r12,r9
1227         addze   r10,r0
1228                                         #mul_add_c(a[1],b[6],c2,c3,c1);
1229         $LD     r6,`1*$BNSZ`(r4)
1230         $LD     r7,`6*$BNSZ`(r5)
1231         $UMULL  r8,r6,r7
1232         $UMULH  r9,r6,r7
1233         addc    r11,r11,r8
1234         adde    r12,r12,r9
1235         addze   r10,r10
1236                                         #mul_add_c(a[2],b[5],c2,c3,c1);
1237         $LD     r6,`2*$BNSZ`(r4)
1238         $LD     r7,`5*$BNSZ`(r5)
1239         $UMULL  r8,r6,r7
1240         $UMULH  r9,r6,r7
1241         addc    r11,r11,r8
1242         adde    r12,r12,r9
1243         addze   r10,r10
1244                                         #mul_add_c(a[3],b[4],c2,c3,c1);
1245         $LD     r6,`3*$BNSZ`(r4)
1246         $LD     r7,`4*$BNSZ`(r5)
1247         $UMULL  r8,r6,r7
1248         $UMULH  r9,r6,r7
1249         addc    r11,r11,r8
1250         adde    r12,r12,r9
1251         addze   r10,r10
1252                                         #mul_add_c(a[4],b[3],c2,c3,c1);
1253         $LD     r6,`4*$BNSZ`(r4)
1254         $LD     r7,`3*$BNSZ`(r5)
1255         $UMULL  r8,r6,r7
1256         $UMULH  r9,r6,r7
1257         addc    r11,r11,r8
1258         adde    r12,r12,r9
1259         addze   r10,r10
1260                                         #mul_add_c(a[5],b[2],c2,c3,c1);
1261         $LD     r6,`5*$BNSZ`(r4)
1262         $LD     r7,`2*$BNSZ`(r5)
1263         $UMULL  r8,r6,r7
1264         $UMULH  r9,r6,r7
1265         addc    r11,r11,r8
1266         adde    r12,r12,r9
1267         addze   r10,r10
1268                                         #mul_add_c(a[6],b[1],c2,c3,c1);
1269         $LD     r6,`6*$BNSZ`(r4)
1270         $LD     r7,`1*$BNSZ`(r5)
1271         $UMULL  r8,r6,r7
1272         $UMULH  r9,r6,r7
1273         addc    r11,r11,r8
1274         adde    r12,r12,r9
1275         addze   r10,r10
1276                                         #mul_add_c(a[7],b[0],c2,c3,c1);
1277         $LD     r6,`7*$BNSZ`(r4)
1278         $LD     r7,`0*$BNSZ`(r5)
1279         $UMULL  r8,r6,r7
1280         $UMULH  r9,r6,r7
1281         addc    r11,r11,r8
1282         adde    r12,r12,r9
1283         addze   r10,r10
1284         $ST     r11,`7*$BNSZ`(r3)       #r[7]=c2;
1285                                         #mul_add_c(a[7],b[1],c3,c1,c2);
1286         $LD     r7,`1*$BNSZ`(r5)        #r6 still holds a[7]
1287         $UMULL  r8,r6,r7
1288         $UMULH  r9,r6,r7
1289         addc    r12,r12,r8
1290         adde    r10,r10,r9
1291         addze   r11,r0
1292                                         #mul_add_c(a[6],b[2],c3,c1,c2);
1293         $LD     r6,`6*$BNSZ`(r4)
1294         $LD     r7,`2*$BNSZ`(r5)
1295         $UMULL  r8,r6,r7
1296         $UMULH  r9,r6,r7
1297         addc    r12,r12,r8
1298         adde    r10,r10,r9
1299         addze   r11,r11
1300                                         #mul_add_c(a[5],b[3],c3,c1,c2);
1301         $LD     r6,`5*$BNSZ`(r4)
1302         $LD     r7,`3*$BNSZ`(r5)
1303         $UMULL  r8,r6,r7
1304         $UMULH  r9,r6,r7
1305         addc    r12,r12,r8
1306         adde    r10,r10,r9
1307         addze   r11,r11
1308                                         #mul_add_c(a[4],b[4],c3,c1,c2);
1309         $LD     r6,`4*$BNSZ`(r4)
1310         $LD     r7,`4*$BNSZ`(r5)
1311         $UMULL  r8,r6,r7
1312         $UMULH  r9,r6,r7
1313         addc    r12,r12,r8
1314         adde    r10,r10,r9
1315         addze   r11,r11
1316                                         #mul_add_c(a[3],b[5],c3,c1,c2);
1317         $LD     r6,`3*$BNSZ`(r4)
1318         $LD     r7,`5*$BNSZ`(r5)
1319         $UMULL  r8,r6,r7
1320         $UMULH  r9,r6,r7
1321         addc    r12,r12,r8
1322         adde    r10,r10,r9
1323         addze   r11,r11
1324                                         #mul_add_c(a[2],b[6],c3,c1,c2);
1325         $LD     r6,`2*$BNSZ`(r4)
1326         $LD     r7,`6*$BNSZ`(r5)
1327         $UMULL  r8,r6,r7
1328         $UMULH  r9,r6,r7
1329         addc    r12,r12,r8
1330         adde    r10,r10,r9
1331         addze   r11,r11
1332                                         #mul_add_c(a[1],b[7],c3,c1,c2);
1333         $LD     r6,`1*$BNSZ`(r4)
1334         $LD     r7,`7*$BNSZ`(r5)
1335         $UMULL  r8,r6,r7
1336         $UMULH  r9,r6,r7
1337         addc    r12,r12,r8
1338         adde    r10,r10,r9
1339         addze   r11,r11
1340         $ST     r12,`8*$BNSZ`(r3)       #r[8]=c3;
1341                                         #mul_add_c(a[2],b[7],c1,c2,c3);
1342         $LD     r6,`2*$BNSZ`(r4)        #r7 still holds b[7]
1343         $UMULL  r8,r6,r7
1344         $UMULH  r9,r6,r7
1345         addc    r10,r10,r8
1346         adde    r11,r11,r9
1347         addze   r12,r0
1348                                         #mul_add_c(a[3],b[6],c1,c2,c3);
1349         $LD     r6,`3*$BNSZ`(r4)
1350         $LD     r7,`6*$BNSZ`(r5)
1351         $UMULL  r8,r6,r7
1352         $UMULH  r9,r6,r7
1353         addc    r10,r10,r8
1354         adde    r11,r11,r9
1355         addze   r12,r12
1356                                         #mul_add_c(a[4],b[5],c1,c2,c3);
1357         $LD     r6,`4*$BNSZ`(r4)
1358         $LD     r7,`5*$BNSZ`(r5)
1359         $UMULL  r8,r6,r7
1360         $UMULH  r9,r6,r7
1361         addc    r10,r10,r8
1362         adde    r11,r11,r9
1363         addze   r12,r12
1364                                         #mul_add_c(a[5],b[4],c1,c2,c3);
1365         $LD     r6,`5*$BNSZ`(r4)
1366         $LD     r7,`4*$BNSZ`(r5)
1367         $UMULL  r8,r6,r7
1368         $UMULH  r9,r6,r7
1369         addc    r10,r10,r8
1370         adde    r11,r11,r9
1371         addze   r12,r12
1372                                         #mul_add_c(a[6],b[3],c1,c2,c3);
1373         $LD     r6,`6*$BNSZ`(r4)
1374         $LD     r7,`3*$BNSZ`(r5)
1375         $UMULL  r8,r6,r7
1376         $UMULH  r9,r6,r7
1377         addc    r10,r10,r8
1378         adde    r11,r11,r9
1379         addze   r12,r12
1380                                         #mul_add_c(a[7],b[2],c1,c2,c3);
1381         $LD     r6,`7*$BNSZ`(r4)
1382         $LD     r7,`2*$BNSZ`(r5)
1383         $UMULL  r8,r6,r7
1384         $UMULH  r9,r6,r7
1385         addc    r10,r10,r8
1386         adde    r11,r11,r9
1387         addze   r12,r12
1388         $ST     r10,`9*$BNSZ`(r3)       #r[9]=c1;
1389                                         #mul_add_c(a[7],b[3],c2,c3,c1);
1390         $LD     r7,`3*$BNSZ`(r5)        #r6 still holds a[7]
1391         $UMULL  r8,r6,r7
1392         $UMULH  r9,r6,r7
1393         addc    r11,r11,r8
1394         adde    r12,r12,r9
1395         addze   r10,r0
1396                                         #mul_add_c(a[6],b[4],c2,c3,c1);
1397         $LD     r6,`6*$BNSZ`(r4)
1398         $LD     r7,`4*$BNSZ`(r5)
1399         $UMULL  r8,r6,r7
1400         $UMULH  r9,r6,r7
1401         addc    r11,r11,r8
1402         adde    r12,r12,r9
1403         addze   r10,r10
1404                                         #mul_add_c(a[5],b[5],c2,c3,c1);
1405         $LD     r6,`5*$BNSZ`(r4)
1406         $LD     r7,`5*$BNSZ`(r5)
1407         $UMULL  r8,r6,r7
1408         $UMULH  r9,r6,r7
1409         addc    r11,r11,r8
1410         adde    r12,r12,r9
1411         addze   r10,r10
1412                                         #mul_add_c(a[4],b[6],c2,c3,c1);
1413         $LD     r6,`4*$BNSZ`(r4)
1414         $LD     r7,`6*$BNSZ`(r5)
1415         $UMULL  r8,r6,r7
1416         $UMULH  r9,r6,r7
1417         addc    r11,r11,r8
1418         adde    r12,r12,r9
1419         addze   r10,r10
1420                                         #mul_add_c(a[3],b[7],c2,c3,c1);
1421         $LD     r6,`3*$BNSZ`(r4)
1422         $LD     r7,`7*$BNSZ`(r5)
1423         $UMULL  r8,r6,r7
1424         $UMULH  r9,r6,r7
1425         addc    r11,r11,r8
1426         adde    r12,r12,r9
1427         addze   r10,r10
1428         $ST     r11,`10*$BNSZ`(r3)      #r[10]=c2;
1429                                         #mul_add_c(a[4],b[7],c3,c1,c2);
1430         $LD     r6,`4*$BNSZ`(r4)        #r7 still holds b[7]
1431         $UMULL  r8,r6,r7
1432         $UMULH  r9,r6,r7
1433         addc    r12,r12,r8
1434         adde    r10,r10,r9
1435         addze   r11,r0
1436                                         #mul_add_c(a[5],b[6],c3,c1,c2);
1437         $LD     r6,`5*$BNSZ`(r4)
1438         $LD     r7,`6*$BNSZ`(r5)
1439         $UMULL  r8,r6,r7
1440         $UMULH  r9,r6,r7
1441         addc    r12,r12,r8
1442         adde    r10,r10,r9
1443         addze   r11,r11
1444                                         #mul_add_c(a[6],b[5],c3,c1,c2);
1445         $LD     r6,`6*$BNSZ`(r4)
1446         $LD     r7,`5*$BNSZ`(r5)
1447         $UMULL  r8,r6,r7
1448         $UMULH  r9,r6,r7
1449         addc    r12,r12,r8
1450         adde    r10,r10,r9
1451         addze   r11,r11
1452                                         #mul_add_c(a[7],b[4],c3,c1,c2);
1453         $LD     r6,`7*$BNSZ`(r4)
1454         $LD     r7,`4*$BNSZ`(r5)
1455         $UMULL  r8,r6,r7
1456         $UMULH  r9,r6,r7
1457         addc    r12,r12,r8
1458         adde    r10,r10,r9
1459         addze   r11,r11
1460         $ST     r12,`11*$BNSZ`(r3)      #r[11]=c3;
1461                                         #mul_add_c(a[7],b[5],c1,c2,c3);
1462         $LD     r7,`5*$BNSZ`(r5)        #r6 still holds a[7]
1463         $UMULL  r8,r6,r7
1464         $UMULH  r9,r6,r7
1465         addc    r10,r10,r8
1466         adde    r11,r11,r9
1467         addze   r12,r0
1468                                         #mul_add_c(a[6],b[6],c1,c2,c3);
1469         $LD     r6,`6*$BNSZ`(r4)
1470         $LD     r7,`6*$BNSZ`(r5)
1471         $UMULL  r8,r6,r7
1472         $UMULH  r9,r6,r7
1473         addc    r10,r10,r8
1474         adde    r11,r11,r9
1475         addze   r12,r12
1476                                         #mul_add_c(a[5],b[7],c1,c2,c3);
1477         $LD     r6,`5*$BNSZ`(r4)
1478         $LD     r7,`7*$BNSZ`(r5)
1479         $UMULL  r8,r6,r7
1480         $UMULH  r9,r6,r7
1481         addc    r10,r10,r8
1482         adde    r11,r11,r9
1483         addze   r12,r12
1484         $ST     r10,`12*$BNSZ`(r3)      #r[12]=c1;
1485                                         #mul_add_c(a[6],b[7],c2,c3,c1);
1486         $LD     r6,`6*$BNSZ`(r4)        #r7 still holds b[7]
1487         $UMULL  r8,r6,r7
1488         $UMULH  r9,r6,r7
1489         addc    r11,r11,r8
1490         adde    r12,r12,r9
1491         addze   r10,r0
1492                                         #mul_add_c(a[7],b[6],c2,c3,c1);
1493         $LD     r6,`7*$BNSZ`(r4)
1494         $LD     r7,`6*$BNSZ`(r5)
1495         $UMULL  r8,r6,r7
1496         $UMULH  r9,r6,r7
1497         addc    r11,r11,r8
1498         adde    r12,r12,r9
1499         addze   r10,r10
1500         $ST     r11,`13*$BNSZ`(r3)      #r[13]=c2;
1501                                         #mul_add_c(a[7],b[7],c3,c1,c2);
1502         $LD     r7,`7*$BNSZ`(r5)        #r6 still holds a[7]
1503         $UMULL  r8,r6,r7
1504         $UMULH  r9,r6,r7
1505         addc    r12,r12,r8
1506         adde    r10,r10,r9              #last column: no third word needed
1507         $ST     r12,`14*$BNSZ`(r3)      #r[14]=c3;
1508         $ST     r10,`15*$BNSZ`(r3)      #r[15]=c1;
1509         blr
1510         .long   0
1511         .byte   0,12,0x14,0,0,0,3,0
1512         .long   0
1513
1514 #
1515 #       NOTE:   The following label name should be changed to
1516 #               "bn_sub_words" i.e. remove the first dot
1517 #               for the gcc compiler. This should be automatically
1518 #               done in the build
1519 #
1520 #
1521 .align  4
1522 .bn_sub_words:
1523 #
1524 #       Hand-written bn_sub_words: r[i] = a[i] - b[i] for i < n, with
1525 #       the borrow carried from each word into the next.
1526 #BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1527 #
1528 #       r3 = r on entry; 0 (no borrow) or 1 (borrow) on return
1529 #       r4 = a
1530 #       r5 = b
1531 #       r6 = n
1532 #
1533 #       Note:   Kept as a plain counted loop without unrolling, since
1534 #               this is not a performance critical loop.
1535
1536         li      r0,0            # r0 = 0
1537 #
1538 #       Test n for zero and force the carry bit on in one instruction.
1539 #
1540         subfc.  r7,r0,r6        # r7 = r6 - 0, so it is zero only when
1541                                 # n is zero; subtracting zero always
1542                                 # leaves the carry bit set.
1543         beq     Lppcasm_sub_adios
1544         addi    r3,r3,-$BNSZ
1545         addi    r4,r4,-$BNSZ
1546         addi    r5,r5,-$BNSZ
1547         mtctr   r6
1548 Lppcasm_sub_mainloop:
1549         $LDU    r8,$BNSZ(r5)
1550         $LDU    r7,$BNSZ(r4)
1551         subfe   r6,r8,r7        # r6 = r7 + ~r8 + carry: r7-r8 when the
1552                                 # carry is 1 (no borrow pending), and
1553                                 # r7-r8-1 when the carry is 0.
1554         $STU    r6,$BNSZ(r3)
1555         bdnz-   Lppcasm_sub_mainloop
1556 Lppcasm_sub_adios:
1557         subfze  r3,r0           # r3 = ~0 + carry: 0 if carry set, else -1
1558         andi.   r3,r3,1         # reduce to the low bit: 0 or 1.
1559         blr
1560         .long   0
1561         .byte   0,12,0x14,0,0,0,4,0
1562         .long   0
1563
1564 #
1565 #       NOTE:   The following label name should be changed to
1566 #               "bn_add_words" i.e. remove the first dot
1567 #               for the gcc compiler. This should be automatically
1568 #               done in the build
1569 #
1570
1571 .align  4
1572 .bn_add_words:
1573 #
1574 #       Hand-written bn_add_words: r[i] = a[i] + b[i] for i < n, with
1575 #       the carry passed from each word into the next.
1576 #BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1577 #
1578 #       r3 = r on entry; the final carry (0 or 1) on return
1579 #       r4 = a
1580 #       r5 = b
1581 #       r6 = n
1582 #
1583 #       Note:   Kept as a plain counted loop without unrolling, since
1584 #               this is not a performance critical loop.
1585
1586         li      r0,0            # r0 = 0
1587 #
1588 #       check for r6 = 0 (is this needed?) and start with carry clear.
1589 #
1590         addic.  r6,r6,0         # adding 0 tests r6 and clears the carry.
1591         beq     Lppcasm_add_adios
1592         addi    r3,r3,-$BNSZ
1593         addi    r4,r4,-$BNSZ
1594         addi    r5,r5,-$BNSZ
1595         mtctr   r6
1596 Lppcasm_add_mainloop:
1597         $LDU    r8,$BNSZ(r5)
1598         $LDU    r7,$BNSZ(r4)
1599         adde    r8,r8,r7        # r8 = word of b + word of a + carry in
1600         $STU    r8,$BNSZ(r3)
1601         bdnz-   Lppcasm_add_mainloop
1602 Lppcasm_add_adios:
1603         addze   r3,r0                   # r3 = 0 + carry: the return value.
1604         blr
1605         .long   0
1606         .byte   0,12,0x14,0,0,0,4,0
1607         .long   0
1608
1609 #
1610 #       NOTE:   The following label name should be changed to
1611 #               "bn_div_words" i.e. remove the first dot
1612 #               for the gcc compiler. This should be automatically
1613 #               done in the build
1614 #
1615
1616 .align  4
1617 .bn_div_words:
1618 #       Divide the two-word value (h,l) by d and return the quotient in r3.
1619 #       This is a cleaned up version of code generated by
1620 #       the AIX compiler. The only optimization is to use
1621 #       the PPC instruction to count leading zeros instead
1622 #       of a call to num_bits_word. Since this was compiled
1623 #       only at level -O2 we can possibly squeeze it more?
1624 #       Returns -1 if d is 0. The quotient is built one half-word per outer loop pass.
1625 #       r3 = h (also the return value)
1626 #       r4 = l
1627 #       r5 = d
1628
1629         $UCMPI  0,r5,0                  # compare r5 and 0
1630         bne     Lppcasm_div1            # proceed if d!=0
1631         li      r3,-1                   # d=0 return -1
1632         blr
1633 Lppcasm_div1:
1634         xor     r0,r0,r0                #r0=0; later holds ret, the upper half of the quotient.
1635         li      r8,$BITS
1636         $CNTLZ. r7,r5                   #r7 = num leading 0s in d.
1637         beq     Lppcasm_div2            #proceed if no leading zeros
1638         subf    r8,r7,r8                #r8 = BN_num_bits_word(d)
1639         $SHR.   r9,r3,r8                #r9 = bits of h at or above bit position r8, if any.
1640         $TR     16,r9,r0                #meant to trap (dump core) if there are any. NOTE(review): TO=16 looks like the signed less-than condition, which r9 < r0 = 0 may never satisfy - confirm.
1641 Lppcasm_div2:
1642         $UCMP   0,r3,r5                 #h>=d?
1643         blt     Lppcasm_div3            #goto Lppcasm_div3 if not
1644         subf    r3,r5,r3                #h-=d ;
1645 Lppcasm_div3:                           #r7 = i = leading zeros of d = BN_BITS2 - BN_num_bits_word(d)
1646         cmpi    0,0,r7,0                # is (i == 0)?
1647         beq     Lppcasm_div4            # yes: d needs no normalization, skip the shifts.
1648         $SHL    r3,r3,r7                # h = (h<< i)
1649         $SHR    r8,r4,r8                # r8 = (l >> BN_BITS2 -i)
1650         $SHL    r5,r5,r7                # d<<=i
1651         or      r3,r3,r8                # h = (h<<i)|(l>>(BN_BITS2-i))
1652         $SHL    r4,r4,r7                # l <<=i
1653 Lppcasm_div4:
1654         $SHRI   r9,r5,`$BITS/2`         # r9 = dh
1655                                         # dl will be computed when needed
1656                                         # as it saves registers.
1657         li      r6,2                    #r6=2
1658         mtctr   r6                      #CTR = 2: the outer loop runs once per quotient half-word.
1659 Lppcasm_divouterloop:
1660         $SHRI   r8,r3,`$BITS/2`         #r8 = (h>>BN_BITS4)
1661         $SHRI   r11,r4,`$BITS/2`        #r11= (l&BN_MASK2h)>>BN_BITS4
1662                                         # compute here for innerloop.
1663         $UCMP   0,r8,r9                 # is (h>>BN_BITS4)==dh
1664         bne     Lppcasm_div5            # goto Lppcasm_div5 if not
1665
1666         li      r8,-1
1667         $CLRU   r8,r8,`$BITS/2`         #q = BN_MASK2l
1668         b       Lppcasm_div6
1669 Lppcasm_div5:
1670         $UDIV   r8,r3,r9                #q = h/dh
1671 Lppcasm_div6:
1672         $UMULL  r12,r9,r8               #th = q*dh
1673         $CLRU   r10,r5,`$BITS/2`        #r10=dl
1674         $UMULL  r6,r8,r10               #tl = q*dl
1675
1676 Lppcasm_divinnerloop:                   #lower q (r8) until one of the two exit tests below passes.
1677         subf    r10,r12,r3              #t = h -th
1678         $SHRI   r7,r10,`$BITS/2`        #r7 = t>>BN_BITS4, non-zero iff (t&BN_MASK2h) is non-zero.
1679         addic.  r7,r7,0                 #set cr0 from r7; consumed by the bne below.
1680                                         # now want to compute
1681                                         # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
1682                                         # the following 2 instructions do that
1683         $SHLI   r7,r10,`$BITS/2`        # r7 = (t<<BN_BITS4)
1684         or      r7,r7,r11               # r7|=((l&BN_MASK2h)>>BN_BITS4)
1685         $UCMP   cr1,r6,r7               # compare (tl <= r7)
1686         bne     Lppcasm_divinnerexit    # exit if the upper half of t was non-zero (cr0 from addic.)
1687         ble     cr1,Lppcasm_divinnerexit        # exit if tl <= r7
1688         addi    r8,r8,-1                #q--
1689         subf    r12,r9,r12              #th -=dh
1690         $CLRU   r10,r5,`$BITS/2`        #r10=dl. t is no longer needed in loop.
1691         subf    r6,r10,r6               #tl -=dl
1692         b       Lppcasm_divinnerloop
1693 Lppcasm_divinnerexit:
1694         $SHRI   r10,r6,`$BITS/2`        #t=(tl>>BN_BITS4)
1695         $SHLI   r11,r6,`$BITS/2`        #tl=(tl<<BN_BITS4)&BN_MASK2h;
1696         $UCMP   cr1,r4,r11              # compare l and tl
1697         add     r12,r12,r10             # th+=t
1698         bge     cr1,Lppcasm_div7        # if (l>=tl) goto Lppcasm_div7
1699         addi    r12,r12,1               # th++ (borrow, because l < tl)
1700 Lppcasm_div7:
1701         subf    r11,r11,r4              #r11=l-tl
1702         $UCMP   cr1,r3,r12              #compare h and th
1703         bge     cr1,Lppcasm_div8        #if (h>=th) goto Lppcasm_div8
1704         addi    r8,r8,-1                # q--
1705         add     r3,r5,r3                # h+=d
1706 Lppcasm_div8:
1707         subf    r12,r12,r3              #r12 = h-th
1708         $SHLI   r4,r11,`$BITS/2`        #l=(l&BN_MASK2l)<<BN_BITS4
1709                                         # want to compute
1710                                         # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
1711                                         # the following 2 instructions will do this.
1712         $INSR   r11,r12,`$BITS/2`,`$BITS/2`     # r11 is the value we want rotated $BITS/2.
1713         $ROTL   r3,r11,`$BITS/2`        # rotate by $BITS/2 and store in r3
1714         bdz     Lppcasm_div9            #if (count==0) break ;
1715         $SHLI   r0,r8,`$BITS/2`         #ret =q<<BN_BITS4
1716         b       Lppcasm_divouterloop
1717 Lppcasm_div9:
1718         or      r3,r8,r0                #return ret|q: first-pass q in the upper half, second-pass q in the lower.
1719         blr                             #the data words below are presumably a traceback table (TODO confirm).
1720         .long   0
1721         .byte   0,12,0x14,0,0,0,3,0
1722         .long   0
1723
1724 #
1725 #       NOTE:   The following label name should be changed to
1726 #               "bn_sqr_words" i.e. remove the first dot
1727 #               for the gcc compiler. This should be automatically
1728 #               done in the build
1729 #
1730 .align  4
1731 .bn_sqr_words:
1732 #
1733 #       Optimized version of bn_sqr_words
1734 #
1735 #       void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
1736 #
1737 #       r3 = r
1738 #       r4 = a
1739 #       r5 = n
1740 #
1741 #       r6 = a[i].
1742 #       r7,r8 = product (r7 from $UMULL is stored first, r8 from $UMULH second).
1743 #       Two words are written to r for every word read from a.
1744 #       No unrolling done here. Not performance critical.
1745
1746         addic.  r5,r5,0                 #test r5.
1747         beq     Lppcasm_sqr_adios       #n == 0: nothing to do.
1748         addi    r4,r4,-$BNSZ            #back both pointers up by one word so that the
1749         addi    r3,r3,-$BNSZ            #accesses at offset $BNSZ below begin at element 0.
1750         mtctr   r5                      #CTR = n, the loop count.
1751 Lppcasm_sqr_mainloop:
1752                                         #sqr(r[0],r[1],a[0]);
1753         $LDU    r6,$BNSZ(r4)            #r6 = next word of a (presumably update form).
1754         $UMULL  r7,r6,r6                #r7 = first product word of r6*r6.
1755         $UMULH  r8,r6,r6                #r8 = second product word of r6*r6.
1756         $STU    r7,$BNSZ(r3)            #store r7 into the next word of r.
1757         $STU    r8,$BNSZ(r3)            #store r8 into the word after it.
1758         bdnz-   Lppcasm_sqr_mainloop    #decrement CTR and loop while it is non-zero.
1759 Lppcasm_sqr_adios:
1760         blr                             #the data words below are presumably a traceback table (TODO confirm).
1761         .long   0
1762         .byte   0,12,0x14,0,0,0,3,0
1763         .long   0
1764
1765 #
1766 #       NOTE:   The following label name should be changed to
1767 #               "bn_mul_words" i.e. remove the first dot
1768 #               for the gcc compiler. This should be automatically
1769 #               done in the build
1770 #
1771
1772 .align  4
1773 .bn_mul_words:
1774 # Computes rp[i] = ap[i]*w + carry for num words and returns the last carry word in r3.
1775 # BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1776 # The main loop handles four words per pass; up to three leftover words follow it.
1777 # r3 = rp
1778 # r4 = ap
1779 # r5 = num
1780 # r6 = w
1781         xor     r0,r0,r0                #r0 = 0. NOTE(review): r0 is not read again in this routine.
1782         xor     r12,r12,r12             # used for carry
1783         rlwinm. r7,r5,30,2,31           # num >> 2
1784         beq     Lppcasm_mw_REM          # fewer than 4 words: only the leftover code runs.
1785         mtctr   r7                      # CTR = number of four-word passes.
1786 Lppcasm_mw_LOOP:
1787                                         #mul(rp[0],ap[0],w,c1);
1788         $LD     r8,`0*$BNSZ`(r4)
1789         $UMULL  r9,r6,r8
1790         $UMULH  r10,r6,r8
1791         addc    r9,r9,r12               #add the carry word from the previous pass.
1792         #addze  r10,r10                 #carry is NOT ignored.
1793                                         #will be taken care of
1794                                         #in second spin below
1795                                         #using adde.
1796         $ST     r9,`0*$BNSZ`(r3)
1797                                         #mul(rp[1],ap[1],w,c1);
1798         $LD     r8,`1*$BNSZ`(r4)
1799         $UMULL  r11,r6,r8
1800         $UMULH  r12,r6,r8
1801         adde    r11,r11,r10             #add the previous high word plus the pending carry bit.
1802         #addze  r12,r12
1803         $ST     r11,`1*$BNSZ`(r3)
1804                                         #mul(rp[2],ap[2],w,c1);
1805         $LD     r8,`2*$BNSZ`(r4)
1806         $UMULL  r9,r6,r8
1807         $UMULH  r10,r6,r8
1808         adde    r9,r9,r12               #add the previous high word plus the pending carry bit.
1809         #addze  r10,r10
1810         $ST     r9,`2*$BNSZ`(r3)
1811                                         #mul(rp[3],ap[3],w,c1);
1812         $LD     r8,`3*$BNSZ`(r4)
1813         $UMULL  r11,r6,r8
1814         $UMULH  r12,r6,r8
1815         adde    r11,r11,r10             #add the previous high word plus the pending carry bit.
1816         addze   r12,r12                 #this spin we collect carry into
1817                                         #r12
1818         $ST     r11,`3*$BNSZ`(r3)
1819
1820         addi    r3,r3,`4*$BNSZ`         #advance rp by four words.
1821         addi    r4,r4,`4*$BNSZ`         #advance ap by four words.
1822         bdnz-   Lppcasm_mw_LOOP
1823
1824 Lppcasm_mw_REM:
1825         andi.   r5,r5,0x3               #r5 = num mod 4, the number of leftover words.
1826         beq     Lppcasm_mw_OVER
1827                                         #mul(rp[0],ap[0],w,c1);
1828         $LD     r8,`0*$BNSZ`(r4)
1829         $UMULL  r9,r6,r8
1830         $UMULH  r10,r6,r8
1831         addc    r9,r9,r12
1832         addze   r10,r10                 #fold the carry bit into the high word.
1833         $ST     r9,`0*$BNSZ`(r3)
1834         addi    r12,r10,0               #r12 = r10, the carry word for the next step.
1835
1836         addi    r5,r5,-1
1837         cmpli   0,0,r5,0                #any leftover words remaining?
1838         beq     Lppcasm_mw_OVER
1839
1840
1841                                         #mul(rp[1],ap[1],w,c1);
1842         $LD     r8,`1*$BNSZ`(r4)
1843         $UMULL  r9,r6,r8
1844         $UMULH  r10,r6,r8
1845         addc    r9,r9,r12
1846         addze   r10,r10                 #fold the carry bit into the high word.
1847         $ST     r9,`1*$BNSZ`(r3)
1848         addi    r12,r10,0               #r12 = r10, the carry word for the next step.
1849
1850         addi    r5,r5,-1
1851         cmpli   0,0,r5,0                #any leftover words remaining?
1852         beq     Lppcasm_mw_OVER
1853
1854                                         #mul(rp[2],ap[2],w,c1);
1855         $LD     r8,`2*$BNSZ`(r4)
1856         $UMULL  r9,r6,r8
1857         $UMULH  r10,r6,r8
1858         addc    r9,r9,r12
1859         addze   r10,r10                 #fold the carry bit into the high word.
1860         $ST     r9,`2*$BNSZ`(r3)
1861         addi    r12,r10,0               #r12 = r10, the final carry word.
1862
1863 Lppcasm_mw_OVER:
1864         addi    r3,r12,0                #return the carry word.
1865         blr                             #the data words below are presumably a traceback table (TODO confirm).
1866         .long   0
1867         .byte   0,12,0x14,0,0,0,4,0
1868         .long   0
1869
1870 #
1871 #       NOTE:   The following label name should be changed to
1872 #               "bn_mul_add_words" i.e. remove the first dot
1873 #               for the gcc compiler. This should be automatically
1874 #               done in the build
1875 #
1876
1877 .align  4
1878 .bn_mul_add_words:
1879 # Computes rp[i] = rp[i] + ap[i]*w + carry for num words and returns the last carry word in r3.
1880 # BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1881 #
1882 # r3 = rp
1883 # r4 = ap
1884 # r5 = num
1885 # r6 = w
1886 #
1887 # empirical evidence suggests that unrolled version performs best!!
1888 # The main loop handles four words per pass; up to three leftover words follow it.
1889         xor     r0,r0,r0                #r0 = 0. NOTE(review): r0 is not read again in this routine.
1890         xor     r12,r12,r12             #r12 = 0 . used for carry
1891         rlwinm. r7,r5,30,2,31           # num >> 2
1892         beq     Lppcasm_maw_leftover    # if (num < 4) go Lppcasm_maw_leftover
1893         mtctr   r7                      # CTR = number of four-word passes.
1894 Lppcasm_maw_mainloop:
1895                                         #mul_add(rp[0],ap[0],w,c1);
1896         $LD     r8,`0*$BNSZ`(r4)
1897         $LD     r11,`0*$BNSZ`(r3)
1898         $UMULL  r9,r6,r8
1899         $UMULH  r10,r6,r8
1900         addc    r9,r9,r12               #r12 is carry.
1901         addze   r10,r10
1902         addc    r9,r9,r11               #add the old rp[0]; the carry bit stays pending.
1903         #addze  r10,r10
1904                                         #the above instruction addze
1905                                         #is NOT needed. Carry will NOT
1906                                         #be ignored. It is not affected
1907                                         #by multiply and will be collected
1908                                         #in the next spin
1909         $ST     r9,`0*$BNSZ`(r3)
1910
1911                                         #mul_add(rp[1],ap[1],w,c1);
1912         $LD     r8,`1*$BNSZ`(r4)
1913         $LD     r9,`1*$BNSZ`(r3)
1914         $UMULL  r11,r6,r8
1915         $UMULH  r12,r6,r8
1916         adde    r11,r11,r10             #r10 is carry.
1917         addze   r12,r12
1918         addc    r11,r11,r9              #add the old rp[1]; the carry bit stays pending.
1919         #addze  r12,r12
1920         $ST     r11,`1*$BNSZ`(r3)
1921
1922                                         #mul_add(rp[2],ap[2],w,c1);
1923         $LD     r8,`2*$BNSZ`(r4)
1924         $UMULL  r9,r6,r8
1925         $LD     r11,`2*$BNSZ`(r3)
1926         $UMULH  r10,r6,r8
1927         adde    r9,r9,r12               #r12 is carry.
1928         addze   r10,r10
1929         addc    r9,r9,r11               #add the old rp[2]; the carry bit stays pending.
1930         #addze  r10,r10
1931         $ST     r9,`2*$BNSZ`(r3)
1932
1933                                         #mul_add(rp[3],ap[3],w,c1);
1934         $LD     r8,`3*$BNSZ`(r4)
1935         $UMULL  r11,r6,r8
1936         $LD     r9,`3*$BNSZ`(r3)
1937         $UMULH  r12,r6,r8
1938         adde    r11,r11,r10             #r10 is carry.
1939         addze   r12,r12
1940         addc    r11,r11,r9              #add the old rp[3].
1941         addze   r12,r12                 #last word of the pass: fold the carry bit into r12 now.
1942         $ST     r11,`3*$BNSZ`(r3)
1943         addi    r3,r3,`4*$BNSZ`         #advance rp by four words.
1944         addi    r4,r4,`4*$BNSZ`         #advance ap by four words.
1945         bdnz-   Lppcasm_maw_mainloop
1946
1947 Lppcasm_maw_leftover:
1948         andi.   r5,r5,0x3               #r5 = num mod 4, the number of leftover words.
1949         beq     Lppcasm_maw_adios
1950         addi    r3,r3,-$BNSZ            #back both pointers up by one word for the
1951         addi    r4,r4,-$BNSZ            #$LDU accesses at offset $BNSZ below.
1952                                         #mul_add(rp[0],ap[0],w,c1);
1953         mtctr   r5                      #CTR = leftover count; bdz below stops when it runs out.
1954         $LDU    r8,$BNSZ(r4)
1955         $UMULL  r9,r6,r8
1956         $UMULH  r10,r6,r8
1957         $LDU    r11,$BNSZ(r3)
1958         addc    r9,r9,r11               #add the old rp word.
1959         addze   r10,r10
1960         addc    r9,r9,r12               #add the incoming carry word.
1961         addze   r12,r10                 #r12 = r10 plus the carry bit, the new carry word.
1962         $ST     r9,0(r3)                #store back to the word just loaded through r3.
1963
1964         bdz     Lppcasm_maw_adios
1965                                         #mul_add(rp[1],ap[1],w,c1);
1966         $LDU    r8,$BNSZ(r4)
1967         $UMULL  r9,r6,r8
1968         $UMULH  r10,r6,r8
1969         $LDU    r11,$BNSZ(r3)
1970         addc    r9,r9,r11               #add the old rp word.
1971         addze   r10,r10
1972         addc    r9,r9,r12               #add the incoming carry word.
1973         addze   r12,r10                 #r12 = r10 plus the carry bit, the new carry word.
1974         $ST     r9,0(r3)                #store back to the word just loaded through r3.
1975
1976         bdz     Lppcasm_maw_adios
1977                                         #mul_add(rp[2],ap[2],w,c1);
1978         $LDU    r8,$BNSZ(r4)
1979         $UMULL  r9,r6,r8
1980         $UMULH  r10,r6,r8
1981         $LDU    r11,$BNSZ(r3)
1982         addc    r9,r9,r11               #add the old rp word.
1983         addze   r10,r10
1984         addc    r9,r9,r12               #add the incoming carry word.
1985         addze   r12,r10                 #r12 = r10 plus the carry bit, the new carry word.
1986         $ST     r9,0(r3)                #store back to the word just loaded through r3.
1987
1988 Lppcasm_maw_adios:
1989         addi    r3,r12,0                #return the carry word.
1990         blr                             #the data words below are presumably a traceback table (TODO confirm).
1991         .long   0
1992         .byte   0,12,0x14,0,0,0,4,0
1993         .long   0
1994         .align  4
1995 EOF
1996 $data =~ s/\`([^\`]*)\`/eval $1/gem;    # replace every backquoted expression in the template with its evaluated result
1997 print $data;                            # emit the generated assembly on STDOUT
1998 close STDOUT or die "error closing STDOUT: $!";    # buffered write errors surface at close; fail loudly rather than leave a truncated file