crypto/bn/bn_asm.c

   1 /* crypto/bn/bn_asm.c */
   2 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
   3  * All rights reserved.
   4  *
   5  * This package is an SSL implementation written
   6  * by Eric Young (eay@cryptsoft.com).
   7  * The implementation was written so as to conform with Netscapes SSL.
   8  *
   9  * This library is free for commercial and non-commercial use as long as
  10  * the following conditions are aheared to.  The following conditions
  11  * apply to all code found in this distribution, be it the RC4, RSA,
  12  * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
  13  * included with this distribution is covered by the same copyright terms
  14  * except that the holder is Tim Hudson (tjh@cryptsoft.com).
  15  *
  16  * Copyright remains Eric Young's, and as such any Copyright notices in
  17  * the code are not to be removed.
  18  * If this package is used in a product, Eric Young should be given attribution
  19  * as the author of the parts of the library used.
  20  * This can be in the form of a textual message at program startup or
  21  * in documentation (online or textual) provided with the package.
  22  *
  23  * Redistribution and use in source and binary forms, with or without
  24  * modification, are permitted provided that the following conditions
  25  * are met:
  26  * 1. Redistributions of source code must retain the copyright
  27  *    notice, this list of conditions and the following disclaimer.
  28  * 2. Redistributions in binary form must reproduce the above copyright
  29  *    notice, this list of conditions and the following disclaimer in the
  30  *    documentation and/or other materials provided with the distribution.
  31  * 3. All advertising materials mentioning features or use of this software
  32  *    must display the following acknowledgement:
  33  *    "This product includes cryptographic software written by
  34  *     Eric Young (eay@cryptsoft.com)"
  35  *    The word 'cryptographic' can be left out if the rouines from the library
  36  *    being used are not cryptographic related :-).
  37  * 4. If you include any Windows specific code (or a derivative thereof) from
  38  *    the apps directory (application code) you must include an acknowledgement:
  39  *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
  40  *
  41  * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
  42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  51  * SUCH DAMAGE.
  52  *
  53  * The licence and distribution terms for any publically available version or
  54  * derivative of this code cannot be changed.  i.e. this code cannot simply be
  55  * copied and put under another distribution licence
  56  * [including the GNU Public Licence.]
  57  */
  58
  59 #ifndef BN_DEBUG
  60 # undef NDEBUG /* avoid conflicting definitions */
  61 # define NDEBUG
  62 #endif
  63
  64 #include <stdio.h>
  65 #include <assert.h>
  66 #include <openssl/crypto.h>
  67 #include "cryptlib.h"
  68 #include "bn_lcl.h"
  69
  70 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
  71
  72 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
  73         {
  74         BN_ULONG c1=0;
  75
  76         assert(num >= 0);
  77         if (num <= 0) return(c1);
  78
  79 #ifndef OPENSSL_SMALL_FOOTPRINT
  80         while (num&~3)
  81                 {
  82                 mul_add(rp[0],ap[0],w,c1);
  83                 mul_add(rp[1],ap[1],w,c1);
  84                 mul_add(rp[2],ap[2],w,c1);
  85                 mul_add(rp[3],ap[3],w,c1);
  86                 ap+=4; rp+=4; num-=4;
  87                 }
  88 #endif
  89         while (num)
  90                 {
  91                 mul_add(rp[0],ap[0],w,c1);
  92                 ap++; rp++; num--;
  93                 }
  94
  95         return(c1);
  96         }
  97
  98 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
  99         {
 100         BN_ULONG c1=0;
 101
 102         assert(num >= 0);
 103         if (num <= 0) return(c1);
 104
 105 #ifndef OPENSSL_SMALL_FOOTPRINT
 106         while (num&~3)
 107                 {
 108                 mul(rp[0],ap[0],w,c1);
 109                 mul(rp[1],ap[1],w,c1);
 110                 mul(rp[2],ap[2],w,c1);
 111                 mul(rp[3],ap[3],w,c1);
 112                 ap+=4; rp+=4; num-=4;
 113                 }
 114 #endif
 115         while (num)
 116                 {
 117                 mul(rp[0],ap[0],w,c1);
 118                 ap++; rp++; num--;
 119                 }
 120         return(c1);
 121         }
 122
 123 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
 124         {
 125         assert(n >= 0);
 126         if (n <= 0) return;
 127
 128 #ifndef OPENSSL_SMALL_FOOTPRINT
 129         while (n&~3)
 130                 {
 131                 sqr(r[0],r[1],a[0]);
 132                 sqr(r[2],r[3],a[1]);
 133                 sqr(r[4],r[5],a[2]);
 134                 sqr(r[6],r[7],a[3]);
 135                 a+=4; r+=8; n-=4;
 136                 }
 137 #endif
 138         while (n)
 139                 {
 140                 sqr(r[0],r[1],a[0]);
 141                 a++; r+=2; n--;
 142                 }
 143         }
 144
 145 #else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
 146
 147 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 148         {
 149         BN_ULONG c=0;
 150         BN_ULONG bl,bh;
 151
 152         assert(num >= 0);
 153         if (num <= 0) return((BN_ULONG)0);
 154
 155         bl=LBITS(w);
 156         bh=HBITS(w);
 157
 158 #ifndef OPENSSL_SMALL_FOOTPRINT
 159         while (num&~3)
 160                 {
 161                 mul_add(rp[0],ap[0],bl,bh,c);
 162                 mul_add(rp[1],ap[1],bl,bh,c);
 163                 mul_add(rp[2],ap[2],bl,bh,c);
 164                 mul_add(rp[3],ap[3],bl,bh,c);
 165                 ap+=4; rp+=4; num-=4;
 166                 }
 167 #endif
 168         while (num)
 169                 {
 170                 mul_add(rp[0],ap[0],bl,bh,c);
 171                 ap++; rp++; num--;
 172                 }
 173         return(c);
 174         }
 175
 176 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 177         {
 178         BN_ULONG carry=0;
 179         BN_ULONG bl,bh;
 180
 181         assert(num >= 0);
 182         if (num <= 0) return((BN_ULONG)0);
 183
 184         bl=LBITS(w);
 185         bh=HBITS(w);
 186
 187 #ifndef OPENSSL_SMALL_FOOTPRINT
 188         while (num&~3)
 189                 {
 190                 mul(rp[0],ap[0],bl,bh,carry);
 191                 mul(rp[1],ap[1],bl,bh,carry);
 192                 mul(rp[2],ap[2],bl,bh,carry);
 193                 mul(rp[3],ap[3],bl,bh,carry);
 194                 ap+=4; rp+=4; num-=4;
 195                 }
 196 #endif
 197         while (num)
 198                 {
 199                 mul(rp[0],ap[0],bl,bh,carry);
 200                 ap++; rp++; num--;
 201                 }
 202         return(carry);
 203         }
 204
 205 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
 206         {
 207         assert(n >= 0);
 208         if (n <= 0) return;
 209
 210 #ifndef OPENSSL_SMALL_FOOTPRINT
 211         while (n&~3)
 212                 {
 213                 sqr64(r[0],r[1],a[0]);
 214                 sqr64(r[2],r[3],a[1]);
 215                 sqr64(r[4],r[5],a[2]);
 216                 sqr64(r[6],r[7],a[3]);
 217                 a+=4; r+=8; n-=4;
 218                 }
 219 #endif
 220         while (n)
 221                 {
 222                 sqr64(r[0],r[1],a[0]);
 223                 a++; r+=2; n--;
 224                 }
 225         }
 226
 227 #endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
 228
 229 #if defined(BN_LLONG) && defined(BN_DIV2W)
 230
 231 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 232         {
 233         return((BN_ULONG)(((((BN_ULLONG)h)<<BN_BITS2)|l)/(BN_ULLONG)d));
 234         }
 235
 236 #else
 237
  238 /* Divide h,l by d and return the result. */
  239 /* I need to test this some more :-( */
      /*
       * h and l are the high and low words of the dividend, d is the divisor.
       * Returns BN_MASK2 when d is 0.  Otherwise d is shifted left by
       * BN_BITS2 - BN_num_bits_word(d) bits (presumably so that its top bit is
       * set), h:l is shifted by the same amount, and the quotient is assembled
       * from two half-word digits, one per pass of the outer loop (count=2).
       */
  240 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
  241         {
  242         BN_ULONG dh,dl,q,ret=0,th,tl,t;
  243         int i,count=2;
  244
  245         if (d == 0) return(BN_MASK2);
  246
  247         i=BN_num_bits_word(d);
  248         assert((i == BN_BITS2) || (h <= (BN_ULONG)1<<i));
  249
      /* i becomes the left-shift count applied to d, h and l below. */
  250         i=BN_BITS2-i;
  251         if (h >= d) h-=d;
  252
      /* The shift is skipped when i is 0, which also avoids l>>BN_BITS2. */
  253         if (i)
  254                 {
  255                 d<<=i;
  256                 h=(h<<i)|(l>>(BN_BITS2-i));
  257                 l<<=i;
  258                 }
      /* dh and dl are the upper and lower halves of the shifted divisor. */
  259         dh=(d&BN_MASK2h)>>BN_BITS4;
  260         dl=(d&BN_MASK2l);
  261         for (;;)
  262                 {
      /* First estimate of the next half-word quotient digit. */
  263                 if ((h>>BN_BITS4) == dh)
  264                         q=BN_MASK2l;
  265                 else
  266                         q=h/dh;
  267
  268                 th=q*dh;
  269                 tl=dl*q;
      /* Step q down (and th, tl with it) until the test below is met. */
  270                 for (;;)
  271                         {
  272                         t=h-th;
  273                         if ((t&BN_MASK2h) ||
  274                                 ((tl) <= (
  275                                         (t<<BN_BITS4)|
  276                                         ((l&BN_MASK2h)>>BN_BITS4))))
  277                                 break;
  278                         q--;
  279                         th-=dh;
  280                         tl-=dl;
  281                         }
      /* Realign tl by half a word, moving its upper half into th. */
  282                 t=(tl>>BN_BITS4);
  283                 tl=(tl<<BN_BITS4)&BN_MASK2h;
  284                 th+=t;
  285
      /* Subtract th:tl from h:l; a borrow out of l is folded into th. */
  286                 if (l < tl) th++;
  287                 l-=tl;
  288                 if (h < th)
  289                         {
  290                         h+=d;
  291                         q--;
  292                         }
  293                 h-=th;
  294
  295                 if (--count == 0) break;
  296
      /* First digit becomes the upper half of ret; shift h:l up half a word. */
  297                 ret=q<<BN_BITS4;
  298                 h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2;
  299                 l=(l&BN_MASK2l)<<BN_BITS4;
  300                 }
      /* The second digit fills the lower half of the result. */
  301         ret|=q;
  302         return(ret);
  303         }
  304 #endif /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
 305
 306 #ifdef BN_LLONG
 307 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
 308         {
 309         BN_ULLONG ll=0;
 310
 311         assert(n >= 0);
 312         if (n <= 0) return((BN_ULONG)0);
 313
 314 #ifndef OPENSSL_SMALL_FOOTPRINT
 315         while (n&~3)
 316                 {
 317                 ll+=(BN_ULLONG)a[0]+b[0];
 318                 r[0]=(BN_ULONG)ll&BN_MASK2;
 319                 ll>>=BN_BITS2;
 320                 ll+=(BN_ULLONG)a[1]+b[1];
 321                 r[1]=(BN_ULONG)ll&BN_MASK2;
 322                 ll>>=BN_BITS2;
 323                 ll+=(BN_ULLONG)a[2]+b[2];
 324                 r[2]=(BN_ULONG)ll&BN_MASK2;
 325                 ll>>=BN_BITS2;
 326                 ll+=(BN_ULLONG)a[3]+b[3];
 327                 r[3]=(BN_ULONG)ll&BN_MASK2;
 328                 ll>>=BN_BITS2;
 329                 a+=4; b+=4; r+=4; n-=4;
 330                 }
 331 #endif
 332         while (n)
 333                 {
 334                 ll+=(BN_ULLONG)a[0]+b[0];
 335                 r[0]=(BN_ULONG)ll&BN_MASK2;
 336                 ll>>=BN_BITS2;
 337                 a++; b++; r++; n--;
 338                 }
 339         return((BN_ULONG)ll);
 340         }
 341 #else /* !BN_LLONG */
 342 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
 343         {
 344         BN_ULONG c,l,t;
 345
 346         assert(n >= 0);
 347         if (n <= 0) return((BN_ULONG)0);
 348
 349         c=0;
 350 #ifndef OPENSSL_SMALL_FOOTPRINT
 351         while (n&~3)
 352                 {
 353                 t=a[0];
 354                 t=(t+c)&BN_MASK2;
 355                 c=(t < c);
 356                 l=(t+b[0])&BN_MASK2;
 357                 c+=(l < t);
 358                 r[0]=l;
 359                 t=a[1];
 360                 t=(t+c)&BN_MASK2;
 361                 c=(t < c);
 362                 l=(t+b[1])&BN_MASK2;
 363                 c+=(l < t);
 364                 r[1]=l;
 365                 t=a[2];
 366                 t=(t+c)&BN_MASK2;
 367                 c=(t < c);
 368                 l=(t+b[2])&BN_MASK2;
 369                 c+=(l < t);
 370                 r[2]=l;
 371                 t=a[3];
 372                 t=(t+c)&BN_MASK2;
 373                 c=(t < c);
 374                 l=(t+b[3])&BN_MASK2;
 375                 c+=(l < t);
 376                 r[3]=l;
 377                 a+=4; b+=4; r+=4; n-=4;
 378                 }
 379 #endif
 380         while(n)
 381                 {
 382                 t=a[0];
 383                 t=(t+c)&BN_MASK2;
 384                 c=(t < c);
 385                 l=(t+b[0])&BN_MASK2;
 386                 c+=(l < t);
 387                 r[0]=l;
 388                 a++; b++; r++; n--;
 389                 }
 390         return((BN_ULONG)c);
 391         }
 392 #endif /* !BN_LLONG */
 393
 394 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
 395         {
 396         BN_ULONG t1,t2;
 397         int c=0;
 398
 399         assert(n >= 0);
 400         if (n <= 0) return((BN_ULONG)0);
 401
 402 #ifndef OPENSSL_SMALL_FOOTPRINT
 403         while (n&~3)
 404                 {
 405                 t1=a[0]; t2=b[0];
 406                 r[0]=(t1-t2-c)&BN_MASK2;
 407                 if (t1 != t2) c=(t1 < t2);
 408                 t1=a[1]; t2=b[1];
 409                 r[1]=(t1-t2-c)&BN_MASK2;
 410                 if (t1 != t2) c=(t1 < t2);
 411                 t1=a[2]; t2=b[2];
 412                 r[2]=(t1-t2-c)&BN_MASK2;
 413                 if (t1 != t2) c=(t1 < t2);
 414                 t1=a[3]; t2=b[3];
 415                 r[3]=(t1-t2-c)&BN_MASK2;
 416                 if (t1 != t2) c=(t1 < t2);
 417                 a+=4; b+=4; r+=4; n-=4;
 418                 }
 419 #endif
 420         while (n)
 421                 {
 422                 t1=a[0]; t2=b[0];
 423                 r[0]=(t1-t2-c)&BN_MASK2;
 424                 if (t1 != t2) c=(t1 < t2);
 425                 a++; b++; r++; n--;
 426                 }
 427         return(c);
 428         }
 429
 430 #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
 431
 432 #ifndef OPENSSL_FIPSCANISTER
 433 #undef bn_mul_comba8
 434 #undef bn_mul_comba4
 435 #undef bn_sqr_comba8
 436 #undef bn_sqr_comba4
 437 #endif
 438
  439 /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
  440 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
  441 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
  442 /* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
      /*
       * The macros below are not self-contained: they assign to temporaries
       * that the calling function must declare (t1 and t2 in every variant,
       * plus t and tt for BN_LLONG, or bl and bh for the LBITS/HBITS fallback).
       */
  443
  444 #ifdef BN_LLONG
      /* Variant 1: form the product in a double-width BN_ULLONG and split it with Lw()/Hw(). */
  445 #define mul_add_c(a,b,c0,c1,c2) \
  446         t=(BN_ULLONG)a*b; \
  447         t1=(BN_ULONG)Lw(t); \
  448         t2=(BN_ULONG)Hw(t); \
  449         c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
  450         c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
  451
      /* Doubles the product in tt; a wrap of t+t is carried straight into c2. */
  452 #define mul_add_c2(a,b,c0,c1,c2) \
  453         t=(BN_ULLONG)a*b; \
  454         tt=(t+t)&BN_MASK; \
  455         if (tt < t) c2++; \
  456         t1=(BN_ULONG)Lw(tt); \
  457         t2=(BN_ULONG)Hw(tt); \
  458         c0=(c0+t1)&BN_MASK2;  \
  459         if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
  460         c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
  461
  462 #define sqr_add_c(a,i,c0,c1,c2) \
  463         t=(BN_ULLONG)a[i]*a[i]; \
  464         t1=(BN_ULONG)Lw(t); \
  465         t2=(BN_ULONG)Hw(t); \
  466         c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
  467         c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
  468
  469 #define sqr_add_c2(a,i,j,c0,c1,c2) \
  470         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
  471
  472 #elif defined(BN_UMULT_LOHI)
  473
      /* Variant 2: BN_UMULT_LOHI() delivers both product halves (t1 is added to c0, t2 to c1). */
  474 #define mul_add_c(a,b,c0,c1,c2) {       \
  475         BN_ULONG ta=(a),tb=(b);         \
  476         BN_UMULT_LOHI(t1,t2,ta,tb);     \
  477         c0 += t1; t2 += (c0<t1)?1:0;    \
  478         c1 += t2; c2 += (c1<t2)?1:0;    \
  479         }
  480
      /* Doubles both product halves separately, carrying each wrap upwards. */
  481 #define mul_add_c2(a,b,c0,c1,c2) {      \
  482         BN_ULONG ta=(a),tb=(b),t0;      \
  483         BN_UMULT_LOHI(t0,t1,ta,tb);     \
  484         t2 = t1+t1; c2 += (t2<t1)?1:0;  \
  485         t1 = t0+t0; t2 += (t1<t0)?1:0;  \
  486         c0 += t1; t2 += (c0<t1)?1:0;    \
  487         c1 += t2; c2 += (c1<t2)?1:0;    \
  488         }
  489
  490 #define sqr_add_c(a,i,c0,c1,c2) {       \
  491         BN_ULONG ta=(a)[i];             \
  492         BN_UMULT_LOHI(t1,t2,ta,ta);     \
  493         c0 += t1; t2 += (c0<t1)?1:0;    \
  494         c1 += t2; c2 += (c1<t2)?1:0;    \
  495         }
  496
  497 #define sqr_add_c2(a,i,j,c0,c1,c2)      \
  498         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
  499
  500 #elif defined(BN_UMULT_HIGH)
  501
      /* Variant 3: low half from a plain multiply, high half from BN_UMULT_HIGH(). */
  502 #define mul_add_c(a,b,c0,c1,c2) {       \
  503         BN_ULONG ta=(a),tb=(b);         \
  504         t1 = ta * tb;                   \
  505         t2 = BN_UMULT_HIGH(ta,tb);      \
  506         c0 += t1; t2 += (c0<t1)?1:0;    \
  507         c1 += t2; c2 += (c1<t2)?1:0;    \
  508         }
  509
  510 #define mul_add_c2(a,b,c0,c1,c2) {      \
  511         BN_ULONG ta=(a),tb=(b),t0;      \
  512         t1 = BN_UMULT_HIGH(ta,tb);      \
  513         t0 = ta * tb;                   \
  514         t2 = t1+t1; c2 += (t2<t1)?1:0;  \
  515         t1 = t0+t0; t2 += (t1<t0)?1:0;  \
  516         c0 += t1; t2 += (c0<t1)?1:0;    \
  517         c1 += t2; c2 += (c1<t2)?1:0;    \
  518         }
  519
  520 #define sqr_add_c(a,i,c0,c1,c2) {       \
  521         BN_ULONG ta=(a)[i];             \
  522         t1 = ta * ta;                   \
  523         t2 = BN_UMULT_HIGH(ta,ta);      \
  524         c0 += t1; t2 += (c0<t1)?1:0;    \
  525         c1 += t2; c2 += (c1<t2)?1:0;    \
  526         }
  527
  528 #define sqr_add_c2(a,i,j,c0,c1,c2)      \
  529         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
  530
  531 #else /* none of BN_LLONG, BN_UMULT_LOHI, BN_UMULT_HIGH */
      /* Variant 4: split operands with LBITS()/HBITS() and multiply via mul64(). */
  532 #define mul_add_c(a,b,c0,c1,c2) \
  533         t1=LBITS(a); t2=HBITS(a); \
  534         bl=LBITS(b); bh=HBITS(b); \
  535         mul64(t1,t2,bl,bh); \
  536         c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
  537         c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
  538
      /* Doubling is done per word: a set BN_TBIT is carried up before each word is doubled. */
  539 #define mul_add_c2(a,b,c0,c1,c2) \
  540         t1=LBITS(a); t2=HBITS(a); \
  541         bl=LBITS(b); bh=HBITS(b); \
  542         mul64(t1,t2,bl,bh); \
  543         if (t2 & BN_TBIT) c2++; \
  544         t2=(t2+t2)&BN_MASK2; \
  545         if (t1 & BN_TBIT) t2++; \
  546         t1=(t1+t1)&BN_MASK2; \
  547         c0=(c0+t1)&BN_MASK2;  \
  548         if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
  549         c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
  550
  551 #define sqr_add_c(a,i,c0,c1,c2) \
  552         sqr64(t1,t2,(a)[i]); \
  553         c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
  554         c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
  555
  556 #define sqr_add_c2(a,i,j,c0,c1,c2) \
  557         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
  558 #endif /* BN_LLONG / BN_UMULT_LOHI / BN_UMULT_HIGH / fallback */
 559
 560 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 561         {
 562 #ifdef BN_LLONG
 563         BN_ULLONG t;
 564 #else
 565         BN_ULONG bl,bh;
 566 #endif
 567         BN_ULONG t1,t2;
 568         BN_ULONG c1,c2,c3;
 569
 570         c1=0;
 571         c2=0;
 572         c3=0;
 573         mul_add_c(a[0],b[0],c1,c2,c3);
 574         r[0]=c1;
 575         c1=0;
 576         mul_add_c(a[0],b[1],c2,c3,c1);
 577         mul_add_c(a[1],b[0],c2,c3,c1);
 578         r[1]=c2;
 579         c2=0;
 580         mul_add_c(a[2],b[0],c3,c1,c2);
 581         mul_add_c(a[1],b[1],c3,c1,c2);
 582         mul_add_c(a[0],b[2],c3,c1,c2);
 583         r[2]=c3;
 584         c3=0;
 585         mul_add_c(a[0],b[3],c1,c2,c3);
 586         mul_add_c(a[1],b[2],c1,c2,c3);
 587         mul_add_c(a[2],b[1],c1,c2,c3);
 588         mul_add_c(a[3],b[0],c1,c2,c3);
 589         r[3]=c1;
 590         c1=0;
 591         mul_add_c(a[4],b[0],c2,c3,c1);
 592         mul_add_c(a[3],b[1],c2,c3,c1);
 593         mul_add_c(a[2],b[2],c2,c3,c1);
 594         mul_add_c(a[1],b[3],c2,c3,c1);
 595         mul_add_c(a[0],b[4],c2,c3,c1);
 596         r[4]=c2;
 597         c2=0;
 598         mul_add_c(a[0],b[5],c3,c1,c2);
 599         mul_add_c(a[1],b[4],c3,c1,c2);
 600         mul_add_c(a[2],b[3],c3,c1,c2);
 601         mul_add_c(a[3],b[2],c3,c1,c2);
 602         mul_add_c(a[4],b[1],c3,c1,c2);
 603         mul_add_c(a[5],b[0],c3,c1,c2);
 604         r[5]=c3;
 605         c3=0;
 606         mul_add_c(a[6],b[0],c1,c2,c3);
 607         mul_add_c(a[5],b[1],c1,c2,c3);
 608         mul_add_c(a[4],b[2],c1,c2,c3);
 609         mul_add_c(a[3],b[3],c1,c2,c3);
 610         mul_add_c(a[2],b[4],c1,c2,c3);
 611         mul_add_c(a[1],b[5],c1,c2,c3);
 612         mul_add_c(a[0],b[6],c1,c2,c3);
 613         r[6]=c1;
 614         c1=0;
 615         mul_add_c(a[0],b[7],c2,c3,c1);
 616         mul_add_c(a[1],b[6],c2,c3,c1);
 617         mul_add_c(a[2],b[5],c2,c3,c1);
 618         mul_add_c(a[3],b[4],c2,c3,c1);
 619         mul_add_c(a[4],b[3],c2,c3,c1);
 620         mul_add_c(a[5],b[2],c2,c3,c1);
 621         mul_add_c(a[6],b[1],c2,c3,c1);
 622         mul_add_c(a[7],b[0],c2,c3,c1);
 623         r[7]=c2;
 624         c2=0;
 625         mul_add_c(a[7],b[1],c3,c1,c2);
 626         mul_add_c(a[6],b[2],c3,c1,c2);
 627         mul_add_c(a[5],b[3],c3,c1,c2);
 628         mul_add_c(a[4],b[4],c3,c1,c2);
 629         mul_add_c(a[3],b[5],c3,c1,c2);
 630         mul_add_c(a[2],b[6],c3,c1,c2);
 631         mul_add_c(a[1],b[7],c3,c1,c2);
 632         r[8]=c3;
 633         c3=0;
 634         mul_add_c(a[2],b[7],c1,c2,c3);
 635         mul_add_c(a[3],b[6],c1,c2,c3);
 636         mul_add_c(a[4],b[5],c1,c2,c3);
 637         mul_add_c(a[5],b[4],c1,c2,c3);
 638         mul_add_c(a[6],b[3],c1,c2,c3);
 639         mul_add_c(a[7],b[2],c1,c2,c3);
 640         r[9]=c1;
 641         c1=0;
 642         mul_add_c(a[7],b[3],c2,c3,c1);
 643         mul_add_c(a[6],b[4],c2,c3,c1);
 644         mul_add_c(a[5],b[5],c2,c3,c1);
 645         mul_add_c(a[4],b[6],c2,c3,c1);
 646         mul_add_c(a[3],b[7],c2,c3,c1);
 647         r[10]=c2;
 648         c2=0;
 649         mul_add_c(a[4],b[7],c3,c1,c2);
 650         mul_add_c(a[5],b[6],c3,c1,c2);
 651         mul_add_c(a[6],b[5],c3,c1,c2);
 652         mul_add_c(a[7],b[4],c3,c1,c2);
 653         r[11]=c3;
 654         c3=0;
 655         mul_add_c(a[7],b[5],c1,c2,c3);
 656         mul_add_c(a[6],b[6],c1,c2,c3);
 657         mul_add_c(a[5],b[7],c1,c2,c3);
 658         r[12]=c1;
 659         c1=0;
 660         mul_add_c(a[6],b[7],c2,c3,c1);
 661         mul_add_c(a[7],b[6],c2,c3,c1);
 662         r[13]=c2;
 663         c2=0;
 664         mul_add_c(a[7],b[7],c3,c1,c2);
 665         r[14]=c3;
 666         r[15]=c1;
 667         }
 668
 669 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 670         {
 671 #ifdef BN_LLONG
 672         BN_ULLONG t;
 673 #else
 674         BN_ULONG bl,bh;
 675 #endif
 676         BN_ULONG t1,t2;
 677         BN_ULONG c1,c2,c3;
 678
 679         c1=0;
 680         c2=0;
 681         c3=0;
 682         mul_add_c(a[0],b[0],c1,c2,c3);
 683         r[0]=c1;
 684         c1=0;
 685         mul_add_c(a[0],b[1],c2,c3,c1);
 686         mul_add_c(a[1],b[0],c2,c3,c1);
 687         r[1]=c2;
 688         c2=0;
 689         mul_add_c(a[2],b[0],c3,c1,c2);
 690         mul_add_c(a[1],b[1],c3,c1,c2);
 691         mul_add_c(a[0],b[2],c3,c1,c2);
 692         r[2]=c3;
 693         c3=0;
 694         mul_add_c(a[0],b[3],c1,c2,c3);
 695         mul_add_c(a[1],b[2],c1,c2,c3);
 696         mul_add_c(a[2],b[1],c1,c2,c3);
 697         mul_add_c(a[3],b[0],c1,c2,c3);
 698         r[3]=c1;
 699         c1=0;
 700         mul_add_c(a[3],b[1],c2,c3,c1);
 701         mul_add_c(a[2],b[2],c2,c3,c1);
 702         mul_add_c(a[1],b[3],c2,c3,c1);
 703         r[4]=c2;
 704         c2=0;
 705         mul_add_c(a[2],b[3],c3,c1,c2);
 706         mul_add_c(a[3],b[2],c3,c1,c2);
 707         r[5]=c3;
 708         c3=0;
 709         mul_add_c(a[3],b[3],c1,c2,c3);
 710         r[6]=c1;
 711         r[7]=c2;
 712         }
 713
 714 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 715         {
 716 #ifdef BN_LLONG
 717         BN_ULLONG t,tt;
 718 #else
 719         BN_ULONG bl,bh;
 720 #endif
 721         BN_ULONG t1,t2;
 722         BN_ULONG c1,c2,c3;
 723
 724         c1=0;
 725         c2=0;
 726         c3=0;
 727         sqr_add_c(a,0,c1,c2,c3);
 728         r[0]=c1;
 729         c1=0;
 730         sqr_add_c2(a,1,0,c2,c3,c1);
 731         r[1]=c2;
 732         c2=0;
 733         sqr_add_c(a,1,c3,c1,c2);
 734         sqr_add_c2(a,2,0,c3,c1,c2);
 735         r[2]=c3;
 736         c3=0;
 737         sqr_add_c2(a,3,0,c1,c2,c3);
 738         sqr_add_c2(a,2,1,c1,c2,c3);
 739         r[3]=c1;
 740         c1=0;
 741         sqr_add_c(a,2,c2,c3,c1);
 742         sqr_add_c2(a,3,1,c2,c3,c1);
 743         sqr_add_c2(a,4,0,c2,c3,c1);
 744         r[4]=c2;
 745         c2=0;
 746         sqr_add_c2(a,5,0,c3,c1,c2);
 747         sqr_add_c2(a,4,1,c3,c1,c2);
 748         sqr_add_c2(a,3,2,c3,c1,c2);
 749         r[5]=c3;
 750         c3=0;
 751         sqr_add_c(a,3,c1,c2,c3);
 752         sqr_add_c2(a,4,2,c1,c2,c3);
 753         sqr_add_c2(a,5,1,c1,c2,c3);
 754         sqr_add_c2(a,6,0,c1,c2,c3);
 755         r[6]=c1;
 756         c1=0;
 757         sqr_add_c2(a,7,0,c2,c3,c1);
 758         sqr_add_c2(a,6,1,c2,c3,c1);
 759         sqr_add_c2(a,5,2,c2,c3,c1);
 760         sqr_add_c2(a,4,3,c2,c3,c1);
 761         r[7]=c2;
 762         c2=0;
 763         sqr_add_c(a,4,c3,c1,c2);
 764         sqr_add_c2(a,5,3,c3,c1,c2);
 765         sqr_add_c2(a,6,2,c3,c1,c2);
 766         sqr_add_c2(a,7,1,c3,c1,c2);
 767         r[8]=c3;
 768         c3=0;
 769         sqr_add_c2(a,7,2,c1,c2,c3);
 770         sqr_add_c2(a,6,3,c1,c2,c3);
 771         sqr_add_c2(a,5,4,c1,c2,c3);
 772         r[9]=c1;
 773         c1=0;
 774         sqr_add_c(a,5,c2,c3,c1);
 775         sqr_add_c2(a,6,4,c2,c3,c1);
 776         sqr_add_c2(a,7,3,c2,c3,c1);
 777         r[10]=c2;
 778         c2=0;
 779         sqr_add_c2(a,7,4,c3,c1,c2);
 780         sqr_add_c2(a,6,5,c3,c1,c2);
 781         r[11]=c3;
 782         c3=0;
 783         sqr_add_c(a,6,c1,c2,c3);
 784         sqr_add_c2(a,7,5,c1,c2,c3);
 785         r[12]=c1;
 786         c1=0;
 787         sqr_add_c2(a,7,6,c2,c3,c1);
 788         r[13]=c2;
 789         c2=0;
 790         sqr_add_c(a,7,c3,c1,c2);
 791         r[14]=c3;
 792         r[15]=c1;
 793         }
 794
 795 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 796         {
 797 #ifdef BN_LLONG
 798         BN_ULLONG t,tt;
 799 #else
 800         BN_ULONG bl,bh;
 801 #endif
 802         BN_ULONG t1,t2;
 803         BN_ULONG c1,c2,c3;
 804
 805         c1=0;
 806         c2=0;
 807         c3=0;
 808         sqr_add_c(a,0,c1,c2,c3);
 809         r[0]=c1;
 810         c1=0;
 811         sqr_add_c2(a,1,0,c2,c3,c1);
 812         r[1]=c2;
 813         c2=0;
 814         sqr_add_c(a,1,c3,c1,c2);
 815         sqr_add_c2(a,2,0,c3,c1,c2);
 816         r[2]=c3;
 817         c3=0;
 818         sqr_add_c2(a,3,0,c1,c2,c3);
 819         sqr_add_c2(a,2,1,c1,c2,c3);
 820         r[3]=c1;
 821         c1=0;
 822         sqr_add_c(a,2,c2,c3,c1);
 823         sqr_add_c2(a,3,1,c2,c3,c1);
 824         r[4]=c2;
 825         c2=0;
 826         sqr_add_c2(a,3,2,c3,c1,c2);
 827         r[5]=c3;
 828         c3=0;
 829         sqr_add_c(a,3,c1,c2,c3);
 830         r[6]=c1;
 831         r[7]=c2;
 832         }
 833
 834 #ifdef OPENSSL_NO_ASM
 835 #ifdef OPENSSL_BN_ASM_MONT
 836 #include <alloca.h>
 837 /*
 838  * This is essentially reference implementation, which may or may not
 839  * result in performance improvement. E.g. on IA-32 this routine was
 840  * observed to give 40% faster rsa1024 private key operations and 10%
 841  * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
 842  * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
 843  * reference implementation, one to be used as starting point for
 844  * platform-specific assembler. Mentioned numbers apply to compiler
 845  * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
 846  * can vary not only from platform to platform, but even for compiler
 847  * versions. Assembler vs. assembler improvement coefficients can
 848  * [and are known to] differ and are to be documented elsewhere.
 849  */
  850 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
  851         {
      /*
       * Word-serial Montgomery-style multiplication (going by the function
       * name).  rp receives num result words; ap, bp and np are read as
       * num-word arrays and *n0p is a single word used to derive the
       * per-round multiplier ml = tp[0]*n0.  A scratch buffer of num+2 words
       * is taken from alloca() and wiped through the volatile alias vp
       * before every return.  This implementation always returns 1.
       * NOTE(review): num is not validated here; num <= 0 would index
       * tp[num-1] and np[num-1] out of range -- confirm callers guarantee num > 0.
       */
  852         BN_ULONG c0,c1,ml,*tp,n0;
  853 #ifdef mul64
  854         BN_ULONG mh;
  855 #endif
  856         volatile BN_ULONG *vp;
  857         int i=0,j;
  858
  859 #if 0   /* template for platform-specific implementation */
  860         if (ap==bp)     return bn_sqr_mont(rp,ap,np,n0p,num);
  861 #endif
  862         vp = tp = alloca((num+2)*sizeof(BN_ULONG));
  863
  864         n0 = *n0p;
  865
      /* Round 0: tp is initialised with mul() on ap and bp[0] rather than accumulated. */
  866         c0 = 0;
  867         ml = bp[0];
  868 #ifdef mul64
  869         mh = HBITS(ml);
  870         ml = LBITS(ml);
  871         for (j=0;j<num;++j)
  872                 mul(tp[j],ap[j],ml,mh,c0);
  873 #else
  874         for (j=0;j<num;++j)
  875                 mul(tp[j],ap[j],ml,c0);
  876 #endif
  877
  878         tp[num]   = c0;
  879         tp[num+1] = 0;
      /*
       * Jump into the middle of the loop body, skipping the accumulate step
       * for round 0.  The for-initialiser below is never executed; i is 0
       * from its declaration.
       */
  880         goto enter;
  881
  882         for(i=0;i<num;i++)
  883                 {
      /* Accumulate step: mul_add() of ap and bp[i] into tp, carry into tp[num], tp[num+1]. */
  884                 c0 = 0;
  885                 ml = bp[i];
  886 #ifdef mul64
  887                 mh = HBITS(ml);
  888                 ml = LBITS(ml);
  889                 for (j=0;j<num;++j)
  890                         mul_add(tp[j],ap[j],ml,mh,c0);
  891 #else
  892                 for (j=0;j<num;++j)
  893                         mul_add(tp[j],ap[j],ml,c0);
  894 #endif
  895                 c1 = (tp[num] + c0)&BN_MASK2;
  896                 tp[num]   = c1;
  897                 tp[num+1] = (c1<c0?1:0);
  898         enter:
      /*
       * Reduction step: mul_add() of np and ml into tp, storing each
       * updated word one position lower (tp[j-1]); the word computed for
       * j == 0 is not stored.
       */
  899                 c1  = tp[0];
  900                 ml = (c1*n0)&BN_MASK2;
  901                 c0 = 0;
  902 #ifdef mul64
  903                 mh = HBITS(ml);
  904                 ml = LBITS(ml);
  905                 mul_add(c1,np[0],ml,mh,c0);
  906 #else
  907                 mul_add(c1,ml,np[0],c0);
  908 #endif
  909                 for(j=1;j<num;j++)
  910                         {
  911                         c1 = tp[j];
  912 #ifdef mul64
  913                         mul_add(c1,np[j],ml,mh,c0);
  914 #else
  915                         mul_add(c1,ml,np[j],c0);
  916 #endif
  917                         tp[j-1] = c1&BN_MASK2;
  918                         }
  919                 c1        = (tp[num] + c0)&BN_MASK2;
  920                 tp[num-1] = c1;
  921                 tp[num]   = tp[num+1] + (c1<c0?1:0);
  922                 }
  923
      /*
       * Final correction: when the extra top word is set, or the top words
       * suggest tp >= np, compute rp = tp - np.  That result is kept if the
       * extra word was set or the subtraction produced no borrow; otherwise
       * rp is overwritten with tp below.
       */
  924         if (tp[num]!=0 || tp[num-1]>=np[num-1])
  925                 {
  926                 c0 = bn_sub_words(rp,tp,np,num);
  927                 if (tp[num]!=0 || c0==0)
  928                         {
  929                         for(i=0;i<num+2;i++)    vp[i] = 0;
  930                         return 1;
  931                         }
  932                 }
      /* Copy tp out as the result while zeroing the scratch words. */
  933         for(i=0;i<num;i++)      rp[i] = tp[i],  vp[i] = 0;
  934         vp[num]   = 0;
  935         vp[num+1] = 0;
  936         return 1;
  937         }
 938 #else
 939 /*
 940  * Return value of 0 indicates that multiplication/convolution was not
 941  * performed to signal the caller to fall down to alternative/original
 942  * code-path.
 943  */
 944 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
 945 {       return 0;       }
 946 #endif /* OPENSSL_BN_ASM_MONT */
 947 #endif
 948
 949 #else /* !BN_MUL_COMBA */
 950
 951 /* hmm... is it faster just to do a multiply? */
 952 #ifndef OPENSSL_FIPSCANISTER
 953 #undef bn_sqr_comba4
 954 #undef bn_sqr_comba8
 955 #endif
 956 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 957         {
 958         BN_ULONG t[8];
 959         bn_sqr_normal(r,a,4,t);
 960         }
 961
 962 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 963         {
 964         BN_ULONG t[16];
 965         bn_sqr_normal(r,a,8,t);
 966         }
 967
 968 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 969         {
 970         r[4]=bn_mul_words(    &(r[0]),a,4,b[0]);
 971         r[5]=bn_mul_add_words(&(r[1]),a,4,b[1]);
 972         r[6]=bn_mul_add_words(&(r[2]),a,4,b[2]);
 973         r[7]=bn_mul_add_words(&(r[3]),a,4,b[3]);
 974         }
 975
 976 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 977         {
 978         r[ 8]=bn_mul_words(    &(r[0]),a,8,b[0]);
 979         r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
 980         r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
 981         r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
 982         r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
 983         r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
 984         r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
 985         r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
 986         }
 987
 988 #ifdef OPENSSL_NO_ASM
 989 #ifdef OPENSSL_BN_ASM_MONT
 990 #include <alloca.h>
  991 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
  992         {
      /*
       * Word-serial Montgomery-style multiplication (going by the function
       * name) built on bn_mul_add_words().  rp receives num result words;
       * ap, bp and np are read as num-word arrays and *n0p supplies the
       * single word n0.  A scratch buffer of num+2 words is taken from
       * alloca() and wiped through the volatile alias vp before every
       * return.  This implementation always returns 1.
       * NOTE(review): num is not validated here; num <= 0 would index
       * tp[num-1] and np[num-1] out of range -- confirm callers guarantee num > 0.
       */
  993         BN_ULONG c0,c1,*tp,n0=*n0p;
  994         volatile BN_ULONG *vp;
  995         int i=0,j;
  996
  997         vp = tp = alloca((num+2)*sizeof(BN_ULONG));
  998
      /* Clears tp[0..num]; tp[num+1] is assigned inside the loop before it is read. */
  999         for(i=0;i<=num;i++)     tp[i]=0;
 1000
 1001         for(i=0;i<num;i++)
 1002                 {
      /* Accumulate ap*bp[i] into tp; the carry goes into tp[num] and tp[num+1]. */
 1003                 c0         = bn_mul_add_words(tp,ap,num,bp[i]);
 1004                 c1         = (tp[num] + c0)&BN_MASK2;
 1005                 tp[num]    = c1;
 1006                 tp[num+1]  = (c1<c0?1:0);
 1007
      /* Accumulate np*(tp[0]*n0) into tp, then shift tp down by one word. */
 1008                 c0         = bn_mul_add_words(tp,np,num,tp[0]*n0);
 1009                 c1         = (tp[num] + c0)&BN_MASK2;
 1010                 tp[num]    = c1;
 1011                 tp[num+1] += (c1<c0?1:0);
 1012                 for(j=0;j<=num;j++)     tp[j]=tp[j+1];
 1013                 }
 1014
      /*
       * Final correction: when the extra top word is set, or the top words
       * suggest tp >= np, compute rp = tp - np.  That result is kept if the
       * extra word was set or the subtraction produced no borrow; otherwise
       * rp is overwritten with tp below.
       */
 1015         if (tp[num]!=0 || tp[num-1]>=np[num-1])
 1016                 {
 1017                 c0 = bn_sub_words(rp,tp,np,num);
 1018                 if (tp[num]!=0 || c0==0)
 1019                         {
 1020                         for(i=0;i<num+2;i++)    vp[i] = 0;
 1021                         return 1;
 1022                         }
 1023                 }
      /* Copy tp out as the result while zeroing the scratch words. */
 1024         for(i=0;i<num;i++)      rp[i] = tp[i],  vp[i] = 0;
 1025         vp[num]   = 0;
 1026         vp[num+1] = 0;
 1027         return 1;
 1028         }
1029 #else
1030 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
1031 {       return 0;       }
1032 #endif /* OPENSSL_BN_ASM_MONT */
1033 #endif
1034
1035 #endif /* !BN_MUL_COMBA */