crypto/bn/bn_asm.c

   1 /* crypto/bn/bn_asm.c */
   2 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
   3  * All rights reserved.
   4  *
   5  * This package is an SSL implementation written
   6  * by Eric Young (eay@cryptsoft.com).
   7  * The implementation was written so as to conform with Netscapes SSL.
   8  *
   9  * This library is free for commercial and non-commercial use as long as
  10  * the following conditions are aheared to.  The following conditions
  11  * apply to all code found in this distribution, be it the RC4, RSA,
  12  * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
  13  * included with this distribution is covered by the same copyright terms
  14  * except that the holder is Tim Hudson (tjh@cryptsoft.com).
  15  *
  16  * Copyright remains Eric Young's, and as such any Copyright notices in
  17  * the code are not to be removed.
  18  * If this package is used in a product, Eric Young should be given attribution
  19  * as the author of the parts of the library used.
  20  * This can be in the form of a textual message at program startup or
  21  * in documentation (online or textual) provided with the package.
  22  *
  23  * Redistribution and use in source and binary forms, with or without
  24  * modification, are permitted provided that the following conditions
  25  * are met:
  26  * 1. Redistributions of source code must retain the copyright
  27  *    notice, this list of conditions and the following disclaimer.
  28  * 2. Redistributions in binary form must reproduce the above copyright
  29  *    notice, this list of conditions and the following disclaimer in the
  30  *    documentation and/or other materials provided with the distribution.
  31  * 3. All advertising materials mentioning features or use of this software
  32  *    must display the following acknowledgement:
  33  *    "This product includes cryptographic software written by
  34  *     Eric Young (eay@cryptsoft.com)"
  35  *    The word 'cryptographic' can be left out if the rouines from the library
  36  *    being used are not cryptographic related :-).
  37  * 4. If you include any Windows specific code (or a derivative thereof) from
  38  *    the apps directory (application code) you must include an acknowledgement:
  39  *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
  40  *
  41  * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
  42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  51  * SUCH DAMAGE.
  52  *
  53  * The licence and distribution terms for any publically available version or
  54  * derivative of this code cannot be changed.  i.e. this code cannot simply be
  55  * copied and put under another distribution licence
  56  * [including the GNU Public Licence.]
  57  */
  58
  59 #ifndef BN_DEBUG
  60 # undef NDEBUG                  /* avoid conflicting definitions */
  61 # define NDEBUG
  62 #endif
  63
  64 #include <stdio.h>
  65 #include <assert.h>
  66 #include "cryptlib.h"
  67 #include "bn_lcl.h"
  68
  69 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
  70
  71 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
  72                           BN_ULONG w)
  73 {
  74     BN_ULONG c1 = 0;
  75
  76     assert(num >= 0);
  77     if (num <= 0)
  78         return (c1);
  79
  80 # ifndef OPENSSL_SMALL_FOOTPRINT
  81     while (num & ~3) {
  82         mul_add(rp[0], ap[0], w, c1);
  83         mul_add(rp[1], ap[1], w, c1);
  84         mul_add(rp[2], ap[2], w, c1);
  85         mul_add(rp[3], ap[3], w, c1);
  86         ap += 4;
  87         rp += 4;
  88         num -= 4;
  89     }
  90 # endif
  91     while (num) {
  92         mul_add(rp[0], ap[0], w, c1);
  93         ap++;
  94         rp++;
  95         num--;
  96     }
  97
  98     return (c1);
  99 }
 100
 101 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 102 {
 103     BN_ULONG c1 = 0;
 104
 105     assert(num >= 0);
 106     if (num <= 0)
 107         return (c1);
 108
 109 # ifndef OPENSSL_SMALL_FOOTPRINT
 110     while (num & ~3) {
 111         mul(rp[0], ap[0], w, c1);
 112         mul(rp[1], ap[1], w, c1);
 113         mul(rp[2], ap[2], w, c1);
 114         mul(rp[3], ap[3], w, c1);
 115         ap += 4;
 116         rp += 4;
 117         num -= 4;
 118     }
 119 # endif
 120     while (num) {
 121         mul(rp[0], ap[0], w, c1);
 122         ap++;
 123         rp++;
 124         num--;
 125     }
 126     return (c1);
 127 }
 128
 129 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
 130 {
 131     assert(n >= 0);
 132     if (n <= 0)
 133         return;
 134
 135 # ifndef OPENSSL_SMALL_FOOTPRINT
 136     while (n & ~3) {
 137         sqr(r[0], r[1], a[0]);
 138         sqr(r[2], r[3], a[1]);
 139         sqr(r[4], r[5], a[2]);
 140         sqr(r[6], r[7], a[3]);
 141         a += 4;
 142         r += 8;
 143         n -= 4;
 144     }
 145 # endif
 146     while (n) {
 147         sqr(r[0], r[1], a[0]);
 148         a++;
 149         r += 2;
 150         n--;
 151     }
 152 }
 153
 154 #else                           /* !(defined(BN_LLONG) ||
 155                                  * defined(BN_UMULT_HIGH)) */
 156
 157 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
 158                           BN_ULONG w)
 159 {
 160     BN_ULONG c = 0;
 161     BN_ULONG bl, bh;
 162
 163     assert(num >= 0);
 164     if (num <= 0)
 165         return ((BN_ULONG)0);
 166
 167     bl = LBITS(w);
 168     bh = HBITS(w);
 169
 170 # ifndef OPENSSL_SMALL_FOOTPRINT
 171     while (num & ~3) {
 172         mul_add(rp[0], ap[0], bl, bh, c);
 173         mul_add(rp[1], ap[1], bl, bh, c);
 174         mul_add(rp[2], ap[2], bl, bh, c);
 175         mul_add(rp[3], ap[3], bl, bh, c);
 176         ap += 4;
 177         rp += 4;
 178         num -= 4;
 179     }
 180 # endif
 181     while (num) {
 182         mul_add(rp[0], ap[0], bl, bh, c);
 183         ap++;
 184         rp++;
 185         num--;
 186     }
 187     return (c);
 188 }
 189
 190 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 191 {
 192     BN_ULONG carry = 0;
 193     BN_ULONG bl, bh;
 194
 195     assert(num >= 0);
 196     if (num <= 0)
 197         return ((BN_ULONG)0);
 198
 199     bl = LBITS(w);
 200     bh = HBITS(w);
 201
 202 # ifndef OPENSSL_SMALL_FOOTPRINT
 203     while (num & ~3) {
 204         mul(rp[0], ap[0], bl, bh, carry);
 205         mul(rp[1], ap[1], bl, bh, carry);
 206         mul(rp[2], ap[2], bl, bh, carry);
 207         mul(rp[3], ap[3], bl, bh, carry);
 208         ap += 4;
 209         rp += 4;
 210         num -= 4;
 211     }
 212 # endif
 213     while (num) {
 214         mul(rp[0], ap[0], bl, bh, carry);
 215         ap++;
 216         rp++;
 217         num--;
 218     }
 219     return (carry);
 220 }
 221
 222 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
 223 {
 224     assert(n >= 0);
 225     if (n <= 0)
 226         return;
 227
 228 # ifndef OPENSSL_SMALL_FOOTPRINT
 229     while (n & ~3) {
 230         sqr64(r[0], r[1], a[0]);
 231         sqr64(r[2], r[3], a[1]);
 232         sqr64(r[4], r[5], a[2]);
 233         sqr64(r[6], r[7], a[3]);
 234         a += 4;
 235         r += 8;
 236         n -= 4;
 237     }
 238 # endif
 239     while (n) {
 240         sqr64(r[0], r[1], a[0]);
 241         a++;
 242         r += 2;
 243         n--;
 244     }
 245 }
 246
 247 #endif                          /* !(defined(BN_LLONG) ||
 248                                  * defined(BN_UMULT_HIGH)) */
 249
 250 #if defined(BN_LLONG) && defined(BN_DIV2W)
 251
 252 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 253 {
 254     return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d));
 255 }
 256
 257 #else
 258
 259 /* Divide h,l by d and return the result. */
 260 /* I need to test this some more :-( */
 261 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 262 {
 263     BN_ULONG dh, dl, q, ret = 0, th, tl, t;
 264     int i, count = 2;
 265
 266     if (d == 0)
 267         return (BN_MASK2);
 268
 269     i = BN_num_bits_word(d);
 270     assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
 271
 272     i = BN_BITS2 - i;
 273     if (h >= d)
 274         h -= d;
 275
 276     if (i) {
 277         d <<= i;
 278         h = (h << i) | (l >> (BN_BITS2 - i));
 279         l <<= i;
 280     }
 281     dh = (d & BN_MASK2h) >> BN_BITS4;
 282     dl = (d & BN_MASK2l);
 283     for (;;) {
 284         if ((h >> BN_BITS4) == dh)
 285             q = BN_MASK2l;
 286         else
 287             q = h / dh;
 288
 289         th = q * dh;
 290         tl = dl * q;
 291         for (;;) {
 292             t = h - th;
 293             if ((t & BN_MASK2h) ||
 294                 ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4))))
 295                 break;
 296             q--;
 297             th -= dh;
 298             tl -= dl;
 299         }
 300         t = (tl >> BN_BITS4);
 301         tl = (tl << BN_BITS4) & BN_MASK2h;
 302         th += t;
 303
 304         if (l < tl)
 305             th++;
 306         l -= tl;
 307         if (h < th) {
 308             h += d;
 309             q--;
 310         }
 311         h -= th;
 312
 313         if (--count == 0)
 314             break;
 315
 316         ret = q << BN_BITS4;
 317         h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
 318         l = (l & BN_MASK2l) << BN_BITS4;
 319     }
 320     ret |= q;
 321     return (ret);
 322 }
 323 #endif                          /* !defined(BN_LLONG) && defined(BN_DIV2W) */
 324
 325 #ifdef BN_LLONG
 326 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 327                       int n)
 328 {
 329     BN_ULLONG ll = 0;
 330
 331     assert(n >= 0);
 332     if (n <= 0)
 333         return ((BN_ULONG)0);
 334
 335 # ifndef OPENSSL_SMALL_FOOTPRINT
 336     while (n & ~3) {
 337         ll += (BN_ULLONG) a[0] + b[0];
 338         r[0] = (BN_ULONG)ll & BN_MASK2;
 339         ll >>= BN_BITS2;
 340         ll += (BN_ULLONG) a[1] + b[1];
 341         r[1] = (BN_ULONG)ll & BN_MASK2;
 342         ll >>= BN_BITS2;
 343         ll += (BN_ULLONG) a[2] + b[2];
 344         r[2] = (BN_ULONG)ll & BN_MASK2;
 345         ll >>= BN_BITS2;
 346         ll += (BN_ULLONG) a[3] + b[3];
 347         r[3] = (BN_ULONG)ll & BN_MASK2;
 348         ll >>= BN_BITS2;
 349         a += 4;
 350         b += 4;
 351         r += 4;
 352         n -= 4;
 353     }
 354 # endif
 355     while (n) {
 356         ll += (BN_ULLONG) a[0] + b[0];
 357         r[0] = (BN_ULONG)ll & BN_MASK2;
 358         ll >>= BN_BITS2;
 359         a++;
 360         b++;
 361         r++;
 362         n--;
 363     }
 364     return ((BN_ULONG)ll);
 365 }
 366 #else                           /* !BN_LLONG */
 367 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 368                       int n)
 369 {
 370     BN_ULONG c, l, t;
 371
 372     assert(n >= 0);
 373     if (n <= 0)
 374         return ((BN_ULONG)0);
 375
 376     c = 0;
 377 # ifndef OPENSSL_SMALL_FOOTPRINT
 378     while (n & ~3) {
 379         t = a[0];
 380         t = (t + c) & BN_MASK2;
 381         c = (t < c);
 382         l = (t + b[0]) & BN_MASK2;
 383         c += (l < t);
 384         r[0] = l;
 385         t = a[1];
 386         t = (t + c) & BN_MASK2;
 387         c = (t < c);
 388         l = (t + b[1]) & BN_MASK2;
 389         c += (l < t);
 390         r[1] = l;
 391         t = a[2];
 392         t = (t + c) & BN_MASK2;
 393         c = (t < c);
 394         l = (t + b[2]) & BN_MASK2;
 395         c += (l < t);
 396         r[2] = l;
 397         t = a[3];
 398         t = (t + c) & BN_MASK2;
 399         c = (t < c);
 400         l = (t + b[3]) & BN_MASK2;
 401         c += (l < t);
 402         r[3] = l;
 403         a += 4;
 404         b += 4;
 405         r += 4;
 406         n -= 4;
 407     }
 408 # endif
 409     while (n) {
 410         t = a[0];
 411         t = (t + c) & BN_MASK2;
 412         c = (t < c);
 413         l = (t + b[0]) & BN_MASK2;
 414         c += (l < t);
 415         r[0] = l;
 416         a++;
 417         b++;
 418         r++;
 419         n--;
 420     }
 421     return ((BN_ULONG)c);
 422 }
 423 #endif                          /* !BN_LLONG */
 424
 425 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 426                       int n)
 427 {
 428     BN_ULONG t1, t2;
 429     int c = 0;
 430
 431     assert(n >= 0);
 432     if (n <= 0)
 433         return ((BN_ULONG)0);
 434
 435 #ifndef OPENSSL_SMALL_FOOTPRINT
 436     while (n & ~3) {
 437         t1 = a[0];
 438         t2 = b[0];
 439         r[0] = (t1 - t2 - c) & BN_MASK2;
 440         if (t1 != t2)
 441             c = (t1 < t2);
 442         t1 = a[1];
 443         t2 = b[1];
 444         r[1] = (t1 - t2 - c) & BN_MASK2;
 445         if (t1 != t2)
 446             c = (t1 < t2);
 447         t1 = a[2];
 448         t2 = b[2];
 449         r[2] = (t1 - t2 - c) & BN_MASK2;
 450         if (t1 != t2)
 451             c = (t1 < t2);
 452         t1 = a[3];
 453         t2 = b[3];
 454         r[3] = (t1 - t2 - c) & BN_MASK2;
 455         if (t1 != t2)
 456             c = (t1 < t2);
 457         a += 4;
 458         b += 4;
 459         r += 4;
 460         n -= 4;
 461     }
 462 #endif
 463     while (n) {
 464         t1 = a[0];
 465         t2 = b[0];
 466         r[0] = (t1 - t2 - c) & BN_MASK2;
 467         if (t1 != t2)
 468             c = (t1 < t2);
 469         a++;
 470         b++;
 471         r++;
 472         n--;
 473     }
 474     return (c);
 475 }
 476
 477 #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
 478
 479 # undef bn_mul_comba8
 480 # undef bn_mul_comba4
 481 # undef bn_sqr_comba8
 482 # undef bn_sqr_comba4
 483
 484 /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
 485 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
 486 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
 487 /*
 488  * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 489  * c=(c2,c1,c0)
 490  */
 491
 492 # ifdef BN_LLONG
 493 /*
 494  * Keep in mind that additions to multiplication result can not
 495  * overflow, because its high half cannot be all-ones.
 496  */
 497 #  define mul_add_c(a,b,c0,c1,c2)       do {    \
 498         BN_ULONG hi;                            \
 499         BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
 500         t += c0;                /* no carry */  \
 501         c0 = (BN_ULONG)Lw(t);                   \
 502         hi = (BN_ULONG)Hw(t);                   \
 503         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 504         } while(0)
 505
 506 #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
 507         BN_ULONG hi;                            \
 508         BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
 509         BN_ULLONG tt = t+c0;    /* no carry */  \
 510         c0 = (BN_ULONG)Lw(tt);                  \
 511         hi = (BN_ULONG)Hw(tt);                  \
 512         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 513         t += c0;                /* no carry */  \
 514         c0 = (BN_ULONG)Lw(t);                   \
 515         hi = (BN_ULONG)Hw(t);                   \
 516         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 517         } while(0)
 518
 519 #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
 520         BN_ULONG hi;                            \
 521         BN_ULLONG t = (BN_ULLONG)a[i]*a[i];     \
 522         t += c0;                /* no carry */  \
 523         c0 = (BN_ULONG)Lw(t);                   \
 524         hi = (BN_ULONG)Hw(t);                   \
 525         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 526         } while(0)
 527
 528 #  define sqr_add_c2(a,i,j,c0,c1,c2) \
 529         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 530
 531 # elif defined(BN_UMULT_LOHI)
 532 /*
 533  * Keep in mind that additions to hi can not overflow, because
 534  * the high word of a multiplication result cannot be all-ones.
 535  */
 536 #  define mul_add_c(a,b,c0,c1,c2)       do {    \
 537         BN_ULONG ta = (a), tb = (b);            \
 538         BN_ULONG lo, hi;                        \
 539         BN_UMULT_LOHI(lo,hi,ta,tb);             \
 540         c0 += lo; hi += (c0<lo)?1:0;            \
 541         c1 += hi; c2 += (c1<hi)?1:0;            \
 542         } while(0)
 543
 544 #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
 545         BN_ULONG ta = (a), tb = (b);            \
 546         BN_ULONG lo, hi, tt;                    \
 547         BN_UMULT_LOHI(lo,hi,ta,tb);             \
 548         c0 += lo; tt = hi+((c0<lo)?1:0);        \
 549         c1 += tt; c2 += (c1<tt)?1:0;            \
 550         c0 += lo; hi += (c0<lo)?1:0;            \
 551         c1 += hi; c2 += (c1<hi)?1:0;            \
 552         } while(0)
 553
 554 #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
 555         BN_ULONG ta = (a)[i];                   \
 556         BN_ULONG lo, hi;                        \
 557         BN_UMULT_LOHI(lo,hi,ta,ta);             \
 558         c0 += lo; hi += (c0<lo)?1:0;            \
 559         c1 += hi; c2 += (c1<hi)?1:0;            \
 560         } while(0)
 561
 562 #  define sqr_add_c2(a,i,j,c0,c1,c2)    \
 563         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 564
 565 # elif defined(BN_UMULT_HIGH)
 566 /*
 567  * Keep in mind that additions to hi can not overflow, because
 568  * the high word of a multiplication result cannot be all-ones.
 569  */
 570 #  define mul_add_c(a,b,c0,c1,c2)       do {    \
 571         BN_ULONG ta = (a), tb = (b);            \
 572         BN_ULONG lo = ta * tb;                  \
 573         BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
 574         c0 += lo; hi += (c0<lo)?1:0;            \
 575         c1 += hi; c2 += (c1<hi)?1:0;            \
 576         } while(0)
 577
 578 #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
 579         BN_ULONG ta = (a), tb = (b), tt;        \
 580         BN_ULONG lo = ta * tb;                  \
 581         BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
 582         c0 += lo; tt = hi + ((c0<lo)?1:0);      \
 583         c1 += tt; c2 += (c1<tt)?1:0;            \
 584         c0 += lo; hi += (c0<lo)?1:0;            \
 585         c1 += hi; c2 += (c1<hi)?1:0;            \
 586         } while(0)
 587
 588 #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
 589         BN_ULONG ta = (a)[i];                   \
 590         BN_ULONG lo = ta * ta;                  \
 591         BN_ULONG hi = BN_UMULT_HIGH(ta,ta);     \
 592         c0 += lo; hi += (c0<lo)?1:0;            \
 593         c1 += hi; c2 += (c1<hi)?1:0;            \
 594         } while(0)
 595
 596 #  define sqr_add_c2(a,i,j,c0,c1,c2)      \
 597         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 598
 599 # else                          /* !BN_LLONG */
 600 /*
 601  * Keep in mind that additions to hi can not overflow, because
 602  * the high word of a multiplication result cannot be all-ones.
 603  */
 604 #  define mul_add_c(a,b,c0,c1,c2)       do {    \
 605         BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
 606         BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
 607         mul64(lo,hi,bl,bh);                     \
 608         c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
 609         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 610         } while(0)
 611
 612 #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
 613         BN_ULONG tt;                            \
 614         BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
 615         BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
 616         mul64(lo,hi,bl,bh);                     \
 617         tt = hi;                                \
 618         c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \
 619         c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \
 620         c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
 621         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 622         } while(0)
 623
 624 #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
 625         BN_ULONG lo, hi;                        \
 626         sqr64(lo,hi,(a)[i]);                    \
 627         c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
 628         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 629         } while(0)
 630
 631 #  define sqr_add_c2(a,i,j,c0,c1,c2) \
 632         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 633 # endif                         /* !BN_LLONG */
 634
 635 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 636 {
 637     BN_ULONG c1, c2, c3;
 638
 639     c1 = 0;
 640     c2 = 0;
 641     c3 = 0;
 642     mul_add_c(a[0], b[0], c1, c2, c3);
 643     r[0] = c1;
 644     c1 = 0;
 645     mul_add_c(a[0], b[1], c2, c3, c1);
 646     mul_add_c(a[1], b[0], c2, c3, c1);
 647     r[1] = c2;
 648     c2 = 0;
 649     mul_add_c(a[2], b[0], c3, c1, c2);
 650     mul_add_c(a[1], b[1], c3, c1, c2);
 651     mul_add_c(a[0], b[2], c3, c1, c2);
 652     r[2] = c3;
 653     c3 = 0;
 654     mul_add_c(a[0], b[3], c1, c2, c3);
 655     mul_add_c(a[1], b[2], c1, c2, c3);
 656     mul_add_c(a[2], b[1], c1, c2, c3);
 657     mul_add_c(a[3], b[0], c1, c2, c3);
 658     r[3] = c1;
 659     c1 = 0;
 660     mul_add_c(a[4], b[0], c2, c3, c1);
 661     mul_add_c(a[3], b[1], c2, c3, c1);
 662     mul_add_c(a[2], b[2], c2, c3, c1);
 663     mul_add_c(a[1], b[3], c2, c3, c1);
 664     mul_add_c(a[0], b[4], c2, c3, c1);
 665     r[4] = c2;
 666     c2 = 0;
 667     mul_add_c(a[0], b[5], c3, c1, c2);
 668     mul_add_c(a[1], b[4], c3, c1, c2);
 669     mul_add_c(a[2], b[3], c3, c1, c2);
 670     mul_add_c(a[3], b[2], c3, c1, c2);
 671     mul_add_c(a[4], b[1], c3, c1, c2);
 672     mul_add_c(a[5], b[0], c3, c1, c2);
 673     r[5] = c3;
 674     c3 = 0;
 675     mul_add_c(a[6], b[0], c1, c2, c3);
 676     mul_add_c(a[5], b[1], c1, c2, c3);
 677     mul_add_c(a[4], b[2], c1, c2, c3);
 678     mul_add_c(a[3], b[3], c1, c2, c3);
 679     mul_add_c(a[2], b[4], c1, c2, c3);
 680     mul_add_c(a[1], b[5], c1, c2, c3);
 681     mul_add_c(a[0], b[6], c1, c2, c3);
 682     r[6] = c1;
 683     c1 = 0;
 684     mul_add_c(a[0], b[7], c2, c3, c1);
 685     mul_add_c(a[1], b[6], c2, c3, c1);
 686     mul_add_c(a[2], b[5], c2, c3, c1);
 687     mul_add_c(a[3], b[4], c2, c3, c1);
 688     mul_add_c(a[4], b[3], c2, c3, c1);
 689     mul_add_c(a[5], b[2], c2, c3, c1);
 690     mul_add_c(a[6], b[1], c2, c3, c1);
 691     mul_add_c(a[7], b[0], c2, c3, c1);
 692     r[7] = c2;
 693     c2 = 0;
 694     mul_add_c(a[7], b[1], c3, c1, c2);
 695     mul_add_c(a[6], b[2], c3, c1, c2);
 696     mul_add_c(a[5], b[3], c3, c1, c2);
 697     mul_add_c(a[4], b[4], c3, c1, c2);
 698     mul_add_c(a[3], b[5], c3, c1, c2);
 699     mul_add_c(a[2], b[6], c3, c1, c2);
 700     mul_add_c(a[1], b[7], c3, c1, c2);
 701     r[8] = c3;
 702     c3 = 0;
 703     mul_add_c(a[2], b[7], c1, c2, c3);
 704     mul_add_c(a[3], b[6], c1, c2, c3);
 705     mul_add_c(a[4], b[5], c1, c2, c3);
 706     mul_add_c(a[5], b[4], c1, c2, c3);
 707     mul_add_c(a[6], b[3], c1, c2, c3);
 708     mul_add_c(a[7], b[2], c1, c2, c3);
 709     r[9] = c1;
 710     c1 = 0;
 711     mul_add_c(a[7], b[3], c2, c3, c1);
 712     mul_add_c(a[6], b[4], c2, c3, c1);
 713     mul_add_c(a[5], b[5], c2, c3, c1);
 714     mul_add_c(a[4], b[6], c2, c3, c1);
 715     mul_add_c(a[3], b[7], c2, c3, c1);
 716     r[10] = c2;
 717     c2 = 0;
 718     mul_add_c(a[4], b[7], c3, c1, c2);
 719     mul_add_c(a[5], b[6], c3, c1, c2);
 720     mul_add_c(a[6], b[5], c3, c1, c2);
 721     mul_add_c(a[7], b[4], c3, c1, c2);
 722     r[11] = c3;
 723     c3 = 0;
 724     mul_add_c(a[7], b[5], c1, c2, c3);
 725     mul_add_c(a[6], b[6], c1, c2, c3);
 726     mul_add_c(a[5], b[7], c1, c2, c3);
 727     r[12] = c1;
 728     c1 = 0;
 729     mul_add_c(a[6], b[7], c2, c3, c1);
 730     mul_add_c(a[7], b[6], c2, c3, c1);
 731     r[13] = c2;
 732     c2 = 0;
 733     mul_add_c(a[7], b[7], c3, c1, c2);
 734     r[14] = c3;
 735     r[15] = c1;
 736 }
 737
 738 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 739 {
 740     BN_ULONG c1, c2, c3;
 741
 742     c1 = 0;
 743     c2 = 0;
 744     c3 = 0;
 745     mul_add_c(a[0], b[0], c1, c2, c3);
 746     r[0] = c1;
 747     c1 = 0;
 748     mul_add_c(a[0], b[1], c2, c3, c1);
 749     mul_add_c(a[1], b[0], c2, c3, c1);
 750     r[1] = c2;
 751     c2 = 0;
 752     mul_add_c(a[2], b[0], c3, c1, c2);
 753     mul_add_c(a[1], b[1], c3, c1, c2);
 754     mul_add_c(a[0], b[2], c3, c1, c2);
 755     r[2] = c3;
 756     c3 = 0;
 757     mul_add_c(a[0], b[3], c1, c2, c3);
 758     mul_add_c(a[1], b[2], c1, c2, c3);
 759     mul_add_c(a[2], b[1], c1, c2, c3);
 760     mul_add_c(a[3], b[0], c1, c2, c3);
 761     r[3] = c1;
 762     c1 = 0;
 763     mul_add_c(a[3], b[1], c2, c3, c1);
 764     mul_add_c(a[2], b[2], c2, c3, c1);
 765     mul_add_c(a[1], b[3], c2, c3, c1);
 766     r[4] = c2;
 767     c2 = 0;
 768     mul_add_c(a[2], b[3], c3, c1, c2);
 769     mul_add_c(a[3], b[2], c3, c1, c2);
 770     r[5] = c3;
 771     c3 = 0;
 772     mul_add_c(a[3], b[3], c1, c2, c3);
 773     r[6] = c1;
 774     r[7] = c2;
 775 }
 776
 777 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 778 {
 779     BN_ULONG c1, c2, c3;
 780
 781     c1 = 0;
 782     c2 = 0;
 783     c3 = 0;
 784     sqr_add_c(a, 0, c1, c2, c3);
 785     r[0] = c1;
 786     c1 = 0;
 787     sqr_add_c2(a, 1, 0, c2, c3, c1);
 788     r[1] = c2;
 789     c2 = 0;
 790     sqr_add_c(a, 1, c3, c1, c2);
 791     sqr_add_c2(a, 2, 0, c3, c1, c2);
 792     r[2] = c3;
 793     c3 = 0;
 794     sqr_add_c2(a, 3, 0, c1, c2, c3);
 795     sqr_add_c2(a, 2, 1, c1, c2, c3);
 796     r[3] = c1;
 797     c1 = 0;
 798     sqr_add_c(a, 2, c2, c3, c1);
 799     sqr_add_c2(a, 3, 1, c2, c3, c1);
 800     sqr_add_c2(a, 4, 0, c2, c3, c1);
 801     r[4] = c2;
 802     c2 = 0;
 803     sqr_add_c2(a, 5, 0, c3, c1, c2);
 804     sqr_add_c2(a, 4, 1, c3, c1, c2);
 805     sqr_add_c2(a, 3, 2, c3, c1, c2);
 806     r[5] = c3;
 807     c3 = 0;
 808     sqr_add_c(a, 3, c1, c2, c3);
 809     sqr_add_c2(a, 4, 2, c1, c2, c3);
 810     sqr_add_c2(a, 5, 1, c1, c2, c3);
 811     sqr_add_c2(a, 6, 0, c1, c2, c3);
 812     r[6] = c1;
 813     c1 = 0;
 814     sqr_add_c2(a, 7, 0, c2, c3, c1);
 815     sqr_add_c2(a, 6, 1, c2, c3, c1);
 816     sqr_add_c2(a, 5, 2, c2, c3, c1);
 817     sqr_add_c2(a, 4, 3, c2, c3, c1);
 818     r[7] = c2;
 819     c2 = 0;
 820     sqr_add_c(a, 4, c3, c1, c2);
 821     sqr_add_c2(a, 5, 3, c3, c1, c2);
 822     sqr_add_c2(a, 6, 2, c3, c1, c2);
 823     sqr_add_c2(a, 7, 1, c3, c1, c2);
 824     r[8] = c3;
 825     c3 = 0;
 826     sqr_add_c2(a, 7, 2, c1, c2, c3);
 827     sqr_add_c2(a, 6, 3, c1, c2, c3);
 828     sqr_add_c2(a, 5, 4, c1, c2, c3);
 829     r[9] = c1;
 830     c1 = 0;
 831     sqr_add_c(a, 5, c2, c3, c1);
 832     sqr_add_c2(a, 6, 4, c2, c3, c1);
 833     sqr_add_c2(a, 7, 3, c2, c3, c1);
 834     r[10] = c2;
 835     c2 = 0;
 836     sqr_add_c2(a, 7, 4, c3, c1, c2);
 837     sqr_add_c2(a, 6, 5, c3, c1, c2);
 838     r[11] = c3;
 839     c3 = 0;
 840     sqr_add_c(a, 6, c1, c2, c3);
 841     sqr_add_c2(a, 7, 5, c1, c2, c3);
 842     r[12] = c1;
 843     c1 = 0;
 844     sqr_add_c2(a, 7, 6, c2, c3, c1);
 845     r[13] = c2;
 846     c2 = 0;
 847     sqr_add_c(a, 7, c3, c1, c2);
 848     r[14] = c3;
 849     r[15] = c1;
 850 }
 851
 852 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 853 {
 854     BN_ULONG c1, c2, c3;
 855
 856     c1 = 0;
 857     c2 = 0;
 858     c3 = 0;
 859     sqr_add_c(a, 0, c1, c2, c3);
 860     r[0] = c1;
 861     c1 = 0;
 862     sqr_add_c2(a, 1, 0, c2, c3, c1);
 863     r[1] = c2;
 864     c2 = 0;
 865     sqr_add_c(a, 1, c3, c1, c2);
 866     sqr_add_c2(a, 2, 0, c3, c1, c2);
 867     r[2] = c3;
 868     c3 = 0;
 869     sqr_add_c2(a, 3, 0, c1, c2, c3);
 870     sqr_add_c2(a, 2, 1, c1, c2, c3);
 871     r[3] = c1;
 872     c1 = 0;
 873     sqr_add_c(a, 2, c2, c3, c1);
 874     sqr_add_c2(a, 3, 1, c2, c3, c1);
 875     r[4] = c2;
 876     c2 = 0;
 877     sqr_add_c2(a, 3, 2, c3, c1, c2);
 878     r[5] = c3;
 879     c3 = 0;
 880     sqr_add_c(a, 3, c1, c2, c3);
 881     r[6] = c1;
 882     r[7] = c2;
 883 }
 884
 885 # ifdef OPENSSL_NO_ASM
 886 #  ifdef OPENSSL_BN_ASM_MONT
 887 #   include <alloca.h>
 888 /*
 889  * This is essentially reference implementation, which may or may not
 890  * result in performance improvement. E.g. on IA-32 this routine was
 891  * observed to give 40% faster rsa1024 private key operations and 10%
 892  * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
 893  * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
 894  * reference implementation, one to be used as starting point for
 895  * platform-specific assembler. Mentioned numbers apply to compiler
 896  * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
 897  * can vary not only from platform to platform, but even for compiler
 898  * versions. Assembler vs. assembler improvement coefficients can
 899  * [and are known to] differ and are to be documented elsewhere.
 900  */
 901 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 902                 const BN_ULONG *np, const BN_ULONG *n0p, int num)
 903 {
 904     BN_ULONG c0, c1, ml, *tp, n0;
 905 #   ifdef mul64
 906     BN_ULONG mh;
 907 #   endif
 908     volatile BN_ULONG *vp;
 909     int i = 0, j;
 910
 911 #   if 0                        /* template for platform-specific
 912                                  * implementation */
 913     if (ap == bp)
 914         return bn_sqr_mont(rp, ap, np, n0p, num);
 915 #   endif
 916     vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
 917
 918     n0 = *n0p;
 919
 920     c0 = 0;
 921     ml = bp[0];
 922 #   ifdef mul64
 923     mh = HBITS(ml);
 924     ml = LBITS(ml);
 925     for (j = 0; j < num; ++j)
 926         mul(tp[j], ap[j], ml, mh, c0);
 927 #   else
 928     for (j = 0; j < num; ++j)
 929         mul(tp[j], ap[j], ml, c0);
 930 #   endif
 931
 932     tp[num] = c0;
 933     tp[num + 1] = 0;
 934     goto enter;
 935
 936     for (i = 0; i < num; i++) {
 937         c0 = 0;
 938         ml = bp[i];
 939 #   ifdef mul64
 940         mh = HBITS(ml);
 941         ml = LBITS(ml);
 942         for (j = 0; j < num; ++j)
 943             mul_add(tp[j], ap[j], ml, mh, c0);
 944 #   else
 945         for (j = 0; j < num; ++j)
 946             mul_add(tp[j], ap[j], ml, c0);
 947 #   endif
 948         c1 = (tp[num] + c0) & BN_MASK2;
 949         tp[num] = c1;
 950         tp[num + 1] = (c1 < c0 ? 1 : 0);
 951  enter:
 952         c1 = tp[0];
 953         ml = (c1 * n0) & BN_MASK2;
 954         c0 = 0;
 955 #   ifdef mul64
 956         mh = HBITS(ml);
 957         ml = LBITS(ml);
 958         mul_add(c1, np[0], ml, mh, c0);
 959 #   else
 960         mul_add(c1, ml, np[0], c0);
 961 #   endif
 962         for (j = 1; j < num; j++) {
 963             c1 = tp[j];
 964 #   ifdef mul64
 965             mul_add(c1, np[j], ml, mh, c0);
 966 #   else
 967             mul_add(c1, ml, np[j], c0);
 968 #   endif
 969             tp[j - 1] = c1 & BN_MASK2;
 970         }
 971         c1 = (tp[num] + c0) & BN_MASK2;
 972         tp[num - 1] = c1;
 973         tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
 974     }
 975
 976     if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
 977         c0 = bn_sub_words(rp, tp, np, num);
 978         if (tp[num] != 0 || c0 == 0) {
 979             for (i = 0; i < num + 2; i++)
 980                 vp[i] = 0;
 981             return 1;
 982         }
 983     }
 984     for (i = 0; i < num; i++)
 985         rp[i] = tp[i], vp[i] = 0;
 986     vp[num] = 0;
 987     vp[num + 1] = 0;
 988     return 1;
 989 }
 990 #  else
 991 /*
 992  * Return value of 0 indicates that multiplication/convolution was not
 993  * performed to signal the caller to fall down to alternative/original
 994  * code-path.
 995  */
 996 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 997                 const BN_ULONG *np, const BN_ULONG *n0, int num)
 998 {
 999     return 0;
1000 }
1001 #  endif                        /* OPENSSL_BN_ASM_MONT */
1002 # endif
1003
1004 #else                           /* !BN_MUL_COMBA */
1005
1006 /* hmm... is it faster just to do a multiply? */
1007 # undef bn_sqr_comba4
1008 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
1009 {
1010     BN_ULONG t[8];
1011     bn_sqr_normal(r, a, 4, t);
1012 }
1013
1014 # undef bn_sqr_comba8
1015 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
1016 {
1017     BN_ULONG t[16];
1018     bn_sqr_normal(r, a, 8, t);
1019 }
1020
1021 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1022 {
1023     r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
1024     r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
1025     r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
1026     r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
1027 }
1028
1029 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1030 {
1031     r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
1032     r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
1033     r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
1034     r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
1035     r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
1036     r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
1037     r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
1038     r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
1039 }
1040
1041 # ifdef OPENSSL_NO_ASM
1042 #  ifdef OPENSSL_BN_ASM_MONT
1043 #   include <alloca.h>
1044 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1045                 const BN_ULONG *np, const BN_ULONG *n0p, int num)
1046 {
1047     BN_ULONG c0, c1, *tp, n0 = *n0p;
1048     volatile BN_ULONG *vp;
1049     int i = 0, j;
1050
1051     vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
1052
1053     for (i = 0; i <= num; i++)
1054         tp[i] = 0;
1055
1056     for (i = 0; i < num; i++) {
1057         c0 = bn_mul_add_words(tp, ap, num, bp[i]);
1058         c1 = (tp[num] + c0) & BN_MASK2;
1059         tp[num] = c1;
1060         tp[num + 1] = (c1 < c0 ? 1 : 0);
1061
1062         c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
1063         c1 = (tp[num] + c0) & BN_MASK2;
1064         tp[num] = c1;
1065         tp[num + 1] += (c1 < c0 ? 1 : 0);
1066         for (j = 0; j <= num; j++)
1067             tp[j] = tp[j + 1];
1068     }
1069
1070     if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
1071         c0 = bn_sub_words(rp, tp, np, num);
1072         if (tp[num] != 0 || c0 == 0) {
1073             for (i = 0; i < num + 2; i++)
1074                 vp[i] = 0;
1075             return 1;
1076         }
1077     }
1078     for (i = 0; i < num; i++)
1079         rp[i] = tp[i], vp[i] = 0;
1080     vp[num] = 0;
1081     vp[num + 1] = 0;
1082     return 1;
1083 }
1084 #  else
1085 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1086                 const BN_ULONG *np, const BN_ULONG *n0, int num)
1087 {
1088     return 0;
1089 }
1090 #  endif                        /* OPENSSL_BN_ASM_MONT */
1091 # endif
1092
1093 #endif                          /* !BN_MUL_COMBA */