/*
 * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the OpenSSL license (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */

#include <assert.h>
#include <openssl/crypto.h>
#include "internal/cryptlib.h"
#include "bn_lcl.h"

#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)

BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w)
{
    BN_ULONG c1 = 0;

    assert(num >= 0);
    if (num <= 0)
        return c1;

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (num & ~3) {
        mul_add(rp[0], ap[0], w, c1);
        mul_add(rp[1], ap[1], w, c1);
        mul_add(rp[2], ap[2], w, c1);
        mul_add(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    while (num) {
        mul_add(rp[0], ap[0], w, c1);
        ap++;
        rp++;
        num--;
    }

    return c1;
}

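/*
 * Illustrative sketch (not from the original file): bn_mul_add_words()
 * implements the word-by-word recurrence {carry, rp[i]} = rp[i] + ap[i] * w
 * + carry and returns the final carry. The same operation in plain C,
 * assuming a 64-bit word and a compiler with unsigned __int128; the helper
 * name is hypothetical and the block is not compiled:
 */
#if 0
static unsigned long long mul_add_words_sketch(unsigned long long *rp,
                                               const unsigned long long *ap,
                                               int num, unsigned long long w)
{
    unsigned long long carry = 0;
    int i;

    for (i = 0; i < num; i++) {
        /* one 64x64->128 multiply-accumulate per word */
        unsigned __int128 t = (unsigned __int128)ap[i] * w + rp[i] + carry;
        rp[i] = (unsigned long long)t;          /* low word */
        carry = (unsigned long long)(t >> 64);  /* high word */
    }
    return carry;
}
#endif
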
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG c1 = 0;

    assert(num >= 0);
    if (num <= 0)
        return c1;

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (num & ~3) {
        mul(rp[0], ap[0], w, c1);
        mul(rp[1], ap[1], w, c1);
        mul(rp[2], ap[2], w, c1);
        mul(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    while (num) {
        mul(rp[0], ap[0], w, c1);
        ap++;
        rp++;
        num--;
    }
    return c1;
}

void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    assert(n >= 0);
    if (n <= 0)
        return;

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (n & ~3) {
        sqr(r[0], r[1], a[0]);
        sqr(r[2], r[3], a[1]);
        sqr(r[4], r[5], a[2]);
        sqr(r[6], r[7], a[3]);
        a += 4;
        r += 8;
        n -= 4;
    }
# endif
    while (n) {
        sqr(r[0], r[1], a[0]);
        a++;
        r += 2;
        n--;
    }
}

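/*
 * Illustrative note (not from the original file): bn_sqr_words() writes the
 * full double-width square of each input word, i.e. for each i < n
 *
 *     {r[2*i+1], r[2*i]} = a[i] * a[i]
 *
 * so the output buffer r must hold 2*n words. A plain-C sketch of one step,
 * assuming 64-bit words and unsigned __int128:
 *
 *     unsigned __int128 t = (unsigned __int128)a[i] * a[i];
 *     r[2*i]     = (unsigned long long)t;
 *     r[2*i + 1] = (unsigned long long)(t >> 64);
 */
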
#else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */

BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w)
{
    BN_ULONG c = 0;
    BN_ULONG bl, bh;

    assert(num >= 0);
    if (num <= 0)
        return ((BN_ULONG)0);

    bl = LBITS(w);
    bh = HBITS(w);

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (num & ~3) {
        mul_add(rp[0], ap[0], bl, bh, c);
        mul_add(rp[1], ap[1], bl, bh, c);
        mul_add(rp[2], ap[2], bl, bh, c);
        mul_add(rp[3], ap[3], bl, bh, c);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    while (num) {
        mul_add(rp[0], ap[0], bl, bh, c);
        ap++;
        rp++;
        num--;
    }
    return c;
}

BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG carry = 0;
    BN_ULONG bl, bh;

    assert(num >= 0);
    if (num <= 0)
        return ((BN_ULONG)0);

    bl = LBITS(w);
    bh = HBITS(w);

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (num & ~3) {
        mul(rp[0], ap[0], bl, bh, carry);
        mul(rp[1], ap[1], bl, bh, carry);
        mul(rp[2], ap[2], bl, bh, carry);
        mul(rp[3], ap[3], bl, bh, carry);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    while (num) {
        mul(rp[0], ap[0], bl, bh, carry);
        ap++;
        rp++;
        num--;
    }
    return carry;
}

void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    assert(n >= 0);
    if (n <= 0)
        return;

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (n & ~3) {
        sqr64(r[0], r[1], a[0]);
        sqr64(r[2], r[3], a[1]);
        sqr64(r[4], r[5], a[2]);
        sqr64(r[6], r[7], a[3]);
        a += 4;
        r += 8;
        n -= 4;
    }
# endif
    while (n) {
        sqr64(r[0], r[1], a[0]);
        a++;
        r += 2;
        n--;
    }
}

#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */

#if defined(BN_LLONG) && defined(BN_DIV2W)

BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d));
}

#else
/* Divide h,l by d and return the result. */
/* I need to test this some more :-( */
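/*
 * Sketch of the idea (not from the original file): d is first left-shifted
 * so that its top bit is set (the quotient is unchanged when h:l is shifted
 * by the same amount), then two half-word quotient digits are produced
 * schoolbook-style, each estimated as q ~ h / dh from the top half dh of d
 * and corrected downward. A reference check, assuming 64-bit BN_ULONG and
 * unsigned __int128, would be
 *
 *     q = (BN_ULONG)(((unsigned __int128)h << 64 | l) / d);
 *
 * valid for h < d (otherwise the quotient would overflow one word).
 */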
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    BN_ULONG dh, dl, q, ret = 0, th, tl, t;
    int i, count = 2;

    if (d == 0)
        return BN_MASK2;

    i = BN_num_bits_word(d);
    assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));

    i = BN_BITS2 - i;
    if (h >= d)
        h -= d;

    if (i) {
        d <<= i;
        h = (h << i) | (l >> (BN_BITS2 - i));
        l <<= i;
    }
    dh = (d & BN_MASK2h) >> BN_BITS4;
    dl = (d & BN_MASK2l);
    for (;;) {
        if ((h >> BN_BITS4) == dh)
            q = BN_MASK2l;
        else
            q = h / dh;

        th = q * dh;
        tl = dl * q;
        for (;;) {
            t = h - th;
            if ((t & BN_MASK2h) ||
                ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4))))
                break;
            q--;
            th -= dh;
            tl -= dl;
        }
        t = (tl >> BN_BITS4);
        tl = (tl << BN_BITS4) & BN_MASK2h;
        th += t;

        if (l < tl)
            th++;
        l -= tl;
        if (h < th) {
            h += d;
            q--;
        }
        h -= th;

        if (--count == 0)
            break;

        ret = q << BN_BITS4;
        h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
        l = (l & BN_MASK2l) << BN_BITS4;
    }
    ret |= q;
    return (ret);
}
#endif /* defined(BN_LLONG) && defined(BN_DIV2W) */

#ifdef BN_LLONG
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n)
{
    BN_ULLONG ll = 0;

    assert(n >= 0);
    if (n <= 0)
        return ((BN_ULONG)0);

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (n & ~3) {
        ll += (BN_ULLONG) a[0] + b[0];
        r[0] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;        /* keep the carry in the double word */
        ll += (BN_ULLONG) a[1] + b[1];
        r[1] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        ll += (BN_ULLONG) a[2] + b[2];
        r[2] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        ll += (BN_ULLONG) a[3] + b[3];
        r[3] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        a += 4;
        b += 4;
        r += 4;
        n -= 4;
    }
# endif
    while (n) {
        ll += (BN_ULLONG) a[0] + b[0];
        r[0] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        a++;
        b++;
        r++;
        n--;
    }
    return ((BN_ULONG)ll);
}
#else /* !BN_LLONG */
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n)
{
    BN_ULONG c, l, t;

    assert(n >= 0);
    if (n <= 0)
        return ((BN_ULONG)0);

    c = 0;
# ifndef OPENSSL_SMALL_FOOTPRINT
    while (n & ~3) {
        t = a[0];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[0]) & BN_MASK2;
        c += (l < t);
        r[0] = l;
        t = a[1];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[1]) & BN_MASK2;
        c += (l < t);
        r[1] = l;
        t = a[2];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[2]) & BN_MASK2;
        c += (l < t);
        r[2] = l;
        t = a[3];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[3]) & BN_MASK2;
        c += (l < t);
        r[3] = l;
        a += 4;
        b += 4;
        r += 4;
        n -= 4;
    }
# endif
    while (n) {
        t = a[0];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[0]) & BN_MASK2;
        c += (l < t);
        r[0] = l;
        a++;
        b++;
        r++;
        n--;
    }
    return ((BN_ULONG)c);
}
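/*
 * Note (not from the original file): without a double-width type the carry
 * out of each addition is recovered by comparison. For c in {0,1},
 * t = (a + c) mod 2^w wraps iff t < c, and l = (t + b) mod 2^w wraps iff
 * l < t, so the two tests together reconstruct the outgoing carry exactly.
 */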
#endif /* !BN_LLONG */

BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n)
{
    BN_ULONG t1, t2;
    int c = 0;

    assert(n >= 0);
    if (n <= 0)
        return ((BN_ULONG)0);

#ifndef OPENSSL_SMALL_FOOTPRINT
    while (n & ~3) {
        t1 = a[0];
        t2 = b[0];
        r[0] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        t1 = a[1];
        t2 = b[1];
        r[1] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        t1 = a[2];
        t2 = b[2];
        r[2] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        t1 = a[3];
        t2 = b[3];
        r[3] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        a += 4;
        b += 4;
        r += 4;
        n -= 4;
    }
#endif
    while (n) {
        t1 = a[0];
        t2 = b[0];
        r[0] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        a++;
        b++;
        r++;
        n--;
    }
    return c;
}
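/*
 * Note (not from the original file): the borrow rule above is the
 * comparison trick again. When t1 != t2 the new borrow is (t1 < t2)
 * regardless of the incoming borrow, and when t1 == t2 the result
 * t1 - t2 - c leaves the borrow unchanged, so c is simply left as it was.
 */
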
#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)

# undef bn_mul_comba8
# undef bn_mul_comba4
# undef bn_sqr_comba8
# undef bn_sqr_comba4

/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */

# ifdef BN_LLONG
/*
 * Keep in mind that additions to multiplication result can not
 * overflow, because its high half cannot be all-ones.
 */
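/*
 * Expansion of that claim (not from the original file): for w-bit words the
 * largest possible product is (2^w - 1)^2 = 2^(2w) - 2^(w+1) + 1, whose
 * high word is at most 2^w - 2. Adding a full carried-in word (at most
 * 2^w - 1) therefore stays below 2^(2w), so "t += c0" below cannot wrap.
 */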
# define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULONG hi; \
        BN_ULLONG t = (BN_ULLONG)(a)*(b); \
        t += c0; /* no carry */ \
        c0 = (BN_ULONG)Lw(t); \
        hi = (BN_ULONG)Hw(t); \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

# define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG hi; \
        BN_ULLONG t = (BN_ULLONG)(a)*(b); \
        BN_ULLONG tt = t+c0; /* no carry */ \
        c0 = (BN_ULONG)Lw(tt); \
        hi = (BN_ULONG)Hw(tt); \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        t += c0; /* no carry */ \
        c0 = (BN_ULONG)Lw(t); \
        hi = (BN_ULONG)Hw(t); \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

# define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULONG hi; \
        BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \
        t += c0; /* no carry */ \
        c0 = (BN_ULONG)Lw(t); \
        hi = (BN_ULONG)Hw(t); \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

# define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# elif defined(BN_UMULT_LOHI)
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
# define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULONG ta = (a), tb = (b); \
        BN_ULONG lo, hi; \
        BN_UMULT_LOHI(lo,hi,ta,tb); \
        c0 += lo; hi += (c0<lo)?1:0; \
        c1 += hi; c2 += (c1<hi)?1:0; \
        } while(0)

# define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG ta = (a), tb = (b); \
        BN_ULONG lo, hi, tt; \
        BN_UMULT_LOHI(lo,hi,ta,tb); \
        c0 += lo; tt = hi+((c0<lo)?1:0); \
        c1 += tt; c2 += (c1<tt)?1:0; \
        c0 += lo; hi += (c0<lo)?1:0; \
        c1 += hi; c2 += (c1<hi)?1:0; \
        } while(0)

# define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULONG ta = (a)[i]; \
        BN_ULONG lo, hi; \
        BN_UMULT_LOHI(lo,hi,ta,ta); \
        c0 += lo; hi += (c0<lo)?1:0; \
        c1 += hi; c2 += (c1<hi)?1:0; \
        } while(0)

# define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# elif defined(BN_UMULT_HIGH)
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
# define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULONG ta = (a), tb = (b); \
        BN_ULONG lo = ta * tb; \
        BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
        c0 += lo; hi += (c0<lo)?1:0; \
        c1 += hi; c2 += (c1<hi)?1:0; \
        } while(0)

# define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG ta = (a), tb = (b), tt; \
        BN_ULONG lo = ta * tb; \
        BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
        c0 += lo; tt = hi + ((c0<lo)?1:0); \
        c1 += tt; c2 += (c1<tt)?1:0; \
        c0 += lo; hi += (c0<lo)?1:0; \
        c1 += hi; c2 += (c1<hi)?1:0; \
        } while(0)

# define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULONG ta = (a)[i]; \
        BN_ULONG lo = ta * ta; \
        BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \
        c0 += lo; hi += (c0<lo)?1:0; \
        c1 += hi; c2 += (c1<hi)?1:0; \
        } while(0)

# define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# else /* !BN_LLONG */
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
# define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULONG lo = LBITS(a), hi = HBITS(a); \
        BN_ULONG bl = LBITS(b), bh = HBITS(b); \
        mul64(lo,hi,bl,bh); \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

# define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG tt; \
        BN_ULONG lo = LBITS(a), hi = HBITS(a); \
        BN_ULONG bl = LBITS(b), bh = HBITS(b); \
        mul64(lo,hi,bl,bh); \
        tt = hi; \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \
        c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

# define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULONG lo, hi; \
        sqr64(lo,hi,(a)[i]); \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

# define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
# endif /* !BN_LLONG */

void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[4], b[0], c2, c3, c1);
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    mul_add_c(a[0], b[4], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[0], b[5], c3, c1, c2);
    mul_add_c(a[1], b[4], c3, c1, c2);
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    mul_add_c(a[4], b[1], c3, c1, c2);
    mul_add_c(a[5], b[0], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[6], b[0], c1, c2, c3);
    mul_add_c(a[5], b[1], c1, c2, c3);
    mul_add_c(a[4], b[2], c1, c2, c3);
    mul_add_c(a[3], b[3], c1, c2, c3);
    mul_add_c(a[2], b[4], c1, c2, c3);
    mul_add_c(a[1], b[5], c1, c2, c3);
    mul_add_c(a[0], b[6], c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    mul_add_c(a[0], b[7], c2, c3, c1);
    mul_add_c(a[1], b[6], c2, c3, c1);
    mul_add_c(a[2], b[5], c2, c3, c1);
    mul_add_c(a[3], b[4], c2, c3, c1);
    mul_add_c(a[4], b[3], c2, c3, c1);
    mul_add_c(a[5], b[2], c2, c3, c1);
    mul_add_c(a[6], b[1], c2, c3, c1);
    mul_add_c(a[7], b[0], c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    mul_add_c(a[7], b[1], c3, c1, c2);
    mul_add_c(a[6], b[2], c3, c1, c2);
    mul_add_c(a[5], b[3], c3, c1, c2);
    mul_add_c(a[4], b[4], c3, c1, c2);
    mul_add_c(a[3], b[5], c3, c1, c2);
    mul_add_c(a[2], b[6], c3, c1, c2);
    mul_add_c(a[1], b[7], c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    mul_add_c(a[2], b[7], c1, c2, c3);
    mul_add_c(a[3], b[6], c1, c2, c3);
    mul_add_c(a[4], b[5], c1, c2, c3);
    mul_add_c(a[5], b[4], c1, c2, c3);
    mul_add_c(a[6], b[3], c1, c2, c3);
    mul_add_c(a[7], b[2], c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    mul_add_c(a[7], b[3], c2, c3, c1);
    mul_add_c(a[6], b[4], c2, c3, c1);
    mul_add_c(a[5], b[5], c2, c3, c1);
    mul_add_c(a[4], b[6], c2, c3, c1);
    mul_add_c(a[3], b[7], c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    mul_add_c(a[4], b[7], c3, c1, c2);
    mul_add_c(a[5], b[6], c3, c1, c2);
    mul_add_c(a[6], b[5], c3, c1, c2);
    mul_add_c(a[7], b[4], c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    mul_add_c(a[7], b[5], c1, c2, c3);
    mul_add_c(a[6], b[6], c1, c2, c3);
    mul_add_c(a[5], b[7], c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    mul_add_c(a[6], b[7], c2, c3, c1);
    mul_add_c(a[7], b[6], c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    mul_add_c(a[7], b[7], c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

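/*
 * For reference (illustrative sketch, not from the original file): the
 * unrolled function above is the n == 8 instance of column-order ("comba")
 * multiplication. Each column k of the product collects all a[i]*b[k-i]
 * into the three-word accumulator before a single store to r[k], with the
 * accumulator words rotating one position per column. The hypothetical
 * generic form, kept out of the build:
 */
#if 0
static void bn_mul_comba_sketch(BN_ULONG *r, const BN_ULONG *a,
                                const BN_ULONG *b, int n)
{
    BN_ULONG c1 = 0, c2 = 0, c3 = 0;
    int i, k;

    for (k = 0; k < 2 * n - 1; k++) {
        int lo = k < n ? 0 : k - n + 1;
        int hi = k < n ? k : n - 1;

        for (i = lo; i <= hi; i++)
            mul_add_c(a[i], b[k - i], c1, c2, c3);
        r[k] = c1;              /* column k is complete */
        c1 = c2;                /* shift the accumulator down one word */
        c2 = c3;
        c3 = 0;
    }
    r[2 * n - 1] = c1;
}
#endif
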
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    sqr_add_c2(a, 4, 0, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 5, 0, c3, c1, c2);
    sqr_add_c2(a, 4, 1, c3, c1, c2);
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    sqr_add_c2(a, 4, 2, c1, c2, c3);
    sqr_add_c2(a, 5, 1, c1, c2, c3);
    sqr_add_c2(a, 6, 0, c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 0, c2, c3, c1);
    sqr_add_c2(a, 6, 1, c2, c3, c1);
    sqr_add_c2(a, 5, 2, c2, c3, c1);
    sqr_add_c2(a, 4, 3, c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    sqr_add_c(a, 4, c3, c1, c2);
    sqr_add_c2(a, 5, 3, c3, c1, c2);
    sqr_add_c2(a, 6, 2, c3, c1, c2);
    sqr_add_c2(a, 7, 1, c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    sqr_add_c2(a, 7, 2, c1, c2, c3);
    sqr_add_c2(a, 6, 3, c1, c2, c3);
    sqr_add_c2(a, 5, 4, c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    sqr_add_c(a, 5, c2, c3, c1);
    sqr_add_c2(a, 6, 4, c2, c3, c1);
    sqr_add_c2(a, 7, 3, c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    sqr_add_c2(a, 7, 4, c3, c1, c2);
    sqr_add_c2(a, 6, 5, c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    sqr_add_c(a, 6, c1, c2, c3);
    sqr_add_c2(a, 7, 5, c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 6, c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    sqr_add_c(a, 7, c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

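/*
 * Same idea for squaring (illustrative sketch, not from the original file):
 * column k of a^2 is 2*sum(a[i]*a[k-i] for i < k-i) plus a[k/2]^2 when k is
 * even, which is exactly how the unrolled code pairs sqr_add_c2() with
 * sqr_add_c(). Hypothetical generic form, kept out of the build:
 */
#if 0
static void bn_sqr_comba_sketch(BN_ULONG *r, const BN_ULONG *a, int n)
{
    BN_ULONG c1 = 0, c2 = 0, c3 = 0;
    int i, k;

    for (k = 0; k < 2 * n - 1; k++) {
        for (i = k < n ? 0 : k - n + 1; 2 * i < k; i++)
            sqr_add_c2(a, k - i, i, c1, c2, c3);    /* cross terms, doubled */
        if ((k & 1) == 0)
            sqr_add_c(a, k / 2, c1, c2, c3);        /* diagonal term */
        r[k] = c1;
        c1 = c2;
        c2 = c3;
        c3 = 0;
    }
    r[2 * n - 1] = c1;
}
#endif
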
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

# ifdef OPENSSL_NO_ASM
#  ifdef OPENSSL_BN_ASM_MONT
#   include <alloca.h>
/*
 * This is essentially a reference implementation, which may or may not
 * result in performance improvement. E.g. on IA-32 this routine was
 * observed to give 40% faster rsa1024 private key operations and 10%
 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it is a
 * reference implementation, one to be used as a starting point for
 * platform-specific assembler. The numbers mentioned apply to compiler-
 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
 * can vary not only from platform to platform, but even across compiler
 * versions. Assembler vs. assembler improvement coefficients can
 * [and are known to] differ and are to be documented elsewhere.
 */
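/*
 * Note on the contract (not from the original file): with
 * R = 2^(num*BN_BITS2) and n0p pointing at -np^-1 mod 2^BN_BITS2, the
 * function below computes
 *
 *     rp = ap * bp * R^-1 mod np
 *
 * one word of bp at a time, adding a multiple of np in each iteration so
 * that the lowest remaining word becomes zero and can be shifted out.
 */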
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0p, int num)
{
    BN_ULONG c0, c1, ml, *tp, n0;
#   ifdef mul64
    BN_ULONG mh;
#   endif
    volatile BN_ULONG *vp;      /* volatile alias so the final wipe of
                                 * tp[] is not optimised away */
    int i = 0, j;

#   if 0                        /* template for platform-specific
                                 * implementation */
    if (ap == bp)
        return bn_sqr_mont(rp, ap, np, n0p, num);
#   endif
    vp = tp = alloca((num + 2) * sizeof(BN_ULONG));

    n0 = *n0p;

    c0 = 0;
    ml = bp[0];
#   ifdef mul64
    mh = HBITS(ml);
    ml = LBITS(ml);
    for (j = 0; j < num; ++j)
        mul(tp[j], ap[j], ml, mh, c0);
#   else
    for (j = 0; j < num; ++j)
        mul(tp[j], ap[j], ml, c0);
#   endif

    tp[num] = c0;
    tp[num + 1] = 0;
    goto enter;

    for (i = 0; i < num; i++) {
        c0 = 0;
        ml = bp[i];
#   ifdef mul64
        mh = HBITS(ml);
        ml = LBITS(ml);
        for (j = 0; j < num; ++j)
            mul_add(tp[j], ap[j], ml, mh, c0);
#   else
        for (j = 0; j < num; ++j)
            mul_add(tp[j], ap[j], ml, c0);
#   endif
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num] = c1;
        tp[num + 1] = (c1 < c0 ? 1 : 0);
 enter:
        c1 = tp[0];
        ml = (c1 * n0) & BN_MASK2;
        c0 = 0;
#   ifdef mul64
        mh = HBITS(ml);
        ml = LBITS(ml);
        mul_add(c1, np[0], ml, mh, c0);
#   else
        mul_add(c1, ml, np[0], c0);
#   endif
        for (j = 1; j < num; j++) {
            c1 = tp[j];
#   ifdef mul64
            mul_add(c1, np[j], ml, mh, c0);
#   else
            mul_add(c1, ml, np[j], c0);
#   endif
            tp[j - 1] = c1 & BN_MASK2;
        }
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num - 1] = c1;
        tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
    }

    if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
        c0 = bn_sub_words(rp, tp, np, num);
        if (tp[num] != 0 || c0 == 0) {
            for (i = 0; i < num + 2; i++)
                vp[i] = 0;
            return 1;
        }
    }
    for (i = 0; i < num; i++)
        rp[i] = tp[i], vp[i] = 0;
    vp[num] = 0;
    vp[num + 1] = 0;

    return 1;
}
#  else
/*
 * A return value of 0 indicates that multiplication/convolution was not
 * performed; the caller is expected to fall back to the
 * alternative/original code path.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, int num)
{
    return 0;
}
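/*
 * Caller-side sketch (hypothetical caller, not from the original file):
 * code built on top of this stub is expected to test the return value, e.g.
 *
 *     if (!bn_mul_mont(rp, ap, bp, np, n0, num)) {
 *         ...multiply and reduce via the generic code path instead...
 *     }
 */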
#  endif /* OPENSSL_BN_ASM_MONT */
# endif /* OPENSSL_NO_ASM */

#else /* !BN_MUL_COMBA */

/* hmm... is it faster just to do a multiply? */
# undef bn_sqr_comba4
# undef bn_sqr_comba8
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t[8];

    bn_sqr_normal(r, a, 4, t);
}

void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t[16];

    bn_sqr_normal(r, a, 8, t);
}

void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
    r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
    r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
    r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
}

void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
    r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
    r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
    r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
    r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
    r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
    r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
    r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
}

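/*
 * Note (not from the original file): the two fallbacks above are plain
 * row-order schoolbook multiplication. The first call writes a * b[0];
 * each bn_mul_add_words() call then accumulates a * b[i] shifted i words
 * up (including the previously stored top word), and the returned carry
 * becomes the new top word r[n + i] of the partial product.
 */
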
# ifdef OPENSSL_NO_ASM
#  ifdef OPENSSL_BN_ASM_MONT
#   include <alloca.h>
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0p, int num)
{
    BN_ULONG c0, c1, *tp, n0 = *n0p;
    volatile BN_ULONG *vp;      /* volatile alias so the final wipe of
                                 * tp[] is not optimised away */
    int i = 0, j;

    vp = tp = alloca((num + 2) * sizeof(BN_ULONG));

    for (i = 0; i <= num; i++)
        tp[i] = 0;

    for (i = 0; i < num; i++) {
        c0 = bn_mul_add_words(tp, ap, num, bp[i]);
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num] = c1;
        tp[num + 1] = (c1 < c0 ? 1 : 0);

        c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num] = c1;
        tp[num + 1] += (c1 < c0 ? 1 : 0);
        for (j = 0; j <= num; j++)
            tp[j] = tp[j + 1];
    }

    if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
        c0 = bn_sub_words(rp, tp, np, num);
        if (tp[num] != 0 || c0 == 0) {
            for (i = 0; i < num + 2; i++)
                vp[i] = 0;
            return 1;
        }
    }
    for (i = 0; i < num; i++)
        rp[i] = tp[i], vp[i] = 0;
    vp[num] = 0;
    vp[num + 1] = 0;

    return 1;
}
#  else
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, int num)
{
    return 0;
}
#  endif /* OPENSSL_BN_ASM_MONT */
# endif /* OPENSSL_NO_ASM */

#endif /* !BN_MUL_COMBA */