crypto/bn/bn_exp.c

   1 /* crypto/bn/bn_exp.c */
   2 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
   3  * All rights reserved.
   4  *
   5  * This package is an SSL implementation written
   6  * by Eric Young (eay@cryptsoft.com).
   7  * The implementation was written so as to conform with Netscapes SSL.
   8  *
   9  * This library is free for commercial and non-commercial use as long as
  10  * the following conditions are aheared to.  The following conditions
  11  * apply to all code found in this distribution, be it the RC4, RSA,
  12  * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
  13  * included with this distribution is covered by the same copyright terms
  14  * except that the holder is Tim Hudson (tjh@cryptsoft.com).
  15  *
  16  * Copyright remains Eric Young's, and as such any Copyright notices in
  17  * the code are not to be removed.
  18  * If this package is used in a product, Eric Young should be given attribution
  19  * as the author of the parts of the library used.
  20  * This can be in the form of a textual message at program startup or
  21  * in documentation (online or textual) provided with the package.
  22  *
  23  * Redistribution and use in source and binary forms, with or without
  24  * modification, are permitted provided that the following conditions
  25  * are met:
  26  * 1. Redistributions of source code must retain the copyright
  27  *    notice, this list of conditions and the following disclaimer.
  28  * 2. Redistributions in binary form must reproduce the above copyright
  29  *    notice, this list of conditions and the following disclaimer in the
  30  *    documentation and/or other materials provided with the distribution.
  31  * 3. All advertising materials mentioning features or use of this software
  32  *    must display the following acknowledgement:
  33  *    "This product includes cryptographic software written by
  34  *     Eric Young (eay@cryptsoft.com)"
  35  *    The word 'cryptographic' can be left out if the rouines from the library
  36  *    being used are not cryptographic related :-).
  37  * 4. If you include any Windows specific code (or a derivative thereof) from
  38  *    the apps directory (application code) you must include an acknowledgement:
  39  *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
  40  *
  41  * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
  42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  51  * SUCH DAMAGE.
  52  *
  53  * The licence and distribution terms for any publically available version or
  54  * derivative of this code cannot be changed.  i.e. this code cannot simply be
  55  * copied and put under another distribution licence
  56  * [including the GNU Public Licence.]
  57  */
  58 /* ====================================================================
  59  * Copyright (c) 1998-2005 The OpenSSL Project.  All rights reserved.
  60  *
  61  * Redistribution and use in source and binary forms, with or without
  62  * modification, are permitted provided that the following conditions
  63  * are met:
  64  *
  65  * 1. Redistributions of source code must retain the above copyright
  66  *    notice, this list of conditions and the following disclaimer.
  67  *
  68  * 2. Redistributions in binary form must reproduce the above copyright
  69  *    notice, this list of conditions and the following disclaimer in
  70  *    the documentation and/or other materials provided with the
  71  *    distribution.
  72  *
  73  * 3. All advertising materials mentioning features or use of this
  74  *    software must display the following acknowledgment:
  75  *    "This product includes software developed by the OpenSSL Project
  76  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
  77  *
  78  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
  79  *    endorse or promote products derived from this software without
  80  *    prior written permission. For written permission, please contact
  81  *    openssl-core@openssl.org.
  82  *
  83  * 5. Products derived from this software may not be called "OpenSSL"
  84  *    nor may "OpenSSL" appear in their names without prior written
  85  *    permission of the OpenSSL Project.
  86  *
  87  * 6. Redistributions of any form whatsoever must retain the following
  88  *    acknowledgment:
  89  *    "This product includes software developed by the OpenSSL Project
  90  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
  91  *
  92  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
  93  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  94  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  95  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
  96  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  97  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  98  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  99  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 100  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 101  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 102  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 103  * OF THE POSSIBILITY OF SUCH DAMAGE.
 104  * ====================================================================
 105  *
 106  * This product includes cryptographic software written by Eric Young
 107  * (eay@cryptsoft.com).  This product includes software written by Tim
 108  * Hudson (tjh@cryptsoft.com).
 109  *
 110  */
 111
 112
 113 #include "cryptlib.h"
 114 #include "bn_lcl.h"
 115
 116 #include <stdlib.h>
 117 #ifdef _WIN32
 118 # include <malloc.h>
 119 # ifndef alloca
 120 #  define alloca _alloca
 121 # endif
 122 #elif defined(__GNUC__)
 123 # ifndef alloca
 124 #  define alloca(s) __builtin_alloca((s))
 125 # endif
 126 #elif defined(__sun)
 127 # include <alloca.h>
 128 #endif
 129
  /* SPARC_T4_MONT is (re)defined here only when the Montgomery assembler is
   * enabled (OPENSSL_BN_ASM_MONT) and the target is SPARC; the SPARC-specific
   * code paths further down in this file are compiled only when it is set. */
  130 #undef SPARC_T4_MONT
  131 #if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
  132 # include "sparc_arch.h"
  133 extern unsigned int OPENSSL_sparcv9cap_P[];
  134 # define SPARC_T4_MONT
  135 #endif
  136
  137 /* Maximum precomputation table size for *variable* sliding windows.
   * BN_mod_exp_recp() and BN_mod_exp_mont() declare 'BIGNUM *val[TABLE_SIZE]'
   * and fill entries 0 .. (1<<(window-1))-1, so the window width they use
   * must not exceed 6 bits for the table to be large enough.
   * NOTE(review): the upper bound of BN_window_bits_for_exponent_size() is
   * defined outside this file -- confirm it never returns more than 6. */
  138 #define TABLE_SIZE      32
 139
 140 /* this one works - simple but works */
 141 int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
 142         {
 143         int i,bits,ret=0;
 144         BIGNUM *v,*rr;
 145
 146         if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
 147                 {
 148                 /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
 149                 BNerr(BN_F_BN_EXP,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
 150                 return -1;
 151                 }
 152
 153         BN_CTX_start(ctx);
 154         if ((r == a) || (r == p))
 155                 rr = BN_CTX_get(ctx);
 156         else
 157                 rr = r;
 158         v = BN_CTX_get(ctx);
 159         if (rr == NULL || v == NULL) goto err;
 160
 161         if (BN_copy(v,a) == NULL) goto err;
 162         bits=BN_num_bits(p);
 163
 164         if (BN_is_odd(p))
 165                 { if (BN_copy(rr,a) == NULL) goto err; }
 166         else    { if (!BN_one(rr)) goto err; }
 167
 168         for (i=1; i<bits; i++)
 169                 {
 170                 if (!BN_sqr(v,v,ctx)) goto err;
 171                 if (BN_is_bit_set(p,i))
 172                         {
 173                         if (!BN_mul(rr,rr,v,ctx)) goto err;
 174                         }
 175                 }
 176         ret=1;
 177 err:
 178         if (r != rr) BN_copy(r,rr);
 179         BN_CTX_end(ctx);
 180         bn_check_top(r);
 181         return(ret);
 182         }
 183
 184
 185 int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
 186                BN_CTX *ctx)
 187         {
 188         int ret;
 189
 190         bn_check_top(a);
 191         bn_check_top(p);
 192         bn_check_top(m);
 193
 194         /* For even modulus  m = 2^k*m_odd,  it might make sense to compute
 195          * a^p mod m_odd  and  a^p mod 2^k  separately (with Montgomery
 196          * exponentiation for the odd part), using appropriate exponent
 197          * reductions, and combine the results using the CRT.
 198          *
 199          * For now, we use Montgomery only if the modulus is odd; otherwise,
 200          * exponentiation using the reciprocal-based quick remaindering
 201          * algorithm is used.
 202          *
 203          * (Timing obtained with expspeed.c [computations  a^p mod m
 204          * where  a, p, m  are of the same length: 256, 512, 1024, 2048,
 205          * 4096, 8192 bits], compared to the running time of the
 206          * standard algorithm:
 207          *
 208          *   BN_mod_exp_mont   33 .. 40 %  [AMD K6-2, Linux, debug configuration]
 209          *                     55 .. 77 %  [UltraSparc processor, but
 210          *                                  debug-solaris-sparcv8-gcc conf.]
 211          *
 212          *   BN_mod_exp_recp   50 .. 70 %  [AMD K6-2, Linux, debug configuration]
 213          *                     62 .. 118 % [UltraSparc, debug-solaris-sparcv8-gcc]
 214          *
 215          * On the Sparc, BN_mod_exp_recp was faster than BN_mod_exp_mont
 216          * at 2048 and more bits, but at 512 and 1024 bits, it was
 217          * slower even than the standard algorithm!
 218          *
 219          * "Real" timings [linux-elf, solaris-sparcv9-gcc configurations]
 220          * should be obtained when the new Montgomery reduction code
 221          * has been integrated into OpenSSL.)
 222          */
 223
 224 #define MONT_MUL_MOD
 225 #define MONT_EXP_WORD
 226 #define RECP_MUL_MOD
 227
 228 #ifdef MONT_MUL_MOD
 229         /* I have finally been able to take out this pre-condition of
 230          * the top bit being set.  It was caused by an error in BN_div
 231          * with negatives.  There was also another problem when for a^b%m
 232          * a >= m.  eay 07-May-97 */
 233 /*      if ((m->d[m->top-1]&BN_TBIT) && BN_is_odd(m)) */
 234
 235         if (BN_is_odd(m))
 236                 {
 237 #  ifdef MONT_EXP_WORD
 238                 if (a->top == 1 && !a->neg && (BN_get_flags(p, BN_FLG_CONSTTIME) == 0))
 239                         {
 240                         BN_ULONG A = a->d[0];
 241                         ret=BN_mod_exp_mont_word(r,A,p,m,ctx,NULL);
 242                         }
 243                 else
 244 #  endif
 245                         ret=BN_mod_exp_mont(r,a,p,m,ctx,NULL);
 246                 }
 247         else
 248 #endif
 249 #ifdef RECP_MUL_MOD
 250                 { ret=BN_mod_exp_recp(r,a,p,m,ctx); }
 251 #else
 252                 { ret=BN_mod_exp_simple(r,a,p,m,ctx); }
 253 #endif
 254
 255         bn_check_top(r);
 256         return(ret);
 257         }
 258
 259
 260 int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
 261                     const BIGNUM *m, BN_CTX *ctx)
 262         {
 263         int i,j,bits,ret=0,wstart,wend,window,wvalue;
 264         int start=1;
 265         BIGNUM *aa;
 266         /* Table of variables obtained from 'ctx' */
 267         BIGNUM *val[TABLE_SIZE];
 268         BN_RECP_CTX recp;
 269
 270         if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
 271                 {
 272                 /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
 273                 BNerr(BN_F_BN_MOD_EXP_RECP,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
 274                 return -1;
 275                 }
 276
 277         bits=BN_num_bits(p);
 278
 279         if (bits == 0)
 280                 {
 281                 ret = BN_one(r);
 282                 return ret;
 283                 }
 284
 285         BN_CTX_start(ctx);
 286         aa = BN_CTX_get(ctx);
 287         val[0] = BN_CTX_get(ctx);
 288         if(!aa || !val[0]) goto err;
 289
 290         BN_RECP_CTX_init(&recp);
 291         if (m->neg)
 292                 {
 293                 /* ignore sign of 'm' */
 294                 if (!BN_copy(aa, m)) goto err;
 295                 aa->neg = 0;
 296                 if (BN_RECP_CTX_set(&recp,aa,ctx) <= 0) goto err;
 297                 }
 298         else
 299                 {
 300                 if (BN_RECP_CTX_set(&recp,m,ctx) <= 0) goto err;
 301                 }
 302
 303         if (!BN_nnmod(val[0],a,m,ctx)) goto err;                /* 1 */
 304         if (BN_is_zero(val[0]))
 305                 {
 306                 BN_zero(r);
 307                 ret = 1;
 308                 goto err;
 309                 }
 310
 311         window = BN_window_bits_for_exponent_size(bits);
 312         if (window > 1)
 313                 {
 314                 if (!BN_mod_mul_reciprocal(aa,val[0],val[0],&recp,ctx))
 315                         goto err;                               /* 2 */
 316                 j=1<<(window-1);
 317                 for (i=1; i<j; i++)
 318                         {
 319                         if(((val[i] = BN_CTX_get(ctx)) == NULL) ||
 320                                         !BN_mod_mul_reciprocal(val[i],val[i-1],
 321                                                 aa,&recp,ctx))
 322                                 goto err;
 323                         }
 324                 }
 325
 326         start=1;        /* This is used to avoid multiplication etc
 327                          * when there is only the value '1' in the
 328                          * buffer. */
 329         wvalue=0;       /* The 'value' of the window */
 330         wstart=bits-1;  /* The top bit of the window */
 331         wend=0;         /* The bottom bit of the window */
 332
 333         if (!BN_one(r)) goto err;
 334
 335         for (;;)
 336                 {
 337                 if (BN_is_bit_set(p,wstart) == 0)
 338                         {
 339                         if (!start)
 340                                 if (!BN_mod_mul_reciprocal(r,r,r,&recp,ctx))
 341                                 goto err;
 342                         if (wstart == 0) break;
 343                         wstart--;
 344                         continue;
 345                         }
 346                 /* We now have wstart on a 'set' bit, we now need to work out
 347                  * how bit a window to do.  To do this we need to scan
 348                  * forward until the last set bit before the end of the
 349                  * window */
 350                 j=wstart;
 351                 wvalue=1;
 352                 wend=0;
 353                 for (i=1; i<window; i++)
 354                         {
 355                         if (wstart-i < 0) break;
 356                         if (BN_is_bit_set(p,wstart-i))
 357                                 {
 358                                 wvalue<<=(i-wend);
 359                                 wvalue|=1;
 360                                 wend=i;
 361                                 }
 362                         }
 363
 364                 /* wend is the size of the current window */
 365                 j=wend+1;
 366                 /* add the 'bytes above' */
 367                 if (!start)
 368                         for (i=0; i<j; i++)
 369                                 {
 370                                 if (!BN_mod_mul_reciprocal(r,r,r,&recp,ctx))
 371                                         goto err;
 372                                 }
 373
 374                 /* wvalue will be an odd number < 2^window */
 375                 if (!BN_mod_mul_reciprocal(r,r,val[wvalue>>1],&recp,ctx))
 376                         goto err;
 377
 378                 /* move the 'window' down further */
 379                 wstart-=wend+1;
 380                 wvalue=0;
 381                 start=0;
 382                 if (wstart < 0) break;
 383                 }
 384         ret=1;
 385 err:
 386         BN_CTX_end(ctx);
 387         BN_RECP_CTX_free(&recp);
 388         bn_check_top(r);
 389         return(ret);
 390         }
 391
 392
 393 int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 394                     const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
 395         {
 396         int i,j,bits,ret=0,wstart,wend,window,wvalue;
 397         int start=1;
 398         BIGNUM *d,*r;
 399         const BIGNUM *aa;
 400         /* Table of variables obtained from 'ctx' */
 401         BIGNUM *val[TABLE_SIZE];
 402         BN_MONT_CTX *mont=NULL;
 403
 404         if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
 405                 {
 406                 return BN_mod_exp_mont_consttime(rr, a, p, m, ctx, in_mont);
 407                 }
 408
 409         bn_check_top(a);
 410         bn_check_top(p);
 411         bn_check_top(m);
 412
 413         if (!BN_is_odd(m))
 414                 {
 415                 BNerr(BN_F_BN_MOD_EXP_MONT,BN_R_CALLED_WITH_EVEN_MODULUS);
 416                 return(0);
 417                 }
 418         bits=BN_num_bits(p);
 419         if (bits == 0)
 420                 {
 421                 ret = BN_one(rr);
 422                 return ret;
 423                 }
 424
 425         BN_CTX_start(ctx);
 426         d = BN_CTX_get(ctx);
 427         r = BN_CTX_get(ctx);
 428         val[0] = BN_CTX_get(ctx);
 429         if (!d || !r || !val[0]) goto err;
 430
 431         /* If this is not done, things will break in the montgomery
 432          * part */
 433
 434         if (in_mont != NULL)
 435                 mont=in_mont;
 436         else
 437                 {
 438                 if ((mont=BN_MONT_CTX_new()) == NULL) goto err;
 439                 if (!BN_MONT_CTX_set(mont,m,ctx)) goto err;
 440                 }
 441
 442         if (a->neg || BN_ucmp(a,m) >= 0)
 443                 {
 444                 if (!BN_nnmod(val[0],a,m,ctx))
 445                         goto err;
 446                 aa= val[0];
 447                 }
 448         else
 449                 aa=a;
 450         if (BN_is_zero(aa))
 451                 {
 452                 BN_zero(rr);
 453                 ret = 1;
 454                 goto err;
 455                 }
 456         if (!BN_to_montgomery(val[0],aa,mont,ctx)) goto err; /* 1 */
 457
 458         window = BN_window_bits_for_exponent_size(bits);
 459         if (window > 1)
 460                 {
 461                 if (!BN_mod_mul_montgomery(d,val[0],val[0],mont,ctx)) goto err; /* 2 */
 462                 j=1<<(window-1);
 463                 for (i=1; i<j; i++)
 464                         {
 465                         if(((val[i] = BN_CTX_get(ctx)) == NULL) ||
 466                                         !BN_mod_mul_montgomery(val[i],val[i-1],
 467                                                 d,mont,ctx))
 468                                 goto err;
 469                         }
 470                 }
 471
 472         start=1;        /* This is used to avoid multiplication etc
 473                          * when there is only the value '1' in the
 474                          * buffer. */
 475         wvalue=0;       /* The 'value' of the window */
 476         wstart=bits-1;  /* The top bit of the window */
 477         wend=0;         /* The bottom bit of the window */
 478
 479 #if 1   /* by Shay Gueron's suggestion */
 480         j = m->top;     /* borrow j */
 481         if (m->d[j-1] & (((BN_ULONG)1)<<(BN_BITS2-1)))
 482                 {
 483                 if (bn_wexpand(r,j) == NULL) goto err;
 484                 /* 2^(top*BN_BITS2) - m */
 485                 r->d[0] = (0-m->d[0])&BN_MASK2;
 486                 for(i=1;i<j;i++) r->d[i] = (~m->d[i])&BN_MASK2;
 487                 r->top = j;
 488                 }
 489         else
 490 #endif
 491         if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
 492         for (;;)
 493                 {
 494                 if (BN_is_bit_set(p,wstart) == 0)
 495                         {
 496                         if (!start)
 497                                 {
 498                                 if (!BN_mod_mul_montgomery(r,r,r,mont,ctx))
 499                                 goto err;
 500                                 }
 501                         if (wstart == 0) break;
 502                         wstart--;
 503                         continue;
 504                         }
 505                 /* We now have wstart on a 'set' bit, we now need to work out
 506                  * how bit a window to do.  To do this we need to scan
 507                  * forward until the last set bit before the end of the
 508                  * window */
 509                 j=wstart;
 510                 wvalue=1;
 511                 wend=0;
 512                 for (i=1; i<window; i++)
 513                         {
 514                         if (wstart-i < 0) break;
 515                         if (BN_is_bit_set(p,wstart-i))
 516                                 {
 517                                 wvalue<<=(i-wend);
 518                                 wvalue|=1;
 519                                 wend=i;
 520                                 }
 521                         }
 522
 523                 /* wend is the size of the current window */
 524                 j=wend+1;
 525                 /* add the 'bytes above' */
 526                 if (!start)
 527                         for (i=0; i<j; i++)
 528                                 {
 529                                 if (!BN_mod_mul_montgomery(r,r,r,mont,ctx))
 530                                         goto err;
 531                                 }
 532
 533                 /* wvalue will be an odd number < 2^window */
 534                 if (!BN_mod_mul_montgomery(r,r,val[wvalue>>1],mont,ctx))
 535                         goto err;
 536
 537                 /* move the 'window' down further */
 538                 wstart-=wend+1;
 539                 wvalue=0;
 540                 start=0;
 541                 if (wstart < 0) break;
 542                 }
 543 #if defined(SPARC_T4_MONT)
 544         if (OPENSSL_sparcv9cap_P[0]&(SPARCV9_VIS3|SPARCV9_PREFER_FPU))
 545                 {
 546                 j = mont->N.top;        /* borrow j */
 547                 val[0]->d[0] = 1;       /* borrow val[0] */
 548                 for (i=1;i<j;i++) val[0]->d[i] = 0;
 549                 val[0]->top = j;
 550                 if (!BN_mod_mul_montgomery(rr,r,val[0],mont,ctx)) goto err;
 551                 }
 552         else
 553 #endif
 554         if (!BN_from_montgomery(rr,r,mont,ctx)) goto err;
 555         ret=1;
 556 err:
 557         if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont);
 558         BN_CTX_end(ctx);
 559         bn_check_top(rr);
 560         return(ret);
 561         }
 562
 563 #if defined(SPARC_T4_MONT)
 564 static BN_ULONG bn_get_bits(const BIGNUM *a, int bitpos)
 565         {
 566         BN_ULONG ret=0;
 567         int wordpos;
 568
 569         wordpos = bitpos/BN_BITS2;
 570         bitpos %= BN_BITS2;
 571         if (wordpos>=0 && wordpos < a->top)
 572                 {
 573                 ret = a->d[wordpos]&BN_MASK2;
 574                 if (bitpos)
 575                         {
 576                         ret >>= bitpos;
 577                         if (++wordpos < a->top)
 578                                 ret |= a->d[wordpos]<<(BN_BITS2-bitpos);
 579                         }
 580                 }
 581
 582         return ret&BN_MASK2;
 583 }
 584 #endif
 585
 586 /* BN_mod_exp_mont_consttime() stores the precomputed powers in a specific layout
 587  * so that accessing any of these table values shows the same access pattern as far
 588  * as cache lines are concerned.  The following functions are used to transfer a BIGNUM
 589  * from/to that table. */
 590
 591 static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf, int idx, int width)
 592         {
 593         size_t i, j;
 594
 595         if (top > b->top)
 596                 top = b->top; /* this works because 'buf' is explicitly zeroed */
 597         for (i = 0, j=idx; i < top * sizeof b->d[0]; i++, j+=width)
 598                 {
 599                 buf[j] = ((unsigned char*)b->d)[i];
 600                 }
 601
 602         return 1;
 603         }
 604
 605 static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, int width)
 606         {
 607         size_t i, j;
 608
 609         if (bn_wexpand(b, top) == NULL)
 610                 return 0;
 611
 612         for (i=0, j=idx; i < top * sizeof b->d[0]; i++, j+=width)
 613                 {
 614                 ((unsigned char*)b->d)[i] = buf[j];
 615                 }
 616
 617         b->top = top;
 618         bn_correct_top(b);
 619         return 1;
 620         }
 621
  622 /* Given a pointer value, compute the next address that is a cache line multiple.
   * The result is x_ advanced by MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH minus the
   * low bits of x_ selected by MOD_EXP_CTIME_MIN_CACHE_LINE_MASK.  The amount
   * added is never zero: when the masked bits are already 0 a full
   * MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH is added, so the buffer passed in must
   * be over-allocated by that width (BN_mod_exp_mont_consttime() allocates
   * powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH bytes for this reason).
   * NOTE(review): assumes MOD_EXP_CTIME_MIN_CACHE_LINE_MASK equals
   * MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH-1; both are defined outside this
   * file -- confirm. */
  623 #define MOD_EXP_CTIME_ALIGN(x_) \
  624         ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
 625
 626 /* This variant of BN_mod_exp_mont() uses fixed windows and the special
 627  * precomputation memory layout to limit data-dependency to a minimum
 628  * to protect secret exponents (cf. the hyper-threading timing attacks
 629  * pointed out by Colin Percival,
 630  * http://www.daemonology.net/hyperthreading-considered-harmful/)
 631  */
 632 int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 633                     const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
 634         {
 635         int i,bits,ret=0,window,wvalue;
 636         int top;
 637         BN_MONT_CTX *mont=NULL;
 638
 639         int numPowers;
 640         unsigned char *powerbufFree=NULL;
 641         int powerbufLen = 0;
 642         unsigned char *powerbuf=NULL;
 643         BIGNUM tmp, am;
 644 #if defined(SPARC_T4_MONT)
 645         unsigned int t4=0;
 646 #endif
 647
 648         bn_check_top(a);
 649         bn_check_top(p);
 650         bn_check_top(m);
 651
 652         top = m->top;
 653
 654         if (!(m->d[0] & 1))
 655                 {
 656                 BNerr(BN_F_BN_MOD_EXP_MONT_CONSTTIME,BN_R_CALLED_WITH_EVEN_MODULUS);
 657                 return(0);
 658                 }
 659         bits=BN_num_bits(p);
 660         if (bits == 0)
 661                 {
 662                 ret = BN_one(rr);
 663                 return ret;
 664                 }
 665
 666         BN_CTX_start(ctx);
 667
 668         /* Allocate a montgomery context if it was not supplied by the caller.
 669          * If this is not done, things will break in the montgomery part.
 670          */
 671         if (in_mont != NULL)
 672                 mont=in_mont;
 673         else
 674                 {
 675                 if ((mont=BN_MONT_CTX_new()) == NULL) goto err;
 676                 if (!BN_MONT_CTX_set(mont,m,ctx)) goto err;
 677                 }
 678
 679         /* Get the window size to use with size of p. */
 680         window = BN_window_bits_for_ctime_exponent_size(bits);
 681 #if defined(SPARC_T4_MONT)
 682         if (window>=5 && (top&15)==0 && top<=64 &&
 683             (OPENSSL_sparcv9cap_P[1]&(CFR_MONTMUL|CFR_MONTSQR))==
 684                                      (CFR_MONTMUL|CFR_MONTSQR) &&
 685             (t4=OPENSSL_sparcv9cap_P[0]))
 686                 window=5;
 687         else
 688 #endif
 689 #if defined(OPENSSL_BN_ASM_MONT5)
 690         if (window==6 && bits<=1024) window=5;  /* ~5% improvement of 2048-bit RSA sign */
 691 #endif
 692         (void)0;
 693
 694         /* Allocate a buffer large enough to hold all of the pre-computed
 695          * powers of am, am itself and tmp.
 696          */
 697         numPowers = 1 << window;
 698         powerbufLen = sizeof(m->d[0])*(top*numPowers +
 699                                 ((2*top)>numPowers?(2*top):numPowers));
 700 #ifdef alloca
 701         if (powerbufLen < 3072)
 702                 powerbufFree = alloca(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
 703         else
 704 #endif
 705         if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
 706                 goto err;
 707
 708         powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
 709         memset(powerbuf, 0, powerbufLen);
 710
 711 #ifdef alloca
 712         if (powerbufLen < 3072)
 713                 powerbufFree = NULL;
 714 #endif
 715
 716         /* lay down tmp and am right after powers table */
 717         tmp.d     = (BN_ULONG *)(powerbuf + sizeof(m->d[0])*top*numPowers);
 718         am.d      = tmp.d + top;
 719         tmp.top   = am.top  = 0;
 720         tmp.dmax  = am.dmax = top;
 721         tmp.neg   = am.neg  = 0;
 722         tmp.flags = am.flags = BN_FLG_STATIC_DATA;
 723
 724         /* prepare a^0 in Montgomery domain */
 725 #if 1   /* by Shay Gueron's suggestion */
 726         if (m->d[top-1] & (((BN_ULONG)1)<<(BN_BITS2-1)))
 727                 {
 728                 /* 2^(top*BN_BITS2) - m */
 729                 tmp.d[0] = (0-m->d[0])&BN_MASK2;
 730                 for (i=1;i<top;i++) tmp.d[i] = (~m->d[i])&BN_MASK2;
 731                 tmp.top = top;
 732                 }
 733         else
 734 #endif
 735         if (!BN_to_montgomery(&tmp,BN_value_one(),mont,ctx))    goto err;
 736
 737         /* prepare a^1 in Montgomery domain */
 738         if (a->neg || BN_ucmp(a,m) >= 0)
 739                 {
 740                 if (!BN_mod(&am,a,m,ctx))                       goto err;
 741                 if (!BN_to_montgomery(&am,&am,mont,ctx))        goto err;
 742                 }
 743         else    if (!BN_to_montgomery(&am,a,mont,ctx))          goto err;
 744
 745 #if defined(SPARC_T4_MONT)
 746     if (t4)
 747         {
 748         typedef int (*bn_pwr5_mont_f)(BN_ULONG *tp,const BN_ULONG *np,
 749                         const BN_ULONG *n0,const void *table,int power,int bits);
 750         int bn_pwr5_mont_t4_8(BN_ULONG *tp,const BN_ULONG *np,
 751                         const BN_ULONG *n0,const void *table,int power,int bits);
 752         int bn_pwr5_mont_t4_16(BN_ULONG *tp,const BN_ULONG *np,
 753                         const BN_ULONG *n0,const void *table,int power,int bits);
 754         int bn_pwr5_mont_t4_24(BN_ULONG *tp,const BN_ULONG *np,
 755                         const BN_ULONG *n0,const void *table,int power,int bits);
 756         int bn_pwr5_mont_t4_32(BN_ULONG *tp,const BN_ULONG *np,
 757                         const BN_ULONG *n0,const void *table,int power,int bits);
 758         static const bn_pwr5_mont_f pwr5_funcs[4] = {
 759                         bn_pwr5_mont_t4_8,      bn_pwr5_mont_t4_16,
 760                         bn_pwr5_mont_t4_24,     bn_pwr5_mont_t4_32 };
 761         bn_pwr5_mont_f pwr5_worker = pwr5_funcs[top/16-1];
 762
 763         typedef int (*bn_mul_mont_f)(BN_ULONG *rp,const BN_ULONG *ap,
 764                         const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
 765         int bn_mul_mont_t4_8(BN_ULONG *rp,const BN_ULONG *ap,
 766                         const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
 767         int bn_mul_mont_t4_16(BN_ULONG *rp,const BN_ULONG *ap,
 768                         const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
 769         int bn_mul_mont_t4_24(BN_ULONG *rp,const BN_ULONG *ap,
 770                         const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
 771         int bn_mul_mont_t4_32(BN_ULONG *rp,const BN_ULONG *ap,
 772                         const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
 773         static const bn_mul_mont_f mul_funcs[4] = {
 774                         bn_mul_mont_t4_8,       bn_mul_mont_t4_16,
 775                         bn_mul_mont_t4_24,      bn_mul_mont_t4_32 };
 776         bn_mul_mont_f mul_worker = mul_funcs[top/16-1];
 777
 778         void bn_mul_mont_vis3(BN_ULONG *rp,const BN_ULONG *ap,
 779                         const void *bp,const BN_ULONG *np,
 780                         const BN_ULONG *n0,int num);
 781         void bn_mul_mont_t4(BN_ULONG *rp,const BN_ULONG *ap,
 782                         const void *bp,const BN_ULONG *np,
 783                         const BN_ULONG *n0,int num);
 784         void bn_mul_mont_gather5_t4(BN_ULONG *rp,const BN_ULONG *ap,
 785                         const void *table,const BN_ULONG *np,
 786                         const BN_ULONG *n0,int num,int power);
 787         void bn_flip_n_scatter5_t4(const BN_ULONG *inp,size_t num,
 788                         void *table,size_t power);
 789         void bn_gather5_t4(BN_ULONG *out,size_t num,
 790                         void *table,size_t power);
 791         void bn_flip_t4(BN_ULONG *dst,BN_ULONG *src,size_t num);
 792
 793         BN_ULONG *np=mont->N.d, *n0=mont->n0;
 794         int stride = 5*(6-(top/16-1));  /* multiple of 5, but less than 32 */
 795
 796         /* BN_to_montgomery can contaminate words above .top
 797          * [in BN_DEBUG[_DEBUG] build]... */
 798         for (i=am.top; i<top; i++)      am.d[i]=0;
 799         for (i=tmp.top; i<top; i++)     tmp.d[i]=0;
 800
 801         bn_flip_n_scatter5_t4(tmp.d,top,powerbuf,0);
 802         bn_flip_n_scatter5_t4(am.d,top,powerbuf,1);
 803         if (!(*mul_worker)(tmp.d,am.d,am.d,np,n0) &&
 804             !(*mul_worker)(tmp.d,am.d,am.d,np,n0))
 805                 bn_mul_mont_vis3(tmp.d,am.d,am.d,np,n0,top);
 806         bn_flip_n_scatter5_t4(tmp.d,top,powerbuf,2);
 807
 808         for (i=3; i<32; i++)
 809                 {
 810                 /* Calculate a^i = a^(i-1) * a */
 811                 if (!(*mul_worker)(tmp.d,tmp.d,am.d,np,n0) &&
 812                     !(*mul_worker)(tmp.d,tmp.d,am.d,np,n0))
 813                         bn_mul_mont_vis3(tmp.d,tmp.d,am.d,np,n0,top);
 814                 bn_flip_n_scatter5_t4(tmp.d,top,powerbuf,i);
 815                 }
 816
 817         /* switch to 64-bit domain */
 818         np = alloca(top*sizeof(BN_ULONG));
 819         top /= 2;
 820         bn_flip_t4(np,mont->N.d,top);
 821
 822         bits--;
 823         for (wvalue=0, i=bits%5; i>=0; i--,bits--)
 824                 wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
 825         bn_gather5_t4(tmp.d,top,powerbuf,wvalue);
 826
 827         /* Scan the exponent one window at a time starting from the most
 828          * significant bits.
 829          */
 830         while (bits >= 0)
 831                 {
 832                 if (bits < stride) stride = bits+1;
 833                 bits -= stride;
 834                 wvalue = bn_get_bits(p,bits+1);
 835
 836                 if ((*pwr5_worker)(tmp.d,np,n0,powerbuf,wvalue,stride)) continue;
 837                 /* retry once and fall back */
 838                 if ((*pwr5_worker)(tmp.d,np,n0,powerbuf,wvalue,stride)) continue;
 839
 840                 bits += stride-5;
 841                 wvalue >>= stride-5;
 842                 wvalue &= 31;
 843                 bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
 844                 bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
 845                 bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
 846                 bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
 847                 bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
 848                 bn_mul_mont_gather5_t4(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue);
 849                 }
 850
 851         bn_flip_t4(tmp.d,tmp.d,top);
 852         top *= 2;
 853         /* back to 32-bit domain */
 854         tmp.top=top;
 855         bn_correct_top(&tmp);
 856         OPENSSL_cleanse(np,top*sizeof(BN_ULONG));
 857         }
 858     else
 859 #endif
 860 #if defined(OPENSSL_BN_ASM_MONT5)
 861     /* This optimization uses ideas from http://eprint.iacr.org/2011/239,
 862      * specifically optimization of cache-timing attack countermeasures
 863      * and pre-computation optimization. */
 864
 865     /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
 866      * 512-bit RSA is hardly relevant, we omit it to spare size... */
 867     if (window==5)
 868         {
 869         void bn_mul_mont_gather5(BN_ULONG *rp,const BN_ULONG *ap,
 870                         const void *table,const BN_ULONG *np,
 871                         const BN_ULONG *n0,int num,int power);
 872         void bn_scatter5(const BN_ULONG *inp,size_t num,
 873                         void *table,size_t power);
 874         void bn_gather5(BN_ULONG *out,size_t num,
 875                         void *table,size_t power);
 876
 877         BN_ULONG *np=mont->N.d, *n0=mont->n0;
 878
 879         /* BN_to_montgomery can contaminate words above .top
 880          * [in BN_DEBUG[_DEBUG] build]... */
 881         for (i=am.top; i<top; i++)      am.d[i]=0;
 882         for (i=tmp.top; i<top; i++)     tmp.d[i]=0;
 883
 884         bn_scatter5(tmp.d,top,powerbuf,0);
 885         bn_scatter5(am.d,am.top,powerbuf,1);
 886         bn_mul_mont(tmp.d,am.d,am.d,np,n0,top);
 887         bn_scatter5(tmp.d,top,powerbuf,2);
 888
 889 #if 0
 890         for (i=3; i<32; i++)
 891                 {
 892                 /* Calculate a^i = a^(i-1) * a */
 893                 bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
 894                 bn_scatter5(tmp.d,top,powerbuf,i);
 895                 }
 896 #else
 897         /* same as above, but uses squaring for 1/2 of operations */
 898         for (i=4; i<32; i*=2)
 899                 {
 900                 bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
 901                 bn_scatter5(tmp.d,top,powerbuf,i);
 902                 }
 903         for (i=3; i<8; i+=2)
 904                 {
 905                 int j;
 906                 bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
 907                 bn_scatter5(tmp.d,top,powerbuf,i);
 908                 for (j=2*i; j<32; j*=2)
 909                         {
 910                         bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
 911                         bn_scatter5(tmp.d,top,powerbuf,j);
 912                         }
 913                 }
 914         for (; i<16; i+=2)
 915                 {
 916                 bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
 917                 bn_scatter5(tmp.d,top,powerbuf,i);
 918                 bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
 919                 bn_scatter5(tmp.d,top,powerbuf,2*i);
 920                 }
 921         for (; i<32; i+=2)
 922                 {
 923                 bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
 924                 bn_scatter5(tmp.d,top,powerbuf,i);
 925                 }
 926 #endif
 927         bits--;
 928         for (wvalue=0, i=bits%5; i>=0; i--,bits--)
 929                 wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
 930         bn_gather5(tmp.d,top,powerbuf,wvalue);
 931
 932         /* Scan the exponent one window at a time starting from the most
 933          * significant bits.
 934          */
 935         while (bits >= 0)
 936                 {
 937                 for (wvalue=0, i=0; i<5; i++,bits--)
 938                         wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
 939
 940                 bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
 941                 bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
 942                 bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
 943                 bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
 944                 bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
 945                 bn_mul_mont_gather5(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue);
 946                 }
 947
 948         tmp.top=top;
 949         bn_correct_top(&tmp);
 950         }
 951     else
 952 #endif
 953         {
 954         if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers)) goto err;
 955         if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am,  top, powerbuf, 1, numPowers)) goto err;
 956
 957         /* If the window size is greater than 1, then calculate
 958          * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
 959          * (even powers could instead be computed as (a^(i/2))^2
 960          * to use the slight performance advantage of sqr over mul).
 961          */
 962         if (window > 1)
 963                 {
 964                 if (!BN_mod_mul_montgomery(&tmp,&am,&am,mont,ctx))      goto err;
 965                 if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2, numPowers)) goto err;
 966                 for (i=3; i<numPowers; i++)
 967                         {
 968                         /* Calculate a^i = a^(i-1) * a */
 969                         if (!BN_mod_mul_montgomery(&tmp,&am,&tmp,mont,ctx))
 970                                 goto err;
 971                         if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i, numPowers)) goto err;
 972                         }
 973                 }
 974
 975         bits--;
 976         for (wvalue=0, i=bits%window; i>=0; i--,bits--)
 977                 wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
 978         if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp,top,powerbuf,wvalue,numPowers)) goto err;
 979
 980         /* Scan the exponent one window at a time starting from the most
 981          * significant bits.
 982          */
 983         while (bits >= 0)
 984                 {
 985                 wvalue=0; /* The 'value' of the window */
 986
 987                 /* Scan the window, squaring the result as we go */
 988                 for (i=0; i<window; i++,bits--)
 989                         {
 990                         if (!BN_mod_mul_montgomery(&tmp,&tmp,&tmp,mont,ctx))    goto err;
 991                         wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
 992                         }
 993
 994                 /* Fetch the appropriate pre-computed value from the pre-buf */
 995                 if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue, numPowers)) goto err;
 996
 997                 /* Multiply the result into the intermediate result */
 998                 if (!BN_mod_mul_montgomery(&tmp,&tmp,&am,mont,ctx)) goto err;
 999                 }
1000         }
1001
1002         /* Convert the final result from montgomery to standard format */
1003 #if defined(SPARC_T4_MONT)
1004         if (OPENSSL_sparcv9cap_P[0]&(SPARCV9_VIS3|SPARCV9_PREFER_FPU))
1005                 {
1006                 am.d[0] = 1;    /* borrow am */
1007                 for (i=1;i<top;i++) am.d[i] = 0;
1008                 if (!BN_mod_mul_montgomery(rr,&tmp,&am,mont,ctx)) goto err;
1009                 }
1010         else
1011 #endif
1012         if (!BN_from_montgomery(rr,&tmp,mont,ctx)) goto err;
1013         ret=1;
1014 err:
1015         if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont);
1016         if (powerbuf!=NULL)
1017                 {
1018                 OPENSSL_cleanse(powerbuf,powerbufLen);
1019                 if (powerbufFree) OPENSSL_free(powerbufFree);
1020                 }
1021         BN_CTX_end(ctx);
1022         return(ret);
1023         }
1024
1025 int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p,
1026                          const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
1027         {
1028         BN_MONT_CTX *mont = NULL;
1029         int b, bits, ret=0;
1030         int r_is_one;
1031         BN_ULONG w, next_w;
1032         BIGNUM *d, *r, *t;
1033         BIGNUM *swap_tmp;
1034 #define BN_MOD_MUL_WORD(r, w, m) \
1035                 (BN_mul_word(r, (w)) && \
1036                 (/* BN_ucmp(r, (m)) < 0 ? 1 :*/  \
1037                         (BN_mod(t, r, m, ctx) && (swap_tmp = r, r = t, t = swap_tmp, 1))))
1038                 /* BN_MOD_MUL_WORD is only used with 'w' large,
1039                  * so the BN_ucmp test is probably more overhead
1040                  * than always using BN_mod (which uses BN_copy if
1041                  * a similar test returns true). */
1042                 /* We can use BN_mod and do not need BN_nnmod because our
1043                  * accumulator is never negative (the result of BN_mod does
1044                  * not depend on the sign of the modulus).
1045                  */
1046 #define BN_TO_MONTGOMERY_WORD(r, w, mont) \
1047                 (BN_set_word(r, (w)) && BN_to_montgomery(r, r, (mont), ctx))
1048
1049         if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
1050                 {
1051                 /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
1052                 BNerr(BN_F_BN_MOD_EXP_MONT_WORD,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
1053                 return -1;
1054                 }
1055
1056         bn_check_top(p);
1057         bn_check_top(m);
1058
1059         if (!BN_is_odd(m))
1060                 {
1061                 BNerr(BN_F_BN_MOD_EXP_MONT_WORD,BN_R_CALLED_WITH_EVEN_MODULUS);
1062                 return(0);
1063                 }
1064         if (m->top == 1)
1065                 a %= m->d[0]; /* make sure that 'a' is reduced */
1066
1067         bits = BN_num_bits(p);
1068         if (bits == 0)
1069                 {
1070                 ret = BN_one(rr);
1071                 return ret;
1072                 }
1073         if (a == 0)
1074                 {
1075                 BN_zero(rr);
1076                 ret = 1;
1077                 return ret;
1078                 }
1079
1080         BN_CTX_start(ctx);
1081         d = BN_CTX_get(ctx);
1082         r = BN_CTX_get(ctx);
1083         t = BN_CTX_get(ctx);
1084         if (d == NULL || r == NULL || t == NULL) goto err;
1085
1086         if (in_mont != NULL)
1087                 mont=in_mont;
1088         else
1089                 {
1090                 if ((mont = BN_MONT_CTX_new()) == NULL) goto err;
1091                 if (!BN_MONT_CTX_set(mont, m, ctx)) goto err;
1092                 }
1093
1094         r_is_one = 1; /* except for Montgomery factor */
1095
1096         /* bits-1 >= 0 */
1097
1098         /* The result is accumulated in the product r*w. */
1099         w = a; /* bit 'bits-1' of 'p' is always set */
1100         for (b = bits-2; b >= 0; b--)
1101                 {
1102                 /* First, square r*w. */
1103                 next_w = w*w;
1104                 if ((next_w/w) != w) /* overflow */
1105                         {
1106                         if (r_is_one)
1107                                 {
1108                                 if (!BN_TO_MONTGOMERY_WORD(r, w, mont)) goto err;
1109                                 r_is_one = 0;
1110                                 }
1111                         else
1112                                 {
1113                                 if (!BN_MOD_MUL_WORD(r, w, m)) goto err;
1114                                 }
1115                         next_w = 1;
1116                         }
1117                 w = next_w;
1118                 if (!r_is_one)
1119                         {
1120                         if (!BN_mod_mul_montgomery(r, r, r, mont, ctx)) goto err;
1121                         }
1122
1123                 /* Second, multiply r*w by 'a' if exponent bit is set. */
1124                 if (BN_is_bit_set(p, b))
1125                         {
1126                         next_w = w*a;
1127                         if ((next_w/a) != w) /* overflow */
1128                                 {
1129                                 if (r_is_one)
1130                                         {
1131                                         if (!BN_TO_MONTGOMERY_WORD(r, w, mont)) goto err;
1132                                         r_is_one = 0;
1133                                         }
1134                                 else
1135                                         {
1136                                         if (!BN_MOD_MUL_WORD(r, w, m)) goto err;
1137                                         }
1138                                 next_w = a;
1139                                 }
1140                         w = next_w;
1141                         }
1142                 }
1143
1144         /* Finally, set r:=r*w. */
1145         if (w != 1)
1146                 {
1147                 if (r_is_one)
1148                         {
1149                         if (!BN_TO_MONTGOMERY_WORD(r, w, mont)) goto err;
1150                         r_is_one = 0;
1151                         }
1152                 else
1153                         {
1154                         if (!BN_MOD_MUL_WORD(r, w, m)) goto err;
1155                         }
1156                 }
1157
1158         if (r_is_one) /* can happen only if a == 1*/
1159                 {
1160                 if (!BN_one(rr)) goto err;
1161                 }
1162         else
1163                 {
1164                 if (!BN_from_montgomery(rr, r, mont, ctx)) goto err;
1165                 }
1166         ret = 1;
1167 err:
1168         if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont);
1169         BN_CTX_end(ctx);
1170         bn_check_top(rr);
1171         return(ret);
1172         }
1173
1174
1175 /* The old fallback, simple version :-) */
1176 int BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
1177                 const BIGNUM *m, BN_CTX *ctx)
1178         {
1179         int i,j,bits,ret=0,wstart,wend,window,wvalue;
1180         int start=1;
1181         BIGNUM *d;
1182         /* Table of variables obtained from 'ctx' */
1183         BIGNUM *val[TABLE_SIZE];
1184
1185         if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
1186                 {
1187                 /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
1188                 BNerr(BN_F_BN_MOD_EXP_SIMPLE,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
1189                 return -1;
1190                 }
1191
1192         bits=BN_num_bits(p);
1193
1194         if (bits == 0)
1195                 {
1196                 ret = BN_one(r);
1197                 return ret;
1198                 }
1199
1200         BN_CTX_start(ctx);
1201         d = BN_CTX_get(ctx);
1202         val[0] = BN_CTX_get(ctx);
1203         if(!d || !val[0]) goto err;
1204
1205         if (!BN_nnmod(val[0],a,m,ctx)) goto err;                /* 1 */
1206         if (BN_is_zero(val[0]))
1207                 {
1208                 BN_zero(r);
1209                 ret = 1;
1210                 goto err;
1211                 }
1212
1213         window = BN_window_bits_for_exponent_size(bits);
1214         if (window > 1)
1215                 {
1216                 if (!BN_mod_mul(d,val[0],val[0],m,ctx))
1217                         goto err;                               /* 2 */
1218                 j=1<<(window-1);
1219                 for (i=1; i<j; i++)
1220                         {
1221                         if(((val[i] = BN_CTX_get(ctx)) == NULL) ||
1222                                         !BN_mod_mul(val[i],val[i-1],d,m,ctx))
1223                                 goto err;
1224                         }
1225                 }
1226
1227         start=1;        /* This is used to avoid multiplication etc
1228                          * when there is only the value '1' in the
1229                          * buffer. */
1230         wvalue=0;       /* The 'value' of the window */
1231         wstart=bits-1;  /* The top bit of the window */
1232         wend=0;         /* The bottom bit of the window */
1233
1234         if (!BN_one(r)) goto err;
1235
1236         for (;;)
1237                 {
1238                 if (BN_is_bit_set(p,wstart) == 0)
1239                         {
1240                         if (!start)
1241                                 if (!BN_mod_mul(r,r,r,m,ctx))
1242                                 goto err;
1243                         if (wstart == 0) break;
1244                         wstart--;
1245                         continue;
1246                         }
1247                 /* We now have wstart on a 'set' bit, we now need to work out
1248                  * how bit a window to do.  To do this we need to scan
1249                  * forward until the last set bit before the end of the
1250                  * window */
1251                 j=wstart;
1252                 wvalue=1;
1253                 wend=0;
1254                 for (i=1; i<window; i++)
1255                         {
1256                         if (wstart-i < 0) break;
1257                         if (BN_is_bit_set(p,wstart-i))
1258                                 {
1259                                 wvalue<<=(i-wend);
1260                                 wvalue|=1;
1261                                 wend=i;
1262                                 }
1263                         }
1264
1265                 /* wend is the size of the current window */
1266                 j=wend+1;
1267                 /* add the 'bytes above' */
1268                 if (!start)
1269                         for (i=0; i<j; i++)
1270                                 {
1271                                 if (!BN_mod_mul(r,r,r,m,ctx))
1272                                         goto err;
1273                                 }
1274
1275                 /* wvalue will be an odd number < 2^window */
1276                 if (!BN_mod_mul(r,r,val[wvalue>>1],m,ctx))
1277                         goto err;
1278
1279                 /* move the 'window' down further */
1280                 wstart-=wend+1;
1281                 wvalue=0;
1282                 start=0;
1283                 if (wstart < 0) break;
1284                 }
1285         ret=1;
1286 err:
1287         BN_CTX_end(ctx);
1288         bn_check_top(r);
1289         return(ret);
1290         }