-int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
- const BIGNUM *m, BN_CTX *ctx)
- {
- int i,j,bits,ret=0,wstart,wend,window,wvalue;
- int start=1,ts=0;
- BIGNUM *aa;
- BIGNUM val[TABLE_SIZE];
- BN_RECP_CTX recp;
-
- bits=BN_num_bits(p);
-
- if (bits == 0)
- {
- BN_one(r);
- return(1);
- }
-
- BN_CTX_start(ctx);
- if ((aa = BN_CTX_get(ctx)) == NULL) goto err;
-
- BN_RECP_CTX_init(&recp);
- if (BN_RECP_CTX_set(&recp,m,ctx) <= 0) goto err;
-
- BN_init(&(val[0]));
- ts=1;
-
- if (!BN_mod(&(val[0]),a,m,ctx)) goto err; /* 1 */
-
- window = BN_window_bits_for_exponent_size(bits);
- if (window > 1)
- {
- if (!BN_mod_mul_reciprocal(aa,&(val[0]),&(val[0]),&recp,ctx))
- goto err; /* 2 */
- j=1<<(window-1);
- for (i=1; i<j; i++)
- {
- BN_init(&val[i]);
- if (!BN_mod_mul_reciprocal(&(val[i]),&(val[i-1]),aa,&recp,ctx))
- goto err;
- }
- ts=i;
- }
-
- start=1; /* This is used to avoid multiplication etc
- * when there is only the value '1' in the
- * buffer. */
- wvalue=0; /* The 'value' of the window */
- wstart=bits-1; /* The top bit of the window */
- wend=0; /* The bottom bit of the window */
-
- if (!BN_one(r)) goto err;
-
- for (;;)
- {
- if (BN_is_bit_set(p,wstart) == 0)
- {
- if (!start)
- if (!BN_mod_mul_reciprocal(r,r,r,&recp,ctx))
- goto err;
- if (wstart == 0) break;
- wstart--;
- continue;
- }
- /* We now have wstart on a 'set' bit, we now need to work out
- * how bit a window to do. To do this we need to scan
- * forward until the last set bit before the end of the
- * window */
- j=wstart;
- wvalue=1;
- wend=0;
- for (i=1; i<window; i++)
- {
- if (wstart-i < 0) break;
- if (BN_is_bit_set(p,wstart-i))
- {
- wvalue<<=(i-wend);
- wvalue|=1;
- wend=i;
- }
- }
-
- /* wend is the size of the current window */
- j=wend+1;
- /* add the 'bytes above' */
- if (!start)
- for (i=0; i<j; i++)
- {
- if (!BN_mod_mul_reciprocal(r,r,r,&recp,ctx))
- goto err;
- }
-
- /* wvalue will be an odd number < 2^window */
- if (!BN_mod_mul_reciprocal(r,r,&(val[wvalue>>1]),&recp,ctx))
- goto err;
-
- /* move the 'window' down further */
- wstart-=wend+1;
- wvalue=0;
- start=0;
- if (wstart < 0) break;
- }
- ret=1;
-err:
- BN_CTX_end(ctx);
- for (i=0; i<ts; i++)
- BN_clear_free(&(val[i]));
- BN_RECP_CTX_free(&recp);
- return(ret);
- }
-
-
-int BN_mod_exp_mont(BIGNUM *rr, BIGNUM *a, const BIGNUM *p,
- const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
- {
- int i,j,bits,ret=0,wstart,wend,window,wvalue;
- int start=1,ts=0;
- BIGNUM *d,*r;
- BIGNUM *aa;
- BIGNUM val[TABLE_SIZE];
- BN_MONT_CTX *mont=NULL;
-
- bn_check_top(a);
- bn_check_top(p);
- bn_check_top(m);
-
-#ifdef ATALLA
- if(!tried_atalla && BN_mod_exp_atalla(rr,a,p,m))
- return 1;
-/* If it fails, try the other methods */
+ bn_flip_t4(tmp.d, tmp.d, top);
+ top *= 2;
+ /* back to 32-bit domain */
+ tmp.top = top;
+ bn_correct_top(&tmp);
+ OPENSSL_cleanse(np, top * sizeof(BN_ULONG));
+ } else
+#endif
+#if defined(OPENSSL_BN_ASM_MONT5)
+ if (window == 5 && top > 1) {
+ /*
+ * This optimization uses ideas from http://eprint.iacr.org/2011/239,
+ * specifically optimization of cache-timing attack countermeasures
+ * and pre-computation optimization.
+ */
+
+ /*
+ * Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
+ * 512-bit RSA is hardly relevant, we omit it to spare size...
+ */
+ void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *table, const BN_ULONG *np,
+ const BN_ULONG *n0, int num, int power);
+ void bn_scatter5(const BN_ULONG *inp, size_t num,
+ void *table, size_t power);
+ void bn_gather5(BN_ULONG *out, size_t num, void *table, size_t power);
+ void bn_power5(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *table, const BN_ULONG *np,
+ const BN_ULONG *n0, int num, int power);
+ int bn_get_bits5(const BN_ULONG *ap, int off);
+ int bn_from_montgomery(BN_ULONG *rp, const BN_ULONG *ap,
+ const BN_ULONG *not_used, const BN_ULONG *np,
+ const BN_ULONG *n0, int num);
+
+ BN_ULONG *n0 = mont->n0, *np;
+
+ /*
+ * BN_to_montgomery can contaminate words above .top [in
+ * BN_DEBUG[_DEBUG] build]...
+ */
+ for (i = am.top; i < top; i++)
+ am.d[i] = 0;
+ for (i = tmp.top; i < top; i++)
+ tmp.d[i] = 0;
+
+ /*
+ * copy mont->N.d[] to improve cache locality
+ */
+ for (np = am.d + top, i = 0; i < top; i++)
+ np[i] = mont->N.d[i];
+
+ bn_scatter5(tmp.d, top, powerbuf, 0);
+ bn_scatter5(am.d, am.top, powerbuf, 1);
+ bn_mul_mont(tmp.d, am.d, am.d, np, n0, top);
+ bn_scatter5(tmp.d, top, powerbuf, 2);
+
+# if 0
+ for (i = 3; i < 32; i++) {
+ /* Calculate a^i = a^(i-1) * a */
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
+ bn_scatter5(tmp.d, top, powerbuf, i);
+ }
+# else
+ /* same as above, but uses squaring for 1/2 of operations */
+ for (i = 4; i < 32; i *= 2) {
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_scatter5(tmp.d, top, powerbuf, i);
+ }
+ for (i = 3; i < 8; i += 2) {
+ int j;
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
+ bn_scatter5(tmp.d, top, powerbuf, i);
+ for (j = 2 * i; j < 32; j *= 2) {
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_scatter5(tmp.d, top, powerbuf, j);
+ }
+ }
+ for (; i < 16; i += 2) {
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
+ bn_scatter5(tmp.d, top, powerbuf, i);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_scatter5(tmp.d, top, powerbuf, 2 * i);
+ }
+ for (; i < 32; i += 2) {
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
+ bn_scatter5(tmp.d, top, powerbuf, i);
+ }
+# endif
+ bits--;
+ for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--)
+ wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
+ bn_gather5(tmp.d, top, powerbuf, wvalue);
+
+ /*
+ * Scan the exponent one window at a time starting from the most
+ * significant bits.
+ */
+ if (top & 7)
+ while (bits >= 0) {
+ for (wvalue = 0, i = 0; i < 5; i++, bits--)
+ wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
+
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top,
+ wvalue);
+ } else {
+ while (bits >= 0) {
+ wvalue = bn_get_bits5(p->d, bits - 4);
+ bits -= 5;
+ bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
+ }
+ }
+
+ ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top);
+ tmp.top = top;
+ bn_correct_top(&tmp);
+ if (ret) {
+ if (!BN_copy(rr, &tmp))
+ ret = 0;
+ goto err; /* non-zero ret means it's not error */
+ }
+ } else