-
- /* prepare a^1 in Montgomery domain */
- if (a->neg || BN_ucmp(a,m) >= 0)
- {
- if (!BN_mod(&am,a,m,ctx)) goto err;
- if (!BN_to_montgomery(&am,&am,mont,ctx)) goto err;
- }
- else if (!BN_to_montgomery(&am,a,mont,ctx)) goto err;
-
-#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
- if (t4)
- {
- typedef int (*bn_pwr5_mont_f)(BN_ULONG *tp,const BN_ULONG *np,
- const BN_ULONG *n0,const void *table,int power,int bits);
- int bn_pwr5_mont_t4_8(BN_ULONG *tp,const BN_ULONG *np,
- const BN_ULONG *n0,const void *table,int power,int bits);
- int bn_pwr5_mont_t4_16(BN_ULONG *tp,const BN_ULONG *np,
- const BN_ULONG *n0,const void *table,int power,int bits);
- int bn_pwr5_mont_t4_24(BN_ULONG *tp,const BN_ULONG *np,
- const BN_ULONG *n0,const void *table,int power,int bits);
- int bn_pwr5_mont_t4_32(BN_ULONG *tp,const BN_ULONG *np,
- const BN_ULONG *n0,const void *table,int power,int bits);
- static const bn_pwr5_mont_f pwr5_funcs[4] = {
- bn_pwr5_mont_t4_8, bn_pwr5_mont_t4_16,
- bn_pwr5_mont_t4_24, bn_pwr5_mont_t4_32 };
- bn_pwr5_mont_f pwr5_worker = pwr5_funcs[top/16-1];
-
- typedef int (*bn_mul_mont_f)(BN_ULONG *rp,const BN_ULONG *ap,
- const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
- int bn_mul_mont_t4_8(BN_ULONG *rp,const BN_ULONG *ap,
- const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
- int bn_mul_mont_t4_16(BN_ULONG *rp,const BN_ULONG *ap,
- const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
- int bn_mul_mont_t4_24(BN_ULONG *rp,const BN_ULONG *ap,
- const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
- int bn_mul_mont_t4_32(BN_ULONG *rp,const BN_ULONG *ap,
- const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
- static const bn_mul_mont_f mul_funcs[4] = {
- bn_mul_mont_t4_8, bn_mul_mont_t4_16,
- bn_mul_mont_t4_24, bn_mul_mont_t4_32 };
- bn_mul_mont_f mul_worker = mul_funcs[top/16-1];
-
- void bn_mul_mont_vis3(BN_ULONG *rp,const BN_ULONG *ap,
- const void *bp,const BN_ULONG *np,
- const BN_ULONG *n0,int num);
- void bn_mul_mont_t4(BN_ULONG *rp,const BN_ULONG *ap,
- const void *bp,const BN_ULONG *np,
- const BN_ULONG *n0,int num);
- void bn_mul_mont_gather5_t4(BN_ULONG *rp,const BN_ULONG *ap,
- const void *table,const BN_ULONG *np,
- const BN_ULONG *n0,int num,int power);
- void bn_flip_n_scatter5_t4(const BN_ULONG *inp,size_t num,
- void *table,size_t power);
- void bn_gather5_t4(BN_ULONG *out,size_t num,
- void *table,size_t power);
- void bn_flip_t4(BN_ULONG *dst,BN_ULONG *src,size_t num);
-
- BN_ULONG *np=mont->N.d, *n0=mont->n0;
- int stride = 5*(6-(top/16-1)); /* multiple of 5, but less than 32 */
-
- /* BN_to_montgomery can contaminate words above .top
- * [in BN_DEBUG[_DEBUG] build]... */
- for (i=am.top; i<top; i++) am.d[i]=0;
- for (i=tmp.top; i<top; i++) tmp.d[i]=0;
-
- bn_flip_n_scatter5_t4(tmp.d,top,powerbuf,0);
- bn_flip_n_scatter5_t4(am.d,top,powerbuf,1);
- if (!(*mul_worker)(tmp.d,am.d,am.d,np,n0) &&
- !(*mul_worker)(tmp.d,am.d,am.d,np,n0))
- bn_mul_mont_vis3(tmp.d,am.d,am.d,np,n0,top);
- bn_flip_n_scatter5_t4(tmp.d,top,powerbuf,2);
-
- for (i=3; i<32; i++)
- {
- /* Calculate a^i = a^(i-1) * a */
- if (!(*mul_worker)(tmp.d,tmp.d,am.d,np,n0) &&
- !(*mul_worker)(tmp.d,tmp.d,am.d,np,n0))
- bn_mul_mont_vis3(tmp.d,tmp.d,am.d,np,n0,top);
- bn_flip_n_scatter5_t4(tmp.d,top,powerbuf,i);
- }
-
- /* switch to 64-bit domain */
- np = alloca(top*sizeof(BN_ULONG));
- top /= 2;
- bn_flip_t4(np,mont->N.d,top);
-
- bits--;
- for (wvalue=0, i=bits%5; i>=0; i--,bits--)
- wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
- bn_gather5_t4(tmp.d,top,powerbuf,wvalue);
-
- /* Scan the exponent one window at a time starting from the most
- * significant bits.
- */
- while (bits >= 0)
- {
- if (bits < stride) stride = bits+1;
- bits -= stride;
- wvalue = bn_get_bits(p,bits+1);
-
- if ((*pwr5_worker)(tmp.d,np,n0,powerbuf,wvalue,stride)) continue;
- /* retry once and fall back */
- if ((*pwr5_worker)(tmp.d,np,n0,powerbuf,wvalue,stride)) continue;
-
- bits += stride-5;
- wvalue >>= stride-5;
- wvalue &= 31;
- bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
- bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
- bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
- bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
- bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
- bn_mul_mont_gather5_t4(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue);
- }
-
- bn_flip_t4(tmp.d,tmp.d,top);
- top *= 2;
- /* back to 32-bit domain */
- tmp.top=top;
- bn_correct_top(&tmp);
- OPENSSL_cleanse(np,top*sizeof(BN_ULONG));
- }
- else
+ if (!BN_to_montgomery(&tmp, BN_value_one(), mont, ctx))
+ goto err;
+
+ /* prepare a^1 in Montgomery domain */
+ if (a->neg || BN_ucmp(a, m) >= 0) {
+ if (!BN_mod(&am, a, m, ctx))
+ goto err;
+ if (!BN_to_montgomery(&am, &am, mont, ctx))
+ goto err;
+ } else if (!BN_to_montgomery(&am, a, mont, ctx))
+ goto err;
+
+#if defined(SPARC_T4_MONT)
+ if (t4) {
+ typedef int (*bn_pwr5_mont_f) (BN_ULONG *tp, const BN_ULONG *np,
+ const BN_ULONG *n0, const void *table,
+ int power, int bits);
+ int bn_pwr5_mont_t4_8(BN_ULONG *tp, const BN_ULONG *np,
+ const BN_ULONG *n0, const void *table,
+ int power, int bits);
+ int bn_pwr5_mont_t4_16(BN_ULONG *tp, const BN_ULONG *np,
+ const BN_ULONG *n0, const void *table,
+ int power, int bits);
+ int bn_pwr5_mont_t4_24(BN_ULONG *tp, const BN_ULONG *np,
+ const BN_ULONG *n0, const void *table,
+ int power, int bits);
+ int bn_pwr5_mont_t4_32(BN_ULONG *tp, const BN_ULONG *np,
+ const BN_ULONG *n0, const void *table,
+ int power, int bits);
+ static const bn_pwr5_mont_f pwr5_funcs[4] = {
+ bn_pwr5_mont_t4_8, bn_pwr5_mont_t4_16,
+ bn_pwr5_mont_t4_24, bn_pwr5_mont_t4_32
+ };
+ bn_pwr5_mont_f pwr5_worker = pwr5_funcs[top / 16 - 1];
+
+ typedef int (*bn_mul_mont_f) (BN_ULONG *rp, const BN_ULONG *ap,
+ const void *bp, const BN_ULONG *np,
+ const BN_ULONG *n0);
+ int bn_mul_mont_t4_8(BN_ULONG *rp, const BN_ULONG *ap, const void *bp,
+ const BN_ULONG *np, const BN_ULONG *n0);
+ int bn_mul_mont_t4_16(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *bp, const BN_ULONG *np,
+ const BN_ULONG *n0);
+ int bn_mul_mont_t4_24(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *bp, const BN_ULONG *np,
+ const BN_ULONG *n0);
+ int bn_mul_mont_t4_32(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *bp, const BN_ULONG *np,
+ const BN_ULONG *n0);
+ static const bn_mul_mont_f mul_funcs[4] = {
+ bn_mul_mont_t4_8, bn_mul_mont_t4_16,
+ bn_mul_mont_t4_24, bn_mul_mont_t4_32
+ };
+ bn_mul_mont_f mul_worker = mul_funcs[top / 16 - 1];
+
+ void bn_mul_mont_vis3(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *bp, const BN_ULONG *np,
+ const BN_ULONG *n0, int num);
+ void bn_mul_mont_t4(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *bp, const BN_ULONG *np,
+ const BN_ULONG *n0, int num);
+ void bn_mul_mont_gather5_t4(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *table, const BN_ULONG *np,
+ const BN_ULONG *n0, int num, int power);
+ void bn_flip_n_scatter5_t4(const BN_ULONG *inp, size_t num,
+ void *table, size_t power);
+ void bn_gather5_t4(BN_ULONG *out, size_t num,
+ void *table, size_t power);
+ void bn_flip_t4(BN_ULONG *dst, BN_ULONG *src, size_t num);
+
+ BN_ULONG *np = mont->N.d, *n0 = mont->n0;
+ int stride = 5 * (6 - (top / 16 - 1)); /* multiple of 5, but less
+ * than 32 */
+
+ /*
+ * BN_to_montgomery can contaminate words above .top [in
+ * BN_DEBUG[_DEBUG] build]...
+ */
+ for (i = am.top; i < top; i++)
+ am.d[i] = 0;
+ for (i = tmp.top; i < top; i++)
+ tmp.d[i] = 0;
+
+ bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, 0);
+ bn_flip_n_scatter5_t4(am.d, top, powerbuf, 1);
+ if (!(*mul_worker) (tmp.d, am.d, am.d, np, n0) &&
+ !(*mul_worker) (tmp.d, am.d, am.d, np, n0))
+ bn_mul_mont_vis3(tmp.d, am.d, am.d, np, n0, top);
+ bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, 2);
+
+ for (i = 3; i < 32; i++) {
+ /* Calculate a^i = a^(i-1) * a */
+ if (!(*mul_worker) (tmp.d, tmp.d, am.d, np, n0) &&
+ !(*mul_worker) (tmp.d, tmp.d, am.d, np, n0))
+ bn_mul_mont_vis3(tmp.d, tmp.d, am.d, np, n0, top);
+ bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, i);
+ }
+
+ /* switch to 64-bit domain */
+ np = alloca(top * sizeof(BN_ULONG));
+ top /= 2;
+ bn_flip_t4(np, mont->N.d, top);
+
+ bits--;
+ for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--)
+ wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
+ bn_gather5_t4(tmp.d, top, powerbuf, wvalue);
+
+ /*
+ * Scan the exponent one window at a time starting from the most
+ * significant bits.
+ */
+ while (bits >= 0) {
+ if (bits < stride)
+ stride = bits + 1;
+ bits -= stride;
+ wvalue = bn_get_bits(p, bits + 1);
+
+ if ((*pwr5_worker) (tmp.d, np, n0, powerbuf, wvalue, stride))
+ continue;
+ /* retry once and fall back */
+ if ((*pwr5_worker) (tmp.d, np, n0, powerbuf, wvalue, stride))
+ continue;
+
+ bits += stride - 5;
+ wvalue >>= stride - 5;
+ wvalue &= 31;
+ bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont_gather5_t4(tmp.d, tmp.d, powerbuf, np, n0, top,
+ wvalue);
+ }
+
+ bn_flip_t4(tmp.d, tmp.d, top);
+ top *= 2;
+ /* back to 32-bit domain */
+ tmp.top = top;
+ bn_correct_top(&tmp);
+ OPENSSL_cleanse(np, top * sizeof(BN_ULONG));
+ } else