r->top = max;
n0 = mont->n0[0];
+ /*
+ * Add multiples of |n| to |r| until R = 2^(nl * BN_BITS2) divides it. On
+ * input, we had |r| < |n| * R, so now |r| < 2 * |n| * R. Note that |r|
+ * includes |carry| which is stored separately.
+ */
for (carry = 0, i = 0; i < nl; i++, rp++) {
v = bn_mul_add_words(rp, np, nl, (rp[0] * n0) & BN_MASK2);
v = (v + carry + rp[nl]) & BN_MASK2;
ret->neg = r->neg;
rp = ret->d;
- ap = &(r->d[nl]);
-# define BRANCH_FREE 1
-# if BRANCH_FREE
- {
- BN_ULONG *nrp;
- size_t m;
+ /*
+ * Shift |nl| words to divide by R. We have |ap| < 2 * |n|. Note that |ap|
+ * includes |carry| which is stored separately.
+ */
+ ap = &(r->d[nl]);
- v = bn_sub_words(rp, ap, np, nl) - carry;
- /*
- * if subtraction result is real, then trick unconditional memcpy
- * below to perform in-place "refresh" instead of actual copy.
- */
- m = (0 - (size_t)v);
- nrp =
- (BN_ULONG *)(((PTR_SIZE_INT) rp & ~m) | ((PTR_SIZE_INT) ap & m));
-
- for (i = 0, nl -= 4; i < nl; i += 4) {
- BN_ULONG t1, t2, t3, t4;
-
- t1 = nrp[i + 0];
- t2 = nrp[i + 1];
- t3 = nrp[i + 2];
- ap[i + 0] = 0;
- t4 = nrp[i + 3];
- ap[i + 1] = 0;
- rp[i + 0] = t1;
- ap[i + 2] = 0;
- rp[i + 1] = t2;
- ap[i + 3] = 0;
- rp[i + 2] = t3;
- rp[i + 3] = t4;
- }
- for (nl += 4; i < nl; i++)
- rp[i] = nrp[i], ap[i] = 0;
+ /*
+ * |v| is one if |ap| - |np| underflowed or zero if it did not. Note |v|
+ * cannot be -1. That would imply the subtraction did not fit in |nl| words,
+ * and we know at most one subtraction is needed.
+ */
+ v = bn_sub_words(rp, ap, np, nl) - carry;
+ v = 0 - v;
+ for (i = 0; i < nl; i++) {
+ rp[i] = (v & ap[i]) | (~v & rp[i]);
+ ap[i] = 0;
}
-# else
- if (bn_sub_words(rp, ap, np, nl) - carry)
- memcpy(rp, ap, nl * sizeof(BN_ULONG));
-# endif
bn_correct_top(r);
bn_correct_top(ret);
bn_check_top(ret);