# define ARCH_WORD_BITS 32
-static __inline__ __attribute((always_inline, unused))
-uint32_t word_is_zero(uint32_t a)
+static ossl_inline uint32_t word_is_zero(uint32_t a)
{
/* let's hope the compiler isn't clever enough to optimize this. */
return (((uint64_t)a) - 1) >> 32;
}
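
The constant-time zero test above works by unsigned borrow: only a == 0 makes the 64-bit subtraction wrap, setting the high 32 bits. A standalone restatement with the two cases spelled out (illustrative sketch, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    static uint32_t word_is_zero_demo(uint32_t a)
    {
        /* a == 0: (uint64_t)0 - 1 wraps to 2^64 - 1, high word all-ones */
        /* a != 0: a - 1 fits in 32 bits, high word zero */
        return (uint32_t)((((uint64_t)a) - 1) >> 32);
    }

    int main(void)
    {
        assert(word_is_zero_demo(0) == 0xFFFFFFFFu);
        assert(word_is_zero_demo(1) == 0);
        assert(word_is_zero_demo(0xFFFFFFFFu) == 0);
        return 0;
    }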
-static __inline__ __attribute((always_inline, unused))
-uint64_t widemul(uint32_t a, uint32_t b)
+static ossl_inline uint64_t widemul(uint32_t a, uint32_t b)
{
return ((uint64_t)a) * b;
}
# define FOR_LIMB(_i,_start,_end,_x) do { for (_i=_start; _i<_end; _i++) _x; } while (0)
#endif
-void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
+void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs)
{
const uint32_t *a = as->limb, *b = bs->limb;
uint32_t *c = cs->limb;
c[1] += ((uint32_t)(accum1));
}
-void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
+void gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b)
{
const uint32_t *a = as->limb;
uint32_t *c = cs->limb;
});
accum0 += accum8 + c[8];
- c[8] = accum0 & mask;
- c[9] += accum0 >> 28;
+ c[8] = ((uint32_t)accum0) & mask;
+ c[9] += (uint32_t)(accum0 >> 28);
accum8 += c[0];
- c[0] = accum8 & mask;
- c[1] += accum8 >> 28;
+ c[0] = ((uint32_t)accum8) & mask;
+ c[1] += (uint32_t)(accum8 >> 28);
}
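
For context: the 32-bit arch stores GF(2^448 - 2^224 - 1) elements in sixteen 28-bit limbs, so mask above is (1 << 28) - 1 and every carry moves 28 bits into the next limb. A minimal sketch of one carry step under that assumption (carry_step is a hypothetical helper, not in the patch):

    #include <stdint.h>

    #define LIMB_BITS 28
    #define LIMB_MASK28 ((1u << LIMB_BITS) - 1)    /* 0x0FFFFFFF */

    /* Keep the low 28 bits in limb i and push the rest into limb i + 1,
     * mirroring the c[8]/c[9] and c[0]/c[1] steps in the hunk above. */
    static void carry_step(uint32_t *limb, unsigned int i, uint64_t accum)
    {
        limb[i] = ((uint32_t)accum) & LIMB_MASK28;
        limb[i + 1] += (uint32_t)(accum >> LIMB_BITS);
    }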
-void gf_sqr(gf_s * __restrict__ cs, const gf as)
+void gf_sqr(gf_s * RESTRICT cs, const gf as)
{
gf_mul(cs, as, as); /* Performs better with a dedicated square */
}
* Instead, we're putting our trust in the loop unroller and unswitcher.
*/
+# if defined(__GNUC__) || defined(__clang__)
/*
* Unaligned big (vector?) register.
*/
typedef struct {
big_register_t unaligned;
} __attribute__ ((packed)) unaligned_br_t;
/*
* Unaligned word register, for architectures where that matters.
*/
typedef struct {
word_t unaligned;
} __attribute__ ((packed)) unaligned_word_t;
+
+# define HAS_UNALIGNED_STRUCTS
+# define RESTRICT __restrict__
+# else
+# define RESTRICT
+# endif
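
Where __attribute__((packed)) is unavailable, the new # else branches below trade the single unaligned register-wide XOR for a byte loop; the mask alone decides whether anything changes, so both paths stay constant-time. The fallback pattern, reduced to a self-contained sketch (cond_swap_bytes is a hypothetical name):

    #include <stddef.h>

    /* Swap len bytes of a and b iff doswap is 0xFF; no-op iff it is 0x00. */
    static void cond_swap_bytes(unsigned char *a, unsigned char *b,
                                size_t len, unsigned char doswap)
    {
        size_t i;

        for (i = 0; i < len; i++) {
            unsigned char x = (unsigned char)((a[i] ^ b[i]) & doswap);

            a[i] ^= x;
            b[i] ^= x;
        }
    }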
/*
* Constant-time conditional swap.
* *a and *b must not alias. Also, they must be at least as aligned
* as their sizes, if the CPU cares about that sort of thing.
*/
-static ossl_inline void constant_time_cond_swap(void *__restrict__ a_,
- void *__restrict__ b_,
+static ossl_inline void constant_time_cond_swap(void *RESTRICT a_,
+ void *RESTRICT b_,
word_t elem_bytes,
mask_t doswap)
{
word_t k;
unsigned char *a = (unsigned char *)a_;
unsigned char *b = (unsigned char *)b_;
-
big_register_t br_mask = br_set_to_mask(doswap);
+# ifndef HAS_UNALIGNED_STRUCTS
+ unsigned char doswapc = (unsigned char)(doswap & 0xFF);
+# endif
+
for (k = 0; k <= elem_bytes - sizeof(big_register_t);
k += sizeof(big_register_t)) {
if (elem_bytes % sizeof(big_register_t)) {
/* unaligned */
+# ifdef HAS_UNALIGNED_STRUCTS
big_register_t xor = ((unaligned_br_t *) (&a[k]))->unaligned
^ ((unaligned_br_t *) (&b[k]))->unaligned;
xor &= br_mask;
((unaligned_br_t *)(&a[k]))->unaligned ^= xor;
((unaligned_br_t *)(&b[k]))->unaligned ^= xor;
+# else
+ size_t i;
+
+ for (i = 0; i < sizeof(big_register_t); i++) {
+ unsigned char xor = a[k + i] ^ b[k + i];
+
+ xor &= doswapc;
+ a[k + i] ^= xor;
+ b[k + i] ^= xor;
+ }
+# endif
} else {
/* aligned */
big_register_t xor = *((big_register_t *) (&a[k]))
for (; k <= elem_bytes - sizeof(word_t); k += sizeof(word_t)) {
if (elem_bytes % sizeof(word_t)) {
/* unaligned */
+# ifdef HAS_UNALIGNED_STRUCTS
word_t xor = ((unaligned_word_t *)(&a[k]))->unaligned
^ ((unaligned_word_t *)(&b[k]))->unaligned;
xor &= doswap;
((unaligned_word_t *)(&a[k]))->unaligned ^= xor;
((unaligned_word_t *)(&b[k]))->unaligned ^= xor;
+# else
+ size_t i;
+
+ for (i = 0; i < sizeof(word_t); i++) {
+ unsigned char xor = a[k + i] ^ b[k + i];
+
+ xor &= doswapc;
+ a[k + i] ^= xor;
+ b[k + i] ^= xor;
+ }
+# endif
} else {
/* aligned */
word_t xor = *((word_t *) (&a[k])) ^ *((word_t *) (&b[k]));
*
* The table and output must not alias.
*/
-static ossl_inline void constant_time_lookup(void *__restrict__ out_,
+static ossl_inline void constant_time_lookup(void *RESTRICT out_,
const void *table_,
word_t elem_bytes,
word_t n_table,
unsigned char *out = (unsigned char *)out_;
const unsigned char *table = (const unsigned char *)table_;
word_t j, k;
+# ifndef HAS_UNALIGNED_STRUCTS
+ unsigned char maskc;
+# endif
memset(out, 0, elem_bytes);
for (j = 0; j < n_table; j++, big_i -= big_one) {
big_register_t br_mask = br_is_zero(big_i);
word_t mask;
+# ifndef HAS_UNALIGNED_STRUCTS
+ maskc = (unsigned char)br_mask;
+# endif
+
for (k = 0; k <= elem_bytes - sizeof(big_register_t);
k += sizeof(big_register_t)) {
if (elem_bytes % sizeof(big_register_t)) {
/* unaligned */
+# ifdef HAS_UNALIGNED_STRUCTS
((unaligned_br_t *)(out + k))->unaligned |=
br_mask
& ((const unaligned_br_t *)
(&table[k + j * elem_bytes]))->unaligned;
+# else
+ size_t i;
+
+ for (i = 0; i < sizeof(big_register_t); i++)
+ out[k + i] |= maskc & table[k + (j * elem_bytes) + i];
+# endif
} else {
/* aligned */
*(big_register_t *)(out + k) |=
}
mask = word_is_zero(idx ^ j);
+# ifndef HAS_UNALIGNED_STRUCTS
+ maskc = (unsigned char)mask;
+# endif
if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) {
for (; k <= elem_bytes - sizeof(word_t); k += sizeof(word_t)) {
if (elem_bytes % sizeof(word_t)) {
/* input unaligned, output aligned */
+# ifdef HAS_UNALIGNED_STRUCTS
*(word_t *)(out + k) |=
mask
& ((const unaligned_word_t *)
(&table[k + j * elem_bytes]))->unaligned;
+# else
+ size_t i;
+
+ for (i = 0; i < sizeof(word_t); i++)
+ out[k + i] |= maskc & table[k + (j * elem_bytes) + i];
+# endif
} else {
/* aligned */
*(word_t *)(out + k) |=
const unsigned char *bFalse = (const unsigned char *)bFalse_;
word_t k;
big_register_t br_mask = br_set_to_mask(mask);
+# ifndef HAS_UNALIGNED_STRUCTS
+ unsigned char maskc = (unsigned char)mask;
+# endif
alignment_bytes |= elem_bytes;
k += sizeof(big_register_t)) {
if (alignment_bytes % sizeof(big_register_t)) {
/* unaligned */
+# ifdef HAS_UNALIGNED_STRUCTS
((unaligned_br_t *)(&a[k]))->unaligned =
(br_mask & ((const unaligned_br_t *)(&bTrue[k]))->unaligned)
| (~br_mask
& ((const unaligned_br_t *)(&bFalse[k]))->unaligned);
+# else
+ size_t i;
+
+ for (i = 0; i < sizeof(big_register_t); i++)
+ a[k + i] = (maskc & bTrue[k + i]) | (~maskc & bFalse[k + i]);
+# endif
} else {
/* aligned */
*(big_register_t *) (a + k) =
for (; k <= elem_bytes - sizeof(word_t); k += sizeof(word_t)) {
if (alignment_bytes % sizeof(word_t)) {
/* unaligned */
+# ifdef HAS_UNALIGNED_STRUCTS
((unaligned_word_t *) (&a[k]))->unaligned =
(mask & ((const unaligned_word_t *)(&bTrue[k]))->unaligned)
| (~mask &
((const unaligned_word_t *)(&bFalse[k]))->unaligned);
+# else
+ size_t i;
+
+ for (i = 0; i < sizeof(word_t); i++)
+ a[k + i] = (maskc & bTrue[k + i]) | (~maskc & bFalse[k + i]);
+# endif
} else {
/* aligned */
*(word_t *) (a + k) = (mask & *(const word_t *)(&bTrue[k]))
}
}
+# undef RESTRICT
+# undef HAS_UNALIGNED_STRUCTS
+
#endif /* __CONSTANT_TIME_H__ */
return mask_to_bool(out);
}
-static ossl_inline void constant_time_lookup_niels(niels_s * __restrict__ ni,
+static ossl_inline void constant_time_lookup_niels(niels_s * RESTRICT ni,
const niels_t * table,
int nelts, int idx)
{
sb = -1;
k_t = (sb >> (t % 8)) & 1;
- k_t = -k_t; /* set to all 0s or all 1s */
+ k_t = 0 - k_t; /* set to all 0s or all 1s */
swap ^= k_t;
gf_cond_swap(x2, x3, swap);
memcpy(scalar2, scalar, sizeof(scalar2));
scalar2[0] &= -(uint8_t)COFACTOR;
- scalar2[X_PRIVATE_BYTES - 1] &= ~(-1u << ((X_PRIVATE_BITS + 7) % 8));
+ scalar2[X_PRIVATE_BYTES - 1] &= ~((0u - 1u) << ((X_PRIVATE_BITS + 7) % 8));
scalar2[X_PRIVATE_BYTES - 1] |= 1 << ((X_PRIVATE_BITS + 7) % 8);
curve448_scalar_decode_long(the_scalar, scalar2, sizeof(scalar2));
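
With COFACTOR == 4, X_PRIVATE_BYTES == 56 and X_PRIVATE_BITS == 448, the three clamping lines above evaluate to the RFC 7748 decodeScalar448 rule. Equivalent statements with the constants folded in (values assume those definitions):

    scalar2[0]  &= 0xFC;   /* clear the two cofactor bits           */
    scalar2[55] &= 0x7F;   /* clear bit 7 of the top byte ...       */
    scalar2[55] |= 0x80;   /* ... then set it, forcing the high bit */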
int power, addend;
};
+#if defined(__GNUC__) || defined(__clang__)
+# define NUMTRAILINGZEROS __builtin_ctz
+#else
+# define NUMTRAILINGZEROS numtrailingzeros
+static uint32_t numtrailingzeros(uint32_t i)
+{
+ unsigned int tmp;
+ uint32_t num = 31;
+
+ if (i == 0)
+ return 32;
+
+ tmp = i << 16;
+ if (tmp != 0) {
+ i = tmp;
+ num -= 16;
+ }
+ tmp = i << 8;
+ if (tmp != 0) {
+ i = tmp;
+ num -= 8;
+ }
+ tmp = i << 4;
+ if (tmp != 0) {
+ i = tmp;
+ num -= 4;
+ }
+ tmp = i << 2;
+ if (tmp != 0) {
+ i = tmp;
+ num -= 2;
+ }
+ if ((i << 1) != 0)
+ num--;
+
+ return num;
+}
+#endif
+
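The fallback mirrors the classic binary-search count of trailing zeros (as in Hacker's Delight): each shift tests whether a set bit survives in the low half and narrows the window. A quick check that it agrees with the intended semantics at every bit position (assumes numtrailingzeros is visible, e.g. in the same translation unit):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t bit;

        assert(numtrailingzeros(0) == 32);
        for (bit = 0; bit < 32; bit++) {
            /* lowest set bit at position |bit|, plus a high distractor */
            uint32_t v = (UINT32_C(1) << bit) | (UINT32_C(1) << 31);

            assert(numtrailingzeros(v) == bit);
        }
        return 0;
    }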
static int recode_wnaf(struct smvt_control *control,
/* [nbits/(table_bits + 1) + 3] */
const curve448_scalar_t scalar,
}
while (current & 0xFFFF) {
- uint32_t pos = __builtin_ctz((uint32_t)current);
+ uint32_t pos = NUMTRAILINGZEROS((uint32_t)current);
uint32_t odd = (uint32_t)current >> pos;
int32_t delta = odd & mask;
# endif
/* C448_TRUE = -1 so that C448_TRUE & x = x */
-static const c448_bool_t C448_TRUE = -(c448_bool_t) 1;
+static const c448_bool_t C448_TRUE = 0 - (c448_bool_t)1;
/* C448_FALSE = 0 so that C448_FALSE & x = 0 */
static const c448_bool_t C448_FALSE = 0;
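
The 0 - x spelling used throughout this patch is value-identical to unary minus on an unsigned operand; it exists only to silence MSVC's C4146 ("unary minus operator applied to unsigned type"), which warnings-as-errors builds turn fatal. For instance, with a 32-bit unsigned type:

    #include <stdint.h>

    static const uint32_t ok  = 0 - (uint32_t)1; /* 0xFFFFFFFF, warning-free   */
    static const uint32_t bad = -(uint32_t)1;    /* same value, but MSVC C4146 */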
return C448_FAILURE;
{
- /* Schedule the secret key */
- struct {
- uint8_t secret_scalar_ser[EDDSA_448_PRIVATE_BYTES];
- uint8_t seed[EDDSA_448_PRIVATE_BYTES];
- } __attribute__ ((packed)) expanded;
+ /*
+ * Schedule the secret key. The first EDDSA_448_PRIVATE_BYTES bytes are the
+ * serialised secret scalar; the next EDDSA_448_PRIVATE_BYTES bytes are the
+ * seed.
+ */
+ uint8_t expanded[EDDSA_448_PRIVATE_BYTES * 2];
- if (!oneshot_hash((uint8_t *)&expanded, sizeof(expanded), privkey,
+ if (!oneshot_hash(expanded, sizeof(expanded), privkey,
EDDSA_448_PRIVATE_BYTES))
goto err;
- clamp(expanded.secret_scalar_ser);
- curve448_scalar_decode_long(secret_scalar, expanded.secret_scalar_ser,
- sizeof(expanded.secret_scalar_ser));
+ clamp(expanded);
+ curve448_scalar_decode_long(secret_scalar, expanded,
+ EDDSA_448_PRIVATE_BYTES);
/* Hash to create the nonce */
if (!hash_init_with_dom(hashctx, prehashed, 0, context, context_len)
- || !EVP_DigestUpdate(hashctx, expanded.seed, sizeof(expanded.seed))
+ || !EVP_DigestUpdate(hashctx, expanded + EDDSA_448_PRIVATE_BYTES,
+ EDDSA_448_PRIVATE_BYTES)
|| !EVP_DigestUpdate(hashctx, message, message_len)) {
- OPENSSL_cleanse(&expanded, sizeof(expanded));
+ OPENSSL_cleanse(expanded, sizeof(expanded));
goto err;
}
- OPENSSL_cleanse(&expanded, sizeof(expanded));
+ OPENSSL_cleanse(expanded, sizeof(expanded));
}
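
The flat buffer keeps the exact layout the packed struct guaranteed: Ed448 expands the private key with SHAKE256 into 2 * EDDSA_448_PRIVATE_BYTES bytes, scalar half first. A sketch of the offsets, assuming EDDSA_448_PRIVATE_BYTES == 57:

    uint8_t expanded[57 * 2];   /* 114 bytes of SHAKE256 output      */
    /* expanded[0]  .. expanded[56]  : serialised secret scalar      */
    /* expanded[57] .. expanded[113] : seed, fed into the nonce hash */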
/* Decode the nonce */
{
unsigned int j = 0, fill = 0;
dword_t buffer = 0;
- unsigned int i;
+ int i;
gf red;
gf_copy(red, x);
fill += LIMB_PLACE_VALUE(LIMBPERM(j));
j++;
}
- serial[i] = buffer;
+ serial[i] = (uint8_t)buffer;
fill -= 8;
buffer >>= 8;
}
gf y;
gf_add(y, x, x);
gf_strong_reduce(y);
- return -(y->limb[0] & 1);
+ return 0 - (y->limb[0] & 1);
}
/* Return high bit of x = low bit of 2x mod p */
gf y;
gf_copy(y, x);
gf_strong_reduce(y);
- return -(y->limb[0] & 1);
+ return 0 - (y->limb[0] & 1);
}
/* Deserialize from wire format; return -1 on success and 0 on failure. */
fill += 8;
j++;
}
- x->limb[LIMBPERM(i)] =
- (i < NLIMBS - 1) ? buffer & LIMB_MASK(LIMBPERM(i)) : buffer;
+ x->limb[LIMBPERM(i)] = (word_t)
+ ((i < NLIMBS - 1) ? buffer & LIMB_MASK(LIMBPERM(i)) : buffer);
fill -= LIMB_PLACE_VALUE(LIMBPERM(i));
buffer >>= LIMB_PLACE_VALUE(LIMBPERM(i));
scarry =
(scarry + x->limb[LIMBPERM(i)] -
MODULUS->limb[LIMBPERM(i)]) >> (8 * sizeof(word_t));
}
- succ = with_hibit ? -(mask_t) 1 : ~gf_hibit(x);
- return succ & word_is_zero(buffer) & ~word_is_zero(scarry);
+ succ = with_hibit ? 0 - (mask_t) 1 : ~gf_hibit(x);
+ return succ & word_is_zero((word_t)buffer) & ~word_is_zero((word_t)scarry);
}
/* Reduce to canonical form. */
*/
assert(word_is_zero(scarry) | word_is_zero(scarry + 1));
- scarry_0 = scarry;
+ scarry_0 = (word_t)scarry;
/* add it back */
for (i = 0; i < NLIMBS; i++) {
# define NLIMBS (64/sizeof(word_t))
# define X_SER_BYTES 56
# define SER_BYTES 56
+
+# if defined(__GNUC__) || defined(__clang__)
+# define INLINE_UNUSED __inline__ __attribute__((unused,always_inline))
+# define RESTRICT __restrict__
+# define ALIGNED __attribute__((aligned(32)))
+# else
+# define INLINE_UNUSED ossl_inline
+# define RESTRICT
+# define ALIGNED
+# endif
+
typedef struct gf_s {
word_t limb[NLIMBS];
-} __attribute__ ((aligned(32))) gf_s, gf[1];
+} ALIGNED gf_s, gf[1];
/* RFC 7748 support */
# define X_PUBLIC_BYTES X_SER_BYTES
# define X_PRIVATE_BYTES X_PUBLIC_BYTES
# define X_PRIVATE_BITS 448
-# define INLINE_UNUSED __inline__ __attribute__((unused,always_inline))
-
static INLINE_UNUSED void gf_copy(gf out, const gf a)
{
*out = *a;
void gf_strong_reduce(gf inout);
void gf_add(gf out, const gf a, const gf b);
void gf_sub(gf out, const gf a, const gf b);
-void gf_mul(gf_s * __restrict__ out, const gf a, const gf b);
-void gf_mulw_unsigned(gf_s * __restrict__ out, const gf a, uint32_t b);
-void gf_sqr(gf_s * __restrict__ out, const gf a);
+void gf_mul(gf_s * RESTRICT out, const gf a, const gf b);
+void gf_mulw_unsigned(gf_s * RESTRICT out, const gf a, uint32_t b);
+void gf_sqr(gf_s * RESTRICT out, const gf a);
mask_t gf_isr(gf a, const gf x); /** a^2 x = 1, QNR, or 0 if x=0. Return true if successful */
mask_t gf_eq(const gf x, const gf y);
mask_t gf_lobit(const gf x);
static const gf ZERO = {{{0}}}, ONE = {{{1}}};
/* Square x, n times. */
-static ossl_inline void gf_sqrn(gf_s * __restrict__ y, const gf x, int n)
+static ossl_inline void gf_sqrn(gf_s * RESTRICT y, const gf x, int n)
{
gf tmp;
assert(n > 0);
}
/* Constant time, if (swap) (x,y) = (y,x); */
-static ossl_inline void gf_cond_swap(gf x, gf_s * __restrict__ y, mask_t swap)
+static ossl_inline void gf_cond_swap(gf x, gf_s * RESTRICT y, mask_t swap)
{
constant_time_cond_swap(x, y, sizeof(gf_s), swap);
}
for (i = 0; i < C448_SCALAR_LIMBS; i++) {
chain = (chain + accum[i]) - sub->limb[i];
- out->limb[i] = chain;
+ out->limb[i] = (c448_word_t)chain;
chain >>= WBITS;
}
- borrow = chain + extra; /* = 0 or -1 */
+ borrow = (c448_word_t)chain + extra; /* = 0 or -1 */
chain = 0;
for (i = 0; i < C448_SCALAR_LIMBS; i++) {
chain = (chain + out->limb[i]) + (p->limb[i] & borrow);
- out->limb[i] = chain;
+ out->limb[i] = (c448_word_t)chain;
chain >>= WBITS;
}
}
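
The add-back above is the standard branch-free modular subtraction: subtract unconditionally, then add p back masked by the borrow, which is all-ones exactly when the difference went negative. The same idea on a single 32-bit limb (mod_sub is a hypothetical helper, not the patched function):

    #include <stdint.h>

    /* r = (a - b) mod p, branch-free, assuming 0 <= a, b < p. */
    static uint32_t mod_sub(uint32_t a, uint32_t b, uint32_t p)
    {
        uint64_t chain = (uint64_t)a - b;
        /* 0 if a >= b; 0xFFFFFFFF if the subtraction wrapped */
        uint32_t borrow = (uint32_t)(chain >> 32);

        return (uint32_t)chain + (p & borrow);
    }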
c448_dword_t chain = 0;
for (j = 0; j < C448_SCALAR_LIMBS; j++) {
chain += ((c448_dword_t) mand) * mier[j] + accum[j];
- accum[j] = chain;
+ accum[j] = (c448_word_t)chain;
chain >>= WBITS;
}
- accum[j] = chain;
+ accum[j] = (c448_word_t)chain;
mand = accum[0] * MONTGOMERY_FACTOR;
chain = 0;
for (j = 0; j < C448_SCALAR_LIMBS; j++) {
chain += (c448_dword_t)mand * mier[j] + accum[j];
if (j)
- accum[j - 1] = chain;
+ accum[j - 1] = (c448_word_t)chain;
chain >>= WBITS;
}
chain += accum[j];
chain += hi_carry;
- accum[j - 1] = chain;
+ accum[j - 1] = (c448_word_t)chain;
hi_carry = chain >> WBITS;
}
for (i = 0; i < C448_SCALAR_LIMBS; i++) {
chain = (chain + a->limb[i]) + b->limb[i];
- out->limb[i] = chain;
+ out->limb[i] = (c448_word_t)chain;
chain >>= WBITS;
}
- sc_subx(out, out->limb, sc_p, sc_p, chain);
+ sc_subx(out, out->limb, sc_p, sc_p, (c448_word_t)chain);
}
static ossl_inline void scalar_decode_short(curve448_scalar_t s,
curve448_scalar_mul(s, s, curve448_scalar_one); /* ham-handed reduce */
- return c448_succeed_if(~word_is_zero(accum));
+ return c448_succeed_if(~word_is_zero((uint32_t)accum));
}
void curve448_scalar_destroy(curve448_scalar_t scalar)
void curve448_scalar_halve(curve448_scalar_t out, const curve448_scalar_t a)
{
- c448_word_t mask = -(a->limb[0] & 1);
+ c448_word_t mask = 0 - (a->limb[0] & 1);
c448_dword_t chain = 0;
unsigned int i;
for (i = 0; i < C448_SCALAR_LIMBS; i++) {
chain = (chain + a->limb[i]) + (sc_p->limb[i] & mask);
- out->limb[i] = chain;
+ out->limb[i] = (c448_word_t)chain;
chain >>= C448_WORD_BITS;
}
for (i = 0; i < C448_SCALAR_LIMBS - 1; i++)
out->limb[i] = out->limb[i] >> 1 | out->limb[i + 1] << (WBITS - 1);
- out->limb[i] = out->limb[i] >> 1 | chain << (WBITS - 1);
+ out->limb[i] = out->limb[i] >> 1 | (c448_word_t)(chain << (WBITS - 1));
}
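
Halving works because the group order sc_p is odd: an even a just shifts right, while an odd a first adds sc_p (odd + odd = even) so the shift stays exact; the mask selects the addition without branching. A one-limb version of the identity (mod_halve is a hypothetical helper):

    #include <stdint.h>

    /* Return a/2 mod p for odd p, branch-free, assuming 0 <= a < p. */
    static uint32_t mod_halve(uint32_t a, uint32_t p)
    {
        uint32_t mask = 0 - (a & 1);             /* all-ones iff a is odd */
        uint64_t sum = (uint64_t)a + (p & mask); /* a, or the even a + p  */

        return (uint32_t)(sum >> 1);
    }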
# if defined(__ARM_NEON__)
# include <arm_neon.h>
# elif defined(__SSE2__)
-# if !defined(__GNUC__) || defined(__clang__) || __GNUC__ >= 5 || (__GNUC__==4 && __GNUC_MINOR__ >= 4)
+# if !defined(__GNUC__) || defined(__clang__) || __GNUC__ >= 5 \
+ || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
# include <immintrin.h>
# else
# include <emmintrin.h>
typedef uint32_t uint32x8_t __attribute__ ((ext_vector_type(8)));
typedef int32_t int32x8_t __attribute__ ((ext_vector_type(8)));
typedef word_t vecmask_t __attribute__ ((ext_vector_type(4)));
-# else /* GCC, hopefully? */
+# elif defined(__GNUC__) \
+ && (__GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
typedef uint64_t uint64x2_t __attribute__ ((vector_size(16)));
typedef int64_t int64x2_t __attribute__ ((vector_size(16)));
typedef uint64_t uint64x4_t __attribute__ ((vector_size(32)));
{
return vdupq_n_u32(x);
}
-# elif defined(_WIN64) || defined(__amd64__) || defined(__X86_64__) \
- || defined(__aarch64__)
+# elif !defined(_MSC_VER) \
+ && (defined(_WIN64) || defined(__amd64__) || defined(__X86_64__) \
+ || defined(__aarch64__))
# define VECTOR_ALIGNED __attribute__((aligned(8)))
typedef uint64_t big_register_t, uint64xn_t;
return (big_register_t) x;
}
# else
-# define VECTOR_ALIGNED __attribute__((aligned(4)))
+# ifdef __GNUC__
+# define VECTOR_ALIGNED __attribute__((aligned(4)))
+# else
+/*
+ * This shouldn't be a problem because a big_register_t isn't actually a vector
+ * type anyway in this case.
+ */
+# define VECTOR_ALIGNED
+# endif
typedef uint64_t uint64xn_t;
typedef uint32_t uint32xn_t;
typedef uint32_t big_register_t;