2 * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
3 * Copyright 2014 Cryptography Research, Inc.
5 * Licensed under the OpenSSL license (the "License"). You may not use
6 * this file except in compliance with the License. You can obtain a copy
7 * in the file LICENSE in the source distribution or at
8 * https://www.openssl.org/source/license.html
10 * Originally written by Mike Hamburg
19 # include <openssl/e_os2.h>
20 # include "arch_intrinsics.h"
22 # include "curve448utils.h"
25 # if defined(__ARM_NEON__)
26 # include <arm_neon.h>
27 # elif defined(__SSE2__)
28 # if !defined(__GNUC__) || defined(__clang__) || __GNUC__ >= 5 \
29 || (__GNUC__==4 && __GNUC_MINOR__ >= 4)
30 # include <immintrin.h>
32 # include <emmintrin.h>
/*
 * Internal word types, selected by ARCH_WORD_BITS (64 or 32):
 *   word_t, mask_t - unsigned, one architecture word wide
 *   dword_t        - unsigned, twice the word width
 *   hsword_t       - signed, half the word width
 *   sword_t        - signed, one word wide
 *   dsword_t       - signed, twice the word width
 * The 64-bit variant relies on the compiler-provided __uint128_t and
 * __int128_t types for the double-width cases.
 * NOTE(review): the "# else" and "# endif" that should bracket the #error
 * below are not visible in this excerpt - confirm they exist in the file.
 */
36 # if (ARCH_WORD_BITS == 64)
37 typedef uint64_t word_t, mask_t;
38 typedef __uint128_t dword_t;
39 typedef int32_t hsword_t;
40 typedef int64_t sword_t;
41 typedef __int128_t dsword_t;
42 # elif (ARCH_WORD_BITS == 32)
43 typedef uint32_t word_t, mask_t;
44 typedef uint64_t dword_t;
45 typedef int16_t hsword_t;
46 typedef int32_t sword_t;
47 typedef int64_t dsword_t;
49 # error "For now, we only support 32- and 64-bit architectures."
53 * Scalar limbs are keyed off of the API word size instead of the arch word size.
56 # if C448_WORD_BITS == 64
57 # define SC_LIMB(x) (x)
58 # elif C448_WORD_BITS == 32
/*
 * Expand one value wider than 32 bits into two comma-separated 32-bit limbs,
 * low half first: (uint32_t)(x), then (x) >> 32.
 * The argument is parenthesized at each use so that an expression argument
 * such as SC_LIMB(a + b) is cast and shifted as a whole; without the
 * parentheses the cast would bind only to `a` and the shift only to `b`.
 * Note that x is evaluated twice, so it must not have side effects.
 */
# define SC_LIMB(x) ((uint32_t)(x)), ((x) >> 32)
61 # error "For now we only support 32- and 64-bit architectures."
/*
 * Vector type definitions, one variant per compiler family.
 * In the first variant only vecmask_t is defined here, as an alias of
 * uint32x4_t; that name is not declared in this block.
 * NOTE(review): the opening "# if" for this first variant is not visible in
 * this excerpt - presumably the NEON case, since <arm_neon.h> is included
 * under __ARM_NEON__ above; confirm.
 */
65 typedef uint32x4_t vecmask_t;
/*
 * clang: ext_vector_type(N) takes the number of ELEMENTS, so e.g. uint64x2_t
 * is 2 x uint64_t and vecmask_t is 4 x word_t.
 */
66 # elif defined(__clang__)
67 typedef uint64_t uint64x2_t __attribute__ ((ext_vector_type(2)));
68 typedef int64_t int64x2_t __attribute__ ((ext_vector_type(2)));
69 typedef uint64_t uint64x4_t __attribute__ ((ext_vector_type(4)));
70 typedef int64_t int64x4_t __attribute__ ((ext_vector_type(4)));
71 typedef uint32_t uint32x4_t __attribute__ ((ext_vector_type(4)));
72 typedef int32_t int32x4_t __attribute__ ((ext_vector_type(4)));
73 typedef uint32_t uint32x2_t __attribute__ ((ext_vector_type(2)));
74 typedef int32_t int32x2_t __attribute__ ((ext_vector_type(2)));
75 typedef uint32_t uint32x8_t __attribute__ ((ext_vector_type(8)));
76 typedef int32_t int32x8_t __attribute__ ((ext_vector_type(8)));
77 typedef word_t vecmask_t __attribute__ ((ext_vector_type(4)));
/*
 * GCC >= 3.1: vector_size(N) takes the total size in BYTES, so e.g.
 * uint64x2_t is vector_size(16) and vecmask_t is a 32-byte vector of word_t.
 */
78 # elif defined(__GNUC__) \
79 && (__GNUC__ >= 4 || (__GNUC__== 3 && __GNUC_MINOR__ >= 1))
80 typedef uint64_t uint64x2_t __attribute__ ((vector_size(16)));
81 typedef int64_t int64x2_t __attribute__ ((vector_size(16)));
82 typedef uint64_t uint64x4_t __attribute__ ((vector_size(32)));
83 typedef int64_t int64x4_t __attribute__ ((vector_size(32)));
84 typedef uint32_t uint32x4_t __attribute__ ((vector_size(16)));
85 typedef int32_t int32x4_t __attribute__ ((vector_size(16)));
86 typedef uint32_t uint32x2_t __attribute__ ((vector_size(8)));
87 typedef int32_t int32x2_t __attribute__ ((vector_size(8)));
88 typedef uint32_t uint32x8_t __attribute__ ((vector_size(32)));
89 typedef int32_t int32x8_t __attribute__ ((vector_size(32)));
90 typedef word_t vecmask_t __attribute__ ((vector_size(32)));
/*
 * VECTOR_ALIGNED and the "native width" aliases uint64xn_t / uint32xn_t,
 * chosen by the SIMD support the compiler advertises:
 *   __AVX2__              - 32-byte alignment, uint64x4_t / uint32x8_t
 *   __SSE2__, __ARM_NEON__ - 16-byte alignment, uint64x2_t / uint32x4_t
 *   non-MSVC 64-bit target - 8-byte alignment, plain scalar types
 * The remaining lines define VECTOR_ALIGNED as 4-byte alignment and as empty.
 * NOTE(review): the "# else" / "# elif" / "# endif" lines separating those
 * last cases, and the delimiters of the comment fragment below, are not
 * visible in this excerpt - confirm against the full file.
 */
93 # if defined(__AVX2__)
94 # define VECTOR_ALIGNED __attribute__((aligned(32)))
95 typedef uint64x4_t uint64xn_t;
96 typedef uint32x8_t uint32xn_t;
97 # elif defined(__SSE2__)
98 # define VECTOR_ALIGNED __attribute__((aligned(16)))
99 typedef uint64x2_t uint64xn_t;
100 typedef uint32x4_t uint32xn_t;
101 # elif defined(__ARM_NEON__)
102 # define VECTOR_ALIGNED __attribute__((aligned(16)))
103 typedef uint64x2_t uint64xn_t;
104 typedef uint32x4_t uint32xn_t;
/*
 * NOTE(review): GCC and Clang predefine __x86_64__ (lower case), not
 * __X86_64__; the __amd64__ test presumably covers those compilers, which
 * would make the upper-case test dead - confirm and consider correcting.
 */
105 # elif !defined(_MSC_VER) \
106 && (defined(_WIN64) || defined(__amd64__) || defined(__X86_64__) \
107 || defined(__aarch64__))
108 # define VECTOR_ALIGNED __attribute__((aligned(8)))
109 typedef uint32_t uint32xn_t;
112 # define VECTOR_ALIGNED __attribute__((aligned(4)))
115 * This shouldn't be a problem because a big_register_t isn't actually a vector
116 * type anyway in this case.
118 # define VECTOR_ALIGNED
120 typedef uint64_t uint64xn_t;
121 typedef uint32_t uint32xn_t;
/*
 * UNROLL: loop annotation. When the clang version, encoded as
 * 100 * major + minor, is greater than 305 it expands to clang's
 * "loop unroll(full)" pragma via _Pragma.
 * NOTE(review): the guard that opens this conditional and the fallback
 * definition of UNROLL are not visible in this excerpt - confirm.
 */
125 /* PERF: vectorize vs unroll */
127 # if 100*__clang_major__ + __clang_minor__ > 305
128 # define UNROLL _Pragma("clang loop unroll(full)")
137 * The plan on booleans: The external interface uses c448_bool_t, but this
138 * might be a different size than our particular arch's word_t (and thus
139 * mask_t). Also, the caller isn't guaranteed to pass it as nonzero. So
140 * bool_to_mask converts word sizes and checks nonzero. On the flip side,
141 * mask_t is always -1 or 0, but it might be a different size than
142 * c448_bool_t. On the third hand, we have success vs boolean types, but
143 * that's handled in common.h: it converts between c448_bool_t and
/*
 * Convert an internal mask_t into the external c448_bool_t.
 * The mask is first reinterpreted as the signed word type (sword_t) and then
 * converted to c448_sword_t before being returned.
 * NOTE(review): presumably the signed detour makes an all-ones mask stay
 * all-ones when c448_bool_t is wider than mask_t - confirm that c448_sword_t
 * is signed. The function's braces are not visible in this excerpt.
 */
146 static ossl_inline c448_bool_t mask_to_bool(mask_t m)
148 return (c448_sword_t)(sword_t)m;
/*
 * Convert an external c448_bool_t into an internal mask_t.
 * `limit` is the number of mask_t-sized chunks that fit in a c448_bool_t.
 * For each chunk i, m is shifted right by i words, the result is passed to
 * word_is_zero(), and the bitwise complement of that is OR-ed into ret.
 * NOTE(review): word_is_zero() is defined elsewhere; presumably it returns
 * an all-ones mask for zero input, so ret becomes all-ones when any chunk of
 * m is nonzero - confirm.
 * NOTE(review): the declarations of ret and i, the return statement and the
 * braces are not visible in this excerpt. If sizeof(c448_bool_t) is smaller
 * than sizeof(mask_t), limit computes to 0 and the loop body never runs;
 * confirm that limit is clamped to at least 1 in the lines not shown.
 */
151 static ossl_inline mask_t bool_to_mask(c448_bool_t m)
153 /* On most arches this will be optimized to a simple cast. */
156 unsigned int limit = sizeof(c448_bool_t) / sizeof(mask_t);
/* The shift count is in bits: chunk index * 8 bits per byte * bytes per word. */
160 for (i = 0; i < limit; i++)
161 ret |= ~word_is_zero(m >> (i * 8 * sizeof(word_t)));
/*
 * Takes a c448_bool_t by value and returns nothing.
 * NOTE(review): only the signature is visible in this excerpt; presumably
 * this exists to discard a result explicitly - confirm against the body.
 */
166 static ossl_inline void ignore_result(c448_bool_t boo)
171 #endif /* __WORD_H__ */