#endif
-#if TABLE_BITS==4 && defined(GHASH_ASM)
+#if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
# if !defined(I386_ONLY) && \
(defined(__i386) || defined(__i386__) || \
defined(__x86_64) || defined(__x86_64__) || \
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
-#if defined(__i386) || defined(__i386__)
+#if defined(__i386) || defined(__i386__) || defined(_M_IX86)
# define gcm_init_avx gcm_init_clmul
# define gcm_gmult_avx gcm_gmult_clmul
# define gcm_ghash_avx gcm_ghash_clmul
void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif
-# elif defined(__arm__) || defined(__arm)
+# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
# include "arm_arch.h"
# if __ARM_ARCH__>=7
# define GHASH_ASM_ARM
# define GCM_FUNCREF_4BIT
+# define PMULL_CAPABLE (OPENSSL_armcap_P & ARMV8_PMULL)
+# if defined(__arm__) || defined(__arm)
+# define NEON_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
+# endif
+void gcm_init_neon(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+void gcm_init_v8(u128 Htable[16],const u64 Xi[2]);
+void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif
# elif defined(__sparc__) || defined(__sparc)
# include "sparc_arch.h"
void gcm_init_vis3(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_vis3(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_vis3(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
+# define GHASH_ASM_PPC
+# define GCM_FUNCREF_4BIT
+extern unsigned int OPENSSL_ppccap_P[];
+void gcm_init_p8(u128 Htable[16],const u64 Xi[2]);
+void gcm_gmult_p8(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif
#endif
ctx->ghash = gcm_ghash_4bit;
# endif
# elif defined(GHASH_ASM_ARM)
- if (OPENSSL_armcap_P & ARMV7_NEON) {
+# ifdef PMULL_CAPABLE
+ if (PMULL_CAPABLE) {
+ gcm_init_v8(ctx->Htable,ctx->H.u);
+ ctx->gmult = gcm_gmult_v8;
+ ctx->ghash = gcm_ghash_v8;
+ } else
+# endif
+# ifdef NEON_CAPABLE
+ if (NEON_CAPABLE) {
+ gcm_init_neon(ctx->Htable,ctx->H.u);
ctx->gmult = gcm_gmult_neon;
ctx->ghash = gcm_ghash_neon;
- } else {
+ } else
+# endif
+ {
gcm_init_4bit(ctx->Htable,ctx->H.u);
ctx->gmult = gcm_gmult_4bit;
ctx->ghash = gcm_ghash_4bit;
ctx->gmult = gcm_gmult_4bit;
ctx->ghash = gcm_ghash_4bit;
}
+# elif defined(GHASH_ASM_PPC)
+ if (OPENSSL_ppccap_P[0] & (1<<2)) {
+ gcm_init_p8(ctx->Htable,ctx->H.u);
+ ctx->gmult = gcm_gmult_p8;
+ ctx->ghash = gcm_ghash_p8;
+ } else {
+ gcm_init_4bit(ctx->Htable,ctx->H.u);
+ ctx->gmult = gcm_gmult_4bit;
+ ctx->ghash = gcm_ghash_4bit;
+ }
# else
gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
GCM_MUL(ctx,Yi);
if (is_endian.little)
+#ifdef BSWAP4
+ ctr = BSWAP4(ctx->Yi.d[3]);
+#else
ctr = GETU32(ctx->Yi.c+12);
+#endif
else
ctr = ctx->Yi.d[3];
}
(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
++ctr;
if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
PUTU32(ctx->Yi.c+12,ctr);
+#endif
else
ctx->Yi.d[3] = ctr;
}
}
if (is_endian.little)
+#ifdef BSWAP4
+ ctr = BSWAP4(ctx->Yi.d[3]);
+#else
ctr = GETU32(ctx->Yi.c+12);
+#endif
else
ctr = ctx->Yi.d[3];
(*block)(ctx->Yi.c,ctx->EKi.c,key);
++ctr;
if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
PUTU32(ctx->Yi.c+12,ctr);
+#endif
else
ctx->Yi.d[3] = ctr;
for (i=0; i<16/sizeof(size_t); ++i)
(*block)(ctx->Yi.c,ctx->EKi.c,key);
++ctr;
if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
PUTU32(ctx->Yi.c+12,ctr);
+#endif
else
ctx->Yi.d[3] = ctr;
for (i=0; i<16/sizeof(size_t); ++i)
(*block)(ctx->Yi.c,ctx->EKi.c,key);
++ctr;
if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
PUTU32(ctx->Yi.c+12,ctr);
+#endif
else
ctx->Yi.d[3] = ctr;
for (i=0; i<16/sizeof(size_t); ++i)
(*block)(ctx->Yi.c,ctx->EKi.c,key);
++ctr;
if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
PUTU32(ctx->Yi.c+12,ctr);
+#endif
else
ctx->Yi.d[3] = ctr;
while (len--) {
(*block)(ctx->Yi.c,ctx->EKi.c,key);
++ctr;
if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
PUTU32(ctx->Yi.c+12,ctr);
+#endif
else
ctx->Yi.d[3] = ctr;
}
}
if (is_endian.little)
+#ifdef BSWAP4
+ ctr = BSWAP4(ctx->Yi.d[3]);
+#else
ctr = GETU32(ctx->Yi.c+12);
+#endif
else
ctr = ctx->Yi.d[3];
(*block)(ctx->Yi.c,ctx->EKi.c,key);
++ctr;
if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
PUTU32(ctx->Yi.c+12,ctr);
+#endif
else
ctx->Yi.d[3] = ctr;
for (i=0; i<16/sizeof(size_t); ++i)
(*block)(ctx->Yi.c,ctx->EKi.c,key);
++ctr;
if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
PUTU32(ctx->Yi.c+12,ctr);
+#endif
else
ctx->Yi.d[3] = ctr;
for (i=0; i<16/sizeof(size_t); ++i)
(*block)(ctx->Yi.c,ctx->EKi.c,key);
++ctr;
if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
PUTU32(ctx->Yi.c+12,ctr);
+#endif
else
ctx->Yi.d[3] = ctr;
for (i=0; i<16/sizeof(size_t); ++i) {
(*block)(ctx->Yi.c,ctx->EKi.c,key);
++ctr;
if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
PUTU32(ctx->Yi.c+12,ctr);
+#endif
else
ctx->Yi.d[3] = ctr;
while (len--) {
(*block)(ctx->Yi.c,ctx->EKi.c,key);
++ctr;
if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
PUTU32(ctx->Yi.c+12,ctr);
+#endif
else
ctx->Yi.d[3] = ctr;
}
}
if (is_endian.little)
+#ifdef BSWAP4
+ ctr = BSWAP4(ctx->Yi.d[3]);
+#else
ctr = GETU32(ctx->Yi.c+12);
+#endif
else
ctr = ctx->Yi.d[3];
(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
ctr += GHASH_CHUNK/16;
if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
PUTU32(ctx->Yi.c+12,ctr);
+#endif
else
ctx->Yi.d[3] = ctr;
GHASH(ctx,out,GHASH_CHUNK);
(*stream)(in,out,j,key,ctx->Yi.c);
ctr += (unsigned int)j;
if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
PUTU32(ctx->Yi.c+12,ctr);
+#endif
else
ctx->Yi.d[3] = ctr;
in += i;
(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
++ctr;
if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
PUTU32(ctx->Yi.c+12,ctr);
+#endif
else
ctx->Yi.d[3] = ctr;
while (len--) {
}
if (is_endian.little)
+#ifdef BSWAP4
+ ctr = BSWAP4(ctx->Yi.d[3]);
+#else
ctr = GETU32(ctx->Yi.c+12);
+#endif
else
ctr = ctx->Yi.d[3];
(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
ctr += GHASH_CHUNK/16;
if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
PUTU32(ctx->Yi.c+12,ctr);
+#endif
else
ctx->Yi.d[3] = ctr;
out += GHASH_CHUNK;
(*stream)(in,out,j,key,ctx->Yi.c);
ctr += (unsigned int)j;
if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
PUTU32(ctx->Yi.c+12,ctr);
+#endif
else
ctx->Yi.d[3] = ctr;
out += i;
(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
++ctr;
if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
PUTU32(ctx->Yi.c+12,ctr);
+#endif
else
ctx->Yi.d[3] = ctr;
while (len--) {