+++ /dev/null
-/*
- * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
- * Copyright 2016 Cryptography Research, Inc.
- *
- * Licensed under the OpenSSL license (the "License"). You may not use
- * this file except in compliance with the License. You can obtain a copy
- * in the file LICENSE in the source distribution or at
- * https://www.openssl.org/source/license.html
- *
- * Originally written by Mike Hamburg
- */
-
-#ifndef __ARCH_ARM_32_ARCH_INTRINSICS_H__
-# define __ARCH_ARM_32_ARCH_INTRINSICS_H__
-
-# define ARCH_WORD_BITS 32
-
-static __inline__ __attribute((always_inline, unused))
-uint32_t word_is_zero(uint32_t a)
-{
- uint32_t ret;
-
- asm("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc");
- return ret;
-}
-
-static __inline__ __attribute((always_inline, unused))
-uint64_t widemul(uint32_t a, uint32_t b)
-{
-    /*
-     * Could be UMULL, but it's hard to express to the C compiler that the
-     * registers must be different.
-     */
- return ((uint64_t)a) * b;
-}
-
-#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
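The subs/sbc pair above yields an all-ones mask when a is zero and zero otherwise. A minimal portable sketch of the same branch-free idiom, mirroring the 64-bit reference version later in this patch (the name word_is_zero_portable is illustrative only):

    #include <stdint.h>

    /* Sketch: branch-free zero test without inline asm.  a - 1 borrows
     * exactly when a == 0, so the upper 32 bits of the 64-bit difference
     * are all ones in that case and all zero otherwise. */
    static uint32_t word_is_zero_portable(uint32_t a)
    {
        return (uint32_t)((((uint64_t)a) - 1) >> 32);
    }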
+++ /dev/null
-/*
- * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
- * Copyright 2014 Cryptography Research, Inc.
- *
- * Licensed under the OpenSSL license (the "License"). You may not use
- * this file except in compliance with the License. You can obtain a copy
- * in the file LICENSE in the source distribution or at
- * https://www.openssl.org/source/license.html
- *
- * Originally written by Mike Hamburg
- */
-
-#include "field.h"
-
-static inline void __attribute__ ((gnu_inline, always_inline))
- smlal(uint64_t *acc, const uint32_t a, const uint32_t b)
-{
-
-#ifdef __ARMEL__
- uint32_t lo = *acc, hi = (*acc) >> 32;
-
- __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
- : [lo]"+&r"(lo), [hi]"+&r"(hi)
- : [a]"r"(a), [b]"r"(b));
-
-    *acc = lo + (((uint64_t)hi) << 32);
-#else
-    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
-#endif
-}
-
-static inline void __attribute__ ((gnu_inline, always_inline))
- smlal2(uint64_t *acc, const uint32_t a, const uint32_t b)
-{
-#ifdef __ARMEL__
- uint32_t lo = *acc, hi = (*acc) >> 32;
-
- __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
- : [lo]"+&r"(lo), [hi]"+&r"(hi)
- : [a]"r"(a), [b]"r"(2 * b));
-
-    *acc = lo + (((uint64_t)hi) << 32);
-#else
-    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2);
-#endif
-}
-
-static inline void __attribute__ ((gnu_inline, always_inline))
- smull(uint64_t *acc, const uint32_t a, const uint32_t b)
-{
-#ifdef __ARMEL__
- uint32_t lo, hi;
-
- __asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]"
- : [lo]"=&r"(lo), [hi]"=&r"(hi)
- : [a]"r"(a), [b]"r"(b));
-
- *acc = lo + (((uint64_t)hi) << 32);
-#else
-    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
-#endif
-}
-
-static inline void __attribute__ ((gnu_inline, always_inline))
- smull2(uint64_t *acc, const uint32_t a, const uint32_t b)
-{
-#ifdef __ARMEL__
- uint32_t lo, hi;
-
- __asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]"
- : [lo]"=&r"(lo), [hi]"=&r"(hi)
- : [a]"r"(a), [b]"r"(2*b));
-
- *acc = lo + (((uint64_t)hi) << 32);
-#else
-    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2);
-#endif
-}
-
-void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
-{
-
- const uint32_t *a = as->limb, *b = bs->limb;
- uint32_t *c = cs->limb;
-
- uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1;
- uint32_t mask = (1 << 28) - 1;
-
- uint32_t aa[8], bm[8];
-
- int i;
- for (i = 0; i < 8; i++) {
- aa[i] = a[i] + a[i + 8];
- bm[i] = b[i] - b[i + 8];
- }
-
- uint32_t ax, bx;
- {
- /* t^3 terms */
- smull(&accum1, ax = aa[1], bx = b[15]);
- smull(&accum3, ax = aa[2], bx);
- smlal(&accum1, ax, bx = b[14]);
- smlal(&accum3, ax = aa[3], bx);
- smlal(&accum1, ax, bx = b[13]);
- smlal(&accum3, ax = aa[4], bx);
- smlal(&accum1, ax, bx = b[12]);
- smlal(&accum3, ax = aa[5], bx);
- smlal(&accum1, ax, bx = b[11]);
- smlal(&accum3, ax = aa[6], bx);
- smlal(&accum1, ax, bx = b[10]);
- smlal(&accum3, ax = aa[7], bx);
- smlal(&accum1, ax, bx = b[9]);
-
- accum0 = accum1;
- accum2 = accum3;
-
- /* t^2 terms */
- smlal(&accum2, ax = aa[0], bx);
- smlal(&accum0, ax, bx = b[8]);
- smlal(&accum2, ax = aa[1], bx);
-
- smlal(&accum0, ax = a[9], bx = b[7]);
- smlal(&accum2, ax = a[10], bx);
- smlal(&accum0, ax, bx = b[6]);
- smlal(&accum2, ax = a[11], bx);
- smlal(&accum0, ax, bx = b[5]);
- smlal(&accum2, ax = a[12], bx);
- smlal(&accum0, ax, bx = b[4]);
- smlal(&accum2, ax = a[13], bx);
- smlal(&accum0, ax, bx = b[3]);
- smlal(&accum2, ax = a[14], bx);
- smlal(&accum0, ax, bx = b[2]);
- smlal(&accum2, ax = a[15], bx);
- smlal(&accum0, ax, bx = b[1]);
-
- /* t terms */
- accum1 += accum0;
- accum3 += accum2;
- smlal(&accum3, ax = a[8], bx);
- smlal(&accum1, ax, bx = b[0]);
- smlal(&accum3, ax = a[9], bx);
-
- smlal(&accum1, ax = a[1], bx = bm[7]);
- smlal(&accum3, ax = a[2], bx);
- smlal(&accum1, ax, bx = bm[6]);
- smlal(&accum3, ax = a[3], bx);
- smlal(&accum1, ax, bx = bm[5]);
- smlal(&accum3, ax = a[4], bx);
- smlal(&accum1, ax, bx = bm[4]);
- smlal(&accum3, ax = a[5], bx);
- smlal(&accum1, ax, bx = bm[3]);
- smlal(&accum3, ax = a[6], bx);
- smlal(&accum1, ax, bx = bm[2]);
- smlal(&accum3, ax = a[7], bx);
- smlal(&accum1, ax, bx = bm[1]);
-
- /* 1 terms */
- smlal(&accum2, ax = a[0], bx);
- smlal(&accum0, ax, bx = bm[0]);
- smlal(&accum2, ax = a[1], bx);
-
- accum2 += accum0 >> 28;
- accum3 += accum1 >> 28;
-
- c[0] = ((uint32_t)(accum0)) & mask;
- c[1] = ((uint32_t)(accum2)) & mask;
- c[8] = ((uint32_t)(accum1)) & mask;
- c[9] = ((uint32_t)(accum3)) & mask;
-
- accumC0 = accum2 >> 28;
- accumC1 = accum3 >> 28;
- }
- {
- /* t^3 terms */
- smull(&accum1, ax = aa[3], bx = b[15]);
- smull(&accum3, ax = aa[4], bx);
- smlal(&accum1, ax, bx = b[14]);
- smlal(&accum3, ax = aa[5], bx);
- smlal(&accum1, ax, bx = b[13]);
- smlal(&accum3, ax = aa[6], bx);
- smlal(&accum1, ax, bx = b[12]);
- smlal(&accum3, ax = aa[7], bx);
- smlal(&accum1, ax, bx = b[11]);
-
- accum0 = accum1;
- accum2 = accum3;
-
- /* t^2 terms */
- smlal(&accum2, ax = aa[0], bx);
- smlal(&accum0, ax, bx = b[10]);
- smlal(&accum2, ax = aa[1], bx);
- smlal(&accum0, ax, bx = b[9]);
- smlal(&accum2, ax = aa[2], bx);
- smlal(&accum0, ax, bx = b[8]);
- smlal(&accum2, ax = aa[3], bx);
-
- smlal(&accum0, ax = a[11], bx = b[7]);
- smlal(&accum2, ax = a[12], bx);
- smlal(&accum0, ax, bx = b[6]);
- smlal(&accum2, ax = a[13], bx);
- smlal(&accum0, ax, bx = b[5]);
- smlal(&accum2, ax = a[14], bx);
- smlal(&accum0, ax, bx = b[4]);
- smlal(&accum2, ax = a[15], bx);
- smlal(&accum0, ax, bx = b[3]);
-
- /* t terms */
- accum1 += accum0;
- accum3 += accum2;
- smlal(&accum3, ax = a[8], bx);
- smlal(&accum1, ax, bx = b[2]);
- smlal(&accum3, ax = a[9], bx);
- smlal(&accum1, ax, bx = b[1]);
- smlal(&accum3, ax = a[10], bx);
- smlal(&accum1, ax, bx = b[0]);
- smlal(&accum3, ax = a[11], bx);
-
- smlal(&accum1, ax = a[3], bx = bm[7]);
- smlal(&accum3, ax = a[4], bx);
- smlal(&accum1, ax, bx = bm[6]);
- smlal(&accum3, ax = a[5], bx);
- smlal(&accum1, ax, bx = bm[5]);
- smlal(&accum3, ax = a[6], bx);
- smlal(&accum1, ax, bx = bm[4]);
- smlal(&accum3, ax = a[7], bx);
- smlal(&accum1, ax, bx = bm[3]);
-
- /* 1 terms */
- smlal(&accum2, ax = a[0], bx);
- smlal(&accum0, ax, bx = bm[2]);
- smlal(&accum2, ax = a[1], bx);
- smlal(&accum0, ax, bx = bm[1]);
- smlal(&accum2, ax = a[2], bx);
- smlal(&accum0, ax, bx = bm[0]);
- smlal(&accum2, ax = a[3], bx);
-
- accum0 += accumC0;
- accum1 += accumC1;
- accum2 += accum0 >> 28;
- accum3 += accum1 >> 28;
-
- c[2] = ((uint32_t)(accum0)) & mask;
- c[3] = ((uint32_t)(accum2)) & mask;
- c[10] = ((uint32_t)(accum1)) & mask;
- c[11] = ((uint32_t)(accum3)) & mask;
-
- accumC0 = accum2 >> 28;
- accumC1 = accum3 >> 28;
- }
- {
-
- /* t^3 terms */
- smull(&accum1, ax = aa[5], bx = b[15]);
- smull(&accum3, ax = aa[6], bx);
- smlal(&accum1, ax, bx = b[14]);
- smlal(&accum3, ax = aa[7], bx);
- smlal(&accum1, ax, bx = b[13]);
-
- accum0 = accum1;
- accum2 = accum3;
-
- /* t^2 terms */
-
- smlal(&accum2, ax = aa[0], bx);
- smlal(&accum0, ax, bx = b[12]);
- smlal(&accum2, ax = aa[1], bx);
- smlal(&accum0, ax, bx = b[11]);
- smlal(&accum2, ax = aa[2], bx);
- smlal(&accum0, ax, bx = b[10]);
- smlal(&accum2, ax = aa[3], bx);
- smlal(&accum0, ax, bx = b[9]);
- smlal(&accum2, ax = aa[4], bx);
- smlal(&accum0, ax, bx = b[8]);
- smlal(&accum2, ax = aa[5], bx);
-
- smlal(&accum0, ax = a[13], bx = b[7]);
- smlal(&accum2, ax = a[14], bx);
- smlal(&accum0, ax, bx = b[6]);
- smlal(&accum2, ax = a[15], bx);
- smlal(&accum0, ax, bx = b[5]);
-
- /* t terms */
- accum1 += accum0;
- accum3 += accum2;
-
- smlal(&accum3, ax = a[8], bx);
- smlal(&accum1, ax, bx = b[4]);
- smlal(&accum3, ax = a[9], bx);
- smlal(&accum1, ax, bx = b[3]);
- smlal(&accum3, ax = a[10], bx);
- smlal(&accum1, ax, bx = b[2]);
- smlal(&accum3, ax = a[11], bx);
- smlal(&accum1, ax, bx = b[1]);
- smlal(&accum3, ax = a[12], bx);
- smlal(&accum1, ax, bx = b[0]);
- smlal(&accum3, ax = a[13], bx);
-
- smlal(&accum1, ax = a[5], bx = bm[7]);
- smlal(&accum3, ax = a[6], bx);
- smlal(&accum1, ax, bx = bm[6]);
- smlal(&accum3, ax = a[7], bx);
- smlal(&accum1, ax, bx = bm[5]);
-
- /* 1 terms */
-
- smlal(&accum2, ax = a[0], bx);
- smlal(&accum0, ax, bx = bm[4]);
- smlal(&accum2, ax = a[1], bx);
- smlal(&accum0, ax, bx = bm[3]);
- smlal(&accum2, ax = a[2], bx);
- smlal(&accum0, ax, bx = bm[2]);
- smlal(&accum2, ax = a[3], bx);
- smlal(&accum0, ax, bx = bm[1]);
- smlal(&accum2, ax = a[4], bx);
- smlal(&accum0, ax, bx = bm[0]);
- smlal(&accum2, ax = a[5], bx);
-
- accum0 += accumC0;
- accum1 += accumC1;
- accum2 += accum0 >> 28;
- accum3 += accum1 >> 28;
-
- c[4] = ((uint32_t)(accum0)) & mask;
- c[5] = ((uint32_t)(accum2)) & mask;
- c[12] = ((uint32_t)(accum1)) & mask;
- c[13] = ((uint32_t)(accum3)) & mask;
-
- accumC0 = accum2 >> 28;
- accumC1 = accum3 >> 28;
- }
- {
-
- /* t^3 terms */
- smull(&accum1, ax = aa[7], bx = b[15]);
- accum0 = accum1;
-
- /* t^2 terms */
-
- smull(&accum2, ax = aa[0], bx);
- smlal(&accum0, ax, bx = b[14]);
- smlal(&accum2, ax = aa[1], bx);
- smlal(&accum0, ax, bx = b[13]);
- smlal(&accum2, ax = aa[2], bx);
- smlal(&accum0, ax, bx = b[12]);
- smlal(&accum2, ax = aa[3], bx);
- smlal(&accum0, ax, bx = b[11]);
- smlal(&accum2, ax = aa[4], bx);
- smlal(&accum0, ax, bx = b[10]);
- smlal(&accum2, ax = aa[5], bx);
- smlal(&accum0, ax, bx = b[9]);
- smlal(&accum2, ax = aa[6], bx);
- smlal(&accum0, ax, bx = b[8]);
- smlal(&accum2, ax = aa[7], bx);
-
- smlal(&accum0, ax = a[15], bx = b[7]);
-
- /* t terms */
- accum1 += accum0;
- accum3 = accum2;
-
- smlal(&accum3, ax = a[8], bx);
- smlal(&accum1, ax, bx = b[6]);
- smlal(&accum3, ax = a[9], bx);
- smlal(&accum1, ax, bx = b[5]);
- smlal(&accum3, ax = a[10], bx);
- smlal(&accum1, ax, bx = b[4]);
- smlal(&accum3, ax = a[11], bx);
- smlal(&accum1, ax, bx = b[3]);
- smlal(&accum3, ax = a[12], bx);
- smlal(&accum1, ax, bx = b[2]);
- smlal(&accum3, ax = a[13], bx);
- smlal(&accum1, ax, bx = b[1]);
- smlal(&accum3, ax = a[14], bx);
- smlal(&accum1, ax, bx = b[0]);
- smlal(&accum3, ax = a[15], bx);
-
- smlal(&accum1, ax = a[7], bx = bm[7]);
-
- /* 1 terms */
-
- smlal(&accum2, ax = a[0], bx);
- smlal(&accum0, ax, bx = bm[6]);
- smlal(&accum2, ax = a[1], bx);
- smlal(&accum0, ax, bx = bm[5]);
- smlal(&accum2, ax = a[2], bx);
- smlal(&accum0, ax, bx = bm[4]);
- smlal(&accum2, ax = a[3], bx);
- smlal(&accum0, ax, bx = bm[3]);
- smlal(&accum2, ax = a[4], bx);
- smlal(&accum0, ax, bx = bm[2]);
- smlal(&accum2, ax = a[5], bx);
- smlal(&accum0, ax, bx = bm[1]);
- smlal(&accum2, ax = a[6], bx);
- smlal(&accum0, ax, bx = bm[0]);
- smlal(&accum2, ax = a[7], bx);
-
- accum0 += accumC0;
- accum1 += accumC1;
- accum2 += accum0 >> 28;
- accum3 += accum1 >> 28;
-
- c[6] = ((uint32_t)(accum0)) & mask;
- c[7] = ((uint32_t)(accum2)) & mask;
- c[14] = ((uint32_t)(accum1)) & mask;
- c[15] = ((uint32_t)(accum3)) & mask;
-
- accum0 = accum2 >> 28;
- accum1 = accum3 >> 28;
- }
-
- accum0 += accum1;
- accum0 += c[8];
- accum1 += c[0];
- c[8] = ((uint32_t)(accum0)) & mask;
- c[0] = ((uint32_t)(accum1)) & mask;
-
- accum0 >>= 28;
- accum1 >>= 28;
- c[9] += ((uint32_t)(accum0));
- c[1] += ((uint32_t)(accum1));
-}
-
-void gf_sqr(gf_s * __restrict__ cs, const gf as)
-{
- const uint32_t *a = as->limb;
- uint32_t *c = cs->limb;
-
- uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1, tmp;
- uint32_t mask = (1 << 28) - 1;
-
- uint32_t bm[8];
-
- int i;
- for (i = 0; i < 8; i++) {
- bm[i] = a[i] - a[i + 8];
- }
-
- uint32_t ax, bx;
- {
- /* t^3 terms */
- smull2(&accum1, ax = a[9], bx = a[15]);
- smull2(&accum3, ax = a[10], bx);
- smlal2(&accum1, ax, bx = a[14]);
- smlal2(&accum3, ax = a[11], bx);
- smlal2(&accum1, ax, bx = a[13]);
- smlal2(&accum3, ax = a[12], bx);
- smlal(&accum1, ax, ax);
-
- accum0 = accum1;
- accum2 = accum3;
-
- /* t^2 terms */
- smlal2(&accum2, ax = a[8], a[9]);
- smlal(&accum0, ax, ax);
-
- smlal2(&accum0, ax = a[1], bx = a[7]);
- smlal2(&accum2, ax = a[2], bx);
- smlal2(&accum0, ax, bx = a[6]);
- smlal2(&accum2, ax = a[3], bx);
- smlal2(&accum0, ax, bx = a[5]);
- smlal2(&accum2, ax = a[4], bx);
- smlal(&accum0, ax, ax);
-
- /* t terms */
- accum1 += accum0;
- accum3 += accum2;
- smlal2(&accum3, ax = a[0], bx = a[1]);
- smlal(&accum1, ax, ax);
-
- accum1 = -accum1;
- accum3 = -accum3;
- accum2 = -accum2;
- accum0 = -accum0;
-
- smlal2(&accum1, ax = bm[1], bx = bm[7]);
- smlal2(&accum3, ax = bm[2], bx);
- smlal2(&accum1, ax, bx = bm[6]);
- smlal2(&accum3, ax = bm[3], bx);
- smlal2(&accum1, ax, bx = bm[5]);
- smlal2(&accum3, ax = bm[4], bx);
- smlal(&accum1, ax, ax);
-
- /* 1 terms */
- smlal2(&accum2, ax = bm[0], bx = bm[1]);
- smlal(&accum0, ax, ax);
-
- tmp = -accum3;
- accum3 = tmp - accum2;
- accum2 = tmp;
- tmp = -accum1;
- accum1 = tmp - accum0;
- accum0 = tmp;
-
- accum2 += accum0 >> 28;
- accum3 += accum1 >> 28;
-
- c[0] = ((uint32_t)(accum0)) & mask;
- c[1] = ((uint32_t)(accum2)) & mask;
- c[8] = ((uint32_t)(accum1)) & mask;
- c[9] = ((uint32_t)(accum3)) & mask;
-
- accumC0 = accum2 >> 28;
- accumC1 = accum3 >> 28;
- }
- {
- /* t^3 terms */
- smull2(&accum1, ax = a[11], bx = a[15]);
- smull2(&accum3, ax = a[12], bx);
- smlal2(&accum1, ax, bx = a[14]);
- smlal2(&accum3, ax = a[13], bx);
- smlal(&accum1, ax, ax);
-
- accum0 = accum1;
- accum2 = accum3;
-
- /* t^2 terms */
- smlal2(&accum2, ax = a[8], bx = a[11]);
- smlal2(&accum0, ax, bx = a[10]);
- smlal2(&accum2, ax = a[9], bx);
- smlal(&accum0, ax, ax);
-
- smlal2(&accum0, ax = a[3], bx = a[7]);
- smlal2(&accum2, ax = a[4], bx);
- smlal2(&accum0, ax, bx = a[6]);
- smlal2(&accum2, ax = a[5], bx);
- smlal(&accum0, ax, ax);
-
- /* t terms */
- accum1 += accum0;
- accum3 += accum2;
- smlal2(&accum3, ax = a[0], bx = a[3]);
- smlal2(&accum1, ax, bx = a[2]);
- smlal2(&accum3, ax = a[1], bx);
- smlal(&accum1, ax, ax);
-
- accum1 = -accum1;
- accum3 = -accum3;
- accum2 = -accum2;
- accum0 = -accum0;
-
- smlal2(&accum1, ax = bm[3], bx = bm[7]);
- smlal2(&accum3, ax = bm[4], bx);
- smlal2(&accum1, ax, bx = bm[6]);
- smlal2(&accum3, ax = bm[5], bx);
- smlal(&accum1, ax, ax);
-
- /* 1 terms */
- smlal2(&accum2, ax = bm[0], bx = bm[3]);
- smlal2(&accum0, ax, bx = bm[2]);
- smlal2(&accum2, ax = bm[1], bx);
- smlal(&accum0, ax, ax);
-
- tmp = -accum3;
- accum3 = tmp - accum2;
- accum2 = tmp;
- tmp = -accum1;
- accum1 = tmp - accum0;
- accum0 = tmp;
-
- accum0 += accumC0;
- accum1 += accumC1;
- accum2 += accum0 >> 28;
- accum3 += accum1 >> 28;
-
- c[2] = ((uint32_t)(accum0)) & mask;
- c[3] = ((uint32_t)(accum2)) & mask;
- c[10] = ((uint32_t)(accum1)) & mask;
- c[11] = ((uint32_t)(accum3)) & mask;
-
- accumC0 = accum2 >> 28;
- accumC1 = accum3 >> 28;
- }
- {
-
- /* t^3 terms */
- smull2(&accum1, ax = a[13], bx = a[15]);
- smull2(&accum3, ax = a[14], bx);
- smlal(&accum1, ax, ax);
-
- accum0 = accum1;
- accum2 = accum3;
-
- /* t^2 terms */
-
- smlal2(&accum2, ax = a[8], bx = a[13]);
- smlal2(&accum0, ax, bx = a[12]);
- smlal2(&accum2, ax = a[9], bx);
- smlal2(&accum0, ax, bx = a[11]);
- smlal2(&accum2, ax = a[10], bx);
- smlal(&accum0, ax, ax);
-
- smlal2(&accum0, ax = a[5], bx = a[7]);
- smlal2(&accum2, ax = a[6], bx);
- smlal(&accum0, ax, ax);
-
- /* t terms */
- accum1 += accum0;
- accum3 += accum2;
-
- smlal2(&accum3, ax = a[0], bx = a[5]);
- smlal2(&accum1, ax, bx = a[4]);
- smlal2(&accum3, ax = a[1], bx);
- smlal2(&accum1, ax, bx = a[3]);
- smlal2(&accum3, ax = a[2], bx);
- smlal(&accum1, ax, ax);
-
- accum1 = -accum1;
- accum3 = -accum3;
- accum2 = -accum2;
- accum0 = -accum0;
-
- smlal2(&accum1, ax = bm[5], bx = bm[7]);
- smlal2(&accum3, ax = bm[6], bx);
- smlal(&accum1, ax, ax);
-
- /* 1 terms */
-
- smlal2(&accum2, ax = bm[0], bx = bm[5]);
- smlal2(&accum0, ax, bx = bm[4]);
- smlal2(&accum2, ax = bm[1], bx);
- smlal2(&accum0, ax, bx = bm[3]);
- smlal2(&accum2, ax = bm[2], bx);
- smlal(&accum0, ax, ax);
-
- tmp = -accum3;
- accum3 = tmp - accum2;
- accum2 = tmp;
- tmp = -accum1;
- accum1 = tmp - accum0;
- accum0 = tmp;
-
- accum0 += accumC0;
- accum1 += accumC1;
- accum2 += accum0 >> 28;
- accum3 += accum1 >> 28;
-
- c[4] = ((uint32_t)(accum0)) & mask;
- c[5] = ((uint32_t)(accum2)) & mask;
- c[12] = ((uint32_t)(accum1)) & mask;
- c[13] = ((uint32_t)(accum3)) & mask;
-
- accumC0 = accum2 >> 28;
- accumC1 = accum3 >> 28;
- }
- {
-
- /* t^3 terms */
- smull(&accum1, ax = a[15], bx = a[15]);
- accum0 = accum1;
-
- /* t^2 terms */
-
- smull2(&accum2, ax = a[8], bx);
- smlal2(&accum0, ax, bx = a[14]);
- smlal2(&accum2, ax = a[9], bx);
- smlal2(&accum0, ax, bx = a[13]);
- smlal2(&accum2, ax = a[10], bx);
- smlal2(&accum0, ax, bx = a[12]);
- smlal2(&accum2, ax = a[11], bx);
- smlal(&accum0, ax, ax);
-
- smlal(&accum0, ax = a[7], bx = a[7]);
-
- /* t terms */
- accum1 += accum0;
- accum3 = accum2;
-
- smlal2(&accum3, ax = a[0], bx);
- smlal2(&accum1, ax, bx = a[6]);
- smlal2(&accum3, ax = a[1], bx);
- smlal2(&accum1, ax, bx = a[5]);
- smlal2(&accum3, ax = a[2], bx);
- smlal2(&accum1, ax, bx = a[4]);
- smlal2(&accum3, ax = a[3], bx);
- smlal(&accum1, ax, ax);
-
- accum1 = -accum1;
- accum3 = -accum3;
- accum2 = -accum2;
- accum0 = -accum0;
-
- bx = bm[7];
- smlal(&accum1, bx, bx);
-
- /* 1 terms */
-
- smlal2(&accum2, ax = bm[0], bx);
- smlal2(&accum0, ax, bx = bm[6]);
- smlal2(&accum2, ax = bm[1], bx);
- smlal2(&accum0, ax, bx = bm[5]);
- smlal2(&accum2, ax = bm[2], bx);
- smlal2(&accum0, ax, bx = bm[4]);
- smlal2(&accum2, ax = bm[3], bx);
- smlal(&accum0, ax, ax);
-
- tmp = -accum3;
- accum3 = tmp - accum2;
- accum2 = tmp;
- tmp = -accum1;
- accum1 = tmp - accum0;
- accum0 = tmp;
-
- accum0 += accumC0;
- accum1 += accumC1;
- accum2 += accum0 >> 28;
- accum3 += accum1 >> 28;
-
- c[6] = ((uint32_t)(accum0)) & mask;
- c[7] = ((uint32_t)(accum2)) & mask;
- c[14] = ((uint32_t)(accum1)) & mask;
- c[15] = ((uint32_t)(accum3)) & mask;
-
- accum0 = accum2 >> 28;
- accum1 = accum3 >> 28;
- }
-
- accum0 += accum1;
- accum0 += c[8];
- accum1 += c[0];
- c[8] = ((uint32_t)(accum0)) & mask;
- c[0] = ((uint32_t)(accum1)) & mask;
-
- accum0 >>= 28;
- accum1 >>= 28;
- c[9] += ((uint32_t)(accum0));
- c[1] += ((uint32_t)(accum1));
-}
-
-void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
-{
- uint32_t mask = (1ull << 28) - 1;
- const uint32_t *a = as->limb;
- uint32_t *c = cs->limb;
- uint64_t accum0, accum8;
- int i;
- uint32_t c0, c8, n0, n8;
-
- assert(b <= mask);
-
- c0 = a[0];
- c8 = a[8];
- accum0 = widemul(b, c0);
- accum8 = widemul(b, c8);
-
- c[0] = accum0 & mask;
- accum0 >>= 28;
- c[8] = accum8 & mask;
- accum8 >>= 28;
-
- i = 1;
- {
- n0 = a[i];
- n8 = a[i + 8];
- smlal(&accum0, b, n0);
- smlal(&accum8, b, n8);
-
- c[i] = accum0 & mask;
- accum0 >>= 28;
- c[i + 8] = accum8 & mask;
- accum8 >>= 28;
- i++;
- }
- {
- c0 = a[i];
- c8 = a[i + 8];
- smlal(&accum0, b, c0);
- smlal(&accum8, b, c8);
-
- c[i] = accum0 & mask;
- accum0 >>= 28;
- c[i + 8] = accum8 & mask;
- accum8 >>= 28;
- i++;
- }
- {
- n0 = a[i];
- n8 = a[i + 8];
- smlal(&accum0, b, n0);
- smlal(&accum8, b, n8);
-
- c[i] = accum0 & mask;
- accum0 >>= 28;
- c[i + 8] = accum8 & mask;
- accum8 >>= 28;
- i++;
- }
- {
- c0 = a[i];
- c8 = a[i + 8];
- smlal(&accum0, b, c0);
- smlal(&accum8, b, c8);
-
- c[i] = accum0 & mask;
- accum0 >>= 28;
- c[i + 8] = accum8 & mask;
- accum8 >>= 28;
- i++;
- }
- {
- n0 = a[i];
- n8 = a[i + 8];
- smlal(&accum0, b, n0);
- smlal(&accum8, b, n8);
-
- c[i] = accum0 & mask;
- accum0 >>= 28;
- c[i + 8] = accum8 & mask;
- accum8 >>= 28;
- i++;
- }
- {
- c0 = a[i];
- c8 = a[i + 8];
- smlal(&accum0, b, c0);
- smlal(&accum8, b, c8);
-
- c[i] = accum0 & mask;
- accum0 >>= 28;
- c[i + 8] = accum8 & mask;
- accum8 >>= 28;
- i++;
- }
- {
- n0 = a[i];
- n8 = a[i + 8];
- smlal(&accum0, b, n0);
- smlal(&accum8, b, n8);
-
- c[i] = accum0 & mask;
- accum0 >>= 28;
- c[i + 8] = accum8 & mask;
- accum8 >>= 28;
- i++;
- }
-
- accum0 += accum8 + c[8];
- c[8] = accum0 & mask;
- c[9] += accum0 >> 28;
-
- accum8 += c[0];
- c[0] = accum8 & mask;
- c[1] += accum8 >> 28;
-}
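The seven unrolled blocks above differ only in the vestigial c0/n0 and c8/n8 alternation; a rolled sketch of the same computation (it reuses the function's locals, so it is illustrative rather than a drop-in patch):

    for (i = 1; i < 8; i++) {
        smlal(&accum0, b, a[i]);
        smlal(&accum8, b, a[i + 8]);
        c[i] = accum0 & mask;
        accum0 >>= 28;
        c[i + 8] = accum8 & mask;
        accum8 >>= 28;
    }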
+++ /dev/null
-/*
- * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
- * Copyright 2014-2016 Cryptography Research, Inc.
- *
- * Licensed under the OpenSSL license (the "License"). You may not use
- * this file except in compliance with the License. You can obtain a copy
- * in the file LICENSE in the source distribution or at
- * https://www.openssl.org/source/license.html
- *
- * Originally written by Mike Hamburg
- */
-
-#define GF_HEADROOM 2
-#define LIMB(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
-#define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
- {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}}
-
-#define LIMB_PLACE_VALUE(i) 28
-
-void gf_add_RAW(gf out, const gf a, const gf b)
-{
- for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
- ((uint32xn_t *) out)[i] =
- ((const uint32xn_t *)a)[i] + ((const uint32xn_t *)b)[i];
- }
-}
-
-void gf_sub_RAW(gf out, const gf a, const gf b)
-{
- for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
- ((uint32xn_t *) out)[i] =
- ((const uint32xn_t *)a)[i] - ((const uint32xn_t *)b)[i];
- }
-}
-
-void gf_bias(gf a, int amt)
-{
- uint32_t co1 = ((1ull << 28) - 1) * amt, co2 = co1 - amt;
-    uint32x4_t lo = { co1, co1, co1, co1 };
-    uint32x4_t hi = { co2, co1, co1, co1 };
- uint32x4_t *aa = (uint32x4_t *) a;
-
- aa[0] += lo;
- aa[1] += lo;
- aa[2] += hi;
- aa[3] += lo;
-}
-
-void gf_weak_reduce(gf a)
-{
- uint64_t mask = (1ull << 28) - 1;
- uint64_t tmp = a->limb[15] >> 28;
-
- a->limb[8] += tmp;
- for (unsigned int i = 15; i > 0; i--) {
- a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 28);
- }
- a->limb[0] = (a->limb[0] & mask) + tmp;
-}
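gf_weak_reduce above folds the carry out of limb 15 back into limbs 8 and 0. Assuming the usual Curve448 field prime p = 2^448 - 2^224 - 1 and sixteen 28-bit limbs, this follows from

    2^448 ≡ 2^224 + 1  (mod p)

so a carry of weight (2^28)^16 = 2^448 re-enters with weight (2^28)^8 = 2^224 at limb 8 and with weight 1 at limb 0.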
+++ /dev/null
-/*
- * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
- * Copyright 2016 Cryptography Research, Inc.
- *
- * Licensed under the OpenSSL license (the "License"). You may not use
- * this file except in compliance with the License. You can obtain a copy
- * in the file LICENSE in the source distribution or at
- * https://www.openssl.org/source/license.html
- *
- * Originally written by Mike Hamburg
- */
-
-#ifndef __ARCH_NEON_ARCH_INTRINSICS_H__
-# define __ARCH_NEON_ARCH_INTRINSICS_H__
-
-# define ARCH_WORD_BITS 32
-
-static __inline__ __attribute((always_inline, unused))
-uint32_t word_is_zero(uint32_t a)
-{
- uint32_t ret;
- __asm__("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc");
- return ret;
-}
-
-static __inline__ __attribute((always_inline, unused))
-uint64_t widemul(uint32_t a, uint32_t b)
-{
-    /*
-     * Could be UMULL, but it's hard to express to the C compiler that the
-     * registers must be different.
-     */
- return ((uint64_t)a) * b;
-}
-
-#endif /* __ARCH_NEON_ARCH_INTRINSICS_H__ */
+++ /dev/null
-/*
- * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
- * Copyright 2014 Cryptography Research, Inc.
- *
- * Licensed under the OpenSSL license (the "License"). You may not use
- * this file except in compliance with the License. You can obtain a copy
- * in the file LICENSE in the source distribution or at
- * https://www.openssl.org/source/license.html
- *
- * Originally written by Mike Hamburg
- */
-
-#include "field.h"
-
-static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline,unused))
-xx_vaddup_u64(uint64x2_t x)
-{
- __asm__ ("vadd.s64 %f0, %e0" : "+w"(x));
- return x;
-}
-
-static __inline__ int64x2_t __attribute__((gnu_inline,always_inline,unused))
-vrev128_s64(int64x2_t x)
-{
- __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
- return x;
-}
-
-static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline))
-vrev128_u64(uint64x2_t x)
-{
- __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
- return x;
-}
-
-static inline void __attribute__((gnu_inline,always_inline,unused))
-smlal(uint64_t *acc, const uint32_t a, const uint32_t b)
-{
- *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
-}
-
-static inline void __attribute__((gnu_inline,always_inline,unused))
-smlal2(uint64_t *acc, const uint32_t a, const uint32_t b)
-{
- *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
-}
-
-static inline void __attribute__((gnu_inline,always_inline,unused))
-smull(uint64_t *acc, const uint32_t a, const uint32_t b)
-{
- *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
-}
-
-static inline void __attribute__((gnu_inline,always_inline,unused))
-smull2(uint64_t *acc, const uint32_t a, const uint32_t b)
-{
- *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
-}
-
-void gf_mul(gf_s *__restrict__ cs, const gf as, const gf bs)
-{
- #define _bl0 "q0"
- #define _bl0_0 "d0"
- #define _bl0_1 "d1"
- #define _bh0 "q1"
- #define _bh0_0 "d2"
- #define _bh0_1 "d3"
- #define _bs0 "q2"
- #define _bs0_0 "d4"
- #define _bs0_1 "d5"
- #define _bl2 "q3"
- #define _bl2_0 "d6"
- #define _bl2_1 "d7"
- #define _bh2 "q4"
- #define _bh2_0 "d8"
- #define _bh2_1 "d9"
- #define _bs2 "q5"
- #define _bs2_0 "d10"
- #define _bs2_1 "d11"
-
- #define _as0 "q6"
- #define _as0_0 "d12"
- #define _as0_1 "d13"
- #define _as2 "q7"
- #define _as2_0 "d14"
- #define _as2_1 "d15"
- #define _al0 "q8"
- #define _al0_0 "d16"
- #define _al0_1 "d17"
- #define _ah0 "q9"
- #define _ah0_0 "d18"
- #define _ah0_1 "d19"
- #define _al2 "q10"
- #define _al2_0 "d20"
- #define _al2_1 "d21"
- #define _ah2 "q11"
- #define _ah2_0 "d22"
- #define _ah2_1 "d23"
-
- #define _a0a "q12"
- #define _a0a_0 "d24"
- #define _a0a_1 "d25"
- #define _a0b "q13"
- #define _a0b_0 "d26"
- #define _a0b_1 "d27"
- #define _a1a "q14"
- #define _a1a_0 "d28"
- #define _a1a_1 "d29"
- #define _a1b "q15"
- #define _a1b_0 "d30"
- #define _a1b_1 "d31"
- #define VMAC(op,result,a,b,n) #op" "result", "a", "b"[" #n "]\n\t"
- #define VOP3(op,result,a,b) #op" "result", "a", "b"\n\t"
- #define VOP2(op,result,a) #op" "result", "a"\n\t"
-
- int32x2_t *vc = (int32x2_t*) cs->limb;
-
- __asm__ __volatile__(
-
- "vld2.32 {"_al0_0","_al0_1","_ah0_0","_ah0_1"}, [%[a],:128]!" "\n\t"
- VOP3(vadd.i32,_as0,_al0,_ah0)
-
- "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t"
- VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1)
- VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0)
-
- "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t"
- VOP3(vadd.i32,_bs2,_bl2,_bh2)
-
- "vld2.32 {"_al2_0","_al2_1","_ah2_0","_ah2_1"}, [%[a],:128]!" "\n\t"
- VOP3(vadd.i32,_as2,_al2,_ah2)
-
- VMAC(vmull.s32,_a0b,_as0_1,_bs2_1,0)
- VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0)
- VMAC(vmlal.s32,_a0b,_as2_1,_bs0_1,0)
- VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0)
-
- VMAC(vmull.s32,_a1b,_as0_1,_bs2_1,1)
- VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1)
- VMAC(vmlal.s32,_a1b,_as2_1,_bs0_1,1)
- VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1)
-
- VOP2(vmov,_a0a,_a0b)
- VMAC(vmlal.s32,_a0a,_ah0_1,_bh2_1,0)
- VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_0,0)
- VMAC(vmlal.s32,_a0a,_ah2_1,_bh0_1,0)
- VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_0,0)
-
- VMAC(vmlsl.s32,_a0b,_al0_1,_bl2_1,0)
- VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_0,0)
- VMAC(vmlsl.s32,_a0b,_al2_1,_bl0_1,0)
- VMAC(vmlal.s32,_a0b,_al0_0,_bs0_0,0)
-
- VOP2(vmov,_a1a,_a1b)
- VMAC(vmlal.s32,_a1a,_ah0_1,_bh2_1,1)
- VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_0,1)
- VMAC(vmlal.s32,_a1a,_ah2_1,_bh0_1,1)
- VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_0,1)
-
- VOP2(vswp,_a0b_1,_a0a_0)
-
- VMAC(vmlsl.s32,_a1b,_al0_1,_bl2_1,1)
- VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_0,1)
- VMAC(vmlsl.s32,_a1b,_al2_1,_bl0_1,1)
- VMAC(vmlal.s32,_a1b,_al0_0,_bs0_0,1)
-
- VOP3(vsra.u64,_a0a,_a0b,"#28")
- VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1)
- VOP2(vmovn.i64,_a0b_0,_a0b)
-
- VOP2(vswp,_a1b_1,_a1a_0)
- VOP3(vadd.i64,_a1b,_a0a,_a1b)
-
-
- VMAC(vmull.s32,_a0a,_as2_0,_bs2_1,0)
- VOP2(vmovn.i64,_a0b_1,_a1b)
- VMAC(vmlal.s32,_a0a,_as2_1,_bs2_0,0)
- VOP3(vsra.u64,_a1a,_a1b,"#28")
- VMAC(vmlal.s32,_a0a,_as0_0,_bh0_1,0)
- VOP2(vbic.i32,_a0b,"#0xf0000000")
- VMAC(vmlal.s32,_a0a,_as0_1,_bh0_0,0)
- "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
-
- VMAC(vmull.s32,_a1b,_as2_0,_bs2_1,1)
- VMAC(vmlal.s32,_a1b,_as2_1,_bs2_0,1)
- VMAC(vmlal.s32,_a1b,_as0_0,_bh0_1,1)
- VMAC(vmlal.s32,_a1b,_as0_1,_bh0_0,1)
-
- VOP2(vmov,_a0b_1,_a0a_1)
- VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
- VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
- VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_1,0)
- VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_0,0)
- VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_1,0)
- VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_0,0)
-
- VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_1,0)
- VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_0,0)
- VMAC(vmlal.s32,_a0b,_al0_0,_bs0_1,0)
- VMAC(vmlal.s32,_a0b,_al0_1,_bs0_0,0)
-
- VOP2(vmov,_a1a,_a1b)
- VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_1,1)
- VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_0,1)
- VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_1,1)
- VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_0,1)
-
- VOP2(vswp,_a0b_1,_a0a_0)
-
- VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_1,1)
- VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_0,1)
- VMAC(vmlal.s32,_a1b,_al0_0,_bs0_1,1)
- VMAC(vmlal.s32,_a1b,_al0_1,_bs0_0,1)
-
- VOP3(vsra.u64,_a0a,_a0b,"#28")
- VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0)
- VOP2(vmovn.i64,_a0b_0,_a0b)
-
- VOP2(vswp,_a1b_1,_a1a_0)
- VOP3(vadd.i64,_a1b,_a0a,_a1b)
-
- VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0)
- VOP2(vmovn.i64,_a0b_1,_a1b)
- VMAC(vmlal.s32,_a0a,_as0_0,_bh2_0,0)
- VOP3(vsra.u64,_a1a,_a1b,"#28")
- VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0)
- VOP2(vbic.i32,_a0b,"#0xf0000000")
- VMAC(vmlal.s32,_a0a,_as2_0,_bh0_0,0)
- "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
-
- VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1)
- VMAC(vmlal.s32,_a1b,_as0_0,_bh2_0,1)
- VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1)
- VMAC(vmlal.s32,_a1b,_as2_0,_bh0_0,1)
-
- VOP2(vmov,_a0b_1,_a0a_1)
- VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
- VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
- VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_1,0)
- VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_0,0)
- VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_1,0)
- VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_0,0)
-
- VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_1,0)
- VMAC(vmlal.s32,_a0b,_al0_0,_bs2_0,0)
- VMAC(vmlal.s32,_a0b,_al0_1,_bs0_1,0)
- VMAC(vmlal.s32,_a0b,_al2_0,_bs0_0,0)
-
- VOP2(vmov,_a1a,_a1b)
- VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_1,1)
- VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_0,1)
- VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_1,1)
- VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_0,1)
-
- VOP2(vswp,_a0b_1,_a0a_0)
-
- VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_1,1)
- VMAC(vmlal.s32,_a1b,_al0_0,_bs2_0,1)
- VMAC(vmlal.s32,_a1b,_al0_1,_bs0_1,1)
- VMAC(vmlal.s32,_a1b,_al2_0,_bs0_0,1)
-
- VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1)
- VOP3(vsra.u64,_a0a,_a0b,"#28")
- VOP2(vmovn.i64,_a0b_0,_a0b)
-
- VOP2(vswp,_a1b_1,_a1a_0)
- VOP3(vadd.i64,_a1b,_a0a,_a1b)
-
- VMAC(vmull.s32,_a0a,_as0_0,_bh2_1,0)
- VOP2(vmovn.i64,_a0b_1,_a1b)
- VMAC(vmlal.s32,_a0a,_as0_1,_bh2_0,0)
- VOP3(vsra.u64,_a1a,_a1b,"#28")
- VMAC(vmlal.s32,_a0a,_as2_0,_bh0_1,0)
- VOP2(vbic.i32,_a0b,"#0xf0000000")
- VMAC(vmlal.s32,_a0a,_as2_1,_bh0_0,0)
- "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
-
- VMAC(vmull.s32,_a1b,_as0_0,_bh2_1,1)
- VMAC(vmlal.s32,_a1b,_as0_1,_bh2_0,1)
- VMAC(vmlal.s32,_a1b,_as2_0,_bh0_1,1)
- VMAC(vmlal.s32,_a1b,_as2_1,_bh0_0,1)
-
- VOP2(vmov,_a0b_1,_a0a_1)
- VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
- VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
- VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_1,0)
- VMAC(vmlal.s32,_a0a,_ah0_1,_bl2_0,0)
- VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_1,0)
- VMAC(vmlal.s32,_a0a,_ah2_1,_bl0_0,0)
-
- VMAC(vmlal.s32,_a0b,_al0_0,_bs2_1,0)
- VMAC(vmlal.s32,_a0b,_al0_1,_bs2_0,0)
- VMAC(vmlal.s32,_a0b,_al2_0,_bs0_1,0)
- VMAC(vmlal.s32,_a0b,_al2_1,_bs0_0,0)
-
- VOP2(vmov,_a1a,_a1b)
- VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_1,1)
- VMAC(vmlal.s32,_a1a,_ah0_1,_bl2_0,1)
- VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_1,1)
- VMAC(vmlal.s32,_a1a,_ah2_1,_bl0_0,1)
-
- VOP2(vswp,_a0b_1,_a0a_0)
-
- VMAC(vmlal.s32,_a1b,_al0_0,_bs2_1,1)
- VMAC(vmlal.s32,_a1b,_al0_1,_bs2_0,1)
- VMAC(vmlal.s32,_a1b,_al2_0,_bs0_1,1)
- VMAC(vmlal.s32,_a1b,_al2_1,_bs0_0,1)
-
- VOP3(vsra.u64,_a0a,_a0b,"#28")
- VOP2(vmovn.i64,_a0b_0,_a0b)
-
- VOP2(vswp,_a1b_1,_a1a_0)
- VOP3(vadd.i64,_a0a,_a0a,_a1b)
-
- VOP2(vmovn.i64,_a0b_1,_a0a)
- VOP3(vsra.u64,_a1a,_a0a,"#28")
-
- VOP2(vbic.i32,_a0b,"#0xf0000000")
-
- VOP2(vswp,_a1a_0,_a1a_1)
-
- "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
- "sub %[c], #64" "\n\t"
-
- VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0)
-
- "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
- VOP2(vaddw.s32,_a1a,_a0a_0)
- VOP2(vmovn.i64,_a0a_0,_a1a)
- VOP2(vshr.s64,_a1a,"#28")
-
- VOP2(vaddw.s32,_a1a,_a0a_1)
- VOP2(vmovn.i64,_a0a_1,_a1a)
- VOP2(vshr.s64,_a1a,"#28")
-
- VOP2(vbic.i32,_a0a,"#0xf0000000")
-
- VOP2(vaddw.s32,_a1a,_a0b_0)
- VOP2(vmovn.i64,_a0b_0,_a1a)
-
- "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
-
- : [a]"+r"(as)
- , [b]"+r"(bs)
- , [c]"+r"(vc)
-
- :: "q0","q1","q2","q3",
- "q4","q5","q6","q7",
- "q8","q9","q10","q11",
- "q12","q13","q14","q15",
- "memory"
- );
-}
-
-void gf_sqr(gf_s *__restrict__ cs, const gf bs)
-{
- int32x2_t *vc = (int32x2_t*) cs->limb;
-
- __asm__ __volatile__ (
- "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t"
- VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1) /* 0 .. 2^30 */
- VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0) /* +- 2^29 */
- VOP3(vadd.i32,_as0,_bl0,_bh0) /* 0 .. 2^30 */
-
- "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t"
- VOP3(vadd.i32,_bs2,_bl2,_bh2) /* 0 .. 2^30 */
- VOP2(vmov,_as2,_bs2)
-
- VMAC(vqdmull.s32,_a0b,_as0_1,_bs2_1,0) /* 0 .. 8 * 2^58. danger for vqdmlal is 32 */
- VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0) /* 0 .. 12 */
- VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0) /* 0 .. 14 */
-
- VMAC(vqdmull.s32,_a1b,_as0_1,_bs2_1,1) /* 0 .. 8 */
- VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1) /* 0 .. 14 */
- VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1) /* 0 .. 16 */
-
- VOP2(vmov,_a0a,_a0b) /* 0 .. 14 */
- VMAC(vqdmlal.s32,_a0a,_bh0_1,_bh2_1,0) /* 0 .. 16 */
- VMAC(vmlal.s32,_a0a,_bh2_0,_bh2_0,0) /* 0 .. 17 */
- VMAC(vmlal.s32,_a0a,_bh0_0,_bl0_0,0) /* 0 .. 18 */
-
- VMAC(vqdmlsl.s32,_a0b,_bl0_1,_bl2_1,0) /*-2 .. 14 */
- VMAC(vmlsl.s32,_a0b,_bl2_0,_bl2_0,0) /*-3 .. 14 */
- VMAC(vmlal.s32,_a0b,_bl0_0,_bs0_0,0) /*-4 .. 15 */
-
- VOP2(vmov,_a1a,_a1b)
- VMAC(vqdmlal.s32,_a1a,_bh0_1,_bh2_1,1) /* 0 .. 18 */
- VMAC(vmlal.s32,_a1a,_bh2_0,_bh2_0,1) /* 0 .. 19 */
- VMAC(vmlal.s32,_a1a,_bh0_0,_bl0_0,1) /* 0 .. 20 */
-
- VOP2(vswp,_a0b_1,_a0a_0)
-
- VMAC(vqdmlsl.s32,_a1b,_bl0_1,_bl2_1,1) /*-2 .. 16 */
- VMAC(vmlsl.s32,_a1b,_bl2_0,_bl2_0,1) /*-3 .. 16 */
- VMAC(vmlal.s32,_a1b,_bl0_0,_bs0_0,1) /*-4 .. 17 */
-
- VOP3(vsra.u64,_a0a,_a0b,"#28")
- VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1)
- VOP2(vmovn.i64,_a0b_0,_a0b)
-
- VOP2(vswp,_a1b_1,_a1a_0)
- VOP3(vadd.i64,_a1b,_a0a,_a1b)
-
-
- VMAC(vqdmull.s32,_a0a,_as2_0,_bs2_1,0) /* 0 .. 8 */
- VOP2(vmovn.i64,_a0b_1,_a1b)
- VOP3(vsra.u64,_a1a,_a1b,"#28")
- VMAC(vqdmlal.s32,_a0a,_as0_0,_bh0_1,0) /* 0 .. 12 */
- VOP2(vbic.i32,_a0b,"#0xf0000000")
- "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
-
- VMAC(vqdmull.s32,_a1b,_as2_0,_bs2_1,1) /* 0 .. 8 */
- VMAC(vqdmlal.s32,_a1b,_as0_0,_bh0_1,1) /* 0 .. 12 */
-
- VOP2(vmov,_a0b,_a0a) /* 0 .. 12 */
- VMAC(vqdmlal.s32,_a0a,_bh2_0,_bh2_1,0) /* 0 .. 14 */
- VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl0_1,0) /* 0 .. 16 */
-
- VMAC(vqdmlsl.s32,_a0b,_bl2_0,_bl2_1,0) /*-2 .. 12 */
- VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs0_1,0) /*-4 .. 14 */
- VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
- VOP3(vadd.i64,_a0b_0,_a0b_0,_a1a_0)
-
- VOP2(vmov,_a1a,_a1b) /* 0 .. 12 */
- VMAC(vqdmlal.s32,_a1a,_bh2_0,_bh2_1,1) /* 0 .. 14 */
- VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl0_1,1) /* 0 .. 16 */
-
- VOP2(vswp,_a0b_1,_a0a_0)
-
- VMAC(vqdmlsl.s32,_a1b,_bl2_0,_bl2_1,1) /*-2 .. 12 */
- VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs0_1,1) /*-4 .. 14 */
-
- VOP3(vsra.u64,_a0a,_a0b,"#28")
- VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0)
- VOP2(vmovn.i64,_a0b_0,_a0b)
-
- VOP2(vswp,_a1b_1,_a1a_0)
- VOP3(vadd.i64,_a1b,_a0a,_a1b)
-
- VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0)
- VOP2(vmovn.i64,_a0b_1,_a1b)
- VMAC(vqdmlal.s32,_a0a,_as0_0,_bh2_0,0)
- VOP3(vsra.u64,_a1a,_a1b,"#28")
- VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0)
- VOP2(vbic.i32,_a0b,"#0xf0000000")
- "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
-
- VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1)
- VMAC(vqdmlal.s32,_a1b,_as0_0,_bh2_0,1)
- VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1)
-
- VOP2(vmov,_a0b_1,_a0a_1)
- VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
- VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
- VMAC(vmlal.s32,_a0a,_bh2_1,_bh2_1,0)
- VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_0,0)
- VMAC(vmlal.s32,_a0a,_bh0_1,_bl0_1,0)
-
- VMAC(vmlsl.s32,_a0b,_bl2_1,_bl2_1,0)
- VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_0,0)
- VMAC(vmlal.s32,_a0b,_bl0_1,_bs0_1,0)
-
- VOP2(vmov,_a1a,_a1b)
- VMAC(vmlal.s32,_a1a,_bh2_1,_bh2_1,1)
- VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_0,1)
- VMAC(vmlal.s32,_a1a,_bh0_1,_bl0_1,1)
-
- VOP2(vswp,_a0b_1,_a0a_0)
-
- VMAC(vmlsl.s32,_a1b,_bl2_1,_bl2_1,1)
- VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_0,1)
- VMAC(vmlal.s32,_a1b,_bl0_1,_bs0_1,1)
-
- VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1)
- VOP3(vsra.u64,_a0a,_a0b,"#28")
- VOP2(vmovn.i64,_a0b_0,_a0b)
-
- VOP2(vswp,_a1b_1,_a1a_0)
- VOP3(vadd.i64,_a1b,_a0a,_a1b)
-
- VMAC(vqdmull.s32,_a0a,_as0_0,_bh2_1,0)
- VOP2(vmovn.i64,_a0b_1,_a1b)
- VOP3(vsra.u64,_a1a,_a1b,"#28")
- VMAC(vqdmlal.s32,_a0a,_as2_0,_bh0_1,0)
- VOP2(vbic.i32,_a0b,"#0xf0000000")
- "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
-
- VMAC(vqdmull.s32,_a1b,_as0_0,_bh2_1,1)
- VMAC(vqdmlal.s32,_a1b,_as2_0,_bh0_1,1)
-
- VOP2(vmov,_a0b_1,_a0a_1)
- VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
- VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
- VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_1,0)
- VMAC(vqdmlal.s32,_a0a,_bh2_0,_bl0_1,0)
-
- VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_1,0)
- VMAC(vqdmlal.s32,_a0b,_bl2_0,_bs0_1,0)
-
- VOP2(vmov,_a1a,_a1b)
- VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_1,1)
- VMAC(vqdmlal.s32,_a1a,_bh2_0,_bl0_1,1)
-
- VOP2(vswp,_a0b_1,_a0a_0)
-
- VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_1,1)
- VMAC(vqdmlal.s32,_a1b,_bl2_0,_bs0_1,1)
-
- VOP3(vsra.u64,_a0a,_a0b,"#28")
- VOP2(vmovn.i64,_a0b_0,_a0b)
-
- VOP2(vswp,_a1b_1,_a1a_0)
- VOP3(vadd.i64,_a0a,_a0a,_a1b)
-
- VOP2(vmovn.i64,_a0b_1,_a0a)
- VOP3(vsra.u64,_a1a,_a0a,"#28")
-
- VOP2(vbic.i32,_a0b,"#0xf0000000")
-
- VOP2(vswp,_a1a_0,_a1a_1)
-
- "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
- "sub %[c], #64" "\n\t"
-
- VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0)
-
- "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
- VOP2(vaddw.s32,_a1a,_a0a_0)
- VOP2(vmovn.i64,_a0a_0,_a1a)
- VOP2(vshr.s64,_a1a,"#28")
-
- VOP2(vaddw.s32,_a1a,_a0a_1)
- VOP2(vmovn.i64,_a0a_1,_a1a)
- VOP2(vshr.s64,_a1a,"#28")
-
- VOP2(vbic.i32,_a0a,"#0xf0000000")
-
- VOP2(vaddw.s32,_a1a,_a0b_0)
- VOP2(vmovn.i64,_a0b_0,_a1a)
-
- "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
-
- : [b]"+r"(bs)
- , [c]"+r"(vc)
-
- :: "q0","q1","q2","q3",
- "q4","q5","q6","q7",
- "q12","q13","q14","q15",
- "memory"
- );
-}
-
-void gf_mulw_unsigned(gf_s *__restrict__ cs, const gf as, uint32_t b)
-{
- uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1};
- assert(b<(1<<28));
-
- uint64x2_t accum;
- const uint32x2_t *va = (const uint32x2_t *) as->limb;
- uint32x2_t *vo = (uint32x2_t *) cs->limb;
- uint32x2_t vc, vn;
- uint32x2_t vb = {b, 0};
-
- vc = va[0];
- accum = vmull_lane_u32(vc, vb, 0);
- vo[0] = vmovn_u64(accum) & vmask;
- accum = vshrq_n_u64(accum,28);
-
- /* PERF: the right way to do this is to reduce behind, i.e.
- * vmull + vmlal round 0
- * vmull + vmlal round 1
- * vmull + vmlal round 2
- * vsraq round 0, 1
- * vmull + vmlal round 3
- * vsraq round 1, 2
- * ...
- */
-
- int i;
- for (i=1; i<8; i++) {
- vn = va[i];
- accum = vmlal_lane_u32(accum, vn, vb, 0);
- vo[i] = vmovn_u64(accum) & vmask;
- accum = vshrq_n_u64(accum,28);
- vc = vn;
- }
-
- accum = xx_vaddup_u64(vrev128_u64(accum));
- accum = vaddw_u32(accum, vo[0]);
- vo[0] = vmovn_u64(accum) & vmask;
-
- accum = vshrq_n_u64(accum,28);
- vo[1] += vmovn_u64(accum);
-}
+++ /dev/null
-/*
- * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
- * Copyright 2014-2016 Cryptography Research, Inc.
- *
- * Licensed under the OpenSSL license (the "License"). You may not use
- * this file except in compliance with the License. You can obtain a copy
- * in the file LICENSE in the source distribution or at
- * https://www.openssl.org/source/license.html
- *
- * Originally written by Mike Hamburg
- */
-
-#define GF_HEADROOM 2
-#define LIMBPERM(x) (((x)<<1 | (x)>>3) & 15)
-#define USE_NEON_PERM 1
-#define LIMBHI(x) ((x##ull)>>28)
-#define LIMBLO(x) ((x##ull)&((1ull<<28)-1))
-#define FIELD_LITERAL(a,b,c,d,e,f,g,h) { \
- { \
- LIMBLO(a), LIMBLO(e), LIMBHI(a), LIMBHI(e), LIMBLO(b), LIMBLO(f), \
- LIMBHI(b), LIMBHI(f), LIMBLO(c), LIMBLO(g), LIMBHI(c), LIMBHI(g), \
- LIMBLO(d), LIMBLO(h), LIMBHI(d), LIMBHI(h) \
- } \
-}
-
-#define LIMB_PLACE_VALUE(i) 28
-
-void gf_add_RAW(gf out, const gf a, const gf b)
-{
- for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
- ((uint32xn_t *) out)[i] =
- ((const uint32xn_t *)a)[i] + ((const uint32xn_t *)b)[i];
- }
-}
-
-void gf_sub_RAW(gf out, const gf a, const gf b)
-{
- for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
- ((uint32xn_t *) out)[i] =
- ((const uint32xn_t *)a)[i] - ((const uint32xn_t *)b)[i];
- }
-}
-
-void gf_bias(gf a, int amt)
-{
- uint32_t co1 = ((1ull << 28) - 1) * amt, co2 = co1 - amt;
- uint32x4_t lo = { co1, co2, co1, co1 };
- uint32x4_t hi = { co1, co1, co1, co1 };
- uint32x4_t *aa = (uint32x4_t *) a;
- aa[0] += lo;
- aa[1] += hi;
- aa[2] += hi;
- aa[3] += hi;
-}
-
-void gf_weak_reduce(gf a)
-{
- uint32x2_t *aa = (uint32x2_t *) a;
- uint32x2_t vmask = { (1ull << 28) - 1, (1ull << 28) - 1};
- uint32x2_t vm2 = { 0, -1}, tmp = vshr_n_u32(aa[7], 28);
-
- for (unsigned int i = 7; i >= 1; i--)
- aa[i] = vsra_n_u32(aa[i] & vmask, aa[i - 1], 28);
- aa[0] = (aa[0] & vmask) + vrev64_u32(tmp) + (tmp & vm2);
-}
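LIMBPERM above maps a logical 28-bit limb index to its interleaved NEON storage slot: logical limbs 0..7 land in the even slots and limbs 8..15 in the odd slots, consistent with the FIELD_LITERAL layout. A stand-alone sketch that prints the mapping (this helper program is illustrative, not part of the patch):

    #include <stdio.h>

    #define LIMBPERM(x) (((x) << 1 | (x) >> 3) & 15)

    int main(void)
    {
        /* logical limbs 0..7 -> even slots 0,2,...,14; 8..15 -> odd slots */
        for (int i = 0; i < 16; i++)
            printf("logical limb %2d -> storage slot %2d\n", i, LIMBPERM(i));
        return 0;
    }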
+++ /dev/null
-/*
- * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
- * Copyright 2016 Cryptography Research, Inc.
- *
- * Licensed under the OpenSSL license (the "License"). You may not use
- * this file except in compliance with the License. You can obtain a copy
- * in the file LICENSE in the source distribution or at
- * https://www.openssl.org/source/license.html
- *
- * Originally written by Mike Hamburg
- */
-
-#ifndef __ARCH_REF64_ARCH_INTRINSICS_H__
-# define __ARCH_REF64_ARCH_INTRINSICS_H__
-
-# define ARCH_WORD_BITS 64
-
-static __inline__ __attribute((always_inline, unused))
-uint64_t word_is_zero(uint64_t a)
-{
- /* let's hope the compiler isn't clever enough to optimize this. */
- return (((__uint128_t) a) - 1) >> 64;
-}
-
-static __inline__ __attribute((always_inline, unused))
-__uint128_t widemul(uint64_t a, uint64_t b)
-{
- return ((__uint128_t) a) * b;
-}
-
-#endif /* __ARCH_REF64_ARCH_INTRINSICS_H__ */
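A minimal usage sketch for this reference widemul(), showing the mask-and-carry step that the 56-bit-limb routines in the matching f_impl.c repeat for every output limb (toy operands; the test program itself is illustrative only):

    #include <stdint.h>

    static __uint128_t widemul(uint64_t a, uint64_t b)
    {
        return ((__uint128_t)a) * b;        /* as in the header above */
    }

    int main(void)
    {
        const uint64_t mask = (1ull << 56) - 1;
        __uint128_t acc = 0;
        uint64_t limb, carry;

        acc += widemul(3, 1ull << 55);      /* 3 * 2^55 = 2^56 + 2^55 */
        limb = (uint64_t)acc & mask;        /* low 56 bits -> output limb */
        carry = (uint64_t)(acc >> 56);      /* remainder carries onward */
        return !(limb == (1ull << 55) && carry == 1);
    }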
+++ /dev/null
-/*
- * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
- * Copyright 2014 Cryptography Research, Inc.
- *
- * Licensed under the OpenSSL license (the "License"). You may not use
- * this file except in compliance with the License. You can obtain a copy
- * in the file LICENSE in the source distribution or at
- * https://www.openssl.org/source/license.html
- *
- * Originally written by Mike Hamburg
- */
-#include "field.h"
-
-void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
-{
- const uint64_t *a = as->limb, *b = bs->limb;
- uint64_t *c = cs->limb;
- __uint128_t accum0 = 0, accum1 = 0, accum2;
- uint64_t mask = (1ull << 56) - 1;
- uint64_t aa[4], bb[4], bbb[4];
- unsigned int i;
-
- for (i = 0; i < 4; i++) {
- aa[i] = a[i] + a[i + 4];
- bb[i] = b[i] + b[i + 4];
- bbb[i] = bb[i] + b[i + 4];
- }
-
- int I_HATE_UNROLLED_LOOPS = 0;
-
- if (I_HATE_UNROLLED_LOOPS) {
-        /*
-         * The compiler probably won't unroll this, so it is roughly 80%
-         * slower than the unrolled version below.
-         */
- for (i = 0; i < 4; i++) {
- accum2 = 0;
-
- unsigned int j;
- for (j = 0; j <= i; j++) {
- accum2 += widemul(a[j], b[i - j]);
- accum1 += widemul(aa[j], bb[i - j]);
- accum0 += widemul(a[j + 4], b[i - j + 4]);
- }
- for (; j < 4; j++) {
- accum2 += widemul(a[j], b[i - j + 8]);
- accum1 += widemul(aa[j], bbb[i - j + 4]);
- accum0 += widemul(a[j + 4], bb[i - j + 4]);
- }
-
- accum1 -= accum2;
- accum0 += accum2;
-
- c[i] = ((uint64_t)(accum0)) & mask;
- c[i + 4] = ((uint64_t)(accum1)) & mask;
-
- accum0 >>= 56;
- accum1 >>= 56;
- }
- } else {
- accum2 = widemul(a[0], b[0]);
- accum1 += widemul(aa[0], bb[0]);
- accum0 += widemul(a[4], b[4]);
-
- accum2 += widemul(a[1], b[7]);
- accum1 += widemul(aa[1], bbb[3]);
- accum0 += widemul(a[5], bb[3]);
-
- accum2 += widemul(a[2], b[6]);
- accum1 += widemul(aa[2], bbb[2]);
- accum0 += widemul(a[6], bb[2]);
-
- accum2 += widemul(a[3], b[5]);
- accum1 += widemul(aa[3], bbb[1]);
- accum0 += widemul(a[7], bb[1]);
-
- accum1 -= accum2;
- accum0 += accum2;
-
- c[0] = ((uint64_t)(accum0)) & mask;
- c[4] = ((uint64_t)(accum1)) & mask;
-
- accum0 >>= 56;
- accum1 >>= 56;
-
- accum2 = widemul(a[0], b[1]);
- accum1 += widemul(aa[0], bb[1]);
- accum0 += widemul(a[4], b[5]);
-
- accum2 += widemul(a[1], b[0]);
- accum1 += widemul(aa[1], bb[0]);
- accum0 += widemul(a[5], b[4]);
-
- accum2 += widemul(a[2], b[7]);
- accum1 += widemul(aa[2], bbb[3]);
- accum0 += widemul(a[6], bb[3]);
-
- accum2 += widemul(a[3], b[6]);
- accum1 += widemul(aa[3], bbb[2]);
- accum0 += widemul(a[7], bb[2]);
-
- accum1 -= accum2;
- accum0 += accum2;
-
- c[1] = ((uint64_t)(accum0)) & mask;
- c[5] = ((uint64_t)(accum1)) & mask;
-
- accum0 >>= 56;
- accum1 >>= 56;
-
- accum2 = widemul(a[0], b[2]);
- accum1 += widemul(aa[0], bb[2]);
- accum0 += widemul(a[4], b[6]);
-
- accum2 += widemul(a[1], b[1]);
- accum1 += widemul(aa[1], bb[1]);
- accum0 += widemul(a[5], b[5]);
-
- accum2 += widemul(a[2], b[0]);
- accum1 += widemul(aa[2], bb[0]);
- accum0 += widemul(a[6], b[4]);
-
- accum2 += widemul(a[3], b[7]);
- accum1 += widemul(aa[3], bbb[3]);
- accum0 += widemul(a[7], bb[3]);
-
- accum1 -= accum2;
- accum0 += accum2;
-
- c[2] = ((uint64_t)(accum0)) & mask;
- c[6] = ((uint64_t)(accum1)) & mask;
-
- accum0 >>= 56;
- accum1 >>= 56;
-
- accum2 = widemul(a[0], b[3]);
- accum1 += widemul(aa[0], bb[3]);
- accum0 += widemul(a[4], b[7]);
-
- accum2 += widemul(a[1], b[2]);
- accum1 += widemul(aa[1], bb[2]);
- accum0 += widemul(a[5], b[6]);
-
- accum2 += widemul(a[2], b[1]);
- accum1 += widemul(aa[2], bb[1]);
- accum0 += widemul(a[6], b[5]);
-
- accum2 += widemul(a[3], b[0]);
- accum1 += widemul(aa[3], bb[0]);
- accum0 += widemul(a[7], b[4]);
-
- accum1 -= accum2;
- accum0 += accum2;
-
- c[3] = ((uint64_t)(accum0)) & mask;
- c[7] = ((uint64_t)(accum1)) & mask;
-
- accum0 >>= 56;
- accum1 >>= 56;
- } /* !I_HATE_UNROLLED_LOOPS */
-
- accum0 += accum1;
- accum0 += c[4];
- accum1 += c[0];
- c[4] = ((uint64_t)(accum0)) & mask;
- c[0] = ((uint64_t)(accum1)) & mask;
-
- accum0 >>= 56;
- accum1 >>= 56;
-
- c[5] += ((uint64_t)(accum0));
- c[1] += ((uint64_t)(accum1));
-}
-
-void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
-{
- const uint64_t *a = as->limb;
- uint64_t *c = cs->limb;
- __uint128_t accum0 = 0, accum4 = 0;
- uint64_t mask = (1ull << 56) - 1;
- int i;
-
- for (i = 0; i < 4; i++) {
- accum0 += widemul(b, a[i]);
- accum4 += widemul(b, a[i + 4]);
- c[i] = accum0 & mask;
- accum0 >>= 56;
- c[i + 4] = accum4 & mask;
- accum4 >>= 56;
- }
-
- accum0 += accum4 + c[4];
- c[4] = accum0 & mask;
- c[5] += accum0 >> 56;
-
- accum4 += c[0];
- c[0] = accum4 & mask;
- c[1] += accum4 >> 56;
-}
-
-void gf_sqr(gf_s * __restrict__ cs, const gf as)
-{
- const uint64_t *a = as->limb;
- uint64_t *c = cs->limb;
- __uint128_t accum0 = 0, accum1 = 0, accum2;
- uint64_t mask = (1ull << 56) - 1;
- uint64_t aa[4];
-
- /* For some reason clang doesn't vectorize this without prompting? */
- unsigned int i;
- for (i = 0; i < 4; i++) {
- aa[i] = a[i] + a[i + 4];
- }
-
- accum2 = widemul(a[0], a[3]);
- accum0 = widemul(aa[0], aa[3]);
- accum1 = widemul(a[4], a[7]);
-
- accum2 += widemul(a[1], a[2]);
- accum0 += widemul(aa[1], aa[2]);
- accum1 += widemul(a[5], a[6]);
-
- accum0 -= accum2;
- accum1 += accum2;
-
- c[3] = ((uint64_t)(accum1)) << 1 & mask;
- c[7] = ((uint64_t)(accum0)) << 1 & mask;
-
- accum0 >>= 55;
- accum1 >>= 55;
-
- accum0 += widemul(2 * aa[1], aa[3]);
- accum1 += widemul(2 * a[5], a[7]);
- accum0 += widemul(aa[2], aa[2]);
- accum1 += accum0;
-
- accum0 -= widemul(2 * a[1], a[3]);
- accum1 += widemul(a[6], a[6]);
-
- accum2 = widemul(a[0], a[0]);
- accum1 -= accum2;
- accum0 += accum2;
-
- accum0 -= widemul(a[2], a[2]);
- accum1 += widemul(aa[0], aa[0]);
- accum0 += widemul(a[4], a[4]);
-
- c[0] = ((uint64_t)(accum0)) & mask;
- c[4] = ((uint64_t)(accum1)) & mask;
-
- accum0 >>= 56;
- accum1 >>= 56;
-
- accum2 = widemul(2 * aa[2], aa[3]);
- accum0 -= widemul(2 * a[2], a[3]);
- accum1 += widemul(2 * a[6], a[7]);
-
- accum1 += accum2;
- accum0 += accum2;
-
- accum2 = widemul(2 * a[0], a[1]);
- accum1 += widemul(2 * aa[0], aa[1]);
- accum0 += widemul(2 * a[4], a[5]);
-
- accum1 -= accum2;
- accum0 += accum2;
-
- c[1] = ((uint64_t)(accum0)) & mask;
- c[5] = ((uint64_t)(accum1)) & mask;
-
- accum0 >>= 56;
- accum1 >>= 56;
-
- accum2 = widemul(aa[3], aa[3]);
- accum0 -= widemul(a[3], a[3]);
- accum1 += widemul(a[7], a[7]);
-
- accum1 += accum2;
- accum0 += accum2;
-
- accum2 = widemul(2 * a[0], a[2]);
- accum1 += widemul(2 * aa[0], aa[2]);
- accum0 += widemul(2 * a[4], a[6]);
-
- accum2 += widemul(a[1], a[1]);
- accum1 += widemul(aa[1], aa[1]);
- accum0 += widemul(a[5], a[5]);
-
- accum1 -= accum2;
- accum0 += accum2;
-
- c[2] = ((uint64_t)(accum0)) & mask;
- c[6] = ((uint64_t)(accum1)) & mask;
-
- accum0 >>= 56;
- accum1 >>= 56;
-
- accum0 += c[3];
- accum1 += c[7];
- c[3] = ((uint64_t)(accum0)) & mask;
- c[7] = ((uint64_t)(accum1)) & mask;
-
- /* we could almost stop here, but it wouldn't be stable, so... */
-
- accum0 >>= 56;
- accum1 >>= 56;
- c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
- c[0] += ((uint64_t)(accum1));
-}
+++ /dev/null
-/*
- * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
- * Copyright 2014-2016 Cryptography Research, Inc.
- *
- * Licensed under the OpenSSL license (the "License"). You may not use
- * this file except in compliance with the License. You can obtain a copy
- * in the file LICENSE in the source distribution or at
- * https://www.openssl.org/source/license.html
- *
- * Originally written by Mike Hamburg
- */
-
-#define GF_HEADROOM 9999 /* Everything is reduced anyway */
-#define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
-
-#define LIMB_PLACE_VALUE(i) 56
-
-void gf_add_RAW(gf out, const gf a, const gf b)
-{
- for (unsigned int i = 0; i < 8; i++)
- out->limb[i] = a->limb[i] + b->limb[i];
- gf_weak_reduce(out);
-}
-
-void gf_sub_RAW(gf out, const gf a, const gf b)
-{
- uint64_t co1 = ((1ull << 56) - 1) * 2, co2 = co1 - 2;
-
- for (unsigned int i = 0; i < 8; i++)
- out->limb[i] = a->limb[i] - b->limb[i] + ((i == 4) ? co2 : co1);
- gf_weak_reduce(out);
-}
-
-void gf_bias(gf a, int amt)
-{
- (void)a;
- (void)amt;
-}
-
-void gf_weak_reduce(gf a)
-{
- uint64_t mask = (1ull << 56) - 1;
- uint64_t tmp = a->limb[7] >> 56;
-
- a->limb[4] += tmp;
- for (unsigned int i = 7; i > 0; i--)
- a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 56);
- a->limb[0] = (a->limb[0] & mask) + tmp;
-}
+++ /dev/null
-/*
- * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
- * Copyright 2014-2016 Cryptography Research, Inc.
- *
- * Licensed under the OpenSSL license (the "License"). You may not use
- * this file except in compliance with the License. You can obtain a copy
- * in the file LICENSE in the source distribution or at
- * https://www.openssl.org/source/license.html
- *
- * Originally written by Mike Hamburg
- */
-#ifndef __ARCH_X86_64_ARCH_INTRINSICS_H__
-#define __ARCH_X86_64_ARCH_INTRINSICS_H__
-
-#define ARCH_WORD_BITS 64
-
-#include <openssl/e_os2.h>
-
-/* FUTURE: autogenerate */
-static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b)
-{
- uint64_t c, d;
-
- #ifndef __BMI2__
- __asm__ volatile
- ("movq %[a], %%rax;"
- "mulq %[b];"
- : [c]"=&a"(c), [d]"=d"(d)
- : [b]"m"(*b), [a]"m"(*a)
- : "cc");
- #else
- __asm__ volatile
- ("movq %[a], %%rdx;"
- "mulx %[b], %[c], %[d];"
- : [c]"=r"(c), [d]"=r"(d)
- : [b]"m"(*b), [a]"m"(*a)
- : "rdx");
- #endif
- return (((__uint128_t)(d)) << 64) | c;
-}
-
-static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b)
-{
- uint64_t c, d;
-
- #ifndef __BMI2__
- __asm__ volatile
- ("movq %[a], %%rax;"
- "mulq %[b];"
- : [c]"=&a"(c), [d]"=d"(d)
- : [b]"m"(*b), [a]"r"(a)
- : "cc");
- #else
- __asm__ volatile
- ("mulx %[b], %[c], %[d];"
- : [c]"=r"(c), [d]"=r"(d)
- : [b]"m"(*b), [a]"d"(a));
- #endif
- return (((__uint128_t)(d)) << 64) | c;
-}
-
-static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b)
-{
- uint64_t c, d;
-
- #ifndef __BMI2__
- __asm__ volatile
- ("mulq %[b];"
- : [c]"=a"(c), [d]"=d"(d)
- : [b]"r"(b), "a"(a)
- : "cc");
- #else
- __asm__ volatile
- ("mulx %[b], %[c], %[d];"
- : [c]"=r"(c), [d]"=r"(d)
- : [b]"r"(b), [a]"d"(a));
- #endif
- return (((__uint128_t)(d)) << 64) | c;
-}
-
-static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b)
-{
- uint64_t c, d;
-
- #ifndef __BMI2__
- __asm__ volatile
- ("movq %[a], %%rax; "
- "addq %%rax, %%rax; "
- "mulq %[b];"
- : [c]"=&a"(c), [d]"=d"(d)
- : [b]"m"(*b), [a]"m"(*a)
- : "cc");
- #else
- __asm__ volatile
- ("movq %[a], %%rdx;"
- "leaq (,%%rdx,2), %%rdx;"
- "mulx %[b], %[c], %[d];"
- : [c]"=r"(c), [d]"=r"(d)
- : [b]"m"(*b), [a]"m"(*a)
- : "rdx");
- #endif
- return (((__uint128_t)(d)) << 64) | c;
-}
-
-static __inline__ void mac(__uint128_t *acc, const uint64_t *a,
- const uint64_t *b)
-{
- uint64_t lo = *acc, hi = *acc >> 64;
-
- #ifdef __BMI2__
- uint64_t c,d;
- __asm__ volatile
- ("movq %[a], %%rdx; "
- "mulx %[b], %[c], %[d]; "
- "addq %[c], %[lo]; "
- "adcq %[d], %[hi]; "
- : [c]"=&r"(c), [d]"=&r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
- : [b]"m"(*b), [a]"m"(*a)
- : "rdx", "cc");
- #else
- __asm__ volatile
- ("movq %[a], %%rax; "
- "mulq %[b]; "
- "addq %%rax, %[lo]; "
- "adcq %%rdx, %[hi]; "
- : [lo]"+r"(lo), [hi]"+r"(hi)
- : [b]"m"(*b), [a]"m"(*a)
- : "rax", "rdx", "cc");
- #endif
-
- *acc = (((__uint128_t)(hi)) << 64) | lo;
-}
-
-static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2,
- const uint64_t *a, const uint64_t *b)
-{
- uint64_t lo = *acc, hi = *acc >> 64;
- uint64_t lo2 = *acc2, hi2 = *acc2 >> 64;
-
- #ifdef __BMI2__
- uint64_t c,d;
- __asm__ volatile
- ("movq %[a], %%rdx; "
- "mulx %[b], %[c], %[d]; "
- "addq %[c], %[lo]; "
- "adcq %[d], %[hi]; "
- "addq %[c], %[lo2]; "
- "adcq %[d], %[hi2]; "
- : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
- : [b]"m"(*b), [a]"m"(*a)
- : "rdx", "cc");
- #else
- __asm__ volatile
- ("movq %[a], %%rax; "
- "mulq %[b]; "
- "addq %%rax, %[lo]; "
- "adcq %%rdx, %[hi]; "
- "addq %%rax, %[lo2]; "
- "adcq %%rdx, %[hi2]; "
- : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
- : [b]"m"(*b), [a]"m"(*a)
- : "rax", "rdx", "cc");
- #endif
-
- *acc = (((__uint128_t)(hi)) << 64) | lo;
- *acc2 = (((__uint128_t)(hi2)) << 64) | lo2;
-}
-
-static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b)
-{
- uint64_t lo = *acc, hi = *acc >> 64;
-
- #ifdef __BMI2__
- uint64_t c,d;
- __asm__ volatile
- ("mulx %[b], %[c], %[d]; "
- "addq %[c], %[lo]; "
- "adcq %[d], %[hi]; "
- : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
- : [b]"m"(*b), [a]"d"(a)
- : "cc");
- #else
- __asm__ volatile
- ("movq %[a], %%rax; "
- "mulq %[b]; "
- "addq %%rax, %[lo]; "
- "adcq %%rdx, %[hi]; "
- : [lo]"+r"(lo), [hi]"+r"(hi)
- : [b]"m"(*b), [a]"r"(a)
- : "rax", "rdx", "cc");
- #endif
-
- *acc = (((__uint128_t)(hi)) << 64) | lo;
-}
-
-static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b)
-{
- uint64_t lo = *acc, hi = *acc >> 64;
-
- #ifdef __BMI2__
- uint64_t c,d;
- __asm__ volatile
- ("mulx %[b], %[c], %[d]; "
- "addq %[c], %[lo]; "
- "adcq %[d], %[hi]; "
- : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
- : [b]"r"(b), [a]"d"(a)
- : "cc");
- #else
- __asm__ volatile
- ("mulq %[b]; "
- "addq %%rax, %[lo]; "
- "adcq %%rdx, %[hi]; "
- : [lo]"+r"(lo), [hi]"+r"(hi), "+a"(a)
- : [b]"r"(b)
- : "rdx", "cc");
- #endif
-
- *acc = (((__uint128_t)(hi)) << 64) | lo;
-}
-
-static __inline__ void mac2(__uint128_t *acc, const uint64_t *a,
- const uint64_t *b)
-{
- uint64_t lo = *acc, hi = *acc >> 64;
-
- #ifdef __BMI2__
- uint64_t c,d;
- __asm__ volatile
- ("movq %[a], %%rdx; "
- "addq %%rdx, %%rdx; "
- "mulx %[b], %[c], %[d]; "
- "addq %[c], %[lo]; "
- "adcq %[d], %[hi]; "
- : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
- : [b]"m"(*b), [a]"m"(*a)
- : "rdx", "cc");
- #else
- __asm__ volatile
- ("movq %[a], %%rax; "
- "addq %%rax, %%rax; "
- "mulq %[b]; "
- "addq %%rax, %[lo]; "
- "adcq %%rdx, %[hi]; "
- : [lo]"+r"(lo), [hi]"+r"(hi)
- : [b]"m"(*b), [a]"m"(*a)
- : "rax", "rdx", "cc");
- #endif
-
- *acc = (((__uint128_t)(hi)) << 64) | lo;
-}
-
-static __inline__ void msb(__uint128_t *acc, const uint64_t *a,
- const uint64_t *b)
-{
- uint64_t lo = *acc, hi = *acc >> 64;
-
- #ifdef __BMI2__
- uint64_t c,d;
- __asm__ volatile
- ("movq %[a], %%rdx; "
- "mulx %[b], %[c], %[d]; "
- "subq %[c], %[lo]; "
- "sbbq %[d], %[hi]; "
- : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
- : [b]"m"(*b), [a]"m"(*a)
- : "rdx", "cc");
- #else
- __asm__ volatile
- ("movq %[a], %%rax; "
- "mulq %[b]; "
- "subq %%rax, %[lo]; "
- "sbbq %%rdx, %[hi]; "
- : [lo]"+r"(lo), [hi]"+r"(hi)
- : [b]"m"(*b), [a]"m"(*a)
- : "rax", "rdx", "cc");
- #endif
- *acc = (((__uint128_t)(hi)) << 64) | lo;
-}
-
-static __inline__ void msb2(__uint128_t *acc, const uint64_t *a,
- const uint64_t *b)
-{
- uint64_t lo = *acc, hi = *acc >> 64;
-
- #ifdef __BMI2__
- uint64_t c,d;
- __asm__ volatile
- ("movq %[a], %%rdx; "
- "addq %%rdx, %%rdx; "
- "mulx %[b], %[c], %[d]; "
- "subq %[c], %[lo]; "
- "sbbq %[d], %[hi]; "
- : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
- : [b]"m"(*b), [a]"m"(*a)
- : "rdx", "cc");
- #else
- __asm__ volatile
- ("movq %[a], %%rax; "
- "addq %%rax, %%rax; "
- "mulq %[b]; "
- "subq %%rax, %[lo]; "
- "sbbq %%rdx, %[hi]; "
- : [lo]"+r"(lo), [hi]"+r"(hi)
- : [b]"m"(*b), [a]"m"(*a)
- : "rax", "rdx", "cc");
- #endif
- *acc = (((__uint128_t)(hi)) << 64) | lo;
-}
-
-static __inline__ void mrs(__uint128_t *acc, const uint64_t *a,
- const uint64_t *b)
-{
- uint64_t c,d, lo = *acc, hi = *acc >> 64;
- __asm__ volatile
- ("movq %[a], %%rdx; "
- "mulx %[b], %[c], %[d]; "
- "subq %[lo], %[c]; "
- "sbbq %[hi], %[d]; "
- : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
- : [b]"m"(*b), [a]"m"(*a)
- : "rdx", "cc");
- *acc = (((__uint128_t)(d)) << 64) | c;
-}
-
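-/*
- * Branch-free zero test: NEG sets the carry flag iff x is non-zero, SBB
- * turns that flag into 0 or all-ones, and the final complement flips the
- * sense, so word_is_zero appears to return ~0UL when x == 0 and 0
- * otherwise.
- */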
-static __inline__ uint64_t word_is_zero(uint64_t x)
-{
- __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
- return ~x;
-}
-
-static inline uint64_t shrld(__uint128_t x, int n)
-{
- return x >> n;
-}
-
-#endif /* __ARCH_X86_64_ARCH_INTRINSICS_H__ */
+++ /dev/null
-/*
- * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
- * Copyright 2014 Cryptography Research, Inc.
- *
- * Licensed under the OpenSSL license (the "License"). You may not use
- * this file except in compliance with the License. You can obtain a copy
- * in the file LICENSE in the source distribution or at
- * https://www.openssl.org/source/license.html
- *
- * Originally written by Mike Hamburg
- */
-
-#include "field.h"
-
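-/*
- * Outline, inferred from the code rather than stated anywhere here: the
- * field appears to be GF(2^448 - 2^224 - 1) with elements held as eight
- * 56-bit limbs, limb[i] carrying weight 2^(56*i).  gf_mul splits a and b
- * into 4-limb halves, a = a0 + a1*phi and b = b0 + b1*phi with phi = 2^224,
- * and uses phi^2 == phi + 1 (mod p): aa/bb hold a0+a1 and b0+b1, bbb holds
- * b0 + 2*b1, and the partial products are combined with adds and subtracts
- * instead of a full 8x8 schoolbook multiply.
- */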
-void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
-{
- const uint64_t *a = as->limb, *b = bs->limb;
- uint64_t *c = cs->limb;
- __uint128_t accum0 = 0, accum1 = 0, accum2;
- uint64_t mask = (1ull << 56) - 1;
- uint64_t aa[4] VECTOR_ALIGNED, bb[4] VECTOR_ALIGNED, bbb[4] VECTOR_ALIGNED;
-
- /* Clang does not seem to vectorize this unprompted, so use explicit vector types. */
- unsigned int i;
- for (i = 0; i < sizeof(aa) / sizeof(uint64xn_t); i++) {
- ((uint64xn_t *) aa)[i] =
- ((const uint64xn_t *)a)[i] + ((const uint64xn_t *)(&a[4]))[i];
- ((uint64xn_t *) bb)[i] =
- ((const uint64xn_t *)b)[i] + ((const uint64xn_t *)(&b[4]))[i];
- ((uint64xn_t *) bbb)[i] =
- ((const uint64xn_t *)bb)[i] + ((const uint64xn_t *)(&b[4]))[i];
- }
- /*
- * Scalar equivalent:
- * for (i = 0; i < 4; i++) {
- *     aa[i] = a[i] + a[i + 4];
- *     bb[i] = b[i] + b[i + 4];
- * }
- */
-
- accum2 = widemul(&a[0], &b[3]);
- accum0 = widemul(&aa[0], &bb[3]);
- accum1 = widemul(&a[4], &b[7]);
-
- mac(&accum2, &a[1], &b[2]);
- mac(&accum0, &aa[1], &bb[2]);
- mac(&accum1, &a[5], &b[6]);
-
- mac(&accum2, &a[2], &b[1]);
- mac(&accum0, &aa[2], &bb[1]);
- mac(&accum1, &a[6], &b[5]);
-
- mac(&accum2, &a[3], &b[0]);
- mac(&accum0, &aa[3], &bb[0]);
- mac(&accum1, &a[7], &b[4]);
-
- accum0 -= accum2;
- accum1 += accum2;
-
- c[3] = ((uint64_t)(accum1)) & mask;
- c[7] = ((uint64_t)(accum0)) & mask;
-
- accum0 >>= 56;
- accum1 >>= 56;
-
- mac(&accum0, &aa[1], &bb[3]);
- mac(&accum1, &a[5], &b[7]);
- mac(&accum0, &aa[2], &bb[2]);
- mac(&accum1, &a[6], &b[6]);
- mac(&accum0, &aa[3], &bb[1]);
- accum1 += accum0;
-
- accum2 = widemul(&a[0], &b[0]);
- accum1 -= accum2;
- accum0 += accum2;
-
- msb(&accum0, &a[1], &b[3]);
- msb(&accum0, &a[2], &b[2]);
- mac(&accum1, &a[7], &b[5]);
- msb(&accum0, &a[3], &b[1]);
- mac(&accum1, &aa[0], &bb[0]);
- mac(&accum0, &a[4], &b[4]);
-
- c[0] = ((uint64_t)(accum0)) & mask;
- c[4] = ((uint64_t)(accum1)) & mask;
-
- accum0 >>= 56;
- accum1 >>= 56;
-
- accum2 = widemul(&a[2], &b[7]);
- mac(&accum0, &a[6], &bb[3]);
- mac(&accum1, &aa[2], &bbb[3]);
-
- mac(&accum2, &a[3], &b[6]);
- mac(&accum0, &a[7], &bb[2]);
- mac(&accum1, &aa[3], &bbb[2]);
-
- mac(&accum2, &a[0], &b[1]);
- mac(&accum1, &aa[0], &bb[1]);
- mac(&accum0, &a[4], &b[5]);
-
- mac(&accum2, &a[1], &b[0]);
- mac(&accum1, &aa[1], &bb[0]);
- mac(&accum0, &a[5], &b[4]);
-
- accum1 -= accum2;
- accum0 += accum2;
-
- c[1] = ((uint64_t)(accum0)) & mask;
- c[5] = ((uint64_t)(accum1)) & mask;
-
- accum0 >>= 56;
- accum1 >>= 56;
-
- accum2 = widemul(&a[3], &b[7]);
- mac(&accum0, &a[7], &bb[3]);
- mac(&accum1, &aa[3], &bbb[3]);
-
- mac(&accum2, &a[0], &b[2]);
- mac(&accum1, &aa[0], &bb[2]);
- mac(&accum0, &a[4], &b[6]);
-
- mac(&accum2, &a[1], &b[1]);
- mac(&accum1, &aa[1], &bb[1]);
- mac(&accum0, &a[5], &b[5]);
-
- mac(&accum2, &a[2], &b[0]);
- mac(&accum1, &aa[2], &bb[0]);
- mac(&accum0, &a[6], &b[4]);
-
- accum1 -= accum2;
- accum0 += accum2;
-
- c[2] = ((uint64_t)(accum0)) & mask;
- c[6] = ((uint64_t)(accum1)) & mask;
-
- accum0 >>= 56;
- accum1 >>= 56;
-
- accum0 += c[3];
- accum1 += c[7];
- c[3] = ((uint64_t)(accum0)) & mask;
- c[7] = ((uint64_t)(accum1)) & mask;
-
- /*
- * We could almost stop here, but without one more carry fold the limb
- * bounds would not stay stable across repeated operations.
- */
-
- accum0 >>= 56;
- accum1 >>= 56;
- c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
- c[0] += ((uint64_t)(accum1));
-}
-
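-/*
- * Multiplication of a field element by a 32-bit word.  The low and high
- * four limbs are walked in two interleaved carry chains (accum0, accum4);
- * at the end the carry out of limb 7 is folded back into limbs 0 and 4,
- * consistent with 2^448 == 2^224 + 1 (mod p), while the carry out of limb 3
- * simply continues into limb 4.
- */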
-void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
-{
- const uint64_t *a = as->limb;
- uint64_t *c = cs->limb;
-
- __uint128_t accum0, accum4;
- uint64_t mask = (1ull << 56) - 1;
-
- accum0 = widemul_rm(b, &a[0]);
- accum4 = widemul_rm(b, &a[4]);
-
- c[0] = accum0 & mask;
- accum0 >>= 56;
- c[4] = accum4 & mask;
- accum4 >>= 56;
-
- mac_rm(&accum0, b, &a[1]);
- mac_rm(&accum4, b, &a[5]);
-
- c[1] = accum0 & mask;
- accum0 >>= 56;
- c[5] = accum4 & mask;
- accum4 >>= 56;
-
- mac_rm(&accum0, b, &a[2]);
- mac_rm(&accum4, b, &a[6]);
-
- c[2] = accum0 & mask;
- accum0 >>= 56;
- c[6] = accum4 & mask;
- accum4 >>= 56;
-
- mac_rm(&accum0, b, &a[3]);
- mac_rm(&accum4, b, &a[7]);
-
- c[3] = accum0 & mask;
- accum0 >>= 56;
- c[7] = accum4 & mask;
- accum4 >>= 56;
-
- accum0 += accum4 + c[4];
- c[4] = accum0 & mask;
- c[5] += accum0 >> 56;
-
- accum4 += c[0];
- c[0] = accum4 & mask;
- c[1] += accum4 >> 56;
-}
-
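-/*
- * Squaring reuses the split from gf_mul above but exploits symmetry: cross
- * terms a[i]*a[j] with i != j are needed twice, hence the widemul2, mac2
- * and msb2 helpers.  The first round of accumulators holds only terms that
- * need doubling, so their low bits are doubled with << 1 and their carries
- * are doubled by shifting right by 55 instead of 56.
- */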
-void gf_sqr(gf_s * __restrict__ cs, const gf as)
-{
- const uint64_t *a = as->limb;
- uint64_t *c = cs->limb;
- __uint128_t accum0 = 0, accum1 = 0, accum2;
- uint64_t mask = (1ull << 56) - 1;
- uint64_t aa[4] VECTOR_ALIGNED;
-
- /* Clang does not seem to vectorize this unprompted, so use explicit vector types. */
- unsigned int i;
- for (i = 0; i < sizeof(aa) / sizeof(uint64xn_t); i++) {
- ((uint64xn_t *) aa)[i] =
- ((const uint64xn_t *)a)[i] + ((const uint64xn_t *)(&a[4]))[i];
- }
-
- accum2 = widemul(&a[0], &a[3]);
- accum0 = widemul(&aa[0], &aa[3]);
- accum1 = widemul(&a[4], &a[7]);
-
- mac(&accum2, &a[1], &a[2]);
- mac(&accum0, &aa[1], &aa[2]);
- mac(&accum1, &a[5], &a[6]);
-
- accum0 -= accum2;
- accum1 += accum2;
-
- c[3] = (((uint64_t)(accum1)) << 1) & mask;
- c[7] = (((uint64_t)(accum0)) << 1) & mask;
-
- accum0 >>= 55;
- accum1 >>= 55;
-
- mac2(&accum0, &aa[1], &aa[3]);
- mac2(&accum1, &a[5], &a[7]);
- mac(&accum0, &aa[2], &aa[2]);
- accum1 += accum0;
-
- msb2(&accum0, &a[1], &a[3]);
- mac(&accum1, &a[6], &a[6]);
-
- accum2 = widemul(&a[0], &a[0]);
- accum1 -= accum2;
- accum0 += accum2;
-
- msb(&accum0, &a[2], &a[2]);
- mac(&accum1, &aa[0], &aa[0]);
- mac(&accum0, &a[4], &a[4]);
-
- c[0] = ((uint64_t)(accum0)) & mask;
- c[4] = ((uint64_t)(accum1)) & mask;
-
- accum0 >>= 56;
- accum1 >>= 56;
-
- accum2 = widemul2(&aa[2], &aa[3]);
- msb2(&accum0, &a[2], &a[3]);
- mac2(&accum1, &a[6], &a[7]);
-
- accum1 += accum2;
- accum0 += accum2;
-
- accum2 = widemul2(&a[0], &a[1]);
- mac2(&accum1, &aa[0], &aa[1]);
- mac2(&accum0, &a[4], &a[5]);
-
- accum1 -= accum2;
- accum0 += accum2;
-
- c[1] = ((uint64_t)(accum0)) & mask;
- c[5] = ((uint64_t)(accum1)) & mask;
-
- accum0 >>= 56;
- accum1 >>= 56;
-
- accum2 = widemul(&aa[3], &aa[3]);
- msb(&accum0, &a[3], &a[3]);
- mac(&accum1, &a[7], &a[7]);
-
- accum1 += accum2;
- accum0 += accum2;
-
- accum2 = widemul2(&a[0], &a[2]);
- mac2(&accum1, &aa[0], &aa[2]);
- mac2(&accum0, &a[4], &a[6]);
-
- mac(&accum2, &a[1], &a[1]);
- mac(&accum1, &aa[1], &aa[1]);
- mac(&accum0, &a[5], &a[5]);
-
- accum1 -= accum2;
- accum0 += accum2;
-
- c[2] = ((uint64_t)(accum0)) & mask;
- c[6] = ((uint64_t)(accum1)) & mask;
-
- accum0 >>= 56;
- accum1 >>= 56;
-
- accum0 += c[3];
- accum1 += c[7];
- c[3] = ((uint64_t)(accum0)) & mask;
- c[7] = ((uint64_t)(accum1)) & mask;
-
- /*
- * We could almost stop here, but without one more carry fold the limb
- * bounds would not stay stable across repeated operations.
- */
-
- accum0 >>= 56;
- accum1 >>= 56;
- c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
- c[0] += ((uint64_t)(accum1));
-}
+++ /dev/null
-/*
- * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
- * Copyright 2014-2016 Cryptography Research, Inc.
- *
- * Licensed under the OpenSSL license (the "License"). You may not use
- * this file except in compliance with the License. You can obtain a copy
- * in the file LICENSE in the source distribution or at
- * https://www.openssl.org/source/license.html
- *
- * Originally written by Mike Hamburg
- */
-
-#define GF_HEADROOM 60
-#define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
-#define LIMB_PLACE_VALUE(i) 56
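-/*
- * Representation, as implied by the constants: eight limbs in radix 2^56,
- * so an element is the sum of limb[i] * 2^(56*i) for i = 0..7, and
- * FIELD_LITERAL lists the limbs least-significant first.  GF_HEADROOM
- * presumably bounds how far limbs may grow past 2^56 through unreduced
- * additions and biasing before a reduction becomes necessary.
- */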
-
-void gf_add_RAW(gf out, const gf a, const gf b)
-{
- for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint64xn_t); i++) {
- ((uint64xn_t *) out)[i] =
- ((const uint64xn_t *)a)[i] + ((const uint64xn_t *)b)[i];
- }
-}
-
-void gf_sub_RAW(gf out, const gf a, const gf b)
-{
- for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint64xn_t); i++) {
- ((uint64xn_t *) out)[i] =
- ((const uint64xn_t *)a)[i] - ((const uint64xn_t *)b)[i];
- }
-}
-
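-/*
- * gf_bias adds amt copies of p = 2^448 - 2^224 - 1 so that a following
- * limb-wise subtraction cannot underflow.  In 56-bit limbs, amt*p has
- * (2^56 - 1)*amt in every position except limb 4, which is smaller by
- * exactly amt; that is what co1 and co2 encode.
- */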
-void gf_bias(gf a, int amt)
-{
- uint64_t co1 = ((1ull << 56) - 1) * amt, co2 = co1 - amt;
-
-#if __AVX2__
- uint64x4_t lo = { co1, co1, co1, co1 };
- uint64x4_t hi = { co2, co1, co1, co1 };
- uint64x4_t *aa = (uint64x4_t *) a;
- aa[0] += lo;
- aa[1] += hi;
-#elif __SSE2__
- uint64x2_t lo = { co1, co1 };
- uint64x2_t hi = { co2, co1 };
- uint64x2_t *aa = (uint64x2_t *) a;
- aa[0] += lo;
- aa[1] += lo;
- aa[2] += hi;
- aa[3] += lo;
-#else
- for (unsigned int i = 0; i < sizeof(*a) / sizeof(uint64_t); i++) {
- a->limb[i] += (i == 4) ? co2 : co1;
- }
-#endif
-}
-
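-/*
- * Weak reduction: each limb is cut back to its low 56 bits and its carry is
- * pushed into the next limb; the carry out of the top limb wraps around
- * into limbs 0 and 4, reflecting 2^448 == 2^224 + 1 (mod p).  The result is
- * not necessarily fully reduced below p, only brought back under the limb
- * bounds.
- */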
-void gf_weak_reduce(gf a)
-{
- /* PERF: use pshufb/palignr if anyone cares about speed of this */
- uint64_t mask = (1ull << 56) - 1;
- uint64_t tmp = a->limb[7] >> 56;
-
- a->limb[4] += tmp;
- for (unsigned int i = 7; i > 0; i--) {
- a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 56);
- }
- a->limb[0] = (a->limb[0] & mask) + tmp;
-}