2 * Copyright (C) 2017 Denys Vlasenko
4 * Licensed under GPLv2, see file LICENSE in this source tree.
9 * @file pstm_montgomery_reduce.c
10 * @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
12 * Multiprecision Montgomery Reduction.
15 * Copyright (c) 2013-2015 INSIDE Secure Corporation
16 * Copyright (c) PeerSec Networks, 2002-2011
19 * The latest version of this code is available at http://www.matrixssl.org
21 * This software is open source; you can redistribute it and/or modify
22 * it under the terms of the GNU General Public License as published by
23 * the Free Software Foundation; either version 2 of the License, or
24 * (at your option) any later version.
26 * This General Public License does NOT permit incorporating this software
27 * into proprietary programs. If you are unable to comply with the GPL, a
28 * commercial license for this software may be purchased from INSIDE at
29 * http://www.insidesecure.com/eng/Company/Locations
31 * This program is distributed in WITHOUT ANY WARRANTY; without even the
32 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
33 * See the GNU General Public License for more details.
35 * You should have received a copy of the GNU General Public License
36 * along with this program; if not, write to the Free Software
37 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
38 * http://www.gnu.org/copyleft/gpl.html
40 /******************************************************************************/
43 //#include "../cryptoApi.h"
46 /******************************************************************************/
49 /* x86-32 optimized for 32 bit platforms. For 64 bit mode use X86_64 instead */
50 #if !defined(__GNUC__) || !defined(__i386__) || !defined(PSTM_32BIT)
51 #error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
53 //#pragma message ("Using 32 bit x86 Assembly Optimizations")
/* NOTE(review): fragment of the x86-32 INNERMUL macro -- the
 * "#define INNERMUL asm( \" header and the mull instruction are missing
 * from this extract; restore them from upstream before compiling.
 * Visible effect: eax (presumably the low word of mu * *tmpm) has the
 * carry cy added, is accumulated into _c[LO], and edx (high word plus
 * both carry-outs) is written back to cy for the next limb. */
63 "movl %5,%%eax \n\t" \
65 "addl %1,%%eax \n\t" \
66 "adcl $0,%%edx \n\t" \
67 "addl %%eax,%0 \n\t" \
68 "adcl $0,%%edx \n\t" \
69 "movl %%edx,%1 \n\t" \
70 :"=g"(_c[LO]), "=r"(cy) \
/* NOTE(review): "0"/"1" tie the inputs to outputs %0/%1 (read-modify-write). */
71 :"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \
/* NOTE(review): clobber is spelled "%cc" here but "cc" in the x86-64
 * variant below; "cc" is the documented GCC form -- confirm upstream. */
72 : "%eax", "%edx", "%cc")
/* NOTE(review): tail of the x86-32 PROPCARRY macro (its header and the
 * addl/setb lines are elided).  movzbl %%al,%1 converts the saved carry
 * byte into the new cy value. */
78 "movzbl %%al,%1 \n\t" \
79 :"=g"(_c[LO]), "=r"(cy) \
80 :"0"(_c[LO]), "1"(cy) \
83 /******************************************************************************/
84 #elif defined(PSTM_X86_64)
85 /* x86-64 optimized */
86 #if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
87 #error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
89 //#pragma message ("Using 64 bit x86_64 Assembly Optimizations")
/* NOTE(review): fragment of the x86-64 INNERMUL macro -- the
 * "#define INNERMUL asm( \" header and the mulq instruction are missing
 * from this extract.  Same structure as the 32-bit version above, on
 * 64-bit limbs: rax += cy (carry into rdx), _c[LO] += rax (carry into
 * rdx), then rdx becomes the new cy. */
99 "movq %5,%%rax \n\t" \
101 "addq %1,%%rax \n\t" \
102 "adcq $0,%%rdx \n\t" \
103 "addq %%rax,%0 \n\t" \
104 "adcq $0,%%rdx \n\t" \
105 "movq %%rdx,%1 \n\t" \
106 :"=g"(_c[LO]), "=r"(cy) \
107 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
108 : "%rax", "%rdx", "cc")
/* NOTE(review): fragment of the x86-64 INNERMUL8 macro (8-way unrolled
 * inner multiply) -- the "#define INNERMUL8 asm( \" header and every
 * "mulq %4" line between the limbs are elided from this extract.
 * Per visible code, each limb k does: rax = tmpm[k] (preloaded via r11),
 * add _c[k] (from r10) and the running carry %3, store the low word to
 * _c'[k] (%0 offset) and the high word to cy (%1).  r10/r11 pipeline the
 * next _c and tmpm loads one limb ahead. */
112 "movq 0(%5),%%rax \n\t" \
113 "movq 0(%2),%%r10 \n\t" \
114 "movq 0x8(%5),%%r11 \n\t" \
116 "addq %%r10,%%rax \n\t" \
117 "adcq $0,%%rdx \n\t" \
118 "movq 0x8(%2),%%r10 \n\t" \
119 "addq %3,%%rax \n\t" \
120 "adcq $0,%%rdx \n\t" \
121 "movq %%rax,0(%0) \n\t" \
122 "movq %%rdx,%1 \n\t" \
124 "movq %%r11,%%rax \n\t" \
125 "movq 0x10(%5),%%r11 \n\t" \
127 "addq %%r10,%%rax \n\t" \
128 "adcq $0,%%rdx \n\t" \
129 "movq 0x10(%2),%%r10 \n\t" \
130 "addq %3,%%rax \n\t" \
131 "adcq $0,%%rdx \n\t" \
132 "movq %%rax,0x8(%0) \n\t" \
133 "movq %%rdx,%1 \n\t" \
135 "movq %%r11,%%rax \n\t" \
136 "movq 0x18(%5),%%r11 \n\t" \
138 "addq %%r10,%%rax \n\t" \
139 "adcq $0,%%rdx \n\t" \
140 "movq 0x18(%2),%%r10 \n\t" \
141 "addq %3,%%rax \n\t" \
142 "adcq $0,%%rdx \n\t" \
143 "movq %%rax,0x10(%0) \n\t" \
144 "movq %%rdx,%1 \n\t" \
146 "movq %%r11,%%rax \n\t" \
147 "movq 0x20(%5),%%r11 \n\t" \
149 "addq %%r10,%%rax \n\t" \
150 "adcq $0,%%rdx \n\t" \
151 "movq 0x20(%2),%%r10 \n\t" \
152 "addq %3,%%rax \n\t" \
153 "adcq $0,%%rdx \n\t" \
154 "movq %%rax,0x18(%0) \n\t" \
155 "movq %%rdx,%1 \n\t" \
157 "movq %%r11,%%rax \n\t" \
158 "movq 0x28(%5),%%r11 \n\t" \
160 "addq %%r10,%%rax \n\t" \
161 "adcq $0,%%rdx \n\t" \
162 "movq 0x28(%2),%%r10 \n\t" \
163 "addq %3,%%rax \n\t" \
164 "adcq $0,%%rdx \n\t" \
165 "movq %%rax,0x20(%0) \n\t" \
166 "movq %%rdx,%1 \n\t" \
168 "movq %%r11,%%rax \n\t" \
169 "movq 0x30(%5),%%r11 \n\t" \
171 "addq %%r10,%%rax \n\t" \
172 "adcq $0,%%rdx \n\t" \
173 "movq 0x30(%2),%%r10 \n\t" \
174 "addq %3,%%rax \n\t" \
175 "adcq $0,%%rdx \n\t" \
176 "movq %%rax,0x28(%0) \n\t" \
177 "movq %%rdx,%1 \n\t" \
179 "movq %%r11,%%rax \n\t" \
180 "movq 0x38(%5),%%r11 \n\t" \
182 "addq %%r10,%%rax \n\t" \
183 "adcq $0,%%rdx \n\t" \
184 "movq 0x38(%2),%%r10 \n\t" \
185 "addq %3,%%rax \n\t" \
186 "adcq $0,%%rdx \n\t" \
187 "movq %%rax,0x30(%0) \n\t" \
188 "movq %%rdx,%1 \n\t" \
190 "movq %%r11,%%rax \n\t" \
192 "addq %%r10,%%rax \n\t" \
193 "adcq $0,%%rdx \n\t" \
194 "addq %3,%%rax \n\t" \
195 "adcq $0,%%rdx \n\t" \
196 "movq %%rax,0x38(%0) \n\t" \
197 "movq %%rdx,%1 \n\t" \
199 :"=r"(_c), "=r"(cy) \
200 : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
201 : "%rax", "%rdx", "%r10", "%r11", "cc")
/* NOTE(review): tail of the x86-64 PROPCARRY macro (header and the
 * addq/setb lines are elided).  movzbq %%al,%1 zero-extends the saved
 * carry byte into the new 64-bit cy. */
207 "movzbq %%al,%1 \n\t" \
208 :"=g"(_c[LO]), "=r"(cy) \
209 :"0"(_c[LO]), "1"(cy) \
212 /******************************************************************************/
213 #elif defined(PSTM_ARM)
222 //#pragma message ("Using 32 bit ARM Thumb2 Assembly Optimizations")
/* NOTE(review): fragments of the ARM Thumb2 INNERMUL/PROPCARRY macros --
 * the "#define ... asm(" headers, the LDR/STR of _c[0] and the ITE/IT
 * condition blocks are elided from this extract.  Visible structure:
 * r0 += cy with flags (ADDS); cy is rebuilt as the carry flag (MOVCS/
 * MOVCC select 1/0); UMLAL accumulates mu * *tmpm into r0:cy. */
226 " ADDS r0,r0,%0 \n\t" \
228 " MOVCS %0,#1 \n\t" \
229 " MOVCC %0,#0 \n\t" \
230 " UMLAL r0,%0,%3,%4 \n\t" \
232 :"=r"(cy),"=m"(_c[0])\
233 :"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\
238 " ADDS r0,r0,%0 \n\t" \
241 " MOVCS %0,#1 \n\t" \
242 " MOVCC %0,#0 \n\t" \
243 :"=r"(cy),"=m"(_c[0])\
246 #else /* Non-Thumb2 code */
247 //#pragma message ("Using 32 bit ARM Assembly Optimizations")
/* NOTE(review): classic-ARM (non-Thumb2) variants of the same two macros;
 * identical arithmetic, without the Thumb2 IT blocks (headers elided). */
251 " ADDS r0,r0,%0 \n\t" \
252 " MOVCS %0,#1 \n\t" \
253 " MOVCC %0,#0 \n\t" \
254 " UMLAL r0,%0,%3,%4 \n\t" \
256 :"=r"(cy),"=m"(_c[0])\
257 :"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\
262 " ADDS r0,r0,%0 \n\t" \
264 " MOVCS %0,#1 \n\t" \
265 " MOVCC %0,#0 \n\t" \
266 :"=r"(cy),"=m"(_c[0])\
269 #endif /* __thumb2__ */
272 /******************************************************************************/
273 #elif defined(PSTM_MIPS)
275 //#pragma message ("Using 32 bit MIPS Assembly Optimizations")
/* NOTE(review): fragments of the MIPS INNERMUL/PROPCARRY macros -- the
 * "#define ... asm(" headers and the mflo/mfhi/lw/sw lines are elided
 * from this extract.  Visible structure: multu %3,%4 forms mu * tmpm[0]
 * in HI/LO ($13/$12 presumably hold them); $12 accumulates cy and _c[0]
 * with sltu-generated carries folded into $13; the final high word
 * becomes the new cy (%0). */
284 " multu %3,%4 \n\t" \
287 " addu $12,$12,%0 \n\t" \
288 " sltu $10,$12,%0 \n\t" \
289 " addu $13,$13,$10 \n\t" \
291 " addu $12,$12,$10 \n\t" \
292 " sltu $10,$12,$10 \n\t" \
293 " addu %0,$13,$10 \n\t" \
295 :"=r"(cy),"=m"(_c[0])\
296 :"r"(cy),"r"(mu),"r"(tmpm[0]),"r"(_c[0])\
/* NOTE(review): PROPCARRY fragment: $10 += cy; new cy = 1 iff the addu
 * wrapped (sltu unsigned-overflow test). */
303 " addu $10,$10,%0 \n\t" \
305 " sltu %0,$10,%0 \n\t" \
306 :"=r"(cy),"=m"(_c[0])\
311 /******************************************************************************/
/* NOTE(review): fragment of the portable-C INNERMUL (the "#define
 * INNERMUL do {" and the pstm_word t declaration lines are elided).
 * Computes t = _c[0] + cy + mu * (*tmpm++) in the double-width
 * pstm_word type, stores the low digit back into _c[0], and keeps the
 * high part as the next carry. */
323 t = ((pstm_word)_c[0] + (pstm_word)cy) + \
324 (((pstm_word)mu) * ((pstm_word)*tmpm++)); \
325 _c[0] = (pstm_digit)t; \
326 cy = (pstm_digit)(t >> DIGIT_BIT); \
/* NOTE(review): portable-C PROPCARRY (its "#define PROPCARRY \" line is
 * elided): add the carry into the current digit; the new carry is 1 iff
 * the unsigned add wrapped (t < cy after the +=). */
330 do { pstm_digit t = _c[0] += cy; cy = (t < cy); } while (0)
334 /******************************************************************************/
338 /* computes x/R == x (mod N) via Montgomery Reduction */
/* NOTE(review): the function body below is heavily elided in this extract
 * (variable declarations, loop bodies, braces and the return are missing);
 * the comments describe only what the surviving lines show.  Restore the
 * full body from upstream before compiling. */
339 int32 pstm_montgomery_reduce(psPool_t *pool, pstm_int *a, pstm_int *m,
340 pstm_digit mp, pstm_digit *paD, uint32 paDlen)
342 pstm_digit *c, *_c, *tmpm, mu;
348 /* Sanity test for bad numbers. This will confirm no buffer overruns */
349 return PS_LIMIT_FAIL;
/* Reuse the caller-supplied scratch buffer paD when it can hold the
 * 2*pa+1 working digits; otherwise the (elided) else branch presumably
 * allocates one. */
352 if (paD && paDlen >= (uint32)2*pa+1) {
354 memset(c, 0x0, paDlen);
/* Copy a's digits into the working buffer (loop body elided). */
360 for (x = 0; x < oldused; x++) {
/* Outer reduction loop: one Montgomery round per digit of m. */
366 for (x = 0; x < pa; x++) {
368 /* get Mu for this round */
/* x86-64 only: 8-way unrolled inner loop (INNERMUL8), 8 digits per pass. */
374 for (; y < (pa & ~7); y += 8) {
379 #endif /* PSTM_X86_64 */
/* Remaining digits one at a time (INNERMUL; body elided). */
380 for (; y < pa; y++) {
/* Copy the pa+1 result digits back out of the scratch buffer (body elided). */
394 for (x = 0; x < pa+1; x++) {
/* Zero any leftover high digits from the old value of a (body elided). */
398 for (; x < oldused; x++) {
407 /* reuse x as return code */
410 /* if A >= m then A = A - m */
411 if (pstm_cmp_mag (a, m) != PSTM_LT) {
412 if (s_pstm_sub (a, m, a) != PSTM_OKAY) {
/* Scratch buffer was not the caller's paD -- presumably freed here
 * (body elided); confirm against upstream. */
416 if (paDlen < (uint32)2*pa+1) {
422 #endif /* !DISABLE_PSTM */
423 /******************************************************************************/