Bump version to 1.32.0
[oweals/busybox.git] / networking / tls_pstm_montgomery_reduce.c
1 /*
2  * Copyright (C) 2017 Denys Vlasenko
3  *
4  * Licensed under GPLv2, see file LICENSE in this source tree.
5  */
6 #include "tls.h"
7
8 /* The file is taken almost verbatim from matrixssl-3-7-2b-open/crypto/math/.
9  * Changes are flagged with //bbox
10  */
11
12 /**
13  *      @file    pstm_montgomery_reduce.c
14  *      @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
15  *
16  *      Multiprecision Montgomery Reduction.
17  */
18 /*
19  *      Copyright (c) 2013-2015 INSIDE Secure Corporation
20  *      Copyright (c) PeerSec Networks, 2002-2011
21  *      All Rights Reserved
22  *
23  *      The latest version of this code is available at http://www.matrixssl.org
24  *
25  *      This software is open source; you can redistribute it and/or modify
26  *      it under the terms of the GNU General Public License as published by
27  *      the Free Software Foundation; either version 2 of the License, or
28  *      (at your option) any later version.
29  *
30  *      This General Public License does NOT permit incorporating this software
31  *      into proprietary programs.  If you are unable to comply with the GPL, a
32  *      commercial license for this software may be purchased from INSIDE at
33  *      http://www.insidesecure.com/eng/Company/Locations
34  *
35  *      This program is distributed in WITHOUT ANY WARRANTY; without even the
36  *      implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
37  *      See the GNU General Public License for more details.
38  *
39  *      You should have received a copy of the GNU General Public License
40  *      along with this program; if not, write to the Free Software
41  *      Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
42  *      http://www.gnu.org/copyleft/gpl.html
43  */
44 /******************************************************************************/
45
46 //bbox
47 //#include "../cryptoApi.h"
48 #ifndef DISABLE_PSTM
49
50 /******************************************************************************/
51
52 #if defined(PSTM_X86)
53 /* x86-32 optimized for 32 bit platforms. For 64 bit mode use X86_64 instead */
54 #if !defined(__GNUC__) || !defined(__i386__) || !defined(PSTM_32BIT)
55 #error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
56 #endif
57 //#pragma message ("Using 32 bit x86 Assembly Optimizations")
58
59 #define MONT_START
60 #define MONT_FINI
61 #define LOOP_END
62 #define LOOP_START \
63    mu = c[x] * mp
64
65 #define INNERMUL                                          \
66 asm(                                                      \
67    "movl %5,%%eax \n\t"                                   \
68    "mull %4       \n\t"                                   \
69    "addl %1,%%eax \n\t"                                   \
70    "adcl $0,%%edx \n\t"                                   \
71    "addl %%eax,%0 \n\t"                                   \
72    "adcl $0,%%edx \n\t"                                   \
73    "movl %%edx,%1 \n\t"                                   \
74 :"=g"(_c[LO]), "=r"(cy)                                   \
75 :"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++)              \
76 : "%eax", "%edx", "cc")
77
78 #define PROPCARRY                           \
79 asm(                                        \
80    "addl   %1,%0    \n\t"                   \
81    "setb   %%al     \n\t"                   \
82    "movzbl %%al,%1 \n\t"                    \
83 :"=g"(_c[LO]), "=r"(cy)                     \
84 :"0"(_c[LO]), "1"(cy)                       \
85 : "%eax", "cc")
86
87 /******************************************************************************/
88 #elif defined(PSTM_X86_64)
89 /* x86-64 optimized */
90 #if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
91 #error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
92 #endif
93 //#pragma message ("Using 64 bit x86_64 Assembly Optimizations")
94
95 #define MONT_START
96 #define MONT_FINI
97 #define LOOP_END
98 #define LOOP_START \
99 mu = c[x] * mp
100
101 #define INNERMUL                                           \
102 asm(                                                       \
103         "movq %5,%%rax \n\t"                                   \
104         "mulq %4       \n\t"                                   \
105         "addq %1,%%rax \n\t"                                   \
106         "adcq $0,%%rdx \n\t"                                   \
107         "addq %%rax,%0 \n\t"                                   \
108         "adcq $0,%%rdx \n\t"                                   \
109         "movq %%rdx,%1 \n\t"                                   \
110         :"=g"(_c[LO]), "=r"(cy)                                \
111         :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++)           \
112         : "%rax", "%rdx", "cc")
113
114 #define INNERMUL8                               \
115 asm(                                                    \
116         "movq 0(%5),%%rax    \n\t"  \
117         "movq 0(%2),%%r10    \n\t"  \
118         "movq 0x8(%5),%%r11  \n\t"  \
119         "mulq %4             \n\t"  \
120         "addq %%r10,%%rax    \n\t"  \
121         "adcq $0,%%rdx       \n\t"  \
122         "movq 0x8(%2),%%r10  \n\t"  \
123         "addq %3,%%rax       \n\t"  \
124         "adcq $0,%%rdx       \n\t"  \
125         "movq %%rax,0(%0)    \n\t"  \
126         "movq %%rdx,%1       \n\t"  \
127         \
128         "movq %%r11,%%rax    \n\t"  \
129         "movq 0x10(%5),%%r11 \n\t"  \
130         "mulq %4             \n\t"  \
131         "addq %%r10,%%rax    \n\t"  \
132         "adcq $0,%%rdx       \n\t"  \
133         "movq 0x10(%2),%%r10 \n\t"  \
134         "addq %3,%%rax       \n\t"  \
135         "adcq $0,%%rdx       \n\t"  \
136         "movq %%rax,0x8(%0)  \n\t"  \
137         "movq %%rdx,%1       \n\t"  \
138         \
139         "movq %%r11,%%rax    \n\t"  \
140         "movq 0x18(%5),%%r11 \n\t"  \
141         "mulq %4             \n\t"  \
142         "addq %%r10,%%rax    \n\t"  \
143         "adcq $0,%%rdx       \n\t"  \
144         "movq 0x18(%2),%%r10 \n\t"  \
145         "addq %3,%%rax       \n\t"  \
146         "adcq $0,%%rdx       \n\t"  \
147         "movq %%rax,0x10(%0) \n\t"  \
148         "movq %%rdx,%1       \n\t"  \
149         \
150         "movq %%r11,%%rax    \n\t"  \
151         "movq 0x20(%5),%%r11 \n\t"  \
152         "mulq %4             \n\t"  \
153         "addq %%r10,%%rax    \n\t"  \
154         "adcq $0,%%rdx       \n\t"  \
155         "movq 0x20(%2),%%r10 \n\t"  \
156         "addq %3,%%rax       \n\t"  \
157         "adcq $0,%%rdx       \n\t"  \
158         "movq %%rax,0x18(%0) \n\t"  \
159         "movq %%rdx,%1       \n\t"  \
160         \
161         "movq %%r11,%%rax    \n\t"  \
162         "movq 0x28(%5),%%r11 \n\t"  \
163         "mulq %4             \n\t"  \
164         "addq %%r10,%%rax    \n\t"  \
165         "adcq $0,%%rdx       \n\t"  \
166         "movq 0x28(%2),%%r10 \n\t"  \
167         "addq %3,%%rax       \n\t"  \
168         "adcq $0,%%rdx       \n\t"  \
169         "movq %%rax,0x20(%0) \n\t"  \
170         "movq %%rdx,%1       \n\t"  \
171         \
172         "movq %%r11,%%rax    \n\t"  \
173         "movq 0x30(%5),%%r11 \n\t"  \
174         "mulq %4             \n\t"  \
175         "addq %%r10,%%rax    \n\t"  \
176         "adcq $0,%%rdx       \n\t"  \
177         "movq 0x30(%2),%%r10 \n\t"  \
178         "addq %3,%%rax       \n\t"  \
179         "adcq $0,%%rdx       \n\t"  \
180         "movq %%rax,0x28(%0) \n\t"  \
181         "movq %%rdx,%1       \n\t"  \
182         \
183         "movq %%r11,%%rax    \n\t"  \
184         "movq 0x38(%5),%%r11 \n\t"  \
185         "mulq %4             \n\t"  \
186         "addq %%r10,%%rax    \n\t"  \
187         "adcq $0,%%rdx       \n\t"  \
188         "movq 0x38(%2),%%r10 \n\t"  \
189         "addq %3,%%rax       \n\t"  \
190         "adcq $0,%%rdx       \n\t"  \
191         "movq %%rax,0x30(%0) \n\t"  \
192         "movq %%rdx,%1       \n\t"  \
193         \
194         "movq %%r11,%%rax    \n\t"  \
195         "mulq %4             \n\t"  \
196         "addq %%r10,%%rax    \n\t"  \
197         "adcq $0,%%rdx       \n\t"  \
198         "addq %3,%%rax       \n\t"  \
199         "adcq $0,%%rdx       \n\t"  \
200         "movq %%rax,0x38(%0) \n\t"  \
201         "movq %%rdx,%1       \n\t"  \
202         \
203         :"=r"(_c), "=r"(cy)                    \
204         : "0"(_c),  "1"(cy), "g"(mu), "r"(tmpm)\
205         : "%rax", "%rdx", "%r10", "%r11", "cc")
206
207 #define PROPCARRY                          \
208 asm(                                       \
209         "addq   %1,%0    \n\t"                 \
210         "setb   %%al     \n\t"                 \
211         "movzbq %%al,%1 \n\t"                  \
212         :"=g"(_c[LO]), "=r"(cy)                \
213         :"0"(_c[LO]), "1"(cy)                  \
214         : "%rax", "cc")
215
216 /******************************************************************************/
217 #elif defined(PSTM_ARM)
218
219 #define MONT_START
220 #define MONT_FINI
221 #define LOOP_END
222 #define LOOP_START \
223 mu = c[x] * mp
224
225 #ifdef __thumb2__
226 //#pragma message ("Using 32 bit ARM Thumb2 Assembly Optimizations")
227 #define INNERMUL                    \
228 asm(                                \
229         " LDR    r0,%1            \n\t" \
230         " ADDS   r0,r0,%0         \n\t" \
231         " ITE CS                  \n\t" \
232         " MOVCS  %0,#1            \n\t" \
233         " MOVCC  %0,#0            \n\t" \
234         " UMLAL  r0,%0,%3,%4      \n\t" \
235         " STR    r0,%1            \n\t" \
236         :"=r"(cy),"=m"(_c[0])\
237         :"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\
238         :"r0","cc");
239 #define PROPCARRY                  \
240 asm(                               \
241         " LDR   r0,%1            \n\t" \
242         " ADDS  r0,r0,%0         \n\t" \
243         " STR   r0,%1            \n\t" \
244         " ITE CS                 \n\t" \
245         " MOVCS %0,#1            \n\t" \
246         " MOVCC %0,#0            \n\t" \
247         :"=r"(cy),"=m"(_c[0])\
248         :"0"(cy),"m"(_c[0])\
249         :"r0","cc");
250 #else /* Non-Thumb2 code */
251 //#pragma message ("Using 32 bit ARM Assembly Optimizations")
252 #define INNERMUL                    \
253 asm(                                \
254         " LDR    r0,%1            \n\t" \
255         " ADDS   r0,r0,%0         \n\t" \
256         " MOVCS  %0,#1            \n\t" \
257         " MOVCC  %0,#0            \n\t" \
258         " UMLAL  r0,%0,%3,%4      \n\t" \
259         " STR    r0,%1            \n\t" \
260         :"=r"(cy),"=m"(_c[0])\
261         :"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\
262         :"r0","cc");
263 #define PROPCARRY                  \
264 asm(                               \
265         " LDR   r0,%1            \n\t" \
266         " ADDS  r0,r0,%0         \n\t" \
267         " STR   r0,%1            \n\t" \
268         " MOVCS %0,#1            \n\t" \
269         " MOVCC %0,#0            \n\t" \
270         :"=r"(cy),"=m"(_c[0])\
271         :"0"(cy),"m"(_c[0])\
272         :"r0","cc");
273 #endif /* __thumb2__ */
274
275
276 /******************************************************************************/
277 #elif defined(PSTM_MIPS)
278 /* MIPS32 */
279 //#pragma message ("Using 32 bit MIPS Assembly Optimizations")
280 #define MONT_START
281 #define MONT_FINI
282 #define LOOP_END
283 #define LOOP_START \
284 mu = c[x] * mp
285
286 #define INNERMUL                      \
287 asm(                                                              \
288         " multu    %3,%4          \n\t"   \
289         " mflo     $12            \n\t"   \
290         " mfhi     $13            \n\t"   \
291         " addu     $12,$12,%0     \n\t"   \
292         " sltu     $10,$12,%0     \n\t"   \
293         " addu     $13,$13,$10    \n\t"   \
294         " lw       $10,%1         \n\t"   \
295         " addu     $12,$12,$10    \n\t"   \
296         " sltu     $10,$12,$10    \n\t"   \
297         " addu     %0,$13,$10     \n\t"   \
298         " sw       $12,%1         \n\t"   \
299         :"=r"(cy),"=m"(_c[0])\
300         :"r"(cy),"r"(mu),"r"(tmpm[0]),"r"(_c[0])\
301         :"$10","$12","$13")\
302 ; ++tmpm;
303
304 #define PROPCARRY                     \
305 asm(                                  \
306         " lw       $10,%1        \n\t"    \
307         " addu     $10,$10,%0    \n\t"    \
308         " sw       $10,%1        \n\t"    \
309         " sltu     %0,$10,%0     \n\t"    \
310         :"=r"(cy),"=m"(_c[0])\
311         :"r"(cy),"r"(_c[0])\
312         :"$10");
313
314
315 /******************************************************************************/
316 #else
317
318 /* ISO C code */
319 #define MONT_START
320 #define MONT_FINI
321 #define LOOP_END
322 #define LOOP_START \
323    mu = c[x] * mp
324
325 #define INNERMUL                                                                                \
326         do { pstm_word t;                                                                       \
327                 t = ((pstm_word)_c[0] + (pstm_word)cy) +                \
328                         (((pstm_word)mu) * ((pstm_word)*tmpm++));       \
329                 _c[0] = (pstm_digit)t;                                                  \
330                 cy = (pstm_digit)(t >> DIGIT_BIT);                              \
331         } while (0)
332
333 #define PROPCARRY \
334    do { pstm_digit t = _c[0] += cy; cy = (t < cy); } while (0)
335
336 #endif
337
338 /******************************************************************************/
339
340 #define LO 0
341
342 /* computes x/R == x (mod N) via Montgomery Reduction */
343 int32 FAST_FUNC pstm_montgomery_reduce(psPool_t *pool, pstm_int *a, pstm_int *m,
344                 pstm_digit mp, pstm_digit *paD, uint32 paDlen)
345 {
346         pstm_digit      *c, *_c, *tmpm, mu;
347         int32           oldused, x, y;
348         int             pa; //bbox: was int16
349
350         pa = m->used;
351         if (pa > a->alloc) {
352                 /* Sanity test for bad numbers.  This will confirm no buffer overruns */
353                 return PS_LIMIT_FAIL;
354         }
355
356         if (paD && paDlen >= (uint32)2*pa+1) {
357                 c = paD;
358                 memset(c, 0x0, paDlen);
359         } else {
360                 c = xzalloc(2*pa+1);//bbox
361         }
362         /* copy the input */
363         oldused = a->used;
364         for (x = 0; x < oldused; x++) {
365                 c[x] = a->dp[x];
366         }
367
368         MONT_START;
369
370         for (x = 0; x < pa; x++) {
371                 pstm_digit cy = 0;
372                 /* get Mu for this round */
373                 LOOP_START;
374                 _c   = c + x;
375                 tmpm = m->dp;
376                 y = 0;
377 #ifdef PSTM_X86_64
378                 for (; y < (pa & ~7); y += 8) {
379                         INNERMUL8;
380                         _c   += 8;
381                         tmpm += 8;
382                 }
383 #endif /* PSTM_X86_64 */
384                 for (; y < pa; y++) {
385                         INNERMUL;
386                         ++_c;
387                 }
388                 LOOP_END;
389                 while (cy) {
390                         PROPCARRY;
391                         ++_c;
392                 }
393         }
394
395         /* now copy out */
396         _c   = c + pa;
397         tmpm = a->dp;
398         for (x = 0; x < pa+1; x++) {
399                 *tmpm++ = *_c++;
400         }
401
402         for (; x < oldused; x++)   {
403                 *tmpm++ = 0;
404         }
405
406         MONT_FINI;
407
408         a->used = pa+1;
409         pstm_clamp(a);
410
411         /* reuse x as return code */
412         x = PSTM_OKAY;
413
414         /* if A >= m then A = A - m */
415         if (pstm_cmp_mag (a, m) != PSTM_LT) {
416                 if (s_pstm_sub (a, m, a) != PSTM_OKAY) {
417                         x = PS_MEM_FAIL;
418                 }
419         }
420         if (paDlen < (uint32)2*pa+1) {
421                 psFree(c, pool);
422         }
423         return x;
424 }
425
426 #endif /* !DISABLE_PSTM */
427 /******************************************************************************/