1 ;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
3 ;; Licensed under the OpenSSL license (the "License"). You may not use
4 ;; this file except in compliance with the License. You can obtain a copy
5 ;; in the file LICENSE in the source distribution or at
6 ;; https://www.openssl.org/source/license.html
8 ;;====================================================================
9 ;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 ;; Rights for redistribution and usage in source and binary forms are
13 ;; granted according to the OpenSSL license. Warranty of any kind is
15 ;;====================================================================
16 ;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
17 ;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
18 ;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
19 ;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
20 ;;====================================================================
23 .if .ASSEMBLER_VERSION<7000000 ; assembler-version check; the statements it guards fall outside this excerpt
27 .asg bn_mul_add_words,_bn_mul_add_words ; substitution symbol: _bn_mul_add_words expands to bn_mul_add_words
28 .asg bn_mul_words,_bn_mul_words ; same underscore-stripping alias pattern for the remaining entries
29 .asg bn_sqr_words,_bn_sqr_words
30 .asg bn_add_words,_bn_add_words
31 .asg bn_sub_words,_bn_sub_words
32 .asg bn_div_words,_bn_div_words
33 .asg bn_sqr_comba8,_bn_sqr_comba8
34 .asg bn_mul_comba8,_bn_mul_comba8
35 .asg bn_sqr_comba4,_bn_sqr_comba4
36 .asg bn_mul_comba4,_bn_mul_comba4
51 .global _bn_mul_add_words ; export the multiply-and-add entry point
58 [B0] ZERO A19 ; high part of accumulator, cleared only when B0 is non-zero
64 ;;====================================================================
65 LDW *ARG1++,B7 ; ap[i], ARG1 post-incremented to the next word
67 LDW *ARG0++,A7 ; rp[i], ARG0 post-incremented to the next word
69 NOP 3 ; [2,0] in epilogue
71 ADDU A19,A21:A20,A19:A18 ; A19:A18 = A19 + A21:A20: low word lands in A18, overflow in A19
73 SPKERNEL 2,1 ; leave slot for "return value"
74 || STW A18,*A2++ ; rp[i], written through A2 (set-up of A2 is not visible in this excerpt)
76 ;;====================================================================
78 MV A19,RET ; return value: the high part of the accumulator left after the loop
88 [B0] ZERO A19 ; high part of accumulator, cleared only when B0 is non-zero
92 ;;====================================================================
93 LDW *ARG1++,A7 ; ap[i], ARG1 post-incremented to the next word
95 MPY32U A7,ARG3,A17:A16 ; unsigned 32x32 multiply of ap[i] by ARG3, 64-bit product in A17:A16
96 NOP 4 ; [2,0] in epilogue
99 SPKERNEL 2,1 ; leave slot for "return value"
100 || STW A18,*ARG0++ ; rp[i], ARG0 post-incremented to the next word
102 ;;====================================================================
104 MV A19,RET ; return value: the high part of the accumulator left after the loop
107 .global _bn_sqr_words ; export the word-squaring entry point
115 || [B0] ADD 4,ARG0,ARG0 ; when B0 is non-zero, advance ARG0 by one word so it addresses the odd result words
119 ;;====================================================================
120 LDW *ARG1++,B7 ; ap[i], ARG1 post-incremented to the next word
123 NOP 3 ; [2,0] in epilogue
124 STW B0,*B2++(8) ; rp[2*i], pointer B2 steps by 8 bytes (two words)
126 SPKERNEL 2,0 ; fully overlap BNOP RA,5
127 || STW A1,*ARG0++(8) ; rp[2*i+1], ARG0 steps by 8 bytes (two words)
128 ;;====================================================================
132 .global _bn_add_words ; export the multi-word addition entry point
139 [B0] ZERO A1 ; carry flag, cleared only when B0 is non-zero
144 ;;====================================================================
145 LDW *ARG2++,A7 ; bp[i], ARG2 post-incremented
146 || LDW *ARG1++,B7 ; ap[i], ARG1 post-incremented; issued in parallel with the bp[i] load
150 SPKERNEL 0,0 ; fully overlap BNOP RA,5
151 || STW A0,*A3++ ; write result word through A3, post-incremented
152 || MV A1,RET ; keep carry flag in RET
153 ;;====================================================================
157 .global _bn_sub_words ; export the multi-word subtraction entry point
164 [B0] ZERO A2 ; borrow flag, cleared only when B0 is non-zero
169 ;;====================================================================
170 LDW *ARG2++,A7 ; bp[i], ARG2 post-incremented
171 || LDW *ARG1++,B7 ; ap[i], ARG1 post-incremented; issued in parallel with the bp[i] load
174 [A2] SUB A1:A0,1,A1:A0 ; when the incoming borrow flag A2 is set, take one more off A1:A0
175 SPKERNEL 0,1 ; leave slot for "return borrow flag"
176 || STW A0,*A3++ ; write result word through A3, post-incremented
177 || AND 1,A1,A2 ; pass on borrow flag: bit 0 of A1 becomes the next A2
178 ;;====================================================================
180 AND 1,A1,RET ; return borrow flag: bit 0 of A1
183 .global _bn_div_words ; export the division entry point
186 LMBD 1,A6,A0 ; leading zero bits in dv
187 LMBD 1,A4,A1 ; leading zero bits in hi
192 ||[ A2] MVK -1,A4 ; return overflow: A4 = -1 when A2 is set
193 ||[!A2] MV A4,A3 ; reassign hi: from here on A3 holds hi
194 [!A2] MV B4,A4 ; reassign lo, will be quotient
196 [!A2] SHL A6,A0,A6 ; normalize dv: shift left by its leading-zero count A0
199 [!A2] CMPLTU A3,A6,A1 ; hi<dv?
200 ||[!A2] SHL A4,1,A5:A4 ; lo<<1, the bit shifted out of A4 lands in A5
201 [!A1] SUB A3,A6,A3 ; hi-=dv
203 [!A2] SHRU A3,31,A1 ; upper bit
204 ||[!A2] ADDAH A5,A3,A3 ; hi<<1|lo>>31, computed as A3 = A5 + (A3<<1)
207 [!A1] CMPLTU A3,A6,A1 ; hi<dv?
209 || SHL A4,1,A5:A4 ; lo<<1, the bit shifted out of A4 lands in A5
210 [!A1] SUB A3,A6,A3 ; hi-=dv
211 ||[!A1] OR 1,A4,A4 ; quotient: set the low bit of A4
212 SHRU A3,31,A1 ; upper bit
213 || ADDAH A5,A3,A3 ; hi<<1|lo>>31, computed as A3 = A5 + (A3<<1)
219 ;;====================================================================
220 ;; Not really Comba algorithm, just straightforward NxM... Dedicated
221 ;; fully unrolled real Comba implementations are asymptotically 2x
222 ;; faster, but naturally larger undertaking. Purpose of this exercise
223 ;; was rather to learn to master nested SPLOOPs...
224 ;;====================================================================
225 .global _bn_sqr_comba8 ; export the 8-word squaring entry point
226 .global _bn_mul_comba8 ; export the 8-word multiplication entry point
232 || MVK 8,A0 ; M, outer loop counter
233 || MV ARG1,A5 ; copy ap
234 || MV ARG0,B4 ; copy rp
235 || ZERO B19 ; high part of accumulator
237 || SUB B0,2,B1 ; N-2, initial ILC
238 || SUB B0,1,B2 ; const B2=N-1
239 || LDW *A5++,B6 ; ap[0]
240 || MV A0,A3 ; const A3=M
241 sploopNxM?: ; for best performance arrange M<=N
242 [A0] SPLOOPD 2 ; 2*n+10
246 || LDW *A5++,A9 ; pre-fetch ap[1]
249 ;;====================================================================
250 ;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
251 ;; This is because of Advisory 15 from TI publication SPRZ247I.
252 LDW *ARG2++,A7 ; bp[i]
254 [A1] LDW *B5++,B7 ; rp[i], loaded only when A1 is non-zero
258 ADDU B19,B21:B20,B19:B18 ; B19:B18 = B19 + B21:B20: low word lands in B18, overflow in B19
261 || STW B18,*B4++ ; rp[i]
263 ;;====================================================================
264 outer?: ; m*2*(n+1)+10
265 SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0]
267 || CMPGT A0,1,A2 ; done pre-fetching ap[i+1]?
268 MVD A9,B6 ; move through .M unit(*)
269 [A2] LDW *A5++,A9 ; pre-fetch ap[i+1]
270 SUBAW B5,B2,B5 ; rewind rp to rp[1]
272 [A0] BNOP.S1 outer?,4 ; branch back to outer? while A0 is non-zero
273 || [A0] SUB.L A0,1,A0 ; decrement the outer loop counter
274 STW B19,*B4--[B2] ; store B19, then step B4 back by B2 words: rewind rp to rp[1]
275 || ZERO.S B19 ; high part of accumulator
279 ;; (*) It should be noted that B6 is used as input to MPY32U in
280 ;; chronologically next cycle in *preceding* SPLOOP iteration.
281 ;; Normally such arrangement would require DINT, but at this
282 ;; point SPLOOP is draining and interrupts are disabled
285 .global _bn_sqr_comba4 ; export the 4-word squaring entry point
286 .global _bn_mul_comba4 ; export the 4-word multiplication entry point
293 ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
294 ;; because of low-counter effect, when prologue phase finishes
295 ;; before SPKERNEL instruction is reached. As a result it's 25%
296 ;; slower than expected...
298 || MVK 4,A0 ; M, outer loop counter
299 || MV ARG1,A5 ; copy ap
300 || MV ARG0,B4 ; copy rp
301 || ZERO B19 ; high part of accumulator
303 || SUB B0,2,B1 ; first ILC
304 || SUB B0,1,B2 ; const B2=N-1
305 || LDW *A5++,B6 ; ap[0]
306 || MV A0,A3 ; const A3=M
308 ;; This alternative is an exercise in fully unrolled Comba
309 ;; algorithm implementation that operates at n*(n+1)+12, or
310 ;; as little as 32 cycles...
311 LDW *ARG1[0],B16 ; a[0]
312 || LDW *ARG2[0],A16 ; b[0]
313 LDW *ARG1[1],B17 ; a[1]
314 || LDW *ARG2[1],A17 ; b[1]
315 LDW *ARG1[2],B18 ; a[2]
316 || LDW *ARG2[2],A18 ; b[2]
317 LDW *ARG1[3],B19 ; a[3]
318 || LDW *ARG2[3],A19 ; b[3]
320 MPY32U A16,B16,A1:A0 ; a[0]*b[0]
321 MPY32U A17,B16,A23:A22 ; a[0]*b[1]
322 MPY32U A16,B17,A25:A24 ; a[1]*b[0]
323 MPY32U A16,B18,A27:A26 ; a[2]*b[0]
325 || MPY32U A17,B17,A29:A28 ; a[1]*b[1]
326 MPY32U A18,B16,A31:A30 ; a[0]*b[2]
329 || MPY32U A19,B16,A21:A20 ; a[3]*b[0]
330 || ADDU A24,A1:A0,A1:A0 ; add A24 into the A1:A0 accumulator
333 || MPY32U A18,B17,A23:A22 ; a[2]*b[1]
336 || MPY32U A17,B18,A25:A24 ; a[1]*b[2]
337 || ADDU A28,A9:A8,A9:A8 ; add A28 into the A9:A8 accumulator
339 || MPY32U A16,B19,A27:A26 ; a[0]*b[3]
340 || ADDU A30,A9:A8,A9:A8 ; add A30 into the A9:A8 accumulator
342 || ADDU B0,A9:A8,A9:A8 ; fold B-side word B0 into A9:A8 (cross-path read)
346 || MPY32U A19,B17,A21:A20 ; a[3]*b[1]
347 || ADDU A22,A1:A0,A1:A0 ; add A22 into the A1:A0 accumulator
349 || MPY32U A18,B18,A23:A22 ; a[2]*b[2]
350 || ADDU A24,A1:A0,A1:A0 ; add A24 into the A1:A0 accumulator
352 || MPY32U A17,B19,A25:A24 ; a[1]*b[3]
353 || ADDU A26,A1:A0,A1:A0 ; add A26 into the A1:A0 accumulator
355 || ADDU B8,A1:A0,A1:A0 ; fold B-side word B8 into A1:A0 (cross-path read)
357 || MPY32U A19,B18,A27:A26 ; a[3]*b[2]
360 || MPY32U A18,B19,A29:A28 ; a[2]*b[3]
361 || ADDU A22,A9:A8,A9:A8 ; add A22 into the A9:A8 accumulator
363 || MPY32U A19,B19,A31:A30 ; a[3]*b[3]
364 || ADDU A24,A9:A8,A9:A8 ; add A24 into the A9:A8 accumulator
366 || ADDU B0,A9:A8,A9:A8 ; fold B-side word B0 into A9:A8 (cross-path read)
370 || ADDU A28,A1:A0,A1:A0 ; add A28 into the A1:A0 accumulator
373 || ADDU B8,A1:A0,A1:A0 ; fold B-side word B8 into A1:A0 (cross-path read)
377 || ADDU B0,A9:A8,A9:A8 ; fold B0 into A9:A8 in the same execute packet as the preceding instruction; a separate packet would add a cycle to this hand-scheduled sequence