1 ;;====================================================================
2 ;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 ;; Rights for redistribution and usage in source and binary forms are
6 ;; granted according to the OpenSSL license. Warranty of any kind is
8 ;;====================================================================
9 ;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
10 ;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
11 ;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
12 ;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
13 ;;====================================================================
16 .asg bn_mul_add_words,_bn_mul_add_words
17 .asg bn_mul_words,_bn_mul_words
18 .asg bn_sqr_words,_bn_sqr_words
19 .asg bn_add_words,_bn_add_words
20 .asg bn_sub_words,_bn_sub_words
21 .asg bn_div_words,_bn_div_words
22 .asg bn_sqr_comba8,_bn_sqr_comba8
23 .asg bn_mul_comba8,_bn_mul_comba8
24 .asg bn_sqr_comba4,_bn_sqr_comba4
25 .asg bn_mul_comba4,_bn_mul_comba4
40 .global _bn_mul_add_words
47 [B0] ZERO A19 ; high part of accumulator
53 ;;====================================================================
54 LDW *ARG1++,B7 ; ap[i]
56 LDW *ARG0++,A7 ; rp[i]
58 NOP 3 ; [2,0] in epilogue
60 ADDU A19,A21:A20,A19:A18
62 SPKERNEL 2,1 ; leave slot for "return value"
63 || STW A18,*A2++ ; rp[i]
65 ;;====================================================================
67 MV A19,RET ; return value
77 [B0] ZERO A19 ; high part of accumulator
81 ;;====================================================================
82 LDW *ARG1++,A7 ; ap[i]
84 MPY32U A7,ARG3,A17:A16
85 NOP 4 ; [2,0] in epiloque
88 SPKERNEL 2,1 ; leave slot for "return value"
89 || STW A18,*ARG0++ ; rp[i]
91 ;;====================================================================
93 MV A19,RET ; return value
104 || [B0] ADD 4,ARG0,ARG0
108 ;;====================================================================
109 LDW *ARG1++,B7 ; ap[i]
112 NOP 3 ; [2,0] in epilogue
113 STW B0,*B2++(8) ; rp[2*i]
115 SPKERNEL 2,0 ; fully overlap BNOP RA,5
116 || STW A1,*ARG0++(8) ; rp[2*i+1]
117 ;;====================================================================
121 .global _bn_add_words
128 [B0] ZERO A1 ; carry flag
133 ;;====================================================================
134 LDW *ARG2++,A7 ; bp[i]
135 || LDW *ARG1++,B7 ; ap[i]
139 SPKERNEL 0,0 ; fully overlap BNOP RA,5
140 || STW A0,*A3++ ; write result
141 || MV A1,RET ; keep carry flag in RET
142 ;;====================================================================
146 .global _bn_sub_words
153 [B0] ZERO A2 ; borrow flag
158 ;;====================================================================
159 LDW *ARG2++,A7 ; bp[i]
160 || LDW *ARG1++,B7 ; ap[i]
163 [A2] SUB A1:A0,1,A1:A0
164 SPKERNEL 0,1 ; leave slot for "return borrow flag"
165 || STW A0,*A3++ ; write result
166 || AND 1,A1,A2 ; pass on borrow flag
167 ;;====================================================================
169 AND 1,A1,RET ; return borrow flag
172 .global _bn_div_words
175 LMBD 1,A6,A0 ; leading zero bits in dv
176 LMBD 1,A4,A1 ; leading zero bits in hi
181 ||[ A2] MVK -1,A4 ; return overflow
182 ||[!A2] MV A4,A3 ; reassign hi
183 [!A2] MV B4,A4 ; reassign lo, will be quotient
185 [!A2] SHL A6,A0,A6 ; normalize dv
188 [!A2] CMPLTU A3,A6,A1 ; hi<dv?
189 ||[!A2] SHL A4,1,A5:A4 ; lo<<1
190 [!A1] SUB A3,A6,A3 ; hi-=dv
192 [!A2] SHRU A3,31,A1 ; upper bit
193 ||[!A2] ADDAH A5,A3,A3 ; hi<<1|lo>>31
196 [!A1] CMPLTU A3,A6,A1 ; hi<dv?
198 || SHL A4,1,A5:A4 ; lo<<1
199 [!A1] SUB A3,A6,A3 ; hi-=dv
200 ||[!A1] OR 1,A4,A4 ; quotient
201 SHRU A3,31,A1 ; upper bit
202 || ADDAH A5,A3,A3 ; hi<<1|lo>>31
208 ;;====================================================================
209 ;; Not really Comba algorithm, just straightforward NxM... Dedicated
210 ;; fully unrolled real Comba implementations are asymptotically 2x
211 ;; faster, but naturally larger undertaking. Purpose of this exercise
212 ;; was rather to learn to master nested SPLOOPs...
213 ;;====================================================================
214 .global _bn_sqr_comba8
215 .global _bn_mul_comba8
221 || MVK 8,A0 ; M, outer loop counter
222 || MV ARG1,A5 ; copy ap
223 || MV ARG0,B4 ; copy rp
224 || ZERO B19 ; high part of accumulator
226 || SUB B0,2,B1 ; N-2, initial ILC
227 || SUB B0,1,B2 ; const B2=N-1
228 || LDW *A5++,B6 ; ap[0]
229 || MV A0,A3 ; const A3=M
230 sploopNxM?: ; for best performance arrange M<=N
231 [A0] SPLOOPD 2 ; 2*n+10
235 || LDW *A5++,A9 ; pre-fetch ap[1]
238 ;;====================================================================
239 ;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
240 ;; This is because of Advisory 15 from TI publication SPRZ247I.
241 LDW *ARG2++,A7 ; bp[i]
243 [A1] LDW *B5++,B7 ; rp[i]
247 ADDU B19,B21:B20,B19:B18
250 || STW B18,*B4++ ; rp[i]
252 ;;====================================================================
253 outer?: ; m*2*(n+1)+10
254 SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0]
256 || CMPGT A0,1,A2 ; done pre-fetching ap[i+1]?
257 MVD A9,B6 ; move through .M unit(*)
258 [A2] LDW *A5++,A9 ; pre-fetch ap[i+1]
259 SUBAW B5,B2,B5 ; rewind rp to rp[1]
261 [A0] BNOP.S1 outer?,4
262 || [A0] SUB.L A0,1,A0
263 STW B19,*B4--[B2] ; rewind rp tp rp[1]
264 || ZERO.S B19 ; high part of accumulator
268 ;; (*) It should be noted that B6 is used as input to MPY32U in
269 ;; chronologically next cycle in *preceding* SPLOOP iteration.
270 ;; Normally such arrangement would require DINT, but at this
271 ;; point SPLOOP is draining and interrupts are disabled
274 .global _bn_sqr_comba4
275 .global _bn_mul_comba4
282 ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
283 ;; because of read-after-write penalties, it's rather
284 ;; n*2*(n+3)+10, or 66 cycles [plus various overheads]...
286 || MVK 4,A0 ; M, outer loop counter
287 || MV ARG1,A5 ; copy ap
288 || MV ARG0,B4 ; copy rp
289 || ZERO B19 ; high part of accumulator
291 || SUB B0,2,B1 ; first ILC
292 || SUB B0,1,B2 ; const B2=N-1
293 || LDW *A5++,B6 ; ap[0]
294 || MV A0,A3 ; const A3=M
296 ;; This alternative is an exercise in fully unrolled Comba
297 ;; algorithm implementation that operates at n*(n+1)+12, or
298 ;; as little as 32 cycles...
299 LDW *ARG1[0],B16 ; a[0]
300 || LDW *ARG2[0],A16 ; b[0]
301 LDW *ARG1[1],B17 ; a[1]
302 || LDW *ARG2[1],A17 ; b[1]
303 LDW *ARG1[2],B18 ; a[2]
304 || LDW *ARG2[2],A18 ; b[2]
305 LDW *ARG1[3],B19 ; a[3]
306 || LDW *ARG2[3],A19 ; b[3]
308 MPY32U A16,B16,A1:A0 ; a[0]*b[0]
309 MPY32U A17,B16,A23:A22 ; a[0]*b[1]
310 MPY32U A16,B17,A25:A24 ; a[1]*b[0]
311 MPY32U A16,B18,A27:A26 ; a[2]*b[0]
313 || MPY32U A17,B17,A29:A28 ; a[1]*b[1]
314 MPY32U A18,B16,A31:A30 ; a[0]*b[2]
317 || MPY32U A19,B16,A21:A20 ; a[3]*b[0]
318 || ADDU A24,A1:A0,A1:A0
321 || MPY32U A18,B17,A23:A22 ; a[2]*b[1]
324 || MPY32U A17,B18,A25:A24 ; a[1]*b[2]
325 || ADDU A28,A9:A8,A9:A8
327 || MPY32U A16,B19,A27:A26 ; a[0]*b[3]
328 || ADDU A30,A9:A8,A9:A8
330 || ADDU B0,A9:A8,A9:A8
334 || MPY32U A19,B17,A21:A20 ; a[3]*b[1]
335 || ADDU A22,A1:A0,A1:A0
337 || MPY32U A18,B18,A23:A22 ; a[2]*b[2]
338 || ADDU A24,A1:A0,A1:A0
340 || MPY32U A17,B19,A25:A24 ; a[1]*b[3]
341 || ADDU A26,A1:A0,A1:A0
343 || ADDU B8,A1:A0,A1:A0
345 || MPY32U A19,B18,A27:A26 ; a[3]*b[2]
348 || MPY32U A18,B19,A29:A28 ; a[2]*b[3]
349 || ADDU A22,A9:A8,A9:A8
351 || MPY32U A19,B19,A31:A30 ; a[3]*b[3]
352 || ADDU A24,A9:A8,A9:A8
354 || ADDU B0,A9:A8,A9:A8
358 || ADDU A28,A1:A0,A1:A0
361 || ADDU B8,A1:A0,A1:A0
365 ADDU B0,A9:A8,A9:A8 ; removed || to avoid cross-path stall below