1 ;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
3 ;; Licensed under the Apache License 2.0 (the "License"). You may not use
4 ;; this file except in compliance with the License. You can obtain a copy
5 ;; in the file LICENSE in the source distribution or at
6 ;; https://www.openssl.org/source/license.html
8 ;;====================================================================
9 ;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 ;; Rights for redistribution and usage in source and binary forms are
13 ;; granted according to the License. Warranty of any kind is disclaimed.
14 ;;====================================================================
15 ;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
16 ;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
17 ;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
18 ;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
19 ;;====================================================================
;; Symbol aliasing: each .asg below makes the underscore-prefixed name
;; used by the .global directives in this file substitute to the plain,
;; unprefixed name.
;; NOTE(review): verify which conditional actually encloses these
;; aliases before relying on them.
22 .if .ASSEMBLER_VERSION<7000000
26 .asg bn_mul_add_words,_bn_mul_add_words
27 .asg bn_mul_words,_bn_mul_words
28 .asg bn_sqr_words,_bn_sqr_words
29 .asg bn_add_words,_bn_add_words
30 .asg bn_sub_words,_bn_sub_words
31 .asg bn_div_words,_bn_div_words
32 .asg bn_sqr_comba8,_bn_sqr_comba8
33 .asg bn_mul_comba8,_bn_mul_comba8
34 .asg bn_sqr_comba4,_bn_sqr_comba4
35 .asg bn_mul_comba4,_bn_mul_comba4
;; bn_mul_add_words: for every word, ap[i] and rp[i] are loaded, the
;; running high word A19 is added into the pair A21:A20 to give A19:A18,
;; and the low word A18 is written back to rp[i] through A2. The last
;; high word A19 is handed back in RET.
50 .global _bn_mul_add_words
57 [B0] ZERO A19 ; high part of accumulator, cleared when B0 is non-zero
63 ;;====================================================================
64 LDW *ARG1++,B7 ; ap[i]
66 LDW *ARG0++,A7 ; rp[i]
68 NOP 3 ; [2,0] in epilogue
70 ADDU A19,A21:A20,A19:A18 ; fold previous high word into A21:A20
72 SPKERNEL 2,1 ; leave slot for "return value"
73 || STW A18,*A2++ ; rp[i]
75 ;;====================================================================
77 MV A19,RET ; return value
;; Multiply-by-word loop: ap[i] is multiplied by ARG3 into A17:A16,
;; the low result word A18 is stored to rp[i] through ARG0, and the
;; final high word A19 is returned in RET.
;; NOTE(review): presumably this is the body of bn_mul_words -- confirm
;; against the function label.
87 [B0] ZERO A19 ; high part of accumulator
91 ;;====================================================================
92 LDW *ARG1++,A7 ; ap[i]
94 MPY32U A7,ARG3,A17:A16 ; unsigned 32x32 product of ap[i] and ARG3
95 NOP 4 ; [2,0] in epilogue
98 SPKERNEL 2,1 ; leave slot for "return value"
99 || STW A18,*ARG0++ ; rp[i]
101 ;;====================================================================
103 MV A19,RET ; return value
;; bn_sqr_words: every ap[i] produces two result words. The even word
;; rp[2*i] is stored through B2 and the odd word rp[2*i+1] through ARG0;
;; both pointers advance by 8 bytes per store. ARG0 is first moved
;; forward by 4 bytes (when B0 is non-zero) so that it addresses the odd
;; slots.
106 .global _bn_sqr_words
114 || [B0] ADD 4,ARG0,ARG0 ; point ARG0 at rp[1]
118 ;;====================================================================
119 LDW *ARG1++,B7 ; ap[i]
122 NOP 3 ; [2,0] in epilogue
123 STW B0,*B2++(8) ; rp[2*i]
125 SPKERNEL 2,0 ; fully overlap BNOP RA,5
126 || STW A1,*ARG0++(8) ; rp[2*i+1]
127 ;;====================================================================
;; bn_add_words: bp[i] and ap[i] are fetched in the same execute packet,
;; the result word A0 is written through A3, and the carry flag kept in
;; A1 is copied into RET on every kernel pass.
131 .global _bn_add_words
138 [B0] ZERO A1 ; carry flag, cleared when B0 is non-zero
143 ;;====================================================================
144 LDW *ARG2++,A7 ; bp[i]
145 || LDW *ARG1++,B7 ; ap[i]
149 SPKERNEL 0,0 ; fully overlap BNOP RA,5
150 || STW A0,*A3++ ; write result
151 || MV A1,RET ; keep carry flag in RET
152 ;;====================================================================
;; bn_sub_words: bp[i] and ap[i] are fetched in the same execute packet.
;; A pending borrow (A2 non-zero) is subtracted from the pair A1:A0, the
;; result word A0 is written through A3, and bit 0 of A1 becomes the
;; borrow for the next word. After the loop the same bit is returned in
;; RET.
156 .global _bn_sub_words
163 [B0] ZERO A2 ; borrow flag, cleared when B0 is non-zero
168 ;;====================================================================
169 LDW *ARG2++,A7 ; bp[i]
170 || LDW *ARG1++,B7 ; ap[i]
173 [A2] SUB A1:A0,1,A1:A0 ; apply incoming borrow
174 SPKERNEL 0,1 ; leave slot for "return borrow flag"
175 || STW A0,*A3++ ; write result
176 || AND 1,A1,A2 ; pass on borrow flag
177 ;;====================================================================
179 AND 1,A1,RET ; return borrow flag
;; bn_div_words: bit-serial shift-and-subtract division. Going by the
;; inline comments, hi arrives in A4, lo in B4 and dv in A6. When A2 is
;; set, -1 is placed in A4 to signal overflow; otherwise hi moves to A3,
;; lo moves to A4 (where the quotient is built) and dv is shifted left
;; by its leading-zero count. Each step compares hi with dv, subtracts
;; dv when hi is not smaller, records a quotient bit in A4 and shifts
;; hi:lo left by one bit.
;; NOTE(review): confirm how the overflow predicate A2 is derived from
;; the two LMBD results.
182 .global _bn_div_words
185 LMBD 1,A6,A0 ; leading zero bits in dv
186 LMBD 1,A4,A1 ; leading zero bits in hi
191 ||[ A2] MVK -1,A4 ; return overflow
192 ||[!A2] MV A4,A3 ; reassign hi
193 [!A2] MV B4,A4 ; reassign lo, will be quotient
195 [!A2] SHL A6,A0,A6 ; normalize dv
198 [!A2] CMPLTU A3,A6,A1 ; hi<dv?
199 ||[!A2] SHL A4,1,A5:A4 ; lo<<1, shifted-out bit lands in A5
200 [!A1] SUB A3,A6,A3 ; hi-=dv
202 [!A2] SHRU A3,31,A1 ; upper bit
203 ||[!A2] ADDAH A5,A3,A3 ; hi<<1|lo>>31
206 [!A1] CMPLTU A3,A6,A1 ; hi<dv?
208 || SHL A4,1,A5:A4 ; lo<<1, shifted-out bit lands in A5
209 [!A1] SUB A3,A6,A3 ; hi-=dv
210 ||[!A1] OR 1,A4,A4 ; quotient
211 SHRU A3,31,A1 ; upper bit
212 || ADDAH A5,A3,A3 ; hi<<1|lo>>31
218 ;;====================================================================
219 ;; Not really Comba algorithm, just straightforward NxM... Dedicated
220 ;; fully unrolled real Comba implementations are asymptotically 2x
221 ;; faster, but naturally a larger undertaking. The purpose of this
222 ;; exercise was rather to learn to master nested SPLOOPs...
223 ;;====================================================================
;; Exported entry points _bn_sqr_comba8 and _bn_mul_comba8:
;; straightforward NxM multiply built from an SPLOOPD inner kernel and
;; the outer? loop, which repeats while A0 is non-zero.
;; Register roles set up below: A0 = outer loop counter (M = 8),
;; A3 = constant M, A5 = ap cursor, B4 = rp store cursor, B19 = high
;; part of accumulator, B1 = N-2 (initial ILC), B2 = N-1, B6 = ap[0].
;; NOTE(review): B0 is assumed to hold N when the setup packet runs --
;; confirm against the instruction that loads it.
224 .global _bn_sqr_comba8
225 .global _bn_mul_comba8
231 || MVK 8,A0 ; M, outer loop counter
232 || MV ARG1,A5 ; copy ap
233 || MV ARG0,B4 ; copy rp
234 || ZERO B19 ; high part of accumulator
236 || SUB B0,2,B1 ; N-2, initial ILC
237 || SUB B0,1,B2 ; const B2=N-1
238 || LDW *A5++,B6 ; ap[0]
239 || MV A0,A3 ; const A3=M
240 sploopNxM?: ; for best performance arrange M<=N
241 [A0] SPLOOPD 2 ; 2*n+10
245 || LDW *A5++,A9 ; pre-fetch ap[1]
248 ;;====================================================================
249 ;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
250 ;; This is because of Advisory 15 from TI publication SPRZ247I.
251 LDW *ARG2++,A7 ; bp[i]
253 [A1] LDW *B5++,B7 ; rp[i]
257 ADDU B19,B21:B20,B19:B18 ; fold previous high word into B21:B20
260 || STW B18,*B4++ ; rp[i]
262 ;;====================================================================
263 outer?: ; m*2*(n+1)+10
264 SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0]
266 || CMPGT A0,1,A2 ; done pre-fetching ap[i+1]?
267 MVD A9,B6 ; move through .M unit(*)
268 [A2] LDW *A5++,A9 ; pre-fetch ap[i+1]
269 SUBAW B5,B2,B5 ; rewind rp to rp[1]
271 [A0] BNOP.S1 outer?,4 ; repeat while outer counter is non-zero
272 || [A0] SUB.L A0,1,A0 ; count down outer loop
273 STW B19,*B4--[B2] ; rewind rp to rp[1]
274 || ZERO.S B19 ; high part of accumulator
278 ;; (*) It should be noted that B6 is used as input to MPY32U in
279 ;; chronologically next cycle in *preceding* SPLOOP iteration.
280 ;; Normally such arrangement would require DINT, but at this
281 ;; point SPLOOP is draining and interrupts are disabled
284 .global _bn_sqr_comba4
285 .global _bn_mul_comba4
292 ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
293 ;; because of the low-counter effect, where the prologue phase
294 ;; finishes before SPKERNEL instruction is reached. As a result
295 ;; it's 25% slower than expected...
297 || MVK 4,A0 ; M, outer loop counter
298 || MV ARG1,A5 ; copy ap
299 || MV ARG0,B4 ; copy rp
300 || ZERO B19 ; high part of accumulator
302 || SUB B0,2,B1 ; first ILC
303 || SUB B0,1,B2 ; const B2=N-1
304 || LDW *A5++,B6 ; ap[0]
305 || MV A0,A3 ; const A3=M
307 ;; This alternative is an exercise in fully unrolled Comba
308 ;; algorithm implementation that operates at n*(n+1)+12, or
309 ;; as little as 32 cycles...
310 LDW *ARG1[0],B16 ; a[0]
311 || LDW *ARG2[0],A16 ; b[0]
312 LDW *ARG1[1],B17 ; a[1]
313 || LDW *ARG2[1],A17 ; b[1]
314 LDW *ARG1[2],B18 ; a[2]
315 || LDW *ARG2[2],A18 ; b[2]
316 LDW *ARG1[3],B19 ; a[3]
317 || LDW *ARG2[3],A19 ; b[3]
319 MPY32U A16,B16,A1:A0 ; a[0]*b[0]
320 MPY32U A17,B16,A23:A22 ; a[0]*b[1]
321 MPY32U A16,B17,A25:A24 ; a[1]*b[0]
322 MPY32U A16,B18,A27:A26 ; a[2]*b[0]
324 || MPY32U A17,B17,A29:A28 ; a[1]*b[1]
325 MPY32U A18,B16,A31:A30 ; a[0]*b[2]
328 || MPY32U A19,B16,A21:A20 ; a[3]*b[0]
329 || ADDU A24,A1:A0,A1:A0
332 || MPY32U A18,B17,A23:A22 ; a[2]*b[1]
335 || MPY32U A17,B18,A25:A24 ; a[1]*b[2]
336 || ADDU A28,A9:A8,A9:A8
338 || MPY32U A16,B19,A27:A26 ; a[0]*b[3]
339 || ADDU A30,A9:A8,A9:A8
341 || ADDU B0,A9:A8,A9:A8
345 || MPY32U A19,B17,A21:A20 ; a[3]*b[1]
346 || ADDU A22,A1:A0,A1:A0
348 || MPY32U A18,B18,A23:A22 ; a[2]*b[2]
349 || ADDU A24,A1:A0,A1:A0
351 || MPY32U A17,B19,A25:A24 ; a[1]*b[3]
352 || ADDU A26,A1:A0,A1:A0
354 || ADDU B8,A1:A0,A1:A0
356 || MPY32U A19,B18,A27:A26 ; a[3]*b[2]
359 || MPY32U A18,B19,A29:A28 ; a[2]*b[3]
360 || ADDU A22,A9:A8,A9:A8
362 || MPY32U A19,B19,A31:A30 ; a[3]*b[3]
363 || ADDU A24,A9:A8,A9:A8
365 || ADDU B0,A9:A8,A9:A8
369 || ADDU A28,A1:A0,A1:A0
372 || ADDU B8,A1:A0,A1:A0
376 ADDU B0,A9:A8,A9:A8 ; removed || to avoid cross-path stall below