3 # The bn_div64 is actually gcc output but the other parts are hand done.
4 # Thanks to tzeruch@ceddec.com for sending me the gcc output for
6 # I've gone back and re-done most of routines.
7 # The key thing to remeber for the 164 CPU is that while a
8 # multiply operation takes 8 cycles, another one can only be issued
9 # after 4 cycles have elapsed. I've done modification to help
10 # improve this. Also, normally, a ld instruction will not be available
18 .globl bn_mul_add_words
27 blt $18,$43 # if we are -1, -2, -3 or -4 goto tail code
32 mulq $20,$19,$5 # 1 2 1 ######
35 umulh $20,$19,$20 # 1 2 ######
38 mulq $21,$19,$6 # 2 2 1 ######
42 umulh $21,$19,$21 # 2 2 ######
43 cmpult $1,$5,$22 # 1 2 3 1
44 addq $20,$22,$20 # 1 3 1
45 addq $1,$0,$1 # 1 2 3 1
46 mulq $27,$19,$7 # 3 2 1 ######
47 cmpult $1,$0,$0 # 1 2 3 2
49 addq $20,$0,$0 # 1 3 2
50 cmpult $2,$6,$23 # 2 2 3 1
51 addq $21,$23,$21 # 2 3 1
52 umulh $27,$19,$27 # 3 2 ######
53 addq $2,$0,$2 # 2 2 3 1
54 cmpult $2,$0,$0 # 2 2 3 2
56 mulq $28,$19,$8 # 4 2 1 ######
57 addq $21,$0,$0 # 2 3 2
60 cmpult $3,$7,$24 # 3 2 3 1
61 stq $1,-32($16) # 1 2 4
62 umulh $28,$19,$28 # 4 2 ######
63 addq $27,$24,$27 # 3 3 1
64 addq $3,$0,$3 # 3 2 3 1
65 stq $2,-24($16) # 2 2 4
66 cmpult $3,$0,$0 # 3 2 3 2
67 stq $3,-16($16) # 3 2 4
69 addq $27,$0,$0 # 3 3 2
70 cmpult $4,$8,$25 # 4 2 3 1
72 addq $28,$25,$28 # 4 3 1
73 addq $4,$0,$4 # 4 2 3 1
74 cmpult $4,$0,$0 # 4 2 3 2
75 stq $4,-8($16) # 4 2 4
76 addq $28,$0,$0 # 4 3 2
88 mulq $20,$19,$5 # 4 2 1
92 umulh $20,$19,$20 # 4 2
94 cmpult $1,$5,$22 # 4 2 3 1
95 addq $20,$22,$20 # 4 3 1
96 addq $1,$0,$1 # 4 2 3 1
97 cmpult $1,$0,$0 # 4 2 3 2
98 addq $20,$0,$0 # 4 3 2
99 stq $1,-8($16) # 4 2 4
101 ret $31,($26),1 # else exit
106 bgt $18,$45 # goto tail code
107 ret $31,($26),1 # else exit
109 .end bn_mul_add_words
120 blt $18,$143 # if we are -1, -2, -3 or -4 goto tail code
125 mulq $20,$19,$5 # 1 2 1 #####
127 ldq $27,16($17) # 3 1
128 umulh $20,$19,$20 # 1 2 #####
129 ldq $28,24($17) # 4 1
130 mulq $21,$19,$6 # 2 2 1 #####
131 addq $5,$0,$5 # 1 2 3 1
133 cmpult $5,$0,$0 # 1 2 3 2
134 umulh $21,$19,$21 # 2 2 #####
135 addq $20,$0,$0 # 1 3 2
137 addq $6,$0,$6 # 2 2 3 1
138 mulq $27,$19,$7 # 3 2 1 #####
139 cmpult $6,$0,$0 # 2 2 3 2
140 addq $21,$0,$0 # 2 3 2
142 umulh $27,$19,$27 # 3 2 #####
143 stq $5,-32($16) # 1 2 4
144 mulq $28,$19,$8 # 4 2 1 #####
145 addq $7,$0,$7 # 3 2 3 1
146 stq $6,-24($16) # 2 2 4
147 cmpult $7,$0,$0 # 3 2 3 2
148 umulh $28,$19,$28 # 4 2 #####
149 addq $27,$0,$0 # 3 3 2
150 stq $7,-16($16) # 3 2 4
151 addq $8,$0,$8 # 4 2 3 1
152 cmpult $8,$0,$0 # 4 2 3 2
154 addq $28,$0,$0 # 4 3 2
156 stq $8,-8($16) # 4 2 4
167 mulq $20,$19,$5 # 4 2 1
169 umulh $20,$19,$20 # 4 2
170 addq $5,$0,$5 # 4 2 3 1
172 cmpult $5,$0,$0 # 4 2 3 2
174 addq $20,$0,$0 # 4 3 2
175 stq $5,-8($16) # 4 2 4
178 ret $31,($26),1 # else exit
183 bgt $18,$145 # goto tail code
184 ret $31,($26),1 # else exit
196 blt $18,$543 # if we are -1, -2, -3 or -4 goto tail code
200 mulq $20,$20,$5 ######
203 umulh $20,$20,$1 ######
204 ldq $27,16($17) # 1 1
205 mulq $21,$21,$6 ######
206 ldq $28,24($17) # 1 1
208 umulh $21,$21,$2 ######
210 mulq $27,$27,$7 ######
211 stq $6,16($16) # r[0]
212 umulh $27,$27,$3 ######
213 stq $2,24($16) # r[1]
214 mulq $28,$28,$8 ######
215 stq $7,32($16) # r[0]
216 umulh $28,$28,$4 ######
217 stq $3,40($16) # r[1]
221 stq $8,-16($16) # r[0]
222 stq $4,-8($16) # r[1]
229 ldq $20,0($17) # a[0]
230 mulq $20,$20,$5 # a[0]*w low part r2
234 umulh $20,$20,$1 # a[0]*w high part r3
235 stq $5,-16($16) # r[0]
236 stq $1,-8($16) # r[1]
239 ret $31,($26),1 # else exit
244 bgt $18,$442 # goto tail code
245 ret $31,($26),1 # else exit
257 bis $31,$31,$0 # carry = 0
263 addq $1,$5,$1 # r=a+b;
265 cmpult $1,$5,$22 # did we overflow?
267 addq $1,$0,$1 # c+= overflow
268 ldq $7,16($17) # a[2]
269 cmpult $1,$0,$0 # overflow?
270 ldq $3,16($18) # b[2]
272 ldq $8,24($17) # a[3]
273 addq $2,$6,$2 # r=a+b;
274 ldq $4,24($18) # b[3]
275 cmpult $2,$6,$23 # did we overflow?
276 addq $3,$7,$3 # r=a+b;
277 addq $2,$0,$2 # c+= overflow
278 cmpult $3,$7,$24 # did we overflow?
279 cmpult $2,$0,$0 # overflow?
280 addq $4,$8,$4 # r=a+b;
282 cmpult $4,$8,$25 # did we overflow?
283 addq $3,$0,$3 # c+= overflow
284 stq $1,0($16) # r[0]=c
285 cmpult $3,$0,$0 # overflow?
286 stq $2,8($16) # r[1]=c
288 stq $3,16($16) # r[2]=c
289 addq $4,$0,$4 # c+= overflow
290 subq $19,4,$19 # loop--
291 cmpult $4,$0,$0 # overflow?
292 addq $17,32,$17 # a++
294 stq $4,24($16) # r[3]=c
295 addq $18,32,$18 # b++
296 addq $16,32,$16 # r++
306 addq $1,$5,$1 # r=a+b;
307 subq $19,1,$19 # loop--
308 addq $1,$0,$1 # c+= overflow
310 cmpult $1,$5,$22 # did we overflow?
311 cmpult $1,$0,$0 # overflow?
313 stq $1,0($16) # r[0]=c
318 ret $31,($26),1 # else exit
322 bgt $19,$945 # goto tail code
323 ret $31,($26),1 # else exit
327 # What follows was taken directly from the C compiler with a few
328 # hacks to redo the lables.
358 jsr $26,BN_num_bits_word
366 # lda $16,_IO_stderr_