2 .asciiz "mips3.s, Version 1.0"
3 .asciiz "MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
6 * ====================================================================
7 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
10 * Rights for redistribution and usage in source and binary forms are
11 * granted according to the OpenSSL license. Warranty of any kind is
13 * ====================================================================
17 * This is my modest contribution to the OpenSSL project (see
18 * http://www.openssl.org/ for more information about it) and is
19 * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c
20 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
22 * The module is designed to work with either of the "new" MIPS ABI(5),
23 * namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
24 * IRIX 5.x not only because it doesn't support new ABIs but also
25 * because 5.x kernels put R4x00 CPU into 32-bit mode and all those
26 * 64-bit instructions (daddu, dmultu, etc.) found below would only
27 * cause an illegal instruction exception:-(
29 * In addition the code depends on preprocessor flags set up by MIPSpro
30 * compiler driver (either as or cc) and therefore (probably?) can't be
31 * compiled by the GNU assembler. GNU C driver manages fine though...
32 * I mean as long as -mmips-as is specified or is the default option,
33 * because then it simply invokes /usr/bin/as which in turn takes
34 * perfect care of the preprocessor definitions. Another neat feature
35 * offered by the MIPSpro assembler is an optimization pass. This gave
36 * me the opportunity to have the code looking more regular as all those
37 * architecture dependent instruction rescheduling details were left to
38 * the assembler. Cool, huh?
40 * Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
41 * goes way over 3 times faster!
43 * <appro@fy.chalmers.se>
49 #define MOVNZ(cond,dst,src) \
52 #define MOVNZ(cond,dst,src) \
67 LEAF(bn_mul_add_words)
69 bgtzl a2,.L_bn_mul_add_words_proceed
75 .L_bn_mul_add_words_proceed:
79 beqz ta0,.L_bn_mul_add_words_tail
81 .L_bn_mul_add_words_loop:
89 sltu v0,t1,v0 /* All manuals say it "compares 32-bit
90 * values", but it seems to work fine
91 * even on 64-bit registers. */
140 bgtzl ta0,.L_bn_mul_add_words_loop
143 bnezl a2,.L_bn_mul_add_words_tail
147 .L_bn_mul_add_words_return:
150 .L_bn_mul_add_words_tail:
163 beqz a2,.L_bn_mul_add_words_return
178 beqz a2,.L_bn_mul_add_words_return
193 END(bn_mul_add_words)
198 bgtzl a2,.L_bn_mul_words_proceed
204 .L_bn_mul_words_proceed:
208 beqz ta0,.L_bn_mul_words_tail
210 .L_bn_mul_words_loop:
251 bgtzl ta0,.L_bn_mul_words_loop
254 bnezl a2,.L_bn_mul_words_tail
258 .L_bn_mul_words_return:
261 .L_bn_mul_words_tail:
270 beqz a2,.L_bn_mul_words_return
281 beqz a2,.L_bn_mul_words_return
297 bgtzl a2,.L_bn_sqr_words_proceed
303 .L_bn_sqr_words_proceed:
307 beqz ta0,.L_bn_sqr_words_tail
309 .L_bn_sqr_words_loop:
343 bgtzl ta0,.L_bn_sqr_words_loop
346 bnezl a2,.L_bn_sqr_words_tail
350 .L_bn_sqr_words_return:
354 .L_bn_sqr_words_tail:
361 beqz a2,.L_bn_sqr_words_return
370 beqz a2,.L_bn_sqr_words_return
384 bgtzl a3,.L_bn_add_words_proceed
390 .L_bn_add_words_proceed:
394 beqz AT,.L_bn_add_words_tail
396 .L_bn_add_words_loop:
438 bgtzl AT,.L_bn_add_words_loop
441 bnezl a3,.L_bn_add_words_tail
445 .L_bn_add_words_return:
448 .L_bn_add_words_tail:
457 beqz a3,.L_bn_add_words_return
468 beqz a3,.L_bn_add_words_return
484 bgtzl a3,.L_bn_sub_words_proceed
490 .L_bn_sub_words_proceed:
494 beqz AT,.L_bn_sub_words_tail
496 .L_bn_sub_words_loop:
535 bgtzl AT,.L_bn_sub_words_loop
538 bnezl a3,.L_bn_sub_words_tail
542 .L_bn_sub_words_return:
545 .L_bn_sub_words_tail:
553 beqz a3,.L_bn_sub_words_return
563 beqz a3,.L_bn_sub_words_return
580 bnezl a2,.L_bn_div_words_proceed
583 li v0,-1 /* I'd rather signal div-by-zero
584 * which can be done with 'break 7' */
586 .L_bn_div_words_proceed:
587 bltz a2,.L_bn_div_words_body
601 break 6 /* signal overflow */
610 .L_bn_div_words_body:
620 dsrl QT,32 /* q=0xffffffff */
621 beq DH,HH,.L_bn_div_words_skip_div1
624 .L_bn_div_words_skip_div1:
631 .L_bn_div_words_inner_loop1:
638 beqz AT,.L_bn_div_words_inner_loop1_done
644 b .L_bn_div_words_inner_loop1
645 .L_bn_div_words_inner_loop1_done:
653 dsrl QT,32 /* q=0xffffffff */
654 beq DH,HH,.L_bn_div_words_skip_div2
657 .L_bn_div_words_skip_div2:
664 .L_bn_div_words_inner_loop2:
671 beqz AT,.L_bn_div_words_inner_loop2_done
677 b .L_bn_div_words_inner_loop2
678 .L_bn_div_words_inner_loop2_done:
682 dsrl v1,a0,t9 /* v1 contains remainder if anybody wants it */
683 dsrl a2,t9 /* restore a2 */
693 move a3,a0 /* we know that bn_div_words doesn't
694 * touch a3, ta2, ta3 and preserves a2
695 * so that we can save two arguments
696 * and return address in registers
697 * instead of stack:-)
706 beq a0,a2,.L_bn_div_3_words_skip_div
709 .L_bn_div_3_words_skip_div:
714 .L_bn_div_3_words_inner_loop:
720 bnez AT,.L_bn_div_3_words_inner_loop_done
727 beqz AT,.L_bn_div_3_words_inner_loop
728 .L_bn_div_3_words_inner_loop_done:
744 #define a_7 a1 /* once we load a[7] we don't need a anymore */
748 #define b_7 a2 /* once we load b[7] we don't need b anymore */
757 #define FRAME_SIZE 48
762 PTR_SUB sp,FRAME_SIZE
765 ld a_0,0(a1) /* If compiled with -mips3 option on
766 * R5000 box assembler barks on this
767 * line with "shouldn't have mult/div
768 * as last instruction in bb (R10K
769 * bug)" warning. If anybody out there
770 * has a clue about how to circumvent
771 * this do send me a note.
772 * <appro@fy.chalmers.se>
781 dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
791 dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */
803 dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */
806 sd c_1,0(a0) /* r[0]=c1; */
814 sd c_2,8(a0) /* r[1]=c2; */
816 dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */
823 dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
831 dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */
840 sd c_3,16(a0) /* r[2]=c3; */
842 dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */
849 dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */
857 dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */
866 dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */
875 sd c_1,24(a0) /* r[3]=c1; */
877 dmultu a_4,b_0 /* mul_add_c(a[4],b[0],c2,c3,c1); */
884 dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */
892 dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
901 dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */
910 dmultu a_0,b_4 /* mul_add_c(a[0],b[4],c2,c3,c1); */
919 sd c_2,32(a0) /* r[4]=c2; */
921 dmultu a_0,b_5 /* mul_add_c(a[0],b[5],c3,c1,c2); */
928 dmultu a_1,b_4 /* mul_add_c(a[1],b[4],c3,c1,c2); */
936 dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */
945 dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */
954 dmultu a_4,b_1 /* mul_add_c(a[4],b[1],c3,c1,c2); */
963 dmultu a_5,b_0 /* mul_add_c(a[5],b[0],c3,c1,c2); */
972 sd c_3,40(a0) /* r[5]=c3; */
974 dmultu a_6,b_0 /* mul_add_c(a[6],b[0],c1,c2,c3); */
981 dmultu a_5,b_1 /* mul_add_c(a[5],b[1],c1,c2,c3); */
989 dmultu a_4,b_2 /* mul_add_c(a[4],b[2],c1,c2,c3); */
998 dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
1007 dmultu a_2,b_4 /* mul_add_c(a[2],b[4],c1,c2,c3); */
1016 dmultu a_1,b_5 /* mul_add_c(a[1],b[5],c1,c2,c3); */
1025 dmultu a_0,b_6 /* mul_add_c(a[0],b[6],c1,c2,c3); */
1034 sd c_1,48(a0) /* r[6]=c1; */
1036 dmultu a_0,b_7 /* mul_add_c(a[0],b[7],c2,c3,c1); */
1043 dmultu a_1,b_6 /* mul_add_c(a[1],b[6],c2,c3,c1); */
1051 dmultu a_2,b_5 /* mul_add_c(a[2],b[5],c2,c3,c1); */
1060 dmultu a_3,b_4 /* mul_add_c(a[3],b[4],c2,c3,c1); */
1069 dmultu a_4,b_3 /* mul_add_c(a[4],b[3],c2,c3,c1); */
1078 dmultu a_5,b_2 /* mul_add_c(a[5],b[2],c2,c3,c1); */
1087 dmultu a_6,b_1 /* mul_add_c(a[6],b[1],c2,c3,c1); */
1096 dmultu a_7,b_0 /* mul_add_c(a[7],b[0],c2,c3,c1); */
1105 sd c_2,56(a0) /* r[7]=c2; */
1107 dmultu a_7,b_1 /* mul_add_c(a[7],b[1],c3,c1,c2); */
1114 dmultu a_6,b_2 /* mul_add_c(a[6],b[2],c3,c1,c2); */
1122 dmultu a_5,b_3 /* mul_add_c(a[5],b[3],c3,c1,c2); */
1131 dmultu a_4,b_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */
1140 dmultu a_3,b_5 /* mul_add_c(a[3],b[5],c3,c1,c2); */
1149 dmultu a_2,b_6 /* mul_add_c(a[2],b[6],c3,c1,c2); */
1158 dmultu a_1,b_7 /* mul_add_c(a[1],b[7],c3,c1,c2); */
1167 sd c_3,64(a0) /* r[8]=c3; */
1169 dmultu a_2,b_7 /* mul_add_c(a[2],b[7],c1,c2,c3); */
1176 dmultu a_3,b_6 /* mul_add_c(a[3],b[6],c1,c2,c3); */
1184 dmultu a_4,b_5 /* mul_add_c(a[4],b[5],c1,c2,c3); */
1193 dmultu a_5,b_4 /* mul_add_c(a[5],b[4],c1,c2,c3); */
1202 dmultu a_6,b_3 /* mul_add_c(a[6],b[3],c1,c2,c3); */
1211 dmultu a_7,b_2 /* mul_add_c(a[7],b[2],c1,c2,c3); */
1220 sd c_1,72(a0) /* r[9]=c1; */
1222 dmultu a_7,b_3 /* mul_add_c(a[7],b[3],c2,c3,c1); */
1229 dmultu a_6,b_4 /* mul_add_c(a[6],b[4],c2,c3,c1); */
1237 dmultu a_5,b_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */
1246 dmultu a_4,b_6 /* mul_add_c(a[4],b[6],c2,c3,c1); */
1255 dmultu a_3,b_7 /* mul_add_c(a[3],b[7],c2,c3,c1); */
1264 sd c_2,80(a0) /* r[10]=c2; */
1266 dmultu a_4,b_7 /* mul_add_c(a[4],b[7],c3,c1,c2); */
1273 dmultu a_5,b_6 /* mul_add_c(a[5],b[6],c3,c1,c2); */
1281 dmultu a_6,b_5 /* mul_add_c(a[6],b[5],c3,c1,c2); */
1290 dmultu a_7,b_4 /* mul_add_c(a[7],b[4],c3,c1,c2); */
1299 sd c_3,88(a0) /* r[11]=c3; */
1301 dmultu a_7,b_5 /* mul_add_c(a[7],b[5],c1,c2,c3); */
1308 dmultu a_6,b_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */
1316 dmultu a_5,b_7 /* mul_add_c(a[5],b[7],c1,c2,c3); */
1325 sd c_1,96(a0) /* r[12]=c1; */
1327 dmultu a_6,b_7 /* mul_add_c(a[6],b[7],c2,c3,c1); */
1334 dmultu a_7,b_6 /* mul_add_c(a[7],b[6],c2,c3,c1); */
1342 sd c_2,104(a0) /* r[13]=c2; */
1344 dmultu a_7,b_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */
1357 sd c_3,112(a0) /* r[14]=c3; */
1358 sd c_1,120(a0) /* r[15]=c1; */
1360 PTR_ADD sp,FRAME_SIZE
1372 dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
1381 dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */
1387 dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */
1397 dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */
1404 dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
1412 dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */
1423 dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */
1430 dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */
1438 dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */
1447 dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */
1458 dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */
1465 dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
1473 dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */
1484 dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */
1491 dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */
1501 dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
1531 dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
1540 dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */
1553 dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */
1565 dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
1576 dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */
1588 dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */
1605 dmultu a_4,a_0 /* mul_add_c2(a[4],b[0],c2,c3,c1); */
1617 dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */
1632 dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
1643 dmultu a_0,a_5 /* mul_add_c2(a[0],b[5],c3,c1,c2); */
1655 dmultu a_1,a_4 /* mul_add_c2(a[1],b[4],c3,c1,c2); */
1670 dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */
1687 dmultu a_6,a_0 /* mul_add_c2(a[6],b[0],c1,c2,c3); */
1699 dmultu a_5,a_1 /* mul_add_c2(a[5],b[1],c1,c2,c3); */
1714 dmultu a_4,a_2 /* mul_add_c2(a[4],b[2],c1,c2,c3); */
1729 dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
1740 dmultu a_0,a_7 /* mul_add_c2(a[0],b[7],c2,c3,c1); */
1752 dmultu a_1,a_6 /* mul_add_c2(a[1],b[6],c2,c3,c1); */
1767 dmultu a_2,a_5 /* mul_add_c2(a[2],b[5],c2,c3,c1); */
1782 dmultu a_3,a_4 /* mul_add_c2(a[3],b[4],c2,c3,c1); */
1799 dmultu a_7,a_1 /* mul_add_c2(a[7],b[1],c3,c1,c2); */
1811 dmultu a_6,a_2 /* mul_add_c2(a[6],b[2],c3,c1,c2); */
1826 dmultu a_5,a_3 /* mul_add_c2(a[5],b[3],c3,c1,c2); */
1841 dmultu a_4,a_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */
1852 dmultu a_2,a_7 /* mul_add_c2(a[2],b[7],c1,c2,c3); */
1864 dmultu a_3,a_6 /* mul_add_c2(a[3],b[6],c1,c2,c3); */
1879 dmultu a_4,a_5 /* mul_add_c2(a[4],b[5],c1,c2,c3); */
1896 dmultu a_7,a_3 /* mul_add_c2(a[7],b[3],c2,c3,c1); */
1908 dmultu a_6,a_4 /* mul_add_c2(a[6],b[4],c2,c3,c1); */
1923 dmultu a_5,a_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */
1934 dmultu a_4,a_7 /* mul_add_c2(a[4],b[7],c3,c1,c2); */
1946 dmultu a_5,a_6 /* mul_add_c2(a[5],b[6],c3,c1,c2); */
1963 dmultu a_7,a_5 /* mul_add_c2(a[7],b[5],c1,c2,c3); */
1975 dmultu a_6,a_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */
1986 dmultu a_6,a_7 /* mul_add_c2(a[6],b[7],c2,c3,c1); */
2000 dmultu a_7,a_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */
2020 dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
2025 dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */
2038 dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */
2050 dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
2061 dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */
2073 dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */
2090 dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */
2102 dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
2113 dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */
2127 dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */