2 .asciiz "mips3.s, Version 1.1"
3 .asciiz "MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
6 * ====================================================================
7 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
10 * Rights for redistribution and usage in source and binary forms are
11 * granted according to the OpenSSL license. Warranty of any kind is
13 * ====================================================================
17 * This is my modest contribution to the OpenSSL project (see
18 * http://www.openssl.org/ for more information about it) and is
19 * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c
20 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
22 * The module is designed to work with either of the "new" MIPS ABI(5),
23 * namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
24 * IRIX 5.x not only because it doesn't support new ABIs but also
25 * because 5.x kernels put R4x00 CPU into 32-bit mode and all those
26 * 64-bit instructions (daddu, dmultu, etc.) found below would only
27 * cause an illegal instruction exception:-(
29 * In addition the code depends on preprocessor flags set up by MIPSpro
30 * compiler driver (either as or cc) and therefore (probably?) can't be
31 * compiled by the GNU assembler. GNU C driver manages fine though...
32 * I mean as long as -mmips-as is specified or is the default option,
33 * because then it simply invokes /usr/bin/as which in turn takes
34 * perfect care of the preprocessor definitions. Another neat feature
35 * offered by the MIPSpro assembler is an optimization pass. This gave
36 * me the opportunity to have the code looking more regular as all those
37 * architecture dependent instruction rescheduling details were left to
38 * the assembler. Cool, huh?
40 * Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
41 * goes way over 3 times faster!
43 * <appro@fy.chalmers.se>
49 #define MOVNZ(cond,dst,src) \
52 #define MOVNZ(cond,dst,src) \
67 LEAF(bn_mul_add_words)
69 bgtzl a2,.L_bn_mul_add_words_proceed
75 .L_bn_mul_add_words_proceed:
79 beqz ta0,.L_bn_mul_add_words_tail
81 .L_bn_mul_add_words_loop:
89 sltu v0,t1,v0 /* All manuals say it "compares 32-bit
90 * values", but it seems to work fine
91 * even on 64-bit registers. */
140 bgtzl ta0,.L_bn_mul_add_words_loop
143 bnezl a2,.L_bn_mul_add_words_tail
147 .L_bn_mul_add_words_return:
150 .L_bn_mul_add_words_tail:
163 beqz a2,.L_bn_mul_add_words_return
178 beqz a2,.L_bn_mul_add_words_return
193 END(bn_mul_add_words)
198 bgtzl a2,.L_bn_mul_words_proceed
204 .L_bn_mul_words_proceed:
208 beqz ta0,.L_bn_mul_words_tail
210 .L_bn_mul_words_loop:
251 bgtzl ta0,.L_bn_mul_words_loop
254 bnezl a2,.L_bn_mul_words_tail
258 .L_bn_mul_words_return:
261 .L_bn_mul_words_tail:
270 beqz a2,.L_bn_mul_words_return
281 beqz a2,.L_bn_mul_words_return
297 bgtzl a2,.L_bn_sqr_words_proceed
303 .L_bn_sqr_words_proceed:
307 beqz ta0,.L_bn_sqr_words_tail
309 .L_bn_sqr_words_loop:
343 bgtzl ta0,.L_bn_sqr_words_loop
346 bnezl a2,.L_bn_sqr_words_tail
350 .L_bn_sqr_words_return:
354 .L_bn_sqr_words_tail:
361 beqz a2,.L_bn_sqr_words_return
370 beqz a2,.L_bn_sqr_words_return
384 bgtzl a3,.L_bn_add_words_proceed
390 .L_bn_add_words_proceed:
394 beqz AT,.L_bn_add_words_tail
396 .L_bn_add_words_loop:
438 bgtzl AT,.L_bn_add_words_loop
441 bnezl a3,.L_bn_add_words_tail
445 .L_bn_add_words_return:
448 .L_bn_add_words_tail:
457 beqz a3,.L_bn_add_words_return
468 beqz a3,.L_bn_add_words_return
484 bgtzl a3,.L_bn_sub_words_proceed
490 .L_bn_sub_words_proceed:
494 beqz AT,.L_bn_sub_words_tail
496 .L_bn_sub_words_loop:
535 bgtzl AT,.L_bn_sub_words_loop
538 bnezl a3,.L_bn_sub_words_tail
542 .L_bn_sub_words_return:
545 .L_bn_sub_words_tail:
553 beqz a3,.L_bn_sub_words_return
563 beqz a3,.L_bn_sub_words_return
580 move a3,a0 /* we know that bn_div_words doesn't
581 * touch a3, ta2, ta3 and preserves a2
582 * so that we can save two arguments
583 * and return address in registers
584 * instead of stack:-)
589 bne a0,a2,.L_bn_div_3_words_proceed
592 .L_bn_div_3_words_proceed:
602 .L_bn_div_3_words_inner_loop:
603 bnez t8,.L_bn_div_3_words_inner_loop_done
615 beqzl AT,.L_bn_div_3_words_inner_loop
618 .L_bn_div_3_words_inner_loop_done:
625 bnezl a2,.L_bn_div_words_proceed
628 li v0,-1 /* I'd rather signal div-by-zero
629 * which can be done with 'break 7' */
631 .L_bn_div_words_proceed:
632 bltz a2,.L_bn_div_words_body
646 break 6 /* signal overflow */
655 .L_bn_div_words_body:
665 dsrl QT,32 /* q=0xffffffff */
666 beq DH,HH,.L_bn_div_words_skip_div1
669 .L_bn_div_words_skip_div1:
676 .L_bn_div_words_inner_loop1:
684 beqz AT,.L_bn_div_words_inner_loop1_done
687 b .L_bn_div_words_inner_loop1
690 .L_bn_div_words_inner_loop1_done:
698 dsrl QT,32 /* q=0xffffffff */
699 beq DH,HH,.L_bn_div_words_skip_div2
702 .L_bn_div_words_skip_div2:
710 .L_bn_div_words_inner_loop2:
718 beqz AT,.L_bn_div_words_inner_loop2_done
721 b .L_bn_div_words_inner_loop2
724 .L_bn_div_words_inner_loop2_done:
729 dsrl v1,a0,t9 /* v1 contains remainder if anybody wants it */
730 dsrl a2,t9 /* restore a2 */
747 #define a_7 a1 /* once we load a[7] we don't need a anymore */
751 #define b_7 a2 /* once we load b[7] we don't need b anymore */
760 #define FRAME_SIZE 48
765 PTR_SUB sp,FRAME_SIZE
768 ld a_0,0(a1) /* If compiled with -mips3 option on
769 * R5000 box assembler barks on this
770 * line with "shouldn't have mult/div
771 * as last instruction in bb (R10K
772 * bug)" warning. If anybody out there
773 * has a clue about how to circumvent
774 * this do send me a note.
775 * <appro@fy.chalmers.se>
784 dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
794 dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */
806 dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */
809 sd c_1,0(a0) /* r[0]=c1; */
817 sd c_2,8(a0) /* r[1]=c2; */
819 dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */
826 dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
834 dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */
843 sd c_3,16(a0) /* r[2]=c3; */
845 dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */
853 dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */
862 dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */
871 dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */
880 sd c_1,24(a0) /* r[3]=c1; */
882 dmultu a_4,b_0 /* mul_add_c(a[4],b[0],c2,c3,c1); */
890 dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */
899 dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
908 dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */
917 dmultu a_0,b_4 /* mul_add_c(a[0],b[4],c2,c3,c1); */
926 sd c_2,32(a0) /* r[4]=c2; */
928 dmultu a_0,b_5 /* mul_add_c(a[0],b[5],c3,c1,c2); */
936 dmultu a_1,b_4 /* mul_add_c(a[1],b[4],c3,c1,c2); */
945 dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */
954 dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */
963 dmultu a_4,b_1 /* mul_add_c(a[4],b[1],c3,c1,c2); */
972 dmultu a_5,b_0 /* mul_add_c(a[5],b[0],c3,c1,c2); */
981 sd c_3,40(a0) /* r[5]=c3; */
983 dmultu a_6,b_0 /* mul_add_c(a[6],b[0],c1,c2,c3); */
991 dmultu a_5,b_1 /* mul_add_c(a[5],b[1],c1,c2,c3); */
1000 dmultu a_4,b_2 /* mul_add_c(a[4],b[2],c1,c2,c3); */
1009 dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
1018 dmultu a_2,b_4 /* mul_add_c(a[2],b[4],c1,c2,c3); */
1027 dmultu a_1,b_5 /* mul_add_c(a[1],b[5],c1,c2,c3); */
1036 dmultu a_0,b_6 /* mul_add_c(a[0],b[6],c1,c2,c3); */
1045 sd c_1,48(a0) /* r[6]=c1; */
1047 dmultu a_0,b_7 /* mul_add_c(a[0],b[7],c2,c3,c1); */
1055 dmultu a_1,b_6 /* mul_add_c(a[1],b[6],c2,c3,c1); */
1064 dmultu a_2,b_5 /* mul_add_c(a[2],b[5],c2,c3,c1); */
1073 dmultu a_3,b_4 /* mul_add_c(a[3],b[4],c2,c3,c1); */
1082 dmultu a_4,b_3 /* mul_add_c(a[4],b[3],c2,c3,c1); */
1091 dmultu a_5,b_2 /* mul_add_c(a[5],b[2],c2,c3,c1); */
1100 dmultu a_6,b_1 /* mul_add_c(a[6],b[1],c2,c3,c1); */
1109 dmultu a_7,b_0 /* mul_add_c(a[7],b[0],c2,c3,c1); */
1118 sd c_2,56(a0) /* r[7]=c2; */
1120 dmultu a_7,b_1 /* mul_add_c(a[7],b[1],c3,c1,c2); */
1128 dmultu a_6,b_2 /* mul_add_c(a[6],b[2],c3,c1,c2); */
1137 dmultu a_5,b_3 /* mul_add_c(a[5],b[3],c3,c1,c2); */
1146 dmultu a_4,b_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */
1155 dmultu a_3,b_5 /* mul_add_c(a[3],b[5],c3,c1,c2); */
1164 dmultu a_2,b_6 /* mul_add_c(a[2],b[6],c3,c1,c2); */
1173 dmultu a_1,b_7 /* mul_add_c(a[1],b[7],c3,c1,c2); */
1182 sd c_3,64(a0) /* r[8]=c3; */
1184 dmultu a_2,b_7 /* mul_add_c(a[2],b[7],c1,c2,c3); */
1192 dmultu a_3,b_6 /* mul_add_c(a[3],b[6],c1,c2,c3); */
1201 dmultu a_4,b_5 /* mul_add_c(a[4],b[5],c1,c2,c3); */
1210 dmultu a_5,b_4 /* mul_add_c(a[5],b[4],c1,c2,c3); */
1219 dmultu a_6,b_3 /* mul_add_c(a[6],b[3],c1,c2,c3); */
1228 dmultu a_7,b_2 /* mul_add_c(a[7],b[2],c1,c2,c3); */
1237 sd c_1,72(a0) /* r[9]=c1; */
1239 dmultu a_7,b_3 /* mul_add_c(a[7],b[3],c2,c3,c1); */
1247 dmultu a_6,b_4 /* mul_add_c(a[6],b[4],c2,c3,c1); */
1256 dmultu a_5,b_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */
1265 dmultu a_4,b_6 /* mul_add_c(a[4],b[6],c2,c3,c1); */
1274 dmultu a_3,b_7 /* mul_add_c(a[3],b[7],c2,c3,c1); */
1283 sd c_2,80(a0) /* r[10]=c2; */
1285 dmultu a_4,b_7 /* mul_add_c(a[4],b[7],c3,c1,c2); */
1293 dmultu a_5,b_6 /* mul_add_c(a[5],b[6],c3,c1,c2); */
1302 dmultu a_6,b_5 /* mul_add_c(a[6],b[5],c3,c1,c2); */
1311 dmultu a_7,b_4 /* mul_add_c(a[7],b[4],c3,c1,c2); */
1320 sd c_3,88(a0) /* r[11]=c3; */
1322 dmultu a_7,b_5 /* mul_add_c(a[7],b[5],c1,c2,c3); */
1330 dmultu a_6,b_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */
1339 dmultu a_5,b_7 /* mul_add_c(a[5],b[7],c1,c2,c3); */
1348 sd c_1,96(a0) /* r[12]=c1; */
1350 dmultu a_6,b_7 /* mul_add_c(a[6],b[7],c2,c3,c1); */
1358 dmultu a_7,b_6 /* mul_add_c(a[7],b[6],c2,c3,c1); */
1367 sd c_2,104(a0) /* r[13]=c2; */
1369 dmultu a_7,b_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */
1382 sd c_3,112(a0) /* r[14]=c3; */
1383 sd c_1,120(a0) /* r[15]=c1; */
1385 PTR_ADD sp,FRAME_SIZE
1397 dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
1406 dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */
1412 dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */
1422 dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */
1429 dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
1437 dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */
1448 dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */
1456 dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */
1465 dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */
1474 dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */
1485 dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */
1493 dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
1502 dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */
1513 dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */
1521 dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */
1532 dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
1562 dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
1571 dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */
1584 dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */
1598 dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
1609 dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */
1623 dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */
1640 dmultu a_4,a_0 /* mul_add_c2(a[4],b[0],c2,c3,c1); */
1654 dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */
1669 dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
1680 dmultu a_0,a_5 /* mul_add_c2(a[0],b[5],c3,c1,c2); */
1694 dmultu a_1,a_4 /* mul_add_c2(a[1],b[4],c3,c1,c2); */
1709 dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */
1726 dmultu a_6,a_0 /* mul_add_c2(a[6],b[0],c1,c2,c3); */
1740 dmultu a_5,a_1 /* mul_add_c2(a[5],b[1],c1,c2,c3); */
1755 dmultu a_4,a_2 /* mul_add_c2(a[4],b[2],c1,c2,c3); */
1770 dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
1781 dmultu a_0,a_7 /* mul_add_c2(a[0],b[7],c2,c3,c1); */
1795 dmultu a_1,a_6 /* mul_add_c2(a[1],b[6],c2,c3,c1); */
1810 dmultu a_2,a_5 /* mul_add_c2(a[2],b[5],c2,c3,c1); */
1825 dmultu a_3,a_4 /* mul_add_c2(a[3],b[4],c2,c3,c1); */
1842 dmultu a_7,a_1 /* mul_add_c2(a[7],b[1],c3,c1,c2); */
1856 dmultu a_6,a_2 /* mul_add_c2(a[6],b[2],c3,c1,c2); */
1871 dmultu a_5,a_3 /* mul_add_c2(a[5],b[3],c3,c1,c2); */
1886 dmultu a_4,a_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */
1897 dmultu a_2,a_7 /* mul_add_c2(a[2],b[7],c1,c2,c3); */
1911 dmultu a_3,a_6 /* mul_add_c2(a[3],b[6],c1,c2,c3); */
1926 dmultu a_4,a_5 /* mul_add_c2(a[4],b[5],c1,c2,c3); */
1943 dmultu a_7,a_3 /* mul_add_c2(a[7],b[3],c2,c3,c1); */
1957 dmultu a_6,a_4 /* mul_add_c2(a[6],b[4],c2,c3,c1); */
1972 dmultu a_5,a_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */
1983 dmultu a_4,a_7 /* mul_add_c2(a[4],b[7],c3,c1,c2); */
1997 dmultu a_5,a_6 /* mul_add_c2(a[5],b[6],c3,c1,c2); */
2014 dmultu a_7,a_5 /* mul_add_c2(a[7],b[5],c1,c2,c3); */
2028 dmultu a_6,a_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */
2039 dmultu a_6,a_7 /* mul_add_c2(a[6],b[7],c2,c3,c1); */
2055 dmultu a_7,a_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */
2075 dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
2080 dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */
2093 dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */
2107 dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
2118 dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */
2132 dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */
2149 dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */
2163 dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
2174 dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */
2190 dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */