Bignum division tune-up. Idea is to move multiplications in front of

author Andy Polyakov <appro@openssl.org>

Fri, 30 Jul 1999 11:43:43 +0000 (11:43 +0000)

committer Andy Polyakov <appro@openssl.org>

Fri, 30 Jul 1999 11:43:43 +0000 (11:43 +0000)
author Andy Polyakov <appro@openssl.org>
Fri, 30 Jul 1999 11:43:43 +0000 (11:43 +0000)
committer Andy Polyakov <appro@openssl.org>
Fri, 30 Jul 1999 11:43:43 +0000 (11:43 +0000)
diff --git a/crypto/bn/asm/mips3.s b/crypto/bn/asm/mips3.s

index 2193c954b209f63a7daf469980cb6bfbd6852c24..191345d92071ce53de6a99758345fd21810437b5 100644 (file)
--- a/crypto/bn/asm/mips3.s
+++ b/crypto/bn/asm/mips3.s
@@ -1,5 +1,5 @@
  .rdata
-.asciiz "mips3.s, Version 1.0 (prerelease)"
+.asciiz "mips3.s, Version 1.0"
  .asciiz        "MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
  
  /*
@@ -19,19 +19,26 @@
   * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c
   * module. For updates see http://fy.chalmers.se/~appro/hpe/.
   *
- * The module is designed to work with "new" IRIX ABI(5), namely
- * N32 and N64. But it was tested only with MIPSpro 7.2.x assembler,
- * i.e. depends on preprocessor options set up by MIPSspro 7.2.x
- * driver. Another neat gadget offered by MIPSpro 7.2.x assembler is
- * an peep-hole(?) optimization pass. This gave me the opportunity
- * to make the code looking more regular as all those architecture
- * dependent(!) instruction rescheduling details were left to the
- * assembler. Cool, huh? Do note that I have no idea if GNU assembler
- * does anything similar nor how GNU C will do with this module.
- * Feedback on the matter is therefore very much appreciated:-)
+ * The module is designed to work with either of the "new" MIPS ABI(5),
+ * namely N32 or N64, offered by IRIX 6.x. It's not ment to work under
+ * IRIX 5.x not only because it doesn't support new ABIs but also
+ * because 5.x kernels put R4x00 CPU into 32-bit mode and all those
+ * 64-bit instructions (daddu, dmultu, etc.) found below gonna only
+ * cause illegal instruction exception:-(
+ *
+ * In addition the code depends on preprocessor flags set up by MIPSpro
+ * compiler driver (either as or cc) and therefore (probably?) can't be
+ * compiled by the GNU assembler. GNU C driver manages fine though...
+ * I mean as long as -mmips-as is specified or is the default option,
+ * because then it simply invokes /usr/bin/as which in turn takes
+ * perfect care of the preprocessor definitions. Another neat feature
+ * offered by the MIPSpro assembler is an optimization pass. This gave
+ * me the opportunity to have the code looking more regular as all those
+ * architecture dependent instruction rescheduling details were left to
+ * the assembler. Cool, huh?
   *
   * Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
- * exhibits 3-3.5-3.7 times improvement!
+ * goes way over 3 times faster!
   *
   *                                     <appro@fy.chalmers.se>
   */
@@ -56,8 +63,8 @@
  
  #define        MINUS4  v1
  
+.align 5
  LEAF(bn_mul_add_words)
-       .align  5
         .set    noreorder
         bgtzl   a2,.L_bn_mul_add_words_proceed
         ld      t0,0(a1)
@@ -185,8 +192,8 @@ LEAF(bn_mul_add_words)
         jr      ra
  END(bn_mul_add_words)
  
+.align 5
  LEAF(bn_mul_words)
-       .align  5
         .set    noreorder
         bgtzl   a2,.L_bn_mul_words_proceed
         ld      t0,0(a1)
@@ -284,8 +291,8 @@ LEAF(bn_mul_words)
         jr      ra
  END(bn_mul_words)
  
+.align 5
  LEAF(bn_sqr_words)
-       .align  5
         .set    noreorder
         bgtzl   a2,.L_bn_sqr_words_proceed
         ld      t0,0(a1)
@@ -371,8 +378,8 @@ LEAF(bn_sqr_words)
         jr      ra
  END(bn_sqr_words)
  
+.align 5
  LEAF(bn_add_words)
-       .align  5
         .set    noreorder
         bgtzl   a3,.L_bn_add_words_proceed
         ld      t0,0(a1)
@@ -471,8 +478,8 @@ LEAF(bn_add_words)
         jr      ra
  END(bn_add_words)
  
+.align 5
  LEAF(bn_sub_words)
-       .align  5
         .set    noreorder
         bgtzl   a3,.L_bn_sub_words_proceed
         ld      t0,0(a1)
@@ -567,24 +574,24 @@ END(bn_sub_words)
  
  #undef MINUS4
  
+.align 5
  LEAF(bn_div_words)
-       .align  5
         .set    noreorder
         bnezl   a2,.L_bn_div_words_proceed
-       move    t0,zero
+       move    v1,zero
         jr      ra
         li      v0,-1           /* I'd rather signal div-by-zero
                                  * which can be done with 'break 7' */
-       .set    reorder
  
  .L_bn_div_words_proceed:
         bltz    a2,.L_bn_div_words_body
-       .set    noreorder
+       move    t9,v1
         dsll    a2,1
         bgtz    a2,.-4
-       addu    t0,1
+       addu    t9,1
+
         .set    reorder
-       negu    t1,t0
+       negu    t1,t9
         li      t2,-1
         dsll    t2,t1
         and     t2,a0
@@ -593,65 +600,135 @@ LEAF(bn_div_words)
         bnezl   t2,.+8
         break   6               /* signal overflow */
         .set    reorder
-       dsll    a0,t0
-       dsll    a1,t0
+       dsll    a0,t9
+       dsll    a1,t9
         or      a0,AT
  
  #define        QT      ta0
-#define        DH      ta1
-#define        HH      ta2
-#define        MINUS1  ta3
+#define        HH      ta1
+#define        DH      v1
  .L_bn_div_words_body:
         dsrl    DH,a2,32
-       li      v1,2
         sgeu    AT,a0,a2
-       li      MINUS1,-1
         .set    noreorder
         bnezl   AT,.+8
         dsubu   a0,a2
         .set    reorder
  
-.L_bn_div_words_outer_loop:
+       li      QT,-1
         dsrl    HH,a0,32
-       subu    v1,1
-       dsrl    QT,MINUS1,32    /* q=0xffffffff */
-       beq     DH,HH,.L_bn_div_words_inner_loop
+       dsrl    QT,32   /* q=0xffffffff */
+       beq     DH,HH,.L_bn_div_words_skip_div1
         ddivu   zero,a0,DH
         mflo    QT
-.L_bn_div_words_inner_loop:
+.L_bn_div_words_skip_div1:
         dmultu  a2,QT
         dsll    t3,a0,32
         dsrl    AT,a1,32
         or      t3,AT
         mflo    t0
         mfhi    t1
+.L_bn_div_words_inner_loop1:
         sltu    t2,t3,t0
         seq     t8,HH,t1
         sltu    AT,HH,t1
         and     t2,t8
         or      AT,t2
         .set    noreorder
-       bnezl   AT,.L_bn_div_words_inner_loop
-       dsubu   QT,1
+       beqz    AT,.L_bn_div_words_inner_loop1_done
+       sltu    t2,t0,a2
         .set    reorder
-       
-       dsubu   a0,t3,t0
-       beqz    v1,.L_bn_div_words_outer_loop_done
+       dsubu   QT,1
+       dsubu   t0,a2
+       dsubu   t1,t2
+       b       .L_bn_div_words_inner_loop1
+.L_bn_div_words_inner_loop1_done:      
  
         dsll    a1,32
+       dsubu   a0,t3,t0
         dsll    v0,QT,32
-       b       .L_bn_div_words_outer_loop
  
-.L_bn_div_words_outer_loop_done:
+       li      QT,-1
+       dsrl    HH,a0,32
+       dsrl    QT,32   /* q=0xffffffff */
+       beq     DH,HH,.L_bn_div_words_skip_div2
+       ddivu   zero,a0,DH
+       mflo    QT
+.L_bn_div_words_skip_div2:
+       dmultu  a2,QT
+       dsll    t3,a0,32
+       dsrl    AT,a1,32
+       or      t3,AT
+       mflo    t0
+       mfhi    t1
+.L_bn_div_words_inner_loop2:
+       sltu    t2,t3,t0
+       seq     t8,HH,t1
+       sltu    AT,HH,t1
+       and     t2,t8
+       or      AT,t2
+       .set    noreorder
+       beqz    AT,.L_bn_div_words_inner_loop2_done
+       sltu    t2,t0,a2
+       .set    reorder
+       dsubu   QT,1
+       dsubu   t0,a2
+       dsubu   t1,t2
+       b       .L_bn_div_words_inner_loop2
+.L_bn_div_words_inner_loop2_done:      
+
+       dsubu   a0,t3,t0
         or      v0,QT
-       move    v1,a0   /* v1 contains remainder if one wants it */
+       dsrl    v1,a0,t9        /* v1 contains remainder if anybody wants it */
+       dsrl    a2,t9           /* restore a2 */
         jr      ra
-#undef MINUS1
  #undef HH
  #undef DH
  #undef QT
  END(bn_div_words)
  
+.align 5
+LEAF(bn_div_3_words)
+       .set    reorder
+       move    a3,a0           /* we know that bn_div_words doesn't
+                                * touch a3, ta2, ta3 and preserves a2
+                                * so that we can save two arguments
+                                * and return address in registers
+                                * instead of stack:-)
+                                */
+       ld      a0,(a3)
+       move    ta2,a2
+       move    a2,a1
+       ld      a1,-8(a3)
+       move    ta3,ra
+       move    v1,zero
+       li      v0,-1
+       beq     a0,a2,.L_bn_div_3_words_skip_div
+       jal     bn_div_words
+       move    ra,ta3
+.L_bn_div_3_words_skip_div:
+       dmultu  ta2,v0
+       ld      t2,-16(a3)
+       mflo    t0
+       mfhi    t1
+.L_bn_div_3_words_inner_loop:
+       sgeu    AT,t2,t0
+       seq     t9,t1,v1
+       sltu    t8,t1,v1
+       and     AT,t9
+       or      AT,t8
+       bnez    AT,.L_bn_div_3_words_inner_loop_done
+       daddu   v1,a2
+       sltu    t3,t0,ta2
+       sltu    AT,v1,a2
+       dsubu   v0,1
+       dsubu   t0,ta2
+       dsubu   t1,t3
+       beqz    AT,.L_bn_div_3_words_inner_loop
+.L_bn_div_3_words_inner_loop_done:
+       jr      ra
+END(bn_div_3_words)
+
  #define        a_0     t0
  #define        a_1     t1
  #define        a_2     t2
@@ -679,20 +756,19 @@ END(bn_div_words)
  
  #define        FRAME_SIZE      48
  
+.align 5
  LEAF(bn_mul_comba8)
-       .align  5
         .set    noreorder
         PTR_SUB sp,FRAME_SIZE
         .frame  sp,64,ra
         .set    reorder
-       ld      a_0,0(a1)       /* If compiled with -mips3 options
-                                * assembler barks on this line with
-                                * "shouldn't have mult/div as last
-                                * instruction in bb (R10K bug)"
-                                * warning. If anybody out there has
-                                * a clue on what does "bb" mean and
-                                * how to circumvent this do send me
-                                * a note.
+       ld      a_0,0(a1)       /* If compiled with -mips3 option on
+                                * R5000 box assembler barks on this
+                                * line with "shouldn't have mult/div
+                                * as last instruction in bb (R10K
+                                * bug)" warning. If anybody out there
+                                * has a clue about how to circumvent
+                                * this do send me a note.
                                  *              <appro@fy.chalmers.se>
                                  */
         ld      b_0,0(a2)
@@ -1286,8 +1362,8 @@ LEAF(bn_mul_comba8)
         jr      ra
  END(bn_mul_comba8)
  
+.align 5
  LEAF(bn_mul_comba4)
-       .align  5
         .set    reorder
         ld      a_0,0(a1)
         ld      b_0,0(a2)
@@ -1444,8 +1520,8 @@ END(bn_mul_comba4)
  #define        a_6     b_2
  #define        a_7     b_3
  
+.align 5
  LEAF(bn_sqr_comba8)
-       .align  5
         .set    reorder
         ld      a_0,0(a1)
         ld      a_1,8(a1)
@@ -1934,8 +2010,8 @@ LEAF(bn_sqr_comba8)
         jr      ra
  END(bn_sqr_comba8)
  
+.align 5
  LEAF(bn_sqr_comba4)
-       .align  5
         .set    reorder
         ld      a_0,0(a1)
         ld      a_1,8(a1)
diff --git a/crypto/bn/bn_asm.c b/crypto/bn/bn_asm.c

index 286a0f1e74b79967a66d30fffcee3a4c7ed17226..4d3da16a0c9ae173a07be3bb0eb2f1c6f6415df6 100644 (file)
--- a/crypto/bn/bn_asm.c
+++ b/crypto/bn/bn_asm.c
@@ -264,18 +264,20 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
                 else
                         q=h/dh;
  
+               th=q*dh;
+               tl=dl*q;
                 for (;;)
                         {
-                       t=(h-(th=q*dh));
-                       tl=BN_MASK2;
+                       t=h-th;
                         if ((t&BN_MASK2h) ||
-                               ((tl=dl*q) <= (
+                               ((tl) <= (
                                         (t<<BN_BITS4)|
                                         ((l&BN_MASK2h)>>BN_BITS4))))
                                 break;
                         q--;
+                       th-=dh;
+                       tl-=dl;
                         }
-               if (tl==BN_MASK2) tl=q*dl;
                 t=(tl>>BN_BITS4);
                 tl=(tl<<BN_BITS4)&BN_MASK2h;
                 th+=t;
diff --git a/crypto/bn/bn_div.c b/crypto/bn/bn_div.c

index defcf90c8223d282b36306aa3737dc3b22ca7460..03b9152241798e5154c619c3f1b5ea0d332cd995 100644 (file)
--- a/crypto/bn/bn_div.c
+++ b/crypto/bn/bn_div.c
@@ -200,56 +200,69 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
  
         for (i=0; i<loop-1; i++)
                 {
-               BN_ULONG q,n0,n1;
-               BN_ULONG l0;
+               BN_ULONG q,l0;
+#ifdef BN_DIV3W
+               q=bn_div_3_words(wnump,d0,d1);
+#else
+               BN_ULONG n0,n1,rem;
  
-               wnum.d--; wnum.top++;
                 n0=wnump[0];
                 n1=wnump[-1];
                 if (n0 == d0)
                         q=BN_MASK2;
                 else
+#if defined(BN_LLONG) && defined(BN_DIV2W)
+                       q=((((BN_ULLONG)n0)<<BN_BITS2)|n1)/((BN_ULLONG)d0);
+#else
                         q=bn_div_words(n0,n1,d0);
+#endif
                 {
  #ifdef BN_LLONG
-               BN_ULLONG t1,t2,rem;
-               t1=((BN_ULLONG)n0<<BN_BITS2)|n1;
+               BN_ULLONG t2;
+
+               /*
+                * rem doesn't have to be BN_ULLONG. The least we
+                * know it's less that d0, isn't it?
+                */
+               rem=(n1-q*d0)&BN_MASK2;
+
+               t2=(BN_ULLONG)d1*q;
                 for (;;)
                         {
-                       rem=t1-(BN_ULLONG)q*d0;
-                       t2=(BN_ULLONG)d1*q;
-                        if ((rem>>BN_BITS2) ||
-                               (t2 <= ((rem<<BN_BITS2)|wnump[-2])))
+                        if (t2 <= ((((BN_ULLONG)rem)<<BN_BITS2)|wnump[-2]))
                                 break;
                         q--;
+                       rem += d0;
+                       if (rem < d0) break; /* don't let rem overflow */
+                       t2 -= d1;
                         }
  #else
-               BN_ULONG t1l,t1h,t2l,t2h,t3l,t3h,ql,qh,t3t;
-               t1h=n0;
-               t1l=n1;
+               BN_ULONG t2l,t2h,ql,qh;
+
+               /*
+                * It's more than enough with the only multiplication.
+                * See the comment above in BN_LLONG section...
+                */
+               rem=(n1-q*d0)&BN_MASK2;
+
+               t2l=LBITS(d1); t2h=HBITS(d1);
+               ql =LBITS(q);  qh =HBITS(q);
+               mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
+
                 for (;;)
                         {
-                       t2l=LBITS(d1); t2h=HBITS(d1);
-                       ql =LBITS(q);  qh =HBITS(q);
-                       mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
-
-                       t3t=LBITS(d0); t3h=HBITS(d0);
-                       mul64(t3t,t3h,ql,qh); /* t3=t1-(BN_ULLONG)q*d0; */
-                       t3l=(t1l-t3t)&BN_MASK2;
-                       if (t3l > t1l) t3h++;
-                       t3h=(t1h-t3h)&BN_MASK2;
-
-                       /*if ((t3>>BN_BITS2) ||
-                               (t2 <= ((t3<<BN_BITS2)+wnump[-2])))
-                               break; */
-                       if (t3h) break;
-                       if (t2h < t3l) break;
-                       if ((t2h == t3l) && (t2l <= wnump[-2])) break;
-
+                       if ((t2h < rem) ||
+                               ((t2h == rem) && (t2l <= wnump[-2])))
+                               break;
                         q--;
+                       rem += d0;
+                       if (rem < d0) break; /* don't let rem overflow */
+                       if (t2l < d1) t2h--; t2l -= d1;
                         }
  #endif
                 }
+#endif /* BN_DIV3W */
+               wnum.d--; wnum.top++;
                 l0=bn_mul_words(tmp->d,sdiv->d,div_n,q);
                 tmp->d[div_n]=l0;
                 for (j=div_n+1; j>0; j--)
author	Andy Polyakov <appro@openssl.org>
	Fri, 30 Jul 1999 11:43:43 +0000 (11:43 +0000)
committer	Andy Polyakov <appro@openssl.org>
	Fri, 30 Jul 1999 11:43:43 +0000 (11:43 +0000)
crypto/bn/asm/mips3.s		patch \| blob \| history
crypto/bn/bn_asm.c		patch \| blob \| history
crypto/bn/bn_div.c		patch \| blob \| history