crypto/bn/asm/x86_64-gcc.c

   1 #ifdef __SUNPRO_C
   2 # include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
   3 #else
   4 /*
   5  * x86_64 BIGNUM accelerator version 0.1, December 2002.
   6  *
   7  * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
   8  * project.
   9  *
  10  * Rights for redistribution and usage in source and binary forms are
  11  * granted according to the OpenSSL license. Warranty of any kind is
  12  * disclaimed.
  13  *
  14  * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
  15  *    versions, like 1.0...
  16  * A. Well, that's because this code is basically a quick-n-dirty
  17  *    proof-of-concept hack. As you can see it's implemented with
  18  *    inline assembler, which means that you're bound to GCC and that
  19  *    there might be enough room for further improvement.
  20  *
  21  * Q. Why inline assembler?
  22  * A. x86_64 features own ABI which I'm not familiar with. This is
  23  *    why I decided to let the compiler take care of subroutine
  24  *    prologue/epilogue as well as register allocation. For reference.
  25  *    Win64 implements different ABI for AMD64, different from Linux.
  26  *
  27  * Q. How much faster does it get?
  28  * A. 'apps/openssl speed rsa dsa' output with no-asm:
  29  *
  30  *                        sign    verify    sign/s verify/s
  31  *      rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
  32  *      rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
  33  *      rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
  34  *      rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
  35  *                        sign    verify    sign/s verify/s
  36  *      dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
  37  *      dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
  38  *      dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
  39  *
  40  *    'apps/openssl speed rsa dsa' output with this module:
  41  *
  42  *                        sign    verify    sign/s verify/s
  43  *      rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
  44  *      rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
  45  *      rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
  46  *      rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
  47  *                        sign    verify    sign/s verify/s
  48  *      dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
  49  *      dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
  50  *      dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
  51  *
  52  *    For the reference. IA-32 assembler implementation performs
  53  *    very much like 64-bit code compiled with no-asm on the same
  54  *    machine.
  55  */
  56
  57 #define BN_ULONG unsigned long
  58
  59 /*
  60  * "m"(a), "+m"(r)      is the way to favor DirectPath µ-code;
  61  * "g"(0)               let the compiler to decide where does it
  62  *                      want to keep the value of zero;
  63  */
  64 #define mul_add(r,a,word,carry) do {    \
  65         register BN_ULONG high,low;     \
  66         asm ("mulq %3"                  \
  67                 : "=a"(low),"=d"(high)  \
  68                 : "a"(word),"m"(a)      \
  69                 : "cc");                \
  70         asm ("addq %2,%0; adcq %3,%1"   \
  71                 : "+r"(carry),"+d"(high)\
  72                 : "a"(low),"g"(0)       \
  73                 : "cc");                \
  74         asm ("addq %2,%0; adcq %3,%1"   \
  75                 : "+m"(r),"+d"(high)    \
  76                 : "r"(carry),"g"(0)     \
  77                 : "cc");                \
  78         carry=high;                     \
  79         } while (0)
  80
  81 #define mul(r,a,word,carry) do {        \
  82         register BN_ULONG high,low;     \
  83         asm ("mulq %3"                  \
  84                 : "=a"(low),"=d"(high)  \
  85                 : "a"(word),"g"(a)      \
  86                 : "cc");                \
  87         asm ("addq %2,%0; adcq %3,%1"   \
  88                 : "+r"(carry),"+d"(high)\
  89                 : "a"(low),"g"(0)       \
  90                 : "cc");                \
  91         (r)=carry, carry=high;          \
  92         } while (0)
  93
  94 #define sqr(r0,r1,a)                    \
  95         asm ("mulq %2"                  \
  96                 : "=a"(r0),"=d"(r1)     \
  97                 : "a"(a)                \
  98                 : "cc");
  99
 100 BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
 101         {
 102         BN_ULONG c1=0;
 103
 104         if (num <= 0) return(c1);
 105
 106         while (num&~3)
 107                 {
 108                 mul_add(rp[0],ap[0],w,c1);
 109                 mul_add(rp[1],ap[1],w,c1);
 110                 mul_add(rp[2],ap[2],w,c1);
 111                 mul_add(rp[3],ap[3],w,c1);
 112                 ap+=4; rp+=4; num-=4;
 113                 }
 114         if (num)
 115                 {
 116                 mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
 117                 mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
 118                 mul_add(rp[2],ap[2],w,c1); return c1;
 119                 }
 120
 121         return(c1);
 122         }
 123
 124 BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
 125         {
 126         BN_ULONG c1=0;
 127
 128         if (num <= 0) return(c1);
 129
 130         while (num&~3)
 131                 {
 132                 mul(rp[0],ap[0],w,c1);
 133                 mul(rp[1],ap[1],w,c1);
 134                 mul(rp[2],ap[2],w,c1);
 135                 mul(rp[3],ap[3],w,c1);
 136                 ap+=4; rp+=4; num-=4;
 137                 }
 138         if (num)
 139                 {
 140                 mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
 141                 mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
 142                 mul(rp[2],ap[2],w,c1);
 143                 }
 144         return(c1);
 145         }
 146
 147 void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
 148         {
 149         if (n <= 0) return;
 150
 151         while (n&~3)
 152                 {
 153                 sqr(r[0],r[1],a[0]);
 154                 sqr(r[2],r[3],a[1]);
 155                 sqr(r[4],r[5],a[2]);
 156                 sqr(r[6],r[7],a[3]);
 157                 a+=4; r+=8; n-=4;
 158                 }
 159         if (n)
 160                 {
 161                 sqr(r[0],r[1],a[0]); if (--n == 0) return;
 162                 sqr(r[2],r[3],a[1]); if (--n == 0) return;
 163                 sqr(r[4],r[5],a[2]);
 164                 }
 165         }
 166
 167 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 168 {       BN_ULONG ret,waste;
 169
 170         asm ("divq      %4"
 171                 : "=a"(ret),"=d"(waste)
 172                 : "a"(l),"d"(h),"g"(d)
 173                 : "cc");
 174
 175         return ret;
 176 }
 177
 178 BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
 179 { BN_ULONG ret=0,i=0;
 180
 181         if (n <= 0) return 0;
 182
 183         asm (
 184         "       subq    %2,%2           \n"
 185         ".p2align 4                     \n"
 186         "1:     movq    (%4,%2,8),%0    \n"
 187         "       adcq    (%5,%2,8),%0    \n"
 188         "       movq    %0,(%3,%2,8)    \n"
 189         "       leaq    1(%2),%2        \n"
 190         "       loop    1b              \n"
 191         "       sbbq    %0,%0           \n"
 192                 : "=&a"(ret),"+c"(n),"=&r"(i)
 193                 : "r"(rp),"r"(ap),"r"(bp)
 194                 : "cc"
 195         );
 196
 197   return ret&1;
 198 }
 199
 200 #ifndef SIMICS
 201 BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
 202 { BN_ULONG ret=0,i=0;
 203
 204         if (n <= 0) return 0;
 205
 206         asm (
 207         "       subq    %2,%2           \n"
 208         ".p2align 4                     \n"
 209         "1:     movq    (%4,%2,8),%0    \n"
 210         "       sbbq    (%5,%2,8),%0    \n"
 211         "       movq    %0,(%3,%2,8)    \n"
 212         "       leaq    1(%2),%2        \n"
 213         "       loop    1b              \n"
 214         "       sbbq    %0,%0           \n"
 215                 : "=&a"(ret),"+c"(n),"=&r"(i)
 216                 : "r"(rp),"r"(ap),"r"(bp)
 217                 : "cc"
 218         );
 219
 220   return ret&1;
 221 }
 222 #else
 223 /* Simics 1.4<7 has buggy sbbq:-( */
 224 #define BN_MASK2 0xffffffffffffffffL
 225 BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
 226         {
 227         BN_ULONG t1,t2;
 228         int c=0;
 229
 230         if (n <= 0) return((BN_ULONG)0);
 231
 232         for (;;)
 233                 {
 234                 t1=a[0]; t2=b[0];
 235                 r[0]=(t1-t2-c)&BN_MASK2;
 236                 if (t1 != t2) c=(t1 < t2);
 237                 if (--n <= 0) break;
 238
 239                 t1=a[1]; t2=b[1];
 240                 r[1]=(t1-t2-c)&BN_MASK2;
 241                 if (t1 != t2) c=(t1 < t2);
 242                 if (--n <= 0) break;
 243
 244                 t1=a[2]; t2=b[2];
 245                 r[2]=(t1-t2-c)&BN_MASK2;
 246                 if (t1 != t2) c=(t1 < t2);
 247                 if (--n <= 0) break;
 248
 249                 t1=a[3]; t2=b[3];
 250                 r[3]=(t1-t2-c)&BN_MASK2;
 251                 if (t1 != t2) c=(t1 < t2);
 252                 if (--n <= 0) break;
 253
 254                 a+=4;
 255                 b+=4;
 256                 r+=4;
 257                 }
 258         return(c);
 259         }
 260 #endif
 261
 262 /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
 263 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
 264 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
 265 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
 266
 267 #if 0
 268 /* original macros are kept for reference purposes */
 269 #define mul_add_c(a,b,c0,c1,c2) {       \
 270         BN_ULONG ta=(a),tb=(b);         \
 271         t1 = ta * tb;                   \
 272         t2 = BN_UMULT_HIGH(ta,tb);      \
 273         c0 += t1; t2 += (c0<t1)?1:0;    \
 274         c1 += t2; c2 += (c1<t2)?1:0;    \
 275         }
 276
 277 #define mul_add_c2(a,b,c0,c1,c2) {      \
 278         BN_ULONG ta=(a),tb=(b),t0;      \
 279         t1 = BN_UMULT_HIGH(ta,tb);      \
 280         t0 = ta * tb;                   \
 281         t2 = t1+t1; c2 += (t2<t1)?1:0;  \
 282         t1 = t0+t0; t2 += (t1<t0)?1:0;  \
 283         c0 += t1; t2 += (c0<t1)?1:0;    \
 284         c1 += t2; c2 += (c1<t2)?1:0;    \
 285         }
 286 #else
 287 #define mul_add_c(a,b,c0,c1,c2) do {    \
 288         asm ("mulq %3"                  \
 289                 : "=a"(t1),"=d"(t2)     \
 290                 : "a"(a),"m"(b)         \
 291                 : "cc");                \
 292         asm ("addq %2,%0; adcq %3,%1"   \
 293                 : "+r"(c0),"+d"(t2)     \
 294                 : "a"(t1),"g"(0)        \
 295                 : "cc");                \
 296         asm ("addq %2,%0; adcq %3,%1"   \
 297                 : "+r"(c1),"+r"(c2)     \
 298                 : "d"(t2),"g"(0)        \
 299                 : "cc");                \
 300         } while (0)
 301
 302 #define sqr_add_c(a,i,c0,c1,c2) do {    \
 303         asm ("mulq %2"                  \
 304                 : "=a"(t1),"=d"(t2)     \
 305                 : "a"(a[i])             \
 306                 : "cc");                \
 307         asm ("addq %2,%0; adcq %3,%1"   \
 308                 : "+r"(c0),"+d"(t2)     \
 309                 : "a"(t1),"g"(0)        \
 310                 : "cc");                \
 311         asm ("addq %2,%0; adcq %3,%1"   \
 312                 : "+r"(c1),"+r"(c2)     \
 313                 : "d"(t2),"g"(0)        \
 314                 : "cc");                \
 315         } while (0)
 316
 317 #define mul_add_c2(a,b,c0,c1,c2) do {   \
 318         asm ("mulq %3"                  \
 319                 : "=a"(t1),"=d"(t2)     \
 320                 : "a"(a),"m"(b)         \
 321                 : "cc");                \
 322         asm ("addq %0,%0; adcq %2,%1"   \
 323                 : "+d"(t2),"+r"(c2)     \
 324                 : "g"(0)                \
 325                 : "cc");                \
 326         asm ("addq %0,%0; adcq %2,%1"   \
 327                 : "+a"(t1),"+d"(t2)     \
 328                 : "g"(0)                \
 329                 : "cc");                \
 330         asm ("addq %2,%0; adcq %3,%1"   \
 331                 : "+r"(c0),"+d"(t2)     \
 332                 : "a"(t1),"g"(0)        \
 333                 : "cc");                \
 334         asm ("addq %2,%0; adcq %3,%1"   \
 335                 : "+r"(c1),"+r"(c2)     \
 336                 : "d"(t2),"g"(0)        \
 337                 : "cc");                \
 338         } while (0)
 339 #endif
 340
 341 #define sqr_add_c2(a,i,j,c0,c1,c2)      \
 342         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 343
 344 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 345         {
 346         BN_ULONG t1,t2;
 347         BN_ULONG c1,c2,c3;
 348
 349         c1=0;
 350         c2=0;
 351         c3=0;
 352         mul_add_c(a[0],b[0],c1,c2,c3);
 353         r[0]=c1;
 354         c1=0;
 355         mul_add_c(a[0],b[1],c2,c3,c1);
 356         mul_add_c(a[1],b[0],c2,c3,c1);
 357         r[1]=c2;
 358         c2=0;
 359         mul_add_c(a[2],b[0],c3,c1,c2);
 360         mul_add_c(a[1],b[1],c3,c1,c2);
 361         mul_add_c(a[0],b[2],c3,c1,c2);
 362         r[2]=c3;
 363         c3=0;
 364         mul_add_c(a[0],b[3],c1,c2,c3);
 365         mul_add_c(a[1],b[2],c1,c2,c3);
 366         mul_add_c(a[2],b[1],c1,c2,c3);
 367         mul_add_c(a[3],b[0],c1,c2,c3);
 368         r[3]=c1;
 369         c1=0;
 370         mul_add_c(a[4],b[0],c2,c3,c1);
 371         mul_add_c(a[3],b[1],c2,c3,c1);
 372         mul_add_c(a[2],b[2],c2,c3,c1);
 373         mul_add_c(a[1],b[3],c2,c3,c1);
 374         mul_add_c(a[0],b[4],c2,c3,c1);
 375         r[4]=c2;
 376         c2=0;
 377         mul_add_c(a[0],b[5],c3,c1,c2);
 378         mul_add_c(a[1],b[4],c3,c1,c2);
 379         mul_add_c(a[2],b[3],c3,c1,c2);
 380         mul_add_c(a[3],b[2],c3,c1,c2);
 381         mul_add_c(a[4],b[1],c3,c1,c2);
 382         mul_add_c(a[5],b[0],c3,c1,c2);
 383         r[5]=c3;
 384         c3=0;
 385         mul_add_c(a[6],b[0],c1,c2,c3);
 386         mul_add_c(a[5],b[1],c1,c2,c3);
 387         mul_add_c(a[4],b[2],c1,c2,c3);
 388         mul_add_c(a[3],b[3],c1,c2,c3);
 389         mul_add_c(a[2],b[4],c1,c2,c3);
 390         mul_add_c(a[1],b[5],c1,c2,c3);
 391         mul_add_c(a[0],b[6],c1,c2,c3);
 392         r[6]=c1;
 393         c1=0;
 394         mul_add_c(a[0],b[7],c2,c3,c1);
 395         mul_add_c(a[1],b[6],c2,c3,c1);
 396         mul_add_c(a[2],b[5],c2,c3,c1);
 397         mul_add_c(a[3],b[4],c2,c3,c1);
 398         mul_add_c(a[4],b[3],c2,c3,c1);
 399         mul_add_c(a[5],b[2],c2,c3,c1);
 400         mul_add_c(a[6],b[1],c2,c3,c1);
 401         mul_add_c(a[7],b[0],c2,c3,c1);
 402         r[7]=c2;
 403         c2=0;
 404         mul_add_c(a[7],b[1],c3,c1,c2);
 405         mul_add_c(a[6],b[2],c3,c1,c2);
 406         mul_add_c(a[5],b[3],c3,c1,c2);
 407         mul_add_c(a[4],b[4],c3,c1,c2);
 408         mul_add_c(a[3],b[5],c3,c1,c2);
 409         mul_add_c(a[2],b[6],c3,c1,c2);
 410         mul_add_c(a[1],b[7],c3,c1,c2);
 411         r[8]=c3;
 412         c3=0;
 413         mul_add_c(a[2],b[7],c1,c2,c3);
 414         mul_add_c(a[3],b[6],c1,c2,c3);
 415         mul_add_c(a[4],b[5],c1,c2,c3);
 416         mul_add_c(a[5],b[4],c1,c2,c3);
 417         mul_add_c(a[6],b[3],c1,c2,c3);
 418         mul_add_c(a[7],b[2],c1,c2,c3);
 419         r[9]=c1;
 420         c1=0;
 421         mul_add_c(a[7],b[3],c2,c3,c1);
 422         mul_add_c(a[6],b[4],c2,c3,c1);
 423         mul_add_c(a[5],b[5],c2,c3,c1);
 424         mul_add_c(a[4],b[6],c2,c3,c1);
 425         mul_add_c(a[3],b[7],c2,c3,c1);
 426         r[10]=c2;
 427         c2=0;
 428         mul_add_c(a[4],b[7],c3,c1,c2);
 429         mul_add_c(a[5],b[6],c3,c1,c2);
 430         mul_add_c(a[6],b[5],c3,c1,c2);
 431         mul_add_c(a[7],b[4],c3,c1,c2);
 432         r[11]=c3;
 433         c3=0;
 434         mul_add_c(a[7],b[5],c1,c2,c3);
 435         mul_add_c(a[6],b[6],c1,c2,c3);
 436         mul_add_c(a[5],b[7],c1,c2,c3);
 437         r[12]=c1;
 438         c1=0;
 439         mul_add_c(a[6],b[7],c2,c3,c1);
 440         mul_add_c(a[7],b[6],c2,c3,c1);
 441         r[13]=c2;
 442         c2=0;
 443         mul_add_c(a[7],b[7],c3,c1,c2);
 444         r[14]=c3;
 445         r[15]=c1;
 446         }
 447
 448 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 449         {
 450         BN_ULONG t1,t2;
 451         BN_ULONG c1,c2,c3;
 452
 453         c1=0;
 454         c2=0;
 455         c3=0;
 456         mul_add_c(a[0],b[0],c1,c2,c3);
 457         r[0]=c1;
 458         c1=0;
 459         mul_add_c(a[0],b[1],c2,c3,c1);
 460         mul_add_c(a[1],b[0],c2,c3,c1);
 461         r[1]=c2;
 462         c2=0;
 463         mul_add_c(a[2],b[0],c3,c1,c2);
 464         mul_add_c(a[1],b[1],c3,c1,c2);
 465         mul_add_c(a[0],b[2],c3,c1,c2);
 466         r[2]=c3;
 467         c3=0;
 468         mul_add_c(a[0],b[3],c1,c2,c3);
 469         mul_add_c(a[1],b[2],c1,c2,c3);
 470         mul_add_c(a[2],b[1],c1,c2,c3);
 471         mul_add_c(a[3],b[0],c1,c2,c3);
 472         r[3]=c1;
 473         c1=0;
 474         mul_add_c(a[3],b[1],c2,c3,c1);
 475         mul_add_c(a[2],b[2],c2,c3,c1);
 476         mul_add_c(a[1],b[3],c2,c3,c1);
 477         r[4]=c2;
 478         c2=0;
 479         mul_add_c(a[2],b[3],c3,c1,c2);
 480         mul_add_c(a[3],b[2],c3,c1,c2);
 481         r[5]=c3;
 482         c3=0;
 483         mul_add_c(a[3],b[3],c1,c2,c3);
 484         r[6]=c1;
 485         r[7]=c2;
 486         }
 487
 488 void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
 489         {
 490         BN_ULONG t1,t2;
 491         BN_ULONG c1,c2,c3;
 492
 493         c1=0;
 494         c2=0;
 495         c3=0;
 496         sqr_add_c(a,0,c1,c2,c3);
 497         r[0]=c1;
 498         c1=0;
 499         sqr_add_c2(a,1,0,c2,c3,c1);
 500         r[1]=c2;
 501         c2=0;
 502         sqr_add_c(a,1,c3,c1,c2);
 503         sqr_add_c2(a,2,0,c3,c1,c2);
 504         r[2]=c3;
 505         c3=0;
 506         sqr_add_c2(a,3,0,c1,c2,c3);
 507         sqr_add_c2(a,2,1,c1,c2,c3);
 508         r[3]=c1;
 509         c1=0;
 510         sqr_add_c(a,2,c2,c3,c1);
 511         sqr_add_c2(a,3,1,c2,c3,c1);
 512         sqr_add_c2(a,4,0,c2,c3,c1);
 513         r[4]=c2;
 514         c2=0;
 515         sqr_add_c2(a,5,0,c3,c1,c2);
 516         sqr_add_c2(a,4,1,c3,c1,c2);
 517         sqr_add_c2(a,3,2,c3,c1,c2);
 518         r[5]=c3;
 519         c3=0;
 520         sqr_add_c(a,3,c1,c2,c3);
 521         sqr_add_c2(a,4,2,c1,c2,c3);
 522         sqr_add_c2(a,5,1,c1,c2,c3);
 523         sqr_add_c2(a,6,0,c1,c2,c3);
 524         r[6]=c1;
 525         c1=0;
 526         sqr_add_c2(a,7,0,c2,c3,c1);
 527         sqr_add_c2(a,6,1,c2,c3,c1);
 528         sqr_add_c2(a,5,2,c2,c3,c1);
 529         sqr_add_c2(a,4,3,c2,c3,c1);
 530         r[7]=c2;
 531         c2=0;
 532         sqr_add_c(a,4,c3,c1,c2);
 533         sqr_add_c2(a,5,3,c3,c1,c2);
 534         sqr_add_c2(a,6,2,c3,c1,c2);
 535         sqr_add_c2(a,7,1,c3,c1,c2);
 536         r[8]=c3;
 537         c3=0;
 538         sqr_add_c2(a,7,2,c1,c2,c3);
 539         sqr_add_c2(a,6,3,c1,c2,c3);
 540         sqr_add_c2(a,5,4,c1,c2,c3);
 541         r[9]=c1;
 542         c1=0;
 543         sqr_add_c(a,5,c2,c3,c1);
 544         sqr_add_c2(a,6,4,c2,c3,c1);
 545         sqr_add_c2(a,7,3,c2,c3,c1);
 546         r[10]=c2;
 547         c2=0;
 548         sqr_add_c2(a,7,4,c3,c1,c2);
 549         sqr_add_c2(a,6,5,c3,c1,c2);
 550         r[11]=c3;
 551         c3=0;
 552         sqr_add_c(a,6,c1,c2,c3);
 553         sqr_add_c2(a,7,5,c1,c2,c3);
 554         r[12]=c1;
 555         c1=0;
 556         sqr_add_c2(a,7,6,c2,c3,c1);
 557         r[13]=c2;
 558         c2=0;
 559         sqr_add_c(a,7,c3,c1,c2);
 560         r[14]=c3;
 561         r[15]=c1;
 562         }
 563
 564 void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
 565         {
 566         BN_ULONG t1,t2;
 567         BN_ULONG c1,c2,c3;
 568
 569         c1=0;
 570         c2=0;
 571         c3=0;
 572         sqr_add_c(a,0,c1,c2,c3);
 573         r[0]=c1;
 574         c1=0;
 575         sqr_add_c2(a,1,0,c2,c3,c1);
 576         r[1]=c2;
 577         c2=0;
 578         sqr_add_c(a,1,c3,c1,c2);
 579         sqr_add_c2(a,2,0,c3,c1,c2);
 580         r[2]=c3;
 581         c3=0;
 582         sqr_add_c2(a,3,0,c1,c2,c3);
 583         sqr_add_c2(a,2,1,c1,c2,c3);
 584         r[3]=c1;
 585         c1=0;
 586         sqr_add_c(a,2,c2,c3,c1);
 587         sqr_add_c2(a,3,1,c2,c3,c1);
 588         r[4]=c2;
 589         c2=0;
 590         sqr_add_c2(a,3,2,c3,c1,c2);
 591         r[5]=c3;
 592         c3=0;
 593         sqr_add_c(a,3,c1,c2,c3);
 594         r[6]=c1;
 595         r[7]=c2;
 596         }
 597 #endif