bn_asm for s390x.
author: Andy Polyakov <appro@openssl.org>
Wed, 20 Jun 2007 14:10:16 +0000 (14:10 +0000)
committer: Andy Polyakov <appro@openssl.org>
Wed, 20 Jun 2007 14:10:16 +0000 (14:10 +0000)
crypto/bn/asm/s390x.S [new file with mode: 0755]

diff --git a/crypto/bn/asm/s390x.S b/crypto/bn/asm/s390x.S
new file mode 100755 (executable)
index 0000000..8f45f5d
--- /dev/null
@@ -0,0 +1,678 @@
+.ident "s390x.S, version 1.0"
+// ====================================================================
+// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+// project.
+//
+// Rights for redistribution and usage in source and binary forms are
+// granted according to the OpenSSL license. Warranty of any kind is
+// disclaimed.
+// ====================================================================
+
+.text
+
+// Register used as an all-zero operand, so that "alcgr x,zero" adds only the
+// pending carry to x. Every routine that uses it loads 0 into it first.
+#define zero   %r0
+
+// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
+//
+// rp[i] += ap[i]*w for i in [0,len), propagating a one-word carry from each
+// word to the next; the carry left after the last word is returned.
+//   in:  %r2 = rp, %r3 = ap, %r4 = len, %r5 = w
+//   out: %r2 = final carry (0 when len<=0)
+// %r0 (zero) and %r1 are used as scratch; %r6-%r10 are stored to and
+// reloaded from 48(%r15).
+.globl bn_mul_add_words
+.type  bn_mul_add_words,@function
+.align 4
+bn_mul_add_words:
+       lghi    zero,0          // zero = 0
+       la      %r1,0(%r2)      // put rp aside, %r2 becomes the byte index
+       lghi    %r2,0           // i=0;
+       ltgfr   %r4,%r4         // sign-extend the int len and test it
+       bler    %r14            // if (len<=0) return 0;
+
+       stmg    %r6,%r10,48(%r15)
+       lghi    %r8,0           // carry = 0
+       srag    %r10,%r4,2      // cnt=len/4
+       jz      .Loop1_madd     // fewer than 4 words: only the 1-word loop
+
+// Four words per iteration. mlgr leaves the 128-bit product in an even/odd
+// register pair, so the running carry alternates between %r8 and %r6.
+.Loop4_madd:
+       lg      %r7,0(%r2,%r3)  // ap[i]
+       mlgr    %r6,%r5         // *=w, %r6:%r7 = high:low
+       algr    %r7,%r8         // +=carry
+       alcgr   %r6,zero        // fold carry-out into high word
+       alg     %r7,0(%r2,%r1)  // +=rp[i]
+       alcgr   %r6,zero        // fold carry-out into high word
+       stg     %r7,0(%r2,%r1)  // rp[i]=
+
+       lg      %r9,8(%r2,%r3)  // ap[i+1], incoming carry is %r6
+       mlgr    %r8,%r5         // %r8:%r9 = high:low
+       algr    %r9,%r6
+       alcgr   %r8,zero
+       alg     %r9,8(%r2,%r1)
+       alcgr   %r8,zero
+       stg     %r9,8(%r2,%r1)
+
+       lg      %r7,16(%r2,%r3) // ap[i+2], incoming carry is %r8
+       mlgr    %r6,%r5
+       algr    %r7,%r8
+       alcgr   %r6,zero
+       alg     %r7,16(%r2,%r1)
+       alcgr   %r6,zero
+       stg     %r7,16(%r2,%r1)
+
+       lg      %r9,24(%r2,%r3) // ap[i+3], incoming carry is %r6
+       mlgr    %r8,%r5
+       algr    %r9,%r6
+       alcgr   %r8,zero
+       alg     %r9,24(%r2,%r1)
+       alcgr   %r8,zero        // carry for the next iteration ends up in %r8
+       stg     %r9,24(%r2,%r1)
+
+       la      %r2,32(%r2)     // i+=4
+       brct    %r10,.Loop4_madd
+
+       lghi    %r10,3
+       nr      %r4,%r10        // cnt=len%4
+       jz      .Lend_madd      // no leftover words
+
+// One word per iteration; carry is kept in %r8.
+.Loop1_madd:
+       lg      %r7,0(%r2,%r3)  // ap[i]
+       mlgr    %r6,%r5         // *=w
+       algr    %r7,%r8         // +=carry
+       alcgr   %r6,zero
+       alg     %r7,0(%r2,%r1)  // +=rp[i]
+       alcgr   %r6,zero
+       stg     %r7,0(%r2,%r1)  // rp[i]=
+
+       lgr     %r8,%r6         // carry = high word
+       la      %r2,8(%r2)      // i++
+       brct    %r4,.Loop1_madd
+
+.Lend_madd:
+       lgr     %r2,%r8         // return carry
+       lmg     %r6,%r10,48(%r15)
+       br      %r14
+.size  bn_mul_add_words,.-bn_mul_add_words
+
+// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
+//
+// rp[i] = low word of (ap[i]*w + carry) for i in [0,len); the high word
+// becomes the carry into the next word and the last carry is returned.
+// Unlike bn_mul_add_words the previous contents of rp are not read.
+//   in:  %r2 = rp, %r3 = ap, %r4 = len, %r5 = w
+//   out: %r2 = final carry (0 when len<=0)
+// %r0 (zero) and %r1 are used as scratch; %r6-%r10 are stored to and
+// reloaded from 48(%r15).
+.globl bn_mul_words
+.type  bn_mul_words,@function
+.align 4
+bn_mul_words:
+       lghi    zero,0          // zero = 0
+       la      %r1,0(%r2)      // put rp aside, %r2 becomes the byte index
+       lghi    %r2,0           // i=0;
+       ltgfr   %r4,%r4         // sign-extend the int len and test it
+       bler    %r14            // if (len<=0) return 0;
+
+       stmg    %r6,%r10,48(%r15)
+       lghi    %r8,0           // carry = 0
+       srag    %r10,%r4,2      // cnt=len/4
+       jz      .Loop1_mul      // fewer than 4 words: only the 1-word loop
+
+// Four words per iteration; the carry alternates between %r8 and %r6
+// because each mlgr writes its product to an even/odd register pair.
+.Loop4_mul:
+       lg      %r7,0(%r2,%r3)  // ap[i]
+       mlgr    %r6,%r5         // *=w, %r6:%r7 = high:low
+       algr    %r7,%r8         // +=carry
+       alcgr   %r6,zero        // fold carry-out into high word
+       stg     %r7,0(%r2,%r1)  // rp[i]=
+
+       lg      %r9,8(%r2,%r3)  // ap[i+1], incoming carry is %r6
+       mlgr    %r8,%r5
+       algr    %r9,%r6
+       alcgr   %r8,zero
+       stg     %r9,8(%r2,%r1)
+
+       lg      %r7,16(%r2,%r3) // ap[i+2], incoming carry is %r8
+       mlgr    %r6,%r5
+       algr    %r7,%r8
+       alcgr   %r6,zero
+       stg     %r7,16(%r2,%r1)
+
+       lg      %r9,24(%r2,%r3) // ap[i+3], incoming carry is %r6
+       mlgr    %r8,%r5
+       algr    %r9,%r6
+       alcgr   %r8,zero        // carry for the next iteration ends up in %r8
+       stg     %r9,24(%r2,%r1)
+
+       la      %r2,32(%r2)     // i+=4
+       brct    %r10,.Loop4_mul
+
+       lghi    %r10,3
+       nr      %r4,%r10        // cnt=len%4
+       jz      .Lend_mul       // no leftover words
+
+// One word per iteration; carry is kept in %r8.
+.Loop1_mul:
+       lg      %r7,0(%r2,%r3)  // ap[i]
+       mlgr    %r6,%r5         // *=w
+       algr    %r7,%r8         // +=carry
+       alcgr   %r6,zero
+       stg     %r7,0(%r2,%r1)  // rp[i]=
+
+       lgr     %r8,%r6         // carry = high word
+       la      %r2,8(%r2)      // i++
+       brct    %r4,.Loop1_mul
+
+.Lend_mul:
+       lgr     %r2,%r8         // return carry
+       lmg     %r6,%r10,48(%r15)
+       br      %r14
+.size  bn_mul_words,.-bn_mul_words
+
+// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r3,int r4)
+//
+// For each of the len input words at %r3, stores the two-word square at
+// %r2: low word at r[2*i], high word at r[2*i+1]. Returns immediately when
+// len<=0.
+//   in:  %r2 = result (2*len words), %r3 = input, %r4 = len
+// %r1 is used as a counter; %r6/%r7 are stored to and reloaded from
+// 48(%r15).
+.globl bn_sqr_words
+.type  bn_sqr_words,@function
+.align 4
+bn_sqr_words:
+       ltgfr   %r4,%r4         // sign-extend the int len and test it
+       bler    %r14            // if (len<=0) return;
+
+       stmg    %r6,%r7,48(%r15)
+       srag    %r1,%r4,2       // cnt=len/4
+       jz      .Loop1_sqr      // fewer than 4 words: only the 1-word loop
+
+// Four input words (eight output words) per iteration.
+.Loop4_sqr:
+       lg      %r7,0(%r3)
+       mlgr    %r6,%r7         // %r6:%r7 = %r7*%r7 (high:low)
+       stg     %r7,0(%r2)      // low word
+       stg     %r6,8(%r2)      // high word
+
+       lg      %r7,8(%r3)
+       mlgr    %r6,%r7
+       stg     %r7,16(%r2)
+       stg     %r6,24(%r2)
+
+       lg      %r7,16(%r3)
+       mlgr    %r6,%r7
+       stg     %r7,32(%r2)
+       stg     %r6,40(%r2)
+
+       lg      %r7,24(%r3)
+       mlgr    %r6,%r7
+       stg     %r7,48(%r2)
+       stg     %r6,56(%r2)
+
+       la      %r3,32(%r3)     // advance input by 4 words
+       la      %r2,64(%r2)     // advance output by 8 words
+       brct    %r1,.Loop4_sqr
+
+       lghi    %r1,3
+       nr      %r4,%r1         // cnt=len%4
+       jz      .Lend_sqr       // no leftover words
+
+// One input word per iteration.
+.Loop1_sqr:
+       lg      %r7,0(%r3)
+       mlgr    %r6,%r7
+       stg     %r7,0(%r2)
+       stg     %r6,8(%r2)
+
+       la      %r3,8(%r3)
+       la      %r2,16(%r2)
+       brct    %r4,.Loop1_sqr
+
+.Lend_sqr:
+       lmg     %r6,%r7,48(%r15)
+       br      %r14
+.size  bn_sqr_words,.-bn_sqr_words
+
+// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
+//
+// Divides the two-word value h:l (%r2:%r3) by d (%r4) and returns the
+// quotient in %r2; the remainder is discarded.
+// NOTE(review): no check is made here for d==0 or for a quotient wider than
+// one word; presumably callers guarantee h<d - confirm against callers.
+.globl bn_div_words
+.type  bn_div_words,@function
+.align 4
+bn_div_words:
+       dlgr    %r2,%r4         // %r2 = remainder, %r3 = quotient
+       lgr     %r2,%r3         // return the quotient
+       br      %r14
+.size  bn_div_words,.-bn_div_words
+
+// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
+//
+// rp[i] = ap[i] + bp[i] + carry for i in [0,len); returns the final carry.
+//   in:  %r2 = rp, %r3 = ap, %r4 = bp, %r5 = len
+//   out: %r2 = carry out of the last word (0 when len<=0)
+// The carry is kept in the condition code for the whole run, so between the
+// alcg instructions only instructions that leave the condition code alone
+// (lg, stg, la, brct) are used. %r0 and %r1 are scratch; %r6 is stored to
+// and reloaded from 48(%r15).
+.globl bn_add_words
+.type  bn_add_words,@function
+.align 4
+bn_add_words:
+       la      %r1,0(%r2)      // put rp aside
+       lghi    %r2,0           // i=0
+       ltgfr   %r5,%r5         // sign-extend the int len and test it
+       bler    %r14            // if (len<=0) return 0;
+
+       stg     %r6,48(%r15)
+       lghi    %r6,3
+       nr      %r6,%r5         // len%4
+       sra     %r5,2           // len/4, use sra because it sets condition code
+       jz      .Loop1_add      // carry is incidentally cleared if branch taken
+       algr    %r2,%r2         // clear carry
+
+.Loop4_add:
+       lg      %r0,0(%r2,%r3)
+       alcg    %r0,0(%r2,%r4)
+       stg     %r0,0(%r2,%r1)
+       lg      %r0,8(%r2,%r3)
+       alcg    %r0,8(%r2,%r4)
+       stg     %r0,8(%r2,%r1)
+       lg      %r0,16(%r2,%r3)
+       alcg    %r0,16(%r2,%r4)
+       stg     %r0,16(%r2,%r1)
+       lg      %r0,24(%r2,%r3)
+       alcg    %r0,24(%r2,%r4)
+       stg     %r0,24(%r2,%r1)
+
+       la      %r2,32(%r2)     // i+=4
+       brct    %r5,.Loop4_add
+
+       la      %r6,1(%r6)      // see if len%4 is zero ...
+       brct    %r6,.Loop1_add  // without touching condition code:-)
+
+.Lexit_add:
+       lghi    %r2,0           // lghi leaves the pending carry intact
+       alcgr   %r2,%r2         // %r2 = 0+0+carry
+       lg      %r6,48(%r15)
+       br      %r14
+
+// Leftover words (len%4 of them, counted in %r6), one per iteration.
+.Loop1_add:
+       lg      %r0,0(%r2,%r3)
+       alcg    %r0,0(%r2,%r4)
+       stg     %r0,0(%r2,%r1)
+
+       la      %r2,8(%r2)      // i++
+       brct    %r6,.Loop1_add
+
+       j       .Lexit_add
+.size  bn_add_words,.-bn_add_words
+
+// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
+//
+// rp[i] = ap[i] - bp[i] - borrow for i in [0,len); returns the final borrow.
+//   in:  %r2 = rp, %r3 = ap, %r4 = bp, %r5 = len
+//   out: %r2 = borrow out of the last word (0 when len<=0)
+// The borrow is kept in the condition code for the whole run, so between
+// the slbg instructions only instructions that leave the condition code
+// alone (lg, stg, la, brct, j) are used. %r0 and %r1 are scratch; %r6 is
+// stored to and reloaded from 48(%r15).
+.globl bn_sub_words
+.type  bn_sub_words,@function
+.align 4
+bn_sub_words:
+       la      %r1,0(%r2)      // put rp aside
+       lghi    %r2,0           // i=0
+       ltgfr   %r5,%r5         // sign-extend the int len and test it
+       bler    %r14            // if (len<=0) return 0;
+
+       stg     %r6,48(%r15)
+       lghi    %r6,3
+       nr      %r6,%r5         // len%4
+       sra     %r5,2           // len/4, use sra because it sets condition code
+       jnz     .Loop4_sub      // borrow is incidentally cleared if branch taken
+       slgr    %r2,%r2         // clear borrow
+
+// Leftover words (len%4 of them, counted in %r6), one per iteration.
+.Loop1_sub:
+       lg      %r0,0(%r2,%r3)
+       slbg    %r0,0(%r2,%r4)
+       stg     %r0,0(%r2,%r1)
+
+       la      %r2,8(%r2)      // i++
+       brct    %r6,.Loop1_sub
+       j       .Lexit_sub
+
+.Loop4_sub:
+       lg      %r0,0(%r2,%r3)
+       slbg    %r0,0(%r2,%r4)
+       stg     %r0,0(%r2,%r1)
+       lg      %r0,8(%r2,%r3)
+       slbg    %r0,8(%r2,%r4)
+       stg     %r0,8(%r2,%r1)
+       lg      %r0,16(%r2,%r3)
+       slbg    %r0,16(%r2,%r4)
+       stg     %r0,16(%r2,%r1)
+       lg      %r0,24(%r2,%r3)
+       slbg    %r0,24(%r2,%r4)
+       stg     %r0,24(%r2,%r1)
+
+       la      %r2,32(%r2)     // i+=4
+       brct    %r5,.Loop4_sub
+
+       la      %r6,1(%r6)      // see if len%4 is zero ...
+       brct    %r6,.Loop1_sub  // without touching condition code:-)
+
+.Lexit_sub:
+       lghi    %r2,0           // lghi leaves the pending borrow intact
+       slbgr   %r2,%r2         // %r2 = 0-0-borrow, i.e. 0 or -1
+       lcgr    %r2,%r2         // negate: return 0 or 1
+       lg      %r6,48(%r15)
+       br      %r14
+.size  bn_sub_words,.-bn_sub_words
+
+// Registers holding the three-word column accumulator of the comba routines.
+#define c1     %r1
+#define c2     %r5
+#define c3     %r8
+
+// mul_add_c(ai,bi,c1,c2,c3): add the two-word product a[ai]*b[bi] to the
+// three-word accumulator c3:c2:c1 (c1 is the least significant word).
+// Expects a in %r3, b in %r4 and zero==0; overwrites %r6/%r7 with the
+// product. The macro parameters c1,c2,c3 are whichever accumulator
+// registers the caller passes, in the order low, middle, high.
+#define mul_add_c(ai,bi,c1,c2,c3)      \
+       lg      %r7,ai*8(%r3);          \
+       mlg     %r6,bi*8(%r4);          \
+       algr    c1,%r7;                 \
+       alcgr   c2,%r6;                 \
+       alcgr   c3,zero
+
+// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
+//
+// 8x8-word comba (column-wise) multiplication: r[0..15] = a[0..7]*b[0..7].
+//   in:  %r2 = r (16 words, written), %r3 = a, %r4 = b
+// For output word k every product a[i]*b[j] with i+j==k is added to a
+// three-word accumulator; the low word is then stored as r[k], cleared and
+// reused as the new top word, so the roles of c1/c2/c3 rotate per column.
+// Uses %r0,%r1,%r5 as scratch; %r6-%r8 are stored to and reloaded from
+// 48(%r15).
+.globl bn_mul_comba8
+.type  bn_mul_comba8,@function
+.align 4
+bn_mul_comba8:
+       stmg    %r6,%r8,48(%r15)
+
+       lghi    c1,0
+       lghi    c2,0
+       lghi    c3,0
+       lghi    zero,0
+
+       mul_add_c(0,0,c1,c2,c3);
+       stg     c1,0*8(%r2)     // r[0]
+       lghi    c1,0
+
+       mul_add_c(0,1,c2,c3,c1);
+       mul_add_c(1,0,c2,c3,c1);
+       stg     c2,1*8(%r2)     // r[1]
+       lghi    c2,0
+
+       mul_add_c(2,0,c3,c1,c2);
+       mul_add_c(1,1,c3,c1,c2);
+       mul_add_c(0,2,c3,c1,c2);
+       stg     c3,2*8(%r2)     // r[2]
+       lghi    c3,0
+
+       mul_add_c(0,3,c1,c2,c3);
+       mul_add_c(1,2,c1,c2,c3);
+       mul_add_c(2,1,c1,c2,c3);
+       mul_add_c(3,0,c1,c2,c3);
+       stg     c1,3*8(%r2)     // r[3]
+       lghi    c1,0
+
+       mul_add_c(4,0,c2,c3,c1);
+       mul_add_c(3,1,c2,c3,c1);
+       mul_add_c(2,2,c2,c3,c1);
+       mul_add_c(1,3,c2,c3,c1);
+       mul_add_c(0,4,c2,c3,c1);
+       stg     c2,4*8(%r2)     // r[4]
+       lghi    c2,0
+
+       mul_add_c(0,5,c3,c1,c2);
+       mul_add_c(1,4,c3,c1,c2);
+       mul_add_c(2,3,c3,c1,c2);
+       mul_add_c(3,2,c3,c1,c2);
+       mul_add_c(4,1,c3,c1,c2);
+       mul_add_c(5,0,c3,c1,c2);
+       stg     c3,5*8(%r2)     // r[5]
+       lghi    c3,0
+
+       mul_add_c(6,0,c1,c2,c3);
+       mul_add_c(5,1,c1,c2,c3);
+       mul_add_c(4,2,c1,c2,c3);
+       mul_add_c(3,3,c1,c2,c3);
+       mul_add_c(2,4,c1,c2,c3);
+       mul_add_c(1,5,c1,c2,c3);
+       mul_add_c(0,6,c1,c2,c3);
+       stg     c1,6*8(%r2)     // r[6]
+       lghi    c1,0
+
+       mul_add_c(0,7,c2,c3,c1);
+       mul_add_c(1,6,c2,c3,c1);
+       mul_add_c(2,5,c2,c3,c1);
+       mul_add_c(3,4,c2,c3,c1);
+       mul_add_c(4,3,c2,c3,c1);
+       mul_add_c(5,2,c2,c3,c1);
+       mul_add_c(6,1,c2,c3,c1);
+       mul_add_c(7,0,c2,c3,c1);
+       stg     c2,7*8(%r2)     // r[7]
+       lghi    c2,0
+
+       mul_add_c(7,1,c3,c1,c2);
+       mul_add_c(6,2,c3,c1,c2);
+       mul_add_c(5,3,c3,c1,c2);
+       mul_add_c(4,4,c3,c1,c2);
+       mul_add_c(3,5,c3,c1,c2);
+       mul_add_c(2,6,c3,c1,c2);
+       mul_add_c(1,7,c3,c1,c2);
+       stg     c3,8*8(%r2)     // r[8]
+       lghi    c3,0
+
+       mul_add_c(2,7,c1,c2,c3);
+       mul_add_c(3,6,c1,c2,c3);
+       mul_add_c(4,5,c1,c2,c3);
+       mul_add_c(5,4,c1,c2,c3);
+       mul_add_c(6,3,c1,c2,c3);
+       mul_add_c(7,2,c1,c2,c3);
+       stg     c1,9*8(%r2)     // r[9]
+       lghi    c1,0
+
+       mul_add_c(7,3,c2,c3,c1);
+       mul_add_c(6,4,c2,c3,c1);
+       mul_add_c(5,5,c2,c3,c1);
+       mul_add_c(4,6,c2,c3,c1);
+       mul_add_c(3,7,c2,c3,c1);
+       stg     c2,10*8(%r2)    // r[10]
+       lghi    c2,0
+
+       mul_add_c(4,7,c3,c1,c2);
+       mul_add_c(5,6,c3,c1,c2);
+       mul_add_c(6,5,c3,c1,c2);
+       mul_add_c(7,4,c3,c1,c2);
+       stg     c3,11*8(%r2)    // r[11]
+       lghi    c3,0
+
+       mul_add_c(7,5,c1,c2,c3);
+       mul_add_c(6,6,c1,c2,c3);
+       mul_add_c(5,7,c1,c2,c3);
+       stg     c1,12*8(%r2)    // r[12]
+       lghi    c1,0
+
+
+       mul_add_c(6,7,c2,c3,c1);
+       mul_add_c(7,6,c2,c3,c1);
+       stg     c2,13*8(%r2)    // r[13]
+       lghi    c2,0
+
+       mul_add_c(7,7,c3,c1,c2);
+       stg     c3,14*8(%r2)    // r[14]
+       stg     c1,15*8(%r2)    // r[15] = remaining high word
+
+       lmg     %r6,%r8,48(%r15)
+       br      %r14
+.size  bn_mul_comba8,.-bn_mul_comba8
+
+// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
+//
+// 4x4-word comba (column-wise) multiplication: r[0..7] = a[0..3]*b[0..3].
+//   in:  %r2 = r (8 words, written), %r3 = a, %r4 = b
+// For output word k every product a[i]*b[j] with i+j==k is added to a
+// three-word accumulator; the low word is then stored as r[k], cleared and
+// reused as the new top word, so the roles of c1/c2/c3 rotate per column.
+// Uses %r0,%r1,%r5 as scratch; %r6-%r8 are stored to and reloaded from
+// 48(%r15).
+.globl bn_mul_comba4
+.type  bn_mul_comba4,@function
+.align 4
+bn_mul_comba4:
+       stmg    %r6,%r8,48(%r15)
+
+       lghi    c1,0
+       lghi    c2,0
+       lghi    c3,0
+       lghi    zero,0
+
+       mul_add_c(0,0,c1,c2,c3);
+       stg     c1,0*8(%r2)     // r[0]; the result pointer is %r2, %r3 is a
+       lghi    c1,0
+
+       mul_add_c(0,1,c2,c3,c1);
+       mul_add_c(1,0,c2,c3,c1);
+       stg     c2,1*8(%r2)     // r[1]
+       lghi    c2,0
+
+       mul_add_c(2,0,c3,c1,c2);
+       mul_add_c(1,1,c3,c1,c2);
+       mul_add_c(0,2,c3,c1,c2);
+       stg     c3,2*8(%r2)     // r[2]
+       lghi    c3,0
+
+       mul_add_c(0,3,c1,c2,c3);
+       mul_add_c(1,2,c1,c2,c3);
+       mul_add_c(2,1,c1,c2,c3);
+       mul_add_c(3,0,c1,c2,c3);
+       stg     c1,3*8(%r2)     // r[3]
+       lghi    c1,0
+
+       mul_add_c(3,1,c2,c3,c1);
+       mul_add_c(2,2,c2,c3,c1);
+       mul_add_c(1,3,c2,c3,c1);
+       stg     c2,4*8(%r2)     // r[4]
+       lghi    c2,0
+
+       mul_add_c(2,3,c3,c1,c2);
+       mul_add_c(3,2,c3,c1,c2);
+       stg     c3,5*8(%r2)     // r[5]
+       lghi    c3,0
+
+       mul_add_c(3,3,c1,c2,c3);
+       stg     c1,6*8(%r2)     // r[6]
+       stg     c2,7*8(%r2)     // r[7] = remaining high word
+
+       lmg     %r6,%r8,48(%r15)        // reload %r6-%r8 saved on entry
+       br      %r14
+.size  bn_mul_comba4,.-bn_mul_comba4
+
+// sqr_add_c(ai,c1,c2,c3): add the two-word square a[ai]*a[ai] to the
+// three-word accumulator c3:c2:c1 (c1 is the least significant word).
+// Expects a in %r3 and zero==0; overwrites %r6/%r7 with the product.
+#define sqr_add_c(ai,c1,c2,c3)         \
+       lg      %r7,ai*8(%r3);          \
+       mlgr    %r6,%r7;                \
+       algr    c1,%r7;                 \
+       alcgr   c2,%r6;                 \
+       alcgr   c3,zero
+
+// sqr_add_c2(ai,aj,c1,c2,c3): add 2*a[ai]*a[aj] to the accumulator c3:c2:c1
+// by adding the two-word product twice, so no bit is lost to a shift.
+// Expects a in %r3 and zero==0; overwrites %r6/%r7 with the product.
+#define sqr_add_c2(ai,aj,c1,c2,c3)     \
+       lg      %r7,ai*8(%r3);          \
+       mlg     %r6,aj*8(%r3);          \
+       algr    c1,%r7;                 \
+       alcgr   c2,%r6;                 \
+       alcgr   c3,zero;                \
+       algr    c1,%r7;                 \
+       alcgr   c2,%r6;                 \
+       alcgr   c3,zero
+
+// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
+//
+// 8-word comba squaring: r[0..15] = a[0..7]^2.
+//   in:  %r2 = r (16 words, written), %r3 = a
+// For output word k the square a[k/2]^2 (even k only) is added once and
+// every cross product a[i]*a[j] with i>j, i+j==k is added twice; the low
+// accumulator word is then stored as r[k], cleared and reused as the new
+// top word. Uses %r0,%r1,%r5 as scratch; %r6-%r8 are stored to and
+// reloaded from 48(%r15).
+.globl bn_sqr_comba8
+.type  bn_sqr_comba8,@function
+.align 4
+bn_sqr_comba8:
+       stmg    %r6,%r8,48(%r15)
+
+       lghi    c1,0
+       lghi    c2,0
+       lghi    c3,0
+       lghi    zero,0
+
+       sqr_add_c(0,c1,c2,c3);
+       stg     c1,0*8(%r2)     // r[0]
+       lghi    c1,0
+
+       sqr_add_c2(1,0,c2,c3,c1);
+       stg     c2,1*8(%r2)     // r[1]
+       lghi    c2,0
+
+       sqr_add_c(1,c3,c1,c2);
+       sqr_add_c2(2,0,c3,c1,c2);
+       stg     c3,2*8(%r2)     // r[2]
+       lghi    c3,0
+
+       sqr_add_c2(3,0,c1,c2,c3);
+       sqr_add_c2(2,1,c1,c2,c3);
+       stg     c1,3*8(%r2)     // r[3]
+       lghi    c1,0
+
+       sqr_add_c(2,c2,c3,c1);
+       sqr_add_c2(3,1,c2,c3,c1);
+       sqr_add_c2(4,0,c2,c3,c1);
+       stg     c2,4*8(%r2)     // r[4]
+       lghi    c2,0
+
+       sqr_add_c2(5,0,c3,c1,c2);
+       sqr_add_c2(4,1,c3,c1,c2);
+       sqr_add_c2(3,2,c3,c1,c2);
+       stg     c3,5*8(%r2)     // r[5]
+       lghi    c3,0
+
+       sqr_add_c(3,c1,c2,c3);
+       sqr_add_c2(4,2,c1,c2,c3);
+       sqr_add_c2(5,1,c1,c2,c3);
+       sqr_add_c2(6,0,c1,c2,c3);
+       stg     c1,6*8(%r2)     // r[6]
+       lghi    c1,0
+
+       sqr_add_c2(7,0,c2,c3,c1);
+       sqr_add_c2(6,1,c2,c3,c1);
+       sqr_add_c2(5,2,c2,c3,c1);
+       sqr_add_c2(4,3,c2,c3,c1);
+       stg     c2,7*8(%r2)     // r[7]
+       lghi    c2,0
+
+       sqr_add_c(4,c3,c1,c2);
+       sqr_add_c2(5,3,c3,c1,c2);
+       sqr_add_c2(6,2,c3,c1,c2);
+       sqr_add_c2(7,1,c3,c1,c2);
+       stg     c3,8*8(%r2)     // r[8]
+       lghi    c3,0
+
+       sqr_add_c2(7,2,c1,c2,c3);
+       sqr_add_c2(6,3,c1,c2,c3);
+       sqr_add_c2(5,4,c1,c2,c3);
+       stg     c1,9*8(%r2)     // r[9]
+       lghi    c1,0
+
+       sqr_add_c(5,c2,c3,c1);
+       sqr_add_c2(6,4,c2,c3,c1);
+       sqr_add_c2(7,3,c2,c3,c1);
+       stg     c2,10*8(%r2)    // r[10]
+       lghi    c2,0
+
+       sqr_add_c2(7,4,c3,c1,c2);
+       sqr_add_c2(6,5,c3,c1,c2);
+       stg     c3,11*8(%r2)    // r[11]
+       lghi    c3,0
+
+       sqr_add_c(6,c1,c2,c3);
+       sqr_add_c2(7,5,c1,c2,c3);
+       stg     c1,12*8(%r2)    // r[12]
+       lghi    c1,0
+
+       sqr_add_c2(7,6,c2,c3,c1);
+       stg     c2,13*8(%r2)    // r[13]
+       lghi    c2,0
+
+       sqr_add_c(7,c3,c1,c2);
+       stg     c3,14*8(%r2)    // r[14]
+       stg     c1,15*8(%r2)    // r[15] = remaining high word
+
+       lmg     %r6,%r8,48(%r15)
+       br      %r14
+.size  bn_sqr_comba8,.-bn_sqr_comba8
+
+// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
+//
+// 4-word comba squaring: r[0..7] = a[0..3]^2.
+//   in:  %r2 = r (8 words, written), %r3 = a
+// For output word k the square a[k/2]^2 (even k only) is added once and
+// every cross product a[i]*a[j] with i>j, i+j==k is added twice; the low
+// accumulator word is then stored as r[k], cleared and reused as the new
+// top word. Uses %r0,%r1,%r5 as scratch; %r6-%r8 are stored to and
+// reloaded from 48(%r15).
+.globl bn_sqr_comba4
+.type  bn_sqr_comba4,@function
+.align 4
+bn_sqr_comba4:
+       stmg    %r6,%r8,48(%r15)
+
+       lghi    c1,0
+       lghi    c2,0
+       lghi    c3,0
+       lghi    zero,0
+
+       sqr_add_c(0,c1,c2,c3);
+       stg     c1,0*8(%r2)     // r[0]
+       lghi    c1,0
+
+       sqr_add_c2(1,0,c2,c3,c1);
+       stg     c2,1*8(%r2)     // r[1]
+       lghi    c2,0
+
+       sqr_add_c(1,c3,c1,c2);
+       sqr_add_c2(2,0,c3,c1,c2);
+       stg     c3,2*8(%r2)     // r[2]
+       lghi    c3,0
+
+       sqr_add_c2(3,0,c1,c2,c3);
+       sqr_add_c2(2,1,c1,c2,c3);
+       stg     c1,3*8(%r2)     // r[3]
+       lghi    c1,0
+
+       sqr_add_c(2,c2,c3,c1);
+       sqr_add_c2(3,1,c2,c3,c1);
+       stg     c2,4*8(%r2)     // r[4]
+       lghi    c2,0
+
+       sqr_add_c2(3,2,c3,c1,c2);
+       stg     c3,5*8(%r2)     // r[5]
+       lghi    c3,0
+
+       sqr_add_c(3,c1,c2,c3);
+       stg     c1,6*8(%r2)     // r[6]
+       stg     c2,7*8(%r2)     // r[7] = remaining high word
+
+       lmg     %r6,%r8,48(%r15)
+       br      %r14
+.size  bn_sqr_comba4,.-bn_sqr_comba4