+ $PUSH r14,-$SIZE_T*18($a0)
+ $PUSH r15,-$SIZE_T*17($a0)
+ $PUSH r16,-$SIZE_T*16($a0)
+ $PUSH r17,-$SIZE_T*15($a0)
+ $PUSH r18,-$SIZE_T*14($a0)
+ $PUSH r19,-$SIZE_T*13($a0)
+ $PUSH r20,-$SIZE_T*12($a0)
+ $PUSH r21,-$SIZE_T*11($a0)
+ $PUSH r22,-$SIZE_T*10($a0)
+ $PUSH r23,-$SIZE_T*9($a0)
+ $PUSH r24,-$SIZE_T*8($a0)
+ $PUSH r25,-$SIZE_T*7($a0)
+ $PUSH r26,-$SIZE_T*6($a0)
+ $PUSH r27,-$SIZE_T*5($a0)
+ $PUSH r28,-$SIZE_T*4($a0)
+ $PUSH r29,-$SIZE_T*3($a0)
+ $PUSH r30,-$SIZE_T*2($a0)
+ $PUSH r31,-$SIZE_T*1($a0)
+
+ subi $ap,$ap,$SIZE_T # bias by -1
+ subi $t0,$np,$SIZE_T # bias by -1
+ subi $rp,$rp,$SIZE_T # bias by -1
+ $LD $n0,0($n0) # *n0
+ li $zero,0
+
+ add $ap_end,$ap,$num
+ $LD $a0,$SIZE_T*1($ap)
+ #li $acc0,0
+ $LD $a1,$SIZE_T*2($ap)
+ li $acc1,0
+ $LD $a2,$SIZE_T*3($ap)
+ li $acc2,0
+ $LD $a3,$SIZE_T*4($ap)
+ li $acc3,0
+ $LD $a4,$SIZE_T*5($ap)
+ li $acc4,0
+ $LD $a5,$SIZE_T*6($ap)
+ li $acc5,0
+ $LD $a6,$SIZE_T*7($ap)
+ li $acc6,0
+ $LDU $a7,$SIZE_T*8($ap)
+ li $acc7,0
+
+ addi $tp,$sp,$SIZE_T*11 # &tp[-1]
+ subic. $cnt,$num,$SIZE_T*8
+ b .Lsqr8x_zero_start
+
+.align 5
+.Lsqr8x_zero:
+ subic. $cnt,$cnt,$SIZE_T*8
+ $ST $zero,$SIZE_T*1($tp)
+ $ST $zero,$SIZE_T*2($tp)
+ $ST $zero,$SIZE_T*3($tp)
+ $ST $zero,$SIZE_T*4($tp)
+ $ST $zero,$SIZE_T*5($tp)
+ $ST $zero,$SIZE_T*6($tp)
+ $ST $zero,$SIZE_T*7($tp)
+ $ST $zero,$SIZE_T*8($tp)
+.Lsqr8x_zero_start:
+ $ST $zero,$SIZE_T*9($tp)
+ $ST $zero,$SIZE_T*10($tp)
+ $ST $zero,$SIZE_T*11($tp)
+ $ST $zero,$SIZE_T*12($tp)
+ $ST $zero,$SIZE_T*13($tp)
+ $ST $zero,$SIZE_T*14($tp)
+ $ST $zero,$SIZE_T*15($tp)
+ $STU $zero,$SIZE_T*16($tp)
+ bne .Lsqr8x_zero
+
+ $PUSH $rp,$SIZE_T*6($sp) # offload &rp[-1]
+ $PUSH $t0,$SIZE_T*7($sp) # offload &np[-1]
+ $PUSH $n0,$SIZE_T*8($sp) # offload n0
+ $PUSH $tp,$SIZE_T*9($sp) # &tp[2*num-1]
+ $PUSH $zero,$SIZE_T*10($sp) # initial top-most carry
+ addi $tp,$sp,$SIZE_T*11 # &tp[-1]
+
+ # Multiply everything but a[i]*a[i]
+.align 5
+.Lsqr8x_outer_loop:
+ # a[1]a[0] (i)
+ # a[2]a[0]
+ # a[3]a[0]
+ # a[4]a[0]
+ # a[5]a[0]
+ # a[6]a[0]
+ # a[7]a[0]
+ # a[2]a[1] (ii)
+ # a[3]a[1]
+ # a[4]a[1]
+ # a[5]a[1]
+ # a[6]a[1]
+ # a[7]a[1]
+ # a[3]a[2] (iii)
+ # a[4]a[2]
+ # a[5]a[2]
+ # a[6]a[2]
+ # a[7]a[2]
+ # a[4]a[3] (iv)
+ # a[5]a[3]
+ # a[6]a[3]
+ # a[7]a[3]
+ # a[5]a[4] (v)
+ # a[6]a[4]
+ # a[7]a[4]
+ # a[6]a[5] (vi)
+ # a[7]a[5]
+ # a[7]a[6] (vii)
+
+ $UMULL $t0,$a1,$a0 # lo(a[1..7]*a[0]) (i)
+ $UMULL $t1,$a2,$a0
+ $UMULL $t2,$a3,$a0
+ $UMULL $t3,$a4,$a0
+ addc $acc1,$acc1,$t0 # t[1]+lo(a[1]*a[0])
+ $UMULL $t0,$a5,$a0
+ adde $acc2,$acc2,$t1
+ $UMULL $t1,$a6,$a0
+ adde $acc3,$acc3,$t2
+ $UMULL $t2,$a7,$a0
+ adde $acc4,$acc4,$t3
+ $UMULH $t3,$a1,$a0 # hi(a[1..7]*a[0])
+ adde $acc5,$acc5,$t0
+ $UMULH $t0,$a2,$a0
+ adde $acc6,$acc6,$t1
+ $UMULH $t1,$a3,$a0
+ adde $acc7,$acc7,$t2
+ $UMULH $t2,$a4,$a0
+ $ST $acc0,$SIZE_T*1($tp) # t[0]
+ addze $acc0,$zero # t[8]
+ $ST $acc1,$SIZE_T*2($tp) # t[1]
+ addc $acc2,$acc2,$t3 # t[2]+lo(a[1]*a[0])
+ $UMULH $t3,$a5,$a0
+ adde $acc3,$acc3,$t0
+ $UMULH $t0,$a6,$a0
+ adde $acc4,$acc4,$t1
+ $UMULH $t1,$a7,$a0
+ adde $acc5,$acc5,$t2
+ $UMULL $t2,$a2,$a1 # lo(a[2..7]*a[1]) (ii)
+ adde $acc6,$acc6,$t3
+ $UMULL $t3,$a3,$a1
+ adde $acc7,$acc7,$t0
+ $UMULL $t0,$a4,$a1
+ adde $acc0,$acc0,$t1
+
+ $UMULL $t1,$a5,$a1
+ addc $acc3,$acc3,$t2
+ $UMULL $t2,$a6,$a1
+ adde $acc4,$acc4,$t3
+ $UMULL $t3,$a7,$a1
+ adde $acc5,$acc5,$t0
+ $UMULH $t0,$a2,$a1 # hi(a[2..7]*a[1])
+ adde $acc6,$acc6,$t1
+ $UMULH $t1,$a3,$a1
+ adde $acc7,$acc7,$t2
+ $UMULH $t2,$a4,$a1
+ adde $acc0,$acc0,$t3
+ $UMULH $t3,$a5,$a1
+ $ST $acc2,$SIZE_T*3($tp) # t[2]
+ addze $acc1,$zero # t[9]
+ $ST $acc3,$SIZE_T*4($tp) # t[3]
+ addc $acc4,$acc4,$t0
+ $UMULH $t0,$a6,$a1
+ adde $acc5,$acc5,$t1
+ $UMULH $t1,$a7,$a1
+ adde $acc6,$acc6,$t2
+ $UMULL $t2,$a3,$a2 # lo(a[3..7]*a[2]) (iii)
+ adde $acc7,$acc7,$t3
+ $UMULL $t3,$a4,$a2
+ adde $acc0,$acc0,$t0
+ $UMULL $t0,$a5,$a2
+ adde $acc1,$acc1,$t1
+
+ $UMULL $t1,$a6,$a2
+ addc $acc5,$acc5,$t2
+ $UMULL $t2,$a7,$a2
+ adde $acc6,$acc6,$t3
+ $UMULH $t3,$a3,$a2 # hi(a[3..7]*a[2])
+ adde $acc7,$acc7,$t0
+ $UMULH $t0,$a4,$a2
+ adde $acc0,$acc0,$t1
+ $UMULH $t1,$a5,$a2
+ adde $acc1,$acc1,$t2
+ $UMULH $t2,$a6,$a2
+ $ST $acc4,$SIZE_T*5($tp) # t[4]
+ addze $acc2,$zero # t[10]
+ $ST $acc5,$SIZE_T*6($tp) # t[5]
+ addc $acc6,$acc6,$t3
+ $UMULH $t3,$a7,$a2
+ adde $acc7,$acc7,$t0
+ $UMULL $t0,$a4,$a3 # lo(a[4..7]*a[3]) (iv)
+ adde $acc0,$acc0,$t1
+ $UMULL $t1,$a5,$a3
+ adde $acc1,$acc1,$t2
+ $UMULL $t2,$a6,$a3
+ adde $acc2,$acc2,$t3
+
+ $UMULL $t3,$a7,$a3
+ addc $acc7,$acc7,$t0
+ $UMULH $t0,$a4,$a3 # hi(a[4..7]*a[3])
+ adde $acc0,$acc0,$t1
+ $UMULH $t1,$a5,$a3
+ adde $acc1,$acc1,$t2
+ $UMULH $t2,$a6,$a3
+ adde $acc2,$acc2,$t3
+ $UMULH $t3,$a7,$a3
+ $ST $acc6,$SIZE_T*7($tp) # t[6]
+ addze $acc3,$zero # t[11]
+ $STU $acc7,$SIZE_T*8($tp) # t[7]
+ addc $acc0,$acc0,$t0
+ $UMULL $t0,$a5,$a4 # lo(a[5..7]*a[4]) (v)
+ adde $acc1,$acc1,$t1
+ $UMULL $t1,$a6,$a4
+ adde $acc2,$acc2,$t2
+ $UMULL $t2,$a7,$a4
+ adde $acc3,$acc3,$t3
+
+ $UMULH $t3,$a5,$a4 # hi(a[5..7]*a[4])
+ addc $acc1,$acc1,$t0
+ $UMULH $t0,$a6,$a4
+ adde $acc2,$acc2,$t1
+ $UMULH $t1,$a7,$a4
+ adde $acc3,$acc3,$t2
+ $UMULL $t2,$a6,$a5 # lo(a[6..7]*a[5]) (vi)
+ addze $acc4,$zero # t[12]
+ addc $acc2,$acc2,$t3
+ $UMULL $t3,$a7,$a5
+ adde $acc3,$acc3,$t0
+ $UMULH $t0,$a6,$a5 # hi(a[6..7]*a[5])
+ adde $acc4,$acc4,$t1
+
+ $UMULH $t1,$a7,$a5
+ addc $acc3,$acc3,$t2
+ $UMULL $t2,$a7,$a6 # lo(a[7]*a[6]) (vii)
+ adde $acc4,$acc4,$t3
+ $UMULH $t3,$a7,$a6 # hi(a[7]*a[6])
+ addze $acc5,$zero # t[13]
+ addc $acc4,$acc4,$t0
+ $UCMP $ap_end,$ap # done yet?
+ adde $acc5,$acc5,$t1
+
+ addc $acc5,$acc5,$t2
+ sub $t0,$ap_end,$num # rewinded ap
+ addze $acc6,$zero # t[14]
+ add $acc6,$acc6,$t3
+
+ beq .Lsqr8x_outer_break
+
+ mr $n0,$a0
+ $LD $a0,$SIZE_T*1($tp)
+ $LD $a1,$SIZE_T*2($tp)
+ $LD $a2,$SIZE_T*3($tp)
+ $LD $a3,$SIZE_T*4($tp)
+ $LD $a4,$SIZE_T*5($tp)
+ $LD $a5,$SIZE_T*6($tp)
+ $LD $a6,$SIZE_T*7($tp)
+ $LD $a7,$SIZE_T*8($tp)
+ addc $acc0,$acc0,$a0
+ $LD $a0,$SIZE_T*1($ap)
+ adde $acc1,$acc1,$a1
+ $LD $a1,$SIZE_T*2($ap)
+ adde $acc2,$acc2,$a2
+ $LD $a2,$SIZE_T*3($ap)
+ adde $acc3,$acc3,$a3
+ $LD $a3,$SIZE_T*4($ap)
+ adde $acc4,$acc4,$a4
+ $LD $a4,$SIZE_T*5($ap)
+ adde $acc5,$acc5,$a5
+ $LD $a5,$SIZE_T*6($ap)
+ adde $acc6,$acc6,$a6
+ $LD $a6,$SIZE_T*7($ap)
+ subi $rp,$ap,$SIZE_T*7
+ addze $acc7,$a7
+ $LDU $a7,$SIZE_T*8($ap)
+ #addze $carry,$zero # moved below
+ li $cnt,0
+ b .Lsqr8x_mul
+
+ # a[8]a[0]
+ # a[9]a[0]
+ # a[a]a[0]
+ # a[b]a[0]
+ # a[c]a[0]
+ # a[d]a[0]
+ # a[e]a[0]
+ # a[f]a[0]
+ # a[8]a[1]
+ # a[f]a[1]........................
+ # a[8]a[2]
+ # a[f]a[2]........................
+ # a[8]a[3]
+ # a[f]a[3]........................
+ # a[8]a[4]
+ # a[f]a[4]........................
+ # a[8]a[5]
+ # a[f]a[5]........................
+ # a[8]a[6]
+ # a[f]a[6]........................
+ # a[8]a[7]
+ # a[f]a[7]........................
+.align 5
+.Lsqr8x_mul:
+ $UMULL $t0,$a0,$n0
+ addze $carry,$zero # carry bit, modulo-scheduled
+ $UMULL $t1,$a1,$n0
+ addi $cnt,$cnt,$SIZE_T
+ $UMULL $t2,$a2,$n0
+ andi. $cnt,$cnt,$SIZE_T*8-1
+ $UMULL $t3,$a3,$n0
+ addc $acc0,$acc0,$t0
+ $UMULL $t0,$a4,$n0
+ adde $acc1,$acc1,$t1
+ $UMULL $t1,$a5,$n0
+ adde $acc2,$acc2,$t2
+ $UMULL $t2,$a6,$n0
+ adde $acc3,$acc3,$t3
+ $UMULL $t3,$a7,$n0
+ adde $acc4,$acc4,$t0
+ $UMULH $t0,$a0,$n0
+ adde $acc5,$acc5,$t1
+ $UMULH $t1,$a1,$n0
+ adde $acc6,$acc6,$t2
+ $UMULH $t2,$a2,$n0
+ adde $acc7,$acc7,$t3
+ $UMULH $t3,$a3,$n0
+ addze $carry,$carry
+ $STU $acc0,$SIZE_T($tp)
+ addc $acc0,$acc1,$t0
+ $UMULH $t0,$a4,$n0
+ adde $acc1,$acc2,$t1
+ $UMULH $t1,$a5,$n0
+ adde $acc2,$acc3,$t2
+ $UMULH $t2,$a6,$n0
+ adde $acc3,$acc4,$t3
+ $UMULH $t3,$a7,$n0
+ $LDX $n0,$rp,$cnt
+ adde $acc4,$acc5,$t0
+ adde $acc5,$acc6,$t1
+ adde $acc6,$acc7,$t2
+ adde $acc7,$carry,$t3
+ #addze $carry,$zero # moved above
+ bne .Lsqr8x_mul
+ # note that carry flag is guaranteed
+ # to be zero at this point
+ $UCMP $ap,$ap_end # done yet?
+ beq .Lsqr8x_break
+
+ $LD $a0,$SIZE_T*1($tp)
+ $LD $a1,$SIZE_T*2($tp)
+ $LD $a2,$SIZE_T*3($tp)
+ $LD $a3,$SIZE_T*4($tp)
+ $LD $a4,$SIZE_T*5($tp)
+ $LD $a5,$SIZE_T*6($tp)
+ $LD $a6,$SIZE_T*7($tp)
+ $LD $a7,$SIZE_T*8($tp)
+ addc $acc0,$acc0,$a0
+ $LD $a0,$SIZE_T*1($ap)
+ adde $acc1,$acc1,$a1
+ $LD $a1,$SIZE_T*2($ap)
+ adde $acc2,$acc2,$a2
+ $LD $a2,$SIZE_T*3($ap)
+ adde $acc3,$acc3,$a3
+ $LD $a3,$SIZE_T*4($ap)
+ adde $acc4,$acc4,$a4
+ $LD $a4,$SIZE_T*5($ap)
+ adde $acc5,$acc5,$a5
+ $LD $a5,$SIZE_T*6($ap)
+ adde $acc6,$acc6,$a6
+ $LD $a6,$SIZE_T*7($ap)
+ adde $acc7,$acc7,$a7
+ $LDU $a7,$SIZE_T*8($ap)
+ #addze $carry,$zero # moved above
+ b .Lsqr8x_mul
+
+.align 5
+.Lsqr8x_break:
+ $LD $a0,$SIZE_T*8($rp)
+ addi $ap,$rp,$SIZE_T*15
+ $LD $a1,$SIZE_T*9($rp)
+ sub. $t0,$ap_end,$ap # is it last iteration?
+ $LD $a2,$SIZE_T*10($rp)
+ sub $t1,$tp,$t0
+ $LD $a3,$SIZE_T*11($rp)
+ $LD $a4,$SIZE_T*12($rp)
+ $LD $a5,$SIZE_T*13($rp)
+ $LD $a6,$SIZE_T*14($rp)
+ $LD $a7,$SIZE_T*15($rp)
+ beq .Lsqr8x_outer_loop
+
+ $ST $acc0,$SIZE_T*1($tp)
+ $LD $acc0,$SIZE_T*1($t1)
+ $ST $acc1,$SIZE_T*2($tp)
+ $LD $acc1,$SIZE_T*2($t1)
+ $ST $acc2,$SIZE_T*3($tp)
+ $LD $acc2,$SIZE_T*3($t1)
+ $ST $acc3,$SIZE_T*4($tp)
+ $LD $acc3,$SIZE_T*4($t1)
+ $ST $acc4,$SIZE_T*5($tp)
+ $LD $acc4,$SIZE_T*5($t1)
+ $ST $acc5,$SIZE_T*6($tp)
+ $LD $acc5,$SIZE_T*6($t1)
+ $ST $acc6,$SIZE_T*7($tp)
+ $LD $acc6,$SIZE_T*7($t1)
+ $ST $acc7,$SIZE_T*8($tp)
+ $LD $acc7,$SIZE_T*8($t1)
+ mr $tp,$t1
+ b .Lsqr8x_outer_loop
+
+.align 5
+.Lsqr8x_outer_break:
+ ####################################################################
+ # Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
+ $LD $a1,$SIZE_T*1($t0) # recall that $t0 is &a[-1]
+ $LD $a3,$SIZE_T*2($t0)
+ $LD $a5,$SIZE_T*3($t0)
+ $LD $a7,$SIZE_T*4($t0)
+ addi $ap,$t0,$SIZE_T*4
+ # "tp[x]" comments are for num==8 case
+ $LD $t1,$SIZE_T*13($sp) # =tp[1], t[0] is not interesting
+ $LD $t2,$SIZE_T*14($sp)
+ $LD $t3,$SIZE_T*15($sp)
+ $LD $t0,$SIZE_T*16($sp)
+
+ $ST $acc0,$SIZE_T*1($tp) # tp[8]=
+ srwi $cnt,$num,`log($SIZE_T)/log(2)+2`
+ $ST $acc1,$SIZE_T*2($tp)
+ subi $cnt,$cnt,1
+ $ST $acc2,$SIZE_T*3($tp)
+ $ST $acc3,$SIZE_T*4($tp)
+ $ST $acc4,$SIZE_T*5($tp)
+ $ST $acc5,$SIZE_T*6($tp)
+ $ST $acc6,$SIZE_T*7($tp)
+ #$ST $acc7,$SIZE_T*8($tp) # tp[15] is not interesting
+ addi $tp,$sp,$SIZE_T*11 # &tp[-1]
+ $UMULL $acc0,$a1,$a1
+ $UMULH $a1,$a1,$a1
+ add $acc1,$t1,$t1 # <<1
+ $SHRI $t1,$t1,$BITS-1
+ $UMULL $a2,$a3,$a3
+ $UMULH $a3,$a3,$a3
+ addc $acc1,$acc1,$a1
+ add $acc2,$t2,$t2
+ $SHRI $t2,$t2,$BITS-1
+ add $acc3,$t3,$t3
+ $SHRI $t3,$t3,$BITS-1
+ or $acc2,$acc2,$t1
+
+ mtctr $cnt
+.Lsqr4x_shift_n_add:
+ $UMULL $a4,$a5,$a5
+ $UMULH $a5,$a5,$a5
+ $LD $t1,$SIZE_T*6($tp) # =tp[5]
+ $LD $a1,$SIZE_T*1($ap)
+ adde $acc2,$acc2,$a2
+ add $acc4,$t0,$t0
+ $SHRI $t0,$t0,$BITS-1
+ or $acc3,$acc3,$t2
+ $LD $t2,$SIZE_T*7($tp) # =tp[6]
+ adde $acc3,$acc3,$a3
+ $LD $a3,$SIZE_T*2($ap)
+ add $acc5,$t1,$t1
+ $SHRI $t1,$t1,$BITS-1
+ or $acc4,$acc4,$t3
+ $LD $t3,$SIZE_T*8($tp) # =tp[7]
+ $UMULL $a6,$a7,$a7
+ $UMULH $a7,$a7,$a7
+ adde $acc4,$acc4,$a4
+ add $acc6,$t2,$t2
+ $SHRI $t2,$t2,$BITS-1
+ or $acc5,$acc5,$t0
+ $LD $t0,$SIZE_T*9($tp) # =tp[8]
+ adde $acc5,$acc5,$a5
+ $LD $a5,$SIZE_T*3($ap)
+ add $acc7,$t3,$t3
+ $SHRI $t3,$t3,$BITS-1
+ or $acc6,$acc6,$t1
+ $LD $t1,$SIZE_T*10($tp) # =tp[9]
+ $UMULL $a0,$a1,$a1
+ $UMULH $a1,$a1,$a1
+ adde $acc6,$acc6,$a6
+ $ST $acc0,$SIZE_T*1($tp) # tp[0]=
+ add $acc0,$t0,$t0
+ $SHRI $t0,$t0,$BITS-1
+ or $acc7,$acc7,$t2
+ $LD $t2,$SIZE_T*11($tp) # =tp[10]
+ adde $acc7,$acc7,$a7
+ $LDU $a7,$SIZE_T*4($ap)
+ $ST $acc1,$SIZE_T*2($tp) # tp[1]=
+ add $acc1,$t1,$t1
+ $SHRI $t1,$t1,$BITS-1
+ or $acc0,$acc0,$t3
+ $LD $t3,$SIZE_T*12($tp) # =tp[11]
+ $UMULL $a2,$a3,$a3
+ $UMULH $a3,$a3,$a3
+ adde $acc0,$acc0,$a0
+ $ST $acc2,$SIZE_T*3($tp) # tp[2]=
+ add $acc2,$t2,$t2
+ $SHRI $t2,$t2,$BITS-1
+ or $acc1,$acc1,$t0
+ $LD $t0,$SIZE_T*13($tp) # =tp[12]
+ adde $acc1,$acc1,$a1
+ $ST $acc3,$SIZE_T*4($tp) # tp[3]=
+ $ST $acc4,$SIZE_T*5($tp) # tp[4]=
+ $ST $acc5,$SIZE_T*6($tp) # tp[5]=
+ $ST $acc6,$SIZE_T*7($tp) # tp[6]=
+ $STU $acc7,$SIZE_T*8($tp) # tp[7]=
+ add $acc3,$t3,$t3
+ $SHRI $t3,$t3,$BITS-1
+ or $acc2,$acc2,$t1
+ bdnz .Lsqr4x_shift_n_add
+___
+my ($np,$np_end)=($ap,$ap_end);
+$code.=<<___;
+ $POP $np,$SIZE_T*7($sp) # pull &np[-1] and n0
+ $POP $n0,$SIZE_T*8($sp)
+
+ $UMULL $a4,$a5,$a5
+ $UMULH $a5,$a5,$a5
+ $ST $acc0,$SIZE_T*1($tp) # tp[8]=
+ $LD $acc0,$SIZE_T*12($sp) # =tp[0]
+ $LD $t1,$SIZE_T*6($tp) # =tp[13]
+ adde $acc2,$acc2,$a2
+ add $acc4,$t0,$t0
+ $SHRI $t0,$t0,$BITS-1
+ or $acc3,$acc3,$t2
+ $LD $t2,$SIZE_T*7($tp) # =tp[14]
+ adde $acc3,$acc3,$a3
+ add $acc5,$t1,$t1
+ $SHRI $t1,$t1,$BITS-1
+ or $acc4,$acc4,$t3
+ $UMULL $a6,$a7,$a7
+ $UMULH $a7,$a7,$a7
+ adde $acc4,$acc4,$a4
+ add $acc6,$t2,$t2
+ $SHRI $t2,$t2,$BITS-1
+ or $acc5,$acc5,$t0
+ $ST $acc1,$SIZE_T*2($tp) # tp[9]=
+ $LD $acc1,$SIZE_T*13($sp) # =tp[1]
+ adde $acc5,$acc5,$a5
+ or $acc6,$acc6,$t1
+ $LD $a0,$SIZE_T*1($np)
+ $LD $a1,$SIZE_T*2($np)
+ adde $acc6,$acc6,$a6
+ $LD $a2,$SIZE_T*3($np)
+ $LD $a3,$SIZE_T*4($np)
+ adde $acc7,$a7,$t2
+ $LD $a4,$SIZE_T*5($np)
+ $LD $a5,$SIZE_T*6($np)
+
+ ################################################################
+ # Reduce by 8 limbs per iteration
+ $UMULL $na0,$n0,$acc0 # t[0]*n0
+ li $cnt,8
+ $LD $a6,$SIZE_T*7($np)
+ add $np_end,$np,$num
+ $LDU $a7,$SIZE_T*8($np)
+ $ST $acc2,$SIZE_T*3($tp) # tp[10]=
+ $LD $acc2,$SIZE_T*14($sp)
+ $ST $acc3,$SIZE_T*4($tp) # tp[11]=
+ $LD $acc3,$SIZE_T*15($sp)
+ $ST $acc4,$SIZE_T*5($tp) # tp[12]=
+ $LD $acc4,$SIZE_T*16($sp)
+ $ST $acc5,$SIZE_T*6($tp) # tp[13]=
+ $LD $acc5,$SIZE_T*17($sp)
+ $ST $acc6,$SIZE_T*7($tp) # tp[14]=
+ $LD $acc6,$SIZE_T*18($sp)
+ $ST $acc7,$SIZE_T*8($tp) # tp[15]=
+ $LD $acc7,$SIZE_T*19($sp)
+ addi $tp,$sp,$SIZE_T*11 # &tp[-1]
+ mtctr $cnt
+ b .Lsqr8x_reduction
+
+.align 5
+.Lsqr8x_reduction:
+ # (*) $UMULL $t0,$a0,$na0 # lo(n[0-7])*lo(t[0]*n0)
+ $UMULL $t1,$a1,$na0
+ $UMULL $t2,$a2,$na0
+ $STU $na0,$SIZE_T($tp) # put aside t[0]*n0 for tail processing
+ $UMULL $t3,$a3,$na0
+ # (*) addc $acc0,$acc0,$t0
+ addic $acc0,$acc0,-1 # (*)
+ $UMULL $t0,$a4,$na0
+ adde $acc0,$acc1,$t1
+ $UMULL $t1,$a5,$na0
+ adde $acc1,$acc2,$t2
+ $UMULL $t2,$a6,$na0
+ adde $acc2,$acc3,$t3
+ $UMULL $t3,$a7,$na0
+ adde $acc3,$acc4,$t0
+ $UMULH $t0,$a0,$na0 # hi(n[0-7])*lo(t[0]*n0)
+ adde $acc4,$acc5,$t1
+ $UMULH $t1,$a1,$na0
+ adde $acc5,$acc6,$t2
+ $UMULH $t2,$a2,$na0
+ adde $acc6,$acc7,$t3
+ $UMULH $t3,$a3,$na0
+ addze $acc7,$zero
+ addc $acc0,$acc0,$t0
+ $UMULH $t0,$a4,$na0
+ adde $acc1,$acc1,$t1
+ $UMULH $t1,$a5,$na0
+ adde $acc2,$acc2,$t2
+ $UMULH $t2,$a6,$na0
+ adde $acc3,$acc3,$t3
+ $UMULH $t3,$a7,$na0
+ $UMULL $na0,$n0,$acc0 # next t[0]*n0
+ adde $acc4,$acc4,$t0
+ adde $acc5,$acc5,$t1
+ adde $acc6,$acc6,$t2
+ adde $acc7,$acc7,$t3
+ bdnz .Lsqr8x_reduction
+
+ $LD $t0,$SIZE_T*1($tp)
+ $LD $t1,$SIZE_T*2($tp)
+ $LD $t2,$SIZE_T*3($tp)
+ $LD $t3,$SIZE_T*4($tp)
+ subi $rp,$tp,$SIZE_T*7
+ $UCMP $np_end,$np # done yet?
+ addc $acc0,$acc0,$t0
+ $LD $t0,$SIZE_T*5($tp)
+ adde $acc1,$acc1,$t1
+ $LD $t1,$SIZE_T*6($tp)
+ adde $acc2,$acc2,$t2
+ $LD $t2,$SIZE_T*7($tp)
+ adde $acc3,$acc3,$t3
+ $LD $t3,$SIZE_T*8($tp)
+ adde $acc4,$acc4,$t0
+ adde $acc5,$acc5,$t1
+ adde $acc6,$acc6,$t2
+ adde $acc7,$acc7,$t3
+ #addze $carry,$zero # moved below
+ beq .Lsqr8x8_post_condition
+
+ $LD $n0,$SIZE_T*0($rp)
+ $LD $a0,$SIZE_T*1($np)
+ $LD $a1,$SIZE_T*2($np)
+ $LD $a2,$SIZE_T*3($np)
+ $LD $a3,$SIZE_T*4($np)
+ $LD $a4,$SIZE_T*5($np)
+ $LD $a5,$SIZE_T*6($np)
+ $LD $a6,$SIZE_T*7($np)
+ $LDU $a7,$SIZE_T*8($np)
+ li $cnt,0
+
+.align 5
+.Lsqr8x_tail:
+ $UMULL $t0,$a0,$n0
+ addze $carry,$zero # carry bit, modulo-scheduled
+ $UMULL $t1,$a1,$n0
+ addi $cnt,$cnt,$SIZE_T
+ $UMULL $t2,$a2,$n0
+ andi. $cnt,$cnt,$SIZE_T*8-1
+ $UMULL $t3,$a3,$n0
+ addc $acc0,$acc0,$t0
+ $UMULL $t0,$a4,$n0
+ adde $acc1,$acc1,$t1
+ $UMULL $t1,$a5,$n0
+ adde $acc2,$acc2,$t2
+ $UMULL $t2,$a6,$n0
+ adde $acc3,$acc3,$t3
+ $UMULL $t3,$a7,$n0
+ adde $acc4,$acc4,$t0
+ $UMULH $t0,$a0,$n0
+ adde $acc5,$acc5,$t1
+ $UMULH $t1,$a1,$n0
+ adde $acc6,$acc6,$t2
+ $UMULH $t2,$a2,$n0
+ adde $acc7,$acc7,$t3
+ $UMULH $t3,$a3,$n0
+ addze $carry,$carry
+ $STU $acc0,$SIZE_T($tp)
+ addc $acc0,$acc1,$t0
+ $UMULH $t0,$a4,$n0
+ adde $acc1,$acc2,$t1
+ $UMULH $t1,$a5,$n0
+ adde $acc2,$acc3,$t2
+ $UMULH $t2,$a6,$n0
+ adde $acc3,$acc4,$t3
+ $UMULH $t3,$a7,$n0
+ $LDX $n0,$rp,$cnt
+ adde $acc4,$acc5,$t0
+ adde $acc5,$acc6,$t1
+ adde $acc6,$acc7,$t2
+ adde $acc7,$carry,$t3
+ #addze $carry,$zero # moved above
+ bne .Lsqr8x_tail
+ # note that carry flag is guaranteed
+ # to be zero at this point
+ $LD $a0,$SIZE_T*1($tp)
+ $POP $carry,$SIZE_T*10($sp) # pull top-most carry in case we break
+ $UCMP $np_end,$np # done yet?
+ $LD $a1,$SIZE_T*2($tp)
+ sub $t2,$np_end,$num # rewinded np
+ $LD $a2,$SIZE_T*3($tp)
+ $LD $a3,$SIZE_T*4($tp)
+ $LD $a4,$SIZE_T*5($tp)
+ $LD $a5,$SIZE_T*6($tp)
+ $LD $a6,$SIZE_T*7($tp)
+ $LD $a7,$SIZE_T*8($tp)
+ beq .Lsqr8x_tail_break
+
+ addc $acc0,$acc0,$a0
+ $LD $a0,$SIZE_T*1($np)
+ adde $acc1,$acc1,$a1
+ $LD $a1,$SIZE_T*2($np)
+ adde $acc2,$acc2,$a2
+ $LD $a2,$SIZE_T*3($np)
+ adde $acc3,$acc3,$a3
+ $LD $a3,$SIZE_T*4($np)
+ adde $acc4,$acc4,$a4
+ $LD $a4,$SIZE_T*5($np)
+ adde $acc5,$acc5,$a5
+ $LD $a5,$SIZE_T*6($np)
+ adde $acc6,$acc6,$a6
+ $LD $a6,$SIZE_T*7($np)
+ adde $acc7,$acc7,$a7
+ $LDU $a7,$SIZE_T*8($np)
+ #addze $carry,$zero # moved above
+ b .Lsqr8x_tail
+
+.align 5
+.Lsqr8x_tail_break:
+ $POP $n0,$SIZE_T*8($sp) # pull n0
+ $POP $t3,$SIZE_T*9($sp) # &tp[2*num-1]
+ addi $cnt,$tp,$SIZE_T*8 # end of current t[num] window
+
+ addic $carry,$carry,-1 # "move" top-most carry to carry bit
+ adde $t0,$acc0,$a0
+ $LD $acc0,$SIZE_T*8($rp)
+ $LD $a0,$SIZE_T*1($t2) # recall that $t2 is &n[-1]
+ adde $t1,$acc1,$a1
+ $LD $acc1,$SIZE_T*9($rp)
+ $LD $a1,$SIZE_T*2($t2)
+ adde $acc2,$acc2,$a2
+ $LD $a2,$SIZE_T*3($t2)
+ adde $acc3,$acc3,$a3
+ $LD $a3,$SIZE_T*4($t2)
+ adde $acc4,$acc4,$a4
+ $LD $a4,$SIZE_T*5($t2)
+ adde $acc5,$acc5,$a5
+ $LD $a5,$SIZE_T*6($t2)
+ adde $acc6,$acc6,$a6
+ $LD $a6,$SIZE_T*7($t2)
+ adde $acc7,$acc7,$a7
+ $LD $a7,$SIZE_T*8($t2)
+ addi $np,$t2,$SIZE_T*8
+ addze $t2,$zero # top-most carry
+ $UMULL $na0,$n0,$acc0
+ $ST $t0,$SIZE_T*1($tp)
+ $UCMP $cnt,$t3 # did we hit the bottom?
+ $ST $t1,$SIZE_T*2($tp)
+ li $cnt,8
+ $ST $acc2,$SIZE_T*3($tp)
+ $LD $acc2,$SIZE_T*10($rp)
+ $ST $acc3,$SIZE_T*4($tp)
+ $LD $acc3,$SIZE_T*11($rp)
+ $ST $acc4,$SIZE_T*5($tp)
+ $LD $acc4,$SIZE_T*12($rp)
+ $ST $acc5,$SIZE_T*6($tp)
+ $LD $acc5,$SIZE_T*13($rp)
+ $ST $acc6,$SIZE_T*7($tp)
+ $LD $acc6,$SIZE_T*14($rp)
+ $ST $acc7,$SIZE_T*8($tp)
+ $LD $acc7,$SIZE_T*15($rp)
+ $PUSH $t2,$SIZE_T*10($sp) # off-load top-most carry
+ addi $tp,$rp,$SIZE_T*7 # slide the window
+ mtctr $cnt
+ bne .Lsqr8x_reduction
+
+ ################################################################
+ # Final step. We see if result is larger than modulus, and
+ # if it is, subtract the modulus. But comparison implies
+ # subtraction. So we subtract modulus, see if it borrowed,
+ # and conditionally copy original value.
+ $POP $rp,$SIZE_T*6($sp) # pull &rp[-1]
+ srwi $cnt,$num,`log($SIZE_T)/log(2)+3`
+ mr $n0,$tp # put tp aside
+ addi $tp,$tp,$SIZE_T*8
+ subi $cnt,$cnt,1
+ subfc $t0,$a0,$acc0
+ subfe $t1,$a1,$acc1
+ mr $carry,$t2
+ mr $ap_end,$rp # $rp copy
+
+ mtctr $cnt
+ b .Lsqr8x_sub
+
+.align 5
+.Lsqr8x_sub:
+ $LD $a0,$SIZE_T*1($np)
+ $LD $acc0,$SIZE_T*1($tp)
+ $LD $a1,$SIZE_T*2($np)
+ $LD $acc1,$SIZE_T*2($tp)
+ subfe $t2,$a2,$acc2
+ $LD $a2,$SIZE_T*3($np)
+ $LD $acc2,$SIZE_T*3($tp)
+ subfe $t3,$a3,$acc3
+ $LD $a3,$SIZE_T*4($np)
+ $LD $acc3,$SIZE_T*4($tp)
+ $ST $t0,$SIZE_T*1($rp)
+ subfe $t0,$a4,$acc4
+ $LD $a4,$SIZE_T*5($np)
+ $LD $acc4,$SIZE_T*5($tp)
+ $ST $t1,$SIZE_T*2($rp)
+ subfe $t1,$a5,$acc5
+ $LD $a5,$SIZE_T*6($np)
+ $LD $acc5,$SIZE_T*6($tp)
+ $ST $t2,$SIZE_T*3($rp)
+ subfe $t2,$a6,$acc6
+ $LD $a6,$SIZE_T*7($np)
+ $LD $acc6,$SIZE_T*7($tp)
+ $ST $t3,$SIZE_T*4($rp)
+ subfe $t3,$a7,$acc7
+ $LDU $a7,$SIZE_T*8($np)
+ $LDU $acc7,$SIZE_T*8($tp)
+ $ST $t0,$SIZE_T*5($rp)
+ subfe $t0,$a0,$acc0
+ $ST $t1,$SIZE_T*6($rp)
+ subfe $t1,$a1,$acc1
+ $ST $t2,$SIZE_T*7($rp)
+ $STU $t3,$SIZE_T*8($rp)
+ bdnz .Lsqr8x_sub
+
+ srwi $cnt,$num,`log($SIZE_T)/log(2)+2`
+ $LD $a0,$SIZE_T*1($ap_end) # original $rp
+ $LD $acc0,$SIZE_T*1($n0) # original $tp
+ subi $cnt,$cnt,1
+ $LD $a1,$SIZE_T*2($ap_end)
+ $LD $acc1,$SIZE_T*2($n0)
+ subfe $t2,$a2,$acc2
+ $LD $a2,$SIZE_T*3($ap_end)
+ $LD $acc2,$SIZE_T*3($n0)
+ subfe $t3,$a3,$acc3
+ $LD $a3,$SIZE_T*4($ap_end)
+ $LDU $acc3,$SIZE_T*4($n0)
+ $ST $t0,$SIZE_T*1($rp)
+ subfe $t0,$a4,$acc4
+ $ST $t1,$SIZE_T*2($rp)
+ subfe $t1,$a5,$acc5
+ $ST $t2,$SIZE_T*3($rp)
+ subfe $t2,$a6,$acc6
+ $ST $t3,$SIZE_T*4($rp)
+ subfe $t3,$a7,$acc7
+ $ST $t0,$SIZE_T*5($rp)
+ subfe $carry,$zero,$carry # did it borrow?
+ $ST $t1,$SIZE_T*6($rp)
+ $ST $t2,$SIZE_T*7($rp)
+ $ST $t3,$SIZE_T*8($rp)
+
+ addi $tp,$sp,$SIZE_T*11
+ mtctr $cnt
+
+.Lsqr4x_cond_copy:
+ andc $a0,$a0,$carry
+ $ST $zero,-$SIZE_T*3($n0) # wipe stack clean
+ and $acc0,$acc0,$carry
+ $ST $zero,-$SIZE_T*2($n0)
+ andc $a1,$a1,$carry
+ $ST $zero,-$SIZE_T*1($n0)
+ and $acc1,$acc1,$carry
+ $ST $zero,-$SIZE_T*0($n0)
+ andc $a2,$a2,$carry
+ $ST $zero,$SIZE_T*1($tp)
+ and $acc2,$acc2,$carry
+ $ST $zero,$SIZE_T*2($tp)
+ andc $a3,$a3,$carry
+ $ST $zero,$SIZE_T*3($tp)
+ and $acc3,$acc3,$carry
+ $STU $zero,$SIZE_T*4($tp)
+ or $t0,$a0,$acc0
+ $LD $a0,$SIZE_T*5($ap_end)
+ $LD $acc0,$SIZE_T*1($n0)
+ or $t1,$a1,$acc1
+ $LD $a1,$SIZE_T*6($ap_end)
+ $LD $acc1,$SIZE_T*2($n0)
+ or $t2,$a2,$acc2
+ $LD $a2,$SIZE_T*7($ap_end)
+ $LD $acc2,$SIZE_T*3($n0)
+ or $t3,$a3,$acc3
+ $LD $a3,$SIZE_T*8($ap_end)
+ $LDU $acc3,$SIZE_T*4($n0)
+ $ST $t0,$SIZE_T*1($ap_end)
+ $ST $t1,$SIZE_T*2($ap_end)
+ $ST $t2,$SIZE_T*3($ap_end)
+ $STU $t3,$SIZE_T*4($ap_end)
+ bdnz .Lsqr4x_cond_copy
+
+ $POP $ap,0($sp) # pull saved sp
+ andc $a0,$a0,$carry
+ and $acc0,$acc0,$carry
+ andc $a1,$a1,$carry
+ and $acc1,$acc1,$carry
+ andc $a2,$a2,$carry
+ and $acc2,$acc2,$carry
+ andc $a3,$a3,$carry
+ and $acc3,$acc3,$carry
+ or $t0,$a0,$acc0
+ or $t1,$a1,$acc1
+ or $t2,$a2,$acc2
+ or $t3,$a3,$acc3
+ $ST $t0,$SIZE_T*1($ap_end)
+ $ST $t1,$SIZE_T*2($ap_end)
+ $ST $t2,$SIZE_T*3($ap_end)
+ $ST $t3,$SIZE_T*4($ap_end)
+
+ b .Lsqr8x_done
+
+.align 5
+.Lsqr8x8_post_condition:
+ $POP $rp,$SIZE_T*6($sp) # pull rp
+ $POP $ap,0($sp) # pull saved sp
+ addze $carry,$zero
+
+ # $acc0-7,$carry hold result, $a0-7 hold modulus
+ subfc $acc0,$a0,$acc0
+ subfe $acc1,$a1,$acc1
+ $ST $zero,$SIZE_T*12($sp) # wipe stack clean
+ $ST $zero,$SIZE_T*13($sp)
+ subfe $acc2,$a2,$acc2
+ $ST $zero,$SIZE_T*14($sp)
+ $ST $zero,$SIZE_T*15($sp)
+ subfe $acc3,$a3,$acc3
+ $ST $zero,$SIZE_T*16($sp)
+ $ST $zero,$SIZE_T*17($sp)
+ subfe $acc4,$a4,$acc4
+ $ST $zero,$SIZE_T*18($sp)
+ $ST $zero,$SIZE_T*19($sp)
+ subfe $acc5,$a5,$acc5
+ $ST $zero,$SIZE_T*20($sp)
+ $ST $zero,$SIZE_T*21($sp)
+ subfe $acc6,$a6,$acc6
+ $ST $zero,$SIZE_T*22($sp)
+ $ST $zero,$SIZE_T*23($sp)
+ subfe $acc7,$a7,$acc7
+ $ST $zero,$SIZE_T*24($sp)
+ $ST $zero,$SIZE_T*25($sp)
+ subfe $carry,$zero,$carry # did it borrow?
+ $ST $zero,$SIZE_T*26($sp)
+ $ST $zero,$SIZE_T*27($sp)
+
+ and $a0,$a0,$carry
+ and $a1,$a1,$carry
+ addc $acc0,$acc0,$a0 # add modulus back if borrowed
+ and $a2,$a2,$carry
+ adde $acc1,$acc1,$a1
+ and $a3,$a3,$carry
+ adde $acc2,$acc2,$a2
+ and $a4,$a4,$carry
+ adde $acc3,$acc3,$a3
+ and $a5,$a5,$carry
+ adde $acc4,$acc4,$a4
+ and $a6,$a6,$carry
+ adde $acc5,$acc5,$a5
+ and $a7,$a7,$carry
+ adde $acc6,$acc6,$a6
+ adde $acc7,$acc7,$a7
+ $ST $acc0,$SIZE_T*1($rp)
+ $ST $acc1,$SIZE_T*2($rp)
+ $ST $acc2,$SIZE_T*3($rp)
+ $ST $acc3,$SIZE_T*4($rp)
+ $ST $acc4,$SIZE_T*5($rp)
+ $ST $acc5,$SIZE_T*6($rp)
+ $ST $acc6,$SIZE_T*7($rp)
+ $ST $acc7,$SIZE_T*8($rp)
+
+.Lsqr8x_done:
+ $PUSH $zero,$SIZE_T*8($sp)
+ $PUSH $zero,$SIZE_T*10($sp)
+
+ $POP r14,-$SIZE_T*18($ap)
+ li r3,1 # signal "done"
+ $POP r15,-$SIZE_T*17($ap)
+ $POP r16,-$SIZE_T*16($ap)
+ $POP r17,-$SIZE_T*15($ap)
+ $POP r18,-$SIZE_T*14($ap)
+ $POP r19,-$SIZE_T*13($ap)
+ $POP r20,-$SIZE_T*12($ap)
+ $POP r21,-$SIZE_T*11($ap)
+ $POP r22,-$SIZE_T*10($ap)
+ $POP r23,-$SIZE_T*9($ap)
+ $POP r24,-$SIZE_T*8($ap)
+ $POP r25,-$SIZE_T*7($ap)
+ $POP r26,-$SIZE_T*6($ap)
+ $POP r27,-$SIZE_T*5($ap)
+ $POP r28,-$SIZE_T*4($ap)
+ $POP r29,-$SIZE_T*3($ap)
+ $POP r30,-$SIZE_T*2($ap)
+ $POP r31,-$SIZE_T*1($ap)
+ mr $sp,$ap
+ blr
+ .long 0
+ .byte 0,12,4,0x20,0x80,18,6,0
+ .long 0
+.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
+___
+}
+$code.=<<___;