+___
+} else {
+$code.=<<___;
+ fmadd $T1a,$A0,$bc,$T1a
+ fmadd $T1b,$A0,$bd,$T1b
+ addc $t0,$t0,$carry
+ adde $t1,$t1,$c1
+ srwi $carry,$t0,16
+ fmadd $T2a,$A1,$bc,$T2a
+ fmadd $T2b,$A1,$bd,$T2b
+ stfd $N0,40($nap_d) ; save n[j] in double format
+ stfd $N1,48($nap_d)
+ srwi $c1,$t1,16
+ insrwi $carry,$t1,16,0
+ fmadd $T3a,$A2,$bc,$T3a
+ fmadd $T3b,$A2,$bd,$T3b
+ addc $t2,$t2,$carry
+ adde $t3,$t3,$c1
+ srwi $carry,$t2,16
+ fmul $dota,$A3,$bc
+ fmul $dotb,$A3,$bd
+ stfd $N2,56($nap_d) ; save n[j+1] in double format
+ stfdu $N3,64($nap_d)
+ insrwi $t0,$t2,16,0 ; 0..31 bits
+ srwi $c1,$t3,16
+ insrwi $carry,$t3,16,0
+
+ fmadd $T1a,$N1,$na,$T1a
+ fmadd $T1b,$N1,$nb,$T1b
+ lwz $t3,`$FRAME+32`($sp) ; permuted $t1
+ lwz $t2,`$FRAME+36`($sp) ; permuted $t0
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+ srwi $carry,$t4,16
+ fmadd $T2a,$N2,$na,$T2a
+ fmadd $T2b,$N2,$nb,$T2b
+ srwi $c1,$t5,16
+ insrwi $carry,$t5,16,0
+ fmadd $T3a,$N3,$na,$T3a
+ fmadd $T3b,$N3,$nb,$T3b
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ fmadd $T0a,$N0,$na,$T0a
+ fmadd $T0b,$N0,$nb,$T0b
+ insrwi $t4,$t6,16,0 ; 32..63 bits
+ srwi $c1,$t7,16
+ insrwi $carry,$t7,16,0
+
+ fmadd $T1a,$N0,$nc,$T1a
+ fmadd $T1b,$N0,$nd,$T1b
+ lwz $t7,`$FRAME+40`($sp) ; permuted $t3
+ lwz $t6,`$FRAME+44`($sp) ; permuted $t2
+ addc $t2,$t2,$carry
+ adde $t3,$t3,$c1
+ srwi $carry,$t2,16
+ fmadd $T2a,$N1,$nc,$T2a
+ fmadd $T2b,$N1,$nd,$T2b
+ stw $t0,12($tp) ; tp[j-1]
+ stw $t4,8($tp)
+ srwi $c1,$t3,16
+ insrwi $carry,$t3,16,0
+ fmadd $T3a,$N2,$nc,$T3a
+ fmadd $T3b,$N2,$nd,$T3b
+ lwz $t1,`$FRAME+48`($sp) ; permuted $t5
+ lwz $t0,`$FRAME+52`($sp) ; permuted $t4
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ fmadd $dota,$N3,$nc,$dota
+ fmadd $dotb,$N3,$nd,$dotb
+ insrwi $t2,$t6,16,0 ; 64..95 bits
+ srwi $c1,$t7,16
+ insrwi $carry,$t7,16,0
+
+ fctid $T0a,$T0a
+ fctid $T0b,$T0b
+ lwz $t5,`$FRAME+56`($sp) ; permuted $t7
+ lwz $t4,`$FRAME+60`($sp) ; permuted $t6
+ addc $t0,$t0,$carry
+ adde $t1,$t1,$c1
+ srwi $carry,$t0,16
+ fctid $T1a,$T1a
+ fctid $T1b,$T1b
+ srwi $c1,$t1,16
+ insrwi $carry,$t1,16,0
+ fctid $T2a,$T2a
+ fctid $T2b,$T2b
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+ srwi $carry,$t4,16
+ fctid $T3a,$T3a
+ fctid $T3b,$T3b
+ insrwi $t0,$t4,16,0 ; 96..127 bits
+ srwi $c1,$t5,16
+ insrwi $carry,$t5,16,0
+
+ stfd $T0a,`$FRAME+0`($sp)
+ stfd $T0b,`$FRAME+8`($sp)
+ stfd $T1a,`$FRAME+16`($sp)
+ stfd $T1b,`$FRAME+24`($sp)
+ stfd $T2a,`$FRAME+32`($sp)
+ stfd $T2b,`$FRAME+40`($sp)
+ stfd $T3a,`$FRAME+48`($sp)
+ stfd $T3b,`$FRAME+56`($sp)
+ stw $t2,20($tp) ; tp[j]
+ stwu $t0,16($tp)
+___
+}
+$code.=<<___;