This module is used only with odd input lengths, i.e. not used in normal
PKI cases, on contemporary processors. The problem was "illuminated" by
fuzzing tests.
Reviewed-by: Richard Levitte <levitte@openssl.org>
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure.
########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure.
########
-$sbit="%i2"; # re-use $bp!
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
ld [$tp+8],$tpj ! tp[j]
add $acc0,$acc0,$acc0
add $j,4,$j ! j++
ld [$tp+8],$tpj ! tp[j]
add $acc0,$acc0,$acc0
add $j,4,$j ! j++
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
cmp $j,$num
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
cmp $j,$num
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc1,$car1,$car1
add $acc0,$acc0,$acc0
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc1,$car1,$car1
add $acc0,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
srlx $car1,32,$car1
add $car0,$car0,$car0
srlx $car1,32,$car1
add $car0,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
.Lsqr_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
.Lsqr_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
ld [$tp+8],$tpj ! tp[j]
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
ld [$tp+8],$tpj ! tp[j]
add $j,4,$j ! j++
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $j,4,$j ! j++
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
.Lsqr_no_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
.Lsqr_no_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
add $acc0,$car0,$car0
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
srlx $car1,32,$car1
add $car0,$car0,$car0
srlx $car1,32,$car1
add $car0,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
!.Lsqr_last
mulx $npj,$mul1,$acc1
!.Lsqr_last
mulx $npj,$mul1,$acc1
+ add $tpj,$acc0,$acc0
+ srlx $acc0,32,$tmp0
+ and $acc0,$mask,$acc0
+ add $tmp0,$sbit,$sbit
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $car0,$car0,$car0 ! recover $car0
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $car0,$car0,$car0 ! recover $car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]