is fixed now.
mov $rp,$bp # put rp aside
mov sp,$tp
mov sp,$ap
- srl $nj,62,AT # boundary condition...
- beq AT,.Lcopy # ... is met
mov 0,$hi0 # clear borrow bit
.align 4
mov $ap,$tp @ "borrow" $ap
sub $np,$np,$aj @ "rewind" $np to &np[0]
- movs $tj,$nj,lsr#30 @ boundary condition...
- beq .Lcopy @ ... is met
-
subs $tj,$tj,$tj @ "clear" carry flag
.Lsub: ldr $tj,[$tp],#4
ldr $nj,[$np],#4
PTR_ADD $tj,sp,$num # &tp[num]
move $tp,sp
move $ap,sp
-
- dsrl AT,$nj,62 # boundary condition...
- beqz AT,.Lcopy # ... is met
li $hi0,0 # clear borrow bit
.align 4
addi $i,$i,$BNSZ
ble- Louter
\f
- $SHRI. $nj,$nj,$BITS-2 ; check boundary condition
addi $num,$num,2 ; restore $num
subfc $j,$j,$j ; j=0 and "clear" XER[CA]
addi $tp,$sp,$FRAME
addi $ap,$sp,$FRAME
mtctr $num
- beq Lcopy ; boundary condition is met
.align 4
Lsub: $LDX $tj,$tp,$j
la $ap,8($fp)
lgr $j,$num
- #lg $nhi,-8($np) # buggy assembler
- lghi $count,-8 # buggy assembler
- lg $nhi,0($count,$np) # buggy assembler
- srag $nhi,$nhi,62 # boundary condition...
- jz .Lcopy # ... is met
-
lcgr $count,$num
sra $count,3 # incidentally clears "borrow"
.Lsub: lg $alo,0($j,$ap)
add $rp,$num,$rp
mov $tp,$ap
sub %g0,$num,%o7 ! k=-num
-
- srl $npj,30,%o0 ! boundary condition...
- brz,pn %o0,.Lcopy ! ... is met
- nop
-
ba .Lsub
subcc %g0,%g0,%g0 ! clear %icc.c
.align 16
bnz %icc,.Louter
nop
\f
- ld [$np-4],%o1
- subcc %g0,%g0,%g0 ! clear %icc.c
add $tp,8,$tp ! adjust tp to point at the end
- srl %o1,30,%o1 ! boundary condition...
orn %g0,%g0,%g4
- brz,pn %o1,.Lcopy ! ... is met
sub %g0,$num,%o7 ! n=-num
ba .Lsub
- nop
+ subcc %g0,%g0,%g0 ! clear %icc.c
.align 32
.Lsub:
&data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
&mov ("ecx","ebp");
- &xor ("edx","edx"); # i=0
&lea ("esi",&DWP(64,"esp")); # tp
# edi still points at the end of padded np copy...
- &mov ("eax",&DWP(-4-$pad,"edi")); # np[num-1]
&neg ("ebp");
&lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind"
&mov ("edi",$rp); # restore rp
-
- &shr ("eax",30); # boundary condition...
- &jz (&label("copy")); # ... is met
- &xor ("edx","edx"); # clear CF
+ &xor ("edx","edx"); # i=0 and clear CF
&set_label("sub",8);
&mov ("eax",&DWP(0,"esi","edx",4));
&mov ($np,$_np); # load modulus pointer
&mov ($rp,$_rp); # load result pointer
&lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
- &mov ("eax",&DWP(0,$np,$num,4)); # np[num-1]
- &shr ("eax",30); # check for boundary condition
- &jz (&label("copy"));
&mov ("eax",&DWP(0,$tp)); # tp[0]
&mov ($j,$num); # j=num-1
cmp $num,$i
jl .Louter
- mov -8($np,$num,8),%rax # np[num-1]
lea (%rsp),$ap # borrow ap for tp
- shr \$62,%rax # check for boundary condition
lea -1($num),$j # j=num-1
- jz .Lcopy
mov ($ap),%rax # tp[0]
xor $i,$i # i=0 and clear CF!
mov $i,(%rsp,$j,8) # zap temporary vector
dec $j
jge .Lcopy
-
+
mov 8(%rsp,$num,8),%rsp # restore %rsp
mov \$1,%rax
pop %r15
/* mont->ri will be a multiple of the word size and below code
* is kind of BN_rshift(ret,r,mont->ri) equivalent */
- if (r->top < ri)
+ if (r->top <= ri)
{
ret->top=0;
return(1);
rp=ret->d;
ap=&(r->d[ri]);
- nrp=ap;
- /* This 'if' denotes violation of 2*M<r^(n-1) boundary condition
- * formulated by C.D.Walter in "Montgomery exponentiation needs
- * no final subtractions." Incurred branch can disclose only
- * information about modulus length, which is not really secret. */
- if ((mont->N.d[ri-1]>>(BN_BITS2-2))!=0)
- {
- size_t m1,m2;
-
- v=bn_sub_words(rp,ap,mont->N.d,ri);
- /* this -----------------------^^ works even in al<ri case
- * thanks to zealous zeroing of top of the vector in the
- * beginning. */
-
- /* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */
- /* in other words if subtraction result is real, then
- * trick unconditional memcpy below to perform in-place
- * "refresh" instead of actual copy. */
- m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1); /* al<ri */
- m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1); /* al>ri */
- m1|=m2; /* (al!=ri) */
- m1|=(0-(size_t)v); /* (al!=ri || v) */
- m1&=~m2; /* (al!=ri || v) && !al>ri */
- nrp=(BN_ULONG *)(((size_t)rp&~m1)|((size_t)ap&m1));
- }
+ {
+ size_t m1,m2;
+
+ v=bn_sub_words(rp,ap,np,ri);
+ /* this ----------------^^ works even in al<ri case
+ * thanks to zealous zeroing of top of the vector in the
+ * beginning. */
+
+ /* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */
+ /* in other words if subtraction result is real, then
+ * trick unconditional memcpy below to perform in-place
+ * "refresh" instead of actual copy. */
+ m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1); /* al<ri */
+ m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1); /* al>ri */
+ m1|=m2; /* (al!=ri) */
+ m1|=(0-(size_t)v); /* (al!=ri || v) */
+ m1&=~m2; /* (al!=ri || v) && !al>ri */
+ nrp=(BN_ULONG *)(((size_t)rp&~m1)|((size_t)ap&m1));
+ }
/* 'i<ri' is chosen to eliminate dependency on input data, even
* though it results in redundant copy in al<ri case. */