$TR= "td"; # conditional trap
} else { die "nonsense $opf"; }
-( defined shift || open STDOUT,">$opf" ) || die "can't open $opf: $!";
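+# locate ppc-xlate.pl, either alongside this script or in ../../perlasm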
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
-# function entry points from the AIX code
-#
-# There are other, more elegant, ways to handle this. We (IBM) chose
-# this approach as it plays well with scripts we run to 'namespace'
-# OpenSSL .i.e. we add a prefix to all the public symbols so we can
-# co-exist in the same process with other implementations of OpenSSL.
-# 'cleverer' ways of doing these substitutions tend to hide data we
-# need to be obvious.
-#
-my @items = ("bn_sqr_comba4",
- "bn_sqr_comba8",
- "bn_mul_comba4",
- "bn_mul_comba8",
- "bn_sub_words",
- "bn_add_words",
- "bn_div_words",
- "bn_sqr_words",
- "bn_mul_words",
- "bn_mul_add_words");
+( defined shift || open STDOUT,"| $^X $xlate $opf" ) ||
+ die "can't call $xlate: $!";
-if ($opf =~ /linux/) { do_linux(); }
-elsif ($opf =~ /aix/) { do_aix(); }
-elsif ($opf =~ /osx/) { do_osx(); }
-else { do_bsd(); }
-
-sub do_linux {
- $d=&data();
-
- if ($BITS==64) {
- foreach $t (@items) {
- $d =~ s/\.$t:/\
-\t.section\t".opd","aw"\
-\t.align\t3\
-\t.globl\t$t\
-$t:\
-\t.quad\t.$t,.TOC.\@tocbase,0\
-\t.size\t$t,24\
-\t.previous\n\
-\t.type\t.$t,\@function\
-\t.globl\t.$t\
-.$t:/g;
- }
- }
- else {
- foreach $t (@items) {
- $d=~s/\.$t/$t/g;
- }
- }
- # hide internal labels to avoid pollution of name table...
- $d=~s/Lppcasm_/.Lppcasm_/gm;
- print $d;
-}
-
-sub do_aix {
- # AIX assembler is smart enough to please the linker without
- # making us do something special...
- print &data();
-}
-
-# MacOSX 32 bit
-sub do_osx {
- $d=&data();
- # Change the bn symbol prefix from '.' to '_'
- foreach $t (@items) {
- $d=~s/\.$t/_$t/g;
- }
- # Change .machine to something OS X asm will accept
- $d=~s/\.machine.*/.text/g;
- $d=~s/\#/;/g; # change comment from '#' to ';'
- print $d;
-}
-
-# BSD (Untested)
-sub do_bsd {
- $d=&data();
- foreach $t (@items) {
- $d=~s/\.$t/_$t/g;
- }
- print $d;
-}
-
-sub data {
- local($data)=<<EOF;
+$data=<<EOF;
#--------------------------------------------------------------------
#
#
#
# Defines to be used in the assembly code.
#
-.set r0,0 # we use it as storage for value of 0
-.set SP,1 # preserved
-.set RTOC,2 # preserved
-.set r3,3 # 1st argument/return value
-.set r4,4 # 2nd argument/volatile register
-.set r5,5 # 3rd argument/volatile register
-.set r6,6 # ...
-.set r7,7
-.set r8,8
-.set r9,9
-.set r10,10
-.set r11,11
-.set r12,12
-.set r13,13 # not used, nor any other "below" it...
-
-.set BO_IF_NOT,4
-.set BO_IF,12
-.set BO_dCTR_NZERO,16
-.set BO_dCTR_ZERO,18
-.set BO_ALWAYS,20
-.set CR0_LT,0;
-.set CR0_GT,1;
-.set CR0_EQ,2
-.set CR1_FX,4;
-.set CR1_FEX,5;
-.set CR1_VX,6
-.set LR,8
+#.set r0,0 # we use it as storage for value of 0
+#.set SP,1 # preserved
+#.set RTOC,2 # preserved
+#.set r3,3 # 1st argument/return value
+#.set r4,4 # 2nd argument/volatile register
+#.set r5,5 # 3rd argument/volatile register
+#.set r6,6 # ...
+#.set r7,7
+#.set r8,8
+#.set r9,9
+#.set r10,10
+#.set r11,11
+#.set r12,12
+#.set r13,13 # not used, nor any other "below" it...
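+#
+#	The BO_*/CR* constants that used to be defined here are gone:
+#	branches now use the standard simplified mnemonics (blr, beq, bne,
+#	blt, bge, ble, bdz, bdnz) instead of raw "bc BO_...,CR..." forms.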
# Declare function names to be global
# NOTE: For gcc these names MUST be changed to remove
$ST r9,`6*$BNSZ`(r3) #r[6]=c1
$ST r10,`7*$BNSZ`(r3) #r[7]=c2
- bclr BO_ALWAYS,CR0_LT
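+	# blr replaces "bclr BO_ALWAYS,CR0_LT": BO_ALWAYS (20) meant
+	# "branch always", so this was an unconditional return all along.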
+ blr
.long 0x00000000
#
$ST r9, `15*$BNSZ`(r3) #r[15]=c1;
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
$ST r10,`6*$BNSZ`(r3) #r[6]=c1
$ST r11,`7*$BNSZ`(r3) #r[7]=c2
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
#
adde r10,r10,r9
$ST r12,`14*$BNSZ`(r3) #r[14]=c3;
$ST r10,`15*$BNSZ`(r3) #r[15]=c1;
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
#
subfc. r7,r0,r6 # If r6 is 0 then result is 0.
# if r6 > 0 then result !=0
# In either case carry bit is set.
- bc BO_IF,CR0_EQ,Lppcasm_sub_adios
+ beq Lppcasm_sub_adios
addi r4,r4,-$BNSZ
addi r3,r3,-$BNSZ
addi r5,r5,-$BNSZ
# if carry = 1 this is r7-r8. Else it
# is r7-r8 -1 as we need.
$STU r6,$BNSZ(r3)
- bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop
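+	# bdnz- replaces "bc BO_dCTR_NZERO,...": BO_dCTR_NZERO (16) decrements
+	# CTR and branches while it is non-zero; the '-' is a branch-prediction hint.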
+ bdnz- Lppcasm_sub_mainloop
Lppcasm_sub_adios:
subfze r3,r0 # if carry bit is set then r3 = 0 else -1
andi. r3,r3,1 # keep only last bit.
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
# check for r6 = 0. Is this needed?
#
addic. r6,r6,0 #test r6 and clear carry bit.
- bc BO_IF,CR0_EQ,Lppcasm_add_adios
+ beq Lppcasm_add_adios
addi r4,r4,-$BNSZ
addi r3,r3,-$BNSZ
addi r5,r5,-$BNSZ
$LDU r8,$BNSZ(r5)
adde r8,r7,r8
$STU r8,$BNSZ(r3)
- bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop
+ bdnz- Lppcasm_add_mainloop
Lppcasm_add_adios:
addze r3,r0 #return carry bit.
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
#
# r5 = d
$UCMPI 0,r5,0 # compare r5 and 0
- bc BO_IF_NOT,CR0_EQ,Lppcasm_div1 # proceed if d!=0
+ bne Lppcasm_div1 # proceed if d!=0
li r3,-1 # d=0 return -1
- bclr BO_ALWAYS,CR0_LT
+ blr
Lppcasm_div1:
xor r0,r0,r0 #r0=0
li r8,$BITS
$CNTLZ. r7,r5 #r7 = num leading 0s in d.
- bc BO_IF,CR0_EQ,Lppcasm_div2 #proceed if no leading zeros
+ beq Lppcasm_div2 #proceed if no leading zeros
subf r8,r7,r8 #r8 = BN_num_bits_word(d)
$SHR. r9,r3,r8 #are there any bits above r8'th?
$TR 16,r9,r0 #if there're, signal to dump core...
Lppcasm_div2:
$UCMP 0,r3,r5 #h>=d?
- bc BO_IF,CR0_LT,Lppcasm_div3 #goto Lppcasm_div3 if not
+ blt Lppcasm_div3 #goto Lppcasm_div3 if not
subf r3,r5,r3 #h-=d ;
Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i
cmpi 0,0,r7,0 # is (i == 0)?
- bc BO_IF,CR0_EQ,Lppcasm_div4
+ beq Lppcasm_div4
$SHL r3,r3,r7 # h = (h<< i)
$SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i)
$SHL r5,r5,r7 # d<<=i
$SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
# compute here for innerloop.
$UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh
- bc BO_IF_NOT,CR0_EQ,Lppcasm_div5 # goto Lppcasm_div5 if not
+ bne Lppcasm_div5 # goto Lppcasm_div5 if not
li r8,-1
$CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
# the following 2 instructions do that
$SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4)
or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4)
- $UCMP 1,r6,r7 # compare (tl <= r7)
- bc BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit
- bc BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit
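+	# cr1 replaces CR field "1"; CR1_FX (4) and CR1_FEX (5) were just the
+	# LT and GT bit positions of cr1, so the BO_IF_NOT tests become
+	# bge cr1 and ble cr1 respectively.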
+ $UCMP cr1,r6,r7 # compare (tl <= r7)
+ bne Lppcasm_divinnerexit
+ ble cr1,Lppcasm_divinnerexit
addi r8,r8,-1 #q--
subf r12,r9,r12 #th -=dh
$CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop.
Lppcasm_divinnerexit:
$SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4)
$SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h;
- $UCMP 1,r4,r11 # compare l and tl
+ $UCMP cr1,r4,r11 # compare l and tl
add r12,r12,r10 # th+=t
- bc BO_IF_NOT,CR1_FX,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7
+ bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7
addi r12,r12,1 # th++
Lppcasm_div7:
subf r11,r11,r4 #r11=l-tl
- $UCMP 1,r3,r12 #compare h and th
- bc BO_IF_NOT,CR1_FX,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8
+ $UCMP cr1,r3,r12 #compare h and th
+ bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8
addi r8,r8,-1 # q--
add r3,r5,r3 # h+=d
Lppcasm_div8:
# the following 2 instructions will do this.
$INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2.
$ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3
- bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9#if (count==0) break ;
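+	# bdz replaces "bc BO_dCTR_ZERO,...": BO_dCTR_ZERO (18) decrements CTR
+	# and branches once it reaches zero.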
+ bdz Lppcasm_div9 #if (count==0) break ;
$SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4
b Lppcasm_divouterloop
Lppcasm_div9:
or r3,r8,r0
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
#
# No unrolling done here. Not performance critical.
addic. r5,r5,0 #test r5.
- bc BO_IF,CR0_EQ,Lppcasm_sqr_adios
+ beq Lppcasm_sqr_adios
addi r4,r4,-$BNSZ
addi r3,r3,-$BNSZ
mtctr r5
$UMULH r8,r6,r6
$STU r7,$BNSZ(r3)
$STU r8,$BNSZ(r3)
- bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sqr_mainloop
+ bdnz- Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
xor r0,r0,r0
xor r12,r12,r12 # used for carry
rlwinm. r7,r5,30,2,31 # num >> 2
- bc BO_IF,CR0_EQ,Lppcasm_mw_REM
+ beq Lppcasm_mw_REM
mtctr r7
Lppcasm_mw_LOOP:
#mul(rp[0],ap[0],w,c1);
addi r3,r3,`4*$BNSZ`
addi r4,r4,`4*$BNSZ`
- bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP
+ bdnz- Lppcasm_mw_LOOP
Lppcasm_mw_REM:
andi. r5,r5,0x3
- bc BO_IF,CR0_EQ,Lppcasm_mw_OVER
+ beq Lppcasm_mw_OVER
#mul(rp[0],ap[0],w,c1);
$LD r8,`0*$BNSZ`(r4)
$UMULL r9,r6,r8
addi r5,r5,-1
cmpli 0,0,r5,0
- bc BO_IF,CR0_EQ,Lppcasm_mw_OVER
+ beq Lppcasm_mw_OVER
#mul(rp[1],ap[1],w,c1);
addi r5,r5,-1
cmpli 0,0,r5,0
- bc BO_IF,CR0_EQ,Lppcasm_mw_OVER
+ beq Lppcasm_mw_OVER
#mul_add(rp[2],ap[2],w,c1);
$LD r8,`2*$BNSZ`(r4)
Lppcasm_mw_OVER:
addi r3,r12,0
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
#
xor r0,r0,r0 #r0 = 0
xor r12,r12,r12 #r12 = 0 . used for carry
rlwinm. r7,r5,30,2,31 # num >> 2
- bc BO_IF,CR0_EQ,Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover
+ beq Lppcasm_maw_leftover # if (num < 4) goto Lppcasm_maw_leftover
mtctr r7
Lppcasm_maw_mainloop:
#mul_add(rp[0],ap[0],w,c1);
$ST r11,`3*$BNSZ`(r3)
addi r3,r3,`4*$BNSZ`
addi r4,r4,`4*$BNSZ`
- bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop
+ bdnz- Lppcasm_maw_mainloop
Lppcasm_maw_leftover:
andi. r5,r5,0x3
- bc BO_IF,CR0_EQ,Lppcasm_maw_adios
+ beq Lppcasm_maw_adios
addi r3,r3,-$BNSZ
addi r4,r4,-$BNSZ
#mul_add(rp[0],ap[0],w,c1);
addze r12,r10
$ST r9,0(r3)
- bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
+ bdz Lppcasm_maw_adios
#mul_add(rp[1],ap[1],w,c1);
$LDU r8,$BNSZ(r4)
$UMULL r9,r6,r8
addze r12,r10
$ST r9,0(r3)
- bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
+ bdz Lppcasm_maw_adios
#mul_add(rp[2],ap[2],w,c1);
$LDU r8,$BNSZ(r4)
$UMULL r9,r6,r8
Lppcasm_maw_adios:
addi r3,r12,0
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
.align 4
EOF
- $data =~ s/\`([^\`]*)\`/eval $1/gem;
-
- # if some assembler chokes on some simplified mnemonic,
- # this is the spot to fix it up, e.g.:
- # GNU as doesn't seem to accept cmplw, 32-bit unsigned compare
- $data =~ s/^(\s*)cmplw(\s+)([^,]+),(.*)/$1cmpl$2$3,0,$4/gm;
- # assembler X doesn't accept li, load immediate value
- #$data =~ s/^(\s*)li(\s+)([^,]+),(.*)/$1addi$2$3,0,$4/gm;
- return($data);
-}
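+# evaluate the back-quoted expressions (such as the 6*$BNSZ offsets)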
+$data =~ s/\`([^\`]*)\`/eval $1/gem;
+print $data;
+close STDOUT;