X-Git-Url: https://git.librecmc.org/?a=blobdiff_plain;f=crypto%2Fbn%2Fasm%2Fia64-mont.pl;h=5cc5c599f9dead3436450d23c735cbcdd4901247;hb=77b072504ec464eac5e0f9aab19cadb9c4e311d1;hp=af903a2bf8699ade29c50ea0aea340b90af0333d;hpb=4407700c40f766cedccc26479d18ff2e8bc19ff4;p=oweals%2Fopenssl.git

diff --git a/crypto/bn/asm/ia64-mont.pl b/crypto/bn/asm/ia64-mont.pl
index af903a2bf8..5cc5c599f9 100644
--- a/crypto/bn/asm/ia64-mont.pl
+++ b/crypto/bn/asm/ia64-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -38,15 +45,15 @@
 # So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
 # this module is:
 #                   sign    verify    sign/s verify/s
-# rsa  512 bits 0.000302s 0.000024s   3312.3  41332.2
-# rsa 1024 bits 0.000816s 0.000058s   1225.2  17172.0
+# rsa  512 bits 0.000290s 0.000024s   3452.8  42031.4
+# rsa 1024 bits 0.000793s 0.000058s   1261.7  17172.0
 # rsa 2048 bits 0.005908s 0.000148s    169.3   6754.0
 # rsa 4096 bits 0.033456s 0.000469s     29.9   2133.6
-# dsa  512 bits 0.000254s 0.000206s   3944.6   4865.1
+# dsa  512 bits 0.000253s 0.000198s   3949.9   5057.0
 # dsa 1024 bits 0.000585s 0.000607s   1708.4   1647.4
 # dsa 2048 bits 0.001453s 0.001703s    688.1    587.4
 #
-# ... and *without*:
+# ... and *without* (but still with ia64.S):
 #
 # rsa  512 bits 0.000670s 0.000041s   1491.8  24145.5
 # rsa 1024 bits 0.001988s 0.000080s    502.9  12499.3
@@ -56,9 +63,11 @@
 # dsa 1024 bits 0.000823s 0.000867s   1215.6   1153.2
 # dsa 2048 bits 0.001894s 0.002179s    528.1    458.9
 #
-# As it can be seen, RSA sign performance improves by 120-30%,
-# hereafter less for longer keys, while verify - by 72-13%.
-# DSA performance improves by 100-30%.
+# As it can be seen, RSA sign performance improves by 130-30%,
+# hereafter less for longer keys, while verify - by 74-13%.
+# DSA performance improves by 115-30%.
+
+$output=pop;
 
 if ($^O eq "hpux") {
 	$ADDP="addp4";
@@ -72,9 +81,9 @@ $code=<<___;
 // int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
 //		const BN_ULONG *bp,const BN_ULONG *np,
 //		const BN_ULONG *n0p,int num);
+.align 64
 .global bn_mul_mont#
 .proc bn_mul_mont#
-.align 64;;
 bn_mul_mont:
 	.prologue
 	.body
@@ -99,9 +108,9 @@ n0=f6;
 m0=f7;
 bi=f8;
 
+.align 64
 .local bn_mul_mont_general#
 .proc bn_mul_mont_general#
-.align 64;;
 bn_mul_mont_general:
 	.prologue
 { .mmi; .save ar.pfs,prevfs
@@ -353,7 +362,7 @@ bn_mul_mont_general:
 	mov ar.lc=prevlc }
 { .mib; .restore sp
 	mov sp=prevsp
-	mov pr=prevpr,-2
+	mov pr=prevpr,0x1ffff
 	br.ret.sptk.many b0 };;
 .endp bn_mul_mont_general#
 
@@ -364,10 +373,10 @@ t0=r15;
 ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
 ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
 
+.align 64
+.skip 48		// aligns loop body
 .local bn_mul_mont_8#
 .proc bn_mul_mont_8#
-.align 64
-.skip 48;;		// aligns loop body
 bn_mul_mont_8:
 	.prologue
 { .mmi; .save ar.pfs,prevfs
@@ -473,7 +482,7 @@ bn_mul_mont_8:
 	mov ar.lc=r28 }
 { .mfi; (p14)ldf8 bj[0]=[r30],16		// bp[7]
 	(p15)fcvt.fxu bj[0]=f0
-	mov ar.ec=2 }
+	mov ar.ec=1 }
 { .mfi; (p12)ldf8 ni6=[nptr],16		// np[6]
 	(p13)fcvt.fxu ni6=f0
 	mov pr.rot=1<<16 }
@@ -482,12 +491,12 @@ bn_mul_mont_8:
 	brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
 			};;
 
-// The loop is scheduled for 32*(n+1) ticks on Itanium 2. Actual
-// measurement with help of Interval Time Counter indicate that the
+// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
+// to measure with help of Interval Time Counter indicated that the
 // factor is a tad higher: 33 or 34, if not 35. Exact measurement and
 // addressing the issue is problematic, because I don't have access
 // to platform-specific instruction-level profiler. On Itanium it
-// should run in 56*(n+1) ticks, because of higher xma latency...
+// should run in 56*n ticks, because of higher xma latency...
 .Louter_8_ctop:
 	.pred.rel "mutex",p40,p42
 	.pred.rel "mutex",p48,p50
@@ -652,62 +661,149 @@ bn_mul_mont_8:
 	br.ctop.sptk.many .Louter_8_ctop
 			};;
 .Louter_8_cend:
-// move np[8] to GPR bank and subtract it from carrybit|tmp[8]
+// above loop has to execute one more time, without (p16), which is
+// replaced with merged move of np[8] to GPR bank
+	.pred.rel "mutex",p40,p42
+	.pred.rel "mutex",p48,p50
+{ .mmi; (p0) getf.sig n1=ni0			// 0:
+	(p40) add a3=a3,n3			// (p17) a3+=n3
+	(p42) add a3=a3,n3,1 };;
+{ .mii; (p17) getf.sig a7=alo[8]		// 1:
+	(p48) add t[6]=t[6],a3			// (p17) t[6]+=a3
+	(p50) add t[6]=t[6],a3,1 };;
+{ .mfi; (p17) getf.sig a8=ahi[8]		// 2:
+	(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6]	// np[6]*m0
+	(p40) cmp.ltu p43,p41=a3,n3 }
+{ .mfi; (p42) cmp.leu p43,p41=a3,n3
+	(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
+	(p0) nop.i 0 };;
+{ .mii; (p17) getf.sig n5=nlo[6]		// 3:
+	(p48) cmp.ltu p51,p49=t[6],a3
+	(p50) cmp.leu p51,p49=t[6],a3 };;
+	.pred.rel "mutex",p41,p43
+	.pred.rel "mutex",p49,p51
+{ .mmi; (p0) getf.sig n2=ni1			// 4:
+	(p41) add a4=a4,n4			// (p17) a4+=n4
+	(p43) add a4=a4,n4,1 };;
+{ .mfi; (p49) add t[5]=t[5],a4			// 5: (p17) t[5]+=a4
+	(p0) nop.f 0
+	(p51) add t[5]=t[5],a4,1 };;
+{ .mfi; (p0) getf.sig n3=ni2			// 6:
+	(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7]	// np[7]*m0
+	(p41) cmp.ltu p42,p40=a4,n4 }
+{ .mfi; (p43) cmp.leu p42,p40=a4,n4
+	(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
+	(p0) nop.i 0 };;
+{ .mii; (p17) getf.sig n6=nlo[7]		// 7:
+	(p49) cmp.ltu p50,p48=t[5],a4
+	(p51) cmp.leu p50,p48=t[5],a4 };;
+	.pred.rel "mutex",p40,p42
+	.pred.rel "mutex",p48,p50
+{ .mii; (p0) getf.sig n4=ni3			// 8:
+	(p40) add a5=a5,n5			// (p17) a5+=n5
+	(p42) add a5=a5,n5,1 };;
+{ .mii; (p0) nop.m 0				// 9:
+	(p48) add t[4]=t[4],a5			// p(17) t[4]+=a5
+	(p50) add t[4]=t[4],a5,1 };;
+{ .mii; (p0) nop.m 0				// 10:
+	(p40) cmp.ltu p43,p41=a5,n5
+	(p42) cmp.leu p43,p41=a5,n5 };;
+{ .mii; (p17) getf.sig n7=nlo[8]		// 11:
+	(p48) cmp.ltu p51,p49=t[4],a5
+	(p50) cmp.leu p51,p49=t[4],a5 };;
+	.pred.rel "mutex",p41,p43
+	.pred.rel "mutex",p49,p51
+{ .mii; (p17) getf.sig n8=nhi[8]		// 12:
+	(p41) add a6=a6,n6			// (p17) a6+=n6
+	(p43) add a6=a6,n6,1 };;
+{ .mii; (p0) getf.sig n5=ni4			// 13:
+	(p49) add t[3]=t[3],a6			// (p17) t[3]+=a6
+	(p51) add t[3]=t[3],a6,1 };;
+{ .mii; (p0) nop.m 0				// 14:
+	(p41) cmp.ltu p42,p40=a6,n6
+	(p43) cmp.leu p42,p40=a6,n6 };;
+{ .mii; (p0) getf.sig n6=ni5			// 15:
+	(p49) cmp.ltu p50,p48=t[3],a6
+	(p51) cmp.leu p50,p48=t[3],a6 };;
+	.pred.rel "mutex",p40,p42
+	.pred.rel "mutex",p48,p50
+{ .mii; (p0) nop.m 0				// 16:
+	(p40) add a7=a7,n7			// (p17) a7+=n7
+	(p42) add a7=a7,n7,1 };;
+{ .mii; (p0) nop.m 0				// 17:
+	(p48) add t[2]=t[2],a7			// (p17) t[2]+=a7
+	(p50) add t[2]=t[2],a7,1 };;
+{ .mii; (p0) nop.m 0				// 18:
+	(p40) cmp.ltu p43,p41=a7,n7
+	(p42) cmp.leu p43,p41=a7,n7 };;
+{ .mii; (p0) getf.sig n7=ni6			// 19:
+	(p48) cmp.ltu p51,p49=t[2],a7
+	(p50) cmp.leu p51,p49=t[2],a7 };;
+	.pred.rel "mutex",p41,p43
+	.pred.rel "mutex",p49,p51
+{ .mii; (p0) nop.m 0				// 20:
+	(p41) add a8=a8,n8			// (p17) a8+=n8
+	(p43) add a8=a8,n8,1 };;
+{ .mmi; (p0) nop.m 0				// 21:
+	(p49) add t[1]=t[1],a8			// (p17) t[1]+=a8
+	(p51) add t[1]=t[1],a8,1 }
+{ .mmi; (p17) mov t[0]=r0
+	(p41) cmp.ltu p42,p40=a8,n8
+	(p43) cmp.leu p42,p40=a8,n8 };;
+{ .mmi; (p0) getf.sig n8=ni7			// 22:
+	(p49) cmp.ltu p50,p48=t[1],a8
+	(p51) cmp.leu p50,p48=t[1],a8 }
+{ .mmi; (p42) add t[0]=t[0],r0,1
+	(p0) add r16=-7*16,prevsp
+	(p0) add r17=-6*16,prevsp };;
+
+// subtract np[8] from carrybit|tmp[8]
 // carrybit|tmp[8] layout upon exit from above loop is:
-//	t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t[0]|t0 (least significant)
-{ .mmi; getf.sig n1=ni0
-	getf.sig n2=ni1
-	add r16=-7*16,prevsp}
-{ .mmi; getf.sig n3=ni2
-	getf.sig n4=ni3
-	add r17=-6*16,prevsp};;
-{ .mmi; getf.sig n5=ni4
-	getf.sig n6=ni5
-	add r18=-5*16,prevsp}
-{ .mmi; getf.sig n7=ni6
-	getf.sig n8=ni7
+//	t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
+{ .mmi; (p50)add t[0]=t[0],r0,1
+	add r18=-5*16,prevsp
 	sub n1=t0,n1 };;
 { .mmi; cmp.gtu p34,p32=n1,t0;;
 	.pred.rel "mutex",p32,p34
-	(p32)sub n2=t[0],n2
-	(p34)sub n2=t[0],n2,1 };;
-{ .mii; (p32)cmp.gtu p35,p33=n2,t[0]
-	(p34)cmp.geu p35,p33=n2,t[0];;
+	(p32)sub n2=t[7],n2
+	(p34)sub n2=t[7],n2,1 };;
+{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
+	(p34)cmp.geu p35,p33=n2,t[7];;
 	.pred.rel "mutex",p33,p35
-	(p33)sub n3=t[7],n3 }
-{ .mmi; (p35)sub n3=t[7],n3,1;;
-	(p33)cmp.gtu p34,p32=n3,t[7]
-	(p35)cmp.geu p34,p32=n3,t[7] };;
+	(p33)sub n3=t[6],n3 }
+{ .mmi; (p35)sub n3=t[6],n3,1;;
+	(p33)cmp.gtu p34,p32=n3,t[6]
+	(p35)cmp.geu p34,p32=n3,t[6] };;
 	.pred.rel "mutex",p32,p34
-{ .mii; (p32)sub n4=t[6],n4
-	(p34)sub n4=t[6],n4,1;;
-	(p32)cmp.gtu p35,p33=n4,t[6] }
-{ .mmi; (p34)cmp.geu p35,p33=n4,t[6];;
+{ .mii; (p32)sub n4=t[5],n4
+	(p34)sub n4=t[5],n4,1;;
+	(p32)cmp.gtu p35,p33=n4,t[5] }
+{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
 	.pred.rel "mutex",p33,p35
-	(p33)sub n5=t[5],n5
-	(p35)sub n5=t[5],n5,1 };;
-{ .mii; (p33)cmp.gtu p34,p32=n5,t[5]
-	(p35)cmp.geu p34,p32=n5,t[5];;
+	(p33)sub n5=t[4],n5
+	(p35)sub n5=t[4],n5,1 };;
+{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
+	(p35)cmp.geu p34,p32=n5,t[4];;
 	.pred.rel "mutex",p32,p34
-	(p32)sub n6=t[4],n6 }
-{ .mmi; (p34)sub n6=t[4],n6,1;;
-	(p32)cmp.gtu p35,p33=n6,t[4]
-	(p34)cmp.geu p35,p33=n6,t[4] };;
+	(p32)sub n6=t[3],n6 }
+{ .mmi; (p34)sub n6=t[3],n6,1;;
+	(p32)cmp.gtu p35,p33=n6,t[3]
+	(p34)cmp.geu p35,p33=n6,t[3] };;
 	.pred.rel "mutex",p33,p35
-{ .mii; (p33)sub n7=t[3],n7
-	(p35)sub n7=t[3],n7,1;;
-	(p33)cmp.gtu p34,p32=n7,t[3] }
-{ .mmi; (p35)cmp.geu p34,p32=n7,t[3];;
+{ .mii; (p33)sub n7=t[2],n7
+	(p35)sub n7=t[2],n7,1;;
+	(p33)cmp.gtu p34,p32=n7,t[2] }
+{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
 	.pred.rel "mutex",p32,p34
-	(p32)sub n8=t[2],n8
-	(p34)sub n8=t[2],n8,1 };;
-{ .mii; (p32)cmp.gtu p35,p33=n8,t[2]
-	(p34)cmp.geu p35,p33=n8,t[2];;
+	(p32)sub n8=t[1],n8
+	(p34)sub n8=t[1],n8,1 };;
+{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
+	(p34)cmp.geu p35,p33=n8,t[1];;
 	.pred.rel "mutex",p33,p35
-	(p33)sub a8=t[1],r0 }
-{ .mmi; (p35)sub a8=t[1],r0,1;;
-	(p33)cmp.gtu p34,p32=a8,t[1]
-	(p35)cmp.geu p34,p32=a8,t[1] };;
+	(p33)sub a8=t[0],r0 }
+{ .mmi; (p35)sub a8=t[0],r0,1;;
+	(p33)cmp.gtu p34,p32=a8,t[0]
+	(p35)cmp.geu p34,p32=a8,t[0] };;
 
 // save the result, either tmp[num] or tmp[num]-np[num]
 	.pred.rel "mutex",p32,p34
@@ -715,25 +811,25 @@ bn_mul_mont_8:
 	(p34)st8 [rptr]=t0,8
 	add r19=-4*16,prevsp};;
 { .mmb; (p32)st8 [rptr]=n2,8
-	(p34)st8 [rptr]=t[0],8
+	(p34)st8 [rptr]=t[7],8
 	(p5)br.cond.dpnt.few .Ldone };;
 { .mmb; (p32)st8 [rptr]=n3,8
-	(p34)st8 [rptr]=t[7],8
+	(p34)st8 [rptr]=t[6],8
 	(p7)br.cond.dpnt.few .Ldone };;
 { .mmb; (p32)st8 [rptr]=n4,8
-	(p34)st8 [rptr]=t[6],8
+	(p34)st8 [rptr]=t[5],8
 	(p9)br.cond.dpnt.few .Ldone };;
 { .mmb; (p32)st8 [rptr]=n5,8
-	(p34)st8 [rptr]=t[5],8
+	(p34)st8 [rptr]=t[4],8
 	(p11)br.cond.dpnt.few .Ldone };;
 { .mmb; (p32)st8 [rptr]=n6,8
-	(p34)st8 [rptr]=t[4],8
+	(p34)st8 [rptr]=t[3],8
 	(p13)br.cond.dpnt.few .Ldone };;
 { .mmb; (p32)st8 [rptr]=n7,8
-	(p34)st8 [rptr]=t[3],8
+	(p34)st8 [rptr]=t[2],8
 	(p15)br.cond.dpnt.few .Ldone };;
 { .mmb; (p32)st8 [rptr]=n8,8
-	(p34)st8 [rptr]=t[2],8
+	(p34)st8 [rptr]=t[1],8
 	nop.b 0 };;
 .Ldone:				// epilogue
 { .mmi; ldf.fill f16=[r16],64
@@ -741,7 +837,7 @@ bn_mul_mont_8:
 	ldf.fill f17=[r17],64
 	nop.i 0 }
 { .mmi; ldf.fill f18=[r18],64
 	ldf.fill f19=[r19],64
-	mov pr=prevpr,-2 };;
+	mov pr=prevpr,0x1ffff };;
 { .mmi; ldf.fill f20=[r16]
 	ldf.fill f21=[r17]
 	mov ar.lc=prevlc }
@@ -759,6 +855,6 @@ copyright:
 stringz	"Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 
-$output=shift and open STDOUT,">$output";
+open STDOUT,">$output" if $output;
 print $code;
 close STDOUT;
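
For reference, below is a minimal C model of the operation the patched routine implements, i.e. the contract declared in the diff as "int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0p, int num)": rp = ap * bp * R^-1 mod np with R = 2^(64*num) and n0p[0] = -np[0]^-1 mod 2^64. This sketch is not part of the patch or of OpenSSL: it assumes a 64-bit BN_ULONG and a compiler with unsigned __int128 (gcc/clang), it uses the textbook CIOS word-by-word formulation rather than the xma-based software pipelining of the IA-64 code, and the name bn_mul_mont_model is made up for illustration. Its final step is the same conditional subtraction that the "subtract np[8] from carrybit|tmp[8]" tail above performs.

#include <stdint.h>
#include <string.h>

typedef uint64_t BN_ULONG;   /* assumption: 64-bit limbs, as on IA-64 */

static int bn_mul_mont_model(BN_ULONG *rp, const BN_ULONG *ap,
                             const BN_ULONG *bp, const BN_ULONG *np,
                             const BN_ULONG *n0p, int num)
{
    if (num <= 0)
        return 0;

    BN_ULONG t[num + 2];     /* accumulator: num limbs plus two carry limbs */
    BN_ULONG s[num];         /* candidate result after subtracting np[]     */
    memset(t, 0, sizeof(t));

    for (int i = 0; i < num; i++) {
        unsigned __int128 acc;
        BN_ULONG c = 0;

        /* t += ap[] * bp[i] */
        for (int j = 0; j < num; j++) {
            acc = (unsigned __int128)ap[j] * bp[i] + t[j] + c;
            t[j] = (BN_ULONG)acc;
            c = (BN_ULONG)(acc >> 64);
        }
        acc = (unsigned __int128)t[num] + c;
        t[num] = (BN_ULONG)acc;
        t[num + 1] = (BN_ULONG)(acc >> 64);

        /* m = t[0] * (-np[0]^-1 mod 2^64); then t = (t + m*np[]) / 2^64 */
        BN_ULONG m = t[0] * n0p[0];
        acc = (unsigned __int128)np[0] * m + t[0];   /* low limb becomes 0 */
        c = (BN_ULONG)(acc >> 64);
        for (int j = 1; j < num; j++) {
            acc = (unsigned __int128)np[j] * m + t[j] + c;
            t[j - 1] = (BN_ULONG)acc;                /* shift down one limb */
            c = (BN_ULONG)(acc >> 64);
        }
        acc = (unsigned __int128)t[num] + c;
        t[num - 1] = (BN_ULONG)acc;
        t[num] = t[num + 1] + (BN_ULONG)(acc >> 64);
    }

    /* Conditional subtraction, the counterpart of the assembly tail:
     * if carrybit|t >= np, store t - np, otherwise store t. */
    BN_ULONG borrow = 0;
    for (int j = 0; j < num; j++) {
        unsigned __int128 d = (unsigned __int128)t[j] - np[j] - borrow;
        s[j] = (BN_ULONG)d;
        borrow = (BN_ULONG)(d >> 64) & 1;
    }
    int geq = (t[num] != 0) || (borrow == 0);
    for (int j = 0; j < num; j++)
        rp[j] = geq ? s[j] : t[j];

    return 1;   /* mirrors the convention of returning non-zero when handled */
}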