"Do not load certificates from the default certificates directory"},
{"content", OPT_CONTENT, '<',
"Supply or override content for detached signature"},
- {"print", OPT_PRINT, '-',
+ {"print", OPT_PRINT, '-',
"For the -cmsout operation print out all fields of the CMS structure"},
{"secretkey", OPT_SECRETKEY, 's'},
{"secretkeyid", OPT_SECRETKEYID, 's'},
{"no-CApath", OPT_NOCAPATH, '-',
"Do not load certificates from the default certificates directory"},
{"resign", OPT_RESIGN, '-', "Resign a signed message"},
- {"nochain", OPT_NOCHAIN, '-',
+ {"nochain", OPT_NOCHAIN, '-',
"set PKCS7_NOCHAIN so certificates contained in the message are not used as untrusted CAs" },
{"nosmimecap", OPT_NOSMIMECAP, '-', "Omit the SMIMECapabilities attribute"},
{"stream", OPT_STREAM, '-', "Enable CMS streaming" },
continue;
#endif
- ret = ASYNC_start_job(&loopargs[i].inprogress_job,
- loopargs[i].wait_ctx, &job_op_count, loop_function,
+ ret = ASYNC_start_job(&loopargs[i].inprogress_job,
+ loopargs[i].wait_ctx, &job_op_count, loop_function,
(void *)(loopargs + i), sizeof(loopargs_t));
switch (ret) {
case ASYNC_PAUSE:
# words every cache-line is *guaranteed* to be accessed within ~50
# cycles window. Why just SSE? Because it's needed on hyper-threading
# CPU! Which is also why it's prefetched with 64 byte stride. Best
-# part is that it has no negative effect on performance:-)
+# part is that it has no negative effect on performance:-)
#
# Version 4.3 implements switch between compact and non-compact block
# functions in AES_cbc_encrypt depending on how much data was asked
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
# | mm4 | mm0 |
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-# | s3 | s2 | s1 | s0 |
+# | s3 | s2 | s1 | s0 |
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
- else { &mov ($tmp,$s[3]);
+ else { &mov ($tmp,$s[3]);
&shr ($tmp,24) }
&xor ($out,&DWP(1,$te,$tmp,8));
if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
&pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4
&pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);
&pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4
- &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
&pxor ("mm3","mm3"); &pxor ("mm7","mm7");
&pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
{
# stack frame layout
# -4(%esp) # return address 0(%esp)
-# 0(%esp) # s0 backing store 4(%esp)
+# 0(%esp) # s0 backing store 4(%esp)
# 4(%esp) # s1 backing store 8(%esp)
# 8(%esp) # s2 backing store 12(%esp)
# 12(%esp) # s3 backing store 16(%esp)
&mov (&DWP(80,"edi"),10); # setup number of rounds
&xor ("eax","eax");
&jmp (&label("exit"));
-
+
&set_label("12rounds");
&mov ("eax",&DWP(0,"esi")); # copy first 6 dwords
&mov ("ebx",&DWP(4,"esi"));
xor $s1,$s1,$acc05
xor $s2,$s2,$acc06
xor $s3,$s3,$acc07
- xor $s0,$s0,$acc08 # ^= ROTATE(r8,8)
- xor $s1,$s1,$acc09
- xor $s2,$s2,$acc10
- xor $s3,$s3,$acc11
+ xor $s0,$s0,$acc08 # ^= ROTATE(r8,8)
+ xor $s1,$s1,$acc09
+ xor $s2,$s2,$acc10
+ xor $s3,$s3,$acc11
b Ldec_compact_loop
.align 4
or $s1,$t1
or $t2,$i2
or $t3,$i3
-
+
srlg $i1,$s2,`8-3` # i0
srlg $i2,$s2,`16-3` # i1
nr $i1,$mask
x $s2,24($key)
x $s3,28($key)
- br $ra
+ br $ra
.size _s390x_AES_encrypt,.-_s390x_AES_encrypt
___
x $s2,24($key)
x $s3,28($key)
- br $ra
+ br $ra
.size _s390x_AES_decrypt,.-_s390x_AES_decrypt
___
.Lcbc_enc_done:
l${g} $ivp,6*$SIZE_T($sp)
st $s0,0($ivp)
- st $s1,4($ivp)
+ st $s1,4($ivp)
st $s2,8($ivp)
st $s3,12($ivp)
llgc $len,2*$SIZE_T-1($sp)
nill $len,0x0f # $len%=16
br $ra
-
+
.align 16
.Lxts_km_vanilla:
___
xgr $s1,%r1
lrvgr $s1,$s1 # flip byte order
lrvgr $s3,$s3
- srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
stg $s1,$tweak+0($sp) # save the tweak
llgfr $s1,$s1
srlg $s2,$s3,32
xgr $s1,%r1
lrvgr $s1,$s1 # flip byte order
lrvgr $s3,$s3
- srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
stg $s1,$tweak+0($sp) # save the tweak
llgfr $s1,$s1
srlg $s2,$s3,32
xgr $s1,%r1
lrvgr $s1,$s1 # flip byte order
lrvgr $s3,$s3
- srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
stg $s1,$tweak+0($sp) # save the tweak
llgfr $s1,$s1
srlg $s2,$s3,32
AES_set_encrypt_key:
push %rbx
push %rbp
- push %r12 # redundant, but allows to share
+ push %r12 # redundant, but allows to share
push %r13 # exception handler...
push %r14
push %r15
xor %rax,%rax
jmp .Lexit
-.L14rounds:
+.L14rounds:
mov 0(%rsi),%rax # copy first 8 dwords
mov 8(%rsi),%rbx
mov 16(%rsi),%rcx
movaps %xmm10,0x40(%rsp)
movaps %xmm11,0x50(%rsp)
movaps %xmm12,0x60(%rsp)
- movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
+ movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
movaps %xmm14,-0x58(%rax)
movaps %xmm15,-0x48(%rax)
___
movups @out[0],-16(@outptr[0],$offset)
pxor @inp[0],@out[0]
- movups @out[1],-16(@outptr[1],$offset)
+ movups @out[1],-16(@outptr[1],$offset)
pxor @inp[1],@out[1]
- movups @out[2],-16(@outptr[2],$offset)
+ movups @out[2],-16(@outptr[2],$offset)
pxor @inp[2],@out[2]
movups @out[3],-16(@outptr[3],$offset)
pxor @inp[3],@out[3]
movaps %xmm10,0x40(%rsp)
movaps %xmm11,0x50(%rsp)
movaps %xmm12,0x60(%rsp)
- movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
+ movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
movaps %xmm14,-0x58(%rax)
movaps %xmm15,-0x48(%rax)
___
movups @out[0],-16(@outptr[0],$offset)
movdqu (@inptr[0],$offset),@out[0]
- movups @out[1],-16(@outptr[1],$offset)
+ movups @out[1],-16(@outptr[1],$offset)
movdqu (@inptr[1],$offset),@out[1]
pxor $zero,@out[0]
- movups @out[2],-16(@outptr[2],$offset)
+ movups @out[2],-16(@outptr[2],$offset)
movdqu (@inptr[2],$offset),@out[2]
pxor $zero,@out[1]
movups @out[3],-16(@outptr[3],$offset)
vmovups @out[0],-16(@ptr[0]) # write output
sub $offset,@ptr[0] # switch to input
vpxor 0x00($offload),@out[0],@out[0]
- vmovups @out[1],-16(@ptr[1])
+ vmovups @out[1],-16(@ptr[1])
sub `64+1*8`(%rsp),@ptr[1]
vpxor 0x10($offload),@out[1],@out[1]
- vmovups @out[2],-16(@ptr[2])
+ vmovups @out[2],-16(@ptr[2])
sub `64+2*8`(%rsp),@ptr[2]
vpxor 0x20($offload),@out[2],@out[2]
vmovups @out[3],-16(@ptr[3])
vmovups @out[4],-16(@ptr[4])
sub `64+4*8`(%rsp),@ptr[4]
vpxor @inp[0],@out[4],@out[4]
- vmovups @out[5],-16(@ptr[5])
+ vmovups @out[5],-16(@ptr[5])
sub `64+5*8`(%rsp),@ptr[5]
vpxor @inp[1],@out[5],@out[5]
- vmovups @out[6],-16(@ptr[6])
+ vmovups @out[6],-16(@ptr[6])
sub `64+6*8`(%rsp),@ptr[6]
vpxor @inp[2],@out[6],@out[6]
vmovups @out[7],-16(@ptr[7])
sub $offset,@ptr[0] # switch to input
vmovdqu 128+0(%rsp),@out[0]
vpxor 0x70($offload),@out[7],@out[7]
- vmovups @out[1],-16(@ptr[1])
+ vmovups @out[1],-16(@ptr[1])
sub `64+1*8`(%rsp),@ptr[1]
vmovdqu @out[0],0x00($offload)
vpxor $zero,@out[0],@out[0]
vmovdqu 128+16(%rsp),@out[1]
- vmovups @out[2],-16(@ptr[2])
+ vmovups @out[2],-16(@ptr[2])
sub `64+2*8`(%rsp),@ptr[2]
vmovdqu @out[1],0x10($offload)
vpxor $zero,@out[1],@out[1]
vpxor $zero,@out[3],@out[3]
vmovdqu @inp[0],0x40($offload)
vpxor @inp[0],$zero,@out[4]
- vmovups @out[5],-16(@ptr[5])
+ vmovups @out[5],-16(@ptr[5])
sub `64+5*8`(%rsp),@ptr[5]
vmovdqu @inp[1],0x50($offload)
vpxor @inp[1],$zero,@out[5]
- vmovups @out[6],-16(@ptr[6])
+ vmovups @out[6],-16(@ptr[6])
sub `64+6*8`(%rsp),@ptr[6]
vmovdqu @inp[2],0x60($offload)
vpxor @inp[2],$zero,@out[6]
sub body_20_39_dec () { # b^d^c
# on entry @T[0]=b^d
return &body_40_59_dec() if ($rx==39);
-
+
my @r=@body_20_39;
unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]);
######################################################################
# AVX2+BMI code path
#
-my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
+my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
my $PUSH8=8*2*$SZ;
use integer;
&set_label("ctr32_one_shortcut",16);
&movups ($inout0,&QWP(0,$rounds_)); # load ivec
&mov ($rounds,&DWP(240,$key));
-
+
&set_label("ctr32_one");
if ($inline)
{ &aesni_inline_generate1("enc"); }
# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
-# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
+# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
#
# performance is achieved by interleaving instructions working on
# independent blocks. In which case asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by AES
-# instructions' interleave factor. Westmere can execute at most 3
+# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 come from. "Optimal
# interleave factor" means that increase of interleave factor does
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
-# * Bridge and "super-optimal" for other Intel CPUs...
+# * Bridge and "super-optimal" for other Intel CPUs...
sub aesni_generate2 {
my $dir=shift;
lea 7($ctr),%r9
mov %r10d,0x60+12(%rsp)
bswap %r9d
- mov OPENSSL_ia32cap_P+4(%rip),%r10d
+ mov OPENSSL_ia32cap_P+4(%rip),%r10d
xor $key0,%r9d
and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
mov %r9d,0x70+12(%rsp)
.Lctr32_tail:
# note that at this point $inout0..5 are populated with
- # counter values xor-ed with 0-round key
+ # counter values xor-ed with 0-round key
lea 16($key),$key
cmp \$4,$len
jb .Lctr32_loop3
if ($flavour =~ /le$/o) {
SWITCH: for($conv) {
/\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
- /\?rev/ && do { @bytes=reverse(@bytes); last; };
+ /\?rev/ && do { @bytes=reverse(@bytes); last; };
}
}
$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
- "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
+ "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
}
sub unvdup32 {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
- sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+ sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
}
sub unvmov32 {
my $arg=shift;
$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
- sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
+ sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
}
foreach(split("\n",$code)) {
sub InBasisChange {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
veor @b[2], @b[2], @b[1]
sub InBasisChange {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
pxor @b[6], @b[5]
pxor @s[0], @t[3]
pxor @s[1], @t[2]
pxor @s[2], @t[1]
- pxor @s[3], @t[0]
+ pxor @s[3], @t[0]
#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
bl _vpaes_schedule_transform // input transform
mov $inp, #7 // mov \$7, %esi
-
+
.Loop_schedule_256:
sub $inp, $inp, #1 // dec %esi
bl _vpaes_schedule_mangle // output low result
// high round
bl _vpaes_schedule_round
cbz $inp, .Lschedule_mangle_last
- bl _vpaes_schedule_mangle
+ bl _vpaes_schedule_mangle
// low round. swap xmm7 and xmm6
dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0
mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
bl _vpaes_schedule_low_round
mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
-
+
b .Loop_schedule_256
##
.Lschedule_mangle_last_dec:
ld1 {v20.2d-v21.2d}, [x11] // reload constants
- sub $out, $out, #16 // add \$-16, %rdx
+ sub $out, $out, #16 // add \$-16, %rdx
eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
bl _vpaes_schedule_transform // output transform
st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) # save last key
# high round
bl _vpaes_schedule_round
bdz Lschedule_mangle_last # dec %esi
- bl _vpaes_schedule_mangle
+ bl _vpaes_schedule_mangle
# low round. swap xmm7 and xmm6
?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
vmr v7, v6 # vmovdqa %xmm6, %xmm7
bl _vpaes_schedule_low_round
vmr v7, v5 # vmovdqa %xmm5, %xmm7
-
+
b Loop_schedule_256
##
## .aes_schedule_mangle_last
Lschedule_mangle_last_dec:
lvx $iptlo, r11, r12 # reload $ipt
lvx $ipthi, r9, r12
- addi $out, $out, -16 # add \$-16, %rdx
+ addi $out, $out, -16 # add \$-16, %rdx
vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
bl _vpaes_schedule_transform # output transform
if ($flavour =~ /le$/o) {
SWITCH: for($conv) {
/\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
- /\?rev/ && do { @bytes=reverse(@bytes); last; };
+ /\?rev/ && do { @bytes=reverse(@bytes); last; };
}
}
##
&set_label("schedule_192",16);
&movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
- &call ("_vpaes_schedule_transform"); # input transform
+ &call ("_vpaes_schedule_transform"); # input transform
&movdqa ("xmm6","xmm0"); # save short part
&pxor ("xmm4","xmm4"); # clear 4
&movhlps("xmm6","xmm4"); # clobber low side with zeros
##
&set_label("schedule_256",16);
&movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
- &call ("_vpaes_schedule_transform"); # input transform
+ &call ("_vpaes_schedule_transform"); # input transform
&mov ($round,7);
&set_label("loop_schedule_256");
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
- &call ("_vpaes_schedule_mangle");
+ &call ("_vpaes_schedule_mangle");
# low round. swap xmm7 and xmm6
&pshufd ("xmm0","xmm0",0xFF);
# subbyte
&movdqa ("xmm4",&QWP($k_s0F,$const));
&movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
- &movdqa ("xmm1","xmm4");
+ &movdqa ("xmm1","xmm4");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm4"); # 0 = k
pshufb %xmm1, %xmm0
ret
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
-
+
##
## Decryption core
##
##
.Lschedule_128:
mov \$10, %esi
-
+
.Loop_schedule_128:
call _vpaes_schedule_round
dec %rsi
.Loop_schedule_192:
call _vpaes_schedule_round
- palignr \$8,%xmm6,%xmm0
+ palignr \$8,%xmm6,%xmm0
call _vpaes_schedule_mangle # save key n
call _vpaes_schedule_192_smear
call _vpaes_schedule_mangle # save key n+1
movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
call _vpaes_schedule_transform # input transform
mov \$7, %esi
-
+
.Loop_schedule_256:
call _vpaes_schedule_mangle # output low result
movdqa %xmm0, %xmm6 # save cur_lo in xmm6
call _vpaes_schedule_round
dec %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
# low round. swap xmm7 and xmm6
pshufd \$0xFF, %xmm0, %xmm0
movdqa %xmm6, %xmm7
call _vpaes_schedule_low_round
movdqa %xmm5, %xmm7
-
+
jmp .Loop_schedule_256
-
+
##
## .aes_schedule_mangle_last
##
# rotate
pshufd \$0xFF, %xmm0, %xmm0
palignr \$1, %xmm0, %xmm0
-
+
# fall through...
-
+
# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
# smear xmm7
pxor %xmm4, %xmm0 # 0 = sbox output
# add in smeared stuff
- pxor %xmm7, %xmm0
+ pxor %xmm7, %xmm0
movdqa %xmm0, %xmm7
ret
.size _vpaes_schedule_round,.-_vpaes_schedule_round
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
-#
+#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
$flavour = shift;
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
-# instructions. The latter means that this code didn't really have an
+# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry_in
&jmp(&label("maw_sse2_entry"));
-
+
&set_label("maw_sse2_unrolled",16);
&movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
&paddq("mm1","mm3"); # mm1 = carry_in + r[0]
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
-
+
&comment("");
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_neg_loop"));
-
+
&set_label("pw_neg_finish",0);
&mov($tmp2,&wparam(4)); # get dl
&mov($num,0);
&sub($num,$tmp2);
&and($num,7);
&jz(&label("pw_end"));
-
+
for ($i=0; $i<7; $i++)
{
&comment("dl<0 Tail Round $i");
}
&jmp(&label("pw_end"));
-
+
&set_label("pw_pos",0);
-
+
&and($num,0xfffffff8); # num / 8
&jz(&label("pw_pos_finish"));
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jnc(&label("pw_nc".$i));
}
-
+
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_pos_loop"));
-
+
&set_label("pw_pos_finish",0);
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_end"));
-
+
for ($i=0; $i<7; $i++)
{
&comment("dl>0 Tail Round $i");
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&set_label("pw_nc".$i,0);
}
-
+
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_nc_loop"));
-
+
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_nc_end"));
-
+
for ($i=0; $i<7; $i++)
{
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
###
&adc($c2,0);
- # is pos > 1, it means it is the last loop
+ # is pos > 1, it means it is the last loop
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
}
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
###
&adc($c2,0);
- # is pos > 1, it means it is the last loop
+ # is pos > 1, it means it is the last loop
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
}
$c2="ebp";
$a="esi";
$b="edi";
-
+
$as=0;
$ae=0;
$bs=0;
&push("ebx");
&xor($c0,$c0);
- &mov("eax",&DWP(0,$a,"",0)); # load the first word
+ &mov("eax",&DWP(0,$a,"",0)); # load the first word
&xor($c1,$c1);
- &mov("edx",&DWP(0,$b,"",0)); # load the first second
+ &mov("edx",&DWP(0,$b,"",0)); # load the first second
for ($i=0; $i<$tot; $i++)
{
$bi=$bs;
$end=$be+1;
- &comment("################## Calculate word $i");
+ &comment("################## Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
// const BN_ULONG *bp,const BN_ULONG *np,
-// const BN_ULONG *n0p,int num);
+// const BN_ULONG *n0p,int num);
.align 64
.global bn_mul_mont#
.proc bn_mul_mont#
{ .mmi; .pred.rel "mutex",p39,p41
(p39) add topbit=r0,r0
(p41) add topbit=r0,r0,1
- nop.i 0 }
+ nop.i 0 }
{ .mmi; st8 [tp_1]=n[0]
add tptr=16,sp
add tp_1=8,sp };;
sltu $v0,$t2,$ta2
$ST $t2,-2*$BNSZ($a0)
$ADDU $v0,$t8
-
+
$ADDU $ta3,$t3
sltu $t9,$ta3,$t3
$ADDU $t3,$ta3,$v0
sltu $v0,$t3,$ta3
$ST $t3,-$BNSZ($a0)
-
+
.set noreorder
bgtz $at,.L_bn_add_words_loop
$ADDU $v0,$t9
# so that we can save two arguments
# and return address in registers
# instead of stack:-)
-
+
$LD $a0,($a3)
move $ta2,$a1
bne $a0,$a2,bn_div_3_words_internal
ldd $idx($np),$hi0
std,ma %r0,8($tp)
addib,<> 8,$idx,.-8 ; L\$copy
- std,ma $hi0,8($rp)
+ std,ma $hi0,8($rp)
___
if ($BN_SZ==4) { # PA-RISC 1.1 code-path
ldwx $idx($np),$hi0
stws,ma %r0,4($tp)
addib,<> 4,$idx,L\$copy_pa11
- stws,ma $hi0,4($rp)
+ stws,ma $hi0,4($rp)
nop ; alignment
L\$done
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
# for 64-bit application running on PPC970/G5 is:
#
-# 512-bit +65%
+# 512-bit +65%
# 1024-bit +35%
# 2048-bit +18%
# 4096-bit +4%
$UMULL= "mullw"; # unsigned multiply low
$UMULH= "mulhwu"; # unsigned multiply high
$UCMP= "cmplw"; # unsigned compare
- $SHRI= "srwi"; # unsigned shift right by immediate
+ $SHRI= "srwi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} elsif ($flavour =~ /64/) {
$UMULL= "mulld"; # unsigned multiply low
$UMULH= "mulhdu"; # unsigned multiply high
$UCMP= "cmpld"; # unsigned compare
- $SHRI= "srdi"; # unsigned shift right by immediate
+ $SHRI= "srdi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} else { die "nonsense $flavour"; }
#rsa 2048 bits 0.3036s 0.0085s 3.3 117.1
#rsa 4096 bits 2.0040s 0.0299s 0.5 33.4
#dsa 512 bits 0.0087s 0.0106s 114.3 94.5
-#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
+#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
#
# Same bechmark with this assembler code:
#
#rsa 4096 bits 0.3700s 0.0058s 2.7 171.0
#dsa 512 bits 0.0016s 0.0020s 610.7 507.1
#dsa 1024 bits 0.0047s 0.0058s 212.5 173.2
-#
+#
# Again, performance increases by at about 75%
#
# Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
$CNTLZ= "cntlzw"; # count leading zeros
$SHL= "slw"; # shift left
$SHR= "srw"; # unsigned shift right
- $SHRI= "srwi"; # unsigned shift right by immediate
+ $SHRI= "srwi"; # unsigned shift right by immediate
$SHLI= "slwi"; # shift left by immediate
$CLRU= "clrlwi"; # clear upper bits
$INSR= "insrwi"; # insert right
$CNTLZ= "cntlzd"; # count leading zeros
$SHL= "sld"; # shift left
$SHR= "srd"; # unsigned shift right
- $SHRI= "srdi"; # unsigned shift right by immediate
+ $SHRI= "srdi"; # unsigned shift right by immediate
$SHLI= "sldi"; # shift left by immediate
$CLRU= "clrldi"; # clear upper bits
- $INSR= "insrdi"; # insert right
+ $INSR= "insrdi"; # insert right
$ROTL= "rotldi"; # rotate left by immediate
$TR= "td"; # conditional trap
} else { die "nonsense $flavour"; }
# below.
# 12/05/03 Suresh Chari
# (with lots of help from) Andy Polyakov
-##
+##
# 1. Initial version 10/20/02 Suresh Chari
#
#
# be done in the build process.
#
# Hand optimized assembly code for the following routines
-#
+#
# bn_sqr_comba4
# bn_sqr_comba8
# bn_mul_comba4
#--------------------------------------------------------------------------
#
# Defines to be used in the assembly code.
-#
+#
#.set r0,0 # we use it as storage for value of 0
#.set SP,1 # preserved
-#.set RTOC,2 # preserved
+#.set RTOC,2 # preserved
#.set r3,3 # 1st argument/return value
#.set r4,4 # 2nd argument/volatile register
#.set r5,5 # 3rd argument/volatile register
# the first . i.e. for example change ".bn_sqr_comba4"
# to "bn_sqr_comba4". This should be automatically done
# in the build.
-
+
.globl .bn_sqr_comba4
.globl .bn_sqr_comba8
.globl .bn_mul_comba4
.globl .bn_sqr_words
.globl .bn_mul_words
.globl .bn_mul_add_words
-
+
# .text section
-
+
.machine "any"
#
# r3 contains r
# r4 contains a
#
-# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
-#
+# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
+#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
# r9,r10, r11 are the equivalents of c1,c2, c3.
#
xor r0,r0,r0 # set r0 = 0. Used in the addze
# instructions below
-
+
#sqr_add_c(a,0,c1,c2,c3)
- $LD r5,`0*$BNSZ`(r4)
- $UMULL r9,r5,r5
+ $LD r5,`0*$BNSZ`(r4)
+ $UMULL r9,r5,r5
$UMULH r10,r5,r5 #in first iteration. No need
#to add since c1=c2=c3=0.
# Note c3(r11) is NOT set to 0
$ST r9,`0*$BNSZ`(r3) # r[0]=c1;
# sqr_add_c2(a,1,0,c2,c3,c1);
- $LD r6,`1*$BNSZ`(r4)
+ $LD r6,`1*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
-
+
addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8)
adde r8,r8,r8
addze r9,r0 # catch carry if any.
- # r9= r0(=0) and carry
-
+ # r9= r0(=0) and carry
+
addc r10,r7,r10 # now add to temp result.
- addze r11,r8 # r8 added to r11 which is 0
+ addze r11,r8 # r8 added to r11 which is 0
addze r9,r9
-
- $ST r10,`1*$BNSZ`(r3) #r[1]=c2;
+
+ $ST r10,`1*$BNSZ`(r3) #r[1]=c2;
#sqr_add_c(a,1,c3,c1,c2)
$UMULL r7,r6,r6
$UMULH r8,r6,r6
$LD r6,`2*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
-
+
addc r7,r7,r7
adde r8,r8,r8
addze r10,r10
-
+
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
- $ST r11,`2*$BNSZ`(r3) #r[2]=c3
+ $ST r11,`2*$BNSZ`(r3) #r[2]=c3
#sqr_add_c2(a,3,0,c1,c2,c3);
- $LD r6,`3*$BNSZ`(r4)
+ $LD r6,`3*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r7,r7,r7
adde r8,r8,r8
addze r11,r0
-
+
addc r9,r7,r9
adde r10,r8,r10
addze r11,r11
$LD r6,`2*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
-
+
addc r7,r7,r7
adde r8,r8,r8
addze r11,r11
adde r11,r8,r11
addze r9,r0
#sqr_add_c2(a,3,1,c2,c3,c1);
- $LD r6,`3*$BNSZ`(r4)
+ $LD r6,`3*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r7,r7,r7
adde r8,r8,r8
addze r9,r9
-
+
addc r10,r7,r10
adde r11,r8,r11
addze r9,r9
$ST r10,`4*$BNSZ`(r3) #r[4]=c2
#sqr_add_c2(a,3,2,c3,c1,c2);
- $LD r5,`2*$BNSZ`(r4)
+ $LD r5,`2*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r7,r7,r7
adde r8,r8,r8
addze r10,r0
-
+
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
$ST r11,`5*$BNSZ`(r3) #r[5] = c3
#sqr_add_c(a,3,c1,c2,c3);
- $UMULL r7,r6,r6
+ $UMULL r7,r6,r6
$UMULH r8,r6,r6
addc r9,r7,r9
adde r10,r8,r10
# for the gcc compiler. This should be automatically
# done in the build
#
-
+
.align 4
.bn_sqr_comba8:
#
# r3 contains r
# r4 contains a
#
-# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
-#
+# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
+#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
# r9,r10, r11 are the equivalents of c1,c2, c3.
#
# Possible optimization of loading all 8 longs of a into registers
# doesn't provide any speedup
-#
+#
xor r0,r0,r0 #set r0 = 0.Used in addze
#instructions below.
#sqr_add_c2(a,1,0,c2,c3,c1);
$LD r6,`1*$BNSZ`(r4)
$UMULL r7,r5,r6
- $UMULH r8,r5,r6
-
+ $UMULH r8,r5,r6
+
addc r10,r7,r10 #add the two register number
adde r11,r8,r0 # (r8,r7) to the three register
addze r9,r0 # number (r9,r11,r10).NOTE:r0=0
-
+
addc r10,r7,r10 #add the two register number
adde r11,r8,r11 # (r8,r7) to the three register
addze r9,r9 # number (r9,r11,r10).
-
+
$ST r10,`1*$BNSZ`(r3) # r[1]=c2
-
+
#sqr_add_c(a,1,c3,c1,c2);
$UMULL r7,r6,r6
$UMULH r8,r6,r6
$LD r6,`2*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
-
+
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
-
+
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
-
+
$ST r11,`2*$BNSZ`(r3) #r[2]=c3
#sqr_add_c2(a,3,0,c1,c2,c3);
$LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0].
$UMULL r7,r5,r6
$UMULH r8,r5,r6
-
+
addc r9,r7,r9
adde r10,r8,r10
addze r11,r0
-
+
addc r9,r7,r9
adde r10,r8,r10
addze r11,r11
$LD r6,`2*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
-
+
addc r9,r7,r9
adde r10,r8,r10
addze r11,r11
-
+
addc r9,r7,r9
adde r10,r8,r10
addze r11,r11
-
+
$ST r9,`3*$BNSZ`(r3) #r[3]=c1;
#sqr_add_c(a,2,c2,c3,c1);
$UMULL r7,r6,r6
$UMULH r8,r6,r6
-
+
addc r10,r7,r10
adde r11,r8,r11
addze r9,r0
$LD r6,`3*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
-
+
addc r10,r7,r10
adde r11,r8,r11
addze r9,r9
-
+
addc r10,r7,r10
adde r11,r8,r11
addze r9,r9
$LD r6,`4*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
-
+
addc r10,r7,r10
adde r11,r8,r11
addze r9,r9
-
+
addc r10,r7,r10
adde r11,r8,r11
addze r9,r9
$LD r6,`5*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
-
+
addc r11,r7,r11
adde r9,r8,r9
addze r10,r0
-
+
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
$LD r6,`4*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
-
+
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
-
+
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
$LD r6,`3*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
-
+
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
-
+
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
$LD r6,`4*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
-
+
addc r9,r7,r9
adde r10,r8,r10
addze r11,r11
-
+
addc r9,r7,r9
adde r10,r8,r10
addze r11,r11
$LD r6,`5*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
-
+
addc r9,r7,r9
adde r10,r8,r10
addze r11,r11
-
+
addc r9,r7,r9
adde r10,r8,r10
addze r11,r11
$LD r6,`7*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
-
+
addc r10,r7,r10
adde r11,r8,r11
addze r9,r0
$LD r6,`6*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
-
+
addc r10,r7,r10
adde r11,r8,r11
addze r9,r9
$LD r6,`4*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
-
+
addc r10,r7,r10
adde r11,r8,r11
addze r9,r9
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
-
+
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
$LD r5,`2*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
-
+
addc r9,r7,r9
adde r10,r8,r10
addze r11,r0
adde r10,r8,r10
addze r11,r11
$ST r9,`12*$BNSZ`(r3) #r[12]=c1;
-
+
#sqr_add_c2(a,7,6,c2,c3,c1)
$LD r5,`6*$BNSZ`(r4)
$UMULL r7,r5,r6
#
xor r0,r0,r0 #r0=0. Used in addze below.
#mul_add_c(a[0],b[0],c1,c2,c3);
- $LD r6,`0*$BNSZ`(r4)
- $LD r7,`0*$BNSZ`(r5)
- $UMULL r10,r6,r7
- $UMULH r11,r6,r7
+ $LD r6,`0*$BNSZ`(r4)
+ $LD r7,`0*$BNSZ`(r5)
+ $UMULL r10,r6,r7
+ $UMULH r11,r6,r7
$ST r10,`0*$BNSZ`(r3) #r[0]=c1
#mul_add_c(a[0],b[1],c2,c3,c1);
- $LD r7,`1*$BNSZ`(r5)
+ $LD r7,`1*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r11,r8,r11
adde r12,r9,r0
addze r10,r0
#mul_add_c(a[1],b[0],c2,c3,c1);
- $LD r6, `1*$BNSZ`(r4)
- $LD r7, `0*$BNSZ`(r5)
+ $LD r6, `1*$BNSZ`(r4)
+ $LD r7, `0*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r11,r8,r11
addze r10,r10
$ST r11,`1*$BNSZ`(r3) #r[1]=c2
#mul_add_c(a[2],b[0],c3,c1,c2);
- $LD r6,`2*$BNSZ`(r4)
+ $LD r6,`2*$BNSZ`(r4)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r12,r8,r12
adde r10,r9,r10
addze r11,r0
#mul_add_c(a[1],b[1],c3,c1,c2);
- $LD r6,`1*$BNSZ`(r4)
- $LD r7,`1*$BNSZ`(r5)
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r12,r8,r12
adde r10,r9,r10
addze r11,r11
#mul_add_c(a[0],b[2],c3,c1,c2);
- $LD r6,`0*$BNSZ`(r4)
- $LD r7,`2*$BNSZ`(r5)
+ $LD r6,`0*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r12,r8,r12
addze r11,r11
$ST r12,`2*$BNSZ`(r3) #r[2]=c3
#mul_add_c(a[0],b[3],c1,c2,c3);
- $LD r7,`3*$BNSZ`(r5)
+ $LD r7,`3*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r10,r8,r10
addze r12,r12
$ST r10,`3*$BNSZ`(r3) #r[3]=c1
#mul_add_c(a[3],b[1],c2,c3,c1);
- $LD r7,`1*$BNSZ`(r5)
+ $LD r7,`1*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r11,r8,r11
addze r10,r10
$ST r11,`4*$BNSZ`(r3) #r[4]=c2
#mul_add_c(a[2],b[3],c3,c1,c2);
- $LD r6,`2*$BNSZ`(r4)
+ $LD r6,`2*$BNSZ`(r4)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r12,r8,r12
addze r11,r11
$ST r12,`5*$BNSZ`(r3) #r[5]=c3
#mul_add_c(a[3],b[3],c1,c2,c3);
- $LD r7,`3*$BNSZ`(r5)
+ $LD r7,`3*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r10,r8,r10
# for the gcc compiler. This should be automatically
# done in the build
#
-
+
.align 4
.bn_mul_comba8:
#
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
xor r0,r0,r0 #r0=0. Used in addze below.
-
+
#mul_add_c(a[0],b[0],c1,c2,c3);
$LD r6,`0*$BNSZ`(r4) #a[0]
$LD r7,`0*$BNSZ`(r5) #b[0]
addc r10,r10,r8
adde r11,r11,r9
addze r12,r12
-
+
#mul_add_c(a[2],b[1],c1,c2,c3);
$LD r6,`2*$BNSZ`(r4)
$LD r7,`1*$BNSZ`(r5)
adde r10,r10,r9
addze r11,r0
#mul_add_c(a[1],b[4],c3,c1,c2);
- $LD r6,`1*$BNSZ`(r4)
+ $LD r6,`1*$BNSZ`(r4)
$LD r7,`4*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
adde r10,r10,r9
addze r11,r11
#mul_add_c(a[2],b[3],c3,c1,c2);
- $LD r6,`2*$BNSZ`(r4)
+ $LD r6,`2*$BNSZ`(r4)
$LD r7,`3*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
adde r10,r10,r9
addze r11,r11
#mul_add_c(a[3],b[2],c3,c1,c2);
- $LD r6,`3*$BNSZ`(r4)
+ $LD r6,`3*$BNSZ`(r4)
$LD r7,`2*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
adde r10,r10,r9
addze r11,r11
#mul_add_c(a[4],b[1],c3,c1,c2);
- $LD r6,`4*$BNSZ`(r4)
+ $LD r6,`4*$BNSZ`(r4)
$LD r7,`1*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
adde r10,r10,r9
addze r11,r11
#mul_add_c(a[5],b[0],c3,c1,c2);
- $LD r6,`5*$BNSZ`(r4)
+ $LD r6,`5*$BNSZ`(r4)
$LD r7,`0*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addi r3,r3,-$BNSZ
addi r5,r5,-$BNSZ
mtctr r6
-Lppcasm_sub_mainloop:
+Lppcasm_sub_mainloop:
$LDU r7,$BNSZ(r4)
$LDU r8,$BNSZ(r5)
subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8)
# is r7-r8 -1 as we need.
$STU r6,$BNSZ(r3)
bdnz Lppcasm_sub_mainloop
-Lppcasm_sub_adios:
+Lppcasm_sub_adios:
subfze r3,r0 # if carry bit is set then r3 = 0 else -1
andi. r3,r3,1 # keep only last bit.
blr
addi r3,r3,-$BNSZ
addi r5,r5,-$BNSZ
mtctr r6
-Lppcasm_add_mainloop:
+Lppcasm_add_mainloop:
$LDU r7,$BNSZ(r4)
$LDU r8,$BNSZ(r5)
adde r8,r7,r8
$STU r8,$BNSZ(r3)
bdnz Lppcasm_add_mainloop
-Lppcasm_add_adios:
+Lppcasm_add_adios:
addze r3,r0 #return carry bit.
blr
.long 0
# the PPC instruction to count leading zeros instead
# of call to num_bits_word. Since this was compiled
# only at level -O2 we can possibly squeeze it more?
-#
+#
# r3 = h
# r4 = l
# r5 = d
-
+
$UCMPI 0,r5,0 # compare r5 and 0
bne Lppcasm_div1 # proceed if d!=0
li r3,-1 # d=0 return -1
Lppcasm_div2:
$UCMP 0,r3,r5 #h>=d?
blt Lppcasm_div3 #goto Lppcasm_div3 if not
- subf r3,r5,r3 #h-=d ;
+ subf r3,r5,r3 #h-=d ;
Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i
cmpi 0,0,r7,0 # is (i == 0)?
beq Lppcasm_div4
# as it saves registers.
li r6,2 #r6=2
mtctr r6 #counter will be in count.
-Lppcasm_divouterloop:
+Lppcasm_divouterloop:
$SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4)
$SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
# compute here for innerloop.
bne Lppcasm_div5 # goto Lppcasm_div5 if not
li r8,-1
- $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
+ $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
b Lppcasm_div6
Lppcasm_div5:
$UDIV r8,r3,r9 #q = h/dh
$UMULL r12,r9,r8 #th = q*dh
$CLRU r10,r5,`$BITS/2` #r10=dl
$UMULL r6,r8,r10 #tl = q*dl
-
+
Lppcasm_divinnerloop:
subf r10,r12,r3 #t = h -th
$SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of...
addi r4,r4,-$BNSZ
addi r3,r3,-$BNSZ
mtctr r5
-Lppcasm_sqr_mainloop:
+Lppcasm_sqr_mainloop:
#sqr(r[0],r[1],a[0]);
$LDU r6,$BNSZ(r4)
$UMULL r7,r6,r6
$STU r7,$BNSZ(r3)
$STU r8,$BNSZ(r3)
bdnz Lppcasm_sqr_mainloop
-Lppcasm_sqr_adios:
+Lppcasm_sqr_adios:
blr
.long 0
.byte 0,12,0x14,0,0,0,3,0
# done in the build
#
-.align 4
+.align 4
.bn_mul_words:
#
# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
rlwinm. r7,r5,30,2,31 # num >> 2
beq Lppcasm_mw_REM
mtctr r7
-Lppcasm_mw_LOOP:
+Lppcasm_mw_LOOP:
#mul(rp[0],ap[0],w,c1);
$LD r8,`0*$BNSZ`(r4)
$UMULL r9,r6,r8
#using adde.
$ST r9,`0*$BNSZ`(r3)
#mul(rp[1],ap[1],w,c1);
- $LD r8,`1*$BNSZ`(r4)
+ $LD r8,`1*$BNSZ`(r4)
$UMULL r11,r6,r8
$UMULH r12,r6,r8
adde r11,r11,r10
addze r12,r12 #this spin we collect carry into
#r12
$ST r11,`3*$BNSZ`(r3)
-
+
addi r3,r3,`4*$BNSZ`
addi r4,r4,`4*$BNSZ`
bdnz Lppcasm_mw_LOOP
addze r10,r10
$ST r9,`0*$BNSZ`(r3)
addi r12,r10,0
-
+
addi r5,r5,-1
cmpli 0,0,r5,0
beq Lppcasm_mw_OVER
-
+
#mul(rp[1],ap[1],w,c1);
- $LD r8,`1*$BNSZ`(r4)
+ $LD r8,`1*$BNSZ`(r4)
$UMULL r9,r6,r8
$UMULH r10,r6,r8
addc r9,r9,r12
addze r10,r10
$ST r9,`1*$BNSZ`(r3)
addi r12,r10,0
-
+
addi r5,r5,-1
cmpli 0,0,r5,0
beq Lppcasm_mw_OVER
-
+
#mul_add(rp[2],ap[2],w,c1);
$LD r8,`2*$BNSZ`(r4)
$UMULL r9,r6,r8
addze r10,r10
$ST r9,`2*$BNSZ`(r3)
addi r12,r10,0
-
-Lppcasm_mw_OVER:
+
+Lppcasm_mw_OVER:
addi r3,r12,0
blr
.long 0
# empirical evidence suggests that unrolled version performs best!!
#
xor r0,r0,r0 #r0 = 0
- xor r12,r12,r12 #r12 = 0 . used for carry
+ xor r12,r12,r12 #r12 = 0 . used for carry
rlwinm. r7,r5,30,2,31 # num >> 2
beq Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover
mtctr r7
-Lppcasm_maw_mainloop:
+Lppcasm_maw_mainloop:
#mul_add(rp[0],ap[0],w,c1);
$LD r8,`0*$BNSZ`(r4)
$LD r11,`0*$BNSZ`(r3)
#by multiply and will be collected
#in the next spin
$ST r9,`0*$BNSZ`(r3)
-
+
#mul_add(rp[1],ap[1],w,c1);
- $LD r8,`1*$BNSZ`(r4)
+ $LD r8,`1*$BNSZ`(r4)
$LD r9,`1*$BNSZ`(r3)
$UMULL r11,r6,r8
$UMULH r12,r6,r8
addc r11,r11,r9
#addze r12,r12
$ST r11,`1*$BNSZ`(r3)
-
+
#mul_add(rp[2],ap[2],w,c1);
$LD r8,`2*$BNSZ`(r4)
$UMULL r9,r6,r8
addc r9,r9,r11
#addze r10,r10
$ST r9,`2*$BNSZ`(r3)
-
+
#mul_add(rp[3],ap[3],w,c1);
$LD r8,`3*$BNSZ`(r4)
$UMULL r11,r6,r8
addi r3,r3,`4*$BNSZ`
addi r4,r4,`4*$BNSZ`
bdnz Lppcasm_maw_mainloop
-
+
Lppcasm_maw_leftover:
andi. r5,r5,0x3
beq Lppcasm_maw_adios
addc r9,r9,r12
addze r12,r10
$ST r9,0(r3)
-
+
bdz Lppcasm_maw_adios
#mul_add(rp[1],ap[1],w,c1);
- $LDU r8,$BNSZ(r4)
+ $LDU r8,$BNSZ(r4)
$UMULL r9,r6,r8
$UMULH r10,r6,r8
$LDU r11,$BNSZ(r3)
addc r9,r9,r12
addze r12,r10
$ST r9,0(r3)
-
+
bdz Lppcasm_maw_adios
#mul_add(rp[2],ap[2],w,c1);
$LDU r8,$BNSZ(r4)
addc r9,r9,r12
addze r12,r10
$ST r9,0(r3)
-
-Lppcasm_maw_adios:
+
+Lppcasm_maw_adios:
addi r3,r12,0
blr
.long 0
vpaddq $TEMP1, $ACC1, $ACC1
vpmuludq 32*7-128($aap), $B2, $ACC2
vpbroadcastq 32*5-128($tpa), $B2
- vpaddq 32*11-448($tp1), $ACC2, $ACC2
+ vpaddq 32*11-448($tp1), $ACC2, $ACC2
vmovdqu $ACC6, 32*6-192($tp0)
vmovdqu $ACC7, 32*7-192($tp0)
vmovdqu $ACC7, 32*16-448($tp1)
lea 8($tp1), $tp1
- dec $i
+ dec $i
jnz .LOOP_SQR_1024
___
$ZERO = $ACC9;
vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
vpaddq $TEMP3, $ACC7, $ACC7
vpaddq $TEMP4, $ACC8, $ACC8
-
+
vpsrlq \$29, $ACC4, $TEMP1
vpand $AND_MASK, $ACC4, $ACC4
vpsrlq \$29, $ACC5, $TEMP2
vpaddq $TEMP4, $ACC8, $ACC8
vmovdqu $ACC4, 128-128($rp)
- vmovdqu $ACC5, 160-128($rp)
+ vmovdqu $ACC5, 160-128($rp)
vmovdqu $ACC6, 192-128($rp)
vmovdqu $ACC7, 224-128($rp)
vmovdqu $ACC8, 256-128($rp)
movq %r9, 16(%rsp)
movq %r10, 24(%rsp)
shrq \$63, %rbx
-
+
#third iteration
- movq 16($inp), %r9
+ movq 16($inp), %r9
movq 24($inp), %rax
mulq %r9
addq %rax, %r12
movl $times,128+8(%rsp)
movq $out, %xmm0 # off-load
movq %rbp, %xmm1 # off-load
-#first iteration
+#first iteration
mulx %rax, %r8, %r9
mulx 16($inp), %rcx, %r10
mov %rax, (%rsp)
mov %r8, 8(%rsp)
-#second iteration
+#second iteration
mulx 16($inp), %rax, %rbx
adox %rax, %r10
adcx %rbx, %r11
mov %r9, 16(%rsp)
.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
-
-#third iteration
+
+#third iteration
.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
adox $out, %r12
adcx %r9, %r13
mov %r11, 32(%rsp)
.byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
-
-#fourth iteration
+
+#fourth iteration
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
adox %rax, %r14
adcx %rbx, %r15
mov %r13, 48(%rsp)
mov %r14, 56(%rsp)
-
-#fifth iteration
+
+#fifth iteration
.byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
adox $out, %r8
adcx %r11, %r9
mov %r15, 64(%rsp)
mov %r8, 72(%rsp)
-
-#sixth iteration
+
+#sixth iteration
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
adox %rax, %r10
adcx %rbx, %r11
movq 56($ap), %rax
movq %rdx, %r14
adcq \$0, %r14
-
+
mulq %rbx
addq %rax, %r14
movq ($ap), %rax
movq ($ap), %rax
adcq \$0, %rdx
addq %r15, %r14
- movq %rdx, %r15
+ movq %rdx, %r15
adcq \$0, %r15
leaq 8(%rdi), %rdi
mulx 48($ap), %rbx, %r14
adcx %rax, %r12
-
+
mulx 56($ap), %rax, %r15
adcx %rbx, %r13
adcx %rax, %r14
___
$code.=<<___ if ($addx);
jmp .Lmul_scatter_tail
-
+
.align 32
.Lmulx_scatter:
movq ($out), %rdx # pass b[0]
movq 56($ap), %rax
movq %rdx, %r14
adcq \$0, %r14
-
+
mulq %rbx
addq %rax, %r14
movq ($ap), %rax
movq ($ap), %rax
adcq \$0, %rdx
addq %r15, %r14
- movq %rdx, %r15
+ movq %rdx, %r15
adcq \$0, %r15
leaq 8(%rdi), %rdi
xgr $hi,@r[1]
xgr $lo,@r[0]
xgr $hi,@r[2]
- xgr $lo,@r[3]
+ xgr $lo,@r[3]
xgr $hi,@r[3]
xgr $lo,$hi
stg $hi,16($rp)
# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0
# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1
#
-# Conclusions:
+# Conclusions:
# - VIA SDK leaves a *lot* of room for improvement (which this
# implementation successfully fills:-);
# - 'rep montmul' gives up to >3x performance improvement depending on
$output = pop;
open STDOUT,">$output";
-
+
&asm_init($ARGV[0],$0);
$sse2=0;
my $nptr="%rcx"; # const BN_ULONG *nptr,
my $n0 ="%r8"; # const BN_ULONG *n0);
my $num ="%r9"; # int num, has to be divisible by 8
- # int pwr
+ # int pwr
my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
ja .Lpwr_page_walk
.Lpwr_page_walk_done:
- mov $num,%r10
+ mov $num,%r10
neg $num
##############################################################
jnz .Lsqr4x_sub
mov $num,%r10 # prepare for back-to-back call
- neg $num # restore $num
+ neg $num # restore $num
ret
.size __bn_post4x_internal,.-__bn_post4x_internal
___
mov \$0,%r10
cmovc %r10,%r11
sub %r11,%rbp
-.Lmulx4xsp_done:
+.Lmulx4xsp_done:
and \$-64,%rbp # ensure alignment
mov %rsp,%r11
sub %rbp,%r11
ja .Lpwrx_page_walk
.Lpwrx_page_walk_done:
- mov $num,%r10
+ mov $num,%r10
neg $num
##############################################################
64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
sub S1110 { my $i=shift; $i=@SBOX[$i]; return $i<<24|$i<<16|$i<<8; }
-sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; return $i<<24|$i<<16|$i; }
-sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return $i<<16|$i<<8|$i; }
-sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return $i<<24|$i<<8|$i; }
+sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; return $i<<24|$i<<16|$i; }
+sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return $i<<16|$i<<8|$i; }
+sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return $i<<24|$i<<8|$i; }
&set_label("Camellia_SIGMA",64);
&data_word(
# https://www.openssl.org/source/license.html
-# This flag makes the inner loop one cycle longer, but generates
+# This flag makes the inner loop one cycle longer, but generates
# code that runs %30 faster on the pentium pro/II, 44% faster
# of PIII, while only %7 slower on the pentium.
# By default, this flag is on.
if ($ppro) {
&xor( $tmp1, $tmp1);
&mov( $tmp2, 0xff);
-
+
&movb( &LB($tmp1), &HB($tmp4)); # A
&and( $tmp2, $tmp4);
} else {
&mov( $tmp2, $tmp4); # B
&movb( &LB($tmp1), &HB($tmp4)); # A # BAD BAD BAD
-
+
&shr( $tmp4, 16); #
&and( $tmp2, 0xff);
}
# ====================================================================
#
# December 2014
-#
+#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of large buffer.
vadd.i32 $d2,$d1,$t0 @ counter+2
str @t[3], [sp,#4*(16+15)]
mov @t[3],#10
- add @x[12],@x[12],#3 @ counter+3
+ add @x[12],@x[12],#3 @ counter+3
b .Loop_neon
.align 4
# ====================================================================
#
# June 2015
-#
+#
# ChaCha20 for ARMv8.
#
# Performance in cycles per byte out of large buffer.
mov $ctr,#10
subs $len,$len,#64
.Loop:
- sub $ctr,$ctr,#1
+ sub $ctr,$ctr,#1
___
foreach (&ROUND(0, 4, 8,12)) { eval; }
foreach (&ROUND(0, 5,10,15)) { eval; }
# ====================================================================
#
# October 2015
-#
+#
# ChaCha20 for PowerPC/AltiVec.
#
# Performance in cycles per byte out of large buffer.
lwz @d[3],12($ctr)
vadduwm @K[5],@K[4],@K[5]
- vspltisw $twenty,-12 # synthesize constants
+ vspltisw $twenty,-12 # synthesize constants
vspltisw $twelve,12
vspltisw $twenty5,-7
#vspltisw $seven,7 # synthesized in the loop
&and( $u, "0xfcfcfcfc" ); # 2
&xor( $tmp1, $tmp1); # 1
&and( $t, "0xcfcfcfcf" ); # 2
- &xor( $tmp2, $tmp2);
+ &xor( $tmp2, $tmp2);
&movb( &LB($tmp1), &LB($u) );
&movb( &LB($tmp2), &HB($u) );
&rotr( $t, 4 );
&R_PERM_OP($l,$tt,$r,14,"0x33333333",$r);
&R_PERM_OP($tt,$r,$l,22,"0x03fc03fc",$r);
&R_PERM_OP($l,$r,$tt, 9,"0xaaaaaaaa",$r);
-
+
if ($lr != 3)
{
if (($lr-3) < 0)
&function_end_B("_x86_DES_encrypt");
}
-
+
sub DES_decrypt_internal()
{
&function_begin_B("_x86_DES_decrypt");
&function_end_B("_x86_DES_decrypt");
}
-
+
sub DES_encrypt
{
local($name,$do_ip)=@_;
&R_PERM_OP($l,$tt,$r,14,"0x33333333",$r);
&R_PERM_OP($tt,$r,$l,22,"0x03fc03fc",$r);
&R_PERM_OP($l,$r,$tt, 9,"0xaaaaaaaa",$r);
-
+
if ($lr != 3)
{
if (($lr-3) < 0)
&IP_new($L,$R,"edx",0);
# put them back
-
+
if ($enc)
{
&mov(&DWP(4,"ebx","",0),$R);
adc $ap,xzr,xzr // zap $ap
tst $acc0,#1 // is a even?
- csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus
+ csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus
csel $acc1,$acc1,$t1,eq
csel $acc2,$acc2,$t2,eq
csel $acc3,$acc3,$t3,eq
ldx [$bp+8*($i+1)],$bi ! bp[$i+1]
___
$code.=<<___;
- addcc $acc1,$t0,$acc1 ! accumulate high parts of multiplication
+ addcc $acc1,$t0,$acc1 ! accumulate high parts of multiplication
sllx $acc0,32,$t0
addxccc $acc2,$t1,$acc2
srlx $acc0,32,$t1
&mov (&DWP(20,"esp"),"eax");
&mov (&DWP(24,"esp"),"eax");
&mov (&DWP(28,"esp"),"eax");
-
+
&call ("_ecp_nistz256_sub");
&stack_pop(8);
adc \$0, $acc0
########################################################################
- # Second reduction step
+ # Second reduction step
mov $acc1, $t1
shl \$32, $acc1
mulq $poly3
adc \$0, $acc1
########################################################################
- # Third reduction step
+ # Third reduction step
mov $acc2, $t1
shl \$32, $acc2
mulq $poly3
adc \$0, $acc2
########################################################################
- # Final reduction step
+ # Final reduction step
mov $acc3, $t1
shl \$32, $acc3
mulq $poly3
mov $acc5, $t1
adc \$0, $acc2
- ########################################################################
+ ########################################################################
# Branch-less conditional subtraction of P
sub \$-1, $acc4 # .Lpoly[0]
mov $acc0, $t2
movq %xmm1, $r_ptr
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S);
___
-{
+{
######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
# operate in 4-5-6-7 "name space" that matches squaring output
#
lea $M(%rsp), $b_ptr
mov $acc4, $acc6 # harmonize sub output and mul input
xor %ecx, %ecx
- mov $acc4, $S+8*0(%rsp) # have to save:-(
+ mov $acc4, $S+8*0(%rsp) # have to save:-(
mov $acc5, $acc2
mov $acc5, $S+8*1(%rsp)
cmovz $acc0, $acc3
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
-open TABLE,"<ecp_nistz256_table.c" or
-open TABLE,"<${dir}../ecp_nistz256_table.c" or
+open TABLE,"<ecp_nistz256_table.c" or
+open TABLE,"<${dir}../ecp_nistz256_table.c" or
die "failed to open ecp_nistz256_table.c:",$!;
use integer;
local($pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_;
&mov($tmp1,$C) if $pos < 0;
- &mov($tmp2,&DWP($xo[$ki]*4,$K,"",0)) if $pos < 0; # very first one
+ &mov($tmp2,&DWP($xo[$ki]*4,$K,"",0)) if $pos < 0; # very first one
# body proper
ldd [%o1 + 0x20], %f16
ldd [%o1 + 0x28], %f18
ldd [%o1 + 0x30], %f20
- subcc %o2, 1, %o2 ! done yet?
+ subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x38], %f22
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20
&& !defined(_MIPS_ARCH_MIPS32R2)
# define _MIPS_ARCH_MIPS32R2
# endif
-
+
# if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \
defined(_MIPS_ARCH_MIPS64R6)) \
&& !defined(_MIPS_ARCH_MIPS64R2)
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
-#
+#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
# ====================================================================
#ifdef __ARMEL__
vrev64.8 $Xl,$Xl
#endif
- sub $Xi,#16
+ sub $Xi,#16
vst1.64 $Xl#hi,[$Xi]! @ write out Xi
vst1.64 $Xl#lo,[$Xi]
lg $Zhi,0+1($Xi)
lghi $tmp,0
.Louter:
- xg $Zhi,0($inp) # Xi ^= inp
+ xg $Zhi,0($inp) # Xi ^= inp
xg $Zlo,8($inp)
xgr $Zhi,$tmp
stg $Zlo,8+1($Xi)
&bswap ($dat);
&pshufw ($Zhi,$Zhi,0b00011011); # 76543210
&bswap ("ebx");
-
+
&cmp ("ecx",&DWP(528+16+8,"esp")); # are we done?
&jne (&label("outer"));
}
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
- &psrldq ($T1,8); #
+ &psrldq ($T1,8); #
&pxor ($Xi,$T2);
&pxor ($Xhi,$T1); #
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
- &psrldq ($T1,8); #
+ &psrldq ($T1,8); #
&pxor ($Xi,$T2);
&pxor ($Xhi,$T1); #
&pshufd ($T1,$Xhn,0b01001110);
psllq \$57,$Xi #
movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T1 #
+ psrldq \$8,$T1 #
pxor $T2,$Xi
pxor $T1,$Xhi #
&clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
# experimental alternative. special thing about is that there
- # no dependency between the two multiplications...
+ # no dependency between the two multiplications...
mov \$`0xE1<<1`,%eax
mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
mov \$0x07,%r11d
movdqa $T2,$T1 #
pslldq \$8,$T2
pclmulqdq \$0x00,$Hkey2,$Xln
- psrldq \$8,$T1 #
+ psrldq \$8,$T1 #
pxor $T2,$Xi
pxor $T1,$Xhi #
movdqu 0($inp),$T1
psllq \$57,$Xi #
movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T1 #
+ psrldq \$8,$T1 #
pxor $T2,$Xi
pshufd \$0b01001110,$Xhn,$Xmn
pxor $T1,$Xhi #
# des_cblock (*ivec);
# int enc;
#
-# calls
+# calls
# des_encrypt((DES_LONG *)tin,schedule,DES_ENCRYPT);
#
# name is the function name
# enc_func and dec_func and the functions to call for encrypt/decrypt
# swap is true if byte order needs to be reversed
- # iv_off is parameter number for the iv
+ # iv_off is parameter number for the iv
# enc_off is parameter number for the encrypt/decrypt flag
# p1,p2,p3 are the offsets for parameters to be passed to the
# underlying calls.
#############################################################
&set_label("encrypt_loop");
- # encrypt start
+ # encrypt start
# "eax" and "ebx" hold iv (or the last cipher text)
&mov("ecx", &DWP(0,$in,"",0)); # load first 4 bytes
#############################################################
#############################################################
&set_label("decrypt",1);
- # decrypt start
+ # decrypt start
&and($count,0xfffffff8);
# The next 2 instructions are only for if the jz is taken
&mov("eax", &DWP($data_off+8,"esp","",0)); # get iv[0]
&align(64);
&function_end_B($name);
-
+
}
1;
my $ret;
$name =~ s|^\.||;
-
+
SWITCH: for ($flavour) {
/aix/ && do { if (!$$type) {
$$type = "\@function";
brnz,pn $ooff, 2f
sub $len, 1, $len
-
+
std %f0, [$out + 0]
std %f2, [$out + 8]
brnz,pt $len, .L${bits}_cbc_enc_loop
call _${alg}${bits}_encrypt_1x
add $inp, 16, $inp
sub $len, 1, $len
-
+
stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
add $out, 8, $out
stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
brnz,pn $ooff, 2f
sub $len, 1, $len
-
+
std %f0, [$out + 0]
std %f2, [$out + 8]
brnz,pt $len, .L${bits}_cbc_dec_loop2x
brnz,pn $ooff, 2f
sub $len, 2, $len
-
+
std %f0, [$out + 0]
std %f2, [$out + 8]
std %f4, [$out + 16]
brnz,pn $ooff, 2f
sub $len, 1, $len
-
+
std %f0, [$out + 0]
std %f2, [$out + 8]
brnz,pt $len, .L${bits}_ctr32_loop2x
brnz,pn $ooff, 2f
sub $len, 2, $len
-
+
std %f0, [$out + 0]
std %f2, [$out + 8]
std %f4, [$out + 16]
brnz,pn $ooff, 2f
sub $len, 1, $len
-
+
std %f0, [$out + 0]
std %f2, [$out + 8]
brnz,pt $len, .L${bits}_xts_${dir}loop2x
brnz,pn $ooff, 2f
sub $len, 2, $len
-
+
std %f0, [$out + 0]
std %f2, [$out + 8]
std %f4, [$out + 16]
if ($gas) {
if ($self->{op} eq "movz") { # movz is pain...
sprintf "%s%s%s",$self->{op},$self->{sz},shift;
- } elsif ($self->{op} =~ /^set/) {
+ } elsif ($self->{op} =~ /^set/) {
"$self->{op}";
} elsif ($self->{op} eq "ret") {
my $epilogue = "";
$self->{op} .= $self->{sz};
} elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") {
$self->{op} = "\tDQ";
- }
+ }
$self->{op};
}
}
if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva"))
{ $var=~s/([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; }
$var;
- };
+ };
$sz =~ tr/bvlrq/BWDDQ/;
$self->{value} = "\tD$sz\t";
};
/\.byte/ && do { my @str=split(/,\s*/,$$line);
map(s/(0b[0-1]+)/oct($1)/eig,@str);
- map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm);
+ map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm);
while ($#str>15) {
$self->{value}.="DB\t"
.join(",",@str[0..15])."\n";
printf "%s",$directive->out();
} elsif (my $opcode=opcode->re(\$line)) {
my $asm = eval("\$".$opcode->mnemonic());
-
+
if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) {
print $gas?".byte\t":"DB\t",join(',',@bytes),"\n";
next;
# %r13 - -
# %r14 - -
# %r15 - -
-#
+#
# (*) volatile register
# (-) preserved by callee
# (#) Nth argument, volatile
grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out;
push (@out,$comm)
}
- push (@out,$initseg) if ($initseg);
+ push (@out,$initseg) if ($initseg);
}
sub ::comment { foreach (@_) { push(@out,"\t; $_\n"); } }
|| NOP 5
STB $XX,*${KEYA}[-2] ; key->x
|| SUB4 $YY,$TX,$YY
-|| BNOP B3
+|| BNOP B3
STB $YY,*${KEYB}[-1] ; key->y
|| NOP 5
.endasmfunc
my $D="#" if (!$md5); # if set to "#", MD5 is stitched into RC4(),
# but its result is discarded. Idea here is
# to be able to use 'openssl speed rc4' for
- # benchmarking the stitched subroutine...
+ # benchmarking the stitched subroutine...
my $flavour = shift;
my $output = shift;
and \$63,$len # remaining bytes
jnz .Loop1
jmp .Ldone
-
+
.align 16
.Loop1:
add $TX[0]#b,$YY#b
for ($i=0;$i<4;$i++) {
$code.=<<___;
ldo 1($XX[0]),$XX[1]
- `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)`
+ `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)`
and $mask,$XX[1],$XX[1]
$LDX $YY($key),$TY
$MKX $YY,$key,$ix
ldo `2*$SZ`($key),$key
ldi 0xff,$mask
- ldi 3,$dat0
+ ldi 3,$dat0
ldo 1($XX[0]),$XX[0] ; warm up loop
and $mask,$XX[0],$XX[0]
# April 2005
#
-# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
+# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
# those with add/sub results in 50% performance improvement of folded
# loop...
$KL3=0x8F1BBCDC;
$KL4=0xA953FD4E;
$KR0=0x50A28BE6;
-$KR1=0x5C4DD124;
+$KR1=0x5C4DD124;
$KR2=0x6D703EF3;
$KR3=0x7A6D76E9;
# &mov($tmp2, &wparam(0)); # Moved into last round
&mov($tmp1, &DWP( 4,$tmp2,"",0)); # ctx->B
- &add($D, $tmp1);
+ &add($D, $tmp1);
&mov($tmp1, &swtmp(16+2)); # $c
&add($D, $tmp1);
&mov($tmp1, &DWP( 8,$tmp2,"",0)); # ctx->C
- &add($E, $tmp1);
+ &add($E, $tmp1);
&mov($tmp1, &swtmp(16+3)); # $d
&add($E, $tmp1);
&mov($tmp1, &DWP(12,$tmp2,"",0)); # ctx->D
- &add($A, $tmp1);
+ &add($A, $tmp1);
&mov($tmp1, &swtmp(16+4)); # $e
&add($A, $tmp1);
&mov($tmp1, &DWP(16,$tmp2,"",0)); # ctx->E
- &add($B, $tmp1);
+ &add($B, $tmp1);
&mov($tmp1, &swtmp(16+0)); # $a
&add($B, $tmp1);
&mov($tmp1, &DWP( 0,$tmp2,"",0)); # ctx->A
- &add($C, $tmp1);
+ &add($C, $tmp1);
&mov($tmp1, &swtmp(16+1)); # $b
&add($C, $tmp1);
=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
$1>=2.19); # first version supporting AVX
-$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
+$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
$1>=2.03); # first version supporting AVX
if (1) {
# Atom-specific optimization aiming to eliminate pshufb with high
- # registers [and thus get rid of 48 cycles accumulated penalty]
+ # registers [and thus get rid of 48 cycles accumulated penalty]
@Xi=map("%xmm$_",(0..4));
($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9));
@V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14));
# ...
# $i==13: 14,15,15,15,
# $i==14: 15
-#
+#
# Then at $i==15 Xupdate is applied one iteration in advance...
$code.=<<___ if ($i==0);
movd (@ptr[0]),@Xi[0]
ldd [%o1 + 0x20], %f16
ldd [%o1 + 0x28], %f18
ldd [%o1 + 0x30], %f20
- subcc %o2, 1, %o2 ! done yet?
+ subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x38], %f22
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20
mov $Cctx,$C
mov $Dctx,$D
mov $Ectx,$E
- alignaddr %g0,$tmp0,%g0
+ alignaddr %g0,$tmp0,%g0
dec 1,$len
ba .Loop
mov $nXfer,$Xfer
jz .Lialu
___
$code.=<<___ if ($shaext);
- test \$`1<<29`,%r10d # check SHA bit
+ test \$`1<<29`,%r10d # check SHA bit
jnz _shaext_shortcut
___
$code.=<<___ if ($avx>1);
#
# Performance in clock cycles per processed byte (less is better):
#
-# gcc icc x86 asm(*) SIMD x86_64 asm(**)
+# gcc icc x86 asm(*) SIMD x86_64 asm(**)
# Pentium 46 57 40/38 - -
# PIII 36 33 27/24 - -
# P4 41 38 28 - 17.3
&mov ($Coff,"ecx");
&mov ($Doff,"edi");
&mov (&DWP(0,"esp"),"ebx"); # magic
- &mov ($E,&DWP(16,"esi"));
+ &mov ($E,&DWP(16,"esi"));
&mov ("ebx",&DWP(20,"esi"));
&mov ("ecx",&DWP(24,"esi"));
&mov ("edi",&DWP(28,"esi"));
&xor ($AH[1],"ecx"); # magic
&mov (&DWP(8,"esp"),"ecx");
&mov (&DWP(12,"esp"),"ebx");
- &mov ($E,&DWP(16,"esi"));
+ &mov ($E,&DWP(16,"esi"));
&mov ("ebx",&DWP(20,"esi"));
&mov ("ecx",&DWP(24,"esi"));
&mov ("esi",&DWP(28,"esi"));
# (iii) "this" is for n=8, when we gather twice as much data, result
# for n=4 is 20.3+4.44=24.7;
# (iv) presented improvement coefficients are asymptotic limits and
-# in real-life application are somewhat lower, e.g. for 2KB
+# in real-life application are somewhat lower, e.g. for 2KB
# fragments they range from 75% to 130% (on Haswell);
$flavour = shift;
&set_label("16_79_sse2",16);
for ($j=0;$j<2;$j++) { # 2x unroll
- #&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15
+ #&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15
&movq ("mm5",&QWP(8*(9+16-14),"esp"));
&movq ("mm1","mm7");
&psrlq ("mm7",1);
# Denver 2.01 10.5 (+26%) 6.70 (+8%)
# X-Gene 20.0 (+100%) 12.8 (+300%(***))
# Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
-#
+#
# (*) Software SHA256 results are of lesser relevance, presented
# mostly for informational purposes.
# (**) The result is a trade-off: it's possible to improve it by
___
@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo,
- $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
+ $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
"%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
$a0 ="%r17";
add $t0,$hlo,$hlo
shd $ahi,$alo,$Sigma0[0],$t0
addc $t1,$hhi,$hhi ; h += Sigma1(e)
- shd $alo,$ahi,$Sigma0[0],$t1
+ shd $alo,$ahi,$Sigma0[0],$t1
add $a0,$hlo,$hlo
shd $ahi,$alo,$Sigma0[1],$t2
addc $a1,$hhi,$hhi ; h += Ch(e,f,g)
cl${g} $inp,`$frame+4*$SIZE_T`($sp)
jne .Lloop
- lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
+ lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
br %r14
.size $Func,.-$Func
.string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
$locals=0; # X[16] is register resident
@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
-
+
$A="%l0";
$B="%l1";
$C="%l2";
$SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
xor $tmp0,$h,$h
$SRL $a,@Sigma0[2],$tmp0
- xor $tmp1,$h,$h
+ xor $tmp1,$h,$h
$SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
xor $tmp0,$h,$h
xor $tmp1,$h,$h ! Sigma0(a)
######################################################################
# AVX2+BMI code path
#
-my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
+my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
my $PUSH8=8*2*$SZ;
use integer;
return result;
}
-static int ts_check_policy(const ASN1_OBJECT *req_oid,
+static int ts_check_policy(const ASN1_OBJECT *req_oid,
const TS_TST_INFO *tst_info)
{
const ASN1_OBJECT *resp_oid = tst_info->policy_id;
# multiplying 64 by CPU clock frequency and dividing by relevant
# value from the given table:
#
-# $SCALE=2/8 icc8 gcc3
+# $SCALE=2/8 icc8 gcc3
# Intel P4 3200/4600 4600(*) 6400
# Intel PIII 2900/3000 4900 5400
# AMD K[78] 2500/1800 9900 8200(**)
&L(0xca,0x2d,0xbf,0x07,0xad,0x5a,0x83,0x33);
&function_end_B("whirlpool_block_mmx");
-&asm_finish();
+&asm_finish();
close STDOUT;
crl_reasons
};
-char *i2s_ASN1_ENUMERATED_TABLE(X509V3_EXT_METHOD *method,
+char *i2s_ASN1_ENUMERATED_TABLE(X509V3_EXT_METHOD *method,
const ASN1_ENUMERATED *e)
{
ENUMERATED_NAMES *enam;
NULL
};
-char *i2s_ASN1_OCTET_STRING(X509V3_EXT_METHOD *method,
+char *i2s_ASN1_OCTET_STRING(X509V3_EXT_METHOD *method,
const ASN1_OCTET_STRING *oct)
{
return OPENSSL_buf2hexstr(oct->data, oct->length);
&ja (&label("generic"));
&and ("edx",0xefffffff); # clear hyper-threading bit
&jmp (&label("generic"));
-
+
&set_label("intel");
&cmp ("edi",7);
&jb (&label("cacheinfo"));
sub $len,%rsp
shr \$3,$len
lea (%rsp),$out
- .byte 0xf3,0x48,0xa5 # rep movsq
+ .byte 0xf3,0x48,0xa5 # rep movsq
lea (%r8),$out
lea (%rsp),$inp
mov $chunk,$len
const unsigned char *bytes,
int len);
X509_NAME_ENTRY *X509_NAME_ENTRY_create_by_NID(X509_NAME_ENTRY **ne, int nid,
- int type,
+ int type,
const unsigned char *bytes,
int len);
int X509_NAME_add_entry_by_txt(X509_NAME *name, const char *field, int type,
}
/* Write out the WPACKET length if needed */
- if (sub->lenbytes > 0
+ if (sub->lenbytes > 0
&& !put_value((unsigned char *)&pkt->buf->data[sub->packet_len],
packlen, sub->lenbytes))
return 0;
* maximum size will be. If this function is used, then it should be immediately
* followed by a WPACKET_allocate_bytes() call before any other WPACKET
* functions are called (unless the write to the allocated bytes is abandoned).
- *
+ *
* For example: If we are generating a signature, then the size of that
* signature may not be known in advance. We can use WPACKET_reserve_bytes() to
* handle this:
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
-# Perl utility to run PKITS tests for RFC3280 compliance.
+# Perl utility to run PKITS tests for RFC3280 compliance.
my $ossl_path;
sub tconversion {
my $testtype = shift;
my $t = shift;
- my @conversionforms =
+ my @conversionforms =
defined($conversionforms{$testtype}) ?
@{$conversionforms{$testtype}} :
@{$conversionforms{"*"}};
|| !WPACKET_set_max_size(&pkt, SIZE_MAX)
|| !WPACKET_finish(&pkt)) {
testfail("test_WPACKET_set_max_size():1 failed\n", &pkt);
- return 0;
+ return 0;
}
if (!WPACKET_init_len(&pkt, buf, 1)
# This is just a quick script to scan for cases where the 'error'
# function name in a XXXerr() macro is wrong.
-#
+#
# Run in the top level by going
# perl util/ck_errf.pl */*.c */*/*.c
#
}
$dest = pop @filelist;
-
+
if ($fnum > 2 && ! -d $dest)
{
die "Destination must be a directory";
close(OUT);
print "Copying: $_ to $dfile\n";
}
-
+
$hashval =~ s/^.*=\s+//;
die "Invalid hash syntax in file" if (length($hashfile) != 40);
die "Invalid hash received for file" if (length($hashval) != 40);
- die "***HASH VALUE MISMATCH FOR FILE $filename ***" if ($hashval ne $hashfile);
+ die "***HASH VALUE MISMATCH FOR FILE $filename ***" if ($hashval ne $hashfile);
}
# existence:platform:kind:algorithms
#
# - "existence" can be "EXIST" or "NOEXIST" depending on if the symbol is
-# found somewhere in the source,
+# found somewhere in the source,
# - "platforms" is empty if it exists on all platforms, otherwise it contains
# comma-separated list of the platform, just as they are if the symbol exists
# for those platforms, or prepended with a "!" if not. This helps resolve
$do_ssl=1 if $_ eq "libssl";
if ($_ eq "ssl") {
- $do_ssl=1;
+ $do_ssl=1;
$libname=$_
}
$do_crypto=1 if $_ eq "libcrypto";
}
-if (!$libname) {
+if (!$libname) {
if ($do_ssl) {
$libname="LIBSSL";
}
}
&update_numbers(*OUT,"LIBCRYPTO",*crypto_list,$max_crypto,@crypto_symbols);
close OUT;
-}
+}
} elsif ($do_checkexist) {
&check_existing(*ssl_list, @ssl_symbols)
Default: keep previously assigned numbers. (You are warned
when collisions are detected.)
- -nostatic Generates a different source code, where these additional
+ -nostatic Generates a different source code, where these additional
functions are generated for each library specified in the
config file:
void ERR_load_<LIB>_strings(void);
void ERR_<LIB>_error(int f, int r, char *fn, int ln);
#define <LIB>err(f,r) ERR_<LIB>_error(f,r,OPENSSL_FILE,OPENSSL_LINE)
while the code facilitates the use of these in an environment
- where the error support routines are dynamically loaded at
+ where the error support routines are dynamically loaded at
runtime.
Default: 'static' code generation.
-unref Print out unreferenced function and reason codes.
- -write Actually (over)write the generated code to the header and C
- source files as assigned to each library through the config
+ -write Actually (over)write the generated code to the header and C
+ source files as assigned to each library through the config
file.
Default: don't write.
if(/\/\*/) {
if (not /\*\//) { # multiline comment...
$line = $_; # ... just accumulate
- next;
+ next;
} else {
s/\/\*.*?\*\///gs; # wipe it
}
print STDERR "ERROR: mismatch $file:$linenr $func:$3\n";
$errcount++;
}
- print STDERR "Function: $1\t= $fcodes{$1} (lib: $2, name: $3)\n" if $debug;
+ print STDERR "Function: $1\t= $fcodes{$1} (lib: $2, name: $3)\n" if $debug;
}
if(/(([A-Z0-9]+)_R_[A-Z0-9_]+)/) {
next unless exists $csrc{$2};
$rcodes{$1} = "X";
$rnew{$2}++;
}
- print STDERR "Reason: $1\t= $rcodes{$1} (lib: $2)\n" if $debug;
- }
+ print STDERR "Reason: $1\t= $rcodes{$1} (lib: $2)\n" if $debug;
+ }
}
close IN;
}
if($inbrace) {
if($item eq "}") {
$inbrace --;
-
+
if(!$inbrace) {
$substruc = structureData($dataitem);
$dataitem = $substruc;