From 2e6b615f795e8ca8ae830a00079c4ea064eaae42 Mon Sep 17 00:00:00 2001 From: Patrick Steuer Date: Sat, 23 Mar 2019 00:03:24 +0100 Subject: [PATCH] s390x assembly pack: import poly from cryptogams repo >=20% faster than present code. Signed-off-by: Patrick Steuer Reviewed-by: Matt Caswell Reviewed-by: Richard Levitte (Merged from https://github.com/openssl/openssl/pull/8560) --- crypto/poly1305/asm/poly1305-s390x.pl | 1455 ++++++++++++++----------- crypto/poly1305/build.info | 1 + 2 files changed, 799 insertions(+), 657 deletions(-) diff --git a/crypto/poly1305/asm/poly1305-s390x.pl b/crypto/poly1305/asm/poly1305-s390x.pl index 390f9eefe7..ea1c2d82b5 100755 --- a/crypto/poly1305/asm/poly1305-s390x.pl +++ b/crypto/poly1305/asm/poly1305-s390x.pl @@ -32,10 +32,20 @@ # Copyright IBM Corp. 2019 # Author: Patrick Steuer +# +# January 2019 +# +# Add vector base 2^26 implementation. It's problematic to accurately +# measure performance, because reference system is hardly idle. But +# it's sub-cycle, i.e. less than 1 cycle per processed byte, and it's +# >=20% faster than IBM's submission on long inputs, and much faster on +# short ones, because calculation of key powers is postponed till we +# know that input is long enough to justify the additional overhead. 
+ use strict; use FindBin qw($Bin); use lib "$Bin/../.."; -use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL); +use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL INCLUDE); my $flavour = shift; @@ -51,666 +61,98 @@ if ($flavour =~ /3[12]/) { my $output; while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +my $stdframe=16*$SIZE_T+4*8; my $sp="%r15"; -# novx code path ctx layout -# --------------------------------- -# var value base off -# --------------------------------- -# u64 h[3] hash 2^64 0 -# u32 pad[2] -# u64 r[2] key 2^64 32 - -# vx code path ctx layout -# --------------------------------- -# var value base off -# --------------------------------- -# u32 acc1[5] r^2-acc 2^26 0 -# u32 pad -# u32 acc2[5] r-acc 2^26 24 -# u32 pad -# u32 r1[5] r 2^26 48 -# u32 r15[5] 5*r 2^26 68 -# u32 r2[5] r^2 2^26 88 -# u32 r25[5] 5*r^2 2^26 108 -# u32 r4[5] r^4 2^26 128 -# u32 r45[5] 5*r^4 2^26 148 +my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5)); PERLASM_BEGIN($output); +INCLUDE ("s390x_arch.h"); TEXT (); ################ # static void poly1305_init(void *ctx, const unsigned char key[16]) { -my ($ctx,$key)=map("%r$_",(2..3)); -my ($r0,$r1,$r2)=map("%r$_",(9,11,13)); - -sub MUL_RKEY { # r*=key -my ($d0hi,$d0lo,$d1hi,$d1lo)=map("%r$_",(4..7)); -my ($t0,$t1,$s1)=map("%r$_",(8,10,12)); - - lg ("%r0","32($ctx)"); - lg ("%r1","40($ctx)"); - - srlg ($s1,"%r1",2); - algr ($s1,"%r1"); - - lgr ($d0lo,$r0); - lgr ($d1lo,$r1); - - mlgr ($d0hi,"%r0"); - lgr ($r1,$d1lo); - mlgr ($d1hi,$s1); - - mlgr ($t0,"%r1"); - mlgr ($t1,"%r0"); - - algr ($d0lo,$d1lo); - lgr ($d1lo,$r2); - alcgr ($d0hi,$d1hi); - lghi ($d1hi,0); - - algr ($r1,$r0); - alcgr ($t1,$t0); - - msgr ($d1lo,$s1); - msgr ($r2,"%r0"); - - algr ($r1,$d1lo); - alcgr ($t1,$d1hi); - - algr ($r1,$d0hi); - alcgr ($r2,$t1); - - lghi ($r0,-4); - ngr ($r0,$r2); - srlg ($t0,$r2,2); - algr ($r0,$t0); - lghi ($t1,3); - ngr ($r2,$t1); - - algr ($r0,$d0lo); - alcgr ($r1,$d1hi); - alcgr ($r2,$d1hi); -} - -sub ST_R5R { # store r,5*r -> 
base 2^26 -my @d=map("%r$_",(4..8)); -my @off=@_; - - lgr (@d[2],$r0); - lr ("%r1",@d[2]); - nilh ("%r1",1023); - lgr (@d[3],$r1); - lr (@d[0],"%r1"); - srlg ("%r1",@d[2],52); - lgr (@d[4],$r2); - srlg ("%r0",@d[2],26); - sll (@d[4],24); - lr (@d[2],@d[3]); - nilh ("%r0",1023); - sll (@d[2],12); - lr (@d[1],"%r0"); - &or (@d[2],"%r1"); - srlg ("%r1",@d[3],40); - nilh (@d[2],1023); - &or (@d[4],"%r1"); - srlg (@d[3],@d[3],14); - nilh (@d[4],1023); - nilh (@d[3],1023); - - stm (@d[0],@d[4],"@off[0]($ctx)"); - mhi (@d[$_],5) for (0..4); - stm (@d[0],@d[4],"@off[1]($ctx)"); -} - GLOBL ("poly1305_init"); TYPE ("poly1305_init","\@function"); ALIGN (16); LABEL ("poly1305_init"); lghi ("%r0",0); lghi ("%r1",-1); - stg ("%r0","0($ctx)"); # zero hash value / acc1 + stg ("%r0","0($ctx)"); # zero hash value stg ("%r0","8($ctx)"); stg ("%r0","16($ctx)"); + st ("%r0","24($ctx)"); # clear is_base2_26 + lgr ("%r5",$ctx); # reassign $ctx + lghi ("%r2",0); -&{$z? \&clgr:\&clr} ($key,"%r0"); - je (".Ldone"); +&{$z? \&clgr:\&clr} ($inp,"%r0"); + je (".Lno_key"); - lrvg ("%r4","0($key)"); # load little-endian key - lrvg ("%r5","8($key)"); + lrvg ("%r2","0($inp)"); # load little-endian key + lrvg ("%r3","8($inp)"); - nihl ("%r1",0xffc0); # 0xffffffc0ffffffff - srlg ("%r0","%r1",4); # 0x0ffffffc0fffffff + nihl ("%r1",0xffc0); # 0xffffffc0ffffffff + srlg ("%r0","%r1",4); # 0x0ffffffc0fffffff srlg ("%r1","%r1",4); - nill ("%r1",0xfffc); # 0x0ffffffc0ffffffc + nill ("%r1",0xfffc); # 0x0ffffffc0ffffffc - ngr ("%r4","%r0"); - ngr ("%r5","%r1"); + ngr ("%r2","%r0"); + ngr ("%r3","%r1"); - stg ("%r4","32($ctx)"); - stg ("%r5","40($ctx)"); + stmg ("%r2","%r3","32(%r5)"); larl ("%r1","OPENSSL_s390xcap_P"); lg ("%r0","16(%r1)"); - tmhh ("%r0",0x4000); # check for vector facility - jz (".Ldone"); - - larl ("%r4","poly1305_blocks_vx"); - larl ("%r5","poly1305_emit_vx"); - -&{$z? \&stmg:\&stm} ("%r6","%r13","6*$SIZE_T($sp)"); -&{$z? 
\&stmg:\&stm} ("%r4","%r5","4*$z+228($ctx)"); - - lg ($r0,"32($ctx)"); - lg ($r1,"40($ctx)"); - lghi ($r2,0); - - ST_R5R (48,68); # store r,5*r - - MUL_RKEY(); - ST_R5R (88,108); # store r^2,5*r^2 - - MUL_RKEY(); - MUL_RKEY(); - ST_R5R (128,148); # store r^4,5*r^4 - - lghi ("%r0",0); - stg ("%r0","24($ctx)"); # zero acc2 - stg ("%r0","32($ctx)"); - stg ("%r0","40($ctx)"); - -&{$z? \&lmg:\&lm} ("%r6","%r13","6*$SIZE_T($sp)"); + srlg ("%r0","%r0",62); + nill ("%r0",1); # extract vx bit + lcgr ("%r0","%r0"); + larl ("%r1",".Lpoly1305_blocks"); + larl ("%r2",".Lpoly1305_blocks_vx"); + larl ("%r3",".Lpoly1305_emit"); +&{$z? \&xgr:\&xr} ("%r2","%r1"); # select between scalar and vector +&{$z? \&ngr:\&nr} ("%r2","%r0"); +&{$z? \&xgr:\&xr} ("%r2","%r1"); +&{$z? \&stmg:\&stm} ("%r2","%r3","0(%r4)"); lghi ("%r2",1); - br ("%r14"); - -LABEL (".Ldone"); - lghi ("%r2",0); +LABEL (".Lno_key"); br ("%r14"); SIZE ("poly1305_init",".-poly1305_init"); } -# VX CODE PATH -{ -my $frame=8*16; -my @m01=map("%v$_",(0..4)); -my @m23=map("%v$_",(5..9)); -my @tmp=@m23; -my @acc=map("%v$_",(10..14)); -my @r=map("%v$_",(15..19)); -my @r5=map("%v$_",(20..24)); -my $padvec="%v26"; -my $mask4="%v27"; -my @vperm=map("%v$_",(28..30)); -my $mask="%v31"; - -sub REDUCE { - vesrlg (@tmp[0],@acc[0],26); - vesrlg (@tmp[3],@acc[3],26); - vn (@acc[0],@acc[0],$mask); - vn (@acc[3],@acc[3],$mask); - vag (@acc[1],@acc[1],@tmp[0]); # carry 0->1 - vag (@acc[4],@acc[4],@tmp[3]); # carry 3->4 - - vesrlg (@tmp[1],@acc[1],26); - vesrlg (@tmp[4],@acc[4],26); - vn (@acc[1],@acc[1],$mask); - vn (@acc[4],@acc[4],$mask); - veslg (@tmp[0],@tmp[4],2); - vag (@tmp[4],@tmp[4],@tmp[0]); # h[4]*=5 - vag (@acc[2],@acc[2],@tmp[1]); # carry 1->2 - vag (@acc[0],@acc[0],@tmp[4]); # carry 4->0 - - vesrlg (@tmp[2],@acc[2],26); - vesrlg (@tmp[0],@acc[0],26); - vn (@acc[2],@acc[2],$mask); - vn (@acc[0],@acc[0],$mask); - vag (@acc[3],@acc[3],@tmp[2]); # carry 2->3 - vag (@acc[1],@acc[1],@tmp[0]); # carry 0->1 - - vesrlg 
(@tmp[3],@acc[3],26); - vn (@acc[3],@acc[3],$mask); - vag (@acc[4],@acc[4],@tmp[3]); # carry 3->4 -} - ################ -# static void poly1305_blocks_vx(void *ctx, const unsigned char *inp, -# size_t len, u32 padbit) +# static void poly1305_blocks(void *ctx, const unsigned char *inp, +# size_t len, u32 padbit) { -my ($ctx,$inp,$len) = map("%r$_",(2..4)); -my $padbit="%r0"; - -GLOBL ("poly1305_blocks_vx"); -TYPE ("poly1305_blocks_vx","\@function"); -ALIGN (16); -LABEL ("poly1305_blocks_vx"); -if ($z) { - aghi ($sp,-$frame); - vstm ("%v8","%v15","0($sp)"); -} else { - std ("%f4","16*$SIZE_T+2*8($sp)"); - std ("%f6","16*$SIZE_T+3*8($sp)"); - llgfr ($len,$len); -} - llgfr ($padbit,"%r5"); - vlef (@acc[$_],"4*$_($ctx)",1) for (0..4); # load acc1 - larl ("%r5",".Lconst"); - vlef (@acc[$_],"24+4*$_($ctx)",3) for (0..4); # load acc2 - sllg ($padbit,$padbit,24); - vlm (@vperm[0],$mask,"0(%r5)"); # load vperm ops, mask - vgbm ($mask4,0x0707); - vlvgp ($padvec,$padbit,$padbit); - - srlg ("%r1",$len,6); - ltgr ("%r1","%r1"); - jz (".Lvx_4x_done"); - -ALIGN (16); -LABEL (".Lvx_4x"); - vlm ("%v20","%v23","0($inp)"); # load m0,m1,m2,m3 - - # m01,m23 -> base 2^26 - - vperm (@m01[0],"%v20","%v21",@vperm[0]); - vperm (@m23[0],"%v22","%v23",@vperm[0]); - vperm (@m01[2],"%v20","%v21",@vperm[1]); - vperm (@m23[2],"%v22","%v23",@vperm[1]); - vperm (@m01[4],"%v20","%v21",@vperm[2]); - vperm (@m23[4],"%v22","%v23",@vperm[2]); - - vesrlg (@m01[1],@m01[0],26); - vesrlg (@m23[1],@m23[0],26); - vesrlg (@m01[3],@m01[2],30); - vesrlg (@m23[3],@m23[2],30); - vesrlg (@m01[2],@m01[2],4); - vesrlg (@m23[2],@m23[2],4); - - vn (@m01[4],@m01[4],$mask4); - vn (@m23[4],@m23[4],$mask4); -for (0..3) { - vn (@m01[$_],@m01[$_],$mask); - vn (@m23[$_],@m23[$_],$mask); -} - vaf (@m01[4],@m01[4],$padvec); # pad m01 - vaf (@m23[4],@m23[4],$padvec); # pad m23 - - # acc = acc * r^4 + m01 * r^2 + m23 - - vlrepf (@r5[$_],"4*$_+108($ctx)") for (0..4); # load 5*r^2 - vlrepf (@r[$_],"4*$_+88($ctx)") for (0..4); # load 
r^2 - - vmalof (@tmp[0],@m01[4],@r5[1],@m23[0]); - vmalof (@tmp[1],@m01[4],@r5[2],@m23[1]); - vmalof (@tmp[2],@m01[4],@r5[3],@m23[2]); - vmalof (@tmp[3],@m01[4],@r5[4],@m23[3]); - vmalof (@tmp[4],@m01[4],@r[0],@m23[4]); - - vmalof (@tmp[0],@m01[3],@r5[2],@tmp[0]); - vmalof (@tmp[1],@m01[3],@r5[3],@tmp[1]); - vmalof (@tmp[2],@m01[3],@r5[4],@tmp[2]); - vmalof (@tmp[3],@m01[3],@r[0],@tmp[3]); - vmalof (@tmp[4],@m01[3],@r[1],@tmp[4]); - - vmalof (@tmp[0],@m01[2],@r5[3],@tmp[0]); - vmalof (@tmp[1],@m01[2],@r5[4],@tmp[1]); - vmalof (@tmp[2],@m01[2],@r[0],@tmp[2]); - vmalof (@tmp[3],@m01[2],@r[1],@tmp[3]); - vmalof (@tmp[4],@m01[2],@r[2],@tmp[4]); - - vmalof (@tmp[0],@m01[1],@r5[4],@tmp[0]); - vmalof (@tmp[1],@m01[1],@r[0],@tmp[1]); - vmalof (@tmp[2],@m01[1],@r[1],@tmp[2]); - vmalof (@tmp[3],@m01[1],@r[2],@tmp[3]); - vmalof (@tmp[4],@m01[1],@r[3],@tmp[4]); - - vmalof (@tmp[0],@m01[0],@r[0],@tmp[0]); - vmalof (@tmp[1],@m01[0],@r[1],@tmp[1]); - vmalof (@tmp[2],@m01[0],@r[2],@tmp[2]); - vmalof (@tmp[3],@m01[0],@r[3],@tmp[3]); - vmalof (@tmp[4],@m01[0],@r[4],@tmp[4]); - - vlrepf (@r5[$_],"4*$_+148($ctx)") for (0..4); # load 5*r^4 - vlrepf (@r[$_],"4*$_+128($ctx)") for (0..4); # load r^4 - - vmalof (@tmp[0],@acc[4],@r5[1],@tmp[0]); - vmalof (@tmp[1],@acc[4],@r5[2],@tmp[1]); - vmalof (@tmp[2],@acc[4],@r5[3],@tmp[2]); - vmalof (@tmp[3],@acc[4],@r5[4],@tmp[3]); - vmalof (@tmp[4],@acc[4],@r[0],@tmp[4]); - - vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]); - vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]); - vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]); - vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]); - vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]); - - vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]); - vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]); - vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]); - vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]); - vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]); - - vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]); - vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]); - vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]); - vmalof 
(@tmp[3],@acc[1],@r[2],@tmp[3]); - vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]); - - vmalof (@acc[1],@acc[0],@r[1],@tmp[1]); - vmalof (@acc[2],@acc[0],@r[2],@tmp[2]); - vmalof (@acc[3],@acc[0],@r[3],@tmp[3]); - vmalof (@acc[4],@acc[0],@r[4],@tmp[4]); - vmalof (@acc[0],@acc[0],@r[0],@tmp[0]); - - REDUCE (); - - la ($inp,"64($inp)"); - brctg ("%r1",".Lvx_4x"); - -ALIGN (16); -LABEL (".Lvx_4x_done"); - tml ($len,32); - jz (".Lvx_2x_done"); - - vlm ("%v20","%v21","0($inp)"); # load m0,m1 - - # m01 -> base 2^26 - - vperm (@m01[0],"%v20","%v21",@vperm[0]); - vperm (@m01[2],"%v20","%v21",@vperm[1]); - vperm (@m01[4],"%v20","%v21",@vperm[2]); - - vesrlg (@m01[1],@m01[0],26); - vesrlg (@m01[3],@m01[2],30); - vesrlg (@m01[2],@m01[2],4); - - vn (@m01[4],@m01[4],$mask4); - vn (@m01[$_],@m01[$_],$mask) for (0..3); - - vaf (@m01[4],@m01[4],$padvec); # pad m01 - - # acc = acc * r^2+ m01 - - vlrepf (@r5[$_],"4*$_+108($ctx)") for (0..4); # load 5*r^2 - vlrepf (@r[$_],"4*$_+88($ctx)") for (0..4); # load r^2 - - vmalof (@tmp[0],@acc[4],@r5[1],@m01[0]); - vmalof (@tmp[1],@acc[4],@r5[2],@m01[1]); - vmalof (@tmp[2],@acc[4],@r5[3],@m01[2]); - vmalof (@tmp[3],@acc[4],@r5[4],@m01[3]); - vmalof (@tmp[4],@acc[4],@r[0],@m01[4]); - - vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]); - vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]); - vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]); - vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]); - vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]); - - vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]); - vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]); - vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]); - vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]); - vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]); - - vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]); - vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]); - vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]); - vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]); - vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]); - - vmalof (@acc[1],@acc[0],@r[1],@tmp[1]); - vmalof (@acc[2],@acc[0],@r[2],@tmp[2]); - vmalof (@acc[3],@acc[0],@r[3],@tmp[3]); - vmalof 
(@acc[4],@acc[0],@r[4],@tmp[4]); - vmalof (@acc[0],@acc[0],@r[0],@tmp[0]); - - REDUCE (); - - la ($inp,"32($inp)"); - -ALIGN (16); -LABEL (".Lvx_2x_done"); - tml ($len,16); - jz (".Lvx_done"); - - vleig ($padvec,0,0); - - vzero ("%v20"); - vl ("%v21","0($inp)"); # load m0 - - # m0 -> base 2^26 - - vperm (@m01[0],"%v20","%v21",@vperm[0]); - vperm (@m01[2],"%v20","%v21",@vperm[1]); - vperm (@m01[4],"%v20","%v21",@vperm[2]); - - vesrlg (@m01[1],@m01[0],26); - vesrlg (@m01[3],@m01[2],30); - vesrlg (@m01[2],@m01[2],4); - - vn (@m01[4],@m01[4],$mask4); - vn (@m01[$_],@m01[$_],$mask) for (0..3); - - vaf (@m01[4],@m01[4],$padvec); # pad m0 - - # acc = acc * r + m01 - - vlrepf (@r5[$_],"4*$_+68($ctx)") for (0..4); # load 5*r - vlrepf (@r[$_],"4*$_+48($ctx)") for (0..4); # load r - - vmalof (@tmp[0],@acc[4],@r5[1],@m01[0]); - vmalof (@tmp[1],@acc[4],@r5[2],@m01[1]); - vmalof (@tmp[2],@acc[4],@r5[3],@m01[2]); - vmalof (@tmp[3],@acc[4],@r5[4],@m01[3]); - vmalof (@tmp[4],@acc[4],@r[0],@m01[4]); - - vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]); - vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]); - vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]); - vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]); - vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]); - - vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]); - vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]); - vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]); - vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]); - vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]); - - vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]); - vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]); - vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]); - vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]); - vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]); - - vmalof (@acc[1],@acc[0],@r[1],@tmp[1]); - vmalof (@acc[2],@acc[0],@r[2],@tmp[2]); - vmalof (@acc[3],@acc[0],@r[3],@tmp[3]); - vmalof (@acc[4],@acc[0],@r[4],@tmp[4]); - vmalof (@acc[0],@acc[0],@r[0],@tmp[0]); - - REDUCE (); - -ALIGN (16); -LABEL (".Lvx_done"); - vstef (@acc[$_],"4*$_($ctx)",1) for (0..4); # store acc - vstef (@acc[$_],"24+4*$_($ctx)",3) 
for (0..4); - -if ($z) { - vlm ("%v8","%v15","0($sp)"); - la ($sp,"$frame($sp)"); -} else { - ld ("%f4","16*$SIZE_T+2*8($sp)"); - ld ("%f6","16*$SIZE_T+3*8($sp)"); -} - br ("%r14"); -SIZE ("poly1305_blocks_vx",".-poly1305_blocks_vx"); -} - -################ -# static void poly1305_emit_vx(void *ctx, unsigned char mac[16], -# const u32 nonce[4]) -{ -my ($ctx,$mac,$nonce) = map("%r$_",(2..4)); - -GLOBL ("poly1305_emit_vx"); -TYPE ("poly1305_emit_vx","\@function"); -ALIGN (16); -LABEL ("poly1305_emit_vx"); -if ($z) { - aghi ($sp,-$frame); - vstm ("%v8","%v15","0($sp)"); -} else { - std ("%f4","16*$SIZE_T+2*8($sp)"); - std ("%f6","16*$SIZE_T+3*8($sp)"); -} - larl ("%r5",".Lconst"); - - vlef (@acc[$_],"4*$_($ctx)",1) for (0..4); # load acc1 - vlef (@acc[$_],"24+4*$_($ctx)",3) for (0..4); # load acc2 - vlef (@r5[$_],"108+4*$_($ctx)",1) for (0..4); # load 5*r^2 - vlef (@r[$_],"88+4*$_($ctx)",1) for (0..4); # load r^2 - vlef (@r5[$_],"68+4*$_($ctx)",3) for (0..4); # load 5*r - vlef (@r[$_],"48+4*$_($ctx)",3) for (0..4); # load r - vl ($mask,"48(%r5)"); # load mask - - # acc = acc1 * r^2 + acc2 * r - - vmlof (@tmp[0],@acc[4],@r5[1]); - vmlof (@tmp[1],@acc[4],@r5[2]); - vmlof (@tmp[2],@acc[4],@r5[3]); - vmlof (@tmp[3],@acc[4],@r5[4]); - vmlof (@tmp[4],@acc[4],@r[0]); - - vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]); - vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]); - vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]); - vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]); - vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]); - - vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]); - vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]); - vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]); - vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]); - vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]); - - vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]); - vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]); - vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]); - vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]); - vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]); - - vmalof (@acc[1],@acc[0],@r[1],@tmp[1]); - vmalof 
(@acc[2],@acc[0],@r[2],@tmp[2]); - vmalof (@acc[3],@acc[0],@r[3],@tmp[3]); - vmalof (@acc[4],@acc[0],@r[4],@tmp[4]); - vmalof (@acc[0],@acc[0],@r[0],@tmp[0]); - - vzero ("%v27"); - vsumqg (@acc[$_],@acc[$_],"%v27") for (0..4); - - REDUCE (); - - vesrlg (@tmp[1],@acc[1],26); - vn (@acc[1],@acc[1],$mask); - vag (@acc[2],@acc[2],@tmp[1]); # carry 1->2 - - vesrlg (@tmp[2],@acc[2],26); - vn (@acc[2],@acc[2],$mask); - vag (@acc[3],@acc[3],@tmp[2]); # carry 2->3 - - vesrlg (@tmp[3],@acc[3],26); - vn (@acc[3],@acc[3],$mask); - vag (@acc[4],@acc[4],@tmp[3]); # carry 3->4 - - # acc -> base 2^64 - vleib ("%v30",6*8,7); - vleib ("%v29",13*8,7); - vleib ("%v28",3*8,7); - - veslg (@acc[1],@acc[1],26); - veslg (@acc[3],@acc[3],26); - vo (@acc[0],@acc[0],@acc[1]); - vo (@acc[2],@acc[2],@acc[3]); - - veslg (@acc[2],@acc[2],4); - vslb (@acc[2],@acc[2],"%v30"); # <<52 - vo (@acc[0],@acc[0],@acc[2]); - - vslb (@tmp[4],@acc[4],"%v29"); # <<104 - vo (@acc[0],@acc[0],@tmp[4]); - - vsrlb (@acc[1],@acc[4],"%v28"); # >>24 - - # acc %= 2^130-5 - vone ("%v26"); - vleig ("%v27",5,1); - vone ("%v29"); - vleig ("%v26",-4,1); - - vaq (@tmp[0],@acc[0],"%v27"); - vaccq (@tmp[1],@acc[0],"%v27"); - - vaq (@tmp[1],@tmp[1],"%v26"); - vaccq (@tmp[1],@tmp[1],@acc[1]); - - vaq (@tmp[1],@tmp[1],"%v29"); - - vn (@tmp[2],@tmp[1],@acc[0]); - vnc (@tmp[3],@tmp[0],@tmp[1]); - vo (@acc[0],@tmp[2],@tmp[3]); - - # acc += nonce - vl (@vperm[0],"64(%r5)"); - vlef (@tmp[0],"4*$_($nonce)",3-$_) for (0..3); - - vaq (@acc[0],@acc[0],@tmp[0]); - - vperm (@acc[0],@acc[0],@acc[0],@vperm[0]); - vst (@acc[0],"0($mac)"); # store mac - -if ($z) { - vlm ("%v8","%v15","0($sp)"); - la ($sp,"$frame($sp)"); -} else { - ld ("%f4","16*$SIZE_T+2*8($sp)"); - ld ("%f6","16*$SIZE_T+3*8($sp)"); -} - br ("%r14"); -SIZE ("poly1305_emit_vx",".-poly1305_emit_vx"); -} -} - -# NOVX CODE PATH -{ -################ -# static void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, -# u32 padbit) -{ -my ($ctx,$inp,$len,$padbit) = 
map("%r$_",(2..5)); - my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14)); my ($r0,$r1,$s1) = map("%r$_",(0..2)); + GLOBL ("poly1305_blocks"); TYPE ("poly1305_blocks","\@function"); ALIGN (16); LABEL ("poly1305_blocks"); -$z? srlg ($len,$len,4) :srl ($len,4); - lghi ("%r0",0); -&{$z? \&clgr:\&clr} ($len,"%r0"); - je (".Lno_data"); +LABEL (".Lpoly1305_blocks"); +&{$z? \&ltgr:\&ltr} ("%r0",$len); + jz (".Lno_data"); &{$z? \&stmg:\&stm} ("%r6","%r14","6*$SIZE_T($sp)"); - llgfr ($padbit,$padbit); # clear upper half, much needed with - # non-64-bit ABI - lg ($r0,"32($ctx)"); # load key - lg ($r1,"40($ctx)"); - - lg ($h0,"0($ctx)"); # load hash value + lg ($h0,"0($ctx)"); # load hash value lg ($h1,"8($ctx)"); lg ($h2,"16($ctx)"); +LABEL (".Lpoly1305_blocks_entry"); +if ($z) { + srlg ($len,$len,4); +} else { + srl ($len,4); +} + llgfr ($padbit,$padbit); # clear upper half, much needed with + # non-64-bit ABI + lg ($r0,"32($ctx)"); # load key + lg ($r1,"40($ctx)"); + &{$z? \&stg:\&st} ($ctx,"2*$SIZE_T($sp)"); # off-load $ctx srlg ($s1,$r1,2); algr ($s1,$r1); # s1 = r1 + r1>>2 @@ -718,21 +160,21 @@ $z? 
srlg ($len,$len,4) :srl ($len,4); ALIGN (16); LABEL (".Loop"); - lrvg ($d0lo,"0($inp)"); # load little-endian input + lrvg ($d0lo,"0($inp)"); # load little-endian input lrvg ($d1lo,"8($inp)"); la ($inp,"16($inp)"); - algr ($d0lo,$h0); # accumulate input + algr ($d0lo,$h0); # accumulate input alcgr ($d1lo,$h1); + alcgr ($h2,$padbit); lgr ($h0,$d0lo); - mlgr ($d0hi,$r0); # h0*r0 -> $d0hi:$d0lo + mlgr ($d0hi,$r0); # h0*r0 -> $d0hi:$d0lo lgr ($h1,$d1lo); - mlgr ($d1hi,$s1); # h1*5*r1 -> $d1hi:$d1lo + mlgr ($d1hi,$s1); # h1*5*r1 -> $d1hi:$d1lo - mlgr ($t0,$r1); # h0*r1 -> $t0:$h0 - mlgr ($t1,$r0); # h1*r0 -> $t1:$h1 - alcgr ($h2,$padbit); + mlgr ($t0,$r1); # h0*r1 -> $t0:$h0 + mlgr ($t1,$r0); # h1*r0 -> $t1:$h1 algr ($d0lo,$d1lo); lgr ($d1lo,$h2); @@ -742,16 +184,16 @@ LABEL (".Loop"); algr ($h1,$h0); alcgr ($t1,$t0); - msgr ($d1lo,$s1); # h2*s1 - msgr ($h2,$r0); # h2*r0 + msgr ($d1lo,$s1); # h2*s1 + msgr ($h2,$r0); # h2*r0 algr ($h1,$d1lo); - alcgr ($t1,$d1hi); # $d1hi is zero + alcgr ($t1,$d1hi); # $d1hi is zero algr ($h1,$d0hi); alcgr ($h2,$t1); - lghi ($h0,-4); # final reduction step + lghi ($h0,-4); # final reduction step ngr ($h0,$h2); srlg ($t0,$h2,2); algr ($h0,$t0); @@ -759,14 +201,14 @@ LABEL (".Loop"); ngr ($h2,$t1); algr ($h0,$d0lo); - alcgr ($h1,$d1hi); # $d1hi is still zero - alcgr ($h2,$d1hi); # $d1hi is still zero + alcgr ($h1,$d1hi); # $d1hi is still zero + alcgr ($h2,$d1hi); # $d1hi is still zero &{$z? \&brctg:\&brct} ($len,".Loop"); &{$z? 
\&lg:\&l} ($ctx,"2*$SIZE_T($sp)");# restore $ctx - stg ($h0,"0($ctx)"); # store hash value + stg ($h0,"0($ctx)"); # store hash value stg ($h1,"8($ctx)"); stg ($h2,"16($ctx)"); @@ -776,68 +218,767 @@ LABEL (".Lno_data"); SIZE ("poly1305_blocks",".-poly1305_blocks"); } +################ +# static void poly1305_blocks_vx(void *ctx, const unsigned char *inp, +# size_t len, u32 padbit) +{ +my ($H0, $H1, $H2, $H3, $H4) = map("%v$_",(0..4)); +my ($I0, $I1, $I2, $I3, $I4) = map("%v$_",(5..9)); +my ($R0, $R1, $S1, $R2, $S2) = map("%v$_",(10..14)); +my ($R3, $S3, $R4, $S4) = map("%v$_",(15..18)); +my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("%v$_",(19..23)); +my ($T1, $T2, $T3, $T4) = map("%v$_",(24..27)); +my ($mask26,$bswaplo,$bswaphi,$bswapmi) = map("%v$_",(28..31)); + +my ($d2,$d0,$h0,$d1,$h1,$h2)=map("%r$_",(9..14)); + +TYPE ("poly1305_blocks_vx","\@function"); +ALIGN (16); +LABEL ("poly1305_blocks_vx"); +LABEL (".Lpoly1305_blocks_vx"); +&{$z? \&clgfi:\&clfi} ($len,128); + jhe ("__poly1305_blocks_vx"); + +&{$z? 
\&stmg:\&stm} ("%r6","%r14","6*$SIZE_T($sp)"); + + lg ($d0,"0($ctx)"); + lg ($d1,"8($ctx)"); + lg ($d2,"16($ctx)"); + + llgfr ("%r0",$d0); # base 2^26 -> base 2^64 + srlg ($h0,$d0,32); + llgfr ("%r1",$d1); + srlg ($h1,$d1,32); + srlg ($h2,$d2,32); + + sllg ("%r0","%r0",26); + algr ($h0,"%r0"); + sllg ("%r0",$h1,52); + srlg ($h1,$h1,12); + sllg ("%r1","%r1",14); + algr ($h0,"%r0"); + alcgr ($h1,"%r1"); + sllg ("%r0",$h2,40); + srlg ($h2,$h2,24); + lghi ("%r1",0); + algr ($h1,"%r0"); + alcgr ($h2,"%r1"); + + llgf ("%r0","24($ctx)"); # is_base2_26 + lcgr ("%r0","%r0"); + + xgr ($h0,$d0); # choose between radixes + xgr ($h1,$d1); + xgr ($h2,$d2); + ngr ($h0,"%r0"); + ngr ($h1,"%r0"); + ngr ($h2,"%r0"); + xgr ($h0,$d0); + xgr ($h1,$d1); + xgr ($h2,$d2); + + lhi ("%r0",0); + st ("%r0","24($ctx)"); # clear is_base2_26 + + j (".Lpoly1305_blocks_entry"); +SIZE ("poly1305_blocks_vx",".-poly1305_blocks_vx"); + +TYPE ("__poly1305_mul","\@function"); +ALIGN (16); +LABEL ("__poly1305_mul"); + vmlof ($ACC0,$H0,$R0); + vmlof ($ACC1,$H0,$R1); + vmlof ($ACC2,$H0,$R2); + vmlof ($ACC3,$H0,$R3); + vmlof ($ACC4,$H0,$R4); + + vmalof ($ACC0,$H1,$S4,$ACC0); + vmalof ($ACC1,$H1,$R0,$ACC1); + vmalof ($ACC2,$H1,$R1,$ACC2); + vmalof ($ACC3,$H1,$R2,$ACC3); + vmalof ($ACC4,$H1,$R3,$ACC4); + + vmalof ($ACC0,$H2,$S3,$ACC0); + vmalof ($ACC1,$H2,$S4,$ACC1); + vmalof ($ACC2,$H2,$R0,$ACC2); + vmalof ($ACC3,$H2,$R1,$ACC3); + vmalof ($ACC4,$H2,$R2,$ACC4); + + vmalof ($ACC0,$H3,$S2,$ACC0); + vmalof ($ACC1,$H3,$S3,$ACC1); + vmalof ($ACC2,$H3,$S4,$ACC2); + vmalof ($ACC3,$H3,$R0,$ACC3); + vmalof ($ACC4,$H3,$R1,$ACC4); + + vmalof ($ACC0,$H4,$S1,$ACC0); + vmalof ($ACC1,$H4,$S2,$ACC1); + vmalof ($ACC2,$H4,$S3,$ACC2); + vmalof ($ACC3,$H4,$S4,$ACC3); + vmalof ($ACC4,$H4,$R0,$ACC4); + + ################################################################ + # lazy reduction + + vesrlg ($H4,$ACC3,26); + vesrlg ($H1,$ACC0,26); + vn ($H3,$ACC3,$mask26); + vn ($H0,$ACC0,$mask26); + vag ($H4,$H4,$ACC4); # h3 -> h4 + vag 
($H1,$H1,$ACC1); # h0 -> h1 + vesrlg ($ACC4,$H4,26); + vesrlg ($ACC1,$H1,26); + vn ($H4,$H4,$mask26); + vn ($H1,$H1,$mask26); + vag ($H0,$H0,$ACC4); + vag ($H2,$ACC2,$ACC1); # h1 -> h2 + + veslg ($ACC4,$ACC4,2); # <<2 + vesrlg ($ACC2,$H2,26); + vn ($H2,$H2,$mask26); + vag ($H0,$H0,$ACC4); # h4 -> h0 + vag ($H3,$H3,$ACC2); # h2 -> h3 + + vesrlg ($ACC0,$H0,26); + vesrlg ($ACC3,$H3,26); + vn ($H0,$H0,$mask26); + vn ($H3,$H3,$mask26); + vag ($H1,$H1,$ACC0); # h0 -> h1 + vag ($H4,$H4,$ACC3); # h3 -> h4 + br ("%r14"); +SIZE ("__poly1305_mul",".-__poly1305_mul"); + +TYPE ("__poly1305_blocks_vx","\@function"); +ALIGN (16); +LABEL ("__poly1305_blocks_vx"); +&{$z? \&lgr:\&lr} ("%r0",$sp); +&{$z? \&stmg:\&stm} ("%r10","%r15","10*$SIZE_T($sp)"); +if (!$z) { + std ("%f4","16*$SIZE_T+2*8($sp)"); + std ("%f6","16*$SIZE_T+3*8($sp)"); + ahi ($sp,-$stdframe); + st ("%r0","0($sp)"); # back-chain + + llgfr ($len,$len); # so that srlg works on $len +} else { + aghi ($sp,"-($stdframe+8*8)"); + stg ("%r0","0($sp)"); # back-chain + + std ("%f8","$stdframe+0*8($sp)"); + std ("%f9","$stdframe+1*8($sp)"); + std ("%f10","$stdframe+2*8($sp)"); + std ("%f11","$stdframe+3*8($sp)"); + std ("%f12","$stdframe+4*8($sp)"); + std ("%f13","$stdframe+5*8($sp)"); + std ("%f14","$stdframe+6*8($sp)"); + std ("%f15","$stdframe+7*8($sp)"); +} + larl ("%r1",".Lconst"); + vgmg ($mask26,38,63); + vlm ($bswaplo,$bswapmi,"16(%r1)"); + + &lt ("%r0","24($ctx)"); # is_base2_26? 
+ jnz (".Lskip_init"); + + lg ($h0,"32($ctx)"); # load key base 2^64 + lg ($h1,"40($ctx)"); + + risbg ($d0,$h0,38,0x80+63,38); # base 2^64 -> 2^26 + srlg ($d1,$h0,52); + risbg ($h0,$h0,38,0x80+63,0); + vlvgg ($R0,$h0,0); + risbg ($d1,$h1,38,51,12); + vlvgg ($R1,$d0,0); + risbg ($d0,$h1,38,63,50); + vlvgg ($R2,$d1,0); + srlg ($d1,$h1,40); + vlvgg ($R3,$d0,0); + vlvgg ($R4,$d1,0); + + veslg ($S1,$R1,2); + veslg ($S2,$R2,2); + veslg ($S3,$R3,2); + veslg ($S4,$R4,2); + vlr ($H0,$R0); + vlr ($H1,$R1); + vlr ($H2,$R2); + vlr ($H3,$R3); + vlr ($H4,$R4); + vag ($S1,$S1,$R1); # * 5 + vag ($S2,$S2,$R2); + vag ($S3,$S3,$R3); + vag ($S4,$S4,$R4); + + brasl ("%r14","__poly1305_mul"); # r^1:- * r^1:- + + vpdi ($R0,$H0,$R0,0); # r^2:r^1 + vpdi ($R1,$H1,$R1,0); + vpdi ($R2,$H2,$R2,0); + vpdi ($R3,$H3,$R3,0); + vpdi ($R4,$H4,$R4,0); + vpdi ($H0,$H0,$H0,0); # r^2:r^2 + vpdi ($H1,$H1,$H1,0); + vpdi ($H2,$H2,$H2,0); + vpdi ($H3,$H3,$H3,0); + vpdi ($H4,$H4,$H4,0); + veslg ($S1,$R1,2); + veslg ($S2,$R2,2); + veslg ($S3,$R3,2); + veslg ($S4,$R4,2); + vag ($S1,$S1,$R1); # * 5 + vag ($S2,$S2,$R2); + vag ($S3,$S3,$R3); + vag ($S4,$S4,$R4); + + brasl ("%r14,__poly1305_mul"); # r^2:r^2 * r^2:r^1 + + vl ($I0,"0(%r1)"); # borrow $I0 + vperm ($R0,$R0,$H0,$I0); # r^2:r^4:r^1:r^3 + vperm ($R1,$R1,$H1,$I0); + vperm ($R2,$R2,$H2,$I0); + vperm ($R3,$R3,$H3,$I0); + vperm ($R4,$R4,$H4,$I0); + veslf ($S1,$R1,2); + veslf ($S2,$R2,2); + veslf ($S3,$R3,2); + veslf ($S4,$R4,2); + vaf ($S1,$S1,$R1); # * 5 + vaf ($S2,$S2,$R2); + vaf ($S3,$S3,$R3); + vaf ($S4,$S4,$R4); + + lg ($h0,"0($ctx)"); # load hash base 2^64 + lg ($h1,"8($ctx)"); + lg ($h2,"16($ctx)"); + + vzero ($H0); + vzero ($H1); + vzero ($H2); + vzero ($H3); + vzero ($H4); + + risbg ($d0,$h0,38,0x80+63,38); # base 2^64 -> 2^26 + srlg ($d1,$h0,52); + risbg ($h0,$h0,38,0x80+63,0); + vlvgg ($H0,$h0,0); + risbg ($d1,$h1,38,51,12); + vlvgg ($H1,$d0,0); + risbg ($d0,$h1,38,63,50); + vlvgg ($H2,$d1,0); + srlg ($d1,$h1,40); + vlvgg ($H3,$d0,0); + risbg 
($d1,$h2,37,39,24); + vlvgg ($H4,$d1,0); + + lhi ("%r0",1); + st ("%r0","24($ctx)"); # set is_base2_26 + + vstm ($R0,$S4,"48($ctx)"); # save key schedule base 2^26 + + vpdi ($R0,$R0,$R0,0); # broadcast r^2:r^4 + vpdi ($R1,$R1,$R1,0); + vpdi ($S1,$S1,$S1,0); + vpdi ($R2,$R2,$R2,0); + vpdi ($S2,$S2,$S2,0); + vpdi ($R3,$R3,$R3,0); + vpdi ($S3,$S3,$S3,0); + vpdi ($R4,$R4,$R4,0); + vpdi ($S4,$S4,$S4,0); + + j (".Loaded_hash"); + +ALIGN (16); +LABEL (".Lskip_init"); + vllezf ($H0,"0($ctx)"); # load hash base 2^26 + vllezf ($H1,"4($ctx)"); + vllezf ($H2,"8($ctx)"); + vllezf ($H3,"12($ctx)"); + vllezf ($H4,"16($ctx)"); + + vlrepg ($R0,"0x30($ctx)"); # broadcast r^2:r^4 + vlrepg ($R1,"0x40($ctx)"); + vlrepg ($S1,"0x50($ctx)"); + vlrepg ($R2,"0x60($ctx)"); + vlrepg ($S2,"0x70($ctx)"); + vlrepg ($R3,"0x80($ctx)"); + vlrepg ($S3,"0x90($ctx)"); + vlrepg ($R4,"0xa0($ctx)"); + vlrepg ($S4,"0xb0($ctx)"); + +LABEL (".Loaded_hash"); + vzero ($I1); + vzero ($I3); + + vlm ($T1,$T4,"0x00($inp)"); # load first input block + la ($inp,"0x40($inp)"); + vgmg ($mask26,6,31); + vgmf ($I4,5,5); # padbit<<2 + + vperm ($I0,$T3,$T4,$bswaplo); + vperm ($I2,$T3,$T4,$bswapmi); + vperm ($T3,$T3,$T4,$bswaphi); + + verimg ($I1,$I0,$mask26,6); # >>26 + veslg ($I0,$I0,32); + veslg ($I2,$I2,28); # >>4 + verimg ($I3,$T3,$mask26,18); # >>14 + verimg ($I4,$T3,$mask26,58); # >>38 + vn ($I0,$I0,$mask26); + vn ($I2,$I2,$mask26); + vesrlf ($I4,$I4,2); # >>2 + + vgmg ($mask26,38,63); + vperm ($T3,$T1,$T2,$bswaplo); + vperm ($T4,$T1,$T2,$bswaphi); + vperm ($T2,$T1,$T2,$bswapmi); + + verimg ($I0,$T3,$mask26,0); + verimg ($I1,$T3,$mask26,38); # >>26 + verimg ($I2,$T2,$mask26,60); # >>4 + verimg ($I3,$T4,$mask26,50); # >>14 + vesrlg ($T4,$T4,40); + vo ($I4,$I4,$T4); + + srlg ("%r0",$len,6); +&{$z? 
\&aghi:\&ahi} ("%r0",-1); + +ALIGN (16); +LABEL (".Loop_vx"); + vmlef ($ACC0,$I0,$R0); + vmlef ($ACC1,$I0,$R1); + vmlef ($ACC2,$I0,$R2); + vmlef ($ACC3,$I0,$R3); + vmlef ($ACC4,$I0,$R4); + + vmalef ($ACC0,$I1,$S4,$ACC0); + vmalef ($ACC1,$I1,$R0,$ACC1); + vmalef ($ACC2,$I1,$R1,$ACC2); + vmalef ($ACC3,$I1,$R2,$ACC3); + vmalef ($ACC4,$I1,$R3,$ACC4); + + vaf ($H2,$H2,$I2); + vaf ($H0,$H0,$I0); + vaf ($H3,$H3,$I3); + vaf ($H1,$H1,$I1); + vaf ($H4,$H4,$I4); + + vmalef ($ACC0,$I2,$S3,$ACC0); + vmalef ($ACC1,$I2,$S4,$ACC1); + vmalef ($ACC2,$I2,$R0,$ACC2); + vmalef ($ACC3,$I2,$R1,$ACC3); + vmalef ($ACC4,$I2,$R2,$ACC4); + + vlm ($T1,$T4,"0x00($inp)"); # load next input block + la ($inp,"0x40($inp)"); + vgmg ($mask26,6,31); + + vmalef ($ACC0,$I3,$S2,$ACC0); + vmalef ($ACC1,$I3,$S3,$ACC1); + vmalef ($ACC2,$I3,$S4,$ACC2); + vmalef ($ACC3,$I3,$R0,$ACC3); + vmalef ($ACC4,$I3,$R1,$ACC4); + + vperm ($I0,$T3,$T4,$bswaplo); + vperm ($I2,$T3,$T4,$bswapmi); + vperm ($T3,$T3,$T4,$bswaphi); + + vmalef ($ACC0,$I4,$S1,$ACC0); + vmalef ($ACC1,$I4,$S2,$ACC1); + vmalef ($ACC2,$I4,$S3,$ACC2); + vmalef ($ACC3,$I4,$S4,$ACC3); + vmalef ($ACC4,$I4,$R0,$ACC4); + + verimg ($I1,$I0,$mask26,6); # >>26 + veslg ($I0,$I0,32); + veslg ($I2,$I2,28); # >>4 + verimg ($I3,$T3,$mask26,18); # >>14 + + vmalof ($ACC0,$H0,$R0,$ACC0); + vmalof ($ACC1,$H0,$R1,$ACC1); + vmalof ($ACC2,$H0,$R2,$ACC2); + vmalof ($ACC3,$H0,$R3,$ACC3); + vmalof ($ACC4,$H0,$R4,$ACC4); + + vgmf ($I4,5,5); # padbit<<2 + verimg ($I4,$T3,$mask26,58); # >>38 + vn ($I0,$I0,$mask26); + vn ($I2,$I2,$mask26); + vesrlf ($I4,$I4,2); # >>2 + + vmalof ($ACC0,$H1,$S4,$ACC0); + vmalof ($ACC1,$H1,$R0,$ACC1); + vmalof ($ACC2,$H1,$R1,$ACC2); + vmalof ($ACC3,$H1,$R2,$ACC3); + vmalof ($ACC4,$H1,$R3,$ACC4); + + vgmg ($mask26,38,63); + vperm ($T3,$T1,$T2,$bswaplo); + vperm ($T4,$T1,$T2,$bswaphi); + vperm ($T2,$T1,$T2,$bswapmi); + + vmalof ($ACC0,$H2,$S3,$ACC0); + vmalof ($ACC1,$H2,$S4,$ACC1); + vmalof ($ACC2,$H2,$R0,$ACC2); + vmalof ($ACC3,$H2,$R1,$ACC3); + 
vmalof ($ACC4,$H2,$R2,$ACC4); + + verimg ($I0,$T3,$mask26,0); + verimg ($I1,$T3,$mask26,38); # >>26 + verimg ($I2,$T2,$mask26,60); # >>4 + + vmalof ($ACC0,$H3,$S2,$ACC0); + vmalof ($ACC1,$H3,$S3,$ACC1); + vmalof ($ACC2,$H3,$S4,$ACC2); + vmalof ($ACC3,$H3,$R0,$ACC3); + vmalof ($ACC4,$H3,$R1,$ACC4); + + verimg ($I3,$T4,$mask26,50); # >>14 + vesrlg ($T4,$T4,40); + vo ($I4,$I4,$T4); + + vmalof ($ACC0,$H4,$S1,$ACC0); + vmalof ($ACC1,$H4,$S2,$ACC1); + vmalof ($ACC2,$H4,$S3,$ACC2); + vmalof ($ACC3,$H4,$S4,$ACC3); + vmalof ($ACC4,$H4,$R0,$ACC4); + + ################################################################ + # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + # and P. Schwabe + + vesrlg ($H4,$ACC3,26); + vesrlg ($H1,$ACC0,26); + vn ($H3,$ACC3,$mask26); + vn ($H0,$ACC0,$mask26); + vag ($H4,$H4,$ACC4); # h3 -> h4 + vag ($H1,$H1,$ACC1); # h0 -> h1 + + vesrlg ($ACC4,$H4,26); + vesrlg ($ACC1,$H1,26); + vn ($H4,$H4,$mask26); + vn ($H1,$H1,$mask26); + vag ($H0,$H0,$ACC4); + vag ($H2,$ACC2,$ACC1); # h1 -> h2 + + veslg ($ACC4,$ACC4,2); # <<2 + vesrlg ($ACC2,$H2,26); + vn ($H2,$H2,$mask26); + vag ($H0,$H0,$ACC4); # h4 -> h0 + vag ($H3,$H3,$ACC2); # h2 -> h3 + + vesrlg ($ACC0,$H0,26); + vesrlg ($ACC3,$H3,26); + vn ($H0,$H0,$mask26); + vn ($H3,$H3,$mask26); + vag ($H1,$H1,$ACC0); # h0 -> h1 + vag ($H4,$H4,$ACC3); # h3 -> h4 + +&{$z? \&brctg:\&brct} ("%r0",".Loop_vx"); + + vlm ($R0,$S4,"48($ctx)"); # load all powers + + lghi ("%r0",0x30); +&{$z? \&lcgr:\&lcr} ($len,$len); +&{$z? \&ngr:\&nr} ($len,"%r0"); +&{$z? 
\&slgr:\&slr} ($inp,$len); + +LABEL (".Last"); + vmlef ($ACC0,$I0,$R0); + vmlef ($ACC1,$I0,$R1); + vmlef ($ACC2,$I0,$R2); + vmlef ($ACC3,$I0,$R3); + vmlef ($ACC4,$I0,$R4); + + vmalef ($ACC0,$I1,$S4,$ACC0); + vmalef ($ACC1,$I1,$R0,$ACC1); + vmalef ($ACC2,$I1,$R1,$ACC2); + vmalef ($ACC3,$I1,$R2,$ACC3); + vmalef ($ACC4,$I1,$R3,$ACC4); + + vaf ($H0,$H0,$I0); + vaf ($H1,$H1,$I1); + vaf ($H2,$H2,$I2); + vaf ($H3,$H3,$I3); + vaf ($H4,$H4,$I4); + + vmalef ($ACC0,$I2,$S3,$ACC0); + vmalef ($ACC1,$I2,$S4,$ACC1); + vmalef ($ACC2,$I2,$R0,$ACC2); + vmalef ($ACC3,$I2,$R1,$ACC3); + vmalef ($ACC4,$I2,$R2,$ACC4); + + vmalef ($ACC0,$I3,$S2,$ACC0); + vmalef ($ACC1,$I3,$S3,$ACC1); + vmalef ($ACC2,$I3,$S4,$ACC2); + vmalef ($ACC3,$I3,$R0,$ACC3); + vmalef ($ACC4,$I3,$R1,$ACC4); + + vmalef ($ACC0,$I4,$S1,$ACC0); + vmalef ($ACC1,$I4,$S2,$ACC1); + vmalef ($ACC2,$I4,$S3,$ACC2); + vmalef ($ACC3,$I4,$S4,$ACC3); + vmalef ($ACC4,$I4,$R0,$ACC4); + + vmalof ($ACC0,$H0,$R0,$ACC0); + vmalof ($ACC1,$H0,$R1,$ACC1); + vmalof ($ACC2,$H0,$R2,$ACC2); + vmalof ($ACC3,$H0,$R3,$ACC3); + vmalof ($ACC4,$H0,$R4,$ACC4); + + vmalof ($ACC0,$H1,$S4,$ACC0); + vmalof ($ACC1,$H1,$R0,$ACC1); + vmalof ($ACC2,$H1,$R1,$ACC2); + vmalof ($ACC3,$H1,$R2,$ACC3); + vmalof ($ACC4,$H1,$R3,$ACC4); + + vmalof ($ACC0,$H2,$S3,$ACC0); + vmalof ($ACC1,$H2,$S4,$ACC1); + vmalof ($ACC2,$H2,$R0,$ACC2); + vmalof ($ACC3,$H2,$R1,$ACC3); + vmalof ($ACC4,$H2,$R2,$ACC4); + + vmalof ($ACC0,$H3,$S2,$ACC0); + vmalof ($ACC1,$H3,$S3,$ACC1); + vmalof ($ACC2,$H3,$S4,$ACC2); + vmalof ($ACC3,$H3,$R0,$ACC3); + vmalof ($ACC4,$H3,$R1,$ACC4); + + vmalof ($ACC0,$H4,$S1,$ACC0); + vmalof ($ACC1,$H4,$S2,$ACC1); + vmalof ($ACC2,$H4,$S3,$ACC2); + vmalof ($ACC3,$H4,$S4,$ACC3); + vmalof ($ACC4,$H4,$R0,$ACC4); + + ################################################################ + # horizontal addition + + vzero ($H0); + vsumqg ($ACC0,$ACC0,$H0); + vsumqg ($ACC1,$ACC1,$H0); + vsumqg ($ACC2,$ACC2,$H0); + vsumqg ($ACC3,$ACC3,$H0); + vsumqg ($ACC4,$ACC4,$H0); + + 
################################################################ + # lazy reduction + + vesrlg ($H4,$ACC3,26); + vesrlg ($H1,$ACC0,26); + vn ($H3,$ACC3,$mask26); + vn ($H0,$ACC0,$mask26); + vag ($H4,$H4,$ACC4); # h3 -> h4 + vag ($H1,$H1,$ACC1); # h0 -> h1 + + vesrlg ($ACC4,$H4,26); + vesrlg ($ACC1,$H1,26); + vn ($H4,$H4,$mask26); + vn ($H1,$H1,$mask26); + vag ($H0,$H0,$ACC4); + vag ($H2,$ACC2,$ACC1); # h1 -> h2 + + veslg ($ACC4,$ACC4,2); # <<2 + vesrlg ($ACC2,$H2,26); + vn ($H2,$H2,$mask26); + vag ($H0,$H0,$ACC4); # h4 -> h0 + vag ($H3,$H3,$ACC2); # h2 -> h3 + + vesrlg ($ACC0,$H0,26); + vesrlg ($ACC3,$H3,26); + vn ($H0,$H0,$mask26); + vn ($H3,$H3,$mask26); + vag ($H1,$H1,$ACC0); # h0 -> h1 + vag ($H4,$H4,$ACC3); # h3 -> h4 + +&{$z? \&clgfi:\&clfi} ($len,0); + je (".Ldone"); + + vlm ($T1,$T4,"0x00($inp)"); # load last partial block + vgmg ($mask26,6,31); + vgmf ($I4,5,5); # padbit<<2 + + vperm ($I0,$T3,$T4,$bswaplo); + vperm ($I2,$T3,$T4,$bswapmi); + vperm ($T3,$T3,$T4,$bswaphi); + + vl ($ACC0,"0x30($len,%r1)"); # borrow $ACC0,1 + vl ($ACC1,"0x60($len,%r1)"); + + verimg ($I1,$I0,$mask26,6); # >>26 + veslg ($I0,$I0,32); + veslg ($I2,$I2,28); # >>4 + verimg ($I3,$T3,$mask26,18); # >>14 + verimg ($I4,$T3,$mask26,58); # >>38 + vn ($I0,$I0,$mask26); + vn ($I2,$I2,$mask26); + vesrlf ($I4,$I4,2); # >>2 + + vgmg ($mask26,38,63); + vperm ($T3,$T1,$T2,$bswaplo); + vperm ($T4,$T1,$T2,$bswaphi); + vperm ($T2,$T1,$T2,$bswapmi); + + verimg ($I0,$T3,$mask26,0); + verimg ($I1,$T3,$mask26,38); # >>26 + verimg ($I2,$T2,$mask26,60); # >>4 + verimg ($I3,$T4,$mask26,50); # >>14 + vesrlg ($T4,$T4,40); + vo ($I4,$I4,$T4); + + vperm ($H0,$H0,$H0,$ACC0); # move hash to right lane + vn ($I0,$I0,$ACC1); # mask redundant lane[s] + vperm ($H1,$H1,$H1,$ACC0); + vn ($I1,$I1,$ACC1); + vperm ($H2,$H2,$H2,$ACC0); + vn ($I2,$I2,$ACC1); + vperm ($H3,$H3,$H3,$ACC0); + vn ($I3,$I3,$ACC1); + vperm ($H4,$H4,$H4,$ACC0); + vn ($I4,$I4,$ACC1); + + vaf ($I0,$I0,$H0); # accumulate hash + vzero ($H0); # wipe 
hash value + vaf ($I1,$I1,$H1); + vzero ($H1); + vaf ($I2,$I2,$H2); + vzero ($H2); + vaf ($I3,$I3,$H3); + vzero ($H3); + vaf ($I4,$I4,$H4); + vzero ($H4); + +&{$z? \&lghi:\&lhi} ($len,0); + j (".Last"); + # I don't bother to tell apart cases when only one multiplication + # pass is sufficient, because I argue that mispredicted branch + # penalties are comparable to overhead of sometimes redundant + # multiplication pass... + +LABEL (".Ldone"); + vstef ($H0,"0($ctx)",3); # store hash base 2^26 + vstef ($H1,"4($ctx)",3); + vstef ($H2,"8($ctx)",3); + vstef ($H3,"12($ctx)",3); + vstef ($H4,"16($ctx)",3); + +if ($z) { + ld ("%f8","$stdframe+0*8($sp)"); + ld ("%f9","$stdframe+1*8($sp)"); + ld ("%f10","$stdframe+2*8($sp)"); + ld ("%f11","$stdframe+3*8($sp)"); + ld ("%f12","$stdframe+4*8($sp)"); + ld ("%f13","$stdframe+5*8($sp)"); + ld ("%f14","$stdframe+6*8($sp)"); + ld ("%f15","$stdframe+7*8($sp)"); +&{$z? \&lmg:\&lm} ("%r10","%r15","$stdframe+8*8+10*$SIZE_T($sp)"); +} else { + ld ("%f4","$stdframe+16*$SIZE_T+2*8($sp)"); + ld ("%f6","$stdframe+16*$SIZE_T+3*8($sp)"); +&{$z? \&lmg:\&lm} ("%r10","%r15","$stdframe+10*$SIZE_T($sp)"); +} + br ("%r14"); +SIZE ("__poly1305_blocks_vx",".-__poly1305_blocks_vx"); +} + ################ # static void poly1305_emit(void *ctx, unsigned char mac[16], # const u32 nonce[4]) { -my ($ctx,$mac,$nonce) = map("%r$_",(2..4)); -my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9)); +my ($mac,$nonce)=($inp,$len); +my ($h0,$h1,$h2,$d0,$d1,$d2)=map("%r$_",(5..10)); GLOBL ("poly1305_emit"); TYPE ("poly1305_emit","\@function"); ALIGN (16); LABEL ("poly1305_emit"); -&{$z? \&stmg:\&stm} ("%r6","%r9","6*$SIZE_T($sp)"); +LABEL (".Lpoly1305_emit"); +&{$z? 
\&stmg:\&stm} ("%r6","%r10","6*$SIZE_T($sp)"); + + lg ($d0,"0($ctx)"); + lg ($d1,"8($ctx)"); + lg ($d2,"16($ctx)"); + + llgfr ("%r0",$d0); # base 2^26 -> base 2^64 + srlg ($h0,$d0,32); + llgfr ("%r1",$d1); + srlg ($h1,$d1,32); + srlg ($h2,$d2,32); + + sllg ("%r0","%r0",26); + algr ($h0,"%r0"); + sllg ("%r0",$h1,52); + srlg ($h1,$h1,12); + sllg ("%r1","%r1",14); + algr ($h0,"%r0"); + alcgr ($h1,"%r1"); + sllg ("%r0",$h2,40); + srlg ($h2,$h2,24); + lghi ("%r1",0); + algr ($h1,"%r0"); + alcgr ($h2,"%r1"); - lg ($h0,"0($ctx)"); - lg ($h1,"8($ctx)"); - lg ($h2,"16($ctx)"); + llgf ("%r0","24($ctx)"); # is_base2_26 + lcgr ("%r0","%r0"); + + xgr ($h0,$d0); # choose between radixes + xgr ($h1,$d1); + xgr ($h2,$d2); + ngr ($h0,"%r0"); + ngr ($h1,"%r0"); + ngr ($h2,"%r0"); + xgr ($h0,$d0); + xgr ($h1,$d1); + xgr ($h2,$d2); lghi ("%r0",5); - lghi ("%r1",0); lgr ($d0,$h0); lgr ($d1,$h1); - algr ($h0,"%r0"); # compare to modulus + algr ($h0,"%r0"); # compare to modulus alcgr ($h1,"%r1"); alcgr ($h2,"%r1"); - srlg ($h2,$h2,2); # did it borrow/carry? - slgr ("%r1",$h2); # 0-$h2>>2 - lg ($h2,"0($nonce)"); # load nonce - lghi ("%r0",-1); + srlg ($h2,$h2,2); # did it borrow/carry? + slgr ("%r1",$h2); # 0-$h2>>2 + lg ($d2,"0($nonce)"); # load nonce lg ($ctx,"8($nonce)"); - xgr ("%r0","%r1"); # ~%r1 + xgr ($h0,$d0); + xgr ($h1,$d1); ngr ($h0,"%r1"); - ngr ($d0,"%r0"); ngr ($h1,"%r1"); - ngr ($d1,"%r0"); - ogr ($h0,$d0); - rllg ($d0,$h2,32); # flip nonce words - ogr ($h1,$d1); + xgr ($h0,$d0); + rllg ($d0,$d2,32); # flip nonce words + xgr ($h1,$d1); rllg ($d1,$ctx,32); - algr ($h0,$d0); # accumulate nonce + algr ($h0,$d0); # accumulate nonce alcgr ($h1,$d1); - strvg ($h0,"0($mac)"); # write little-endian result + strvg ($h0,"0($mac)"); # write little-endian result strvg ($h1,"8($mac)"); -&{$z? \&lmg:\&lm} ("%r6","%r9","6*$SIZE_T($sp)"); +&{$z? 
\&lmg:\&lm} ("%r6","%r10","6*$SIZE_T($sp)"); br ("%r14"); SIZE ("poly1305_emit",".-poly1305_emit"); } -} + ################ -ALIGN (128); +ALIGN (16); LABEL (".Lconst"); -LONG (0x00060504,0x03020100,0x00161514,0x13121110); # vperm op[m[1],m[0]] -LONG (0x000c0b0a,0x09080706,0x001c1b1a,0x19181716); # vperm op[m[3],m[2]] -LONG (0x00000000,0x000f0e0d,0x00000000,0x001f1e1d); # vperm op[ - ,m[4]] -LONG (0x00000000,0x03ffffff,0x00000000,0x03ffffff); # [0,2^26-1,0,2^26-1] -LONG (0x0f0e0d0c,0x0b0a0908,0x07060504,0x03020100); # vperm op endian +LONG (0x04050607,0x14151617,0x0c0d0e0f,0x1c1d1e1f); # merge odd +LONG (0x07060504,0x03020100,0x17161514,0x13121110); # byte swap masks +LONG (0x0f0e0d0c,0x0b0a0908,0x1f1e1d1c,0x1b1a1918); +LONG (0x00000000,0x09080706,0x00000000,0x19181716); + +LONG (0x00000000,0x00000000,0x00000000,0x0c0d0e0f); # magic tail masks +LONG (0x0c0d0e0f,0x00000000,0x00000000,0x00000000); +LONG (0x00000000,0x00000000,0x0c0d0e0f,0x00000000); + +LONG (0xffffffff,0x00000000,0xffffffff,0xffffffff); +LONG (0xffffffff,0x00000000,0xffffffff,0x00000000); +LONG (0x00000000,0x00000000,0xffffffff,0x00000000); + STRING ("\"Poly1305 for s390x, CRYPTOGAMS by \""); PERLASM_END(); diff --git a/crypto/poly1305/build.info b/crypto/poly1305/build.info index de44bb8803..60eeeeec9d 100644 --- a/crypto/poly1305/build.info +++ b/crypto/poly1305/build.info @@ -18,4 +18,5 @@ INCLUDE[poly1305-armv8.o]=.. GENERATE[poly1305-mips.S]=asm/poly1305-mips.pl $(PERLASM_SCHEME) INCLUDE[poly1305-mips.o]=.. GENERATE[poly1305-c64xplus.S]=asm/poly1305-c64xplus.pl $(PERLASM_SCHEME) +INCLUDE[poly1305-s390x.o]=.. GENERATE[poly1305-s390x.S]=asm/poly1305-s390x.pl $(PERLASM_SCHEME) -- 2.25.1