From: Andy Polyakov Date: Tue, 13 Feb 2018 20:59:15 +0000 (+0100) Subject: sha/asm/keccak1600-armv8.pl: add hardware-assisted ARMv8.2 subroutines. X-Git-Tag: OpenSSL_1_1_1-pre2~131 X-Git-Url: https://git.librecmc.org/?a=commitdiff_plain;h=b761ff4e77f74f91d8694964039af24ead3c2467;p=oweals%2Fopenssl.git sha/asm/keccak1600-armv8.pl: add hardware-assisted ARMv8.2 subroutines. Reviewed-by: Rich Salz (Merged from https://github.com/openssl/openssl/pull/5358) --- diff --git a/crypto/sha/asm/keccak1600-armv8.pl b/crypto/sha/asm/keccak1600-armv8.pl index 5e4cf77003..a32b8bae84 100755 --- a/crypto/sha/asm/keccak1600-armv8.pl +++ b/crypto/sha/asm/keccak1600-armv8.pl @@ -1,5 +1,5 @@ #!/usr/bin/env perl -# Copyright 2017 The OpenSSL Project Authors. All Rights Reserved. +# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy @@ -26,6 +26,19 @@ # it gets compensated by availability of rotate. Not to forget that # most processors achieve higher issue rate with scalar instructions. # +# February 2018. +# +# Add hardware-assisted ARMv8.2 implementation. It's the KECCAK_1X_ALT +# variant with register permutation/rotation twist that makes it possible to +# eliminate copies to temporary registers. If you look closely you'll +# notice that it uses only one lane of vector registers. The new +# instructions effectively facilitate parallel hashing, which we don't +# support [yet?]. But the lowest-level core procedure is prepared for it. +# The inner round is 67 [vector] instructions, so it's not actually +# obvious that it will provide performance improvement [in serial +# hash] as long as vector instructions issue rate is limited to 1 per +# cycle... +# ###################################################################### # Numbers are cycles per processed byte. 
#
@@ -56,12 +69,6 @@ die "can't locate arm-xlate.pl";
 open OUT,"| \"$^X\" $xlate $flavour $output";
 *STDOUT=*OUT;
 
-my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
-            (0, 5, 10, 15, 20));
-   $A[3][3] = "x25"; # x18 is reserved
-
-my @C = map("x$_", (26,27,28,30));
-
 my @rhotates = ([  0,  1, 62, 28, 27 ],
                 [ 36, 44,  6, 55, 20 ],
                 [  3, 10, 43, 25, 39 ],
@@ -101,7 +108,15 @@ iotas:
 	.quad	0x0000000080000001
 	.quad	0x8000000080008008
 .size	iotas,.-iotas
+___
+								{{{
+my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
+            (0, 5, 10, 15, 20));
+   $A[3][3] = "x25"; # x18 is reserved
+
+my @C = map("x$_", (26,27,28,30));
 
+$code.=<<___;
 .type	KeccakF1600_int,%function
 .align	5
 KeccakF1600_int:
@@ -510,9 +525,342 @@ SHA3_squeeze:
 	ldp	x29,x30,[sp],#48
 	ret
 .size	SHA3_squeeze,.-SHA3_squeeze
+___
+}								}}}
+								{{{
+my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
+	      "v".($_+3).".16b", "v".($_+4).".16b" ],
+	    (0, 5, 10, 15, 20));
+
+my @C = map("v$_.16b", (25..31));
+
+$code.=<<___;
+.type	KeccakF1600_ce,%function
+.align	5
+KeccakF1600_ce:
+	mov	x9,#6			// 6 passes x 4 unrolled rounds = 24 rounds
+	adr	x10,iotas
+	b	.Loop_ce
+.align	4
+.Loop_ce:
+___
+for($i=0; $i<4; $i++) {			# emit 4 round bodies per loop pass
+$code.=<<___;
+	////////////////////////////////////////////////// Theta
+	eor3	$C[0],$A[0][0],$A[1][0],$A[2][0]
+	eor3	$C[1],$A[0][1],$A[1][1],$A[2][1]
+	eor3	$C[2],$A[0][2],$A[1][2],$A[2][2]
+	eor3	$C[3],$A[0][3],$A[1][3],$A[2][3]
+	eor3	$C[4],$A[0][4],$A[1][4],$A[2][4]
+	eor3	$C[0],$C[0], $A[3][0],$A[4][0]
+	eor3	$C[1],$C[1], $A[3][1],$A[4][1]
+	eor3	$C[2],$C[2], $A[3][2],$A[4][2]
+	eor3	$C[3],$C[3], $A[3][3],$A[4][3]
+	eor3	$C[4],$C[4], $A[3][4],$A[4][4]
+
+	rax1	$C[5],$C[0],$C[2]			// D[1]
+	rax1	$C[6],$C[1],$C[3]			// D[2]
+	rax1	$C[2],$C[2],$C[4]			// D[3]
+	rax1	$C[3],$C[3],$C[0]			// D[4]
+	rax1	$C[4],$C[4],$C[1]			// D[0]
+
+	////////////////////////////////////////////////// Theta+Rho+Pi
+	xar	$C[0], $A[1][1],$C[5],#64-$rhotates[1][1]	// C[0]=A[0][1]
+	xar	$A[1][1],$A[1][4],$C[3],#64-$rhotates[1][4]
+	xar	$A[1][4],$A[4][2],$C[6],#64-$rhotates[4][2]
+	xar	$A[4][2],$A[2][4],$C[3],#64-$rhotates[2][4]
+	xar	$A[2][4],$A[4][0],$C[4],#64-$rhotates[4][0]
+
+	xar	$A[4][0],$A[0][2],$C[6],#64-$rhotates[0][2]
+
+	xar	$A[0][2],$A[2][2],$C[6],#64-$rhotates[2][2]
+	xar	$A[2][2],$A[2][3],$C[2],#64-$rhotates[2][3]
+	xar	$A[2][3],$A[3][4],$C[3],#64-$rhotates[3][4]
+	xar	$A[3][4],$A[4][3],$C[2],#64-$rhotates[4][3]
+	xar	$A[4][3],$A[3][0],$C[4],#64-$rhotates[3][0]
+
+	xar	$A[3][0],$A[0][4],$C[3],#64-$rhotates[0][4]
+
+	eor	$A[0][0],$A[0][0],$C[4]
+	ldr	x11,[x10],#8		// next round constant from iotas
+
+	xar	$C[1] ,$A[3][3],$C[2],#64-$rhotates[3][3]	// C[1]=A[0][3]
+	xar	$A[3][3],$A[3][2],$C[6],#64-$rhotates[3][2]
+	xar	$A[3][2],$A[2][1],$C[5],#64-$rhotates[2][1]
+	xar	$A[2][1],$A[1][2],$C[6],#64-$rhotates[1][2]
+	xar	$A[1][2],$A[2][0],$C[4],#64-$rhotates[2][0]
+
+	xar	$A[2][0],$A[0][1],$C[5],#64-$rhotates[0][1]	// *
+
+	xar	$A[0][4],$A[4][4],$C[3],#64-$rhotates[4][4]
+	xar	$A[4][4],$A[4][1],$C[5],#64-$rhotates[4][1]
+	xar	$A[4][1],$A[1][3],$C[2],#64-$rhotates[1][3]
+	xar	$A[1][3],$A[3][1],$C[5],#64-$rhotates[3][1]
+	xar	$A[3][1],$A[1][0],$C[4],#64-$rhotates[1][0]
+
+	xar	$A[1][0],$A[0][3],$C[2],#64-$rhotates[0][3]	// *
+
+	////////////////////////////////////////////////// Chi+Iota
+	dup	$C[6],x11				// borrow C[6]
+	bcax	$C[3], $A[0][0],$A[0][2],$C[0]	// *
+	bcax	$A[0][1],$C[0], $C[1], $A[0][2]	// *
+	bcax	$A[0][2],$A[0][2],$A[0][4],$C[1]
+	bcax	$A[0][3],$C[1], $A[0][0],$A[0][4]
+	bcax	$A[0][4],$A[0][4],$C[0], $A[0][0]
+
+	bcax	$C[0], $A[1][0],$A[1][2],$A[1][1]	// *
+	bcax	$C[1], $A[1][1],$A[1][3],$A[1][2]	// *
+	bcax	$A[1][2],$A[1][2],$A[1][4],$A[1][3]
+	bcax	$A[1][3],$A[1][3],$A[1][0],$A[1][4]
+	bcax	$A[1][4],$A[1][4],$A[1][1],$A[1][0]
+
+	eor	$A[0][0],$C[3],$C[6]			// Iota
+
+	bcax	$C[2], $A[2][0],$A[2][2],$A[2][1]	// *
+	bcax	$C[3], $A[2][1],$A[2][3],$A[2][2]	// *
+	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
+	bcax	$A[2][3],$A[2][3],$A[2][0],$A[2][4]
+	bcax	$A[2][4],$A[2][4],$A[2][1],$A[2][0]
+
+	bcax	$A[2][0],$A[3][0],$A[3][2],$A[3][1]	// *
+	bcax	$A[2][1],$A[3][1],$A[3][3],$A[3][2]	// *
+	bcax	$A[3][2],$A[3][2],$A[3][4],$A[3][3]
+	bcax	$A[3][3],$A[3][3],$A[3][0],$A[3][4]
+	bcax	$A[3][4],$A[3][4],$A[3][1],$A[3][0]
+
+	bcax	$A[3][0],$A[4][0],$A[4][2],$A[4][1]	// *
+	bcax	$A[3][1],$A[4][1],$A[4][3],$A[4][2]	// *
+	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
+	bcax	$A[4][3],$A[4][3],$A[4][0],$A[4][4]
+	bcax	$A[4][4],$A[4][4],$A[4][1],$A[4][0]
+___
+	($A[1][0],$A[1][1], $C[0],$C[1])
+	= ($C[0],$C[1], $A[1][0],$A[1][1]);
+	($A[2][0],$A[2][1], $A[3][0],$A[3][1], $A[4][0],$A[4][1], $C[2],$C[3])
+	= ($C[2],$C[3], $A[2][0],$A[2][1], $A[3][0],$A[3][1], $A[4][0],$A[4][1]);
+}
+$code.=<<___;
+	subs	x9,x9,#1
+	bne	.Loop_ce
+
+	ret
+.size	KeccakF1600_ce,.-KeccakF1600_ce
+
+.type	KeccakF1600_cext,%function
+.align	5
+KeccakF1600_cext:
+	stp	x29,x30,[sp,#-80]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#16]		// per ABI requirement
+	stp	d10,d11,[sp,#32]
+	stp	d12,d13,[sp,#48]
+	stp	d14,d15,[sp,#64]
+___
+for($i=0; $i<24; $i+=2) {		# load A[5][5]
+my $j=$i+1;
+$code.=<<___;
+	ldp	d$i,d$j,[x0,#8*$i]
+___
+}
+$code.=<<___;
+	ldr	d24,[x0,#8*$i]
+	bl	KeccakF1600_ce
+	ldr	x30,[sp,#8]		// reload link register overwritten by bl
+___
+for($i=0; $i<24; $i+=2) {		# store A[5][5]
+my $j=$i+1;
+$code.=<<___;
+	stp	d$i,d$j,[x0,#8*$i]
+___
+}
+$code.=<<___;
+	str	d24,[x0,#8*$i]
+
+	ldp	d8,d9,[sp,#16]
+	ldp	d10,d11,[sp,#32]
+	ldp	d12,d13,[sp,#48]
+	ldp	d14,d15,[sp,#64]
+	ldr	x29,[sp],#80
+	ret
+.size	KeccakF1600_cext,.-KeccakF1600_cext
+___
+
+{
+my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));
+
+$code.=<<___;
+.globl	SHA3_absorb_cext
+.type	SHA3_absorb_cext,%function
+.align	5
+SHA3_absorb_cext:
+	stp	x29,x30,[sp,#-80]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#16]		// per ABI requirement
+	stp	d10,d11,[sp,#32]
+	stp	d12,d13,[sp,#48]
+	stp	d14,d15,[sp,#64]
+___
+for($i=0; $i<24; $i+=2) {		# load A[5][5]
+my $j=$i+1;
+$code.=<<___;
+	ldp	d$i,d$j,[x0,#8*$i]
+___
+}
+$code.=<<___;
+	ldr	d24,[x0,#8*$i]
+	b	.Loop_absorb_ce
+
+.align	4
+.Loop_absorb_ce:
+	subs	$len,$len,$bsz		// len - bsz
+	blo	.Labsorbed_ce
+___
+for (my $i=0; $i<24; $i+=2) {
+my $j = $i+1;
+$code.=<<___;
+	ldr	d31,[$inp],#8		// *inp++
+#ifdef	__AARCH64EB__
+	rev64	v31.16b,v31.16b		// byte-swap the 64-bit lane on big-endian
+#endif
+	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b
+	cmp	$bsz,#8*($i+2)
+	blo	.Lprocess_block_ce
+	ldr	d31,[$inp],#8		// *inp++
+#ifdef	__AARCH64EB__
+	rev64	v31.16b,v31.16b		// vector form; plain rev only takes general registers
+#endif
+	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b
+	beq	.Lprocess_block_ce
+___
+}
+$code.=<<___;
+	ldr	d31,[$inp],#8		// *inp++
+#ifdef	__AARCH64EB__
+	rev64	v31.16b,v31.16b		// vector form; plain rev only takes general registers
+#endif
+	eor	$A[4][4],$A[4][4],v31.16b
+
+.Lprocess_block_ce:
+
+	bl	KeccakF1600_ce
+
+	b	.Loop_absorb_ce
+
+.align	4
+.Labsorbed_ce:
+___
+for($i=0; $i<24; $i+=2) {		# store A[5][5]
+my $j=$i+1;
+$code.=<<___;
+	stp	d$i,d$j,[x0,#8*$i]
+___
+}
+$code.=<<___;
+	str	d24,[x0,#8*$i]
+	add	x0,$len,$bsz		// return value
+
+	ldp	d8,d9,[sp,#16]
+	ldp	d10,d11,[sp,#32]
+	ldp	d12,d13,[sp,#48]
+	ldp	d14,d15,[sp,#64]
+	ldp	x29,x30,[sp],#80
+	ret
+.size	SHA3_absorb_cext,.-SHA3_absorb_cext
+___
+}
+{
+my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
+$code.=<<___;
+.globl	SHA3_squeeze_cext
+.type	SHA3_squeeze_cext,%function
+.align	5
+SHA3_squeeze_cext:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	mov	x9,$ctx
+	mov	x10,$bsz
+
+.Loop_squeeze_ce:
+	ldr	x4,[x9],#8
+	cmp	$len,#8
+	blo	.Lsqueeze_tail_ce
+#ifdef	__AARCH64EB__
+	rev	x4,x4
+#endif
+	str	x4,[$out],#8
+	beq	.Lsqueeze_done_ce
+
+	sub	$len,$len,#8
+	subs	x10,x10,#8
+	bhi	.Loop_squeeze_ce
+
+	bl	KeccakF1600_cext
+	ldr	x30,[sp,#8]		// reload link register overwritten by bl
+	mov	x9,$ctx
+	mov	x10,$bsz
+	b	.Loop_squeeze_ce
+
+.align	4
+.Lsqueeze_tail_ce:
+	strb	w4,[$out],#1
+	lsr	x4,x4,#8
+	subs	$len,$len,#1
+	beq	.Lsqueeze_done_ce
+	strb	w4,[$out],#1
+	lsr	x4,x4,#8
+	subs	$len,$len,#1
+	beq	.Lsqueeze_done_ce
+	strb	w4,[$out],#1
+	lsr	x4,x4,#8
+	subs	$len,$len,#1
+	beq	.Lsqueeze_done_ce
+	strb	w4,[$out],#1
+	lsr	x4,x4,#8
+	subs	$len,$len,#1
+	beq	.Lsqueeze_done_ce
+	strb	w4,[$out],#1
+	lsr	x4,x4,#8
+	subs	$len,$len,#1
+	beq	.Lsqueeze_done_ce
+	strb	w4,[$out],#1
+	lsr	x4,x4,#8
+	subs	$len,$len,#1
+	beq	.Lsqueeze_done_ce
+	strb	w4,[$out],#1
+
+.Lsqueeze_done_ce:
+	ldr	x29,[sp],#16
+	ret
+.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
+___
+}								}}}
+$code.=<<___;
 .asciz	"Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by "
 ___
+
+{   my	%opcode = (
+	"rax1"	=> 0xce608c00,	"eor3"	=> 0xce000000,
+	"bcax"	=> 0xce200000,	"xar"	=> 0xce800000	);
+
+    sub unsha3 {	# rewrite a SHA3-extension mnemonic as a raw .inst word
+	my ($mnemonic,$arg)=@_;
+
+	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
+	&&
+	sprintf ".inst\t0x%08x\t//%s %s",
+			$opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),
+			$mnemonic,$arg;
+    }
+}
+
+foreach(split("\n",$code)) {
+
+	s/\`([^\`]*)\`/eval($1)/ge;
+
+	m/\bdup\b/ and s/\.16b/.2d/g	or
+	s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
+
+	print $_,"\n";
 }
 
-print $code;
 close STDOUT;