From: Andy Polyakov Date: Sat, 22 Sep 2018 12:39:51 +0000 (+0200) Subject: sha/asm/keccak1600-armv8.pl: halve the size of hw-assisted subroutine. X-Git-Tag: openssl-3.0.0-alpha1~3031 X-Git-Url: https://git.librecmc.org/?a=commitdiff_plain;h=9986bfefa420f0db920768453bef0b40507db595;p=oweals%2Fopenssl.git sha/asm/keccak1600-armv8.pl: halve the size of hw-assisted subroutine. Yes, it's the second halving, i.e. it's now 1/4 of the original size, or more specifically inner loop. The challenge with Keccak is that you need more temporary registers than there are available. By reversing the order in which columns are assigned in Chi, it's possible to use three of A[][] registers as temporary prior to their assignment. Reviewed-by: Richard Levitte (Merged from https://github.com/openssl/openssl/pull/7294) --- diff --git a/crypto/sha/asm/keccak1600-armv8.pl b/crypto/sha/asm/keccak1600-armv8.pl index 704ab4a7e4..e4e94bcad0 100755 --- a/crypto/sha/asm/keccak1600-armv8.pl +++ b/crypto/sha/asm/keccak1600-armv8.pl @@ -533,30 +533,28 @@ my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b", (0, 5, 10, 15, 20)); my @C = map("v$_.16b", (25..31)); +my @D = @C[4,5,6,2,3]; $code.=<<___; .type KeccakF1600_ce,%function .align 5 KeccakF1600_ce: - mov x9,#12 + mov x9,#24 adr x10,iotas b .Loop_ce .align 4 .Loop_ce: -___ -for($i=0; $i<2; $i++) { -$code.=<<___; ////////////////////////////////////////////////// Theta - eor3 $C[0],$A[0][0],$A[1][0],$A[2][0] - eor3 $C[1],$A[0][1],$A[1][1],$A[2][1] - eor3 $C[2],$A[0][2],$A[1][2],$A[2][2] - eor3 $C[3],$A[0][3],$A[1][3],$A[2][3] - eor3 $C[4],$A[0][4],$A[1][4],$A[2][4] - eor3 $C[0],$C[0], $A[3][0],$A[4][0] - eor3 $C[1],$C[1], $A[3][1],$A[4][1] - eor3 $C[2],$C[2], $A[3][2],$A[4][2] - eor3 $C[3],$C[3], $A[3][3],$A[4][3] - eor3 $C[4],$C[4], $A[3][4],$A[4][4] + eor3 $C[0],$A[4][0],$A[3][0],$A[2][0] + eor3 $C[1],$A[4][1],$A[3][1],$A[2][1] + eor3 $C[2],$A[4][2],$A[3][2],$A[2][2] + eor3 $C[3],$A[4][3],$A[3][3],$A[2][3] + eor3 
$C[4],$A[4][4],$A[3][4],$A[2][4] + eor3 $C[0],$C[0], $A[1][0],$A[0][0] + eor3 $C[1],$C[1], $A[1][1],$A[0][1] + eor3 $C[2],$C[2], $A[1][2],$A[0][2] + eor3 $C[3],$C[3], $A[1][3],$A[0][3] + eor3 $C[4],$C[4], $A[1][4],$A[0][4] rax1 $C[5],$C[0],$C[2] // D[1] rax1 $C[6],$C[1],$C[3] // D[2] @@ -565,81 +563,75 @@ $code.=<<___; rax1 $C[4],$C[4],$C[1] // D[0] ////////////////////////////////////////////////// Theta+Rho+Pi - xar $C[0], $A[1][1],$C[5],#64-$rhotates[1][1] // C[0]=A[0][1] - xar $A[1][1],$A[1][4],$C[3],#64-$rhotates[1][4] - xar $A[1][4],$A[4][2],$C[6],#64-$rhotates[4][2] - xar $A[4][2],$A[2][4],$C[3],#64-$rhotates[2][4] - xar $A[2][4],$A[4][0],$C[4],#64-$rhotates[4][0] + xar $C[0], $A[0][1],$D[1],#64-$rhotates[0][1] // C[0]=A[2][0] - xar $A[4][0],$A[0][2],$C[6],#64-$rhotates[0][2] + xar $A[0][1],$A[1][1],$D[1],#64-$rhotates[1][1] + xar $A[1][1],$A[1][4],$D[4],#64-$rhotates[1][4] + xar $A[1][4],$A[4][2],$D[2],#64-$rhotates[4][2] + xar $A[4][2],$A[2][4],$D[4],#64-$rhotates[2][4] + xar $A[2][4],$A[4][0],$D[0],#64-$rhotates[4][0] - xar $A[0][2],$A[2][2],$C[6],#64-$rhotates[2][2] - xar $A[2][2],$A[2][3],$C[2],#64-$rhotates[2][3] - xar $A[2][3],$A[3][4],$C[3],#64-$rhotates[3][4] - xar $A[3][4],$A[4][3],$C[2],#64-$rhotates[4][3] - xar $A[4][3],$A[3][0],$C[4],#64-$rhotates[3][0] + xar $C[1], $A[0][2],$D[2],#64-$rhotates[0][2] // C[1]=A[4][0] - xar $A[3][0],$A[0][4],$C[3],#64-$rhotates[0][4] + xar $A[0][2],$A[2][2],$D[2],#64-$rhotates[2][2] + xar $A[2][2],$A[2][3],$D[3],#64-$rhotates[2][3] + xar $A[2][3],$A[3][4],$D[4],#64-$rhotates[3][4] + xar $A[3][4],$A[4][3],$D[3],#64-$rhotates[4][3] + xar $A[4][3],$A[3][0],$D[0],#64-$rhotates[3][0] - eor $A[0][0],$A[0][0],$C[4] - ldr x11,[x10],#8 + xar $A[3][0],$A[0][4],$D[4],#64-$rhotates[0][4] - xar $C[1], $A[3][3],$C[2],#64-$rhotates[3][3] // C[1]=A[0][3] - xar $A[3][3],$A[3][2],$C[6],#64-$rhotates[3][2] - xar $A[3][2],$A[2][1],$C[5],#64-$rhotates[2][1] - xar $A[2][1],$A[1][2],$C[6],#64-$rhotates[1][2] - xar 
$A[1][2],$A[2][0],$C[4],#64-$rhotates[2][0] + xar $D[4], $A[4][4],$D[4],#64-$rhotates[4][4] // D[4]=A[0][4] + xar $A[4][4],$A[4][1],$D[1],#64-$rhotates[4][1] + xar $A[1][3],$A[1][3],$D[3],#64-$rhotates[1][3] // A[1][3]=A[4][1] + xar $A[0][4],$A[3][1],$D[1],#64-$rhotates[3][1] // A[0][4]=A[1][3] + xar $A[3][1],$A[1][0],$D[0],#64-$rhotates[1][0] - xar $A[2][0],$A[0][1],$C[5],#64-$rhotates[0][1] // * + xar $A[1][0],$A[0][3],$D[3],#64-$rhotates[0][3] - xar $A[0][4],$A[4][4],$C[3],#64-$rhotates[4][4] - xar $A[4][4],$A[4][1],$C[5],#64-$rhotates[4][1] - xar $A[4][1],$A[1][3],$C[2],#64-$rhotates[1][3] - xar $A[1][3],$A[3][1],$C[5],#64-$rhotates[3][1] - xar $A[3][1],$A[1][0],$C[4],#64-$rhotates[1][0] + eor $A[0][0],$A[0][0],$D[0] - xar $C[2], $A[0][3],$C[2],#64-$rhotates[0][3] // C[2]=A[1][0] + xar $D[3], $A[3][3],$D[3],#64-$rhotates[3][3] // D[3]=A[0][3] + xar $A[0][3],$A[3][2],$D[2],#64-$rhotates[3][2] // A[0][3]=A[3][3] + xar $D[1], $A[2][1],$D[1],#64-$rhotates[2][1] // D[1]=A[3][2] + xar $D[2], $A[1][2],$D[2],#64-$rhotates[1][2] // D[2]=A[2][1] + xar $D[0], $A[2][0],$D[0],#64-$rhotates[2][0] // D[0]=A[1][2] ////////////////////////////////////////////////// Chi+Iota - dup $C[6],x11 // borrow C[6] - bcax $C[3], $A[0][0],$A[0][2],$C[0] // * - bcax $A[0][1],$C[0], $C[1], $A[0][2] // * - bcax $A[0][2],$A[0][2],$A[0][4],$C[1] - bcax $A[0][3],$C[1], $A[0][0],$A[0][4] - bcax $A[0][4],$A[0][4],$C[0], $A[0][0] - - bcax $A[1][0],$C[2], $A[1][2],$A[1][1] // * - bcax $C[0], $A[1][1],$A[1][3],$A[1][2] // * - bcax $A[1][2],$A[1][2],$A[1][4],$A[1][3] - bcax $A[1][3],$A[1][3],$C[2], $A[1][4] - bcax $A[1][4],$A[1][4],$A[1][1],$C[2] - - eor $A[0][0],$C[3],$C[6] // Iota - - bcax $C[1], $A[2][0],$A[2][2],$A[2][1] // * - bcax $C[2], $A[2][1],$A[2][3],$A[2][2] // * - bcax $A[2][2],$A[2][2],$A[2][4],$A[2][3] - bcax $A[2][3],$A[2][3],$A[2][0],$A[2][4] - bcax $A[2][4],$A[2][4],$A[2][1],$A[2][0] + bcax $A[4][0],$C[1], $A[4][2],$A[1][3] // A[1][3]=A[4][1] + bcax 
$A[4][1],$A[1][3],$A[4][3],$A[4][2] // A[1][3]=A[4][1] + bcax $A[4][2],$A[4][2],$A[4][4],$A[4][3] + bcax $A[4][3],$A[4][3],$C[1], $A[4][4] + bcax $A[4][4],$A[4][4],$A[1][3],$C[1] // A[1][3]=A[4][1] + + ld1r {$C[1]},[x10],#8 - bcax $C[3], $A[3][0],$A[3][2],$A[3][1] // * - bcax $C[4], $A[3][1],$A[3][3],$A[3][2] // * - bcax $A[3][2],$A[3][2],$A[3][4],$A[3][3] - bcax $A[3][3],$A[3][3],$A[3][0],$A[3][4] + bcax $A[3][2],$D[1], $A[3][4],$A[0][3] // A[0][3]=A[3][3] + bcax $A[3][3],$A[0][3],$A[3][0],$A[3][4] // A[0][3]=A[3][3] bcax $A[3][4],$A[3][4],$A[3][1],$A[3][0] + bcax $A[3][0],$A[3][0],$D[1], $A[3][1] + bcax $A[3][1],$A[3][1],$A[0][3],$D[1] // A[0][3]=A[3][3] + + bcax $A[2][0],$C[0], $A[2][2],$D[2] + bcax $A[2][1],$D[2], $A[2][3],$A[2][2] + bcax $A[2][2],$A[2][2],$A[2][4],$A[2][3] + bcax $A[2][3],$A[2][3],$C[0], $A[2][4] + bcax $A[2][4],$A[2][4],$D[2], $C[0] + + bcax $A[1][2],$D[0], $A[1][4],$A[0][4] // A[0][4]=A[1][3] + bcax $A[1][3],$A[0][4],$A[1][0],$A[1][4] // A[0][4]=A[1][3] + bcax $A[1][4],$A[1][4],$A[1][1],$A[1][0] + bcax $A[1][0],$A[1][0],$D[0], $A[1][1] + bcax $A[1][1],$A[1][1],$A[0][4],$D[0] // A[0][4]=A[1][3] + + bcax $A[0][3],$D[3], $A[0][0],$D[4] + bcax $A[0][4],$D[4], $A[0][1],$A[0][0] + bcax $A[0][0],$A[0][0],$A[0][2],$A[0][1] + bcax $A[0][1],$A[0][1],$D[3], $A[0][2] + bcax $A[0][2],$A[0][2],$D[4], $D[3] + + eor $A[0][0],$A[0][0],$C[1] - bcax $C[5], $A[4][0],$A[4][2],$A[4][1] // * - bcax $C[6], $A[4][1],$A[4][3],$A[4][2] // * - bcax $A[4][2],$A[4][2],$A[4][4],$A[4][3] - bcax $A[4][3],$A[4][3],$A[4][0],$A[4][4] - bcax $A[4][4],$A[4][4],$A[4][1],$A[4][0] -___ - ( $A[1][1], $C[0]) = ( $C[0], $A[1][1]); - ($A[2][0],$A[2][1], $C[1],$C[2]) = ($C[1],$C[2], $A[2][0],$A[2][1]); - ($A[3][0],$A[3][1], $C[3],$C[4]) = ($C[3],$C[4], $A[3][0],$A[3][1]); - ($A[4][0],$A[4][1], $C[5],$C[6]) = ($C[5],$C[6], $A[4][0],$A[4][1]); -} -$code.=<<___; subs x9,x9,#1 bne .Loop_ce @@ -857,7 +849,7 @@ foreach(split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/ge; - m/\bdup\b/ and 
s/\.16b/.2d/g or + m/\bld1r\b/ and s/\.16b/.2d/g or s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge; print $_,"\n";