From 56676f877d1ec4bc49ceb45636717e0f9a7ae445 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Thu, 1 Jun 2017 21:05:59 +0200 Subject: [PATCH] sha/asm/keccak1600-armv4.pl: add SHA3_absorb and SHA3_squeeze. Reviewed-by: Rich Salz --- crypto/sha/asm/keccak1600-armv4.pl | 369 +++++++++++++++++++++++++---- 1 file changed, 319 insertions(+), 50 deletions(-) diff --git a/crypto/sha/asm/keccak1600-armv4.pl b/crypto/sha/asm/keccak1600-armv4.pl index 616beae9d8..a560aa7c31 100644 --- a/crypto/sha/asm/keccak1600-armv4.pl +++ b/crypto/sha/asm/keccak1600-armv4.pl @@ -1,6 +1,63 @@ +#!/usr/bin/env perl +# Copyright 2017 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# Keccak-1600 for ARMv4. +# +# June 2017. +# +# This is KECCAK_1X variant (see keccak1600.c) with bit interleaving. +# How does it compare to Keccak Code Package? It's as fast, but several +# times smaller, and is endian- and ISA-neutral. ISA neutrality means +# that minimum ISA requirement is ARMv4, yet it can be assembled even +# as ARMv7 Thumb-2. +# +######################################################################## +# Numbers are cycles per processed byte accounting even for input bit +# interleaving. +# +# r=1600(*) r=1024 +# +# Cortex-A7 71/+180% 103 +# Cortex-A8 48/+290% 69 +# Cortex-A15 34/+210% 49 +# +# (*) Not used in real life, meaningful as estimate for single sponge +# operation performance. Numbers after slash are improvement over +# compiler-generated KECCAK_1X reference code. + my @C = map("r$_",(0..9)); my @E = map("r$_",(10..12,14)); +######################################################################## +# Stack layout +# ----->+-----------------------+ +# | uint64_t A[5][5] | +# | ... | +# +200->+-----------------------+ +# | uint64_t D[5] | +# | ... | +# +240->+-----------------------+ +# | uint64_t T[2][5] | +# | ... | +# +320->+-----------------------+ +# | saved lr | +# +324->+-----------------------+ +# | loop counter | +# +328->+-----------------------+ +# | ... + my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20)); my @D = map(8*$_, (25..29)); my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35)); @@ -8,6 +65,13 @@ my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35)); $code.=<<___; .text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + .type iotas,%object .align 5 iotas: @@ -35,34 +99,23 @@ iotas: .long 0x00000000, 0x80000088 .long 0x00000001, 0x00008000 .long 0x00000000, 0x80008082 -.size iostas,.-iotas +.size iotas,.-iotas -.global KeccakF1600 -.type KeccakF1600, %function +.type KeccakF1600_int, %function .align 5 -KeccakF1600: - eor r1,r1,r1 - stmdb sp!,{r0,r1,r4-r12,lr} - sub sp,sp,#320 @ space for A[5][5],D[5],T[2][5] - - add @E[0],r0,#$A[1][0] - add @E[1],sp,#$A[1][0] - mov @E[2],r0 - ldmia @E[0]!,{@C[0]-@C[9]} @ copy A[5][5] to stack - stmia @E[1]!,{@C[0]-@C[9]} - ldmia @E[0]!,{@C[0]-@C[9]} - stmia @E[1]!,{@C[0]-@C[9]} - ldmia @E[0]!,{@C[0]-@C[9]} - stmia @E[1]!,{@C[0]-@C[9]} - ldmia @E[0],{@C[0]-@C[9]} - stmia @E[1],{@C[0]-@C[9]} - ldmia @E[2],{@C[0]-@C[9]} - stmia sp,{@C[0]-@C[9]} +KeccakF1600_int: + ldmia sp,{@C[0]-@C[9]} @ A[0][0..4] add @E[0],sp,#$A[1][0] - b .Lround +KeccakF1600_enter: + str lr,[sp,#320] + eor @E[1],@E[1],@E[1] + str @E[1],[sp,#324] + b .Lround_enter .align 4 .Lround: + ldmia sp,{@C[0]-@C[9]} @ A[0][0..4] +.Lround_enter: ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][0..1] eor @C[0],@C[0],@E[0] add @E[0],sp,#$A[1][2] @@ -141,14 +194,14 @@ KeccakF1600: eor @C[4],@C[4],@C[9],ror#32-1 @ C[2] = ROL64(C[4], 1) ^ C[2]; str @C[3],[sp,#$D[2]+4] eor @C[5],@C[5],@C[8] + ldr @C[8],[sp,#$A[3][0]] + ldr @C[9],[sp,#$A[3][0]+4] str @C[4],[sp,#$D[3]] @ D[3] = C[2] str @C[5],[sp,#$D[3]+4] - ldr @C[8],[sp,#$A[3][0]] - ldr @C[9],[sp,#$A[3][0]+4] ldr @C[6],[sp,#$A[0][1]] - ldr @C[7],[sp,#$A[0][1]+4] eor @C[8],@C[8],@C[0] + ldr @C[7],[sp,#$A[0][1]+4] eor @C[9],@C[9],@C[1] str @C[8],[sp,#$T[0][0]] @ T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */ ldr @C[8],[sp,#$A[0][2]] @@ -172,14 +225,14 @@ KeccakF1600: eor @C[8],@C[8],@E[2] str @C[7],[sp,#$T[0][3]+4] eor @C[9],@C[9],@E[3] + ldr @C[6],[sp,#$A[3][3]] + ldr @C[7],[sp,#$A[3][3]+4] str @C[8],[sp,#$T[0][4]] @ T[0][4] = A[0][4] ^ E[1]; /* D[4] */ str @C[9],[sp,#$T[0][4]+4] - ldr @C[6],[sp,#$A[3][3]] - ldr @C[7],[sp,#$A[3][3]+4] ldr @C[8],[sp,#$A[4][4]] - ldr @C[9],[sp,#$A[4][4]+4] eor @C[4],@C[4],@C[6] + ldr @C[9],[sp,#$A[4][4]+4] eor @C[5],@C[5],@C[7] ror @C[7],@C[4],#32-10 @ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */ ldr @C[4],[sp,#$A[0][0]] @@ -198,25 +251,25 @@ KeccakF1600: eor @E[3],@E[3],@C[3] ldr @C[3],[sp,#$A[1][1]+4] ror @C[5],@E[2],#32-21 @ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */ + ldr @E[2],[sp,#324] @ load counter eor @C[2],@C[2],@E[0] ror @C[4],@E[3],#32-22 - adr @E[0],iotas + adr @E[3],iotas eor @C[3],@C[3],@E[1] - ldr @E[1],[sp,#320+4] @ load counter ror @C[2],@C[2],#32-22 @ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */ + add @E[3],@E[3],@E[2] ror @C[3],@C[3],#32-22 - add @E[0],@E[0],@E[1] - ldr @E[2],[@E[0],#0] - add @E[1],@E[1],#8 - ldr @E[3],[@E[0],#4] - cmp @E[1],#192 - str @E[1],[sp,#320+4] @ store counter + ldr @E[0],[@E[3],#0] @ iotas[i].lo + add @E[2],@E[2],#8 + ldr @E[1],[@E[3],#4] @ iotas[i].hi + cmp @E[2],#192 + str @E[2],[sp,#324] @ store counter - bic @E[0],@C[4],@C[2] - bic @E[1],@C[5],@C[3] - eor @E[0],@E[0],@C[0] - eor @E[1],@E[1],@C[1] + bic @E[2],@C[4],@C[2] + bic @E[3],@C[5],@C[3] + eor @E[2],@E[2],@C[0] + eor @E[3],@E[3],@C[1] eor @E[0],@E[0],@E[2] eor @E[1],@E[1],@E[3] str @E[0],[sp,#$A[0][0]] @ A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i]; @@ -248,13 +301,13 @@ KeccakF1600: str @E[1],[sp,#$A[0][4]+4] ldmia @E[3],{@C[6]-@C[9],@E[0],@E[1],@E[2],@E[3]} @ D[0..3] - ldr @C[4],[sp,#$D[4]] - ldr @C[5],[sp,#$D[4]+4] ldr @C[0],[sp,#$A[1][0]] ldr @C[1],[sp,#$A[1][0]+4] ldr @C[2],[sp,#$A[2][1]] ldr @C[3],[sp,#$A[2][1]+4] + ldr @C[4],[sp,#$D[4]] eor @C[0],@C[0],@C[6] + ldr @C[5],[sp,#$D[4]+4] eor @C[1],@C[1],@C[7] str @C[0],[sp,#$T[1][0]] @ T[1][0] = A[1][0] ^ (C[3] = D[0]); add @C[0],sp,#$A[1][2] @@ -506,27 +559,243 @@ KeccakF1600: str @E[3],[sp,#$A[4][3]+4] bic @E[1],@C[3],@C[1] eor @E[2],@E[0],@C[8] - add @E[0],sp,#$A[1][0] eor @E[3],@E[1],@C[9] - ldmia sp,{@C[0]-@C[9]} @ A[0][0..5] str @E[2],[sp,#$A[4][4]] @ A[4][4] = C[4] ^ (~C[0] & C[1]); + add @E[0],sp,#$A[1][0] str @E[3],[sp,#$A[4][4]+4] blo .Lround - ldr @E[1],[sp,#320] @ restore pointer to A - stmia @E[1]!,{@C[0]-@C[9]} @ copy A[5][5] from stack + ldr pc,[sp,#320] +.size KeccakF1600_int,.-KeccakF1600_int + +.type KeccakF1600, %function +.align 5 +KeccakF1600: + stmdb sp!,{r0,r4-r11,lr} + sub sp,sp,#320+16 @ space for A[5][5],D[5],T[2][5],... + + add @E[0],r0,#$A[1][0] + add @E[1],sp,#$A[1][0] + mov @E[2],r0 + ldmia @E[0]!,{@C[0]-@C[9]} @ copy A[5][5] to stack + stmia @E[1]!,{@C[0]-@C[9]} + ldmia @E[0]!,{@C[0]-@C[9]} + stmia @E[1]!,{@C[0]-@C[9]} + ldmia @E[0]!,{@C[0]-@C[9]} + stmia @E[1]!,{@C[0]-@C[9]} + ldmia @E[0], {@C[0]-@C[9]} + stmia @E[1], {@C[0]-@C[9]} + ldmia @E[2], {@C[0]-@C[9]} @ A[0][0..4] + add @E[0],sp,#$A[1][0] + stmia sp, {@C[0]-@C[9]} + + bl KeccakF1600_enter + + ldr @E[1], [sp,#320+16] @ restore pointer to A + ldmia sp, {@C[0]-@C[9]} + stmia @E[1]!,{@C[0]-@C[9]} @ return A[5][5] ldmia @E[0]!,{@C[0]-@C[9]} stmia @E[1]!,{@C[0]-@C[9]} ldmia @E[0]!,{@C[0]-@C[9]} stmia @E[1]!,{@C[0]-@C[9]} ldmia @E[0]!,{@C[0]-@C[9]} stmia @E[1]!,{@C[0]-@C[9]} - ldmia @E[0],{@C[0]-@C[9]} - stmia @E[1],{@C[0]-@C[9]} + ldmia @E[0], {@C[0]-@C[9]} + stmia @E[1], {@C[0]-@C[9]} - add sp,sp,#320+8 - ldmia sp!,{r4-r12,pc} + add sp,sp,#320+20 + ldmia sp!,{r4-r11,pc} .size KeccakF1600,.-KeccakF1600 ___ +{ my ($hi,$lo,$i,$A_flat, $len,$bsz,$inp) = map("r$_",(5..8, 10..12)); + +######################################################################## +# Stack layout +# ----->+-----------------------+ +# | uint64_t A[5][5] | +# | ... | +# | ... | +# +336->+-----------------------+ +# | uint64_t *A | +# +340->+-----------------------+ +# | const void *inp | +# +344->+-----------------------+ +# | size_t len | +# +348->+-----------------------+ +# | size_t bs | +# +352->+-----------------------+ +# | .... + +$code.=<<___; +.global SHA3_absorb +.type SHA3_absorb,%function +.align 5 +SHA3_absorb: + stmdb sp!,{r0-r12,lr} + sub sp,sp,#320+16 + + mov r12,r0 + add r14,sp,#0 + mov $len,r2 + mov $bsz,r3 + + ldmia r12!,{@C[0]-@C[9]} @ copy A[5][5] to stack + stmia r14!,{@C[0]-@C[9]} + ldmia r12!,{@C[0]-@C[9]} + stmia r14!,{@C[0]-@C[9]} + ldmia r12!,{@C[0]-@C[9]} + stmia r14!,{@C[0]-@C[9]} + ldmia r12!,{@C[0]-@C[9]} + stmia r14!,{@C[0]-@C[9]} + ldmia r12, {@C[0]-@C[9]} + stmia r14, {@C[0]-@C[9]} + + ldr $inp,[sp,#340] + +.Loop_absorb: + subs r0,$len,$bsz + blo .Labsorbed + add $A_flat,sp,#0 + str r0,[sp,#344] @ save len - bsz + +.Loop_block: + ldmia $A_flat,{r2-r3} @ A_flat[i] + ldrb r0,[$inp,#7]! @ inp[7] + mov $i,#8 + +.Lane_loop: + subs $i,$i,#1 + lsl r1,r0,#24 + blo .Lane_done +#ifdef __thumb2__ + it ne + ldrbne r0,[$inp,#-1]! +#else + ldrneb r0,[$inp,#-1]! +#endif + adds r1,r1,r1 @ sip through carry flag + adc $hi,$hi,$hi + adds r1,r1,r1 + adc $lo,$lo,$lo + adds r1,r1,r1 + adc $hi,$hi,$hi + adds r1,r1,r1 + adc $lo,$lo,$lo + adds r1,r1,r1 + adc $hi,$hi,$hi + adds r1,r1,r1 + adc $lo,$lo,$lo + adds r1,r1,r1 + adc $hi,$hi,$hi + adds r1,r1,r1 + adc $lo,$lo,$lo + b .Lane_loop + +.Lane_done: + eor r2,r2,$lo + eor r3,r3,$hi + add $inp,$inp,#8 + stmia $A_flat!,{r2-r3} @ A_flat[i++] ^= BitInterleave(inp[0..7]) + subs $bsz,$bsz,#8 + bhi .Loop_block + + str $inp,[sp,#340] + + bl KeccakF1600_int + + ldr $inp,[sp,#340] + ldr $len,[sp,#344] + ldr $bsz,[sp,#348] + b .Loop_absorb + +.align 4 +.Labsorbed: + add r12,sp,#$A[1][0] + ldr r14, [sp,#336] @ pull pointer to A[5][5] + ldmia sp, {@C[0]-@C[9]} + stmia r14!,{@C[0]-@C[9]} @ return A[5][5] + ldmia r12!,{@C[0]-@C[9]} + stmia r14!,{@C[0]-@C[9]} + ldmia r12!,{@C[0]-@C[9]} + stmia r14!,{@C[0]-@C[9]} + ldmia r12!,{@C[0]-@C[9]} + stmia r14!,{@C[0]-@C[9]} + ldmia r12, {@C[0]-@C[9]} + stmia r14, {@C[0]-@C[9]} + + add sp,sp,#320+32 + mov r0,$len @ return value + ldmia sp!,{r4-r12,pc} +.size SHA3_absorb,.-SHA3_absorb +___ +} +{ my ($A_flat,$out,$len,$bsz, $byte,$shl) = map("r$_", (4..9)); + +$code.=<<___; +.global SHA3_squeeze +.type SHA3_squeeze,%function +.align 5 +SHA3_squeeze: + stmdb sp!,{r4-r10,lr} + mov r12,r0 + mov $A_flat,r0 + mov $out,r1 + mov $len,r2 + mov $bsz,r3 + mov r14,r3 + b .Loop_squeeze + +.align 4 +.Loop_squeeze: + ldmia r12!,{r0,r1} @ A_flat[i++] + mov $shl,#28 + +.Lane_squeeze: + lsl r2,r0,$shl + lsl r3,r1,$shl + eor $byte,$byte,$byte + adds r3,r3,r3 @ sip through carry flag + adc $byte,$byte,$byte + adds r2,r2,r2 + adc $byte,$byte,$byte + adds r3,r3,r3 + adc $byte,$byte,$byte + adds r2,r2,r2 + adc $byte,$byte,$byte + adds r3,r3,r3 + adc $byte,$byte,$byte + adds r2,r2,r2 + adc $byte,$byte,$byte + adds r3,r3,r3 + adc $byte,$byte,$byte + adds r2,r2,r2 + adc $byte,$byte,$byte + subs $len,$len,#1 @ len -= 1 + str $byte,[$out],#1 + beq .Lsqueeze_done + subs $shl,$shl,#4 + bhs .Lane_squeeze + + subs r14,r14,#8 @ bsz -= 8 + bhi .Loop_squeeze + + mov r0,$A_flat + + bl KeccakF1600 + + mov r12,$A_flat + mov r14,$bsz + b .Loop_squeeze + +.Lsqueeze_done: + ldmia sp!,{r4-r10,pc} +.size SHA3_squeeze,.-SHA3_squeeze +.asciz "Keccak-1600 absorb and squeeze for ARMv4, CRYPTOGAMS by " +.align 2 +___ +} + print $code; + +close STDOUT; # enforce flush -- 2.25.1