From c6b77c16a63839132cbea29a4b6487ac8e0e7224 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Wed, 4 May 2016 15:33:42 +0200 Subject: [PATCH] MIPS64 assembly pack: add Poly1305 module. Reviewed-by: Richard Levitte --- crypto/poly1305/asm/poly1305-mips.pl | 414 +++++++++++++++++++++++++++ crypto/poly1305/build.info | 1 + 2 files changed, 415 insertions(+) create mode 100755 crypto/poly1305/asm/poly1305-mips.pl diff --git a/crypto/poly1305/asm/poly1305-mips.pl b/crypto/poly1305/asm/poly1305-mips.pl new file mode 100755 index 0000000000..cb0531f05e --- /dev/null +++ b/crypto/poly1305/asm/poly1305-mips.pl @@ -0,0 +1,414 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# Poly1305 hash for MIPS64. +# +# May 2016 +# +# Numbers are cycles per processed byte with poly1305_blocks alone. +# +# IALU/gcc +# R1x000 5.64/+120% (big-endian) +# Octeon II 3.80/+280% (little-endian) + +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp [o32 can be +# excluded from the rule, because it's specified volatile]; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +# +# +###################################################################### + +$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 + +die "MIPS64 only" unless ($flavour =~ /64|n32/i); + +$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0; +$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000"; + +($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); +($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); + +$code.=<<___; +#ifdef MIPSEB +# define MSB 0 +# define LSB 7 +#else +# define MSB 7 +# define LSB 0 +#endif + +.text +.set noat +.set noreorder + +.align 5 +.globl poly1305_init +.ent poly1305_init +poly1305_init: + .frame $sp,0,$ra + .set reorder + + sd $zero,0($ctx) + sd $zero,8($ctx) + sd $zero,16($ctx) + + beqz $inp,.Lno_key + + ldl $in0,0+MSB($inp) + ldl $in1,8+MSB($inp) + ldr $in0,0+LSB($inp) + ldr $in1,8+LSB($inp) +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS64R2) + dsbh $in0,$in0 # byte swap + dsbh $in1,$in1 + dshd $in0,$in0 + dshd $in1,$in1 +# else + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + + and $tmp1,$in0,$tmp0 # byte swap + and $tmp3,$in1,$tmp0 + dsrl $tmp2,$in0,24 + dsrl $tmp4,$in1,24 + dsll $tmp1,24 + dsll $tmp3,24 + and $tmp2,$tmp0 + and $tmp4,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + and $tmp2,$in0,$tmp0 + and $tmp4,$in1,$tmp0 + dsrl $in0,8 + dsrl $in1,8 + dsll $tmp2,8 + dsll $tmp4,8 + and $in0,$tmp0 + and $in1,$tmp0 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + or $in0,$tmp1 + or $in1,$tmp3 + dsrl $tmp1,$in0,32 + dsrl $tmp3,$in1,32 + dsll $in0,32 + dsll $in1,32 + or $in0,$tmp1 + or $in1,$tmp3 +# endif +#endif + li $tmp0,1 + dsll $tmp0,32 + daddiu $tmp0,-63 + dsll $tmp0,28 + daddiu $tmp0,-1 # 0ffffffc0fffffff + + and $in0,$tmp0 + daddiu $tmp0,-3 # 0ffffffc0ffffffc + and $in1,$tmp0 + + sd $in0,24($ctx) + dsrl $tmp0,$in1,2 + sd $in1,32($ctx) + daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2) + sd $tmp0,40($ctx) + +.Lno_key: + li $v0,0 # return 0 + jr $ra +.end poly1305_init +___ +{ +my ($h0,$h1,$h2,$r0,$r1,$s1,$d0,$d1,$d2) = + ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2); + +$code.=<<___; +.align 5 +.globl poly1305_blocks +.ent poly1305_blocks +poly1305_blocks: + .set noreorder + dsrl $len,4 # number of complete blocks + beqz $len,.Lno_data + nop + + .frame $sp,8*8,$ra + .mask $SAVED_REGS_MASK,-8 + dsub $sp,8*8 + sd $s5,0($sp) + sd $s4,8($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + sd $s3,16($sp) + sd $s2,24($sp) + sd $s1,32($sp) + sd $s0,40($sp) +___ +$code.=<<___; + .set reorder + + ld $h0,0($ctx) # load hash value + ld $h1,8($ctx) + ld $h2,16($ctx) + + ld $r0,24($ctx) # load key + ld $r1,32($ctx) + ld $s1,40($ctx) + +.Loop: + ldl $in0,0+MSB($inp) # load input + ldl $in1,8+MSB($inp) + ldr $in0,0+LSB($inp) + daddiu $len,-1 + ldr $in1,8+LSB($inp) + daddiu $inp,16 +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS64R2) + dsbh $in0,$in0 # byte swap + dsbh $in1,$in1 + dshd $in0,$in0 + dshd $in1,$in1 +# else + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + + and $tmp1,$in0,$tmp0 # byte swap + and $tmp3,$in1,$tmp0 + dsrl $tmp2,$in0,24 + dsrl $tmp4,$in1,24 + dsll $tmp1,24 + dsll $tmp3,24 + and $tmp2,$tmp0 + and $tmp4,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + and $tmp2,$in0,$tmp0 + and $tmp4,$in1,$tmp0 + dsrl $in0,8 + dsrl $in1,8 + dsll $tmp2,8 + dsll $tmp4,8 + and $in0,$tmp0 + and $in1,$tmp0 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + or $in0,$tmp1 + or $in1,$tmp3 + dsrl $tmp1,$in0,32 + dsrl $tmp3,$in1,32 + dsll $in0,32 + dsll $in1,32 + or $in0,$tmp1 + or $in1,$tmp3 +# endif +#endif + daddu $h0,$in0 # accumulate input + daddu $h1,$in1 + sltu $tmp0,$h0,$in0 + sltu $tmp1,$h1,$in1 + daddu $h1,$tmp0 + + dmultu $r0,$h0 # h0*r0 + daddu $h2,$padbit + sltu $tmp0,$h1,$tmp0 + mflo $d0 + mfhi $d1 + + dmultu $s1,$h1 # h1*5*r1 + daddu $tmp0,$tmp1 + daddu $h2,$tmp0 + mflo $tmp0 + mfhi $tmp1 + + dmultu $r1,$h0 # h0*r1 + daddu $d0,$tmp0 + daddu $d1,$tmp1 + mflo $tmp2 + mfhi $d2 + sltu $tmp0,$d0,$tmp0 + daddu $d1,$tmp0 + + dmultu $r0,$h1 # h1*r0 + daddu $d1,$tmp2 + sltu $tmp2,$d1,$tmp2 + mflo $tmp0 + mfhi $tmp1 + daddu $d2,$tmp2 + + dmultu $s1,$h2 # h2*5*r1 + daddu $d1,$tmp0 + daddu $d2,$tmp1 + mflo $tmp2 + + dmultu $r0,$h2 # h2*r0 + sltu $tmp0,$d1,$tmp0 + daddu $d2,$tmp0 + mflo $tmp3 + + daddu $d1,$tmp2 + daddu $d2,$tmp3 + sltu $tmp2,$d1,$tmp2 + daddu $d2,$tmp2 + + li $tmp0,-4 # final reduction + and $tmp0,$d2 + dsrl $tmp1,$d2,2 + andi $h2,$d2,3 + daddu $tmp0,$tmp1 + daddu $h0,$d0,$tmp0 + sltu $tmp0,$h0,$tmp0 + daddu $h1,$d1,$tmp0 + sltu $tmp0,$h1,$tmp0 + daddu $h2,$h2,$tmp0 + + bnez $len,.Loop + + sd $h0,0($ctx) # store hash value + sd $h1,8($ctx) + sd $h2,16($ctx) + + .set noreorder + ld $s5,0($sp) # epilogue + ld $s4,8($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue + ld $s3,16($sp) + ld $s2,24($sp) + ld $s1,32($sp) + ld $s0,40($sp) +___ +$code.=<<___; + dadd $sp,8*8 + +.Lno_data: + jr $ra + nop +.end poly1305_blocks +___ +} +{ +my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); + +$code.=<<___; +.align 5 +.globl poly1305_emit +.ent poly1305_emit +poly1305_emit: + .frame $sp,0,$ra + .set reorder + + ld $tmp0,0($ctx) + ld $tmp1,8($ctx) + ld $tmp2,16($ctx) + + daddiu $in0,$tmp0,5 # compare to modulus + sltiu $tmp3,$in0,5 + daddu $in1,$tmp1,$tmp3 + sltu $tmp3,$in1,$tmp3 + daddu $tmp2,$tmp2,$tmp3 + + dsrl $tmp2,2 # see if it carried/borrowed + dsubu $tmp2,$zero,$tmp2 + nor $tmp3,$zero,$tmp2 + + and $in0,$tmp2 + and $tmp0,$tmp3 + and $in1,$tmp2 + and $tmp1,$tmp3 + or $in0,$tmp0 + or $in1,$tmp1 + + lwu $tmp0,0($nonce) # load nonce + lwu $tmp1,4($nonce) + lwu $tmp2,8($nonce) + lwu $tmp3,12($nonce) + dsll $tmp1,32 + dsll $tmp3,32 + or $tmp0,$tmp1 + or $tmp2,$tmp3 + + daddu $in0,$tmp0 # accumulate nonce + daddu $in1,$tmp2 + sltu $tmp0,$in0,$tmp0 + daddu $in1,$tmp0 + + dsrl $tmp0,$in0,8 # write mac value + dsrl $tmp1,$in0,16 + dsrl $tmp2,$in0,24 + sb $in0,0($mac) + dsrl $tmp3,$in0,32 + sb $tmp0,1($mac) + dsrl $tmp0,$in0,40 + sb $tmp1,2($mac) + dsrl $tmp1,$in0,48 + sb $tmp2,3($mac) + dsrl $tmp2,$in0,56 + sb $tmp3,4($mac) + dsrl $tmp3,$in1,8 + sb $tmp0,5($mac) + dsrl $tmp0,$in1,16 + sb $tmp1,6($mac) + dsrl $tmp1,$in1,24 + sb $tmp2,7($mac) + + sb $in1,8($mac) + dsrl $tmp2,$in1,32 + sb $tmp3,9($mac) + dsrl $tmp3,$in1,40 + sb $tmp0,10($mac) + dsrl $tmp0,$in1,48 + sb $tmp1,11($mac) + dsrl $tmp1,$in1,56 + sb $tmp2,12($mac) + sb $tmp3,13($mac) + sb $tmp0,14($mac) + sb $tmp1,15($mac) + + jr $ra +.end poly1305_emit +.rdata +.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by " +.align 2 +___ +} + +$output=pop and open STDOUT,">$output"; +print $code; +close STDOUT; + diff --git a/crypto/poly1305/build.info b/crypto/poly1305/build.info index 389c7f6d32..d575f5a63e 100644 --- a/crypto/poly1305/build.info +++ b/crypto/poly1305/build.info @@ -12,6 +12,7 @@ GENERATE[poly1305-armv4.S]=asm/poly1305-armv4.pl $(PERLASM_SCHEME) INCLUDE[poly1305-armv4.o]=.. GENERATE[poly1305-armv8.S]=asm/poly1305-armv8.pl $(PERLASM_SCHEME) INCLUDE[poly1305-armv8.o]=.. +GENERATE[poly1305-mips.S]=asm/poly1305-mips.pl $(PERLASM_SCHEME) BEGINRAW[Makefile(unix)] {- $builddir -}/poly1305-%.S: {- $sourcedir -}/asm/poly1305-%.pl -- 2.25.1