X-Git-Url: https://git.librecmc.org/?a=blobdiff_plain;f=crypto%2Fpoly1305%2Fasm%2Fpoly1305-x86_64.pl;h=e5b841260e815cb8c466b6fb0fea86bcde895c91;hb=98ad3fe82bd3e7e7f929dd1fa4ef3915426002c0;hp=1dce5d61e3a081587245ff5cdd6fc0bf0ad5e2ff;hpb=54f8f9a1edfcf1a2d2df801728e462841e2c316e;p=oweals%2Fopenssl.git

diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl
index 1dce5d61e3..e5b841260e 100755
--- a/crypto/poly1305/asm/poly1305-x86_64.pl
+++ b/crypto/poly1305/asm/poly1305-x86_64.pl
@@ -1,7 +1,7 @@
 #! /usr/bin/env perl
-# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
 #
-# Licensed under the OpenSSL license (the "License"). You may not use
+# Licensed under the Apache License 2.0 (the "License"). You may not use
 # this file except in compliance with the License. You can obtain a copy
 # in the file LICENSE in the source distribution or at
 # https://www.openssl.org/source/license.html
@@ -24,17 +24,28 @@
 #
 # Add AVX512F+VL+BW code path.
 #
+# November 2017
+#
+# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
+# executed even on Knights Landing. Trigger for modification was
+# observation that AVX512 code paths can negatively affect overall
+# Skylake-X system performance. Since we are likely to suppress
+# AVX512F capability flag [at least on Skylake-X], conversion serves
+# as a kind of "investment protection". Note that the next *lake
+# processor, Cannonlake, has an AVX512IFMA code path to execute...
+#
 # Numbers are cycles per processed byte with poly1305_blocks alone,
 # measured with rdtsc at fixed clock frequency.
 #
-#              IALU/gcc-4.8(*) AVX(**)   AVX2
+#              IALU/gcc-4.8(*) AVX(**)   AVX2      AVX-512
 # P4           4.46/+120%      -
 # Core 2       2.41/+90%       -
 # Westmere     1.88/+120%      -
 # Sandy Bridge 1.39/+140%      1.10
 # Haswell      1.14/+175%      1.11      0.65
-# Skylake      1.13/+120%      0.96      0.51
+# Skylake[-X]  1.13/+120%      0.96      0.51      [0.35]
 # Silvermont   2.83/+95%       -
+# Knights L    3.60/?          1.65      1.10      0.41(***)
 # Goldmont     1.70/+180%      -
 # VIA Nano     1.82/+150%      -
 # Sledgehammer 1.38/+160%      -
@@ -49,10 +60,13 @@
 # Core processors, 50-30%, less newer processor is, but slower on
 # contemporary ones, for example almost 2x slower on Atom, and as
 # former are naturally disappearing, SSE2 is deemed unnecessary;
+# (***) strangely enough performance seems to vary from core to core,
+# listed result is best case;
 
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ?
shift : undef; $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); @@ -77,11 +91,12 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && $avx = ($1>=10) + ($1>=12); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } -open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; *STDOUT=*OUT; my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); @@ -155,6 +170,7 @@ $code.=<<___; .type poly1305_init,\@function,3 .align 32 poly1305_init: +.cfi_startproc xor %rax,%rax mov %rax,0($ctx) # initialize hash value mov %rax,8($ctx) @@ -206,6 +222,7 @@ $code.=<<___; mov \$1,%eax .Lno_key: ret +.cfi_endproc .size poly1305_init,.-poly1305_init .type poly1305_blocks,\@function,4 @@ -285,6 +302,7 @@ $code.=<<___; .type poly1305_emit,\@function,3 .align 32 poly1305_emit: +.cfi_startproc .Lemit: mov 0($ctx),%r8 # load hash value mov 8($ctx),%r9 @@ -295,7 +313,7 @@ poly1305_emit: mov %r9,%rcx adc \$0,%r9 adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overfow? + shr \$2,%r10 # did 130-bit value overflow? cmovnz %r8,%rax cmovnz %r9,%rcx @@ -305,6 +323,7 @@ poly1305_emit: mov %rcx,8($mac) ret +.cfi_endproc .size poly1305_emit,.-poly1305_emit ___ if ($avx) { @@ -329,15 +348,18 @@ $code.=<<___; .type __poly1305_block,\@abi-omnipotent .align 32 __poly1305_block: +.cfi_startproc ___ &poly1305_iteration(); $code.=<<___; ret +.cfi_endproc .size __poly1305_block,.-__poly1305_block .type __poly1305_init_avx,\@abi-omnipotent .align 32 __poly1305_init_avx: +.cfi_startproc mov $r0,$h0 mov $r1,$h1 xor $h2,$h2 @@ -495,6 +517,7 @@ __poly1305_init_avx: lea -48-64($ctx),$ctx # size [de-]optimization ret +.cfi_endproc .size __poly1305_init_avx,.-__poly1305_init_avx .type poly1305_blocks_avx,\@function,4 @@ -1360,6 +1383,7 @@ $code.=<<___; .type poly1305_emit_avx,\@function,3 .align 32 poly1305_emit_avx: +.cfi_startproc cmpl \$0,20($ctx) # is_base2_26? je .Lemit @@ -1400,7 +1424,7 @@ poly1305_emit_avx: mov %r9,%rcx adc \$0,%r9 adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overfow? + shr \$2,%r10 # did 130-bit value overflow? cmovnz %r8,%rax cmovnz %r9,%rcx @@ -1410,6 +1434,7 @@ poly1305_emit_avx: mov %rcx,8($mac) ret +.cfi_endproc .size poly1305_emit_avx,.-poly1305_emit_avx ___ @@ -1682,7 +1707,6 @@ poly1305_blocks_avx2: .Leven_avx2: .cfi_startproc mov OPENSSL_ia32cap_P+8(%rip),%r10d - mov \$`(1<<31|1<<30|1<<16)`,%r11d vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 vmovd 4*1($ctx),%x#$H1 vmovd 4*2($ctx),%x#$H2 @@ -1695,8 +1719,8 @@ $code.=<<___ if ($avx>2); cmp \$512,$len jb .Lskip_avx512 and %r11d,%r10d - cmp %r11d,%r10d # check for AVX512F+BW+VL - je .Lblocks_avx512 + test \$`1<<16`,%r10d # check for AVX512F + jnz .Lblocks_avx512 .Lskip_avx512: ___ $code.=<<___ if (!$win64); @@ -2106,10 +2130,14 @@ if ($avx>2) { # reason stack layout is kept identical to poly1305_blocks_avx2. If not # for this tail, we wouldn't have to even allocate stack frame... 
-my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%ymm$_",(16..24)); -my ($M0,$M1,$M2,$M3,$M4) = map("%ymm$_",(25..29)); +my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); +my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); my $PADBIT="%zmm30"; -my $GATHER="%ymm31"; + +map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain +map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); +map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); +map(s/%y/%z/,($MASK)); $code.=<<___; .type poly1305_blocks_avx512,\@function,4 @@ -2117,7 +2145,8 @@ $code.=<<___; poly1305_blocks_avx512: .cfi_startproc .Lblocks_avx512: - vzeroupper + mov \$15,%eax + kmovw %eax,%k2 ___ $code.=<<___ if (!$win64); lea -8(%rsp),%r11 @@ -2130,52 +2159,53 @@ $code.=<<___ if ($win64); vmovdqa %xmm6,0x50(%r11) vmovdqa %xmm7,0x60(%r11) vmovdqa %xmm8,0x70(%r11) - vmovdqa32 %xmm9,0x80(%r11) - vmovdqa32 %xmm10,0x90(%r11) - vmovdqa32 %xmm11,0xa0(%r11) - vmovdqa32 %xmm12,0xb0(%r11) - vmovdqa32 %xmm13,0xc0(%r11) - vmovdqa32 %xmm14,0xd0(%r11) - vmovdqa32 %xmm15,0xe0(%r11) + vmovdqa %xmm9,0x80(%r11) + vmovdqa %xmm10,0x90(%r11) + vmovdqa %xmm11,0xa0(%r11) + vmovdqa %xmm12,0xb0(%r11) + vmovdqa %xmm13,0xc0(%r11) + vmovdqa %xmm14,0xd0(%r11) + vmovdqa %xmm15,0xe0(%r11) .Ldo_avx512_body: ___ $code.=<<___; lea .Lconst(%rip),%rcx lea 48+64($ctx),$ctx # size optimization - vmovdqa 96(%rcx),$T2 # .Lpermd_avx2 + vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 # expand pre-calculated table - vmovdqu32 `16*0-64`($ctx),%x#$R0 + vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} and \$-512,%rsp - vmovdqu32 `16*1-64`($ctx),%x#$R1 - vmovdqu32 `16*2-64`($ctx),%x#$S1 - vmovdqu32 `16*3-64`($ctx),%x#$R2 - vmovdqu32 `16*4-64`($ctx),%x#$S2 - vmovdqu32 `16*5-64`($ctx),%x#$R3 - vmovdqu32 `16*6-64`($ctx),%x#$S3 - vmovdqu32 `16*7-64`($ctx),%x#$R4 - vmovdqu32 `16*8-64`($ctx),%x#$S4 - vpermd $R0,$T2,$R0 # 00003412 -> 14243444 - vmovdqa64 64(%rcx),$MASK # .Lmask26 - vpermd $R1,$T2,$R1 - vpermd $S1,$T2,$S1 - vpermd $R2,$T2,$R2 - vmovdqa32 $R0,0x00(%rsp) # save in case $len%128 != 0 + vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} + mov \$0x20,%rax + vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} + vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} + vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} + vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} + vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} + vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} + vmovdqu `16*8-64`($ctx),%x#$T4 # ... 
${S4} + vpermd $D0,$T2,$R0 # 00003412 -> 14243444 + vpbroadcastq 64(%rcx),$MASK # .Lmask26 + vpermd $D1,$T2,$R1 + vpermd $T0,$T2,$S1 + vpermd $D2,$T2,$R2 + vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 - vpermd $S2,$T2,$S2 - vmovdqa32 $R1,0x20(%rsp) + vpermd $T1,$T2,$S2 + vmovdqu64 $R1,0x00(%rsp,%rax){%k2} vpsrlq \$32,$R1,$T1 - vpermd $R3,$T2,$R3 - vmovdqa32 $S1,0x40(%rsp) - vpermd $S3,$T2,$S3 - vpermd $R4,$T2,$R4 - vmovdqa32 $R2,0x60(%rsp) - vpermd $S4,$T2,$S4 - vmovdqa32 $S2,0x80(%rsp) - vmovdqa32 $R3,0xa0(%rsp) - vmovdqa32 $S3,0xc0(%rsp) - vmovdqa32 $R4,0xe0(%rsp) - vmovdqa32 $S4,0x100(%rsp) + vpermd $D3,$T2,$R3 + vmovdqa64 $S1,0x40(%rsp){%k2} + vpermd $T3,$T2,$S3 + vpermd $D4,$T2,$R4 + vmovdqu64 $R2,0x40(%rsp,%rax){%k2} + vpermd $T4,$T2,$S4 + vmovdqa64 $S2,0x80(%rsp){%k2} + vmovdqu64 $R3,0x80(%rsp,%rax){%k2} + vmovdqa64 $S3,0xc0(%rsp){%k2} + vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} + vmovdqa64 $S4,0x100(%rsp){%k2} ################################################################ # calculate 5th through 8th powers of the key @@ -2279,14 +2309,6 @@ $code.=<<___; vpandq $MASK,$D3,$D3 vpaddq $M3,$D4,$D4 # d3 -> d4 -___ -map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain -map(s/%y/%z/,($M4,$M0,$M1,$M2,$M3)); -map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); -map(s/%y/%z/,($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4)); -map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); -map(s/%y/%z/,($MASK)); -$code.=<<___; ################################################################ # at this point we have 14243444 in $R0-$S4 and 05060708 in # $D0-$D4, ... @@ -2324,7 +2346,6 @@ $code.=<<___; vpaddd $R3,$S3,$S3 vpaddd $R4,$S4,$S4 - vpbroadcastq %x#$MASK,$MASK vpbroadcastq 32(%rcx),$PADBIT # .L129 vpsrlq \$52,$T0,$T2 # splat input @@ -2342,7 +2363,7 @@ $code.=<<___; vpaddq $H2,$T2,$H2 # accumulate input sub \$192,$len jbe .Ltail_avx512 - #jmp .Loop_avx512 + jmp .Loop_avx512 .align 32 .Loop_avx512: @@ -2529,7 +2550,7 @@ $code.=<<___; vpaddq $H3,$T3,$H3 vpaddq $H4,$T4,$H4 - vmovdqu64 16*0($inp),%x#$T0 + vmovdqu 16*0($inp),%x#$T0 vpmuludq $H0,$R3,$M3 vpmuludq $H0,$R4,$M4 vpmuludq $H0,$R0,$M0 @@ -2539,7 +2560,7 @@ $code.=<<___; vpaddq $M0,$D0,$D0 # d0 += h0*r0 vpaddq $M1,$D1,$D1 # d1 += h0*r1 - vmovdqu64 16*1($inp),%x#$T1 + vmovdqu 16*1($inp),%x#$T1 vpmuludq $H1,$R2,$M3 vpmuludq $H1,$R3,$M4 vpmuludq $H1,$S4,$M0 @@ -2549,7 +2570,7 @@ $code.=<<___; vpaddq $M0,$D0,$D0 # d0 += h1*s4 vpaddq $M2,$D2,$D2 # d2 += h0*r2 - vinserti64x2 \$1,16*2($inp),$T0,$T0 + vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 vpmuludq $H3,$R0,$M3 vpmuludq $H3,$R1,$M4 vpmuludq $H1,$R0,$M1 @@ -2559,7 +2580,7 @@ $code.=<<___; vpaddq $M1,$D1,$D1 # d1 += h1*r0 vpaddq $M2,$D2,$D2 # d2 += h1*r1 - vinserti64x2 \$1,16*3($inp),$T1,$T1 + vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 vpmuludq $H4,$S4,$M3 vpmuludq $H4,$R0,$M4 vpmuludq $H3,$S2,$M0 @@ -2582,11 +2603,11 @@ $code.=<<___; # horizontal addition mov \$1,%eax - vpsrldq \$8,$H3,$D3 - vpsrldq \$8,$D4,$H4 - vpsrldq \$8,$H0,$D0 - vpsrldq \$8,$H1,$D1 - vpsrldq \$8,$H2,$D2 + vpermq \$0xb1,$H3,$D3 + vpermq \$0xb1,$D4,$H4 + vpermq \$0xb1,$H0,$D0 + vpermq \$0xb1,$H1,$D1 + vpermq \$0xb1,$H2,$D2 vpaddq $D3,$H3,$H3 vpaddq $D4,$H4,$H4 vpaddq $D0,$H0,$H0 @@ -2623,23 +2644,23 @@ $code.=<<___; # lazy reduction (interleaved with input splat) vpsrlq \$26,$H3,$D3 - vpandq $MASK,$H3,$H3 + vpand $MASK,$H3,$H3 vpsrldq \$6,$T0,$T2 # splat input vpsrldq \$6,$T1,$T3 vpunpckhqdq $T1,$T0,$T4 # 4 vpaddq $D3,$H4,$H4 # h3 -> h4 vpsrlq \$26,$H0,$D0 - vpandq $MASK,$H0,$H0 + vpand $MASK,$H0,$H0 vpunpcklqdq 
$T3,$T2,$T2 # 2:3 vpunpcklqdq $T1,$T0,$T0 # 0:1 vpaddq $D0,$H1,$H1 # h0 -> h1 vpsrlq \$26,$H4,$D4 - vpandq $MASK,$H4,$H4 + vpand $MASK,$H4,$H4 vpsrlq \$26,$H1,$D1 - vpandq $MASK,$H1,$H1 + vpand $MASK,$H1,$H1 vpsrlq \$30,$T2,$T3 vpsrlq \$4,$T2,$T2 vpaddq $D1,$H2,$H2 # h1 -> h2 @@ -2651,21 +2672,21 @@ $code.=<<___; vpaddq $D4,$H0,$H0 # h4 -> h0 vpsrlq \$26,$H2,$D2 - vpandq $MASK,$H2,$H2 - vpandq $MASK,$T2,$T2 # 2 - vpandq $MASK,$T0,$T0 # 0 + vpand $MASK,$H2,$H2 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T0,$T0 # 0 vpaddq $D2,$H3,$H3 # h2 -> h3 vpsrlq \$26,$H0,$D0 - vpandq $MASK,$H0,$H0 + vpand $MASK,$H0,$H0 vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 - vpandq $MASK,$T1,$T1 # 1 + vpand $MASK,$T1,$T1 # 1 vpaddq $D0,$H1,$H1 # h0 -> h1 vpsrlq \$26,$H3,$D3 - vpandq $MASK,$H3,$H3 - vpandq $MASK,$T3,$T3 # 3 - vporq $PADBIT,$T4,$T4 # padbit, yes, always + vpand $MASK,$H3,$H3 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always vpaddq $D3,$H4,$H4 # h3 -> h4 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 @@ -2732,6 +2753,7 @@ $code.=<<___; .type poly1305_init_base2_44,\@function,3 .align 32 poly1305_init_base2_44: +.cfi_startproc xor %rax,%rax mov %rax,0($ctx) # initialize hash value mov %rax,8($ctx) @@ -2773,6 +2795,7 @@ ___ $code.=<<___; mov \$1,%eax ret +.cfi_endproc .size poly1305_init_base2_44,.-poly1305_init_base2_44 ___ { @@ -2784,6 +2807,8 @@ $code.=<<___; .type poly1305_blocks_vpmadd52,\@function,4 .align 32 poly1305_blocks_vpmadd52: +.cfi_startproc + endbranch shr \$4,$len jz .Lno_data_vpmadd52 # too short @@ -2890,6 +2915,7 @@ poly1305_blocks_vpmadd52: .Lno_data_vpmadd52: ret +.cfi_endproc .size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 ___ } @@ -2907,6 +2933,7 @@ $code.=<<___; .type poly1305_blocks_vpmadd52_4x,\@function,4 .align 32 poly1305_blocks_vpmadd52_4x: +.cfi_startproc shr \$4,$len jz .Lno_data_vpmadd52_4x # too short @@ -3331,6 +3358,7 @@ poly1305_blocks_vpmadd52_4x: .Lno_data_vpmadd52_4x: ret +.cfi_endproc .size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x ___ } @@ -3349,6 +3377,7 @@ $code.=<<___; .type poly1305_blocks_vpmadd52_8x,\@function,4 .align 32 poly1305_blocks_vpmadd52_8x: +.cfi_startproc shr \$4,$len jz .Lno_data_vpmadd52_8x # too short @@ -3704,6 +3733,7 @@ $code.=<<___; .Lno_data_vpmadd52_8x: ret +.cfi_endproc .size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x ___ } @@ -3711,6 +3741,8 @@ $code.=<<___; .type poly1305_emit_base2_44,\@function,3 .align 32 poly1305_emit_base2_44: +.cfi_startproc + endbranch mov 0($ctx),%r8 # load hash value mov 8($ctx),%r9 mov 16($ctx),%r10 @@ -3731,7 +3763,7 @@ poly1305_emit_base2_44: mov %r9,%rcx adc \$0,%r9 adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overfow? + shr \$2,%r10 # did 130-bit value overflow? cmovnz %r8,%rax cmovnz %r9,%rcx @@ -3741,6 +3773,7 @@ poly1305_emit_base2_44: mov %rcx,8($mac) ret +.cfi_endproc .size poly1305_emit_base2_44,.-poly1305_emit_base2_44 ___ } } } @@ -3778,12 +3811,119 @@ $code.=<<___; .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff ___ } - $code.=<<___; .asciz "Poly1305 for x86_64, CRYPTOGAMS by " .align 16 ___ +{ # chacha20-poly1305 helpers +my ($out,$inp,$otp,$len)=$win64 ? 
("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order +$code.=<<___; +.globl xor128_encrypt_n_pad +.type xor128_encrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_encrypt_n_pad: +.cfi_startproc + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_enc + nop +.Loop_enc_xmm: + movdqu ($inp,$otp),%xmm0 + pxor ($otp),%xmm0 + movdqu %xmm0,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_enc_xmm + + and \$15,%r10 # len % 16 + jz .Ldone_enc + +.Ltail_enc: + mov \$16,$len + sub %r10,$len + xor %eax,%eax +.Loop_enc_byte: + mov ($inp,$otp),%al + xor ($otp),%al + mov %al,($out,$otp) + mov %al,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_enc_byte + + xor %eax,%eax +.Loop_enc_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_enc_pad + +.Ldone_enc: + mov $otp,%rax + ret +.cfi_endproc +.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad + +.globl xor128_decrypt_n_pad +.type xor128_decrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_decrypt_n_pad: +.cfi_startproc + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_dec + nop +.Loop_dec_xmm: + movdqu ($inp,$otp),%xmm0 + movdqa ($otp),%xmm1 + pxor %xmm0,%xmm1 + movdqu %xmm1,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_dec_xmm + + pxor %xmm1,%xmm1 + and \$15,%r10 # len % 16 + jz .Ldone_dec + +.Ltail_dec: + mov \$16,$len + sub %r10,$len + xor %eax,%eax + xor %r11,%r11 +.Loop_dec_byte: + mov ($inp,$otp),%r11b + mov ($otp),%al + xor %r11b,%al + mov %al,($out,$otp) + mov %r11b,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_dec_byte + + xor %eax,%eax +.Loop_dec_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_dec_pad + +.Ldone_dec: + mov $otp,%rax + ret +.cfi_endproc +.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad +___ +} + # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { @@ -4044,4 +4184,4 @@ foreach (split('\n',$code)) { print $_,"\n"; } -close STDOUT; +close STDOUT or die "error closing STDOUT";