Remove resolved TODO

[oweals/openssl.git] / crypto / poly1305 / asm / poly1305-c64xplus.pl
diff --git a/crypto/poly1305/asm/poly1305-c64xplus.pl b/crypto/poly1305/asm/poly1305-c64xplus.pl

index fc765e14fe8c9e6942d1193ccfcca93b25b07169..93fef37e605b9381ba97dda0fb198b7acd58bb72 100755 (executable)
--- a/crypto/poly1305/asm/poly1305-c64xplus.pl
+++ b/crypto/poly1305/asm/poly1305-c64xplus.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
  #
  # ====================================================================
  # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -11,7 +18,7 @@
  #
  # October 2015
  #
-# Performance is [incredible for a 32-bit processor] 1.76 cycles per
+# Performance is [incredible for a 32-bit processor] 1.82 cycles per
  # processed byte. Comparison to compiler-generated code is problematic,
  # because results were observed to vary from 2.1 to 7.6 cpb depending
  # on compiler's ability to inline small functions. Compiler also
@@ -19,6 +26,9 @@
  # time dependent on input length. This module on the other hand is free
  # from such limitation.
  
+$output=pop;			# take the last remaining argument as the output file name
+open STDOUT,">$output";		# reopen STDOUT for writing to it; NOTE(review): result of open is not checked
+
  ($CTXA,$INPB,$LEN,$PADBIT)=("A4","B4","A6","B6");
  ($H0,$H1,$H2,$H3,$H4,$H4a)=("A8","B8","A10","B10","B2",$LEN);
  ($D0,$D1,$D2,$D3)=         ("A9","B9","A11","B11");
@@ -125,7 +135,7 @@ _poly1305_blocks:
  ||     SWAP2   $D1,$D1
  
         ADDU    $D0,B24,$D0:$H0         ; h0+=inp[0]
-||     ADD     $D0,B24,B31             ; B-copy of h0+inp[0]
+||     ADD     $D0,B24,B27             ; B-copy of h0+inp[0]
  ||     SWAP4   $D1,$D1
         ADDU    $D1,B25,$D1:$H1         ; h1+=inp[1]
  ||     MVK     3,$THREE
@@ -137,12 +147,12 @@ _poly1305_blocks:
  
  loop?:
         MPY32U  $H0,$R0,A17:A16
-||     MPY32U  B31,$R1,B17:B16         ; MPY32U        $H0,$R1,B17:B16
+||     MPY32U  B27,$R1,B17:B16         ; MPY32U        $H0,$R1,B17:B16
  ||     ADDU    $D0,$D1:$H1,B25:B24     ; ADDU          $D0,$D1:$H1,$D1:$H1
  ||     ADDU    $D2,B28,$D2:$H2         ; h2+=inp[2]
  ||     SWAP2   $D3,$D3
         MPY32U  $H0,$R2,A19:A18
-||     MPY32U  B31,$R3,B19:B18         ; MPY32U        $H0,$R3,B19:B18
+||     MPY32U  B27,$R3,B19:B18         ; MPY32U        $H0,$R3,B19:B18
  ||     ADD     $D0,$H1,A24             ; A-copy of B24
  ||     SWAP4   $D3,$D3
  || [A2]        SUB     A2,1,A2                 ; decrement loop counter
@@ -224,8 +234,8 @@ loop?:
  
         SHRU    $H4,2,B16               ; last reduction step
  ||     AND     $H4,$THREE,$H4
-|| [A2]        BNOP    loop?
         ADDAW   B16,B16,B16             ; 5*(h4>>2)
+|| [A2]        BNOP    loop?
  
         ADDU    B24,B16,B25:B24         ; B24 is h0
  || [A2]        SWAP2   $D2,$D2
@@ -233,8 +243,9 @@ loop?:
  || [A2]        SWAP4   $D2,$D2
         ADDU    B28,B27,B29:B28         ; B28 is h2
  || [A2]        ADDU    $D0,B24,$D0:$H0         ; h0+=inp[0]
-|| [A2]        ADD     $D0,B24,B31             ; B-copy of h0+inp[0]
-       ADD     B30,B29,B30             ; B30 is h3
+|| [A2]        ADD     $D0,B24,B27             ; B-copy of h0+inp[0]
+       ADDU    B30,B29,B31:B30         ; B30 is h3
+       ADD     B31,$H4,$H4
  || [A2]        ADDU    $D1,B26,$D1:$H1         ; h1+=inp[1]
  ;;===== branch to loop? is taken here