crypto/modes/asm/ghash-c64xplus.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 #
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16 #
  17 # December 2011
  18 #
  19 # The module implements GCM GHASH function and underlying single
  20 # multiplication operation in GF(2^128). Even though subroutines
  21 # have _4bit suffix, they are not using any tables, but rely on
  22 # hardware Galois Field Multiply support. Streamed GHASH processes
  23 # a byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
  24 # code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
  25 # comparing apples vs. oranges, but compiler surely could have done
  26 # better, because theoretical [though not necessarily achievable]
  27 # estimate for "4-bit" table-driven implementation is ~12 cycles.
  28
  29 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}  # consume args until one looks like "name.ext" (or none are left)
  30 open STDOUT,">$output" or die "can't open $output: $!" if $output;  # redirect generated code; without a file name keep STDOUT as is
  31
  32 ($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6"); # arguments: Xi, H table, input, byte length
  33 # Register aliases interpolated into the assembly templates below.
  34 ($Z0,$Z1,$Z2,$Z3,       $H0, $H1, $H2, $H3,                        # A16..A27: Z accumulator, H words,
  35                         $H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27));    # XORMPY results (also input block staging)
  36 ($H01u,$H01y,$H2u,$H3u, $H0y,$H1y,$H2y,$H3y,                       # B16..B27: upper-byte values of H and
  37                         $H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27));    # the products derived from them
  38 ($FF000000,$E10000)=("B30","B31");      # upper byte mask, [pre-shifted] reduction polynomial
  39 ($xip,$x0,$x1,$xib)=map("B$_",(6..9));  # $xip zaps $len
  40  $xia="A9";                             # A-side copy of the shifted Xi byte held in $xib (also scratch during setup)
  41 ($rem,$res)=("B4","B5");                # $rem zaps $Htable
  42
  43 $code.=<<___;        # symbol setup, gcm_gmult_4bit, gcm_ghash_4bit entry and the shared loop head
  44         .text
  45 ; symbol naming: when __TI_EABI__ is non-zero the _-prefixed labels below map to unprefixed names
  46         .if     .ASSEMBLER_VERSION<7000000
  47         .asg    0,__TI_EABI__
  48         .endif
  49         .if     __TI_EABI__
  50         .asg    gcm_gmult_1bit,_gcm_gmult_1bit
  51         .asg    gcm_gmult_4bit,_gcm_gmult_4bit
  52         .asg    gcm_ghash_4bit,_gcm_ghash_4bit
  53         .endif
  54 ; RA aliases B3, the register branched to when all blocks are done
  55         .asg    B3,RA
  56 ; the gcm_gmult_1bit entry point below is disabled by .if 0
  57         .if     0
  58         .global _gcm_gmult_1bit
  59 _gcm_gmult_1bit:
  60         ADDAD   $Htable,2,$Htable
  61         .endif
  62         .global _gcm_gmult_4bit
  63 _gcm_gmult_4bit:                        ; Xi = Xi*H in GF(2^128); arguments: Xi, Htable
  64         .asmfunc
  65         LDDW    *${Htable}[-1],$H1:$H0  ; H.lo
  66         LDDW    *${Htable}[-2],$H3:$H2  ; H.hi
  67 ||      MV      $Xip,${xip}             ; reassign Xi
  68 ||      MVK     15,B1                   ; SPLOOPD constant
  69
  70         MVK     0xE1,$E10000
  71 ||      LDBU    *++${xip}[15],$x1       ; Xi[15]
  72         MVK     0xFF,$FF000000
  73 ||      LDBU    *--${xip},$x0           ; Xi[14]
  74         SHL     $E10000,16,$E10000      ; [pre-shifted] reduction polynomial
  75         SHL     $FF000000,24,$FF000000  ; upper byte mask
  76 ||      BNOP    ghash_loop?             ; continue in the loop shared with gcm_ghash_4bit
  77 ||      MVK     1,B0                    ; take a single spin
  78 ; the packets below run in the branch delay slots, before ghash_loop? is entered
  79         PACKH2  $H0,$H1,$xia            ; pack H0' and H1's upper bytes
  80         AND     $H2,$FF000000,$H2u      ; H2's upper byte
  81         AND     $H3,$FF000000,$H3u      ; H3's upper byte
  82 ||      SHRU    $H2u,8,$H2u
  83         SHRU    $H3u,8,$H3u
  84 ||      ZERO    $Z1:$Z0                 ; clear accumulator Z (lower half)
  85         SHRU2   $xia,8,$H01u
  86 ||      ZERO    $Z3:$Z2                 ; clear accumulator Z (upper half)
  87         .endasmfunc
  88
  89         .global _gcm_ghash_4bit
  90 _gcm_ghash_4bit:                        ; per 16-byte input block: Xi = (Xi^block)*H; arguments: Xi, Htable, inp, len
  91         .asmfunc
  92         LDDW    *${Htable}[-1],$H1:$H0  ; H.lo
  93 ||      SHRU    $len,4,B0               ; reassign len, B0 = number of 16-byte blocks
  94         LDDW    *${Htable}[-2],$H3:$H2  ; H.hi
  95 ||      MV      $Xip,${xip}             ; reassign Xi
  96 ||      MVK     15,B1                   ; SPLOOPD constant
  97 ; build the constants while the first input block (only if B0 is non-zero) and Xi are loaded
  98         MVK     0xE1,$E10000
  99 || [B0] LDNDW   *${inp}[1],$H1x:$H0x
 100         MVK     0xFF,$FF000000
 101 || [B0] LDNDW   *${inp}++[2],$H3x:$H2x  ; post-increment steps inp to the next block
 102         SHL     $E10000,16,$E10000      ; [pre-shifted] reduction polynomial
 103 ||      LDDW    *${xip}[1],$Z1:$Z0
 104         SHL     $FF000000,24,$FF000000  ; upper byte mask
 105 ||      LDDW    *${xip}[0],$Z3:$Z2
 106 ; isolate the upper bytes of the H words (same sequence as in gcm_gmult_4bit)
 107         PACKH2  $H0,$H1,$xia            ; pack H0' and H1's upper bytes
 108         AND     $H2,$FF000000,$H2u      ; H2's upper byte
 109         AND     $H3,$FF000000,$H3u      ; H3's upper byte
 110 ||      SHRU    $H2u,8,$H2u
 111         SHRU    $H3u,8,$H3u
 112         SHRU2   $xia,8,$H01u
 113
 114 || [B0] XOR     $H0x,$Z0,$Z0            ; Xi^=inp
 115 || [B0] XOR     $H1x,$Z1,$Z1
 116         .if     .LITTLE_ENDIAN
 117    [B0] XOR     $H2x,$Z2,$Z2
 118 || [B0] XOR     $H3x,$Z3,$Z3
 119 || [B0] SHRU    $Z1,24,$xia             ; Xi[15], avoid cross-path stall
 120         STDW    $Z1:$Z0,*${xip}[1]      ; write back so the loop can fetch Xi bytes with LDBU
 121 || [B0] SHRU    $Z1,16,$x0              ; Xi[14]
 122 || [B0] ZERO    $Z1:$Z0
 123         .else
 124    [B0] XOR     $H2x,$Z2,$Z2
 125 || [B0] XOR     $H3x,$Z3,$Z3
 126 || [B0] MV      $Z0,$xia                ; Xi[15], avoid cross-path stall
 127         STDW    $Z1:$Z0,*${xip}[1]      ; write back so the loop can fetch Xi bytes with LDBU
 128 || [B0] SHRU    $Z0,8,$x0               ; Xi[14]
 129 || [B0] ZERO    $Z1:$Z0
 130         .endif
 131         STDW    $Z3:$Z2,*${xip}[0]
 132 || [B0] ZERO    $Z3:$Z2
 133 || [B0] MV      $xia,$x1
 134    [B0] ADDK    14,${xip}               ; xip = &Xi[14], the loop pre-decrements from here
 135
 136 ghash_loop?:
 137         SPLOOPD 6                       ; 6*16+7
 138 ||      MVC     B1,ILC
 139 || [B0] SUB     B0,1,B0                 ; one block fewer to go
 140 ||      ZERO    A0                      ; A0 gates and counts the LDBU fetches of further Xi bytes
 141 ||      ADD     $x1,$x1,$xib            ; SHL   $x1,1,$xib
 142 ||      SHL     $x1,1,$xia
 143 ___
 144 \f
 145 ########____________________________
 146 #  0    D2.     M1          M2      |
 147 #  1            M1                  |
 148 #  2            M1          M2      |
 149 #  3        D1. M1          M2      |
 150 #  4        S1. L1                  |
 151 #  5    S2  S1x L1          D2  L2  |____________________________
 152 #  6/0          L1  S1      L2  S2x |D2.     M1          M2      |
 153 #  7/1          L1  S1  D1x S2  M2  |        M1                  |
 154 #  8/2              S1  L1x S2      |        M1          M2      |
 155 #  9/3              S1  L1x         |    D1. M1          M2      |
 156 # 10/4                  D1x         |    S1. L1                  |
 157 # 11/5                              |S2  S1x L1          D2  L2  |____________
 158 # 12/6/0                D1x       __|        L1  S1      L2  S2x |D2.     ....
 159 #    7/1                                     L1  S1  D1x S2  M2  |        ....
 160 #    8/2                                         S1  L1x S2      |        ....
 161 #####...                                         ................|............
 162 $code.=<<___;        # SPLOOP kernel (scheduled per the table above) and the per-block epilogue
 163         XORMPY  $H0,$xia,$H0x           ; 0     ; H·(Xi[i]<<1)
 164 ||      XORMPY  $H01u,$xib,$H01y
 165 || [A0] LDBU    *--${xip},$x0           ; fetch the next lower Xi byte
 166         XORMPY  $H1,$xia,$H1x           ; 1
 167         XORMPY  $H2,$xia,$H2x           ; 2
 168 ||      XORMPY  $H2u,$xib,$H2y
 169         XORMPY  $H3,$xia,$H3x           ; 3
 170 ||      XORMPY  $H3u,$xib,$H3y
 171 ||[!A0] MVK.D   15,A0                           ; *--${xip} counter
 172         XOR.L   $H0x,$Z0,$Z0            ; 4     ; Z^=H·(Xi[i]<<1)
 173 || [A0] SUB.S   A0,1,A0
 174         XOR.L   $H1x,$Z1,$Z1            ; 5
 175 ||      AND.D   $H01y,$FF000000,$H0z
 176 ||      SWAP2.L $H01y,$H1y              ;       ; SHL   $H01y,16,$H1y
 177 ||      SHL     $x0,1,$xib
 178 ||      SHL     $x0,1,$xia
 179
 180         XOR.L   $H2x,$Z2,$Z2            ; 6/0   ; [0,0] in epilogue
 181 ||      SHL     $Z0,1,$rem              ;       ; rem=Z<<1
 182 ||      SHRMB.S $Z1,$Z0,$Z0             ;       ; Z>>=8
 183 ||      AND.L   $H1y,$FF000000,$H1z
 184         XOR.L   $H3x,$Z3,$Z3            ; 7/1
 185 ||      SHRMB.S $Z2,$Z1,$Z1
 186 ||      XOR.D   $H0z,$Z0,$Z0                    ; merge upper byte products
 187 ||      AND.S   $H2y,$FF000000,$H2z
 188 ||      XORMPY  $E10000,$rem,$res       ;       ; implicit rem&0x1FE
 189         XOR.L   $H1z,$Z1,$Z1            ; 8/2
 190 ||      SHRMB.S $Z3,$Z2,$Z2
 191 ||      AND.S   $H3y,$FF000000,$H3z
 192         XOR.L   $H2z,$Z2,$Z2            ; 9/3
 193 ||      SHRU    $Z3,8,$Z3
 194         XOR.D   $H3z,$Z3,$Z3            ; 10/4
 195         NOP                             ; 11/5
 196
 197         SPKERNEL 0,2
 198 ||      XOR.D   $res,$Z3,$Z3            ; 12/6/0; Z^=res
 199 ; per-block epilogue: prefetch next input (if B0), fix byte order, store Z to Xi, loop or return
 200         ; input pre-fetch is possible where D1 slot is available...
 201    [B0] LDNDW   *${inp}[1],$H1x:$H0x    ; 8/-
 202    [B0] LDNDW   *${inp}++[2],$H3x:$H2x  ; 9/-
 203         NOP                             ; 10/-
 204         .if     .LITTLE_ENDIAN
 205         SWAP2   $Z0,$Z1                 ; 11/-  ; SWAP2+SWAP4 pairs byte-reverse Z words, exchanging Z0/Z1
 206 ||      SWAP4   $Z1,$Z0
 207         SWAP4   $Z1,$Z1                 ; 12/-
 208 ||      SWAP2   $Z0,$Z0
 209         SWAP2   $Z2,$Z3                 ; same treatment for Z2/Z3
 210 ||      SWAP4   $Z3,$Z2
 211 ||[!B0] BNOP    RA                      ; no blocks left: return
 212         SWAP4   $Z3,$Z3
 213 ||      SWAP2   $Z2,$Z2
 214 || [B0] BNOP    ghash_loop?             ; more input: process the next block
 215    [B0] XOR     $H0x,$Z0,$Z0            ; Xi^=inp
 216 || [B0] XOR     $H1x,$Z1,$Z1
 217    [B0] XOR     $H2x,$Z2,$Z2
 218 || [B0] XOR     $H3x,$Z3,$Z3
 219 || [B0] SHRU    $Z1,24,$xia             ; Xi[15], avoid cross-path stall
 220         STDW    $Z1:$Z0,*${xip}[1]
 221 || [B0] SHRU    $Z1,16,$x0              ; Xi[14]
 222 || [B0] ZERO    $Z1:$Z0
 223         .else
 224   [!B0] BNOP    RA                      ; 11/-
 225    [B0] BNOP    ghash_loop?             ; 12/-
 226    [B0] XOR     $H0x,$Z0,$Z0            ; Xi^=inp
 227 || [B0] XOR     $H1x,$Z1,$Z1
 228    [B0] XOR     $H2x,$Z2,$Z2
 229 || [B0] XOR     $H3x,$Z3,$Z3
 230 || [B0] MV      $Z0,$xia                ; Xi[15], avoid cross-path stall
 231         STDW    $Z1:$Z0,*${xip}[1]
 232 || [B0] SHRU    $Z0,8,$x0               ; Xi[14]
 233 || [B0] ZERO    $Z1:$Z0
 234         .endif
 235         STDW    $Z3:$Z2,*${xip}[0]      ; Xi now holds the result (or Xi^inp for the next block)
 236 || [B0] ZERO    $Z3:$Z2
 237 || [B0] MV      $xia,$x1
 238    [B0] ADDK    14,${xip}               ; xip = &Xi[14] again for the next pass
 239         .endasmfunc
 240 ; identification string, placed in the .const section
 241         .sect   .const
 242         .cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
 243         .align  4
 244 ___
 245
 246 print $code;                                       # emit the accumulated assembly text
 247 close STDOUT or die "error closing STDOUT: $!";    # flush now; a failed write must not pass as success