crypto/bn/asm/bn-c64xplus.asm

   1 ;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
   2 ;;
   3 ;; Licensed under the OpenSSL license (the "License").  You may not use
   4 ;; this file except in compliance with the License.  You can obtain a copy
   5 ;; in the file LICENSE in the source distribution or at
   6 ;; https://www.openssl.org/source/license.html
   7 ;;
   8 ;;====================================================================
   9 ;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10 ;; project.
  11 ;;
  12 ;; Rights for redistribution and usage in source and binary forms are
  13 ;; granted according to the OpenSSL license. Warranty of any kind is
  14 ;; disclaimed.
  15 ;;====================================================================
  16 ;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
  17 ;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
  18 ;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
  19 ;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
  20 ;;====================================================================
  21         .text
  22
  23         .if     .ASSEMBLER_VERSION<7000000      ; assembler version below 7000000:
  24         .asg    0,__TI_EABI__                   ; make __TI_EABI__ expand to 0 (presumably not predefined there - confirm)
  25         .endif
  26         .if     __TI_EABI__                     ; EABI in effect: export the names without leading underscore
  27         .asg    bn_mul_add_words,_bn_mul_add_words      ; each .asg makes the _name used below expand to name
  28         .asg    bn_mul_words,_bn_mul_words
  29         .asg    bn_sqr_words,_bn_sqr_words
  30         .asg    bn_add_words,_bn_add_words
  31         .asg    bn_sub_words,_bn_sub_words
  32         .asg    bn_div_words,_bn_div_words
  33         .asg    bn_sqr_comba8,_bn_sqr_comba8
  34         .asg    bn_mul_comba8,_bn_mul_comba8
  35         .asg    bn_sqr_comba4,_bn_sqr_comba4
  36         .asg    bn_mul_comba4,_bn_mul_comba4
  37         .endif
  38
  39         .asg    B3,RA                   ; return address, target of every BNOP RA below
  40         .asg    A4,ARG0                 ; 1st argument
  41         .asg    B4,ARG1                 ; 2nd argument
  42         .asg    A6,ARG2                 ; 3rd argument
  43         .asg    B6,ARG3                 ; 4th argument
  44         .asg    A8,ARG4                 ; alias not referenced in the code visible here
  45         .asg    B8,ARG5                 ; alias not referenced in the code visible here
  46         .asg    A4,RET                  ; return value; same register as ARG0
  47         .asg    A15,FP                  ; alias not referenced in the code visible here
  48         .asg    B14,DP                  ; alias not referenced in the code visible here
  49         .asg    B15,SP                  ; alias not referenced in the code visible here
  50
  51         .global _bn_mul_add_words
     ;; bn_mul_add_words: for each of ARG2 words, rp[i] += ap[i] * ARG3,
     ;; carrying the overflow from word to word.
     ;;   In:  ARG0 = rp, ARG1 = ap, ARG2 = word count, ARG3 = multiplier
     ;;   Out: RET  = carry word left after the last element (0 if count is 0)
     ;;   Scratch: B0, B7, A2, A3, A7, A16-A21, A23, ILC
  52 _bn_mul_add_words:
  53         .asmfunc
  54         MV      ARG2,B0                 ; B0 = word count, also used as predicate
  55   [!B0] BNOP    RA                      ; count == 0: return right away; the next two
  56 ||[!B0] MVK     0,RET                   ; packets are predicated off and, together with NOP 3, fill the 5 delay slots
  57    [B0] MVC     B0,ILC                  ; loop trip count
  58    [B0] ZERO    A19             ; high part of accumulator
  59 || [B0] MV      ARG0,A2                 ; A2 = store pointer; ARG0 stays the load pointer for rp[i]
  60 || [B0] MV      ARG3,A3                 ; A3 = multiplier, copied to the A register file
  61         NOP     3                       ; presumably lets the ILC write settle before SPLOOP - see CPU guide
  62
  63         SPLOOP  2               ; 2*n+10
  64 ;;====================================================================
  65         LDW     *ARG1++,B7      ; ap[i]
  66         NOP     3
  67         LDW     *ARG0++,A7      ; rp[i]
  68         MPY32U  B7,A3,A17:A16           ; A17:A16 = ap[i] * multiplier (64-bit)
  69         NOP     3               ; [2,0] in epilogue
  70         ADDU    A16,A7,A21:A20          ; low product word + rp[i], carry lands in A21
  71         ADDU    A19,A21:A20,A19:A18     ; + carry from previous word; A18 = result word
  72 ||      MV.S    A17,A23                 ; keep a copy of the high product word
  73         SPKERNEL 2,1            ; leave slot for "return value"
  74 ||      STW     A18,*A2++       ; rp[i]
  75 ||      ADD     A19,A23,A19             ; next carry = carry bits of the sums + high product word
  76 ;;====================================================================
  77         BNOP    RA,4                    ; return; 4 NOPs plus the MV below make up the 5 delay slots
  78         MV      A19,RET         ; return value
  79         .endasmfunc
  80
  81         .global _bn_mul_words
     ;; bn_mul_words: for each of ARG2 words, rp[i] = low word of
     ;; (ap[i] * ARG3 + carry), where carry is the high part of the
     ;; previous step.
     ;;   In:  ARG0 = rp, ARG1 = ap, ARG2 = word count, ARG3 = multiplier
     ;;   Out: RET  = carry word left after the last element (0 if count is 0)
     ;;   Scratch: B0, A7, A16-A19, A21, ILC; ARG0 and ARG1 are advanced
  82 _bn_mul_words:
  83         .asmfunc
  84         MV      ARG2,B0                 ; B0 = word count, also used as predicate
  85   [!B0] BNOP    RA                      ; count == 0: return right away; the next two
  86 ||[!B0] MVK     0,RET                   ; packets are predicated off and, together with NOP 3, fill the 5 delay slots
  87    [B0] MVC     B0,ILC                  ; loop trip count
  88    [B0] ZERO    A19             ; high part of accumulator
  89         NOP     3                       ; presumably lets the ILC write settle before SPLOOP - see CPU guide
  90
  91         SPLOOP  2               ; 2*n+10
  92 ;;====================================================================
  93         LDW     *ARG1++,A7      ; ap[i]
  94         NOP     4
  95         MPY32U  A7,ARG3,A17:A16         ; A17:A16 = ap[i] * multiplier (64-bit)
  96         NOP     4               ; [2,0] in epilogue
  97         ADDU    A19,A16,A19:A18         ; carry in + low product word; A18 = result word
  98 ||      MV.S    A17,A21                 ; keep a copy of the high product word
  99         SPKERNEL 2,1            ; leave slot for "return value"
 100 ||      STW     A18,*ARG0++     ; rp[i]
 101 ||      ADD.L   A19,A21,A19             ; next carry = carry bit of the sum + high product word
 102 ;;====================================================================
 103         BNOP    RA,4                    ; return; 4 NOPs plus the MV below make up the 5 delay slots
 104         MV      A19,RET         ; return value
 105         .endasmfunc
 106
 107         .global _bn_sqr_words
     ;; bn_sqr_words: for each of ARG2 words, store the 64-bit square of
     ;; ap[i] as the word pair rp[2*i] (low) and rp[2*i+1] (high).
     ;;   In:  ARG0 = rp, ARG1 = ap, ARG2 = word count
     ;;   Out: RET is set to 0 only on the count == 0 path; otherwise ARG0,
     ;;        which is the same register as RET, ends up as the advanced
     ;;        odd-word store pointer.
     ;;   Scratch: B0, B1, B2, B7, A1, ILC
 108 _bn_sqr_words:
 109         .asmfunc
 110         MV      ARG2,B0                 ; B0 = word count, also used as predicate
 111   [!B0] BNOP    RA                      ; count == 0: return right away; the next two
 112 ||[!B0] MVK     0,RET                   ; packets are predicated off and, together with NOP 3, fill the 5 delay slots
 113    [B0] MVC     B0,ILC                  ; loop trip count
 114    [B0] MV      ARG0,B2                 ; B2   = pointer to even words, rp[2*i]
 115 || [B0] ADD     4,ARG0,ARG0             ; ARG0 = pointer to odd words, rp[2*i+1]
 116         NOP     3                       ; presumably lets the ILC write settle before SPLOOP - see CPU guide
 117
 118         SPLOOP  2               ; 2*n+10
 119 ;;====================================================================
 120         LDW     *ARG1++,B7      ; ap[i]
 121         NOP     4
 122         MPY32U  B7,B7,B1:B0             ; B1:B0 = ap[i]^2; B0 is free again, its count already sits in ILC
 123         NOP     3               ; [2,0] in epilogue
 124         STW     B0,*B2++(8)     ; rp[2*i]
 125         MV      B1,A1                   ; high word over to the A side for the second store
 126         SPKERNEL 2,0            ; fully overlap BNOP RA,5
 127 ||      STW     A1,*ARG0++(8)   ; rp[2*i+1]
 128 ;;====================================================================
 129         BNOP    RA,5                    ; return
 130         .endasmfunc
 131
 132         .global _bn_add_words
     ;; bn_add_words: for each of ARG3 words, rp[i] = ap[i] + bp[i] + carry,
     ;; with the carry chained from word to word.
     ;;   In:  ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = word count
     ;;   Out: RET  = carry out of the last word (0 if count is 0)
     ;;   Scratch: B0, B7, A0, A1, A3, A7, A8, A9, ILC
 133 _bn_add_words:
 134         .asmfunc
 135         MV      ARG3,B0                 ; B0 = word count, also used as predicate
 136   [!B0] BNOP    RA                      ; count == 0: return right away; the next two
 137 ||[!B0] MVK     0,RET                   ; packets are predicated off and, together with NOP 3, fill the 5 delay slots
 138    [B0] MVC     B0,ILC                  ; loop trip count
 139    [B0] ZERO    A1              ; carry flag
 140 || [B0] MV      ARG0,A3                 ; A3 = store pointer, since RET (same register as ARG0) receives the carry
 141         NOP     3                       ; presumably lets the ILC write settle before SPLOOP - see CPU guide
 142
 143         SPLOOP  2               ; 2*n+6
 144 ;;====================================================================
 145         LDW     *ARG2++,A7      ; bp[i]
 146 ||      LDW     *ARG1++,B7      ; ap[i]
 147         NOP     4
 148         ADDU    A7,B7,A9:A8             ; A9:A8 = ap[i] + bp[i], carry lands in A9
 149         ADDU    A1,A9:A8,A1:A0          ; + carry in; A0 = result word, A1 = carry out
 150         SPKERNEL 0,0            ; fully overlap BNOP RA,5
 151 ||      STW     A0,*A3++        ; write result
 152 ||      MV      A1,RET          ; keep carry flag in RET
 153 ;;====================================================================
 154         BNOP    RA,5                    ; return; RET already holds the last carry
 155         .endasmfunc
 156
 157         .global _bn_sub_words
     ;; bn_sub_words: for each of ARG3 words, rp[i] = ap[i] - bp[i] - borrow,
     ;; with the borrow chained from word to word.
     ;;   In:  ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = word count
     ;;   Out: RET  = borrow out of the last word, 0 or 1 (0 if count is 0)
     ;;   Scratch: B0, B7, A0, A1, A2, A3, A7, ILC
 158 _bn_sub_words:
 159         .asmfunc
 160         MV      ARG3,B0                 ; B0 = word count, also used as predicate
 161   [!B0] BNOP    RA                      ; count == 0: return right away; the next two
 162 ||[!B0] MVK     0,RET                   ; packets are predicated off and, together with NOP 3, fill the 5 delay slots
 163    [B0] MVC     B0,ILC                  ; loop trip count
 164    [B0] ZERO    A2              ; borrow flag
 165 || [B0] MV      ARG0,A3                 ; A3 = store pointer, since RET (same register as ARG0) receives the borrow
 166         NOP     3                       ; presumably lets the ILC write settle before SPLOOP - see CPU guide
 167
 168         SPLOOP  2               ; 2*n+6
 169 ;;====================================================================
 170         LDW     *ARG2++,A7      ; bp[i]
 171 ||      LDW     *ARG1++,B7      ; ap[i]
 172         NOP     4
 173         SUBU    B7,A7,A1:A0             ; A1:A0 = ap[i] - bp[i] as a wide difference
 174   [A2]  SUB     A1:A0,1,A1:A0           ; take off the incoming borrow, if any
 175         SPKERNEL 0,1            ; leave slot for "return borrow flag"
 176 ||      STW     A0,*A3++        ; write result
 177 ||      AND     1,A1,A2         ; pass on borrow flag
 178 ;;====================================================================
 179         BNOP    RA,4                    ; return; 4 NOPs plus the AND below make up the 5 delay slots
 180         AND     1,A1,RET        ; return borrow flag
 181         .endasmfunc
 182
 183         .global _bn_div_words
     ;; bn_div_words: bit-serial (shift, compare, subtract) division of the
     ;; two-word value hi:lo by dv.
     ;;   In:  A4 = hi, B4 = lo, A6 = dv  (that is ARG0, ARG1, ARG2)
     ;;   Out: A4 = quotient, or -1 when hi has fewer leading zero bits
     ;;        than dv (flagged as overflow below)
     ;;   Scratch: A0, A1, A2, A3, A5, A6, B0, ILC
     ;; One division step is peeled off ahead of the SPLOOP; the same five
     ;; packets also serve as the branch delay slots of the overflow return.
 184 _bn_div_words:
 185         .asmfunc
 186         LMBD    1,A6,A0         ; leading zero bits in dv
 187         LMBD    1,A4,A1         ; leading zero bits in hi
 188 ||      MVK     32,B0
 189         CMPLTU  A1,A0,A2                ; A2 = 1 if hi has fewer leading zeros than dv
 190 ||      ADD     A0,B0,B0                ; B0 = 32 + leading zero bits in dv, becomes ILC
 191   [ A2] BNOP    RA                      ; overflow: return, next five packets are the delay slots
 192 ||[ A2] MVK     -1,A4           ; return overflow
 193 ||[!A2] MV      A4,A3           ; reassign hi
 194   [!A2] MV      B4,A4           ; reassign lo, will be quotient
 195 ||[!A2] MVC     B0,ILC
 196   [!A2] SHL     A6,A0,A6        ; normalize dv
 197 ||      MVK     1,A1                    ; unconditional: on the overflow path A1 stays 1 and disables the [!A1] packet below
 198
 199   [!A2] CMPLTU  A3,A6,A1        ; hi<dv?
 200 ||[!A2] SHL     A4,1,A5:A4      ; lo<<1
 201   [!A1] SUB     A3,A6,A3        ; hi-=dv
 202 ||[!A1] OR      1,A4,A4                 ; set the quotient bit just shifted in
 203   [!A2] SHRU    A3,31,A1        ; upper bit
 204 ||[!A2] ADDAH   A5,A3,A3        ; hi<<1|lo>>31
 205
 206         SPLOOP  3
 207   [!A1] CMPLTU  A3,A6,A1        ; hi<dv?
 208 ||[ A1] ZERO    A1                      ; a bit was shifted out of hi: skip the compare and force the subtract
 209 ||      SHL     A4,1,A5:A4      ; lo<<1
 210   [!A1] SUB     A3,A6,A3        ; hi-=dv
 211 ||[!A1] OR      1,A4,A4         ; quotient
 212         SHRU    A3,31,A1        ; upper bit
 213 ||      ADDAH   A5,A3,A3        ; hi<<1|lo>>31
 214         SPKERNEL
 215
 216         BNOP    RA,5                    ; return, quotient is in A4
 217         .endasmfunc
 218
 219 ;;====================================================================
 220 ;; Not really Comba algorithm, just straightforward NxM... Dedicated
 221 ;; fully unrolled real Comba implementations are asymptotically 2x
 222 ;; faster, but naturally a larger undertaking. Purpose of this exercise
 223 ;; was rather to learn to master nested SPLOOPs...
 224 ;;====================================================================
 225         .global _bn_sqr_comba8
 226         .global _bn_mul_comba8
     ;; bn_mul_comba8: rp[0..15] = ap[0..7] * bp[0..7], computed as M = 8
     ;; passes of the multiply-and-add inner loop over N = 8 words of bp,
     ;; one pass per word of ap.
     ;; bn_sqr_comba8: same, after copying ap into the bp argument.
     ;;   In:  ARG0 = rp, ARG1 = ap, ARG2 = bp (mul entry only)
     ;;   Out: result words stored through rp; RET is not set
     ;; Register roles:
     ;;   A0 = outer counter, A3 = M, B0 = N, B1 = N-2, B2 = N-1
     ;;   A5 = ap read pointer, A9 = pre-fetched ap word, B6 = current ap word
     ;;   B4 = rp store pointer, B5 = rp load pointer
     ;;   A1 = 0 during the first pass, 1 afterwards; it gates the rp[i] load
     ;;   B19 = running high word, stored after each pass
 227 _bn_sqr_comba8:
 228         MV      ARG1,ARG2               ; squaring: bp = ap, then fall through
 229 _bn_mul_comba8:
 230         .asmfunc
 231         MVK     8,B0            ; N, RILC
 232 ||      MVK     8,A0            ; M, outer loop counter
 233 ||      MV      ARG1,A5         ; copy ap
 234 ||      MV      ARG0,B4         ; copy rp
 235 ||      ZERO    B19             ; high part of accumulator
 236         MVC     B0,RILC                 ; reload count for the passes started by SPMASKR
 237 ||      SUB     B0,2,B1         ; N-2, initial ILC
 238 ||      SUB     B0,1,B2         ; const B2=N-1
 239 ||      LDW     *A5++,B6        ; ap[0]
 240 ||      MV      A0,A3           ; const A3=M
 241 sploopNxM?:                     ; for best performance arrange M<=N
 242    [A0] SPLOOPD 2               ; 2*n+10
 243 ||      MVC     B1,ILC
 244 ||      ADDAW   B4,B0,B5                ; B5 = rp + N words; each outer? block pulls it back by N-1
 245 ||      ZERO    B7                      ; first pass adds 0 in place of rp[i]
 246 ||      LDW     *A5++,A9        ; pre-fetch ap[1]
 247 ||      ZERO    A1                      ; first pass: rp[i] load below is switched off
 248 ||      SUB     A0,1,A0
 249 ;;====================================================================
 250 ;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
 251 ;; This is because of Advisory 15 from TI publication SPRZ247I.
 252         LDW     *ARG2++,A7      ; bp[i]
 253         NOP     3
 254    [A1] LDW     *B5++,B7        ; rp[i]
 255         MPY32U  A7,B6,B17:B16           ; B17:B16 = bp[i] * current ap word
 256         NOP     3
 257         ADDU    B16,B7,B21:B20          ; low product word + rp[i], carry lands in B21
 258         ADDU    B19,B21:B20,B19:B18     ; + running high word; B18 = result word
 259 ||      MV.S    B17,B23                 ; keep a copy of the high product word
 260         SPKERNEL
 261 ||      STW     B18,*B4++       ; rp[i]
 262 ||      ADD.S   B19,B23,B19             ; next high word = carry bits of the sums + high product word
 263 ;;====================================================================
 264 outer?:                         ; m*2*(n+1)+10
 265         SUBAW   ARG2,A3,ARG2    ; rewind bp to bp[0]
 266         SPMASKR                         ; starts the next inner pass (reload) - see CPU guide for exact semantics
 267 ||      CMPGT   A0,1,A2         ; done pre-fetching ap[i+1]?
 268         MVD     A9,B6           ; move through .M unit(*)
 269    [A2] LDW     *A5++,A9        ; pre-fetch ap[i+1]
 270         SUBAW   B5,B2,B5        ; rewind rp to rp[1]
 271         MVK     1,A1                    ; from now on rp[i] is loaded and accumulated
 272    [A0] BNOP.S1 outer?,4                ; another pass pending: loop; the packet below is the 5th delay slot
 273 || [A0] SUB.L   A0,1,A0
 274         STW     B19,*B4--[B2]   ; store top word of the pass, rewind rp to rp[1]
 275 ||      ZERO.S  B19             ; high part of accumulator
 276 ;; end of outer?
 277         BNOP    RA,5            ; return
 278         .endasmfunc
 279 ;; (*)  It should be noted that B6 is used as input to MPY32U in
 280 ;;      chronologically next cycle in *preceding* SPLOOP iteration.
 281 ;;      Normally such arrangement would require DINT, but at this
 282 ;;      point SPLOOP is draining and interrupts are disabled
 283 ;;      implicitly.
 284
 285         .global _bn_sqr_comba4
 286         .global _bn_mul_comba4
     ;; bn_mul_comba4: rp[0..7] = ap[0..3] * bp[0..3], fully unrolled and
     ;; processed one result column at a time.
     ;; bn_sqr_comba4: same, after copying ap into the bp argument.
     ;;   In:  ARG0 = rp, ARG1 = ap, ARG2 = bp (mul entry only)
     ;;   Out: result words stored through rp; RET is not set
     ;; Scheme of the active (.else) variant:
     ;;   B16-B19 = a[0..3], A16-A19 = b[0..3]
     ;;   low product words of a column are summed on the A side, in
     ;;   A1:A0 and A9:A8 alternately; high product words are summed on the
     ;;   B side, in B1:B0 and B9:B8 alternately; the B-side sum of one
     ;;   column is folded into the A-side sum of the next column.
     ;;   Product registers A21-A31 are reused once their value is consumed.
 287 _bn_sqr_comba4:
 288         MV      ARG1,ARG2               ; squaring: bp = ap, then fall through
 289 _bn_mul_comba4:
 290         .asmfunc
 291         .if     0                       ; disabled variant: would branch into the comba8 loop with N=M=4
 292         BNOP    sploopNxM?,3
 293         ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
 294         ;; because of low-counter effect, when prologue phase finishes
 295         ;; before SPKERNEL instruction is reached. As result it's 25%
 296         ;; slower than expected...
 297         MVK     4,B0            ; N, RILC
 298 ||      MVK     4,A0            ; M, outer loop counter
 299 ||      MV      ARG1,A5         ; copy ap
 300 ||      MV      ARG0,B4         ; copy rp
 301 ||      ZERO    B19             ; high part of accumulator
 302         MVC     B0,RILC
 303 ||      SUB     B0,2,B1         ; first ILC
 304 ||      SUB     B0,1,B2         ; const B2=N-1
 305 ||      LDW     *A5++,B6        ; ap[0]
 306 ||      MV      A0,A3           ; const A3=M
 307         .else
 308         ;; This alternative is an exercise in fully unrolled Comba
 309         ;; algorithm implementation that operates at n*(n+1)+12, or
 310         ;; as little as 32 cycles...
 311         LDW     *ARG1[0],B16    ; a[0]
 312 ||      LDW     *ARG2[0],A16    ; b[0]
 313         LDW     *ARG1[1],B17    ; a[1]
 314 ||      LDW     *ARG2[1],A17    ; b[1]
 315         LDW     *ARG1[2],B18    ; a[2]
 316 ||      LDW     *ARG2[2],A18    ; b[2]
 317         LDW     *ARG1[3],B19    ; a[3]
 318 ||      LDW     *ARG2[3],A19    ; b[3]
 319         NOP
 320         MPY32U  A16,B16,A1:A0   ; a[0]*b[0]
 321         MPY32U  A17,B16,A23:A22 ; a[0]*b[1]
 322         MPY32U  A16,B17,A25:A24 ; a[1]*b[0]
 323         MPY32U  A16,B18,A27:A26 ; a[2]*b[0]
 324         STW     A0,*ARG0[0]             ; r[0] = low word of a[0]*b[0]
 325 ||      MPY32U  A17,B17,A29:A28 ; a[1]*b[1]
 326         MPY32U  A18,B16,A31:A30 ; a[0]*b[2]
 327 ||      ADDU    A22,A1,A1:A0            ; column 1, low words: start from high word of a[0]*b[0]
 328         MV      A23,B0                  ; column 1, high words: start the B-side sum
 329 ||      MPY32U  A19,B16,A21:A20 ; a[0]*b[3]
 330 ||      ADDU    A24,A1:A0,A1:A0
 331         ADDU    A25,B0,B1:B0
 332 ||      STW     A0,*ARG0[1]             ; r[1]
 333 ||      MPY32U  A18,B17,A23:A22 ; a[1]*b[2]
 334 ||      ADDU    A26,A1,A9:A8            ; column 2, low words: start from carry of column 1
 335         ADDU    A27,B1,B9:B8            ; column 2, high words: start from carry of the column 1 high sum
 336 ||      MPY32U  A17,B18,A25:A24 ; a[2]*b[1]
 337 ||      ADDU    A28,A9:A8,A9:A8
 338         ADDU    A29,B9:B8,B9:B8
 339 ||      MPY32U  A16,B19,A27:A26 ; a[3]*b[0]
 340 ||      ADDU    A30,A9:A8,A9:A8
 341         ADDU    A31,B9:B8,B9:B8
 342 ||      ADDU    B0,A9:A8,A9:A8          ; fold in the column 1 high sum
 343         STW     A8,*ARG0[2]             ; r[2]
 344 ||      ADDU    A20,A9,A1:A0            ; column 3, low words
 345         ADDU    A21,B9,B1:B0            ; column 3, high words
 346 ||      MPY32U  A19,B17,A21:A20 ; a[1]*b[3]
 347 ||      ADDU    A22,A1:A0,A1:A0
 348         ADDU    A23,B1:B0,B1:B0
 349 ||      MPY32U  A18,B18,A23:A22 ; a[2]*b[2]
 350 ||      ADDU    A24,A1:A0,A1:A0
 351         ADDU    A25,B1:B0,B1:B0
 352 ||      MPY32U  A17,B19,A25:A24 ; a[3]*b[1]
 353 ||      ADDU    A26,A1:A0,A1:A0
 354         ADDU    A27,B1:B0,B1:B0
 355 ||      ADDU    B8,A1:A0,A1:A0          ; fold in the column 2 high sum
 356         STW     A0,*ARG0[3]             ; r[3]
 357 ||      MPY32U  A19,B18,A27:A26 ; a[2]*b[3]
 358 ||      ADDU    A20,A1,A9:A8            ; column 4, low words
 359         ADDU    A21,B1,B9:B8            ; column 4, high words
 360 ||      MPY32U  A18,B19,A29:A28 ; a[3]*b[2]
 361 ||      ADDU    A22,A9:A8,A9:A8
 362         ADDU    A23,B9:B8,B9:B8
 363 ||      MPY32U  A19,B19,A31:A30 ; a[3]*b[3]
 364 ||      ADDU    A24,A9:A8,A9:A8
 365         ADDU    A25,B9:B8,B9:B8
 366 ||      ADDU    B0,A9:A8,A9:A8          ; fold in the column 3 high sum
 367         STW     A8,*ARG0[4]             ; r[4]
 368 ||      ADDU    A26,A9,A1:A0            ; column 5, low words
 369         ADDU    A27,B9,B1:B0            ; column 5, high words
 370 ||      ADDU    A28,A1:A0,A1:A0
 371         ADDU    A29,B1:B0,B1:B0
 372 ||      BNOP    RA                      ; return; the five packets that follow are its delay slots
 373 ||      ADDU    B8,A1:A0,A1:A0          ; fold in the column 4 high sum
 374         STW     A0,*ARG0[5]             ; r[5]
 375 ||      ADDU    A30,A1,A9:A8            ; column 6, low word of a[3]*b[3]
 376         ADD     A31,B1,B8               ; top word: high word of a[3]*b[3] + carry of the column 5 high sum
 377         ADDU    B0,A9:A8,A9:A8  ; removed || to avoid cross-path stall below
 378         ADD     B8,A9,A9                ; r[7] = top word + carry out of column 6
 379 ||      STW     A8,*ARG0[6]             ; r[6]
 380         STW     A9,*ARG0[7]             ; r[7], issued in the last delay slot
 381         .endif
 382         .endasmfunc