crypto/bn/asm/bn-c64xplus.asm

   1 ;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
   2 ;;
   3 ;; Licensed under the Apache License 2.0 (the "License").  You may not use
   4 ;; this file except in compliance with the License.  You can obtain a copy
   5 ;; in the file LICENSE in the source distribution or at
   6 ;; https://www.openssl.org/source/license.html
   7 ;;
   8 ;;====================================================================
   9 ;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10 ;; project.
  11 ;;
  12 ;; Rights for redistribution and usage in source and binary forms are
  13 ;; granted according to the License. Warranty of any kind is disclaimed.
  14 ;;====================================================================
  15 ;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
  16 ;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
  17 ;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
  18 ;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
  19 ;;====================================================================
  20         .text
  21
     ;; If .ASSEMBLER_VERSION is below 7000000, define __TI_EABI__ as 0 so
     ;; that the .if __TI_EABI__ test further down has a value to evaluate.
  22         .if     .ASSEMBLER_VERSION<7000000
  23         .asg    0,__TI_EABI__
  24         .endif
     ;; When __TI_EABI__ is non-zero, every underscore-prefixed entry point
     ;; name used in this file is substituted by its unprefixed spelling.
  25         .if     __TI_EABI__
  26         .asg    bn_mul_add_words,_bn_mul_add_words
  27         .asg    bn_mul_words,_bn_mul_words
  28         .asg    bn_sqr_words,_bn_sqr_words
  29         .asg    bn_add_words,_bn_add_words
  30         .asg    bn_sub_words,_bn_sub_words
  31         .asg    bn_div_words,_bn_div_words
  32         .asg    bn_sqr_comba8,_bn_sqr_comba8
  33         .asg    bn_mul_comba8,_bn_mul_comba8
  34         .asg    bn_sqr_comba4,_bn_sqr_comba4
  35         .asg    bn_mul_comba4,_bn_mul_comba4
  36         .endif
  37
     ;; Symbolic register names. ARG0..ARG3, RET and RA are used by the
     ;; routines below; ARG4, ARG5, FP, DP and SP are not referenced in the
     ;; part of the file shown here.
  38         .asg    B3,RA           ; target of every "BNOP RA" return
  39         .asg    A4,ARG0         ; 1st argument (shares A4 with RET)
  40         .asg    B4,ARG1         ; 2nd argument
  41         .asg    A6,ARG2         ; 3rd argument
  42         .asg    B6,ARG3         ; 4th argument
  43         .asg    A8,ARG4
  44         .asg    B8,ARG5
  45         .asg    A4,RET          ; function result
  46         .asg    A15,FP
  47         .asg    B14,DP
  48         .asg    B15,SP
  49
  50         .global _bn_mul_add_words
     ;;--------------------------------------------------------------------
     ;; _bn_mul_add_words: rp[i] += ap[i]*w for each word, with the carry
     ;; word propagated from one element to the next.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = word count, ARG3 = multiplier w
     ;; Out:  RET  = final carry word (0 when the word count is 0)
     ;; Uses: A2, A3, A7, A16-A21, A23, B0, B7, ILC; ARG0/ARG1 are advanced
     ;;--------------------------------------------------------------------
  51 _bn_mul_add_words:
  52         .asmfunc
  53         MV      ARG2,B0         ; B0 = word count (predicate and trip count)
  54   [!B0] BNOP    RA              ; count == 0: return at once...
  55 ||[!B0] MVK     0,RET           ; ...with RET = 0
  56    [B0] MVC     B0,ILC          ; load the SPLOOP trip count
  57    [B0] ZERO    A19             ; high part of accumulator
  58 || [B0] MV      ARG0,A2         ; A2 = rp store pointer, ARG0 stays the load pointer
  59 || [B0] MV      ARG3,A3         ; A3 = multiplier word
  60         NOP     3               ; idle cycles between the ILC write and SPLOOP
  61
  62         SPLOOP  2               ; 2*n+10
  63 ;;====================================================================
  64         LDW     *ARG1++,B7      ; ap[i]
  65         NOP     3
  66         LDW     *ARG0++,A7      ; rp[i]
  67         MPY32U  B7,A3,A17:A16   ; A17:A16 = ap[i]*w
  68         NOP     3               ; [2,0] in epilogue
  69         ADDU    A16,A7,A21:A20  ; product low word + rp[i]
  70         ADDU    A19,A21:A20,A19:A18 ; + carry-in; A18 = result word
  71 ||      MV.S    A17,A23         ; keep product high word
  72         SPKERNEL 2,1            ; leave slot for "return value"
  73 ||      STW     A18,*A2++       ; rp[i]
  74 ||      ADD     A19,A23,A19     ; carry-out = carries + product high word
  75 ;;====================================================================
  76         BNOP    RA,4
  77         MV      A19,RET         ; return value
  78         .endasmfunc
  79
  80         .global _bn_mul_words
     ;;--------------------------------------------------------------------
     ;; _bn_mul_words: rp[i] = low word of (ap[i]*w + carry), with the carry
     ;; word propagated from one element to the next.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = word count, ARG3 = multiplier w
     ;; Out:  RET  = final carry word (0 when the word count is 0)
     ;; Uses: A7, A16-A19, A21, B0, ILC; ARG0/ARG1 are advanced
     ;;--------------------------------------------------------------------
  81 _bn_mul_words:
  82         .asmfunc
  83         MV      ARG2,B0         ; B0 = word count (predicate and trip count)
  84   [!B0] BNOP    RA              ; count == 0: return at once...
  85 ||[!B0] MVK     0,RET           ; ...with RET = 0
  86    [B0] MVC     B0,ILC          ; load the SPLOOP trip count
  87    [B0] ZERO    A19             ; high part of accumulator
  88         NOP     3               ; idle cycles between the ILC write and SPLOOP
  89
  90         SPLOOP  2               ; 2*n+10
  91 ;;====================================================================
  92         LDW     *ARG1++,A7      ; ap[i]
  93         NOP     4
  94         MPY32U  A7,ARG3,A17:A16 ; A17:A16 = ap[i]*w, w read straight from ARG3
  95         NOP     4               ; [2,0] in epilogue
  96         ADDU    A19,A16,A19:A18 ; carry-in + product low word; A18 = result word
  97 ||      MV.S    A17,A21         ; keep product high word
  98         SPKERNEL 2,1            ; leave slot for "return value"
  99 ||      STW     A18,*ARG0++     ; rp[i]
 100 ||      ADD.L   A19,A21,A19     ; carry-out = carry + product high word
 101 ;;====================================================================
 102         BNOP    RA,4
 103         MV      A19,RET         ; return value
 104         .endasmfunc
 105
 106         .global _bn_sqr_words
     ;;--------------------------------------------------------------------
     ;; _bn_sqr_words: rp[2*i] and rp[2*i+1] receive the low and high words
     ;; of ap[i]*ap[i].
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = word count
     ;; Out:  RET is set to 0 only on the zero-count path; otherwise A4
     ;;       (shared by ARG0 and RET) is left holding the advanced pointer.
     ;; Uses: A1, B0, B1, B2, B7, ILC; ARG0/ARG1 are advanced
     ;;--------------------------------------------------------------------
 107 _bn_sqr_words:
 108         .asmfunc
 109         MV      ARG2,B0         ; B0 = word count (predicate and trip count)
 110   [!B0] BNOP    RA              ; count == 0: return at once...
 111 ||[!B0] MVK     0,RET           ; ...with RET = 0
 112    [B0] MVC     B0,ILC          ; load the SPLOOP trip count
 113    [B0] MV      ARG0,B2         ; B2 = pointer for the even words, rp[2*i]
 114 || [B0] ADD     4,ARG0,ARG0     ; ARG0 = pointer for the odd words, rp[2*i+1]
 115         NOP     3               ; idle cycles between the ILC write and SPLOOP
 116
 117         SPLOOP  2               ; 2*n+10
 118 ;;====================================================================
 119         LDW     *ARG1++,B7      ; ap[i]
 120         NOP     4
 121         MPY32U  B7,B7,B1:B0     ; B1:B0 = ap[i]^2 (B0 is free once ILC is loaded)
 122         NOP     3               ; [2,0] in epilogue
 123         STW     B0,*B2++(8)     ; rp[2*i]
 124         MV      B1,A1           ; high word goes out through the A side
 125         SPKERNEL 2,0            ; fully overlap BNOP RA,5
 126 ||      STW     A1,*ARG0++(8)   ; rp[2*i+1]
 127 ;;====================================================================
 128         BNOP    RA,5
 129         .endasmfunc
 130
 131         .global _bn_add_words
     ;;--------------------------------------------------------------------
     ;; _bn_add_words: rp[i] = ap[i] + bp[i] + carry, word by word.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = word count
     ;; Out:  RET  = carry out of the last word (0 when the count is 0)
     ;; Uses: A0, A1, A3, A7-A9, B0, B7, ILC; ARG1/ARG2 are advanced
     ;;--------------------------------------------------------------------
 132 _bn_add_words:
 133         .asmfunc
 134         MV      ARG3,B0         ; B0 = word count (predicate and trip count)
 135   [!B0] BNOP    RA              ; count == 0: return at once...
 136 ||[!B0] MVK     0,RET           ; ...with RET = 0
 137    [B0] MVC     B0,ILC          ; load the SPLOOP trip count
 138    [B0] ZERO    A1              ; carry flag
 139 || [B0] MV      ARG0,A3         ; A3 = rp store pointer (A4 is reused as RET)
 140         NOP     3               ; idle cycles between the ILC write and SPLOOP
 141
 142         SPLOOP  2               ; 2*n+6
 143 ;;====================================================================
 144         LDW     *ARG2++,A7      ; bp[i]
 145 ||      LDW     *ARG1++,B7      ; ap[i]
 146         NOP     4
 147         ADDU    A7,B7,A9:A8     ; bp[i] + ap[i], carry lands in A9
 148         ADDU    A1,A9:A8,A1:A0  ; + carry-in; A0 = result word, A1 = carry-out
 149         SPKERNEL 0,0            ; fully overlap BNOP RA,5
 150 ||      STW     A0,*A3++        ; write result
 151 ||      MV      A1,RET          ; keep carry flag in RET
 152 ;;====================================================================
 153         BNOP    RA,5
 154         .endasmfunc
 155
 156         .global _bn_sub_words
     ;;--------------------------------------------------------------------
     ;; _bn_sub_words: rp[i] = ap[i] - bp[i] - borrow, word by word.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = word count
     ;; Out:  RET  = borrow out of the last word (0 when the count is 0)
     ;; Uses: A0-A3, A7, B0, B7, ILC; ARG1/ARG2 are advanced
     ;;--------------------------------------------------------------------
 157 _bn_sub_words:
 158         .asmfunc
 159         MV      ARG3,B0         ; B0 = word count (predicate and trip count)
 160   [!B0] BNOP    RA              ; count == 0: return at once...
 161 ||[!B0] MVK     0,RET           ; ...with RET = 0
 162    [B0] MVC     B0,ILC          ; load the SPLOOP trip count
 163    [B0] ZERO    A2              ; borrow flag
 164 || [B0] MV      ARG0,A3         ; A3 = rp store pointer (A4 is reused as RET)
 165         NOP     3               ; idle cycles between the ILC write and SPLOOP
 166
 167         SPLOOP  2               ; 2*n+6
 168 ;;====================================================================
 169         LDW     *ARG2++,A7      ; bp[i]
 170 ||      LDW     *ARG1++,B7      ; ap[i]
 171         NOP     4
 172         SUBU    B7,A7,A1:A0     ; ap[i] - bp[i] as a long result in A1:A0
 173   [A2]  SUB     A1:A0,1,A1:A0   ; take off the incoming borrow
 174         SPKERNEL 0,1            ; leave slot for "return borrow flag"
 175 ||      STW     A0,*A3++        ; write result
 176 ||      AND     1,A1,A2         ; pass on borrow flag
 177 ;;====================================================================
 178         BNOP    RA,4
 179         AND     1,A1,RET        ; return borrow flag
 180         .endasmfunc
 181
 182         .global _bn_div_words
     ;;--------------------------------------------------------------------
     ;; _bn_div_words: bit-serial (shift, compare, subtract) division of
     ;; the two-word value hi:lo by dv.
     ;; In:   A4 = hi, B4 = lo, A6 = dv
     ;; Out:  A4 = quotient, or -1 when hi has fewer leading zero bits than
     ;;       dv (the "overflow" exit)
     ;; Uses: A0-A6, B0, ILC
     ;;--------------------------------------------------------------------
 183 _bn_div_words:
 184         .asmfunc
 185         LMBD    1,A6,A0         ; leading zero bits in dv
 186         LMBD    1,A4,A1         ; leading zero bits in hi
 187 ||      MVK     32,B0
 188         CMPLTU  A1,A0,A2        ; A2 = lz(hi) < lz(dv): overflow
 189 ||      ADD     A0,B0,B0        ; B0 = 32 + lz(dv), loaded into ILC below
 190   [ A2] BNOP    RA              ; the packets up to SPLOOP sit in its delay slots
 191 ||[ A2] MVK     -1,A4           ; return overflow
 192 ||[!A2] MV      A4,A3           ; reassign hi
 193   [!A2] MV      B4,A4           ; reassign lo, will be quotient
 194 ||[!A2] MVC     B0,ILC
 195   [!A2] SHL     A6,A0,A6        ; normalize dv
 196 ||      MVK     1,A1            ; A1 = 1 keeps the [!A1] pair below idle on overflow
 197
     ;; First shift/compare/subtract step, issued ahead of the loop.
 198   [!A2] CMPLTU  A3,A6,A1        ; hi<dv?
 199 ||[!A2] SHL     A4,1,A5:A4      ; lo<<1
 200   [!A1] SUB     A3,A6,A3        ; hi-=dv
 201 ||[!A1] OR      1,A4,A4         ; set quotient bit
 202   [!A2] SHRU    A3,31,A1        ; upper bit
 203 ||[!A2] ADDAH   A5,A3,A3        ; hi<<1|lo>>31
 204
 205         SPLOOP  3
 206   [!A1] CMPLTU  A3,A6,A1        ; hi<dv?
 207 ||[ A1] ZERO    A1              ; a bit shifted out of hi forces the subtract
 208 ||      SHL     A4,1,A5:A4      ; lo<<1
 209   [!A1] SUB     A3,A6,A3        ; hi-=dv
 210 ||[!A1] OR      1,A4,A4         ; quotient
 211         SHRU    A3,31,A1        ; upper bit
 212 ||      ADDAH   A5,A3,A3        ; hi<<1|lo>>31
 213         SPKERNEL
 214
 215         BNOP    RA,5
 216         .endasmfunc
 217
 218 ;;====================================================================
 219 ;; Not really Comba algorithm, just straightforward NxM... Dedicated
 220 ;; fully unrolled real Comba implementations are asymptotically 2x
 221 ;; faster, but naturally larger undertaking. Purpose of this exercise
 222 ;; was rather to learn to master nested SPLOOPs...
 223 ;;====================================================================
 224         .global _bn_sqr_comba8
 225         .global _bn_mul_comba8
     ;;--------------------------------------------------------------------
     ;; _bn_mul_comba8: rp = ap * bp for 8-word operands, done as M = 8
     ;; passes of an N = 8 word multiply-accumulate SPLOOP. The first pass
     ;; does not read rp (A1 = 0, B7 = 0); later passes add into it.
     ;; _bn_sqr_comba8: copies ARG1 into ARG2 and falls through, so the
     ;; same code multiplies ap by itself.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = bp
     ;; Uses: A0-A3, A5, A7, A9, B0-B2, B4-B7, B16-B21, B23, ILC, RILC
     ;;--------------------------------------------------------------------
 226 _bn_sqr_comba8:
 227         MV      ARG1,ARG2       ; bp = ap, then fall through
 228 _bn_mul_comba8:
 229         .asmfunc
 230         MVK     8,B0            ; N, RILC
 231 ||      MVK     8,A0            ; M, outer loop counter
 232 ||      MV      ARG1,A5         ; copy ap
 233 ||      MV      ARG0,B4         ; copy rp
 234 ||      ZERO    B19             ; high part of accumulator
 235         MVC     B0,RILC
 236 ||      SUB     B0,2,B1         ; N-2, initial ILC
 237 ||      SUB     B0,1,B2         ; const B2=N-1
 238 ||      LDW     *A5++,B6        ; ap[0]
 239 ||      MV      A0,A3           ; const A3=M
 240 sploopNxM?:                     ; for best performance arrange M<=N
 241    [A0] SPLOOPD 2               ; 2*n+10
 242 ||      MVC     B1,ILC
 243 ||      ADDAW   B4,B0,B5        ; B5 = B4 + N words
 244 ||      ZERO    B7              ; rp[i] stand-in for the first pass
 245 ||      LDW     *A5++,A9        ; pre-fetch ap[1]
 246 ||      ZERO    A1              ; A1 = 0: skip the rp[i] loads on the first pass
 247 ||      SUB     A0,1,A0
 248 ;;====================================================================
 249 ;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
 250 ;; This is because of Advisory 15 from TI publication SPRZ247I.
 251         LDW     *ARG2++,A7      ; bp[i]
 252         NOP     3
 253    [A1] LDW     *B5++,B7        ; rp[i]
 254         MPY32U  A7,B6,B17:B16   ; B17:B16 = bp[i] * current ap word
 255         NOP     3
 256         ADDU    B16,B7,B21:B20  ; product low word + rp[i]
 257         ADDU    B19,B21:B20,B19:B18 ; + carry-in; B18 = result word
 258 ||      MV.S    B17,B23         ; keep product high word
 259         SPKERNEL
 260 ||      STW     B18,*B4++       ; rp[i]
 261 ||      ADD.S   B19,B23,B19     ; carry-out = carries + product high word
 262 ;;====================================================================
 263 outer?:                         ; m*2*(n+1)+10
 264         SUBAW   ARG2,A3,ARG2    ; rewind bp to bp[0]
 265         SPMASKR
 266 ||      CMPGT   A0,1,A2         ; done pre-fetching ap[i+1]?
 267         MVD     A9,B6           ; move through .M unit(*)
 268    [A2] LDW     *A5++,A9        ; pre-fetch ap[i+1]
 269         SUBAW   B5,B2,B5        ; rewind rp to rp[1]
 270         MVK     1,A1            ; A1 = 1: later passes load rp[i]
 271    [A0] BNOP.S1 outer?,4
 272 || [A0] SUB.L   A0,1,A0
 273         STW     B19,*B4--[B2]   ; store carry word, rewind rp to rp[1]
 274 ||      ZERO.S  B19             ; high part of accumulator
 275 ;; end of outer?
 276         BNOP    RA,5            ; return
 277         .endasmfunc
 278 ;; (*)  It should be noted that B6 is used as input to MPY32U in
 279 ;;      chronologically next cycle in *preceding* SPLOOP iteration.
 280 ;;      Normally such arrangement would require DINT, but at this
 281 ;;      point SPLOOP is draining and interrupts are disabled
 282 ;;      implicitly.
 283
 284         .global _bn_sqr_comba4
 285         .global _bn_mul_comba4
     ;;--------------------------------------------------------------------
     ;; _bn_mul_comba4: r[0..7] = a[0..3] * b[0..3], fully unrolled.
     ;; _bn_sqr_comba4: copies ARG1 into ARG2 and falls through, so the
     ;; same code multiplies a by itself.
     ;; In:   ARG0 = r, ARG1 = a (loaded into B16-B19), ARG2 = b (loaded
     ;;       into A16-A19)
     ;; Uses: A0, A1, A8, A9, A16-A31, B0, B1, B8, B9, B16-B19
     ;; Each result column is summed on two sides: the A side adds the low
     ;; product words plus what the previous column carried over and
     ;; stores its low word as r[k]; the B side adds the high product
     ;; words, and that sum is folded into the A side of the next column.
     ;;--------------------------------------------------------------------
 286 _bn_sqr_comba4:
 287         MV      ARG1,ARG2       ; b = a, then fall through
 288 _bn_mul_comba4:
 289         .asmfunc
     ;; Disabled variant: reuse the NxM loop of bn_mul_comba8 with N = M = 4.
 290         .if     0
 291         BNOP    sploopNxM?,3
 292         ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
 293         ;; because of low-counter effect, when prologue phase finishes
 294         ;; before SPKERNEL instruction is reached. As result it's 25%
 295         ;; slower than expected...
 296         MVK     4,B0            ; N, RILC
 297 ||      MVK     4,A0            ; M, outer loop counter
 298 ||      MV      ARG1,A5         ; copy ap
 299 ||      MV      ARG0,B4         ; copy rp
 300 ||      ZERO    B19             ; high part of accumulator
 301         MVC     B0,RILC
 302 ||      SUB     B0,2,B1         ; first ILC
 303 ||      SUB     B0,1,B2         ; const B2=N-1
 304 ||      LDW     *A5++,B6        ; ap[0]
 305 ||      MV      A0,A3           ; const A3=M
 306         .else
 307         ;; This alternative is an exercise in fully unrolled Comba
 308         ;; algorithm implementation that operates at n*(n+1)+12, or
 309         ;; as little as 32 cycles...
 310         LDW     *ARG1[0],B16    ; a[0]
 311 ||      LDW     *ARG2[0],A16    ; b[0]
 312         LDW     *ARG1[1],B17    ; a[1]
 313 ||      LDW     *ARG2[1],A17    ; b[1]
 314         LDW     *ARG1[2],B18    ; a[2]
 315 ||      LDW     *ARG2[2],A18    ; b[2]
 316         LDW     *ARG1[3],B19    ; a[3]
 317 ||      LDW     *ARG2[3],A19    ; b[3]
 318         NOP
 319         MPY32U  A16,B16,A1:A0   ; a[0]*b[0]
 320         MPY32U  A17,B16,A23:A22 ; a[0]*b[1]
 321         MPY32U  A16,B17,A25:A24 ; a[1]*b[0]
 322         MPY32U  A16,B18,A27:A26 ; a[2]*b[0]
 323         STW     A0,*ARG0[0]     ; r[0]
 324 ||      MPY32U  A17,B17,A29:A28 ; a[1]*b[1]
 325         MPY32U  A18,B16,A31:A30 ; a[0]*b[2]
 326 ||      ADDU    A22,A1,A1:A0
 327         MV      A23,B0
 328 ||      MPY32U  A19,B16,A21:A20 ; a[0]*b[3]
 329 ||      ADDU    A24,A1:A0,A1:A0
 330         ADDU    A25,B0,B1:B0
 331 ||      STW     A0,*ARG0[1]     ; r[1]
 332 ||      MPY32U  A18,B17,A23:A22 ; a[1]*b[2]
 333 ||      ADDU    A26,A1,A9:A8
 334         ADDU    A27,B1,B9:B8
 335 ||      MPY32U  A17,B18,A25:A24 ; a[2]*b[1]
 336 ||      ADDU    A28,A9:A8,A9:A8
 337         ADDU    A29,B9:B8,B9:B8
 338 ||      MPY32U  A16,B19,A27:A26 ; a[3]*b[0]
 339 ||      ADDU    A30,A9:A8,A9:A8
 340         ADDU    A31,B9:B8,B9:B8
 341 ||      ADDU    B0,A9:A8,A9:A8
 342         STW     A8,*ARG0[2]     ; r[2]
 343 ||      ADDU    A20,A9,A1:A0
 344         ADDU    A21,B9,B1:B0
 345 ||      MPY32U  A19,B17,A21:A20 ; a[1]*b[3]
 346 ||      ADDU    A22,A1:A0,A1:A0
 347         ADDU    A23,B1:B0,B1:B0
 348 ||      MPY32U  A18,B18,A23:A22 ; a[2]*b[2]
 349 ||      ADDU    A24,A1:A0,A1:A0
 350         ADDU    A25,B1:B0,B1:B0
 351 ||      MPY32U  A17,B19,A25:A24 ; a[3]*b[1]
 352 ||      ADDU    A26,A1:A0,A1:A0
 353         ADDU    A27,B1:B0,B1:B0
 354 ||      ADDU    B8,A1:A0,A1:A0
 355         STW     A0,*ARG0[3]     ; r[3]
 356 ||      MPY32U  A19,B18,A27:A26 ; a[2]*b[3]
 357 ||      ADDU    A20,A1,A9:A8
 358         ADDU    A21,B1,B9:B8
 359 ||      MPY32U  A18,B19,A29:A28 ; a[3]*b[2]
 360 ||      ADDU    A22,A9:A8,A9:A8
 361         ADDU    A23,B9:B8,B9:B8
 362 ||      MPY32U  A19,B19,A31:A30 ; a[3]*b[3]
 363 ||      ADDU    A24,A9:A8,A9:A8
 364         ADDU    A25,B9:B8,B9:B8
 365 ||      ADDU    B0,A9:A8,A9:A8
 366         STW     A8,*ARG0[4]     ; r[4]
 367 ||      ADDU    A26,A9,A1:A0
 368         ADDU    A27,B9,B1:B0
 369 ||      ADDU    A28,A1:A0,A1:A0
 370         ADDU    A29,B1:B0,B1:B0
 371 ||      BNOP    RA              ; return; the five packets below follow the branch
 372 ||      ADDU    B8,A1:A0,A1:A0
 373         STW     A0,*ARG0[5]     ; r[5]
 374 ||      ADDU    A30,A1,A9:A8
 375         ADD     A31,B1,B8
 376         ADDU    B0,A9:A8,A9:A8  ; removed || to avoid cross-path stall below
 377         ADD     B8,A9,A9
 378 ||      STW     A8,*ARG0[6]     ; r[6]
 379         STW     A9,*ARG0[7]     ; r[7]
 380         .endif
 381         .endasmfunc