crypto/bn/asm/bn-c64xplus.asm

   1 ;;====================================================================
   2 ;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   3 ;; project.
   4 ;;
   5 ;; Rights for redistribution and usage in source and binary forms are
   6 ;; granted according to the OpenSSL license. Warranty of any kind is
   7 ;; disclaimed.
   8 ;;====================================================================
   9 ;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
  10 ;; being the number of 32-bit words, and addition at 8*n. The
  11 ;; corresponding 4x unrolled SPLOOP-free loops run at ~8*n and ~5*n.
  12 ;; The assembler SPLOOPs below spin at ... 2*n cycles [plus epilogue].
  13 ;;====================================================================
  14         .text
     ;; Symbol-name mapping: when __TI_EABI__ is non-zero, each .asg below
     ;; makes the assembler substitute the underscore-free spelling for the
     ;; underscore-prefixed routine name, so the .global/label lines that
     ;; follow then export plain names.
     ;; NOTE(review): presumably needed because the EABI does not prefix C
     ;; symbols with '_' - confirm against the TI toolchain documentation.
  15         .if     __TI_EABI__
  16         .asg    bn_mul_add_words,_bn_mul_add_words
  17         .asg    bn_mul_words,_bn_mul_words
  18         .asg    bn_sqr_words,_bn_sqr_words
  19         .asg    bn_add_words,_bn_add_words
  20         .asg    bn_sub_words,_bn_sub_words
  21         .asg    bn_div_words,_bn_div_words
  22         .asg    bn_sqr_comba8,_bn_sqr_comba8
  23         .asg    bn_mul_comba8,_bn_mul_comba8
  24         .asg    bn_sqr_comba4,_bn_sqr_comba4
  25         .asg    bn_mul_comba4,_bn_mul_comba4
  26         .endif
  27
  28         .asg    B3,RA
     ;; Symbolic register names used by the routines in this file:
     ;;   RA          B3, target of every "BNOP RA" return (defined above)
     ;;   ARG0..ARG5  incoming arguments, alternating between the A and B
     ;;               register files (A4,B4,A6,B6,A8,B8); ARG4 and ARG5
     ;;               are not referenced by the routines visible here
     ;;   RET         return value; shares A4 with ARG0
     ;;   FP,DP,SP    A15,B14,B15; not referenced by the routines visible
     ;;               here (presumably frame, data-page and stack pointer
     ;;               - confirm against the TI calling convention)
  29         .asg    A4,ARG0
  30         .asg    B4,ARG1
  31         .asg    A6,ARG2
  32         .asg    B6,ARG3
  33         .asg    A8,ARG4
  34         .asg    B8,ARG5
  35         .asg    A4,RET
  36         .asg    A15,FP
  37         .asg    B14,DP
  38         .asg    B15,SP
  39
  40         .global _bn_mul_add_words
     ;;--------------------------------------------------------------------
     ;; bn_mul_add_words: rp[i] += ap[i]*w for i=0..n-1, propagating the
     ;; carry from word to word, and return the final carry word.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = n (word count, may be 0),
     ;;       ARG3 = w (multiplier word)
     ;; Out:  RET = carry out of the last word, or 0 when n is 0
     ;; Uses: A2,A3,A7,A16-A21,A23,B0,B7,ILC; ARG0 and ARG1 are advanced
     ;;--------------------------------------------------------------------
  41 _bn_mul_add_words:
  42         .asmfunc
  43         MV      ARG2,B0
     ;; n == 0: return 0 straight away. The [B0]-guarded set-up below is
     ;; then skipped; those packets only occupy the branch delay slots.
  44   [!B0] BNOP    RA
  45 ||[!B0] MVK     0,RET
  46    [B0] MVC     B0,ILC          ; loop count = n
  47    [B0] ZERO    A19             ; high part of accumulator
  48 || [B0] MV      ARG0,A2         ; A2 = separate store pointer into rp
  49 || [B0] MV      ARG3,A3         ; A3 = w, copied to the A side
  50         NOP     3
  51
  52         SPLOOP  2               ; 2*n+10
  53 ;;====================================================================
  54         LDW     *ARG1++,B7      ; ap[i]
  55         NOP     3
  56         LDW     *ARG0++,A7      ; rp[i]
  57         MPY32U  B7,A3,A17:A16   ; A17:A16 = ap[i]*w
  58         NOP     3               ; [2,0] in epilogue
  59         ADDU    A16,A7,A21:A20  ; lo(product) + rp[i]
  60         ADDU    A19,A21:A20,A19:A18 ; + carry-in, A18 = new rp[i]
  61 ||      MV.S    A17,A23         ; hi(product)
  62         SPKERNEL 2,1            ; leave slot for "return value"
  63 ||      STW     A18,*A2++       ; rp[i]
  64 ||      ADD     A19,A23,A19     ; next carry = carries + hi(product)
  65 ;;====================================================================
  66         BNOP    RA,4
  67         MV      A19,RET         ; return value
  68         .endasmfunc
  69
  70         .global _bn_mul_words
     ;;--------------------------------------------------------------------
     ;; bn_mul_words: rp[i] = low word of (ap[i]*w + carry) for i=0..n-1;
     ;; the high word becomes the carry into the next word.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = n (word count, may be 0),
     ;;       ARG3 = w (multiplier word)
     ;; Out:  RET = carry out of the last word, or 0 when n is 0
     ;; Uses: A7,A16-A19,A21,B0,ILC; ARG0 and ARG1 are advanced
     ;;--------------------------------------------------------------------
  71 _bn_mul_words:
  72         .asmfunc
  73         MV      ARG2,B0
     ;; n == 0: return 0 straight away; the [B0]-guarded set-up is skipped.
  74   [!B0] BNOP    RA
  75 ||[!B0] MVK     0,RET
  76    [B0] MVC     B0,ILC          ; loop count = n
  77    [B0] ZERO    A19             ; high part of accumulator
  78         NOP     3
  79
  80         SPLOOP  2               ; 2*n+10
  81 ;;====================================================================
  82         LDW     *ARG1++,A7      ; ap[i]
  83         NOP     4
  84         MPY32U  A7,ARG3,A17:A16 ; A17:A16 = ap[i]*w
  85         NOP     4               ; [2,0] in epilogue
  86         ADDU    A19,A16,A19:A18 ; carry-in + lo(product), A18 = rp[i]
  87 ||      MV.S    A17,A21         ; hi(product)
  88         SPKERNEL 2,1            ; leave slot for "return value"
  89 ||      STW     A18,*ARG0++     ; rp[i]
  90 ||      ADD.L   A19,A21,A19     ; next carry = carry bit + hi(product)
  91 ;;====================================================================
  92         BNOP    RA,4
  93         MV      A19,RET         ; return value
  94         .endasmfunc
  95
  96         .global _bn_sqr_words
     ;;--------------------------------------------------------------------
     ;; bn_sqr_words: for i=0..n-1 store the 64-bit square of ap[i] as the
     ;; word pair rp[2*i] (low half) and rp[2*i+1] (high half).
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = n (word count, may be 0)
     ;; Out:  RET is set to 0 only on the n == 0 path; otherwise A4 is
     ;;       consumed as the odd-word store pointer and no result is
     ;;       written to it.
     ;; Uses: A1,B0,B1,B2,B7,ILC; ARG0 and ARG1 are advanced
     ;;--------------------------------------------------------------------
  97 _bn_sqr_words:
  98         .asmfunc
  99         MV      ARG2,B0
     ;; n == 0: return at once; the [B0]-guarded set-up below is skipped.
 100   [!B0] BNOP    RA
 101 ||[!B0] MVK     0,RET
 102    [B0] MVC     B0,ILC
 103    [B0] MV      ARG0,B2         ; B2 -> rp[0], walks the even words
 104 || [B0] ADD     4,ARG0,ARG0     ; ARG0 -> rp[1], walks the odd words
 105         NOP     3
 106
 107         SPLOOP  2               ; 2*n+10
 108 ;;====================================================================
 109         LDW     *ARG1++,B7      ; ap[i]
 110         NOP     4
 111         MPY32U  B7,B7,B1:B0     ; B1:B0 = ap[i]*ap[i]
 112         NOP     3               ; [2,0] in epilogue
 113         STW     B0,*B2++(8)     ; rp[2*i]
 114         MV      B1,A1           ; high half moved to the A side
 115         SPKERNEL 2,0            ; fully overlap BNOP RA,5
 116 ||      STW     A1,*ARG0++(8)   ; rp[2*i+1]
 117 ;;====================================================================
 118         BNOP    RA,5
 119         .endasmfunc
 120
 121         .global _bn_add_words
     ;;--------------------------------------------------------------------
     ;; bn_add_words: rp[i] = ap[i] + bp[i] + carry for i=0..n-1.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = n (word count,
     ;;       may be 0)
     ;; Out:  RET = carry out of the last word, or 0 when n is 0
     ;; Uses: A0,A1,A3,A7,A8,A9,B0,B7,ILC; ARG1 and ARG2 are advanced
     ;;--------------------------------------------------------------------
 122 _bn_add_words:
 123         .asmfunc
 124         MV      ARG3,B0
     ;; n == 0: return 0 at once; the [B0]-guarded set-up below is skipped.
 125   [!B0] BNOP    RA
 126 ||[!B0] MVK     0,RET
 127    [B0] MVC     B0,ILC
 128    [B0] ZERO    A1              ; carry flag
 129 || [B0] MV      ARG0,A3         ; A3 = store pointer (RET reuses A4)
 130         NOP     3
 131
 132         SPLOOP  2               ; 2*n+6
 133 ;;====================================================================
 134         LDW     *ARG2++,A7      ; bp[i]
 135 ||      LDW     *ARG1++,B7      ; ap[i]
 136         NOP     4
 137         ADDU    A7,B7,A9:A8     ; A9:A8 = ap[i] + bp[i]
 138         ADDU    A1,A9:A8,A1:A0  ; + carry-in, A1 = carry-out
 139         SPKERNEL 0,0            ; fully overlap BNOP RA,5
 140 ||      STW     A0,*A3++        ; write result
 141 ||      MV      A1,RET          ; keep carry flag in RET
 142 ;;====================================================================
 143         BNOP    RA,5
 144         .endasmfunc
 145
 146         .global _bn_sub_words
     ;;--------------------------------------------------------------------
     ;; bn_sub_words: rp[i] = ap[i] - bp[i] - borrow for i=0..n-1.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = n (word count,
     ;;       may be 0)
     ;; Out:  RET = borrow out of the last word (0 or 1), 0 when n is 0
     ;; Uses: A0,A1,A2,A3,A7,B0,B7,ILC; ARG1 and ARG2 are advanced
     ;;--------------------------------------------------------------------
 147 _bn_sub_words:
 148         .asmfunc
 149         MV      ARG3,B0
     ;; n == 0: return 0 at once; the [B0]-guarded set-up below is skipped.
 150   [!B0] BNOP    RA
 151 ||[!B0] MVK     0,RET
 152    [B0] MVC     B0,ILC
 153    [B0] ZERO    A2              ; borrow flag
 154 || [B0] MV      ARG0,A3         ; A3 = store pointer (RET reuses A4)
 155         NOP     3
 156
 157         SPLOOP  2               ; 2*n+6
 158 ;;====================================================================
 159         LDW     *ARG2++,A7      ; bp[i]
 160 ||      LDW     *ARG1++,B7      ; ap[i]
 161         NOP     4
 162         SUBU    B7,A7,A1:A0     ; A1:A0 = ap[i] - bp[i]
 163   [A2]  SUB     A1:A0,1,A1:A0   ; subtract incoming borrow, if any
 164         SPKERNEL 0,1            ; leave slot for "return borrow flag"
 165 ||      STW     A0,*A3++        ; write result
 166 ||      AND     1,A1,A2         ; pass on borrow flag
 167 ;;====================================================================
 168         BNOP    RA,4
 169         AND     1,A1,RET        ; return borrow flag
 170         .endasmfunc
 171
 172         .global _bn_div_words
     ;;--------------------------------------------------------------------
     ;; bn_div_words: quotient of the two-word value hi:lo divided by dv,
     ;; produced one bit per step by compare / subtract / shift.
     ;; In:   ARG0 (A4) = hi, ARG1 (B4) = lo, ARG2 (A6) = dv
     ;; Out:  RET (A4) = quotient, or -1 when hi has fewer leading zero
     ;;       bits than dv (treated as overflow)
     ;; Uses: A0-A6,B0,ILC. Only the quotient is returned; A3 (hi) is
     ;;       working state.
     ;; NOTE(review): dv == 0 is not tested for explicitly - confirm what
     ;; callers guarantee.
     ;;--------------------------------------------------------------------
 173 _bn_div_words:
 174         .asmfunc
 175         LMBD    1,A6,A0         ; leading zero bits in dv
 176         LMBD    1,A4,A1         ; leading zero bits in hi
 177 ||      MVK     32,B0
 178         CMPLTU  A1,A0,A2        ; A2 = overflow flag
 179 ||      ADD     A0,B0,B0        ; loop count = 32 + leading zeros of dv
 180   [ A2] BNOP    RA
 181 ||[ A2] MVK     -1,A4           ; return overflow
 182 ||[!A2] MV      A4,A3           ; reassign hi
 183   [!A2] MV      B4,A4           ; reassign lo, will be quotient
 184 ||[!A2] MVC     B0,ILC
 185   [!A2] SHL     A6,A0,A6        ; normalize dv
 186 ||      MVK     1,A1            ; A1=1 keeps the [!A1] packet below
     ;;                                idle on the overflow path, so A4
     ;;                                stays -1
 187
     ;; First division step, written out ahead of the loop.
 188   [!A2] CMPLTU  A3,A6,A1        ; hi<dv?
 189 ||[!A2] SHL     A4,1,A5:A4      ; lo<<1
 190   [!A1] SUB     A3,A6,A3        ; hi-=dv
 191 ||[!A1] OR      1,A4,A4
 192   [!A2] SHRU    A3,31,A1        ; upper bit
 193 ||[!A2] ADDAH   A5,A3,A3        ; hi<<1|lo>>31
 194
     ;; Remaining steps. On entry to each step A1 holds the bit shifted
     ;; out of hi: if it is set the subtraction is forced (A1 is cleared
     ;; instead of being recomputed by the compare).
 195         SPLOOP  3
 196   [!A1] CMPLTU  A3,A6,A1        ; hi<dv?
 197 ||[ A1] ZERO    A1
 198 ||      SHL     A4,1,A5:A4      ; lo<<1
 199   [!A1] SUB     A3,A6,A3        ; hi-=dv
 200 ||[!A1] OR      1,A4,A4         ; quotient
 201         SHRU    A3,31,A1        ; upper bit
 202 ||      ADDAH   A5,A3,A3        ; hi<<1|lo>>31
 203         SPKERNEL
 204
 205         BNOP    RA,5
 206         .endasmfunc
 207
 208 ;;====================================================================
 209 ;; Not really the Comba algorithm, just a straightforward NxM multiply.
 210 ;; Dedicated fully unrolled real Comba implementations are
 211 ;; asymptotically 2x faster, but naturally a larger undertaking. The
 212 ;; purpose of this exercise was rather to master nested SPLOOPs...
 213 ;;====================================================================
 214         .global _bn_sqr_comba8
     ;;--------------------------------------------------------------------
     ;; bn_sqr_comba8 / bn_mul_comba8: 8x8-word product built from an
     ;; inner SPLOOP nested in the outer? loop. bn_sqr_comba8 copies ARG1
     ;; into ARG2 and falls through, i.e. multiplies its input by itself.
     ;; In:   ARG0 = rp (result), ARG1 = ap, ARG2 = bp
     ;; Out:  result words are stored through B4; A4 (RET) is not written
     ;; Uses: A0-A3,A5,A7,A9,B0-B2,B4-B7,B16-B21,B23,ILC,RILC; ARG2 is
     ;;       advanced by the inner loop and rewound by the outer one
     ;;
     ;; The inner loop walks bp[], multiplying by the ap word held in B6
     ;; and adding the rp word loaded through B5 plus the running carry in
     ;; B19. On the first pass A1 is 0, so the rp load is predicated off
     ;; and B7 stays 0: that row is stored rather than accumulated.
     ;;--------------------------------------------------------------------
 215         .global _bn_mul_comba8
 216 _bn_sqr_comba8:
 217         MV      ARG1,ARG2
 218 _bn_mul_comba8:
 219         .asmfunc
 220         MVK     8,B0            ; N, RILC
 221 ||      MVK     8,A0            ; M, outer loop counter
 222 ||      MV      ARG1,A5         ; copy ap
 223 ||      MV      ARG0,B4         ; copy rp
 224 ||      ZERO    B19             ; high part of accumulator
 225         MVC     B0,RILC
 226 ||      SUB     B0,2,B1         ; N-2, initial ILC
 227 ||      SUB     B0,1,B2         ; const B2=N-1
 228 ||      LDW     *A5++,B6        ; ap[0]
 229 ||      MV      A0,A3           ; const A3=M
 230 sploopNxM?:                     ; for best performance arrange M<=N
 231    [A0] SPLOOPD 2               ; 2*n+10
 232 ||      MVC     B1,ILC
 233 ||      ADDAW   B4,B0,B5        ; B5 = rp + N words (rp load pointer)
 234 ||      ZERO    B7              ; first row adds 0 in place of rp[i]
 235 ||      LDW     *A5++,A9        ; pre-fetch ap[1]
 236 ||      ZERO    A1              ; A1=0: no rp loads on the first row
 237 ||      SUB     A0,1,A0
 238 ;;====================================================================
 239 ;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
 240 ;; This is because of Advisory 15 from TI publication SPRZ247I.
 241         LDW     *ARG2++,A7      ; bp[i]
 242         NOP     3
 243    [A1] LDW     *B5++,B7        ; rp[i]
 244         MPY32U  A7,B6,B17:B16   ; B17:B16 = bp[i] * current ap word
 245         NOP     3
 246         ADDU    B16,B7,B21:B20  ; lo(product) + rp[i]
 247         ADDU    B19,B21:B20,B19:B18 ; + carry-in, B18 = new rp[i]
 248 ||      MV.S    B17,B23         ; hi(product)
 249         SPKERNEL
 250 ||      STW     B18,*B4++       ; rp[i]
 251 ||      ADD.S   B19,B23,B19     ; next carry = carries + hi(product)
 252 ;;====================================================================
 253 outer?:                         ; m*2*(n+1)+10
 254         SUBAW   ARG2,A3,ARG2    ; rewind bp to bp[0]
 255         SPMASKR
 256 ||      CMPGT   A0,1,A2         ; done pre-fetching ap[i+1]?
 257         MVD     A9,B6           ; move through .M unit(*)
 258    [A2] LDW     *A5++,A9        ; pre-fetch ap[i+1]
 259         SUBAW   B5,B2,B5        ; rewind rp to rp[1]
 260         MVK     1,A1            ; enable rp loads from now on
 261    [A0] BNOP.S1 outer?,4
 262 || [A0] SUB.L   A0,1,A0
 263         STW     B19,*B4--[B2]   ; rewind rp to rp[1]
 264 ||      ZERO.S  B19             ; high part of accumulator
 265 ;; end of outer?
 266         BNOP    RA,5            ; return
 267         .endasmfunc
 268 ;; (*)  It should be noted that B6 is used as input to MPY32U in
 269 ;;      chronologically next cycle in *preceding* SPLOOP iteration.
 270 ;;      Normally such arrangement would require DINT, but at this
 271 ;;      point SPLOOP is draining and interrupts are disabled
 272 ;;      implicitly.
 273
 274         .global _bn_sqr_comba4
     ;;--------------------------------------------------------------------
     ;; bn_sqr_comba4 / bn_mul_comba4: 4x4-word product, eight result
     ;; words. bn_sqr_comba4 copies ARG1 into ARG2 and falls through, i.e.
     ;; multiplies its input by itself.
     ;; In:   ARG0 = r (result, words 0..7 are written), ARG1 = a,
     ;;       ARG2 = b
     ;; Out:  results are stored through ARG0; A4 (RET) is not written
     ;; Uses: A0,A1,A8,A9,A16-A31,B0,B1,B8,B9,B16-B19
     ;;
     ;; The ".if 0" branch is disabled; it would reuse the nested-SPLOOP
     ;; code of bn_mul_comba8 with N=M=4. The active ".else" branch is
     ;; fully unrolled and works column by column: A-side register pairs
     ;; sum the low product halves of a column (plus what was carried in),
     ;; B-side pairs sum the high halves, which are folded into the next
     ;; column.
     ;;--------------------------------------------------------------------
 275         .global _bn_mul_comba4
 276 _bn_sqr_comba4:
 277         MV      ARG1,ARG2
 278 _bn_mul_comba4:
 279         .asmfunc
 280         .if     0
 281         BNOP    sploopNxM?,3
 282         ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
 283         ;; because of read-after-write penalties, it's rather
 284         ;; n*2*(n+3)+10, or 66 cycles [plus various overheads]...
 285         MVK     4,B0            ; N, RILC
 286 ||      MVK     4,A0            ; M, outer loop counter
 287 ||      MV      ARG1,A5         ; copy ap
 288 ||      MV      ARG0,B4         ; copy rp
 289 ||      ZERO    B19             ; high part of accumulator
 290         MVC     B0,RILC
 291 ||      SUB     B0,2,B1         ; first ILC
 292 ||      SUB     B0,1,B2         ; const B2=N-1
 293 ||      LDW     *A5++,B6        ; ap[0]
 294 ||      MV      A0,A3           ; const A3=M
 295         .else
 296         ;; This alternative is an exercise in fully unrolled Comba
 297         ;; algorithm implementation that operates at n*(n+1)+12, or
 298         ;; as little as 32 cycles...
 299         LDW     *ARG1[0],B16    ; a[0]
 300 ||      LDW     *ARG2[0],A16    ; b[0]
 301         LDW     *ARG1[1],B17    ; a[1]
 302 ||      LDW     *ARG2[1],A17    ; b[1]
 303         LDW     *ARG1[2],B18    ; a[2]
 304 ||      LDW     *ARG2[2],A18    ; b[2]
 305         LDW     *ARG1[3],B19    ; a[3]
 306 ||      LDW     *ARG2[3],A19    ; b[3]
 307         NOP
 308         MPY32U  A16,B16,A1:A0   ; a[0]*b[0]
 309         MPY32U  A17,B16,A23:A22 ; a[0]*b[1]
 310         MPY32U  A16,B17,A25:A24 ; a[1]*b[0]
 311         MPY32U  A16,B18,A27:A26 ; a[2]*b[0]
 312         STW     A0,*ARG0[0]     ; r[0]
 313 ||      MPY32U  A17,B17,A29:A28 ; a[1]*b[1]
 314         MPY32U  A18,B16,A31:A30 ; a[0]*b[2]
 315 ||      ADDU    A22,A1,A1:A0
 316         MV      A23,B0
 317 ||      MPY32U  A19,B16,A21:A20 ; a[3]*b[0]
 318 ||      ADDU    A24,A1:A0,A1:A0
 319         ADDU    A25,B0,B1:B0
 320 ||      STW     A0,*ARG0[1]     ; r[1]
 321 ||      MPY32U  A18,B17,A23:A22 ; a[2]*b[1]
 322 ||      ADDU    A26,A1,A9:A8
 323         ADDU    A27,B1,B9:B8
 324 ||      MPY32U  A17,B18,A25:A24 ; a[1]*b[2]
 325 ||      ADDU    A28,A9:A8,A9:A8
 326         ADDU    A29,B9:B8,B9:B8
 327 ||      MPY32U  A16,B19,A27:A26 ; a[0]*b[3]
 328 ||      ADDU    A30,A9:A8,A9:A8
 329         ADDU    A31,B9:B8,B9:B8
 330 ||      ADDU    B0,A9:A8,A9:A8
 331         STW     A8,*ARG0[2]     ; r[2]
 332 ||      ADDU    A20,A9,A1:A0
 333         ADDU    A21,B9,B1:B0
 334 ||      MPY32U  A19,B17,A21:A20 ; a[3]*b[1]
 335 ||      ADDU    A22,A1:A0,A1:A0
 336         ADDU    A23,B1:B0,B1:B0
 337 ||      MPY32U  A18,B18,A23:A22 ; a[2]*b[2]
 338 ||      ADDU    A24,A1:A0,A1:A0
 339         ADDU    A25,B1:B0,B1:B0
 340 ||      MPY32U  A17,B19,A25:A24 ; a[1]*b[3]
 341 ||      ADDU    A26,A1:A0,A1:A0
 342         ADDU    A27,B1:B0,B1:B0
 343 ||      ADDU    B8,A1:A0,A1:A0
 344         STW     A0,*ARG0[3]     ; r[3]
 345 ||      MPY32U  A19,B18,A27:A26 ; a[3]*b[2]
 346 ||      ADDU    A20,A1,A9:A8
 347         ADDU    A21,B1,B9:B8
 348 ||      MPY32U  A18,B19,A29:A28 ; a[2]*b[3]
 349 ||      ADDU    A22,A9:A8,A9:A8
 350         ADDU    A23,B9:B8,B9:B8
 351 ||      MPY32U  A19,B19,A31:A30 ; a[3]*b[3]
 352 ||      ADDU    A24,A9:A8,A9:A8
 353         ADDU    A25,B9:B8,B9:B8
 354 ||      ADDU    B0,A9:A8,A9:A8
 355         STW     A8,*ARG0[4]     ; r[4]
 356 ||      ADDU    A26,A9,A1:A0
 357         ADDU    A27,B9,B1:B0
 358 ||      ADDU    A28,A1:A0,A1:A0
     ;; The return branch is issued here; the five packets that follow it
     ;; complete and store r[5]..r[7] in its delay slots.
 359         ADDU    A29,B1:B0,B1:B0
 360 ||      BNOP    RA
 361 ||      ADDU    B8,A1:A0,A1:A0
 362         STW     A0,*ARG0[5]     ; r[5]
 363 ||      ADDU    A30,A1,A9:A8
 364         ADD     A31,B1,B8       ; top word: plain ADD, no carry kept
 365         ADDU    B0,A9:A8,A9:A8  ; removed || to avoid cross-path stall below
 366         ADD     B8,A9,A9
 367 ||      STW     A8,*ARG0[6]     ; r[6]
 368         STW     A9,*ARG0[7]     ; r[7]
 369         .endif
 370         .endasmfunc