Fix the error handling in ERR_get_state:

[oweals/openssl.git] / crypto / bn / asm / bn-c64xplus.asm
diff --git a/crypto/bn/asm/bn-c64xplus.asm b/crypto/bn/asm/bn-c64xplus.asm

index 161547c3b0a486bba45632bf0272f9ad691b3ee7..de6d37728fbac7ea0c6aa3301a2aa6c33ec5b15f 100644 (file)
--- a/crypto/bn/asm/bn-c64xplus.asm
+++ b/crypto/bn/asm/bn-c64xplus.asm
@@ -1,3 +1,10 @@
+;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+;;
+;; Licensed under the OpenSSL license (the "License").  You may not use
+;; this file except in compliance with the License.  You can obtain a copy
+;; in the file LICENSE in the source distribution or at
+;; https://www.openssl.org/source/license.html
+;;
  ;;====================================================================
  ;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  ;; project.
@@ -13,6 +20,22 @@
  ;;====================================================================
         .text
  
+       .if     .ASSEMBLER_VERSION<7000000
+       .asg    0,__TI_EABI__
+       .endif
+       .if     __TI_EABI__
+       .asg    bn_mul_add_words,_bn_mul_add_words
+       .asg    bn_mul_words,_bn_mul_words
+       .asg    bn_sqr_words,_bn_sqr_words
+       .asg    bn_add_words,_bn_add_words
+       .asg    bn_sub_words,_bn_sub_words
+       .asg    bn_div_words,_bn_div_words
+       .asg    bn_sqr_comba8,_bn_sqr_comba8
+       .asg    bn_mul_comba8,_bn_mul_comba8
+       .asg    bn_sqr_comba4,_bn_sqr_comba4
+       .asg    bn_mul_comba4,_bn_mul_comba4
+       .endif
+
         .asg    B3,RA
         .asg    A4,ARG0
         .asg    B4,ARG1
@@ -158,14 +181,39 @@ _bn_sub_words:
         .endasmfunc
  
         .global _bn_div_words
-       .global __divull
  _bn_div_words:
         .asmfunc
-       CALLP   __divull,A3     ; jump to rts64plus.lib
-||     MV      ARG0,A5
-||     MV      ARG1,ARG0
-||     MV      ARG2,ARG1
-||     ZERO    B5
+       LMBD    1,A6,A0         ; leading zero bits in dv
+       LMBD    1,A4,A1         ; leading zero bits in hi
+||     MVK     32,B0
+       CMPLTU  A1,A0,A2
+||     ADD     A0,B0,B0
+  [ A2]        BNOP    RA
+||[ A2]        MVK     -1,A4           ; return overflow
+||[!A2]        MV      A4,A3           ; reassign hi
+  [!A2]        MV      B4,A4           ; reassign lo, will be quotient
+||[!A2]        MVC     B0,ILC
+  [!A2]        SHL     A6,A0,A6        ; normalize dv
+||     MVK     1,A1
+
+  [!A2]        CMPLTU  A3,A6,A1        ; hi<dv?
+||[!A2]        SHL     A4,1,A5:A4      ; lo<<1
+  [!A1]        SUB     A3,A6,A3        ; hi-=dv
+||[!A1]        OR      1,A4,A4
+  [!A2]        SHRU    A3,31,A1        ; upper bit
+||[!A2]        ADDAH   A5,A3,A3        ; hi<<1|lo>>31
+
+       SPLOOP  3
+  [!A1]        CMPLTU  A3,A6,A1        ; hi<dv?
+||[ A1]        ZERO    A1
+||     SHL     A4,1,A5:A4      ; lo<<1
+  [!A1]        SUB     A3,A6,A3        ; hi-=dv
+||[!A1]        OR      1,A4,A4         ; quotient
+       SHRU    A3,31,A1        ; upper bit
+||     ADDAH   A5,A3,A3        ; hi<<1|lo>>31
+       SPKERNEL
+
+       BNOP    RA,5
         .endasmfunc
  
  ;;====================================================================
@@ -243,8 +291,9 @@ _bn_mul_comba4:
         .if     0
         BNOP    sploopNxM?,3
         ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
-       ;; because of read-after-write penalties, it's rather
-       ;; n*2*(n+3)+10, or 66 cycles [plus various overheads]...
+       ;; because of low-counter effect, when prologue phase finishes
+       ;; before SPKERNEL instruction is reached. As result it's 25%
+       ;; slower than expected...
         MVK     4,B0            ; N, RILC
  ||     MVK     4,A0            ; M, outer loop counter
  ||     MV      ARG1,A5         ; copy ap
@@ -256,7 +305,7 @@ _bn_mul_comba4:
  ||     LDW     *A5++,B6        ; ap[0]
  ||     MV      A0,A3           ; const A3=M
         .else
-       ;; This alternative is exercise in fully unrolled Comba
+       ;; This alternative is an exercise in fully unrolled Comba
         ;; algorithm implementation that operates at n*(n+1)+12, or
         ;; as little as 32 cycles...
         LDW     *ARG1[0],B16    ; a[0]