2 # Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # The module implements bn_GF2m_mul_2x2 polynomial multiplication
20 # used in bn_gf2m.c. It's kind of a low-hanging mechanical port from
21 # C for the time being... The subroutine runs in 37 cycles, which is
22 # 4.5x faster than compiler-generated code. Though the comparison is
23 # totally unfair, because this module utilizes Galois Field Multiply
# Pick the output file name off the command line: discard leading
# arguments until one looks like a file name with an extension
# (word characters, a dot, then more word characters). The loop also
# stops as soon as the argument list is exhausted or a false value
# is shifted off.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to that file. The three-argument form keeps the file
# name from ever being parsed as part of the open mode, and a failure
# is now fatal instead of silently leaving the generated code on the
# original STDOUT. When no name was found STDOUT is left untouched,
# which is what the unchecked two-argument open effectively did.
if (defined($output) && length($output)) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}
# Register names used by the code templates below. They are assigned
# as plain (non-lexical) variables, exactly as before.

# argument vector
($rp,$a1,$a0,$b1,$b0) = qw(A4 B4 A6 B6 A8);

# A16..A20 / B16..B20: $Alo and $Ahi are the multiplicand inputs of the
# XORMPY instructions, $Alox0..3 and $Ahix0..3 receive their products.
($Alo,$Alox0,$Alox1,$Alox2,$Alox3) = map { "A$_" } 16..20;
($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3) = map { "B$_" } 16..20;

# Registers for the pieces of the second operand (the templates
# describe this as "smash $B to 4 bytes").
($B_0,$B_1,$B_2,$B_3) = qw(B5 A5 A7 B7);
40 EXTU $B,8,24,$B_2 ; smash $B to 4 bytes
43 SHRU $A,16, $Ahi ; smash $A to two halfwords
46 XORMPY $Alo,$B_2,$Alox2 ; 16x8 bits multiplication
47 || XORMPY $Ahi,$B_2,$Ahix2
49 XORMPY $Alo,$B_0,$Alox0
50 || XORMPY $Ahi,$B_0,$Ahix0
51 XORMPY $Alo,$B_3,$Alox3
52 || XORMPY $Ahi,$B_3,$Ahix3
53 XORMPY $Alo,$B_1,$Alox1
54 || XORMPY $Ahi,$B_1,$Ahix1
58 my ($OUTlo,$OUThi,$A,$B)=@_; # two output registers (low/high) followed by the two operand registers
60 EXTU $B,8,24,$B_2 ; smash $B to 4 bytes
63 SHRU $A,16, $Ahi ; smash $A to two halfwords
66 XOR $Ahix0,$Alox2,$Ahix0
68 || XORMPY $Alo,$B_2,$Alox2
69 XORMPY $Ahi,$B_2,$Ahix2
71 || XORMPY $Alo,$B_0,A1 ; $Alox0
72 XOR $Ahix1,$Alox3,$Ahix1
73 || SHL $Ahix0,16,$OUTlo
74 || SHRU $Ahix0,16,$Ahix0
75 XOR $Alox0,$OUTlo,$OUTlo
76 || XOR $Ahix0,$OUThi,$OUThi
77 || XORMPY $Ahi,$B_0,$Ahix0
78 || XORMPY $Alo,$B_3,$Alox3
79 || SHL $Alox1,8,$Alox1
80 || SHL $Ahix3,8,$Ahix3
81 XOR $Alox1,$OUTlo,$OUTlo
82 || XOR $Ahix3,$OUThi,$OUThi
83 || XORMPY $Ahi,$B_3,$Ahix3
84 || SHL $Ahix1,24,$Alox1
85 || SHRU $Ahix1,8, $Ahix1
86 XOR $Alox1,$OUTlo,$OUTlo
87 || XOR $Ahix1,$OUThi,$OUThi
88 || XORMPY $Alo,$B_1,$Alox1
89 || XORMPY $Ahi,$B_1,$Ahix1
94 my ($OUTlo,$OUThi)=@_; # the two output registers (low/high); no operand arguments are taken
97 XOR $Ahix0,$Alox2,$Ahix0
100 XOR $Ahix1,$Alox3,$Ahix1
101 || SHL $Ahix0,16,$OUTlo
102 || SHRU $Ahix0,16,$Ahix0
103 XOR $Alox0,$OUTlo,$OUTlo
104 || XOR $Ahix0,$OUThi,$OUThi
105 || SHL $Alox1,8,$Alox1
106 || SHL $Ahix3,8,$Ahix3
107 XOR $Alox1,$OUTlo,$OUTlo
108 || XOR $Ahix3,$OUThi,$OUThi
109 || SHL $Ahix1,24,$Alox1
110 || SHRU $Ahix1,8, $Ahix1
111 XOR $Alox1,$OUTlo,$OUTlo
112 || XOR $Ahix1,$OUThi,$OUThi
118 .if .ASSEMBLER_VERSION<7000000
122 .asg bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2
125 .global _bn_GF2m_mul_2x2
130 &mul_1x1_upper($a0,$b0); # a0·b0
135 &mul_1x1_merged("A28","B28",$A,$B); # a0·b0/a1·b1
140 &mul_1x1_merged("A31","B31",$A,$B); # a1·b1/(a0+a1)·(b0+b1)
143 || XOR B28,B31,B29 ; a0·b0+a1·b1
145 &mul_1x1_lower("A30","B30"); # (a0+a1)·(b0+b1)
149 || XOR B29,B30,B30 ; (a0+a1)·(b0+b1)-a0·b0-a1·b1