2 # Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
21 # Performance is just below 10 cycles per processed byte, which is
22 # almost 40% faster than compiler-generated code. Unroll is unlikely
23 # to give more than ~8% improvement...
25 # !!! Note that this module uses AMR, which means that all interrupt
26 # service routines are expected to preserve it and, for their own
26 # well-being, zero it upon entry.
# NOTE(review): fragmentary view of a CRYPTOGAMS-style Perl generator that
# emits TI C64x+ assembly; intervening original lines are missing, so only
# comments are added here.
#
# Scan argv until an argument that looks like an output file name is found,
# then redirect STDOUT so all generated assembly goes to that file.
29 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
30 open STDOUT,">$output";
# Function arguments per the C6x calling convention:
#   A4 = ctx pointer, B4 = input pointer, A6 = number of 64-byte blocks.
32 ($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments
# Working-state register names. A-side list carries a/b/c/d (plus their ctx
# accumulators and scratch), B-side list carries e/f/g/h; the assignment to
# concrete registers happens on lines missing from this view.
35 ($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
37 ($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
40 ($Xia,$Xib)=("A5","B5"); # circular/ring buffer
# Message-schedule staging registers and the round-constant register.
43 ($Xn,$X0,$K)=("B7","B8","B9");
# $Maj aliases $T2; both Maj(a,b,c) and T2 never need the value at once.
44 ($Maj,$Ch)=($T2,"B6");
; NOTE(review): fragmentary excerpt of the generated TI C64x+ SHA-256 block
; routine; many original lines are missing from this view, so instruction
; packets below may be incomplete.
;
; void sha256_block_data_order(SHA256_CTX *ctx, const void *inp, size_t num)
;   $CTXA = ctx, $INP = input, $NUM = number of 64-byte blocks.
; Uses AMR circular addressing for the 16-word message ring buffer.
49 .if .ASSEMBLER_VERSION<7000000
54 .asg sha256_block_data_order,_sha256_block_data_order
66 .global _sha256_block_data_order
67 _sha256_block_data_order:
69 .asmfunc stack_usage(64)
70 MV $NUM,A0 ; reassign $NUM
; Early-out on zero blocks; otherwise save FP and carve a 64-byte frame.
72 [!A0] BNOP RA ; if ($NUM==0) return;
73 || [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64)
75 [A0] ADDKPC __sha256_block,B2
76 || [A0] AND B0,SP,SP ; align stack at 64 bytes
; PC-relative vs absolute addressing of the K256 table — which pair is
; emitted presumably depends on a build flag on lines missing here.
79 || [A0] MVKL \$PCR_OFFSET(K256,__sha256_block),$K256
81 || [A0] MVKH \$PCR_OFFSET(K256,__sha256_block),$K256
84 || [A0] MVKL (K256-__sha256_block),$K256
86 || [A0] MVKH (K256-__sha256_block),$K256
88 [A0] MVC B1,AMR ; setup circular addressing
91 || [A0] ADD B2,$K256,$K256
92 || [A0] MV $CTXA,$CTXB
93 || [A0] SUBAW SP,2,SP ; reserve two words above buffer
; Load the eight hash-state words a..h from ctx (interior loads missing
; from this view).
94 LDW *${CTXA}[0],$A ; load ctx
101 || LDW *${CTXB}[7],$H
103 LDNW *$INP++,$Xn ; pre-fetch input
104 LDW *$K256++,$K ; pre-fetch K256[0]
105 MVK 14,B0 ; loop counters
; 8-cycle software-pipelined loop: rounds 0..14 (no message expansion yet).
120 SPLOOPD 8 ; BODY_00_14
134 || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
136 || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
138 || ADD $K,$H,$T1 ; T1 = h + K256[i]
139 ADD $X0,$T1,$T1 ; T1 += X[i];
143 XOR $t1a,$S0,$S0 ; Sigma0(a)
144 || XOR $t1e,$S1,$S1 ; Sigma1(e)
145 || LDW *$K256++,$K ; pre-fetch K256[i+1]
146 || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
147 ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
148 || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
149 || ROTL $G,0,$H ; h = g
155 || ADD $D,$T1,$E ; e = d + T1
159 || ADD $T1,$T2,$A ; a = T1 + T2
; Round 15, interleaved with the start of the modulo-scheduled message
; expansion for rounds 16+.
162 ROTL $A,30,$S0 ; BODY_15
168 || LDW *${Xib}[1],$Xn ; modulo-scheduled
172 || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
173 || LDW *${Xib}[2],$X1 ; modulo-scheduled
175 || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
177 || ADD $K,$H,$T1 ; T1 = h + K256[i]
178 ADD $X0,$T1,$T1 ; T1 += X[i];
182 XOR $t1a,$S0,$S0 ; Sigma0(a)
183 || XOR $t1e,$S1,$S1 ; Sigma1(e)
184 || LDW *$K256++,$K ; pre-fetch K256[i+1]
185 || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
186 ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
187 || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
188 || ROTL $G,0,$H ; h = g
192 || ADD $D,$T1,$E ; e = d + T1
194 || MV $Xn,$X0 ; modulo-scheduled
195 || LDW *$Xia,$X9 ; modulo-scheduled
196 || ROTL $X1,25,$t0e ; modulo-scheduled
197 || ROTL $X14,15,$t0a ; modulo-scheduled
198 SHRU $X1,3,$s0 ; modulo-scheduled
199 || SHRU $X14,10,$s1 ; modulo-scheduled
200 || ROTL $B,0,$C ; c = b
202 || ADD $T1,$T2,$A ; a = T1 + T2
; 10-cycle software-pipelined loop: rounds 16..63 with full message
; schedule X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
; (indices below are relative to the ring-buffer read position).
204 SPLOOPD 10 ; BODY_16_63
206 || ROTL $X1,14,$t1e ; modulo-scheduled
207 || ROTL $X14,13,$t1a ; modulo-scheduled
213 XOR $t1e,$s0,$s0 ; sigma0(X[i+1])
214 || XOR $t1a,$s1,$s1 ; sigma1(X[i+14])
215 || LDW *${Xib}[2],$X1 ; modulo-scheduled
222 || ADD $X9,$X0,$X0 ; X[i] += X[i+9]
226 || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
227 || ADD $s0,$X0,$X0 ; X[i] += sigma0(X[i+1])
229 || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
231 || ADD $H,$K,$T1 ; T1 = h + K256[i]
232 || ADD $s1,$X0,$X0 ; X[i] += sigma1(X[i+14])
235 || ADD $X0,$T1,$T1 ; T1 += X[i]
237 XOR $t1a,$S0,$S0 ; Sigma0(a)
238 || XOR $t1e,$S1,$S1 ; Sigma1(e)
239 || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
241 || ROTL $G,0,$H ; h = g
242 || LDW *$K256++,$K ; pre-fetch K256[i+1]
243 ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
244 || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
246 || MV $Xn,$X0 ; modulo-scheduled
247 || LDW *++$Xia,$X9 ; modulo-scheduled
248 || ROTL $X1,25,$t0e ; modulo-scheduled
249 || ROTL $X14,15,$t0a ; modulo-scheduled
250 ROTL $X1,14,$t1e ; modulo-scheduled
251 || ROTL $X14,13,$t1a ; modulo-scheduled
253 || ADD $D,$T1,$E ; e = d + T1
257 || ADD $T1,$T2,$A ; a = T1 + T2
258 || SHRU $X1,3,$s0 ; modulo-scheduled
259 || SHRU $X14,10,$s1 ; modulo-scheduled
; Epilogue: accumulate working state into ctx; if more blocks remain (A0),
; restart with rewound K256 and freshly prefetched input, else ([!A0])
; store the state back, restore SP/FP, clear AMR and fall out to return.
263 || [A0] LDNW *$INP++,$Xn ; pre-fetch input
264 || [A0] ADDK -260,$K256 ; rewind K256
265 || ADD $Actx,$A,$A ; accumulate ctx
273 || [A0] LDW *$K256++,$K ; pre-fetch K256[0]
276 ||[!A0] MV $CTXA,$CTXB
277 [!A0] MV FP,SP ; restore stack pointer
278 ||[!A0] LDW *FP[0],FP ; restore frame pointer
279 [!A0] STW $A,*${CTXA}[0] ; save ctx
280 ||[!A0] STW $E,*${CTXB}[4]
282 [!A0] STW $B,*${CTXA}[1]
283 ||[!A0] STW $F,*${CTXB}[5]
284 ||[!A0] MVC B0,AMR ; clear AMR
286 || STW $G,*${CTXB}[6]
288 || STW $H,*${CTXB}[7]
292 .sect ".text:sha_asm.const"
294 .sect ".const:sha_asm"
; SHA-256 round-constant table (presumably labelled K256 on a line missing
; from this view): first 32 bits of the fractional parts of the cube roots
; of the first 64 primes, per FIPS 180-4. Read sequentially via *$K256++
; and rewound once per 64-round block.
298 .uword 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
299 .uword 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
300 .uword 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
301 .uword 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
302 .uword 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
303 .uword 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
304 .uword 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
305 .uword 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
306 .uword 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
307 .uword 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
308 .uword 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
309 .uword 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
310 .uword 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
311 .uword 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
312 .uword 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
313 .uword 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
; Identification string (kept byte-for-byte; it is emitted into the object).
314 .cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"