3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
14 # Performance is just below 10 cycles per processed byte, which is
15 # almost 40% faster than compiler-generated code. Unroll is unlikely
16 # to give more than ~8% improvement...
18 # !!! Note that this module uses AMR, which means that all interrupt
19 # service routines are expected to preserve it and for own well-being
# Consume command-line arguments until one looks like a file name
# (word characters, then a dot and an extension at the end of the string),
# then redirect STDOUT to it.  The result of open is not checked.
22 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
23 open STDOUT,">$output";
# Symbolic register names used by the assembly template that follows.
25 ($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments: ctx pointer, input pointer, count (zero means return at once)
# Two parallel name lists: hash state ($A..$D / $E..$H), their *ctx
# companions, and scratch/schedule registers.  The right-hand sides of
# these two list assignments are not visible in this excerpt.
# NOTE(review): the a/e suffixes suggest A-side vs B-side register files - confirm.
28 ($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
30 ($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
33 ($Xia,$Xib)=("A5","B5"); # pointers into the circular/ring buffer holding X[] (AMR is set up in the function prologue)
36 ($Xn,$X0,$K)=("B7","B8","B9"); # pre-fetched input word, current X[i], current K256 word
37 ($Maj,$Ch)=($T2,"B6"); # Maj shares its register with T2
43 .asg sha256_block_data_order,_sha256_block_data_order
; ---------------------------------------------------------------------
; _sha256_block_data_order
;   Arguments arrive in the registers the generator names CTXA, INP and
;   NUM (A4, B4, A6).  NUM is copied to A0, which then predicates the
;   rest of the code: the function returns at once when it is zero.
;   Stack: FP is saved, a 16-word (64-byte) area is allocated and SP is
;   aligned at 64 bytes; two more words are reserved above the buffer.
;   AMR is programmed for circular addressing on entry and cleared again
;   on the exit path, so interrupt handlers must preserve it.
; ---------------------------------------------------------------------
55 .global _sha256_block_data_order
56 _sha256_block_data_order:
57 .asmfunc stack_usage(64)
58 MV $NUM,A0 ; reassign $NUM
60 [!A0] BNOP RA ; if ($NUM==0) return;
61 || [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64)
63 [A0] ADDKPC _sha256_block_data_order,B2
64 || [A0] AND B0,SP,SP ; align stack at 64 bytes
67 || [A0] MVKL \$PCR_OFFSET(K256,_sha256_block_data_order),$K256
69 || [A0] MVKH \$PCR_OFFSET(K256,_sha256_block_data_order),$K256
72 || [A0] MVKL (K256-_sha256_block_data_order),$K256
74 || [A0] MVKH (K256-_sha256_block_data_order),$K256
76 [A0] MVC B1,AMR ; setup circular addressing
79 || [A0] ADD B2,$K256,$K256
80 || [A0] MV $CTXA,$CTXB
81 || [A0] SUBAW SP,2,SP ; reserve two words above buffer
82 LDW *${CTXA}[0],$A ; load ctx
91 LDNW *$INP++,$Xn ; pre-fetch input
92 LDW *$K256++,$K ; pre-fetch K256[0]
93 MVK 14,B0 ; loop counters
; Rounds 0..14: software-pipelined loop (SPLOOPD, iteration interval 8).
; Each round computes T1 = h + K256[i] + X[i] + Ch(e,f,g) + Sigma1(e) and
; T2 = Sigma0(a) + Maj(a,b,c), then rotates the working variables.
108 SPLOOPD 8 ; BODY_00_14
122 || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
124 || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
126 || ADD $K,$H,$T1 ; T1 = h + K256[i]
127 ADD $X0,$T1,$T1 ; T1 += X[i];
131 XOR $t1a,$S0,$S0 ; Sigma0(a)
132 || XOR $t1e,$S1,$S1 ; Sigma1(e)
133 || LDW *$K256++,$K ; pre-fetch K256[i+1]
134 || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
135 ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
136 || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
137 || ROTL $G,0,$H ; h = g
143 || ADD $D,$T1,$E ; e = d + T1
147 || ADD $T1,$T2,$A ; a = T1 + T2
; Round 15, straight-line.  Lines tagged "modulo-scheduled" already issue
; the loads and rotates/shifts that the first iteration of the following
; loop consumes.
150 ROTL $A,30,$S0 ; BODY_15
156 || LDW *${Xib}[1],$Xn ; modulo-scheduled
160 || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
161 || LDW *${Xib}[2],$X1 ; modulo-scheduled
163 || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
165 || ADD $K,$H,$T1 ; T1 = h + K256[i]
166 ADD $X0,$T1,$T1 ; T1 += X[i];
170 XOR $t1a,$S0,$S0 ; Sigma0(a)
171 || XOR $t1e,$S1,$S1 ; Sigma1(e)
172 || LDW *$K256++,$K ; pre-fetch K256[i+1]
173 || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
174 ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
175 || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
176 || ROTL $G,0,$H ; h = g
180 || ADD $D,$T1,$E ; e = d + T1
182 || MV $Xn,$X0 ; modulo-scheduled
183 || LDW *$Xia,$X9 ; modulo-scheduled
184 || ROTL $X1,25,$t0e ; modulo-scheduled
185 || ROTL $X14,15,$t0a ; modulo-scheduled
186 SHRU $X1,3,$s0 ; modulo-scheduled
187 || SHRU $X14,10,$s1 ; modulo-scheduled
188 || ROTL $B,0,$C ; c = b
190 || ADD $T1,$T2,$A ; a = T1 + T2
; Rounds 16..63: software-pipelined loop (SPLOOPD, iteration interval 10).
; Each iteration first extends the message schedule,
;   X[i] += sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14]),
; and then performs the same round computation as above.
192 SPLOOPD 10 ; BODY_16_63
194 || ROTL $X1,14,$t1e ; modulo-scheduled
195 || ROTL $X14,13,$t1a ; modulo-scheduled
201 XOR $t1e,$s0,$s0 ; sigma0(X[i+1])
202 || XOR $t1a,$s1,$s1 ; sigma1(X[i+14])
203 || LDW *${Xib}[2],$X1 ; modulo-scheduled
210 || ADD $X9,$X0,$X0 ; X[i] += X[i+9]
214 || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
215 || ADD $s0,$X0,$X0 ; X[i] += sigma0(X[i+1])
217 || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
219 || ADD $H,$K,$T1 ; T1 = h + K256[i]
220 || ADD $s1,$X0,$X0 ; X[i] += sigma1(X[i+14])
223 || ADD $X0,$T1,$T1 ; T1 += X[i]
225 XOR $t1a,$S0,$S0 ; Sigma0(a)
226 || XOR $t1e,$S1,$S1 ; Sigma1(e)
227 || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
229 || ROTL $G,0,$H ; h = g
230 || LDW *$K256++,$K ; pre-fetch K256[i+1]
231 ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
232 || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
234 || MV $Xn,$X0 ; modulo-scheduled
235 || LDW *++$Xia,$X9 ; modulo-scheduled
236 || ROTL $X1,25,$t0e ; modulo-scheduled
237 || ROTL $X14,15,$t0a ; modulo-scheduled
238 ROTL $X1,14,$t1e ; modulo-scheduled
239 || ROTL $X14,13,$t1a ; modulo-scheduled
241 || ADD $D,$T1,$E ; e = d + T1
245 || ADD $T1,$T2,$A ; a = T1 + T2
246 || SHRU $X1,3,$s0 ; modulo-scheduled
247 || SHRU $X14,10,$s1 ; modulo-scheduled
251 || [A0] LDNW *$INP++,$Xn ; pre-fetch input
252 || [A0] ADDK -260,$K256 ; rewind K256 (260 bytes = 65 words; presumably 64 constants plus one extra pre-fetch - confirm)
253 || ADD $Actx,$A,$A ; accumulate ctx
261 || [A0] LDW *$K256++,$K ; pre-fetch K256[0]
264 ||[!A0] MV $CTXA,$CTXB
; Exit path (A0 == 0): restore SP and FP, write the state words back to
; ctx and clear AMR.  The [A0] lines above instead prepare the next pass.
265 [!A0] MV FP,SP ; restore stack pointer
266 ||[!A0] LDW *FP[0],FP ; restore frame pointer
267 [!A0] STW $A,*${CTXA}[0] ; save ctx
268 ||[!A0] STW $E,*${CTXB}[4]
270 [!A0] STW $B,*${CTXA}[1]
271 ||[!A0] STW $F,*${CTXB}[5]
272 ||[!A0] MVC B0,AMR ; clear AMR
274 || STW $G,*${CTXB}[6]
276 || STW $H,*${CTXB}[7]
; NOTE(review): two consecutive .sect directives; presumably alternatives
; selected by conditional assembly that is not visible here - confirm.
280 .sect ".text:sha_asm.const"
282 .sect ".const:sha_asm"
; Constant table of 64 32-bit words (16 rows of 4).  The round code refers
; to it as K256[i] and reads it sequentially with a post-incremented pointer.
286 .uword 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
287 .uword 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
288 .uword 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
289 .uword 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
290 .uword 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
291 .uword 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
292 .uword 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
293 .uword 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
294 .uword 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
295 .uword 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
296 .uword 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
297 .uword 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
298 .uword 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
299 .uword 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
300 .uword 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
301 .uword 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
; Identification string placed directly after the table.
302 .cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"