3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
14 # Performance is just below 10 cycles per processed byte, which is
15 # almost 40% faster than compiler-generated code. Unroll is unlikely
16 # to give more than ~8% improvement...
18 # !!! Note that this module uses AMR, which means that all interrupt
19 # service routines are expected to preserve it and, for their own well-being,
22 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} # skip leading flags; stop at the first file-name-looking argument
23 open STDOUT,">$output"; # generated assembly is printed to STDOUT, redirected to the output file
25 ($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments
28 ($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14) # A-side working names; register assignment continues on elided lines
30 ($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15) # B-side working names; register assignment continues on elided lines
33 ($Xia,$Xib)=("A5","B5"); # circular/ring buffer
36 ($Xn,$X0,$K)=("B7","B8","B9"); # pre-fetched X word, current X[i], pre-fetched K256[i]
37 ($Maj,$Ch)=($T2,"B6"); # Maj shares $T2's register; Ch lives in B6
42 .if .ASSEMBLER_VERSION<7000000
47 .asg sha256_block_data_order,_sha256_block_data_order
59 .global _sha256_block_data_order
;-----------------------------------------------------------------------
; sha256_block_data_order(ctx, input, num)
; In:  A4 = ctx, B4 = input pointer, A6 = block count (returns if zero)
; The 16-word message schedule lives in a ring buffer addressed through
; AMR circular addressing; AMR is programmed on entry and cleared on the
; way out, as the top-of-file note warns ISRs about.
; NOTE(review): exact stack frame layout is on elided lines — stack_usage
; directive below declares 64 bytes.
;-----------------------------------------------------------------------
60 _sha256_block_data_order:
62 .asmfunc stack_usage(64)
63 MV $NUM,A0 ; reassign $NUM
65 [!A0] BNOP RA ; if ($NUM==0) return;
66 || [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64)
68 [A0] ADDKPC __sha256_block,B2 ; B2 = address of __sha256_block (PC-relative base)
69 || [A0] AND B0,SP,SP ; align stack at 64 bytes
72 || [A0] MVKL \$PCR_OFFSET(K256,__sha256_block),$K256 ; PIC build: low half of K256's PC-relative offset
74 || [A0] MVKH \$PCR_OFFSET(K256,__sha256_block),$K256 ; PIC build: high half
77 || [A0] MVKL (K256-__sha256_block),$K256 ; non-PIC variant of the same offset (low half)
79 || [A0] MVKH (K256-__sha256_block),$K256 ; non-PIC variant (high half)
81 [A0] MVC B1,AMR ; setup circular addressing
84 || [A0] ADD B2,$K256,$K256 ; $K256 = base + offset = absolute address of K256 table
85 || [A0] MV $CTXA,$CTXB ; second ctx pointer for B-side loads/stores
86 || [A0] SUBAW SP,2,SP ; reserve two words above buffer
87 LDW *${CTXA}[0],$A ; load ctx
96 LDNW *$INP++,$Xn ; pre-fetch input
97 LDW *$K256++,$K ; pre-fetch K256[0]
98 MVK 14,B0 ; loop counters
; Rounds 0..14: software-pipelined loop, 8-cycle initiation interval.
113 SPLOOPD 8 ; BODY_00_14
127 || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
129 || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
131 || ADD $K,$H,$T1 ; T1 = h + K256[i]
132 ADD $X0,$T1,$T1 ; T1 += X[i];
136 XOR $t1a,$S0,$S0 ; Sigma0(a)
137 || XOR $t1e,$S1,$S1 ; Sigma1(e)
138 || LDW *$K256++,$K ; pre-fetch K256[i+1]
139 || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
140 ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
141 || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
142 || ROTL $G,0,$H ; h = g
148 || ADD $D,$T1,$E ; e = d + T1
152 || ADD $T1,$T2,$A ; a = T1 + T2
; Round 15: same round function, plus start of the modulo-scheduled
; message-schedule (sigma0/sigma1) computation feeding rounds 16..63.
155 ROTL $A,30,$S0 ; BODY_15
161 || LDW *${Xib}[1],$Xn ; modulo-scheduled
165 || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
166 || LDW *${Xib}[2],$X1 ; modulo-scheduled
168 || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
170 || ADD $K,$H,$T1 ; T1 = h + K256[i]
171 ADD $X0,$T1,$T1 ; T1 += X[i];
175 XOR $t1a,$S0,$S0 ; Sigma0(a)
176 || XOR $t1e,$S1,$S1 ; Sigma1(e)
177 || LDW *$K256++,$K ; pre-fetch K256[i+1]
178 || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
179 ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
180 || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
181 || ROTL $G,0,$H ; h = g
185 || ADD $D,$T1,$E ; e = d + T1
187 || MV $Xn,$X0 ; modulo-scheduled
188 || LDW *$Xia,$X9 ; modulo-scheduled
189 || ROTL $X1,25,$t0e ; modulo-scheduled
190 || ROTL $X14,15,$t0a ; modulo-scheduled
191 SHRU $X1,3,$s0 ; modulo-scheduled
192 || SHRU $X14,10,$s1 ; modulo-scheduled
193 || ROTL $B,0,$C ; c = b
195 || ADD $T1,$T2,$A ; a = T1 + T2
; Rounds 16..63: round function plus in-loop message expansion,
; 10-cycle initiation interval.
197 SPLOOPD 10 ; BODY_16_63
199 || ROTL $X1,14,$t1e ; modulo-scheduled
200 || ROTL $X14,13,$t1a ; modulo-scheduled
206 XOR $t1e,$s0,$s0 ; sigma0(X[i+1])
207 || XOR $t1a,$s1,$s1 ; sigma1(X[i+14])
208 || LDW *${Xib}[2],$X1 ; modulo-scheduled
215 || ADD $X9,$X0,$X0 ; X[i] += X[i+9]
219 || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
220 || ADD $s0,$X0,$X0 ; X[i] += sigma0(X[i+1])
222 || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
224 || ADD $H,$K,$T1 ; T1 = h + K256[i]
225 || ADD $s1,$X0,$X0 ; X[i] += sigma1(X[i+14])
228 || ADD $X0,$T1,$T1 ; T1 += X[i]
230 XOR $t1a,$S0,$S0 ; Sigma0(a)
231 || XOR $t1e,$S1,$S1 ; Sigma1(e)
232 || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
234 || ROTL $G,0,$H ; h = g
235 || LDW *$K256++,$K ; pre-fetch K256[i+1]
236 ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
237 || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
239 || MV $Xn,$X0 ; modulo-scheduled
240 || LDW *++$Xia,$X9 ; modulo-scheduled
241 || ROTL $X1,25,$t0e ; modulo-scheduled
242 || ROTL $X14,15,$t0a ; modulo-scheduled
243 ROTL $X1,14,$t1e ; modulo-scheduled
244 || ROTL $X14,13,$t1a ; modulo-scheduled
246 || ADD $D,$T1,$E ; e = d + T1
250 || ADD $T1,$T2,$A ; a = T1 + T2
251 || SHRU $X1,3,$s0 ; modulo-scheduled
252 || SHRU $X14,10,$s1 ; modulo-scheduled
; Outer-loop epilogue: A0 is the remaining block count; while blocks
; remain ([A0]) re-prime the pipeline, otherwise ([!A0]) store the
; accumulated state back to ctx and unwind.
256 || [A0] LDNW *$INP++,$Xn ; pre-fetch input
257 || [A0] ADDK -260,$K256 ; rewind K256
258 || ADD $Actx,$A,$A ; accumulate ctx
266 || [A0] LDW *$K256++,$K ; pre-fetch K256[0]
269 ||[!A0] MV $CTXA,$CTXB ; last block: B-side pointer for the ctx stores below
270 [!A0] MV FP,SP ; restore stack pointer
271 ||[!A0] LDW *FP[0],FP ; restore frame pointer
272 [!A0] STW $A,*${CTXA}[0] ; save ctx
273 ||[!A0] STW $E,*${CTXB}[4]
275 [!A0] STW $B,*${CTXA}[1]
276 ||[!A0] STW $F,*${CTXB}[5]
277 ||[!A0] MVC B0,AMR ; clear AMR
279 || STW $G,*${CTXB}[6]
281 || STW $H,*${CTXB}[7]
285 .sect ".text:sha_asm.const"
287 .sect ".const:sha_asm"
; K256[0..63]: the standard SHA-256 round constants (FIPS 180-4) —
; first 32 bits of the fractional parts of the cube roots of the
; first 64 primes. Consumed sequentially via *$K256++ and rewound by
; ADDK -260 between blocks.
291 .uword 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
292 .uword 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
293 .uword 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
294 .uword 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
295 .uword 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
296 .uword 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
297 .uword 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
298 .uword 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
299 .uword 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
300 .uword 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
301 .uword 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
302 .uword 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
303 .uword 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
304 .uword 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
305 .uword 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
306 .uword 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
307 .cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"