3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # SHA512 block procedure for ARMv4. September 2007.
12 # This code is ~4.5 (four and a half) times faster than code generated
13 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
14 # Xscale PXA250 core].
18 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
19 # Cortex A8 core and ~40 cycles per processed byte.
23 # Profiler-assisted and platform-specific optimization resulted in 7%
24 # improvement on Cortex A8 core and ~38 cycles per byte.
28 # Add NEON implementation. On Cortex A8 it was measured to process
29 # one byte in 23.3 cycles or ~60% faster than integer-only code.
33 # Improve NEON performance by 12% on Snapdragon S4. In absolute
34 # terms it's 22.6 cycles per byte, which is a disappointing result.
35 # Technical writers asserted that 3-way S4 pipeline can sustain
36 # multiple NEON instructions per cycle, but dual NEON issue could
37 # not be observed, and for NEON-only sequences IPC(*) was found to
38 # be limited by 1:-( 0.33 and 0.66 were measured for sequences with
39 # ILPs(*) of 1 and 2 respectively. This in turn means that you can
40 # even find yourself striving, as I did here, for achieving IPC
41 # adequate to one delivered by Cortex A8 [for reference, it's
42 # 0.5 for ILP of 1, and 1 for higher ILPs].
44 # (*) ILP, instruction-level parallelism, how many instructions
45 # *can* execute at the same time. IPC, instructions per cycle,
46 # indicates how many instructions actually execute.
48 # Byte order [in]dependence. =========================================
50 # Originally caller was expected to maintain specific *dword* order in
51 # h[0-7], namely with most significant dword at *lower* address, which
52 # was reflected in below two parameters as 0 and 4. Now caller is
53 # expected to maintain native byte order for whole 64-bit values.
56 # ====================================================================
58 # Scan the argument list for the first token that looks like an output
58 # filename (word chars/hyphens plus an extension), discarding any
58 # flag-style arguments that precede it.
58 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
59 # Redirect STDOUT to that file so all generated assembly goes there.
59 # Three-arg open prevents mode injection via the filename, and a
59 # failed open must be fatal — silently generating nothing is worse.
59 open STDOUT,">",$output or die "can't open $output: $!";
61 $ctx="r0"; # parameter block: pointer to the hash state h[0-7], read/written via $Aoff..$Hoff offsets below
75 ############ r13 is stack pointer (reserved — never allocated as a temporary)
77 ############ r15 is program counter (reserved)
92 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
93 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
94 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
96 str $Tlo,[sp,#$Xoff+0]
98 str $Thi,[sp,#$Xoff+4]
99 eor $t0,$t0,$Ehi,lsl#18
100 ldr $t2,[sp,#$Hoff+0] @ h.lo
101 eor $t1,$t1,$Elo,lsl#18
102 ldr $t3,[sp,#$Hoff+4] @ h.hi
103 eor $t0,$t0,$Elo,lsr#18
104 eor $t1,$t1,$Ehi,lsr#18
105 eor $t0,$t0,$Ehi,lsl#14
106 eor $t1,$t1,$Elo,lsl#14
107 eor $t0,$t0,$Ehi,lsr#9
108 eor $t1,$t1,$Elo,lsr#9
109 eor $t0,$t0,$Elo,lsl#23
110 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
112 ldr $t0,[sp,#$Foff+0] @ f.lo
113 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
114 ldr $t1,[sp,#$Foff+4] @ f.hi
116 ldr $t2,[sp,#$Goff+0] @ g.lo
117 adc $Thi,$Thi,$t3 @ T += h
118 ldr $t3,[sp,#$Goff+4] @ g.hi
121 str $Elo,[sp,#$Eoff+0]
123 str $Ehi,[sp,#$Eoff+4]
125 str $Alo,[sp,#$Aoff+0]
127 str $Ahi,[sp,#$Aoff+4]
129 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
130 eor $t1,$t1,$t3 @ Ch(e,f,g)
131 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
134 ldr $Elo,[sp,#$Doff+0] @ d.lo
135 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
136 ldr $Ehi,[sp,#$Doff+4] @ d.hi
139 adc $Thi,$Thi,$t3 @ T += K[i]
141 ldr $t2,[sp,#$Boff+0] @ b.lo
142 adc $Ehi,$Ehi,$Thi @ d += T
145 ldr $t3,[sp,#$Coff+0] @ c.lo
147 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
148 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
149 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
152 eor $t0,$t0,$Ahi,lsl#4
153 eor $t1,$t1,$Alo,lsl#4
154 eor $t0,$t0,$Ahi,lsr#2
155 eor $t1,$t1,$Alo,lsr#2
156 eor $t0,$t0,$Alo,lsl#30
157 eor $t1,$t1,$Ahi,lsl#30
158 eor $t0,$t0,$Ahi,lsr#7
159 eor $t1,$t1,$Alo,lsr#7
160 eor $t0,$t0,$Alo,lsl#25
161 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
164 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
166 ldr $t1,[sp,#$Boff+4] @ b.hi
168 ldr $t2,[sp,#$Coff+4] @ c.hi
172 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
175 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
177 adc $Ahi,$Ahi,$Thi @ h += T
183 #include "arm_arch.h"
187 # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
191 # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
199 WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
200 WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
201 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
202 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
203 WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
204 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
205 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
206 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
207 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
208 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
209 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
210 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
211 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
212 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
213 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
214 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
215 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
216 WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
217 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
218 WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
219 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
220 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
221 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
222 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
223 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
224 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
225 WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
226 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
227 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
228 WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
229 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
230 WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
231 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
232 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
233 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
234 WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
235 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
236 WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
237 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
238 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
241 .word OPENSSL_armcap_P-sha512_block_data_order
244 .global sha512_block_data_order
245 .type sha512_block_data_order,%function
246 sha512_block_data_order:
247 sub r3,pc,#8 @ sha512_block_data_order
248 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
250 ldr r12,.LOPENSSL_armcap
251 ldr r12,[r3,r12] @ OPENSSL_armcap_P
255 stmdb sp!,{r4-r12,lr}
256 sub $Ktbl,r3,#672 @ K512
259 ldr $Elo,[$ctx,#$Eoff+$lo]
260 ldr $Ehi,[$ctx,#$Eoff+$hi]
261 ldr $t0, [$ctx,#$Goff+$lo]
262 ldr $t1, [$ctx,#$Goff+$hi]
263 ldr $t2, [$ctx,#$Hoff+$lo]
264 ldr $t3, [$ctx,#$Hoff+$hi]
266 str $t0, [sp,#$Goff+0]
267 str $t1, [sp,#$Goff+4]
268 str $t2, [sp,#$Hoff+0]
269 str $t3, [sp,#$Hoff+4]
270 ldr $Alo,[$ctx,#$Aoff+$lo]
271 ldr $Ahi,[$ctx,#$Aoff+$hi]
272 ldr $Tlo,[$ctx,#$Boff+$lo]
273 ldr $Thi,[$ctx,#$Boff+$hi]
274 ldr $t0, [$ctx,#$Coff+$lo]
275 ldr $t1, [$ctx,#$Coff+$hi]
276 ldr $t2, [$ctx,#$Doff+$lo]
277 ldr $t3, [$ctx,#$Doff+$hi]
278 str $Tlo,[sp,#$Boff+0]
279 str $Thi,[sp,#$Boff+4]
280 str $t0, [sp,#$Coff+0]
281 str $t1, [sp,#$Coff+4]
282 str $t2, [sp,#$Doff+0]
283 str $t3, [sp,#$Doff+4]
284 ldr $Tlo,[$ctx,#$Foff+$lo]
285 ldr $Thi,[$ctx,#$Foff+$hi]
286 str $Tlo,[sp,#$Foff+0]
287 str $Thi,[sp,#$Foff+4]
297 orr $Tlo,$Tlo,$t0,lsl#8
299 orr $Tlo,$Tlo,$t1,lsl#16
301 orr $Tlo,$Tlo,$t2,lsl#24
302 orr $Thi,$Thi,$t3,lsl#8
303 orr $Thi,$Thi,$t0,lsl#16
304 orr $Thi,$Thi,$t1,lsl#24
318 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
319 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
322 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
323 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
324 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
326 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
328 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
329 eor $Tlo,$Tlo,$t1,lsl#31
330 eor $Thi,$Thi,$t0,lsl#31
331 eor $Tlo,$Tlo,$t0,lsr#8
332 eor $Thi,$Thi,$t1,lsr#8
333 eor $Tlo,$Tlo,$t1,lsl#24
334 eor $Thi,$Thi,$t0,lsl#24
335 eor $Tlo,$Tlo,$t0,lsr#7
336 eor $Thi,$Thi,$t1,lsr#7
337 eor $Tlo,$Tlo,$t1,lsl#25
339 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
340 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
341 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
344 eor $t0,$t0,$t3,lsl#13
345 eor $t1,$t1,$t2,lsl#13
346 eor $t0,$t0,$t3,lsr#29
347 eor $t1,$t1,$t2,lsr#29
348 eor $t0,$t0,$t2,lsl#3
349 eor $t1,$t1,$t3,lsl#3
350 eor $t0,$t0,$t2,lsr#6
351 eor $t1,$t1,$t3,lsr#6
352 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
353 eor $t0,$t0,$t3,lsl#26
355 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
357 ldr $t0,[sp,#`$Xoff+8*16`+0]
360 ldr $t1,[sp,#`$Xoff+8*16`+4]
368 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
369 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
373 ldr $Tlo,[sp,#$Boff+0]
374 ldr $Thi,[sp,#$Boff+4]
375 ldr $t0, [$ctx,#$Aoff+$lo]
376 ldr $t1, [$ctx,#$Aoff+$hi]
377 ldr $t2, [$ctx,#$Boff+$lo]
378 ldr $t3, [$ctx,#$Boff+$hi]
380 str $t0, [$ctx,#$Aoff+$lo]
382 str $t1, [$ctx,#$Aoff+$hi]
384 str $t2, [$ctx,#$Boff+$lo]
386 str $t3, [$ctx,#$Boff+$hi]
388 ldr $Alo,[sp,#$Coff+0]
389 ldr $Ahi,[sp,#$Coff+4]
390 ldr $Tlo,[sp,#$Doff+0]
391 ldr $Thi,[sp,#$Doff+4]
392 ldr $t0, [$ctx,#$Coff+$lo]
393 ldr $t1, [$ctx,#$Coff+$hi]
394 ldr $t2, [$ctx,#$Doff+$lo]
395 ldr $t3, [$ctx,#$Doff+$hi]
397 str $t0, [$ctx,#$Coff+$lo]
399 str $t1, [$ctx,#$Coff+$hi]
401 str $t2, [$ctx,#$Doff+$lo]
403 str $t3, [$ctx,#$Doff+$hi]
405 ldr $Tlo,[sp,#$Foff+0]
406 ldr $Thi,[sp,#$Foff+4]
407 ldr $t0, [$ctx,#$Eoff+$lo]
408 ldr $t1, [$ctx,#$Eoff+$hi]
409 ldr $t2, [$ctx,#$Foff+$lo]
410 ldr $t3, [$ctx,#$Foff+$hi]
412 str $Elo,[$ctx,#$Eoff+$lo]
414 str $Ehi,[$ctx,#$Eoff+$hi]
416 str $t2, [$ctx,#$Foff+$lo]
418 str $t3, [$ctx,#$Foff+$hi]
420 ldr $Alo,[sp,#$Goff+0]
421 ldr $Ahi,[sp,#$Goff+4]
422 ldr $Tlo,[sp,#$Hoff+0]
423 ldr $Thi,[sp,#$Hoff+4]
424 ldr $t0, [$ctx,#$Goff+$lo]
425 ldr $t1, [$ctx,#$Goff+$hi]
426 ldr $t2, [$ctx,#$Hoff+$lo]
427 ldr $t3, [$ctx,#$Hoff+$hi]
429 str $t0, [$ctx,#$Goff+$lo]
431 str $t1, [$ctx,#$Goff+$hi]
433 str $t2, [$ctx,#$Hoff+$lo]
435 str $t3, [$ctx,#$Hoff+$hi]
443 add sp,sp,#8*9 @ destroy frame
445 ldmia sp!,{r4-r12,pc}
447 ldmia sp!,{r4-r12,lr}
449 moveq pc,lr @ be binary compatible with V4, yet
450 bx lr @ interoperable with Thumb ISA:-)
455 my @Sigma0=(28,34,39);
456 my @Sigma1=(14,18,41);
457 my @sigma0=(1, 8, 7);
458 my @sigma1=(19,61,6);
461 my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
463 my @X=map("d$_",(0..15));
464 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
468 my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
469 my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
471 $code.=<<___ if ($i<16 || $i&1);
472 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
474 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
476 vshr.u64 $t1,$e,#@Sigma1[1]
478 vadd.i64 $a,$Maj @ h+=Maj from the past
480 vshr.u64 $t2,$e,#@Sigma1[2]
483 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
484 vsli.64 $t0,$e,#`64-@Sigma1[0]`
485 vsli.64 $t1,$e,#`64-@Sigma1[1]`
487 vsli.64 $t2,$e,#`64-@Sigma1[2]`
488 #if $i<16 && defined(__ARMEL__)
489 vrev64.8 @X[$i],@X[$i]
492 vbsl $Ch,$f,$g @ Ch(e,f,g)
493 vshr.u64 $t0,$a,#@Sigma0[0]
494 veor $t2,$t1 @ Sigma1(e)
496 vshr.u64 $t1,$a,#@Sigma0[1]
497 vsli.64 $t0,$a,#`64-@Sigma0[0]`
499 vshr.u64 $t2,$a,#@Sigma0[2]
500 vadd.i64 $K,@X[$i%16]
501 vsli.64 $t1,$a,#`64-@Sigma0[1]`
503 vsli.64 $t2,$a,#`64-@Sigma0[2]`
506 vbsl $Maj,$c,$b @ Maj(a,b,c)
507 veor $h,$t2 @ Sigma0(a)
517 if ($i&1) { &NEON_00_15($i,@_); return; }
519 # 2x-vectorized, therefore runs every 2nd round
520 my @X=map("q$_",(0..7)); # view @X as 128-bit vector
521 my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
522 my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
523 my $e=@_[4]; # $e from NEON_00_15
526 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
527 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
528 vadd.i64 @_[0],d30 @ h+=Maj from the past
529 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
530 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
531 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
532 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
534 vshr.u64 $t0,$s0,#@sigma0[0]
535 veor $s1,$t1 @ sigma1(X[i+14])
536 vshr.u64 $t1,$s0,#@sigma0[1]
537 vadd.i64 @X[$i%8],$s1
538 vshr.u64 $s1,$s0,#@sigma0[2]
539 vsli.64 $t0,$s0,#`64-@sigma0[0]`
540 vsli.64 $t1,$s0,#`64-@sigma0[1]`
541 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
543 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
544 vadd.i64 @X[$i%8],$s0
545 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
546 veor $s1,$t1 @ sigma0(X[i+1])
547 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
548 vadd.i64 @X[$i%8],$s1
550 &NEON_00_15(2*$i,@_);
559 dmb @ errata #451034 on early Cortex A8
560 vstmdb sp!,{d8-d15} @ ABI specification says so
561 sub $Ktbl,r3,#672 @ K512
562 vldmia $ctx,{$A-$H} @ load context
565 for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
571 for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
575 vadd.i64 $A,d30 @ h+=Maj from the past
576 vldmia $ctx,{d24-d31} @ load context to temp
577 vadd.i64 q8,q12 @ vectorized accumulate
581 vstmia $ctx,{$A-$H} @ save context
583 sub $Ktbl,#640 @ rewind K512
586 vldmia sp!,{d8-d15} @ epilogue
592 .size sha512_block_data_order,.-sha512_block_data_order
593 .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
595 .comm OPENSSL_armcap_P,4,4
598 # Evaluate `...` constructs embedded in the assembly templates: each
598 # backquoted span is a Perl expression (typically offset arithmetic)
598 # computed at generation time and spliced into the output.
598 $code =~ s/\`([^\`]*)\`/eval $1/gem;
599 # Encode "bx lr" as its raw opcode so the output still assembles with
599 # -march=armv4 toolchains that reject the BX mnemonic.
599 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;
601 # STDOUT is a buffered write handle redirected to the output file;
601 # write errors may only surface at close, so a failed close is fatal.
601 close STDOUT or die "error closing STDOUT: $!"; # enforce flush