2 # Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
16 # Permission to use under GPL terms is granted.
17 # ====================================================================
19 # SHA512 block procedure for ARMv4. September 2007.
21 # This code is ~4.5 (four and a half) times faster than code generated
22 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
23 # Xscale PXA250 core].
27 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
28 # Cortex A8 core and ~40 cycles per processed byte.
32 # Profiler-assisted and platform-specific optimization resulted in 7%
33 # improvement on Cortex A8 core and ~38 cycles per byte.
37 # Add NEON implementation. On Cortex A8 it was measured to process
38 # one byte in 23.3 cycles or ~60% faster than integer-only code.
42 # Improve NEON performance by 12% on Snapdragon S4. In absolute
43 # terms it's 22.6 cycles per byte, which is disappointing result.
44 # Technical writers asserted that 3-way S4 pipeline can sustain
45 # multiple NEON instructions per cycle, but dual NEON issue could
46 # not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
47 # for further details. On a side note, Cortex-A15 processes one byte in
50 # Byte order [in]dependence. =========================================
52 # Originally caller was expected to maintain specific *dword* order in
53 # h[0-7], namely with most significant dword at *lower* address, which
54 # was reflected in below two parameters as 0 and 4. Now caller is
55 # expected to maintain native byte order for whole 64-bit values.
58 # ====================================================================
60 # $output is the last argument if it looks like a file (it has an extension)
61 # $flavour is the first argument if it doesn't look like a file
# The last CLI argument is the output file iff it has an extension;
# the first argument is the assembler "flavour" iff it is not a filename.
62 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
63 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# With a real flavour, pipe everything we print through the arm-xlate.pl
# translator, which adapts the generated assembly to the target assembler
# dialect; flavour "void" (or none) bypasses translation.
65 if ($flavour && $flavour ne "void") {
# Locate arm-xlate.pl relative to this script: first alongside it, then
# in ../../perlasm.
66 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
67 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
68 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
69 die "can't locate arm-xlate.pl";
# Re-open STDOUT as a pipe into the translator; the translator itself
# writes to $output.
71 open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
72 or die "can't call $xlate: $!";
# No translation requested: write generated code straight to $output.
74 $output and open STDOUT,">$output";
# ARM register allocation for the integer-only code path.
77 $ctx="r0"; # parameter block
91 ############ r13 is stack pointer
93 ############ r15 is program counter
108 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
109 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
110 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
112 str $Tlo,[sp,#$Xoff+0]
114 str $Thi,[sp,#$Xoff+4]
115 eor $t0,$t0,$Ehi,lsl#18
116 ldr $t2,[sp,#$Hoff+0] @ h.lo
117 eor $t1,$t1,$Elo,lsl#18
118 ldr $t3,[sp,#$Hoff+4] @ h.hi
119 eor $t0,$t0,$Elo,lsr#18
120 eor $t1,$t1,$Ehi,lsr#18
121 eor $t0,$t0,$Ehi,lsl#14
122 eor $t1,$t1,$Elo,lsl#14
123 eor $t0,$t0,$Ehi,lsr#9
124 eor $t1,$t1,$Elo,lsr#9
125 eor $t0,$t0,$Elo,lsl#23
126 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
128 ldr $t0,[sp,#$Foff+0] @ f.lo
129 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
130 ldr $t1,[sp,#$Foff+4] @ f.hi
132 ldr $t2,[sp,#$Goff+0] @ g.lo
133 adc $Thi,$Thi,$t3 @ T += h
134 ldr $t3,[sp,#$Goff+4] @ g.hi
137 str $Elo,[sp,#$Eoff+0]
139 str $Ehi,[sp,#$Eoff+4]
141 str $Alo,[sp,#$Aoff+0]
143 str $Ahi,[sp,#$Aoff+4]
145 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
146 eor $t1,$t1,$t3 @ Ch(e,f,g)
147 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
150 ldr $Elo,[sp,#$Doff+0] @ d.lo
151 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
152 ldr $Ehi,[sp,#$Doff+4] @ d.hi
155 adc $Thi,$Thi,$t3 @ T += K[i]
157 ldr $t2,[sp,#$Boff+0] @ b.lo
158 adc $Ehi,$Ehi,$Thi @ d += T
161 ldr $t3,[sp,#$Coff+0] @ c.lo
163 it eq @ Thumb2 thing, sanity check in ARM
166 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
167 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
168 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
171 eor $t0,$t0,$Ahi,lsl#4
172 eor $t1,$t1,$Alo,lsl#4
173 eor $t0,$t0,$Ahi,lsr#2
174 eor $t1,$t1,$Alo,lsr#2
175 eor $t0,$t0,$Alo,lsl#30
176 eor $t1,$t1,$Ahi,lsl#30
177 eor $t0,$t0,$Ahi,lsr#7
178 eor $t1,$t1,$Alo,lsr#7
179 eor $t0,$t0,$Alo,lsl#25
180 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
183 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
185 ldr $t1,[sp,#$Boff+4] @ b.hi
187 ldr $t2,[sp,#$Coff+4] @ c.hi
191 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
194 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
196 adc $Ahi,$Ahi,$Thi @ h += T
# Windows armasm spells the 32-bit data directive "DCDU"; GNU as uses ".word".
202 my $_word = ($flavour =~ /win/ ? "DCDU" : ".word");
206 # include "arm_arch.h"
207 # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
208 # define VFP_ABI_POP vldmia sp!,{d8-d15}
210 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
211 # define __ARM_MAX_ARCH__ 7
212 # define VFP_ABI_PUSH
219 # define WORD64(hi0,lo0,hi1,lo1) $_word lo0,hi0, lo1,hi1
223 # define WORD64(hi0,lo0,hi1,lo1) $_word hi0,lo0, hi1,lo1
226 #if defined(__thumb2__)
239 WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
240 WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
241 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
242 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
243 WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
244 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
245 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
246 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
247 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
248 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
249 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
250 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
251 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
252 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
253 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
254 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
255 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
256 WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
257 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
258 WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
259 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
260 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
261 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
262 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
263 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
264 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
265 WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
266 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
267 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
268 WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
269 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
270 WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
271 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
272 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
273 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
274 WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
275 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
276 WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
277 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
278 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
280 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
283 .word OPENSSL_armcap_P
285 .word OPENSSL_armcap_P-.Lsha512_block_data_order
292 .global sha512_block_data_order
293 .type sha512_block_data_order,%function
294 sha512_block_data_order:
295 .Lsha512_block_data_order:
296 #if __ARM_ARCH__<7 && !defined(__thumb2__)
297 sub r3,pc,#8 @ sha512_block_data_order
299 adr r3,.Lsha512_block_data_order
301 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
302 ldr r12,.LOPENSSL_armcap
303 # if !defined(_WIN32)
304 ldr r12,[r3,r12] @ OPENSSL_armcap_P
306 # if defined(__APPLE__) || defined(_WIN32)
312 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
313 stmdb sp!,{r4-r12,lr}
314 sub $Ktbl,r3,#672 @ K512
317 ldr $Elo,[$ctx,#$Eoff+$lo]
318 ldr $Ehi,[$ctx,#$Eoff+$hi]
319 ldr $t0, [$ctx,#$Goff+$lo]
320 ldr $t1, [$ctx,#$Goff+$hi]
321 ldr $t2, [$ctx,#$Hoff+$lo]
322 ldr $t3, [$ctx,#$Hoff+$hi]
324 str $t0, [sp,#$Goff+0]
325 str $t1, [sp,#$Goff+4]
326 str $t2, [sp,#$Hoff+0]
327 str $t3, [sp,#$Hoff+4]
328 ldr $Alo,[$ctx,#$Aoff+$lo]
329 ldr $Ahi,[$ctx,#$Aoff+$hi]
330 ldr $Tlo,[$ctx,#$Boff+$lo]
331 ldr $Thi,[$ctx,#$Boff+$hi]
332 ldr $t0, [$ctx,#$Coff+$lo]
333 ldr $t1, [$ctx,#$Coff+$hi]
334 ldr $t2, [$ctx,#$Doff+$lo]
335 ldr $t3, [$ctx,#$Doff+$hi]
336 str $Tlo,[sp,#$Boff+0]
337 str $Thi,[sp,#$Boff+4]
338 str $t0, [sp,#$Coff+0]
339 str $t1, [sp,#$Coff+4]
340 str $t2, [sp,#$Doff+0]
341 str $t3, [sp,#$Doff+4]
342 ldr $Tlo,[$ctx,#$Foff+$lo]
343 ldr $Thi,[$ctx,#$Foff+$hi]
344 str $Tlo,[sp,#$Foff+0]
345 str $Thi,[sp,#$Foff+4]
355 orr $Tlo,$Tlo,$t0,lsl#8
357 orr $Tlo,$Tlo,$t1,lsl#16
359 orr $Tlo,$Tlo,$t2,lsl#24
360 orr $Thi,$Thi,$t3,lsl#8
361 orr $Thi,$Thi,$t0,lsl#16
362 orr $Thi,$Thi,$t1,lsl#24
376 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
377 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
380 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
381 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
382 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
384 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
386 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
387 eor $Tlo,$Tlo,$t1,lsl#31
388 eor $Thi,$Thi,$t0,lsl#31
389 eor $Tlo,$Tlo,$t0,lsr#8
390 eor $Thi,$Thi,$t1,lsr#8
391 eor $Tlo,$Tlo,$t1,lsl#24
392 eor $Thi,$Thi,$t0,lsl#24
393 eor $Tlo,$Tlo,$t0,lsr#7
394 eor $Thi,$Thi,$t1,lsr#7
395 eor $Tlo,$Tlo,$t1,lsl#25
397 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
398 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
399 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
402 eor $t0,$t0,$t3,lsl#13
403 eor $t1,$t1,$t2,lsl#13
404 eor $t0,$t0,$t3,lsr#29
405 eor $t1,$t1,$t2,lsr#29
406 eor $t0,$t0,$t2,lsl#3
407 eor $t1,$t1,$t3,lsl#3
408 eor $t0,$t0,$t2,lsr#6
409 eor $t1,$t1,$t3,lsr#6
410 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
411 eor $t0,$t0,$t3,lsl#26
413 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
415 ldr $t0,[sp,#`$Xoff+8*16`+0]
418 ldr $t1,[sp,#`$Xoff+8*16`+4]
427 ittt eq @ Thumb2 thing, sanity check in ARM
429 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
430 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
434 ldr $Tlo,[sp,#$Boff+0]
435 ldr $Thi,[sp,#$Boff+4]
436 ldr $t0, [$ctx,#$Aoff+$lo]
437 ldr $t1, [$ctx,#$Aoff+$hi]
438 ldr $t2, [$ctx,#$Boff+$lo]
439 ldr $t3, [$ctx,#$Boff+$hi]
441 str $t0, [$ctx,#$Aoff+$lo]
443 str $t1, [$ctx,#$Aoff+$hi]
445 str $t2, [$ctx,#$Boff+$lo]
447 str $t3, [$ctx,#$Boff+$hi]
449 ldr $Alo,[sp,#$Coff+0]
450 ldr $Ahi,[sp,#$Coff+4]
451 ldr $Tlo,[sp,#$Doff+0]
452 ldr $Thi,[sp,#$Doff+4]
453 ldr $t0, [$ctx,#$Coff+$lo]
454 ldr $t1, [$ctx,#$Coff+$hi]
455 ldr $t2, [$ctx,#$Doff+$lo]
456 ldr $t3, [$ctx,#$Doff+$hi]
458 str $t0, [$ctx,#$Coff+$lo]
460 str $t1, [$ctx,#$Coff+$hi]
462 str $t2, [$ctx,#$Doff+$lo]
464 str $t3, [$ctx,#$Doff+$hi]
466 ldr $Tlo,[sp,#$Foff+0]
467 ldr $Thi,[sp,#$Foff+4]
468 ldr $t0, [$ctx,#$Eoff+$lo]
469 ldr $t1, [$ctx,#$Eoff+$hi]
470 ldr $t2, [$ctx,#$Foff+$lo]
471 ldr $t3, [$ctx,#$Foff+$hi]
473 str $Elo,[$ctx,#$Eoff+$lo]
475 str $Ehi,[$ctx,#$Eoff+$hi]
477 str $t2, [$ctx,#$Foff+$lo]
479 str $t3, [$ctx,#$Foff+$hi]
481 ldr $Alo,[sp,#$Goff+0]
482 ldr $Ahi,[sp,#$Goff+4]
483 ldr $Tlo,[sp,#$Hoff+0]
484 ldr $Thi,[sp,#$Hoff+4]
485 ldr $t0, [$ctx,#$Goff+$lo]
486 ldr $t1, [$ctx,#$Goff+$hi]
487 ldr $t2, [$ctx,#$Hoff+$lo]
488 ldr $t3, [$ctx,#$Hoff+$hi]
490 str $t0, [$ctx,#$Goff+$lo]
492 str $t1, [$ctx,#$Goff+$hi]
494 str $t2, [$ctx,#$Hoff+$lo]
496 str $t3, [$ctx,#$Hoff+$hi]
504 add sp,sp,#8*9 @ destroy frame
506 ldmia sp!,{r4-r12,pc}
508 ldmia sp!,{r4-r12,lr}
510 moveq pc,lr @ be binary compatible with V4, yet
511 bx lr @ interoperable with Thumb ISA:-)
513 .size sha512_block_data_order,.-sha512_block_data_order
# SHA-512 rotation/shift amounts per FIPS 180-4:
# Sigma0/Sigma1 rotate amounts for the compression function,
# sigma0/sigma1 for the message schedule (last entry of each sigma
# is a plain right shift, not a rotate).
517 my @Sigma0=(28,34,39);
518 my @Sigma1=(14,18,41);
519 my @sigma0=(1, 8, 7);
520 my @sigma1=(19,61,6);
523 my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
# NEON register allocation: d0-d15 hold the 16-entry message schedule X,
# d16-d23 hold the working variables a..h.
525 my @X=map("d$_",(0..15));
526 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
530 my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
# d24-d31 are scratch registers for one round's intermediate values.
531 my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
533 $code.=<<___ if ($i<16 || $i&1);
534 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
536 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
538 vshr.u64 $t1,$e,#@Sigma1[1]
540 vadd.i64 $a,$Maj @ h+=Maj from the past
542 vshr.u64 $t2,$e,#@Sigma1[2]
545 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
546 vsli.64 $t0,$e,#`64-@Sigma1[0]`
547 vsli.64 $t1,$e,#`64-@Sigma1[1]`
549 vsli.64 $t2,$e,#`64-@Sigma1[2]`
550 #if $i<16 && defined(__ARMEL__)
551 vrev64.8 @X[$i],@X[$i]
554 vbsl $Ch,$f,$g @ Ch(e,f,g)
555 vshr.u64 $t0,$a,#@Sigma0[0]
556 veor $t2,$t1 @ Sigma1(e)
558 vshr.u64 $t1,$a,#@Sigma0[1]
559 vsli.64 $t0,$a,#`64-@Sigma0[0]`
561 vshr.u64 $t2,$a,#@Sigma0[2]
562 vadd.i64 $K,@X[$i%16]
563 vsli.64 $t1,$a,#`64-@Sigma0[1]`
565 vsli.64 $t2,$a,#`64-@Sigma0[2]`
568 vbsl $Maj,$c,$b @ Maj(a,b,c)
569 veor $h,$t2 @ Sigma0(a)
# Odd rounds are emitted by NEON_00_15 directly; the schedule update
# below is only generated on even rounds (it is 2x-vectorized).
579 if ($i&1) { &NEON_00_15($i,@_); return; }
581 # 2x-vectorized, therefore runs every 2nd round
582 my @X=map("q$_",(0..7)); # view @X as 128-bit vector
583 my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
584 my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
585 my $e=@_[4]; # $e from NEON_00_15
588 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
589 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
590 vadd.i64 @_[0],d30 @ h+=Maj from the past
591 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
592 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
593 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
594 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
596 vshr.u64 $t0,$s0,#@sigma0[0]
597 veor $s1,$t1 @ sigma1(X[i+14])
598 vshr.u64 $t1,$s0,#@sigma0[1]
599 vadd.i64 @X[$i%8],$s1
600 vshr.u64 $s1,$s0,#@sigma0[2]
601 vsli.64 $t0,$s0,#`64-@sigma0[0]`
602 vsli.64 $t1,$s0,#`64-@sigma0[1]`
603 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
605 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
606 vadd.i64 @X[$i%8],$s0
607 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
608 veor $s1,$t1 @ sigma0(X[i+1])
609 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
610 vadd.i64 @X[$i%8],$s1
# Finish the (even) round proper; the schedule for round 2*$i was
# interleaved above.
612 &NEON_00_15(2*$i,@_);
616 #if __ARM_MAX_ARCH__>=7
620 .global sha512_block_data_order_neon
621 .type sha512_block_data_order_neon,%function
623 sha512_block_data_order_neon:
625 dmb @ errata #451034 on early Cortex A8
626 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
629 vldmia $ctx,{$A-$H} @ load context
# Generate the 80 rounds: 16 plain rounds, then rounds with message
# expansion.  Rotating @V after each round renames a..h instead of
# emitting register moves.
632 for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
638 for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
642 vadd.i64 $A,d30 @ h+=Maj from the past
643 vldmia $ctx,{d24-d31} @ load context to temp
644 vadd.i64 q8,q12 @ vectorized accumulate
648 vstmia $ctx,{$A-$H} @ save context
650 sub $Ktbl,#640 @ rewind K512
655 .size sha512_block_data_order_neon,.-sha512_block_data_order_neon
660 .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
662 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
663 .comm OPENSSL_armcap_P,4,4
# Evaluate `...` expressions embedded in the assembly template.
667 $code =~ s/\`([^\`]*)\`/eval $1/gem;
# Encode "bx lr" as a literal opcode so pre-ARMv4t assemblers accept it,
# and lower "ret" to "bx lr".
668 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
669 $code =~ s/\bret\b/bx lr/gm;
# Turn leading '#' (cpp-style) into '@' (ARM comment) until real code starts.
674 last if (!s/^#/@/ and !/^$/);
# Explicit close so buffered-write errors are reported.
680 close STDOUT or die "error closing STDOUT"; # enforce flush