# RT2210: Add missing EVP_cleanup to example
# [oweals/openssl.git] / crypto / aes / asm / aesv8-armx.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # This module implements support for ARMv8 AES instructions. The
11 # module is endian-agnostic in sense that it supports both big- and
12 # little-endian cases. As does it support both 32- and 64-bit modes
13 # of operation. Latter is achieved by limiting amount of utilized
14 # registers to 16, which implies additional NEON load and integer
15 # instructions. This has no effect on mighty Apple A7, where results
16 # are literally equal to the theoretical estimates based on AES
17 # instruction latencies and issue rates. On Cortex-A53, an in-order
18 # execution core, this costs up to 10-15%, which is partially
19 # compensated by implementing dedicated code path for 128-bit
20 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
21 # seems to be limited by sheer amount of NEON instructions...
22 #
23 # Performance in cycles per byte processed with 128-bit key:
24 #
25 #               CBC enc         CBC dec         CTR
26 # Apple A7      2.39            1.20            1.20
27 # Cortex-A53    2.45            1.87            1.94
28 # Cortex-A57    3.64            1.34            1.32
29
# First argument selects the flavour (32- vs 64-bit output), second names
# the output file. STDOUT is re-opened onto that file so the transliteration
# loops at the bottom of this script can simply print the generated assembly.
$flavour = shift;
my $output = shift;
# 3-arg open with error check: the original 2-arg 'open STDOUT,">".shift'
# could silently fail (or misparse a hostile filename as a mode).
open STDOUT, '>', $output or die "can't open $output: $!";

$prefix="aes_v8";

$code=<<___;
#include "arm_arch.h"

#if __ARM_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
$code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
43
44 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
45 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
46 # maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
48 #
{{{
# Key-schedule generation: ${prefix}_set_encrypt_key and
# ${prefix}_set_decrypt_key. Register aliases below are chosen so the same
# template serves both flavours; the 32-bit flavour avoids q4-q7 to spare
# callee-saved D registers.
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# Round-constant table, the rotate-n-splat byte-permutation mask, and the
# whole set_encrypt_key body (128/192/256 expansion paths) in one template.
# Returns 0 on success, -1 for NULL arguments, -2 for an unsupported/
# misaligned bit count.
$code.=<<___;
.align	5
rcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
# 64-bit flavour maintains a frame record per the AArch64 PCS.
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,rcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
# set_decrypt_key expands via .Lenc_key, then walks the schedule from both
# ends, swapping round keys and applying aesimc (InvMixColumns) to the
# inner ones for the equivalent-inverse-cipher form.
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# Single-block ECB primitives: ${prefix}_encrypt and ${prefix}_decrypt.
# $dir is "en" or "de"; it selects the aese/aesmc vs aesd/aesimc mnemonic
# suffixes interpolated into the shared template below.
#
# Note: the original declared this as 'sub gen_block ()' — a zero-argument
# prototype — and only worked because the call sites used the
# prototype-bypassing '&' sigil. The prototype is dropped and the calls use
# normal syntax; emitted assembly is unchanged.
sub gen_block {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	vld1.32	{$rndkey0},[$key],#16
	aes$mc	$inout,$inout
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	vld1.32	{$rndkey1},[$key],#16
	aes$mc	$inout,$inout
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	vld1.32	{$rndkey0},[$key]
	aes$mc	$inout,$inout
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
gen_block("en");
gen_block("de");
}}}
{{{
# CBC-mode bulk routine: ${prefix}_cbc_encrypt(in,out,len,key,ivec,enc).
# $rounds deliberately aliases $enc (w5): the enc/dec flag is consumed
# before w5 is reused as the round counter.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
# Common prologue plus the encrypt path (serial by nature); a dedicated
# .Lcbc_enc128 path exists because 128-bit CBC encrypt is the hot case.
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

.Loop_cbc_enc:
	aese	$dat,q8
	vld1.32	{q8},[$key_],#16
	aesmc	$dat,$dat
	subs	$cnt,$cnt,#2
	aese	$dat,q9
	vld1.32	{q9},[$key_],#16
	aesmc	$dat,$dat
	b.gt	.Loop_cbc_enc

	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	 add	$key_,$key,#16
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15

	 mov	$cnt,$rounds
	veor	$ivec,$dat,$rndlast
	vst1.8	{$ivec},[$out],#16
	b.hs	.Loop_cbc_enc

	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
# Decrypt path: parallelizable, so three blocks are processed per
# iteration; the extra registers are scoped to this bare block.
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesd	$dat1,q8
	aesd	$dat2,q8
	vld1.32	{q8},[$key_],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesd	$dat1,q9
	aesd	$dat2,q9
	vld1.32	{q9},[$key_],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesd	$dat1,q8
	aesd	$dat2,q8
	 veor	$tmp0,$ivec,$rndlast
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 veor	$tmp1,$in0,$rndlast
	aesd	$dat0,q9
	aesd	$dat1,q9
	aesd	$dat2,q9
	 veor	$tmp2,$in1,$rndlast
	 subs	$len,$len,#0x30
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 vorr	$ivec,$in2,$in2
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q12
	aesd	$dat1,q12
	aesd	$dat2,q12
	 add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 mov	$key_,$key
	aesd	$dat0,q13
	aesd	$dat1,q13
	aesd	$dat2,q13
	 vld1.8	{$in0},[$inp],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesd	$dat1,q14
	aesd	$dat2,q14
	 vld1.8	{$in2},[$inp],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15

	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesd	$dat2,q8
	vld1.32	{q8},[$key_],#16
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesd	$dat2,q9
	vld1.32	{q9},[$key_],#16
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesd	$dat2,q8
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesd	$dat2,q9
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesd	$dat2,q12
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesd	$dat2,q13
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesd	$dat2,q14
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
# CTR mode: ${prefix}_ctr32_encrypt_blocks(in,out,blocks,key,ivec).
# Like CBC decrypt, CTR is parallelizable, so the main loop handles three
# blocks per iteration with a 1-2 block tail path.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
# Only the low 32 bits of the counter (last IV word) are incremented,
# byte-reversed on little-endian via rev so arithmetic is natural.
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	vorr		$dat1,$dat0,$dat0
	add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	add		$ctr, $ctr, #2
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${dat1}[3],$tctr1
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aese		$dat1,q8
	aese		$dat2,q8
	vld1.32		{q8},[$key_],#16
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aesmc		$dat2,$dat2
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aese		$dat1,q9
	aese		$dat2,q9
	vld1.32		{q9},[$key_],#16
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aesmc		$dat2,$dat2
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aese		$dat1,q8
	aese		$dat2,q8
	 mov		$key_,$key
	aesmc		$tmp0,$dat0
	 vld1.8		{$in0},[$inp],#16
	aesmc		$tmp1,$dat1
	aesmc		$dat2,$dat2
	 vorr		$dat0,$ivec,$ivec
	aese		$tmp0,q9
	 vld1.8		{$in1},[$inp],#16
	aese		$tmp1,q9
	aese		$dat2,q9
	 vorr		$dat1,$ivec,$ivec
	aesmc		$tmp0,$tmp0
	 vld1.8		{$in2},[$inp],#16
	aesmc		$tmp1,$tmp1
	aesmc		$tmp2,$dat2
	 vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
	aese		$tmp0,q12
	aese		$tmp1,q12
	aese		$tmp2,q12
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aesmc		$tmp0,$tmp0
	aesmc		$tmp1,$tmp1
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aese		$tmp1,q13
	aese		$tmp2,q13
	 veor		$in2,$in2,$rndlast
	 rev		$tctr0,$tctr0
	aesmc		$tmp0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	aesmc		$tmp1,$tmp1
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat0}[3], $tctr0
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aese		$tmp1,q14
	aese		$tmp2,q14
	 vmov.32	${dat1}[3], $tctr1
	 rev		$tctr2,$ctr
	aesmc		$tmp0,$tmp0
	aesmc		$tmp1,$tmp1
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat2}[3], $tctr2
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	 mov		$cnt,$rounds
	veor		$in0,$in0,$tmp0
	veor		$in1,$in1,$tmp1
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in0},[$out],#16
	vst1.8		{$in1},[$out],#16
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aese		$dat1,q8
	vld1.32		{q8},[$key_],#16
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aese		$dat1,q9
	vld1.32		{q9},[$key_],#16
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aese		$dat1,q8
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aese		$dat1,q9
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aese		$dat1,q12
	 vld1.8		{$in1},[$inp]
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aese		$dat0,q13
	aese		$dat1,q13
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aese		$dat0,q14
	aese		$dat1,q14
	 veor		$in0,$in0,$rndlast
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
# Post-process $code, transliterating the mixed 32/64-bit template into
# genuine syntax for the selected flavour, and print it to STDOUT.
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    # Encode an AES instruction as a raw .inst word for assemblers that
    # lack crypto-extension support (currently unused, kept for reference).
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    # Emit AES instructions as raw .byte sequences, little-endian,
    # for pre-crypto-aware assemblers.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    # Split a q-register vtbl into the pair of d-register vtbl.8 ops
    # that 32-bit NEON requires.
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    # Map a q-lane vdup.32 onto the equivalent d-register lane form.
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    # Map a q-lane vmov.32 onto the equivalent d-register lane form.
    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

# STDOUT is a buffered write handle onto the output file: check close so
# a buffered-write failure (e.g. full disk) cannot silently truncate the
# generated .S file.
close STDOUT or die "error closing STDOUT: $!";