#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
# Just like aesv8-armx.pl this module supports both AArch32 and
# AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# November 2017
#
# Use the larger AArch64 register bank to "accommodate" 4x aggregated
# reduction and improve performance by 20-70% depending on processor.
#
# Current performance in cycles per processed byte:
#
#               64-bit PMULL    32-bit PMULL    32-bit NEON(*)
# Apple A7      0.58            0.92            5.62
# Cortex-A53    0.85            1.01            8.39
# Cortex-A57    0.73            1.17            7.61
# Denver        0.51            0.65            6.02
# Mongoose      0.65            1.10            8.06
# Kryo          0.76            1.16            8.00
#
# (*)   presented for reference/comparison purposes;

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$Xi="x0";       # argument block
$Htbl="x1";
$inp="x2";
$len="x3";

$inc="x12";

{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));

$code=<<___;
#include "arm_arch.h"

.text
___
$code.=".arch   armv8-a+crypto\n"       if ($flavour =~ /64/);
$code.=<<___                            if ($flavour !~ /64/);
.fpu    neon
.code   32
#undef  __thumb2__
___

################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input:        128-bit H - secret parameter E(K,0^128)
# output:       precomputed table filled with powers of twisted H;
#               H is twisted to handle the reverse bitness of GHASH;
#               only a few of the 16 slots of Htable[16] are used;
#               the data is opaque to the outside world (which allows the
#               code to be optimized independently);
#
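# For reference, the slots this module actually uses are laid out as
# follows; this merely restates the stores performed below and is not
# part of the calling convention, which treats the table as opaque:
#
#   Htable[0]       twisted H
#   Htable[1]       Karatsuba "lo^hi" halves of H and H^2, packed
#   Htable[2]       H^2
#   Htable[3]       H^3             (64-bit flavour only)
#   Htable[4]       Karatsuba "lo^hi" halves of H^3 and H^4, packed
#   Htable[5]       H^4             (64-bit flavour only)
#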
$code.=<<___;
.global gcm_init_v8
.type   gcm_init_v8,%function
.align  4
gcm_init_v8:
        vld1.64         {$t1},[x1]              @ load input H
        vmov.i8         $xC2,#0xe1
        vshl.i64        $xC2,$xC2,#57           @ 0xc2.0
        vext.8          $IN,$t1,$t1,#8
        vshr.u64        $t2,$xC2,#63
        vdup.32         $t1,${t1}[1]
        vext.8          $t0,$t2,$xC2,#8         @ t0=0xc2....01
        vshr.u64        $t2,$IN,#63
        vshr.s32        $t1,$t1,#31             @ broadcast carry bit
        vand            $t2,$t2,$t0
        vshl.i64        $IN,$IN,#1
        vext.8          $t2,$t2,$t2,#8
        vand            $t0,$t0,$t1
        vorr            $IN,$IN,$t2             @ H<<<=1
        veor            $H,$IN,$t0              @ twisted H
        vst1.64         {$H},[x0],#16           @ store Htable[0]

        @ calculate H^2
        vext.8          $t0,$H,$H,#8            @ Karatsuba pre-processing
        vpmull.p64      $Xl,$H,$H
        veor            $t0,$t0,$H
        vpmull2.p64     $Xh,$H,$H
        vpmull.p64      $Xm,$t0,$t0

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase

        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $H2,$Xl,$t2

        vext.8          $t1,$H2,$H2,#8          @ Karatsuba pre-processing
        veor            $t1,$t1,$H2
        vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
        vst1.64         {$Hhl-$H2},[x0],#32     @ store Htable[1..2]
___
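
# The multiplications above use Karatsuba: a 128x128-bit carry-less product
# is assembled from three PMULLs instead of four, at the cost of a few XORs
# of pre/post-processing.  A minimal sketch of the identity on scaled-down
# 16-bit operands (clmul and karatsuba16 are illustrative helpers only and
# are never called by the generated code):
#
#   (a1*X + a0)*(b1*X + b0) = a1*b1*X^2
#                           + ((a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0)*X
#                           + a0*b0
#
sub clmul {                             # carry-less multiply in GF(2)[x]
        my ($a,$b) = @_;
        my $r = 0;
        for my $i (0..15) {
                $r ^= $a<<$i if (($b>>$i)&1);
        }
        return $r;
}

sub karatsuba16 {                       # 16x16-bit product via three 8x8 clmuls
        my ($a,$b) = @_;
        my ($a1,$a0) = ($a>>8, $a&0xff);
        my ($b1,$b0) = ($b>>8, $b&0xff);
        my $hi  = clmul($a1,$b1);
        my $lo  = clmul($a0,$b0);
        my $mid = clmul($a1^$a0,$b1^$b0) ^ $hi ^ $lo;
        return ($hi<<16) ^ ($mid<<8) ^ $lo;     # equals clmul($a,$b)
}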
if ($flavour =~ /64/) {
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));

$code.=<<___;
        @ calculate H^3 and H^4
        vpmull.p64      $Xl,$H, $H2
         vpmull.p64     $Yl,$H2,$H2
        vpmull2.p64     $Xh,$H, $H2
         vpmull2.p64    $Yh,$H2,$H2
        vpmull.p64      $Xm,$t0,$t1
         vpmull.p64     $Ym,$t1,$t1

        vext.8          $t0,$Xl,$Xh,#8          @ Karatsuba post-processing
         vext.8         $t1,$Yl,$Yh,#8
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t0
         veor           $t3,$Yl,$Yh
         veor           $Ym,$Ym,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase
         veor           $Ym,$Ym,$t3
         vpmull.p64     $t3,$Yl,$xC2

        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
         vmov           $Yh#lo,$Ym#hi
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
         vmov           $Ym#hi,$Yl#lo
        veor            $Xl,$Xm,$t2
         veor           $Yl,$Ym,$t3

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase
         vext.8         $t3,$Yl,$Yl,#8
        vpmull.p64      $Xl,$Xl,$xC2
         vpmull.p64     $Yl,$Yl,$xC2
        veor            $t2,$t2,$Xh
         veor           $t3,$t3,$Yh
        veor            $H, $Xl,$t2             @ H^3
         veor           $H2,$Yl,$t3             @ H^4

        vext.8          $t0,$H, $H,#8           @ Karatsuba pre-processing
         vext.8         $t1,$H2,$H2,#8
        veor            $t0,$t0,$H
         veor           $t1,$t1,$H2
        vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
        vst1.64         {$H-$H2},[x0]           @ store Htable[3..5]
___
}
$code.=<<___;
        ret
.size   gcm_init_v8,.-gcm_init_v8
___
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input:        Xi - current hash value;
#               Htable - table precomputed in gcm_init_v8;
# output:       Xi - next hash value;
#
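# For cross-checking, a bit-level reference of the multiplication this
# routine performs, written in the GCM specification's bit order
# (NIST SP 800-38D, Algorithm 1).  It operates on Math::BigInt values
# holding the 128-bit block contents as big-endian integers; it is a
# sketch for testing only and is never called by the generated code.
#
use Math::BigInt;

my $gf128_r = Math::BigInt->from_hex("e1")->blsft(120); # x^128 = x^7+x^2+x+1

sub gf128_mul {
        my ($x,$y) = @_;                # 0 <= $x,$y < 2^128
        my $v = $y->copy();
        my $z = Math::BigInt->bzero();
        for my $i (0..127) {
                # bit 0 of a block is the most significant bit of the integer
                $z->bxor($v) if ($x->copy()->brsft(127-$i)->band(1)->is_one());
                my $lsb = $v->copy()->band(1)->is_one();
                $v->brsft(1);
                $v->bxor($gf128_r) if ($lsb);
        }
        return $z;
}
# e.g. gf128_mul($a, Math::BigInt->from_hex("8"."0"x31)) equals $a, since
# 0x80..0 encodes the polynomial "1" in GCM's reflected bit order.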
$code.=<<___;
.global gcm_gmult_v8
.type   gcm_gmult_v8,%function
.align  4
gcm_gmult_v8:
        vld1.64         {$t1},[$Xi]             @ load Xi
        vmov.i8         $xC2,#0xe1
        vld1.64         {$H-$Hhl},[$Htbl]       @ load twisted H, ...
        vshl.u64        $xC2,$xC2,#57
#ifndef __ARMEB__
        vrev64.8        $t1,$t1
#endif
        vext.8          $IN,$t1,$t1,#8

        vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
        veor            $t1,$t1,$IN             @ Karatsuba pre-processing
        vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
        vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction

        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2

#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
#endif
        vext.8          $Xl,$Xl,$Xl,#8
        vst1.64         {$Xl},[$Xi]             @ write out Xi

        ret
.size   gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input:        table precomputed in gcm_init_v8;
#               current hash value Xi;
#               pointer to input data;
#               length of input data in bytes, which must be divisible
#               by the block size;
# output:       next hash value Xi;
#
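# A minimal reference for the whole routine, building on the gf128_mul
# sketch above (again for cross-checking only, never called here): fold
# each 16-byte block into Xi and multiply by H.  The assembly below
# computes the same result with the multiplications aggregated 2x
# (or 4x on 64-bit).
#
sub ghash_ref {
        my ($xi,$h,@blocks) = @_;       # Math::BigInt values, one per block
        for my $blk (@blocks) {
                $xi = gf128_mul($xi->copy()->bxor($blk), $h); # Xi=(Xi^I[i])*H
        }
        return $xi;
}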
$code.=<<___;
.global gcm_ghash_v8
.type   gcm_ghash_v8,%function
.align  4
gcm_ghash_v8:
___
$code.=<<___    if ($flavour =~ /64/);
        bic             $inc,$len,#63
        cmp             $len,$inc
        b.eq            .Lgcm_ghash_v8_4x
___
$code.=<<___            if ($flavour !~ /64/);
        vstmdb          sp!,{d8-d15}            @ 32-bit ABI says so
___
$code.=<<___;
        vld1.64         {$Xl},[$Xi]             @ load [rotated] Xi
                                                @ "[rotated]" means that
                                                @ the loaded value has to
                                                @ be rotated in order to
                                                @ make it appear as in the
                                                @ algorithm specification
        subs            $len,$len,#32           @ see if $len is 32 or larger
        mov             $inc,#16                @ $inc is used as post-
                                                @ increment for input pointer;
                                                @ as the loop is modulo-scheduled
                                                @ $inc is zeroed just in time
                                                @ to preclude overstepping
                                                @ inp[len], which means that the
                                                @ last block[s] are actually
                                                @ loaded twice, but the last
                                                @ copy is not processed
        vld1.64         {$H-$Hhl},[$Htbl],#32   @ load twisted H, ..., H^2
        vmov.i8         $xC2,#0xe1
        vld1.64         {$H2},[$Htbl]
        cclr            $inc,eq                 @ is it time to zero $inc?
        vext.8          $Xl,$Xl,$Xl,#8          @ rotate Xi
        vld1.64         {$t0},[$inp],#16        @ load [rotated] I[0]
        vshl.u64        $xC2,$xC2,#57           @ compose 0xc2.0 constant
#ifndef __ARMEB__
        vrev64.8        $t0,$t0
        vrev64.8        $Xl,$Xl
#endif
        vext.8          $IN,$t0,$t0,#8          @ rotate I[0]
        b.lo            .Lodd_tail_v8           @ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
        #######
        # Xi+2 = [H*(Ii+1 + Xi+1)] mod P =
        #        [(H*Ii+1) + (H*Xi+1)] mod P =
        #        [(H*Ii+1) + H^2*(Ii+Xi)] mod P
        #
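        #
        # The same recurrence with both multiplications aggregated, as in
        # the PMULL code below (a single reduction per two blocks); a scalar
        # sketch in terms of the gf128_mul reference above, for
        # cross-checking only and never called by the generated code:
sub ghash_2x_ref {
        my ($xi,$h,$i0,$i1) = @_;               # Math::BigInt values
        my $h2 = gf128_mul($h,$h);
        return gf128_mul($xi->copy()->bxor($i0), $h2)   # H^2*(Ii+Xi)
               ->bxor(gf128_mul($i1->copy(), $h));      # ^ H*Ii+1
}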
$code.=<<___;
        vld1.64         {$t1},[$inp],$inc       @ load [rotated] I[1]
#ifndef __ARMEB__
        vrev64.8        $t1,$t1
#endif
        vext.8          $In,$t1,$t1,#8
        veor            $IN,$IN,$Xl             @ I[i]^=Xi
        vpmull.p64      $Xln,$H,$In             @ H·Ii+1
        veor            $t1,$t1,$In             @ Karatsuba pre-processing
        vpmull2.p64     $Xhn,$H,$In
        b               .Loop_mod2x_v8

.align  4
.Loop_mod2x_v8:
        vext.8          $t2,$IN,$IN,#8
        subs            $len,$len,#32           @ is there more data?
        vpmull.p64      $Xl,$H2,$IN             @ H^2.lo·Xi.lo
        cclr            $inc,lo                 @ is it time to zero $inc?

         vpmull.p64     $Xmn,$Hhl,$t1
        veor            $t2,$t2,$IN             @ Karatsuba pre-processing
        vpmull2.p64     $Xh,$H2,$IN             @ H^2.hi·Xi.hi
        veor            $Xl,$Xl,$Xln            @ accumulate
        vpmull2.p64     $Xm,$Hhl,$t2            @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
         vld1.64        {$t0},[$inp],$inc       @ load [rotated] I[i+2]

        veor            $Xh,$Xh,$Xhn
         cclr           $inc,eq                 @ is it time to zero $inc?
        veor            $Xm,$Xm,$Xmn

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
         vld1.64        {$t1},[$inp],$inc       @ load [rotated] I[i+3]
#ifndef __ARMEB__
         vrev64.8       $t0,$t0
#endif
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction

#ifndef __ARMEB__
         vrev64.8       $t1,$t1
#endif
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
         vext.8         $In,$t1,$t1,#8
         vext.8         $IN,$t0,$t0,#8
        veor            $Xl,$Xm,$t2
         vpmull.p64     $Xln,$H,$In             @ H·Ii+1
        veor            $IN,$IN,$Xh             @ accumulate $IN early

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $IN,$IN,$t2
         veor           $t1,$t1,$In             @ Karatsuba pre-processing
        veor            $IN,$IN,$Xl
         vpmull2.p64    $Xhn,$H,$In
        b.hs            .Loop_mod2x_v8          @ there were at least 32 more bytes

        veor            $Xh,$Xh,$t2
        vext.8          $IN,$t0,$t0,#8          @ re-construct $IN
        adds            $len,$len,#32           @ re-construct $len
        veor            $Xl,$Xl,$Xh             @ re-construct $Xl
        b.eq            .Ldone_v8               @ is $len zero?
___
}
$code.=<<___;
.Lodd_tail_v8:
        vext.8          $t2,$Xl,$Xl,#8
        veor            $IN,$IN,$Xl             @ inp^=Xi
        veor            $t1,$t0,$t2             @ $t1 is rotated inp^Xi

        vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
        veor            $t1,$t1,$IN             @ Karatsuba pre-processing
        vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
        vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction

        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2

.Ldone_v8:
#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
#endif
        vext.8          $Xl,$Xl,$Xl,#8
        vst1.64         {$Xl},[$Xi]             @ write out Xi

___
$code.=<<___            if ($flavour !~ /64/);
        vldmia          sp!,{d8-d15}            @ 32-bit ABI says so
___
$code.=<<___;
        ret
.size   gcm_ghash_v8,.-gcm_ghash_v8
___

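# The 4x loop below extends the same aggregation one step further, folding
# four input blocks per iteration with a single reduction:
#
#   Xi+4 = [H*Ii+3 + H^2*Ii+2 + H^3*Ii+1 + H^4*(Ii+Xi)] mod P
#
# A scalar sketch in terms of the gf128_mul reference above (cross-checking
# only, never called by the generated code):
sub ghash_4x_ref {
        my ($xi,$h,$i0,$i1,$i2,$i3) = @_;       # Math::BigInt values
        my $h2 = gf128_mul($h,$h);
        my $h3 = gf128_mul($h2,$h);
        my $h4 = gf128_mul($h2,$h2);
        return gf128_mul($xi->copy()->bxor($i0), $h4)   # H^4*(Ii+Xi)
               ->bxor(gf128_mul($i1->copy(), $h3))      # ^ H^3*Ii+1
               ->bxor(gf128_mul($i2->copy(), $h2))      # ^ H^2*Ii+2
               ->bxor(gf128_mul($i3->copy(), $h));      # ^ H*Ii+3
}
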
if ($flavour =~ /64/) {                         # 4x subroutine
my ($I0,$j1,$j2,$j3,
    $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));

$code.=<<___;
.type   gcm_ghash_v8_4x,%function
.align  4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
        vld1.64         {$Xl},[$Xi]             @ load [rotated] Xi
        vld1.64         {$H-$H2},[$Htbl],#48    @ load twisted H, ..., H^2
        vmov.i8         $xC2,#0xe1
        vld1.64         {$H3-$H4},[$Htbl]       @ load twisted H^3, ..., H^4
        vshl.u64        $xC2,$xC2,#57           @ compose 0xc2.0 constant
#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
#endif

        vld1.64         {$I0-$j3},[$inp],#64
#ifndef __ARMEB__
        vrev64.8        $j1,$j1
        vrev64.8        $j2,$j2
        vrev64.8        $j3,$j3
        vrev64.8        $I0,$I0
#endif
        vext.8          $I3,$j3,$j3,#8
        vext.8          $I2,$j2,$j2,#8
        vext.8          $I1,$j1,$j1,#8

        vpmull.p64      $Yl,$H,$I3              @ H·Ii+3
        veor            $j3,$j3,$I3
        vpmull2.p64     $Yh,$H,$I3
        vpmull.p64      $Ym,$Hhl,$j3

        vpmull.p64      $t0,$H2,$I2             @ H^2·Ii+2
        veor            $j2,$j2,$I2
        vpmull2.p64     $I2,$H2,$I2
        vpmull2.p64     $j2,$Hhl,$j2

        veor            $Yl,$Yl,$t0
        veor            $Yh,$Yh,$I2
        veor            $Ym,$Ym,$j2

        vpmull.p64      $j3,$H3,$I1             @ H^3·Ii+1
        veor            $j1,$j1,$I1
        vpmull2.p64     $I1,$H3,$I1
        vpmull.p64      $j1,$H34,$j1

        veor            $Yl,$Yl,$j3
        veor            $Yh,$Yh,$I1
        veor            $Ym,$Ym,$j1

        subs            $len,$len,#64
        b.eq            .Ltail4x

        b               .Loop4x

.align  4
.Loop4x:
        veor            $t0,$I0,$Xl
         vld1.64        {$I0-$j3},[$inp],#64
        vext.8          $IN,$t0,$t0,#8
#ifndef __ARMEB__
         vrev64.8       $j1,$j1
         vrev64.8       $j2,$j2
         vrev64.8       $j3,$j3
         vrev64.8       $I0,$I0
#endif

        vpmull.p64      $Xl,$H4,$IN             @ H^4·(Xi+Ii)
        veor            $t0,$t0,$IN
        vpmull2.p64     $Xh,$H4,$IN
         vext.8         $I3,$j3,$j3,#8
        vpmull2.p64     $Xm,$H34,$t0

        veor            $Xl,$Xl,$Yl
        veor            $Xh,$Xh,$Yh
         vext.8         $I2,$j2,$j2,#8
        veor            $Xm,$Xm,$Ym
         vext.8         $I1,$j1,$j1,#8

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
         vpmull.p64     $Yl,$H,$I3              @ H·Ii+3
         veor           $j3,$j3,$I3
        veor            $Xm,$Xm,$t1
         vpmull2.p64    $Yh,$H,$I3
        veor            $Xm,$Xm,$t2
         vpmull.p64     $Ym,$Hhl,$j3

        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
         vpmull.p64     $t0,$H2,$I2             @ H^2·Ii+2
         veor           $j2,$j2,$I2
         vpmull2.p64    $I2,$H2,$I2
        veor            $Xl,$Xm,$t2
         vpmull2.p64    $j2,$Hhl,$j2

         veor           $Yl,$Yl,$t0
         veor           $Yh,$Yh,$I2
         veor           $Ym,$Ym,$j2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
         vpmull.p64     $j3,$H3,$I1             @ H^3·Ii+1
         veor           $j1,$j1,$I1
        veor            $t2,$t2,$Xh
         vpmull2.p64    $I1,$H3,$I1
         vpmull.p64     $j1,$H34,$j1

        veor            $Xl,$Xl,$t2
         veor           $Yl,$Yl,$j3
         veor           $Yh,$Yh,$I1
        vext.8          $Xl,$Xl,$Xl,#8
         veor           $Ym,$Ym,$j1

        subs            $len,$len,#64
        b.ne            .Loop4x

.Ltail4x:
        veor            $t0,$I0,$Xl
        vext.8          $IN,$t0,$t0,#8

        vpmull.p64      $Xl,$H4,$IN             @ H^4·(Xi+Ii)
        veor            $t0,$t0,$IN
        vpmull2.p64     $Xh,$H4,$IN
        vpmull2.p64     $Xm,$H34,$t0

        veor            $Xl,$Xl,$Yl
        veor            $Xh,$Xh,$Yh
        veor            $Xm,$Xm,$Ym

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2

        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2
        vext.8          $Xl,$Xl,$Xl,#8

#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
#endif
        vst1.64         {$Xl},[$Xi]             @ write out Xi

        ret
.size   gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
___

}
}

$code.=<<___;
.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

if ($flavour =~ /64/) {                 ######## 64-bit code
    sub unvmov {
        my $arg=shift;

        $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
        sprintf "ins    v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
                                             $3<8?$3:$3+8,($4 eq "lo")?0:1;
    }
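    # The substitutions below mechanically convert the legacy 32-bit NEON
    # syntax used above into AArch64 mnemonics; for instance
    #   "vpmull.p64  q0,q12,q3"  becomes  "pmull v0.1q,v20.1d,v3.1d"
    # and "cclr x12,eq" becomes "csel x12,xzr,x12,eq".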
    foreach(split("\n",$code)) {
        s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o     or
        s/vmov\.i8/movi/o               or      # fix up legacy mnemonics
        s/vmov\s+(.*)/unvmov($1)/geo    or
        s/vext\.8/ext/o                 or
        s/vshr\.s/sshr\.s/o             or
        s/vshr/ushr/o                   or
        s/^(\s+)v/$1/o                  or      # strip off v prefix
        s/\bbx\s+lr\b/ret/o;

        s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
        s/@\s/\/\//o;                           # old->new style commentary

        # fix up remaining legacy suffixes
        s/\.[ui]?8(\s)/$1/o;
        s/\.[uis]?32//o and s/\.16b/\.4s/go;
        m/\.p64/o and s/\.16b/\.1q/o;           # 1st pmull argument
        m/l\.p64/o and s/\.16b/\.1d/go;         # 2nd and 3rd pmull arguments
        s/\.[uisp]?64//o and s/\.16b/\.2d/go;
        s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

        print $_,"\n";
    }
} else {                                ######## 32-bit code
    sub unvdup32 {
        my $arg=shift;

        $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
        sprintf "vdup.32        q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
    sub unvpmullp64 {
        my ($mnemonic,$arg)=@_;

        if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
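            # hand-assemble the 32-bit VMULL.P64 opcode: spread the three
            # q-register numbers across the Vd/Vn/Vm fields (the low three
            # bits and the high bit of each are encoded separately), and
            # bump Vn/Vm to the odd d-registers for the "2" (high-half) form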
            my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
                                 |(($2&7)<<17)|(($2&8)<<4)
                                 |(($3&7)<<1) |(($3&8)<<2);
            $word |= 0x00010001  if ($mnemonic =~ "2");
            # ARMv7 instructions are always encoded little-endian, hence the
            # explicit byte order below; the correct solution is the .inst
            # directive, but older assemblers don't implement it:-(
            sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
        }
    }

    foreach(split("\n",$code)) {
        s/\b[wx]([0-9]+)\b/r$1/go;              # new->old registers
        s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
        s/\/\/\s?/@ /o;                         # new->old style commentary

        # fix up remaining new-style suffixes
        s/\],#[0-9]+/]!/o;

        s/cclr\s+([^,]+),\s*([a-z]+)/mov$2      $1,#0/o                 or
        s/vdup\.32\s+(.*)/unvdup32($1)/geo                              or
        s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo                or
        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
        s/^(\s+)b\./$1b/o                                               or
        s/^(\s+)ret/$1bx\tlr/o;

        print $_,"\n";
    }
}

close STDOUT; # enforce flush