crypto/chacha/asm/chacha-s390x.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 #
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16 #
  17 # December 2015
  18 #
  19 # ChaCha20 for s390x.
  20 #
  21 # 3 times faster than compiler-generated code.
  22
  23 #
  24 # August 2018
  25 #
  26 # Add vx code path: 4x"vertical".
  27 #
  28 # Copyright IBM Corp. 2018
  29 # Author: Patrick Steuer <patrick.steuer@de.ibm.com>
  30
  31 #
  32 # February 2019
  33 #
  34 # Add 6x"horizontal" VX implementation. It's ~25% faster than IBM's
  35 # 4x"vertical" submission [on z13] and >3 faster than scalar code.
  36 # But to harness overheads revert to transliteration of VSX code path
  37 # from chacha-ppc module, which is also 4x"vertical", to handle inputs
  38 # not longer than 256 bytes.
  39
  40 use strict;
  41 use FindBin qw($Bin);
  42 use lib "$Bin/../..";
  43 use perlasm::s390x qw(:DEFAULT :VX :LD AUTOLOAD LABEL INCLUDE);
  44
  45 # $output is the last argument if it looks like a file (it has an extension)
  46 # $flavour is the first argument if it doesn't look like a file
  47 my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  48 my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  49
  50 my ($z,$SIZE_T);
  51 if ($flavour =~ /3[12]/) {
  52         $z=0;   # S/390 ABI
  53         $SIZE_T=4;
  54 } else {
  55         $z=1;   # zSeries ABI
  56         $SIZE_T=8;
  57 }
  58
  59 my $sp="%r15";
  60 my $stdframe=16*$SIZE_T+4*8;
  61
  62 sub ROUND {
  63 my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
  64 my @t=map("%r$_",(8,9));
  65 my ($a0,$b0,$c0,$d0)=@_;
  66 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
  67 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
  68 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
  69 my ($xc,$xc_)=map("$_",@t);
  70
  71         # Consider order in which variables are addressed by their
  72         # index:
  73         #
  74         #       a   b   c   d
  75         #
  76         #       0   4   8  12 < even round
  77         #       1   5   9  13
  78         #       2   6  10  14
  79         #       3   7  11  15
  80         #       0   5  10  15 < odd round
  81         #       1   6  11  12
  82         #       2   7   8  13
  83         #       3   4   9  14
  84         #
  85         # 'a', 'b' and 'd's are permanently allocated in registers,
  86         # @x[0..7,12..15], while 'c's are maintained in memory. If
  87         # you observe 'c' column, you'll notice that pair of 'c's is
  88         # invariant between rounds. This means that we have to reload
  89         # them once per round, in the middle. This is why you'll see
  90         # 'c' stores and loads in the middle, but none in the beginning
  91         # or end.
  92
  93         alr     (@x[$a0],@x[$b0]);      # Q1
  94          alr    (@x[$a1],@x[$b1]);      # Q2
  95         xr      (@x[$d0],@x[$a0]);
  96          xr     (@x[$d1],@x[$a1]);
  97         rll     (@x[$d0],@x[$d0],16);
  98          rll    (@x[$d1],@x[$d1],16);
  99
 100         alr     ($xc,@x[$d0]);
 101          alr    ($xc_,@x[$d1]);
 102         xr      (@x[$b0],$xc);
 103          xr     (@x[$b1],$xc_);
 104         rll     (@x[$b0],@x[$b0],12);
 105          rll    (@x[$b1],@x[$b1],12);
 106
 107         alr     (@x[$a0],@x[$b0]);
 108          alr    (@x[$a1],@x[$b1]);
 109         xr      (@x[$d0],@x[$a0]);
 110          xr     (@x[$d1],@x[$a1]);
 111         rll     (@x[$d0],@x[$d0],8);
 112          rll    (@x[$d1],@x[$d1],8);
 113
 114         alr     ($xc,@x[$d0]);
 115          alr    ($xc_,@x[$d1]);
 116         xr      (@x[$b0],$xc);
 117          xr     (@x[$b1],$xc_);
 118         rll     (@x[$b0],@x[$b0],7);
 119          rll    (@x[$b1],@x[$b1],7);
 120
 121         stm     ($xc,$xc_,"$stdframe+4*8+4*$c0($sp)");  # reload pair of 'c's
 122         lm      ($xc,$xc_,"$stdframe+4*8+4*$c2($sp)");
 123
 124         alr     (@x[$a2],@x[$b2]);      # Q3
 125          alr    (@x[$a3],@x[$b3]);      # Q4
 126         xr      (@x[$d2],@x[$a2]);
 127          xr     (@x[$d3],@x[$a3]);
 128         rll     (@x[$d2],@x[$d2],16);
 129          rll    (@x[$d3],@x[$d3],16);
 130
 131         alr     ($xc,@x[$d2]);
 132          alr    ($xc_,@x[$d3]);
 133         xr      (@x[$b2],$xc);
 134          xr     (@x[$b3],$xc_);
 135         rll     (@x[$b2],@x[$b2],12);
 136          rll    (@x[$b3],@x[$b3],12);
 137
 138         alr     (@x[$a2],@x[$b2]);
 139          alr    (@x[$a3],@x[$b3]);
 140         xr      (@x[$d2],@x[$a2]);
 141          xr     (@x[$d3],@x[$a3]);
 142         rll     (@x[$d2],@x[$d2],8);
 143          rll    (@x[$d3],@x[$d3],8);
 144
 145         alr     ($xc,@x[$d2]);
 146          alr    ($xc_,@x[$d3]);
 147         xr      (@x[$b2],$xc);
 148          xr     (@x[$b3],$xc_);
 149         rll     (@x[$b2],@x[$b2],7);
 150          rll    (@x[$b3],@x[$b3],7);
 151 }
 152
 153 sub VX_lane_ROUND {
 154 my ($a0,$b0,$c0,$d0)=@_;
 155 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
 156 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
 157 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
 158 my @x=map("%v$_",(0..15));
 159
 160         vaf     (@x[$a0],@x[$a0],@x[$b0]);      # Q1
 161         vx      (@x[$d0],@x[$d0],@x[$a0]);
 162         verllf  (@x[$d0],@x[$d0],16);
 163         vaf     (@x[$a1],@x[$a1],@x[$b1]);      # Q2
 164         vx      (@x[$d1],@x[$d1],@x[$a1]);
 165         verllf  (@x[$d1],@x[$d1],16);
 166         vaf     (@x[$a2],@x[$a2],@x[$b2]);      # Q3
 167         vx      (@x[$d2],@x[$d2],@x[$a2]);
 168         verllf  (@x[$d2],@x[$d2],16);
 169         vaf     (@x[$a3],@x[$a3],@x[$b3]);      # Q4
 170         vx      (@x[$d3],@x[$d3],@x[$a3]);
 171         verllf  (@x[$d3],@x[$d3],16);
 172
 173         vaf     (@x[$c0],@x[$c0],@x[$d0]);
 174         vx      (@x[$b0],@x[$b0],@x[$c0]);
 175         verllf  (@x[$b0],@x[$b0],12);
 176         vaf     (@x[$c1],@x[$c1],@x[$d1]);
 177         vx      (@x[$b1],@x[$b1],@x[$c1]);
 178         verllf  (@x[$b1],@x[$b1],12);
 179         vaf     (@x[$c2],@x[$c2],@x[$d2]);
 180         vx      (@x[$b2],@x[$b2],@x[$c2]);
 181         verllf  (@x[$b2],@x[$b2],12);
 182         vaf     (@x[$c3],@x[$c3],@x[$d3]);
 183         vx      (@x[$b3],@x[$b3],@x[$c3]);
 184         verllf  (@x[$b3],@x[$b3],12);
 185
 186         vaf     (@x[$a0],@x[$a0],@x[$b0]);
 187         vx      (@x[$d0],@x[$d0],@x[$a0]);
 188         verllf  (@x[$d0],@x[$d0],8);
 189         vaf     (@x[$a1],@x[$a1],@x[$b1]);
 190         vx      (@x[$d1],@x[$d1],@x[$a1]);
 191         verllf  (@x[$d1],@x[$d1],8);
 192         vaf     (@x[$a2],@x[$a2],@x[$b2]);
 193         vx      (@x[$d2],@x[$d2],@x[$a2]);
 194         verllf  (@x[$d2],@x[$d2],8);
 195         vaf     (@x[$a3],@x[$a3],@x[$b3]);
 196         vx      (@x[$d3],@x[$d3],@x[$a3]);
 197         verllf  (@x[$d3],@x[$d3],8);
 198
 199         vaf     (@x[$c0],@x[$c0],@x[$d0]);
 200         vx      (@x[$b0],@x[$b0],@x[$c0]);
 201         verllf  (@x[$b0],@x[$b0],7);
 202         vaf     (@x[$c1],@x[$c1],@x[$d1]);
 203         vx      (@x[$b1],@x[$b1],@x[$c1]);
 204         verllf  (@x[$b1],@x[$b1],7);
 205         vaf     (@x[$c2],@x[$c2],@x[$d2]);
 206         vx      (@x[$b2],@x[$b2],@x[$c2]);
 207         verllf  (@x[$b2],@x[$b2],7);
 208         vaf     (@x[$c3],@x[$c3],@x[$d3]);
 209         vx      (@x[$b3],@x[$b3],@x[$c3]);
 210         verllf  (@x[$b3],@x[$b3],7);
 211 }
 212
 213 sub VX_ROUND {
 214 my @a=@_[0..5];
 215 my @b=@_[6..11];
 216 my @c=@_[12..17];
 217 my @d=@_[18..23];
 218 my $odd=@_[24];
 219
 220         vaf             (@a[$_],@a[$_],@b[$_]) for (0..5);
 221         vx              (@d[$_],@d[$_],@a[$_]) for (0..5);
 222         verllf          (@d[$_],@d[$_],16) for (0..5);
 223
 224         vaf             (@c[$_],@c[$_],@d[$_]) for (0..5);
 225         vx              (@b[$_],@b[$_],@c[$_]) for (0..5);
 226         verllf          (@b[$_],@b[$_],12) for (0..5);
 227
 228         vaf             (@a[$_],@a[$_],@b[$_]) for (0..5);
 229         vx              (@d[$_],@d[$_],@a[$_]) for (0..5);
 230         verllf          (@d[$_],@d[$_],8) for (0..5);
 231
 232         vaf             (@c[$_],@c[$_],@d[$_]) for (0..5);
 233         vx              (@b[$_],@b[$_],@c[$_]) for (0..5);
 234         verllf          (@b[$_],@b[$_],7) for (0..5);
 235
 236         vsldb           (@c[$_],@c[$_],@c[$_],8) for (0..5);
 237         vsldb           (@b[$_],@b[$_],@b[$_],$odd?12:4) for (0..5);
 238         vsldb           (@d[$_],@d[$_],@d[$_],$odd?4:12) for (0..5);
 239 }
 240
 241 PERLASM_BEGIN($output);
 242
 243 INCLUDE ("s390x_arch.h");
 244 TEXT    ();
 245
 246 ################
 247 # void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
 248 #                     const unsigned int key[8], const unsigned int counter[4])
 249 my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
 250 {
 251 my $frame=$stdframe+4*20;
 252 my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
 253 my @t=map("%r$_",(8,9));
 254
 255 GLOBL   ("ChaCha20_ctr32");
 256 TYPE    ("ChaCha20_ctr32","\@function");
 257 ALIGN   (32);
 258 LABEL   ("ChaCha20_ctr32");
 259         larl    ("%r1","OPENSSL_s390xcap_P");
 260
 261         lghi    ("%r0",64);
 262 &{$z?   \&ltgr:\&ltr}   ($len,$len);            # len==0?
 263         bzr     ("%r14");
 264         lg      ("%r1","S390X_STFLE+16(%r1)");
 265 &{$z?   \&clgr:\&clr}   ($len,"%r0");
 266         jle     (".Lshort");
 267
 268         tmhh    ("%r1",0x4000);                 # check for vx bit
 269         jnz     (".LChaCha20_ctr32_vx");
 270
 271 LABEL   (".Lshort");
 272 &{$z?   \&aghi:\&ahi}   ($len,-64);
 273 &{$z?   \&lghi:\&lhi}   ("%r1",-$frame);
 274 &{$z?   \&stmg:\&stm}   ("%r6","%r15","6*$SIZE_T($sp)");
 275 &{$z?   \&slgr:\&slr}   ($out,$inp);    # difference
 276         la      ($len,"0($inp,$len)");  # end of input minus 64
 277         larl    ("%r7",".Lsigma");
 278         lgr     ("%r0",$sp);
 279         la      ($sp,"0(%r1,$sp)");
 280 &{$z?   \&stg:\&st}     ("%r0","0($sp)");
 281
 282         lmg     ("%r8","%r11","0($key)");       # load key
 283         lmg     ("%r12","%r13","0($counter)");  # load counter
 284         lmg     ("%r6","%r7","0(%r7)"); # load sigma constant
 285
 286         la      ("%r14","0($inp)");
 287 &{$z?   \&stg:\&st}     ($out,"$frame+3*$SIZE_T($sp)");
 288 &{$z?   \&stg:\&st}     ($len,"$frame+4*$SIZE_T($sp)");
 289         stmg    ("%r6","%r13","$stdframe($sp)");# copy key schedule to stack
 290         srlg    (@x[12],"%r12",32);     # 32-bit counter value
 291         j       (".Loop_outer");
 292
 293 ALIGN   (16);
 294 LABEL   (".Loop_outer");
 295         lm      (@x[0],@x[7],"$stdframe+4*0($sp)");     # load x[0]-x[7]
 296         lm      (@t[0],@t[1],"$stdframe+4*10($sp)");    # load x[10]-x[11]
 297         lm      (@x[13],@x[15],"$stdframe+4*13($sp)");  # load x[13]-x[15]
 298         stm     (@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11]
 299         lm      (@t[0],@t[1],"$stdframe+4*8($sp)");     # load x[8]-x[9]
 300         st      (@x[12],"$stdframe+4*12($sp)"); # save counter
 301 &{$z?   \&stg:\&st}     ("%r14","$frame+2*$SIZE_T($sp)");# save input pointer
 302         lhi     ("%r14",10);
 303         j       (".Loop");
 304
 305 ALIGN   (4);
 306 LABEL   (".Loop");
 307         ROUND   (0, 4, 8,12);
 308         ROUND   (0, 5,10,15);
 309         brct    ("%r14",".Loop");
 310
 311 &{$z?   \&lg:\&l}       ("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer
 312         stm     (@t[0],@t[1],"$stdframe+4*8+4*8($sp)"); # offload x[8]-x[9]
 313 &{$z?   \&lmg:\&lm}     (@t[0],@t[1],"$frame+3*$SIZE_T($sp)");
 314
 315         al      (@x[0],"$stdframe+4*0($sp)");   # accumulate key schedule
 316         al      (@x[1],"$stdframe+4*1($sp)");
 317         al      (@x[2],"$stdframe+4*2($sp)");
 318         al      (@x[3],"$stdframe+4*3($sp)");
 319         al      (@x[4],"$stdframe+4*4($sp)");
 320         al      (@x[5],"$stdframe+4*5($sp)");
 321         al      (@x[6],"$stdframe+4*6($sp)");
 322         al      (@x[7],"$stdframe+4*7($sp)");
 323         lrvr    (@x[0],@x[0]);
 324         lrvr    (@x[1],@x[1]);
 325         lrvr    (@x[2],@x[2]);
 326         lrvr    (@x[3],@x[3]);
 327         lrvr    (@x[4],@x[4]);
 328         lrvr    (@x[5],@x[5]);
 329         lrvr    (@x[6],@x[6]);
 330         lrvr    (@x[7],@x[7]);
 331         al      (@x[12],"$stdframe+4*12($sp)");
 332         al      (@x[13],"$stdframe+4*13($sp)");
 333         al      (@x[14],"$stdframe+4*14($sp)");
 334         al      (@x[15],"$stdframe+4*15($sp)");
 335         lrvr    (@x[12],@x[12]);
 336         lrvr    (@x[13],@x[13]);
 337         lrvr    (@x[14],@x[14]);
 338         lrvr    (@x[15],@x[15]);
 339
 340         la      (@t[0],"0(@t[0],%r14)");        # reconstruct output pointer
 341 &{$z?   \&clgr:\&clr}   ("%r14",@t[1]);
 342         jh      (".Ltail");
 343
 344         x       (@x[0],"4*0(%r14)");    # xor with input
 345         x       (@x[1],"4*1(%r14)");
 346         st      (@x[0],"4*0(@t[0])");   # store output
 347         x       (@x[2],"4*2(%r14)");
 348         st      (@x[1],"4*1(@t[0])");
 349         x       (@x[3],"4*3(%r14)");
 350         st      (@x[2],"4*2(@t[0])");
 351         x       (@x[4],"4*4(%r14)");
 352         st      (@x[3],"4*3(@t[0])");
 353          lm     (@x[0],@x[3],"$stdframe+4*8+4*8($sp)"); # load x[8]-x[11]
 354         x       (@x[5],"4*5(%r14)");
 355         st      (@x[4],"4*4(@t[0])");
 356         x       (@x[6],"4*6(%r14)");
 357          al     (@x[0],"$stdframe+4*8($sp)");
 358         st      (@x[5],"4*5(@t[0])");
 359         x       (@x[7],"4*7(%r14)");
 360          al     (@x[1],"$stdframe+4*9($sp)");
 361         st      (@x[6],"4*6(@t[0])");
 362         x       (@x[12],"4*12(%r14)");
 363          al     (@x[2],"$stdframe+4*10($sp)");
 364         st      (@x[7],"4*7(@t[0])");
 365         x       (@x[13],"4*13(%r14)");
 366          al     (@x[3],"$stdframe+4*11($sp)");
 367         st      (@x[12],"4*12(@t[0])");
 368         x       (@x[14],"4*14(%r14)");
 369         st      (@x[13],"4*13(@t[0])");
 370         x       (@x[15],"4*15(%r14)");
 371         st      (@x[14],"4*14(@t[0])");
 372          lrvr   (@x[0],@x[0]);
 373         st      (@x[15],"4*15(@t[0])");
 374          lrvr   (@x[1],@x[1]);
 375          lrvr   (@x[2],@x[2]);
 376          lrvr   (@x[3],@x[3]);
 377         lhi     (@x[12],1);
 378          x      (@x[0],"4*8(%r14)");
 379         al      (@x[12],"$stdframe+4*12($sp)"); # increment counter
 380          x      (@x[1],"4*9(%r14)");
 381          st     (@x[0],"4*8(@t[0])");
 382          x      (@x[2],"4*10(%r14)");
 383          st     (@x[1],"4*9(@t[0])");
 384          x      (@x[3],"4*11(%r14)");
 385          st     (@x[2],"4*10(@t[0])");
 386          st     (@x[3],"4*11(@t[0])");
 387
 388 &{$z?   \&clgr:\&clr}   ("%r14",@t[1]); # done yet?
 389         la      ("%r14","64(%r14)");
 390         jl      (".Loop_outer");
 391
 392 LABEL   (".Ldone");
 393         xgr     ("%r0","%r0");
 394         xgr     ("%r1","%r1");
 395         xgr     ("%r2","%r2");
 396         xgr     ("%r3","%r3");
 397         stmg    ("%r0","%r3","$stdframe+4*4($sp)");     # wipe key copy
 398         stmg    ("%r0","%r3","$stdframe+4*12($sp)");
 399
 400 &{$z?   \&lmg:\&lm}     ("%r6","%r15","$frame+6*$SIZE_T($sp)");
 401         br      ("%r14");
 402
 403 ALIGN   (16);
 404 LABEL   (".Ltail");
 405         la      (@t[1],"64($t[1])");
 406         stm     (@x[0],@x[7],"$stdframe+4*0($sp)");
 407 &{$z?   \&slgr:\&slr}   (@t[1],"%r14");
 408         lm      (@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
 409 &{$z?   \&lghi:\&lhi}   (@x[6],0);
 410         stm     (@x[12],@x[15],"$stdframe+4*12($sp)");
 411         al      (@x[0],"$stdframe+4*8($sp)");
 412         al      (@x[1],"$stdframe+4*9($sp)");
 413         al      (@x[2],"$stdframe+4*10($sp)");
 414         al      (@x[3],"$stdframe+4*11($sp)");
 415         lrvr    (@x[0],@x[0]);
 416         lrvr    (@x[1],@x[1]);
 417         lrvr    (@x[2],@x[2]);
 418         lrvr    (@x[3],@x[3]);
 419         stm     (@x[0],@x[3],"$stdframe+4*8($sp)");
 420
 421 LABEL   (".Loop_tail");
 422         llgc    (@x[4],"0(@x[6],%r14)");
 423         llgc    (@x[5],"$stdframe(@x[6],$sp)");
 424         xr      (@x[5],@x[4]);
 425         stc     (@x[5],"0(@x[6],@t[0])");
 426         la      (@x[6],"1(@x[6])");
 427         brct    (@t[1],".Loop_tail");
 428
 429         j       (".Ldone");
 430 SIZE    ("ChaCha20_ctr32",".-ChaCha20_ctr32");
 431 }
 432
 433 ########################################################################
 434 # 4x"vertical" layout minimizes amount of instructions, but pipeline
 435 # runs underutilized [because of vector instructions' high latency].
 436 # On the other hand minimum amount of data it takes to fully utilize
 437 # the pipeline is higher, so that effectively, short inputs would be
 438 # processed slower. Hence this code path targeting <=256 bytes lengths.
 439 #
 440 {
 441 my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
 442     $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%v$_",(0..15));
 443 my @K=map("%v$_",(16..19));
 444 my $CTR="%v26";
 445 my ($xt0,$xt1,$xt2,$xt3)=map("%v$_",(27..30));
 446 my $beperm="%v31";
 447 my ($x00,$x10,$x20,$x30)=(0,map("r$_",(8..10)));
 448 my $FRAME=$stdframe+4*16;
 449
 450 ALIGN   (32);
 451 LABEL   ("ChaCha20_ctr32_4x");
 452 LABEL   (".LChaCha20_ctr32_4x");
 453 &{$z?   \&stmg:\&stm}   ("%r6","%r7","6*$SIZE_T($sp)");
 454 if (!$z) {
 455         std     ("%f4","16*$SIZE_T+2*8($sp)");
 456         std     ("%f6","16*$SIZE_T+3*8($sp)");
 457 }
 458 &{$z?   \&lghi:\&lhi}   ("%r1",-$FRAME);
 459         lgr     ("%r0",$sp);
 460         la      ($sp,"0(%r1,$sp)");
 461 &{$z?   \&stg:\&st}     ("%r0","0($sp)");       # back-chain
 462 if ($z) {
 463         std     ("%f8","$stdframe+8*0($sp)");
 464         std     ("%f9","$stdframe+8*1($sp)");
 465         std     ("%f10","$stdframe+8*2($sp)");
 466         std     ("%f11","$stdframe+8*3($sp)");
 467         std     ("%f12","$stdframe+8*4($sp)");
 468         std     ("%f13","$stdframe+8*5($sp)");
 469         std     ("%f14","$stdframe+8*6($sp)");
 470         std     ("%f15","$stdframe+8*7($sp)");
 471 }
 472         larl    ("%r7",".Lsigma");
 473         lhi     ("%r0",10);
 474         lhi     ("%r1",0);
 475
 476         vl      (@K[0],"0(%r7)");               # load sigma
 477         vl      (@K[1],"0($key)");              # load key
 478         vl      (@K[2],"16($key)");
 479         vl      (@K[3],"0($counter)");          # load counter
 480
 481         vl      ($beperm,"0x40(%r7)");
 482         vl      ($xt1,"0x50(%r7)");
 483         vrepf   ($CTR,@K[3],0);
 484         vlvgf   (@K[3],"%r1",0);                # clear @K[3].word[0]
 485         vaf     ($CTR,$CTR,$xt1);
 486
 487 #LABEL  (".Loop_outer_4x");
 488         vlm     ($xa0,$xa3,"0x60(%r7)");        # load [smashed] sigma
 489
 490         vrepf   ($xb0,@K[1],0);                 # smash the key
 491         vrepf   ($xb1,@K[1],1);
 492         vrepf   ($xb2,@K[1],2);
 493         vrepf   ($xb3,@K[1],3);
 494
 495         vrepf   ($xc0,@K[2],0);
 496         vrepf   ($xc1,@K[2],1);
 497         vrepf   ($xc2,@K[2],2);
 498         vrepf   ($xc3,@K[2],3);
 499
 500         vlr     ($xd0,$CTR);
 501         vrepf   ($xd1,@K[3],1);
 502         vrepf   ($xd2,@K[3],2);
 503         vrepf   ($xd3,@K[3],3);
 504
 505 LABEL   (".Loop_4x");
 506         VX_lane_ROUND(0, 4, 8,12);
 507         VX_lane_ROUND(0, 5,10,15);
 508         brct    ("%r0",".Loop_4x");
 509
 510         vaf     ($xd0,$xd0,$CTR);
 511
 512         vmrhf   ($xt0,$xa0,$xa1);               # transpose data
 513         vmrhf   ($xt1,$xa2,$xa3);
 514         vmrlf   ($xt2,$xa0,$xa1);
 515         vmrlf   ($xt3,$xa2,$xa3);
 516         vpdi    ($xa0,$xt0,$xt1,0b0000);
 517         vpdi    ($xa1,$xt0,$xt1,0b0101);
 518         vpdi    ($xa2,$xt2,$xt3,0b0000);
 519         vpdi    ($xa3,$xt2,$xt3,0b0101);
 520
 521         vmrhf   ($xt0,$xb0,$xb1);
 522         vmrhf   ($xt1,$xb2,$xb3);
 523         vmrlf   ($xt2,$xb0,$xb1);
 524         vmrlf   ($xt3,$xb2,$xb3);
 525         vpdi    ($xb0,$xt0,$xt1,0b0000);
 526         vpdi    ($xb1,$xt0,$xt1,0b0101);
 527         vpdi    ($xb2,$xt2,$xt3,0b0000);
 528         vpdi    ($xb3,$xt2,$xt3,0b0101);
 529
 530         vmrhf   ($xt0,$xc0,$xc1);
 531         vmrhf   ($xt1,$xc2,$xc3);
 532         vmrlf   ($xt2,$xc0,$xc1);
 533         vmrlf   ($xt3,$xc2,$xc3);
 534         vpdi    ($xc0,$xt0,$xt1,0b0000);
 535         vpdi    ($xc1,$xt0,$xt1,0b0101);
 536         vpdi    ($xc2,$xt2,$xt3,0b0000);
 537         vpdi    ($xc3,$xt2,$xt3,0b0101);
 538
 539         vmrhf   ($xt0,$xd0,$xd1);
 540         vmrhf   ($xt1,$xd2,$xd3);
 541         vmrlf   ($xt2,$xd0,$xd1);
 542         vmrlf   ($xt3,$xd2,$xd3);
 543         vpdi    ($xd0,$xt0,$xt1,0b0000);
 544         vpdi    ($xd1,$xt0,$xt1,0b0101);
 545         vpdi    ($xd2,$xt2,$xt3,0b0000);
 546         vpdi    ($xd3,$xt2,$xt3,0b0101);
 547
 548         #vrepif ($xt0,4);
 549         #vaf    ($CTR,$CTR,$xt0);               # next counter value
 550
 551         vaf     ($xa0,$xa0,@K[0]);
 552         vaf     ($xb0,$xb0,@K[1]);
 553         vaf     ($xc0,$xc0,@K[2]);
 554         vaf     ($xd0,$xd0,@K[3]);
 555
 556         vperm   ($xa0,$xa0,$xa0,$beperm);
 557         vperm   ($xb0,$xb0,$xb0,$beperm);
 558         vperm   ($xc0,$xc0,$xc0,$beperm);
 559         vperm   ($xd0,$xd0,$xd0,$beperm);
 560
 561         #&{$z?  \&clgfi:\&clfi} ($len,0x40);
 562         #jl     (".Ltail_4x");
 563
 564         vlm     ($xt0,$xt3,"0($inp)");
 565
 566         vx      ($xt0,$xt0,$xa0);
 567         vx      ($xt1,$xt1,$xb0);
 568         vx      ($xt2,$xt2,$xc0);
 569         vx      ($xt3,$xt3,$xd0);
 570
 571         vstm    ($xt0,$xt3,"0($out)");
 572
 573         la      ($inp,"0x40($inp)");
 574         la      ($out,"0x40($out)");
 575 &{$z?   \&aghi:\&ahi}   ($len,-0x40);
 576         #je     (".Ldone_4x");
 577
 578         vaf     ($xa0,$xa1,@K[0]);
 579         vaf     ($xb0,$xb1,@K[1]);
 580         vaf     ($xc0,$xc1,@K[2]);
 581         vaf     ($xd0,$xd1,@K[3]);
 582
 583         vperm   ($xa0,$xa0,$xa0,$beperm);
 584         vperm   ($xb0,$xb0,$xb0,$beperm);
 585         vperm   ($xc0,$xc0,$xc0,$beperm);
 586         vperm   ($xd0,$xd0,$xd0,$beperm);
 587
 588 &{$z?   \&clgfi:\&clfi} ($len,0x40);
 589         jl      (".Ltail_4x");
 590
 591         vlm     ($xt0,$xt3,"0($inp)");
 592
 593         vx      ($xt0,$xt0,$xa0);
 594         vx      ($xt1,$xt1,$xb0);
 595         vx      ($xt2,$xt2,$xc0);
 596         vx      ($xt3,$xt3,$xd0);
 597
 598         vstm    ($xt0,$xt3,"0($out)");
 599
 600         la      ($inp,"0x40($inp)");
 601         la      ($out,"0x40($out)");
 602 &{$z?   \&aghi:\&ahi}   ($len,-0x40);
 603         je      (".Ldone_4x");
 604
 605         vaf     ($xa0,$xa2,@K[0]);
 606         vaf     ($xb0,$xb2,@K[1]);
 607         vaf     ($xc0,$xc2,@K[2]);
 608         vaf     ($xd0,$xd2,@K[3]);
 609
 610         vperm   ($xa0,$xa0,$xa0,$beperm);
 611         vperm   ($xb0,$xb0,$xb0,$beperm);
 612         vperm   ($xc0,$xc0,$xc0,$beperm);
 613         vperm   ($xd0,$xd0,$xd0,$beperm);
 614
 615 &{$z?   \&clgfi:\&clfi} ($len,0x40);
 616         jl      (".Ltail_4x");
 617
 618         vlm     ($xt0,$xt3,"0($inp)");
 619
 620         vx      ($xt0,$xt0,$xa0);
 621         vx      ($xt1,$xt1,$xb0);
 622         vx      ($xt2,$xt2,$xc0);
 623         vx      ($xt3,$xt3,$xd0);
 624
 625         vstm    ($xt0,$xt3,"0($out)");
 626
 627         la      ($inp,"0x40($inp)");
 628         la      ($out,"0x40($out)");
 629 &{$z?   \&aghi:\&ahi}   ($len,-0x40);
 630         je      (".Ldone_4x");
 631
 632         vaf     ($xa0,$xa3,@K[0]);
 633         vaf     ($xb0,$xb3,@K[1]);
 634         vaf     ($xc0,$xc3,@K[2]);
 635         vaf     ($xd0,$xd3,@K[3]);
 636
 637         vperm   ($xa0,$xa0,$xa0,$beperm);
 638         vperm   ($xb0,$xb0,$xb0,$beperm);
 639         vperm   ($xc0,$xc0,$xc0,$beperm);
 640         vperm   ($xd0,$xd0,$xd0,$beperm);
 641
 642 &{$z?   \&clgfi:\&clfi} ($len,0x40);
 643         jl      (".Ltail_4x");
 644
 645         vlm     ($xt0,$xt3,"0($inp)");
 646
 647         vx      ($xt0,$xt0,$xa0);
 648         vx      ($xt1,$xt1,$xb0);
 649         vx      ($xt2,$xt2,$xc0);
 650         vx      ($xt3,$xt3,$xd0);
 651
 652         vstm    ($xt0,$xt3,"0($out)");
 653
 654         #la     $inp,0x40($inp));
 655         #la     $out,0x40($out));
 656         #lhi    %r0,10);
 657         #&{$z?  \&aghi:\&ahi}   $len,-0x40);
 658         #jne    .Loop_outer_4x);
 659
 660 LABEL   (".Ldone_4x");
 661 if (!$z) {
 662         ld      ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
 663         ld      ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
 664 } else {
 665         ld      ("%f8","$stdframe+8*0($sp)");
 666         ld      ("%f9","$stdframe+8*1($sp)");
 667         ld      ("%f10","$stdframe+8*2($sp)");
 668         ld      ("%f11","$stdframe+8*3($sp)");
 669         ld      ("%f12","$stdframe+8*4($sp)");
 670         ld      ("%f13","$stdframe+8*5($sp)");
 671         ld      ("%f14","$stdframe+8*6($sp)");
 672         ld      ("%f15","$stdframe+8*7($sp)");
 673 }
 674 &{$z?   \&lmg:\&lm}     ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
 675         la      ($sp,"$FRAME($sp)");
 676         br      ("%r14");
 677
 678 ALIGN   (16);
 679 LABEL   (".Ltail_4x");
 680 if (!$z) {
 681         vlr     ($xt0,$xb0);
 682         ld      ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
 683         ld      ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
 684
 685         vst     ($xa0,"$stdframe+0x00($sp)");
 686         vst     ($xt0,"$stdframe+0x10($sp)");
 687         vst     ($xc0,"$stdframe+0x20($sp)");
 688         vst     ($xd0,"$stdframe+0x30($sp)");
 689 } else {
 690         vlr     ($xt0,$xc0);
 691         ld      ("%f8","$stdframe+8*0($sp)");
 692         ld      ("%f9","$stdframe+8*1($sp)");
 693         ld      ("%f10","$stdframe+8*2($sp)");
 694         ld      ("%f11","$stdframe+8*3($sp)");
 695         vlr     ($xt1,$xd0);
 696         ld      ("%f12","$stdframe+8*4($sp)");
 697         ld      ("%f13","$stdframe+8*5($sp)");
 698         ld      ("%f14","$stdframe+8*6($sp)");
 699         ld      ("%f15","$stdframe+8*7($sp)");
 700
 701         vst     ($xa0,"$stdframe+0x00($sp)");
 702         vst     ($xb0,"$stdframe+0x10($sp)");
 703         vst     ($xt0,"$stdframe+0x20($sp)");
 704         vst     ($xt1,"$stdframe+0x30($sp)");
 705 }
 706         lghi    ("%r1",0);
 707
 708 LABEL   (".Loop_tail_4x");
 709         llgc    ("%r5","0(%r1,$inp)");
 710         llgc    ("%r6","$stdframe(%r1,$sp)");
 711         xr      ("%r6","%r5");
 712         stc     ("%r6","0(%r1,$out)");
 713         la      ("%r1","1(%r1)");
 714         brct    ($len,".Loop_tail_4x");
 715
 716 &{$z?   \&lmg:\&lm}     ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
 717         la      ($sp,"$FRAME($sp)");
 718         br      ("%r14");
 719 SIZE    ("ChaCha20_ctr32_4x",".-ChaCha20_ctr32_4x");
 720 }
 721
 722 ########################################################################
 723 # 6x"horizontal" layout is optimal fit for the platform in its current
 724 # shape, more specifically for given vector instructions' latency. Well,
 725 # computational part of 8x"vertical" would be faster, but it consumes
 726 # all registers and dealing with that will diminish the return...
 727 #
 728 {
 729 my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1,
 730     $a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3,
 731     $a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5)=map("%v$_",(0..23));
 732 my @K=map("%v$_",(27,24..26));
 733 my ($t0,$t1,$t2,$t3)=map("%v$_",27..30);
 734 my $beperm="%v31";
 735 my $FRAME=$stdframe + 4*16;
 736
 737 GLOBL   ("ChaCha20_ctr32_vx");
 738 ALIGN   (32);
 739 LABEL   ("ChaCha20_ctr32_vx");
 740 LABEL   (".LChaCha20_ctr32_vx");
 741 &{$z?   \&clgfi:\&clfi} ($len,256);
 742         jle     (".LChaCha20_ctr32_4x");
 743 &{$z?   \&stmg:\&stm}   ("%r6","%r7","6*$SIZE_T($sp)");
 744 if (!$z) {
 745         std     ("%f4","16*$SIZE_T+2*8($sp)");
 746         std     ("%f6","16*$SIZE_T+3*8($sp)");
 747 }
 748 &{$z?   \&lghi:\&lhi}   ("%r1",-$FRAME);
 749         lgr     ("%r0",$sp);
 750         la      ($sp,"0(%r1,$sp)");
 751 &{$z?   \&stg:\&st}     ("%r0","0($sp)");       # back-chain
 752 if ($z) {
 753         std     ("%f8","$FRAME-8*8($sp)");
 754         std     ("%f9","$FRAME-8*7($sp)");
 755         std     ("%f10","$FRAME-8*6($sp)");
 756         std     ("%f11","$FRAME-8*5($sp)");
 757         std     ("%f12","$FRAME-8*4($sp)");
 758         std     ("%f13","$FRAME-8*3($sp)");
 759         std     ("%f14","$FRAME-8*2($sp)");
 760         std     ("%f15","$FRAME-8*1($sp)");
 761 }
 762         larl    ("%r7",".Lsigma");
 763         lhi     ("%r0",10);
 764
 765         vlm     (@K[1],@K[2],"0($key)");        # load key
 766         vl      (@K[3],"0($counter)");          # load counter
 767
 768         vlm     (@K[0],"$beperm","0(%r7)");     # load sigma, increments, ...
 769
 770 LABEL   (".Loop_outer_vx");
 771         vlr     ($a0,@K[0]);
 772         vlr     ($b0,@K[1]);
 773         vlr     ($a1,@K[0]);
 774         vlr     ($b1,@K[1]);
 775         vlr     ($a2,@K[0]);
 776         vlr     ($b2,@K[1]);
 777         vlr     ($a3,@K[0]);
 778         vlr     ($b3,@K[1]);
 779         vlr     ($a4,@K[0]);
 780         vlr     ($b4,@K[1]);
 781         vlr     ($a5,@K[0]);
 782         vlr     ($b5,@K[1]);
 783
 784         vlr     ($d0,@K[3]);
 785         vaf     ($d1,@K[3],$t1);                # K[3]+1
 786         vaf     ($d2,@K[3],$t2);                # K[3]+2
 787         vaf     ($d3,@K[3],$t3);                # K[3]+3
 788         vaf     ($d4,$d2,$t2);                  # K[3]+4
 789         vaf     ($d5,$d2,$t3);                  # K[3]+5
 790
 791         vlr     ($c0,@K[2]);
 792         vlr     ($c1,@K[2]);
 793         vlr     ($c2,@K[2]);
 794         vlr     ($c3,@K[2]);
 795         vlr     ($c4,@K[2]);
 796         vlr     ($c5,@K[2]);
 797
 798         vlr     ($t1,$d1);
 799         vlr     ($t2,$d2);
 800         vlr     ($t3,$d3);
 801
 802 ALIGN   (4);
 803 LABEL   (".Loop_vx");
 804
 805         VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
 806                  $b0,$b1,$b2,$b3,$b4,$b5,
 807                  $c0,$c1,$c2,$c3,$c4,$c5,
 808                  $d0,$d1,$d2,$d3,$d4,$d5,
 809                  0);
 810
 811         VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
 812                  $b0,$b1,$b2,$b3,$b4,$b5,
 813                  $c0,$c1,$c2,$c3,$c4,$c5,
 814                  $d0,$d1,$d2,$d3,$d4,$d5,
 815                  1);
 816
 817         brct    ("%r0",".Loop_vx");
 818
 819         vaf     ($a0,$a0,@K[0]);
 820         vaf     ($b0,$b0,@K[1]);
 821         vaf     ($c0,$c0,@K[2]);
 822         vaf     ($d0,$d0,@K[3]);
 823         vaf     ($a1,$a1,@K[0]);
 824         vaf     ($d1,$d1,$t1);                  # +K[3]+1
 825
 826         vperm   ($a0,$a0,$a0,$beperm);
 827         vperm   ($b0,$b0,$b0,$beperm);
 828         vperm   ($c0,$c0,$c0,$beperm);
 829         vperm   ($d0,$d0,$d0,$beperm);
 830
 831 &{$z?   \&clgfi:\&clfi} ($len,0x40);
 832         jl      (".Ltail_vx");
 833
 834         vaf     ($d2,$d2,$t2);                  # +K[3]+2
 835         vaf     ($d3,$d3,$t3);                  # +K[3]+3
 836         vlm     ($t0,$t3,"0($inp)");
 837
 838         vx      ($a0,$a0,$t0);
 839         vx      ($b0,$b0,$t1);
 840         vx      ($c0,$c0,$t2);
 841         vx      ($d0,$d0,$t3);
 842
 843         vlm     (@K[0],$t3,"0(%r7)");           # re-load sigma and increments
 844
 845         vstm    ($a0,$d0,"0($out)");
 846
 847         la      ($inp,"0x40($inp)");
 848         la      ($out,"0x40($out)");
 849 &{$z?   \&aghi:\&ahi}   ($len,-0x40);
 850         je      (".Ldone_vx");
 851
 852         vaf     ($b1,$b1,@K[1]);
 853         vaf     ($c1,$c1,@K[2]);
 854
 855         vperm   ($a0,$a1,$a1,$beperm);
 856         vperm   ($b0,$b1,$b1,$beperm);
 857         vperm   ($c0,$c1,$c1,$beperm);
 858         vperm   ($d0,$d1,$d1,$beperm);
 859
 860 &{$z?   \&clgfi:\&clfi} ($len,0x40);
 861         jl      (".Ltail_vx");
 862
 863         vlm     ($a1,$d1,"0($inp)");
 864
 865         vx      ($a0,$a0,$a1);
 866         vx      ($b0,$b0,$b1);
 867         vx      ($c0,$c0,$c1);
 868         vx      ($d0,$d0,$d1);
 869
 870         vstm    ($a0,$d0,"0($out)");
 871
 872         la      ($inp,"0x40($inp)");
 873         la      ($out,"0x40($out)");
 874 &{$z?   \&aghi:\&ahi}   ($len,-0x40);
 875         je      (".Ldone_vx");
 876
 877         vaf     ($a2,$a2,@K[0]);
 878         vaf     ($b2,$b2,@K[1]);
 879         vaf     ($c2,$c2,@K[2]);
 880
 881         vperm   ($a0,$a2,$a2,$beperm);
 882         vperm   ($b0,$b2,$b2,$beperm);
 883         vperm   ($c0,$c2,$c2,$beperm);
 884         vperm   ($d0,$d2,$d2,$beperm);
 885
 886 &{$z?   \&clgfi:\&clfi} ($len,0x40);
 887         jl      (".Ltail_vx");
 888
 889         vlm     ($a1,$d1,"0($inp)");
 890
 891         vx      ($a0,$a0,$a1);
 892         vx      ($b0,$b0,$b1);
 893         vx      ($c0,$c0,$c1);
 894         vx      ($d0,$d0,$d1);
 895
 896         vstm    ($a0,$d0,"0($out)");
 897
 898         la      ($inp,"0x40($inp)");
 899         la      ($out,"0x40($out)");
 900 &{$z?   \&aghi:\&ahi}   ($len,-0x40);
 901         je      (".Ldone_vx");
 902
 903         vaf     ($a3,$a3,@K[0]);
 904         vaf     ($b3,$b3,@K[1]);
 905         vaf     ($c3,$c3,@K[2]);
 906         vaf     ($d2,@K[3],$t3);                # K[3]+3
 907
 908         vperm   ($a0,$a3,$a3,$beperm);
 909         vperm   ($b0,$b3,$b3,$beperm);
 910         vperm   ($c0,$c3,$c3,$beperm);
 911         vperm   ($d0,$d3,$d3,$beperm);
 912
 913 &{$z?   \&clgfi:\&clfi} ($len,0x40);
 914         jl      (".Ltail_vx");
 915
 916         vaf     ($d3,$d2,$t1);                  # K[3]+4
 917         vlm     ($a1,$d1,"0($inp)");
 918
 919         vx      ($a0,$a0,$a1);
 920         vx      ($b0,$b0,$b1);
 921         vx      ($c0,$c0,$c1);
 922         vx      ($d0,$d0,$d1);
 923
 924         vstm    ($a0,$d0,"0($out)");
 925
 926         la      ($inp,"0x40($inp)");
 927         la      ($out,"0x40($out)");
 928 &{$z?   \&aghi:\&ahi}   ($len,-0x40);
 929         je      (".Ldone_vx");
 930
 931         vaf     ($a4,$a4,@K[0]);
 932         vaf     ($b4,$b4,@K[1]);
 933         vaf     ($c4,$c4,@K[2]);
 934         vaf     ($d4,$d4,$d3);                  # +K[3]+4
 935         vaf     ($d3,$d3,$t1);                  # K[3]+5
 936         vaf     (@K[3],$d2,$t3);                # K[3]+=6
 937
 938         vperm   ($a0,$a4,$a4,$beperm);
 939         vperm   ($b0,$b4,$b4,$beperm);
 940         vperm   ($c0,$c4,$c4,$beperm);
 941         vperm   ($d0,$d4,$d4,$beperm);
 942
 943 &{$z?   \&clgfi:\&clfi} ($len,0x40);
 944         jl      (".Ltail_vx");
 945
 946         vlm     ($a1,$d1,"0($inp)");
 947
 948         vx      ($a0,$a0,$a1);
 949         vx      ($b0,$b0,$b1);
 950         vx      ($c0,$c0,$c1);
 951         vx      ($d0,$d0,$d1);
 952
 953         vstm    ($a0,$d0,"0($out)");
 954
 955         la      ($inp,"0x40($inp)");
 956         la      ($out,"0x40($out)");
 957 &{$z?   \&aghi:\&ahi}   ($len,-0x40);
 958         je      (".Ldone_vx");
 959
 960         vaf     ($a5,$a5,@K[0]);
 961         vaf     ($b5,$b5,@K[1]);
 962         vaf     ($c5,$c5,@K[2]);
 963         vaf     ($d5,$d5,$d3);                  # +K[3]+5
 964
 965         vperm   ($a0,$a5,$a5,$beperm);
 966         vperm   ($b0,$b5,$b5,$beperm);
 967         vperm   ($c0,$c5,$c5,$beperm);
 968         vperm   ($d0,$d5,$d5,$beperm);
 969
 970 &{$z?   \&clgfi:\&clfi} ($len,0x40);
 971         jl      (".Ltail_vx");
 972
 973         vlm     ($a1,$d1,"0($inp)");
 974
 975         vx      ($a0,$a0,$a1);
 976         vx      ($b0,$b0,$b1);
 977         vx      ($c0,$c0,$c1);
 978         vx      ($d0,$d0,$d1);
 979
 980         vstm    ($a0,$d0,"0($out)");
 981
 982         la      ($inp,"0x40($inp)");
 983         la      ($out,"0x40($out)");
 984         lhi     ("%r0",10);
 985 &{$z?   \&aghi:\&ahi}   ($len,-0x40);
 986         jne     (".Loop_outer_vx");
 987
 988 LABEL   (".Ldone_vx");
 989 if (!$z) {
 990         ld      ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
 991         ld      ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
 992 } else {
 993         ld      ("%f8","$FRAME-8*8($sp)");
 994         ld      ("%f9","$FRAME-8*7($sp)");
 995         ld      ("%f10","$FRAME-8*6($sp)");
 996         ld      ("%f11","$FRAME-8*5($sp)");
 997         ld      ("%f12","$FRAME-8*4($sp)");
 998         ld      ("%f13","$FRAME-8*3($sp)");
 999         ld      ("%f14","$FRAME-8*2($sp)");
1000         ld      ("%f15","$FRAME-8*1($sp)");
1001 }
1002 &{$z?   \&lmg:\&lm}     ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
1003         la      ($sp,"$FRAME($sp)");
1004         br      ("%r14");
1005
1006 ALIGN   (16);
1007 LABEL   (".Ltail_vx");
1008 if (!$z) {
1009         ld      ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
1010         ld      ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
1011 } else {
1012         ld      ("%f8","$FRAME-8*8($sp)");
1013         ld      ("%f9","$FRAME-8*7($sp)");
1014         ld      ("%f10","$FRAME-8*6($sp)");
1015         ld      ("%f11","$FRAME-8*5($sp)");
1016         ld      ("%f12","$FRAME-8*4($sp)");
1017         ld      ("%f13","$FRAME-8*3($sp)");
1018         ld      ("%f14","$FRAME-8*2($sp)");
1019         ld      ("%f15","$FRAME-8*1($sp)");
1020 }
1021         vstm    ($a0,$d0,"$stdframe($sp)");
1022         lghi    ("%r1",0);
1023
1024 LABEL   (".Loop_tail_vx");
1025         llgc    ("%r5","0(%r1,$inp)");
1026         llgc    ("%r6","$stdframe(%r1,$sp)");
1027         xr      ("%r6","%r5");
1028         stc     ("%r6","0(%r1,$out)");
1029         la      ("%r1","1(%r1)");
1030         brct    ($len,".Loop_tail_vx");
1031
1032 &{$z?   \&lmg:\&lm}     ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
1033         la      ($sp,"$FRAME($sp)");
1034         br      ("%r14");
1035 SIZE    ("ChaCha20_ctr32_vx",".-ChaCha20_ctr32_vx");
1036 }
1037 ################
1038
1039 ALIGN   (32);
1040 LABEL   (".Lsigma");
1041 LONG    (0x61707865,0x3320646e,0x79622d32,0x6b206574);  # endian-neutral sigma
1042 LONG    (1,0,0,0);
1043 LONG    (2,0,0,0);
1044 LONG    (3,0,0,0);
1045 LONG    (0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c);  # byte swap
1046
1047 LONG    (0,1,2,3);
1048 LONG    (0x61707865,0x61707865,0x61707865,0x61707865);  # smashed sigma
1049 LONG    (0x3320646e,0x3320646e,0x3320646e,0x3320646e);
1050 LONG    (0x79622d32,0x79622d32,0x79622d32,0x79622d32);
1051 LONG    (0x6b206574,0x6b206574,0x6b206574,0x6b206574);
1052
1053 ASCIZ   ("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
1054 ALIGN   (4);
1055
1056 PERLASM_END();