2 # Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
21 # 3 times faster than compiler-generated code.
28 # Copyright IBM Corp. 2018
29 # Author: Patrick Steuer <patrick.steuer@de.ibm.com>
34 use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL INCLUDE);
39 if ($flavour =~ /3[12]/) {
# Consume leading command-line arguments until one looks like an output file
# name ("name.ext"); $output is left false if no argument matches.
48 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
# Size of the standard stack frame: 16 slots of $SIZE_T bytes plus four
# 8-byte slots.
51 my $stdframe=16*$SIZE_T+4*8;
# General-register names indexed by ChaCha state word: words 0..7 map to
# %r0..%r7 and words 12..15 map to %r10..%r13.  Words 8..11 get the
# placeholder name "%rx", i.e. no register is assigned to them here.
53 my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
# Two temporaries, %r8 and %r9.
54 my @t=map("%r$_",(8,9));
# Vector-register names %v16..%v31, indexed 0..15 by state word.
55 my @v=map("%v$_",(16..31));
# Body of the scalar (general-register) round generator; the enclosing sub
# header and several instructions are not part of this excerpt.
# Arguments: the four state-word indices (a0,b0,c0,d0) of the first
# quarter-round.  The indices of the next three quarter-rounds are derived
# by keeping each index's group of four ($_&~3) and advancing its position
# inside that group by one, modulo 4.
58 my ($a0,$b0,$c0,$d0)=@_;
59 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
60 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
61 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
# $xc/$xc_ are the names of the two temporaries in @t; they are the registers
# stored to and loaded from the 'c' slots in the middle of the round.
62 my ($xc,$xc_)=map("$_",@t);
64 # Consider order in which variables are addressed by their
69 # 0 4 8 12 < even round
73 # 0 5 10 15 < odd round
78 # 'a', 'b' and 'd's are permanently allocated in registers,
79 # @x[0..7,12..15], while 'c's are maintained in memory. If
80 # you observe 'c' column, you'll notice that pair of 'c's is
81 # invariant between rounds. This means that we have to reload
82 # them once per round, in the middle. This is why you'll see
83 # 'c' stores and loads in the middle, but none in the beginning
# First half: quarter-rounds Q1 and Q2, emitted interleaved.
86 alr (@x[$a0],@x[$b0]); # Q1
87 alr (@x[$a1],@x[$b1]); # Q2
# rotate d left by 16
90 rll (@x[$d0],@x[$d0],16);
91 rll (@x[$d1],@x[$d1],16);
# rotate b left by 12
97 rll (@x[$b0],@x[$b0],12);
98 rll (@x[$b1],@x[$b1],12);
# a += b; d ^= a; rotate d left by 8
100 alr (@x[$a0],@x[$b0]);
101 alr (@x[$a1],@x[$b1]);
102 xr (@x[$d0],@x[$a0]);
103 xr (@x[$d1],@x[$a1]);
104 rll (@x[$d0],@x[$d0],8);
105 rll (@x[$d1],@x[$d1],8);
# rotate b left by 7
111 rll (@x[$b0],@x[$b0],7);
112 rll (@x[$b1],@x[$b1],7);
# Store the two temporaries to the slots of 'c' words $c0,$c0+1 and load
# the pair starting at slot $c2 for the second half.
114 stm ($xc,$xc_,"$stdframe+4*8+4*$c0($sp)"); # reload pair of 'c's
115 lm ($xc,$xc_,"$stdframe+4*8+4*$c2($sp)");
# Second half: quarter-rounds Q3 and Q4, same pattern as above.
117 alr (@x[$a2],@x[$b2]); # Q3
118 alr (@x[$a3],@x[$b3]); # Q4
119 xr (@x[$d2],@x[$a2]);
120 xr (@x[$d3],@x[$a3]);
121 rll (@x[$d2],@x[$d2],16);
122 rll (@x[$d3],@x[$d3],16);
128 rll (@x[$b2],@x[$b2],12);
129 rll (@x[$b3],@x[$b3],12);
131 alr (@x[$a2],@x[$b2]);
132 alr (@x[$a3],@x[$b3]);
133 xr (@x[$d2],@x[$a2]);
134 xr (@x[$d3],@x[$a3]);
135 rll (@x[$d2],@x[$d2],8);
136 rll (@x[$d3],@x[$d3],8);
142 rll (@x[$b2],@x[$b2],7);
143 rll (@x[$b3],@x[$b3],7);
# Body of the vector round generator; the enclosing sub header is not part
# of this excerpt.
# Arguments: the four state-word indices (a0,b0,c0,d0) of the first
# quarter-round.  The other three index sets are derived by stepping each
# index to the next position within its group of four, modulo 4.  All four
# quarter-rounds are emitted side by side on the registers named in @v.
147 my ($a0,$b0,$c0,$d0)=@_;
148 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
149 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
150 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
# a += b
152 vaf (@v[$a0],@v[$a0],@v[$b0]);
153 vaf (@v[$a1],@v[$a1],@v[$b1]);
154 vaf (@v[$a2],@v[$a2],@v[$b2]);
155 vaf (@v[$a3],@v[$a3],@v[$b3]);
# d ^= a
156 vx (@v[$d0],@v[$d0],@v[$a0]);
157 vx (@v[$d1],@v[$d1],@v[$a1]);
158 vx (@v[$d2],@v[$d2],@v[$a2]);
159 vx (@v[$d3],@v[$d3],@v[$a3]);
# rotate each word of d left by 16
160 verllf (@v[$d0],@v[$d0],16);
161 verllf (@v[$d1],@v[$d1],16);
162 verllf (@v[$d2],@v[$d2],16);
163 verllf (@v[$d3],@v[$d3],16);
# c += d
165 vaf (@v[$c0],@v[$c0],@v[$d0]);
166 vaf (@v[$c1],@v[$c1],@v[$d1]);
167 vaf (@v[$c2],@v[$c2],@v[$d2]);
168 vaf (@v[$c3],@v[$c3],@v[$d3]);
# b ^= c
169 vx (@v[$b0],@v[$b0],@v[$c0]);
170 vx (@v[$b1],@v[$b1],@v[$c1]);
171 vx (@v[$b2],@v[$b2],@v[$c2]);
172 vx (@v[$b3],@v[$b3],@v[$c3]);
# rotate each word of b left by 12
173 verllf (@v[$b0],@v[$b0],12);
174 verllf (@v[$b1],@v[$b1],12);
175 verllf (@v[$b2],@v[$b2],12);
176 verllf (@v[$b3],@v[$b3],12);
# a += b
178 vaf (@v[$a0],@v[$a0],@v[$b0]);
179 vaf (@v[$a1],@v[$a1],@v[$b1]);
180 vaf (@v[$a2],@v[$a2],@v[$b2]);
181 vaf (@v[$a3],@v[$a3],@v[$b3]);
# d ^= a
182 vx (@v[$d0],@v[$d0],@v[$a0]);
183 vx (@v[$d1],@v[$d1],@v[$a1]);
184 vx (@v[$d2],@v[$d2],@v[$a2]);
185 vx (@v[$d3],@v[$d3],@v[$a3]);
# rotate each word of d left by 8
186 verllf (@v[$d0],@v[$d0],8);
187 verllf (@v[$d1],@v[$d1],8);
188 verllf (@v[$d2],@v[$d2],8);
189 verllf (@v[$d3],@v[$d3],8);
# c += d
191 vaf (@v[$c0],@v[$c0],@v[$d0]);
192 vaf (@v[$c1],@v[$c1],@v[$d1]);
193 vaf (@v[$c2],@v[$c2],@v[$d2]);
194 vaf (@v[$c3],@v[$c3],@v[$d3]);
# b ^= c
195 vx (@v[$b0],@v[$b0],@v[$c0]);
196 vx (@v[$b1],@v[$b1],@v[$c1]);
197 vx (@v[$b2],@v[$b2],@v[$c2]);
198 vx (@v[$b3],@v[$b3],@v[$c3]);
# rotate each word of b left by 7
199 verllf (@v[$b0],@v[$b0],7);
200 verllf (@v[$b1],@v[$b1],7);
201 verllf (@v[$b2],@v[$b2],7);
202 verllf (@v[$b3],@v[$b3],7);
205 PERLASM_BEGIN($output);
207 INCLUDE ("s390x_arch.h");
211 # void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
212 # const unsigned int key[8], const unsigned int counter[4])
# The five function arguments are named after registers %r2..%r6, in the
# order of the prototype.
214 my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
# Vector code path.  $off is the frame offset of the 64-byte copy of the
# initial state; with $z set it leaves 8*16 bytes after the first 8 bytes,
# which is where %v8..%v15 are saved below (vstm at 8($sp)).
218 my $off=$z*8*16+8; # offset(initial state)
# Frame size: standard frame plus 4*16 bytes plus $off.
219 my $frame=$stdframe+4*16+$off;
221 GLOBL ("ChaCha20_ctr32");
222 TYPE ("ChaCha20_ctr32","\@function");
224 LABEL ("ChaCha20_ctr32");
225 larl ("%r1","OPENSSL_s390xcap_P");
# Take the scalar path when $len is not greater than the value in %r0
# (the instruction that sets %r0 is not part of this excerpt) ...
228 &{$z? \&cgr:\&cr} ($len,"%r0");
229 jle ("_s390x_chacha_novx");
# ... or when the tested capability bit is clear.
231 lg ("%r0","S390X_STFLE+16(%r1)");
232 tmhh ("%r0",0x4000); # check for vector facility
233 jz ("_s390x_chacha_novx");
# Save %f4/%f6 and %r6/%r7 at positive offsets from the incoming $sp,
# before the new frame is allocated.
237 std ("%f4","16*$SIZE_T+2*8($sp)");
238 std ("%f6","16*$SIZE_T+3*8($sp)");
240 &{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
242 lghi ("%r1",-$frame);
244 la ($sp,"0(%r1,$sp)"); # allocate stack frame
# %r7 addresses the constant table (.Lsigma) for the rest of this path.
246 larl ("%r7",".Lsigma");
247 &{$z? \&stg:\&st} ("%r0","0($sp)"); # backchain
# Preserve %v8..%v15 in the frame (only emitted when $z is set).
249 vstm ("%v8","%v15","8($sp)") if ($z);
# Assemble the 4x16-byte initial state in %v0..%v3: sigma, key (2 vectors),
# counter||nonce.
251 vlm ("%v1","%v2","0($key)"); # load key
252 vl ("%v0","0(%r7)"); # load sigma constant
253 vl ("%v3","0($counter)"); # load iv (counter||nonce)
254 l ("%r0","0($counter)"); # load counter
255 vstm ("%v0","%v3","$off($sp)"); # copy initial state to stack
# Main vector loop: each iteration handles 256 bytes (4 blocks) and is
# counted down in %r1 by the brctg at the bottom.
261 ALIGN (16); # process 4 64-byte blocks
# Load each of the 16 state words from the stack copy, replicated across
# the elements of %v0..%v15.
263 vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial state
# Add the constant at 16(%r7) to word 12, giving each block its own counter.
265 vl ("%v31","16(%r7)");
266 vaf ("%v12","%v12","%v31"); # increment counter
# Working state lives in @v (%v16..%v31); %v0..%v15 keep the initial state.
268 vlr (@v[$_],"%v$_") for (0..15); # copy initial state
# Round loop, counted down in %r6 (the load of %r6 is not part of this
# excerpt).
274 LABEL (".Loop_vx_4x");
275 VX_ROUND( 0, 4, 8,12); # column round
276 VX_ROUND( 0, 5,10,15); # diagonal round
277 brct ("%r6",".Loop_vx_4x");
279 vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial
281 vlm ("%v6","%v7","32(%r7)"); # load vperm operands
# Merge-high of the working state, permuted into keystream in %v8..%v15.
283 for (0..3) { # blocks 1,2
284 vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
285 vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]);
286 vperm ("%v".($_+ 8),"%v0","%v1","%v6");
287 vperm ("%v".($_+12),"%v0","%v1","%v7");
289 vlm ("%v0","%v7","0($inp)"); # load in
290 vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks
291 vstm ("%v0","%v7","0($out)"); # store out
# %v6/%v7 were overwritten by the input load above, so fetch them again.
293 vlm ("%v6","%v7","32(%r7)"); # restore vperm operands
# Merge-low of the working state gives the second pair of blocks.
295 for (0..3) { # blocks 3,4
296 vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
297 vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]);
298 vperm ("%v".($_+ 8),"%v0","%v1","%v6");
299 vperm ("%v".($_+12),"%v0","%v1","%v7");
301 vlm ("%v0","%v7","128($inp)"); # load in
302 vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks
303 vstm ("%v0","%v7","128($out)"); # store out
# Write %r0 back to word 12 (offset 48) of the saved initial state; the
# instruction that advances %r0 is not part of this excerpt.
306 st ("%r0","48+$off($sp)"); # update initial state
# Advance both pointers by 256 bytes and loop.
308 la ($inp,"256($inp)");
309 la ($out,"256($out)");
310 brctg ("%r1",".Lvx_4x");
# End of the 256-byte loop.  Several labels and branches between the
# instructions below are not part of this excerpt, so the grouping in these
# comments is by what each visible instruction does.
313 LABEL (".Lvx_4x_done");
# Clean-up: zero %v16..%v31, then overwrite 32 bytes of the stack copy of
# the state starting at offset 16 (the key words) with zeros.
320 vzero ("%v$_") for (16..31); # wipe ks and key copy
321 vstm ("%v16","%v17","16+$off($sp)");
# Restore %v8..%v15 (only emitted when $z is set).
322 vlm ("%v8","%v15","8($sp)") if ($z);
# Release the frame and restore the registers saved on entry.
324 la ($sp,"$frame($sp)");
325 &{$z? \&lmg:\&lm} ("%r6","%r7","6*$SIZE_T($sp)");
328 ld ("%f4","16*$SIZE_T+2*8($sp)");
329 ld ("%f6","16*$SIZE_T+3*8($sp)");
330 vzero ("%v$_") for (8..15);
# Remainder handling: branch to the vector remainder code on condition
# code 2 (the instruction setting the condition code is not visible here).
338 brc (2,".Lvx_rem_g64"); # cc==2?
# Otherwise hand the remainder to the scalar routine: point $counter at
# the updated counter||nonce in the stack copy, open a standard frame,
# and call _s390x_chacha_novx.  %r14 is saved/restored around the call and
# $counter is kept in %r7 across it.
340 lghi ("%r1",-$stdframe);
342 la ($counter,"48+$off($sp)"); # load updated iv
343 ar ($len,"%r0"); # restore len
345 lgr ("%r7",$counter);
346 &{$z? \&stg:\&st} ("%r14","14*$SIZE_T+$frame($sp)");
347 la ($sp,"0(%r1,$sp)");
349 bras ("%r14","_s390x_chacha_novx");
351 la ($sp,"$stdframe($sp)");
352 &{$z? \&lg:\&l} ("%r14","14*$SIZE_T+$frame($sp)");
353 lgr ($counter,"%r7");
# Vector remainder path.  It generates four blocks of keystream the same
# way as the main loop, then consumes them 64 bytes at a time, leaving to
# the tail code (condition code 4 branches) when the remaining length runs
# out.  The instructions that set the condition code before each brc are
# not part of this excerpt.
357 LABEL (".Lvx_rem_g64");
358 vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial state
360 vl ("%v31","16(%r7)");
361 vaf ("%v12","%v12","%v31"); # increment counter
363 vlr (@v[$_],"%v$_") for (0..15); # state = initial state
369 LABEL (".Loop_vx_rem");
370 VX_ROUND( 0, 4, 8,12); # column round
371 VX_ROUND( 0, 5,10,15); # diagonal round
372 brct ("%r6",".Loop_vx_rem");
374 vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial
376 vlm ("%v6","%v7","32(%r7)"); # load vperm operands
# Keystream for block 1 goes to %v8..%v11, block 2 to %v12..%v15.
378 for (0..3) { # blocks 1,2
379 vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
380 vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]);
381 vperm ("%v".($_+8),"%v0","%v1","%v6");
382 vperm ("%v".($_+12),"%v0","%v1","%v7");
# First 64 bytes: xor with %v8..%v11.
384 vlm ("%v0","%v3","0($inp)"); # load in
385 vx ("%v$_","%v$_","%v".($_+8)) for (0..3); # out = in ^ ks
386 vstm ("%v0","%v3","0($out)"); # store out
388 la ($inp,"64($inp)");
389 la ($out,"64($out)");
392 brc (4,".Lvx_tail"); # cc==4?
# Second 64 bytes: xor with %v12..%v15.
394 vlm ("%v0","%v3","0($inp)"); # load in
395 vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks
396 vstm ("%v0","%v3","0($out)"); # store out
# Note the swapped destinations compared with the loop above: the %v6
# permutation now lands in %v12..%v15 and the %v7 one in %v8..%v11, so the
# next keystream block is again found in %v12..%v15.
399 for (0..3) { # blocks 3,4
400 vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
401 vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]);
402 vperm ("%v".($_+12),"%v0","%v1","%v6");
403 vperm ("%v".($_+8),"%v0","%v1","%v7");
405 la ($inp,"64($inp)");
406 la ($out,"64($out)");
409 brc (4,".Lvx_tail"); # cc==4?
# Third 64 bytes: xor with %v12..%v15.
411 vlm ("%v0","%v3","0($inp)"); # load in
412 vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks
413 vstm ("%v0","%v3","0($out)"); # store out
416 la ($inp,"64($inp)");
417 la ($out,"64($out)");
# Move %v8..%v11 into %v12..%v15 so the tail code below always reads its
# keystream from %v12..%v15.
420 vlr ("%v".($_+4),"%v$_") for (8..11);
# Tail: process the final partial block in 16-byte steps using
# load/store-with-length (vll/vstl) and keystream from %v12..%v15.  The
# loop header around the three lines using $_ is not part of this excerpt.
425 ar ($len,"%r0"); # restore $len
430 vll ("%v0",$len,($_*16)."($inp)");
431 vx ("%v0","%v0","%v".($_+12));
432 vstl ("%v0",$len,($_*16)."($out)");
434 brc (4,".Lvx_done"); # cc==4?
# Last 16-byte step uses %v15.
436 vll ("%v0",$len,"3*16($inp)");
437 vx ("%v0","%v0","%v15");
438 vstl ("%v0",$len,"3*16($out)");
440 SIZE ("ChaCha20_ctr32",".-ChaCha20_ctr32");
# Scalar (non-vector) code path: entry and prologue.
# Frame: standard frame plus 20 words.  Words 0..15 (at $stdframe) hold the
# copy of the key schedule; the round code additionally uses the slots at
# $stdframe+4*8+4*8 .. +4*11 (words 16..19) to off-load state words 8..11.
445 my $frame=$stdframe+4*20;
447 TYPE ("_s390x_chacha_novx","\@function");
449 LABEL ("_s390x_chacha_novx");
# Load-and-test $len against itself to set the condition code; ltgr is the
# 64-bit form, ltr the 32-bit form, chosen by $z like every other pair here.
450 &{$z? \&ltgr:\&ltr} ($len,$len); # $len==0?
452 &{$z? \&aghi:\&ahi} ($len,-64);
453 &{$z? \&lghi:\&lhi} ("%r1",-$frame);
# Save %r6..%r15 at positive offsets from the incoming $sp.
454 &{$z? \&stmg:\&stm} ("%r6","%r15","6*$SIZE_T($sp)");
# Keep only the distance from input to output in $out; the output pointer
# is rebuilt later from the current input pointer.
455 &{$z? \&slgr:\&slr} ($out,$inp); # difference
456 la ($len,"0($inp,$len)"); # end of input minus 64
457 larl ("%r7",".Lsigma");
# Allocate the frame and store %r0 at its base (the instruction that sets
# %r0 is not part of this excerpt).
459 la ($sp,"0(%r1,$sp)");
460 &{$z? \&stg:\&st} ("%r0","0($sp)");
# Gather the 64-byte initial state in %r6..%r13: sigma, key, counter/nonce.
462 lmg ("%r8","%r11","0($key)"); # load key
463 lmg ("%r12","%r13","0($counter)"); # load counter
464 lmg ("%r6","%r7","0(%r7)"); # load sigma constant
# %r14 becomes the running input pointer.
466 la ("%r14","0($inp)");
# Stash the out-inp difference and the end-of-input-minus-64 pointer in the
# slots at $frame+3*$SIZE_T and $frame+4*$SIZE_T.
467 &{$z? \&stg:\&st} ($out,"$frame+3*$SIZE_T($sp)");
468 &{$z? \&stg:\&st} ($len,"$frame+4*$SIZE_T($sp)");
469 stmg ("%r6","%r13","$stdframe($sp)");# copy key schedule to stack
# The upper 32 bits of %r12 hold state word 12; move them into @x[12].
470 srlg (@x[12],"%r12",32); # 32-bit counter value
# Per-block loop of the scalar path.  Each pass reloads the state from the
# key-schedule copy on the stack, runs the rounds, adds the key schedule
# back, and xors the result with 64 bytes of input.  Some instructions
# (round calls, branches, part of the accumulation) are not part of this
# excerpt.
474 LABEL (".Loop_outer");
475 lm (@x[0],@x[7],"$stdframe+4*0($sp)"); # load x[0]-x[7]
476 lm (@t[0],@t[1],"$stdframe+4*10($sp)"); # load x[10]-x[11]
477 lm (@x[13],@x[15],"$stdframe+4*13($sp)"); # load x[13]-x[15]
478 stm (@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11]
479 lm (@t[0],@t[1],"$stdframe+4*8($sp)"); # load x[8]-x[9]
480 st (@x[12],"$stdframe+4*12($sp)"); # save counter
# %r14 doubles as the round-loop counter below, so park the input pointer.
481 &{$z? \&stg:\&st} ("%r14","$frame+2*$SIZE_T($sp)");# save input pointer
489 brct ("%r14",".Loop");
491 &{$z? \&lg:\&l} ("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer
492 stm (@t[0],@t[1],"$stdframe+4*8+4*8($sp)"); # offload x[8]-x[9]
# Reload the two values stashed at $frame+3*$SIZE_T / +4*$SIZE_T by the
# prologue: the out-inp difference into @t[0], the end pointer into @t[1].
493 &{$z? \&lmg:\&lm} (@t[0],@t[1],"$frame+3*$SIZE_T($sp)");
495 al (@x[0],"$stdframe+4*0($sp)"); # accumulate key schedule
496 al (@x[1],"$stdframe+4*1($sp)");
497 al (@x[2],"$stdframe+4*2($sp)");
498 al (@x[3],"$stdframe+4*3($sp)");
499 al (@x[4],"$stdframe+4*4($sp)");
500 al (@x[5],"$stdframe+4*5($sp)");
501 al (@x[6],"$stdframe+4*6($sp)");
502 al (@x[7],"$stdframe+4*7($sp)");
511 al (@x[12],"$stdframe+4*12($sp)");
512 al (@x[13],"$stdframe+4*13($sp)");
513 al (@x[14],"$stdframe+4*14($sp)");
514 al (@x[15],"$stdframe+4*15($sp)");
# Byte-reverse each word before it is combined with the input.
515 lrvr (@x[12],@x[12]);
516 lrvr (@x[13],@x[13]);
517 lrvr (@x[14],@x[14]);
518 lrvr (@x[15],@x[15]);
520 la (@t[0],"0(@t[0],%r14)"); # reconstruct output pointer
# Compare the input pointer with the end pointer (the branch that acts on
# this comparison is not part of this excerpt).
521 &{$z? \&clgr:\&clr} ("%r14",@t[1]);
# Full block: xor and store, interleaved with loading state words 8..11
# into the already-stored @x[0..3] and adding their key-schedule words.
524 x (@x[0],"4*0(%r14)"); # xor with input
525 x (@x[1],"4*1(%r14)");
526 st (@x[0],"4*0(@t[0])"); # store output
527 x (@x[2],"4*2(%r14)");
528 st (@x[1],"4*1(@t[0])");
529 x (@x[3],"4*3(%r14)");
530 st (@x[2],"4*2(@t[0])");
531 x (@x[4],"4*4(%r14)");
532 st (@x[3],"4*3(@t[0])");
533 lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)"); # load x[8]-x[11]
534 x (@x[5],"4*5(%r14)");
535 st (@x[4],"4*4(@t[0])");
536 x (@x[6],"4*6(%r14)");
537 al (@x[0],"$stdframe+4*8($sp)");
538 st (@x[5],"4*5(@t[0])");
539 x (@x[7],"4*7(%r14)");
540 al (@x[1],"$stdframe+4*9($sp)");
541 st (@x[6],"4*6(@t[0])");
542 x (@x[12],"4*12(%r14)");
543 al (@x[2],"$stdframe+4*10($sp)");
544 st (@x[7],"4*7(@t[0])");
545 x (@x[13],"4*13(%r14)");
546 al (@x[3],"$stdframe+4*11($sp)");
547 st (@x[12],"4*12(@t[0])");
548 x (@x[14],"4*14(%r14)");
549 st (@x[13],"4*13(@t[0])");
550 x (@x[15],"4*15(%r14)");
551 st (@x[14],"4*14(@t[0])");
553 st (@x[15],"4*15(@t[0])");
558 x (@x[0],"4*8(%r14)");
# @x[12] += saved counter word; the instruction that sets @x[12] just
# before this add is not part of this excerpt.
559 al (@x[12],"$stdframe+4*12($sp)"); # increment counter
560 x (@x[1],"4*9(%r14)");
561 st (@x[0],"4*8(@t[0])");
562 x (@x[2],"4*10(%r14)");
563 st (@x[1],"4*9(@t[0])");
564 x (@x[3],"4*11(%r14)");
565 st (@x[2],"4*10(@t[0])");
566 st (@x[3],"4*11(@t[0])");
# Compare against the end pointer first, then step the input pointer by 64.
568 &{$z? \&clgr:\&clr} ("%r14",@t[1]); # done yet?
569 la ("%r14","64(%r14)");
# Exit of the scalar path: overwrite 32 bytes at word 4 and 32 bytes at
# word 12 of the stack copy with %r0..%r3 (the instructions that clear
# those registers are not part of this excerpt), then restore %r6..%r15
# from the save area above this frame.
577 stmg ("%r0","%r3","$stdframe+4*4($sp)"); # wipe key copy
578 stmg ("%r0","%r3","$stdframe+4*12($sp)");
580 &{$z? \&lmg:\&lm} ("%r6","%r15","$frame+6*$SIZE_T($sp)");
# Partial final block.  @t[1] becomes (end pointer + 64) - input pointer,
# i.e. the count used by the byte loop below.  The finished keystream words
# are written over the key-schedule copy on the stack so they can be read
# back one byte at a time.
585 la (@t[1],"64($t[1])");
586 stm (@x[0],@x[7],"$stdframe+4*0($sp)");
587 &{$z? \&slgr:\&slr} (@t[1],"%r14");
588 lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
# @x[6] is the byte index, starting at 0.
589 &{$z? \&lghi:\&lhi} (@x[6],0);
590 stm (@x[12],@x[15],"$stdframe+4*12($sp)");
591 al (@x[0],"$stdframe+4*8($sp)");
592 al (@x[1],"$stdframe+4*9($sp)");
593 al (@x[2],"$stdframe+4*10($sp)");
594 al (@x[3],"$stdframe+4*11($sp)");
599 stm (@x[0],@x[3],"$stdframe+4*8($sp)");
# Byte loop: load one input byte and one byte from the stack area, store a
# byte to the output, advance the index; repeat @t[1] times.  The
# instruction combining the two bytes is not part of this excerpt.
601 LABEL (".Loop_tail");
602 llgc (@x[4],"0(@x[6],%r14)");
603 llgc (@x[5],"$stdframe(@x[6],$sp)");
605 stc (@x[5],"0(@x[6],@t[0])");
606 la (@x[6],"1(@x[6])");
607 brct (@t[1],".Loop_tail");
610 SIZE ("_s390x_chacha_novx",".-_s390x_chacha_novx");
# Constant table, four 16-byte rows.  The code loads offset 0 as the sigma
# constant, offset 16 as the per-block counter increment, and offsets 32/48
# as the two vperm operands.  NOTE(review): the label line is not part of
# this excerpt; the code addresses this table through .Lsigma - confirm.
617 LONG (0x61707865,0x3320646e,0x79622d32,0x6b206574); # endian-neutral sigma
618 LONG (0x00000000,0x00000001,0x00000002,0x00000003); # vaf counter increment
619 LONG (0x03020100,0x07060504,0x13121110,0x17161514); # vperm serialization
620 LONG (0x0b0a0908,0x0f0e0d0c,0x1b1a1918,0x1f1e1d1c); # vperm serialization
# Identification string emitted into the object.
621 ASCIZ ("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");