3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
14 # 3 times faster than compiler-generated code.
# True for any $flavour string that contains "31" or "32".
18 if ($flavour =~ /3[12]/) {
# Consume the remaining command-line arguments until one looks like an
# output file name: a word character, then word characters or dashes, a dot,
# and an extension.  Arguments that do not match are discarded.  $output is
# left false when the argument list runs out without a match.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to the selected file.  With no file name STDOUT is left
# as inherited, so the script can still be run as a filter.  A file that
# was named but cannot be opened is a hard error: silently falling back to
# the inherited STDOUT would leave the requested file missing or stale.
if (defined($output) && length($output)) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}
# AUTOLOAD: catch-all that Perl invokes for a call to a sub that does not
# exist.  The name of the missing sub is treated as an instruction mnemonic
# and the call's arguments as its operands; one line of the form
# "<TAB>mnemonic<TAB>op1,op2,..." is appended to $code.
29 sub AUTOLOAD() # thunk [simplified] x86-style perlasm
# $AUTOLOAD holds the fully qualified name; drop everything up to the last
# "::" so only the bare sub name remains.
30 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
# Operands are joined with commas and the line is newline-terminated.
31 $code .= "\t$opcode\t".join(',',@_)."\n";
# Frame sizes in bytes: 16 slots of $SIZE_T bytes plus 4*8 bytes, and on top
# of that a further 4*20 bytes.
my $stdframe = 4*8 + 16*$SIZE_T;
my $frame    = 4*20 + $stdframe;

# Symbolic names for %r2..%r6, in that order.
my ($out, $inp, $len, $key, $counter) = map { '%r' . $_ } 2 .. 6;

# Register name for each of the 16 state words.  Words 8..11 get the
# placeholder "%rx", which is not a real register name: those words are not
# given a register of their own in this table.
my @x = map { '%r' . $_ } (0 .. 7, ('x') x 4, 10 .. 13);

# Two further registers, %r8 and %r9.
my @t = ('%r8', '%r9');
# Arguments: the four state-word indices (a, b, c, d) of the first
# quarter-round, e.g. (0,4,8,12) or (0,5,10,15).
45 my ($a0,$b0,$c0,$d0)=@_;
# Derive the indices of the other three quarter-rounds.  Each index keeps
# its row base ($_&~3, a multiple of 4) and advances one position within the
# row, wrapping after 3 (($_+1)&3).  (0,4,8,12) therefore yields (1,5,9,13),
# and (0,5,10,15) yields (1,6,11,12).
46 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
47 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
48 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
# The two @t register names, each wrapped in literal double quotes; these
# registers hold the pair of 'c' words currently in use.
49 my ($xc,$xc_)=map("\"$_\"",@t);
# Lexical @x that shadows the outer @x from here on: the same register
# names, each wrapped in literal double quotes, so that interpolating
# @x[...] into the strings below yields quoted string arguments.
50 my @x=map("\"$_\"",@x);
52 # Consider order in which variables are addressed by their
57 # 0 4 8 12 < even round
61 # 0 5 10 15 < odd round
66 # 'a', 'b' and 'd's are permanently allocated in registers,
67 # @x[0..7,12..15], while 'c's are maintained in memory. If
68 # you observe 'c' column, you'll notice that pair of 'c's is
69 # invariant between rounds. This means that we have to reload
70 # them once per round, in the middle. This is why you'll see
71 # 'c' stores and loads in the middle, but none in the beginning
# Each element below is a string of Perl source: an instruction name called
# as a sub, with its operands as arguments.  Two quarter-rounds are
# interleaved instruction by instruction.
# a += b
75 "&alr (@x[$a0],@x[$b0])", # Q1
76 "&alr (@x[$a1],@x[$b1])", # Q2
# d ^= a
77 "&xr (@x[$d0],@x[$a0])",
78 "&xr (@x[$d1],@x[$a1])",
# d = d rotated left by 16
79 "&rll (@x[$d0],@x[$d0],16)",
80 "&rll (@x[$d1],@x[$d1],16)",
# c += d, for the 'c' held in $xc_
83 "&alr ($xc_,@x[$d1])",
# b = b rotated left by 12
86 "&rll (@x[$b0],@x[$b0],12)",
87 "&rll (@x[$b1],@x[$b1],12)",
# a += b
89 "&alr (@x[$a0],@x[$b0])",
90 "&alr (@x[$a1],@x[$b1])",
# d ^= a
91 "&xr (@x[$d0],@x[$a0])",
92 "&xr (@x[$d1],@x[$a1])",
# d = d rotated left by 8
93 "&rll (@x[$d0],@x[$d0],8)",
94 "&rll (@x[$d1],@x[$d1],8)",
# c += d, for the 'c' held in $xc_
97 "&alr ($xc_,@x[$d1])",
# b = b rotated left by 7
100 "&rll (@x[$b0],@x[$b0],7)",
101 "&rll (@x[$b1],@x[$b1],7)",
# Swap the 'c' pair: write the two words just used back to the stack slots
# starting at word $c0, then read the two words starting at word $c2 into
# the same registers.  The single quotes are literal characters inside the
# double-quoted string, so $stdframe, $c0/$c2 and $sp are interpolated here,
# when the list is built.
103 "&stm ($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')", # reload pair of 'c's
104 "&lm ($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')",
# Second pair of quarter-rounds, same instruction pattern as above.
# a += b
106 "&alr (@x[$a2],@x[$b2])", # Q3
107 "&alr (@x[$a3],@x[$b3])", # Q4
# d ^= a
108 "&xr (@x[$d2],@x[$a2])",
109 "&xr (@x[$d3],@x[$a3])",
# d = d rotated left by 16
110 "&rll (@x[$d2],@x[$d2],16)",
111 "&rll (@x[$d3],@x[$d3],16)",
# c += d, for both 'c's of the pair
113 "&alr ($xc,@x[$d2])",
114 "&alr ($xc_,@x[$d3])",
# b ^= c, for the quarter-round whose 'c' is in $xc_
116 "&xr (@x[$b3],$xc_)",
# b = b rotated left by 12
117 "&rll (@x[$b2],@x[$b2],12)",
118 "&rll (@x[$b3],@x[$b3],12)",
# a += b
120 "&alr (@x[$a2],@x[$b2])",
121 "&alr (@x[$a3],@x[$b3])",
# d ^= a
122 "&xr (@x[$d2],@x[$a2])",
123 "&xr (@x[$d3],@x[$a3])",
# d = d rotated left by 8
124 "&rll (@x[$d2],@x[$d2],8)",
125 "&rll (@x[$d3],@x[$d3],8)",
# c += d, for both 'c's of the pair
127 "&alr ($xc,@x[$d2])",
128 "&alr ($xc_,@x[$d3])",
# b ^= c, for the quarter-round whose 'c' is in $xc_
130 "&xr (@x[$b3],$xc_)",
# b = b rotated left by 7
131 "&rll (@x[$b2],@x[$b2],7)",
132 "&rll (@x[$b3],@x[$b3],7)"
139 .globl ChaCha20_ctr32
140 .type ChaCha20_ctr32,\@function
143 cl${g}ije $len,0,.Lno_data # $len==0?
146 stm${g} %r6,%r15,`6*$SIZE_T`($sp)
147 sl${g}r $out,$inp # difference
148 la $len,0($inp,$len) # end of input minus 64
154 lmg %r8,%r11,0($key) # load key
155 lmg %r12,%r13,0($counter) # load counter
156 lmg %r6,%r7,0(%r7) # load sigma constant
159 st${g} $out,$frame+3*$SIZE_T($sp)
160 st${g} $len,$frame+4*$SIZE_T($sp)
161 stmg %r6,%r13,$stdframe($sp) # copy key schedule to stack
162 srlg @x[12],%r12,32 # 32-bit counter value
167 lm @x[0],@x[7],$stdframe+4*0($sp) # load x[0]-x[7]
168 lm @t[0],@t[1],$stdframe+4*10($sp) # load x[10]-x[11]
169 lm @x[13],@x[15],$stdframe+4*13($sp) # load x[13]-x[15]
170 stm @t[0],@t[1],$stdframe+4*8+4*10($sp) # offload x[10]-x[11]
171 lm @t[0],@t[1],$stdframe+4*8($sp) # load x[8]-x[9]
172 st @x[12],$stdframe+4*12($sp) # save counter
173 st${g} %r14,$frame+2*$SIZE_T($sp) # save input pointer
# Emit the even round (words 0,4,8,12 first) and then the odd round (words
# 0,5,10,15 first).  ROUND returns a list of Perl source strings; evaluating
# each one calls the instruction-named sub it contains.
for my $insn (ROUND(0, 4, 8, 12)) { eval $insn; }
for my $insn (ROUND(0, 5, 10, 15)) { eval $insn; }
185 l${g} %r14,$frame+2*$SIZE_T($sp) # pull input pointer
186 stm @t[0],@t[1],$stdframe+4*8+4*8($sp) # offload x[8]-x[9]
187 lm${g} @t[0],@t[1],$frame+3*$SIZE_T($sp)
189 al @x[0],$stdframe+4*0($sp) # accumulate key schedule
190 al @x[1],$stdframe+4*1($sp)
191 al @x[2],$stdframe+4*2($sp)
192 al @x[3],$stdframe+4*3($sp)
193 al @x[4],$stdframe+4*4($sp)
194 al @x[5],$stdframe+4*5($sp)
195 al @x[6],$stdframe+4*6($sp)
196 al @x[7],$stdframe+4*7($sp)
205 al @x[12],$stdframe+4*12($sp)
206 al @x[13],$stdframe+4*13($sp)
207 al @x[14],$stdframe+4*14($sp)
208 al @x[15],$stdframe+4*15($sp)
214 la @t[0],0(@t[0],%r14) # reconstruct output pointer
218 x @x[0],4*0(%r14) # xor with input
220 st @x[0],4*0(@t[0]) # store output
227 lm @x[0],@x[3],$stdframe+4*8+4*8($sp) # load x[8]-x[11]
231 al @x[0],$stdframe+4*8($sp)
234 al @x[1],$stdframe+4*9($sp)
237 al @x[2],$stdframe+4*10($sp)
240 al @x[3],$stdframe+4*11($sp)
241 st @x[12],4*12(@t[0])
243 st @x[13],4*13(@t[0])
245 st @x[14],4*14(@t[0])
247 st @x[15],4*15(@t[0])
253 al @x[12],$stdframe+4*12($sp) # increment counter
263 cl${g}r %r14,@t[1] # done yet?
271 stmg %r0,%r3,$stdframe+4*4($sp) # wipe key copy
272 stmg %r0,%r3,$stdframe+4*12($sp)
274 lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
281 stm @x[0],@x[7],$stdframe+4*0($sp)
283 lm @x[0],@x[3],$stdframe+4*8+4*8($sp)
285 stm @x[12],@x[15],$stdframe+4*12($sp)
286 al @x[0],$stdframe+4*8($sp)
287 al @x[1],$stdframe+4*9($sp)
288 al @x[2],$stdframe+4*10($sp)
289 al @x[3],$stdframe+4*11($sp)
294 stm @x[0],@x[3],$stdframe+4*8+4*8($sp)
297 llgc @x[4],0(@x[6],%r14)
298 llgc @x[5],$stdframe(@x[6],$sp)
300 stc @x[5],0(@x[6],@t[0])
302 brct @t[1],.Loop_tail
305 .size ChaCha20_ctr32,.-ChaCha20_ctr32
309 .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
310 .asciz "ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
# Post-process the accumulated $code one line at a time; $_ is the current
# line.
314 foreach (split("\n",$code)) {
# Replace every backtick-delimited span with the result of evaluating its
# contents as Perl, so an arithmetic expression written between backticks
# is replaced by its value.
315 s/\`([^\`]*)\`/eval $1/ge;