2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for s390x.
20 # Below code is [lane complementing] KECCAK_2X implementation (see
21 # sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
22 # instead of actually unrolling the loop pair-wise I simply flip
23 # pointers to T[][] and A[][] at the end of round. Since number of
24 # rounds is even, last round writes to A[][] and everything works out.
25 # In the nutshell it's transliteration of x86_64 module, because both
26 # architectures have similar capabilities/limitations. Performance
27 # measurement is problematic as I don't have access to an idle system.
28 # It looks like z13 processes one byte [out of long message] in ~14
29 # cycles. At least the result is consistent with estimate based on
30 # amount of instruction and assumed instruction issue rate. It's ~2.5x
31 # faster than compiler-generated code.
35 if ($flavour =~ /3[12]/) {
43 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
44 open STDOUT,">$output";
46 my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
48 my @C = map("%r$_",(0,1,5..7));
49 my @D = map("%r$_",(8..12));
50 my @T = map("%r$_",(13..14));
51 my ($src,$dst,$iotas) = map("%r$_",(2..4));
54 $stdframe=16*$SIZE_T+4*8;
55 $frame=$stdframe+25*8;
57 my @rhotates = ([ 0, 1, 62, 28, 27 ],
58 [ 36, 44, 6, 55, 20 ],
59 [ 3, 10, 43, 25, 39 ],
60 [ 41, 45, 15, 21, 8 ],
61 [ 18, 2, 61, 56, 14 ]);
63 { my @C = @C; # copy, because we mess them up...
69 .type __KeccakF1600,\@function
72 st${g} %r14,$SIZE_T*14($sp)
73 lg @C[0],$A[4][0]($src)
74 lg @C[1],$A[4][1]($src)
75 lg @C[2],$A[4][2]($src)
76 lg @C[3],$A[4][3]($src)
77 lg @C[4],$A[4][4]($src)
83 lg @D[0],$A[0][0]($src)
84 lg @D[1],$A[1][1]($src)
85 lg @D[2],$A[2][2]($src)
86 lg @D[3],$A[3][3]($src)
89 xg @C[1],$A[0][1]($src)
90 xg @C[2],$A[0][2]($src)
91 xg @C[3],$A[0][3]($src)
93 xg @C[4],$A[0][4]($src)
95 xg @C[0],$A[1][0]($src)
97 xg @C[2],$A[1][2]($src)
98 xg @C[3],$A[1][3]($src)
99 xg @C[4],$A[1][4]($src)
101 xg @C[0],$A[2][0]($src)
102 xg @C[1],$A[2][1]($src)
104 xg @C[3],$A[2][3]($src)
105 xg @C[4],$A[2][4]($src)
107 xg @C[0],$A[3][0]($src)
108 xg @C[1],$A[3][1]($src)
109 xg @C[2],$A[3][2]($src)
111 xg @C[4],$A[3][4]($src)
115 xgr @C[2],@C[0] # D[1] = ROL64(C[2], 1) ^ C[0]
118 xgr @C[0],@C[3] # D[4] = ROL64(C[0], 1) ^ C[3]
121 xgr @C[3],@C[1] # D[2] = ROL64(C[3], 1) ^ C[1]
124 xgr @C[1],@C[4] # D[0] = ROL64(C[1], 1) ^ C[4]
127 xgr @C[4],@T[0] # D[3] = ROL64(C[4], 1) ^ C[2]
129 (@D[0..4], @C) = (@C[1..4,0], @D);
134 rllg @C[1],@C[1],$rhotates[1][1]
136 rllg @C[2],@C[2],$rhotates[2][2]
141 rllg @C[3],@C[3],$rhotates[3][3]
142 xgr @C[1],@C[0] # C[0] ^ ( C[1] | C[2])
143 rllg @C[4],@C[4],$rhotates[4][4]
146 stg @C[1],$A[0][0]($dst) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
150 lghi @C[1],-1 # no 'not' instruction :-(
151 xgr @C[4],@C[2] # C[2] ^ ( C[4] & C[3])
152 xgr @C[2],@C[1] # not @C[2]
153 stg @C[4],$A[0][2]($dst) # R[0][2] = C[2] ^ ( C[4] & C[3])
155 xgr @C[2],@T[0] # C[1] ^ (~C[2] | C[3])
158 stg @C[2],$A[0][1]($dst) # R[0][1] = C[1] ^ (~C[2] | C[3])
159 xgr @T[0],@T[1] # C[4] ^ ( C[1] & C[0])
161 stg @T[0],$A[0][4]($dst) # R[0][4] = C[4] ^ ( C[1] & C[0])
162 xgr @T[1],@C[3] # C[3] ^ ( C[4] | C[0])
163 stg @T[1],$A[0][3]($dst) # R[0][3] = C[3] ^ ( C[4] | C[0])
166 lg @C[0],$A[0][3]($src)
167 lg @C[4],$A[4][2]($src)
168 lg @C[3],$A[3][1]($src)
169 lg @C[1],$A[1][4]($src)
170 lg @C[2],$A[2][0]($src)
174 rllg @C[0],@C[0],$rhotates[0][3]
176 rllg @C[4],@C[4],$rhotates[4][2]
178 rllg @C[3],@C[3],$rhotates[3][1]
183 rllg @C[1],@C[1],$rhotates[1][4]
184 xgr @C[0],@C[3] # C[3] ^ (C[0] | C[4])
185 rllg @C[2],@C[2],$rhotates[2][0]
186 stg @C[0],$A[1][3]($dst) # R[1][3] = C[3] ^ (C[0] | C[4])
190 lghi @C[0],-1 # no 'not' instruction :-(
191 xgr @C[1],@C[4] # C[4] ^ (C[1] & C[0])
192 xgr @C[4],@C[0] # not @C[4]
193 stg @C[1],$A[1][4]($dst) # R[1][4] = C[4] ^ (C[1] & C[0])
196 xgr @C[4],@C[2] # C[2] ^ (~C[4] | C[3])
199 stg @C[4],$A[1][2]($dst) # R[1][2] = C[2] ^ (~C[4] | C[3])
200 xgr @C[3],@T[1] # C[1] ^ (C[3] & C[2])
202 stg @C[3],$A[1][1]($dst) # R[1][1] = C[1] ^ (C[3] & C[2])
203 xgr @T[1],@T[0] # C[0] ^ (C[1] | C[2])
204 stg @T[1],$A[1][0]($dst) # R[1][0] = C[0] ^ (C[1] | C[2])
207 lg @C[2],$A[2][3]($src)
208 lg @C[3],$A[3][4]($src)
209 lg @C[1],$A[1][2]($src)
210 lg @C[4],$A[4][0]($src)
211 lg @C[0],$A[0][1]($src)
215 rllg @C[2],@C[2],$rhotates[2][3]
217 rllg @C[3],@C[3],$rhotates[3][4]
219 rllg @C[1],@C[1],$rhotates[1][2]
224 rllg @C[4],@C[4],$rhotates[4][0]
225 xgr @C[2],@C[1] # C[1] ^ ( C[2] & C[3])
226 lghi @T[1],-1 # no 'not' instruction :-(
227 stg @C[2],$A[2][1]($dst) # R[2][1] = C[1] ^ ( C[2] & C[3])
229 xgr @C[3],@T[1] # not @C[3]
232 rllg @C[0],@C[0],$rhotates[0][1]
233 xgr @C[4],@T[0] # C[2] ^ ( C[4] & ~C[3])
235 stg @C[4],$A[2][2]($dst) # R[2][2] = C[2] ^ ( C[4] & ~C[3])
236 xgr @T[0],@C[0] # C[0] ^ ( C[2] | C[1])
239 stg @T[0],$A[2][0]($dst) # R[2][0] = C[0] ^ ( C[2] | C[1])
240 xgr @C[1],@T[1] # C[4] ^ ( C[1] & C[0])
242 stg @C[1],$A[2][4]($dst) # R[2][4] = C[4] ^ ( C[1] & C[0])
243 xgr @C[0],@C[3] # ~C[3] ^ ( C[0] | C[4])
244 stg @C[0],$A[2][3]($dst) # R[2][3] = ~C[3] ^ ( C[0] | C[4])
247 lg @C[2],$A[2][1]($src)
248 lg @C[3],$A[3][2]($src)
249 lg @C[1],$A[1][0]($src)
250 lg @C[4],$A[4][3]($src)
251 lg @C[0],$A[0][4]($src)
255 rllg @C[2],@C[2],$rhotates[2][1]
257 rllg @C[3],@C[3],$rhotates[3][2]
259 rllg @C[1],@C[1],$rhotates[1][0]
261 rllg @C[4],@C[4],$rhotates[4][3]
265 lghi @T[1],-1 # no 'not' instruction :-(
266 xgr @C[2],@C[1] # C[1] ^ ( C[2] | C[3])
267 xgr @C[3],@T[1] # not @C[3]
268 stg @C[2],$A[3][1]($dst) # R[3][1] = C[1] ^ ( C[2] | C[3])
272 rllg @C[0],@C[0],$rhotates[0][4]
273 xgr @C[4],@T[0] # C[2] ^ ( C[4] | ~C[3])
275 stg @C[4],$A[3][2]($dst) # R[3][2] = C[2] ^ ( C[4] | ~C[3])
276 xgr @T[0],@C[0] # C[0] ^ ( C[2] & C[1])
279 stg @T[0],$A[3][0]($dst) # R[3][0] = C[0] ^ ( C[2] & C[1])
280 xgr @C[1],@T[1] # C[4] ^ ( C[1] | C[0])
282 stg @C[1],$A[3][4]($dst) # R[3][4] = C[4] ^ ( C[1] | C[0])
283 xgr @C[0],@C[3] # ~C[3] ^ ( C[0] & C[4])
284 stg @C[0],$A[3][3]($dst) # R[3][3] = ~C[3] ^ ( C[0] & C[4])
287 xg @D[2],$A[0][2]($src)
288 xg @D[3],$A[1][3]($src)
289 xg @D[1],$A[4][1]($src)
290 xg @D[4],$A[2][4]($src)
291 xgr $dst,$src # xchg $dst,$src
292 rllg @D[2],@D[2],$rhotates[0][2]
293 xg @D[0],$A[3][0]($src)
294 rllg @D[3],@D[3],$rhotates[1][3]
296 rllg @D[1],@D[1],$rhotates[4][1]
298 rllg @D[4],@D[4],$rhotates[2][4]
304 lghi @T[1],-1 # no 'not' instruction :-(
305 xgr @C[0],@C[4] # C[4] ^ ( C[0] & C[1])
306 xgr @C[1],@T[1] # not @C[1]
307 stg @C[0],$A[4][4]($src) # R[4][4] = C[4] ^ ( C[0] & C[1])
311 rllg @D[0],@D[0],$rhotates[3][0]
312 xgr @C[2],@T[0] # C[0] ^ ( C[2] & ~C[1])
314 stg @C[2],$A[4][0]($src) # R[4][0] = C[0] ^ ( C[2] & ~C[1])
315 xgr @T[0],@C[3] # C[3] ^ ( C[0] | C[4])
318 stg @T[0],$A[4][3]($src) # R[4][3] = C[3] ^ ( C[0] | C[4])
319 xgr @C[4],@T[1] # C[2] ^ ( C[4] & C[3])
321 stg @C[4],$A[4][2]($src) # R[4][2] = C[2] ^ ( C[4] & C[3])
322 xgr @C[3],@C[1] # ~C[1] ^ ( C[2] | C[3])
324 lgr @C[1],@C[0] # harmonize with the loop top
326 stg @C[3],$A[4][1]($src) # R[4][1] = ~C[1] ^ ( C[2] | C[3])
331 l${g} %r14,$SIZE_T*14($sp)
333 .size __KeccakF1600,.-__KeccakF1600
339 .type KeccakF1600,\@function
344 stm${g} %r6,%r15,$SIZE_T*6($sp)
349 lghi @D[0],-1 # no 'not' instruction :-(
355 xg @D[0],$A[0][1]($src)
356 xg @D[1],$A[0][2]($src)
357 xg @D[2],$A[1][3]($src)
358 xg @D[3],$A[2][2]($src)
359 xg @D[4],$A[3][2]($src)
360 xg @T[0],$A[4][0]($src)
361 stmg @D[0],@D[1],$A[0][1]($src)
362 stg @D[2],$A[1][3]($src)
363 stg @D[3],$A[2][2]($src)
364 stg @D[4],$A[3][2]($src)
365 stg @T[0],$A[4][0]($src)
367 la $dst,$stdframe($sp)
369 bras %r14,__KeccakF1600
371 lghi @D[0],-1 # no 'not' instruction :-(
377 xg @D[0],$A[0][1]($src)
378 xg @D[1],$A[0][2]($src)
379 xg @D[2],$A[1][3]($src)
380 xg @D[3],$A[2][2]($src)
381 xg @D[4],$A[3][2]($src)
382 xg @T[0],$A[4][0]($src)
383 stmg @D[0],@D[1],$A[0][1]($src)
384 stg @D[2],$A[1][3]($src)
385 stg @D[3],$A[2][2]($src)
386 stg @D[4],$A[3][2]($src)
387 stg @T[0],$A[4][0]($src)
389 lm${g} %r6,%r15,$frame+6*$SIZE_T($sp)
391 .size KeccakF1600,.-KeccakF1600
394 { my ($A_flat,$inp,$len,$bsz) = map("%r$_",(2..5));
398 .type SHA3_absorb,\@function
402 stm${g} %r5,%r15,$SIZE_T*5($sp)
407 lghi @D[0],-1 # no 'not' instruction :-(
413 xg @D[0],$A[0][1]($src)
414 xg @D[1],$A[0][2]($src)
415 xg @D[2],$A[1][3]($src)
416 xg @D[3],$A[2][2]($src)
417 xg @D[4],$A[3][2]($src)
418 xg @T[0],$A[4][0]($src)
419 stmg @D[0],@D[1],$A[0][1]($src)
420 stg @D[2],$A[1][3]($src)
421 stg @D[3],$A[2][2]($src)
422 stg @D[4],$A[3][2]($src)
423 stg @T[0],$A[4][0]($src)
439 brct $bsz,.Lblock_absorb
441 stm${g} $inp,$len,$frame+3*$SIZE_T($sp)
442 la $dst,$stdframe($sp)
443 bras %r14,__KeccakF1600
444 lm${g} $inp,$bsz,$frame+3*$SIZE_T($sp)
449 lghi @D[0],-1 # no 'not' instruction :-(
455 xg @D[0],$A[0][1]($src)
456 xg @D[1],$A[0][2]($src)
457 xg @D[2],$A[1][3]($src)
458 xg @D[3],$A[2][2]($src)
459 xg @D[4],$A[3][2]($src)
460 xg @T[0],$A[4][0]($src)
461 stmg @D[0],@D[1],$A[0][1]($src)
462 stg @D[2],$A[1][3]($src)
463 stg @D[3],$A[2][2]($src)
464 stg @D[4],$A[3][2]($src)
465 stg @T[0],$A[4][0]($src)
467 lgr %r2,$len # return value
469 lm${g} %r6,%r15,$frame+6*$SIZE_T($sp)
471 .size SHA3_absorb,.-SHA3_absorb
474 { my ($A_flat,$out,$len,$bsz) = map("%r$_",(2..5));
478 .type SHA3_squeeze,\@function
482 st${g} %r14,2*$SIZE_T($sp)
484 st${g} $bsz,5*$SIZE_T($sp)
498 a${g}hi $len,-8 # len -= 8
501 brct $bsz,.Loop_squeeze # bsz--
503 stm${g} $out,$len,3*$SIZE_T($sp)
504 bras %r14,.LKeccakF1600
505 lm${g} $out,$bsz,3*$SIZE_T($sp)
516 brct $len,.Loop_tail_squeeze
519 l${g} %r14,2*$SIZE_T($sp)
521 .size SHA3_squeeze,.-SHA3_squeeze
526 .quad 0,0,0,0,0,0,0,0
529 .quad 0x0000000000000001
530 .quad 0x0000000000008082
531 .quad 0x800000000000808a
532 .quad 0x8000000080008000
533 .quad 0x000000000000808b
534 .quad 0x0000000080000001
535 .quad 0x8000000080008081
536 .quad 0x8000000000008009
537 .quad 0x000000000000008a
538 .quad 0x0000000000000088
539 .quad 0x0000000080008009
540 .quad 0x000000008000000a
541 .quad 0x000000008000808b
542 .quad 0x800000000000008b
543 .quad 0x8000000000008089
544 .quad 0x8000000000008003
545 .quad 0x8000000000008002
546 .quad 0x8000000000000080
547 .quad 0x000000000000800a
548 .quad 0x800000008000000a
549 .quad 0x8000000080008081
550 .quad 0x8000000000008080
551 .quad 0x0000000080000001
552 .quad 0x8000000080008008
554 .asciz "Keccak-1600 absorb and squeeze for s390x, CRYPTOGAMS by <appro\@openssl.org>"
557 # unlike 32-bit shift 64-bit one takes three arguments
558 $code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;