2 # Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
21 # Software performance improvement over gcc-generated code is ~70% and
22 # in absolute terms is ~73 cycles per byte processed with 128-bit key.
23 # You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
24 # *strictly* in-order execution and issued instruction [in this case
25 # load value from memory is critical] has to complete before execution
26 # flow proceeds. S-boxes are compressed to 2KB[+256B].
28 # As for hardware acceleration support. It's basically a "teaser," as
29 # it can and should be improved in several ways. Most notably support
30 # for CBC is not utilized, nor are multiple blocks ever processed.
31 # Then software key schedule can be postponed till hardware support
32 # detection... Performance improvement over assembler is reportedly
33 # ~2.5x, but can reach >8x [naturally on larger chunks] if proper
34 # support is implemented.
38 # Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
39 # for 128-bit keys, if hardware support is detected.
43 # Add support for hardware AES192/256 and reschedule instructions to
44 # minimize/avoid Address Generation Interlock hazard and to favour
45 # dual-issue z10 pipeline. This gave ~25% improvement on z10 and
46 # almost 50% on z9. The gain is smaller on z10, because being dual-
47 # issue z10 makes it impossible to eliminate the interlock condition:
48 # critical path is not long enough. Yet it spends ~24 cycles per byte
49 # processed with 128-bit key.
51 # Unlike previous version hardware support detection takes place only
52 # at the moment of key schedule setup, which is denoted in key->rounds.
53 # This is done, because deferred key setup can't be made MT-safe, not
54 # for keys longer than 128 bits.
56 # Add AES_cbc_encrypt, which gives incredible performance improvement,
57 # it was measured to be ~6.6x. It's less than previously mentioned 8x,
58 # because software implementation was optimized.
62 # Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
63 # performance improvement over "generic" counter mode routine relying
64 # on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
65 # to the fact that exact throughput value depends on current stack
66 # frame alignment within 4KB page. In worst case you get ~75% of the
67 # maximum, but *on average* it would be as much as ~98%. Meaning that
68 # worst case is unlikely, it's like hitting a ravine on a plateau.
72 # Adapt for -m31 build. If kernel supports what's called "highgprs"
73 # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
74 # instructions and achieve "64-bit" performance even in 31-bit legacy
75 # application context. The feature is not specific to any particular
76 # processor, as long as it's "z-CPU". Latter implies that the code
77 # remains z/Architecture specific. On z990 it was measured to perform
78 # 2x better than code generated by gcc 4.3.
82 # Add support for z196 "cipher message with counter" instruction.
83 # Note however that it's disengaged, because it was measured to
84 # perform ~12% worse than vanilla km-based code...
88 # Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
89 # instructions, which deliver ~70% improvement at 8KB block size over
90 # vanilla km-based code, 37% - at most like 512-bytes block size.
92 # $output is the last argument if it looks like a file (it has an extension)
93 # $flavour is the first argument if it doesn't look like a file
94 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
95 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
97 if ($flavour =~ /3[12]/) {
105 $output and (open STDOUT,">",$output or die "can't open $output: $!"); # redirect generated code to $output; 3-arg open keeps the name from being parsed as a mode, and a failed open aborts instead of silently producing no output
107 $softonly=0; # allow hardware support
109 $t0="%r0"; $mask="%r0";
111 $t2="%r2"; $inp="%r2";
112 $t3="%r3"; $out="%r3"; $bits="%r3";
126 $stdframe=16*$SIZE_T+4*8;
130 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
134 #include "s390x_arch.h"
138 .type AES_Te,\@object
143 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
144 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
145 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
146 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
147 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
148 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
149 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
150 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
151 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
152 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
153 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
154 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
155 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
156 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
157 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
158 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
159 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
160 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
161 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
162 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
163 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
164 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
165 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
166 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
167 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
168 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
169 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
170 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
171 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
172 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
173 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
174 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
175 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
176 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
177 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
178 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
179 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
180 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
181 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
182 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
183 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
184 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
185 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
186 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
187 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
188 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
189 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
190 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
191 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
192 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
193 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
194 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
195 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
196 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
197 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
198 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
199 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
200 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
201 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
202 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
203 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
204 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
205 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
206 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
209 .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
210 .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
211 .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
212 .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
213 .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
214 .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
215 .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
216 .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
217 .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
218 .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
219 .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
220 .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
221 .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
222 .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
223 .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
224 .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
225 .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
226 .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
227 .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
228 .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
229 .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
230 .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
231 .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
232 .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
233 .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
234 .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
235 .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
236 .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
237 .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
238 .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
239 .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
240 .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
242 .long 0x01000000, 0x02000000, 0x04000000, 0x08000000
243 .long 0x10000000, 0x20000000, 0x40000000, 0x80000000
244 .long 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
246 .size AES_Te,.-AES_Te
248 # void AES_encrypt(const unsigned char *inp, unsigned char *out,
249 # const AES_KEY *key) {
251 .type AES_encrypt,\@function
254 $code.=<<___ if (!$softonly);
263 lghi %r3,16 # single block length
264 .long 0xb92e0042 # km %r4,%r2
265 brc 1,.-4 # can this happen?
271 stm${g} %r3,$ra,3*$SIZE_T($sp)
279 bras $ra,_s390x_AES_encrypt
281 l${g} $out,3*$SIZE_T($sp)
287 lm${g} %r6,$ra,6*$SIZE_T($sp)
289 .size AES_encrypt,.-AES_encrypt
291 .type _s390x_AES_encrypt,\@function
294 st${g} $ra,15*$SIZE_T($sp)
300 llill $mask,`0xff<<3`
314 srlg $i1,$s1,`16-3` # i0
323 l $s0,0($s0,$tbl) # Te0[s0>>24]
324 l $t1,1($t1,$tbl) # Te3[s0>>0]
325 l $t2,2($t2,$tbl) # Te2[s0>>8]
326 l $t3,3($t3,$tbl) # Te1[s0>>16]
328 x $s0,3($i1,$tbl) # Te1[s1>>16]
329 l $s1,0($s1,$tbl) # Te0[s1>>24]
330 x $t2,1($i2,$tbl) # Te3[s1>>0]
331 x $t3,2($i3,$tbl) # Te2[s1>>8]
333 srlg $i1,$s2,`8-3` # i0
334 srlg $i2,$s2,`16-3` # i1
343 srlg $ra,$s3,`8-3` # i1
344 sllg $t1,$s3,`0+3` # i0
349 x $s0,2($i1,$tbl) # Te2[s2>>8]
350 x $s1,3($i2,$tbl) # Te1[s2>>16]
351 l $s2,0($s2,$tbl) # Te0[s2>>24]
352 x $t3,1($i3,$tbl) # Te3[s2>>0]
354 srlg $i3,$s3,`16-3` # i2
365 x $s0,1($t1,$tbl) # Te3[s3>>0]
366 x $s1,2($ra,$tbl) # Te2[s3>>8]
367 x $s2,3($i3,$tbl) # Te1[s3>>16]
368 l $s3,0($s3,$tbl) # Te0[s3>>24]
371 brct $rounds,.Lenc_loop
383 srlg $i1,$s1,`16-3` # i0
392 llgc $s0,2($s0,$tbl) # Te4[s0>>24]
393 llgc $t1,2($t1,$tbl) # Te4[s0>>0]
395 llgc $t2,2($t2,$tbl) # Te4[s0>>8]
396 llgc $t3,2($t3,$tbl) # Te4[s0>>16]
400 llgc $i1,2($i1,$tbl) # Te4[s1>>16]
401 llgc $s1,2($s1,$tbl) # Te4[s1>>24]
402 llgc $i2,2($i2,$tbl) # Te4[s1>>0]
403 llgc $i3,2($i3,$tbl) # Te4[s1>>8]
412 srlg $i1,$s2,`8-3` # i0
413 srlg $i2,$s2,`16-3` # i1
421 sllg $t1,$s3,`0+3` # i0
422 srlg $ra,$s3,`8-3` # i1
425 llgc $i1,2($i1,$tbl) # Te4[s2>>8]
426 llgc $i2,2($i2,$tbl) # Te4[s2>>16]
428 llgc $s2,2($s2,$tbl) # Te4[s2>>24]
429 llgc $i3,2($i3,$tbl) # Te4[s2>>0]
438 srlg $i3,$s3,`16-3` # i2
446 llgc $i1,2($t1,$tbl) # Te4[s3>>0]
447 llgc $i2,2($ra,$tbl) # Te4[s3>>8]
448 llgc $i3,2($i3,$tbl) # Te4[s3>>16]
449 llgc $s3,2($s3,$tbl) # Te4[s3>>24]
458 l${g} $ra,15*$SIZE_T($sp)
465 .size _s390x_AES_encrypt,.-_s390x_AES_encrypt
469 .type AES_Td,\@object
474 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
475 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
476 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
477 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
478 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
479 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
480 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
481 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
482 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
483 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
484 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
485 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
486 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
487 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
488 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
489 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
490 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
491 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
492 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
493 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
494 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
495 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
496 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
497 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
498 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
499 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
500 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
501 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
502 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
503 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
504 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
505 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
506 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
507 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
508 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
509 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
510 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
511 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
512 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
513 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
514 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
515 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
516 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
517 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
518 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
519 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
520 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
521 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
522 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
523 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
524 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
525 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
526 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
527 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
528 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
529 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
530 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
531 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
532 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
533 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
534 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
535 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
536 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
537 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
540 .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
541 .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
542 .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
543 .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
544 .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
545 .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
546 .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
547 .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
548 .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
549 .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
550 .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
551 .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
552 .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
553 .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
554 .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
555 .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
556 .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
557 .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
558 .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
559 .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
560 .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
561 .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
562 .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
563 .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
564 .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
565 .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
566 .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
567 .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
568 .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
569 .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
570 .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
571 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
572 .size AES_Td,.-AES_Td
574 # void AES_decrypt(const unsigned char *inp, unsigned char *out,
575 # const AES_KEY *key) {
577 .type AES_decrypt,\@function
580 $code.=<<___ if (!$softonly);
589 lghi %r3,16 # single block length
590 .long 0xb92e0042 # km %r4,%r2
591 brc 1,.-4 # can this happen?
597 stm${g} %r3,$ra,3*$SIZE_T($sp)
605 bras $ra,_s390x_AES_decrypt
607 l${g} $out,3*$SIZE_T($sp)
613 lm${g} %r6,$ra,6*$SIZE_T($sp)
615 .size AES_decrypt,.-AES_decrypt
617 .type _s390x_AES_decrypt,\@function
620 st${g} $ra,15*$SIZE_T($sp)
626 llill $mask,`0xff<<3`
640 sllg $i1,$s1,`0+3` # i0
649 l $s0,0($s0,$tbl) # Td0[s0>>24]
650 l $t1,3($t1,$tbl) # Td1[s0>>16]
651 l $t2,2($t2,$tbl) # Td2[s0>>8]
652 l $t3,1($t3,$tbl) # Td3[s0>>0]
654 x $s0,1($i1,$tbl) # Td3[s1>>0]
655 l $s1,0($s1,$tbl) # Td0[s1>>24]
656 x $t2,3($i2,$tbl) # Td1[s1>>16]
657 x $t3,2($i3,$tbl) # Td2[s1>>8]
659 srlg $i1,$s2,`8-3` # i0
660 sllg $i2,$s2,`0+3` # i1
669 srlg $ra,$s3,`8-3` # i1
670 srlg $t1,$s3,`16-3` # i0
675 x $s0,2($i1,$tbl) # Td2[s2>>8]
676 x $s1,1($i2,$tbl) # Td3[s2>>0]
677 l $s2,0($s2,$tbl) # Td0[s2>>24]
678 x $t3,3($i3,$tbl) # Td1[s2>>16]
680 sllg $i3,$s3,`0+3` # i2
691 x $s0,3($t1,$tbl) # Td1[s3>>16]
692 x $s1,2($ra,$tbl) # Td2[s3>>8]
693 x $s2,1($i3,$tbl) # Td3[s3>>0]
694 l $s3,0($s3,$tbl) # Td0[s3>>24]
697 brct $rounds,.Ldec_loop
700 l $t1,`2048+0`($tbl) # prefetch Td4
701 l $t2,`2048+64`($tbl)
702 l $t3,`2048+128`($tbl)
703 l $i1,`2048+192`($tbl)
720 llgc $i3,2048($i3,$tbl) # Td4[s0>>24]
721 llgc $t1,2048($t1,$tbl) # Td4[s0>>16]
722 llgc $t2,2048($t2,$tbl) # Td4[s0>>8]
724 llgc $t3,2048($s0,$tbl) # Td4[s0>>0]
728 llgc $s1,2048($s1,$tbl) # Td4[s1>>0]
729 llgc $i1,2048($i1,$tbl) # Td4[s1>>24]
730 llgc $i2,2048($i2,$tbl) # Td4[s1>>16]
732 llgc $i3,2048($ra,$tbl) # Td4[s1>>8]
746 llgc $i1,2048($i1,$tbl) # Td4[s2>>8]
747 llgc $s1,2048($s2,$tbl) # Td4[s2>>0]
748 llgc $i2,2048($i2,$tbl) # Td4[s2>>24]
749 llgc $i3,2048($i3,$tbl) # Td4[s2>>16]
764 l${g} $ra,15*$SIZE_T($sp)
769 llgc $i1,2048($i1,$tbl) # Td4[s3>>16]
770 llgc $i2,2048($i2,$tbl) # Td4[s3>>8]
772 llgc $s2,2048($s3,$tbl) # Td4[s3>>0]
773 llgc $s3,2048($i3,$tbl) # Td4[s3>>24]
787 .size _s390x_AES_decrypt,.-_s390x_AES_decrypt
791 # void AES_set_encrypt_key(const unsigned char *in, int bits,
793 .globl AES_set_encrypt_key
794 .type AES_set_encrypt_key,\@function
797 _s390x_AES_set_encrypt_key:
819 $code.=<<___ if (!$softonly);
820 # convert bits to km(c) code, [128,192,256]->[18,19,20]
827 larl %r1,OPENSSL_s390xcap_P
830 ng %r0,S390X_KM(%r1) # check availability of both km...
831 ng %r0,S390X_KMC(%r1) # ...and kmc support for given key length
834 lmg %r0,%r1,0($inp) # just copy 128 bits...
844 1: st $bits,236($key) # save bits [for debugging purposes]
846 st %r5,240($key) # save km(c) code
853 stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key
855 larl $tbl,AES_Te+2048
874 llgfr $t2,$s3 # temp=rk[3]
888 icm $t2,2,0($t2) # Te4[rk[3]>>0]<<8
889 icm $t2,4,0($i1) # Te4[rk[3]>>8]<<16
890 icm $t2,8,0($i2) # Te4[rk[3]>>16]<<24
891 icm $t2,1,0($i3) # Te4[rk[3]>>24]
892 x $t2,256($t3,$tbl) # rcon[i]
893 xr $s0,$t2 # rk[4]=rk[0]^...
894 xr $s1,$s0 # rk[5]=rk[1]^rk[4]
895 xr $s2,$s1 # rk[6]=rk[2]^rk[5]
896 xr $s3,$s2 # rk[7]=rk[3]^rk[6]
898 llgfr $t2,$s3 # temp=rk[3]
910 la $key,16($key) # key+=4
912 brct $rounds,.L128_loop
915 lm${g} %r4,%r13,4*$SIZE_T($sp)
947 icm $t1,2,0($t1) # Te4[rk[5]>>0]<<8
948 icm $t1,4,0($i1) # Te4[rk[5]>>8]<<16
949 icm $t1,8,0($i2) # Te4[rk[5]>>16]<<24
950 icm $t1,1,0($i3) # Te4[rk[5]>>24]
951 x $t1,256($t3,$tbl) # rcon[i]
952 xr $s0,$t1 # rk[6]=rk[0]^...
953 xr $s1,$s0 # rk[7]=rk[1]^rk[6]
954 xr $s2,$s1 # rk[8]=rk[2]^rk[7]
955 xr $s3,$s2 # rk[9]=rk[3]^rk[8]
961 brct $rounds,.L192_continue
964 lm${g} %r4,%r13,4*$SIZE_T($sp)
970 x $t1,16($key) # rk[10]=rk[4]^rk[9]
972 x $t1,20($key) # rk[11]=rk[5]^rk[10]
982 la $key,24($key) # key+=6
1011 icm $t1,2,0($t1) # Te4[rk[7]>>0]<<8
1012 icm $t1,4,0($i1) # Te4[rk[7]>>8]<<16
1013 icm $t1,8,0($i2) # Te4[rk[7]>>16]<<24
1014 icm $t1,1,0($i3) # Te4[rk[7]>>24]
1015 x $t1,256($t3,$tbl) # rcon[i]
1016 xr $s0,$t1 # rk[8]=rk[0]^...
1017 xr $s1,$s0 # rk[9]=rk[1]^rk[8]
1018 xr $s2,$s1 # rk[10]=rk[2]^rk[9]
1019 xr $s3,$s2 # rk[11]=rk[3]^rk[10]
1024 brct $rounds,.L256_continue
1027 lm${g} %r4,%r13,4*$SIZE_T($sp)
1032 lgr $t1,$s3 # temp=rk[11]
1043 llgc $t1,0($t1) # Te4[rk[11]>>0]
1044 icm $t1,2,0($i1) # Te4[rk[11]>>8]<<8
1045 icm $t1,4,0($i2) # Te4[rk[11]>>16]<<16
1046 icm $t1,8,0($i3) # Te4[rk[11]>>24]<<24
1047 x $t1,16($key) # rk[12]=rk[4]^...
1049 x $t1,20($key) # rk[13]=rk[5]^rk[12]
1051 x $t1,24($key) # rk[14]=rk[6]^rk[13]
1053 x $t1,28($key) # rk[15]=rk[7]^rk[14]
1063 la $key,32($key) # key+=8
1070 .size AES_set_encrypt_key,.-AES_set_encrypt_key
1072 # void AES_set_decrypt_key(const unsigned char *in, int bits,
1074 .globl AES_set_decrypt_key
1075 .type AES_set_decrypt_key,\@function
1077 AES_set_decrypt_key:
1078 #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
1079 st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key!
1080 bras $ra,_s390x_AES_set_encrypt_key
1081 #l${g} $key,4*$SIZE_T($sp)
1082 l${g} $ra,14*$SIZE_T($sp)
1086 $code.=<<___ if (!$softonly);
1091 oill $t0,S390X_DECRYPT # set "decrypt" bit
1097 .Lgo: lgr $rounds,$t0 #llgf $rounds,240($key)
1105 .Linv: lmg $s0,$s1,0($i1)
1117 llgf $rounds,240($key)
1119 sll $rounds,2 # (rounds-1)*4
1120 llilh $mask80,0x8080
1121 llilh $mask1b,0x1b1b
1122 llilh $maskfe,0xfefe
1128 .Lmix: l $s0,16($key) # tp1
1156 xr $s1,$s0 # tp2^tp1
1157 xr $s2,$s0 # tp4^tp1
1158 rll $s0,$s0,24 # = ROTATE(tp1,8)
1160 xr $s0,$s1 # ^=tp2^tp1
1161 xr $s1,$s3 # tp2^tp1^tp8
1162 xr $s0,$s2 # ^=tp4^tp1^tp8
1165 xr $s0,$s1 # ^= ROTATE(tp8^tp2^tp1,24)
1167 xr $s0,$s2 # ^= ROTATE(tp8^tp4^tp1,16)
1168 xr $s0,$s3 # ^= ROTATE(tp8,8)
1174 lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
1177 .size AES_set_decrypt_key,.-AES_set_decrypt_key
1180 ########################################################################
1181 # void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
1182 # size_t length, const AES_KEY *key,
1183 # unsigned char *ivec, const int enc)
1186 my $out="%r4"; # length and out are swapped
1192 .globl AES_cbc_encrypt
1193 .type AES_cbc_encrypt,\@function
1196 xgr %r3,%r4 # flip %r3 and %r4, out and len
1200 $code.=<<___ if (!$softonly);
1205 lg %r0,0($ivp) # copy ivec
1207 stmg %r0,%r1,16($sp)
1208 lmg %r0,%r1,0($key) # copy key, cover 256 bit
1209 stmg %r0,%r1,32($sp)
1210 lmg %r0,%r1,16($key)
1211 stmg %r0,%r1,48($sp)
1212 l %r0,240($key) # load kmc code
1213 lghi $key,15 # res=len%16, len-=res;
1216 la %r1,16($sp) # parameter block - ivec || key
1218 .long 0xb92f0042 # kmc %r4,%r2
1219 brc 1,.-4 # pay attention to "partial completion"
1223 lmg %r0,%r1,16($sp) # copy ivec to caller
1229 ahi $key,-1 # it's the way it's encoded in mvc
1230 tmll %r0,S390X_DECRYPT
1231 jnz .Lkmc_truncated_dec
1233 stg %r1,16*$SIZE_T($sp)
1234 stg %r1,16*$SIZE_T+8($sp)
1236 mvc 16*$SIZE_T(1,$sp),0($inp)
1238 la %r1,16($sp) # restore parameter block
1239 la $inp,16*$SIZE_T($sp)
1241 .long 0xb92f0042 # kmc %r4,%r2
1244 .Lkmc_truncated_dec:
1245 st${g} $out,4*$SIZE_T($sp)
1246 la $out,16*$SIZE_T($sp)
1248 .long 0xb92f0042 # kmc %r4,%r2
1249 l${g} $out,4*$SIZE_T($sp)
1251 mvc 0(1,$out),16*$SIZE_T($sp)
1258 stm${g} $key,$ra,5*$SIZE_T($sp)
1260 cl %r0,`$stdframe+$SIZE_T-4`($sp)
1272 brc 4,.Lcbc_enc_tail # if borrow
1274 stm${g} $inp,$out,2*$SIZE_T($sp)
1281 bras $ra,_s390x_AES_encrypt
1283 lm${g} $inp,$key,2*$SIZE_T($sp)
1295 brc 4,.Lcbc_enc_tail # if borrow
1299 l${g} $ivp,6*$SIZE_T($sp)
1305 lm${g} %r7,$ra,7*$SIZE_T($sp)
1312 stg $t0,16*$SIZE_T($sp)
1313 stg $t0,16*$SIZE_T+8($sp)
1315 mvc 16*$SIZE_T(1,$sp),0($inp)
1318 la $inp,16*$SIZE_T($sp)
1327 stmg $t0,$t1,16*$SIZE_T($sp)
1330 stm${g} $inp,$out,2*$SIZE_T($sp)
1337 bras $ra,_s390x_AES_decrypt
1339 lm${g} $inp,$key,2*$SIZE_T($sp)
1347 xg $s0,16*$SIZE_T($sp)
1348 xg $s2,16*$SIZE_T+8($sp)
1351 brc 4,.Lcbc_dec_tail # if borrow
1352 brc 2,.Lcbc_dec_done # if zero
1355 stmg $t0,$t1,16*$SIZE_T($sp)
1365 lm${g} %r6,$ra,6*$SIZE_T($sp)
1366 stmg $t0,$t1,0($ivp)
1373 stg $s0,16*$SIZE_T($sp)
1374 stg $s2,16*$SIZE_T+8($sp)
1376 mvc 0(1,$out),16*$SIZE_T($sp)
1379 .size AES_cbc_encrypt,.-AES_cbc_encrypt
1382 ########################################################################
1383 # void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
1384 # size_t blocks, const AES_KEY *key,
1385 # const unsigned char *ivec)
1388 my $out="%r4"; # blocks and out are swapped
1390 my $key="%r5"; my $iv0="%r5";
1395 .globl AES_ctr32_encrypt
1396 .type AES_ctr32_encrypt,\@function
1399 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1402 llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case
1404 $code.=<<___ if (!$softonly);
1410 st${g} $s2,10*$SIZE_T($sp)
1411 st${g} $s3,11*$SIZE_T($sp)
1413 clr $len,%r1 # does work even in 64-bit mode
1414 jle .Lctr32_nokma # kma is slower for <= 16 blocks
1416 larl %r1,OPENSSL_s390xcap_P
1420 ng $s3,S390X_KMA(%r1) # check kma capability vector
1423 l${g}hi %r1,-$stdframe-112
1425 la $sp,0(%r1,$sp) # prepare parameter block
1429 or %r0,%r1 # set HS and LAAD flags
1431 st${g} $s3,0($sp) # backchain
1432 la %r1,$stdframe($sp)
1434 lmg $s2,$s3,0($key) # copy key
1435 stg $s2,$stdframe+80($sp)
1436 stg $s3,$stdframe+88($sp)
1437 lmg $s2,$s3,16($key)
1438 stg $s2,$stdframe+96($sp)
1439 stg $s3,$stdframe+104($sp)
1441 lmg $s2,$s3,0($ivp) # copy iv
1442 stg $s2,$stdframe+64($sp)
1443 ahi $s3,-1 # kma requires counter-1
1444 stg $s3,$stdframe+72($sp)
1445 st $s3,$stdframe+12($sp) # copy counter
1450 .long 0xb929a042 # kma $out,$s2,$inp
1451 brc 1,.-4 # pay attention to "partial completion"
1453 stg %r0,$stdframe+80($sp) # wipe key
1454 stg %r0,$stdframe+88($sp)
1455 stg %r0,$stdframe+96($sp)
1456 stg %r0,$stdframe+104($sp)
1457 la $sp,$stdframe+112($sp)
1459 lm${g} $s2,$s3,10*$SIZE_T($sp)
1464 stm${g} %r6,$s1,6*$SIZE_T($sp)
1467 la %r1,0($key) # %r1 is permanent copy of $key
1468 lg $iv0,0($ivp) # load ivec
1471 # prepare and allocate stack frame at the top of 4K page
1472 # with 1K reserved for eventual signal handling
1473 lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
1477 ngr $s0,$s1 # align at page boundary
1478 slgr $fp,$s0 # total buffer size
1480 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1481 slgr $fp,$s1 # deduct reservation to get usable buffer size
1482 # buffer size is at least 256 and at most 3072+256-16
1484 la $sp,1024($s0) # alloca
1485 srlg $fp,$fp,4 # convert bytes to blocks, minimum 16
1486 st${g} $s2,0($sp) # back-chain
1487 st${g} $fp,$SIZE_T($sp)
1490 brc 1,.Lctr32_hw_switch # not zero, no borrow
1491 algr $fp,$len # input is shorter than allocated buffer
1493 st${g} $fp,$SIZE_T($sp)
1497 $code.=<<___ if (!$softonly && 0);# kmctr code was measured to be ~12% slower
1500 larl %r1,OPENSSL_s390xcap_P
1501 llihh %r0,0x8000 # check if kmctr supports the function code
1503 ng %r0,S390X_KMCTR(%r1) # check kmctr capability vector
1509 algr $out,$inp # restore $out
1510 lgr $s1,$len # $s1 undertakes $len
1511 j .Lctr32_kmctr_loop
1516 .Lctr32_kmctr_prepare:
1520 ahi $ivp,1 # 32-bit increment, preserves upper half
1521 brct $s3,.Lctr32_kmctr_prepare
1523 #la $inp,0($inp) # inp
1524 sllg $len,$fp,4 # len
1525 #la $out,0($out) # out
1527 .long 0xb92da042 # kmctr $out,$s2,$inp
1528 brc 1,.-4 # pay attention to "partial completion"
1531 brc 1,.Lctr32_kmctr_loop # not zero, no borrow
1534 brc 4+1,.Lctr32_kmctr_loop # not zero
1537 lm${g} %r6,$s3,6*$SIZE_T($sp)
1541 $code.=<<___ if (!$softonly);
1549 ahi $ivp,1 # 32-bit increment, preserves upper half
1550 brct $s3,.Lctr32_km_prepare
1552 la $s0,16($sp) # inp
1553 sllg $s1,$fp,4 # len
1554 la $s2,16($sp) # out
1555 .long 0xb92e00a8 # km %r10,%r8
1556 brc 1,.-4 # pay attention to "partial completion"
1566 stg $s0,0($out,$inp)
1567 stg $s1,8($out,$inp)
1569 brct $s3,.Lctr32_km_xor
1572 brc 1,.Lctr32_km_loop # not zero, no borrow
1575 brc 4+1,.Lctr32_km_loop # not zero
1578 l${g} $s1,$SIZE_T($sp)
1584 brct $s1,.Lctr32_km_zap
1587 lm${g} %r6,$s3,6*$SIZE_T($sp)
1593 stm${g} $key,$ra,5*$SIZE_T($sp)
1599 stm${g} $inp,$out,2*$SIZE_T($sp)
1604 st $t1,16*$SIZE_T($sp)
1607 bras $ra,_s390x_AES_encrypt
1609 lm${g} $inp,$ivp,2*$SIZE_T($sp)
1610 llgf $t1,16*$SIZE_T($sp)
1618 ahi $t1,1 # 32-bit increment
1619 brct $len,.Lctr32_loop
1621 lm${g} %r6,$ra,6*$SIZE_T($sp)
1623 .size AES_ctr32_encrypt,.-AES_ctr32_encrypt
1627 ########################################################################
1628 # void AES_xts_encrypt(const unsigned char *inp, unsigned char *out,
1629 # size_t len, const AES_KEY *key1, const AES_KEY *key2,
1630 # const unsigned char iv[16]);
1634 my $out="%r4"; # len and out are swapped
1636 my $key1="%r5"; # $i1
1637 my $key2="%r6"; # $i2
1639 my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame...
1642 .type _s390x_xts_km,\@function
1647 llgfr $s0,%r0 # put aside the function code
1650 larl %r1,OPENSSL_s390xcap_P
1652 srlg %r0,%r0,32($s1) # check for 32+function code
1653 ng %r0,S390X_KM(%r1) # check km capability vector
1654 lgr %r0,$s0 # restore the function code
1655 la %r1,0($key1) # restore $key1
1658 lmg $i2,$i3,$tweak($sp) # put aside the tweak value
1661 oill %r0,32 # switch to xts function code
1663 sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16
1664 la %r1,$tweak-16($sp)
1665 slgr %r1,$s1 # parameter block position
1666 lmg $s0,$s3,0($key1) # load 256 bits of key material,
1667 stmg $s0,$s3,0(%r1) # and copy it to parameter block.
1668 # yes, it contains junk and overlaps
1669 # with the tweak in 128-bit case.
1670 # it's done to avoid conditional
1672 stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value
1674 .long 0xb92e0042 # km %r4,%r2
1675 brc 1,.-4 # pay attention to "partial completion"
1677 lrvg $s0,$tweak+0($sp) # load the last tweak
1678 lrvg $s1,$tweak+8($sp)
1679 stmg %r0,%r3,$tweak-32($sp) # wipe copy of the key
1681 nill %r0,0xffdf # switch back to original function code
1682 la %r1,0($key1) # restore pointer to $key1
1685 llgc $len,2*$SIZE_T-1($sp)
1686 nill $len,0x0f # $len%=16
1693 # prepare and allocate stack frame at the top of 4K page
1694 # with 1K reserved for eventual signal handling
1695 lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
1699 ngr $s0,$s1 # align at page boundary
1700 slgr $fp,$s0 # total buffer size
1702 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1703 slgr $fp,$s1 # deduct reservation to get usable buffer size
1704 # buffer size is at least 256 and at most 3072+256-16
1706 la $sp,1024($s0) # alloca
1707 nill $fp,0xfff0 # round to 16*n
1708 st${g} $s2,0($sp) # back-chain
1709 nill $len,0xfff0 # redundant
1710 st${g} $fp,$SIZE_T($sp)
1713 brc 1,.Lxts_km_go # not zero, no borrow
1714 algr $fp,$len # input is shorter than allocated buffer
1716 st${g} $fp,$SIZE_T($sp)
1719 lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian
1720 lrvg $s1,$tweak+8($s2)
1722 la $s2,16($sp) # vector of ascending tweak values
1733 srag $i2,$s1,63 # broadcast upper bit
1739 lrvgr $i1,$s0 # flip byte order
1745 stg $i1,0($out,$inp)
1746 stg $i2,8($out,$inp)
1748 brct $s3,.Lxts_km_prepare
1750 slgr $inp,$fp # rewind $inp
1753 .long 0xb92e00aa # km $s2,$s2
1754 brc 1,.-4 # pay attention to "partial completion"
1764 stg $i1,0($out,$inp)
1765 stg $i2,8($out,$inp)
1767 brct $s3,.Lxts_km_xor
1770 brc 1,.Lxts_km_loop # not zero, no borrow
1773 brc 4+1,.Lxts_km_loop # not zero
1775 l${g} $i1,0($sp) # back-chain
1776 llgf $fp,`2*$SIZE_T-4`($sp) # bytes used
1783 brct $fp,.Lxts_km_zap
1786 llgc $len,2*$SIZE_T-1($i1)
1787 nill $len,0x0f # $len%=16
1790 # generate one more tweak...
1792 srag $i2,$s1,63 # broadcast upper bit
1798 ltr $len,$len # clear zero flag
1800 .size _s390x_xts_km,.-_s390x_xts_km
# AES_xts_encrypt - public entry point. Arguments follow the C
# prototype: inp, out, len, key1, key2 and iv (iv pointer is fetched
# from the stack frame). Returns at once when len is below 16.
# Unless softonly is set, a km-based path is attempted first: the
# tweak is generated with km and the bulk is handed to _s390x_xts_km,
# with a jl fallback to the software path. The software path calls
# _s390x_AES_encrypt once to generate the tweak and once per block.
# A trailing len%16 bytes are handled by ciphertext stealing.
# Both paths wipe all 16 bytes of the tweak from the stack frame
# before returning.
1802 .globl AES_xts_encrypt
1803 .type AES_xts_encrypt,\@function
1806 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1810 $code.=<<___ if ($SIZE_T==4);
1814 st${g} $len,1*$SIZE_T($sp) # save copy of $len
1815 srag $len,$len,4 # formally wrong, because it expands
1816 # sign byte, but who can afford asking
1817 # to process more than 2^63-1 bytes?
1818 # I use it, because it sets condition
1820 bcr 8,$ra # abort if zero (i.e. less than 16)
1822 $code.=<<___ if (!$softonly);
1826 jl .Lxts_enc_software
1828 st${g} $ra,5*$SIZE_T($sp) # save return address, bras below reuses the register
1829 stm${g} %r6,$s3,6*$SIZE_T($sp) # save registers, reloaded at the end of the km path
1831 sllg $len,$len,4 # $len&=~15
1834 # generate the tweak value
1835 l${g} $s3,$stdframe($sp) # pointer to iv
1840 la %r1,0($key2) # $key2 is not needed anymore
1841 .long 0xb92e00aa # km $s2,$s2, generate the tweak
1842 brc 1,.-4 # can this happen?
1845 la %r1,0($key1) # $key1 is not needed anymore
1846 bras $ra,_s390x_xts_km
1847 jz .Lxts_enc_km_done # zero condition from the helper: nothing left to steal
1849 aghi $inp,-16 # take one step back
1850 la $i3,0($out,$inp) # put aside real $out
1853 llgc $i2,0($out,$inp)
1854 stc $i1,0($out,$inp)
1855 stc $i2,16($out,$inp)
1857 brct $len,.Lxts_enc_km_steal # one byte per iteration, len%16 times
1861 lrvgr $i1,$s0 # flip byte order
1867 .long 0xb92e00aa # km $s2,$s2
1868 brc 1,.-4 # can this happen?
1869 lrvgr $i1,$s0 # flip byte order
1877 stg $sp,$tweak+0($sp) # wipe tweak
1878 stg $sp,$tweak+8($sp)
1879 l${g} $ra,5*$SIZE_T($sp)
1880 lm${g} %r6,$s3,6*$SIZE_T($sp)
1886 stm${g} %r6,$ra,6*$SIZE_T($sp)
1890 l${g} $s3,$stdframe($sp) # ivp
1891 llgf $s0,0($s3) # load iv
1895 stm${g} %r2,%r5,2*$SIZE_T($sp)
1898 bras $ra,_s390x_AES_encrypt # generate the tweak
1899 lm${g} %r2,%r5,2*$SIZE_T($sp)
1900 stm $s0,$s3,$tweak($sp) # save the tweak
1905 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1906 lrvg $s3,$tweak+8($sp)
1908 srag %r0,$s3,63 # broadcast upper bit
1913 lrvgr $s1,$s1 # flip byte order
1915 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1916 stg $s1,$tweak+0($sp) # save the tweak
1919 stg $s3,$tweak+8($sp)
1921 la $inp,16($inp) # $inp+=16
1923 x $s0,0($inp) # ^=*($inp)
1927 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
1929 bras $ra,_s390x_AES_encrypt
1930 lm${g} %r2,%r5,2*$SIZE_T($sp)
1931 x $s0,$tweak+0($sp) # ^=tweak
1934 x $s3,$tweak+12($sp)
1938 st $s3,12($out,$inp)
1939 brct${g} $len,.Lxts_enc_loop
1941 llgc $len,`2*$SIZE_T-1`($sp) # last byte of the saved len copy
1942 nill $len,0x0f # $len%16
1945 la $i3,0($inp,$out) # put aside real $out
1948 llgc %r1,0($out,$inp)
1949 stc %r0,0($out,$inp)
1950 stc %r1,16($out,$inp)
1952 brct $len,.Lxts_enc_steal
1953 la $out,0($i3) # restore real $out
1955 # generate last tweak...
1956 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1957 lrvg $s3,$tweak+8($sp)
1959 srag %r0,$s3,63 # broadcast upper bit
1964 lrvgr $s1,$s1 # flip byte order
1966 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1967 stg $s1,$tweak+0($sp) # save the tweak
1970 stg $s3,$tweak+8($sp)
1973 x $s0,0($out) # ^=*(inp)|stolen cipher-text
1977 st${g} $out,4*$SIZE_T($sp) # keep out across the call
1979 bras $ra,_s390x_AES_encrypt
1980 l${g} $out,4*$SIZE_T($sp)
1981 x $s0,`$tweak+0`($sp) # ^=tweak
1982 x $s1,`$tweak+4`($sp)
1983 x $s2,`$tweak+8`($sp)
1984 x $s3,`$tweak+12`($sp)
1991 stg $sp,$tweak+0($sp) # wipe tweak
1992 stg $sp,$tweak+8($sp) # ... including its upper 8 bytes
1993 lm${g} %r6,$ra,6*$SIZE_T($sp)
1995 .size AES_xts_encrypt,.-AES_xts_encrypt
1997 # void AES_xts_decrypt(const unsigned char *inp, unsigned char *out,
1998 # size_t len, const AES_KEY *key1, const AES_KEY *key2,
1999 # const unsigned char iv[16]);
# AES_xts_decrypt - public entry point with the argument layout given
# by the C prototype above (inp, out, len, key1, key2, iv).
# Unless softonly is set, a km-based path is attempted first, with a
# jl fallback to the software path. In the software path the tweak
# is still produced with _s390x_AES_encrypt, while data blocks go
# through _s390x_AES_decrypt. When len%16 is not zero a pair of
# tweaks is generated: the 2nd one is applied to the last full block
# and the 1st one to the block rebuilt by ciphertext stealing.
# All tweak copies are wiped from the stack frame before returning.
2002 .globl AES_xts_decrypt
2003 .type AES_xts_decrypt,\@function
2006 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
2010 $code.=<<___ if ($SIZE_T==4);
2014 st${g} $len,1*$SIZE_T($sp) # save copy of $len
2016 bcr 4,$ra # abort if less than zero. formally
2017 # wrong, because $len is unsigned,
2018 # but who can afford asking to
2019 # process more than 2^63-1 bytes?
2021 jnz .Lxts_dec_proceed
2025 $code.=<<___ if (!$softonly);
2029 jl .Lxts_dec_software
2031 st${g} $ra,5*$SIZE_T($sp) # save return address, bras below reuses the register
2032 stm${g} %r6,$s3,6*$SIZE_T($sp) # save registers, reloaded at the end of the km path
2034 nill $len,0xfff0 # $len&=~15
2037 # generate the tweak value
2038 l${g} $s3,$stdframe($sp) # pointer to iv
2043 la %r1,0($key2) # $key2 is not needed past this point
2044 .long 0xb92e00aa # km $s2,$s2, generate the tweak
2045 brc 1,.-4 # can this happen?
2048 la %r1,0($key1) # $key1 is not needed anymore
2051 jz .Lxts_dec_km_short
2052 bras $ra,_s390x_xts_km
2053 jz .Lxts_dec_km_done # zero condition from the helper: nothing left to steal
2055 lrvgr $s2,$s0 # make copy in reverse byte order
2057 j .Lxts_dec_km_2ndtweak
2060 llgc $len,`2*$SIZE_T-1`($sp) # last byte of the saved len copy
2061 nill $len,0x0f # $len%=16
2062 lrvg $s0,$tweak+0($sp) # load the tweak
2063 lrvg $s1,$tweak+8($sp)
2064 lrvgr $s2,$s0 # make copy in reverse byte order
2067 .Lxts_dec_km_2ndtweak:
2069 srag $i2,$s1,63 # broadcast upper bit
2074 lrvgr $i1,$s0 # flip byte order
2079 stg $i1,0($out,$inp)
2080 stg $i2,8($out,$inp)
2083 .long 0xb92e0066 # km $i2,$i2
2084 brc 1,.-4 # can this happen?
2089 stg $i1,0($out,$inp)
2090 stg $i2,8($out,$inp)
2092 la $i3,0($out,$inp) # put aside real $out
2095 llgc $i2,0($out,$inp)
2096 stc $i1,0($out,$inp)
2097 stc $i2,16($out,$inp)
2099 brct $len,.Lxts_dec_km_steal # one byte per iteration, len%16 times
2109 .long 0xb92e0088 # km $s0,$s0
2110 brc 1,.-4 # can this happen?
2116 stg $sp,$tweak+0($sp) # wipe tweak
2117 stg $sp,$tweak+8($sp)
2118 l${g} $ra,5*$SIZE_T($sp)
2119 lm${g} %r6,$s3,6*$SIZE_T($sp)
2125 stm${g} %r6,$ra,6*$SIZE_T($sp)
2130 l${g} $s3,$stdframe($sp) # ivp
2131 llgf $s0,0($s3) # load iv
2135 stm${g} %r2,%r5,2*$SIZE_T($sp)
2138 bras $ra,_s390x_AES_encrypt # generate the tweak
2139 lm${g} %r2,%r5,2*$SIZE_T($sp)
2142 stm $s0,$s3,$tweak($sp) # save the tweak
2148 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2149 lrvg $s3,$tweak+8($sp)
2151 srag %r0,$s3,63 # broadcast upper bit
2156 lrvgr $s1,$s1 # flip byte order
2158 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
2159 stg $s1,$tweak+0($sp) # save the tweak
2162 stg $s3,$tweak+8($sp)
2165 x $s0,0($inp) # tweak^=*(inp)
2169 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
2171 bras $ra,_s390x_AES_decrypt
2172 lm${g} %r2,%r5,2*$SIZE_T($sp)
2173 x $s0,$tweak+0($sp) # ^=tweak
2176 x $s3,$tweak+12($sp)
2180 st $s3,12($out,$inp)
2182 brct${g} $len,.Lxts_dec_loop
2184 llgc $len,`2*$SIZE_T-1`($sp) # last byte of the saved len copy
2185 nill $len,0x0f # $len%16
2188 # generate pair of tweaks...
2189 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2190 lrvg $s3,$tweak+8($sp)
2192 srag %r0,$s3,63 # broadcast upper bit
2197 lrvgr $i2,$s1 # flip byte order
2199 stmg $i2,$i3,$tweak($sp) # save the 1st tweak
2200 j .Lxts_dec_2ndtweak
2204 llgc $len,`2*$SIZE_T-1`($sp) # last byte of the saved len copy
2205 nill $len,0x0f # $len%16
2206 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2207 lrvg $s3,$tweak+8($sp)
2210 srag %r0,$s3,63 # broadcast upper bit
2215 lrvgr $s1,$s1 # flip byte order
2217 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
2218 stg $s1,$tweak-16+0($sp) # save the 2nd tweak
2221 stg $s3,$tweak-16+8($sp)
2224 x $s0,0($inp) # tweak_the_2nd^=*(inp)
2228 stm${g} %r2,%r3,2*$SIZE_T($sp)
2230 bras $ra,_s390x_AES_decrypt
2231 lm${g} %r2,%r5,2*$SIZE_T($sp)
2232 x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd
2233 x $s1,$tweak-16+4($sp)
2234 x $s2,$tweak-16+8($sp)
2235 x $s3,$tweak-16+12($sp)
2239 st $s3,12($out,$inp)
2241 la $i3,0($out,$inp) # put aside real $out
2244 llgc %r1,0($out,$inp)
2245 stc %r0,0($out,$inp)
2246 stc %r1,16($out,$inp)
2248 brct $len,.Lxts_dec_steal
2249 la $out,0($i3) # restore real $out
2251 lm $s0,$s3,$tweak($sp) # load the 1st tweak
2252 x $s0,0($out) # tweak^=*(inp)|stolen cipher-text
2256 st${g} $out,4*$SIZE_T($sp) # keep out across the call
2258 bras $ra,_s390x_AES_decrypt
2259 l${g} $out,4*$SIZE_T($sp)
2260 x $s0,$tweak+0($sp) # ^=tweak
2263 x $s3,$tweak+12($sp)
2268 stg $sp,$tweak-16+0($sp) # wipe 2nd tweak
2269 stg $sp,$tweak-16+8($sp)
2271 stg $sp,$tweak+0($sp) # wipe tweak
2272 stg $sp,$tweak+8($sp) # ... including its upper 8 bytes
2273 lm${g} %r6,$ra,6*$SIZE_T($sp)
2275 .size AES_xts_decrypt,.-AES_xts_decrypt
2279 .string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
2282 $code =~ s/\`([^\`]*)\`/eval $1/gem; # replace each back-quoted expression with its evaluated value
# The e flag above makes the replacement side Perl code (an eval of the
# captured text), so arithmetic written between back-quotes in the
# generated assembly is computed by this script, not by the assembler.
2284 close STDOUT or die "error closing STDOUT"; # force flush, and die if the close reports an error