2 # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
21 # Software performance improvement over gcc-generated code is ~70% and
22 # in absolute terms is ~73 cycles per byte processed with 128-bit key.
23 # You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
24 # *strictly* in-order execution and issued instruction [in this case
25 # load value from memory is critical] has to complete before execution
26 # flow proceeds. S-boxes are compressed to 2KB[+256B].
28 # As for hardware acceleration support. It's basically a "teaser," as
29 # it can and should be improved in several ways. Most notably support
30 # for CBC is not utilized, nor multiple blocks are ever processed.
31 # Then software key schedule can be postponed till hardware support
32 # detection... Performance improvement over assembler is reportedly
33 # ~2.5x, but can reach >8x [naturally on larger chunks] if proper
34 # support is implemented.
38 # Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
39 # for 128-bit keys, if hardware support is detected.
43 # Add support for hardware AES192/256 and reschedule instructions to
44 # minimize/avoid Address Generation Interlock hazard and to favour
45 # dual-issue z10 pipeline. This gave ~25% improvement on z10 and
46 # almost 50% on z9. The gain is smaller on z10, because being dual-
47 # issue z10 makes it impossible to eliminate the interlock condition:
48 # critical path is not long enough. Yet it spends ~24 cycles per byte
49 # processed with 128-bit key.
51 # Unlike previous version hardware support detection takes place only
52 # at the moment of key schedule setup, which is denoted in key->rounds.
53 # This is done, because deferred key setup can't be made MT-safe, not
54 # for keys longer than 128 bits.
56 # Add AES_cbc_encrypt, which gives incredible performance improvement,
57 # it was measured to be ~6.6x. It's less than previously mentioned 8x,
58 # because software implementation was optimized.
62 # Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
63 # performance improvement over "generic" counter mode routine relying
64 # on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
65 # to the fact that exact throughput value depends on current stack
66 # frame alignment within 4KB page. In worst case you get ~75% of the
67 # maximum, but *on average* it would be as much as ~98%. Meaning that
68 # worst case is unlikely, it's like hitting a ravine on a plateau.
72 # Adapt for -m31 build. If kernel supports what's called "highgprs"
73 # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
74 # instructions and achieve "64-bit" performance even in 31-bit legacy
75 # application context. The feature is not specific to any particular
76 # processor, as long as it's "z-CPU". Latter implies that the code
77 # remains z/Architecture specific. On z990 it was measured to perform
78 # 2x better than code generated by gcc 4.3.
82 # Add support for z196 "cipher message with counter" instruction.
83 # Note however that it's disengaged, because it was measured to
84 # perform ~12% worse than vanilla km-based code...
88 # Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
89 # instructions, which deliver ~70% improvement at 8KB block size over
90 # vanilla km-based code, 37% - at most like 512-bytes block size.
94 if ($flavour =~ /3[12]/) {
102 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
103 open STDOUT,">$output";
105 $softonly=0; # allow hardware support
107 $t0="%r0"; $mask="%r0";
109 $t2="%r2"; $inp="%r2";
110 $t3="%r3"; $out="%r3"; $bits="%r3";
124 $stdframe=16*$SIZE_T+4*8;
128 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
134 .type AES_Te,\@object
139 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
140 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
141 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
142 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
143 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
144 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
145 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
146 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
147 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
148 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
149 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
150 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
151 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
152 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
153 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
154 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
155 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
156 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
157 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
158 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
159 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
160 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
161 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
162 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
163 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
164 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
165 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
166 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
167 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
168 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
169 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
170 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
171 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
172 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
173 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
174 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
175 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
176 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
177 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
178 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
179 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
180 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
181 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
182 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
183 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
184 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
185 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
186 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
187 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
188 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
189 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
190 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
191 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
192 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
193 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
194 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
195 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
196 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
197 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
198 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
199 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
200 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
201 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
202 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
205 .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
206 .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
207 .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
208 .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
209 .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
210 .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
211 .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
212 .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
213 .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
214 .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
215 .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
216 .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
217 .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
218 .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
219 .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
220 .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
221 .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
222 .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
223 .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
224 .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
225 .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
226 .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
227 .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
228 .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
229 .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
230 .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
231 .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
232 .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
233 .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
234 .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
235 .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
236 .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
238 .long 0x01000000, 0x02000000, 0x04000000, 0x08000000
239 .long 0x10000000, 0x20000000, 0x40000000, 0x80000000
240 .long 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
242 .size AES_Te,.-AES_Te
244 # void AES_encrypt(const unsigned char *inp, unsigned char *out,
245 # const AES_KEY *key) {
247 .type AES_encrypt,\@function
250 $code.=<<___ if (!$softonly);
259 lghi %r3,16 # single block length
260 .long 0xb92e0042 # km %r4,%r2
261 brc 1,.-4 # can this happen?
267 stm${g} %r3,$ra,3*$SIZE_T($sp)
275 bras $ra,_s390x_AES_encrypt
277 l${g} $out,3*$SIZE_T($sp)
283 lm${g} %r6,$ra,6*$SIZE_T($sp)
285 .size AES_encrypt,.-AES_encrypt
287 .type _s390x_AES_encrypt,\@function
290 st${g} $ra,15*$SIZE_T($sp)
296 llill $mask,`0xff<<3`
310 srlg $i1,$s1,`16-3` # i0
319 l $s0,0($s0,$tbl) # Te0[s0>>24]
320 l $t1,1($t1,$tbl) # Te3[s0>>0]
321 l $t2,2($t2,$tbl) # Te2[s0>>8]
322 l $t3,3($t3,$tbl) # Te1[s0>>16]
324 x $s0,3($i1,$tbl) # Te1[s1>>16]
325 l $s1,0($s1,$tbl) # Te0[s1>>24]
326 x $t2,1($i2,$tbl) # Te3[s1>>0]
327 x $t3,2($i3,$tbl) # Te2[s1>>8]
329 srlg $i1,$s2,`8-3` # i0
330 srlg $i2,$s2,`16-3` # i1
339 srlg $ra,$s3,`8-3` # i1
340 sllg $t1,$s3,`0+3` # i0
345 x $s0,2($i1,$tbl) # Te2[s2>>8]
346 x $s1,3($i2,$tbl) # Te1[s2>>16]
347 l $s2,0($s2,$tbl) # Te0[s2>>24]
348 x $t3,1($i3,$tbl) # Te3[s2>>0]
350 srlg $i3,$s3,`16-3` # i2
361 x $s0,1($t1,$tbl) # Te3[s3>>0]
362 x $s1,2($ra,$tbl) # Te2[s3>>8]
363 x $s2,3($i3,$tbl) # Te1[s3>>16]
364 l $s3,0($s3,$tbl) # Te0[s3>>24]
367 brct $rounds,.Lenc_loop
379 srlg $i1,$s1,`16-3` # i0
388 llgc $s0,2($s0,$tbl) # Te4[s0>>24]
389 llgc $t1,2($t1,$tbl) # Te4[s0>>0]
391 llgc $t2,2($t2,$tbl) # Te4[s0>>8]
392 llgc $t3,2($t3,$tbl) # Te4[s0>>16]
396 llgc $i1,2($i1,$tbl) # Te4[s1>>16]
397 llgc $s1,2($s1,$tbl) # Te4[s1>>24]
398 llgc $i2,2($i2,$tbl) # Te4[s1>>0]
399 llgc $i3,2($i3,$tbl) # Te4[s1>>8]
408 srlg $i1,$s2,`8-3` # i0
409 srlg $i2,$s2,`16-3` # i1
417 sllg $t1,$s3,`0+3` # i0
418 srlg $ra,$s3,`8-3` # i1
421 llgc $i1,2($i1,$tbl) # Te4[s2>>8]
422 llgc $i2,2($i2,$tbl) # Te4[s2>>16]
424 llgc $s2,2($s2,$tbl) # Te4[s2>>24]
425 llgc $i3,2($i3,$tbl) # Te4[s2>>0]
434 srlg $i3,$s3,`16-3` # i2
442 llgc $i1,2($t1,$tbl) # Te4[s3>>0]
443 llgc $i2,2($ra,$tbl) # Te4[s3>>8]
444 llgc $i3,2($i3,$tbl) # Te4[s3>>16]
445 llgc $s3,2($s3,$tbl) # Te4[s3>>24]
454 l${g} $ra,15*$SIZE_T($sp)
461 .size _s390x_AES_encrypt,.-_s390x_AES_encrypt
465 .type AES_Td,\@object
470 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
471 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
472 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
473 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
474 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
475 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
476 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
477 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
478 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
479 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
480 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
481 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
482 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
483 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
484 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
485 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
486 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
487 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
488 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
489 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
490 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
491 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
492 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
493 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
494 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
495 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
496 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
497 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
498 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
499 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
500 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
501 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
502 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
503 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
504 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
505 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
506 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
507 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
508 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
509 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
510 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
511 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
512 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
513 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
514 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
515 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
516 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
517 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
518 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
519 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
520 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
521 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
522 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
523 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
524 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
525 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
526 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
527 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
528 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
529 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
530 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
531 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
532 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
533 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
536 .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
537 .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
538 .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
539 .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
540 .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
541 .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
542 .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
543 .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
544 .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
545 .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
546 .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
547 .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
548 .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
549 .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
550 .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
551 .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
552 .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
553 .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
554 .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
555 .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
556 .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
557 .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
558 .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
559 .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
560 .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
561 .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
562 .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
563 .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
564 .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
565 .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
566 .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
567 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
568 .size AES_Td,.-AES_Td
570 # void AES_decrypt(const unsigned char *inp, unsigned char *out,
571 # const AES_KEY *key) {
573 .type AES_decrypt,\@function
576 $code.=<<___ if (!$softonly);
585 lghi %r3,16 # single block length
586 .long 0xb92e0042 # km %r4,%r2
587 brc 1,.-4 # can this happen?
593 stm${g} %r3,$ra,3*$SIZE_T($sp)
601 bras $ra,_s390x_AES_decrypt
603 l${g} $out,3*$SIZE_T($sp)
609 lm${g} %r6,$ra,6*$SIZE_T($sp)
611 .size AES_decrypt,.-AES_decrypt
613 .type _s390x_AES_decrypt,\@function
616 st${g} $ra,15*$SIZE_T($sp)
622 llill $mask,`0xff<<3`
636 sllg $i1,$s1,`0+3` # i0
645 l $s0,0($s0,$tbl) # Td0[s0>>24]
646 l $t1,3($t1,$tbl) # Td1[s0>>16]
647 l $t2,2($t2,$tbl) # Td2[s0>>8]
648 l $t3,1($t3,$tbl) # Td3[s0>>0]
650 x $s0,1($i1,$tbl) # Td3[s1>>0]
651 l $s1,0($s1,$tbl) # Td0[s1>>24]
652 x $t2,3($i2,$tbl) # Td1[s1>>16]
653 x $t3,2($i3,$tbl) # Td2[s1>>8]
655 srlg $i1,$s2,`8-3` # i0
656 sllg $i2,$s2,`0+3` # i1
665 srlg $ra,$s3,`8-3` # i1
666 srlg $t1,$s3,`16-3` # i0
671 x $s0,2($i1,$tbl) # Td2[s2>>8]
672 x $s1,1($i2,$tbl) # Td3[s2>>0]
673 l $s2,0($s2,$tbl) # Td0[s2>>24]
674 x $t3,3($i3,$tbl) # Td1[s2>>16]
676 sllg $i3,$s3,`0+3` # i2
687 x $s0,3($t1,$tbl) # Td1[s3>>16]
688 x $s1,2($ra,$tbl) # Td2[s3>>8]
689 x $s2,1($i3,$tbl) # Td3[s3>>0]
690 l $s3,0($s3,$tbl) # Td0[s3>>24]
693 brct $rounds,.Ldec_loop
696 l $t1,`2048+0`($tbl) # prefetch Td4
697 l $t2,`2048+64`($tbl)
698 l $t3,`2048+128`($tbl)
699 l $i1,`2048+192`($tbl)
716 llgc $i3,2048($i3,$tbl) # Td4[s0>>24]
717 llgc $t1,2048($t1,$tbl) # Td4[s0>>16]
718 llgc $t2,2048($t2,$tbl) # Td4[s0>>8]
720 llgc $t3,2048($s0,$tbl) # Td4[s0>>0]
724 llgc $s1,2048($s1,$tbl) # Td4[s1>>0]
725 llgc $i1,2048($i1,$tbl) # Td4[s1>>24]
726 llgc $i2,2048($i2,$tbl) # Td4[s1>>16]
728 llgc $i3,2048($ra,$tbl) # Td4[s1>>8]
742 llgc $i1,2048($i1,$tbl) # Td4[s2>>8]
743 llgc $s1,2048($s2,$tbl) # Td4[s2>>0]
744 llgc $i2,2048($i2,$tbl) # Td4[s2>>24]
745 llgc $i3,2048($i3,$tbl) # Td4[s2>>16]
760 l${g} $ra,15*$SIZE_T($sp)
765 llgc $i1,2048($i1,$tbl) # Td4[s3>>16]
766 llgc $i2,2048($i2,$tbl) # Td4[s3>>8]
768 llgc $s2,2048($s3,$tbl) # Td4[s3>>0]
769 llgc $s3,2048($i3,$tbl) # Td4[s3>>24]
783 .size _s390x_AES_decrypt,.-_s390x_AES_decrypt
787 # void AES_set_encrypt_key(const unsigned char *in, int bits,
789 .globl AES_set_encrypt_key
790 .type AES_set_encrypt_key,\@function
793 _s390x_AES_set_encrypt_key:
815 $code.=<<___ if (!$softonly);
816 # convert bits to km code, [128,192,256]->[18,19,20]
823 larl %r1,OPENSSL_s390xcap_P
825 tmhl %r0,0x4000 # check for message-security assist
830 ng %r0,48(%r1) # check kmc capability vector
833 lmg %r0,%r1,0($inp) # just copy 128 bits...
843 1: st $bits,236($key) # save bits [for debugging purposes]
845 st %r5,240($key) # save km code
852 stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key
854 larl $tbl,AES_Te+2048
873 llgfr $t2,$s3 # temp=rk[3]
887 icm $t2,2,0($t2) # Te4[rk[3]>>0]<<8
888 icm $t2,4,0($i1) # Te4[rk[3]>>8]<<16
889 icm $t2,8,0($i2) # Te4[rk[3]>>16]<<24
890 icm $t2,1,0($i3) # Te4[rk[3]>>24]
891 x $t2,256($t3,$tbl) # rcon[i]
892 xr $s0,$t2 # rk[4]=rk[0]^...
893 xr $s1,$s0 # rk[5]=rk[1]^rk[4]
894 xr $s2,$s1 # rk[6]=rk[2]^rk[5]
895 xr $s3,$s2 # rk[7]=rk[3]^rk[6]
897 llgfr $t2,$s3 # temp=rk[3]
909 la $key,16($key) # key+=4
911 brct $rounds,.L128_loop
914 lm${g} %r4,%r13,4*$SIZE_T($sp)
946 icm $t1,2,0($t1) # Te4[rk[5]>>0]<<8
947 icm $t1,4,0($i1) # Te4[rk[5]>>8]<<16
948 icm $t1,8,0($i2) # Te4[rk[5]>>16]<<24
949 icm $t1,1,0($i3) # Te4[rk[5]>>24]
950 x $t1,256($t3,$tbl) # rcon[i]
951 xr $s0,$t1 # rk[6]=rk[0]^...
952 xr $s1,$s0 # rk[7]=rk[1]^rk[6]
953 xr $s2,$s1 # rk[8]=rk[2]^rk[7]
954 xr $s3,$s2 # rk[9]=rk[3]^rk[8]
960 brct $rounds,.L192_continue
963 lm${g} %r4,%r13,4*$SIZE_T($sp)
969 x $t1,16($key) # rk[10]=rk[4]^rk[9]
971 x $t1,20($key) # rk[11]=rk[5]^rk[10]
981 la $key,24($key) # key+=6
1010 icm $t1,2,0($t1) # Te4[rk[7]>>0]<<8
1011 icm $t1,4,0($i1) # Te4[rk[7]>>8]<<16
1012 icm $t1,8,0($i2) # Te4[rk[7]>>16]<<24
1013 icm $t1,1,0($i3) # Te4[rk[7]>>24]
1014 x $t1,256($t3,$tbl) # rcon[i]
1015 xr $s0,$t1 # rk[8]=rk[0]^...
1016 xr $s1,$s0 # rk[9]=rk[1]^rk[8]
1017 xr $s2,$s1 # rk[10]=rk[2]^rk[9]
1018 xr $s3,$s2 # rk[11]=rk[3]^rk[10]
1023 brct $rounds,.L256_continue
1026 lm${g} %r4,%r13,4*$SIZE_T($sp)
1031 lgr $t1,$s3 # temp=rk[11]
1042 llgc $t1,0($t1) # Te4[rk[11]>>0]
1043 icm $t1,2,0($i1) # Te4[rk[11]>>8]<<8
1044 icm $t1,4,0($i2) # Te4[rk[11]>>16]<<16
1045 icm $t1,8,0($i3) # Te4[rk[11]>>24]<<24
1046 x $t1,16($key) # rk[12]=rk[4]^...
1048 x $t1,20($key) # rk[13]=rk[5]^rk[12]
1050 x $t1,24($key) # rk[14]=rk[6]^rk[13]
1052 x $t1,28($key) # rk[15]=rk[7]^rk[14]
1062 la $key,32($key) # key+=8
1069 .size AES_set_encrypt_key,.-AES_set_encrypt_key
1071 # void AES_set_decrypt_key(const unsigned char *in, int bits,
1073 .globl AES_set_decrypt_key
1074 .type AES_set_decrypt_key,\@function
1076 AES_set_decrypt_key:
1077 #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
1078 st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key!
1079 bras $ra,_s390x_AES_set_encrypt_key
1080 #l${g} $key,4*$SIZE_T($sp)
1081 l${g} $ra,14*$SIZE_T($sp)
1085 $code.=<<___ if (!$softonly);
1090 oill $t0,0x80 # set "decrypt" bit
1096 .Lgo: lgr $rounds,$t0 #llgf $rounds,240($key)
1104 .Linv: lmg $s0,$s1,0($i1)
1116 llgf $rounds,240($key)
1118 sll $rounds,2 # (rounds-1)*4
1119 llilh $mask80,0x8080
1120 llilh $mask1b,0x1b1b
1121 llilh $maskfe,0xfefe
1127 .Lmix: l $s0,16($key) # tp1
1155 xr $s1,$s0 # tp2^tp1
1156 xr $s2,$s0 # tp4^tp1
1157 rll $s0,$s0,24 # = ROTATE(tp1,8)
1159 xr $s0,$s1 # ^=tp2^tp1
1160 xr $s1,$s3 # tp2^tp1^tp8
1161 xr $s0,$s2 # ^=tp4^tp1^tp8
1164 xr $s0,$s1 # ^= ROTATE(tp8^tp2^tp1,24)
1166 xr $s0,$s2 # ^= ROTATE(tp8^tp4^tp1,16)
1167 xr $s0,$s3 # ^= ROTATE(tp8,8)
1173 lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
1176 .size AES_set_decrypt_key,.-AES_set_decrypt_key
1179 ########################################################################
1180 # void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
1181 # size_t length, const AES_KEY *key,
1182 # unsigned char *ivec, const int enc)
1185 my $out="%r4"; # length and out are swapped
1191 .globl AES_cbc_encrypt
1192 .type AES_cbc_encrypt,\@function
1195 xgr %r3,%r4 # flip %r3 and %r4, out and len
1199 $code.=<<___ if (!$softonly);
1204 lg %r0,0($ivp) # copy ivec
1206 stmg %r0,%r1,16($sp)
1207 lmg %r0,%r1,0($key) # copy key, cover 256 bit
1208 stmg %r0,%r1,32($sp)
1209 lmg %r0,%r1,16($key)
1210 stmg %r0,%r1,48($sp)
1211 l %r0,240($key) # load kmc code
1212 lghi $key,15 # res=len%16, len-=res;
1215 la %r1,16($sp) # parameter block - ivec || key
1217 .long 0xb92f0042 # kmc %r4,%r2
1218 brc 1,.-4 # pay attention to "partial completion"
1222 lmg %r0,%r1,16($sp) # copy ivec to caller
1228 ahi $key,-1 # it's the way it's encoded in mvc
1230 jnz .Lkmc_truncated_dec
1232 stg %r1,16*$SIZE_T($sp)
1233 stg %r1,16*$SIZE_T+8($sp)
1235 mvc 16*$SIZE_T(1,$sp),0($inp)
1237 la %r1,16($sp) # restore parameter block
1238 la $inp,16*$SIZE_T($sp)
1240 .long 0xb92f0042 # kmc %r4,%r2
1243 .Lkmc_truncated_dec:
1244 st${g} $out,4*$SIZE_T($sp)
1245 la $out,16*$SIZE_T($sp)
1247 .long 0xb92f0042 # kmc %r4,%r2
1248 l${g} $out,4*$SIZE_T($sp)
1250 mvc 0(1,$out),16*$SIZE_T($sp)
1257 stm${g} $key,$ra,5*$SIZE_T($sp)
1259 cl %r0,`$stdframe+$SIZE_T-4`($sp)
1271 brc 4,.Lcbc_enc_tail # if borrow
1273 stm${g} $inp,$out,2*$SIZE_T($sp)
1280 bras $ra,_s390x_AES_encrypt
1282 lm${g} $inp,$key,2*$SIZE_T($sp)
1294 brc 4,.Lcbc_enc_tail # if borrow
1298 l${g} $ivp,6*$SIZE_T($sp)
1304 lm${g} %r7,$ra,7*$SIZE_T($sp)
1311 stg $t0,16*$SIZE_T($sp)
1312 stg $t0,16*$SIZE_T+8($sp)
1314 mvc 16*$SIZE_T(1,$sp),0($inp)
1317 la $inp,16*$SIZE_T($sp)
1326 stmg $t0,$t1,16*$SIZE_T($sp)
1329 stm${g} $inp,$out,2*$SIZE_T($sp)
1336 bras $ra,_s390x_AES_decrypt
1338 lm${g} $inp,$key,2*$SIZE_T($sp)
1346 xg $s0,16*$SIZE_T($sp)
1347 xg $s2,16*$SIZE_T+8($sp)
1350 brc 4,.Lcbc_dec_tail # if borrow
1351 brc 2,.Lcbc_dec_done # if zero
1354 stmg $t0,$t1,16*$SIZE_T($sp)
1364 lm${g} %r6,$ra,6*$SIZE_T($sp)
1365 stmg $t0,$t1,0($ivp)
1372 stg $s0,16*$SIZE_T($sp)
1373 stg $s2,16*$SIZE_T+8($sp)
1375 mvc 0(1,$out),16*$SIZE_T($sp)
1378 .size AES_cbc_encrypt,.-AES_cbc_encrypt
1381 ########################################################################
1382 # void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
1383 # size_t blocks, const AES_KEY *key,
1384 # const unsigned char *ivec)
1387 my $out="%r4"; # blocks and out are swapped
1389 my $key="%r5"; my $iv0="%r5";
1394 .globl AES_ctr32_encrypt
1395 .type AES_ctr32_encrypt,\@function
1398 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1401 llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case
1403 $code.=<<___ if (!$softonly);
1409 stm${g} %r6,$s3,6*$SIZE_T($sp)
1412 la %r1,0($key) # %r1 is permanent copy of $key
1413 lg $iv0,0($ivp) # load ivec
1416 # prepare and allocate stack frame at the top of 4K page
1417 # with 1K reserved for eventual signal handling
1418 lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
1422 ngr $s0,$s1 # align at page boundary
1423 slgr $fp,$s0 # total buffer size
1425 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1426 slgr $fp,$s1 # deduct reservation to get usable buffer size
1427 # buffer size is at lest 256 and at most 3072+256-16
1429 la $sp,1024($s0) # alloca
1430 srlg $fp,$fp,4 # convert bytes to blocks, minimum 16
1431 st${g} $s2,0($sp) # back-chain
1432 st${g} $fp,$SIZE_T($sp)
1435 brc 1,.Lctr32_hw_switch # not zero, no borrow
1436 algr $fp,$len # input is shorter than allocated buffer
1438 st${g} $fp,$SIZE_T($sp)
1442 $code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower
1443 larl $s0,OPENSSL_s390xcap_P
1445 tmhh $s0,0x0004 # check for message_security-assist-4
1450 larl %r1,OPENSSL_s390xcap_P
1451 llihh %r0,0x8000 # check if kmctr supports the function code
1453 ng %r0,64(%r1) # check kmctr capability vector
1459 algr $out,$inp # restore $out
1460 lgr $s1,$len # $s1 undertakes $len
1461 j .Lctr32_kmctr_loop
1466 .Lctr32_kmctr_prepare:
1470 ahi $ivp,1 # 32-bit increment, preserves upper half
1471 brct $s3,.Lctr32_kmctr_prepare
1473 #la $inp,0($inp) # inp
1474 sllg $len,$fp,4 # len
1475 #la $out,0($out) # out
1477 .long 0xb92da042 # kmctr $out,$s2,$inp
1478 brc 1,.-4 # pay attention to "partial completion"
1481 brc 1,.Lctr32_kmctr_loop # not zero, no borrow
1484 brc 4+1,.Lctr32_kmctr_loop # not zero
1487 lm${g} %r6,$s3,6*$SIZE_T($sp)
1499 ahi $ivp,1 # 32-bit increment, preserves upper half
1500 brct $s3,.Lctr32_km_prepare
1502 la $s0,16($sp) # inp
1503 sllg $s1,$fp,4 # len
1504 la $s2,16($sp) # out
1505 .long 0xb92e00a8 # km %r10,%r8
1506 brc 1,.-4 # pay attention to "partial completion"
1516 stg $s0,0($out,$inp)
1517 stg $s1,8($out,$inp)
1519 brct $s3,.Lctr32_km_xor
1522 brc 1,.Lctr32_km_loop # not zero, no borrow
1525 brc 4+1,.Lctr32_km_loop # not zero
1528 l${g} $s1,$SIZE_T($sp)
1534 brct $s1,.Lctr32_km_zap
1537 lm${g} %r6,$s3,6*$SIZE_T($sp)
1543 stm${g} $key,$ra,5*$SIZE_T($sp)
1549 stm${g} $inp,$out,2*$SIZE_T($sp)
1554 st $t1,16*$SIZE_T($sp)
1557 bras $ra,_s390x_AES_encrypt
1559 lm${g} $inp,$ivp,2*$SIZE_T($sp)
1560 llgf $t1,16*$SIZE_T($sp)
1568 ahi $t1,1 # 32-bit increment
1569 brct $len,.Lctr32_loop
1571 lm${g} %r6,$ra,6*$SIZE_T($sp)
1573 .size AES_ctr32_encrypt,.-AES_ctr32_encrypt
1577 ########################################################################
1578 # void AES_xts_encrypt(const unsigned char *inp, unsigned char *out,
1579 # size_t len, const AES_KEY *key1, const AES_KEY *key2,
1580 # const unsigned char iv[16]);
1584 my $out="%r4"; # len and out are swapped
1586 my $key1="%r5"; # $i1
1587 my $key2="%r6"; # $i2
1589 my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame...
1592 .type _s390x_xts_km,\@function
1597 llgfr $s0,%r0 # put aside the function code
1600 larl %r1,OPENSSL_s390xcap_P
1602 srlg %r0,%r0,32($s1) # check for 32+function code
1603 ng %r0,32(%r1) # check km capability vector
1604 lgr %r0,$s0 # restore the function code
1605 la %r1,0($key1) # restore $key1
1608 lmg $i2,$i3,$tweak($sp) # put aside the tweak value
1611 oill %r0,32 # switch to xts function code
1613 sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16
1614 la %r1,$tweak-16($sp)
1615 slgr %r1,$s1 # parameter block position
1616 lmg $s0,$s3,0($key1) # load 256 bits of key material,
1617 stmg $s0,$s3,0(%r1) # and copy it to parameter block.
1618 # yes, it contains junk and overlaps
1619 # with the tweak in 128-bit case.
1620 # it's done to avoid conditional
1622 stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value
1624 .long 0xb92e0042 # km %r4,%r2
1625 brc 1,.-4 # pay attention to "partial completion"
1627 lrvg $s0,$tweak+0($sp) # load the last tweak
1628 lrvg $s1,$tweak+8($sp)
1629 stmg %r0,%r3,$tweak-32($sp) # wipe copy of the key
1631 nill %r0,0xffdf # switch back to original function code
1632 la %r1,0($key1) # restore pointer to $key1
1635 llgc $len,2*$SIZE_T-1($sp)
1636 nill $len,0x0f # $len%=16
1643 # prepare and allocate stack frame at the top of 4K page
1644 # with 1K reserved for eventual signal handling
1645 lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
1649 ngr $s0,$s1 # align at page boundary
1650 slgr $fp,$s0 # total buffer size
1652 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1653 slgr $fp,$s1 # deduct reservation to get usable buffer size
1654 # buffer size is at lest 256 and at most 3072+256-16
1656 la $sp,1024($s0) # alloca
1657 nill $fp,0xfff0 # round to 16*n
1658 st${g} $s2,0($sp) # back-chain
1659 nill $len,0xfff0 # redundant
1660 st${g} $fp,$SIZE_T($sp)
1663 brc 1,.Lxts_km_go # not zero, no borrow
1664 algr $fp,$len # input is shorter than allocated buffer
1666 st${g} $fp,$SIZE_T($sp)
1669 lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian
1670 lrvg $s1,$tweak+8($s2)
1672 la $s2,16($sp) # vector of ascending tweak values
1683 srag $i2,$s1,63 # broadcast upper bit
1689 lrvgr $i1,$s0 # flip byte order
1695 stg $i1,0($out,$inp)
1696 stg $i2,8($out,$inp)
1698 brct $s3,.Lxts_km_prepare
1700 slgr $inp,$fp # rewind $inp
1703 .long 0xb92e00aa # km $s2,$s2
1704 brc 1,.-4 # pay attention to "partial completion"
1714 stg $i1,0($out,$inp)
1715 stg $i2,8($out,$inp)
1717 brct $s3,.Lxts_km_xor
1720 brc 1,.Lxts_km_loop # not zero, no borrow
1723 brc 4+1,.Lxts_km_loop # not zero
1725 l${g} $i1,0($sp) # back-chain
1726 llgf $fp,`2*$SIZE_T-4`($sp) # bytes used
1733 brct $fp,.Lxts_km_zap
1736 llgc $len,2*$SIZE_T-1($i1)
1737 nill $len,0x0f # $len%=16
1740 # generate one more tweak...
1742 srag $i2,$s1,63 # broadcast upper bit
1748 ltr $len,$len # clear zero flag
1750 .size _s390x_xts_km,.-_s390x_xts_km
1752 .globl AES_xts_encrypt
1753 .type AES_xts_encrypt,\@function
1756 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1760 $code.=<<___ if ($SIZE_T==4);
1764 st${g} $len,1*$SIZE_T($sp) # save copy of $len
1765 srag $len,$len,4 # formally wrong, because it expands
1766 # sign byte, but who can afford asking
1767 # to process more than 2^63-1 bytes?
1768 # I use it, because it sets condition
1770 bcr 8,$ra # abort if zero (i.e. less than 16)
1772 $code.=<<___ if (!$softonly);
1776 jl .Lxts_enc_software
1778 st${g} $ra,5*$SIZE_T($sp)
1779 stm${g} %r6,$s3,6*$SIZE_T($sp)
1781 sllg $len,$len,4 # $len&=~15
1784 # generate the tweak value
1785 l${g} $s3,$stdframe($sp) # pointer to iv
1790 la %r1,0($key2) # $key2 is not needed anymore
1791 .long 0xb92e00aa # km $s2,$s2, generate the tweak
1792 brc 1,.-4 # can this happen?
1795 la %r1,0($key1) # $key1 is not needed anymore
1796 bras $ra,_s390x_xts_km
1797 jz .Lxts_enc_km_done
1799 aghi $inp,-16 # take one step back
1800 la $i3,0($out,$inp) # put aside real $out
1803 llgc $i2,0($out,$inp)
1804 stc $i1,0($out,$inp)
1805 stc $i2,16($out,$inp)
1807 brct $len,.Lxts_enc_km_steal
1811 lrvgr $i1,$s0 # flip byte order
1817 .long 0xb92e00aa # km $s2,$s2
1818 brc 1,.-4 # can this happen?
1819 lrvgr $i1,$s0 # flip byte order
1827 stg $sp,$tweak+0($sp) # wipe tweak
1828 stg $sp,$tweak+8($sp)
1829 l${g} $ra,5*$SIZE_T($sp)
1830 lm${g} %r6,$s3,6*$SIZE_T($sp)
1836 stm${g} %r6,$ra,6*$SIZE_T($sp)
1840 l${g} $s3,$stdframe($sp) # ivp
1841 llgf $s0,0($s3) # load iv
1845 stm${g} %r2,%r5,2*$SIZE_T($sp)
1848 bras $ra,_s390x_AES_encrypt # generate the tweak
1849 lm${g} %r2,%r5,2*$SIZE_T($sp)
1850 stm $s0,$s3,$tweak($sp) # save the tweak
1855 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1856 lrvg $s3,$tweak+8($sp)
1858 srag %r0,$s3,63 # broadcast upper bit
1863 lrvgr $s1,$s1 # flip byte order
1865 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1866 stg $s1,$tweak+0($sp) # save the tweak
1869 stg $s3,$tweak+8($sp)
1871 la $inp,16($inp) # $inp+=16
1873 x $s0,0($inp) # ^=*($inp)
1877 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
1879 bras $ra,_s390x_AES_encrypt
1880 lm${g} %r2,%r5,2*$SIZE_T($sp)
1881 x $s0,$tweak+0($sp) # ^=tweak
1884 x $s3,$tweak+12($sp)
1888 st $s3,12($out,$inp)
1889 brct${g} $len,.Lxts_enc_loop
1891 llgc $len,`2*$SIZE_T-1`($sp)
1892 nill $len,0x0f # $len%16
1895 la $i3,0($inp,$out) # put aside real $out
1898 llgc %r1,0($out,$inp)
1899 stc %r0,0($out,$inp)
1900 stc %r1,16($out,$inp)
1902 brct $len,.Lxts_enc_steal
1903 la $out,0($i3) # restore real $out
1905 # generate last tweak...
1906 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1907 lrvg $s3,$tweak+8($sp)
1909 srag %r0,$s3,63 # broadcast upper bit
1914 lrvgr $s1,$s1 # flip byte order
1916 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1917 stg $s1,$tweak+0($sp) # save the tweak
1920 stg $s3,$tweak+8($sp)
1923 x $s0,0($out) # ^=*(inp)|stolen cipher-text
1927 st${g} $out,4*$SIZE_T($sp)
1929 bras $ra,_s390x_AES_encrypt
1930 l${g} $out,4*$SIZE_T($sp)
1931 x $s0,`$tweak+0`($sp) # ^=tweak
1932 x $s1,`$tweak+4`($sp)
1933 x $s2,`$tweak+8`($sp)
1934 x $s3,`$tweak+12`($sp)
1941 stg $sp,$tweak+0($sp) # wipe tweak
1942 stg $sp,$tweak+8($sp) # was "$twesk" (undefined), which interpolated
1943 lm${g} %r6,$ra,6*$SIZE_T($sp) # to nothing and left tweak+8 unwiped
1945 .size AES_xts_encrypt,.-AES_xts_encrypt
1947 # void AES_xts_decrypt(const unsigned char *inp, unsigned char *out,
1948 # size_t len, const AES_KEY *key1, const AES_KEY *key2,
1949 # const unsigned char iv[16]);
1952 .globl AES_xts_decrypt
1953 .type AES_xts_decrypt,\@function
1956 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1960 $code.=<<___ if ($SIZE_T==4);
1964 st${g} $len,1*$SIZE_T($sp) # save copy of $len
1966 bcr 4,$ra # abort if less than zero. formally
1967 # wrong, because $len is unsigned,
1968 # but who can afford asking to
1969 # process more than 2^63-1 bytes?
1971 jnz .Lxts_dec_proceed
1975 $code.=<<___ if (!$softonly);
1979 jl .Lxts_dec_software
1981 st${g} $ra,5*$SIZE_T($sp)
1982 stm${g} %r6,$s3,6*$SIZE_T($sp)
1984 nill $len,0xfff0 # $len&=~15
1987 # generate the tweak value
1988 l${g} $s3,$stdframe($sp) # pointer to iv
1993 la %r1,0($key2) # $key2 is not needed past this point
1994 .long 0xb92e00aa # km $s2,$s2, generate the tweak
1995 brc 1,.-4 # can this happen?
1998 la %r1,0($key1) # $key1 is not needed anymore
2001 jz .Lxts_dec_km_short
2002 bras $ra,_s390x_xts_km
2003 jz .Lxts_dec_km_done
2005 lrvgr $s2,$s0 # make copy in reverse byte order
2007 j .Lxts_dec_km_2ndtweak
2010 llgc $len,`2*$SIZE_T-1`($sp)
2011 nill $len,0x0f # $len%=16
2012 lrvg $s0,$tweak+0($sp) # load the tweak
2013 lrvg $s1,$tweak+8($sp)
2014 lrvgr $s2,$s0 # make copy in reverse byte order
2017 .Lxts_dec_km_2ndtweak:
2019 srag $i2,$s1,63 # broadcast upper bit
2024 lrvgr $i1,$s0 # flip byte order
2029 stg $i1,0($out,$inp)
2030 stg $i2,8($out,$inp)
2033 .long 0xb92e0066 # km $i2,$i2
2034 brc 1,.-4 # can this happen?
2039 stg $i1,0($out,$inp)
2040 stg $i2,8($out,$inp)
2042 la $i3,0($out,$inp) # put aside real $out
2045 llgc $i2,0($out,$inp)
2046 stc $i1,0($out,$inp)
2047 stc $i2,16($out,$inp)
2049 brct $len,.Lxts_dec_km_steal
2059 .long 0xb92e0088 # km $s0,$s0
2060 brc 1,.-4 # can this happen?
2066 stg $sp,$tweak+0($sp) # wipe tweak
2067 stg $sp,$tweak+8($sp)
2068 l${g} $ra,5*$SIZE_T($sp)
2069 lm${g} %r6,$s3,6*$SIZE_T($sp)
2075 stm${g} %r6,$ra,6*$SIZE_T($sp)
2080 l${g} $s3,$stdframe($sp) # ivp
2081 llgf $s0,0($s3) # load iv
2085 stm${g} %r2,%r5,2*$SIZE_T($sp)
2088 bras $ra,_s390x_AES_encrypt # generate the tweak
2089 lm${g} %r2,%r5,2*$SIZE_T($sp)
2092 stm $s0,$s3,$tweak($sp) # save the tweak
2098 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2099 lrvg $s3,$tweak+8($sp)
2101 srag %r0,$s3,63 # broadcast upper bit
2106 lrvgr $s1,$s1 # flip byte order
2108 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
2109 stg $s1,$tweak+0($sp) # save the tweak
2112 stg $s3,$tweak+8($sp)
2115 x $s0,0($inp) # tweak^=*(inp)
2119 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
2121 bras $ra,_s390x_AES_decrypt
2122 lm${g} %r2,%r5,2*$SIZE_T($sp)
2123 x $s0,$tweak+0($sp) # ^=tweak
2126 x $s3,$tweak+12($sp)
2130 st $s3,12($out,$inp)
2132 brct${g} $len,.Lxts_dec_loop
2134 llgc $len,`2*$SIZE_T-1`($sp)
2135 nill $len,0x0f # $len%16
2138 # generate pair of tweaks...
2139 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2140 lrvg $s3,$tweak+8($sp)
2142 srag %r0,$s3,63 # broadcast upper bit
2147 lrvgr $i2,$s1 # flip byte order
2149 stmg $i2,$i3,$tweak($sp) # save the 1st tweak
2150 j .Lxts_dec_2ndtweak
2154 llgc $len,`2*$SIZE_T-1`($sp)
2155 nill $len,0x0f # $len%16
2156 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2157 lrvg $s3,$tweak+8($sp)
2160 srag %r0,$s3,63 # broadcast upper bit
2165 lrvgr $s1,$s1 # flip byte order
2167 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
2168 stg $s1,$tweak-16+0($sp) # save the 2nd tweak
2171 stg $s3,$tweak-16+8($sp)
2174 x $s0,0($inp) # tweak_the_2nd^=*(inp)
2178 stm${g} %r2,%r3,2*$SIZE_T($sp)
2180 bras $ra,_s390x_AES_decrypt
2181 lm${g} %r2,%r5,2*$SIZE_T($sp)
2182 x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd
2183 x $s1,$tweak-16+4($sp)
2184 x $s2,$tweak-16+8($sp)
2185 x $s3,$tweak-16+12($sp)
2189 st $s3,12($out,$inp)
2191 la $i3,0($out,$inp) # put aside real $out
2194 llgc %r1,0($out,$inp)
2195 stc %r0,0($out,$inp)
2196 stc %r1,16($out,$inp)
2198 brct $len,.Lxts_dec_steal
2199 la $out,0($i3) # restore real $out
2201 lm $s0,$s3,$tweak($sp) # load the 1st tweak
2202 x $s0,0($out) # tweak^=*(inp)|stolen cipher-text
2206 st${g} $out,4*$SIZE_T($sp)
2208 bras $ra,_s390x_AES_decrypt
2209 l${g} $out,4*$SIZE_T($sp)
2210 x $s0,$tweak+0($sp) # ^=tweak
2213 x $s3,$tweak+12($sp)
2218 stg $sp,$tweak-16+0($sp) # wipe 2nd tweak
2219 stg $sp,$tweak-16+8($sp)
2221 stg $sp,$tweak+0($sp) # wipe tweak
2222 stg $sp,$tweak+8($sp) # was "$twesk" (undefined), which interpolated
2223 lm${g} %r6,$ra,6*$SIZE_T($sp) # to nothing and left tweak+8 unwiped
2225 .size AES_xts_decrypt,.-AES_xts_decrypt
2229 .string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
2230 .comm OPENSSL_s390xcap_P,80,8
2233 $code =~ s/\`([^\`]*)\`/eval $1/gem;
2235 close STDOUT; # force flush