2 # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
21 # Software performance improvement over gcc-generated code is ~70% and
22 # in absolute terms is ~73 cycles per byte processed with 128-bit key.
23 # You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
24 # *strictly* in-order execution and issued instruction [in this case
25 # load value from memory is critical] has to complete before execution
26 # flow proceeds. S-boxes are compressed to 2KB[+256B].
28 # As for hardware acceleration support. It's basically a "teaser," as
29 # it can and should be improved in several ways. Most notably support
30 # for CBC is not utilized, nor multiple blocks are ever processed.
31 # Then software key schedule can be postponed till hardware support
32 # detection... Performance improvement over assembler is reportedly
33 # ~2.5x, but can reach >8x [naturally on larger chunks] if proper
34 # support is implemented.
38 # Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
39 # for 128-bit keys, if hardware support is detected.
43 # Add support for hardware AES192/256 and reschedule instructions to
44 # minimize/avoid Address Generation Interlock hazard and to favour
45 # dual-issue z10 pipeline. This gave ~25% improvement on z10 and
46 # almost 50% on z9. The gain is smaller on z10, because being dual-
47 # issue z10 makes it impossible to eliminate the interlock condition:
48 # critical path is not long enough. Yet it spends ~24 cycles per byte
49 # processed with 128-bit key.
51 # Unlike previous version hardware support detection takes place only
52 # at the moment of key schedule setup, which is denoted in key->rounds.
53 # This is done, because deferred key setup can't be made MT-safe, not
54 # for keys longer than 128 bits.
56 # Add AES_cbc_encrypt, which gives incredible performance improvement,
57 # it was measured to be ~6.6x. It's less than previously mentioned 8x,
58 # because software implementation was optimized.
62 # Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
63 # performance improvement over "generic" counter mode routine relying
64 # on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
65 # to the fact that exact throughput value depends on current stack
66 # frame alignment within 4KB page. In worst case you get ~75% of the
67 # maximum, but *on average* it would be as much as ~98%. Meaning that
68 # worst case is unlikely, it's like hitting a ravine on a plateau.
72 # Adapt for -m31 build. If kernel supports what's called "highgprs"
73 # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
74 # instructions and achieve "64-bit" performance even in 31-bit legacy
75 # application context. The feature is not specific to any particular
76 # processor, as long as it's "z-CPU". Latter implies that the code
77 # remains z/Architecture specific. On z990 it was measured to perform
78 # 2x better than code generated by gcc 4.3.
82 # Add support for z196 "cipher message with counter" instruction.
83 # Note however that it's disengaged, because it was measured to
84 # perform ~12% worse than vanilla km-based code...
88 # Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
89 # instructions, which deliver ~70% improvement at 8KB block size over
90 # vanilla km-based code, 37% - at most like 512-bytes block size.
94 if ($flavour =~ /3[12]/) {
102 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
103 open STDOUT,">",$output or die "can't open $output: $!";
105 $softonly=0; # allow hardware support
107 $t0="%r0"; $mask="%r0";
109 $t2="%r2"; $inp="%r2";
110 $t3="%r3"; $out="%r3"; $bits="%r3";
124 $stdframe=16*$SIZE_T+4*8;
128 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
132 #include "s390x_arch.h"
136 .type AES_Te,\@object
141 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
142 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
143 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
144 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
145 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
146 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
147 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
148 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
149 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
150 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
151 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
152 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
153 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
154 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
155 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
156 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
157 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
158 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
159 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
160 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
161 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
162 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
163 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
164 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
165 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
166 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
167 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
168 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
169 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
170 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
171 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
172 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
173 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
174 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
175 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
176 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
177 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
178 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
179 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
180 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
181 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
182 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
183 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
184 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
185 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
186 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
187 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
188 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
189 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
190 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
191 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
192 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
193 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
194 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
195 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
196 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
197 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
198 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
199 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
200 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
201 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
202 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
203 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
204 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
207 .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
208 .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
209 .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
210 .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
211 .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
212 .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
213 .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
214 .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
215 .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
216 .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
217 .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
218 .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
219 .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
220 .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
221 .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
222 .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
223 .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
224 .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
225 .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
226 .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
227 .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
228 .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
229 .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
230 .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
231 .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
232 .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
233 .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
234 .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
235 .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
236 .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
237 .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
238 .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
240 .long 0x01000000, 0x02000000, 0x04000000, 0x08000000
241 .long 0x10000000, 0x20000000, 0x40000000, 0x80000000
242 .long 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
244 .size AES_Te,.-AES_Te
246 # void AES_encrypt(const unsigned char *inp, unsigned char *out,
247 # const AES_KEY *key) {
249 .type AES_encrypt,\@function
252 $code.=<<___ if (!$softonly);
261 lghi %r3,16 # single block length
262 .long 0xb92e0042 # km %r4,%r2
263 brc 1,.-4 # can this happen?
269 stm${g} %r3,$ra,3*$SIZE_T($sp)
277 bras $ra,_s390x_AES_encrypt
279 l${g} $out,3*$SIZE_T($sp)
285 lm${g} %r6,$ra,6*$SIZE_T($sp)
287 .size AES_encrypt,.-AES_encrypt
289 .type _s390x_AES_encrypt,\@function
292 st${g} $ra,15*$SIZE_T($sp)
298 llill $mask,`0xff<<3`
312 srlg $i1,$s1,`16-3` # i0
321 l $s0,0($s0,$tbl) # Te0[s0>>24]
322 l $t1,1($t1,$tbl) # Te3[s0>>0]
323 l $t2,2($t2,$tbl) # Te2[s0>>8]
324 l $t3,3($t3,$tbl) # Te1[s0>>16]
326 x $s0,3($i1,$tbl) # Te1[s1>>16]
327 l $s1,0($s1,$tbl) # Te0[s1>>24]
328 x $t2,1($i2,$tbl) # Te3[s1>>0]
329 x $t3,2($i3,$tbl) # Te2[s1>>8]
331 srlg $i1,$s2,`8-3` # i0
332 srlg $i2,$s2,`16-3` # i1
341 srlg $ra,$s3,`8-3` # i1
342 sllg $t1,$s3,`0+3` # i0
347 x $s0,2($i1,$tbl) # Te2[s2>>8]
348 x $s1,3($i2,$tbl) # Te1[s2>>16]
349 l $s2,0($s2,$tbl) # Te0[s2>>24]
350 x $t3,1($i3,$tbl) # Te3[s2>>0]
352 srlg $i3,$s3,`16-3` # i2
363 x $s0,1($t1,$tbl) # Te3[s3>>0]
364 x $s1,2($ra,$tbl) # Te2[s3>>8]
365 x $s2,3($i3,$tbl) # Te1[s3>>16]
366 l $s3,0($s3,$tbl) # Te0[s3>>24]
369 brct $rounds,.Lenc_loop
381 srlg $i1,$s1,`16-3` # i0
390 llgc $s0,2($s0,$tbl) # Te4[s0>>24]
391 llgc $t1,2($t1,$tbl) # Te4[s0>>0]
393 llgc $t2,2($t2,$tbl) # Te4[s0>>8]
394 llgc $t3,2($t3,$tbl) # Te4[s0>>16]
398 llgc $i1,2($i1,$tbl) # Te4[s1>>16]
399 llgc $s1,2($s1,$tbl) # Te4[s1>>24]
400 llgc $i2,2($i2,$tbl) # Te4[s1>>0]
401 llgc $i3,2($i3,$tbl) # Te4[s1>>8]
410 srlg $i1,$s2,`8-3` # i0
411 srlg $i2,$s2,`16-3` # i1
419 sllg $t1,$s3,`0+3` # i0
420 srlg $ra,$s3,`8-3` # i1
423 llgc $i1,2($i1,$tbl) # Te4[s2>>8]
424 llgc $i2,2($i2,$tbl) # Te4[s2>>16]
426 llgc $s2,2($s2,$tbl) # Te4[s2>>24]
427 llgc $i3,2($i3,$tbl) # Te4[s2>>0]
436 srlg $i3,$s3,`16-3` # i2
444 llgc $i1,2($t1,$tbl) # Te4[s3>>0]
445 llgc $i2,2($ra,$tbl) # Te4[s3>>8]
446 llgc $i3,2($i3,$tbl) # Te4[s3>>16]
447 llgc $s3,2($s3,$tbl) # Te4[s3>>24]
456 l${g} $ra,15*$SIZE_T($sp)
463 .size _s390x_AES_encrypt,.-_s390x_AES_encrypt
467 .type AES_Td,\@object
472 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
473 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
474 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
475 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
476 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
477 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
478 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
479 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
480 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
481 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
482 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
483 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
484 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
485 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
486 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
487 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
488 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
489 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
490 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
491 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
492 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
493 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
494 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
495 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
496 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
497 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
498 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
499 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
500 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
501 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
502 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
503 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
504 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
505 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
506 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
507 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
508 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
509 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
510 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
511 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
512 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
513 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
514 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
515 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
516 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
517 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
518 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
519 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
520 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
521 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
522 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
523 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
524 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
525 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
526 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
527 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
528 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
529 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
530 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
531 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
532 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
533 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
534 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
535 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
538 .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
539 .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
540 .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
541 .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
542 .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
543 .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
544 .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
545 .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
546 .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
547 .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
548 .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
549 .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
550 .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
551 .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
552 .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
553 .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
554 .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
555 .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
556 .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
557 .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
558 .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
559 .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
560 .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
561 .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
562 .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
563 .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
564 .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
565 .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
566 .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
567 .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
568 .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
569 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
570 .size AES_Td,.-AES_Td
572 # void AES_decrypt(const unsigned char *inp, unsigned char *out,
573 # const AES_KEY *key) {
575 .type AES_decrypt,\@function
578 $code.=<<___ if (!$softonly);
587 lghi %r3,16 # single block length
588 .long 0xb92e0042 # km %r4,%r2
589 brc 1,.-4 # can this happen?
595 stm${g} %r3,$ra,3*$SIZE_T($sp)
603 bras $ra,_s390x_AES_decrypt
605 l${g} $out,3*$SIZE_T($sp)
611 lm${g} %r6,$ra,6*$SIZE_T($sp)
613 .size AES_decrypt,.-AES_decrypt
615 .type _s390x_AES_decrypt,\@function
618 st${g} $ra,15*$SIZE_T($sp)
624 llill $mask,`0xff<<3`
638 sllg $i1,$s1,`0+3` # i0
647 l $s0,0($s0,$tbl) # Td0[s0>>24]
648 l $t1,3($t1,$tbl) # Td1[s0>>16]
649 l $t2,2($t2,$tbl) # Td2[s0>>8]
650 l $t3,1($t3,$tbl) # Td3[s0>>0]
652 x $s0,1($i1,$tbl) # Td3[s1>>0]
653 l $s1,0($s1,$tbl) # Td0[s1>>24]
654 x $t2,3($i2,$tbl) # Td1[s1>>16]
655 x $t3,2($i3,$tbl) # Td2[s1>>8]
657 srlg $i1,$s2,`8-3` # i0
658 sllg $i2,$s2,`0+3` # i1
667 srlg $ra,$s3,`8-3` # i1
668 srlg $t1,$s3,`16-3` # i0
673 x $s0,2($i1,$tbl) # Td2[s2>>8]
674 x $s1,1($i2,$tbl) # Td3[s2>>0]
675 l $s2,0($s2,$tbl) # Td0[s2>>24]
676 x $t3,3($i3,$tbl) # Td1[s2>>16]
678 sllg $i3,$s3,`0+3` # i2
689 x $s0,3($t1,$tbl) # Td1[s3>>16]
690 x $s1,2($ra,$tbl) # Td2[s3>>8]
691 x $s2,1($i3,$tbl) # Td3[s3>>0]
692 l $s3,0($s3,$tbl) # Td0[s3>>24]
695 brct $rounds,.Ldec_loop
698 l $t1,`2048+0`($tbl) # prefetch Td4
699 l $t2,`2048+64`($tbl)
700 l $t3,`2048+128`($tbl)
701 l $i1,`2048+192`($tbl)
718 llgc $i3,2048($i3,$tbl) # Td4[s0>>24]
719 llgc $t1,2048($t1,$tbl) # Td4[s0>>16]
720 llgc $t2,2048($t2,$tbl) # Td4[s0>>8]
722 llgc $t3,2048($s0,$tbl) # Td4[s0>>0]
726 llgc $s1,2048($s1,$tbl) # Td4[s1>>0]
727 llgc $i1,2048($i1,$tbl) # Td4[s1>>24]
728 llgc $i2,2048($i2,$tbl) # Td4[s1>>16]
730 llgc $i3,2048($ra,$tbl) # Td4[s1>>8]
744 llgc $i1,2048($i1,$tbl) # Td4[s2>>8]
745 llgc $s1,2048($s2,$tbl) # Td4[s2>>0]
746 llgc $i2,2048($i2,$tbl) # Td4[s2>>24]
747 llgc $i3,2048($i3,$tbl) # Td4[s2>>16]
762 l${g} $ra,15*$SIZE_T($sp)
767 llgc $i1,2048($i1,$tbl) # Td4[s3>>16]
768 llgc $i2,2048($i2,$tbl) # Td4[s3>>8]
770 llgc $s2,2048($s3,$tbl) # Td4[s3>>0]
771 llgc $s3,2048($i3,$tbl) # Td4[s3>>24]
785 .size _s390x_AES_decrypt,.-_s390x_AES_decrypt
789 # void AES_set_encrypt_key(const unsigned char *in, int bits,
791 .globl AES_set_encrypt_key
792 .type AES_set_encrypt_key,\@function
795 _s390x_AES_set_encrypt_key:
817 $code.=<<___ if (!$softonly);
818 # convert bits to km(c) code, [128,192,256]->[18,19,20]
825 larl %r1,OPENSSL_s390xcap_P
828 ng %r0,S390X_KM(%r1) # check availability of both km...
829 ng %r0,S390X_KMC(%r1) # ...and kmc support for given key length
832 lmg %r0,%r1,0($inp) # just copy 128 bits...
842 1: st $bits,236($key) # save bits [for debugging purposes]
844 st %r5,240($key) # save km(c) code
851 stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key
853 larl $tbl,AES_Te+2048
872 llgfr $t2,$s3 # temp=rk[3]
886 icm $t2,2,0($t2) # Te4[rk[3]>>0]<<8
887 icm $t2,4,0($i1) # Te4[rk[3]>>8]<<16
888 icm $t2,8,0($i2) # Te4[rk[3]>>16]<<24
889 icm $t2,1,0($i3) # Te4[rk[3]>>24]
890 x $t2,256($t3,$tbl) # rcon[i]
891 xr $s0,$t2 # rk[4]=rk[0]^...
892 xr $s1,$s0 # rk[5]=rk[1]^rk[4]
893 xr $s2,$s1 # rk[6]=rk[2]^rk[5]
894 xr $s3,$s2 # rk[7]=rk[3]^rk[6]
896 llgfr $t2,$s3 # temp=rk[3]
908 la $key,16($key) # key+=4
910 brct $rounds,.L128_loop
913 lm${g} %r4,%r13,4*$SIZE_T($sp)
945 icm $t1,2,0($t1) # Te4[rk[5]>>0]<<8
946 icm $t1,4,0($i1) # Te4[rk[5]>>8]<<16
947 icm $t1,8,0($i2) # Te4[rk[5]>>16]<<24
948 icm $t1,1,0($i3) # Te4[rk[5]>>24]
949 x $t1,256($t3,$tbl) # rcon[i]
950 xr $s0,$t1 # rk[6]=rk[0]^...
951 xr $s1,$s0 # rk[7]=rk[1]^rk[6]
952 xr $s2,$s1 # rk[8]=rk[2]^rk[7]
953 xr $s3,$s2 # rk[9]=rk[3]^rk[8]
959 brct $rounds,.L192_continue
962 lm${g} %r4,%r13,4*$SIZE_T($sp)
968 x $t1,16($key) # rk[10]=rk[4]^rk[9]
970 x $t1,20($key) # rk[11]=rk[5]^rk[10]
980 la $key,24($key) # key+=6
1009 icm $t1,2,0($t1) # Te4[rk[7]>>0]<<8
1010 icm $t1,4,0($i1) # Te4[rk[7]>>8]<<16
1011 icm $t1,8,0($i2) # Te4[rk[7]>>16]<<24
1012 icm $t1,1,0($i3) # Te4[rk[7]>>24]
1013 x $t1,256($t3,$tbl) # rcon[i]
1014 xr $s0,$t1 # rk[8]=rk[0]^...
1015 xr $s1,$s0 # rk[9]=rk[1]^rk[8]
1016 xr $s2,$s1 # rk[10]=rk[2]^rk[9]
1017 xr $s3,$s2 # rk[11]=rk[3]^rk[10]
1022 brct $rounds,.L256_continue
1025 lm${g} %r4,%r13,4*$SIZE_T($sp)
1030 lgr $t1,$s3 # temp=rk[11]
1041 llgc $t1,0($t1) # Te4[rk[11]>>0]
1042 icm $t1,2,0($i1) # Te4[rk[11]>>8]<<8
1043 icm $t1,4,0($i2) # Te4[rk[11]>>16]<<16
1044 icm $t1,8,0($i3) # Te4[rk[11]>>24]<<24
1045 x $t1,16($key) # rk[12]=rk[4]^...
1047 x $t1,20($key) # rk[13]=rk[5]^rk[12]
1049 x $t1,24($key) # rk[14]=rk[6]^rk[13]
1051 x $t1,28($key) # rk[15]=rk[7]^rk[14]
1061 la $key,32($key) # key+=8
1068 .size AES_set_encrypt_key,.-AES_set_encrypt_key
1070 # void AES_set_decrypt_key(const unsigned char *in, int bits,
1072 .globl AES_set_decrypt_key
1073 .type AES_set_decrypt_key,\@function
1075 AES_set_decrypt_key:
1076 #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
1077 st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key!
1078 bras $ra,_s390x_AES_set_encrypt_key
1079 #l${g} $key,4*$SIZE_T($sp)
1080 l${g} $ra,14*$SIZE_T($sp)
1084 $code.=<<___ if (!$softonly);
1089 oill $t0,0x80 # set "decrypt" bit
1095 .Lgo: lgr $rounds,$t0 #llgf $rounds,240($key)
1103 .Linv: lmg $s0,$s1,0($i1)
1115 llgf $rounds,240($key)
1117 sll $rounds,2 # (rounds-1)*4
1118 llilh $mask80,0x8080
1119 llilh $mask1b,0x1b1b
1120 llilh $maskfe,0xfefe
1126 .Lmix: l $s0,16($key) # tp1
1154 xr $s1,$s0 # tp2^tp1
1155 xr $s2,$s0 # tp4^tp1
1156 rll $s0,$s0,24 # = ROTATE(tp1,8)
1158 xr $s0,$s1 # ^=tp2^tp1
1159 xr $s1,$s3 # tp2^tp1^tp8
1160 xr $s0,$s2 # ^=tp4^tp1^tp8
1163 xr $s0,$s1 # ^= ROTATE(tp8^tp2^tp1,24)
1165 xr $s0,$s2 # ^= ROTATE(tp8^tp4^tp1,16)
1166 xr $s0,$s3 # ^= ROTATE(tp8,8)
1172 lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
1175 .size AES_set_decrypt_key,.-AES_set_decrypt_key
1178 ########################################################################
1179 # void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
1180 # size_t length, const AES_KEY *key,
1181 # unsigned char *ivec, const int enc)
1184 my $out="%r4"; # length and out are swapped
1190 .globl AES_cbc_encrypt
1191 .type AES_cbc_encrypt,\@function
1194 xgr %r3,%r4 # flip %r3 and %r4, out and len
1198 $code.=<<___ if (!$softonly);
1203 lg %r0,0($ivp) # copy ivec
1205 stmg %r0,%r1,16($sp)
1206 lmg %r0,%r1,0($key) # copy key, cover 256 bit
1207 stmg %r0,%r1,32($sp)
1208 lmg %r0,%r1,16($key)
1209 stmg %r0,%r1,48($sp)
1210 l %r0,240($key) # load kmc code
1211 lghi $key,15 # res=len%16, len-=res;
1214 la %r1,16($sp) # parameter block - ivec || key
1216 .long 0xb92f0042 # kmc %r4,%r2
1217 brc 1,.-4 # pay attention to "partial completion"
1221 lmg %r0,%r1,16($sp) # copy ivec to caller
1227 ahi $key,-1 # it's the way it's encoded in mvc
1229 jnz .Lkmc_truncated_dec
1231 stg %r1,16*$SIZE_T($sp)
1232 stg %r1,16*$SIZE_T+8($sp)
1234 mvc 16*$SIZE_T(1,$sp),0($inp)
1236 la %r1,16($sp) # restore parameter block
1237 la $inp,16*$SIZE_T($sp)
1239 .long 0xb92f0042 # kmc %r4,%r2
1242 .Lkmc_truncated_dec:
1243 st${g} $out,4*$SIZE_T($sp)
1244 la $out,16*$SIZE_T($sp)
1246 .long 0xb92f0042 # kmc %r4,%r2
1247 l${g} $out,4*$SIZE_T($sp)
1249 mvc 0(1,$out),16*$SIZE_T($sp)
1256 stm${g} $key,$ra,5*$SIZE_T($sp)
1258 cl %r0,`$stdframe+$SIZE_T-4`($sp)
1270 brc 4,.Lcbc_enc_tail # if borrow
1272 stm${g} $inp,$out,2*$SIZE_T($sp)
1279 bras $ra,_s390x_AES_encrypt
1281 lm${g} $inp,$key,2*$SIZE_T($sp)
1293 brc 4,.Lcbc_enc_tail # if borrow
1297 l${g} $ivp,6*$SIZE_T($sp)
1303 lm${g} %r7,$ra,7*$SIZE_T($sp)
1310 stg $t0,16*$SIZE_T($sp)
1311 stg $t0,16*$SIZE_T+8($sp)
1313 mvc 16*$SIZE_T(1,$sp),0($inp)
1316 la $inp,16*$SIZE_T($sp)
1325 stmg $t0,$t1,16*$SIZE_T($sp)
1328 stm${g} $inp,$out,2*$SIZE_T($sp)
1335 bras $ra,_s390x_AES_decrypt
1337 lm${g} $inp,$key,2*$SIZE_T($sp)
1345 xg $s0,16*$SIZE_T($sp)
1346 xg $s2,16*$SIZE_T+8($sp)
1349 brc 4,.Lcbc_dec_tail # if borrow
1350 brc 2,.Lcbc_dec_done # if zero
1353 stmg $t0,$t1,16*$SIZE_T($sp)
1363 lm${g} %r6,$ra,6*$SIZE_T($sp)
1364 stmg $t0,$t1,0($ivp)
1371 stg $s0,16*$SIZE_T($sp)
1372 stg $s2,16*$SIZE_T+8($sp)
1374 mvc 0(1,$out),16*$SIZE_T($sp)
1377 .size AES_cbc_encrypt,.-AES_cbc_encrypt
1380 ########################################################################
1381 # void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
1382 # size_t blocks, const AES_KEY *key,
1383 # const unsigned char *ivec)
1386 my $out="%r4"; # blocks and out are swapped
1388 my $key="%r5"; my $iv0="%r5";
1393 .globl AES_ctr32_encrypt
1394 .type AES_ctr32_encrypt,\@function
1397 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1400 llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case
1402 $code.=<<___ if (!$softonly);
1408 stm${g} %r6,$s3,6*$SIZE_T($sp)
1411 la %r1,0($key) # %r1 is permanent copy of $key
1412 lg $iv0,0($ivp) # load ivec
1415 # prepare and allocate stack frame at the top of 4K page
1416 # with 1K reserved for eventual signal handling
1417 lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
1421 ngr $s0,$s1 # align at page boundary
1422 slgr $fp,$s0 # total buffer size
1424 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1425 slgr $fp,$s1 # deduct reservation to get usable buffer size
1426 # buffer size is at lest 256 and at most 3072+256-16
1428 la $sp,1024($s0) # alloca
1429 srlg $fp,$fp,4 # convert bytes to blocks, minimum 16
1430 st${g} $s2,0($sp) # back-chain
1431 st${g} $fp,$SIZE_T($sp)
1434 brc 1,.Lctr32_hw_switch # not zero, no borrow
1435 algr $fp,$len # input is shorter than allocated buffer
1437 st${g} $fp,$SIZE_T($sp)
1441 $code.=<<___ if (!$softonly && 0);# kmctr code was measured to be ~12% slower
1444 larl %r1,OPENSSL_s390xcap_P
1445 llihh %r0,0x8000 # check if kmctr supports the function code
1447 ng %r0,S390X_KMCTR(%r1) # check kmctr capability vector
1453 algr $out,$inp # restore $out
1454 lgr $s1,$len # $s1 undertakes $len
1455 j .Lctr32_kmctr_loop
1460 .Lctr32_kmctr_prepare:
1464 ahi $ivp,1 # 32-bit increment, preserves upper half
1465 brct $s3,.Lctr32_kmctr_prepare
1467 #la $inp,0($inp) # inp
1468 sllg $len,$fp,4 # len
1469 #la $out,0($out) # out
1471 .long 0xb92da042 # kmctr $out,$s2,$inp
1472 brc 1,.-4 # pay attention to "partial completion"
1475 brc 1,.Lctr32_kmctr_loop # not zero, no borrow
1478 brc 4+1,.Lctr32_kmctr_loop # not zero
1481 lm${g} %r6,$s3,6*$SIZE_T($sp)
1485 $code.=<<___ if (!$softonly);
1493 ahi $ivp,1 # 32-bit increment, preserves upper half
1494 brct $s3,.Lctr32_km_prepare
1496 la $s0,16($sp) # inp
1497 sllg $s1,$fp,4 # len
1498 la $s2,16($sp) # out
1499 .long 0xb92e00a8 # km %r10,%r8
1500 brc 1,.-4 # pay attention to "partial completion"
1510 stg $s0,0($out,$inp)
1511 stg $s1,8($out,$inp)
1513 brct $s3,.Lctr32_km_xor
1516 brc 1,.Lctr32_km_loop # not zero, no borrow
1519 brc 4+1,.Lctr32_km_loop # not zero
1522 l${g} $s1,$SIZE_T($sp)
1528 brct $s1,.Lctr32_km_zap
1531 lm${g} %r6,$s3,6*$SIZE_T($sp)
1537 stm${g} $key,$ra,5*$SIZE_T($sp)
1543 stm${g} $inp,$out,2*$SIZE_T($sp)
1548 st $t1,16*$SIZE_T($sp)
1551 bras $ra,_s390x_AES_encrypt
1553 lm${g} $inp,$ivp,2*$SIZE_T($sp)
1554 llgf $t1,16*$SIZE_T($sp)
1562 ahi $t1,1 # 32-bit increment
1563 brct $len,.Lctr32_loop
1565 lm${g} %r6,$ra,6*$SIZE_T($sp)
1567 .size AES_ctr32_encrypt,.-AES_ctr32_encrypt
1571 ########################################################################
1572 # void AES_xts_encrypt(const unsigned char *inp, unsigned char *out,
1573 # size_t len, const AES_KEY *key1, const AES_KEY *key2,
1574 # const unsigned char iv[16]);
1578 my $out="%r4"; # len and out are swapped
1580 my $key1="%r5"; # $i1
1581 my $key2="%r6"; # $i2
1583 my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame...
1586 .type _s390x_xts_km,\@function
1591 llgfr $s0,%r0 # put aside the function code
1594 larl %r1,OPENSSL_s390xcap_P
1596 srlg %r0,%r0,32($s1) # check for 32+function code
1597 ng %r0,S390X_KM(%r1) # check km capability vector
1598 lgr %r0,$s0 # restore the function code
1599 la %r1,0($key1) # restore $key1
1602 lmg $i2,$i3,$tweak($sp) # put aside the tweak value
1605 oill %r0,32 # switch to xts function code
1607 sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16
1608 la %r1,$tweak-16($sp)
1609 slgr %r1,$s1 # parameter block position
1610 lmg $s0,$s3,0($key1) # load 256 bits of key material,
1611 stmg $s0,$s3,0(%r1) # and copy it to parameter block.
1612 # yes, it contains junk and overlaps
1613 # with the tweak in 128-bit case.
1614 # it's done to avoid conditional
1616 stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value
1618 .long 0xb92e0042 # km %r4,%r2
1619 brc 1,.-4 # pay attention to "partial completion"
1621 lrvg $s0,$tweak+0($sp) # load the last tweak
1622 lrvg $s1,$tweak+8($sp)
1623 stmg %r0,%r3,$tweak-32($sp) # wipe copy of the key
1625 nill %r0,0xffdf # switch back to original function code
1626 la %r1,0($key1) # restore pointer to $key1
1629 llgc $len,2*$SIZE_T-1($sp)
1630 nill $len,0x0f # $len%=16
1637 # prepare and allocate stack frame at the top of 4K page
1638 # with 1K reserved for eventual signal handling
1639 lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
1643 ngr $s0,$s1 # align at page boundary
1644 slgr $fp,$s0 # total buffer size
1646 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1647 slgr $fp,$s1 # deduct reservation to get usable buffer size
1648 # buffer size is at lest 256 and at most 3072+256-16
1650 la $sp,1024($s0) # alloca
1651 nill $fp,0xfff0 # round to 16*n
1652 st${g} $s2,0($sp) # back-chain
1653 nill $len,0xfff0 # redundant
1654 st${g} $fp,$SIZE_T($sp)
1657 brc 1,.Lxts_km_go # not zero, no borrow
1658 algr $fp,$len # input is shorter than allocated buffer
1660 st${g} $fp,$SIZE_T($sp)
1663 lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian
1664 lrvg $s1,$tweak+8($s2)
1666 la $s2,16($sp) # vector of ascending tweak values
1677 srag $i2,$s1,63 # broadcast upper bit
1683 lrvgr $i1,$s0 # flip byte order
1689 stg $i1,0($out,$inp)
1690 stg $i2,8($out,$inp)
1692 brct $s3,.Lxts_km_prepare
1694 slgr $inp,$fp # rewind $inp
1697 .long 0xb92e00aa # km $s2,$s2
1698 brc 1,.-4 # pay attention to "partial completion"
1708 stg $i1,0($out,$inp)
1709 stg $i2,8($out,$inp)
1711 brct $s3,.Lxts_km_xor
1714 brc 1,.Lxts_km_loop # not zero, no borrow
1717 brc 4+1,.Lxts_km_loop # not zero
1719 l${g} $i1,0($sp) # back-chain
1720 llgf $fp,`2*$SIZE_T-4`($sp) # bytes used
1727 brct $fp,.Lxts_km_zap
1730 llgc $len,2*$SIZE_T-1($i1)
1731 nill $len,0x0f # $len%=16
1734 # generate one more tweak...
1736 srag $i2,$s1,63 # broadcast upper bit
1742 ltr $len,$len # clear zero flag
1744 .size _s390x_xts_km,.-_s390x_xts_km
# AES_xts_encrypt(inp, out, len, key1, key2, iv[16])
# Hardware (KM/CPACF) path first, with a ciphertext-stealing tail for
# len%16 != 0; falls back to the software _s390x_AES_encrypt loop when
# hardware support is unavailable.  Interior lines are missing from this
# view — hedged comments only.
1746 .globl AES_xts_encrypt
1747 .type AES_xts_encrypt,\@function
1750 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1754 $code.=<<___ if ($SIZE_T==4);
1758 st${g} $len,1*$SIZE_T($sp) # save copy of $len
1759 srag $len,$len,4 # formally wrong, because it expands
1760 # sign byte, but who can afford asking
1761 # to process more than 2^63-1 bytes?
1762 # I use it, because it sets condition
1764 bcr 8,$ra # abort if zero (i.e. less than 16)
1766 $code.=<<___ if (!$softonly);
1770 jl .Lxts_enc_software
1772 st${g} $ra,5*$SIZE_T($sp)
1773 stm${g} %r6,$s3,6*$SIZE_T($sp)
1775 sllg $len,$len,4 # $len&=~15
1778 # generate the tweak value
1779 l${g} $s3,$stdframe($sp) # pointer to iv
# tweak = E_key2(iv), computed with the km instruction
1784 la %r1,0($key2) # $key2 is not needed anymore
1785 .long 0xb92e00aa # km $s2,$s2, generate the tweak
1786 brc 1,.-4 # can this happen?
1789 la %r1,0($key1) # $key1 is not needed anymore
1790 bras $ra,_s390x_xts_km
1791 jz .Lxts_enc_km_done
# ciphertext stealing: swap the tail bytes with the last full block
1793 aghi $inp,-16 # take one step back
1794 la $i3,0($out,$inp) # put aside real $out
1797 llgc $i2,0($out,$inp)
1798 stc $i1,0($out,$inp)
1799 stc $i2,16($out,$inp)
1801 brct $len,.Lxts_enc_km_steal
1805 lrvgr $i1,$s0 # flip byte order
1811 .long 0xb92e00aa # km $s2,$s2
1812 brc 1,.-4 # can this happen?
1813 lrvgr $i1,$s0 # flip byte order
1821 stg $sp,$tweak+0($sp) # wipe tweak
1822 stg $sp,$tweak+8($sp)
1823 l${g} $ra,5*$SIZE_T($sp)
1824 lm${g} %r6,$s3,6*$SIZE_T($sp)
# ---- software fallback path ----
1830 stm${g} %r6,$ra,6*$SIZE_T($sp)
1834 l${g} $s3,$stdframe($sp) # ivp
1835 llgf $s0,0($s3) # load iv
1839 stm${g} %r2,%r5,2*$SIZE_T($sp)
1842 bras $ra,_s390x_AES_encrypt # generate the tweak
1843 lm${g} %r2,%r5,2*$SIZE_T($sp)
1844 stm $s0,$s3,$tweak($sp) # save the tweak
# per-block tweak update: multiply by x in GF(2^128), little-endian
1849 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1850 lrvg $s3,$tweak+8($sp)
1852 srag %r0,$s3,63 # broadcast upper bit
1857 lrvgr $s1,$s1 # flip byte order
1859 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1860 stg $s1,$tweak+0($sp) # save the tweak
1863 stg $s3,$tweak+8($sp)
1865 la $inp,16($inp) # $inp+=16
1867 x $s0,0($inp) # ^=*($inp)
1871 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
1873 bras $ra,_s390x_AES_encrypt
1874 lm${g} %r2,%r5,2*$SIZE_T($sp)
1875 x $s0,$tweak+0($sp) # ^=tweak
1878 x $s3,$tweak+12($sp)
1882 st $s3,12($out,$inp)
1883 brct${g} $len,.Lxts_enc_loop
1885 llgc $len,`2*$SIZE_T-1`($sp)
1886 nill $len,0x0f # $len%16
# ciphertext stealing for the software path
1889 la $i3,0($inp,$out) # put aside real $out
1892 llgc %r1,0($out,$inp)
1893 stc %r0,0($out,$inp)
1894 stc %r1,16($out,$inp)
1896 brct $len,.Lxts_enc_steal
1897 la $out,0($i3) # restore real $out
1899 # generate last tweak...
1900 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1901 lrvg $s3,$tweak+8($sp)
1903 srag %r0,$s3,63 # broadcast upper bit
1908 lrvgr $s1,$s1 # flip byte order
1910 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1911 stg $s1,$tweak+0($sp) # save the tweak
1914 stg $s3,$tweak+8($sp)
1917 x $s0,0($out) # ^=*(inp)|stolen cipher-text
1921 st${g} $out,4*$SIZE_T($sp)
1923 bras $ra,_s390x_AES_encrypt
1924 l${g} $out,4*$SIZE_T($sp)
1925 x $s0,`$tweak+0`($sp) # ^=tweak
1926 x $s1,`$tweak+4`($sp)
1927 x $s2,`$tweak+8`($sp)
1928 x $s3,`$tweak+12`($sp)
# Epilogue: scrub the key-derived tweak from the stack, restore the
# callee-saved registers and return.
# FIX: the second store used "$twesk" — an undefined Perl variable —
# which interpolated to an empty string, emitting "stg $sp,+8($sp)" and
# leaving the upper 8 bytes of the tweak un-wiped on the stack.
1935 stg $sp,$tweak+0($sp) # wipe tweak
1936 stg $sp,$tweak+8($sp)
1937 lm${g} %r6,$ra,6*$SIZE_T($sp)
1939 .size AES_xts_encrypt,.-AES_xts_encrypt
1941 # void AES_xts_decrypt(const unsigned char *inp, unsigned char *out,
1942 # size_t len, const AES_KEY *key1, const AES_KEY *key2,
1943 # const unsigned char iv[16]);
# Decryption mirrors AES_xts_encrypt but must juggle TWO tweaks when
# ciphertext stealing applies: the last-but-one block is decrypted with
# the 2nd tweak and the final short block with the 1st.  Interior lines
# are missing from this view — hedged comments only.
1946 .globl AES_xts_decrypt
1947 .type AES_xts_decrypt,\@function
1950 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1954 $code.=<<___ if ($SIZE_T==4);
1958 st${g} $len,1*$SIZE_T($sp) # save copy of $len
1960 bcr 4,$ra # abort if less than zero. formally
1961 # wrong, because $len is unsigned,
1962 # but who can afford asking to
1963 # process more than 2^63-1 bytes?
1965 jnz .Lxts_dec_proceed
1969 $code.=<<___ if (!$softonly);
1973 jl .Lxts_dec_software
1975 st${g} $ra,5*$SIZE_T($sp)
1976 stm${g} %r6,$s3,6*$SIZE_T($sp)
1978 nill $len,0xfff0 # $len&=~15
1981 # generate the tweak value
1982 l${g} $s3,$stdframe($sp) # pointer to iv
# tweak = E_key2(iv), via the km instruction
1987 la %r1,0($key2) # $key2 is not needed past this point
1988 .long 0xb92e00aa # km $s2,$s2, generate the tweak
1989 brc 1,.-4 # can this happen?
1992 la %r1,0($key1) # $key1 is not needed anymore
1995 jz .Lxts_dec_km_short
1996 bras $ra,_s390x_xts_km
1997 jz .Lxts_dec_km_done
1999 lrvgr $s2,$s0 # make copy in reverse byte order
2001 j .Lxts_dec_km_2ndtweak
2004 llgc $len,`2*$SIZE_T-1`($sp)
2005 nill $len,0x0f # $len%=16
2006 lrvg $s0,$tweak+0($sp) # load the tweak
2007 lrvg $s1,$tweak+8($sp)
2008 lrvgr $s2,$s0 # make copy in reverse byte order
2011 .Lxts_dec_km_2ndtweak:
# derive 2nd tweak by one more GF(2^128) doubling
2013 srag $i2,$s1,63 # broadcast upper bit
2018 lrvgr $i1,$s0 # flip byte order
2023 stg $i1,0($out,$inp)
2024 stg $i2,8($out,$inp)
2027 .long 0xb92e0066 # km $i2,$i2
2028 brc 1,.-4 # can this happen?
2033 stg $i1,0($out,$inp)
2034 stg $i2,8($out,$inp)
# ciphertext stealing (hardware path)
2036 la $i3,0($out,$inp) # put aside real $out
2039 llgc $i2,0($out,$inp)
2040 stc $i1,0($out,$inp)
2041 stc $i2,16($out,$inp)
2043 brct $len,.Lxts_dec_km_steal
2053 .long 0xb92e0088 # km $s0,$s0
2054 brc 1,.-4 # can this happen?
2060 stg $sp,$tweak+0($sp) # wipe tweak
2061 stg $sp,$tweak+8($sp)
2062 l${g} $ra,5*$SIZE_T($sp)
2063 lm${g} %r6,$s3,6*$SIZE_T($sp)
# ---- software fallback path ----
2069 stm${g} %r6,$ra,6*$SIZE_T($sp)
2074 l${g} $s3,$stdframe($sp) # ivp
2075 llgf $s0,0($s3) # load iv
2079 stm${g} %r2,%r5,2*$SIZE_T($sp)
# NOTE: tweak generation uses _s390x_AES_encrypt (with key2) even on
# the decrypt path — that is correct per the XTS definition
2082 bras $ra,_s390x_AES_encrypt # generate the tweak
2083 lm${g} %r2,%r5,2*$SIZE_T($sp)
2086 stm $s0,$s3,$tweak($sp) # save the tweak
2092 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2093 lrvg $s3,$tweak+8($sp)
2095 srag %r0,$s3,63 # broadcast upper bit
2100 lrvgr $s1,$s1 # flip byte order
2102 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
2103 stg $s1,$tweak+0($sp) # save the tweak
2106 stg $s3,$tweak+8($sp)
2109 x $s0,0($inp) # tweak^=*(inp)
2113 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
2115 bras $ra,_s390x_AES_decrypt
2116 lm${g} %r2,%r5,2*$SIZE_T($sp)
2117 x $s0,$tweak+0($sp) # ^=tweak
2120 x $s3,$tweak+12($sp)
2124 st $s3,12($out,$inp)
2126 brct${g} $len,.Lxts_dec_loop
2128 llgc $len,`2*$SIZE_T-1`($sp)
2129 nill $len,0x0f # $len%16
2132 # generate pair of tweaks...
2133 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2134 lrvg $s3,$tweak+8($sp)
2136 srag %r0,$s3,63 # broadcast upper bit
2141 lrvgr $i2,$s1 # flip byte order
2143 stmg $i2,$i3,$tweak($sp) # save the 1st tweak
2144 j .Lxts_dec_2ndtweak
2148 llgc $len,`2*$SIZE_T-1`($sp)
2149 nill $len,0x0f # $len%16
2150 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2151 lrvg $s3,$tweak+8($sp)
2154 srag %r0,$s3,63 # broadcast upper bit
2159 lrvgr $s1,$s1 # flip byte order
2161 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
# 2nd tweak lives 16 bytes below the 1st in the frame
2162 stg $s1,$tweak-16+0($sp) # save the 2nd tweak
2165 stg $s3,$tweak-16+8($sp)
2168 x $s0,0($inp) # tweak_the_2nd^=*(inp)
2172 stm${g} %r2,%r3,2*$SIZE_T($sp)
2174 bras $ra,_s390x_AES_decrypt
2175 lm${g} %r2,%r5,2*$SIZE_T($sp)
2176 x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd
2177 x $s1,$tweak-16+4($sp)
2178 x $s2,$tweak-16+8($sp)
2179 x $s3,$tweak-16+12($sp)
2183 st $s3,12($out,$inp)
# ciphertext stealing (software path)
2185 la $i3,0($out,$inp) # put aside real $out
2188 llgc %r1,0($out,$inp)
2189 stc %r0,0($out,$inp)
2190 stc %r1,16($out,$inp)
2192 brct $len,.Lxts_dec_steal
2193 la $out,0($i3) # restore real $out
2195 lm $s0,$s3,$tweak($sp) # load the 1st tweak
2196 x $s0,0($out) # tweak^=*(inp)|stolen cipher-text
2200 st${g} $out,4*$SIZE_T($sp)
2202 bras $ra,_s390x_AES_decrypt
2203 l${g} $out,4*$SIZE_T($sp)
2204 x $s0,$tweak+0($sp) # ^=tweak
2207 x $s3,$tweak+12($sp)
# Epilogue: scrub both key-derived tweaks from the stack, restore the
# callee-saved registers and return.
# FIX: the final store used "$twesk" — an undefined Perl variable —
# which interpolated to an empty string, emitting "stg $sp,+8($sp)" and
# leaving the upper 8 bytes of the 1st tweak un-wiped on the stack.
2212 stg $sp,$tweak-16+0($sp) # wipe 2nd tweak
2213 stg $sp,$tweak-16+8($sp)
2215 stg $sp,$tweak+0($sp) # wipe tweak
2216 stg $sp,$tweak+8($sp)
2217 lm${g} %r6,$ra,6*$SIZE_T($sp)
2219 .size AES_xts_decrypt,.-AES_xts_decrypt
# Module trailer: embed the CRYPTOGAMS attribution string, then
# post-process the generated $code — the s///gem pass evaluates every
# `...`-quoted Perl expression embedded in the assembly text (offset
# arithmetic like `2*$SIZE_T-1`) and splices in the numeric result.
2223 .string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
2226 $code =~ s/\`([^\`]*)\`/eval $1/gem;
2228 close STDOUT; # force flush