3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
14 # Software performance improvement over gcc-generated code is ~70% and
15 # in absolute terms is ~73 cycles per byte processed with 128-bit key.
16 # You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
17 # *strictly* in-order execution and issued instruction [in this case
18 # load value from memory is critical] has to complete before execution
19 # flow proceeds. S-boxes are compressed to 2KB[+256B].
21 # As for hardware acceleration support. It's basically a "teaser," as
22 # it can and should be improved in several ways. Most notably support
23 # for CBC is not utilized, nor are multiple blocks ever processed.
24 # Then software key schedule can be postponed till hardware support
25 # detection... Performance improvement over assembler is reportedly
26 # ~2.5x, but can reach >8x [naturally on larger chunks] if proper
27 # support is implemented.
31 # Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
32 # for 128-bit keys, if hardware support is detected.
36 # Add support for hardware AES192/256 and reschedule instructions to
37 # minimize/avoid Address Generation Interlock hazard and to favour
38 # dual-issue z10 pipeline. This gave ~25% improvement on z10 and
39 # almost 50% on z9. The gain is smaller on z10, because being dual-
40 # issue z10 makes it impossible to eliminate the interlock condition:
41 # critical path is not long enough. Yet it spends ~24 cycles per byte
42 # processed with 128-bit key.
44 # Unlike previous version hardware support detection takes place only
45 # at the moment of key schedule setup, which is denoted in key->rounds.
46 # This is done, because deferred key setup can't be made MT-safe, not
47 # for keys longer than 128 bits.
49 # Add AES_cbc_encrypt, which gives incredible performance improvement,
50 # it was measured to be ~6.6x. It's less than previously mentioned 8x,
51 # because software implementation was optimized.
55 # Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
56 # performance improvement over "generic" counter mode routine relying
57 # on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
58 # to the fact that exact throughput value depends on current stack
59 # frame alignment within 4KB page. In worst case you get ~75% of the
60 # maximum, but *on average* it would be as much as ~98%. Meaning that
61 # worst case is unlikely, it's like hitting a ravine on a plateau.
65 # Adapt for -m31 build. If kernel supports what's called "highgprs"
66 # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
67 # instructions and achieve "64-bit" performance even in 31-bit legacy
68 # application context. The feature is not specific to any particular
69 # processor, as long as it's "z-CPU". Latter implies that the code
70 # remains z/Architecture specific. On z990 it was measured to perform
71 # 2x better than code generated by gcc 4.3.
75 # Add support for z196 "cipher message with counter" instruction.
76 # Note however that it's disengaged, because it was measured to
77 # perform ~12% worse than vanilla km-based code...
81 # Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
82 # instructions, which deliver ~70% improvement at 8KB block size over
83 # vanilla km-based code, 37% - at most like 512-bytes block size.
87 if ($flavour =~ /3[12]/) {
95 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
96 open STDOUT,">$output";
98 $softonly=0; # allow hardware support
100 $t0="%r0"; $mask="%r0";
102 $t2="%r2"; $inp="%r2";
103 $t3="%r3"; $out="%r3"; $bits="%r3";
117 $stdframe=16*$SIZE_T+4*8;
121 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
127 .type AES_Te,\@object
132 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
133 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
134 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
135 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
136 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
137 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
138 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
139 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
140 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
141 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
142 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
143 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
144 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
145 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
146 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
147 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
148 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
149 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
150 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
151 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
152 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
153 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
154 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
155 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
156 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
157 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
158 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
159 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
160 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
161 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
162 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
163 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
164 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
165 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
166 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
167 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
168 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
169 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
170 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
171 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
172 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
173 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
174 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
175 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
176 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
177 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
178 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
179 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
180 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
181 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
182 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
183 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
184 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
185 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
186 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
187 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
188 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
189 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
190 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
191 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
192 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
193 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
194 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
195 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
198 .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
199 .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
200 .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
201 .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
202 .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
203 .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
204 .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
205 .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
206 .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
207 .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
208 .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
209 .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
210 .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
211 .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
212 .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
213 .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
214 .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
215 .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
216 .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
217 .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
218 .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
219 .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
220 .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
221 .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
222 .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
223 .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
224 .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
225 .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
226 .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
227 .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
228 .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
229 .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
231 .long 0x01000000, 0x02000000, 0x04000000, 0x08000000
232 .long 0x10000000, 0x20000000, 0x40000000, 0x80000000
233 .long 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
235 .size AES_Te,.-AES_Te
237 # void AES_encrypt(const unsigned char *inp, unsigned char *out,
238 # const AES_KEY *key) {
240 .type AES_encrypt,\@function
243 $code.=<<___ if (!$softonly);
252 lghi %r3,16 # single block length
253 .long 0xb92e0042 # km %r4,%r2
254 brc 1,.-4 # can this happen?
260 stm${g} %r3,$ra,3*$SIZE_T($sp)
268 bras $ra,_s390x_AES_encrypt
270 l${g} $out,3*$SIZE_T($sp)
276 lm${g} %r6,$ra,6*$SIZE_T($sp)
278 .size AES_encrypt,.-AES_encrypt
280 .type _s390x_AES_encrypt,\@function
283 st${g} $ra,15*$SIZE_T($sp)
289 llill $mask,`0xff<<3`
303 srlg $i1,$s1,`16-3` # i0
312 l $s0,0($s0,$tbl) # Te0[s0>>24]
313 l $t1,1($t1,$tbl) # Te3[s0>>0]
314 l $t2,2($t2,$tbl) # Te2[s0>>8]
315 l $t3,3($t3,$tbl) # Te1[s0>>16]
317 x $s0,3($i1,$tbl) # Te1[s1>>16]
318 l $s1,0($s1,$tbl) # Te0[s1>>24]
319 x $t2,1($i2,$tbl) # Te3[s1>>0]
320 x $t3,2($i3,$tbl) # Te2[s1>>8]
322 srlg $i1,$s2,`8-3` # i0
323 srlg $i2,$s2,`16-3` # i1
332 srlg $ra,$s3,`8-3` # i1
333 sllg $t1,$s3,`0+3` # i0
338 x $s0,2($i1,$tbl) # Te2[s2>>8]
339 x $s1,3($i2,$tbl) # Te1[s2>>16]
340 l $s2,0($s2,$tbl) # Te0[s2>>24]
341 x $t3,1($i3,$tbl) # Te3[s2>>0]
343 srlg $i3,$s3,`16-3` # i2
354 x $s0,1($t1,$tbl) # Te3[s3>>0]
355 x $s1,2($ra,$tbl) # Te2[s3>>8]
356 x $s2,3($i3,$tbl) # Te1[s3>>16]
357 l $s3,0($s3,$tbl) # Te0[s3>>24]
360 brct $rounds,.Lenc_loop
372 srlg $i1,$s1,`16-3` # i0
381 llgc $s0,2($s0,$tbl) # Te4[s0>>24]
382 llgc $t1,2($t1,$tbl) # Te4[s0>>0]
384 llgc $t2,2($t2,$tbl) # Te4[s0>>8]
385 llgc $t3,2($t3,$tbl) # Te4[s0>>16]
389 llgc $i1,2($i1,$tbl) # Te4[s1>>16]
390 llgc $s1,2($s1,$tbl) # Te4[s1>>24]
391 llgc $i2,2($i2,$tbl) # Te4[s1>>0]
392 llgc $i3,2($i3,$tbl) # Te4[s1>>8]
401 srlg $i1,$s2,`8-3` # i0
402 srlg $i2,$s2,`16-3` # i1
410 sllg $t1,$s3,`0+3` # i0
411 srlg $ra,$s3,`8-3` # i1
414 llgc $i1,2($i1,$tbl) # Te4[s2>>8]
415 llgc $i2,2($i2,$tbl) # Te4[s2>>16]
417 llgc $s2,2($s2,$tbl) # Te4[s2>>24]
418 llgc $i3,2($i3,$tbl) # Te4[s2>>0]
427 srlg $i3,$s3,`16-3` # i2
435 llgc $i1,2($t1,$tbl) # Te4[s3>>0]
436 llgc $i2,2($ra,$tbl) # Te4[s3>>8]
437 llgc $i3,2($i3,$tbl) # Te4[s3>>16]
438 llgc $s3,2($s3,$tbl) # Te4[s3>>24]
447 l${g} $ra,15*$SIZE_T($sp)
454 .size _s390x_AES_encrypt,.-_s390x_AES_encrypt
458 .type AES_Td,\@object
463 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
464 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
465 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
466 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
467 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
468 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
469 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
470 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
471 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
472 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
473 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
474 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
475 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
476 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
477 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
478 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
479 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
480 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
481 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
482 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
483 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
484 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
485 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
486 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
487 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
488 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
489 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
490 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
491 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
492 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
493 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
494 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
495 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
496 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
497 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
498 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
499 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
500 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
501 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
502 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
503 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
504 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
505 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
506 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
507 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
508 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
509 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
510 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
511 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
512 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
513 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
514 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
515 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
516 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
517 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
518 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
519 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
520 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
521 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
522 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
523 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
524 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
525 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
526 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
529 .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
530 .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
531 .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
532 .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
533 .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
534 .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
535 .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
536 .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
537 .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
538 .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
539 .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
540 .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
541 .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
542 .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
543 .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
544 .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
545 .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
546 .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
547 .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
548 .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
549 .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
550 .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
551 .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
552 .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
553 .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
554 .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
555 .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
556 .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
557 .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
558 .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
559 .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
560 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
561 .size AES_Td,.-AES_Td
563 # void AES_decrypt(const unsigned char *inp, unsigned char *out,
564 # const AES_KEY *key) {
566 .type AES_decrypt,\@function
569 $code.=<<___ if (!$softonly);
578 lghi %r3,16 # single block length
579 .long 0xb92e0042 # km %r4,%r2
580 brc 1,.-4 # can this happen?
586 stm${g} %r3,$ra,3*$SIZE_T($sp)
594 bras $ra,_s390x_AES_decrypt
596 l${g} $out,3*$SIZE_T($sp)
602 lm${g} %r6,$ra,6*$SIZE_T($sp)
604 .size AES_decrypt,.-AES_decrypt
606 .type _s390x_AES_decrypt,\@function
609 st${g} $ra,15*$SIZE_T($sp)
615 llill $mask,`0xff<<3`
629 sllg $i1,$s1,`0+3` # i0
638 l $s0,0($s0,$tbl) # Td0[s0>>24]
639 l $t1,3($t1,$tbl) # Td1[s0>>16]
640 l $t2,2($t2,$tbl) # Td2[s0>>8]
641 l $t3,1($t3,$tbl) # Td3[s0>>0]
643 x $s0,1($i1,$tbl) # Td3[s1>>0]
644 l $s1,0($s1,$tbl) # Td0[s1>>24]
645 x $t2,3($i2,$tbl) # Td1[s1>>16]
646 x $t3,2($i3,$tbl) # Td2[s1>>8]
648 srlg $i1,$s2,`8-3` # i0
649 sllg $i2,$s2,`0+3` # i1
658 srlg $ra,$s3,`8-3` # i1
659 srlg $t1,$s3,`16-3` # i0
664 x $s0,2($i1,$tbl) # Td2[s2>>8]
665 x $s1,1($i2,$tbl) # Td3[s2>>0]
666 l $s2,0($s2,$tbl) # Td0[s2>>24]
667 x $t3,3($i3,$tbl) # Td1[s2>>16]
669 sllg $i3,$s3,`0+3` # i2
680 x $s0,3($t1,$tbl) # Td1[s3>>16]
681 x $s1,2($ra,$tbl) # Td2[s3>>8]
682 x $s2,1($i3,$tbl) # Td3[s3>>0]
683 l $s3,0($s3,$tbl) # Td0[s3>>24]
686 brct $rounds,.Ldec_loop
689 l $t1,`2048+0`($tbl) # prefetch Td4
690 l $t2,`2048+64`($tbl)
691 l $t3,`2048+128`($tbl)
692 l $i1,`2048+192`($tbl)
709 llgc $i3,2048($i3,$tbl) # Td4[s0>>24]
710 llgc $t1,2048($t1,$tbl) # Td4[s0>>16]
711 llgc $t2,2048($t2,$tbl) # Td4[s0>>8]
713 llgc $t3,2048($s0,$tbl) # Td4[s0>>0]
717 llgc $s1,2048($s1,$tbl) # Td4[s1>>0]
718 llgc $i1,2048($i1,$tbl) # Td4[s1>>24]
719 llgc $i2,2048($i2,$tbl) # Td4[s1>>16]
721 llgc $i3,2048($ra,$tbl) # Td4[s1>>8]
735 llgc $i1,2048($i1,$tbl) # Td4[s2>>8]
736 llgc $s1,2048($s2,$tbl) # Td4[s2>>0]
737 llgc $i2,2048($i2,$tbl) # Td4[s2>>24]
738 llgc $i3,2048($i3,$tbl) # Td4[s2>>16]
753 l${g} $ra,15*$SIZE_T($sp)
758 llgc $i1,2048($i1,$tbl) # Td4[s3>>16]
759 llgc $i2,2048($i2,$tbl) # Td4[s3>>8]
761 llgc $s2,2048($s3,$tbl) # Td4[s3>>0]
762 llgc $s3,2048($i3,$tbl) # Td4[s3>>24]
776 .size _s390x_AES_decrypt,.-_s390x_AES_decrypt
780 # void AES_set_encrypt_key(const unsigned char *in, int bits,
782 .globl AES_set_encrypt_key
783 .type AES_set_encrypt_key,\@function
786 _s390x_AES_set_encrypt_key:
808 $code.=<<___ if (!$softonly);
809 # convert bits to km code, [128,192,256]->[18,19,20]
816 larl %r1,OPENSSL_s390xcap_P
818 tmhl %r0,0x4000 # check for message-security assist
821 lghi %r0,0 # query capability vector
823 .long 0xb92f0042 # kmc %r4,%r2
830 lmg %r0,%r1,0($inp) # just copy 128 bits...
840 1: st $bits,236($key) # save bits
841 st %r5,240($key) # save km code
848 stm${g} %r6,%r13,6*$SIZE_T($sp) # all non-volatile regs
850 larl $tbl,AES_Te+2048
869 llgfr $t2,$s3 # temp=rk[3]
883 icm $t2,2,0($t2) # Te4[rk[3]>>0]<<8
884 icm $t2,4,0($i1) # Te4[rk[3]>>8]<<16
885 icm $t2,8,0($i2) # Te4[rk[3]>>16]<<24
886 icm $t2,1,0($i3) # Te4[rk[3]>>24]
887 x $t2,256($t3,$tbl) # rcon[i]
888 xr $s0,$t2 # rk[4]=rk[0]^...
889 xr $s1,$s0 # rk[5]=rk[1]^rk[4]
890 xr $s2,$s1 # rk[6]=rk[2]^rk[5]
891 xr $s3,$s2 # rk[7]=rk[3]^rk[6]
893 llgfr $t2,$s3 # temp=rk[3]
905 la $key,16($key) # key+=4
907 brct $rounds,.L128_loop
909 lm${g} %r6,%r13,6*$SIZE_T($sp)
941 icm $t1,2,0($t1) # Te4[rk[5]>>0]<<8
942 icm $t1,4,0($i1) # Te4[rk[5]>>8]<<16
943 icm $t1,8,0($i2) # Te4[rk[5]>>16]<<24
944 icm $t1,1,0($i3) # Te4[rk[5]>>24]
945 x $t1,256($t3,$tbl) # rcon[i]
946 xr $s0,$t1 # rk[6]=rk[0]^...
947 xr $s1,$s0 # rk[7]=rk[1]^rk[6]
948 xr $s2,$s1 # rk[8]=rk[2]^rk[7]
949 xr $s3,$s2 # rk[9]=rk[3]^rk[8]
955 brct $rounds,.L192_continue
957 lm${g} %r6,%r13,6*$SIZE_T($sp)
963 x $t1,16($key) # rk[10]=rk[4]^rk[9]
965 x $t1,20($key) # rk[11]=rk[5]^rk[10]
975 la $key,24($key) # key+=6
1004 icm $t1,2,0($t1) # Te4[rk[7]>>0]<<8
1005 icm $t1,4,0($i1) # Te4[rk[7]>>8]<<16
1006 icm $t1,8,0($i2) # Te4[rk[7]>>16]<<24
1007 icm $t1,1,0($i3) # Te4[rk[7]>>24]
1008 x $t1,256($t3,$tbl) # rcon[i]
1009 xr $s0,$t1 # rk[8]=rk[0]^...
1010 xr $s1,$s0 # rk[9]=rk[1]^rk[8]
1011 xr $s2,$s1 # rk[10]=rk[2]^rk[9]
1012 xr $s3,$s2 # rk[11]=rk[3]^rk[10]
1017 brct $rounds,.L256_continue
1019 lm${g} %r6,%r13,6*$SIZE_T($sp)
1024 lgr $t1,$s3 # temp=rk[11]
1035 llgc $t1,0($t1) # Te4[rk[11]>>0]
1036 icm $t1,2,0($i1) # Te4[rk[11]>>8]<<8
1037 icm $t1,4,0($i2) # Te4[rk[11]>>16]<<16
1038 icm $t1,8,0($i3) # Te4[rk[11]>>24]<<24
1039 x $t1,16($key) # rk[12]=rk[4]^...
1041 x $t1,20($key) # rk[13]=rk[5]^rk[12]
1043 x $t1,24($key) # rk[14]=rk[6]^rk[13]
1045 x $t1,28($key) # rk[15]=rk[7]^rk[14]
1055 la $key,32($key) # key+=8
1062 .size AES_set_encrypt_key,.-AES_set_encrypt_key
1064 # void AES_set_decrypt_key(const unsigned char *in, int bits,
1066 .globl AES_set_decrypt_key
1067 .type AES_set_decrypt_key,\@function
1069 AES_set_decrypt_key:
1070 st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
1071 st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers!
1072 bras $ra,_s390x_AES_set_encrypt_key
1073 l${g} $key,4*$SIZE_T($sp)
1074 l${g} $ra,14*$SIZE_T($sp)
1078 $code.=<<___ if (!$softonly);
1083 oill $t0,0x80 # set "decrypt" bit
1089 st${g} $key,4*$SIZE_T($sp)
1090 st${g} $ra,14*$SIZE_T($sp)
1091 bras $ra,.Lekey_internal
1092 l${g} $key,4*$SIZE_T($sp)
1093 l${g} $ra,14*$SIZE_T($sp)
1097 .Lgo: llgf $rounds,240($key)
1105 .Linv: lmg $s0,$s1,0($i1)
1117 llgf $rounds,240($key)
1119 sll $rounds,2 # (rounds-1)*4
1120 llilh $mask80,0x8080
1121 llilh $mask1b,0x1b1b
1122 llilh $maskfe,0xfefe
1128 .Lmix: l $s0,16($key) # tp1
1156 xr $s1,$s0 # tp2^tp1
1157 xr $s2,$s0 # tp4^tp1
1158 rll $s0,$s0,24 # = ROTATE(tp1,8)
1160 xr $s0,$s1 # ^=tp2^tp1
1161 xr $s1,$s3 # tp2^tp1^tp8
1162 xr $s0,$s2 # ^=tp4^tp1^tp8
1165 xr $s0,$s1 # ^= ROTATE(tp8^tp2^tp1,24)
1167 xr $s0,$s2 # ^= ROTATE(tp8^tp4^tp1,16)
1168 xr $s0,$s3 # ^= ROTATE(tp8,8)
1174 lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
1177 .size AES_set_decrypt_key,.-AES_set_decrypt_key
1180 ########################################################################
1181 # void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
1182 # size_t length, const AES_KEY *key,
1183 # unsigned char *ivec, const int enc)
1186 my $out="%r4"; # length and out are swapped
1192 .globl AES_cbc_encrypt
1193 .type AES_cbc_encrypt,\@function
1196 xgr %r3,%r4 # flip %r3 and %r4, out and len
1200 $code.=<<___ if (!$softonly);
1205 lg %r0,0($ivp) # copy ivec
1207 stmg %r0,%r1,16($sp)
1208 lmg %r0,%r1,0($key) # copy key, cover 256 bit
1209 stmg %r0,%r1,32($sp)
1210 lmg %r0,%r1,16($key)
1211 stmg %r0,%r1,48($sp)
1212 l %r0,240($key) # load kmc code
1213 lghi $key,15 # res=len%16, len-=res;
1216 la %r1,16($sp) # parameter block - ivec || key
1218 .long 0xb92f0042 # kmc %r4,%r2
1219 brc 1,.-4 # pay attention to "partial completion"
1223 lmg %r0,%r1,16($sp) # copy ivec to caller
1229 ahi $key,-1 # it's the way it's encoded in mvc
1231 jnz .Lkmc_truncated_dec
1233 stg %r1,16*$SIZE_T($sp)
1234 stg %r1,16*$SIZE_T+8($sp)
1236 mvc 16*$SIZE_T(1,$sp),0($inp)
1238 la %r1,16($sp) # restore parameter block
1239 la $inp,16*$SIZE_T($sp)
1241 .long 0xb92f0042 # kmc %r4,%r2
1244 .Lkmc_truncated_dec:
1245 st${g} $out,4*$SIZE_T($sp)
1246 la $out,16*$SIZE_T($sp)
1248 .long 0xb92f0042 # kmc %r4,%r2
1249 l${g} $out,4*$SIZE_T($sp)
1251 mvc 0(1,$out),16*$SIZE_T($sp)
1258 stm${g} $key,$ra,5*$SIZE_T($sp)
1260 cl %r0,`$stdframe+$SIZE_T-4`($sp)
1272 brc 4,.Lcbc_enc_tail # if borrow
1274 stm${g} $inp,$out,2*$SIZE_T($sp)
1281 bras $ra,_s390x_AES_encrypt
1283 lm${g} $inp,$key,2*$SIZE_T($sp)
1295 brc 4,.Lcbc_enc_tail # if borrow
1299 l${g} $ivp,6*$SIZE_T($sp)
1305 lm${g} %r7,$ra,7*$SIZE_T($sp)
1312 stg $t0,16*$SIZE_T($sp)
1313 stg $t0,16*$SIZE_T+8($sp)
1315 mvc 16*$SIZE_T(1,$sp),0($inp)
1318 la $inp,16*$SIZE_T($sp)
1327 stmg $t0,$t1,16*$SIZE_T($sp)
1330 stm${g} $inp,$out,2*$SIZE_T($sp)
1337 bras $ra,_s390x_AES_decrypt
1339 lm${g} $inp,$key,2*$SIZE_T($sp)
1347 xg $s0,16*$SIZE_T($sp)
1348 xg $s2,16*$SIZE_T+8($sp)
1351 brc 4,.Lcbc_dec_tail # if borrow
1352 brc 2,.Lcbc_dec_done # if zero
1355 stmg $t0,$t1,16*$SIZE_T($sp)
1365 lm${g} %r6,$ra,6*$SIZE_T($sp)
1366 stmg $t0,$t1,0($ivp)
1373 stg $s0,16*$SIZE_T($sp)
1374 stg $s2,16*$SIZE_T+8($sp)
1376 mvc 0(1,$out),16*$SIZE_T($sp)
1379 .size AES_cbc_encrypt,.-AES_cbc_encrypt
1382 ########################################################################
1383 # void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
1384 # size_t blocks, const AES_KEY *key,
1385 # const unsigned char *ivec)
1388 my $out="%r4"; # blocks and out are swapped
1390 my $key="%r5"; my $iv0="%r5";
1395 .globl AES_ctr32_encrypt
1396 .type AES_ctr32_encrypt,\@function
1399 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1402 llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case
1404 $code.=<<___ if (!$softonly);
1410 stm${g} %r6,$s3,6*$SIZE_T($sp)
1413 la %r1,0($key) # %r1 is permanent copy of $key
1414 lg $iv0,0($ivp) # load ivec
1417 # prepare and allocate stack frame at the top of 4K page
1418 # with 1K reserved for eventual signal handling
1419 lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
1423 ngr $s0,$s1 # align at page boundary
1424 slgr $fp,$s0 # total buffer size
1426 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1427 slgr $fp,$s1 # deduct reservation to get usable buffer size
1428 # buffer size is at least 256 and at most 3072+256-16
1430 la $sp,1024($s0) # alloca
1431 srlg $fp,$fp,4 # convert bytes to blocks, minimum 16
1432 st${g} $s2,0($sp) # back-chain
1433 st${g} $fp,$SIZE_T($sp)
1436 brc 1,.Lctr32_hw_switch # not zero, no borrow
1437 algr $fp,$len # input is shorter than allocated buffer
1439 st${g} $fp,$SIZE_T($sp)
1443 $code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower
1444 larl $s0,OPENSSL_s390xcap_P
1446 tmhh $s0,0x0004 # check for message_security-assist-4
1453 .long 0xb92d2042 # kmctr %r4,%r2,%r2
1455 llihh %r0,0x8000 # check if kmctr supports the function code
1463 algr $out,$inp # restore $out
1464 lgr $s1,$len # $s1 undertakes $len
1465 j .Lctr32_kmctr_loop
1470 .Lctr32_kmctr_prepare:
1474 ahi $ivp,1 # 32-bit increment, preserves upper half
1475 brct $s3,.Lctr32_kmctr_prepare
1477 #la $inp,0($inp) # inp
1478 sllg $len,$fp,4 # len
1479 #la $out,0($out) # out
1481 .long 0xb92da042 # kmctr $out,$s2,$inp
1482 brc 1,.-4 # pay attention to "partial completion"
1485 brc 1,.Lctr32_kmctr_loop # not zero, no borrow
1488 brc 4+1,.Lctr32_kmctr_loop # not zero
1491 lm${g} %r6,$s3,6*$SIZE_T($sp)
1503 ahi $ivp,1 # 32-bit increment, preserves upper half
1504 brct $s3,.Lctr32_km_prepare
1506 la $s0,16($sp) # inp
1507 sllg $s1,$fp,4 # len
1508 la $s2,16($sp) # out
1509 .long 0xb92e00a8 # km %r10,%r8
1510 brc 1,.-4 # pay attention to "partial completion"
1520 stg $s0,0($out,$inp)
1521 stg $s1,8($out,$inp)
1523 brct $s3,.Lctr32_km_xor
1526 brc 1,.Lctr32_km_loop # not zero, no borrow
1529 brc 4+1,.Lctr32_km_loop # not zero
1532 l${g} $s1,$SIZE_T($sp)
1538 brct $s1,.Lctr32_km_zap
1541 lm${g} %r6,$s3,6*$SIZE_T($sp)
1547 stm${g} $key,$ra,5*$SIZE_T($sp)
1553 stm${g} $inp,$out,2*$SIZE_T($sp)
1558 st $t1,16*$SIZE_T($sp)
1561 bras $ra,_s390x_AES_encrypt
1563 lm${g} $inp,$ivp,2*$SIZE_T($sp)
1564 llgf $t1,16*$SIZE_T($sp)
1572 ahi $t1,1 # 32-bit increment
1573 brct $len,.Lctr32_loop
1575 lm${g} %r6,$ra,6*$SIZE_T($sp)
1577 .size AES_ctr32_encrypt,.-AES_ctr32_encrypt
1581 ########################################################################
1582 # void AES_xts_encrypt(const char *inp,char *out,size_t len,
1583 # const AES_KEY *key1, const AES_KEY *key2,
1584 # const unsigned char iv[16]);
1588 my $out="%r4"; # len and out are swapped
1590 my $key1="%r5"; # $i1
1591 my $key2="%r6"; # $i2
1593 my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame...
1596 .type _s390x_xts_km,\@function
1601 llgfr $s0,%r0 # put aside the function code
1604 lghi %r0,0 # query capability vector
1605 la %r1,2*$SIZE_T($sp)
1606 .long 0xb92e0042 # km %r4,%r2
1608 srlg %r1,%r1,32($s1) # check for 32+function code
1609 ng %r1,2*$SIZE_T($sp)
1610 lgr %r0,$s0 # restore the function code
1611 la %r1,0($key1) # restore $key1
1614 lmg $i2,$i3,$tweak($sp) # put aside the tweak value
1617 oill %r0,32 # switch to xts function code
1619 sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16
1620 la %r1,$tweak-16($sp)
1621 slgr %r1,$s1 # parameter block position
1622 lmg $s0,$s3,0($key1) # load 256 bits of key material,
1623 stmg $s0,$s3,0(%r1) # and copy it to parameter block.
1624 # yes, it contains junk and overlaps
1625 # with the tweak in 128-bit case.
1626 # it's done to avoid conditional
1628 stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value
1630 .long 0xb92e0042 # km %r4,%r2
1631 brc 1,.-4 # pay attention to "partial completion"
1633 lrvg $s0,$tweak+0($sp) # load the last tweak
1634 lrvg $s1,$tweak+8($sp)
1635 stmg %r0,%r3,$tweak-32(%r1) # wipe copy of the key
1637 nill %r0,0xffdf # switch back to original function code
1638 la %r1,0($key1) # restore pointer to $key1
1641 llgc $len,2*$SIZE_T-1($sp)
1642 nill $len,0x0f # $len%=16
1649 # prepare and allocate stack frame at the top of 4K page
1650 # with 1K reserved for eventual signal handling
1651 lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
1655 ngr $s0,$s1 # align at page boundary
1656 slgr $fp,$s0 # total buffer size
1658 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1659 slgr $fp,$s1 # deduct reservation to get usable buffer size
1660 # buffer size is at least 256 and at most 3072+256-16
1662 la $sp,1024($s0) # alloca
1663 nill $fp,0xfff0 # round to 16*n
1664 st${g} $s2,0($sp) # back-chain
1665 nill $len,0xfff0 # redundant
1666 st${g} $fp,$SIZE_T($sp)
1669 brc 1,.Lxts_km_go # not zero, no borrow
1670 algr $fp,$len # input is shorter than allocated buffer
1672 st${g} $fp,$SIZE_T($sp)
1675 lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian
1676 lrvg $s1,$tweak+8($s2)
1678 la $s2,16($sp) # vector of ascending tweak values
1689 srag $i2,$s1,63 # broadcast upper bit
1691 srlg $i2,$s0,63 # carry bit from lower half
1697 lrvgr $i1,$s0 # flip byte order
1703 stg $i1,0($out,$inp)
1704 stg $i2,8($out,$inp)
1706 brct $s3,.Lxts_km_prepare
1708 slgr $inp,$fp # rewind $inp
1711 .long 0xb92e00aa # km $s2,$s2
1712 brc 1,.-4 # pay attention to "partial completion"
1722 stg $i1,0($out,$inp)
1723 stg $i2,8($out,$inp)
1725 brct $s3,.Lxts_km_xor
1728 brc 1,.Lxts_km_loop # not zero, no borrow
1731 brc 4+1,.Lxts_km_loop # not zero
1733 l${g} $i1,0($sp) # back-chain
1734 llgf $fp,`2*$SIZE_T-4`($sp) # bytes used
1741 brct $fp,.Lxts_km_zap
1744 llgc $len,2*$SIZE_T-1($i1)
1745 nill $len,0x0f # $len%=16
1748 # generate one more tweak...
1750 srag $i2,$s1,63 # broadcast upper bit
1752 srlg $i2,$s0,63 # carry bit from lower half
1758 ltr $len,$len # clear zero flag
1760 .size _s390x_xts_km,.-_s390x_xts_km
# AES_xts_encrypt -- XTS-mode encryption entry point.
# Input shorter than one block returns at once.  Otherwise the request
# is served either by the km-instruction path or, after the branch to
# .Lxts_enc_software, by repeated calls to _s390x_AES_encrypt.  A
# trailing partial block is completed by cipher-text stealing.  Each
# path overwrites the stored tweak before restoring registers.
1762 .globl AES_xts_encrypt
1763 .type AES_xts_encrypt,\@function
1766 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1770 $code.=<<___ if ($SIZE_T==4);
1774 st${g} $len,1*$SIZE_T($sp) # save copy of $len
1775 srag $len,$len,4 # formally wrong, because it expands
1776 # sign byte, but who can afford asking
1777 # to process more than 2^63-1 bytes?
1778 # I use it, because it sets condition
1780 bcr 8,$ra # abort if zero (i.e. less than 16)
1782 $code.=<<___ if (!$softonly);
1786 jl .Lxts_enc_software
# km-instruction path: save %r6 up to $s3 plus the return address
1788 stm${g} %r6,$s3,6*$SIZE_T($sp)
1789 st${g} $ra,14*$SIZE_T($sp)
1791 sllg $len,$len,4 # $len&=~15
1794 # generate the tweak value
1795 l${g} $s3,$stdframe($sp) # pointer to iv
1800 la %r1,0($key2) # $key2 is not needed anymore
1801 .long 0xb92e00aa # km $s2,$s2, generate the tweak
1802 brc 1,.-4 # can this happen?
1805 la %r1,0($key1) # $key1 is not needed anymore
1806 bras $ra,_s390x_xts_km
1807 jz .Lxts_enc_km_done
# km path, trailing partial block: swap bytes for cipher-text stealing
1809 aghi $inp,-16 # take one step back
1810 la $i3,0($out,$inp) # put aside real $out
1813 llgc $i2,0($out,$inp)
1814 stc $i1,0($out,$inp)
1815 stc $i2,16($out,$inp)
1817 brct $len,.Lxts_enc_km_steal
1821 lrvgr $i1,$s0 # flip byte order
1827 .long 0xb92e00aa # km $s2,$s2
1828 brc 1,.-4 # can this happen?
1829 lrvgr $i1,$s0 # flip byte order
# km path epilogue: the tweak occupies 16 bytes, so it takes two
# 8-byte stores, at +0 and at +8, to overwrite all of it
1837 l${g} $ra,14*$SIZE_T($sp)
1838 stg $sp,$tweak+0($sp) # wipe tweak
1839 stg $sp,$tweak+8($sp)
1840 lm${g} %r6,$s3,6*$SIZE_T($sp)
# software path: save %r6 up to the return address
1846 stm${g} %r6,$ra,6*$SIZE_T($sp)
1850 xgr $s0,$s0 # clear upper half
# NOTE(review): the km path above loads this stack slot as a pointer
# to the iv, whereas the two loads below byte-swap the slot itself as
# secno -- confirm which form callers actually pass
1852 lrv $s0,$stdframe+4($sp) # load secno
1853 lrv $s1,$stdframe+0($sp)
1856 stm${g} %r2,%r5,2*$SIZE_T($sp)
1859 bras $ra,_s390x_AES_encrypt # generate the tweak
1860 lm${g} %r2,%r5,2*$SIZE_T($sp)
1861 stm $s0,$s3,$tweak($sp) # save the tweak
# per-block processing: advance the tweak, xor, encrypt, xor, store
1866 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1867 lrvg $s3,$tweak+8($sp)
1869 srag %r0,$s3,63 # broadcast upper bit
1871 srlg %r0,$s1,63 # carry bit from lower half
1876 lrvgr $s1,$s1 # flip byte order
1878 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1879 stg $s1,$tweak+0($sp) # save the tweak
1882 stg $s3,$tweak+8($sp)
1884 la $inp,16($inp) # $inp+=16
1886 x $s0,0($inp) # ^=*($inp)
1890 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
1892 bras $ra,_s390x_AES_encrypt
1893 lm${g} %r2,%r5,2*$SIZE_T($sp)
1894 x $s0,$tweak+0($sp) # ^=tweak
1897 x $s3,$tweak+12($sp)
1901 st $s3,12($out,$inp)
1902 brct${g} $len,.Lxts_enc_loop
# reload the low byte of the saved length to find the residue
1904 llgc $len,`2*$SIZE_T-1`($sp)
1905 nill $len,0x0f # $len%16
1908 la $i3,0($inp,$out) # put aside real $out
1911 llgc %r1,0($out,$inp)
1912 stc %r0,0($out,$inp)
1913 stc %r1,16($out,$inp)
1915 brct $len,.Lxts_enc_steal
1916 la $out,0($i3) # restore real $out
1918 # generate last tweak...
1919 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1920 lrvg $s3,$tweak+8($sp)
1922 srag %r0,$s3,63 # broadcast upper bit
1924 srlg %r0,$s1,63 # carry bit from lower half
1929 lrvgr $s1,$s1 # flip byte order
1931 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1932 stg $s1,$tweak+0($sp) # save the tweak
1935 stg $s3,$tweak+8($sp)
1938 x $s0,0($out) # ^=*(inp)|stolen cipher-text
1942 st${g} $out,4*$SIZE_T($sp)
1944 bras $ra,_s390x_AES_encrypt
1945 l${g} $out,4*$SIZE_T($sp)
1946 x $s0,`$tweak+0`($sp) # ^=tweak
1947 x $s1,`$tweak+4`($sp)
1948 x $s2,`$tweak+8`($sp)
1949 x $s3,`$tweak+12`($sp)
# software path epilogue: overwrite both 8-byte halves of the tweak
1956 stg $sp,$tweak+0($sp) # wipe tweak
1957 stg $sp,$tweak+8($sp)
1958 lm${g} %r6,$ra,6*$SIZE_T($sp)
1960 .size AES_xts_encrypt,.-AES_xts_encrypt
1962 # void AES_xts_decrypt(const char *inp,char *out,size_t len,
1963 # const AES_KEY *key1, const AES_KEY *key2,u64 secno);
# XTS-mode decryption entry point.  The request is served either by
# the km-instruction path or, after the branch to .Lxts_dec_software,
# by calls to _s390x_AES_decrypt (the tweak itself is still produced
# with _s390x_AES_encrypt).  When the length is not a multiple of the
# block size a pair of tweaks is generated and the tail is completed
# by cipher-text stealing.  Each path overwrites the stored tweak(s)
# before restoring registers.
1966 .globl AES_xts_decrypt
1967 .type AES_xts_decrypt,\@function
1970 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1974 $code.=<<___ if ($SIZE_T==4);
1978 st${g} $len,1*$SIZE_T($sp) # save copy of $len
1980 bcr 4,$ra # abort if less than zero. formally
1981 # wrong, because $len is unsigned,
1982 # but who can afford asking to
1983 # process more than 2^63-1 bytes?
1985 jnz .Lxts_dec_proceed
1989 $code.=<<___ if (!$softonly);
1993 jl .Lxts_dec_software
# km-instruction path: save %r6 up to $s3 plus the return address
1995 stm${g} %r6,$s3,6*$SIZE_T($sp)
1996 st${g} $ra,14*$SIZE_T($sp)
1998 nill $len,0xfff0 # $len&=~15
2001 # generate the tweak value
2002 l${g} $s3,$stdframe($sp) # pointer to iv
2007 la %r1,0($key2) # $key2 is not needed past this point
2008 .long 0xb92e00aa # km $s2,$s2, generate the tweak
2009 brc 1,.-4 # can this happen?
2012 la %r1,0($key1) # $key1 is not needed anymore
2015 jz .Lxts_dec_km_short
2016 bras $ra,_s390x_xts_km
2017 jz .Lxts_dec_km_done
2019 lrvgr $s2,$s0 # make copy in reverse byte order
2021 j .Lxts_dec_km_2ndtweak
# short input: recover the residue from the saved length and reload
# the tweak from the frame
2024 llgc $len,`2*$SIZE_T-1`($sp)
2025 nill $len,0x0f # $len%=16
2026 lrvg $s0,$tweak+0($sp) # load the tweak
2027 lrvg $s1,$tweak+8($sp)
2028 lrvgr $s2,$s0 # make copy in reverse byte order
2031 .Lxts_dec_km_2ndtweak:
2033 srag $i2,$s1,63 # broadcast upper bit
2035 srlg $i2,$s0,63 # carry bit from lower half
2040 lrvgr $i1,$s0 # flip byte order
2045 stg $i1,0($out,$inp)
2046 stg $i2,8($out,$inp)
2049 .long 0xb92e0066 # km $i2,$i2
2050 brc 1,.-4 # can this happen?
2055 stg $i1,0($out,$inp)
2056 stg $i2,8($out,$inp)
# km path, trailing partial block: swap bytes for cipher-text stealing
2058 la $i3,0($out,$inp) # put aside real $out
2061 llgc $i2,0($out,$inp)
2062 stc $i1,0($out,$inp)
2063 stc $i2,16($out,$inp)
2065 brct $len,.Lxts_dec_km_steal
2075 .long 0xb92e0088 # km $s0,$s0
2076 brc 1,.-4 # can this happen?
# km path epilogue: the tweak occupies 16 bytes, so it takes two
# 8-byte stores, at +0 and at +8, to overwrite all of it
2082 l${g} $ra,14*$SIZE_T($sp)
2083 stg $sp,$tweak+0($sp) # wipe tweak
2084 stg $sp,$tweak+8($sp)
2085 lm${g} %r6,$s3,6*$SIZE_T($sp)
# software path: save %r6 up to the return address
2091 stm${g} %r6,$ra,6*$SIZE_T($sp)
2096 xgr $s0,$s0 # clear upper half
# NOTE(review): the km path above loads this stack slot as a pointer
# to the iv, whereas the two loads below byte-swap the slot itself as
# secno -- confirm which form callers actually pass
2098 lrv $s0,$stdframe+4($sp) # load secno
2099 lrv $s1,$stdframe+0($sp)
2102 stm${g} %r2,%r5,2*$SIZE_T($sp)
2105 bras $ra,_s390x_AES_encrypt # generate the tweak
2106 lm${g} %r2,%r5,2*$SIZE_T($sp)
2109 stm $s0,$s3,$tweak($sp) # save the tweak
# per-block processing: advance the tweak, xor, decrypt, xor, store
2115 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2116 lrvg $s3,$tweak+8($sp)
2118 srag %r0,$s3,63 # broadcast upper bit
2120 srlg %r0,$s1,63 # carry bit from lower half
2125 lrvgr $s1,$s1 # flip byte order
2127 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
2128 stg $s1,$tweak+0($sp) # save the tweak
2131 stg $s3,$tweak+8($sp)
2134 x $s0,0($inp) # tweak^=*(inp)
2138 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
2140 bras $ra,_s390x_AES_decrypt
2141 lm${g} %r2,%r5,2*$SIZE_T($sp)
2142 x $s0,$tweak+0($sp) # ^=tweak
2145 x $s3,$tweak+12($sp)
2149 st $s3,12($out,$inp)
2151 brct${g} $len,.Lxts_dec_loop
# reload the low byte of the saved length to find the residue
2153 llgc $len,`2*$SIZE_T-1`($sp)
2154 nill $len,0x0f # $len%16
2157 # generate pair of tweaks...
2158 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2159 lrvg $s3,$tweak+8($sp)
2161 srag %r0,$s3,63 # broadcast upper bit
2163 srlg %r0,$s1,63 # carry bit from lower half
2168 lrvgr $i2,$s1 # flip byte order
2170 stmg $i2,$i3,$tweak($sp) # save the 1st tweak
2171 j .Lxts_dec_2ndtweak
2175 llgc $len,`2*$SIZE_T-1`($sp)
2176 nill $len,0x0f # $len%16
2177 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2178 lrvg $s3,$tweak+8($sp)
2181 srag %r0,$s3,63 # broadcast upper bit
2183 srlg %r0,$s1,63 # carry bit from lower half
2188 lrvgr $s1,$s1 # flip byte order
2190 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
# the 2nd tweak is kept in the 16 bytes just below the 1st one
2191 stg $s1,$tweak-16+0($sp) # save the 2nd tweak
2194 stg $s3,$tweak-16+8($sp)
2197 x $s0,0($inp) # tweak_the_2nd^=*(inp)
2201 stm${g} %r2,%r3,2*$SIZE_T($sp)
2203 bras $ra,_s390x_AES_decrypt
2204 lm${g} %r2,%r5,2*$SIZE_T($sp)
2205 x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd
2206 x $s1,$tweak-16+4($sp)
2207 x $s2,$tweak-16+8($sp)
2208 x $s3,$tweak-16+12($sp)
2212 st $s3,12($out,$inp)
# trailing partial block: swap bytes for cipher-text stealing
2214 la $i3,0($out,$inp) # put aside real $out
2217 llgc %r1,0($out,$inp)
2218 stc %r0,0($out,$inp)
2219 stc %r1,16($out,$inp)
2221 brct $len,.Lxts_dec_steal
2222 la $out,0($i3) # restore real $out
2224 lm $s0,$s3,$tweak($sp) # load the 1st tweak
2225 x $s0,0($out) # tweak^=*(inp)|stolen cipher-text
2229 st${g} $out,4*$SIZE_T($sp)
2231 bras $ra,_s390x_AES_decrypt
2232 l${g} $out,4*$SIZE_T($sp)
2233 x $s0,$tweak+0($sp) # ^=tweak
2236 x $s3,$tweak+12($sp)
# software path epilogue: overwrite both 8-byte halves of each tweak
2241 stg $sp,$tweak-16+0($sp) # wipe 2nd tweak
2242 stg $sp,$tweak-16+8($sp)
2244 stg $sp,$tweak+0($sp) # wipe tweak
2245 stg $sp,$tweak+8($sp)
2246 lm${g} %r6,$ra,6*$SIZE_T($sp)
2248 .size AES_xts_decrypt,.-AES_xts_decrypt
# Identification string and the common symbol for the capability vector
# (16 bytes, 8-byte aligned) emitted at the end of the generated module.
2252 .string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
2253 .comm OPENSSL_s390xcap_P,16,8
# Replace every back-quoted expression in the accumulated text with the
# result of evaluating it as Perl.
2256 $code =~ s/\`([^\`]*)\`/eval $1/gem;
# Closing flushes buffered output; fail loudly if that flush did not
# succeed so a truncated assembly file is never mistaken for a good one.
2258 close STDOUT or die "error closing STDOUT: $!"; # force flush