2 * x86_64/AVX2/AES-NI assembler implementation of Camellia
4 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
13 #include <linux/linkage.h>
14 #include <asm/nospec-branch.h>
16 #define CAMELLIA_TABLE_BYTE_LEN 272
18 /* struct camellia_ctx: */
20 #define key_length CAMELLIA_TABLE_BYTE_LEN
26 /**********************************************************************
28 **********************************************************************/
/*
 * filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)
 *
 * Byte-wise table transform of x built on vpshufb lookups.
 *  x:        vector being transformed (modified in place)
 *  lo_t:     vpshufb table indexed by (x & mask4bit)
 *  hi_t:     second lookup table; not referenced by the lines visible here
 *  mask4bit: mask selecting the bits of each byte that index lo_t
 *  tmp0:     scratch register, clobbered
 *
 * NOTE(review): the embedded numbering jumps from 31 to 34 and the visible
 * text stops at 34, so part of this macro appears to be missing from this
 * view; presumably the hi_t lookup and the final combine into x live
 * there - confirm against the full file.
 */
29 #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
/* tmp0 = x & mask4bit */ \
30 vpand x, mask4bit, tmp0; \
/* x = x & ~mask4bit */ \
31 vpandn x, mask4bit, x; \
/* tmp0 = lo_t looked up with the masked bytes as vpshufb indices */ \
34 vpshufb tmp0, lo_t, tmp0; \
55 /**********************************************************************
57 **********************************************************************/
61 * x0..x7: byte-sliced AB state
62 * mem_cd: register pointer storing CD state
63 * key: index for key material
65 * x0..x7: new byte-sliced CD state
/*
 * roundsm32: one round step on byte-sliced state held in x0..x7 (see the
 * operand notes above for x0..x7, mem_cd and key).
 *
 * Stages visible here, in order:
 *  1. broadcast the lookup constants into t2..t7
 *  2. pre-filter every x register with filter_8bit: the s1 tables (t5/t6)
 *     for x0, x7, x2, x5, x1, x4 and the s4 tables (t2/t3) for x3, x6
 *  3. run vaesenclast with an all-zero round key on both 128-bit halves of
 *     every x register; vextracti128 / vinserti128 split and rejoin them
 *  4. post-filter: s1 tables for x0, x7, x3, x6; s3 tables for x2, x5;
 *     s2 tables for x1, x4
 *  5. broadcast the 64-bit key into t0 and XOR x0..x3 with rows 4..7 and
 *     x4..x7 with rows 0..3 of mem_cd (32 bytes per row)
 *
 * t0..t7 are scratch and are clobbered.
 *
 * NOTE(review): the embedded numbering has many gaps (for example 79 to
 * 89, 162 to 170, 183 to 193), so instructions are missing from this
 * view; the list above covers only what is visible.
 */
67 #define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
70 * S-function with AES subbytes \
72 vbroadcasti128 .Linv_shift_row, t4; \
73 vpbroadcastd .L0f0f0f0f, t7; \
74 vbroadcasti128 .Lpre_tf_lo_s1, t5; \
75 vbroadcasti128 .Lpre_tf_hi_s1, t6; \
76 vbroadcasti128 .Lpre_tf_lo_s4, t2; \
77 vbroadcasti128 .Lpre_tf_hi_s4, t3; \
79 /* AES inverse shift rows */ \
89 /* prefilter sboxes 1, 2 and 3 */ \
90 /* prefilter sbox 4 */ \
91 filter_8bit(x0, t5, t6, t7, t4); \
92 filter_8bit(x7, t5, t6, t7, t4); \
93 vextracti128 $1, x0, t0##_x; \
94 vextracti128 $1, x7, t1##_x; \
95 filter_8bit(x3, t2, t3, t7, t4); \
96 filter_8bit(x6, t2, t3, t7, t4); \
97 vextracti128 $1, x3, t3##_x; \
98 vextracti128 $1, x6, t2##_x; \
99 filter_8bit(x2, t5, t6, t7, t4); \
100 filter_8bit(x5, t5, t6, t7, t4); \
101 filter_8bit(x1, t5, t6, t7, t4); \
102 filter_8bit(x4, t5, t6, t7, t4); \
/* zero the xmm view of t4: it is the round-key operand of every vaesenclast below */ \
104 vpxor t4##_x, t4##_x, t4##_x; \
106 /* AES subbytes + AES shift rows */ \
107 vextracti128 $1, x2, t6##_x; \
108 vextracti128 $1, x5, t5##_x; \
109 vaesenclast t4##_x, x0##_x, x0##_x; \
110 vaesenclast t4##_x, t0##_x, t0##_x; \
111 vinserti128 $1, t0##_x, x0, x0; \
112 vaesenclast t4##_x, x7##_x, x7##_x; \
113 vaesenclast t4##_x, t1##_x, t1##_x; \
114 vinserti128 $1, t1##_x, x7, x7; \
115 vaesenclast t4##_x, x3##_x, x3##_x; \
116 vaesenclast t4##_x, t3##_x, t3##_x; \
117 vinserti128 $1, t3##_x, x3, x3; \
118 vaesenclast t4##_x, x6##_x, x6##_x; \
119 vaesenclast t4##_x, t2##_x, t2##_x; \
120 vinserti128 $1, t2##_x, x6, x6; \
121 vextracti128 $1, x1, t3##_x; \
122 vextracti128 $1, x4, t2##_x; \
123 vbroadcasti128 .Lpost_tf_lo_s1, t0; \
124 vbroadcasti128 .Lpost_tf_hi_s1, t1; \
125 vaesenclast t4##_x, x2##_x, x2##_x; \
126 vaesenclast t4##_x, t6##_x, t6##_x; \
127 vinserti128 $1, t6##_x, x2, x2; \
128 vaesenclast t4##_x, x5##_x, x5##_x; \
129 vaesenclast t4##_x, t5##_x, t5##_x; \
130 vinserti128 $1, t5##_x, x5, x5; \
131 vaesenclast t4##_x, x1##_x, x1##_x; \
132 vaesenclast t4##_x, t3##_x, t3##_x; \
133 vinserti128 $1, t3##_x, x1, x1; \
134 vaesenclast t4##_x, x4##_x, x4##_x; \
135 vaesenclast t4##_x, t2##_x, t2##_x; \
136 vinserti128 $1, t2##_x, x4, x4; \
138 /* postfilter sboxes 1 and 4 */ \
139 vbroadcasti128 .Lpost_tf_lo_s3, t2; \
140 vbroadcasti128 .Lpost_tf_hi_s3, t3; \
141 filter_8bit(x0, t0, t1, t7, t6); \
142 filter_8bit(x7, t0, t1, t7, t6); \
143 filter_8bit(x3, t0, t1, t7, t6); \
144 filter_8bit(x6, t0, t1, t7, t6); \
146 /* postfilter sbox 3 */ \
147 vbroadcasti128 .Lpost_tf_lo_s2, t4; \
148 vbroadcasti128 .Lpost_tf_hi_s2, t5; \
149 filter_8bit(x2, t2, t3, t7, t6); \
150 filter_8bit(x5, t2, t3, t7, t6); \
152 vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
154 /* postfilter sbox 2 */ \
155 filter_8bit(x1, t4, t5, t7, t2); \
156 filter_8bit(x4, t4, t5, t7, t2); \
159 vpsrldq $1, t0, t1; \
160 vpsrldq $2, t0, t2; \
161 vpshufb t7, t1, t1; \
162 vpsrldq $3, t0, t3; \
170 vpshufb t7, t2, t2; \
171 vpsrldq $4, t0, t4; \
172 vpshufb t7, t3, t3; \
173 vpsrldq $5, t0, t5; \
174 vpshufb t7, t4, t4; \
181 vpsrldq $6, t0, t6; \
182 vpshufb t7, t5, t5; \
183 vpshufb t7, t6, t6; \
193 vpxor x2, x7, x7; /* note: high and low parts swapped */ \
195 /* Add key material and result to CD (x becomes new CD) */ \
198 vpxor 5 * 32(mem_cd), x1, x1; \
200 vpsrldq $7, t0, t6; \
201 vpshufb t7, t0, t0; \
202 vpshufb t7, t6, t7; \
205 vpxor 4 * 32(mem_cd), x0, x0; \
208 vpxor 6 * 32(mem_cd), x2, x2; \
211 vpxor 7 * 32(mem_cd), x3, x3; \
214 vpxor 0 * 32(mem_cd), x4, x4; \
217 vpxor 1 * 32(mem_cd), x5, x5; \
220 vpxor 2 * 32(mem_cd), x6, x6; \
223 vpxor 3 * 32(mem_cd), x7, x7;
226 * Size optimization... with inlined roundsm32 binary would be over 5 times
227 * larger and would only be marginally faster.
/*
 * Out-of-line instance of roundsm32 with x0..x7 = %ymm0..%ymm7 and
 * t0..t7 = %ymm8..%ymm15.  two_roundsm32 loads the subkey address into
 * %r9 and then calls this label.
 * NOTE(review): the trailing macro arguments (mem_cd, key) and the return
 * are not visible in this view (embedded lines 233-234 are absent).
 */
230 roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
231 roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
232 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
235 ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
/*
 * Out-of-line instance of roundsm32 with the register halves exchanged:
 * x0..x7 = %ymm4..%ymm7, %ymm0..%ymm3 and
 * t0..t7 = %ymm12..%ymm15, %ymm8..%ymm11.
 * two_roundsm32 calls this label for the second round of each pair, again
 * after loading the subkey address into %r9.
 * NOTE(review): the trailing macro arguments (mem_cd, key) and the return
 * are not visible in this view (embedded lines 241-242 are absent).
 */
238 roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
239 roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
240 %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
243 ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
247 * x0..x7: byte-sliced AB state preloaded
248 * mem_ab: byte-sliced AB state in memory
249 * mem_cd: byte-sliced CD state in memory
/*
 * two_roundsm32: two consecutive round calls.
 *  i:        index of the first 64-bit subkey inside key_table
 *  dir:      added to i to select the second subkey (the enc_rounds32 and
 *            dec_rounds32 callers pass 1 and -1)
 *  store_ab: macro invoked last with (x0..x7, mem_ab); callers pass either
 *            store_ab_state or dummy_store
 *
 * %r9 is overwritten with the subkey address before each call.  Between
 * the two calls x0..x3 are written to rows 4..7 and x4..x7 to rows 0..3
 * of mem_cd (32 bytes per row).
 */
251 #define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
252 y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
253 leaq (key_table + (i) * 8)(CTX), %r9; \
254 call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
256 vmovdqu x0, 4 * 32(mem_cd); \
257 vmovdqu x1, 5 * 32(mem_cd); \
258 vmovdqu x2, 6 * 32(mem_cd); \
259 vmovdqu x3, 7 * 32(mem_cd); \
260 vmovdqu x4, 0 * 32(mem_cd); \
261 vmovdqu x5, 1 * 32(mem_cd); \
262 vmovdqu x6, 2 * 32(mem_cd); \
263 vmovdqu x7, 3 * 32(mem_cd); \
265 leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
266 call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
268 store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
/*
 * No-op with the same parameter list as store_ab_state.  enc_rounds32 and
 * dec_rounds32 pass it as store_ab for their last two_roundsm32 call, so
 * that call does not write the AB state to mem_ab.
 */
270 #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
/*
 * store_ab_state: write x0..x7 to rows 0..7 of mem_ab (32 bytes per row,
 * unaligned stores).  Rows 4..7 are written before rows 0..3.
 */
272 #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
273 /* Store new AB state */ \
274 vmovdqu x4, 4 * 32(mem_ab); \
275 vmovdqu x5, 5 * 32(mem_ab); \
276 vmovdqu x6, 6 * 32(mem_ab); \
277 vmovdqu x7, 7 * 32(mem_ab); \
278 vmovdqu x0, 0 * 32(mem_ab); \
279 vmovdqu x1, 1 * 32(mem_ab); \
280 vmovdqu x2, 2 * 32(mem_ab); \
281 vmovdqu x3, 3 * 32(mem_ab);
/*
 * enc_rounds32: three two_roundsm32 invocations with ascending subkeys,
 * i.e. subkey pairs (i)+2/(i)+3, (i)+4/(i)+5 and (i)+6/(i)+7.
 * The first two invocations write the AB state back with store_ab_state;
 * the last one passes dummy_store and leaves it in registers only.
 */
283 #define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
284 y6, y7, mem_ab, mem_cd, i) \
285 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
286 y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
287 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
288 y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
289 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
290 y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
/*
 * dec_rounds32: mirror of enc_rounds32 with descending subkeys, i.e.
 * subkey pairs (i)+7/(i)+6, (i)+5/(i)+4 and (i)+3/(i)+2 (dir = -1).
 * The first two invocations write the AB state back with store_ab_state;
 * the last one passes dummy_store.
 */
292 #define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
293 y6, y7, mem_ab, mem_cd, i) \
294 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
295 y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
296 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
297 y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
298 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
299 y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
303 * v0..3: byte-sliced 32-bit integers
/*
 * rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero)
 *
 * Each visible vpcmpgtb sets a byte of the temporary to 0xff where the
 * corresponding byte of 'zero' is greater (signed) than the byte of vN.
 * fls32 passes a register it cleared with vpxor as 'zero', so the result
 * is a mask of the bytes of vN whose top bit is set.  t0 is written twice:
 * first for v0 and later for v3.
 *
 * NOTE(review): the embedded numbering skips 309-311, 313-315, 317-321 and
 * everything after 322, so the instructions that consume these masks are
 * not visible here; by its name the macro presumably rotates the
 * byte-sliced 32-bit values left by one bit - confirm against the full
 * file.
 */
307 #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
308 vpcmpgtb v0, zero, t0; \
312 vpcmpgtb v1, zero, t1; \
316 vpcmpgtb v2, zero, t2; \
322 vpcmpgtb v3, zero, t0; \
332 * r: byte-sliced AB state in memory
333 * l: byte-sliced CD state in memory
335 * x0..x7: new byte-sliced CD state
/*
 * fls32: key-dependent mixing of the two byte-sliced state halves.
 *  l:               memory that receives l0..l7 (rows 0..7, 32 bytes each)
 *  l0..l7:          registers stored to l
 *  r:               memory holding the other half; rows 0..7 are read and
 *                   rewritten in place
 *  t0..t3:          scratch, clobbered
 *  tt0..tt3:        scratch; tt0 is zeroed here and then serves both as the
 *                   vpshufb index and as the 'zero' argument of rol32_1_32
 *  kll,klr,krl,krr: memory operands of the four 32-bit key words
 *
 * Each key word is broadcast with vpbroadcastd and split into four
 * registers that each hold one key byte replicated into every byte
 * position: t3, t2, t1, t0 receive key bytes 0, 1, 2, 3.
 *
 * Visible operations on r:
 *   rows 0..3 = (krr bytes | rows 4..7) ^ rows 0..3
 *   rows 4..7 = rol32_1_32(krl bytes & rows 0..3) ^ rows 4..7
 *
 * NOTE(review): the embedded numbering has gaps (for example 352 to 359,
 * 369 to 377, 442 to 450), so the instructions that combine the kll and
 * klr bytes with l0..l7 are not visible in this view.
 */
337 #define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
338 tt1, tt2, tt3, kll, klr, krl, krr) \
342 * lr ^= rol32(t0, 1); \
344 vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
345 vpxor tt0, tt0, tt0; \
/* tt0 is zero, so each vpshufb below replicates byte 0 of its source */ \
346 vpshufb tt0, t0, t3; \
347 vpsrldq $1, t0, t0; \
348 vpshufb tt0, t0, t2; \
349 vpsrldq $1, t0, t0; \
350 vpshufb tt0, t0, t1; \
351 vpsrldq $1, t0, t0; \
352 vpshufb tt0, t0, t0; \
359 rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
362 vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
363 vmovdqu l4, 4 * 32(l); \
365 vmovdqu l5, 5 * 32(l); \
367 vmovdqu l6, 6 * 32(l); \
369 vmovdqu l7, 7 * 32(l); \
377 vpshufb tt0, t0, t3; \
378 vpsrldq $1, t0, t0; \
379 vpshufb tt0, t0, t2; \
380 vpsrldq $1, t0, t0; \
381 vpshufb tt0, t0, t1; \
382 vpsrldq $1, t0, t0; \
383 vpshufb tt0, t0, t0; \
/* rows 0..3 of r = (krr bytes | rows 4..7) ^ rows 0..3 */ \
385 vpor 4 * 32(r), t0, t0; \
386 vpor 5 * 32(r), t1, t1; \
387 vpor 6 * 32(r), t2, t2; \
388 vpor 7 * 32(r), t3, t3; \
390 vpxor 0 * 32(r), t0, t0; \
391 vpxor 1 * 32(r), t1, t1; \
392 vpxor 2 * 32(r), t2, t2; \
393 vpxor 3 * 32(r), t3, t3; \
394 vmovdqu t0, 0 * 32(r); \
395 vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
396 vmovdqu t1, 1 * 32(r); \
397 vmovdqu t2, 2 * 32(r); \
398 vmovdqu t3, 3 * 32(r); \
403 * rr ^= rol32(t2, 1); \
405 vpshufb tt0, t0, t3; \
406 vpsrldq $1, t0, t0; \
407 vpshufb tt0, t0, t2; \
408 vpsrldq $1, t0, t0; \
409 vpshufb tt0, t0, t1; \
410 vpsrldq $1, t0, t0; \
411 vpshufb tt0, t0, t0; \
/* rows 4..7 of r ^= rol32_1_32(krl bytes & rows 0..3) */ \
413 vpand 0 * 32(r), t0, t0; \
414 vpand 1 * 32(r), t1, t1; \
415 vpand 2 * 32(r), t2, t2; \
416 vpand 3 * 32(r), t3, t3; \
418 rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
420 vpxor 4 * 32(r), t0, t0; \
421 vpxor 5 * 32(r), t1, t1; \
422 vpxor 6 * 32(r), t2, t2; \
423 vpxor 7 * 32(r), t3, t3; \
424 vmovdqu t0, 4 * 32(r); \
425 vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
426 vmovdqu t1, 5 * 32(r); \
427 vmovdqu t2, 6 * 32(r); \
428 vmovdqu t3, 7 * 32(r); \
436 vpshufb tt0, t0, t3; \
437 vpsrldq $1, t0, t0; \
438 vpshufb tt0, t0, t2; \
439 vpsrldq $1, t0, t0; \
440 vpshufb tt0, t0, t1; \
441 vpsrldq $1, t0, t0; \
442 vpshufb tt0, t0, t0; \
450 vmovdqu l0, 0 * 32(l); \
452 vmovdqu l1, 1 * 32(l); \
454 vmovdqu l2, 2 * 32(l); \
456 vmovdqu l3, 3 * 32(l);
/*
 * transpose_4x4: transpose the 4x4 matrix of 32-bit elements formed by
 * x0..x3 (one row per register), independently in each 128-bit lane.
 * The dword unpacks interleave row pairs, then the qword unpacks gather
 * the columns.  t1 and t2 are scratch and are clobbered.
 */
458 #define transpose_4x4(x0, x1, x2, x3, t1, t2) \
459 vpunpckhdq x1, x0, t2; \
460 vpunpckldq x1, x0, x0; \
462 vpunpckldq x3, x2, t1; \
463 vpunpckhdq x3, x2, x2; \
465 vpunpckhqdq t1, x0, x1; \
466 vpunpcklqdq t1, x0, x0; \
468 vpunpckhqdq x2, t2, x3; \
469 vpunpcklqdq x2, t2, x2;
/*
 * byteslice_16x16b_fast: rearrange sixteen registers (a0..d3) with two
 * rounds of transpose_4x4 separated by a vpshufb through the
 * .Lshufb_16x16b mask.
 *  st0, st1: memory scratch operands; not referenced by the lines visible
 *            here
 *
 * a0 is reloaded with the shuffle mask, and other registers of the set
 * are passed as transpose_4x4 temporaries (d2/d3, a0/a1, b0/b1).
 *
 * NOTE(review): the embedded numbering has gaps (473-474, 476-481,
 * 502-503, 505-506, 509-513, 516-517); presumably those lines spill and
 * reload the borrowed registers through st0/st1 - confirm against the
 * full file.
 */
471 #define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
472 a3, b3, c3, d3, st0, st1) \
475 transpose_4x4(a0, a1, a2, a3, d2, d3); \
476 transpose_4x4(b0, b1, b2, b3, d2, d3); \
482 transpose_4x4(c0, c1, c2, c3, a0, a1); \
483 transpose_4x4(d0, d1, d2, d3, a0, a1); \
485 vbroadcasti128 .Lshufb_16x16b, a0; \
487 vpshufb a0, a2, a2; \
488 vpshufb a0, a3, a3; \
489 vpshufb a0, b0, b0; \
490 vpshufb a0, b1, b1; \
491 vpshufb a0, b2, b2; \
492 vpshufb a0, b3, b3; \
493 vpshufb a0, a1, a1; \
494 vpshufb a0, c0, c0; \
495 vpshufb a0, c1, c1; \
496 vpshufb a0, c2, c2; \
497 vpshufb a0, c3, c3; \
498 vpshufb a0, d0, d0; \
499 vpshufb a0, d1, d1; \
500 vpshufb a0, d2, d2; \
501 vpshufb a0, d3, d3; \
504 vpshufb a0, d3, a0; \
507 transpose_4x4(a0, b0, c0, d0, d2, d3); \
508 transpose_4x4(a1, b1, c1, d1, d2, d3); \
514 transpose_4x4(a2, b2, c2, d2, b0, b1); \
515 transpose_4x4(a3, b3, c3, d3, b0, b1); \
518 /* does not adjust output bytes inside vectors */
520 /* load blocks to registers and apply pre-whitening */
/*
 * inpack32_pre: the 64-bit key is broadcast into x0, shuffled through
 * .Lpack_bswap and XORed with sixteen 32-byte rows read from rio.
 * Rows 0..7 land in y7..y0 and rows 8..15 in x7..x0, i.e. in reverse
 * register order.  x0 holds the key until the final XOR, so it is
 * written last.
 */
521 #define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
523 vpbroadcastq key, x0; \
524 vpshufb .Lpack_bswap, x0, x0; \
526 vpxor 0 * 32(rio), x0, y7; \
527 vpxor 1 * 32(rio), x0, y6; \
528 vpxor 2 * 32(rio), x0, y5; \
529 vpxor 3 * 32(rio), x0, y4; \
530 vpxor 4 * 32(rio), x0, y3; \
531 vpxor 5 * 32(rio), x0, y2; \
532 vpxor 6 * 32(rio), x0, y1; \
533 vpxor 7 * 32(rio), x0, y0; \
534 vpxor 8 * 32(rio), x0, x7; \
535 vpxor 9 * 32(rio), x0, x6; \
536 vpxor 10 * 32(rio), x0, x5; \
537 vpxor 11 * 32(rio), x0, x4; \
538 vpxor 12 * 32(rio), x0, x3; \
539 vpxor 13 * 32(rio), x0, x2; \
540 vpxor 14 * 32(rio), x0, x1; \
541 vpxor 15 * 32(rio), x0, x0;
543 /* byteslice pre-whitened blocks and store to temporary memory */
/*
 * inpack32_post: run byteslice_16x16b_fast over x0..x7, y0..y7 (mem_ab
 * and mem_cd are handed to it as its st0/st1 scratch operands), then
 * store x0..x7 to rows 0..7 of mem_ab and y0..y7 to rows 0..7 of mem_cd.
 * The registers still hold the stored values afterwards.
 */
544 #define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
545 y6, y7, mem_ab, mem_cd) \
546 byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
547 y4, y5, y6, y7, (mem_ab), (mem_cd)); \
549 vmovdqu x0, 0 * 32(mem_ab); \
550 vmovdqu x1, 1 * 32(mem_ab); \
551 vmovdqu x2, 2 * 32(mem_ab); \
552 vmovdqu x3, 3 * 32(mem_ab); \
553 vmovdqu x4, 4 * 32(mem_ab); \
554 vmovdqu x5, 5 * 32(mem_ab); \
555 vmovdqu x6, 6 * 32(mem_ab); \
556 vmovdqu x7, 7 * 32(mem_ab); \
557 vmovdqu y0, 0 * 32(mem_cd); \
558 vmovdqu y1, 1 * 32(mem_cd); \
559 vmovdqu y2, 2 * 32(mem_cd); \
560 vmovdqu y3, 3 * 32(mem_cd); \
561 vmovdqu y4, 4 * 32(mem_cd); \
562 vmovdqu y5, 5 * 32(mem_cd); \
563 vmovdqu y6, 6 * 32(mem_cd); \
564 vmovdqu y7, 7 * 32(mem_cd);
566 /* de-byteslice, apply post-whitening and store blocks */
/*
 * outunpack32: byteslice_16x16b_fast is applied with the registers
 * interleaved (y0, y4, x0, x4, ...).  x0 is then parked in stack_tmp0 so
 * that x0 can carry the broadcast key (shuffled through .Lpack_bswap);
 * the last visible step XORs the parked value back into x0.
 *
 * NOTE(review): embedded lines 576-591 are absent from this view;
 * presumably they XOR the key into the remaining fifteen registers -
 * confirm against the full file.
 */
567 #define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
568 y5, y6, y7, key, stack_tmp0, stack_tmp1) \
569 byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
570 y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
572 vmovdqu x0, stack_tmp0; \
574 vpbroadcastq key, x0; \
575 vpshufb .Lpack_bswap, x0, x0; \
592 vpxor stack_tmp0, x0, x0;
/*
 * write_output: store x0..x7 to rows 0..7 and y0..y7 to rows 8..15 of rio
 * (32 bytes per row, unaligned stores).  Callers pass the registers in
 * the order %ymm7..%ymm0, %ymm15.. to undo the swapped block order.
 */
594 #define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
596 vmovdqu x0, 0 * 32(rio); \
597 vmovdqu x1, 1 * 32(rio); \
598 vmovdqu x2, 2 * 32(rio); \
599 vmovdqu x3, 3 * 32(rio); \
600 vmovdqu x4, 4 * 32(rio); \
601 vmovdqu x5, 5 * 32(rio); \
602 vmovdqu x6, 6 * 32(rio); \
603 vmovdqu x7, 7 * 32(rio); \
604 vmovdqu y0, 8 * 32(rio); \
605 vmovdqu y1, 9 * 32(rio); \
606 vmovdqu y2, 10 * 32(rio); \
607 vmovdqu y3, 11 * 32(rio); \
608 vmovdqu y4, 12 * 32(rio); \
609 vmovdqu y5, 13 * 32(rio); \
610 vmovdqu y6, 14 * 32(rio); \
611 vmovdqu y7, 15 * 32(rio);
/*
 * SHUFB_BYTES(idx): expands to the four byte indices idx, 4 + idx,
 * 8 + idx and 12 + idx, for use in a .byte list.
 */
616 #define SHUFB_BYTES(idx) \
617 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
620 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
621 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
624 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
625 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
627 /* For CTR-mode IV byteswap */
629 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
632 .Lxts_gf128mul_and_shl1_mask_0:
633 .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
634 .Lxts_gf128mul_and_shl1_mask_1:
635 .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
638 * pre-SubByte transform
640 * pre-lookup for sbox1, sbox2, sbox3:
641 * swap_bitendianness(
642 * isom_map_camellia_to_aes(
644 * swap_bitendianness(in)
649 * (note: '⊕ 0xc5' inside camellia_f())
652 .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
653 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
655 .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
656 .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
659 * pre-SubByte transform
661 * pre-lookup for sbox4:
662 * swap_bitendianness(
663 * isom_map_camellia_to_aes(
665 * swap_bitendianness(in <<< 1)
670 * (note: '⊕ 0xc5' inside camellia_f())
673 .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
674 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
676 .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
677 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
680 * post-SubByte transform
682 * post-lookup for sbox1, sbox4:
683 * swap_bitendianness(
685 * isom_map_aes_to_camellia(
686 * swap_bitendianness(
687 * aes_inverse_affine_transform(in)
693 * (note: '⊕ 0x6e' inside camellia_h())
696 .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
697 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
699 .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
700 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
703 * post-SubByte transform
705 * post-lookup for sbox2:
706 * swap_bitendianness(
708 * isom_map_aes_to_camellia(
709 * swap_bitendianness(
710 * aes_inverse_affine_transform(in)
716 * (note: '⊕ 0x6e' inside camellia_h())
719 .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
720 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
722 .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
723 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
726 * post-SubByte transform
728 * post-lookup for sbox3:
729 * swap_bitendianness(
731 * isom_map_aes_to_camellia(
732 * swap_bitendianness(
733 * aes_inverse_affine_transform(in)
739 * (note: '⊕ 0x6e' inside camellia_h())
742 .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
743 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
745 .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
746 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
748 /* For isolating SubBytes from AESENCLAST, inverse shift row */
750 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
751 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
761 __camellia_enc_blk32:
764 * %rax: temporary storage, 512 bytes
765 * %ymm0..%ymm15: 32 plaintext blocks
767 * %ymm0..%ymm15: 32 encrypted blocks, order swapped:
768 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
/*
 * Scratch layout: rows 0..7 at %rax hold the AB state, rows 8..15
 * (%rcx = %rax + 8 * 32) hold the CD state.
 * NOTE(review): several lines of this routine are absent from this view
 * (labels, branches, the return and the tails of some macro argument
 * lists); the comments below describe only what is visible.
 */
771 leaq 8 * 32(%rax), %rcx;
773 inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
774 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
/* rounds with subkeys key_table[2..7] */
777 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
778 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
779 %ymm15, %rax, %rcx, 0);
/* fls32 with the four 32-bit words starting at key_table + 8 * 8 */
781 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
782 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
784 ((key_table + (8) * 8) + 0)(CTX),
785 ((key_table + (8) * 8) + 4)(CTX),
786 ((key_table + (8) * 8) + 8)(CTX),
787 ((key_table + (8) * 8) + 12)(CTX));
/* rounds with subkeys key_table[10..15] */
789 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
790 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
791 %ymm15, %rax, %rcx, 8);
/* fls32 with the four 32-bit words starting at key_table + 16 * 8 */
793 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
794 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
796 ((key_table + (16) * 8) + 0)(CTX),
797 ((key_table + (16) * 8) + 4)(CTX),
798 ((key_table + (16) * 8) + 8)(CTX),
799 ((key_table + (16) * 8) + 12)(CTX));
/* rounds with subkeys key_table[18..23] */
801 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
802 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
803 %ymm15, %rax, %rcx, 16);
/*
 * key_length(CTX) is compared with 16; the branch that consumes the flags
 * and the initialisation of %r8 (used as the whitening-key index below)
 * are not visible in this view.
 */
806 cmpl $16, key_length(CTX);
810 /* load CD for output */
811 vmovdqu 0 * 32(%rcx), %ymm8;
812 vmovdqu 1 * 32(%rcx), %ymm9;
813 vmovdqu 2 * 32(%rcx), %ymm10;
814 vmovdqu 3 * 32(%rcx), %ymm11;
815 vmovdqu 4 * 32(%rcx), %ymm12;
816 vmovdqu 5 * 32(%rcx), %ymm13;
817 vmovdqu 6 * 32(%rcx), %ymm14;
818 vmovdqu 7 * 32(%rcx), %ymm15;
820 outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
821 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
822 %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
/*
 * Additional fls32 (key_table + 24 * 8) and rounds with subkeys
 * key_table[26..31].  NOTE(review): presumably reached only when the key
 * length is not 16; the label and jumps are not visible here - confirm.
 */
830 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
831 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
833 ((key_table + (24) * 8) + 0)(CTX),
834 ((key_table + (24) * 8) + 4)(CTX),
835 ((key_table + (24) * 8) + 8)(CTX),
836 ((key_table + (24) * 8) + 12)(CTX));
838 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
839 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
840 %ymm15, %rax, %rcx, 24);
843 ENDPROC(__camellia_enc_blk32)
846 __camellia_dec_blk32:
849 * %rax: temporary storage, 512 bytes
850 * %r8d: 24 for 16 byte key, 32 for larger
851 * %ymm0..%ymm15: 32 encrypted blocks
853 * %ymm0..%ymm15: 32 plaintext blocks, order swapped:
854 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
/*
 * Scratch layout: rows 0..7 at %rax hold the AB state, rows 8..15
 * (%rcx = %rax + 8 * 32) hold the CD state.
 * NOTE(review): several lines of this routine are absent from this view
 * (labels, branches, the return and the tails of some macro argument
 * lists); the comments below describe only what is visible.
 */
857 leaq 8 * 32(%rax), %rcx;
859 inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
860 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
/* rounds with subkeys key_table[23..18], descending */
867 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
868 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
869 %ymm15, %rax, %rcx, 16);
/*
 * fls32 at key_table + 16 * 8; compared with the encryption routine the
 * two 8-byte halves are passed in swapped order (+8, +12, +0, +4).
 */
871 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
872 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
874 ((key_table + (16) * 8) + 8)(CTX),
875 ((key_table + (16) * 8) + 12)(CTX),
876 ((key_table + (16) * 8) + 0)(CTX),
877 ((key_table + (16) * 8) + 4)(CTX));
/* rounds with subkeys key_table[15..10], descending */
879 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
880 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
881 %ymm15, %rax, %rcx, 8);
/* fls32 at key_table + 8 * 8, halves swapped as above */
883 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
884 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
886 ((key_table + (8) * 8) + 8)(CTX),
887 ((key_table + (8) * 8) + 12)(CTX),
888 ((key_table + (8) * 8) + 0)(CTX),
889 ((key_table + (8) * 8) + 4)(CTX));
/* rounds with subkeys key_table[7..2], descending */
891 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
892 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
893 %ymm15, %rax, %rcx, 0);
895 /* load CD for output */
896 vmovdqu 0 * 32(%rcx), %ymm8;
897 vmovdqu 1 * 32(%rcx), %ymm9;
898 vmovdqu 2 * 32(%rcx), %ymm10;
899 vmovdqu 3 * 32(%rcx), %ymm11;
900 vmovdqu 4 * 32(%rcx), %ymm12;
901 vmovdqu 5 * 32(%rcx), %ymm13;
902 vmovdqu 6 * 32(%rcx), %ymm14;
903 vmovdqu 7 * 32(%rcx), %ymm15;
/* output whitening uses the first key_table entry */
905 outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
906 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
907 %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
/*
 * Additional rounds with subkeys key_table[31..26] followed by fls32 at
 * key_table + 24 * 8.  NOTE(review): presumably the entry path for the
 * larger key sizes; the label and jumps are not visible here - confirm.
 */
913 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
914 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
915 %ymm15, %rax, %rcx, 24);
917 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
918 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
920 ((key_table + (24) * 8) + 8)(CTX),
921 ((key_table + (24) * 8) + 12)(CTX),
922 ((key_table + (24) * 8) + 0)(CTX),
923 ((key_table + (24) * 8) + 4)(CTX));
926 ENDPROC(__camellia_dec_blk32)
/*
 * camellia_ecb_enc_32way: ECB encryption of 32 blocks.
 * The blocks at %rdx are loaded and pre-whitened with the first key_table
 * entry, passed through __camellia_enc_blk32 and written out with
 * write_output in the register order %ymm7..%ymm0, %ymm15.. .
 * NOTE(review): frame setup, the scratch-pointer setup for %rax and the
 * return are not visible in this view.
 */
928 ENTRY(camellia_ecb_enc_32way)
931 * %rsi: dst (32 blocks)
932 * %rdx: src (32 blocks)
937 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
938 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
939 %ymm15, %rdx, (key_table)(CTX));
941 /* now dst can be used as temporary buffer (even in src == dst case) */
944 call __camellia_enc_blk32;
946 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
947 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
953 ENDPROC(camellia_ecb_enc_32way)
/*
 * camellia_ecb_dec_32way: ECB decryption of 32 blocks.
 * %r8 selects the key_table entry used for input whitening; it is chosen
 * from the comparison of key_length(CTX) with 16 (cmovel).  The blocks at
 * %rdx are then pre-whitened, passed through __camellia_dec_blk32 and
 * written out with write_output.
 * NOTE(review): the instructions that load the two candidate values into
 * %r8d / %eax, the scratch-pointer setup and the return are not visible
 * in this view.
 */
955 ENTRY(camellia_ecb_dec_32way)
958 * %rsi: dst (32 blocks)
959 * %rdx: src (32 blocks)
964 cmpl $16, key_length(CTX);
967 cmovel %eax, %r8d; /* max */
969 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
970 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
971 %ymm15, %rdx, (key_table)(CTX, %r8, 8));
973 /* now dst can be used as temporary buffer (even in src == dst case) */
976 call __camellia_dec_blk32;
978 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
979 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
985 ENDPROC(camellia_ecb_dec_32way)
/*
 * camellia_cbc_dec_32way: CBC decryption of 32 blocks.
 * After __camellia_dec_blk32 every decrypted block except the first is
 * XORed with the source block that precedes it (source read at a 16-byte
 * offset).  The first block is left un-XORed by the visible code.
 * NOTE(review): presumably the caller applies the IV to the first block;
 * the comparison feeding 'je', the %rax setup, the stack release and the
 * return are not visible in this view - confirm against the full file.
 */
987 ENTRY(camellia_cbc_dec_32way)
990 * %rsi: dst (32 blocks)
991 * %rdx: src (32 blocks)
996 cmpl $16, key_length(CTX);
999 cmovel %eax, %r8d; /* max */
1001 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
1002 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1003 %ymm15, %rdx, (key_table)(CTX, %r8, 8));
1007 je .Lcbc_dec_use_stack;
1009 /* dst can be used as temporary storage, src is not overwritten. */
1011 jmp .Lcbc_dec_continue;
1013 .Lcbc_dec_use_stack:
1015 * dst still in-use (because dst == src), so use stack for temporary
1018 subq $(16 * 32), %rsp;
1022 call __camellia_dec_blk32;
/*
 * Park %ymm7, rebuild it as { 0, src block 0 } (low lane zero, high lane
 * from memory) and XOR the parked value back in: the first block stays
 * as decrypted, the second is XORed with source block 0.
 */
1024 vmovdqu %ymm7, (%rax);
1025 vpxor %ymm7, %ymm7, %ymm7;
1026 vinserti128 $1, (%rdx), %ymm7, %ymm7;
1027 vpxor (%rax), %ymm7, %ymm7;
/* remaining rows: XOR with the source shifted by one 16-byte block */
1029 vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
1030 vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
1031 vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
1032 vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
1033 vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
1034 vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
1035 vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
1036 vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
1037 vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
1038 vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
1039 vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
1040 vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
1041 vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
1042 vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
1043 vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
1044 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1045 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1051 ENDPROC(camellia_cbc_dec_32way)
/*
 * inc_le128(x, minus_one, tmp)
 *
 * The CTR caller passes a minus_one register whose low qword is all ones
 * and whose high qword is zero, so 'vpsubq minus_one, x, x' adds 1 to the
 * low qword of x and leaves the high qword alone.
 * tmp records which qwords of the old x were equal to minus_one (the low
 * qword is then about to wrap); vpslldq $8 moves that flag from the low
 * qword into the high qword position.
 *
 * NOTE(review): the visible text ends with a line continuation (embedded
 * line 1057 is absent); presumably the last instruction applies tmp as
 * the carry into the high qword - confirm against the full file.
 */
1053 #define inc_le128(x, minus_one, tmp) \
1054 vpcmpeqq minus_one, x, tmp; \
1055 vpsubq minus_one, x, x; \
1056 vpslldq $8, tmp, tmp; \
/*
 * add2_le128(x, minus_one, minus_two, tmp1, tmp2)
 *
 * Same scheme as inc_le128 but with a step of 2: 'vpsubq minus_two, x, x'
 * adds 2 to the low qword when minus_two holds -2 there and 0 in the high
 * qword (as set up by the CTR caller).  The low qword wraps when its old
 * value equals minus_one or minus_two, so both compare results are ORed
 * before being shifted into the high qword position.
 *
 * NOTE(review): the visible text ends with a line continuation (embedded
 * line 1065 is absent); presumably the last instruction applies tmp1 as
 * the carry into the high qword - confirm against the full file.
 */
1059 #define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
1060 vpcmpeqq minus_one, x, tmp1; \
1061 vpcmpeqq minus_two, x, tmp2; \
1062 vpsubq minus_two, x, x; \
1063 vpor tmp2, tmp1, tmp1; \
1064 vpslldq $8, tmp1, tmp1; \
/*
 * camellia_ctr_32way: CTR mode over 32 blocks.
 * Sixteen registers of byte-swapped counter pairs are built from the
 * little-endian IV at (%rcx), the advanced counter is written back to
 * (%rcx), the counters are pre-whitened and passed through
 * __camellia_enc_blk32, and the result is XORed with the source and
 * stored with write_output.
 * NOTE(review): frame setup, the choice between dst and stack as scratch
 * (%rax), the stack release and the return are not visible in this view.
 */
1067 ENTRY(camellia_ctr_32way)
1070 * %rsi: dst (32 blocks)
1071 * %rdx: src (32 blocks)
1072 * %rcx: iv (little endian, 128bit)
1081 /* dst can be used as temporary storage, src is not overwritten. */
1086 subq $(16 * 32), %rsp;
/* %ymm15: low qword -1, high qword 0 per lane; %ymm12: the same with -2 */
1090 vpcmpeqd %ymm15, %ymm15, %ymm15;
1091 vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
1092 vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
1094 /* load IV and byteswap */
/*
 * %xmm1 keeps the IV, %xmm0 becomes the incremented IV, and %ymm0 is
 * assembled as { IV, incremented IV }.  The byte-swapped pair goes to
 * scratch row 15.
 */
1095 vmovdqu (%rcx), %xmm0;
1096 vmovdqa %xmm0, %xmm1;
1097 inc_le128(%xmm0, %xmm15, %xmm14);
1098 vbroadcasti128 .Lbswap128_mask, %ymm14;
1099 vinserti128 $1, %xmm0, %ymm1, %ymm0;
1100 vpshufb %ymm14, %ymm0, %ymm13;
1101 vmovdqu %ymm13, 15 * 32(%rax);
/*
 * Each add2_le128 advances both lanes of %ymm0 by two.  The next four
 * byte-swapped pairs are spilled to scratch rows 14..11; the following
 * ones stay in %ymm10..%ymm1 and the last one replaces %ymm0.
 */
1104 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
1105 vpshufb %ymm14, %ymm0, %ymm13;
1106 vmovdqu %ymm13, 14 * 32(%rax);
1107 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1108 vpshufb %ymm14, %ymm0, %ymm13;
1109 vmovdqu %ymm13, 13 * 32(%rax);
1110 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1111 vpshufb %ymm14, %ymm0, %ymm13;
1112 vmovdqu %ymm13, 12 * 32(%rax);
1113 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1114 vpshufb %ymm14, %ymm0, %ymm13;
1115 vmovdqu %ymm13, 11 * 32(%rax);
1116 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1117 vpshufb %ymm14, %ymm0, %ymm10;
1118 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1119 vpshufb %ymm14, %ymm0, %ymm9;
1120 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1121 vpshufb %ymm14, %ymm0, %ymm8;
1122 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1123 vpshufb %ymm14, %ymm0, %ymm7;
1124 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1125 vpshufb %ymm14, %ymm0, %ymm6;
1126 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1127 vpshufb %ymm14, %ymm0, %ymm5;
1128 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1129 vpshufb %ymm14, %ymm0, %ymm4;
1130 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1131 vpshufb %ymm14, %ymm0, %ymm3;
1132 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1133 vpshufb %ymm14, %ymm0, %ymm2;
1134 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1135 vpshufb %ymm14, %ymm0, %ymm1;
1136 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
/*
 * The high-lane counter (still little endian) is extracted before %ymm0
 * is byte-swapped, incremented once more and stored back as the next IV.
 */
1137 vextracti128 $1, %ymm0, %xmm13;
1138 vpshufb %ymm14, %ymm0, %ymm0;
1139 inc_le128(%xmm13, %xmm15, %xmm14);
1140 vmovdqu %xmm13, (%rcx);
/* pre-whitening with the first key_table entry; rows 11..15 come from scratch */
1143 vpbroadcastq (key_table)(CTX), %ymm15;
1144 vpshufb .Lpack_bswap, %ymm15, %ymm15;
1145 vpxor %ymm0, %ymm15, %ymm0;
1146 vpxor %ymm1, %ymm15, %ymm1;
1147 vpxor %ymm2, %ymm15, %ymm2;
1148 vpxor %ymm3, %ymm15, %ymm3;
1149 vpxor %ymm4, %ymm15, %ymm4;
1150 vpxor %ymm5, %ymm15, %ymm5;
1151 vpxor %ymm6, %ymm15, %ymm6;
1152 vpxor %ymm7, %ymm15, %ymm7;
1153 vpxor %ymm8, %ymm15, %ymm8;
1154 vpxor %ymm9, %ymm15, %ymm9;
1155 vpxor %ymm10, %ymm15, %ymm10;
1156 vpxor 11 * 32(%rax), %ymm15, %ymm11;
1157 vpxor 12 * 32(%rax), %ymm15, %ymm12;
1158 vpxor 13 * 32(%rax), %ymm15, %ymm13;
1159 vpxor 14 * 32(%rax), %ymm15, %ymm14;
1160 vpxor 15 * 32(%rax), %ymm15, %ymm15;
1162 call __camellia_enc_blk32;
/* XOR the encrypted counters with the source rows (swapped register order) */
1166 vpxor 0 * 32(%rdx), %ymm7, %ymm7;
1167 vpxor 1 * 32(%rdx), %ymm6, %ymm6;
1168 vpxor 2 * 32(%rdx), %ymm5, %ymm5;
1169 vpxor 3 * 32(%rdx), %ymm4, %ymm4;
1170 vpxor 4 * 32(%rdx), %ymm3, %ymm3;
1171 vpxor 5 * 32(%rdx), %ymm2, %ymm2;
1172 vpxor 6 * 32(%rdx), %ymm1, %ymm1;
1173 vpxor 7 * 32(%rdx), %ymm0, %ymm0;
1174 vpxor 8 * 32(%rdx), %ymm15, %ymm15;
1175 vpxor 9 * 32(%rdx), %ymm14, %ymm14;
1176 vpxor 10 * 32(%rdx), %ymm13, %ymm13;
1177 vpxor 11 * 32(%rdx), %ymm12, %ymm12;
1178 vpxor 12 * 32(%rdx), %ymm11, %ymm11;
1179 vpxor 13 * 32(%rdx), %ymm10, %ymm10;
1180 vpxor 14 * 32(%rdx), %ymm9, %ymm9;
1181 vpxor 15 * 32(%rdx), %ymm8, %ymm8;
1182 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1183 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1189 ENDPROC(camellia_ctr_32way)
/*
 * gf128mul_x_ble(iv, mask, tmp)
 *
 * Visible steps:
 *  - vpsrad $31 turns every dword of iv into a mask of its sign bit
 *  - vpaddq doubles each 64-bit half of iv (a one-bit left shift that
 *    drops the bit carried out of each half)
 *  - vpshufd $0x13 moves the sign mask of dword 3 to dword 0 and that of
 *    dword 1 to dword 2
 *  - vpand keeps only the bits of 'mask' where the matching sign was set
 *
 * NOTE(review): the visible text ends with a line continuation (embedded
 * line 1196 is absent); presumably the last instruction XORs tmp into iv
 * to restore the carried-out bits - confirm against the full file.
 */
1191 #define gf128mul_x_ble(iv, mask, tmp) \
1192 vpsrad $31, iv, tmp; \
1193 vpaddq iv, iv, iv; \
1194 vpshufd $0x13, tmp, tmp; \
1195 vpand mask, tmp, tmp; \
/*
 * gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1)
 *
 * Two-bit variant of gf128mul_x_ble: iv is shifted left by two bits per
 * 64-bit half (vpsllq $2).  tmp0 carries the sign masks of the original
 * iv, filtered through mask2 and XORed into iv; tmp1 carries the sign
 * masks of iv shifted once (vpaddq), filtered through mask1.
 *
 * NOTE(review): the visible text ends with a line continuation (embedded
 * line 1208 is absent); presumably the last instruction XORs tmp1 into
 * iv - confirm against the full file.
 */
1198 #define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
1199 vpsrad $31, iv, tmp0; \
1200 vpaddq iv, iv, tmp1; \
1201 vpsllq $2, iv, iv; \
1202 vpshufd $0x13, tmp0, tmp0; \
1203 vpsrad $31, tmp1, tmp1; \
1204 vpand mask2, tmp0, tmp0; \
1205 vpshufd $0x13, tmp1, tmp1; \
1206 vpxor tmp0, iv, iv; \
1207 vpand mask1, tmp1, tmp1; \
/*
 * camellia_xts_crypt_32way: shared XTS body for the enc/dec entry points.
 * Sixteen tweak pairs are derived from the IV at (%rcx).  Each pair is
 * XORed with the matching source row and also saved in the matching dst
 * row.  The advanced tweak is written back to (%rcx).  After whitening
 * and the block routine, the result is XORed with the saved tweaks and
 * stored with write_output.
 * NOTE(review): frame setup, the %rax scratch-pointer setup, the call
 * through %r9 and the return are not visible in this view.
 */
1211 camellia_xts_crypt_32way:
1214 * %rsi: dst (32 blocks)
1215 * %rdx: src (32 blocks)
1216 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1217 * %r8: index for input whitening key
1218 * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
1223 subq $(16 * 32), %rsp;
1226 vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;
1228 /* load IV and construct second IV */
1229 vmovdqu (%rcx), %xmm0;
1230 vmovdqa %xmm0, %xmm15;
1231 gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
1232 vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
1233 vinserti128 $1, %xmm0, %ymm15, %ymm0;
/* source row 0 ^ tweaks goes to scratch row 15; the tweaks go to dst row 0 */
1234 vpxor 0 * 32(%rdx), %ymm0, %ymm15;
1235 vmovdqu %ymm15, 15 * 32(%rax);
1236 vmovdqu %ymm0, 0 * 32(%rsi);
/*
 * Each gf128mul_x2_ble advances both lanes of %ymm0.  Source rows 1..3
 * are spilled to scratch rows 14..12, rows 4..14 stay in %ymm11..%ymm1,
 * and row 15 is spilled to scratch row 0.
 */
1239 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1240 vpxor 1 * 32(%rdx), %ymm0, %ymm15;
1241 vmovdqu %ymm15, 14 * 32(%rax);
1242 vmovdqu %ymm0, 1 * 32(%rsi);
1244 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1245 vpxor 2 * 32(%rdx), %ymm0, %ymm15;
1246 vmovdqu %ymm15, 13 * 32(%rax);
1247 vmovdqu %ymm0, 2 * 32(%rsi);
1249 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1250 vpxor 3 * 32(%rdx), %ymm0, %ymm15;
1251 vmovdqu %ymm15, 12 * 32(%rax);
1252 vmovdqu %ymm0, 3 * 32(%rsi);
1254 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1255 vpxor 4 * 32(%rdx), %ymm0, %ymm11;
1256 vmovdqu %ymm0, 4 * 32(%rsi);
1258 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1259 vpxor 5 * 32(%rdx), %ymm0, %ymm10;
1260 vmovdqu %ymm0, 5 * 32(%rsi);
1262 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1263 vpxor 6 * 32(%rdx), %ymm0, %ymm9;
1264 vmovdqu %ymm0, 6 * 32(%rsi);
1266 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1267 vpxor 7 * 32(%rdx), %ymm0, %ymm8;
1268 vmovdqu %ymm0, 7 * 32(%rsi);
1270 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1271 vpxor 8 * 32(%rdx), %ymm0, %ymm7;
1272 vmovdqu %ymm0, 8 * 32(%rsi);
1274 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1275 vpxor 9 * 32(%rdx), %ymm0, %ymm6;
1276 vmovdqu %ymm0, 9 * 32(%rsi);
1278 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1279 vpxor 10 * 32(%rdx), %ymm0, %ymm5;
1280 vmovdqu %ymm0, 10 * 32(%rsi);
1282 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1283 vpxor 11 * 32(%rdx), %ymm0, %ymm4;
1284 vmovdqu %ymm0, 11 * 32(%rsi);
1286 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1287 vpxor 12 * 32(%rdx), %ymm0, %ymm3;
1288 vmovdqu %ymm0, 12 * 32(%rsi);
1290 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1291 vpxor 13 * 32(%rdx), %ymm0, %ymm2;
1292 vmovdqu %ymm0, 13 * 32(%rsi);
1294 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1295 vpxor 14 * 32(%rdx), %ymm0, %ymm1;
1296 vmovdqu %ymm0, 14 * 32(%rsi);
1298 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1299 vpxor 15 * 32(%rdx), %ymm0, %ymm15;
1300 vmovdqu %ymm15, 0 * 32(%rax);
1301 vmovdqu %ymm0, 15 * 32(%rsi);
/* the high-lane tweak is advanced once more and stored back as the next IV */
1303 vextracti128 $1, %ymm0, %xmm0;
1304 gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
1305 vmovdqu %xmm0, (%rcx);
/* pre-whitening with the key_table entry selected by %r8 */
1308 vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
1309 vpshufb .Lpack_bswap, %ymm15, %ymm15;
1310 vpxor 0 * 32(%rax), %ymm15, %ymm0;
1311 vpxor %ymm1, %ymm15, %ymm1;
1312 vpxor %ymm2, %ymm15, %ymm2;
1313 vpxor %ymm3, %ymm15, %ymm3;
1314 vpxor %ymm4, %ymm15, %ymm4;
1315 vpxor %ymm5, %ymm15, %ymm5;
1316 vpxor %ymm6, %ymm15, %ymm6;
1317 vpxor %ymm7, %ymm15, %ymm7;
1318 vpxor %ymm8, %ymm15, %ymm8;
1319 vpxor %ymm9, %ymm15, %ymm9;
1320 vpxor %ymm10, %ymm15, %ymm10;
1321 vpxor %ymm11, %ymm15, %ymm11;
1322 vpxor 12 * 32(%rax), %ymm15, %ymm12;
1323 vpxor 13 * 32(%rax), %ymm15, %ymm13;
1324 vpxor 14 * 32(%rax), %ymm15, %ymm14;
1325 vpxor 15 * 32(%rax), %ymm15, %ymm15;
/* release the 16 * 32 bytes reserved by the subq above */
1329 addq $(16 * 32), %rsp;
/* XOR the result with the tweaks saved in dst (swapped register order) */
1331 vpxor 0 * 32(%rsi), %ymm7, %ymm7;
1332 vpxor 1 * 32(%rsi), %ymm6, %ymm6;
1333 vpxor 2 * 32(%rsi), %ymm5, %ymm5;
1334 vpxor 3 * 32(%rsi), %ymm4, %ymm4;
1335 vpxor 4 * 32(%rsi), %ymm3, %ymm3;
1336 vpxor 5 * 32(%rsi), %ymm2, %ymm2;
1337 vpxor 6 * 32(%rsi), %ymm1, %ymm1;
1338 vpxor 7 * 32(%rsi), %ymm0, %ymm0;
1339 vpxor 8 * 32(%rsi), %ymm15, %ymm15;
1340 vpxor 9 * 32(%rsi), %ymm14, %ymm14;
1341 vpxor 10 * 32(%rsi), %ymm13, %ymm13;
1342 vpxor 11 * 32(%rsi), %ymm12, %ymm12;
1343 vpxor 12 * 32(%rsi), %ymm11, %ymm11;
1344 vpxor 13 * 32(%rsi), %ymm10, %ymm10;
1345 vpxor 14 * 32(%rsi), %ymm9, %ymm9;
1346 vpxor 15 * 32(%rsi), %ymm8, %ymm8;
1347 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1348 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1354 ENDPROC(camellia_xts_crypt_32way)
/*
 * camellia_xts_enc_32way: XTS encryption entry point.
 * Sets %r8 to 0 (index of the input whitening key) and %r9 to the address
 * of __camellia_enc_blk32, then tail-jumps into the shared
 * camellia_xts_crypt_32way body with the caller's arguments untouched.
 */
1356 ENTRY(camellia_xts_enc_32way)
1359 * %rsi: dst (32 blocks)
1360 * %rdx: src (32 blocks)
1361 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1364 xorl %r8d, %r8d; /* input whitening key, 0 for enc */
/* RIP-relative: no absolute-address relocation, valid wherever the code is loaded */
1366 leaq __camellia_enc_blk32(%rip), %r9;
1368 jmp camellia_xts_crypt_32way;
1369 ENDPROC(camellia_xts_enc_32way)
/*
 * camellia_xts_dec_32way: XTS decryption entry point.
 * Selects the index of the input whitening key in %r8 from the comparison
 * of key_length(CTX) with 16, loads the address of __camellia_dec_blk32
 * into %r9 and tail-jumps into the shared camellia_xts_crypt_32way body.
 * NOTE(review): the instructions that load the two candidate index values
 * into %r8d / %eax are not visible in this view.
 */
1371 ENTRY(camellia_xts_dec_32way)
1374 * %rsi: dst (32 blocks)
1375 * %rdx: src (32 blocks)
1376 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1379 cmpl $16, key_length(CTX);
1382 cmovel %eax, %r8d; /* input whitening key, last for dec */
/* RIP-relative: no absolute-address relocation, valid wherever the code is loaded */
1384 leaq __camellia_dec_blk32(%rip), %r9;
1386 jmp camellia_xts_crypt_32way;
1387 ENDPROC(camellia_xts_dec_32way)