bd1e4c060f0af61bc9e7e8e69851b0def3a57044
[oweals/openssl.git] / crypto / camellia / asm / cmll-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
12 #
13 # This module may be used under the terms of either the GNU General
14 # Public License version 2 or later, the GNU Lesser General Public
15 # License version 2.1 or later, the Mozilla Public License version
16 # 1.1 or the BSD License. The exact terms of either license are
17 # distributed along with this module. For further details see
18 # http://www.openssl.org/~appro/camellia/.
19 # ====================================================================
20
21 # Performance in cycles per processed byte (less is better) in
22 # 'openssl speed ...' benchmark:
23 #
24 #                       AMD64   Core2   EM64T
25 # -evp camellia-128-ecb 16.7    21.0    22.7
26 # + over gcc 3.4.6      +25%    +5%     0%
27 #
28 # camellia-128-cbc      15.7    20.4    21.1
29 #
30 # 128-bit key setup     128     216     205     cycles/key
31 # + over gcc 3.4.6      +54%    +39%    +15%
32 #
33 # Numbers in "+" rows represent performance improvement over compiler
34 # generated code. Key setup timings are impressive on AMD and Core2
35 # thanks to 64-bit operations being covertly deployed. Improvement on
36 # EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
37 # apparently emulates some of 64-bit operations in [32-bit] microcode.
38
# Command-line handling and output plumbing: work out the output file and
# assembler flavour, find the x86_64-xlate.pl translator and redirect STDOUT
# into it.
39 # $output is the last argument if it looks like a file (it has an extension)
40 # $flavour is the first argument if it doesn't look like a file
41 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
42 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
43
# $win64 picks the register holding the first argument ($arg0d below) and
# enables the Win64-only exception-handler code near the end of this file.
44 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
45
# Look for the translator next to this script first, then in ../../perlasm/.
46 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
47 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
48 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
49 die "can't locate x86_64-xlate.pl";
50
# Everything printed to STDOUT from here on is piped through the translator,
# which is started with $flavour and $output as its arguments.
51 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
52     or die "can't call $xlate: $!";
53 *STDOUT=*OUT;
54
# hi(REG): return the high-byte alias of a legacy register operand, i.e.
# %eax/%rax..%edx/%rdx become %ah..%dh.  Any other operand comes back
# unchanged.  Declared without a prototype because it takes one argument
# (the former "()" prototype was only bypassed by the &hi(...) call form).
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/;    $r; }
# lo(REG): return the low-byte alias of a register operand.  The three
# substitutions are tried in order; an operand matching none of them is
# returned unchanged.  Declared without a prototype because it takes one
# argument (the former "()" prototype was only bypassed by &lo(...) calls).
sub lo {
    my $r=shift;
    $r =~ s/%[er]([a-d])x/%${1}l/;      # %eax/%rax -> %al ... %edx/%rdx -> %dl
    $r =~ s/%[er]([sd]i)/%${1}l/;       # %esi/%rsi -> %sil, %edi/%rdi -> %dil
    $r =~ s/%(r[0-9]+)[d]?/%${1}b/;     # %rN/%rNd  -> %rNb
    $r;
}
59
# Register assignments shared by the code generators below.
# $t0..$t3 are 32-bit temporaries (subkey words and S-box accumulators).
60 $t0="%eax";$t1="%ebx";$t2="%ecx";$t3="%edx";
# The four 32-bit words of the cipher state.
61 @S=("%r8d","%r9d","%r10d","%r11d");
# Index registers for the table lookups in Camellia_Feistel.
62 $i0="%esi";
63 $i1="%edi";
64 $Tbl="%rbp";    # size optimization
65 $inp="%r12";
66 $out="%r13";
67 $key="%r14";
68 $keyend="%r15";
# First integer argument as a 32-bit register: %ecx on Win64, %edi otherwise.
69 $arg0d=$win64?"%ecx":"%edi";
70
71 # const unsigned int Camellia_SBOX[4][256];
72 # Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
73 # and [2][] - with [3][]. This is done to minimize code size.
# Byte displacements from $Tbl.  Lookups scale the index by 8, so the two
# tables of an interleaved pair sit 4 bytes apart and a pair spans 2048 bytes.
74 $SBOX1_1110=0;          # Camellia_SBOX[0]
75 $SBOX4_4404=4;          # Camellia_SBOX[1]
76 $SBOX2_0222=2048;       # Camellia_SBOX[2]
77 $SBOX3_3033=2052;       # Camellia_SBOX[3]
78
# Camellia_Feistel(ROUND [,SEED])
#
# Appends the code for one Feistel round to $code.
#
#   ROUND  round index.  Its parity selects the roles of the state words:
#          even rounds read @S[0,1] and update @S[2,3], odd rounds read
#          @S[2,3] and update @S[0,1].
#   SEED   optional byte offset added to the subkey prefetch address
#          (default 0).  A negative SEED also flips the prefetch stride
#          from +8 to -8 bytes per round.
#
# The emitted code expects the current round's subkey in $t0/$t1 and leaves
# the subkey found at SEED+(ROUND+1)*(+-8) off $key in the same registers.
# Expressions in backticks are emitted verbatim; they are not evaluated here.
sub Camellia_Feistel {
my $i=$_[0];
my $seed=defined($_[1])?$_[1]:0;
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;                 # 0 for even rounds, 2 for odd ones
my ($s0,$s1,$s2,$s3)=@S[($j)%4,($j+1)%4,($j+2)%4,($j+3)%4];

$code.=<<___;
        xor     $s0,$t0                         # t0^=key[0]
        xor     $s1,$t1                         # t1^=key[1]
        movz    `&hi("$t0")`,$i0                # (t0>>8)&0xff
        movz    `&lo("$t1")`,$i1                # (t1>>0)&0xff
        mov     $SBOX3_3033($Tbl,$i0,8),$t3     # t3=SBOX3_3033[0]
        mov     $SBOX1_1110($Tbl,$i1,8),$t2     # t2=SBOX1_1110[1]
        movz    `&lo("$t0")`,$i0                # (t0>>0)&0xff
        shr     \$16,$t0
        movz    `&hi("$t1")`,$i1                # (t1>>8)&0xff
        xor     $SBOX4_4404($Tbl,$i0,8),$t3     # t3^=SBOX4_4404[0]
        shr     \$16,$t1
        xor     $SBOX4_4404($Tbl,$i1,8),$t2     # t2^=SBOX4_4404[1]
        movz    `&hi("$t0")`,$i0                # (t0>>24)&0xff
        movz    `&lo("$t1")`,$i1                # (t1>>16)&0xff
        xor     $SBOX1_1110($Tbl,$i0,8),$t3     # t3^=SBOX1_1110[0]
        xor     $SBOX3_3033($Tbl,$i1,8),$t2     # t2^=SBOX3_3033[1]
        movz    `&lo("$t0")`,$i0                # (t0>>16)&0xff
        movz    `&hi("$t1")`,$i1                # (t1>>24)&0xff
        xor     $SBOX2_0222($Tbl,$i0,8),$t3     # t3^=SBOX2_0222[0]
        xor     $SBOX2_0222($Tbl,$i1,8),$t2     # t2^=SBOX2_0222[1]
        mov     `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
        mov     `$seed+($i+1)*$scale+4`($key),$t0
        xor     $t3,$t2                         # t2^=t3
        ror     \$8,$t3                         # t3=RightRotate(t3,8)
        xor     $t2,$s2
        xor     $t2,$s3
        xor     $t3,$s3
___
}
116
# Start of the generated module: the two encryption entry points and the
# head of the shared _x86_64_Camellia_encrypt loop.  Camellia_EncryptBlock
# turns a key bit length into a grandRounds count (3 for 128, otherwise 4)
# and jumps into Camellia_EncryptBlock_Rounds.  $keyend is set to
# $key+grandRounds*64, which is what ends the round loop.
117 # void Camellia_EncryptBlock_Rounds(
118 #               int grandRounds,
119 #               const Byte plaintext[],
120 #               const KEY_TABLE_TYPE keyTable,
121 #               Byte ciphertext[])
122 $code=<<___;
123 .text
124
125 # V1.x API
126 .globl  Camellia_EncryptBlock
127 .type   Camellia_EncryptBlock,\@abi-omnipotent
128 .align  16
129 Camellia_EncryptBlock:
130         movl    \$128,%eax
131         subl    $arg0d,%eax
132         movl    \$3,$arg0d
133         adcl    \$0,$arg0d      # keyBitLength==128?3:4
134         jmp     .Lenc_rounds
135 .size   Camellia_EncryptBlock,.-Camellia_EncryptBlock
136 # V2
137 .globl  Camellia_EncryptBlock_Rounds
138 .type   Camellia_EncryptBlock_Rounds,\@function,4
139 .align  16
140 .Lenc_rounds:
141 Camellia_EncryptBlock_Rounds:
142 .cfi_startproc
143         push    %rbx
144 .cfi_push       %rbx
145         push    %rbp
146 .cfi_push       %rbp
147         push    %r13
148 .cfi_push       %r13
149         push    %r14
150 .cfi_push       %r14
151         push    %r15
152 .cfi_push       %r15
153 .Lenc_prologue:
154
155         #mov    %rsi,$inp               # put away arguments
156         mov     %rcx,$out
157         mov     %rdx,$key
158
159         shl     \$6,%edi                # process grandRounds
160         lea     .LCamellia_SBOX(%rip),$Tbl
161         lea     ($key,%rdi),$keyend
162
163         mov     0(%rsi),@S[0]           # load plaintext
164         mov     4(%rsi),@S[1]
165         mov     8(%rsi),@S[2]
166         bswap   @S[0]
167         mov     12(%rsi),@S[3]
168         bswap   @S[1]
169         bswap   @S[2]
170         bswap   @S[3]
171
172         call    _x86_64_Camellia_encrypt
173
174         bswap   @S[0]
175         bswap   @S[1]
176         bswap   @S[2]
177         mov     @S[0],0($out)
178         bswap   @S[3]
179         mov     @S[1],4($out)
180         mov     @S[2],8($out)
181         mov     @S[3],12($out)
182
183         mov     0(%rsp),%r15
184 .cfi_restore    %r15
185         mov     8(%rsp),%r14
186 .cfi_restore    %r14
187         mov     16(%rsp),%r13
188 .cfi_restore    %r13
189         mov     24(%rsp),%rbp
190 .cfi_restore    %rbp
191         mov     32(%rsp),%rbx
192 .cfi_restore    %rbx
193         lea     40(%rsp),%rsp
194 .cfi_adjust_cfa_offset  -40
195 .Lenc_epilogue:
196         ret
197 .cfi_endproc
198 .size   Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
199
200 .type   _x86_64_Camellia_encrypt,\@abi-omnipotent
201 .align  16
202 _x86_64_Camellia_encrypt:
203         xor     0($key),@S[1]
204         xor     4($key),@S[0]           # ^=key[0-3]
205         xor     8($key),@S[3]
206         xor     12($key),@S[2]
207 .align  16
208 .Leloop:
209         mov     16($key),$t1            # prefetch key[4-5]
210         mov     20($key),$t0
211
212 ___
# Six Feistel rounds per loop iteration.  With seed 16 round $i prefetches
# its successor's subkey from 16+($i+1)*8 off $key.
213         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
# Tail of the encryption loop followed by the decryption entry points.
# After each group of six rounds $key advances by 64 bytes; reaching $keyend
# exits through .Ledone, otherwise the and/or/rol mixing step below runs
# before the next group.  The decryption side mirrors this with $key
# starting at keyTable+grandRounds*64 and $keyend at keyTable.
214 $code.=<<___;
215         lea     16*4($key),$key
216         cmp     $keyend,$key
217         mov     8($key),$t3             # prefetch key[2-3]
218         mov     12($key),$t2
219         je      .Ledone
220
221         and     @S[0],$t0
222         or      @S[3],$t3
223         rol     \$1,$t0
224         xor     $t3,@S[2]               # s2^=s3|key[3];
225         xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
226         and     @S[2],$t2
227         or      @S[1],$t1
228         rol     \$1,$t2
229         xor     $t1,@S[0]               # s0^=s1|key[1];
230         xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);
231         jmp     .Leloop
232
233 .align  16
234 .Ledone:
235         xor     @S[2],$t0               # SwapHalf
236         xor     @S[3],$t1
237         xor     @S[0],$t2
238         xor     @S[1],$t3
239
240         mov     $t0,@S[0]
241         mov     $t1,@S[1]
242         mov     $t2,@S[2]
243         mov     $t3,@S[3]
244
245         .byte   0xf3,0xc3               # rep ret
246 .size   _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
247
248 # V1.x API
249 .globl  Camellia_DecryptBlock
250 .type   Camellia_DecryptBlock,\@abi-omnipotent
251 .align  16
252 Camellia_DecryptBlock:
253         movl    \$128,%eax
254         subl    $arg0d,%eax
255         movl    \$3,$arg0d
256         adcl    \$0,$arg0d      # keyBitLength==128?3:4
257         jmp     .Ldec_rounds
258 .size   Camellia_DecryptBlock,.-Camellia_DecryptBlock
259 # V2
260 .globl  Camellia_DecryptBlock_Rounds
261 .type   Camellia_DecryptBlock_Rounds,\@function,4
262 .align  16
263 .Ldec_rounds:
264 Camellia_DecryptBlock_Rounds:
265 .cfi_startproc
266         push    %rbx
267 .cfi_push       %rbx
268         push    %rbp
269 .cfi_push       %rbp
270         push    %r13
271 .cfi_push       %r13
272         push    %r14
273 .cfi_push       %r14
274         push    %r15
275 .cfi_push       %r15
276 .Ldec_prologue:
277
278         #mov    %rsi,$inp               # put away arguments
279         mov     %rcx,$out
280         mov     %rdx,$keyend
281
282         shl     \$6,%edi                # process grandRounds
283         lea     .LCamellia_SBOX(%rip),$Tbl
284         lea     ($keyend,%rdi),$key
285
286         mov     0(%rsi),@S[0]           # load plaintext
287         mov     4(%rsi),@S[1]
288         mov     8(%rsi),@S[2]
289         bswap   @S[0]
290         mov     12(%rsi),@S[3]
291         bswap   @S[1]
292         bswap   @S[2]
293         bswap   @S[3]
294
295         call    _x86_64_Camellia_decrypt
296
297         bswap   @S[0]
298         bswap   @S[1]
299         bswap   @S[2]
300         mov     @S[0],0($out)
301         bswap   @S[3]
302         mov     @S[1],4($out)
303         mov     @S[2],8($out)
304         mov     @S[3],12($out)
305
306         mov     0(%rsp),%r15
307 .cfi_restore    %r15
308         mov     8(%rsp),%r14
309 .cfi_restore    %r14
310         mov     16(%rsp),%r13
311 .cfi_restore    %r13
312         mov     24(%rsp),%rbp
313 .cfi_restore    %rbp
314         mov     32(%rsp),%rbx
315 .cfi_restore    %rbx
316         lea     40(%rsp),%rsp
317 .cfi_adjust_cfa_offset  -40
318 .Ldec_epilogue:
319         ret
320 .cfi_endproc
321 .size   Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
322
323 .type   _x86_64_Camellia_decrypt,\@abi-omnipotent
324 .align  16
325 _x86_64_Camellia_decrypt:
326         xor     0($key),@S[1]
327         xor     4($key),@S[0]           # ^=key[0-3]
328         xor     8($key),@S[3]
329         xor     12($key),@S[2]
330 .align  16
331 .Ldloop:
332         mov     -8($key),$t1            # prefetch key[4-5]
333         mov     -4($key),$t0
334
335 ___
# Six Feistel rounds per loop iteration.  The negative seed makes round $i
# prefetch its successor's subkey from -8-($i+1)*8 off $key, i.e. the key
# schedule is walked downwards.
336         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
# Tail of the decryption loop: step $key back by 64 bytes, leave through
# .Lddone once it equals $keyend, otherwise apply the and/or/rol mixing step
# and run another group of six rounds.
337 $code.=<<___;
338         lea     -16*4($key),$key
339         cmp     $keyend,$key
340         mov     0($key),$t3             # prefetch key[2-3]
341         mov     4($key),$t2
342         je      .Lddone
343
344         and     @S[0],$t0
345         or      @S[3],$t3
346         rol     \$1,$t0
347         xor     $t3,@S[2]               # s2^=s3|key[3];
348         xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
349         and     @S[2],$t2
350         or      @S[1],$t1
351         rol     \$1,$t2
352         xor     $t1,@S[0]               # s0^=s1|key[1];
353         xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);
354
355         jmp     .Ldloop
356
357 .align  16
358 .Lddone:
359         xor     @S[2],$t2
360         xor     @S[3],$t3
361         xor     @S[0],$t0
362         xor     @S[1],$t1
363
364         mov     $t2,@S[0]               # SwapHalf
365         mov     $t3,@S[1]
366         mov     $t0,@S[2]
367         mov     $t1,@S[3]
368
369         .byte   0xf3,0xc3               # rep ret
370 .size   _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
371 ___
372
# _saveround(RND, KEYREG [,BIAS], REG...)
#
# Appends to $code the stores that write REG... into the key table at
# byte offset BIAS+RND*8 from KEYREG.
#
#   four REGs    stored 4 bytes apart in the order REG[1],REG[0],REG[3],REG[2]
#   one/two REGs stored at +0 and (if present) +8
#
# BIAS is optional and is recognised by being a non-zero number in front of
# the register list; register names evaluate to 0 under int() and are
# therefore never mistaken for it.  A bias of 0 must simply be omitted.
# Offsets are emitted as backtick expressions, not as computed numbers.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=int($T[0])?shift(@T):0;

    if ($#T==3) {
        $code.=<<___;
        mov     $T[1],`$bias+$rnd*8+0`($key)
        mov     $T[0],`$bias+$rnd*8+4`($key)
        mov     $T[3],`$bias+$rnd*8+8`($key)
        mov     $T[2],`$bias+$rnd*8+12`($key)
___
    } else {
        $code.="        mov     $T[0],`$bias+$rnd*8+0`($key)\n";
        $code.="        mov     $T[1],`$bias+$rnd*8+8`($key)\n" if ($#T>=1);
    }
}
389
# _loadround(RND, KEYREG [,BIAS], REG [,REG])
#
# Counterpart of _saveround for one or two registers: appends to $code the
# loads that fill REG from BIAS+RND*8 off KEYREG and, if a second REG is
# given, that one from 8 bytes further on.  BIAS is optional and detected
# the same way as in _saveround (a non-zero number ahead of the registers).
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=int($T[0])?shift(@T):0;

$code.="        mov     `$bias+$rnd*8+0`($key),$T[0]\n";
$code.="        mov     `$bias+$rnd*8+8`($key),$T[1]\n" if ($#T>=1);
}
397
# shld-based 128-bit left rotate of the register pair (first:second) by the
# given bit count, using %r11 as scratch.  shld is very slow on the Intel
# EM64T family, and even on AMD it limits the instruction decode rate
# [because it's VectorPath] and consequently performance...  A rotate count
# of 0 emits nothing.
sub __rotl128 {
my ($hi64,$lo64,$bits)=@_;

    return unless ($bits);

    $code.="        mov     $hi64,%r11\n".
           "        shld    \$$bits,$lo64,$hi64\n".
           "        shld    \$$bits,%r11,$lo64\n";
}
412
# ... Implementing the 128-bit rotate without shld gives 80% better
# performance on EM64T, +15% on AMD64 and only ~7% degradation on Core2.
# This is therefore the preferred variant.
#
# _rotl128(FIRST, SECOND, BITS) appends a left rotate of the register pair
# (FIRST:SECOND) by BITS, built from shl/shr/or with %r9 and %r11 as
# scratch.  The complementary shift count is emitted as the backtick
# expression `64-BITS`.  A rotate count of 0 emits nothing.
sub _rotl128 {
my ($hi64,$lo64,$bits)=@_;

    return unless ($bits);

    my $spill="`64-$bits`";     # emitted verbatim, backticks included

    $code.=join("",map("        $_\n",
        "mov     $hi64,%r11",
        "shl     \$$bits,$hi64",
        "mov     $lo64,%r9",
        "shr     \$$spill,%r9",
        "shr     \$$spill,%r11",
        "or      %r9,$hi64",
        "shl     \$$bits,$lo64",
        "or      %r11,$lo64"));
}
432
# Generator for Camellia_Ekeygen, the key-schedule routine.  The generated
# function takes keyBitLength in %edi, the raw key in %rsi and keyTable in
# %rdx, and returns 3 (128-bit keys) or 4 (longer keys) in %eax.
# $step counts the Feistel rounds emitted so far; since each round is called
# with the default seed 0, round $step prefetches the constant at ($step+1)*8
# off $key, which points at .LCamellia_SIGMA while the rounds run.
433 { my $step=0;
434
435 $code.=<<___;
436 .globl  Camellia_Ekeygen
437 .type   Camellia_Ekeygen,\@function,3
438 .align  16
439 Camellia_Ekeygen:
440 .cfi_startproc
441         push    %rbx
442 .cfi_push       %rbx
443         push    %rbp
444 .cfi_push       %rbp
445         push    %r13
446 .cfi_push       %r13
447         push    %r14
448 .cfi_push       %r14
449         push    %r15
450 .cfi_push       %r15
451 .Lkey_prologue:
452
453         mov     %edi,${keyend}d         # put away arguments, keyBitLength
454         mov     %rdx,$out               # keyTable
455
456         mov     0(%rsi),@S[0]           # load 0-127 bits
457         mov     4(%rsi),@S[1]
458         mov     8(%rsi),@S[2]
459         mov     12(%rsi),@S[3]
460
461         bswap   @S[0]
462         bswap   @S[1]
463         bswap   @S[2]
464         bswap   @S[3]
465 ___
466         &_saveround     (0,$out,@S);    # KL<<<0
467 $code.=<<___;
468         cmp     \$128,$keyend           # check keyBitLength
469         je      .L1st128
470
471         mov     16(%rsi),@S[0]          # load 128-191 bits
472         mov     20(%rsi),@S[1]
473         cmp     \$192,$keyend
474         je      .L1st192
475         mov     24(%rsi),@S[2]          # load 192-255 bits
476         mov     28(%rsi),@S[3]
477         jmp     .L1st256
478 .L1st192:
479         mov     @S[0],@S[2]
480         mov     @S[1],@S[3]
481         not     @S[2]
482         not     @S[3]
483 .L1st256:
484         bswap   @S[0]
485         bswap   @S[1]
486         bswap   @S[2]
487         bswap   @S[3]
488 ___
489         &_saveround     (4,$out,@S);    # temp storage for KR!
490 $code.=<<___;
491         xor     0($out),@S[1]           # KR^KL
492         xor     4($out),@S[0]
493         xor     8($out),@S[3]
494         xor     12($out),@S[2]
495
496 .L1st128:
497         lea     .LCamellia_SIGMA(%rip),$key
498         lea     .LCamellia_SBOX(%rip),$Tbl
499
500         mov     0($key),$t1
501         mov     4($key),$t0
502 ___
# Rounds 0 and 1, then KL is mixed back in, then rounds 2 and 3.
503         &Camellia_Feistel($step++);
504         &Camellia_Feistel($step++);
505 $code.=<<___;
506         xor     0($out),@S[1]           # ^KL
507         xor     4($out),@S[0]
508         xor     8($out),@S[3]
509         xor     12($out),@S[2]
510 ___
511         &Camellia_Feistel($step++);
512         &Camellia_Feistel($step++);
513 $code.=<<___;
514         cmp     \$128,$keyend
515         jne     .L2nd256
516
517         lea     128($out),$out          # size optimization
518         shl     \$32,%r8                # @S[0]||
519         shl     \$32,%r10               # @S[2]||
520         or      %r9,%r8                 # ||@S[1]
521         or      %r11,%r10               # ||@S[3]
522 ___
# 128-bit key path.  $out was advanced by 128 above, hence the -128 bias on
# every table access.  Each trailing comment gives the cumulative rotation
# of the value being stored.
523         &_loadround     (0,$out,-128,"%rax","%rbx");    # KL
524         &_saveround     (2,$out,-128,"%r8","%r10");     # KA<<<0
525         &_rotl128       ("%rax","%rbx",15);
526         &_saveround     (4,$out,-128,"%rax","%rbx");    # KL<<<15
527         &_rotl128       ("%r8","%r10",15);
528         &_saveround     (6,$out,-128,"%r8","%r10");     # KA<<<15
529         &_rotl128       ("%r8","%r10",15);              # 15+15=30
530         &_saveround     (8,$out,-128,"%r8","%r10");     # KA<<<30
531         &_rotl128       ("%rax","%rbx",30);             # 15+30=45
532         &_saveround     (10,$out,-128,"%rax","%rbx");   # KL<<<45
533         &_rotl128       ("%r8","%r10",15);              # 30+15=45
534         &_saveround     (12,$out,-128,"%r8");           # KA<<<45
535         &_rotl128       ("%rax","%rbx",15);             # 45+15=60
536         &_saveround     (13,$out,-128,"%rbx");          # KL<<<60
537         &_rotl128       ("%r8","%r10",15);              # 45+15=60
538         &_saveround     (14,$out,-128,"%r8","%r10");    # KA<<<60
539         &_rotl128       ("%rax","%rbx",17);             # 60+17=77
540         &_saveround     (16,$out,-128,"%rax","%rbx");   # KL<<<77
541         &_rotl128       ("%rax","%rbx",17);             # 77+17=94
542         &_saveround     (18,$out,-128,"%rax","%rbx");   # KL<<<94
543         &_rotl128       ("%r8","%r10",34);              # 60+34=94
544         &_saveround     (20,$out,-128,"%r8","%r10");    # KA<<<94
545         &_rotl128       ("%rax","%rbx",17);             # 94+17=111
546         &_saveround     (22,$out,-128,"%rax","%rbx");   # KL<<<111
547         &_rotl128       ("%r8","%r10",17);              # 94+17=111
548         &_saveround     (24,$out,-128,"%r8","%r10");    # KA<<<111
549 $code.=<<___;
550         mov     \$3,%eax
551         jmp     .Ldone
552 .align  16
553 .L2nd256:
554 ___
# 192/256-bit key path: park KA in the table, mix it with KR and run rounds
# 4 and 5; the result is stored below as KB.
555         &_saveround     (6,$out,@S);    # temp storage for KA!
556 $code.=<<___;
557         xor     `4*8+0`($out),@S[1]     # KA^KR
558         xor     `4*8+4`($out),@S[0]
559         xor     `5*8+0`($out),@S[3]
560         xor     `5*8+4`($out),@S[2]
561 ___
562         &Camellia_Feistel($step++);
563         &Camellia_Feistel($step++);
564
565         &_loadround     (0,$out,"%rax","%rbx"); # KL
566         &_loadround     (4,$out,"%rcx","%rdx"); # KR
567         &_loadround     (6,$out,"%r14","%r15"); # KA
568 $code.=<<___;
569         lea     128($out),$out          # size optimization
570         shl     \$32,%r8                # @S[0]||
571         shl     \$32,%r10               # @S[2]||
572         or      %r9,%r8                 # ||@S[1]
573         or      %r11,%r10               # ||@S[3]
574 ___
575         &_saveround     (2,$out,-128,"%r8","%r10");     # KB<<<0
576         &_rotl128       ("%rcx","%rdx",15);
577         &_saveround     (4,$out,-128,"%rcx","%rdx");    # KR<<<15
578         &_rotl128       ("%r14","%r15",15);
579         &_saveround     (6,$out,-128,"%r14","%r15");    # KA<<<15
580         &_rotl128       ("%rcx","%rdx",15);             # 15+15=30
581         &_saveround     (8,$out,-128,"%rcx","%rdx");    # KR<<<30
582         &_rotl128       ("%r8","%r10",30);
583         &_saveround     (10,$out,-128,"%r8","%r10");    # KB<<<30
584         &_rotl128       ("%rax","%rbx",45);
585         &_saveround     (12,$out,-128,"%rax","%rbx");   # KL<<<45
586         &_rotl128       ("%r14","%r15",30);             # 15+30=45
587         &_saveround     (14,$out,-128,"%r14","%r15");   # KA<<<45
588         &_rotl128       ("%rax","%rbx",15);             # 45+15=60
589         &_saveround     (16,$out,-128,"%rax","%rbx");   # KL<<<60
590         &_rotl128       ("%rcx","%rdx",30);             # 30+30=60
591         &_saveround     (18,$out,-128,"%rcx","%rdx");   # KR<<<60
592         &_rotl128       ("%r8","%r10",30);              # 30+30=60
593         &_saveround     (20,$out,-128,"%r8","%r10");    # KB<<<60
594         &_rotl128       ("%rax","%rbx",17);             # 60+17=77
595         &_saveround     (22,$out,-128,"%rax","%rbx");   # KL<<<77
596         &_rotl128       ("%r14","%r15",32);             # 45+32=77
597         &_saveround     (24,$out,-128,"%r14","%r15");   # KA<<<77
598         &_rotl128       ("%rcx","%rdx",34);             # 60+34=94
599         &_saveround     (26,$out,-128,"%rcx","%rdx");   # KR<<<94
600         &_rotl128       ("%r14","%r15",17);             # 77+17=94
601         &_saveround     (28,$out,-128,"%r14","%r15");   # KA<<<94
602         &_rotl128       ("%rax","%rbx",34);             # 77+34=111
603         &_saveround     (30,$out,-128,"%rax","%rbx");   # KL<<<111
604         &_rotl128       ("%r8","%r10",51);              # 60+51=111
605         &_saveround     (32,$out,-128,"%r8","%r10");    # KB<<<111
606 $code.=<<___;
607         mov     \$4,%eax
608 .Ldone:
609         mov     0(%rsp),%r15
610 .cfi_restore    %r15
611         mov     8(%rsp),%r14
612 .cfi_restore    %r14
613         mov     16(%rsp),%r13
614 .cfi_restore    %r13
615         mov     24(%rsp),%rbp
616 .cfi_restore    %rbp
617         mov     32(%rsp),%rbx
618 .cfi_restore    %rbx
619         lea     40(%rsp),%rsp
620 .cfi_adjust_cfa_offset  -40
621 .Lkey_epilogue:
622         ret
623 .cfi_endproc
624 .size   Camellia_Ekeygen,.-Camellia_Ekeygen
625 ___
626 }
627
# The 256-entry byte substitution table from which all four 32-bit lookup
# tables below are derived.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each helper takes a table index 0..255 and returns the corresponding
# 32-bit table word as a "0x%08x" string.  The digits in a name give the
# byte layout of the word, most significant byte first: 0 is a zero byte,
# any other digit is the (possibly rotated) S-box byte.
#   S1110  s = SBOX[i]                          -> s s s 0
#   S4404  s = SBOX[i rotated left by 1 bit]    -> s s 0 s
#   S0222  s = SBOX[i] rotated left by 1 bit    -> 0 s s s
#   S3033  s = SBOX[i] rotated right by 1 bit   -> s 0 s s
sub S1110 { my $i=shift; $i=$SBOX[$i]; $i=$i<<24|$i<<16|$i<<8; sprintf("0x%08x",$i); }
sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=$SBOX[$i]; $i=$i<<24|$i<<16|$i; sprintf("0x%08x",$i); }
sub S0222 { my $i=shift; $i=$SBOX[$i]; $i=($i<<1|$i>>7)&0xff; $i=$i<<16|$i<<8|$i; sprintf("0x%08x",$i); }
sub S3033 { my $i=shift; $i=$SBOX[$i]; $i=($i>>1|$i<<7)&0xff; $i=$i<<24|$i<<8|$i; sprintf("0x%08x",$i); }
650
# Constant data: the .LCamellia_SIGMA words read by Camellia_Ekeygen (the
# table is padded with one row of zeros) followed by the label that starts
# the lookup tables.
651 $code.=<<___;
652 .align  64
653 .LCamellia_SIGMA:
654 .long   0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
655 .long   0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
656 .long   0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
657 .long   0,          0,          0,          0
658 .LCamellia_SBOX:
659 ___
660 # tables are interleaved, remember?
# data_word(LIST) appends one ".long" directive holding the comma-joined
# LIST to $code.  Each call below emits one entry of two tables side by
# side: 256 pairs S1110/S4404 first, then 256 pairs S0222/S3033, matching
# the $SBOX*_* displacements 0/4 and 2048/2052.
661 sub data_word { $code.=".long\t".join(',',@_)."\n"; }
662 for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
663 for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
664
665 # void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
666 #                       size_t length, const CAMELLIA_KEY *key,
667 #                       unsigned char *ivp,const int enc);
# Generator for Camellia_cbc_encrypt.  The generated function returns at once
# for a zero length, builds an aligned stack frame, touches the whole lookup
# table once, and then takes the encrypt path when the enc argument (%r9d) is
# non-zero or the .LCBC_DECRYPT path when it is zero.  A trailing partial
# block is handled in .Lcbc_enc_tail / .Lcbc_dec_tail.
668 {
# Stack-frame slots, addressed relative to the realigned %rsp.
# $_key: key-schedule pointer saved so that $key can be reloaded ("rewound")
# after every block.
669 $_key="0(%rsp)";
670 $_end="8(%rsp)";        # inp+len&~15
671 $_res="16(%rsp)";       # len&15
# $ivec: 16-byte scratch block.  It holds the chaining value while
# decrypting and the zero-padded final partial block while encrypting.
672 $ivec="24(%rsp)";
# $_ivp: the caller's ivp argument (%r8), needed again to write the IV back.
673 $_ivp="40(%rsp)";
# $_rsp: value of %rsp right after the register pushes (taken from %rbp);
# the epilogue restores the saved registers through it.
674 $_rsp="48(%rsp)";
675
676 $code.=<<___;
677 .globl  Camellia_cbc_encrypt
678 .type   Camellia_cbc_encrypt,\@function,6
679 .align  16
680 Camellia_cbc_encrypt:
681 .cfi_startproc
682         cmp     \$0,%rdx
683         je      .Lcbc_abort
684         push    %rbx
685 .cfi_push       %rbx
686         push    %rbp
687 .cfi_push       %rbp
688         push    %r12
689 .cfi_push       %r12
690         push    %r13
691 .cfi_push       %r13
692         push    %r14
693 .cfi_push       %r14
694         push    %r15
695 .cfi_push       %r15
696 .Lcbc_prologue:
697
698         mov     %rsp,%rbp
699 .cfi_def_cfa_register   %rbp
700         sub     \$64,%rsp
701         and     \$-64,%rsp
702
703         # place stack frame just "above mod 1024" the key schedule,
704         # this ensures that cache associativity suffices
705         lea     -64-63(%rcx),%r10
706         sub     %rsp,%r10
707         neg     %r10
708         and     \$0x3C0,%r10
709         sub     %r10,%rsp
710         #add    \$8,%rsp                # 8 is reserved for callee's ra
711
712         mov     %rdi,$inp               # inp argument
713         mov     %rsi,$out               # out argument
714         mov     %r8,%rbx                # ivp argument
715         mov     %rcx,$key               # key argument
716         mov     272(%rcx),${keyend}d    # grandRounds
717
718         mov     %r8,$_ivp
719         mov     %rbp,$_rsp
720 .cfi_cfa_expression     $_rsp,deref,+56
721
722 .Lcbc_body:
723         lea     .LCamellia_SBOX(%rip),$Tbl
724
725         mov     \$32,%ecx
726 .align  4
727 .Lcbc_prefetch_sbox:
728         mov     0($Tbl),%rax
729         mov     32($Tbl),%rsi
730         mov     64($Tbl),%rdi
731         mov     96($Tbl),%r11
732         lea     128($Tbl),$Tbl
733         loop    .Lcbc_prefetch_sbox
734         sub     \$4096,$Tbl
735         shl     \$6,$keyend
736         mov     %rdx,%rcx               # len argument
737         lea     ($key,$keyend),$keyend
738
739         cmp     \$0,%r9d                # enc argument
740         je      .LCBC_DECRYPT
741
742         and     \$-16,%rdx
743         and     \$15,%rcx               # length residue
744         lea     ($inp,%rdx),%rdx
745         mov     $key,$_key
746         mov     %rdx,$_end
747         mov     %rcx,$_res
748
749         cmp     $inp,%rdx
750         mov     0(%rbx),@S[0]           # load IV
751         mov     4(%rbx),@S[1]
752         mov     8(%rbx),@S[2]
753         mov     12(%rbx),@S[3]
754         je      .Lcbc_enc_tail
755         jmp     .Lcbc_eloop
756
757 .align  16
758 .Lcbc_eloop:
759         xor     0($inp),@S[0]
760         xor     4($inp),@S[1]
761         xor     8($inp),@S[2]
762         bswap   @S[0]
763         xor     12($inp),@S[3]
764         bswap   @S[1]
765         bswap   @S[2]
766         bswap   @S[3]
767
768         call    _x86_64_Camellia_encrypt
769
770         mov     $_key,$key              # "rewind" the key
771         bswap   @S[0]
772         mov     $_end,%rdx
773         bswap   @S[1]
774         mov     $_res,%rcx
775         bswap   @S[2]
776         mov     @S[0],0($out)
777         bswap   @S[3]
778         mov     @S[1],4($out)
779         mov     @S[2],8($out)
780         lea     16($inp),$inp
781         mov     @S[3],12($out)
782         cmp     %rdx,$inp
783         lea     16($out),$out
784         jne     .Lcbc_eloop
785
786         cmp     \$0,%rcx
787         jne     .Lcbc_enc_tail
788
789         mov     $_ivp,$out
790         mov     @S[0],0($out)           # write out IV residue
791         mov     @S[1],4($out)
792         mov     @S[2],8($out)
793         mov     @S[3],12($out)
794         jmp     .Lcbc_done
795
796 .align  16
797 .Lcbc_enc_tail:
798         xor     %rax,%rax
799         mov     %rax,0+$ivec
800         mov     %rax,8+$ivec
801         mov     %rax,$_res
802
803 .Lcbc_enc_pushf:
804         pushfq
805         cld
806         mov     $inp,%rsi
807         lea     8+$ivec,%rdi
808         .long   0x9066A4F3              # rep movsb
809         popfq
810 .Lcbc_enc_popf:
811
812         lea     $ivec,$inp
813         lea     16+$ivec,%rax
814         mov     %rax,$_end
815         jmp     .Lcbc_eloop             # one more time
816
817 .align  16
818 .LCBC_DECRYPT:
819         xchg    $key,$keyend
820         add     \$15,%rdx
821         and     \$15,%rcx               # length residue
822         and     \$-16,%rdx
823         mov     $key,$_key
824         lea     ($inp,%rdx),%rdx
825         mov     %rdx,$_end
826         mov     %rcx,$_res
827
828         mov     (%rbx),%rax             # load IV
829         mov     8(%rbx),%rbx
830         jmp     .Lcbc_dloop
831 .align  16
832 .Lcbc_dloop:
833         mov     0($inp),@S[0]
834         mov     4($inp),@S[1]
835         mov     8($inp),@S[2]
836         bswap   @S[0]
837         mov     12($inp),@S[3]
838         bswap   @S[1]
839         mov     %rax,0+$ivec            # save IV to temporary storage
840         bswap   @S[2]
841         mov     %rbx,8+$ivec
842         bswap   @S[3]
843
844         call    _x86_64_Camellia_decrypt
845
846         mov     $_key,$key              # "rewind" the key
847         mov     $_end,%rdx
848         mov     $_res,%rcx
849
850         bswap   @S[0]
851         mov     ($inp),%rax             # load IV for next iteration
852         bswap   @S[1]
853         mov     8($inp),%rbx
854         bswap   @S[2]
855         xor     0+$ivec,@S[0]
856         bswap   @S[3]
857         xor     4+$ivec,@S[1]
858         xor     8+$ivec,@S[2]
859         lea     16($inp),$inp
860         xor     12+$ivec,@S[3]
861         cmp     %rdx,$inp
862         je      .Lcbc_ddone
863
864         mov     @S[0],0($out)
865         mov     @S[1],4($out)
866         mov     @S[2],8($out)
867         mov     @S[3],12($out)
868
869         lea     16($out),$out
870         jmp     .Lcbc_dloop
871
872 .align  16
873 .Lcbc_ddone:
874         mov     $_ivp,%rdx
875         cmp     \$0,%rcx
876         jne     .Lcbc_dec_tail
877
878         mov     @S[0],0($out)
879         mov     @S[1],4($out)
880         mov     @S[2],8($out)
881         mov     @S[3],12($out)
882
883         mov     %rax,(%rdx)             # write out IV residue
884         mov     %rbx,8(%rdx)
885         jmp     .Lcbc_done
886 .align  16
887 .Lcbc_dec_tail:
888         mov     @S[0],0+$ivec
889         mov     @S[1],4+$ivec
890         mov     @S[2],8+$ivec
891         mov     @S[3],12+$ivec
892
893 .Lcbc_dec_pushf:
894         pushfq
895         cld
896         lea     8+$ivec,%rsi
897         lea     ($out),%rdi
898         .long   0x9066A4F3              # rep movsb
899         popfq
900 .Lcbc_dec_popf:
901
902         mov     %rax,(%rdx)             # write out IV residue
903         mov     %rbx,8(%rdx)
904         jmp     .Lcbc_done
905
906 .align  16
907 .Lcbc_done:
908         mov     $_rsp,%rcx
909 .cfi_def_cfa    %rcx,56
910         mov     0(%rcx),%r15
911 .cfi_restore    %r15
912         mov     8(%rcx),%r14
913 .cfi_restore    %r14
914         mov     16(%rcx),%r13
915 .cfi_restore    %r13
916         mov     24(%rcx),%r12
917 .cfi_restore    %r12
918         mov     32(%rcx),%rbp
919 .cfi_restore    %rbp
920         mov     40(%rcx),%rbx
921 .cfi_restore    %rbx
922         lea     48(%rcx),%rsp
923 .cfi_def_cfa    %rsp,8
924 .Lcbc_abort:
925         ret
926 .cfi_endproc
927 .size   Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
928
929 .asciz  "Camellia for x86_64 by <appro\@openssl.org>"
930 ___
931 }
932
933 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
934 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64-only structured exception handling (SEH) support.  When $win64 is
# set, the heredoc below appends the following to $code:
#
#   common_se_handler  - reads two label RVAs from disp->HandlerData
#                        (HandlerData[0] = prologue label, HandlerData[1] =
#                        epilogue label, supplied per function in .xdata) and
#                        adds disp->ImageBase to each.  When context->Rip lies
#                        between them it reloads %rbx, %rbp, %r13, %r14 and
#                        %r15 from the 40 bytes above context->Rsp and stores
#                        them back into the CONTEXT record.
#   cbc_se_handler     - the Camellia_cbc_encrypt variant.  It compares
#                        context->Rip against the .Lcbc_* labels directly,
#                        skips 8 extra bytes of stack while Rip is strictly
#                        inside one of the two pushfq/popfq windows, fetches
#                        the frame base from the slot annotated "$_rsp", and
#                        restores %rbx, %rbp and %r12-%r15.
#   .Lcommon_seh_exit  - tail shared by both handlers: copies the adjusted
#                        CONTEXT into disp->ContextRecord, calls
#                        RtlVirtualUnwind and returns 1
#                        (ExceptionContinueSearch).
#   .pdata / .xdata    - tables binding each function's begin/end range to
#                        its handler and, for common_se_handler, to its
#                        prologue/epilogue label pair.
#
# Both handlers also rewrite context->Rsp, context->Rsi and context->Rdi from
# the recovered frame pointer before jumping to the shared tail.
935 if ($win64) {
936 $rec="%rcx";        # 1st handler argument (rec); not referenced in the heredoc below
937 $frame="%rdx";      # 2nd handler argument (frame); not referenced in the heredoc below
938 $context="%r8";     # 3rd handler argument: CONTEXT *context
939 $disp="%r9";        # 4th handler argument: DISPATCHER_CONTEXT *disp
940
# Everything up to the closing ___ is assembly text appended to $code.
# $context and $disp interpolate to the registers chosen above; \$ and \@
# are escaped so that a literal '$' or '@' reaches the output.
941 $code.=<<___;
942 .extern __imp_RtlVirtualUnwind
943 .type   common_se_handler,\@abi-omnipotent
944 .align  16
945 common_se_handler:
946         push    %rsi
947         push    %rdi
948         push    %rbx
949         push    %rbp
950         push    %r12
951         push    %r13
952         push    %r14
953         push    %r15
954         pushfq
955         lea     -64(%rsp),%rsp
956
957         mov     120($context),%rax      # pull context->Rax
958         mov     248($context),%rbx      # pull context->Rip
959
960         mov     8($disp),%rsi           # disp->ImageBase
961         mov     56($disp),%r11          # disp->HandlerData
962
963         mov     0(%r11),%r10d           # HandlerData[0]
964         lea     (%rsi,%r10),%r10        # prologue label
965         cmp     %r10,%rbx               # context->Rip<prologue label
966         jb      .Lin_prologue
967
968         mov     152($context),%rax      # pull context->Rsp
969
970         mov     4(%r11),%r10d           # HandlerData[1]
971         lea     (%rsi,%r10),%r10        # epilogue label
972         cmp     %r10,%rbx               # context->Rip>=epilogue label
973         jae     .Lin_prologue
974
975         lea     40(%rax),%rax
976         mov     -8(%rax),%rbx
977         mov     -16(%rax),%rbp
978         mov     -24(%rax),%r13
979         mov     -32(%rax),%r14
980         mov     -40(%rax),%r15
981         mov     %rbx,144($context)      # restore context->Rbx
982         mov     %rbp,160($context)      # restore context->Rbp
983         mov     %r13,224($context)      # restore context->R13
984         mov     %r14,232($context)      # restore context->R14
985         mov     %r15,240($context)      # restore context->R15
986
987 .Lin_prologue:
988         mov     8(%rax),%rdi
989         mov     16(%rax),%rsi
990         mov     %rax,152($context)      # restore context->Rsp
991         mov     %rsi,168($context)      # restore context->Rsi
992         mov     %rdi,176($context)      # restore context->Rdi
993
994         jmp     .Lcommon_seh_exit
995 .size   common_se_handler,.-common_se_handler
996
997 .type   cbc_se_handler,\@abi-omnipotent
998 .align  16
999 cbc_se_handler:
1000         push    %rsi
1001         push    %rdi
1002         push    %rbx
1003         push    %rbp
1004         push    %r12
1005         push    %r13
1006         push    %r14
1007         push    %r15
1008         pushfq
1009         lea     -64(%rsp),%rsp
1010
1011         mov     120($context),%rax      # pull context->Rax
1012         mov     248($context),%rbx      # pull context->Rip
1013
1014         lea     .Lcbc_prologue(%rip),%r10
1015         cmp     %r10,%rbx               # context->Rip<.Lcbc_prologue
1016         jb      .Lin_cbc_prologue
1017
1018         lea     .Lcbc_body(%rip),%r10
1019         cmp     %r10,%rbx               # context->Rip<.Lcbc_body
1020         jb      .Lin_cbc_frame_setup
1021
1022         mov     152($context),%rax      # pull context->Rsp
1023
1024         lea     .Lcbc_abort(%rip),%r10
1025         cmp     %r10,%rbx               # context->Rip>=.Lcbc_abort
1026         jae     .Lin_cbc_prologue
1027
1028         # handle pushf/popf in Camellia_cbc_encrypt
1029         lea     .Lcbc_enc_pushf(%rip),%r10
1030         cmp     %r10,%rbx               # context->Rip<=.Lcbc_enc_pushf
1031         jbe     .Lin_cbc_no_flag
1032         lea     8(%rax),%rax
1033         lea     .Lcbc_enc_popf(%rip),%r10
1034         cmp     %r10,%rbx               # context->Rip<.Lcbc_enc_popf
1035         jb      .Lin_cbc_no_flag
1036         lea     -8(%rax),%rax
1037         lea     .Lcbc_dec_pushf(%rip),%r10
1038         cmp     %r10,%rbx               # context->Rip<=.Lcbc_dec_pushf
1039         jbe     .Lin_cbc_no_flag
1040         lea     8(%rax),%rax
1041         lea     .Lcbc_dec_popf(%rip),%r10
1042         cmp     %r10,%rbx               # context->Rip<.Lcbc_dec_popf
1043         jb      .Lin_cbc_no_flag
1044         lea     -8(%rax),%rax
1045
1046 .Lin_cbc_no_flag:
1047         mov     48(%rax),%rax           # $_rsp
1048         lea     48(%rax),%rax
1049
1050 .Lin_cbc_frame_setup:
1051         mov     -8(%rax),%rbx
1052         mov     -16(%rax),%rbp
1053         mov     -24(%rax),%r12
1054         mov     -32(%rax),%r13
1055         mov     -40(%rax),%r14
1056         mov     -48(%rax),%r15
1057         mov     %rbx,144($context)      # restore context->Rbx
1058         mov     %rbp,160($context)      # restore context->Rbp
1059         mov     %r12,216($context)      # restore context->R12
1060         mov     %r13,224($context)      # restore context->R13
1061         mov     %r14,232($context)      # restore context->R14
1062         mov     %r15,240($context)      # restore context->R15
1063
1064 .Lin_cbc_prologue:
1065         mov     8(%rax),%rdi
1066         mov     16(%rax),%rsi
1067         mov     %rax,152($context)      # restore context->Rsp
1068         mov     %rsi,168($context)      # restore context->Rsi
1069         mov     %rdi,176($context)      # restore context->Rdi
1070
1071 .align  4
1072 .Lcommon_seh_exit:
1073
1074         mov     40($disp),%rdi          # disp->ContextRecord
1075         mov     $context,%rsi           # context
1076         mov     \$`1232/8`,%ecx         # sizeof(CONTEXT)
1077         .long   0xa548f3fc              # cld; rep movsq
1078
1079         mov     $disp,%rsi
1080         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1081         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1082         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1083         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1084         mov     40(%rsi),%r10           # disp->ContextRecord
1085         lea     56(%rsi),%r11           # &disp->HandlerData
1086         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1087         mov     %r10,32(%rsp)           # arg5
1088         mov     %r11,40(%rsp)           # arg6
1089         mov     %r12,48(%rsp)           # arg7
1090         mov     %rcx,56(%rsp)           # arg8, (NULL)
1091         call    *__imp_RtlVirtualUnwind(%rip)
1092
1093         mov     \$1,%eax                # ExceptionContinueSearch
1094         lea     64(%rsp),%rsp
1095         popfq
1096         pop     %r15
1097         pop     %r14
1098         pop     %r13
1099         pop     %r12
1100         pop     %rbp
1101         pop     %rbx
1102         pop     %rdi
1103         pop     %rsi
1104         ret
1105 .size   cbc_se_handler,.-cbc_se_handler
1106
1107 .section        .pdata
1108 .align  4
1109         .rva    .LSEH_begin_Camellia_EncryptBlock_Rounds
1110         .rva    .LSEH_end_Camellia_EncryptBlock_Rounds
1111         .rva    .LSEH_info_Camellia_EncryptBlock_Rounds
1112
1113         .rva    .LSEH_begin_Camellia_DecryptBlock_Rounds
1114         .rva    .LSEH_end_Camellia_DecryptBlock_Rounds
1115         .rva    .LSEH_info_Camellia_DecryptBlock_Rounds
1116
1117         .rva    .LSEH_begin_Camellia_Ekeygen
1118         .rva    .LSEH_end_Camellia_Ekeygen
1119         .rva    .LSEH_info_Camellia_Ekeygen
1120
1121         .rva    .LSEH_begin_Camellia_cbc_encrypt
1122         .rva    .LSEH_end_Camellia_cbc_encrypt
1123         .rva    .LSEH_info_Camellia_cbc_encrypt
1124
1125 .section        .xdata
1126 .align  8
1127 .LSEH_info_Camellia_EncryptBlock_Rounds:
1128         .byte   9,0,0,0
1129         .rva    common_se_handler
1130         .rva    .Lenc_prologue,.Lenc_epilogue   # HandlerData[]
1131 .LSEH_info_Camellia_DecryptBlock_Rounds:
1132         .byte   9,0,0,0
1133         .rva    common_se_handler
1134         .rva    .Ldec_prologue,.Ldec_epilogue   # HandlerData[]
1135 .LSEH_info_Camellia_Ekeygen:
1136         .byte   9,0,0,0
1137         .rva    common_se_handler
1138         .rva    .Lkey_prologue,.Lkey_epilogue   # HandlerData[]
1139 .LSEH_info_Camellia_cbc_encrypt:
1140         .byte   9,0,0,0
1141         .rva    cbc_se_handler
1142 ___
1143 }
1144
# Expand every `...` span embedded in the generated text by evaluating its
# contents as a Perl expression (for example `1232/8` becomes 154), then
# write the finished assembly to STDOUT.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Buffered output is only guaranteed to be flushed at close, so a failed
# write (e.g. a full disk) surfaces here.  Die instead of exiting 0 with a
# silently truncated assembly file.
close STDOUT or die "error closing STDOUT: $!";