2 # Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
13 # This module may be used under the terms of either the GNU General
14 # Public License version 2 or later, the GNU Lesser General Public
15 # License version 2.1 or later, the Mozilla Public License version
16 # 1.1 or the BSD License. The exact terms of either license are
17 # distributed along with this module. For further details see
18 # http://www.openssl.org/~appro/camellia/.
19 # ====================================================================
21 # Performance in cycles per processed byte (less is better) in
22 # 'openssl speed ...' benchmark:
24 # AMD K8 Core2 PIII P4
25 # -evp camellia-128-ecb 21.5 22.8 27.0 28.9
26 # + over gcc 3.4.6 +90/11% +70/10% +53/4% +160/64%
27 # + over icc 8.0 +48/19% +21/15% +21/17% +55/37%
29 # camellia-128-cbc 17.3 21.1 23.9 25.9
31 # 128-bit key setup 196 280 256 240 cycles/key
32 # + over gcc 3.4.6 +30/0% +17/11% +11/0% +63/40%
33 # + over icc 8.0 +18/3% +10/0% +10/3% +21/10%
35 # Pairs of numbers in "+" rows represent performance improvement over
36 # compiler generated position-independent code, PIC, and non-PIC
37 # respectively. PIC results are of greater relevance, as this module
38 # is position-independent, i.e. suitable for a shared library or PIE.
39 # Position independence "costs" one register, which is why compilers
40 # are so close with non-PIC results, they have an extra register to
41 # spare. CBC results are better than ECB ones thanks to "zero-copy"
42 # private _x86_* interface, and are ~30-40% better than with compiler
43 # generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
44 # same CPU (where applicable).
# Build-time setup: find the perlasm helpers relative to this script,
# redirect the generated output and initialise the assembler back end.
# $dir receives the directory part of $0 (everything up to the last / or \).
46 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
47 push(@INC,"${dir}","${dir}../../perlasm");
# NOTE(review): two-argument open on a bareword handle with no error check;
# $output is assigned outside the lines visible here.
53 open STDOUT,">$output";
# The second argument is true when the last command-line argument is "386".
55 &asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
# Registers that carry the four 32-bit words of the cipher state.
57 @T=("eax","ebx","ecx","edx");
62 # stack frame layout in _x86_Camellia_* routines, frame is allocated
# Slots are addressed relative to %esp as seen inside the callee, so the
# return address occupies 0(%esp) and the state words start at 4(%esp).
64 $__ra=&DWP(0,"esp"); # return address
65 $__s0=&DWP(4,"esp"); # s0 backing store
66 $__s1=&DWP(8,"esp"); # s1 backing store
67 $__s2=&DWP(12,"esp"); # s2 backing store
68 $__s3=&DWP(16,"esp"); # s3 backing store
69 $__end=&DWP(20,"esp"); # pointer to end/start of key schedule
71 # stack frame layout in Camellia_[en|de]crypt routines, which differs from
72 # above by 4 and overlaps by pointer to end/start of key schedule
76 # const unsigned int Camellia_SBOX[4][256];
77 # Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
78 # and [2][] - with [3][]. This is done to optimize code size.
# The values below are byte offsets from the Camellia_SBOX label. Lookups
# use an index scale of 8, so every 8-byte slot holds one entry of each
# interleaved pair; the second pair starts at 2048 = 256 slots * 8 bytes.
79 $SBOX1_1110=0; # Camellia_SBOX[0]
80 $SBOX4_4404=4; # Camellia_SBOX[1]
81 $SBOX2_0222=2048; # Camellia_SBOX[2]
82 $SBOX3_3033=2052; # Camellia_SBOX[3]
# Both tables are referenced through labels local to this module.
83 &static_label("Camellia_SIGMA");
84 &static_label("Camellia_SBOX");
# Emits the instruction sequence for one Camellia Feistel round.
# Arguments (read from @_):
#   @_[1]  seed  - byte displacement added to every key-schedule access
#                  (default 0); a negative seed makes the per-round key
#                  stride -8 instead of 8.
#   @_[2]  frame - displacement of the s[] backing store from %esp
#                  (default 0).
# $i, $j, $idx, $key and $Tbl are used below but assigned outside the lines
# visible here -- presumably $i is the round number from @_[0]; confirm.
86 sub Camellia_Feistel {
88 my $seed=defined(@_[1])?@_[1]:0;
89 my $scale=$seed<0?-8:8;
90 my $frame=defined(@_[2])?@_[2]:0;
# Pick the four working registers from @T, rotated by $j.
# NOTE(review): `my` applies to $t0 only; the comma separates expressions,
# so $t1..$t3 are assigned as package variables.
92 my $t0=@T[($j)%4],$t1=@T[($j+1)%4],$t2=@T[($j+2)%4],$t3=@T[($j+3)%4];
# Mix in the round key: $idx already holds the first key word.
94 &xor ($t0,$idx); # t0^=key[0]
95 &xor ($t1,&DWP($seed+$i*$scale+4,$key)); # t1^=key[1]
# Table lookups: each byte of t0/t1 indexes one of the four S-box tables,
# and the results are accumulated into t3 (from t0) and t2 (from t1).
96 &movz ($idx,&HB($t0)); # (t0>>8)&0xff
97 &mov ($t3,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t3=SBOX3_3033[0]
98 &movz ($idx,&LB($t0)); # (t0>>0)&0xff
99 &xor ($t3,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t3^=SBOX4_4404[0]
101 &movz ($idx,&LB($t1)); # (t1>>0)&0xff
102 &mov ($t2,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t2=SBOX1_1110[1]
103 &movz ($idx,&HB($t0)); # (t0>>24)&0xff
104 &xor ($t3,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t3^=SBOX1_1110[0]
105 &movz ($idx,&HB($t1)); # (t1>>8)&0xff
106 &xor ($t2,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t2^=SBOX4_4404[1]
108 &movz ($t0,&LB($t0)); # (t0>>16)&0xff
109 &xor ($t3,&DWP($SBOX2_0222,$Tbl,$t0,8)); # t3^=SBOX2_0222[0]
110 &movz ($idx,&HB($t1)); # (t1>>24)&0xff
# t0 and t1 are free from here on and are reused to fetch the other half of
# the state from its backing store on the stack.
111 &mov ($t0,&DWP($frame+4*(($j+3)%4),"esp")); # prefetch "s3"
112 &xor ($t2,$t3); # t2^=t3
113 &rotr ($t3,8); # t3=RightRotate(t3,8)
114 &xor ($t2,&DWP($SBOX2_0222,$Tbl,$idx,8)); # t2^=SBOX2_0222[1]
115 &movz ($idx,&LB($t1)); # (t1>>16)&0xff
116 &mov ($t1,&DWP($frame+4*(($j+2)%4),"esp")); # prefetch "s2"
117 &xor ($t3,$t0); # t3^=s3
118 &xor ($t2,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t2^=SBOX3_3033[1]
# Load the first key word of the next round into $idx before finishing.
119 &mov ($idx,&DWP($seed+($i+1)*$scale,$key)); # prefetch key[i+1]
120 &xor ($t3,$t2); # t3^=t2
# Write the updated half of the state back to the stack frame.
121 &mov (&DWP($frame+4*(($j+3)%4),"esp"),$t3); # s3=t3
122 &xor ($t2,$t1); # t2^=s2
123 &mov (&DWP($frame+4*(($j+2)%4),"esp"),$t2); # s2=t2
126 # void Camellia_EncryptBlock_Rounds(
128 # const Byte plaintext[],
129 # const KEY_TABLE_TYPE keyTable,
# Public entry point: encrypts one 16-byte block with an explicit
# grandRounds count.
#   wparam(0) = grandRounds          wparam(1) = plaintext pointer
#   wparam(2) = key schedule pointer wparam(3) = ciphertext pointer
131 &function_begin("Camellia_EncryptBlock_Rounds");
132 &mov ("eax",&wparam(0)); # load grandRounds
133 &mov ($idx,&wparam(1)); # load plaintext pointer
134 &mov ($key,&wparam(2)); # load key schedule pointer
137 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
140 # place stack frame just "above mod 1024" the key schedule
141 # this ensures that cache associativity of 2 suffices
142 &lea ("ecx",&DWP(-64-63,$key));
145 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
147 &add ("esp",4); # 4 is reserved for callee's return address
# eax becomes key+eax, the end-of-schedule pointer stored below (eax is
# presumably converted to a byte count in lines not visible here).
150 &lea ("eax",&DWP(0,$key,"eax"));
151 &mov ($_esp,"ebx"); # save %esp
152 &mov ($_end,"eax"); # save keyEnd
# Position-independent addressing: $Tbl is rebased onto Camellia_SBOX using
# the label difference from pic_point, so it is expected to hold
# pic_point's run-time address at the lea.
154 &call (&label("pic_point"));
155 &set_label("pic_point");
157 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
159 &mov (@T[0],&DWP(0,$idx)); # load plaintext
160 &mov (@T[1],&DWP(4,$idx));
161 &mov (@T[2],&DWP(8,$idx));
163 &mov (@T[3],&DWP(12,$idx));
# The core routine takes and returns the block in @T[0..3].
168 &call ("_x86_Camellia_encrypt");
172 &mov ($idx,&wparam(3)); # load ciphertext pointer
176 &mov (&DWP(0,$idx),@T[0]); # write ciphertext
177 &mov (&DWP(4,$idx),@T[1]);
178 &mov (&DWP(8,$idx),@T[2]);
179 &mov (&DWP(12,$idx),@T[3]);
180 &function_end("Camellia_EncryptBlock_Rounds");
# Thin front end: rewrites its first argument (keyBitLength) in place into
# a grandRounds count and jumps to Camellia_EncryptBlock_Rounds, which reads
# that same argument slot as grandRounds.
182 &function_begin_B("Camellia_EncryptBlock");
# eax is initialised in lines not visible here; the subtract and
# add-with-carry pair yields the count named in the comment below.
184 &sub ("eax",&wparam(0)); # load keyBitLength
186 &adc ("eax",0); # keyBitLength==128?3:4
187 &mov (&wparam(0),"eax");
188 &jmp (&label("Camellia_EncryptBlock_Rounds"));
189 &function_end_B("Camellia_EncryptBlock");
192 # void Camellia_encrypt(
193 # const unsigned char *in,
194 # unsigned char *out,
195 # const CAMELLIA_KEY *key)
# Public entry point with the (in, out, key) argument order. It mirrors
# Camellia_EncryptBlock_Rounds, except that grandRounds is read from byte
# offset 272 of the key structure instead of being passed as an argument.
196 &function_begin("Camellia_encrypt");
197 &mov ($idx,&wparam(0)); # load plaintext pointer
198 &mov ($key,&wparam(2)); # load key schedule pointer
201 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
203 &mov ("eax",&DWP(272,$key)); # load grandRounds counter
205 # place stack frame just "above mod 1024" the key schedule
206 # this ensures that cache associativity of 2 suffices
207 &lea ("ecx",&DWP(-64-63,$key));
210 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
212 &add ("esp",4); # 4 is reserved for callee's return address
# eax becomes key+eax, the end-of-schedule pointer stored below.
215 &lea ("eax",&DWP(0,$key,"eax"));
216 &mov ($_esp,"ebx"); # save %esp
217 &mov ($_end,"eax"); # save keyEnd
# Position-independent addressing of the S-box table via pic_point.
219 &call (&label("pic_point"));
220 &set_label("pic_point");
222 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
224 &mov (@T[0],&DWP(0,$idx)); # load plaintext
225 &mov (@T[1],&DWP(4,$idx));
226 &mov (@T[2],&DWP(8,$idx));
228 &mov (@T[3],&DWP(12,$idx));
# The core routine takes and returns the block in @T[0..3].
233 &call ("_x86_Camellia_encrypt");
237 &mov ($idx,&wparam(1)); # load ciphertext pointer
241 &mov (&DWP(0,$idx),@T[0]); # write ciphertext
242 &mov (&DWP(4,$idx),@T[1]);
243 &mov (&DWP(8,$idx),@T[2]);
244 &mov (&DWP(12,$idx),@T[3]);
245 &function_end("Camellia_encrypt");
# Internal encryption core. On entry @T[0..3] hold the input block and $key
# points at the key schedule; the result is left in @T[0..3]. The s[]
# backing store is addressed through $__s0..$__s3 (4..16(%esp)) because the
# return address occupies 0(%esp).
248 &function_begin_B("_x86_Camellia_encrypt");
# Initial whitening with the first four key words.
249 &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]
250 &xor (@T[1],&DWP(4,$key));
251 &xor (@T[2],&DWP(8,$key));
252 &xor (@T[3],&DWP(12,$key));
253 &mov ($idx,&DWP(16,$key)); # prefetch key[4]
255 &mov ($__s0,@T[0]); # save s[0-3]
# Each pass runs six Feistel rounds; key accesses are displaced by 16 bytes
# and the s[] store by 4 bytes from %esp.
260 &set_label("loop",16);
261 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); }
# The comparison feeding this branch is not among the visible lines.
265 &je (&label("done"));
# Key-dependent mixing layer applied between passes (see the per-line
# formulas in the trailing comments).
267 # @T[0-1] are preloaded, $idx is preloaded with key[0]
273 &or (@T[2],&DWP(12,$key));
274 &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1);
277 &mov ($idx,&DWP(4,$key));
278 &mov ($__s2,@T[2]); # s2^=s3|key[3];
280 &and (@T[2],&DWP(8,$key));
283 &mov ($__s0,@T[0]); # s0^=s1|key[1];
285 &mov ($idx,&DWP(16,$key)); # prefetch key[4]
286 &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1);
287 &jmp (&label("loop"));
# Final half swap and output whitening with four more key words.
289 &set_label("done",8);
290 &mov (@T[2],@T[0]); # SwapHalf
294 &xor (@T[0],$idx); # $idx is preloaded with key[0]
295 &xor (@T[1],&DWP(4,$key));
296 &xor (@T[2],&DWP(8,$key));
297 &xor (@T[3],&DWP(12,$key));
299 &function_end_B("_x86_Camellia_encrypt");
301 # void Camellia_DecryptBlock_Rounds(
303 # const Byte ciphertext[],
304 # const KEY_TABLE_TYPE keyTable,
# Public entry point: decrypts one 16-byte block with an explicit
# grandRounds count.
#   wparam(0) = grandRounds          wparam(1) = ciphertext pointer
#   wparam(2) = key schedule pointer wparam(3) = plaintext pointer
306 &function_begin("Camellia_DecryptBlock_Rounds");
307 &mov ("eax",&wparam(0)); # load grandRounds
308 &mov ($idx,&wparam(1)); # load ciphertext pointer
309 &mov ($key,&wparam(2)); # load key schedule pointer
312 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
315 # place stack frame just "above mod 1024" the key schedule
316 # this ensures that cache associativity of 2 suffices
317 &lea ("ecx",&DWP(-64-63,$key));
320 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
322 &add ("esp",4); # 4 is reserved for callee's return address
# Decryption walks the schedule backwards: the start pointer goes into the
# slot the callee sees as $__end (16(%esp) here is 20(%esp) after the call
# pushes its return address), and $key is advanced by eax to the far end.
325 &mov (&DWP(4*4,"esp"),$key); # save keyStart
326 &lea ($key,&DWP(0,$key,"eax"));
327 &mov (&DWP(5*4,"esp"),"ebx");# save %esp
# Position-independent addressing of the S-box table via pic_point.
329 &call (&label("pic_point"));
330 &set_label("pic_point");
332 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
334 &mov (@T[0],&DWP(0,$idx)); # load ciphertext
335 &mov (@T[1],&DWP(4,$idx));
336 &mov (@T[2],&DWP(8,$idx));
338 &mov (@T[3],&DWP(12,$idx));
343 &call ("_x86_Camellia_decrypt");
# Restore the stack pointer saved at 5*4(%esp) before touching arguments.
345 &mov ("esp",&DWP(5*4,"esp"));
347 &mov ($idx,&wparam(3)); # load plaintext pointer
351 &mov (&DWP(0,$idx),@T[0]); # write plaintext
352 &mov (&DWP(4,$idx),@T[1]);
353 &mov (&DWP(8,$idx),@T[2]);
354 &mov (&DWP(12,$idx),@T[3]);
355 &function_end("Camellia_DecryptBlock_Rounds");
# Thin front end: rewrites its first argument (keyBitLength) in place into
# a grandRounds count and jumps to Camellia_DecryptBlock_Rounds, which reads
# that same argument slot as grandRounds.
357 &function_begin_B("Camellia_DecryptBlock");
# eax is initialised in lines not visible here; the subtract and
# add-with-carry pair yields the count named in the comment below.
359 &sub ("eax",&wparam(0)); # load keyBitLength
361 &adc ("eax",0); # keyBitLength==128?3:4
362 &mov (&wparam(0),"eax");
363 &jmp (&label("Camellia_DecryptBlock_Rounds"));
364 &function_end_B("Camellia_DecryptBlock");
367 # void Camellia_decrypt(
368 # const unsigned char *in,
369 # unsigned char *out,
370 # const CAMELLIA_KEY *key)
# Public entry point with the (in, out, key) argument order. It mirrors
# Camellia_DecryptBlock_Rounds, except that grandRounds is read from byte
# offset 272 of the key structure instead of being passed as an argument.
371 &function_begin("Camellia_decrypt");
372 &mov ($idx,&wparam(0)); # load ciphertext pointer
373 &mov ($key,&wparam(2)); # load key schedule pointer
376 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
378 &mov ("eax",&DWP(272,$key)); # load grandRounds counter
380 # place stack frame just "above mod 1024" the key schedule
381 # this ensures that cache associativity of 2 suffices
382 &lea ("ecx",&DWP(-64-63,$key));
385 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
387 &add ("esp",4); # 4 is reserved for callee's return address
# The schedule start goes into the slot the callee sees as $__end, and $key
# is advanced by eax so the core can walk the schedule backwards.
390 &mov (&DWP(4*4,"esp"),$key); # save keyStart
391 &lea ($key,&DWP(0,$key,"eax"));
392 &mov (&DWP(5*4,"esp"),"ebx");# save %esp
# Position-independent addressing of the S-box table via pic_point.
394 &call (&label("pic_point"));
395 &set_label("pic_point");
397 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
399 &mov (@T[0],&DWP(0,$idx)); # load ciphertext
400 &mov (@T[1],&DWP(4,$idx));
401 &mov (@T[2],&DWP(8,$idx));
403 &mov (@T[3],&DWP(12,$idx));
408 &call ("_x86_Camellia_decrypt");
# Restore the stack pointer saved at 5*4(%esp) before touching arguments.
410 &mov ("esp",&DWP(5*4,"esp"));
412 &mov ($idx,&wparam(1)); # load plaintext pointer
416 &mov (&DWP(0,$idx),@T[0]); # write plaintext
417 &mov (&DWP(4,$idx),@T[1]);
418 &mov (&DWP(8,$idx),@T[2]);
419 &mov (&DWP(12,$idx),@T[3]);
420 &function_end("Camellia_decrypt");
# Internal decryption core. On entry @T[0..3] hold the input block and $key
# points at the end of the key schedule; the result is left in @T[0..3].
# Key words are consumed at negative displacements (seed -8), i.e. the
# schedule is walked backwards.
423 &function_begin_B("_x86_Camellia_decrypt");
# Initial whitening with the four key words at the current $key position.
424 &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]
425 &xor (@T[1],&DWP(4,$key));
426 &xor (@T[2],&DWP(8,$key));
427 &xor (@T[3],&DWP(12,$key));
428 &mov ($idx,&DWP(-8,$key)); # prefetch key[-2]
430 &mov ($__s0,@T[0]); # save s[0-3]
# Each pass runs six Feistel rounds with a negative key stride; the s[]
# store is 4 bytes above %esp.
435 &set_label("loop",16);
436 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); }
# The comparison feeding this branch is not among the visible lines.
440 &je (&label("done"));
# Key-dependent mixing layer applied between passes.
# NOTE(review): the trailing formulas below match the encrypt routine
# verbatim while the key displacements differ -- confirm the key indices.
442 # @T[0-1] are preloaded, $idx is preloaded with key[2]
448 &or (@T[2],&DWP(4,$key));
449 &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1);
452 &mov ($idx,&DWP(12,$key));
453 &mov ($__s2,@T[2]); # s2^=s3|key[3];
455 &and (@T[2],&DWP(0,$key));
458 &mov ($__s0,@T[0]); # s0^=s1|key[1];
460 &mov ($idx,&DWP(-8,$key)); # prefetch key[-2]
461 &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1);
462 &jmp (&label("loop"));
# Final half swap and output whitening.
464 &set_label("done",8);
465 &mov (@T[2],@T[0]); # SwapHalf
469 &xor (@T[2],$idx); # $idx is preloaded with key[2]
470 &xor (@T[3],&DWP(12,$key));
471 &xor (@T[0],&DWP(0,$key));
472 &xor (@T[1],&DWP(4,$key));
474 &function_end_B("_x86_Camellia_decrypt");
476 # shld is very slow on Intel P4 family. Even on AMD it limits
477 # instruction decode rate [because it's VectorPath] and consequently
478 # performance. PIII, PM and Core[2] seem to be the only ones which
479 # execute this code ~7% faster...
# shld-based 128-bit left rotate (body fragment; the enclosing sub header
# is not among the visible lines).
# Arguments: four register names forming the 128-bit value, the rotate
# count, the destination slot number, and the list of registers to store.
481 my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
# Each shld shifts one word left and pulls bits in from the next word;
# $idx supplies the bits for the last word (presumably a saved copy of the
# first word -- its setup is not visible here).
486 &shld ($i0,$i1,$rot);
487 &shld ($i1,$i2,$rot);
488 &shld ($i2,$i3,$rot);
489 &shld ($i3,$idx,$rot);
# Store a word only when it is next in the caller's list; every store
# consumes one list entry and advances the 4-byte slot counter.
491 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
492 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
493 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
494 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
497 # ... Implementing 128-bit rotate without shld gives >3x performance
498 # improvement on P4, only ~7% degradation on other Intel CPUs and
499 # not worse performance on AMD. This is therefore preferred.
# Non-shld 128-bit left rotate (body fragment; the sub header and the
# shift instructions are not among the visible lines).
# Arguments: four register names forming the 128-bit value, the rotate
# count, the destination slot number, and the list of registers to store.
501 my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
# Store a word only when it is next in the caller's list; every store
# consumes one list entry and advances the 4-byte slot counter. The -128
# displacement matches callers that pass $key advanced by 128.
513 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
519 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
523 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
524 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
# Second group of stores -- presumably an alternative path (e.g. for a
# zero rotate count); the surrounding control flow is not visible. Confirm.
526 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
527 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
528 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
529 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
# Stores up to four registers into key-schedule slot $rnd (body fragment;
# the sub header is not among the visible lines).
# Arguments: slot number, base register, then an optional numeric byte
# bias followed by one to four register names.
534 my ($rnd,$key,@T)=@_;
# Register names evaluate to 0 under int(), so a non-zero first element is
# taken to be the bias and removed from the register list.
535 my $bias=int(@T[0])?shift(@T):0;
# Slots are 8 bytes apart; registers go to consecutive 4-byte words.
537 &mov (&DWP($bias+$rnd*8+0,$key),@T[0]);
538 &mov (&DWP($bias+$rnd*8+4,$key),@T[1]) if ($#T>=1);
539 &mov (&DWP($bias+$rnd*8+8,$key),@T[2]) if ($#T>=2);
540 &mov (&DWP($bias+$rnd*8+12,$key),@T[3]) if ($#T>=3);
# Loads up to four registers from key-schedule slot $rnd (body fragment;
# the sub header is not among the visible lines).
# Arguments: slot number, base register, then an optional numeric byte
# bias followed by one to four register names.
544 my ($rnd,$key,@T)=@_;
# Register names evaluate to 0 under int(), so a non-zero first element is
# taken to be the bias and removed from the register list.
545 my $bias=int(@T[0])?shift(@T):0;
# Slots are 8 bytes apart; registers come from consecutive 4-byte words.
547 &mov (@T[0],&DWP($bias+$rnd*8+0,$key));
548 &mov (@T[1],&DWP($bias+$rnd*8+4,$key)) if ($#T>=1);
549 &mov (@T[2],&DWP($bias+$rnd*8+8,$key)) if ($#T>=2);
550 &mov (@T[3],&DWP($bias+$rnd*8+12,$key)) if ($#T>=3);
553 # void Camellia_Ekeygen(
554 # const int keyBitLength,
555 # const Byte *rawKey,
556 # KEY_TABLE_TYPE keyTable)
# Key-schedule generator.
#   wparam(0) = keyBitLength, wparam(1) = raw key, wparam(2) = key table.
# Leaves the grandRounds count (3 or 4) in eax and the address of the slot
# that follows the schedule (key table + 272) in edx.
# $step is used as the running SIGMA index; it is initialised outside the
# lines visible here.
557 &function_begin("Camellia_Ekeygen");
560 &stack_push(4); # place for s[0-3]
562 &mov ($Tbl,&wparam(0)); # load arguments
563 &mov ($idx,&wparam(1));
564 &mov ($key,&wparam(2));
566 &mov (@T[0],&DWP(0,$idx)); # load 0-127 bits
567 &mov (@T[1],&DWP(4,$idx));
568 &mov (@T[2],&DWP(8,$idx));
569 &mov (@T[3],&DWP(12,$idx));
576 &_saveround (0,$key,@T); # KL<<<0
# Branch on key length (the comparisons are not among the visible lines).
579 &je (&label("1st128"));
581 &mov (@T[0],&DWP(16,$idx)); # load 128-191 bits
582 &mov (@T[1],&DWP(20,$idx));
584 &je (&label("1st192"));
585 &mov (@T[2],&DWP(24,$idx)); # load 192-255 bits
586 &mov (@T[3],&DWP(28,$idx));
587 &jmp (&label("1st256"));
588 &set_label("1st192",4);
593 &set_label("1st256",4);
# Slot 4 is borrowed as scratch space for KR until the final layout is
# written.
599 &_saveround (4,$key,@T); # temporary storage for KR!
601 &xor (@T[0],&DWP(0*8+0,$key)); # KR^KL
602 &xor (@T[1],&DWP(0*8+4,$key));
603 &xor (@T[2],&DWP(1*8+0,$key));
604 &xor (@T[3],&DWP(1*8+4,$key));
# From here $Tbl addresses the S-box table and $key the SIGMA constants,
# both derived position-independently from pic_point.
606 &set_label("1st128",4);
607 &call (&label("pic_point"));
608 &set_label("pic_point");
610 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
611 &lea ($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));
# Two Feistel rounds keyed with SIGMA constants; the state is spilled to
# the four local stack words first.
613 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[0]
614 &mov (&swtmp(0),@T[0]); # save s[0-3]
615 &mov (&swtmp(1),@T[1]);
616 &mov (&swtmp(2),@T[2]);
617 &mov (&swtmp(3),@T[3]);
618 &Camellia_Feistel($step++);
619 &Camellia_Feistel($step++);
620 &mov (@T[2],&swtmp(2));
621 &mov (@T[3],&swtmp(3));
# Mix KL (slot 0 of the caller's key table) back in.
623 &mov ($idx,&wparam(2));
624 &xor (@T[0],&DWP(0*8+0,$idx)); # ^KL
625 &xor (@T[1],&DWP(0*8+4,$idx));
626 &xor (@T[2],&DWP(1*8+0,$idx));
627 &xor (@T[3],&DWP(1*8+4,$idx));
# Two more SIGMA-keyed Feistel rounds.
629 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[4]
630 &mov (&swtmp(0),@T[0]); # save s[0-3]
631 &mov (&swtmp(1),@T[1]);
632 &mov (&swtmp(2),@T[2]);
633 &mov (&swtmp(3),@T[3]);
634 &Camellia_Feistel($step++);
635 &Camellia_Feistel($step++);
636 &mov (@T[2],&swtmp(2));
637 &mov (@T[3],&swtmp(3));
# Re-test the key length; longer keys continue at 2nd256.
639 &mov ($idx,&wparam(0));
641 &jne (&label("2nd256"));
# ---- 128-bit path: subkeys are rotations of KA and KL ----
# $key is advanced by 128 so the helpers can use -128-biased displacements.
643 &mov ($key,&wparam(2));
644 &lea ($key,&DWP(128,$key)); # size optimization
647 &_saveround (2,$key,-128,@T); # KA<<<0
648 &_rotl128 (@T,15,6,@T); # KA<<<15
649 &_rotl128 (@T,15,8,@T); # KA<<<(15+15=30)
650 &_rotl128 (@T,15,12,@T[0],@T[1]); # KA<<<(30+15=45)
651 &_rotl128 (@T,15,14,@T); # KA<<<(45+15=60)
# A 32-bit rotate costs no instructions: only the Perl-side register order
# is permuted.
652 push (@T,shift(@T)); # rotl128(@T,32);
653 &_rotl128 (@T,2,20,@T); # KA<<<(60+32+2=94)
654 &_rotl128 (@T,17,24,@T); # KA<<<(94+17=111)
657 &_loadround (0,$key,-128,@T); # load KL
658 &_rotl128 (@T,15,4,@T); # KL<<<15
659 &_rotl128 (@T,30,10,@T); # KL<<<(15+30=45)
660 &_rotl128 (@T,15,13,@T[2],@T[3]); # KL<<<(45+15=60)
661 &_rotl128 (@T,17,16,@T); # KL<<<(60+17=77)
662 &_rotl128 (@T,17,18,@T); # KL<<<(77+17=94)
663 &_rotl128 (@T,17,22,@T); # KL<<<(94+17=111)
# Undo the Perl-side permutations so @T starts with "eax" again.
665 while (@T[0] ne "eax") # restore order
666 { unshift (@T,pop(@T)); }
668 &mov ("eax",3); # 3 grandRounds
669 &jmp (&label("done"));
# ---- longer-key path: subkeys are rotations of KB, KR, KA and KL ----
671 &set_label("2nd256",16);
672 &mov ($idx,&wparam(2));
673 &_saveround (6,$idx,@T); # temporary storage for KA!
675 &xor (@T[0],&DWP(4*8+0,$idx)); # KA^KR
676 &xor (@T[1],&DWP(4*8+4,$idx));
677 &xor (@T[2],&DWP(5*8+0,$idx));
678 &xor (@T[3],&DWP(5*8+4,$idx));
680 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[8]
681 &mov (&swtmp(0),@T[0]); # save s[0-3]
682 &mov (&swtmp(1),@T[1]);
683 &mov (&swtmp(2),@T[2]);
684 &mov (&swtmp(3),@T[3]);
685 &Camellia_Feistel($step++);
686 &Camellia_Feistel($step++);
687 &mov (@T[2],&swtmp(2));
688 &mov (@T[3],&swtmp(3));
690 &mov ($key,&wparam(2));
691 &lea ($key,&DWP(128,$key)); # size optimization
694 &_saveround (2,$key,-128,@T); # KB<<<0
695 &_rotl128 (@T,30,10,@T); # KB<<<30
696 &_rotl128 (@T,30,20,@T); # KB<<<(30+30=60)
697 push (@T,shift(@T)); # rotl128(@T,32);
698 &_rotl128 (@T,19,32,@T); # KB<<<(60+32+19=111)
701 &_loadround (4,$key,-128,@T); # load KR
702 &_rotl128 (@T,15,4,@T); # KR<<<15
703 &_rotl128 (@T,15,8,@T); # KR<<<(15+15=30)
704 &_rotl128 (@T,30,18,@T); # KR<<<(30+30=60)
705 push (@T,shift(@T)); # rotl128(@T,32);
706 &_rotl128 (@T,2,26,@T); # KR<<<(60+32+2=94)
709 &_loadround (6,$key,-128,@T); # load KA
710 &_rotl128 (@T,15,6,@T); # KA<<<15
711 &_rotl128 (@T,30,14,@T); # KA<<<(15+30=45)
712 push (@T,shift(@T)); # rotl128(@T,32);
713 &_rotl128 (@T,0,24,@T); # KA<<<(45+32+0=77)
714 &_rotl128 (@T,17,28,@T); # KA<<<(77+17=94)
717 &_loadround (0,$key,-128,@T); # load KL
718 push (@T,shift(@T)); # rotl128(@T,32);
719 &_rotl128 (@T,13,12,@T); # KL<<<(32+13=45)
720 &_rotl128 (@T,15,16,@T); # KL<<<(45+15=60)
721 &_rotl128 (@T,17,22,@T); # KL<<<(60+17=77)
722 push (@T,shift(@T)); # rotl128(@T,32);
723 &_rotl128 (@T,2,30,@T); # KL<<<(77+32+2=111)
# Undo the Perl-side permutations so @T starts with "eax" again.
725 while (@T[0] ne "eax") # restore order
726 { unshift (@T,pop(@T)); }
728 &mov ("eax",4); # 4 grandRounds
# $key was advanced by 128 above, so this is key table + 272.
730 &lea ("edx",&DWP(272-128,$key)); # end of key schedule
733 &function_end("Camellia_Ekeygen");
736 # int Camellia_set_key (
737 # const unsigned char *userKey,
# Public key-setup wrapper: validates its arguments, calls Camellia_Ekeygen
# and stores the returned grandRounds count at the address Ekeygen leaves
# in edx. The tests that feed the branches below, and the values returned
# on failure, are not among the visible lines.
740 &function_begin_B("Camellia_set_key");
# ecx = userKey, ebx = bits, edx = key (per the branch comments below).
742 &mov ("ecx",&wparam(0)); # pull arguments
743 &mov ("ebx",&wparam(1));
744 &mov ("edx",&wparam(2));
748 &jz (&label("done")); # userKey==NULL?
750 &jz (&label("done")); # key==NULL?
# Only 256, 192 and 128 are accepted as key sizes.
754 &je (&label("arg_ok")); # bits==256?
756 &je (&label("arg_ok")); # bits==192?
758 &jne (&label("done")); # bits!=128?
759 &set_label("arg_ok",4);
761 &push ("edx"); # push arguments
764 &call ("Camellia_Ekeygen");
767 # eax holds grandRounds and edx points at where to put it
768 &mov (&DWP(0,"edx"),"eax");
770 &set_label("done",4);
773 &function_end_B("Camellia_set_key");
777 112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
778 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
779 134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
780 166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77,
781 139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
782 223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215,
783 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
784 254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80,
785 170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
786 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148,
787 135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226,
788 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
789 233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
790 120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
791 114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
792 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
# Build-time helpers that expand an 8-bit S-box entry into the 32-bit word
# stored in the generated tables. The digits in each name give the byte
# layout from most to least significant byte (0 = zero byte).
# S1110: s = SBOX[i]; s goes into bytes 3, 2 and 1.
794 sub S1110 { my $i=shift; $i=@SBOX[$i]; return $i<<24|$i<<16|$i<<8; }
# S4404: the index is rotated left by one bit (8-bit) before the lookup;
# the result goes into bytes 3, 2 and 0.
795 sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; return $i<<24|$i<<16|$i; }
# S0222: the looked-up value is rotated left by one bit (8-bit) and goes
# into bytes 2, 1 and 0.
796 sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return $i<<16|$i<<8|$i; }
# S3033: the looked-up value is rotated right by one bit (8-bit) and goes
# into bytes 3, 1 and 0.
797 sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return $i<<24|$i<<8|$i; }
799 &set_label("Camellia_SIGMA",64);
801 0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2,
802 0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c,
803 0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd,
# Emit the S-box tables at a 64-byte-aligned label. Each data_word call
# writes one 8-byte slot holding a pair of entries, which is why the second
# table of a pair sits at offset +4 and lookups use an index scale of 8.
805 &set_label("Camellia_SBOX",64);
806 # tables are interleaved, remember?
# First 256 slots: S1110 at +0, S4404 at +4.
807 for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
# Next 256 slots (byte offset 2048 onward): S0222 at +0, S3033 at +4.
808 for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
810 # void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
811 # size_t length, const CAMELLIA_KEY *key,
812 # unsigned char *ivp,const int enc);
815 # -4(%esp) # return address 0(%esp)
816 # 0(%esp) # s0 4(%esp)
817 # 4(%esp) # s1 8(%esp)
818 # 8(%esp) # s2 12(%esp)
819 # 12(%esp) # s3 16(%esp)
820 # 16(%esp) # end of key schedule 20(%esp)
821 # 20(%esp) # %esp backup
# Frame slots used by Camellia_cbc_encrypt, above the s[]/keyEnd/%esp slots
# that it shares with the block routines. The function arguments are copied
# here because %esp is moved to a re-aligned frame.
822 my $_inp=&DWP(24,"esp"); #copy of wparam(0)
823 my $_out=&DWP(28,"esp"); #copy of wparam(1)
824 my $_len=&DWP(32,"esp"); #copy of wparam(2)
825 my $_key=&DWP(36,"esp"); #copy of wparam(3)
826 my $_ivp=&DWP(40,"esp"); #copy of wparam(4)
# $ivec and $_tmp deliberately name the same slot (44(%esp)).
827 my $ivec=&DWP(44,"esp"); #ivec[16]
828 my $_tmp=&DWP(44,"esp"); #volatile variable [yes, aliases with ivec]
# Readable aliases for the four state registers in @T.
829 my ($s0,$s1,$s2,$s3) = @T;
# CBC-mode driver.
#   wparam(0) = inp, wparam(1) = out, wparam(2) = length, wparam(3) = key,
#   wparam(4) = ivp, wparam(5) = enc (non-decrypt path when set; the test
#   itself is not among the visible lines).
# Operands written as `$reg eq "ecx" ? $reg : ""` (likewise for esi/edi)
# pass an empty operand if the register assignment ever changes --
# presumably a build-time tripwire for code that needs that exact register
# (the raw rep movsb/stosb sequences below); confirm.
831 &function_begin("Camellia_cbc_encrypt");
832 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
834 &je (&label("enc_out"));
839 &mov ($s0,&wparam(0)); # load inp
840 &mov ($s1,&wparam(1)); # load out
841 #&mov ($s2,&wparam(2)); # load len
842 &mov ($s3,&wparam(3)); # load key
843 &mov ($Tbl,&wparam(4)); # load ivp
845 # allocate aligned stack frame...
846 &lea ($idx,&DWP(-64,"esp"));
849 # place stack frame just "above mod 1024" the key schedule
850 # this ensures that cache associativity of 2 suffices
851 &lea ($key,&DWP(-64-63,$s3));
854 &and ($key,0x3C0); # modulo 1024, but aligned to cache-line
857 &mov ($key,&wparam(5)); # load enc
860 &add ("esp",4); # reserve for return address!
861 &mov ($_esp,$idx); # save %esp
# Arguments are copied into the new frame because wparam() is relative to
# the original stack pointer.
863 &mov ($_inp,$s0); # save copy of inp
864 &mov ($_out,$s1); # save copy of out
865 &mov ($_len,$s2); # save copy of len
866 &mov ($_key,$s3); # save copy of key
867 &mov ($_ivp,$Tbl); # save copy of ivp
869 &call (&label("pic_point")); # make it PIC!
870 &set_label("pic_point");
872 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
# Touch one word every 32 bytes of the S-box table, 128 bytes per pass
# (the loop counter is maintained in lines not visible here).
875 &set_label("prefetch_sbox",4);
876 &mov ($s0,&DWP(0,$Tbl));
877 &mov ($s1,&DWP(32,$Tbl));
878 &mov ($s2,&DWP(64,$Tbl));
879 &mov ($s3,&DWP(96,$Tbl));
880 &lea ($Tbl,&DWP(128,$Tbl));
882 &jnz (&label("prefetch_sbox"));
886 &mov ($s3,&DWP(272,$s0)); # load grandRounds
889 &je (&label("DECRYPT"));
#----------------------------- ENCRYPT -----------------------------#
# $s3 becomes key + $s3, the end-of-schedule pointer.
894 &lea ($s3,&DWP(0,$s0,$s3));
# Fewer than 16 bytes in total goes straight to the tail handler.
897 &test ($s2,0xFFFFFFF0);
898 &jz (&label("enc_tail")); # short input...
900 &mov ($s0,&DWP(0,$key)); # load iv
901 &mov ($s1,&DWP(4,$key));
# Per block: chaining value ^ input -> encrypt -> store; the stored block
# is the next chaining value.
903 &set_label("enc_loop",4);
904 &mov ($s2,&DWP(8,$key));
905 &mov ($s3,&DWP(12,$key));
907 &xor ($s0,&DWP(0,$idx)); # xor input data
908 &xor ($s1,&DWP(4,$idx));
909 &xor ($s2,&DWP(8,$idx));
911 &xor ($s3,&DWP(12,$idx));
913 &mov ($key,$_key); # load key
917 &call ("_x86_Camellia_encrypt");
919 &mov ($idx,$_inp); # load inp
920 &mov ($key,$_out); # load out
925 &mov (&DWP(0,$key),$s0); # save output data
927 &mov (&DWP(4,$key),$s1);
928 &mov (&DWP(8,$key),$s2);
929 &mov (&DWP(12,$key),$s3);
931 &mov ($s2,$_len); # load len
933 &lea ($idx,&DWP(16,$idx));
934 &mov ($_inp,$idx); # save inp
936 &lea ($s3,&DWP(16,$key));
937 &mov ($_out,$s3); # save out
# Keep looping while at least one full 16-byte block remains.
940 &test ($s2,0xFFFFFFF0);
941 &mov ($_len,$s2); # save len
942 &jnz (&label("enc_loop"));
# A non-zero remainder of fewer than 16 bytes goes to the tail handler.
944 &jnz (&label("enc_tail"));
# Finished: the last output block is written back as the caller's IV.
945 &mov ($idx,$_ivp); # load ivp
946 &mov ($s2,&DWP(8,$key)); # restore last dwords
947 &mov ($s3,&DWP(12,$key));
948 &mov (&DWP(0,$idx),$s0); # save ivec
949 &mov (&DWP(4,$idx),$s1);
950 &mov (&DWP(8,$idx),$s2);
951 &mov (&DWP(12,$idx),$s3);
955 &set_label("enc_out");
957 &pushf (); # kludge, never executed
# Tail: copy the partial block to the output, pad it with zeros, then run
# one more loop iteration using the output buffer as input.
959 &set_label("enc_tail",4);
960 &mov ($s0,$key eq "edi" ? $key : "");
961 &mov ($key,$_out); # load out
962 &push ($s0); # push ivp
965 &cmp ($key,$idx); # compare with inp
966 &je (&label("enc_in_place"));
# Raw 32-bit word: bytes 89 F6 (mov %esi,%esi, a two-byte filler) followed
# by F3 A4 (rep movsb).
968 &data_word(0xA4F3F689); # rep movsb # copy input
969 &jmp (&label("enc_skip_in_place"));
970 &set_label("enc_in_place");
971 &lea ($key,&DWP(0,$key,$s2));
972 &set_label("enc_skip_in_place");
# Raw 32-bit word: bytes 89 F6 followed by F3 AA (rep stosb).
976 &data_word(0xAAF3F689); # rep stosb # zero tail
977 &pop ($key); # pop ivp
979 &mov ($idx,$_out); # output as input
980 &mov ($s0,&DWP(0,$key));
981 &mov ($s1,&DWP(4,$key));
982 &mov ($_len,16); # len=16
983 &jmp (&label("enc_loop")); # one more spin...
985 #----------------------------- DECRYPT -----------------------------#
986 &set_label("DECRYPT",16);
# $s3 becomes key + $s3; decryption consumes the schedule from that end.
988 &lea ($s3,&DWP(0,$s0,$s3));
993 &je (&label("dec_in_place")); # in-place processing...
995 &mov ($key,$_ivp); # load ivp
# Out-of-place path: $_tmp holds a pointer to the current chaining block
# (the caller's IV first, then the previous ciphertext block in inp).
998 &set_label("dec_loop",4);
999 &mov ($s0,&DWP(0,$idx)); # read input
1000 &mov ($s1,&DWP(4,$idx));
1001 &mov ($s2,&DWP(8,$idx));
1003 &mov ($s3,&DWP(12,$idx));
1005 &mov ($key,$_key); # load key
1009 &call ("_x86_Camellia_decrypt");
1011 &mov ($key,$_tmp); # load ivp
1012 &mov ($idx,$_len); # load len
1017 &xor ($s0,&DWP(0,$key)); # xor iv
1019 &xor ($s1,&DWP(4,$key));
1020 &xor ($s2,&DWP(8,$key));
1021 &xor ($s3,&DWP(12,$key));
# Carry here means the remaining length was shorter than a full block (the
# subtraction that sets it is not among the visible lines).
1024 &jc (&label("dec_partial"));
1025 &mov ($_len,$idx); # save len
1026 &mov ($idx,$_inp); # load inp
1027 &mov ($key,$_out); # load out
1029 &mov (&DWP(0,$key),$s0); # write output
1030 &mov (&DWP(4,$key),$s1);
1031 &mov (&DWP(8,$key),$s2);
1032 &mov (&DWP(12,$key),$s3);
# The ciphertext block just consumed becomes the next chaining value.
1034 &mov ($_tmp,$idx); # save ivp
1035 &lea ($idx,&DWP(16,$idx));
1036 &mov ($_inp,$idx); # save inp
1038 &lea ($key,&DWP(16,$key));
1039 &mov ($_out,$key); # save out
1041 &jnz (&label("dec_loop"));
1042 &mov ($key,$_tmp); # load temp ivp
# Copy the final chaining block back into the caller's IV buffer.
1043 &set_label("dec_end");
1044 &mov ($idx,$_ivp); # load user ivp
1045 &mov ($s0,&DWP(0,$key)); # load iv
1046 &mov ($s1,&DWP(4,$key));
1047 &mov ($s2,&DWP(8,$key));
1048 &mov ($s3,&DWP(12,$key));
1049 &mov (&DWP(0,$idx),$s0); # copy back to user
1050 &mov (&DWP(4,$idx),$s1);
1051 &mov (&DWP(8,$idx),$s2);
1052 &mov (&DWP(12,$idx),$s3);
1053 &jmp (&label("dec_out"));
# Partial final block: the decrypted block is staged on the stack and only
# the remaining bytes are copied to the output.
1055 &set_label("dec_partial",4);
1057 &mov (&DWP(0,$key),$s0); # dump output to stack
1058 &mov (&DWP(4,$key),$s1);
1059 &mov (&DWP(8,$key),$s2);
1060 &mov (&DWP(12,$key),$s3);
1061 &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx));
1062 &mov ($idx eq "esi" ? $idx : "",$key);
1063 &mov ($key eq "edi" ? $key : "",$_out); # load out
1064 &data_word(0xA4F3F689); # rep movsb # copy output
1065 &mov ($key,$_inp); # use inp as temp ivp
1066 &jmp (&label("dec_end"));
# In-place path: the ciphertext block is copied to a temporary before it is
# overwritten, then becomes the next IV.
1068 &set_label("dec_in_place",4);
1069 &set_label("dec_in_place_loop");
1071 &mov ($s0,&DWP(0,$idx)); # read input
1072 &mov ($s1,&DWP(4,$idx));
1073 &mov ($s2,&DWP(8,$idx));
1074 &mov ($s3,&DWP(12,$idx));
1076 &mov (&DWP(0,$key),$s0); # copy to temp
1077 &mov (&DWP(4,$key),$s1);
1078 &mov (&DWP(8,$key),$s2);
1080 &mov (&DWP(12,$key),$s3);
1082 &mov ($key,$_key); # load key
1086 &call ("_x86_Camellia_decrypt");
1088 &mov ($key,$_ivp); # load ivp
1089 &mov ($idx,$_out); # load out
1094 &xor ($s0,&DWP(0,$key)); # xor iv
1096 &xor ($s1,&DWP(4,$key));
1097 &xor ($s2,&DWP(8,$key));
1098 &xor ($s3,&DWP(12,$key));
1100 &mov (&DWP(0,$idx),$s0); # write output
1101 &mov (&DWP(4,$idx),$s1);
1102 &mov (&DWP(8,$idx),$s2);
1103 &mov (&DWP(12,$idx),$s3);
1105 &lea ($idx,&DWP(16,$idx));
1106 &mov ($_out,$idx); # save out
1109 &mov ($s0,&DWP(0,$idx)); # read temp
1110 &mov ($s1,&DWP(4,$idx));
1111 &mov ($s2,&DWP(8,$idx));
1112 &mov ($s3,&DWP(12,$idx));
1114 &mov (&DWP(0,$key),$s0); # copy iv
1115 &mov (&DWP(4,$key),$s1);
1116 &mov (&DWP(8,$key),$s2);
1117 &mov (&DWP(12,$key),$s3);
1119 &mov ($idx,$_inp); # load inp
1121 &lea ($idx,&DWP(16,$idx));
1122 &mov ($_inp,$idx); # save inp
1124 &mov ($s2,$_len); # load len
# Carry here means the remaining length was shorter than a full block (the
# subtraction that sets it is not among the visible lines).
1126 &jc (&label("dec_in_place_partial"));
1127 &mov ($_len,$s2); # save len
1128 &jnz (&label("dec_in_place_loop"));
1129 &jmp (&label("dec_out"));
# Partial in-place tail: bytes past the requested length are copied back
# from the saved block in $ivec.
1131 &set_label("dec_in_place_partial",4);
1132 # one can argue if this is actually required...
1133 &mov ($key eq "edi" ? $key : "",$_out);
1134 &lea ($idx eq "esi" ? $idx : "",$ivec);
1135 &lea ($key,&DWP(0,$key,$s2));
1136 &lea ($idx,&DWP(16,$idx,$s2));
1137 &neg ($s2 eq "ecx" ? $s2 : "");
1138 &data_word(0xA4F3F689); # rep movsb # restore tail
1140 &set_label("dec_out",4);
1143 &function_end("Camellia_cbc_encrypt");
1146 &asciz("Camellia for x86 by <appro\@openssl.org>");