2 # Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
13 # This module may be used under the terms of either the GNU General
14 # Public License version 2 or later, the GNU Lesser General Public
15 # License version 2.1 or later, the Mozilla Public License version
16 # 1.1 or the BSD License. The exact terms of either license are
17 # distributed along with this module. For further details see
18 # http://www.openssl.org/~appro/camellia/.
19 # ====================================================================
21 # Performance in cycles per processed byte (less is better) in
22 # 'openssl speed ...' benchmark:
24 # AMD K8 Core2 PIII P4
25 # -evp camellia-128-ecb 21.5 22.8 27.0 28.9
26 # + over gcc 3.4.6 +90/11% +70/10% +53/4% +160/64%
27 # + over icc 8.0 +48/19% +21/15% +21/17% +55/37%
29 # camellia-128-cbc 17.3 21.1 23.9 25.9
31 # 128-bit key setup 196 280 256 240 cycles/key
32 # + over gcc 3.4.6 +30/0% +17/11% +11/0% +63/40%
33 # + over icc 8.0 +18/3% +10/0% +10/3% +21/10%
35 # Pairs of numbers in "+" rows represent performance improvement over
36 # compiler generated position-independent code, PIC, and non-PIC
37 # respectively. PIC results are of greater relevance, as this module
38 # is position-independent, i.e. suitable for a shared library or PIE.
39 # Position independence "costs" one register, which is why compilers
40 # are so close with non-PIC results, they have an extra register to
41 # spare. CBC results are better than ECB ones thanks to "zero-copy"
42 # private _x86_* interface, and are ~30-40% better than with compiler
43 # generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
44 # same CPU (where applicable).
46 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
47 push(@INC,"${dir}","${dir}../../perlasm");
52 $output = pop and open STDOUT,">$output";
54 &asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
56 @T=("eax","ebx","ecx","edx");
61 # stack frame layout in _x86_Camellia_* routines, frame is allocated
63 $__ra=&DWP(0,"esp"); # return address
64 $__s0=&DWP(4,"esp"); # s0 backing store
65 $__s1=&DWP(8,"esp"); # s1 backing store
66 $__s2=&DWP(12,"esp"); # s2 backing store
67 $__s3=&DWP(16,"esp"); # s3 backing store
68 $__end=&DWP(20,"esp"); # pointer to end/start of key schedule
70 # stack frame layout in Camellia_[en|de]crypt routines, which differs from
71 # above by 4 and overlaps by pointer to end/start of key schedule
75 # const unsigned int Camellia_SBOX[4][256];
76 # Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
77 # and [2][] - with [3][]. This is done to optimize code size.
78 $SBOX1_1110=0; # Camellia_SBOX[0]
79 $SBOX4_4404=4; # Camellia_SBOX[1]
80 $SBOX2_0222=2048; # Camellia_SBOX[2]
81 $SBOX3_3033=2052; # Camellia_SBOX[3]
82 &static_label("Camellia_SIGMA");
83 &static_label("Camellia_SBOX");
85 sub Camellia_Feistel {
87 my $seed=defined(@_[1])?@_[1]:0;
88 my $scale=$seed<0?-8:8;
89 my $frame=defined(@_[2])?@_[2]:0;
91 my $t0=@T[($j)%4],$t1=@T[($j+1)%4],$t2=@T[($j+2)%4],$t3=@T[($j+3)%4];
93 &xor ($t0,$idx); # t0^=key[0]
94 &xor ($t1,&DWP($seed+$i*$scale+4,$key)); # t1^=key[1]
95 &movz ($idx,&HB($t0)); # (t0>>8)&0xff
96 &mov ($t3,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t3=SBOX3_3033[0]
97 &movz ($idx,&LB($t0)); # (t0>>0)&0xff
98 &xor ($t3,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t3^=SBOX4_4404[0]
100 &movz ($idx,&LB($t1)); # (t1>>0)&0xff
101 &mov ($t2,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t2=SBOX1_1110[1]
102 &movz ($idx,&HB($t0)); # (t0>>24)&0xff
103 &xor ($t3,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t3^=SBOX1_1110[0]
104 &movz ($idx,&HB($t1)); # (t1>>8)&0xff
105 &xor ($t2,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t2^=SBOX4_4404[1]
107 &movz ($t0,&LB($t0)); # (t0>>16)&0xff
108 &xor ($t3,&DWP($SBOX2_0222,$Tbl,$t0,8)); # t3^=SBOX2_0222[0]
109 &movz ($idx,&HB($t1)); # (t1>>24)&0xff
110 &mov ($t0,&DWP($frame+4*(($j+3)%4),"esp")); # prefetch "s3"
111 &xor ($t2,$t3); # t2^=t3
112 &rotr ($t3,8); # t3=RightRotate(t3,8)
113 &xor ($t2,&DWP($SBOX2_0222,$Tbl,$idx,8)); # t2^=SBOX2_0222[1]
114 &movz ($idx,&LB($t1)); # (t1>>16)&0xff
115 &mov ($t1,&DWP($frame+4*(($j+2)%4),"esp")); # prefetch "s2"
116 &xor ($t3,$t0); # t3^=s3
117 &xor ($t2,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t2^=SBOX3_3033[1]
118 &mov ($idx,&DWP($seed+($i+1)*$scale,$key)); # prefetch key[i+1]
119 &xor ($t3,$t2); # t3^=t2
120 &mov (&DWP($frame+4*(($j+3)%4),"esp"),$t3); # s3=t3
121 &xor ($t2,$t1); # t2^=s2
122 &mov (&DWP($frame+4*(($j+2)%4),"esp"),$t2); # s2=t2
125 # void Camellia_EncryptBlock_Rounds(
126 # int grandRounds,
127 # const Byte plaintext[],
128 # const KEY_TABLE_TYPE keyTable,
129 # Byte ciphertext[])
130 &function_begin("Camellia_EncryptBlock_Rounds");
131 &mov ("eax",&wparam(0)); # load grandRounds
132 &mov ($idx,&wparam(1)); # load plaintext pointer
133 &mov ($key,&wparam(2)); # load key schedule pointer
136 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
139 # place stack frame just "above mod 1024" the key schedule
140 # this ensures that cache associativity of 2 suffices
141 &lea ("ecx",&DWP(-64-63,$key));
144 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
146 &add ("esp",4); # 4 is reserved for callee's return address
149 &lea ("eax",&DWP(0,$key,"eax"));
150 &mov ($_esp,"ebx"); # save %esp
151 &mov ($_end,"eax"); # save keyEnd
153 &call (&label("pic_point"));
154 &set_label("pic_point");
156 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
158 &mov (@T[0],&DWP(0,$idx)); # load plaintext
159 &mov (@T[1],&DWP(4,$idx));
160 &mov (@T[2],&DWP(8,$idx));
162 &mov (@T[3],&DWP(12,$idx));
167 &call ("_x86_Camellia_encrypt");
171 &mov ($idx,&wparam(3)); # load ciphertext pointer
175 &mov (&DWP(0,$idx),@T[0]); # write ciphertext
176 &mov (&DWP(4,$idx),@T[1]);
177 &mov (&DWP(8,$idx),@T[2]);
178 &mov (&DWP(12,$idx),@T[3]);
179 &function_end("Camellia_EncryptBlock_Rounds");
# Camellia_EncryptBlock: thin front end for Camellia_EncryptBlock_Rounds.
# It replaces argument 0 (keyBitLength) with the matching grandRounds count
# and then continues in Camellia_EncryptBlock_Rounds.
# NOTE(review): eax is presumably preloaded with 128 before the sub and with
# 3 before the adc by instructions not shown here -- confirm.
181 &function_begin_B("Camellia_EncryptBlock");
183 &sub ("eax",&wparam(0)); # eax-=keyBitLength, borrow sets CF
185 &adc ("eax",0); # keyBitLength==128?3:4
186 &mov (&wparam(0),"eax"); # argument 0 now carries grandRounds
187 &jmp (&label("Camellia_EncryptBlock_Rounds")); # continue in the _Rounds entry point
188 &function_end_B("Camellia_EncryptBlock");
191 # void Camellia_encrypt(
192 # const unsigned char *in,
193 # unsigned char *out,
194 # const CAMELLIA_KEY *key)
195 &function_begin("Camellia_encrypt");
196 &mov ($idx,&wparam(0)); # load plaintext pointer
197 &mov ($key,&wparam(2)); # load key schedule pointer
200 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
202 &mov ("eax",&DWP(272,$key)); # load grandRounds counter
204 # place stack frame just "above mod 1024" the key schedule
205 # this ensures that cache associativity of 2 suffices
206 &lea ("ecx",&DWP(-64-63,$key));
209 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
211 &add ("esp",4); # 4 is reserved for callee's return address
214 &lea ("eax",&DWP(0,$key,"eax"));
215 &mov ($_esp,"ebx"); # save %esp
216 &mov ($_end,"eax"); # save keyEnd
218 &call (&label("pic_point"));
219 &set_label("pic_point");
221 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
223 &mov (@T[0],&DWP(0,$idx)); # load plaintext
224 &mov (@T[1],&DWP(4,$idx));
225 &mov (@T[2],&DWP(8,$idx));
227 &mov (@T[3],&DWP(12,$idx));
232 &call ("_x86_Camellia_encrypt");
236 &mov ($idx,&wparam(1)); # load ciphertext pointer
240 &mov (&DWP(0,$idx),@T[0]); # write ciphertext
241 &mov (&DWP(4,$idx),@T[1]);
242 &mov (&DWP(8,$idx),@T[2]);
243 &mov (&DWP(12,$idx),@T[3]);
244 &function_end("Camellia_encrypt");
247 &function_begin_B("_x86_Camellia_encrypt");
248 &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]
249 &xor (@T[1],&DWP(4,$key));
250 &xor (@T[2],&DWP(8,$key));
251 &xor (@T[3],&DWP(12,$key));
252 &mov ($idx,&DWP(16,$key)); # prefetch key[4]
254 &mov ($__s0,@T[0]); # save s[0-3]
259 &set_label("loop",16);
260 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); }
264 &je (&label("done"));
266 # @T[0-1] are preloaded, $idx is preloaded with key[0]
272 &or (@T[2],&DWP(12,$key));
273 &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1);
276 &mov ($idx,&DWP(4,$key));
277 &mov ($__s2,@T[2]); # s2^=s3|key[3];
279 &and (@T[2],&DWP(8,$key));
282 &mov ($__s0,@T[0]); # s0^=s1|key[1];
284 &mov ($idx,&DWP(16,$key)); # prefetch key[4]
285 &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1);
286 &jmp (&label("loop"));
288 &set_label("done",8);
289 &mov (@T[2],@T[0]); # SwapHalf
293 &xor (@T[0],$idx); # $idx is preloaded with key[0]
294 &xor (@T[1],&DWP(4,$key));
295 &xor (@T[2],&DWP(8,$key));
296 &xor (@T[3],&DWP(12,$key));
298 &function_end_B("_x86_Camellia_encrypt");
300 # void Camellia_DecryptBlock_Rounds(
301 # int grandRounds,
302 # const Byte ciphertext[],
303 # const KEY_TABLE_TYPE keyTable,
304 # Byte plaintext[])
305 &function_begin("Camellia_DecryptBlock_Rounds");
306 &mov ("eax",&wparam(0)); # load grandRounds
307 &mov ($idx,&wparam(1)); # load ciphertext pointer
308 &mov ($key,&wparam(2)); # load key schedule pointer
311 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
314 # place stack frame just "above mod 1024" the key schedule
315 # this ensures that cache associativity of 2 suffices
316 &lea ("ecx",&DWP(-64-63,$key));
319 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
321 &add ("esp",4); # 4 is reserved for callee's return address
324 &mov (&DWP(4*4,"esp"),$key); # save keyStart
325 &lea ($key,&DWP(0,$key,"eax"));
326 &mov (&DWP(5*4,"esp"),"ebx");# save %esp
328 &call (&label("pic_point"));
329 &set_label("pic_point");
331 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
333 &mov (@T[0],&DWP(0,$idx)); # load ciphertext
334 &mov (@T[1],&DWP(4,$idx));
335 &mov (@T[2],&DWP(8,$idx));
337 &mov (@T[3],&DWP(12,$idx));
342 &call ("_x86_Camellia_decrypt");
344 &mov ("esp",&DWP(5*4,"esp"));
346 &mov ($idx,&wparam(3)); # load plaintext pointer
350 &mov (&DWP(0,$idx),@T[0]); # write plaintext
351 &mov (&DWP(4,$idx),@T[1]);
352 &mov (&DWP(8,$idx),@T[2]);
353 &mov (&DWP(12,$idx),@T[3]);
354 &function_end("Camellia_DecryptBlock_Rounds");
# Camellia_DecryptBlock: thin front end for Camellia_DecryptBlock_Rounds.
# It replaces argument 0 (keyBitLength) with the matching grandRounds count
# and then continues in Camellia_DecryptBlock_Rounds.
# NOTE(review): eax is presumably preloaded with 128 before the sub and with
# 3 before the adc by instructions not shown here -- confirm.
356 &function_begin_B("Camellia_DecryptBlock");
358 &sub ("eax",&wparam(0)); # eax-=keyBitLength, borrow sets CF
360 &adc ("eax",0); # keyBitLength==128?3:4
361 &mov (&wparam(0),"eax"); # argument 0 now carries grandRounds
362 &jmp (&label("Camellia_DecryptBlock_Rounds")); # continue in the _Rounds entry point
363 &function_end_B("Camellia_DecryptBlock");
366 # void Camellia_decrypt(
367 # const unsigned char *in,
368 # unsigned char *out,
369 # const CAMELLIA_KEY *key)
370 &function_begin("Camellia_decrypt");
371 &mov ($idx,&wparam(0)); # load ciphertext pointer
372 &mov ($key,&wparam(2)); # load key schedule pointer
375 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
377 &mov ("eax",&DWP(272,$key)); # load grandRounds counter
379 # place stack frame just "above mod 1024" the key schedule
380 # this ensures that cache associativity of 2 suffices
381 &lea ("ecx",&DWP(-64-63,$key));
384 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
386 &add ("esp",4); # 4 is reserved for callee's return address
389 &mov (&DWP(4*4,"esp"),$key); # save keyStart
390 &lea ($key,&DWP(0,$key,"eax"));
391 &mov (&DWP(5*4,"esp"),"ebx");# save %esp
393 &call (&label("pic_point"));
394 &set_label("pic_point");
396 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
398 &mov (@T[0],&DWP(0,$idx)); # load ciphertext
399 &mov (@T[1],&DWP(4,$idx));
400 &mov (@T[2],&DWP(8,$idx));
402 &mov (@T[3],&DWP(12,$idx));
407 &call ("_x86_Camellia_decrypt");
409 &mov ("esp",&DWP(5*4,"esp"));
411 &mov ($idx,&wparam(1)); # load plaintext pointer
415 &mov (&DWP(0,$idx),@T[0]); # write plaintext
416 &mov (&DWP(4,$idx),@T[1]);
417 &mov (&DWP(8,$idx),@T[2]);
418 &mov (&DWP(12,$idx),@T[3]);
419 &function_end("Camellia_decrypt");
# _x86_Camellia_decrypt: private decryption core used by the public decrypt
# entry points and the CBC routine in this file.  On entry @T holds the four
# state words and $key addresses the key words that are XORed in first.  The
# Feistel rounds are generated with a negative seed, so round keys are read
# at descending offsets from $key.  key[n] in the comments below denotes the
# dword at byte offset 4*n from $key.
422 &function_begin_B("_x86_Camellia_decrypt");
423 &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]
424 &xor (@T[1],&DWP(4,$key));
425 &xor (@T[2],&DWP(8,$key));
426 &xor (@T[3],&DWP(12,$key));
427 &mov ($idx,&DWP(-8,$key)); # prefetch key[-2]
429 &mov ($__s0,@T[0]); # save s[0-3]
434 &set_label("loop",16);
435 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); }
439 &je (&label("done"));
441 # @T[0-1] are preloaded, $idx is preloaded with key[2]
447 &or (@T[2],&DWP(4,$key));
448 &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[2],1);
451 &mov ($idx,&DWP(12,$key));
452 &mov ($__s2,@T[2]); # s2^=s3|key[1];
454 &and (@T[2],&DWP(0,$key));
457 &mov ($__s0,@T[0]); # s0^=s1|key[3];
459 &mov ($idx,&DWP(-8,$key)); # prefetch key[-2]
460 &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[0],1);
461 &jmp (&label("loop"));
463 &set_label("done",8);
464 &mov (@T[2],@T[0]); # SwapHalf
468 &xor (@T[2],$idx); # $idx is preloaded with key[2]
469 &xor (@T[3],&DWP(12,$key));
470 &xor (@T[0],&DWP(0,$key));
471 &xor (@T[1],&DWP(4,$key));
473 &function_end_B("_x86_Camellia_decrypt");
475 # shld is very slow on Intel P4 family. Even on AMD it limits
476 # instruction decode rate [because it's VectorPath] and consequently
477 # performance. PIII, PM and Core[2] seem to be the only ones which
478 # execute this code ~7% faster...
480 my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
485 &shld ($i0,$i1,$rot);
486 &shld ($i1,$i2,$rot);
487 &shld ($i2,$i3,$rot);
488 &shld ($i3,$idx,$rot);
490 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
491 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
492 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
493 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
496 # ... Implementing 128-bit rotate without shld gives >3x performance
497 # improvement on P4, only ~7% degradation on other Intel CPUs and
498 # not worse performance on AMD. This is therefore preferred.
500 my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
512 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
518 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
522 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
523 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
525 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
526 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
527 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
528 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
533 my ($rnd,$key,@T)=@_;
534 my $bias=int(@T[0])?shift(@T):0;
536 &mov (&DWP($bias+$rnd*8+0,$key),@T[0]);
537 &mov (&DWP($bias+$rnd*8+4,$key),@T[1]) if ($#T>=1);
538 &mov (&DWP($bias+$rnd*8+8,$key),@T[2]) if ($#T>=2);
539 &mov (&DWP($bias+$rnd*8+12,$key),@T[3]) if ($#T>=3);
543 my ($rnd,$key,@T)=@_;
544 my $bias=int(@T[0])?shift(@T):0;
546 &mov (@T[0],&DWP($bias+$rnd*8+0,$key));
547 &mov (@T[1],&DWP($bias+$rnd*8+4,$key)) if ($#T>=1);
548 &mov (@T[2],&DWP($bias+$rnd*8+8,$key)) if ($#T>=2);
549 &mov (@T[3],&DWP($bias+$rnd*8+12,$key)) if ($#T>=3);
552 # void Camellia_Ekeygen(
553 # const int keyBitLength,
554 # const Byte *rawKey,
555 # KEY_TABLE_TYPE keyTable)
556 &function_begin("Camellia_Ekeygen");
559 &stack_push(4); # place for s[0-3]
561 &mov ($Tbl,&wparam(0)); # load arguments
562 &mov ($idx,&wparam(1));
563 &mov ($key,&wparam(2));
565 &mov (@T[0],&DWP(0,$idx)); # load 0-127 bits
566 &mov (@T[1],&DWP(4,$idx));
567 &mov (@T[2],&DWP(8,$idx));
568 &mov (@T[3],&DWP(12,$idx));
575 &_saveround (0,$key,@T); # KL<<<0
578 &je (&label("1st128"));
580 &mov (@T[0],&DWP(16,$idx)); # load 128-191 bits
581 &mov (@T[1],&DWP(20,$idx));
583 &je (&label("1st192"));
584 &mov (@T[2],&DWP(24,$idx)); # load 192-255 bits
585 &mov (@T[3],&DWP(28,$idx));
586 &jmp (&label("1st256"));
587 &set_label("1st192",4);
592 &set_label("1st256",4);
598 &_saveround (4,$key,@T); # temporary storage for KR!
600 &xor (@T[0],&DWP(0*8+0,$key)); # KR^KL
601 &xor (@T[1],&DWP(0*8+4,$key));
602 &xor (@T[2],&DWP(1*8+0,$key));
603 &xor (@T[3],&DWP(1*8+4,$key));
605 &set_label("1st128",4);
606 &call (&label("pic_point"));
607 &set_label("pic_point");
609 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
610 &lea ($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));
612 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[0]
613 &mov (&swtmp(0),@T[0]); # save s[0-3]
614 &mov (&swtmp(1),@T[1]);
615 &mov (&swtmp(2),@T[2]);
616 &mov (&swtmp(3),@T[3]);
617 &Camellia_Feistel($step++);
618 &Camellia_Feistel($step++);
619 &mov (@T[2],&swtmp(2));
620 &mov (@T[3],&swtmp(3));
622 &mov ($idx,&wparam(2));
623 &xor (@T[0],&DWP(0*8+0,$idx)); # ^KL
624 &xor (@T[1],&DWP(0*8+4,$idx));
625 &xor (@T[2],&DWP(1*8+0,$idx));
626 &xor (@T[3],&DWP(1*8+4,$idx));
628 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[4]
629 &mov (&swtmp(0),@T[0]); # save s[0-3]
630 &mov (&swtmp(1),@T[1]);
631 &mov (&swtmp(2),@T[2]);
632 &mov (&swtmp(3),@T[3]);
633 &Camellia_Feistel($step++);
634 &Camellia_Feistel($step++);
635 &mov (@T[2],&swtmp(2));
636 &mov (@T[3],&swtmp(3));
638 &mov ($idx,&wparam(0));
640 &jne (&label("2nd256"));
642 &mov ($key,&wparam(2));
643 &lea ($key,&DWP(128,$key)); # size optimization
646 &_saveround (2,$key,-128,@T); # KA<<<0
647 &_rotl128 (@T,15,6,@T); # KA<<<15
648 &_rotl128 (@T,15,8,@T); # KA<<<(15+15=30)
649 &_rotl128 (@T,15,12,@T[0],@T[1]); # KA<<<(30+15=45)
650 &_rotl128 (@T,15,14,@T); # KA<<<(45+15=60)
651 push (@T,shift(@T)); # rotl128(@T,32);
652 &_rotl128 (@T,2,20,@T); # KA<<<(60+32+2=94)
653 &_rotl128 (@T,17,24,@T); # KA<<<(94+17=111)
656 &_loadround (0,$key,-128,@T); # load KL
657 &_rotl128 (@T,15,4,@T); # KL<<<15
658 &_rotl128 (@T,30,10,@T); # KL<<<(15+30=45)
659 &_rotl128 (@T,15,13,@T[2],@T[3]); # KL<<<(45+15=60)
660 &_rotl128 (@T,17,16,@T); # KL<<<(60+17=77)
661 &_rotl128 (@T,17,18,@T); # KL<<<(77+17=94)
662 &_rotl128 (@T,17,22,@T); # KL<<<(94+17=111)
664 while (@T[0] ne "eax") # restore order
665 { unshift (@T,pop(@T)); }
667 &mov ("eax",3); # 3 grandRounds
668 &jmp (&label("done"));
670 &set_label("2nd256",16);
671 &mov ($idx,&wparam(2));
672 &_saveround (6,$idx,@T); # temporary storage for KA!
674 &xor (@T[0],&DWP(4*8+0,$idx)); # KA^KR
675 &xor (@T[1],&DWP(4*8+4,$idx));
676 &xor (@T[2],&DWP(5*8+0,$idx));
677 &xor (@T[3],&DWP(5*8+4,$idx));
679 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[8]
680 &mov (&swtmp(0),@T[0]); # save s[0-3]
681 &mov (&swtmp(1),@T[1]);
682 &mov (&swtmp(2),@T[2]);
683 &mov (&swtmp(3),@T[3]);
684 &Camellia_Feistel($step++);
685 &Camellia_Feistel($step++);
686 &mov (@T[2],&swtmp(2));
687 &mov (@T[3],&swtmp(3));
689 &mov ($key,&wparam(2));
690 &lea ($key,&DWP(128,$key)); # size optimization
693 &_saveround (2,$key,-128,@T); # KB<<<0
694 &_rotl128 (@T,30,10,@T); # KB<<<30
695 &_rotl128 (@T,30,20,@T); # KB<<<(30+30=60)
696 push (@T,shift(@T)); # rotl128(@T,32);
697 &_rotl128 (@T,19,32,@T); # KB<<<(60+32+19=111)
700 &_loadround (4,$key,-128,@T); # load KR
701 &_rotl128 (@T,15,4,@T); # KR<<<15
702 &_rotl128 (@T,15,8,@T); # KR<<<(15+15=30)
703 &_rotl128 (@T,30,18,@T); # KR<<<(30+30=60)
704 push (@T,shift(@T)); # rotl128(@T,32);
705 &_rotl128 (@T,2,26,@T); # KR<<<(60+32+2=94)
708 &_loadround (6,$key,-128,@T); # load KA
709 &_rotl128 (@T,15,6,@T); # KA<<<15
710 &_rotl128 (@T,30,14,@T); # KA<<<(15+30=45)
711 push (@T,shift(@T)); # rotl128(@T,32);
712 &_rotl128 (@T,0,24,@T); # KA<<<(45+32+0=77)
713 &_rotl128 (@T,17,28,@T); # KA<<<(77+17=94)
716 &_loadround (0,$key,-128,@T); # load KL
717 push (@T,shift(@T)); # rotl128(@T,32);
718 &_rotl128 (@T,13,12,@T); # KL<<<(32+13=45)
719 &_rotl128 (@T,15,16,@T); # KL<<<(45+15=60)
720 &_rotl128 (@T,17,22,@T); # KL<<<(60+17=77)
721 push (@T,shift(@T)); # rotl128(@T,32);
722 &_rotl128 (@T,2,30,@T); # KL<<<(77+32+2=111)
724 while (@T[0] ne "eax") # restore order
725 { unshift (@T,pop(@T)); }
727 &mov ("eax",4); # 4 grandRounds
729 &lea ("edx",&DWP(272-128,$key)); # end of key schedule
732 &function_end("Camellia_Ekeygen");
735 # int Camellia_set_key (
736 # const unsigned char *userKey,
737 # int bits,
738 # CAMELLIA_KEY *key)
739 &function_begin_B("Camellia_set_key");
741 &mov ("ecx",&wparam(0)); # pull arguments
742 &mov ("ebx",&wparam(1));
743 &mov ("edx",&wparam(2));
747 &jz (&label("done")); # userKey==NULL?
749 &jz (&label("done")); # key==NULL?
753 &je (&label("arg_ok")); # bits==256?
755 &je (&label("arg_ok")); # bits==192?
757 &jne (&label("done")); # bits!=128?
758 &set_label("arg_ok",4);
760 &push ("edx"); # push arguments
763 &call ("Camellia_Ekeygen");
766 # eax holds grandRounds and edx points at where to put it
767 &mov (&DWP(0,"edx"),"eax");
769 &set_label("done",4);
772 &function_end_B("Camellia_set_key");
776 112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
777 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
778 134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
779 166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77,
780 139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
781 223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215,
782 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
783 254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80,
784 170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
785 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148,
786 135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226,
787 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
788 233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
789 120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
790 114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
791 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
# Expand one 8-bit S-box entry into the 32-bit word stored in the
# interleaved Camellia_SBOX tables.  Each digit of a helper's name names the
# S-box variant occupying one byte of the result, most significant byte
# first; 0 means the byte is left zero.
#   S1110($i): s = SBOX[$i]            -> bytes s,s,s,0
#   S4404($i): s = SBOX[rotl8($i,1)]   -> bytes s,s,0,s
#   S0222($i): s = rotl8(SBOX[$i],1)   -> bytes 0,s,s,s
#   S3033($i): s = rotr8(SBOX[$i],1)   -> bytes s,0,s,s
# $i is expected to be in 0..255.  Elements are read as $SBOX[...] (scalar
# element access) instead of through a one-element array slice, and the
# looked-up value is kept in its own variable rather than reusing $i.
sub S1110 { my $i=shift; my $s=$SBOX[$i]; return ($s<<24)|($s<<16)|($s<<8); }
sub S4404 { my $i=shift; my $s=$SBOX[($i<<1|$i>>7)&0xff]; return ($s<<24)|($s<<16)|$s; }
sub S0222 { my $i=shift; my $s=$SBOX[$i]; $s=($s<<1|$s>>7)&0xff; return ($s<<16)|($s<<8)|$s; }
sub S3033 { my $i=shift; my $s=$SBOX[$i]; $s=($s>>1|$s<<7)&0xff; return ($s<<24)|($s<<8)|$s; }
798 &set_label("Camellia_SIGMA",64);
800 0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2,
801 0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c,
802 0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd,
804 &set_label("Camellia_SBOX",64);
805 # tables are interleaved, remember?
806 for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
807 for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
809 # void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
810 # size_t length, const CAMELLIA_KEY *key,
811 # unsigned char *ivp,const int enc);
814 # -4(%esp) # return address 0(%esp)
815 # 0(%esp) # s0 4(%esp)
816 # 4(%esp) # s1 8(%esp)
817 # 8(%esp) # s2 12(%esp)
818 # 12(%esp) # s3 16(%esp)
819 # 16(%esp) # end of key schedule 20(%esp)
820 # 20(%esp) # %esp backup
821 my $_inp=&DWP(24,"esp"); #copy of wparam(0)
822 my $_out=&DWP(28,"esp"); #copy of wparam(1)
823 my $_len=&DWP(32,"esp"); #copy of wparam(2)
824 my $_key=&DWP(36,"esp"); #copy of wparam(3)
825 my $_ivp=&DWP(40,"esp"); #copy of wparam(4)
826 my $ivec=&DWP(44,"esp"); #ivec[16]
827 my $_tmp=&DWP(44,"esp"); #volatile variable [yes, aliases with ivec]
828 my ($s0,$s1,$s2,$s3) = @T;
830 &function_begin("Camellia_cbc_encrypt");
831 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
833 &je (&label("enc_out"));
838 &mov ($s0,&wparam(0)); # load inp
839 &mov ($s1,&wparam(1)); # load out
840 #&mov ($s2,&wparam(2)); # load len
841 &mov ($s3,&wparam(3)); # load key
842 &mov ($Tbl,&wparam(4)); # load ivp
844 # allocate aligned stack frame...
845 &lea ($idx,&DWP(-64,"esp"));
848 # place stack frame just "above mod 1024" the key schedule
849 # this ensures that cache associativity of 2 suffices
850 &lea ($key,&DWP(-64-63,$s3));
853 &and ($key,0x3C0); # modulo 1024, but aligned to cache-line
856 &mov ($key,&wparam(5)); # load enc
859 &add ("esp",4); # reserve for return address!
860 &mov ($_esp,$idx); # save %esp
862 &mov ($_inp,$s0); # save copy of inp
863 &mov ($_out,$s1); # save copy of out
864 &mov ($_len,$s2); # save copy of len
865 &mov ($_key,$s3); # save copy of key
866 &mov ($_ivp,$Tbl); # save copy of ivp
868 &call (&label("pic_point")); # make it PIC!
869 &set_label("pic_point");
871 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
874 &set_label("prefetch_sbox",4);
875 &mov ($s0,&DWP(0,$Tbl));
876 &mov ($s1,&DWP(32,$Tbl));
877 &mov ($s2,&DWP(64,$Tbl));
878 &mov ($s3,&DWP(96,$Tbl));
879 &lea ($Tbl,&DWP(128,$Tbl));
881 &jnz (&label("prefetch_sbox"));
885 &mov ($s3,&DWP(272,$s0)); # load grandRounds
888 &je (&label("DECRYPT"));
893 &lea ($s3,&DWP(0,$s0,$s3));
896 &test ($s2,0xFFFFFFF0);
897 &jz (&label("enc_tail")); # short input...
899 &mov ($s0,&DWP(0,$key)); # load iv
900 &mov ($s1,&DWP(4,$key));
902 &set_label("enc_loop",4);
903 &mov ($s2,&DWP(8,$key));
904 &mov ($s3,&DWP(12,$key));
906 &xor ($s0,&DWP(0,$idx)); # xor input data
907 &xor ($s1,&DWP(4,$idx));
908 &xor ($s2,&DWP(8,$idx));
910 &xor ($s3,&DWP(12,$idx));
912 &mov ($key,$_key); # load key
916 &call ("_x86_Camellia_encrypt");
918 &mov ($idx,$_inp); # load inp
919 &mov ($key,$_out); # load out
924 &mov (&DWP(0,$key),$s0); # save output data
926 &mov (&DWP(4,$key),$s1);
927 &mov (&DWP(8,$key),$s2);
928 &mov (&DWP(12,$key),$s3);
930 &mov ($s2,$_len); # load len
932 &lea ($idx,&DWP(16,$idx));
933 &mov ($_inp,$idx); # save inp
935 &lea ($s3,&DWP(16,$key));
936 &mov ($_out,$s3); # save out
939 &test ($s2,0xFFFFFFF0);
940 &mov ($_len,$s2); # save len
941 &jnz (&label("enc_loop"));
943 &jnz (&label("enc_tail"));
944 &mov ($idx,$_ivp); # load ivp
945 &mov ($s2,&DWP(8,$key)); # restore last dwords
946 &mov ($s3,&DWP(12,$key));
947 &mov (&DWP(0,$idx),$s0); # save ivec
948 &mov (&DWP(4,$idx),$s1);
949 &mov (&DWP(8,$idx),$s2);
950 &mov (&DWP(12,$idx),$s3);
954 &set_label("enc_out");
956 &pushf (); # kludge, never executed
958 &set_label("enc_tail",4);
959 &mov ($s0,$key eq "edi" ? $key : "");
960 &mov ($key,$_out); # load out
961 &push ($s0); # push ivp
964 &cmp ($key,$idx); # compare with inp
965 &je (&label("enc_in_place"));
967 &data_word(0xA4F3F689); # rep movsb # copy input
968 &jmp (&label("enc_skip_in_place"));
969 &set_label("enc_in_place");
970 &lea ($key,&DWP(0,$key,$s2));
971 &set_label("enc_skip_in_place");
975 &data_word(0xAAF3F689); # rep stosb # zero tail
976 &pop ($key); # pop ivp
978 &mov ($idx,$_out); # output as input
979 &mov ($s0,&DWP(0,$key));
980 &mov ($s1,&DWP(4,$key));
981 &mov ($_len,16); # len=16
982 &jmp (&label("enc_loop")); # one more spin...
984 #----------------------------- DECRYPT -----------------------------#
985 &set_label("DECRYPT",16);
987 &lea ($s3,&DWP(0,$s0,$s3));
992 &je (&label("dec_in_place")); # in-place processing...
994 &mov ($key,$_ivp); # load ivp
997 &set_label("dec_loop",4);
998 &mov ($s0,&DWP(0,$idx)); # read input
999 &mov ($s1,&DWP(4,$idx));
1000 &mov ($s2,&DWP(8,$idx));
1002 &mov ($s3,&DWP(12,$idx));
1004 &mov ($key,$_key); # load key
1008 &call ("_x86_Camellia_decrypt");
1010 &mov ($key,$_tmp); # load ivp
1011 &mov ($idx,$_len); # load len
1016 &xor ($s0,&DWP(0,$key)); # xor iv
1018 &xor ($s1,&DWP(4,$key));
1019 &xor ($s2,&DWP(8,$key));
1020 &xor ($s3,&DWP(12,$key));
1023 &jc (&label("dec_partial"));
1024 &mov ($_len,$idx); # save len
1025 &mov ($idx,$_inp); # load inp
1026 &mov ($key,$_out); # load out
1028 &mov (&DWP(0,$key),$s0); # write output
1029 &mov (&DWP(4,$key),$s1);
1030 &mov (&DWP(8,$key),$s2);
1031 &mov (&DWP(12,$key),$s3);
1033 &mov ($_tmp,$idx); # save ivp
1034 &lea ($idx,&DWP(16,$idx));
1035 &mov ($_inp,$idx); # save inp
1037 &lea ($key,&DWP(16,$key));
1038 &mov ($_out,$key); # save out
1040 &jnz (&label("dec_loop"));
1041 &mov ($key,$_tmp); # load temp ivp
1042 &set_label("dec_end");
1043 &mov ($idx,$_ivp); # load user ivp
1044 &mov ($s0,&DWP(0,$key)); # load iv
1045 &mov ($s1,&DWP(4,$key));
1046 &mov ($s2,&DWP(8,$key));
1047 &mov ($s3,&DWP(12,$key));
1048 &mov (&DWP(0,$idx),$s0); # copy back to user
1049 &mov (&DWP(4,$idx),$s1);
1050 &mov (&DWP(8,$idx),$s2);
1051 &mov (&DWP(12,$idx),$s3);
1052 &jmp (&label("dec_out"));
1054 &set_label("dec_partial",4);
1056 &mov (&DWP(0,$key),$s0); # dump output to stack
1057 &mov (&DWP(4,$key),$s1);
1058 &mov (&DWP(8,$key),$s2);
1059 &mov (&DWP(12,$key),$s3);
1060 &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx));
1061 &mov ($idx eq "esi" ? $idx : "",$key);
1062 &mov ($key eq "edi" ? $key : "",$_out); # load out
1063 &data_word(0xA4F3F689); # rep movsb # copy output
1064 &mov ($key,$_inp); # use inp as temp ivp
1065 &jmp (&label("dec_end"));
1067 &set_label("dec_in_place",4);
1068 &set_label("dec_in_place_loop");
1070 &mov ($s0,&DWP(0,$idx)); # read input
1071 &mov ($s1,&DWP(4,$idx));
1072 &mov ($s2,&DWP(8,$idx));
1073 &mov ($s3,&DWP(12,$idx));
1075 &mov (&DWP(0,$key),$s0); # copy to temp
1076 &mov (&DWP(4,$key),$s1);
1077 &mov (&DWP(8,$key),$s2);
1079 &mov (&DWP(12,$key),$s3);
1081 &mov ($key,$_key); # load key
1085 &call ("_x86_Camellia_decrypt");
1087 &mov ($key,$_ivp); # load ivp
1088 &mov ($idx,$_out); # load out
1093 &xor ($s0,&DWP(0,$key)); # xor iv
1095 &xor ($s1,&DWP(4,$key));
1096 &xor ($s2,&DWP(8,$key));
1097 &xor ($s3,&DWP(12,$key));
1099 &mov (&DWP(0,$idx),$s0); # write output
1100 &mov (&DWP(4,$idx),$s1);
1101 &mov (&DWP(8,$idx),$s2);
1102 &mov (&DWP(12,$idx),$s3);
1104 &lea ($idx,&DWP(16,$idx));
1105 &mov ($_out,$idx); # save out
1108 &mov ($s0,&DWP(0,$idx)); # read temp
1109 &mov ($s1,&DWP(4,$idx));
1110 &mov ($s2,&DWP(8,$idx));
1111 &mov ($s3,&DWP(12,$idx));
1113 &mov (&DWP(0,$key),$s0); # copy iv
1114 &mov (&DWP(4,$key),$s1);
1115 &mov (&DWP(8,$key),$s2);
1116 &mov (&DWP(12,$key),$s3);
1118 &mov ($idx,$_inp); # load inp
1120 &lea ($idx,&DWP(16,$idx));
1121 &mov ($_inp,$idx); # save inp
1123 &mov ($s2,$_len); # load len
1125 &jc (&label("dec_in_place_partial"));
1126 &mov ($_len,$s2); # save len
1127 &jnz (&label("dec_in_place_loop"));
1128 &jmp (&label("dec_out"));
1130 &set_label("dec_in_place_partial",4);
1131 # one can argue if this is actually required...
1132 &mov ($key eq "edi" ? $key : "",$_out);
1133 &lea ($idx eq "esi" ? $idx : "",$ivec);
1134 &lea ($key,&DWP(0,$key,$s2));
1135 &lea ($idx,&DWP(16,$idx,$s2));
1136 &neg ($s2 eq "ecx" ? $s2 : "");
1137 &data_word(0xA4F3F689); # rep movsb # restore tail
1139 &set_label("dec_out",4);
1142 &function_end("Camellia_cbc_encrypt");
1145 &asciz("Camellia for x86 by <appro\@openssl.org>");