3 # ====================================================================
4 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
6 # This module may be used under the terms of either the GNU General
7 # Public License version 2 or later, the GNU Lesser General Public
8 # License version 2.1 or later, the Mozilla Public License version
9 # 1.1 or the BSD License. The exact terms of either license are
10 # distributed along with this module. For further details see
11 # http://www.openssl.org/~appro/camellia/.
12 # ====================================================================
14 # Performance in cycles per processed byte (less is better) in
15 # 'openssl speed ...' benchmark:
17 # AMD K8 Core2 PIII P4
18 # -evp camellia-128-ecb 21.5 22.8 27.0 28.9
19 # + over gcc 3.4.6 +90/11% +70/10% +53/4% +160/64%
20 # + over icc 8.0 +48/19% +21/15% +21/17% +55/37%
22 # camellia-128-cbc 17.3 21.1 23.9 25.9
24 # 128-bit key setup 196 280 256 240 cycles/key
25 # + over gcc 3.4.6 +30/0% +17/11% +11/0% +63/40%
26 # + over icc 8.0 +18/3% +10/0% +10/3% +21/10%
28 # Pairs of numbers in "+" rows represent performance improvement over
29 # compiler generated position-independent code, PIC, and non-PIC
30 # respectively. PIC results are of greater relevance, as this module
31 # is position-independent, i.e. suitable for a shared library or PIE.
32 # Position independence "costs" one register, which is why compilers
33 # are so close with non-PIC results, they have an extra register to
34 # spare. CBC results are better than ECB ones thanks to "zero-copy"
35 # private _x86_* interface, and are ~30-40% better than with compiler
36 # generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
37 # same CPU (where applicable).
39 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
40 push(@INC,"${dir}","${dir}../../perlasm");
# Redirect everything the perlasm helpers print to the file named by
# $output.  The three-argument form keeps the file name from being parsed
# as part of the open mode, and the explicit check aborts generation
# instead of silently discarding the assembly when the file cannot be
# created.
open STDOUT, '>', $output or die "can't open $output: $!";
48 &asm_init($ARGV[0],"cmll-586.pl",$ARGV[$#ARGV] eq "386");
50 @T=("eax","ebx","ecx","edx");
55 # stack frame layout in _x86_Camellia_* routines, frame is allocated
57 $__ra=&DWP(0,"esp"); # return address
58 $__s0=&DWP(4,"esp"); # s0 backing store
59 $__s1=&DWP(8,"esp"); # s1 backing store
60 $__s2=&DWP(12,"esp"); # s2 backing store
61 $__s3=&DWP(16,"esp"); # s3 backing store
62 $__end=&DWP(20,"esp"); # pointer to end/start of key schedule
64 # stack frame layout in Camellia_[en|de]crypt routines, which differs from
65 # above by 4 and overlaps by pointer to end/start of key schedule
69 # const unsigned int Camellia_SBOX[4][256];
70 # Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
71 # and [2][] - with [3][]. This is done to optimize code size.
72 $SBOX1_1110=0; # Camellia_SBOX[0]
73 $SBOX4_4404=4; # Camellia_SBOX[1]
74 $SBOX2_0222=2048; # Camellia_SBOX[2]
75 $SBOX3_3033=2052; # Camellia_SBOX[3]
76 &static_label("Camellia_SIGMA");
77 &static_label("Camellia_SBOX");
# Emit the instruction sequence for one Feistel round via the perlasm
# helpers (&xor, &movz, &mov, &rotr).
#   @_[1] - optional "seed": displacement added to every key-schedule
#           offset; a negative seed selects $scale=-8, so successive
#           values of $i address the schedule downwards instead of upwards
#   @_[2] - optional byte offset of the s0..s3 backing store from %esp
# $i, $j, $idx, $key and $Tbl are used below but are assigned on lines
# that are not visible in this excerpt.
79 sub Camellia_Feistel {
81 my $seed=defined(@_[1])?@_[1]:0;
82 my $scale=$seed<0?-8:8;
83 my $frame=defined(@_[2])?@_[2]:0;
# Register roles are the entries of @T rotated by $j.
# NOTE(review): `my` binds only $t0 in this comma-separated statement;
# $t1, $t2 and $t3 are assigned as package variables.
85 my $t0=@T[($j)%4],$t1=@T[($j+1)%4],$t2=@T[($j+2)%4],$t3=@T[($j+3)%4];
# $idx enters the round already holding the first key word (see the
# prefetch near the end of this sequence).
87 &xor ($t0,$idx); # t0^=key[0]
88 &xor ($t1,&DWP($seed+$i*$scale+4,$key)); # t1^=key[1]
# Table lookups use scale 8 because each table entry is interleaved with
# its neighbour table (see the $SBOX*_* displacements).
89 &movz ($idx,&HB($t0)); # (t0>>8)&0xff
90 &mov ($t3,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t3=SBOX3_3033[0]
91 &movz ($idx,&LB($t0)); # (t0>>0)&0xff
92 &xor ($t3,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t3^=SBOX4_4404[0]
94 &movz ($idx,&LB($t1)); # (t1>>0)&0xff
95 &mov ($t2,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t2=SBOX1_1110[1]
# The (>>16)/(>>24) annotations from here on presumably rely on shift
# instructions emitted by lines that are not visible in this excerpt.
96 &movz ($idx,&HB($t0)); # (t0>>24)&0xff
97 &xor ($t3,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t3^=SBOX1_1110[0]
98 &movz ($idx,&HB($t1)); # (t1>>8)&0xff
99 &xor ($t2,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t2^=SBOX4_4404[1]
101 &movz ($t0,&LB($t0)); # (t0>>16)&0xff
102 &xor ($t3,&DWP($SBOX2_0222,$Tbl,$t0,8)); # t3^=SBOX2_0222[0]
103 &movz ($idx,&HB($t1)); # (t1>>24)&0xff
# $t0 and $t1 are free at this point and are reused to fetch the other
# half of the state from its stack backing store.
104 &mov ($t0,&DWP($frame+4*(($j+3)%4),"esp")); # prefetch "s3"
105 &xor ($t2,$t3); # t2^=t3
106 &rotr ($t3,8); # t3=RightRotate(t3,8)
107 &xor ($t2,&DWP($SBOX2_0222,$Tbl,$idx,8)); # t2^=SBOX2_0222[1]
108 &movz ($idx,&LB($t1)); # (t1>>16)&0xff
109 &mov ($t1,&DWP($frame+4*(($j+2)%4),"esp")); # prefetch "s2"
110 &xor ($t3,$t0); # t3^=s3
111 &xor ($t2,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t2^=SBOX3_3033[1]
# Leave the next round's first key word in $idx.
112 &mov ($idx,&DWP($seed+($i+1)*$scale,$key)); # prefetch key[i+1]
113 &xor ($t3,$t2); # t3^=t2
114 &mov (&DWP($frame+4*(($j+3)%4),"esp"),$t3); # s3=t3
115 &xor ($t2,$t1); # t2^=s2
116 &mov (&DWP($frame+4*(($j+2)%4),"esp"),$t2); # s2=t2
119 # void Camellia_EncryptBlock_Rounds(
121 # const Byte plaintext[],
122 # const KEY_TABLE_TYPE keyTable,
124 &function_begin("Camellia_EncryptBlock_Rounds");
125 &mov ("eax",&wparam(0)); # load grandRounds
126 &mov ($idx,&wparam(1)); # load plaintext pointer
127 &mov ($key,&wparam(2)); # load key schedule pointer
130 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
133 # place stack frame just "above mod 1024" the key schedule
134 # this ensures that cache associativity of 2 suffices
135 &lea ("ecx",&DWP(-64-63,$key));
138 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
140 &add ("esp",4); # 4 is reserved for callee's return address
143 &lea ("eax",&DWP(0,$key,"eax"));
144 &mov ($_esp,"ebx"); # save %esp
145 &mov ($_end,"eax"); # save keyEnd
147 &call (&label("pic_point"));
148 &set_label("pic_point");
150 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
152 &mov (@T[0],&DWP(0,$idx)); # load plaintext
153 &mov (@T[1],&DWP(4,$idx));
154 &mov (@T[2],&DWP(8,$idx));
156 &mov (@T[3],&DWP(12,$idx));
161 &call ("_x86_Camellia_encrypt");
165 &mov ($idx,&wparam(3)); # load ciphertext pointer
169 &mov (&DWP(0,$idx),@T[0]); # write ciphertext
170 &mov (&DWP(4,$idx),@T[1]);
171 &mov (&DWP(8,$idx),@T[2]);
172 &mov (&DWP(12,$idx),@T[3]);
173 &function_end("Camellia_EncryptBlock_Rounds");
175 &function_begin_B("Camellia_EncryptBlock");
177 &sub ("eax",&wparam(0)); # load keyBitLength
179 &adc ("eax",0); # keyBitLength==128?3:4
180 &mov (&wparam(0),"eax");
181 &jmp (&label("Camellia_EncryptBlock_Rounds"));
182 &function_end_B("Camellia_EncryptBlock");
185 # void Camellia_encrypt(
186 # const unsigned char *in,
187 # unsigned char *out,
188 # const CAMELLIA_KEY *key)
189 &function_begin("Camellia_encrypt");
190 &mov ($idx,&wparam(0)); # load plaintext pointer
191 &mov ($key,&wparam(2)); # load key schedule pointer
194 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
196 &mov ("eax",&DWP(272,$key)); # load grandRounds counter
198 # place stack frame just "above mod 1024" the key schedule
199 # this ensures that cache associativity of 2 suffices
200 &lea ("ecx",&DWP(-64-63,$key));
203 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
205 &add ("esp",4); # 4 is reserved for callee's return address
208 &lea ("eax",&DWP(0,$key,"eax"));
209 &mov ($_esp,"ebx"); # save %esp
210 &mov ($_end,"eax"); # save keyEnd
212 &call (&label("pic_point"));
213 &set_label("pic_point");
215 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
217 &mov (@T[0],&DWP(0,$idx)); # load plaintext
218 &mov (@T[1],&DWP(4,$idx));
219 &mov (@T[2],&DWP(8,$idx));
221 &mov (@T[3],&DWP(12,$idx));
226 &call ("_x86_Camellia_encrypt");
230 &mov ($idx,&wparam(1)); # load ciphertext pointer
234 &mov (&DWP(0,$idx),@T[0]); # write ciphertext
235 &mov (&DWP(4,$idx),@T[1]);
236 &mov (&DWP(8,$idx),@T[2]);
237 &mov (&DWP(12,$idx),@T[3]);
238 &function_end("Camellia_encrypt");
241 &function_begin_B("_x86_Camellia_encrypt");
242 &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]
243 &xor (@T[1],&DWP(4,$key));
244 &xor (@T[2],&DWP(8,$key));
245 &xor (@T[3],&DWP(12,$key));
246 &mov ($idx,&DWP(16,$key)); # prefetch key[4]
248 &mov ($__s0,@T[0]); # save s[0-3]
253 &set_label("loop",16);
254 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); }
258 &je (&label("done"));
260 # @T[0-1] are preloaded, $idx is preloaded with key[0]
266 &or (@T[2],&DWP(12,$key));
267 &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1);
270 &mov ($idx,&DWP(4,$key));
271 &mov ($__s2,@T[2]); # s2^=s3|key[3];
273 &and (@T[2],&DWP(8,$key));
276 &mov ($__s0,@T[0]); # s0^=s1|key[1];
278 &mov ($idx,&DWP(16,$key)); # prefetch key[4]
279 &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1);
280 &jmp (&label("loop"));
282 &set_label("done",8);
283 &mov (@T[2],@T[0]); # SwapHalf
287 &xor (@T[0],$idx); # $idx is preloaded with key[0]
288 &xor (@T[1],&DWP(4,$key));
289 &xor (@T[2],&DWP(8,$key));
290 &xor (@T[3],&DWP(12,$key));
292 &function_end_B("_x86_Camellia_encrypt");
294 # void Camellia_DecryptBlock_Rounds(
296 # const Byte ciphertext[],
297 # const KEY_TABLE_TYPE keyTable,
299 &function_begin("Camellia_DecryptBlock_Rounds");
300 &mov ("eax",&wparam(0)); # load grandRounds
301 &mov ($idx,&wparam(1)); # load ciphertext pointer
302 &mov ($key,&wparam(2)); # load key schedule pointer
305 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
308 # place stack frame just "above mod 1024" the key schedule
309 # this ensures that cache associativity of 2 suffices
310 &lea ("ecx",&DWP(-64-63,$key));
313 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
315 &add ("esp",4); # 4 is reserved for callee's return address
318 &mov (&DWP(4*4,"esp"),$key); # save keyStart
319 &lea ($key,&DWP(0,$key,"eax"));
320 &mov (&DWP(5*4,"esp"),"ebx");# save %esp
322 &call (&label("pic_point"));
323 &set_label("pic_point");
325 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
327 &mov (@T[0],&DWP(0,$idx)); # load ciphertext
328 &mov (@T[1],&DWP(4,$idx));
329 &mov (@T[2],&DWP(8,$idx));
331 &mov (@T[3],&DWP(12,$idx));
336 &call ("_x86_Camellia_decrypt");
338 &mov ("esp",&DWP(5*4,"esp"));
340 &mov ($idx,&wparam(3)); # load plaintext pointer
344 &mov (&DWP(0,$idx),@T[0]); # write plaintext
345 &mov (&DWP(4,$idx),@T[1]);
346 &mov (&DWP(8,$idx),@T[2]);
347 &mov (&DWP(12,$idx),@T[3]);
348 &function_end("Camellia_DecryptBlock_Rounds");
350 &function_begin_B("Camellia_DecryptBlock");
352 &sub ("eax",&wparam(0)); # load keyBitLength
354 &adc ("eax",0); # keyBitLength==128?3:4
355 &mov (&wparam(0),"eax");
356 &jmp (&label("Camellia_DecryptBlock_Rounds"));
357 &function_end_B("Camellia_DecryptBlock");
360 # void Camellia_decrypt(
361 # const unsigned char *in,
362 # unsigned char *out,
363 # const CAMELLIA_KEY *key)
364 &function_begin("Camellia_decrypt");
365 &mov ($idx,&wparam(0)); # load ciphertext pointer
366 &mov ($key,&wparam(2)); # load key schedule pointer
369 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
371 &mov ("eax",&DWP(272,$key)); # load grandRounds counter
373 # place stack frame just "above mod 1024" the key schedule
374 # this ensures that cache associativity of 2 suffices
375 &lea ("ecx",&DWP(-64-63,$key));
378 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
380 &add ("esp",4); # 4 is reserved for callee's return address
383 &mov (&DWP(4*4,"esp"),$key); # save keyStart
384 &lea ($key,&DWP(0,$key,"eax"));
385 &mov (&DWP(5*4,"esp"),"ebx");# save %esp
387 &call (&label("pic_point"));
388 &set_label("pic_point");
390 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
392 &mov (@T[0],&DWP(0,$idx)); # load ciphertext
393 &mov (@T[1],&DWP(4,$idx));
394 &mov (@T[2],&DWP(8,$idx));
396 &mov (@T[3],&DWP(12,$idx));
401 &call ("_x86_Camellia_decrypt");
403 &mov ("esp",&DWP(5*4,"esp"));
405 &mov ($idx,&wparam(1)); # load plaintext pointer
409 &mov (&DWP(0,$idx),@T[0]); # write plaintext
410 &mov (&DWP(4,$idx),@T[1]);
411 &mov (&DWP(8,$idx),@T[2]);
412 &mov (&DWP(12,$idx),@T[3]);
413 &function_end("Camellia_decrypt");
# Private decryption core.  On entry @T holds the input block and $key
# points into the key schedule.  The state is XORed with key[0-3], then
# groups of six Feistel rounds are emitted with seed -8 (the schedule is
# addressed downwards) and frame offset 4.  Between groups the sequence
# following the "je done" branch is executed and control jumps back to
# "loop"; "done" applies the final key XOR.  Several instructions of this
# routine are not visible in this excerpt.
416 &function_begin_B("_x86_Camellia_decrypt");
417 &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]
418 &xor (@T[1],&DWP(4,$key));
419 &xor (@T[2],&DWP(8,$key));
420 &xor (@T[3],&DWP(12,$key));
421 &mov ($idx,&DWP(-8,$key)); # prefetch key[-2]
423 &mov ($__s0,@T[0]); # save s[0-3]
428 &set_label("loop",16);
429 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); }
433 &je (&label("done"));
# NOTE(review): the s0..s3 formulas in the trailing comments below read
# the same as in _x86_Camellia_encrypt, although the key displacements
# used here are 4, 12 and 0 rather than 12, 4 and 8 - confirm the key[]
# indices they quote.
435 # @T[0-1] are preloaded, $idx is preloaded with key[2]
441 &or (@T[2],&DWP(4,$key));
442 &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1);
445 &mov ($idx,&DWP(12,$key));
446 &mov ($__s2,@T[2]); # s2^=s3|key[3];
448 &and (@T[2],&DWP(0,$key));
451 &mov ($__s0,@T[0]); # s0^=s1|key[1];
453 &mov ($idx,&DWP(-8,$key)); # prefetch key[-2]
454 &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1);
455 &jmp (&label("loop"));
457 &set_label("done",8);
458 &mov (@T[2],@T[0]); # SwapHalf
462 &xor (@T[2],$idx); # $idx is preloaded with key[2]
463 &xor (@T[3],&DWP(12,$key));
464 &xor (@T[0],&DWP(0,$key));
465 &xor (@T[1],&DWP(4,$key));
467 &function_end_B("_x86_Camellia_decrypt");
469 # shld is very slow on Intel P4 family. Even on AMD it limits
470 # instruction decode rate [because it's VectorPath] and consequently
471 # performance. PIII, PM and Core[2] seem to be the only ones which
472 # execute this code ~7% faster...
# Body of the shld-based 128-bit left-rotate helper (the enclosing `sub`
# line is not visible in this excerpt).
#   $i0..$i3 - names of the four registers holding the 128-bit value
#   $rot     - shift count passed to every shld
#   $rnd     - index of the next 32-bit slot to fill at -128+4*$rnd($key)
#   @T       - registers still to be stored, in the order they are expected
474 my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
# Each register is shifted left by $rot and refilled from the next one;
# the last one is refilled from $idx (presumably a saved copy of $i0 -
# the line that loads it is not visible here).
479 &shld ($i0,$i1,$rot);
480 &shld ($i1,$i2,$rot);
481 &shld ($i2,$i3,$rot);
482 &shld ($i3,$idx,$rot);
# A register is stored only when it is the next entry of @T; emitting the
# store consumes that entry and advances $rnd.
484 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
485 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
486 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
487 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
490 # ... Implementing 128-bit rotate without shld gives >3x performance
491 # improvement on P4, only ~7% degradation on other Intel CPUs and
492 # not worse performance on AMD. This is therefore preferred.
# Body of the 128-bit left-rotate helper that avoids shld (the enclosing
# `sub` line and the rotate instructions themselves are not visible in
# this excerpt).  Arguments are the same as for the shld variant: four
# register names, the rotate count, the index of the next 32-bit slot and
# the list of registers still to be stored.
494 my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
# Each store below is emitted only when the register is the next entry of
# @T; emitting it consumes that entry and advances $rnd.  The checks are
# spread over the sequence, with a second full pass at the end.
506 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
512 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
516 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
517 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
519 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
520 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
521 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
522 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
# Body of the store helper, presumably _saveround (the enclosing `sub`
# line is not visible in this excerpt).  Writes up to four registers to
# consecutive 32-bit slots starting $bias+$rnd*8 bytes from $key.
#   $rnd - slot index, in units of 8 bytes
#   $key - register used as the base address
#   @T   - optional numeric bias followed by one to four register names
527 my ($rnd,$key,@T)=@_;
# A leading element that int() evaluates to non-zero (e.g. -128) is taken
# as the displacement bias; register names evaluate to 0 and are kept.
528 my $bias=int(@T[0])?shift(@T):0;
530 &mov (&DWP($bias+$rnd*8+0,$key),@T[0]);
# Remaining stores are emitted only for registers actually supplied.
531 &mov (&DWP($bias+$rnd*8+4,$key),@T[1]) if ($#T>=1);
532 &mov (&DWP($bias+$rnd*8+8,$key),@T[2]) if ($#T>=2);
533 &mov (&DWP($bias+$rnd*8+12,$key),@T[3]) if ($#T>=3);
# Body of the load helper, presumably _loadround (the enclosing `sub`
# line is not visible in this excerpt).  Reads up to four registers from
# consecutive 32-bit slots starting $bias+$rnd*8 bytes from $key.
#   $rnd - slot index, in units of 8 bytes
#   $key - register used as the base address
#   @T   - optional numeric bias followed by one to four register names
537 my ($rnd,$key,@T)=@_;
# A leading element that int() evaluates to non-zero (e.g. -128) is taken
# as the displacement bias; register names evaluate to 0 and are kept.
538 my $bias=int(@T[0])?shift(@T):0;
540 &mov (@T[0],&DWP($bias+$rnd*8+0,$key));
# Remaining loads are emitted only for registers actually supplied.
541 &mov (@T[1],&DWP($bias+$rnd*8+4,$key)) if ($#T>=1);
542 &mov (@T[2],&DWP($bias+$rnd*8+8,$key)) if ($#T>=2);
543 &mov (@T[3],&DWP($bias+$rnd*8+12,$key)) if ($#T>=3);
546 # void Camellia_Ekeygen(
547 # const int keyBitLength,
548 # const Byte *rawKey,
549 # KEY_TABLE_TYPE keyTable)
550 &function_begin("Camellia_Ekeygen");
553 &stack_push(4); # place for s[0-3]
555 &mov ($Tbl,&wparam(0)); # load arguments
556 &mov ($idx,&wparam(1));
557 &mov ($key,&wparam(2));
559 &mov (@T[0],&DWP(0,$idx)); # load 0-127 bits
560 &mov (@T[1],&DWP(4,$idx));
561 &mov (@T[2],&DWP(8,$idx));
562 &mov (@T[3],&DWP(12,$idx));
569 &_saveround (0,$key,@T); # KL<<<0
572 &je (&label("1st128"));
574 &mov (@T[0],&DWP(16,$idx)); # load 128-191 bits
575 &mov (@T[1],&DWP(20,$idx));
577 &je (&label("1st192"));
578 &mov (@T[2],&DWP(24,$idx)); # load 192-255 bits
579 &mov (@T[3],&DWP(28,$idx));
580 &jmp (&label("1st256"));
581 &set_label("1st192",4);
586 &set_label("1st256",4);
592 &_saveround (4,$key,@T); # temporary storage for KR!
594 &xor (@T[0],&DWP(0*8+0,$key)); # KR^KL
595 &xor (@T[1],&DWP(0*8+4,$key));
596 &xor (@T[2],&DWP(1*8+0,$key));
597 &xor (@T[3],&DWP(1*8+4,$key));
599 &set_label("1st128",4);
600 &call (&label("pic_point"));
601 &set_label("pic_point");
603 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
604 &lea ($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));
606 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[0]
607 &mov (&swtmp(0),@T[0]); # save s[0-3]
608 &mov (&swtmp(1),@T[1]);
609 &mov (&swtmp(2),@T[2]);
610 &mov (&swtmp(3),@T[3]);
611 &Camellia_Feistel($step++);
612 &Camellia_Feistel($step++);
613 &mov (@T[2],&swtmp(2));
614 &mov (@T[3],&swtmp(3));
616 &mov ($idx,&wparam(2));
617 &xor (@T[0],&DWP(0*8+0,$idx)); # ^KL
618 &xor (@T[1],&DWP(0*8+4,$idx));
619 &xor (@T[2],&DWP(1*8+0,$idx));
620 &xor (@T[3],&DWP(1*8+4,$idx));
622 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[4]
623 &mov (&swtmp(0),@T[0]); # save s[0-3]
624 &mov (&swtmp(1),@T[1]);
625 &mov (&swtmp(2),@T[2]);
626 &mov (&swtmp(3),@T[3]);
627 &Camellia_Feistel($step++);
628 &Camellia_Feistel($step++);
629 &mov (@T[2],&swtmp(2));
630 &mov (@T[3],&swtmp(3));
632 &mov ($idx,&wparam(0));
634 &jne (&label("2nd256"));
636 &mov ($key,&wparam(2));
637 &lea ($key,&DWP(128,$key)); # size optimization
640 &_saveround (2,$key,-128,@T); # KA<<<0
641 &_rotl128 (@T,15,6,@T); # KA<<<15
642 &_rotl128 (@T,15,8,@T); # KA<<<(15+15=30)
643 &_rotl128 (@T,15,12,@T[0],@T[1]); # KA<<<(30+15=45)
644 &_rotl128 (@T,15,14,@T); # KA<<<(45+15=60)
645 push (@T,shift(@T)); # rotl128(@T,32);
646 &_rotl128 (@T,2,20,@T); # KA<<<(60+32+2=94)
647 &_rotl128 (@T,17,24,@T); # KA<<<(94+17=111)
650 &_loadround (0,$key,-128,@T); # load KL
651 &_rotl128 (@T,15,4,@T); # KL<<<15
652 &_rotl128 (@T,30,10,@T); # KL<<<(15+30=45)
653 &_rotl128 (@T,15,13,@T[2],@T[3]); # KL<<<(45+15=60)
654 &_rotl128 (@T,17,16,@T); # KL<<<(60+17=77)
655 &_rotl128 (@T,17,18,@T); # KL<<<(77+17=94)
656 &_rotl128 (@T,17,22,@T); # KL<<<(94+17=111)
658 while (@T[0] ne "eax") # restore order
659 { unshift (@T,pop(@T)); }
661 &mov ("eax",3); # 3 grandRounds
662 &jmp (&label("done"));
664 &set_label("2nd256",16);
665 &mov ($idx,&wparam(2));
666 &_saveround (6,$idx,@T); # temporary storage for KA!
668 &xor (@T[0],&DWP(4*8+0,$idx)); # KA^KR
669 &xor (@T[1],&DWP(4*8+4,$idx));
670 &xor (@T[2],&DWP(5*8+0,$idx));
671 &xor (@T[3],&DWP(5*8+4,$idx));
673 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[8]
674 &mov (&swtmp(0),@T[0]); # save s[0-3]
675 &mov (&swtmp(1),@T[1]);
676 &mov (&swtmp(2),@T[2]);
677 &mov (&swtmp(3),@T[3]);
678 &Camellia_Feistel($step++);
679 &Camellia_Feistel($step++);
680 &mov (@T[2],&swtmp(2));
681 &mov (@T[3],&swtmp(3));
683 &mov ($key,&wparam(2));
684 &lea ($key,&DWP(128,$key)); # size optimization
687 &_saveround (2,$key,-128,@T); # KB<<<0
688 &_rotl128 (@T,30,10,@T); # KB<<<30
689 &_rotl128 (@T,30,20,@T); # KB<<<(30+30=60)
690 push (@T,shift(@T)); # rotl128(@T,32);
691 &_rotl128 (@T,19,32,@T); # KB<<<(60+32+19=111)
694 &_loadround (4,$key,-128,@T); # load KR
695 &_rotl128 (@T,15,4,@T); # KR<<<15
696 &_rotl128 (@T,15,8,@T); # KR<<<(15+15=30)
697 &_rotl128 (@T,30,18,@T); # KR<<<(30+30=60)
698 push (@T,shift(@T)); # rotl128(@T,32);
699 &_rotl128 (@T,2,26,@T); # KR<<<(60+32+2=94)
702 &_loadround (6,$key,-128,@T); # load KA
703 &_rotl128 (@T,15,6,@T); # KA<<<15
704 &_rotl128 (@T,30,14,@T); # KA<<<(15+30=45)
705 push (@T,shift(@T)); # rotl128(@T,32);
706 &_rotl128 (@T,0,24,@T); # KA<<<(45+32+0=77)
707 &_rotl128 (@T,17,28,@T); # KA<<<(77+17=94)
710 &_loadround (0,$key,-128,@T); # load KL
711 push (@T,shift(@T)); # rotl128(@T,32);
712 &_rotl128 (@T,13,12,@T); # KL<<<(32+13=45)
713 &_rotl128 (@T,15,16,@T); # KL<<<(45+15=60)
714 &_rotl128 (@T,17,22,@T); # KL<<<(60+17=77)
715 push (@T,shift(@T)); # rotl128(@T,32);
716 &_rotl128 (@T,2,30,@T); # KL<<<(77+32+2=111)
718 while (@T[0] ne "eax") # restore order
719 { unshift (@T,pop(@T)); }
721 &mov ("eax",4); # 4 grandRounds
723 &lea ("edx",&DWP(272-128,$key)); # end of key schedule
726 &function_end("Camellia_Ekeygen");
729 # int Camellia_set_key (
730 # const unsigned char *userKey,
733 &function_begin_B("Camellia_set_key");
735 &mov ("ecx",&wparam(0)); # pull arguments
736 &mov ("ebx",&wparam(1));
737 &mov ("edx",&wparam(2));
741 &jz (&label("done")); # userKey==NULL?
743 &jz (&label("done")); # key==NULL?
747 &je (&label("arg_ok")); # bits==256?
749 &je (&label("arg_ok")); # bits==192?
751 &jne (&label("done")); # bits!=128?
752 &set_label("arg_ok",4);
754 &push ("edx"); # push arguments
757 &call ("Camellia_Ekeygen");
760 # eax holds grandRounds and edx points at where to put it
761 &mov (&DWP(0,"edx"),"eax");
763 &set_label("done",4);
766 &function_end_B("Camellia_set_key");
770 112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
771 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
772 134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
773 166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77,
774 139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
775 223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215,
776 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
777 254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80,
778 170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
779 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148,
780 135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226,
781 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
782 233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
783 120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
784 114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
785 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
# Table word "1110": the @SBOX entry for the given index replicated into
# the three most significant bytes, low byte left zero.
sub S1110 {
    my ($n) = @_;
    my $s = $SBOX[$n];
    return ($s << 24) | ($s << 16) | ($s << 8);
}
# Table word "4404": the index is rotated left by one bit (within 8 bits)
# before the @SBOX lookup; the result fills bytes 3, 2 and 0.
sub S4404 {
    my ($n) = @_;
    my $rotated = (($n << 1) | ($n >> 7)) & 0xff;
    my $s = $SBOX[$rotated];
    return ($s << 24) | ($s << 16) | $s;
}
# Table word "0222": the @SBOX entry is rotated left by one bit (within
# 8 bits) and replicated into the three least significant bytes.
sub S0222 {
    my ($n) = @_;
    my $s = $SBOX[$n];
    my $rotated = (($s << 1) | ($s >> 7)) & 0xff;
    return ($rotated << 16) | ($rotated << 8) | $rotated;
}
# Table word "3033": the @SBOX entry is rotated right by one bit (within
# 8 bits) and placed in bytes 3, 1 and 0.
sub S3033 {
    my ($n) = @_;
    my $s = $SBOX[$n];
    my $rotated = (($s >> 1) | ($s << 7)) & 0xff;
    return ($rotated << 24) | ($rotated << 8) | $rotated;
}
792 &set_label("Camellia_SIGMA",64);
794 0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2,
795 0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c,
796 0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd,
798 &set_label("Camellia_SBOX",64);
799 # tables are interleaved, remember?
800 for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
801 for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
803 # void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
804 # size_t length, const CAMELLIA_KEY *key,
805 # unsigned char *ivp,const int enc);
808 # -4(%esp) # return address 0(%esp)
809 # 0(%esp) # s0 4(%esp)
810 # 4(%esp) # s1 8(%esp)
811 # 8(%esp) # s2 12(%esp)
812 # 12(%esp) # s3 16(%esp)
813 # 16(%esp) # end of key schedule 20(%esp)
814 # 20(%esp) # %esp backup
815 my $_inp=&DWP(24,"esp"); #copy of wparam(0)
816 my $_out=&DWP(28,"esp"); #copy of wparam(1)
817 my $_len=&DWP(32,"esp"); #copy of wparam(2)
818 my $_key=&DWP(36,"esp"); #copy of wparam(3)
819 my $_ivp=&DWP(40,"esp"); #copy of wparam(4)
820 my $ivec=&DWP(44,"esp"); #ivec[16]
821 my $_tmp=&DWP(44,"esp"); #volatile variable [yes, aliases with ivec]
822 my ($s0,$s1,$s2,$s3) = @T;
824 &function_begin("Camellia_cbc_encrypt");
825 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
827 &je (&label("enc_out"));
832 &mov ($s0,&wparam(0)); # load inp
833 &mov ($s1,&wparam(1)); # load out
834 #&mov ($s2,&wparam(2)); # load len
835 &mov ($s3,&wparam(3)); # load key
836 &mov ($Tbl,&wparam(4)); # load ivp
838 # allocate aligned stack frame...
839 &lea ($idx,&DWP(-64,"esp"));
842 # place stack frame just "above mod 1024" the key schedule
843 # this ensures that cache associativity of 2 suffices
844 &lea ($key,&DWP(-64-63,$s3));
847 &and ($key,0x3C0); # modulo 1024, but aligned to cache-line
850 &mov ($key,&wparam(5)); # load enc
853 &add ("esp",4); # reserve for return address!
854 &mov ($_esp,$idx); # save %esp
856 &mov ($_inp,$s0); # save copy of inp
857 &mov ($_out,$s1); # save copy of out
858 &mov ($_len,$s2); # save copy of len
859 &mov ($_key,$s3); # save copy of key
860 &mov ($_ivp,$Tbl); # save copy of ivp
862 &call (&label("pic_point")); # make it PIC!
863 &set_label("pic_point");
865 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
868 &set_label("prefetch_sbox",4);
869 &mov ($s0,&DWP(0,$Tbl));
870 &mov ($s1,&DWP(32,$Tbl));
871 &mov ($s2,&DWP(64,$Tbl));
872 &mov ($s3,&DWP(96,$Tbl));
873 &lea ($Tbl,&DWP(128,$Tbl));
875 &jnz (&label("prefetch_sbox"));
879 &mov ($s3,&DWP(272,$s0)); # load grandRounds
882 &je (&label("DECRYPT"));
887 &lea ($s3,&DWP(0,$s0,$s3));
890 &test ($s2,0xFFFFFFF0);
891 &jz (&label("enc_tail")); # short input...
893 &mov ($s0,&DWP(0,$key)); # load iv
894 &mov ($s1,&DWP(4,$key));
896 &set_label("enc_loop",4);
897 &mov ($s2,&DWP(8,$key));
898 &mov ($s3,&DWP(12,$key));
900 &xor ($s0,&DWP(0,$idx)); # xor input data
901 &xor ($s1,&DWP(4,$idx));
902 &xor ($s2,&DWP(8,$idx));
904 &xor ($s3,&DWP(12,$idx));
906 &mov ($key,$_key); # load key
910 &call ("_x86_Camellia_encrypt");
912 &mov ($idx,$_inp); # load inp
913 &mov ($key,$_out); # load out
918 &mov (&DWP(0,$key),$s0); # save output data
920 &mov (&DWP(4,$key),$s1);
921 &mov (&DWP(8,$key),$s2);
922 &mov (&DWP(12,$key),$s3);
924 &mov ($s2,$_len); # load len
926 &lea ($idx,&DWP(16,$idx));
927 &mov ($_inp,$idx); # save inp
929 &lea ($s3,&DWP(16,$key));
930 &mov ($_out,$s3); # save out
933 &test ($s2,0xFFFFFFF0);
934 &mov ($_len,$s2); # save len
935 &jnz (&label("enc_loop"));
937 &jnz (&label("enc_tail"));
938 &mov ($idx,$_ivp); # load ivp
939 &mov ($s2,&DWP(8,$key)); # restore last dwords
940 &mov ($s3,&DWP(12,$key));
941 &mov (&DWP(0,$idx),$s0); # save ivec
942 &mov (&DWP(4,$idx),$s1);
943 &mov (&DWP(8,$idx),$s2);
944 &mov (&DWP(12,$idx),$s3);
948 &set_label("enc_out");
950 &pushf (); # kludge, never executed
952 &set_label("enc_tail",4);
953 &mov ($s0,$key eq "edi" ? $key : "");
954 &mov ($key,$_out); # load out
955 &push ($s0); # push ivp
958 &cmp ($key,$idx); # compare with inp
959 &je (&label("enc_in_place"));
961 &data_word(0xA4F3F689); # rep movsb # copy input
962 &jmp (&label("enc_skip_in_place"));
963 &set_label("enc_in_place");
964 &lea ($key,&DWP(0,$key,$s2));
965 &set_label("enc_skip_in_place");
969 &data_word(0xAAF3F689); # rep stosb # zero tail
970 &pop ($key); # pop ivp
972 &mov ($idx,$_out); # output as input
973 &mov ($s0,&DWP(0,$key));
974 &mov ($s1,&DWP(4,$key));
975 &mov ($_len,16); # len=16
976 &jmp (&label("enc_loop")); # one more spin...
978 #----------------------------- DECRYPT -----------------------------#
979 &set_label("DECRYPT",16);
981 &lea ($s3,&DWP(0,$s0,$s3));
986 &je (&label("dec_in_place")); # in-place processing...
988 &mov ($key,$_ivp); # load ivp
991 &set_label("dec_loop",4);
992 &mov ($s0,&DWP(0,$idx)); # read input
993 &mov ($s1,&DWP(4,$idx));
994 &mov ($s2,&DWP(8,$idx));
996 &mov ($s3,&DWP(12,$idx));
998 &mov ($key,$_key); # load key
1002 &call ("_x86_Camellia_decrypt");
1004 &mov ($key,$_tmp); # load ivp
1005 &mov ($idx,$_len); # load len
1010 &xor ($s0,&DWP(0,$key)); # xor iv
1012 &xor ($s1,&DWP(4,$key));
1013 &xor ($s2,&DWP(8,$key));
1014 &xor ($s3,&DWP(12,$key));
1017 &jc (&label("dec_partial"));
1018 &mov ($_len,$idx); # save len
1019 &mov ($idx,$_inp); # load inp
1020 &mov ($key,$_out); # load out
1022 &mov (&DWP(0,$key),$s0); # write output
1023 &mov (&DWP(4,$key),$s1);
1024 &mov (&DWP(8,$key),$s2);
1025 &mov (&DWP(12,$key),$s3);
1027 &mov ($_tmp,$idx); # save ivp
1028 &lea ($idx,&DWP(16,$idx));
1029 &mov ($_inp,$idx); # save inp
1031 &lea ($key,&DWP(16,$key));
1032 &mov ($_out,$key); # save out
1034 &jnz (&label("dec_loop"));
1035 &mov ($key,$_tmp); # load temp ivp
1036 &set_label("dec_end");
1037 &mov ($idx,$_ivp); # load user ivp
1038 &mov ($s0,&DWP(0,$key)); # load iv
1039 &mov ($s1,&DWP(4,$key));
1040 &mov ($s2,&DWP(8,$key));
1041 &mov ($s3,&DWP(12,$key));
1042 &mov (&DWP(0,$idx),$s0); # copy back to user
1043 &mov (&DWP(4,$idx),$s1);
1044 &mov (&DWP(8,$idx),$s2);
1045 &mov (&DWP(12,$idx),$s3);
1046 &jmp (&label("dec_out"));
1048 &set_label("dec_partial",4);
1050 &mov (&DWP(0,$key),$s0); # dump output to stack
1051 &mov (&DWP(4,$key),$s1);
1052 &mov (&DWP(8,$key),$s2);
1053 &mov (&DWP(12,$key),$s3);
1054 &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx));
1055 &mov ($idx eq "esi" ? $idx : "",$key);
1056 &mov ($key eq "edi" ? $key : "",$_out); # load out
1057 &data_word(0xA4F3F689); # rep movsb # copy output
1058 &mov ($key,$_inp); # use inp as temp ivp
1059 &jmp (&label("dec_end"));
1061 &set_label("dec_in_place",4);
1062 &set_label("dec_in_place_loop");
1064 &mov ($s0,&DWP(0,$idx)); # read input
1065 &mov ($s1,&DWP(4,$idx));
1066 &mov ($s2,&DWP(8,$idx));
1067 &mov ($s3,&DWP(12,$idx));
1069 &mov (&DWP(0,$key),$s0); # copy to temp
1070 &mov (&DWP(4,$key),$s1);
1071 &mov (&DWP(8,$key),$s2);
1073 &mov (&DWP(12,$key),$s3);
1075 &mov ($key,$_key); # load key
1079 &call ("_x86_Camellia_decrypt");
1081 &mov ($key,$_ivp); # load ivp
1082 &mov ($idx,$_out); # load out
1087 &xor ($s0,&DWP(0,$key)); # xor iv
1089 &xor ($s1,&DWP(4,$key));
1090 &xor ($s2,&DWP(8,$key));
1091 &xor ($s3,&DWP(12,$key));
1093 &mov (&DWP(0,$idx),$s0); # write output
1094 &mov (&DWP(4,$idx),$s1);
1095 &mov (&DWP(8,$idx),$s2);
1096 &mov (&DWP(12,$idx),$s3);
1098 &lea ($idx,&DWP(16,$idx));
1099 &mov ($_out,$idx); # save out
1102 &mov ($s0,&DWP(0,$idx)); # read temp
1103 &mov ($s1,&DWP(4,$idx));
1104 &mov ($s2,&DWP(8,$idx));
1105 &mov ($s3,&DWP(12,$idx));
1107 &mov (&DWP(0,$key),$s0); # copy iv
1108 &mov (&DWP(4,$key),$s1);
1109 &mov (&DWP(8,$key),$s2);
1110 &mov (&DWP(12,$key),$s3);
1112 &mov ($idx,$_inp); # load inp
1114 &lea ($idx,&DWP(16,$idx));
1115 &mov ($_inp,$idx); # save inp
1117 &mov ($s2,$_len); # load len
1119 &jc (&label("dec_in_place_partial"));
1120 &mov ($_len,$s2); # save len
1121 &jnz (&label("dec_in_place_loop"));
1122 &jmp (&label("dec_out"));
1124 &set_label("dec_in_place_partial",4);
1125 # one can argue if this is actually required...
1126 &mov ($key eq "edi" ? $key : "",$_out);
1127 &lea ($idx eq "esi" ? $idx : "",$ivec);
1128 &lea ($key,&DWP(0,$key,$s2));
1129 &lea ($idx,&DWP(16,$idx,$s2));
1130 &neg ($s2 eq "ecx" ? $s2 : "");
1131 &data_word(0xA4F3F689); # rep movsb # restore tail
1133 &set_label("dec_out",4);
1136 &function_end("Camellia_cbc_encrypt");
1139 &asciz("Camellia for x86 by <appro\@openssl.org>");