3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # SHA256/512 for PowerISA v2.07.
12 # Accurate performance measurements are problematic, because it's
13 # always a virtualized setup with a possibly throttled processor.
14 # Relative comparison is therefore more informative. This module is
15 # ~60% faster than integer-only sha512-ppc.pl. To anchor to something
16 # else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
17 # hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
18 # sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
19 # result is degree of computational resources' utilization. POWER8 is
20 # "massively multi-threaded chip" and difference between single- and
21 # maximum multi-process benchmark results tells that utilization is
22 # whopping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and
23 # for sha1-ppc.pl - 73%. 100% means that multi-process result equals
24 # to single-process one, given that all threads end up on the same
# Dispatch on the flavour string: one branch for flavours containing "64",
# one for flavours containing "32"; any other flavour is a fatal error.
30 if ($flavour =~ /64/) {
36 } elsif ($flavour =~ /32/) {
42 } else { die "nonsense $flavour"; }
# $LENDIAN is true when the flavour string contains "le"; it later gates
# the little-endian-only code and data that are appended to $code.
44 $LENDIAN=($flavour=~/le/);
# Locate the ppc-xlate.pl translator: first in the directory this script
# was invoked from, then in ../../perlasm/ relative to it.  Generation is
# aborted if neither candidate is an existing file.
46 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
47 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
48 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
49 die "can't locate ppc-xlate.pl";
# Redirect STDOUT into the translator, run with the current perl ($^X).
# Low-precedence "or" is required here: with "||" the die is attached to
# the (always true) command string instead of to open(), so a failure to
# start the translator would go unreported.
51 open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
53 if ($output =~ /512/) {
65 $func="sha${bits}_block_p8";
# Symbolic names for the registers used by the assembly templates.
# $x00..$x70 name general-purpose registers r0, r10 and r26..r31; the
# templates use them as index operands of stvx/lvx relative to $offload.
78 ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
# @V holds the eight working variables $A..$H, mapped to v0..v7.
80 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
# @X is a window of sixteen vector registers, v8..v23, indexed by the
# round code as X[i].
81 @X=map("v$_",(8..23));
# Remaining named vector registers take v24..v30; the list produces eight
# names but only seven variables receive one, so "v31" is left unassigned.
82 ($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));
85 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
88 $code.=<<___ if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
89 lvx_u @X[$i+1],0,$inp ; load X[i] in advance
92 $code.=<<___ if ($i<16 && ($i%(16/$SZ)));
93 vsldoi @X[$i],@X[$i-1],@X[$i-1],$SZ
95 $code.=<<___ if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
96 vperm @X[$i],@X[$i],@X[$i],$lemask
99 `"vshasigma${sz} $s0,@X[($j+1)%16],0,0" if ($i>=15)`
100 vsel $Func,$g,$f,$e ; Ch(e,f,g)
101 vshasigma${sz} $S1,$e,1,15 ; Sigma1(e)
102 vaddu${sz}m $h,$h,@X[$i%16] ; h+=X[i]
103 vshasigma${sz} $S0,$a,1,0 ; Sigma0(a)
104 `"vshasigma${sz} $s1,@X[($j+14)%16],0,15" if ($i>=15)`
105 vaddu${sz}m $h,$h,$Func ; h+=Ch(e,f,g)
107 `"vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]" if ($i>=15)`
108 vaddu${sz}m $h,$h,$S1 ; h+=Sigma1(e)
109 vsel $Func,$b,$c,$Func ; Maj(a,b,c)
110 vaddu${sz}m $g,$g,$Ki ; future h+=K[i]
111 vaddu${sz}m $d,$d,$h ; d+=h
112 vaddu${sz}m $S0,$S0,$Func ; Sigma0(a)+Maj(a,b,c)
113 `"vaddu${sz}m @X[$j],@X[$j],$s0" if ($i>=15)`
114 lvx $Ki,$idx,$Tbl ; load next K[i]
116 vaddu${sz}m $h,$h,$S0 ; h+=Sigma0(a)+Maj(a,b,c)
117 `"vaddu${sz}m @X[$j],@X[$j],$s1" if ($i>=15)`
128 $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
130 li r10,`$FRAME+8*16+15`
131 li r11,`$FRAME+8*16+31`
132 stvx v20,r10,$sp # ABI says so
156 stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
158 $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
160 $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
162 $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
164 $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
166 $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
168 $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
170 $PUSH $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
174 addi $offload,$sp,$FRAME+15
176 $code.=<<___ if ($LENDIAN);
180 vxor $lemask,$lemask,$Ki
182 $code.=<<___ if ($SZ==4);
185 vsldoi $B,$A,$A,4 # unpack
192 $code.=<<___ if ($SZ==8);
196 vsldoi $B,$A,$A,8 # unpack
203 li r0,`($rounds-16)/16` # inner loop counter
211 stvx $A,$x00,$offload # offload $A-$H
212 stvx $B,$x10,$offload
213 stvx $C,$x20,$offload
214 stvx $D,$x30,$offload
215 stvx $E,$x40,$offload
216 stvx $F,$x50,$offload
217 stvx $G,$x60,$offload
218 stvx $H,$x70,$offload
219 vaddu${sz}m $H,$H,$Ki # h+K[i]
# Append the code for rounds 0..15.  After each call the last element of @V
# is moved to the front, so the register that was passed as $h is passed as
# $a on the next call and every other working variable shifts one position.
223 for ($i=0;$i<16;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); }
# Append the code for rounds 16..31.  The loop has no initialiser on
# purpose: $i continues from the value (16) left by the previous loop.
230 for (;$i<32;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); }
234 lvx @X[2],$x00,$offload
236 lvx @X[3],$x10,$offload
237 vaddu${sz}m $A,$A,@X[2]
238 lvx @X[4],$x20,$offload
239 vaddu${sz}m $B,$B,@X[3]
240 lvx @X[5],$x30,$offload
241 vaddu${sz}m $C,$C,@X[4]
242 lvx @X[6],$x40,$offload
243 vaddu${sz}m $D,$D,@X[5]
244 lvx @X[7],$x50,$offload
245 vaddu${sz}m $E,$E,@X[6]
246 lvx @X[8],$x60,$offload
247 vaddu${sz}m $F,$F,@X[7]
248 lvx @X[9],$x70,$offload
249 vaddu${sz}m $G,$G,@X[8]
250 vaddu${sz}m $H,$H,@X[9]
253 $code.=<<___ if ($SZ==4);
256 vperm $A,$A,$B,$Ki # pack the answer
266 $code.=<<___ if ($SZ==8);
267 vperm $A,$A,$B,$Ki # pack the answer
277 li r10,`$FRAME+8*16+15`
279 li r11,`$FRAME+8*16+31`
281 lvx v20,r10,$sp # ABI says so
303 $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
304 $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
305 $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
306 $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
307 $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
308 $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
309 addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
312 .byte 0,12,4,1,0x80,6,3,0
317 # Ugly hack here, because PPC assembler syntax seems to vary too
318 # much from platform to platform...
324 mflr $Tbl ; vvvvvv "distance" between . and 1st data entry
325 addi $Tbl,$Tbl,`64-8`
329 .byte 0,12,0x14,0,0,0,0,0
335 foreach(@_) { $code.=".quad $_,$_\n"; }
338 "0x428a2f98d728ae22","0x7137449123ef65cd",
339 "0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc",
340 "0x3956c25bf348b538","0x59f111f1b605d019",
341 "0x923f82a4af194f9b","0xab1c5ed5da6d8118",
342 "0xd807aa98a3030242","0x12835b0145706fbe",
343 "0x243185be4ee4b28c","0x550c7dc3d5ffb4e2",
344 "0x72be5d74f27b896f","0x80deb1fe3b1696b1",
345 "0x9bdc06a725c71235","0xc19bf174cf692694",
346 "0xe49b69c19ef14ad2","0xefbe4786384f25e3",
347 "0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65",
348 "0x2de92c6f592b0275","0x4a7484aa6ea6e483",
349 "0x5cb0a9dcbd41fbd4","0x76f988da831153b5",
350 "0x983e5152ee66dfab","0xa831c66d2db43210",
351 "0xb00327c898fb213f","0xbf597fc7beef0ee4",
352 "0xc6e00bf33da88fc2","0xd5a79147930aa725",
353 "0x06ca6351e003826f","0x142929670a0e6e70",
354 "0x27b70a8546d22ffc","0x2e1b21385c26c926",
355 "0x4d2c6dfc5ac42aed","0x53380d139d95b3df",
356 "0x650a73548baf63de","0x766a0abb3c77b2a8",
357 "0x81c2c92e47edaee6","0x92722c851482353b",
358 "0xa2bfe8a14cf10364","0xa81a664bbc423001",
359 "0xc24b8b70d0f89791","0xc76c51a30654be30",
360 "0xd192e819d6ef5218","0xd69906245565a910",
361 "0xf40e35855771202a","0x106aa07032bbd1b8",
362 "0x19a4c116b8d2d0c8","0x1e376c085141ab53",
363 "0x2748774cdf8eeb99","0x34b0bcb5e19b48a8",
364 "0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb",
365 "0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3",
366 "0x748f82ee5defb2fc","0x78a5636f43172f60",
367 "0x84c87814a1f0ab72","0x8cc702081a6439ec",
368 "0x90befffa23631e28","0xa4506cebde82bde9",
369 "0xbef9a3f7b2c67915","0xc67178f2e372532b",
370 "0xca273eceea26619c","0xd186b8c721c0c207",
371 "0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178",
372 "0x06f067aa72176fba","0x0a637dc5a2c898a6",
373 "0x113f9804bef90dae","0x1b710b35131c471b",
374 "0x28db77f523047d84","0x32caab7b40c72493",
375 "0x3c9ebe0a15c9bebc","0x431d67c49c100d4c",
376 "0x4cc5d4becb3e42b6","0x597f299cfc657e2a",
377 "0x5fcb6fab3ad6faec","0x6c44198c4a475817","0");
378 $code.=<<___ if (!$LENDIAN);
379 .quad 0x0001020304050607,0x1011121314151617
381 $code.=<<___ if ($LENDIAN); # quad-swapped
382 .quad 0x1011121314151617,0x0001020304050607
386 foreach(@_) { $code.=".long $_,$_,$_,$_\n"; }
389 "0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5",
390 "0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5",
391 "0xd807aa98","0x12835b01","0x243185be","0x550c7dc3",
392 "0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174",
393 "0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc",
394 "0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da",
395 "0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7",
396 "0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967",
397 "0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13",
398 "0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85",
399 "0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3",
400 "0xd192e819","0xd6990624","0xf40e3585","0x106aa070",
401 "0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5",
402 "0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3",
403 "0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208",
404 "0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0");
405 $code.=<<___ if (!$LENDIAN);
406 .long 0x00010203,0x10111213,0x10111213,0x10111213
407 .long 0x00010203,0x04050607,0x10111213,0x10111213
408 .long 0x00010203,0x04050607,0x08090a0b,0x10111213
410 $code.=<<___ if ($LENDIAN); # word-swapped
411 .long 0x10111213,0x10111213,0x10111213,0x00010203
412 .long 0x10111213,0x10111213,0x04050607,0x00010203
413 .long 0x10111213,0x08090a0b,0x04050607,0x00010203
417 .asciz "SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
# Post-process the accumulated text in $code: every `...` span (containing
# no backtick) is evaluated as a Perl expression and replaced by its result.
421 $code =~ s/\`([^\`]*)\`/eval $1/gem;