2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for PowerISA 2.07.
20 # This is a straightforward KECCAK_1X_ALT SIMD implementation, but with
21 # disjoint Rho and Pi. The module is ABI-bitness- and endian-neutral.
22 # POWER8 processor spends 9.8 cycles to process a byte out of a large
23 # buffer for r=1088, which matches SHA3-256. This is 17% better than
24 # scalar PPC64 code. It probably should be noted that if POWER8's
25 # successor can achieve higher scalar instruction issue rate, then
26 # this module will lose...
30 if ($flavour =~ /64/) {
37 } elsif ($flavour =~ /32/) {
44 } else { die "nonsense $flavour"; }
46 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
47 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
48 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
49 die "can't locate ppc-xlate.pl";
# Redirect STDOUT into a pipe that runs ppc-xlate.pl under the current perl
# interpreter ($^X), passing it $flavour and the next argument returned by
# shift. The low-precedence "or" is deliberate: with "||" the die attached to
# the (always true) concatenated command string instead of to open's return
# value, so a failed open was silently ignored.
51 open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
53 $FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
59 ########################################################################
78 # v13..25 rhotates[][]
85 .type KeccakF1600_int,\@function
95 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Theta
96 vxor v26,v0, v5 ; A[0..1][0]^A[2..3][0]
97 vxor v27,v1, v6 ; A[0..1][1]^A[2..3][1]
98 vxor v28,v2, v7 ; A[0..1][2]^A[2..3][2]
99 vxor v29,v3, v8 ; A[0..1][3]^A[2..3][3]
100 vxor v30,v4, v9 ; A[0..1][4]^A[2..3][4]
101 vpermdi v31,v26,v27,0b00 ; A[0][0..1]^A[2][0..1]
102 vpermdi v26,v26,v27,0b11 ; A[1][0..1]^A[3][0..1]
103 vpermdi v27,v28,v29,0b00 ; A[0][2..3]^A[2][2..3]
104 vpermdi v28,v28,v29,0b11 ; A[1][2..3]^A[3][2..3]
105 vpermdi v29,v30,v30,0b10 ; A[1..0][4]^A[3..2][4]
106 vxor v26,v26,v31 ; C[0..1]
107 vxor v27,v27,v28 ; C[2..3]
108 vxor v28,v29,v30 ; C[4..4]
110 vxor v26,v26,v10 ; C[0..1] ^= A[4][0..1]
111 vxor v27,v27,v11 ; C[2..3] ^= A[4][2..3]
112 vxor v28,v28,v12 ; C[4..4] ^= A[4][4..4], low!
114 vrld v29,v26,v31 ; ROL64(C[0..1],1)
115 vrld v30,v27,v31 ; ROL64(C[2..3],1)
116 vrld v31,v28,v31 ; ROL64(C[4..4],1)
117 vpermdi v31,v31,v29,0b10
118 vxor v26,v26,v30 ; C[0..1] ^= ROL64(C[2..3],1)
119 vxor v27,v27,v31 ; C[2..3] ^= ROL64(C[4..0],1)
120 vxor v28,v28,v29 ; C[4..4] ^= ROL64(C[0..1],1), low!
122 vpermdi v29,v26,v26,0b00 ; C[0..0]
123 vpermdi v30,v28,v26,0b10 ; C[4..0]
124 vpermdi v31,v28,v28,0b11 ; C[4..4]
125 vxor v1, v1, v29 ; A[0..1][1] ^= C[0..0]
126 vxor v6, v6, v29 ; A[2..3][1] ^= C[0..0]
127 vxor v10,v10,v30 ; A[4][0..1] ^= C[4..0]
128 vxor v0, v0, v31 ; A[0..1][0] ^= C[4..4]
129 vxor v5, v5, v31 ; A[2..3][0] ^= C[4..4]
131 vpermdi v29,v27,v27,0b00 ; C[2..2]
132 vpermdi v30,v26,v26,0b11 ; C[1..1]
133 vpermdi v31,v26,v27,0b10 ; C[1..2]
134 vxor v3, v3, v29 ; A[0..1][3] ^= C[2..2]
135 vxor v8, v8, v29 ; A[2..3][3] ^= C[2..2]
136 vxor v2, v2, v30 ; A[0..1][2] ^= C[1..1]
137 vxor v7, v7, v30 ; A[2..3][2] ^= C[1..1]
138 vxor v11,v11,v31 ; A[4][2..3] ^= C[1..2]
140 vpermdi v29,v27,v27,0b11 ; C[3..3]
141 vxor v4, v4, v29 ; A[0..1][4] ^= C[3..3]
142 vxor v9, v9, v29 ; A[2..3][4] ^= C[3..3]
143 vxor v12,v12,v29 ; A[4..4][4] ^= C[3..3]
145 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Rho
146 vrld v26,v0, v13 ; v0
148 vrld v27,v2, v15 ; v2
149 vrld v28,v3, v16 ; v3
153 vrld v29,v7, v20 ; v7
157 vrld v30,v11,v24 ; v11
160 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Pi
161 vpermdi v0, v26,v28,0b00 ; [0][0] [1][0] < [0][0] [0][3]
162 vpermdi v2, v29,v5, 0b00 ; [0][2] [1][2] < [2][2] [2][0]
163 vpermdi v11,v9, v5, 0b01 ; [4][2] [4][3] < [2][4] [3][0]
164 vpermdi v5, v1, v4, 0b00 ; [2][0] [3][0] < [0][1] [0][4]
165 vpermdi v1, v1, v4, 0b11 ; [0][1] [1][1] < [1][1] [1][4]
166 vpermdi v3, v8, v6, 0b11 ; [0][3] [1][3] < [3][3] [3][1]
167 vpermdi v4, v12,v30,0b10 ; [0][4] [1][4] < [4][4] [4][2]
168 vpermdi v7, v8, v6, 0b00 ; [2][2] [3][2] < [2][3] [2][1]
169 vpermdi v6, v27,v26,0b11 ; [2][1] [3][1] < [1][2] [1][0]
170 vpermdi v8, v9, v29,0b11 ; [2][3] [3][3] < [3][4] [3][2]
171 vpermdi v12,v10,v10,0b11 ; [4][4] [4][4] < [4][1] [4][1]
172 vpermdi v9, v10,v30,0b01 ; [2][4] [3][4] < [4][0] [4][3]
173 vpermdi v10,v27,v28,0b01 ; [4][0] [4][1] < [0][2] [1][3]
175 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Chi + Iota
176 lvx_u v31,$iotas,r0 ; iotas[index]
177 addic r0,r0,16 ; index++
179 vandc v26,v2, v1 ; (~A[0..1][1] & A[0..1][2])
180 vandc v27,v3, v2 ; (~A[0..1][2] & A[0..1][3])
181 vandc v28,v4, v3 ; (~A[0..1][3] & A[0..1][4])
182 vandc v29,v0, v4 ; (~A[0..1][4] & A[0..1][0])
183 vandc v30,v1, v0 ; (~A[0..1][0] & A[0..1][1])
184 vxor v0, v0, v26 ; A[0..1][0] ^= (~A[0..1][1] & A[0..1][2])
185 vxor v1, v1, v27 ; A[0..1][1] ^= (~A[0..1][2] & A[0..1][3])
186 vxor v2, v2, v28 ; A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
187 vxor v3, v3, v29 ; A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
188 vxor v4, v4, v30 ; A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])
190 vandc v26,v7, v6 ; (~A[2..3][1] & A[2..3][2])
191 vandc v27,v8, v7 ; (~A[2..3][2] & A[2..3][3])
192 vandc v28,v9, v8 ; (~A[2..3][3] & A[2..3][4])
193 vandc v29,v5, v9 ; (~A[2..3][4] & A[2..3][0])
194 vandc v30,v6, v5 ; (~A[2..3][0] & A[2..3][1])
195 vxor v5, v5, v26 ; A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
196 vxor v6, v6, v27 ; A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
197 vxor v7, v7, v28 ; A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
198 vxor v8, v8, v29 ; A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
199 vxor v9, v9, v30 ; A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])
201 vxor v0, v0, v31 ; A[0][0] ^= iotas[index++]
203 vpermdi v26,v10,v11,0b10 ; A[4][1..2]
204 vpermdi v27,v12,v10,0b00 ; A[4][4..0]
205 vpermdi v28,v11,v12,0b10 ; A[4][3..4]
206 vpermdi v29,v10,v10,0b10 ; A[4][1..0]
207 vandc v26,v11,v26 ; (~A[4][1..2] & A[4][2..3])
208 vandc v27,v27,v28 ; (~A[4][3..4] & A[4][4..0])
209 vandc v28,v10,v29 ; (~A[4][1..0] & A[4][0..1])
210 vxor v10,v10,v26 ; A[4][0..1] ^= (~A[4][1..2] & A[4][2..3])
211 vxor v11,v11,v27 ; A[4][2..3] ^= (~A[4][3..4] & A[4][4..0])
212 vxor v12,v12,v28 ; A[4][4..4] ^= (~A[4][0..1] & A[4][1..0])
216 vpermdi v12,v12,v12,0b11 ; broadcast A[4][4]
219 .byte 0,12,0x14,0,0,0,0,0
220 .size KeccakF1600_int,.-KeccakF1600_int
222 .type KeccakF1600,\@function
225 $STU $sp,-$FRAME($sp)
226 li r10,`15+6*$SIZE_T`
227 li r11,`31+6*$SIZE_T`
229 mfspr r7, 256 ; save vrsave
252 stw r7,`$FRAME-4`($sp) ; save vrsave
254 $PUSH r8,`$FRAME+$LRSAVE`($sp)
255 mtspr 256, r0 ; preserve all AltiVec registers
258 lvx_4w v0,0,r3 ; load A[5][5]
286 lvx_u v13,0,r12 ; load rhotates
310 addi r12,r12,`16*16` ; points at iotas
315 stvx_4w v0,0,r3 ; return A[5][5]
340 li r10,`15+6*$SIZE_T`
341 li r11,`31+6*$SIZE_T`
343 mtspr 256, r7 ; restore vrsave
369 .byte 0,12,0x04,1,0x80,0,1,0
371 .size KeccakF1600,.-KeccakF1600
374 my ($A_jagged,$inp,$len,$bsz) = map("r$_",(3..6));
378 .type SHA3_absorb,\@function
381 $STU $sp,-$FRAME($sp)
382 li r10,`15+6*$SIZE_T`
383 li r11,`31+6*$SIZE_T`
385 mfspr r7, 256 ; save vrsave
408 stw r7,`$FRAME-4`($sp) ; save vrsave
410 $PUSH r8,`$FRAME+$LRSAVE`($sp)
411 mtspr 256, r0 ; preserve all AltiVec registers
414 lvx_4w v0,0,$A_jagged ; load A[5][5]
416 lvx_4w v1,r11,$A_jagged
418 lvx_4w v2,r10,$A_jagged
420 lvx_4w v3,r11,$A_jagged
422 lvx_4w v4,r10,$A_jagged
424 lvx_4w v5,r11,$A_jagged
426 lvx_4w v6,r10,$A_jagged
428 lvx_4w v7,r11,$A_jagged
430 lvx_4w v8,r10,$A_jagged
432 lvx_4w v9,r11,$A_jagged
434 lvx_4w v10,r10,$A_jagged
436 lvx_4w v11,r11,$A_jagged
437 lvx_splt v12,r10,$A_jagged
442 lvx_u v13,0,r12 ; load rhotates
468 addi r12,r12,`16*16` ; points at iotas
473 $UCMP $len,$bsz ; len < bsz?
476 sub $len,$len,$bsz ; len -= bsz
480 lvx_u v30,r10,r12 ; permutation masks
482 ?vspltisb v27,7 ; prepare masks for byte swap
483 ?vxor v30,v30,v27 ; on big-endian
486 vxor v27,v27,v27 ; zero
489 vperm v26,v26,v27,v30
494 vperm v26,v26,v27,v30
499 vperm v26,v26,v27,v30
504 vperm v26,v26,v27,v30
509 vperm v26,v26,v27,v30
514 vperm v26,v26,v27,v31
519 vperm v26,v26,v27,v31
524 vperm v26,v26,v27,v31
529 vperm v26,v26,v27,v31
534 vperm v26,v26,v27,v31
539 vperm v26,v26,v27,v30
544 vperm v26,v26,v27,v30
549 vperm v26,v26,v27,v30
554 vperm v26,v26,v27,v30
559 vperm v26,v26,v27,v30
564 vperm v26,v26,v27,v31
569 vperm v26,v26,v27,v31
574 vperm v26,v26,v27,v31
579 vperm v26,v26,v27,v31
584 vperm v26,v26,v27,v31
589 vperm v26,v26,v27,v30
594 vperm v26,v26,v27,v31
599 vperm v26,v26,v27,v30
604 vperm v26,v26,v27,v31
609 vperm v26,v26,v27,v31
620 stvx_4w v0,0,$A_jagged ; return A[5][5]
622 stvx_4w v1,r11,$A_jagged
624 stvx_4w v2,r10,$A_jagged
626 stvx_4w v3,r11,$A_jagged
628 stvx_4w v4,r10,$A_jagged
630 stvx_4w v5,r11,$A_jagged
632 stvx_4w v6,r10,$A_jagged
634 stvx_4w v7,r11,$A_jagged
636 stvx_4w v8,r10,$A_jagged
638 stvx_4w v9,r11,$A_jagged
640 stvx_4w v10,r10,$A_jagged
642 stvx_4w v11,r11,$A_jagged
643 stvdx_u v12,r10,$A_jagged
645 mr r3,$len ; return value
646 li r10,`15+6*$SIZE_T`
647 li r11,`31+6*$SIZE_T`
649 mtspr 256, r7 ; restore vrsave
675 .byte 0,12,0x04,1,0x80,0,4,0
677 .size SHA3_absorb,.-SHA3_absorb
681 my ($A_jagged,$out,$len,$bsz) = map("r$_",(3..6));
685 .type SHA3_squeeze,\@function
688 mflr r9 ; r9 is not touched by KeccakF1600
689 subi $out,$out,1 ; prepare for stbu
690 addi r8,$A_jagged,4 ; prepare volatiles
697 lwzx r0,r11,$A_jagged ; hi
701 stbu r7,1($out) ; write lo
708 stbu r0,1($out) ; write hi
717 beqlr ; return if done
722 addi r11,r11,16 ; calculate jagged index
745 addi r8,$A_jagged,4 ; restore volatiles
771 .byte 0,12,0x14,0,0,0,4,0
773 .size SHA3_squeeze,.-SHA3_squeeze
781 mflr r12 ; vvvvvv "distance" between . and 1st data entry
786 .byte 0,12,0x14,0,0,0,0,0
788 .type rhotates,\@object
804 .size rhotates,.-rhotates
806 .quad 0x0001020304050607,0x1011121314151617
807 .quad 0x1011121314151617,0x0001020304050607
810 .quad 0x0000000000000001,0
811 .quad 0x0000000000008082,0
812 .quad 0x800000000000808a,0
813 .quad 0x8000000080008000,0
814 .quad 0x000000000000808b,0
815 .quad 0x0000000080000001,0
816 .quad 0x8000000080008081,0
817 .quad 0x8000000000008009,0
818 .quad 0x000000000000008a,0
819 .quad 0x0000000000000088,0
820 .quad 0x0000000080008009,0
821 .quad 0x000000008000000a,0
822 .quad 0x000000008000808b,0
823 .quad 0x800000000000008b,0
824 .quad 0x8000000000008089,0
825 .quad 0x8000000000008003,0
826 .quad 0x8000000000008002,0
827 .quad 0x8000000000000080,0
828 .quad 0x000000000000800a,0
829 .quad 0x800000008000000a,0
830 .quad 0x8000000080008081,0
831 .quad 0x8000000000008080,0
832 .quad 0x0000000080000001,0
833 .quad 0x8000000080008008,0
835 .asciz "Keccak-1600 absorb and squeeze for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
838 foreach (split("\n",$code)) {
839 s/\`([^\`]*)\`/eval $1/ge;
841 if ($flavour =~ /le$/) { # little-endian
843 } else { # big-endian