Linux-libre 5.3.12-gnu
[librecmc/linux-libre.git] / arch / powerpc / crypto / sha1-spe-asm.S
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * Fast SHA-1 implementation for SPE instruction set (PPC)
4  *
5  * This code makes use of the SPE SIMD instruction set as defined in
6  * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
7  * Implementation is based on optimization guide notes from
8  * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
9  *
10  * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
11  */
12
13 #include <asm/ppc_asm.h>
14 #include <asm/asm-offsets.h>
15
/*
 * Register assignment.
 *
 * r3 and r4 are used as they arrive. r5 is also live on entry: the entry
 * code copies it to CTR and only afterwards reloads it with the address of
 * the constant table, which is why it is named rKP here.
 *
 * rW0-rW7, rT0 and rK are operated on with SPE (ev*) instructions and thus
 * use all 64 bits of the GPR. They map to r14-r23, exactly the registers
 * that INITIALIZE saves and FINALIZE restores. The hash values and rT1-rT3
 * only ever carry 32 bit values.
 */
16 #define rHP     r3      /* pointer to hash value                        */
17 #define rWP     r4      /* pointer to input                             */
18 #define rKP     r5      /* pointer to constants                         */
19
20 #define rW0     r14     /* 64 bit round words                           */
21 #define rW1     r15
22 #define rW2     r16
23 #define rW3     r17
24 #define rW4     r18
25 #define rW5     r19
26 #define rW6     r20
27 #define rW7     r21
28
29 #define rH0     r6      /* 32 bit hash values                           */
30 #define rH1     r7
31 #define rH2     r8
32 #define rH3     r9
33 #define rH4     r10
34
35 #define rT0     r22     /* 64 bit temporary                             */
36 #define rT1     r0      /* 32 bit temporaries                           */
37 #define rT2     r11
38 #define rT3     r12
39
40 #define rK      r23     /* 64 bit constant in non volatile register     */
41
/*
 * Round constant loaders. The round macros expand LOAD_K##k##1 with their
 * k argument: k = 1..4 loads the 32 bit word at offset 0/4/8/12 from rKP
 * and splats it into both halves of rK; k = 0 (LOAD_K01) expands to nothing
 * and leaves rK untouched.
 */
42 #define LOAD_K01
43
44 #define LOAD_K11 \
45         evlwwsplat      rK,0(rKP);
46
47 #define LOAD_K21 \
48         evlwwsplat      rK,4(rKP);
49
50 #define LOAD_K31 \
51         evlwwsplat      rK,8(rKP);
52
53 #define LOAD_K41 \
54         evlwwsplat      rK,12(rKP);
55
/*
 * Prologue: allocate a 128 byte stack frame and store r14-r23 as full
 * 64 bit values (evstdw) into the 8 byte slots at 8(r1) .. 80(r1). These
 * are the registers behind rW0-rW7, rT0 and rK. FINALIZE is the matching
 * epilogue and relies on the same slot layout.
 */
56 #define INITIALIZE \
57         stwu            r1,-128(r1);    /* create stack frame           */ \
58         evstdw          r14,8(r1);      /* We must save non volatile    */ \
59         evstdw          r15,16(r1);     /* registers. Take the chance   */ \
60         evstdw          r16,24(r1);     /* and save the SPE part too    */ \
61         evstdw          r17,32(r1);                                        \
62         evstdw          r18,40(r1);                                        \
63         evstdw          r19,48(r1);                                        \
64         evstdw          r20,56(r1);                                        \
65         evstdw          r21,64(r1);                                        \
66         evstdw          r22,72(r1);                                        \
67         evstdw          r23,80(r1);
68
69
/*
 * Epilogue: reload r14-r23 (64 bit each) from the slots written by
 * INITIALIZE, wipe the save area and release the 128 byte frame.
 *
 * The wipe zeroes r0 and stores it with stw, i.e. it clears only the first
 * 32 bit word of every 8 byte slot (offsets 8, 16, ... 80). The second
 * word of each slot is left as is; the inline comment below gives the
 * reasoning. r0 is rT1, so this macro may only run once the rounds are done.
 */
70 #define FINALIZE \
71         evldw           r14,8(r1);      /* restore SPE registers        */ \
72         evldw           r15,16(r1);                                        \
73         evldw           r16,24(r1);                                        \
74         evldw           r17,32(r1);                                        \
75         evldw           r18,40(r1);                                        \
76         evldw           r19,48(r1);                                        \
77         evldw           r20,56(r1);                                        \
78         evldw           r21,64(r1);                                        \
79         evldw           r22,72(r1);                                        \
80         evldw           r23,80(r1);                                        \
81         xor             r0,r0,r0;                                          \
82         stw             r0,8(r1);       /* Delete sensitive data        */ \
83         stw             r0,16(r1);      /* that we might have pushed    */ \
84         stw             r0,24(r1);      /* from other context that runs */ \
85         stw             r0,32(r1);      /* the same code. Assume that   */ \
86         stw             r0,40(r1);      /* the lower part of the GPRs   */ \
87         stw             r0,48(r1);      /* were already overwritten on  */ \
88         stw             r0,56(r1);      /* the way down to here         */ \
89         stw             r0,64(r1);                                         \
90         stw             r0,72(r1);                                         \
91         stw             r0,80(r1);                                         \
92         addi            r1,r1,128;      /* cleanup stack frame          */
93
/*
 * Input access, in two flavours that both advance rWP by 64 bytes per
 * block (a block issues 16 LOAD_DATA and one NEXT_BLOCK):
 *
 *  - __BIG_ENDIAN__: plain lwz at off(rWP); rWP is bumped once per block by
 *    NEXT_BLOCK.
 *  - otherwise: byte-reversing lwbrx at rWP followed by a 4 byte increment.
 *    The off argument is ignored here, so the loads must be issued in
 *    ascending word order; NEXT_BLOCK is empty.
 */
94 #ifdef __BIG_ENDIAN__
95 #define LOAD_DATA(reg, off) \
96         lwz             reg,off(rWP);   /* load data                    */
97 #define NEXT_BLOCK \
98         addi            rWP,rWP,64;     /* increment per block          */
99 #else
100 #define LOAD_DATA(reg, off) \
101         lwbrx           reg,0,rWP;      /* load data                    */ \
102         addi            rWP,rWP,4;      /* increment per word           */
103 #define NEXT_BLOCK                      /* nothing to do                */
104 #endif
105
/*
 * R_00_15 - two SHA-1 rounds with F = (B and C) or (~B and D), fed directly
 * from the input data.
 *
 *   a,b,c,d,e  working variables as seen by the first round. The second
 *              round runs with the roles rotated by one, i.e. its
 *              A,B,C,D,E are the arguments e,a,b,c,d. The "1:"/"2:"
 *              comments below name the roles, not the macro arguments.
 *   w0         receives the first input word (may be a scratch register)
 *   w1         receives the second input word and is finally merged so
 *              that its upper half holds the second and its lower half
 *              the first word
 *   k          selects LOAD_K<k>1; 0 keeps the current rK
 *   off        byte offset of the first word (only used by the
 *              __BIG_ENDIAN__ variant of LOAD_DATA)
 *
 * Modifies e and d (round results), b and a (rotated left by 30), w0, w1
 * and the temporaries rT0-rT2.
 */
106 #define R_00_15(a, b, c, d, e, w0, w1, k, off) \
107         LOAD_DATA(w0, off)              /* 1: W                         */ \
108         and             rT2,b,c;        /* 1: F' = B and C              */ \
109         LOAD_K##k##1                                                       \
110         andc            rT1,d,b;        /* 1: F" = ~B and D             */ \
111         rotrwi          rT0,a,27;       /* 1: A' = A rotl 5             */ \
112         or              rT2,rT2,rT1;    /* 1: F = F' or F"              */ \
113         add             e,e,rT0;        /* 1: E = E + A'                */ \
114         rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
115         add             e,e,w0;         /* 1: E = E + W                 */ \
116         LOAD_DATA(w1, off+4)            /* 2: W                         */ \
117         add             e,e,rT2;        /* 1: E = E + F                 */ \
118         and             rT1,a,b;        /* 2: F' = B and C              */ \
119         add             e,e,rK;         /* 1: E = E + K                 */ \
120         andc            rT2,c,a;        /* 2: F" = ~B and D             */ \
121         add             d,d,rK;         /* 2: E = E + K                 */ \
122         or              rT2,rT2,rT1;    /* 2: F = F' or F"              */ \
123         rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
124         add             d,d,w1;         /* 2: E = E + W                 */ \
125         rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
126         add             d,d,rT0;        /* 2: E = E + A'                */ \
127         evmergelo       w1,w1,w0;       /*    mix W[0]/W[1]             */ \
128         add             d,d,rT2         /* 2: E = E + F                 */
129
/*
 * R_16_19 - two SHA-1 rounds with F = (B and C) or (~B and D). The two
 * message words are computed on the fly, both at once, in a 64 bit register.
 *
 *   a,b,c,d,e  working variables of the first round; the second round
 *              uses e,a,b,c,d in the roles A,B,C,D,E
 *   w0         in: the W[-16] pair, out: the two new message words
 *   w1         the W[-14] pair
 *   w4         the W[-8] pair
 *   w6,w7      neighbouring pairs; evmergelohi combines the lower half of
 *              w7 with the upper half of w6 to form the W[-3] pair
 *   k          selects LOAD_K<k>1. The load sits after W + K has been
 *              formed, so a new constant first applies to the following
 *              macro expansion.
 *
 * After evaddw, rT0 holds W + K for both words: its lower word is added in
 * the first round, its upper word is copied into the lower word of rT1 by
 * evmergehi for the second round (the upper word of rT1 is passed through
 * unchanged by that instruction).
 * Modifies e, d, b, a, w0 and rT0-rT2.
 */
130 #define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
131         and             rT2,b,c;        /* 1: F' = B and C              */ \
132         evmergelohi     rT0,w7,w6;      /*    W[-3]                     */ \
133         andc            rT1,d,b;        /* 1: F" = ~B and D             */ \
134         evxor           w0,w0,rT0;      /*    W = W[-16] xor W[-3]      */ \
135         or              rT1,rT1,rT2;    /* 1: F = F' or F"              */ \
136         evxor           w0,w0,w4;       /*    W = W xor W[-8]           */ \
137         add             e,e,rT1;        /* 1: E = E + F                 */ \
138         evxor           w0,w0,w1;       /*    W = W xor W[-14]          */ \
139         rotrwi          rT2,a,27;       /* 1: A' = A rotl 5             */ \
140         evrlwi          w0,w0,1;        /*    W = W rotl 1              */ \
141         add             e,e,rT2;        /* 1: E = E + A'                */ \
142         evaddw          rT0,w0,rK;      /*    WK = W + K                */ \
143         rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
144         LOAD_K##k##1                                                       \
145         evmergehi       rT1,rT1,rT0;    /*    WK1/WK2                   */ \
146         add             e,e,rT0;        /* 1: E = E + WK                */ \
147         add             d,d,rT1;        /* 2: E = E + WK                */ \
148         and             rT2,a,b;        /* 2: F' = B and C              */ \
149         andc            rT1,c,a;        /* 2: F" = ~B and D             */ \
150         rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
151         or              rT1,rT1,rT2;    /* 2: F = F' or F"              */ \
152         add             d,d,rT0;        /* 2: E = E + A'                */ \
153         rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
154         add             d,d,rT1         /* 2: E = E + F                 */
155
/*
 * R_20_39 - two SHA-1 rounds with F = B xor C xor D, including the message
 * schedule for both words.
 *
 *   a,b,c,d,e  working variables of the first round; the second round
 *              uses e,a,b,c,d in the roles A,B,C,D,E
 *   w0         in: the W[-16] pair, out: the two new message words
 *   w1,w4      the W[-14] and W[-8] pairs
 *   w6,w7      supply the W[-3] pair (lower half of w7, upper half of w6)
 *   k          selects LOAD_K<k>1; loaded after W + K was formed, so a new
 *              constant first applies to the following macro expansion
 *
 * rT0 carries W + K for both words until its lower word has been added to
 * e; the upper word is moved to the lower word of rT1 for the second round.
 * Modifies e, d, b, a, w0 and rT0-rT2.
 */
156 #define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
157         evmergelohi     rT0,w7,w6;      /*    W[-3]                     */ \
158         xor             rT2,b,c;        /* 1: F' = B xor C              */ \
159         evxor           w0,w0,rT0;      /*    W = W[-16] xor W[-3]      */ \
160         xor             rT2,rT2,d;      /* 1: F = F' xor D              */ \
161         evxor           w0,w0,w4;       /*    W = W xor W[-8]           */ \
162         add             e,e,rT2;        /* 1: E = E + F                 */ \
163         evxor           w0,w0,w1;       /*    W = W xor W[-14]          */ \
164         rotrwi          rT2,a,27;       /* 1: A' = A rotl 5             */ \
165         evrlwi          w0,w0,1;        /*    W = W rotl 1              */ \
166         add             e,e,rT2;        /* 1: E = E + A'                */ \
167         evaddw          rT0,w0,rK;      /*    WK = W + K                */ \
168         rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
169         LOAD_K##k##1                                                       \
170         evmergehi       rT1,rT1,rT0;    /*    WK1/WK2                   */ \
171         add             e,e,rT0;        /* 1: E = E + WK                */ \
172         xor             rT2,a,b;        /* 2: F' = B xor C              */ \
173         add             d,d,rT1;        /* 2: E = E + WK                */ \
174         xor             rT2,rT2,c;      /* 2: F = F' xor D              */ \
175         rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
176         add             d,d,rT2;        /* 2: E = E + F                 */ \
177         rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
178         add             d,d,rT0         /* 2: E = E + A'                */
179
/*
 * R_40_59 - two SHA-1 rounds with F = (B and C) or (D and (B or C)),
 * including the message schedule for both words.
 *
 *   a,b,c,d,e  working variables of the first round; the second round
 *              uses e,a,b,c,d in the roles A,B,C,D,E
 *   w0         in: the W[-16] pair, out: the two new message words
 *   w1,w4      the W[-14] and W[-8] pairs
 *   w6,w7      supply the W[-3] pair (lower half of w7, upper half of w6)
 *   k          selects LOAD_K<k>1; loaded after W + K was formed, so a new
 *              constant first applies to the following macro expansion
 *
 * rT0 first carries W + K for both words. Once its lower word has been
 * added to e and its upper word copied to rT1, it is reused as scratch for
 * the second round.
 * Modifies e, d, b, a, w0 and rT0-rT2.
 */
180 #define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
181         and             rT2,b,c;        /* 1: F' = B and C              */ \
182         evmergelohi     rT0,w7,w6;      /*    W[-3]                     */ \
183         or              rT1,b,c;        /* 1: F" = B or C               */ \
184         evxor           w0,w0,rT0;      /*    W = W[-16] xor W[-3]      */ \
185         and             rT1,d,rT1;      /* 1: F" = F" and D             */ \
186         evxor           w0,w0,w4;       /*    W = W xor W[-8]           */ \
187         or              rT2,rT2,rT1;    /* 1: F = F' or F"              */ \
188         evxor           w0,w0,w1;       /*    W = W xor W[-14]          */ \
189         add             e,e,rT2;        /* 1: E = E + F                 */ \
190         evrlwi          w0,w0,1;        /*    W = W rotl 1              */ \
191         rotrwi          rT2,a,27;       /* 1: A' = A rotl 5             */ \
192         evaddw          rT0,w0,rK;      /*    WK = W + K                */ \
193         add             e,e,rT2;        /* 1: E = E + A'                */ \
194         LOAD_K##k##1                                                       \
195         evmergehi       rT1,rT1,rT0;    /*    WK1/WK2                   */ \
196         rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
197         add             e,e,rT0;        /* 1: E = E + WK                */ \
198         and             rT2,a,b;        /* 2: F' = B and C              */ \
199         or              rT0,a,b;        /* 2: F" = B or C               */ \
200         add             d,d,rT1;        /* 2: E = E + WK                */ \
201         and             rT0,c,rT0;      /* 2: F" = F" and D             */ \
202         rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
203         or              rT2,rT2,rT0;    /* 2: F = F' or F"              */ \
204         rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
205         add             d,d,rT2;        /* 2: E = E + F                 */ \
206         add             d,d,rT0         /* 2: E = E + A'                */
207
/*
 * R_60_79 - expands to exactly the same code as R_20_39 (same arguments,
 * same B xor C xor D function). The rounds differ only in the constant
 * that is held in rK when the macro runs.
 */
208 #define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
209         R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
210
/*
 * ppc_spe_sha1_transform - run the 80 SHA-1 rounds over one or more
 * consecutive 64 byte input blocks and accumulate into the hash state.
 *
 * In:  r3 (rHP)  pointer to the five 32 bit hash words; read at entry and
 *                rewritten after every block
 *      r4 (rWP)  pointer to the input data, advanced by 64 per block
 *      r5        number of blocks; copied to CTR, then reused as rKP
 *
 * Uses r0, r6-r12 and CTR without saving them; r14-r23 are saved and
 * restored by INITIALIZE/FINALIZE.
 *
 * NOTE(review): the block count is not checked for 0. bdnz decrements CTR
 * before testing it, so a count of 0 would not stop after zero blocks.
 * Callers presumably always pass at least 1 - confirm in the glue code.
 */
211 _GLOBAL(ppc_spe_sha1_transform)
212         INITIALIZE
213
/* Load the hash state; CTR setup and the table address are interleaved.   */
214         lwz             rH0,0(rHP)
215         lwz             rH1,4(rHP)
216         mtctr           r5              /* block count, before r5 becomes rKP */
217         lwz             rH2,8(rHP)
218         lis             rKP,PPC_SPE_SHA1_K@h
219         lwz             rH3,12(rHP)
220         ori             rKP,rKP,PPC_SPE_SHA1_K@l
221         lwz             rH4,16(rHP)
222
/* One pass of this loop body processes one 64 byte block.                 */
223 ppc_spe_sha1_main:
/*
 * Rounds 0-15. The first expansion (k = 1) loads the first constant into
 * rK. Each expansion borrows a register for the first word of its pair:
 * the next rW register (filled for real by the following expansion) or,
 * for the last two, rT3.
 */
224         R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
225         R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
226         R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
227         R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
228         R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
229         R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
230         R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
231         R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)
232
/* Rounds 16-19; the second expansion (k = 2) preloads the next constant.  */
233         R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
234         R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)
235
/* Rounds 20-39; the last expansion (k = 3) preloads the next constant.    */
236         R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
237         R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
238         R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
239         R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
240         R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
241         R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
242         R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
243         R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
244         R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
245         R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)
246
/* Rounds 40-59; the last expansion (k = 4) preloads the final constant.   */
247         R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
248         R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
249         R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
250         R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
251         R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
252         R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
253         R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
254         R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
255         R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
256         R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)
257
/*
 * Rounds 60-79. The previous hash words are reloaded in between the last
 * expansions, each into a register (rT3, rW1-rW4) that none of the
 * remaining expansions references any more.
 */
258         R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
259         R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
260         R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
261         R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
262         R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
263         R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
264         R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
265         lwz             rT3,0(rHP)
266         R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
267         lwz             rW1,4(rHP)
268         R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
269         lwz             rW2,8(rHP)
270         R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
271         lwz             rW3,12(rHP)
272         NEXT_BLOCK
273         lwz             rW4,16(rHP)
274
/* Add the previous hash words to the round results and store them back.   */
275         add             rH0,rH0,rT3
276         stw             rH0,0(rHP)
277         add             rH1,rH1,rW1
278         stw             rH1,4(rHP)
279         add             rH2,rH2,rW2
280         stw             rH2,8(rHP)
281         add             rH3,rH3,rW3
282         stw             rH3,12(rHP)
283         add             rH4,rH4,rW4
284         stw             rH4,16(rHP)
285
/* Decrement CTR and run the next block while it is non-zero.              */
286         bdnz            ppc_spe_sha1_main
287
288         FINALIZE
289         blr
290
/*
 * The four SHA-1 round constants, in the order in which the main loop
 * selects them: LOAD_K11/21/31/41 read the words at offsets 0/4/8/12, for
 * the rounds 0-19, 20-39, 40-59 and 60-79 respectively.
 *
 * NOTE(review): nothing in this file writes to the table, yet it is placed
 * in .data; a read-only section looks like the better home - confirm that
 * the absolute lis/ori addressing used by the code still works there.
 */
291 .data
292 .align 4
293 PPC_SPE_SHA1_K:
294         .long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6