lantiq: fix broadcasts and vlans in two iface mode
[oweals/openwrt.git] / target / linux / brcm2708 / patches-4.9 / 0165-drm-vc4-Add-fragment-shader-threading-support.patch
1 From 8f5722ac3e42a33345bfd82b7ad6a153134a4239 Mon Sep 17 00:00:00 2001
2 From: Jonas Pfeil <pfeiljonas@gmx.de>
3 Date: Tue, 8 Nov 2016 00:18:39 +0100
4 Subject: [PATCH] drm/vc4: Add fragment shader threading support
5
6 FS threading brings performance improvements of 0-20% in glmark2.
7
8 The validation code checks for thread switch signals and ensures that
9 the registers of the other thread are not touched, and that our clamps
10 are not live across thread switches.  It also checks that the
11 threading and branching instructions do not interfere.
12
13 (Original patch by Jonas, changes by anholt for style cleanup,
14 removing validation the kernel doesn't need to do, and adding the flag
15 for userspace).
16
17 v2: Minor style fixes from checkpatch.
18
19 Signed-off-by: Jonas Pfeil <pfeiljonas@gmx.de>
20 Signed-off-by: Eric Anholt <eric@anholt.net>
21 (cherry picked from commit c778cc5df944291dcdb1ca7a6bb781fbc22550c5)
22 ---
23  drivers/gpu/drm/vc4/vc4_drv.c              |  1 +
24  drivers/gpu/drm/vc4/vc4_drv.h              |  2 +
25  drivers/gpu/drm/vc4/vc4_validate.c         | 17 +++++---
26  drivers/gpu/drm/vc4/vc4_validate_shaders.c | 63 ++++++++++++++++++++++++++++++
27  include/uapi/drm/vc4_drm.h                 |  1 +
28  5 files changed, 79 insertions(+), 5 deletions(-)
29
30 --- a/drivers/gpu/drm/vc4/vc4_drv.c
31 +++ b/drivers/gpu/drm/vc4/vc4_drv.c
32 @@ -82,6 +82,7 @@ static int vc4_get_param_ioctl(struct dr
33                 break;
34         case DRM_VC4_PARAM_SUPPORTS_BRANCHES:
35         case DRM_VC4_PARAM_SUPPORTS_ETC1:
36 +       case DRM_VC4_PARAM_SUPPORTS_THREADED_FS:
37                 args->value = true;
38                 break;
39         default:
40 --- a/drivers/gpu/drm/vc4/vc4_drv.h
41 +++ b/drivers/gpu/drm/vc4/vc4_drv.h
42 @@ -384,6 +384,8 @@ struct vc4_validated_shader_info {
43  
44         uint32_t num_uniform_addr_offsets;
45         uint32_t *uniform_addr_offsets;
46 +
47 +       bool is_threaded;
48  };
49  
50  /**
51 --- a/drivers/gpu/drm/vc4/vc4_validate.c
52 +++ b/drivers/gpu/drm/vc4/vc4_validate.c
53 @@ -789,11 +789,6 @@ validate_gl_shader_rec(struct drm_device
54         exec->shader_rec_v += roundup(packet_size, 16);
55         exec->shader_rec_size -= packet_size;
56  
57 -       if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) {
58 -               DRM_ERROR("Multi-threaded fragment shaders not supported.\n");
59 -               return -EINVAL;
60 -       }
61 -
62         for (i = 0; i < shader_reloc_count; i++) {
63                 if (src_handles[i] > exec->bo_count) {
64                         DRM_ERROR("Shader handle %d too big\n", src_handles[i]);
65 @@ -810,6 +805,18 @@ validate_gl_shader_rec(struct drm_device
66                         return -EINVAL;
67         }
68  
69 +       if (((*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD) == 0) !=
70 +           to_vc4_bo(&bo[0]->base)->validated_shader->is_threaded) {
71 +               DRM_ERROR("Thread mode of CL and FS do not match\n");
72 +               return -EINVAL;
73 +       }
74 +
75 +       if (to_vc4_bo(&bo[1]->base)->validated_shader->is_threaded ||
76 +           to_vc4_bo(&bo[2]->base)->validated_shader->is_threaded) {
77 +               DRM_ERROR("cs and vs cannot be threaded\n");
78 +               return -EINVAL;
79 +       }
80 +
81         for (i = 0; i < shader_reloc_count; i++) {
82                 struct vc4_validated_shader_info *validated_shader;
83                 uint32_t o = shader_reloc_offsets[i];
84 --- a/drivers/gpu/drm/vc4/vc4_validate_shaders.c
85 +++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
86 @@ -83,6 +83,13 @@ struct vc4_shader_validation_state {
87          * basic blocks.
88          */
89         bool needs_uniform_address_for_loop;
90 +
91 +       /* Set when we find an instruction reading or writing the top
92 +        * half of the register files.  If a threaded shader wrote those
93 +        * unusable regs, the clamp validation of the other shader
94 +        * running on our QPU would be invalid.
95 +        */
96 +       bool all_registers_used;
97  };
98  
99  static uint32_t
100 @@ -119,6 +126,13 @@ raddr_add_a_to_live_reg_index(uint64_t i
101  }
102  
103  static bool
104 +live_reg_is_upper_half(uint32_t lri)
105 +{
106 +       return  (lri >= 16 && lri < 32) ||
107 +               (lri >= 32 + 16 && lri < 32 + 32);
108 +}
109 +
110 +static bool
111  is_tmu_submit(uint32_t waddr)
112  {
113         return (waddr == QPU_W_TMU0_S ||
114 @@ -390,6 +404,9 @@ check_reg_write(struct vc4_validated_sha
115                 } else {
116                         validation_state->live_immediates[lri] = ~0;
117                 }
118 +
119 +               if (live_reg_is_upper_half(lri))
120 +                       validation_state->all_registers_used = true;
121         }
122  
123         switch (waddr) {
124 @@ -598,6 +615,11 @@ check_instruction_reads(struct vc4_valid
125                 }
126         }
127  
128 +       if ((raddr_a >= 16 && raddr_a < 32) ||
129 +           (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) {
130 +               validation_state->all_registers_used = true;
131 +       }
132 +
133         return true;
134  }
135  
136 @@ -753,6 +775,7 @@ vc4_validate_shader(struct drm_gem_cma_o
137  {
138         bool found_shader_end = false;
139         int shader_end_ip = 0;
140 +       uint32_t last_thread_switch_ip = -3;
141         uint32_t ip;
142         struct vc4_validated_shader_info *validated_shader = NULL;
143         struct vc4_shader_validation_state validation_state;
144 @@ -785,6 +808,17 @@ vc4_validate_shader(struct drm_gem_cma_o
145                 if (!vc4_handle_branch_target(&validation_state))
146                         goto fail;
147  
148 +               if (ip == last_thread_switch_ip + 3) {
149 +                       /* Reset r0-r3 live clamp data */
150 +                       int i;
151 +
152 +                       for (i = 64; i < LIVE_REG_COUNT; i++) {
153 +                               validation_state.live_min_clamp_offsets[i] = ~0;
154 +                               validation_state.live_max_clamp_regs[i] = false;
155 +                               validation_state.live_immediates[i] = ~0;
156 +                       }
157 +               }
158 +
159                 switch (sig) {
160                 case QPU_SIG_NONE:
161                 case QPU_SIG_WAIT_FOR_SCOREBOARD:
162 @@ -794,6 +828,8 @@ vc4_validate_shader(struct drm_gem_cma_o
163                 case QPU_SIG_LOAD_TMU1:
164                 case QPU_SIG_PROG_END:
165                 case QPU_SIG_SMALL_IMM:
166 +               case QPU_SIG_THREAD_SWITCH:
167 +               case QPU_SIG_LAST_THREAD_SWITCH:
168                         if (!check_instruction_writes(validated_shader,
169                                                       &validation_state)) {
170                                 DRM_ERROR("Bad write at ip %d\n", ip);
171 @@ -809,6 +845,18 @@ vc4_validate_shader(struct drm_gem_cma_o
172                                 shader_end_ip = ip;
173                         }
174  
175 +                       if (sig == QPU_SIG_THREAD_SWITCH ||
176 +                           sig == QPU_SIG_LAST_THREAD_SWITCH) {
177 +                               validated_shader->is_threaded = true;
178 +
179 +                               if (ip < last_thread_switch_ip + 3) {
180 +                                       DRM_ERROR("Thread switch too soon after "
181 +                                                 "last switch at ip %d\n", ip);
182 +                                       goto fail;
183 +                               }
184 +                               last_thread_switch_ip = ip;
185 +                       }
186 +
187                         break;
188  
189                 case QPU_SIG_LOAD_IMM:
190 @@ -823,6 +871,13 @@ vc4_validate_shader(struct drm_gem_cma_o
191                         if (!check_branch(inst, validated_shader,
192                                           &validation_state, ip))
193                                 goto fail;
194 +
195 +                       if (ip < last_thread_switch_ip + 3) {
196 +                               DRM_ERROR("Branch in thread switch at ip %d",
197 +                                         ip);
198 +                               goto fail;
199 +                       }
200 +
201                         break;
202                 default:
203                         DRM_ERROR("Unsupported QPU signal %d at "
204 @@ -844,6 +899,14 @@ vc4_validate_shader(struct drm_gem_cma_o
205                 goto fail;
206         }
207  
208 +       /* Might corrupt other thread */
209 +       if (validated_shader->is_threaded &&
210 +           validation_state.all_registers_used) {
211 +               DRM_ERROR("Shader uses threading, but uses the upper "
212 +                         "half of the registers, too\n");
213 +               goto fail;
214 +       }
215 +
216         /* If we did a backwards branch and we haven't emitted a uniforms
217          * reset since then, we still need the uniforms stream to have the
218          * uniforms address available so that the backwards branch can do its
219 --- a/include/uapi/drm/vc4_drm.h
220 +++ b/include/uapi/drm/vc4_drm.h
221 @@ -287,6 +287,7 @@ struct drm_vc4_get_hang_state {
222  #define DRM_VC4_PARAM_V3D_IDENT2               2
223  #define DRM_VC4_PARAM_SUPPORTS_BRANCHES                3
224  #define DRM_VC4_PARAM_SUPPORTS_ETC1            4
225 +#define DRM_VC4_PARAM_SUPPORTS_THREADED_FS     5
226  
227  struct drm_vc4_get_param {
228         __u32 param;