Linux-libre 5.4.49-gnu
[librecmc/linux-libre.git] / drivers / gpu / drm / i915 / gt / intel_lrc.c
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things to the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself,
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But, what about the ringbuffer control registers (head, tail, etc..)?
49  * shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bits submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
134 #include <linux/interrupt.h>
135
136 #include "gem/i915_gem_context.h"
137
138 #include "i915_drv.h"
139 #include "i915_perf.h"
140 #include "i915_trace.h"
141 #include "i915_vgpu.h"
142 #include "intel_engine_pm.h"
143 #include "intel_gt.h"
144 #include "intel_gt_pm.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_workarounds.h"
149
/* RING_EXECLIST_* bit definitions. */
#define RING_EXECLIST_QFULL		(1 << 2)
#define RING_EXECLIST1_VALID		(1 << 3)
#define RING_EXECLIST0_VALID		(1 << 4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 14)
#define RING_EXECLIST1_ACTIVE		(1 << 17)
#define RING_EXECLIST0_ACTIVE		(1 << 18)

/* GEN8_CTX_STATUS_* event flags. */
#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

/* Either of these events counts as "completed". */
#define GEN8_CTX_STATUS_COMPLETED_MASK \
	(GEN8_CTX_STATUS_PREEMPTED | GEN8_CTX_STATUS_COMPLETE)

/* Descriptor flag OR'ed in when a full context restore must be forced. */
#define CTX_DESC_FORCE_RESTORE		BIT_ULL(2)

/* Gen12 CSB decoding helpers. */
#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(1 << 0) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xf) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK	GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7ff
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(GEN12_IDLE_CTX_ID != FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw))

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE		64 /* bytes */

/* Tail padding, expressed in dwords and in bytes. */
#define WA_TAIL_DWORDS			2
#define WA_TAIL_BYTES			(WA_TAIL_DWORDS * sizeof(u32))
180
181 struct virtual_engine {
182         struct intel_engine_cs base;
183         struct intel_context context;
184
185         /*
186          * We allow only a single request through the virtual engine at a time
187          * (each request in the timeline waits for the completion fence of
188          * the previous before being submitted). By restricting ourselves to
189          * only submitting a single request, each request is placed on to a
190          * physical to maximise load spreading (by virtue of the late greedy
191          * scheduling -- each real engine takes the next available request
192          * upon idling).
193          */
194         struct i915_request *request;
195
196         /*
197          * We keep a rbtree of available virtual engines inside each physical
198          * engine, sorted by priority. Here we preallocate the nodes we need
199          * for the virtual engine, indexed by physical_engine->id.
200          */
201         struct ve_node {
202                 struct rb_node rb;
203                 int prio;
204         } nodes[I915_NUM_ENGINES];
205
206         /*
207          * Keep track of bonded pairs -- restrictions upon on our selection
208          * of physical engines any particular request may be submitted to.
209          * If we receive a submit-fence from a master engine, we will only
210          * use one of sibling_mask physical engines.
211          */
212         struct ve_bond {
213                 const struct intel_engine_cs *master;
214                 intel_engine_mask_t sibling_mask;
215         } *bonds;
216         unsigned int num_bonds;
217
218         /* And finally, which physical engines this virtual engine maps onto. */
219         unsigned int num_siblings;
220         struct intel_engine_cs *siblings[0];
221 };
222
223 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
224 {
225         GEM_BUG_ON(!intel_engine_is_virtual(engine));
226         return container_of(engine, struct virtual_engine, base);
227 }
228
229 static int __execlists_context_alloc(struct intel_context *ce,
230                                      struct intel_engine_cs *engine);
231
232 static void execlists_init_reg_state(u32 *reg_state,
233                                      struct intel_context *ce,
234                                      struct intel_engine_cs *engine,
235                                      struct intel_ring *ring);
236
237 static void mark_eio(struct i915_request *rq)
238 {
239         if (!i915_request_signaled(rq))
240                 dma_fence_set_error(&rq->fence, -EIO);
241         i915_request_mark_complete(rq);
242 }
243
244 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
245 {
246         return (i915_ggtt_offset(engine->status_page.vma) +
247                 I915_GEM_HWS_PREEMPT_ADDR);
248 }
249
250 static inline void
251 ring_set_paused(const struct intel_engine_cs *engine, int state)
252 {
253         /*
254          * We inspect HWS_PREEMPT with a semaphore inside
255          * engine->emit_fini_breadcrumb. If the dword is true,
256          * the ring is paused as the semaphore will busywait
257          * until the dword is false.
258          */
259         engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
260         if (state)
261                 wmb();
262 }
263
264 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
265 {
266         return rb_entry(rb, struct i915_priolist, node);
267 }
268
269 static inline int rq_prio(const struct i915_request *rq)
270 {
271         return rq->sched.attr.priority;
272 }
273
274 static int effective_prio(const struct i915_request *rq)
275 {
276         int prio = rq_prio(rq);
277
278         /*
279          * If this request is special and must not be interrupted at any
280          * cost, so be it. Note we are only checking the most recent request
281          * in the context and so may be masking an earlier vip request. It
282          * is hoped that under the conditions where nopreempt is used, this
283          * will not matter (i.e. all requests to that context will be
284          * nopreempt for as long as desired).
285          */
286         if (i915_request_has_nopreempt(rq))
287                 prio = I915_PRIORITY_UNPREEMPTABLE;
288
289         /*
290          * On unwinding the active request, we give it a priority bump
291          * if it has completed waiting on any semaphore. If we know that
292          * the request has already started, we can prevent an unwanted
293          * preempt-to-idle cycle by taking that into account now.
294          */
295         if (__i915_request_has_started(rq))
296                 prio |= I915_PRIORITY_NOSEMAPHORE;
297
298         /* Restrict mere WAIT boosts from triggering preemption */
299         BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
300         return prio | __NO_PREEMPTION;
301 }
302
303 static int queue_prio(const struct intel_engine_execlists *execlists)
304 {
305         struct i915_priolist *p;
306         struct rb_node *rb;
307
308         rb = rb_first_cached(&execlists->queue);
309         if (!rb)
310                 return INT_MIN;
311
312         /*
313          * As the priolist[] are inverted, with the highest priority in [0],
314          * we have to flip the index value to become priority.
315          */
316         p = to_priolist(rb);
317         return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
318 }
319
320 static inline bool need_preempt(const struct intel_engine_cs *engine,
321                                 const struct i915_request *rq,
322                                 struct rb_node *rb)
323 {
324         int last_prio;
325
326         if (!intel_engine_has_semaphores(engine))
327                 return false;
328
329         /*
330          * Check if the current priority hint merits a preemption attempt.
331          *
332          * We record the highest value priority we saw during rescheduling
333          * prior to this dequeue, therefore we know that if it is strictly
334          * less than the current tail of ESLP[0], we do not need to force
335          * a preempt-to-idle cycle.
336          *
337          * However, the priority hint is a mere hint that we may need to
338          * preempt. If that hint is stale or we may be trying to preempt
339          * ourselves, ignore the request.
340          */
341         last_prio = effective_prio(rq);
342         if (!i915_scheduler_need_preempt(engine->execlists.queue_priority_hint,
343                                          last_prio))
344                 return false;
345
346         /*
347          * Check against the first request in ELSP[1], it will, thanks to the
348          * power of PI, be the highest priority of that context.
349          */
350         if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
351             rq_prio(list_next_entry(rq, sched.link)) > last_prio)
352                 return true;
353
354         if (rb) {
355                 struct virtual_engine *ve =
356                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
357                 bool preempt = false;
358
359                 if (engine == ve->siblings[0]) { /* only preempt one sibling */
360                         struct i915_request *next;
361
362                         rcu_read_lock();
363                         next = READ_ONCE(ve->request);
364                         if (next)
365                                 preempt = rq_prio(next) > last_prio;
366                         rcu_read_unlock();
367                 }
368
369                 if (preempt)
370                         return preempt;
371         }
372
373         /*
374          * If the inflight context did not trigger the preemption, then maybe
375          * it was the set of queued requests? Pick the highest priority in
376          * the queue (the first active priolist) and see if it deserves to be
377          * running instead of ELSP[0].
378          *
379          * The highest priority request in the queue can not be either
380          * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
381          * context, it's priority would not exceed ELSP[0] aka last_prio.
382          */
383         return queue_prio(&engine->execlists) > last_prio;
384 }
385
386 __maybe_unused static inline bool
387 assert_priority_queue(const struct i915_request *prev,
388                       const struct i915_request *next)
389 {
390         /*
391          * Without preemption, the prev may refer to the still active element
392          * which we refuse to let go.
393          *
394          * Even with preemption, there are times when we think it is better not
395          * to preempt and leave an ostensibly lower priority request in flight.
396          */
397         if (i915_request_is_active(prev))
398                 return true;
399
400         return rq_prio(prev) >= rq_prio(next);
401 }
402
403 /*
404  * The context descriptor encodes various attributes of a context,
405  * including its GTT address and some flags. Because it's fairly
406  * expensive to calculate, we'll just do it once and cache the result,
407  * which remains valid until the context is unpinned.
408  *
409  * This is what a descriptor looks like, from LSB to MSB::
410  *
411  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
412  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
413  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
414  *      bits 53-54:    mbz, reserved for use by hardware
415  *      bits 55-63:    group ID, currently unused and set to 0
416  *
417  * Starting from Gen11, the upper dword of the descriptor has a new format:
418  *
419  *      bits 32-36:    reserved
420  *      bits 37-47:    SW context ID
421  *      bits 48-53:    engine instance
422  *      bit 54:        mbz, reserved for use by hardware
423  *      bits 55-60:    SW counter
424  *      bits 61-63:    engine class
425  *
426  * engine info, SW context ID and SW counter need to form a unique number
427  * (Context ID) per lrc.
428  */
429 static u64
430 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
431 {
432         struct i915_gem_context *ctx = ce->gem_context;
433         u64 desc;
434
435         BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (BIT(GEN8_CTX_ID_WIDTH)));
436         BUILD_BUG_ON(GEN11_MAX_CONTEXT_HW_ID > (BIT(GEN11_SW_CTX_ID_WIDTH)));
437
438         desc = INTEL_LEGACY_32B_CONTEXT;
439         if (i915_vm_is_4lvl(ce->vm))
440                 desc = INTEL_LEGACY_64B_CONTEXT;
441         desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
442
443         desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
444         if (IS_GEN(engine->i915, 8))
445                 desc |= GEN8_CTX_L3LLC_COHERENT;
446
447         desc |= i915_ggtt_offset(ce->state) + LRC_HEADER_PAGES * PAGE_SIZE;
448                                                                 /* bits 12-31 */
449         /*
450          * The following 32bits are copied into the OA reports (dword 2).
451          * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
452          * anything below.
453          */
454         if (INTEL_GEN(engine->i915) >= 11) {
455                 GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH));
456                 desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT;
457                                                                 /* bits 37-47 */
458
459                 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
460                                                                 /* bits 48-53 */
461
462                 /* TODO: decide what to do with SW counter (bits 55-60) */
463
464                 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
465                                                                 /* bits 61-63 */
466         } else {
467                 GEM_BUG_ON(ctx->hw_id >= BIT(GEN8_CTX_ID_WIDTH));
468                 desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT;   /* bits 32-52 */
469         }
470
471         return desc;
472 }
473
474 static struct i915_request *
475 __unwind_incomplete_requests(struct intel_engine_cs *engine)
476 {
477         struct i915_request *rq, *rn, *active = NULL;
478         struct list_head *uninitialized_var(pl);
479         int prio = I915_PRIORITY_INVALID;
480
481         lockdep_assert_held(&engine->active.lock);
482
483         list_for_each_entry_safe_reverse(rq, rn,
484                                          &engine->active.requests,
485                                          sched.link) {
486                 struct intel_engine_cs *owner;
487
488                 if (i915_request_completed(rq))
489                         continue; /* XXX */
490
491                 __i915_request_unsubmit(rq);
492
493                 /*
494                  * Push the request back into the queue for later resubmission.
495                  * If this request is not native to this physical engine (i.e.
496                  * it came from a virtual source), push it back onto the virtual
497                  * engine so that it can be moved across onto another physical
498                  * engine as load dictates.
499                  */
500                 owner = rq->hw_context->engine;
501                 if (likely(owner == engine)) {
502                         GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
503                         if (rq_prio(rq) != prio) {
504                                 prio = rq_prio(rq);
505                                 pl = i915_sched_lookup_priolist(engine, prio);
506                         }
507                         GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
508
509                         list_move(&rq->sched.link, pl);
510                         active = rq;
511                 } else {
512                         /*
513                          * Decouple the virtual breadcrumb before moving it
514                          * back to the virtual engine -- we don't want the
515                          * request to complete in the background and try
516                          * and cancel the breadcrumb on the virtual engine
517                          * (instead of the old engine where it is linked)!
518                          */
519                         if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
520                                      &rq->fence.flags)) {
521                                 spin_lock_nested(&rq->lock,
522                                                  SINGLE_DEPTH_NESTING);
523                                 i915_request_cancel_breadcrumb(rq);
524                                 spin_unlock(&rq->lock);
525                         }
526                         rq->engine = owner;
527                         owner->submit_request(rq);
528                         active = NULL;
529                 }
530         }
531
532         return active;
533 }
534
535 struct i915_request *
536 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
537 {
538         struct intel_engine_cs *engine =
539                 container_of(execlists, typeof(*engine), execlists);
540
541         return __unwind_incomplete_requests(engine);
542 }
543
544 static inline void
545 execlists_context_status_change(struct i915_request *rq, unsigned long status)
546 {
547         /*
548          * Only used when GVT-g is enabled now. When GVT-g is disabled,
549          * The compiler should eliminate this function as dead-code.
550          */
551         if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
552                 return;
553
554         atomic_notifier_call_chain(&rq->engine->context_status_notifier,
555                                    status, rq);
556 }
557
558 static inline struct intel_engine_cs *
559 __execlists_schedule_in(struct i915_request *rq)
560 {
561         struct intel_engine_cs * const engine = rq->engine;
562         struct intel_context * const ce = rq->hw_context;
563
564         intel_context_get(ce);
565
566         intel_gt_pm_get(engine->gt);
567         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
568         intel_engine_context_in(engine);
569
570         return engine;
571 }
572
573 static inline struct i915_request *
574 execlists_schedule_in(struct i915_request *rq, int idx)
575 {
576         struct intel_context * const ce = rq->hw_context;
577         struct intel_engine_cs *old;
578
579         GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
580         trace_i915_request_in(rq, idx);
581
582         old = READ_ONCE(ce->inflight);
583         do {
584                 if (!old) {
585                         WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
586                         break;
587                 }
588         } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
589
590         GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
591         return i915_request_get(rq);
592 }
593
594 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
595 {
596         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
597         struct i915_request *next = READ_ONCE(ve->request);
598
599         if (next && next->execution_mask & ~rq->execution_mask)
600                 tasklet_schedule(&ve->base.execlists.tasklet);
601 }
602
603 static inline void
604 __execlists_schedule_out(struct i915_request *rq,
605                          struct intel_engine_cs * const engine)
606 {
607         struct intel_context * const ce = rq->hw_context;
608
609         intel_engine_context_out(engine);
610         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
611         intel_gt_pm_put(engine->gt);
612
613         /*
614          * If this is part of a virtual engine, its next request may
615          * have been blocked waiting for access to the active context.
616          * We have to kick all the siblings again in case we need to
617          * switch (e.g. the next request is not runnable on this
618          * engine). Hopefully, we will already have submitted the next
619          * request before the tasklet runs and do not need to rebuild
620          * each virtual tree and kick everyone again.
621          */
622         if (ce->engine != engine)
623                 kick_siblings(rq, ce);
624
625         intel_context_put(ce);
626 }
627
628 static inline void
629 execlists_schedule_out(struct i915_request *rq)
630 {
631         struct intel_context * const ce = rq->hw_context;
632         struct intel_engine_cs *cur, *old;
633
634         trace_i915_request_out(rq);
635
636         old = READ_ONCE(ce->inflight);
637         do
638                 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
639         while (!try_cmpxchg(&ce->inflight, &old, cur));
640         if (!cur)
641                 __execlists_schedule_out(rq, old);
642
643         i915_request_put(rq);
644 }
645
646 static u64 execlists_update_context(struct i915_request *rq)
647 {
648         struct intel_context *ce = rq->hw_context;
649         u64 desc = ce->lrc_desc;
650         u32 tail, prev;
651
652         /*
653          * WaIdleLiteRestore:bdw,skl
654          *
655          * We should never submit the context with the same RING_TAIL twice
656          * just in case we submit an empty ring, which confuses the HW.
657          *
658          * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
659          * the normal request to be able to always advance the RING_TAIL on
660          * subsequent resubmissions (for lite restore). Should that fail us,
661          * and we try and submit the same tail again, force the context
662          * reload.
663          *
664          * If we need to return to a preempted context, we need to skip the
665          * lite-restore and force it to reload the RING_TAIL. Otherwise, the
666          * HW has a tendency to ignore us rewinding the TAIL to the end of
667          * an earlier request.
668          */
669         tail = intel_ring_set_tail(rq->ring, rq->tail);
670         prev = ce->lrc_reg_state[CTX_RING_TAIL + 1];
671         if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
672                 desc |= CTX_DESC_FORCE_RESTORE;
673         ce->lrc_reg_state[CTX_RING_TAIL + 1] = tail;
674         rq->tail = rq->wa_tail;
675
676         /*
677          * Make sure the context image is complete before we submit it to HW.
678          *
679          * Ostensibly, writes (including the WCB) should be flushed prior to
680          * an uncached write such as our mmio register access, the empirical
681          * evidence (esp. on Braswell) suggests that the WC write into memory
682          * may not be visible to the HW prior to the completion of the UC
683          * register write and that we may begin execution from the context
684          * before its image is complete leading to invalid PD chasing.
685          *
686          * Furthermore, Braswell, at least, wants a full mb to be sure that
687          * the writes are coherent in memory (visible to the GPU) prior to
688          * execution, and not just visible to other CPUs (as is the result of
689          * wmb).
690          */
691         mb();
692
693         ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
694
695         return desc;
696 }
697
698 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
699 {
700         if (execlists->ctrl_reg) {
701                 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
702                 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
703         } else {
704                 writel(upper_32_bits(desc), execlists->submit_reg);
705                 writel(lower_32_bits(desc), execlists->submit_reg);
706         }
707 }
708
709 static __maybe_unused void
710 trace_ports(const struct intel_engine_execlists *execlists,
711             const char *msg,
712             struct i915_request * const *ports)
713 {
714         const struct intel_engine_cs *engine =
715                 container_of(execlists, typeof(*engine), execlists);
716
717         GEM_TRACE("%s: %s { %llx:%lld%s, %llx:%lld }\n",
718                   engine->name, msg,
719                   ports[0]->fence.context,
720                   ports[0]->fence.seqno,
721                   i915_request_completed(ports[0]) ? "!" :
722                   i915_request_started(ports[0]) ? "*" :
723                   "",
724                   ports[1] ? ports[1]->fence.context : 0,
725                   ports[1] ? ports[1]->fence.seqno : 0);
726 }
727
728 static __maybe_unused bool
729 assert_pending_valid(const struct intel_engine_execlists *execlists,
730                      const char *msg)
731 {
732         struct i915_request * const *port, *rq;
733         struct intel_context *ce = NULL;
734
735         trace_ports(execlists, msg, execlists->pending);
736
737         if (!execlists->pending[0])
738                 return false;
739
740         if (execlists->pending[execlists_num_ports(execlists)])
741                 return false;
742
743         for (port = execlists->pending; (rq = *port); port++) {
744                 if (ce == rq->hw_context)
745                         return false;
746
747                 ce = rq->hw_context;
748                 if (i915_request_completed(rq))
749                         continue;
750
751                 if (i915_active_is_idle(&ce->active))
752                         return false;
753
754                 if (!i915_vma_is_pinned(ce->state))
755                         return false;
756         }
757
758         return ce;
759 }
760
761 static void execlists_submit_ports(struct intel_engine_cs *engine)
762 {
763         struct intel_engine_execlists *execlists = &engine->execlists;
764         unsigned int n;
765
766         GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
767
768         /*
769          * We can skip acquiring intel_runtime_pm_get() here as it was taken
770          * on our behalf by the request (see i915_gem_mark_busy()) and it will
771          * not be relinquished until the device is idle (see
772          * i915_gem_idle_work_handler()). As a precaution, we make sure
773          * that all ELSP are drained i.e. we have processed the CSB,
774          * before allowing ourselves to idle and calling intel_runtime_pm_put().
775          */
776         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
777
778         /*
779          * ELSQ note: the submit queue is not cleared after being submitted
780          * to the HW so we need to make sure we always clean it up. This is
781          * currently ensured by the fact that we always write the same number
782          * of elsq entries, keep this in mind before changing the loop below.
783          */
784         for (n = execlists_num_ports(execlists); n--; ) {
785                 struct i915_request *rq = execlists->pending[n];
786
787                 write_desc(execlists,
788                            rq ? execlists_update_context(rq) : 0,
789                            n);
790         }
791
792         /* we need to manually load the submit queue */
793         if (execlists->ctrl_reg)
794                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
795 }
796
797 static bool ctx_single_port_submission(const struct intel_context *ce)
798 {
799         return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
800                 i915_gem_context_force_single_submission(ce->gem_context));
801 }
802
803 static bool can_merge_ctx(const struct intel_context *prev,
804                           const struct intel_context *next)
805 {
806         if (prev != next)
807                 return false;
808
809         if (ctx_single_port_submission(prev))
810                 return false;
811
812         return true;
813 }
814
815 static bool can_merge_rq(const struct i915_request *prev,
816                          const struct i915_request *next)
817 {
818         GEM_BUG_ON(prev == next);
819         GEM_BUG_ON(!assert_priority_queue(prev, next));
820
821         /*
822          * We do not submit known completed requests. Therefore if the next
823          * request is already completed, we can pretend to merge it in
824          * with the previous context (and we will skip updating the ELSP
825          * and tracking). Thus hopefully keeping the ELSP full with active
826          * contexts, despite the best efforts of preempt-to-busy to confuse
827          * us.
828          */
829         if (i915_request_completed(next))
830                 return true;
831
832         if (!can_merge_ctx(prev->hw_context, next->hw_context))
833                 return false;
834
835         return true;
836 }
837
838 static void virtual_update_register_offsets(u32 *regs,
839                                             struct intel_engine_cs *engine)
840 {
841         u32 base = engine->mmio_base;
842
843         /* Must match execlists_init_reg_state()! */
844
845         regs[CTX_CONTEXT_CONTROL] =
846                 i915_mmio_reg_offset(RING_CONTEXT_CONTROL(base));
847         regs[CTX_RING_HEAD] = i915_mmio_reg_offset(RING_HEAD(base));
848         regs[CTX_RING_TAIL] = i915_mmio_reg_offset(RING_TAIL(base));
849         regs[CTX_RING_BUFFER_START] = i915_mmio_reg_offset(RING_START(base));
850         regs[CTX_RING_BUFFER_CONTROL] = i915_mmio_reg_offset(RING_CTL(base));
851
852         regs[CTX_BB_HEAD_U] = i915_mmio_reg_offset(RING_BBADDR_UDW(base));
853         regs[CTX_BB_HEAD_L] = i915_mmio_reg_offset(RING_BBADDR(base));
854         regs[CTX_BB_STATE] = i915_mmio_reg_offset(RING_BBSTATE(base));
855         regs[CTX_SECOND_BB_HEAD_U] =
856                 i915_mmio_reg_offset(RING_SBBADDR_UDW(base));
857         regs[CTX_SECOND_BB_HEAD_L] = i915_mmio_reg_offset(RING_SBBADDR(base));
858         regs[CTX_SECOND_BB_STATE] = i915_mmio_reg_offset(RING_SBBSTATE(base));
859
860         regs[CTX_CTX_TIMESTAMP] =
861                 i915_mmio_reg_offset(RING_CTX_TIMESTAMP(base));
862         regs[CTX_PDP3_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 3));
863         regs[CTX_PDP3_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 3));
864         regs[CTX_PDP2_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 2));
865         regs[CTX_PDP2_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 2));
866         regs[CTX_PDP1_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 1));
867         regs[CTX_PDP1_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 1));
868         regs[CTX_PDP0_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 0));
869         regs[CTX_PDP0_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 0));
870
871         if (engine->class == RENDER_CLASS) {
872                 regs[CTX_RCS_INDIRECT_CTX] =
873                         i915_mmio_reg_offset(RING_INDIRECT_CTX(base));
874                 regs[CTX_RCS_INDIRECT_CTX_OFFSET] =
875                         i915_mmio_reg_offset(RING_INDIRECT_CTX_OFFSET(base));
876                 regs[CTX_BB_PER_CTX_PTR] =
877                         i915_mmio_reg_offset(RING_BB_PER_CTX_PTR(base));
878
879                 regs[CTX_R_PWR_CLK_STATE] =
880                         i915_mmio_reg_offset(GEN8_R_PWR_CLK_STATE);
881         }
882 }
883
884 static bool virtual_matches(const struct virtual_engine *ve,
885                             const struct i915_request *rq,
886                             const struct intel_engine_cs *engine)
887 {
888         const struct intel_engine_cs *inflight;
889
890         if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
891                 return false;
892
893         /*
894          * We track when the HW has completed saving the context image
895          * (i.e. when we have seen the final CS event switching out of
896          * the context) and must not overwrite the context image before
897          * then. This restricts us to only using the active engine
898          * while the previous virtualized request is inflight (so
899          * we reuse the register offsets). This is a very small
900          * hystersis on the greedy seelction algorithm.
901          */
902         inflight = intel_context_inflight(&ve->context);
903         if (inflight && inflight != engine)
904                 return false;
905
906         return true;
907 }
908
909 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
910                                      struct intel_engine_cs *engine)
911 {
912         struct intel_engine_cs *old = ve->siblings[0];
913
914         /* All unattached (rq->engine == old) must already be completed */
915
916         spin_lock(&old->breadcrumbs.irq_lock);
917         if (!list_empty(&ve->context.signal_link)) {
918                 list_move_tail(&ve->context.signal_link,
919                                &engine->breadcrumbs.signalers);
920                 intel_engine_queue_breadcrumbs(engine);
921         }
922         spin_unlock(&old->breadcrumbs.irq_lock);
923 }
924
925 static struct i915_request *
926 last_active(const struct intel_engine_execlists *execlists)
927 {
928         struct i915_request * const *last = READ_ONCE(execlists->active);
929
930         while (*last && i915_request_completed(*last))
931                 last++;
932
933         return *last;
934 }
935
/*
 * Walk the dependencies linked on a request's sched.waiters_list using the
 * lockless list iterator; the first argument is the cursor.
 */
#define for_each_waiter(dep__, signaler__) \
        list_for_each_entry_lockless(dep__, \
                                     &(signaler__)->sched.waiters_list, \
                                     wait_link)
940
941 static void defer_request(struct i915_request *rq, struct list_head * const pl)
942 {
943         LIST_HEAD(list);
944
945         /*
946          * We want to move the interrupted request to the back of
947          * the round-robin list (i.e. its priority level), but
948          * in doing so, we must then move all requests that were in
949          * flight and were waiting for the interrupted request to
950          * be run after it again.
951          */
952         do {
953                 struct i915_dependency *p;
954
955                 GEM_BUG_ON(i915_request_is_active(rq));
956                 list_move_tail(&rq->sched.link, pl);
957
958                 for_each_waiter(p, rq) {
959                         struct i915_request *w =
960                                 container_of(p->waiter, typeof(*w), sched);
961
962                         /* Leave semaphores spinning on the other engines */
963                         if (w->engine != rq->engine)
964                                 continue;
965
966                         /* No waiter should start before its signaler */
967                         GEM_BUG_ON(i915_request_started(w) &&
968                                    !i915_request_completed(rq));
969
970                         GEM_BUG_ON(i915_request_is_active(w));
971                         if (list_empty(&w->sched.link))
972                                 continue; /* Not yet submitted; unready */
973
974                         if (rq_prio(w) < rq_prio(rq))
975                                 continue;
976
977                         GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
978                         list_move_tail(&w->sched.link, &list);
979                 }
980
981                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
982         } while (rq);
983 }
984
/*
 * Unwind the engine's incomplete requests and, if that yields a request,
 * defer it to the back of the priority list for its priority level.
 */
static void defer_active(struct intel_engine_cs *engine)
{
        struct i915_request *rq = __unwind_incomplete_requests(engine);

        if (rq)
                defer_request(rq,
                              i915_sched_lookup_priolist(engine, rq_prio(rq)));
}
995
996 static bool
997 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
998 {
999         int hint;
1000
1001         if (!intel_engine_has_semaphores(engine))
1002                 return false;
1003
1004         if (list_is_last(&rq->sched.link, &engine->active.requests))
1005                 return false;
1006
1007         hint = max(rq_prio(list_next_entry(rq, sched.link)),
1008                    engine->execlists.queue_priority_hint);
1009
1010         return hint >= effective_prio(rq);
1011 }
1012
1013 static int
1014 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1015 {
1016         if (list_is_last(&rq->sched.link, &engine->active.requests))
1017                 return INT_MIN;
1018
1019         return rq_prio(list_next_entry(rq, sched.link));
1020 }
1021
1022 static bool
1023 enable_timeslice(const struct intel_engine_execlists *execlists)
1024 {
1025         const struct i915_request *rq = *execlists->active;
1026
1027         if (i915_request_completed(rq))
1028                 return false;
1029
1030         return execlists->switch_priority_hint >= effective_prio(rq);
1031 }
1032
1033 static void record_preemption(struct intel_engine_execlists *execlists)
1034 {
1035         (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1036 }
1037
/*
 * execlists_dequeue - choose the next requests for @engine and submit them.
 *
 * Fills execlists->pending[] from two sources, in this order: the rbtree of
 * virtual engine requests (execlists->virtual) and then the engine's own
 * priority queue (execlists->queue). Before doing so it may unwind the
 * currently active requests, either for preemption or for timeslicing.
 * If at least one request was submitted, the ports are written to the
 * hardware via execlists_submit_ports(); otherwise the ring is unpaused.
 *
 * NOTE(review): the locking expected of the caller is not visible in this
 * function -- confirm against the call sites.
 */
1038 static void execlists_dequeue(struct intel_engine_cs *engine)
1039 {
1040         struct intel_engine_execlists * const execlists = &engine->execlists;
1041         struct i915_request **port = execlists->pending;
1042         struct i915_request ** const last_port = port + execlists->port_mask;
1043         struct i915_request *last;
1044         struct rb_node *rb;
1045         bool submit = false;
1046
1047         /*
1048          * Hardware submission is through 2 ports. Conceptually each port
1049          * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1050          * static for a context, and unique to each, so we only execute
1051          * requests belonging to a single context from each ring. RING_HEAD
1052          * is maintained by the CS in the context image, it marks the place
1053          * where it got up to last time, and through RING_TAIL we tell the CS
1054          * where we want to execute up to this time.
1055          *
1056          * In this list the requests are in order of execution. Consecutive
1057          * requests from the same context are adjacent in the ringbuffer. We
1058          * can combine these requests into a single RING_TAIL update:
1059          *
1060          *              RING_HEAD...req1...req2
1061          *                                    ^- RING_TAIL
1062          * since to execute req2 the CS must first execute req1.
1063          *
1064          * Our goal then is to point each port to the end of a consecutive
1065          * sequence of requests as being the most optimal (fewest wake ups
1066          * and context switches) submission.
1067          */
1068
             /*
              * Find the first virtual engine node whose request this engine
              * may run, erasing nodes whose request has already gone. rb is
              * left NULL if there is no such node.
              */
1069         for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1070                 struct virtual_engine *ve =
1071                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1072                 struct i915_request *rq = READ_ONCE(ve->request);
1073
1074                 if (!rq) { /* lazily cleanup after another engine handled rq */
1075                         rb_erase_cached(rb, &execlists->virtual);
1076                         RB_CLEAR_NODE(rb);
1077                         rb = rb_first_cached(&execlists->virtual);
1078                         continue;
1079                 }
1080
1081                 if (!virtual_matches(ve, rq, engine)) {
1082                         rb = rb_next(rb);
1083                         continue;
1084                 }
1085
1086                 break;
1087         }
1088
1089         /*
1090          * If the queue is higher priority than the last
1091          * request in the currently active context, submit afresh.
1092          * We will resubmit again afterwards in case we need to split
1093          * the active context to interject the preemption request,
1094          * i.e. we will retrigger preemption following the ack in case
1095          * of trouble.
1096          */
1097         last = last_active(execlists);
1098         if (last) {
1099                 if (need_preempt(engine, last, rb)) {
1100                         GEM_TRACE("%s: preempting last=%llx:%lld, prio=%d, hint=%d\n",
1101                                   engine->name,
1102                                   last->fence.context,
1103                                   last->fence.seqno,
1104                                   last->sched.attr.priority,
1105                                   execlists->queue_priority_hint);
1106                         record_preemption(execlists);
1107
1108                         /*
1109                          * Don't let the RING_HEAD advance past the breadcrumb
1110                          * as we unwind (and until we resubmit) so that we do
1111                          * not accidentally tell it to go backwards.
1112                          */
1113                         ring_set_paused(engine, 1);
1114
1115                         /*
1116                          * Note that we have not stopped the GPU at this point,
1117                          * so we are unwinding the incomplete requests as they
1118                          * remain inflight and so by the time we do complete
1119                          * the preemption, some of the unwound requests may
1120                          * complete!
1121                          */
1122                         __unwind_incomplete_requests(engine);
1123
1124                         last = NULL;
1125                 } else if (need_timeslice(engine, last) &&
1126                            !timer_pending(&engine->execlists.timer)) {
1127                         GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n",
1128                                   engine->name,
1129                                   last->fence.context,
1130                                   last->fence.seqno,
1131                                   last->sched.attr.priority,
1132                                   execlists->queue_priority_hint);
1133
1134                         ring_set_paused(engine, 1);
1135                         defer_active(engine);
1136
1137                         /*
1138                          * Unlike for preemption, if we rewind and continue
1139                          * executing the same context as previously active,
1140                          * the order of execution will remain the same and
1141                          * the tail will only advance. We do not need to
1142                          * force a full context restore, as a lite-restore
1143                          * is sufficient to resample the monotonic TAIL.
1144                          *
1145                          * If we switch to any other context, similarly we
1146                          * will not rewind TAIL of current context, and
1147                          * normal save/restore will preserve state and allow
1148                          * us to later continue executing the same request.
1149                          */
1150                         last = NULL;
1151                 } else {
1152                         /*
1153                          * Otherwise if we already have a request pending
1154                          * for execution after the current one, we can
1155                          * just wait until the next CS event before
1156                          * queuing more. In either case we will force a
1157                          * lite-restore preemption event, but if we wait
1158                          * we hopefully coalesce several updates into a single
1159                          * submission.
1160                          */
1161                         if (!list_is_last(&last->sched.link,
1162                                           &engine->active.requests))
1163                                 return;
1164                 }
1165         }
1166
1167         while (rb) { /* XXX virtual is always taking precedence */
1168                 struct virtual_engine *ve =
1169                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1170                 struct i915_request *rq;
1171
1172                 spin_lock(&ve->base.active.lock);
1173
1174                 rq = ve->request;
1175                 if (unlikely(!rq)) { /* lost the race to a sibling */
1176                         spin_unlock(&ve->base.active.lock);
1177                         rb_erase_cached(rb, &execlists->virtual);
1178                         RB_CLEAR_NODE(rb);
1179                         rb = rb_first_cached(&execlists->virtual);
1180                         continue;
1181                 }
1182
1183                 GEM_BUG_ON(rq != ve->request);
1184                 GEM_BUG_ON(rq->engine != &ve->base);
1185                 GEM_BUG_ON(rq->hw_context != &ve->context);
1186
                     /*
                      * Only take the virtual request if its priority is at
                      * least queue_prio(execlists). NOTE(review): queue_prio()
                      * is defined outside this view; presumably it reports the
                      * priority at the head of execlists->queue -- confirm.
                      */
1187                 if (rq_prio(rq) >= queue_prio(execlists)) {
1188                         if (!virtual_matches(ve, rq, engine)) {
1189                                 spin_unlock(&ve->base.active.lock);
1190                                 rb = rb_next(rb);
1191                                 continue;
1192                         }
1193
1194                         if (last && !can_merge_rq(last, rq)) {
1195                                 spin_unlock(&ve->base.active.lock);
1196                                 return; /* leave this for another */
1197                         }
1198
1199                         GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n",
1200                                   engine->name,
1201                                   rq->fence.context,
1202                                   rq->fence.seqno,
1203                                   i915_request_completed(rq) ? "!" :
1204                                   i915_request_started(rq) ? "*" :
1205                                   "",
1206                                   yesno(engine != ve->siblings[0]));
1207
                             /* Detach the request from the virtual engine and bind it to us */
1208                         ve->request = NULL;
1209                         ve->base.execlists.queue_priority_hint = INT_MIN;
1210                         rb_erase_cached(rb, &execlists->virtual);
1211                         RB_CLEAR_NODE(rb);
1212
1213                         GEM_BUG_ON(!(rq->execution_mask & engine->mask));
1214                         rq->engine = engine;
1215
1216                         if (engine != ve->siblings[0]) {
1217                                 u32 *regs = ve->context.lrc_reg_state;
1218                                 unsigned int n;
1219
1220                                 GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1221                                 virtual_update_register_offsets(regs, engine);
1222
1223                                 if (!list_empty(&ve->context.signals))
1224                                         virtual_xfer_breadcrumbs(ve, engine);
1225
1226                                 /*
1227                                  * Move the bound engine to the top of the list
1228                                  * for future execution. We then kick this
1229                                  * tasklet first before checking others, so that
1230                                  * we preferentially reuse this set of bound
1231                                  * registers.
1232                                  */
1233                                 for (n = 1; n < ve->num_siblings; n++) {
1234                                         if (ve->siblings[n] == engine) {
1235                                                 swap(ve->siblings[n],
1236                                                      ve->siblings[0]);
1237                                                 break;
1238                                         }
1239                                 }
1240
1241                                 GEM_BUG_ON(ve->siblings[0] != engine);
1242                         }
1243
1244                         if (__i915_request_submit(rq)) {
1245                                 submit = true;
1246                                 last = rq;
1247                         }
                             /* NOTE(review): presumably drops the reference that was held through ve->request -- confirm */
1248                         i915_request_put(rq);
1249
1250                         /*
1251                          * Hmm, we have a bunch of virtual engine requests,
1252                          * but the first one was already completed (thanks
1253                          * preempt-to-busy!). Keep looking at the veng queue
1254                          * until we have no more relevant requests (i.e.
1255                          * the normal submit queue has higher priority).
1256                          */
1257                         if (!submit) {
1258                                 spin_unlock(&ve->base.active.lock);
1259                                 rb = rb_first_cached(&execlists->virtual);
1260                                 continue;
1261                         }
1262                 }
1263
1264                 spin_unlock(&ve->base.active.lock);
1265                 break;
1266         }
1267
             /* Now drain the engine's own priority queue, one priolist at a time */
1268         while ((rb = rb_first_cached(&execlists->queue))) {
1269                 struct i915_priolist *p = to_priolist(rb);
1270                 struct i915_request *rq, *rn;
1271                 int i;
1272
1273                 priolist_for_each_request_consume(rq, rn, p, i) {
1274                         bool merge = true;
1275
1276                         /*
1277                          * Can we combine this request with the current port?
1278                          * It has to be the same context/ringbuffer and not
1279                          * have any exceptions (e.g. GVT saying never to
1280                          * combine contexts).
1281                          *
1282                          * If we can combine the requests, we can execute both
1283                          * by updating the RING_TAIL to point to the end of the
1284                          * second request, and so we never need to tell the
1285                          * hardware about the first.
1286                          */
1287                         if (last && !can_merge_rq(last, rq)) {
1288                                 /*
1289                                  * If we are on the second port and cannot
1290                                  * combine this request with the last, then we
1291                                  * are done.
1292                                  */
1293                                 if (port == last_port)
1294                                         goto done;
1295
1296                                 /*
1297                                  * We must not populate both ELSP[] with the
1298                                  * same LRCA, i.e. we must submit 2 different
1299                                  * contexts if we submit 2 ELSP.
1300                                  */
1301                                 if (last->hw_context == rq->hw_context)
1302                                         goto done;
1303
1304                                 /*
1305                                  * If GVT overrides us we only ever submit
1306                                  * port[0], leaving port[1] empty. Note that we
1307                                  * also have to be careful that we don't queue
1308                                  * the same context (even though a different
1309                                  * request) to the second port.
1310                                  */
1311                                 if (ctx_single_port_submission(last->hw_context) ||
1312                                     ctx_single_port_submission(rq->hw_context))
1313                                         goto done;
1314
1315                                 merge = false;
1316                         }
1317
1318                         if (__i915_request_submit(rq)) {
                                     /* Cannot merge: close the current port with 'last' and move to the next one */
1319                                 if (!merge) {
1320                                         *port = execlists_schedule_in(last, port - execlists->pending);
1321                                         port++;
1322                                         last = NULL;
1323                                 }
1324
1325                                 GEM_BUG_ON(last &&
1326                                            !can_merge_ctx(last->hw_context,
1327                                                           rq->hw_context));
1328
1329                                 submit = true;
1330                                 last = rq;
1331                         }
1332                 }
1333
                     /* Every request of this priolist has been consumed; drop the node */
1334                 rb_erase_cached(&p->node, &execlists->queue);
1335                 i915_priolist_free(p);
1336         }
1337
1338 done:
1339         /*
1340          * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
1341          *
1342          * We choose the priority hint such that if we add a request of greater
1343          * priority than this, we kick the submission tasklet to decide on
1344          * the right order of submitting the requests to hardware. We must
1345          * also be prepared to reorder requests as they are in-flight on the
1346          * HW. We derive the priority hint then as the first "hole" in
1347          * the HW submission ports and if there are no available slots,
1348          * the priority of the lowest executing request, i.e. last.
1349          *
1350          * When we do receive a higher priority request ready to run from the
1351          * user, see queue_request(), the priority hint is bumped to that
1352          * request triggering preemption on the next dequeue (or subsequent
1353          * interrupt for secondary ports).
1354          */
1355         execlists->queue_priority_hint = queue_prio(execlists);
1356         GEM_TRACE("%s: queue_priority_hint:%d, submit:%s\n",
1357                   engine->name, execlists->queue_priority_hint,
1358                   yesno(submit));
1359
1360         if (submit) {
                     /* Fill the final port with 'last', then zero the pending slots after it up to last_port */
1361                 *port = execlists_schedule_in(last, port - execlists->pending);
1362                 memset(port + 1, 0, (last_port - port) * sizeof(*port));
1363                 execlists->switch_priority_hint =
1364                         switch_prio(engine, *execlists->pending);
1365                 execlists_submit_ports(engine);
1366         } else {
                     /* Nothing new to submit: clear any pause set by the preempt/timeslice paths above */
1367                 ring_set_paused(engine, 0);
1368         }
1369 }
1370
1371 static void
1372 cancel_port_requests(struct intel_engine_execlists * const execlists)
1373 {
1374         struct i915_request * const *port, *rq;
1375
1376         for (port = execlists->pending; (rq = *port); port++)
1377                 execlists_schedule_out(rq);
1378         memset(execlists->pending, 0, sizeof(execlists->pending));
1379
1380         for (port = execlists->active; (rq = *port); port++)
1381                 execlists_schedule_out(rq);
1382         execlists->active =
1383                 memset(execlists->inflight, 0, sizeof(execlists->inflight));
1384 }
1385
1386 static inline void
1387 invalidate_csb_entries(const u32 *first, const u32 *last)
1388 {
1389         clflush((void *)first);
1390         clflush((void *)last);
1391 }
1392
1393 static inline bool
1394 reset_in_progress(const struct intel_engine_execlists *execlists)
1395 {
1396         return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1397 }
1398
/* Action derived from a single context status buffer (CSB) entry. */
enum csb_step {
        CSB_NOP,        /* nothing to do for this event */
        CSB_PROMOTE,    /* a new context was switched in */
        CSB_PREEMPT,    /* the running context was switched away for a new queue */
        CSB_COMPLETE,   /* the active context finished */
};
1405
1406 /*
1407  * Starting with Gen12, the status has a new format:
1408  *
1409  *     bit  0:     switched to new queue
1410  *     bit  1:     reserved
1411  *     bit  2:     semaphore wait mode (poll or signal), only valid when
1412  *                 switch detail is set to "wait on semaphore"
1413  *     bits 3-5:   engine class
1414  *     bits 6-11:  engine instance
1415  *     bits 12-14: reserved
1416  *     bits 15-25: sw context id of the lrc the GT switched to
1417  *     bits 26-31: sw counter of the lrc the GT switched to
1418  *     bits 32-35: context switch detail
1419  *                  - 0: ctx complete
1420  *                  - 1: wait on sync flip
1421  *                  - 2: wait on vblank
1422  *                  - 3: wait on scanline
1423  *                  - 4: wait on semaphore
1424  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
1425  *                       WAIT_FOR_EVENT)
1426  *     bit  36:    reserved
1427  *     bits 37-43: wait detail (for switch detail 1 to 4)
1428  *     bits 44-46: reserved
1429  *     bits 47-57: sw context id of the lrc the GT switched away from
1430  *     bits 58-63: sw counter of the lrc the GT switched away from
1431  */
1432 static inline enum csb_step
1433 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
1434 {
1435         u32 lower_dw = csb[0];
1436         u32 upper_dw = csb[1];
1437         bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
1438         bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
1439         bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
1440
1441         if (!ctx_away_valid && ctx_to_valid)
1442                 return CSB_PROMOTE;
1443
1444         /*
1445          * The context switch detail is not guaranteed to be 5 when a preemption
1446          * occurs, so we can't just check for that. The check below works for
1447          * all the cases we care about, including preemptions of WAIT
1448          * instructions and lite-restore. Preempt-to-idle via the CTRL register
1449          * would require some extra handling, but we don't support that.
1450          */
1451         if (new_queue && ctx_away_valid)
1452                 return CSB_PREEMPT;
1453
1454         /*
1455          * switch detail = 5 is covered by the case above and we do not expect a
1456          * context switch on an unsuccessful wait instruction since we always
1457          * use polling mode.
1458          */
1459         GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
1460
1461         if (*execlists->active) {
1462                 GEM_BUG_ON(!ctx_away_valid);
1463                 return CSB_COMPLETE;
1464         }
1465
1466         return CSB_NOP;
1467 }
1468
1469 static inline enum csb_step
1470 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
1471 {
1472         unsigned int status = *csb;
1473
1474         if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
1475                 return CSB_PROMOTE;
1476
1477         if (status & GEN8_CTX_STATUS_PREEMPTED)
1478                 return CSB_PREEMPT;
1479
1480         if (*execlists->active)
1481                 return CSB_COMPLETE;
1482
1483         return CSB_NOP;
1484 }
1485
1486 static void process_csb(struct intel_engine_cs *engine)
1487 {
1488         struct intel_engine_execlists * const execlists = &engine->execlists;
1489         const u32 * const buf = execlists->csb_status;
1490         const u8 num_entries = execlists->csb_size;
1491         u8 head, tail;
1492
1493         GEM_BUG_ON(USES_GUC_SUBMISSION(engine->i915));
1494
1495         /*
1496          * Note that csb_write, csb_status may be either in HWSP or mmio.
1497          * When reading from the csb_write mmio register, we have to be
1498          * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
1499          * the low 4bits. As it happens we know the next 4bits are always
1500          * zero and so we can simply masked off the low u8 of the register
1501          * and treat it identically to reading from the HWSP (without having
1502          * to use explicit shifting and masking, and probably bifurcating
1503          * the code to handle the legacy mmio read).
1504          */
1505         head = execlists->csb_head;
1506         tail = READ_ONCE(*execlists->csb_write);
1507         GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
1508         if (unlikely(head == tail))
1509                 return;
1510
1511         /*
1512          * Hopefully paired with a wmb() in HW!
1513          *
1514          * We must complete the read of the write pointer before any reads
1515          * from the CSB, so that we do not see stale values. Without an rmb
1516          * (lfence) the HW may speculatively perform the CSB[] reads *before*
1517          * we perform the READ_ONCE(*csb_write).
1518          */
1519         rmb();
1520
1521         do {
1522                 enum csb_step csb_step;
1523
1524                 if (++head == num_entries)
1525                         head = 0;
1526
1527                 /*
1528                  * We are flying near dragons again.
1529                  *
1530                  * We hold a reference to the request in execlist_port[]
1531                  * but no more than that. We are operating in softirq
1532                  * context and so cannot hold any mutex or sleep. That
1533                  * prevents us stopping the requests we are processing
1534                  * in port[] from being retired simultaneously (the
1535                  * breadcrumb will be complete before we see the
1536                  * context-switch). As we only hold the reference to the
1537                  * request, any pointer chasing underneath the request
1538                  * is subject to a potential use-after-free. Thus we
1539                  * store all of the bookkeeping within port[] as
1540                  * required, and avoid using unguarded pointers beneath
1541                  * request itself. The same applies to the atomic
1542                  * status notifier.
1543                  */
1544
1545                 GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x\n",
1546                           engine->name, head,
1547                           buf[2 * head + 0], buf[2 * head + 1]);
1548
1549                 if (INTEL_GEN(engine->i915) >= 12)
1550                         csb_step = gen12_csb_parse(execlists, buf + 2 * head);
1551                 else
1552                         csb_step = gen8_csb_parse(execlists, buf + 2 * head);
1553
1554                 switch (csb_step) {
1555                 case CSB_PREEMPT: /* cancel old inflight, prepare for switch */
1556                         trace_ports(execlists, "preempted", execlists->active);
1557
1558                         while (*execlists->active)
1559                                 execlists_schedule_out(*execlists->active++);
1560
1561                         /* fallthrough */
1562                 case CSB_PROMOTE: /* switch pending to inflight */
1563                         GEM_BUG_ON(*execlists->active);
1564                         GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
1565                         execlists->active =
1566                                 memcpy(execlists->inflight,
1567                                        execlists->pending,
1568                                        execlists_num_ports(execlists) *
1569                                        sizeof(*execlists->pending));
1570
1571                         if (enable_timeslice(execlists))
1572                                 mod_timer(&execlists->timer, jiffies + 1);
1573
1574                         if (!inject_preempt_hang(execlists))
1575                                 ring_set_paused(engine, 0);
1576
1577                         WRITE_ONCE(execlists->pending[0], NULL);
1578                         break;
1579
1580                 case CSB_COMPLETE: /* port0 completed, advanced to port1 */
1581                         trace_ports(execlists, "completed", execlists->active);
1582
1583                         /*
1584                          * We rely on the hardware being strongly
1585                          * ordered, that the breadcrumb write is
1586                          * coherent (visible from the CPU) before the
1587                          * user interrupt and CSB is processed.
1588                          */
1589                         GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
1590                                    !reset_in_progress(execlists));
1591                         execlists_schedule_out(*execlists->active++);
1592
1593                         GEM_BUG_ON(execlists->active - execlists->inflight >
1594                                    execlists_num_ports(execlists));
1595                         break;
1596
1597                 case CSB_NOP:
1598                         break;
1599                 }
1600         } while (head != tail);
1601
1602         execlists->csb_head = head;
1603
1604         /*
1605          * Gen11 has proven to fail wrt global observation point between
1606          * entry and tail update, failing on the ordering and thus
1607          * we see an old entry in the context status buffer.
1608          *
1609          * Forcibly evict out entries for the next gpu csb update,
1610          * to increase the odds that we get a fresh entries with non
1611          * working hardware. The cost for doing so comes out mostly with
1612          * the wash as hardware, working or not, will need to do the
1613          * invalidation before.
1614          */
1615         invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
1616 }
1617
1618 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
1619 {
1620         lockdep_assert_held(&engine->active.lock);
1621         if (!engine->execlists.pending[0]) {
1622                 rcu_read_lock(); /* protect peeking at execlists->active */
1623                 execlists_dequeue(engine);
1624                 rcu_read_unlock();
1625         }
1626 }
1627
1628 /*
1629  * Check the unread Context Status Buffers and manage the submission of new
1630  * contexts to the ELSP accordingly.
1631  */
1632 static void execlists_submission_tasklet(unsigned long data)
1633 {
1634         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
1635         unsigned long flags;
1636
1637         process_csb(engine);
1638         if (!READ_ONCE(engine->execlists.pending[0])) {
1639                 spin_lock_irqsave(&engine->active.lock, flags);
1640                 __execlists_submission_tasklet(engine);
1641                 spin_unlock_irqrestore(&engine->active.lock, flags);
1642         }
1643 }
1644
1645 static void execlists_submission_timer(struct timer_list *timer)
1646 {
1647         struct intel_engine_cs *engine =
1648                 from_timer(engine, timer, execlists.timer);
1649
1650         /* Kick the tasklet for some interrupt coalescing and reset handling */
1651         tasklet_hi_schedule(&engine->execlists.tasklet);
1652 }
1653
1654 static void queue_request(struct intel_engine_cs *engine,
1655                           struct i915_sched_node *node,
1656                           int prio)
1657 {
1658         GEM_BUG_ON(!list_empty(&node->link));
1659         list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio));
1660 }
1661
1662 static void __submit_queue_imm(struct intel_engine_cs *engine)
1663 {
1664         struct intel_engine_execlists * const execlists = &engine->execlists;
1665
1666         if (reset_in_progress(execlists))
1667                 return; /* defer until we restart the engine following reset */
1668
1669         if (execlists->tasklet.func == execlists_submission_tasklet)
1670                 __execlists_submission_tasklet(engine);
1671         else
1672                 tasklet_hi_schedule(&execlists->tasklet);
1673 }
1674
1675 static void submit_queue(struct intel_engine_cs *engine,
1676                          const struct i915_request *rq)
1677 {
1678         struct intel_engine_execlists *execlists = &engine->execlists;
1679
1680         if (rq_prio(rq) <= execlists->queue_priority_hint)
1681                 return;
1682
1683         execlists->queue_priority_hint = rq_prio(rq);
1684         __submit_queue_imm(engine);
1685 }
1686
1687 static void execlists_submit_request(struct i915_request *request)
1688 {
1689         struct intel_engine_cs *engine = request->engine;
1690         unsigned long flags;
1691
1692         /* Will be called from irq-context when using foreign fences. */
1693         spin_lock_irqsave(&engine->active.lock, flags);
1694
1695         queue_request(engine, &request->sched, rq_prio(request));
1696
1697         GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1698         GEM_BUG_ON(list_empty(&request->sched.link));
1699
1700         submit_queue(engine, request);
1701
1702         spin_unlock_irqrestore(&engine->active.lock, flags);
1703 }
1704
1705 static void __execlists_context_fini(struct intel_context *ce)
1706 {
1707         intel_ring_put(ce->ring);
1708         i915_vma_put(ce->state);
1709 }
1710
1711 static void execlists_context_destroy(struct kref *kref)
1712 {
1713         struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1714
1715         GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1716         GEM_BUG_ON(intel_context_is_pinned(ce));
1717
1718         if (ce->state)
1719                 __execlists_context_fini(ce);
1720
1721         intel_context_fini(ce);
1722         intel_context_free(ce);
1723 }
1724
1725 static void
1726 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
1727 {
1728         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1729                 return;
1730
1731         vaddr += LRC_HEADER_PAGES * PAGE_SIZE;
1732         vaddr += engine->context_size;
1733
1734         memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE);
1735 }
1736
1737 static void
1738 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
1739 {
1740         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1741                 return;
1742
1743         vaddr += LRC_HEADER_PAGES * PAGE_SIZE;
1744         vaddr += engine->context_size;
1745
1746         if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE))
1747                 dev_err_once(engine->i915->drm.dev,
1748                              "%s context redzone overwritten!\n",
1749                              engine->name);
1750 }
1751
1752 static void execlists_context_unpin(struct intel_context *ce)
1753 {
1754         check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
1755                       ce->engine);
1756
1757         i915_gem_context_unpin_hw_id(ce->gem_context);
1758         i915_gem_object_unpin_map(ce->state->obj);
1759         intel_ring_reset(ce->ring, ce->ring->tail);
1760 }
1761
1762 static void
1763 __execlists_update_reg_state(struct intel_context *ce,
1764                              struct intel_engine_cs *engine)
1765 {
1766         struct intel_ring *ring = ce->ring;
1767         u32 *regs = ce->lrc_reg_state;
1768
1769         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
1770         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1771
1772         regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(ring->vma);
1773         regs[CTX_RING_HEAD + 1] = ring->head;
1774         regs[CTX_RING_TAIL + 1] = ring->tail;
1775
1776         /* RPCS */
1777         if (engine->class == RENDER_CLASS) {
1778                 regs[CTX_R_PWR_CLK_STATE + 1] =
1779                         intel_sseu_make_rpcs(engine->i915, &ce->sseu);
1780
1781                 i915_oa_init_reg_state(engine, ce, regs);
1782         }
1783 }
1784
1785 static int
1786 __execlists_context_pin(struct intel_context *ce,
1787                         struct intel_engine_cs *engine)
1788 {
1789         void *vaddr;
1790         int ret;
1791
1792         GEM_BUG_ON(!ce->state);
1793
1794         ret = intel_context_active_acquire(ce);
1795         if (ret)
1796                 goto err;
1797         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1798
1799         vaddr = i915_gem_object_pin_map(ce->state->obj,
1800                                         i915_coherent_map_type(engine->i915) |
1801                                         I915_MAP_OVERRIDE);
1802         if (IS_ERR(vaddr)) {
1803                 ret = PTR_ERR(vaddr);
1804                 goto unpin_active;
1805         }
1806
1807         ret = i915_gem_context_pin_hw_id(ce->gem_context);
1808         if (ret)
1809                 goto unpin_map;
1810
1811         ce->lrc_desc = lrc_descriptor(ce, engine);
1812         ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
1813         __execlists_update_reg_state(ce, engine);
1814
1815         return 0;
1816
1817 unpin_map:
1818         i915_gem_object_unpin_map(ce->state->obj);
1819 unpin_active:
1820         intel_context_active_release(ce);
1821 err:
1822         return ret;
1823 }
1824
1825 static int execlists_context_pin(struct intel_context *ce)
1826 {
1827         return __execlists_context_pin(ce, ce->engine);
1828 }
1829
1830 static int execlists_context_alloc(struct intel_context *ce)
1831 {
1832         return __execlists_context_alloc(ce, ce->engine);
1833 }
1834
1835 static void execlists_context_reset(struct intel_context *ce)
1836 {
1837         /*
1838          * Because we emit WA_TAIL_DWORDS there may be a disparity
1839          * between our bookkeeping in ce->ring->head and ce->ring->tail and
1840          * that stored in context. As we only write new commands from
1841          * ce->ring->tail onwards, everything before that is junk. If the GPU
1842          * starts reading from its RING_HEAD from the context, it may try to
1843          * execute that junk and die.
1844          *
1845          * The contexts that are stilled pinned on resume belong to the
1846          * kernel, and are local to each engine. All other contexts will
1847          * have their head/tail sanitized upon pinning before use, so they
1848          * will never see garbage,
1849          *
1850          * So to avoid that we reset the context images upon resume. For
1851          * simplicity, we just zero everything out.
1852          */
1853         intel_ring_reset(ce->ring, 0);
1854         __execlists_update_reg_state(ce, ce->engine);
1855 }
1856
1857 static const struct intel_context_ops execlists_context_ops = {
1858         .alloc = execlists_context_alloc,
1859
1860         .pin = execlists_context_pin,
1861         .unpin = execlists_context_unpin,
1862
1863         .enter = intel_context_enter_engine,
1864         .exit = intel_context_exit_engine,
1865
1866         .reset = execlists_context_reset,
1867         .destroy = execlists_context_destroy,
1868 };
1869
1870 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
1871 {
1872         u32 *cs;
1873
1874         GEM_BUG_ON(!rq->timeline->has_initial_breadcrumb);
1875
1876         cs = intel_ring_begin(rq, 6);
1877         if (IS_ERR(cs))
1878                 return PTR_ERR(cs);
1879
1880         /*
1881          * Check if we have been preempted before we even get started.
1882          *
1883          * After this point i915_request_started() reports true, even if
1884          * we get preempted and so are no longer running.
1885          */
1886         *cs++ = MI_ARB_CHECK;
1887         *cs++ = MI_NOOP;
1888
1889         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1890         *cs++ = rq->timeline->hwsp_offset;
1891         *cs++ = 0;
1892         *cs++ = rq->fence.seqno - 1;
1893
1894         intel_ring_advance(rq, cs);
1895
1896         /* Record the updated position of the request's payload */
1897         rq->infix = intel_ring_offset(rq, cs);
1898
1899         return 0;
1900 }
1901
1902 static int emit_pdps(struct i915_request *rq)
1903 {
1904         const struct intel_engine_cs * const engine = rq->engine;
1905         struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->hw_context->vm);
1906         int err, i;
1907         u32 *cs;
1908
1909         GEM_BUG_ON(intel_vgpu_active(rq->i915));
1910
1911         /*
1912          * Beware ye of the dragons, this sequence is magic!
1913          *
1914          * Small changes to this sequence can cause anything from
1915          * GPU hangs to forcewake errors and machine lockups!
1916          */
1917
1918         /* Flush any residual operations from the context load */
1919         err = engine->emit_flush(rq, EMIT_FLUSH);
1920         if (err)
1921                 return err;
1922
1923         /* Magic required to prevent forcewake errors! */
1924         err = engine->emit_flush(rq, EMIT_INVALIDATE);
1925         if (err)
1926                 return err;
1927
1928         cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
1929         if (IS_ERR(cs))
1930                 return PTR_ERR(cs);
1931
1932         /* Ensure the LRI have landed before we invalidate & continue */
1933         *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
1934         for (i = GEN8_3LVL_PDPES; i--; ) {
1935                 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
1936                 u32 base = engine->mmio_base;
1937
1938                 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
1939                 *cs++ = upper_32_bits(pd_daddr);
1940                 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
1941                 *cs++ = lower_32_bits(pd_daddr);
1942         }
1943         *cs++ = MI_NOOP;
1944
1945         intel_ring_advance(rq, cs);
1946
1947         /* Be doubly sure the LRI have landed before proceeding */
1948         err = engine->emit_flush(rq, EMIT_FLUSH);
1949         if (err)
1950                 return err;
1951
1952         /* Re-invalidate the TLB for luck */
1953         return engine->emit_flush(rq, EMIT_INVALIDATE);
1954 }
1955
1956 static int execlists_request_alloc(struct i915_request *request)
1957 {
1958         int ret;
1959
1960         GEM_BUG_ON(!intel_context_is_pinned(request->hw_context));
1961
1962         /*
1963          * Flush enough space to reduce the likelihood of waiting after
1964          * we start building the request - in which case we will just
1965          * have to repeat work.
1966          */
1967         request->reserved_space += EXECLISTS_REQUEST_SIZE;
1968
1969         /*
1970          * Note that after this point, we have committed to using
1971          * this request as it is being used to both track the
1972          * state of engine initialisation and liveness of the
1973          * golden renderstate above. Think twice before you try
1974          * to cancel/unwind this request now.
1975          */
1976
1977         /* Unconditionally invalidate GPU caches and TLBs. */
1978         if (i915_vm_is_4lvl(request->hw_context->vm))
1979                 ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
1980         else
1981                 ret = emit_pdps(request);
1982         if (ret)
1983                 return ret;
1984
1985         request->reserved_space -= EXECLISTS_REQUEST_SIZE;
1986         return 0;
1987 }
1988
1989 /*
1990  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1991  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1992  * but there is a slight complication as this is applied in WA batch where the
1993  * values are only initialized once so we cannot take register value at the
1994  * beginning and reuse it further; hence we save its value to memory, upload a
1995  * constant value with bit21 set and then we restore it back with the saved value.
1996  * To simplify the WA, a constant value is formed by using the default value
1997  * of this register. This shouldn't be a problem because we are only modifying
1998  * it for a short period and this batch in non-premptible. We can ofcourse
1999  * use additional instructions that read the actual value of the register
2000  * at that time and set our bit of interest but it makes the WA complicated.
2001  *
2002  * This WA is also required for Gen9 so extracting as a function avoids
2003  * code duplication.
2004  */
2005 static u32 *
2006 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
2007 {
2008         /* NB no one else is allowed to scribble over scratch + 256! */
2009         *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2010         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2011         *batch++ = intel_gt_scratch_offset(engine->gt,
2012                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2013         *batch++ = 0;
2014
2015         *batch++ = MI_LOAD_REGISTER_IMM(1);
2016         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2017         *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
2018
2019         batch = gen8_emit_pipe_control(batch,
2020                                        PIPE_CONTROL_CS_STALL |
2021                                        PIPE_CONTROL_DC_FLUSH_ENABLE,
2022                                        0);
2023
2024         *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2025         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2026         *batch++ = intel_gt_scratch_offset(engine->gt,
2027                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2028         *batch++ = 0;
2029
2030         return batch;
2031 }
2032
2033 static u32 slm_offset(struct intel_engine_cs *engine)
2034 {
2035         return intel_gt_scratch_offset(engine->gt,
2036                                        INTEL_GT_SCRATCH_FIELD_CLEAR_SLM_WA);
2037 }
2038
2039 /*
2040  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
2041  * initialized at the beginning and shared across all contexts but this field
2042  * helps us to have multiple batches at different offsets and select them based
2043  * on a criteria. At the moment this batch always start at the beginning of the page
2044  * and at this point we don't have multiple wa_ctx batch buffers.
2045  *
2046  * The number of WA applied are not known at the beginning; we use this field
2047  * to return the no of DWORDS written.
2048  *
2049  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
2050  * so it adds NOOPs as padding to make it cacheline aligned.
2051  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
2052  * makes a complete batch buffer.
2053  */
2054 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2055 {
2056         /* WaDisableCtxRestoreArbitration:bdw,chv */
2057         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2058
2059         /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
2060         if (IS_BROADWELL(engine->i915))
2061                 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2062
2063         /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
2064         /* Actual scratch location is at 128 bytes offset */
2065         batch = gen8_emit_pipe_control(batch,
2066                                        PIPE_CONTROL_FLUSH_L3 |
2067                                        PIPE_CONTROL_GLOBAL_GTT_IVB |
2068                                        PIPE_CONTROL_CS_STALL |
2069                                        PIPE_CONTROL_QW_WRITE,
2070                                        slm_offset(engine));
2071
2072         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2073
2074         /* Pad to end of cacheline */
2075         while ((unsigned long)batch % CACHELINE_BYTES)
2076                 *batch++ = MI_NOOP;
2077
2078         /*
2079          * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
2080          * execution depends on the length specified in terms of cache lines
2081          * in the register CTX_RCS_INDIRECT_CTX
2082          */
2083
2084         return batch;
2085 }
2086
2087 struct lri {
2088         i915_reg_t reg;
2089         u32 value;
2090 };
2091
2092 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
2093 {
2094         GEM_BUG_ON(!count || count > 63);
2095
2096         *batch++ = MI_LOAD_REGISTER_IMM(count);
2097         do {
2098                 *batch++ = i915_mmio_reg_offset(lri->reg);
2099                 *batch++ = lri->value;
2100         } while (lri++, --count);
2101         *batch++ = MI_NOOP;
2102
2103         return batch;
2104 }
2105
2106 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2107 {
2108         static const struct lri lri[] = {
2109                 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
2110                 {
2111                         COMMON_SLICE_CHICKEN2,
2112                         __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
2113                                        0),
2114                 },
2115
2116                 /* BSpec: 11391 */
2117                 {
2118                         FF_SLICE_CHICKEN,
2119                         __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
2120                                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
2121                 },
2122
2123                 /* BSpec: 11299 */
2124                 {
2125                         _3D_CHICKEN3,
2126                         __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
2127                                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
2128                 }
2129         };
2130
2131         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2132
2133         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
2134         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2135
2136         /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
2137         batch = gen8_emit_pipe_control(batch,
2138                                        PIPE_CONTROL_FLUSH_L3 |
2139                                        PIPE_CONTROL_GLOBAL_GTT_IVB |
2140                                        PIPE_CONTROL_CS_STALL |
2141                                        PIPE_CONTROL_QW_WRITE,
2142                                        slm_offset(engine));
2143
2144         batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
2145
2146         /* WaMediaPoolStateCmdInWABB:bxt,glk */
2147         if (HAS_POOLED_EU(engine->i915)) {
2148                 /*
2149                  * EU pool configuration is setup along with golden context
2150                  * during context initialization. This value depends on
2151                  * device type (2x6 or 3x6) and needs to be updated based
2152                  * on which subslice is disabled especially for 2x6
2153                  * devices, however it is safe to load default
2154                  * configuration of 3x6 device instead of masking off
2155                  * corresponding bits because HW ignores bits of a disabled
2156                  * subslice and drops down to appropriate config. Please
2157                  * see render_state_setup() in i915_gem_render_state.c for
2158                  * possible configurations, to avoid duplication they are
2159                  * not shown here again.
2160                  */
2161                 *batch++ = GEN9_MEDIA_POOL_STATE;
2162                 *batch++ = GEN9_MEDIA_POOL_ENABLE;
2163                 *batch++ = 0x00777000;
2164                 *batch++ = 0;
2165                 *batch++ = 0;
2166                 *batch++ = 0;
2167         }
2168
2169         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2170
2171         /* Pad to end of cacheline */
2172         while ((unsigned long)batch % CACHELINE_BYTES)
2173                 *batch++ = MI_NOOP;
2174
2175         return batch;
2176 }
2177
2178 static u32 *
2179 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2180 {
2181         int i;
2182
2183         /*
2184          * WaPipeControlBefore3DStateSamplePattern: cnl
2185          *
2186          * Ensure the engine is idle prior to programming a
2187          * 3DSTATE_SAMPLE_PATTERN during a context restore.
2188          */
2189         batch = gen8_emit_pipe_control(batch,
2190                                        PIPE_CONTROL_CS_STALL,
2191                                        0);
2192         /*
2193          * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
2194          * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
2195          * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
2196          * confusing. Since gen8_emit_pipe_control() already advances the
2197          * batch by 6 dwords, we advance the other 10 here, completing a
2198          * cacheline. It's not clear if the workaround requires this padding
2199          * before other commands, or if it's just the regular padding we would
2200          * already have for the workaround bb, so leave it here for now.
2201          */
2202         for (i = 0; i < 10; i++)
2203                 *batch++ = MI_NOOP;
2204
2205         /* Pad to end of cacheline */
2206         while ((unsigned long)batch % CACHELINE_BYTES)
2207                 *batch++ = MI_NOOP;
2208
2209         return batch;
2210 }
2211
2212 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
2213
2214 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
2215 {
2216         struct drm_i915_gem_object *obj;
2217         struct i915_vma *vma;
2218         int err;
2219
2220         obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
2221         if (IS_ERR(obj))
2222                 return PTR_ERR(obj);
2223
2224         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
2225         if (IS_ERR(vma)) {
2226                 err = PTR_ERR(vma);
2227                 goto err;
2228         }
2229
2230         err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
2231         if (err)
2232                 goto err;
2233
2234         engine->wa_ctx.vma = vma;
2235         return 0;
2236
2237 err:
2238         i915_gem_object_put(obj);
2239         return err;
2240 }
2241
2242 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
2243 {
2244         i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
2245 }
2246
2247 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
2248
2249 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
2250 {
2251         struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
2252         struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
2253                                             &wa_ctx->per_ctx };
2254         wa_bb_func_t wa_bb_fn[2];
2255         struct page *page;
2256         void *batch, *batch_ptr;
2257         unsigned int i;
2258         int ret;
2259
2260         if (engine->class != RENDER_CLASS)
2261                 return 0;
2262
2263         switch (INTEL_GEN(engine->i915)) {
2264         case 12:
2265         case 11:
2266                 return 0;
2267         case 10:
2268                 wa_bb_fn[0] = gen10_init_indirectctx_bb;
2269                 wa_bb_fn[1] = NULL;
2270                 break;
2271         case 9:
2272                 wa_bb_fn[0] = gen9_init_indirectctx_bb;
2273                 wa_bb_fn[1] = NULL;
2274                 break;
2275         case 8:
2276                 wa_bb_fn[0] = gen8_init_indirectctx_bb;
2277                 wa_bb_fn[1] = NULL;
2278                 break;
2279         default:
2280                 MISSING_CASE(INTEL_GEN(engine->i915));
2281                 return 0;
2282         }
2283
2284         ret = lrc_setup_wa_ctx(engine);
2285         if (ret) {
2286                 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
2287                 return ret;
2288         }
2289
2290         page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
2291         batch = batch_ptr = kmap_atomic(page);
2292
2293         /*
2294          * Emit the two workaround batch buffers, recording the offset from the
2295          * start of the workaround batch buffer object for each and their
2296          * respective sizes.
2297          */
2298         for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
2299                 wa_bb[i]->offset = batch_ptr - batch;
2300                 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
2301                                                   CACHELINE_BYTES))) {
2302                         ret = -EINVAL;
2303                         break;
2304                 }
2305                 if (wa_bb_fn[i])
2306                         batch_ptr = wa_bb_fn[i](engine, batch_ptr);
2307                 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
2308         }
2309
2310         BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
2311
2312         kunmap_atomic(batch);
2313         if (ret)
2314                 lrc_destroy_wa_ctx(engine);
2315
2316         return ret;
2317 }
2318
2319 static void enable_execlists(struct intel_engine_cs *engine)
2320 {
2321         u32 mode;
2322
2323         assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
2324
2325         intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
2326
2327         if (INTEL_GEN(engine->i915) >= 11)
2328                 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
2329         else
2330                 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
2331         ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
2332
2333         ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
2334
2335         ENGINE_WRITE_FW(engine,
2336                         RING_HWS_PGA,
2337                         i915_ggtt_offset(engine->status_page.vma));
2338         ENGINE_POSTING_READ(engine, RING_HWS_PGA);
2339 }
2340
2341 static bool unexpected_starting_state(struct intel_engine_cs *engine)
2342 {
2343         bool unexpected = false;
2344
2345         if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
2346                 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
2347                 unexpected = true;
2348         }
2349
2350         return unexpected;
2351 }
2352
2353 static int execlists_resume(struct intel_engine_cs *engine)
2354 {
2355         intel_engine_apply_workarounds(engine);
2356         intel_engine_apply_whitelist(engine);
2357
2358         intel_mocs_init_engine(engine);
2359
2360         intel_engine_reset_breadcrumbs(engine);
2361
2362         if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
2363                 struct drm_printer p = drm_debug_printer(__func__);
2364
2365                 intel_engine_dump(engine, &p, NULL);
2366         }
2367
2368         enable_execlists(engine);
2369
2370         return 0;
2371 }
2372
2373 static void execlists_reset_prepare(struct intel_engine_cs *engine)
2374 {
2375         struct intel_engine_execlists * const execlists = &engine->execlists;
2376         unsigned long flags;
2377
2378         GEM_TRACE("%s: depth<-%d\n", engine->name,
2379                   atomic_read(&execlists->tasklet.count));
2380
2381         /*
2382          * Prevent request submission to the hardware until we have
2383          * completed the reset in i915_gem_reset_finish(). If a request
2384          * is completed by one engine, it may then queue a request
2385          * to a second via its execlists->tasklet *just* as we are
2386          * calling engine->resume() and also writing the ELSP.
2387          * Turning off the execlists->tasklet until the reset is over
2388          * prevents the race.
2389          */
2390         __tasklet_disable_sync_once(&execlists->tasklet);
2391         GEM_BUG_ON(!reset_in_progress(execlists));
2392
2393         /* And flush any current direct submission. */
2394         spin_lock_irqsave(&engine->active.lock, flags);
2395         spin_unlock_irqrestore(&engine->active.lock, flags);
2396
2397         /*
2398          * We stop engines, otherwise we might get failed reset and a
2399          * dead gpu (on elk). Also as modern gpu as kbl can suffer
2400          * from system hang if batchbuffer is progressing when
2401          * the reset is issued, regardless of READY_TO_RESET ack.
2402          * Thus assume it is best to stop engines on all gens
2403          * where we have a gpu reset.
2404          *
2405          * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
2406          *
2407          * FIXME: Wa for more modern gens needs to be validated
2408          */
2409         intel_engine_stop_cs(engine);
2410 }
2411
2412 static void reset_csb_pointers(struct intel_engine_cs *engine)
2413 {
2414         struct intel_engine_execlists * const execlists = &engine->execlists;
2415         const unsigned int reset_value = execlists->csb_size - 1;
2416
2417         ring_set_paused(engine, 0);
2418
2419         /*
2420          * After a reset, the HW starts writing into CSB entry [0]. We
2421          * therefore have to set our HEAD pointer back one entry so that
2422          * the *first* entry we check is entry 0. To complicate this further,
2423          * as we don't wait for the first interrupt after reset, we have to
2424          * fake the HW write to point back to the last entry so that our
2425          * inline comparison of our cached head position against the last HW
2426          * write works even before the first interrupt.
2427          */
2428         execlists->csb_head = reset_value;
2429         WRITE_ONCE(*execlists->csb_write, reset_value);
2430         wmb(); /* Make sure this is visible to HW (paranoia?) */
2431
2432         invalidate_csb_entries(&execlists->csb_status[0],
2433                                &execlists->csb_status[reset_value]);
2434 }
2435
2436 static struct i915_request *active_request(struct i915_request *rq)
2437 {
2438         const struct intel_context * const ce = rq->hw_context;
2439         struct i915_request *active = NULL;
2440         struct list_head *list;
2441
2442         if (!i915_request_is_active(rq)) /* unwound, but incomplete! */
2443                 return rq;
2444
2445         list = &rq->timeline->requests;
2446         list_for_each_entry_from_reverse(rq, list, link) {
2447                 if (i915_request_completed(rq))
2448                         break;
2449
2450                 if (rq->hw_context != ce)
2451                         break;
2452
2453                 active = rq;
2454         }
2455
2456         return active;
2457 }
2458
2459 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
2460 {
2461         struct intel_engine_execlists * const execlists = &engine->execlists;
2462         struct intel_context *ce;
2463         struct i915_request *rq;
2464         u32 *regs;
2465
2466         process_csb(engine); /* drain preemption events */
2467
2468         /* Following the reset, we need to reload the CSB read/write pointers */
2469         reset_csb_pointers(engine);
2470
2471         /*
2472          * Save the currently executing context, even if we completed
2473          * its request, it was still running at the time of the
2474          * reset and will have been clobbered.
2475          */
2476         rq = execlists_active(execlists);
2477         if (!rq)
2478                 goto unwind;
2479
2480         ce = rq->hw_context;
2481         GEM_BUG_ON(i915_active_is_idle(&ce->active));
2482         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2483         rq = active_request(rq);
2484         if (!rq) {
2485                 ce->ring->head = ce->ring->tail;
2486                 goto out_replay;
2487         }
2488
2489         ce->ring->head = intel_ring_wrap(ce->ring, rq->head);
2490
2491         /*
2492          * If this request hasn't started yet, e.g. it is waiting on a
2493          * semaphore, we need to avoid skipping the request or else we
2494          * break the signaling chain. However, if the context is corrupt
2495          * the request will not restart and we will be stuck with a wedged
2496          * device. It is quite often the case that if we issue a reset
2497          * while the GPU is loading the context image, that the context
2498          * image becomes corrupt.
2499          *
2500          * Otherwise, if we have not started yet, the request should replay
2501          * perfectly and we do not need to flag the result as being erroneous.
2502          */
2503         if (!i915_request_started(rq))
2504                 goto out_replay;
2505
2506         /*
2507          * If the request was innocent, we leave the request in the ELSP
2508          * and will try to replay it on restarting. The context image may
2509          * have been corrupted by the reset, in which case we may have
2510          * to service a new GPU hang, but more likely we can continue on
2511          * without impact.
2512          *
2513          * If the request was guilty, we presume the context is corrupt
2514          * and have to at least restore the RING register in the context
2515          * image back to the expected values to skip over the guilty request.
2516          */
2517         __i915_request_reset(rq, stalled);
2518         if (!stalled)
2519                 goto out_replay;
2520
2521         /*
2522          * We want a simple context + ring to execute the breadcrumb update.
2523          * We cannot rely on the context being intact across the GPU hang,
2524          * so clear it and rebuild just what we need for the breadcrumb.
2525          * All pending requests for this context will be zapped, and any
2526          * future request will be after userspace has had the opportunity
2527          * to recreate its own state.
2528          */
2529         regs = ce->lrc_reg_state;
2530         if (engine->pinned_default_state) {
2531                 memcpy(regs, /* skip restoring the vanilla PPHWSP */
2532                        engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
2533                        engine->context_size - PAGE_SIZE);
2534         }
2535         execlists_init_reg_state(regs, ce, engine, ce->ring);
2536
2537 out_replay:
2538         GEM_TRACE("%s replay {head:%04x, tail:%04x\n",
2539                   engine->name, ce->ring->head, ce->ring->tail);
2540         intel_ring_update_space(ce->ring);
2541         __execlists_update_reg_state(ce, engine);
2542
2543 unwind:
2544         /* Push back any incomplete requests for replay after the reset. */
2545         cancel_port_requests(execlists);
2546         __unwind_incomplete_requests(engine);
2547 }
2548
2549 static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
2550 {
2551         unsigned long flags;
2552
2553         GEM_TRACE("%s\n", engine->name);
2554
2555         spin_lock_irqsave(&engine->active.lock, flags);
2556
2557         __execlists_reset(engine, stalled);
2558
2559         spin_unlock_irqrestore(&engine->active.lock, flags);
2560 }
2561
/*
 * Tasklet body installed by execlists_cancel_requests(): once the driver
 * is wedged no further events are processed, so this does nothing.
 */
static void nop_submission_tasklet(unsigned long data)
{
}
2566
2567 static void execlists_cancel_requests(struct intel_engine_cs *engine)
2568 {
2569         struct intel_engine_execlists * const execlists = &engine->execlists;
2570         struct i915_request *rq, *rn;
2571         struct rb_node *rb;
2572         unsigned long flags;
2573
2574         GEM_TRACE("%s\n", engine->name);
2575
2576         /*
2577          * Before we call engine->cancel_requests(), we should have exclusive
2578          * access to the submission state. This is arranged for us by the
2579          * caller disabling the interrupt generation, the tasklet and other
2580          * threads that may then access the same state, giving us a free hand
2581          * to reset state. However, we still need to let lockdep be aware that
2582          * we know this state may be accessed in hardirq context, so we
2583          * disable the irq around this manipulation and we want to keep
2584          * the spinlock focused on its duties and not accidentally conflate
2585          * coverage to the submission's irq state. (Similarly, although we
2586          * shouldn't need to disable irq around the manipulation of the
2587          * submission's irq state, we also wish to remind ourselves that
2588          * it is irq state.)
2589          */
2590         spin_lock_irqsave(&engine->active.lock, flags);
2591
2592         __execlists_reset(engine, true);
2593
2594         /* Mark all executing requests as skipped. */
2595         list_for_each_entry(rq, &engine->active.requests, sched.link)
2596                 mark_eio(rq);
2597
2598         /* Flush the queued requests to the timeline list (for retiring). */
2599         while ((rb = rb_first_cached(&execlists->queue))) {
2600                 struct i915_priolist *p = to_priolist(rb);
2601                 int i;
2602
2603                 priolist_for_each_request_consume(rq, rn, p, i) {
2604                         mark_eio(rq);
2605                         __i915_request_submit(rq);
2606                 }
2607
2608                 rb_erase_cached(&p->node, &execlists->queue);
2609                 i915_priolist_free(p);
2610         }
2611
2612         /* Cancel all attached virtual engines */
2613         while ((rb = rb_first_cached(&execlists->virtual))) {
2614                 struct virtual_engine *ve =
2615                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2616
2617                 rb_erase_cached(rb, &execlists->virtual);
2618                 RB_CLEAR_NODE(rb);
2619
2620                 spin_lock(&ve->base.active.lock);
2621                 rq = fetch_and_zero(&ve->request);
2622                 if (rq) {
2623                         mark_eio(rq);
2624
2625                         rq->engine = engine;
2626                         __i915_request_submit(rq);
2627                         i915_request_put(rq);
2628
2629                         ve->base.execlists.queue_priority_hint = INT_MIN;
2630                 }
2631                 spin_unlock(&ve->base.active.lock);
2632         }
2633
2634         /* Remaining _unready_ requests will be nop'ed when submitted */
2635
2636         execlists->queue_priority_hint = INT_MIN;
2637         execlists->queue = RB_ROOT_CACHED;
2638
2639         GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
2640         execlists->tasklet.func = nop_submission_tasklet;
2641
2642         spin_unlock_irqrestore(&engine->active.lock, flags);
2643 }
2644
2645 static void execlists_reset_finish(struct intel_engine_cs *engine)
2646 {
2647         struct intel_engine_execlists * const execlists = &engine->execlists;
2648
2649         /*
2650          * After a GPU reset, we may have requests to replay. Do so now while
2651          * we still have the forcewake to be sure that the GPU is not allowed
2652          * to sleep before we restart and reload a context.
2653          */
2654         GEM_BUG_ON(!reset_in_progress(execlists));
2655         if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
2656                 execlists->tasklet.func(execlists->tasklet.data);
2657
2658         if (__tasklet_enable(&execlists->tasklet))
2659                 /* And kick in case we missed a new request submission. */
2660                 tasklet_hi_schedule(&execlists->tasklet);
2661         GEM_TRACE("%s: depth->%d\n", engine->name,
2662                   atomic_read(&execlists->tasklet.count));
2663 }
2664
2665 static int gen8_emit_bb_start(struct i915_request *rq,
2666                               u64 offset, u32 len,
2667                               const unsigned int flags)
2668 {
2669         u32 *cs;
2670
2671         cs = intel_ring_begin(rq, 4);
2672         if (IS_ERR(cs))
2673                 return PTR_ERR(cs);
2674
2675         /*
2676          * WaDisableCtxRestoreArbitration:bdw,chv
2677          *
2678          * We don't need to perform MI_ARB_ENABLE as often as we do (in
2679          * particular all the gen that do not need the w/a at all!), if we
2680          * took care to make sure that on every switch into this context
2681          * (both ordinary and for preemption) that arbitrartion was enabled
2682          * we would be fine.  However, for gen8 there is another w/a that
2683          * requires us to not preempt inside GPGPU execution, so we keep
2684          * arbitration disabled for gen8 batches. Arbitration will be
2685          * re-enabled before we close the request
2686          * (engine->emit_fini_breadcrumb).
2687          */
2688         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2689
2690         /* FIXME(BDW+): Address space and security selectors. */
2691         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
2692                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
2693         *cs++ = lower_32_bits(offset);
2694         *cs++ = upper_32_bits(offset);
2695
2696         intel_ring_advance(rq, cs);
2697
2698         return 0;
2699 }
2700
2701 static int gen9_emit_bb_start(struct i915_request *rq,
2702                               u64 offset, u32 len,
2703                               const unsigned int flags)
2704 {
2705         u32 *cs;
2706
2707         cs = intel_ring_begin(rq, 6);
2708         if (IS_ERR(cs))
2709                 return PTR_ERR(cs);
2710
2711         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2712
2713         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
2714                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
2715         *cs++ = lower_32_bits(offset);
2716         *cs++ = upper_32_bits(offset);
2717
2718         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2719         *cs++ = MI_NOOP;
2720
2721         intel_ring_advance(rq, cs);
2722
2723         return 0;
2724 }
2725
2726 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
2727 {
2728         ENGINE_WRITE(engine, RING_IMR,
2729                      ~(engine->irq_enable_mask | engine->irq_keep_mask));
2730         ENGINE_POSTING_READ(engine, RING_IMR);
2731 }
2732
2733 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
2734 {
2735         ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
2736 }
2737
2738 static int gen8_emit_flush(struct i915_request *request, u32 mode)
2739 {
2740         u32 cmd, *cs;
2741
2742         cs = intel_ring_begin(request, 4);
2743         if (IS_ERR(cs))
2744                 return PTR_ERR(cs);
2745
2746         cmd = MI_FLUSH_DW + 1;
2747
2748         /* We always require a command barrier so that subsequent
2749          * commands, such as breadcrumb interrupts, are strictly ordered
2750          * wrt the contents of the write cache being flushed to memory
2751          * (and thus being coherent from the CPU).
2752          */
2753         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
2754
2755         if (mode & EMIT_INVALIDATE) {
2756                 cmd |= MI_INVALIDATE_TLB;
2757                 if (request->engine->class == VIDEO_DECODE_CLASS)
2758                         cmd |= MI_INVALIDATE_BSD;
2759         }
2760
2761         *cs++ = cmd;
2762         *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
2763         *cs++ = 0; /* upper addr */
2764         *cs++ = 0; /* value */
2765         intel_ring_advance(request, cs);
2766
2767         return 0;
2768 }
2769
2770 static int gen8_emit_flush_render(struct i915_request *request,
2771                                   u32 mode)
2772 {
2773         struct intel_engine_cs *engine = request->engine;
2774         u32 scratch_addr =
2775                 intel_gt_scratch_offset(engine->gt,
2776                                         INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
2777         bool vf_flush_wa = false, dc_flush_wa = false;
2778         u32 *cs, flags = 0;
2779         int len;
2780
2781         flags |= PIPE_CONTROL_CS_STALL;
2782
2783         if (mode & EMIT_FLUSH) {
2784                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
2785                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
2786                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
2787                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
2788         }
2789
2790         if (mode & EMIT_INVALIDATE) {
2791                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
2792                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
2793                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
2794                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
2795                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
2796                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
2797                 flags |= PIPE_CONTROL_QW_WRITE;
2798                 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
2799
2800                 /*
2801                  * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
2802                  * pipe control.
2803                  */
2804                 if (IS_GEN(request->i915, 9))
2805                         vf_flush_wa = true;
2806
2807                 /* WaForGAMHang:kbl */
2808                 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
2809                         dc_flush_wa = true;
2810         }
2811
2812         len = 6;
2813
2814         if (vf_flush_wa)
2815                 len += 6;
2816
2817         if (dc_flush_wa)
2818                 len += 12;
2819
2820         cs = intel_ring_begin(request, len);
2821         if (IS_ERR(cs))
2822                 return PTR_ERR(cs);
2823
2824         if (vf_flush_wa)
2825                 cs = gen8_emit_pipe_control(cs, 0, 0);
2826
2827         if (dc_flush_wa)
2828                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
2829                                             0);
2830
2831         cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
2832
2833         if (dc_flush_wa)
2834                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
2835
2836         intel_ring_advance(request, cs);
2837
2838         return 0;
2839 }
2840
2841 static int gen11_emit_flush_render(struct i915_request *request,
2842                                    u32 mode)
2843 {
2844         struct intel_engine_cs *engine = request->engine;
2845         const u32 scratch_addr =
2846                 intel_gt_scratch_offset(engine->gt,
2847                                         INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
2848
2849         if (mode & EMIT_FLUSH) {
2850                 u32 *cs;
2851                 u32 flags = 0;
2852
2853                 flags |= PIPE_CONTROL_CS_STALL;
2854
2855                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
2856                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
2857                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
2858                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
2859                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
2860                 flags |= PIPE_CONTROL_QW_WRITE;
2861                 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
2862
2863                 cs = intel_ring_begin(request, 6);
2864                 if (IS_ERR(cs))
2865                         return PTR_ERR(cs);
2866
2867                 cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
2868                 intel_ring_advance(request, cs);
2869         }
2870
2871         if (mode & EMIT_INVALIDATE) {
2872                 u32 *cs;
2873                 u32 flags = 0;
2874
2875                 flags |= PIPE_CONTROL_CS_STALL;
2876
2877                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
2878                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
2879                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
2880                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
2881                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
2882                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
2883                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
2884                 flags |= PIPE_CONTROL_QW_WRITE;
2885                 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
2886
2887                 cs = intel_ring_begin(request, 6);
2888                 if (IS_ERR(cs))
2889                         return PTR_ERR(cs);
2890
2891                 cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
2892                 intel_ring_advance(request, cs);
2893         }
2894
2895         return 0;
2896 }
2897
2898 /*
2899  * Reserve space for 2 NOOPs at the end of each request to be
2900  * used as a workaround for not being allowed to do lite
2901  * restore with HEAD==TAIL (WaIdleLiteRestore).
2902  */
2903 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
2904 {
2905         /* Ensure there's always at least one preemption point per-request. */
2906         *cs++ = MI_ARB_CHECK;
2907         *cs++ = MI_NOOP;
2908         request->wa_tail = intel_ring_offset(request, cs);
2909
2910         return cs;
2911 }
2912
2913 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
2914 {
2915         *cs++ = MI_SEMAPHORE_WAIT |
2916                 MI_SEMAPHORE_GLOBAL_GTT |
2917                 MI_SEMAPHORE_POLL |
2918                 MI_SEMAPHORE_SAD_EQ_SDD;
2919         *cs++ = 0;
2920         *cs++ = intel_hws_preempt_address(request->engine);
2921         *cs++ = 0;
2922
2923         return cs;
2924 }
2925
2926 static __always_inline u32*
2927 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
2928                                  u32 *cs)
2929 {
2930         *cs++ = MI_USER_INTERRUPT;
2931
2932         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2933         if (intel_engine_has_semaphores(request->engine))
2934                 cs = emit_preempt_busywait(request, cs);
2935
2936         request->tail = intel_ring_offset(request, cs);
2937         assert_ring_tail_valid(request->ring, request->tail);
2938
2939         return gen8_emit_wa_tail(request, cs);
2940 }
2941
2942 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
2943 {
2944         cs = gen8_emit_ggtt_write(cs,
2945                                   request->fence.seqno,
2946                                   request->timeline->hwsp_offset,
2947                                   0);
2948
2949         return gen8_emit_fini_breadcrumb_footer(request, cs);
2950 }
2951
2952 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
2953 {
2954         cs = gen8_emit_ggtt_write_rcs(cs,
2955                                       request->fence.seqno,
2956                                       request->timeline->hwsp_offset,
2957                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
2958                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2959                                       PIPE_CONTROL_DC_FLUSH_ENABLE);
2960
2961         /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
2962         cs = gen8_emit_pipe_control(cs,
2963                                     PIPE_CONTROL_FLUSH_ENABLE |
2964                                     PIPE_CONTROL_CS_STALL,
2965                                     0);
2966
2967         return gen8_emit_fini_breadcrumb_footer(request, cs);
2968 }
2969
2970 static u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *request,
2971                                            u32 *cs)
2972 {
2973         cs = gen8_emit_ggtt_write_rcs(cs,
2974                                       request->fence.seqno,
2975                                       request->timeline->hwsp_offset,
2976                                       PIPE_CONTROL_CS_STALL |
2977                                       PIPE_CONTROL_TILE_CACHE_FLUSH |
2978                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
2979                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2980                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
2981                                       PIPE_CONTROL_FLUSH_ENABLE);
2982
2983         return gen8_emit_fini_breadcrumb_footer(request, cs);
2984 }
2985
2986 static void execlists_park(struct intel_engine_cs *engine)
2987 {
2988         del_timer(&engine->execlists.timer);
2989 }
2990
2991 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
2992 {
2993         engine->submit_request = execlists_submit_request;
2994         engine->cancel_requests = execlists_cancel_requests;
2995         engine->schedule = i915_schedule;
2996         engine->execlists.tasklet.func = execlists_submission_tasklet;
2997
2998         engine->reset.prepare = execlists_reset_prepare;
2999         engine->reset.reset = execlists_reset;
3000         engine->reset.finish = execlists_reset_finish;
3001
3002         engine->park = execlists_park;
3003         engine->unpark = NULL;
3004
3005         engine->flags |= I915_ENGINE_SUPPORTS_STATS;
3006         if (!intel_vgpu_active(engine->i915)) {
3007                 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
3008                 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
3009                         engine->flags |= I915_ENGINE_HAS_PREEMPTION;
3010         }
3011 }
3012
/*
 * engine->destroy hook for execlists engines.
 *
 * Runs the common engine cleanup, releases the workaround context state
 * through lrc_destroy_wa_ctx() and finally frees the engine itself, so
 * @engine must not be touched by the caller afterwards.
 */
static void execlists_destroy(struct intel_engine_cs *engine)
{
	intel_engine_cleanup_common(engine);
	lrc_destroy_wa_ctx(engine);

	/* The engine structure itself is released last. */
	kfree(engine);
}
3019
3020 static void
3021 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
3022 {
3023         /* Default vfuncs which can be overriden by each engine. */
3024
3025         engine->destroy = execlists_destroy;
3026         engine->resume = execlists_resume;
3027
3028         engine->reset.prepare = execlists_reset_prepare;
3029         engine->reset.reset = execlists_reset;
3030         engine->reset.finish = execlists_reset_finish;
3031
3032         engine->cops = &execlists_context_ops;
3033         engine->request_alloc = execlists_request_alloc;
3034
3035         engine->emit_flush = gen8_emit_flush;
3036         engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
3037         engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
3038
3039         engine->set_default_submission = intel_execlists_set_default_submission;
3040
3041         if (INTEL_GEN(engine->i915) < 11) {
3042                 engine->irq_enable = gen8_logical_ring_enable_irq;
3043                 engine->irq_disable = gen8_logical_ring_disable_irq;
3044         } else {
3045                 /*
3046                  * TODO: On Gen11 interrupt masks need to be clear
3047                  * to allow C6 entry. Keep interrupts enabled at
3048                  * and take the hit of generating extra interrupts
3049                  * until a more refined solution exists.
3050                  */
3051         }
3052         if (IS_GEN(engine->i915, 8))
3053                 engine->emit_bb_start = gen8_emit_bb_start;
3054         else
3055                 engine->emit_bb_start = gen9_emit_bb_start;
3056 }
3057
3058 static inline void
3059 logical_ring_default_irqs(struct intel_engine_cs *engine)
3060 {
3061         unsigned int shift = 0;
3062
3063         if (INTEL_GEN(engine->i915) < 11) {
3064                 const u8 irq_shifts[] = {
3065                         [RCS0]  = GEN8_RCS_IRQ_SHIFT,
3066                         [BCS0]  = GEN8_BCS_IRQ_SHIFT,
3067                         [VCS0]  = GEN8_VCS0_IRQ_SHIFT,
3068                         [VCS1]  = GEN8_VCS1_IRQ_SHIFT,
3069                         [VECS0] = GEN8_VECS_IRQ_SHIFT,
3070                 };
3071
3072                 shift = irq_shifts[engine->id];
3073         }
3074
3075         engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
3076         engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
3077 }
3078
3079 static void rcs_submission_override(struct intel_engine_cs *engine)
3080 {
3081         switch (INTEL_GEN(engine->i915)) {
3082         case 12:
3083         case 11:
3084                 engine->emit_flush = gen11_emit_flush_render;
3085                 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
3086                 break;
3087         default:
3088                 engine->emit_flush = gen8_emit_flush_render;
3089                 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
3090                 break;
3091         }
3092 }
3093
3094 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
3095 {
3096         tasklet_init(&engine->execlists.tasklet,
3097                      execlists_submission_tasklet, (unsigned long)engine);
3098         timer_setup(&engine->execlists.timer, execlists_submission_timer, 0);
3099
3100         logical_ring_default_vfuncs(engine);
3101         logical_ring_default_irqs(engine);
3102
3103         if (engine->class == RENDER_CLASS)
3104                 rcs_submission_override(engine);
3105
3106         return 0;
3107 }
3108
3109 int intel_execlists_submission_init(struct intel_engine_cs *engine)
3110 {
3111         struct intel_engine_execlists * const execlists = &engine->execlists;
3112         struct drm_i915_private *i915 = engine->i915;
3113         struct intel_uncore *uncore = engine->uncore;
3114         u32 base = engine->mmio_base;
3115         int ret;
3116
3117         ret = intel_engine_init_common(engine);
3118         if (ret)
3119                 return ret;
3120
3121         if (intel_init_workaround_bb(engine))
3122                 /*
3123                  * We continue even if we fail to initialize WA batch
3124                  * because we only expect rare glitches but nothing
3125                  * critical to prevent us from using GPU
3126                  */
3127                 DRM_ERROR("WA batch buffer initialization failed\n");
3128
3129         if (HAS_LOGICAL_RING_ELSQ(i915)) {
3130                 execlists->submit_reg = uncore->regs +
3131                         i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
3132                 execlists->ctrl_reg = uncore->regs +
3133                         i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
3134         } else {
3135                 execlists->submit_reg = uncore->regs +
3136                         i915_mmio_reg_offset(RING_ELSP(base));
3137         }
3138
3139         execlists->csb_status =
3140                 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
3141
3142         execlists->csb_write =
3143                 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
3144
3145         if (INTEL_GEN(i915) < 11)
3146                 execlists->csb_size = GEN8_CSB_ENTRIES;
3147         else
3148                 execlists->csb_size = GEN11_CSB_ENTRIES;
3149
3150         reset_csb_pointers(engine);
3151
3152         return 0;
3153 }
3154
3155 static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
3156 {
3157         u32 indirect_ctx_offset;
3158
3159         switch (INTEL_GEN(engine->i915)) {
3160         default:
3161                 MISSING_CASE(INTEL_GEN(engine->i915));
3162                 /* fall through */
3163         case 12:
3164                 indirect_ctx_offset =
3165                         GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3166                 break;
3167         case 11:
3168                 indirect_ctx_offset =
3169                         GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3170                 break;
3171         case 10:
3172                 indirect_ctx_offset =
3173                         GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3174                 break;
3175         case 9:
3176                 indirect_ctx_offset =
3177                         GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3178                 break;
3179         case 8:
3180                 indirect_ctx_offset =
3181                         GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3182                 break;
3183         }
3184
3185         return indirect_ctx_offset;
3186 }
3187
3188 static void execlists_init_reg_state(u32 *regs,
3189                                      struct intel_context *ce,
3190                                      struct intel_engine_cs *engine,
3191                                      struct intel_ring *ring)
3192 {
3193         struct i915_ppgtt *ppgtt = i915_vm_to_ppgtt(ce->vm);
3194         bool rcs = engine->class == RENDER_CLASS;
3195         u32 base = engine->mmio_base;
3196
3197         /*
3198          * A context is actually a big batch buffer with several
3199          * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
3200          * values we are setting here are only for the first context restore:
3201          * on a subsequent save, the GPU will recreate this batchbuffer with new
3202          * values (including all the missing MI_LOAD_REGISTER_IMM commands that
3203          * we are not initializing here).
3204          *
3205          * Must keep consistent with virtual_update_register_offsets().
3206          */
3207         regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
3208                                  MI_LRI_FORCE_POSTED;
3209
3210         CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(base),
3211                 _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
3212                 _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH));
3213         if (INTEL_GEN(engine->i915) < 11) {
3214                 regs[CTX_CONTEXT_CONTROL + 1] |=
3215                         _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
3216                                             CTX_CTRL_RS_CTX_ENABLE);
3217         }
3218         CTX_REG(regs, CTX_RING_HEAD, RING_HEAD(base), 0);
3219         CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0);
3220         CTX_REG(regs, CTX_RING_BUFFER_START, RING_START(base), 0);
3221         CTX_REG(regs, CTX_RING_BUFFER_CONTROL, RING_CTL(base),
3222                 RING_CTL_SIZE(ring->size) | RING_VALID);
3223         CTX_REG(regs, CTX_BB_HEAD_U, RING_BBADDR_UDW(base), 0);
3224         CTX_REG(regs, CTX_BB_HEAD_L, RING_BBADDR(base), 0);
3225         CTX_REG(regs, CTX_BB_STATE, RING_BBSTATE(base), RING_BB_PPGTT);
3226         CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0);
3227         CTX_REG(regs, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(base), 0);
3228         CTX_REG(regs, CTX_SECOND_BB_STATE, RING_SBBSTATE(base), 0);
3229         if (rcs) {
3230                 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3231
3232                 CTX_REG(regs, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(base), 0);
3233                 CTX_REG(regs, CTX_RCS_INDIRECT_CTX_OFFSET,
3234                         RING_INDIRECT_CTX_OFFSET(base), 0);
3235                 if (wa_ctx->indirect_ctx.size) {
3236                         u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3237
3238                         regs[CTX_RCS_INDIRECT_CTX + 1] =
3239                                 (ggtt_offset + wa_ctx->indirect_ctx.offset) |
3240                                 (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
3241
3242                         regs[CTX_RCS_INDIRECT_CTX_OFFSET + 1] =
3243                                 intel_lr_indirect_ctx_offset(engine) << 6;
3244                 }
3245
3246                 CTX_REG(regs, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(base), 0);
3247                 if (wa_ctx->per_ctx.size) {
3248                         u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3249
3250                         regs[CTX_BB_PER_CTX_PTR + 1] =
3251                                 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
3252                 }
3253         }
3254
3255         regs[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED;
3256
3257         CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
3258         /* PDP values well be assigned later if needed */
3259         CTX_REG(regs, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(base, 3), 0);
3260         CTX_REG(regs, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(base, 3), 0);
3261         CTX_REG(regs, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(base, 2), 0);
3262         CTX_REG(regs, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(base, 2), 0);
3263         CTX_REG(regs, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(base, 1), 0);
3264         CTX_REG(regs, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(base, 1), 0);
3265         CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(base, 0), 0);
3266         CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(base, 0), 0);
3267
3268         if (i915_vm_is_4lvl(&ppgtt->vm)) {
3269                 /* 64b PPGTT (48bit canonical)
3270                  * PDP0_DESCRIPTOR contains the base address to PML4 and
3271                  * other PDP Descriptors are ignored.
3272                  */
3273                 ASSIGN_CTX_PML4(ppgtt, regs);
3274         } else {
3275                 ASSIGN_CTX_PDP(ppgtt, regs, 3);
3276                 ASSIGN_CTX_PDP(ppgtt, regs, 2);
3277                 ASSIGN_CTX_PDP(ppgtt, regs, 1);
3278                 ASSIGN_CTX_PDP(ppgtt, regs, 0);
3279         }
3280
3281         if (rcs) {
3282                 regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
3283                 CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0);
3284         }
3285
3286         regs[CTX_END] = MI_BATCH_BUFFER_END;
3287         if (INTEL_GEN(engine->i915) >= 10)
3288                 regs[CTX_END] |= BIT(0);
3289 }
3290
3291 static int
3292 populate_lr_context(struct intel_context *ce,
3293                     struct drm_i915_gem_object *ctx_obj,
3294                     struct intel_engine_cs *engine,
3295                     struct intel_ring *ring)
3296 {
3297         void *vaddr;
3298         u32 *regs;
3299         int ret;
3300
3301         vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
3302         if (IS_ERR(vaddr)) {
3303                 ret = PTR_ERR(vaddr);
3304                 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
3305                 return ret;
3306         }
3307
3308         set_redzone(vaddr, engine);
3309
3310         if (engine->default_state) {
3311                 /*
3312                  * We only want to copy over the template context state;
3313                  * skipping over the headers reserved for GuC communication,
3314                  * leaving those as zero.
3315                  */
3316                 const unsigned long start = LRC_HEADER_PAGES * PAGE_SIZE;
3317                 void *defaults;
3318
3319                 defaults = i915_gem_object_pin_map(engine->default_state,
3320                                                    I915_MAP_WB);
3321                 if (IS_ERR(defaults)) {
3322                         ret = PTR_ERR(defaults);
3323                         goto err_unpin_ctx;
3324                 }
3325
3326                 memcpy(vaddr + start, defaults + start, engine->context_size);
3327                 i915_gem_object_unpin_map(engine->default_state);
3328         }
3329
3330         /* The second page of the context object contains some fields which must
3331          * be set up prior to the first execution. */
3332         regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
3333         execlists_init_reg_state(regs, ce, engine, ring);
3334         if (!engine->default_state)
3335                 regs[CTX_CONTEXT_CONTROL + 1] |=
3336                         _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
3337
3338         ret = 0;
3339 err_unpin_ctx:
3340         __i915_gem_object_flush_map(ctx_obj,
3341                                     LRC_HEADER_PAGES * PAGE_SIZE,
3342                                     engine->context_size);
3343         i915_gem_object_unpin_map(ctx_obj);
3344         return ret;
3345 }
3346
3347 static int __execlists_context_alloc(struct intel_context *ce,
3348                                      struct intel_engine_cs *engine)
3349 {
3350         struct drm_i915_gem_object *ctx_obj;
3351         struct intel_ring *ring;
3352         struct i915_vma *vma;
3353         u32 context_size;
3354         int ret;
3355
3356         GEM_BUG_ON(ce->state);
3357         context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
3358
3359         /*
3360          * Before the actual start of the context image, we insert a few pages
3361          * for our own use and for sharing with the GuC.
3362          */
3363         context_size += LRC_HEADER_PAGES * PAGE_SIZE;
3364         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3365                 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
3366
3367         ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
3368         if (IS_ERR(ctx_obj))
3369                 return PTR_ERR(ctx_obj);
3370
3371         vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
3372         if (IS_ERR(vma)) {
3373                 ret = PTR_ERR(vma);
3374                 goto error_deref_obj;
3375         }
3376
3377         if (!ce->timeline) {
3378                 struct intel_timeline *tl;
3379
3380                 tl = intel_timeline_create(engine->gt, NULL);
3381                 if (IS_ERR(tl)) {
3382                         ret = PTR_ERR(tl);
3383                         goto error_deref_obj;
3384                 }
3385
3386                 ce->timeline = tl;
3387         }
3388
3389         ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
3390         if (IS_ERR(ring)) {
3391                 ret = PTR_ERR(ring);
3392                 goto error_deref_obj;
3393         }
3394
3395         ret = populate_lr_context(ce, ctx_obj, engine, ring);
3396         if (ret) {
3397                 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
3398                 goto error_ring_free;
3399         }
3400
3401         ce->ring = ring;
3402         ce->state = vma;
3403
3404         return 0;
3405
3406 error_ring_free:
3407         intel_ring_put(ring);
3408 error_deref_obj:
3409         i915_gem_object_put(ctx_obj);
3410         return ret;
3411 }
3412
3413 static struct list_head *virtual_queue(struct virtual_engine *ve)
3414 {
3415         return &ve->base.execlists.default_priolist.requests[0];
3416 }
3417
3418 static void virtual_context_destroy(struct kref *kref)
3419 {
3420         struct virtual_engine *ve =
3421                 container_of(kref, typeof(*ve), context.ref);
3422         unsigned int n;
3423
3424         GEM_BUG_ON(!list_empty(virtual_queue(ve)));
3425         GEM_BUG_ON(ve->request);
3426         GEM_BUG_ON(ve->context.inflight);
3427
3428         for (n = 0; n < ve->num_siblings; n++) {
3429                 struct intel_engine_cs *sibling = ve->siblings[n];
3430                 struct rb_node *node = &ve->nodes[sibling->id].rb;
3431
3432                 if (RB_EMPTY_NODE(node))
3433                         continue;
3434
3435                 spin_lock_irq(&sibling->active.lock);
3436
3437                 /* Detachment is lazily performed in the execlists tasklet */
3438                 if (!RB_EMPTY_NODE(node))
3439                         rb_erase_cached(node, &sibling->execlists.virtual);
3440
3441                 spin_unlock_irq(&sibling->active.lock);
3442         }
3443         GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
3444
3445         if (ve->context.state)
3446                 __execlists_context_fini(&ve->context);
3447         intel_context_fini(&ve->context);
3448
3449         kfree(ve->bonds);
3450         kfree(ve);
3451 }
3452
3453 static void virtual_engine_initial_hint(struct virtual_engine *ve)
3454 {
3455         int swp;
3456
3457         /*
3458          * Pick a random sibling on starting to help spread the load around.
3459          *
3460          * New contexts are typically created with exactly the same order
3461          * of siblings, and often started in batches. Due to the way we iterate
3462          * the array of sibling when submitting requests, sibling[0] is
3463          * prioritised for dequeuing. If we make sure that sibling[0] is fairly
3464          * randomised across the system, we also help spread the load by the
3465          * first engine we inspect being different each time.
3466          *
3467          * NB This does not force us to execute on this engine, it will just
3468          * typically be the first we inspect for submission.
3469          */
3470         swp = prandom_u32_max(ve->num_siblings);
3471         if (!swp)
3472                 return;
3473
3474         swap(ve->siblings[swp], ve->siblings[0]);
3475         virtual_update_register_offsets(ve->context.lrc_reg_state,
3476                                         ve->siblings[0]);
3477 }
3478
3479 static int virtual_context_pin(struct intel_context *ce)
3480 {
3481         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
3482         int err;
3483
3484         /* Note: we must use a real engine class for setting up reg state */
3485         err = __execlists_context_pin(ce, ve->siblings[0]);
3486         if (err)
3487                 return err;
3488
3489         virtual_engine_initial_hint(ve);
3490         return 0;
3491 }
3492
3493 static void virtual_context_enter(struct intel_context *ce)
3494 {
3495         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
3496         unsigned int n;
3497
3498         for (n = 0; n < ve->num_siblings; n++)
3499                 intel_engine_pm_get(ve->siblings[n]);
3500
3501         intel_timeline_enter(ce->timeline);
3502 }
3503
3504 static void virtual_context_exit(struct intel_context *ce)
3505 {
3506         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
3507         unsigned int n;
3508
3509         intel_timeline_exit(ce->timeline);
3510
3511         for (n = 0; n < ve->num_siblings; n++)
3512                 intel_engine_pm_put(ve->siblings[n]);
3513 }
3514
3515 static const struct intel_context_ops virtual_context_ops = {
3516         .pin = virtual_context_pin,
3517         .unpin = execlists_context_unpin,
3518
3519         .enter = virtual_context_enter,
3520         .exit = virtual_context_exit,
3521
3522         .destroy = virtual_context_destroy,
3523 };
3524
3525 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
3526 {
3527         struct i915_request *rq;
3528         intel_engine_mask_t mask;
3529
3530         rq = READ_ONCE(ve->request);
3531         if (!rq)
3532                 return 0;
3533
3534         /* The rq is ready for submission; rq->execution_mask is now stable. */
3535         mask = rq->execution_mask;
3536         if (unlikely(!mask)) {
3537                 /* Invalid selection, submit to a random engine in error */
3538                 i915_request_skip(rq, -ENODEV);
3539                 mask = ve->siblings[0]->mask;
3540         }
3541
3542         GEM_TRACE("%s: rq=%llx:%lld, mask=%x, prio=%d\n",
3543                   ve->base.name,
3544                   rq->fence.context, rq->fence.seqno,
3545                   mask, ve->base.execlists.queue_priority_hint);
3546
3547         return mask;
3548 }
3549
3550 static void virtual_submission_tasklet(unsigned long data)
3551 {
3552         struct virtual_engine * const ve = (struct virtual_engine *)data;
3553         const int prio = ve->base.execlists.queue_priority_hint;
3554         intel_engine_mask_t mask;
3555         unsigned int n;
3556
3557         rcu_read_lock();
3558         mask = virtual_submission_mask(ve);
3559         rcu_read_unlock();
3560         if (unlikely(!mask))
3561                 return;
3562
3563         local_irq_disable();
3564         for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
3565                 struct intel_engine_cs *sibling = ve->siblings[n];
3566                 struct ve_node * const node = &ve->nodes[sibling->id];
3567                 struct rb_node **parent, *rb;
3568                 bool first;
3569
3570                 if (unlikely(!(mask & sibling->mask))) {
3571                         if (!RB_EMPTY_NODE(&node->rb)) {
3572                                 spin_lock(&sibling->active.lock);
3573                                 rb_erase_cached(&node->rb,
3574                                                 &sibling->execlists.virtual);
3575                                 RB_CLEAR_NODE(&node->rb);
3576                                 spin_unlock(&sibling->active.lock);
3577                         }
3578                         continue;
3579                 }
3580
3581                 spin_lock(&sibling->active.lock);
3582
3583                 if (!RB_EMPTY_NODE(&node->rb)) {
3584                         /*
3585                          * Cheat and avoid rebalancing the tree if we can
3586                          * reuse this node in situ.
3587                          */
3588                         first = rb_first_cached(&sibling->execlists.virtual) ==
3589                                 &node->rb;
3590                         if (prio == node->prio || (prio > node->prio && first))
3591                                 goto submit_engine;
3592
3593                         rb_erase_cached(&node->rb, &sibling->execlists.virtual);
3594                 }
3595
3596                 rb = NULL;
3597                 first = true;
3598                 parent = &sibling->execlists.virtual.rb_root.rb_node;
3599                 while (*parent) {
3600                         struct ve_node *other;
3601
3602                         rb = *parent;
3603                         other = rb_entry(rb, typeof(*other), rb);
3604                         if (prio > other->prio) {
3605                                 parent = &rb->rb_left;
3606                         } else {
3607                                 parent = &rb->rb_right;
3608                                 first = false;
3609                         }
3610                 }
3611
3612                 rb_link_node(&node->rb, rb, parent);
3613                 rb_insert_color_cached(&node->rb,
3614                                        &sibling->execlists.virtual,
3615                                        first);
3616
3617 submit_engine:
3618                 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
3619                 node->prio = prio;
3620                 if (first && prio > sibling->execlists.queue_priority_hint) {
3621                         sibling->execlists.queue_priority_hint = prio;
3622                         tasklet_hi_schedule(&sibling->execlists.tasklet);
3623                 }
3624
3625                 spin_unlock(&sibling->active.lock);
3626         }
3627         local_irq_enable();
3628 }
3629
3630 static void virtual_submit_request(struct i915_request *rq)
3631 {
3632         struct virtual_engine *ve = to_virtual_engine(rq->engine);
3633         struct i915_request *old;
3634         unsigned long flags;
3635
3636         GEM_TRACE("%s: rq=%llx:%lld\n",
3637                   ve->base.name,
3638                   rq->fence.context,
3639                   rq->fence.seqno);
3640
3641         GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
3642
3643         spin_lock_irqsave(&ve->base.active.lock, flags);
3644
3645         old = ve->request;
3646         if (old) { /* background completion event from preempt-to-busy */
3647                 GEM_BUG_ON(!i915_request_completed(old));
3648                 __i915_request_submit(old);
3649                 i915_request_put(old);
3650         }
3651
3652         if (i915_request_completed(rq)) {
3653                 __i915_request_submit(rq);
3654
3655                 ve->base.execlists.queue_priority_hint = INT_MIN;
3656                 ve->request = NULL;
3657         } else {
3658                 ve->base.execlists.queue_priority_hint = rq_prio(rq);
3659                 ve->request = i915_request_get(rq);
3660
3661                 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
3662                 list_move_tail(&rq->sched.link, virtual_queue(ve));
3663
3664                 tasklet_schedule(&ve->base.execlists.tasklet);
3665         }
3666
3667         spin_unlock_irqrestore(&ve->base.active.lock, flags);
3668 }
3669
3670 static struct ve_bond *
3671 virtual_find_bond(struct virtual_engine *ve,
3672                   const struct intel_engine_cs *master)
3673 {
3674         int i;
3675
3676         for (i = 0; i < ve->num_bonds; i++) {
3677                 if (ve->bonds[i].master == master)
3678                         return &ve->bonds[i];
3679         }
3680
3681         return NULL;
3682 }
3683
3684 static void
3685 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
3686 {
3687         struct virtual_engine *ve = to_virtual_engine(rq->engine);
3688         intel_engine_mask_t allowed, exec;
3689         struct ve_bond *bond;
3690
3691         allowed = ~to_request(signal)->engine->mask;
3692
3693         bond = virtual_find_bond(ve, to_request(signal)->engine);
3694         if (bond)
3695                 allowed &= bond->sibling_mask;
3696
3697         /* Restrict the bonded request to run on only the available engines */
3698         exec = READ_ONCE(rq->execution_mask);
3699         while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
3700                 ;
3701
3702         /* Prevent the master from being re-run on the bonded engines */
3703         to_request(signal)->execution_mask &= ~allowed;
3704 }
3705
3706 struct intel_context *
3707 intel_execlists_create_virtual(struct i915_gem_context *ctx,
3708                                struct intel_engine_cs **siblings,
3709                                unsigned int count)
3710 {
3711         struct virtual_engine *ve;
3712         unsigned int n;
3713         int err;
3714
3715         if (count == 0)
3716                 return ERR_PTR(-EINVAL);
3717
3718         if (count == 1)
3719                 return intel_context_create(ctx, siblings[0]);
3720
3721         ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
3722         if (!ve)
3723                 return ERR_PTR(-ENOMEM);
3724
3725         ve->base.i915 = ctx->i915;
3726         ve->base.gt = siblings[0]->gt;
3727         ve->base.id = -1;
3728
3729         ve->base.class = OTHER_CLASS;
3730         ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
3731         ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
3732         ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
3733
3734         /*
3735          * The decision on whether to submit a request using semaphores
3736          * depends on the saturated state of the engine. We only compute
3737          * this during HW submission of the request, and we need for this
3738          * state to be globally applied to all requests being submitted
3739          * to this engine. Virtual engines encompass more than one physical
3740          * engine and so we cannot accurately tell in advance if one of those
3741          * engines is already saturated and so cannot afford to use a semaphore
3742          * and be pessimized in priority for doing so -- if we are the only
3743          * context using semaphores after all other clients have stopped, we
3744          * will be starved on the saturated system. Such a global switch for
3745          * semaphores is less than ideal, but alas is the current compromise.
3746          */
3747         ve->base.saturated = ALL_ENGINES;
3748
3749         snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
3750
3751         intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
3752
3753         intel_engine_init_execlists(&ve->base);
3754
3755         ve->base.cops = &virtual_context_ops;
3756         ve->base.request_alloc = execlists_request_alloc;
3757
3758         ve->base.schedule = i915_schedule;
3759         ve->base.submit_request = virtual_submit_request;
3760         ve->base.bond_execute = virtual_bond_execute;
3761
3762         INIT_LIST_HEAD(virtual_queue(ve));
3763         ve->base.execlists.queue_priority_hint = INT_MIN;
3764         tasklet_init(&ve->base.execlists.tasklet,
3765                      virtual_submission_tasklet,
3766                      (unsigned long)ve);
3767
3768         intel_context_init(&ve->context, ctx, &ve->base);
3769
3770         for (n = 0; n < count; n++) {
3771                 struct intel_engine_cs *sibling = siblings[n];
3772
3773                 GEM_BUG_ON(!is_power_of_2(sibling->mask));
3774                 if (sibling->mask & ve->base.mask) {
3775                         DRM_DEBUG("duplicate %s entry in load balancer\n",
3776                                   sibling->name);
3777                         err = -EINVAL;
3778                         goto err_put;
3779                 }
3780
3781                 /*
3782                  * The virtual engine implementation is tightly coupled to
3783                  * the execlists backend -- we push out request directly
3784                  * into a tree inside each physical engine. We could support
3785                  * layering if we handle cloning of the requests and
3786                  * submitting a copy into each backend.
3787                  */
3788                 if (sibling->execlists.tasklet.func !=
3789                     execlists_submission_tasklet) {
3790                         err = -ENODEV;
3791                         goto err_put;
3792                 }
3793
3794                 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
3795                 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
3796
3797                 ve->siblings[ve->num_siblings++] = sibling;
3798                 ve->base.mask |= sibling->mask;
3799
3800                 /*
3801                  * All physical engines must be compatible for their emission
3802                  * functions (as we build the instructions during request
3803                  * construction and do not alter them before submission
3804                  * on the physical engine). We use the engine class as a guide
3805                  * here, although that could be refined.
3806                  */
3807                 if (ve->base.class != OTHER_CLASS) {
3808                         if (ve->base.class != sibling->class) {
3809                                 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
3810                                           sibling->class, ve->base.class);
3811                                 err = -EINVAL;
3812                                 goto err_put;
3813                         }
3814                         continue;
3815                 }
3816
3817                 ve->base.class = sibling->class;
3818                 ve->base.uabi_class = sibling->uabi_class;
3819                 snprintf(ve->base.name, sizeof(ve->base.name),
3820                          "v%dx%d", ve->base.class, count);
3821                 ve->base.context_size = sibling->context_size;
3822
3823                 ve->base.emit_bb_start = sibling->emit_bb_start;
3824                 ve->base.emit_flush = sibling->emit_flush;
3825                 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
3826                 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
3827                 ve->base.emit_fini_breadcrumb_dw =
3828                         sibling->emit_fini_breadcrumb_dw;
3829
3830                 ve->base.flags = sibling->flags;
3831         }
3832
3833         ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
3834
3835         err = __execlists_context_alloc(&ve->context, siblings[0]);
3836         if (err)
3837                 goto err_put;
3838
3839         __set_bit(CONTEXT_ALLOC_BIT, &ve->context.flags);
3840
3841         return &ve->context;
3842
3843 err_put:
3844         intel_context_put(&ve->context);
3845         return ERR_PTR(err);
3846 }
3847
3848 struct intel_context *
3849 intel_execlists_clone_virtual(struct i915_gem_context *ctx,
3850                               struct intel_engine_cs *src)
3851 {
3852         struct virtual_engine *se = to_virtual_engine(src);
3853         struct intel_context *dst;
3854
3855         dst = intel_execlists_create_virtual(ctx,
3856                                              se->siblings,
3857                                              se->num_siblings);
3858         if (IS_ERR(dst))
3859                 return dst;
3860
3861         if (se->num_bonds) {
3862                 struct virtual_engine *de = to_virtual_engine(dst->engine);
3863
3864                 de->bonds = kmemdup(se->bonds,
3865                                     sizeof(*se->bonds) * se->num_bonds,
3866                                     GFP_KERNEL);
3867                 if (!de->bonds) {
3868                         intel_context_put(dst);
3869                         return ERR_PTR(-ENOMEM);
3870                 }
3871
3872                 de->num_bonds = se->num_bonds;
3873         }
3874
3875         return dst;
3876 }
3877
3878 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
3879                                      const struct intel_engine_cs *master,
3880                                      const struct intel_engine_cs *sibling)
3881 {
3882         struct virtual_engine *ve = to_virtual_engine(engine);
3883         struct ve_bond *bond;
3884         int n;
3885
3886         /* Sanity check the sibling is part of the virtual engine */
3887         for (n = 0; n < ve->num_siblings; n++)
3888                 if (sibling == ve->siblings[n])
3889                         break;
3890         if (n == ve->num_siblings)
3891                 return -EINVAL;
3892
3893         bond = virtual_find_bond(ve, master);
3894         if (bond) {
3895                 bond->sibling_mask |= sibling->mask;
3896                 return 0;
3897         }
3898
3899         bond = krealloc(ve->bonds,
3900                         sizeof(*bond) * (ve->num_bonds + 1),
3901                         GFP_KERNEL);
3902         if (!bond)
3903                 return -ENOMEM;
3904
3905         bond[ve->num_bonds].master = master;
3906         bond[ve->num_bonds].sibling_mask = sibling->mask;
3907
3908         ve->bonds = bond;
3909         ve->num_bonds++;
3910
3911         return 0;
3912 }
3913
3914 void intel_execlists_show_requests(struct intel_engine_cs *engine,
3915                                    struct drm_printer *m,
3916                                    void (*show_request)(struct drm_printer *m,
3917                                                         struct i915_request *rq,
3918                                                         const char *prefix),
3919                                    unsigned int max)
3920 {
3921         const struct intel_engine_execlists *execlists = &engine->execlists;
3922         struct i915_request *rq, *last;
3923         unsigned long flags;
3924         unsigned int count;
3925         struct rb_node *rb;
3926
3927         spin_lock_irqsave(&engine->active.lock, flags);
3928
3929         last = NULL;
3930         count = 0;
3931         list_for_each_entry(rq, &engine->active.requests, sched.link) {
3932                 if (count++ < max - 1)
3933                         show_request(m, rq, "\t\tE ");
3934                 else
3935                         last = rq;
3936         }
3937         if (last) {
3938                 if (count > max) {
3939                         drm_printf(m,
3940                                    "\t\t...skipping %d executing requests...\n",
3941                                    count - max);
3942                 }
3943                 show_request(m, last, "\t\tE ");
3944         }
3945
3946         last = NULL;
3947         count = 0;
3948         if (execlists->queue_priority_hint != INT_MIN)
3949                 drm_printf(m, "\t\tQueue priority hint: %d\n",
3950                            execlists->queue_priority_hint);
3951         for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
3952                 struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
3953                 int i;
3954
3955                 priolist_for_each_request(rq, p, i) {
3956                         if (count++ < max - 1)
3957                                 show_request(m, rq, "\t\tQ ");
3958                         else
3959                                 last = rq;
3960                 }
3961         }
3962         if (last) {
3963                 if (count > max) {
3964                         drm_printf(m,
3965                                    "\t\t...skipping %d queued requests...\n",
3966                                    count - max);
3967                 }
3968                 show_request(m, last, "\t\tQ ");
3969         }
3970
3971         last = NULL;
3972         count = 0;
3973         for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
3974                 struct virtual_engine *ve =
3975                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3976                 struct i915_request *rq = READ_ONCE(ve->request);
3977
3978                 if (rq) {
3979                         if (count++ < max - 1)
3980                                 show_request(m, rq, "\t\tV ");
3981                         else
3982                                 last = rq;
3983                 }
3984         }
3985         if (last) {
3986                 if (count > max) {
3987                         drm_printf(m,
3988                                    "\t\t...skipping %d virtual requests...\n",
3989                                    count - max);
3990                 }
3991                 show_request(m, last, "\t\tV ");
3992         }
3993
3994         spin_unlock_irqrestore(&engine->active.lock, flags);
3995 }
3996
3997 void intel_lr_context_reset(struct intel_engine_cs *engine,
3998                             struct intel_context *ce,
3999                             u32 head,
4000                             bool scrub)
4001 {
4002         /*
4003          * We want a simple context + ring to execute the breadcrumb update.
4004          * We cannot rely on the context being intact across the GPU hang,
4005          * so clear it and rebuild just what we need for the breadcrumb.
4006          * All pending requests for this context will be zapped, and any
4007          * future request will be after userspace has had the opportunity
4008          * to recreate its own state.
4009          */
4010         if (scrub) {
4011                 u32 *regs = ce->lrc_reg_state;
4012
4013                 if (engine->pinned_default_state) {
4014                         memcpy(regs, /* skip restoring the vanilla PPHWSP */
4015                                engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
4016                                engine->context_size - PAGE_SIZE);
4017                 }
4018                 execlists_init_reg_state(regs, ce, engine, ce->ring);
4019         }
4020
4021         /* Rerun the request; its payload has been neutered (if guilty). */
4022         ce->ring->head = head;
4023         intel_ring_update_space(ce->ring);
4024
4025         __execlists_update_reg_state(ce, engine);
4026 }
4027
4028 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
4029 #include "selftest_lrc.c"
4030 #endif