/*
 * Copyright (c) 2008 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *    Keith Packard <keithp@keithp.com>
 *    Mika Kuoppala <mika.kuoppala@intel.com>
 *
 */

#include <linux/ascii85.h>
#include <linux/nmi.h>
#include <linux/scatterlist.h>
#include <linux/stop_machine.h>
#include <linux/utsname.h>
#include <linux/zlib.h>

#include <drm/drm_print.h>

#include "display/intel_atomic.h"
#include "display/intel_overlay.h"

#include "gem/i915_gem_context.h"

#include "i915_drv.h"
#include "i915_gpu_error.h"
#include "i915_scatterlist.h"
#include "intel_csr.h"

static inline const struct intel_engine_cs *
engine_lookup(const struct drm_i915_private *i915, unsigned int id)
{
        if (id >= I915_NUM_ENGINES)
                return NULL;

        return i915->engine[id];
}

static inline const char *
__engine_name(const struct intel_engine_cs *engine)
{
        return engine ? engine->name : "";
}

static const char *
engine_name(const struct drm_i915_private *i915, unsigned int id)
{
        return __engine_name(engine_lookup(i915, id));
}

static const char *tiling_flag(int tiling)
{
        switch (tiling) {
        default:
        case I915_TILING_NONE: return "";
        case I915_TILING_X: return " X";
        case I915_TILING_Y: return " Y";
        }
}

static const char *dirty_flag(int dirty)
{
        return dirty ? " dirty" : "";
}

static const char *purgeable_flag(int purgeable)
{
        return purgeable ? " purgeable" : "";
}

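/*
 * The formatted error state is accumulated into a chained scatterlist of
 * kmalloc'ed buffers. Note that sg->dma_address is repurposed here to
 * record the byte offset of each chunk within the whole dump, so readers
 * can seek without re-walking from the start.
 */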
static void __sg_set_buf(struct scatterlist *sg,
                         void *addr, unsigned int len, loff_t it)
{
        sg->page_link = (unsigned long)virt_to_page(addr);
        sg->offset = offset_in_page(addr);
        sg->length = len;
        sg->dma_address = it;
}

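/*
 * Grow the formatting buffer: seal the current buffer into the next
 * scatterlist entry, chain on a fresh sg page when the table fills up,
 * then allocate a new buffer (preferring a cheap 64K attempt before
 * falling back to a page-aligned GFP_KERNEL allocation).
 */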
static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
{
        if (!len)
                return false;

        if (e->bytes + len + 1 <= e->size)
                return true;

        if (e->bytes) {
                __sg_set_buf(e->cur++, e->buf, e->bytes, e->iter);
                e->iter += e->bytes;
                e->buf = NULL;
                e->bytes = 0;
        }

        if (e->cur == e->end) {
                struct scatterlist *sgl;

                sgl = (typeof(sgl))__get_free_page(GFP_KERNEL);
                if (!sgl) {
                        e->err = -ENOMEM;
                        return false;
                }

                if (e->cur) {
                        e->cur->offset = 0;
                        e->cur->length = 0;
                        e->cur->page_link =
                                (unsigned long)sgl | SG_CHAIN;
                } else {
                        e->sgl = sgl;
                }

                e->cur = sgl;
                e->end = sgl + SG_MAX_SINGLE_ALLOC - 1;
        }

        e->size = ALIGN(len + 1, SZ_64K);
        e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
        if (!e->buf) {
                e->size = PAGE_ALIGN(len + 1);
                e->buf = kmalloc(e->size, GFP_KERNEL);
        }
        if (!e->buf) {
                e->err = -ENOMEM;
                return false;
        }

        return true;
}

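/*
 * Format into the growable buffer. The output is sized with a first
 * vsnprintf() pass against a NULL buffer, then printed for real after
 * growing; any failure is latched in e->err so later calls are no-ops.
 */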
__printf(2, 0)
static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
                               const char *fmt, va_list args)
{
        va_list ap;
        int len;

        if (e->err)
                return;

        va_copy(ap, args);
        len = vsnprintf(NULL, 0, fmt, ap);
        va_end(ap);
        if (len <= 0) {
                e->err = len;
                return;
        }

        if (!__i915_error_grow(e, len))
                return;

        GEM_BUG_ON(e->bytes >= e->size);
        len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args);
        if (len < 0) {
                e->err = len;
                return;
        }
        e->bytes += len;
}

static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str)
{
        unsigned len;

        if (e->err || !str)
                return;

        len = strlen(str);
        if (!__i915_error_grow(e, len))
                return;

        GEM_BUG_ON(e->bytes + len > e->size);
        memcpy(e->buf + e->bytes, str, len);
        e->bytes += len;
}

#define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
#define err_puts(e, s) i915_error_puts(e, s)

static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf)
{
        i915_error_vprintf(p->arg, vaf->fmt, *vaf->va);
}

static inline struct drm_printer
i915_error_printer(struct drm_i915_error_state_buf *e)
{
        struct drm_printer p = {
                .printfn = __i915_printfn_error,
                .arg = e,
        };
        return p;
}

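/*
 * Two interchangeable backends follow: with CONFIG_DRM_I915_COMPRESS_ERROR
 * the captured pages are deflated with zlib, otherwise they are copied
 * verbatim. The compression marker emitted into the dump (":" for zlib,
 * "~" for uncompressed) tells the decoder which variant produced it.
 */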
#ifdef CONFIG_DRM_I915_COMPRESS_ERROR

struct compress {
        struct z_stream_s zstream;
        void *tmp;
};

static bool compress_init(struct compress *c)
{
        struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream));

        zstream->workspace =
                kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
                        GFP_ATOMIC | __GFP_NOWARN);
        if (!zstream->workspace)
                return false;

        if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
                kfree(zstream->workspace);
                return false;
        }

        c->tmp = NULL;
        if (i915_has_memcpy_from_wc())
                c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);

        return true;
}

static void *compress_next_page(struct drm_i915_error_object *dst)
{
        unsigned long page;

        if (dst->page_count >= dst->num_pages)
                return ERR_PTR(-ENOSPC);

        page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
        if (!page)
                return ERR_PTR(-ENOMEM);

        return dst->pages[dst->page_count++] = (void *)page;
}

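/*
 * Deflate one page into the error object. If a spare page is available,
 * the source (write-combining memory) is first staged through c->tmp with
 * i915_memcpy_from_wc() so that zlib reads from cached memory.
 */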
static int compress_page(struct compress *c,
                         void *src,
                         struct drm_i915_error_object *dst)
{
        struct z_stream_s *zstream = &c->zstream;

        zstream->next_in = src;
        if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
                zstream->next_in = c->tmp;
        zstream->avail_in = PAGE_SIZE;

        do {
                if (zstream->avail_out == 0) {
                        zstream->next_out = compress_next_page(dst);
                        if (IS_ERR(zstream->next_out))
                                return PTR_ERR(zstream->next_out);

                        zstream->avail_out = PAGE_SIZE;
                }

                if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK)
                        return -EIO;

                touch_nmi_watchdog();
        } while (zstream->avail_in);

        /* Fallback to uncompressed if we increase size? */
        if (0 && zstream->total_out > zstream->total_in)
                return -E2BIG;

        return 0;
}

static int compress_flush(struct compress *c,
                          struct drm_i915_error_object *dst)
{
        struct z_stream_s *zstream = &c->zstream;

        do {
                switch (zlib_deflate(zstream, Z_FINISH)) {
                case Z_OK: /* more space requested */
                        zstream->next_out = compress_next_page(dst);
                        if (IS_ERR(zstream->next_out))
                                return PTR_ERR(zstream->next_out);

                        zstream->avail_out = PAGE_SIZE;
                        break;

                case Z_STREAM_END:
                        goto end;

                default: /* any error */
                        return -EIO;
                }
        } while (1);

end:
        memset(zstream->next_out, 0, zstream->avail_out);
        dst->unused = zstream->avail_out;
        return 0;
}

static void compress_fini(struct compress *c,
                          struct drm_i915_error_object *dst)
{
        struct z_stream_s *zstream = &c->zstream;

        zlib_deflateEnd(zstream);
        kfree(zstream->workspace);
        if (c->tmp)
                free_page((unsigned long)c->tmp);
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
        err_puts(m, ":");
}

#else

struct compress {
};

static bool compress_init(struct compress *c)
{
        return true;
}

static int compress_page(struct compress *c,
                         void *src,
                         struct drm_i915_error_object *dst)
{
        unsigned long page;
        void *ptr;

        page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
        if (!page)
                return -ENOMEM;

        ptr = (void *)page;
        if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
                memcpy(ptr, src, PAGE_SIZE);
        dst->pages[dst->page_count++] = ptr;

        return 0;
}

static int compress_flush(struct compress *c,
                          struct drm_i915_error_object *dst)
{
        return 0;
}

static void compress_fini(struct compress *c,
                          struct drm_i915_error_object *dst)
{
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
        err_puts(m, "~");
}

#endif

static void print_error_buffers(struct drm_i915_error_state_buf *m,
                                const char *name,
                                struct drm_i915_error_buffer *err,
                                int count)
{
        err_printf(m, "%s [%d]:\n", name, count);

        while (count--) {
                err_printf(m, "    %08x_%08x %8u %02x %02x",
                           upper_32_bits(err->gtt_offset),
                           lower_32_bits(err->gtt_offset),
                           err->size,
                           err->read_domains,
                           err->write_domain);
                err_puts(m, tiling_flag(err->tiling));
                err_puts(m, dirty_flag(err->dirty));
                err_puts(m, purgeable_flag(err->purgeable));
                err_puts(m, err->userptr ? " userptr" : "");
                err_puts(m, i915_cache_level_str(m->i915, err->cache_level));

                if (err->name)
                        err_printf(m, " (name: %d)", err->name);
                if (err->fence_reg != I915_FENCE_REG_NONE)
                        err_printf(m, " (fence: %d)", err->fence_reg);

                err_puts(m, "\n");
                err++;
        }
}

static void error_print_instdone(struct drm_i915_error_state_buf *m,
                                 const struct drm_i915_error_engine *ee)
{
        int slice;
        int subslice;

        err_printf(m, "  INSTDONE: 0x%08x\n",
                   ee->instdone.instdone);

        if (ee->engine_id != RCS0 || INTEL_GEN(m->i915) <= 3)
                return;

        err_printf(m, "  SC_INSTDONE: 0x%08x\n",
                   ee->instdone.slice_common);

        if (INTEL_GEN(m->i915) <= 6)
                return;

        for_each_instdone_slice_subslice(m->i915, slice, subslice)
                err_printf(m, "  SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
                           slice, subslice,
                           ee->instdone.sampler[slice][subslice]);

        for_each_instdone_slice_subslice(m->i915, slice, subslice)
                err_printf(m, "  ROW_INSTDONE[%d][%d]: 0x%08x\n",
                           slice, subslice,
                           ee->instdone.row[slice][subslice]);
}

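/*
 * One line per request: "!" marks a request that has already signaled
 * completion, "+" one that has merely enabled signaling; timestamps are
 * printed relative to the capture epoch.
 */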
static void error_print_request(struct drm_i915_error_state_buf *m,
                                const char *prefix,
                                const struct drm_i915_error_request *erq,
                                const unsigned long epoch)
{
        if (!erq->seqno)
                return;

        err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
                   prefix, erq->pid, erq->context, erq->seqno,
                   test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
                            &erq->flags) ? "!" : "",
                   test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
                            &erq->flags) ? "+" : "",
                   erq->sched_attr.priority,
                   jiffies_to_msecs(erq->jiffies - epoch),
                   erq->start, erq->head, erq->tail);
}

static void error_print_context(struct drm_i915_error_state_buf *m,
                                const char *header,
                                const struct drm_i915_error_context *ctx)
{
        err_printf(m, "%s%s[%d] hw_id %d, prio %d, guilty %d active %d\n",
                   header, ctx->comm, ctx->pid, ctx->hw_id,
                   ctx->sched_attr.priority, ctx->guilty, ctx->active);
}

static void error_print_engine(struct drm_i915_error_state_buf *m,
                               const struct drm_i915_error_engine *ee,
                               const unsigned long epoch)
{
        int n;

        err_printf(m, "%s command stream:\n",
                   engine_name(m->i915, ee->engine_id));
        err_printf(m, "  IDLE?: %s\n", yesno(ee->idle));
        err_printf(m, "  START: 0x%08x\n", ee->start);
        err_printf(m, "  HEAD:  0x%08x [0x%08x]\n", ee->head, ee->rq_head);
        err_printf(m, "  TAIL:  0x%08x [0x%08x, 0x%08x]\n",
                   ee->tail, ee->rq_post, ee->rq_tail);
        err_printf(m, "  CTL:   0x%08x\n", ee->ctl);
        err_printf(m, "  MODE:  0x%08x\n", ee->mode);
        err_printf(m, "  HWS:   0x%08x\n", ee->hws);
        err_printf(m, "  ACTHD: 0x%08x %08x\n",
                   (u32)(ee->acthd>>32), (u32)ee->acthd);
        err_printf(m, "  IPEIR: 0x%08x\n", ee->ipeir);
        err_printf(m, "  IPEHR: 0x%08x\n", ee->ipehr);

        error_print_instdone(m, ee);

        if (ee->batchbuffer) {
                u64 start = ee->batchbuffer->gtt_offset;
                u64 end = start + ee->batchbuffer->gtt_size;

                err_printf(m, "  batch: [0x%08x_%08x, 0x%08x_%08x]\n",
                           upper_32_bits(start), lower_32_bits(start),
                           upper_32_bits(end), lower_32_bits(end));
        }
        if (INTEL_GEN(m->i915) >= 4) {
                err_printf(m, "  BBADDR: 0x%08x_%08x\n",
                           (u32)(ee->bbaddr>>32), (u32)ee->bbaddr);
                err_printf(m, "  BB_STATE: 0x%08x\n", ee->bbstate);
                err_printf(m, "  INSTPS: 0x%08x\n", ee->instps);
        }
        err_printf(m, "  INSTPM: 0x%08x\n", ee->instpm);
        err_printf(m, "  FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr),
                   lower_32_bits(ee->faddr));
        if (INTEL_GEN(m->i915) >= 6) {
                err_printf(m, "  RC PSMI: 0x%08x\n", ee->rc_psmi);
                err_printf(m, "  FAULT_REG: 0x%08x\n", ee->fault_reg);
        }
        if (HAS_PPGTT(m->i915)) {
                err_printf(m, "  GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);

                if (INTEL_GEN(m->i915) >= 8) {
                        int i;
                        for (i = 0; i < 4; i++)
                                err_printf(m, "  PDP%d: 0x%016llx\n",
                                           i, ee->vm_info.pdp[i]);
                } else {
                        err_printf(m, "  PP_DIR_BASE: 0x%08x\n",
                                   ee->vm_info.pp_dir_base);
                }
        }
        err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
        err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
        err_printf(m, "  hangcheck timestamp: %dms (%lu%s)\n",
                   jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
                   ee->hangcheck_timestamp,
                   ee->hangcheck_timestamp == epoch ? "; epoch" : "");
        err_printf(m, "  engine reset count: %u\n", ee->reset_count);

        for (n = 0; n < ee->num_ports; n++) {
                err_printf(m, "  ELSP[%d]:", n);
                error_print_request(m, " ", &ee->execlist[n], epoch);
        }

        error_print_context(m, "  Active context: ", &ee->context);
}

void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
{
        va_list args;

        va_start(args, f);
        i915_error_vprintf(e, f, args);
        va_end(args);
}

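/*
 * Dump an object's pages as a single ascii85 stream, preceded by the
 * compression marker; the trailing obj->unused bytes of the final page
 * are trimmed so the decoder sees only valid data.
 */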
static void print_error_obj(struct drm_i915_error_state_buf *m,
                            struct intel_engine_cs *engine,
                            const char *name,
                            struct drm_i915_error_object *obj)
{
        char out[ASCII85_BUFSZ];
        int page;

        if (!obj)
                return;

        if (name) {
                err_printf(m, "%s --- %s = 0x%08x %08x\n",
                           engine ? engine->name : "global", name,
                           upper_32_bits(obj->gtt_offset),
                           lower_32_bits(obj->gtt_offset));
        }

        err_compression_marker(m);
        for (page = 0; page < obj->page_count; page++) {
                int i, len;

                len = PAGE_SIZE;
                if (page == obj->page_count - 1)
                        len -= obj->unused;
                len = ascii85_encode_len(len);

                for (i = 0; i < len; i++)
                        err_puts(m, ascii85_encode(obj->pages[page][i], out));
        }
        err_puts(m, "\n");
}

static void err_print_capabilities(struct drm_i915_error_state_buf *m,
                                   const struct intel_device_info *info,
                                   const struct intel_runtime_info *runtime,
                                   const struct intel_driver_caps *caps)
{
        struct drm_printer p = i915_error_printer(m);

        intel_device_info_dump_flags(info, &p);
        intel_driver_caps_print(caps, &p);
        intel_device_info_dump_topology(&runtime->sseu, &p);
}

static void err_print_params(struct drm_i915_error_state_buf *m,
                             const struct i915_params *params)
{
        struct drm_printer p = i915_error_printer(m);

        i915_params_dump(params, &p);
}

static void err_print_pciid(struct drm_i915_error_state_buf *m,
                            struct drm_i915_private *i915)
{
        struct pci_dev *pdev = i915->drm.pdev;

        err_printf(m, "PCI ID: 0x%04x\n", pdev->device);
        err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision);
        err_printf(m, "PCI Subsystem: %04x:%04x\n",
                   pdev->subsystem_vendor,
                   pdev->subsystem_device);
}

static void err_print_uc(struct drm_i915_error_state_buf *m,
                         const struct i915_error_uc *error_uc)
{
        struct drm_printer p = i915_error_printer(m);
        const struct i915_gpu_state *error =
                container_of(error_uc, typeof(*error), uc);

        if (!error->device_info.has_guc)
                return;

        intel_uc_fw_dump(&error_uc->guc_fw, &p);
        intel_uc_fw_dump(&error_uc->huc_fw, &p);
        print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log);
}

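/*
 * Release the chained scatterlist: free each buffer in the current table,
 * remember the chain pointer, then free the sg page itself and move on.
 */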
static void err_free_sgl(struct scatterlist *sgl)
{
        while (sgl) {
                struct scatterlist *sg;

                for (sg = sgl; !sg_is_chain(sg); sg++) {
                        kfree(sg_virt(sg));
                        if (sg_is_last(sg))
                                break;
                }

                sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg);
                free_page((unsigned long)sgl);
                sgl = sg;
        }
}

static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
                               struct i915_gpu_state *error)
{
        struct drm_i915_error_object *obj;
        struct timespec64 ts;
        int i, j;

        if (*error->error_msg)
                err_printf(m, "%s\n", error->error_msg);
        err_printf(m, "Kernel: %s %s\n",
                   init_utsname()->release,
                   init_utsname()->machine);
        ts = ktime_to_timespec64(error->time);
        err_printf(m, "Time: %lld s %ld us\n",
                   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
        ts = ktime_to_timespec64(error->boottime);
        err_printf(m, "Boottime: %lld s %ld us\n",
                   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
        ts = ktime_to_timespec64(error->uptime);
        err_printf(m, "Uptime: %lld s %ld us\n",
                   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
        err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ);
        err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n",
                   error->capture,
                   jiffies_to_msecs(jiffies - error->capture),
                   jiffies_to_msecs(error->capture - error->epoch));

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                if (!error->engine[i].context.pid)
                        continue;

                err_printf(m, "Active process (on ring %s): %s [%d]\n",
                           engine_name(m->i915, i),
                           error->engine[i].context.comm,
                           error->engine[i].context.pid);
        }
        err_printf(m, "Reset count: %u\n", error->reset_count);
        err_printf(m, "Suspend count: %u\n", error->suspend_count);
        err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
        err_printf(m, "Subplatform: 0x%x\n",
                   intel_subplatform(&error->runtime_info,
                                     error->device_info.platform));
        err_print_pciid(m, m->i915);

        err_printf(m, "IOMMU enabled?: %d\n", error->iommu);

        if (HAS_CSR(m->i915)) {
                struct intel_csr *csr = &m->i915->csr;

                err_printf(m, "DMC loaded: %s\n",
                           yesno(csr->dmc_payload != NULL));
                err_printf(m, "DMC fw version: %d.%d\n",
                           CSR_VERSION_MAJOR(csr->version),
                           CSR_VERSION_MINOR(csr->version));
        }

        err_printf(m, "GT awake: %s\n", yesno(error->awake));
        err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock));
        err_printf(m, "PM suspended: %s\n", yesno(error->suspended));
        err_printf(m, "EIR: 0x%08x\n", error->eir);
        err_printf(m, "IER: 0x%08x\n", error->ier);
        for (i = 0; i < error->ngtier; i++)
                err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]);
        err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er);
        err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
        err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
        err_printf(m, "CCID: 0x%08x\n", error->ccid);

        for (i = 0; i < error->nfence; i++)
                err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);

        if (INTEL_GEN(m->i915) >= 6) {
                err_printf(m, "ERROR: 0x%08x\n", error->error);

                if (INTEL_GEN(m->i915) >= 8)
                        err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
                                   error->fault_data1, error->fault_data0);

                err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg);
        }

        if (IS_GEN(m->i915, 7))
                err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                if (error->engine[i].engine_id != -1)
                        error_print_engine(m, &error->engine[i], error->epoch);
        }

        for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) {
                char buf[128];
                int len, first = 1;

                if (!error->active_vm[i])
                        break;

                len = scnprintf(buf, sizeof(buf), "Active (");
                for (j = 0; j < ARRAY_SIZE(error->engine); j++) {
                        if (error->engine[j].vm != error->active_vm[i])
                                continue;

                        len += scnprintf(buf + len, sizeof(buf) - len, "%s%s",
                                         first ? "" : ", ",
                                         m->i915->engine[j]->name);
                        first = 0;
                }
                scnprintf(buf + len, sizeof(buf) - len, ")");
                print_error_buffers(m, buf,
                                    error->active_bo[i],
                                    error->active_bo_count[i]);
        }

        print_error_buffers(m, "Pinned (global)",
                            error->pinned_bo,
                            error->pinned_bo_count);

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                const struct drm_i915_error_engine *ee = &error->engine[i];

                obj = ee->batchbuffer;
                if (obj) {
                        err_puts(m, m->i915->engine[i]->name);
                        if (ee->context.pid)
                                err_printf(m, " (submitted by %s [%d])",
                                           ee->context.comm,
                                           ee->context.pid);
                        err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
                                   upper_32_bits(obj->gtt_offset),
                                   lower_32_bits(obj->gtt_offset));
                        print_error_obj(m, m->i915->engine[i], NULL, obj);
                }

                for (j = 0; j < ee->user_bo_count; j++)
                        print_error_obj(m, m->i915->engine[i],
                                        "user", ee->user_bo[j]);

                if (ee->num_requests) {
                        err_printf(m, "%s --- %d requests\n",
                                   m->i915->engine[i]->name,
                                   ee->num_requests);
                        for (j = 0; j < ee->num_requests; j++)
                                error_print_request(m, " ",
                                                    &ee->requests[j],
                                                    error->epoch);
                }

                print_error_obj(m, m->i915->engine[i],
                                "ringbuffer", ee->ringbuffer);

                print_error_obj(m, m->i915->engine[i],
                                "HW Status", ee->hws_page);

                print_error_obj(m, m->i915->engine[i],
                                "HW context", ee->ctx);

                print_error_obj(m, m->i915->engine[i],
                                "WA context", ee->wa_ctx);

                print_error_obj(m, m->i915->engine[i],
                                "WA batchbuffer", ee->wa_batchbuffer);

                print_error_obj(m, m->i915->engine[i],
                                "NULL context", ee->default_state);
        }

        if (error->overlay)
                intel_overlay_print_error_state(m, error->overlay);

        if (error->display)
                intel_display_print_error_state(m, error->display);

        err_print_capabilities(m, &error->device_info, &error->runtime_info,
                               &error->driver_caps);
        err_print_params(m, &error->params);
        err_print_uc(m, &error->uc);
}

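/*
 * Render the error state into the scatterlist exactly once; the result
 * is published with cmpxchg() so a racing caller's duplicate is freed.
 */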
static int err_print_to_sgl(struct i915_gpu_state *error)
{
        struct drm_i915_error_state_buf m;

        if (IS_ERR(error))
                return PTR_ERR(error);

        if (READ_ONCE(error->sgl))
                return 0;

        memset(&m, 0, sizeof(m));
        m.i915 = error->i915;

        __err_print_to_sgl(&m, error);

        if (m.buf) {
                __sg_set_buf(m.cur++, m.buf, m.bytes, m.iter);
                m.bytes = 0;
                m.buf = NULL;
        }
        if (m.cur) {
                GEM_BUG_ON(m.end < m.cur);
                sg_mark_end(m.cur - 1);
        }
        GEM_BUG_ON(m.sgl && !m.cur);

        if (m.err) {
                err_free_sgl(m.sgl);
                return m.err;
        }

        if (cmpxchg(&error->sgl, NULL, m.sgl))
                err_free_sgl(m.sgl);

        return 0;
}

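/*
 * Copy a byte range of the rendered error state into a caller-supplied
 * buffer, resuming from the scatterlist entry cached in error->fit when
 * the reads are sequential.
 *
 * As a sketch of the intended use (a hypothetical read handler; 'gpu',
 * 'buf', 'off' and 'count' are the caller's, not defined here):
 *
 *	ssize_t ret = i915_gpu_state_copy_to_buffer(gpu, buf, off, count);
 *	if (ret < 0)
 *		return ret;
 *	off += ret;	 // advance for the next sequential read
 */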
ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error,
                                      char *buf, loff_t off, size_t rem)
{
        struct scatterlist *sg;
        size_t count;
        loff_t pos;
        int err;

        if (!error || !rem)
                return 0;

        err = err_print_to_sgl(error);
        if (err)
                return err;

        sg = READ_ONCE(error->fit);
        if (!sg || off < sg->dma_address)
                sg = error->sgl;
        if (!sg)
                return 0;

        pos = sg->dma_address;
        count = 0;
        do {
                size_t len, start;

                if (sg_is_chain(sg)) {
                        sg = sg_chain_ptr(sg);
                        GEM_BUG_ON(sg_is_chain(sg));
                }

                len = sg->length;
                if (pos + len <= off) {
                        pos += len;
                        continue;
                }

                start = sg->offset;
                if (pos < off) {
                        GEM_BUG_ON(off - pos > len);
                        len -= off - pos;
                        start += off - pos;
                        pos = off;
                }

                len = min(len, rem);
                GEM_BUG_ON(!len || len > sg->length);

                memcpy(buf, page_address(sg_page(sg)) + start, len);

                count += len;
                pos += len;

                buf += len;
                rem -= len;
                if (!rem) {
                        WRITE_ONCE(error->fit, sg);
                        break;
                }
        } while (!sg_is_last(sg++));

        return count;
}

static void i915_error_object_free(struct drm_i915_error_object *obj)
{
        int page;

        if (obj == NULL)
                return;

        for (page = 0; page < obj->page_count; page++)
                free_page((unsigned long)obj->pages[page]);

        kfree(obj);
}

static void cleanup_params(struct i915_gpu_state *error)
{
        i915_params_free(&error->params);
}

static void cleanup_uc_state(struct i915_gpu_state *error)
{
        struct i915_error_uc *error_uc = &error->uc;

        kfree(error_uc->guc_fw.path);
        kfree(error_uc->huc_fw.path);
        i915_error_object_free(error_uc->guc_log);
}

void __i915_gpu_state_free(struct kref *error_ref)
{
        struct i915_gpu_state *error =
                container_of(error_ref, typeof(*error), ref);
        long i, j;

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                struct drm_i915_error_engine *ee = &error->engine[i];

                for (j = 0; j < ee->user_bo_count; j++)
                        i915_error_object_free(ee->user_bo[j]);
                kfree(ee->user_bo);

                i915_error_object_free(ee->batchbuffer);
                i915_error_object_free(ee->wa_batchbuffer);
                i915_error_object_free(ee->ringbuffer);
                i915_error_object_free(ee->hws_page);
                i915_error_object_free(ee->ctx);
                i915_error_object_free(ee->wa_ctx);

                kfree(ee->requests);
        }

        for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
                kfree(error->active_bo[i]);
        kfree(error->pinned_bo);

        kfree(error->overlay);
        kfree(error->display);

        cleanup_params(error);
        cleanup_uc_state(error);

        err_free_sgl(error->sgl);
        kfree(error);
}

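/*
 * Snapshot the pages backing a vma: each dma page is temporarily bound
 * into the GGTT error-capture slot and read back through an uncached
 * mapping. The destination is sized for worst-case zlib expansion so
 * that compression cannot run out of room mid-object.
 */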
static struct drm_i915_error_object *
i915_error_object_create(struct drm_i915_private *i915,
                         struct i915_vma *vma)
{
        struct i915_ggtt *ggtt = &i915->ggtt;
        const u64 slot = ggtt->error_capture.start;
        struct drm_i915_error_object *dst;
        struct compress compress;
        unsigned long num_pages;
        struct sgt_iter iter;
        dma_addr_t dma;
        int ret;

        if (!vma || !vma->pages)
                return NULL;

        num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
        num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worst-case zlib growth */
        dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
                      GFP_ATOMIC | __GFP_NOWARN);
        if (!dst)
                return NULL;

        dst->gtt_offset = vma->node.start;
        dst->gtt_size = vma->node.size;
        dst->num_pages = num_pages;
        dst->page_count = 0;
        dst->unused = 0;

        if (!compress_init(&compress)) {
                kfree(dst);
                return NULL;
        }

        ret = -EINVAL;
        for_each_sgt_dma(dma, iter, vma->pages) {
                void __iomem *s;

                ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);

                s = io_mapping_map_atomic_wc(&ggtt->iomap, slot);
                ret = compress_page(&compress, (void  __force *)s, dst);
                io_mapping_unmap_atomic(s);
                if (ret)
                        break;
        }

        if (ret || compress_flush(&compress, dst)) {
                while (dst->page_count--)
                        free_page((unsigned long)dst->pages[dst->page_count]);
                kfree(dst);
                dst = NULL;
        }

        compress_fini(&compress, dst);
        return dst;
}

static void capture_bo(struct drm_i915_error_buffer *err,
                       struct i915_vma *vma)
{
        struct drm_i915_gem_object *obj = vma->obj;

        err->size = obj->base.size;
        err->name = obj->base.name;

        err->gtt_offset = vma->node.start;
        err->read_domains = obj->read_domains;
        err->write_domain = obj->write_domain;
        err->fence_reg = vma->fence ? vma->fence->id : -1;
        err->tiling = i915_gem_object_get_tiling(obj);
        err->dirty = obj->mm.dirty;
        err->purgeable = obj->mm.madv != I915_MADV_WILLNEED;
        err->userptr = obj->userptr.mm != NULL;
        err->cache_level = obj->cache_level;
}

static u32 capture_error_bo(struct drm_i915_error_buffer *err,
                            int count, struct list_head *head,
                            unsigned int flags)
#define ACTIVE_ONLY BIT(0)
#define PINNED_ONLY BIT(1)
{
        struct i915_vma *vma;
        int i = 0;

        list_for_each_entry(vma, head, vm_link) {
                if (!vma->obj)
                        continue;

                if (flags & ACTIVE_ONLY && !i915_vma_is_active(vma))
                        continue;

                if (flags & PINNED_ONLY && !i915_vma_is_pinned(vma))
                        continue;

                capture_bo(err++, vma);
                if (++i == count)
                        break;
        }

        return i;
}

/*
 * Generate a semi-unique error code. The code is not meant to have meaning;
 * its only purpose is to try to prevent false duplicated bug reports by
 * grossly estimating a GPU error state.
 *
 * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
 * the hang if we could strip the GTT offset information from it.
 *
 * It's only a small step better than a random number in its current form.
 */
static u32 i915_error_generate_code(struct i915_gpu_state *error,
                                    intel_engine_mask_t engine_mask)
{
        /*
         * IPEHR would be an ideal way to detect errors, as it's the gross
         * measure of "the command that hung". However, it has some very
         * common synchronization commands which almost always appear in
         * cases that are strictly a client bug. Use instdone to
         * differentiate those somewhat.
         */
        if (engine_mask) {
                /* ffs() is 1-based, so convert back to an engine index */
                struct drm_i915_error_engine *ee =
                        &error->engine[ffs(engine_mask) - 1];

                return ee->ipehr ^ ee->instdone.instdone;
        }

        return 0;
}

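/*
 * Record the global fence registers; their location and width differ
 * across generations, hence the three branches below.
 */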
static void gem_record_fences(struct i915_gpu_state *error)
{
        struct drm_i915_private *dev_priv = error->i915;
        struct intel_uncore *uncore = &dev_priv->uncore;
        int i;

        if (INTEL_GEN(dev_priv) >= 6) {
                for (i = 0; i < dev_priv->ggtt.num_fences; i++)
                        error->fence[i] =
                                intel_uncore_read64(uncore,
                                                    FENCE_REG_GEN6_LO(i));
        } else if (INTEL_GEN(dev_priv) >= 4) {
                for (i = 0; i < dev_priv->ggtt.num_fences; i++)
                        error->fence[i] =
                                intel_uncore_read64(uncore,
                                                    FENCE_REG_965_LO(i));
        } else {
                for (i = 0; i < dev_priv->ggtt.num_fences; i++)
                        error->fence[i] =
                                intel_uncore_read(uncore, FENCE_REG(i));
        }
        error->nfence = i;
}

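/*
 * Snapshot the per-engine MMIO state: ring registers, fault registers,
 * the HWS address and, when a PPGTT is in use, the page-directory
 * registers, selecting the right register layout per generation.
 */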
static void error_record_engine_registers(struct i915_gpu_state *error,
                                          struct intel_engine_cs *engine,
                                          struct drm_i915_error_engine *ee)
{
        struct drm_i915_private *dev_priv = engine->i915;

        if (INTEL_GEN(dev_priv) >= 6) {
                ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL);
                if (INTEL_GEN(dev_priv) >= 8)
                        ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG);
                else
                        ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine);
        }

        if (INTEL_GEN(dev_priv) >= 4) {
                ee->faddr = ENGINE_READ(engine, RING_DMA_FADD);
                ee->ipeir = ENGINE_READ(engine, RING_IPEIR);
                ee->ipehr = ENGINE_READ(engine, RING_IPEHR);
                ee->instps = ENGINE_READ(engine, RING_INSTPS);
                ee->bbaddr = ENGINE_READ(engine, RING_BBADDR);
                if (INTEL_GEN(dev_priv) >= 8) {
                        ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32;
                        ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32;
                }
                ee->bbstate = ENGINE_READ(engine, RING_BBSTATE);
        } else {
                ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX);
                ee->ipeir = ENGINE_READ(engine, IPEIR);
                ee->ipehr = ENGINE_READ(engine, IPEHR);
        }

        intel_engine_get_instdone(engine, &ee->instdone);

        ee->instpm = ENGINE_READ(engine, RING_INSTPM);
        ee->acthd = intel_engine_get_active_head(engine);
        ee->start = ENGINE_READ(engine, RING_START);
        ee->head = ENGINE_READ(engine, RING_HEAD);
        ee->tail = ENGINE_READ(engine, RING_TAIL);
        ee->ctl = ENGINE_READ(engine, RING_CTL);
        if (INTEL_GEN(dev_priv) > 2)
                ee->mode = ENGINE_READ(engine, RING_MI_MODE);

        if (!HWS_NEEDS_PHYSICAL(dev_priv)) {
                i915_reg_t mmio;

                if (IS_GEN(dev_priv, 7)) {
                        switch (engine->id) {
                        default:
                                MISSING_CASE(engine->id);
                                /* fall through */
                        case RCS0:
                                mmio = RENDER_HWS_PGA_GEN7;
                                break;
                        case BCS0:
                                mmio = BLT_HWS_PGA_GEN7;
                                break;
                        case VCS0:
                                mmio = BSD_HWS_PGA_GEN7;
                                break;
                        case VECS0:
                                mmio = VEBOX_HWS_PGA_GEN7;
                                break;
                        }
                } else if (IS_GEN(engine->i915, 6)) {
                        mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
                } else {
                        /* XXX: gen8 returns to sanity */
                        mmio = RING_HWS_PGA(engine->mmio_base);
                }

                ee->hws = I915_READ(mmio);
        }

        ee->idle = intel_engine_is_idle(engine);
        if (!ee->idle)
                ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
        ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
                                                  engine);

        if (HAS_PPGTT(dev_priv)) {
                int i;

                ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7);

                if (IS_GEN(dev_priv, 6)) {
                        ee->vm_info.pp_dir_base =
                                ENGINE_READ(engine, RING_PP_DIR_BASE_READ);
                } else if (IS_GEN(dev_priv, 7)) {
                        ee->vm_info.pp_dir_base =
                                ENGINE_READ(engine, RING_PP_DIR_BASE);
                } else if (INTEL_GEN(dev_priv) >= 8) {
                        u32 base = engine->mmio_base;

                        for (i = 0; i < 4; i++) {
                                ee->vm_info.pdp[i] =
                                        I915_READ(GEN8_RING_PDP_UDW(base, i));
                                ee->vm_info.pdp[i] <<= 32;
                                ee->vm_info.pdp[i] |=
                                        I915_READ(GEN8_RING_PDP_LDW(base, i));
                        }
                }
        }
}

static void record_request(struct i915_request *request,
                           struct drm_i915_error_request *erq)
{
        struct i915_gem_context *ctx = request->gem_context;

        erq->flags = request->fence.flags;
        erq->context = request->fence.context;
        erq->seqno = request->fence.seqno;
        erq->sched_attr = request->sched.attr;
        erq->jiffies = request->emitted_jiffies;
        erq->start = i915_ggtt_offset(request->ring->vma);
        erq->head = request->head;
        erq->tail = request->tail;

        rcu_read_lock();
        erq->pid = ctx->pid ? pid_nr(ctx->pid) : 0;
        rcu_read_unlock();
}

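/*
 * Record the requests still queued on the engine at capture time: one
 * pass to count and size the array, a second pass to fill it, with the
 * early-exit guard below in case the list changed in between.
 */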
static void engine_record_requests(struct intel_engine_cs *engine,
                                   struct i915_request *first,
                                   struct drm_i915_error_engine *ee)
{
        struct i915_request *request;
        int count;

        count = 0;
        request = first;
        list_for_each_entry_from(request, &engine->active.requests, sched.link)
                count++;
        if (!count)
                return;

        ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC);
        if (!ee->requests)
                return;

        ee->num_requests = count;

        count = 0;
        request = first;
        list_for_each_entry_from(request,
                                 &engine->active.requests, sched.link) {
                if (count >= ee->num_requests) {
                        /*
                         * If the ring request list was changed in
                         * between the point where the error request
                         * list was created and dimensioned and this
                         * point then just exit early to avoid crashes.
                         *
                         * We don't need to communicate that the
                         * request list changed state during error
                         * state capture and that the error state is
                         * slightly incorrect as a consequence since we
                         * are typically only interested in the request
                         * list state at the point of error state
                         * capture, not in any changes happening during
                         * the capture.
                         */
                        break;
                }

                record_request(request, &ee->requests[count++]);
        }
        ee->num_requests = count;
}

static void error_record_engine_execlists(struct intel_engine_cs *engine,
                                          struct drm_i915_error_engine *ee)
{
        const struct intel_engine_execlists * const execlists = &engine->execlists;
        unsigned int n;

        for (n = 0; n < execlists_num_ports(execlists); n++) {
                struct i915_request *rq = port_request(&execlists->port[n]);

                if (!rq)
                        break;

                record_request(rq, &ee->execlist[n]);
        }

        ee->num_ports = n;
}

static void record_context(struct drm_i915_error_context *e,
                           struct i915_gem_context *ctx)
{
        if (ctx->pid) {
                struct task_struct *task;

                rcu_read_lock();
                task = pid_task(ctx->pid, PIDTYPE_PID);
                if (task) {
                        strcpy(e->comm, task->comm);
                        e->pid = task->pid;
                }
                rcu_read_unlock();
        }

        e->hw_id = ctx->hw_id;
        e->sched_attr = ctx->sched;
        e->guilty = atomic_read(&ctx->guilty_count);
        e->active = atomic_read(&ctx->active_count);
}

static void request_record_user_bo(struct i915_request *request,
                                   struct drm_i915_error_engine *ee)
{
        struct i915_capture_list *c;
        struct drm_i915_error_object **bo;
        long count, max;

        max = 0;
        for (c = request->capture_list; c; c = c->next)
                max++;
        if (!max)
                return;

        bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
        if (!bo) {
                /* If we can't capture everything, try to capture something. */
                max = min_t(long, max, PAGE_SIZE / sizeof(*bo));
                bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
        }
        if (!bo)
                return;

        count = 0;
        for (c = request->capture_list; c; c = c->next) {
                bo[count] = i915_error_object_create(request->i915, c->vma);
                if (!bo[count])
                        break;
                if (++count == max)
                        break;
        }

        ee->user_bo = bo;
        ee->user_bo_count = count;
}

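/*
 * Wrap an unbound object in a stack "fake" vma so that the ordinary
 * i915_error_object_create() path can capture it; node.start is set to
 * U64_MAX as there is no real GGTT address to report.
 */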
static struct drm_i915_error_object *
capture_object(struct drm_i915_private *dev_priv,
               struct drm_i915_gem_object *obj)
{
        if (obj && i915_gem_object_has_pages(obj)) {
                struct i915_vma fake = {
                        .node = { .start = U64_MAX, .size = obj->base.size },
                        .size = obj->base.size,
                        .pages = obj->mm.pages,
                        .obj = obj,
                };

                return i915_error_object_create(dev_priv, &fake);
        } else {
                return NULL;
        }
}

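/*
 * Walk every engine: snapshot its registers and execlist ports, then,
 * under the engine's active lock, capture the hanging request together
 * with its batch, context image, ring contents and any objects the
 * request explicitly asked to have captured.
 */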
1411 static void gem_record_rings(struct i915_gpu_state *error)
1412 {
1413         struct drm_i915_private *i915 = error->i915;
1414         struct i915_ggtt *ggtt = &i915->ggtt;
1415         int i;
1416
1417         for (i = 0; i < I915_NUM_ENGINES; i++) {
1418                 struct intel_engine_cs *engine = i915->engine[i];
1419                 struct drm_i915_error_engine *ee = &error->engine[i];
1420                 struct i915_request *request;
1421                 unsigned long flags;
1422
1423                 ee->engine_id = -1;
1424
1425                 if (!engine)
1426                         continue;
1427
1428                 ee->engine_id = i;
1429
1430                 error_record_engine_registers(error, engine, ee);
1431                 error_record_engine_execlists(engine, ee);
1432
1433                 spin_lock_irqsave(&engine->active.lock, flags);
1434                 request = intel_engine_find_active_request(engine);
1435                 if (request) {
1436                         struct i915_gem_context *ctx = request->gem_context;
1437                         struct intel_ring *ring = request->ring;
1438
1439                         ee->vm = ctx->vm ?: &ggtt->vm;
1440
1441                         record_context(&ee->context, ctx);
1442
1443                         /* We need to copy these to an anonymous buffer
1444                          * as the simplest method to avoid being overwritten
1445                          * by userspace.
1446                          */
1447                         ee->batchbuffer =
1448                                 i915_error_object_create(i915, request->batch);
1449
1450                         if (HAS_BROKEN_CS_TLB(i915))
1451                                 ee->wa_batchbuffer =
1452                                         i915_error_object_create(i915,
1453                                                                  i915->gt.scratch);
1454                         request_record_user_bo(request, ee);
1455
1456                         ee->ctx =
1457                                 i915_error_object_create(i915,
1458                                                          request->hw_context->state);
1459
1460                         error->simulated |=
1461                                 i915_gem_context_no_error_capture(ctx);
1462
1463                         ee->rq_head = request->head;
1464                         ee->rq_post = request->postfix;
1465                         ee->rq_tail = request->tail;
1466
1467                         ee->cpu_ring_head = ring->head;
1468                         ee->cpu_ring_tail = ring->tail;
1469                         ee->ringbuffer =
1470                                 i915_error_object_create(i915, ring->vma);
1471
1472                         engine_record_requests(engine, request, ee);
1473                 }
1474                 spin_unlock_irqrestore(&engine->active.lock, flags);
1475
1476                 ee->hws_page =
1477                         i915_error_object_create(i915,
1478                                                  engine->status_page.vma);
1479
1480                 ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma);
1481
1482                 ee->default_state = capture_object(i915, engine->default_state);
1483         }
1484 }
1485
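/*
 * Record every buffer object that was active in @vm at the time of the
 * hang. The allocation is GFP_ATOMIC as this runs from the stopped-machine
 * capture context where we must not sleep; if it fails, we record a count
 * of zero rather than abort the capture.
 */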
1486 static void gem_capture_vm(struct i915_gpu_state *error,
1487                            struct i915_address_space *vm,
1488                            int idx)
1489 {
1490         struct drm_i915_error_buffer *active_bo;
1491         struct i915_vma *vma;
1492         int count;
1493
1494         count = 0;
1495         list_for_each_entry(vma, &vm->bound_list, vm_link)
1496                 if (i915_vma_is_active(vma))
1497                         count++;
1498
1499         active_bo = NULL;
1500         if (count)
1501                 active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
1502         if (active_bo)
1503                 count = capture_error_bo(active_bo,
1504                                          count, &vm->bound_list,
1505                                          ACTIVE_ONLY);
1506         else
1507                 count = 0;
1508
1509         error->active_vm[idx] = vm;
1510         error->active_bo[idx] = active_bo;
1511         error->active_bo_count[idx] = count;
1512 }
1513
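/*
 * Each engine may be executing in a different address space, but several
 * engines commonly share one. Walk the engines and capture each unique vm
 * exactly once; the BUILD_BUG_ONs below keep the parallel
 * active_bo/active_vm/active_bo_count arrays in step with the engines.
 */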
1514 static void capture_active_buffers(struct i915_gpu_state *error)
1515 {
1516         int cnt = 0, i, j;
1517
1518         BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo));
1519         BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm));
1520         BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count));
1521
1522         /* Scan each engine looking for unique active contexts/vm */
1523         for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
1524                 struct drm_i915_error_engine *ee = &error->engine[i];
1525                 bool found;
1526
1527                 if (!ee->vm)
1528                         continue;
1529
1530                 found = false;
1531                 for (j = 0; j < i && !found; j++)
1532                         found = error->engine[j].vm == ee->vm;
1533                 if (!found)
1534                         gem_capture_vm(error, ee->vm, cnt++);
1535         }
1536 }
1537
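/*
 * Count every vma bound into the global GTT and size the array for the
 * worst case; capture_error_bo() then records only those that satisfy
 * the PINNED_ONLY filter.
 */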
1538 static void capture_pinned_buffers(struct i915_gpu_state *error)
1539 {
1540         struct i915_address_space *vm = &error->i915->ggtt.vm;
1541         struct drm_i915_error_buffer *bo;
1542         struct i915_vma *vma;
1543         int count;
1544
1545         count = 0;
1546         list_for_each_entry(vma, &vm->bound_list, vm_link)
1547                 count++;
1548
1549         bo = NULL;
1550         if (count)
1551                 bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
1552         if (!bo)
1553                 return;
1554
1555         error->pinned_bo_count =
1556                 capture_error_bo(bo, count, &vm->bound_list, PINNED_ONLY);
1557         error->pinned_bo = bo;
1558 }
1559
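/*
 * Snapshot the GuC/HuC firmware state and the GuC log buffer. The
 * firmware path strings are duplicated rather than referenced, since the
 * originals come from a modparam that userspace could change under us.
 */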
1560 static void capture_uc_state(struct i915_gpu_state *error)
1561 {
1562         struct drm_i915_private *i915 = error->i915;
1563         struct i915_error_uc *error_uc = &error->uc;
1564
1565         /* Capturing uC state won't be useful if there is no GuC */
1566         if (!error->device_info.has_guc)
1567                 return;
1568
1569         error_uc->guc_fw = i915->guc.fw;
1570         error_uc->huc_fw = i915->huc.fw;
1571
1572                 /* Non-default firmware paths will be specified by the modparam.
1573                  * As modparams are generally accessible from userspace, make
1574                  * explicit copies of the firmware paths.
1575                  */
1576         error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC);
1577         error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC);
1578         error_uc->guc_log = i915_error_object_create(i915, i915->guc.log.vma);
1579 }
1580
1581 /* Capture all registers which don't fit into another category. */
1582 static void capture_reg_state(struct i915_gpu_state *error)
1583 {
1584         struct drm_i915_private *i915 = error->i915;
1585         struct intel_uncore *uncore = &i915->uncore;
1586         int i;
1587
1588         /* General organization
1589          * 1. Registers specific to a single generation
1590          * 2. Registers which belong to multiple generations
1591          * 3. Feature specific registers
1592          * 4. Everything else
1593          * Please try to follow the order.
1594          */
1595
1596         /* 1: Registers specific to a single generation */
1597         if (IS_VALLEYVIEW(i915)) {
1598                 error->gtier[0] = intel_uncore_read(uncore, GTIER);
1599                 error->ier = intel_uncore_read(uncore, VLV_IER);
1600                 error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV);
1601         }
1602
1603         if (IS_GEN(i915, 7))
1604                 error->err_int = intel_uncore_read(uncore, GEN7_ERR_INT);
1605
1606         if (INTEL_GEN(i915) >= 8) {
1607                 error->fault_data0 = intel_uncore_read(uncore,
1608                                                        GEN8_FAULT_TLB_DATA0);
1609                 error->fault_data1 = intel_uncore_read(uncore,
1610                                                        GEN8_FAULT_TLB_DATA1);
1611         }
1612
1613         if (IS_GEN(i915, 6)) {
1614                 error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE);
1615                 error->gab_ctl = intel_uncore_read(uncore, GAB_CTL);
1616                 error->gfx_mode = intel_uncore_read(uncore, GFX_MODE);
1617         }
1618
1619         /* 2: Registers which belong to multiple generations */
1620         if (INTEL_GEN(i915) >= 7)
1621                 error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT);
1622
1623         if (INTEL_GEN(i915) >= 6) {
1624                 error->derrmr = intel_uncore_read(uncore, DERRMR);
1625                 error->error = intel_uncore_read(uncore, ERROR_GEN6);
1626                 error->done_reg = intel_uncore_read(uncore, DONE_REG);
1627         }
1628
1629         if (INTEL_GEN(i915) >= 5)
1630                 error->ccid = intel_uncore_read(uncore, CCID(RENDER_RING_BASE));
1631
1632         /* 3: Feature specific registers */
1633         if (IS_GEN_RANGE(i915, 6, 7)) {
1634                 error->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK);
1635                 error->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS);
1636         }
1637
1638         /* 4: Everything else */
1639         if (INTEL_GEN(i915) >= 11) {
1640                 error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
1641                 error->gtier[0] =
1642                         intel_uncore_read(uncore,
1643                                           GEN11_RENDER_COPY_INTR_ENABLE);
1644                 error->gtier[1] =
1645                         intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE);
1646                 error->gtier[2] =
1647                         intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE);
1648                 error->gtier[3] =
1649                         intel_uncore_read(uncore,
1650                                           GEN11_GPM_WGBOXPERF_INTR_ENABLE);
1651                 error->gtier[4] =
1652                         intel_uncore_read(uncore,
1653                                           GEN11_CRYPTO_RSVD_INTR_ENABLE);
1654                 error->gtier[5] =
1655                         intel_uncore_read(uncore,
1656                                           GEN11_GUNIT_CSME_INTR_ENABLE);
1657                 error->ngtier = 6;
1658         } else if (INTEL_GEN(i915) >= 8) {
1659                 error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
1660                 for (i = 0; i < 4; i++)
1661                         error->gtier[i] = intel_uncore_read(uncore,
1662                                                             GEN8_GT_IER(i));
1663                 error->ngtier = 4;
1664         } else if (HAS_PCH_SPLIT(i915)) {
1665                 error->ier = intel_uncore_read(uncore, DEIER);
1666                 error->gtier[0] = intel_uncore_read(uncore, GTIER);
1667                 error->ngtier = 1;
1668         } else if (IS_GEN(i915, 2)) {
1669                 error->ier = intel_uncore_read16(uncore, GEN2_IER);
1670         } else if (!IS_VALLEYVIEW(i915)) {
1671                 error->ier = intel_uncore_read(uncore, GEN2_IER);
1672         }
1673         error->eir = intel_uncore_read(uncore, EIR);
1674         error->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER);
1675 }
1676
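/*
 * Build the one-line hang summary, e.g.
 *
 *	GPU HANG: ecode 9:1:0x00000000, in Xorg [419]
 *
 * (an illustrative string, not captured output): the generation, the mask
 * of hung engines that had a user context, an error code derived from the
 * engine state, and the first guilty process if one is known.
 */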
1677 static const char *
1678 error_msg(struct i915_gpu_state *error,
1679           intel_engine_mask_t engines, const char *msg)
1680 {
1681         int len;
1682         int i;
1683
1684         for (i = 0; i < ARRAY_SIZE(error->engine); i++)
1685                 if (!error->engine[i].context.pid)
1686                         engines &= ~BIT(i);
1687
1688         len = scnprintf(error->error_msg, sizeof(error->error_msg),
1689                         "GPU HANG: ecode %d:%x:0x%08x",
1690                         INTEL_GEN(error->i915), engines,
1691                         i915_error_generate_code(error, engines));
1692         if (engines) {
1693                 /* Just show the first executing process; more is confusing */
1694                 i = __ffs(engines);
1695                 len += scnprintf(error->error_msg + len,
1696                                  sizeof(error->error_msg) - len,
1697                                  ", in %s [%d]",
1698                                  error->engine[i].context.comm,
1699                                  error->engine[i].context.pid);
1700         }
1701         if (msg)
1702                 len += scnprintf(error->error_msg + len,
1703                                  sizeof(error->error_msg) - len,
1704                                  ", %s", msg);
1705
1706         return error->error_msg;
1707 }
1708
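/*
 * Record the device-wide software state: power management (awake,
 * wakeref count, suspend state), IOMMU status, reset and suspend
 * counters, plus verbatim copies of the static and runtime device info.
 */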
1709 static void capture_gen_state(struct i915_gpu_state *error)
1710 {
1711         struct drm_i915_private *i915 = error->i915;
1712
1713         error->awake = i915->gt.awake;
1714         error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count);
1715         error->suspended = i915->runtime_pm.suspended;
1716
1717         error->iommu = -1;
1718 #ifdef CONFIG_INTEL_IOMMU
1719         error->iommu = intel_iommu_gfx_mapped;
1720 #endif
1721         error->reset_count = i915_reset_count(&i915->gpu_error);
1722         error->suspend_count = i915->suspend_count;
1723
1724         memcpy(&error->device_info,
1725                INTEL_INFO(i915),
1726                sizeof(error->device_info));
1727         memcpy(&error->runtime_info,
1728                RUNTIME_INFO(i915),
1729                sizeof(error->runtime_info));
1730         error->driver_caps = i915->caps;
1731 }
1732
1733 static void capture_params(struct i915_gpu_state *error)
1734 {
1735         i915_params_copy(&error->params, &i915_modparams);
1736 }
1737
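/*
 * The epoch is the earliest hangcheck timestamp across all engines (or
 * the capture time, if none is older), giving a common reference point
 * for the timestamps reported in the error dump.
 */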
1738 static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
1739 {
1740         unsigned long epoch = error->capture;
1741         int i;
1742
1743         for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
1744                 const struct drm_i915_error_engine *ee = &error->engine[i];
1745
1746                 if (ee->hangcheck_timestamp &&
1747                     time_before(ee->hangcheck_timestamp, epoch))
1748                         epoch = ee->hangcheck_timestamp;
1749         }
1750
1751         return epoch;
1752 }
1753
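/*
 * Scrub the reserved GGTT slot that i915_error_object_create() borrowed
 * for reading back buffer contents, now that the capture is complete.
 */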
1754 static void capture_finish(struct i915_gpu_state *error)
1755 {
1756         struct i915_ggtt *ggtt = &error->i915->ggtt;
1757         const u64 slot = ggtt->error_capture.start;
1758
1759         ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
1760 }
1761
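/*
 * The top-level capture routine, run with the machine stopped (see the
 * stop_machine() call in i915_capture_gpu_state() below): record the
 * timestamps first, then work outwards from parameters and registers to
 * buffer contents. Nothing here may sleep, hence the GFP_ATOMIC
 * allocations throughout.
 */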
1762 static int capture(void *data)
1763 {
1764         struct i915_gpu_state *error = data;
1765
1766         error->time = ktime_get_real();
1767         error->boottime = ktime_get_boottime();
1768         error->uptime = ktime_sub(ktime_get(),
1769                                   error->i915->gt.last_init_time);
1770         error->capture = jiffies;
1771
1772         capture_params(error);
1773         capture_gen_state(error);
1774         capture_uc_state(error);
1775         capture_reg_state(error);
1776         gem_record_fences(error);
1777         gem_record_rings(error);
1778         capture_active_buffers(error);
1779         capture_pinned_buffers(error);
1780
1781         error->overlay = intel_overlay_capture_error_state(error->i915);
1782         error->display = intel_display_capture_error_state(error->i915);
1783
1784         error->epoch = capture_find_epoch(error);
1785
1786         capture_finish(error);
1787         return 0;
1788 }
1789
1790 #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))
1791
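/*
 * Allocate and fill a fresh error state. first_error doubles as the
 * disabled marker: once i915_disable_error_state() has stored an
 * ERR_PTR() there, every subsequent capture attempt returns that error
 * instead of recording anything.
 */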
1792 struct i915_gpu_state *
1793 i915_capture_gpu_state(struct drm_i915_private *i915)
1794 {
1795         struct i915_gpu_state *error;
1796
1797         /* Check if GPU capture has been disabled */
1798         error = READ_ONCE(i915->gpu_error.first_error);
1799         if (IS_ERR(error))
1800                 return error;
1801
1802         error = kzalloc(sizeof(*error), GFP_ATOMIC);
1803         if (!error) {
1804                 i915_disable_error_state(i915, -ENOMEM);
1805                 return ERR_PTR(-ENOMEM);
1806         }
1807
1808         kref_init(&error->ref);
1809         error->i915 = i915;
1810
1811         stop_machine(capture, error, NULL);
1812
1813         return error;
1814 }
1815
1816 /**
1817  * i915_capture_error_state - capture an error record for later analysis
1818  * @i915: i915 device
1819  * @engine_mask: the mask of engines triggering the hang
1820  * @msg: a message to insert into the error capture header
1821  *
1822  * Should be called when an error is detected (either a hang or an error
1823  * interrupt) to capture error state from the time of the error.  Fills
1824  * out a structure which becomes available in debugfs for user level tools
1825  * to pick up.
1826  */
1827 void i915_capture_error_state(struct drm_i915_private *i915,
1828                               intel_engine_mask_t engine_mask,
1829                               const char *msg)
1830 {
1831         static bool warned;
1832         struct i915_gpu_state *error;
1833         unsigned long flags;
1834
1835         if (!i915_modparams.error_capture)
1836                 return;
1837
1838         if (READ_ONCE(i915->gpu_error.first_error))
1839                 return;
1840
1841         error = i915_capture_gpu_state(i915);
1842         if (IS_ERR(error))
1843                 return;
1844
1845         dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));
1846
1847         if (!error->simulated) {
1848                 spin_lock_irqsave(&i915->gpu_error.lock, flags);
1849                 if (!i915->gpu_error.first_error) {
1850                         i915->gpu_error.first_error = error;
1851                         error = NULL;
1852                 }
1853                 spin_unlock_irqrestore(&i915->gpu_error.lock, flags);
1854         }
1855
1856         if (error) {
1857                 __i915_gpu_state_free(&error->ref);
1858                 return;
1859         }
1860
1861         if (!warned &&
1862             ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) {
1863                 DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
1864                 DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
1865                 DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
1866                 DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
1867                 DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
1868                          i915->drm.primary->index);
1869                 warned = true;
1870         }
1871 }
1872
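/*
 * Return the oldest recorded error state with a reference held; the
 * caller is expected to drop it with i915_gpu_state_put() when done.
 * May also return the ERR_PTR() sentinel if capture has been disabled.
 */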
1873 struct i915_gpu_state *
1874 i915_first_error_state(struct drm_i915_private *i915)
1875 {
1876         struct i915_gpu_state *error;
1877
1878         spin_lock_irq(&i915->gpu_error.lock);
1879         error = i915->gpu_error.first_error;
1880         if (!IS_ERR_OR_NULL(error))
1881                 i915_gpu_state_get(error);
1882         spin_unlock_irq(&i915->gpu_error.lock);
1883
1884         return error;
1885 }
1886
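/*
 * Discard the recorded error state so that a subsequent hang can be
 * captured, taking care to preserve the ERR_PTR(-ENODEV) marker that
 * indicates capture is permanently disabled.
 */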
1887 void i915_reset_error_state(struct drm_i915_private *i915)
1888 {
1889         struct i915_gpu_state *error;
1890
1891         spin_lock_irq(&i915->gpu_error.lock);
1892         error = i915->gpu_error.first_error;
1893         if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */
1894                 i915->gpu_error.first_error = NULL;
1895         spin_unlock_irq(&i915->gpu_error.lock);
1896
1897         if (!IS_ERR_OR_NULL(error))
1898                 i915_gpu_state_put(error);
1899 }
1900
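/*
 * Mark error capture as disabled by planting an ERR_PTR() in place of
 * the first error. Only the first caller wins, so a genuine error record
 * already present is never overwritten.
 */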
1901 void i915_disable_error_state(struct drm_i915_private *i915, int err)
1902 {
1903         spin_lock_irq(&i915->gpu_error.lock);
1904         if (!i915->gpu_error.first_error)
1905                 i915->gpu_error.first_error = ERR_PTR(err);
1906         spin_unlock_irq(&i915->gpu_error.lock);
1907 }