/* Linux-libre 4.9.135-gnu, librecmc/linux-libre.git: arch/x86/events/intel/ds.c */
1 #include <linux/bitops.h>
2 #include <linux/types.h>
3 #include <linux/slab.h>
4
5 #include <asm/kaiser.h>
6 #include <asm/perf_event.h>
7 #include <asm/insn.h>
8
9 #include "../perf_event.h"
10
11 static
12 DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
13
14 /* The size of a BTS record in bytes: */
15 #define BTS_RECORD_SIZE         24
16
17 #define BTS_BUFFER_SIZE         (PAGE_SIZE << 4)
18 #define PEBS_BUFFER_SIZE        (PAGE_SIZE << 4)
19 #define PEBS_FIXUP_SIZE         PAGE_SIZE
20
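/*
 * Illustrative sizing, assuming PAGE_SIZE == 4096 (not guaranteed for every
 * configuration):
 *
 *   BTS_BUFFER_SIZE  = 4096 << 4 = 64 KiB -> 65536 / 24  = 2730 BTS records
 *   PEBS_BUFFER_SIZE = 4096 << 4 = 64 KiB -> 65536 / 176 = 372 pebs_record_nhm
 *                      records (fmt1); fewer for the larger fmt2/fmt3 records
 *   PEBS_FIXUP_SIZE  = 4096 bytes of copied user text for the IP fixup below
 */
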
21 /*
22  * pebs_record_32 for P4 and Core is not supported
23
24 struct pebs_record_32 {
25         u32 flags, ip;
26         u32 ax, bx, cx, dx;
27         u32 si, di, bp, sp;
28 };
29
30  */
31
32 union intel_x86_pebs_dse {
33         u64 val;
34         struct {
35                 unsigned int ld_dse:4;
36                 unsigned int ld_stlb_miss:1;
37                 unsigned int ld_locked:1;
38                 unsigned int ld_reserved:26;
39         };
40         struct {
41                 unsigned int st_l1d_hit:1;
42                 unsigned int st_reserved1:3;
43                 unsigned int st_stlb_miss:1;
44                 unsigned int st_locked:1;
45                 unsigned int st_reserved2:26;
46         };
47 };
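
/*
 * Worked example (illustrative value, not taken from real hardware): a PEBS
 * load-latency status of 0x23 decodes through the first struct above as
 *
 *   ld_dse       = 0x3 -> pebs_data_source[0x03], i.e. an L2 hit
 *   ld_stlb_miss = 0   -> the load hit the second-level TLB
 *   ld_locked    = 1   -> the access carried a LOCK prefix
 *
 * load_latency_data() below combines exactly these fields into a
 * perf_mem_data_src value.
 */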
48
49
50 /*
51  * Map PEBS Load Latency Data Source encodings to generic
52  * memory data source information
53  */
54 #define P(a, b) PERF_MEM_S(a, b)
55 #define OP_LH (P(OP, LOAD) | P(LVL, HIT))
56 #define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
57
58 /* Version for Sandy Bridge and later */
59 static u64 pebs_data_source[] = {
60         P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
61         OP_LH | P(LVL, L1)  | P(SNOOP, NONE),   /* 0x01: L1 local */
62         OP_LH | P(LVL, LFB) | P(SNOOP, NONE),   /* 0x02: LFB hit */
63         OP_LH | P(LVL, L2)  | P(SNOOP, NONE),   /* 0x03: L2 hit */
64         OP_LH | P(LVL, L3)  | P(SNOOP, NONE),   /* 0x04: L3 hit */
65         OP_LH | P(LVL, L3)  | P(SNOOP, MISS),   /* 0x05: L3 hit, snoop miss */
66         OP_LH | P(LVL, L3)  | P(SNOOP, HIT),    /* 0x06: L3 hit, snoop hit */
67         OP_LH | P(LVL, L3)  | P(SNOOP, HITM),   /* 0x07: L3 hit, snoop hitm */
68         OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT),  /* 0x08: L3 miss snoop hit */
69         OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
70         OP_LH | P(LVL, LOC_RAM)  | P(SNOOP, HIT),  /* 0x0a: L3 miss, shared */
71         OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT),  /* 0x0b: L3 miss, shared */
72         OP_LH | P(LVL, LOC_RAM)  | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */
73         OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */
74         OP_LH | P(LVL, IO)  | P(SNOOP, NONE), /* 0x0e: I/O */
75         OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
76 };
77
78 /* Patch up minor differences in the bits */
79 void __init intel_pmu_pebs_data_source_nhm(void)
80 {
81         pebs_data_source[0x05] = OP_LH | P(LVL, L3)  | P(SNOOP, HIT);
82         pebs_data_source[0x06] = OP_LH | P(LVL, L3)  | P(SNOOP, HITM);
83         pebs_data_source[0x07] = OP_LH | P(LVL, L3)  | P(SNOOP, HITM);
84 }
85
86 static u64 precise_store_data(u64 status)
87 {
88         union intel_x86_pebs_dse dse;
89         u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2);
90
91         dse.val = status;
92
93         /*
94          * bit 4: TLB access
95          * 1 = store missed 2nd level TLB
96          *
97          * so it either hit the page walker or the OS,
98          * otherwise it hit the 2nd level TLB
99          */
100         if (dse.st_stlb_miss)
101                 val |= P(TLB, MISS);
102         else
103                 val |= P(TLB, HIT);
104
105         /*
106          * bit 0: hit L1 data cache
107          * if not set, then all we know is that
108          * it missed L1D
109          */
110         if (dse.st_l1d_hit)
111                 val |= P(LVL, HIT);
112         else
113                 val |= P(LVL, MISS);
114
115         /*
116          * bit 5: Locked prefix
117          */
118         if (dse.st_locked)
119                 val |= P(LOCK, LOCKED);
120
121         return val;
122 }
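
/*
 * Worked example (illustrative status value): a precise-store status of 0x11
 * has bit 0 (st_l1d_hit) and bit 4 (st_stlb_miss) set, so the function above
 * returns
 *
 *   P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(LVL, HIT) |
 *   P(TLB, L2) | P(TLB, MISS)
 *
 * i.e. a store that hit L1D but missed the second-level TLB, with no LOCK
 * prefix.
 */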
123
124 static u64 precise_datala_hsw(struct perf_event *event, u64 status)
125 {
126         union perf_mem_data_src dse;
127
128         dse.val = PERF_MEM_NA;
129
130         if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
131                 dse.mem_op = PERF_MEM_OP_STORE;
132         else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW)
133                 dse.mem_op = PERF_MEM_OP_LOAD;
134
135         /*
136          * L1 info only valid for following events:
137          *
138          * MEM_UOPS_RETIRED.STLB_MISS_STORES
139          * MEM_UOPS_RETIRED.LOCK_STORES
140          * MEM_UOPS_RETIRED.SPLIT_STORES
141          * MEM_UOPS_RETIRED.ALL_STORES
142          */
143         if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) {
144                 if (status & 1)
145                         dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
146                 else
147                         dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
148         }
149         return dse.val;
150 }
151
152 static u64 load_latency_data(u64 status)
153 {
154         union intel_x86_pebs_dse dse;
155         u64 val;
156         int model = boot_cpu_data.x86_model;
157         int fam = boot_cpu_data.x86;
158
159         dse.val = status;
160
161         /*
162          * use the mapping table for bit 0-3
163          */
164         val = pebs_data_source[dse.ld_dse];
165
166         /*
167          * Nehalem models do not support the TLB and Lock info bits
168          */
169         if (fam == 0x6 && (model == 26 || model == 30
170             || model == 31 || model == 46)) {
171                 val |= P(TLB, NA) | P(LOCK, NA);
172                 return val;
173         }
174         /*
175          * bit 4: TLB access
176          * 0 = did not miss 2nd level TLB
177          * 1 = missed 2nd level TLB
178          */
179         if (dse.ld_stlb_miss)
180                 val |= P(TLB, MISS) | P(TLB, L2);
181         else
182                 val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
183
184         /*
185          * bit 5: locked prefix
186          */
187         if (dse.ld_locked)
188                 val |= P(LOCK, LOCKED);
189
190         return val;
191 }
192
193 struct pebs_record_core {
194         u64 flags, ip;
195         u64 ax, bx, cx, dx;
196         u64 si, di, bp, sp;
197         u64 r8,  r9,  r10, r11;
198         u64 r12, r13, r14, r15;
199 };
200
201 struct pebs_record_nhm {
202         u64 flags, ip;
203         u64 ax, bx, cx, dx;
204         u64 si, di, bp, sp;
205         u64 r8,  r9,  r10, r11;
206         u64 r12, r13, r14, r15;
207         u64 status, dla, dse, lat;
208 };
209
210 /*
211  * Same as pebs_record_nhm, with two additional fields.
212  */
213 struct pebs_record_hsw {
214         u64 flags, ip;
215         u64 ax, bx, cx, dx;
216         u64 si, di, bp, sp;
217         u64 r8,  r9,  r10, r11;
218         u64 r12, r13, r14, r15;
219         u64 status, dla, dse, lat;
220         u64 real_ip, tsx_tuning;
221 };
222
223 union hsw_tsx_tuning {
224         struct {
225                 u32 cycles_last_block     : 32,
226                     hle_abort             : 1,
227                     rtm_abort             : 1,
228                     instruction_abort     : 1,
229                     non_instruction_abort : 1,
230                     retry                 : 1,
231                     data_conflict         : 1,
232                     capacity_writes       : 1,
233                     capacity_reads        : 1;
234         };
235         u64         value;
236 };
237
238 #define PEBS_HSW_TSX_FLAGS      0xff00000000ULL
239
240 /* Same as HSW, plus TSC */
241
242 struct pebs_record_skl {
243         u64 flags, ip;
244         u64 ax, bx, cx, dx;
245         u64 si, di, bp, sp;
246         u64 r8,  r9,  r10, r11;
247         u64 r12, r13, r14, r15;
248         u64 status, dla, dse, lat;
249         u64 real_ip, tsx_tuning;
250         u64 tsc;
251 };
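
/*
 * For reference, the record sizes implied by the layouts above (all fields
 * are u64, so there is no padding):
 *
 *   pebs_record_core (fmt0): 18 * 8 = 144 bytes
 *   pebs_record_nhm  (fmt1): 22 * 8 = 176 bytes
 *   pebs_record_hsw  (fmt2): 24 * 8 = 192 bytes
 *   pebs_record_skl  (fmt3): 25 * 8 = 200 bytes
 *
 * intel_ds_init() selects the matching x86_pmu.pebs_record_size at boot.
 */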
252
253 void init_debug_store_on_cpu(int cpu)
254 {
255         struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
256
257         if (!ds)
258                 return;
259
260         wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
261                      (u32)((u64)(unsigned long)ds),
262                      (u32)((u64)(unsigned long)ds >> 32));
263 }
264
265 void fini_debug_store_on_cpu(int cpu)
266 {
267         if (!per_cpu(cpu_hw_events, cpu).ds)
268                 return;
269
270         wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
271 }
272
273 static DEFINE_PER_CPU(void *, insn_buffer);
274
275 static void *dsalloc(size_t size, gfp_t flags, int node)
276 {
277 #ifdef CONFIG_PAGE_TABLE_ISOLATION
278         unsigned int order = get_order(size);
279         struct page *page;
280         unsigned long addr;
281
282         page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
283         if (!page)
284                 return NULL;
285         addr = (unsigned long)page_address(page);
286         if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
287                 __free_pages(page, order);
288                 addr = 0;
289         }
290         return (void *)addr;
291 #else
292         return kmalloc_node(size, flags | __GFP_ZERO, node);
293 #endif
294 }
295
296 static void dsfree(const void *buffer, size_t size)
297 {
298 #ifdef CONFIG_PAGE_TABLE_ISOLATION
299         if (!buffer)
300                 return;
301         kaiser_remove_mapping((unsigned long)buffer, size);
302         free_pages((unsigned long)buffer, get_order(size));
303 #else
304         kfree(buffer);
305 #endif
306 }
307
308 static int alloc_pebs_buffer(int cpu)
309 {
310         struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
311         int node = cpu_to_node(cpu);
312         int max;
313         void *buffer, *ibuffer;
314
315         if (!x86_pmu.pebs)
316                 return 0;
317
318         buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
319         if (unlikely(!buffer))
320                 return -ENOMEM;
321
322         /*
323          * HSW+ already provides us the eventing ip; no need to allocate this
324          * buffer then.
325          */
326         if (x86_pmu.intel_cap.pebs_format < 2) {
327                 ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
328                 if (!ibuffer) {
329                         dsfree(buffer, x86_pmu.pebs_buffer_size);
330                         return -ENOMEM;
331                 }
332                 per_cpu(insn_buffer, cpu) = ibuffer;
333         }
334
335         max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
336
337         ds->pebs_buffer_base = (u64)(unsigned long)buffer;
338         ds->pebs_index = ds->pebs_buffer_base;
339         ds->pebs_absolute_maximum = ds->pebs_buffer_base +
340                 max * x86_pmu.pebs_record_size;
341
342         return 0;
343 }
344
345 static void release_pebs_buffer(int cpu)
346 {
347         struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
348
349         if (!ds || !x86_pmu.pebs)
350                 return;
351
352         kfree(per_cpu(insn_buffer, cpu));
353         per_cpu(insn_buffer, cpu) = NULL;
354
355         dsfree((void *)(unsigned long)ds->pebs_buffer_base,
356                         x86_pmu.pebs_buffer_size);
357         ds->pebs_buffer_base = 0;
358 }
359
360 static int alloc_bts_buffer(int cpu)
361 {
362         struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
363         int node = cpu_to_node(cpu);
364         int max, thresh;
365         void *buffer;
366
367         if (!x86_pmu.bts)
368                 return 0;
369
370         buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
371         if (unlikely(!buffer)) {
372                 WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
373                 return -ENOMEM;
374         }
375
376         max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
377         thresh = max / 16;
378
379         ds->bts_buffer_base = (u64)(unsigned long)buffer;
380         ds->bts_index = ds->bts_buffer_base;
381         ds->bts_absolute_maximum = ds->bts_buffer_base +
382                 max * BTS_RECORD_SIZE;
383         ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
384                 thresh * BTS_RECORD_SIZE;
385
386         return 0;
387 }
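
/*
 * Illustrative buffer layout, again assuming PAGE_SIZE == 4096:
 *
 *   max    = 65536 / 24 = 2730 records
 *   thresh = 2730 / 16  = 170 records
 *
 *   bts_absolute_maximum    = base + 2730 * 24         = base + 65520
 *   bts_interrupt_threshold = base + (2730 - 170) * 24 = base + 61440
 *
 * i.e. the interrupt fires when the buffer is roughly 15/16 full, which is
 * where the "n <= 2560" estimate in intel_pmu_drain_bts_buffer() comes from
 * (2730 - 170 = 2560).
 */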
388
389 static void release_bts_buffer(int cpu)
390 {
391         struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
392
393         if (!ds || !x86_pmu.bts)
394                 return;
395
396         dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
397         ds->bts_buffer_base = 0;
398 }
399
400 static int alloc_ds_buffer(int cpu)
401 {
402         struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
403
404         memset(ds, 0, sizeof(*ds));
405         per_cpu(cpu_hw_events, cpu).ds = ds;
406
407         return 0;
408 }
409
410 static void release_ds_buffer(int cpu)
411 {
412         struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
413
414         if (!ds)
415                 return;
416
417         per_cpu(cpu_hw_events, cpu).ds = NULL;
418 }
419
420 void release_ds_buffers(void)
421 {
422         int cpu;
423
424         if (!x86_pmu.bts && !x86_pmu.pebs)
425                 return;
426
427         get_online_cpus();
428         for_each_online_cpu(cpu)
429                 fini_debug_store_on_cpu(cpu);
430
431         for_each_possible_cpu(cpu) {
432                 release_pebs_buffer(cpu);
433                 release_bts_buffer(cpu);
434                 release_ds_buffer(cpu);
435         }
436         put_online_cpus();
437 }
438
439 void reserve_ds_buffers(void)
440 {
441         int bts_err = 0, pebs_err = 0;
442         int cpu;
443
444         x86_pmu.bts_active = 0;
445         x86_pmu.pebs_active = 0;
446
447         if (!x86_pmu.bts && !x86_pmu.pebs)
448                 return;
449
450         if (!x86_pmu.bts)
451                 bts_err = 1;
452
453         if (!x86_pmu.pebs)
454                 pebs_err = 1;
455
456         get_online_cpus();
457
458         for_each_possible_cpu(cpu) {
459                 if (alloc_ds_buffer(cpu)) {
460                         bts_err = 1;
461                         pebs_err = 1;
462                 }
463
464                 if (!bts_err && alloc_bts_buffer(cpu))
465                         bts_err = 1;
466
467                 if (!pebs_err && alloc_pebs_buffer(cpu))
468                         pebs_err = 1;
469
470                 if (bts_err && pebs_err)
471                         break;
472         }
473
474         if (bts_err) {
475                 for_each_possible_cpu(cpu)
476                         release_bts_buffer(cpu);
477         }
478
479         if (pebs_err) {
480                 for_each_possible_cpu(cpu)
481                         release_pebs_buffer(cpu);
482         }
483
484         if (bts_err && pebs_err) {
485                 for_each_possible_cpu(cpu)
486                         release_ds_buffer(cpu);
487         } else {
488                 if (x86_pmu.bts && !bts_err)
489                         x86_pmu.bts_active = 1;
490
491                 if (x86_pmu.pebs && !pebs_err)
492                         x86_pmu.pebs_active = 1;
493
494                 for_each_online_cpu(cpu)
495                         init_debug_store_on_cpu(cpu);
496         }
497
498         put_online_cpus();
499 }
500
501 /*
502  * BTS
503  */
504
505 struct event_constraint bts_constraint =
506         EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0);
507
508 void intel_pmu_enable_bts(u64 config)
509 {
510         unsigned long debugctlmsr;
511
512         debugctlmsr = get_debugctlmsr();
513
514         debugctlmsr |= DEBUGCTLMSR_TR;
515         debugctlmsr |= DEBUGCTLMSR_BTS;
516         if (config & ARCH_PERFMON_EVENTSEL_INT)
517                 debugctlmsr |= DEBUGCTLMSR_BTINT;
518
519         if (!(config & ARCH_PERFMON_EVENTSEL_OS))
520                 debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
521
522         if (!(config & ARCH_PERFMON_EVENTSEL_USR))
523                 debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
524
525         update_debugctlmsr(debugctlmsr);
526 }
527
528 void intel_pmu_disable_bts(void)
529 {
530         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
531         unsigned long debugctlmsr;
532
533         if (!cpuc->ds)
534                 return;
535
536         debugctlmsr = get_debugctlmsr();
537
538         debugctlmsr &=
539                 ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
540                   DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
541
542         update_debugctlmsr(debugctlmsr);
543 }
544
545 int intel_pmu_drain_bts_buffer(void)
546 {
547         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
548         struct debug_store *ds = cpuc->ds;
549         struct bts_record {
550                 u64     from;
551                 u64     to;
552                 u64     flags;
553         };
554         struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
555         struct bts_record *at, *base, *top;
556         struct perf_output_handle handle;
557         struct perf_event_header header;
558         struct perf_sample_data data;
559         unsigned long skip = 0;
560         struct pt_regs regs;
561
562         if (!event)
563                 return 0;
564
565         if (!x86_pmu.bts_active)
566                 return 0;
567
568         base = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
569         top  = (struct bts_record *)(unsigned long)ds->bts_index;
570
571         if (top <= base)
572                 return 0;
573
574         memset(&regs, 0, sizeof(regs));
575
576         ds->bts_index = ds->bts_buffer_base;
577
578         perf_sample_data_init(&data, 0, event->hw.last_period);
579
580         /*
581          * BTS leaks kernel addresses in branches across the cpl boundary,
582          * such as traps or system calls, so unless the user is asking for
583          * kernel tracing (and right now it's not possible), we'd need to
584          * filter them out. But first we need to count how many of those we
585          * have in the current batch. This is an extra O(n) pass, however,
586          * it's much faster than the other one especially considering that
587          * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see the
588          * alloc_bts_buffer()).
589          */
590         for (at = base; at < top; at++) {
591                 /*
592                  * Note that right now *this* BTS code only works if
593                  * attr::exclude_kernel is set, but let's keep this extra
594                  * check here in case that changes.
595                  */
596                 if (event->attr.exclude_kernel &&
597                     (kernel_ip(at->from) || kernel_ip(at->to)))
598                         skip++;
599         }
600
601         /*
602          * Prepare a generic sample, i.e. fill in the invariant fields.
603          * We will overwrite the from and to address before we output
604          * the sample.
605          */
606         rcu_read_lock();
607         perf_prepare_sample(&header, &data, event, &regs);
608
609         if (perf_output_begin(&handle, event, header.size *
610                               (top - base - skip)))
611                 goto unlock;
612
613         for (at = base; at < top; at++) {
614                 /* Filter out any records that contain kernel addresses. */
615                 if (event->attr.exclude_kernel &&
616                     (kernel_ip(at->from) || kernel_ip(at->to)))
617                         continue;
618
619                 data.ip         = at->from;
620                 data.addr       = at->to;
621
622                 perf_output_sample(&handle, &header, &data, event);
623         }
624
625         perf_output_end(&handle);
626
627         /* There's new data available. */
628         event->hw.interrupts++;
629         event->pending_kill = POLL_IN;
630 unlock:
631         rcu_read_unlock();
632         return 1;
633 }
634
635 static inline void intel_pmu_drain_pebs_buffer(void)
636 {
637         struct pt_regs regs;
638
639         x86_pmu.drain_pebs(&regs);
640 }
641
642 void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
643 {
644         if (!sched_in)
645                 intel_pmu_drain_pebs_buffer();
646 }
647
648 /*
649  * PEBS
650  */
651 struct event_constraint intel_core2_pebs_event_constraints[] = {
652         INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
653         INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
654         INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
655         INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
656         INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
657         /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
658         INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
659         EVENT_CONSTRAINT_END
660 };
661
662 struct event_constraint intel_atom_pebs_event_constraints[] = {
663         INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
664         INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
665         INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
666         /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
667         INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
668         /* Allow all events as PEBS with no flags */
669         INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
670         EVENT_CONSTRAINT_END
671 };
672
673 struct event_constraint intel_slm_pebs_event_constraints[] = {
674         /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
675         INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x1),
676         /* Allow all events as PEBS with no flags */
677         INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
678         EVENT_CONSTRAINT_END
679 };
680
681 struct event_constraint intel_glm_pebs_event_constraints[] = {
682         /* Allow all events as PEBS with no flags */
683         INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
684         EVENT_CONSTRAINT_END
685 };
686
687 struct event_constraint intel_nehalem_pebs_event_constraints[] = {
688         INTEL_PLD_CONSTRAINT(0x100b, 0xf),      /* MEM_INST_RETIRED.* */
689         INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf),    /* MEM_UNCORE_RETIRED.* */
690         INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
691         INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf),    /* INST_RETIRED.ANY */
692         INTEL_EVENT_CONSTRAINT(0xc2, 0xf),    /* UOPS_RETIRED.* */
693         INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
694         INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
695         INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf),    /* SSEX_UOPS_RETIRED.* */
696         INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
697         INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
698         INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
699         /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
700         INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
701         EVENT_CONSTRAINT_END
702 };
703
704 struct event_constraint intel_westmere_pebs_event_constraints[] = {
705         INTEL_PLD_CONSTRAINT(0x100b, 0xf),      /* MEM_INST_RETIRED.* */
706         INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf),    /* MEM_UNCORE_RETIRED.* */
707         INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
708         INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf),    /* INSTR_RETIRED.* */
709         INTEL_EVENT_CONSTRAINT(0xc2, 0xf),    /* UOPS_RETIRED.* */
710         INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
711         INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf),    /* BR_MISP_RETIRED.* */
712         INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf),    /* SSEX_UOPS_RETIRED.* */
713         INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
714         INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
715         INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
716         /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
717         INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
718         EVENT_CONSTRAINT_END
719 };
720
721 struct event_constraint intel_snb_pebs_event_constraints[] = {
722         INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
723         INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
724         INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
725         /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
726         INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
727         INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
728         INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
729         INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
730         INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
731         /* Allow all events as PEBS with no flags */
732         INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
733         EVENT_CONSTRAINT_END
734 };
735
736 struct event_constraint intel_ivb_pebs_event_constraints[] = {
737         INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
738         INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
739         INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
740         /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
741         INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
742         /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
743         INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
744         INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
745         INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
746         INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
747         INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
748         /* Allow all events as PEBS with no flags */
749         INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
750         EVENT_CONSTRAINT_END
751 };
752
753 struct event_constraint intel_hsw_pebs_event_constraints[] = {
754         INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
755         INTEL_PLD_CONSTRAINT(0x01cd, 0xf),    /* MEM_TRANS_RETIRED.* */
756         /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
757         INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
758         /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
759         INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
760         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
761         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
762         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
763         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
764         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
765         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
766         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
767         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
768         INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
769         INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
770         INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
771         /* Allow all events as PEBS with no flags */
772         INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
773         EVENT_CONSTRAINT_END
774 };
775
776 struct event_constraint intel_bdw_pebs_event_constraints[] = {
777         INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
778         INTEL_PLD_CONSTRAINT(0x01cd, 0xf),    /* MEM_TRANS_RETIRED.* */
779         /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
780         INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
781         /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
782         INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
783         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
784         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
785         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
786         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
787         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
788         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
789         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
790         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
791         INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
792         INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
793         INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
794         /* Allow all events as PEBS with no flags */
795         INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
796         EVENT_CONSTRAINT_END
797 };
798
799
800 struct event_constraint intel_skl_pebs_event_constraints[] = {
801         INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2),      /* INST_RETIRED.PREC_DIST */
802         /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
803         INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
804         /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */
805         INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
806         INTEL_PLD_CONSTRAINT(0x1cd, 0xf),                     /* MEM_TRANS_RETIRED.* */
807         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
808         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
809         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
810         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */
811         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
812         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
813         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
814         INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
815         INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_RETIRED.* */
816         INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_L3_HIT_RETIRED.* */
817         INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_L3_MISS_RETIRED.* */
818         /* Allow all events as PEBS with no flags */
819         INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
820         EVENT_CONSTRAINT_END
821 };
822
823 struct event_constraint *intel_pebs_constraints(struct perf_event *event)
824 {
825         struct event_constraint *c;
826
827         if (!event->attr.precise_ip)
828                 return NULL;
829
830         if (x86_pmu.pebs_constraints) {
831                 for_each_event_constraint(c, x86_pmu.pebs_constraints) {
832                         if ((event->hw.config & c->cmask) == c->code) {
833                                 event->hw.flags |= c->flags;
834                                 return c;
835                         }
836                 }
837         }
838
839         return &emptyconstraint;
840 }
841
842 /*
843  * We need the sched_task callback even for per-cpu events when we use
844  * the large interrupt threshold, such that we can provide PID and TID
845  * to PEBS samples.
846  */
847 static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc)
848 {
849         return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
850 }
851
852 static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
853 {
854         struct debug_store *ds = cpuc->ds;
855         u64 threshold;
856
857         if (cpuc->n_pebs == cpuc->n_large_pebs) {
858                 threshold = ds->pebs_absolute_maximum -
859                         x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
860         } else {
861                 threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
862         }
863
864         ds->pebs_interrupt_threshold = threshold;
865 }
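
/*
 * Example of the two threshold settings, assuming a 64 KiB PEBS buffer, the
 * 176-byte fmt1 record and max_pebs_events == 4 (all of these vary by CPU
 * generation and are only meant as an illustration):
 *
 *   every event uses large PEBS: threshold = absolute_maximum - 4 * 176,
 *     so the PMI only fires when the buffer is nearly full;
 *   otherwise:                   threshold = buffer_base + 176,
 *     so the PMI fires after every single record.
 */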
866
867 static void
868 pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu)
869 {
870         /*
871          * Make sure we get updated with the first PEBS
872          * event. It will also trigger during removal, but
873          * that does not hurt:
874          */
875         bool update = cpuc->n_pebs == 1;
876
877         if (needed_cb != pebs_needs_sched_cb(cpuc)) {
878                 if (!needed_cb)
879                         perf_sched_cb_inc(pmu);
880                 else
881                         perf_sched_cb_dec(pmu);
882
883                 update = true;
884         }
885
886         if (update)
887                 pebs_update_threshold(cpuc);
888 }
889
890 void intel_pmu_pebs_add(struct perf_event *event)
891 {
892         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
893         struct hw_perf_event *hwc = &event->hw;
894         bool needed_cb = pebs_needs_sched_cb(cpuc);
895
896         cpuc->n_pebs++;
897         if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
898                 cpuc->n_large_pebs++;
899
900         pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
901 }
902
903 void intel_pmu_pebs_enable(struct perf_event *event)
904 {
905         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
906         struct hw_perf_event *hwc = &event->hw;
907         struct debug_store *ds = cpuc->ds;
908
909         hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
910
911         cpuc->pebs_enabled |= 1ULL << hwc->idx;
912
913         if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
914                 cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
915         else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
916                 cpuc->pebs_enabled |= 1ULL << 63;
917
918         /*
919          * Use auto-reload if possible to save an MSR write in the PMI.
920          * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD can change the period.
921          */
922         if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
923                 ds->pebs_event_reset[hwc->idx] =
924                         (u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
925         }
926 }
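
/*
 * Worked example for the auto-reload reset value (illustrative numbers):
 * with a sample period of 100000 and 48-bit counters (cntval_mask ==
 * 2^48 - 1),
 *
 *   pebs_event_reset = (u64)(-100000) & (2^48 - 1) = 2^48 - 100000
 *
 * so after each PEBS record the counter restarts 100000 events short of the
 * next overflow, without an extra MSR write from the PMI handler.
 */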
927
928 void intel_pmu_pebs_del(struct perf_event *event)
929 {
930         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
931         struct hw_perf_event *hwc = &event->hw;
932         bool needed_cb = pebs_needs_sched_cb(cpuc);
933
934         cpuc->n_pebs--;
935         if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
936                 cpuc->n_large_pebs--;
937
938         pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
939 }
940
941 void intel_pmu_pebs_disable(struct perf_event *event)
942 {
943         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
944         struct hw_perf_event *hwc = &event->hw;
945
946         if (cpuc->n_pebs == cpuc->n_large_pebs)
947                 intel_pmu_drain_pebs_buffer();
948
949         cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
950
951         if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
952                 cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
953         else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
954                 cpuc->pebs_enabled &= ~(1ULL << 63);
955
956         if (cpuc->enabled)
957                 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
958
959         hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
960 }
961
962 void intel_pmu_pebs_enable_all(void)
963 {
964         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
965
966         if (cpuc->pebs_enabled)
967                 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
968 }
969
970 void intel_pmu_pebs_disable_all(void)
971 {
972         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
973
974         if (cpuc->pebs_enabled)
975                 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
976 }
977
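/*
 * The fixup below, in outline: when the PEBS assist is trap-like, the
 * reported IP points to the instruction *after* the one that caused the
 * event.  Using the most recent LBR entry (from -> to), we start decoding at
 * 'to' and step forward one instruction at a time until we reach the
 * reported IP; the start of the previous instruction is then the eventing IP
 * written back into regs (or, when the reported IP is 'to' itself, the
 * branch at 'from').
 */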
978 static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
979 {
980         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
981         unsigned long from = cpuc->lbr_entries[0].from;
982         unsigned long old_to, to = cpuc->lbr_entries[0].to;
983         unsigned long ip = regs->ip;
984         int is_64bit = 0;
985         void *kaddr;
986         int size;
987
988         /*
989          * We don't need to fixup if the PEBS assist is fault like
990          */
991         if (!x86_pmu.intel_cap.pebs_trap)
992                 return 1;
993
994         /*
995          * No LBR entry, no basic block, no rewinding
996          */
997         if (!cpuc->lbr_stack.nr || !from || !to)
998                 return 0;
999
1000         /*
1001          * Basic blocks should never cross user/kernel boundaries
1002          */
1003         if (kernel_ip(ip) != kernel_ip(to))
1004                 return 0;
1005
1006         /*
1007          * unsigned math, either ip is before the start (impossible) or
1008          * the basic block is larger than 1 page (sanity)
1009          */
1010         if ((ip - to) > PEBS_FIXUP_SIZE)
1011                 return 0;
1012
1013         /*
1014          * We sampled a branch insn, rewind using the LBR stack
1015          */
1016         if (ip == to) {
1017                 set_linear_ip(regs, from);
1018                 return 1;
1019         }
1020
1021         size = ip - to;
1022         if (!kernel_ip(ip)) {
1023                 int bytes;
1024                 u8 *buf = this_cpu_read(insn_buffer);
1025
1026                 /* 'size' must fit our buffer, see above */
1027                 bytes = copy_from_user_nmi(buf, (void __user *)to, size);
1028                 if (bytes != 0)
1029                         return 0;
1030
1031                 kaddr = buf;
1032         } else {
1033                 kaddr = (void *)to;
1034         }
1035
1036         do {
1037                 struct insn insn;
1038
1039                 old_to = to;
1040
1041 #ifdef CONFIG_X86_64
1042                 is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
1043 #endif
1044                 insn_init(&insn, kaddr, size, is_64bit);
1045                 insn_get_length(&insn);
1046                 /*
1047                  * Make sure there was not a problem decoding the
1048                  * instruction and getting the length.  This is
1049                  * doubly important because we have an infinite
1050                  * loop if insn.length=0.
1051                  */
1052                 if (!insn.length)
1053                         break;
1054
1055                 to += insn.length;
1056                 kaddr += insn.length;
1057                 size -= insn.length;
1058         } while (to < ip);
1059
1060         if (to == ip) {
1061                 set_linear_ip(regs, old_to);
1062                 return 1;
1063         }
1064
1065         /*
1066          * Even though we decoded the basic block, the instruction stream
1067          * never matched the given IP, either the TO or the IP got corrupted.
1068          */
1069         return 0;
1070 }
1071
1072 static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs)
1073 {
1074         if (pebs->tsx_tuning) {
1075                 union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
1076                 return tsx.cycles_last_block;
1077         }
1078         return 0;
1079 }
1080
1081 static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)
1082 {
1083         u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
1084
1085         /* For RTM XABORTs also log the abort code from AX */
1086         if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1))
1087                 txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT;
1088         return txn;
1089 }
1090
1091 static void setup_pebs_sample_data(struct perf_event *event,
1092                                    struct pt_regs *iregs, void *__pebs,
1093                                    struct perf_sample_data *data,
1094                                    struct pt_regs *regs)
1095 {
1096 #define PERF_X86_EVENT_PEBS_HSW_PREC \
1097                 (PERF_X86_EVENT_PEBS_ST_HSW | \
1098                  PERF_X86_EVENT_PEBS_LD_HSW | \
1099                  PERF_X86_EVENT_PEBS_NA_HSW)
1100         /*
1101          * We cast to the biggest pebs_record but are careful not to
1102          * unconditionally access the 'extra' entries.
1103          */
1104         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1105         struct pebs_record_skl *pebs = __pebs;
1106         u64 sample_type;
1107         int fll, fst, dsrc;
1108         int fl = event->hw.flags;
1109
1110         if (pebs == NULL)
1111                 return;
1112
1113         regs->flags &= ~PERF_EFLAGS_EXACT;
1114         sample_type = event->attr.sample_type;
1115         dsrc = sample_type & PERF_SAMPLE_DATA_SRC;
1116
1117         fll = fl & PERF_X86_EVENT_PEBS_LDLAT;
1118         fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
1119
1120         perf_sample_data_init(data, 0, event->hw.last_period);
1121
1122         data->period = event->hw.last_period;
1123
1124         /*
1125          * Use latency for weight (only avail with PEBS-LL)
1126          */
1127         if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
1128                 data->weight = pebs->lat;
1129
1130         /*
1131          * data.data_src encodes the data source
1132          */
1133         if (dsrc) {
1134                 u64 val = PERF_MEM_NA;
1135                 if (fll)
1136                         val = load_latency_data(pebs->dse);
1137                 else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
1138                         val = precise_datala_hsw(event, pebs->dse);
1139                 else if (fst)
1140                         val = precise_store_data(pebs->dse);
1141                 data->data_src.val = val;
1142         }
1143
1144         /*
1145          * We use the interrupt regs as a base because the PEBS record does not
1146          * contain a full regs set, specifically it seems to lack segment
1147          * descriptors, which get used by things like user_mode().
1148          *
1149          * In the simple case fix up only the IP for PERF_SAMPLE_IP.
1150          *
1151          * We must however always use BP,SP from iregs for the unwinder to stay
1152          * sane; the record BP,SP can point into thin air when the record is
1153          * from a previous PMI context or an (I)RET happened between the record
1154          * and PMI.
1155          */
1156         *regs = *iregs;
1157         regs->flags = pebs->flags;
1158
1159         if (sample_type & PERF_SAMPLE_REGS_INTR) {
1160                 regs->ax = pebs->ax;
1161                 regs->bx = pebs->bx;
1162                 regs->cx = pebs->cx;
1163                 regs->dx = pebs->dx;
1164                 regs->si = pebs->si;
1165                 regs->di = pebs->di;
1166
1167                 /*
1168                  * Per the above; only set BP,SP if we don't need callchains.
1169                  *
1170                  * XXX: does this make sense?
1171                  */
1172                 if (!(sample_type & PERF_SAMPLE_CALLCHAIN)) {
1173                         regs->bp = pebs->bp;
1174                         regs->sp = pebs->sp;
1175                 }
1176
1177                 /*
1178                  * Preserve PERF_EFLAGS_VM from set_linear_ip().
1179                  */
1180                 regs->flags = pebs->flags | (regs->flags & PERF_EFLAGS_VM);
1181 #ifndef CONFIG_X86_32
1182                 regs->r8 = pebs->r8;
1183                 regs->r9 = pebs->r9;
1184                 regs->r10 = pebs->r10;
1185                 regs->r11 = pebs->r11;
1186                 regs->r12 = pebs->r12;
1187                 regs->r13 = pebs->r13;
1188                 regs->r14 = pebs->r14;
1189                 regs->r15 = pebs->r15;
1190 #endif
1191         }
1192
1193         if (event->attr.precise_ip > 1) {
1194                 /* Haswell and later have the eventing IP, so use it: */
1195                 if (x86_pmu.intel_cap.pebs_format >= 2) {
1196                         set_linear_ip(regs, pebs->real_ip);
1197                         regs->flags |= PERF_EFLAGS_EXACT;
1198                 } else {
1199                         /* Otherwise use PEBS off-by-1 IP: */
1200                         set_linear_ip(regs, pebs->ip);
1201
1202                         /* ... and try to fix it up using the LBR entries: */
1203                         if (intel_pmu_pebs_fixup_ip(regs))
1204                                 regs->flags |= PERF_EFLAGS_EXACT;
1205                 }
1206         } else
1207                 set_linear_ip(regs, pebs->ip);
1208
1209
1210         if ((sample_type & PERF_SAMPLE_ADDR) &&
1211             x86_pmu.intel_cap.pebs_format >= 1)
1212                 data->addr = pebs->dla;
1213
1214         if (x86_pmu.intel_cap.pebs_format >= 2) {
1215                 /* Only set the TSX weight when no memory weight. */
1216                 if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
1217                         data->weight = intel_hsw_weight(pebs);
1218
1219                 if (sample_type & PERF_SAMPLE_TRANSACTION)
1220                         data->txn = intel_hsw_transaction(pebs);
1221         }
1222
1223         /*
1224          * v3 supplies an accurate time stamp, so we use that
1225          * for the sample time.
1226          *
1227          * We can only do this for the default trace clock.
1228          */
1229         if (x86_pmu.intel_cap.pebs_format >= 3 &&
1230                 event->attr.use_clockid == 0)
1231                 data->time = native_sched_clock_from_tsc(pebs->tsc);
1232
1233         if (has_branch_stack(event))
1234                 data->br_stack = &cpuc->lbr_stack;
1235 }
1236
1237 static inline void *
1238 get_next_pebs_record_by_bit(void *base, void *top, int bit)
1239 {
1240         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1241         void *at;
1242         u64 pebs_status;
1243
1244         /*
1245          * fmt0 does not have a status bitfield (does not use
1246          * perf_record_nhm format)
1247          * pebs_record_nhm format)
1248         if (x86_pmu.intel_cap.pebs_format < 1)
1249                 return base;
1250
1251         if (base == NULL)
1252                 return NULL;
1253
1254         for (at = base; at < top; at += x86_pmu.pebs_record_size) {
1255                 struct pebs_record_nhm *p = at;
1256
1257                 if (test_bit(bit, (unsigned long *)&p->status)) {
1258                         /* PEBS v3 has accurate status bits */
1259                         if (x86_pmu.intel_cap.pebs_format >= 3)
1260                                 return at;
1261
1262                         if (p->status == (1 << bit))
1263                                 return at;
1264
1265                         /* clear non-PEBS bit and re-check */
1266                         pebs_status = p->status & cpuc->pebs_enabled;
1267                         pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
1268                         if (pebs_status == (1 << bit))
1269                                 return at;
1270                 }
1271         }
1272         return NULL;
1273 }
1274
1275 /*
1276  * Special variant of intel_pmu_save_and_restart() for auto-reload.
1277  */
1278 static int
1279 intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
1280 {
1281         struct hw_perf_event *hwc = &event->hw;
1282         int shift = 64 - x86_pmu.cntval_bits;
1283         u64 period = hwc->sample_period;
1284         u64 prev_raw_count, new_raw_count;
1285         s64 new, old;
1286
1287         WARN_ON(!period);
1288
1289         /*
1290          * drain_pebs() only happens when the PMU is disabled.
1291          */
1292         WARN_ON(this_cpu_read(cpu_hw_events.enabled));
1293
1294         prev_raw_count = local64_read(&hwc->prev_count);
1295         rdpmcl(hwc->event_base_rdpmc, new_raw_count);
1296         local64_set(&hwc->prev_count, new_raw_count);
1297
1298         /*
1299          * Since the counter increments a negative counter value and
1300          * overflows on the sign switch, giving the interval:
1301          *
1302          *   [-period, 0]
1303          *
1304          * the difference between two consecutive reads is:
1305          *
1306          *   A) value2 - value1;
1307          *      when no overflows have happened in between,
1308          *
1309          *   B) (0 - value1) + (value2 - (-period));
1310          *      when one overflow happened in between,
1311          *
1312          *   C) (0 - value1) + (n - 1) * (period) + (value2 - (-period));
1313          *      when @n overflows happened in between.
1314          *
1315          * Here A) is the obvious difference, B) is the extension to the
1316          * discrete interval, where the first term is to the top of the
1317          * interval and the second term is from the bottom of the next
1318          * interval and C) the extension to multiple intervals, where the
1319          * middle term is the whole intervals covered.
1320          *
1321          * An equivalent of C, by reduction, is:
1322          *
1323          *   value2 - value1 + n * period
1324          */
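        /*
         * Worked example (made-up numbers): period = 100, value1 = -40,
         * value2 = -90, with n = 1 overflow in between:
         *
         *   (0 - (-40)) + ((-90) - (-100)) = 40 + 10 = 50 events
         *
         * and the reduced form gives the same result:
         *
         *   value2 - value1 + n * period = -90 - (-40) + 1 * 100 = 50
         */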
1325         new = ((s64)(new_raw_count << shift) >> shift);
1326         old = ((s64)(prev_raw_count << shift) >> shift);
1327         local64_add(new - old + count * period, &event->count);
1328
1329         perf_event_update_userpage(event);
1330
1331         return 0;
1332 }
1333
1334 static void __intel_pmu_pebs_event(struct perf_event *event,
1335                                    struct pt_regs *iregs,
1336                                    void *base, void *top,
1337                                    int bit, int count)
1338 {
1339         struct hw_perf_event *hwc = &event->hw;
1340         struct perf_sample_data data;
1341         struct pt_regs regs;
1342         void *at = get_next_pebs_record_by_bit(base, top, bit);
1343
1344         if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
1345                 /*
1346                  * Currently, auto-reload is only enabled in fixed period mode.
1347                  * The reload value is always hwc->sample_period.
1348                  * This may need to change if auto-reload is ever enabled in
1349                  * freq mode.
1350                  */
1351                 intel_pmu_save_and_restart_reload(event, count);
1352         } else if (!intel_pmu_save_and_restart(event))
1353                 return;
1354
1355         while (count > 1) {
1356                 setup_pebs_sample_data(event, iregs, at, &data, &regs);
1357                 perf_event_output(event, &data, &regs);
1358                 at += x86_pmu.pebs_record_size;
1359                 at = get_next_pebs_record_by_bit(at, top, bit);
1360                 count--;
1361         }
1362
1363         setup_pebs_sample_data(event, iregs, at, &data, &regs);
1364
1365         /*
1366          * All but the last records are processed.
1367          * The last one is left to be able to call the overflow handler.
1368          */
1369         if (perf_event_overflow(event, &data, &regs)) {
1370                 x86_pmu_stop(event, 0);
1371                 return;
1372         }
1373
1374 }
1375
1376 static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
1377 {
1378         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1379         struct debug_store *ds = cpuc->ds;
1380         struct perf_event *event = cpuc->events[0]; /* PMC0 only */
1381         struct pebs_record_core *at, *top;
1382         int n;
1383
1384         if (!x86_pmu.pebs_active)
1385                 return;
1386
1387         at  = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
1388         top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
1389
1390         /*
1391          * Whatever else happens, drain the thing
1392          */
1393         ds->pebs_index = ds->pebs_buffer_base;
1394
1395         if (!test_bit(0, cpuc->active_mask))
1396                 return;
1397
1398         WARN_ON_ONCE(!event);
1399
1400         if (!event->attr.precise_ip)
1401                 return;
1402
1403         n = top - at;
1404         if (n <= 0) {
1405                 if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
1406                         intel_pmu_save_and_restart_reload(event, 0);
1407                 return;
1408         }
1409
1410         __intel_pmu_pebs_event(event, iregs, at, top, 0, n);
1411 }
1412
1413 static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
1414 {
1415         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1416         struct debug_store *ds = cpuc->ds;
1417         struct perf_event *event;
1418         void *base, *at, *top;
1419         short counts[MAX_PEBS_EVENTS] = {};
1420         short error[MAX_PEBS_EVENTS] = {};
1421         int bit, i;
1422
1423         if (!x86_pmu.pebs_active)
1424                 return;
1425
1426         base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
1427         top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
1428
1429         ds->pebs_index = ds->pebs_buffer_base;
1430
1431         if (unlikely(base >= top)) {
1432                 /*
1433                  * drain_pebs() can be called twice in a short period for an
1434                  * auto-reload event in pmu::read(), with no overflows having
1435                  * happened in between.
1436                  * intel_pmu_save_and_restart_reload() still needs to be called
1437                  * to update event->count for this case.
1438                  */
1439                 for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled,
1440                                  x86_pmu.max_pebs_events) {
1441                         event = cpuc->events[bit];
1442                         if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
1443                                 intel_pmu_save_and_restart_reload(event, 0);
1444                 }
1445                 return;
1446         }
1447
1448         for (at = base; at < top; at += x86_pmu.pebs_record_size) {
1449                 struct pebs_record_nhm *p = at;
1450                 u64 pebs_status;
1451
1452                 pebs_status = p->status & cpuc->pebs_enabled;
1453                 pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
1454
1455                 /* PEBS v3 has more accurate status bits */
1456                 if (x86_pmu.intel_cap.pebs_format >= 3) {
1457                         for_each_set_bit(bit, (unsigned long *)&pebs_status,
1458                                          x86_pmu.max_pebs_events)
1459                                 counts[bit]++;
1460
1461                         continue;
1462                 }
1463
1464                 /*
1465                  * On some CPUs the PEBS status can be zero when PEBS is
1466                  * racing with clearing of GLOBAL_STATUS.
1467                  *
1468                  * Normally we would drop that record, but in the
1469                  * case when there is only a single active PEBS event
1470                  * we can assume it's for that event.
1471                  */
1472                 if (!pebs_status && cpuc->pebs_enabled &&
1473                         !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1)))
1474                         pebs_status = cpuc->pebs_enabled;
1475
1476                 bit = find_first_bit((unsigned long *)&pebs_status,
1477                                         x86_pmu.max_pebs_events);
1478                 if (bit >= x86_pmu.max_pebs_events)
1479                         continue;
1480
1481                 /*
1482                  * The PEBS hardware does not deal well with the situation
1483                  * when events occur close to each other and multiple bits
1484                  * are set. But this should happen rarely.
1485                  *
1486                  * If these events include one PEBS and multiple non-PEBS
1487                  * events, it doesn't impact the PEBS record. The record will
1488                  * be handled normally. (slow path)
1489                  *
1490                  * If these events include two or more PEBS events, the
1491                  * records for the events can be collapsed into a single
1492                  * one, and it's not possible to reconstruct all events
1493                  * that caused the PEBS record. It's called collision.
1494                  * If collision happened, the record will be dropped.
1495                  */
1496                 if (p->status != (1ULL << bit)) {
1497                         for_each_set_bit(i, (unsigned long *)&pebs_status,
1498                                          x86_pmu.max_pebs_events)
1499                                 error[i]++;
1500                         continue;
1501                 }
1502
1503                 counts[bit]++;
1504         }
1505
1506         for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
1507                 if ((counts[bit] == 0) && (error[bit] == 0))
1508                         continue;
1509
1510                 event = cpuc->events[bit];
1511                 if (WARN_ON_ONCE(!event))
1512                         continue;
1513
1514                 if (WARN_ON_ONCE(!event->attr.precise_ip))
1515                         continue;
1516
1517                 /* log the number of dropped samples */
1518                 if (error[bit]) {
1519                         perf_log_lost_samples(event, error[bit]);
1520
1521                         if (perf_event_account_interrupt(event))
1522                                 x86_pmu_stop(event, 0);
1523                 }
1524
1525                 if (counts[bit]) {
1526                         __intel_pmu_pebs_event(event, iregs, base,
1527                                                top, bit, counts[bit]);
1528                 }
1529         }
1530 }
1531
1532 /*
1533  * BTS, PEBS probe and setup
1534  */
1535
1536 void __init intel_ds_init(void)
1537 {
1538         /*
1539          * No support for 32bit formats
1540          */
1541         if (!boot_cpu_has(X86_FEATURE_DTES64))
1542                 return;
1543
1544         x86_pmu.bts  = boot_cpu_has(X86_FEATURE_BTS);
1545         x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
1546         x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE;
1547         if (x86_pmu.pebs) {
1548                 char pebs_type = x86_pmu.intel_cap.pebs_trap ?  '+' : '-';
1549                 int format = x86_pmu.intel_cap.pebs_format;
1550
1551                 switch (format) {
1552                 case 0:
1553                         pr_cont("PEBS fmt0%c, ", pebs_type);
1554                         x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
1555                         /*
1556                          * Using >PAGE_SIZE buffers makes the WRMSR to
1557                          * PERF_GLOBAL_CTRL in intel_pmu_enable_all()
1558                          * mysteriously hang on Core2.
1559                          *
1560                          * As a workaround, we don't do this.
1561                          */
1562                         x86_pmu.pebs_buffer_size = PAGE_SIZE;
1563                         x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
1564                         break;
1565
1566                 case 1:
1567                         pr_cont("PEBS fmt1%c, ", pebs_type);
1568                         x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
1569                         x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
1570                         break;
1571
1572                 case 2:
1573                         pr_cont("PEBS fmt2%c, ", pebs_type);
1574                         x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw);
1575                         x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
1576                         break;
1577
1578                 case 3:
1579                         pr_cont("PEBS fmt3%c, ", pebs_type);
1580                         x86_pmu.pebs_record_size =
1581                                                 sizeof(struct pebs_record_skl);
1582                         x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
1583                         x86_pmu.free_running_flags |= PERF_SAMPLE_TIME;
1584                         break;
1585
1586                 default:
1587                         pr_cont("no PEBS fmt%d%c, ", format, pebs_type);
1588                         x86_pmu.pebs = 0;
1589                 }
1590         }
1591 }
1592
1593 void perf_restore_debug_store(void)
1594 {
1595         struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
1596
1597         if (!x86_pmu.bts && !x86_pmu.pebs)
1598                 return;
1599
1600         wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds);
1601 }