Linux-libre 4.4.135-gnu
arch/x86/mm/tlb.c (librecmc/linux-libre.git)
#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
#include <asm/kaiser.h>

/*
 *      TLB flushing, formerly SMP-only
 *              c/o Linus Torvalds.
 *
 *      These mean you can really definitely utterly forget about
 *      writing to user space from interrupts. (It's not allowed anyway.)
 *
 *      Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 *      More scalable flush, from Andi Kleen
 *
 *      Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */

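/*
 * Argument block handed to the remote flush callbacks below: flush_mm is
 * the address space being flushed (unused by the kernel-range path), and
 * [flush_start, flush_end) is the virtual range, with
 * flush_end == TLB_FLUSH_ALL requesting a full flush.
 */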
struct flush_tlb_info {
        struct mm_struct *flush_mm;
        unsigned long flush_start;
        unsigned long flush_end;
};

static void load_new_mm_cr3(pgd_t *pgdir)
{
        unsigned long new_mm_cr3 = __pa(pgdir);

        if (kaiser_enabled) {
                /*
                 * We reuse the same PCID for different tasks, so we must
                 * flush all the entries for the PCID out when we change tasks.
                 * Flush KERN below, flush USER when returning to userspace in
                 * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
                 *
                 * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
                 * do it here, but can only be used if X86_FEATURE_INVPCID is
                 * available - and many machines support pcid without invpcid.
                 *
                 * If X86_CR3_PCID_KERN_FLUSH actually added something, then it
                 * would be needed in the write_cr3() below - if PCIDs enabled.
                 */
                BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);
                kaiser_flush_tlb_on_return_to_user();
        }

        /*
         * Caution: many callers of this function expect
         * that load_cr3() is serializing and orders TLB
         * fills with respect to the mm_cpumask writes.
         */
        write_cr3(new_mm_cr3);
}

/*
 * We cannot call mmdrop() because we are in interrupt context;
 * instead we just update mm->cpu_vm_mask.
 */
void leave_mm(int cpu)
{
        struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);
        if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
                BUG();
        if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
                cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
                load_new_mm_cr3(swapper_pg_dir);
                /*
                 * This gets called in the idle path where RCU
                 * functions differently.  Tracing normally
                 * uses RCU, so we have to call the tracepoint
                 * specially here.
                 */
                trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
        }
}
EXPORT_SYMBOL_GPL(leave_mm);

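/*
 * Interrupt-safe wrapper: callers may have interrupts enabled, so disable
 * them around switch_mm_irqs_off(), which does the real work.
 */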
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
               struct task_struct *tsk)
{
        unsigned long flags;

        local_irq_save(flags);
        switch_mm_irqs_off(prev, next, tsk);
        local_irq_restore(flags);
}

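/*
 * Switch this CPU from 'prev' to 'next' with interrupts already disabled:
 * update cpu_tlbstate, adjust the mm_cpumasks, reload CR3 (and with it the
 * TLB), then reload per-mm CR4 and LDT state.  The prev == next branch only
 * has to repair state after an earlier leave_mm() on this CPU.
 */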
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                        struct task_struct *tsk)
{
        unsigned cpu = smp_processor_id();

        if (likely(prev != next)) {
                this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
                this_cpu_write(cpu_tlbstate.active_mm, next);
                cpumask_set_cpu(cpu, mm_cpumask(next));

                /*
                 * Re-load page tables.
                 *
                 * This logic has an ordering constraint:
                 *
                 *  CPU 0: Write to a PTE for 'next'
                 *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
                 *  CPU 1: set bit 1 in next's mm_cpumask
                 *  CPU 1: load from the PTE that CPU 0 writes (implicit)
                 *
                 * We need to prevent an outcome in which CPU 1 observes
                 * the new PTE value and CPU 0 observes bit 1 clear in
                 * mm_cpumask.  (If that occurs, then the IPI will never
                 * be sent, and CPU 0's TLB will contain a stale entry.)
                 *
                 * The bad outcome can occur if either CPU's load is
                 * reordered before that CPU's store, so both CPUs must
                 * execute full barriers to prevent this from happening.
                 *
                 * Thus, switch_mm needs a full barrier between the
                 * store to mm_cpumask and any operation that could load
                 * from next->pgd.  TLB fills are special and can happen
                 * due to instruction fetches or for no reason at all,
                 * and neither LOCK nor MFENCE orders them.
                 * Fortunately, load_cr3() is serializing and gives the
                 * ordering guarantee we need.
                 */
                load_new_mm_cr3(next->pgd);

                trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);

                /* Stop flush ipis for the previous mm */
                cpumask_clear_cpu(cpu, mm_cpumask(prev));

                /* Load per-mm CR4 state */
                load_mm_cr4(next);

#ifdef CONFIG_MODIFY_LDT_SYSCALL
                /*
                 * Load the LDT, if the LDT is different.
                 *
                 * It's possible that prev->context.ldt doesn't match
                 * the LDT register.  This can happen if leave_mm(prev)
                 * was called and then modify_ldt changed
                 * prev->context.ldt but suppressed an IPI to this CPU.
                 * In this case, prev->context.ldt != NULL, because we
                 * never set context.ldt to NULL while the mm still
                 * exists.  That means that next->context.ldt !=
                 * prev->context.ldt, because mms never share an LDT.
                 */
                if (unlikely(prev->context.ldt != next->context.ldt))
                        load_mm_ldt(next);
#endif
        } else {
                this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
                BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);

                if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
                        /*
                         * On established mms, the mm_cpumask is only changed
                         * from irq context, from ptep_clear_flush() while in
                         * lazy tlb mode, and here. Irqs are blocked during
                         * schedule, protecting us from simultaneous changes.
                         */
                        cpumask_set_cpu(cpu, mm_cpumask(next));

                        /*
                         * We were in lazy tlb mode and leave_mm disabled
                         * tlb flush IPI delivery. We must reload CR3
                         * to make sure to use no freed page tables.
                         *
                         * As above, load_cr3() is serializing and orders TLB
                         * fills with respect to the mm_cpumask write.
                         */
                        load_new_mm_cr3(next->pgd);
                        trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
                        load_mm_cr4(next);
                        load_mm_ldt(next);
                }
        }
}

/*
 * The flush IPI assumes that a thread switch happens in this order:
 * [cpu0: the cpu that switches]
 * 1) switch_mm() either 1a) or 1b)
 * 1a) thread switch to a different mm
 * 1a1) set cpu_tlbstate to TLBSTATE_OK
 *      Now the tlb flush IPI handler flush_tlb_func won't call leave_mm
 *      if cpu0 was in lazy tlb mode.
 * 1a2) update cpu active_mm
 *      Now cpu0 accepts tlb flushes for the new mm.
 * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
 *      Now the other cpus will send tlb flush ipis.
 * 1a4) change cr3.
 * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
 *      Stop ipi delivery for the old mm. This is not synchronized with
 *      the other cpus, but flush_tlb_func ignores flush ipis for the wrong
 *      mm, and in the worst case we perform a superfluous tlb flush.
 * 1b) thread switch without mm change
 *      cpu active_mm is correct, cpu0 already handles flush ipis.
 * 1b1) set cpu_tlbstate to TLBSTATE_OK
 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
 *      Atomically set the bit [other cpus will start sending flush ipis],
 *      and test the bit.
 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
 * 2) switch %%esp, i.e. current
 *
 * The interrupt must handle 2 special cases:
 * - cr3 is changed before %%esp, i.e. it cannot use current->{active_,}mm.
 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
 *   runs in kernel space, the cpu could load tlb entries for user space
 *   pages.
 *
 * The good news is that cpu_tlbstate is local to each cpu, no
 * write/read ordering problems.
 */

/*
 * TLB flush function:
 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
 * 2) Leave the mm if we are in the lazy tlb mode.
 */
static void flush_tlb_func(void *info)
{
        struct flush_tlb_info *f = info;

        inc_irq_stat(irq_tlb_count);

        if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
                return;

        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
        if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
                if (f->flush_end == TLB_FLUSH_ALL) {
                        local_flush_tlb();
                        trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
                } else {
                        unsigned long addr;
                        unsigned long nr_pages =
                                (f->flush_end - f->flush_start) / PAGE_SIZE;
                        addr = f->flush_start;
                        while (addr < f->flush_end) {
                                __flush_tlb_single(addr);
                                addr += PAGE_SIZE;
                        }
                        trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
                }
        } else
                leave_mm(smp_processor_id());
}

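/*
 * Ask every CPU in 'cpumask' (the caller excluded) to run flush_tlb_func
 * for the given range by sending IPIs via smp_call_function_many().  On
 * SGI UV systems the uv_flush_tlb_others() hardware-assisted path is tried
 * first and may shrink or clear the cpumask.
 */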
void native_flush_tlb_others(const struct cpumask *cpumask,
                                 struct mm_struct *mm, unsigned long start,
                                 unsigned long end)
{
        struct flush_tlb_info info;

        info.flush_mm = mm;
        info.flush_start = start;
        info.flush_end = end;

        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
        if (end == TLB_FLUSH_ALL)
                trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
        else
                trace_tlb_flush(TLB_REMOTE_SEND_IPI,
                                (end - start) >> PAGE_SHIFT);

        if (is_uv_system()) {
                unsigned int cpu;

                cpu = smp_processor_id();
                cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
                if (cpumask)
                        smp_call_function_many(cpumask, flush_tlb_func,
                                                                &info, 1);
                return;
        }
        smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
}

/*
 * See Documentation/x86/tlb.txt for details.  We choose 33
 * because it is large enough to cover the vast majority (at
 * least 95%) of allocations, and is small enough that we are
 * confident it will not cause too much overhead.  Each single
 * flush is about 100 ns, so this caps the maximum overhead at
 * about 3,300 ns.
 *
 * This is in units of pages.
 */
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;

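/*
 * Flush the TLB entries covering [start, end) of 'mm' on this CPU and, via
 * flush_tlb_others(), on every other CPU in mm_cpumask(mm).  Ranges larger
 * than tlb_single_page_flush_ceiling pages (and hugetlb ranges) are demoted
 * to a full flush.
 *
 * Sketch of a typical call, as issued by the generic flush_tlb_range()
 * wrapper ('vma' is illustrative, not a name used in this file):
 *
 *      flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags);
 */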
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                                unsigned long end, unsigned long vmflag)
{
        unsigned long addr;
        /* do a global flush by default */
        unsigned long base_pages_to_flush = TLB_FLUSH_ALL;

        preempt_disable();

        if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
                base_pages_to_flush = (end - start) >> PAGE_SHIFT;
        if (base_pages_to_flush > tlb_single_page_flush_ceiling)
                base_pages_to_flush = TLB_FLUSH_ALL;

        if (current->active_mm != mm) {
                /* Synchronize with switch_mm. */
                smp_mb();

                goto out;
        }

        if (!current->mm) {
                leave_mm(smp_processor_id());

                /* Synchronize with switch_mm. */
                smp_mb();

                goto out;
        }

        /*
         * Both branches below are implicit full barriers (MOV to CR or
         * INVLPG) that synchronize with switch_mm.
         */
        if (base_pages_to_flush == TLB_FLUSH_ALL) {
                count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
                local_flush_tlb();
        } else {
                /* flush the range one page at a time with 'invlpg' */
                for (addr = start; addr < end; addr += PAGE_SIZE) {
                        count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
                        __flush_tlb_single(addr);
                }
        }
        trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
out:
        if (base_pages_to_flush == TLB_FLUSH_ALL) {
                start = 0UL;
                end = TLB_FLUSH_ALL;
        }
        if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
                flush_tlb_others(mm_cpumask(mm), mm, start, end);
        preempt_enable();
}

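/* IPI callback: flush this CPU's entire TLB, and drop a lazily-held mm. */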
static void do_flush_tlb_all(void *info)
{
        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
        __flush_tlb_all();
        if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
                leave_mm(smp_processor_id());
}

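/* Flush the whole TLB on every online CPU, including the calling one. */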
void flush_tlb_all(void)
{
        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
        on_each_cpu(do_flush_tlb_all, NULL, 1);
}

static void do_kernel_range_flush(void *info)
{
        struct flush_tlb_info *f = info;
        unsigned long addr;

        /* flush the range one page at a time with 'invlpg' */
        for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
                __flush_tlb_single(addr);
}

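/*
 * Flush a range of kernel virtual addresses on all CPUs, as the vmalloc
 * and vunmap paths do after changing kernel page tables.  Small ranges are
 * flushed page by page; anything above the ceiling falls back to a global
 * flush on each CPU.
 *
 * Sketch of a typical call (names illustrative only):
 *
 *      flush_tlb_kernel_range(addr, addr + nr_pages * PAGE_SIZE);
 */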
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
        /* Balance against a user-space task's flush; stay a bit conservative. */
        if (end == TLB_FLUSH_ALL ||
            (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
                on_each_cpu(do_flush_tlb_all, NULL, 1);
        } else {
                struct flush_tlb_info info;
                info.flush_start = start;
                info.flush_end = end;
                on_each_cpu(do_kernel_range_flush, &info, 1);
        }
}

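/*
 * debugfs interface for tuning tlb_single_page_flush_ceiling at runtime.
 * With debugfs mounted at /sys/kernel/debug, the knob appears as
 * /sys/kernel/debug/x86/tlb_single_page_flush_ceiling and is read and
 * written as a plain decimal page count; writing 0 turns every range
 * flush into a full flush.
 */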
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
                             size_t count, loff_t *ppos)
{
        char buf[32];
        unsigned int len;

        len = sprintf(buf, "%lu\n", tlb_single_page_flush_ceiling);
        return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}

static ssize_t tlbflush_write_file(struct file *file,
                 const char __user *user_buf, size_t count, loff_t *ppos)
{
        char buf[32];
        ssize_t len;
        int ceiling;

        len = min(count, sizeof(buf) - 1);
        if (copy_from_user(buf, user_buf, len))
                return -EFAULT;

        buf[len] = '\0';
        if (kstrtoint(buf, 0, &ceiling))
                return -EINVAL;

        if (ceiling < 0)
                return -EINVAL;

        tlb_single_page_flush_ceiling = ceiling;
        return count;
}

static const struct file_operations fops_tlbflush = {
        .read = tlbflush_read_file,
        .write = tlbflush_write_file,
        .llseek = default_llseek,
};

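/*
 * Create the debugfs file at boot.  late_initcall() is sufficient here:
 * the knob is a tuning aid only and just needs to exist before userspace
 * can reach debugfs.
 */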
static int __init create_tlb_single_page_flush_ceiling(void)
{
        debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
                            arch_debugfs_dir, NULL, &fops_tlbflush);
        return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);