Linux-libre 3.0.94-gnu1
[librecmc/linux-libre.git] / mm / hugetlb.c
1 /*
2  * Generic hugetlb support.
3  * (C) William Irwin, April 2004
4  */
5 #include <linux/list.h>
6 #include <linux/init.h>
7 #include <linux/module.h>
8 #include <linux/mm.h>
9 #include <linux/seq_file.h>
10 #include <linux/sysctl.h>
11 #include <linux/highmem.h>
12 #include <linux/mmu_notifier.h>
13 #include <linux/nodemask.h>
14 #include <linux/pagemap.h>
15 #include <linux/mempolicy.h>
16 #include <linux/cpuset.h>
17 #include <linux/mutex.h>
18 #include <linux/bootmem.h>
19 #include <linux/sysfs.h>
20 #include <linux/slab.h>
21 #include <linux/rmap.h>
22 #include <linux/swap.h>
23 #include <linux/swapops.h>
24
25 #include <asm/page.h>
26 #include <asm/pgtable.h>
27 #include <asm/io.h>
28
29 #include <linux/hugetlb.h>
30 #include <linux/node.h>
31 #include "internal.h"
32
33 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
34 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
35 unsigned long hugepages_treat_as_movable;
36
37 static int max_hstate;
38 unsigned int default_hstate_idx;
39 struct hstate hstates[HUGE_MAX_HSTATE];
40
41 __initdata LIST_HEAD(huge_boot_pages);
42
43 /* for command line parsing */
44 static struct hstate * __initdata parsed_hstate;
45 static unsigned long __initdata default_hstate_max_huge_pages;
46 static unsigned long __initdata default_hstate_size;
47
48 #define for_each_hstate(h) \
49         for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
50
51 /*
52  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
53  */
54 static DEFINE_SPINLOCK(hugetlb_lock);
55
56 /*
57  * Region tracking -- allows tracking of reservations and instantiated pages
58  *                    across the pages in a mapping.
59  *
60  * The region data structures are protected by a combination of the mmap_sem
61  * and the hugetlb_instantion_mutex.  To access or modify a region the caller
62  * must either hold the mmap_sem for write, or the mmap_sem for read and
63  * the hugetlb_instantiation mutex:
64  *
65  *      down_write(&mm->mmap_sem);
66  * or
67  *      down_read(&mm->mmap_sem);
68  *      mutex_lock(&hugetlb_instantiation_mutex);
69  */
70 struct file_region {
71         struct list_head link;
72         long from;
73         long to;
74 };
75
76 static long region_add(struct list_head *head, long f, long t)
77 {
78         struct file_region *rg, *nrg, *trg;
79
80         /* Locate the region we are either in or before. */
81         list_for_each_entry(rg, head, link)
82                 if (f <= rg->to)
83                         break;
84
85         /* Round our left edge to the current segment if it encloses us. */
86         if (f > rg->from)
87                 f = rg->from;
88
89         /* Check for and consume any regions we now overlap with. */
90         nrg = rg;
91         list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
92                 if (&rg->link == head)
93                         break;
94                 if (rg->from > t)
95                         break;
96
97                 /* If this area reaches higher then extend our area to
98                  * include it completely.  If this is not the first area
99                  * which we intend to reuse, free it. */
100                 if (rg->to > t)
101                         t = rg->to;
102                 if (rg != nrg) {
103                         list_del(&rg->link);
104                         kfree(rg);
105                 }
106         }
107         nrg->from = f;
108         nrg->to = t;
109         return 0;
110 }
111
112 static long region_chg(struct list_head *head, long f, long t)
113 {
114         struct file_region *rg, *nrg;
115         long chg = 0;
116
117         /* Locate the region we are before or in. */
118         list_for_each_entry(rg, head, link)
119                 if (f <= rg->to)
120                         break;
121
122         /* If we are below the current region then a new region is required.
123          * Subtle, allocate a new region at the position but make it zero
124          * size such that we can guarantee to record the reservation. */
125         if (&rg->link == head || t < rg->from) {
126                 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
127                 if (!nrg)
128                         return -ENOMEM;
129                 nrg->from = f;
130                 nrg->to   = f;
131                 INIT_LIST_HEAD(&nrg->link);
132                 list_add(&nrg->link, rg->link.prev);
133
134                 return t - f;
135         }
136
137         /* Round our left edge to the current segment if it encloses us. */
138         if (f > rg->from)
139                 f = rg->from;
140         chg = t - f;
141
142         /* Check for and consume any regions we now overlap with. */
143         list_for_each_entry(rg, rg->link.prev, link) {
144                 if (&rg->link == head)
145                         break;
146                 if (rg->from > t)
147                         return chg;
148
149                 /* We overlap with this area, if it extends further than
150                  * us then we must extend ourselves.  Account for its
151                  * existing reservation. */
152                 if (rg->to > t) {
153                         chg += rg->to - t;
154                         t = rg->to;
155                 }
156                 chg -= rg->to - rg->from;
157         }
158         return chg;
159 }
160
161 static long region_truncate(struct list_head *head, long end)
162 {
163         struct file_region *rg, *trg;
164         long chg = 0;
165
166         /* Locate the region we are either in or before. */
167         list_for_each_entry(rg, head, link)
168                 if (end <= rg->to)
169                         break;
170         if (&rg->link == head)
171                 return 0;
172
173         /* If we are in the middle of a region then adjust it. */
174         if (end > rg->from) {
175                 chg = rg->to - end;
176                 rg->to = end;
177                 rg = list_entry(rg->link.next, typeof(*rg), link);
178         }
179
180         /* Drop any remaining regions. */
181         list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
182                 if (&rg->link == head)
183                         break;
184                 chg += rg->to - rg->from;
185                 list_del(&rg->link);
186                 kfree(rg);
187         }
188         return chg;
189 }
190
191 static long region_count(struct list_head *head, long f, long t)
192 {
193         struct file_region *rg;
194         long chg = 0;
195
196         /* Locate each segment we overlap with, and count that overlap. */
197         list_for_each_entry(rg, head, link) {
198                 int seg_from;
199                 int seg_to;
200
201                 if (rg->to <= f)
202                         continue;
203                 if (rg->from >= t)
204                         break;
205
206                 seg_from = max(rg->from, f);
207                 seg_to = min(rg->to, t);
208
209                 chg += seg_to - seg_from;
210         }
211
212         return chg;
213 }
214
215 /*
216  * Convert the address within this vma to the page offset within
217  * the mapping, in pagecache page units; huge pages here.
218  */
219 static pgoff_t vma_hugecache_offset(struct hstate *h,
220                         struct vm_area_struct *vma, unsigned long address)
221 {
222         return ((address - vma->vm_start) >> huge_page_shift(h)) +
223                         (vma->vm_pgoff >> huge_page_order(h));
224 }
225
226 pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
227                                      unsigned long address)
228 {
229         return vma_hugecache_offset(hstate_vma(vma), vma, address);
230 }
231
232 /*
233  * Return the size of the pages allocated when backing a VMA. In the majority
234  * cases this will be same size as used by the page table entries.
235  */
236 unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
237 {
238         struct hstate *hstate;
239
240         if (!is_vm_hugetlb_page(vma))
241                 return PAGE_SIZE;
242
243         hstate = hstate_vma(vma);
244
245         return 1UL << (hstate->order + PAGE_SHIFT);
246 }
247 EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
248
249 /*
250  * Return the page size being used by the MMU to back a VMA. In the majority
251  * of cases, the page size used by the kernel matches the MMU size. On
252  * architectures where it differs, an architecture-specific version of this
253  * function is required.
254  */
255 #ifndef vma_mmu_pagesize
256 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
257 {
258         return vma_kernel_pagesize(vma);
259 }
260 #endif
261
262 /*
263  * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
264  * bits of the reservation map pointer, which are always clear due to
265  * alignment.
266  */
267 #define HPAGE_RESV_OWNER    (1UL << 0)
268 #define HPAGE_RESV_UNMAPPED (1UL << 1)
269 #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
270
271 /*
272  * These helpers are used to track how many pages are reserved for
273  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
274  * is guaranteed to have their future faults succeed.
275  *
276  * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
277  * the reserve counters are updated with the hugetlb_lock held. It is safe
278  * to reset the VMA at fork() time as it is not in use yet and there is no
279  * chance of the global counters getting corrupted as a result of the values.
280  *
281  * The private mapping reservation is represented in a subtly different
282  * manner to a shared mapping.  A shared mapping has a region map associated
283  * with the underlying file, this region map represents the backing file
284  * pages which have ever had a reservation assigned which this persists even
285  * after the page is instantiated.  A private mapping has a region map
286  * associated with the original mmap which is attached to all VMAs which
287  * reference it, this region map represents those offsets which have consumed
288  * reservation ie. where pages have been instantiated.
289  */
290 static unsigned long get_vma_private_data(struct vm_area_struct *vma)
291 {
292         return (unsigned long)vma->vm_private_data;
293 }
294
295 static void set_vma_private_data(struct vm_area_struct *vma,
296                                                         unsigned long value)
297 {
298         vma->vm_private_data = (void *)value;
299 }
300
301 struct resv_map {
302         struct kref refs;
303         struct list_head regions;
304 };
305
306 static struct resv_map *resv_map_alloc(void)
307 {
308         struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
309         if (!resv_map)
310                 return NULL;
311
312         kref_init(&resv_map->refs);
313         INIT_LIST_HEAD(&resv_map->regions);
314
315         return resv_map;
316 }
317
318 static void resv_map_release(struct kref *ref)
319 {
320         struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
321
322         /* Clear out any active regions before we release the map. */
323         region_truncate(&resv_map->regions, 0);
324         kfree(resv_map);
325 }
326
327 static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
328 {
329         VM_BUG_ON(!is_vm_hugetlb_page(vma));
330         if (!(vma->vm_flags & VM_MAYSHARE))
331                 return (struct resv_map *)(get_vma_private_data(vma) &
332                                                         ~HPAGE_RESV_MASK);
333         return NULL;
334 }
335
336 static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
337 {
338         VM_BUG_ON(!is_vm_hugetlb_page(vma));
339         VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
340
341         set_vma_private_data(vma, (get_vma_private_data(vma) &
342                                 HPAGE_RESV_MASK) | (unsigned long)map);
343 }
344
345 static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
346 {
347         VM_BUG_ON(!is_vm_hugetlb_page(vma));
348         VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
349
350         set_vma_private_data(vma, get_vma_private_data(vma) | flags);
351 }
352
353 static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
354 {
355         VM_BUG_ON(!is_vm_hugetlb_page(vma));
356
357         return (get_vma_private_data(vma) & flag) != 0;
358 }
359
360 /* Decrement the reserved pages in the hugepage pool by one */
361 static void decrement_hugepage_resv_vma(struct hstate *h,
362                         struct vm_area_struct *vma)
363 {
364         if (vma->vm_flags & VM_NORESERVE)
365                 return;
366
367         if (vma->vm_flags & VM_MAYSHARE) {
368                 /* Shared mappings always use reserves */
369                 h->resv_huge_pages--;
370         } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
371                 /*
372                  * Only the process that called mmap() has reserves for
373                  * private mappings.
374                  */
375                 h->resv_huge_pages--;
376         }
377 }
378
379 /* Reset counters to 0 and clear all HPAGE_RESV_* flags */
380 void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
381 {
382         VM_BUG_ON(!is_vm_hugetlb_page(vma));
383         if (!(vma->vm_flags & VM_MAYSHARE))
384                 vma->vm_private_data = (void *)0;
385 }
386
387 /* Returns true if the VMA has associated reserve pages */
388 static int vma_has_reserves(struct vm_area_struct *vma)
389 {
390         if (vma->vm_flags & VM_MAYSHARE)
391                 return 1;
392         if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
393                 return 1;
394         return 0;
395 }
396
397 static void copy_gigantic_page(struct page *dst, struct page *src)
398 {
399         int i;
400         struct hstate *h = page_hstate(src);
401         struct page *dst_base = dst;
402         struct page *src_base = src;
403
404         for (i = 0; i < pages_per_huge_page(h); ) {
405                 cond_resched();
406                 copy_highpage(dst, src);
407
408                 i++;
409                 dst = mem_map_next(dst, dst_base, i);
410                 src = mem_map_next(src, src_base, i);
411         }
412 }
413
414 void copy_huge_page(struct page *dst, struct page *src)
415 {
416         int i;
417         struct hstate *h = page_hstate(src);
418
419         if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
420                 copy_gigantic_page(dst, src);
421                 return;
422         }
423
424         might_sleep();
425         for (i = 0; i < pages_per_huge_page(h); i++) {
426                 cond_resched();
427                 copy_highpage(dst + i, src + i);
428         }
429 }
430
431 static void enqueue_huge_page(struct hstate *h, struct page *page)
432 {
433         int nid = page_to_nid(page);
434         list_add(&page->lru, &h->hugepage_freelists[nid]);
435         h->free_huge_pages++;
436         h->free_huge_pages_node[nid]++;
437 }
438
439 static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
440 {
441         struct page *page;
442
443         if (list_empty(&h->hugepage_freelists[nid]))
444                 return NULL;
445         page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
446         list_del(&page->lru);
447         set_page_refcounted(page);
448         h->free_huge_pages--;
449         h->free_huge_pages_node[nid]--;
450         return page;
451 }
452
453 static struct page *dequeue_huge_page_vma(struct hstate *h,
454                                 struct vm_area_struct *vma,
455                                 unsigned long address, int avoid_reserve)
456 {
457         struct page *page = NULL;
458         struct mempolicy *mpol;
459         nodemask_t *nodemask;
460         struct zonelist *zonelist;
461         struct zone *zone;
462         struct zoneref *z;
463         unsigned int cpuset_mems_cookie;
464
465 retry_cpuset:
466         cpuset_mems_cookie = get_mems_allowed();
467         zonelist = huge_zonelist(vma, address,
468                                         htlb_alloc_mask, &mpol, &nodemask);
469         /*
470          * A child process with MAP_PRIVATE mappings created by their parent
471          * have no page reserves. This check ensures that reservations are
472          * not "stolen". The child may still get SIGKILLed
473          */
474         if (!vma_has_reserves(vma) &&
475                         h->free_huge_pages - h->resv_huge_pages == 0)
476                 goto err;
477
478         /* If reserves cannot be used, ensure enough pages are in the pool */
479         if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
480                 goto err;
481
482         for_each_zone_zonelist_nodemask(zone, z, zonelist,
483                                                 MAX_NR_ZONES - 1, nodemask) {
484                 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
485                         page = dequeue_huge_page_node(h, zone_to_nid(zone));
486                         if (page) {
487                                 if (!avoid_reserve)
488                                         decrement_hugepage_resv_vma(h, vma);
489                                 break;
490                         }
491                 }
492         }
493
494         mpol_cond_put(mpol);
495         if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
496                 goto retry_cpuset;
497         return page;
498
499 err:
500         mpol_cond_put(mpol);
501         return NULL;
502 }
503
504 static void update_and_free_page(struct hstate *h, struct page *page)
505 {
506         int i;
507
508         VM_BUG_ON(h->order >= MAX_ORDER);
509
510         h->nr_huge_pages--;
511         h->nr_huge_pages_node[page_to_nid(page)]--;
512         for (i = 0; i < pages_per_huge_page(h); i++) {
513                 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
514                                 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
515                                 1 << PG_private | 1<< PG_writeback);
516         }
517         set_compound_page_dtor(page, NULL);
518         set_page_refcounted(page);
519         arch_release_hugepage(page);
520         __free_pages(page, huge_page_order(h));
521 }
522
523 struct hstate *size_to_hstate(unsigned long size)
524 {
525         struct hstate *h;
526
527         for_each_hstate(h) {
528                 if (huge_page_size(h) == size)
529                         return h;
530         }
531         return NULL;
532 }
533
534 static void free_huge_page(struct page *page)
535 {
536         /*
537          * Can't pass hstate in here because it is called from the
538          * compound page destructor.
539          */
540         struct hstate *h = page_hstate(page);
541         int nid = page_to_nid(page);
542         struct address_space *mapping;
543
544         mapping = (struct address_space *) page_private(page);
545         set_page_private(page, 0);
546         page->mapping = NULL;
547         BUG_ON(page_count(page));
548         BUG_ON(page_mapcount(page));
549         INIT_LIST_HEAD(&page->lru);
550
551         spin_lock(&hugetlb_lock);
552         if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
553                 update_and_free_page(h, page);
554                 h->surplus_huge_pages--;
555                 h->surplus_huge_pages_node[nid]--;
556         } else {
557                 enqueue_huge_page(h, page);
558         }
559         spin_unlock(&hugetlb_lock);
560         if (mapping)
561                 hugetlb_put_quota(mapping, 1);
562 }
563
564 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
565 {
566         set_compound_page_dtor(page, free_huge_page);
567         spin_lock(&hugetlb_lock);
568         h->nr_huge_pages++;
569         h->nr_huge_pages_node[nid]++;
570         spin_unlock(&hugetlb_lock);
571         put_page(page); /* free it into the hugepage allocator */
572 }
573
574 static void prep_compound_gigantic_page(struct page *page, unsigned long order)
575 {
576         int i;
577         int nr_pages = 1 << order;
578         struct page *p = page + 1;
579
580         /* we rely on prep_new_huge_page to set the destructor */
581         set_compound_order(page, order);
582         __SetPageHead(page);
583         for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
584                 __SetPageTail(p);
585                 set_page_count(p, 0);
586                 p->first_page = page;
587         }
588 }
589
590 int PageHuge(struct page *page)
591 {
592         compound_page_dtor *dtor;
593
594         if (!PageCompound(page))
595                 return 0;
596
597         page = compound_head(page);
598         dtor = get_compound_page_dtor(page);
599
600         return dtor == free_huge_page;
601 }
602
603 EXPORT_SYMBOL_GPL(PageHuge);
604
605 pgoff_t __basepage_index(struct page *page)
606 {
607         struct page *page_head = compound_head(page);
608         pgoff_t index = page_index(page_head);
609         unsigned long compound_idx;
610
611         if (!PageHuge(page_head))
612                 return page_index(page);
613
614         if (compound_order(page_head) >= MAX_ORDER)
615                 compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
616         else
617                 compound_idx = page - page_head;
618
619         return (index << compound_order(page_head)) + compound_idx;
620 }
621
622 static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
623 {
624         struct page *page;
625
626         if (h->order >= MAX_ORDER)
627                 return NULL;
628
629         page = alloc_pages_exact_node(nid,
630                 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
631                                                 __GFP_REPEAT|__GFP_NOWARN,
632                 huge_page_order(h));
633         if (page) {
634                 if (arch_prepare_hugepage(page)) {
635                         __free_pages(page, huge_page_order(h));
636                         return NULL;
637                 }
638                 prep_new_huge_page(h, page, nid);
639         }
640
641         return page;
642 }
643
644 /*
645  * common helper functions for hstate_next_node_to_{alloc|free}.
646  * We may have allocated or freed a huge page based on a different
647  * nodes_allowed previously, so h->next_node_to_{alloc|free} might
648  * be outside of *nodes_allowed.  Ensure that we use an allowed
649  * node for alloc or free.
650  */
651 static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
652 {
653         nid = next_node(nid, *nodes_allowed);
654         if (nid == MAX_NUMNODES)
655                 nid = first_node(*nodes_allowed);
656         VM_BUG_ON(nid >= MAX_NUMNODES);
657
658         return nid;
659 }
660
661 static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
662 {
663         if (!node_isset(nid, *nodes_allowed))
664                 nid = next_node_allowed(nid, nodes_allowed);
665         return nid;
666 }
667
668 /*
669  * returns the previously saved node ["this node"] from which to
670  * allocate a persistent huge page for the pool and advance the
671  * next node from which to allocate, handling wrap at end of node
672  * mask.
673  */
674 static int hstate_next_node_to_alloc(struct hstate *h,
675                                         nodemask_t *nodes_allowed)
676 {
677         int nid;
678
679         VM_BUG_ON(!nodes_allowed);
680
681         nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
682         h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
683
684         return nid;
685 }
686
687 static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
688 {
689         struct page *page;
690         int start_nid;
691         int next_nid;
692         int ret = 0;
693
694         start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
695         next_nid = start_nid;
696
697         do {
698                 page = alloc_fresh_huge_page_node(h, next_nid);
699                 if (page) {
700                         ret = 1;
701                         break;
702                 }
703                 next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
704         } while (next_nid != start_nid);
705
706         if (ret)
707                 count_vm_event(HTLB_BUDDY_PGALLOC);
708         else
709                 count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
710
711         return ret;
712 }
713
714 /*
715  * helper for free_pool_huge_page() - return the previously saved
716  * node ["this node"] from which to free a huge page.  Advance the
717  * next node id whether or not we find a free huge page to free so
718  * that the next attempt to free addresses the next node.
719  */
720 static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
721 {
722         int nid;
723
724         VM_BUG_ON(!nodes_allowed);
725
726         nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
727         h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
728
729         return nid;
730 }
731
732 /*
733  * Free huge page from pool from next node to free.
734  * Attempt to keep persistent huge pages more or less
735  * balanced over allowed nodes.
736  * Called with hugetlb_lock locked.
737  */
738 static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
739                                                          bool acct_surplus)
740 {
741         int start_nid;
742         int next_nid;
743         int ret = 0;
744
745         start_nid = hstate_next_node_to_free(h, nodes_allowed);
746         next_nid = start_nid;
747
748         do {
749                 /*
750                  * If we're returning unused surplus pages, only examine
751                  * nodes with surplus pages.
752                  */
753                 if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
754                     !list_empty(&h->hugepage_freelists[next_nid])) {
755                         struct page *page =
756                                 list_entry(h->hugepage_freelists[next_nid].next,
757                                           struct page, lru);
758                         list_del(&page->lru);
759                         h->free_huge_pages--;
760                         h->free_huge_pages_node[next_nid]--;
761                         if (acct_surplus) {
762                                 h->surplus_huge_pages--;
763                                 h->surplus_huge_pages_node[next_nid]--;
764                         }
765                         update_and_free_page(h, page);
766                         ret = 1;
767                         break;
768                 }
769                 next_nid = hstate_next_node_to_free(h, nodes_allowed);
770         } while (next_nid != start_nid);
771
772         return ret;
773 }
774
775 static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
776 {
777         struct page *page;
778         unsigned int r_nid;
779
780         if (h->order >= MAX_ORDER)
781                 return NULL;
782
783         /*
784          * Assume we will successfully allocate the surplus page to
785          * prevent racing processes from causing the surplus to exceed
786          * overcommit
787          *
788          * This however introduces a different race, where a process B
789          * tries to grow the static hugepage pool while alloc_pages() is
790          * called by process A. B will only examine the per-node
791          * counters in determining if surplus huge pages can be
792          * converted to normal huge pages in adjust_pool_surplus(). A
793          * won't be able to increment the per-node counter, until the
794          * lock is dropped by B, but B doesn't drop hugetlb_lock until
795          * no more huge pages can be converted from surplus to normal
796          * state (and doesn't try to convert again). Thus, we have a
797          * case where a surplus huge page exists, the pool is grown, and
798          * the surplus huge page still exists after, even though it
799          * should just have been converted to a normal huge page. This
800          * does not leak memory, though, as the hugepage will be freed
801          * once it is out of use. It also does not allow the counters to
802          * go out of whack in adjust_pool_surplus() as we don't modify
803          * the node values until we've gotten the hugepage and only the
804          * per-node value is checked there.
805          */
806         spin_lock(&hugetlb_lock);
807         if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
808                 spin_unlock(&hugetlb_lock);
809                 return NULL;
810         } else {
811                 h->nr_huge_pages++;
812                 h->surplus_huge_pages++;
813         }
814         spin_unlock(&hugetlb_lock);
815
816         if (nid == NUMA_NO_NODE)
817                 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
818                                    __GFP_REPEAT|__GFP_NOWARN,
819                                    huge_page_order(h));
820         else
821                 page = alloc_pages_exact_node(nid,
822                         htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
823                         __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
824
825         if (page && arch_prepare_hugepage(page)) {
826                 __free_pages(page, huge_page_order(h));
827                 return NULL;
828         }
829
830         spin_lock(&hugetlb_lock);
831         if (page) {
832                 r_nid = page_to_nid(page);
833                 set_compound_page_dtor(page, free_huge_page);
834                 /*
835                  * We incremented the global counters already
836                  */
837                 h->nr_huge_pages_node[r_nid]++;
838                 h->surplus_huge_pages_node[r_nid]++;
839                 __count_vm_event(HTLB_BUDDY_PGALLOC);
840         } else {
841                 h->nr_huge_pages--;
842                 h->surplus_huge_pages--;
843                 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
844         }
845         spin_unlock(&hugetlb_lock);
846
847         return page;
848 }
849
850 /*
851  * This allocation function is useful in the context where vma is irrelevant.
852  * E.g. soft-offlining uses this function because it only cares physical
853  * address of error page.
854  */
855 struct page *alloc_huge_page_node(struct hstate *h, int nid)
856 {
857         struct page *page;
858
859         spin_lock(&hugetlb_lock);
860         page = dequeue_huge_page_node(h, nid);
861         spin_unlock(&hugetlb_lock);
862
863         if (!page)
864                 page = alloc_buddy_huge_page(h, nid);
865
866         return page;
867 }
868
869 /*
870  * Increase the hugetlb pool such that it can accommodate a reservation
871  * of size 'delta'.
872  */
873 static int gather_surplus_pages(struct hstate *h, int delta)
874 {
875         struct list_head surplus_list;
876         struct page *page, *tmp;
877         int ret, i;
878         int needed, allocated;
879
880         needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
881         if (needed <= 0) {
882                 h->resv_huge_pages += delta;
883                 return 0;
884         }
885
886         allocated = 0;
887         INIT_LIST_HEAD(&surplus_list);
888
889         ret = -ENOMEM;
890 retry:
891         spin_unlock(&hugetlb_lock);
892         for (i = 0; i < needed; i++) {
893                 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
894                 if (!page)
895                         /*
896                          * We were not able to allocate enough pages to
897                          * satisfy the entire reservation so we free what
898                          * we've allocated so far.
899                          */
900                         goto free;
901
902                 list_add(&page->lru, &surplus_list);
903         }
904         allocated += needed;
905
906         /*
907          * After retaking hugetlb_lock, we need to recalculate 'needed'
908          * because either resv_huge_pages or free_huge_pages may have changed.
909          */
910         spin_lock(&hugetlb_lock);
911         needed = (h->resv_huge_pages + delta) -
912                         (h->free_huge_pages + allocated);
913         if (needed > 0)
914                 goto retry;
915
916         /*
917          * The surplus_list now contains _at_least_ the number of extra pages
918          * needed to accommodate the reservation.  Add the appropriate number
919          * of pages to the hugetlb pool and free the extras back to the buddy
920          * allocator.  Commit the entire reservation here to prevent another
921          * process from stealing the pages as they are added to the pool but
922          * before they are reserved.
923          */
924         needed += allocated;
925         h->resv_huge_pages += delta;
926         ret = 0;
927
928         /* Free the needed pages to the hugetlb pool */
929         list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
930                 if ((--needed) < 0)
931                         break;
932                 list_del(&page->lru);
933                 /*
934                  * This page is now managed by the hugetlb allocator and has
935                  * no users -- drop the buddy allocator's reference.
936                  */
937                 put_page_testzero(page);
938                 VM_BUG_ON(page_count(page));
939                 enqueue_huge_page(h, page);
940         }
941         spin_unlock(&hugetlb_lock);
942
943         /* Free unnecessary surplus pages to the buddy allocator */
944 free:
945         if (!list_empty(&surplus_list)) {
946                 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
947                         list_del(&page->lru);
948                         put_page(page);
949                 }
950         }
951         spin_lock(&hugetlb_lock);
952
953         return ret;
954 }
955
956 /*
957  * When releasing a hugetlb pool reservation, any surplus pages that were
958  * allocated to satisfy the reservation must be explicitly freed if they were
959  * never used.
960  * Called with hugetlb_lock held.
961  */
962 static void return_unused_surplus_pages(struct hstate *h,
963                                         unsigned long unused_resv_pages)
964 {
965         unsigned long nr_pages;
966
967         /* Uncommit the reservation */
968         h->resv_huge_pages -= unused_resv_pages;
969
970         /* Cannot return gigantic pages currently */
971         if (h->order >= MAX_ORDER)
972                 return;
973
974         nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
975
976         /*
977          * We want to release as many surplus pages as possible, spread
978          * evenly across all nodes with memory. Iterate across these nodes
979          * until we can no longer free unreserved surplus pages. This occurs
980          * when the nodes with surplus pages have no free pages.
981          * free_pool_huge_page() will balance the the freed pages across the
982          * on-line nodes with memory and will handle the hstate accounting.
983          */
984         while (nr_pages--) {
985                 if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1))
986                         break;
987         }
988 }
989
990 /*
991  * Determine if the huge page at addr within the vma has an associated
992  * reservation.  Where it does not we will need to logically increase
993  * reservation and actually increase quota before an allocation can occur.
994  * Where any new reservation would be required the reservation change is
995  * prepared, but not committed.  Once the page has been quota'd allocated
996  * an instantiated the change should be committed via vma_commit_reservation.
997  * No action is required on failure.
998  */
999 static long vma_needs_reservation(struct hstate *h,
1000                         struct vm_area_struct *vma, unsigned long addr)
1001 {
1002         struct address_space *mapping = vma->vm_file->f_mapping;
1003         struct inode *inode = mapping->host;
1004
1005         if (vma->vm_flags & VM_MAYSHARE) {
1006                 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1007                 return region_chg(&inode->i_mapping->private_list,
1008                                                         idx, idx + 1);
1009
1010         } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1011                 return 1;
1012
1013         } else  {
1014                 long err;
1015                 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1016                 struct resv_map *reservations = vma_resv_map(vma);
1017
1018                 err = region_chg(&reservations->regions, idx, idx + 1);
1019                 if (err < 0)
1020                         return err;
1021                 return 0;
1022         }
1023 }
1024 static void vma_commit_reservation(struct hstate *h,
1025                         struct vm_area_struct *vma, unsigned long addr)
1026 {
1027         struct address_space *mapping = vma->vm_file->f_mapping;
1028         struct inode *inode = mapping->host;
1029
1030         if (vma->vm_flags & VM_MAYSHARE) {
1031                 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1032                 region_add(&inode->i_mapping->private_list, idx, idx + 1);
1033
1034         } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1035                 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1036                 struct resv_map *reservations = vma_resv_map(vma);
1037
1038                 /* Mark this page used in the map. */
1039                 region_add(&reservations->regions, idx, idx + 1);
1040         }
1041 }
1042
1043 static struct page *alloc_huge_page(struct vm_area_struct *vma,
1044                                     unsigned long addr, int avoid_reserve)
1045 {
1046         struct hstate *h = hstate_vma(vma);
1047         struct page *page;
1048         struct address_space *mapping = vma->vm_file->f_mapping;
1049         struct inode *inode = mapping->host;
1050         long chg;
1051
1052         /*
1053          * Processes that did not create the mapping will have no reserves and
1054          * will not have accounted against quota. Check that the quota can be
1055          * made before satisfying the allocation
1056          * MAP_NORESERVE mappings may also need pages and quota allocated
1057          * if no reserve mapping overlaps.
1058          */
1059         chg = vma_needs_reservation(h, vma, addr);
1060         if (chg < 0)
1061                 return ERR_PTR(-VM_FAULT_OOM);
1062         if (chg)
1063                 if (hugetlb_get_quota(inode->i_mapping, chg))
1064                         return ERR_PTR(-VM_FAULT_SIGBUS);
1065
1066         spin_lock(&hugetlb_lock);
1067         page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
1068         spin_unlock(&hugetlb_lock);
1069
1070         if (!page) {
1071                 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1072                 if (!page) {
1073                         hugetlb_put_quota(inode->i_mapping, chg);
1074                         return ERR_PTR(-VM_FAULT_SIGBUS);
1075                 }
1076         }
1077
1078         set_page_private(page, (unsigned long) mapping);
1079
1080         vma_commit_reservation(h, vma, addr);
1081
1082         return page;
1083 }
1084
1085 int __weak alloc_bootmem_huge_page(struct hstate *h)
1086 {
1087         struct huge_bootmem_page *m;
1088         int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
1089
1090         while (nr_nodes) {
1091                 void *addr;
1092
1093                 addr = __alloc_bootmem_node_nopanic(
1094                                 NODE_DATA(hstate_next_node_to_alloc(h,
1095                                                 &node_states[N_HIGH_MEMORY])),
1096                                 huge_page_size(h), huge_page_size(h), 0);
1097
1098                 if (addr) {
1099                         /*
1100                          * Use the beginning of the huge page to store the
1101                          * huge_bootmem_page struct (until gather_bootmem
1102                          * puts them into the mem_map).
1103                          */
1104                         m = addr;
1105                         goto found;
1106                 }
1107                 nr_nodes--;
1108         }
1109         return 0;
1110
1111 found:
1112         BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
1113         /* Put them into a private list first because mem_map is not up yet */
1114         list_add(&m->list, &huge_boot_pages);
1115         m->hstate = h;
1116         return 1;
1117 }
1118
1119 static void prep_compound_huge_page(struct page *page, int order)
1120 {
1121         if (unlikely(order > (MAX_ORDER - 1)))
1122                 prep_compound_gigantic_page(page, order);
1123         else
1124                 prep_compound_page(page, order);
1125 }
1126
1127 /* Put bootmem huge pages into the standard lists after mem_map is up */
1128 static void __init gather_bootmem_prealloc(void)
1129 {
1130         struct huge_bootmem_page *m;
1131
1132         list_for_each_entry(m, &huge_boot_pages, list) {
1133                 struct page *page = virt_to_page(m);
1134                 struct hstate *h = m->hstate;
1135                 __ClearPageReserved(page);
1136                 WARN_ON(page_count(page) != 1);
1137                 prep_compound_huge_page(page, h->order);
1138                 prep_new_huge_page(h, page, page_to_nid(page));
1139                 /*
1140                  * If we had gigantic hugepages allocated at boot time, we need
1141                  * to restore the 'stolen' pages to totalram_pages in order to
1142                  * fix confusing memory reports from free(1) and another
1143                  * side-effects, like CommitLimit going negative.
1144                  */
1145                 if (h->order > (MAX_ORDER - 1))
1146                         totalram_pages += 1 << h->order;
1147         }
1148 }
1149
1150 static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1151 {
1152         unsigned long i;
1153
1154         for (i = 0; i < h->max_huge_pages; ++i) {
1155                 if (h->order >= MAX_ORDER) {
1156                         if (!alloc_bootmem_huge_page(h))
1157                                 break;
1158                 } else if (!alloc_fresh_huge_page(h,
1159                                          &node_states[N_HIGH_MEMORY]))
1160                         break;
1161         }
1162         h->max_huge_pages = i;
1163 }
1164
1165 static void __init hugetlb_init_hstates(void)
1166 {
1167         struct hstate *h;
1168
1169         for_each_hstate(h) {
1170                 /* oversize hugepages were init'ed in early boot */
1171                 if (h->order < MAX_ORDER)
1172                         hugetlb_hstate_alloc_pages(h);
1173         }
1174 }
1175
1176 static char * __init memfmt(char *buf, unsigned long n)
1177 {
1178         if (n >= (1UL << 30))
1179                 sprintf(buf, "%lu GB", n >> 30);
1180         else if (n >= (1UL << 20))
1181                 sprintf(buf, "%lu MB", n >> 20);
1182         else
1183                 sprintf(buf, "%lu KB", n >> 10);
1184         return buf;
1185 }
1186
1187 static void __init report_hugepages(void)
1188 {
1189         struct hstate *h;
1190
1191         for_each_hstate(h) {
1192                 char buf[32];
1193                 printk(KERN_INFO "HugeTLB registered %s page size, "
1194                                  "pre-allocated %ld pages\n",
1195                         memfmt(buf, huge_page_size(h)),
1196                         h->free_huge_pages);
1197         }
1198 }
1199
1200 #ifdef CONFIG_HIGHMEM
1201 static void try_to_free_low(struct hstate *h, unsigned long count,
1202                                                 nodemask_t *nodes_allowed)
1203 {
1204         int i;
1205
1206         if (h->order >= MAX_ORDER)
1207                 return;
1208
1209         for_each_node_mask(i, *nodes_allowed) {
1210                 struct page *page, *next;
1211                 struct list_head *freel = &h->hugepage_freelists[i];
1212                 list_for_each_entry_safe(page, next, freel, lru) {
1213                         if (count >= h->nr_huge_pages)
1214                                 return;
1215                         if (PageHighMem(page))
1216                                 continue;
1217                         list_del(&page->lru);
1218                         update_and_free_page(h, page);
1219                         h->free_huge_pages--;
1220                         h->free_huge_pages_node[page_to_nid(page)]--;
1221                 }
1222         }
1223 }
1224 #else
1225 static inline void try_to_free_low(struct hstate *h, unsigned long count,
1226                                                 nodemask_t *nodes_allowed)
1227 {
1228 }
1229 #endif
1230
1231 /*
1232  * Increment or decrement surplus_huge_pages.  Keep node-specific counters
1233  * balanced by operating on them in a round-robin fashion.
1234  * Returns 1 if an adjustment was made.
1235  */
1236 static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
1237                                 int delta)
1238 {
1239         int start_nid, next_nid;
1240         int ret = 0;
1241
1242         VM_BUG_ON(delta != -1 && delta != 1);
1243
1244         if (delta < 0)
1245                 start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
1246         else
1247                 start_nid = hstate_next_node_to_free(h, nodes_allowed);
1248         next_nid = start_nid;
1249
1250         do {
1251                 int nid = next_nid;
1252                 if (delta < 0)  {
1253                         /*
1254                          * To shrink on this node, there must be a surplus page
1255                          */
1256                         if (!h->surplus_huge_pages_node[nid]) {
1257                                 next_nid = hstate_next_node_to_alloc(h,
1258                                                                 nodes_allowed);
1259                                 continue;
1260                         }
1261                 }
1262                 if (delta > 0) {
1263                         /*
1264                          * Surplus cannot exceed the total number of pages
1265                          */
1266                         if (h->surplus_huge_pages_node[nid] >=
1267                                                 h->nr_huge_pages_node[nid]) {
1268                                 next_nid = hstate_next_node_to_free(h,
1269                                                                 nodes_allowed);
1270                                 continue;
1271                         }
1272                 }
1273
1274                 h->surplus_huge_pages += delta;
1275                 h->surplus_huge_pages_node[nid] += delta;
1276                 ret = 1;
1277                 break;
1278         } while (next_nid != start_nid);
1279
1280         return ret;
1281 }
1282
1283 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
1284 static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
1285                                                 nodemask_t *nodes_allowed)
1286 {
1287         unsigned long min_count, ret;
1288
1289         if (h->order >= MAX_ORDER)
1290                 return h->max_huge_pages;
1291
1292         /*
1293          * Increase the pool size
1294          * First take pages out of surplus state.  Then make up the
1295          * remaining difference by allocating fresh huge pages.
1296          *
1297          * We might race with alloc_buddy_huge_page() here and be unable
1298          * to convert a surplus huge page to a normal huge page. That is
1299          * not critical, though, it just means the overall size of the
1300          * pool might be one hugepage larger than it needs to be, but
1301          * within all the constraints specified by the sysctls.
1302          */
1303         spin_lock(&hugetlb_lock);
1304         while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
1305                 if (!adjust_pool_surplus(h, nodes_allowed, -1))
1306                         break;
1307         }
1308
1309         while (count > persistent_huge_pages(h)) {
1310                 /*
1311                  * If this allocation races such that we no longer need the
1312                  * page, free_huge_page will handle it by freeing the page
1313                  * and reducing the surplus.
1314                  */
1315                 spin_unlock(&hugetlb_lock);
1316                 ret = alloc_fresh_huge_page(h, nodes_allowed);
1317                 spin_lock(&hugetlb_lock);
1318                 if (!ret)
1319                         goto out;
1320
1321                 /* Bail for signals. Probably ctrl-c from user */
1322                 if (signal_pending(current))
1323                         goto out;
1324         }
1325
1326         /*
1327          * Decrease the pool size
1328          * First return free pages to the buddy allocator (being careful
1329          * to keep enough around to satisfy reservations).  Then place
1330          * pages into surplus state as needed so the pool will shrink
1331          * to the desired size as pages become free.
1332          *
1333          * By placing pages into the surplus state independent of the
1334          * overcommit value, we are allowing the surplus pool size to
1335          * exceed overcommit. There are few sane options here. Since
1336          * alloc_buddy_huge_page() is checking the global counter,
1337          * though, we'll note that we're not allowed to exceed surplus
1338          * and won't grow the pool anywhere else. Not until one of the
1339          * sysctls are changed, or the surplus pages go out of use.
1340          */
1341         min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
1342         min_count = max(count, min_count);
1343         try_to_free_low(h, min_count, nodes_allowed);
1344         while (min_count < persistent_huge_pages(h)) {
1345                 if (!free_pool_huge_page(h, nodes_allowed, 0))
1346                         break;
1347         }
1348         while (count < persistent_huge_pages(h)) {
1349                 if (!adjust_pool_surplus(h, nodes_allowed, 1))
1350                         break;
1351         }
1352 out:
1353         ret = persistent_huge_pages(h);
1354         spin_unlock(&hugetlb_lock);
1355         return ret;
1356 }
1357
1358 #define HSTATE_ATTR_RO(_name) \
1359         static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
1360
1361 #define HSTATE_ATTR(_name) \
1362         static struct kobj_attribute _name##_attr = \
1363                 __ATTR(_name, 0644, _name##_show, _name##_store)
1364
1365 static struct kobject *hugepages_kobj;
1366 static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1367
1368 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
1369
1370 static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
1371 {
1372         int i;
1373
1374         for (i = 0; i < HUGE_MAX_HSTATE; i++)
1375                 if (hstate_kobjs[i] == kobj) {
1376                         if (nidp)
1377                                 *nidp = NUMA_NO_NODE;
1378                         return &hstates[i];
1379                 }
1380
1381         return kobj_to_node_hstate(kobj, nidp);
1382 }
1383
1384 static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1385                                         struct kobj_attribute *attr, char *buf)
1386 {
1387         struct hstate *h;
1388         unsigned long nr_huge_pages;
1389         int nid;
1390
1391         h = kobj_to_hstate(kobj, &nid);
1392         if (nid == NUMA_NO_NODE)
1393                 nr_huge_pages = h->nr_huge_pages;
1394         else
1395                 nr_huge_pages = h->nr_huge_pages_node[nid];
1396
1397         return sprintf(buf, "%lu\n", nr_huge_pages);
1398 }
1399
1400 static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1401                         struct kobject *kobj, struct kobj_attribute *attr,
1402                         const char *buf, size_t len)
1403 {
1404         int err;
1405         int nid;
1406         unsigned long count;
1407         struct hstate *h;
1408         NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
1409
1410         err = strict_strtoul(buf, 10, &count);
1411         if (err)
1412                 goto out;
1413
1414         h = kobj_to_hstate(kobj, &nid);
1415         if (h->order >= MAX_ORDER) {
1416                 err = -EINVAL;
1417                 goto out;
1418         }
1419
1420         if (nid == NUMA_NO_NODE) {
1421                 /*
1422                  * global hstate attribute
1423                  */
1424                 if (!(obey_mempolicy &&
1425                                 init_nodemask_of_mempolicy(nodes_allowed))) {
1426                         NODEMASK_FREE(nodes_allowed);
1427                         nodes_allowed = &node_states[N_HIGH_MEMORY];
1428                 }
1429         } else if (nodes_allowed) {
1430                 /*
1431                  * per node hstate attribute: adjust count to global,
1432                  * but restrict alloc/free to the specified node.
1433                  */
1434                 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
1435                 init_nodemask_of_node(nodes_allowed, nid);
1436         } else
1437                 nodes_allowed = &node_states[N_HIGH_MEMORY];
1438
1439         h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
1440
1441         if (nodes_allowed != &node_states[N_HIGH_MEMORY])
1442                 NODEMASK_FREE(nodes_allowed);
1443
1444         return len;
1445 out:
1446         NODEMASK_FREE(nodes_allowed);
1447         return err;
1448 }
1449
1450 static ssize_t nr_hugepages_show(struct kobject *kobj,
1451                                        struct kobj_attribute *attr, char *buf)
1452 {
1453         return nr_hugepages_show_common(kobj, attr, buf);
1454 }
1455
1456 static ssize_t nr_hugepages_store(struct kobject *kobj,
1457                struct kobj_attribute *attr, const char *buf, size_t len)
1458 {
1459         return nr_hugepages_store_common(false, kobj, attr, buf, len);
1460 }
1461 HSTATE_ATTR(nr_hugepages);
1462
1463 #ifdef CONFIG_NUMA
1464
1465 /*
1466  * hstate attribute for optionally mempolicy-based constraint on persistent
1467  * huge page alloc/free.
1468  */
1469 static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
1470                                        struct kobj_attribute *attr, char *buf)
1471 {
1472         return nr_hugepages_show_common(kobj, attr, buf);
1473 }
1474
1475 static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
1476                struct kobj_attribute *attr, const char *buf, size_t len)
1477 {
1478         return nr_hugepages_store_common(true, kobj, attr, buf, len);
1479 }
1480 HSTATE_ATTR(nr_hugepages_mempolicy);
1481 #endif
1482
1483
1484 static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1485                                         struct kobj_attribute *attr, char *buf)
1486 {
1487         struct hstate *h = kobj_to_hstate(kobj, NULL);
1488         return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1489 }
1490
1491 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1492                 struct kobj_attribute *attr, const char *buf, size_t count)
1493 {
1494         int err;
1495         unsigned long input;
1496         struct hstate *h = kobj_to_hstate(kobj, NULL);
1497
1498         if (h->order >= MAX_ORDER)
1499                 return -EINVAL;
1500
1501         err = strict_strtoul(buf, 10, &input);
1502         if (err)
1503                 return err;
1504
1505         spin_lock(&hugetlb_lock);
1506         h->nr_overcommit_huge_pages = input;
1507         spin_unlock(&hugetlb_lock);
1508
1509         return count;
1510 }
1511 HSTATE_ATTR(nr_overcommit_hugepages);
1512
1513 static ssize_t free_hugepages_show(struct kobject *kobj,
1514                                         struct kobj_attribute *attr, char *buf)
1515 {
1516         struct hstate *h;
1517         unsigned long free_huge_pages;
1518         int nid;
1519
1520         h = kobj_to_hstate(kobj, &nid);
1521         if (nid == NUMA_NO_NODE)
1522                 free_huge_pages = h->free_huge_pages;
1523         else
1524                 free_huge_pages = h->free_huge_pages_node[nid];
1525
1526         return sprintf(buf, "%lu\n", free_huge_pages);
1527 }
1528 HSTATE_ATTR_RO(free_hugepages);
1529
1530 static ssize_t resv_hugepages_show(struct kobject *kobj,
1531                                         struct kobj_attribute *attr, char *buf)
1532 {
1533         struct hstate *h = kobj_to_hstate(kobj, NULL);
1534         return sprintf(buf, "%lu\n", h->resv_huge_pages);
1535 }
1536 HSTATE_ATTR_RO(resv_hugepages);
1537
1538 static ssize_t surplus_hugepages_show(struct kobject *kobj,
1539                                         struct kobj_attribute *attr, char *buf)
1540 {
1541         struct hstate *h;
1542         unsigned long surplus_huge_pages;
1543         int nid;
1544
1545         h = kobj_to_hstate(kobj, &nid);
1546         if (nid == NUMA_NO_NODE)
1547                 surplus_huge_pages = h->surplus_huge_pages;
1548         else
1549                 surplus_huge_pages = h->surplus_huge_pages_node[nid];
1550
1551         return sprintf(buf, "%lu\n", surplus_huge_pages);
1552 }
1553 HSTATE_ATTR_RO(surplus_hugepages);
1554
1555 static struct attribute *hstate_attrs[] = {
1556         &nr_hugepages_attr.attr,
1557         &nr_overcommit_hugepages_attr.attr,
1558         &free_hugepages_attr.attr,
1559         &resv_hugepages_attr.attr,
1560         &surplus_hugepages_attr.attr,
1561 #ifdef CONFIG_NUMA
1562         &nr_hugepages_mempolicy_attr.attr,
1563 #endif
1564         NULL,
1565 };
1566
1567 static struct attribute_group hstate_attr_group = {
1568         .attrs = hstate_attrs,
1569 };
1570
1571 static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
1572                                     struct kobject **hstate_kobjs,
1573                                     struct attribute_group *hstate_attr_group)
1574 {
1575         int retval;
1576         int hi = h - hstates;
1577
1578         hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
1579         if (!hstate_kobjs[hi])
1580                 return -ENOMEM;
1581
1582         retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
1583         if (retval)
1584                 kobject_put(hstate_kobjs[hi]);
1585
1586         return retval;
1587 }
1588
1589 static void __init hugetlb_sysfs_init(void)
1590 {
1591         struct hstate *h;
1592         int err;
1593
1594         hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
1595         if (!hugepages_kobj)
1596                 return;
1597
1598         for_each_hstate(h) {
1599                 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
1600                                          hstate_kobjs, &hstate_attr_group);
1601                 if (err)
1602                         printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
1603                                                                 h->name);
1604         }
1605 }
1606
1607 #ifdef CONFIG_NUMA
1608
1609 /*
1610  * node_hstate/s - associate per node hstate attributes, via their kobjects,
1611  * with node sysdevs in node_devices[] using a parallel array.  The array
1612  * index of a node sysdev or _hstate == node id.
1613  * This is here to avoid any static dependency of the node sysdev driver, in
1614  * the base kernel, on the hugetlb module.
1615  */
1616 struct node_hstate {
1617         struct kobject          *hugepages_kobj;
1618         struct kobject          *hstate_kobjs[HUGE_MAX_HSTATE];
1619 };
1620 struct node_hstate node_hstates[MAX_NUMNODES];
1621
1622 /*
1623  * A subset of global hstate attributes for node sysdevs
1624  */
1625 static struct attribute *per_node_hstate_attrs[] = {
1626         &nr_hugepages_attr.attr,
1627         &free_hugepages_attr.attr,
1628         &surplus_hugepages_attr.attr,
1629         NULL,
1630 };
1631
1632 static struct attribute_group per_node_hstate_attr_group = {
1633         .attrs = per_node_hstate_attrs,
1634 };
1635
1636 /*
1637  * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj.
1638  * Returns node id via non-NULL nidp.
1639  */
1640 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
1641 {
1642         int nid;
1643
1644         for (nid = 0; nid < nr_node_ids; nid++) {
1645                 struct node_hstate *nhs = &node_hstates[nid];
1646                 int i;
1647                 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1648                         if (nhs->hstate_kobjs[i] == kobj) {
1649                                 if (nidp)
1650                                         *nidp = nid;
1651                                 return &hstates[i];
1652                         }
1653         }
1654
1655         BUG();
1656         return NULL;
1657 }
1658
1659 /*
1660  * Unregister hstate attributes from a single node sysdev.
1661  * No-op if no hstate attributes attached.
1662  */
1663 void hugetlb_unregister_node(struct node *node)
1664 {
1665         struct hstate *h;
1666         struct node_hstate *nhs = &node_hstates[node->sysdev.id];
1667
1668         if (!nhs->hugepages_kobj)
1669                 return;         /* no hstate attributes */
1670
1671         for_each_hstate(h)
1672                 if (nhs->hstate_kobjs[h - hstates]) {
1673                         kobject_put(nhs->hstate_kobjs[h - hstates]);
1674                         nhs->hstate_kobjs[h - hstates] = NULL;
1675                 }
1676
1677         kobject_put(nhs->hugepages_kobj);
1678         nhs->hugepages_kobj = NULL;
1679 }
1680
1681 /*
1682  * hugetlb module exit:  unregister hstate attributes from node sysdevs
1683  * that have them.
1684  */
1685 static void hugetlb_unregister_all_nodes(void)
1686 {
1687         int nid;
1688
1689         /*
1690          * disable node sysdev registrations.
1691          */
1692         register_hugetlbfs_with_node(NULL, NULL);
1693
1694         /*
1695          * remove hstate attributes from any nodes that have them.
1696          */
1697         for (nid = 0; nid < nr_node_ids; nid++)
1698                 hugetlb_unregister_node(&node_devices[nid]);
1699 }
1700
1701 /*
1702  * Register hstate attributes for a single node sysdev.
1703  * No-op if attributes already registered.
1704  */
1705 void hugetlb_register_node(struct node *node)
1706 {
1707         struct hstate *h;
1708         struct node_hstate *nhs = &node_hstates[node->sysdev.id];
1709         int err;
1710
1711         if (nhs->hugepages_kobj)
1712                 return;         /* already allocated */
1713
1714         nhs->hugepages_kobj = kobject_create_and_add("hugepages",
1715                                                         &node->sysdev.kobj);
1716         if (!nhs->hugepages_kobj)
1717                 return;
1718
1719         for_each_hstate(h) {
1720                 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
1721                                                 nhs->hstate_kobjs,
1722                                                 &per_node_hstate_attr_group);
1723                 if (err) {
1724                         printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
1725                                         " for node %d\n",
1726                                                 h->name, node->sysdev.id);
1727                         hugetlb_unregister_node(node);
1728                         break;
1729                 }
1730         }
1731 }
1732
1733 /*
1734  * hugetlb init time:  register hstate attributes for all registered node
1735  * sysdevs of nodes that have memory.  All on-line nodes should have
1736  * registered their associated sysdev by this time.
1737  */
1738 static void hugetlb_register_all_nodes(void)
1739 {
1740         int nid;
1741
1742         for_each_node_state(nid, N_HIGH_MEMORY) {
1743                 struct node *node = &node_devices[nid];
1744                 if (node->sysdev.id == nid)
1745                         hugetlb_register_node(node);
1746         }
1747
1748         /*
1749          * Let the node sysdev driver know we're here so it can
1750          * [un]register hstate attributes on node hotplug.
1751          */
1752         register_hugetlbfs_with_node(hugetlb_register_node,
1753                                      hugetlb_unregister_node);
1754 }
1755 #else   /* !CONFIG_NUMA */
1756
1757 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
1758 {
1759         BUG();
1760         if (nidp)
1761                 *nidp = -1;
1762         return NULL;
1763 }
1764
1765 static void hugetlb_unregister_all_nodes(void) { }
1766
1767 static void hugetlb_register_all_nodes(void) { }
1768
1769 #endif
1770
1771 static void __exit hugetlb_exit(void)
1772 {
1773         struct hstate *h;
1774
1775         hugetlb_unregister_all_nodes();
1776
1777         for_each_hstate(h) {
1778                 kobject_put(hstate_kobjs[h - hstates]);
1779         }
1780
1781         kobject_put(hugepages_kobj);
1782 }
1783 module_exit(hugetlb_exit);
1784
1785 static int __init hugetlb_init(void)
1786 {
1787         /* Some platform decide whether they support huge pages at boot
1788          * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
1789          * there is no such support
1790          */
1791         if (HPAGE_SHIFT == 0)
1792                 return 0;
1793
1794         if (!size_to_hstate(default_hstate_size)) {
1795                 default_hstate_size = HPAGE_SIZE;
1796                 if (!size_to_hstate(default_hstate_size))
1797                         hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1798         }
1799         default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
1800         if (default_hstate_max_huge_pages)
1801                 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1802
1803         hugetlb_init_hstates();
1804
1805         gather_bootmem_prealloc();
1806
1807         report_hugepages();
1808
1809         hugetlb_sysfs_init();
1810
1811         hugetlb_register_all_nodes();
1812
1813         return 0;
1814 }
1815 module_init(hugetlb_init);
1816
1817 /* Should be called on processing a hugepagesz=... option */
1818 void __init hugetlb_add_hstate(unsigned order)
1819 {
1820         struct hstate *h;
1821         unsigned long i;
1822
1823         if (size_to_hstate(PAGE_SIZE << order)) {
1824                 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
1825                 return;
1826         }
1827         BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
1828         BUG_ON(order == 0);
1829         h = &hstates[max_hstate++];
1830         h->order = order;
1831         h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
1832         h->nr_huge_pages = 0;
1833         h->free_huge_pages = 0;
1834         for (i = 0; i < MAX_NUMNODES; ++i)
1835                 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1836         h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
1837         h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
1838         snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1839                                         huge_page_size(h)/1024);
1840
1841         parsed_hstate = h;
1842 }
1843
1844 static int __init hugetlb_nrpages_setup(char *s)
1845 {
1846         unsigned long *mhp;
1847         static unsigned long *last_mhp;
1848
1849         /*
1850          * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
1851          * so this hugepages= parameter goes to the "default hstate".
1852          */
1853         if (!max_hstate)
1854                 mhp = &default_hstate_max_huge_pages;
1855         else
1856                 mhp = &parsed_hstate->max_huge_pages;
1857
1858         if (mhp == last_mhp) {
1859                 printk(KERN_WARNING "hugepages= specified twice without "
1860                         "interleaving hugepagesz=, ignoring\n");
1861                 return 1;
1862         }
1863
1864         if (sscanf(s, "%lu", mhp) <= 0)
1865                 *mhp = 0;
1866
1867         /*
1868          * Global state is always initialized later in hugetlb_init.
1869          * But we need to allocate >= MAX_ORDER hstates here early to still
1870          * use the bootmem allocator.
1871          */
1872         if (max_hstate && parsed_hstate->order >= MAX_ORDER)
1873                 hugetlb_hstate_alloc_pages(parsed_hstate);
1874
1875         last_mhp = mhp;
1876
1877         return 1;
1878 }
1879 __setup("hugepages=", hugetlb_nrpages_setup);
1880
1881 static int __init hugetlb_default_setup(char *s)
1882 {
1883         default_hstate_size = memparse(s, &s);
1884         return 1;
1885 }
1886 __setup("default_hugepagesz=", hugetlb_default_setup);
1887
1888 static unsigned int cpuset_mems_nr(unsigned int *array)
1889 {
1890         int node;
1891         unsigned int nr = 0;
1892
1893         for_each_node_mask(node, cpuset_current_mems_allowed)
1894                 nr += array[node];
1895
1896         return nr;
1897 }
1898
1899 #ifdef CONFIG_SYSCTL
1900 static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1901                          struct ctl_table *table, int write,
1902                          void __user *buffer, size_t *length, loff_t *ppos)
1903 {
1904         struct hstate *h = &default_hstate;
1905         unsigned long tmp;
1906         int ret;
1907
1908         tmp = h->max_huge_pages;
1909
1910         if (write && h->order >= MAX_ORDER)
1911                 return -EINVAL;
1912
1913         table->data = &tmp;
1914         table->maxlen = sizeof(unsigned long);
1915         ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
1916         if (ret)
1917                 goto out;
1918
1919         if (write) {
1920                 NODEMASK_ALLOC(nodemask_t, nodes_allowed,
1921                                                 GFP_KERNEL | __GFP_NORETRY);
1922                 if (!(obey_mempolicy &&
1923                                init_nodemask_of_mempolicy(nodes_allowed))) {
1924                         NODEMASK_FREE(nodes_allowed);
1925                         nodes_allowed = &node_states[N_HIGH_MEMORY];
1926                 }
1927                 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
1928
1929                 if (nodes_allowed != &node_states[N_HIGH_MEMORY])
1930                         NODEMASK_FREE(nodes_allowed);
1931         }
1932 out:
1933         return ret;
1934 }
1935
1936 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1937                           void __user *buffer, size_t *length, loff_t *ppos)
1938 {
1939
1940         return hugetlb_sysctl_handler_common(false, table, write,
1941                                                         buffer, length, ppos);
1942 }
1943
1944 #ifdef CONFIG_NUMA
1945 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
1946                           void __user *buffer, size_t *length, loff_t *ppos)
1947 {
1948         return hugetlb_sysctl_handler_common(true, table, write,
1949                                                         buffer, length, ppos);
1950 }
1951 #endif /* CONFIG_NUMA */
1952
1953 int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
1954                         void __user *buffer,
1955                         size_t *length, loff_t *ppos)
1956 {
1957         proc_dointvec(table, write, buffer, length, ppos);
1958         if (hugepages_treat_as_movable)
1959                 htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
1960         else
1961                 htlb_alloc_mask = GFP_HIGHUSER;
1962         return 0;
1963 }
1964
1965 int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1966                         void __user *buffer,
1967                         size_t *length, loff_t *ppos)
1968 {
1969         struct hstate *h = &default_hstate;
1970         unsigned long tmp;
1971         int ret;
1972
1973         tmp = h->nr_overcommit_huge_pages;
1974
1975         if (write && h->order >= MAX_ORDER)
1976                 return -EINVAL;
1977
1978         table->data = &tmp;
1979         table->maxlen = sizeof(unsigned long);
1980         ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
1981         if (ret)
1982                 goto out;
1983
1984         if (write) {
1985                 spin_lock(&hugetlb_lock);
1986                 h->nr_overcommit_huge_pages = tmp;
1987                 spin_unlock(&hugetlb_lock);
1988         }
1989 out:
1990         return ret;
1991 }
1992
1993 #endif /* CONFIG_SYSCTL */
1994
1995 void hugetlb_report_meminfo(struct seq_file *m)
1996 {
1997         struct hstate *h = &default_hstate;
1998         seq_printf(m,
1999                         "HugePages_Total:   %5lu\n"
2000                         "HugePages_Free:    %5lu\n"
2001                         "HugePages_Rsvd:    %5lu\n"
2002                         "HugePages_Surp:    %5lu\n"
2003                         "Hugepagesize:   %8lu kB\n",
2004                         h->nr_huge_pages,
2005                         h->free_huge_pages,
2006                         h->resv_huge_pages,
2007                         h->surplus_huge_pages,
2008                         1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
2009 }
2010
2011 int hugetlb_report_node_meminfo(int nid, char *buf)
2012 {
2013         struct hstate *h = &default_hstate;
2014         return sprintf(buf,
2015                 "Node %d HugePages_Total: %5u\n"
2016                 "Node %d HugePages_Free:  %5u\n"
2017                 "Node %d HugePages_Surp:  %5u\n",
2018                 nid, h->nr_huge_pages_node[nid],
2019                 nid, h->free_huge_pages_node[nid],
2020                 nid, h->surplus_huge_pages_node[nid]);
2021 }
2022
2023 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
2024 unsigned long hugetlb_total_pages(void)
2025 {
2026         struct hstate *h;
2027         unsigned long nr_total_pages = 0;
2028
2029         for_each_hstate(h)
2030                 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
2031         return nr_total_pages;
2032 }
2033
2034 static int hugetlb_acct_memory(struct hstate *h, long delta)
2035 {
2036         int ret = -ENOMEM;
2037
2038         spin_lock(&hugetlb_lock);
2039         /*
2040          * When cpuset is configured, it breaks the strict hugetlb page
2041          * reservation as the accounting is done on a global variable. Such
2042          * reservation is completely rubbish in the presence of cpuset because
2043          * the reservation is not checked against page availability for the
2044          * current cpuset. Application can still potentially OOM'ed by kernel
2045          * with lack of free htlb page in cpuset that the task is in.
2046          * Attempt to enforce strict accounting with cpuset is almost
2047          * impossible (or too ugly) because cpuset is too fluid that
2048          * task or memory node can be dynamically moved between cpusets.
2049          *
2050          * The change of semantics for shared hugetlb mapping with cpuset is
2051          * undesirable. However, in order to preserve some of the semantics,
2052          * we fall back to check against current free page availability as
2053          * a best attempt and hopefully to minimize the impact of changing
2054          * semantics that cpuset has.
2055          */
2056         if (delta > 0) {
2057                 if (gather_surplus_pages(h, delta) < 0)
2058                         goto out;
2059
2060                 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
2061                         return_unused_surplus_pages(h, delta);
2062                         goto out;
2063                 }
2064         }
2065
2066         ret = 0;
2067         if (delta < 0)
2068                 return_unused_surplus_pages(h, (unsigned long) -delta);
2069
2070 out:
2071         spin_unlock(&hugetlb_lock);
2072         return ret;
2073 }
2074
2075 static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2076 {
2077         struct resv_map *reservations = vma_resv_map(vma);
2078
2079         /*
2080          * This new VMA should share its siblings reservation map if present.
2081          * The VMA will only ever have a valid reservation map pointer where
2082          * it is being copied for another still existing VMA.  As that VMA
2083          * has a reference to the reservation map it cannot disappear until
2084          * after this open call completes.  It is therefore safe to take a
2085          * new reference here without additional locking.
2086          */
2087         if (reservations)
2088                 kref_get(&reservations->refs);
2089 }
2090
2091 static void resv_map_put(struct vm_area_struct *vma)
2092 {
2093         struct resv_map *reservations = vma_resv_map(vma);
2094
2095         if (!reservations)
2096                 return;
2097         kref_put(&reservations->refs, resv_map_release);
2098 }
2099
2100 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2101 {
2102         struct hstate *h = hstate_vma(vma);
2103         struct resv_map *reservations = vma_resv_map(vma);
2104         unsigned long reserve;
2105         unsigned long start;
2106         unsigned long end;
2107
2108         if (reservations) {
2109                 start = vma_hugecache_offset(h, vma, vma->vm_start);
2110                 end = vma_hugecache_offset(h, vma, vma->vm_end);
2111
2112                 reserve = (end - start) -
2113                         region_count(&reservations->regions, start, end);
2114
2115                 resv_map_put(vma);
2116
2117                 if (reserve) {
2118                         hugetlb_acct_memory(h, -reserve);
2119                         hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
2120                 }
2121         }
2122 }
2123
2124 /*
2125  * We cannot handle pagefaults against hugetlb pages at all.  They cause
2126  * handle_mm_fault() to try to instantiate regular-sized pages in the
2127  * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
2128  * this far.
2129  */
2130 static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2131 {
2132         BUG();
2133         return 0;
2134 }
2135
2136 const struct vm_operations_struct hugetlb_vm_ops = {
2137         .fault = hugetlb_vm_op_fault,
2138         .open = hugetlb_vm_op_open,
2139         .close = hugetlb_vm_op_close,
2140 };
2141
2142 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
2143                                 int writable)
2144 {
2145         pte_t entry;
2146
2147         if (writable) {
2148                 entry =
2149                     pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
2150         } else {
2151                 entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
2152         }
2153         entry = pte_mkyoung(entry);
2154         entry = pte_mkhuge(entry);
2155
2156         return entry;
2157 }
2158
2159 static void set_huge_ptep_writable(struct vm_area_struct *vma,
2160                                    unsigned long address, pte_t *ptep)
2161 {
2162         pte_t entry;
2163
2164         entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
2165         if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
2166                 update_mmu_cache(vma, address, ptep);
2167         }
2168 }
2169
2170
2171 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2172                             struct vm_area_struct *vma)
2173 {
2174         pte_t *src_pte, *dst_pte, entry;
2175         struct page *ptepage;
2176         unsigned long addr;
2177         int cow;
2178         struct hstate *h = hstate_vma(vma);
2179         unsigned long sz = huge_page_size(h);
2180
2181         cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
2182
2183         for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
2184                 src_pte = huge_pte_offset(src, addr);
2185                 if (!src_pte)
2186                         continue;
2187                 dst_pte = huge_pte_alloc(dst, addr, sz);
2188                 if (!dst_pte)
2189                         goto nomem;
2190
2191                 /* If the pagetables are shared don't copy or take references */
2192                 if (dst_pte == src_pte)
2193                         continue;
2194
2195                 spin_lock(&dst->page_table_lock);
2196                 spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
2197                 if (!huge_pte_none(huge_ptep_get(src_pte))) {
2198                         if (cow)
2199                                 huge_ptep_set_wrprotect(src, addr, src_pte);
2200                         entry = huge_ptep_get(src_pte);
2201                         ptepage = pte_page(entry);
2202                         get_page(ptepage);
2203                         page_dup_rmap(ptepage);
2204                         set_huge_pte_at(dst, addr, dst_pte, entry);
2205                 }
2206                 spin_unlock(&src->page_table_lock);
2207                 spin_unlock(&dst->page_table_lock);
2208         }
2209         return 0;
2210
2211 nomem:
2212         return -ENOMEM;
2213 }
2214
2215 static int is_hugetlb_entry_migration(pte_t pte)
2216 {
2217         swp_entry_t swp;
2218
2219         if (huge_pte_none(pte) || pte_present(pte))
2220                 return 0;
2221         swp = pte_to_swp_entry(pte);
2222         if (non_swap_entry(swp) && is_migration_entry(swp)) {
2223                 return 1;
2224         } else
2225                 return 0;
2226 }
2227
2228 static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2229 {
2230         swp_entry_t swp;
2231
2232         if (huge_pte_none(pte) || pte_present(pte))
2233                 return 0;
2234         swp = pte_to_swp_entry(pte);
2235         if (non_swap_entry(swp) && is_hwpoison_entry(swp)) {
2236                 return 1;
2237         } else
2238                 return 0;
2239 }
2240
2241 void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2242                             unsigned long end, struct page *ref_page)
2243 {
2244         struct mm_struct *mm = vma->vm_mm;
2245         unsigned long address;
2246         pte_t *ptep;
2247         pte_t pte;
2248         struct page *page;
2249         struct page *tmp;
2250         struct hstate *h = hstate_vma(vma);
2251         unsigned long sz = huge_page_size(h);
2252
2253         /*
2254          * A page gathering list, protected by per file i_mmap_mutex. The
2255          * lock is used to avoid list corruption from multiple unmapping
2256          * of the same page since we are using page->lru.
2257          */
2258         LIST_HEAD(page_list);
2259
2260         WARN_ON(!is_vm_hugetlb_page(vma));
2261         BUG_ON(start & ~huge_page_mask(h));
2262         BUG_ON(end & ~huge_page_mask(h));
2263
2264         mmu_notifier_invalidate_range_start(mm, start, end);
2265         spin_lock(&mm->page_table_lock);
2266         for (address = start; address < end; address += sz) {
2267                 ptep = huge_pte_offset(mm, address);
2268                 if (!ptep)
2269                         continue;
2270
2271                 if (huge_pmd_unshare(mm, &address, ptep))
2272                         continue;
2273
2274                 /*
2275                  * If a reference page is supplied, it is because a specific
2276                  * page is being unmapped, not a range. Ensure the page we
2277                  * are about to unmap is the actual page of interest.
2278                  */
2279                 if (ref_page) {
2280                         pte = huge_ptep_get(ptep);
2281                         if (huge_pte_none(pte))
2282                                 continue;
2283                         page = pte_page(pte);
2284                         if (page != ref_page)
2285                                 continue;
2286
2287                         /*
2288                          * Mark the VMA as having unmapped its page so that
2289                          * future faults in this VMA will fail rather than
2290                          * looking like data was lost
2291                          */
2292                         set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
2293                 }
2294
2295                 pte = huge_ptep_get_and_clear(mm, address, ptep);
2296                 if (huge_pte_none(pte))
2297                         continue;
2298
2299                 /*
2300                  * HWPoisoned hugepage is already unmapped and dropped reference
2301                  */
2302                 if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
2303                         continue;
2304
2305                 page = pte_page(pte);
2306                 if (pte_dirty(pte))
2307                         set_page_dirty(page);
2308                 list_add(&page->lru, &page_list);
2309         }
2310         spin_unlock(&mm->page_table_lock);
2311         flush_tlb_range(vma, start, end);
2312         mmu_notifier_invalidate_range_end(mm, start, end);
2313         list_for_each_entry_safe(page, tmp, &page_list, lru) {
2314                 page_remove_rmap(page);
2315                 list_del(&page->lru);
2316                 put_page(page);
2317         }
2318 }
2319
2320 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2321                           unsigned long end, struct page *ref_page)
2322 {
2323         mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
2324         __unmap_hugepage_range(vma, start, end, ref_page);
2325         /*
2326          * Clear this flag so that x86's huge_pmd_share page_table_shareable
2327          * test will fail on a vma being torn down, and not grab a page table
2328          * on its way out.  We're lucky that the flag has such an appropriate
2329          * name, and can in fact be safely cleared here. We could clear it
2330          * before the __unmap_hugepage_range above, but all that's necessary
2331          * is to clear it before releasing the i_mmap_mutex below.
2332          *
2333          * This works because in the contexts this is called, the VMA is
2334          * going to be destroyed. It is not vunerable to madvise(DONTNEED)
2335          * because madvise is not supported on hugetlbfs. The same applies
2336          * for direct IO. unmap_hugepage_range() is only being called just
2337          * before free_pgtables() so clearing VM_MAYSHARE will not cause
2338          * surprises later.
2339          */
2340         vma->vm_flags &= ~VM_MAYSHARE;
2341         mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2342 }
2343
2344 /*
2345  * This is called when the original mapper is failing to COW a MAP_PRIVATE
2346  * mappping it owns the reserve page for. The intention is to unmap the page
2347  * from other VMAs and let the children be SIGKILLed if they are faulting the
2348  * same region.
2349  */
2350 static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2351                                 struct page *page, unsigned long address)
2352 {
2353         struct hstate *h = hstate_vma(vma);
2354         struct vm_area_struct *iter_vma;
2355         struct address_space *mapping;
2356         struct prio_tree_iter iter;
2357         pgoff_t pgoff;
2358
2359         /*
2360          * vm_pgoff is in PAGE_SIZE units, hence the different calculation
2361          * from page cache lookup which is in HPAGE_SIZE units.
2362          */
2363         address = address & huge_page_mask(h);
2364         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
2365                 + (vma->vm_pgoff >> PAGE_SHIFT);
2366         mapping = (struct address_space *)page_private(page);
2367
2368         /*
2369          * Take the mapping lock for the duration of the table walk. As
2370          * this mapping should be shared between all the VMAs,
2371          * __unmap_hugepage_range() is called as the lock is already held
2372          */
2373         mutex_lock(&mapping->i_mmap_mutex);
2374         vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
2375                 /* Do not unmap the current VMA */
2376                 if (iter_vma == vma)
2377                         continue;
2378
2379                 /*
2380                  * Unmap the page from other VMAs without their own reserves.
2381                  * They get marked to be SIGKILLed if they fault in these
2382                  * areas. This is because a future no-page fault on this VMA
2383                  * could insert a zeroed page instead of the data existing
2384                  * from the time of fork. This would look like data corruption
2385                  */
2386                 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
2387                         __unmap_hugepage_range(iter_vma,
2388                                 address, address + huge_page_size(h),
2389                                 page);
2390         }
2391         mutex_unlock(&mapping->i_mmap_mutex);
2392
2393         return 1;
2394 }
2395
2396 /*
2397  * Hugetlb_cow() should be called with page lock of the original hugepage held.
2398  */
2399 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2400                         unsigned long address, pte_t *ptep, pte_t pte,
2401                         struct page *pagecache_page)
2402 {
2403         struct hstate *h = hstate_vma(vma);
2404         struct page *old_page, *new_page;
2405         int avoidcopy;
2406         int outside_reserve = 0;
2407
2408         old_page = pte_page(pte);
2409
2410 retry_avoidcopy:
2411         /* If no-one else is actually using this page, avoid the copy
2412          * and just make the page writable */
2413         avoidcopy = (page_mapcount(old_page) == 1);
2414         if (avoidcopy) {
2415                 if (PageAnon(old_page))
2416                         page_move_anon_rmap(old_page, vma, address);
2417                 set_huge_ptep_writable(vma, address, ptep);
2418                 return 0;
2419         }
2420
2421         /*
2422          * If the process that created a MAP_PRIVATE mapping is about to
2423          * perform a COW due to a shared page count, attempt to satisfy
2424          * the allocation without using the existing reserves. The pagecache
2425          * page is used to determine if the reserve at this address was
2426          * consumed or not. If reserves were used, a partial faulted mapping
2427          * at the time of fork() could consume its reserves on COW instead
2428          * of the full address range.
2429          */
2430         if (!(vma->vm_flags & VM_MAYSHARE) &&
2431                         is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
2432                         old_page != pagecache_page)
2433                 outside_reserve = 1;
2434
2435         page_cache_get(old_page);
2436
2437         /* Drop page_table_lock as buddy allocator may be called */
2438         spin_unlock(&mm->page_table_lock);
2439         new_page = alloc_huge_page(vma, address, outside_reserve);
2440
2441         if (IS_ERR(new_page)) {
2442                 page_cache_release(old_page);
2443
2444                 /*
2445                  * If a process owning a MAP_PRIVATE mapping fails to COW,
2446                  * it is due to references held by a child and an insufficient
2447                  * huge page pool. To guarantee the original mappers
2448                  * reliability, unmap the page from child processes. The child
2449                  * may get SIGKILLed if it later faults.
2450                  */
2451                 if (outside_reserve) {
2452                         BUG_ON(huge_pte_none(pte));
2453                         if (unmap_ref_private(mm, vma, old_page, address)) {
2454                                 BUG_ON(huge_pte_none(pte));
2455                                 spin_lock(&mm->page_table_lock);
2456                                 goto retry_avoidcopy;
2457                         }
2458                         WARN_ON_ONCE(1);
2459                 }
2460
2461                 /* Caller expects lock to be held */
2462                 spin_lock(&mm->page_table_lock);
2463                 return -PTR_ERR(new_page);
2464         }
2465
2466         /*
2467          * When the original hugepage is shared one, it does not have
2468          * anon_vma prepared.
2469          */
2470         if (unlikely(anon_vma_prepare(vma))) {
2471                 page_cache_release(new_page);
2472                 page_cache_release(old_page);
2473                 /* Caller expects lock to be held */
2474                 spin_lock(&mm->page_table_lock);
2475                 return VM_FAULT_OOM;
2476         }
2477
2478         copy_user_huge_page(new_page, old_page, address, vma,
2479                             pages_per_huge_page(h));
2480         __SetPageUptodate(new_page);
2481
2482         /*
2483          * Retake the page_table_lock to check for racing updates
2484          * before the page tables are altered
2485          */
2486         spin_lock(&mm->page_table_lock);
2487         ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2488         if (likely(pte_same(huge_ptep_get(ptep), pte))) {
2489                 /* Break COW */
2490                 mmu_notifier_invalidate_range_start(mm,
2491                         address & huge_page_mask(h),
2492                         (address & huge_page_mask(h)) + huge_page_size(h));
2493                 huge_ptep_clear_flush(vma, address, ptep);
2494                 set_huge_pte_at(mm, address, ptep,
2495                                 make_huge_pte(vma, new_page, 1));
2496                 page_remove_rmap(old_page);
2497                 hugepage_add_new_anon_rmap(new_page, vma, address);
2498                 /* Make the old page be freed below */
2499                 new_page = old_page;
2500                 mmu_notifier_invalidate_range_end(mm,
2501                         address & huge_page_mask(h),
2502                         (address & huge_page_mask(h)) + huge_page_size(h));
2503         }
2504         page_cache_release(new_page);
2505         page_cache_release(old_page);
2506         return 0;
2507 }
2508
2509 /* Return the pagecache page at a given address within a VMA */
2510 static struct page *hugetlbfs_pagecache_page(struct hstate *h,
2511                         struct vm_area_struct *vma, unsigned long address)
2512 {
2513         struct address_space *mapping;
2514         pgoff_t idx;
2515
2516         mapping = vma->vm_file->f_mapping;
2517         idx = vma_hugecache_offset(h, vma, address);
2518
2519         return find_lock_page(mapping, idx);
2520 }
2521
2522 /*
2523  * Return whether there is a pagecache page to back given address within VMA.
2524  * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
2525  */
2526 static bool hugetlbfs_pagecache_present(struct hstate *h,
2527                         struct vm_area_struct *vma, unsigned long address)
2528 {
2529         struct address_space *mapping;
2530         pgoff_t idx;
2531         struct page *page;
2532
2533         mapping = vma->vm_file->f_mapping;
2534         idx = vma_hugecache_offset(h, vma, address);
2535
2536         page = find_get_page(mapping, idx);
2537         if (page)
2538                 put_page(page);
2539         return page != NULL;
2540 }
2541
2542 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2543                         unsigned long address, pte_t *ptep, unsigned int flags)
2544 {
2545         struct hstate *h = hstate_vma(vma);
2546         int ret = VM_FAULT_SIGBUS;
2547         pgoff_t idx;
2548         unsigned long size;
2549         struct page *page;
2550         struct address_space *mapping;
2551         pte_t new_pte;
2552
2553         /*
2554          * Currently, we are forced to kill the process in the event the
2555          * original mapper has unmapped pages from the child due to a failed
2556          * COW. Warn that such a situation has occurred as it may not be obvious
2557          */
2558         if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
2559                 printk(KERN_WARNING
2560                         "PID %d killed due to inadequate hugepage pool\n",
2561                         current->pid);
2562                 return ret;
2563         }
2564
2565         mapping = vma->vm_file->f_mapping;
2566         idx = vma_hugecache_offset(h, vma, address);
2567
2568         /*
2569          * Use page lock to guard against racing truncation
2570          * before we get page_table_lock.
2571          */
2572 retry:
2573         page = find_lock_page(mapping, idx);
2574         if (!page) {
2575                 size = i_size_read(mapping->host) >> huge_page_shift(h);
2576                 if (idx >= size)
2577                         goto out;
2578                 page = alloc_huge_page(vma, address, 0);
2579                 if (IS_ERR(page)) {
2580                         ret = -PTR_ERR(page);
2581                         goto out;
2582                 }
2583                 clear_huge_page(page, address, pages_per_huge_page(h));
2584                 __SetPageUptodate(page);
2585
2586                 if (vma->vm_flags & VM_MAYSHARE) {
2587                         int err;
2588                         struct inode *inode = mapping->host;
2589
2590                         err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
2591                         if (err) {
2592                                 put_page(page);
2593                                 if (err == -EEXIST)
2594                                         goto retry;
2595                                 goto out;
2596                         }
2597
2598                         spin_lock(&inode->i_lock);
2599                         inode->i_blocks += blocks_per_huge_page(h);
2600                         spin_unlock(&inode->i_lock);
2601                         page_dup_rmap(page);
2602                 } else {
2603                         lock_page(page);
2604                         if (unlikely(anon_vma_prepare(vma))) {
2605                                 ret = VM_FAULT_OOM;
2606                                 goto backout_unlocked;
2607                         }
2608                         hugepage_add_new_anon_rmap(page, vma, address);
2609                 }
2610         } else {
2611                 /*
2612                  * If memory error occurs between mmap() and fault, some process
2613                  * don't have hwpoisoned swap entry for errored virtual address.
2614                  * So we need to block hugepage fault by PG_hwpoison bit check.
2615                  */
2616                 if (unlikely(PageHWPoison(page))) {
2617                         ret = VM_FAULT_HWPOISON | 
2618                               VM_FAULT_SET_HINDEX(h - hstates);
2619                         goto backout_unlocked;
2620                 }
2621                 page_dup_rmap(page);
2622         }
2623
2624         /*
2625          * If we are going to COW a private mapping later, we examine the
2626          * pending reservations for this page now. This will ensure that
2627          * any allocations necessary to record that reservation occur outside
2628          * the spinlock.
2629          */
2630         if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
2631                 if (vma_needs_reservation(h, vma, address) < 0) {
2632                         ret = VM_FAULT_OOM;
2633                         goto backout_unlocked;
2634                 }
2635
2636         spin_lock(&mm->page_table_lock);
2637         size = i_size_read(mapping->host) >> huge_page_shift(h);
2638         if (idx >= size)
2639                 goto backout;
2640
2641         ret = 0;
2642         if (!huge_pte_none(huge_ptep_get(ptep)))
2643                 goto backout;
2644
2645         new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
2646                                 && (vma->vm_flags & VM_SHARED)));
2647         set_huge_pte_at(mm, address, ptep, new_pte);
2648
2649         if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
2650                 /* Optimization, do the COW without a second fault */
2651                 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
2652         }
2653
2654         spin_unlock(&mm->page_table_lock);
2655         unlock_page(page);
2656 out:
2657         return ret;
2658
2659 backout:
2660         spin_unlock(&mm->page_table_lock);
2661 backout_unlocked:
2662         unlock_page(page);
2663         put_page(page);
2664         goto out;
2665 }
2666
2667 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2668                         unsigned long address, unsigned int flags)
2669 {
2670         pte_t *ptep;
2671         pte_t entry;
2672         int ret;
2673         struct page *page = NULL;
2674         struct page *pagecache_page = NULL;
2675         static DEFINE_MUTEX(hugetlb_instantiation_mutex);
2676         struct hstate *h = hstate_vma(vma);
2677
2678         ptep = huge_pte_offset(mm, address);
2679         if (ptep) {
2680                 entry = huge_ptep_get(ptep);
2681                 if (unlikely(is_hugetlb_entry_migration(entry))) {
2682                         migration_entry_wait_huge(mm, ptep);
2683                         return 0;
2684                 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2685                         return VM_FAULT_HWPOISON_LARGE | 
2686                                VM_FAULT_SET_HINDEX(h - hstates);
2687         }
2688
2689         ptep = huge_pte_alloc(mm, address, huge_page_size(h));
2690         if (!ptep)
2691                 return VM_FAULT_OOM;
2692
2693         /*
2694          * Serialize hugepage allocation and instantiation, so that we don't
2695          * get spurious allocation failures if two CPUs race to instantiate
2696          * the same page in the page cache.
2697          */
2698         mutex_lock(&hugetlb_instantiation_mutex);
2699         entry = huge_ptep_get(ptep);
2700         if (huge_pte_none(entry)) {
2701                 ret = hugetlb_no_page(mm, vma, address, ptep, flags);
2702                 goto out_mutex;
2703         }
2704
2705         ret = 0;
2706
2707         /*
2708          * If we are going to COW the mapping later, we examine the pending
2709          * reservations for this page now. This will ensure that any
2710          * allocations necessary to record that reservation occur outside the
2711          * spinlock. For private mappings, we also lookup the pagecache
2712          * page now as it is used to determine if a reservation has been
2713          * consumed.
2714          */
2715         if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) {
2716                 if (vma_needs_reservation(h, vma, address) < 0) {
2717                         ret = VM_FAULT_OOM;
2718                         goto out_mutex;
2719                 }
2720
2721                 if (!(vma->vm_flags & VM_MAYSHARE))
2722                         pagecache_page = hugetlbfs_pagecache_page(h,
2723                                                                 vma, address);
2724         }
2725
2726         /*
2727          * hugetlb_cow() requires page locks of pte_page(entry) and
2728          * pagecache_page, so here we need take the former one
2729          * when page != pagecache_page or !pagecache_page.
2730          * Note that locking order is always pagecache_page -> page,
2731          * so no worry about deadlock.
2732          */
2733         page = pte_page(entry);
2734         get_page(page);
2735         if (page != pagecache_page)
2736                 lock_page(page);
2737
2738         spin_lock(&mm->page_table_lock);
2739         /* Check for a racing update before calling hugetlb_cow */
2740         if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
2741                 goto out_page_table_lock;
2742
2743
2744         if (flags & FAULT_FLAG_WRITE) {
2745                 if (!pte_write(entry)) {
2746                         ret = hugetlb_cow(mm, vma, address, ptep, entry,
2747                                                         pagecache_page);
2748                         goto out_page_table_lock;
2749                 }
2750                 entry = pte_mkdirty(entry);
2751         }
2752         entry = pte_mkyoung(entry);
2753         if (huge_ptep_set_access_flags(vma, address, ptep, entry,
2754                                                 flags & FAULT_FLAG_WRITE))
2755                 update_mmu_cache(vma, address, ptep);
2756
2757 out_page_table_lock:
2758         spin_unlock(&mm->page_table_lock);
2759
2760         if (pagecache_page) {
2761                 unlock_page(pagecache_page);
2762                 put_page(pagecache_page);
2763         }
2764         if (page != pagecache_page)
2765                 unlock_page(page);
2766         put_page(page);
2767
2768 out_mutex:
2769         mutex_unlock(&hugetlb_instantiation_mutex);
2770
2771         return ret;
2772 }
2773
2774 /* Can be overriden by architectures */
2775 __attribute__((weak)) struct page *
2776 follow_huge_pud(struct mm_struct *mm, unsigned long address,
2777                pud_t *pud, int write)
2778 {
2779         BUG();
2780         return NULL;
2781 }
2782
2783 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2784                         struct page **pages, struct vm_area_struct **vmas,
2785                         unsigned long *position, int *length, int i,
2786                         unsigned int flags)
2787 {
2788         unsigned long pfn_offset;
2789         unsigned long vaddr = *position;
2790         int remainder = *length;
2791         struct hstate *h = hstate_vma(vma);
2792
2793         spin_lock(&mm->page_table_lock);
2794         while (vaddr < vma->vm_end && remainder) {
2795                 pte_t *pte;
2796                 int absent;
2797                 struct page *page;
2798
2799                 /*
2800                  * Some archs (sparc64, sh*) have multiple pte_ts to
2801                  * each hugepage.  We have to make sure we get the
2802                  * first, for the page indexing below to work.
2803                  */
2804                 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
2805                 absent = !pte || huge_pte_none(huge_ptep_get(pte));
2806
2807                 /*
2808                  * When coredumping, it suits get_dump_page if we just return
2809                  * an error where there's an empty slot with no huge pagecache
2810                  * to back it.  This way, we avoid allocating a hugepage, and
2811                  * the sparse dumpfile avoids allocating disk blocks, but its
2812                  * huge holes still show up with zeroes where they need to be.
2813                  */
2814                 if (absent && (flags & FOLL_DUMP) &&
2815                     !hugetlbfs_pagecache_present(h, vma, vaddr)) {
2816                         remainder = 0;
2817                         break;
2818                 }
2819
2820                 /*
2821                  * We need call hugetlb_fault for both hugepages under migration
2822                  * (in which case hugetlb_fault waits for the migration,) and
2823                  * hwpoisoned hugepages (in which case we need to prevent the
2824                  * caller from accessing to them.) In order to do this, we use
2825                  * here is_swap_pte instead of is_hugetlb_entry_migration and
2826                  * is_hugetlb_entry_hwpoisoned. This is because it simply covers
2827                  * both cases, and because we can't follow correct pages
2828                  * directly from any kind of swap entries.
2829                  */
2830                 if (absent || is_swap_pte(huge_ptep_get(pte)) ||
2831                     ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
2832                         int ret;
2833
2834                         spin_unlock(&mm->page_table_lock);
2835                         ret = hugetlb_fault(mm, vma, vaddr,
2836                                 (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
2837                         spin_lock(&mm->page_table_lock);
2838                         if (!(ret & VM_FAULT_ERROR))
2839                                 continue;
2840
2841                         remainder = 0;
2842                         break;
2843                 }
2844
2845                 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
2846                 page = pte_page(huge_ptep_get(pte));
2847 same_page:
2848                 if (pages) {
2849                         pages[i] = mem_map_offset(page, pfn_offset);
2850                         get_page(pages[i]);
2851                 }
2852
2853                 if (vmas)
2854                         vmas[i] = vma;
2855
2856                 vaddr += PAGE_SIZE;
2857                 ++pfn_offset;
2858                 --remainder;
2859                 ++i;
2860                 if (vaddr < vma->vm_end && remainder &&
2861                                 pfn_offset < pages_per_huge_page(h)) {
2862                         /*
2863                          * We use pfn_offset to avoid touching the pageframes
2864                          * of this compound page.
2865                          */
2866                         goto same_page;
2867                 }
2868         }
2869         spin_unlock(&mm->page_table_lock);
2870         *length = remainder;
2871         *position = vaddr;
2872
2873         return i ? i : -EFAULT;
2874 }
2875
2876 void hugetlb_change_protection(struct vm_area_struct *vma,
2877                 unsigned long address, unsigned long end, pgprot_t newprot)
2878 {
2879         struct mm_struct *mm = vma->vm_mm;
2880         unsigned long start = address;
2881         pte_t *ptep;
2882         pte_t pte;
2883         struct hstate *h = hstate_vma(vma);
2884
2885         BUG_ON(address >= end);
2886         flush_cache_range(vma, address, end);
2887
2888         mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
2889         spin_lock(&mm->page_table_lock);
2890         for (; address < end; address += huge_page_size(h)) {
2891                 ptep = huge_pte_offset(mm, address);
2892                 if (!ptep)
2893                         continue;
2894                 if (huge_pmd_unshare(mm, &address, ptep))
2895                         continue;
2896                 if (!huge_pte_none(huge_ptep_get(ptep))) {
2897                         pte = huge_ptep_get_and_clear(mm, address, ptep);
2898                         pte = pte_mkhuge(pte_modify(pte, newprot));
2899                         set_huge_pte_at(mm, address, ptep, pte);
2900                 }
2901         }
2902         spin_unlock(&mm->page_table_lock);
2903         /*
2904          * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
2905          * may have cleared our pud entry and done put_page on the page table:
2906          * once we release i_mmap_mutex, another task can do the final put_page
2907          * and that page table be reused and filled with junk.
2908          */
2909         flush_tlb_range(vma, start, end);
2910         mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2911 }
2912
2913 int hugetlb_reserve_pages(struct inode *inode,
2914                                         long from, long to,
2915                                         struct vm_area_struct *vma,
2916                                         vm_flags_t vm_flags)
2917 {
2918         long ret, chg;
2919         struct hstate *h = hstate_inode(inode);
2920
2921         /*
2922          * Only apply hugepage reservation if asked. At fault time, an
2923          * attempt will be made for VM_NORESERVE to allocate a page
2924          * and filesystem quota without using reserves
2925          */
2926         if (vm_flags & VM_NORESERVE)
2927                 return 0;
2928
2929         /*
2930          * Shared mappings base their reservation on the number of pages that
2931          * are already allocated on behalf of the file. Private mappings need
2932          * to reserve the full area even if read-only as mprotect() may be
2933          * called to make the mapping read-write. Assume !vma is a shm mapping
2934          */
2935         if (!vma || vma->vm_flags & VM_MAYSHARE)
2936                 chg = region_chg(&inode->i_mapping->private_list, from, to);
2937         else {
2938                 struct resv_map *resv_map = resv_map_alloc();
2939                 if (!resv_map)
2940                         return -ENOMEM;
2941
2942                 chg = to - from;
2943
2944                 set_vma_resv_map(vma, resv_map);
2945                 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
2946         }
2947
2948         if (chg < 0) {
2949                 ret = chg;
2950                 goto out_err;
2951         }
2952
2953         /* There must be enough filesystem quota for the mapping */
2954         if (hugetlb_get_quota(inode->i_mapping, chg)) {
2955                 ret = -ENOSPC;
2956                 goto out_err;
2957         }
2958
2959         /*
2960          * Check enough hugepages are available for the reservation.
2961          * Hand back the quota if there are not
2962          */
2963         ret = hugetlb_acct_memory(h, chg);
2964         if (ret < 0) {
2965                 hugetlb_put_quota(inode->i_mapping, chg);
2966                 goto out_err;
2967         }
2968
2969         /*
2970          * Account for the reservations made. Shared mappings record regions
2971          * that have reservations as they are shared by multiple VMAs.
2972          * When the last VMA disappears, the region map says how much
2973          * the reservation was and the page cache tells how much of
2974          * the reservation was consumed. Private mappings are per-VMA and
2975          * only the consumed reservations are tracked. When the VMA
2976          * disappears, the original reservation is the VMA size and the
2977          * consumed reservations are stored in the map. Hence, nothing
2978          * else has to be done for private mappings here
2979          */
2980         if (!vma || vma->vm_flags & VM_MAYSHARE)
2981                 region_add(&inode->i_mapping->private_list, from, to);
2982         return 0;
2983 out_err:
2984         if (vma)
2985                 resv_map_put(vma);
2986         return ret;
2987 }
2988
2989 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2990 {
2991         struct hstate *h = hstate_inode(inode);
2992         long chg = region_truncate(&inode->i_mapping->private_list, offset);
2993
2994         spin_lock(&inode->i_lock);
2995         inode->i_blocks -= (blocks_per_huge_page(h) * freed);
2996         spin_unlock(&inode->i_lock);
2997
2998         hugetlb_put_quota(inode->i_mapping, (chg - freed));
2999         hugetlb_acct_memory(h, -(chg - freed));
3000 }
3001
3002 #ifdef CONFIG_MEMORY_FAILURE
3003
3004 /* Should be called in hugetlb_lock */
3005 static int is_hugepage_on_freelist(struct page *hpage)
3006 {
3007         struct page *page;
3008         struct page *tmp;
3009         struct hstate *h = page_hstate(hpage);
3010         int nid = page_to_nid(hpage);
3011
3012         list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
3013                 if (page == hpage)
3014                         return 1;
3015         return 0;
3016 }
3017
3018 /*
3019  * This function is called from memory failure code.
3020  * Assume the caller holds page lock of the head page.
3021  */
3022 int dequeue_hwpoisoned_huge_page(struct page *hpage)
3023 {
3024         struct hstate *h = page_hstate(hpage);
3025         int nid = page_to_nid(hpage);
3026         int ret = -EBUSY;
3027
3028         spin_lock(&hugetlb_lock);
3029         if (is_hugepage_on_freelist(hpage)) {
3030                 list_del(&hpage->lru);
3031                 set_page_refcounted(hpage);
3032                 h->free_huge_pages--;
3033                 h->free_huge_pages_node[nid]--;
3034                 ret = 0;
3035         }
3036         spin_unlock(&hugetlb_lock);
3037         return ret;
3038 }
3039 #endif