hugetlb.c 125 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1 2
/*
 * Generic hugetlb support.
3
 * (C) Nadia Yvette Chambers, April 2004
Linus Torvalds's avatar
Linus Torvalds committed
4 5 6 7
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/mm.h>
8
#include <linux/seq_file.h>
Linus Torvalds's avatar
Linus Torvalds committed
9 10
#include <linux/sysctl.h>
#include <linux/highmem.h>
Andrea Arcangeli's avatar
Andrea Arcangeli committed
11
#include <linux/mmu_notifier.h>
Linus Torvalds's avatar
Linus Torvalds committed
12
#include <linux/nodemask.h>
13
#include <linux/pagemap.h>
14
#include <linux/mempolicy.h>
15
#include <linux/compiler.h>
16
#include <linux/cpuset.h>
17
#include <linux/mutex.h>
18
#include <linux/bootmem.h>
19
#include <linux/sysfs.h>
20
#include <linux/slab.h>
21
#include <linux/rmap.h>
22 23
#include <linux/swap.h>
#include <linux/swapops.h>
24
#include <linux/page-isolation.h>
25
#include <linux/jhash.h>
26

27 28
#include <asm/page.h>
#include <asm/pgtable.h>
29
#include <asm/tlb.h>
30

31
#include <linux/io.h>
32
#include <linux/hugetlb.h>
33
#include <linux/hugetlb_cgroup.h>
34
#include <linux/node.h>
35
#include "internal.h"
Linus Torvalds's avatar
Linus Torvalds committed
36

37
int hugepages_treat_as_movable;
38

39
int hugetlb_max_hstate __read_mostly;
40 41
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
42 43 44 45 46
/*
 * Minimum page order among possible hugepage sizes, set to a proper value
 * at boot time.
 */
static unsigned int minimum_order __read_mostly = UINT_MAX;
47

48 49
__initdata LIST_HEAD(huge_boot_pages);

50 51 52
/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
53
static unsigned long __initdata default_hstate_size;
54
static bool __initdata parsed_valid_hugepagesz = true;
55

56
/*
57 58
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
59
 */
60
DEFINE_SPINLOCK(hugetlb_lock);
61

62 63 64 65 66
/*
 * Serializes faults on the same logical page.  This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
 */
static int num_fault_mutexes;
67
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
68

69 70 71
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);

72 73 74 75 76 77 78
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
	bool free = (spool->count == 0) && (spool->used_hpages == 0);

	spin_unlock(&spool->lock);

	/* If no pages are used, and no other handles to the subpool
79 80 81 82 83 84
	 * remain, give up any reservations mased on minimum size and
	 * free the subpool */
	if (free) {
		if (spool->min_hpages != -1)
			hugetlb_acct_memory(spool->hstate,
						-spool->min_hpages);
85
		kfree(spool);
86
	}
87 88
}

89 90
struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
						long min_hpages)
91 92 93
{
	struct hugepage_subpool *spool;

94
	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
95 96 97 98 99
	if (!spool)
		return NULL;

	spin_lock_init(&spool->lock);
	spool->count = 1;
100 101 102 103 104 105 106 107 108
	spool->max_hpages = max_hpages;
	spool->hstate = h;
	spool->min_hpages = min_hpages;

	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
		kfree(spool);
		return NULL;
	}
	spool->rsv_hpages = min_hpages;
109 110 111 112 113 114 115 116 117 118 119 120

	return spool;
}

/*
 * Drop one handle on @spool.  It is a bug to call this when no handles
 * remain.  The subpool may be freed before this function returns.
 */
void hugepage_put_subpool(struct hugepage_subpool *spool)
{
	spin_lock(&spool->lock);

	BUG_ON(spool->count == 0);
	--spool->count;

	/* Releases spool->lock and frees the subpool if nothing uses it. */
	unlock_or_release_subpool(spool);
}

121 122 123 124 125 126 127 128 129
/*
 * Subpool accounting for allocating and reserving pages.
 * Return -ENOMEM if there are not enough resources to satisfy the
 * the request.  Otherwise, return the number of pages by which the
 * global pools must be adjusted (upward).  The returned value may
 * only be different than the passed value (delta) in the case where
 * a subpool minimum size must be manitained.
 */
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
130 131
				      long delta)
{
132
	long ret = delta;
133 134

	if (!spool)
135
		return ret;
136 137

	spin_lock(&spool->lock);
138 139 140 141 142 143 144 145

	if (spool->max_hpages != -1) {		/* maximum size accounting */
		if ((spool->used_hpages + delta) <= spool->max_hpages)
			spool->used_hpages += delta;
		else {
			ret = -ENOMEM;
			goto unlock_ret;
		}
146 147
	}

148 149
	/* minimum size accounting */
	if (spool->min_hpages != -1 && spool->rsv_hpages) {
150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
		if (delta > spool->rsv_hpages) {
			/*
			 * Asking for more reserves than those already taken on
			 * behalf of subpool.  Return difference.
			 */
			ret = delta - spool->rsv_hpages;
			spool->rsv_hpages = 0;
		} else {
			ret = 0;	/* reserves already accounted for */
			spool->rsv_hpages -= delta;
		}
	}

unlock_ret:
	spin_unlock(&spool->lock);
165 166 167
	return ret;
}

168 169 170 171 172 173 174
/*
 * Subpool accounting for freeing and unreserving pages.
 * Return the number of global page reservations that must be dropped.
 * The return value may only be different than the passed value (delta)
 * in the case where a subpool minimum size must be maintained.
 */
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
175 176
				       long delta)
{
177 178
	long ret = delta;

179
	if (!spool)
180
		return delta;
181 182

	spin_lock(&spool->lock);
183 184 185 186

	if (spool->max_hpages != -1)		/* maximum size accounting */
		spool->used_hpages -= delta;

187 188
	 /* minimum size accounting */
	if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
189 190 191 192 193 194 195 196 197 198 199 200 201 202
		if (spool->rsv_hpages + delta <= spool->min_hpages)
			ret = 0;
		else
			ret = spool->rsv_hpages + delta - spool->min_hpages;

		spool->rsv_hpages += delta;
		if (spool->rsv_hpages > spool->min_hpages)
			spool->rsv_hpages = spool->min_hpages;
	}

	/*
	 * If hugetlbfs_put_super couldn't free spool due to an outstanding
	 * quota reference, free it now.
	 */
203
	unlock_or_release_subpool(spool);
204 205

	return ret;
206 207 208 209 210 211 212 213 214
}

/* Return the subpool recorded in the hugetlbfs superblock info of @inode. */
static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
{
	return HUGETLBFS_SB(inode->i_sb)->spool;
}

static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
Al Viro's avatar
Al Viro committed
215
	return subpool_inode(file_inode(vma->vm_file));
216 217
}

218 219 220
/*
 * Region tracking -- allows tracking of reservations and instantiated pages
 *                    across the pages in a mapping.
 *
 * The region data structures are embedded into a resv_map and protected
 * by a resv_map's lock.  The set of regions within the resv_map represent
 * reservations for huge pages, or huge pages that have already been
 * instantiated within the map.  The from and to elements are huge page
 * indices into the associated mapping.  from indicates the starting index
 * of the region.  to represents the first index past the end of the region.
 *
 * For example, a file region structure with from == 0 and to == 4 represents
 * four huge pages in a mapping.  It is important to note that the to element
 * represents the first element past the end of the region. This is used in
 * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
 *
 * Interval notation of the form [from, to) will be used to indicate that
 * the endpoint from is inclusive and to is exclusive.
 */
struct file_region {
	/* Linkage on a resv_map's regions list or its region_cache list. */
	struct list_head link;
	/* First huge page index covered by the region (inclusive). */
	long from;
	/* First huge page index past the end of the region (exclusive). */
	long to;
};

243 244
/*
 * Add the huge page range represented by [f, t) to the reserve
245 246 247 248 249 250 251 252
 * map.  In the normal case, existing regions will be expanded
 * to accommodate the specified range.  Sufficient regions should
 * exist for expansion due to the previous call to region_chg
 * with the same range.  However, it is possible that region_del
 * could have been called after region_chg and modifed the map
 * in such a way that no region exists to be expanded.  In this
 * case, pull a region descriptor from the cache associated with
 * the map and use that for the new range.
253 254 255
 *
 * Return the number of new huge pages added to the map.  This
 * number is greater than or equal to zero.
256
 */
257
static long region_add(struct resv_map *resv, long f, long t)
258
{
259
	struct list_head *head = &resv->regions;
260
	struct file_region *rg, *nrg, *trg;
261
	long add = 0;
262

263
	spin_lock(&resv->lock);
264 265 266 267 268
	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290
	/*
	 * If no region exists which can be expanded to include the
	 * specified range, the list must have been modified by an
	 * interleving call to region_del().  Pull a region descriptor
	 * from the cache and use it for this range.
	 */
	if (&rg->link == head || t < rg->from) {
		VM_BUG_ON(resv->region_cache_count <= 0);

		resv->region_cache_count--;
		nrg = list_first_entry(&resv->region_cache, struct file_region,
					link);
		list_del(&nrg->link);

		nrg->from = f;
		nrg->to = t;
		list_add(&nrg->link, rg->link.prev);

		add += t - f;
		goto out_locked;
	}

291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308
	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
309 310 311 312 313
			/* Decrement return value by the deleted range.
			 * Another range will span this area so that by
			 * end of routine add will be >= zero
			 */
			add -= (rg->to - rg->from);
314 315 316 317
			list_del(&rg->link);
			kfree(rg);
		}
	}
318 319

	add += (nrg->from - f);		/* Added to beginning of region */
320
	nrg->from = f;
321
	add += t - nrg->to;		/* Added to end of region */
322
	nrg->to = t;
323

324 325
out_locked:
	resv->adds_in_progress--;
326
	spin_unlock(&resv->lock);
327 328
	VM_BUG_ON(add < 0);
	return add;
329 330
}

331 332 333 334 335 336 337 338 339 340 341 342 343
/*
 * Examine the existing reserve map and determine how many
 * huge pages in the specified range [f, t) are NOT currently
 * represented.  This routine is called before a subsequent
 * call to region_add that will actually modify the reserve
 * map to add the specified range [f, t).  region_chg does
 * not change the number of huge pages represented by the
 * map.  However, if the existing regions in the map can not
 * be expanded to represent the new range, a new file_region
 * structure is added to the map as a placeholder.  This is
 * so that the subsequent region_add call will have all the
 * regions it needs and will not fail.
 *
344 345 346 347 348 349 350 351
 * Upon entry, region_chg will also examine the cache of region descriptors
 * associated with the map.  If there are not enough descriptors cached, one
 * will be allocated for the in progress add operation.
 *
 * Returns the number of huge pages that need to be added to the existing
 * reservation map for the range [f, t).  This number is greater or equal to
 * zero.  -ENOMEM is returned if a new file_region structure or cache entry
 * is needed and can not be allocated.
352
 */
353
static long region_chg(struct resv_map *resv, long f, long t)
354
{
355
	struct list_head *head = &resv->regions;
356
	struct file_region *rg, *nrg = NULL;
357 358
	long chg = 0;

359 360
retry:
	spin_lock(&resv->lock);
361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376
retry_locked:
	resv->adds_in_progress++;

	/*
	 * Check for sufficient descriptors in the cache to accommodate
	 * the number of in progress add operations.
	 */
	if (resv->adds_in_progress > resv->region_cache_count) {
		struct file_region *trg;

		VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
		/* Must drop lock to allocate a new descriptor. */
		resv->adds_in_progress--;
		spin_unlock(&resv->lock);

		trg = kmalloc(sizeof(*trg), GFP_KERNEL);
377 378
		if (!trg) {
			kfree(nrg);
379
			return -ENOMEM;
380
		}
381 382 383 384 385 386 387

		spin_lock(&resv->lock);
		list_add(&trg->link, &resv->region_cache);
		resv->region_cache_count++;
		goto retry_locked;
	}

388 389 390 391 392 393 394 395 396
	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
397
		if (!nrg) {
398
			resv->adds_in_progress--;
399 400 401 402 403 404 405 406 407 408
			spin_unlock(&resv->lock);
			nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
			if (!nrg)
				return -ENOMEM;

			nrg->from = f;
			nrg->to   = f;
			INIT_LIST_HEAD(&nrg->link);
			goto retry;
		}
409

410 411 412
		list_add(&nrg->link, rg->link.prev);
		chg = t - f;
		goto out_nrg;
413 414 415 416 417 418 419 420 421 422 423 424
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
425
			goto out;
426

427
		/* We overlap with this area, if it extends further than
428 429 430 431 432 433 434 435
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
436 437 438 439 440 441 442 443

out:
	spin_unlock(&resv->lock);
	/*  We already know we raced and no longer need the new region */
	kfree(nrg);
	return chg;
out_nrg:
	spin_unlock(&resv->lock);
444 445 446
	return chg;
}

447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465
/*
 * Abort the in progress add operation.  The adds_in_progress field
 * of the resv_map keeps track of the operations in progress between
 * calls to region_chg and region_add.  Operations are sometimes
 * aborted after the call to region_chg.  In such cases, region_abort
 * is called to decrement the adds_in_progress counter.
 *
 * The descriptor cache is left untouched; the function only asserts
 * (VM_BUG_ON) that the cache is not empty.
 *
 * NOTE: The range arguments [f, t) are not needed or used in this
 * routine.  They are kept to make reading the calling code easier as
 * arguments will match the associated region_chg call.
 */
static void region_abort(struct resv_map *resv, long f, long t)
{
	spin_lock(&resv->lock);
	VM_BUG_ON(!resv->region_cache_count);
	resv->adds_in_progress--;
	spin_unlock(&resv->lock);
}

466
/*
467 468 469 470 471 472 473 474 475 476 477 478
 * Delete the specified range [f, t) from the reserve map.  If the
 * t parameter is LONG_MAX, this indicates that ALL regions after f
 * should be deleted.  Locate the regions which intersect [f, t)
 * and either trim, delete or split the existing regions.
 *
 * Returns the number of huge pages deleted from the reserve map.
 * In the normal case, the return value is zero or more.  In the
 * case where a region must be split, a new region descriptor must
 * be allocated.  If the allocation fails, -ENOMEM will be returned.
 * NOTE: If the parameter t == LONG_MAX, then we will never split
 * a region and possibly return -ENOMEM.  Callers specifying
 * t == LONG_MAX do not need to check for -ENOMEM error.
479
 */
480
static long region_del(struct resv_map *resv, long f, long t)
481
{
482
	struct list_head *head = &resv->regions;
483
	struct file_region *rg, *trg;
484 485
	struct file_region *nrg = NULL;
	long del = 0;
486

487
retry:
488
	spin_lock(&resv->lock);
489
	list_for_each_entry_safe(rg, trg, head, link) {
490 491 492 493 494 495 496 497
		/*
		 * Skip regions before the range to be deleted.  file_region
		 * ranges are normally of the form [from, to).  However, there
		 * may be a "placeholder" entry in the map which is of the form
		 * (from, to) with from == to.  Check for placeholder entries
		 * at the beginning of the range to be deleted.
		 */
		if (rg->to <= f && (rg->to != rg->from || rg->to != f))
498
			continue;
499

500
		if (rg->from >= t)
501 502
			break;

503 504 505 506 507 508 509 510 511 512 513 514 515
		if (f > rg->from && t < rg->to) { /* Must split region */
			/*
			 * Check for an entry in the cache before dropping
			 * lock and attempting allocation.
			 */
			if (!nrg &&
			    resv->region_cache_count > resv->adds_in_progress) {
				nrg = list_first_entry(&resv->region_cache,
							struct file_region,
							link);
				list_del(&nrg->link);
				resv->region_cache_count--;
			}
516

517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536
			if (!nrg) {
				spin_unlock(&resv->lock);
				nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
				if (!nrg)
					return -ENOMEM;
				goto retry;
			}

			del += t - f;

			/* New entry for end of split region */
			nrg->from = t;
			nrg->to = rg->to;
			INIT_LIST_HEAD(&nrg->link);

			/* Original entry is trimmed */
			rg->to = f;

			list_add(&nrg->link, &rg->link);
			nrg = NULL;
537
			break;
538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553
		}

		if (f <= rg->from && t >= rg->to) { /* Remove entire region */
			del += rg->to - rg->from;
			list_del(&rg->link);
			kfree(rg);
			continue;
		}

		if (f <= rg->from) {	/* Trim beginning of region */
			del += t - rg->from;
			rg->from = t;
		} else {		/* Trim end of region */
			del += rg->to - f;
			rg->to = f;
		}
554
	}
555 556

	spin_unlock(&resv->lock);
557 558
	kfree(nrg);
	return del;
559 560
}

561 562 563 564 565 566 567 568 569
/*
 * A rare out of memory error was encountered which prevented removal of
 * the reserve map region for a page.  The huge page itself was free'ed
 * and removed from the page cache.  This routine will adjust the subpool
 * usage count, and the global reserve count if needed.  By incrementing
 * these counts, the reserve map entry which could not be deleted will
 * appear as a "reserved" entry instead of simply dangling with incorrect
 * counts.
 */
void hugetlb_fix_reserve_counts(struct inode *inode)
{
	struct hugepage_subpool *spool = subpool_inode(inode);
	long rsv_adjust;

	rsv_adjust = hugepage_subpool_get_pages(spool, 1);

	/*
	 * hugepage_subpool_get_pages() returns -ENOMEM when the subpool
	 * refuses the request; in that case nothing was charged to the
	 * subpool and the global reserve must not be raised either.  Only a
	 * positive result asks for a global adjustment.
	 */
	if (rsv_adjust > 0) {
		struct hstate *h = hstate_inode(inode);

		/*
		 * NOTE(review): the result of hugetlb_acct_memory() is
		 * ignored; this function returns void and has no way to
		 * report the failure to its caller.
		 */
		hugetlb_acct_memory(h, 1);
	}
}

583 584 585 586
/*
 * Count and return the number of huge pages in the reserve map
 * that intersect with the range [f, t).
 */
587
static long region_count(struct resv_map *resv, long f, long t)
588
{
589
	struct list_head *head = &resv->regions;
590 591 592
	struct file_region *rg;
	long chg = 0;

593
	spin_lock(&resv->lock);
594 595
	/* Locate each segment we overlap with, and count that overlap. */
	list_for_each_entry(rg, head, link) {
596 597
		long seg_from;
		long seg_to;
598 599 600 601 602 603 604 605 606 607 608

		if (rg->to <= f)
			continue;
		if (rg->from >= t)
			break;

		seg_from = max(rg->from, f);
		seg_to = min(rg->to, t);

		chg += seg_to - seg_from;
	}
609
	spin_unlock(&resv->lock);
610 611 612 613

	return chg;
}

614 615 616 617
/*
 * Convert the address within this vma to the page offset within
 * the mapping, in pagecache page units; huge pages here.
 */
618 619
static pgoff_t vma_hugecache_offset(struct hstate *h,
			struct vm_area_struct *vma, unsigned long address)
620
{
621 622
	return ((address - vma->vm_start) >> huge_page_shift(h)) +
			(vma->vm_pgoff >> huge_page_order(h));
623 624
}

625 626 627 628 629
/*
 * Exported wrapper around vma_hugecache_offset() that looks up the hstate
 * from the vma itself.
 */
pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
				     unsigned long address)
{
	struct hstate *h = hstate_vma(vma);

	return vma_hugecache_offset(h, vma, address);
}
EXPORT_SYMBOL_GPL(linear_hugepage_index);
631

632 633 634 635 636 637 638 639 640 641 642 643 644
/*
 * Return the size of the pages allocated when backing a VMA. In the majority
 * of cases this will be the same size as used by the page table entries.
 *
 * Non-hugetlb VMAs report PAGE_SIZE; hugetlb VMAs report the huge page size
 * of their hstate.
 */
unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
	if (is_vm_hugetlb_page(vma))
		return 1UL << huge_page_shift(hstate_vma(vma));

	return PAGE_SIZE;
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
648

649 650 651 652 653 654 655 656 657 658 659 660 661
/*
 * Return the page size being used by the MMU to back a VMA. In the majority
 * of cases, the page size used by the kernel matches the MMU size. On
 * architectures where it differs, an architecture-specific version of this
 * function is required.
 *
 * This default is compiled only when vma_mmu_pagesize is not already
 * defined as a macro, and simply forwards to vma_kernel_pagesize().
 */
#ifndef vma_mmu_pagesize
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	return vma_kernel_pagesize(vma);
}
#endif

662 663 664 665 666 667 668
/*
 * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
 * bits of the reservation map pointer, which are always clear due to
 * alignment.
 */
#define HPAGE_RESV_OWNER	(1UL << 0)
#define HPAGE_RESV_UNMAPPED	(1UL << 1)
/* All flag bits; masked off to recover the resv_map pointer. */
#define HPAGE_RESV_MASK		(HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
670

671 672 673 674 675 676 677 678 679
/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 * is guaranteed to have their future faults succeed.
 *
 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
 * the reserve counters are updated with the hugetlb_lock held. It is safe
 * to reset the VMA at fork() time as it is not in use yet and there is no
 * chance of the global counters getting corrupted as a result of the values.
 *
 * The private mapping reservation is represented in a subtly different
 * manner to a shared mapping.  A shared mapping has a region map associated
 * with the underlying file; this region map represents the backing file
 * pages which have ever had a reservation assigned, and this persists even
 * after the page is instantiated.  A private mapping has a region map
 * associated with the original mmap which is attached to all VMAs which
 * reference it; this region map represents those offsets which have consumed
 * a reservation, i.e. where pages have been instantiated.
 */
690 691 692 693 694 695 696 697 698 699 700
/* Return vma->vm_private_data reinterpreted as an unsigned long. */
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
{
	return (unsigned long)vma->vm_private_data;
}

/* Store @value in vma->vm_private_data, overwriting whatever was there. */
static void set_vma_private_data(struct vm_area_struct *vma,
							unsigned long value)
{
	vma->vm_private_data = (void *)value;
}

701
struct resv_map *resv_map_alloc(void)
702 703
{
	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
704 705 706 707 708
	struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);

	if (!resv_map || !rg) {
		kfree(resv_map);
		kfree(rg);
709
		return NULL;
710
	}
711 712

	kref_init(&resv_map->refs);
713
	spin_lock_init(&resv_map->lock);
714 715
	INIT_LIST_HEAD(&resv_map->regions);

716 717 718 719 720 721
	resv_map->adds_in_progress = 0;

	INIT_LIST_HEAD(&resv_map->region_cache);
	list_add(&rg->link, &resv_map->region_cache);
	resv_map->region_cache_count = 1;

722 723 724
	return resv_map;
}

725
void resv_map_release(struct kref *ref)
726 727
{
	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
728 729
	struct list_head *head = &resv_map->region_cache;
	struct file_region *rg, *trg;
730 731

	/* Clear out any active regions before we release the map. */
732
	region_del(resv_map, 0, LONG_MAX);
733 734 735 736 737 738 739 740 741

	/* ... and any entries left in the cache */
	list_for_each_entry_safe(rg, trg, head, link) {
		list_del(&rg->link);
		kfree(rg);
	}

	VM_BUG_ON(resv_map->adds_in_progress);

742 743 744
	kfree(resv_map);
}

745 746 747 748 749
/* Return the reserve map stored in the private_data of @inode's mapping. */
static inline struct resv_map *inode_resv_map(struct inode *inode)
{
	return inode->i_mapping->private_data;
}

750
static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
751
{
752
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
753 754 755 756 757 758 759
	if (vma->vm_flags & VM_MAYSHARE) {
		struct address_space *mapping = vma->vm_file->f_mapping;
		struct inode *inode = mapping->host;

		return inode_resv_map(inode);

	} else {
760 761
		return (struct resv_map *)(get_vma_private_data(vma) &
							~HPAGE_RESV_MASK);
762
	}
763 764
}

765
static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
766
{
767 768
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
769

770 771
	set_vma_private_data(vma, (get_vma_private_data(vma) &
				HPAGE_RESV_MASK) | (unsigned long)map);
772 773 774 775
}

/*
 * OR @flags into the private data of a non-VM_MAYSHARE hugetlb @vma.
 * Bits that are already set stay set.
 */
static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
	unsigned long private;

	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);

	private = get_vma_private_data(vma);
	set_vma_private_data(vma, private | flags);
}

/*
 * Return 1 if any bit of @flag is set in the private data of the hugetlb
 * @vma, 0 otherwise.
 */
static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
{
	unsigned long private;

	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);

	private = get_vma_private_data(vma);
	return (private & flag) != 0;
}

789
/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
790 791
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
792
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
793
	if (!(vma->vm_flags & VM_MAYSHARE))
794 795 796 797
		vma->vm_private_data = (void *)0;
}

/* Returns true if the VMA has associated reserve pages */
798
static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
799
{
800 801 802 803 804 805 806 807 808 809 810
	if (vma->vm_flags & VM_NORESERVE) {
		/*
		 * This address is already reserved by other process(chg == 0),
		 * so, we should decrement reserved count. Without decrementing,
		 * reserve count remains after releasing inode, because this
		 * allocated page will go into page cache and is regarded as
		 * coming from reserved pool in releasing step.  Currently, we
		 * don't have any other solution to deal with this situation
		 * properly, so add work-around here.
		 */
		if (vma->vm_flags & VM_MAYSHARE && chg == 0)
811
			return true;
812
		else
813
			return false;
814
	}
815 816

	/* Shared mappings always use reserves */
817 818 819 820 821 822 823 824 825 826 827 828 829
	if (vma->vm_flags & VM_MAYSHARE) {
		/*
		 * We know VM_NORESERVE is not set.  Therefore, there SHOULD
		 * be a region map for all pages.  The only situation where
		 * there is no region map is if a hole was punched via
		 * fallocate.  In this case, there really are no reverves to
		 * use.  This situation is indicated if chg != 0.
		 */
		if (chg)
			return false;
		else
			return true;
	}
830 831 832 833 834

	/*
	 * Only the process that called mmap() has reserves for
	 * private mappings.
	 */
835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855
	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		/*
		 * Like the shared case above, a hole punch or truncate
		 * could have been performed on the private mapping.
		 * Examine the value of chg to determine if reserves
		 * actually exist or were previously consumed.
		 * Very Subtle - The value of chg comes from a previous
		 * call to vma_needs_reserves().  The reserve map for
		 * private mappings has different (opposite) semantics
		 * than that of shared mappings.  vma_needs_reserves()
		 * has already taken this difference in semantics into
		 * account.  Therefore, the meaning of chg is the same
		 * as in the shared case above.  Code could easily be
		 * combined, but keeping it separate draws attention to
		 * subtle differences.
		 */
		if (chg)
			return false;
		else
			return true;
	}
856

857
	return false;
858 859
}

860
static void enqueue_huge_page(struct hstate *h, struct page *page)
Linus Torvalds's avatar
Linus Torvalds committed
861 862
{
	int nid = page_to_nid(page);
863
	list_move(&page->lru, &h->hugepage_freelists[nid]);
864 865
	h->free_huge_pages++;
	h->free_huge_pages_node[nid]++;
Linus Torvalds's avatar
Linus Torvalds committed
866 867
}

868 869 870 871
static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
{
	struct page *page;

872 873 874 875 876 877 878 879
	list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
		if (!is_migrate_isolate_page(page))
			break;
	/*
	 * if 'non-isolated free hugepage' not found on the list,
	 * the allocation fails.
	 */
	if (&h->hugepage_freelists[nid] == &page->lru)
880
		return NULL;
881
	list_move(&page->lru, &h->hugepage_activelist);
882
	set_page_refcounted(page);
883 884 885 886 887
	h->free_huge_pages--;
	h->free_huge_pages_node[nid]--;
	return page;
}

888 889 890
/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
891
	if (hugepages_treat_as_movable || hugepage_migration_supported(h))
892 893 894 895 896
		return GFP_HIGHUSER_MOVABLE;
	else
		return GFP_HIGHUSER;
}

897 898
static struct page *dequeue_huge_page_vma(struct hstate *h,
				struct vm_area_struct *vma,
899 900
				unsigned long address, int avoid_reserve,
				long chg)
Linus Torvalds's avatar
Linus Torvalds committed
901
{
902
	struct page *page = NULL;
903
	struct mempolicy *mpol;
904
	nodemask_t *nodemask;
905
	struct zonelist *zonelist;
906 907
	struct zone *zone;
	struct zoneref *z;
908
	unsigned int cpuset_mems_cookie;
Linus Torvalds's avatar
Linus Torvalds committed
909

910 911 912 913 914
	/*
	 * A child process with MAP_PRIVATE mappings created by their parent
	 * have no page reserves. This check ensures that reservations are
	 * not "stolen". The child may still get SIGKILLed
	 */
915
	if (!vma_has_reserves(vma, chg) &&
916
			h->free_huge_pages - h->resv_huge_pages == 0)
917
		goto err;
918

919
	/* If reserves cannot be used, ensure enough pages are in the pool */
920
	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
921
		goto err;
922

923
retry_cpuset:
924
	cpuset_mems_cookie = read_mems_allowed_begin();
925
	zonelist = huge_zonelist(vma, address,
926
					htlb_alloc_mask(h), &mpol, &nodemask);
927

928 929
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
						MAX_NR_ZONES - 1, nodemask) {
930
		if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) {
931 932
			page = dequeue_huge_page_node(h, zone_to_nid(zone));
			if (page) {
933 934 935 936 937
				if (avoid_reserve)
					break;
				if (!vma_has_reserves(vma, chg))
					break;

938
				SetPagePrivate(page);
939
				h->resv_huge_pages--;
940 941
				break;
			}
942
		}
Linus Torvalds's avatar
Linus Torvalds committed
943
	}
944

945
	mpol_cond_put(mpol);
946
	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
947
		goto retry_cpuset;
Linus Torvalds's avatar
Linus Torvalds committed
948
	return page;
949 950 951

err:
	return NULL;
Linus Torvalds's avatar
Linus Torvalds committed
952 953
}

954 955 956 957 958 959 960 961 962
/*
 * common helper functions for hstate_next_node_to_{alloc|free}.
 * We may have allocated or freed a huge page based on a different
 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
 * be outside of *nodes_allowed.  Ensure that we use an allowed
 * node for alloc or free.
 */
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	/* Ask the nodemask for the node that follows @nid in *nodes_allowed. */
	int next = next_node_in(nid, *nodes_allowed);

	/* The returned id must be a valid node number. */
	VM_BUG_ON(next >= MAX_NUMNODES);
	return next;
}

static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	/* @nid is usable as-is when it is a member of the allowed mask. */
	if (node_isset(nid, *nodes_allowed))
		return nid;

	/* Otherwise move on to the next node that is allowed. */
	return next_node_allowed(nid, nodes_allowed);
}

/*
 * Return the node saved in h->next_nid_to_alloc ("this node"), corrected
 * to an allowed node if necessary, as the node from which to allocate a
 * persistent huge page for the pool.  The saved node is then advanced to
 * the next allowed node so that the following call uses a different one
 * (wrapping at the end of the node mask).
 */
static int hstate_next_node_to_alloc(struct hstate *h,
					nodemask_t *nodes_allowed)
{
	int this_nid;

	VM_BUG_ON(!nodes_allowed);

	this_nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
	h->next_nid_to_alloc = next_node_allowed(this_nid, nodes_allowed);

	return this_nid;
}

/*
 * Helper for free_pool_huge_page(): return the node saved in
 * h->next_nid_to_free ("this node"), corrected to an allowed node if
 * necessary, as the node from which to free a huge page.  The saved node
 * is advanced unconditionally - whether or not the caller ends up finding
 * a page to free there - so that the next attempt looks at the next node.
 */
static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
{
	int this_nid;

	VM_BUG_ON(!nodes_allowed);

	this_nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
	h->next_nid_to_free = next_node_allowed(this_nid, nodes_allowed);

	return this_nid;
}

/*
 * Visit up to nodes_weight(*mask) nodes, taking each one from
 * hstate_next_node_to_alloc().  @nr_nodes and @node must be lvalues; the
 * "|| 1" keeps the loop running when the returned node id is 0.
 * Every macro parameter is parenthesized so that callers may pass
 * arbitrary expressions without precedence surprises.
 */
#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)		\
	for ((nr_nodes) = nodes_weight(*(mask));			\
		(nr_nodes) > 0 &&					\
		(((node) = hstate_next_node_to_alloc((hs), (mask))) || 1); \
		(nr_nodes)--)

/*
 * Visit up to nodes_weight(*mask) nodes, taking each one from
 * hstate_next_node_to_free().  @nr_nodes and @node must be lvalues; the
 * "|| 1" keeps the loop running when the returned node id is 0.
 * Every macro parameter is parenthesized so that callers may pass
 * arbitrary expressions without precedence surprises.
 */
#define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
	for ((nr_nodes) = nodes_weight(*(mask));			\
		(nr_nodes) > 0 &&					\
		(((node) = hstate_next_node_to_free((hs), (mask))) || 1); \
		(nr_nodes)--)

1025
#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \
1026 1027
	((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \
	defined(CONFIG_CMA))
1028
static void destroy_compound_gigantic_page(struct page *page,
1029
					unsigned int order)
1030 1031 1032 1033 1034
{
	int i;
	int nr_pages = 1 << order;
	struct page *p = page + 1;

1035
	atomic_set(compound_mapcount_ptr(page), 0);
1036
	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1037
		clear_compound_head(p);
1038 1039 1040 1041 1042 1043 1044
		set_page_refcounted(p);
	}

	set_compound_order(page, 0);
	__ClearPageHead(page);
}

1045
static void free_gigantic_page(struct page *page, unsigned int order)
{
	/* Release the 2^order pages starting at the pfn of @page. */
	unsigned long first_pfn = page_to_pfn(page);

	free_contig_range(first_pfn, 1 << order);
}

static int __alloc_gigantic_page(unsigned long start_pfn,
				unsigned long nr_pages)
{
	/* Request the pfn range [start_pfn, start_pfn + nr_pages). */
	return alloc_contig_range(start_pfn, start_pfn + nr_pages,
				  MIGRATE_MOVABLE);
}

1057 1058
static bool pfn_range_valid_gigantic(struct zone *z,
			unsigned long start_pfn, unsigned long nr_pages)
1059 1060 1061 1062 1063 1064 1065 1066 1067 1068
{
	unsigned long i, end_pfn = start_pfn + nr_pages;
	struct page *page;

	for (i = start_pfn; i < end_pfn; i++) {
		if (!pfn_valid(i))
			return false;

		page = pfn_to_page(i);

1069 1070 1071
		if (page_zone(page) != z)
			return false;

1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091
		if (PageReserved(page))
			return false;

		if (page_count(page) > 0)
			return false;

		if (PageHuge(page))
			return false;
	}

	return true;
}

static bool zone_spans_last_pfn(const struct zone *zone,
			unsigned long start_pfn, unsigned long nr_pages)
{
	/* Only the final pfn of the candidate range is checked here. */
	return zone_spans_pfn(zone, start_pfn + nr_pages - 1);
}

1092
static struct page *alloc_gigantic_page(int nid, unsigned int order)
1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103
{
	unsigned long nr_pages = 1 << order;
	unsigned long ret, pfn, flags;
	struct zone *z;

	z = NODE_DATA(nid)->node_zones;
	for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
		spin_lock_irqsave(&z->lock, flags);

		pfn = ALIGN(z->zone_start_pfn, nr_pages);
		while (zone_spans_last_pfn(z, pfn, nr_pages)) {
1104
			if (pfn_range_valid_gigantic(z, pfn, nr_pages)) {
1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127
				/*
				 * We release the zone lock here because
				 * alloc_contig_range() will also lock the zone
				 * at some point. If there's an allocation
				 * spinning on this lock, it may win the race
				 * and cause alloc_contig_range() to fail...
				 */
				spin_unlock_irqrestore(&z->lock, flags);
				ret = __alloc_gigantic_page(pfn, nr_pages);
				if (!ret)
					return pfn_to_page(pfn);
				spin_lock_irqsave(&z->lock, flags);
			}
			pfn += nr_pages;
		}

		spin_unlock_irqrestore(&z->lock, flags);
	}

	return NULL;
}

static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
1128
static void prep_compound_gigantic_page(struct page *page, unsigned int order);
1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160

static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
{
	struct page *page;

	page = alloc_gigantic_page(nid, huge_page_order(h));
	if (page) {
		prep_compound_gigantic_page(page, huge_page_order(h));
		prep_new_huge_page(h, page, nid);
	}

	return page;
}

static int alloc_fresh_gigantic_page(struct hstate *h,
				nodemask_t *nodes_allowed)
{
	int nr_nodes, node;

	/* Stop at the first allowed node that yields a gigantic page. */
	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
		if (alloc_fresh_gigantic_page_node(h, node))
			return 1;
	}

	/* No node could provide one. */
	return 0;
}

/* This configuration builds the gigantic page alloc/free helpers above. */
static inline bool gigantic_page_supported(void)
{
	return true;
}
#else
/* Fallback configuration: only no-op gigantic page stubs are provided. */
static inline bool gigantic_page_supported(void)
{
	return false;
}
1161
/* Stub: does nothing in this configuration. */
static inline void free_gigantic_page(struct page *page, unsigned int order)
{
}
1162
/* Stub: does nothing in this configuration. */
static inline void destroy_compound_gigantic_page(struct page *page,
						unsigned int order)
{
}
1164 1165 1166 1167
/* Stub: always reports that no gigantic page was allocated. */
static inline int alloc_fresh_gigantic_page(struct hstate *h,
					nodemask_t *nodes_allowed)
{
	return 0;
}
#endif

1168
static void update_and_free_page(struct hstate *h, struct page *page)
1169 1170
{
	int i;
1171

1172 1173
	if (hstate_is_gigantic(h) && !gigantic_page_supported())
		return;
1174

1175 1176 1177
	h->nr_huge_pages--;
	h->nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < pages_per_huge_page(h); i++) {
1178 1179
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
				1 << PG_referenced | 1 << PG_dirty |
1180 1181
				1 << PG_active | 1 << PG_private |
				1 << PG_writeback);
1182
	}
1183
	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
1184
	set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
1185
	set_page_refcounted(page);
1186 1187 1188 1189 1190 1191
	if (hstate_is_gigantic(h)) {
		destroy_compound_gigantic_page(page, huge_page_order(h));
		free_gigantic_page(page, huge_page_order(h));
	} else {
		__free_pages(page, huge_page_order(h));
	}
1192 1193
}

1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204
struct hstate *size_to_hstate(unsigned long size)
{
	struct hstate *h;

	for_each_hstate(h) {
		if (huge_page_size(h) == size)
			return h;
	}
	return NULL;
}

1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229
/*
 * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
 * to hstate->hugepage_activelist.)
 *
 * This function can be called for tail pages, but never returns true for them.
 */
bool page_huge_active(struct page *page)
{
	VM_BUG_ON_PAGE(!PageHuge(page), page);
	return PageHead(page) && PagePrivate(&page[1]);
}

/* Mark a hugepage active; must only be given a hugetlb head page. */
static void set_page_huge_active(struct page *page)
{
	struct page *first_tail = &page[1];

	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
	SetPagePrivate(first_tail);
}

static void clear_page_huge_active(struct page *page)
{
	struct page *first_tail = &page[1];

	/* Like the setter, this only accepts a hugetlb head page. */
	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
	ClearPagePrivate(first_tail);
}

1230
void free_huge_page(struct page *page)
1231
{
1232 1233 1234 1235
	/*
	 * Can't pass hstate in here because it is called from the
	 * compound page destructor.
	 */
1236
	struct hstate *h = page_hstate(page);
1237
	int nid = page_to_nid(page);
1238 1239
	struct hugepage_subpool *spool =
		(struct hugepage_subpool *)page_private(page);
1240
	bool restore_reserve;
1241

1242
	set_page_private(page, 0);
1243
	page->mapping = NULL;
1244 1245
	VM_BUG_ON_PAGE(page_count(page), page);
	VM_BUG_ON_PAGE(page_mapcount(page), page);
1246
	restore_reserve = PagePrivate(page);
1247
	ClearPagePrivate(page);
1248

1249 1250 1251 1252 1253 1254 1255 1256
	/*
	 * A return code of zero implies that the subpool will be under its
	 * minimum size if the reservation is not restored after page is free.
	 * Therefore, force restore_reserve operation.
	 */
	if (hugepage_subpool_put_pages(spool, 1) == 0)
		restore_reserve = true;

1257
	spin_lock(&hugetlb_lock);
1258
	clear_page_huge_active(page);
1259 1260
	hugetlb_cgroup_uncharge_page(hstate_index(h),
				     pages_per_huge_page(h), page);
1261 1262 1263
	if (restore_reserve)
		h->resv_huge_pages++;

1264
	if (h->surplus_huge_pages_node[nid]) {
1265 1266
		/* remove the page from active list */
		list_del(&page->lru);
1267 1268 1269
		update_and_free_page(h, page);
		h->surplus_huge_pages--;
		h->surplus_huge_pages_node[nid]--;
1270
	} else {
1271
		arch_clear_hugepage_flags(page);
1272
		enqueue_huge_page(h, page);
1273
	}
1274 1275 1276
	spin_unlock(&hugetlb_lock);
}

1277
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
1278
{
1279
	INIT_LIST_HEAD(&page->lru);
1280
	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
1281
	spin_lock(&hugetlb_lock);
1282
	set_hugetlb_cgroup(page, NULL);
1283 1284
	h->nr_huge_pages++;
	h->nr_huge_pages_node[nid]++;
1285 1286 1287 1288
	spin_unlock(&hugetlb_lock);
	put_page(page); /* free it into the hugepage allocator */
}

1289
static void prep_compound_gigantic_page(struct page *page, unsigned int order)
1290 1291 1292 1293 1294 1295 1296
{
	int i;
	int nr_pages = 1 << order;
	struct page *p = page + 1;

	/* we rely on prep_new_huge_page to set the destructor */
	set_compound_order(page, order);
1297
	__ClearPageReserved(page);
1298
	__SetPageHead(page);
1299
	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312
		/*
		 * For gigantic hugepages allocated through bootmem at
		 * boot, it's safer to be consistent with the not-gigantic
		 * hugepages and clear the PG_reserved bit from all tail pages
		 * too.  Otherwse drivers using get_user_pages() to access tail
		 * pages may get the reference counting wrong if they see
		 * PG_reserved set on a tail page (despite the head page not
		 * having PG_reserved set).  Enforcing this consistency between
		 * head and tail pages allows drivers to optimize away a check
		 * on the head page when they need know if put_page() is needed
		 * after get_user_pages().
		 */
		__ClearPageReserved(p);
1313
		set_page_count(p, 0);
1314
		set_compound_head(p, page);
1315
	}
1316
	atomic_set(compound_mapcount_ptr(page), -1);
1317 1318
}

1319 1320 1321 1322 1323
/*
 * PageHuge() only returns true for hugetlbfs pages, but not for normal or
 * transparent huge pages.  See the PageTransHuge() documentation for more
 * details.
 */
1324 1325 1326 1327 1328 1329
int PageHuge(struct page *page)
{
	if (!PageCompound(page))
		return 0;

	page = compound_head(page);
1330
	return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
1331
}
1332 1333
EXPORT_SYMBOL_GPL(PageHuge);

1334 1335 1336 1337 1338 1339 1340 1341 1342
/*
 * PageHeadHuge() only returns true for hugetlbfs head page, but not for
 * normal or transparent huge pages.
 */
int PageHeadHuge(struct page *page_head)
{
	if (!PageHead(page_head))
		return 0;

1343
	return get_compound_page_dtor(page_head) == free_huge_page;
1344 1345
}

1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362
pgoff_t __basepage_index(struct page *page)
{
	struct page *head = compound_head(page);
	pgoff_t head_index = page_index(head);
	unsigned long offset;

	/* Anything that is not a hugetlb page just reports its own index. */
	if (!PageHuge(head))
		return page_index(page);

	/*
	 * Offset of @page inside the huge page: plain pointer difference
	 * for orders below MAX_ORDER, pfn difference otherwise.
	 */
	if (compound_order(head) < MAX_ORDER)
		offset = page - head;
	else
		offset = page_to_pfn(page) - page_to_pfn(head);

	/* Scale the head's index to base-page units and add the offset. */
	return (head_index << compound_order(head)) + offset;
}

1363
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
Linus Torvalds's avatar
Linus Torvalds committed
1364 1365
{
	struct page *page;
1366

1367
	page = __alloc_pages_node(nid,
1368
		htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
1369
						__GFP_REPEAT|__GFP_NOWARN,
1370
		huge_page_order(h));
Linus Torvalds's avatar
Linus Torvalds committed
1371
	if (page) {
1372
		prep_new_huge_page(h, page, nid);
Linus Torvalds's avatar
Linus Torvalds committed
1373
	}
1374 1375 1376 1377

	return page;
}

1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399
static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
{
	int nr_nodes, node;

	/* Try the allowed nodes in turn; the first success ends the search. */
	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
		if (alloc_fresh_huge_page_node(h, node)) {
			count_vm_event(HTLB_BUDDY_PGALLOC);
			return 1;
		}
	}

	/* Every allowed node failed. */
	count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
	return 0;
}

1400 1401 1402 1403 1404 1405
/*
 * Free huge page from pool from next node to free.
 * Attempt to keep persistent huge pages more or less
 * balanced over allowed nodes.
 * Called with hugetlb_lock locked.
 */
1406 1407
static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
							 bool acct_surplus)
1408
{
1409
	int nr_nodes, node;
1410 1411
	int ret = 0;

1412
	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
1413 1414 1415 1416
		/*
		 * If we're returning unused surplus pages, only examine
		 * nodes with surplus pages.
		 */
1417 1418
		if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
		    !list_empty(&h->hugepage_freelists[node])) {
1419
			struct page *page =
1420
				list_entry(h->hugepage_freelists[node].next,
1421 1422 1423
					  struct page, lru);
			list_del(&page->lru);
			h->free_huge_pages--;
1424
			h->free_huge_pages_node[node]--;
1425 1426
			if (acct_surplus) {
				h->surplus_huge_pages--;
1427
				h->surplus_huge_pages_node[node]--;
1428
			}
1429 1430
			update_and_free_page(h, page);
			ret = 1;
1431
			break;
1432
		}
1433
	}
1434 1435 1436 1437

	return ret;
}

1438 1439
/*
 * Dissolve a given free hugepage into free buddy pages. This function does
1440 1441 1442
 * nothing for in-use (including surplus) hugepages. Returns -EBUSY if the
 * number of free hugepages would be reduced below the number of reserved
 * hugepages.
1443
 */
1444
static int dissolve_free_huge_page(struct page *page)
1445
{
1446 1447
	int rc = 0;

1448 1449
	spin_lock(&hugetlb_lock);
	if (PageHuge(page) && !page_count(page)) {
1450 1451 1452
		struct page *head = compound_head(page);
		struct hstate *h = page_hstate(head);
		int nid = page_to_nid(head);
1453 1454 1455 1456
		if (h->free_huge_pages - h->resv_huge_pages == 0) {
			rc = -EBUSY;
			goto out;
		}
1457
		list_del(&head->lru);
1458 1459
		h->free_huge_pages--;
		h->free_huge_pages_node[nid]--;
1460
		h->max_huge_pages--;
1461
		update_and_free_page(h, head);
1462
	}
1463
out:
1464
	spin_unlock(&hugetlb_lock);
1465
	return rc;
1466 1467 1468 1469 1470
}

/*
 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
 * make specified memory blocks removable from the system.
1471 1472
 * Note that this will dissolve a free gigantic hugepage completely, if any
 * part of it lies within the given range.
1473 1474
 * Also note that if dissolve_free_huge_page() returns with an error, all
 * free hugepages that were dissolved before that error are lost.
1475
 */
1476
int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
1477 1478
{
	unsigned long pfn;
1479
	struct page *page;
1480
	int rc = 0;
1481

1482
	if (!hugepages_supported())
1483
		return rc;
1484

1485 1486 1487 1488 1489 1490 1491 1492
	for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
		page = pfn_to_page(pfn);
		if (PageHuge(page) && !page_count(page)) {
			rc = dissolve_free_huge_page(page);
			if (rc)
				break;
		}
	}
1493 1494

	return rc;
1495 1496