huge_memory.c
/*
 *  Copyright (C) 2009  Red Hat, Inc.
 *
 *  This work is licensed under the terms of the GNU GPL, version 2. See
 *  the COPYING file in the top-level directory.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/dax.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"

/*
 * By default transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications without a
 * guaranteed benefit. When transparent hugepage support is enabled, it is
 * used for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

static struct shrinker deferred_split_shrinker;

static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;

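/*
 * Take a reference on the huge zero page, allocating it on first use.
 * An extra reference is held so that only the shrinker, not the last
 * user, frees the page.  Returns NULL if the allocation fails.
 */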
static struct page *get_huge_zero_page(void)
{
	struct page *zero_page;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return READ_ONCE(huge_zero_page);

	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_page) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return NULL;
	}
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	preempt_disable();
	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
		preempt_enable();
		__free_pages(zero_page, compound_order(zero_page));
		goto retry;
	}

	/* We take additional reference here. It will be put back by shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	return READ_ONCE(huge_zero_page);
}

static void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

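/*
 * Per-mm wrapper around get_huge_zero_page(): the first caller in an mm
 * takes a single reference and records it with MMF_HUGE_ZERO_PAGE, so the
 * reference is dropped exactly once by mm_put_huge_zero_page().
 */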
struct page *mm_get_huge_zero_page(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return READ_ONCE(huge_zero_page);

	if (!get_huge_zero_page())
		return NULL;

	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();

	return READ_ONCE(huge_zero_page);
}

void mm_put_huge_zero_page(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();
}

static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct page *zero_page = xchg(&huge_zero_page, NULL);
		BUG_ON(zero_page == NULL);
		__free_pages(zero_page, compound_order(zero_page));
		return HPAGE_PMD_NR;
	}

	return 0;
}

static struct shrinker huge_zero_page_shrinker = {
	.count_objects = shrink_huge_zero_page_count,
	.scan_objects = shrink_huge_zero_page_scan,
	.seeks = DEFAULT_SEEKS,
};

#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "[always] madvise never\n");
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always [madvise] never\n");
	else
		return sprintf(buf, "always madvise [never]\n");
}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret = count;

	if (!memcmp("always", buf,
		    min(sizeof("always")-1, count))) {
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
	} else if (!memcmp("madvise", buf,
			   min(sizeof("madvise")-1, count))) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (!memcmp("never", buf,
			   min(sizeof("never")-1, count))) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}
static struct kobj_attribute enabled_attr =
	__ATTR(enabled, 0644, enabled_show, enabled_store);

ssize_t single_hugepage_flag_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf,
				enum transparent_hugepage_flag flag)
{
	return sprintf(buf, "%d\n",
		       !!test_bit(flag, &transparent_hugepage_flags));
}

ssize_t single_hugepage_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}

static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "[always] defer defer+madvise madvise never\n");
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always [defer] defer+madvise madvise never\n");
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always defer [defer+madvise] madvise never\n");
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always defer defer+madvise [madvise] never\n");
	return sprintf(buf, "always defer defer+madvise madvise [never]\n");
}

static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	if (!memcmp("always", buf,
		    min(sizeof("always")-1, count))) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
	} else if (!memcmp("defer+madvise", buf,
		    min(sizeof("defer+madvise")-1, count))) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
	} else if (!memcmp("defer", buf,
		    min(sizeof("defer")-1, count))) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
	} else if (!memcmp("madvise", buf,
			   min(sizeof("madvise")-1, count))) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (!memcmp("never", buf,
			   min(sizeof("never")-1, count))) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}
static struct kobj_attribute defrag_attr =
	__ATTR(defrag, 0644, defrag_show, defrag_store);

static ssize_t use_zero_page_show(struct kobject *kobj,
		struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr =
	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);

static ssize_t hpage_pmd_size_show(struct kobject *kobj,
		struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE);
}
static struct kobj_attribute hpage_pmd_size_attr =
	__ATTR_RO(hpage_pmd_size);

#ifdef CONFIG_DEBUG_VM
static ssize_t debug_cow_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
}
static ssize_t debug_cow_store(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
}
static struct kobj_attribute debug_cow_attr =
	__ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
#endif /* CONFIG_DEBUG_VM */

static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
	&use_zero_page_attr.attr,
	&hpage_pmd_size_attr.attr,
#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
	&shmem_enabled_attr.attr,
#endif
#ifdef CONFIG_DEBUG_VM
	&debug_cow_attr.attr,
#endif
	NULL,
};

static const struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};

static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	int err;

	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		pr_err("failed to create transparent hugepage kobject\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto delete_obj;
	}

	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto remove_hp_group;
	}

	return 0;

remove_hp_group:
	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
	kobject_put(*hugepage_kobj);
	return err;
}

static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
	kobject_put(hugepage_kobj);
}
#else
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

static int __init hugepage_init(void)
{
	int err;
	struct kobject *hugepage_kobj;

	if (!has_transparent_hugepage()) {
		transparent_hugepage_flags = 0;
		return -EINVAL;
	}

	/*
	 * hugepages can't be allocated by the buddy allocator
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
	/*
	 * we use page->mapping and page->index in second tail page
	 * as list_head: assuming THP order >= 2
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);

	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		goto err_sysfs;

	err = khugepaged_init();
	if (err)
		goto err_slab;

	err = register_shrinker(&huge_zero_page_shrinker);
	if (err)
		goto err_hzp_shrinker;
	err = register_shrinker(&deferred_split_shrinker);
	if (err)
		goto err_split_shrinker;

	/*
	 * By default disable transparent hugepages on smaller systems,
	 * where the extra memory used could hurt more than TLB overhead
	 * is likely to save.  The admin can still enable it through /sys.
	 */
	if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
		transparent_hugepage_flags = 0;
		return 0;
	}

	err = start_stop_khugepaged();
	if (err)
		goto err_khugepaged;

	return 0;
err_khugepaged:
	unregister_shrinker(&deferred_split_shrinker);
err_split_shrinker:
	unregister_shrinker(&huge_zero_page_shrinker);
err_hzp_shrinker:
	khugepaged_destroy();
err_slab:
	hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
	return err;
}
subsys_initcall(hugepage_init);

static int __init setup_transparent_hugepage(char *str)
{
	int ret = 0;
	if (!str)
		goto out;
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);

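/* Make the pmd writable only if the VMA itself allows writes. */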
pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd);
	return pmd;
}

static inline struct list_head *page_deferred_list(struct page *page)
{
	/*
	 * ->lru in the tail pages is occupied by compound_head.
	 * Let's use ->mapping + ->index in the second tail page as list_head.
	 */
	return (struct list_head *)&page[2].mapping;
}

void prep_transhuge_page(struct page *page)
{
	/*
	 * we use page->mapping and page->index in second tail page
	 * as list_head: assuming THP order >= 2
	 */

	INIT_LIST_HEAD(page_deferred_list(page));
	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}

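/*
 * Ask for a padded unmapped area so the returned address can be aligned to
 * be congruent with the file offset modulo @size, which allows PMD-sized
 * mappings of DAX files (see thp_get_unmapped_area() below).
 */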
unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
		loff_t off, unsigned long flags, unsigned long size)
{
	unsigned long addr;
	loff_t off_end = off + len;
	loff_t off_align = round_up(off, size);
	unsigned long len_pad;

	if (off_end <= off_align || (off_end - off_align) < size)
		return 0;

	len_pad = len + size;
	if (len_pad < len || (off + len_pad) < off)
		return 0;

	addr = current->mm->get_unmapped_area(filp, 0, len_pad,
					      off >> PAGE_SHIFT, flags);
	if (IS_ERR_VALUE(addr))
		return 0;

	addr += (off - addr) & (size - 1);
	return addr;
}

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;

	if (addr)
		goto out;
	if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
		goto out;

	addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
	if (addr)
		return addr;

 out:
	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

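/* Charge, clear and map a freshly allocated huge page at vmf->address. */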
static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
		gfp_t gfp)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mem_cgroup *memcg;
	pgtable_t pgtable;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	int ret = 0;

	VM_BUG_ON_PAGE(!PageCompound(page), page);

	if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg,
				  true)) {
		put_page(page);
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}

	pgtable = pte_alloc_one(vma->vm_mm, haddr);
	if (unlikely(!pgtable)) {
		ret = VM_FAULT_OOM;
		goto release;
	}

	clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * clear_huge_page writes become visible before the set_pmd_at()
	 * write.
	 */
	__SetPageUptodate(page);

579 580
	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd))) {
		goto unlock_release;
	} else {
		pmd_t entry;

		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock_release;

		/* Deliver the page fault to userland */
		if (userfaultfd_missing(vma)) {
			int ret;

			spin_unlock(vmf->ptl);
			mem_cgroup_cancel_charge(page, memcg, true);
			put_page(page);
			pte_free(vma->vm_mm, pgtable);
			ret = handle_userfault(vmf, VM_UFFD_MISSING);
			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			return ret;
		}

		entry = mk_huge_pmd(page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		page_add_new_anon_rmap(page, vma, haddr, true);
		mem_cgroup_commit_charge(page, memcg, false, true);
		lru_cache_add_active_or_unevictable(page, vma);
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		atomic_long_inc(&vma->vm_mm->nr_ptes);
		spin_unlock(vmf->ptl);
		count_vm_event(THP_FAULT_ALLOC);
	}

	return 0;
unlock_release:
	spin_unlock(vmf->ptl);
release:
	if (pgtable)
		pte_free(vma->vm_mm, pgtable);
	mem_cgroup_cancel_charge(page, memcg, true);
	put_page(page);
	return ret;

}

/*
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 *		  fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 *	    available
 * never: never stall for any thp allocation
 */
static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
{
	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
							     __GFP_KSWAPD_RECLAIM);
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
							     0);
	return GFP_TRANSHUGE_LIGHT;
}

/* Caller must hold page table lock. */
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
		struct page *zero_page)
{
	pmd_t entry;
	if (!pmd_none(*pmd))
		return false;
	entry = mk_pmd(zero_page, vma->vm_page_prot);
	entry = pmd_mkhuge(entry);
	if (pgtable)
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, haddr, pmd, entry);
	atomic_long_inc(&mm->nr_ptes);
	return true;
}

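/*
 * Anonymous huge page fault: map the huge zero page for a read fault when
 * that is enabled, otherwise allocate and map a new huge page.  Returns
 * VM_FAULT_FALLBACK when the caller should retry with small pages.
 */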
int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	gfp_t gfp;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;

	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
		return VM_FAULT_FALLBACK;
	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;
	if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
		return VM_FAULT_OOM;
	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm) &&
			transparent_hugepage_use_zero_page()) {
		pgtable_t pgtable;
		struct page *zero_page;
		bool set;
		int ret;
		pgtable = pte_alloc_one(vma->vm_mm, haddr);
		if (unlikely(!pgtable))
			return VM_FAULT_OOM;
		zero_page = mm_get_huge_zero_page(vma->vm_mm);
		if (unlikely(!zero_page)) {
			pte_free(vma->vm_mm, pgtable);
			count_vm_event(THP_FAULT_FALLBACK);
			return VM_FAULT_FALLBACK;
		}
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		ret = 0;
		set = false;
		if (pmd_none(*vmf->pmd)) {
			ret = check_stable_address_space(vma->vm_mm);
			if (ret) {
				spin_unlock(vmf->ptl);
			} else if (userfaultfd_missing(vma)) {
				spin_unlock(vmf->ptl);
				ret = handle_userfault(vmf, VM_UFFD_MISSING);
				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			} else {
				set_huge_zero_page(pgtable, vma->vm_mm, vma,
						   haddr, vmf->pmd, zero_page);
				spin_unlock(vmf->ptl);
				set = true;
			}
		} else
			spin_unlock(vmf->ptl);
		if (!set)
			pte_free(vma->vm_mm, pgtable);
		return ret;
	}
	gfp = alloc_hugepage_direct_gfpmask(vma);
	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
	if (unlikely(!page)) {
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}
	prep_transhuge_page(page);
	return __do_huge_pmd_anonymous_page(vmf, page, gfp);
}

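/*
 * Install a huge pmd mapping a raw pfn for vmf_insert_pfn_pmd() below; the
 * optional @pgtable is deposited for architectures that require one.
 */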
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
		pgtable_t pgtable)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t entry;
	spinlock_t *ptl;

	ptl = pmd_lock(mm, pmd);
	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pmd_mkdevmap(entry);
	if (write) {
		entry = pmd_mkyoung(pmd_mkdirty(entry));
		entry = maybe_pmd_mkwrite(entry, vma);
	}

	if (pgtable) {
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
		atomic_long_inc(&mm->nr_ptes);
	}

	set_pmd_at(mm, addr, pmd, entry);
	update_mmu_cache_pmd(vma, addr, pmd);
	spin_unlock(ptl);
}

int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
			pmd_t *pmd, pfn_t pfn, bool write)
{
	pgprot_t pgprot = vma->vm_page_prot;
	pgtable_t pgtable = NULL;
	/*
	 * If we had pmd_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
	BUG_ON(!pfn_t_devmap(pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	if (arch_needs_pgtable_deposit()) {
		pgtable = pte_alloc_one(vma->vm_mm, addr);
		if (!pgtable)
			return VM_FAULT_OOM;
	}

	track_pfn_insert(vma, &pgprot, pfn);

	insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pud = pud_mkwrite(pud);
	return pud;
}

static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
		pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
{
	struct mm_struct *mm = vma->vm_mm;
	pud_t entry;
	spinlock_t *ptl;

	ptl = pud_lock(mm, pud);
	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pud_mkdevmap(entry);
	if (write) {
		entry = pud_mkyoung(pud_mkdirty(entry));
		entry = maybe_pud_mkwrite(entry, vma);
	}
	set_pud_at(mm, addr, pud, entry);
	update_mmu_cache_pud(vma, addr, pud);
	spin_unlock(ptl);
}

int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
			pud_t *pud, pfn_t pfn, bool write)
{
	pgprot_t pgprot = vma->vm_page_prot;
	/*
	 * If we had pud_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
	BUG_ON(!pfn_t_devmap(pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, pfn);

	insert_pfn_pud(vma, addr, pud, pfn, pgprot, write);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

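/* Mark a followed huge pmd young, and dirty for FOLL_WRITE. */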
static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, int flags)
{
	pmd_t _pmd;

	_pmd = pmd_mkyoung(*pmd);
	if (flags & FOLL_WRITE)
		_pmd = pmd_mkdirty(_pmd);
	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
				pmd, _pmd, flags & FOLL_WRITE))
		update_mmu_cache_pmd(vma, addr, pmd);
}

struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, int flags)
{
	unsigned long pfn = pmd_pfn(*pmd);
	struct mm_struct *mm = vma->vm_mm;
	struct dev_pagemap *pgmap;
	struct page *page;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	/*
	 * When we COW a devmap PMD entry, we split it into PTEs, so we should
	 * not be in this function with `flags & FOLL_COW` set.
	 */
	WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");

	if (flags & FOLL_WRITE && !pmd_write(*pmd))
		return NULL;

	if (pmd_present(*pmd) && pmd_devmap(*pmd))
		/* pass */;
	else
		return NULL;

	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd, flags);

	/*
	 * device mapped pages can only be returned if the
	 * caller will manage the page reference count.
	 */
	if (!(flags & FOLL_GET))
		return ERR_PTR(-EEXIST);

	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
	pgmap = get_dev_pagemap(pfn, NULL);
	if (!pgmap)
		return ERR_PTR(-EFAULT);
	page = pfn_to_page(pfn);
	get_page(page);
	put_dev_pagemap(pgmap);

	return page;
}

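/*
 * Copy a huge pmd at fork(): anonymous THPs are shared copy-on-write (both
 * copies are write-protected); file-backed huge pmds are skipped and
 * re-filled on fault.
 */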
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	struct page *src_page;
	pmd_t pmd;
	pgtable_t pgtable = NULL;
	int ret = -ENOMEM;

	/* Skip if the pmd can be re-filled on fault */
	if (!vma_is_anonymous(vma))
		return 0;

	pgtable = pte_alloc_one(dst_mm, addr);
	if (unlikely(!pgtable))
		goto out;

	dst_ptl = pmd_lock(dst_mm, dst_pmd);
	src_ptl = pmd_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pmd = *src_pmd;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
	if (unlikely(is_swap_pmd(pmd))) {
		swp_entry_t entry = pmd_to_swp_entry(pmd);

		VM_BUG_ON(!is_pmd_migration_entry(pmd));
		if (is_write_migration_entry(entry)) {
			make_migration_entry_read(&entry);
			pmd = swp_entry_to_pmd(entry);
			if (pmd_swp_soft_dirty(*src_pmd))
				pmd = pmd_swp_mksoft_dirty(pmd);
			set_pmd_at(src_mm, addr, src_pmd, pmd);
		}
		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		atomic_long_inc(&dst_mm->nr_ptes);
		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
		ret = 0;
		goto out_unlock;
	}
#endif

	if (unlikely(!pmd_trans_huge(pmd))) {
		pte_free(dst_mm, pgtable);
		goto out_unlock;
	}
	/*
	 * When page table lock is held, the huge zero pmd should not be
	 * under splitting, since we don't split the page itself, only the pmd
	 * into a page table.
	 */
	if (is_huge_zero_pmd(pmd)) {
		struct page *zero_page;
		/*
		 * get_huge_zero_page() will never allocate a new page here,
		 * since we already have a zero page to copy. It just takes a
		 * reference.
		 */
		zero_page = mm_get_huge_zero_page(dst_mm);
		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
				zero_page);
		ret = 0;
		goto out_unlock;
	}

	src_page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
	get_page(src_page);
	page_dup_rmap(src_page, true);
	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
	atomic_long_inc(&dst_mm->nr_ptes);
	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);

	pmdp_set_wrprotect(src_mm, addr, src_pmd);
	pmd = pmd_mkold(pmd_wrprotect(pmd));
	set_pmd_at(dst_mm, addr, dst_pmd, pmd);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
out:
	return ret;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
		pud_t *pud, int flags)
{
	pud_t _pud;

	_pud = pud_mkyoung(*pud);
	if (flags & FOLL_WRITE)
		_pud = pud_mkdirty(_pud);
	if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
				pud, _pud, flags & FOLL_WRITE))
		update_mmu_cache_pud(vma, addr, pud);
}

struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
		pud_t *pud, int flags)
{
	unsigned long pfn = pud_pfn(*pud);
	struct mm_struct *mm = vma->vm_mm;
	struct dev_pagemap *pgmap;
	struct page *page;

	assert_spin_locked(pud_lockptr(mm, pud));

	if (flags & FOLL_WRITE && !pud_write(*pud))
		return NULL;

	if (pud_present(*pud) && pud_devmap(*pud))
		/* pass */;
	else
		return NULL;

	if (flags & FOLL_TOUCH)
		touch_pud(vma, addr, pud, flags);

	/*
	 * device mapped pages can only be returned if the
	 * caller will manage the page reference count.
	 */
	if (!(flags & FOLL_GET))
		return ERR_PTR(-EEXIST);

	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
	pgmap = get_dev_pagemap(pfn, NULL);
	if (!pgmap)
		return ERR_PTR(-EFAULT);
	page = pfn_to_page(pfn);
	get_page(page);
	put_dev_pagemap(pgmap);

	return page;
}

int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
		  struct vm_area_struct *vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	pud_t pud;
	int ret;

	dst_ptl = pud_lock(dst_mm, dst_pud);
	src_ptl = pud_lockptr(src_mm, src_pud);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pud = *src_pud;
	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
		goto out_unlock;

	/*
	 * When page table lock is held, the huge zero pud should not be
	 * under splitting, since we don't split the page itself, only the pud
	 * into a page table.
	 */
	if (is_huge_zero_pud(pud)) {
		/* No huge zero pud yet */
	}

	pudp_set_wrprotect(src_mm, addr, src_pud);
	pud = pud_mkold(pud_wrprotect(pud));
	set_pud_at(dst_mm, addr, dst_pud, pud);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
	return ret;
}

void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
	pud_t entry;
	unsigned long haddr;
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
		goto unlock;

	entry = pud_mkyoung(orig_pud);
	if (write)
		entry = pud_mkdirty(entry);
	haddr = vmf->address & HPAGE_PUD_MASK;
	if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
		update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);

unlock:
	spin_unlock(vmf->ptl);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

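/* Set the accessed (and, for write faults, dirty) bit on a huge pmd. */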
void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
{
	pmd_t entry;
	unsigned long haddr;
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
		goto unlock;

	entry = pmd_mkyoung(orig_pmd);
	if (write)
		entry = pmd_mkdirty(entry);
	haddr = vmf->address & HPAGE_PMD_MASK;
	if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
		update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);

unlock:
	spin_unlock(vmf->ptl);
}

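/*
 * COW fallback: copy the huge page into HPAGE_PMD_NR small pages and map
 * them with a regular page table when a fresh huge page cannot be used.
 */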
static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
		struct page *page)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	struct mem_cgroup *memcg;
	pgtable_t pgtable;
	pmd_t _pmd;
	int ret = 0, i;
	struct page **pages;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
			GFP_KERNEL);
	if (unlikely(!pages)) {
		ret |= VM_FAULT_OOM;
		goto out;
	}

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
					       vmf->address, page_to_nid(page));
		if (unlikely(!pages[i] ||
			     mem_cgroup_try_charge(pages[i], vma->vm_mm,
				     GFP_KERNEL, &memcg, false))) {
			if (pages[i])
				put_page(pages[i]);
			while (--i >= 0) {
				memcg = (void *)page_private(pages[i]);
				set_page_private(pages[i], 0);
				mem_cgroup_cancel_charge(pages[i], memcg,
						false);
				put_page(pages[i]);
			}
			kfree(pages);
			ret |= VM_FAULT_OOM;
			goto out;
		}
		set_page_private(pages[i], (unsigned long)memcg);
	}

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		copy_user_highpage(pages[i], page + i,
				   haddr + PAGE_SIZE * i, vma);
		__SetPageUptodate(pages[i]);
		cond_resched();
	}

	mmun_start = haddr;
	mmun_end   = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
		goto out_free_pages;
	VM_BUG_ON_PAGE(!PageHead(page), page);

	pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
	/* leave pmd empty until pte is filled */

	pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
	pmd_populate(vma->vm_mm, &_pmd, pgtable);

	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
		pte_t entry;
		entry = mk_pte(pages[i], vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		memcg = (void *)page_private(pages[i]);
		set_page_private(pages[i], 0);
		page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
		mem_cgroup_commit_charge(pages[i], memcg, false, false);
		lru_cache_add_active_or_unevictable(pages[i], vma);
		vmf->pte = pte_offset_map(&_pmd, haddr);
		VM_BUG_ON(!pte_none(*vmf->pte));
		set_pte_at(vma->vm_mm,