fork.c 59.3 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 */

#include <linux/slab.h>
15
#include <linux/sched/autogroup.h>
16
#include <linux/sched/mm.h>
17
#include <linux/sched/coredump.h>
18
#include <linux/sched/user.h>
19
#include <linux/sched/numa_balancing.h>
20
#include <linux/sched/stat.h>
21
#include <linux/sched/task.h>
22
#include <linux/sched/task_stack.h>
23
#include <linux/sched/cputime.h>
24
#include <linux/rtmutex.h>
Linus Torvalds's avatar
Linus Torvalds committed
25 26 27 28 29 30 31 32 33
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
#include <linux/file.h>
Al Viro's avatar
Al Viro committed
34
#include <linux/fdtable.h>
35
#include <linux/iocontext.h>
Linus Torvalds's avatar
Linus Torvalds committed
36 37 38
#include <linux/key.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
Andrea Arcangeli's avatar
Andrea Arcangeli committed
39
#include <linux/mmu_notifier.h>
40
#include <linux/hmm.h>
Linus Torvalds's avatar
Linus Torvalds committed
41
#include <linux/fs.h>
Davidlohr Bueso's avatar
Davidlohr Bueso committed
42 43
#include <linux/mm.h>
#include <linux/vmacache.h>
44
#include <linux/nsproxy.h>
45
#include <linux/capability.h>
Linus Torvalds's avatar
Linus Torvalds committed
46
#include <linux/cpu.h>
47
#include <linux/cgroup.h>
Linus Torvalds's avatar
Linus Torvalds committed
48
#include <linux/security.h>
49
#include <linux/hugetlb.h>
50
#include <linux/seccomp.h>
Linus Torvalds's avatar
Linus Torvalds committed
51 52 53 54
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
55
#include <linux/compat.h>
56
#include <linux/kthread.h>
57
#include <linux/task_io_accounting_ops.h>
58
#include <linux/rcupdate.h>
Linus Torvalds's avatar
Linus Torvalds committed
59 60 61
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
62
#include <linux/memcontrol.h>
63
#include <linux/ftrace.h>
64
#include <linux/proc_fs.h>
Linus Torvalds's avatar
Linus Torvalds committed
65 66
#include <linux/profile.h>
#include <linux/rmap.h>
Hugh Dickins's avatar
Hugh Dickins committed
67
#include <linux/ksm.h>
Linus Torvalds's avatar
Linus Torvalds committed
68
#include <linux/acct.h>
69
#include <linux/userfaultfd_k.h>
70
#include <linux/tsacct_kern.h>
71
#include <linux/cn_proc.h>
Rafael J. Wysocki's avatar
Rafael J. Wysocki committed
72
#include <linux/freezer.h>
73
#include <linux/delayacct.h>
74
#include <linux/taskstats_kern.h>
75
#include <linux/random.h>
Miloslav Trmac's avatar
Miloslav Trmac committed
76
#include <linux/tty.h>
77
#include <linux/blkdev.h>
78
#include <linux/fs_struct.h>
79
#include <linux/magic.h>
80
#include <linux/perf_event.h>
81
#include <linux/posix-timers.h>
82
#include <linux/user-return-notifier.h>
Ying Han's avatar
Ying Han committed
83
#include <linux/oom.h>
Andrea Arcangeli's avatar
Andrea Arcangeli committed
84
#include <linux/khugepaged.h>
85
#include <linux/signalfd.h>
86
#include <linux/uprobes.h>
87
#include <linux/aio.h>
88
#include <linux/compiler.h>
89
#include <linux/sysctl.h>
Dmitry Vyukov's avatar
Dmitry Vyukov committed
90
#include <linux/kcov.h>
91
#include <linux/livepatch.h>
92
#include <linux/thread_info.h>
93
#include <ipipe/thread_info.h>
Linus Torvalds's avatar
Linus Torvalds committed
94 95 96

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
97
#include <linux/uaccess.h>
Linus Torvalds's avatar
Linus Torvalds committed
98 99 100 101
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

102 103
#include <trace/events/sched.h>

104 105 106
#define CREATE_TRACE_POINTS
#include <trace/events/task.h>

107 108 109 110 111 112 113 114 115 116
/*
 * Minimum number of threads to boot the kernel
 */
#define MIN_THREADS 20

/*
 * Maximum number of threads
 */
#define MAX_THREADS FUTEX_TID_MASK

Linus Torvalds's avatar
Linus Torvalds committed
117 118 119 120
/*
 * Counters protected by write_lock_irq(&tasklist_lock)
 */
unsigned long total_forks;	/* Handle normal Linux uptimes. */
121
int nr_threads;			/* The idle threads do not count.. */
Linus Torvalds's avatar
Linus Torvalds committed
122 123 124 125 126

int max_threads;		/* tunable limit on nr_threads */

DEFINE_PER_CPU(unsigned long, process_counts) = 0;

127
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
128 129 130 131 132 133 134 135

#ifdef CONFIG_PROVE_RCU
/* Return lockdep's answer to "is tasklist_lock currently held?". */
int lockdep_tasklist_lock_is_held(void)
{
	int held = lockdep_is_held(&tasklist_lock);

	return held;
}
EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
#endif /* #ifdef CONFIG_PROVE_RCU */
Linus Torvalds's avatar
Linus Torvalds committed
136 137 138 139 140 141

int nr_processes(void)
{
	int cpu;
	int total = 0;

142
	for_each_possible_cpu(cpu)
Linus Torvalds's avatar
Linus Torvalds committed
143 144 145 146 147
		total += per_cpu(process_counts, cpu);

	return total;
}

148 149 150 151
/*
 * Default no-op, declared __weak so that another definition can replace
 * it.  free_task() calls this before the task_struct itself is freed.
 */
void __weak arch_release_task_struct(struct task_struct *tsk)
{
}

152
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
153
static struct kmem_cache *task_struct_cachep;
154 155 156 157 158 159 160 161 162 163

/*
 * Allocate one task_struct from task_struct_cachep with GFP_KERNEL on
 * NUMA node @node.  Returns whatever kmem_cache_alloc_node() returns.
 */
static inline struct task_struct *alloc_task_struct_node(int node)
{
	struct task_struct *tsk;

	tsk = kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
	return tsk;
}

/* Give @tsk back to task_struct_cachep. */
static inline void free_task_struct(struct task_struct *tsk)
{
	kmem_cache_free(task_struct_cachep, tsk);
}
Linus Torvalds's avatar
Linus Torvalds committed
164 165
#endif

166
/*
 * Default no-op, declared __weak so that another definition can replace
 * it.  release_task_stack() calls this before the stack is freed.
 */
void __weak arch_release_thread_stack(unsigned long *stack)
{
}

170
#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
171

172 173 174 175
/*
 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
 * kmemcache based allocator.
 */
176
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
177 178 179 180 181 182 183 184

#ifdef CONFIG_VMAP_STACK
/*
 * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
 * flush.  Try to minimize the number of calls by caching stacks.
 */
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202

static int free_vm_stack_cache(unsigned int cpu)
{
	struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
	int i;

	for (i = 0; i < NR_CACHED_STACKS; i++) {
		struct vm_struct *vm_stack = cached_vm_stacks[i];

		if (!vm_stack)
			continue;

		vfree(vm_stack->addr);
		cached_vm_stacks[i] = NULL;
	}

	return 0;
}
203 204
#endif

205
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
206
{
207
#ifdef CONFIG_VMAP_STACK
208 209 210 211
	void *stack;
	int i;

	for (i = 0; i < NR_CACHED_STACKS; i++) {
212 213 214
		struct vm_struct *s;

		s = this_cpu_xchg(cached_stacks[i], NULL);
215 216 217 218

		if (!s)
			continue;

219 220
		/* Clear stale pointers from reused stack. */
		memset(s->addr, 0, THREAD_SIZE);
221

222 223 224 225
		tsk->stack_vm_area = s;
		return s->addr;
	}

226
	stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
227
				     VMALLOC_START, VMALLOC_END,
228
				     THREADINFO_GFP,
229 230
				     PAGE_KERNEL,
				     0, node, __builtin_return_address(0));
231 232 233 234 235 236 237 238 239 240

	/*
	 * We can't call find_vm_area() in interrupt context, and
	 * free_thread_stack() can be called in interrupt context,
	 * so cache the vm_struct.
	 */
	if (stack)
		tsk->stack_vm_area = find_vm_area(stack);
	return stack;
#else
241 242
	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
					     THREAD_SIZE_ORDER);
243 244

	return page ? page_address(page) : NULL;
245
#endif
246 247
}

248
static inline void free_thread_stack(struct task_struct *tsk)
249
{
250 251 252 253 254
#ifdef CONFIG_VMAP_STACK
	if (task_stack_vm_area(tsk)) {
		int i;

		for (i = 0; i < NR_CACHED_STACKS; i++) {
255 256
			if (this_cpu_cmpxchg(cached_stacks[i],
					NULL, tsk->stack_vm_area) != NULL)
257 258 259 260 261
				continue;

			return;
		}

262
		vfree_atomic(tsk->stack);
263 264 265 266 267
		return;
	}
#endif

	__free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
268
}
269
# else
270
static struct kmem_cache *thread_stack_cache;
271

272
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
273 274
						  int node)
{
275
	return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
276 277
}

278
static void free_thread_stack(struct task_struct *tsk)
279
{
280
	kmem_cache_free(thread_stack_cache, tsk->stack);
281 282
}

283
void thread_stack_cache_init(void)
284
{
285
	thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
286
					      THREAD_SIZE, 0, NULL);
287
	BUG_ON(thread_stack_cache == NULL);
288 289
}
# endif
290 291
#endif

Linus Torvalds's avatar
Linus Torvalds committed
292
/* SLAB cache for signal_struct structures (tsk->signal) */
293
static struct kmem_cache *signal_cachep;
Linus Torvalds's avatar
Linus Torvalds committed
294 295

/* SLAB cache for sighand_struct structures (tsk->sighand) */
296
struct kmem_cache *sighand_cachep;
Linus Torvalds's avatar
Linus Torvalds committed
297 298

/* SLAB cache for files_struct structures (tsk->files) */
299
struct kmem_cache *files_cachep;
Linus Torvalds's avatar
Linus Torvalds committed
300 301

/* SLAB cache for fs_struct structures (tsk->fs) */
302
struct kmem_cache *fs_cachep;
Linus Torvalds's avatar
Linus Torvalds committed
303 304

/* SLAB cache for vm_area_struct structures */
305
struct kmem_cache *vm_area_cachep;
Linus Torvalds's avatar
Linus Torvalds committed
306 307

/* SLAB cache for mm_struct structures (tsk->mm) */
308
static struct kmem_cache *mm_cachep;
Linus Torvalds's avatar
Linus Torvalds committed
309

310
static void account_kernel_stack(struct task_struct *tsk, int account)
311
{
312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328
	void *stack = task_stack_page(tsk);
	struct vm_struct *vm = task_stack_vm_area(tsk);

	BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);

	if (vm) {
		int i;

		BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);

		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
			mod_zone_page_state(page_zone(vm->pages[i]),
					    NR_KERNEL_STACK_KB,
					    PAGE_SIZE / 1024 * account);
		}

		/* All stack pages belong to the same memcg. */
329 330
		mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
				     account * (THREAD_SIZE / 1024));
331 332 333 334 335 336 337 338 339 340
	} else {
		/*
		 * All stack pages are in the same zone and belong to the
		 * same memcg.
		 */
		struct page *first_page = virt_to_page(stack);

		mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
				    THREAD_SIZE / 1024 * account);

341 342
		mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB,
				     account * (THREAD_SIZE / 1024));
343
	}
344 345
}

346
static void release_task_stack(struct task_struct *tsk)
Linus Torvalds's avatar
Linus Torvalds committed
347
{
348 349 350
	if (WARN_ON(tsk->state != TASK_DEAD))
		return;  /* Better to leak the stack than to free prematurely */

351
	account_kernel_stack(tsk, -1);
352
	arch_release_thread_stack(tsk->stack);
353
	free_thread_stack(tsk);
354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382
	tsk->stack = NULL;
#ifdef CONFIG_VMAP_STACK
	tsk->stack_vm_area = NULL;
#endif
}

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * Drop one reference on @tsk's stack; the stack is released when the
 * reference count reaches zero.
 */
void put_task_stack(struct task_struct *tsk)
{
	if (!atomic_dec_and_test(&tsk->stack_refcount))
		return;

	release_task_stack(tsk);
}
#endif

void free_task(struct task_struct *tsk)
{
#ifndef CONFIG_THREAD_INFO_IN_TASK
	/*
	 * The task is finally done with both the stack and thread_info,
	 * so free both.
	 */
	release_task_stack(tsk);
#else
	/*
	 * If the task had a separate stack allocation, it should be gone
	 * by now.
	 */
	WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
#endif
Ingo Molnar's avatar
Ingo Molnar committed
383
	rt_mutex_debug_task_free(tsk);
384
	ftrace_graph_exit_task(tsk);
385
	put_seccomp_filter(tsk);
386
	arch_release_task_struct(tsk);
387 388
	if (tsk->flags & PF_KTHREAD)
		free_kthread_struct(tsk);
Linus Torvalds's avatar
Linus Torvalds committed
389 390 391 392
	free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);

393 394
static inline void free_signal_struct(struct signal_struct *sig)
{
395
	taskstats_tgid_free(sig);
396
	sched_autogroup_exit(sig);
397 398 399 400
	/*
	 * __mmdrop is not safe to call from softirq context on x86 due to
	 * pgd_dtor so postpone it to the async context
	 */
401
	if (sig->oom_mm)
402
		mmdrop_async(sig->oom_mm);
403 404 405 406 407
	kmem_cache_free(signal_cachep, sig);
}

static inline void put_signal_struct(struct signal_struct *sig)
{
408
	if (atomic_dec_and_test(&sig->sigcnt))
409 410 411
		free_signal_struct(sig);
}

412
void __put_task_struct(struct task_struct *tsk)
Linus Torvalds's avatar
Linus Torvalds committed
413
{
Eugene Teo's avatar
Eugene Teo committed
414
	WARN_ON(!tsk->exit_state);
Linus Torvalds's avatar
Linus Torvalds committed
415 416 417
	WARN_ON(atomic_read(&tsk->usage));
	WARN_ON(tsk == current);

418
	cgroup_free(tsk);
419
	task_numa_free(tsk);
420
	security_task_free(tsk);
421
	exit_creds(tsk);
422
	delayacct_tsk_free(tsk);
423
	put_signal_struct(tsk->signal);
Linus Torvalds's avatar
Linus Torvalds committed
424 425 426 427

	if (!profile_handoff_task(tsk))
		free_task(tsk);
}
428
EXPORT_SYMBOL_GPL(__put_task_struct);
Linus Torvalds's avatar
Linus Torvalds committed
429

430
/* Default no-op (__weak); fork_init() calls it during boot. */
void __init __weak arch_task_cache_init(void) { }
431

432 433 434
/*
 * set_max_threads
 */
435
static void set_max_threads(unsigned int max_threads_suggested)
436
{
437
	u64 threads;
438 439

	/*
440 441
	 * The number of threads shall be limited such that the thread
	 * structures may only consume a small part of the available memory.
442
	 */
443 444 445 446 447 448
	if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64)
		threads = MAX_THREADS;
	else
		threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
				    (u64) THREAD_SIZE * 8UL);

449 450 451
	if (threads > max_threads_suggested)
		threads = max_threads_suggested;

452
	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
453 454
}

455 456 457 458
#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
/* Initialized by the architecture: */
int arch_task_struct_size __read_mostly;
#endif
459

460
/*
 * Boot-time initialisation for fork: create the task_struct slab cache
 * (unless the architecture supplies its own allocator), run the arch
 * cache hook, compute max_threads, seed init_task's RLIMIT_NPROC and
 * RLIMIT_SIGPENDING and init_user_ns's ucount limits with max_threads/2,
 * and register the vmap stack cache hotplug callback.
 */
void __init fork_init(void)
{
	int i;
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN	0
#endif
	int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);

	/* create a slab on which task_structs can be allocated */
	task_struct_cachep = kmem_cache_create("task_struct",
			arch_task_struct_size, align,
			SLAB_PANIC|SLAB_ACCOUNT, NULL);
#endif

	/* do the arch specific task caches init */
	arch_task_cache_init();

	set_max_threads(MAX_THREADS);

	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
	init_task.signal->rlim[RLIMIT_SIGPENDING] =
		init_task.signal->rlim[RLIMIT_NPROC];

	for (i = 0; i < UCOUNT_COUNTS; i++) {
		init_user_ns.ucount_max[i] = max_threads/2;
	}

#ifdef CONFIG_VMAP_STACK
	/* Free a CPU's cached stacks via free_vm_stack_cache(). */
	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
			  NULL, free_vm_stack_cache);
#endif

	lockdep_init_task(&init_task);
}

497
/*
 * Default (__weak) implementation: plain structure copy of @src into
 * @dst.  Always returns 0.
 */
int __weak arch_dup_task_struct(struct task_struct *dst,
					       struct task_struct *src)
{
	*dst = *src;
	return 0;
}

504 505 506 507 508 509 510 511
void set_task_stack_end_magic(struct task_struct *tsk)
{
	unsigned long *stackend;

	stackend = end_of_stack(tsk);
	*stackend = STACK_END_MAGIC;	/* for overflow detection */
}

512
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
Linus Torvalds's avatar
Linus Torvalds committed
513 514
{
	struct task_struct *tsk;
515
	unsigned long *stack;
516
	struct vm_struct *stack_vm_area;
Peter Zijlstra's avatar
Peter Zijlstra committed
517
	int err;
Linus Torvalds's avatar
Linus Torvalds committed
518

519 520
	if (node == NUMA_NO_NODE)
		node = tsk_fork_get_node(orig);
521
	tsk = alloc_task_struct_node(node);
Linus Torvalds's avatar
Linus Torvalds committed
522 523 524
	if (!tsk)
		return NULL;

525 526
	stack = alloc_thread_stack_node(tsk, node);
	if (!stack)
527
		goto free_tsk;
Linus Torvalds's avatar
Linus Torvalds committed
528

529 530
	stack_vm_area = task_stack_vm_area(tsk);

531
	err = arch_dup_task_struct(tsk, orig);
532 533 534 535 536 537 538 539 540 541

	/*
	 * arch_dup_task_struct() clobbers the stack-related fields.  Make
	 * sure they're properly initialized before using any stack-related
	 * functions again.
	 */
	tsk->stack = stack;
#ifdef CONFIG_VMAP_STACK
	tsk->stack_vm_area = stack_vm_area;
#endif
542 543 544
#ifdef CONFIG_THREAD_INFO_IN_TASK
	atomic_set(&tsk->stack_refcount, 1);
#endif
545

546
	if (err)
547
		goto free_stack;
548

Kees Cook's avatar
Kees Cook committed
549 550 551 552 553 554 555 556 557
#ifdef CONFIG_SECCOMP
	/*
	 * We must handle setting up seccomp filters once we're under
	 * the sighand lock in case orig has changed between now and
	 * then. Until then, filter must be NULL to avoid messing up
	 * the usage counts on the error path calling free_task.
	 */
	tsk->seccomp.filter = NULL;
#endif
558 559

	setup_thread_stack(tsk, orig);
560 561
	__ipipe_init_threadflags(task_thread_info(tsk));
	__ipipe_init_threadinfo(&task_thread_info(tsk)->ipipe_data);
562
	clear_user_return_notifier(tsk);
563
	clear_tsk_need_resched(tsk);
564
	set_task_stack_end_magic(tsk);
Linus Torvalds's avatar
Linus Torvalds committed
565

566
#ifdef CONFIG_CC_STACKPROTECTOR
567
	tsk->stack_canary = get_random_canary();
568 569
#endif

570 571 572 573 574
	/*
	 * One for us, one for whoever does the "release_task()" (usually
	 * parent)
	 */
	atomic_set(&tsk->usage, 2);
575
#ifdef CONFIG_BLK_DEV_IO_TRACE
576
	tsk->btrace_seq = 0;
577
#endif
578
	tsk->splice_pipe = NULL;
579
	tsk->task_frag.page = NULL;
580
	tsk->wake_q.next = NULL;
581

582
	account_kernel_stack(tsk, 1);
583

Dmitry Vyukov's avatar
Dmitry Vyukov committed
584 585
	kcov_task_init(tsk);

586 587 588 589
#ifdef CONFIG_FAULT_INJECTION
	tsk->fail_nth = 0;
#endif

Linus Torvalds's avatar
Linus Torvalds committed
590
	return tsk;
591

592
free_stack:
593
	free_thread_stack(tsk);
594
free_tsk:
595 596
	free_task_struct(tsk);
	return NULL;
Linus Torvalds's avatar
Linus Torvalds committed
597 598 599
}

#ifdef CONFIG_MMU
600 601
static __latent_entropy int dup_mmap(struct mm_struct *mm,
					struct mm_struct *oldmm)
Linus Torvalds's avatar
Linus Torvalds committed
602
{
603
	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
Linus Torvalds's avatar
Linus Torvalds committed
604 605 606
	struct rb_node **rb_link, *rb_parent;
	int retval;
	unsigned long charge;
607
	LIST_HEAD(uf);
Linus Torvalds's avatar
Linus Torvalds committed
608

609
	uprobe_start_dup_mmap();
610 611 612 613
	if (down_write_killable(&oldmm->mmap_sem)) {
		retval = -EINTR;
		goto fail_uprobe_end;
	}
614
	flush_cache_dup_mm(oldmm);
615
	uprobe_dup_mmap(oldmm, mm);
616 617 618 619
	/*
	 * Not linked in yet - no deadlock potential:
	 */
	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
620

621 622 623
	/* No ordering required: file already has been exposed. */
	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));

624
	mm->total_vm = oldmm->total_vm;
625
	mm->data_vm = oldmm->data_vm;
626 627 628
	mm->exec_vm = oldmm->exec_vm;
	mm->stack_vm = oldmm->stack_vm;

Linus Torvalds's avatar
Linus Torvalds committed
629 630 631
	rb_link = &mm->mm_rb.rb_node;
	rb_parent = NULL;
	pprev = &mm->mmap;
Hugh Dickins's avatar
Hugh Dickins committed
632
	retval = ksm_fork(mm, oldmm);
Andrea Arcangeli's avatar
Andrea Arcangeli committed
633 634 635
	if (retval)
		goto out;
	retval = khugepaged_fork(mm, oldmm);
Hugh Dickins's avatar
Hugh Dickins committed
636 637
	if (retval)
		goto out;
Linus Torvalds's avatar
Linus Torvalds committed
638

639
	prev = NULL;
640
	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
Linus Torvalds's avatar
Linus Torvalds committed
641 642 643
		struct file *file;

		if (mpnt->vm_flags & VM_DONTCOPY) {
644
			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
Linus Torvalds's avatar
Linus Torvalds committed
645 646 647 648
			continue;
		}
		charge = 0;
		if (mpnt->vm_flags & VM_ACCOUNT) {
649 650
			unsigned long len = vma_pages(mpnt);

651
			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
Linus Torvalds's avatar
Linus Torvalds committed
652 653 654
				goto fail_nomem;
			charge = len;
		}
655
		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
Linus Torvalds's avatar
Linus Torvalds committed
656 657 658
		if (!tmp)
			goto fail_nomem;
		*tmp = *mpnt;
659
		INIT_LIST_HEAD(&tmp->anon_vma_chain);
660 661
		retval = vma_dup_policy(mpnt, tmp);
		if (retval)
Linus Torvalds's avatar
Linus Torvalds committed
662
			goto fail_nomem_policy;
Andrea Arcangeli's avatar
Andrea Arcangeli committed
663
		tmp->vm_mm = mm;
664 665 666
		retval = dup_userfaultfd(tmp, &uf);
		if (retval)
			goto fail_nomem_anon_vma_fork;
667 668 669 670 671 672
		if (tmp->vm_flags & VM_WIPEONFORK) {
			/* VM_WIPEONFORK gets a clean slate in the child. */
			tmp->anon_vma = NULL;
			if (anon_vma_prepare(tmp))
				goto fail_nomem_anon_vma_fork;
		} else if (anon_vma_fork(tmp, mpnt))
673
			goto fail_nomem_anon_vma_fork;
674
		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
675
		tmp->vm_next = tmp->vm_prev = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
676 677
		file = tmp->vm_file;
		if (file) {
Al Viro's avatar
Al Viro committed
678
			struct inode *inode = file_inode(file);
Hugh Dickins's avatar
Hugh Dickins committed
679 680
			struct address_space *mapping = file->f_mapping;

Linus Torvalds's avatar
Linus Torvalds committed
681 682 683
			get_file(file);
			if (tmp->vm_flags & VM_DENYWRITE)
				atomic_dec(&inode->i_writecount);
684
			i_mmap_lock_write(mapping);
Hugh Dickins's avatar
Hugh Dickins committed
685
			if (tmp->vm_flags & VM_SHARED)
686
				atomic_inc(&mapping->i_mmap_writable);
Hugh Dickins's avatar
Hugh Dickins committed
687 688
			flush_dcache_mmap_lock(mapping);
			/* insert tmp into the share list, just after mpnt */
689 690
			vma_interval_tree_insert_after(tmp, mpnt,
					&mapping->i_mmap);
Hugh Dickins's avatar
Hugh Dickins committed
691
			flush_dcache_mmap_unlock(mapping);
692
			i_mmap_unlock_write(mapping);
Linus Torvalds's avatar
Linus Torvalds committed
693 694
		}

695 696 697 698 699 700 701 702
		/*
		 * Clear hugetlb-related page reserves for children. This only
		 * affects MAP_PRIVATE mappings. Faults generated by the child
		 * are not guaranteed to succeed, even if read-only
		 */
		if (is_vm_hugetlb_page(tmp))
			reset_vma_resv_huge_pages(tmp);

Linus Torvalds's avatar
Linus Torvalds committed
703
		/*
704
		 * Link in the new vma and copy the page table entries.
Linus Torvalds's avatar
Linus Torvalds committed
705 706 707
		 */
		*pprev = tmp;
		pprev = &tmp->vm_next;
708 709
		tmp->vm_prev = prev;
		prev = tmp;
Linus Torvalds's avatar
Linus Torvalds committed
710 711 712 713 714 715

		__vma_link_rb(mm, tmp, rb_link, rb_parent);
		rb_link = &tmp->vm_rb.rb_right;
		rb_parent = &tmp->vm_rb;

		mm->map_count++;
716 717
		if (!(tmp->vm_flags & VM_WIPEONFORK))
			retval = copy_page_range(mm, oldmm, mpnt);
Linus Torvalds's avatar
Linus Torvalds committed
718 719 720 721 722 723 724

		if (tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		if (retval)
			goto out;
	}
725
	/* a new mm has just been created */
726
	retval = arch_dup_mmap(oldmm, mm);
Linus Torvalds's avatar
Linus Torvalds committed
727
out:
728
	up_write(&mm->mmap_sem);
729
	flush_tlb_mm(oldmm);
Linus Torvalds's avatar
Linus Torvalds committed
730
	up_write(&oldmm->mmap_sem);
731
	dup_userfaultfd_complete(&uf);
732
fail_uprobe_end:
733
	uprobe_end_dup_mmap();
Linus Torvalds's avatar
Linus Torvalds committed
734
	return retval;
735
fail_nomem_anon_vma_fork:
736
	mpol_put(vma_policy(tmp));
Linus Torvalds's avatar
Linus Torvalds committed
737 738 739 740 741 742 743 744
fail_nomem_policy:
	kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
	retval = -ENOMEM;
	vm_unacct_memory(charge);
	goto out;
}

745
static inline int mm_alloc_pgd(struct mm_struct *mm)
Linus Torvalds's avatar
Linus Torvalds committed
746 747 748 749 750 751 752
{
	mm->pgd = pgd_alloc(mm);
	if (unlikely(!mm->pgd))
		return -ENOMEM;
	return 0;
}

753
static inline void mm_free_pgd(struct mm_struct *mm)
Linus Torvalds's avatar
Linus Torvalds committed
754
{
755
	pgd_free(mm, mm->pgd);
Linus Torvalds's avatar
Linus Torvalds committed
756 757
}
#else
758 759 760 761 762 763 764
/*
 * !CONFIG_MMU variant: the only state duplicated is the exe_file
 * reference, taken while oldmm->mmap_sem is held for writing.
 * Always returns 0.
 */
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
	struct file *exe;

	down_write(&oldmm->mmap_sem);
	exe = get_mm_exe_file(oldmm);
	RCU_INIT_POINTER(mm->exe_file, exe);
	up_write(&oldmm->mmap_sem);

	return 0;
}
Linus Torvalds's avatar
Linus Torvalds committed
765 766 767 768
#define mm_alloc_pgd(mm)	(0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */

Daniel Walker's avatar
Daniel Walker committed
769
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
Linus Torvalds's avatar
Linus Torvalds committed
770

771
#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
Linus Torvalds's avatar
Linus Torvalds committed
772 773
#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))

774 775 776 777 778 779 780 781 782 783 784 785
static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;

/*
 * Handler for the "coredump_filter=" boot parameter: parse @s as an
 * unsigned number (base auto-detected), shift it into the dump-filter bit
 * position, mask it, and store it in default_dump_filter.  Returns 1.
 */
static int __init coredump_filter_setup(char *s)
{
	unsigned long filter = simple_strtoul(s, NULL, 0);

	filter <<= MMF_DUMP_FILTER_SHIFT;
	default_dump_filter = filter & MMF_DUMP_FILTER_MASK;

	return 1;
}

__setup("coredump_filter=", coredump_filter_setup);

Linus Torvalds's avatar
Linus Torvalds committed
786 787
#include <linux/init_task.h>

788 789 790 791
/*
 * With CONFIG_AIO, initialise @mm's ioctx lock and start with no ioctx
 * table; otherwise a no-op.
 */
static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
	spin_lock_init(&mm->ioctx_lock);
	mm->ioctx_table = NULL;
#endif
}

796 797 798 799 800 801 802
/* With CONFIG_MEMCG, record @p as the owner of @mm; otherwise a no-op. */
static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
#ifdef CONFIG_MEMCG
	mm->owner = p;
#endif
}

803 804 805 806 807 808 809
/* With CONFIG_UPROBES, start @mm with no xol_area; otherwise a no-op. */
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
	mm->uprobes_state.xol_area = NULL;
#endif
}

810 811
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
	struct user_namespace *user_ns)
Linus Torvalds's avatar
Linus Torvalds committed
812
{
813 814 815
	mm->mmap = NULL;
	mm->mm_rb = RB_ROOT;
	mm->vmacache_seqnum = 0;
Linus Torvalds's avatar
Linus Torvalds committed
816 817 818 819
	atomic_set(&mm->mm_users, 1);
	atomic_set(&mm->mm_count, 1);
	init_rwsem(&mm->mmap_sem);
	INIT_LIST_HEAD(&mm->mmlist);
820
	mm->core_state = NULL;
821
	atomic_long_set(&mm->nr_ptes, 0);
822
	mm_nr_pmds_init(mm);
823 824
	mm->map_count = 0;
	mm->locked_vm = 0;
Vladimir Davydov's avatar
Vladimir Davydov committed
825
	mm->pinned_vm = 0;
KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
826
	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
Linus Torvalds's avatar
Linus Torvalds committed
827
	spin_lock_init(&mm->page_table_lock);
828
	mm_init_cpumask(mm);
829
	mm_init_aio(mm);
830
	mm_init_owner(mm, p);
831
	RCU_INIT_POINTER(mm->exe_file, NULL);
832
	mmu_notifier_mm_init(mm);
833
	hmm_mm_init(mm);
834
	init_tlb_flush_pending(mm);
835 836 837
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
	mm->pmd_huge_pte = NULL;
#endif
838
	mm_init_uprobes_state(mm);
Linus Torvalds's avatar
Linus Torvalds committed
839

840 841 842 843 844
	if (current->mm) {
		mm->flags = current->mm->flags & MMF_INIT_MASK;
		mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
	} else {
		mm->flags = default_dump_filter;
Linus Torvalds's avatar
Linus Torvalds committed
845
		mm->def_flags = 0;
846 847
	}

848 849 850 851 852
	if (mm_alloc_pgd(mm))
		goto fail_nopgd;

	if (init_new_context(p, mm))
		goto fail_nocontext;
853

854
	mm->user_ns = get_user_ns(user_ns);
855 856 857 858 859
	return mm;

fail_nocontext:
	mm_free_pgd(mm);
fail_nopgd:
Linus Torvalds's avatar
Linus Torvalds committed
860 861 862 863
	free_mm(mm);
	return NULL;
}

864 865 866 867 868 869 870 871 872 873 874
/*
 * Sanity-check @mm just before it is freed: report any RSS counter,
 * nr_ptes or nr_pmds value that is still non-zero, and (on the relevant
 * configurations) assert that no pmd_huge_pte is left behind.
 */
static void check_mm(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		long x = atomic_long_read(&mm->rss_stat.count[i]);

		if (unlikely(x))
			pr_alert("BUG: Bad rss-counter state mm:%p idx:%d val:%ld\n",
				 mm, i, x);
	}

	if (atomic_long_read(&mm->nr_ptes))
		pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
				atomic_long_read(&mm->nr_ptes));
	if (mm_nr_pmds(mm))
		pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
				mm_nr_pmds(mm));

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}

Linus Torvalds's avatar
Linus Torvalds committed
888 889 890
/*
 * Allocate and initialize an mm_struct.
 */
891
struct mm_struct *mm_alloc(void)
Linus Torvalds's avatar
Linus Torvalds committed
892
{
893
	struct mm_struct *mm;
Linus Torvalds's avatar
Linus Torvalds committed
894 895

	mm = allocate_mm();
896 897 898 899
	if (!mm)
		return NULL;

	memset(mm, 0, sizeof(*mm));
900
	return mm_init(mm, current, current_user_ns());
Linus Torvalds's avatar
Linus Torvalds committed
901 902 903 904 905 906 907
}

/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 */
908
void __mmdrop(struct mm_struct *mm)
Linus Torvalds's avatar
Linus Torvalds committed
909 910 911 912
{
	BUG_ON(mm == &init_mm);
	mm_free_pgd(mm);
	destroy_context(mm);
913
	hmm_mm_destroy(mm);
Andrea Arcangeli's avatar
Andrea Arcangeli committed
914
	mmu_notifier_mm_destroy(mm);
915
	check_mm(mm);
916
	put_user_ns(mm->user_ns);
Linus Torvalds's avatar
Linus Torvalds committed
917 918
	free_mm(mm);
}
919
EXPORT_SYMBOL_GPL(__mmdrop);
Linus Torvalds's avatar
Linus Torvalds committed
920

921 922 923 924 925 926 927 928 929
static inline void __mmput(struct mm_struct *mm)
{
	VM_BUG_ON(atomic_read(&mm->mm_users));

	uprobe_clear_state(mm);
	exit_aio(mm);
	ksm_exit(mm);
	khugepaged_exit(mm); /* must run before exit_mmap */
	exit_mmap(mm);
930
	mm_put_huge_zero_page(mm);
931 932 933 934 935 936 937 938 939 940 941
	set_mm_exe_file(mm, NULL);
	if (!list_empty(&mm->mmlist)) {
		spin_lock(&mmlist_lock);
		list_del(&mm->mmlist);
		spin_unlock(&mmlist_lock);
	}
	if (mm->binfmt)
		module_put(mm->binfmt->module);
	mmdrop(mm);
}

Linus Torvalds's avatar
Linus Torvalds committed
942 943 944 945 946
/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
Andrew Morton's avatar
Andrew Morton committed
947 948
	might_sleep();

949 950 951 952 953
	if (atomic_dec_and_test(&mm->mm_users))
		__mmput(mm);
}
EXPORT_SYMBOL_GPL(mmput);

954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971
#ifdef CONFIG_MMU
/*
 * Work handler for mmput_async(): recover the mm_struct that embeds
 * @work as its async_put_work member and run __mmput() on it.
 */
static void mmput_async_fn(struct work_struct *work)
{
	__mmput(container_of(work, struct mm_struct, async_put_work));
}

/*
 * Drop one mm_users reference on @mm.  If it was the last one, the
 * teardown is not done here but queued as a work item that runs
 * mmput_async_fn().
 */
void mmput_async(struct mm_struct *mm)
{
	if (!atomic_dec_and_test(&mm->mm_users))
		return;

	INIT_WORK(&mm->async_put_work, mmput_async_fn);
	schedule_work(&mm->async_put_work);
}
#endif

972 973 974 975 976
/**
 * set_mm_exe_file - change a reference to the mm's executable file
 *
 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
 *
977 978 979 980 981
 * Main users are mmput() and sys_execve(). Callers prevent concurrent
 * invocations: in mmput() nobody alive left, in execve task is single
 * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
 * mm->exe_file, but does so without using set_mm_exe_file() in order
 * to do avoid the need for any locks.
982
 */
983 984
void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
985 986 987 988 989 990 991 992
	struct file *old_exe_file;

	/*
	 * It is safe to dereference the exe_file without RCU as
	 * this function is only called if nobody else can access
	 * this mm -- see comment above for justification.
	 */
	old_exe_file = rcu_dereference_raw(mm->exe_file);
993

994 995
	if (new_exe_file)
		get_file(new_exe_file);
996 997 998
	rcu_assign_pointer(mm->exe_file, new_exe_file);
	if (old_exe_file)
		fput(old_exe_file);
999 1000
}

1001 1002 1003 1004 1005 1006
/**
 * get_mm_exe_file - acquire a reference to the mm's executable file
 *
 * Returns %NULL if mm has no associated executable file.
 * User must release file via fput().
 */
1007 1008 1009 1010
struct file *get_mm_exe_file(struct mm_struct *mm)
{
	struct file *exe_file;

1011 1012 1013 1014 1015
	rcu_read_lock();
	exe_file = rcu_dereference(mm->exe_file);