fork.c 60.2 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 */

#include <linux/slab.h>
15
#include <linux/sched/autogroup.h>
16
#include <linux/sched/mm.h>
17
#include <linux/sched/coredump.h>
18
#include <linux/sched/user.h>
19
#include <linux/sched/numa_balancing.h>
20
#include <linux/sched/stat.h>
21
#include <linux/sched/task.h>
22
#include <linux/sched/task_stack.h>
23
#include <linux/sched/cputime.h>
24
#include <linux/rtmutex.h>
Linus Torvalds's avatar
Linus Torvalds committed
25 26 27 28 29 30 31 32 33
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
#include <linux/file.h>
Al Viro's avatar
Al Viro committed
34
#include <linux/fdtable.h>
35
#include <linux/iocontext.h>
Linus Torvalds's avatar
Linus Torvalds committed
36 37 38
#include <linux/key.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
Andrea Arcangeli's avatar
Andrea Arcangeli committed
39
#include <linux/mmu_notifier.h>
40
#include <linux/hmm.h>
Linus Torvalds's avatar
Linus Torvalds committed
41
#include <linux/fs.h>
Davidlohr Bueso's avatar
Davidlohr Bueso committed
42 43
#include <linux/mm.h>
#include <linux/vmacache.h>
44
#include <linux/nsproxy.h>
45
#include <linux/capability.h>
Linus Torvalds's avatar
Linus Torvalds committed
46
#include <linux/cpu.h>
47
#include <linux/cgroup.h>
Linus Torvalds's avatar
Linus Torvalds committed
48
#include <linux/security.h>
49
#include <linux/hugetlb.h>
50
#include <linux/seccomp.h>
Linus Torvalds's avatar
Linus Torvalds committed
51 52 53 54
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
55
#include <linux/compat.h>
56
#include <linux/kthread.h>
57
#include <linux/ipipe.h>
58
#include <linux/task_io_accounting_ops.h>
59
#include <linux/rcupdate.h>
Linus Torvalds's avatar
Linus Torvalds committed
60 61 62
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
63
#include <linux/memcontrol.h>
64
#include <linux/ftrace.h>
65
#include <linux/proc_fs.h>
Linus Torvalds's avatar
Linus Torvalds committed
66 67
#include <linux/profile.h>
#include <linux/rmap.h>
Hugh Dickins's avatar
Hugh Dickins committed
68
#include <linux/ksm.h>
Linus Torvalds's avatar
Linus Torvalds committed
69
#include <linux/acct.h>
70
#include <linux/userfaultfd_k.h>
71
#include <linux/tsacct_kern.h>
72
#include <linux/cn_proc.h>
Rafael J. Wysocki's avatar
Rafael J. Wysocki committed
73
#include <linux/freezer.h>
74
#include <linux/delayacct.h>
75
#include <linux/taskstats_kern.h>
76
#include <linux/random.h>
Miloslav Trmac's avatar
Miloslav Trmac committed
77
#include <linux/tty.h>
78
#include <linux/blkdev.h>
79
#include <linux/fs_struct.h>
80
#include <linux/magic.h>
81
#include <linux/perf_event.h>
82
#include <linux/posix-timers.h>
83
#include <linux/user-return-notifier.h>
Ying Han's avatar
Ying Han committed
84
#include <linux/oom.h>
Andrea Arcangeli's avatar
Andrea Arcangeli committed
85
#include <linux/khugepaged.h>
86
#include <linux/signalfd.h>
87
#include <linux/uprobes.h>
88
#include <linux/aio.h>
89
#include <linux/compiler.h>
90
#include <linux/sysctl.h>
Dmitry Vyukov's avatar
Dmitry Vyukov committed
91
#include <linux/kcov.h>
92
#include <linux/livepatch.h>
93
#include <linux/thread_info.h>
94
#include <ipipe/thread_info.h>
Linus Torvalds's avatar
Linus Torvalds committed
95 96 97

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
98
#include <linux/uaccess.h>
Linus Torvalds's avatar
Linus Torvalds committed
99 100 101 102
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

103 104
#include <trace/events/sched.h>

105 106 107
#define CREATE_TRACE_POINTS
#include <trace/events/task.h>

108 109 110 111 112 113 114 115 116 117
/*
 * Minimum number of threads to boot the kernel
 */
#define MIN_THREADS 20

/*
 * Maximum number of threads
 */
#define MAX_THREADS FUTEX_TID_MASK

Linus Torvalds's avatar
Linus Torvalds committed
118 119 120 121
/*
 * Counters protected by write_lock_irq(&tasklist_lock)
 */
unsigned long total_forks;	/* Handle normal Linux uptimes. */
122
int nr_threads;			/* The idle threads do not count.. */
Linus Torvalds's avatar
Linus Torvalds committed
123 124 125 126 127

int max_threads;		/* tunable limit on nr_threads */

DEFINE_PER_CPU(unsigned long, process_counts) = 0;

128
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
129 130 131 132 133 134 135 136

#ifdef CONFIG_PROVE_RCU
/*
 * Lockdep helper: returns whatever lockdep_is_held() reports for
 * tasklist_lock.
 */
int lockdep_tasklist_lock_is_held(void)
{
	int held = lockdep_is_held(&tasklist_lock);

	return held;
}
EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
#endif /* #ifdef CONFIG_PROVE_RCU */
Linus Torvalds's avatar
Linus Torvalds committed
137 138 139 140 141 142

int nr_processes(void)
{
	int cpu;
	int total = 0;

143
	for_each_possible_cpu(cpu)
Linus Torvalds's avatar
Linus Torvalds committed
144 145 146 147 148
		total += per_cpu(process_counts, cpu);

	return total;
}

149 150 151 152
/*
 * Weak default with an empty body; the __weak linkage lets another
 * definition of this symbol replace it at link time.  It is called from
 * free_task() just before the task_struct itself is freed.
 */
void __weak arch_release_task_struct(struct task_struct *tsk)
{
}

153
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
154
static struct kmem_cache *task_struct_cachep;
155 156 157 158 159 160 161 162 163 164

/*
 * Allocate one task_struct from task_struct_cachep with GFP_KERNEL,
 * passing @node through to the slab allocator.  Returns the allocator's
 * result unchanged (NULL on failure).
 */
static inline struct task_struct *alloc_task_struct_node(int node)
{
	struct task_struct *tsk;

	tsk = kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
	return tsk;
}

/* Hand @tsk back to the task_struct_cachep slab cache. */
static inline void free_task_struct(struct task_struct *tsk)
{
	kmem_cache_free(task_struct_cachep, tsk);
}
Linus Torvalds's avatar
Linus Torvalds committed
165 166
#endif

167
/*
 * Weak default that does nothing.  release_task_stack() calls this with
 * the task's stack pointer before the stack is freed.
 */
void __weak arch_release_thread_stack(unsigned long *stack)
{
}

171
#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
172

173 174 175 176
/*
 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
 * kmemcache based allocator.
 */
177
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
178 179 180 181 182 183 184 185

#ifdef CONFIG_VMAP_STACK
/*
 * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
 * flush.  Try to minimize the number of calls by caching stacks.
 */
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203

static int free_vm_stack_cache(unsigned int cpu)
{
	struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
	int i;

	for (i = 0; i < NR_CACHED_STACKS; i++) {
		struct vm_struct *vm_stack = cached_vm_stacks[i];

		if (!vm_stack)
			continue;

		vfree(vm_stack->addr);
		cached_vm_stacks[i] = NULL;
	}

	return 0;
}
204 205
#endif

206
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
207
{
208
#ifdef CONFIG_VMAP_STACK
209 210 211 212
	void *stack;
	int i;

	for (i = 0; i < NR_CACHED_STACKS; i++) {
213 214 215
		struct vm_struct *s;

		s = this_cpu_xchg(cached_stacks[i], NULL);
216 217 218 219

		if (!s)
			continue;

220 221
		/* Clear stale pointers from reused stack. */
		memset(s->addr, 0, THREAD_SIZE);
222

223 224 225 226
		tsk->stack_vm_area = s;
		return s->addr;
	}

227
	stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
228
				     VMALLOC_START, VMALLOC_END,
229
				     THREADINFO_GFP,
230 231
				     PAGE_KERNEL,
				     0, node, __builtin_return_address(0));
232 233 234 235 236 237 238 239 240 241

	/*
	 * We can't call find_vm_area() in interrupt context, and
	 * free_thread_stack() can be called in interrupt context,
	 * so cache the vm_struct.
	 */
	if (stack)
		tsk->stack_vm_area = find_vm_area(stack);
	return stack;
#else
242 243
	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
					     THREAD_SIZE_ORDER);
244 245

	return page ? page_address(page) : NULL;
246
#endif
247 248
}

249
static inline void free_thread_stack(struct task_struct *tsk)
250
{
251 252 253 254 255
#ifdef CONFIG_VMAP_STACK
	if (task_stack_vm_area(tsk)) {
		int i;

		for (i = 0; i < NR_CACHED_STACKS; i++) {
256 257
			if (this_cpu_cmpxchg(cached_stacks[i],
					NULL, tsk->stack_vm_area) != NULL)
258 259 260 261 262
				continue;

			return;
		}

263
		vfree_atomic(tsk->stack);
264 265 266 267 268
		return;
	}
#endif

	__free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
269
}
270
# else
271
static struct kmem_cache *thread_stack_cache;
272

273
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
274 275
						  int node)
{
276
	return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
277 278
}

279
static void free_thread_stack(struct task_struct *tsk)
280
{
281
	kmem_cache_free(thread_stack_cache, tsk->stack);
282 283
}

284
void thread_stack_cache_init(void)
285
{
286
	thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
287
					      THREAD_SIZE, 0, NULL);
288
	BUG_ON(thread_stack_cache == NULL);
289 290
}
# endif
291 292
#endif

Linus Torvalds's avatar
Linus Torvalds committed
293
/* SLAB cache for signal_struct structures (tsk->signal) */
294
static struct kmem_cache *signal_cachep;
Linus Torvalds's avatar
Linus Torvalds committed
295 296

/* SLAB cache for sighand_struct structures (tsk->sighand) */
297
struct kmem_cache *sighand_cachep;
Linus Torvalds's avatar
Linus Torvalds committed
298 299

/* SLAB cache for files_struct structures (tsk->files) */
300
struct kmem_cache *files_cachep;
Linus Torvalds's avatar
Linus Torvalds committed
301 302

/* SLAB cache for fs_struct structures (tsk->fs) */
303
struct kmem_cache *fs_cachep;
Linus Torvalds's avatar
Linus Torvalds committed
304 305

/* SLAB cache for vm_area_struct structures */
306
struct kmem_cache *vm_area_cachep;
Linus Torvalds's avatar
Linus Torvalds committed
307 308

/* SLAB cache for mm_struct structures (tsk->mm) */
309
static struct kmem_cache *mm_cachep;
Linus Torvalds's avatar
Linus Torvalds committed
310

311
static void account_kernel_stack(struct task_struct *tsk, int account)
312
{
313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329
	void *stack = task_stack_page(tsk);
	struct vm_struct *vm = task_stack_vm_area(tsk);

	BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);

	if (vm) {
		int i;

		BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);

		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
			mod_zone_page_state(page_zone(vm->pages[i]),
					    NR_KERNEL_STACK_KB,
					    PAGE_SIZE / 1024 * account);
		}

		/* All stack pages belong to the same memcg. */
330 331
		mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
				     account * (THREAD_SIZE / 1024));
332 333 334 335 336 337 338 339 340 341
	} else {
		/*
		 * All stack pages are in the same zone and belong to the
		 * same memcg.
		 */
		struct page *first_page = virt_to_page(stack);

		mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
				    THREAD_SIZE / 1024 * account);

342 343
		mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB,
				     account * (THREAD_SIZE / 1024));
344
	}
345 346
}

347
static void release_task_stack(struct task_struct *tsk)
Linus Torvalds's avatar
Linus Torvalds committed
348
{
349 350 351
	if (WARN_ON(tsk->state != TASK_DEAD))
		return;  /* Better to leak the stack than to free prematurely */

352
	account_kernel_stack(tsk, -1);
353
	arch_release_thread_stack(tsk->stack);
354
	free_thread_stack(tsk);
355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383
	tsk->stack = NULL;
#ifdef CONFIG_VMAP_STACK
	tsk->stack_vm_area = NULL;
#endif
}

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * Drop one reference on the stack of @tsk; the stack is released when
 * the count reaches zero.
 */
void put_task_stack(struct task_struct *tsk)
{
	if (!atomic_dec_and_test(&tsk->stack_refcount))
		return;

	release_task_stack(tsk);
}
#endif

void free_task(struct task_struct *tsk)
{
#ifndef CONFIG_THREAD_INFO_IN_TASK
	/*
	 * The task is finally done with both the stack and thread_info,
	 * so free both.
	 */
	release_task_stack(tsk);
#else
	/*
	 * If the task had a separate stack allocation, it should be gone
	 * by now.
	 */
	WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
#endif
Ingo Molnar's avatar
Ingo Molnar committed
384
	rt_mutex_debug_task_free(tsk);
385
	ftrace_graph_exit_task(tsk);
386
	put_seccomp_filter(tsk);
387
	arch_release_task_struct(tsk);
388 389
	if (tsk->flags & PF_KTHREAD)
		free_kthread_struct(tsk);
Linus Torvalds's avatar
Linus Torvalds committed
390 391 392 393
	free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);

394 395
static inline void free_signal_struct(struct signal_struct *sig)
{
396
	taskstats_tgid_free(sig);
397
	sched_autogroup_exit(sig);
398 399 400 401
	/*
	 * __mmdrop is not safe to call from softirq context on x86 due to
	 * pgd_dtor so postpone it to the async context
	 */
402
	if (sig->oom_mm)
403
		mmdrop_async(sig->oom_mm);
404 405 406 407 408
	kmem_cache_free(signal_cachep, sig);
}

static inline void put_signal_struct(struct signal_struct *sig)
{
409
	if (atomic_dec_and_test(&sig->sigcnt))
410 411 412
		free_signal_struct(sig);
}

413
void __put_task_struct(struct task_struct *tsk)
Linus Torvalds's avatar
Linus Torvalds committed
414
{
Eugene Teo's avatar
Eugene Teo committed
415
	WARN_ON(!tsk->exit_state);
Linus Torvalds's avatar
Linus Torvalds committed
416 417 418
	WARN_ON(atomic_read(&tsk->usage));
	WARN_ON(tsk == current);

419
	cgroup_free(tsk);
420
	task_numa_free(tsk);
421
	security_task_free(tsk);
422
	exit_creds(tsk);
423
	delayacct_tsk_free(tsk);
424
	put_signal_struct(tsk->signal);
Linus Torvalds's avatar
Linus Torvalds committed
425 426 427 428

	if (!profile_handoff_task(tsk))
		free_task(tsk);
}
429
EXPORT_SYMBOL_GPL(__put_task_struct);
Linus Torvalds's avatar
Linus Torvalds committed
430

431
/*
 * Weak default with an empty body; fork_init() calls this hook after
 * setting up the task_struct cache.
 */
void __init __weak arch_task_cache_init(void) { }
432

433 434 435
/*
 * set_max_threads
 */
436
static void set_max_threads(unsigned int max_threads_suggested)
437
{
438
	u64 threads;
439 440

	/*
441 442
	 * The number of threads shall be limited such that the thread
	 * structures may only consume a small part of the available memory.
443
	 */
444 445 446 447 448 449
	if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64)
		threads = MAX_THREADS;
	else
		threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
				    (u64) THREAD_SIZE * 8UL);

450 451 452
	if (threads > max_threads_suggested)
		threads = max_threads_suggested;

453
	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
454 455
}

456 457 458 459
#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
/* Initialized by the architecture: */
int arch_task_struct_size __read_mostly;
#endif
460

461
/*
 * Boot-time setup for fork: creates the task_struct slab cache (unless
 * the architecture provides its own allocator), computes max_threads and
 * derives the init task's rlimits and the init user namespace's ucount
 * limits from it.
 */
void __init fork_init(void)
{
	int half_max;
	int idx;
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN	0
#endif
	int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);

	/* Slab cache that backs every task_struct allocation. */
	task_struct_cachep = kmem_cache_create("task_struct",
			arch_task_struct_size, align,
			SLAB_PANIC|SLAB_ACCOUNT, NULL);
#endif

	/* Architecture-specific task cache setup. */
	arch_task_cache_init();

	set_max_threads(MAX_THREADS);
	half_max = max_threads/2;

	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = half_max;
	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = half_max;
	init_task.signal->rlim[RLIMIT_SIGPENDING] =
		init_task.signal->rlim[RLIMIT_NPROC];

	for (idx = 0; idx < UCOUNT_COUNTS; idx++)
		init_user_ns.ucount_max[idx] = half_max;

#ifdef CONFIG_VMAP_STACK
	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
			  NULL, free_vm_stack_cache);
#endif

	lockdep_init_task(&init_task);
}

498
/*
 * Weak default: copy the whole task_struct from @src to @dst by
 * structure assignment.  Always returns 0.
 */
int __weak arch_dup_task_struct(struct task_struct *dst,
				struct task_struct *src)
{
	*dst = *src;

	return 0;
}

505 506 507 508 509 510 511 512
/*
 * Write STACK_END_MAGIC at the location returned by end_of_stack() for
 * @tsk; the marker is used for stack overflow detection.
 */
void set_task_stack_end_magic(struct task_struct *tsk)
{
	unsigned long *magic_slot = end_of_stack(tsk);

	*magic_slot = STACK_END_MAGIC;
}

513
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
Linus Torvalds's avatar
Linus Torvalds committed
514 515
{
	struct task_struct *tsk;
516
	unsigned long *stack;
517
	struct vm_struct *stack_vm_area;
Peter Zijlstra's avatar
Peter Zijlstra committed
518
	int err;
Linus Torvalds's avatar
Linus Torvalds committed
519

520 521
	if (node == NUMA_NO_NODE)
		node = tsk_fork_get_node(orig);
522
	tsk = alloc_task_struct_node(node);
Linus Torvalds's avatar
Linus Torvalds committed
523 524 525
	if (!tsk)
		return NULL;

526 527
	stack = alloc_thread_stack_node(tsk, node);
	if (!stack)
528
		goto free_tsk;
Linus Torvalds's avatar
Linus Torvalds committed
529

530 531
	stack_vm_area = task_stack_vm_area(tsk);

532
	err = arch_dup_task_struct(tsk, orig);
533 534 535 536 537 538 539 540 541 542

	/*
	 * arch_dup_task_struct() clobbers the stack-related fields.  Make
	 * sure they're properly initialized before using any stack-related
	 * functions again.
	 */
	tsk->stack = stack;
#ifdef CONFIG_VMAP_STACK
	tsk->stack_vm_area = stack_vm_area;
#endif
543 544 545
#ifdef CONFIG_THREAD_INFO_IN_TASK
	atomic_set(&tsk->stack_refcount, 1);
#endif
546

547
	if (err)
548
		goto free_stack;
549

Kees Cook's avatar
Kees Cook committed
550 551 552 553 554 555 556 557 558
#ifdef CONFIG_SECCOMP
	/*
	 * We must handle setting up seccomp filters once we're under
	 * the sighand lock in case orig has changed between now and
	 * then. Until then, filter must be NULL to avoid messing up
	 * the usage counts on the error path calling free_task.
	 */
	tsk->seccomp.filter = NULL;
#endif
559 560

	setup_thread_stack(tsk, orig);
561 562
	__ipipe_init_threadflags(task_thread_info(tsk));
	__ipipe_init_threadinfo(&task_thread_info(tsk)->ipipe_data);
563
	clear_user_return_notifier(tsk);
564
	clear_tsk_need_resched(tsk);
565
	set_task_stack_end_magic(tsk);
Linus Torvalds's avatar
Linus Torvalds committed
566

567
#ifdef CONFIG_CC_STACKPROTECTOR
568
	tsk->stack_canary = get_random_canary();
569 570
#endif

571 572 573 574 575
	/*
	 * One for us, one for whoever does the "release_task()" (usually
	 * parent)
	 */
	atomic_set(&tsk->usage, 2);
576
#ifdef CONFIG_BLK_DEV_IO_TRACE
577
	tsk->btrace_seq = 0;
578
#endif
579
	tsk->splice_pipe = NULL;
580
	tsk->task_frag.page = NULL;
581
	tsk->wake_q.next = NULL;
582

583
	account_kernel_stack(tsk, 1);
584

Dmitry Vyukov's avatar
Dmitry Vyukov committed
585 586
	kcov_task_init(tsk);

587 588 589 590
#ifdef CONFIG_FAULT_INJECTION
	tsk->fail_nth = 0;
#endif

Linus Torvalds's avatar
Linus Torvalds committed
591
	return tsk;
592

593
free_stack:
594
	free_thread_stack(tsk);
595
free_tsk:
596 597
	free_task_struct(tsk);
	return NULL;
Linus Torvalds's avatar
Linus Torvalds committed
598 599 600
}

#ifdef CONFIG_MMU
601 602
static __latent_entropy int dup_mmap(struct mm_struct *mm,
					struct mm_struct *oldmm)
Linus Torvalds's avatar
Linus Torvalds committed
603
{
604
	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
Linus Torvalds's avatar
Linus Torvalds committed
605 606 607
	struct rb_node **rb_link, *rb_parent;
	int retval;
	unsigned long charge;
608
	LIST_HEAD(uf);
Linus Torvalds's avatar
Linus Torvalds committed
609

610
	uprobe_start_dup_mmap();
611 612 613 614
	if (down_write_killable(&oldmm->mmap_sem)) {
		retval = -EINTR;
		goto fail_uprobe_end;
	}
615
	flush_cache_dup_mm(oldmm);
616
	uprobe_dup_mmap(oldmm, mm);
617 618 619 620
	/*
	 * Not linked in yet - no deadlock potential:
	 */
	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
621

622 623 624
	/* No ordering required: file already has been exposed. */
	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));

625
	mm->total_vm = oldmm->total_vm;
626
	mm->data_vm = oldmm->data_vm;
627 628 629
	mm->exec_vm = oldmm->exec_vm;
	mm->stack_vm = oldmm->stack_vm;

Linus Torvalds's avatar
Linus Torvalds committed
630 631 632
	rb_link = &mm->mm_rb.rb_node;
	rb_parent = NULL;
	pprev = &mm->mmap;
Hugh Dickins's avatar
Hugh Dickins committed
633
	retval = ksm_fork(mm, oldmm);
Andrea Arcangeli's avatar
Andrea Arcangeli committed
634 635 636
	if (retval)
		goto out;
	retval = khugepaged_fork(mm, oldmm);
Hugh Dickins's avatar
Hugh Dickins committed
637 638
	if (retval)
		goto out;
Linus Torvalds's avatar
Linus Torvalds committed
639

640
	prev = NULL;
641
	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
Linus Torvalds's avatar
Linus Torvalds committed
642 643 644
		struct file *file;

		if (mpnt->vm_flags & VM_DONTCOPY) {
645
			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
Linus Torvalds's avatar
Linus Torvalds committed
646 647 648 649
			continue;
		}
		charge = 0;
		if (mpnt->vm_flags & VM_ACCOUNT) {
650 651
			unsigned long len = vma_pages(mpnt);

652
			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
Linus Torvalds's avatar
Linus Torvalds committed
653 654 655
				goto fail_nomem;
			charge = len;
		}
656
		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
Linus Torvalds's avatar
Linus Torvalds committed
657 658 659
		if (!tmp)
			goto fail_nomem;
		*tmp = *mpnt;
660
		INIT_LIST_HEAD(&tmp->anon_vma_chain);
661 662
		retval = vma_dup_policy(mpnt, tmp);
		if (retval)
Linus Torvalds's avatar
Linus Torvalds committed
663
			goto fail_nomem_policy;
Andrea Arcangeli's avatar
Andrea Arcangeli committed
664
		tmp->vm_mm = mm;
665 666 667
		retval = dup_userfaultfd(tmp, &uf);
		if (retval)
			goto fail_nomem_anon_vma_fork;
668 669 670 671 672 673
		if (tmp->vm_flags & VM_WIPEONFORK) {
			/* VM_WIPEONFORK gets a clean slate in the child. */
			tmp->anon_vma = NULL;
			if (anon_vma_prepare(tmp))
				goto fail_nomem_anon_vma_fork;
		} else if (anon_vma_fork(tmp, mpnt))
674
			goto fail_nomem_anon_vma_fork;
675
		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
676
		tmp->vm_next = tmp->vm_prev = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
677 678
		file = tmp->vm_file;
		if (file) {
Al Viro's avatar
Al Viro committed
679
			struct inode *inode = file_inode(file);
Hugh Dickins's avatar
Hugh Dickins committed
680 681
			struct address_space *mapping = file->f_mapping;

Linus Torvalds's avatar
Linus Torvalds committed
682 683 684
			get_file(file);
			if (tmp->vm_flags & VM_DENYWRITE)
				atomic_dec(&inode->i_writecount);
685
			i_mmap_lock_write(mapping);
Hugh Dickins's avatar
Hugh Dickins committed
686
			if (tmp->vm_flags & VM_SHARED)
687
				atomic_inc(&mapping->i_mmap_writable);
Hugh Dickins's avatar
Hugh Dickins committed
688 689
			flush_dcache_mmap_lock(mapping);
			/* insert tmp into the share list, just after mpnt */
690 691
			vma_interval_tree_insert_after(tmp, mpnt,
					&mapping->i_mmap);
Hugh Dickins's avatar
Hugh Dickins committed
692
			flush_dcache_mmap_unlock(mapping);
693
			i_mmap_unlock_write(mapping);
Linus Torvalds's avatar
Linus Torvalds committed
694 695
		}

696 697 698 699 700 701 702 703
		/*
		 * Clear hugetlb-related page reserves for children. This only
		 * affects MAP_PRIVATE mappings. Faults generated by the child
		 * are not guaranteed to succeed, even if read-only
		 */
		if (is_vm_hugetlb_page(tmp))
			reset_vma_resv_huge_pages(tmp);

Linus Torvalds's avatar
Linus Torvalds committed
704
		/*
705
		 * Link in the new vma and copy the page table entries.
Linus Torvalds's avatar
Linus Torvalds committed
706 707 708
		 */
		*pprev = tmp;
		pprev = &tmp->vm_next;
709 710
		tmp->vm_prev = prev;
		prev = tmp;
Linus Torvalds's avatar
Linus Torvalds committed
711 712 713 714 715 716

		__vma_link_rb(mm, tmp, rb_link, rb_parent);
		rb_link = &tmp->vm_rb.rb_right;
		rb_parent = &tmp->vm_rb;

		mm->map_count++;
717 718
		if (!(tmp->vm_flags & VM_WIPEONFORK))
			retval = copy_page_range(mm, oldmm, mpnt);
Linus Torvalds's avatar
Linus Torvalds committed
719 720 721 722 723 724 725

		if (tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		if (retval)
			goto out;
	}
726
	/* a new mm has just been created */
727
	retval = arch_dup_mmap(oldmm, mm);
Linus Torvalds's avatar
Linus Torvalds committed
728
out:
729
	up_write(&mm->mmap_sem);
730
	flush_tlb_mm(oldmm);
Linus Torvalds's avatar
Linus Torvalds committed
731
	up_write(&oldmm->mmap_sem);
732
	dup_userfaultfd_complete(&uf);
733
fail_uprobe_end:
734
	uprobe_end_dup_mmap();
Linus Torvalds's avatar
Linus Torvalds committed
735
	return retval;
736
fail_nomem_anon_vma_fork:
737
	mpol_put(vma_policy(tmp));
Linus Torvalds's avatar
Linus Torvalds committed
738 739 740 741 742 743 744 745
fail_nomem_policy:
	kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
	retval = -ENOMEM;
	vm_unacct_memory(charge);
	goto out;
}

746
static inline int mm_alloc_pgd(struct mm_struct *mm)
Linus Torvalds's avatar
Linus Torvalds committed
747 748 749 750 751 752 753
{
	mm->pgd = pgd_alloc(mm);
	if (unlikely(!mm->pgd))
		return -ENOMEM;
	return 0;
}

754
static inline void mm_free_pgd(struct mm_struct *mm)
Linus Torvalds's avatar
Linus Torvalds committed
755
{
756
	pgd_free(mm, mm->pgd);
Linus Torvalds's avatar
Linus Torvalds committed
757 758
}
#else
759 760 761 762 763 764 765
/*
 * !CONFIG_MMU variant: the only thing carried over from @oldmm is the
 * exe_file pointer, fetched with get_mm_exe_file() while oldmm->mmap_sem
 * is held for writing.  Always returns 0.
 */
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
	down_write(&oldmm->mmap_sem);
	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
	up_write(&oldmm->mmap_sem);
	return 0;
}
Linus Torvalds's avatar
Linus Torvalds committed
766 767 768 769
#define mm_alloc_pgd(mm)	(0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */

Daniel Walker's avatar
Daniel Walker committed
770
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
Linus Torvalds's avatar
Linus Torvalds committed
771

772
#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
Linus Torvalds's avatar
Linus Torvalds committed
773 774
#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))

775 776 777 778 779 780 781 782 783 784 785 786
static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;

/*
 * Handler for the "coredump_filter=" boot parameter: parse @s as a
 * number (base auto-detected), shift it by MMF_DUMP_FILTER_SHIFT, mask
 * it with MMF_DUMP_FILTER_MASK and store the result in
 * default_dump_filter.  Always returns 1.
 */
static int __init coredump_filter_setup(char *s)
{
	unsigned long bits = simple_strtoul(s, NULL, 0);

	default_dump_filter = (bits << MMF_DUMP_FILTER_SHIFT) &
			      MMF_DUMP_FILTER_MASK;
	return 1;
}

__setup("coredump_filter=", coredump_filter_setup);

Linus Torvalds's avatar
Linus Torvalds committed
787 788
#include <linux/init_task.h>

789 790 791 792
/*
 * Reset the AIO state of a new @mm: initialise ioctx_lock and clear
 * ioctx_table.  Compiles to nothing without CONFIG_AIO.
 */
static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
	spin_lock_init(&mm->ioctx_lock);
	mm->ioctx_table = NULL;
#endif
}

797 798 799 800 801 802 803 804 805
/*
 * Clear mm->owner, but only if it currently points at @p.  Compiles to
 * nothing without CONFIG_MEMCG.
 */
static __always_inline void mm_clear_owner(struct mm_struct *mm,
					   struct task_struct *p)
{
#ifdef CONFIG_MEMCG
	if (mm->owner != p)
		return;

	WRITE_ONCE(mm->owner, NULL);
#endif
}

806 807 808 809 810 811 812
/*
 * Record @p as the owner of @mm.  Compiles to nothing without
 * CONFIG_MEMCG.
 */
static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
#ifdef CONFIG_MEMCG
	mm->owner = p;
#endif
}

813 814 815 816 817 818 819
/*
 * Clear the uprobes xol_area pointer of a new @mm.  Compiles to nothing
 * without CONFIG_UPROBES.
 */
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
	mm->uprobes_state.xol_area = NULL;
#endif
}

820 821
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
	struct user_namespace *user_ns)
Linus Torvalds's avatar
Linus Torvalds committed
822
{
823 824 825
	mm->mmap = NULL;
	mm->mm_rb = RB_ROOT;
	mm->vmacache_seqnum = 0;
Linus Torvalds's avatar
Linus Torvalds committed
826 827 828 829
	atomic_set(&mm->mm_users, 1);
	atomic_set(&mm->mm_count, 1);
	init_rwsem(&mm->mmap_sem);
	INIT_LIST_HEAD(&mm->mmlist);
830
	mm->core_state = NULL;
831
	atomic_long_set(&mm->nr_ptes, 0);
832
	mm_nr_pmds_init(mm);
833 834
	mm->map_count = 0;
	mm->locked_vm = 0;
Vladimir Davydov's avatar
Vladimir Davydov committed
835
	mm->pinned_vm = 0;
KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
836
	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
Linus Torvalds's avatar
Linus Torvalds committed
837
	spin_lock_init(&mm->page_table_lock);
838
	mm_init_cpumask(mm);
839
	mm_init_aio(mm);
840
	mm_init_owner(mm, p);
841
	RCU_INIT_POINTER(mm->exe_file, NULL);
842
	mmu_notifier_mm_init(mm);
843
	hmm_mm_init(mm);
844
	init_tlb_flush_pending(mm);
845 846 847
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
	mm->pmd_huge_pte = NULL;
#endif
848
	mm_init_uprobes_state(mm);
Linus Torvalds's avatar
Linus Torvalds committed
849

850 851 852 853 854
	if (current->mm) {
		mm->flags = current->mm->flags & MMF_INIT_MASK;
		mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
	} else {
		mm->flags = default_dump_filter;
Linus Torvalds's avatar
Linus Torvalds committed
855
		mm->def_flags = 0;
856 857
	}

858 859 860 861 862
	if (mm_alloc_pgd(mm))
		goto fail_nopgd;

	if (init_new_context(p, mm))
		goto fail_nocontext;
863

864
	mm->user_ns = get_user_ns(user_ns);
865 866 867 868 869
	return mm;

fail_nocontext:
	mm_free_pgd(mm);
fail_nopgd:
Linus Torvalds's avatar
Linus Torvalds committed
870 871 872 873
	free_mm(mm);
	return NULL;
}

874 875 876 877 878 879 880 881 882 883 884
/*
 * Sanity-check @mm before it is freed: report any rss counter, nr_ptes
 * or nr_pmds value that is not zero.
 */
static void check_mm(struct mm_struct *mm)
{
	int idx;

	for (idx = 0; idx < NR_MM_COUNTERS; idx++) {
		long leftover = atomic_long_read(&mm->rss_stat.count[idx]);

		if (unlikely(leftover)) {
			printk(KERN_ALERT "BUG: Bad rss-counter state "
					  "mm:%p idx:%d val:%ld\n",
			       mm, idx, leftover);
		}
	}

	if (atomic_long_read(&mm->nr_ptes)) {
		pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
			 atomic_long_read(&mm->nr_ptes));
	}

	if (mm_nr_pmds(mm)) {
		pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
			 mm_nr_pmds(mm));
	}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}

Linus Torvalds's avatar
Linus Torvalds committed
898 899 900
/*
 * Allocate and initialize an mm_struct.
 */
901
struct mm_struct *mm_alloc(void)
Linus Torvalds's avatar
Linus Torvalds committed
902
{
903
	struct mm_struct *mm;
Linus Torvalds's avatar
Linus Torvalds committed
904 905

	mm = allocate_mm();
906 907 908 909
	if (!mm)
		return NULL;

	memset(mm, 0, sizeof(*mm));
910
	return mm_init(mm, current, current_user_ns());
Linus Torvalds's avatar
Linus Torvalds committed
911 912 913 914 915 916 917
}

/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 */
918
void __mmdrop(struct mm_struct *mm)
Linus Torvalds's avatar
Linus Torvalds committed
919 920 921 922
{
	BUG_ON(mm == &init_mm);
	mm_free_pgd(mm);
	destroy_context(mm);
923
	hmm_mm_destroy(mm);
Andrea Arcangeli's avatar
Andrea Arcangeli committed
924
	mmu_notifier_mm_destroy(mm);
925
	check_mm(mm);
926
	put_user_ns(mm->user_ns);
Linus Torvalds's avatar
Linus Torvalds committed
927 928
	free_mm(mm);
}
929
EXPORT_SYMBOL_GPL(__mmdrop);
Linus Torvalds's avatar
Linus Torvalds committed
930

931 932 933 934 935 936 937 938
static inline void __mmput(struct mm_struct *mm)
{
	VM_BUG_ON(atomic_read(&mm->mm_users));

	uprobe_clear_state(mm);
	exit_aio(mm);
	ksm_exit(mm);
	khugepaged_exit(mm); /* must run before exit_mmap */
939
	__ipipe_report_cleanup(mm);
940
	exit_mmap(mm);
941
	mm_put_huge_zero_page(mm);
942 943 944 945 946 947 948 949 950 951 952
	set_mm_exe_file(mm, NULL);
	if (!list_empty(&mm->mmlist)) {
		spin_lock(&mmlist_lock);
		list_del(&mm->mmlist);
		spin_unlock(&mmlist_lock);
	}
	if (mm->binfmt)
		module_put(mm->binfmt->module);
	mmdrop(mm);
}

Linus Torvalds's avatar
Linus Torvalds committed
953 954 955 956 957
/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
Andrew Morton's avatar
Andrew Morton committed
958 959
	might_sleep();

960 961 962 963 964
	if (atomic_dec_and_test(&mm->mm_users))
		__mmput(mm);
}
EXPORT_SYMBOL_GPL(mmput);

965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982
#ifdef CONFIG_MMU
/*
 * Work callback queued by mmput_async(): recover the mm that embeds
 * @work as its async_put_work member and run __mmput() on it.
 */
static void mmput_async_fn(struct work_struct *work)
{
	__mmput(container_of(work, struct mm_struct, async_put_work));
}

/*
 * Like mmput(), except that the teardown for the final mm_users
 * reference is handed to a work item instead of being run by the caller.
 */
void mmput_async(struct mm_struct *mm)
{
	if (!atomic_dec_and_test(&mm->mm_users))
		return;

	INIT_WORK(&mm->async_put_work, mmput_async_fn);
	schedule_work(&mm->async_put_work);
}
#endif

983 984 985 986 987
/**
 * set_mm_exe_file - change a reference to the mm's executable file
 *
 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
 *
988 989 990 991 992
 * Main users are mmput() and sys_execve(). Callers prevent concurrent
 * invocations: in mmput() nobody alive left, in execve task is single
 * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
 * mm->exe_file, but does so without using set_mm_exe_file() in order
 * to avoid the need for any locks.
993
 */
994 995
void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
	/*
	 * No RCU read-side protection is needed for this dereference:
	 * callers guarantee that nobody else can access this mm.
	 */
	struct file *prev_file = rcu_dereference_raw(mm->exe_file);

	/* Take the new reference before publishing the pointer. */
	if (new_exe_file)
		get_file(new_exe_file);

	rcu_assign_pointer(mm->exe_file, new_exe_file);

	/* Drop the reference that the old pointer was holding. */
	if (prev_file)
		fput(prev_file);
}

1012 1013 1014 1015 1016