// SPDX-License-Identifier: GPL-2.0
/*
 * linux/kernel/seccomp.c
 *
 * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
 *
 * Copyright (C) 2012 Google, Inc.
 * Will Drewry <wad@chromium.org>
 *
 * This defines a simple but solid secure-computing facility.
 *
 * Mode 1 uses a fixed list of allowed system calls.
 * Mode 2 allows user-defined system call filters in the form
 *        of Berkeley Packet Filters/Linux Socket Filters.
 */

#include <linux/refcount.h>
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/coredump.h>
#include <linux/kmemleak.h>
#include <linux/nospec.h>
#include <linux/prctl.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/seccomp.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/sysctl.h>

#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
#include <asm/syscall.h>
#endif

#ifdef CONFIG_SECCOMP_FILTER
#include <linux/filter.h>
#include <linux/pid.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>

/**
 * struct seccomp_filter - container for seccomp BPF programs
 *
 * @usage: reference count to manage the object lifetime.
 *         get/put helpers should be used when accessing an instance
 *         outside of a lifetime-guarded section.  In general, this
 *         is only needed for handling filters shared across tasks.
 * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
 * @prev: points to a previously installed, or inherited, filter
 * @prog: the BPF program to evaluate
 *
 * seccomp_filter objects are organized in a tree linked via the @prev
 * pointer.  For any task, it appears to be a singly-linked list starting
 * with current->seccomp.filter, the most recently attached or inherited filter.
 * However, multiple filters may share a @prev node, by way of fork(), which
 * results in a unidirectional tree existing in memory.  This is similar to
 * how namespaces work.
 *
 * seccomp_filter objects should never be modified after being attached
 * to a task_struct (other than @usage).
 */
struct seccomp_filter {
	refcount_t usage;
	bool log;
	struct seccomp_filter *prev;
	struct bpf_prog *prog;
};

/* Limit any path through the tree to 256KB worth of instructions. */
#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))

/*
 * Endianness is explicitly ignored and left for BPF program authors to manage
 * as per the specific architecture.
 */
static void populate_seccomp_data(struct seccomp_data *sd)
{
	struct task_struct *task = current;
	struct pt_regs *regs = task_pt_regs(task);
	unsigned long args[6];

	sd->nr = syscall_get_nr(task, regs);
	sd->arch = syscall_get_arch();
	syscall_get_arguments(task, regs, 0, 6, args);
	sd->args[0] = args[0];
	sd->args[1] = args[1];
	sd->args[2] = args[2];
	sd->args[3] = args[3];
	sd->args[4] = args[4];
	sd->args[5] = args[5];
	sd->instruction_pointer = KSTK_EIP(task);
}

/**
 *	seccomp_check_filter - verify seccomp filter code
 *	@filter: filter to verify
 *	@flen: length of filter
 *
 * Takes a previously checked filter (by bpf_check_classic) and
 * redirects all filter code that loads struct sk_buff data
 * and related data through seccomp_bpf_load.  It also
 * enforces length and alignment checking of those loads.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
{
	int pc;
	for (pc = 0; pc < flen; pc++) {
		struct sock_filter *ftest = &filter[pc];
		u16 code = ftest->code;
		u32 k = ftest->k;

		switch (code) {
		case BPF_LD | BPF_W | BPF_ABS:
			ftest->code = BPF_LDX | BPF_W | BPF_ABS;
			/* 32-bit aligned and not out of bounds. */
			if (k >= sizeof(struct seccomp_data) || k & 3)
				return -EINVAL;
			continue;
		case BPF_LD | BPF_W | BPF_LEN:
			ftest->code = BPF_LD | BPF_IMM;
			ftest->k = sizeof(struct seccomp_data);
			continue;
		case BPF_LDX | BPF_W | BPF_LEN:
			ftest->code = BPF_LDX | BPF_IMM;
			ftest->k = sizeof(struct seccomp_data);
			continue;
		/* Explicitly include allowed calls. */
		case BPF_RET | BPF_K:
		case BPF_RET | BPF_A:
		case BPF_ALU | BPF_ADD | BPF_K:
		case BPF_ALU | BPF_ADD | BPF_X:
		case BPF_ALU | BPF_SUB | BPF_K:
		case BPF_ALU | BPF_SUB | BPF_X:
		case BPF_ALU | BPF_MUL | BPF_K:
		case BPF_ALU | BPF_MUL | BPF_X:
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_DIV | BPF_X:
		case BPF_ALU | BPF_AND | BPF_K:
		case BPF_ALU | BPF_AND | BPF_X:
		case BPF_ALU | BPF_OR | BPF_K:
		case BPF_ALU | BPF_OR | BPF_X:
		case BPF_ALU | BPF_XOR | BPF_K:
		case BPF_ALU | BPF_XOR | BPF_X:
		case BPF_ALU | BPF_LSH | BPF_K:
		case BPF_ALU | BPF_LSH | BPF_X:
		case BPF_ALU | BPF_RSH | BPF_K:
		case BPF_ALU | BPF_RSH | BPF_X:
		case BPF_ALU | BPF_NEG:
		case BPF_LD | BPF_IMM:
		case BPF_LDX | BPF_IMM:
		case BPF_MISC | BPF_TAX:
		case BPF_MISC | BPF_TXA:
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
		case BPF_ST:
		case BPF_STX:
		case BPF_JMP | BPF_JA:
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			continue;
		default:
			return -EINVAL;
		}
	}
	return 0;
}

/**
 * seccomp_run_filters - evaluates all seccomp filters against @sd
 * @sd: optional seccomp data to be passed to filters
 * @match: stores struct seccomp_filter that resulted in the return value,
 *         unless filter returned SECCOMP_RET_ALLOW, in which case it will
 *         be unchanged.
 *
 * Returns valid seccomp BPF response codes.
 */
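/*
 * Reduce a filter's return value to just its action: the s32 cast makes
 * SECCOMP_RET_KILL_PROCESS (0x80000000U) compare as the most negative,
 * and therefore highest-priority, action.
 */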
#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
static u32 seccomp_run_filters(const struct seccomp_data *sd,
			       struct seccomp_filter **match)
{
	struct seccomp_data sd_local;
	u32 ret = SECCOMP_RET_ALLOW;
	/* Make sure cross-thread synced filter points somewhere sane. */
	struct seccomp_filter *f =
			READ_ONCE(current->seccomp.filter);

	/* Ensure unexpected behavior doesn't result in failing open. */
	if (unlikely(WARN_ON(f == NULL)))
		return SECCOMP_RET_KILL_PROCESS;

	if (!sd) {
		populate_seccomp_data(&sd_local);
		sd = &sd_local;
	}

	/*
	 * All filters in the list are evaluated and the lowest BPF return
	 * value always takes priority (ignoring the DATA).
	 */
	for (; f; f = f->prev) {
		u32 cur_ret = BPF_PROG_RUN(f->prog, sd);

		if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
			ret = cur_ret;
			*match = f;
		}
	}
	return ret;
}
#endif /* CONFIG_SECCOMP_FILTER */

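/*
 * Seccomp mode transitions are one-way: a task that already has a mode
 * may only "re-assign" that same mode, which is how additional filters
 * get stacked in filter mode.
 */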
static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
{
	assert_spin_locked(&current->sighand->siglock);

	if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
		return false;

	return true;
}

void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { }

static inline void seccomp_assign_mode(struct task_struct *task,
				       unsigned long seccomp_mode,
				       unsigned long flags)
{
	assert_spin_locked(&task->sighand->siglock);

	task->seccomp.mode = seccomp_mode;
	/*
	 * Make sure TIF_SECCOMP cannot be set before the mode (and
	 * filter) is set.
	 */
	smp_mb__before_atomic();
	/* Assume default seccomp processes want spec flaw mitigation. */
	if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
		arch_seccomp_spec_mitigate(task);
	set_tsk_thread_flag(task, TIF_SECCOMP);
}

#ifdef CONFIG_SECCOMP_FILTER
/* Returns 1 if the parent is an ancestor of the child. */
static int is_ancestor(struct seccomp_filter *parent,
		       struct seccomp_filter *child)
{
	/* NULL is the root ancestor. */
	if (parent == NULL)
		return 1;
	for (; child; child = child->prev)
		if (child == parent)
			return 1;
	return 0;
}

/**
 * seccomp_can_sync_threads: checks if all threads can be synchronized
 *
 * Expects sighand and cred_guard_mutex locks to be held.
 *
 * Returns 0 on success, -ve on error, or the pid of a thread which was
 * either not in the correct seccomp mode or did not have an ancestral
 * seccomp filter.
 */
static inline pid_t seccomp_can_sync_threads(void)
{
	struct task_struct *thread, *caller;

	BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
	assert_spin_locked(&current->sighand->siglock);

	/* Validate all threads being eligible for synchronization. */
	caller = current;
	for_each_thread(caller, thread) {
		pid_t failed;

		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
		    (thread->seccomp.mode == SECCOMP_MODE_FILTER &&
		     is_ancestor(thread->seccomp.filter,
				 caller->seccomp.filter)))
			continue;

		/* Return the first thread that cannot be synchronized. */
		failed = task_pid_vnr(thread);
		/* If the pid cannot be resolved, then return -ESRCH */
		if (unlikely(WARN_ON(failed == 0)))
			failed = -ESRCH;
		return failed;
	}

	return 0;
}

/**
 * seccomp_sync_threads: sets all threads to use current's filter
 *
 * Expects sighand and cred_guard_mutex locks to be held, and for
 * seccomp_can_sync_threads() to have returned success already
 * without dropping the locks.
 */
static inline void seccomp_sync_threads(unsigned long flags)
{
	struct task_struct *thread, *caller;

	BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
	assert_spin_locked(&current->sighand->siglock);

	/* Synchronize all threads. */
	caller = current;
	for_each_thread(caller, thread) {
		/* Skip current, since it needs no changes. */
		if (thread == caller)
			continue;

		/* Get a task reference for the new leaf node. */
		get_seccomp_filter(caller);
		/*
		 * Drop the task reference to the shared ancestor since
		 * current's path will hold a reference.  (This also
		 * allows a put before the assignment.)
		 */
		put_seccomp_filter(thread);
		smp_store_release(&thread->seccomp.filter,
				  caller->seccomp.filter);

		/*
		 * Don't let an unprivileged task work around
		 * the no_new_privs restriction by creating
		 * a thread that sets it up, enters seccomp,
		 * then dies.
		 */
		if (task_no_new_privs(caller))
			task_set_no_new_privs(thread);

		/*
		 * Opt the other thread into seccomp if needed.
		 * As threads are considered to be trust-realm
		 * equivalent (see ptrace_may_access), it is safe to
		 * allow one thread to transition the other.
		 */
		if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
			seccomp_assign_mode(thread, SECCOMP_MODE_FILTER,
					    flags);
	}
}

/**
 * seccomp_prepare_filter: Prepares a seccomp filter for use.
 * @fprog: BPF program to install
 *
 * Returns filter on success or an ERR_PTR on failure.
 */
static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
	struct seccomp_filter *sfilter;
	int ret;
	const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);

	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
		return ERR_PTR(-EINVAL);

	BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));

	/*
	 * Installing a seccomp filter requires that the task has
	 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
	 * This avoids scenarios where unprivileged tasks can affect the
	 * behavior of privileged children.
	 */
	if (!task_no_new_privs(current) &&
	    security_capable_noaudit(current_cred(), current_user_ns(),
				     CAP_SYS_ADMIN) != 0)
		return ERR_PTR(-EACCES);

	/* Allocate a new seccomp_filter */
	sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
	if (!sfilter)
		return ERR_PTR(-ENOMEM);

	ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
					seccomp_check_filter, save_orig);
	if (ret < 0) {
		kfree(sfilter);
		return ERR_PTR(ret);
	}

	refcount_set(&sfilter->usage, 1);

	return sfilter;
}

/**
 * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
 * @user_filter: pointer to the user data containing a sock_fprog.
 *
 * Returns 0 on success and non-zero otherwise.
 */
static struct seccomp_filter *
seccomp_prepare_user_filter(const char __user *user_filter)
{
	struct sock_fprog fprog;
	struct seccomp_filter *filter = ERR_PTR(-EFAULT);

#ifdef CONFIG_COMPAT
	if (in_compat_syscall()) {
		struct compat_sock_fprog fprog32;
		if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
			goto out;
		fprog.len = fprog32.len;
		fprog.filter = compat_ptr(fprog32.filter);
	} else /* falls through to the if below. */
#endif
	if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
		goto out;
	filter = seccomp_prepare_filter(&fprog);
out:
	return filter;
}

/**
 * seccomp_attach_filter: validate and attach filter
 * @flags:  flags to change filter behavior
 * @filter: seccomp filter to add to the current process
 *
 * Caller must be holding current->sighand->siglock.
 *
 * Returns 0 on success, -ve on error.
 */
static long seccomp_attach_filter(unsigned int flags,
				  struct seccomp_filter *filter)
{
	unsigned long total_insns;
	struct seccomp_filter *walker;

	assert_spin_locked(&current->sighand->siglock);

	/* Validate resulting filter length. */
	total_insns = filter->prog->len;
	for (walker = current->seccomp.filter; walker; walker = walker->prev)
		total_insns += walker->prog->len + 4;  /* 4 instr penalty */
	if (total_insns > MAX_INSNS_PER_PATH)
		return -ENOMEM;

	/* If thread sync has been requested, check that it is possible. */
	if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
		int ret;

		ret = seccomp_can_sync_threads();
		if (ret)
			return ret;
	}

	/* Set log flag, if present. */
	if (flags & SECCOMP_FILTER_FLAG_LOG)
		filter->log = true;

	/*
	 * If there is an existing filter, make it the prev and don't drop its
	 * task reference.
	 */
	filter->prev = current->seccomp.filter;
	current->seccomp.filter = filter;

	/* Now that the new filter is in place, synchronize to all threads. */
	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
		seccomp_sync_threads(flags);

	return 0;
}

static void __get_seccomp_filter(struct seccomp_filter *filter)
{
	/* Reference count is bounded by the number of total processes. */
	refcount_inc(&filter->usage);
}

/* get_seccomp_filter - increments the reference count of the filter on @tsk */
void get_seccomp_filter(struct task_struct *tsk)
{
	struct seccomp_filter *orig = tsk->seccomp.filter;
	if (!orig)
		return;
	__get_seccomp_filter(orig);
}

static inline void seccomp_filter_free(struct seccomp_filter *filter)
{
	if (filter) {
		bpf_prog_destroy(filter->prog);
		kfree(filter);
	}
}

static void __put_seccomp_filter(struct seccomp_filter *orig)
{
	/* Clean up single-reference branches iteratively. */
	while (orig && refcount_dec_and_test(&orig->usage)) {
		struct seccomp_filter *freeme = orig;
		orig = orig->prev;
		seccomp_filter_free(freeme);
	}
}

/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
void put_seccomp_filter(struct task_struct *tsk)
{
	__put_seccomp_filter(tsk->seccomp.filter);
}

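/* Fill out the SIGSYS siginfo that seccomp reports for a trapped syscall. */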
static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason)
{
	memset(info, 0, sizeof(*info));
	info->si_signo = SIGSYS;
	info->si_code = SYS_SECCOMP;
	info->si_call_addr = (void __user *)KSTK_EIP(current);
	info->si_errno = reason;
	info->si_arch = syscall_get_arch();
	info->si_syscall = syscall;
}

/**
 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
 * @syscall: syscall number to send to userland
 * @reason: filter-supplied reason code to send to userland (via si_errno)
 *
 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
 */
static void seccomp_send_sigsys(int syscall, int reason)
{
	struct siginfo info;
	seccomp_init_siginfo(&info, syscall, reason);
	force_sig_info(SIGSYS, &info, current);
}
#endif	/* CONFIG_SECCOMP_FILTER */

/* For use with seccomp_actions_logged */
#define SECCOMP_LOG_KILL_PROCESS	(1 << 0)
#define SECCOMP_LOG_KILL_THREAD		(1 << 1)
#define SECCOMP_LOG_TRAP		(1 << 2)
#define SECCOMP_LOG_ERRNO		(1 << 3)
#define SECCOMP_LOG_TRACE		(1 << 4)
#define SECCOMP_LOG_LOG			(1 << 5)
#define SECCOMP_LOG_ALLOW		(1 << 6)

static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS |
				    SECCOMP_LOG_KILL_THREAD  |
				    SECCOMP_LOG_TRAP  |
				    SECCOMP_LOG_ERRNO |
				    SECCOMP_LOG_TRACE |
				    SECCOMP_LOG_LOG;

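/*
 * Decide whether to force an audit record for @action: RET_ALLOW never
 * forces one, RET_TRAP/ERRNO/TRACE only do so when the matched filter
 * requested logging (SECCOMP_FILTER_FLAG_LOG), and the remaining actions
 * are further gated by the actions_logged sysctl.
 */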
static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
			       bool requested)
{
	bool log = false;

	switch (action) {
	case SECCOMP_RET_ALLOW:
		break;
	case SECCOMP_RET_TRAP:
		log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP;
		break;
	case SECCOMP_RET_ERRNO:
		log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO;
		break;
	case SECCOMP_RET_TRACE:
		log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
		break;
	case SECCOMP_RET_LOG:
		log = seccomp_actions_logged & SECCOMP_LOG_LOG;
		break;
	case SECCOMP_RET_KILL_THREAD:
		log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD;
		break;
	case SECCOMP_RET_KILL_PROCESS:
	default:
		log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS;
	}

	/*
	 * Force an audit message to be emitted when the action is RET_KILL_*,
	 * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is
	 * allowed to be logged by the admin.
	 */
	if (log)
		return __audit_seccomp(syscall, signr, action);

	/*
	 * Let the audit subsystem decide if the action should be audited based
	 * on whether the current task itself is being audited.
	 */
	return audit_seccomp(syscall, signr, action);
}

/*
 * Secure computing mode 1 allows only read/write/exit/sigreturn.
 * To be fully secure this must be combined with rlimit
 * to limit the stack allocations too.
 */
static const int mode1_syscalls[] = {
	__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
	0, /* null terminated */
};

static void __secure_computing_strict(int this_syscall)
{
	const int *syscall_whitelist = mode1_syscalls;
#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		syscall_whitelist = get_compat_mode1_syscalls();
#endif
	do {
		if (*syscall_whitelist == this_syscall)
			return;
	} while (*++syscall_whitelist);

#ifdef SECCOMP_DEBUG
	dump_stack();
#endif
	seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
	do_exit(SIGKILL);
}

#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
void secure_computing_strict(int this_syscall)
{
	int mode = current->seccomp.mode;

	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
		return;

	if (mode == SECCOMP_MODE_DISABLED)
		return;
	else if (mode == SECCOMP_MODE_STRICT)
		__secure_computing_strict(this_syscall);
	else
		BUG();
}
#else

#ifdef CONFIG_SECCOMP_FILTER
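/*
 * Evaluate the current task's filters against @this_syscall and carry out
 * the resulting action.  Returns 0 if the syscall should proceed, or -1
 * if it must be skipped (the return value or signal having already been
 * arranged).
 */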
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
			    const bool recheck_after_trace)
{
	u32 filter_ret, action;
	struct seccomp_filter *match = NULL;
	int data;

	/*
	 * Make sure that any changes to mode from another thread have
	 * been seen after TIF_SECCOMP was seen.
	 */
	rmb();

	filter_ret = seccomp_run_filters(sd, &match);
	data = filter_ret & SECCOMP_RET_DATA;
	action = filter_ret & SECCOMP_RET_ACTION_FULL;

	switch (action) {
	case SECCOMP_RET_ERRNO:
		/* Set low-order bits as an errno, capped at MAX_ERRNO. */
		if (data > MAX_ERRNO)
			data = MAX_ERRNO;
		syscall_set_return_value(current, task_pt_regs(current),
					 -data, 0);
		goto skip;

	case SECCOMP_RET_TRAP:
		/* Show the handler the original registers. */
		syscall_rollback(current, task_pt_regs(current));
		/* Let the filter pass back 16 bits of data. */
		seccomp_send_sigsys(this_syscall, data);
		goto skip;

	case SECCOMP_RET_TRACE:
		/* We've been put in this state by the ptracer already. */
		if (recheck_after_trace)
			return 0;

		/* ENOSYS these calls if there is no tracer attached. */
		if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
			syscall_set_return_value(current,
						 task_pt_regs(current),
						 -ENOSYS, 0);
			goto skip;
		}

		/* Allow the BPF to provide the event message */
		ptrace_event(PTRACE_EVENT_SECCOMP, data);
		/*
		 * The delivery of a fatal signal during event
		 * notification may silently skip tracer notification,
		 * which could leave us with a potentially unmodified
		 * syscall that the tracer would have liked to have
		 * changed. Since the process is about to die, we just
		 * force the syscall to be skipped and let the signal
		 * kill the process and correctly handle any tracer exit
		 * notifications.
		 */
		if (fatal_signal_pending(current))
			goto skip;
		/* Check if the tracer forced the syscall to be skipped. */
		this_syscall = syscall_get_nr(current, task_pt_regs(current));
		if (this_syscall < 0)
			goto skip;

		/*
		 * Recheck the syscall, since it may have changed. This
		 * intentionally uses a NULL struct seccomp_data to force
		 * a reload of all registers. This does not goto skip since
		 * a skip would have already been reported.
		 */
		if (__seccomp_filter(this_syscall, NULL, true))
			return -1;

		return 0;

	case SECCOMP_RET_LOG:
		seccomp_log(this_syscall, 0, action, true);
		return 0;

	case SECCOMP_RET_ALLOW:
		/*
		 * Note that the "match" filter will always be NULL for
		 * this action since SECCOMP_RET_ALLOW is the starting
		 * state in seccomp_run_filters().
		 */
		return 0;

	case SECCOMP_RET_KILL_THREAD:
	case SECCOMP_RET_KILL_PROCESS:
	default:
		seccomp_log(this_syscall, SIGSYS, action, true);
		/* Dump core only if this is the last remaining thread. */
		if (action == SECCOMP_RET_KILL_PROCESS ||
		    get_nr_threads(current) == 1) {
			siginfo_t info;

			/* Show the original registers in the dump. */
			syscall_rollback(current, task_pt_regs(current));
			/* Trigger a manual coredump since do_exit skips it. */
			seccomp_init_siginfo(&info, this_syscall, data);
			do_coredump(&info);
		}
		if (action == SECCOMP_RET_KILL_PROCESS)
			do_group_exit(SIGSYS);
		else
			do_exit(SIGSYS);
	}

	unreachable();

skip:
	seccomp_log(this_syscall, 0, action, match ? match->log : false);
	return -1;
}
#else
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
			    const bool recheck_after_trace)
{
	BUG();
}
#endif

int __secure_computing(const struct seccomp_data *sd)
{
	int mode = current->seccomp.mode;
	int this_syscall;

	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
		return 0;

	this_syscall = sd ? sd->nr :
		syscall_get_nr(current, task_pt_regs(current));

	switch (mode) {
	case SECCOMP_MODE_STRICT:
		__secure_computing_strict(this_syscall);  /* may call do_exit */
		return 0;
	case SECCOMP_MODE_FILTER:
		return __seccomp_filter(this_syscall, sd, false);
	default:
		BUG();
	}
}
#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */

long prctl_get_seccomp(void)
{
	return current->seccomp.mode;
}

/**
 * seccomp_set_mode_strict: internal function for setting strict seccomp
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success or -EINVAL on failure.
 */
static long seccomp_set_mode_strict(void)
{
	const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
	long ret = -EINVAL;

	spin_lock_irq(&current->sighand->siglock);

	if (!seccomp_may_assign_mode(seccomp_mode))
		goto out;

#ifdef TIF_NOTSC
	disable_TSC();
#endif
	seccomp_assign_mode(current, seccomp_mode, 0);
	ret = 0;

out:
	spin_unlock_irq(&current->sighand->siglock);

	return ret;
}

#ifdef CONFIG_SECCOMP_FILTER
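/*
 * Purely illustrative (not kernel code): a minimal userspace sketch of
 * installing an allow-all filter through the seccomp(2) entry point
 * defined further below:
 *
 *	struct sock_filter insn =
 *		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW);
 *	struct sock_fprog prog = { .len = 1, .filter = &insn };
 *
 *	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 *	syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, 0, &prog);
 */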
/**
 * seccomp_set_mode_filter: internal function for setting seccomp filter
 * @flags:  flags to change filter behavior
 * @filter: struct sock_fprog containing filter
 *
 * This function may be called repeatedly to install additional filters.
 * Every filter successfully installed will be evaluated (in reverse order)
 * for each system call the task makes.
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success or -EINVAL on failure.
 */
static long seccomp_set_mode_filter(unsigned int flags,
				    const char __user *filter)
{
	const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
	struct seccomp_filter *prepared = NULL;
	long ret = -EINVAL;

	/* Validate flags. */
	if (flags & ~SECCOMP_FILTER_FLAG_MASK)
		return -EINVAL;

	/* Prepare the new filter before holding any locks. */
	prepared = seccomp_prepare_user_filter(filter);
	if (IS_ERR(prepared))
		return PTR_ERR(prepared);

	/*
	 * Make sure we cannot change seccomp or nnp state via TSYNC
	 * while another thread is in the middle of calling exec.
	 */
	if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
	    mutex_lock_killable(&current->signal->cred_guard_mutex))
		goto out_free;

	spin_lock_irq(&current->sighand->siglock);

	if (!seccomp_may_assign_mode(seccomp_mode))
		goto out;

	ret = seccomp_attach_filter(flags, prepared);
	if (ret)
		goto out;
	/* Do not free the successfully attached filter. */
	prepared = NULL;

	seccomp_assign_mode(current, seccomp_mode, flags);
out:
	spin_unlock_irq(&current->sighand->siglock);
	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
		mutex_unlock(&current->signal->cred_guard_mutex);
out_free:
	seccomp_filter_free(prepared);
	return ret;
}
#else
static inline long seccomp_set_mode_filter(unsigned int flags,
					   const char __user *filter)
{
	return -EINVAL;
}
#endif

static long seccomp_get_action_avail(const char __user *uaction)
{
	u32 action;

	if (copy_from_user(&action, uaction, sizeof(action)))
		return -EFAULT;

	switch (action) {
	case SECCOMP_RET_KILL_PROCESS:
	case SECCOMP_RET_KILL_THREAD:
	case SECCOMP_RET_TRAP:
	case SECCOMP_RET_ERRNO:
	case SECCOMP_RET_TRACE:
	case SECCOMP_RET_LOG:
	case SECCOMP_RET_ALLOW:
		break;
	default:
		return -EOPNOTSUPP;
	}

	return 0;
}

/* Common entry point for both prctl and syscall. */
static long do_seccomp(unsigned int op, unsigned int flags,
		       const char __user *uargs)
{
	switch (op) {
	case SECCOMP_SET_MODE_STRICT:
		if (flags != 0 || uargs != NULL)
			return -EINVAL;
		return seccomp_set_mode_strict();
	case SECCOMP_SET_MODE_FILTER:
		return seccomp_set_mode_filter(flags, uargs);
	case SECCOMP_GET_ACTION_AVAIL:
		if (flags != 0)
			return -EINVAL;

		return seccomp_get_action_avail(uargs);
	default:
		return -EINVAL;
	}
}

SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
			 const char __user *, uargs)
{
	return do_seccomp(op, flags, uargs);
}

/**
 * prctl_set_seccomp: configures current->seccomp.mode
 * @seccomp_mode: requested mode to use
 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
 *
 * Returns 0 on success or -EINVAL on failure.
 */
long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
	unsigned int op;
	char __user *uargs;

	switch (seccomp_mode) {
	case SECCOMP_MODE_STRICT:
		op = SECCOMP_SET_MODE_STRICT;
		/*
		 * Setting strict mode through prctl always ignored filter,
		 * so make sure it is always NULL here to pass the internal
		 * check in do_seccomp().
		 */
		uargs = NULL;
		break;
	case SECCOMP_MODE_FILTER:
		op = SECCOMP_SET_MODE_FILTER;
		uargs = filter;
		break;
	default:
		return -EINVAL;
	}

	/* prctl interface doesn't have flags, so they are always zero. */
	return do_seccomp(op, 0, uargs);
}

#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
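/*
 * Back-end for ptrace(PTRACE_SECCOMP_GET_FILTER): copies the classic BPF
 * program sitting @filter_off entries up @task's filter chain out to
 * @data and returns its length in instructions (length only if @data is
 * NULL).
 */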
long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
			void __user *data)
{
	struct seccomp_filter *filter;
	struct sock_fprog_kern *fprog;
	long ret;
	unsigned long count = 0;

	if (!capable(CAP_SYS_ADMIN) ||
	    current->seccomp.mode != SECCOMP_MODE_DISABLED) {
		return -EACCES;
	}

	spin_lock_irq(&task->sighand->siglock);
	if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
		ret = -EINVAL;
		goto out;
	}

	filter = task->seccomp.filter;
	while (filter) {
		filter = filter->prev;
		count++;
	}

	if (filter_off >= count) {
		ret = -ENOENT;
		goto out;
	}
	count -= filter_off;

	filter = task->seccomp.filter;
	while (filter && count > 1) {
		filter = filter->prev;
		count--;
	}

	if (WARN_ON(count != 1 || !filter)) {
		/* The filter tree shouldn't shrink while we're using it. */
		ret = -ENOENT;
		goto out;
	}

	fprog = filter->prog->orig_prog;
	if (!fprog) {
		/* This must be a new non-cBPF filter, since we save
		 * every cBPF filter's orig_prog above when
		 * CONFIG_CHECKPOINT_RESTORE is enabled.
		 */
		ret = -EMEDIUMTYPE;
		goto out;
	}

	ret = fprog->len;
	if (!data)
		goto out;

	__get_seccomp_filter(filter);
	spin_unlock_irq(&task->sighand->siglock);

	if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
		ret = -EFAULT;

	__put_seccomp_filter(filter);
	return ret;

out:
	spin_unlock_irq(&task->sighand->siglock);
	return ret;
}
#endif

#ifdef CONFIG_SYSCTL

/* Human readable action names for friendly sysctl interaction */
#define SECCOMP_RET_KILL_PROCESS_NAME	"kill_process"
#define SECCOMP_RET_KILL_THREAD_NAME	"kill_thread"
#define SECCOMP_RET_TRAP_NAME		"trap"
#define SECCOMP_RET_ERRNO_NAME		"errno"
#define SECCOMP_RET_TRACE_NAME		"trace"
#define SECCOMP_RET_LOG_NAME		"log"
#define SECCOMP_RET_ALLOW_NAME		"allow"

static const char seccomp_actions_avail[] =
				SECCOMP_RET_KILL_PROCESS_NAME	" "
				SECCOMP_RET_KILL_THREAD_NAME	" "
				SECCOMP_RET_TRAP_NAME		" "
				SECCOMP_RET_ERRNO_NAME		" "
				SECCOMP_RET_TRACE_NAME		" "
				SECCOMP_RET_LOG_NAME		" "
				SECCOMP_RET_ALLOW_NAME;

struct seccomp_log_name {
	u32		log;
	const char	*name;
};

static const struct seccomp_log_name seccomp_log_names[] = {
	{ SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME },
	{ SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
	{ SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
	{ SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
	{ SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
	{ SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
	{ SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
	{ }
};

static bool seccomp_names_from_actions_logged(char *names, size_t size,
					      u32 actions_logged)
{
	const struct seccomp_log_name *cur;
	bool append_space = false;

	for (cur = seccomp_log_names; cur->name && size; cur++) {
		ssize_t ret;

		if (!(actions_logged & cur->log))
			continue;

		if (append_space) {
			ret = strscpy(names, " ", size);
			if (ret < 0)
				return false;

			names += ret;
			size -= ret;
		} else
			append_space = true;

		ret = strscpy(names, cur->name, size);
		if (ret < 0)
			return false;

		names += ret;
		size -= ret;
	}

	return true;
}

static bool seccomp_action_logged_from_name(u32 *action_logged,
					    const char *name)
{
	const struct seccomp_log_name *cur;

	for (cur = seccomp_log_names; cur->name; cur++) {
		if (!strcmp(cur->name, name)) {
			*action_logged = cur->log;
			return true;
		}
	}

	return false;
}

static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
{
	char *name;

	*actions_logged = 0;
	while ((name = strsep(&names, " ")) && *name) {
		u32 action_logged = 0;

		if (!seccomp_action_logged_from_name(&action_logged, name))
			return false;

		*actions_logged |= action_logged;
	}

	return true;
}

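/*
 * Handler for the kernel.seccomp.actions_logged sysctl: reads render
 * seccomp_actions_logged as space-separated action names, and writes
 * accept the same format (logging SECCOMP_RET_ALLOW is rejected).
 */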
static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
					  void __user *buffer, size_t *lenp,
					  loff_t *ppos)
{
	char names[sizeof(seccomp_actions_avail)];
	struct ctl_table table;
	int ret;

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	memset(names, 0, sizeof(names));

	if (!write) {
		if (!seccomp_names_from_actions_logged(names, sizeof(names),
						       seccomp_actions_logged))
			return -EINVAL;
	}

	table = *ro_table;
	table.data = names;
	table.maxlen = sizeof(names);
	ret = proc_dostring(&table, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	if (write) {
		u32 actions_logged;

		if (!seccomp_actions_logged_from_names(&actions_logged,
						       table.data))
			return -EINVAL;

		if (actions_logged & SECCOMP_LOG_ALLOW)
			return -EINVAL;

		seccomp_actions_logged = actions_logged;
	}

	return 0;
}

static struct ctl_path seccomp_sysctl_path[] = {
	{ .procname = "kernel", },
	{ .procname = "seccomp", },
	{ }
};

static struct ctl_table seccomp_sysctl_table[] = {
	{
		.procname	= "actions_avail",
		.data		= (void *) &seccomp_actions_avail,
		.maxlen		= sizeof(seccomp_actions_avail),
		.mode		= 0444,
		.proc_handler	= proc_dostring,
	},
	{
		.procname	= "actions_logged",
		.mode		= 0644,
		.proc_handler	= seccomp_actions_logged_handler,
	},
	{ }
};

static int __init seccomp_sysctl_init(void)
{
	struct ctl_table_header *hdr;

	hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table);
	if (!hdr)
		pr_warn("seccomp: sysctl registration failed\n");
	else
		kmemleak_not_leak(hdr);

	return 0;
}

device_initcall(seccomp_sysctl_init)

#endif /* CONFIG_SYSCTL */