/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>

#include "internal.h"

#include <asm/irq_regs.h>

static struct workqueue_struct *perf_wq;

typedef int (*remote_function_f)(void *);

struct remote_function_call {
	struct task_struct	*p;
	remote_function_f	func;
	void			*info;
	int			ret;
};

static void remote_function(void *data)
{
	struct remote_function_call *tfc = data;
	struct task_struct *p = tfc->p;

	if (p) {
		tfc->ret = -EAGAIN;
		if (task_cpu(p) != smp_processor_id() || !task_curr(p))
			return;
	}

	tfc->ret = tfc->func(tfc->info);
}

/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, which just calls the function directly.
 *
 * returns: @func return value, or
 *	    -ESRCH  - when the process isn't running
 *	    -EAGAIN - when the process moved away
 */
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= p,
		.func	= func,
		.info	= info,
		.ret	= -ESRCH, /* No such (running) process */
	};

	if (task_curr(p))
		smp_call_function_single(task_cpu(p), remote_function, &data, 1);

	return data.ret;
}
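
/*
 * Note that -EAGAIN here means the task was migrated or descheduled
 * between the task_curr() check and remote_function() actually running
 * on the remote CPU; callers typically re-check their state and retry
 * rather than treat this as a hard failure.
 */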

/**
 * cpu_function_call - call a function on the cpu
 * @cpu:	the cpu on which to run the function
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func on the remote cpu.
 *
 * returns: @func return value or -ENXIO when the cpu is offline
 */
static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= NULL,
		.func	= func,
		.info	= info,
		.ret	= -ENXIO, /* No such CPU */
	};

	smp_call_function_single(cpu, remote_function, &data, 1);

	return data.ret;
}

#define EVENT_OWNER_KERNEL ((void *) -1)

static bool is_kernel_event(struct perf_event *event)
{
	return event->owner == EVENT_OWNER_KERNEL;
}

#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
		       PERF_FLAG_FD_OUTPUT  |\
		       PERF_FLAG_PID_CGROUP |\
		       PERF_FLAG_FD_CLOEXEC)

/*
 * branch priv levels that need permission checks
 */
#define PERF_SAMPLE_BRANCH_PERM_PLM \
	(PERF_SAMPLE_BRANCH_KERNEL |\
	 PERF_SAMPLE_BRANCH_HV)

enum event_type_t {
	EVENT_FLEXIBLE = 0x1,
	EVENT_PINNED = 0x2,
	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

/*
 * perf_sched_events : >0 events exist
 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 */
struct static_key_deferred perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);

static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;

static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 1;

/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */

/*
 * max perf event sample rate
 */
#define DEFAULT_MAX_SAMPLE_RATE		100000
#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT	25

int sysctl_perf_event_sample_rate __read_mostly	= DEFAULT_MAX_SAMPLE_RATE;

static int max_samples_per_tick __read_mostly	= DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly	= DEFAULT_SAMPLE_PERIOD_NS;

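/*
 * perf_sample_allowed_ns is roughly how much time the NMI handler may
 * consume per sample before the throttling below kicks in.  With the
 * defaults above: a 100000Hz max rate gives a 10000ns sample period,
 * and the 25% CPU limit allows 10000 * 25 / 100 == 2500ns per sample.
 */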
static int perf_sample_allowed_ns __read_mostly =
	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;

static void update_perf_cpu_limits(void)
{
	u64 tmp = perf_sample_period_ns;

	tmp *= sysctl_perf_cpu_time_max_percent;
	do_div(tmp, 100);
	ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
}

static int perf_rotate_context(struct perf_cpu_context *cpuctx);

int perf_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
	update_perf_cpu_limits();

	return 0;
}

int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;

int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
				void __user *buffer, size_t *lenp,
				loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	update_perf_cpu_limits();

	return 0;
}
/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
#define NR_ACCUMULATED_SAMPLES 128
static DEFINE_PER_CPU(u64, running_sample_length);

static void perf_duration_warn(struct irq_work *w)
{
	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
	u64 avg_local_sample_len;
	u64 local_samples_len;

	local_samples_len = __this_cpu_read(running_sample_length);
	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;

	printk_ratelimited(KERN_WARNING
			"perf interrupt took too long (%lld > %lld), lowering "
			"kernel.perf_event_max_sample_rate to %d\n",
			avg_local_sample_len, allowed_ns >> 1,
			sysctl_perf_event_sample_rate);
}

static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);

void perf_sample_event_took(u64 sample_len_ns)
{
	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
	u64 avg_local_sample_len;
	u64 local_samples_len;

	if (allowed_ns == 0)
		return;

	/* decay the counter by 1 average sample */
	local_samples_len = __this_cpu_read(running_sample_length);
	local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
	local_samples_len += sample_len_ns;
	__this_cpu_write(running_sample_length, local_samples_len);
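	/*
	 * The decay above behaves roughly like an exponential moving
	 * average over the last NR_ACCUMULATED_SAMPLES samples: each new
	 * sample displaces about 1/128th of the accumulated total.
	 */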

	/*
	 * note: this will be biased artificially low until we have
	 * seen NR_ACCUMULATED_SAMPLES.  Doing it this way keeps us
	 * from having to maintain a count.
	 */
	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;

	if (avg_local_sample_len <= allowed_ns)
		return;

	if (max_samples_per_tick <= 1)
		return;

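	/*
	 * Roughly halve the sample rate: e.g. with HZ == 1000 and the
	 * default 100000Hz rate, max_samples_per_tick goes 100 -> 50 -> 25
	 * and kernel.perf_event_max_sample_rate follows as 50000, 25000, ...
	 */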
	max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
	sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;

	update_perf_cpu_limits();
	if (!irq_work_queue(&perf_duration_work)) {
		early_printk("perf interrupt took too long (%lld > %lld), lowering "
			     "kernel.perf_event_max_sample_rate to %d\n",
			     avg_local_sample_len, allowed_ns >> 1,
			     sysctl_perf_event_sample_rate);
	}
}

static atomic64_t perf_event_id;

static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
			      enum event_type_t event_type);

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
			     enum event_type_t event_type,
			     struct task_struct *task);

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);

void __weak perf_event_print_debug(void)	{ }

extern __weak const char *perf_pmu_name(void)
{
	return "pmu";
}

static inline u64 perf_clock(void)
{
	return local_clock();
}

static inline u64 perf_event_clock(struct perf_event *event)
{
	return event->clock();
}

static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
			  struct perf_event_context *ctx)
{
	raw_spin_lock(&cpuctx->ctx.lock);
	if (ctx)
		raw_spin_lock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
			    struct perf_event_context *ctx)
{
	if (ctx)
		raw_spin_unlock(&ctx->lock);
	raw_spin_unlock(&cpuctx->ctx.lock);
}

#ifdef CONFIG_CGROUP_PERF

static inline bool
perf_cgroup_match(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	/* @event doesn't care about cgroup */
	if (!event->cgrp)
		return true;

	/* wants specific cgroup scope but @cpuctx isn't associated with any */
	if (!cpuctx->cgrp)
		return false;

	/*
	 * Cgroup scoping is recursive.  An event enabled for a cgroup is
	 * also enabled for all its descendant cgroups.  If @cpuctx's
	 * cgroup is a descendant of @event's (the test covers identity
	 * case), it's a match.
	 */
	return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
				    event->cgrp->css.cgroup);
}

static inline void perf_detach_cgroup(struct perf_event *event)
{
	css_put(&event->cgrp->css);
	event->cgrp = NULL;
}

static inline int is_cgroup_event(struct perf_event *event)
{
	return event->cgrp != NULL;
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
	struct perf_cgroup_info *t;

	t = per_cpu_ptr(event->cgrp->info, event->cpu);
	return t->time;
}

static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
{
	struct perf_cgroup_info *info;
	u64 now;

	now = perf_clock();

	info = this_cpu_ptr(cgrp->info);

	info->time += now - info->timestamp;
	info->timestamp = now;
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
	struct perf_cgroup *cgrp = cpuctx->cgrp;
	struct cgroup_subsys_state *css;

	if (cgrp) {
		for (css = &cgrp->css; css; css = css->parent) {
			cgrp = container_of(css, struct perf_cgroup, css);
			__update_cgrp_time(cgrp);
		}
	}
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
	struct perf_cgroup *cgrp;

	/*
	 * ensure we access cgroup data only when needed and
	 * when we know the cgroup is pinned (css_get)
	 */
	if (!is_cgroup_event(event))
		return;

	cgrp = perf_cgroup_from_task(current, event->ctx);
	/*
	 * Do not update time when cgroup is not active
	 */
	if (cgrp == event->cgrp)
		__update_cgrp_time(event->cgrp);
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
			  struct perf_event_context *ctx)
{
	struct perf_cgroup *cgrp;
	struct perf_cgroup_info *info;
	struct cgroup_subsys_state *css;

	/*
	 * ctx->lock held by caller
	 * ensure we do not access cgroup data
	 * unless we have the cgroup pinned (css_get)
	 */
	if (!task || !ctx->nr_cgroups)
		return;

	cgrp = perf_cgroup_from_task(task, ctx);

	for (css = &cgrp->css; css; css = css->parent) {
		cgrp = container_of(css, struct perf_cgroup, css);
		info = this_cpu_ptr(cgrp->info);
		info->timestamp = ctx->timestamp;
	}
}

#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */

/*
 * reschedule events based on the cgroup constraint of task.
 *
 * mode SWOUT : schedule out everything
 * mode SWIN : schedule in based on cgroup for next
 */
static void perf_cgroup_switch(struct task_struct *task, int mode)
{
	struct perf_cpu_context *cpuctx;
	struct pmu *pmu;
	unsigned long flags;

	/*
	 * disable interrupts to avoid getting nr_cgroup
	 * changes via __perf_event_disable(). Also
	 * avoids preemption.
	 */
	local_irq_save(flags);

	/*
	 * we reschedule only in the presence of cgroup
	 * constrained events.
	 */

	list_for_each_entry_rcu(pmu, &pmus, entry) {
		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
		if (cpuctx->unique_pmu != pmu)
			continue; /* ensure we process each cpuctx once */

		/*
		 * perf_cgroup_events says at least one
		 * context on this CPU has cgroup events.
		 *
		 * ctx->nr_cgroups reports the number of cgroup
		 * events for a context.
		 */
		if (cpuctx->ctx.nr_cgroups > 0) {
			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
			perf_pmu_disable(cpuctx->ctx.pmu);

			if (mode & PERF_CGROUP_SWOUT) {
				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
				/*
				 * must not be done before ctxswout due
				 * to event_filter_match() in event_sched_out()
				 */
				cpuctx->cgrp = NULL;
			}

			if (mode & PERF_CGROUP_SWIN) {
				WARN_ON_ONCE(cpuctx->cgrp);
				/*
				 * set cgrp before ctxsw in to allow
				 * event_filter_match() to not have to pass
				 * task around
				 * we pass the cpuctx->ctx to perf_cgroup_from_task()
				 * because cgroup events are only per-cpu
				 */
				cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
			}
			perf_pmu_enable(cpuctx->ctx.pmu);
			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
		}
	}

	local_irq_restore(flags);
}

static inline void perf_cgroup_sched_out(struct task_struct *task,
					 struct task_struct *next)
{
	struct perf_cgroup *cgrp1;
	struct perf_cgroup *cgrp2 = NULL;

	rcu_read_lock();
	/*
	 * we come here when we know perf_cgroup_events > 0
	 * we do not need to pass the ctx here because we know
	 * we are holding the rcu lock
	 */
	cgrp1 = perf_cgroup_from_task(task, NULL);

	/*
	 * next is NULL when called from perf_event_enable_on_exec()
	 * that will systematically cause a cgroup_switch()
	 */
	if (next)
		cgrp2 = perf_cgroup_from_task(next, NULL);

	/*
	 * only schedule out current cgroup events if we know
	 * that we are switching to a different cgroup. Otherwise,
	 * do no touch the cgroup events.
	 */
	if (cgrp1 != cgrp2)
		perf_cgroup_switch(task, PERF_CGROUP_SWOUT);

	rcu_read_unlock();
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
					struct task_struct *task)
{
	struct perf_cgroup *cgrp1;
	struct perf_cgroup *cgrp2 = NULL;

	rcu_read_lock();
	/*
	 * we come here when we know perf_cgroup_events > 0
	 * we do not need to pass the ctx here because we know
	 * we are holding the rcu lock
	 */
	cgrp1 = perf_cgroup_from_task(task, NULL);

	/* prev can never be NULL */
	cgrp2 = perf_cgroup_from_task(prev, NULL);

	/*
	 * only need to schedule in cgroup events if we are changing
	 * cgroup during ctxsw. Cgroup events were not scheduled
	 * out of ctxsw out if that was not the case.
	 */
	if (cgrp1 != cgrp2)
		perf_cgroup_switch(task, PERF_CGROUP_SWIN);

	rcu_read_unlock();
}

static inline int perf_cgroup_connect(int fd, struct perf_event *event,
				      struct perf_event_attr *attr,
				      struct perf_event *group_leader)
{
	struct perf_cgroup *cgrp;
	struct cgroup_subsys_state *css;
	struct fd f = fdget(fd);
	int ret = 0;

	if (!f.file)
		return -EBADF;

	css = css_tryget_online_from_dir(f.file->f_path.dentry,
					 &perf_event_cgrp_subsys);
	if (IS_ERR(css)) {
		ret = PTR_ERR(css);
		goto out;
	}

	cgrp = container_of(css, struct perf_cgroup, css);
	event->cgrp = cgrp;

	/*
	 * all events in a group must monitor
	 * the same cgroup because a task belongs
	 * to only one perf cgroup at a time
	 */
	if (group_leader && group_leader->cgrp != cgrp) {
		perf_detach_cgroup(event);
		ret = -EINVAL;
	}
out:
	fdput(f);
	return ret;
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
	struct perf_cgroup_info *t;
	t = per_cpu_ptr(event->cgrp->info, event->cpu);
	event->shadow_ctx_time = now - t->timestamp;
}

static inline void
perf_cgroup_defer_enabled(struct perf_event *event)
{
	/*
	 * when the current task's perf cgroup does not match
	 * the event's, we need to remember to call the
	 * perf_mark_enable() function the first time a task with
	 * a matching perf cgroup is scheduled in.
	 */
	if (is_cgroup_event(event) && !perf_cgroup_match(event))
		event->cgrp_defer_enabled = 1;
}

static inline void
perf_cgroup_mark_enabled(struct perf_event *event,
			 struct perf_event_context *ctx)
{
	struct perf_event *sub;
	u64 tstamp = perf_event_time(event);

	if (!event->cgrp_defer_enabled)
		return;

	event->cgrp_defer_enabled = 0;

	event->tstamp_enabled = tstamp - event->total_time_enabled;
	list_for_each_entry(sub, &event->sibling_list, group_entry) {
		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
			sub->cgrp_defer_enabled = 0;
		}
	}
}
#else /* !CONFIG_CGROUP_PERF */

static inline bool
perf_cgroup_match(struct perf_event *event)
{
	return true;
}

static inline void perf_detach_cgroup(struct perf_event *event)
{}

static inline int is_cgroup_event(struct perf_event *event)
{
	return 0;
}

static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
{
	return 0;
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
}

static inline void perf_cgroup_sched_out(struct task_struct *task,
					 struct task_struct *next)
{
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
					struct task_struct *task)
{
}

static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
				      struct perf_event_attr *attr,
				      struct perf_event *group_leader)
{
	return -EINVAL;
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
			  struct perf_event_context *ctx)
{
}

void
perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
	return 0;
}

static inline void
perf_cgroup_defer_enabled(struct perf_event *event)
{
}

static inline void
perf_cgroup_mark_enabled(struct perf_event *event,
			 struct perf_event_context *ctx)
{
}
#endif

/*
 * set default to be dependent on timer tick just
 * like original code
 */
#define PERF_CPU_HRTIMER (1000 / HZ)
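/*
 * The default multiplexing interval is one tick's worth of milliseconds:
 * e.g. 1ms with HZ == 1000, or 4ms with HZ == 250.
 */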
/*
 * function must be called with interrupts disabled
 */
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
	struct perf_cpu_context *cpuctx;
	int rotations = 0;

	WARN_ON(!irqs_disabled());

	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
	rotations = perf_rotate_context(cpuctx);

	raw_spin_lock(&cpuctx->hrtimer_lock);
	if (rotations)
		hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
	else
		cpuctx->hrtimer_active = 0;
	raw_spin_unlock(&cpuctx->hrtimer_lock);

	return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}

static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
{
	struct hrtimer *timer = &cpuctx->hrtimer;
	struct pmu *pmu = cpuctx->ctx.pmu;
	u64 interval;

	/* no multiplexing needed for SW PMU */
	if (pmu->task_ctx_nr == perf_sw_context)
		return;

	/*
	 * check default is sane, if not set then force to
	 * default interval (1/tick)
	 */
	interval = pmu->hrtimer_interval_ms;
	if (interval < 1)
		interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;

	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);

	raw_spin_lock_init(&cpuctx->hrtimer_lock);
	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
	timer->function = perf_mux_hrtimer_handler;
}

static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
{
	struct hrtimer *timer = &cpuctx->hrtimer;
	struct pmu *pmu = cpuctx->ctx.pmu;
	unsigned long flags;

	/* not for SW PMU */
	if (pmu->task_ctx_nr == perf_sw_context)
		return 0;

	raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
	if (!cpuctx->hrtimer_active) {
		cpuctx->hrtimer_active = 1;
		hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
	}
	raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);

	return 0;
}

void perf_pmu_disable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!(*count)++)
		pmu->pmu_disable(pmu);
}

void perf_pmu_enable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!--(*count))
		pmu->pmu_enable(pmu);
}

static DEFINE_PER_CPU(struct list_head, active_ctx_list);

/*
 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
 * perf_event_task_tick() are fully serialized because they're strictly cpu
 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
 * disabled, while perf_event_task_tick is called from IRQ context.
 */
static void perf_event_ctx_activate(struct perf_event_context *ctx)
{
	struct list_head *head = this_cpu_ptr(&active_ctx_list);

	WARN_ON(!irqs_disabled());

	WARN_ON(!list_empty(&ctx->active_ctx_list));

	list_add(&ctx->active_ctx_list, head);
}

static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
{
	WARN_ON(!irqs_disabled());

	WARN_ON(list_empty(&ctx->active_ctx_list));

	list_del_init(&ctx->active_ctx_list);
}

static void get_ctx(struct perf_event_context *ctx)
{
	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}

static void free_ctx(struct rcu_head *head)
{
	struct perf_event_context *ctx;

	ctx = container_of(head, struct perf_event_context, rcu_head);
	kfree(ctx->task_ctx_data);
	kfree(ctx);
}

static void put_ctx(struct perf_event_context *ctx)
{
	if (atomic_dec_and_test(&ctx->refcount)) {
		if (ctx->parent_ctx)
			put_ctx(ctx->parent_ctx);
		if (ctx->task)
			put_task_struct(ctx->task);
		call_rcu(&ctx->rcu_head, free_ctx);
	}
}

/*
 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
 * perf_pmu_migrate_context() we need some magic.
 *
 * Those places that change perf_event::ctx will hold both
 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
 *
 * Lock ordering is by mutex address. There are two other sites where
 * perf_event_context::mutex nests and those are:
 *
 *  - perf_event_exit_task_context()	[ child , 0 ]
 *      __perf_event_exit_task()
 *        sync_child_event()
 *          put_event()			[ parent, 1 ]
 *
 *  - perf_event_init_context()		[ parent, 0 ]
 *      inherit_task_group()
 *        inherit_group()
 *          inherit_event()
 *            perf_event_alloc()
 *              perf_init_event()
 *                perf_try_init_event()	[ child , 1 ]
 *
 * While it appears there is an obvious deadlock here -- the parent and child
 * nesting levels are inverted between the two. This is in fact safe because
 * life-time rules separate them. That is an exiting task cannot fork, and a
 * spawning task cannot (yet) exit.
 *
 * But remember that these are parent<->child context relations, and
 * migration does not affect children, therefore these two orderings should not
 * interact.
 *
 * The change in perf_event::ctx does not affect children (as claimed above)
 * because the sys_perf_event_open() case will install a new event and break
 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
 * concerned with cpuctx and that doesn't have children.
 *
 * The places that change perf_event::ctx will issue:
 *
 *   perf_remove_from_context();
 *   synchronize_rcu();
 *   perf_install_in_context();
 *
 * to affect the change. The remove_from_context() + synchronize_rcu() should
 * quiesce the event, after which we can install it in the new location. This
 * means that only external vectors (perf_fops, prctl) can perturb the event
 * while in transit. Therefore all such accessors should also acquire
 * perf_event_context::mutex to serialize against this.
 *
 * However; because event->ctx can change while we're waiting to acquire
 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
 * function.
 *
 * Lock order:
 *    cred_guard_mutex
 *	task_struct::perf_event_mutex
 *	  perf_event_context::mutex
 *	    perf_event_context::lock
 *	    perf_event::child_mutex;
 *	    perf_event::mmap_mutex
 *	    mmap_sem
 */
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
{
	struct perf_event_context *ctx;

again:
	rcu_read_lock();
	ctx = ACCESS_ONCE(event->ctx);
	if (!atomic_inc_not_zero(&ctx->refcount)) {
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();

	mutex_lock_nested(&ctx->mutex, nesting);
	if (event->ctx != ctx) {
		mutex_unlock(&ctx->mutex);
		put_ctx(ctx);
		goto again;
	}

	return ctx;
}

static inline struct perf_event_context *
perf_event_ctx_lock(struct perf_event *event)
{
	return perf_event_ctx_lock_nested(event, 0);
}

static void perf_event_ctx_unlock(struct perf_event *event,
				  struct perf_event_context *ctx)
{
	mutex_unlock(&ctx->mutex);
	put_ctx(ctx);
}

/*
 * This must be done under the ctx->lock, such as to serialize against
 * context_equiv(), therefore we cannot call put_ctx() since that might end up
 * calling scheduler related locks and ctx->lock nests inside those.
 */
static __must_check struct perf_event_context *
unclone_ctx(struct perf_event_context *ctx)
{
	struct perf_event_context *parent_ctx = ctx->parent_ctx;

	lockdep_assert_held(&ctx->lock);

	if (parent_ctx)
		ctx->parent_ctx = NULL;
	ctx->generation++;

	return parent_ctx;
}

static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
	/*
	 * only top level events have the pid namespace they were created in
	 */
	if (event->parent)
		event = event->parent;

	return task_tgid_nr_ns(p, event->ns);
}

static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
{
	/*
	 * only top level events have the pid namespace they were created in
	 */
	if (event->parent)
		event = event->parent;

	return task_pid_nr_ns(p, event->ns);
}

/*
 * If we inherit events we want to return the parent event id
 * to userspace.
 */
static u64 primary_event_id(struct perf_event *event)
{
	u64 id = event->id;

	if (event->parent)
		id = event->parent->id;

	return id;
}

/*
 * Get the perf_event_context for a task and lock it.
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
	struct perf_event_context *ctx;

retry:
	/*
	 * One of the few rules of preemptible RCU is that one cannot do
	 * rcu_read_unlock() while holding a scheduler (or nested) lock when
	 * part of the read side critical section was irqs-enabled -- see
	 * rcu_read_unlock_special().
	 *
	 * Since ctx->lock nests under rq->lock we must ensure the entire read
	 * side critical section has interrupts disabled.
	 */
	local_irq_save(*flags);
	rcu_read_lock();
	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
	if (ctx) {
		/*
		 * If this context is a clone of another, it might
		 * get swapped for another underneath us by
		 * perf_event_task_sched_out, though the
		 * rcu_read_lock() protects us from any context
		 * getting freed.  Lock the context and check if it
		 * got swapped before we could get the lock, and retry
		 * if so.  If we locked the right context, then it
		 * can't get swapped on us any more.
		 */
		raw_spin_lock(&ctx->lock);
		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
			raw_spin_unlock(&ctx->lock);
			rcu_read_unlock();
			local_irq_restore(*flags);
			goto retry;
		}

		if (!atomic_inc_not_zero(&ctx->refcount)) {
			raw_spin_unlock(&ctx->lock);
			ctx = NULL;
		}
	}
	rcu_read_unlock();
	if (!ctx)
		local_irq_restore(*flags);
	return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *
perf_pin_task_context(struct task_struct *task, int ctxn)
{
	struct perf_event_context *ctx;
	unsigned long flags;

	ctx = perf_lock_task_context(task, ctxn, &flags);
	if (ctx) {
		++ctx->pin_count;
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
	}
	return ctx;
}

static void perf_unpin_context(struct perf_event_context *ctx)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&ctx->lock, flags);
	--ctx->pin_count;
	raw_spin_unlock_irqrestore(&ctx->lock, flags);
}

/*
 * Update the record of the current time in a context.
 */
static void update_context_time(struct perf_event_context *ctx)
{
	u64 now = perf_clock();

	ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;
}

static u64 perf_event_time(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;

	if (is_cgroup_event(event))
		return perf_cgroup_event_time(event);

	return ctx ? ctx->time : 0;
}

/*
 * Update the total_time_enabled and total_time_running fields for an event.
 * The caller of this function needs to hold the ctx->lock.
 */
static void update_event_times(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	u64 run_end;

	if (event->state < PERF_EVENT_STATE_INACTIVE ||
	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
		return;