trace_syscalls.c 16.3 KB
Newer Older
1
#include <trace/syscall.h>
2
#include <trace/events/syscalls.h>
3
#include <linux/slab.h>
4
#include <linux/kernel.h>
5
#include <linux/ftrace.h>
6
#include <linux/perf_event.h>
7 8 9 10 11
#include <asm/syscall.h>

#include "trace_output.h"
#include "trace.h"

12
static DEFINE_MUTEX(syscall_trace_lock);
13 14
static int sys_refcount_enter;
static int sys_refcount_exit;
15 16
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
17

18 19 20 21 22
static int syscall_enter_register(struct ftrace_event_call *event,
				 enum trace_reg type);
static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type);

23 24 25 26 27 28 29 30 31 32 33
static int syscall_enter_define_fields(struct ftrace_event_call *call);
static int syscall_exit_define_fields(struct ftrace_event_call *call);

/*
 * Class ->get_fields() hook: each syscall-entry event keeps its own
 * field list inside its syscall metadata, so fetch it from there.
 */
static struct list_head *
syscall_get_enter_fields(struct ftrace_event_call *call)
{
	struct syscall_metadata *meta = call->data;

	return &meta->enter_fields;
}

34
/* Output callbacks for raw syscall-entry events: text formatting only. */
struct trace_event_functions enter_syscall_print_funcs = {
	.trace		= print_syscall_enter,
};

/* Output callbacks for raw syscall-exit events: text formatting only. */
struct trace_event_functions exit_syscall_print_funcs = {
	.trace		= print_syscall_exit,
};

42
/*
 * Event class shared by all syscall-entry events.  Entry events have
 * per-syscall field lists (argument names differ), hence the
 * ->get_fields() indirection instead of a shared static list.
 */
struct ftrace_event_class event_class_syscall_enter = {
	.system		= "syscalls",
	.reg		= syscall_enter_register,
	.define_fields	= syscall_enter_define_fields,
	.get_fields	= syscall_get_enter_fields,
	.raw_init	= init_syscall_trace,
};

/*
 * Event class shared by all syscall-exit events.  Every exit event has
 * the same fields (nr, ret), so one static field list is shared here.
 */
struct ftrace_event_class event_class_syscall_exit = {
	.system		= "syscalls",
	.reg		= syscall_exit_register,
	.define_fields	= syscall_exit_define_fields,
	.fields		= LIST_HEAD_INIT(event_class_syscall_exit.fields),
	.raw_init	= init_syscall_trace,
};

58 59
extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];
60 61 62

static struct syscall_metadata **syscalls_metadata;

63 64
/*
 * Find the compile-time metadata record for the syscall handler at
 * address @syscall: resolve the address to a symbol name and compare it
 * against every record in the __syscalls_metadata linker section.
 * Returns NULL when no record matches.
 */
static __init struct syscall_metadata *
find_syscall_meta(unsigned long syscall)
{
	struct syscall_metadata **start;
	struct syscall_metadata **stop;
	char str[KSYM_SYMBOL_LEN];

	/* Section bounds come from the linker script. */
	start = __start_syscalls_metadata;
	stop = __stop_syscalls_metadata;
	kallsyms_lookup(syscall, NULL, NULL, NULL, str);

	for ( ; start < stop; start++) {
		/*
		 * Only compare after the "sys" prefix. Archs that use
		 * syscall wrappers may have syscalls symbols aliases prefixed
		 * with "SyS" instead of "sys", leading to an unwanted
		 * mismatch.
		 */
		if ((*start)->name && !strcmp((*start)->name + 3, str + 3))
			return *start;
	}
	return NULL;
}

/*
 * Map a syscall number to its metadata record.  Returns NULL when the
 * lookup table is not initialized or @nr is out of range.
 */
static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
	if (syscalls_metadata && nr >= 0 && nr < NR_syscalls)
		return syscalls_metadata[nr];

	return NULL;
}

96
/*
 * Text formatter for a raw syscall-entry record: emits
 * "name(arg: val, ...)" (with types when TRACE_ITER_VERBOSE is set)
 * into the iterator's trace seq.  Used via enter_syscall_print_funcs.
 */
enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
		    struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, ret, syscall;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	/* Unknown syscall: just terminate the line. */
	if (!entry)
		goto end;

	/* The entry's event type must match the metadata found by nr. */
	if (entry->enter_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		goto end;
	}

	ret = trace_seq_printf(s, "%s(", entry->name);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	for (i = 0; i < entry->nb_args; i++) {
		/* parameter types */
		if (trace_flags & TRACE_ITER_VERBOSE) {
			ret = trace_seq_printf(s, "%s ", entry->types[i]);
			if (!ret)
				return TRACE_TYPE_PARTIAL_LINE;
		}
		/* parameter values */
		ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
				       trace->args[i],
				       i == entry->nb_args - 1 ? "" : ", ");
		if (!ret)
			return TRACE_TYPE_PARTIAL_LINE;
	}

	ret = trace_seq_putc(s, ')');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

end:
	ret =  trace_seq_putc(s, '\n');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

/*
 * Text formatter for a raw syscall-exit record: emits
 * "name -> 0x<ret>" into the iterator's trace seq.  Used via
 * exit_syscall_print_funcs.
 */
enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags,
		   struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_exit *trace;
	int syscall;
	struct syscall_metadata *entry;
	int ret;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	/* Unknown syscall: emit an empty line and move on. */
	if (!entry) {
		trace_seq_printf(s, "\n");
		return TRACE_TYPE_HANDLED;
	}

	/* The entry's event type must match the metadata found by nr. */
	if (entry->exit_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		return TRACE_TYPE_UNHANDLED;
	}

	ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
				trace->ret);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

182 183 184 185 186
/* Never defined anywhere: referencing it forces a link-time error. */
extern char *__bad_type_size(void);

/*
 * Expand to the (type-name, field-name, offset, size, signedness)
 * argument list that trace_define_field() expects, for field @name of
 * the local record variable "trace".  The sizeof comparison triggers a
 * link error via __bad_type_size() if @type does not match the real
 * field type, catching mismatches at build time.
 */
#define SYSCALL_FIELD(type, name)					\
	sizeof(type) != sizeof(trace.name) ?				\
		__bad_type_size() :					\
		#type, #name, offsetof(typeof(trace), name),		\
		sizeof(trace.name), is_signed_type(type)
189

190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250
/*
 * Write the print_fmt string for a syscall-entry event into @buf:
 * a quoted format ("arg0: 0x%08lx, ...") followed by one REC->arg
 * accessor per argument.  Returns the length that would be written,
 * so a first call with len == 0 sizes the buffer (snprintf-style).
 */
static
int  __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	int i;
	int pos = 0;

	/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
				entry->args[i], sizeof(unsigned long),
				i == entry->nb_args - 1 ? "" : ", ");
	}
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))", entry->args[i]);
	}

#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return pos;
}

/*
 * Build and attach @call->print_fmt.  Exit events share a static
 * literal (never freed); entry events get a kmalloc'ed per-syscall
 * format built by __set_enter_print_fmt().  Returns 0 or -ENOMEM.
 */
static int set_syscall_print_fmt(struct ftrace_event_call *call)
{
	char *print_fmt;
	int len;
	struct syscall_metadata *entry = call->data;

	/* An exit event is any call that is not the metadata's enter event. */
	if (entry->enter_event != call) {
		call->print_fmt = "\"0x%lx\", REC->ret";
		return 0;
	}

	/* First: called with 0 length to calculate the needed length */
	len = __set_enter_print_fmt(entry, NULL, 0);

	print_fmt = kmalloc(len + 1, GFP_KERNEL);
	if (!print_fmt)
		return -ENOMEM;

	/* Second: actually write the @print_fmt */
	__set_enter_print_fmt(entry, print_fmt, len + 1);
	call->print_fmt = print_fmt;

	return 0;
}

/*
 * Undo set_syscall_print_fmt().  Exit events point at a static string
 * literal, so only entry events (which got a kmalloc'ed buffer) free
 * their print_fmt.
 */
static void free_syscall_print_fmt(struct ftrace_event_call *call)
{
	struct syscall_metadata *meta = call->data;

	if (meta->enter_event != call)
		return;

	kfree(call->print_fmt);
}

251
/*
 * Describe the fields of a syscall-entry record to the event/filter
 * core: the syscall nr followed by one unsigned long per argument.
 */
static int syscall_enter_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_enter trace;
	struct syscall_metadata *meta = call->data;
	int ret;
	int i;
	int offset = offsetof(typeof(trace), args);

	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

	/* Arguments are stored back-to-back as unsigned longs. */
	for (i = 0; i < meta->nb_args; i++) {
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
		offset += sizeof(unsigned long);
	}

	return ret;
}

274
/*
 * Describe the fields of a syscall-exit record to the event/filter
 * core: the syscall nr and the return value.
 */
static int syscall_exit_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_exit trace;
	int ret;

	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

	ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
				 FILTER_OTHER);

	return ret;
}

289
/*
 * sys_enter tracepoint probe for ftrace: record one syscall-entry
 * event (nr + raw args) into the current trace ring buffer, gated by
 * the per-syscall enable bitmap and the event filter.
 */
void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int size;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	/* Negative nr means this was not a real syscall entry. */
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* Record is variable-sized: header plus one long per argument. */
	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->enter_event->event.type, size, 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);

	/* Commit unless the event filter discards this record. */
	if (!filter_current_check_discard(buffer, sys_data->enter_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

324
/*
 * sys_exit tracepoint probe for ftrace: record one syscall-exit event
 * (nr + return value) into the current trace ring buffer, gated by the
 * per-syscall enable bitmap and the event filter.
 */
void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	/* Negative nr means this was not a real syscall. */
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	/* Commit unless the event filter discards this record. */
	if (!filter_current_check_discard(buffer, sys_data->exit_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

356
/*
 * Enable ftrace tracing of @call's syscall entry.  The single shared
 * sys_enter tracepoint probe is registered on the first user and
 * refcounted under syscall_trace_lock.  Returns 0 or a negative errno.
 */
int reg_event_syscall_enter(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_enter)
		ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
	if (!ret) {
		set_bit(num, enabled_enter_syscalls);
		sys_refcount_enter++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

375
/*
 * Disable ftrace tracing of @call's syscall entry; the shared probe is
 * unregistered when the last user goes away.
 */
void unreg_event_syscall_enter(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_enter--;
	clear_bit(num, enabled_enter_syscalls);
	if (!sys_refcount_enter)
		unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
	mutex_unlock(&syscall_trace_lock);
}
389

390
/*
 * Enable ftrace tracing of @call's syscall exit.  Mirrors
 * reg_event_syscall_enter(): shared probe, refcounted under
 * syscall_trace_lock.  Returns 0 or a negative errno.
 */
int reg_event_syscall_exit(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_exit)
		ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
	if (!ret) {
		set_bit(num, enabled_exit_syscalls);
		sys_refcount_exit++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}
408

409
/*
 * Disable ftrace tracing of @call's syscall exit; the shared probe is
 * unregistered when the last user goes away.
 */
void unreg_event_syscall_exit(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_exit--;
	clear_bit(num, enabled_exit_syscalls);
	if (!sys_refcount_exit)
		unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
	mutex_unlock(&syscall_trace_lock);
}
423

424 425 426
int init_syscall_trace(struct ftrace_event_call *call)
{
	int id;
427 428 429 430 431 432 433 434
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls) {
		pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
				((struct syscall_metadata *)call->data)->name);
		return -ENOSYS;
	}
435

436 437 438
	if (set_syscall_print_fmt(call) < 0)
		return -ENOMEM;

439 440 441
	id = trace_event_raw_init(call);

	if (id < 0) {
442
		free_syscall_print_fmt(call);
443
		return id;
444
	}
445 446

	return id;
447 448
}

449 450 451 452 453
/* Return the handler address for syscall @nr from the syscall table. */
unsigned long __init arch_syscall_addr(int nr)
{
	return (unsigned long)sys_call_table[nr];
}

454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469
/*
 * Early init: build the nr -> metadata lookup table by resolving each
 * syscall table slot to its metadata record.  Slots with no matching
 * metadata simply stay NULL (their events are later disabled by
 * init_syscall_trace()).
 */
int __init init_ftrace_syscalls(void)
{
	struct syscall_metadata *meta;
	unsigned long addr;
	int i;

	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
					NR_syscalls, GFP_KERNEL);
	if (!syscalls_metadata) {
		WARN_ON(1);
		return -ENOMEM;
	}

	for (i = 0; i < NR_syscalls; i++) {
		addr = arch_syscall_addr(i);
		meta = find_syscall_meta(addr);
		if (!meta)
			continue;

		meta->syscall_nr = i;
		syscalls_metadata[i] = meta;
	}

	return 0;
}
core_initcall(init_ftrace_syscalls);

481
#ifdef CONFIG_PERF_EVENTS
482

483 484 485 486
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
487

488
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
489 490
{
	struct syscall_metadata *sys_data;
491
	struct syscall_trace_enter *rec;
492
	struct hlist_head *head;
493
	int syscall_nr;
494
	int rctx;
495
	int size;
496 497

	syscall_nr = syscall_get_nr(current, regs);
498
	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
499 500 501 502 503 504
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

505 506 507 508 509
	/* get the size after alignment with the u32 buffer size field */
	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
	size = ALIGN(size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

510 511
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		      "perf buffer not large enough"))
512 513
		return;

514
	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
515
				sys_data->enter_event->event.type, regs, &rctx);
516 517
	if (!rec)
		return;
518 519 520 521

	rec->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
			       (unsigned long *)&rec->args);
522

523
	head = this_cpu_ptr(sys_data->enter_event->perf_events);
524
	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
525 526
}

527
int perf_sysenter_enable(struct ftrace_event_call *call)
528 529 530 531
{
	int ret = 0;
	int num;

532
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
533 534

	mutex_lock(&syscall_trace_lock);
535
	if (!sys_perf_refcount_enter)
536
		ret = register_trace_sys_enter(perf_syscall_enter, NULL);
537 538 539 540
	if (ret) {
		pr_info("event trace: Could not activate"
				"syscall entry trace point");
	} else {
541 542
		set_bit(num, enabled_perf_enter_syscalls);
		sys_perf_refcount_enter++;
543 544 545 546 547
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

548
/*
 * Disable perf tracing of @call's syscall entry; the shared probe is
 * unregistered when the last user goes away.
 */
void perf_sysenter_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_perf_refcount_enter--;
	clear_bit(num, enabled_perf_enter_syscalls);
	if (!sys_perf_refcount_enter)
		unregister_trace_sys_enter(perf_syscall_enter, NULL);
	mutex_unlock(&syscall_trace_lock);
}

562
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
563 564
{
	struct syscall_metadata *sys_data;
565
	struct syscall_trace_exit *rec;
566
	struct hlist_head *head;
567
	int syscall_nr;
568
	int rctx;
569
	int size;
570 571

	syscall_nr = syscall_get_nr(current, regs);
572
	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
573 574 575 576 577 578
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

579 580 581
	/* We can probably do that at build time */
	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);
582

583 584 585 586
	/*
	 * Impossible, but be paranoid with the future
	 * How to put this check outside runtime?
	 */
587 588
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		"exit event has grown above perf buffer size"))
589 590
		return;

591
	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
592
				sys_data->exit_event->event.type, regs, &rctx);
593 594
	if (!rec)
		return;
595 596 597 598

	rec->nr = syscall_nr;
	rec->ret = syscall_get_return_value(current, regs);

599
	head = this_cpu_ptr(sys_data->exit_event->perf_events);
600
	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
601 602
}

603
int perf_sysexit_enable(struct ftrace_event_call *call)
604 605 606 607
{
	int ret = 0;
	int num;

608
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
609 610

	mutex_lock(&syscall_trace_lock);
611
	if (!sys_perf_refcount_exit)
612
		ret = register_trace_sys_exit(perf_syscall_exit, NULL);
613 614
	if (ret) {
		pr_info("event trace: Could not activate"
615
				"syscall exit trace point");
616
	} else {
617 618
		set_bit(num, enabled_perf_exit_syscalls);
		sys_perf_refcount_exit++;
619 620 621 622 623
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

624
/*
 * Disable perf tracing of @call's syscall exit; the shared probe is
 * unregistered when the last user goes away.
 */
void perf_sysexit_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_perf_refcount_exit--;
	clear_bit(num, enabled_perf_exit_syscalls);
	if (!sys_perf_refcount_exit)
		unregister_trace_sys_exit(perf_syscall_exit, NULL);
	mutex_unlock(&syscall_trace_lock);
}

638
#endif /* CONFIG_PERF_EVENTS */
639

640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680
/*
 * Class ->reg() callback for syscall-entry events: dispatch the
 * requested action to the matching ftrace/perf enable or disable
 * helper.  Disable paths cannot fail, so they report success.
 */
static int syscall_enter_register(struct ftrace_event_call *event,
				 enum trace_reg type)
{
	int ret = 0;

	switch (type) {
	case TRACE_REG_REGISTER:
		ret = reg_event_syscall_enter(event);
		break;
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_enter(event);
		break;
#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		ret = perf_sysenter_enable(event);
		break;
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysenter_disable(event);
		break;
#endif
	}
	return ret;
}

/*
 * Class ->reg() callback for syscall-exit events: dispatch the
 * requested action to the matching ftrace/perf enable or disable
 * helper.  Disable paths cannot fail, so they report success.
 */
static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type)
{
	int ret = 0;

	switch (type) {
	case TRACE_REG_REGISTER:
		ret = reg_event_syscall_exit(event);
		break;
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_exit(event);
		break;
#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		ret = perf_sysexit_enable(event);
		break;
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysexit_disable(event);
		break;
#endif
	}
	return ret;
}