#include <trace/syscall.h>
#include <trace/events/syscalls.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/ftrace.h>
#include <linux/perf_event.h>
#include <asm/syscall.h>

#include "trace_output.h"
#include "trace.h"

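/*
 * syscall_trace_lock serializes the enable/disable paths below. The
 * refcounts count how many events currently use the enter/exit
 * tracepoints; the bitmaps record which individual syscalls are
 * enabled for tracing.
 */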
static DEFINE_MUTEX(syscall_trace_lock);
static int sys_refcount_enter;
static int sys_refcount_exit;
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);

static int syscall_enter_register(struct ftrace_event_call *event,
				 enum trace_reg type);
static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type);

static int syscall_enter_define_fields(struct ftrace_event_call *call);
static int syscall_exit_define_fields(struct ftrace_event_call *call);

static struct list_head *
syscall_get_enter_fields(struct ftrace_event_call *call)
{
	struct syscall_metadata *entry = call->data;

	return &entry->enter_fields;
}

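/* Output callbacks used when syscall events are read from the ring buffer. */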
struct trace_event_functions enter_syscall_print_funcs = {
	.trace		= print_syscall_enter,
};

struct trace_event_functions exit_syscall_print_funcs = {
	.trace		= print_syscall_exit,
};

struct ftrace_event_class event_class_syscall_enter = {
	.system		= "syscalls",
	.reg		= syscall_enter_register,
	.define_fields	= syscall_enter_define_fields,
	.get_fields	= syscall_get_enter_fields,
	.raw_init	= init_syscall_trace,
};

struct ftrace_event_class event_class_syscall_exit = {
	.system		= "syscalls",
	.reg		= syscall_exit_register,
	.define_fields	= syscall_exit_define_fields,
	.fields		= LIST_HEAD_INIT(event_class_syscall_exit.fields),
	.raw_init	= init_syscall_trace,
};

extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];

static struct syscall_metadata **syscalls_metadata;

#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
{
	/*
	 * Only compare after the "sys" prefix. Archs that use
	 * syscall wrappers may have syscall symbols aliased with a
	 * "SyS" prefix instead of "sys", leading to an unwanted
	 * mismatch.
	 */
	return !strcmp(sym + 3, name + 3);
}
#endif

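/*
 * Map a syscall handler's address back to its compile-time metadata by
 * resolving the symbol name and scanning the __syscalls_metadata section.
 */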
static __init struct syscall_metadata *
find_syscall_meta(unsigned long syscall)
{
	struct syscall_metadata **start;
	struct syscall_metadata **stop;
	char str[KSYM_SYMBOL_LEN];

	start = __start_syscalls_metadata;
	stop = __stop_syscalls_metadata;
	kallsyms_lookup(syscall, NULL, NULL, NULL, str);

	if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
		return NULL;

	for ( ; start < stop; start++) {
		if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
			return *start;
	}
	return NULL;
}

static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
		return NULL;

	return syscalls_metadata[nr];
}

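/* Render one sys_enter event as "name(arg: value, ...)" in the trace output. */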
enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
		    struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, ret, syscall;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry)
		goto end;

	if (entry->enter_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		goto end;
	}

	ret = trace_seq_printf(s, "%s(", entry->name);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	for (i = 0; i < entry->nb_args; i++) {
		/* parameter types */
		if (trace_flags & TRACE_ITER_VERBOSE) {
			ret = trace_seq_printf(s, "%s ", entry->types[i]);
			if (!ret)
				return TRACE_TYPE_PARTIAL_LINE;
		}
		/* parameter values */
		ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
				       trace->args[i],
				       i == entry->nb_args - 1 ? "" : ", ");
		if (!ret)
			return TRACE_TYPE_PARTIAL_LINE;
	}

	ret = trace_seq_putc(s, ')');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

end:
	ret = trace_seq_putc(s, '\n');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

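/* Render one sys_exit event as "name -> return value". */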
enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags,
		   struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_exit *trace;
	int syscall;
	struct syscall_metadata *entry;
	int ret;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry) {
		trace_seq_printf(s, "\n");
		return TRACE_TYPE_HANDLED;
	}

	if (entry->exit_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		return TRACE_TYPE_UNHANDLED;
	}

	ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
				trace->ret);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

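/*
 * SYSCALL_FIELD expands to the type/name/offset/size/signedness argument
 * list expected by trace_define_field(). The call to the deliberately
 * undefined __bad_type_size() triggers a link error whenever the declared
 * type and the struct member disagree in size.
 */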
extern char *__bad_type_size(void);

#define SYSCALL_FIELD(type, name)					\
	sizeof(type) != sizeof(trace.name) ?				\
		__bad_type_size() :					\
		#type, #name, offsetof(typeof(trace), name),		\
		sizeof(trace.name), is_signed_type(type)

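/*
 * Build the print_fmt string for a syscall entry event. The helper is
 * called twice: once with len == 0 to compute the required length, then
 * again to actually fill the buffer (see LEN_OR_ZERO below).
 */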
static int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	int i;
	int pos = 0;

	/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
				entry->args[i], sizeof(unsigned long),
				i == entry->nb_args - 1 ? "" : ", ");
	}
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))", entry->args[i]);
	}

#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return pos;
}

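/*
 * For illustration: on a 64-bit kernel a three-argument syscall such as
 * read() would get a print_fmt roughly like
 *
 *   "fd: 0x%08lx, buf: 0x%08lx, count: 0x%08lx",
 *   ((unsigned long)(REC->fd)), ((unsigned long)(REC->buf)),
 *   ((unsigned long)(REC->count))
 *
 * where the field width "8" is sizeof(unsigned long) printed via %zu.
 */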
static int set_syscall_print_fmt(struct ftrace_event_call *call)
{
	char *print_fmt;
	int len;
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event != call) {
		call->print_fmt = "\"0x%lx\", REC->ret";
		return 0;
	}

	/* First: called with 0 length to calculate the needed length */
	len = __set_enter_print_fmt(entry, NULL, 0);

	print_fmt = kmalloc(len + 1, GFP_KERNEL);
	if (!print_fmt)
		return -ENOMEM;

	/* Second: actually write the @print_fmt */
	__set_enter_print_fmt(entry, print_fmt, len + 1);
	call->print_fmt = print_fmt;

	return 0;
}

static void free_syscall_print_fmt(struct ftrace_event_call *call)
{
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event == call)
		kfree(call->print_fmt);
}

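/*
 * Describe the fields of an entry event to the filter code: the syscall
 * number followed by one unsigned long per argument.
 */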
static int syscall_enter_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_enter trace;
	struct syscall_metadata *meta = call->data;
	int ret;
	int i;
	int offset = offsetof(typeof(trace), args);

	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

	for (i = 0; i < meta->nb_args; i++) {
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
		offset += sizeof(unsigned long);
	}

	return ret;
}

static int syscall_exit_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_exit trace;
	int ret;

	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

	ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
				 FILTER_OTHER);

	return ret;
}

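/*
 * Probe attached to the sys_enter tracepoint; fires on every syscall
 * entry and records the syscall number plus its arguments into the
 * ftrace ring buffer if this syscall's enter event is enabled.
 */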
void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int size;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->enter_event->event.type, size, 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);

	if (!filter_current_check_discard(buffer, sys_data->enter_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

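/*
 * Probe attached to the sys_exit tracepoint; records the syscall number
 * and return value if this syscall's exit event is enabled.
 */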
void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	if (!filter_current_check_discard(buffer, sys_data->exit_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

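/*
 * Enable tracing of one syscall's entry event. The sys_enter tracepoint
 * probe is registered only for the first user; subsequent users just set
 * their bit in the bitmap and bump the refcount.
 */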
int reg_event_syscall_enter(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_enter)
		ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
	if (!ret) {
		set_bit(num, enabled_enter_syscalls);
		sys_refcount_enter++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void unreg_event_syscall_enter(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_enter--;
	clear_bit(num, enabled_enter_syscalls);
	if (!sys_refcount_enter)
		unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
	mutex_unlock(&syscall_trace_lock);
}

int reg_event_syscall_exit(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_exit)
		ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
	if (!ret) {
		set_bit(num, enabled_exit_syscalls);
		sys_refcount_exit++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void unreg_event_syscall_exit(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_exit--;
	clear_bit(num, enabled_exit_syscalls);
	if (!sys_refcount_exit)
		unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
	mutex_unlock(&syscall_trace_lock);
}

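/*
 * Per-event init callback: reject syscalls whose metadata was never
 * mapped, build the print_fmt and register the event with the trace
 * event core.
 */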
int init_syscall_trace(struct ftrace_event_call *call)
{
	int id;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls) {
		pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
				((struct syscall_metadata *)call->data)->name);
		return -ENOSYS;
	}

	if (set_syscall_print_fmt(call) < 0)
		return -ENOMEM;

	id = trace_event_raw_init(call);

	if (id < 0) {
		free_syscall_print_fmt(call);
		return id;
	}

	return id;
}

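/*
 * Default syscall number -> handler address mapping; architectures with
 * an unusual sys_call_table layout can override this weak symbol.
 */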
unsigned long __init __weak arch_syscall_addr(int nr)
{
	return (unsigned long)sys_call_table[nr];
}

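/*
 * Boot-time setup: walk the syscall table once and build the
 * syscalls_metadata array used by syscall_nr_to_meta().
 */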
int __init init_ftrace_syscalls(void)
{
	struct syscall_metadata *meta;
	unsigned long addr;
	int i;

	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
					NR_syscalls, GFP_KERNEL);
	if (!syscalls_metadata) {
		WARN_ON(1);
		return -ENOMEM;
	}

	for (i = 0; i < NR_syscalls; i++) {
		addr = arch_syscall_addr(i);
		meta = find_syscall_meta(addr);
		if (!meta)
			continue;

		meta->syscall_nr = i;
		syscalls_metadata[i] = meta;
	}

	return 0;
}
core_initcall(init_ftrace_syscalls);

#ifdef CONFIG_PERF_EVENTS

static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;

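/*
 * perf counterpart of ftrace_syscall_enter(): copies the event into a
 * per-cpu perf buffer instead of the ftrace ring buffer.
 */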
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_enter *rec;
	struct hlist_head *head;
	int syscall_nr;
	int rctx;
	int size;

	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* get the size after alignment with the u32 buffer size field */
	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
	size = ALIGN(size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		      "perf buffer not large enough"))
		return;

	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
				sys_data->enter_event->event.type, regs, &rctx);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
			       (unsigned long *)&rec->args);

	head = this_cpu_ptr(sys_data->enter_event->perf_events);
	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
}

int perf_sysenter_enable(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	if (!sys_perf_refcount_enter)
		ret = register_trace_sys_enter(perf_syscall_enter, NULL);
	if (ret) {
		pr_info("event trace: Could not activate "
				"syscall entry trace point");
	} else {
		set_bit(num, enabled_perf_enter_syscalls);
		sys_perf_refcount_enter++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void perf_sysenter_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_perf_refcount_enter--;
	clear_bit(num, enabled_perf_enter_syscalls);
	if (!sys_perf_refcount_enter)
		unregister_trace_sys_enter(perf_syscall_enter, NULL);
	mutex_unlock(&syscall_trace_lock);
}

static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_exit *rec;
	struct hlist_head *head;
	int syscall_nr;
	int rctx;
	int size;

	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* We can probably do that at build time */
	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	/*
	 * Impossible, but be paranoid with the future
	 * How to put this check outside runtime?
	 */
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		"exit event has grown above perf buffer size"))
		return;

	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
				sys_data->exit_event->event.type, regs, &rctx);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	rec->ret = syscall_get_return_value(current, regs);

	head = this_cpu_ptr(sys_data->exit_event->perf_events);
	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
}

int perf_sysexit_enable(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	if (!sys_perf_refcount_exit)
		ret = register_trace_sys_exit(perf_syscall_exit, NULL);
	if (ret) {
		pr_info("event trace: Could not activate "
				"syscall exit trace point");
	} else {
		set_bit(num, enabled_perf_exit_syscalls);
		sys_perf_refcount_exit++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void perf_sysexit_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_perf_refcount_exit--;
	clear_bit(num, enabled_perf_exit_syscalls);
	if (!sys_perf_refcount_exit)
		unregister_trace_sys_exit(perf_syscall_exit, NULL);
	mutex_unlock(&syscall_trace_lock);
}

#endif /* CONFIG_PERF_EVENTS */

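/*
 * Registration dispatchers, called through struct ftrace_event_class.reg;
 * they route ftrace and perf enable/disable requests to the helpers above.
 */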
static int syscall_enter_register(struct ftrace_event_call *event,
				 enum trace_reg type)
{
	switch (type) {
	case TRACE_REG_REGISTER:
		return reg_event_syscall_enter(event);
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_enter(event);
		return 0;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		return perf_sysenter_enable(event);
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysenter_disable(event);
		return 0;
#endif
	}
	return 0;
}

static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type)
{
	switch (type) {
	case TRACE_REG_REGISTER:
		return reg_event_syscall_exit(event);
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_exit(event);
		return 0;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		return perf_sysexit_enable(event);
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysexit_disable(event);
		return 0;
#endif
	}
	return 0;
}