/*
 * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blktrace_api.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/time.h>
#include <linux/uaccess.h>
#include <linux/list.h>
#include <linux/blk-cgroup.h>

#include "../../block/blk.h"

#include <trace/events/block.h>

#include "trace_output.h"

#ifdef CONFIG_BLK_DEV_IO_TRACE

static unsigned int blktrace_seq __read_mostly = 1;

static struct trace_array *blk_tr;
static bool blk_tracer_enabled __read_mostly;

static LIST_HEAD(running_trace_list);
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);

/* Select an alternative, minimalistic output instead of the original one */
#define TRACE_BLK_OPT_CLASSIC	0x1
#define TRACE_BLK_OPT_CGROUP	0x2
#define TRACE_BLK_OPT_CGNAME	0x4

static struct tracer_opt blk_tracer_opts[] = {
	/* The minimalistic output is disabled by default */
	{ TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
#ifdef CONFIG_BLK_CGROUP
	{ TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) },
	{ TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) },
#endif
	{ }
};

static struct tracer_flags blk_tracer_flags = {
	.val  = 0,
	.opts = blk_tracer_opts,
};

/* Global reference count of probes */
static DEFINE_MUTEX(blk_probe_mutex);
static int blk_probes_ref;

static void blk_register_tracepoints(void);
static void blk_unregister_tracepoints(void);

/*
 * Send out a notify message.
 */
static void trace_note(struct blk_trace *bt, pid_t pid, int action,
		       const void *data, size_t len,
		       union kernfs_node_id *cgid)
{
	struct blk_io_trace *t;
	struct ring_buffer_event *event = NULL;
	struct ring_buffer *buffer = NULL;
	int pc = 0;
	int cpu = smp_processor_id();
	bool blk_tracer = blk_tracer_enabled;
	ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;

	if (blk_tracer) {
		buffer = blk_tr->trace_buffer.buffer;
		pc = preempt_count();
		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
						  sizeof(*t) + len + cgid_len,
						  0, pc);
		if (!event)
			return;
		t = ring_buffer_event_data(event);
		goto record_it;
	}

	if (!bt->rchan)
		return;

	t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len);
	if (t) {
		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
		t->time = ktime_to_ns(ktime_get());
record_it:
		t->device = bt->dev;
		t->action = action | (cgid ? __BLK_TN_CGROUP : 0);
		t->pid = pid;
		t->cpu = cpu;
		t->pdu_len = len + cgid_len;
		if (cgid)
			memcpy((void *)t + sizeof(*t), cgid, cgid_len);
		memcpy((void *) t + sizeof(*t) + cgid_len, data, len);

		if (blk_tracer)
			trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
	}
}

/*
 * Send out a notify for this process, if we haven't done so since a trace
 * started
 */
static void trace_note_tsk(struct task_struct *tsk)
{
	unsigned long flags;
	struct blk_trace *bt;

	tsk->btrace_seq = blktrace_seq;
	spin_lock_irqsave(&running_trace_lock, flags);
	list_for_each_entry(bt, &running_trace_list, running_list) {
		trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
			   sizeof(tsk->comm), NULL);
	}
	spin_unlock_irqrestore(&running_trace_lock, flags);
}

static void trace_note_time(struct blk_trace *bt)
{
	struct timespec64 now;
	unsigned long flags;
	u32 words[2];

	/* need to check user space to see if this breaks in y2038 or y2106 */
	ktime_get_real_ts64(&now);
	words[0] = (u32)now.tv_sec;
	words[1] = now.tv_nsec;

	local_irq_save(flags);
	trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL);
	local_irq_restore(flags);
}

void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
	const char *fmt, ...)
{
	int n;
	va_list args;
	unsigned long flags;
	char *buf;

	if (unlikely(bt->trace_state != Blktrace_running &&
		     !blk_tracer_enabled))
		return;

	/*
	 * If the BLK_TC_NOTIFY action mask isn't set, don't send any note
	 * message to the trace.
	 */
	if (!(bt->act_mask & BLK_TC_NOTIFY))
		return;

	local_irq_save(flags);
	buf = this_cpu_ptr(bt->msg_data);
	va_start(args, fmt);
	n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
	va_end(args);

	if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
		blkcg = NULL;
#ifdef CONFIG_BLK_CGROUP
	trace_note(bt, 0, BLK_TN_MESSAGE, buf, n,
		blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL);
#else
	trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL);
#endif
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(__trace_note_message);

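/*
 * Decide whether an event should be dropped: returns 1 if the action type
 * is filtered out by the action mask, the sector falls outside the traced
 * LBA range, or the event does not belong to the traced pid.
 */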
static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
			 pid_t pid)
{
	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
		return 1;
	if (sector && (sector < bt->start_lba || sector > bt->end_lba))
		return 1;
	if (bt->pid && pid != bt->pid)
		return 1;

	return 0;
}

/*
 * Data direction bit lookup
 */
static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
				 BLK_TC_ACT(BLK_TC_WRITE) };

#define BLK_TC_RAHEAD		BLK_TC_AHEAD
#define BLK_TC_PREFLUSH		BLK_TC_FLUSH

/* The ilog2() calls fall out because they're constant */
#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
	  (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))

/*
 * The worker for the various blk_add_trace*() types. Fills out a
 * blk_io_trace structure and places it in a per-cpu subbuffer.
 */
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
		     int op, int op_flags, u32 what, int error, int pdu_len,
		     void *pdu_data, union kernfs_node_id *cgid)
{
	struct task_struct *tsk = current;
	struct ring_buffer_event *event = NULL;
	struct ring_buffer *buffer = NULL;
	struct blk_io_trace *t;
	unsigned long flags = 0;
	unsigned long *sequence;
	pid_t pid;
	int cpu, pc = 0;
	bool blk_tracer = blk_tracer_enabled;
	ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;

	if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
		return;

	what |= ddir_act[op_is_write(op) ? WRITE : READ];
	what |= MASK_TC_BIT(op_flags, SYNC);
	what |= MASK_TC_BIT(op_flags, RAHEAD);
	what |= MASK_TC_BIT(op_flags, META);
	what |= MASK_TC_BIT(op_flags, PREFLUSH);
	what |= MASK_TC_BIT(op_flags, FUA);
	if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
		what |= BLK_TC_ACT(BLK_TC_DISCARD);
	if (op == REQ_OP_FLUSH)
		what |= BLK_TC_ACT(BLK_TC_FLUSH);
	if (cgid)
		what |= __BLK_TA_CGROUP;

	pid = tsk->pid;
	if (act_log_check(bt, what, sector, pid))
		return;
	cpu = raw_smp_processor_id();

	if (blk_tracer) {
		tracing_record_cmdline(current);

		buffer = blk_tr->trace_buffer.buffer;
		pc = preempt_count();
		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
						  sizeof(*t) + pdu_len + cgid_len,
						  0, pc);
		if (!event)
			return;
		t = ring_buffer_event_data(event);
		goto record_it;
	}

	if (unlikely(tsk->btrace_seq != blktrace_seq))
		trace_note_tsk(tsk);

	/*
	 * A word about the locking here - we disable interrupts to reserve
	 * some space in the relay per-cpu buffer, to prevent an irq
	 * from coming in and stepping on our toes.
	 */
	local_irq_save(flags);
	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len);
	if (t) {
		sequence = per_cpu_ptr(bt->sequence, cpu);

		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
		t->sequence = ++(*sequence);
		t->time = ktime_to_ns(ktime_get());
record_it:
		/*
		 * These two are not needed in ftrace as they are in the
		 * generic trace_entry, filled by tracing_generic_entry_update,
		 * but for the trace_event->bin() synthesizer benefit we do it
		 * here too.
		 */
		t->cpu = cpu;
		t->pid = pid;

		t->sector = sector;
		t->bytes = bytes;
		t->action = what;
		t->device = bt->dev;
		t->error = error;
		t->pdu_len = pdu_len + cgid_len;

		if (cgid_len)
			memcpy((void *)t + sizeof(*t), cgid, cgid_len);
		if (pdu_len)
			memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);

		if (blk_tracer) {
			trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
			return;
		}
	}

	local_irq_restore(flags);
}

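/*
 * Release everything attached to a blk_trace: the debugfs files, the relay
 * channel and its directory, and the per-cpu sequence/message buffers.
 */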
static void blk_trace_free(struct blk_trace *bt)
{
	debugfs_remove(bt->msg_file);
	debugfs_remove(bt->dropped_file);
	relay_close(bt->rchan);
	debugfs_remove(bt->dir);
	free_percpu(bt->sequence);
	free_percpu(bt->msg_data);
	kfree(bt);
}

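/*
 * Register the block tracepoint probes on the first reference and
 * unregister them again when the last user goes away.
 */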
static void get_probe_ref(void)
{
	mutex_lock(&blk_probe_mutex);
	if (++blk_probes_ref == 1)
		blk_register_tracepoints();
	mutex_unlock(&blk_probe_mutex);
}

static void put_probe_ref(void)
{
	mutex_lock(&blk_probe_mutex);
	if (!--blk_probes_ref)
		blk_unregister_tracepoints();
	mutex_unlock(&blk_probe_mutex);
}

static void blk_trace_cleanup(struct blk_trace *bt)
{
	blk_trace_free(bt);
	put_probe_ref();
}

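/*
 * Detach the blk_trace from the queue; it is only torn down and freed
 * here if it is not currently running.
 */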
int blk_trace_remove(struct request_queue *q)
{
	struct blk_trace *bt;

	bt = xchg(&q->blk_trace, NULL);
	if (!bt)
		return -EINVAL;

	if (bt->trace_state != Blktrace_running)
		blk_trace_cleanup(bt);

	return 0;
}
EXPORT_SYMBOL_GPL(blk_trace_remove);

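/*
 * Report the number of events dropped because a relay subbuffer was full;
 * exposed to user space through the per-device debugfs "dropped" file.
 */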
static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
				size_t count, loff_t *ppos)
{
	struct blk_trace *bt = filp->private_data;
	char buf[16];

	snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));

	return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
}

static const struct file_operations blk_dropped_fops = {
	.owner =	THIS_MODULE,
	.open =		simple_open,
	.read =		blk_dropped_read,
	.llseek =	default_llseek,
};

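/*
 * Allow user space to inject a free-form message into the trace stream by
 * writing to the debugfs "msg" file; it shows up as a BLK_TN_MESSAGE note.
 */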
static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
				size_t count, loff_t *ppos)
{
	char *msg;
	struct blk_trace *bt;

	if (count >= BLK_TN_MAX_MSG)
		return -EINVAL;

	msg = memdup_user_nul(buffer, count);
	if (IS_ERR(msg))
		return PTR_ERR(msg);

	bt = filp->private_data;
	__trace_note_message(bt, NULL, "%s", msg);
	kfree(msg);

	return count;
}

static const struct file_operations blk_msg_fops = {
	.owner =	THIS_MODULE,
	.open =		simple_open,
	.write =	blk_msg_write,
	.llseek =	noop_llseek,
};

/*
 * Keep track of how many times we encountered a full subbuffer, to aid
 * the user space app in telling how many lost events there were.
 */
static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
				     void *prev_subbuf, size_t prev_padding)
{
	struct blk_trace *bt;

	if (!relay_buf_full(buf))
		return 1;

	bt = buf->chan->private_data;
	atomic_inc(&bt->dropped);
	return 0;
}

static int blk_remove_buf_file_callback(struct dentry *dentry)
{
	debugfs_remove(dentry);

	return 0;
}

static struct dentry *blk_create_buf_file_callback(const char *filename,
						   struct dentry *parent,
						   umode_t mode,
						   struct rchan_buf *buf,
						   int *is_global)
{
	return debugfs_create_file(filename, mode, parent, buf,
					&relay_file_operations);
}

static struct rchan_callbacks blk_relay_callbacks = {
	.subbuf_start		= blk_subbuf_start_callback,
	.create_buf_file	= blk_create_buf_file_callback,
	.remove_buf_file	= blk_remove_buf_file_callback,
};

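/*
 * Derive the default LBA filter range from the partition the trace was
 * started on; fall back to the whole device if no partition is known.
 */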
static void blk_trace_setup_lba(struct blk_trace *bt,
				struct block_device *bdev)
{
	struct hd_struct *part = NULL;

	if (bdev)
		part = bdev->bd_part;

	if (part) {
		bt->start_lba = part->start_sect;
		bt->end_lba = part->start_sect + part->nr_sects;
	} else {
		bt->start_lba = 0;
		bt->end_lba = -1ULL;
	}
}

/*
 * Setup everything required to start tracing
 */
static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
			      struct block_device *bdev,
			      struct blk_user_trace_setup *buts)
{
	struct blk_trace *bt = NULL;
	struct dentry *dir = NULL;
	int ret;

	if (!buts->buf_size || !buts->buf_nr)
		return -EINVAL;

	strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
	buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';

	/*
	 * some device names contain path separators - convert the slashes
	 * to underscores so the name can be used as a debugfs directory name
	 */
	strreplace(buts->name, '/', '_');

	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
	if (!bt)
		return -ENOMEM;

	ret = -ENOMEM;
	bt->sequence = alloc_percpu(unsigned long);
	if (!bt->sequence)
		goto err;

	bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
	if (!bt->msg_data)
		goto err;

	ret = -ENOENT;

	if (!blk_debugfs_root)
		goto err;

	dir = debugfs_lookup(buts->name, blk_debugfs_root);
	if (!dir)
		bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
	if (!dir)
		goto err;

	bt->dev = dev;
	atomic_set(&bt->dropped, 0);
	INIT_LIST_HEAD(&bt->running_list);

	ret = -EIO;
	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
					       &blk_dropped_fops);
	if (!bt->dropped_file)
		goto err;

	bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
	if (!bt->msg_file)
		goto err;

	bt->rchan = relay_open("trace", dir, buts->buf_size,
				buts->buf_nr, &blk_relay_callbacks, bt);
	if (!bt->rchan)
		goto err;

	bt->act_mask = buts->act_mask;
	if (!bt->act_mask)
		bt->act_mask = (u16) -1;

	blk_trace_setup_lba(bt, bdev);

	/* overwrite with user settings */
	if (buts->start_lba)
		bt->start_lba = buts->start_lba;
	if (buts->end_lba)
		bt->end_lba = buts->end_lba;

	bt->pid = buts->pid;
	bt->trace_state = Blktrace_setup;

	ret = -EBUSY;
	if (cmpxchg(&q->blk_trace, NULL, bt))
		goto err;

	get_probe_ref();

	ret = 0;
err:
	if (dir && !bt->dir)
		dput(dir);
	if (ret)
		blk_trace_free(bt);
	return ret;
}

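/*
 * blk_trace_setup - copy in the user's blk_user_trace_setup, start the
 * trace and copy the (name-adjusted) setup back to user space.
 *
 * Rough sketch of the user-space side, for illustration only: fill a
 * struct blk_user_trace_setup with buf_size, buf_nr and act_mask, issue
 * ioctl(fd, BLKTRACESETUP, &buts) on the opened block device, then
 * BLKTRACESTART to begin tracing and BLKTRACETEARDOWN to stop and free it.
 */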
int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
		    struct block_device *bdev,
		    char __user *arg)
{
	struct blk_user_trace_setup buts;
	int ret;

	ret = copy_from_user(&buts, arg, sizeof(buts));
	if (ret)
		return -EFAULT;

	ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
	if (ret)
		return ret;

	if (copy_to_user(arg, &buts, sizeof(buts))) {
		blk_trace_remove(q);
		return -EFAULT;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(blk_trace_setup);

#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
static int compat_blk_trace_setup(struct request_queue *q, char *name,
				  dev_t dev, struct block_device *bdev,
				  char __user *arg)
{
	struct blk_user_trace_setup buts;
	struct compat_blk_user_trace_setup cbuts;
	int ret;

	if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
		return -EFAULT;

	buts = (struct blk_user_trace_setup) {
		.act_mask = cbuts.act_mask,
		.buf_size = cbuts.buf_size,
		.buf_nr = cbuts.buf_nr,
		.start_lba = cbuts.start_lba,
		.end_lba = cbuts.end_lba,
		.pid = cbuts.pid,
	};

	ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
	if (ret)
		return ret;

	if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
		blk_trace_remove(q);
		return -EFAULT;
	}

	return 0;
}
#endif

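/*
 * Start or stop an already set up trace: starting moves it onto the global
 * running list and emits a timestamp note, stopping takes it back off the
 * list and flushes the relay channel.
 */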
int blk_trace_startstop(struct request_queue *q, int start)
{
	int ret;
	struct blk_trace *bt = q->blk_trace;

	if (bt == NULL)
		return -EINVAL;

	/*
	 * For starting a trace, we can transition from a setup or stopped
	 * trace. For stopping a trace, the state must be running
	 */
	ret = -EINVAL;
	if (start) {
		if (bt->trace_state == Blktrace_setup ||
		    bt->trace_state == Blktrace_stopped) {
			blktrace_seq++;
			smp_mb();
			bt->trace_state = Blktrace_running;
			spin_lock_irq(&running_trace_lock);
			list_add(&bt->running_list, &running_trace_list);
			spin_unlock_irq(&running_trace_lock);

			trace_note_time(bt);
			ret = 0;
		}
	} else {
		if (bt->trace_state == Blktrace_running) {
			bt->trace_state = Blktrace_stopped;
			spin_lock_irq(&running_trace_lock);
			list_del_init(&bt->running_list);
			spin_unlock_irq(&running_trace_lock);
			relay_flush(bt->rchan);
			ret = 0;
		}
	}

	return ret;
}
EXPORT_SYMBOL_GPL(blk_trace_startstop);

/*
 * When reading or writing the blktrace sysfs files, the references to the
 * opened sysfs or device files should prevent the underlying block device
 * from being removed. So no further delete protection is really needed.
 */

/**
 * blk_trace_ioctl: - handle the ioctls associated with tracing
 * @bdev:	the block device
 * @cmd:	the ioctl cmd
 * @arg:	the argument data, if any
 *
 **/
int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
{
	struct request_queue *q;
	int ret, start = 0;
	char b[BDEVNAME_SIZE];

	q = bdev_get_queue(bdev);
	if (!q)
		return -ENXIO;

	mutex_lock(&q->blk_trace_mutex);

	switch (cmd) {
	case BLKTRACESETUP:
		bdevname(bdev, b);
		ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
		break;
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
	case BLKTRACESETUP32:
		bdevname(bdev, b);
		ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
		break;
#endif
	case BLKTRACESTART:
		start = 1;
		/* fall through */
	case BLKTRACESTOP:
		ret = blk_trace_startstop(q, start);
		break;
	case BLKTRACETEARDOWN:
		ret = blk_trace_remove(q);
		break;
	default:
		ret = -ENOTTY;
		break;
	}

	mutex_unlock(&q->blk_trace_mutex);
	return ret;
}

/**
 * blk_trace_shutdown: - stop and cleanup trace structures
 * @q:    the request queue associated with the device
 *
 **/
void blk_trace_shutdown(struct request_queue *q)
{
	if (q->blk_trace) {
		blk_trace_startstop(q, 0);
		blk_trace_remove(q);
	}
}

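/*
 * Look up the cgroup id to attach to an event; only used when the
 * blk_cgroup tracer option has been enabled.
 */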
#ifdef CONFIG_BLK_CGROUP
static union kernfs_node_id *
blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{
	struct blk_trace *bt = q->blk_trace;

	if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
		return NULL;

	if (!bio->bi_css)
		return NULL;
	return cgroup_get_kernfs_id(bio->bi_css->cgroup);
}
#else
static union kernfs_node_id *
blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{
	return NULL;
}
#endif

static union kernfs_node_id *
blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
{
	if (!rq->bio)
		return NULL;
	/* Use the first bio */
	return blk_trace_bio_get_cgid(q, rq->bio);
}

/*
 * blktrace probes
 */

/**
 * blk_add_trace_rq - Add a trace for a request oriented action
 * @rq:		the source request
 * @error:	return status to log
 * @nr_bytes:	number of completed bytes
 * @what:	the action
 * @cgid:	the cgroup info
 *
 * Description:
 *     Records an action against a request. Will log the bio offset + size.
 *
 **/
static void blk_add_trace_rq(struct request *rq, int error,
			     unsigned int nr_bytes, u32 what,
			     union kernfs_node_id *cgid)
{
	struct blk_trace *bt = rq->q->blk_trace;

	if (likely(!bt))
		return;

	if (blk_rq_is_passthrough(rq))
		what |= BLK_TC_ACT(BLK_TC_PC);
	else
		what |= BLK_TC_ACT(BLK_TC_FS);

	__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
			rq->cmd_flags, what, error, 0, NULL, cgid);
}

static void blk_add_trace_rq_insert(void *ignore,
				    struct request_queue *q, struct request *rq)
{
	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT,
			 blk_trace_request_get_cgid(q, rq));
}

static void blk_add_trace_rq_issue(void *ignore,
				   struct request_queue *q, struct request *rq)
{
	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE,
			 blk_trace_request_get_cgid(q, rq));
}

static void blk_add_trace_rq_requeue(void *ignore,
				     struct request_queue *q,
				     struct request *rq)
{
	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE,
			 blk_trace_request_get_cgid(q, rq));
}

static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
			int error, unsigned int nr_bytes)
{
	blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
			 blk_trace_request_get_cgid(rq->q, rq));
}

/**
 * blk_add_trace_bio - Add a trace for a bio oriented action
 * @q:		queue the io is for
 * @bio:	the source bio
 * @what:	the action
 * @error:	error, if any
 * @cgid:	the cgroup info
 *
 * Description:
 *     Records an action against a bio. Will log the bio offset + size.
 *
 **/
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
			      u32 what, int error, union kernfs_node_id *cgid)
{
	struct blk_trace *bt = q->blk_trace;

	if (likely(!bt))
		return;

	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
			bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid);
}

static void blk_add_trace_bio_bounce(void *ignore,
				     struct request_queue *q, struct bio *bio)
{
	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0,
			  blk_trace_bio_get_cgid(q, bio));
}

static void blk_add_trace_bio_complete(void *ignore,
				       struct request_queue *q, struct bio *bio,
				       int error)
{
	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error,
			  blk_trace_bio_get_cgid(q, bio));
}

static void blk_add_trace_bio_backmerge(void *ignore,
					struct request_queue *q,
					struct request *rq,
					struct bio *bio)
{
	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0,
			 blk_trace_bio_get_cgid(q, bio));
}

static void blk_add_trace_bio_frontmerge(void *ignore,
					 struct request_queue *q,
					 struct request *rq,
					 struct bio *bio)
{
	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0,
			  blk_trace_bio_get_cgid(q, bio));
}

static void blk_add_trace_bio_queue(void *ignore,
				    struct request_queue *q, struct bio *bio)
{
	blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0,
			  blk_trace_bio_get_cgid(q, bio));
}

static void blk_add_trace_getrq(void *ignore,
				struct request_queue *q,
				struct bio *bio, int rw)
{
	if (bio)
		blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0,
				  blk_trace_bio_get_cgid(q, bio));
	else {
		struct blk_trace *bt = q->blk_trace;

		if (bt)
			__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
					NULL, NULL);
	}
}


static void blk_add_trace_sleeprq(void *ignore,
				  struct request_queue *q,
				  struct bio *bio, int rw)
{
	if (bio)
		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0,
				  blk_trace_bio_get_cgid(q, bio));
	else {
		struct blk_trace *bt = q->blk_trace;

		if (bt)
			__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
					0, 0, NULL, NULL);
	}
}

static void blk_add_trace_plug(void *ignore, struct request_queue *q)
{
	struct blk_trace *bt = q->blk_trace;

	if (bt)
		__blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL);
}

static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
				    unsigned int depth, bool explicit)
{
	struct blk_trace *bt = q->blk_trace;

	if (bt) {
		__be64 rpdu = cpu_to_be64(depth);
		u32 what;

		if (explicit)
			what = BLK_TA_UNPLUG_IO;
		else
			what = BLK_TA_UNPLUG_TIMER;

		__blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL);
	}
}

static void blk_add_trace_split(void *ignore,
				struct request_queue *q, struct bio *bio,
				unsigned int pdu)
{
	struct blk_trace *bt = q->blk_trace;

	if (bt) {
		__be64 rpdu = cpu_to_be64(pdu);

		__blk_add_trace(bt, bio->bi_iter.bi_sector,
				bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
				BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu),
				&rpdu, blk_trace_bio_get_cgid(q, bio));
	}
}

/**
 * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
 * @ignore:	trace callback data parameter (not used)
 * @q:		queue the io is for
 * @bio:	the source bio
 * @dev:	target device
 * @from:	source sector
 *
 * Description:
 *     Device mapper or raid target sometimes need to split a bio because
 *     it spans a stripe (or similar). Add a trace for that action.
 *
 **/
static void blk_add_trace_bio_remap(void *ignore,
				    struct request_queue *q, struct bio *bio,
				    dev_t dev, sector_t from)
{
	struct blk_trace *bt = q->blk_trace;
	struct blk_io_trace_remap r;

	if (likely(!bt))
		return;

	r.device_from = cpu_to_be32(dev);
	r.device_to   = cpu_to_be32(bio_dev(bio));
	r.sector_from = cpu_to_be64(from);

	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
			bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status,
			sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
}

/**
 * blk_add_trace_rq_remap - Add a trace for a request-remap operation
 * @ignore:	trace callback data parameter (not used)
 * @q:		queue the io is for
 * @rq:		the source request
 * @dev:	target device
 * @from:	source sector
 *
 * Description:
 *     Device mapper remaps request to other devices.
 *     Add a trace for that action.
 *
 **/
static void blk_add_trace_rq_remap(void *ignore,
				   struct request_queue *q,
				   struct request *rq, dev_t dev,
				   sector_t from)
{
	struct blk_trace *bt = q->blk_trace;
	struct blk_io_trace_remap r;

	if (likely(!bt))
		return;

	r.device_from = cpu_to_be32(dev);
	r.device_to   = cpu_to_be32(disk_devt(rq->rq_disk));
	r.sector_from = cpu_to_be64(from);

	__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
			rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
			sizeof(r), &r, blk_trace_request_get_cgid(q, rq));
}

/**
 * blk_add_driver_data - Add binary message with driver-specific data
 * @q:		queue the io is for
 * @rq:		io request
 * @data:	driver-specific data
 * @len:	length of driver-specific data
 *
 * Description:
 *     Some drivers might want to write driver-specific data per request.
 *
 **/
void blk_add_driver_data(struct request_queue *q,
			 struct request *rq,
			 void *data, size_t len)
{
	struct blk_trace *bt = q->blk_trace;

	if (likely(!bt))
		return;

	__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
				BLK_TA_DRV_DATA, 0, len, data,
				blk_trace_request_get_cgid(q, rq));
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);

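/*
 * Attach the blk_add_trace_* probes to the block layer tracepoints. Any
 * failure to register is only warned about, since tracing is best effort.
 */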
static void blk_register_tracepoints(void)
{
	int ret;

	ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
	WARN_ON(ret);
	ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
	WARN_ON(ret);
	ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
	WARN_ON(ret);
	ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
	WARN_ON(ret);
	ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
	WARN_ON(ret);
	ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
	WARN_ON(ret);
	ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
	WARN_ON(ret);
	ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
	WARN_ON(ret);
	ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
	WARN_ON(ret);
	ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
	WARN_ON(ret);
	ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
	WARN_ON(ret);
	ret = register_trace_block_plug(blk_add_trace_plug, NULL);
	WARN_ON(ret);
	ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
	WARN_ON(ret);
	ret = register_trace_block_split(blk_add_trace_split, NULL);
	WARN_ON(ret);
	ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
	WARN_ON(ret);
	ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
	WARN_ON(ret);
}

static void blk_unregister_tracepoints(void)
{
	unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
	unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
	unregister_trace_block_split(blk_add_trace_split, NULL);
	unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
	unregister_trace_block_plug(blk_add_trace_plug, NULL);
	unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
	unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
	unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
	unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
	unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
	unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
	unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
	unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
	unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
	unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
	unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);

	tracepoint_synchronize_unregister();
}

/*
 * struct blk_io_tracer formatting routines
 */

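/*
 * Build the classic blktrace "rwbs" string (e.g. "WS" for a sync write)
 * from the trace category bits carried in the action field.
 */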
static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
{
	int i = 0;
	int tc = t->action >> BLK_TC_SHIFT;

	if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
		rwbs[i++] = 'N';
		goto out;
	}

	if (tc & BLK_TC_FLUSH)
		rwbs[i++] = 'F';

	if (tc & BLK_TC_DISCARD)
		rwbs[i++] = 'D';
	else if (tc & BLK_TC_WRITE)
		rwbs[i++] = 'W';
	else if (t->bytes)
		rwbs[i++] = 'R';
	else
		rwbs[i++] = 'N';

	if (tc & BLK_TC_FUA)
		rwbs[i++] = 'F';
	if (tc & BLK_TC_AHEAD)
		rwbs[i++] = 'A';
	if (tc & BLK_TC_SYNC)
		rwbs[i++] = 'S';
	if (tc & BLK_TC_META)
		rwbs[i++] = 'M';
out:
	rwbs[i] = '\0';
}

static inline
const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
{
	return (const struct blk_io_trace *)ent;
}

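/*
 * A trace entry is laid out as the blk_io_trace header, optionally followed
 * by the cgroup id, followed by the event's payload (pdu). These helpers
 * locate the individual pieces.
 */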
static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg)
{
	return (void *)(te_blk_io_trace(ent) + 1) +
		(has_cg ? sizeof(union kernfs_node_id) : 0);
}

static inline const void *cgid_start(const struct trace_entry *ent)
{
	return (void *)(te_blk_io_trace(ent) + 1);
}

static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg)
{
	return te_blk_io_trace(ent)->pdu_len -
			(has_cg ? sizeof(union kernfs_node_id) : 0);
}

static inline u32 t_action(const struct trace_entry *ent)
{
	return te_blk_io_trace(ent)->action;
}

static inline u32 t_bytes(const struct trace_entry *ent)
{
	return te_blk_io_trace(ent)->bytes;
}

static inline u32 t_sec(const struct trace_entry *ent)
{
	return te_blk_io_trace(ent)->bytes >> 9;
}

static inline unsigned long long t_sector(const struct trace_entry *ent)
{
	return te_blk_io_trace(ent)->sector;
}

static inline __u16 t_error(const struct trace_entry *ent)
{
	return te_blk_io_trace(ent)->error;
}

static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg)
{
	const __u64 *val = pdu_start(ent, has_cg);
	return be64_to_cpu(*val);
}

static void get_pdu_remap(const struct trace_entry *ent,
			  struct blk_io_trace_remap *r, bool has_cg)
{
	const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
	__u64 sector_from = __r->sector_from;

	r->device_from = be32_to_cpu(__r->device_from);
	r->device_to   = be32_to_cpu(__r->device_to);
	r->sector_from = be64_to_cpu(sector_from);
}

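/*
 * Per-event output helpers: the "classic" variant prints the full
 * blktrace-style prefix itself, while the default variant leaves the
 * common fields to the regular ftrace output code.
 */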
typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act,
	bool has_cg);

static void blk_log_action_classic(struct trace_iterator *iter, const char *act,
	bool has_cg)
{
	char rwbs[RWBS_LEN];
	unsigned long long ts  = iter->ts;
	unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
	unsigned secs	       = (unsigned long)ts;
	const struct blk_io_trace *t = te_blk_io_trace(iter->ent);

	fill_rwbs(rwbs, t);

	trace_seq_printf(&iter->seq,
			 "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
			 MAJOR(t->device), MINOR(t->device), iter->cpu,
			 secs, nsec_rem, iter->ent->pid, act, rwbs);
}

static void blk_log_action(struct trace_iterator *iter, const char *act,
	bool has_cg)
{
	char rwbs[RWBS_LEN];
	const struct blk_io_trace *t = te_blk_io_trace(iter->ent);