/*
 * Block multiqueue core code
 *
 * Copyright (C) 2013-2014 Jens Axboe
 * Copyright (C) 2013-2014 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/llist.h>
#include <linux/list_sort.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/prefetch.h>

#include <trace/events/block.h>

#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-tag.h"
#include "blk-stat.h"
#include "blk-wbt.h"
#include "blk-mq-sched.h"

static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);

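/*
 * Pick a poll-stats bucket for @rq: one bucket per data direction and
 * power-of-two request size (starting at 512 bytes), capped at
 * BLK_MQ_POLL_STATS_BKTS.
 */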
static int blk_mq_poll_stats_bkt(const struct request *rq)
{
	int ddir, bytes, bucket;

	ddir = rq_data_dir(rq);
	bytes = blk_rq_bytes(rq);

	bucket = ddir + 2*(ilog2(bytes) - 9);

	if (bucket < 0)
		return -1;
	else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;

	return bucket;
}

/*
 * Check if any of the ctx's have pending work in this hardware queue
 */
bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
	return sbitmap_any_bit_set(&hctx->ctx_map) ||
			!list_empty_careful(&hctx->dispatch) ||
			blk_mq_sched_has_work(hctx);
}

/*
 * Mark this ctx as having pending work in this hardware queue
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
		sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
}

static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
				      struct blk_mq_ctx *ctx)
{
	sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
}

struct mq_inflight {
	struct hd_struct *part;
	unsigned int *inflight;
};

static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
				  struct request *rq, void *priv,
				  bool reserved)
{
	struct mq_inflight *mi = priv;

	if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) &&
	    !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
		/*
		 * index[0] counts the specific partition that was asked
		 * for. index[1] counts the ones that are active on the
		 * whole device, so increment that if mi->part is indeed
		 * a partition, and not a whole device.
		 */
		if (rq->part == mi->part)
			mi->inflight[0]++;
		if (mi->part->partno)
			mi->inflight[1]++;
	}
}

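/*
 * Count requests currently in flight for @part: inflight[0] is the count
 * for the partition itself, inflight[1] the count for the whole device.
 */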
void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
		      unsigned int inflight[2])
{
	struct mq_inflight mi = { .part = part, .inflight = inflight, };

	inflight[0] = inflight[1] = 0;
	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
}

static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
				     struct request *rq, void *priv,
				     bool reserved)
{
	struct mq_inflight *mi = priv;

	if (rq->part == mi->part)
		mi->inflight[rq_data_dir(rq)]++;
}

void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
			 unsigned int inflight[2])
{
	struct mq_inflight mi = { .part = part, .inflight = inflight, };

	inflight[0] = inflight[1] = 0;
	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
}

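/*
 * Bump the freeze depth; the first freeze kills q_usage_counter so that
 * new requests are held off while the queue drains.
 */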
void blk_freeze_queue_start(struct request_queue *q)
{
	int freeze_depth;

	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
	if (freeze_depth == 1) {
		percpu_ref_kill(&q->q_usage_counter);
		blk_mq_run_hw_queues(q, false);
	}
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);

void blk_mq_freeze_queue_wait(struct request_queue *q)
{
	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);

int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout)
{
	return wait_event_timeout(q->mq_freeze_wq,
					percpu_ref_is_zero(&q->q_usage_counter),
					timeout);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);

/*
 * Guarantee no request is in use, so we can change any data structure of
 * the queue afterward.
 */
void blk_freeze_queue(struct request_queue *q)
{
	/*
	 * In the !blk_mq case we are only calling this to kill the
	 * q_usage_counter, otherwise this increases the freeze depth
	 * and waits for it to return to zero.  For this reason there is
	 * no blk_unfreeze_queue(), and blk_freeze_queue() is not
	 * exported to drivers as the only user for unfreeze is blk_mq.
	 */
	blk_freeze_queue_start(q);
	if (!q->mq_ops)
		blk_drain_queue(q);
	blk_mq_freeze_queue_wait(q);
}

void blk_mq_freeze_queue(struct request_queue *q)
{
	/*
	 * ...just an alias to keep freeze and unfreeze actions balanced
	 * in the blk_mq_* namespace
	 */
	blk_freeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);

void blk_mq_unfreeze_queue(struct request_queue *q)
{
	int freeze_depth;

	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
	WARN_ON_ONCE(freeze_depth < 0);
	if (!freeze_depth) {
		percpu_ref_reinit(&q->q_usage_counter);
		wake_up_all(&q->mq_freeze_wq);
	}
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);

/*
 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
 * mpt3sas driver such that this function can be removed.
 */
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	queue_flag_set(QUEUE_FLAG_QUIESCED, q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);

/**
 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 * @q: request queue.
 *
 * Note: this function does not prevent the struct request end_io()
 * callback from being invoked. Once this function has returned, we make
 * sure no dispatch can happen until the queue is unquiesced via
 * blk_mq_unquiesce_queue().
 */
void blk_mq_quiesce_queue(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;
	bool rcu = false;

	blk_mq_quiesce_queue_nowait(q);

	queue_for_each_hw_ctx(q, hctx, i) {
		if (hctx->flags & BLK_MQ_F_BLOCKING)
			synchronize_srcu(hctx->queue_rq_srcu);
		else
			rcu = true;
	}
	if (rcu)
		synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);

/*
 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
 * @q: request queue.
 *
 * This function restores the queue to the state it was in before
 * blk_mq_quiesce_queue() was called.
 */
void blk_mq_unquiesce_queue(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
	spin_unlock_irqrestore(q->queue_lock, flags);

	/* dispatch requests which are inserted during quiescing */
	blk_mq_run_hw_queues(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);

void blk_mq_wake_waiters(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i)
		if (blk_mq_hw_queue_mapped(hctx))
			blk_mq_tag_wakeup_all(hctx->tags, true);

	/*
	 * If we are called because the queue has now been marked as
	 * dying, we need to ensure that processes currently waiting on
	 * the queue are notified as well.
	 */
	wake_up_all(&q->mq_freeze_wq);
}

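/* Return true if the hardware queue still has driver tags available. */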
bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
{
	return blk_mq_has_free_tags(hctx->tags);
}
EXPORT_SYMBOL(blk_mq_can_queue);

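/*
 * Initialize the preallocated request for @tag: either as a scheduler
 * request (internal tag only) or as a driver-tagged request.
 */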
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
		unsigned int tag, unsigned int op)
{
	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
	struct request *rq = tags->static_rqs[tag];

	rq->rq_flags = 0;

	if (data->flags & BLK_MQ_REQ_INTERNAL) {
		rq->tag = -1;
		rq->internal_tag = tag;
	} else {
		if (blk_mq_tag_busy(data->hctx)) {
			rq->rq_flags = RQF_MQ_INFLIGHT;
			atomic_inc(&data->hctx->nr_active);
		}
		rq->tag = tag;
		rq->internal_tag = -1;
		data->hctx->tags->rqs[rq->tag] = rq;
	}

	INIT_LIST_HEAD(&rq->queuelist);
	/* csd/requeue_work/fifo_time is initialized before use */
	rq->q = data->q;
	rq->mq_ctx = data->ctx;
	rq->cmd_flags = op;
	if (blk_queue_io_stat(data->q))
		rq->rq_flags |= RQF_IO_STAT;
	/* do not touch atomic flags, it needs atomic ops against the timer */
	rq->cpu = -1;
	INIT_HLIST_NODE(&rq->hash);
	RB_CLEAR_NODE(&rq->rb_node);
	rq->rq_disk = NULL;
	rq->part = NULL;
	rq->start_time = jiffies;
#ifdef CONFIG_BLK_CGROUP
	rq->rl = NULL;
	set_start_time_ns(rq);
	rq->io_start_time_ns = 0;
#endif
	rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
	rq->nr_integrity_segments = 0;
#endif
	rq->special = NULL;
	/* tag was already set */
	rq->extra_len = 0;

	INIT_LIST_HEAD(&rq->timeout_list);
	rq->timeout = 0;

	rq->end_io = NULL;
	rq->end_io_data = NULL;
	rq->next_rq = NULL;

	data->ctx->rq_dispatched[op_is_sync(op)]++;
	return rq;
}

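/*
 * Allocate a tag and set up a new request for @op.  With an I/O scheduler
 * attached, a scheduler (internal) tag is used and the scheduler gets a
 * chance to limit the depth and prepare the request.  Returns NULL if no
 * tag could be allocated.
 */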
static struct request *blk_mq_get_request(struct request_queue *q,
		struct bio *bio, unsigned int op,
		struct blk_mq_alloc_data *data)
{
	struct elevator_queue *e = q->elevator;
	struct request *rq;
	unsigned int tag;
	struct blk_mq_ctx *local_ctx = NULL;

	blk_queue_enter_live(q);
	data->q = q;
	if (likely(!data->ctx))
		data->ctx = local_ctx = blk_mq_get_ctx(q);
	if (likely(!data->hctx))
		data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
	if (op & REQ_NOWAIT)
		data->flags |= BLK_MQ_REQ_NOWAIT;

	if (e) {
		data->flags |= BLK_MQ_REQ_INTERNAL;

		/*
		 * Flush requests are special and go directly to the
		 * dispatch list.
		 */
		if (!op_is_flush(op) && e->type->ops.mq.limit_depth)
			e->type->ops.mq.limit_depth(op, data);
	}

	tag = blk_mq_get_tag(data);
	if (tag == BLK_MQ_TAG_FAIL) {
		if (local_ctx) {
			blk_mq_put_ctx(local_ctx);
			data->ctx = NULL;
		}
		blk_queue_exit(q);
		return NULL;
	}

	rq = blk_mq_rq_ctx_init(data, tag, op);
	if (!op_is_flush(op)) {
		rq->elv.icq = NULL;
		if (e && e->type->ops.mq.prepare_request) {
			if (e->type->icq_cache && rq_ioc(bio))
				blk_mq_sched_assign_ioc(rq, bio);

			e->type->ops.mq.prepare_request(rq, bio);
			rq->rq_flags |= RQF_ELVPRIV;
		}
	}
	data->hctx->queued++;
	return rq;
}

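/*
 * Allocate a request outside of the normal bio submission path, e.g. for
 * passthrough or driver-internal commands.  A typical caller might do:
 *
 *	rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
 *	if (IS_ERR(rq))
 *		return PTR_ERR(rq);
 */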
struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
		unsigned int flags)
{
	struct blk_mq_alloc_data alloc_data = { .flags = flags };
	struct request *rq;
	int ret;

	ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
	if (ret)
		return ERR_PTR(ret);

	rq = blk_mq_get_request(q, NULL, op, &alloc_data);
	blk_queue_exit(q);

	if (!rq)
		return ERR_PTR(-EWOULDBLOCK);

	blk_mq_put_ctx(alloc_data.ctx);

	rq->__data_len = 0;
	rq->__sector = (sector_t) -1;
	rq->bio = rq->biotail = NULL;
	return rq;
}
EXPORT_SYMBOL(blk_mq_alloc_request);

struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
		unsigned int op, unsigned int flags, unsigned int hctx_idx)
{
	struct blk_mq_alloc_data alloc_data = { .flags = flags };
	struct request *rq;
	unsigned int cpu;
	int ret;

	/*
	 * If the tag allocator sleeps we could get an allocation for a
	 * different hardware context.  No need to complicate the low level
	 * allocator for this for the rare use case of a command tied to
	 * a specific queue.
	 */
	if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
		return ERR_PTR(-EINVAL);

	if (hctx_idx >= q->nr_hw_queues)
		return ERR_PTR(-EIO);

	ret = blk_queue_enter(q, true);
	if (ret)
		return ERR_PTR(ret);

	/*
	 * Check if the hardware context is actually mapped to anything.
	 * If not tell the caller that it should skip this queue.
	 */
	alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
	if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
		blk_queue_exit(q);
		return ERR_PTR(-EXDEV);
	}
	cpu = cpumask_first(alloc_data.hctx->cpumask);
	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);

	rq = blk_mq_get_request(q, NULL, op, &alloc_data);
	blk_queue_exit(q);

	if (!rq)
		return ERR_PTR(-EWOULDBLOCK);

	return rq;
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);

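/*
 * Release a request: give the I/O scheduler a chance to clean up, drop the
 * driver and scheduler tags, and release the queue reference.
 */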
void blk_mq_free_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
	const int sched_tag = rq->internal_tag;

	if (rq->rq_flags & RQF_ELVPRIV) {
		if (e && e->type->ops.mq.finish_request)
			e->type->ops.mq.finish_request(rq);
		if (rq->elv.icq) {
			put_io_context(rq->elv.icq->ioc);
			rq->elv.icq = NULL;
		}
	}

	ctx->rq_completed[rq_is_sync(rq)]++;
	if (rq->rq_flags & RQF_MQ_INFLIGHT)
		atomic_dec(&hctx->nr_active);

	wbt_done(q->rq_wb, &rq->issue_stat);

	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
	if (rq->tag != -1)
		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
	if (sched_tag != -1)
		blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
	blk_mq_sched_restart(hctx);
	blk_queue_exit(q);
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);

inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
	blk_account_io_done(rq);

	if (rq->end_io) {
		wbt_done(rq->q->rq_wb, &rq->issue_stat);
		rq->end_io(rq, error);
	} else {
		if (unlikely(blk_bidi_rq(rq)))
			blk_mq_free_request(rq->next_rq);
		blk_mq_free_request(rq);
	}
}
EXPORT_SYMBOL(__blk_mq_end_request);

void blk_mq_end_request(struct request *rq, blk_status_t error)
{
	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
		BUG();
	__blk_mq_end_request(rq, error);
}
EXPORT_SYMBOL(blk_mq_end_request);

static void __blk_mq_complete_request_remote(void *data)
{
	struct request *rq = data;

	rq->q->softirq_done_fn(rq);
}

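/*
 * Run the completion handler, either locally or, if QUEUE_FLAG_SAME_COMP
 * is set and the submitting CPU does not share a cache with this one, via
 * an IPI to the submitting CPU.
 */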
static void __blk_mq_complete_request(struct request *rq)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	bool shared = false;
	int cpu;

	if (rq->internal_tag != -1)
		blk_mq_sched_completed_request(rq);
	if (rq->rq_flags & RQF_STATS) {
		blk_mq_poll_stats_start(rq->q);
		blk_stat_add(rq);
	}

	if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
		rq->q->softirq_done_fn(rq);
		return;
	}

	cpu = get_cpu();
	if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
		shared = cpus_share_cache(cpu, ctx->cpu);

	if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
		rq->csd.func = __blk_mq_complete_request_remote;
		rq->csd.info = rq;
		rq->csd.flags = 0;
		smp_call_function_single_async(ctx->cpu, &rq->csd);
	} else {
		rq->q->softirq_done_fn(rq);
	}
	put_cpu();
}

/**
 * blk_mq_complete_request - end I/O on a request
 * @rq:		the request being processed
 *
 * Description:
 *	Ends all I/O on a request. It does not handle partial completions.
 *	The actual completion happens out-of-order, through an IPI handler.
 **/
void blk_mq_complete_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	if (unlikely(blk_should_fake_timeout(q)))
		return;
	if (!blk_mark_rq_complete(rq))
		__blk_mq_complete_request(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);

int blk_mq_request_started(struct request *rq)
{
	return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
}
EXPORT_SYMBOL_GPL(blk_mq_request_started);

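/*
 * Mark @rq as started: hand it to the I/O scheduler's accounting, start
 * stats/wbt tracking if enabled, and arm the timeout timer.
 */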
void blk_mq_start_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	blk_mq_sched_started_request(rq);

	trace_block_rq_issue(q, rq);

	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
		blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq));
		rq->rq_flags |= RQF_STATS;
		wbt_issue(q->rq_wb, &rq->issue_stat);
	}

	blk_add_timer(rq);

	/*
	 * Ensure that ->deadline is visible before we set the started
	 * flag and clear the completed flag.
	 */
	smp_mb__before_atomic();

	/*
	 * Mark us as started and clear complete. Complete might have been
	 * set if requeue raced with timeout, which then marked it as
	 * complete. So be sure to clear complete again when we start
	 * the request, otherwise we'll ignore the completion event.
	 */
	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
		set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
	if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
		clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);

	if (q->dma_drain_size && blk_rq_bytes(rq)) {
		/*
		 * Make sure space for the drain appears.  We know we can do
		 * this because max_hw_segments has been adjusted to be one
		 * fewer than the device can handle.
		 */
		rq->nr_phys_segments++;
	}
}
EXPORT_SYMBOL(blk_mq_start_request);

/*
 * When we reach here because the queue is busy, the REQ_ATOM_COMPLETE
 * flag isn't set yet, so there may be a race with the timeout handler.
 * But given that rq->deadline has just been set in .queue_rq(), the
 * race won't happen in practice, because rq->timeout should be large
 * enough to cover the window between blk_mq_start_request() being
 * called from .queue_rq() and REQ_ATOM_STARTED being cleared here.
 */
static void __blk_mq_requeue_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	trace_block_rq_requeue(q, rq);
	wbt_requeue(q->rq_wb, &rq->issue_stat);

	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
		if (q->dma_drain_size && blk_rq_bytes(rq))
			rq->nr_phys_segments--;
	}
}

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
	__blk_mq_requeue_request(rq);

	/* this request will be re-inserted to io scheduler queue */
	blk_mq_sched_requeue_request(rq);

	BUG_ON(blk_queued_rq(rq));
	blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
}
EXPORT_SYMBOL(blk_mq_requeue_request);

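/*
 * Requeue worker: re-insert requests from the requeue list, putting
 * RQF_SOFTBARRIER requests at the head of the scheduler queue and the
 * rest at the tail, then kick the hardware queues.
 */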
static void blk_mq_requeue_work(struct work_struct *work)
{
	struct request_queue *q =
		container_of(work, struct request_queue, requeue_work.work);
	LIST_HEAD(rq_list);
	struct request *rq, *next;

	spin_lock_irq(&q->requeue_lock);
	list_splice_init(&q->requeue_list, &rq_list);
	spin_unlock_irq(&q->requeue_lock);

	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
		if (!(rq->rq_flags & RQF_SOFTBARRIER))
			continue;

		rq->rq_flags &= ~RQF_SOFTBARRIER;
		list_del_init(&rq->queuelist);
		blk_mq_sched_insert_request(rq, true, false, false, true);
	}

	while (!list_empty(&rq_list)) {
		rq = list_entry(rq_list.next, struct request, queuelist);
		list_del_init(&rq->queuelist);
		blk_mq_sched_insert_request(rq, false, false, false, true);
	}

	blk_mq_run_hw_queues(q, false);
}

void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
				bool kick_requeue_list)
{
	struct request_queue *q = rq->q;
	unsigned long flags;

	/*
	 * We abuse this flag that is otherwise used by the I/O scheduler to
	 * request head insertion from the workqueue.
	 */
	BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);

	spin_lock_irqsave(&q->requeue_lock, flags);
	if (at_head) {
		rq->rq_flags |= RQF_SOFTBARRIER;
		list_add(&rq->queuelist, &q->requeue_list);
	} else {
		list_add_tail(&rq->queuelist, &q->requeue_list);
	}
	spin_unlock_irqrestore(&q->requeue_lock, flags);

	if (kick_requeue_list)
		blk_mq_kick_requeue_list(q);
}
EXPORT_SYMBOL(blk_mq_add_to_requeue_list);

void blk_mq_kick_requeue_list(struct request_queue *q)
{
	kblockd_schedule_delayed_work(&q->requeue_work, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);

void blk_mq_delay_kick_requeue_list(struct request_queue *q,
				    unsigned long msecs)
{
	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
				    msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);

struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
	if (tag < tags->nr_tags) {
		prefetch(tags->rqs[tag]);
		return tags->rqs[tag];
	}

	return NULL;
}
EXPORT_SYMBOL(blk_mq_tag_to_rq);

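/*
 * Book-keeping for the timeout scan: the nearest pending deadline seen so
 * far and whether one was recorded.
 */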
struct blk_mq_timeout_data {
	unsigned long next;
	unsigned int next_set;
};

void blk_mq_rq_timed_out(struct request *req, bool reserved)
{
	const struct blk_mq_ops *ops = req->q->mq_ops;
	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;

	/*
	 * We know that complete is set at this point. If STARTED isn't set
	 * anymore, then the request isn't active and the "timeout" should
	 * just be ignored. This can happen due to the bitflag ordering.
	 * Timeout first checks if STARTED is set, and if it is, assumes
	 * the request is active. But if we race with completion, then
	 * both flags will get cleared. So check here again, and ignore
	 * a timeout event with a request that isn't active.
	 */
	if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
		return;

	if (ops->timeout)
		ret = ops->timeout(req, reserved);

	switch (ret) {
	case BLK_EH_HANDLED:
		__blk_mq_complete_request(req);
		break;
	case BLK_EH_RESET_TIMER:
		blk_add_timer(req);
		blk_clear_rq_complete(req);
		break;
	case BLK_EH_NOT_HANDLED:
		break;
	default:
		printk(KERN_ERR "block: bad eh return: %d\n", ret);
		break;
	}
}

static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
		struct request *rq, void *priv, bool reserved)
{
	struct blk_mq_timeout_data *data = priv;

	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
		return;

	/*
	 * The rq being checked may already have been freed and reallocated
	 * here; we avoid this race by checking rq->deadline and the
	 * REQ_ATOM_COMPLETE flag together:
	 *
	 * - if rq->deadline is observed as the new value because of
	 *   reuse, the rq won't be timed out because of timing.
	 * - if rq->deadline is observed as the previous value, the
	 *   REQ_ATOM_COMPLETE flag won't be cleared in the reuse path
	 *   because we put a barrier between setting rq->deadline
	 *   and clearing the flag in blk_mq_start_request(), so
	 *   this rq won't be timed out either.
	 */
	if (time_after_eq(jiffies, rq->deadline)) {
		if (!blk_mark_rq_complete(rq))
			blk_mq_rq_timed_out(rq, reserved);
	} else if (!data->next_set || time_after(data->next, rq->deadline)) {
		data->next = rq->deadline;
		data->next_set = 1;
	}
}

static void blk_mq_timeout_work(struct work_struct *work)
{
	struct request_queue *q =
		container_of(work, struct request_queue, timeout_work);
	struct blk_mq_timeout_data data = {
		.next		= 0,
		.next_set	= 0,
	};
	int i;

	/* A deadlock might occur if a request is stuck requiring a
	 * timeout at the same time a queue freeze is waiting for
	 * completion, since the timeout code would not be able to
	 * acquire the queue reference here.
	 *
	 * That's why we don't use blk_queue_enter here; instead, we use
	 * percpu_ref_tryget directly, because we need to be able to
	 * obtain a reference even in the short window between the queue
	 * starting to freeze, by dropping the first reference in
	 * blk_freeze_queue_start, and the moment the last request is
	 * consumed, marked by the instant q_usage_counter reaches
	 * zero.
	 */
	if (!percpu_ref_tryget(&q->q_usage_counter))
		return;

	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);

	if (data.next_set) {
		data.next = blk_rq_timeout(round_jiffies_up(data.next));
		mod_timer(&q->timeout, data.next);
	} else {
		struct blk_mq_hw_ctx *hctx;

		queue_for_each_hw_ctx(q, hctx, i) {
			/* the hctx may be unmapped, so check it here */
			if (blk_mq_hw_queue_mapped(hctx))
				blk_mq_tag_idle(hctx);
		}
	}
	blk_queue_exit(q);
}

struct flush_busy_ctx_data {
	struct blk_mq_hw_ctx *hctx;
	struct list_head *list;
};

static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
{
	struct flush_busy_ctx_data *flush_data = data;
	struct blk_mq_hw_ctx *hctx = flush_data->hctx;
	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];

	sbitmap_clear_bit(sb, bitnr);
	spin_lock(&ctx->lock);
	list_splice_tail_init(&ctx->rq_list, flush_data->list);
	spin_unlock(&ctx->lock);
	return true;
}

/*
 * Process software queues that have been marked busy, splicing them
 * to the for-dispatch list
 */
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
	struct flush_busy_ctx_data data = {
		.hctx = hctx,
		.list = list,
	};

	sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
}
EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);

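/*
 * Map a dispatch batch size to its histogram slot: power-of-two buckets,
 * capped at BLK_MQ_MAX_DISPATCH_ORDER - 1.
 */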
static inline unsigned int queued_to_index(unsigned int queued)
{
	if (!queued)
		return 0;

	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}

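/*
 * Allocate a driver tag for @rq if it does not already have one,
 * optionally sleeping until a tag becomes available when @wait is true.
 */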
bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
			   bool wait)
{
	struct blk_mq_alloc_data data = {
		.q = rq->q,
		.hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
		.flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
	};

	might_sleep_if(wait);

	if (rq->tag != -1)
		goto done;

	if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
		data.flags |= BLK_MQ_REQ_RESERVED;

	rq->tag = blk_mq_get_tag(&data);
	if (rq->tag >= 0) {
		if (blk_mq_tag_busy(data.hctx)) {
			rq->rq_flags |= RQF_MQ_INFLIGHT;
			atomic_inc(&data.hctx->nr_active);
		}
		data.hctx->tags->rqs[rq->tag] = rq;
	}

done:
	if (hctx)
		*hctx = data.hctx;
	return rq->tag != -1;
}

static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
				    struct request *rq)
{
	blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
	rq->tag = -1;

	if (rq->rq_flags & RQF_MQ_INFLIGHT) {
		rq->rq_flags &= ~RQF_MQ_INFLIGHT;
		atomic_dec(&hctx->nr_active);
	}
}

static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
				       struct request *rq)
{
	if (rq->tag == -1 || rq->internal_tag == -1)
		return;

	__blk_mq_put_driver_tag(hctx, rq);
}

static void blk_mq_put_driver_tag(struct request *rq)
{
	struct blk_mq_hw_ctx *hctx;

	if (rq->tag == -1 || rq->internal_tag == -1)
		return;

	hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
	__blk_mq_put_driver_tag(hctx, rq);
}

/*
 * If we fail getting a driver tag because all the driver tags are already
 * assigned and on the dispatch list, BUT the first entry does not have a
 * tag, then we could deadlock. For that case, move entries with assigned
 * driver tags to the front, leaving the set of tagged requests in the
 * same order, and the untagged set in the same order.
 */
static bool reorder_tags_to_front(struct list_head *list)
{
	struct request *rq, *tmp, *first = NULL;

	list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) {
		if (rq == first)
			break;
		if (rq->tag != -1) {
			list_move(&rq->queuelist, list);
			if (!first)
				first = rq;
		}
	}

	return first != NULL;
}

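/*
 * Wait-queue callback, invoked when a driver tag is freed: clear
 * BLK_MQ_S_TAG_WAITING and re-run the hardware queue.
 */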
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
				void *key)
{
	struct blk_mq_hw_ctx *hctx;

	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);

	list_del(&wait->entry);
	clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
	blk_mq_run_hw_queue(hctx, true);
	return 1;
}

static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
{
	struct sbq_wait_state *ws;

	/*
	 * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
	 * The thread which wins the race to grab this bit adds the hardware
	 * queue to the wait queue.
	 */
	if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
	    test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
		return false;

	init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
	ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);

	/*
	 * As soon as this returns, it's no longer safe to fiddle with
	 * hctx->dispatch_wait, since a completion can wake up the wait queue
	 * and unlock the bit.
	 */
	add_wait_queue(&ws->wait, &hctx->dispatch_wait);
	return true;
}

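/*
 * Dispatch the requests on @list to the driver, stopping early if the
 * driver returns BLK_STS_RESOURCE; anything left over is put back on
 * hctx->dispatch for a later run.
 */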
bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
{
	struct blk_mq_hw_ctx *hctx;
	struct request *rq;
	int errors, queued;

	if (list_empty(list))
		return false;

	/*
	 * Now process all the entries, sending them to the driver.
	 */
	errors = queued = 0;
	do {
		struct blk_mq_queue_data bd;
		blk_status_t ret;

		rq = list_first_entry(list, struct request, queuelist);
		if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
			if (!queued && reorder_tags_to_front(list))
				continue;

			/*
			 * The initial allocation attempt failed, so we need to
			 * rerun the hardware queue when a tag is freed.
			 */
			if (!blk_mq_dispatch_wait_add(hctx))
				break;

			/*
			 * It's possible that a tag was freed in the window
			 * between the allocation failure and adding the
			 * hardware queue to the wait queue.
			 */
			if (!blk_mq_get_driver_tag(rq, &hctx, false))
				break;
		}

		list_del_init(&rq->queuelist);

		bd.rq = rq;

		/*
		 * Flag last if we have no more requests, or if we have more
		 * but can't assign a driver tag to it.
		 */
		if (list_empty(list))
			bd.last = true;
		else {
			struct request *nxt;

			nxt = list_first_entry(list, struct request, queuelist);
			bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
		}

		ret = q->mq_ops->queue_rq(hctx, &bd);
		if (ret == BLK_STS_RESOURCE) {
			blk_mq_put_driver_tag_hctx(hctx, rq);
			list_add(&rq->queuelist, list);
			__blk_mq_requeue_request(rq);
			break;
		}

		if (unlikely(ret != BLK_STS_OK)) {
			errors++;
			blk_mq_end_request(rq, BLK_STS_IOERR);
			continue;
		}

		queued++;
	} while (!list_empty(list));

	hctx->dispatched[queued_to_index(queued)]++;

	/*
	 * Any items that need requeuing? Stuff them into hctx->dispatch,
	 * that is where we will continue on the next queue run.
	 */
	if (!list_empty(list)) {
		/*
		 * If an I/O scheduler has been configured and we got a driver
		 * tag for the next request already, free it again.
		 */
		rq = list_first_entry(list, struct request, queuelist);
		blk_mq_put_driver_tag(rq);

		spin_lock(&hctx->lock);
		list_splice_init(list, &hctx->dispatch);
		spin_unlock(&hctx->lock);

		/*
		 * If SCHED_RESTART was set by the caller of this function and
		 * it is no longer set, that means it was cleared by another
		 * thread and hence that a queue rerun is needed.
		 *
		 * If TAG_WAITING is set that means that an I/O scheduler has
		 * been configured and another thread is waiting for a driver
		 * tag. To guarantee fairness, do not rerun this hardware queue
		 * but let the other thread grab the driver tag.
		 *