/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
 *	-  July 2000
 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - May 2001
 */

/*
 * This handles all read/write requests to block devices
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/fault-inject.h>
#include <linux/list_sort.h>
#include <linux/delay.h>
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
#include <linux/blk-cgroup.h>
#include <linux/debugfs.h>

#define CREATE_TRACE_POINTS
#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-wbt.h"

#ifdef CONFIG_DEBUG_FS
struct dentry *blk_debugfs_root;
#endif

EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);

DEFINE_IDA(blk_queue_ida);

/*
 * For the allocated request tables
 */
struct kmem_cache *request_cachep;

/*
 * For queue allocation
 */
struct kmem_cache *blk_requestq_cachep;

/*
 * Controlling structure to kblockd
 */
static struct workqueue_struct *kblockd_workqueue;

static void blk_clear_congested(struct request_list *rl, int sync)
{
#ifdef CONFIG_CGROUP_WRITEBACK
	clear_wb_congested(rl->blkg->wb_congested, sync);
#else
	/*
	 * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't
	 * flip its congestion state for events on other blkcgs.
	 */
	if (rl == &rl->q->root_rl)
		clear_wb_congested(rl->q->backing_dev_info->wb.congested, sync);
#endif
}

static void blk_set_congested(struct request_list *rl, int sync)
{
#ifdef CONFIG_CGROUP_WRITEBACK
	set_wb_congested(rl->blkg->wb_congested, sync);
#else
	/* see blk_clear_congested() */
	if (rl == &rl->q->root_rl)
		set_wb_congested(rl->q->backing_dev_info->wb.congested, sync);
#endif
}

void blk_queue_congestion_threshold(struct request_queue *q)
{
	int nr;

	nr = q->nr_requests - (q->nr_requests / 8) + 1;
	if (nr > q->nr_requests)
		nr = q->nr_requests;
	q->nr_congestion_on = nr;

	nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
	if (nr < 1)
		nr = 1;
	q->nr_congestion_off = nr;
}
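
/*
 * Worked example (editor's note, not in the original source): with the
 * usual default of q->nr_requests == 128 the calculation above gives
 *
 *	nr_congestion_on  = 128 - 128/8 + 1          = 113
 *	nr_congestion_off = 128 - 128/8 - 128/16 - 1 = 103
 *
 * so a request list is reported congested once roughly 113 requests are
 * allocated and the congested state is cleared again below 103.
 */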

void blk_rq_init(struct request_queue *q, struct request *rq)
{
	memset(rq, 0, sizeof(*rq));

	INIT_LIST_HEAD(&rq->queuelist);
	INIT_LIST_HEAD(&rq->timeout_list);
	rq->cpu = -1;
	rq->q = q;
	rq->__sector = (sector_t) -1;
	INIT_HLIST_NODE(&rq->hash);
	RB_CLEAR_NODE(&rq->rb_node);
	rq->tag = -1;
	rq->internal_tag = -1;
	rq->start_time = jiffies;
	set_start_time_ns(rq);
	rq->part = NULL;
}
EXPORT_SYMBOL(blk_rq_init);

static const struct {
	int		errno;
	const char	*name;
} blk_errors[] = {
	[BLK_STS_OK]		= { 0,		"" },
	[BLK_STS_NOTSUPP]	= { -EOPNOTSUPP, "operation not supported" },
	[BLK_STS_TIMEOUT]	= { -ETIMEDOUT,	"timeout" },
	[BLK_STS_NOSPC]		= { -ENOSPC,	"critical space allocation" },
	[BLK_STS_TRANSPORT]	= { -ENOLINK,	"recoverable transport" },
	[BLK_STS_TARGET]	= { -EREMOTEIO,	"critical target" },
	[BLK_STS_NEXUS]		= { -EBADE,	"critical nexus" },
	[BLK_STS_MEDIUM]	= { -ENODATA,	"critical medium" },
	[BLK_STS_PROTECTION]	= { -EILSEQ,	"protection" },
	[BLK_STS_RESOURCE]	= { -ENOMEM,	"kernel resource" },
	[BLK_STS_AGAIN]		= { -EAGAIN,	"nonblocking retry" },

	/* device mapper special case, should not leak out: */
	[BLK_STS_DM_REQUEUE]	= { -EREMCHG, "dm internal retry" },

	/* everything else not covered above: */
	[BLK_STS_IOERR]		= { -EIO,	"I/O" },
};

blk_status_t errno_to_blk_status(int errno)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(blk_errors); i++) {
		if (blk_errors[i].errno == errno)
			return (__force blk_status_t)i;
	}

	return BLK_STS_IOERR;
}
EXPORT_SYMBOL_GPL(errno_to_blk_status);

int blk_status_to_errno(blk_status_t status)
{
	int idx = (__force int)status;

	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
		return -EIO;
	return blk_errors[idx].errno;
}
EXPORT_SYMBOL_GPL(blk_status_to_errno);
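
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a completion path that only has a plain errno in hand can use the
 * helpers above before handing the result back to the block layer.  The
 * names "my_complete_rq" and "hw_errno" are hypothetical, and a blk-mq
 * driver is assumed:
 *
 *	static void my_complete_rq(struct request *rq, int hw_errno)
 *	{
 *		blk_status_t status = errno_to_blk_status(hw_errno);
 *
 *		blk_mq_end_request(rq, status);
 *	}
 *
 * Any errno without an entry in blk_errors[] collapses to BLK_STS_IOERR,
 * and blk_status_to_errno() performs the reverse mapping.
 */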

static void print_req_error(struct request *req, blk_status_t status)
{
	int idx = (__force int)status;

	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
		return;

	printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n",
			   __func__, blk_errors[idx].name, req->rq_disk ?
			   req->rq_disk->disk_name : "?",
			   (unsigned long long)blk_rq_pos(req));
}

static void req_bio_endio(struct request *rq, struct bio *bio,
			  unsigned int nbytes, blk_status_t error)
{
	if (error)
		bio->bi_status = error;

	if (unlikely(rq->rq_flags & RQF_QUIET))
		bio_set_flag(bio, BIO_QUIET);

	bio_advance(bio, nbytes);

	/* don't actually finish bio if it's part of flush sequence */
	if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
		bio_endio(bio);
}

void blk_dump_rq_flags(struct request *rq, char *msg)
{
	printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
		rq->rq_disk ? rq->rq_disk->disk_name : "?",
		(unsigned long long) rq->cmd_flags);

	printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
	       (unsigned long long)blk_rq_pos(rq),
	       blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
	printk(KERN_INFO "  bio %p, biotail %p, len %u\n",
	       rq->bio, rq->biotail, blk_rq_bytes(rq));
}
EXPORT_SYMBOL(blk_dump_rq_flags);

static void blk_delay_work(struct work_struct *work)
{
	struct request_queue *q;

	q = container_of(work, struct request_queue, delay_work.work);
	spin_lock_irq(q->queue_lock);
	__blk_run_queue(q);
	spin_unlock_irq(q->queue_lock);
}

/**
 * blk_delay_queue - restart queueing after defined interval
 * @q:		The &struct request_queue in question
 * @msecs:	Delay in msecs
 *
 * Description:
 *   Sometimes queueing needs to be postponed for a little while, to allow
 *   resources to come back. This function will make sure that queueing is
 *   restarted around the specified time.
 */
void blk_delay_queue(struct request_queue *q, unsigned long msecs)
{
	lockdep_assert_held(q->queue_lock);
	WARN_ON_ONCE(q->mq_ops);

	if (likely(!blk_queue_dead(q)))
		queue_delayed_work(kblockd_workqueue, &q->delay_work,
				   msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_delay_queue);
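
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a legacy request_fn that runs out of device resources can park the
 * current request and ask for the queue to be re-run a few milliseconds
 * later.  "my_request_fn", "my_hw_busy" and "my_issue" are hypothetical:
 *
 *	static void my_request_fn(struct request_queue *q)
 *	{
 *		struct request *rq;
 *
 *		while ((rq = blk_fetch_request(q)) != NULL) {
 *			if (my_hw_busy()) {
 *				blk_requeue_request(q, rq);
 *				blk_delay_queue(q, 3);
 *				return;
 *			}
 *			my_issue(rq);
 *		}
 *	}
 *
 * request_fn is entered with q->queue_lock held, which satisfies the
 * lockdep assertion in blk_delay_queue().
 */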

/**
 * blk_start_queue_async - asynchronously restart a previously stopped queue
 * @q:    The &struct request_queue in question
 *
 * Description:
 *   blk_start_queue_async() will clear the stop flag on the queue, and
 *   ensure that the request_fn for the queue is run from an async
 *   context.
 **/
void blk_start_queue_async(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);
	WARN_ON_ONCE(q->mq_ops);

	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
	blk_run_queue_async(q);
}
EXPORT_SYMBOL(blk_start_queue_async);

/**
 * blk_start_queue - restart a previously stopped queue
 * @q:    The &struct request_queue in question
 *
 * Description:
 *   blk_start_queue() will clear the stop flag on the queue, and call
 *   the request_fn for the queue if it was in a stopped state when
 *   entered. Also see blk_stop_queue().
 **/
void blk_start_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);
	WARN_ON(!in_interrupt() && !irqs_disabled());
	WARN_ON_ONCE(q->mq_ops);

	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
	__blk_run_queue(q);
}
EXPORT_SYMBOL(blk_start_queue);

/**
 * blk_stop_queue - stop a queue
 * @q:    The &struct request_queue in question
 *
 * Description:
 *   The Linux block layer assumes that a block driver will consume all
 *   entries on the request queue when the request_fn strategy is called.
 *   Often this will not happen, because of hardware limitations (queue
 *   depth settings). If a device driver gets a 'queue full' response,
 *   or if it simply chooses not to queue more I/O at one point, it can
 *   call this function to prevent the request_fn from being called until
 *   the driver has signalled it's ready to go again. This happens by calling
 *   blk_start_queue() to restart queue operations.
 **/
void blk_stop_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);
	WARN_ON_ONCE(q->mq_ops);

	cancel_delayed_work(&q->delay_work);
	queue_flag_set(QUEUE_FLAG_STOPPED, q);
}
EXPORT_SYMBOL(blk_stop_queue);
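
/*
 * Illustrative sketch (editor's addition, not part of the original file)
 * of the flow-control pattern described above; the "my_*" names are
 * hypothetical and both paths are assumed to hold q->queue_lock:
 *
 *	In the request_fn, when the hardware reports it is full:
 *
 *		blk_requeue_request(q, rq);
 *		blk_stop_queue(q);
 *		return;
 *
 *	In the completion interrupt, once room is available again:
 *
 *		blk_start_queue(q);
 */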

/**
 * blk_sync_queue - cancel any pending callbacks on a queue
 * @q: the queue
 *
 * Description:
 *     The block layer may perform asynchronous callback activity
 *     on a queue, such as calling the unplug function after a timeout.
 *     A block device may call blk_sync_queue to ensure that any
 *     such activity is cancelled, thus allowing it to release resources
 *     that the callbacks might use. The caller must already have made sure
 *     that its ->make_request_fn will not re-add plugging prior to calling
 *     this function.
 *
 *     This function does not cancel any asynchronous activity arising
 *     out of elevator or throttling code. That would require elevator_exit()
 *     and blkcg_exit_queue() to be called with queue lock initialized.
 *
 */
void blk_sync_queue(struct request_queue *q)
{
	del_timer_sync(&q->timeout);
	cancel_work_sync(&q->timeout_work);

	if (q->mq_ops) {
		struct blk_mq_hw_ctx *hctx;
		int i;

		cancel_delayed_work_sync(&q->requeue_work);
		queue_for_each_hw_ctx(q, hctx, i)
			cancel_delayed_work_sync(&hctx->run_work);
	} else {
		cancel_delayed_work_sync(&q->delay_work);
	}
}
EXPORT_SYMBOL(blk_sync_queue);

/**
 * __blk_run_queue_uncond - run a queue whether or not it has been stopped
 * @q:	The queue to run
 *
 * Description:
 *    Invoke request handling on a queue if there are any pending requests.
 *    May be used to restart request handling after a request has completed.
 *    This variant runs the queue whether or not the queue has been
 *    stopped. Must be called with the queue lock held and interrupts
 *    disabled. See also @blk_run_queue.
 */
inline void __blk_run_queue_uncond(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);
	WARN_ON_ONCE(q->mq_ops);

	if (unlikely(blk_queue_dead(q)))
		return;

	/*
	 * Some request_fn implementations, e.g. scsi_request_fn(), unlock
	 * the queue lock internally. As a result multiple threads may be
	 * running such a request function concurrently. Keep track of the
	 * number of active request_fn invocations such that blk_drain_queue()
	 * can wait until all these request_fn calls have finished.
	 */
	q->request_fn_active++;
	q->request_fn(q);
	q->request_fn_active--;
}
EXPORT_SYMBOL_GPL(__blk_run_queue_uncond);

/**
 * __blk_run_queue - run a single device queue
 * @q:	The queue to run
 *
 * Description:
 *    See @blk_run_queue.
 */
void __blk_run_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);
	WARN_ON_ONCE(q->mq_ops);

	if (unlikely(blk_queue_stopped(q)))
		return;

	__blk_run_queue_uncond(q);
}
EXPORT_SYMBOL(__blk_run_queue);

/**
 * blk_run_queue_async - run a single device queue in workqueue context
 * @q:	The queue to run
 *
 * Description:
 *    Tells kblockd to perform the equivalent of @blk_run_queue on behalf
 *    of us.
 *
 * Note:
 *    Since it is not allowed to run q->delay_work after blk_cleanup_queue()
 *    has canceled q->delay_work, callers must hold the queue lock to avoid
 *    race conditions between blk_cleanup_queue() and blk_run_queue_async().
 */
void blk_run_queue_async(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);
	WARN_ON_ONCE(q->mq_ops);

	if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q)))
		mod_delayed_work(kblockd_workqueue, &q->delay_work, 0);
}
EXPORT_SYMBOL(blk_run_queue_async);
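
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a path that frees a resource while already holding q->queue_lock, but
 * which must not recurse into the request_fn directly, can let kblockd
 * run the queue instead ("my_free_slot" is hypothetical):
 *
 *	spin_lock_irqsave(q->queue_lock, flags);
 *	my_free_slot(dev);
 *	blk_run_queue_async(q);
 *	spin_unlock_irqrestore(q->queue_lock, flags);
 */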

/**
 * blk_run_queue - run a single device queue
 * @q: The queue to run
 *
 * Description:
 *    Invoke request handling on this queue, if it has pending work to do.
 *    May be used to restart queueing when a request has completed.
 */
void blk_run_queue(struct request_queue *q)
{
	unsigned long flags;

	WARN_ON_ONCE(q->mq_ops);

	spin_lock_irqsave(q->queue_lock, flags);
	__blk_run_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(blk_run_queue);

void blk_put_queue(struct request_queue *q)
{
	kobject_put(&q->kobj);
}
EXPORT_SYMBOL(blk_put_queue);

/**
 * __blk_drain_queue - drain requests from request_queue
 * @q: queue to drain
 * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV
 *
 * Drain requests from @q.  If @drain_all is set, all requests are drained.
 * If not, only ELVPRIV requests are drained.  The caller is responsible
 * for ensuring that no new requests which need to be drained are queued.
 */
static void __blk_drain_queue(struct request_queue *q, bool drain_all)
	__releases(q->queue_lock)
	__acquires(q->queue_lock)
{
	int i;

	lockdep_assert_held(q->queue_lock);
	WARN_ON_ONCE(q->mq_ops);

	while (true) {
		bool drain = false;

		/*
		 * The caller might be trying to drain @q before its
		 * elevator is initialized.
		 */
		if (q->elevator)
			elv_drain_elevator(q);

		blkcg_drain_queue(q);

		/*
		 * This function might be called on a queue which failed
		 * driver init after queue creation or is not yet fully
		 * active.  Some drivers (e.g. fd and loop) get unhappy
		 * in such cases.  Kick queue iff dispatch queue has
		 * something on it and @q has request_fn set.
		 */
		if (!list_empty(&q->queue_head) && q->request_fn)
			__blk_run_queue(q);

		drain |= q->nr_rqs_elvpriv;
		drain |= q->request_fn_active;

		/*
		 * Unfortunately, requests are queued at and tracked from
		 * multiple places and there's no single counter which can
		 * be drained.  Check all the queues and counters.
		 */
		if (drain_all) {
			struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
			drain |= !list_empty(&q->queue_head);
			for (i = 0; i < 2; i++) {
				drain |= q->nr_rqs[i];
				drain |= q->in_flight[i];
				if (fq)
					drain |= !list_empty(&fq->flush_queue[i]);
			}
		}

		if (!drain)
			break;

		spin_unlock_irq(q->queue_lock);

		msleep(10);

		spin_lock_irq(q->queue_lock);
	}

	/*
	 * With queue marked dead, any woken up waiter will fail the
	 * allocation path, so the wakeup chaining is lost and we're
	 * left with hung waiters. We need to wake up those waiters.
	 */
	if (q->request_fn) {
		struct request_list *rl;

		blk_queue_for_each_rl(rl, q)
			for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
				wake_up_all(&rl->wait[i]);
	}
}

void blk_drain_queue(struct request_queue *q)
{
	spin_lock_irq(q->queue_lock);
	__blk_drain_queue(q, true);
	spin_unlock_irq(q->queue_lock);
}

/**
 * blk_queue_bypass_start - enter queue bypass mode
 * @q: queue of interest
 *
 * In bypass mode, only the dispatch FIFO queue of @q is used.  This
 * function makes @q enter bypass mode and drains all requests which were
 * throttled or issued before.  On return, it's guaranteed that no request
 * is being throttled or has ELVPRIV set, and that blk_queue_bypass()
 * returns %true under the queue lock or an RCU read lock.
 */
void blk_queue_bypass_start(struct request_queue *q)
{
	WARN_ON_ONCE(q->mq_ops);

	spin_lock_irq(q->queue_lock);
	q->bypass_depth++;
	queue_flag_set(QUEUE_FLAG_BYPASS, q);
	spin_unlock_irq(q->queue_lock);

	/*
	 * Queues start drained.  Skip actual draining till init is
	 * complete.  This avoids lengthy delays during queue init which
	 * can happen many times during boot.
	 */
	if (blk_queue_init_done(q)) {
		spin_lock_irq(q->queue_lock);
		__blk_drain_queue(q, false);
		spin_unlock_irq(q->queue_lock);

		/* ensure blk_queue_bypass() is %true inside RCU read lock */
		synchronize_rcu();
	}
}
EXPORT_SYMBOL_GPL(blk_queue_bypass_start);

/**
 * blk_queue_bypass_end - leave queue bypass mode
 * @q: queue of interest
 *
 * Leave bypass mode and restore the normal queueing behavior.
 *
 * Note: although blk_queue_bypass_start() is only called for blk-sq queues,
 * this function is called for both blk-sq and blk-mq queues.
 */
void blk_queue_bypass_end(struct request_queue *q)
{
	spin_lock_irq(q->queue_lock);
	if (!--q->bypass_depth)
		queue_flag_clear(QUEUE_FLAG_BYPASS, q);
	WARN_ON_ONCE(q->bypass_depth < 0);
	spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blk_queue_bypass_end);
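
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * bypass mode is typically used to bracket work that tears down or
 * replaces elevator/blkcg state, roughly:
 *
 *	blk_queue_bypass_start(q);	drains ELVPRIV/throttled requests
 *	... switch the elevator or update blkcg policy data ...
 *	blk_queue_bypass_end(q);
 */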

void blk_set_queue_dying(struct request_queue *q)
{
	spin_lock_irq(q->queue_lock);
	queue_flag_set(QUEUE_FLAG_DYING, q);
	spin_unlock_irq(q->queue_lock);

	/*
	 * When the queue DYING flag is set, we need to block new requests
	 * from entering the queue, so call blk_freeze_queue_start() to
	 * prevent I/O from crossing blk_queue_enter().
	 */
	blk_freeze_queue_start(q);

	if (q->mq_ops)
		blk_mq_wake_waiters(q);
	else {
		struct request_list *rl;

		spin_lock_irq(q->queue_lock);
		blk_queue_for_each_rl(rl, q) {
			if (rl->rq_pool) {
				wake_up_all(&rl->wait[BLK_RW_SYNC]);
				wake_up_all(&rl->wait[BLK_RW_ASYNC]);
			}
		}
		spin_unlock_irq(q->queue_lock);
	}
}
EXPORT_SYMBOL_GPL(blk_set_queue_dying);

/**
 * blk_cleanup_queue - shutdown a request queue
 * @q: request queue to shutdown
 *
 * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
 * put it.  All future requests will be failed immediately with -ENODEV.
 */
void blk_cleanup_queue(struct request_queue *q)
{
	spinlock_t *lock = q->queue_lock;

	/* mark @q DYING, no new requests or merges will be allowed afterwards */
	mutex_lock(&q->sysfs_lock);
	blk_set_queue_dying(q);
	spin_lock_irq(lock);

	/*
	 * A dying queue is permanently in bypass mode till released.  Note
	 * that, unlike blk_queue_bypass_start(), we aren't performing
	 * synchronize_rcu() after entering bypass mode to avoid the delay
	 * as some drivers create and destroy a lot of queues while
	 * probing.  This is still safe because blk_release_queue() will be
	 * called only after the queue refcnt drops to zero and nothing,
	 * RCU or not, would be traversing the queue by then.
	 */
	q->bypass_depth++;
	queue_flag_set(QUEUE_FLAG_BYPASS, q);

	queue_flag_set(QUEUE_FLAG_NOMERGES, q);
	queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
	queue_flag_set(QUEUE_FLAG_DYING, q);
	spin_unlock_irq(lock);
	mutex_unlock(&q->sysfs_lock);

	/*
	 * Drain all requests queued before DYING marking. Set DEAD flag to
	 * prevent q->request_fn() from being invoked after draining has finished.
	 */
	blk_freeze_queue(q);
	spin_lock_irq(lock);
	queue_flag_set(QUEUE_FLAG_DEAD, q);
	spin_unlock_irq(lock);

	/*
	 * make sure all in-progress dispatches are completed because
	 * blk_freeze_queue() can only complete all requests, and
	 * dispatch may still be in-progress since we dispatch requests
	 * from more than one context.
	 *
	 * No need to quiesce queue if it isn't initialized yet since
	 * blk_freeze_queue() should be enough for cases of passthrough
	 * request.
	 */
	if (q->mq_ops && blk_queue_init_done(q))
		blk_mq_quiesce_queue(q);

	/* for synchronous bio-based driver finish in-flight integrity i/o */
	blk_flush_integrity();

	/* @q won't process any more requests, flush async actions */
	del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer);
	blk_sync_queue(q);

	if (q->mq_ops)
		blk_mq_free_queue(q);
	percpu_ref_exit(&q->q_usage_counter);

	spin_lock_irq(lock);
	if (q->queue_lock != &q->__queue_lock)
		q->queue_lock = &q->__queue_lock;
	spin_unlock_irq(lock);

	/* @q is and will stay empty, shutdown and put */
	blk_put_queue(q);
}
EXPORT_SYMBOL(blk_cleanup_queue);

/* Allocate memory local to the request queue */
static void *alloc_request_simple(gfp_t gfp_mask, void *data)
{
	struct request_queue *q = data;

	return kmem_cache_alloc_node(request_cachep, gfp_mask, q->node);
}

static void free_request_simple(void *element, void *data)
{
	kmem_cache_free(request_cachep, element);
}

static void *alloc_request_size(gfp_t gfp_mask, void *data)
{
	struct request_queue *q = data;
	struct request *rq;

	rq = kmalloc_node(sizeof(struct request) + q->cmd_size, gfp_mask,
			q->node);
	if (rq && q->init_rq_fn && q->init_rq_fn(q, rq, gfp_mask) < 0) {
		kfree(rq);
		rq = NULL;
	}
	return rq;
}

static void free_request_size(void *element, void *data)
{
	struct request_queue *q = data;

	if (q->exit_rq_fn)
		q->exit_rq_fn(q, element);
	kfree(element);
}

int blk_init_rl(struct request_list *rl, struct request_queue *q,
		gfp_t gfp_mask)
{
	if (unlikely(rl->rq_pool))
		return 0;

	rl->q = q;
	rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
	rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
	init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
	init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);

	if (q->cmd_size) {
		rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ,
				alloc_request_size, free_request_size,
				q, gfp_mask, q->node);
	} else {
		rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ,
				alloc_request_simple, free_request_simple,
				q, gfp_mask, q->node);
	}
	if (!rl->rq_pool)
		return -ENOMEM;

	if (rl != &q->root_rl)
		WARN_ON_ONCE(!blk_get_queue(q));

	return 0;
}

void blk_exit_rl(struct request_queue *q, struct request_list *rl)
{
	if (rl->rq_pool) {
		mempool_destroy(rl->rq_pool);
		if (rl != &q->root_rl)
			blk_put_queue(q);
	}
}

struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
{
	return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE);
}
EXPORT_SYMBOL(blk_alloc_queue);

int blk_queue_enter(struct request_queue *q, bool nowait)
{
	while (true) {

		if (percpu_ref_tryget_live(&q->q_usage_counter))
			return 0;

		if (nowait)
			return -EBUSY;

		/*
		 * This is the read-side pair of the barrier in
		 * blk_freeze_queue_start(): we need to order reading the
		 * __PERCPU_REF_DEAD flag of .q_usage_counter against
		 * reading .mq_freeze_depth or the queue dying flag,
		 * otherwise the following wait may never return if the
		 * two reads are reordered.
		 */
		smp_rmb();

		wait_event(q->mq_freeze_wq,
			   !atomic_read(&q->mq_freeze_depth) ||
			   blk_queue_dying(q));
		if (blk_queue_dying(q))
			return -ENODEV;
	}
}

void blk_queue_exit(struct request_queue *q)
{
	percpu_ref_put(&q->q_usage_counter);
}
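
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * submission paths pin the queue with blk_queue_enter()/blk_queue_exit()
 * so that a freeze or teardown cannot race with them, roughly:
 *
 *	if (blk_queue_enter(q, false))
 *		return;			queue is dying, fail the I/O
 *	... do work that dereferences q ...
 *	blk_queue_exit(q);
 */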

static void blk_queue_usage_counter_release(struct percpu_ref *ref)
{
	struct request_queue *q =
		container_of(ref, struct request_queue, q_usage_counter);

	wake_up_all(&q->mq_freeze_wq);
}

static void blk_rq_timed_out_timer(unsigned long data)
{
	struct request_queue *q = (struct request_queue *)data;

	kblockd_schedule_work(&q->timeout_work);
}

struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
{
	struct request_queue *q;

	q = kmem_cache_alloc_node(blk_requestq_cachep,
				gfp_mask | __GFP_ZERO, node_id);
	if (!q)
		return NULL;

	q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
	if (q->id < 0)
		goto fail_q;

	q->bio_split = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
	if (!q->bio_split)
		goto fail_id;

	q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id);
	if (!q->backing_dev_info)
		goto fail_split;

	q->stats = blk_alloc_queue_stats();
	if (!q->stats)
		goto fail_stats;

	q->backing_dev_info->ra_pages =
			(VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
	q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
	q->backing_dev_info->name = "block";
	q->node = node_id;

	setup_timer(&q->backing_dev_info->laptop_mode_wb_timer,
		    laptop_mode_timer_fn, (unsigned long) q);
	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
	INIT_WORK(&q->timeout_work, NULL);
	INIT_LIST_HEAD(&q->queue_head);
	INIT_LIST_HEAD(&q->timeout_list);
	INIT_LIST_HEAD(&q->icq_list);
#ifdef CONFIG_BLK_CGROUP
	INIT_LIST_HEAD(&q->blkg_list);
#endif
	INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);

	kobject_init(&q->kobj, &blk_queue_ktype);

#ifdef CONFIG_BLK_DEV_IO_TRACE
	mutex_init(&q->blk_trace_mutex);
#endif
	mutex_init(&q->sysfs_lock);
	spin_lock_init(&q->__queue_lock);

	/*
	 * By default initialize queue_lock to internal lock and driver can
	 * override it later if need be.
	 */
	q->queue_lock = &q->__queue_lock;

	/*
	 * A queue starts its life with bypass turned on to avoid
	 * unnecessary bypass on/off overhead and nasty surprises during
	 * init.  The initial bypass will be finished when the queue is
	 * registered by blk_register_queue().
	 */
	q->bypass_depth = 1;
	__set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);

	init_waitqueue_head(&q->mq_freeze_wq);

	/*
	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
	 * See blk_register_queue() for details.
	 */
	if (percpu_ref_init(&q->q_usage_counter,
				blk_queue_usage_counter_release,
				PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
		goto fail_bdi;

	if (blkcg_init_queue(q))
		goto fail_ref;

	return q;

fail_ref:
	percpu_ref_exit(&q->q_usage_counter);
fail_bdi:
	blk_free_queue_stats(q->stats);
fail_stats:
	bdi_put(q->backing_dev_info);
fail_split:
	bioset_free(q->bio_split);
fail_id:
	ida_simple_remove(&blk_queue_ida, q->id);
fail_q:
	kmem_cache_free(blk_requestq_cachep, q);
	return NULL;
}
EXPORT_SYMBOL(blk_alloc_queue_node);

/**
 * blk_init_queue  - prepare a request queue for use with a block device
 * @rfn:  The function to be called to process requests that have been
 *        placed on the queue.
 * @lock: Request queue spin lock
 *
 * Description:
 *    If a block device wishes to use the standard request handling procedures,
 *    which sorts requests and coalesces adjacent requests, then it must
 *    call blk_init_queue().  The function @rfn will be called when there
 *    are requests on the queue that need to be processed.  If the device
 *    supports plugging, then @rfn may not be called immediately when requests
 *    are available on the queue, but may be called at some time later instead.
 *    Plugged queues are generally unplugged when a buffer belonging to one
 *    of the requests on the queue is needed, or due to memory pressure.
 *
 *    @rfn is not required, or even expected, to remove all requests off the
 *    queue, but only as many as it can handle at a time.  If it does leave
 *    requests on the queue, it is responsible for arranging that the requests
 *    get dealt with eventually.
 *
 *    The queue spin lock must be held while manipulating the requests on the
 *    request queue; this lock will be taken also from interrupt context, so irq
 *    disabling is needed for it.
 *
 *    Function returns a pointer to the initialized request queue, or %NULL if
 *    it didn't succeed.
 *
 * Note:
 *    blk_init_queue() must be paired with a blk_cleanup_queue() call
 *    when the block device is deactivated (such as at module unload).
 **/

struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
{
	return blk_init_queue_node(rfn, lock, NUMA_NO_NODE);
}
EXPORT_SYMBOL(blk_init_queue);
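
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * minimal legacy (single-queue) setup matching the kernel-doc above; the
 * "my_*" identifiers are hypothetical:
 *
 *	static DEFINE_SPINLOCK(my_lock);
 *
 *	q = blk_init_queue(my_request_fn, &my_lock);
 *	if (!q)
 *		return -ENOMEM;
 *	blk_queue_logical_block_size(q, 512);
 *	my_disk->queue = q;
 *	add_disk(my_disk);
 *
 * and on teardown, paired as the Note above requires:
 *
 *	del_gendisk(my_disk);
 *	blk_cleanup_queue(q);
 */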

struct request_queue *
blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
	struct request_queue *q;

	q = blk_alloc_queue_node(GFP_KERNEL, node_id);
	if (!q)
		return NULL;

	q->request_fn = rfn;
	if (lock)
		q->queue_lock = lock;
	if (blk_init_allocated_queue(q) < 0) {
		blk_cleanup_queue(q);
		return NULL;
	}

	return q;
}
EXPORT_SYMBOL(blk_init_queue_node);

static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio);


int blk_init_allocated_queue(struct request_queue *q)
{
	WARN_ON_ONCE(q->mq_ops);

	q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size);
	if (!q->fq)
		return -ENOMEM;

	if (q->init_rq_fn && q->init_rq_fn(q, q->fq->flush_rq, GFP_KERNEL))
		goto out_free_flush_queue;

	if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
		goto out_exit_flush_rq;

	INIT_WORK(&q->timeout_work, blk_timeout_work);
	q->queue_flags		|= QUEUE_FLAG_DEFAULT;

	/*
	 * This also sets hw/phys segments, boundary and size
	 */
	blk_queue_make_request(q, blk_queue_bio);

	q->sg_reserved_size = INT_MAX;

	/* Protect q->elevator from elevator_change */
	mutex_lock(&q->sysfs_lock);

	/* init elevator */
	if (elevator_init(q, NULL)) {
		mutex_unlock(&q->sysfs_lock);
		goto out_exit_flush_rq;
	}

	mutex_unlock(&q->sysfs_lock);
	return 0;

out_exit_flush_rq:
	if (q->exit_rq_fn)
		q->exit_rq_fn(q, q->fq->flush_rq);
out_free_flush_queue:
	blk_free_flush_queue(q->fq);
	q->fq = NULL;
	return -ENOMEM;
}
EXPORT_SYMBOL(blk_init_allocated_queue);

bool blk_get_queue(struct request_queue *q)
{
	if (likely(!blk_queue_dying(q))) {
		__blk_get_queue(q);
		return true;
	}

	return false;
}
EXPORT_SYMBOL(blk_get_queue);

static inline void blk_free_request(struct request_list *rl, struct request *rq)
{
	if (rq->rq_flags & RQF_ELVPRIV) {
		elv_put_request(rl->q, rq);
		if (rq->elv.icq)
			put_io_context(rq->elv.icq->ioc);
	}

	mempool_free(rq, rl->rq_pool);
}

/*
 * ioc_batching returns true if the ioc is a valid batching request and
 * should be given priority access to a request.
 */
static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
{
	if (!ioc)
		return 0;

	/*
	 * Make sure the process is able to allocate at least 1 request
	 * even if the batch times out, otherwise we could theoretically
	 * lose wakeups.
	 */
	return ioc->nr_batch_requests == q->nr_batching ||
		(ioc->nr_batch_requests > 0
		&& time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
}

/*
 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
 * will cause the process to be a "batcher" on all queues in the system. This
 * is the behaviour we want though - once it gets a wakeup it should be given
 * a nice run.
 */
static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
{
	if (!ioc || ioc_batching(q, ioc))
		return;

	ioc->nr_batch_requests = q->nr_batching;
	ioc->last_waited = jiffies;
}

static void __freed_request(struct request_list *rl, int sync)
{
	struct request_queue *q = rl->q;

	if (rl->count[sync] < queue_congestion_off_threshold(q))
		blk_clear_congested(rl, sync);

	if (rl->count[sync] + 1 <= q->nr_requests) {
		if (waitqueue_active(&rl->wait[sync]))
			wake_up(&rl->wait[sync]);

		blk_clear_rl_full(rl, sync);
	}
}

/*
 * A request has just been released.  Account for it, update the full and
 * congestion status, wake up any waiters.   Called under q->queue_lock.
 */
static void freed_request(struct request_list *rl, bool sync,
		req_flags_t rq_flags)
{
	struct request_queue *q = rl->q;

	q->nr_rqs[sync]--;
	rl->count[sync]--;
	if (rq_flags & RQF_ELVPRIV)
		q->nr_rqs_elvpriv--;

	__freed_request(rl, sync);

	if (unlikely(rl->starved[sync ^ 1]))
		__freed_request(rl, sync ^ 1);
}

int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
{
	struct request_list *rl;
	int on_thresh, off_thresh;

	WARN_ON_ONCE(q->mq_ops);

	spin_lock_irq(q->queue_lock);
	q->nr_requests = nr;
	blk_queue_congestion_threshold(q);
	on_thresh = queue_congestion_on_threshold(q);
	off_thresh = queue_congestion_off_threshold(q);

	blk_queue_for_each_rl(rl, q) {
		if (rl->count[BLK_RW_SYNC] >= on_thresh)
			blk_set_congested(rl, BLK_RW_SYNC);
		else if (rl->count[BLK_RW_SYNC] < off_thresh)
			blk_clear_congested(rl, BLK_RW_SYNC);

		if (rl->count[BLK_RW_ASYNC] >= on_thresh)
			blk_set_congested(rl, BLK_RW_ASYNC);
		else if (rl->count[BLK_RW_ASYNC] < off_thresh)
			blk_clear_congested(rl, BLK_RW_ASYNC);

		if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
			blk_set_rl_full(rl, BLK_RW_SYNC);
		} else {
			blk_clear_rl_full(rl, BLK_RW_SYNC);
			wake_up(&rl->wait[BLK_RW_SYNC]);
		}

		if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
			blk_set_rl_full(rl, BLK_RW_ASYNC);
		} else {
			blk_clear_rl_full(rl, BLK_RW_ASYNC);
			wake_up(&rl->wait[BLK_RW_ASYNC]);
		}
	}

	spin_unlock_irq(q->queue_lock);
	return 0;
}

/**
 * __get_request - get a free request
 * @rl: request list to allocate from
 * @op: operation and flags
 * @bio: bio to allocate request for (can be %NULL)
 * @gfp_mask: allocation mask
 *
 * Get a free request from @q.  This function may fail under memory
 * pressure or if @q is dead.
 *
 * Must be called with @q->queue_lock held.
 * Returns ERR_PTR on failure, with @q->queue_lock held.
 * Returns request pointer on success, with @q->queue_lock *not held*.
 */
static struct request *__get_request(struct request_list *rl, unsigned int op,
		struct bio *bio, gfp_t gfp_mask)
{
	struct request_queue *q = rl->q;
	struct request *rq;
	struct elevator_type *et = q->elevator->type;
	struct io_context *ioc = rq_ioc(bio);
	struct io_cq *icq = NULL;
	const bool is_sync = op_is_sync(op);
	int may_queue;
	req_flags_t rq_flags = RQF_ALLOCED;

	lockdep_assert_held(q->queue_lock);

	if (unlikely(blk_queue_dying(q)))
		return ERR_PTR(-ENODEV);

	may_queue = elv_may_queue(q, op);
	if (may_queue == ELV_MQUEUE_NO)
		goto rq_starved;

	if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
		if (rl->count[is_sync]+1 >= q->nr_requests) {
			/*
			 * The queue will fill after this allocation, so set
			 * it as full, and mark this process as "batching".
			 * This process will be allowed to complete a batch of
			 * requests, others will be blocked.
			 */
			if (!blk_rl_full(rl, is_sync)) {
				ioc_set_batching(q, ioc);
				blk_set_rl_full(rl, is_sync);
			} else {
				if (may_queue != ELV_MQUEUE_MUST
						&& !ioc_batching(q, ioc)) {
					/*
					 * The queue is full and the allocating
					 * process is not a "batcher", and not
					 * exempted by the IO scheduler
					 */