cfq-iosched.c 63.7 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1 2 3 4 5 6
/*
 *  CFQ, or complete fairness queueing, disk scheduler.
 *
 *  Based on ideas from a previously unfinished io
 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
 *
 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/module.h>
Al Viro's avatar
Al Viro committed
10 11
#include <linux/blkdev.h>
#include <linux/elevator.h>
Linus Torvalds's avatar
Linus Torvalds committed
12
#include <linux/rbtree.h>
13
#include <linux/ioprio.h>
14
#include <linux/blktrace_api.h>
Linus Torvalds's avatar
Linus Torvalds committed
15 16 17 18

/*
 * tunables
 */
19 20
/* max queue in one round of service */
static const int cfq_quantum = 4;
21
static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
22 23 24 25
/* maximum backwards seek, in KiB */
static const int cfq_back_max = 16 * 1024;
/* penalty of a backwards seek */
static const int cfq_back_penalty = 2;
26
static const int cfq_slice_sync = HZ / 10;
Jens Axboe's avatar
Jens Axboe committed
27
static int cfq_slice_async = HZ / 25;
28
static const int cfq_slice_async_rq = 2;
29
static int cfq_slice_idle = HZ / 125;
30

31
/*
 * offset from end of service tree
 */
#define CFQ_IDLE_DELAY		(HZ / 5)

/*
 * below this threshold, we consider thinktime immediate
 */
#define CFQ_MIN_TT		(2)

/* divisor used when scaling a slice by io priority, see cfq_prio_slice() */
#define CFQ_SLICE_SCALE		(5)
#define CFQ_HW_QUEUE_MIN	(5)

/* per-request cfq_io_context, stored in the request's elevator_private */
#define RQ_CIC(rq)		\
	((struct cfq_io_context *) (rq)->elevator_private)
46
/*
 * Per-request cfq_queue, stored in the request's elevator_private2.
 * The whole expansion is parenthesised so that RQ_CFQQ(rq)->member applies
 * "->" to the cast result rather than to the void pointer before the cast.
 */
#define RQ_CFQQ(rq)		((struct cfq_queue *) ((rq)->elevator_private2))
Linus Torvalds's avatar
Linus Torvalds committed
47

48 49
static struct kmem_cache *cfq_pool;
static struct kmem_cache *cfq_ioc_pool;
Linus Torvalds's avatar
Linus Torvalds committed
50

51
static DEFINE_PER_CPU(unsigned long, ioc_count);
52
static struct completion *ioc_gone;
53
static DEFINE_SPINLOCK(ioc_gone_lock);
54

55 56 57 58
#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)

59 60
#define sample_valid(samples)	((samples) > 80)

61 62 63 64 65 66 67 68 69 70 71 72
/*
 * Most of our rbtree usage is for sorting with min extraction, so
 * if we cache the leftmost node we don't have to walk down the tree
 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
 * move this into the elevator for the rq sorting as well.
 */
struct cfq_rb_root {
	/* the underlying rbtree */
	struct rb_root rb;
	/* cached leftmost node; NULL means "not cached", see cfq_rb_first() */
	struct rb_node *left;
};
/* initializer for an empty tree with no cached leftmost node */
#define CFQ_RB_ROOT	(struct cfq_rb_root) { RB_ROOT, NULL, }

73 74 75
/*
 * Per block device queue structure
 */
Linus Torvalds's avatar
Linus Torvalds committed
76
struct cfq_data {
77
	struct request_queue *queue;
78 79 80 81

	/*
	 * rr list of queues with requests and the count of them
	 */
82
	struct cfq_rb_root service_tree;
83 84 85 86 87 88 89 90

	/*
	 * Each priority tree is sorted by next_request position.  These
	 * trees are used when determining if two or more queues are
	 * interleaving requests (see cfq_close_cooperator).
	 */
	struct rb_root prio_trees[CFQ_PRIO_LISTS];

91
	unsigned int busy_queues;
92 93 94 95 96
	/*
	 * Used to track any pending rt requests so we can pre-empt current
	 * non-RT cfqq in service when this value is non-zero.
	 */
	unsigned int busy_rt_queues;
97 98

	int rq_in_driver;
99
	int sync_flight;
100 101 102 103 104

	/*
	 * queue-depth detection
	 */
	int rq_queued;
105
	int hw_tag;
106 107
	int hw_tag_samples;
	int rq_in_driver_peak;
Linus Torvalds's avatar
Linus Torvalds committed
108

109 110 111 112 113
	/*
	 * idle window management
	 */
	struct timer_list idle_slice_timer;
	struct work_struct unplug_work;
Linus Torvalds's avatar
Linus Torvalds committed
114

115 116 117
	struct cfq_queue *active_queue;
	struct cfq_io_context *active_cic;

118 119 120 121 122
	/*
	 * async queue for each priority case
	 */
	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
	struct cfq_queue *async_idle_cfqq;
123

Jens Axboe's avatar
Jens Axboe committed
124
	sector_t last_position;
Linus Torvalds's avatar
Linus Torvalds committed
125 126 127 128 129

	/*
	 * tunables, see top of file
	 */
	unsigned int cfq_quantum;
130
	unsigned int cfq_fifo_expire[2];
Linus Torvalds's avatar
Linus Torvalds committed
131 132
	unsigned int cfq_back_penalty;
	unsigned int cfq_back_max;
133 134 135
	unsigned int cfq_slice[2];
	unsigned int cfq_slice_async_rq;
	unsigned int cfq_slice_idle;
136 137

	struct list_head cic_list;
Linus Torvalds's avatar
Linus Torvalds committed
138 139
};

140 141 142
/*
 * Per process-grouping structure
 */
Linus Torvalds's avatar
Linus Torvalds committed
143 144 145
struct cfq_queue {
	/* reference count */
	atomic_t ref;
146 147
	/* various state flags, see below */
	unsigned int flags;
Linus Torvalds's avatar
Linus Torvalds committed
148 149
	/* parent cfq_data */
	struct cfq_data *cfqd;
150 151 152 153
	/* service_tree member */
	struct rb_node rb_node;
	/* service_tree key */
	unsigned long rb_key;
154 155
	/* prio tree member */
	struct rb_node p_node;
156 157
	/* prio tree root we belong to, if any */
	struct rb_root *p_root;
Linus Torvalds's avatar
Linus Torvalds committed
158 159 160
	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
Jens Axboe's avatar
Jens Axboe committed
161
	struct request *next_rq;
Linus Torvalds's avatar
Linus Torvalds committed
162 163 164 165 166
	/* requests queued in sort_list */
	int queued[2];
	/* currently allocated requests */
	int allocated[2];
	/* fifo list of requests in sort_list */
167
	struct list_head fifo;
Linus Torvalds's avatar
Linus Torvalds committed
168

169
	unsigned long slice_end;
170
	long slice_resid;
171
	unsigned int slice_dispatch;
Linus Torvalds's avatar
Linus Torvalds committed
172

173 174
	/* pending metadata requests */
	int meta_pending;
Jens Axboe's avatar
Jens Axboe committed
175 176
	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;
177 178 179 180 181

	/* io prio of this group */
	unsigned short ioprio, org_ioprio;
	unsigned short ioprio_class, org_ioprio_class;

182
	pid_t pid;
Linus Torvalds's avatar
Linus Torvalds committed
183 184
};

Jens Axboe's avatar
Jens Axboe committed
185
/*
 * Bit numbers (not masks) for cfq_queue->flags; the accessors generated
 * by CFQ_CFQQ_FNS() shift 1 by these values.
 */
enum cfqq_state_flags {
	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
	CFQ_CFQQ_FLAG_must_alloc,	/* must be allowed rq alloc */
	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
	CFQ_CFQQ_FLAG_coop,		/* has done a coop jump of the queue */
};

#define CFQ_CFQQ_FNS(name)						\
static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
{									\
202
	(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
Jens Axboe's avatar
Jens Axboe committed
203 204 205
}									\
static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
{									\
206
	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
Jens Axboe's avatar
Jens Axboe committed
207 208 209
}									\
static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
{									\
210
	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
Jens Axboe's avatar
Jens Axboe committed
211 212 213 214
}

CFQ_CFQQ_FNS(on_rr);
CFQ_CFQQ_FNS(wait_request);
215
CFQ_CFQQ_FNS(must_dispatch);
Jens Axboe's avatar
Jens Axboe committed
216 217 218 219 220
CFQ_CFQQ_FNS(must_alloc);
CFQ_CFQQ_FNS(must_alloc_slice);
CFQ_CFQQ_FNS(fifo_expire);
CFQ_CFQQ_FNS(idle_window);
CFQ_CFQQ_FNS(prio_changed);
221
CFQ_CFQQ_FNS(slice_new);
222
CFQ_CFQQ_FNS(sync);
223
CFQ_CFQQ_FNS(coop);
Jens Axboe's avatar
Jens Axboe committed
224 225
#undef CFQ_CFQQ_FNS

226 227 228 229 230
/*
 * Trace helpers built on blk_add_trace_msg(). cfq_log_cfqq() prefixes the
 * message with "cfq<pid>" taken from the given queue; cfq_log() uses a
 * plain "cfq" prefix for messages that are not tied to one queue.
 */
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
#define cfq_log(cfqd, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)

231
static void cfq_dispatch_insert(struct request_queue *, struct request *);
232
static struct cfq_queue *cfq_get_queue(struct cfq_data *, int,
233
				       struct io_context *, gfp_t);
234
static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259
						struct io_context *);

/*
 * Return the queue stored in @cic for the sync (is_sync != 0) or
 * async (is_sync == 0) slot.
 */
static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
					    int is_sync)
{
	const int slot = is_sync ? 1 : 0;

	return cic->cfqq[slot];
}

/*
 * Store @cfqq in the sync (is_sync != 0) or async (is_sync == 0) slot
 * of @cic.
 */
static inline void cic_set_cfqq(struct cfq_io_context *cic,
				struct cfq_queue *cfqq, int is_sync)
{
	const int slot = is_sync ? 1 : 0;

	cic->cfqq[slot] = cfqq;
}

/*
 * A bio counts as SYNC when it is a read, or when it carries the SYNC bit
 * (which may also be the case for a direct WRITE). Returns 1 for sync,
 * 0 otherwise.
 */
static inline int cfq_bio_sync(struct bio *bio)
{
	return bio_data_dir(bio) == READ || bio_sync(bio);
}
Linus Torvalds's avatar
Linus Torvalds committed
260

Andrew Morton's avatar
Andrew Morton committed
261 262 263 264 265 266
/*
 * scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing
 */
static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
{
267 268
	if (cfqd->busy_queues) {
		cfq_log(cfqd, "schedule dispatch");
269
		kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
270
	}
Andrew Morton's avatar
Andrew Morton committed
271 272
}

273
static int cfq_queue_empty(struct request_queue *q)
Andrew Morton's avatar
Andrew Morton committed
274 275 276
{
	struct cfq_data *cfqd = q->elevator->elevator_data;

277
	return !cfqd->busy_queues;
Andrew Morton's avatar
Andrew Morton committed
278 279
}

280 281 282 283 284
/*
 * Scale schedule slice based on io priority. Use the sync time slice only
 * if a queue is marked sync and has sync io queued. A sync queue with async
 * io only, should not get full sync slice length.
 */
285 286
static inline int cfq_prio_slice(struct cfq_data *cfqd, int sync,
				 unsigned short prio)
287
{
288
	const int base_slice = cfqd->cfq_slice[sync];
289

290 291 292 293
	WARN_ON(prio >= IOPRIO_BE_NR);

	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
}
294

295 296 297 298
/*
 * Slice length for @cfqq, derived from its sync flag and io priority.
 */
static inline int
cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	const int sync = cfq_cfqq_sync(cfqq);

	return cfq_prio_slice(cfqd, sync, cfqq->ioprio);
}

/*
 * Start a new slice for @cfqq: slice_end becomes now plus the
 * priority-scaled slice length, and the remaining length is traced.
 */
static inline void
cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	const int slice = cfq_prio_to_slice(cfqd, cfqq);

	cfqq->slice_end = slice + jiffies;
	cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
}

/*
 * Returns 1 when @cfqq has run past its slice end, 0 otherwise.
 *
 * The slice_new check has to come first: ->slice_end isn't valid until
 * the first request from the dispatch is activated and the slice time set.
 */
static inline int cfq_slice_used(struct cfq_queue *cfqq)
{
	return !cfq_cfqq_slice_new(cfqq) &&
	       !time_before(jiffies, cfqq->slice_end);
}

Linus Torvalds's avatar
Linus Torvalds committed
323
/*
Jens Axboe's avatar
Jens Axboe committed
324
 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
Linus Torvalds's avatar
Linus Torvalds committed
325
 * We choose the request that is closest to the head right now. Distance
326
 * behind the head is penalized and only allowed to a certain extent.
Linus Torvalds's avatar
Linus Torvalds committed
327
 */
Jens Axboe's avatar
Jens Axboe committed
328 329
static struct request *
cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
Linus Torvalds's avatar
Linus Torvalds committed
330 331 332
{
	sector_t last, s1, s2, d1 = 0, d2 = 0;
	unsigned long back_max;
333 334 335
#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
Linus Torvalds's avatar
Linus Torvalds committed
336

Jens Axboe's avatar
Jens Axboe committed
337 338 339 340
	if (rq1 == NULL || rq1 == rq2)
		return rq2;
	if (rq2 == NULL)
		return rq1;
341

Jens Axboe's avatar
Jens Axboe committed
342 343 344 345
	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
		return rq1;
	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
		return rq2;
346 347 348 349
	if (rq_is_meta(rq1) && !rq_is_meta(rq2))
		return rq1;
	else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
		return rq2;
Linus Torvalds's avatar
Linus Torvalds committed
350

351 352
	s1 = blk_rq_pos(rq1);
	s2 = blk_rq_pos(rq2);
Linus Torvalds's avatar
Linus Torvalds committed
353

Jens Axboe's avatar
Jens Axboe committed
354
	last = cfqd->last_position;
Linus Torvalds's avatar
Linus Torvalds committed
355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370

	/*
	 * by definition, 1KiB is 2 sectors
	 */
	back_max = cfqd->cfq_back_max * 2;

	/*
	 * Strict one way elevator _except_ in the case where we allow
	 * short backward seeks which are biased as twice the cost of a
	 * similar forward seek.
	 */
	if (s1 >= last)
		d1 = s1 - last;
	else if (s1 + back_max >= last)
		d1 = (last - s1) * cfqd->cfq_back_penalty;
	else
371
		wrap |= CFQ_RQ1_WRAP;
Linus Torvalds's avatar
Linus Torvalds committed
372 373 374 375 376 377

	if (s2 >= last)
		d2 = s2 - last;
	else if (s2 + back_max >= last)
		d2 = (last - s2) * cfqd->cfq_back_penalty;
	else
378
		wrap |= CFQ_RQ2_WRAP;
Linus Torvalds's avatar
Linus Torvalds committed
379 380

	/* Found required data */
381 382 383 384 385 386

	/*
	 * By doing switch() on the bit mask "wrap" we avoid having to
	 * check two variables for all permutations: --> faster!
	 */
	switch (wrap) {
Jens Axboe's avatar
Jens Axboe committed
387
	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
388
		if (d1 < d2)
Jens Axboe's avatar
Jens Axboe committed
389
			return rq1;
390
		else if (d2 < d1)
Jens Axboe's avatar
Jens Axboe committed
391
			return rq2;
392 393
		else {
			if (s1 >= s2)
Jens Axboe's avatar
Jens Axboe committed
394
				return rq1;
395
			else
Jens Axboe's avatar
Jens Axboe committed
396
				return rq2;
397
		}
Linus Torvalds's avatar
Linus Torvalds committed
398

399
	case CFQ_RQ2_WRAP:
Jens Axboe's avatar
Jens Axboe committed
400
		return rq1;
401
	case CFQ_RQ1_WRAP:
Jens Axboe's avatar
Jens Axboe committed
402 403
		return rq2;
	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
404 405 406 407 408 409 410 411
	default:
		/*
		 * Since both rqs are wrapped,
		 * start with the one that's further behind head
		 * (--> only *one* back seek required),
		 * since back seek takes more time than forward.
		 */
		if (s1 <= s2)
Jens Axboe's avatar
Jens Axboe committed
412
			return rq1;
Linus Torvalds's avatar
Linus Torvalds committed
413
		else
Jens Axboe's avatar
Jens Axboe committed
414
			return rq2;
Linus Torvalds's avatar
Linus Torvalds committed
415 416 417
	}
}

418 419 420
/*
 * The below is leftmost cache rbtree addon
 */
421
static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
422 423 424 425
{
	if (!root->left)
		root->left = rb_first(&root->rb);

426 427 428 429
	if (root->left)
		return rb_entry(root->left, struct cfq_queue, rb_node);

	return NULL;
430 431
}

432 433 434 435 436 437
/*
 * Erase @n from @root and then run RB_CLEAR_NODE() on it. Other code in
 * this file tests nodes with RB_EMPTY_NODE() before erasing them;
 * presumably the clear is what makes that test report "not in a tree".
 */
static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}

438 439 440 441
/*
 * Remove @n from the cached-leftmost tree @root, dropping the cached
 * leftmost pointer first if it refers to @n.
 */
static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
{
	if (n == root->left)
		root->left = NULL;

	rb_erase(n, &root->rb);
	RB_CLEAR_NODE(n);
}

Linus Torvalds's avatar
Linus Torvalds committed
445 446 447
/*
 * would be nice to take fifo expire time into account as well
 */
Jens Axboe's avatar
Jens Axboe committed
448 449 450
static struct request *
cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		  struct request *last)
Linus Torvalds's avatar
Linus Torvalds committed
451
{
452 453
	struct rb_node *rbnext = rb_next(&last->rb_node);
	struct rb_node *rbprev = rb_prev(&last->rb_node);
Jens Axboe's avatar
Jens Axboe committed
454
	struct request *next = NULL, *prev = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
455

456
	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
Linus Torvalds's avatar
Linus Torvalds committed
457 458

	if (rbprev)
Jens Axboe's avatar
Jens Axboe committed
459
		prev = rb_entry_rq(rbprev);
Linus Torvalds's avatar
Linus Torvalds committed
460

461
	if (rbnext)
Jens Axboe's avatar
Jens Axboe committed
462
		next = rb_entry_rq(rbnext);
463 464 465
	else {
		rbnext = rb_first(&cfqq->sort_list);
		if (rbnext && rbnext != &last->rb_node)
Jens Axboe's avatar
Jens Axboe committed
466
			next = rb_entry_rq(rbnext);
467
	}
Linus Torvalds's avatar
Linus Torvalds committed
468

469
	return cfq_choose_req(cfqd, next, prev);
Linus Torvalds's avatar
Linus Torvalds committed
470 471
}

472 473
static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
				      struct cfq_queue *cfqq)
Linus Torvalds's avatar
Linus Torvalds committed
474
{
475 476 477
	/*
	 * just an approximation, should be ok.
	 */
478 479
	return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) -
		       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
480 481
}

482 483 484 485 486
/*
 * The cfqd->service_tree holds all pending cfq_queue's that have
 * requests waiting to be processed. It is sorted in the order that
 * we will service the queues.
 */
487 488
static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
				 int add_front)
489
{
490 491
	struct rb_node **p, *parent;
	struct cfq_queue *__cfqq;
492
	unsigned long rb_key;
493
	int left;
494

495 496 497 498 499 500 501 502 503
	if (cfq_class_idle(cfqq)) {
		rb_key = CFQ_IDLE_DELAY;
		parent = rb_last(&cfqd->service_tree.rb);
		if (parent && parent != &cfqq->rb_node) {
			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
			rb_key += __cfqq->rb_key;
		} else
			rb_key += jiffies;
	} else if (!add_front) {
504 505 506 507 508
		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
		rb_key += cfqq->slice_resid;
		cfqq->slice_resid = 0;
	} else
		rb_key = 0;
Linus Torvalds's avatar
Linus Torvalds committed
509

510
	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
511
		/*
512
		 * same position, nothing more to do
513
		 */
514 515
		if (rb_key == cfqq->rb_key)
			return;
Linus Torvalds's avatar
Linus Torvalds committed
516

517
		cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
Linus Torvalds's avatar
Linus Torvalds committed
518
	}
519

520
	left = 1;
521 522
	parent = NULL;
	p = &cfqd->service_tree.rb.rb_node;
523
	while (*p) {
524
		struct rb_node **n;
525

526 527 528
		parent = *p;
		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);

529 530
		/*
		 * sort RT queues first, we always want to give
531 532
		 * preference to them. IDLE queues goes to the back.
		 * after that, sort on the next service time.
533 534
		 */
		if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq))
535
			n = &(*p)->rb_left;
536
		else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
537 538 539 540 541
			n = &(*p)->rb_right;
		else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq))
			n = &(*p)->rb_left;
		else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq))
			n = &(*p)->rb_right;
542
		else if (rb_key < __cfqq->rb_key)
543 544 545 546 547
			n = &(*p)->rb_left;
		else
			n = &(*p)->rb_right;

		if (n == &(*p)->rb_right)
548
			left = 0;
549 550

		p = n;
551 552
	}

553 554 555
	if (left)
		cfqd->service_tree.left = &cfqq->rb_node;

556 557
	cfqq->rb_key = rb_key;
	rb_link_node(&cfqq->rb_node, parent, p);
558
	rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
Linus Torvalds's avatar
Linus Torvalds committed
559 560
}

561
static struct cfq_queue *
562 563 564
cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
		     sector_t sector, struct rb_node **ret_parent,
		     struct rb_node ***rb_link)
565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580
{
	struct rb_node **p, *parent;
	struct cfq_queue *cfqq = NULL;

	parent = NULL;
	p = &root->rb_node;
	while (*p) {
		struct rb_node **n;

		parent = *p;
		cfqq = rb_entry(parent, struct cfq_queue, p_node);

		/*
		 * Sort strictly based on sector.  Smallest to the left,
		 * largest to the right.
		 */
581
		if (sector > blk_rq_pos(cfqq->next_rq))
582
			n = &(*p)->rb_right;
583
		else if (sector < blk_rq_pos(cfqq->next_rq))
584 585 586 587
			n = &(*p)->rb_left;
		else
			break;
		p = n;
588
		cfqq = NULL;
589 590 591 592 593
	}

	*ret_parent = parent;
	if (rb_link)
		*rb_link = p;
594
	return cfqq;
595 596 597 598 599 600 601
}

static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	struct rb_node **p, *parent;
	struct cfq_queue *__cfqq;

602 603 604 605
	if (cfqq->p_root) {
		rb_erase(&cfqq->p_node, cfqq->p_root);
		cfqq->p_root = NULL;
	}
606 607 608 609 610 611

	if (cfq_class_idle(cfqq))
		return;
	if (!cfqq->next_rq)
		return;

612
	cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
613 614
	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
				      blk_rq_pos(cfqq->next_rq), &parent, &p);
615 616
	if (!__cfqq) {
		rb_link_node(&cfqq->p_node, parent, p);
617 618 619
		rb_insert_color(&cfqq->p_node, cfqq->p_root);
	} else
		cfqq->p_root = NULL;
620 621
}

622 623 624
/*
 * Update cfqq's position in the service tree and in its priority tree.
 * Does nothing unless the cfqq is already on the RR list.
 */
static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	if (!cfq_cfqq_on_rr(cfqq))
		return;

	cfq_service_tree_add(cfqd, cfqq, 0);
	cfq_prio_tree_add(cfqd, cfqq);
}

Linus Torvalds's avatar
Linus Torvalds committed
636 637
/*
 * add to busy list of queues for service, trying to be fair in ordering
638
 * the pending list according to last request service
Linus Torvalds's avatar
Linus Torvalds committed
639
 */
640
static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
Linus Torvalds's avatar
Linus Torvalds committed
641
{
642
	cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
Jens Axboe's avatar
Jens Axboe committed
643 644
	BUG_ON(cfq_cfqq_on_rr(cfqq));
	cfq_mark_cfqq_on_rr(cfqq);
Linus Torvalds's avatar
Linus Torvalds committed
645
	cfqd->busy_queues++;
646 647
	if (cfq_class_rt(cfqq))
		cfqd->busy_rt_queues++;
Linus Torvalds's avatar
Linus Torvalds committed
648

649
	cfq_resort_rr_list(cfqd, cfqq);
Linus Torvalds's avatar
Linus Torvalds committed
650 651
}

652 653 654 655
/*
 * Called when the cfqq no longer has requests pending, remove it from
 * the service tree.
 */
656
static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
Linus Torvalds's avatar
Linus Torvalds committed
657
{
658
	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
Jens Axboe's avatar
Jens Axboe committed
659 660
	BUG_ON(!cfq_cfqq_on_rr(cfqq));
	cfq_clear_cfqq_on_rr(cfqq);
Linus Torvalds's avatar
Linus Torvalds committed
661

662 663
	if (!RB_EMPTY_NODE(&cfqq->rb_node))
		cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
664 665 666 667
	if (cfqq->p_root) {
		rb_erase(&cfqq->p_node, cfqq->p_root);
		cfqq->p_root = NULL;
	}
668

Linus Torvalds's avatar
Linus Torvalds committed
669 670
	BUG_ON(!cfqd->busy_queues);
	cfqd->busy_queues--;
671 672
	if (cfq_class_rt(cfqq))
		cfqd->busy_rt_queues--;
Linus Torvalds's avatar
Linus Torvalds committed
673 674 675 676 677
}

/*
 * rb tree support functions
 */
678
static void cfq_del_rq_rb(struct request *rq)
Linus Torvalds's avatar
Linus Torvalds committed
679
{
Jens Axboe's avatar
Jens Axboe committed
680
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
681
	struct cfq_data *cfqd = cfqq->cfqd;
Jens Axboe's avatar
Jens Axboe committed
682
	const int sync = rq_is_sync(rq);
Linus Torvalds's avatar
Linus Torvalds committed
683

684 685
	BUG_ON(!cfqq->queued[sync]);
	cfqq->queued[sync]--;
Linus Torvalds's avatar
Linus Torvalds committed
686

Jens Axboe's avatar
Jens Axboe committed
687
	elv_rb_del(&cfqq->sort_list, rq);
Linus Torvalds's avatar
Linus Torvalds committed
688

689
	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
690
		cfq_del_cfqq_rr(cfqd, cfqq);
Linus Torvalds's avatar
Linus Torvalds committed
691 692
}

Jens Axboe's avatar
Jens Axboe committed
693
static void cfq_add_rq_rb(struct request *rq)
Linus Torvalds's avatar
Linus Torvalds committed
694
{
Jens Axboe's avatar
Jens Axboe committed
695
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
Linus Torvalds's avatar
Linus Torvalds committed
696
	struct cfq_data *cfqd = cfqq->cfqd;
697
	struct request *__alias, *prev;
Linus Torvalds's avatar
Linus Torvalds committed
698

699
	cfqq->queued[rq_is_sync(rq)]++;
Linus Torvalds's avatar
Linus Torvalds committed
700 701 702 703 704

	/*
	 * looks a little odd, but the first insert might return an alias.
	 * if that happens, put the alias on the dispatch list
	 */
705
	while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
Jens Axboe's avatar
Jens Axboe committed
706
		cfq_dispatch_insert(cfqd->queue, __alias);
707 708 709

	if (!cfq_cfqq_on_rr(cfqq))
		cfq_add_cfqq_rr(cfqd, cfqq);
710 711 712 713

	/*
	 * check if this request is a better next-serve candidate
	 */
714
	prev = cfqq->next_rq;
715
	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
716 717 718 719 720 721 722

	/*
	 * adjust priority tree position, if ->next_rq changes
	 */
	if (prev != cfqq->next_rq)
		cfq_prio_tree_add(cfqd, cfqq);

723
	BUG_ON(!cfqq->next_rq);
Linus Torvalds's avatar
Linus Torvalds committed
724 725
}

726
static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
Linus Torvalds's avatar
Linus Torvalds committed
727
{
728 729
	elv_rb_del(&cfqq->sort_list, rq);
	cfqq->queued[rq_is_sync(rq)]--;
Jens Axboe's avatar
Jens Axboe committed
730
	cfq_add_rq_rb(rq);
Linus Torvalds's avatar
Linus Torvalds committed
731 732
}

733 734
static struct request *
cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
Linus Torvalds's avatar
Linus Torvalds committed
735
{
736
	struct task_struct *tsk = current;
737
	struct cfq_io_context *cic;
738
	struct cfq_queue *cfqq;
Linus Torvalds's avatar
Linus Torvalds committed
739

740
	cic = cfq_cic_lookup(cfqd, tsk->io_context);
741 742 743 744
	if (!cic)
		return NULL;

	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
745 746 747
	if (cfqq) {
		sector_t sector = bio->bi_sector + bio_sectors(bio);

748
		return elv_rb_find(&cfqq->sort_list, sector);
749
	}
Linus Torvalds's avatar
Linus Torvalds committed
750 751 752 753

	return NULL;
}

754
static void cfq_activate_request(struct request_queue *q, struct request *rq)
Linus Torvalds's avatar
Linus Torvalds committed
755
{
756
	struct cfq_data *cfqd = q->elevator->elevator_data;
Jens Axboe's avatar
Jens Axboe committed
757

758
	cfqd->rq_in_driver++;
759 760
	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
						cfqd->rq_in_driver);
761

762
	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
Linus Torvalds's avatar
Linus Torvalds committed
763 764
}

765
static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
Linus Torvalds's avatar
Linus Torvalds committed
766
{
767 768 769 770
	struct cfq_data *cfqd = q->elevator->elevator_data;

	WARN_ON(!cfqd->rq_in_driver);
	cfqd->rq_in_driver--;
771 772
	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
						cfqd->rq_in_driver);
Linus Torvalds's avatar
Linus Torvalds committed
773 774
}

775
static void cfq_remove_request(struct request *rq)
Linus Torvalds's avatar
Linus Torvalds committed
776
{
Jens Axboe's avatar
Jens Axboe committed
777
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
778

Jens Axboe's avatar
Jens Axboe committed
779 780
	if (cfqq->next_rq == rq)
		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
Linus Torvalds's avatar
Linus Torvalds committed
781

782
	list_del_init(&rq->queuelist);
Jens Axboe's avatar
Jens Axboe committed
783
	cfq_del_rq_rb(rq);
784

785
	cfqq->cfqd->rq_queued--;
786 787 788 789
	if (rq_is_meta(rq)) {
		WARN_ON(!cfqq->meta_pending);
		cfqq->meta_pending--;
	}
Linus Torvalds's avatar
Linus Torvalds committed
790 791
}

792 793
static int cfq_merge(struct request_queue *q, struct request **req,
		     struct bio *bio)
Linus Torvalds's avatar
Linus Torvalds committed
794 795 796 797
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct request *__rq;

798
	__rq = cfq_find_rq_fmerge(cfqd, bio);
799
	if (__rq && elv_rq_merge_ok(__rq, bio)) {
800 801
		*req = __rq;
		return ELEVATOR_FRONT_MERGE;
Linus Torvalds's avatar
Linus Torvalds committed
802 803 804 805 806
	}

	return ELEVATOR_NO_MERGE;
}

807
static void cfq_merged_request(struct request_queue *q, struct request *req,
808
			       int type)
Linus Torvalds's avatar
Linus Torvalds committed
809
{
810
	if (type == ELEVATOR_FRONT_MERGE) {
Jens Axboe's avatar
Jens Axboe committed
811
		struct cfq_queue *cfqq = RQ_CFQQ(req);
Linus Torvalds's avatar
Linus Torvalds committed
812

Jens Axboe's avatar
Jens Axboe committed
813
		cfq_reposition_rq_rb(cfqq, req);
Linus Torvalds's avatar
Linus Torvalds committed
814 815 816 817
	}
}

static void
818
cfq_merged_requests(struct request_queue *q, struct request *rq,
Linus Torvalds's avatar
Linus Torvalds committed
819 820
		    struct request *next)
{
821 822 823 824 825 826 827
	/*
	 * reposition in fifo if next is older than rq
	 */
	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
	    time_before(next->start_time, rq->start_time))
		list_move(&rq->queuelist, &next->queuelist);

828
	cfq_remove_request(next);
829 830
}

831
static int cfq_allow_merge(struct request_queue *q, struct request *rq,
832 833 834
			   struct bio *bio)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
835
	struct cfq_io_context *cic;
836 837 838
	struct cfq_queue *cfqq;

	/*
839
	 * Disallow merge of a sync bio into an async request.
840
	 */
841
	if (cfq_bio_sync(bio) && !rq_is_sync(rq))
842 843 844
		return 0;

	/*
845 846
	 * Lookup the cfqq that this bio will be queued with. Allow
	 * merge only if rq is queued there.
847
	 */
848
	cic = cfq_cic_lookup(cfqd, current->io_context);
849 850
	if (!cic)
		return 0;
851

852
	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
853 854
	if (cfqq == RQ_CFQQ(rq))
		return 1;
855

856
	return 0;
857 858
}

859 860
static void __cfq_set_active_queue(struct cfq_data *cfqd,
				   struct cfq_queue *cfqq)
861 862
{
	if (cfqq) {
863
		cfq_log_cfqq(cfqd, cfqq, "set_active");
864
		cfqq->slice_end = 0;
865 866 867
		cfqq->slice_dispatch = 0;

		cfq_clear_cfqq_wait_request(cfqq);
868
		cfq_clear_cfqq_must_dispatch(cfqq);
Jens Axboe's avatar
Jens Axboe committed
869 870
		cfq_clear_cfqq_must_alloc_slice(cfqq);
		cfq_clear_cfqq_fifo_expire(cfqq);
871
		cfq_mark_cfqq_slice_new(cfqq);
872 873

		del_timer(&cfqd->idle_slice_timer);
874 875 876 877 878
	}

	cfqd->active_queue = cfqq;
}

879 880 881 882 883
/*
 * current cfqq expired its slice (or was too idle), select new one
 */
static void
__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
884
		    int timed_out)
885
{
886 887
	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);

888 889 890 891 892 893
	if (cfq_cfqq_wait_request(cfqq))
		del_timer(&cfqd->idle_slice_timer);

	cfq_clear_cfqq_wait_request(cfqq);

	/*
894
	 * store what was left of this slice, if the queue idled/timed out
895
	 */
896
	if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
897
		cfqq->slice_resid = cfqq->slice_end - jiffies;
898 899
		cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
	}
900

901
	cfq_resort_rr_list(cfqd, cfqq);
902 903 904 905 906 907 908 909 910 911

	if (cfqq == cfqd->active_queue)
		cfqd->active_queue = NULL;

	if (cfqd->active_cic) {
		put_io_context(cfqd->active_cic->ioc);
		cfqd->active_cic = NULL;
	}
}

912
static inline void cfq_slice_expired(struct cfq_data *cfqd, int timed_out)
913 914 915 916
{
	struct cfq_queue *cfqq = cfqd->active_queue;

	if (cfqq)
917
		__cfq_slice_expired(cfqd, cfqq, timed_out);
918 919
}

920 921 922 923
/*
 * Get next queue for service. Unless we have a queue preemption,
 * we'll simply select the first cfqq in the service tree.
 */
Jens Axboe's avatar
Jens Axboe committed
924
static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
925
{
926 927
	if (RB_EMPTY_ROOT(&cfqd->service_tree.rb))
		return NULL;
928

929
	return cfq_rb_first(&cfqd->service_tree);
Jens Axboe's avatar
Jens Axboe committed
930 931
}

932 933 934
/*
 * Get and set a new active queue for service.
 *
 * When @cfqq is NULL the next queue from the service tree is used and
 * its coop flag is cleared. Returns the queue that was made active,
 * which may be NULL.
 */
static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
					      struct cfq_queue *cfqq)
{
	struct cfq_queue *chosen = cfqq;

	if (!chosen) {
		chosen = cfq_get_next_queue(cfqd);
		if (chosen)
			cfq_clear_cfqq_coop(chosen);
	}

	__cfq_set_active_queue(cfqd, chosen);
	return chosen;
}

948 949 950
static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
					  struct request *rq)
{
951 952
	if (blk_rq_pos(rq) >= cfqd->last_position)
		return blk_rq_pos(rq) - cfqd->last_position;
953
	else
954
		return cfqd->last_position - blk_rq_pos(rq);
955 956
}

957 958 959
/*
 * Seek-mean threshold above which an io context is treated as seeky.
 * The value is parenthesised so that it expands as a single operand
 * whatever operators surround it.
 */
#define CIC_SEEK_THR	(8 * 1024)
#define CIC_SEEKY(cic)	((cic)->seek_mean > CIC_SEEK_THR)

Jens Axboe's avatar
Jens Axboe committed
960 961 962
static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
{
	struct cfq_io_context *cic = cfqd->active_cic;
963
	sector_t sdist = cic->seek_mean;