cfq-iosched.c 76.4 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1 2 3 4 5 6
/*
 *  CFQ, or complete fairness queueing, disk scheduler.
 *
 *  Based on ideas from a previously unfinished io
 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
 *
 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/module.h>
Al Viro's avatar
Al Viro committed
10 11
#include <linux/blkdev.h>
#include <linux/elevator.h>
Randy Dunlap's avatar
Randy Dunlap committed
12
#include <linux/jiffies.h>
Linus Torvalds's avatar
Linus Torvalds committed
13
#include <linux/rbtree.h>
14
#include <linux/ioprio.h>
15
#include <linux/blktrace_api.h>
Linus Torvalds's avatar
Linus Torvalds committed
16 17 18 19

/*
 * tunables
 */
20 21
/* max queue in one round of service */
static const int cfq_quantum = 4;
22
static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
23 24 25 26
/* maximum backwards seek, in KiB */
static const int cfq_back_max = 16 * 1024;
/* penalty of a backwards seek */
static const int cfq_back_penalty = 2;
27
static const int cfq_slice_sync = HZ / 10;
Jens Axboe's avatar
Jens Axboe committed
28
static int cfq_slice_async = HZ / 25;
29
static const int cfq_slice_async_rq = 2;
30
static int cfq_slice_idle = HZ / 125;
31 32
static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
static const int cfq_hist_divisor = 4;
33

34
/*
35
 * offset from end of service tree
36
 */
37
#define CFQ_IDLE_DELAY		(HZ / 5)
38 39 40 41 42 43

/*
 * below this threshold, we consider thinktime immediate
 */
#define CFQ_MIN_TT		(2)

44 45 46 47 48 49
/*
 * Allow merged cfqqs to perform this amount of seeky I/O before
 * deciding to break the queues up again.
 */
#define CFQQ_COOP_TOUT		(HZ)

50
#define CFQ_SLICE_SCALE		(5)
51
#define CFQ_HW_QUEUE_MIN	(5)
52

53 54
/*
 * Accessors for the per-request elevator private pointers.
 *
 * Both expansions are fully parenthesised so that the cast is applied before
 * any postfix operator the caller appends, e.g. RQ_CFQQ(rq)->cfqd.
 */
#define RQ_CIC(rq)		\
	((struct cfq_io_context *) (rq)->elevator_private)
#define RQ_CFQQ(rq)		\
	((struct cfq_queue *) ((rq)->elevator_private2))
Linus Torvalds's avatar
Linus Torvalds committed
56

57 58
static struct kmem_cache *cfq_pool;
static struct kmem_cache *cfq_ioc_pool;
Linus Torvalds's avatar
Linus Torvalds committed
59

60
static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
61
static struct completion *ioc_gone;
62
static DEFINE_SPINLOCK(ioc_gone_lock);
63

64 65 66 67
#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)

68 69
#define sample_valid(samples)	((samples) > 80)

70 71 72 73 74 75 76 77 78
/*
 * Most of our rbtree usage is for sorting with min extraction, so
 * if we cache the leftmost node we don't have to walk down the tree
 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
 * move this into the elevator for the rq sorting as well.
 */
struct cfq_rb_root {
	/* the underlying rbtree */
	struct rb_root rb;
	/* cached leftmost node; NULL means "recompute on next lookup" */
	struct rb_node *left;
	/* number of queues currently linked into ->rb */
	unsigned int count;
};
#define CFQ_RB_ROOT	\
	(struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, .count = 0, }
82

83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
/*
 * Per process-grouping structure
 */
struct cfq_queue {
	/* reference count */
	atomic_t ref;
	/* various state flags, see below */
	unsigned int flags;
	/* parent cfq_data */
	struct cfq_data *cfqd;
	/* service_tree member */
	struct rb_node rb_node;
	/* service_tree key */
	unsigned long rb_key;
	/* prio tree member */
	struct rb_node p_node;
	/* prio tree root we belong to, if any */
	struct rb_root *p_root;
	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* requests queued in sort_list */
	int queued[2];
	/* currently allocated requests */
	int allocated[2];
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	unsigned long slice_end;
	long slice_resid;
	unsigned int slice_dispatch;

	/* pending metadata requests */
	int meta_pending;
	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;

	/* io prio of this group */
	unsigned short ioprio, org_ioprio;
	unsigned short ioprio_class, org_ioprio_class;

125 126 127 128
	unsigned int seek_samples;
	u64 seek_total;
	sector_t seek_mean;
	sector_t last_request_pos;
129
	unsigned long seeky_start;
130

131
	pid_t pid;
Jeff Moyer's avatar
Jeff Moyer committed
132

133
	struct cfq_rb_root *service_tree;
Jeff Moyer's avatar
Jeff Moyer committed
134
	struct cfq_queue *new_cfqq;
135 136
};

137
/*
 * First index in the service_trees.
 * IDLE is handled separately, so it has negative index
 */
enum wl_prio_t {
	IDLE_WORKLOAD	= -1,	/* served from the dedicated idle tree */
	BE_WORKLOAD	= 0,
	RT_WORKLOAD	= 1,
};

147 148 149 150 151 152 153 154 155 156
/*
 * Second index in the service_trees.
 */
enum wl_type_t {
	ASYNC_WORKLOAD		= 0,	/* queue is not marked sync */
	SYNC_NOIDLE_WORKLOAD	= 1,	/* sync queue without idle window */
	SYNC_WORKLOAD		= 2,	/* sync queue with idle window */
};


157 158 159
/*
 * Per block device queue structure
 */
Linus Torvalds's avatar
Linus Torvalds committed
160
struct cfq_data {
161
	struct request_queue *queue;
162 163

	/*
164 165 166
	 * rr lists of queues with requests, onle rr for each priority class.
	 * Counts are embedded in the cfq_rb_root
	 */
167
	struct cfq_rb_root service_trees[2][3];
168 169 170
	struct cfq_rb_root service_tree_idle;
	/*
	 * The priority currently being served
171
	 */
172
	enum wl_prio_t serving_prio;
173 174
	enum wl_type_t serving_type;
	unsigned long workload_expires;
175 176 177 178 179 180 181 182

	/*
	 * Each priority tree is sorted by next_request position.  These
	 * trees are used when determining if two or more queues are
	 * interleaving requests (see cfq_close_cooperator).
	 */
	struct rb_root prio_trees[CFQ_PRIO_LISTS];

183
	unsigned int busy_queues;
184
	unsigned int busy_queues_avg[2];
185

186
	int rq_in_driver[2];
187
	int sync_flight;
188 189 190 191 192

	/*
	 * queue-depth detection
	 */
	int rq_queued;
193
	int hw_tag;
194 195 196 197 198 199 200 201
	/*
	 * hw_tag can be
	 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
	 *  0 => no NCQ
	 */
	int hw_tag_est_depth;
	unsigned int hw_tag_samples;
Linus Torvalds's avatar
Linus Torvalds committed
202

203 204 205 206
	/*
	 * idle window management
	 */
	struct timer_list idle_slice_timer;
207
	struct work_struct unplug_work;
Linus Torvalds's avatar
Linus Torvalds committed
208

209 210 211
	struct cfq_queue *active_queue;
	struct cfq_io_context *active_cic;

212 213 214 215 216
	/*
	 * async queue for each priority case
	 */
	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
	struct cfq_queue *async_idle_cfqq;
217

Jens Axboe's avatar
Jens Axboe committed
218
	sector_t last_position;
Linus Torvalds's avatar
Linus Torvalds committed
219 220 221 222 223

	/*
	 * tunables, see top of file
	 */
	unsigned int cfq_quantum;
224
	unsigned int cfq_fifo_expire[2];
Linus Torvalds's avatar
Linus Torvalds committed
225 226
	unsigned int cfq_back_penalty;
	unsigned int cfq_back_max;
227 228 229
	unsigned int cfq_slice[2];
	unsigned int cfq_slice_async_rq;
	unsigned int cfq_slice_idle;
230
	unsigned int cfq_latency;
231 232

	struct list_head cic_list;
Linus Torvalds's avatar
Linus Torvalds committed
233

234 235 236 237
	/*
	 * Fallback dummy cfqq for extreme OOM conditions
	 */
	struct cfq_queue oom_cfqq;
238 239

	unsigned long last_end_sync_rq;
Linus Torvalds's avatar
Linus Torvalds committed
240 241
};

242
static struct cfq_rb_root *service_tree_for(enum wl_prio_t prio,
243
					    enum wl_type_t type,
244 245 246 247 248
					    struct cfq_data *cfqd)
{
	if (prio == IDLE_WORKLOAD)
		return &cfqd->service_tree_idle;

249
	return &cfqd->service_trees[prio][type];
250 251
}

Jens Axboe's avatar
Jens Axboe committed
252
/*
 * Bit numbers (not masks) for cfq_queue->flags. The accessors generated by
 * CFQ_CFQQ_FNS() shift 1 by these values, so the order must not change.
 */
enum cfqq_state_flags {
	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
};

#define CFQ_CFQQ_FNS(name)						\
static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
{									\
268
	(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
Jens Axboe's avatar
Jens Axboe committed
269 270 271
}									\
static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
{									\
272
	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
Jens Axboe's avatar
Jens Axboe committed
273 274 275
}									\
static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
{									\
276
	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
Jens Axboe's avatar
Jens Axboe committed
277 278 279 280
}

CFQ_CFQQ_FNS(on_rr);
CFQ_CFQQ_FNS(wait_request);
281
CFQ_CFQQ_FNS(must_dispatch);
Jens Axboe's avatar
Jens Axboe committed
282 283 284 285
CFQ_CFQQ_FNS(must_alloc_slice);
CFQ_CFQQ_FNS(fifo_expire);
CFQ_CFQQ_FNS(idle_window);
CFQ_CFQQ_FNS(prio_changed);
286
CFQ_CFQQ_FNS(slice_new);
287
CFQ_CFQQ_FNS(sync);
288
CFQ_CFQQ_FNS(coop);
Jens Axboe's avatar
Jens Axboe committed
289 290
#undef CFQ_CFQQ_FNS

291 292 293 294 295
/*
 * blktrace message helpers. cfq_log_cfqq() prefixes the message with
 * "cfq<pid> " using the queue's pid, cfq_log() with a plain "cfq ".
 */
#define cfq_log_cfqq(cfqd, cfqq, fmt, ...)				\
	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid,	\
			  ##__VA_ARGS__)
#define cfq_log(cfqd, fmt, ...)						\
	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##__VA_ARGS__)

296 297 298 299 300 301 302 303 304
static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
{
	if (cfq_class_idle(cfqq))
		return IDLE_WORKLOAD;
	if (cfq_class_rt(cfqq))
		return RT_WORKLOAD;
	return BE_WORKLOAD;
}

305 306 307 308 309 310 311 312 313 314

/*
 * Classify the queue for the second service_trees index: async queues,
 * sync queues without an idle window, and sync queues with one.
 */
static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
{
	if (cfq_cfqq_sync(cfqq))
		return cfq_cfqq_idle_window(cfqq) ? SYNC_WORKLOAD
						  : SYNC_NOIDLE_WORKLOAD;

	return ASYNC_WORKLOAD;
}

315 316 317 318 319
static inline int cfq_busy_queues_wl(enum wl_prio_t wl, struct cfq_data *cfqd)
{
	if (wl == IDLE_WORKLOAD)
		return cfqd->service_tree_idle.count;

320 321 322
	return cfqd->service_trees[wl][ASYNC_WORKLOAD].count
		+ cfqd->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
		+ cfqd->service_trees[wl][SYNC_WORKLOAD].count;
323 324
}

325
static void cfq_dispatch_insert(struct request_queue *, struct request *);
326
static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
327
				       struct io_context *, gfp_t);
328
static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
329 330
						struct io_context *);

331 332 333 334 335
/* Total number of requests accounted in rq_in_driver[], both slots summed. */
static inline int rq_in_driver(struct cfq_data *cfqd)
{
	const int *in_drv = cfqd->rq_in_driver;

	return in_drv[0] + in_drv[1];
}

336
static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
337
					    bool is_sync)
338
{
339
	return cic->cfqq[is_sync];
340 341 342
}

/*
 * Store @cfqq in @cic's slot selected by @is_sync
 * (slot 1 when true, slot 0 when false).
 */
static inline void cic_set_cfqq(struct cfq_io_context *cic,
				struct cfq_queue *cfqq, bool is_sync)
{
	cic->cfqq[is_sync] = cfqq;
}

/*
 * We regard a request as SYNC, if it's either a read or has the SYNC bit
 * set (in which case it could also be direct WRITE).
 */
352
static inline bool cfq_bio_sync(struct bio *bio)
353
{
354
	return bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO);
355
}
Linus Torvalds's avatar
Linus Torvalds committed
356

Andrew Morton's avatar
Andrew Morton committed
357 358 359 360
/*
 * scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing
 */
361
static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
Andrew Morton's avatar
Andrew Morton committed
362
{
363 364
	if (cfqd->busy_queues) {
		cfq_log(cfqd, "schedule dispatch");
365
		kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
366
	}
Andrew Morton's avatar
Andrew Morton committed
367 368
}

369
static int cfq_queue_empty(struct request_queue *q)
Andrew Morton's avatar
Andrew Morton committed
370 371 372
{
	struct cfq_data *cfqd = q->elevator->elevator_data;

373
	return !cfqd->busy_queues;
Andrew Morton's avatar
Andrew Morton committed
374 375
}

376 377 378 379 380
/*
 * Scale schedule slice based on io priority. Use the sync time slice only
 * if a queue is marked sync and has sync io queued. A sync queue with async
 * io only, should not get full sync slice length.
 */
381
static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
382
				 unsigned short prio)
383
{
384
	const int base_slice = cfqd->cfq_slice[sync];
385

386 387 388 389
	WARN_ON(prio >= IOPRIO_BE_NR);

	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
}
390

391 392 393 394
/* Slice length for @cfqq, from its sync flag and current io priority. */
static inline int
cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
}

397 398 399 400 401 402
/*
 * get averaged number of queues of RT/BE priority.
 * average is updated, with a formula that gives more weight to higher numbers,
 * to quickly follows sudden increases and decrease slowly
 */

403 404
static inline unsigned cfq_get_avg_queues(struct cfq_data *cfqd, bool rt)
{
405 406 407
	unsigned min_q, max_q;
	unsigned mult  = cfq_hist_divisor - 1;
	unsigned round = cfq_hist_divisor / 2;
408
	unsigned busy = cfq_busy_queues_wl(rt, cfqd);
409 410 411 412 413 414 415 416

	min_q = min(cfqd->busy_queues_avg[rt], busy);
	max_q = max(cfqd->busy_queues_avg[rt], busy);
	cfqd->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
		cfq_hist_divisor;
	return cfqd->busy_queues_avg[rt];
}

417 418 419
/*
 * Start a new slice for @cfqq: compute its length and set ->slice_end.
 *
 * When cfqd->cfq_latency is set and the expected latency (sync slice times
 * the averaged number of queues in the same priority class) exceeds
 * cfq_target_latency, the slice is shrunk proportionally, but never below
 * a floor derived from twice cfq_slice_idle.
 */
static inline void
cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	unsigned slice = cfq_prio_to_slice(cfqd, cfqq);

	if (cfqd->cfq_latency) {
		/* interested queues (we consider only the ones with the same
		 * priority class) */
		unsigned iq = cfq_get_avg_queues(cfqd, cfq_class_rt(cfqq));
		unsigned sync_slice = cfqd->cfq_slice[1];
		unsigned expect_latency = sync_slice * iq;

		if (expect_latency > cfq_target_latency) {
			unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
			/* scale low_slice according to IO priority
			 * and sync vs async */
			unsigned low_slice = base_low_slice * slice / sync_slice;
			/* the adapted slice value is scaled to fit all iqs
			 * into the target latency */
			unsigned scaled = slice * cfq_target_latency /
					  expect_latency;

			if (low_slice > slice)
				low_slice = slice;
			slice = scaled > low_slice ? scaled : low_slice;
		}
	}

	cfqq->slice_end = jiffies + slice;
	cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
}

/*
 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
 * isn't valid until the first request from the dispatch is activated
 * and the slice time set.
 */
448
static inline bool cfq_slice_used(struct cfq_queue *cfqq)
449 450 451 452 453 454 455 456 457
{
	if (cfq_cfqq_slice_new(cfqq))
		return 0;
	if (time_before(jiffies, cfqq->slice_end))
		return 0;

	return 1;
}

Linus Torvalds's avatar
Linus Torvalds committed
458
/*
Jens Axboe's avatar
Jens Axboe committed
459
 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
Linus Torvalds's avatar
Linus Torvalds committed
460
 * We choose the request that is closest to the head right now. Distance
461
 * behind the head is penalized and only allowed to a certain extent.
Linus Torvalds's avatar
Linus Torvalds committed
462
 */
Jens Axboe's avatar
Jens Axboe committed
463
static struct request *
464
cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
Linus Torvalds's avatar
Linus Torvalds committed
465
{
466
	sector_t s1, s2, d1 = 0, d2 = 0;
Linus Torvalds's avatar
Linus Torvalds committed
467
	unsigned long back_max;
468 469 470
#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
Linus Torvalds's avatar
Linus Torvalds committed
471

Jens Axboe's avatar
Jens Axboe committed
472 473 474 475
	if (rq1 == NULL || rq1 == rq2)
		return rq2;
	if (rq2 == NULL)
		return rq1;
476

Jens Axboe's avatar
Jens Axboe committed
477 478 479 480
	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
		return rq1;
	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
		return rq2;
481 482 483 484
	if (rq_is_meta(rq1) && !rq_is_meta(rq2))
		return rq1;
	else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
		return rq2;
Linus Torvalds's avatar
Linus Torvalds committed
485

486 487
	s1 = blk_rq_pos(rq1);
	s2 = blk_rq_pos(rq2);
Linus Torvalds's avatar
Linus Torvalds committed
488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503

	/*
	 * by definition, 1KiB is 2 sectors
	 */
	back_max = cfqd->cfq_back_max * 2;

	/*
	 * Strict one way elevator _except_ in the case where we allow
	 * short backward seeks which are biased as twice the cost of a
	 * similar forward seek.
	 */
	if (s1 >= last)
		d1 = s1 - last;
	else if (s1 + back_max >= last)
		d1 = (last - s1) * cfqd->cfq_back_penalty;
	else
504
		wrap |= CFQ_RQ1_WRAP;
Linus Torvalds's avatar
Linus Torvalds committed
505 506 507 508 509 510

	if (s2 >= last)
		d2 = s2 - last;
	else if (s2 + back_max >= last)
		d2 = (last - s2) * cfqd->cfq_back_penalty;
	else
511
		wrap |= CFQ_RQ2_WRAP;
Linus Torvalds's avatar
Linus Torvalds committed
512 513

	/* Found required data */
514 515 516 517 518 519

	/*
	 * By doing switch() on the bit mask "wrap" we avoid having to
	 * check two variables for all permutations: --> faster!
	 */
	switch (wrap) {
Jens Axboe's avatar
Jens Axboe committed
520
	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
521
		if (d1 < d2)
Jens Axboe's avatar
Jens Axboe committed
522
			return rq1;
523
		else if (d2 < d1)
Jens Axboe's avatar
Jens Axboe committed
524
			return rq2;
525 526
		else {
			if (s1 >= s2)
Jens Axboe's avatar
Jens Axboe committed
527
				return rq1;
528
			else
Jens Axboe's avatar
Jens Axboe committed
529
				return rq2;
530
		}
Linus Torvalds's avatar
Linus Torvalds committed
531

532
	case CFQ_RQ2_WRAP:
Jens Axboe's avatar
Jens Axboe committed
533
		return rq1;
534
	case CFQ_RQ1_WRAP:
Jens Axboe's avatar
Jens Axboe committed
535 536
		return rq2;
	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
537 538 539 540 541 542 543 544
	default:
		/*
		 * Since both rqs are wrapped,
		 * start with the one that's further behind head
		 * (--> only *one* back seek required),
		 * since back seek takes more time than forward.
		 */
		if (s1 <= s2)
Jens Axboe's avatar
Jens Axboe committed
545
			return rq1;
Linus Torvalds's avatar
Linus Torvalds committed
546
		else
Jens Axboe's avatar
Jens Axboe committed
547
			return rq2;
Linus Torvalds's avatar
Linus Torvalds committed
548 549 550
	}
}

551 552 553
/*
 * The below is leftmost cache rbtree addon
 */
554
static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
555 556 557 558
{
	if (!root->left)
		root->left = rb_first(&root->rb);

559 560 561 562
	if (root->left)
		return rb_entry(root->left, struct cfq_queue, rb_node);

	return NULL;
563 564
}

565 566 567 568 569 570
/*
 * Unlink @n from @root and mark it cleared, so that a later
 * RB_EMPTY_NODE() check on it reports "not on a tree".
 */
static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);

	RB_CLEAR_NODE(n);
}

571 572 573 574
/*
 * Remove @n from the service tree @root: drop the leftmost cache if it
 * pointed at @n, unlink and clear the node, and decrement the count.
 */
static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
{
	if (n == root->left)
		root->left = NULL;

	rb_erase_init(n, &root->rb);
	root->count--;
}

Linus Torvalds's avatar
Linus Torvalds committed
579 580 581
/*
 * would be nice to take fifo expire time into account as well
 */
Jens Axboe's avatar
Jens Axboe committed
582 583 584
static struct request *
cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		  struct request *last)
Linus Torvalds's avatar
Linus Torvalds committed
585
{
586 587
	struct rb_node *rbnext = rb_next(&last->rb_node);
	struct rb_node *rbprev = rb_prev(&last->rb_node);
Jens Axboe's avatar
Jens Axboe committed
588
	struct request *next = NULL, *prev = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
589

590
	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
Linus Torvalds's avatar
Linus Torvalds committed
591 592

	if (rbprev)
Jens Axboe's avatar
Jens Axboe committed
593
		prev = rb_entry_rq(rbprev);
Linus Torvalds's avatar
Linus Torvalds committed
594

595
	if (rbnext)
Jens Axboe's avatar
Jens Axboe committed
596
		next = rb_entry_rq(rbnext);
597 598 599
	else {
		rbnext = rb_first(&cfqq->sort_list);
		if (rbnext && rbnext != &last->rb_node)
Jens Axboe's avatar
Jens Axboe committed
600
			next = rb_entry_rq(rbnext);
601
	}
Linus Torvalds's avatar
Linus Torvalds committed
602

603
	return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
Linus Torvalds's avatar
Linus Torvalds committed
604 605
}

606 607
static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
				      struct cfq_queue *cfqq)
Linus Torvalds's avatar
Linus Torvalds committed
608
{
609 610 611 612
	struct cfq_rb_root *service_tree;

	service_tree = service_tree_for(cfqq_prio(cfqq), cfqq_type(cfqq), cfqd);

613 614 615
	/*
	 * just an approximation, should be ok.
	 */
616 617
	return  service_tree->count * (cfq_prio_slice(cfqd, 1, 0) -
		   cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
618 619
}

620
/*
621
 * The cfqd->service_trees holds all pending cfq_queue's that have
622 623 624
 * requests waiting to be processed. It is sorted in the order that
 * we will service the queues.
 */
625
static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
626
				 bool add_front)
627
{
628 629
	struct rb_node **p, *parent;
	struct cfq_queue *__cfqq;
630
	unsigned long rb_key;
631
	struct cfq_rb_root *service_tree;
632
	int left;
633

634
	service_tree = service_tree_for(cfqq_prio(cfqq), cfqq_type(cfqq), cfqd);
635 636
	if (cfq_class_idle(cfqq)) {
		rb_key = CFQ_IDLE_DELAY;
637
		parent = rb_last(&service_tree->rb);
638 639 640 641 642 643
		if (parent && parent != &cfqq->rb_node) {
			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
			rb_key += __cfqq->rb_key;
		} else
			rb_key += jiffies;
	} else if (!add_front) {
644 645 646 647 648 649
		/*
		 * Get our rb key offset. Subtract any residual slice
		 * value carried from last service. A negative resid
		 * count indicates slice overrun, and this should position
		 * the next service time further away in the tree.
		 */
650
		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
651
		rb_key -= cfqq->slice_resid;
652
		cfqq->slice_resid = 0;
653 654
	} else {
		rb_key = -HZ;
655
		__cfqq = cfq_rb_first(service_tree);
656 657
		rb_key += __cfqq ? __cfqq->rb_key : jiffies;
	}
Linus Torvalds's avatar
Linus Torvalds committed
658

659
	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
660
		/*
661
		 * same position, nothing more to do
662
		 */
663 664
		if (rb_key == cfqq->rb_key &&
		    cfqq->service_tree == service_tree)
665
			return;
Linus Torvalds's avatar
Linus Torvalds committed
666

667 668
		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
		cfqq->service_tree = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
669
	}
670

671
	left = 1;
672
	parent = NULL;
673 674
	cfqq->service_tree = service_tree;
	p = &service_tree->rb.rb_node;
675
	while (*p) {
676
		struct rb_node **n;
677

678 679 680
		parent = *p;
		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);

681
		/*
682
		 * sort by key, that represents service time.
683
		 */
684
		if (time_before(rb_key, __cfqq->rb_key))
685
			n = &(*p)->rb_left;
686
		else {
687
			n = &(*p)->rb_right;
688
			left = 0;
689
		}
690 691

		p = n;
692 693
	}

694
	if (left)
695
		service_tree->left = &cfqq->rb_node;
696

697 698
	cfqq->rb_key = rb_key;
	rb_link_node(&cfqq->rb_node, parent, p);
699 700
	rb_insert_color(&cfqq->rb_node, &service_tree->rb);
	service_tree->count++;
Linus Torvalds's avatar
Linus Torvalds committed
701 702
}

703
static struct cfq_queue *
704 705 706
cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
		     sector_t sector, struct rb_node **ret_parent,
		     struct rb_node ***rb_link)
707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722
{
	struct rb_node **p, *parent;
	struct cfq_queue *cfqq = NULL;

	parent = NULL;
	p = &root->rb_node;
	while (*p) {
		struct rb_node **n;

		parent = *p;
		cfqq = rb_entry(parent, struct cfq_queue, p_node);

		/*
		 * Sort strictly based on sector.  Smallest to the left,
		 * largest to the right.
		 */
723
		if (sector > blk_rq_pos(cfqq->next_rq))
724
			n = &(*p)->rb_right;
725
		else if (sector < blk_rq_pos(cfqq->next_rq))
726 727 728 729
			n = &(*p)->rb_left;
		else
			break;
		p = n;
730
		cfqq = NULL;
731 732 733 734 735
	}

	*ret_parent = parent;
	if (rb_link)
		*rb_link = p;
736
	return cfqq;
737 738 739 740 741 742 743
}

static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	struct rb_node **p, *parent;
	struct cfq_queue *__cfqq;

744 745 746 747
	if (cfqq->p_root) {
		rb_erase(&cfqq->p_node, cfqq->p_root);
		cfqq->p_root = NULL;
	}
748 749 750 751 752 753

	if (cfq_class_idle(cfqq))
		return;
	if (!cfqq->next_rq)
		return;

754
	cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
755 756
	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
				      blk_rq_pos(cfqq->next_rq), &parent, &p);
757 758
	if (!__cfqq) {
		rb_link_node(&cfqq->p_node, parent, p);
759 760 761
		rb_insert_color(&cfqq->p_node, cfqq->p_root);
	} else
		cfqq->p_root = NULL;
762 763
}

764 765 766
/*
 * Update cfqq's position in the service tree.
 */
static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	/*
	 * Resorting requires the cfqq to be on the RR list already.
	 */
	if (!cfq_cfqq_on_rr(cfqq))
		return;

	cfq_service_tree_add(cfqd, cfqq, 0);
	cfq_prio_tree_add(cfqd, cfqq);
}

Linus Torvalds's avatar
Linus Torvalds committed
778 779
/*
 * add to busy list of queues for service, trying to be fair in ordering
780
 * the pending list according to last request service
Linus Torvalds's avatar
Linus Torvalds committed
781
 */
782
static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
Linus Torvalds's avatar
Linus Torvalds committed
783
{
784
	cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
Jens Axboe's avatar
Jens Axboe committed
785 786
	BUG_ON(cfq_cfqq_on_rr(cfqq));
	cfq_mark_cfqq_on_rr(cfqq);
Linus Torvalds's avatar
Linus Torvalds committed
787 788
	cfqd->busy_queues++;

789
	cfq_resort_rr_list(cfqd, cfqq);
Linus Torvalds's avatar
Linus Torvalds committed
790 791
}

792 793 794 795
/*
 * Called when the cfqq no longer has requests pending, remove it from
 * the service tree.
 */
796
static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
Linus Torvalds's avatar
Linus Torvalds committed
797
{
798
	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
Jens Axboe's avatar
Jens Axboe committed
799 800
	BUG_ON(!cfq_cfqq_on_rr(cfqq));
	cfq_clear_cfqq_on_rr(cfqq);
Linus Torvalds's avatar
Linus Torvalds committed
801

802 803 804 805
	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
		cfqq->service_tree = NULL;
	}
806 807 808 809
	if (cfqq->p_root) {
		rb_erase(&cfqq->p_node, cfqq->p_root);
		cfqq->p_root = NULL;
	}
810

Linus Torvalds's avatar
Linus Torvalds committed
811 812 813 814 815 816 817
	BUG_ON(!cfqd->busy_queues);
	cfqd->busy_queues--;
}

/*
 * rb tree support functions
 */
818
static void cfq_del_rq_rb(struct request *rq)
Linus Torvalds's avatar
Linus Torvalds committed
819
{
Jens Axboe's avatar
Jens Axboe committed
820
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
821
	struct cfq_data *cfqd = cfqq->cfqd;
Jens Axboe's avatar
Jens Axboe committed
822
	const int sync = rq_is_sync(rq);
Linus Torvalds's avatar
Linus Torvalds committed
823

824 825
	BUG_ON(!cfqq->queued[sync]);
	cfqq->queued[sync]--;
Linus Torvalds's avatar
Linus Torvalds committed
826

Jens Axboe's avatar
Jens Axboe committed
827
	elv_rb_del(&cfqq->sort_list, rq);
Linus Torvalds's avatar
Linus Torvalds committed
828

829
	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
830
		cfq_del_cfqq_rr(cfqd, cfqq);
Linus Torvalds's avatar
Linus Torvalds committed
831 832
}

Jens Axboe's avatar
Jens Axboe committed
833
static void cfq_add_rq_rb(struct request *rq)
Linus Torvalds's avatar
Linus Torvalds committed
834
{
Jens Axboe's avatar
Jens Axboe committed
835
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
Linus Torvalds's avatar
Linus Torvalds committed
836
	struct cfq_data *cfqd = cfqq->cfqd;
837
	struct request *__alias, *prev;
Linus Torvalds's avatar
Linus Torvalds committed
838

839
	cfqq->queued[rq_is_sync(rq)]++;
Linus Torvalds's avatar
Linus Torvalds committed
840 841 842 843 844

	/*
	 * looks a little odd, but the first insert might return an alias.
	 * if that happens, put the alias on the dispatch list
	 */
845
	while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
Jens Axboe's avatar
Jens Axboe committed
846
		cfq_dispatch_insert(cfqd->queue, __alias);
847 848 849

	if (!cfq_cfqq_on_rr(cfqq))
		cfq_add_cfqq_rr(cfqd, cfqq);
850 851 852 853

	/*
	 * check if this request is a better next-serve candidate
	 */
854
	prev = cfqq->next_rq;
855
	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
856 857 858 859 860 861 862

	/*
	 * adjust priority tree position, if ->next_rq changes
	 */
	if (prev != cfqq->next_rq)
		cfq_prio_tree_add(cfqd, cfqq);

863
	BUG_ON(!cfqq->next_rq);
Linus Torvalds's avatar
Linus Torvalds committed
864 865
}

866
static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
Linus Torvalds's avatar
Linus Torvalds committed
867
{
868 869
	elv_rb_del(&cfqq->sort_list, rq);
	cfqq->queued[rq_is_sync(rq)]--;
Jens Axboe's avatar
Jens Axboe committed
870
	cfq_add_rq_rb(rq);
Linus Torvalds's avatar
Linus Torvalds committed
871 872
}

873 874
static struct request *
cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
Linus Torvalds's avatar
Linus Torvalds committed
875
{
876
	struct task_struct *tsk = current;
877
	struct cfq_io_context *cic;
878
	struct cfq_queue *cfqq;
Linus Torvalds's avatar
Linus Torvalds committed
879

880
	cic = cfq_cic_lookup(cfqd, tsk->io_context);
881 882 883 884
	if (!cic)
		return NULL;

	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
885 886 887
	if (cfqq) {
		sector_t sector = bio->bi_sector + bio_sectors(bio);

888
		return elv_rb_find(&cfqq->sort_list, sector);
889
	}
Linus Torvalds's avatar
Linus Torvalds committed
890 891 892 893

	return NULL;
}

894
static void cfq_activate_request(struct request_queue *q, struct request *rq)
Linus Torvalds's avatar
Linus Torvalds committed
895
{
896
	struct cfq_data *cfqd = q->elevator->elevator_data;
Jens Axboe's avatar
Jens Axboe committed
897

898
	cfqd->rq_in_driver[rq_is_sync(rq)]++;
899
	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
900
						rq_in_driver(cfqd));
901

902
	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
Linus Torvalds's avatar
Linus Torvalds committed
903 904
}

905
static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
Linus Torvalds's avatar
Linus Torvalds committed
906
{
907
	struct cfq_data *cfqd = q->elevator->elevator_data;
908
	const int sync = rq_is_sync(rq);
909

910 911
	WARN_ON(!cfqd->rq_in_driver[sync]);
	cfqd->rq_in_driver[sync]--;
912
	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
913
						rq_in_driver(cfqd));
Linus Torvalds's avatar
Linus Torvalds committed
914 915
}

916
static void cfq_remove_request(struct request *rq)
Linus Torvalds's avatar
Linus Torvalds committed
917
{
Jens Axboe's avatar
Jens Axboe committed
918
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
919

Jens Axboe's avatar
Jens Axboe committed
920 921
	if (cfqq->next_rq == rq)
		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
Linus Torvalds's avatar
Linus Torvalds committed
922

923
	list_del_init(&rq->queuelist);
Jens Axboe's avatar
Jens Axboe committed
924
	cfq_del_rq_rb(rq);
925

926
	cfqq->cfqd->rq_queued--;
927 928 929 930
	if (rq_is_meta(rq)) {
		WARN_ON(!cfqq->meta_pending);
		cfqq->meta_pending--;
	}
Linus Torvalds's avatar
Linus Torvalds committed
931 932
}

933 934
static int cfq_merge(struct request_queue *q, struct request **req,
		     struct bio *bio)
Linus Torvalds's avatar
Linus Torvalds committed
935 936 937 938
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct request *__rq;

939
	__rq = cfq_find_rq_fmerge(cfqd, bio);
940
	if (__rq && elv_rq_merge_ok(__rq, bio)) {
941 942
		*req = __rq;
		return ELEVATOR_FRONT_MERGE;
Linus Torvalds's avatar
Linus Torvalds committed
943 944 945 946 947
	}

	return ELEVATOR_NO_MERGE;
}

948
static void cfq_merged_request(struct request_queue *q, struct request *req,
949
			       int type)
Linus Torvalds's avatar
Linus Torvalds committed
950
{
951
	if (type == ELEVATOR_FRONT_MERGE) {
Jens Axboe's avatar
Jens Axboe committed
952
		struct cfq_queue *cfqq = RQ_CFQQ(req);
Linus Torvalds's avatar
Linus Torvalds committed
953

Jens Axboe's avatar
Jens Axboe committed
954
		cfq_reposition_rq_rb(cfqq, req);
Linus Torvalds's avatar
Linus Torvalds committed
955 956 957 958
	}
}

static void
959
cfq_merged_requests(struct request_queue *q, struct request *rq,
Linus Torvalds's avatar
Linus Torvalds committed
960 961
		    struct request *next)
{
962
	struct cfq_queue *cfqq = RQ_CFQQ(rq);