/*
 *  CFQ, or complete fairness queueing, disk scheduler.
 *
 *  Based on ideas from a previously unfinished io
 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
 *
 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/jiffies.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
#include <linux/blk-cgroup.h>
#include "blk.h"

/*
 * tunables
 */
/* max queue in one round of service */
static const int cfq_quantum = 8;
static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
/* maximum backwards seek, in KiB */
static const int cfq_back_max = 16 * 1024;
/* penalty of a backwards seek */
static const int cfq_back_penalty = 2;
static const int cfq_slice_sync = HZ / 10;
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;
static int cfq_group_idle = HZ / 125;
static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
static const int cfq_hist_divisor = 4;
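/*
 * Illustration (assuming HZ == 1000): the defaults above give a 100ms
 * sync slice (HZ / 10), a 40ms async slice (HZ / 25) and an 8ms idle
 * window (HZ / 125), all expressed in jiffies.
 */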

/*
 * offset from end of service tree
 */
#define CFQ_IDLE_DELAY		(HZ / 5)

/*
 * below this threshold, we consider thinktime immediate
 */
#define CFQ_MIN_TT		(2)

#define CFQ_SLICE_SCALE		(5)
#define CFQ_HW_QUEUE_MIN	(5)
#define CFQ_SERVICE_SHIFT       12

#define CFQQ_SEEK_THR		(sector_t)(8 * 100)
#define CFQQ_CLOSE_THR		(sector_t)(8 * 1024)
#define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
#define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)
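/*
 * Reading CFQQ_SEEKY() above: seek_history is used as a 32-sample bit
 * window, so a queue counts as seeky once more than 32/8 == 4 of its
 * last 32 tracked requests looked like seeks (illustrative note; the
 * history bits are updated elsewhere in this file).
 */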

#define RQ_CIC(rq)		icq_to_cic((rq)->elv.icq)
#define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elv.priv[0])
#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elv.priv[1])

static struct kmem_cache *cfq_pool;

#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)

#define sample_valid(samples)	((samples) > 80)
#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)

/* blkio-related constants */
#define CFQ_WEIGHT_LEGACY_MIN	10
#define CFQ_WEIGHT_LEGACY_DFL	500
#define CFQ_WEIGHT_LEGACY_MAX	1000

struct cfq_ttime {
	unsigned long last_end_request;

	unsigned long ttime_total;
	unsigned long ttime_samples;
	unsigned long ttime_mean;
};

/*
 * Most of our rbtree usage is for sorting with min extraction, so
 * if we cache the leftmost node we don't have to walk down the tree
 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
 * move this into the elevator for the rq sorting as well.
 */
struct cfq_rb_root {
	struct rb_root rb;
	struct rb_node *left;
	unsigned count;
	u64 min_vdisktime;
	struct cfq_ttime ttime;
};
#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, \
			.ttime = {.last_end_request = jiffies,},}

/*
 * Per process-grouping structure
 */
struct cfq_queue {
	/* reference count */
	int ref;
	/* various state flags, see below */
	unsigned int flags;
	/* parent cfq_data */
	struct cfq_data *cfqd;
	/* service_tree member */
	struct rb_node rb_node;
	/* service_tree key */
	unsigned long rb_key;
	/* prio tree member */
	struct rb_node p_node;
	/* prio tree root we belong to, if any */
	struct rb_root *p_root;
	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* requests queued in sort_list */
	int queued[2];
	/* currently allocated requests */
	int allocated[2];
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* time when queue got scheduled in to dispatch first request. */
	unsigned long dispatch_start;
	unsigned int allocated_slice;
	unsigned int slice_dispatch;
	/* time when first request from queue completed and slice started. */
	unsigned long slice_start;
	unsigned long slice_end;
	long slice_resid;

	/* pending priority requests */
	int prio_pending;
	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;

	/* io prio of this group */
	unsigned short ioprio, org_ioprio;
	unsigned short ioprio_class;

	pid_t pid;

	u32 seek_history;
	sector_t last_request_pos;

	struct cfq_rb_root *service_tree;
	struct cfq_queue *new_cfqq;
	struct cfq_group *cfqg;
	/* Number of sectors dispatched from queue in single dispatch round */
	unsigned long nr_sectors;
};

/*
 * First index in the service_trees.
 * IDLE is handled separately, so it has negative index
 */
enum wl_class_t {
	BE_WORKLOAD = 0,
	RT_WORKLOAD = 1,
	IDLE_WORKLOAD = 2,
	CFQ_PRIO_NR,
};

/*
 * Second index in the service_trees.
 */
enum wl_type_t {
	ASYNC_WORKLOAD = 0,
	SYNC_NOIDLE_WORKLOAD = 1,
	SYNC_WORKLOAD = 2
};

struct cfqg_stats {
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	/* number of ios merged */
	struct blkg_rwstat		merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat		service_time;
	/* total time spent waiting in scheduler queue in ns */
	struct blkg_rwstat		wait_time;
	/* number of IOs queued up */
	struct blkg_rwstat		queued;
	/* total disk time and nr sectors dispatched by this group */
	struct blkg_stat		time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	/* time not charged to this cgroup */
	struct blkg_stat		unaccounted_time;
	/* sum of number of ios queued across all samples */
	struct blkg_stat		avg_queue_size_sum;
	/* count of samples taken for average */
	struct blkg_stat		avg_queue_size_samples;
	/* how many times this group has been removed from service tree */
	struct blkg_stat		dequeue;
	/* total time spent waiting for it to be assigned a timeslice. */
	struct blkg_stat		group_wait_time;
	/* time spent idling for this blkcg_gq */
	struct blkg_stat		idle_time;
	/* total time with empty current active q with other requests queued */
	struct blkg_stat		empty_time;
	/* fields after this shouldn't be cleared on stat reset */
	uint64_t			start_group_wait_time;
	uint64_t			start_idle_time;
	uint64_t			start_empty_time;
	uint16_t			flags;
#endif	/* CONFIG_DEBUG_BLK_CGROUP */
#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
};

/* Per-cgroup data */
struct cfq_group_data {
	/* must be the first member */
	struct blkcg_policy_data cpd;

	unsigned int weight;
	unsigned int leaf_weight;
};

/* This is per cgroup per device grouping structure */
struct cfq_group {
	/* must be the first member */
	struct blkg_policy_data pd;

	/* group service_tree member */
	struct rb_node rb_node;

	/* group service_tree key */
	u64 vdisktime;

	/*
	 * The number of active cfqgs and sum of their weights under this
	 * cfqg.  This covers this cfqg's leaf_weight and all children's
	 * weights, but does not cover weights of further descendants.
	 *
	 * If a cfqg is on the service tree, it's active.  An active cfqg
	 * also activates its parent and contributes to the children_weight
	 * of the parent.
	 */
	int nr_active;
	unsigned int children_weight;

	/*
	 * vfraction is the fraction of vdisktime that the tasks in this
	 * cfqg are entitled to.  This is determined by compounding the
	 * ratios walking up from this cfqg to the root.
	 *
	 * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
	 * vfractions on a service tree is approximately 1.  The sum may
	 * deviate a bit due to rounding errors and fluctuations caused by
	 * cfqgs entering and leaving the service tree.
	 */
	unsigned int vfraction;

	/*
	 * There are two weights - (internal) weight is the weight of this
	 * cfqg against the sibling cfqgs.  leaf_weight is the weight of
	 * this cfqg against the child cfqgs.  For the root cfqg, both
	 * weights are kept in sync for backward compatibility.
	 */
	unsigned int weight;
	unsigned int new_weight;
	unsigned int dev_weight;

	unsigned int leaf_weight;
	unsigned int new_leaf_weight;
	unsigned int dev_leaf_weight;

	/* number of cfqq currently on this group */
	int nr_cfqq;

	/*
	 * Per group busy queues average. Useful for workload slice calc. We
	 * create the array for each prio class but at run time it is used
	 * only for RT and BE class and slot for IDLE class remains unused.
	 * This is primarily done to avoid confusion and a gcc warning.
	 */
	unsigned int busy_queues_avg[CFQ_PRIO_NR];
	/*
	 * rr lists of queues with requests. We maintain service trees for
	 * RT and BE classes. These trees are subdivided in subclasses
	 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
	 * class there is no subclassification and all the cfq queues go on
	 * a single tree service_tree_idle.
	 * Counts are embedded in the cfq_rb_root
	 */
	struct cfq_rb_root service_trees[2][3];
	struct cfq_rb_root service_tree_idle;

	unsigned long saved_wl_slice;
	enum wl_type_t saved_wl_type;
	enum wl_class_t saved_wl_class;

	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;
	struct cfq_ttime ttime;
	struct cfqg_stats stats;	/* stats for this cfqg */

	/* async queue for each priority case */
	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
	struct cfq_queue *async_idle_cfqq;

};

struct cfq_io_cq {
	struct io_cq		icq;		/* must be the first member */
	struct cfq_queue	*cfqq[2];
	struct cfq_ttime	ttime;
	int			ioprio;		/* the current ioprio */
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	uint64_t		blkcg_serial_nr; /* the current blkcg serial */
#endif
};

/*
 * Per block device queue structure
 */
struct cfq_data {
	struct request_queue *queue;
	/* Root service tree for cfq_groups */
	struct cfq_rb_root grp_service_tree;
	struct cfq_group *root_group;

	/*
	 * The priority currently being served
	 */
	enum wl_class_t serving_wl_class;
	enum wl_type_t serving_wl_type;
	unsigned long workload_expires;
	struct cfq_group *serving_group;

	/*
	 * Each priority tree is sorted by next_request position.  These
	 * trees are used when determining if two or more queues are
	 * interleaving requests (see cfq_close_cooperator).
	 */
	struct rb_root prio_trees[CFQ_PRIO_LISTS];

	unsigned int busy_queues;
	unsigned int busy_sync_queues;

	int rq_in_driver;
	int rq_in_flight[2];

	/*
	 * queue-depth detection
	 */
	int rq_queued;
	int hw_tag;
	/*
	 * hw_tag can be
	 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
	 *  0 => no NCQ
	 */
	int hw_tag_est_depth;
	unsigned int hw_tag_samples;

	/*
	 * idle window management
	 */
	struct timer_list idle_slice_timer;
	struct work_struct unplug_work;

	struct cfq_queue *active_queue;
	struct cfq_io_cq *active_cic;

	sector_t last_position;

	/*
	 * tunables, see top of file
	 */
	unsigned int cfq_quantum;
	unsigned int cfq_fifo_expire[2];
	unsigned int cfq_back_penalty;
	unsigned int cfq_back_max;
	unsigned int cfq_slice[2];
	unsigned int cfq_slice_async_rq;
	unsigned int cfq_slice_idle;
	unsigned int cfq_group_idle;
	unsigned int cfq_latency;
	unsigned int cfq_target_latency;

	/*
	 * Fallback dummy cfqq for extreme OOM conditions
	 */
	struct cfq_queue oom_cfqq;

	unsigned long last_delayed_sync;
};

static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
static void cfq_put_queue(struct cfq_queue *cfqq);

static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
					    enum wl_class_t class,
					    enum wl_type_t type)
{
	if (!cfqg)
		return NULL;

	if (class == IDLE_WORKLOAD)
		return &cfqg->service_tree_idle;

	return &cfqg->service_trees[class][type];
}

enum cfqq_state_flags {
	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be split */
	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
};

#define CFQ_CFQQ_FNS(name)						\
static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
{									\
	(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
{									\
	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
{									\
	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
}
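/*
 * For reference: each CFQ_CFQQ_FNS(name) invocation below expands the
 * macro above into cfq_mark_cfqq_##name(), cfq_clear_cfqq_##name() and
 * cfq_cfqq_##name(), e.g. CFQ_CFQQ_FNS(on_rr) yields cfq_mark_cfqq_on_rr()
 * to set the CFQ_CFQQ_FLAG_on_rr bit in cfqq->flags.
 */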

CFQ_CFQQ_FNS(on_rr);
CFQ_CFQQ_FNS(wait_request);
CFQ_CFQQ_FNS(must_dispatch);
CFQ_CFQQ_FNS(must_alloc_slice);
CFQ_CFQQ_FNS(fifo_expire);
CFQ_CFQQ_FNS(idle_window);
CFQ_CFQQ_FNS(prio_changed);
CFQ_CFQQ_FNS(slice_new);
CFQ_CFQQ_FNS(sync);
CFQ_CFQQ_FNS(coop);
CFQ_CFQQ_FNS(split_coop);
CFQ_CFQQ_FNS(deep);
CFQ_CFQQ_FNS(wait_busy);
#undef CFQ_CFQQ_FNS

#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)

/* cfqg stats flags */
enum cfqg_stats_flags {
	CFQG_stats_waiting = 0,
	CFQG_stats_idling,
	CFQG_stats_empty,
};

#define CFQG_FLAG_FNS(name)						\
static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats)	\
{									\
	stats->flags |= (1 << CFQG_stats_##name);			\
}									\
static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats)	\
{									\
	stats->flags &= ~(1 << CFQG_stats_##name);			\
}									\
static inline int cfqg_stats_##name(struct cfqg_stats *stats)		\
{									\
	return (stats->flags & (1 << CFQG_stats_##name)) != 0;		\
}									\

CFQG_FLAG_FNS(waiting)
CFQG_FLAG_FNS(idling)
CFQG_FLAG_FNS(empty)
#undef CFQG_FLAG_FNS

/* This should be called with the queue_lock held. */
static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
{
	unsigned long long now;

	if (!cfqg_stats_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		blkg_stat_add(&stats->group_wait_time,
			      now - stats->start_group_wait_time);
	cfqg_stats_clear_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
						 struct cfq_group *curr_cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	if (cfqg_stats_waiting(stats))
		return;
	if (cfqg == curr_cfqg)
		return;
	stats->start_group_wait_time = sched_clock();
	cfqg_stats_mark_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
{
	unsigned long long now;

	if (!cfqg_stats_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		blkg_stat_add(&stats->empty_time,
			      now - stats->start_empty_time);
	cfqg_stats_clear_empty(stats);
}

static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
{
	blkg_stat_add(&cfqg->stats.dequeue, 1);
}

static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	if (blkg_rwstat_total(&stats->queued))
		return;

	/*
	 * group is already marked empty. This can happen if cfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (cfqg_stats_empty(stats))
		return;

	stats->start_empty_time = sched_clock();
	cfqg_stats_mark_empty(stats);
}

static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	if (cfqg_stats_idling(stats)) {
		unsigned long long now = sched_clock();

		if (time_after64(now, stats->start_idle_time))
			blkg_stat_add(&stats->idle_time,
				      now - stats->start_idle_time);
		cfqg_stats_clear_idling(stats);
	}
}

static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	BUG_ON(cfqg_stats_idling(stats));

	stats->start_idle_time = sched_clock();
	cfqg_stats_mark_idling(stats);
}

static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	blkg_stat_add(&stats->avg_queue_size_sum,
		      blkg_rwstat_total(&stats->queued));
	blkg_stat_add(&stats->avg_queue_size_samples, 1);
	cfqg_stats_update_group_wait_time(stats);
}

#else	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */

static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }

#endif	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */

#ifdef CONFIG_CFQ_GROUP_IOSCHED

static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct cfq_group, pd) : NULL;
}

static struct cfq_group_data
*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
{
	return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
}

static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
{
	return pd_to_blkg(&cfqg->pd);
}

static struct blkcg_policy blkcg_policy_cfq;

static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
{
	return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
}

static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
{
	return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
}

static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
{
	struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;

	return pblkg ? blkg_to_cfqg(pblkg) : NULL;
}

static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
				      struct cfq_group *ancestor)
{
	return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup,
				    cfqg_to_blkg(ancestor)->blkcg->css.cgroup);
}

static inline void cfqg_get(struct cfq_group *cfqg)
{
	return blkg_get(cfqg_to_blkg(cfqg));
}

static inline void cfqg_put(struct cfq_group *cfqg)
{
	return blkg_put(cfqg_to_blkg(cfqg));
}

#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf));	\
	blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \
			cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
			cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
			  __pbuf, ##args);				\
} while (0)

#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf));		\
	blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args);	\
} while (0)

static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
					    struct cfq_group *curr_cfqg, int op,
					    int op_flags)
{
	blkg_rwstat_add(&cfqg->stats.queued, op, op_flags, 1);
	cfqg_stats_end_empty_time(&cfqg->stats);
	cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
}

static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
			unsigned long time, unsigned long unaccounted_time)
{
	blkg_stat_add(&cfqg->stats.time, time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
#endif
}

static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int op,
					       int op_flags)
{
	blkg_rwstat_add(&cfqg->stats.queued, op, op_flags, -1);
}

static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int op,
					       int op_flags)
{
	blkg_rwstat_add(&cfqg->stats.merged, op, op_flags, 1);
}

static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
			uint64_t start_time, uint64_t io_start_time, int op,
			int op_flags)
{
	struct cfqg_stats *stats = &cfqg->stats;
	unsigned long long now = sched_clock();

	if (time_after64(now, io_start_time))
		blkg_rwstat_add(&stats->service_time, op, op_flags,
				now - io_start_time);
	if (time_after64(io_start_time, start_time))
		blkg_rwstat_add(&stats->wait_time, op, op_flags,
				io_start_time - start_time);
}

/* @stats = 0 */
static void cfqg_stats_reset(struct cfqg_stats *stats)
{
	/* queued stats shouldn't be cleared */
	blkg_rwstat_reset(&stats->merged);
	blkg_rwstat_reset(&stats->service_time);
	blkg_rwstat_reset(&stats->wait_time);
	blkg_stat_reset(&stats->time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg_stat_reset(&stats->unaccounted_time);
	blkg_stat_reset(&stats->avg_queue_size_sum);
	blkg_stat_reset(&stats->avg_queue_size_samples);
	blkg_stat_reset(&stats->dequeue);
	blkg_stat_reset(&stats->group_wait_time);
	blkg_stat_reset(&stats->idle_time);
	blkg_stat_reset(&stats->empty_time);
#endif
}

/* @to += @from */
static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
{
	/* queued stats shouldn't be cleared */
	blkg_rwstat_add_aux(&to->merged, &from->merged);
	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
	blkg_stat_add_aux(&to->time, &from->time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
	blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
#endif
}

/*
 * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
 * recursive stats can still account for the amount used by this cfqg after
 * it's gone.
 */
static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
{
	struct cfq_group *parent = cfqg_parent(cfqg);

	lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);

	if (unlikely(!parent))
		return;

	cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
	cfqg_stats_reset(&cfqg->stats);
}

#else	/* CONFIG_CFQ_GROUP_IOSCHED */

static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
				      struct cfq_group *ancestor)
{
	return true;
}
static inline void cfqg_get(struct cfq_group *cfqg) { }
static inline void cfqg_put(struct cfq_group *cfqg) { }

#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid,	\
			cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
			cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
				##args)
#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)

static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
			struct cfq_group *curr_cfqg, int op, int op_flags) { }
static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
			unsigned long time, unsigned long unaccounted_time) { }
static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int op,
			int op_flags) { }
static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int op,
			int op_flags) { }
static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
			uint64_t start_time, uint64_t io_start_time, int op,
			int op_flags) { }

#endif	/* CONFIG_CFQ_GROUP_IOSCHED */

#define cfq_log(cfqd, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)

/* Traverses through cfq group service trees */
#define for_each_cfqg_st(cfqg, i, j, st) \
	for (i = 0; i <= IDLE_WORKLOAD; i++) \
		for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
			: &cfqg->service_tree_idle; \
			(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
			(i == IDLE_WORKLOAD && j == 0); \
			j++, st = i < IDLE_WORKLOAD ? \
			&cfqg->service_trees[i][j]: NULL) \

static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
	struct cfq_ttime *ttime, bool group_idle)
{
	unsigned long slice;
	if (!sample_valid(ttime->ttime_samples))
		return false;
	if (group_idle)
		slice = cfqd->cfq_group_idle;
	else
		slice = cfqd->cfq_slice_idle;
	return ttime->ttime_mean > slice;
}

static inline bool iops_mode(struct cfq_data *cfqd)
{
	/*
	 * If we are not idling on queues and it is a NCQ drive, parallel
	 * execution of requests is on and measuring time is not possible
	 * in most of the cases until and unless we drive shallower queue
	 * depths and that becomes a performance bottleneck. In such cases
	 * switch to start providing fairness in terms of number of IOs.
	 */
	if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
		return true;
	else
		return false;
}

static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
{
	if (cfq_class_idle(cfqq))
		return IDLE_WORKLOAD;
	if (cfq_class_rt(cfqq))
		return RT_WORKLOAD;
	return BE_WORKLOAD;
}


static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
{
	if (!cfq_cfqq_sync(cfqq))
		return ASYNC_WORKLOAD;
	if (!cfq_cfqq_idle_window(cfqq))
		return SYNC_NOIDLE_WORKLOAD;
	return SYNC_WORKLOAD;
}

static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
					struct cfq_data *cfqd,
					struct cfq_group *cfqg)
{
	if (wl_class == IDLE_WORKLOAD)
		return cfqg->service_tree_idle.count;

	return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
		cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
		cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
}

static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
					struct cfq_group *cfqg)
{
	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
		cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
}

static void cfq_dispatch_insert(struct request_queue *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
				       struct cfq_io_cq *cic, struct bio *bio);

static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
{
	/* cic->icq is the first member, %NULL will convert to %NULL */
	return container_of(icq, struct cfq_io_cq, icq);
}

static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
					       struct io_context *ioc)
{
	if (ioc)
		return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
	return NULL;
}

static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
{
	return cic->cfqq[is_sync];
}

static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
				bool is_sync)
{
	cic->cfqq[is_sync] = cfqq;
}

static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
{
	return cic->icq.q->elevator->elevator_data;
}

/*
 * We regard a request as SYNC, if it's either a read or has the SYNC bit
 * set (in which case it could also be direct WRITE).
 */
static inline bool cfq_bio_sync(struct bio *bio)
{
	return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
}

/*
 * scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing
 */
static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
{
	if (cfqd->busy_queues) {
		cfq_log(cfqd, "schedule dispatch");
		kblockd_schedule_work(&cfqd->unplug_work);
	}
}

/*
 * Scale schedule slice based on io priority. Use the sync time slice only
 * if a queue is marked sync and has sync io queued. A sync queue with async
 * io only, should not get full sync slice length.
 */
static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
				 unsigned short prio)
{
	const int base_slice = cfqd->cfq_slice[sync];

	WARN_ON(prio >= IOPRIO_BE_NR);

	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
}
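/*
 * Worked example for cfq_prio_slice() (assuming the default
 * cfq_slice_sync of HZ / 10, i.e. 100ms at HZ == 1000): the default
 * ioprio of 4 keeps the base 100ms slice, ioprio 0 gets
 * 100 + (100/5) * (4 - 0) = 180ms, and ioprio 7 gets
 * 100 + (100/5) * (4 - 7) = 40ms.
 */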

static inline int
cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
}

/**
 * cfqg_scale_charge - scale disk time charge according to cfqg weight
 * @charge: disk time being charged
 * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
 *
 * Scale @charge according to @vfraction, which is in range (0, 1].  The
 * scaling is inversely proportional.
 *
 * scaled = charge / vfraction
 *
 * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
 */
static inline u64 cfqg_scale_charge(unsigned long charge,
				    unsigned int vfraction)
{
	u64 c = charge << CFQ_SERVICE_SHIFT;	/* make it fixed point */

	/* charge / vfraction */
	c <<= CFQ_SERVICE_SHIFT;
	do_div(c, vfraction);
	return c;
}
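/*
 * Worked example (illustrative): with CFQ_SERVICE_SHIFT == 12, a cfqg
 * entitled to half of the device has vfraction == 1 << 11, so a charge
 * of 8 is scaled to the fixed-point equivalent of 16; lower vfractions
 * make vdisktime advance proportionally faster.
 */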

static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
{
	s64 delta = (s64)(vdisktime - min_vdisktime);
	if (delta > 0)
		min_vdisktime = vdisktime;

	return min_vdisktime;
}

static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
{
	s64 delta = (s64)(vdisktime - min_vdisktime);
	if (delta < 0)
		min_vdisktime = vdisktime;

	return min_vdisktime;
}

static void update_min_vdisktime(struct cfq_rb_root *st)
{
	struct cfq_group *cfqg;

	if (st->left) {
		cfqg = rb_entry_cfqg(st->left);
		st->min_vdisktime = max_vdisktime(st->min_vdisktime,
						  cfqg->vdisktime);
	}
}

/*
 * get averaged number of queues of RT/BE priority.
 * average is updated, with a formula that gives more weight to higher numbers,
 * to quickly follow sudden increases and decrease slowly
 */

static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
					struct cfq_group *cfqg, bool rt)
{
	unsigned min_q, max_q;
	unsigned mult  = cfq_hist_divisor - 1;
	unsigned round = cfq_hist_divisor / 2;
	unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);

	min_q = min(cfqg->busy_queues_avg[rt], busy);
	max_q = max(cfqg->busy_queues_avg[rt], busy);
	cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
		cfq_hist_divisor;
	return cfqg->busy_queues_avg[rt];
}
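/*
 * Example of the averaging above (illustrative): with cfq_hist_divisor
 * of 4, an average of 2 busy queues jumps to (3 * 6 + 2 + 2) / 4 = 5 as
 * soon as 6 queues are busy, while an average of 6 only decays to 5 when
 * the busy count drops back to 2.
 */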

static inline unsigned
cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
}
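/*
 * Illustration of cfq_group_slice() above: with the default 300ms
 * cfq_target_latency, a group whose vfraction works out to one quarter
 * (1 << (CFQ_SERVICE_SHIFT - 2)) receives roughly 75ms of each
 * scheduling round.
 */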

static inline unsigned
cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
	if (cfqd->cfq_latency) {
		/*
		 * interested queues (we consider only the ones with the same
		 * priority class in the cfq group)
		 */
		unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
						cfq_class_rt(cfqq));
		unsigned sync_slice = cfqd->cfq_slice[1];
		unsigned expect_latency = sync_slice * iq;
		unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);

		if (expect_latency > group_slice) {
			unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
			/* scale low_slice according to IO priority
			 * and sync vs async */
			unsigned low_slice =
				min(slice, base_low_slice * slice / sync_slice);
			/* the adapted slice value is scaled to fit all iqs
			 * into the target latency */
			slice = max(slice * group_slice / expect_latency,
				    low_slice);
		}
	}
	return slice;
}

static inline void
cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);

	cfqq->slice_start = jiffies;
	cfqq->slice_end = jiffies + slice;
	cfqq->allocated_slice = slice;
	cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
}

/*
 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
 * isn't valid until the first request from the dispatch is activated
 * and the slice time set.
 */
static inline bool cfq_slice_used(struct cfq_queue *cfqq)
{
	if (cfq_cfqq_slice_new(cfqq))
		return false;
	if (time_before(jiffies, cfqq->slice_end))
		return false;

	return true;
}

/*
 * Lifted from AS - choose which of rq1 and rq2 is best served now.
 * We choose the request that is closest to the head right now. Distance
 * behind the head is penalized and only allowed to a certain extent.
 */
static struct request *
cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
{
	sector_t s1, s2, d1 = 0, d2 = 0;
	unsigned long back_max;
#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
	unsigned wrap = 0; /* bit mask: requests behind the disk head? */

	if (rq1 == NULL || rq1 == rq2)
		return rq2;
	if (rq2 == NULL)
		return rq1;