/*
 *  CFQ, or complete fairness queueing, disk scheduler.
 *
 *  Based on ideas from a previously unfinished io
 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
 *
 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/jiffies.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
#include <linux/blk-cgroup.h>
#include "blk.h"

/*
 * tunables
 */
/* max queue in one round of service */
static const int cfq_quantum = 8;
static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
/* maximum backwards seek, in KiB */
static const int cfq_back_max = 16 * 1024;
/* penalty of a backwards seek */
static const int cfq_back_penalty = 2;
static const int cfq_slice_sync = HZ / 10;
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;
static int cfq_group_idle = HZ / 125;
static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
static const int cfq_hist_divisor = 4;

/*
 * offset from end of service tree
 */
#define CFQ_IDLE_DELAY		(HZ / 5)

/*
 * below this threshold, we consider thinktime immediate
 */
#define CFQ_MIN_TT		(2)

#define CFQ_SLICE_SCALE		(5)
#define CFQ_HW_QUEUE_MIN	(5)
#define CFQ_SERVICE_SHIFT       12

#define CFQQ_SEEK_THR		(sector_t)(8 * 100)
#define CFQQ_CLOSE_THR		(sector_t)(8 * 1024)
#define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
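/*
 * A queue is reported seeky by CFQQ_SEEKY() once more than 32/8 = 4 bits
 * are set in its 32-request ->seek_history window.
 */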
#define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)

#define RQ_CIC(rq)		icq_to_cic((rq)->elv.icq)
#define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elv.priv[0])
#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elv.priv[1])

static struct kmem_cache *cfq_pool;

#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)

#define sample_valid(samples)	((samples) > 80)
#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)

/* blkio-related constants */
#define CFQ_WEIGHT_MIN          10
#define CFQ_WEIGHT_MAX          1000
#define CFQ_WEIGHT_DEFAULT      500

struct cfq_ttime {
	unsigned long last_end_request;

	unsigned long ttime_total;
	unsigned long ttime_samples;
	unsigned long ttime_mean;
};

/*
 * Most of our rbtree usage is for sorting with min extraction, so
 * if we cache the leftmost node we don't have to walk down the tree
 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
 * move this into the elevator for the rq sorting as well.
 */
struct cfq_rb_root {
	struct rb_root rb;
	struct rb_node *left;
	unsigned count;
	u64 min_vdisktime;
	struct cfq_ttime ttime;
};
#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, \
			.ttime = {.last_end_request = jiffies,},}

/*
 * Per process-grouping structure
 */
struct cfq_queue {
	/* reference count */
	int ref;
	/* various state flags, see below */
	unsigned int flags;
	/* parent cfq_data */
	struct cfq_data *cfqd;
	/* service_tree member */
	struct rb_node rb_node;
	/* service_tree key */
	unsigned long rb_key;
	/* prio tree member */
	struct rb_node p_node;
	/* prio tree root we belong to, if any */
	struct rb_root *p_root;
	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* requests queued in sort_list */
	int queued[2];
	/* currently allocated requests */
	int allocated[2];
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* time when queue got scheduled in to dispatch first request. */
	unsigned long dispatch_start;
	unsigned int allocated_slice;
	unsigned int slice_dispatch;
	/* time when first request from queue completed and slice started. */
	unsigned long slice_start;
	unsigned long slice_end;
	long slice_resid;

	/* pending priority requests */
	int prio_pending;
	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;

	/* io prio of this group */
	unsigned short ioprio, org_ioprio;
	unsigned short ioprio_class;

	pid_t pid;

	u32 seek_history;
	sector_t last_request_pos;

	struct cfq_rb_root *service_tree;
	struct cfq_queue *new_cfqq;
	struct cfq_group *cfqg;
	/* Number of sectors dispatched from queue in single dispatch round */
	unsigned long nr_sectors;
};

/*
 * First index in the service_trees.
 * IDLE is handled separately, so it has negative index
 */
enum wl_class_t {
	BE_WORKLOAD = 0,
	RT_WORKLOAD = 1,
	IDLE_WORKLOAD = 2,
	CFQ_PRIO_NR,
};

/*
 * Second index in the service_trees.
 */
enum wl_type_t {
	ASYNC_WORKLOAD = 0,
	SYNC_NOIDLE_WORKLOAD = 1,
	SYNC_WORKLOAD = 2
};

struct cfqg_stats {
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	/* number of ios merged */
	struct blkg_rwstat		merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat		service_time;
	/* total time spent waiting in scheduler queue in ns */
	struct blkg_rwstat		wait_time;
	/* number of IOs queued up */
	struct blkg_rwstat		queued;
	/* total disk time and nr sectors dispatched by this group */
	struct blkg_stat		time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	/* time not charged to this cgroup */
	struct blkg_stat		unaccounted_time;
	/* sum of number of ios queued across all samples */
	struct blkg_stat		avg_queue_size_sum;
	/* count of samples taken for average */
	struct blkg_stat		avg_queue_size_samples;
	/* how many times this group has been removed from service tree */
	struct blkg_stat		dequeue;
	/* total time spent waiting for it to be assigned a timeslice. */
	struct blkg_stat		group_wait_time;
	/* time spent idling for this blkcg_gq */
	struct blkg_stat		idle_time;
	/* total time with empty current active q with other requests queued */
	struct blkg_stat		empty_time;
	/* fields after this shouldn't be cleared on stat reset */
	uint64_t			start_group_wait_time;
	uint64_t			start_idle_time;
	uint64_t			start_empty_time;
	uint16_t			flags;
#endif	/* CONFIG_DEBUG_BLK_CGROUP */
#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
};

/* Per-cgroup data */
struct cfq_group_data {
	/* must be the first member */
	struct blkcg_policy_data cpd;

	unsigned int weight;
	unsigned int leaf_weight;
};

/* This is per cgroup per device grouping structure */
struct cfq_group {
	/* must be the first member */
	struct blkg_policy_data pd;

	/* group service_tree member */
	struct rb_node rb_node;

	/* group service_tree key */
	u64 vdisktime;

	/*
	 * The number of active cfqgs and sum of their weights under this
	 * cfqg.  This covers this cfqg's leaf_weight and all children's
	 * weights, but does not cover weights of further descendants.
	 *
	 * If a cfqg is on the service tree, it's active.  An active cfqg
	 * also activates its parent and contributes to the children_weight
	 * of the parent.
	 */
	int nr_active;
	unsigned int children_weight;

	/*
	 * vfraction is the fraction of vdisktime that the tasks in this
	 * cfqg are entitled to.  This is determined by compounding the
	 * ratios walking up from this cfqg to the root.
	 *
	 * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
	 * vfractions on a service tree is approximately 1.  The sum may
	 * deviate a bit due to rounding errors and fluctuations caused by
	 * cfqgs entering and leaving the service tree.
	 */
	unsigned int vfraction;

	/*
	 * There are two weights - (internal) weight is the weight of this
	 * cfqg against the sibling cfqgs.  leaf_weight is the weight of
	 * this cfqg against the child cfqgs.  For the root cfqg, both
	 * weights are kept in sync for backward compatibility.
	 */
	unsigned int weight;
	unsigned int new_weight;
	unsigned int dev_weight;

	unsigned int leaf_weight;
	unsigned int new_leaf_weight;
	unsigned int dev_leaf_weight;

	/* number of cfqq currently on this group */
	int nr_cfqq;

	/*
	 * Per group busy queues average. Useful for workload slice calc. We
	 * create the array for each prio class but at run time it is used
	 * only for RT and BE class and slot for IDLE class remains unused.
	 * This is primarily done to avoid confusion and a gcc warning.
	 */
	unsigned int busy_queues_avg[CFQ_PRIO_NR];
	/*
	 * rr lists of queues with requests. We maintain service trees for
	 * RT and BE classes. These trees are subdivided in subclasses
	 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
	 * class there is no subclassification and all the cfq queues go on
	 * a single tree service_tree_idle.
	 * Counts are embedded in the cfq_rb_root
	 */
	struct cfq_rb_root service_trees[2][3];
	struct cfq_rb_root service_tree_idle;

	unsigned long saved_wl_slice;
	enum wl_type_t saved_wl_type;
	enum wl_class_t saved_wl_class;

	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;
	struct cfq_ttime ttime;
	struct cfqg_stats stats;	/* stats for this cfqg */

	/* async queue for each priority case */
	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
	struct cfq_queue *async_idle_cfqq;

};

struct cfq_io_cq {
	struct io_cq		icq;		/* must be the first member */
	struct cfq_queue	*cfqq[2];
	struct cfq_ttime	ttime;
	int			ioprio;		/* the current ioprio */
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	uint64_t		blkcg_serial_nr; /* the current blkcg serial */
#endif
};

/*
 * Per block device queue structure
 */
struct cfq_data {
	struct request_queue *queue;
	/* Root service tree for cfq_groups */
	struct cfq_rb_root grp_service_tree;
	struct cfq_group *root_group;

	/*
	 * The priority currently being served
	 */
	enum wl_class_t serving_wl_class;
	enum wl_type_t serving_wl_type;
	unsigned long workload_expires;
	struct cfq_group *serving_group;

	/*
	 * Each priority tree is sorted by next_request position.  These
	 * trees are used when determining if two or more queues are
	 * interleaving requests (see cfq_close_cooperator).
	 */
	struct rb_root prio_trees[CFQ_PRIO_LISTS];

	unsigned int busy_queues;
	unsigned int busy_sync_queues;

	int rq_in_driver;
	int rq_in_flight[2];

	/*
	 * queue-depth detection
	 */
	int rq_queued;
	int hw_tag;
	/*
	 * hw_tag can be
	 * -1 => indeterminate (cfq will behave as if NCQ is present, to allow better detection)
	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
	 *  0 => no NCQ
	 */
	int hw_tag_est_depth;
	unsigned int hw_tag_samples;

	/*
	 * idle window management
	 */
	struct timer_list idle_slice_timer;
	struct work_struct unplug_work;

	struct cfq_queue *active_queue;
	struct cfq_io_cq *active_cic;

	sector_t last_position;

	/*
	 * tunables, see top of file
	 */
	unsigned int cfq_quantum;
	unsigned int cfq_fifo_expire[2];
	unsigned int cfq_back_penalty;
	unsigned int cfq_back_max;
	unsigned int cfq_slice[2];
	unsigned int cfq_slice_async_rq;
	unsigned int cfq_slice_idle;
	unsigned int cfq_group_idle;
	unsigned int cfq_latency;
	unsigned int cfq_target_latency;

	/*
	 * Fallback dummy cfqq for extreme OOM conditions
	 */
	struct cfq_queue oom_cfqq;

	unsigned long last_delayed_sync;
};

static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
static void cfq_put_queue(struct cfq_queue *cfqq);

static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
					    enum wl_class_t class,
					    enum wl_type_t type)
{
	if (!cfqg)
		return NULL;

	if (class == IDLE_WORKLOAD)
		return &cfqg->service_tree_idle;

	return &cfqg->service_trees[class][type];
}
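
/*
 * Queues look up the service tree they are sorted on as
 * st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq)).
 */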

enum cfqq_state_flags {
	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be split */
	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
};

#define CFQ_CFQQ_FNS(name)						\
static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
{									\
	(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
{									\
	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
{									\
	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
}
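
/*
 * Each CFQ_CFQQ_FNS(name) invocation below generates three helpers:
 * cfq_mark_cfqq_##name(), cfq_clear_cfqq_##name() and cfq_cfqq_##name(),
 * which set, clear and test the CFQ_CFQQ_FLAG_##name bit in cfqq->flags.
 */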

CFQ_CFQQ_FNS(on_rr);
CFQ_CFQQ_FNS(wait_request);
CFQ_CFQQ_FNS(must_dispatch);
CFQ_CFQQ_FNS(must_alloc_slice);
CFQ_CFQQ_FNS(fifo_expire);
CFQ_CFQQ_FNS(idle_window);
CFQ_CFQQ_FNS(prio_changed);
CFQ_CFQQ_FNS(slice_new);
CFQ_CFQQ_FNS(sync);
CFQ_CFQQ_FNS(coop);
CFQ_CFQQ_FNS(split_coop);
CFQ_CFQQ_FNS(deep);
CFQ_CFQQ_FNS(wait_busy);
#undef CFQ_CFQQ_FNS

#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)

/* cfqg stats flags */
enum cfqg_stats_flags {
	CFQG_stats_waiting = 0,
	CFQG_stats_idling,
	CFQG_stats_empty,
};

#define CFQG_FLAG_FNS(name)						\
static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats)	\
{									\
	stats->flags |= (1 << CFQG_stats_##name);			\
}									\
static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats)	\
{									\
	stats->flags &= ~(1 << CFQG_stats_##name);			\
}									\
static inline int cfqg_stats_##name(struct cfqg_stats *stats)		\
{									\
	return (stats->flags & (1 << CFQG_stats_##name)) != 0;		\
}									\

CFQG_FLAG_FNS(waiting)
CFQG_FLAG_FNS(idling)
CFQG_FLAG_FNS(empty)
#undef CFQG_FLAG_FNS

/* This should be called with the queue_lock held. */
static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
{
	unsigned long long now;

	if (!cfqg_stats_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		blkg_stat_add(&stats->group_wait_time,
			      now - stats->start_group_wait_time);
	cfqg_stats_clear_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
						 struct cfq_group *curr_cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	if (cfqg_stats_waiting(stats))
		return;
	if (cfqg == curr_cfqg)
		return;
	stats->start_group_wait_time = sched_clock();
	cfqg_stats_mark_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
{
	unsigned long long now;

	if (!cfqg_stats_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		blkg_stat_add(&stats->empty_time,
			      now - stats->start_empty_time);
	cfqg_stats_clear_empty(stats);
}

static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
{
	blkg_stat_add(&cfqg->stats.dequeue, 1);
}

static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	if (blkg_rwstat_total(&stats->queued))
		return;

	/*
	 * group is already marked empty. This can happen if cfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (cfqg_stats_empty(stats))
		return;

	stats->start_empty_time = sched_clock();
	cfqg_stats_mark_empty(stats);
}

static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	if (cfqg_stats_idling(stats)) {
		unsigned long long now = sched_clock();

		if (time_after64(now, stats->start_idle_time))
			blkg_stat_add(&stats->idle_time,
				      now - stats->start_idle_time);
		cfqg_stats_clear_idling(stats);
	}
}

static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	BUG_ON(cfqg_stats_idling(stats));

	stats->start_idle_time = sched_clock();
	cfqg_stats_mark_idling(stats);
}

static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	blkg_stat_add(&stats->avg_queue_size_sum,
		      blkg_rwstat_total(&stats->queued));
	blkg_stat_add(&stats->avg_queue_size_samples, 1);
	cfqg_stats_update_group_wait_time(stats);
}

#else	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */

static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }

#endif	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */

#ifdef CONFIG_CFQ_GROUP_IOSCHED

static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct cfq_group, pd) : NULL;
}

static struct cfq_group_data
*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
{
	return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
}

static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
{
	return pd_to_blkg(&cfqg->pd);
}

static struct blkcg_policy blkcg_policy_cfq;

static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
{
	return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
}

static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
{
	return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
}

static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
{
	struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;

	return pblkg ? blkg_to_cfqg(pblkg) : NULL;
}

static inline void cfqg_get(struct cfq_group *cfqg)
{
	return blkg_get(cfqg_to_blkg(cfqg));
}

static inline void cfqg_put(struct cfq_group *cfqg)
{
	return blkg_put(cfqg_to_blkg(cfqg));
}

#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf));	\
	blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \
			cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
			cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
			  __pbuf, ##args);				\
} while (0)

#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf));		\
	blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args);	\
} while (0)
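
/*
 * cfq_log_cfqq() messages show up in blktrace prefixed with
 * "cfq<pid><S|A><N| > <cgroup path>"; cfq_log_cfqg() messages are only
 * prefixed with the cgroup path.
 */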

static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
					    struct cfq_group *curr_cfqg, int rw)
{
	blkg_rwstat_add(&cfqg->stats.queued, rw, 1);
	cfqg_stats_end_empty_time(&cfqg->stats);
	cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
}

static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
			unsigned long time, unsigned long unaccounted_time)
{
	blkg_stat_add(&cfqg->stats.time, time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
#endif
}

static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw)
{
	blkg_rwstat_add(&cfqg->stats.queued, rw, -1);
}

static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
{
	blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
}

static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
			uint64_t start_time, uint64_t io_start_time, int rw)
{
	struct cfqg_stats *stats = &cfqg->stats;
	unsigned long long now = sched_clock();

	if (time_after64(now, io_start_time))
		blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
	if (time_after64(io_start_time, start_time))
		blkg_rwstat_add(&stats->wait_time, rw,
				io_start_time - start_time);
}

/* @stats = 0 */
static void cfqg_stats_reset(struct cfqg_stats *stats)
{
	/* queued stats shouldn't be cleared */
	blkg_rwstat_reset(&stats->merged);
	blkg_rwstat_reset(&stats->service_time);
	blkg_rwstat_reset(&stats->wait_time);
	blkg_stat_reset(&stats->time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg_stat_reset(&stats->unaccounted_time);
	blkg_stat_reset(&stats->avg_queue_size_sum);
	blkg_stat_reset(&stats->avg_queue_size_samples);
	blkg_stat_reset(&stats->dequeue);
	blkg_stat_reset(&stats->group_wait_time);
	blkg_stat_reset(&stats->idle_time);
	blkg_stat_reset(&stats->empty_time);
#endif
}

/* @to += @from */
static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
{
	/* queued stats shouldn't be cleared */
	blkg_rwstat_add_aux(&to->merged, &from->merged);
	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
	blkg_stat_add_aux(&to->time, &from->time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
	blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
#endif
}

/*
 * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
 * recursive stats can still account for the amount used by this cfqg after
 * it's gone.
 */
static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
{
	struct cfq_group *parent = cfqg_parent(cfqg);

	lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);

	if (unlikely(!parent))
		return;

	cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
	cfqg_stats_reset(&cfqg->stats);
}

#else	/* CONFIG_CFQ_GROUP_IOSCHED */

static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
static inline void cfqg_get(struct cfq_group *cfqg) { }
static inline void cfqg_put(struct cfq_group *cfqg) { }

#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid,	\
			cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
			cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
				##args)
#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)

static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
			struct cfq_group *curr_cfqg, int rw) { }
static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
			unsigned long time, unsigned long unaccounted_time) { }
static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
			uint64_t start_time, uint64_t io_start_time, int rw) { }

#endif	/* CONFIG_CFQ_GROUP_IOSCHED */

#define cfq_log(cfqd, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)

/* Traverses through cfq group service trees */
#define for_each_cfqg_st(cfqg, i, j, st) \
	for (i = 0; i <= IDLE_WORKLOAD; i++) \
		for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
			: &cfqg->service_tree_idle; \
			(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
			(i == IDLE_WORKLOAD && j == 0); \
			j++, st = i < IDLE_WORKLOAD ? \
			&cfqg->service_trees[i][j]: NULL) \
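
/*
 * for_each_cfqg_st() visits all seven per-group service trees: the BE and
 * RT rows of service_trees[][] (ASYNC, SYNC_NOIDLE and SYNC in each),
 * followed by service_tree_idle.
 */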

static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
	struct cfq_ttime *ttime, bool group_idle)
{
	unsigned long slice;
	if (!sample_valid(ttime->ttime_samples))
		return false;
	if (group_idle)
		slice = cfqd->cfq_group_idle;
	else
		slice = cfqd->cfq_slice_idle;
	return ttime->ttime_mean > slice;
}

static inline bool iops_mode(struct cfq_data *cfqd)
{
	/*
	 * If we are not idling on queues and it is an NCQ drive, parallel
	 * execution of requests is on and measuring time is not possible
	 * in most of the cases until and unless we drive shallower queue
	 * depths and that becomes a performance bottleneck. In such cases
	 * switch to start providing fairness in terms of number of IOs.
	 */
	if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
		return true;
	else
		return false;
}

static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
{
	if (cfq_class_idle(cfqq))
		return IDLE_WORKLOAD;
	if (cfq_class_rt(cfqq))
		return RT_WORKLOAD;
	return BE_WORKLOAD;
}


static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
{
	if (!cfq_cfqq_sync(cfqq))
		return ASYNC_WORKLOAD;
	if (!cfq_cfqq_idle_window(cfqq))
		return SYNC_NOIDLE_WORKLOAD;
	return SYNC_WORKLOAD;
}

static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
					struct cfq_data *cfqd,
					struct cfq_group *cfqg)
{
	if (wl_class == IDLE_WORKLOAD)
		return cfqg->service_tree_idle.count;

	return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
		cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
		cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
}

static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
					struct cfq_group *cfqg)
{
	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
		cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
}

static void cfq_dispatch_insert(struct request_queue *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
				       struct cfq_io_cq *cic, struct bio *bio);

static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
{
	/* cic->icq is the first member, %NULL will convert to %NULL */
	return container_of(icq, struct cfq_io_cq, icq);
}

static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
					       struct io_context *ioc)
{
	if (ioc)
		return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
	return NULL;
}

static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
{
	return cic->cfqq[is_sync];
}

static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
				bool is_sync)
{
	cic->cfqq[is_sync] = cfqq;
}

static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
{
	return cic->icq.q->elevator->elevator_data;
}

/*
 * We regard a request as SYNC if it's either a read or has the SYNC bit
 * set (in which case it could also be a direct WRITE).
 */
static inline bool cfq_bio_sync(struct bio *bio)
{
	return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
}

/*
 * scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing
 */
static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
{
	if (cfqd->busy_queues) {
		cfq_log(cfqd, "schedule dispatch");
		kblockd_schedule_work(&cfqd->unplug_work);
	}
}

/*
 * Scale schedule slice based on io priority. Use the sync time slice only
 * if a queue is marked sync and has sync io queued. A sync queue with async
 * io only, should not get full sync slice length.
 */
static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
				 unsigned short prio)
{
	const int base_slice = cfqd->cfq_slice[sync];

	WARN_ON(prio >= IOPRIO_BE_NR);

	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
}
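
/*
 * For example, with HZ=1000 the default sync base slice (cfq_slice_sync)
 * is HZ/10 = 100 jiffies and CFQ_SLICE_SCALE is 5, so ioprio 0 gets
 * 100 + 20 * 4 = 180 jiffies, the default ioprio 4 gets 100 and ioprio 7
 * gets 100 - 20 * 3 = 40.
 */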

static inline int
cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
935 936
}

937 938 939 940 941 942 943 944 945 946 947 948 949 950
/**
 * cfqg_scale_charge - scale disk time charge according to cfqg weight
 * @charge: disk time being charged
 * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
 *
 * Scale @charge according to @vfraction, which is in range (0, 1].  The
 * scaling is inversely proportional.
 *
 * scaled = charge / vfraction
 *
 * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
 */
static inline u64 cfqg_scale_charge(unsigned long charge,
				    unsigned int vfraction)
{
	u64 c = charge << CFQ_SERVICE_SHIFT;	/* make it fixed point */

	/* charge / vfraction */
	c <<= CFQ_SERVICE_SHIFT;
	do_div(c, vfraction);
	return c;
}
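
/*
 * For example, with CFQ_SERVICE_SHIFT = 12 a vfraction of 2048 means the
 * group is entitled to half of the device's service.  Charging 100 time
 * units then yields (100 << 24) / 2048 = 819200, i.e. 200 in fixed point,
 * so the group's vdisktime advances at twice the rate it was charged.
 */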

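/*
 * The vdisktime helpers below compare through a signed delta so that the
 * comparisons keep working when the unsigned 64-bit counters eventually
 * wrap, the same trick CFS uses for vruntime.
 */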
static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
{
	s64 delta = (s64)(vdisktime - min_vdisktime);
	if (delta > 0)
		min_vdisktime = vdisktime;

	return min_vdisktime;
}

static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
{
	s64 delta = (s64)(vdisktime - min_vdisktime);
	if (delta < 0)
		min_vdisktime = vdisktime;

	return min_vdisktime;
}

static void update_min_vdisktime(struct cfq_rb_root *st)
{
	struct cfq_group *cfqg;

	if (st->left) {
		cfqg = rb_entry_cfqg(st->left);
		st->min_vdisktime = max_vdisktime(st->min_vdisktime,
						  cfqg->vdisktime);
	}
}

/*
 * get the average number of queues of RT/BE priority. The average is
 * updated with a formula that gives more weight to higher numbers, so it
 * follows sudden increases quickly and decays slowly.
 */

static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
					struct cfq_group *cfqg, bool rt)
{
	unsigned min_q, max_q;
	unsigned mult  = cfq_hist_divisor - 1;
	unsigned round = cfq_hist_divisor / 2;
	unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);

	min_q = min(cfqg->busy_queues_avg[rt], busy);
	max_q = max(cfqg->busy_queues_avg[rt], busy);
	cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
		cfq_hist_divisor;
	return cfqg->busy_queues_avg[rt];
}
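
/*
 * For example, with cfq_hist_divisor = 4 a jump from an average of one busy
 * queue to five moves the average to (3 * 5 + 1 + 2) / 4 = 4 in a single
 * update, while a drop from five busy queues back to one still leaves it
 * at 4, so sudden increases are followed quickly and decay is slow.
 */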

static inline unsigned
cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;