/*
 *  CFQ, or complete fairness queueing, disk scheduler.
 *
 *  Based on ideas from a previously unfinished io
 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
 *
 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sched/clock.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
#include <linux/blk-cgroup.h>
#include "blk.h"
#include "blk-wbt.h"

/*
 * tunables
 */
/* max queue in one round of service */
static const int cfq_quantum = 8;
static const u64 cfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
/* maximum backwards seek, in KiB */
static const int cfq_back_max = 16 * 1024;
/* penalty of a backwards seek */
static const int cfq_back_penalty = 2;
static const u64 cfq_slice_sync = NSEC_PER_SEC / 10;
static u64 cfq_slice_async = NSEC_PER_SEC / 25;
static const int cfq_slice_async_rq = 2;
static u64 cfq_slice_idle = NSEC_PER_SEC / 125;
static u64 cfq_group_idle = NSEC_PER_SEC / 125;
static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */
static const int cfq_hist_divisor = 4;

/*
 * offset from end of queue service tree for idle class
 */
#define CFQ_IDLE_DELAY		(NSEC_PER_SEC / 5)
/* offset from end of group service tree under time slice mode */
#define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5)
/* offset from end of group service under IOPS mode */
#define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5)

/*
 * below this threshold, we consider thinktime immediate
 */
#define CFQ_MIN_TT		(2 * NSEC_PER_SEC / HZ)

#define CFQ_SLICE_SCALE		(5)
#define CFQ_HW_QUEUE_MIN	(5)
#define CFQ_SERVICE_SHIFT       12

#define CFQQ_SEEK_THR		(sector_t)(8 * 100)
#define CFQQ_CLOSE_THR		(sector_t)(8 * 1024)
#define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
#define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)
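/*
 * CFQQ_SEEKY() above treats seek_history as a 32-sample sliding window of
 * per-request seek flags (the bits are shifted in elsewhere in this file,
 * based on request distance versus CFQQ_SEEK_THR), so a queue is classed
 * as seeky once more than 32/8 = 4 of its last 32 requests needed a seek.
 */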

#define RQ_CIC(rq)		icq_to_cic((rq)->elv.icq)
#define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elv.priv[0])
#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elv.priv[1])

static struct kmem_cache *cfq_pool;

#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)

#define sample_valid(samples)	((samples) > 80)
#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)

/* blkio-related constants */
#define CFQ_WEIGHT_LEGACY_MIN	10
#define CFQ_WEIGHT_LEGACY_DFL	500
#define CFQ_WEIGHT_LEGACY_MAX	1000

struct cfq_ttime {
	u64 last_end_request;

	u64 ttime_total;
	u64 ttime_mean;
	unsigned long ttime_samples;
};

/*
 * Most of our rbtree usage is for sorting with min extraction, so
 * if we cache the leftmost node we don't have to walk down the tree
 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
 * move this into the elevator for the rq sorting as well.
 */
struct cfq_rb_root {
	struct rb_root_cached rb;
	struct rb_node *rb_rightmost;
	unsigned count;
	u64 min_vdisktime;
	struct cfq_ttime ttime;
};
#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT_CACHED, \
			.rb_rightmost = NULL,			     \
			.ttime = {.last_end_request = ktime_get_ns(),},}
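/*
 * CFQ_RB_ROOT is a compound literal meant for whole-struct (re)initialization
 * (e.g. *st = CFQ_RB_ROOT); members it does not name explicitly, such as
 * count and min_vdisktime, start out zeroed by C initializer rules.
 */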

/*
 * Per process-grouping structure
 */
struct cfq_queue {
	/* reference count */
	int ref;
	/* various state flags, see below */
	unsigned int flags;
	/* parent cfq_data */
	struct cfq_data *cfqd;
	/* service_tree member */
	struct rb_node rb_node;
	/* service_tree key */
	u64 rb_key;
	/* prio tree member */
	struct rb_node p_node;
	/* prio tree root we belong to, if any */
	struct rb_root *p_root;
	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* requests queued in sort_list */
	int queued[2];
	/* currently allocated requests */
	int allocated[2];
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* time when queue got scheduled in to dispatch first request. */
	u64 dispatch_start;
	u64 allocated_slice;
	u64 slice_dispatch;
	/* time when first request from queue completed and slice started. */
	u64 slice_start;
	u64 slice_end;
	s64 slice_resid;

	/* pending priority requests */
	int prio_pending;
	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;

	/* io prio of this group */
	unsigned short ioprio, org_ioprio;
	unsigned short ioprio_class, org_ioprio_class;

	pid_t pid;

	u32 seek_history;
	sector_t last_request_pos;

	struct cfq_rb_root *service_tree;
	struct cfq_queue *new_cfqq;
	struct cfq_group *cfqg;
	/* Number of sectors dispatched from queue in single dispatch round */
	unsigned long nr_sectors;
};

/*
 * First index in the service_trees.
 * IDLE is handled separately, so it has negative index
 */
enum wl_class_t {
	BE_WORKLOAD = 0,
	RT_WORKLOAD = 1,
	IDLE_WORKLOAD = 2,
	CFQ_PRIO_NR,
};

/*
 * Second index in the service_trees.
 */
enum wl_type_t {
	ASYNC_WORKLOAD = 0,
	SYNC_NOIDLE_WORKLOAD = 1,
	SYNC_WORKLOAD = 2
};

struct cfqg_stats {
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	/* number of ios merged */
	struct blkg_rwstat		merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat		service_time;
	/* total time spent waiting in scheduler queue in ns */
	struct blkg_rwstat		wait_time;
	/* number of IOs queued up */
	struct blkg_rwstat		queued;
	/* total disk time and nr sectors dispatched by this group */
	struct blkg_stat		time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	/* time not charged to this cgroup */
	struct blkg_stat		unaccounted_time;
	/* sum of number of ios queued across all samples */
	struct blkg_stat		avg_queue_size_sum;
	/* count of samples taken for average */
	struct blkg_stat		avg_queue_size_samples;
	/* how many times this group has been removed from service tree */
	struct blkg_stat		dequeue;
	/* total time spent waiting for it to be assigned a timeslice. */
	struct blkg_stat		group_wait_time;
	/* time spent idling for this blkcg_gq */
	struct blkg_stat		idle_time;
	/* total time with empty current active q with other requests queued */
	struct blkg_stat		empty_time;
	/* fields after this shouldn't be cleared on stat reset */
	uint64_t			start_group_wait_time;
	uint64_t			start_idle_time;
	uint64_t			start_empty_time;
	uint16_t			flags;
#endif	/* CONFIG_DEBUG_BLK_CGROUP */
#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
};

/* Per-cgroup data */
struct cfq_group_data {
	/* must be the first member */
	struct blkcg_policy_data cpd;

	unsigned int weight;
	unsigned int leaf_weight;
};

/* This is per cgroup per device grouping structure */
struct cfq_group {
	/* must be the first member */
	struct blkg_policy_data pd;

	/* group service_tree member */
	struct rb_node rb_node;

	/* group service_tree key */
	u64 vdisktime;

	/*
	 * The number of active cfqgs and sum of their weights under this
	 * cfqg.  This covers this cfqg's leaf_weight and all children's
	 * weights, but does not cover weights of further descendants.
	 *
	 * If a cfqg is on the service tree, it's active.  An active cfqg
	 * also activates its parent and contributes to the children_weight
	 * of the parent.
	 */
	int nr_active;
	unsigned int children_weight;

	/*
	 * vfraction is the fraction of vdisktime that the tasks in this
	 * cfqg are entitled to.  This is determined by compounding the
	 * ratios walking up from this cfqg to the root.
	 *
	 * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
	 * vfractions on a service tree is approximately 1.  The sum may
	 * deviate a bit due to rounding errors and fluctuations caused by
	 * cfqgs entering and leaving the service tree.
	 */
	unsigned int vfraction;

	/*
	 * There are two weights - (internal) weight is the weight of this
	 * cfqg against the sibling cfqgs.  leaf_weight is the weight of
	 * this cfqg against the child cfqgs.  For the root cfqg, both
	 * weights are kept in sync for backward compatibility.
	 */
	unsigned int weight;
	unsigned int new_weight;
	unsigned int dev_weight;

	unsigned int leaf_weight;
	unsigned int new_leaf_weight;
	unsigned int dev_leaf_weight;

	/* number of cfqq currently on this group */
	int nr_cfqq;

	/*
	 * Per group busy queues average. Useful for workload slice calc. We
	 * create the array for each prio class but at run time it is used
	 * only for RT and BE class and slot for IDLE class remains unused.
	 * This is primarily done to avoid confusion and a gcc warning.
	 */
	unsigned int busy_queues_avg[CFQ_PRIO_NR];
	/*
	 * rr lists of queues with requests. We maintain service trees for
	 * RT and BE classes. These trees are subdivided in subclasses
	 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
	 * class there is no subclassification and all the cfq queues go on
	 * a single tree service_tree_idle.
	 * Counts are embedded in the cfq_rb_root
	 */
	struct cfq_rb_root service_trees[2][3];
	struct cfq_rb_root service_tree_idle;

	u64 saved_wl_slice;
	enum wl_type_t saved_wl_type;
	enum wl_class_t saved_wl_class;

	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;
	struct cfq_ttime ttime;
	struct cfqg_stats stats;	/* stats for this cfqg */

	/* async queue for each priority case */
	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
	struct cfq_queue *async_idle_cfqq;

};

struct cfq_io_cq {
	struct io_cq		icq;		/* must be the first member */
	struct cfq_queue	*cfqq[2];
	struct cfq_ttime	ttime;
	int			ioprio;		/* the current ioprio */
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	uint64_t		blkcg_serial_nr; /* the current blkcg serial */
#endif
};

/*
 * Per block device queue structure
 */
struct cfq_data {
	struct request_queue *queue;
	/* Root service tree for cfq_groups */
	struct cfq_rb_root grp_service_tree;
	struct cfq_group *root_group;

	/*
	 * The priority currently being served
	 */
	enum wl_class_t serving_wl_class;
	enum wl_type_t serving_wl_type;
	u64 workload_expires;
	struct cfq_group *serving_group;

	/*
	 * Each priority tree is sorted by next_request position.  These
	 * trees are used when determining if two or more queues are
	 * interleaving requests (see cfq_close_cooperator).
	 */
	struct rb_root prio_trees[CFQ_PRIO_LISTS];

	unsigned int busy_queues;
	unsigned int busy_sync_queues;

	int rq_in_driver;
	int rq_in_flight[2];

	/*
	 * queue-depth detection
	 */
	int rq_queued;
	int hw_tag;
	/*
	 * hw_tag can be
	 * -1 => indeterminate (cfq will behave as if NCQ is present, to allow better detection)
	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
	 *  0 => no NCQ
	 */
	int hw_tag_est_depth;
	unsigned int hw_tag_samples;

	/*
	 * idle window management
	 */
	struct hrtimer idle_slice_timer;
	struct work_struct unplug_work;

	struct cfq_queue *active_queue;
	struct cfq_io_cq *active_cic;

	sector_t last_position;

	/*
	 * tunables, see top of file
	 */
	unsigned int cfq_quantum;
	unsigned int cfq_back_penalty;
	unsigned int cfq_back_max;
	unsigned int cfq_slice_async_rq;
	unsigned int cfq_latency;
	u64 cfq_fifo_expire[2];
	u64 cfq_slice[2];
	u64 cfq_slice_idle;
	u64 cfq_group_idle;
	u64 cfq_target_latency;

	/*
	 * Fallback dummy cfqq for extreme OOM conditions
	 */
	struct cfq_queue oom_cfqq;

	u64 last_delayed_sync;
};

static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
static void cfq_put_queue(struct cfq_queue *cfqq);

static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
					    enum wl_class_t class,
					    enum wl_type_t type)
{
	if (!cfqg)
		return NULL;

	if (class == IDLE_WORKLOAD)
		return &cfqg->service_tree_idle;

	return &cfqg->service_trees[class][type];
}
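
/*
 * For example, st_for(cfqg, RT_WORKLOAD, SYNC_WORKLOAD) returns
 * &cfqg->service_trees[RT_WORKLOAD][SYNC_WORKLOAD], while any lookup with
 * class == IDLE_WORKLOAD collapses to the single service_tree_idle, since
 * the idle class is not subdivided by workload type.
 */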

enum cfqq_state_flags {
	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be split */
	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
};

#define CFQ_CFQQ_FNS(name)						\
static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
{									\
	(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
{									\
	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
{									\
	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
}

CFQ_CFQQ_FNS(on_rr);
CFQ_CFQQ_FNS(wait_request);
CFQ_CFQQ_FNS(must_dispatch);
CFQ_CFQQ_FNS(must_alloc_slice);
CFQ_CFQQ_FNS(fifo_expire);
CFQ_CFQQ_FNS(idle_window);
CFQ_CFQQ_FNS(prio_changed);
CFQ_CFQQ_FNS(slice_new);
CFQ_CFQQ_FNS(sync);
CFQ_CFQQ_FNS(coop);
CFQ_CFQQ_FNS(split_coop);
CFQ_CFQQ_FNS(deep);
CFQ_CFQQ_FNS(wait_busy);
#undef CFQ_CFQQ_FNS
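
/*
 * Each CFQ_CFQQ_FNS(name) invocation above expands to three helpers; for
 * instance CFQ_CFQQ_FNS(on_rr) generates cfq_mark_cfqq_on_rr(),
 * cfq_clear_cfqq_on_rr() and cfq_cfqq_on_rr(), which set, clear and test
 * the CFQ_CFQQ_FLAG_on_rr bit in cfqq->flags.
 */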

#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)

/* cfqg stats flags */
enum cfqg_stats_flags {
	CFQG_stats_waiting = 0,
	CFQG_stats_idling,
	CFQG_stats_empty,
};

#define CFQG_FLAG_FNS(name)						\
static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats)	\
{									\
	stats->flags |= (1 << CFQG_stats_##name);			\
}									\
static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats)	\
{									\
	stats->flags &= ~(1 << CFQG_stats_##name);			\
}									\
static inline int cfqg_stats_##name(struct cfqg_stats *stats)		\
{									\
	return (stats->flags & (1 << CFQG_stats_##name)) != 0;		\
}									\

CFQG_FLAG_FNS(waiting)
CFQG_FLAG_FNS(idling)
CFQG_FLAG_FNS(empty)
#undef CFQG_FLAG_FNS

/* This should be called with the queue_lock held. */
static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
{
	unsigned long long now;

	if (!cfqg_stats_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		blkg_stat_add(&stats->group_wait_time,
			      now - stats->start_group_wait_time);
	cfqg_stats_clear_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
						 struct cfq_group *curr_cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	if (cfqg_stats_waiting(stats))
		return;
	if (cfqg == curr_cfqg)
		return;
	stats->start_group_wait_time = sched_clock();
	cfqg_stats_mark_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
{
	unsigned long long now;

	if (!cfqg_stats_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		blkg_stat_add(&stats->empty_time,
			      now - stats->start_empty_time);
	cfqg_stats_clear_empty(stats);
}

static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
{
	blkg_stat_add(&cfqg->stats.dequeue, 1);
}

static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	if (blkg_rwstat_total(&stats->queued))
		return;

	/*
	 * group is already marked empty. This can happen if cfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (cfqg_stats_empty(stats))
		return;

	stats->start_empty_time = sched_clock();
	cfqg_stats_mark_empty(stats);
}

static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	if (cfqg_stats_idling(stats)) {
		unsigned long long now = sched_clock();

		if (time_after64(now, stats->start_idle_time))
			blkg_stat_add(&stats->idle_time,
				      now - stats->start_idle_time);
		cfqg_stats_clear_idling(stats);
	}
}

static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	BUG_ON(cfqg_stats_idling(stats));

	stats->start_idle_time = sched_clock();
	cfqg_stats_mark_idling(stats);
}

static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	blkg_stat_add(&stats->avg_queue_size_sum,
		      blkg_rwstat_total(&stats->queued));
	blkg_stat_add(&stats->avg_queue_size_samples, 1);
	cfqg_stats_update_group_wait_time(stats);
}

#else	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */

static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }

#endif	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */

#ifdef CONFIG_CFQ_GROUP_IOSCHED

static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct cfq_group, pd) : NULL;
}

static struct cfq_group_data
*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
{
	return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
}

static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
{
	return pd_to_blkg(&cfqg->pd);
}

static struct blkcg_policy blkcg_policy_cfq;

static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
{
	return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
}

static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
{
	return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
}

static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
{
	struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;

	return pblkg ? blkg_to_cfqg(pblkg) : NULL;
}

static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
				      struct cfq_group *ancestor)
{
	return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup,
				    cfqg_to_blkg(ancestor)->blkcg->css.cgroup);
}

static inline void cfqg_get(struct cfq_group *cfqg)
{
	return blkg_get(cfqg_to_blkg(cfqg));
}

static inline void cfqg_put(struct cfq_group *cfqg)
{
	return blkg_put(cfqg_to_blkg(cfqg));
}

#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	do {			\
	blk_add_cgroup_trace_msg((cfqd)->queue,				\
			cfqg_to_blkg((cfqq)->cfqg)->blkcg,		\
			"cfq%d%c%c " fmt, (cfqq)->pid,			\
			cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
			cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
			  ##args);					\
} while (0)

#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)	do {			\
	blk_add_cgroup_trace_msg((cfqd)->queue,				\
			cfqg_to_blkg(cfqg)->blkcg, fmt, ##args);	\
} while (0)

static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
					    struct cfq_group *curr_cfqg,
					    unsigned int op)
{
	blkg_rwstat_add(&cfqg->stats.queued, op, 1);
	cfqg_stats_end_empty_time(&cfqg->stats);
	cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
}

static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
			uint64_t time, unsigned long unaccounted_time)
{
	blkg_stat_add(&cfqg->stats.time, time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
#endif
}

static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
					       unsigned int op)
{
	blkg_rwstat_add(&cfqg->stats.queued, op, -1);
}

static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
					       unsigned int op)
{
	blkg_rwstat_add(&cfqg->stats.merged, op, 1);
}

static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
			uint64_t start_time, uint64_t io_start_time,
			unsigned int op)
{
	struct cfqg_stats *stats = &cfqg->stats;
	unsigned long long now = sched_clock();

	if (time_after64(now, io_start_time))
		blkg_rwstat_add(&stats->service_time, op, now - io_start_time);
	if (time_after64(io_start_time, start_time))
		blkg_rwstat_add(&stats->wait_time, op,
				io_start_time - start_time);
}
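
/*
 * In the helper above, service_time accumulates (completion - io_start_time)
 * and wait_time accumulates (io_start_time - start_time), which is what the
 * cfqg_stats field comments describe as time spent on the device and time
 * spent waiting in the scheduler queue, respectively.
 */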

/* @stats = 0 */
static void cfqg_stats_reset(struct cfqg_stats *stats)
{
	/* queued stats shouldn't be cleared */
	blkg_rwstat_reset(&stats->merged);
	blkg_rwstat_reset(&stats->service_time);
	blkg_rwstat_reset(&stats->wait_time);
	blkg_stat_reset(&stats->time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg_stat_reset(&stats->unaccounted_time);
	blkg_stat_reset(&stats->avg_queue_size_sum);
	blkg_stat_reset(&stats->avg_queue_size_samples);
	blkg_stat_reset(&stats->dequeue);
	blkg_stat_reset(&stats->group_wait_time);
	blkg_stat_reset(&stats->idle_time);
	blkg_stat_reset(&stats->empty_time);
#endif
}

/* @to += @from */
static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
{
	/* queued stats shouldn't be cleared */
	blkg_rwstat_add_aux(&to->merged, &from->merged);
	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
	blkg_stat_add_aux(&to->time, &from->time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
	blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
#endif
}

/*
 * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
 * recursive stats can still account for the amount used by this cfqg after
 * it's gone.
 */
static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
{
	struct cfq_group *parent = cfqg_parent(cfqg);

	lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);

	if (unlikely(!parent))
		return;

	cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
	cfqg_stats_reset(&cfqg->stats);
}

#else	/* CONFIG_CFQ_GROUP_IOSCHED */

static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
				      struct cfq_group *ancestor)
{
	return true;
}
static inline void cfqg_get(struct cfq_group *cfqg) { }
static inline void cfqg_put(struct cfq_group *cfqg) { }

#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid,	\
			cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
			cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
				##args)
#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)

static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
			struct cfq_group *curr_cfqg, unsigned int op) { }
static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
			uint64_t time, unsigned long unaccounted_time) { }
static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
			unsigned int op) { }
static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
			unsigned int op) { }
static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
			uint64_t start_time, uint64_t io_start_time,
			unsigned int op) { }

#endif	/* CONFIG_CFQ_GROUP_IOSCHED */

#define cfq_log(cfqd, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)

/* Traverses through cfq group service trees */
#define for_each_cfqg_st(cfqg, i, j, st) \
	for (i = 0; i <= IDLE_WORKLOAD; i++) \
		for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
			: &cfqg->service_tree_idle; \
			(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
			(i == IDLE_WORKLOAD && j == 0); \
			j++, st = i < IDLE_WORKLOAD ? \
			&cfqg->service_trees[i][j]: NULL) \

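/*
 * for_each_cfqg_st() above walks all seven service trees of a group: the
 * three workload types for each of the BE and RT classes, then the single
 * service_tree_idle for the IDLE class.
 */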
static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
	struct cfq_ttime *ttime, bool group_idle)
{
	u64 slice;
	if (!sample_valid(ttime->ttime_samples))
		return false;
	if (group_idle)
		slice = cfqd->cfq_group_idle;
	else
		slice = cfqd->cfq_slice_idle;
	return ttime->ttime_mean > slice;
}

static inline bool iops_mode(struct cfq_data *cfqd)
{
	/*
	 * If we are not idling on queues and it is an NCQ drive, parallel
	 * execution of requests is on and measuring time is not possible
	 * in most of the cases until and unless we drive shallower queue
	 * depths and that becomes a performance bottleneck. In such cases
	 * switch to start providing fairness in terms of number of IOs.
	 */
	if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
		return true;
	else
		return false;
}

static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
{
	if (cfq_class_idle(cfqq))
		return IDLE_WORKLOAD;
	if (cfq_class_rt(cfqq))
		return RT_WORKLOAD;
	return BE_WORKLOAD;
}


static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
{
	if (!cfq_cfqq_sync(cfqq))
		return ASYNC_WORKLOAD;
	if (!cfq_cfqq_idle_window(cfqq))
		return SYNC_NOIDLE_WORKLOAD;
	return SYNC_WORKLOAD;
}

static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
					struct cfq_data *cfqd,
					struct cfq_group *cfqg)
{
	if (wl_class == IDLE_WORKLOAD)
		return cfqg->service_tree_idle.count;

	return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
		cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
		cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
}

static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
					struct cfq_group *cfqg)
{
	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
		cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
}

static void cfq_dispatch_insert(struct request_queue *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
				       struct cfq_io_cq *cic, struct bio *bio);

static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
{
	/* cic->icq is the first member, %NULL will convert to %NULL */
	return container_of(icq, struct cfq_io_cq, icq);
}

static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
					       struct io_context *ioc)
{
	if (ioc)
		return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
	return NULL;
}

static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
{
	return cic->cfqq[is_sync];
}

static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
				bool is_sync)
{
	cic->cfqq[is_sync] = cfqq;
}

static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
{
	return cic->icq.q->elevator->elevator_data;
}

/*
 * Schedule a run of the queue if there are requests pending and no one in
 * the driver will restart queueing.
 */
static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
{
	if (cfqd->busy_queues) {
		cfq_log(cfqd, "schedule dispatch");
		kblockd_schedule_work(&cfqd->unplug_work);
	}
}

/*
 * Scale schedule slice based on io priority. Use the sync time slice only
 * if a queue is marked sync and has sync io queued. A sync queue with async
 * io only should not get the full sync slice length.
 */
static inline u64 cfq_prio_slice(struct cfq_data *cfqd, bool sync,
				 unsigned short prio)
{
	u64 base_slice = cfqd->cfq_slice[sync];
	u64 slice = div_u64(base_slice, CFQ_SLICE_SCALE);

	WARN_ON(prio >= IOPRIO_BE_NR);

	return base_slice + (slice * (4 - prio));
}
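
/*
 * For example, with the default cfq_slice_sync of 100ms (NSEC_PER_SEC / 10)
 * and CFQ_SLICE_SCALE of 5, a sync queue at ioprio 4 (the best-effort
 * default) gets 100ms, ioprio 0 gets 100 + 20 * 4 = 180ms and ioprio 7
 * gets 100 - 20 * 3 = 40ms.
 */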

static inline u64
cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{