/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002	Andrew Morton
 *		Split out of fs/inode.c
 *		Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include "internal.h"

/*
 * 4MB minimal write chunk size
 */
#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_CACHE_SHIFT - 10))
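/*
 * Worked example, assuming the common 4KB page size (PAGE_CACHE_SHIFT == 12):
 * 4096UL is the chunk size expressed in KB, and shifting by
 * (PAGE_CACHE_SHIFT - 10) converts KB to pages, so 4096 >> 2 = 1024 pages,
 * i.e. 1024 * 4KB = 4MB.
 */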

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	unsigned long *older_than_this;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct completion *done;	/* set if the caller waits */
};

/*
 * We don't actually have pdflush, but this one is exported through /proc...
 */
int nr_pdflush_threads;

/**
 * writeback_in_progress - determine whether there is writeback in progress
 * @bdi: the device's backing_dev_info structure.
 *
 * Determine whether there is writeback waiting to be handled against a
 * backing device.
 */
int writeback_in_progress(struct backing_dev_info *bdi)
{
	return test_bit(BDI_writeback_running, &bdi->state);
}

static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	if (strcmp(sb->s_type->name, "bdev") == 0)
		return inode->i_mapping->backing_dev_info;

	return sb->s_bdi;
}

static inline struct inode *wb_inode(struct list_head *head)
{
	return list_entry(head, struct inode, i_wb_list);
}

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the definition
 * remains local to this file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
{
	if (bdi->wb.task) {
		wake_up_process(bdi->wb.task);
	} else {
		/*
		 * The bdi thread isn't there, wake up the forker thread which
		 * will create and run it.
		 */
		wake_up_process(default_backing_dev_info.wb.task);
	}
}

static void bdi_queue_work(struct backing_dev_info *bdi,
			   struct wb_writeback_work *work)
{
	trace_writeback_queue(bdi, work);

	spin_lock_bh(&bdi->wb_lock);
	list_add_tail(&work->list, &bdi->work_list);
	if (!bdi->wb.task)
		trace_writeback_nothread(bdi, work);
	bdi_wakeup_flusher(bdi);
	spin_unlock_bh(&bdi->wb_lock);
}

static void
__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
		      bool range_cyclic, enum wb_reason reason)
{
	struct wb_writeback_work *work;

	/*
	 * This is WB_SYNC_NONE writeback, so if allocation fails just
	 * wakeup the thread for old dirty data writeback
	 */
	work = kzalloc(sizeof(*work), GFP_ATOMIC);
	if (!work) {
		if (bdi->wb.task) {
			trace_writeback_nowork(bdi);
			wake_up_process(bdi->wb.task);
		}
		return;
	}

	work->sync_mode	= WB_SYNC_NONE;
	work->nr_pages	= nr_pages;
	work->range_cyclic = range_cyclic;
	work->reason	= reason;

	bdi_queue_work(bdi, work);
}

/**
 * bdi_start_writeback - start writeback
 * @bdi: the backing device to write from
 * @nr_pages: the number of pages to write
 * @reason: reason why some writeback work was initiated
 *
 * Description:
 *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
 *   started when this function returns; we make no guarantees on
 *   completion. Caller need not hold sb s_umount semaphore.
 *
 */
void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
			enum wb_reason reason)
{
	__bdi_start_writeback(bdi, nr_pages, true, reason);
}

/**
 * bdi_start_background_writeback - start background writeback
 * @bdi: the backing device to write from
 *
 * Description:
 *   This makes sure WB_SYNC_NONE background writeback happens. When
 *   this function returns, it is only guaranteed that for the given BDI
 *   some IO is happening if we are over the background dirty threshold.
 *   Caller need not hold sb s_umount semaphore.
 */
void bdi_start_background_writeback(struct backing_dev_info *bdi)
{
	/*
	 * We just wake up the flusher thread. It will perform background
	 * writeback as soon as there is no other work to do.
	 */
	trace_writeback_wake_background(bdi);
	spin_lock_bh(&bdi->wb_lock);
	bdi_wakeup_flusher(bdi);
	spin_unlock_bh(&bdi->wb_lock);
}

/*
 * Remove the inode from the writeback list it is on.
 */
void inode_wb_list_del(struct inode *inode)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);

	spin_lock(&bdi->wb.list_lock);
	list_del_init(&inode->i_wb_list);
	spin_unlock(&bdi->wb.list_lock);
}

/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);
	if (!list_empty(&wb->b_dirty)) {
		struct inode *tail;

		tail = wb_inode(wb->b_dirty.next);
		if (time_before(inode->dirtied_when, tail->dirtied_when))
			inode->dirtied_when = jiffies;
	}
	list_move(&inode->i_wb_list, &wb->b_dirty);
}

/*
 * requeue inode for re-scanning after bdi->b_io list is exhausted.
 */
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);
	list_move(&inode->i_wb_list, &wb->b_more_io);
}

static void inode_sync_complete(struct inode *inode)
{
	/*
	 * Prevent speculative execution through
	 * spin_unlock(&wb->list_lock);
	 */

	smp_mb();
	wake_up_bit(&inode->i_state, __I_SYNC);
}

static bool inode_dirtied_after(struct inode *inode, unsigned long t)
{
	bool ret = time_after(inode->dirtied_when, t);
#ifndef CONFIG_64BIT
	/*
	 * For inodes being constantly redirtied, dirtied_when can get stuck.
	 * It _appears_ to be in the future, but is actually in distant past.
	 * This test is necessary to prevent such wrapped-around relative times
	 * from permanently stopping the whole bdi writeback.
	 */
	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
	return ret;
}
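/*
 * Concrete illustration of the wraparound handled in inode_dirtied_after()
 * (32-bit jiffies, rough figures): with HZ == 1000 the counter wraps about
 * every 49.7 days, so a dirtied_when value stuck more than ~24.8 days in
 * the past can make time_after() claim the inode was dirtied in the future;
 * the extra time_before_eq() check rejects such stuck timestamps.
 */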

/*
 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
 */
static int move_expired_inodes(struct list_head *delaying_queue,
			       struct list_head *dispatch_queue,
			       struct wb_writeback_work *work)
{
	LIST_HEAD(tmp);
	struct list_head *pos, *node;
	struct super_block *sb = NULL;
	struct inode *inode;
	int do_sb_sort = 0;
	int moved = 0;

	while (!list_empty(delaying_queue)) {
		inode = wb_inode(delaying_queue->prev);
		if (work->older_than_this &&
		    inode_dirtied_after(inode, *work->older_than_this))
			break;
		if (sb && sb != inode->i_sb)
			do_sb_sort = 1;
		sb = inode->i_sb;
		list_move(&inode->i_wb_list, &tmp);
		moved++;
	}

	/* just one sb in list, splice to dispatch_queue and we're done */
	if (!do_sb_sort) {
		list_splice(&tmp, dispatch_queue);
		goto out;
	}

	/* Move inodes from one superblock together */
	while (!list_empty(&tmp)) {
		sb = wb_inode(tmp.prev)->i_sb;
		list_for_each_prev_safe(pos, node, &tmp) {
			inode = wb_inode(pos);
			if (inode->i_sb == sb)
				list_move(&inode->i_wb_list, dispatch_queue);
		}
	}
out:
	return moved;
}

/*
 * Queue all expired dirty inodes for io, eldest first.
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 */
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
{
	int moved;
	assert_spin_locked(&wb->list_lock);
	list_splice_init(&wb->b_more_io, &wb->b_io);
	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
	trace_writeback_queue_io(wb, work, moved);
}

static int write_inode(struct inode *inode, struct writeback_control *wbc)
{
	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
		return inode->i_sb->s_op->write_inode(inode, wbc);
	return 0;
}

/*
 * Wait for writeback on an inode to complete.
 */
static void inode_wait_for_writeback(struct inode *inode,
				     struct bdi_writeback *wb)
{
	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
	wait_queue_head_t *wqh;

	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	while (inode->i_state & I_SYNC) {
		spin_unlock(&inode->i_lock);
		spin_unlock(&wb->list_lock);
		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
		spin_lock(&wb->list_lock);
		spin_lock(&inode->i_lock);
	}
}

/*
 * Write out an inode's dirty pages.  Called under wb->list_lock and
 * inode->i_lock.  Either the caller has an active reference on the inode or
 * the inode has I_WILL_FREE set.
 *
 * If @wbc->sync_mode is WB_SYNC_ALL, wait on the writeout.
 *
 * The whole writeout design is quite complex and fragile.  We want to avoid
 * starvation of particular inodes when others are being redirtied, prevent
 * livelocks, etc.
 */
static int
writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
		       struct writeback_control *wbc)
{
	struct address_space *mapping = inode->i_mapping;
	long nr_to_write = wbc->nr_to_write;
	unsigned dirty;
	int ret;

	assert_spin_locked(&wb->list_lock);
	assert_spin_locked(&inode->i_lock);

	if (!atomic_read(&inode->i_count))
		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
	else
		WARN_ON(inode->i_state & I_WILL_FREE);

	if (inode->i_state & I_SYNC) {
		/*
		 * If this inode is locked for writeback and we are not doing
		 * writeback-for-data-integrity, move it to b_more_io so that
		 * writeback can proceed with the other inodes on b_io.
		 *
		 * We'll have another go at writing back this inode when we
		 * have completed a full scan of b_io.
		 */
		if (wbc->sync_mode != WB_SYNC_ALL) {
			requeue_io(inode, wb);
			trace_writeback_single_inode_requeue(inode, wbc,
							     nr_to_write);
			return 0;
		}

		/*
		 * It's a data-integrity sync.  We must wait.
		 */
		inode_wait_for_writeback(inode, wb);
	}

	BUG_ON(inode->i_state & I_SYNC);

	/* Set I_SYNC, reset I_DIRTY_PAGES */
	inode->i_state |= I_SYNC;
	inode->i_state &= ~I_DIRTY_PAGES;
	spin_unlock(&inode->i_lock);
	spin_unlock(&wb->list_lock);

	ret = do_writepages(mapping, wbc);

	/*
	 * Make sure to wait on the data before writing out the metadata.
	 * This is important for filesystems that modify metadata on data
	 * I/O completion.
	 */
	if (wbc->sync_mode == WB_SYNC_ALL) {
		int err = filemap_fdatawait(mapping);
		if (ret == 0)
			ret = err;
	}

	/*
	 * Some filesystems may redirty the inode during the writeback
	 * due to delalloc, clear dirty metadata flags right before
	 * write_inode()
	 */
	spin_lock(&inode->i_lock);
	dirty = inode->i_state & I_DIRTY;
	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
	spin_unlock(&inode->i_lock);
	/* Don't write the inode if only I_DIRTY_PAGES was set */
	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
		int err = write_inode(inode, wbc);
		if (ret == 0)
			ret = err;
	}

	spin_lock(&wb->list_lock);
	spin_lock(&inode->i_lock);
	inode->i_state &= ~I_SYNC;
	if (!(inode->i_state & I_FREEING)) {
		/*
		 * Sync livelock prevention. Each inode is tagged and synced in
		 * one shot. If still dirty, it will be redirty_tail()'ed below.
		 * Update the dirty time to prevent enqueue and sync it again.
		 */
		if ((inode->i_state & I_DIRTY) &&
		    (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
			inode->dirtied_when = jiffies;

		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
			/*
			 * We didn't write back all the pages.  nfs_writepages()
			 * sometimes bails out without doing anything.
			 */
			inode->i_state |= I_DIRTY_PAGES;
			if (wbc->nr_to_write <= 0) {
				/*
				 * slice used up: queue for next turn
				 */
				requeue_io(inode, wb);
			} else {
				/*
				 * Writeback blocked by something other than
				 * congestion. Delay the inode for some time to
				 * avoid spinning on the CPU (100% iowait)
				 * retrying writeback of the dirty page/inode
				 * that cannot be performed immediately.
				 */
				redirty_tail(inode, wb);
			}
		} else if (inode->i_state & I_DIRTY) {
			/*
			 * Filesystems can dirty the inode during writeback
			 * operations, such as delayed allocation during
			 * submission or metadata updates after data IO
			 * completion.
			 */
			redirty_tail(inode, wb);
		} else {
			/*
			 * The inode is clean.  At this point we either have
			 * a reference to the inode or it's on its way out.
			 * No need to add it back to the LRU.
			 */
			list_del_init(&inode->i_wb_list);
		}
	}
	inode_sync_complete(inode);
	trace_writeback_single_inode(inode, wbc, nr_to_write);
	return ret;
}

static long writeback_chunk_size(struct backing_dev_info *bdi,
				 struct wb_writeback_work *work)
{
	long pages;

	/*
	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
	 * here avoids calling into writeback_inodes_wb() more than once.
	 *
	 * The intended call sequence for WB_SYNC_ALL writeback is:
	 *
	 *      wb_writeback()
	 *          writeback_sb_inodes()       <== called only once
	 *              write_cache_pages()     <== called once for each inode
	 *                   (quickly) tag currently dirty pages
	 *                   (maybe slowly) sync all tagged pages
	 */
	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
		pages = LONG_MAX;
	else {
		pages = min(bdi->avg_write_bandwidth / 2,
			    global_dirty_limit / DIRTY_SCOPE);
		pages = min(pages, work->nr_pages);
		pages = round_down(pages + MIN_WRITEBACK_PAGES,
				   MIN_WRITEBACK_PAGES);
	}

	return pages;
}
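/*
 * Rough example of the non-integrity path above (made-up numbers, and
 * assuming the global_dirty_limit / DIRTY_SCOPE term is the larger one):
 * a measured avg_write_bandwidth of ~64000 pages/s gives 32000 pages,
 * which is then capped at work->nr_pages; the final round_down(pages +
 * MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES) bumps that to a multiple of
 * MIN_WRITEBACK_PAGES (1024 pages with 4KB pages), so each inode gets a
 * chunk of at least 4MB.
 */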

/*
 * Write a portion of b_io inodes which belong to @sb.
 *
 * If @work->sb is set, then find and write all such
 * inodes. Otherwise write only ones which go sequentially
 * in reverse order.
 *
 * Return the number of pages and/or inodes written.
 */
static long writeback_sb_inodes(struct super_block *sb,
				struct bdi_writeback *wb,
				struct wb_writeback_work *work)
{
	struct writeback_control wbc = {
		.sync_mode		= work->sync_mode,
		.tagged_writepages	= work->tagged_writepages,
		.for_kupdate		= work->for_kupdate,
		.for_background		= work->for_background,
		.range_cyclic		= work->range_cyclic,
		.range_start		= 0,
		.range_end		= LLONG_MAX,
	};
	unsigned long start_time = jiffies;
	long write_chunk;
	long wrote = 0;  /* count both pages and inodes */

	while (!list_empty(&wb->b_io)) {
		struct inode *inode = wb_inode(wb->b_io.prev);

		if (inode->i_sb != sb) {
			if (work->sb) {
				/*
				 * We only want to write back data for this
				 * superblock, move all inodes not belonging
				 * to it back onto the dirty list.
				 */
				redirty_tail(inode, wb);
				continue;
			}

			/*
			 * The inode belongs to a different superblock.
			 * Bounce back to the caller to unpin this and
			 * pin the next superblock.
			 */
			break;
		}

		/*
		 * Don't bother with new inodes or inodes being freed; the
		 * first kind does not need periodic writeout yet, and for the
		 * latter kind writeout is handled by the freer.
		 */
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
			spin_unlock(&inode->i_lock);
			redirty_tail(inode, wb);
			continue;
		}
		__iget(inode);
		write_chunk = writeback_chunk_size(wb->bdi, work);
		wbc.nr_to_write = write_chunk;
		wbc.pages_skipped = 0;

		writeback_single_inode(inode, wb, &wbc);

		work->nr_pages -= write_chunk - wbc.nr_to_write;
		wrote += write_chunk - wbc.nr_to_write;
		if (!(inode->i_state & I_DIRTY))
			wrote++;
		if (wbc.pages_skipped) {
			/*
			 * writeback is not making progress due to locked
			 * buffers.  Skip this inode for now.
			 */
			redirty_tail(inode, wb);
		}
		spin_unlock(&inode->i_lock);
		spin_unlock(&wb->list_lock);
		iput(inode);
		cond_resched();
		spin_lock(&wb->list_lock);
		/*
		 * bail out to wb_writeback() often enough to check
		 * background threshold and other termination conditions.
		 */
		if (wrote) {
			if (time_is_before_jiffies(start_time + HZ / 10UL))
				break;
			if (work->nr_pages <= 0)
				break;
		}
	}
	return wrote;
}

static long __writeback_inodes_wb(struct bdi_writeback *wb,
				  struct wb_writeback_work *work)
{
	unsigned long start_time = jiffies;
	long wrote = 0;

	while (!list_empty(&wb->b_io)) {
		struct inode *inode = wb_inode(wb->b_io.prev);
		struct super_block *sb = inode->i_sb;

		if (!grab_super_passive(sb)) {
			/*
			 * grab_super_passive() may fail consistently due to
			 * s_umount being grabbed by someone else. Don't use
			 * requeue_io() to avoid busy retrying the inode/sb.
			 */
			redirty_tail(inode, wb);
			continue;
		}
		wrote += writeback_sb_inodes(sb, wb, work);
		drop_super(sb);

		/* refer to the same tests at the end of writeback_sb_inodes */
		if (wrote) {
			if (time_is_before_jiffies(start_time + HZ / 10UL))
				break;
			if (work->nr_pages <= 0)
				break;
		}
	}
	/* Leave any unwritten inodes on b_io */
	return wrote;
}

long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
				enum wb_reason reason)
{
	struct wb_writeback_work work = {
		.nr_pages	= nr_pages,
		.sync_mode	= WB_SYNC_NONE,
		.range_cyclic	= 1,
		.reason		= reason,
	};

	spin_lock(&wb->list_lock);
	if (list_empty(&wb->b_io))
		queue_io(wb, &work);
	__writeback_inodes_wb(wb, &work);
	spin_unlock(&wb->list_lock);

	return nr_pages - work.nr_pages;
}

static bool over_bground_thresh(struct backing_dev_info *bdi)
{
	unsigned long background_thresh, dirty_thresh;

	global_dirty_limits(&background_thresh, &dirty_thresh);

	if (global_page_state(NR_FILE_DIRTY) +
	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
		return true;

	if (bdi_stat(bdi, BDI_RECLAIMABLE) >
				bdi_dirty_limit(bdi, background_thresh))
		return true;

	return false;
}

/*
 * Called under wb->list_lock. If there are multiple wb per bdi,
 * only the flusher working on the first wb should do it.
 */
static void wb_update_bandwidth(struct bdi_writeback *wb,
				unsigned long start_time)
{
	__bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
}

/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval period, then leave a
 * one-second gap.
 *
 * older_than_this takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
static long wb_writeback(struct bdi_writeback *wb,
			 struct wb_writeback_work *work)
{
	unsigned long wb_start = jiffies;
	long nr_pages = work->nr_pages;
	unsigned long oldest_jif;
	struct inode *inode;
	long progress;

	oldest_jif = jiffies;
	work->older_than_this = &oldest_jif;

	spin_lock(&wb->list_lock);
	for (;;) {
		/*
		 * Stop writeback when nr_pages has been consumed
		 */
		if (work->nr_pages <= 0)
			break;

		/*
		 * Background writeout and kupdate-style writeback may
		 * run forever. Stop them if there is other work to do
		 * so that e.g. sync can proceed. They'll be restarted
		 * after the other works are all done.
		 */
		if ((work->for_background || work->for_kupdate) &&
		    !list_empty(&wb->bdi->work_list))
			break;

		/*
		 * For background writeout, stop when we are below the
		 * background dirty threshold
		 */
		if (work->for_background && !over_bground_thresh(wb->bdi))
			break;

		/*
		 * Kupdate and background works are special and we want to
		 * include all inodes that need writing. Livelock avoidance is
		 * handled by these works yielding to any other work so we are
		 * safe.
		 */
		if (work->for_kupdate) {
			oldest_jif = jiffies -
				msecs_to_jiffies(dirty_expire_interval * 10);
		} else if (work->for_background)
			oldest_jif = jiffies;

		trace_writeback_start(wb->bdi, work);
		if (list_empty(&wb->b_io))
			queue_io(wb, work);
		if (work->sb)
			progress = writeback_sb_inodes(work->sb, wb, work);
		else
			progress = __writeback_inodes_wb(wb, work);
		trace_writeback_written(wb->bdi, work);

		wb_update_bandwidth(wb, wb_start);

		/*
		 * Did we write something? Try for more
		 *
		 * Dirty inodes are moved to b_io for writeback in batches.
		 * The completion of the current batch does not necessarily
		 * mean the overall work is done. So we keep looping as long
		 * as we made some progress on cleaning pages or inodes.
		 */
		if (progress)
			continue;
		/*
		 * No more inodes for IO, bail
		 */
		if (list_empty(&wb->b_more_io))
			break;
		/*
		 * Nothing written. Wait for some inode to
		 * become available for writeback. Otherwise
		 * we'll just busyloop.
		 */
		if (!list_empty(&wb->b_more_io))  {
			trace_writeback_wait(wb->bdi, work);
			inode = wb_inode(wb->b_more_io.prev);
			spin_lock(&inode->i_lock);
			inode_wait_for_writeback(inode, wb);
			spin_unlock(&inode->i_lock);
		}
	}
	spin_unlock(&wb->list_lock);

	return nr_pages - work->nr_pages;
}

/*
 * Return the next wb_writeback_work struct that hasn't been processed yet.
 */
static struct wb_writeback_work *
get_next_work_item(struct backing_dev_info *bdi)
{
	struct wb_writeback_work *work = NULL;

	spin_lock_bh(&bdi->wb_lock);
	if (!list_empty(&bdi->work_list)) {
		work = list_entry(bdi->work_list.next,
				  struct wb_writeback_work, list);
		list_del_init(&work->list);
	}
	spin_unlock_bh(&bdi->wb_lock);
	return work;
}

/*
 * Add in the number of potentially dirty inodes, because each inode
 * write can dirty pagecache in the underlying blockdev.
 */
static unsigned long get_nr_dirty_pages(void)
{
	return global_page_state(NR_FILE_DIRTY) +
		global_page_state(NR_UNSTABLE_NFS) +
		get_nr_dirty_inodes();
}

static long wb_check_background_flush(struct bdi_writeback *wb)
{
	if (over_bground_thresh(wb->bdi)) {

		struct wb_writeback_work work = {
			.nr_pages	= LONG_MAX,
			.sync_mode	= WB_SYNC_NONE,
			.for_background	= 1,
			.range_cyclic	= 1,
			.reason		= WB_REASON_BACKGROUND,
		};

		return wb_writeback(wb, &work);
	}

	return 0;
}

static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
	unsigned long expired;
	long nr_pages;

	/*
	 * When set to zero, disable periodic writeback
	 */
	if (!dirty_writeback_interval)
		return 0;

	expired = wb->last_old_flush +
			msecs_to_jiffies(dirty_writeback_interval * 10);
	if (time_before(jiffies, expired))
		return 0;

	wb->last_old_flush = jiffies;
	nr_pages = get_nr_dirty_pages();

	if (nr_pages) {
		struct wb_writeback_work work = {
			.nr_pages	= nr_pages,
			.sync_mode	= WB_SYNC_NONE,
			.for_kupdate	= 1,
			.range_cyclic	= 1,
			.reason		= WB_REASON_PERIODIC,
		};

		return wb_writeback(wb, &work);
	}

	return 0;
}

/*
 * Retrieve work items and do the writeback they describe
 */
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
	struct backing_dev_info *bdi = wb->bdi;
	struct wb_writeback_work *work;
	long wrote = 0;

	set_bit(BDI_writeback_running, &wb->bdi->state);
	while ((work = get_next_work_item(bdi)) != NULL) {
		/*
		 * Override sync mode, in case we must wait for completion
		 * because this thread is exiting now.
		 */
		if (force_wait)
			work->sync_mode = WB_SYNC_ALL;

		trace_writeback_exec(bdi, work);

		wrote += wb_writeback(wb, work);

		/*
		 * Notify the caller of completion if this is a synchronous
		 * work item, otherwise just free it.