/*
 * bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
 *
 * bitmap_create  - sets up the bitmap structure
 * bitmap_destroy - destroys the bitmap structure
 *
 * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.:
 * - added disk storage for bitmap
 * - changes to allow various bitmap chunk sizes
 */

/*
 * Still to do:
 *
 * flush after percent set rather than just time based. (maybe both).
 */

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/timer.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/buffer_head.h>
#include <linux/seq_file.h>
#include <trace/events/block.h>
#include "md.h"
#include "bitmap.h"

static inline char *bmname(struct bitmap *bitmap)
{
	return bitmap->mddev ? mdname(bitmap->mddev) : "mdX";
}

/*
 * check a page and, if necessary, allocate it (or hijack it if the alloc fails)
 *
 * 1) check to see if this page is allocated, if it's not then try to alloc
 * 2) if the alloc fails, set the page's hijacked flag so we'll use the
 *    page pointer directly as a counter
 *
 * if we find our page, we increment the page's refcount so that it stays
 * allocated while we're using it
 */
static int bitmap_checkpage(struct bitmap_counts *bitmap,
			    unsigned long page, int create, int no_hijack)
__releases(bitmap->lock)
__acquires(bitmap->lock)
{
	unsigned char *mappage;

	if (page >= bitmap->pages) {
		/* This can happen if bitmap_start_sync goes beyond
		 * End-of-device while looking for a whole page.
		 * It is harmless.
		 */
		return -EINVAL;
	}

	if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */
		return 0;

	if (bitmap->bp[page].map) /* page is already allocated, just return */
		return 0;

	if (!create)
		return -ENOENT;

	/* this page has not been allocated yet */

	spin_unlock_irq(&bitmap->lock);
	/* It is possible that this is being called inside a
	 * prepare_to_wait/finish_wait loop from raid5.c:make_request().
	 * In general it is not permitted to sleep in that context as it
	 * can cause the loop to spin freely.
	 * That doesn't apply here as we can only reach this point
	 * once with any loop.
	 * When this function completes, either bp[page].map or
	 * bp[page].hijacked will be set.  In either case, this function
	 * will abort before getting to this point again.  So there is
	 * no risk of a free-spin, and so it is safe to assert
	 * that sleeping here is allowed.
	 */
	sched_annotate_sleep();
	mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
	spin_lock_irq(&bitmap->lock);

	if (mappage == NULL) {
		pr_debug("md/bitmap: map page allocation failed, hijacking\n");
		/* We don't support hijack for cluster raid */
		if (no_hijack)
			return -ENOMEM;
		/* failed - set the hijacked flag so that we can use the
		 * pointer as a counter */
		if (!bitmap->bp[page].map)
			bitmap->bp[page].hijacked = 1;
	} else if (bitmap->bp[page].map ||
		   bitmap->bp[page].hijacked) {
		/* somebody beat us to getting the page */
		kfree(mappage);
	} else {
		/* no page was in place and we have one, so install it */
		bitmap->bp[page].map = mappage;
		bitmap->missing_pages--;
	}
	return 0;
}
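
/*
 * Note on hijacked pages: when the allocation above fails, the 'map'
 * pointer word itself is pressed into service as counter storage, so a
 * whole page's worth of chunks is tracked by just two counters.  A
 * sketch of how a lookup might pick one of them, assuming the 16-bit
 * bitmap_counter_t used elsewhere in this file:
 *
 *	if (bitmap->bp[page].hijacked) {
 *		// first or second counter field of the hijacked pointer?
 *		int hi = (pageoff > PAGE_COUNTER_MASK);
 *		return &((bitmap_counter_t *)&bitmap->bp[page].map)[hi];
 *	}
 */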

/* if page is completely empty, put it back on the free list, or dealloc it */
/* if page was hijacked, unmark the flag so it might get alloced next time */
/* Note: lock should be held when calling this */
static void bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page)
{
	char *ptr;

	if (bitmap->bp[page].count) /* page is still busy */
		return;

	/* page is no longer in use, it can be released */

	if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */
		bitmap->bp[page].hijacked = 0;
		bitmap->bp[page].map = NULL;
	} else {
		/* normal case, free the page */
		ptr = bitmap->bp[page].map;
		bitmap->bp[page].map = NULL;
		bitmap->missing_pages++;
		kfree(ptr);
	}
}

/*
 * bitmap file handling - read and write the bitmap file and its superblock
 */

/*
 * basic page I/O operations
 */

/* IO operations when bitmap is stored near all superblocks */
static int read_sb_page(struct mddev *mddev, loff_t offset,
			struct page *page,
			unsigned long index, int size)
{
	/* choose a good rdev and read the page from there */

	struct md_rdev *rdev;
	sector_t target;

	rdev_for_each(rdev, mddev) {
		if (! test_bit(In_sync, &rdev->flags)
		    || test_bit(Faulty, &rdev->flags)
		    || test_bit(Bitmap_sync, &rdev->flags))
			continue;

		target = offset + index * (PAGE_SIZE/512);

		if (sync_page_io(rdev, target,
				 roundup(size, bdev_logical_block_size(rdev->bdev)),
				 page, REQ_OP_READ, 0, true)) {
			page->index = index;
			return 0;
		}
	}
	return -EIO;
}
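
/*
 * Note: sync_page_io() addresses the device in 512-byte sectors, so a
 * bitmap page spans PAGE_SIZE/512 sectors (8 for a 4 KiB page); the
 * 'index * (PAGE_SIZE/512)' term above steps over whole pages of the
 * bitmap relative to 'offset'.
 */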

static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
	/* Iterate the disks of an mddev, using rcu to protect access to the
	 * linked list, and raising the refcount of devices we return to ensure
	 * they don't disappear while in use.
	 * As devices are only added or removed when raid_disk is < 0 and
	 * nr_pending is 0 and In_sync is clear, the entries we return will
	 * still be in the same position on the list when we re-enter
	 * list_for_each_entry_continue_rcu.
	 *
	 * Note that if entered with 'rdev == NULL' to start at the
	 * beginning, we temporarily assign 'rdev' to an address which
	 * isn't really an rdev, but which can be used by
	 * list_for_each_entry_continue_rcu() to find the first entry.
	 */
	rcu_read_lock();
	if (rdev == NULL)
		/* start at the beginning */
		rdev = list_entry(&mddev->disks, struct md_rdev, same_set);
	else {
		/* release the previous rdev and start from there. */
		rdev_dec_pending(rdev, mddev);
	}
	list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) {
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* this is a usable device */
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			return rdev;
		}
	}
	rcu_read_unlock();
	return NULL;
}

static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
{
	struct md_rdev *rdev;
	struct block_device *bdev;
	struct mddev *mddev = bitmap->mddev;
	struct bitmap_storage *store = &bitmap->storage;

restart:
	rdev = NULL;
	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
		int size = PAGE_SIZE;
		loff_t offset = mddev->bitmap_info.offset;

		bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;

		if (page->index == store->file_pages-1) {
			int last_page_size = store->bytes & (PAGE_SIZE-1);
			if (last_page_size == 0)
				last_page_size = PAGE_SIZE;
			size = roundup(last_page_size,
				       bdev_logical_block_size(bdev));
		}
		/* Just make sure we aren't corrupting data or
		 * metadata
		 */
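		/* Worked example (hypothetical numbers, 4 KiB pages):
		 * with rdev->sb_start = 8, offset = 8 and page->index = 2,
		 * the write below targets sectors 8 + 8 + 2*8 = 32 up to
		 * 32 + size/512.  Each branch that follows verifies that
		 * this range cannot intrude on the data region
		 * [data_offset, data_offset + dev_sectors) or on the
		 * metadata, for the three possible on-disk orderings.
		 */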
		if (mddev->external) {
			/* Bitmap could be anywhere. */
			if (rdev->sb_start + offset + (page->index
						       * (PAGE_SIZE/512))
			    > rdev->data_offset
			    &&
			    rdev->sb_start + offset
			    < (rdev->data_offset + mddev->dev_sectors
			     + (PAGE_SIZE/512)))
				goto bad_alignment;
		} else if (offset < 0) {
			/* DATA  BITMAP METADATA  */
			if (offset
			    + (long)(page->index * (PAGE_SIZE/512))
			    + size/512 > 0)
				/* bitmap runs in to metadata */
				goto bad_alignment;
			if (rdev->data_offset + mddev->dev_sectors
			    > rdev->sb_start + offset)
				/* data runs in to bitmap */
				goto bad_alignment;
		} else if (rdev->sb_start < rdev->data_offset) {
			/* METADATA BITMAP DATA */
			if (rdev->sb_start
			    + offset
			    + page->index*(PAGE_SIZE/512) + size/512
			    > rdev->data_offset)
				/* bitmap runs in to data */
				goto bad_alignment;
		} else {
			/* DATA METADATA BITMAP - no problems */
		}
		md_super_write(mddev, rdev,
			       rdev->sb_start + offset
			       + page->index * (PAGE_SIZE/512),
			       size,
			       page);
	}

	if (wait && md_super_wait(mddev) < 0)
		goto restart;
	return 0;

 bad_alignment:
	return -EINVAL;
}

static void bitmap_file_kick(struct bitmap *bitmap);
/*
 * write out a page of the bitmap: to the bitmap file if there is one,
 * otherwise to the space reserved near the md superblocks
 */
static void write_page(struct bitmap *bitmap, struct page *page, int wait)
{
	struct buffer_head *bh;

	if (bitmap->storage.file == NULL) {
		switch (write_sb_page(bitmap, page, wait)) {
		case -EINVAL:
			set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
		}
	} else {

		bh = page_buffers(page);

		while (bh && bh->b_blocknr) {
			atomic_inc(&bitmap->pending_writes);
			set_buffer_locked(bh);
			set_buffer_mapped(bh);
			submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
			bh = bh->b_this_page;
		}

		if (wait)
			wait_event(bitmap->write_wait,
				   atomic_read(&bitmap->pending_writes)==0);
	}
	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
		bitmap_file_kick(bitmap);
}

static void end_bitmap_write(struct buffer_head *bh, int uptodate)
{
	struct bitmap *bitmap = bh->b_private;

	if (!uptodate)
		set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
	if (atomic_dec_and_test(&bitmap->pending_writes))
		wake_up(&bitmap->write_wait);
}

/* copied from buffer.c */
static void
__clear_page_buffers(struct page *page)
{
	ClearPagePrivate(page);
	set_page_private(page, 0);
	put_page(page);
}
static void free_buffers(struct page *page)
{
	struct buffer_head *bh;

	if (!PagePrivate(page))
		return;

	bh = page_buffers(page);
	while (bh) {
		struct buffer_head *next = bh->b_this_page;
		free_buffer_head(bh);
		bh = next;
	}
	__clear_page_buffers(page);
	put_page(page);
}

/* read a page from a file.
 * We both read the page, and attach buffers to the page to record the
 * address of each block (using bmap).  These addresses will be used
 * to write the block later, completely bypassing the filesystem.
 * This usage is similar to how swap files are handled, and allows us
 * to write to a file with no concerns of memory allocation failing.
 */
static int read_page(struct file *file, unsigned long index,
		     struct bitmap *bitmap,
		     unsigned long count,
		     struct page *page)
{
	int ret = 0;
	struct inode *inode = file_inode(file);
	struct buffer_head *bh;
	sector_t block;

	pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
		 (unsigned long long)index << PAGE_SHIFT);

	bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0);
	if (!bh) {
		ret = -ENOMEM;
		goto out;
	}
	attach_page_buffers(page, bh);
	block = index << (PAGE_SHIFT - inode->i_blkbits);
	while (bh) {
		if (count == 0)
			bh->b_blocknr = 0;
		else {
			bh->b_blocknr = bmap(inode, block);
			if (bh->b_blocknr == 0) {
				/* Cannot use this file! */
				ret = -EINVAL;
				goto out;
			}
			bh->b_bdev = inode->i_sb->s_bdev;
			if (count < (1<<inode->i_blkbits))
				count = 0;
			else
				count -= (1<<inode->i_blkbits);

			bh->b_end_io = end_bitmap_write;
			bh->b_private = bitmap;
			atomic_inc(&bitmap->pending_writes);
			set_buffer_locked(bh);
			set_buffer_mapped(bh);
			submit_bh(REQ_OP_READ, 0, bh);
		}
		block++;
		bh = bh->b_this_page;
	}
	page->index = index;

	wait_event(bitmap->write_wait,
		   atomic_read(&bitmap->pending_writes)==0);
	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
		ret = -EIO;
out:
	if (ret)
		pr_err("md: bitmap read error: (%dB @ %llu): %d\n",
		       (int)PAGE_SIZE,
		       (unsigned long long)index << PAGE_SHIFT,
		       ret);
	return ret;
}
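
/*
 * Note: bmap() returns 0 for an unallocated block (a hole), which is
 * why a zero b_blocknr above means "Cannot use this file!": every
 * bitmap block must already be allocated so it can later be written
 * back in place without going through the filesystem.
 */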

/*
 * bitmap file superblock operations
 */

/*
 * bitmap_wait_writes() should be called before writing any bitmap
 * blocks, to ensure previous writes, particularly from
 * bitmap_daemon_work(), have completed.
 */
static void bitmap_wait_writes(struct bitmap *bitmap)
{
	if (bitmap->storage.file)
		wait_event(bitmap->write_wait,
			   atomic_read(&bitmap->pending_writes)==0);
	else
		/* Note that we ignore the return value.  The writes
		 * might have failed, but that would just mean that
		 * some bits which should be cleared haven't been,
		 * which is safe.  The relevant bitmap blocks will
		 * probably get written again, but there is no great
		 * loss if they aren't.
		 */
		md_super_wait(bitmap->mddev);
}


/* update the event counter and sync the superblock to disk */
void bitmap_update_sb(struct bitmap *bitmap)
{
	bitmap_super_t *sb;

	if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
		return;
	if (bitmap->mddev->bitmap_info.external)
		return;
	if (!bitmap->storage.sb_page) /* no superblock */
		return;
	sb = kmap_atomic(bitmap->storage.sb_page);
	sb->events = cpu_to_le64(bitmap->mddev->events);
	if (bitmap->mddev->events < bitmap->events_cleared)
		/* rocking back to read-only */
		bitmap->events_cleared = bitmap->mddev->events;
	sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
	sb->state = cpu_to_le32(bitmap->flags);
	/* Just in case these have been changed via sysfs: */
	sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
	sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
	/* This might have been changed by a reshape */
	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
	sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize);
	sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes);
	sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
					   bitmap_info.space);
	kunmap_atomic(sb);
	write_page(bitmap, bitmap->storage.sb_page, 1);
}
EXPORT_SYMBOL(bitmap_update_sb);

/* print out the bitmap file superblock */
void bitmap_print_sb(struct bitmap *bitmap)
{
	bitmap_super_t *sb;

	if (!bitmap || !bitmap->storage.sb_page)
		return;
	sb = kmap_atomic(bitmap->storage.sb_page);
	pr_debug("%s: bitmap file superblock:\n", bmname(bitmap));
	pr_debug("         magic: %08x\n", le32_to_cpu(sb->magic));
	pr_debug("       version: %d\n", le32_to_cpu(sb->version));
	pr_debug("          uuid: %08x.%08x.%08x.%08x\n",
		 le32_to_cpu(*(__u32 *)(sb->uuid+0)),
		 le32_to_cpu(*(__u32 *)(sb->uuid+4)),
		 le32_to_cpu(*(__u32 *)(sb->uuid+8)),
		 le32_to_cpu(*(__u32 *)(sb->uuid+12)));
	pr_debug("        events: %llu\n",
		 (unsigned long long) le64_to_cpu(sb->events));
	pr_debug("events cleared: %llu\n",
		 (unsigned long long) le64_to_cpu(sb->events_cleared));
	pr_debug("         state: %08x\n", le32_to_cpu(sb->state));
	pr_debug("     chunksize: %d B\n", le32_to_cpu(sb->chunksize));
	pr_debug("  daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
	pr_debug("     sync size: %llu KB\n",
		 (unsigned long long)le64_to_cpu(sb->sync_size)/2);
	pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind));
	kunmap_atomic(sb);
}

/*
 * bitmap_new_disk_sb
 * @bitmap
 *
 * This function is somewhat the reverse of bitmap_read_sb.  bitmap_read_sb
 * reads and verifies the on-disk bitmap superblock and populates bitmap_info.
 * This function verifies 'bitmap_info' and populates the on-disk bitmap
 * structure, which is to be written to disk.
 *
 * Returns: 0 on success, -Exxx on error
 */
static int bitmap_new_disk_sb(struct bitmap *bitmap)
{
	bitmap_super_t *sb;
	unsigned long chunksize, daemon_sleep, write_behind;

	bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (bitmap->storage.sb_page == NULL)
		return -ENOMEM;
	bitmap->storage.sb_page->index = 0;

	sb = kmap_atomic(bitmap->storage.sb_page);

	sb->magic = cpu_to_le32(BITMAP_MAGIC);
	sb->version = cpu_to_le32(BITMAP_MAJOR_HI);

	chunksize = bitmap->mddev->bitmap_info.chunksize;
	BUG_ON(!chunksize);
	if (!is_power_of_2(chunksize)) {
		kunmap_atomic(sb);
		pr_warn("bitmap chunksize not a power of 2\n");
		return -EINVAL;
	}
	sb->chunksize = cpu_to_le32(chunksize);

	daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep;
	if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) {
		pr_debug("Choosing daemon_sleep default (5 sec)\n");
		daemon_sleep = 5 * HZ;
	}
	sb->daemon_sleep = cpu_to_le32(daemon_sleep);
	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;

	/*
	 * FIXME: write_behind for RAID1.  If not specified, what
	 * is a good choice?  We choose COUNTER_MAX / 2 arbitrarily.
	 */
	write_behind = bitmap->mddev->bitmap_info.max_write_behind;
	if (write_behind > COUNTER_MAX)
		write_behind = COUNTER_MAX / 2;
	sb->write_behind = cpu_to_le32(write_behind);
	bitmap->mddev->bitmap_info.max_write_behind = write_behind;

	/* keep the array size field of the bitmap superblock up to date */
	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);

	memcpy(sb->uuid, bitmap->mddev->uuid, 16);

	set_bit(BITMAP_STALE, &bitmap->flags);
	sb->state = cpu_to_le32(bitmap->flags);
	bitmap->events_cleared = bitmap->mddev->events;
	sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
	bitmap->mddev->bitmap_info.nodes = 0;

	kunmap_atomic(sb);

	return 0;
}

/* read the superblock from the bitmap file and initialize some bitmap fields */
static int bitmap_read_sb(struct bitmap *bitmap)
{
	char *reason = NULL;
	bitmap_super_t *sb;
	unsigned long chunksize, daemon_sleep, write_behind;
	unsigned long long events;
	int nodes = 0;
	unsigned long sectors_reserved = 0;
	int err = -EINVAL;
	struct page *sb_page;
	loff_t offset = bitmap->mddev->bitmap_info.offset;

	if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) {
		chunksize = 128 * 1024 * 1024;
		daemon_sleep = 5 * HZ;
		write_behind = 0;
		set_bit(BITMAP_STALE, &bitmap->flags);
		err = 0;
		goto out_no_sb;
	}
	/* page 0 is the superblock, read it... */
	sb_page = alloc_page(GFP_KERNEL);
	if (!sb_page)
		return -ENOMEM;
	bitmap->storage.sb_page = sb_page;

re_read:
	/* If cluster_slot is set, the cluster is setup */
	if (bitmap->cluster_slot >= 0) {
		sector_t bm_blocks = bitmap->mddev->resync_max_sectors;

		sector_div(bm_blocks,
			   bitmap->mddev->bitmap_info.chunksize >> 9);
		/* bits to bytes */
		bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
		/* to 4k blocks */
		bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
		offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3));
		pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
			bitmap->cluster_slot, offset);
	}

	if (bitmap->storage.file) {
		loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
		int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;

		err = read_page(bitmap->storage.file, 0,
				bitmap, bytes, sb_page);
	} else {
		err = read_sb_page(bitmap->mddev,
				   offset,
				   sb_page,
				   0, sizeof(bitmap_super_t));
	}
	if (err)
		return err;

	err = -EINVAL;
	sb = kmap_atomic(sb_page);

	chunksize = le32_to_cpu(sb->chunksize);
	daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
	write_behind = le32_to_cpu(sb->write_behind);
	sectors_reserved = le32_to_cpu(sb->sectors_reserved);
	/* Setup nodes/clustername only if bitmap version is
	 * cluster-compatible
	 */
	if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) {
		nodes = le32_to_cpu(sb->nodes);
		strlcpy(bitmap->mddev->bitmap_info.cluster_name,
				sb->cluster_name, 64);
	}

	/* verify that the bitmap-specific fields are valid */
	if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
		reason = "bad magic";
	else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
		 le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED)
		reason = "unrecognized superblock version";
	else if (chunksize < 512)
		reason = "bitmap chunksize too small";
	else if (!is_power_of_2(chunksize))
		reason = "bitmap chunksize not a power of 2";
	else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT)
		reason = "daemon sleep period out of range";
	else if (write_behind > COUNTER_MAX)
		reason = "write-behind limit out of range (0 - 16383)";
	if (reason) {
		pr_warn("%s: invalid bitmap file superblock: %s\n",
			bmname(bitmap), reason);
		goto out;
	}

	/* keep the array size field of the bitmap superblock up to date */
	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);

	if (bitmap->mddev->persistent) {
		/*
		 * We have a persistent array superblock, so compare the
		 * bitmap's UUID and event counter to the mddev's
		 */
		if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
			pr_warn("%s: bitmap superblock UUID mismatch\n",
				bmname(bitmap));
			goto out;
		}
		events = le64_to_cpu(sb->events);
		if (!nodes && (events < bitmap->mddev->events)) {
			pr_warn("%s: bitmap file is out of date (%llu < %llu) -- forcing full recovery\n",
				bmname(bitmap), events,
				(unsigned long long) bitmap->mddev->events);
			set_bit(BITMAP_STALE, &bitmap->flags);
		}
	}

	/* assign fields using values from superblock */
	bitmap->flags |= le32_to_cpu(sb->state);
	if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
		set_bit(BITMAP_HOSTENDIAN, &bitmap->flags);
	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
	strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64);
	err = 0;

out:
	kunmap_atomic(sb);
	/* Assigning chunksize is required for "re_read" */
	bitmap->mddev->bitmap_info.chunksize = chunksize;
	if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
		err = md_setup_cluster(bitmap->mddev, nodes);
		if (err) {
			pr_warn("%s: Could not setup cluster service (%d)\n",
				bmname(bitmap), err);
			goto out_no_sb;
		}
		bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev);
		goto re_read;
	}

out_no_sb:
	if (test_bit(BITMAP_STALE, &bitmap->flags))
		bitmap->events_cleared = bitmap->mddev->events;
	bitmap->mddev->bitmap_info.chunksize = chunksize;
	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
	bitmap->mddev->bitmap_info.max_write_behind = write_behind;
	bitmap->mddev->bitmap_info.nodes = nodes;
	if (bitmap->mddev->bitmap_info.space == 0 ||
	    bitmap->mddev->bitmap_info.space > sectors_reserved)
		bitmap->mddev->bitmap_info.space = sectors_reserved;
	if (err) {
		bitmap_print_sb(bitmap);
		if (bitmap->cluster_slot < 0)
			md_cluster_stop(bitmap->mddev);
	}
	return err;
}
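
/*
 * Note on the clustered re-read above: the superblock is first read
 * assuming slot 0's layout; once md_setup_cluster() reports this
 * node's slot, 'offset' is recomputed for that slot and the read is
 * retried via the "re_read" label.
 */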

/*
 * general bitmap file operations
 */

/*
 * on-disk bitmap:
 *
 * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
 * file a page at a time. There's a superblock at the start of the file.
 */
/* calculate the index of the page that contains this bit */
static inline unsigned long file_page_index(struct bitmap_storage *store,
					    unsigned long chunk)
{
	if (store->sb_page)
		chunk += sizeof(bitmap_super_t) << 3;
	return chunk >> PAGE_BIT_SHIFT;
}

/* calculate the (bit) offset of this bit within a page */
static inline unsigned long file_page_offset(struct bitmap_storage *store,
					     unsigned long chunk)
{
	if (store->sb_page)
		chunk += sizeof(bitmap_super_t) << 3;
	return chunk & (PAGE_BITS - 1);
}
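
/*
 * Worked example, assuming 4 KiB pages and the 256-byte bitmap_super_t
 * from bitmap.h: the superblock occupies 256 * 8 = 2048 bits, so chunk 0
 * is bit 2048 of page 0 (PAGE_BITS = 32768, PAGE_BIT_SHIFT = 15) and
 * chunk 30720 is bit 0 of page 1.
 */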

/*
 * return a pointer to the page in the filemap that contains the given bit
 *
 */
static inline struct page *filemap_get_page(struct bitmap_storage *store,
					    unsigned long chunk)
{
	if (file_page_index(store, chunk) >= store->file_pages)
		return NULL;
	return store->filemap[file_page_index(store, chunk)];
}

static int bitmap_storage_alloc(struct bitmap_storage *store,
				unsigned long chunks, int with_super,
				int slot_number)
{
	int pnum, offset = 0;
	unsigned long num_pages;
	unsigned long bytes;

	bytes = DIV_ROUND_UP(chunks, 8);
	if (with_super)
		bytes += sizeof(bitmap_super_t);

	num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
	offset = slot_number * num_pages;

	store->filemap = kmalloc(sizeof(struct page *)
				 * num_pages, GFP_KERNEL);
	if (!store->filemap)
		return -ENOMEM;

	if (with_super && !store->sb_page) {
		store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
		if (store->sb_page == NULL)
			return -ENOMEM;
	}

	pnum = 0;
	if (store->sb_page) {
		store->filemap[0] = store->sb_page;
		pnum = 1;
		store->sb_page->index = offset;
	}

	for ( ; pnum < num_pages; pnum++) {
		store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO);
		if (!store->filemap[pnum]) {
			store->file_pages = pnum;
			return -ENOMEM;
		}
		store->filemap[pnum]->index = pnum + offset;
	}
	store->file_pages = pnum;

	/* We need 4 bits per page, rounded up to a multiple
	 * of sizeof(unsigned long) */
	store->filemap_attr = kzalloc(
		roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)),
		GFP_KERNEL);
	if (!store->filemap_attr)
		return -ENOMEM;

	store->bytes = bytes;

	return 0;
}

static void bitmap_file_unmap(struct bitmap_storage *store)
{
	struct page **map, *sb_page;
	int pages;
	struct file *file;

	file = store->file;
	map = store->filemap;
	pages = store->file_pages;
	sb_page = store->sb_page;

	while (pages--)
		if (map[pages] != sb_page) /* 0 is sb_page, release it below */
			free_buffers(map[pages]);
	kfree(map);
	kfree(store->filemap_attr);

	if (sb_page)
		free_buffers(sb_page);

	if (file) {
		struct inode *inode = file_inode(file);
		invalidate_mapping_pages(inode->i_mapping, 0, -1);
		fput(file);
	}
}

/*
 * bitmap_file_kick - if an error occurs while manipulating the bitmap file
 * then it is no longer reliable, so we stop using it and we mark the file
 * as failed in the superblock
 */
static void bitmap_file_kick(struct bitmap *bitmap)
{
	char *path, *ptr = NULL;

	if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) {
		bitmap_update_sb(bitmap);

		if (bitmap->storage.file) {
			path = kmalloc(PAGE_SIZE, GFP_KERNEL);
			if (path)
				ptr = file_path(bitmap->storage.file,
					     path, PAGE_SIZE);

			pr_warn("%s: kicking failed bitmap file %s from array!\n",
				bmname(bitmap), IS_ERR(ptr) ? "" : ptr);

			kfree(path);
		} else
			pr_warn("%s: disabling internal bitmap due to errors\n",
				bmname(bitmap));
	}
}

enum bitmap_page_attr {
	BITMAP_PAGE_DIRTY = 0,     /* there are set bits that need to be synced */
	BITMAP_PAGE_PENDING = 1,   /* there are bits that are being cleaned.
				    * i.e. counter is 1 or 2. */
	BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */
};

static inline void set_page_attr(struct bitmap *bitmap, int pnum,
				 enum bitmap_page_attr attr)
{
	set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
}

static inline void clear_page_attr(struct bitmap *bitmap, int pnum,
				   enum bitmap_page_attr attr)
{
	clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
}

static inline int test_page_attr(struct bitmap *bitmap, int pnum,
				 enum bitmap_page_attr attr)
{
	return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
}

static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum,
					   enum bitmap_page_attr attr)
{
	return test_and_clear_bit((pnum<<2) + attr,
				  bitmap->storage.filemap_attr);
}
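
/*
 * The attribute bits for page 'pnum' live at bits (pnum<<2) .. (pnum<<2)+3
 * of filemap_attr, i.e. four bits per page with one spare; this is why
 * bitmap_storage_alloc() sizes filemap_attr at 4 bits per page, rounded
 * up to a multiple of sizeof(unsigned long).
 */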
/*
 * bitmap_file_set_bit -- called before performing a write to the md device
 * to set (and eventually sync) a particular bit in the bitmap file
 *
 * we set the bit immediately, then we record the page number so that
 * when an unplug occurs, we can flush the dirty pages out to disk
 */
static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
{
	unsigned long bit;
	struct page *page;
	void *kaddr;
	unsigned long chunk = block >> bitmap->counts.chunkshift;
	struct bitmap_storage *store = &bitmap->storage;
	unsigned long node_offset = 0;

	if (mddev_is_clustered(bitmap->mddev))
		node_offset = bitmap->cluster_slot * store->file_pages;

	page = filemap_get_page(&bitmap->storage, chunk);
	if (!page)
		return;
	bit = file_page_offset(&bitmap->storage, chunk);

	/* set the bit */
	kaddr = kmap_atomic(page);
	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
		set_bit(bit, kaddr);
	else
		set_bit_le(bit, kaddr);
	kunmap_atomic(kaddr);
	pr_debug("set file bit %lu page %lu\n", bit, page->index);
	/* record page number so it gets flushed to disk when unplug occurs */
	set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY);
}

static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
{
	unsigned long bit;
	struct page *page;
	void *paddr;
	unsigned long chunk = block >> bitmap->counts.chunkshift;
	struct bitmap_storage *store = &bitmap->storage;
	unsigned long node_offset = 0;

	if (mddev_is_clustered(bitmap->mddev))
		node_offset = bitmap->cluster_slot * store->file_pages;

	page = filemap_get_page(&bitmap->storage, chunk);
	if (!page)
		return;
	bit = file_page_offset(&bitmap->storage, chunk);
	paddr = kmap_atomic(page);
	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
		clear_bit(bit, paddr);
	else
		clear_bit_le(bit, paddr);
	kunmap_atomic(paddr);
	if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
		set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING);
		bitmap->allclean = 0;
	}
}

static int bitmap_file_test_bit(struct bitmap *bitmap, sector_t block)
{
	unsigned long bit;
	struct page *page;
	void *paddr;
	unsigned long chunk = block >> bitmap->counts.chunkshift;
	int set = 0;

	page = filemap_get_page(&bitmap->storage, chunk);
	if (!page)
		return -EINVAL;
	bit = file_page_offset(&bitmap->storage, chunk);
	paddr = kmap_atomic(page);
	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
		set = test_bit(bit, paddr);
	else
		set = test_bit_le(bit, paddr);
	kunmap_atomic(paddr);
	return set;
}


/* this gets called when the md device is ready to unplug its underlying
 * (slave) device queues -- before we let any writes go down, we need to
 * sync the dirty pages of the bitmap file to disk */
void bitmap_unplug(struct bitmap *bitmap)
{
	unsigned long i;
	int dirty, need_write;
	int writing = 0;

	if (!bitmap || !bitmap->storage.filemap ||
	    test_bit(BITMAP_STALE, &bitmap->flags))
		return;

	/* look at each page to see if there are any set bits that need to be
	 * flushed out to disk */
	for (i = 0; i < bitmap->storage.file_pages; i++) {
		if (!bitmap->storage.filemap)
			return;
		dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
		need_write = test_and_clear_page_attr(bitmap, i,
						      BITMAP_PAGE_NEEDWRITE);
		if (dirty || need_write) {
			if (!writing) {
				bitmap_wait_writes(bitmap);
				if (bitmap->mddev->queue)
					blk_add_trace_msg(bitmap->mddev->queue,
							  "md bitmap_unplug");
			}
			clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
			write_page(bitmap, bitmap->storage.filemap[i], 0);
			writing = 1;
		}
	}
	if (writing)
		bitmap_wait_writes(bitmap);

	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
		bitmap_file_kick(bitmap);
}
EXPORT_SYMBOL(bitmap_unplug);