core.c
/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/hdreg.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list_sort.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/pr.h>
#include <linux/ptrace.h>
#include <linux/nvme_ioctl.h>
#include <linux/t10-pi.h>
#include <linux/pm_qos.h>
#include <asm/unaligned.h>

#include "nvme.h"
#include "fabrics.h"

#define NVME_MINORS		(1U << MINORBITS)

unsigned char admin_timeout = 60;
module_param(admin_timeout, byte, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
EXPORT_SYMBOL_GPL(admin_timeout);

unsigned char nvme_io_timeout = 30;
module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
EXPORT_SYMBOL_GPL(nvme_io_timeout);

static unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");

static u8 nvme_max_retries = 5;
module_param_named(max_retries, nvme_max_retries, byte, 0644);
MODULE_PARM_DESC(max_retries, "max number of retries a command may have");

static int nvme_char_major;
module_param(nvme_char_major, int, 0);

static unsigned long default_ps_max_latency_us = 100000;
module_param(default_ps_max_latency_us, ulong, 0644);
MODULE_PARM_DESC(default_ps_max_latency_us,
		 "max power saving latency for new devices; use PM QOS to change per device");

static bool force_apst;
module_param(force_apst, bool, 0644);
MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");

static bool streams;
module_param(streams, bool, 0644);
MODULE_PARM_DESC(streams, "turn on support for Streams write directives");

struct workqueue_struct *nvme_wq;
EXPORT_SYMBOL_GPL(nvme_wq);

static LIST_HEAD(nvme_ctrl_list);
static DEFINE_SPINLOCK(dev_list_lock);

static struct class *nvme_class;

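/*
 * Build command dword 10 for Get Log Page: the log identifier in the low
 * byte, the 0's based number of dwords to transfer in bits 31:16.
 */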
static __le32 nvme_get_log_dw10(u8 lid, size_t size)
{
	return cpu_to_le32((((size / 4) - 1) << 16) | lid);
}

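/*
 * Move the controller to the RESETTING state and queue its reset_work on
 * nvme_wq; returns -EBUSY if either step fails.  The _sync variant below
 * additionally waits for the queued reset work to finish.
 */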
int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
{
	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
		return -EBUSY;
	if (!queue_work(nvme_wq, &ctrl->reset_work))
		return -EBUSY;
	return 0;
}
EXPORT_SYMBOL_GPL(nvme_reset_ctrl);

static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
{
	int ret;

	ret = nvme_reset_ctrl(ctrl);
	if (!ret)
		flush_work(&ctrl->reset_work);
	return ret;
}

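/* Translate the NVMe status stashed in the request into a blk_status_t. */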
static blk_status_t nvme_error_status(struct request *req)
{
	switch (nvme_req(req)->status & 0x7ff) {
	case NVME_SC_SUCCESS:
		return BLK_STS_OK;
	case NVME_SC_CAP_EXCEEDED:
		return BLK_STS_NOSPC;
	case NVME_SC_ONCS_NOT_SUPPORTED:
		return BLK_STS_NOTSUPP;
	case NVME_SC_WRITE_FAULT:
	case NVME_SC_READ_ERROR:
	case NVME_SC_UNWRITTEN_BLOCK:
		return BLK_STS_MEDIUM;
	default:
		return BLK_STS_IOERR;
	}
}

static inline bool nvme_req_needs_retry(struct request *req)
{
	if (blk_noretry_request(req))
		return false;
	if (nvme_req(req)->status & NVME_SC_DNR)
		return false;
	if (jiffies - req->start_time >= req->timeout)
		return false;
	if (nvme_req(req)->retries >= nvme_max_retries)
		return false;
	return true;
}

void nvme_complete_rq(struct request *req)
{
	if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
		nvme_req(req)->retries++;
		blk_mq_requeue_request(req, true);
		return;
	}

	blk_mq_end_request(req, nvme_error_status(req));
}
EXPORT_SYMBOL_GPL(nvme_complete_rq);

void nvme_cancel_request(struct request *req, void *data, bool reserved)
{
	int status;

	if (!blk_mq_request_started(req))
		return;

	dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
				"Cancelling I/O %d", req->tag);

	status = NVME_SC_ABORT_REQ;
	if (blk_queue_dying(req->q))
		status |= NVME_SC_DNR;
	nvme_req(req)->status = status;
	blk_mq_complete_request(req);

}
EXPORT_SYMBOL_GPL(nvme_cancel_request);

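/*
 * Controller state machine: move to @new_state only if the transition from
 * the current state is allowed, under ctrl->lock.  Returns true if the
 * state was actually changed.
 */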
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
		enum nvme_ctrl_state new_state)
{
	enum nvme_ctrl_state old_state;
	bool changed = false;

	spin_lock_irq(&ctrl->lock);

	old_state = ctrl->state;
	switch (new_state) {
	case NVME_CTRL_LIVE:
		switch (old_state) {
		case NVME_CTRL_NEW:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_RECONNECTING:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	case NVME_CTRL_RESETTING:
		switch (old_state) {
		case NVME_CTRL_NEW:
		case NVME_CTRL_LIVE:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	case NVME_CTRL_RECONNECTING:
		switch (old_state) {
		case NVME_CTRL_LIVE:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	case NVME_CTRL_DELETING:
		switch (old_state) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_RECONNECTING:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	case NVME_CTRL_DEAD:
		switch (old_state) {
		case NVME_CTRL_DELETING:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	default:
		break;
	}

	if (changed)
		ctrl->state = new_state;

	spin_unlock_irq(&ctrl->lock);

	return changed;
}
EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);

static void nvme_free_ns(struct kref *kref)
{
	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);

	if (ns->ndev)
		nvme_nvm_unregister(ns);

	if (ns->disk) {
		spin_lock(&dev_list_lock);
		ns->disk->private_data = NULL;
		spin_unlock(&dev_list_lock);
	}

	put_disk(ns->disk);
	ida_simple_remove(&ns->ctrl->ns_ida, ns->instance);
	nvme_put_ctrl(ns->ctrl);
	kfree(ns);
}

static void nvme_put_ns(struct nvme_ns *ns)
{
	kref_put(&ns->kref, nvme_free_ns);
}

static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
{
	struct nvme_ns *ns;

	spin_lock(&dev_list_lock);
	ns = disk->private_data;
	if (ns) {
		if (!kref_get_unless_zero(&ns->kref))
			goto fail;
		if (!try_module_get(ns->ctrl->ops->module))
			goto fail_put_ns;
	}
	spin_unlock(&dev_list_lock);

	return ns;

fail_put_ns:
	kref_put(&ns->kref, nvme_free_ns);
fail:
	spin_unlock(&dev_list_lock);
	return NULL;
}

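/*
 * Allocate a request carrying an NVMe command.  qid selects a specific
 * hardware queue (NVME_QID_ANY lets blk-mq pick one); the command itself is
 * stashed in nvme_req(req)->cmd and picked up again at command setup time.
 */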
struct request *nvme_alloc_request(struct request_queue *q,
		struct nvme_command *cmd, unsigned int flags, int qid)
{
	unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
	struct request *req;

	if (qid == NVME_QID_ANY) {
		req = blk_mq_alloc_request(q, op, flags);
	} else {
		req = blk_mq_alloc_request_hctx(q, op, flags,
				qid ? qid - 1 : 0);
	}
	if (IS_ERR(req))
		return req;

	req->cmd_flags |= REQ_FAILFAST_DRIVER;
	nvme_req(req)->cmd = cmd;

	return req;
}
EXPORT_SYMBOL_GPL(nvme_alloc_request);

static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));

	c.directive.opcode = nvme_admin_directive_send;
	c.directive.nsid = cpu_to_le32(NVME_NSID_ALL);
	c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE;
	c.directive.dtype = NVME_DIR_IDENTIFY;
	c.directive.tdtype = NVME_DIR_STREAMS;
	c.directive.endir = enable ? NVME_DIR_ENDIR : 0;

	return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0);
}

static int nvme_disable_streams(struct nvme_ctrl *ctrl)
{
	return nvme_toggle_streams(ctrl, false);
}

static int nvme_enable_streams(struct nvme_ctrl *ctrl)
{
	return nvme_toggle_streams(ctrl, true);
}

static int nvme_get_stream_params(struct nvme_ctrl *ctrl,
				  struct streams_directive_params *s, u32 nsid)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	memset(s, 0, sizeof(*s));

	c.directive.opcode = nvme_admin_directive_recv;
	c.directive.nsid = cpu_to_le32(nsid);
	c.directive.numd = cpu_to_le32((sizeof(*s) >> 2) - 1);
	c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
	c.directive.dtype = NVME_DIR_STREAMS;

	return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s));
}

static int nvme_configure_directives(struct nvme_ctrl *ctrl)
{
	struct streams_directive_params s;
	int ret;

	if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
		return 0;
	if (!streams)
		return 0;

	ret = nvme_enable_streams(ctrl);
	if (ret)
		return ret;

	ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL);
	if (ret)
		return ret;

	ctrl->nssa = le16_to_cpu(s.nssa);
	if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) {
		dev_info(ctrl->device, "too few streams (%u) available\n",
					ctrl->nssa);
		nvme_disable_streams(ctrl);
		return 0;
	}

	ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
	dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
	return 0;
}

/*
 * Check if 'req' has a write hint associated with it. If it does, assign
 * a valid namespace stream to the write.
 */
static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
				     struct request *req, u16 *control,
				     u32 *dsmgmt)
{
	enum rw_hint streamid = req->write_hint;

	if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE)
		streamid = 0;
	else {
		streamid--;
		if (WARN_ON_ONCE(streamid > ctrl->nr_streams))
			return;

		*control |= NVME_RW_DTYPE_STREAMS;
		*dsmgmt |= streamid << 16;
	}

	if (streamid < ARRAY_SIZE(req->q->write_hints))
		req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
}

static inline void nvme_setup_flush(struct nvme_ns *ns,
		struct nvme_command *cmnd)
{
	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->common.opcode = nvme_cmd_flush;
	cmnd->common.nsid = cpu_to_le32(ns->ns_id);
}

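/*
 * Build a DSM deallocate command with one nvme_dsm_range per bio in the
 * (possibly merged) discard request; the range array is attached to the
 * request as an RQF_SPECIAL_PAYLOAD special vec.
 */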
static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
		struct nvme_command *cmnd)
{
	unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
	struct nvme_dsm_range *range;
	struct bio *bio;

	range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
	if (!range)
		return BLK_STS_RESOURCE;

	__rq_for_each_bio(bio, req) {
		u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
		u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;

		range[n].cattr = cpu_to_le32(0);
		range[n].nlb = cpu_to_le32(nlb);
		range[n].slba = cpu_to_le64(slba);
		n++;
	}

	if (WARN_ON_ONCE(n != segments)) {
		kfree(range);
		return BLK_STS_IOERR;
	}

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->dsm.opcode = nvme_cmd_dsm;
	cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
	cmnd->dsm.nr = cpu_to_le32(segments - 1);
	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);

	req->special_vec.bv_page = virt_to_page(range);
	req->special_vec.bv_offset = offset_in_page(range);
	req->special_vec.bv_len = sizeof(*range) * segments;
	req->rq_flags |= RQF_SPECIAL_PAYLOAD;

	return BLK_STS_OK;
}

static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
		struct request *req, struct nvme_command *cmnd)
{
	struct nvme_ctrl *ctrl = ns->ctrl;
	u16 control = 0;
	u32 dsmgmt = 0;

	/*
	 * If formatted with metadata, require the block layer to provide a buffer
	 * unless this namespace is formatted such that the metadata can be
	 * stripped/generated by the controller with PRACT=1.
	 */
	if (ns && ns->ms &&
	    (!ns->pi_type || ns->ms != sizeof(struct t10_pi_tuple)) &&
	    !blk_integrity_rq(req) && !blk_rq_is_passthrough(req))
		return BLK_STS_NOTSUPP;

	if (req->cmd_flags & REQ_FUA)
		control |= NVME_RW_FUA;
	if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
		control |= NVME_RW_LR;

	if (req->cmd_flags & REQ_RAHEAD)
		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
	cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
	cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);

	if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
		nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);

	if (ns->ms) {
		switch (ns->pi_type) {
		case NVME_NS_DPS_PI_TYPE3:
			control |= NVME_RW_PRINFO_PRCHK_GUARD;
			break;
		case NVME_NS_DPS_PI_TYPE1:
		case NVME_NS_DPS_PI_TYPE2:
			control |= NVME_RW_PRINFO_PRCHK_GUARD |
					NVME_RW_PRINFO_PRCHK_REF;
			cmnd->rw.reftag = cpu_to_le32(
					nvme_block_nr(ns, blk_rq_pos(req)));
			break;
		}
		if (!blk_integrity_rq(req))
			control |= NVME_RW_PRINFO_PRACT;
	}

	cmnd->rw.control = cpu_to_le16(control);
	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
	return 0;
}

blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
		struct nvme_command *cmd)
{
	blk_status_t ret = BLK_STS_OK;

	if (!(req->rq_flags & RQF_DONTPREP)) {
		nvme_req(req)->retries = 0;
		nvme_req(req)->flags = 0;
		req->rq_flags |= RQF_DONTPREP;
	}

	switch (req_op(req)) {
	case REQ_OP_DRV_IN:
	case REQ_OP_DRV_OUT:
		memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
		break;
	case REQ_OP_FLUSH:
		nvme_setup_flush(ns, cmd);
		break;
	case REQ_OP_WRITE_ZEROES:
		/* currently only aliased to deallocate for a few ctrls: */
	case REQ_OP_DISCARD:
		ret = nvme_setup_discard(ns, req, cmd);
		break;
	case REQ_OP_READ:
	case REQ_OP_WRITE:
		ret = nvme_setup_rw(ns, req, cmd);
		break;
	default:
		WARN_ON_ONCE(1);
		return BLK_STS_IOERR;
	}

	cmd->common.command_id = req->tag;
	return ret;
}
EXPORT_SYMBOL_GPL(nvme_setup_cmd);

/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code
 */
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
		union nvme_result *result, void *buffer, unsigned bufflen,
		unsigned timeout, int qid, int at_head, int flags)
{
	struct request *req;
	int ret;

	req = nvme_alloc_request(q, cmd, flags, qid);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;

	if (buffer && bufflen) {
		ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
		if (ret)
			goto out;
	}

	blk_execute_rq(req->q, NULL, req, at_head);
	if (result)
		*result = nvme_req(req)->result;
	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
		ret = -EINTR;
	else
		ret = nvme_req(req)->status;
 out:
	blk_mq_free_request(req);
	return ret;
}
EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);

int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
		void *buffer, unsigned bufflen)
{
	return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
			NVME_QID_ANY, 0, 0);
}
EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);

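/*
 * Map a user data buffer (and an optional metadata buffer for namespaces
 * with protection information) into the request, execute it synchronously,
 * then copy the result and any read metadata back to user space.
 */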
int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
		void __user *ubuffer, unsigned bufflen,
		void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
		u32 *result, unsigned timeout)
{
	bool write = nvme_is_write(cmd);
	struct nvme_ns *ns = q->queuedata;
	struct gendisk *disk = ns ? ns->disk : NULL;
	struct request *req;
	struct bio *bio = NULL;
	void *meta = NULL;
	int ret;

	req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;

	if (ubuffer && bufflen) {
		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
				GFP_KERNEL);
		if (ret)
			goto out;
		bio = req->bio;

		if (!disk)
			goto submit;
		bio->bi_disk = disk;

		if (meta_buffer && meta_len) {
			struct bio_integrity_payload *bip;

			meta = kmalloc(meta_len, GFP_KERNEL);
			if (!meta) {
				ret = -ENOMEM;
				goto out_unmap;
			}

			if (write) {
				if (copy_from_user(meta, meta_buffer,
						meta_len)) {
					ret = -EFAULT;
					goto out_free_meta;
				}
			}

			bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
			if (IS_ERR(bip)) {
				ret = PTR_ERR(bip);
				goto out_free_meta;
			}

			bip->bip_iter.bi_size = meta_len;
			bip->bip_iter.bi_sector = meta_seed;

			ret = bio_integrity_add_page(bio, virt_to_page(meta),
					meta_len, offset_in_page(meta));
			if (ret != meta_len) {
				ret = -ENOMEM;
				goto out_free_meta;
			}
		}
	}
 submit:
	blk_execute_rq(req->q, disk, req, 0);
	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
		ret = -EINTR;
	else
		ret = nvme_req(req)->status;
	if (result)
		*result = le32_to_cpu(nvme_req(req)->result.u32);
	if (meta && !ret && !write) {
		if (copy_to_user(meta_buffer, meta, meta_len))
			ret = -EFAULT;
	}
 out_free_meta:
	kfree(meta);
 out_unmap:
	if (bio)
		blk_rq_unmap_user(bio);
 out:
	blk_mq_free_request(req);
	return ret;
}

int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
		void __user *ubuffer, unsigned bufflen, u32 *result,
		unsigned timeout)
{
	return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0,
			result, timeout);
}

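/*
 * Keep Alive completion: on success re-arm ka_work for the next KATO
 * interval, on failure log the error and stop rescheduling.
 */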
static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
{
	struct nvme_ctrl *ctrl = rq->end_io_data;

	blk_mq_free_request(rq);

	if (status) {
		dev_err(ctrl->device,
			"failed nvme_keep_alive_end_io error=%d\n",
				status);
		return;
	}

	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
}

static int nvme_keep_alive(struct nvme_ctrl *ctrl)
{
	struct nvme_command c;
	struct request *rq;

	memset(&c, 0, sizeof(c));
	c.common.opcode = nvme_admin_keep_alive;

	rq = nvme_alloc_request(ctrl->admin_q, &c, BLK_MQ_REQ_RESERVED,
			NVME_QID_ANY);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	rq->timeout = ctrl->kato * HZ;
	rq->end_io_data = ctrl;

	blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io);

	return 0;
}

static void nvme_keep_alive_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
			struct nvme_ctrl, ka_work);

	if (nvme_keep_alive(ctrl)) {
		/* allocation failure, reset the controller */
		dev_err(ctrl->device, "keep-alive failed\n");
		nvme_reset_ctrl(ctrl);
		return;
	}
}

void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
{
	if (unlikely(ctrl->kato == 0))
		return;

	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
}
EXPORT_SYMBOL_GPL(nvme_start_keep_alive);

void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
{
	if (unlikely(ctrl->kato == 0))
		return;

	cancel_delayed_work_sync(&ctrl->ka_work);
}
EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);

static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
{
	struct nvme_command c = { };
	int error;

	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
	c.identify.opcode = nvme_admin_identify;
	c.identify.cns = NVME_ID_CNS_CTRL;

	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
	if (!*id)
		return -ENOMEM;

	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
			sizeof(struct nvme_id_ctrl));
	if (error)
		kfree(*id);
	return error;
}

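/*
 * Issue Identify (Namespace Identification Descriptor list) and walk the
 * returned descriptors to pick up the EUI-64, NGUID and UUID identifiers
 * for this namespace.
 */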
static int nvme_identify_ns_descs(struct nvme_ns *ns, unsigned nsid)
{
	struct nvme_command c = { };
	int status;
	void *data;
	int pos;
	int len;

	c.identify.opcode = nvme_admin_identify;
	c.identify.nsid = cpu_to_le32(nsid);
	c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;

	data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, data,
				      NVME_IDENTIFY_DATA_SIZE);
	if (status)
		goto free_data;

	for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
		struct nvme_ns_id_desc *cur = data + pos;

		if (cur->nidl == 0)
			break;

		switch (cur->nidt) {
		case NVME_NIDT_EUI64:
			if (cur->nidl != NVME_NIDT_EUI64_LEN) {
				dev_warn(ns->ctrl->device,
					 "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n",
					 cur->nidl);
				goto free_data;
			}
			len = NVME_NIDT_EUI64_LEN;
			memcpy(ns->eui, data + pos + sizeof(*cur), len);
			break;
		case NVME_NIDT_NGUID:
			if (cur->nidl != NVME_NIDT_NGUID_LEN) {
				dev_warn(ns->ctrl->device,
					 "ctrl returned bogus length: %d for NVME_NIDT_NGUID\n",
					 cur->nidl);
				goto free_data;
			}
			len = NVME_NIDT_NGUID_LEN;
			memcpy(ns->nguid, data + pos + sizeof(*cur), len);
			break;
		case NVME_NIDT_UUID:
			if (cur->nidl != NVME_NIDT_UUID_LEN) {
				dev_warn(ns->ctrl->device,
					 "ctrl returned bogus length: %d for NVME_NIDT_UUID\n",
					 cur->nidl);
				goto free_data;
			}
			len = NVME_NIDT_UUID_LEN;
			uuid_copy(&ns->uuid, data + pos + sizeof(*cur));
			break;
		default:
			/* Skip unknown types */
			len = cur->nidl;
			break;
		}

		len += sizeof(*cur);
	}
free_data:
	kfree(data);
	return status;
}

static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
{
	struct nvme_command c = { };

	c.identify.opcode = nvme_admin_identify;
	c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
	c.identify.nsid = cpu_to_le32(nsid);
	return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
}

static int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
		struct nvme_id_ns **id)
{
	struct nvme_command c = { };
	int error;

	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
	c.identify.opcode = nvme_admin_identify;
	c.identify.nsid = cpu_to_le32(nsid);
	c.identify.cns = NVME_ID_CNS_NS;

	*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
	if (!*id)
		return -ENOMEM;

	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
			sizeof(struct nvme_id_ns));
	if (error)
		kfree(*id);
	return error;
}

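/*
 * Issue a Set Features admin command; on success the completion's dword 0
 * is returned through *result.
 */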
static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
		      void *buffer, size_t buflen, u32 *result)
{
	struct nvme_command c;
	union nvme_result res;
	int ret;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_set_features;
	c.features.fid = cpu_to_le32(fid);
	c.features.dword11 = cpu_to_le32(dword11);

	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
			buffer, buflen, 0, NVME_QID_ANY, 0, 0);
	if (ret >= 0 && result)
		*result = le32_to_cpu(res.u32);
	return ret;
}

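/*
 * Request *count I/O queues via Set Features (Number of Queues).  Both
 * halves of dword 11 are 0's based; on return *count is reduced to the
 * number of queues actually granted, or set to 0 if the controller
 * rejected the request.
 */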
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
{
	u32 q_count = (*count - 1) | ((*count - 1) << 16);
	u32 result;
	int status, nr_io_queues;

	status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
			&result);
	if (status < 0)
		return status;

	/*
	 * Degraded controllers might return an error when setting the queue
	 * count.  We still want to be able to bring them online and offer
	 * access to the admin queue, as that might be the only way to fix them up.
	 */
	if (status > 0) {
		dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
		*count = 0;
	} else {
		nr_io_queues = min(result & 0xffff, result >> 16) + 1;
		*count = min(*count, nr_io_queues);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(nvme_set_queue_count);

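/*
 * NVME_IOCTL_SUBMIT_IO: build a read/write/compare command from a
 * struct nvme_user_io, mapping in the data buffer and, for non-extended
 * LBA formats, a separate metadata buffer.
 */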
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
	struct nvme_user_io io;
	struct nvme_command c;
	unsigned length, meta_len;
	void __user *metadata;

	if (copy_from_user(&io, uio, sizeof(io)))
		return -EFAULT;
	if (io.flags)
		return -EINVAL;

	switch (io.opcode) {
	case nvme_cmd_write:
	case nvme_cmd_read:
	case nvme_cmd_compare:
		break;
	default:
		return -EINVAL;
	}

	length = (io.nblocks + 1) << ns->lba_shift;
	meta_len = (io.nblocks + 1) * ns->ms;
	metadata = (void __user *)(uintptr_t)io.metadata;

	if (ns->ext) {
		length += meta_len;
		meta_len = 0;
	} else if (meta_len) {
		if ((io.metadata & 3) || !io.metadata)
			return -EINVAL;
	}

	memset(&c, 0, sizeof(c));
	c.rw.opcode = io.opcode;
	c.rw.flags = io.flags;
	c.rw.nsid = cpu_to_le32(ns->ns_id);
	c.rw.slba = cpu_to_le64(io.slba);
	c.rw.length = cpu_to_le16(io.nblocks);
	c.rw.control = cpu_to_le16(io.control);
	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
	c.rw.reftag = cpu_to_le32(io.reftag);
	c.rw.apptag = cpu_to_le16(io.apptag);
	c.rw.appmask = cpu_to_le16(io.appmask);

	return __nvme_submit_user_cmd(ns->queue, &c,
			(void __user *)(uintptr_t)io.addr, length,
			metadata, meta_len, io.slba, NULL, 0);
}

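/*
 * NVME_IOCTL_ADMIN_CMD / NVME_IOCTL_IO_CMD: execute an arbitrary passthrough
 * command from user space on the admin or per-namespace I/O queue.
 */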
static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
			struct nvme_passthru_cmd __user *ucmd)
{
	struct nvme_passthru_cmd cmd;
	struct nvme_command c;
	unsigned timeout = 0;
	int status;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
		return -EFAULT;
	if (cmd.flags)
		return -EINVAL;

	memset(&c, 0, sizeof(c));
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	c.common.nsid = cpu_to_le32(cmd.nsid);
	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);

	if (cmd.timeout_ms)
		timeout = msecs_to_jiffies(cmd.timeout_ms);

	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
			(void __user *)(uintptr_t)cmd.addr, cmd.data_len,
			&cmd.result, timeout);
	if (status >= 0) {
		if (put_user(cmd.result, &ucmd->result))
			return -EFAULT;
	}

	return status;
}

static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
		unsigned int cmd, unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	switch (cmd) {
	case NVME_IOCTL_ID:
		force_successful_syscall_return();
		return ns->ns_id;
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
	case NVME_IOCTL_IO_CMD:
		return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
	case NVME_IOCTL_SUBMIT_IO:
		return nvme_submit_io(ns, (void __user *)arg);
	default:
#ifdef CONFIG_NVM
		if (ns->ndev)
			return nvme_nvm_ioctl(ns, cmd, arg);
#endif
		if (is_sed_ioctl(cmd))
			return sed_ioctl(ns->ctrl->opal_dev, cmd,
					 (void __user *) arg);
		return -ENOTTY;
	}
}

#ifdef CONFIG_COMPAT
static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	return nvme_ioctl(bdev, mode, cmd, arg);
}
#else
#define nvme_compat_ioctl	NULL
#endif

static int nvme_open(struct block_device *bdev, fmode_t mode)
{
	return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO;
}

static void nvme_release(struct gendisk *disk, fmode_t mode)
{
	struct nvme_ns *ns = disk->private_data;

	module_put(ns->ctrl->ops->module);
	nvme_put_ns(ns);
}

static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	/* some standard values */
	geo->heads = 1 << 6;
	geo->sectors = 1 << 5;
	geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
	return 0;
}

#ifdef CONFIG_BLK_DEV_INTEGRITY
static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
		u16 bs)
{
	struct nvme_ns *ns = disk->private_data;
	u16 old_ms = ns->ms;
	u8 pi_type = 0;

	ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
	ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);

	/* PI implementation requires metadata equal to the T10 PI tuple size */
	if (ns->ms == sizeof(struct t10_pi_tuple))
		pi_type = id->dps & NVME_NS_DPS_PI_MASK;

	if (blk_get_integrity(disk) &&
	    (ns->pi_type != pi_type || ns->ms != old_ms ||
	     bs != queue_logical_block_size(disk->queue) ||
	     (ns->ms && ns->ext)))
		blk_integrity_unregister(disk);

	ns->pi_type = pi_type;
}

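/*
 * Register a blk_integrity profile matching the namespace's protection
 * information type so the block layer can generate and verify T10 PI
 * metadata for it.
 */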
static void nvme_init_integrity(struct nvme_ns *ns)
{
	struct blk_integrity integrity;

	memset(&integrity, 0, sizeof(integrity));
	switch (ns->pi_type) {
	case NVME_NS_DPS_PI_TYPE3:
		integrity.profile = &t10_pi_type3_crc;
		integrity.tag_size = sizeof(u16) + sizeof(u32);
		integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
		break;
	case NVME_NS_DPS_PI_TYPE1:
	case NVME_NS_DPS_PI_TYPE2:
		integrity.profile = &t10_pi_type1_crc;
		integrity.tag_size = sizeof(u16);
		integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
		break;
	default:
		integrity.profile = NULL;
		break;
	}
	integrity.tuple_size = ns->ms;
	blk_integrity_register(ns->disk, &integrity);
	blk_queue_max_integrity_segments(ns->queue, 1);
}
#else
static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
		u16 bs)
{
}
static void nvme_init_integrity(struct nvme_ns *ns)
{
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */

static void nvme_set_chunk_size(struct nvme_ns *ns)
{
	u32 chunk_size = (((u32)ns->noiob) << (ns->lba_shift - 9));
	blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
}

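/*
 * Set up the discard limits: granularity and alignment follow the stream
 * write size when streams are in use, otherwise the logical block size.
 */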
static void nvme_config_discard(struct nvme_ns *ns)
{
	struct nvme_ctrl *ctrl = ns->ctrl;
	u32 logical_block_size = queue_logical_block_size(ns->queue);

	BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
			NVME_DSM_MAX_RANGES);

	if (ctrl->nr_streams && ns->sws && ns->sgs) {
		unsigned int sz = logical_block_size * ns->sws * ns->sgs;

		ns->queue->limits.discard_alignment = sz;
		ns->queue->limits.discard_granularity = sz;
	} else {
		ns->queue->limits.discard_alignment = logical_block_size;
		ns->queue->limits.discard_granularity = logical_block_size;
	}
	blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
	blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);

	if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
		blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX);
}

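/*
 * Re-read the Identify Namespace data and refresh the namespace's unique
 * identifiers (EUI-64, NGUID, UUID) according to the controller's NVMe
 * version.
 */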
static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id)
{
	if (nvme_identify_ns(ns->ctrl, ns->ns_id, id)) {
		dev_warn(ns->ctrl->dev, "%s: Identify failure\n", __func__);
		return -ENODEV;
	}

	if ((*id)->ncap == 0) {
		kfree(*id);
		return -ENODEV;
	}

	if (ns->ctrl->vs >= NVME_VS(1, 1, 0))
		memcpy(ns->eui, (*id)->eui64, sizeof(ns->eui));
	if (ns->ctrl->vs >= NVME_VS(1, 2, 0))
		memcpy(ns->nguid, (*id)->nguid, sizeof(ns->nguid));
	if (ns->ctrl->vs >= NVME_VS(1, 3, 0)) {
		 /* Don't treat an error as fatal, since we potentially
		  * already have an NGUID or EUI-64
		  */
		if (nvme_identify_ns_descs(ns, ns->ns_id))
			dev_warn(ns->ctrl->device,
				 "%s: Identify Descriptors failed\n", __func__);
	}