whiterose

linux unikernel
git clone https://git.ne02ptzero.me/git/whiterose

commit 11efae3506d882a8782bc89493a32e467defd6b9
parent 465c209db83e2cdaeb4a52f4e107a9fc636704db
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Sat, 16 Mar 2019 12:36:39 -0700

Merge tag 'for-5.1/block-post-20190315' of git://git.kernel.dk/linux-block

Pull more block layer changes from Jens Axboe:
 "This is a collection of both stragglers, and fixes that came in after
  I finalized the initial pull. This contains:

   - An MD pull request from Song, with a few minor fixes

   - Set of NVMe patches via Christoph

   - Pull request from Konrad, with a few fixes for xen/blkback

   - pblk IO calculation fix (Javier)

   - Segment calculation fix for pass-through (Ming)

   - Fallthrough annotation for blkcg (Mathieu)"
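
[Editor's note on the fallthrough annotation item above: with GCC's
-Wimplicit-fallthrough enabled, as the kernel does, any switch case that
drops into the next one must carry a marker comment, or the build warns.
A minimal standalone sketch of the pattern — hypothetical values, not the
blkcg code itself:

    #include <stdio.h>

    /* Build with: gcc -Wimplicit-fallthrough fallthrough.c */
    static int classify(int cmd)
    {
        int start = 0;
        int ret;

        switch (cmd) {
        case 1:
            start = 1;
            /* fall through */
        case 2:
            /* reached directly (start == 0) or via case 1 (start == 1) */
            ret = start ? 100 : 200;
            break;
        default:
            ret = -1;
            break;
        }
        return ret;
    }

    int main(void)
    {
        /* prints: 100 200 -1 */
        printf("%d %d %d\n", classify(1), classify(2), classify(3));
        return 0;
    }

Without the "/* fall through */" comment before "case 2:", the compiler
flags the implicit drop-through as a likely missing break.]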

* tag 'for-5.1/block-post-20190315' of git://git.kernel.dk/linux-block: (25 commits)
  blkcg: annotate implicit fall through
  nvme-tcp: support C2HData with SUCCESS flag
  nvmet: ignore EOPNOTSUPP for discard
  nvme: add proper write zeroes setup for the multipath device
  nvme: add proper discard setup for the multipath device
  nvme: remove nvme_ns_config_oncs
  nvme: disable Write Zeroes for qemu controllers
  nvmet-fc: bring Disconnect into compliance with FC-NVME spec
  nvmet-fc: fix issues with targetport assoc_list list walking
  nvme-fc: reject reconnect if io queue count is reduced to zero
  nvme-fc: fix numa_node when dev is null
  nvme-fc: use nr_phys_segments to determine existence of sgl
  nvme-loop: init nvmet_ctrl fatal_err_work when allocate
  nvme: update comment to make the code easier to read
  nvme: put ns_head ref if namespace fails allocation
  nvme-trace: fix cdw10 buffer overrun
  nvme: don't warn on block content change effects
  nvme: add get-feature to admin cmds tracer
  md: Fix failed allocation of md_register_thread
  It's wrong to add len to sector_nr in raid10 reshape twice
  ...
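
[Editor's note on "nvme: disable Write Zeroes for qemu controllers": the
fix adds a per-device quirk bit that vetoes a capability the controller
advertises. Below is a small standalone model of that gating logic; the
bit values mirror the kernel's headers, but the struct here is a
simplified stand-in, so treat it as a sketch rather than the driver code:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NVME_CTRL_ONCS_WRITE_ZEROES     (1 << 3)
    #define NVME_QUIRK_DISABLE_WRITE_ZEROES (1 << 9)

    struct ctrl {
        uint32_t oncs;   /* optional command support, from Identify */
        uint32_t quirks; /* per-device workaround flags */
    };

    /* True when Write Zeroes may be advertised to the block layer. */
    static bool write_zeroes_usable(const struct ctrl *c)
    {
        if (!(c->oncs & NVME_CTRL_ONCS_WRITE_ZEROES))
            return false; /* controller never claimed support */
        if (c->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)
            return false; /* claimed, but known broken (the qemu case) */
        return true;
    }

    int main(void)
    {
        struct ctrl qemu = {
            .oncs   = NVME_CTRL_ONCS_WRITE_ZEROES,
            .quirks = NVME_QUIRK_DISABLE_WRITE_ZEROES,
        };

        printf("qemu usable: %d\n", write_zeroes_usable(&qemu)); /* 0 */
        return 0;
    }

The nvme.h, core.c, and pci.c hunks in the diff below show the real
driver-side version of this check.]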

Diffstat:
M Documentation/admin-guide/md.rst   | 3 +++
M block/blk-merge.c                  | 15 ++++++++-------
M drivers/block/xen-blkback/xenbus.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++----------------------------------
M drivers/lightnvm/pblk-rl.c         | 7 ++++++-
M drivers/md/raid10.c                | 3 ++-
M drivers/md/raid5-log.h             | 1 +
M drivers/md/raid5-ppl.c             | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M drivers/md/raid5.c                 | 3 +++
M drivers/nvme/host/core.c           | 28 +++++++++++++---------------
M drivers/nvme/host/fc.c             | 36 +++++++++++++++++++++++++++++++-----
M drivers/nvme/host/nvme.h           | 5 +++++
M drivers/nvme/host/pci.c            | 3 ++-
M drivers/nvme/host/tcp.c            | 32 ++++++++++++++++++++++++++++----
M drivers/nvme/host/trace.c          | 14 ++++++++++++++
M drivers/nvme/host/trace.h          | 2 +-
M drivers/nvme/target/core.c         | 20 ++++++++++----------
M drivers/nvme/target/fc.c           | 42 ++++++------------------------------------
M drivers/nvme/target/io-cmd-bdev.c  | 8 ++++----
M drivers/nvme/target/io-cmd-file.c  | 2 +-
M kernel/trace/blktrace.c            | 1 +
20 files changed, 259 insertions(+), 128 deletions(-)

diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst
@@ -756,3 +756,6 @@ These currently include:
      The cache mode for raid5. raid5 could include an extra disk for
      caching. The mode can be "write-throuth" and "write-back". The
      default is "write-through".
+
+  ppl_write_hint
+     NVMe stream ID to be set for each PPL write request.
diff --git a/block/blk-merge.c b/block/blk-merge.c
@@ -180,7 +180,7 @@ static unsigned get_max_segment_size(struct request_queue *q,
  */
 static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
         unsigned *nsegs, unsigned *last_seg_size,
-        unsigned *front_seg_size, unsigned *sectors)
+        unsigned *front_seg_size, unsigned *sectors, unsigned max_segs)
 {
     unsigned len = bv->bv_len;
     unsigned total_len = 0;
@@ -190,7 +190,7 @@ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
      * Multi-page bvec may be too big to hold in one segment, so the
      * current bvec has to be splitted as multiple segments.
      */
-    while (len && new_nsegs + *nsegs < queue_max_segments(q)) {
+    while (len && new_nsegs + *nsegs < max_segs) {
         seg_size = get_max_segment_size(q, bv->bv_offset + total_len);
         seg_size = min(seg_size, len);
@@ -240,6 +240,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
     bool do_split = true;
     struct bio *new = NULL;
     const unsigned max_sectors = get_max_io_size(q, bio);
+    const unsigned max_segs = queue_max_segments(q);
 
     bio_for_each_bvec(bv, bio, iter) {
         /*
@@ -254,14 +255,14 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
              * Consider this a new segment if we're splitting in
              * the middle of this vector.
              */
-            if (nsegs < queue_max_segments(q) &&
+            if (nsegs < max_segs &&
                 sectors < max_sectors) {
                 /* split in the middle of bvec */
                 bv.bv_len = (max_sectors - sectors) << 9;
                 bvec_split_segs(q, &bv, &nsegs,
                         &seg_size,
                         &front_seg_size,
-                        &sectors);
+                        &sectors, max_segs);
             }
             goto split;
         }
@@ -283,7 +284,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
             continue;
         }
 new_segment:
-        if (nsegs == queue_max_segments(q))
+        if (nsegs == max_segs)
             goto split;
 
         bvprv = bv;
@@ -296,7 +297,7 @@ new_segment:
             if (nsegs == 1 && seg_size > front_seg_size)
                 front_seg_size = seg_size;
         } else if (bvec_split_segs(q, &bv, &nsegs, &seg_size,
-                    &front_seg_size, &sectors)) {
+                    &front_seg_size, &sectors, max_segs)) {
             goto split;
         }
     }
@@ -415,7 +416,7 @@ new_segment:
             bvprv = bv;
             prev = 1;
             bvec_split_segs(q, &bv, &nr_phys_segs, &seg_size,
-                    &front_seg_size, NULL);
+                    &front_seg_size, NULL, UINT_MAX);
         }
         bbio = bio;
     }
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
@@ -926,7 +926,7 @@ static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir)
     int err, i, j;
     struct xen_blkif *blkif = ring->blkif;
     struct xenbus_device *dev = blkif->be->dev;
-    unsigned int ring_page_order, nr_grefs, evtchn;
+    unsigned int nr_grefs, evtchn;
 
     err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u",
               &evtchn);
@@ -936,43 +936,42 @@ static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir)
         return err;
     }
 
-    err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
-              &ring_page_order);
-    if (err != 1) {
-        err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", &ring_ref[0]);
+    nr_grefs = blkif->nr_ring_pages;
+
+    if (unlikely(!nr_grefs)) {
+        WARN_ON(true);
+        return -EINVAL;
+    }
+
+    for (i = 0; i < nr_grefs; i++) {
+        char ring_ref_name[RINGREF_NAME_LEN];
+
+        snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
+        err = xenbus_scanf(XBT_NIL, dir, ring_ref_name,
+                   "%u", &ring_ref[i]);
+        if (err != 1) {
+            if (nr_grefs == 1)
+                break;
+
+            err = -EINVAL;
-            xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir);
+            xenbus_dev_fatal(dev, err, "reading %s/%s",
+                     dir, ring_ref_name);
             return err;
         }
-        nr_grefs = 1;
-    } else {
-        unsigned int i;
+    }
 
-        if (ring_page_order > xen_blkif_max_ring_order) {
+    if (err != 1) {
+        WARN_ON(nr_grefs != 1);
+
+        err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u",
+                   &ring_ref[0]);
+        if (err != 1) {
             err = -EINVAL;
-            xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d",
-                     dir, ring_page_order,
-                     xen_blkif_max_ring_order);
+            xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir);
             return err;
         }
-
-        nr_grefs = 1 << ring_page_order;
-        for (i = 0; i < nr_grefs; i++) {
-            char ring_ref_name[RINGREF_NAME_LEN];
-
-            snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
-            err = xenbus_scanf(XBT_NIL, dir, ring_ref_name,
-                       "%u", &ring_ref[i]);
-            if (err != 1) {
-                err = -EINVAL;
-                xenbus_dev_fatal(dev, err, "reading %s/%s",
-                         dir, ring_ref_name);
-                return err;
-            }
-        }
     }
-    blkif->nr_ring_pages = nr_grefs;
 
     for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) {
         req = kzalloc(sizeof(*req), GFP_KERNEL);
@@ -1023,6 +1022,7 @@ fail:
 static int connect_ring(struct backend_info *be)
 {
     struct xenbus_device *dev = be->dev;
+    struct xen_blkif *blkif = be->blkif;
     unsigned int pers_grants;
     char protocol[64] = "";
     int err, i;
@@ -1030,28 +1030,29 @@ static int connect_ring(struct backend_info *be)
     size_t xspathsize;
     const size_t xenstore_path_ext_size = 11; /* sufficient for "/queue-NNN" */
     unsigned int requested_num_queues = 0;
+    unsigned int ring_page_order;
 
     pr_debug("%s %s\n", __func__, dev->otherend);
 
-    be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
+    blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
     err = xenbus_scanf(XBT_NIL, dev->otherend, "protocol",
                "%63s", protocol);
     if (err <= 0)
         strcpy(protocol, "unspecified, assuming default");
     else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
-        be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+        blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
     else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
-        be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+        blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
     else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
-        be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+        blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
     else {
         xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
         return -ENOSYS;
     }
     pers_grants = xenbus_read_unsigned(dev->otherend, "feature-persistent",
                        0);
-    be->blkif->vbd.feature_gnt_persistent = pers_grants;
-    be->blkif->vbd.overflow_max_grants = 0;
+    blkif->vbd.feature_gnt_persistent = pers_grants;
+    blkif->vbd.overflow_max_grants = 0;
 
     /*
      * Read the number of hardware queues from frontend.
@@ -1067,16 +1068,30 @@ static int connect_ring(struct backend_info *be)
               requested_num_queues, xenblk_max_queues);
         return -ENOSYS;
     }
-    be->blkif->nr_rings = requested_num_queues;
-    if (xen_blkif_alloc_rings(be->blkif))
+    blkif->nr_rings = requested_num_queues;
+    if (xen_blkif_alloc_rings(blkif))
         return -ENOMEM;
 
     pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename,
-         be->blkif->nr_rings, be->blkif->blk_protocol, protocol,
+         blkif->nr_rings, blkif->blk_protocol, protocol,
          pers_grants ? "persistent grants" : "");
 
-    if (be->blkif->nr_rings == 1)
-        return read_per_ring_refs(&be->blkif->rings[0], dev->otherend);
+    ring_page_order = xenbus_read_unsigned(dev->otherend,
+                           "ring-page-order", 0);
+
+    if (ring_page_order > xen_blkif_max_ring_order) {
+        err = -EINVAL;
+        xenbus_dev_fatal(dev, err,
+                 "requested ring page order %d exceed max:%d",
+                 ring_page_order,
+                 xen_blkif_max_ring_order);
+        return err;
+    }
+
+    blkif->nr_ring_pages = 1 << ring_page_order;
+
+    if (blkif->nr_rings == 1)
+        return read_per_ring_refs(&blkif->rings[0], dev->otherend);
     else {
         xspathsize = strlen(dev->otherend) + xenstore_path_ext_size;
         xspath = kmalloc(xspathsize, GFP_KERNEL);
@@ -1085,10 +1100,10 @@ static int connect_ring(struct backend_info *be)
             return -ENOMEM;
         }
 
-        for (i = 0; i < be->blkif->nr_rings; i++) {
+        for (i = 0; i < blkif->nr_rings; i++) {
             memset(xspath, 0, xspathsize);
             snprintf(xspath, xspathsize, "%s/queue-%u", dev->otherend, i);
-            err = read_per_ring_refs(&be->blkif->rings[i], xspath);
+            err = read_per_ring_refs(&blkif->rings[i], xspath);
             if (err) {
                 kfree(xspath);
                 return err;
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c
@@ -233,10 +233,15 @@ void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold)
     /* To start with, all buffer is available to user I/O writers */
     rl->rb_budget = budget;
     rl->rb_user_max = budget;
-    rl->rb_max_io = threshold ? (budget - threshold) : (budget - 1);
     rl->rb_gc_max = 0;
     rl->rb_state = PBLK_RL_HIGH;
 
+    /* Maximize I/O size and ansure that back threshold is respected */
+    if (threshold)
+        rl->rb_max_io = budget - pblk->min_write_pgs_data - threshold;
+    else
+        rl->rb_max_io = budget - pblk->min_write_pgs_data - 1;
+
     atomic_set(&rl->rb_user_cnt, 0);
     atomic_set(&rl->rb_gc_cnt, 0);
     atomic_set(&rl->rb_space, -1);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
@@ -3939,6 +3939,8 @@ static int raid10_run(struct mddev *mddev)
         set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
         mddev->sync_thread = md_register_thread(md_do_sync, mddev,
                             "reshape");
+        if (!mddev->sync_thread)
+            goto out_free_conf;
     }
 
     return 0;
@@ -4670,7 +4672,6 @@ read_more:
     atomic_inc(&r10_bio->remaining);
     read_bio->bi_next = NULL;
     generic_make_request(read_bio);
-    sector_nr += nr_sectors;
     sectors_done += nr_sectors;
     if (sector_nr <= last)
         goto read_more;
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
@@ -45,6 +45,7 @@ extern void ppl_stripe_write_finished(struct stripe_head *sh);
 extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
 extern void ppl_quiesce(struct r5conf *conf, int quiesce);
 extern int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio);
+extern struct md_sysfs_entry ppl_write_hint;
 
 static inline bool raid5_has_log(struct r5conf *conf)
 {
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
@@ -20,6 +20,7 @@
 #include <linux/raid/md_p.h>
 #include "md.h"
 #include "raid5.h"
+#include "raid5-log.h"
 
 /*
  * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for
@@ -115,6 +116,8 @@ struct ppl_conf {
     /* stripes to retry if failed to allocate io_unit */
     struct list_head no_mem_stripes;
     spinlock_t no_mem_stripes_lock;
+
+    unsigned short write_hint;
 };
 
 struct ppl_log {
@@ -474,6 +477,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
     bio_set_dev(bio, log->rdev->bdev);
     bio->bi_iter.bi_sector = log->next_io_sector;
     bio_add_page(bio, io->header_page, PAGE_SIZE, 0);
+    bio->bi_write_hint = ppl_conf->write_hint;
 
     pr_debug("%s: log->current_io_sector: %llu\n", __func__,
         (unsigned long long)log->next_io_sector);
@@ -503,6 +507,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
             bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
                            &ppl_conf->bs);
             bio->bi_opf = prev->bi_opf;
+            bio->bi_write_hint = prev->bi_write_hint;
             bio_copy_dev(bio, prev);
             bio->bi_iter.bi_sector = bio_end_sector(prev);
             bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);
@@ -1407,6 +1412,7 @@ int ppl_init_log(struct r5conf *conf)
     atomic64_set(&ppl_conf->seq, 0);
     INIT_LIST_HEAD(&ppl_conf->no_mem_stripes);
     spin_lock_init(&ppl_conf->no_mem_stripes_lock);
+    ppl_conf->write_hint = RWF_WRITE_LIFE_NOT_SET;
 
     if (!mddev->external) {
         ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
@@ -1501,3 +1507,60 @@ int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add)
 
     return ret;
 }
+
+static ssize_t
+ppl_write_hint_show(struct mddev *mddev, char *buf)
+{
+    size_t ret = 0;
+    struct r5conf *conf;
+    struct ppl_conf *ppl_conf = NULL;
+
+    spin_lock(&mddev->lock);
+    conf = mddev->private;
+    if (conf && raid5_has_ppl(conf))
+        ppl_conf = conf->log_private;
+    ret = sprintf(buf, "%d\n", ppl_conf ? ppl_conf->write_hint : 0);
+    spin_unlock(&mddev->lock);
+
+    return ret;
+}
+
+static ssize_t
+ppl_write_hint_store(struct mddev *mddev, const char *page, size_t len)
+{
+    struct r5conf *conf;
+    struct ppl_conf *ppl_conf;
+    int err = 0;
+    unsigned short new;
+
+    if (len >= PAGE_SIZE)
+        return -EINVAL;
+    if (kstrtou16(page, 10, &new))
+        return -EINVAL;
+
+    err = mddev_lock(mddev);
+    if (err)
+        return err;
+
+    conf = mddev->private;
+    if (!conf) {
+        err = -ENODEV;
+    } else if (raid5_has_ppl(conf)) {
+        ppl_conf = conf->log_private;
+        if (!ppl_conf)
+            err = -EINVAL;
+        else
+            ppl_conf->write_hint = new;
+    } else {
+        err = -EINVAL;
+    }
+
+    mddev_unlock(mddev);
+
+    return err ?: len;
+}
+
+struct md_sysfs_entry
+ppl_write_hint = __ATTR(ppl_write_hint, S_IRUGO | S_IWUSR,
+            ppl_write_hint_show,
+            ppl_write_hint_store);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
@@ -6650,6 +6650,7 @@ static struct attribute *raid5_attrs[] = {
     &raid5_skip_copy.attr,
     &raid5_rmw_level.attr,
     &r5c_journal_mode.attr,
+    &ppl_write_hint.attr,
     NULL,
 };
 static struct attribute_group raid5_attrs_group = {
@@ -7393,6 +7394,8 @@ static int raid5_run(struct mddev *mddev)
         set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
         mddev->sync_thread = md_register_thread(md_do_sync, mddev,
                             "reshape");
+        if (!mddev->sync_thread)
+            goto abort;
     }
 
     /* Ok, everything is just fine now */
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
@@ -179,8 +179,8 @@ static int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
     int ret = 0;
 
     /*
-     * Keep a reference until the work is flushed since ->delete_ctrl
-     * can free the controller.
+     * Keep a reference until nvme_do_delete_ctrl() complete,
+     * since ->delete_ctrl can free the controller.
      */
     nvme_get_ctrl(ctrl);
     if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
@@ -1250,7 +1250,7 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
     if (ns) {
         if (ctrl->effects)
             effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
-        if (effects & ~NVME_CMD_EFFECTS_CSUPP)
+        if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
             dev_warn(ctrl->device,
                  "IO command:%02x has unhandled effects:%08x\n",
                  opcode, effects);
@@ -1495,10 +1495,10 @@ static void nvme_set_chunk_size(struct nvme_ns *ns)
     blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
 }
 
-static void nvme_config_discard(struct nvme_ns *ns)
+static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
 {
     struct nvme_ctrl *ctrl = ns->ctrl;
-    struct request_queue *queue = ns->queue;
+    struct request_queue *queue = disk->queue;
     u32 size = queue_logical_block_size(queue);
 
     if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) {
@@ -1526,12 +1526,13 @@ static void nvme_config_discard(struct nvme_ns *ns)
         blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
 }
 
-static inline void nvme_config_write_zeroes(struct nvme_ns *ns)
+static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
 {
     u32 max_sectors;
     unsigned short bs = 1 << ns->lba_shift;
 
-    if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES))
+    if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) ||
+        (ns->ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
         return;
     /*
      * Even though NVMe spec explicitly states that MDTS is not
@@ -1548,13 +1549,7 @@ static inline void nvme_config_write_zeroes(struct nvme_ns *ns)
     else
         max_sectors = ((u32)(ns->ctrl->max_hw_sectors + 1) * bs) >> 9;
 
-    blk_queue_max_write_zeroes_sectors(ns->queue, max_sectors);
-}
-
-static inline void nvme_ns_config_oncs(struct nvme_ns *ns)
-{
-    nvme_config_discard(ns);
-    nvme_config_write_zeroes(ns);
+    blk_queue_max_write_zeroes_sectors(disk->queue, max_sectors);
 }
 
 static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
@@ -1610,7 +1605,9 @@ static void nvme_update_disk_info(struct gendisk *disk,
         capacity = 0;
 
     set_capacity(disk, capacity);
-    nvme_ns_config_oncs(ns);
+
+    nvme_config_discard(disk, ns);
+    nvme_config_write_zeroes(disk, ns);
 
     if (id->nsattr & (1 << 0))
         set_disk_ro(disk, true);
@@ -3304,6 +3301,7 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
     mutex_lock(&ctrl->subsys->lock);
     list_del_rcu(&ns->siblings);
     mutex_unlock(&ctrl->subsys->lock);
+    nvme_put_ns_head(ns->head);
 out_free_id:
     kfree(id);
 out_free_queue:
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
@@ -2107,7 +2107,7 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
 
     freq->sg_cnt = 0;
 
-    if (!blk_rq_payload_bytes(rq))
+    if (!blk_rq_nr_phys_segments(rq))
         return 0;
 
     freq->sg_table.sgl = freq->first_sgl;
@@ -2304,12 +2304,23 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
     if (ret)
         return ret;
 
-    data_len = blk_rq_payload_bytes(rq);
-    if (data_len)
+    /*
+     * nvme core doesn't quite treat the rq opaquely. Commands such
+     * as WRITE ZEROES will return a non-zero rq payload_bytes yet
+     * there is no actual payload to be transferred.
+     * To get it right, key data transmission on there being 1 or
+     * more physical segments in the sg list. If there is no
+     * physical segments, there is no payload.
+     */
+    if (blk_rq_nr_phys_segments(rq)) {
+        data_len = blk_rq_payload_bytes(rq);
         io_dir = ((rq_data_dir(rq) == WRITE) ?
                     NVMEFC_FCP_WRITE : NVMEFC_FCP_READ);
-    else
+    } else {
+        data_len = 0;
         io_dir = NVMEFC_FCP_NODATA;
+    }
+
     return nvme_fc_start_fcp_op(ctrl, queue, op, data_len, io_dir);
 }
 
@@ -2464,6 +2475,7 @@ static int
 nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl)
 {
     struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
+    u32 prior_ioq_cnt = ctrl->ctrl.queue_count - 1;
     unsigned int nr_io_queues;
     int ret;
 
@@ -2476,6 +2488,13 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl)
         return ret;
     }
 
+    if (!nr_io_queues && prior_ioq_cnt) {
+        dev_info(ctrl->ctrl.device,
+            "Fail Reconnect: At least 1 io queue "
+            "required (was %d)\n", prior_ioq_cnt);
+        return -ENOSPC;
+    }
+
     ctrl->ctrl.queue_count = nr_io_queues + 1;
     /* check for io queues existing */
     if (ctrl->ctrl.queue_count == 1)
@@ -2489,6 +2508,10 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl)
     if (ret)
         goto out_delete_hw_queues;
 
+    if (prior_ioq_cnt != nr_io_queues)
+        dev_info(ctrl->ctrl.device,
+            "reconnect: revising io queue count from %d to %d\n",
+            prior_ioq_cnt, nr_io_queues);
     blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues);
 
     return 0;
@@ -3006,7 +3029,10 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 
     ctrl->ctrl.opts = opts;
     ctrl->ctrl.nr_reconnects = 0;
-    ctrl->ctrl.numa_node = dev_to_node(lport->dev);
+    if (lport->dev)
+        ctrl->ctrl.numa_node = dev_to_node(lport->dev);
+    else
+        ctrl->ctrl.numa_node = NUMA_NO_NODE;
     INIT_LIST_HEAD(&ctrl->ctrl_list);
     ctrl->lport = lport;
     ctrl->rport = rport;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
@@ -87,6 +87,11 @@ enum nvme_quirks {
      * Ignore device provided subnqn.
      */
     NVME_QUIRK_IGNORE_DEV_SUBNQN        = (1 << 8),
+
+    /*
+     * Broken Write Zeroes.
+     */
+    NVME_QUIRK_DISABLE_WRITE_ZEROES        = (1 << 9),
 };
 
 /*
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
@@ -2937,7 +2937,8 @@ static const struct pci_device_id nvme_id_table[] = {
     { PCI_VDEVICE(INTEL, 0xf1a6),    /* Intel 760p/Pro 7600p */
         .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
     { PCI_VDEVICE(INTEL, 0x5845),    /* Qemu emulated controller */
-        .driver_data = NVME_QUIRK_IDENTIFY_CNS, },
+        .driver_data = NVME_QUIRK_IDENTIFY_CNS |
+                NVME_QUIRK_DISABLE_WRITE_ZEROES, },
     { PCI_DEVICE(0x1bb1, 0x0100),   /* Seagate Nytro Flash Storage */
         .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
     { PCI_DEVICE(0x1c58, 0x0003),    /* HGST adapter */
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
@@ -463,6 +463,15 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
 
     queue->data_remaining = le32_to_cpu(pdu->data_length);
 
+    if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
+        unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
+        dev_err(queue->ctrl->ctrl.device,
+            "queue %d tag %#x SUCCESS set but not last PDU\n",
+            nvme_tcp_queue_id(queue), rq->tag);
+        nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+        return -EPROTO;
+    }
+
     return 0;
 }
 
@@ -618,6 +627,14 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
     return ret;
 }
 
+static inline void nvme_tcp_end_request(struct request *rq, __le16 status)
+{
+    union nvme_result res = {};
+
+    nvme_end_request(rq, cpu_to_le16(status << 1), res);
+}
+
+
 static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
                   unsigned int *offset, size_t *len)
 {
@@ -685,6 +702,8 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
             nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
             queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
         } else {
+            if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS)
+                nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
             nvme_tcp_init_recv_ctx(queue);
         }
     }
@@ -695,6 +714,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
         struct sk_buff *skb, unsigned int *offset, size_t *len)
 {
+    struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
     char *ddgst = (char *)&queue->recv_ddgst;
     size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
     off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
@@ -718,6 +738,13 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
         return -EIO;
     }
 
+    if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
+        struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue),
+                        pdu->command_id);
+
+        nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
+    }
+
     nvme_tcp_init_recv_ctx(queue);
     return 0;
 }
@@ -815,10 +842,7 @@ static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
 
 static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
 {
-    union nvme_result res = {};
-
-    nvme_end_request(blk_mq_rq_from_pdu(req),
-        cpu_to_le16(NVME_SC_DATA_XFER_ERROR), res);
+    nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_DATA_XFER_ERROR);
 }
 
 static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
@@ -50,7 +50,19 @@ static const char *nvme_trace_admin_identify(struct trace_seq *p, u8 *cdw10)
     return ret;
 }
 
+static const char *nvme_trace_admin_get_features(struct trace_seq *p,
+                         u8 *cdw10)
+{
+    const char *ret = trace_seq_buffer_ptr(p);
+    u8 fid = cdw10[0];
+    u8 sel = cdw10[1] & 0x7;
+    u32 cdw11 = get_unaligned_le32(cdw10 + 4);
+
+    trace_seq_printf(p, "fid=0x%x sel=0x%x cdw11=0x%x", fid, sel, cdw11);
+    trace_seq_putc(p, 0);
+    return ret;
+}
 
 static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10)
 {
@@ -101,6 +113,8 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p,
         return nvme_trace_create_cq(p, cdw10);
     case nvme_admin_identify:
         return nvme_trace_admin_identify(p, cdw10);
+    case nvme_admin_get_features:
+        return nvme_trace_admin_get_features(p, cdw10);
     default:
         return nvme_trace_common(p, cdw10);
     }
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
@@ -108,7 +108,7 @@ TRACE_EVENT(nvme_setup_cmd,
         __entry->metadata = le64_to_cpu(cmd->common.metadata);
         __assign_disk_name(__entry->disk, req->rq_disk);
         memcpy(__entry->cdw10, &cmd->common.cdw10,
-               6 * sizeof(__entry->cdw10));
+               sizeof(__entry->cdw10));
         ),
         TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
           __entry->ctrl_id, __print_disk_name(__entry->disk),
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
@@ -1163,6 +1163,15 @@ static void nvmet_release_p2p_ns_map(struct nvmet_ctrl *ctrl)
     put_device(ctrl->p2p_client);
 }
 
+static void nvmet_fatal_error_handler(struct work_struct *work)
+{
+    struct nvmet_ctrl *ctrl =
+            container_of(work, struct nvmet_ctrl, fatal_err_work);
+
+    pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid);
+    ctrl->ops->delete_ctrl(ctrl);
+}
+
 u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
         struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp)
 {
@@ -1205,6 +1214,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
     INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
     INIT_LIST_HEAD(&ctrl->async_events);
     INIT_RADIX_TREE(&ctrl->p2p_ns_map, GFP_KERNEL);
+    INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler);
 
     memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE);
     memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE);
@@ -1308,21 +1318,11 @@ void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
     kref_put(&ctrl->ref, nvmet_ctrl_free);
 }
 
-static void nvmet_fatal_error_handler(struct work_struct *work)
-{
-    struct nvmet_ctrl *ctrl =
-            container_of(work, struct nvmet_ctrl, fatal_err_work);
-
-    pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid);
-    ctrl->ops->delete_ctrl(ctrl);
-}
-
 void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl)
 {
     mutex_lock(&ctrl->lock);
     if (!(ctrl->csts & NVME_CSTS_CFS)) {
         ctrl->csts |= NVME_CSTS_CFS;
-        INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler);
         schedule_work(&ctrl->fatal_err_work);
     }
     mutex_unlock(&ctrl->lock);
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
@@ -1143,10 +1143,8 @@ __nvmet_fc_free_assocs(struct nvmet_fc_tgtport *tgtport)
             &tgtport->assoc_list, a_list) {
         if (!nvmet_fc_tgt_a_get(assoc))
             continue;
-        spin_unlock_irqrestore(&tgtport->lock, flags);
-        nvmet_fc_delete_target_assoc(assoc);
-        nvmet_fc_tgt_a_put(assoc);
-        spin_lock_irqsave(&tgtport->lock, flags);
+        if (!schedule_work(&assoc->del_work))
+            nvmet_fc_tgt_a_put(assoc);
     }
     spin_unlock_irqrestore(&tgtport->lock, flags);
 }
@@ -1185,7 +1183,8 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl)
     nvmet_fc_tgtport_put(tgtport);
 
     if (found_ctrl) {
-        schedule_work(&assoc->del_work);
+        if (!schedule_work(&assoc->del_work))
+            nvmet_fc_tgt_a_put(assoc);
         return;
     }
 
@@ -1503,10 +1502,8 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport,
             (struct fcnvme_ls_disconnect_rqst *)iod->rqstbuf;
     struct fcnvme_ls_disconnect_acc *acc =
             (struct fcnvme_ls_disconnect_acc *)iod->rspbuf;
-    struct nvmet_fc_tgt_queue *queue = NULL;
     struct nvmet_fc_tgt_assoc *assoc;
     int ret = 0;
-    bool del_assoc = false;
 
     memset(acc, 0, sizeof(*acc));
 
@@ -1537,18 +1534,7 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport,
         assoc = nvmet_fc_find_target_assoc(tgtport,
                 be64_to_cpu(rqst->associd.association_id));
         iod->assoc = assoc;
-        if (assoc) {
-            if (rqst->discon_cmd.scope ==
-                    FCNVME_DISCONN_CONNECTION) {
-                queue = nvmet_fc_find_target_queue(tgtport,
-                        be64_to_cpu(
-                            rqst->discon_cmd.id));
-                if (!queue) {
-                    nvmet_fc_tgt_a_put(assoc);
-                    ret = VERR_NO_CONN;
-                }
-            }
-        } else
+        if (!assoc)
             ret = VERR_NO_ASSOC;
     }
 
@@ -1576,26 +1562,10 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport,
             sizeof(struct fcnvme_ls_disconnect_acc)),
             FCNVME_LS_DISCONNECT);
 
-
-    /* are we to delete a Connection ID (queue) */
-    if (queue) {
-        int qid = queue->qid;
-
-        nvmet_fc_delete_target_queue(queue);
-
-        /* release the get taken by find_target_queue */
-        nvmet_fc_tgt_q_put(queue);
-
-        /* tear association down if io queue terminated */
-        if (!qid)
-            del_assoc = true;
-    }
-
     /* release get taken in nvmet_fc_find_target_assoc */
     nvmet_fc_tgt_a_put(iod->assoc);
 
-    if (del_assoc)
-        nvmet_fc_delete_target_assoc(iod->assoc);
+    nvmet_fc_delete_target_assoc(iod->assoc);
 }
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
@@ -194,11 +194,11 @@ static u16 nvmet_bdev_discard_range(struct nvmet_req *req,
             le64_to_cpu(range->slba) << (ns->blksize_shift - 9),
             le32_to_cpu(range->nlb) << (ns->blksize_shift - 9),
             GFP_KERNEL, 0, bio);
-
-    if (ret)
+    if (ret && ret != -EOPNOTSUPP) {
         req->error_slba = le64_to_cpu(range->slba);
-
-    return blk_to_nvme_status(req, errno_to_blk_status(ret));
+        return blk_to_nvme_status(req, errno_to_blk_status(ret));
+    }
+    return NVME_SC_SUCCESS;
 }
 
 static void nvmet_bdev_execute_discard(struct nvmet_req *req)
diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
@@ -297,7 +297,7 @@ static void nvmet_file_execute_discard(struct nvmet_req *req)
         }
 
         ret = vfs_fallocate(req->ns->file, mode, offset, len);
-        if (ret) {
+        if (ret && ret != -EOPNOTSUPP) {
             req->error_slba = le64_to_cpu(range.slba);
             status = errno_to_nvme_status(req, ret);
             break;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
@@ -723,6 +723,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 #endif
     case BLKTRACESTART:
         start = 1;
+        /* fall through */
     case BLKTRACESTOP:
         ret = __blk_trace_startstop(q, start);
         break;
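
[Editor's note on the two discard fixes above (io-cmd-bdev.c and
io-cmd-file.c): NVMe deallocate is advisory, so a backend that cannot
discard (-EOPNOTSUPP) should report success rather than fail the command.
A standalone userspace model of that mapping rule; the status constants
are borrowed from the kernel's NVMe headers as an assumption, and the
real code routes errors through errno_to_blk_status()/blk_to_nvme_status()
rather than this simplified helper:

    #include <errno.h>
    #include <stdio.h>

    #define NVME_SC_SUCCESS  0x0
    #define NVME_SC_INTERNAL 0x6

    static int discard_status(int ret)
    {
        if (ret && ret != -EOPNOTSUPP)
            return NVME_SC_INTERNAL; /* a real failure: surface it */
        return NVME_SC_SUCCESS;      /* done, or backend cannot discard */
    }

    int main(void)
    {
        printf("%x %x %x\n",
               discard_status(0),           /* 0 */
               discard_status(-EOPNOTSUPP), /* 0 */
               discard_status(-EIO));       /* 6 */
        return 0;
    }
]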