whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit ffb8e45cf33e14d9a565491aec7abe039bebcfce
parent 7376e39ad96583545faefa2e7798bcb6a2a212a7
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Fri, 29 Mar 2019 14:43:07 -0700

Merge tag 'for-linus-20190329' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:
 "Small set of fixes that should go into this series. This contains:

   - compat signal mask fix for io_uring (Arnd)

   - EAGAIN corner case for direct vs buffered writes for io_uring
     (Roman)

   - NVMe pull request from Christoph with various little fixes

   - sbitmap ws_active fix, which caused a perf regression for shared
     tags (me)

   - sbitmap bit ordering fix (Ming)

   - libata on-stack DMA fix (Raymond)"

* tag 'for-linus-20190329' of git://git.kernel.dk/linux-block:
  nvmet: fix error flow during ns enable
  nvmet: fix building bvec from sg list
  nvme-multipath: relax ANA state check
  nvme-tcp: fix an endianess miss-annotation
  libata: fix using DMA buffers on stack
  io_uring: offload write to async worker in case of -EAGAIN
  sbitmap: order READ/WRITE freed instance and setting clear bit
  blk-mq: fix sbitmap ws_active for shared tags
  io_uring: fix big-endian compat signal mask handling
  blk-mq: update comment for blk_mq_hctx_has_pending()
  blk-mq: use blk_mq_put_driver_tag() to put tag

Diffstat:
Mblock/blk-flush.c | 4++--
Mblock/blk-mq.c | 16+++++++++++++---
Mblock/blk-mq.h | 9---------
Mdrivers/ata/libata-zpodd.c | 34++++++++++++++++++++++++----------
Mdrivers/nvme/host/multipath.c | 5+----
Mdrivers/nvme/host/tcp.c | 2+-
Mdrivers/nvme/target/core.c | 4++--
Mdrivers/nvme/target/io-cmd-file.c | 20++++++++++----------
Mfs/io_uring.c | 26++++++++++++++++++++++++--
Mlib/sbitmap.c | 11+++++++++++
10 files changed, 88 insertions(+), 43 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c @@ -220,7 +220,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); flush_rq->tag = -1; } else { - blk_mq_put_driver_tag_hctx(hctx, flush_rq); + blk_mq_put_driver_tag(flush_rq); flush_rq->internal_tag = -1; } @@ -324,7 +324,7 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) if (q->elevator) { WARN_ON(rq->tag < 0); - blk_mq_put_driver_tag_hctx(hctx, rq); + blk_mq_put_driver_tag(rq); } /* diff --git a/block/blk-mq.c b/block/blk-mq.c @@ -59,7 +59,8 @@ static int blk_mq_poll_stats_bkt(const struct request *rq) } /* - * Check if any of the ctx's have pending work in this hardware queue + * Check if any of the ctx, dispatch list or elevator + * have pending work in this hardware queue. */ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { @@ -1071,7 +1072,13 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); spin_lock(&hctx->dispatch_wait_lock); - list_del_init(&wait->entry); + if (!list_empty(&wait->entry)) { + struct sbitmap_queue *sbq; + + list_del_init(&wait->entry); + sbq = &hctx->tags->bitmap_tags; + atomic_dec(&sbq->ws_active); + } spin_unlock(&hctx->dispatch_wait_lock); blk_mq_run_hw_queue(hctx, true); @@ -1087,6 +1094,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { + struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags; struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; @@ -1109,7 +1117,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, if (!list_empty_careful(&wait->entry)) return false; - wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait; + wq = &bt_wait_ptr(sbq, hctx)->wait; spin_lock_irq(&wq->lock); spin_lock(&hctx->dispatch_wait_lock); @@ -1119,6 +1127,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, return false; } + atomic_inc(&sbq->ws_active); wait->flags &= ~WQ_FLAG_EXCLUSIVE; __add_wait_queue(wq, wait); @@ -1139,6 +1148,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, * someone else gets the wakeup. */ list_del_init(&wait->entry); + atomic_dec(&sbq->ws_active); spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); diff --git a/block/blk-mq.h b/block/blk-mq.h @@ -224,15 +224,6 @@ static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, } } -static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, - struct request *rq) -{ - if (rq->tag == -1 || rq->internal_tag == -1) - return; - - __blk_mq_put_driver_tag(hctx, rq); -} - static inline void blk_mq_put_driver_tag(struct request *rq) { if (rq->tag == -1 || rq->internal_tag == -1) diff --git a/drivers/ata/libata-zpodd.c b/drivers/ata/libata-zpodd.c @@ -52,38 +52,52 @@ static int eject_tray(struct ata_device *dev) /* Per the spec, only slot type and drawer type ODD can be supported */ static enum odd_mech_type zpodd_get_mech_type(struct ata_device *dev) { - char buf[16]; + char *buf; unsigned int ret; - struct rm_feature_desc *desc = (void *)(buf + 8); + struct rm_feature_desc *desc; struct ata_taskfile tf; static const char cdb[] = { GPCMD_GET_CONFIGURATION, 2, /* only 1 feature descriptor requested */ 0, 3, /* 3, removable medium feature */ 0, 0, 0,/* reserved */ - 0, sizeof(buf), + 0, 16, 0, 0, 0, }; + buf = kzalloc(16, GFP_KERNEL); + if (!buf) + return ODD_MECH_TYPE_UNSUPPORTED; + desc = (void *)(buf + 8); + ata_tf_init(dev, &tf); tf.flags = ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE; tf.command = ATA_CMD_PACKET; tf.protocol = ATAPI_PROT_PIO; - tf.lbam = sizeof(buf); + tf.lbam = 16; ret = ata_exec_internal(dev, &tf, cdb, DMA_FROM_DEVICE, - buf, sizeof(buf), 0); - if (ret) + buf, 16, 0); + if (ret) { + kfree(buf); return ODD_MECH_TYPE_UNSUPPORTED; + } - if (be16_to_cpu(desc->feature_code) != 3) + if (be16_to_cpu(desc->feature_code) != 3) { + kfree(buf); return ODD_MECH_TYPE_UNSUPPORTED; + } - if (desc->mech_type == 0 && desc->load == 0 && desc->eject == 1) + if (desc->mech_type == 0 && desc->load == 0 && desc->eject == 1) { + kfree(buf); return ODD_MECH_TYPE_SLOT; - else if (desc->mech_type == 1 && desc->load == 0 && desc->eject == 1) + } else if (desc->mech_type == 1 && desc->load == 0 && + desc->eject == 1) { + kfree(buf); return ODD_MECH_TYPE_DRAWER; - else + } else { + kfree(buf); return ODD_MECH_TYPE_UNSUPPORTED; + } } /* Test if ODD is zero power ready by sense code */ diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c @@ -404,15 +404,12 @@ static inline bool nvme_state_is_live(enum nvme_ana_state state) static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc, struct nvme_ns *ns) { - enum nvme_ana_state old; - mutex_lock(&ns->head->lock); - old = ns->ana_state; ns->ana_grpid = le32_to_cpu(desc->grpid); ns->ana_state = desc->state; clear_bit(NVME_NS_ANA_PENDING, &ns->flags); - if (nvme_state_is_live(ns->ana_state) && !nvme_state_is_live(old)) + if (nvme_state_is_live(ns->ana_state)) nvme_mpath_set_live(ns); mutex_unlock(&ns->head->lock); } diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c @@ -627,7 +627,7 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, return ret; } -static inline void nvme_tcp_end_request(struct request *rq, __le16 status) +static inline void nvme_tcp_end_request(struct request *rq, u16 status) { union nvme_result res = {}; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c @@ -509,7 +509,7 @@ int nvmet_ns_enable(struct nvmet_ns *ns) ret = nvmet_p2pmem_ns_enable(ns); if (ret) - goto out_unlock; + goto out_dev_disable; list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) nvmet_p2pmem_ns_add_p2p(ctrl, ns); @@ -550,7 +550,7 @@ out_unlock: out_dev_put: list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid)); - +out_dev_disable: nvmet_ns_dev_disable(ns); goto out_unlock; } diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c @@ -75,11 +75,11 @@ err: return ret; } -static void nvmet_file_init_bvec(struct bio_vec *bv, struct sg_page_iter *iter) +static void nvmet_file_init_bvec(struct bio_vec *bv, struct scatterlist *sg) { - bv->bv_page = sg_page_iter_page(iter); - bv->bv_offset = iter->sg->offset; - bv->bv_len = PAGE_SIZE - iter->sg->offset; + bv->bv_page = sg_page(sg); + bv->bv_offset = sg->offset; + bv->bv_len = sg->length; } static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, @@ -128,14 +128,14 @@ static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2) static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags) { - ssize_t nr_bvec = DIV_ROUND_UP(req->data_len, PAGE_SIZE); - struct sg_page_iter sg_pg_iter; + ssize_t nr_bvec = req->sg_cnt; unsigned long bv_cnt = 0; bool is_sync = false; size_t len = 0, total_len = 0; ssize_t ret = 0; loff_t pos; - + int i; + struct scatterlist *sg; if (req->f.mpool_alloc && nr_bvec > NVMET_MAX_MPOOL_BVEC) is_sync = true; @@ -147,8 +147,8 @@ static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags) } memset(&req->f.iocb, 0, sizeof(struct kiocb)); - for_each_sg_page(req->sg, &sg_pg_iter, req->sg_cnt, 0) { - nvmet_file_init_bvec(&req->f.bvec[bv_cnt], &sg_pg_iter); + for_each_sg(req->sg, sg, req->sg_cnt, i) { + nvmet_file_init_bvec(&req->f.bvec[bv_cnt], sg); len += req->f.bvec[bv_cnt].bv_len; total_len += req->f.bvec[bv_cnt].bv_len; bv_cnt++; @@ -225,7 +225,7 @@ static void nvmet_file_submit_buffered_io(struct nvmet_req *req) static void nvmet_file_execute_rw(struct nvmet_req *req) { - ssize_t nr_bvec = DIV_ROUND_UP(req->data_len, PAGE_SIZE); + ssize_t nr_bvec = req->sg_cnt; if (!req->sg_cnt || !nr_bvec) { nvmet_req_complete(req, 0); diff --git a/fs/io_uring.c b/fs/io_uring.c @@ -1022,6 +1022,8 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count); if (!ret) { + ssize_t ret2; + /* * Open-code file_start_write here to grab freeze protection, * which will be released by another thread in @@ -1036,7 +1038,19 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, SB_FREEZE_WRITE); } kiocb->ki_flags |= IOCB_WRITE; - io_rw_done(kiocb, call_write_iter(file, kiocb, &iter)); + + ret2 = call_write_iter(file, kiocb, &iter); + if (!force_nonblock || ret2 != -EAGAIN) { + io_rw_done(kiocb, ret2); + } else { + /* + * If ->needs_lock is true, we're already in async + * context. + */ + if (!s->needs_lock) + io_async_list_note(WRITE, req, iov_count); + ret = -EAGAIN; + } } out_free: kfree(iovec); @@ -1968,7 +1982,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return 0; if (sig) { - ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz); +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, + &ksigmask, &sigsaved, sigsz); + else +#endif + ret = set_user_sigmask(sig, &ksigmask, + &sigsaved, sigsz); + if (ret) return ret; } diff --git a/lib/sbitmap.c b/lib/sbitmap.c @@ -591,6 +591,17 @@ EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up); void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, unsigned int cpu) { + /* + * Once the clear bit is set, the bit may be allocated out. + * + * Orders READ/WRITE on the asssociated instance(such as request + * of blk_mq) by this bit for avoiding race with re-allocation, + * and its pair is the memory barrier implied in __sbitmap_get_word. + * + * One invariant is that the clear bit has to be zero when the bit + * is in use. + */ + smp_mb__before_atomic(); sbitmap_deferred_clear_bit(&sbq->sb, nr); /*