whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit b5b1de3537e2cd8f52971224a1be24bb3ce34a65
parent 90de1fb83e7c760aa403381f072486fc4e3e8b5f
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Thu,  1 Nov 2018 14:42:49 -0700

Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost

Pull virtio/vhost updates from Michael Tsirkin:
 "Fixes and tweaks:

   - virtio balloon page hinting support

   - vhost scsi control queue

   - misc fixes"

* tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost:
  MAINTAINERS: remove reference to bogus vsock file
  vhost/scsi: Use common handling code in request queue handler
  vhost/scsi: Extract common handling code from control queue handler
  vhost/scsi: Respond to control queue operations
  vhost/scsi: truncate T10 PI iov_iter to prot_bytes
  virtio-balloon: VIRTIO_BALLOON_F_PAGE_POISON
  mm/page_poison: expose page_poisoning_enabled to kernel modules
  virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT
  kvm_config: add CONFIG_VIRTIO_MENU

Diffstat:
MMAINTAINERS | 1-
Mdrivers/vhost/scsi.c | 426+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
Mdrivers/virtio/virtio_balloon.c | 374++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
Minclude/uapi/linux/virtio_balloon.h | 8++++++++
Mkernel/configs/kvm_guest.config | 1+
Mmm/page_poison.c | 6++++++
6 files changed, 685 insertions(+), 131 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS @@ -15858,7 +15858,6 @@ F: net/vmw_vsock/virtio_transport_common.c F: net/vmw_vsock/virtio_transport.c F: drivers/net/vsockmon.c F: drivers/vhost/vsock.c -F: drivers/vhost/vsock.h F: tools/testing/vsock/ VIRTIO CONSOLE DRIVER diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c @@ -203,6 +203,19 @@ struct vhost_scsi { int vs_events_nr; /* num of pending events, protected by vq->mutex */ }; +/* + * Context for processing request and control queue operations. + */ +struct vhost_scsi_ctx { + int head; + unsigned int out, in; + size_t req_size, rsp_size; + size_t out_size, in_size; + u8 *target, *lunp; + void *req; + struct iov_iter out_iter; +}; + static struct workqueue_struct *vhost_scsi_workqueue; /* Global spinlock to protect vhost_scsi TPG list for vhost IOCTL access */ @@ -800,24 +813,120 @@ vhost_scsi_send_bad_target(struct vhost_scsi *vs, pr_err("Faulted on virtio_scsi_cmd_resp\n"); } +static int +vhost_scsi_get_desc(struct vhost_scsi *vs, struct vhost_virtqueue *vq, + struct vhost_scsi_ctx *vc) +{ + int ret = -ENXIO; + + vc->head = vhost_get_vq_desc(vq, vq->iov, + ARRAY_SIZE(vq->iov), &vc->out, &vc->in, + NULL, NULL); + + pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n", + vc->head, vc->out, vc->in); + + /* On error, stop handling until the next kick. */ + if (unlikely(vc->head < 0)) + goto done; + + /* Nothing new? Wait for eventfd to tell us they refilled. */ + if (vc->head == vq->num) { + if (unlikely(vhost_enable_notify(&vs->dev, vq))) { + vhost_disable_notify(&vs->dev, vq); + ret = -EAGAIN; + } + goto done; + } + + /* + * Get the size of request and response buffers. + * FIXME: Not correct for BIDI operation + */ + vc->out_size = iov_length(vq->iov, vc->out); + vc->in_size = iov_length(&vq->iov[vc->out], vc->in); + + /* + * Copy over the virtio-scsi request header, which for a + * ANY_LAYOUT enabled guest may span multiple iovecs, or a + * single iovec may contain both the header + outgoing + * WRITE payloads. + * + * copy_from_iter() will advance out_iter, so that it will + * point at the start of the outgoing WRITE payload, if + * DMA_TO_DEVICE is set. + */ + iov_iter_init(&vc->out_iter, WRITE, vq->iov, vc->out, vc->out_size); + ret = 0; + +done: + return ret; +} + +static int +vhost_scsi_chk_size(struct vhost_virtqueue *vq, struct vhost_scsi_ctx *vc) +{ + if (unlikely(vc->in_size < vc->rsp_size)) { + vq_err(vq, + "Response buf too small, need min %zu bytes got %zu", + vc->rsp_size, vc->in_size); + return -EINVAL; + } else if (unlikely(vc->out_size < vc->req_size)) { + vq_err(vq, + "Request buf too small, need min %zu bytes got %zu", + vc->req_size, vc->out_size); + return -EIO; + } + + return 0; +} + +static int +vhost_scsi_get_req(struct vhost_virtqueue *vq, struct vhost_scsi_ctx *vc, + struct vhost_scsi_tpg **tpgp) +{ + int ret = -EIO; + + if (unlikely(!copy_from_iter_full(vc->req, vc->req_size, + &vc->out_iter))) { + vq_err(vq, "Faulted on copy_from_iter\n"); + } else if (unlikely(*vc->lunp != 1)) { + /* virtio-scsi spec requires byte 0 of the lun to be 1 */ + vq_err(vq, "Illegal virtio-scsi lun: %u\n", *vc->lunp); + } else { + struct vhost_scsi_tpg **vs_tpg, *tpg; + + vs_tpg = vq->private_data; /* validated at handler entry */ + + tpg = READ_ONCE(vs_tpg[*vc->target]); + if (unlikely(!tpg)) { + vq_err(vq, "Target 0x%x does not exist\n", *vc->target); + } else { + if (tpgp) + *tpgp = tpg; + ret = 0; + } + } + + return ret; +} + static void vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) { struct vhost_scsi_tpg **vs_tpg, *tpg; struct virtio_scsi_cmd_req v_req; struct virtio_scsi_cmd_req_pi v_req_pi; + struct vhost_scsi_ctx vc; struct vhost_scsi_cmd *cmd; - struct iov_iter out_iter, in_iter, prot_iter, data_iter; + struct iov_iter in_iter, prot_iter, data_iter; u64 tag; u32 exp_data_len, data_direction; - unsigned int out = 0, in = 0; - int head, ret, prot_bytes; - size_t req_size, rsp_size = sizeof(struct virtio_scsi_cmd_resp); - size_t out_size, in_size; + int ret, prot_bytes; u16 lun; - u8 *target, *lunp, task_attr; + u8 task_attr; bool t10_pi = vhost_has_feature(vq, VIRTIO_SCSI_F_T10_PI); - void *req, *cdb; + void *cdb; mutex_lock(&vq->mutex); /* @@ -828,85 +937,47 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) if (!vs_tpg) goto out; + memset(&vc, 0, sizeof(vc)); + vc.rsp_size = sizeof(struct virtio_scsi_cmd_resp); + vhost_disable_notify(&vs->dev, vq); for (;;) { - head = vhost_get_vq_desc(vq, vq->iov, - ARRAY_SIZE(vq->iov), &out, &in, - NULL, NULL); - pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n", - head, out, in); - /* On error, stop handling until the next kick. */ - if (unlikely(head < 0)) - break; - /* Nothing new? Wait for eventfd to tell us they refilled. */ - if (head == vq->num) { - if (unlikely(vhost_enable_notify(&vs->dev, vq))) { - vhost_disable_notify(&vs->dev, vq); - continue; - } - break; - } - /* - * Check for a sane response buffer so we can report early - * errors back to the guest. - */ - if (unlikely(vq->iov[out].iov_len < rsp_size)) { - vq_err(vq, "Expecting at least virtio_scsi_cmd_resp" - " size, got %zu bytes\n", vq->iov[out].iov_len); - break; - } + ret = vhost_scsi_get_desc(vs, vq, &vc); + if (ret) + goto err; + /* * Setup pointers and values based upon different virtio-scsi * request header if T10_PI is enabled in KVM guest. */ if (t10_pi) { - req = &v_req_pi; - req_size = sizeof(v_req_pi); - lunp = &v_req_pi.lun[0]; - target = &v_req_pi.lun[1]; + vc.req = &v_req_pi; + vc.req_size = sizeof(v_req_pi); + vc.lunp = &v_req_pi.lun[0]; + vc.target = &v_req_pi.lun[1]; } else { - req = &v_req; - req_size = sizeof(v_req); - lunp = &v_req.lun[0]; - target = &v_req.lun[1]; + vc.req = &v_req; + vc.req_size = sizeof(v_req); + vc.lunp = &v_req.lun[0]; + vc.target = &v_req.lun[1]; } - /* - * FIXME: Not correct for BIDI operation - */ - out_size = iov_length(vq->iov, out); - in_size = iov_length(&vq->iov[out], in); /* - * Copy over the virtio-scsi request header, which for a - * ANY_LAYOUT enabled guest may span multiple iovecs, or a - * single iovec may contain both the header + outgoing - * WRITE payloads. - * - * copy_from_iter() will advance out_iter, so that it will - * point at the start of the outgoing WRITE payload, if - * DMA_TO_DEVICE is set. + * Validate the size of request and response buffers. + * Check for a sane response buffer so we can report + * early errors back to the guest. */ - iov_iter_init(&out_iter, WRITE, vq->iov, out, out_size); + ret = vhost_scsi_chk_size(vq, &vc); + if (ret) + goto err; - if (unlikely(!copy_from_iter_full(req, req_size, &out_iter))) { - vq_err(vq, "Faulted on copy_from_iter\n"); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; - } - /* virtio-scsi spec requires byte 0 of the lun to be 1 */ - if (unlikely(*lunp != 1)) { - vq_err(vq, "Illegal virtio-scsi lun: %u\n", *lunp); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; - } + ret = vhost_scsi_get_req(vq, &vc, &tpg); + if (ret) + goto err; + + ret = -EIO; /* bad target on any error from here on */ - tpg = READ_ONCE(vs_tpg[*target]); - if (unlikely(!tpg)) { - /* Target does not exist, fail the request */ - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; - } /* * Determine data_direction by calculating the total outgoing * iovec sizes + incoming iovec sizes vs. virtio-scsi request + @@ -924,17 +995,17 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) */ prot_bytes = 0; - if (out_size > req_size) { + if (vc.out_size > vc.req_size) { data_direction = DMA_TO_DEVICE; - exp_data_len = out_size - req_size; - data_iter = out_iter; - } else if (in_size > rsp_size) { + exp_data_len = vc.out_size - vc.req_size; + data_iter = vc.out_iter; + } else if (vc.in_size > vc.rsp_size) { data_direction = DMA_FROM_DEVICE; - exp_data_len = in_size - rsp_size; + exp_data_len = vc.in_size - vc.rsp_size; - iov_iter_init(&in_iter, READ, &vq->iov[out], in, - rsp_size + exp_data_len); - iov_iter_advance(&in_iter, rsp_size); + iov_iter_init(&in_iter, READ, &vq->iov[vc.out], vc.in, + vc.rsp_size + exp_data_len); + iov_iter_advance(&in_iter, vc.rsp_size); data_iter = in_iter; } else { data_direction = DMA_NONE; @@ -950,21 +1021,20 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) if (data_direction != DMA_TO_DEVICE) { vq_err(vq, "Received non zero pi_bytesout," " but wrong data_direction\n"); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; + goto err; } prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesout); } else if (v_req_pi.pi_bytesin) { if (data_direction != DMA_FROM_DEVICE) { vq_err(vq, "Received non zero pi_bytesin," " but wrong data_direction\n"); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; + goto err; } prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesin); } /* - * Set prot_iter to data_iter, and advance past any + * Set prot_iter to data_iter and truncate it to + * prot_bytes, and advance data_iter past any * preceeding prot_bytes that may be present. * * Also fix up the exp_data_len to reflect only the @@ -973,6 +1043,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) if (prot_bytes) { exp_data_len -= prot_bytes; prot_iter = data_iter; + iov_iter_truncate(&prot_iter, prot_bytes); iov_iter_advance(&data_iter, prot_bytes); } tag = vhost64_to_cpu(vq, v_req_pi.tag); @@ -996,8 +1067,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) vq_err(vq, "Received SCSI CDB with command_size: %d that" " exceeds SCSI_MAX_VARLEN_CDB_SIZE: %d\n", scsi_command_size(cdb), VHOST_SCSI_MAX_CDB_SIZE); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; + goto err; } cmd = vhost_scsi_get_tag(vq, tpg, cdb, tag, lun, task_attr, exp_data_len + prot_bytes, @@ -1005,13 +1075,12 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) if (IS_ERR(cmd)) { vq_err(vq, "vhost_scsi_get_tag failed %ld\n", PTR_ERR(cmd)); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; + goto err; } cmd->tvc_vhost = vs; cmd->tvc_vq = vq; - cmd->tvc_resp_iov = vq->iov[out]; - cmd->tvc_in_iovs = in; + cmd->tvc_resp_iov = vq->iov[vc.out]; + cmd->tvc_in_iovs = vc.in; pr_debug("vhost_scsi got command opcode: %#02x, lun: %d\n", cmd->tvc_cdb[0], cmd->tvc_lun); @@ -1019,14 +1088,12 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) " %d\n", cmd, exp_data_len, prot_bytes, data_direction); if (data_direction != DMA_NONE) { - ret = vhost_scsi_mapal(cmd, - prot_bytes, &prot_iter, - exp_data_len, &data_iter); - if (unlikely(ret)) { + if (unlikely(vhost_scsi_mapal(cmd, prot_bytes, + &prot_iter, exp_data_len, + &data_iter))) { vq_err(vq, "Failed to map iov to sgl\n"); vhost_scsi_release_cmd(&cmd->tvc_se_cmd); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; + goto err; } } /* @@ -1034,7 +1101,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) * complete the virtio-scsi request in TCM callback context via * vhost_scsi_queue_data_in() and vhost_scsi_queue_status() */ - cmd->tvc_vq_desc = head; + cmd->tvc_vq_desc = vc.head; /* * Dispatch cmd descriptor for cmwq execution in process * context provided by vhost_scsi_workqueue. This also ensures @@ -1043,6 +1110,166 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) */ INIT_WORK(&cmd->work, vhost_scsi_submission_work); queue_work(vhost_scsi_workqueue, &cmd->work); + ret = 0; +err: + /* + * ENXIO: No more requests, or read error, wait for next kick + * EINVAL: Invalid response buffer, drop the request + * EIO: Respond with bad target + * EAGAIN: Pending request + */ + if (ret == -ENXIO) + break; + else if (ret == -EIO) + vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out); + } +out: + mutex_unlock(&vq->mutex); +} + +static void +vhost_scsi_send_tmf_reject(struct vhost_scsi *vs, + struct vhost_virtqueue *vq, + struct vhost_scsi_ctx *vc) +{ + struct virtio_scsi_ctrl_tmf_resp __user *resp; + struct virtio_scsi_ctrl_tmf_resp rsp; + int ret; + + pr_debug("%s\n", __func__); + memset(&rsp, 0, sizeof(rsp)); + rsp.response = VIRTIO_SCSI_S_FUNCTION_REJECTED; + resp = vq->iov[vc->out].iov_base; + ret = __copy_to_user(resp, &rsp, sizeof(rsp)); + if (!ret) + vhost_add_used_and_signal(&vs->dev, vq, vc->head, 0); + else + pr_err("Faulted on virtio_scsi_ctrl_tmf_resp\n"); +} + +static void +vhost_scsi_send_an_resp(struct vhost_scsi *vs, + struct vhost_virtqueue *vq, + struct vhost_scsi_ctx *vc) +{ + struct virtio_scsi_ctrl_an_resp __user *resp; + struct virtio_scsi_ctrl_an_resp rsp; + int ret; + + pr_debug("%s\n", __func__); + memset(&rsp, 0, sizeof(rsp)); /* event_actual = 0 */ + rsp.response = VIRTIO_SCSI_S_OK; + resp = vq->iov[vc->out].iov_base; + ret = __copy_to_user(resp, &rsp, sizeof(rsp)); + if (!ret) + vhost_add_used_and_signal(&vs->dev, vq, vc->head, 0); + else + pr_err("Faulted on virtio_scsi_ctrl_an_resp\n"); +} + +static void +vhost_scsi_ctl_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) +{ + union { + __virtio32 type; + struct virtio_scsi_ctrl_an_req an; + struct virtio_scsi_ctrl_tmf_req tmf; + } v_req; + struct vhost_scsi_ctx vc; + size_t typ_size; + int ret; + + mutex_lock(&vq->mutex); + /* + * We can handle the vq only after the endpoint is setup by calling the + * VHOST_SCSI_SET_ENDPOINT ioctl. + */ + if (!vq->private_data) + goto out; + + memset(&vc, 0, sizeof(vc)); + + vhost_disable_notify(&vs->dev, vq); + + for (;;) { + ret = vhost_scsi_get_desc(vs, vq, &vc); + if (ret) + goto err; + + /* + * Get the request type first in order to setup + * other parameters dependent on the type. + */ + vc.req = &v_req.type; + typ_size = sizeof(v_req.type); + + if (unlikely(!copy_from_iter_full(vc.req, typ_size, + &vc.out_iter))) { + vq_err(vq, "Faulted on copy_from_iter tmf type\n"); + /* + * The size of the response buffer depends on the + * request type and must be validated against it. + * Since the request type is not known, don't send + * a response. + */ + continue; + } + + switch (v_req.type) { + case VIRTIO_SCSI_T_TMF: + vc.req = &v_req.tmf; + vc.req_size = sizeof(struct virtio_scsi_ctrl_tmf_req); + vc.rsp_size = sizeof(struct virtio_scsi_ctrl_tmf_resp); + vc.lunp = &v_req.tmf.lun[0]; + vc.target = &v_req.tmf.lun[1]; + break; + case VIRTIO_SCSI_T_AN_QUERY: + case VIRTIO_SCSI_T_AN_SUBSCRIBE: + vc.req = &v_req.an; + vc.req_size = sizeof(struct virtio_scsi_ctrl_an_req); + vc.rsp_size = sizeof(struct virtio_scsi_ctrl_an_resp); + vc.lunp = &v_req.an.lun[0]; + vc.target = NULL; + break; + default: + vq_err(vq, "Unknown control request %d", v_req.type); + continue; + } + + /* + * Validate the size of request and response buffers. + * Check for a sane response buffer so we can report + * early errors back to the guest. + */ + ret = vhost_scsi_chk_size(vq, &vc); + if (ret) + goto err; + + /* + * Get the rest of the request now that its size is known. + */ + vc.req += typ_size; + vc.req_size -= typ_size; + + ret = vhost_scsi_get_req(vq, &vc, NULL); + if (ret) + goto err; + + if (v_req.type == VIRTIO_SCSI_T_TMF) + vhost_scsi_send_tmf_reject(vs, vq, &vc); + else + vhost_scsi_send_an_resp(vs, vq, &vc); +err: + /* + * ENXIO: No more requests, or read error, wait for next kick + * EINVAL: Invalid response buffer, drop the request + * EIO: Respond with bad target + * EAGAIN: Pending request + */ + if (ret == -ENXIO) + break; + else if (ret == -EIO) + vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out); } out: mutex_unlock(&vq->mutex); @@ -1050,7 +1277,12 @@ out: static void vhost_scsi_ctl_handle_kick(struct vhost_work *work) { + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_scsi *vs = container_of(vq->dev, struct vhost_scsi, dev); + pr_debug("%s: The handling func for control queue.\n", __func__); + vhost_scsi_ctl_handle_vq(vs, vq); } static void diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c @@ -41,13 +41,34 @@ #define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256 #define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80 +#define VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG (__GFP_NORETRY | __GFP_NOWARN | \ + __GFP_NOMEMALLOC) +/* The order of free page blocks to report to host */ +#define VIRTIO_BALLOON_FREE_PAGE_ORDER (MAX_ORDER - 1) +/* The size of a free page block in bytes */ +#define VIRTIO_BALLOON_FREE_PAGE_SIZE \ + (1 << (VIRTIO_BALLOON_FREE_PAGE_ORDER + PAGE_SHIFT)) + #ifdef CONFIG_BALLOON_COMPACTION static struct vfsmount *balloon_mnt; #endif +enum virtio_balloon_vq { + VIRTIO_BALLOON_VQ_INFLATE, + VIRTIO_BALLOON_VQ_DEFLATE, + VIRTIO_BALLOON_VQ_STATS, + VIRTIO_BALLOON_VQ_FREE_PAGE, + VIRTIO_BALLOON_VQ_MAX +}; + struct virtio_balloon { struct virtio_device *vdev; - struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; + struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq; + + /* Balloon's own wq for cpu-intensive work items */ + struct workqueue_struct *balloon_wq; + /* The free page reporting work item submitted to the balloon wq */ + struct work_struct report_free_page_work; /* The balloon servicing is delegated to a freezable workqueue. */ struct work_struct update_balloon_stats_work; @@ -57,6 +78,18 @@ struct virtio_balloon { spinlock_t stop_update_lock; bool stop_update; + /* The list of allocated free pages, waiting to be given back to mm */ + struct list_head free_page_list; + spinlock_t free_page_list_lock; + /* The number of free page blocks on the above list */ + unsigned long num_free_page_blocks; + /* The cmd id received from host */ + u32 cmd_id_received; + /* The cmd id that is actively in use */ + __virtio32 cmd_id_active; + /* Buffer to store the stop sign */ + __virtio32 cmd_id_stop; + /* Waiting for host to ack the pages we released. */ wait_queue_head_t acked; @@ -320,17 +353,6 @@ static void stats_handle_request(struct virtio_balloon *vb) virtqueue_kick(vq); } -static void virtballoon_changed(struct virtio_device *vdev) -{ - struct virtio_balloon *vb = vdev->priv; - unsigned long flags; - - spin_lock_irqsave(&vb->stop_update_lock, flags); - if (!vb->stop_update) - queue_work(system_freezable_wq, &vb->update_balloon_size_work); - spin_unlock_irqrestore(&vb->stop_update_lock, flags); -} - static inline s64 towards_target(struct virtio_balloon *vb) { s64 target; @@ -347,6 +369,60 @@ static inline s64 towards_target(struct virtio_balloon *vb) return target - vb->num_pages; } +/* Gives back @num_to_return blocks of free pages to mm. */ +static unsigned long return_free_pages_to_mm(struct virtio_balloon *vb, + unsigned long num_to_return) +{ + struct page *page; + unsigned long num_returned; + + spin_lock_irq(&vb->free_page_list_lock); + for (num_returned = 0; num_returned < num_to_return; num_returned++) { + page = balloon_page_pop(&vb->free_page_list); + if (!page) + break; + free_pages((unsigned long)page_address(page), + VIRTIO_BALLOON_FREE_PAGE_ORDER); + } + vb->num_free_page_blocks -= num_returned; + spin_unlock_irq(&vb->free_page_list_lock); + + return num_returned; +} + +static void virtballoon_changed(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + unsigned long flags; + s64 diff = towards_target(vb); + + if (diff) { + spin_lock_irqsave(&vb->stop_update_lock, flags); + if (!vb->stop_update) + queue_work(system_freezable_wq, + &vb->update_balloon_size_work); + spin_unlock_irqrestore(&vb->stop_update_lock, flags); + } + + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { + virtio_cread(vdev, struct virtio_balloon_config, + free_page_report_cmd_id, &vb->cmd_id_received); + if (vb->cmd_id_received == VIRTIO_BALLOON_CMD_ID_DONE) { + /* Pass ULONG_MAX to give back all the free pages */ + return_free_pages_to_mm(vb, ULONG_MAX); + } else if (vb->cmd_id_received != VIRTIO_BALLOON_CMD_ID_STOP && + vb->cmd_id_received != + virtio32_to_cpu(vdev, vb->cmd_id_active)) { + spin_lock_irqsave(&vb->stop_update_lock, flags); + if (!vb->stop_update) { + queue_work(vb->balloon_wq, + &vb->report_free_page_work); + } + spin_unlock_irqrestore(&vb->stop_update_lock, flags); + } + } +} + static void update_balloon_size(struct virtio_balloon *vb) { u32 actual = vb->num_pages; @@ -389,26 +465,44 @@ static void update_balloon_size_func(struct work_struct *work) static int init_vqs(struct virtio_balloon *vb) { - struct virtqueue *vqs[3]; - vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request }; - static const char * const names[] = { "inflate", "deflate", "stats" }; - int err, nvqs; + struct virtqueue *vqs[VIRTIO_BALLOON_VQ_MAX]; + vq_callback_t *callbacks[VIRTIO_BALLOON_VQ_MAX]; + const char *names[VIRTIO_BALLOON_VQ_MAX]; + int err; /* - * We expect two virtqueues: inflate and deflate, and - * optionally stat. + * Inflateq and deflateq are used unconditionally. The names[] + * will be NULL if the related feature is not enabled, which will + * cause no allocation for the corresponding virtqueue in find_vqs. */ - nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2; - err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL); + callbacks[VIRTIO_BALLOON_VQ_INFLATE] = balloon_ack; + names[VIRTIO_BALLOON_VQ_INFLATE] = "inflate"; + callbacks[VIRTIO_BALLOON_VQ_DEFLATE] = balloon_ack; + names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate"; + names[VIRTIO_BALLOON_VQ_STATS] = NULL; + names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; + + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) { + names[VIRTIO_BALLOON_VQ_STATS] = "stats"; + callbacks[VIRTIO_BALLOON_VQ_STATS] = stats_request; + } + + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { + names[VIRTIO_BALLOON_VQ_FREE_PAGE] = "free_page_vq"; + callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; + } + + err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX, + vqs, callbacks, names, NULL, NULL); if (err) return err; - vb->inflate_vq = vqs[0]; - vb->deflate_vq = vqs[1]; + vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE]; + vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE]; if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) { struct scatterlist sg; unsigned int num_stats; - vb->stats_vq = vqs[2]; + vb->stats_vq = vqs[VIRTIO_BALLOON_VQ_STATS]; /* * Prime this virtqueue with one buffer so the hypervisor can @@ -426,9 +520,145 @@ static int init_vqs(struct virtio_balloon *vb) } virtqueue_kick(vb->stats_vq); } + + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) + vb->free_page_vq = vqs[VIRTIO_BALLOON_VQ_FREE_PAGE]; + + return 0; +} + +static int send_cmd_id_start(struct virtio_balloon *vb) +{ + struct scatterlist sg; + struct virtqueue *vq = vb->free_page_vq; + int err, unused; + + /* Detach all the used buffers from the vq */ + while (virtqueue_get_buf(vq, &unused)) + ; + + vb->cmd_id_active = cpu_to_virtio32(vb->vdev, vb->cmd_id_received); + sg_init_one(&sg, &vb->cmd_id_active, sizeof(vb->cmd_id_active)); + err = virtqueue_add_outbuf(vq, &sg, 1, &vb->cmd_id_active, GFP_KERNEL); + if (!err) + virtqueue_kick(vq); + return err; +} + +static int send_cmd_id_stop(struct virtio_balloon *vb) +{ + struct scatterlist sg; + struct virtqueue *vq = vb->free_page_vq; + int err, unused; + + /* Detach all the used buffers from the vq */ + while (virtqueue_get_buf(vq, &unused)) + ; + + sg_init_one(&sg, &vb->cmd_id_stop, sizeof(vb->cmd_id_stop)); + err = virtqueue_add_outbuf(vq, &sg, 1, &vb->cmd_id_stop, GFP_KERNEL); + if (!err) + virtqueue_kick(vq); + return err; +} + +static int get_free_page_and_send(struct virtio_balloon *vb) +{ + struct virtqueue *vq = vb->free_page_vq; + struct page *page; + struct scatterlist sg; + int err, unused; + void *p; + + /* Detach all the used buffers from the vq */ + while (virtqueue_get_buf(vq, &unused)) + ; + + page = alloc_pages(VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG, + VIRTIO_BALLOON_FREE_PAGE_ORDER); + /* + * When the allocation returns NULL, it indicates that we have got all + * the possible free pages, so return -EINTR to stop. + */ + if (!page) + return -EINTR; + + p = page_address(page); + sg_init_one(&sg, p, VIRTIO_BALLOON_FREE_PAGE_SIZE); + /* There is always 1 entry reserved for the cmd id to use. */ + if (vq->num_free > 1) { + err = virtqueue_add_inbuf(vq, &sg, 1, p, GFP_KERNEL); + if (unlikely(err)) { + free_pages((unsigned long)p, + VIRTIO_BALLOON_FREE_PAGE_ORDER); + return err; + } + virtqueue_kick(vq); + spin_lock_irq(&vb->free_page_list_lock); + balloon_page_push(&vb->free_page_list, page); + vb->num_free_page_blocks++; + spin_unlock_irq(&vb->free_page_list_lock); + } else { + /* + * The vq has no available entry to add this page block, so + * just free it. + */ + free_pages((unsigned long)p, VIRTIO_BALLOON_FREE_PAGE_ORDER); + } + + return 0; +} + +static int send_free_pages(struct virtio_balloon *vb) +{ + int err; + u32 cmd_id_active; + + while (1) { + /* + * If a stop id or a new cmd id was just received from host, + * stop the reporting. + */ + cmd_id_active = virtio32_to_cpu(vb->vdev, vb->cmd_id_active); + if (cmd_id_active != vb->cmd_id_received) + break; + + /* + * The free page blocks are allocated and sent to host one by + * one. + */ + err = get_free_page_and_send(vb); + if (err == -EINTR) + break; + else if (unlikely(err)) + return err; + } + return 0; } +static void report_free_page_func(struct work_struct *work) +{ + int err; + struct virtio_balloon *vb = container_of(work, struct virtio_balloon, + report_free_page_work); + struct device *dev = &vb->vdev->dev; + + /* Start by sending the received cmd id to host with an outbuf. */ + err = send_cmd_id_start(vb); + if (unlikely(err)) + dev_err(dev, "Failed to send a start id, err = %d\n", err); + + err = send_free_pages(vb); + if (unlikely(err)) + dev_err(dev, "Failed to send a free page, err = %d\n", err); + + /* End by sending a stop id to host with an outbuf. */ + err = send_cmd_id_stop(vb); + if (unlikely(err)) + dev_err(dev, "Failed to send a stop id, err = %d\n", err); +} + #ifdef CONFIG_BALLOON_COMPACTION /* * virtballoon_migratepage - perform the balloon page migration on behalf of @@ -512,14 +742,23 @@ static struct file_system_type balloon_fs = { #endif /* CONFIG_BALLOON_COMPACTION */ -static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker, - struct shrink_control *sc) +static unsigned long shrink_free_pages(struct virtio_balloon *vb, + unsigned long pages_to_free) { - unsigned long pages_to_free, pages_freed = 0; - struct virtio_balloon *vb = container_of(shrinker, - struct virtio_balloon, shrinker); + unsigned long blocks_to_free, blocks_freed; - pages_to_free = sc->nr_to_scan * VIRTIO_BALLOON_PAGES_PER_PAGE; + pages_to_free = round_up(pages_to_free, + 1 << VIRTIO_BALLOON_FREE_PAGE_ORDER); + blocks_to_free = pages_to_free >> VIRTIO_BALLOON_FREE_PAGE_ORDER; + blocks_freed = return_free_pages_to_mm(vb, blocks_to_free); + + return blocks_freed << VIRTIO_BALLOON_FREE_PAGE_ORDER; +} + +static unsigned long shrink_balloon_pages(struct virtio_balloon *vb, + unsigned long pages_to_free) +{ + unsigned long pages_freed = 0; /* * One invocation of leak_balloon can deflate at most @@ -527,12 +766,33 @@ static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker, * multiple times to deflate pages till reaching pages_to_free. */ while (vb->num_pages && pages_to_free) { + pages_freed += leak_balloon(vb, pages_to_free) / + VIRTIO_BALLOON_PAGES_PER_PAGE; pages_to_free -= pages_freed; - pages_freed += leak_balloon(vb, pages_to_free); } update_balloon_size(vb); - return pages_freed / VIRTIO_BALLOON_PAGES_PER_PAGE; + return pages_freed; +} + +static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long pages_to_free, pages_freed = 0; + struct virtio_balloon *vb = container_of(shrinker, + struct virtio_balloon, shrinker); + + pages_to_free = sc->nr_to_scan * VIRTIO_BALLOON_PAGES_PER_PAGE; + + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) + pages_freed = shrink_free_pages(vb, pages_to_free); + + if (pages_freed >= pages_to_free) + return pages_freed; + + pages_freed += shrink_balloon_pages(vb, pages_to_free - pages_freed); + + return pages_freed; } static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker, @@ -540,8 +800,12 @@ static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker, { struct virtio_balloon *vb = container_of(shrinker, struct virtio_balloon, shrinker); + unsigned long count; - return vb->num_pages / VIRTIO_BALLOON_PAGES_PER_PAGE; + count = vb->num_pages / VIRTIO_BALLOON_PAGES_PER_PAGE; + count += vb->num_free_page_blocks >> VIRTIO_BALLOON_FREE_PAGE_ORDER; + + return count; } static void virtio_balloon_unregister_shrinker(struct virtio_balloon *vb) @@ -561,6 +825,7 @@ static int virtio_balloon_register_shrinker(struct virtio_balloon *vb) static int virtballoon_probe(struct virtio_device *vdev) { struct virtio_balloon *vb; + __u32 poison_val; int err; if (!vdev->config->get) { @@ -604,6 +869,36 @@ static int virtballoon_probe(struct virtio_device *vdev) } vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops; #endif + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { + /* + * There is always one entry reserved for cmd id, so the ring + * size needs to be at least two to report free page hints. + */ + if (virtqueue_get_vring_size(vb->free_page_vq) < 2) { + err = -ENOSPC; + goto out_del_vqs; + } + vb->balloon_wq = alloc_workqueue("balloon-wq", + WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0); + if (!vb->balloon_wq) { + err = -ENOMEM; + goto out_del_vqs; + } + INIT_WORK(&vb->report_free_page_work, report_free_page_func); + vb->cmd_id_received = VIRTIO_BALLOON_CMD_ID_STOP; + vb->cmd_id_active = cpu_to_virtio32(vb->vdev, + VIRTIO_BALLOON_CMD_ID_STOP); + vb->cmd_id_stop = cpu_to_virtio32(vb->vdev, + VIRTIO_BALLOON_CMD_ID_STOP); + vb->num_free_page_blocks = 0; + spin_lock_init(&vb->free_page_list_lock); + INIT_LIST_HEAD(&vb->free_page_list); + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) { + memset(&poison_val, PAGE_POISON, sizeof(poison_val)); + virtio_cwrite(vb->vdev, struct virtio_balloon_config, + poison_val, &poison_val); + } + } /* * We continue to use VIRTIO_BALLOON_F_DEFLATE_ON_OOM to decide if a * shrinker needs to be registered to relieve memory pressure. @@ -611,7 +906,7 @@ static int virtballoon_probe(struct virtio_device *vdev) if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) { err = virtio_balloon_register_shrinker(vb); if (err) - goto out_del_vqs; + goto out_del_balloon_wq; } virtio_device_ready(vdev); @@ -619,6 +914,9 @@ static int virtballoon_probe(struct virtio_device *vdev) virtballoon_changed(vdev); return 0; +out_del_balloon_wq: + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) + destroy_workqueue(vb->balloon_wq); out_del_vqs: vdev->config->del_vqs(vdev); out_free_vb: @@ -652,6 +950,11 @@ static void virtballoon_remove(struct virtio_device *vdev) cancel_work_sync(&vb->update_balloon_size_work); cancel_work_sync(&vb->update_balloon_stats_work); + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { + cancel_work_sync(&vb->report_free_page_work); + destroy_workqueue(vb->balloon_wq); + } + remove_common(vb); #ifdef CONFIG_BALLOON_COMPACTION if (vb->vb_dev_info.inode) @@ -695,6 +998,9 @@ static int virtballoon_restore(struct virtio_device *vdev) static int virtballoon_validate(struct virtio_device *vdev) { + if (!page_poisoning_enabled()) + __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_POISON); + __virtio_clear_bit(vdev, VIRTIO_F_IOMMU_PLATFORM); return 0; } @@ -703,6 +1009,8 @@ static unsigned int features[] = { VIRTIO_BALLOON_F_MUST_TELL_HOST, VIRTIO_BALLOON_F_STATS_VQ, VIRTIO_BALLOON_F_DEFLATE_ON_OOM, + VIRTIO_BALLOON_F_FREE_PAGE_HINT, + VIRTIO_BALLOON_F_PAGE_POISON, }; static struct virtio_driver virtio_balloon_driver = { diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h @@ -34,15 +34,23 @@ #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */ #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ +#define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */ +#define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */ /* Size of a PFN in the balloon interface. */ #define VIRTIO_BALLOON_PFN_SHIFT 12 +#define VIRTIO_BALLOON_CMD_ID_STOP 0 +#define VIRTIO_BALLOON_CMD_ID_DONE 1 struct virtio_balloon_config { /* Number of pages host wants Guest to give up. */ __u32 num_pages; /* Number of pages we've actually got in balloon. */ __u32 actual; + /* Free page report command id, readonly by guest */ + __u32 free_page_report_cmd_id; + /* Stores PAGE_POISON if page poisoning is in use */ + __u32 poison_val; }; #define VIRTIO_BALLOON_S_SWAP_IN 0 /* Amount of memory swapped in */ diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config @@ -20,6 +20,7 @@ CONFIG_PARAVIRT=y CONFIG_KVM_GUEST=y CONFIG_S390_GUEST=y CONFIG_VIRTIO=y +CONFIG_VIRTIO_MENU=y CONFIG_VIRTIO_PCI=y CONFIG_VIRTIO_BLK=y CONFIG_VIRTIO_CONSOLE=y diff --git a/mm/page_poison.c b/mm/page_poison.c @@ -17,6 +17,11 @@ static int __init early_page_poison_param(char *buf) } early_param("page_poison", early_page_poison_param); +/** + * page_poisoning_enabled - check if page poisoning is enabled + * + * Return true if page poisoning is enabled, or false if not. + */ bool page_poisoning_enabled(void) { /* @@ -29,6 +34,7 @@ bool page_poisoning_enabled(void) (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && debug_pagealloc_enabled())); } +EXPORT_SYMBOL_GPL(page_poisoning_enabled); static void poison_page(struct page *page) {