Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 1 Nov 2018 21:42:49 +0000 (14:42 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 1 Nov 2018 21:42:49 +0000 (14:42 -0700)
Pull virtio/vhost updates from Michael Tsirkin:
 "Fixes and tweaks:

   - virtio balloon page hinting support

   - vhost scsi control queue

   - misc fixes"

* tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost:
  MAINTAINERS: remove reference to bogus vsock file
  vhost/scsi: Use common handling code in request queue handler
  vhost/scsi: Extract common handling code from control queue handler
  vhost/scsi: Respond to control queue operations
  vhost/scsi: truncate T10 PI iov_iter to prot_bytes
  virtio-balloon: VIRTIO_BALLOON_F_PAGE_POISON
  mm/page_poison: expose page_poisoning_enabled to kernel modules
  virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT
  kvm_config: add CONFIG_VIRTIO_MENU

MAINTAINERS
drivers/vhost/scsi.c
drivers/virtio/virtio_balloon.c
include/uapi/linux/virtio_balloon.h
kernel/configs/kvm_guest.config
mm/page_poison.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 690c2f6..bb97067 100644 (file)
@@ -15858,7 +15858,6 @@ F:      net/vmw_vsock/virtio_transport_common.c
 F:     net/vmw_vsock/virtio_transport.c
 F:     drivers/net/vsockmon.c
 F:     drivers/vhost/vsock.c
-F:     drivers/vhost/vsock.h
 F:     tools/testing/vsock/
 
 VIRTIO CONSOLE DRIVER
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index c24bb69..50dffe8 100644 (file)
@@ -203,6 +203,19 @@ struct vhost_scsi {
        int vs_events_nr; /* num of pending events, protected by vq->mutex */
 };
 
+/*
+ * Context for processing request and control queue operations.
+ */
+struct vhost_scsi_ctx {
+       int head;
+       unsigned int out, in;
+       size_t req_size, rsp_size;
+       size_t out_size, in_size;
+       u8 *target, *lunp;
+       void *req;
+       struct iov_iter out_iter;
+};
+
 static struct workqueue_struct *vhost_scsi_workqueue;
 
 /* Global spinlock to protect vhost_scsi TPG list for vhost IOCTL access */
@@ -800,24 +813,120 @@ vhost_scsi_send_bad_target(struct vhost_scsi *vs,
                pr_err("Faulted on virtio_scsi_cmd_resp\n");
 }
 
+static int
+vhost_scsi_get_desc(struct vhost_scsi *vs, struct vhost_virtqueue *vq,
+                   struct vhost_scsi_ctx *vc)
+{
+       int ret = -ENXIO;
+
+       vc->head = vhost_get_vq_desc(vq, vq->iov,
+                                    ARRAY_SIZE(vq->iov), &vc->out, &vc->in,
+                                    NULL, NULL);
+
+       pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n",
+                vc->head, vc->out, vc->in);
+
+       /* On error, stop handling until the next kick. */
+       if (unlikely(vc->head < 0))
+               goto done;
+
+       /* Nothing new?  Wait for eventfd to tell us they refilled. */
+       if (vc->head == vq->num) {
+               if (unlikely(vhost_enable_notify(&vs->dev, vq))) {
+                       vhost_disable_notify(&vs->dev, vq);
+                       ret = -EAGAIN;
+               }
+               goto done;
+       }
+
+       /*
+        * Get the size of request and response buffers.
+        * FIXME: Not correct for BIDI operation
+        */
+       vc->out_size = iov_length(vq->iov, vc->out);
+       vc->in_size = iov_length(&vq->iov[vc->out], vc->in);
+
+       /*
+        * Copy over the virtio-scsi request header, which for a
+        * ANY_LAYOUT enabled guest may span multiple iovecs, or a
+        * single iovec may contain both the header + outgoing
+        * WRITE payloads.
+        *
+        * copy_from_iter() will advance out_iter, so that it will
+        * point at the start of the outgoing WRITE payload, if
+        * DMA_TO_DEVICE is set.
+        */
+       iov_iter_init(&vc->out_iter, WRITE, vq->iov, vc->out, vc->out_size);
+       ret = 0;
+
+done:
+       return ret;
+}
+
+static int
+vhost_scsi_chk_size(struct vhost_virtqueue *vq, struct vhost_scsi_ctx *vc)
+{
+       if (unlikely(vc->in_size < vc->rsp_size)) {
+               vq_err(vq,
+                      "Response buf too small, need min %zu bytes got %zu",
+                      vc->rsp_size, vc->in_size);
+               return -EINVAL;
+       } else if (unlikely(vc->out_size < vc->req_size)) {
+               vq_err(vq,
+                      "Request buf too small, need min %zu bytes got %zu",
+                      vc->req_size, vc->out_size);
+               return -EIO;
+       }
+
+       return 0;
+}
+
+static int
+vhost_scsi_get_req(struct vhost_virtqueue *vq, struct vhost_scsi_ctx *vc,
+                  struct vhost_scsi_tpg **tpgp)
+{
+       int ret = -EIO;
+
+       if (unlikely(!copy_from_iter_full(vc->req, vc->req_size,
+                                         &vc->out_iter))) {
+               vq_err(vq, "Faulted on copy_from_iter\n");
+       } else if (unlikely(*vc->lunp != 1)) {
+               /* virtio-scsi spec requires byte 0 of the lun to be 1 */
+               vq_err(vq, "Illegal virtio-scsi lun: %u\n", *vc->lunp);
+       } else {
+               struct vhost_scsi_tpg **vs_tpg, *tpg;
+
+               vs_tpg = vq->private_data;      /* validated at handler entry */
+
+               tpg = READ_ONCE(vs_tpg[*vc->target]);
+               if (unlikely(!tpg)) {
+                       vq_err(vq, "Target 0x%x does not exist\n", *vc->target);
+               } else {
+                       if (tpgp)
+                               *tpgp = tpg;
+                       ret = 0;
+               }
+       }
+
+       return ret;
+}
+
 static void
 vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 {
        struct vhost_scsi_tpg **vs_tpg, *tpg;
        struct virtio_scsi_cmd_req v_req;
        struct virtio_scsi_cmd_req_pi v_req_pi;
+       struct vhost_scsi_ctx vc;
        struct vhost_scsi_cmd *cmd;
-       struct iov_iter out_iter, in_iter, prot_iter, data_iter;
+       struct iov_iter in_iter, prot_iter, data_iter;
        u64 tag;
        u32 exp_data_len, data_direction;
-       unsigned int out = 0, in = 0;
-       int head, ret, prot_bytes;
-       size_t req_size, rsp_size = sizeof(struct virtio_scsi_cmd_resp);
-       size_t out_size, in_size;
+       int ret, prot_bytes;
        u16 lun;
-       u8 *target, *lunp, task_attr;
+       u8 task_attr;
        bool t10_pi = vhost_has_feature(vq, VIRTIO_SCSI_F_T10_PI);
-       void *req, *cdb;
+       void *cdb;
 
        mutex_lock(&vq->mutex);
        /*
@@ -828,85 +937,47 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
        if (!vs_tpg)
                goto out;
 
+       memset(&vc, 0, sizeof(vc));
+       vc.rsp_size = sizeof(struct virtio_scsi_cmd_resp);
+
        vhost_disable_notify(&vs->dev, vq);
 
        for (;;) {
-               head = vhost_get_vq_desc(vq, vq->iov,
-                                        ARRAY_SIZE(vq->iov), &out, &in,
-                                        NULL, NULL);
-               pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n",
-                        head, out, in);
-               /* On error, stop handling until the next kick. */
-               if (unlikely(head < 0))
-                       break;
-               /* Nothing new?  Wait for eventfd to tell us they refilled. */
-               if (head == vq->num) {
-                       if (unlikely(vhost_enable_notify(&vs->dev, vq))) {
-                               vhost_disable_notify(&vs->dev, vq);
-                               continue;
-                       }
-                       break;
-               }
-               /*
-                * Check for a sane response buffer so we can report early
-                * errors back to the guest.
-                */
-               if (unlikely(vq->iov[out].iov_len < rsp_size)) {
-                       vq_err(vq, "Expecting at least virtio_scsi_cmd_resp"
-                               " size, got %zu bytes\n", vq->iov[out].iov_len);
-                       break;
-               }
+               ret = vhost_scsi_get_desc(vs, vq, &vc);
+               if (ret)
+                       goto err;
+
                /*
                 * Setup pointers and values based upon different virtio-scsi
                 * request header if T10_PI is enabled in KVM guest.
                 */
                if (t10_pi) {
-                       req = &v_req_pi;
-                       req_size = sizeof(v_req_pi);
-                       lunp = &v_req_pi.lun[0];
-                       target = &v_req_pi.lun[1];
+                       vc.req = &v_req_pi;
+                       vc.req_size = sizeof(v_req_pi);
+                       vc.lunp = &v_req_pi.lun[0];
+                       vc.target = &v_req_pi.lun[1];
                } else {
-                       req = &v_req;
-                       req_size = sizeof(v_req);
-                       lunp = &v_req.lun[0];
-                       target = &v_req.lun[1];
+                       vc.req = &v_req;
+                       vc.req_size = sizeof(v_req);
+                       vc.lunp = &v_req.lun[0];
+                       vc.target = &v_req.lun[1];
                }
-               /*
-                * FIXME: Not correct for BIDI operation
-                */
-               out_size = iov_length(vq->iov, out);
-               in_size = iov_length(&vq->iov[out], in);
 
                /*
-                * Copy over the virtio-scsi request header, which for a
-                * ANY_LAYOUT enabled guest may span multiple iovecs, or a
-                * single iovec may contain both the header + outgoing
-                * WRITE payloads.
-                *
-                * copy_from_iter() will advance out_iter, so that it will
-                * point at the start of the outgoing WRITE payload, if
-                * DMA_TO_DEVICE is set.
+                * Validate the size of request and response buffers.
+                * Check for a sane response buffer so we can report
+                * early errors back to the guest.
                 */
-               iov_iter_init(&out_iter, WRITE, vq->iov, out, out_size);
+               ret = vhost_scsi_chk_size(vq, &vc);
+               if (ret)
+                       goto err;
 
-               if (unlikely(!copy_from_iter_full(req, req_size, &out_iter))) {
-                       vq_err(vq, "Faulted on copy_from_iter\n");
-                       vhost_scsi_send_bad_target(vs, vq, head, out);
-                       continue;
-               }
-               /* virtio-scsi spec requires byte 0 of the lun to be 1 */
-               if (unlikely(*lunp != 1)) {
-                       vq_err(vq, "Illegal virtio-scsi lun: %u\n", *lunp);
-                       vhost_scsi_send_bad_target(vs, vq, head, out);
-                       continue;
-               }
+               ret = vhost_scsi_get_req(vq, &vc, &tpg);
+               if (ret)
+                       goto err;
+
+               ret = -EIO;     /* bad target on any error from here on */
 
-               tpg = READ_ONCE(vs_tpg[*target]);
-               if (unlikely(!tpg)) {
-                       /* Target does not exist, fail the request */
-                       vhost_scsi_send_bad_target(vs, vq, head, out);
-                       continue;
-               }
                /*
                 * Determine data_direction by calculating the total outgoing
                 * iovec sizes + incoming iovec sizes vs. virtio-scsi request +
@@ -924,17 +995,17 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
                 */
                prot_bytes = 0;
 
-               if (out_size > req_size) {
+               if (vc.out_size > vc.req_size) {
                        data_direction = DMA_TO_DEVICE;
-                       exp_data_len = out_size - req_size;
-                       data_iter = out_iter;
-               } else if (in_size > rsp_size) {
+                       exp_data_len = vc.out_size - vc.req_size;
+                       data_iter = vc.out_iter;
+               } else if (vc.in_size > vc.rsp_size) {
                        data_direction = DMA_FROM_DEVICE;
-                       exp_data_len = in_size - rsp_size;
+                       exp_data_len = vc.in_size - vc.rsp_size;
 
-                       iov_iter_init(&in_iter, READ, &vq->iov[out], in,
-                                     rsp_size + exp_data_len);
-                       iov_iter_advance(&in_iter, rsp_size);
+                       iov_iter_init(&in_iter, READ, &vq->iov[vc.out], vc.in,
+                                     vc.rsp_size + exp_data_len);
+                       iov_iter_advance(&in_iter, vc.rsp_size);
                        data_iter = in_iter;
                } else {
                        data_direction = DMA_NONE;
@@ -950,21 +1021,20 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
                                if (data_direction != DMA_TO_DEVICE) {
                                        vq_err(vq, "Received non zero pi_bytesout,"
                                                " but wrong data_direction\n");
-                                       vhost_scsi_send_bad_target(vs, vq, head, out);
-                                       continue;
+                                       goto err;
                                }
                                prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesout);
                        } else if (v_req_pi.pi_bytesin) {
                                if (data_direction != DMA_FROM_DEVICE) {
                                        vq_err(vq, "Received non zero pi_bytesin,"
                                                " but wrong data_direction\n");
-                                       vhost_scsi_send_bad_target(vs, vq, head, out);
-                                       continue;
+                                       goto err;
                                }
                                prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesin);
                        }
                        /*
-                        * Set prot_iter to data_iter, and advance past any
+                        * Set prot_iter to data_iter and truncate it to
+                        * prot_bytes, and advance data_iter past any
                         * preceeding prot_bytes that may be present.
                         *
                         * Also fix up the exp_data_len to reflect only the
@@ -973,6 +1043,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
                        if (prot_bytes) {
                                exp_data_len -= prot_bytes;
                                prot_iter = data_iter;
+                               iov_iter_truncate(&prot_iter, prot_bytes);
                                iov_iter_advance(&data_iter, prot_bytes);
                        }
                        tag = vhost64_to_cpu(vq, v_req_pi.tag);
@@ -996,8 +1067,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
                        vq_err(vq, "Received SCSI CDB with command_size: %d that"
                                " exceeds SCSI_MAX_VARLEN_CDB_SIZE: %d\n",
                                scsi_command_size(cdb), VHOST_SCSI_MAX_CDB_SIZE);
-                       vhost_scsi_send_bad_target(vs, vq, head, out);
-                       continue;
+                               goto err;
                }
                cmd = vhost_scsi_get_tag(vq, tpg, cdb, tag, lun, task_attr,
                                         exp_data_len + prot_bytes,
@@ -1005,13 +1075,12 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
                if (IS_ERR(cmd)) {
                        vq_err(vq, "vhost_scsi_get_tag failed %ld\n",
                               PTR_ERR(cmd));
-                       vhost_scsi_send_bad_target(vs, vq, head, out);
-                       continue;
+                       goto err;
                }
                cmd->tvc_vhost = vs;
                cmd->tvc_vq = vq;
-               cmd->tvc_resp_iov = vq->iov[out];
-               cmd->tvc_in_iovs = in;
+               cmd->tvc_resp_iov = vq->iov[vc.out];
+               cmd->tvc_in_iovs = vc.in;
 
                pr_debug("vhost_scsi got command opcode: %#02x, lun: %d\n",
                         cmd->tvc_cdb[0], cmd->tvc_lun);
@@ -1019,14 +1088,12 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
                         " %d\n", cmd, exp_data_len, prot_bytes, data_direction);
 
                if (data_direction != DMA_NONE) {
-                       ret = vhost_scsi_mapal(cmd,
-                                              prot_bytes, &prot_iter,
-                                              exp_data_len, &data_iter);
-                       if (unlikely(ret)) {
+                       if (unlikely(vhost_scsi_mapal(cmd, prot_bytes,
+                                                     &prot_iter, exp_data_len,
+                                                     &data_iter))) {
                                vq_err(vq, "Failed to map iov to sgl\n");
                                vhost_scsi_release_cmd(&cmd->tvc_se_cmd);
-                               vhost_scsi_send_bad_target(vs, vq, head, out);
-                               continue;
+                               goto err;
                        }
                }
                /*
@@ -1034,7 +1101,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
                 * complete the virtio-scsi request in TCM callback context via
                 * vhost_scsi_queue_data_in() and vhost_scsi_queue_status()
                 */
-               cmd->tvc_vq_desc = head;
+               cmd->tvc_vq_desc = vc.head;
                /*
                 * Dispatch cmd descriptor for cmwq execution in process
                 * context provided by vhost_scsi_workqueue.  This also ensures
@@ -1043,6 +1110,166 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
                 */
                INIT_WORK(&cmd->work, vhost_scsi_submission_work);
                queue_work(vhost_scsi_workqueue, &cmd->work);
+               ret = 0;
+err:
+               /*
+                * ENXIO:  No more requests, or read error, wait for next kick
+                * EINVAL: Invalid response buffer, drop the request
+                * EIO:    Respond with bad target
+                * EAGAIN: Pending request
+                */
+               if (ret == -ENXIO)
+                       break;
+               else if (ret == -EIO)
+                       vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out);
+       }
+out:
+       mutex_unlock(&vq->mutex);
+}
+
+static void
+vhost_scsi_send_tmf_reject(struct vhost_scsi *vs,
+                          struct vhost_virtqueue *vq,
+                          struct vhost_scsi_ctx *vc)
+{
+       struct virtio_scsi_ctrl_tmf_resp __user *resp;
+       struct virtio_scsi_ctrl_tmf_resp rsp;
+       int ret;
+
+       pr_debug("%s\n", __func__);
+       memset(&rsp, 0, sizeof(rsp));
+       rsp.response = VIRTIO_SCSI_S_FUNCTION_REJECTED;
+       resp = vq->iov[vc->out].iov_base;
+       ret = __copy_to_user(resp, &rsp, sizeof(rsp));
+       if (!ret)
+               vhost_add_used_and_signal(&vs->dev, vq, vc->head, 0);
+       else
+               pr_err("Faulted on virtio_scsi_ctrl_tmf_resp\n");
+}
+
+static void
+vhost_scsi_send_an_resp(struct vhost_scsi *vs,
+                       struct vhost_virtqueue *vq,
+                       struct vhost_scsi_ctx *vc)
+{
+       struct virtio_scsi_ctrl_an_resp __user *resp;
+       struct virtio_scsi_ctrl_an_resp rsp;
+       int ret;
+
+       pr_debug("%s\n", __func__);
+       memset(&rsp, 0, sizeof(rsp));   /* event_actual = 0 */
+       rsp.response = VIRTIO_SCSI_S_OK;
+       resp = vq->iov[vc->out].iov_base;
+       ret = __copy_to_user(resp, &rsp, sizeof(rsp));
+       if (!ret)
+               vhost_add_used_and_signal(&vs->dev, vq, vc->head, 0);
+       else
+               pr_err("Faulted on virtio_scsi_ctrl_an_resp\n");
+}
+
+static void
+vhost_scsi_ctl_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
+{
+       union {
+               __virtio32 type;
+               struct virtio_scsi_ctrl_an_req an;
+               struct virtio_scsi_ctrl_tmf_req tmf;
+       } v_req;
+       struct vhost_scsi_ctx vc;
+       size_t typ_size;
+       int ret;
+
+       mutex_lock(&vq->mutex);
+       /*
+        * We can handle the vq only after the endpoint is setup by calling the
+        * VHOST_SCSI_SET_ENDPOINT ioctl.
+        */
+       if (!vq->private_data)
+               goto out;
+
+       memset(&vc, 0, sizeof(vc));
+
+       vhost_disable_notify(&vs->dev, vq);
+
+       for (;;) {
+               ret = vhost_scsi_get_desc(vs, vq, &vc);
+               if (ret)
+                       goto err;
+
+               /*
+                * Get the request type first in order to setup
+                * other parameters dependent on the type.
+                */
+               vc.req = &v_req.type;
+               typ_size = sizeof(v_req.type);
+
+               if (unlikely(!copy_from_iter_full(vc.req, typ_size,
+                                                 &vc.out_iter))) {
+                       vq_err(vq, "Faulted on copy_from_iter tmf type\n");
+                       /*
+                        * The size of the response buffer depends on the
+                        * request type and must be validated against it.
+                        * Since the request type is not known, don't send
+                        * a response.
+                        */
+                       continue;
+               }
+
+               switch (v_req.type) {
+               case VIRTIO_SCSI_T_TMF:
+                       vc.req = &v_req.tmf;
+                       vc.req_size = sizeof(struct virtio_scsi_ctrl_tmf_req);
+                       vc.rsp_size = sizeof(struct virtio_scsi_ctrl_tmf_resp);
+                       vc.lunp = &v_req.tmf.lun[0];
+                       vc.target = &v_req.tmf.lun[1];
+                       break;
+               case VIRTIO_SCSI_T_AN_QUERY:
+               case VIRTIO_SCSI_T_AN_SUBSCRIBE:
+                       vc.req = &v_req.an;
+                       vc.req_size = sizeof(struct virtio_scsi_ctrl_an_req);
+                       vc.rsp_size = sizeof(struct virtio_scsi_ctrl_an_resp);
+                       vc.lunp = &v_req.an.lun[0];
+                       vc.target = NULL;
+                       break;
+               default:
+                       vq_err(vq, "Unknown control request %d", v_req.type);
+                       continue;
+               }
+
+               /*
+                * Validate the size of request and response buffers.
+                * Check for a sane response buffer so we can report
+                * early errors back to the guest.
+                */
+               ret = vhost_scsi_chk_size(vq, &vc);
+               if (ret)
+                       goto err;
+
+               /*
+                * Get the rest of the request now that its size is known.
+                */
+               vc.req += typ_size;
+               vc.req_size -= typ_size;
+
+               ret = vhost_scsi_get_req(vq, &vc, NULL);
+               if (ret)
+                       goto err;
+
+               if (v_req.type == VIRTIO_SCSI_T_TMF)
+                       vhost_scsi_send_tmf_reject(vs, vq, &vc);
+               else
+                       vhost_scsi_send_an_resp(vs, vq, &vc);
+err:
+               /*
+                * ENXIO:  No more requests, or read error, wait for next kick
+                * EINVAL: Invalid response buffer, drop the request
+                * EIO:    Respond with bad target
+                * EAGAIN: Pending request
+                */
+               if (ret == -ENXIO)
+                       break;
+               else if (ret == -EIO)
+                       vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out);
        }
 out:
        mutex_unlock(&vq->mutex);
@@ -1050,7 +1277,12 @@ out:
 
 static void vhost_scsi_ctl_handle_kick(struct vhost_work *work)
 {
+       struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+                                               poll.work);
+       struct vhost_scsi *vs = container_of(vq->dev, struct vhost_scsi, dev);
+
        pr_debug("%s: The handling func for control queue.\n", __func__);
+       vhost_scsi_ctl_handle_vq(vs, vq);
 }
 
 static void
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index d1c1f62..728ecd1 100644 (file)
 #define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256
 #define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80
 
+#define VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG (__GFP_NORETRY | __GFP_NOWARN | \
+                                            __GFP_NOMEMALLOC)
+/* The order of free page blocks to report to host */
+#define VIRTIO_BALLOON_FREE_PAGE_ORDER (MAX_ORDER - 1)
+/* The size of a free page block in bytes */
+#define VIRTIO_BALLOON_FREE_PAGE_SIZE \
+       (1 << (VIRTIO_BALLOON_FREE_PAGE_ORDER + PAGE_SHIFT))
+
 #ifdef CONFIG_BALLOON_COMPACTION
 static struct vfsmount *balloon_mnt;
 #endif
 
+enum virtio_balloon_vq {
+       VIRTIO_BALLOON_VQ_INFLATE,
+       VIRTIO_BALLOON_VQ_DEFLATE,
+       VIRTIO_BALLOON_VQ_STATS,
+       VIRTIO_BALLOON_VQ_FREE_PAGE,
+       VIRTIO_BALLOON_VQ_MAX
+};
+
 struct virtio_balloon {
        struct virtio_device *vdev;
-       struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
+       struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
+
+       /* Balloon's own wq for cpu-intensive work items */
+       struct workqueue_struct *balloon_wq;
+       /* The free page reporting work item submitted to the balloon wq */
+       struct work_struct report_free_page_work;
 
        /* The balloon servicing is delegated to a freezable workqueue. */
        struct work_struct update_balloon_stats_work;
@@ -57,6 +78,18 @@ struct virtio_balloon {
        spinlock_t stop_update_lock;
        bool stop_update;
 
+       /* The list of allocated free pages, waiting to be given back to mm */
+       struct list_head free_page_list;
+       spinlock_t free_page_list_lock;
+       /* The number of free page blocks on the above list */
+       unsigned long num_free_page_blocks;
+       /* The cmd id received from host */
+       u32 cmd_id_received;
+       /* The cmd id that is actively in use */
+       __virtio32 cmd_id_active;
+       /* Buffer to store the stop sign */
+       __virtio32 cmd_id_stop;
+
        /* Waiting for host to ack the pages we released. */
        wait_queue_head_t acked;
 
@@ -320,17 +353,6 @@ static void stats_handle_request(struct virtio_balloon *vb)
        virtqueue_kick(vq);
 }
 
-static void virtballoon_changed(struct virtio_device *vdev)
-{
-       struct virtio_balloon *vb = vdev->priv;
-       unsigned long flags;
-
-       spin_lock_irqsave(&vb->stop_update_lock, flags);
-       if (!vb->stop_update)
-               queue_work(system_freezable_wq, &vb->update_balloon_size_work);
-       spin_unlock_irqrestore(&vb->stop_update_lock, flags);
-}
-
 static inline s64 towards_target(struct virtio_balloon *vb)
 {
        s64 target;
@@ -347,6 +369,60 @@ static inline s64 towards_target(struct virtio_balloon *vb)
        return target - vb->num_pages;
 }
 
+/* Gives back @num_to_return blocks of free pages to mm. */
+static unsigned long return_free_pages_to_mm(struct virtio_balloon *vb,
+                                            unsigned long num_to_return)
+{
+       struct page *page;
+       unsigned long num_returned;
+
+       spin_lock_irq(&vb->free_page_list_lock);
+       for (num_returned = 0; num_returned < num_to_return; num_returned++) {
+               page = balloon_page_pop(&vb->free_page_list);
+               if (!page)
+                       break;
+               free_pages((unsigned long)page_address(page),
+                          VIRTIO_BALLOON_FREE_PAGE_ORDER);
+       }
+       vb->num_free_page_blocks -= num_returned;
+       spin_unlock_irq(&vb->free_page_list_lock);
+
+       return num_returned;
+}
+
+static void virtballoon_changed(struct virtio_device *vdev)
+{
+       struct virtio_balloon *vb = vdev->priv;
+       unsigned long flags;
+       s64 diff = towards_target(vb);
+
+       if (diff) {
+               spin_lock_irqsave(&vb->stop_update_lock, flags);
+               if (!vb->stop_update)
+                       queue_work(system_freezable_wq,
+                                  &vb->update_balloon_size_work);
+               spin_unlock_irqrestore(&vb->stop_update_lock, flags);
+       }
+
+       if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+               virtio_cread(vdev, struct virtio_balloon_config,
+                            free_page_report_cmd_id, &vb->cmd_id_received);
+               if (vb->cmd_id_received == VIRTIO_BALLOON_CMD_ID_DONE) {
+                       /* Pass ULONG_MAX to give back all the free pages */
+                       return_free_pages_to_mm(vb, ULONG_MAX);
+               } else if (vb->cmd_id_received != VIRTIO_BALLOON_CMD_ID_STOP &&
+                          vb->cmd_id_received !=
+                          virtio32_to_cpu(vdev, vb->cmd_id_active)) {
+                       spin_lock_irqsave(&vb->stop_update_lock, flags);
+                       if (!vb->stop_update) {
+                               queue_work(vb->balloon_wq,
+                                          &vb->report_free_page_work);
+                       }
+                       spin_unlock_irqrestore(&vb->stop_update_lock, flags);
+               }
+       }
+}
+
 static void update_balloon_size(struct virtio_balloon *vb)
 {
        u32 actual = vb->num_pages;
@@ -389,26 +465,44 @@ static void update_balloon_size_func(struct work_struct *work)
 
 static int init_vqs(struct virtio_balloon *vb)
 {
-       struct virtqueue *vqs[3];
-       vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
-       static const char * const names[] = { "inflate", "deflate", "stats" };
-       int err, nvqs;
+       struct virtqueue *vqs[VIRTIO_BALLOON_VQ_MAX];
+       vq_callback_t *callbacks[VIRTIO_BALLOON_VQ_MAX];
+       const char *names[VIRTIO_BALLOON_VQ_MAX];
+       int err;
 
        /*
-        * We expect two virtqueues: inflate and deflate, and
-        * optionally stat.
+        * Inflateq and deflateq are used unconditionally. The names[]
+        * will be NULL if the related feature is not enabled, which will
+        * cause no allocation for the corresponding virtqueue in find_vqs.
         */
-       nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
-       err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL);
+       callbacks[VIRTIO_BALLOON_VQ_INFLATE] = balloon_ack;
+       names[VIRTIO_BALLOON_VQ_INFLATE] = "inflate";
+       callbacks[VIRTIO_BALLOON_VQ_DEFLATE] = balloon_ack;
+       names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
+       names[VIRTIO_BALLOON_VQ_STATS] = NULL;
+       names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
+
+       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
+               names[VIRTIO_BALLOON_VQ_STATS] = "stats";
+               callbacks[VIRTIO_BALLOON_VQ_STATS] = stats_request;
+       }
+
+       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+               names[VIRTIO_BALLOON_VQ_FREE_PAGE] = "free_page_vq";
+               callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
+       }
+
+       err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
+                                        vqs, callbacks, names, NULL, NULL);
        if (err)
                return err;
 
-       vb->inflate_vq = vqs[0];
-       vb->deflate_vq = vqs[1];
+       vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
+       vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
        if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
                struct scatterlist sg;
                unsigned int num_stats;
-               vb->stats_vq = vqs[2];
+               vb->stats_vq = vqs[VIRTIO_BALLOON_VQ_STATS];
 
                /*
                 * Prime this virtqueue with one buffer so the hypervisor can
@@ -426,9 +520,145 @@ static int init_vqs(struct virtio_balloon *vb)
                }
                virtqueue_kick(vb->stats_vq);
        }
+
+       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
+               vb->free_page_vq = vqs[VIRTIO_BALLOON_VQ_FREE_PAGE];
+
+       return 0;
+}
+
+/*
+ * Open a reporting round: latch the most recently received cmd id into
+ * vb->cmd_id_active (converted with cpu_to_virtio32()) and queue it on the
+ * free page vq as an outbuf.
+ *
+ * Returns 0 on success, or the error from virtqueue_add_outbuf().
+ */
+static int send_cmd_id_start(struct virtio_balloon *vb)
+{
+       struct virtqueue *vq = vb->free_page_vq;
+       struct scatterlist sg;
+       int ret, len;
+
+       /* Drain all the used buffers from the vq first. */
+       while (virtqueue_get_buf(vq, &len))
+               continue;
+
+       vb->cmd_id_active = cpu_to_virtio32(vb->vdev, vb->cmd_id_received);
+       sg_init_one(&sg, &vb->cmd_id_active, sizeof(vb->cmd_id_active));
+
+       ret = virtqueue_add_outbuf(vq, &sg, 1, &vb->cmd_id_active, GFP_KERNEL);
+       if (ret)
+               return ret;
+
+       /* Only notify the host once the buffer is actually queued. */
+       virtqueue_kick(vq);
+       return 0;
+}
+
+/*
+ * Close a reporting round by queueing vb->cmd_id_stop on the free page vq
+ * as an outbuf.
+ *
+ * Returns 0 on success, or the error from virtqueue_add_outbuf().
+ */
+static int send_cmd_id_stop(struct virtio_balloon *vb)
+{
+       struct virtqueue *vq = vb->free_page_vq;
+       struct scatterlist sg;
+       int ret, len;
+
+       /* Drain all the used buffers from the vq first. */
+       while (virtqueue_get_buf(vq, &len))
+               continue;
+
+       sg_init_one(&sg, &vb->cmd_id_stop, sizeof(vb->cmd_id_stop));
+
+       ret = virtqueue_add_outbuf(vq, &sg, 1, &vb->cmd_id_stop, GFP_KERNEL);
+       if (ret)
+               return ret;
+
+       /* Only notify the host once the buffer is actually queued. */
+       virtqueue_kick(vq);
+       return 0;
+}
+
+/*
+ * Allocate one free page block and offer it to the host as an inbuf.
+ *
+ * Returns 0 when the block was queued (or dropped because the vq had no
+ * spare entry), -EINTR when the allocation returns NULL, or the error from
+ * virtqueue_add_inbuf().
+ */
+static int get_free_page_and_send(struct virtio_balloon *vb)
+{
+       struct virtqueue *vq = vb->free_page_vq;
+       struct scatterlist sg;
+       struct page *page;
+       void *buf;
+       int ret, len;
+
+       /* Drain all the used buffers from the vq first. */
+       while (virtqueue_get_buf(vq, &len))
+               continue;
+
+       /*
+        * A NULL allocation indicates that all the possible free pages
+        * have been taken, so return -EINTR to make the caller stop.
+        */
+       page = alloc_pages(VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG,
+                          VIRTIO_BALLOON_FREE_PAGE_ORDER);
+       if (!page)
+               return -EINTR;
+
+       buf = page_address(page);
+       sg_init_one(&sg, buf, VIRTIO_BALLOON_FREE_PAGE_SIZE);
+
+       /*
+        * One entry is always reserved for the cmd id. If that is all that
+        * is left, the block cannot be queued, so just free it.
+        */
+       if (vq->num_free <= 1) {
+               free_pages((unsigned long)buf, VIRTIO_BALLOON_FREE_PAGE_ORDER);
+               return 0;
+       }
+
+       ret = virtqueue_add_inbuf(vq, &sg, 1, buf, GFP_KERNEL);
+       if (unlikely(ret)) {
+               free_pages((unsigned long)buf, VIRTIO_BALLOON_FREE_PAGE_ORDER);
+               return ret;
+       }
+       virtqueue_kick(vq);
+
+       /* Track the queued block on free_page_list, under its lock. */
+       spin_lock_irq(&vb->free_page_list_lock);
+       balloon_page_push(&vb->free_page_list, page);
+       vb->num_free_page_blocks++;
+       spin_unlock_irq(&vb->free_page_list_lock);
+
+       return 0;
+}
+
+/*
+ * Keep allocating free page blocks and sending them to the host until the
+ * active cmd id no longer matches the one received, or until
+ * get_free_page_and_send() reports -EINTR.
+ *
+ * Returns 0 when reporting stopped for either of those reasons, or the
+ * first other error returned by get_free_page_and_send().
+ */
+static int send_free_pages(struct virtio_balloon *vb)
+{
+       int err;
+       u32 cmd_id_active;
+
+       while (1) {
+               /*
+                * If a stop id or a new cmd id was just received from host,
+                * stop the reporting.
+                */
+               cmd_id_active = virtio32_to_cpu(vb->vdev, vb->cmd_id_active);
+               if (cmd_id_active != vb->cmd_id_received)
+                       break;
+
+               /*
+                * The free page blocks are allocated and sent to host one by
+                * one.
+                */
+               err = get_free_page_and_send(vb);
+               /* -EINTR means no block could be allocated: a normal end. */
+               if (err == -EINTR)
+                       break;
+               else if (unlikely(err))
+                       return err;
+       }
+
        return 0;
 }
 
+/*
+ * Work function for report_free_page_work. Runs one reporting round in
+ * three steps: start id, free page blocks, stop id. A failing step is
+ * logged with dev_err() and the following steps are still attempted.
+ */
+static void report_free_page_func(struct work_struct *work)
+{
+       struct virtio_balloon *vb = container_of(work, struct virtio_balloon,
+                                                report_free_page_work);
+       struct device *dev = &vb->vdev->dev;
+       int ret;
+
+       /* Step 1: send the received cmd id to host with an outbuf. */
+       ret = send_cmd_id_start(vb);
+       if (unlikely(ret))
+               dev_err(dev, "Failed to send a start id, err = %d\n", ret);
+
+       /* Step 2: send the free page blocks themselves. */
+       ret = send_free_pages(vb);
+       if (unlikely(ret))
+               dev_err(dev, "Failed to send a free page, err = %d\n", ret);
+
+       /* Step 3: send a stop id to host with an outbuf. */
+       ret = send_cmd_id_stop(vb);
+       if (unlikely(ret))
+               dev_err(dev, "Failed to send a stop id, err = %d\n", ret);
+}
+
 #ifdef CONFIG_BALLOON_COMPACTION
 /*
  * virtballoon_migratepage - perform the balloon page migration on behalf of
@@ -512,14 +742,23 @@ static struct file_system_type balloon_fs = {
 
 #endif /* CONFIG_BALLOON_COMPACTION */
 
-static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker,
-                                                 struct shrink_control *sc)
+/*
+ * Give free page blocks back through return_free_pages_to_mm().
+ *
+ * @pages_to_free is rounded up to a whole number of blocks, each block
+ * being 1 << VIRTIO_BALLOON_FREE_PAGE_ORDER pages.
+ *
+ * Returns the block count reported by return_free_pages_to_mm(), converted
+ * back to pages.
+ */
+static unsigned long shrink_free_pages(struct virtio_balloon *vb,
+                                      unsigned long pages_to_free)
 {
-       unsigned long pages_to_free, pages_freed = 0;
-       struct virtio_balloon *vb = container_of(shrinker,
-                                       struct virtio_balloon, shrinker);
+       unsigned long blocks_to_free, blocks_freed;
 
-       pages_to_free = sc->nr_to_scan * VIRTIO_BALLOON_PAGES_PER_PAGE;
+       /* Blocks are freed whole, so round the request up to a block. */
+       pages_to_free = round_up(pages_to_free,
+                                1 << VIRTIO_BALLOON_FREE_PAGE_ORDER);
+       blocks_to_free = pages_to_free >> VIRTIO_BALLOON_FREE_PAGE_ORDER;
+       blocks_freed = return_free_pages_to_mm(vb, blocks_to_free);
+
+       return blocks_freed << VIRTIO_BALLOON_FREE_PAGE_ORDER;
+}
+
+/*
+ * Deflate the balloon by repeatedly calling leak_balloon().
+ *
+ * @pages_to_free is handed to leak_balloon() as the amount still wanted.
+ * Each return value of leak_balloon() is divided by
+ * VIRTIO_BALLOON_PAGES_PER_PAGE before being added to the result.
+ *
+ * Returns the accumulated number of pages freed.
+ */
+static unsigned long shrink_balloon_pages(struct virtio_balloon *vb,
+                                         unsigned long pages_to_free)
+{
+       unsigned long pages_freed = 0;
 
        /*
         * One invocation of leak_balloon can deflate at most
@@ -527,12 +766,33 @@ static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker,
         * multiple times to deflate pages till reaching pages_to_free.
         */
        while (vb->num_pages && pages_to_free) {
-               pages_to_free -= pages_freed;
-               pages_freed += leak_balloon(vb, pages_to_free);
+               unsigned long leaked = leak_balloon(vb, pages_to_free);
+
+               pages_freed += leaked / VIRTIO_BALLOON_PAGES_PER_PAGE;
+               /*
+                * Subtract only what this round leaked, not the running
+                * total: subtracting the total counts earlier rounds again
+                * and can wrap the unsigned remainder. Clamp for safety.
+                * NOTE(review): assumes leak_balloon() returns its count in
+                * the same unit as its second argument - confirm.
+                */
+               pages_to_free -= leaked < pages_to_free ?
+                                       leaked : pages_to_free;
        }
        update_balloon_size(vb);
 
-       return pages_freed / VIRTIO_BALLOON_PAGES_PER_PAGE;
+       return pages_freed;
+}
+
+/*
+ * Shrinker scan callback. Free page blocks are released first (only when
+ * VIRTIO_BALLOON_F_FREE_PAGE_HINT is negotiated); whatever is still
+ * missing from the target is then taken from the balloon.
+ *
+ * Returns the sum of what the two helpers report as freed.
+ */
+static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker,
+                                                 struct shrink_control *sc)
+{
+       struct virtio_balloon *vb = container_of(shrinker,
+                                       struct virtio_balloon, shrinker);
+       unsigned long target = sc->nr_to_scan * VIRTIO_BALLOON_PAGES_PER_PAGE;
+       unsigned long reclaimed = 0;
+
+       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
+               reclaimed = shrink_free_pages(vb, target);
+
+       /* Fall back to the balloon only for the part still outstanding. */
+       if (reclaimed < target)
+               reclaimed += shrink_balloon_pages(vb, target - reclaimed);
+
+       return reclaimed;
 }
 
+/*
+ * Shrinker count callback: the balloon's num_pages divided by
+ * VIRTIO_BALLOON_PAGES_PER_PAGE, plus the pages held in the free page
+ * blocks counted by num_free_page_blocks.
+ */
 static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker,
@@ -540,8 +800,12 @@ static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker,
 {
        struct virtio_balloon *vb = container_of(shrinker,
                                        struct virtio_balloon, shrinker);
+       unsigned long count;
 
-       return vb->num_pages / VIRTIO_BALLOON_PAGES_PER_PAGE;
+       count = vb->num_pages / VIRTIO_BALLOON_PAGES_PER_PAGE;
+       /*
+        * Every block is an order-VIRTIO_BALLOON_FREE_PAGE_ORDER
+        * allocation, so blocks become pages by shifting left. Shifting
+        * right would divide and under-report the count, often as 0.
+        */
+       count += (unsigned long)vb->num_free_page_blocks <<
+                                       VIRTIO_BALLOON_FREE_PAGE_ORDER;
+
+       return count;
 }
 
 static void virtio_balloon_unregister_shrinker(struct virtio_balloon *vb)
@@ -561,6 +825,7 @@ static int virtio_balloon_register_shrinker(struct virtio_balloon *vb)
 static int virtballoon_probe(struct virtio_device *vdev)
 {
        struct virtio_balloon *vb;
+       __u32 poison_val;
        int err;
 
        if (!vdev->config->get) {
@@ -604,6 +869,36 @@ static int virtballoon_probe(struct virtio_device *vdev)
        }
        vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
 #endif
+       if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+               /*
+                * There is always one entry reserved for cmd id, so the ring
+                * size needs to be at least two to report free page hints.
+                */
+               if (virtqueue_get_vring_size(vb->free_page_vq) < 2) {
+                       err = -ENOSPC;
+                       goto out_del_vqs;
+               }
+               vb->balloon_wq = alloc_workqueue("balloon-wq",
+                                       WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0);
+               if (!vb->balloon_wq) {
+                       err = -ENOMEM;
+                       goto out_del_vqs;
+               }
+               INIT_WORK(&vb->report_free_page_work, report_free_page_func);
+               vb->cmd_id_received = VIRTIO_BALLOON_CMD_ID_STOP;
+               vb->cmd_id_active = cpu_to_virtio32(vb->vdev,
+                                                 VIRTIO_BALLOON_CMD_ID_STOP);
+               vb->cmd_id_stop = cpu_to_virtio32(vb->vdev,
+                                                 VIRTIO_BALLOON_CMD_ID_STOP);
+               vb->num_free_page_blocks = 0;
+               spin_lock_init(&vb->free_page_list_lock);
+               INIT_LIST_HEAD(&vb->free_page_list);
+               if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) {
+                       memset(&poison_val, PAGE_POISON, sizeof(poison_val));
+                       virtio_cwrite(vb->vdev, struct virtio_balloon_config,
+                                     poison_val, &poison_val);
+               }
+       }
        /*
         * We continue to use VIRTIO_BALLOON_F_DEFLATE_ON_OOM to decide if a
         * shrinker needs to be registered to relieve memory pressure.
@@ -611,7 +906,7 @@ static int virtballoon_probe(struct virtio_device *vdev)
        if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) {
                err = virtio_balloon_register_shrinker(vb);
                if (err)
-                       goto out_del_vqs;
+                       goto out_del_balloon_wq;
        }
        virtio_device_ready(vdev);
 
@@ -619,6 +914,9 @@ static int virtballoon_probe(struct virtio_device *vdev)
                virtballoon_changed(vdev);
        return 0;
 
+out_del_balloon_wq:
+       if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
+               destroy_workqueue(vb->balloon_wq);
 out_del_vqs:
        vdev->config->del_vqs(vdev);
 out_free_vb:
@@ -652,6 +950,11 @@ static void virtballoon_remove(struct virtio_device *vdev)
        cancel_work_sync(&vb->update_balloon_size_work);
        cancel_work_sync(&vb->update_balloon_stats_work);
 
+       if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+               cancel_work_sync(&vb->report_free_page_work);
+               destroy_workqueue(vb->balloon_wq);
+       }
+
        remove_common(vb);
 #ifdef CONFIG_BALLOON_COMPACTION
        if (vb->vb_dev_info.inode)
@@ -695,6 +998,9 @@ static int virtballoon_restore(struct virtio_device *vdev)
 
+/*
+ * Clears VIRTIO_BALLOON_F_PAGE_POISON when page poisoning is not enabled,
+ * and unconditionally clears VIRTIO_F_IOMMU_PLATFORM. Always returns 0.
+ */
 static int virtballoon_validate(struct virtio_device *vdev)
 {
+       /* Without page poisoning in use there is no poison value to report. */
+       if (!page_poisoning_enabled())
+               __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_POISON);
+
        __virtio_clear_bit(vdev, VIRTIO_F_IOMMU_PLATFORM);
        return 0;
 }
@@ -703,6 +1009,8 @@ static unsigned int features[] = {
        VIRTIO_BALLOON_F_MUST_TELL_HOST,
        VIRTIO_BALLOON_F_STATS_VQ,
        VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
+       VIRTIO_BALLOON_F_FREE_PAGE_HINT,
+       VIRTIO_BALLOON_F_PAGE_POISON,
 };
 
 static struct virtio_driver virtio_balloon_driver = {
index 13b8cb5..a1966cd 100644 (file)
 #define VIRTIO_BALLOON_F_MUST_TELL_HOST        0 /* Tell before reclaiming pages */
 #define VIRTIO_BALLOON_F_STATS_VQ      1 /* Memory Stats virtqueue */
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM        2 /* Deflate balloon on OOM */
+#define VIRTIO_BALLOON_F_FREE_PAGE_HINT        3 /* VQ to report free pages */
+#define VIRTIO_BALLOON_F_PAGE_POISON   4 /* Guest is using page poisoning */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
 
+#define VIRTIO_BALLOON_CMD_ID_STOP     0
+#define VIRTIO_BALLOON_CMD_ID_DONE     1
+/*
+ * Balloon device configuration fields. The driver writes poison_val with
+ * virtio_cwrite() when VIRTIO_BALLOON_F_PAGE_POISON is negotiated.
+ */
 struct virtio_balloon_config {
        /* Number of pages host wants Guest to give up. */
        __u32 num_pages;
        /* Number of pages we've actually got in balloon. */
        __u32 actual;
+       /* Free page report command id, readonly by guest */
+       __u32 free_page_report_cmd_id;
+       /* Stores PAGE_POISON if page poisoning is in use */
+       __u32 poison_val;
 };
 
 #define VIRTIO_BALLOON_S_SWAP_IN  0   /* Amount of memory swapped in */
index 108fecc..208481d 100644 (file)
@@ -20,6 +20,7 @@ CONFIG_PARAVIRT=y
 CONFIG_KVM_GUEST=y
 CONFIG_S390_GUEST=y
 CONFIG_VIRTIO=y
+CONFIG_VIRTIO_MENU=y
 CONFIG_VIRTIO_PCI=y
 CONFIG_VIRTIO_BLK=y
 CONFIG_VIRTIO_CONSOLE=y
index f7e2a67..f0c15e9 100644 (file)
@@ -17,6 +17,11 @@ static int __init early_page_poison_param(char *buf)
 }
 early_param("page_poison", early_page_poison_param);
 
+/**
+ * page_poisoning_enabled - check if page poisoning is enabled
+ *
+ * Return true if page poisoning is enabled, or false if not.
+ */
 bool page_poisoning_enabled(void)
 {
        /*
@@ -29,6 +34,7 @@ bool page_poisoning_enabled(void)
                (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
                debug_pagealloc_enabled()));
 }
+EXPORT_SYMBOL_GPL(page_poisoning_enabled);
 
 static void poison_page(struct page *page)
 {