whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 4f9020ffde71ddb92bc2f65ce0b00232bc88c590
parent 736706bee3298208343a76096370e4f6a5c55915
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Mon,  4 Mar 2019 13:24:27 -0800

Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

Pull vfs fixes from Al Viro:
 "Assorted fixes that sat in -next for a while, all over the place"

* 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
  aio: Fix locking in aio_poll()
  exec: Fix mem leak in kernel_read_file
  copy_mount_string: Limit string length to PATH_MAX
  cgroup: saner refcounting for cgroup_root
  fix cgroup_do_mount() handling of failure exits

Diffstat:
Mfs/aio.c | 12+++++++++---
Mfs/exec.c | 2+-
Mfs/kernfs/mount.c | 8++++++--
Mfs/namespace.c | 2+-
Mkernel/cgroup/cgroup-internal.h | 2+-
Mkernel/cgroup/cgroup-v1.c | 58+++++++++++++---------------------------------------------
Mkernel/cgroup/cgroup.c | 25+++++++++++++------------
7 files changed, 44 insertions(+), 65 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c @@ -1666,6 +1666,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct poll_iocb *req = container_of(wait, struct poll_iocb, wait); struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); __poll_t mask = key_to_poll(key); + unsigned long flags; req->woken = true; @@ -1674,10 +1675,15 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (!(mask & req->events)) return 0; - /* try to complete the iocb inline if we can: */ - if (spin_trylock(&iocb->ki_ctx->ctx_lock)) { + /* + * Try to complete the iocb inline if we can. Use + * irqsave/irqrestore because not all filesystems (e.g. fuse) + * call this function with IRQs disabled and because IRQs + * have to be disabled before ctx_lock is obtained. + */ + if (spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { list_del(&iocb->ki_list); - spin_unlock(&iocb->ki_ctx->ctx_lock); + spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags); list_del_init(&req->wait.entry); aio_poll_complete(iocb, mask); diff --git a/fs/exec.c b/fs/exec.c @@ -932,7 +932,7 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size, bytes = kernel_read(file, *buf + pos, i_size - pos, &pos); if (bytes < 0) { ret = bytes; - goto out; + goto out_free; } if (bytes == 0) diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c @@ -196,8 +196,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, return dentry; knparent = find_next_ancestor(kn, NULL); - if (WARN_ON(!knparent)) + if (WARN_ON(!knparent)) { + dput(dentry); return ERR_PTR(-EINVAL); + } do { struct dentry *dtmp; @@ -206,8 +208,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, if (kn == knparent) return dentry; kntmp = find_next_ancestor(kn, knparent); - if (WARN_ON(!kntmp)) + if (WARN_ON(!kntmp)) { + dput(dentry); return ERR_PTR(-EINVAL); + } dtmp = lookup_one_len_unlocked(kntmp->name, dentry, strlen(kntmp->name)); dput(dentry); diff --git a/fs/namespace.c b/fs/namespace.c @@ -2744,7 +2744,7 @@ void *copy_mount_options(const void __user * data) char *copy_mount_string(const void __user *data) { - return data ? strndup_user(data, PAGE_SIZE) : NULL; + return data ? strndup_user(data, PATH_MAX) : NULL; } /* diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h @@ -198,7 +198,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, void cgroup_free_root(struct cgroup_root *root); void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags); +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, struct cgroup_root *root, unsigned long magic, diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c @@ -1116,13 +1116,11 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, void *data, unsigned long magic, struct cgroup_namespace *ns) { - struct super_block *pinned_sb = NULL; struct cgroup_sb_opts opts; struct cgroup_root *root; struct cgroup_subsys *ss; struct dentry *dentry; int i, ret; - bool new_root = false; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); @@ -1184,29 +1182,6 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, if (root->flags ^ opts.flags) pr_warn("new mount options do not match the existing superblock, will be ignored\n"); - /* - * We want to reuse @root whose lifetime is governed by its - * ->cgrp. Let's check whether @root is alive and keep it - * that way. As cgroup_kill_sb() can happen anytime, we - * want to block it by pinning the sb so that @root doesn't - * get killed before mount is complete. - * - * With the sb pinned, tryget_live can reliably indicate - * whether @root can be reused. If it's being killed, - * drain it. We can use wait_queue for the wait but this - * path is super cold. Let's just sleep a bit and retry. - */ - pinned_sb = kernfs_pin_sb(root->kf_root, NULL); - if (IS_ERR(pinned_sb) || - !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - if (!IS_ERR_OR_NULL(pinned_sb)) - deactivate_super(pinned_sb); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - ret = 0; goto out_unlock; } @@ -1232,15 +1207,20 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, ret = -ENOMEM; goto out_unlock; } - new_root = true; init_cgroup_root(root, &opts); - ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD); + ret = cgroup_setup_root(root, opts.subsys_mask); if (ret) cgroup_free_root(root); out_unlock: + if (!ret && !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + msleep(10); + ret = restart_syscall(); + goto out_free; + } mutex_unlock(&cgroup_mutex); out_free: kfree(opts.release_agent); @@ -1252,25 +1232,13 @@ out_free: dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, CGROUP_SUPER_MAGIC, ns); - /* - * There's a race window after we release cgroup_mutex and before - * allocating a superblock. Make sure a concurrent process won't - * be able to re-use the root during this window by delaying the - * initialization of root refcnt. - */ - if (new_root) { - mutex_lock(&cgroup_mutex); - percpu_ref_reinit(&root->cgrp.self.refcnt); - mutex_unlock(&cgroup_mutex); + if (!IS_ERR(dentry) && percpu_ref_is_dying(&root->cgrp.self.refcnt)) { + struct super_block *sb = dentry->d_sb; + dput(dentry); + deactivate_locked_super(sb); + msleep(10); + dentry = ERR_PTR(restart_syscall()); } - - /* - * If @pinned_sb, we're reusing an existing root and holding an - * extra ref on its sb. Mount is complete. Put the extra ref. - */ - if (pinned_sb) - deactivate_super(pinned_sb); - return dentry; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c @@ -1927,7 +1927,7 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -1944,7 +1944,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) root_cgrp->ancestor_ids[0] = ret; ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, - ref_flags, GFP_KERNEL); + 0, GFP_KERNEL); if (ret) goto out; @@ -2033,7 +2033,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, struct cgroup_namespace *ns) { struct dentry *dentry; - bool new_sb; + bool new_sb = false; dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb); @@ -2043,6 +2043,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, */ if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { struct dentry *nsdentry; + struct super_block *sb = dentry->d_sb; struct cgroup *cgrp; mutex_lock(&cgroup_mutex); @@ -2053,12 +2054,14 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); - nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); + nsdentry = kernfs_node_dentry(cgrp->kn, sb); dput(dentry); + if (IS_ERR(nsdentry)) + deactivate_locked_super(sb); dentry = nsdentry; } - if (IS_ERR(dentry) || !new_sb) + if (!new_sb) cgroup_put(&root->cgrp); return dentry; @@ -2118,18 +2121,16 @@ static void cgroup_kill_sb(struct super_block *sb) struct cgroup_root *root = cgroup_root_from_kf(kf_root); /* - * If @root doesn't have any mounts or children, start killing it. + * If @root doesn't have any children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). * cgroup_mount() may wait for @root's release. * * And don't kill the default root. */ - if (!list_empty(&root->cgrp.self.children) || - root == &cgrp_dfl_root) - cgroup_put(&root->cgrp); - else + if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root && + !percpu_ref_is_dying(&root->cgrp.self.refcnt)) percpu_ref_kill(&root->cgrp.self.refcnt); - + cgroup_put(&root->cgrp); kernfs_kill_sb(sb); } @@ -5399,7 +5400,7 @@ int __init cgroup_init(void) hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); - BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0)); + BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); mutex_unlock(&cgroup_mutex);