whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 48751b562bce96c4284885571da1bdda7673f38e
parent 4d8d9f540b780f7a3688a72275aecd8fd99c99e5
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Thu, 13 Sep 2018 19:21:40 -1000

Merge tag 'ovl-fixes-4.19-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs

Pull overlayfs fixes from Miklos Szeredi:
 "This fixes a regression in the recent file stacking update, reported
  and fixed by Amir Goldstein. The fix is fairly trivial, but involves
  adding a fadvise() f_op and the associated churn in the vfs. As
  discussed on -fsdevel, there are other possible uses for this method,
  than allowing proper stacking for overlays.

  And there's one other fix for a syzkaller detected oops"

* tag 'ovl-fixes-4.19-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs:
  ovl: fix oopses in ovl_fill_super() failure paths
  ovl: add ovl_fadvise()
  vfs: implement readahead(2) using POSIX_FADV_WILLNEED
  vfs: add the fadvise() file operation
  Documentation/filesystems: update documentation of file_operations
  ovl: fix GPF in swapfile_activate of file from overlayfs over xfs
  ovl: respect FIEMAP_FLAG_SYNC flag

Diffstat:
MDocumentation/filesystems/vfs.txt | 21+++++++++++++++++++--
Mfs/overlayfs/file.c | 23++++++++++++++++++++---
Mfs/overlayfs/inode.c | 10++++++++++
Mfs/overlayfs/super.c | 26++++++++++++++------------
Minclude/linux/fs.h | 5+++++
Mmm/Makefile | 3+--
Mmm/fadvise.c | 81+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Mmm/readahead.c | 45+++++++++++++++++----------------------------
8 files changed, 134 insertions(+), 80 deletions(-)

diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt @@ -848,7 +848,7 @@ struct file_operations ---------------------- This describes how the VFS can manipulate an open file. As of kernel -4.1, the following members are defined: +4.18, the following members are defined: struct file_operations { struct module *owner; @@ -858,11 +858,11 @@ struct file_operations { ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iterate) (struct file *, struct dir_context *); + int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); - int (*mremap)(struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); @@ -882,6 +882,10 @@ struct file_operations { #ifndef CONFIG_MMU unsigned (*mmap_capabilities)(struct file *); #endif + ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); + int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, u64); + int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, u64); + int (*fadvise)(struct file *, loff_t, loff_t, int); }; Again, all methods are called without any locks being held, unless @@ -899,6 +903,9 @@ otherwise noted. iterate: called when the VFS needs to read the directory contents + iterate_shared: called when the VFS needs to read the directory contents + when filesystem supports concurrent dir iterators + poll: called by the VFS when a process wants to check if there is activity on this file and (optionally) go to sleep until there is activity. Called by the select(2) and poll(2) system calls @@ -951,6 +958,16 @@ otherwise noted. fallocate: called by the VFS to preallocate blocks or punch a hole. + copy_file_range: called by the copy_file_range(2) system call. + + clone_file_range: called by the ioctl(2) system call for FICLONERANGE and + FICLONE commands. + + dedupe_file_range: called by the ioctl(2) system call for FIDEDUPERANGE + command. + + fadvise: possibly called by the fadvise64() system call. + Note that the file operations are implemented by the specific filesystem in which the inode resides. When opening a device node (character or block special) most filesystems will call special diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c @@ -131,9 +131,6 @@ static int ovl_open(struct inode *inode, struct file *file) if (IS_ERR(realfile)) return PTR_ERR(realfile); - /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */ - file->f_mapping = realfile->f_mapping; - file->private_data = realfile; return 0; @@ -334,6 +331,25 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len return ret; } +static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice) +{ + struct fd real; + const struct cred *old_cred; + int ret; + + ret = ovl_real_fdget(file, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = vfs_fadvise(real.file, offset, len, advice); + revert_creds(old_cred); + + fdput(real); + + return ret; +} + static long ovl_real_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -502,6 +518,7 @@ const struct file_operations ovl_file_operations = { .fsync = ovl_fsync, .mmap = ovl_mmap, .fallocate = ovl_fallocate, + .fadvise = ovl_fadvise, .unlocked_ioctl = ovl_ioctl, .compat_ioctl = ovl_compat_ioctl, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c @@ -467,6 +467,10 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -EOPNOTSUPP; old_cred = ovl_override_creds(inode->i_sb); + + if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) + filemap_write_and_wait(realinode->i_mapping); + err = realinode->i_op->fiemap(realinode, fieinfo, start, len); revert_creds(old_cred); @@ -500,6 +504,11 @@ static const struct inode_operations ovl_special_inode_operations = { .update_time = ovl_update_time, }; +const struct address_space_operations ovl_aops = { + /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */ + .direct_IO = noop_direct_IO, +}; + /* * It is possible to stack overlayfs instance on top of another * overlayfs instance as lower layer. We need to annonate the @@ -571,6 +580,7 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev, case S_IFREG: inode->i_op = &ovl_file_inode_operations; inode->i_fop = &ovl_file_operations; + inode->i_mapping->a_ops = &ovl_aops; break; case S_IFDIR: diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c @@ -982,16 +982,6 @@ static int ovl_get_upper(struct ovl_fs *ofs, struct path *upperpath) if (err) goto out; - err = -EBUSY; - if (ovl_inuse_trylock(upperpath->dentry)) { - ofs->upperdir_locked = true; - } else if (ofs->config.index) { - pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n"); - goto out; - } else { - pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); - } - upper_mnt = clone_private_mount(upperpath); err = PTR_ERR(upper_mnt); if (IS_ERR(upper_mnt)) { @@ -1002,6 +992,17 @@ static int ovl_get_upper(struct ovl_fs *ofs, struct path *upperpath) /* Don't inherit atime flags */ upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME); ofs->upper_mnt = upper_mnt; + + err = -EBUSY; + if (ovl_inuse_trylock(ofs->upper_mnt->mnt_root)) { + ofs->upperdir_locked = true; + } else if (ofs->config.index) { + pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n"); + goto out; + } else { + pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); + } + err = 0; out: return err; @@ -1101,8 +1102,10 @@ static int ovl_get_workdir(struct ovl_fs *ofs, struct path *upperpath) goto out; } + ofs->workbasedir = dget(workpath.dentry); + err = -EBUSY; - if (ovl_inuse_trylock(workpath.dentry)) { + if (ovl_inuse_trylock(ofs->workbasedir)) { ofs->workdir_locked = true; } else if (ofs->config.index) { pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n"); @@ -1111,7 +1114,6 @@ static int ovl_get_workdir(struct ovl_fs *ofs, struct path *upperpath) pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); } - ofs->workbasedir = dget(workpath.dentry); err = ovl_make_workdir(ofs, &workpath); if (err) goto out; diff --git a/include/linux/fs.h b/include/linux/fs.h @@ -1763,6 +1763,7 @@ struct file_operations { u64); int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, u64); + int (*fadvise)(struct file *, loff_t, loff_t, int); } __randomize_layout; struct inode_operations { @@ -3459,4 +3460,8 @@ static inline bool dir_relax_shared(struct inode *inode) extern bool path_noexec(const struct path *path); extern void inode_nohighmem(struct inode *inode); +/* mm/fadvise.c */ +extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, + int advice); + #endif /* _LINUX_FS_H */ diff --git a/mm/Makefile b/mm/Makefile @@ -32,7 +32,7 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o endif -obj-y := filemap.o mempool.o oom_kill.o \ +obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ @@ -49,7 +49,6 @@ else obj-y += bootmem.o endif -obj-$(CONFIG_ADVISE_SYSCALLS) += fadvise.o ifdef CONFIG_MMU obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o endif diff --git a/mm/fadvise.c b/mm/fadvise.c @@ -27,9 +27,9 @@ * deactivate the pages and clear PG_Referenced. */ -int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) +static int generic_fadvise(struct file *file, loff_t offset, loff_t len, + int advice) { - struct fd f = fdget(fd); struct inode *inode; struct address_space *mapping; struct backing_dev_info *bdi; @@ -37,22 +37,14 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) pgoff_t start_index; pgoff_t end_index; unsigned long nrpages; - int ret = 0; - - if (!f.file) - return -EBADF; - inode = file_inode(f.file); - if (S_ISFIFO(inode->i_mode)) { - ret = -ESPIPE; - goto out; - } + inode = file_inode(file); + if (S_ISFIFO(inode->i_mode)) + return -ESPIPE; - mapping = f.file->f_mapping; - if (!mapping || len < 0) { - ret = -EINVAL; - goto out; - } + mapping = file->f_mapping; + if (!mapping || len < 0) + return -EINVAL; bdi = inode_to_bdi(mapping->host); @@ -67,9 +59,9 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) /* no bad return value, but ignore advice */ break; default: - ret = -EINVAL; + return -EINVAL; } - goto out; + return 0; } /* @@ -85,21 +77,21 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) switch (advice) { case POSIX_FADV_NORMAL: - f.file->f_ra.ra_pages = bdi->ra_pages; - spin_lock(&f.file->f_lock); - f.file->f_mode &= ~FMODE_RANDOM; - spin_unlock(&f.file->f_lock); + file->f_ra.ra_pages = bdi->ra_pages; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_RANDOM: - spin_lock(&f.file->f_lock); - f.file->f_mode |= FMODE_RANDOM; - spin_unlock(&f.file->f_lock); + spin_lock(&file->f_lock); + file->f_mode |= FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_SEQUENTIAL: - f.file->f_ra.ra_pages = bdi->ra_pages * 2; - spin_lock(&f.file->f_lock); - f.file->f_mode &= ~FMODE_RANDOM; - spin_unlock(&f.file->f_lock); + file->f_ra.ra_pages = bdi->ra_pages * 2; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_WILLNEED: /* First and last PARTIAL page! */ @@ -115,8 +107,7 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) * Ignore return value because fadvise() shall return * success even if filesystem can't retrieve a hint, */ - force_page_cache_readahead(mapping, f.file, start_index, - nrpages); + force_page_cache_readahead(mapping, file, start_index, nrpages); break; case POSIX_FADV_NOREUSE: break; @@ -183,9 +174,32 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) } break; default: - ret = -EINVAL; + return -EINVAL; } -out: + return 0; +} + +int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice) +{ + if (file->f_op->fadvise) + return file->f_op->fadvise(file, offset, len, advice); + + return generic_fadvise(file, offset, len, advice); +} +EXPORT_SYMBOL(vfs_fadvise); + +#ifdef CONFIG_ADVISE_SYSCALLS + +int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) +{ + struct fd f = fdget(fd); + int ret; + + if (!f.file) + return -EBADF; + + ret = vfs_fadvise(f.file, offset, len, advice); + fdput(f); return ret; } @@ -203,3 +217,4 @@ SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice) } #endif +#endif diff --git a/mm/readahead.c b/mm/readahead.c @@ -20,6 +20,7 @@ #include <linux/file.h> #include <linux/mm_inline.h> #include <linux/blk-cgroup.h> +#include <linux/fadvise.h> #include "internal.h" @@ -575,24 +576,6 @@ page_cache_async_readahead(struct address_space *mapping, } EXPORT_SYMBOL_GPL(page_cache_async_readahead); -static ssize_t -do_readahead(struct address_space *mapping, struct file *filp, - pgoff_t index, unsigned long nr) -{ - if (!mapping || !mapping->a_ops) - return -EINVAL; - - /* - * Readahead doesn't make sense for DAX inodes, but we don't want it - * to report a failure either. Instead, we just return success and - * don't do any work. - */ - if (dax_mapping(mapping)) - return 0; - - return force_page_cache_readahead(mapping, filp, index, nr); -} - ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { ssize_t ret; @@ -600,16 +583,22 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count) ret = -EBADF; f = fdget(fd); - if (f.file) { - if (f.file->f_mode & FMODE_READ) { - struct address_space *mapping = f.file->f_mapping; - pgoff_t start = offset >> PAGE_SHIFT; - pgoff_t end = (offset + count - 1) >> PAGE_SHIFT; - unsigned long len = end - start + 1; - ret = do_readahead(mapping, f.file, start, len); - } - fdput(f); - } + if (!f.file || !(f.file->f_mode & FMODE_READ)) + goto out; + + /* + * The readahead() syscall is intended to run only on files + * that can execute readahead. If readahead is not possible + * on this file, then we must return -EINVAL. + */ + ret = -EINVAL; + if (!f.file->f_mapping || !f.file->f_mapping->a_ops || + !S_ISREG(file_inode(f.file)->i_mode)) + goto out; + + ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED); +out: + fdput(f); return ret; }