whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 9b5cf826ef8b607d452ba7bf683ae5510a745232
parent 31990f0f5366a8f66688edae8688723b22034108
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Wed, 31 Oct 2018 14:50:02 -0700

Merge tag 'fuse-update-4.20' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse

Pull fuse updates from Miklos Szeredi:
 "As well as the usual bug fixes, this adds the following new features:

   - cached readdir and readlink

   - max I/O size increased from 128k to 1M

   - improved performance and scalability of request queues

   - copy_file_range support

  The only non-fuse bits are trivial cleanups of macros in
  <linux/bitops.h>"

* tag 'fuse-update-4.20' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse: (31 commits)
  fuse: enable caching of symlinks
  fuse: only invalidate atime in direct read
  fuse: don't need GETATTR after every READ
  fuse: allow fine grained attr cache invaldation
  bitops: protect variables in bit_clear_unless() macro
  bitops: protect variables in set_mask_bits() macro
  fuse: realloc page array
  fuse: add max_pages to init_out
  fuse: allocate page array more efficiently
  fuse: reduce size of struct fuse_inode
  fuse: use iversion for readdir cache verification
  fuse: use mtime for readdir cache verification
  fuse: add readdir cache version
  fuse: allow using readdir cache
  fuse: allow caching readdir
  fuse: extract fuse_emit() helper
  fuse: add FOPEN_CACHE_DIR
  fuse: split out readdir.c
  fuse: Use hash table to link processing request
  fuse: kill req->intr_unique
  ...

Diffstat:
Mfs/fuse/Makefile | 2+-
Mfs/fuse/control.c | 34++++++++++++++++++++++++++--------
Mfs/fuse/dev.c | 221++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
Mfs/fuse/dir.c | 381++++++++++++++++++++++---------------------------------------------------------
Mfs/fuse/file.c | 158+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
Mfs/fuse/fuse_i.h | 124+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
Mfs/fuse/inode.c | 53+++++++++++++++++++++++++++++++++++++----------------
Afs/fuse/readdir.c | 569+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/linux/bitops.h | 30+++++++++++++++---------------
Minclude/uapi/linux/fuse.h | 119++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
10 files changed, 1201 insertions(+), 490 deletions(-)

diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile @@ -5,4 +5,4 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o -fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o +fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o diff --git a/fs/fuse/control.c b/fs/fuse/control.c @@ -107,7 +107,7 @@ static ssize_t fuse_conn_max_background_read(struct file *file, if (!fc) return 0; - val = fc->max_background; + val = READ_ONCE(fc->max_background); fuse_conn_put(fc); return fuse_conn_limit_read(file, buf, len, ppos, val); @@ -125,7 +125,12 @@ static ssize_t fuse_conn_max_background_write(struct file *file, if (ret > 0) { struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (fc) { + spin_lock(&fc->bg_lock); fc->max_background = val; + fc->blocked = fc->num_background >= fc->max_background; + if (!fc->blocked) + wake_up(&fc->blocked_waitq); + spin_unlock(&fc->bg_lock); fuse_conn_put(fc); } } @@ -144,7 +149,7 @@ static ssize_t fuse_conn_congestion_threshold_read(struct file *file, if (!fc) return 0; - val = fc->congestion_threshold; + val = READ_ONCE(fc->congestion_threshold); fuse_conn_put(fc); return fuse_conn_limit_read(file, buf, len, ppos, val); @@ -155,18 +160,31 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, size_t count, loff_t *ppos) { unsigned uninitialized_var(val); + struct fuse_conn *fc; ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, max_user_congthresh); - if (ret > 0) { - struct fuse_conn *fc = fuse_ctl_file_conn_get(file); - if (fc) { - fc->congestion_threshold = val; - fuse_conn_put(fc); + if (ret <= 0) + goto out; + fc = fuse_ctl_file_conn_get(file); + if (!fc) + goto out; + + spin_lock(&fc->bg_lock); + fc->congestion_threshold = val; + if (fc->sb) { + if (fc->num_background < fc->congestion_threshold) { + clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); + clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + } else { + set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); + set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); } } - + spin_unlock(&fc->bg_lock); + fuse_conn_put(fc); +out: return ret; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c @@ -25,6 +25,10 @@ MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); +/* Ordinary requests have even IDs, while interrupts IDs are odd */ +#define FUSE_INT_REQ_BIT (1ULL << 0) +#define FUSE_REQ_ID_STEP (1ULL << 1) + static struct kmem_cache *fuse_req_cachep; static struct fuse_dev *fuse_get_dev(struct file *file) @@ -40,9 +44,6 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages, struct fuse_page_desc *page_descs, unsigned npages) { - memset(req, 0, sizeof(*req)); - memset(pages, 0, sizeof(*pages) * npages); - memset(page_descs, 0, sizeof(*page_descs) * npages); INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); @@ -53,30 +54,36 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages, __set_bit(FR_PENDING, &req->flags); } +static struct page **fuse_req_pages_alloc(unsigned int npages, gfp_t flags, + struct fuse_page_desc **desc) +{ + struct page **pages; + + pages = kzalloc(npages * (sizeof(struct page *) + + sizeof(struct fuse_page_desc)), flags); + *desc = (void *) pages + npages * sizeof(struct page *); + + return pages; +} + static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) { - struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags); + struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); if (req) { - struct page **pages; - struct fuse_page_desc *page_descs; - - if (npages <= FUSE_REQ_INLINE_PAGES) { + struct page **pages = NULL; + struct fuse_page_desc *page_descs = NULL; + + WARN_ON(npages > FUSE_MAX_MAX_PAGES); + if (npages > FUSE_REQ_INLINE_PAGES) { + pages = fuse_req_pages_alloc(npages, flags, + &page_descs); + if (!pages) { + kmem_cache_free(fuse_req_cachep, req); + return NULL; + } + } else if (npages) { pages = req->inline_pages; page_descs = req->inline_page_descs; - } else { - pages = kmalloc_array(npages, sizeof(struct page *), - flags); - page_descs = - kmalloc_array(npages, - sizeof(struct fuse_page_desc), - flags); - } - - if (!pages || !page_descs) { - kfree(pages); - kfree(page_descs); - kmem_cache_free(fuse_req_cachep, req); - return NULL; } fuse_request_init(req, pages, page_descs, npages); @@ -95,12 +102,41 @@ struct fuse_req *fuse_request_alloc_nofs(unsigned npages) return __fuse_request_alloc(npages, GFP_NOFS); } -void fuse_request_free(struct fuse_req *req) +static void fuse_req_pages_free(struct fuse_req *req) { - if (req->pages != req->inline_pages) { + if (req->pages != req->inline_pages) kfree(req->pages); - kfree(req->page_descs); - } +} + +bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, + gfp_t flags) +{ + struct page **pages; + struct fuse_page_desc *page_descs; + unsigned int npages = min_t(unsigned int, + max_t(unsigned int, req->max_pages * 2, + FUSE_DEFAULT_MAX_PAGES_PER_REQ), + fc->max_pages); + WARN_ON(npages <= req->max_pages); + + pages = fuse_req_pages_alloc(npages, flags, &page_descs); + if (!pages) + return false; + + memcpy(pages, req->pages, sizeof(struct page *) * req->max_pages); + memcpy(page_descs, req->page_descs, + sizeof(struct fuse_page_desc) * req->max_pages); + fuse_req_pages_free(req); + req->pages = pages; + req->page_descs = page_descs; + req->max_pages = npages; + + return true; +} + +void fuse_request_free(struct fuse_req *req) +{ + fuse_req_pages_free(req); kmem_cache_free(fuse_req_cachep, req); } @@ -235,8 +271,10 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) struct file *file = req->stolen_file; struct fuse_file *ff = file->private_data; + WARN_ON(req->max_pages); spin_lock(&fc->lock); - fuse_request_init(req, req->pages, req->page_descs, req->max_pages); + memset(req, 0, sizeof(*req)); + fuse_request_init(req, NULL, NULL, 0); BUG_ON(ff->reserved_req); ff->reserved_req = req; wake_up_all(&fc->reserved_req_waitq); @@ -287,10 +325,10 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) * We get here in the unlikely case that a background * request was allocated but not sent */ - spin_lock(&fc->lock); + spin_lock(&fc->bg_lock); if (!fc->blocked) wake_up(&fc->blocked_waitq); - spin_unlock(&fc->lock); + spin_unlock(&fc->bg_lock); } if (test_bit(FR_WAITING, &req->flags)) { @@ -319,7 +357,13 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args) static u64 fuse_get_unique(struct fuse_iqueue *fiq) { - return ++fiq->reqctr; + fiq->reqctr += FUSE_REQ_ID_STEP; + return fiq->reqctr; +} + +static unsigned int fuse_req_hash(u64 unique) +{ + return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); } static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) @@ -353,12 +397,13 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, static void flush_bg_queue(struct fuse_conn *fc) { + struct fuse_iqueue *fiq = &fc->iq; + while (fc->active_background < fc->max_background && !list_empty(&fc->bg_queue)) { struct fuse_req *req; - struct fuse_iqueue *fiq = &fc->iq; - req = list_entry(fc->bg_queue.next, struct fuse_req, list); + req = list_first_entry(&fc->bg_queue, struct fuse_req, list); list_del(&req->list); fc->active_background++; spin_lock(&fiq->waitq.lock); @@ -389,14 +434,21 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) WARN_ON(test_bit(FR_PENDING, &req->flags)); WARN_ON(test_bit(FR_SENT, &req->flags)); if (test_bit(FR_BACKGROUND, &req->flags)) { - spin_lock(&fc->lock); + spin_lock(&fc->bg_lock); clear_bit(FR_BACKGROUND, &req->flags); - if (fc->num_background == fc->max_background) + if (fc->num_background == fc->max_background) { fc->blocked = 0; - - /* Wake up next waiter, if any */ - if (!fc->blocked && waitqueue_active(&fc->blocked_waitq)) wake_up(&fc->blocked_waitq); + } else if (!fc->blocked) { + /* + * Wake up next waiter, if any. It's okay to use + * waitqueue_active(), as we've already synced up + * fc->blocked with waiters with the wake_up() call + * above. + */ + if (waitqueue_active(&fc->blocked_waitq)) + wake_up(&fc->blocked_waitq); + } if (fc->num_background == fc->congestion_threshold && fc->sb) { clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); @@ -405,7 +457,7 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) fc->num_background--; fc->active_background--; flush_bg_queue(fc); - spin_unlock(&fc->lock); + spin_unlock(&fc->bg_lock); } wake_up(&req->waitq); if (req->end) @@ -573,40 +625,38 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) return ret; } -/* - * Called under fc->lock - * - * fc->connected must have been checked previously - */ -void fuse_request_send_background_locked(struct fuse_conn *fc, - struct fuse_req *req) +bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req) { - BUG_ON(!test_bit(FR_BACKGROUND, &req->flags)); + bool queued = false; + + WARN_ON(!test_bit(FR_BACKGROUND, &req->flags)); if (!test_bit(FR_WAITING, &req->flags)) { __set_bit(FR_WAITING, &req->flags); atomic_inc(&fc->num_waiting); } __set_bit(FR_ISREPLY, &req->flags); - fc->num_background++; - if (fc->num_background == fc->max_background) - fc->blocked = 1; - if (fc->num_background == fc->congestion_threshold && fc->sb) { - set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + spin_lock(&fc->bg_lock); + if (likely(fc->connected)) { + fc->num_background++; + if (fc->num_background == fc->max_background) + fc->blocked = 1; + if (fc->num_background == fc->congestion_threshold && fc->sb) { + set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); + set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + } + list_add_tail(&req->list, &fc->bg_queue); + flush_bg_queue(fc); + queued = true; } - list_add_tail(&req->list, &fc->bg_queue); - flush_bg_queue(fc); + spin_unlock(&fc->bg_lock); + + return queued; } void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) { - BUG_ON(!req->end); - spin_lock(&fc->lock); - if (fc->connected) { - fuse_request_send_background_locked(fc, req); - spin_unlock(&fc->lock); - } else { - spin_unlock(&fc->lock); + WARN_ON(!req->end); + if (!fuse_request_queue_background(fc, req)) { req->out.h.error = -ENOTCONN; req->end(fc, req); fuse_put_request(fc, req); @@ -1084,12 +1134,11 @@ __releases(fiq->waitq.lock) int err; list_del_init(&req->intr_entry); - req->intr_unique = fuse_get_unique(fiq); memset(&ih, 0, sizeof(ih)); memset(&arg, 0, sizeof(arg)); ih.len = reqsize; ih.opcode = FUSE_INTERRUPT; - ih.unique = req->intr_unique; + ih.unique = (req->in.h.unique | FUSE_INT_REQ_BIT); arg.unique = req->in.h.unique; spin_unlock(&fiq->waitq.lock); @@ -1238,6 +1287,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_req *req; struct fuse_in *in; unsigned reqsize; + unsigned int hash; restart: spin_lock(&fiq->waitq.lock); @@ -1310,13 +1360,16 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, err = reqsize; goto out_end; } - list_move_tail(&req->list, &fpq->processing); - spin_unlock(&fpq->lock); + hash = fuse_req_hash(req->in.h.unique); + list_move_tail(&req->list, &fpq->processing[hash]); + __fuse_get_request(req); set_bit(FR_SENT, &req->flags); + spin_unlock(&fpq->lock); /* matches barrier in request_wait_answer() */ smp_mb__after_atomic(); if (test_bit(FR_INTERRUPTED, &req->flags)) queue_interrupt(fiq, req); + fuse_put_request(fc, req); return reqsize; @@ -1663,7 +1716,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, unsigned int num; unsigned int offset; size_t total_len = 0; - int num_pages; + unsigned int num_pages; offset = outarg->offset & ~PAGE_MASK; file_size = i_size_read(inode); @@ -1675,7 +1728,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, num = file_size - outarg->offset; num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; - num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ); + num_pages = min(num_pages, fc->max_pages); req = fuse_get_req(fc, num_pages); if (IS_ERR(req)) @@ -1792,10 +1845,11 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, /* Look up request on processing list by unique ID */ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) { + unsigned int hash = fuse_req_hash(unique); struct fuse_req *req; - list_for_each_entry(req, &fpq->processing, list) { - if (req->in.h.unique == unique || req->intr_unique == unique) + list_for_each_entry(req, &fpq->processing[hash], list) { + if (req->in.h.unique == unique) return req; } return NULL; @@ -1869,22 +1923,26 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, if (!fpq->connected) goto err_unlock_pq; - req = request_find(fpq, oh.unique); + req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); if (!req) goto err_unlock_pq; - /* Is it an interrupt reply? */ - if (req->intr_unique == oh.unique) { + /* Is it an interrupt reply ID? */ + if (oh.unique & FUSE_INT_REQ_BIT) { + __fuse_get_request(req); spin_unlock(&fpq->lock); err = -EINVAL; - if (nbytes != sizeof(struct fuse_out_header)) + if (nbytes != sizeof(struct fuse_out_header)) { + fuse_put_request(fc, req); goto err_finish; + } if (oh.error == -ENOSYS) fc->no_interrupt = 1; else if (oh.error == -EAGAIN) queue_interrupt(&fc->iq, req); + fuse_put_request(fc, req); fuse_copy_finish(cs); return nbytes; @@ -2102,9 +2160,13 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) struct fuse_dev *fud; struct fuse_req *req, *next; LIST_HEAD(to_end); + unsigned int i; + /* Background queuing checks fc->connected under bg_lock */ + spin_lock(&fc->bg_lock); fc->connected = 0; - fc->blocked = 0; + spin_unlock(&fc->bg_lock); + fc->aborted = is_abort; fuse_set_initialized(fc); list_for_each_entry(fud, &fc->devices, entry) { @@ -2123,11 +2185,16 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) } spin_unlock(&req->waitq.lock); } - list_splice_tail_init(&fpq->processing, &to_end); + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + list_splice_tail_init(&fpq->processing[i], + &to_end); spin_unlock(&fpq->lock); } + spin_lock(&fc->bg_lock); + fc->blocked = 0; fc->max_background = UINT_MAX; flush_bg_queue(fc); + spin_unlock(&fc->bg_lock); spin_lock(&fiq->waitq.lock); fiq->connected = 0; @@ -2163,10 +2230,12 @@ int fuse_dev_release(struct inode *inode, struct file *file) struct fuse_conn *fc = fud->fc; struct fuse_pqueue *fpq = &fud->pq; LIST_HEAD(to_end); + unsigned int i; spin_lock(&fpq->lock); WARN_ON(!list_empty(&fpq->io)); - list_splice_init(&fpq->processing, &to_end); + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + list_splice_init(&fpq->processing[i], &to_end); spin_unlock(&fpq->lock); end_requests(fc, &to_end); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c @@ -14,24 +14,9 @@ #include <linux/namei.h> #include <linux/slab.h> #include <linux/xattr.h> +#include <linux/iversion.h> #include <linux/posix_acl.h> -static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) -{ - struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_inode *fi = get_fuse_inode(dir); - - if (!fc->do_readdirplus) - return false; - if (!fc->readdirplus_auto) - return true; - if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) - return true; - if (ctx->pos == 0) - return true; - return false; -} - static void fuse_advise_use_readdirplus(struct inode *dir) { struct fuse_inode *fi = get_fuse_inode(dir); @@ -80,8 +65,7 @@ static u64 time_to_jiffies(u64 sec, u32 nsec) * Set dentry and possibly attribute timeouts from the lookup/mk* * replies */ -static void fuse_change_entry_timeout(struct dentry *entry, - struct fuse_entry_out *o) +void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o) { fuse_dentry_settime(entry, time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); @@ -92,18 +76,29 @@ static u64 attr_timeout(struct fuse_attr_out *o) return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); } -static u64 entry_attr_timeout(struct fuse_entry_out *o) +u64 entry_attr_timeout(struct fuse_entry_out *o) { return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); } +static void fuse_invalidate_attr_mask(struct inode *inode, u32 mask) +{ + set_mask_bits(&get_fuse_inode(inode)->inval_mask, 0, mask); +} + /* * Mark the attributes as stale, so that at the next call to * ->getattr() they will be fetched from userspace */ void fuse_invalidate_attr(struct inode *inode) { - get_fuse_inode(inode)->i_time = 0; + fuse_invalidate_attr_mask(inode, STATX_BASIC_STATS); +} + +static void fuse_dir_changed(struct inode *dir) +{ + fuse_invalidate_attr(dir); + inode_maybe_inc_iversion(dir, false); } /** @@ -113,7 +108,7 @@ void fuse_invalidate_attr(struct inode *inode) void fuse_invalidate_atime(struct inode *inode) { if (!IS_RDONLY(inode)) - fuse_invalidate_attr(inode); + fuse_invalidate_attr_mask(inode, STATX_ATIME); } /* @@ -262,11 +257,6 @@ invalid: goto out; } -static int invalid_nodeid(u64 nodeid) -{ - return !nodeid || nodeid == FUSE_ROOT_ID; -} - static int fuse_dentry_init(struct dentry *dentry) { dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); @@ -469,7 +459,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, kfree(forget); d_instantiate(entry, inode); fuse_change_entry_timeout(entry, &outentry); - fuse_invalidate_attr(dir); + fuse_dir_changed(dir); err = finish_open(file, entry, generic_file_open); if (err) { fuse_sync_release(ff, flags); @@ -583,7 +573,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, } else { fuse_change_entry_timeout(entry, &outarg); } - fuse_invalidate_attr(dir); + fuse_dir_changed(dir); return 0; out_put_forget_req: @@ -693,7 +683,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) drop_nlink(inode); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); - fuse_invalidate_attr(dir); + fuse_dir_changed(dir); fuse_invalidate_entry_cache(entry); fuse_update_ctime(inode); } else if (err == -EINTR) @@ -715,7 +705,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) err = fuse_simple_request(fc, &args); if (!err) { clear_nlink(d_inode(entry)); - fuse_invalidate_attr(dir); + fuse_dir_changed(dir); fuse_invalidate_entry_cache(entry); } else if (err == -EINTR) fuse_invalidate_entry(entry); @@ -754,9 +744,9 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, fuse_update_ctime(d_inode(newent)); } - fuse_invalidate_attr(olddir); + fuse_dir_changed(olddir); if (olddir != newdir) - fuse_invalidate_attr(newdir); + fuse_dir_changed(newdir); /* newent will end up negative */ if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) { @@ -932,7 +922,8 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, } static int fuse_update_get_attr(struct inode *inode, struct file *file, - struct kstat *stat, unsigned int flags) + struct kstat *stat, u32 request_mask, + unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); int err = 0; @@ -942,6 +933,8 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, sync = true; else if (flags & AT_STATX_DONT_SYNC) sync = false; + else if (request_mask & READ_ONCE(fi->inval_mask)) + sync = true; else sync = time_before64(fi->i_time, get_jiffies_64()); @@ -959,7 +952,9 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, int fuse_update_attributes(struct inode *inode, struct file *file) { - return fuse_update_get_attr(inode, file, NULL, 0); + /* Do *not* need to get atime for internal purposes */ + return fuse_update_get_attr(inode, file, NULL, + STATX_BASIC_STATS & ~STATX_ATIME, 0); } int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, @@ -989,7 +984,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, if (!entry) goto unlock; - fuse_invalidate_attr(parent); + fuse_dir_changed(parent); fuse_invalidate_entry(entry); if (child_nodeid != 0 && d_really_is_positive(entry)) { @@ -1165,271 +1160,78 @@ static int fuse_permission(struct inode *inode, int mask) return err; } -static int parse_dirfile(char *buf, size_t nbytes, struct file *file, - struct dir_context *ctx) -{ - while (nbytes >= FUSE_NAME_OFFSET) { - struct fuse_dirent *dirent = (struct fuse_dirent *) buf; - size_t reclen = FUSE_DIRENT_SIZE(dirent); - if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) - return -EIO; - if (reclen > nbytes) - break; - if (memchr(dirent->name, '/', dirent->namelen) != NULL) - return -EIO; - - if (!dir_emit(ctx, dirent->name, dirent->namelen, - dirent->ino, dirent->type)) - break; - - buf += reclen; - nbytes -= reclen; - ctx->pos = dirent->off; - } - - return 0; -} - -static int fuse_direntplus_link(struct file *file, - struct fuse_direntplus *direntplus, - u64 attr_version) -{ - struct fuse_entry_out *o = &direntplus->entry_out; - struct fuse_dirent *dirent = &direntplus->dirent; - struct dentry *parent = file->f_path.dentry; - struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); - struct dentry *dentry; - struct dentry *alias; - struct inode *dir = d_inode(parent); - struct fuse_conn *fc; - struct inode *inode; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); - - if (!o->nodeid) { - /* - * Unlike in the case of fuse_lookup, zero nodeid does not mean - * ENOENT. Instead, it only means the userspace filesystem did - * not want to return attributes/handle for this entry. - * - * So do nothing. - */ - return 0; - } - - if (name.name[0] == '.') { - /* - * We could potentially refresh the attributes of the directory - * and its parent? - */ - if (name.len == 1) - return 0; - if (name.name[1] == '.' && name.len == 2) - return 0; - } - - if (invalid_nodeid(o->nodeid)) - return -EIO; - if (!fuse_valid_type(o->attr.mode)) - return -EIO; - - fc = get_fuse_conn(dir); - - name.hash = full_name_hash(parent, name.name, name.len); - dentry = d_lookup(parent, &name); - if (!dentry) { -retry: - dentry = d_alloc_parallel(parent, &name, &wq); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - } - if (!d_in_lookup(dentry)) { - struct fuse_inode *fi; - inode = d_inode(dentry); - if (!inode || - get_node_id(inode) != o->nodeid || - ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { - d_invalidate(dentry); - dput(dentry); - goto retry; - } - if (is_bad_inode(inode)) { - dput(dentry); - return -EIO; - } - - fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - fi->nlookup++; - spin_unlock(&fc->lock); - - forget_all_cached_acls(inode); - fuse_change_attributes(inode, &o->attr, - entry_attr_timeout(o), - attr_version); - /* - * The other branch comes via fuse_iget() - * which bumps nlookup inside - */ - } else { - inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, - &o->attr, entry_attr_timeout(o), - attr_version); - if (!inode) - inode = ERR_PTR(-ENOMEM); - - alias = d_splice_alias(inode, dentry); - d_lookup_done(dentry); - if (alias) { - dput(dentry); - dentry = alias; - } - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - } - if (fc->readdirplus_auto) - set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); - fuse_change_entry_timeout(dentry, o); - - dput(dentry); - return 0; -} - -static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, - struct dir_context *ctx, u64 attr_version) +static int fuse_readlink_page(struct inode *inode, struct page *page) { - struct fuse_direntplus *direntplus; - struct fuse_dirent *dirent; - size_t reclen; - int over = 0; - int ret; - - while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) { - direntplus = (struct fuse_direntplus *) buf; - dirent = &direntplus->dirent; - reclen = FUSE_DIRENTPLUS_SIZE(direntplus); - - if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) - return -EIO; - if (reclen > nbytes) - break; - if (memchr(dirent->name, '/', dirent->namelen) != NULL) - return -EIO; - - if (!over) { - /* We fill entries into dstbuf only as much as - it can hold. But we still continue iterating - over remaining entries to link them. If not, - we need to send a FORGET for each of those - which we did not link. - */ - over = !dir_emit(ctx, dirent->name, dirent->namelen, - dirent->ino, dirent->type); - if (!over) - ctx->pos = dirent->off; - } - - buf += reclen; - nbytes -= reclen; - - ret = fuse_direntplus_link(file, direntplus, attr_version); - if (ret) - fuse_force_forget(file, direntplus->entry_out.nodeid); - } - - return 0; -} - -static int fuse_readdir(struct file *file, struct dir_context *ctx) -{ - int plus, err; - size_t nbytes; - struct page *page; - struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; - u64 attr_version = 0; - bool locked; - - if (is_bad_inode(inode)) - return -EIO; + int err; req = fuse_get_req(fc, 1); if (IS_ERR(req)) return PTR_ERR(req); - page = alloc_page(GFP_KERNEL); - if (!page) { - fuse_put_request(fc, req); - return -ENOMEM; - } - - plus = fuse_use_readdirplus(inode, ctx); + req->out.page_zeroing = 1; req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; - req->page_descs[0].length = PAGE_SIZE; - if (plus) { - attr_version = fuse_get_attr_version(fc); - fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, - FUSE_READDIRPLUS); - } else { - fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, - FUSE_READDIR); - } - locked = fuse_lock_inode(inode); + req->page_descs[0].length = PAGE_SIZE - 1; + req->in.h.opcode = FUSE_READLINK; + req->in.h.nodeid = get_node_id(inode); + req->out.argvar = 1; + req->out.numargs = 1; + req->out.args[0].size = PAGE_SIZE - 1; fuse_request_send(fc, req); - fuse_unlock_inode(inode, locked); - nbytes = req->out.args[0].size; err = req->out.h.error; - fuse_put_request(fc, req); + if (!err) { - if (plus) { - err = parse_dirplusfile(page_address(page), nbytes, - file, ctx, - attr_version); - } else { - err = parse_dirfile(page_address(page), nbytes, file, - ctx); - } + char *link = page_address(page); + size_t len = req->out.args[0].size; + + BUG_ON(len >= PAGE_SIZE); + link[len] = '\0'; } - __free_page(page); + fuse_put_request(fc, req); fuse_invalidate_atime(inode); + return err; } -static const char *fuse_get_link(struct dentry *dentry, - struct inode *inode, - struct delayed_call *done) +static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) { struct fuse_conn *fc = get_fuse_conn(inode); - FUSE_ARGS(args); - char *link; - ssize_t ret; + struct page *page; + int err; + + err = -EIO; + if (is_bad_inode(inode)) + goto out_err; + if (fc->cache_symlinks) + return page_get_link(dentry, inode, callback); + + err = -ECHILD; if (!dentry) - return ERR_PTR(-ECHILD); + goto out_err; - link = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!link) - return ERR_PTR(-ENOMEM); + page = alloc_page(GFP_KERNEL); + err = -ENOMEM; + if (!page) + goto out_err; - args.in.h.opcode = FUSE_READLINK; - args.in.h.nodeid = get_node_id(inode); - args.out.argvar = 1; - args.out.numargs = 1; - args.out.args[0].size = PAGE_SIZE - 1; - args.out.args[0].value = link; - ret = fuse_simple_request(fc, &args); - if (ret < 0) { - kfree(link); - link = ERR_PTR(ret); - } else { - link[ret] = '\0'; - set_delayed_call(done, kfree_link, link); + err = fuse_readlink_page(inode, page); + if (err) { + __free_page(page); + goto out_err; } - fuse_invalidate_atime(inode); - return link; + + set_delayed_call(callback, page_put_link, page); + + return page_address(page); + +out_err: + return ERR_PTR(err); } static int fuse_dir_open(struct inode *inode, struct file *file) @@ -1662,8 +1464,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, file = NULL; } - if (attr->ia_valid & ATTR_SIZE) + if (attr->ia_valid & ATTR_SIZE) { + if (WARN_ON(!S_ISREG(inode->i_mode))) + return -EIO; is_truncate = true; + } if (is_truncate) { fuse_set_nowrite(inode); @@ -1811,7 +1616,7 @@ static int fuse_getattr(const struct path *path, struct kstat *stat, if (!fuse_allow_current_process(fc)) return -EACCES; - return fuse_update_get_attr(inode, NULL, stat, flags); + return fuse_update_get_attr(inode, NULL, stat, request_mask, flags); } static const struct inode_operations fuse_dir_inode_operations = { @@ -1867,11 +1672,37 @@ void fuse_init_common(struct inode *inode) void fuse_init_dir(struct inode *inode) { + struct fuse_inode *fi = get_fuse_inode(inode); + inode->i_op = &fuse_dir_inode_operations; inode->i_fop = &fuse_dir_operations; + + spin_lock_init(&fi->rdc.lock); + fi->rdc.cached = false; + fi->rdc.size = 0; + fi->rdc.pos = 0; + fi->rdc.version = 0; } +static int fuse_symlink_readpage(struct file *null, struct page *page) +{ + int err = fuse_readlink_page(page->mapping->host, page); + + if (!err) + SetPageUptodate(page); + + unlock_page(page); + + return err; +} + +static const struct address_space_operations fuse_symlink_aops = { + .readpage = fuse_symlink_readpage, +}; + void fuse_init_symlink(struct inode *inode) { inode->i_op = &fuse_symlink_inode_operations; + inode->i_data.a_ops = &fuse_symlink_aops; + inode_nohighmem(inode); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c @@ -59,6 +59,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) } INIT_LIST_HEAD(&ff->write_entry); + mutex_init(&ff->readdir.lock); refcount_set(&ff->count, 1); RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); @@ -73,6 +74,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) void fuse_file_free(struct fuse_file *ff) { fuse_request_free(ff->reserved_req); + mutex_destroy(&ff->readdir.lock); kfree(ff); } @@ -848,11 +850,11 @@ static int fuse_readpages_fill(void *_data, struct page *page) fuse_wait_on_page_writeback(inode, page->index); if (req->num_pages && - (req->num_pages == FUSE_MAX_PAGES_PER_REQ || + (req->num_pages == fc->max_pages || (req->num_pages + 1) * PAGE_SIZE > fc->max_read || req->pages[req->num_pages - 1]->index + 1 != page->index)) { - int nr_alloc = min_t(unsigned, data->nr_pages, - FUSE_MAX_PAGES_PER_REQ); + unsigned int nr_alloc = min_t(unsigned int, data->nr_pages, + fc->max_pages); fuse_send_readpages(req, data->file); if (fc->async_read) req = fuse_get_req_for_background(fc, nr_alloc); @@ -887,7 +889,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_data data; int err; - int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ); + unsigned int nr_alloc = min_t(unsigned int, nr_pages, fc->max_pages); err = -EIO; if (is_bad_inode(inode)) @@ -1102,12 +1104,13 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, return count > 0 ? count : err; } -static inline unsigned fuse_wr_pages(loff_t pos, size_t len) +static inline unsigned int fuse_wr_pages(loff_t pos, size_t len, + unsigned int max_pages) { - return min_t(unsigned, + return min_t(unsigned int, ((pos + len - 1) >> PAGE_SHIFT) - (pos >> PAGE_SHIFT) + 1, - FUSE_MAX_PAGES_PER_REQ); + max_pages); } static ssize_t fuse_perform_write(struct kiocb *iocb, @@ -1129,7 +1132,8 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, do { struct fuse_req *req; ssize_t count; - unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii)); + unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), + fc->max_pages); req = fuse_get_req(fc, nr_pages); if (IS_ERR(req)) { @@ -1319,11 +1323,6 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, return ret < 0 ? ret : 0; } -static inline int fuse_iter_npages(const struct iov_iter *ii_p) -{ - return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ); -} - ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, loff_t *ppos, int flags) { @@ -1343,9 +1342,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, int err = 0; if (io->async) - req = fuse_get_req_for_background(fc, fuse_iter_npages(iter)); + req = fuse_get_req_for_background(fc, iov_iter_npages(iter, + fc->max_pages)); else - req = fuse_get_req(fc, fuse_iter_npages(iter)); + req = fuse_get_req(fc, iov_iter_npages(iter, fc->max_pages)); if (IS_ERR(req)) return PTR_ERR(req); @@ -1390,9 +1390,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, fuse_put_request(fc, req); if (io->async) req = fuse_get_req_for_background(fc, - fuse_iter_npages(iter)); + iov_iter_npages(iter, fc->max_pages)); else - req = fuse_get_req(fc, fuse_iter_npages(iter)); + req = fuse_get_req(fc, iov_iter_npages(iter, + fc->max_pages)); if (IS_ERR(req)) break; } @@ -1418,7 +1419,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, res = fuse_direct_io(io, iter, ppos, 0); - fuse_invalidate_attr(inode); + fuse_invalidate_atime(inode); return res; } @@ -1487,6 +1488,7 @@ __acquires(fc->lock) struct fuse_inode *fi = get_fuse_inode(req->inode); struct fuse_write_in *inarg = &req->misc.write.in; __u64 data_size = req->num_pages * PAGE_SIZE; + bool queued; if (!fc->connected) goto out_free; @@ -1502,7 +1504,8 @@ __acquires(fc->lock) req->in.args[1].size = inarg->size; fi->writectr++; - fuse_request_send_background_locked(fc, req); + queued = fuse_request_queue_background(fc, req); + WARN_ON(!queued); return; out_free: @@ -1819,12 +1822,18 @@ static int fuse_writepages_fill(struct page *page, is_writeback = fuse_page_is_writeback(inode, page->index); if (req && req->num_pages && - (is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ || + (is_writeback || req->num_pages == fc->max_pages || (req->num_pages + 1) * PAGE_SIZE > fc->max_write || data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) { fuse_writepages_send(data); data->req = NULL; + } else if (req && req->num_pages == req->max_pages) { + if (!fuse_req_realloc_pages(fc, req, GFP_NOFS)) { + fuse_writepages_send(data); + req = data->req = NULL; + } } + err = -ENOMEM; tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (!tmp_page) @@ -1847,7 +1856,7 @@ static int fuse_writepages_fill(struct page *page, struct fuse_inode *fi = get_fuse_inode(inode); err = -ENOMEM; - req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ); + req = fuse_request_alloc_nofs(FUSE_REQ_INLINE_PAGES); if (!req) { __free_page(tmp_page); goto out_unlock; @@ -1904,6 +1913,7 @@ static int fuse_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_wb_data data; int err; @@ -1916,7 +1926,7 @@ static int fuse_writepages(struct address_space *mapping, data.ff = NULL; err = -ENOMEM; - data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, + data.orig_pages = kcalloc(fc->max_pages, sizeof(struct page *), GFP_NOFS); if (!data.orig_pages) @@ -2387,10 +2397,11 @@ static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, } /* Make sure iov_length() won't overflow */ -static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) +static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov, + size_t count) { size_t n; - u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT; + u32 max = fc->max_pages << PAGE_SHIFT; for (n = 0; n < count; n++, iov++) { if (iov->iov_len > (size_t) max) @@ -2514,7 +2525,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; - pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL); + pages = kcalloc(fc->max_pages, sizeof(pages[0]), GFP_KERNEL); iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); if (!pages || !iov_page) goto out; @@ -2553,7 +2564,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, /* make sure there are enough buffer pages and init request with them */ err = -ENOMEM; - if (max_pages > FUSE_MAX_PAGES_PER_REQ) + if (max_pages > fc->max_pages) goto out; while (num_pages < max_pages) { pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); @@ -2640,11 +2651,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iov = iov_page; out_iov = in_iov + in_iovs; - err = fuse_verify_ioctl_iov(in_iov, in_iovs); + err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs); if (err) goto out; - err = fuse_verify_ioctl_iov(out_iov, out_iovs); + err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs); if (err) goto out; @@ -2835,9 +2846,9 @@ static void fuse_do_truncate(struct file *file) fuse_do_setattr(file_dentry(file), &attr, file); } -static inline loff_t fuse_round_up(loff_t off) +static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off) { - return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); + return round_up(off, fc->max_pages << PAGE_SHIFT); } static ssize_t @@ -2866,7 +2877,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) { if (offset >= i_size) return 0; - iov_iter_truncate(iter, fuse_round_up(i_size - offset)); + iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset)); count = iov_iter_count(iter); } @@ -3011,6 +3022,82 @@ out: return err; } +static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags) +{ + struct fuse_file *ff_in = file_in->private_data; + struct fuse_file *ff_out = file_out->private_data; + struct inode *inode_out = file_inode(file_out); + struct fuse_inode *fi_out = get_fuse_inode(inode_out); + struct fuse_conn *fc = ff_in->fc; + FUSE_ARGS(args); + struct fuse_copy_file_range_in inarg = { + .fh_in = ff_in->fh, + .off_in = pos_in, + .nodeid_out = ff_out->nodeid, + .fh_out = ff_out->fh, + .off_out = pos_out, + .len = len, + .flags = flags + }; + struct fuse_write_out outarg; + ssize_t err; + /* mark unstable when write-back is not used, and file_out gets + * extended */ + bool is_unstable = (!fc->writeback_cache) && + ((pos_out + len) > inode_out->i_size); + + if (fc->no_copy_file_range) + return -EOPNOTSUPP; + + inode_lock(inode_out); + + if (fc->writeback_cache) { + err = filemap_write_and_wait_range(inode_out->i_mapping, + pos_out, pos_out + len); + if (err) + goto out; + + fuse_sync_writes(inode_out); + } + + if (is_unstable) + set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); + + args.in.h.opcode = FUSE_COPY_FILE_RANGE; + args.in.h.nodeid = ff_in->nodeid; + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.out.numargs = 1; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); + if (err == -ENOSYS) { + fc->no_copy_file_range = 1; + err = -EOPNOTSUPP; + } + if (err) + goto out; + + if (fc->writeback_cache) { + fuse_write_update_size(inode_out, pos_out + outarg.size); + file_update_time(file_out); + } + + fuse_invalidate_attr(inode_out); + + err = outarg.size; +out: + if (is_unstable) + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); + + inode_unlock(inode_out); + + return err; +} + static const struct file_operations fuse_file_operations = { .llseek = fuse_file_llseek, .read_iter = fuse_file_read_iter, @@ -3027,6 +3114,7 @@ static const struct file_operations fuse_file_operations = { .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, .fallocate = fuse_file_fallocate, + .copy_file_range = fuse_copy_file_range, }; static const struct file_operations fuse_direct_io_file_operations = { @@ -3062,6 +3150,14 @@ static const struct address_space_operations fuse_file_aops = { void fuse_init_file_inode(struct inode *inode) { + struct fuse_inode *fi = get_fuse_inode(inode); + inode->i_fop = &fuse_file_operations; inode->i_data.a_ops = &fuse_file_aops; + + INIT_LIST_HEAD(&fi->write_files); + INIT_LIST_HEAD(&fi->queued_writes); + fi->writectr = 0; + init_waitqueue_head(&fi->page_waitq); + INIT_LIST_HEAD(&fi->writepages); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h @@ -28,8 +28,11 @@ #include <linux/refcount.h> #include <linux/user_namespace.h> -/** Max number of pages that can be used in a single read request */ -#define FUSE_MAX_PAGES_PER_REQ 32 +/** Default max number of pages that can be used in a single read request */ +#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 + +/** Maximum of max_pages received in init_out */ +#define FUSE_MAX_MAX_PAGES 256 /** Bias for fi->writectr, meaning new writepages must not be sent */ #define FUSE_NOWRITE INT_MIN @@ -77,6 +80,9 @@ struct fuse_inode { /** Time in jiffies until the file attributes are valid */ u64 i_time; + /* Which attributes are invalid */ + u32 inval_mask; + /** The sticky bit in inode->i_mode may have been removed, so preserve the original mode */ umode_t orig_i_mode; @@ -87,21 +93,51 @@ struct fuse_inode { /** Version of last attribute change */ u64 attr_version; - /** Files usable in writepage. Protected by fc->lock */ - struct list_head write_files; + union { + /* Write related fields (regular file only) */ + struct { + /* Files usable in writepage. Protected by fc->lock */ + struct list_head write_files; + + /* Writepages pending on truncate or fsync */ + struct list_head queued_writes; - /** Writepages pending on truncate or fsync */ - struct list_head queued_writes; + /* Number of sent writes, a negative bias + * (FUSE_NOWRITE) means more writes are blocked */ + int writectr; + + /* Waitq for writepage completion */ + wait_queue_head_t page_waitq; + + /* List of writepage requestst (pending or sent) */ + struct list_head writepages; + }; + + /* readdir cache (directory only) */ + struct { + /* true if fully cached */ + bool cached; - /** Number of sent writes, a negative bias (FUSE_NOWRITE) - * means more writes are blocked */ - int writectr; + /* size of cache */ + loff_t size; - /** Waitq for writepage completion */ - wait_queue_head_t page_waitq; + /* position at end of cache (position of next entry) */ + loff_t pos; - /** List of writepage requestst (pending or sent) */ - struct list_head writepages; + /* version of the cache */ + u64 version; + + /* modification time of directory when cache was + * started */ + struct timespec64 mtime; + + /* iversion of directory when cache was started */ + u64 iversion; + + /* protects above fields */ + spinlock_t lock; + } rdc; + }; /** Miscellaneous bits describing inode state */ unsigned long state; @@ -148,6 +184,25 @@ struct fuse_file { /** Entry on inode's write_files list */ struct list_head write_entry; + /* Readdir related */ + struct { + /* + * Protects below fields against (crazy) parallel readdir on + * same open file. Uncontended in the normal case. + */ + struct mutex lock; + + /* Dir stream position */ + loff_t pos; + + /* Offset in cache */ + loff_t cache_off; + + /* Version of cache we are reading */ + u64 version; + + } readdir; + /** RB node to be linked on fuse_conn->polled_files */ struct rb_node polled_node; @@ -311,9 +366,6 @@ struct fuse_req { /** refcount */ refcount_t count; - /** Unique ID for the interrupt request */ - u64 intr_unique; - /* Request flags, updated with test/set/clear_bit() */ unsigned long flags; @@ -411,6 +463,9 @@ struct fuse_iqueue { struct fasync_struct *fasync; }; +#define FUSE_PQ_HASH_BITS 8 +#define FUSE_PQ_HASH_SIZE (1 << FUSE_PQ_HASH_BITS) + struct fuse_pqueue { /** Connection established */ unsigned connected; @@ -418,8 +473,8 @@ struct fuse_pqueue { /** Lock protecting accessess to members of this structure */ spinlock_t lock; - /** The list of requests being processed */ - struct list_head processing; + /** Hash table of requests being processed */ + struct list_head *processing; /** The list of requests under I/O */ struct list_head io; @@ -476,6 +531,9 @@ struct fuse_conn { /** Maximum write size */ unsigned max_write; + /** Maxmum number of pages that can be used in a single request */ + unsigned int max_pages; + /** Input queue */ struct fuse_iqueue iq; @@ -500,6 +558,10 @@ struct fuse_conn { /** The list of background requests set aside for later queuing */ struct list_head bg_queue; + /** Protects: max_background, congestion_threshold, num_background, + * active_background, bg_queue, blocked */ + spinlock_t bg_lock; + /** Flag indicating that INIT reply has been received. Allocating * any fuse request will be suspended until the flag is set */ int initialized; @@ -551,6 +613,9 @@ struct fuse_conn { /** handle fs handles killing suid/sgid/cap on write/chown/trunc */ unsigned handle_killpriv:1; + /** cache READLINK responses in page cache */ + unsigned cache_symlinks:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -637,6 +702,9 @@ struct fuse_conn { /** Allow other than the mounter user to access the filesystem ? */ unsigned allow_other:1; + /** Does the filesystem support copy_file_range? */ + unsigned no_copy_file_range:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -697,6 +765,11 @@ static inline u64 get_node_id(struct inode *inode) return get_fuse_inode(inode)->nodeid; } +static inline int invalid_nodeid(u64 nodeid) +{ + return !nodeid || nodeid == FUSE_ROOT_ID; +} + /** Device operations */ extern const struct file_operations fuse_dev_operations; @@ -812,6 +885,10 @@ struct fuse_req *fuse_request_alloc(unsigned npages); struct fuse_req *fuse_request_alloc_nofs(unsigned npages); +bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, + gfp_t flags); + + /** * Free a request */ @@ -856,9 +933,7 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); * Send a request in the background */ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); - -void fuse_request_send_background_locked(struct fuse_conn *fc, - struct fuse_req *req); +bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req); /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort); @@ -873,6 +948,9 @@ void fuse_invalidate_entry_cache(struct dentry *entry); void fuse_invalidate_atime(struct inode *inode); +u64 entry_attr_timeout(struct fuse_entry_out *o); +void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o); + /** * Acquire reference to fuse_conn */ @@ -992,4 +1070,8 @@ struct posix_acl; struct posix_acl *fuse_get_acl(struct inode *inode, int type); int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type); + +/* readdir.c */ +int fuse_readdir(struct file *file, struct dir_context *ctx); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c @@ -90,16 +90,12 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi = get_fuse_inode(inode); fi->i_time = 0; + fi->inval_mask = 0; fi->nodeid = 0; fi->nlookup = 0; fi->attr_version = 0; - fi->writectr = 0; fi->orig_ino = 0; fi->state = 0; - INIT_LIST_HEAD(&fi->write_files); - INIT_LIST_HEAD(&fi->queued_writes); - INIT_LIST_HEAD(&fi->writepages); - init_waitqueue_head(&fi->page_waitq); mutex_init(&fi->mutex); fi->forget = fuse_alloc_forget(); if (!fi->forget) { @@ -119,8 +115,10 @@ static void fuse_i_callback(struct rcu_head *head) static void fuse_destroy_inode(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); - BUG_ON(!list_empty(&fi->write_files)); - BUG_ON(!list_empty(&fi->queued_writes)); + if (S_ISREG(inode->i_mode)) { + WARN_ON(!list_empty(&fi->write_files)); + WARN_ON(!list_empty(&fi->queued_writes)); + } mutex_destroy(&fi->mutex); kfree(fi->forget); call_rcu(&inode->i_rcu, fuse_i_callback); @@ -167,6 +165,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, fi->attr_version = ++fc->attr_version; fi->i_time = attr_valid; + WRITE_ONCE(fi->inval_mask, 0); inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); @@ -594,9 +593,11 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq) static void fuse_pqueue_init(struct fuse_pqueue *fpq) { - memset(fpq, 0, sizeof(struct fuse_pqueue)); + unsigned int i; + spin_lock_init(&fpq->lock); - INIT_LIST_HEAD(&fpq->processing); + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + INIT_LIST_HEAD(&fpq->processing[i]); INIT_LIST_HEAD(&fpq->io); fpq->connected = 1; } @@ -605,6 +606,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) { memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); + spin_lock_init(&fc->bg_lock); init_rwsem(&fc->killsb); refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); @@ -852,6 +854,7 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) sanitize_global_limit(&max_user_bgreq); sanitize_global_limit(&max_user_congthresh); + spin_lock(&fc->bg_lock); if (arg->max_background) { fc->max_background = arg->max_background; @@ -865,6 +868,7 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) fc->congestion_threshold > max_user_congthresh) fc->congestion_threshold = max_user_congthresh; } + spin_unlock(&fc->bg_lock); } static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) @@ -924,8 +928,15 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->posix_acl = 1; fc->sb->s_xattr = fuse_acl_xattr_handlers; } + if (arg->flags & FUSE_CACHE_SYMLINKS) + fc->cache_symlinks = 1; if (arg->flags & FUSE_ABORT_ERROR) fc->abort_err = 1; + if (arg->flags & FUSE_MAX_PAGES) { + fc->max_pages = + min_t(unsigned int, FUSE_MAX_MAX_PAGES, + max_t(unsigned int, arg->max_pages, 1)); + } } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -957,7 +968,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | - FUSE_ABORT_ERROR; + FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1022,17 +1033,26 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) { struct fuse_dev *fud; + struct list_head *pq; fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL); - if (fud) { - fud->fc = fuse_conn_get(fc); - fuse_pqueue_init(&fud->pq); + if (!fud) + return NULL; - spin_lock(&fc->lock); - list_add_tail(&fud->entry, &fc->devices); - spin_unlock(&fc->lock); + pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); + if (!pq) { + kfree(fud); + return NULL; } + fud->pq.processing = pq; + fud->fc = fuse_conn_get(fc); + fuse_pqueue_init(&fud->pq); + + spin_lock(&fc->lock); + list_add_tail(&fud->entry, &fc->devices); + spin_unlock(&fc->lock); + return fud; } EXPORT_SYMBOL_GPL(fuse_dev_alloc); @@ -1141,6 +1161,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->user_id = d.user_id; fc->group_id = d.group_id; fc->max_read = max_t(unsigned, 4096, d.max_read); + fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; /* Used by get_root_inode() */ sb->s_fs_info = fc; diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c @@ -0,0 +1,569 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2018 Miklos Szeredi <miklos@szeredi.hu> + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + + +#include "fuse_i.h" +#include <linux/iversion.h> +#include <linux/posix_acl.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> + +static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_inode *fi = get_fuse_inode(dir); + + if (!fc->do_readdirplus) + return false; + if (!fc->readdirplus_auto) + return true; + if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) + return true; + if (ctx->pos == 0) + return true; + return false; +} + +static void fuse_add_dirent_to_cache(struct file *file, + struct fuse_dirent *dirent, loff_t pos) +{ + struct fuse_inode *fi = get_fuse_inode(file_inode(file)); + size_t reclen = FUSE_DIRENT_SIZE(dirent); + pgoff_t index; + struct page *page; + loff_t size; + u64 version; + unsigned int offset; + void *addr; + + spin_lock(&fi->rdc.lock); + /* + * Is cache already completed? Or this entry does not go at the end of + * cache? + */ + if (fi->rdc.cached || pos != fi->rdc.pos) { + spin_unlock(&fi->rdc.lock); + return; + } + version = fi->rdc.version; + size = fi->rdc.size; + offset = size & ~PAGE_MASK; + index = size >> PAGE_SHIFT; + /* Dirent doesn't fit in current page? Jump to next page. */ + if (offset + reclen > PAGE_SIZE) { + index++; + offset = 0; + } + spin_unlock(&fi->rdc.lock); + + if (offset) { + page = find_lock_page(file->f_mapping, index); + } else { + page = find_or_create_page(file->f_mapping, index, + mapping_gfp_mask(file->f_mapping)); + } + if (!page) + return; + + spin_lock(&fi->rdc.lock); + /* Raced with another readdir */ + if (fi->rdc.version != version || fi->rdc.size != size || + WARN_ON(fi->rdc.pos != pos)) + goto unlock; + + addr = kmap_atomic(page); + if (!offset) + clear_page(addr); + memcpy(addr + offset, dirent, reclen); + kunmap_atomic(addr); + fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen; + fi->rdc.pos = dirent->off; +unlock: + spin_unlock(&fi->rdc.lock); + unlock_page(page); + put_page(page); +} + +static void fuse_readdir_cache_end(struct file *file, loff_t pos) +{ + struct fuse_inode *fi = get_fuse_inode(file_inode(file)); + loff_t end; + + spin_lock(&fi->rdc.lock); + /* does cache end position match current position? */ + if (fi->rdc.pos != pos) { + spin_unlock(&fi->rdc.lock); + return; + } + + fi->rdc.cached = true; + end = ALIGN(fi->rdc.size, PAGE_SIZE); + spin_unlock(&fi->rdc.lock); + + /* truncate unused tail of cache */ + truncate_inode_pages(file->f_mapping, end); +} + +static bool fuse_emit(struct file *file, struct dir_context *ctx, + struct fuse_dirent *dirent) +{ + struct fuse_file *ff = file->private_data; + + if (ff->open_flags & FOPEN_CACHE_DIR) + fuse_add_dirent_to_cache(file, dirent, ctx->pos); + + return dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino, + dirent->type); +} + +static int parse_dirfile(char *buf, size_t nbytes, struct file *file, + struct dir_context *ctx) +{ + while (nbytes >= FUSE_NAME_OFFSET) { + struct fuse_dirent *dirent = (struct fuse_dirent *) buf; + size_t reclen = FUSE_DIRENT_SIZE(dirent); + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + if (memchr(dirent->name, '/', dirent->namelen) != NULL) + return -EIO; + + if (!fuse_emit(file, ctx, dirent)) + break; + + buf += reclen; + nbytes -= reclen; + ctx->pos = dirent->off; + } + + return 0; +} + +static int fuse_direntplus_link(struct file *file, + struct fuse_direntplus *direntplus, + u64 attr_version) +{ + struct fuse_entry_out *o = &direntplus->entry_out; + struct fuse_dirent *dirent = &direntplus->dirent; + struct dentry *parent = file->f_path.dentry; + struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); + struct dentry *dentry; + struct dentry *alias; + struct inode *dir = d_inode(parent); + struct fuse_conn *fc; + struct inode *inode; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + + if (!o->nodeid) { + /* + * Unlike in the case of fuse_lookup, zero nodeid does not mean + * ENOENT. Instead, it only means the userspace filesystem did + * not want to return attributes/handle for this entry. + * + * So do nothing. + */ + return 0; + } + + if (name.name[0] == '.') { + /* + * We could potentially refresh the attributes of the directory + * and its parent? + */ + if (name.len == 1) + return 0; + if (name.name[1] == '.' && name.len == 2) + return 0; + } + + if (invalid_nodeid(o->nodeid)) + return -EIO; + if (!fuse_valid_type(o->attr.mode)) + return -EIO; + + fc = get_fuse_conn(dir); + + name.hash = full_name_hash(parent, name.name, name.len); + dentry = d_lookup(parent, &name); + if (!dentry) { +retry: + dentry = d_alloc_parallel(parent, &name, &wq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + } + if (!d_in_lookup(dentry)) { + struct fuse_inode *fi; + inode = d_inode(dentry); + if (!inode || + get_node_id(inode) != o->nodeid || + ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { + d_invalidate(dentry); + dput(dentry); + goto retry; + } + if (is_bad_inode(inode)) { + dput(dentry); + return -EIO; + } + + fi = get_fuse_inode(inode); + spin_lock(&fc->lock); + fi->nlookup++; + spin_unlock(&fc->lock); + + forget_all_cached_acls(inode); + fuse_change_attributes(inode, &o->attr, + entry_attr_timeout(o), + attr_version); + /* + * The other branch comes via fuse_iget() + * which bumps nlookup inside + */ + } else { + inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, + &o->attr, entry_attr_timeout(o), + attr_version); + if (!inode) + inode = ERR_PTR(-ENOMEM); + + alias = d_splice_alias(inode, dentry); + d_lookup_done(dentry); + if (alias) { + dput(dentry); + dentry = alias; + } + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + } + if (fc->readdirplus_auto) + set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); + fuse_change_entry_timeout(dentry, o); + + dput(dentry); + return 0; +} + +static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, + struct dir_context *ctx, u64 attr_version) +{ + struct fuse_direntplus *direntplus; + struct fuse_dirent *dirent; + size_t reclen; + int over = 0; + int ret; + + while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) { + direntplus = (struct fuse_direntplus *) buf; + dirent = &direntplus->dirent; + reclen = FUSE_DIRENTPLUS_SIZE(direntplus); + + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + if (memchr(dirent->name, '/', dirent->namelen) != NULL) + return -EIO; + + if (!over) { + /* We fill entries into dstbuf only as much as + it can hold. But we still continue iterating + over remaining entries to link them. If not, + we need to send a FORGET for each of those + which we did not link. + */ + over = !fuse_emit(file, ctx, dirent); + if (!over) + ctx->pos = dirent->off; + } + + buf += reclen; + nbytes -= reclen; + + ret = fuse_direntplus_link(file, direntplus, attr_version); + if (ret) + fuse_force_forget(file, direntplus->entry_out.nodeid); + } + + return 0; +} + +static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) +{ + int plus, err; + size_t nbytes; + struct page *page; + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + u64 attr_version = 0; + bool locked; + + req = fuse_get_req(fc, 1); + if (IS_ERR(req)) + return PTR_ERR(req); + + page = alloc_page(GFP_KERNEL); + if (!page) { + fuse_put_request(fc, req); + return -ENOMEM; + } + + plus = fuse_use_readdirplus(inode, ctx); + req->out.argpages = 1; + req->num_pages = 1; + req->pages[0] = page; + req->page_descs[0].length = PAGE_SIZE; + if (plus) { + attr_version = fuse_get_attr_version(fc); + fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, + FUSE_READDIRPLUS); + } else { + fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, + FUSE_READDIR); + } + locked = fuse_lock_inode(inode); + fuse_request_send(fc, req); + fuse_unlock_inode(inode, locked); + nbytes = req->out.args[0].size; + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) { + if (!nbytes) { + struct fuse_file *ff = file->private_data; + + if (ff->open_flags & FOPEN_CACHE_DIR) + fuse_readdir_cache_end(file, ctx->pos); + } else if (plus) { + err = parse_dirplusfile(page_address(page), nbytes, + file, ctx, attr_version); + } else { + err = parse_dirfile(page_address(page), nbytes, file, + ctx); + } + } + + __free_page(page); + fuse_invalidate_atime(inode); + return err; +} + +enum fuse_parse_result { + FOUND_ERR = -1, + FOUND_NONE = 0, + FOUND_SOME, + FOUND_ALL, +}; + +static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff, + void *addr, unsigned int size, + struct dir_context *ctx) +{ + unsigned int offset = ff->readdir.cache_off & ~PAGE_MASK; + enum fuse_parse_result res = FOUND_NONE; + + WARN_ON(offset >= size); + + for (;;) { + struct fuse_dirent *dirent = addr + offset; + unsigned int nbytes = size - offset; + size_t reclen = FUSE_DIRENT_SIZE(dirent); + + if (nbytes < FUSE_NAME_OFFSET || !dirent->namelen) + break; + + if (WARN_ON(dirent->namelen > FUSE_NAME_MAX)) + return FOUND_ERR; + if (WARN_ON(reclen > nbytes)) + return FOUND_ERR; + if (WARN_ON(memchr(dirent->name, '/', dirent->namelen) != NULL)) + return FOUND_ERR; + + if (ff->readdir.pos == ctx->pos) { + res = FOUND_SOME; + if (!dir_emit(ctx, dirent->name, dirent->namelen, + dirent->ino, dirent->type)) + return FOUND_ALL; + ctx->pos = dirent->off; + } + ff->readdir.pos = dirent->off; + ff->readdir.cache_off += reclen; + + offset += reclen; + } + + return res; +} + +static void fuse_rdc_reset(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + fi->rdc.cached = false; + fi->rdc.version++; + fi->rdc.size = 0; + fi->rdc.pos = 0; +} + +#define UNCACHED 1 + +static int fuse_readdir_cached(struct file *file, struct dir_context *ctx) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + enum fuse_parse_result res; + pgoff_t index; + unsigned int size; + struct page *page; + void *addr; + + /* Seeked? If so, reset the cache stream */ + if (ff->readdir.pos != ctx->pos) { + ff->readdir.pos = 0; + ff->readdir.cache_off = 0; + } + + /* + * We're just about to start reading into the cache or reading the + * cache; both cases require an up-to-date mtime value. + */ + if (!ctx->pos && fc->auto_inval_data) { + int err = fuse_update_attributes(inode, file); + + if (err) + return err; + } + +retry: + spin_lock(&fi->rdc.lock); +retry_locked: + if (!fi->rdc.cached) { + /* Starting cache? Set cache mtime. */ + if (!ctx->pos && !fi->rdc.size) { + fi->rdc.mtime = inode->i_mtime; + fi->rdc.iversion = inode_query_iversion(inode); + } + spin_unlock(&fi->rdc.lock); + return UNCACHED; + } + /* + * When at the beginning of the directory (i.e. just after opendir(3) or + * rewinddir(3)), then need to check whether directory contents have + * changed, and reset the cache if so. + */ + if (!ctx->pos) { + if (inode_peek_iversion(inode) != fi->rdc.iversion || + !timespec64_equal(&fi->rdc.mtime, &inode->i_mtime)) { + fuse_rdc_reset(inode); + goto retry_locked; + } + } + + /* + * If cache version changed since the last getdents() call, then reset + * the cache stream. + */ + if (ff->readdir.version != fi->rdc.version) { + ff->readdir.pos = 0; + ff->readdir.cache_off = 0; + } + /* + * If at the beginning of the cache, than reset version to + * current. + */ + if (ff->readdir.pos == 0) + ff->readdir.version = fi->rdc.version; + + WARN_ON(fi->rdc.size < ff->readdir.cache_off); + + index = ff->readdir.cache_off >> PAGE_SHIFT; + + if (index == (fi->rdc.size >> PAGE_SHIFT)) + size = fi->rdc.size & ~PAGE_MASK; + else + size = PAGE_SIZE; + spin_unlock(&fi->rdc.lock); + + /* EOF? */ + if ((ff->readdir.cache_off & ~PAGE_MASK) == size) + return 0; + + page = find_get_page_flags(file->f_mapping, index, + FGP_ACCESSED | FGP_LOCK); + spin_lock(&fi->rdc.lock); + if (!page) { + /* + * Uh-oh: page gone missing, cache is useless + */ + if (fi->rdc.version == ff->readdir.version) + fuse_rdc_reset(inode); + goto retry_locked; + } + + /* Make sure it's still the same version after getting the page. */ + if (ff->readdir.version != fi->rdc.version) { + spin_unlock(&fi->rdc.lock); + unlock_page(page); + put_page(page); + goto retry; + } + spin_unlock(&fi->rdc.lock); + + /* + * Contents of the page are now protected against changing by holding + * the page lock. + */ + addr = kmap(page); + res = fuse_parse_cache(ff, addr, size, ctx); + kunmap(page); + unlock_page(page); + put_page(page); + + if (res == FOUND_ERR) + return -EIO; + + if (res == FOUND_ALL) + return 0; + + if (size == PAGE_SIZE) { + /* We hit end of page: skip to next page. */ + ff->readdir.cache_off = ALIGN(ff->readdir.cache_off, PAGE_SIZE); + goto retry; + } + + /* + * End of cache reached. If found position, then we are done, otherwise + * need to fall back to uncached, since the position we were looking for + * wasn't in the cache. + */ + return res == FOUND_SOME ? 0 : UNCACHED; +} + +int fuse_readdir(struct file *file, struct dir_context *ctx) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + int err; + + if (is_bad_inode(inode)) + return -EIO; + + mutex_lock(&ff->readdir.lock); + + err = UNCACHED; + if (ff->open_flags & FOPEN_CACHE_DIR) + err = fuse_readdir_cached(file, ctx); + if (err == UNCACHED) + err = fuse_readdir_uncached(file, ctx); + + mutex_unlock(&ff->readdir.lock); + + return err; +} diff --git a/include/linux/bitops.h b/include/linux/bitops.h @@ -236,33 +236,33 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, #ifdef __KERNEL__ #ifndef set_mask_bits -#define set_mask_bits(ptr, _mask, _bits) \ +#define set_mask_bits(ptr, mask, bits) \ ({ \ - const typeof(*ptr) mask = (_mask), bits = (_bits); \ - typeof(*ptr) old, new; \ + const typeof(*(ptr)) mask__ = (mask), bits__ = (bits); \ + typeof(*(ptr)) old__, new__; \ \ do { \ - old = READ_ONCE(*ptr); \ - new = (old & ~mask) | bits; \ - } while (cmpxchg(ptr, old, new) != old); \ + old__ = READ_ONCE(*(ptr)); \ + new__ = (old__ & ~mask__) | bits__; \ + } while (cmpxchg(ptr, old__, new__) != old__); \ \ - new; \ + new__; \ }) #endif #ifndef bit_clear_unless -#define bit_clear_unless(ptr, _clear, _test) \ +#define bit_clear_unless(ptr, clear, test) \ ({ \ - const typeof(*ptr) clear = (_clear), test = (_test); \ - typeof(*ptr) old, new; \ + const typeof(*(ptr)) clear__ = (clear), test__ = (test);\ + typeof(*(ptr)) old__, new__; \ \ do { \ - old = READ_ONCE(*ptr); \ - new = old & ~clear; \ - } while (!(old & test) && \ - cmpxchg(ptr, old, new) != old); \ + old__ = READ_ONCE(*(ptr)); \ + new__ = old__ & ~clear__; \ + } while (!(old__ & test__) && \ + cmpxchg(ptr, old__, new__) != old__); \ \ - !(old & test); \ + !(old__ & test__); \ }) #endif diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h @@ -116,6 +116,12 @@ * * 7.27 * - add FUSE_ABORT_ERROR + * + * 7.28 + * - add FUSE_COPY_FILE_RANGE + * - add FOPEN_CACHE_DIR + * - add FUSE_MAX_PAGES, add max_pages to init_out + * - add FUSE_CACHE_SYMLINKS */ #ifndef _LINUX_FUSE_H @@ -151,7 +157,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 27 +#define FUSE_KERNEL_MINOR_VERSION 28 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -219,10 +225,12 @@ struct fuse_file_lock { * FOPEN_DIRECT_IO: bypass page cache for this open file * FOPEN_KEEP_CACHE: don't invalidate the data cache on open * FOPEN_NONSEEKABLE: the file is not seekable + * FOPEN_CACHE_DIR: allow caching this directory */ #define FOPEN_DIRECT_IO (1 << 0) #define FOPEN_KEEP_CACHE (1 << 1) #define FOPEN_NONSEEKABLE (1 << 2) +#define FOPEN_CACHE_DIR (1 << 3) /** * INIT request/reply flags @@ -249,6 +257,8 @@ struct fuse_file_lock { * FUSE_HANDLE_KILLPRIV: fs handles killing suid/sgid/cap on write/chown/trunc * FUSE_POSIX_ACL: filesystem supports posix acls * FUSE_ABORT_ERROR: reading the device after abort returns ECONNABORTED + * FUSE_MAX_PAGES: init_out.max_pages contains the max number of req pages + * FUSE_CACHE_SYMLINKS: cache READLINK responses */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -272,6 +282,8 @@ struct fuse_file_lock { #define FUSE_HANDLE_KILLPRIV (1 << 19) #define FUSE_POSIX_ACL (1 << 20) #define FUSE_ABORT_ERROR (1 << 21) +#define FUSE_MAX_PAGES (1 << 22) +#define FUSE_CACHE_SYMLINKS (1 << 23) /** * CUSE INIT request/reply flags @@ -337,53 +349,54 @@ struct fuse_file_lock { #define FUSE_POLL_SCHEDULE_NOTIFY (1 << 0) enum fuse_opcode { - FUSE_LOOKUP = 1, - FUSE_FORGET = 2, /* no reply */ - FUSE_GETATTR = 3, - FUSE_SETATTR = 4, - FUSE_READLINK = 5, - FUSE_SYMLINK = 6, - FUSE_MKNOD = 8, - FUSE_MKDIR = 9, - FUSE_UNLINK = 10, - FUSE_RMDIR = 11, - FUSE_RENAME = 12, - FUSE_LINK = 13, - FUSE_OPEN = 14, - FUSE_READ = 15, - FUSE_WRITE = 16, - FUSE_STATFS = 17, - FUSE_RELEASE = 18, - FUSE_FSYNC = 20, - FUSE_SETXATTR = 21, - FUSE_GETXATTR = 22, - FUSE_LISTXATTR = 23, - FUSE_REMOVEXATTR = 24, - FUSE_FLUSH = 25, - FUSE_INIT = 26, - FUSE_OPENDIR = 27, - FUSE_READDIR = 28, - FUSE_RELEASEDIR = 29, - FUSE_FSYNCDIR = 30, - FUSE_GETLK = 31, - FUSE_SETLK = 32, - FUSE_SETLKW = 33, - FUSE_ACCESS = 34, - FUSE_CREATE = 35, - FUSE_INTERRUPT = 36, - FUSE_BMAP = 37, - FUSE_DESTROY = 38, - FUSE_IOCTL = 39, - FUSE_POLL = 40, - FUSE_NOTIFY_REPLY = 41, - FUSE_BATCH_FORGET = 42, - FUSE_FALLOCATE = 43, - FUSE_READDIRPLUS = 44, - FUSE_RENAME2 = 45, - FUSE_LSEEK = 46, + FUSE_LOOKUP = 1, + FUSE_FORGET = 2, /* no reply */ + FUSE_GETATTR = 3, + FUSE_SETATTR = 4, + FUSE_READLINK = 5, + FUSE_SYMLINK = 6, + FUSE_MKNOD = 8, + FUSE_MKDIR = 9, + FUSE_UNLINK = 10, + FUSE_RMDIR = 11, + FUSE_RENAME = 12, + FUSE_LINK = 13, + FUSE_OPEN = 14, + FUSE_READ = 15, + FUSE_WRITE = 16, + FUSE_STATFS = 17, + FUSE_RELEASE = 18, + FUSE_FSYNC = 20, + FUSE_SETXATTR = 21, + FUSE_GETXATTR = 22, + FUSE_LISTXATTR = 23, + FUSE_REMOVEXATTR = 24, + FUSE_FLUSH = 25, + FUSE_INIT = 26, + FUSE_OPENDIR = 27, + FUSE_READDIR = 28, + FUSE_RELEASEDIR = 29, + FUSE_FSYNCDIR = 30, + FUSE_GETLK = 31, + FUSE_SETLK = 32, + FUSE_SETLKW = 33, + FUSE_ACCESS = 34, + FUSE_CREATE = 35, + FUSE_INTERRUPT = 36, + FUSE_BMAP = 37, + FUSE_DESTROY = 38, + FUSE_IOCTL = 39, + FUSE_POLL = 40, + FUSE_NOTIFY_REPLY = 41, + FUSE_BATCH_FORGET = 42, + FUSE_FALLOCATE = 43, + FUSE_READDIRPLUS = 44, + FUSE_RENAME2 = 45, + FUSE_LSEEK = 46, + FUSE_COPY_FILE_RANGE = 47, /* CUSE specific operations */ - CUSE_INIT = 4096, + CUSE_INIT = 4096, }; enum fuse_notify_code { @@ -610,7 +623,9 @@ struct fuse_init_out { uint16_t congestion_threshold; uint32_t max_write; uint32_t time_gran; - uint32_t unused[9]; + uint16_t max_pages; + uint16_t padding; + uint32_t unused[8]; }; #define CUSE_INIT_INFO_MAX 4096 @@ -792,4 +807,14 @@ struct fuse_lseek_out { uint64_t offset; }; +struct fuse_copy_file_range_in { + uint64_t fh_in; + uint64_t off_in; + uint64_t nodeid_out; + uint64_t fh_out; + uint64_t off_out; + uint64_t len; + uint64_t flags; +}; + #endif /* _LINUX_FUSE_H */