whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 6b3a707736301c2128ca85ce85fb13f60b5e350a
parent 4443f8e6ac7755cd775c70d08be8042dc2f936cb
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Sun, 14 Apr 2019 15:09:40 -0700

Merge branch 'page-refs' (page ref overflow)

Merge page ref overflow branch.

Jann Horn reported that he can overflow the page ref count with
sufficient memory (and a filesystem that is intentionally extremely
slow).

Admittedly it's not exactly easy.  To have more than four billion
references to a page requires a minimum of 32GB of kernel memory just
for the pointers to the pages, much less any metadata to keep track of
those pointers.  Jann needed a total of 140GB of memory and a specially
crafted filesystem that leaves all reads pending (in order to not ever
free the page references and just keep adding more).

Still, we have a fairly straightforward way to limit the two obvious
user-controllable sources of page references: direct-IO like page
references gotten through get_user_pages(), and the splice pipe page
duplication.  So let's just do that.

* branch page-refs:
  fs: prevent page refcount overflow in pipe_buf_get
  mm: prevent get_user_pages() from overflowing page refcount
  mm: add 'try_get_page()' helper function
  mm: make page ref count overflow check tighter and more explicit

Diffstat:
Mfs/fuse/dev.c | 12++++++------
Mfs/pipe.c | 4++--
Mfs/splice.c | 12++++++++++--
Minclude/linux/mm.h | 15++++++++++++++-
Minclude/linux/pipe_fs_i.h | 10++++++----
Mkernel/trace/trace.c | 6+++++-
Mmm/gup.c | 48++++++++++++++++++++++++++++++++++++------------
Mmm/hugetlb.c | 13+++++++++++++
8 files changed, 92 insertions(+), 28 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c @@ -2056,10 +2056,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; ret = -EINVAL; - if (rem < len) { - pipe_unlock(pipe); - goto out; - } + if (rem < len) + goto out_free; rem = len; while (rem) { @@ -2077,7 +2075,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; } else { - pipe_buf_get(pipe, ibuf); + if (!pipe_buf_get(pipe, ibuf)) + goto out_free; + *obuf = *ibuf; obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->len = rem; @@ -2100,11 +2100,11 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, ret = fuse_dev_do_write(fud, &cs, len); pipe_lock(pipe); +out_free: for (idx = 0; idx < nbuf; idx++) pipe_buf_release(pipe, &bufs[idx]); pipe_unlock(pipe); -out: kvfree(bufs); return ret; } diff --git a/fs/pipe.c b/fs/pipe.c @@ -188,9 +188,9 @@ EXPORT_SYMBOL(generic_pipe_buf_steal); * in the tee() system call, when we duplicate the buffers in one * pipe into another. */ -void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) +bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - get_page(buf->page); + return try_get_page(buf->page); } EXPORT_SYMBOL(generic_pipe_buf_get); diff --git a/fs/splice.c b/fs/splice.c @@ -1593,7 +1593,11 @@ retry: * Get a reference to this pipe buffer, * so we can copy the contents over. */ - pipe_buf_get(ipipe, ibuf); + if (!pipe_buf_get(ipipe, ibuf)) { + if (ret == 0) + ret = -EFAULT; + break; + } *obuf = *ibuf; /* @@ -1667,7 +1671,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, * Get a reference to this pipe buffer, * so we can copy the contents over. */ - pipe_buf_get(ipipe, ibuf); + if (!pipe_buf_get(ipipe, ibuf)) { + if (ret == 0) + ret = -EFAULT; + break; + } obuf = opipe->bufs + nbuf; *obuf = *ibuf; diff --git a/include/linux/mm.h b/include/linux/mm.h @@ -966,6 +966,10 @@ static inline bool is_pci_p2pdma_page(const struct page *page) } #endif /* CONFIG_DEV_PAGEMAP_OPS */ +/* 127: arbitrary random number, small enough to assemble well */ +#define page_ref_zero_or_close_to_overflow(page) \ + ((unsigned int) page_ref_count(page) + 127u <= 127u) + static inline void get_page(struct page *page) { page = compound_head(page); @@ -973,8 +977,17 @@ static inline void get_page(struct page *page) * Getting a normal page or the head of a compound page * requires to already have an elevated page->_refcount. */ - VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); + VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page); + page_ref_inc(page); +} + +static inline __must_check bool try_get_page(struct page *page) +{ + page = compound_head(page); + if (WARN_ON_ONCE(page_ref_count(page) <= 0)) + return false; page_ref_inc(page); + return true; } static inline void put_page(struct page *page) diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h @@ -101,18 +101,20 @@ struct pipe_buf_operations { /* * Get a reference to the pipe buffer. */ - void (*get)(struct pipe_inode_info *, struct pipe_buffer *); + bool (*get)(struct pipe_inode_info *, struct pipe_buffer *); }; /** * pipe_buf_get - get a reference to a pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to get a reference to + * + * Return: %true if the reference was successfully obtained. */ -static inline void pipe_buf_get(struct pipe_inode_info *pipe, +static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - buf->ops->get(pipe, buf); + return buf->ops->get(pipe, buf); } /** @@ -171,7 +173,7 @@ struct pipe_inode_info *alloc_pipe_info(void); void free_pipe_info(struct pipe_inode_info *); /* Generic pipe buffer ops functions */ -void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); +bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c @@ -7041,12 +7041,16 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, buf->private = 0; } -static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, +static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; + if (ref->ref > INT_MAX/2) + return false; + ref->ref++; + return true; } /* Pipe buffer operations for a buffer. */ diff --git a/mm/gup.c b/mm/gup.c @@ -160,8 +160,12 @@ retry: goto retry; } - if (flags & FOLL_GET) - get_page(page); + if (flags & FOLL_GET) { + if (unlikely(!try_get_page(page))) { + page = ERR_PTR(-ENOMEM); + goto out; + } + } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) @@ -298,7 +302,10 @@ retry_locked: if (pmd_trans_unstable(pmd)) ret = -EBUSY; } else { - get_page(page); + if (unlikely(!try_get_page(page))) { + spin_unlock(ptl); + return ERR_PTR(-ENOMEM); + } spin_unlock(ptl); lock_page(page); ret = split_huge_page(page); @@ -500,7 +507,10 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, if (is_device_public_page(*page)) goto unmap; } - get_page(*page); + if (unlikely(!try_get_page(*page))) { + ret = -ENOMEM; + goto unmap; + } out: ret = 0; unmap: @@ -1545,6 +1555,20 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) } } +/* + * Return the compund head page with ref appropriately incremented, + * or NULL if that failed. + */ +static inline struct page *try_get_compound_head(struct page *page, int refs) +{ + struct page *head = compound_head(page); + if (WARN_ON_ONCE(page_ref_count(head) < 0)) + return NULL; + if (unlikely(!page_cache_add_speculative(head, refs))) + return NULL; + return head; +} + #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) @@ -1579,9 +1603,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); - head = compound_head(page); - if (!page_cache_get_speculative(head)) + head = try_get_compound_head(page, 1); + if (!head) goto pte_unmap; if (unlikely(pte_val(pte) != pte_val(*ptep))) { @@ -1720,8 +1744,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); - head = compound_head(pmd_page(orig)); - if (!page_cache_add_speculative(head, refs)) { + head = try_get_compound_head(pmd_page(orig), refs); + if (!head) { *nr -= refs; return 0; } @@ -1758,8 +1782,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); - head = compound_head(pud_page(orig)); - if (!page_cache_add_speculative(head, refs)) { + head = try_get_compound_head(pud_page(orig), refs); + if (!head) { *nr -= refs; return 0; } @@ -1795,8 +1819,8 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); - head = compound_head(pgd_page(orig)); - if (!page_cache_add_speculative(head, refs)) { + head = try_get_compound_head(pgd_page(orig), refs); + if (!head) { *nr -= refs; return 0; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c @@ -4299,6 +4299,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; page = pte_page(huge_ptep_get(pte)); + + /* + * Instead of doing 'try_get_page()' below in the same_page + * loop, just check the count once here. + */ + if (unlikely(page_count(page) <= 0)) { + if (pages) { + spin_unlock(ptl); + remainder = 0; + err = -ENOMEM; + break; + } + } same_page: if (pages) { pages[i] = mem_map_offset(page, pfn_offset);