whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit d8f190ee836a4581ba906731835d735cb97948f5
parent 6c7954b7eb76578866eba179709c5883f29f747f
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Fri, 30 Nov 2018 18:45:49 -0800

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
 "31 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (31 commits)
  ocfs2: fix potential use after free
  mm/khugepaged: fix the xas_create_range() error path
  mm/khugepaged: collapse_shmem() do not crash on Compound
  mm/khugepaged: collapse_shmem() without freezing new_page
  mm/khugepaged: minor reorderings in collapse_shmem()
  mm/khugepaged: collapse_shmem() remember to clear holes
  mm/khugepaged: fix crashes due to misaccounted holes
  mm/khugepaged: collapse_shmem() stop if punched or truncated
  mm/huge_memory: fix lockdep complaint on 32-bit i_size_read()
  mm/huge_memory: splitting set mapping+index before unfreeze
  mm/huge_memory: rename freeze_page() to unmap_page()
  initramfs: clean old path before creating a hardlink
  kernel/kcov.c: mark funcs in __sanitizer_cov_trace_pc() as notrace
  psi: make disabling/enabling easier for vendor kernels
  proc: fixup map_files test on arm
  debugobjects: avoid recursive calls with kmemleak
  userfaultfd: shmem: UFFDIO_COPY: set the page dirty if VM_WRITE is not set
  userfaultfd: shmem: add i_size checks
  userfaultfd: shmem/hugetlbfs: only allow to register VM_MAYWRITE vmas
  userfaultfd: shmem: allocate anonymous memory for MAP_PRIVATE shmem
  ...

Diffstat:
MDocumentation/admin-guide/kernel-parameters.txt | 4++++
MMAINTAINERS | 13+++++++------
Mfs/hfs/btree.c | 3++-
Mfs/hfsplus/btree.c | 3++-
Mfs/ocfs2/export.c | 2+-
Mfs/ocfs2/move_extents.c | 47++++++++++++++++++++++++++---------------------
Mfs/userfaultfd.c | 15+++++++++++++++
Minclude/linux/psi.h | 3++-
Minit/Kconfig | 9+++++++++
Minit/initramfs.c | 22++++++++++++----------
Mkernel/kcov.c | 4++--
Mkernel/sched/psi.c | 30+++++++++++++++++++++---------
Mkernel/sched/stats.h | 8++++----
Mlib/debugobjects.c | 5++---
Mlib/test_kmod.c | 1-
Mmm/gup.c | 3+--
Mmm/huge_memory.c | 43+++++++++++++++++++++++++------------------
Mmm/hugetlb.c | 2+-
Mmm/khugepaged.c | 140++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Mmm/page_alloc.c | 4+++-
Mmm/rmap.c | 13+++----------
Mmm/shmem.c | 43+++++++++++++++++++++++++++++++++++++------
Mmm/truncate.c | 8++++++--
Mmm/userfaultfd.c | 62++++++++++++++++++++++++++++++++++++++++++++++----------------
Mtools/testing/selftests/proc/proc-self-map-files-002.c | 9+++++++--
25 files changed, 319 insertions(+), 177 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt @@ -3505,6 +3505,10 @@ before loading. See Documentation/blockdev/ramdisk.txt. + psi= [KNL] Enable or disable pressure stall information + tracking. + Format: <bool> + psmouse.proto= [HW,MOUSE] Highest PS2 mouse protocol extension to probe for; one of (bare|imps|exps|lifebook|any). psmouse.rate= [HW,MOUSE] Set desired mouse report rate, in reports diff --git a/MAINTAINERS b/MAINTAINERS @@ -2491,7 +2491,7 @@ F: drivers/net/wireless/ath/* ATHEROS ATH5K WIRELESS DRIVER M: Jiri Slaby <jirislaby@gmail.com> M: Nick Kossifidis <mickflemm@gmail.com> -M: "Luis R. Rodriguez" <mcgrof@do-not-panic.com> +M: Luis Chamberlain <mcgrof@kernel.org> L: linux-wireless@vger.kernel.org W: http://wireless.kernel.org/en/users/Drivers/ath5k S: Maintained @@ -5835,7 +5835,7 @@ F: include/uapi/linux/firewire*.h F: tools/firewire/ FIRMWARE LOADER (request_firmware) -M: Luis R. Rodriguez <mcgrof@kernel.org> +M: Luis Chamberlain <mcgrof@kernel.org> L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/firmware_class/ @@ -8135,7 +8135,7 @@ F: tools/testing/selftests/ F: Documentation/dev-tools/kselftest* KERNEL USERMODE HELPER -M: "Luis R. Rodriguez" <mcgrof@kernel.org> +M: Luis Chamberlain <mcgrof@kernel.org> L: linux-kernel@vger.kernel.org S: Maintained F: kernel/umh.c @@ -8311,7 +8311,7 @@ F: mm/kmemleak.c F: mm/kmemleak-test.c KMOD KERNEL MODULE LOADER - USERMODE HELPER -M: "Luis R. Rodriguez" <mcgrof@kernel.org> +M: Luis Chamberlain <mcgrof@kernel.org> L: linux-kernel@vger.kernel.org S: Maintained F: kernel/kmod.c @@ -12061,7 +12061,7 @@ F: kernel/printk/ F: include/linux/printk.h PRISM54 WIRELESS DRIVER -M: "Luis R. Rodriguez" <mcgrof@gmail.com> +M: Luis Chamberlain <mcgrof@kernel.org> L: linux-wireless@vger.kernel.org W: http://wireless.kernel.org/en/users/Drivers/p54 S: Obsolete @@ -12075,9 +12075,10 @@ S: Maintained F: fs/proc/ F: include/linux/proc_fs.h F: tools/testing/selftests/proc/ +F: Documentation/filesystems/proc.txt PROC SYSCTL -M: "Luis R. Rodriguez" <mcgrof@kernel.org> +M: Luis Chamberlain <mcgrof@kernel.org> M: Kees Cook <keescook@chromium.org> L: linux-kernel@vger.kernel.org L: linux-fsdevel@vger.kernel.org diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c @@ -338,13 +338,14 @@ void hfs_bmap_free(struct hfs_bnode *node) nidx -= len * 8; i = node->next; - hfs_bnode_put(node); if (!i) { /* panic */; pr_crit("unable to free bnode %u. bmap not found!\n", node->this); + hfs_bnode_put(node); return; } + hfs_bnode_put(node); node = hfs_bnode_find(tree, i); if (IS_ERR(node)) return; diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c @@ -466,14 +466,15 @@ void hfs_bmap_free(struct hfs_bnode *node) nidx -= len * 8; i = node->next; - hfs_bnode_put(node); if (!i) { /* panic */; pr_crit("unable to free bnode %u. " "bmap not found!\n", node->this); + hfs_bnode_put(node); return; } + hfs_bnode_put(node); node = hfs_bnode_find(tree, i); if (IS_ERR(node)) return; diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c @@ -125,10 +125,10 @@ check_err: check_gen: if (handle->ih_generation != inode->i_generation) { - iput(inode); trace_ocfs2_get_dentry_generation((unsigned long long)blkno, handle->ih_generation, inode->i_generation); + iput(inode); result = ERR_PTR(-ESTALE); goto bail; } diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c @@ -157,18 +157,14 @@ out: } /* - * lock allocators, and reserving appropriate number of bits for - * meta blocks and data clusters. - * - * in some cases, we don't need to reserve clusters, just let data_ac - * be NULL. + * lock allocator, and reserve appropriate number of bits for + * meta blocks. */ -static int ocfs2_lock_allocators_move_extents(struct inode *inode, +static int ocfs2_lock_meta_allocator_move_extents(struct inode *inode, struct ocfs2_extent_tree *et, u32 clusters_to_move, u32 extents_to_split, struct ocfs2_alloc_context **meta_ac, - struct ocfs2_alloc_context **data_ac, int extra_blocks, int *credits) { @@ -193,13 +189,6 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode, goto out; } - if (data_ac) { - ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); - if (ret) { - mlog_errno(ret); - goto out; - } - } *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el); @@ -259,10 +248,10 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, } } - ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1, - &context->meta_ac, - &context->data_ac, - extra_blocks, &credits); + ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et, + *len, 1, + &context->meta_ac, + extra_blocks, &credits); if (ret) { mlog_errno(ret); goto out; @@ -285,6 +274,21 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, } } + /* + * Make sure ocfs2_reserve_cluster is called after + * __ocfs2_flush_truncate_log, otherwise, dead lock may happen. + * + * If ocfs2_reserve_cluster is called + * before __ocfs2_flush_truncate_log, dead lock on global bitmap + * may happen. + * + */ + ret = ocfs2_reserve_clusters(osb, *len, &context->data_ac); + if (ret) { + mlog_errno(ret); + goto out_unlock_mutex; + } + handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -617,9 +621,10 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, } } - ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1, - &context->meta_ac, - NULL, extra_blocks, &credits); + ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et, + len, 1, + &context->meta_ac, + extra_blocks, &credits); if (ret) { mlog_errno(ret); goto out; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c @@ -1361,6 +1361,19 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ret = -EINVAL; if (!vma_can_userfault(cur)) goto out_unlock; + + /* + * UFFDIO_COPY will fill file holes even without + * PROT_WRITE. This check enforces that if this is a + * MAP_SHARED, the process has write permission to the backing + * file. If VM_MAYWRITE is set it also enforces that on a + * MAP_SHARED vma: there is no F_WRITE_SEAL and no further + * F_WRITE_SEAL can be taken until the vma is destroyed. + */ + ret = -EPERM; + if (unlikely(!(cur->vm_flags & VM_MAYWRITE))) + goto out_unlock; + /* * If this vma contains ending address, and huge pages * check alignment. @@ -1406,6 +1419,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, BUG_ON(!vma_can_userfault(vma)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); + WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); /* * Nothing to do: this vma is already registered into this @@ -1552,6 +1566,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, cond_resched(); BUG_ON(!vma_can_userfault(vma)); + WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); /* * Nothing to do: this vma is already registered into this diff --git a/include/linux/psi.h b/include/linux/psi.h @@ -1,6 +1,7 @@ #ifndef _LINUX_PSI_H #define _LINUX_PSI_H +#include <linux/jump_label.h> #include <linux/psi_types.h> #include <linux/sched.h> @@ -9,7 +10,7 @@ struct css_set; #ifdef CONFIG_PSI -extern bool psi_disabled; +extern struct static_key_false psi_disabled; void psi_init(void); diff --git a/init/Kconfig b/init/Kconfig @@ -509,6 +509,15 @@ config PSI Say N if unsure. +config PSI_DEFAULT_DISABLED + bool "Require boot parameter to enable pressure stall information tracking" + default n + depends on PSI + help + If set, pressure stall information tracking will be disabled + per default but can be enabled through passing psi_enable=1 + on the kernel commandline during boot. + endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION diff --git a/init/initramfs.c b/init/initramfs.c @@ -291,16 +291,6 @@ static int __init do_reset(void) return 1; } -static int __init maybe_link(void) -{ - if (nlink >= 2) { - char *old = find_link(major, minor, ino, mode, collected); - if (old) - return (ksys_link(old, collected) < 0) ? -1 : 1; - } - return 0; -} - static void __init clean_path(char *path, umode_t fmode) { struct kstat st; @@ -313,6 +303,18 @@ static void __init clean_path(char *path, umode_t fmode) } } +static int __init maybe_link(void) +{ + if (nlink >= 2) { + char *old = find_link(major, minor, ino, mode, collected); + if (old) { + clean_path(collected, 0); + return (ksys_link(old, collected) < 0) ? -1 : 1; + } + } + return 0; +} + static __initdata int wfd; static int __init do_name(void) diff --git a/kernel/kcov.c b/kernel/kcov.c @@ -56,7 +56,7 @@ struct kcov { struct task_struct *t; }; -static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) +static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) { unsigned int mode; @@ -78,7 +78,7 @@ static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) return mode == needed_mode; } -static unsigned long canonicalize_ip(unsigned long ip) +static notrace unsigned long canonicalize_ip(unsigned long ip) { #ifdef CONFIG_RANDOMIZE_BASE ip -= kaslr_offset(); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c @@ -136,8 +136,18 @@ static int psi_bug __read_mostly; -bool psi_disabled __read_mostly; -core_param(psi_disabled, psi_disabled, bool, 0644); +DEFINE_STATIC_KEY_FALSE(psi_disabled); + +#ifdef CONFIG_PSI_DEFAULT_DISABLED +bool psi_enable; +#else +bool psi_enable = true; +#endif +static int __init setup_psi(char *str) +{ + return kstrtobool(str, &psi_enable) == 0; +} +__setup("psi=", setup_psi); /* Running averages - we need to be higher-res than loadavg */ #define PSI_FREQ (2*HZ+1) /* 2 sec intervals */ @@ -169,8 +179,10 @@ static void group_init(struct psi_group *group) void __init psi_init(void) { - if (psi_disabled) + if (!psi_enable) { + static_branch_enable(&psi_disabled); return; + } psi_period = jiffies_to_nsecs(PSI_FREQ); group_init(&psi_system); @@ -549,7 +561,7 @@ void psi_memstall_enter(unsigned long *flags) struct rq_flags rf; struct rq *rq; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; *flags = current->flags & PF_MEMSTALL; @@ -579,7 +591,7 @@ void psi_memstall_leave(unsigned long *flags) struct rq_flags rf; struct rq *rq; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (*flags) @@ -600,7 +612,7 @@ void psi_memstall_leave(unsigned long *flags) #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return 0; cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); @@ -612,7 +624,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup) void psi_cgroup_free(struct cgroup *cgroup) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; cancel_delayed_work_sync(&cgroup->psi.clock_work); @@ -637,7 +649,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) struct rq_flags rf; struct rq *rq; - if (psi_disabled) { + if (static_branch_likely(&psi_disabled)) { /* * Lame to do this here, but the scheduler cannot be locked * from the outside, so we move cgroups from inside sched/. @@ -673,7 +685,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; update_stats(group); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h @@ -66,7 +66,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) { int clear = 0, set = TSK_RUNNING; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (!wakeup || p->sched_psi_wake_requeue) { @@ -86,7 +86,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) { int clear = TSK_RUNNING, set = 0; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (!sleep) { @@ -102,7 +102,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) static inline void psi_ttwu_dequeue(struct task_struct *p) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; /* * Is the task being migrated during a wakeup? Make sure to @@ -128,7 +128,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) static inline void psi_task_tick(struct rq *rq) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (unlikely(rq->curr->flags & PF_MEMSTALL)) diff --git a/lib/debugobjects.c b/lib/debugobjects.c @@ -135,7 +135,6 @@ static void fill_pool(void) if (!new) return; - kmemleak_ignore(new); raw_spin_lock_irqsave(&pool_lock, flags); hlist_add_head(&new->node, &obj_pool); debug_objects_allocated++; @@ -1128,7 +1127,6 @@ static int __init debug_objects_replace_static_objects(void) obj = kmem_cache_zalloc(obj_cache, GFP_KERNEL); if (!obj) goto free; - kmemleak_ignore(obj); hlist_add_head(&obj->node, &objects); } @@ -1184,7 +1182,8 @@ void __init debug_objects_mem_init(void) obj_cache = kmem_cache_create("debug_objects_cache", sizeof (struct debug_obj), 0, - SLAB_DEBUG_OBJECTS, NULL); + SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE, + NULL); if (!obj_cache || debug_objects_replace_static_objects()) { debug_objects_enabled = 0; diff --git a/lib/test_kmod.c b/lib/test_kmod.c @@ -1214,7 +1214,6 @@ void unregister_test_dev_kmod(struct kmod_test_device *test_dev) dev_info(test_dev->dev, "removing interface\n"); misc_deregister(&test_dev->misc_dev); - kfree(&test_dev->misc_dev.name); mutex_unlock(&test_dev->config_mutex); mutex_unlock(&test_dev->trigger_mutex); diff --git a/mm/gup.c b/mm/gup.c @@ -702,12 +702,11 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (!vma || start >= vma->vm_end) { vma = find_extend_vma(mm, start); if (!vma && in_gate_area(mm, start)) { - int ret; ret = get_gate_page(mm, start & PAGE_MASK, gup_flags, &vma, pages ? &pages[i] : NULL); if (ret) - return i ? : ret; + goto out; ctx.page_mask = 0; goto next_page; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c @@ -2350,7 +2350,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, } } -static void freeze_page(struct page *page) +static void unmap_page(struct page *page) { enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; @@ -2365,7 +2365,7 @@ static void freeze_page(struct page *page) VM_BUG_ON_PAGE(!unmap_success, page); } -static void unfreeze_page(struct page *page) +static void remap_page(struct page *page) { int i; if (PageTransHuge(page)) { @@ -2402,6 +2402,12 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_unevictable) | (1L << PG_dirty))); + /* ->mapping in first tail page is compound_mapcount */ + VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, + page_tail); + page_tail->mapping = head->mapping; + page_tail->index = head->index + tail; + /* Page flags must be visible before we make the page non-compound. */ smp_wmb(); @@ -2422,12 +2428,6 @@ static void __split_huge_page_tail(struct page *head, int tail, if (page_is_idle(head)) set_page_idle(page_tail); - /* ->mapping in first tail page is compound_mapcount */ - VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, - page_tail); - page_tail->mapping = head->mapping; - - page_tail->index = head->index + tail; page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); /* @@ -2439,12 +2439,11 @@ static void __split_huge_page_tail(struct page *head, int tail, } static void __split_huge_page(struct page *page, struct list_head *list, - unsigned long flags) + pgoff_t end, unsigned long flags) { struct page *head = compound_head(page); struct zone *zone = page_zone(head); struct lruvec *lruvec; - pgoff_t end = -1; int i; lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat); @@ -2452,9 +2451,6 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); - if (!PageAnon(page)) - end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE); - for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); /* Some pages can be beyond i_size: drop them from page cache */ @@ -2483,7 +2479,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); - unfreeze_page(head); + remap_page(head); for (i = 0; i < HPAGE_PMD_NR; i++) { struct page *subpage = head + i; @@ -2626,6 +2622,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) int count, mapcount, extra_pins, ret; bool mlocked; unsigned long flags; + pgoff_t end; VM_BUG_ON_PAGE(is_huge_zero_page(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -2648,6 +2645,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) ret = -EBUSY; goto out; } + end = -1; mapping = NULL; anon_vma_lock_write(anon_vma); } else { @@ -2661,10 +2659,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) anon_vma = NULL; i_mmap_lock_read(mapping); + + /* + *__split_huge_page() may need to trim off pages beyond EOF: + * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, + * which cannot be nested inside the page tree lock. So note + * end now: i_size itself may be changed at any moment, but + * head page lock is good enough to serialize the trimming. + */ + end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); } /* - * Racy check if we can split the page, before freeze_page() will + * Racy check if we can split the page, before unmap_page() will * split PMDs */ if (!can_split_huge_page(head, &extra_pins)) { @@ -2673,7 +2680,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } mlocked = PageMlocked(page); - freeze_page(head); + unmap_page(head); VM_BUG_ON_PAGE(compound_mapcount(head), head); /* Make sure the page is not on per-CPU pagevec as it takes pin */ @@ -2707,7 +2714,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mapping) __dec_node_page_state(page, NR_SHMEM_THPS); spin_unlock(&pgdata->split_queue_lock); - __split_huge_page(page, list, flags); + __split_huge_page(page, list, end, flags); if (PageSwapCache(head)) { swp_entry_t entry = { .val = page_private(head) }; @@ -2727,7 +2734,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) fail: if (mapping) xa_unlock(&mapping->i_pages); spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); - unfreeze_page(head); + remap_page(head); ret = -EBUSY; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c @@ -4080,7 +4080,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, /* fallback to copy_from_user outside mmap_sem */ if (unlikely(ret)) { - ret = -EFAULT; + ret = -ENOENT; *pagep = page; /* don't free the page */ goto out; diff --git a/mm/khugepaged.c b/mm/khugepaged.c @@ -1287,7 +1287,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * collapse_shmem - collapse small tmpfs/shmem pages into huge one. * * Basic scheme is simple, details are more complex: - * - allocate and freeze a new huge page; + * - allocate and lock a new huge page; * - scan page cache replacing old pages with the new one * + swap in pages if necessary; * + fill in gaps; @@ -1295,11 +1295,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * - if replacing succeeds: * + copy data over; * + free old pages; - * + unfreeze huge page; + * + unlock huge page; * - if replacing failed; * + put all pages back and unfreeze them; * + restore gaps in the page cache; - * + free huge page; + * + unlock and free huge page; */ static void collapse_shmem(struct mm_struct *mm, struct address_space *mapping, pgoff_t start, @@ -1329,19 +1329,6 @@ static void collapse_shmem(struct mm_struct *mm, goto out; } - new_page->index = start; - new_page->mapping = mapping; - __SetPageSwapBacked(new_page); - __SetPageLocked(new_page); - BUG_ON(!page_ref_freeze(new_page, 1)); - - /* - * At this point the new_page is 'frozen' (page_count() is zero), - * locked and not up-to-date. It's safe to insert it into the page - * cache, because nobody would be able to map it or use it in other - * way until we unfreeze it. - */ - /* This will be less messy when we use multi-index entries */ do { xas_lock_irq(&xas); @@ -1349,19 +1336,44 @@ static void collapse_shmem(struct mm_struct *mm, if (!xas_error(&xas)) break; xas_unlock_irq(&xas); - if (!xas_nomem(&xas, GFP_KERNEL)) + if (!xas_nomem(&xas, GFP_KERNEL)) { + mem_cgroup_cancel_charge(new_page, memcg, true); + result = SCAN_FAIL; goto out; + } } while (1); + __SetPageLocked(new_page); + __SetPageSwapBacked(new_page); + new_page->index = start; + new_page->mapping = mapping; + + /* + * At this point the new_page is locked and not up-to-date. + * It's safe to insert it into the page cache, because nobody would + * be able to map it or use it in another way until we unlock it. + */ + xas_set(&xas, start); for (index = start; index < end; index++) { struct page *page = xas_next(&xas); VM_BUG_ON(index != xas.xa_index); if (!page) { + /* + * Stop if extent has been truncated or hole-punched, + * and is now completely empty. + */ + if (index == start) { + if (!xas_next_entry(&xas, end - 1)) { + result = SCAN_TRUNCATED; + goto xa_locked; + } + xas_set(&xas, index); + } if (!shmem_charge(mapping->host, 1)) { result = SCAN_FAIL; - break; + goto xa_locked; } xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); nr_none++; @@ -1376,13 +1388,12 @@ static void collapse_shmem(struct mm_struct *mm, result = SCAN_FAIL; goto xa_unlocked; } - xas_lock_irq(&xas); - xas_set(&xas, index); } else if (trylock_page(page)) { get_page(page); + xas_unlock_irq(&xas); } else { result = SCAN_PAGE_LOCK; - break; + goto xa_locked; } /* @@ -1391,17 +1402,24 @@ static void collapse_shmem(struct mm_struct *mm, */ VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageUptodate(page), page); - VM_BUG_ON_PAGE(PageTransCompound(page), page); + + /* + * If file was truncated then extended, or hole-punched, before + * we locked the first page, then a THP might be there already. + */ + if (PageTransCompound(page)) { + result = SCAN_PAGE_COMPOUND; + goto out_unlock; + } if (page_mapping(page) != mapping) { result = SCAN_TRUNCATED; goto out_unlock; } - xas_unlock_irq(&xas); if (isolate_lru_page(page)) { result = SCAN_DEL_PAGE_LRU; - goto out_isolate_failed; + goto out_unlock; } if (page_mapped(page)) @@ -1421,7 +1439,9 @@ static void collapse_shmem(struct mm_struct *mm, */ if (!page_ref_freeze(page, 3)) { result = SCAN_PAGE_COUNT; - goto out_lru; + xas_unlock_irq(&xas); + putback_lru_page(page); + goto out_unlock; } /* @@ -1433,71 +1453,74 @@ static void collapse_shmem(struct mm_struct *mm, /* Finally, replace with the new page. */ xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); continue; -out_lru: - xas_unlock_irq(&xas); - putback_lru_page(page); -out_isolate_failed: - unlock_page(page); - put_page(page); - goto xa_unlocked; out_unlock: unlock_page(page); put_page(page); - break; + goto xa_unlocked; } - xas_unlock_irq(&xas); + __inc_node_page_state(new_page, NR_SHMEM_THPS); + if (nr_none) { + struct zone *zone = page_zone(new_page); + + __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); + __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); + } + +xa_locked: + xas_unlock_irq(&xas); xa_unlocked: + if (result == SCAN_SUCCEED) { struct page *page, *tmp; - struct zone *zone = page_zone(new_page); /* * Replacing old pages with new one has succeeded, now we * need to copy the content and free the old pages. */ + index = start; list_for_each_entry_safe(page, tmp, &pagelist, lru) { + while (index < page->index) { + clear_highpage(new_page + (index % HPAGE_PMD_NR)); + index++; + } copy_highpage(new_page + (page->index % HPAGE_PMD_NR), page); list_del(&page->lru); - unlock_page(page); - page_ref_unfreeze(page, 1); page->mapping = NULL; + page_ref_unfreeze(page, 1); ClearPageActive(page); ClearPageUnevictable(page); + unlock_page(page); put_page(page); + index++; } - - local_irq_disable(); - __inc_node_page_state(new_page, NR_SHMEM_THPS); - if (nr_none) { - __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); - __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); + while (index < end) { + clear_highpage(new_page + (index % HPAGE_PMD_NR)); + index++; } - local_irq_enable(); - /* - * Remove pte page tables, so we can re-fault - * the page as huge. - */ - retract_page_tables(mapping, start); - - /* Everything is ready, let's unfreeze the new_page */ - set_page_dirty(new_page); SetPageUptodate(new_page); - page_ref_unfreeze(new_page, HPAGE_PMD_NR); + page_ref_add(new_page, HPAGE_PMD_NR - 1); + set_page_dirty(new_page); mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_anon(new_page); - unlock_page(new_page); + /* + * Remove pte page tables, so we can re-fault the page as huge. + */ + retract_page_tables(mapping, start); *hpage = NULL; khugepaged_pages_collapsed++; } else { struct page *page; + /* Something went wrong: roll back page cache changes */ - shmem_uncharge(mapping->host, nr_none); xas_lock_irq(&xas); + mapping->nrpages -= nr_none; + shmem_uncharge(mapping->host, nr_none); + xas_set(&xas, start); xas_for_each(&xas, page, end - 1) { page = list_first_entry_or_null(&pagelist, @@ -1519,19 +1542,18 @@ xa_unlocked: xas_store(&xas, page); xas_pause(&xas); xas_unlock_irq(&xas); - putback_lru_page(page); unlock_page(page); + putback_lru_page(page); xas_lock_irq(&xas); } VM_BUG_ON(nr_none); xas_unlock_irq(&xas); - /* Unfreeze new_page, caller would take care about freeing it */ - page_ref_unfreeze(new_page, 1); mem_cgroup_cancel_charge(new_page, memcg, true); - unlock_page(new_page); new_page->mapping = NULL; } + + unlock_page(new_page); out: VM_BUG_ON(!list_empty(&pagelist)); /* TODO: tracepoints */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c @@ -5813,8 +5813,10 @@ void __meminit init_currently_empty_zone(struct zone *zone, unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; + int zone_idx = zone_idx(zone) + 1; - pgdat->nr_zones = zone_idx(zone) + 1; + if (zone_idx > pgdat->nr_zones) + pgdat->nr_zones = zone_idx; zone->zone_start_pfn = zone_start_pfn; diff --git a/mm/rmap.c b/mm/rmap.c @@ -1627,16 +1627,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, address + PAGE_SIZE); } else { /* - * We should not need to notify here as we reach this - * case only from freeze_page() itself only call from - * split_huge_page_to_list() so everything below must - * be true: - * - page is not anonymous - * - page is locked - * - * So as it is a locked file back page thus it can not - * be remove from the page cache and replace by a new - * page before mmu_notifier_invalidate_range_end so no + * This is a locked file-backed page, thus it cannot + * be removed from the page cache and replaced by a new + * page before mmu_notifier_invalidate_range_end, so no * concurrent thread might update its page table to * point at new page while a device still is using this * page. diff --git a/mm/shmem.c b/mm/shmem.c @@ -297,12 +297,14 @@ bool shmem_charge(struct inode *inode, long pages) if (!shmem_inode_acct_block(inode, pages)) return false; + /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ + inode->i_mapping->nrpages += pages; + spin_lock_irqsave(&info->lock, flags); info->alloced += pages; inode->i_blocks += pages * BLOCKS_PER_PAGE; shmem_recalc_inode(inode); spin_unlock_irqrestore(&info->lock, flags); - inode->i_mapping->nrpages += pages; return true; } @@ -312,6 +314,8 @@ void shmem_uncharge(struct inode *inode, long pages) struct shmem_inode_info *info = SHMEM_I(inode); unsigned long flags; + /* nrpages adjustment done by __delete_from_page_cache() or caller */ + spin_lock_irqsave(&info->lock, flags); info->alloced -= pages; inode->i_blocks -= pages * BLOCKS_PER_PAGE; @@ -1509,11 +1513,13 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, { struct page *oldpage, *newpage; struct address_space *swap_mapping; + swp_entry_t entry; pgoff_t swap_index; int error; oldpage = *pagep; - swap_index = page_private(oldpage); + entry.val = page_private(oldpage); + swap_index = swp_offset(entry); swap_mapping = page_mapping(oldpage); /* @@ -1532,7 +1538,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, __SetPageLocked(newpage); __SetPageSwapBacked(newpage); SetPageUptodate(newpage); - set_page_private(newpage, swap_index); + set_page_private(newpage, entry.val); SetPageSwapCache(newpage); /* @@ -2214,6 +2220,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, struct page *page; pte_t _dst_pte, *dst_pte; int ret; + pgoff_t offset, max_off; ret = -ENOMEM; if (!shmem_inode_acct_block(inode, 1)) @@ -2236,7 +2243,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, *pagep = page; shmem_inode_unacct_blocks(inode, 1); /* don't free the page */ - return -EFAULT; + return -ENOENT; } } else { /* mfill_zeropage_atomic */ clear_highpage(page); @@ -2251,6 +2258,12 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, __SetPageSwapBacked(page); __SetPageUptodate(page); + ret = -EFAULT; + offset = linear_page_index(dst_vma, dst_addr); + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= max_off)) + goto out_release; + ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false); if (ret) goto out_release; @@ -2265,9 +2278,25 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, _dst_pte = mk_pte(page, dst_vma->vm_page_prot); if (dst_vma->vm_flags & VM_WRITE) _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); + else { + /* + * We don't set the pte dirty if the vma has no + * VM_WRITE permission, so mark the page dirty or it + * could be freed from under us. We could do it + * unconditionally before unlock_page(), but doing it + * only if VM_WRITE is not set is faster. + */ + set_page_dirty(page); + } - ret = -EEXIST; dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + + ret = -EFAULT; + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= max_off)) + goto out_release_uncharge_unlock; + + ret = -EEXIST; if (!pte_none(*dst_pte)) goto out_release_uncharge_unlock; @@ -2285,13 +2314,15 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); - unlock_page(page); pte_unmap_unlock(dst_pte, ptl); + unlock_page(page); ret = 0; out: return ret; out_release_uncharge_unlock: pte_unmap_unlock(dst_pte, ptl); + ClearPageDirty(page); + delete_from_page_cache(page); out_release_uncharge: mem_cgroup_cancel_charge(page, memcg, false); out_release: diff --git a/mm/truncate.c b/mm/truncate.c @@ -517,9 +517,13 @@ void truncate_inode_pages_final(struct address_space *mapping) */ xa_lock_irq(&mapping->i_pages); xa_unlock_irq(&mapping->i_pages); - - truncate_inode_pages(mapping, 0); } + + /* + * Cleancache needs notification even if there are no pages or shadow + * entries. + */ + truncate_inode_pages(mapping, 0); } EXPORT_SYMBOL(truncate_inode_pages_final); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c @@ -33,6 +33,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, void *page_kaddr; int ret; struct page *page; + pgoff_t offset, max_off; + struct inode *inode; if (!*pagep) { ret = -ENOMEM; @@ -48,7 +50,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, /* fallback to copy_from_user outside mmap_sem */ if (unlikely(ret)) { - ret = -EFAULT; + ret = -ENOENT; *pagep = page; /* don't free the page */ goto out; @@ -73,8 +75,17 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, if (dst_vma->vm_flags & VM_WRITE) _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); - ret = -EEXIST; dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (dst_vma->vm_file) { + /* the shmem MAP_PRIVATE case requires checking the i_size */ + inode = dst_vma->vm_file->f_inode; + offset = linear_page_index(dst_vma, dst_addr); + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + ret = -EFAULT; + if (unlikely(offset >= max_off)) + goto out_release_uncharge_unlock; + } + ret = -EEXIST; if (!pte_none(*dst_pte)) goto out_release_uncharge_unlock; @@ -108,11 +119,22 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm, pte_t _dst_pte, *dst_pte; spinlock_t *ptl; int ret; + pgoff_t offset, max_off; + struct inode *inode; _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), dst_vma->vm_page_prot)); - ret = -EEXIST; dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (dst_vma->vm_file) { + /* the shmem MAP_PRIVATE case requires checking the i_size */ + inode = dst_vma->vm_file->f_inode; + offset = linear_page_index(dst_vma, dst_addr); + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + ret = -EFAULT; + if (unlikely(offset >= max_off)) + goto out_unlock; + } + ret = -EEXIST; if (!pte_none(*dst_pte)) goto out_unlock; set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); @@ -205,8 +227,9 @@ retry: if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) goto out_unlock; /* - * Only allow __mcopy_atomic_hugetlb on userfaultfd - * registered ranges. + * Check the vma is registered in uffd, this is + * required to enforce the VM_MAYWRITE check done at + * uffd registration time. */ if (!dst_vma->vm_userfaultfd_ctx.ctx) goto out_unlock; @@ -274,7 +297,7 @@ retry: cond_resched(); - if (unlikely(err == -EFAULT)) { + if (unlikely(err == -ENOENT)) { up_read(&dst_mm->mmap_sem); BUG_ON(!page); @@ -380,7 +403,17 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, { ssize_t err; - if (vma_is_anonymous(dst_vma)) { + /* + * The normal page fault path for a shmem will invoke the + * fault, fill the hole in the file and COW it right away. The + * result generates plain anonymous memory. So when we are + * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll + * generate anonymous memory directly without actually filling + * the hole. For the MAP_PRIVATE case the robustness check + * only happens in the pagetable (to verify it's still none) + * and not in the radix tree. + */ + if (!(dst_vma->vm_flags & VM_SHARED)) { if (!zeropage) err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, src_addr, page); @@ -449,13 +482,9 @@ retry: if (!dst_vma) goto out_unlock; /* - * Be strict and only allow __mcopy_atomic on userfaultfd - * registered ranges to prevent userland errors going - * unnoticed. As far as the VM consistency is concerned, it - * would be perfectly safe to remove this check, but there's - * no useful usage for __mcopy_atomic ouside of userfaultfd - * registered ranges. This is after all why these are ioctls - * belonging to the userfaultfd and not syscalls. + * Check the vma is registered in uffd, this is required to + * enforce the VM_MAYWRITE check done at uffd registration + * time. */ if (!dst_vma->vm_userfaultfd_ctx.ctx) goto out_unlock; @@ -489,7 +518,8 @@ retry: * dst_vma. */ err = -ENOMEM; - if (vma_is_anonymous(dst_vma) && unlikely(anon_vma_prepare(dst_vma))) + if (!(dst_vma->vm_flags & VM_SHARED) && + unlikely(anon_vma_prepare(dst_vma))) goto out_unlock; while (src_addr < src_start + len) { @@ -530,7 +560,7 @@ retry: src_addr, &page, zeropage); cond_resched(); - if (unlikely(err == -EFAULT)) { + if (unlikely(err == -ENOENT)) { void *page_kaddr; up_read(&dst_mm->mmap_sem); diff --git a/tools/testing/selftests/proc/proc-self-map-files-002.c b/tools/testing/selftests/proc/proc-self-map-files-002.c @@ -13,7 +13,7 @@ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -/* Test readlink /proc/self/map_files/... with address 0. */ +/* Test readlink /proc/self/map_files/... with minimum address. */ #include <errno.h> #include <sys/types.h> #include <sys/stat.h> @@ -47,6 +47,11 @@ static void fail(const char *fmt, unsigned long a, unsigned long b) int main(void) { const unsigned int PAGE_SIZE = sysconf(_SC_PAGESIZE); +#ifdef __arm__ + unsigned long va = 2 * PAGE_SIZE; +#else + unsigned long va = 0; +#endif void *p; int fd; unsigned long a, b; @@ -55,7 +60,7 @@ int main(void) if (fd == -1) return 1; - p = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE|MAP_FIXED, fd, 0); + p = mmap((void *)va, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE|MAP_FIXED, fd, 0); if (p == MAP_FAILED) { if (errno == EPERM) return 2;