whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 091a1eaa0e309b0e8dcbf3f2da12c7f3d03ed182
parent 5943a9bbbb98b5c957662edd2fc902cc14e65895
Author: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date:   Fri,  5 Oct 2018 16:33:03 -0700

Merge branch 'akpm'

* akpm:
  mm: madvise(MADV_DODUMP): allow hugetlbfs pages
  ocfs2: fix locking for res->tracking and dlm->tracking_list
  mm/vmscan.c: fix int overflow in callers of do_shrink_slab()
  mm/vmstat.c: skip NR_TLB_REMOTE_FLUSH* properly
  mm/vmstat.c: fix outdated vmstat_text
  proc: restrict kernel stack dumps to root
  mm/hugetlb: add mmap() encodings for 32MB and 512MB page sizes
  mm/migrate.c: split only transparent huge pages when allocation fails
  ipc/shm.c: use ERR_CAST() for shm_lock() error return
  mm/gup_benchmark: fix unsigned comparison to zero in __gup_benchmark_ioctl
  mm, thp: fix mlocking THP page with migration enabled
  ocfs2: fix crash in ocfs2_duplicate_clusters_by_page()
  hugetlb: take PMD sharing into account when flushing tlb/caches
  mm: migration: fix migration of huge PMD shared pages

Diffstat:
Mfs/ocfs2/dlm/dlmmaster.c | 4++--
Mfs/ocfs2/refcounttree.c | 16++++++++++++----
Mfs/proc/base.c | 14++++++++++++++
Minclude/linux/hugetlb.h | 14++++++++++++++
Minclude/linux/mm.h | 6++++++
Minclude/uapi/asm-generic/hugetlb_encode.h | 2++
Minclude/uapi/linux/memfd.h | 2++
Minclude/uapi/linux/mman.h | 2++
Minclude/uapi/linux/shm.h | 2++
Mipc/shm.c | 2+-
Mmm/gup_benchmark.c | 3++-
Mmm/huge_memory.c | 2+-
Mmm/hugetlb.c | 90+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
Mmm/madvise.c | 2+-
Mmm/migrate.c | 5++++-
Mmm/rmap.c | 42+++++++++++++++++++++++++++++++++++++++---
Mmm/vmscan.c | 7+++----
Mmm/vmstat.c | 4+++-
18 files changed, 189 insertions(+), 30 deletions(-)

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c @@ -584,9 +584,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, res->last_used = 0; - spin_lock(&dlm->spinlock); + spin_lock(&dlm->track_lock); list_add_tail(&res->tracking, &dlm->tracking_list); - spin_unlock(&dlm->spinlock); + spin_unlock(&dlm->track_lock); memset(res->lvb, 0, DLM_LVB_LEN); memset(res->refmap, 0, sizeof(res->refmap)); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c @@ -2946,6 +2946,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, if (map_end & (PAGE_SIZE - 1)) to = map_end & (PAGE_SIZE - 1); +retry: page = find_or_create_page(mapping, page_index, GFP_NOFS); if (!page) { ret = -ENOMEM; @@ -2954,11 +2955,18 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, } /* - * In case PAGE_SIZE <= CLUSTER_SIZE, This page - * can't be dirtied before we CoW it out. + * In case PAGE_SIZE <= CLUSTER_SIZE, we do not expect a dirty + * page, so write it back. */ - if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) - BUG_ON(PageDirty(page)); + if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) { + if (PageDirty(page)) { + /* + * write_on_page will unlock the page on return + */ + ret = write_one_page(page); + goto retry; + } + } if (!PageUptodate(page)) { ret = block_read_full_page(page, ocfs2_get_block); diff --git a/fs/proc/base.c b/fs/proc/base.c @@ -407,6 +407,20 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, unsigned long *entries; int err; + /* + * The ability to racily run the kernel stack unwinder on a running task + * and then observe the unwinder output is scary; while it is useful for + * debugging kernel issues, it can also allow an attacker to leak kernel + * stack contents. + * Doing this in a manner that is at least safe from races would require + * some work to ensure that the remote task can not be scheduled; and + * even then, this would still expose the unwinder as local attack + * surface. + * Therefore, this interface is restricted to root. + */ + if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) + return -EACCES; + entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries), GFP_KERNEL); if (!entries) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h @@ -140,6 +140,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, + unsigned long *start, unsigned long *end); struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write); struct page *follow_huge_pd(struct vm_area_struct *vma, @@ -170,6 +172,18 @@ static inline unsigned long hugetlb_total_pages(void) return 0; } +static inline int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, + pte_t *ptep) +{ + return 0; +} + +static inline void adjust_range_if_pmd_sharing_possible( + struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) +{ +} + #define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n) ({ BUG(); 0; }) #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) diff --git a/include/linux/mm.h b/include/linux/mm.h @@ -2455,6 +2455,12 @@ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, return vma; } +static inline bool range_in_vma(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + return (vma && vma->vm_start <= start && end <= vma->vm_end); +} + #ifdef CONFIG_MMU pgprot_t vm_get_page_prot(unsigned long vm_flags); void vma_set_page_prot(struct vm_area_struct *vma); diff --git a/include/uapi/asm-generic/hugetlb_encode.h b/include/uapi/asm-generic/hugetlb_encode.h @@ -26,7 +26,9 @@ #define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_32MB (25 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512MB (29 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_16GB (34 << HUGETLB_FLAG_ENCODE_SHIFT) diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h @@ -25,7 +25,9 @@ #define MFD_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB #define MFD_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB #define MFD_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MFD_HUGE_32MB HUGETLB_FLAG_ENCODE_32MB #define MFD_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MFD_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB #define MFD_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB #define MFD_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB #define MFD_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h @@ -28,7 +28,9 @@ #define MAP_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB #define MAP_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB #define MAP_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MAP_HUGE_32MB HUGETLB_FLAG_ENCODE_32MB #define MAP_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MAP_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB #define MAP_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB #define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB #define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB diff --git a/include/uapi/linux/shm.h b/include/uapi/linux/shm.h @@ -65,7 +65,9 @@ struct shmid_ds { #define SHM_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB #define SHM_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB #define SHM_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define SHM_HUGE_32MB HUGETLB_FLAG_ENCODE_32MB #define SHM_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define SHM_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB #define SHM_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB #define SHM_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB #define SHM_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB diff --git a/ipc/shm.c b/ipc/shm.c @@ -206,7 +206,7 @@ err: * Callers of shm_lock() must validate the status of the returned ipc * object pointer and error out as appropriate. */ - return (void *)ipcp; + return ERR_CAST(ipcp); } static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c @@ -19,7 +19,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd, struct gup_benchmark *gup) { ktime_t start_time, end_time; - unsigned long i, nr, nr_pages, addr, next; + unsigned long i, nr_pages, addr, next; + int nr; struct page **pages; nr_pages = gup->size / PAGE_SIZE; diff --git a/mm/huge_memory.c b/mm/huge_memory.c @@ -2931,7 +2931,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) else page_add_file_rmap(new, true); set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); - if (vma->vm_flags & VM_LOCKED) + if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new)) mlock_vma_page(new); update_mmu_cache_pmd(vma, address, pvmw->pmd); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c @@ -3326,8 +3326,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - const unsigned long mmun_start = start; /* For mmu_notifiers */ - const unsigned long mmun_end = end; /* For mmu_notifiers */ + unsigned long mmun_start = start; /* For mmu_notifiers */ + unsigned long mmun_end = end; /* For mmu_notifiers */ WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); @@ -3339,6 +3339,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, */ tlb_remove_check_page_size_change(tlb, sz); tlb_start_vma(tlb, vma); + + /* + * If sharing possible, alert mmu notifiers of worst case. + */ + adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); address = start; for (; address < end; address += sz) { @@ -3349,6 +3354,10 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, &address, ptep)) { spin_unlock(ptl); + /* + * We just unmapped a page of PMDs by clearing a PUD. + * The caller's TLB flush range should cover this area. + */ continue; } @@ -3431,12 +3440,23 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, { struct mm_struct *mm; struct mmu_gather tlb; + unsigned long tlb_start = start; + unsigned long tlb_end = end; + + /* + * If shared PMDs were possibly used within this vma range, adjust + * start/end for worst case tlb flushing. + * Note that we can not be sure if PMDs are shared until we try to + * unmap pages. However, we want to make sure TLB flushing covers + * the largest possible range. + */ + adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end); mm = vma->vm_mm; - tlb_gather_mmu(&tlb, mm, start, end); + tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end); __unmap_hugepage_range(&tlb, vma, start, end, ref_page); - tlb_finish_mmu(&tlb, start, end); + tlb_finish_mmu(&tlb, tlb_start, tlb_end); } /* @@ -4298,11 +4318,21 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pte_t pte; struct hstate *h = hstate_vma(vma); unsigned long pages = 0; + unsigned long f_start = start; + unsigned long f_end = end; + bool shared_pmd = false; + + /* + * In the case of shared PMDs, the area to flush could be beyond + * start/end. Set f_start/f_end to cover the maximum possible + * range if PMD sharing is possible. + */ + adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end); BUG_ON(address >= end); - flush_cache_range(vma, address, end); + flush_cache_range(vma, f_start, f_end); - mmu_notifier_invalidate_range_start(mm, start, end); + mmu_notifier_invalidate_range_start(mm, f_start, f_end); i_mmap_lock_write(vma->vm_file->f_mapping); for (; address < end; address += huge_page_size(h)) { spinlock_t *ptl; @@ -4313,6 +4343,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, if (huge_pmd_unshare(mm, &address, ptep)) { pages++; spin_unlock(ptl); + shared_pmd = true; continue; } pte = huge_ptep_get(ptep); @@ -4348,9 +4379,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare * may have cleared our pud entry and done put_page on the page table: * once we release i_mmap_rwsem, another task can do the final put_page - * and that page table be reused and filled with junk. + * and that page table be reused and filled with junk. If we actually + * did unshare a page of pmds, flush the range corresponding to the pud. */ - flush_hugetlb_tlb_range(vma, start, end); + if (shared_pmd) + flush_hugetlb_tlb_range(vma, f_start, f_end); + else + flush_hugetlb_tlb_range(vma, start, end); /* * No need to call mmu_notifier_invalidate_range() we are downgrading * page table protection not changing it to point to a new page. @@ -4358,7 +4393,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * See Documentation/vm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); - mmu_notifier_invalidate_range_end(mm, start, end); + mmu_notifier_invalidate_range_end(mm, f_start, f_end); return pages << h->order; } @@ -4545,13 +4580,41 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) /* * check on proper vm_flags and page table alignment */ - if (vma->vm_flags & VM_MAYSHARE && - vma->vm_start <= base && end <= vma->vm_end) + if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end)) return true; return false; } /* + * Determine if start,end range within vma could be mapped by shared pmd. + * If yes, adjust start and end to cover range associated with possible + * shared pmd mappings. + */ +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) +{ + unsigned long check_addr = *start; + + if (!(vma->vm_flags & VM_MAYSHARE)) + return; + + for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) { + unsigned long a_start = check_addr & PUD_MASK; + unsigned long a_end = a_start + PUD_SIZE; + + /* + * If sharing is possible, adjust start/end if necessary. + */ + if (range_in_vma(vma, a_start, a_end)) { + if (a_start < *start) + *start = a_start; + if (a_end > *end) + *end = a_end; + } + } +} + +/* * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the @@ -4648,6 +4711,11 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) { return 0; } + +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) +{ +} #define want_pmd_share() (0) #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ diff --git a/mm/madvise.c b/mm/madvise.c @@ -96,7 +96,7 @@ static long madvise_behavior(struct vm_area_struct *vma, new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: - if (new_flags & VM_SPECIAL) { + if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) { error = -EINVAL; goto out; } diff --git a/mm/migrate.c b/mm/migrate.c @@ -275,6 +275,9 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); + if (PageTransHuge(page) && PageMlocked(page)) + clear_page_mlock(page); + /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); } @@ -1411,7 +1414,7 @@ retry: * we encounter them after the rest of the list * is processed. */ - if (PageTransHuge(page)) { + if (PageTransHuge(page) && !PageHuge(page)) { lock_page(page); rc = split_huge_page_to_list(page, from); unlock_page(page); diff --git a/mm/rmap.c b/mm/rmap.c @@ -1362,11 +1362,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } /* - * We have to assume the worse case ie pmd for invalidation. Note that - * the page can not be free in this function as call of try_to_unmap() - * must hold a reference on the page. + * For THP, we have to assume the worse case ie pmd for invalidation. + * For hugetlb, it could be much worse if we need to do pud + * invalidation in the case of pmd sharing. + * + * Note that the page can not be free in this function as call of + * try_to_unmap() must hold a reference on the page. */ end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); + if (PageHuge(page)) { + /* + * If sharing is possible, start and end will be adjusted + * accordingly. + */ + adjust_range_if_pmd_sharing_possible(vma, &start, &end); + } mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); while (page_vma_mapped_walk(&pvmw)) { @@ -1409,6 +1419,32 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); address = pvmw.address; + if (PageHuge(page)) { + if (huge_pmd_unshare(mm, &address, pvmw.pte)) { + /* + * huge_pmd_unshare unmapped an entire PMD + * page. There is no way of knowing exactly + * which PMDs may be cached for this mm, so + * we must flush them all. start/end were + * already adjusted above to cover this range. + */ + flush_cache_range(vma, start, end); + flush_tlb_range(vma, start, end); + mmu_notifier_invalidate_range(mm, start, end); + + /* + * The ref count of the PMD page was dropped + * which is part of the way map counting + * is done for shared PMDs. Return 'true' + * here. When there is no other sharing, + * huge_pmd_unshare returns false and we will + * unmap the actual page and drop map count + * to zero. + */ + page_vma_mapped_walk_done(&pvmw); + break; + } + } if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && diff --git a/mm/vmscan.c b/mm/vmscan.c @@ -580,8 +580,8 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { struct memcg_shrinker_map *map; - unsigned long freed = 0; - int ret, i; + unsigned long ret, freed = 0; + int i; if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)) return 0; @@ -677,9 +677,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { + unsigned long ret, freed = 0; struct shrinker *shrinker; - unsigned long freed = 0; - int ret; if (!mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); diff --git a/mm/vmstat.c b/mm/vmstat.c @@ -1275,6 +1275,9 @@ const char * const vmstat_text[] = { #ifdef CONFIG_SMP "nr_tlb_remote_flush", "nr_tlb_remote_flush_received", +#else + "", /* nr_tlb_remote_flush */ + "", /* nr_tlb_remote_flush_received */ #endif /* CONFIG_SMP */ "nr_tlb_local_flush_all", "nr_tlb_local_flush_one", @@ -1283,7 +1286,6 @@ const char * const vmstat_text[] = { #ifdef CONFIG_DEBUG_VM_VMACACHE "vmacache_find_calls", "vmacache_find_hits", - "vmacache_full_flushes", #endif #ifdef CONFIG_SWAP "swap_ra",