whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 8dcd175bc3d50b78413c56d5b17d4bddd77412ef
parent afe6fe7036c6efdcb46cabc64bec9b6e4a005210
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Wed,  6 Mar 2019 10:31:36 -0800

Merge branch 'akpm' (patches from Andrew)

Merge misc updates from Andrew Morton:

 - a few misc things

 - ocfs2 updates

 - most of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (159 commits)
  tools/testing/selftests/proc/proc-self-syscall.c: remove duplicate include
  proc: more robust bulk read test
  proc: test /proc/*/maps, smaps, smaps_rollup, statm
  proc: use seq_puts() everywhere
  proc: read kernel cpu stat pointer once
  proc: remove unused argument in proc_pid_lookup()
  fs/proc/thread_self.c: code cleanup for proc_setup_thread_self()
  fs/proc/self.c: code cleanup for proc_setup_self()
  proc: return exit code 4 for skipped tests
  mm,mremap: bail out earlier in mremap_to under map pressure
  mm/sparse: fix a bad comparison
  mm/memory.c: do_fault: avoid usage of stale vm_area_struct
  writeback: fix inode cgroup switching comment
  mm/huge_memory.c: fix "orig_pud" set but not used
  mm/hotplug: fix an imbalance with DEBUG_PAGEALLOC
  mm/memcontrol.c: fix bad line in comment
  mm/cma.c: cma_declare_contiguous: correct err handling
  mm/page_ext.c: fix an imbalance with kmemleak
  mm/compaction: pass pgdat to too_many_isolated() instead of zone
  mm: remove zone_lru_lock() function, access ->lru_lock directly
  ...

Diffstat:
MDocumentation/admin-guide/cgroup-v2.rst | 16++++++++++++++++
MDocumentation/admin-guide/mm/pagemap.rst | 9++++++---
MDocumentation/cgroup-v1/memcg_test.txt | 4++--
MDocumentation/cgroup-v1/memory.txt | 4++--
MMAINTAINERS | 8++++++++
March/alpha/include/asm/topology.h | 3++-
March/arm64/Kconfig | 4++++
March/arm64/include/asm/hugetlb.h | 5+++++
March/arm64/include/asm/memory.h | 4----
March/arm64/kernel/machine_kexec.c | 3+--
March/arm64/mm/hugetlbpage.c | 20++++++++++++++++++++
March/arm64/mm/init.c | 27---------------------------
March/arm64/mm/numa.c | 2+-
March/ia64/kernel/numa.c | 2+-
March/ia64/kernel/perfmon.c | 59++++-------------------------------------------------------
March/ia64/mm/discontig.c | 6+++---
March/m68k/mm/memory.c | 2+-
March/powerpc/include/asm/book3s/64/hugetlb.h | 12++++++++++++
March/powerpc/include/asm/book3s/64/pgtable.h | 18++++++++++++++++++
March/powerpc/include/asm/book3s/64/radix.h | 4++++
March/powerpc/include/asm/pci-bridge.h | 3++-
March/powerpc/kernel/paca.c | 3++-
March/powerpc/kernel/pci-common.c | 3++-
March/powerpc/kernel/vdso.c | 2--
March/powerpc/mm/hugetlbpage-hash64.c | 25+++++++++++++++++++++++++
March/powerpc/mm/hugetlbpage-radix.c | 17+++++++++++++++++
March/powerpc/mm/mmu_context_iommu.c | 145++++++++++++++++++++++++-------------------------------------------------------
March/powerpc/mm/numa.c | 16++++++++--------
March/powerpc/mm/pgtable-book3s64.c | 25+++++++++++++++++++++++++
March/powerpc/mm/pgtable-radix.c | 18++++++++++++++++++
March/powerpc/platforms/powernv/memtrace.c | 5+++--
March/riscv/kernel/vdso.c | 1-
March/s390/include/asm/pgtable.h | 5+++--
March/s390/kernel/vdso.c | 2--
March/s390/mm/pgtable.c | 8+++++---
March/sh/kernel/syscalls/syscalltbl.sh | 4++--
March/sh/kernel/syscalls_32.S | 2+-
March/sparc/kernel/pci_fire.c | 3++-
March/sparc/kernel/pci_schizo.c | 3++-
March/sparc/kernel/psycho_common.c | 3++-
March/sparc/kernel/sbus.c | 3++-
March/sparc/mm/init_64.c | 6+++---
March/x86/include/asm/paravirt.h | 13+++++++------
March/x86/include/asm/paravirt_types.h | 5+++--
March/x86/include/asm/pci.h | 3++-
March/x86/include/asm/uaccess.h | 24++++++++++++------------
March/x86/kernel/apic/x2apic_uv_x.c | 7++++---
March/x86/kernel/setup_percpu.c | 2+-
March/x86/kernel/smpboot.c | 3++-
March/x86/lib/usercopy_32.c | 8++++----
March/x86/mm/numa.c | 4++--
March/x86/xen/mmu.h | 4++--
March/x86/xen/mmu_pv.c | 8++++----
Mdrivers/block/mtip32xx/mtip32xx.c | 5+++--
Mdrivers/char/agp/efficeon-agp.c | 2--
Mdrivers/dma/dmaengine.c | 4+++-
Mdrivers/gpu/drm/i915/i915_utils.h | 6------
Mdrivers/hv/hv_balloon.c | 21++++++++++++++++-----
Mdrivers/infiniband/hw/hfi1/affinity.c | 3++-
Mdrivers/infiniband/hw/hfi1/init.c | 3++-
Mdrivers/iommu/dmar.c | 5+++--
Mdrivers/iommu/intel-iommu.c | 3++-
Mdrivers/misc/sgi-xp/xpc_uv.c | 3++-
Mdrivers/misc/vmw_balloon.c | 32++++++++++++++++++++++++++++++++
Mdrivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 5+++--
Mdrivers/xen/balloon.c | 18+++++++++++++-----
Mfs/file.c | 1+
Mfs/hugetlbfs/inode.c | 2+-
Mfs/inode.c | 8+-------
Mfs/kernfs/file.c | 31++++++++++++++++++++-----------
Mfs/ocfs2/alloc.c | 159++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
Mfs/ocfs2/cluster/nodemanager.c | 14++++++++------
Mfs/ocfs2/dlmglue.c | 5+++++
Mfs/ocfs2/ocfs2.h | 1+
Mfs/ocfs2/ocfs2_trace.h | 2++
Mfs/ocfs2/slot_map.c | 8++------
Mfs/ocfs2/super.c | 2++
Mfs/pipe.c | 3+--
Mfs/proc/array.c | 16++++++++--------
Mfs/proc/base.c | 4++--
Mfs/proc/internal.h | 2+-
Mfs/proc/page.c | 4++--
Mfs/proc/root.c | 2+-
Mfs/proc/self.c | 16++++++++--------
Mfs/proc/stat.c | 60++++++++++++++++++++++++++++++++----------------------------
Mfs/proc/task_mmu.c | 8+++++---
Mfs/proc/task_nommu.c | 2+-
Mfs/proc/thread_self.c | 16++++++++--------
Minclude/asm-generic/pgtable.h | 18+++++++++---------
Minclude/linux/backing-dev.h | 2+-
Minclude/linux/balloon_compaction.h | 34+++++++++++++---------------------
Minclude/linux/cgroup-defs.h | 4++++
Minclude/linux/compaction.h | 7+++----
Minclude/linux/device.h | 2+-
Minclude/linux/frontswap.h | 7+++++++
Minclude/linux/fs.h | 2+-
Minclude/linux/gfp.h | 30+++++++++++++++---------------
Minclude/linux/hugetlb.h | 70+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Minclude/linux/kasan-checks.h | 2+-
Minclude/linux/kernfs.h | 6++++++
Minclude/linux/ksm.h | 7+++++++
Minclude/linux/list.h | 11+++++++++++
Minclude/linux/memcontrol.h | 47+++++++++++++++++++++++++++++++++++++++++++----
Minclude/linux/memory_hotplug.h | 2+-
Minclude/linux/mm.h | 3++-
Minclude/linux/mm_types.h | 2+-
Minclude/linux/mmzone.h | 8+++-----
Minclude/linux/nodemask.h | 8++++----
Minclude/linux/page-flags.h | 44++++++++++++++++++++++++++++++++++++++------
Minclude/linux/pagemap.h | 31+++++++++----------------------
Minclude/linux/poison.h | 2+-
Minclude/linux/sched.h | 5+++++
Minclude/linux/sched/mm.h | 48++++++++++++++++++++++++++++++++++++++++--------
Minclude/linux/shmem_fs.h | 3++-
Minclude/linux/slub_def.h | 12++++++------
Minclude/linux/swap.h | 4++--
Minclude/uapi/linux/fcntl.h | 1+
Minclude/uapi/linux/kernel-page-flags.h | 2+-
Minit/init_task.c | 3++-
Mkernel/cgroup/cgroup.c | 12++++++++++++
Mkernel/crash_core.c | 2++
Mkernel/kthread.c | 3++-
Mkernel/power/snapshot.c | 17+++++++++++------
Mkernel/sched/core.c | 3+++
Mkernel/sched/fair.c | 15++++++++-------
Mkernel/sysctl.c | 2+-
Mlib/Kconfig.debug | 31+++++++++++++------------------
Mlib/Kconfig.kasan | 10----------
Mlib/Makefile | 1+
Mlib/cpumask.c | 3++-
Mlib/test_kasan.c | 24------------------------
Alib/test_vmalloc.c | 551+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mmm/Kconfig.debug | 17+++++++++++++++++
Mmm/cma.c | 4+++-
Mmm/cma_debug.c | 11++++-------
Mmm/compaction.c | 1039+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
Mmm/dmapool.c | 13+++++++++----
Mmm/failslab.c | 14++++----------
Mmm/filemap.c | 93+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
Mmm/gup.c | 200+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
Mmm/gup_benchmark.c | 8++------
Mmm/huge_memory.c | 37+++++++++++++++++--------------------
Mmm/hugetlb.c | 17++++++++++-------
Mmm/internal.h | 24++++++++++++++++++------
Mmm/kasan/common.c | 2++
Mmm/kasan/generic.c | 19-------------------
Mmm/kasan/generic_report.c | 3---
Mmm/kasan/init.c | 6+++---
Mmm/kasan/kasan.h | 3---
Mmm/khugepaged.c | 2++
Mmm/ksm.c | 77++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
Mmm/list_lru.c | 3+--
Mmm/memblock.c | 3+--
Mmm/memcontrol.c | 150+++++++++++++++++++++++++++++++++++++++----------------------------------------
Mmm/memfd.c | 3++-
Mmm/memory-failure.c | 14++++++--------
Mmm/memory.c | 72++++++++++++++++++++++++++++++++++++++++++++++++------------------------
Mmm/memory_hotplug.c | 55++++++++++++++++++++++++++++++++++---------------------
Mmm/mempolicy.c | 4++--
Mmm/mempool.c | 8++++++++
Mmm/migrate.c | 14+++++++-------
Mmm/mlock.c | 14+++++++-------
Mmm/mmap.c | 15+++++++--------
Mmm/mprotect.c | 6+++---
Mmm/mremap.c | 17+++++++++++++++++
Mmm/oom_kill.c | 81++++++++++++++++++++-----------------------------------------------------------
Mmm/page-writeback.c | 24++++++++++++++++--------
Mmm/page_alloc.c | 160+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
Mmm/page_ext.c | 3++-
Mmm/page_idle.c | 8++++----
Mmm/page_owner.c | 8+++-----
Mmm/page_poison.c | 4++++
Mmm/readahead.c | 2++
Mmm/rmap.c | 2+-
Mmm/shmem.c | 741++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
Mmm/slab.c | 34++++++++++++++++++++++++----------
Mmm/slab.h | 4----
Mmm/slab_common.c | 12+++++++++---
Mmm/slub.c | 16+++++++---------
Mmm/sparse.c | 2+-
Mmm/swap.c | 16++++++++--------
Mmm/swap_state.c | 23++++++++++++++++++++++-
Mmm/swapfile.c | 487+++++++++++++++++++++++++++++++------------------------------------------------
Mmm/truncate.c | 6++++--
Mmm/util.c | 37++++++++++++++++++++++++++-----------
Mmm/vmalloc.c | 459++++++++++++++++++++++++++++++++++++++++++-------------------------------------
Mmm/vmscan.c | 88+++++++++++++++++++++++++++++--------------------------------------------------
Mmm/vmstat.c | 15++++-----------
Mmm/workingset.c | 5++---
Mnet/core/pktgen.c | 3++-
Mnet/qrtr/qrtr.c | 3++-
Mscripts/Makefile.kasan | 5-----
Mscripts/decode_stacktrace.sh | 9++++++++-
Mscripts/gcc-plugins/Kconfig | 4----
Atools/include/linux/numa.h | 16++++++++++++++++
Mtools/perf/bench/numa.c | 7++++---
Mtools/testing/selftests/Makefile | 1+
Mtools/testing/selftests/memfd/memfd_test.c | 74++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtools/testing/selftests/proc/.gitignore | 1+
Mtools/testing/selftests/proc/Makefile | 1+
Mtools/testing/selftests/proc/proc-loadavg-001.c | 2+-
Atools/testing/selftests/proc/proc-pid-vm.c | 406+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtools/testing/selftests/proc/proc-self-map-files-002.c | 2+-
Mtools/testing/selftests/proc/proc-self-syscall.c | 3+--
Mtools/testing/selftests/proc/proc-self-wchan.c | 2+-
Mtools/testing/selftests/proc/read.c | 14++++++++++++++
Atools/testing/selftests/tmpfs/.gitignore | 1+
Atools/testing/selftests/tmpfs/Makefile | 7+++++++
Atools/testing/selftests/tmpfs/bug-link-o-tmpfile.c | 67+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtools/testing/selftests/vm/run_vmtests | 16++++++++++++++++
Atools/testing/selftests/vm/test_vmalloc.sh | 176+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtools/vm/page-types.c | 2+-
Mtools/vm/slabinfo.c | 35+++++++++++++++++++----------------
213 files changed, 4918 insertions(+), 2315 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst @@ -1189,6 +1189,10 @@ PAGE_SIZE multiple when read back. Amount of cached filesystem data that was modified and is currently being written back to disk + anon_thp + Amount of memory used in anonymous mappings backed by + transparent hugepages + inactive_anon, active_anon, inactive_file, active_file, unevictable Amount of memory, swap-backed and filesystem-backed, on the internal memory management lists used by the @@ -1248,6 +1252,18 @@ PAGE_SIZE multiple when read back. Amount of reclaimed lazyfree pages + thp_fault_alloc + + Number of transparent hugepages which were allocated to satisfy + a page fault, including COW faults. This counter is not present + when CONFIG_TRANSPARENT_HUGEPAGE is not set. + + thp_collapse_alloc + + Number of transparent hugepages which were allocated to allow + collapsing an existing range of pages. This counter is not + present when CONFIG_TRANSPARENT_HUGEPAGE is not set. + memory.swap.current A read-only single value file which exists on non-root cgroups. diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst @@ -75,9 +75,10 @@ number of times a page is mapped. 20. NOPAGE 21. KSM 22. THP - 23. BALLOON + 23. OFFLINE 24. ZERO_PAGE 25. IDLE + 26. PGTABLE * ``/proc/kpagecgroup``. This file contains a 64-bit inode number of the memory cgroup each page is charged to, indexed by PFN. Only available when @@ -118,8 +119,8 @@ Short descriptions to the page flags identical memory pages dynamically shared between one or more processes 22 - THP contiguous pages which construct transparent hugepages -23 - BALLOON - balloon compaction page +23 - OFFLINE + page is logically offline 24 - ZERO_PAGE zero page for pfn_zero or huge_zero page 25 - IDLE @@ -128,6 +129,8 @@ Short descriptions to the page flags Note that this flag may be stale in case the page was accessed via a PTE. To make sure the flag is up-to-date one has to read ``/sys/kernel/mm/page_idle/bitmap`` first. +26 - PGTABLE + page is in use as a page table IO related page flags --------------------- diff --git a/Documentation/cgroup-v1/memcg_test.txt b/Documentation/cgroup-v1/memcg_test.txt @@ -107,9 +107,9 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. 8. LRU Each memcg has its own private LRU. Now, its handling is under global - VM's control (means that it's handled under global zone_lru_lock). + VM's control (means that it's handled under global pgdat->lru_lock). Almost all routines around memcg's LRU is called by global LRU's - list management functions under zone_lru_lock(). + list management functions under pgdat->lru_lock. A special function is mem_cgroup_isolate_pages(). This scans memcg's private LRU and call __isolate_lru_page() to extract a page diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt @@ -267,11 +267,11 @@ When oom event notifier is registered, event will be delivered. Other lock order is following: PG_locked. mm->page_table_lock - zone_lru_lock + pgdat->lru_lock lock_page_cgroup. In many cases, just lock_page_cgroup() is called. per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by - zone_lru_lock, it has no lock of its own. + pgdat->lru_lock, it has no lock of its own. 2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) diff --git a/MAINTAINERS b/MAINTAINERS @@ -9835,6 +9835,14 @@ F: kernel/sched/membarrier.c F: include/uapi/linux/membarrier.h F: arch/powerpc/include/asm/membarrier.h +MEMBLOCK +M: Mike Rapoport <rppt@linux.ibm.com> +L: linux-mm@kvack.org +S: Maintained +F: include/linux/memblock.h +F: mm/memblock.c +F: Documentation/core-api/boot-time-mm.rst + MEMORY MANAGEMENT L: linux-mm@kvack.org W: http://www.linux-mm.org diff --git a/arch/alpha/include/asm/topology.h b/arch/alpha/include/asm/topology.h @@ -4,6 +4,7 @@ #include <linux/smp.h> #include <linux/threads.h> +#include <linux/numa.h> #include <asm/machvec.h> #ifdef CONFIG_NUMA @@ -29,7 +30,7 @@ static const struct cpumask *cpumask_of_node(int node) { int cpu; - if (node == -1) + if (node == NUMA_NO_NODE) return cpu_all_mask; cpumask_clear(&node_to_cpumask_map[node]); diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig @@ -1467,6 +1467,10 @@ config SYSVIPC_COMPAT def_bool y depends on COMPAT && SYSVIPC +config ARCH_ENABLE_HUGEPAGE_MIGRATION + def_bool y + depends on HUGETLB_PAGE && MIGRATION + menu "Power management options" source "kernel/power/Kconfig" diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h @@ -20,6 +20,11 @@ #include <asm/page.h> +#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION +#define arch_hugetlb_migration_supported arch_hugetlb_migration_supported +extern bool arch_hugetlb_migration_supported(struct hstate *h); +#endif + #define __HAVE_ARCH_HUGE_PTEP_GET static inline pte_t huge_ptep_get(pte_t *ptep) { diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h @@ -80,11 +80,7 @@ */ #ifdef CONFIG_KASAN #define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - KASAN_SHADOW_SCALE_SHIFT)) -#ifdef CONFIG_KASAN_EXTRA -#define KASAN_THREAD_SHIFT 2 -#else #define KASAN_THREAD_SHIFT 1 -#endif /* CONFIG_KASAN_EXTRA */ #else #define KASAN_SHADOW_SIZE (0) #define KASAN_THREAD_SHIFT 0 diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c @@ -321,7 +321,7 @@ void crash_post_resume(void) * but does not hold any data of loaded kernel image. * * Note that all the pages in crash dump kernel memory have been initially - * marked as Reserved in kexec_reserve_crashkres_pages(). + * marked as Reserved as memory was allocated via memblock_reserve(). * * In hibernation, the pages which are Reserved and yet "nosave" are excluded * from the hibernation iamge. crash_is_nosave() does thich check for crash @@ -361,7 +361,6 @@ void crash_free_reserved_phys_range(unsigned long begin, unsigned long end) for (addr = begin; addr < end; addr += PAGE_SIZE) { page = phys_to_page(addr); - ClearPageReserved(page); free_reserved_page(page); } } diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c @@ -27,6 +27,26 @@ #include <asm/tlbflush.h> #include <asm/pgalloc.h> +#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION +bool arch_hugetlb_migration_supported(struct hstate *h) +{ + size_t pagesize = huge_page_size(h); + + switch (pagesize) { +#ifdef CONFIG_ARM64_4K_PAGES + case PUD_SIZE: +#endif + case PMD_SIZE: + case CONT_PMD_SIZE: + case CONT_PTE_SIZE: + return true; + } + pr_warn("%s: unrecognized huge page size 0x%lx\n", + __func__, pagesize); + return false; +} +#endif + int pmd_huge(pmd_t pmd) { return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c @@ -118,35 +118,10 @@ static void __init reserve_crashkernel(void) crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; } - -static void __init kexec_reserve_crashkres_pages(void) -{ -#ifdef CONFIG_HIBERNATION - phys_addr_t addr; - struct page *page; - - if (!crashk_res.end) - return; - - /* - * To reduce the size of hibernation image, all the pages are - * marked as Reserved initially. - */ - for (addr = crashk_res.start; addr < (crashk_res.end + 1); - addr += PAGE_SIZE) { - page = phys_to_page(addr); - SetPageReserved(page); - } -#endif -} #else static void __init reserve_crashkernel(void) { } - -static void __init kexec_reserve_crashkres_pages(void) -{ -} #endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_CRASH_DUMP @@ -586,8 +561,6 @@ void __init mem_init(void) /* this will put all unused low memory onto the freelists */ memblock_free_all(); - kexec_reserve_crashkres_pages(); - mem_init_print_info(NULL); /* diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c @@ -120,7 +120,7 @@ static void __init setup_node_to_cpumask_map(void) } /* cpumask_of_node() will now work */ - pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); + pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); } /* diff --git a/arch/ia64/kernel/numa.c b/arch/ia64/kernel/numa.c @@ -74,7 +74,7 @@ void __init build_cpu_to_node_map(void) cpumask_clear(&node_to_cpu_mask[node]); for_each_possible_early_cpu(cpu) { - node = -1; + node = NUMA_NO_NODE; for (i = 0; i < NR_CPUS; ++i) if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) { node = node_cpuid[i].nid; diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c @@ -583,17 +583,6 @@ pfm_put_task(struct task_struct *task) if (task != current) put_task_struct(task); } -static inline void -pfm_reserve_page(unsigned long a) -{ - SetPageReserved(vmalloc_to_page((void *)a)); -} -static inline void -pfm_unreserve_page(unsigned long a) -{ - ClearPageReserved(vmalloc_to_page((void*)a)); -} - static inline unsigned long pfm_protect_ctx_ctxsw(pfm_context_t *x) { @@ -816,44 +805,6 @@ pfm_reset_msgq(pfm_context_t *ctx) DPRINT(("ctx=%p msgq reset\n", ctx)); } -static void * -pfm_rvmalloc(unsigned long size) -{ - void *mem; - unsigned long addr; - - size = PAGE_ALIGN(size); - mem = vzalloc(size); - if (mem) { - //printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem); - addr = (unsigned long)mem; - while (size > 0) { - pfm_reserve_page(addr); - addr+=PAGE_SIZE; - size-=PAGE_SIZE; - } - } - return mem; -} - -static void -pfm_rvfree(void *mem, unsigned long size) -{ - unsigned long addr; - - if (mem) { - DPRINT(("freeing physical buffer @%p size=%lu\n", mem, size)); - addr = (unsigned long) mem; - while ((long) size > 0) { - pfm_unreserve_page(addr); - addr+=PAGE_SIZE; - size-=PAGE_SIZE; - } - vfree(mem); - } - return; -} - static pfm_context_t * pfm_context_alloc(int ctx_flags) { @@ -1498,7 +1449,7 @@ pfm_free_smpl_buffer(pfm_context_t *ctx) /* * free the buffer */ - pfm_rvfree(ctx->ctx_smpl_hdr, ctx->ctx_smpl_size); + vfree(ctx->ctx_smpl_hdr); ctx->ctx_smpl_hdr = NULL; ctx->ctx_smpl_size = 0UL; @@ -2137,7 +2088,7 @@ doit: * All memory free operations (especially for vmalloc'ed memory) * MUST be done with interrupts ENABLED. */ - if (smpl_buf_addr) pfm_rvfree(smpl_buf_addr, smpl_buf_size); + vfree(smpl_buf_addr); /* * return the memory used by the context @@ -2266,10 +2217,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t /* * We do the easy to undo allocations first. - * - * pfm_rvmalloc(), clears the buffer, so there is no leak */ - smpl_buf = pfm_rvmalloc(size); + smpl_buf = vzalloc(size); if (smpl_buf == NULL) { DPRINT(("Can't allocate sampling buffer\n")); return -ENOMEM; @@ -2346,7 +2295,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t error: vm_area_free(vma); error_kmem: - pfm_rvfree(smpl_buf, size); + vfree(smpl_buf); return -ENOMEM; } diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c @@ -227,7 +227,7 @@ void __init setup_per_cpu_areas(void) * CPUs are put into groups according to node. Walk cpu_map * and create new groups at node boundaries. */ - prev_node = -1; + prev_node = NUMA_NO_NODE; ai->nr_groups = 0; for (unit = 0; unit < nr_units; unit++) { cpu = cpu_map[unit]; @@ -435,7 +435,7 @@ static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize) { void *ptr = NULL; u8 best = 0xff; - int bestnode = -1, node, anynode = 0; + int bestnode = NUMA_NO_NODE, node, anynode = 0; for_each_online_node(node) { if (node_isset(node, memory_less_mask)) @@ -447,7 +447,7 @@ static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize) anynode = node; } - if (bestnode == -1) + if (bestnode == NUMA_NO_NODE) bestnode = anynode; ptr = memblock_alloc_try_nid(pernodesize, PERCPU_PAGE_SIZE, diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c @@ -51,7 +51,7 @@ void __init init_pointer_table(unsigned long ptable) pr_debug("init_pointer_table: %lx, %x\n", ptable, PD_MARKBITS(dp)); /* unreserve the page so it's possible to free that page */ - PD_PAGE(dp)->flags &= ~(1 << PG_reserved); + __ClearPageReserved(PD_PAGE(dp)); init_page_count(PD_PAGE(dp)); return; diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -13,6 +13,10 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); +extern void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t pte); + static inline int hstate_get_psize(struct hstate *hstate) { unsigned long shift; @@ -42,4 +46,12 @@ static inline bool gigantic_page_supported(void) /* hugepd entry valid bit */ #define HUGEPD_VAL_BITS (0x8000000000000000UL) +#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start +extern pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); + +#define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit +extern void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t new_pte); #endif diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1306,6 +1306,24 @@ static inline int pud_pfn(pud_t pud) BUILD_BUG(); return 0; } +#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION +pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *); +void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, + pte_t *, pte_t, pte_t); + +/* + * Returns true for a R -> RW upgrade of pte + */ +static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_val) +{ + if (!(old_val & _PAGE_READ)) + return false; + + if ((!(old_val & _PAGE_WRITE)) && (new_val & _PAGE_WRITE)) + return true; + + return false; +} #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h @@ -127,6 +127,10 @@ extern void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep pte_t entry, unsigned long address, int psize); +extern void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t pte); + static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr, unsigned long set) { diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h @@ -10,6 +10,7 @@ #include <linux/pci.h> #include <linux/list.h> #include <linux/ioport.h> +#include <linux/numa.h> struct device_node; @@ -265,7 +266,7 @@ extern int pcibios_map_io_space(struct pci_bus *bus); #ifdef CONFIG_NUMA #define PHB_SET_NODE(PHB, NODE) ((PHB)->node = (NODE)) #else -#define PHB_SET_NODE(PHB, NODE) ((PHB)->node = -1) +#define PHB_SET_NODE(PHB, NODE) ((PHB)->node = NUMA_NO_NODE) #endif #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c @@ -11,6 +11,7 @@ #include <linux/export.h> #include <linux/memblock.h> #include <linux/sched/task.h> +#include <linux/numa.h> #include <asm/lppaca.h> #include <asm/paca.h> @@ -36,7 +37,7 @@ static void *__init alloc_paca_data(unsigned long size, unsigned long align, * which will put its paca in the right place. */ if (cpu == boot_cpuid) { - nid = -1; + nid = NUMA_NO_NODE; memblock_set_bottom_up(true); } else { nid = early_cpu_to_node(cpu); diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c @@ -32,6 +32,7 @@ #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/vgaarb.h> +#include <linux/numa.h> #include <asm/processor.h> #include <asm/io.h> @@ -132,7 +133,7 @@ struct pci_controller *pcibios_alloc_controller(struct device_node *dev) int nid = of_node_to_nid(dev); if (nid < 0 || !node_online(nid)) - nid = -1; + nid = NUMA_NO_NODE; PHB_SET_NODE(phb, nid); } diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c @@ -798,7 +798,6 @@ static int __init vdso_init(void) BUG_ON(vdso32_pagelist == NULL); for (i = 0; i < vdso32_pages; i++) { struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE); - ClearPageReserved(pg); get_page(pg); vdso32_pagelist[i] = pg; } @@ -812,7 +811,6 @@ static int __init vdso_init(void) BUG_ON(vdso64_pagelist == NULL); for (i = 0; i < vdso64_pages; i++) { struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); - ClearPageReserved(pg); get_page(pg); vdso64_pagelist[i] = pg; } diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -121,3 +121,28 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } + +pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + unsigned long pte_val; + /* + * Clear the _PAGE_PRESENT so that no hardware parallel update is + * possible. Also keep the pte_present true so that we don't take + * wrong fault. + */ + pte_val = pte_update(vma->vm_mm, addr, ptep, + _PAGE_PRESENT, _PAGE_INVALID, 1); + + return __pte(pte_val); +} + +void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t old_pte, pte_t pte) +{ + + if (radix_enabled()) + return radix__huge_ptep_modify_prot_commit(vma, addr, ptep, + old_pte, pte); + set_huge_pte_at(vma->vm_mm, addr, ptep, pte); +} diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c @@ -90,3 +90,20 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return vm_unmapped_area(&info); } + +void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t pte) +{ + struct mm_struct *mm = vma->vm_mm; + + /* + * To avoid NMMU hang while relaxing access we need to flush the tlb before + * we set the new value. + */ + if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && + (atomic_read(&mm->context.copros) > 0)) + radix__flush_hugetlb_page(vma, addr); + + set_huge_pte_at(vma->vm_mm, addr, ptep, pte); +} diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c @@ -21,6 +21,7 @@ #include <linux/sizes.h> #include <asm/mmu_context.h> #include <asm/pte-walk.h> +#include <linux/mm_inline.h> static DEFINE_MUTEX(mem_list_mutex); @@ -34,8 +35,18 @@ struct mm_iommu_table_group_mem_t { atomic64_t mapped; unsigned int pageshift; u64 ua; /* userspace address */ - u64 entries; /* number of entries in hpas[] */ - u64 *hpas; /* vmalloc'ed */ + u64 entries; /* number of entries in hpas/hpages[] */ + /* + * in mm_iommu_get we temporarily use this to store + * struct page address. + * + * We need to convert ua to hpa in real mode. Make it + * simpler by storing physical address. + */ + union { + struct page **hpages; /* vmalloc'ed */ + phys_addr_t *hpas; + }; #define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1) u64 dev_hpa; /* Device memory base address */ }; @@ -80,64 +91,13 @@ bool mm_iommu_preregistered(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mm_iommu_preregistered); -/* - * Taken from alloc_migrate_target with changes to remove CMA allocations - */ -struct page *new_iommu_non_cma_page(struct page *page, unsigned long private) -{ - gfp_t gfp_mask = GFP_USER; - struct page *new_page; - - if (PageCompound(page)) - return NULL; - - if (PageHighMem(page)) - gfp_mask |= __GFP_HIGHMEM; - - /* - * We don't want the allocation to force an OOM if possibe - */ - new_page = alloc_page(gfp_mask | __GFP_NORETRY | __GFP_NOWARN); - return new_page; -} - -static int mm_iommu_move_page_from_cma(struct page *page) -{ - int ret = 0; - LIST_HEAD(cma_migrate_pages); - - /* Ignore huge pages for now */ - if (PageCompound(page)) - return -EBUSY; - - lru_add_drain(); - ret = isolate_lru_page(page); - if (ret) - return ret; - - list_add(&page->lru, &cma_migrate_pages); - put_page(page); /* Drop the gup reference */ - - ret = migrate_pages(&cma_migrate_pages, new_iommu_non_cma_page, - NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE); - if (ret) { - if (!list_empty(&cma_migrate_pages)) - putback_movable_pages(&cma_migrate_pages); - } - - return 0; -} - static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, - unsigned long entries, unsigned long dev_hpa, - struct mm_iommu_table_group_mem_t **pmem) + unsigned long entries, unsigned long dev_hpa, + struct mm_iommu_table_group_mem_t **pmem) { struct mm_iommu_table_group_mem_t *mem; - long i, j, ret = 0, locked_entries = 0; + long i, ret, locked_entries = 0; unsigned int pageshift; - unsigned long flags; - unsigned long cur_ua; - struct page *page = NULL; mutex_lock(&mem_list_mutex); @@ -187,62 +147,43 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, goto unlock_exit; } + down_read(&mm->mmap_sem); + ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL); + up_read(&mm->mmap_sem); + if (ret != entries) { + /* free the reference taken */ + for (i = 0; i < ret; i++) + put_page(mem->hpages[i]); + + vfree(mem->hpas); + kfree(mem); + ret = -EFAULT; + goto unlock_exit; + } + + pageshift = PAGE_SHIFT; for (i = 0; i < entries; ++i) { - cur_ua = ua + (i << PAGE_SHIFT); - if (1 != get_user_pages_fast(cur_ua, - 1/* pages */, 1/* iswrite */, &page)) { - ret = -EFAULT; - for (j = 0; j < i; ++j) - put_page(pfn_to_page(mem->hpas[j] >> - PAGE_SHIFT)); - vfree(mem->hpas); - kfree(mem); - goto unlock_exit; - } + struct page *page = mem->hpages[i]; + /* - * If we get a page from the CMA zone, since we are going to - * be pinning these entries, we might as well move them out - * of the CMA zone if possible. NOTE: faulting in + migration - * can be expensive. Batching can be considered later + * Allow to use larger than 64k IOMMU pages. Only do that + * if we are backed by hugetlb. */ - if (is_migrate_cma_page(page)) { - if (mm_iommu_move_page_from_cma(page)) - goto populate; - if (1 != get_user_pages_fast(cur_ua, - 1/* pages */, 1/* iswrite */, - &page)) { - ret = -EFAULT; - for (j = 0; j < i; ++j) - put_page(pfn_to_page(mem->hpas[j] >> - PAGE_SHIFT)); - vfree(mem->hpas); - kfree(mem); - goto unlock_exit; - } - } -populate: - pageshift = PAGE_SHIFT; - if (mem->pageshift > PAGE_SHIFT && PageCompound(page)) { - pte_t *pte; + if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) { struct page *head = compound_head(page); - unsigned int compshift = compound_order(head); - unsigned int pteshift; - - local_irq_save(flags); /* disables as well */ - pte = find_linux_pte(mm->pgd, cur_ua, NULL, &pteshift); - - /* Double check it is still the same pinned page */ - if (pte && pte_page(*pte) == head && - pteshift == compshift + PAGE_SHIFT) - pageshift = max_t(unsigned int, pteshift, - PAGE_SHIFT); - local_irq_restore(flags); + + pageshift = compound_order(head) + PAGE_SHIFT; } mem->pageshift = min(mem->pageshift, pageshift); + /* + * We don't need struct page reference any more, switch + * to physical address. + */ mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT; } good_exit: + ret = 0; atomic64_set(&mem->mapped, 1); mem->used = 1; mem->ua = ua; diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c @@ -84,7 +84,7 @@ static void __init setup_node_to_cpumask_map(void) alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); /* cpumask_of_node() will now work */ - dbg("Node to cpumask map for %d nodes\n", nr_node_ids); + dbg("Node to cpumask map for %u nodes\n", nr_node_ids); } static int __init fake_numa_create_new_node(unsigned long end_pfn, @@ -215,7 +215,7 @@ static void initialize_distance_lookup_table(int nid, */ static int associativity_to_nid(const __be32 *associativity) { - int nid = -1; + int nid = NUMA_NO_NODE; if (min_common_depth == -1) goto out; @@ -225,7 +225,7 @@ static int associativity_to_nid(const __be32 *associativity) /* POWER4 LPAR uses 0xffff as invalid node */ if (nid == 0xffff || nid >= MAX_NUMNODES) - nid = -1; + nid = NUMA_NO_NODE; if (nid > 0 && of_read_number(associativity, 1) >= distance_ref_points_depth) { @@ -244,7 +244,7 @@ out: */ static int of_node_to_nid_single(struct device_node *device) { - int nid = -1; + int nid = NUMA_NO_NODE; const __be32 *tmp; tmp = of_get_associativity(device); @@ -256,7 +256,7 @@ static int of_node_to_nid_single(struct device_node *device) /* Walk the device tree upwards, looking for an associativity id */ int of_node_to_nid(struct device_node *device) { - int nid = -1; + int nid = NUMA_NO_NODE; of_node_get(device); while (device) { @@ -454,7 +454,7 @@ static int of_drconf_to_nid_single(struct drmem_lmb *lmb) */ static int numa_setup_cpu(unsigned long lcpu) { - int nid = -1; + int nid = NUMA_NO_NODE; struct device_node *cpu; /* @@ -930,7 +930,7 @@ static int hot_add_drconf_scn_to_nid(unsigned long scn_addr) { struct drmem_lmb *lmb; unsigned long lmb_size; - int nid = -1; + int nid = NUMA_NO_NODE; lmb_size = drmem_lmb_size(); @@ -960,7 +960,7 @@ static int hot_add_drconf_scn_to_nid(unsigned long scn_addr) static int hot_add_node_scn_to_nid(unsigned long scn_addr) { struct device_node *memory; - int nid = -1; + int nid = NUMA_NO_NODE; for_each_node_by_type(memory, "memory") { unsigned long start, size; diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c @@ -401,6 +401,31 @@ void arch_report_meminfo(struct seq_file *m) } #endif /* CONFIG_PROC_FS */ +pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep) +{ + unsigned long pte_val; + + /* + * Clear the _PAGE_PRESENT so that no hardware parallel update is + * possible. Also keep the pte_present true so that we don't take + * wrong fault. + */ + pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0); + + return __pte(pte_val); + +} + +void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t old_pte, pte_t pte) +{ + if (radix_enabled()) + return radix__ptep_modify_prot_commit(vma, addr, + ptep, old_pte, pte); + set_pte_at(vma->vm_mm, addr, ptep, pte); +} + /* * For hash translation mode, we use the deposited table to store hash slot * information and they are stored at PTRS_PER_PMD offset from related pmd diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c @@ -1063,3 +1063,21 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, } /* See ptesync comment in radix__set_pte_at */ } + +void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t pte) +{ + struct mm_struct *mm = vma->vm_mm; + + /* + * To avoid NMMU hang while relaxing access we need to flush the tlb before + * we set the new value. We need to do this only for radix, because hash + * translation does flush when updating the linux pte. + */ + if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && + (atomic_read(&mm->context.copros) > 0)) + radix__flush_tlb_page(vma, addr); + + set_pte_at(mm, addr, ptep, pte); +} diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c @@ -20,6 +20,7 @@ #include <linux/slab.h> #include <linux/memory.h> #include <linux/memory_hotplug.h> +#include <linux/numa.h> #include <asm/machdep.h> #include <asm/debugfs.h> @@ -223,7 +224,7 @@ static int memtrace_online(void) ent = &memtrace_array[i]; /* We have onlined this chunk previously */ - if (ent->nid == -1) + if (ent->nid == NUMA_NO_NODE) continue; /* Remove from io mappings */ @@ -257,7 +258,7 @@ static int memtrace_online(void) */ debugfs_remove_recursive(ent->dir); pr_info("Added trace memory back to node %d\n", ent->nid); - ent->size = ent->start = ent->nid = -1; + ent->size = ent->start = ent->nid = NUMA_NO_NODE; } if (ret) return ret; diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c @@ -54,7 +54,6 @@ static int __init vdso_init(void) struct page *pg; pg = virt_to_page(vdso_start + (i << PAGE_SHIFT)); - ClearPageReserved(pg); vdso_pagelist[i] = pg; } vdso_pagelist[i] = virt_to_page(vdso_data); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h @@ -1069,8 +1069,9 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION -pte_t ptep_modify_prot_start(struct mm_struct *, unsigned long, pte_t *); -void ptep_modify_prot_commit(struct mm_struct *, unsigned long, pte_t *, pte_t); +pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *); +void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, + pte_t *, pte_t, pte_t); #define __HAVE_ARCH_PTEP_CLEAR_FLUSH static inline pte_t ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c @@ -291,7 +291,6 @@ static int __init vdso_init(void) BUG_ON(vdso32_pagelist == NULL); for (i = 0; i < vdso32_pages - 1; i++) { struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE); - ClearPageReserved(pg); get_page(pg); vdso32_pagelist[i] = pg; } @@ -309,7 +308,6 @@ static int __init vdso_init(void) BUG_ON(vdso64_pagelist == NULL); for (i = 0; i < vdso64_pages - 1; i++) { struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); - ClearPageReserved(pg); get_page(pg); vdso64_pagelist[i] = pg; } diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c @@ -301,12 +301,13 @@ pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(ptep_xchg_lazy); -pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, +pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pgste_t pgste; pte_t old; int nodat; + struct mm_struct *mm = vma->vm_mm; preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); @@ -319,10 +320,11 @@ pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, return old; } -void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) +void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t old_pte, pte_t pte) { pgste_t pgste; + struct mm_struct *mm = vma->vm_mm; if (!MACHINE_HAS_NX) pte_val(pte) &= ~_PAGE_NOEXEC; diff --git a/arch/sh/kernel/syscalls/syscalltbl.sh b/arch/sh/kernel/syscalls/syscalltbl.sh @@ -13,10 +13,10 @@ emit() { t_entry="$3" while [ $t_nxt -lt $t_nr ]; do - printf "__SYSCALL(%s, sys_ni_syscall, )\n" "${t_nxt}" + printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}" t_nxt=$((t_nxt+1)) done - printf "__SYSCALL(%s, %s, )\n" "${t_nxt}" "${t_entry}" + printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}" } grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S @@ -10,7 +10,7 @@ #include <linux/sys.h> #include <linux/linkage.h> -#define __SYSCALL(nr, entry, nargs) .long entry +#define __SYSCALL(nr, entry) .long entry .data ENTRY(sys_call_table) #include <asm/syscall_table.h> diff --git a/arch/sparc/kernel/pci_fire.c b/arch/sparc/kernel/pci_fire.c @@ -11,6 +11,7 @@ #include <linux/export.h> #include <linux/irq.h> #include <linux/of_device.h> +#include <linux/numa.h> #include <asm/prom.h> #include <asm/irq.h> @@ -416,7 +417,7 @@ static int pci_fire_pbm_init(struct pci_pbm_info *pbm, struct device_node *dp = op->dev.of_node; int err; - pbm->numa_node = -1; + pbm->numa_node = NUMA_NO_NODE; pbm->pci_ops = &sun4u_pci_ops; pbm->config_space_reg_bits = 12; diff --git a/arch/sparc/kernel/pci_schizo.c b/arch/sparc/kernel/pci_schizo.c @@ -12,6 +12,7 @@ #include <linux/export.h> #include <linux/interrupt.h> #include <linux/of_device.h> +#include <linux/numa.h> #include <asm/iommu.h> #include <asm/irq.h> @@ -1347,7 +1348,7 @@ static int schizo_pbm_init(struct pci_pbm_info *pbm, pbm->next = pci_pbm_root; pci_pbm_root = pbm; - pbm->numa_node = -1; + pbm->numa_node = NUMA_NO_NODE; pbm->pci_ops = &sun4u_pci_ops; pbm->config_space_reg_bits = 8; diff --git a/arch/sparc/kernel/psycho_common.c b/arch/sparc/kernel/psycho_common.c @@ -5,6 +5,7 @@ */ #include <linux/kernel.h> #include <linux/interrupt.h> +#include <linux/numa.h> #include <asm/upa.h> @@ -454,7 +455,7 @@ void psycho_pbm_init_common(struct pci_pbm_info *pbm, struct platform_device *op struct device_node *dp = op->dev.of_node; pbm->name = dp->full_name; - pbm->numa_node = -1; + pbm->numa_node = NUMA_NO_NODE; pbm->chip_type = chip_type; pbm->chip_version = of_getintprop_default(dp, "version#", 0); pbm->chip_revision = of_getintprop_default(dp, "module-revision#", 0); diff --git a/arch/sparc/kernel/sbus.c b/arch/sparc/kernel/sbus.c @@ -15,6 +15,7 @@ #include <linux/interrupt.h> #include <linux/of.h> #include <linux/of_device.h> +#include <linux/numa.h> #include <asm/page.h> #include <asm/io.h> @@ -561,7 +562,7 @@ static void __init sbus_iommu_init(struct platform_device *op) op->dev.archdata.iommu = iommu; op->dev.archdata.stc = strbuf; - op->dev.archdata.numa_node = -1; + op->dev.archdata.numa_node = NUMA_NO_NODE; reg_base = regs + SYSIO_IOMMUREG_BASE; iommu->iommu_control = reg_base + IOMMU_CONTROL; diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c @@ -976,13 +976,13 @@ static u64 __init memblock_nid_range_sun4u(u64 start, u64 end, int *nid) { int prev_nid, new_nid; - prev_nid = -1; + prev_nid = NUMA_NO_NODE; for ( ; start < end; start += PAGE_SIZE) { for (new_nid = 0; new_nid < num_node_masks; new_nid++) { struct node_mem_mask *p = &node_masks[new_nid]; if ((start & p->mask) == p->match) { - if (prev_nid == -1) + if (prev_nid == NUMA_NO_NODE) prev_nid = new_nid; break; } @@ -1208,7 +1208,7 @@ int of_node_to_nid(struct device_node *dp) md = mdesc_grab(); count = 0; - nid = -1; + nid = NUMA_NO_NODE; mdesc_for_each_node_by_name(md, grp, "group") { if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) { nid = count; diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h @@ -422,25 +422,26 @@ static inline pgdval_t pgd_val(pgd_t pgd) } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION -static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, +static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pteval_t ret; - ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, mm, addr, ptep); + ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, vma, addr, ptep); return (pte_t) { .pte = ret }; } -static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) +static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t old_pte, pte_t pte) { + if (sizeof(pteval_t) > sizeof(long)) /* 5 arg words */ - pv_ops.mmu.ptep_modify_prot_commit(mm, addr, ptep, pte); + pv_ops.mmu.ptep_modify_prot_commit(vma, addr, ptep, pte); else PVOP_VCALL4(mmu.ptep_modify_prot_commit, - mm, addr, ptep, pte.pte); + vma, addr, ptep, pte.pte); } static inline void set_pte(pte_t *ptep, pte_t pte) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h @@ -55,6 +55,7 @@ struct task_struct; struct cpumask; struct flush_tlb_info; struct mmu_gather; +struct vm_area_struct; /* * Wrapper type for pointers to code which uses the non-standard @@ -254,9 +255,9 @@ struct pv_mmu_ops { pte_t *ptep, pte_t pteval); void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); - pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, + pte_t (*ptep_modify_prot_start)(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); - void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, + void (*ptep_modify_prot_commit)(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte); struct paravirt_callee_save pte_val; diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/scatterlist.h> +#include <linux/numa.h> #include <asm/io.h> #include <asm/pat.h> #include <asm/x86_init.h> @@ -141,7 +142,7 @@ cpumask_of_pcibus(const struct pci_bus *bus) int node; node = __pcibus_to_node(bus); - return (node == -1) ? cpu_online_mask : + return (node == NUMA_NO_NODE) ? cpu_online_mask : cpumask_of_node(node); } #endif diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h @@ -75,7 +75,7 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un #endif /** - * access_ok: - Checks if a user space pointer is valid + * access_ok - Checks if a user space pointer is valid * @addr: User space pointer to start of block to check * @size: Size of block to check * @@ -84,12 +84,12 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un * * Checks if a pointer to a block of memory in user space is valid. * - * Returns true (nonzero) if the memory block may be valid, false (zero) - * if it is definitely invalid. - * * Note that, depending on architecture, this function probably just * checks that the pointer is in the user space range - after calling * this function, memory access functions may still return -EFAULT. + * + * Return: true (nonzero) if the memory block may be valid, false (zero) + * if it is definitely invalid. */ #define access_ok(addr, size) \ ({ \ @@ -134,7 +134,7 @@ extern int __get_user_bad(void); __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) /** - * get_user: - Get a simple variable from user space. + * get_user - Get a simple variable from user space. * @x: Variable to store result. * @ptr: Source address, in user space. * @@ -148,7 +148,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) * @ptr must have pointer-to-simple-variable type, and the result of * dereferencing @ptr must be assignable to @x without a cast. * - * Returns zero on success, or -EFAULT on error. + * Return: zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ /* @@ -226,7 +226,7 @@ extern void __put_user_4(void); extern void __put_user_8(void); /** - * put_user: - Write a simple value into user space. + * put_user - Write a simple value into user space. * @x: Value to copy to user space. * @ptr: Destination address, in user space. * @@ -240,7 +240,7 @@ extern void __put_user_8(void); * @ptr must have pointer-to-simple-variable type, and @x must be assignable * to the result of dereferencing @ptr. * - * Returns zero on success, or -EFAULT on error. + * Return: zero on success, or -EFAULT on error. */ #define put_user(x, ptr) \ ({ \ @@ -502,7 +502,7 @@ struct __large_struct { unsigned long buf[100]; }; } while (0) /** - * __get_user: - Get a simple variable from user space, with less checking. + * __get_user - Get a simple variable from user space, with less checking. * @x: Variable to store result. * @ptr: Source address, in user space. * @@ -519,7 +519,7 @@ struct __large_struct { unsigned long buf[100]; }; * Caller must check the pointer with access_ok() before calling this * function. * - * Returns zero on success, or -EFAULT on error. + * Return: zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ @@ -527,7 +527,7 @@ struct __large_struct { unsigned long buf[100]; }; __get_user_nocheck((x), (ptr), sizeof(*(ptr))) /** - * __put_user: - Write a simple value into user space, with less checking. + * __put_user - Write a simple value into user space, with less checking. * @x: Value to copy to user space. * @ptr: Destination address, in user space. * @@ -544,7 +544,7 @@ struct __large_struct { unsigned long buf[100]; }; * Caller must check the pointer with access_ok() before calling this * function. * - * Returns zero on success, or -EFAULT on error. + * Return: zero on success, or -EFAULT on error. */ #define __put_user(x, ptr) \ diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -27,6 +27,7 @@ #include <linux/crash_dump.h> #include <linux/reboot.h> #include <linux/memory.h> +#include <linux/numa.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> @@ -1390,7 +1391,7 @@ static void __init build_socket_tables(void) } /* Set socket -> node values: */ - lnid = -1; + lnid = NUMA_NO_NODE; for_each_present_cpu(cpu) { int nid = cpu_to_node(cpu); int apicid, sockid; @@ -1521,7 +1522,7 @@ static void __init uv_system_init_hub(void) new_hub->pnode = 0xffff; new_hub->numa_blade_id = uv_node_to_blade_id(nodeid); - new_hub->memory_nid = -1; + new_hub->memory_nid = NUMA_NO_NODE; new_hub->nr_possible_cpus = 0; new_hub->nr_online_cpus = 0; } @@ -1538,7 +1539,7 @@ static void __init uv_system_init_hub(void) uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid); uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++; - if (uv_cpu_hub_info(cpu)->memory_nid == -1) + if (uv_cpu_hub_info(cpu)->memory_nid == NUMA_NO_NODE) uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu); /* Init memoryless node: */ diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c @@ -171,7 +171,7 @@ void __init setup_per_cpu_areas(void) unsigned long delta; int rc; - pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%d\n", + pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%u\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c @@ -56,6 +56,7 @@ #include <linux/stackprotector.h> #include <linux/gfp.h> #include <linux/cpuidle.h> +#include <linux/numa.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -841,7 +842,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* reduce the number of lines printed when booting a large cpu count system */ static void announce_cpu(int cpu, int apicid) { - static int current_node = -1; + static int current_node = NUMA_NO_NODE; int node = early_cpu_to_node(cpu); static int width, node_width; diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c @@ -54,13 +54,13 @@ do { \ } while (0) /** - * clear_user: - Zero a block of memory in user space. + * clear_user - Zero a block of memory in user space. * @to: Destination address, in user space. * @n: Number of bytes to zero. * * Zero a block of memory in user space. * - * Returns number of bytes that could not be cleared. + * Return: number of bytes that could not be cleared. * On success, this will be zero. */ unsigned long @@ -74,14 +74,14 @@ clear_user(void __user *to, unsigned long n) EXPORT_SYMBOL(clear_user); /** - * __clear_user: - Zero a block of memory in user space, with less checking. + * __clear_user - Zero a block of memory in user space, with less checking. * @to: Destination address, in user space. * @n: Number of bytes to zero. * * Zero a block of memory in user space. Caller must check * the specified block with access_ok() before calling this function. * - * Returns number of bytes that could not be cleared. + * Return: number of bytes that could not be cleared. * On success, this will be zero. */ unsigned long diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c @@ -123,7 +123,7 @@ void __init setup_node_to_cpumask_map(void) alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); /* cpumask_of_node() will now work */ - pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); + pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); } static int __init numa_add_memblk_to(int nid, u64 start, u64 end, @@ -866,7 +866,7 @@ const struct cpumask *cpumask_of_node(int node) { if (node >= nr_node_ids) { printk(KERN_WARNING - "cpumask_of_node(%d): node > nr_node_ids(%d)\n", + "cpumask_of_node(%d): node > nr_node_ids(%u)\n", node, nr_node_ids); dump_stack(); return cpu_none_mask; diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h @@ -17,8 +17,8 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, +pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte); unsigned long xen_read_cr2_direct(void); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c @@ -306,20 +306,20 @@ static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, __xen_set_pte(ptep, pteval); } -pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, +pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { /* Just return the pte as-is. We preserve the bits on commit */ - trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep); + trace_xen_mmu_ptep_modify_prot_start(vma->vm_mm, addr, ptep, *ptep); return *ptep; } -void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, +void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { struct mmu_update u; - trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte); + trace_xen_mmu_ptep_modify_prot_commit(vma->vm_mm, addr, ptep, pte); xen_mc_batch(); u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c @@ -40,6 +40,7 @@ #include <linux/export.h> #include <linux/debugfs.h> #include <linux/prefetch.h> +#include <linux/numa.h> #include "mtip32xx.h" #define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32) @@ -4018,9 +4019,9 @@ static int get_least_used_cpu_on_node(int node) /* Helper for selecting a node in round robin mode */ static inline int mtip_get_next_rr_node(void) { - static int next_node = -1; + static int next_node = NUMA_NO_NODE; - if (next_node == -1) { + if (next_node == NUMA_NO_NODE) { next_node = first_online_node; return next_node; } diff --git a/drivers/char/agp/efficeon-agp.c b/drivers/char/agp/efficeon-agp.c @@ -163,7 +163,6 @@ static int efficeon_free_gatt_table(struct agp_bridge_data *bridge) unsigned long page = efficeon_private.l1_table[index]; if (page) { efficeon_private.l1_table[index] = 0; - ClearPageReserved(virt_to_page((char *)page)); free_page(page); freed++; } @@ -219,7 +218,6 @@ static int efficeon_create_gatt_table(struct agp_bridge_data *bridge) efficeon_free_gatt_table(agp_bridge); return -ENOMEM; } - SetPageReserved(virt_to_page((char *)page)); for (offset = 0; offset < PAGE_SIZE; offset += clflush_chunk) clflush((char *)page+offset); diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c @@ -63,6 +63,7 @@ #include <linux/acpi_dma.h> #include <linux/of_dma.h> #include <linux/mempool.h> +#include <linux/numa.h> static DEFINE_MUTEX(dma_list_mutex); static DEFINE_IDA(dma_ida); @@ -386,7 +387,8 @@ EXPORT_SYMBOL(dma_issue_pending_all); static bool dma_chan_is_local(struct dma_chan *chan, int cpu) { int node = dev_to_node(chan->device->dev); - return node == -1 || cpumask_test_cpu(cpu, cpumask_of_node(node)); + return node == NUMA_NO_NODE || + cpumask_test_cpu(cpu, cpumask_of_node(node)); } /** diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h @@ -123,12 +123,6 @@ static inline u64 ptr_to_u64(const void *ptr) #include <linux/list.h> -static inline int list_is_first(const struct list_head *list, - const struct list_head *head) -{ - return head->next == list; -} - static inline void __list_del_many(struct list_head *head, struct list_head *first) { diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c @@ -681,8 +681,13 @@ static struct notifier_block hv_memory_nb = { /* Check if the particular page is backed and can be onlined and online it. */ static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg) { - if (!has_pfn_is_backed(has, page_to_pfn(pg))) + if (!has_pfn_is_backed(has, page_to_pfn(pg))) { + if (!PageOffline(pg)) + __SetPageOffline(pg); return; + } + if (PageOffline(pg)) + __ClearPageOffline(pg); /* This frame is currently backed; online the page. */ __online_page_set_limits(pg); @@ -771,7 +776,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, } } -static void hv_online_page(struct page *pg) +static void hv_online_page(struct page *pg, unsigned int order) { struct hv_hotadd_state *has; unsigned long flags; @@ -780,10 +785,11 @@ static void hv_online_page(struct page *pg) spin_lock_irqsave(&dm_device.ha_lock, flags); list_for_each_entry(has, &dm_device.ha_region_list, list) { /* The page belongs to a different HAS. */ - if ((pfn < has->start_pfn) || (pfn >= has->end_pfn)) + if ((pfn < has->start_pfn) || + (pfn + (1UL << order) > has->end_pfn)) continue; - hv_page_online_one(has, pg); + hv_bring_pgs_online(has, pfn, 1UL << order); break; } spin_unlock_irqrestore(&dm_device.ha_lock, flags); @@ -1201,6 +1207,7 @@ static void free_balloon_pages(struct hv_dynmem_device *dm, for (i = 0; i < num_pages; i++) { pg = pfn_to_page(i + start_frame); + __ClearPageOffline(pg); __free_page(pg); dm->num_pages_ballooned--; } @@ -1213,7 +1220,7 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, struct dm_balloon_response *bl_resp, int alloc_unit) { - unsigned int i = 0; + unsigned int i, j; struct page *pg; if (num_pages < alloc_unit) @@ -1245,6 +1252,10 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, if (alloc_unit != 1) split_page(pg, get_order(alloc_unit << PAGE_SHIFT)); + /* mark all pages offline */ + for (j = 0; j < (1 << get_order(alloc_unit << PAGE_SHIFT)); j++) + __SetPageOffline(pg + j); + bl_resp->range_count++; bl_resp->range_array[i].finfo.start_page = page_to_pfn(pg); diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c @@ -48,6 +48,7 @@ #include <linux/cpumask.h> #include <linux/module.h> #include <linux/interrupt.h> +#include <linux/numa.h> #include "hfi.h" #include "affinity.h" @@ -777,7 +778,7 @@ void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd) _dev_comp_vect_cpu_mask_clean_up(dd, entry); unlock: mutex_unlock(&node_affinity.lock); - dd->node = -1; + dd->node = NUMA_NO_NODE; } /* diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c @@ -54,6 +54,7 @@ #include <linux/printk.h> #include <linux/hrtimer.h> #include <linux/bitmap.h> +#include <linux/numa.h> #include <rdma/rdma_vt.h> #include "hfi.h" @@ -1303,7 +1304,7 @@ static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, dd->unit = ret; list_add(&dd->list, &hfi1_dev_list); } - dd->node = -1; + dd->node = NUMA_NO_NODE; spin_unlock_irqrestore(&hfi1_devs_lock, flags); idr_preload_end(); diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c @@ -39,6 +39,7 @@ #include <linux/dmi.h> #include <linux/slab.h> #include <linux/iommu.h> +#include <linux/numa.h> #include <asm/irq_remapping.h> #include <asm/iommu_table.h> @@ -477,7 +478,7 @@ static int dmar_parse_one_rhsa(struct acpi_dmar_header *header, void *arg) int node = acpi_map_pxm_to_node(rhsa->proximity_domain); if (!node_online(node)) - node = -1; + node = NUMA_NO_NODE; drhd->iommu->node = node; return 0; } @@ -1062,7 +1063,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) iommu->msagaw = msagaw; iommu->segment = drhd->segment; - iommu->node = -1; + iommu->node = NUMA_NO_NODE; ver = readl(iommu->reg + DMAR_VER_REG); pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n", diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c @@ -47,6 +47,7 @@ #include <linux/dma-contiguous.h> #include <linux/dma-direct.h> #include <linux/crash_dump.h> +#include <linux/numa.h> #include <asm/irq_remapping.h> #include <asm/cacheflush.h> #include <asm/iommu.h> @@ -1716,7 +1717,7 @@ static struct dmar_domain *alloc_domain(int flags) return NULL; memset(domain, 0, sizeof(*domain)); - domain->nid = -1; + domain->nid = NUMA_NO_NODE; domain->flags = flags; domain->has_iotlb_device = false; INIT_LIST_HEAD(&domain->devices); diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c @@ -22,6 +22,7 @@ #include <linux/module.h> #include <linux/err.h> #include <linux/slab.h> +#include <linux/numa.h> #include <asm/uv/uv_hub.h> #if defined CONFIG_X86_64 #include <asm/uv/bios.h> @@ -61,7 +62,7 @@ static struct xpc_heartbeat_uv *xpc_heartbeat_uv; XPC_NOTIFY_MSG_SIZE_UV) #define XPC_NOTIFY_IRQ_NAME "xpc_notify" -static int xpc_mq_node = -1; +static int xpc_mq_node = NUMA_NO_NODE; static struct xpc_gru_mq_uv *xpc_activate_mq_uv; static struct xpc_gru_mq_uv *xpc_notify_mq_uv; diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c @@ -557,6 +557,36 @@ vmballoon_page_in_frames(enum vmballoon_page_size_type page_size) } /** + * vmballoon_mark_page_offline() - mark a page as offline + * @page: pointer for the page. + * @page_size: the size of the page. + */ +static void +vmballoon_mark_page_offline(struct page *page, + enum vmballoon_page_size_type page_size) +{ + int i; + + for (i = 0; i < vmballoon_page_in_frames(page_size); i++) + __SetPageOffline(page + i); +} + +/** + * vmballoon_mark_page_online() - mark a page as online + * @page: pointer for the page. + * @page_size: the size of the page. + */ +static void +vmballoon_mark_page_online(struct page *page, + enum vmballoon_page_size_type page_size) +{ + int i; + + for (i = 0; i < vmballoon_page_in_frames(page_size); i++) + __ClearPageOffline(page + i); +} + +/** * vmballoon_send_get_target() - Retrieve desired balloon size from the host. * * @b: pointer to the balloon. @@ -612,6 +642,7 @@ static int vmballoon_alloc_page_list(struct vmballoon *b, ctl->page_size); if (page) { + vmballoon_mark_page_offline(page, ctl->page_size); /* Success. Add the page to the list and continue. */ list_add(&page->lru, &ctl->pages); continue; @@ -850,6 +881,7 @@ static void vmballoon_release_page_list(struct list_head *page_list, list_for_each_entry_safe(page, tmp, page_list, lru) { list_del(&page->lru); + vmballoon_mark_page_online(page, page_size); __free_pages(page, vmballoon_page_order(page_size)); } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -27,6 +27,7 @@ #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <linux/atomic.h> +#include <linux/numa.h> #include <scsi/fc/fc_fcoe.h> #include <net/udp_tunnel.h> #include <net/pkt_cls.h> @@ -6418,7 +6419,7 @@ int ixgbe_setup_tx_resources(struct ixgbe_ring *tx_ring) { struct device *dev = tx_ring->dev; int orig_node = dev_to_node(dev); - int ring_node = -1; + int ring_node = NUMA_NO_NODE; int size; size = sizeof(struct ixgbe_tx_buffer) * tx_ring->count; @@ -6512,7 +6513,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter, { struct device *dev = rx_ring->dev; int orig_node = dev_to_node(dev); - int ring_node = -1; + int ring_node = NUMA_NO_NODE; int size; size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count; diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c @@ -369,14 +369,20 @@ static enum bp_state reserve_additional_memory(void) return BP_ECANCELED; } -static void xen_online_page(struct page *page) +static void xen_online_page(struct page *page, unsigned int order) { - __online_page_set_limits(page); + unsigned long i, size = (1 << order); + unsigned long start_pfn = page_to_pfn(page); + struct page *p; + pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn); mutex_lock(&balloon_mutex); - - __balloon_append(page); - + for (i = 0; i < size; i++) { + p = pfn_to_page(start_pfn + i); + __online_page_set_limits(p); + __SetPageOffline(p); + __balloon_append(p); + } mutex_unlock(&balloon_mutex); } @@ -441,6 +447,7 @@ static enum bp_state increase_reservation(unsigned long nr_pages) xenmem_reservation_va_mapping_update(1, &page, &frame_list[i]); /* Relinquish the page back to the allocator. */ + __ClearPageOffline(page); free_reserved_page(page); } @@ -467,6 +474,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) state = BP_EAGAIN; break; } + __SetPageOffline(page); adjust_managed_page_count(page, -1); xenmem_reservation_scrub_page(page); list_add(&page->lru, &pages); diff --git a/fs/file.c b/fs/file.c @@ -457,6 +457,7 @@ struct files_struct init_files = { .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), + .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), }; static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c @@ -530,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) inode_lock(inode); /* protected by i_mutex */ - if (info->seals & F_SEAL_WRITE) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { inode_unlock(inode); return -EPERM; } diff --git a/fs/inode.c b/fs/inode.c @@ -2093,14 +2093,8 @@ EXPORT_SYMBOL(inode_dio_wait); void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask) { - unsigned int old_flags, new_flags; - WARN_ON_ONCE(flags & ~mask); - do { - old_flags = READ_ONCE(inode->i_flags); - new_flags = (old_flags & ~mask) | flags; - } while (unlikely(cmpxchg(&inode->i_flags, old_flags, - new_flags) != old_flags)); + set_mask_bits(&inode->i_flags, mask, flags); } EXPORT_SYMBOL(inode_set_flags); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c @@ -832,26 +832,35 @@ void kernfs_drain_open_files(struct kernfs_node *kn) * to see if it supports poll (Neither 'poll' nor 'select' return * an appropriate error code). When in doubt, set a suitable timeout value. */ +__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) +{ + struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry); + struct kernfs_open_node *on = kn->attr.open; + + poll_wait(of->file, &on->poll, wait); + + if (of->event != atomic_read(&on->event)) + return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; + + return DEFAULT_POLLMASK; +} + static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); - struct kernfs_open_node *on = kn->attr.open; + __poll_t ret; if (!kernfs_get_active(kn)) - goto trigger; + return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; - poll_wait(filp, &on->poll, wait); + if (kn->attr.ops->poll) + ret = kn->attr.ops->poll(of, wait); + else + ret = kernfs_generic_poll(of, wait); kernfs_put_active(kn); - - if (of->event != atomic_read(&on->event)) - goto trigger; - - return DEFAULT_POLLMASK; - - trigger: - return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; + return ret; } static void kernfs_notify_workfn(struct work_struct *work) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c @@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb, return count; } -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) +static +int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range) { struct ocfs2_super *osb = OCFS2_SB(sb); - u64 start, len, trimmed, first_group, last_group, group; + u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0; int ret, cnt; u32 first_bit, last_bit, minlen; struct buffer_head *main_bm_bh = NULL; @@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) struct buffer_head *gd_bh = NULL; struct ocfs2_dinode *main_bm; struct ocfs2_group_desc *gd = NULL; - struct ocfs2_trim_fs_info info, *pinfo = NULL; start = range->start >> osb->s_clustersize_bits; len = range->len >> osb->s_clustersize_bits; @@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) return -EINVAL; + trace_ocfs2_trim_mainbm(start, len, minlen); + +next_group: main_bm_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); @@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) } main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; - if (start >= le32_to_cpu(main_bm->i_clusters)) { - ret = -EINVAL; - goto out_unlock; - } - - len = range->len >> osb->s_clustersize_bits; - if (start + len > le32_to_cpu(main_bm->i_clusters)) - len = le32_to_cpu(main_bm->i_clusters) - start; - - trace_ocfs2_trim_fs(start, len, minlen); - - ocfs2_trim_fs_lock_res_init(osb); - ret = ocfs2_trim_fs_lock(osb, NULL, 1); - if (ret < 0) { - if (ret != -EAGAIN) { - mlog_errno(ret); - ocfs2_trim_fs_lock_res_uninit(osb); + /* + * Do some check before trim the first group. + */ + if (!group) { + if (start >= le32_to_cpu(main_bm->i_clusters)) { + ret = -EINVAL; goto out_unlock; } - mlog(ML_NOTICE, "Wait for trim on device (%s) to " - "finish, which is running from another node.\n", - osb->dev_str); - ret = ocfs2_trim_fs_lock(osb, &info, 0); - if (ret < 0) { - mlog_errno(ret); - ocfs2_trim_fs_lock_res_uninit(osb); - goto out_unlock; - } + if (start + len > le32_to_cpu(main_bm->i_clusters)) + len = le32_to_cpu(main_bm->i_clusters) - start; - if (info.tf_valid && info.tf_success && - info.tf_start == start && info.tf_len == len && - info.tf_minlen == minlen) { - /* Avoid sending duplicated trim to a shared device */ - mlog(ML_NOTICE, "The same trim on device (%s) was " - "just done from node (%u), return.\n", - osb->dev_str, info.tf_nodenum); - range->len = info.tf_trimlen; - goto out_trimunlock; - } + /* + * Determine first and last group to examine based on + * start and len + */ + first_group = ocfs2_which_cluster_group(main_bm_inode, start); + if (first_group == osb->first_cluster_group_blkno) + first_bit = start; + else + first_bit = start - ocfs2_blocks_to_clusters(sb, + first_group); + last_group = ocfs2_which_cluster_group(main_bm_inode, + start + len - 1); + group = first_group; } - info.tf_nodenum = osb->node_num; - info.tf_start = start; - info.tf_len = len; - info.tf_minlen = minlen; - - /* Determine first and last group to examine based on start and len */ - first_group = ocfs2_which_cluster_group(main_bm_inode, start); - if (first_group == osb->first_cluster_group_blkno) - first_bit = start; - else - first_bit = start - ocfs2_blocks_to_clusters(sb, first_group); - last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); - last_bit = osb->bitmap_cpg; - - trimmed = 0; - for (group = first_group; group <= last_group;) { + do { if (first_bit + len >= osb->bitmap_cpg) last_bit = osb->bitmap_cpg; else @@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); else group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); - } - range->len = trimmed * sb->s_blocksize; + } while (0); - info.tf_trimlen = range->len; - info.tf_success = (ret ? 0 : 1); - pinfo = &info; -out_trimunlock: - ocfs2_trim_fs_unlock(osb, pinfo); - ocfs2_trim_fs_lock_res_uninit(osb); out_unlock: ocfs2_inode_unlock(main_bm_inode, 0); brelse(main_bm_bh); + main_bm_bh = NULL; out_mutex: inode_unlock(main_bm_inode); iput(main_bm_inode); + + /* + * If all the groups trim are not done or failed, but we should release + * main_bm related locks for avoiding the current IO starve, then go to + * trim the next group + */ + if (ret >= 0 && group <= last_group) + goto next_group; out: + range->len = trimmed * sb->s_blocksize; + return ret; +} + +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(sb); + struct ocfs2_trim_fs_info info, *pinfo = NULL; + + ocfs2_trim_fs_lock_res_init(osb); + + trace_ocfs2_trim_fs(range->start, range->len, range->minlen); + + ret = ocfs2_trim_fs_lock(osb, NULL, 1); + if (ret < 0) { + if (ret != -EAGAIN) { + mlog_errno(ret); + ocfs2_trim_fs_lock_res_uninit(osb); + return ret; + } + + mlog(ML_NOTICE, "Wait for trim on device (%s) to " + "finish, which is running from another node.\n", + osb->dev_str); + ret = ocfs2_trim_fs_lock(osb, &info, 0); + if (ret < 0) { + mlog_errno(ret); + ocfs2_trim_fs_lock_res_uninit(osb); + return ret; + } + + if (info.tf_valid && info.tf_success && + info.tf_start == range->start && + info.tf_len == range->len && + info.tf_minlen == range->minlen) { + /* Avoid sending duplicated trim to a shared device */ + mlog(ML_NOTICE, "The same trim on device (%s) was " + "just done from node (%u), return.\n", + osb->dev_str, info.tf_nodenum); + range->len = info.tf_trimlen; + goto out; + } + } + + info.tf_nodenum = osb->node_num; + info.tf_start = range->start; + info.tf_len = range->len; + info.tf_minlen = range->minlen; + + ret = ocfs2_trim_mainbm(sb, range); + + info.tf_trimlen = range->len; + info.tf_success = (ret < 0 ? 0 : 1); + pinfo = &info; +out: + ocfs2_trim_fs_unlock(osb, pinfo); + ocfs2_trim_fs_lock_res_uninit(osb); return ret; } diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c @@ -621,13 +621,15 @@ static void o2nm_node_group_drop_item(struct config_group *group, struct o2nm_node *node = to_o2nm_node(item); struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent); - o2net_disconnect_node(node); + if (cluster->cl_nodes[node->nd_num] == node) { + o2net_disconnect_node(node); - if (cluster->cl_has_local && - (cluster->cl_local_node == node->nd_num)) { - cluster->cl_has_local = 0; - cluster->cl_local_node = O2NM_INVALID_NODE_NUM; - o2net_stop_listening(node); + if (cluster->cl_has_local && + (cluster->cl_local_node == node->nd_num)) { + cluster->cl_has_local = 0; + cluster->cl_local_node = O2NM_INVALID_NODE_NUM; + o2net_stop_listening(node); + } } /* XXX call into net to stop this node from trading messages */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c @@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb) { struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + /* Only one trimfs thread are allowed to work at the same time. */ + mutex_lock(&osb->obs_trim_fs_mutex); + ocfs2_lock_res_init_once(lockres); ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name); ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS, @@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb) ocfs2_simple_drop_lockres(osb, lockres); ocfs2_lock_res_free(lockres); + + mutex_unlock(&osb->obs_trim_fs_mutex); } static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h @@ -407,6 +407,7 @@ struct ocfs2_super struct ocfs2_lock_res osb_rename_lockres; struct ocfs2_lock_res osb_nfs_sync_lockres; struct ocfs2_lock_res osb_trim_fs_lockres; + struct mutex obs_trim_fs_mutex; struct ocfs2_dlm_debug *osb_dlm_debug; struct dentry *osb_debug_root; diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h @@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent, DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group); +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm); + DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs); /* End of trace events for fs/ocfs2/alloc.c. */ diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c @@ -55,7 +55,7 @@ struct ocfs2_slot_info { unsigned int si_blocks; struct buffer_head **si_bh; unsigned int si_num_slots; - struct ocfs2_slot *si_slots; + struct ocfs2_slot si_slots[]; }; @@ -420,9 +420,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) struct inode *inode = NULL; struct ocfs2_slot_info *si; - si = kzalloc(sizeof(struct ocfs2_slot_info) + - (sizeof(struct ocfs2_slot) * osb->max_slots), - GFP_KERNEL); + si = kzalloc(struct_size(si, si_slots, osb->max_slots), GFP_KERNEL); if (!si) { status = -ENOMEM; mlog_errno(status); @@ -431,8 +429,6 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) si->si_extended = ocfs2_uses_extended_slot_map(osb); si->si_num_slots = osb->max_slots; - si->si_slots = (struct ocfs2_slot *)((char *)si + - sizeof(struct ocfs2_slot_info)); inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c @@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb) if (ocfs2_is_hard_readonly(osb)) goto leave; + mutex_init(&osb->obs_trim_fs_mutex); + status = ocfs2_dlm_init(osb); if (status < 0) { mlog_errno(status); diff --git a/fs/pipe.c b/fs/pipe.c @@ -140,8 +140,7 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, struct page *page = buf->page; if (page_count(page) == 1) { - if (memcg_kmem_enabled()) - memcg_kmem_uncharge(page, 0); + memcg_kmem_uncharge(page, 0); __SetPageLocked(page); return 0; } diff --git a/fs/proc/array.c b/fs/proc/array.c @@ -343,28 +343,28 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) #ifdef CONFIG_SECCOMP seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); #endif - seq_printf(m, "\nSpeculation_Store_Bypass:\t"); + seq_puts(m, "\nSpeculation_Store_Bypass:\t"); switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { case -EINVAL: - seq_printf(m, "unknown"); + seq_puts(m, "unknown"); break; case PR_SPEC_NOT_AFFECTED: - seq_printf(m, "not vulnerable"); + seq_puts(m, "not vulnerable"); break; case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE: - seq_printf(m, "thread force mitigated"); + seq_puts(m, "thread force mitigated"); break; case PR_SPEC_PRCTL | PR_SPEC_DISABLE: - seq_printf(m, "thread mitigated"); + seq_puts(m, "thread mitigated"); break; case PR_SPEC_PRCTL | PR_SPEC_ENABLE: - seq_printf(m, "thread vulnerable"); + seq_puts(m, "thread vulnerable"); break; case PR_SPEC_DISABLE: - seq_printf(m, "globally mitigated"); + seq_puts(m, "globally mitigated"); break; default: - seq_printf(m, "vulnerable"); + seq_puts(m, "vulnerable"); break; } seq_putc(m, '\n'); diff --git a/fs/proc/base.c b/fs/proc/base.c @@ -456,7 +456,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { if (unlikely(!sched_info_on())) - seq_printf(m, "0 0 0\n"); + seq_puts(m, "0 0 0\n"); else seq_printf(m, "%llu %llu %lu\n", (unsigned long long)task->se.sum_exec_runtime, @@ -3161,7 +3161,7 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry, return d_splice_alias(inode, dentry); } -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) +struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) { struct task_struct *task; unsigned tgid; diff --git a/fs/proc/internal.h b/fs/proc/internal.h @@ -162,7 +162,7 @@ extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struc extern void pid_update_inode(struct task_struct *, struct inode *); extern int pid_delete_dentry(const struct dentry *); extern int proc_pid_readdir(struct file *, struct dir_context *); -extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); +struct dentry *proc_pid_lookup(struct dentry *, unsigned int); extern loff_t mem_lseek(struct file *, loff_t, int); /* Lookups */ diff --git a/fs/proc/page.c b/fs/proc/page.c @@ -152,8 +152,8 @@ u64 stable_page_flags(struct page *page) else if (page_count(page) == 0 && is_free_buddy_page(page)) u |= 1 << KPF_BUDDY; - if (PageBalloon(page)) - u |= 1 << KPF_BALLOON; + if (PageOffline(page)) + u |= 1 << KPF_OFFLINE; if (PageTable(page)) u |= 1 << KPF_PGTABLE; diff --git a/fs/proc/root.c b/fs/proc/root.c @@ -154,7 +154,7 @@ static int proc_root_getattr(const struct path *path, struct kstat *stat, static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { - if (!proc_pid_lookup(dir, dentry, flags)) + if (!proc_pid_lookup(dentry, flags)) return NULL; return proc_lookup(dir, dentry, flags); diff --git a/fs/proc/self.c b/fs/proc/self.c @@ -38,6 +38,7 @@ int proc_setup_self(struct super_block *s) struct inode *root_inode = d_inode(s->s_root); struct pid_namespace *ns = proc_pid_ns(root_inode); struct dentry *self; + int ret = -ENOMEM; inode_lock(root_inode); self = d_alloc_name(s->s_root, "self"); @@ -51,20 +52,19 @@ int proc_setup_self(struct super_block *s) inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_self_inode_operations; d_add(self, inode); + ret = 0; } else { dput(self); - self = ERR_PTR(-ENOMEM); } - } else { - self = ERR_PTR(-ENOMEM); } inode_unlock(root_inode); - if (IS_ERR(self)) { + + if (ret) pr_err("proc_fill_super: can't allocate /proc/self\n"); - return PTR_ERR(self); - } - ns->proc_self = self; - return 0; + else + ns->proc_self = self; + + return ret; } void __init proc_self_init(void) diff --git a/fs/proc/stat.c b/fs/proc/stat.c @@ -23,21 +23,21 @@ #ifdef arch_idle_time -static u64 get_idle_time(int cpu) +static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle; - idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + idle = kcs->cpustat[CPUTIME_IDLE]; if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) idle += arch_idle_time(cpu); return idle; } -static u64 get_iowait_time(int cpu) +static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) { u64 iowait; - iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + iowait = kcs->cpustat[CPUTIME_IOWAIT]; if (cpu_online(cpu) && nr_iowait_cpu(cpu)) iowait += arch_idle_time(cpu); return iowait; @@ -45,7 +45,7 @@ static u64 get_iowait_time(int cpu) #else -static u64 get_idle_time(int cpu) +static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle, idle_usecs = -1ULL; @@ -54,14 +54,14 @@ static u64 get_idle_time(int cpu) if (idle_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ - idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + idle = kcs->cpustat[CPUTIME_IDLE]; else idle = idle_usecs * NSEC_PER_USEC; return idle; } -static u64 get_iowait_time(int cpu) +static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) { u64 iowait, iowait_usecs = -1ULL; @@ -70,7 +70,7 @@ static u64 get_iowait_time(int cpu) if (iowait_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ - iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + iowait = kcs->cpustat[CPUTIME_IOWAIT]; else iowait = iowait_usecs * NSEC_PER_USEC; @@ -120,16 +120,18 @@ static int show_stat(struct seq_file *p, void *v) getboottime64(&boottime); for_each_possible_cpu(i) { - user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; - nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE]; - system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; - idle += get_idle_time(i); - iowait += get_iowait_time(i); - irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; - softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; - steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; - guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; - guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; + struct kernel_cpustat *kcs = &kcpustat_cpu(i); + + user += kcs->cpustat[CPUTIME_USER]; + nice += kcs->cpustat[CPUTIME_NICE]; + system += kcs->cpustat[CPUTIME_SYSTEM]; + idle += get_idle_time(kcs, i); + iowait += get_iowait_time(kcs, i); + irq += kcs->cpustat[CPUTIME_IRQ]; + softirq += kcs->cpustat[CPUTIME_SOFTIRQ]; + steal += kcs->cpustat[CPUTIME_STEAL]; + guest += kcs->cpustat[CPUTIME_GUEST]; + guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE]; sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); @@ -155,17 +157,19 @@ static int show_stat(struct seq_file *p, void *v) seq_putc(p, '\n'); for_each_online_cpu(i) { + struct kernel_cpustat *kcs = &kcpustat_cpu(i); + /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ - user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; - nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE]; - system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; - idle = get_idle_time(i); - iowait = get_iowait_time(i); - irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; - softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; - steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; - guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; - guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; + user = kcs->cpustat[CPUTIME_USER]; + nice = kcs->cpustat[CPUTIME_NICE]; + system = kcs->cpustat[CPUTIME_SYSTEM]; + idle = get_idle_time(kcs, i); + iowait = get_iowait_time(kcs, i); + irq = kcs->cpustat[CPUTIME_IRQ]; + softirq = kcs->cpustat[CPUTIME_SOFTIRQ]; + steal = kcs->cpustat[CPUTIME_STEAL]; + guest = kcs->cpustat[CPUTIME_GUEST]; + guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c @@ -948,10 +948,12 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, pte_t ptent = *pte; if (pte_present(ptent)) { - ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte); - ptent = pte_wrprotect(ptent); + pte_t old_pte; + + old_pte = ptep_modify_prot_start(vma, addr, pte); + ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); - ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent); + ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c @@ -178,7 +178,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) seq_file_path(m, file, ""); } else if (mm && is_stack(vma)) { seq_pad(m, ' '); - seq_printf(m, "[stack]"); + seq_puts(m, "[stack]"); } seq_putc(m, '\n'); diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c @@ -38,6 +38,7 @@ int proc_setup_thread_self(struct super_block *s) struct inode *root_inode = d_inode(s->s_root); struct pid_namespace *ns = proc_pid_ns(root_inode); struct dentry *thread_self; + int ret = -ENOMEM; inode_lock(root_inode); thread_self = d_alloc_name(s->s_root, "thread-self"); @@ -51,20 +52,19 @@ int proc_setup_thread_self(struct super_block *s) inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_thread_self_inode_operations; d_add(thread_self, inode); + ret = 0; } else { dput(thread_self); - thread_self = ERR_PTR(-ENOMEM); } - } else { - thread_self = ERR_PTR(-ENOMEM); } inode_unlock(root_inode); - if (IS_ERR(thread_self)) { + + if (ret) pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); - return PTR_ERR(thread_self); - } - ns->proc_thread_self = thread_self; - return 0; + else + ns->proc_thread_self = thread_self; + + return ret; } void __init proc_thread_self_init(void) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h @@ -606,7 +606,7 @@ static inline int pmd_none_or_clear_bad(pmd_t *pmd) return 0; } -static inline pte_t __ptep_modify_prot_start(struct mm_struct *mm, +static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -615,10 +615,10 @@ static inline pte_t __ptep_modify_prot_start(struct mm_struct *mm, * non-present, preventing the hardware from asynchronously * updating it. */ - return ptep_get_and_clear(mm, addr, ptep); + return ptep_get_and_clear(vma->vm_mm, addr, ptep); } -static inline void __ptep_modify_prot_commit(struct mm_struct *mm, +static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -626,7 +626,7 @@ static inline void __ptep_modify_prot_commit(struct mm_struct *mm, * The pte is non-present, so there's no hardware state to * preserve. */ - set_pte_at(mm, addr, ptep, pte); + set_pte_at(vma->vm_mm, addr, ptep, pte); } #ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION @@ -644,22 +644,22 @@ static inline void __ptep_modify_prot_commit(struct mm_struct *mm, * queue the update to be done at some later time. The update must be * actually committed before the pte lock is released, however. */ -static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, +static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - return __ptep_modify_prot_start(mm, addr, ptep); + return __ptep_modify_prot_start(vma, addr, ptep); } /* * Commit an update to a pte, leaving any hardware-controlled bits in * the PTE unmodified. */ -static inline void ptep_modify_prot_commit(struct mm_struct *mm, +static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t pte) + pte_t *ptep, pte_t old_pte, pte_t pte) { - __ptep_modify_prot_commit(mm, addr, ptep, pte); + __ptep_modify_prot_commit(vma, addr, ptep, pte); } #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ #endif /* CONFIG_MMU */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h @@ -365,7 +365,7 @@ unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) rcu_read_lock(); /* - * Paired with store_release in inode_switch_wb_work_fn() and + * Paired with store_release in inode_switch_wbs_work_fn() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h @@ -4,15 +4,18 @@ * * Common interface definitions for making balloon pages movable by compaction. * - * Despite being perfectly possible to perform ballooned pages migration, they - * make a special corner case to compaction scans because balloon pages are not - * enlisted at any LRU list like the other pages we do compact / migrate. + * Balloon page migration makes use of the general non-lru movable page + * feature. + * + * page->private is used to reference the responsible balloon device. + * page->mapping is used in context of non-lru page migration to reference + * the address space operations for page isolation/migration/compaction. * * As the page isolation scanning step a compaction thread does is a lockless * procedure (from a page standpoint), it might bring some racy situations while * performing balloon page compaction. In order to sort out these racy scenarios * and safely perform balloon's page compaction and migration we must, always, - * ensure following these three simple rules: + * ensure following these simple rules: * * i. when updating a balloon's page ->mapping element, strictly do it under * the following lock order, independently of the far superior @@ -21,19 +24,8 @@ * +--spin_lock_irq(&b_dev_info->pages_lock); * ... page->mapping updates here ... * - * ii. before isolating or dequeueing a balloon page from the balloon device - * pages list, the page reference counter must be raised by one and the - * extra refcount must be dropped when the page is enqueued back into - * the balloon device page list, thus a balloon page keeps its reference - * counter raised only while it is under our special handling; - * - * iii. after the lockless scan step have selected a potential balloon page for - * isolation, re-test the PageBalloon mark and the PagePrivate flag - * under the proper page lock, to ensure isolating a valid balloon page - * (not yet isolated, nor under release procedure) - * - * iv. isolation or dequeueing procedure must clear PagePrivate flag under - * page lock together with removing page from balloon device page list. + * ii. isolation or dequeueing procedure must remove the page from balloon + * device page list under b_dev_info->pages_lock. * * The functions provided by this interface are placed to help on coping with * the aforementioned balloon page corner case, as well as to ensure the simple @@ -103,7 +95,7 @@ extern int balloon_page_migrate(struct address_space *mapping, static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) { - __SetPageBalloon(page); + __SetPageOffline(page); __SetPageMovable(page, balloon->inode->i_mapping); set_page_private(page, (unsigned long)balloon); list_add(&page->lru, &balloon->pages); @@ -119,7 +111,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, */ static inline void balloon_page_delete(struct page *page) { - __ClearPageBalloon(page); + __ClearPageOffline(page); __ClearPageMovable(page); set_page_private(page, 0); /* @@ -149,13 +141,13 @@ static inline gfp_t balloon_mapping_gfp_mask(void) static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) { - __SetPageBalloon(page); + __SetPageOffline(page); list_add(&page->lru, &balloon->pages); } static inline void balloon_page_delete(struct page *page) { - __ClearPageBalloon(page); + __ClearPageOffline(page); list_del(&page->lru); } diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h @@ -32,6 +32,7 @@ struct kernfs_node; struct kernfs_ops; struct kernfs_open_file; struct seq_file; +struct poll_table_struct; #define MAX_CGROUP_TYPE_NAMELEN 32 #define MAX_CGROUP_ROOT_NAMELEN 64 @@ -574,6 +575,9 @@ struct cftype { ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); + __poll_t (*poll)(struct kernfs_open_file *of, + struct poll_table_struct *pt); + #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lockdep_key; #endif diff --git a/include/linux/compaction.h b/include/linux/compaction.h @@ -88,14 +88,13 @@ extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; -extern int sysctl_extfrag_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos); extern int sysctl_compact_unevictable_allowed; extern int fragmentation_index(struct zone *zone, unsigned int order); extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, - const struct alloc_context *ac, enum compact_priority prio); + const struct alloc_context *ac, enum compact_priority prio, + struct page **page); extern void reset_isolation_suitable(pg_data_t *pgdat); extern enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int classzone_idx); @@ -227,8 +226,8 @@ static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_i #endif /* CONFIG_COMPACTION */ -#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) struct node; +#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) extern int compaction_register_node(struct node *node); extern void compaction_unregister_node(struct node *node); diff --git a/include/linux/device.h b/include/linux/device.h @@ -1095,7 +1095,7 @@ static inline void set_dev_node(struct device *dev, int node) #else static inline int dev_to_node(struct device *dev) { - return -1; + return NUMA_NO_NODE; } static inline void set_dev_node(struct device *dev, int node) { diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h @@ -7,6 +7,13 @@ #include <linux/bitops.h> #include <linux/jump_label.h> +/* + * Return code to denote that requested number of + * frontswap pages are unused(moved to page cache). + * Used in in shmem_unuse and try_to_unuse. + */ +#define FRONTSWAP_PAGES_UNUSED 2 + struct frontswap_ops { void (*init)(unsigned); /* this swap type was just swapon'ed */ int (*store)(unsigned, pgoff_t, struct page *); /* store a page */ diff --git a/include/linux/fs.h b/include/linux/fs.h @@ -2091,7 +2091,7 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to * synchronize competing switching instances and to tell * wb stat updates to grab the i_pages lock. See - * inode_switch_wb_work_fn() for details. + * inode_switch_wbs_work_fn() for details. * * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper * and work dirs among overlayfs mounts. diff --git a/include/linux/gfp.h b/include/linux/gfp.h @@ -24,21 +24,21 @@ struct vm_area_struct; #define ___GFP_HIGH 0x20u #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u -#define ___GFP_WRITE 0x100u -#define ___GFP_NOWARN 0x200u -#define ___GFP_RETRY_MAYFAIL 0x400u -#define ___GFP_NOFAIL 0x800u -#define ___GFP_NORETRY 0x1000u -#define ___GFP_MEMALLOC 0x2000u -#define ___GFP_COMP 0x4000u -#define ___GFP_ZERO 0x8000u -#define ___GFP_NOMEMALLOC 0x10000u -#define ___GFP_HARDWALL 0x20000u -#define ___GFP_THISNODE 0x40000u -#define ___GFP_ATOMIC 0x80000u -#define ___GFP_ACCOUNT 0x100000u -#define ___GFP_DIRECT_RECLAIM 0x200000u -#define ___GFP_KSWAPD_RECLAIM 0x400000u +#define ___GFP_ZERO 0x100u +#define ___GFP_ATOMIC 0x200u +#define ___GFP_DIRECT_RECLAIM 0x400u +#define ___GFP_KSWAPD_RECLAIM 0x800u +#define ___GFP_WRITE 0x1000u +#define ___GFP_NOWARN 0x2000u +#define ___GFP_RETRY_MAYFAIL 0x4000u +#define ___GFP_NOFAIL 0x8000u +#define ___GFP_NORETRY 0x10000u +#define ___GFP_MEMALLOC 0x20000u +#define ___GFP_COMP 0x40000u +#define ___GFP_NOMEMALLOC 0x80000u +#define ___GFP_HARDWALL 0x100000u +#define ___GFP_THISNODE 0x200000u +#define ___GFP_ACCOUNT 0x400000u #ifdef CONFIG_LOCKDEP #define ___GFP_NOLOCKDEP 0x800000u #else diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h @@ -371,6 +371,8 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask); struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); +struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nmask); int huge_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); @@ -493,17 +495,54 @@ static inline pgoff_t basepage_index(struct page *page) extern int dissolve_free_huge_page(struct page *page); extern int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); -static inline bool hugepage_migration_supported(struct hstate *h) -{ + #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION +#ifndef arch_hugetlb_migration_supported +static inline bool arch_hugetlb_migration_supported(struct hstate *h) +{ if ((huge_page_shift(h) == PMD_SHIFT) || - (huge_page_shift(h) == PGDIR_SHIFT)) + (huge_page_shift(h) == PUD_SHIFT) || + (huge_page_shift(h) == PGDIR_SHIFT)) return true; else return false; +} +#endif #else +static inline bool arch_hugetlb_migration_supported(struct hstate *h) +{ return false; +} #endif + +static inline bool hugepage_migration_supported(struct hstate *h) +{ + return arch_hugetlb_migration_supported(h); +} + +/* + * Movability check is different as compared to migration check. + * It determines whether or not a huge page should be placed on + * movable zone or not. Movability of any huge page should be + * required only if huge page size is supported for migration. + * There wont be any reason for the huge page to be movable if + * it is not migratable to start with. Also the size of the huge + * page should be large enough to be placed under a movable zone + * and still feasible enough to be migratable. Just the presence + * in movable zone does not make the migration feasible. + * + * So even though large huge page sizes like the gigantic ones + * are migratable they should not be movable because its not + * feasible to migrate them from movable zone. + */ +static inline bool hugepage_movable_supported(struct hstate *h) +{ + if (!hugepage_migration_supported(h)) + return false; + + if (hstate_is_gigantic(h)) + return false; + return true; } static inline spinlock_t *huge_pte_lockptr(struct hstate *h, @@ -543,6 +582,26 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr set_huge_pte_at(mm, addr, ptep, pte); } #endif + +#ifndef huge_ptep_modify_prot_start +#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start +static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); +} +#endif + +#ifndef huge_ptep_modify_prot_commit +#define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit +static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t pte) +{ + set_huge_pte_at(vma->vm_mm, addr, ptep, pte); +} +#endif + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page(v, a, r) NULL @@ -602,6 +661,11 @@ static inline bool hugepage_migration_supported(struct hstate *h) return false; } +static inline bool hugepage_movable_supported(struct hstate *h) +{ + return false; +} + static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h @@ -2,7 +2,7 @@ #ifndef _LINUX_KASAN_CHECKS_H #define _LINUX_KASAN_CHECKS_H -#ifdef CONFIG_KASAN +#if defined(__SANITIZE_ADDRESS__) || defined(__KASAN_INTERNAL) void kasan_check_read(const volatile void *p, unsigned int size); void kasan_check_write(const volatile void *p, unsigned int size); #else diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h @@ -25,6 +25,7 @@ struct seq_file; struct vm_area_struct; struct super_block; struct file_system_type; +struct poll_table_struct; struct kernfs_open_node; struct kernfs_iattrs; @@ -261,6 +262,9 @@ struct kernfs_ops { ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off); + __poll_t (*poll)(struct kernfs_open_file *of, + struct poll_table_struct *pt); + int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma); #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -350,6 +354,8 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns); int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); +__poll_t kernfs_generic_poll(struct kernfs_open_file *of, + struct poll_table_struct *pt); void kernfs_notify(struct kernfs_node *kn); const void *kernfs_super_ns(struct super_block *sb); diff --git a/include/linux/ksm.h b/include/linux/ksm.h @@ -53,6 +53,8 @@ struct page *ksm_might_need_to_copy(struct page *page, void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); +bool reuse_ksm_page(struct page *page, + struct vm_area_struct *vma, unsigned long address); #else /* !CONFIG_KSM */ @@ -86,6 +88,11 @@ static inline void rmap_walk_ksm(struct page *page, static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) { } +static inline bool reuse_ksm_page(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + return false; +} #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ diff --git a/include/linux/list.h b/include/linux/list.h @@ -207,6 +207,17 @@ static inline void list_bulk_move_tail(struct list_head *head, } /** + * list_is_first -- tests whether @ list is the first entry in list @head + * @list: the entry to test + * @head: the head of the list + */ +static inline int list_is_first(const struct list_head *list, + const struct list_head *head) +{ + return list->prev == head; +} + +/** * list_is_last - tests whether @list is the last entry in list @head * @list: the entry to test * @head: the head of the list diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h @@ -429,6 +429,11 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); +static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) +{ + return mem_cgroup_from_css(seq_css(m)); +} + static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { struct mem_cgroup_per_node *mz; @@ -937,6 +942,11 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return NULL; } +static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) +{ + return NULL; +} + static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { return NULL; @@ -1273,12 +1283,12 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); void memcg_kmem_put_cache(struct kmem_cache *cachep); -int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, - struct mem_cgroup *memcg); #ifdef CONFIG_MEMCG_KMEM -int memcg_kmem_charge(struct page *page, gfp_t gfp, int order); -void memcg_kmem_uncharge(struct page *page, int order); +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); +void __memcg_kmem_uncharge(struct page *page, int order); +int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, + struct mem_cgroup *memcg); extern struct static_key_false memcg_kmem_enabled_key; extern struct workqueue_struct *memcg_kmem_cache_wq; @@ -1300,6 +1310,26 @@ static inline bool memcg_kmem_enabled(void) return static_branch_unlikely(&memcg_kmem_enabled_key); } +static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +{ + if (memcg_kmem_enabled()) + return __memcg_kmem_charge(page, gfp, order); + return 0; +} + +static inline void memcg_kmem_uncharge(struct page *page, int order) +{ + if (memcg_kmem_enabled()) + __memcg_kmem_uncharge(page, order); +} + +static inline int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, + int order, struct mem_cgroup *memcg) +{ + if (memcg_kmem_enabled()) + return __memcg_kmem_charge_memcg(page, gfp, order, memcg); + return 0; +} /* * helper for accessing a memcg's index. It will be used as an index in the * child cache array in kmem_cache, and also to derive its name. This function @@ -1325,6 +1355,15 @@ static inline void memcg_kmem_uncharge(struct page *page, int order) { } +static inline int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +{ + return 0; +} + +static inline void __memcg_kmem_uncharge(struct page *page, int order) +{ +} + #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h @@ -89,7 +89,7 @@ extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, unsigned long *valid_start, unsigned long *valid_end); extern void __offline_isolated_pages(unsigned long, unsigned long); -typedef void (*online_page_callback_t)(struct page *page); +typedef void (*online_page_callback_t)(struct page *page, unsigned int order); extern int set_online_page_callback(online_page_callback_t callback); extern int restore_online_page_callback(online_page_callback_t callback); diff --git a/include/linux/mm.h b/include/linux/mm.h @@ -1536,7 +1536,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); -#ifdef CONFIG_FS_DAX + +#if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA) long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h @@ -80,7 +80,7 @@ struct page { struct { /* Page cache and anonymous pages */ /** * @lru: Pageout list, eg. active_list protected by - * zone_lru_lock. Sometimes used as a generic list + * pgdat->lru_lock. Sometimes used as a generic list * by the page owner. */ struct list_head lru; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h @@ -480,6 +480,8 @@ struct zone { unsigned long compact_cached_free_pfn; /* pfn where async and sync compaction migration scanner should start */ unsigned long compact_cached_migrate_pfn[2]; + unsigned long compact_init_migrate_pfn; + unsigned long compact_init_free_pfn; #endif #ifdef CONFIG_COMPACTION @@ -728,10 +730,6 @@ typedef struct pglist_data { #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) -static inline spinlock_t *zone_lru_lock(struct zone *zone) -{ - return &zone->zone_pgdat->lru_lock; -} static inline struct lruvec *node_lruvec(struct pglist_data *pgdat) { @@ -1299,7 +1297,7 @@ void memory_present(int nid, unsigned long start, unsigned long end); /* * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we - * need to check pfn validility within that MAX_ORDER_NR_PAGES block. + * need to check pfn validity within that MAX_ORDER_NR_PAGES block. * pfn_valid_within() should be used in this case; we optimise this away * when we have no holes within a MAX_ORDER_NR_PAGES block. */ diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h @@ -444,8 +444,8 @@ static inline int next_memory_node(int nid) return next_node(nid, node_states[N_MEMORY]); } -extern int nr_node_ids; -extern int nr_online_nodes; +extern unsigned int nr_node_ids; +extern unsigned int nr_online_nodes; static inline void node_set_online(int nid) { @@ -485,8 +485,8 @@ static inline int num_node_state(enum node_states state) #define first_online_node 0 #define first_memory_node 0 #define next_online_node(nid) (MAX_NUMNODES) -#define nr_node_ids 1 -#define nr_online_nodes 1 +#define nr_node_ids 1U +#define nr_online_nodes 1U #define node_set_online(node) node_set_state((node), N_ONLINE) #define node_set_offline(node) node_clear_state((node), N_ONLINE) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h @@ -17,8 +17,37 @@ /* * Various page->flags bits: * - * PG_reserved is set for special pages, which can never be swapped out. Some - * of them might not even exist... + * PG_reserved is set for special pages. The "struct page" of such a page + * should in general not be touched (e.g. set dirty) except by its owner. + * Pages marked as PG_reserved include: + * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS, + * initrd, HW tables) + * - Pages reserved or allocated early during boot (before the page allocator + * was initialized). This includes (depending on the architecture) the + * initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much + * much more. Once (if ever) freed, PG_reserved is cleared and they will + * be given to the page allocator. + * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying + * to read/write these pages might end badly. Don't touch! + * - The zero page(s) + * - Pages not added to the page allocator when onlining a section because + * they were excluded via the online_page_callback() or because they are + * PG_hwpoison. + * - Pages allocated in the context of kexec/kdump (loaded kernel image, + * control pages, vmcoreinfo) + * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are + * not marked PG_reserved (as they might be in use by somebody else who does + * not respect the caching strategy). + * - Pages part of an offline section (struct pages of offline sections should + * not be trusted as they will be initialized when first onlined). + * - MCA pages on ia64 + * - Pages holding CPU notes for POWER Firmware Assisted Dump + * - Device memory (e.g. PMEM, DAX, HMM) + * Some PG_reserved pages will be excluded from the hibernation image. + * PG_reserved does in general not hinder anybody from dumping or swapping + * and is no longer required for remap_pfn_range(). ioremap might require it. + * Consequently, PG_reserved for a page mapped into user space can indicate + * the zero page, the vDSO, MMIO pages or device memory. * * The PG_private bitflag is set on pagecache pages if they contain filesystem * specific data (which is normally at page->private). It can be used by @@ -671,7 +700,7 @@ PAGEFLAG_FALSE(DoubleMap) /* Reserve 0x0000007f to catch underflows of page_mapcount */ #define PAGE_MAPCOUNT_RESERVE -128 #define PG_buddy 0x00000080 -#define PG_balloon 0x00000100 +#define PG_offline 0x00000100 #define PG_kmemcg 0x00000200 #define PG_table 0x00000400 @@ -706,10 +735,13 @@ static __always_inline void __ClearPage##uname(struct page *page) \ PAGE_TYPE_OPS(Buddy, buddy) /* - * PageBalloon() is true for pages that are on the balloon page list - * (see mm/balloon_compaction.c). + * PageOffline() indicates that the page is logically offline although the + * containing section is online. (e.g. inflated in a balloon driver or + * not onlined when onlining the section). + * The content of these pages is effectively stale. Such pages should not + * be touched (read/write/dump/save) except by their owner. */ -PAGE_TYPE_OPS(Balloon, balloon) +PAGE_TYPE_OPS(Offline, offline) /* * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h @@ -164,7 +164,7 @@ void release_pages(struct page **pages, int nr); * will find the page or it will not. Likewise, the old find_get_page could run * either before the insertion or afterwards, depending on timing. */ -static inline int page_cache_get_speculative(struct page *page) +static inline int __page_cache_add_speculative(struct page *page, int count) { #ifdef CONFIG_TINY_RCU # ifdef CONFIG_PREEMPT_COUNT @@ -180,10 +180,10 @@ static inline int page_cache_get_speculative(struct page *page) * SMP requires. */ VM_BUG_ON_PAGE(page_count(page) == 0, page); - page_ref_inc(page); + page_ref_add(page, count); #else - if (unlikely(!get_page_unless_zero(page))) { + if (unlikely(!page_ref_add_unless(page, count, 0))) { /* * Either the page has been freed, or will be freed. * In either case, retry here and the caller should @@ -197,27 +197,14 @@ static inline int page_cache_get_speculative(struct page *page) return 1; } -/* - * Same as above, but add instead of inc (could just be merged) - */ -static inline int page_cache_add_speculative(struct page *page, int count) +static inline int page_cache_get_speculative(struct page *page) { - VM_BUG_ON(in_interrupt()); - -#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) -# ifdef CONFIG_PREEMPT_COUNT - VM_BUG_ON(!in_atomic() && !irqs_disabled()); -# endif - VM_BUG_ON_PAGE(page_count(page) == 0, page); - page_ref_add(page, count); - -#else - if (unlikely(!page_ref_add_unless(page, count, 0))) - return 0; -#endif - VM_BUG_ON_PAGE(PageCompound(page) && page != compound_head(page), page); + return __page_cache_add_speculative(page, 1); +} - return 1; +static inline int page_cache_add_speculative(struct page *page, int count) +{ + return __page_cache_add_speculative(page, count); } #ifdef CONFIG_NUMA diff --git a/include/linux/poison.h b/include/linux/poison.h @@ -30,7 +30,7 @@ */ #define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA) -/********** mm/debug-pagealloc.c **********/ +/********** mm/page_poison.c **********/ #ifdef CONFIG_PAGE_POISONING_ZERO #define PAGE_POISON 0x00 #else diff --git a/include/linux/sched.h b/include/linux/sched.h @@ -48,6 +48,7 @@ struct pid_namespace; struct pipe_inode_info; struct rcu_node; struct reclaim_state; +struct capture_control; struct robust_list_head; struct sched_attr; struct sched_param; @@ -950,6 +951,9 @@ struct task_struct { struct io_context *io_context; +#ifdef CONFIG_COMPACTION + struct capture_control *capture_control; +#endif /* Ptrace state: */ unsigned long ptrace_message; kernel_siginfo_t *last_siginfo; @@ -1395,6 +1399,7 @@ extern struct pid *cad_pid; #define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ +#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h @@ -148,17 +148,25 @@ static inline bool in_vfork(struct task_struct *tsk) * Applies per-task gfp context to the given allocation flags. * PF_MEMALLOC_NOIO implies GFP_NOIO * PF_MEMALLOC_NOFS implies GFP_NOFS + * PF_MEMALLOC_NOCMA implies no allocation from CMA region. */ static inline gfp_t current_gfp_context(gfp_t flags) { - /* - * NOIO implies both NOIO and NOFS and it is a weaker context - * so always make sure it makes precedence - */ - if (unlikely(current->flags & PF_MEMALLOC_NOIO)) - flags &= ~(__GFP_IO | __GFP_FS); - else if (unlikely(current->flags & PF_MEMALLOC_NOFS)) - flags &= ~__GFP_FS; + if (unlikely(current->flags & + (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_NOCMA))) { + /* + * NOIO implies both NOIO and NOFS and it is a weaker context + * so always make sure it makes precedence + */ + if (current->flags & PF_MEMALLOC_NOIO) + flags &= ~(__GFP_IO | __GFP_FS); + else if (current->flags & PF_MEMALLOC_NOFS) + flags &= ~__GFP_FS; +#ifdef CONFIG_CMA + if (current->flags & PF_MEMALLOC_NOCMA) + flags &= ~__GFP_MOVABLE; +#endif + } return flags; } @@ -248,6 +256,30 @@ static inline void memalloc_noreclaim_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC) | flags; } +#ifdef CONFIG_CMA +static inline unsigned int memalloc_nocma_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC_NOCMA; + + current->flags |= PF_MEMALLOC_NOCMA; + return flags; +} + +static inline void memalloc_nocma_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags; +} +#else +static inline unsigned int memalloc_nocma_save(void) +{ + return 0; +} + +static inline void memalloc_nocma_restore(unsigned int flags) +{ +} +#endif + #ifdef CONFIG_MEMCG /** * memalloc_use_memcg - Starts the remote memcg charging scope. diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h @@ -72,7 +72,8 @@ extern void shmem_unlock_mapping(struct address_space *mapping); extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); -extern int shmem_unuse(swp_entry_t entry, struct page *page); +extern int shmem_unuse(unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse); extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h @@ -81,12 +81,12 @@ struct kmem_cache_order_objects { */ struct kmem_cache { struct kmem_cache_cpu __percpu *cpu_slab; - /* Used for retriving partial slabs etc */ + /* Used for retrieving partial slabs, etc. */ slab_flags_t flags; unsigned long min_partial; - unsigned int size; /* The size of an object including meta data */ - unsigned int object_size;/* The size of an object without meta data */ - unsigned int offset; /* Free pointer offset. */ + unsigned int size; /* The size of an object including metadata */ + unsigned int object_size;/* The size of an object without metadata */ + unsigned int offset; /* Free pointer offset */ #ifdef CONFIG_SLUB_CPU_PARTIAL /* Number of per cpu partial objects to keep around */ unsigned int cpu_partial; @@ -110,7 +110,7 @@ struct kmem_cache { #endif #ifdef CONFIG_MEMCG struct memcg_cache_params memcg_params; - /* for propagation, maximum size of a stored attr */ + /* For propagation, maximum size of a stored attr */ unsigned int max_attr_size; #ifdef CONFIG_SYSFS struct kset *memcg_kset; @@ -151,7 +151,7 @@ struct kmem_cache { #else #define slub_cpu_partial(s) (0) #define slub_set_cpu_partial(s, n) -#endif // CONFIG_SLUB_CPU_PARTIAL +#endif /* CONFIG_SLUB_CPU_PARTIAL */ #ifdef CONFIG_SYSFS #define SLAB_SUPPORTS_SYSFS diff --git a/include/linux/swap.h b/include/linux/swap.h @@ -307,7 +307,7 @@ struct vma_swap_readahead { }; /* linux/mm/workingset.c */ -void *workingset_eviction(struct address_space *mapping, struct page *page); +void *workingset_eviction(struct page *page); void workingset_refault(struct page *page, void *shadow); void workingset_activation(struct page *page); @@ -625,7 +625,7 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) return vm_swappiness; /* root ? */ - if (mem_cgroup_disabled() || !memcg->css.parent) + if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg)) return vm_swappiness; return memcg->swappiness; diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h @@ -41,6 +41,7 @@ #define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ #define F_SEAL_GROW 0x0004 /* prevent file from growing */ #define F_SEAL_WRITE 0x0008 /* prevent writes */ +#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ /* (1U << 31) is reserved for signed error codes */ /* diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h @@ -32,7 +32,7 @@ #define KPF_KSM 21 #define KPF_THP 22 -#define KPF_BALLOON 23 +#define KPF_OFFLINE 23 #define KPF_ZERO_PAGE 24 #define KPF_IDLE 25 #define KPF_PGTABLE 26 diff --git a/init/init_task.c b/init/init_task.c @@ -10,6 +10,7 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/audit.h> +#include <linux/numa.h> #include <asm/pgtable.h> #include <linux/uaccess.h> @@ -154,7 +155,7 @@ struct task_struct init_task .vtime.state = VTIME_SYS, #endif #ifdef CONFIG_NUMA_BALANCING - .numa_preferred_nid = -1, + .numa_preferred_nid = NUMA_NO_NODE, .numa_group = NULL, .numa_faults = NULL, #endif diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c @@ -3534,6 +3534,16 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, return ret ?: nbytes; } +static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) +{ + struct cftype *cft = of->kn->priv; + + if (cft->poll) + return cft->poll(of, pt); + + return kernfs_generic_poll(of, pt); +} + static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { return seq_cft(seq)->seq_start(seq, ppos); @@ -3572,6 +3582,7 @@ static struct kernfs_ops cgroup_kf_single_ops = { .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_show = cgroup_seqfile_show, }; @@ -3580,6 +3591,7 @@ static struct kernfs_ops cgroup_kf_ops = { .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, .seq_stop = cgroup_seqfile_stop, diff --git a/kernel/crash_core.c b/kernel/crash_core.c @@ -464,6 +464,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); +#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) + VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #endif arch_crash_save_vmcoreinfo(); diff --git a/kernel/kthread.c b/kernel/kthread.c @@ -20,6 +20,7 @@ #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> +#include <linux/numa.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); @@ -681,7 +682,7 @@ __kthread_create_worker(int cpu, unsigned int flags, { struct kthread_worker *worker; struct task_struct *task; - int node = -1; + int node = NUMA_NO_NODE; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c @@ -1215,14 +1215,16 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) if (!pfn_valid(pfn)) return NULL; - page = pfn_to_page(pfn); - if (page_zone(page) != zone) + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) return NULL; BUG_ON(!PageHighMem(page)); - if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) || - PageReserved(page)) + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) + return NULL; + + if (PageReserved(page) || PageOffline(page)) return NULL; if (page_is_guard(page)) @@ -1277,8 +1279,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) if (!pfn_valid(pfn)) return NULL; - page = pfn_to_page(pfn); - if (page_zone(page) != zone) + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) return NULL; BUG_ON(PageHighMem(page)); @@ -1286,6 +1288,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) return NULL; + if (PageOffline(page)) + return NULL; + if (PageReserved(page) && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; diff --git a/kernel/sched/core.c b/kernel/sched/core.c @@ -2220,6 +2220,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) INIT_HLIST_HEAD(&p->preempt_notifiers); #endif +#ifdef CONFIG_COMPACTION + p->capture_control = NULL; +#endif init_numa_balancing(clone_flags, p); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c @@ -1173,7 +1173,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) /* New address space, reset the preferred nid */ if (!(clone_flags & CLONE_VM)) { - p->numa_preferred_nid = -1; + p->numa_preferred_nid = NUMA_NO_NODE; return; } @@ -1193,13 +1193,13 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { - rq->nr_numa_running += (p->numa_preferred_nid != -1); + rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); } static void account_numa_dequeue(struct rq *rq, struct task_struct *p) { - rq->nr_numa_running -= (p->numa_preferred_nid != -1); + rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE); rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); } @@ -1413,7 +1413,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, * two full passes of the "multi-stage node selection" test that is * executed below. */ - if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && + if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) && (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) return true; @@ -1861,7 +1861,7 @@ static void numa_migrate_preferred(struct task_struct *p) unsigned long interval = HZ; /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) + if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults)) return; /* Periodically retry migrating the task to the preferred node */ @@ -2108,7 +2108,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) static void task_numa_placement(struct task_struct *p) { - int seq, nid, max_nid = -1; + int seq, nid, max_nid = NUMA_NO_NODE; unsigned long max_faults = 0; unsigned long fault_types[2] = { 0, 0 }; unsigned long total_faults; @@ -2651,7 +2651,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu) * the preferred node. */ if (dst_nid == p->numa_preferred_nid || - (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) + (p->numa_preferred_nid != NUMA_NO_NODE && + src_nid != p->numa_preferred_nid)) return; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c @@ -1471,7 +1471,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_extfrag_threshold, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = sysctl_extfrag_handler, + .proc_handler = proc_dointvec_minmax, .extra1 = &min_extfrag_threshold, .extra2 = &max_extfrag_threshold, }, diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug @@ -222,7 +222,6 @@ config ENABLE_MUST_CHECK config FRAME_WARN int "Warn for stack frames larger than (needs gcc 4.4)" range 0 8192 - default 3072 if KASAN_EXTRA default 2048 if GCC_PLUGIN_LATENT_ENTROPY default 1280 if (!64BIT && PARISC) default 1024 if (!64BIT && !PARISC) @@ -266,23 +265,6 @@ config UNUSED_SYMBOLS you really need it, and what the merge plan to the mainline kernel for your module is. -config PAGE_OWNER - bool "Track page owner" - depends on DEBUG_KERNEL && STACKTRACE_SUPPORT - select DEBUG_FS - select STACKTRACE - select STACKDEPOT - select PAGE_EXTENSION - help - This keeps track of what call chain is the owner of a page, may - help to find bare alloc_page(s) leaks. Even if you include this - feature on your build, it is disabled in default. You should pass - "page_owner=on" to boot parameter in order to enable it. Eats - a fair amount of memory if enabled. See tools/vm/page_owner_sort.c - for user-space helper. - - If unsure, say N. - config DEBUG_FS bool "Debug Filesystem" help @@ -1876,6 +1858,19 @@ config TEST_LKM If unsure, say N. +config TEST_VMALLOC + tristate "Test module for stress/performance analysis of vmalloc allocator" + default n + depends on MMU + depends on m + help + This builds the "test_vmalloc" module that should be used for + stress and performance analysis. So, any new change for vmalloc + subsystem can be evaluated from performance and stability point + of view. + + If unsure, say N. + config TEST_USER_COPY tristate "Test user/kernel boundary protections" depends on m diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan @@ -78,16 +78,6 @@ config KASAN_SW_TAGS endchoice -config KASAN_EXTRA - bool "KASAN: extra checks" - depends on KASAN_GENERIC && DEBUG_KERNEL && !COMPILE_TEST - help - This enables further checks in generic KASAN, for now it only - includes the address-use-after-scope check that can lead to - excessive kernel stack usage, frame size warnings and longer - compile time. - See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715 - choice prompt "Instrumentation type" depends on KASAN diff --git a/lib/Makefile b/lib/Makefile @@ -60,6 +60,7 @@ UBSAN_SANITIZE_test_ubsan.o := y obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o obj-$(CONFIG_TEST_LKM) += test_module.o +obj-$(CONFIG_TEST_VMALLOC) += test_vmalloc.o obj-$(CONFIG_TEST_OVERFLOW) += test_overflow.o obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o obj-$(CONFIG_TEST_SORT) += test_sort.o diff --git a/lib/cpumask.c b/lib/cpumask.c @@ -5,6 +5,7 @@ #include <linux/cpumask.h> #include <linux/export.h> #include <linux/memblock.h> +#include <linux/numa.h> /** * cpumask_next - get the next cpu in a cpumask @@ -206,7 +207,7 @@ unsigned int cpumask_local_spread(unsigned int i, int node) /* Wrap: we always want a cpu. */ i %= num_online_cpus(); - if (node == -1) { + if (node == NUMA_NO_NODE) { for_each_cpu(cpu, cpu_online_mask) if (i-- == 0) return cpu; diff --git a/lib/test_kasan.c b/lib/test_kasan.c @@ -480,29 +480,6 @@ static noinline void __init copy_user_test(void) kfree(kmem); } -static noinline void __init use_after_scope_test(void) -{ - volatile char *volatile p; - - pr_info("use-after-scope on int\n"); - { - int local = 0; - - p = (char *)&local; - } - p[0] = 1; - p[3] = 1; - - pr_info("use-after-scope on array\n"); - { - char local[1024] = {0}; - - p = local; - } - p[0] = 1; - p[1023] = 1; -} - static noinline void __init kasan_alloca_oob_left(void) { volatile int i = 10; @@ -682,7 +659,6 @@ static int __init kmalloc_tests_init(void) kasan_alloca_oob_right(); ksize_unpoisons_memory(); copy_user_test(); - use_after_scope_test(); kmem_cache_double_free(); kmem_cache_invalid_free(); kasan_memchr(); diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c @@ -0,0 +1,551 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Test module for stress and analyze performance of vmalloc allocator. + * (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com> + */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/random.h> +#include <linux/kthread.h> +#include <linux/moduleparam.h> +#include <linux/completion.h> +#include <linux/delay.h> +#include <linux/rwsem.h> +#include <linux/mm.h> + +#define __param(type, name, init, msg) \ + static type name = init; \ + module_param(name, type, 0444); \ + MODULE_PARM_DESC(name, msg) \ + +__param(bool, single_cpu_test, false, + "Use single first online CPU to run tests"); + +__param(bool, sequential_test_order, false, + "Use sequential stress tests order"); + +__param(int, test_repeat_count, 1, + "Set test repeat counter"); + +__param(int, test_loop_count, 1000000, + "Set test loop counter"); + +__param(int, run_test_mask, INT_MAX, + "Set tests specified in the mask.\n\n" + "\t\tid: 1, name: fix_size_alloc_test\n" + "\t\tid: 2, name: full_fit_alloc_test\n" + "\t\tid: 4, name: long_busy_list_alloc_test\n" + "\t\tid: 8, name: random_size_alloc_test\n" + "\t\tid: 16, name: fix_align_alloc_test\n" + "\t\tid: 32, name: random_size_align_alloc_test\n" + "\t\tid: 64, name: align_shift_alloc_test\n" + "\t\tid: 128, name: pcpu_alloc_test\n" + /* Add a new test case description here. */ +); + +/* + * Depends on single_cpu_test parameter. If it is true, then + * use first online CPU to trigger a test on, otherwise go with + * all online CPUs. + */ +static cpumask_t cpus_run_test_mask = CPU_MASK_NONE; + +/* + * Read write semaphore for synchronization of setup + * phase that is done in main thread and workers. + */ +static DECLARE_RWSEM(prepare_for_test_rwsem); + +/* + * Completion tracking for worker threads. + */ +static DECLARE_COMPLETION(test_all_done_comp); +static atomic_t test_n_undone = ATOMIC_INIT(0); + +static inline void +test_report_one_done(void) +{ + if (atomic_dec_and_test(&test_n_undone)) + complete(&test_all_done_comp); +} + +static int random_size_align_alloc_test(void) +{ + unsigned long size, align, rnd; + void *ptr; + int i; + + for (i = 0; i < test_loop_count; i++) { + get_random_bytes(&rnd, sizeof(rnd)); + + /* + * Maximum 1024 pages, if PAGE_SIZE is 4096. + */ + align = 1 << (rnd % 23); + + /* + * Maximum 10 pages. + */ + size = ((rnd % 10) + 1) * PAGE_SIZE; + + ptr = __vmalloc_node_range(size, align, + VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO, + PAGE_KERNEL, + 0, 0, __builtin_return_address(0)); + + if (!ptr) + return -1; + + vfree(ptr); + } + + return 0; +} + +/* + * This test case is supposed to be failed. + */ +static int align_shift_alloc_test(void) +{ + unsigned long align; + void *ptr; + int i; + + for (i = 0; i < BITS_PER_LONG; i++) { + align = ((unsigned long) 1) << i; + + ptr = __vmalloc_node_range(PAGE_SIZE, align, + VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO, + PAGE_KERNEL, + 0, 0, __builtin_return_address(0)); + + if (!ptr) + return -1; + + vfree(ptr); + } + + return 0; +} + +static int fix_align_alloc_test(void) +{ + void *ptr; + int i; + + for (i = 0; i < test_loop_count; i++) { + ptr = __vmalloc_node_range(5 * PAGE_SIZE, + THREAD_ALIGN << 1, + VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO, + PAGE_KERNEL, + 0, 0, __builtin_return_address(0)); + + if (!ptr) + return -1; + + vfree(ptr); + } + + return 0; +} + +static int random_size_alloc_test(void) +{ + unsigned int n; + void *p; + int i; + + for (i = 0; i < test_loop_count; i++) { + get_random_bytes(&n, sizeof(i)); + n = (n % 100) + 1; + + p = vmalloc(n * PAGE_SIZE); + + if (!p) + return -1; + + *((__u8 *)p) = 1; + vfree(p); + } + + return 0; +} + +static int long_busy_list_alloc_test(void) +{ + void *ptr_1, *ptr_2; + void **ptr; + int rv = -1; + int i; + + ptr = vmalloc(sizeof(void *) * 15000); + if (!ptr) + return rv; + + for (i = 0; i < 15000; i++) + ptr[i] = vmalloc(1 * PAGE_SIZE); + + for (i = 0; i < test_loop_count; i++) { + ptr_1 = vmalloc(100 * PAGE_SIZE); + if (!ptr_1) + goto leave; + + ptr_2 = vmalloc(1 * PAGE_SIZE); + if (!ptr_2) { + vfree(ptr_1); + goto leave; + } + + *((__u8 *)ptr_1) = 0; + *((__u8 *)ptr_2) = 1; + + vfree(ptr_1); + vfree(ptr_2); + } + + /* Success */ + rv = 0; + +leave: + for (i = 0; i < 15000; i++) + vfree(ptr[i]); + + vfree(ptr); + return rv; +} + +static int full_fit_alloc_test(void) +{ + void **ptr, **junk_ptr, *tmp; + int junk_length; + int rv = -1; + int i; + + junk_length = fls(num_online_cpus()); + junk_length *= (32 * 1024 * 1024 / PAGE_SIZE); + + ptr = vmalloc(sizeof(void *) * junk_length); + if (!ptr) + return rv; + + junk_ptr = vmalloc(sizeof(void *) * junk_length); + if (!junk_ptr) { + vfree(ptr); + return rv; + } + + for (i = 0; i < junk_length; i++) { + ptr[i] = vmalloc(1 * PAGE_SIZE); + junk_ptr[i] = vmalloc(1 * PAGE_SIZE); + } + + for (i = 0; i < junk_length; i++) + vfree(junk_ptr[i]); + + for (i = 0; i < test_loop_count; i++) { + tmp = vmalloc(1 * PAGE_SIZE); + + if (!tmp) + goto error; + + *((__u8 *)tmp) = 1; + vfree(tmp); + } + + /* Success */ + rv = 0; + +error: + for (i = 0; i < junk_length; i++) + vfree(ptr[i]); + + vfree(ptr); + vfree(junk_ptr); + + return rv; +} + +static int fix_size_alloc_test(void) +{ + void *ptr; + int i; + + for (i = 0; i < test_loop_count; i++) { + ptr = vmalloc(3 * PAGE_SIZE); + + if (!ptr) + return -1; + + *((__u8 *)ptr) = 0; + + vfree(ptr); + } + + return 0; +} + +static int +pcpu_alloc_test(void) +{ + int rv = 0; +#ifndef CONFIG_NEED_PER_CPU_KM + void __percpu **pcpu; + size_t size, align; + int i; + + pcpu = vmalloc(sizeof(void __percpu *) * 35000); + if (!pcpu) + return -1; + + for (i = 0; i < 35000; i++) { + unsigned int r; + + get_random_bytes(&r, sizeof(i)); + size = (r % (PAGE_SIZE / 4)) + 1; + + /* + * Maximum PAGE_SIZE + */ + get_random_bytes(&r, sizeof(i)); + align = 1 << ((i % 11) + 1); + + pcpu[i] = __alloc_percpu(size, align); + if (!pcpu[i]) + rv = -1; + } + + for (i = 0; i < 35000; i++) + free_percpu(pcpu[i]); + + vfree(pcpu); +#endif + return rv; +} + +struct test_case_desc { + const char *test_name; + int (*test_func)(void); +}; + +static struct test_case_desc test_case_array[] = { + { "fix_size_alloc_test", fix_size_alloc_test }, + { "full_fit_alloc_test", full_fit_alloc_test }, + { "long_busy_list_alloc_test", long_busy_list_alloc_test }, + { "random_size_alloc_test", random_size_alloc_test }, + { "fix_align_alloc_test", fix_align_alloc_test }, + { "random_size_align_alloc_test", random_size_align_alloc_test }, + { "align_shift_alloc_test", align_shift_alloc_test }, + { "pcpu_alloc_test", pcpu_alloc_test }, + /* Add a new test case here. */ +}; + +struct test_case_data { + int test_failed; + int test_passed; + u64 time; +}; + +/* Split it to get rid of: WARNING: line over 80 characters */ +static struct test_case_data + per_cpu_test_data[NR_CPUS][ARRAY_SIZE(test_case_array)]; + +static struct test_driver { + struct task_struct *task; + unsigned long start; + unsigned long stop; + int cpu; +} per_cpu_test_driver[NR_CPUS]; + +static void shuffle_array(int *arr, int n) +{ + unsigned int rnd; + int i, j, x; + + for (i = n - 1; i > 0; i--) { + get_random_bytes(&rnd, sizeof(rnd)); + + /* Cut the range. */ + j = rnd % i; + + /* Swap indexes. */ + x = arr[i]; + arr[i] = arr[j]; + arr[j] = x; + } +} + +static int test_func(void *private) +{ + struct test_driver *t = private; + cpumask_t newmask = CPU_MASK_NONE; + int random_array[ARRAY_SIZE(test_case_array)]; + int index, i, j, ret; + ktime_t kt; + u64 delta; + + cpumask_set_cpu(t->cpu, &newmask); + set_cpus_allowed_ptr(current, &newmask); + + for (i = 0; i < ARRAY_SIZE(test_case_array); i++) + random_array[i] = i; + + if (!sequential_test_order) + shuffle_array(random_array, ARRAY_SIZE(test_case_array)); + + /* + * Block until initialization is done. + */ + down_read(&prepare_for_test_rwsem); + + t->start = get_cycles(); + for (i = 0; i < ARRAY_SIZE(test_case_array); i++) { + index = random_array[i]; + + /* + * Skip tests if run_test_mask has been specified. + */ + if (!((run_test_mask & (1 << index)) >> index)) + continue; + + kt = ktime_get(); + for (j = 0; j < test_repeat_count; j++) { + ret = test_case_array[index].test_func(); + if (!ret) + per_cpu_test_data[t->cpu][index].test_passed++; + else + per_cpu_test_data[t->cpu][index].test_failed++; + } + + /* + * Take an average time that test took. + */ + delta = (u64) ktime_us_delta(ktime_get(), kt); + do_div(delta, (u32) test_repeat_count); + + per_cpu_test_data[t->cpu][index].time = delta; + } + t->stop = get_cycles(); + + up_read(&prepare_for_test_rwsem); + test_report_one_done(); + + /* + * Wait for the kthread_stop() call. + */ + while (!kthread_should_stop()) + msleep(10); + + return 0; +} + +static void +init_test_configurtion(void) +{ + /* + * Reset all data of all CPUs. + */ + memset(per_cpu_test_data, 0, sizeof(per_cpu_test_data)); + + if (single_cpu_test) + cpumask_set_cpu(cpumask_first(cpu_online_mask), + &cpus_run_test_mask); + else + cpumask_and(&cpus_run_test_mask, cpu_online_mask, + cpu_online_mask); + + if (test_repeat_count <= 0) + test_repeat_count = 1; + + if (test_loop_count <= 0) + test_loop_count = 1; +} + +static void do_concurrent_test(void) +{ + int cpu, ret; + + /* + * Set some basic configurations plus sanity check. + */ + init_test_configurtion(); + + /* + * Put on hold all workers. + */ + down_write(&prepare_for_test_rwsem); + + for_each_cpu(cpu, &cpus_run_test_mask) { + struct test_driver *t = &per_cpu_test_driver[cpu]; + + t->cpu = cpu; + t->task = kthread_run(test_func, t, "vmalloc_test/%d", cpu); + + if (!IS_ERR(t->task)) + /* Success. */ + atomic_inc(&test_n_undone); + else + pr_err("Failed to start kthread for %d CPU\n", cpu); + } + + /* + * Now let the workers do their job. + */ + up_write(&prepare_for_test_rwsem); + + /* + * Sleep quiet until all workers are done with 1 second + * interval. Since the test can take a lot of time we + * can run into a stack trace of the hung task. That is + * why we go with completion_timeout and HZ value. + */ + do { + ret = wait_for_completion_timeout(&test_all_done_comp, HZ); + } while (!ret); + + for_each_cpu(cpu, &cpus_run_test_mask) { + struct test_driver *t = &per_cpu_test_driver[cpu]; + int i; + + if (!IS_ERR(t->task)) + kthread_stop(t->task); + + for (i = 0; i < ARRAY_SIZE(test_case_array); i++) { + if (!((run_test_mask & (1 << i)) >> i)) + continue; + + pr_info( + "Summary: %s passed: %d failed: %d repeat: %d loops: %d avg: %llu usec\n", + test_case_array[i].test_name, + per_cpu_test_data[cpu][i].test_passed, + per_cpu_test_data[cpu][i].test_failed, + test_repeat_count, test_loop_count, + per_cpu_test_data[cpu][i].time); + } + + pr_info("All test took CPU%d=%lu cycles\n", + cpu, t->stop - t->start); + } +} + +static int vmalloc_test_init(void) +{ + do_concurrent_test(); + return -EAGAIN; /* Fail will directly unload the module */ +} + +static void vmalloc_test_exit(void) +{ +} + +module_init(vmalloc_test_init) +module_exit(vmalloc_test_exit) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Uladzislau Rezki"); +MODULE_DESCRIPTION("vmalloc test module"); diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug @@ -39,6 +39,23 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT Enable debug page memory allocations by default? This value can be overridden by debug_pagealloc=off|on. +config PAGE_OWNER + bool "Track page owner" + depends on DEBUG_KERNEL && STACKTRACE_SUPPORT + select DEBUG_FS + select STACKTRACE + select STACKDEPOT + select PAGE_EXTENSION + help + This keeps track of what call chain is the owner of a page, may + help to find bare alloc_page(s) leaks. Even if you include this + feature on your build, it is disabled in default. You should pass + "page_owner=on" to boot parameter in order to enable it. Eats + a fair amount of memory if enabled. See tools/vm/page_owner_sort.c + for user-space helper. + + If unsure, say N. + config PAGE_POISONING bool "Poison pages after freeing" select PAGE_POISONING_NO_SANITY if HIBERNATION diff --git a/mm/cma.c b/mm/cma.c @@ -353,12 +353,14 @@ int __init cma_declare_contiguous(phys_addr_t base, ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); if (ret) - goto err; + goto free_mem; pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, &base); return 0; +free_mem: + memblock_free(base, size); err: pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); return ret; diff --git a/mm/cma_debug.c b/mm/cma_debug.c @@ -21,8 +21,6 @@ struct cma_mem { unsigned long n; }; -static struct dentry *cma_debugfs_root; - static int cma_debugfs_get(void *data, u64 *val) { unsigned long *p = data; @@ -162,7 +160,7 @@ static int cma_alloc_write(void *data, u64 val) } DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); -static void cma_debugfs_add_one(struct cma *cma, int idx) +static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry) { struct dentry *tmp; char name[16]; @@ -170,7 +168,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) scnprintf(name, sizeof(name), "cma-%s", cma->name); - tmp = debugfs_create_dir(name, cma_debugfs_root); + tmp = debugfs_create_dir(name, root_dentry); debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops); debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops); @@ -188,14 +186,13 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) static int __init cma_debugfs_init(void) { + struct dentry *cma_debugfs_root; int i; cma_debugfs_root = debugfs_create_dir("cma", NULL); - if (!cma_debugfs_root) - return -ENOMEM; for (i = 0; i < cma_area_count; i++) - cma_debugfs_add_one(&cma_areas[i], i); + cma_debugfs_add_one(&cma_areas[i], cma_debugfs_root); return 0; } diff --git a/mm/compaction.c b/mm/compaction.c @@ -66,7 +66,7 @@ static unsigned long release_freepages(struct list_head *freelist) return high_pfn; } -static void map_pages(struct list_head *list) +static void split_map_pages(struct list_head *list) { unsigned int i, order, nr_pages; struct page *page, *next; @@ -237,6 +237,70 @@ static bool pageblock_skip_persistent(struct page *page) return false; } +static bool +__reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, + bool check_target) +{ + struct page *page = pfn_to_online_page(pfn); + struct page *end_page; + unsigned long block_pfn; + + if (!page) + return false; + if (zone != page_zone(page)) + return false; + if (pageblock_skip_persistent(page)) + return false; + + /* + * If skip is already cleared do no further checking once the + * restart points have been set. + */ + if (check_source && check_target && !get_pageblock_skip(page)) + return true; + + /* + * If clearing skip for the target scanner, do not select a + * non-movable pageblock as the starting point. + */ + if (!check_source && check_target && + get_pageblock_migratetype(page) != MIGRATE_MOVABLE) + return false; + + /* + * Only clear the hint if a sample indicates there is either a + * free page or an LRU page in the block. One or other condition + * is necessary for the block to be a migration source/target. + */ + block_pfn = pageblock_start_pfn(pfn); + pfn = max(block_pfn, zone->zone_start_pfn); + page = pfn_to_page(pfn); + if (zone != page_zone(page)) + return false; + pfn = block_pfn + pageblock_nr_pages; + pfn = min(pfn, zone_end_pfn(zone)); + end_page = pfn_to_page(pfn); + + do { + if (pfn_valid_within(pfn)) { + if (check_source && PageLRU(page)) { + clear_pageblock_skip(page); + return true; + } + + if (check_target && PageBuddy(page)) { + clear_pageblock_skip(page); + return true; + } + } + + page += (1 << PAGE_ALLOC_COSTLY_ORDER); + pfn += (1 << PAGE_ALLOC_COSTLY_ORDER); + } while (page < end_page); + + return false; +} + /* * This function is called to clear all cached information on pageblocks that * should be skipped for page isolation when the migrate and free page scanner @@ -244,30 +308,54 @@ static bool pageblock_skip_persistent(struct page *page) */ static void __reset_isolation_suitable(struct zone *zone) { - unsigned long start_pfn = zone->zone_start_pfn; - unsigned long end_pfn = zone_end_pfn(zone); - unsigned long pfn; + unsigned long migrate_pfn = zone->zone_start_pfn; + unsigned long free_pfn = zone_end_pfn(zone); + unsigned long reset_migrate = free_pfn; + unsigned long reset_free = migrate_pfn; + bool source_set = false; + bool free_set = false; + + if (!zone->compact_blockskip_flush) + return; zone->compact_blockskip_flush = false; - /* Walk the zone and mark every pageblock as suitable for isolation */ - for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { - struct page *page; - + /* + * Walk the zone and update pageblock skip information. Source looks + * for PageLRU while target looks for PageBuddy. When the scanner + * is found, both PageBuddy and PageLRU are checked as the pageblock + * is suitable as both source and target. + */ + for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages, + free_pfn -= pageblock_nr_pages) { cond_resched(); - page = pfn_to_online_page(pfn); - if (!page) - continue; - if (zone != page_zone(page)) - continue; - if (pageblock_skip_persistent(page)) - continue; + /* Update the migrate PFN */ + if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) && + migrate_pfn < reset_migrate) { + source_set = true; + reset_migrate = migrate_pfn; + zone->compact_init_migrate_pfn = reset_migrate; + zone->compact_cached_migrate_pfn[0] = reset_migrate; + zone->compact_cached_migrate_pfn[1] = reset_migrate; + } - clear_pageblock_skip(page); + /* Update the free PFN */ + if (__reset_isolation_pfn(zone, free_pfn, free_set, true) && + free_pfn > reset_free) { + free_set = true; + reset_free = free_pfn; + zone->compact_init_free_pfn = reset_free; + zone->compact_cached_free_pfn = reset_free; + } } - reset_cached_positions(zone); + /* Leave no distance if no suitable block was reset */ + if (reset_migrate >= reset_free) { + zone->compact_cached_migrate_pfn[0] = migrate_pfn; + zone->compact_cached_migrate_pfn[1] = migrate_pfn; + zone->compact_cached_free_pfn = free_pfn; + } } void reset_isolation_suitable(pg_data_t *pgdat) @@ -286,15 +374,53 @@ void reset_isolation_suitable(pg_data_t *pgdat) } /* + * Sets the pageblock skip bit if it was clear. Note that this is a hint as + * locks are not required for read/writers. Returns true if it was already set. + */ +static bool test_and_set_skip(struct compact_control *cc, struct page *page, + unsigned long pfn) +{ + bool skip; + + /* Do no update if skip hint is being ignored */ + if (cc->ignore_skip_hint) + return false; + + if (!IS_ALIGNED(pfn, pageblock_nr_pages)) + return false; + + skip = get_pageblock_skip(page); + if (!skip && !cc->no_set_skip_hint) + set_pageblock_skip(page); + + return skip; +} + +static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) +{ + struct zone *zone = cc->zone; + + pfn = pageblock_end_pfn(pfn); + + /* Set for isolation rather than compaction */ + if (cc->no_set_skip_hint) + return; + + if (pfn > zone->compact_cached_migrate_pfn[0]) + zone->compact_cached_migrate_pfn[0] = pfn; + if (cc->mode != MIGRATE_ASYNC && + pfn > zone->compact_cached_migrate_pfn[1]) + zone->compact_cached_migrate_pfn[1] = pfn; +} + +/* * If no pages were isolated then mark this pageblock to be skipped in the * future. The information is later cleared by __reset_isolation_suitable(). */ static void update_pageblock_skip(struct compact_control *cc, - struct page *page, unsigned long nr_isolated, - bool migrate_scanner) + struct page *page, unsigned long pfn) { struct zone *zone = cc->zone; - unsigned long pfn; if (cc->no_set_skip_hint) return; @@ -302,24 +428,11 @@ static void update_pageblock_skip(struct compact_control *cc, if (!page) return; - if (nr_isolated) - return; - set_pageblock_skip(page); - pfn = page_to_pfn(page); - /* Update where async and sync compaction should restart */ - if (migrate_scanner) { - if (pfn > zone->compact_cached_migrate_pfn[0]) - zone->compact_cached_migrate_pfn[0] = pfn; - if (cc->mode != MIGRATE_ASYNC && - pfn > zone->compact_cached_migrate_pfn[1]) - zone->compact_cached_migrate_pfn[1] = pfn; - } else { - if (pfn < zone->compact_cached_free_pfn) - zone->compact_cached_free_pfn = pfn; - } + if (pfn < zone->compact_cached_free_pfn) + zone->compact_cached_free_pfn = pfn; } #else static inline bool isolation_suitable(struct compact_control *cc, @@ -334,32 +447,42 @@ static inline bool pageblock_skip_persistent(struct page *page) } static inline void update_pageblock_skip(struct compact_control *cc, - struct page *page, unsigned long nr_isolated, - bool migrate_scanner) + struct page *page, unsigned long pfn) +{ +} + +static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) +{ +} + +static bool test_and_set_skip(struct compact_control *cc, struct page *page, + unsigned long pfn) { + return false; } #endif /* CONFIG_COMPACTION */ /* * Compaction requires the taking of some coarse locks that are potentially - * very heavily contended. For async compaction, back out if the lock cannot - * be taken immediately. For sync compaction, spin on the lock if needed. + * very heavily contended. For async compaction, trylock and record if the + * lock is contended. The lock will still be acquired but compaction will + * abort when the current block is finished regardless of success rate. + * Sync compaction acquires the lock. * - * Returns true if the lock is held - * Returns false if the lock is not held and compaction should abort + * Always returns true which makes it easier to track lock state in callers. */ -static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags, +static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags, struct compact_control *cc) { - if (cc->mode == MIGRATE_ASYNC) { - if (!spin_trylock_irqsave(lock, *flags)) { - cc->contended = true; - return false; - } - } else { - spin_lock_irqsave(lock, *flags); + /* Track if the lock is contended in async mode */ + if (cc->mode == MIGRATE_ASYNC && !cc->contended) { + if (spin_trylock_irqsave(lock, *flags)) + return true; + + cc->contended = true; } + spin_lock_irqsave(lock, *flags); return true; } @@ -391,37 +514,7 @@ static bool compact_unlock_should_abort(spinlock_t *lock, return true; } - if (need_resched()) { - if (cc->mode == MIGRATE_ASYNC) { - cc->contended = true; - return true; - } - cond_resched(); - } - - return false; -} - -/* - * Aside from avoiding lock contention, compaction also periodically checks - * need_resched() and either schedules in sync compaction or aborts async - * compaction. This is similar to what compact_unlock_should_abort() does, but - * is used where no lock is concerned. - * - * Returns false when no scheduling was needed, or sync compaction scheduled. - * Returns true when async compaction should abort. - */ -static inline bool compact_should_abort(struct compact_control *cc) -{ - /* async compaction aborts if contended */ - if (need_resched()) { - if (cc->mode == MIGRATE_ASYNC) { - cc->contended = true; - return true; - } - - cond_resched(); - } + cond_resched(); return false; } @@ -435,19 +528,24 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, unsigned long *start_pfn, unsigned long end_pfn, struct list_head *freelist, + unsigned int stride, bool strict) { int nr_scanned = 0, total_isolated = 0; - struct page *cursor, *valid_page = NULL; + struct page *cursor; unsigned long flags = 0; bool locked = false; unsigned long blockpfn = *start_pfn; unsigned int order; + /* Strict mode is for isolation, speed is secondary */ + if (strict) + stride = 1; + cursor = pfn_to_page(blockpfn); /* Isolate free pages. */ - for (; blockpfn < end_pfn; blockpfn++, cursor++) { + for (; blockpfn < end_pfn; blockpfn += stride, cursor += stride) { int isolated; struct page *page = cursor; @@ -465,9 +563,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (!pfn_valid_within(blockpfn)) goto isolate_fail; - if (!valid_page) - valid_page = page; - /* * For compound pages such as THP and hugetlbfs, we can save * potentially a lot of iterations if we skip them at once. @@ -495,18 +590,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, * recheck as well. */ if (!locked) { - /* - * The zone lock must be held to isolate freepages. - * Unfortunately this is a very coarse lock and can be - * heavily contended if there are parallel allocations - * or parallel compactions. For async compaction do not - * spin on the lock and we acquire the lock as late as - * possible. - */ - locked = compact_trylock_irqsave(&cc->zone->lock, + locked = compact_lock_irqsave(&cc->zone->lock, &flags, cc); - if (!locked) - break; /* Recheck this is a buddy page under lock */ if (!PageBuddy(page)) @@ -565,10 +650,6 @@ isolate_fail: if (strict && blockpfn < end_pfn) total_isolated = 0; - /* Update the pageblock-skip if the whole pageblock was scanned */ - if (blockpfn == end_pfn) - update_pageblock_skip(cc, valid_page, total_isolated, false); - cc->total_free_scanned += nr_scanned; if (total_isolated) count_compact_events(COMPACTISOLATED, total_isolated); @@ -626,7 +707,7 @@ isolate_freepages_range(struct compact_control *cc, break; isolated = isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, &freelist, true); + block_end_pfn, &freelist, 0, true); /* * In strict mode, isolate_freepages_block() returns 0 if @@ -644,7 +725,7 @@ isolate_freepages_range(struct compact_control *cc, } /* __isolate_free_page() does not map the pages */ - map_pages(&freelist); + split_map_pages(&freelist); if (pfn < end_pfn) { /* Loop terminated early, cleanup. */ @@ -657,16 +738,16 @@ isolate_freepages_range(struct compact_control *cc, } /* Similar to reclaim, but different enough that they don't share logic */ -static bool too_many_isolated(struct zone *zone) +static bool too_many_isolated(pg_data_t *pgdat) { unsigned long active, inactive, isolated; - inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) + - node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON); - active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) + - node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON); - isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) + - node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON); + inactive = node_page_state(pgdat, NR_INACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_ANON); + active = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_ACTIVE_ANON); + isolated = node_page_state(pgdat, NR_ISOLATED_FILE) + + node_page_state(pgdat, NR_ISOLATED_ANON); return isolated > (inactive + active) / 2; } @@ -693,7 +774,7 @@ static unsigned long isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn, isolate_mode_t isolate_mode) { - struct zone *zone = cc->zone; + pg_data_t *pgdat = cc->zone->zone_pgdat; unsigned long nr_scanned = 0, nr_isolated = 0; struct lruvec *lruvec; unsigned long flags = 0; @@ -702,13 +783,14 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, unsigned long start_pfn = low_pfn; bool skip_on_failure = false; unsigned long next_skip_pfn = 0; + bool skip_updated = false; /* * Ensure that there are not too many pages isolated from the LRU * list by either parallel reclaimers or compaction. If there are, * delay for some time until fewer pages are isolated */ - while (unlikely(too_many_isolated(zone))) { + while (unlikely(too_many_isolated(pgdat))) { /* async migration should just abort */ if (cc->mode == MIGRATE_ASYNC) return 0; @@ -719,8 +801,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, return 0; } - if (compact_should_abort(cc)) - return 0; + cond_resched(); if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) { skip_on_failure = true; @@ -758,8 +839,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * if contended. */ if (!(low_pfn % SWAP_CLUSTER_MAX) - && compact_unlock_should_abort(zone_lru_lock(zone), flags, - &locked, cc)) + && compact_unlock_should_abort(&pgdat->lru_lock, + flags, &locked, cc)) break; if (!pfn_valid_within(low_pfn)) @@ -768,8 +849,19 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, page = pfn_to_page(low_pfn); - if (!valid_page) + /* + * Check if the pageblock has already been marked skipped. + * Only the aligned PFN is checked as the caller isolates + * COMPACT_CLUSTER_MAX at a time so the second call must + * not falsely conclude that the block should be skipped. + */ + if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) { + if (!cc->ignore_skip_hint && get_pageblock_skip(page)) { + low_pfn = end_pfn; + goto isolate_abort; + } valid_page = page; + } /* * Skip if free. We read page order here without zone lock @@ -818,7 +910,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (unlikely(__PageMovable(page)) && !PageIsolated(page)) { if (locked) { - spin_unlock_irqrestore(zone_lru_lock(zone), + spin_unlock_irqrestore(&pgdat->lru_lock, flags); locked = false; } @@ -848,10 +940,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* If we already hold the lock, we can skip some rechecking */ if (!locked) { - locked = compact_trylock_irqsave(zone_lru_lock(zone), + locked = compact_lock_irqsave(&pgdat->lru_lock, &flags, cc); - if (!locked) - break; + + /* Try get exclusive access under lock */ + if (!skip_updated) { + skip_updated = true; + if (test_and_set_skip(cc, page, low_pfn)) + goto isolate_abort; + } /* Recheck PageLRU and PageCompound under lock */ if (!PageLRU(page)) @@ -868,7 +965,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } } - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); + lruvec = mem_cgroup_page_lruvec(page, pgdat); /* Try isolate the page */ if (__isolate_lru_page(page, isolate_mode) != 0) @@ -887,16 +984,13 @@ isolate_success: nr_isolated++; /* - * Record where we could have freed pages by migration and not - * yet flushed them to buddy allocator. - * - this is the lowest page that was isolated and likely be - * then freed by migration. + * Avoid isolating too much unless this block is being + * rescanned (e.g. dirty/writeback pages, parallel allocation) + * or a lock is contended. For contention, isolate quickly to + * potentially remove one source of contention. */ - if (!cc->last_migrated_pfn) - cc->last_migrated_pfn = low_pfn; - - /* Avoid isolating too much */ - if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { + if (cc->nr_migratepages == COMPACT_CLUSTER_MAX && + !cc->rescan && !cc->contended) { ++low_pfn; break; } @@ -913,12 +1007,11 @@ isolate_fail: */ if (nr_isolated) { if (locked) { - spin_unlock_irqrestore(zone_lru_lock(zone), flags); + spin_unlock_irqrestore(&pgdat->lru_lock, flags); locked = false; } putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; - cc->last_migrated_pfn = 0; nr_isolated = 0; } @@ -939,15 +1032,23 @@ isolate_fail: if (unlikely(low_pfn > end_pfn)) low_pfn = end_pfn; +isolate_abort: if (locked) - spin_unlock_irqrestore(zone_lru_lock(zone), flags); + spin_unlock_irqrestore(&pgdat->lru_lock, flags); /* - * Update the pageblock-skip information and cached scanner pfn, - * if the whole pageblock was scanned without isolating any page. + * Updated the cached scanner pfn once the pageblock has been scanned + * Pages will either be migrated in which case there is no point + * scanning in the near future or migration failed in which case the + * failure reason may persist. The block is marked for skipping if + * there were no pages isolated in the block or if the block is + * rescanned twice in a row. */ - if (low_pfn == end_pfn) - update_pageblock_skip(cc, valid_page, nr_isolated, true); + if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) { + if (valid_page && !skip_updated) + set_pageblock_skip(valid_page); + update_cached_migrate(cc, low_pfn); + } trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, nr_scanned, nr_isolated); @@ -1013,6 +1114,9 @@ static bool suitable_migration_source(struct compact_control *cc, { int block_mt; + if (pageblock_skip_persistent(page)) + return false; + if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction) return true; @@ -1050,6 +1154,12 @@ static bool suitable_migration_target(struct compact_control *cc, return false; } +static inline unsigned int +freelist_scan_limit(struct compact_control *cc) +{ + return (COMPACT_CLUSTER_MAX >> cc->fast_search_fail) + 1; +} + /* * Test whether the free scanner has reached the same or lower pageblock than * the migration scanner, and compaction should thus terminate. @@ -1061,6 +1171,248 @@ static inline bool compact_scanners_met(struct compact_control *cc) } /* + * Used when scanning for a suitable migration target which scans freelists + * in reverse. Reorders the list such as the unscanned pages are scanned + * first on the next iteration of the free scanner + */ +static void +move_freelist_head(struct list_head *freelist, struct page *freepage) +{ + LIST_HEAD(sublist); + + if (!list_is_last(freelist, &freepage->lru)) { + list_cut_before(&sublist, freelist, &freepage->lru); + if (!list_empty(&sublist)) + list_splice_tail(&sublist, freelist); + } +} + +/* + * Similar to move_freelist_head except used by the migration scanner + * when scanning forward. It's possible for these list operations to + * move against each other if they search the free list exactly in + * lockstep. + */ +static void +move_freelist_tail(struct list_head *freelist, struct page *freepage) +{ + LIST_HEAD(sublist); + + if (!list_is_first(freelist, &freepage->lru)) { + list_cut_position(&sublist, freelist, &freepage->lru); + if (!list_empty(&sublist)) + list_splice_tail(&sublist, freelist); + } +} + +static void +fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated) +{ + unsigned long start_pfn, end_pfn; + struct page *page = pfn_to_page(pfn); + + /* Do not search around if there are enough pages already */ + if (cc->nr_freepages >= cc->nr_migratepages) + return; + + /* Minimise scanning during async compaction */ + if (cc->direct_compaction && cc->mode == MIGRATE_ASYNC) + return; + + /* Pageblock boundaries */ + start_pfn = pageblock_start_pfn(pfn); + end_pfn = min(start_pfn + pageblock_nr_pages, zone_end_pfn(cc->zone)); + + /* Scan before */ + if (start_pfn != pfn) { + isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, 1, false); + if (cc->nr_freepages >= cc->nr_migratepages) + return; + } + + /* Scan after */ + start_pfn = pfn + nr_isolated; + if (start_pfn != end_pfn) + isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); + + /* Skip this pageblock in the future as it's full or nearly full */ + if (cc->nr_freepages < cc->nr_migratepages) + set_pageblock_skip(page); +} + +/* Search orders in round-robin fashion */ +static int next_search_order(struct compact_control *cc, int order) +{ + order--; + if (order < 0) + order = cc->order - 1; + + /* Search wrapped around? */ + if (order == cc->search_order) { + cc->search_order--; + if (cc->search_order < 0) + cc->search_order = cc->order - 1; + return -1; + } + + return order; +} + +static unsigned long +fast_isolate_freepages(struct compact_control *cc) +{ + unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1); + unsigned int nr_scanned = 0; + unsigned long low_pfn, min_pfn, high_pfn = 0, highest = 0; + unsigned long nr_isolated = 0; + unsigned long distance; + struct page *page = NULL; + bool scan_start = false; + int order; + + /* Full compaction passes in a negative order */ + if (cc->order <= 0) + return cc->free_pfn; + + /* + * If starting the scan, use a deeper search and use the highest + * PFN found if a suitable one is not found. + */ + if (cc->free_pfn >= cc->zone->compact_init_free_pfn) { + limit = pageblock_nr_pages >> 1; + scan_start = true; + } + + /* + * Preferred point is in the top quarter of the scan space but take + * a pfn from the top half if the search is problematic. + */ + distance = (cc->free_pfn - cc->migrate_pfn); + low_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 2)); + min_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 1)); + + if (WARN_ON_ONCE(min_pfn > low_pfn)) + low_pfn = min_pfn; + + /* + * Search starts from the last successful isolation order or the next + * order to search after a previous failure + */ + cc->search_order = min_t(unsigned int, cc->order - 1, cc->search_order); + + for (order = cc->search_order; + !page && order >= 0; + order = next_search_order(cc, order)) { + struct free_area *area = &cc->zone->free_area[order]; + struct list_head *freelist; + struct page *freepage; + unsigned long flags; + unsigned int order_scanned = 0; + + if (!area->nr_free) + continue; + + spin_lock_irqsave(&cc->zone->lock, flags); + freelist = &area->free_list[MIGRATE_MOVABLE]; + list_for_each_entry_reverse(freepage, freelist, lru) { + unsigned long pfn; + + order_scanned++; + nr_scanned++; + pfn = page_to_pfn(freepage); + + if (pfn >= highest) + highest = pageblock_start_pfn(pfn); + + if (pfn >= low_pfn) { + cc->fast_search_fail = 0; + cc->search_order = order; + page = freepage; + break; + } + + if (pfn >= min_pfn && pfn > high_pfn) { + high_pfn = pfn; + + /* Shorten the scan if a candidate is found */ + limit >>= 1; + } + + if (order_scanned >= limit) + break; + } + + /* Use a minimum pfn if a preferred one was not found */ + if (!page && high_pfn) { + page = pfn_to_page(high_pfn); + + /* Update freepage for the list reorder below */ + freepage = page; + } + + /* Reorder to so a future search skips recent pages */ + move_freelist_head(freelist, freepage); + + /* Isolate the page if available */ + if (page) { + if (__isolate_free_page(page, order)) { + set_page_private(page, order); + nr_isolated = 1 << order; + cc->nr_freepages += nr_isolated; + list_add_tail(&page->lru, &cc->freepages); + count_compact_events(COMPACTISOLATED, nr_isolated); + } else { + /* If isolation fails, abort the search */ + order = -1; + page = NULL; + } + } + + spin_unlock_irqrestore(&cc->zone->lock, flags); + + /* + * Smaller scan on next order so the total scan ig related + * to freelist_scan_limit. + */ + if (order_scanned >= limit) + limit = min(1U, limit >> 1); + } + + if (!page) { + cc->fast_search_fail++; + if (scan_start) { + /* + * Use the highest PFN found above min. If one was + * not found, be pessemistic for direct compaction + * and use the min mark. + */ + if (highest) { + page = pfn_to_page(highest); + cc->free_pfn = highest; + } else { + if (cc->direct_compaction) { + page = pfn_to_page(min_pfn); + cc->free_pfn = min_pfn; + } + } + } + } + + if (highest && highest >= cc->zone->compact_cached_free_pfn) { + highest -= pageblock_nr_pages; + cc->zone->compact_cached_free_pfn = highest; + } + + cc->total_free_scanned += nr_scanned; + if (!page) + return cc->free_pfn; + + low_pfn = page_to_pfn(page); + fast_isolate_around(cc, low_pfn, nr_isolated); + return low_pfn; +} + +/* * Based on information in the current compact_control, find blocks * suitable for isolating free pages from and then isolate them. */ @@ -1073,6 +1425,12 @@ static void isolate_freepages(struct compact_control *cc) unsigned long block_end_pfn; /* end of current pageblock */ unsigned long low_pfn; /* lowest pfn scanner is able to scan */ struct list_head *freelist = &cc->freepages; + unsigned int stride; + + /* Try a small search of the free lists for a candidate */ + isolate_start_pfn = fast_isolate_freepages(cc); + if (cc->nr_freepages) + goto splitmap; /* * Initialise the free scanner. The starting point is where we last @@ -1086,10 +1444,11 @@ static void isolate_freepages(struct compact_control *cc) * is using. */ isolate_start_pfn = cc->free_pfn; - block_start_pfn = pageblock_start_pfn(cc->free_pfn); + block_start_pfn = pageblock_start_pfn(isolate_start_pfn); block_end_pfn = min(block_start_pfn + pageblock_nr_pages, zone_end_pfn(zone)); low_pfn = pageblock_end_pfn(cc->migrate_pfn); + stride = cc->mode == MIGRATE_ASYNC ? COMPACT_CLUSTER_MAX : 1; /* * Isolate free pages until enough are available to migrate the @@ -1100,14 +1459,14 @@ static void isolate_freepages(struct compact_control *cc) block_end_pfn = block_start_pfn, block_start_pfn -= pageblock_nr_pages, isolate_start_pfn = block_start_pfn) { + unsigned long nr_isolated; + /* * This can iterate a massively long zone without finding any - * suitable migration targets, so periodically check if we need - * to schedule, or even abort async compaction. + * suitable migration targets, so periodically check resched. */ - if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) - && compact_should_abort(cc)) - break; + if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) + cond_resched(); page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, zone); @@ -1123,15 +1482,15 @@ static void isolate_freepages(struct compact_control *cc) continue; /* Found a block suitable for isolating free pages from. */ - isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn, - freelist, false); + nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn, + block_end_pfn, freelist, stride, false); - /* - * If we isolated enough freepages, or aborted due to lock - * contention, terminate. - */ - if ((cc->nr_freepages >= cc->nr_migratepages) - || cc->contended) { + /* Update the skip hint if the full pageblock was scanned */ + if (isolate_start_pfn == block_end_pfn) + update_pageblock_skip(cc, page, block_start_pfn); + + /* Are enough freepages isolated? */ + if (cc->nr_freepages >= cc->nr_migratepages) { if (isolate_start_pfn >= block_end_pfn) { /* * Restart at previous pageblock if more @@ -1148,10 +1507,14 @@ static void isolate_freepages(struct compact_control *cc) */ break; } - } - /* __isolate_free_page() does not map the pages */ - map_pages(freelist); + /* Adjust stride depending on isolation */ + if (nr_isolated) { + stride = 1; + continue; + } + stride = min_t(unsigned int, COMPACT_CLUSTER_MAX, stride << 1); + } /* * Record where the free scanner will restart next time. Either we @@ -1160,6 +1523,10 @@ static void isolate_freepages(struct compact_control *cc) * and the loop terminated due to isolate_start_pfn < low_pfn */ cc->free_pfn = isolate_start_pfn; + +splitmap: + /* __isolate_free_page() does not map the pages */ + split_map_pages(freelist); } /* @@ -1172,13 +1539,8 @@ static struct page *compaction_alloc(struct page *migratepage, struct compact_control *cc = (struct compact_control *)data; struct page *freepage; - /* - * Isolate free pages if necessary, and if we are not aborting due to - * contention. - */ if (list_empty(&cc->freepages)) { - if (!cc->contended) - isolate_freepages(cc); + isolate_freepages(cc); if (list_empty(&cc->freepages)) return NULL; @@ -1217,6 +1579,147 @@ typedef enum { */ int sysctl_compact_unevictable_allowed __read_mostly = 1; +static inline void +update_fast_start_pfn(struct compact_control *cc, unsigned long pfn) +{ + if (cc->fast_start_pfn == ULONG_MAX) + return; + + if (!cc->fast_start_pfn) + cc->fast_start_pfn = pfn; + + cc->fast_start_pfn = min(cc->fast_start_pfn, pfn); +} + +static inline unsigned long +reinit_migrate_pfn(struct compact_control *cc) +{ + if (!cc->fast_start_pfn || cc->fast_start_pfn == ULONG_MAX) + return cc->migrate_pfn; + + cc->migrate_pfn = cc->fast_start_pfn; + cc->fast_start_pfn = ULONG_MAX; + + return cc->migrate_pfn; +} + +/* + * Briefly search the free lists for a migration source that already has + * some free pages to reduce the number of pages that need migration + * before a pageblock is free. + */ +static unsigned long fast_find_migrateblock(struct compact_control *cc) +{ + unsigned int limit = freelist_scan_limit(cc); + unsigned int nr_scanned = 0; + unsigned long distance; + unsigned long pfn = cc->migrate_pfn; + unsigned long high_pfn; + int order; + + /* Skip hints are relied on to avoid repeats on the fast search */ + if (cc->ignore_skip_hint) + return pfn; + + /* + * If the migrate_pfn is not at the start of a zone or the start + * of a pageblock then assume this is a continuation of a previous + * scan restarted due to COMPACT_CLUSTER_MAX. + */ + if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn)) + return pfn; + + /* + * For smaller orders, just linearly scan as the number of pages + * to migrate should be relatively small and does not necessarily + * justify freeing up a large block for a small allocation. + */ + if (cc->order <= PAGE_ALLOC_COSTLY_ORDER) + return pfn; + + /* + * Only allow kcompactd and direct requests for movable pages to + * quickly clear out a MOVABLE pageblock for allocation. This + * reduces the risk that a large movable pageblock is freed for + * an unmovable/reclaimable small allocation. + */ + if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE) + return pfn; + + /* + * When starting the migration scanner, pick any pageblock within the + * first half of the search space. Otherwise try and pick a pageblock + * within the first eighth to reduce the chances that a migration + * target later becomes a source. + */ + distance = (cc->free_pfn - cc->migrate_pfn) >> 1; + if (cc->migrate_pfn != cc->zone->zone_start_pfn) + distance >>= 2; + high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance); + + for (order = cc->order - 1; + order >= PAGE_ALLOC_COSTLY_ORDER && pfn == cc->migrate_pfn && nr_scanned < limit; + order--) { + struct free_area *area = &cc->zone->free_area[order]; + struct list_head *freelist; + unsigned long flags; + struct page *freepage; + + if (!area->nr_free) + continue; + + spin_lock_irqsave(&cc->zone->lock, flags); + freelist = &area->free_list[MIGRATE_MOVABLE]; + list_for_each_entry(freepage, freelist, lru) { + unsigned long free_pfn; + + nr_scanned++; + free_pfn = page_to_pfn(freepage); + if (free_pfn < high_pfn) { + /* + * Avoid if skipped recently. Ideally it would + * move to the tail but even safe iteration of + * the list assumes an entry is deleted, not + * reordered. + */ + if (get_pageblock_skip(freepage)) { + if (list_is_last(freelist, &freepage->lru)) + break; + + continue; + } + + /* Reorder to so a future search skips recent pages */ + move_freelist_tail(freelist, freepage); + + update_fast_start_pfn(cc, free_pfn); + pfn = pageblock_start_pfn(free_pfn); + cc->fast_search_fail = 0; + set_pageblock_skip(freepage); + break; + } + + if (nr_scanned >= limit) { + cc->fast_search_fail++; + move_freelist_tail(freelist, freepage); + break; + } + } + spin_unlock_irqrestore(&cc->zone->lock, flags); + } + + cc->total_migrate_scanned += nr_scanned; + + /* + * If fast scanning failed then use a cached entry for a page block + * that had free pages as the basis for starting a linear scan. + */ + if (pfn == cc->migrate_pfn) + pfn = reinit_migrate_pfn(cc); + + return pfn; +} + /* * Isolate all pages that can be migrated from the first suitable block, * starting at the block pointed to by the migrate scanner pfn within @@ -1232,16 +1735,25 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, const isolate_mode_t isolate_mode = (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0); + bool fast_find_block; /* * Start at where we last stopped, or beginning of the zone as - * initialized by compact_zone() + * initialized by compact_zone(). The first failure will use + * the lowest PFN as the starting point for linear scanning. */ - low_pfn = cc->migrate_pfn; + low_pfn = fast_find_migrateblock(cc); block_start_pfn = pageblock_start_pfn(low_pfn); if (block_start_pfn < zone->zone_start_pfn) block_start_pfn = zone->zone_start_pfn; + /* + * fast_find_migrateblock marks a pageblock skipped so to avoid + * the isolation_suitable check below, check whether the fast + * search was successful. + */ + fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail; + /* Only scan within a pageblock boundary */ block_end_pfn = pageblock_end_pfn(low_pfn); @@ -1250,6 +1762,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * Do not cross the free scanner. */ for (; block_end_pfn <= cc->free_pfn; + fast_find_block = false, low_pfn = block_end_pfn, block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { @@ -1257,34 +1770,45 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, /* * This can potentially iterate a massively long zone with * many pageblocks unsuitable, so periodically check if we - * need to schedule, or even abort async compaction. + * need to schedule. */ - if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) - && compact_should_abort(cc)) - break; + if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) + cond_resched(); page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, zone); if (!page) continue; - /* If isolation recently failed, do not retry */ - if (!isolation_suitable(cc, page)) + /* + * If isolation recently failed, do not retry. Only check the + * pageblock once. COMPACT_CLUSTER_MAX causes a pageblock + * to be visited multiple times. Assume skip was checked + * before making it "skip" so other compaction instances do + * not scan the same block. + */ + if (IS_ALIGNED(low_pfn, pageblock_nr_pages) && + !fast_find_block && !isolation_suitable(cc, page)) continue; /* - * For async compaction, also only scan in MOVABLE blocks. - * Async compaction is optimistic to see if the minimum amount - * of work satisfies the allocation. + * For async compaction, also only scan in MOVABLE blocks + * without huge pages. Async compaction is optimistic to see + * if the minimum amount of work satisfies the allocation. + * The cached PFN is updated as it's possible that all + * remaining blocks between source and target are unsuitable + * and the compaction scanners fail to meet. */ - if (!suitable_migration_source(cc, page)) + if (!suitable_migration_source(cc, page)) { + update_cached_migrate(cc, block_end_pfn); continue; + } /* Perform the isolation */ low_pfn = isolate_migratepages_block(cc, low_pfn, block_end_pfn, isolate_mode); - if (!low_pfn || cc->contended) + if (!low_pfn) return ISOLATE_ABORT; /* @@ -1310,19 +1834,16 @@ static inline bool is_via_compact_memory(int order) return order == -1; } -static enum compact_result __compact_finished(struct zone *zone, - struct compact_control *cc) +static enum compact_result __compact_finished(struct compact_control *cc) { unsigned int order; const int migratetype = cc->migratetype; - - if (cc->contended || fatal_signal_pending(current)) - return COMPACT_CONTENDED; + int ret; /* Compaction run completes if the migrate and free scanner meet */ if (compact_scanners_met(cc)) { /* Let the next compaction start anew. */ - reset_cached_positions(zone); + reset_cached_positions(cc->zone); /* * Mark that the PG_migrate_skip information should be cleared @@ -1331,7 +1852,7 @@ static enum compact_result __compact_finished(struct zone *zone, * based on an allocation request. */ if (cc->direct_compaction) - zone->compact_blockskip_flush = true; + cc->zone->compact_blockskip_flush = true; if (cc->whole_zone) return COMPACT_COMPLETE; @@ -1342,20 +1863,19 @@ static enum compact_result __compact_finished(struct zone *zone, if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; - if (cc->finishing_block) { - /* - * We have finished the pageblock, but better check again that - * we really succeeded. - */ - if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) - cc->finishing_block = false; - else - return COMPACT_CONTINUE; - } + /* + * Always finish scanning a pageblock to reduce the possibility of + * fallbacks in the future. This is particularly important when + * migration source is unmovable/reclaimable but it's not worth + * special casing. + */ + if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) + return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ + ret = COMPACT_NO_SUITABLE_PAGE; for (order = cc->order; order < MAX_ORDER; order++) { - struct free_area *area = &zone->free_area[order]; + struct free_area *area = &cc->zone->free_area[order]; bool can_steal; /* Job done if page is free of the right migratetype */ @@ -1393,21 +1913,23 @@ static enum compact_result __compact_finished(struct zone *zone, return COMPACT_SUCCESS; } - cc->finishing_block = true; - return COMPACT_CONTINUE; + ret = COMPACT_CONTINUE; + break; } } - return COMPACT_NO_SUITABLE_PAGE; + if (cc->contended || fatal_signal_pending(current)) + ret = COMPACT_CONTENDED; + + return ret; } -static enum compact_result compact_finished(struct zone *zone, - struct compact_control *cc) +static enum compact_result compact_finished(struct compact_control *cc) { int ret; - ret = __compact_finished(zone, cc); - trace_mm_compaction_finished(zone, cc->order, ret); + ret = __compact_finished(cc); + trace_mm_compaction_finished(cc->zone, cc->order, ret); if (ret == COMPACT_NO_SUITABLE_PAGE) ret = COMPACT_CONTINUE; @@ -1534,15 +2056,18 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, return false; } -static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc) +static enum compact_result +compact_zone(struct compact_control *cc, struct capture_control *capc) { enum compact_result ret; - unsigned long start_pfn = zone->zone_start_pfn; - unsigned long end_pfn = zone_end_pfn(zone); + unsigned long start_pfn = cc->zone->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(cc->zone); + unsigned long last_migrated_pfn; const bool sync = cc->mode != MIGRATE_ASYNC; + bool update_cached; cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); - ret = compaction_suitable(zone, cc->order, cc->alloc_flags, + ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, cc->classzone_idx); /* Compaction is likely to fail */ if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) @@ -1555,8 +2080,8 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro * Clear pageblock skip if there were failures recently and compaction * is about to be retried after being deferred. */ - if (compaction_restarting(zone, cc->order)) - __reset_isolation_suitable(zone); + if (compaction_restarting(cc->zone, cc->order)) + __reset_isolation_suitable(cc->zone); /* * Setup to move all movable pages to the end of the zone. Used cached @@ -1564,43 +2089,76 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro * want to compact the whole zone), but check that it is initialised * by ensuring the values are within zone boundaries. */ + cc->fast_start_pfn = 0; if (cc->whole_zone) { cc->migrate_pfn = start_pfn; cc->free_pfn = pageblock_start_pfn(end_pfn - 1); } else { - cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; - cc->free_pfn = zone->compact_cached_free_pfn; + cc->migrate_pfn = cc->zone->compact_cached_migrate_pfn[sync]; + cc->free_pfn = cc->zone->compact_cached_free_pfn; if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { cc->free_pfn = pageblock_start_pfn(end_pfn - 1); - zone->compact_cached_free_pfn = cc->free_pfn; + cc->zone->compact_cached_free_pfn = cc->free_pfn; } if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { cc->migrate_pfn = start_pfn; - zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; - zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; + cc->zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; + cc->zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } - if (cc->migrate_pfn == start_pfn) + if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn) cc->whole_zone = true; } - cc->last_migrated_pfn = 0; + last_migrated_pfn = 0; + + /* + * Migrate has separate cached PFNs for ASYNC and SYNC* migration on + * the basis that some migrations will fail in ASYNC mode. However, + * if the cached PFNs match and pageblocks are skipped due to having + * no isolation candidates, then the sync state does not matter. + * Until a pageblock with isolation candidates is found, keep the + * cached PFNs in sync to avoid revisiting the same blocks. + */ + update_cached = !sync && + cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1]; trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync); migrate_prep_local(); - while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { + while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) { int err; + unsigned long start_pfn = cc->migrate_pfn; + + /* + * Avoid multiple rescans which can happen if a page cannot be + * isolated (dirty/writeback in async mode) or if the migrated + * pages are being allocated before the pageblock is cleared. + * The first rescan will capture the entire pageblock for + * migration. If it fails, it'll be marked skip and scanning + * will proceed as normal. + */ + cc->rescan = false; + if (pageblock_start_pfn(last_migrated_pfn) == + pageblock_start_pfn(start_pfn)) { + cc->rescan = true; + } - switch (isolate_migratepages(zone, cc)) { + switch (isolate_migratepages(cc->zone, cc)) { case ISOLATE_ABORT: ret = COMPACT_CONTENDED; putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; + last_migrated_pfn = 0; goto out; case ISOLATE_NONE: + if (update_cached) { + cc->zone->compact_cached_migrate_pfn[1] = + cc->zone->compact_cached_migrate_pfn[0]; + } + /* * We haven't isolated and migrated anything, but * there might still be unflushed migrations from @@ -1608,6 +2166,8 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro */ goto check_drain; case ISOLATE_SUCCESS: + update_cached = false; + last_migrated_pfn = start_pfn; ; } @@ -1639,8 +2199,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro cc->migrate_pfn = block_end_pfn( cc->migrate_pfn - 1, cc->order); /* Draining pcplists is useless in this case */ - cc->last_migrated_pfn = 0; - + last_migrated_pfn = 0; } } @@ -1652,21 +2211,26 @@ check_drain: * compact_finished() can detect immediately if allocation * would succeed. */ - if (cc->order > 0 && cc->last_migrated_pfn) { + if (cc->order > 0 && last_migrated_pfn) { int cpu; unsigned long current_block_start = block_start_pfn(cc->migrate_pfn, cc->order); - if (cc->last_migrated_pfn < current_block_start) { + if (last_migrated_pfn < current_block_start) { cpu = get_cpu(); lru_add_drain_cpu(cpu); - drain_local_pages(zone); + drain_local_pages(cc->zone); put_cpu(); /* No more flushing until we migrate again */ - cc->last_migrated_pfn = 0; + last_migrated_pfn = 0; } } + /* Stop if a page has been captured */ + if (capc && capc->page) { + ret = COMPACT_SUCCESS; + break; + } } out: @@ -1685,8 +2249,8 @@ out: * Only go back, not forward. The cached pfn might have been * already reset to zone end in compact_finished() */ - if (free_pfn > zone->compact_cached_free_pfn) - zone->compact_cached_free_pfn = free_pfn; + if (free_pfn > cc->zone->compact_cached_free_pfn) + cc->zone->compact_cached_free_pfn = free_pfn; } count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned); @@ -1700,7 +2264,8 @@ out: static enum compact_result compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum compact_priority prio, - unsigned int alloc_flags, int classzone_idx) + unsigned int alloc_flags, int classzone_idx, + struct page **capture) { enum compact_result ret; struct compact_control cc = { @@ -1709,6 +2274,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .total_migrate_scanned = 0, .total_free_scanned = 0, .order = order, + .search_order = order, .gfp_mask = gfp_mask, .zone = zone, .mode = (prio == COMPACT_PRIO_ASYNC) ? @@ -1720,14 +2286,24 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY), .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY) }; + struct capture_control capc = { + .cc = &cc, + .page = NULL, + }; + + if (capture) + current->capture_control = &capc; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); - ret = compact_zone(zone, &cc); + ret = compact_zone(&cc, &capc); VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); + *capture = capc.page; + current->capture_control = NULL; + return ret; } @@ -1745,7 +2321,7 @@ int sysctl_extfrag_threshold = 500; */ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, - enum compact_priority prio) + enum compact_priority prio, struct page **capture) { int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; @@ -1773,7 +2349,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, } status = compact_zone_order(zone, order, gfp_mask, prio, - alloc_flags, ac_classzone_idx(ac)); + alloc_flags, ac_classzone_idx(ac), capture); rc = max(status, rc); /* The allocation should succeed, stop compacting */ @@ -1841,7 +2417,7 @@ static void compact_node(int nid) INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); - compact_zone(zone, &cc); + compact_zone(&cc, NULL); VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); @@ -1876,14 +2452,6 @@ int sysctl_compaction_handler(struct ctl_table *table, int write, return 0; } -int sysctl_extfrag_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) -{ - proc_dointvec_minmax(table, write, buffer, length, ppos); - - return 0; -} - #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) static ssize_t sysfs_compact_node(struct device *dev, struct device_attribute *attr, @@ -1948,6 +2516,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) struct zone *zone; struct compact_control cc = { .order = pgdat->kcompactd_max_order, + .search_order = pgdat->kcompactd_max_order, .total_migrate_scanned = 0, .total_free_scanned = 0, .classzone_idx = pgdat->kcompactd_classzone_idx, @@ -1983,7 +2552,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) if (kthread_should_stop()) return; - status = compact_zone(zone, &cc); + status = compact_zone(&cc, NULL); if (status == COMPACT_SUCCESS) { compaction_defer_reset(zone, cc.order, false); diff --git a/mm/dmapool.c b/mm/dmapool.c @@ -114,10 +114,9 @@ static DEVICE_ATTR(pools, 0444, show_pools, NULL); * @size: size of the blocks in this pool. * @align: alignment requirement for blocks; must be a power of two * @boundary: returned blocks won't cross this power of two boundary - * Context: !in_interrupt() + * Context: not in_interrupt() * - * Returns a dma allocation pool with the requested characteristics, or - * null if one can't be created. Given one of these pools, dma_pool_alloc() + * Given one of these pools, dma_pool_alloc() * may be used to allocate memory. Such memory will all have "consistent" * DMA mappings, accessible by the device and its driver without using * cache flushing primitives. The actual size of blocks allocated may be @@ -127,6 +126,9 @@ static DEVICE_ATTR(pools, 0444, show_pools, NULL); * cross that size boundary. This is useful for devices which have * addressing restrictions on individual DMA transfers, such as not crossing * boundaries of 4KBytes. + * + * Return: a dma allocation pool with the requested characteristics, or + * %NULL if one can't be created. */ struct dma_pool *dma_pool_create(const char *name, struct device *dev, size_t size, size_t align, size_t boundary) @@ -313,7 +315,7 @@ EXPORT_SYMBOL(dma_pool_destroy); * @mem_flags: GFP_* bitmask * @handle: pointer to dma address of block * - * This returns the kernel virtual address of a currently unused block, + * Return: the kernel virtual address of a currently unused block, * and reports its dma address through the handle. * If such a memory block can't be allocated, %NULL is returned. */ @@ -498,6 +500,9 @@ static int dmam_pool_match(struct device *dev, void *res, void *match_data) * * Managed dma_pool_create(). DMA pool created with this function is * automatically destroyed on driver detach. + * + * Return: a managed dma allocation pool with the requested + * characteristics, or %NULL if one can't be created. */ struct dma_pool *dmam_pool_create(const char *name, struct device *dev, size_t size, size_t align, size_t allocation) diff --git a/mm/failslab.c b/mm/failslab.c @@ -48,18 +48,12 @@ static int __init failslab_debugfs_init(void) if (IS_ERR(dir)) return PTR_ERR(dir); - if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, - &failslab.ignore_gfp_reclaim)) - goto fail; - if (!debugfs_create_bool("cache-filter", mode, dir, - &failslab.cache_filter)) - goto fail; + debugfs_create_bool("ignore-gfp-wait", mode, dir, + &failslab.ignore_gfp_reclaim); + debugfs_create_bool("cache-filter", mode, dir, + &failslab.cache_filter); return 0; -fail: - debugfs_remove_recursive(dir); - - return -ENOMEM; } late_initcall(failslab_debugfs_init); diff --git a/mm/filemap.c b/mm/filemap.c @@ -98,8 +98,8 @@ * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->i_pages lock (try_to_unmap_one) - * ->zone_lru_lock(zone) (follow_page->mark_page_accessed) - * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page) + * ->pgdat->lru_lock (follow_page->mark_page_accessed) + * ->pgdat->lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->i_pages lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) @@ -392,6 +392,8 @@ static int filemap_check_and_keep_errors(struct address_space *mapping) * opposed to a regular memory cleansing writeback. The difference between * these two operations is that if a dirty page/buffer is encountered, it must * be waited upon, and not just skipped over. + * + * Return: %0 on success, negative error code otherwise. */ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode) @@ -438,6 +440,8 @@ EXPORT_SYMBOL(filemap_fdatawrite_range); * * This is a mostly non-blocking flush. Not suitable for data-integrity * purposes - I/O may not be started against all dirty pages. + * + * Return: %0 on success, negative error code otherwise. */ int filemap_flush(struct address_space *mapping) { @@ -453,6 +457,9 @@ EXPORT_SYMBOL(filemap_flush); * * Find at least one page in the range supplied, usually used to check if * direct writing in this range will trigger a writeback. + * + * Return: %true if at least one page exists in the specified range, + * %false otherwise. */ bool filemap_range_has_page(struct address_space *mapping, loff_t start_byte, loff_t end_byte) @@ -529,6 +536,8 @@ static void __filemap_fdatawait_range(struct address_space *mapping, * Since the error status of the address space is cleared by this function, * callers are responsible for checking the return value and handling and/or * reporting the error. + * + * Return: error status of the address space. */ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) @@ -551,6 +560,8 @@ EXPORT_SYMBOL(filemap_fdatawait_range); * Since the error status of the file is advanced by this function, * callers are responsible for checking the return value and handling and/or * reporting the error. + * + * Return: error status of the address space vs. the file->f_wb_err cursor. */ int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) { @@ -572,6 +583,8 @@ EXPORT_SYMBOL(file_fdatawait_range); * Use this function if callers don't handle errors themselves. Expected * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), * fsfreeze(8) + * + * Return: error status of the address space. */ int filemap_fdatawait_keep_errors(struct address_space *mapping) { @@ -623,6 +636,8 @@ EXPORT_SYMBOL(filemap_write_and_wait); * * Note that @lend is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). + * + * Return: error status of the address space. */ int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) @@ -678,6 +693,8 @@ EXPORT_SYMBOL(__filemap_set_wb_err); * While we handle mapping->wb_err with atomic operations, the f_wb_err * value is protected by the f_lock since we must ensure that it reflects * the latest value swapped in for this file descriptor. + * + * Return: %0 on success, negative error code otherwise. */ int file_check_and_advance_wb_err(struct file *file) { @@ -720,6 +737,8 @@ EXPORT_SYMBOL(file_check_and_advance_wb_err); * * After writing out and waiting on the data, we check and advance the * f_wb_err cursor to the latest value, and return any errors detected there. + * + * Return: %0 on success, negative error code otherwise. */ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) { @@ -753,6 +772,8 @@ EXPORT_SYMBOL(file_write_and_wait_range); * caller must do that. * * The remove + add is atomic. This function cannot fail. + * + * Return: %0 */ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) { @@ -867,6 +888,8 @@ error: * * This function is used to add a page to the pagecache. It must be locked. * This function does not add the page to the LRU. The caller must do that. + * + * Return: %0 on success, negative error code otherwise. */ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) @@ -1463,7 +1486,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); * If the slot holds a shadow entry of a previously evicted page, or a * swap entry from shmem/tmpfs, it is returned. * - * Otherwise, %NULL is returned. + * Return: the found page or shadow entry, %NULL if nothing is found. */ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { @@ -1521,9 +1544,9 @@ EXPORT_SYMBOL(find_get_entry); * If the slot holds a shadow entry of a previously evicted page, or a * swap entry from shmem/tmpfs, it is returned. * - * Otherwise, %NULL is returned. - * * find_lock_entry() may sleep. + * + * Return: the found page or shadow entry, %NULL if nothing is found. */ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) { @@ -1563,12 +1586,14 @@ EXPORT_SYMBOL(find_lock_entry); * - FGP_CREAT: If page is not present then a new page is allocated using * @gfp_mask and added to the page cache and the VM's LRU * list. The page is returned locked and with an increased - * refcount. Otherwise, NULL is returned. + * refcount. * * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even * if the GFP flags specified for FGP_CREAT are atomic. * * If there is a page cache page, it is returned with an increased refcount. + * + * Return: the found page or %NULL otherwise. */ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, int fgp_flags, gfp_t gfp_mask) @@ -1656,8 +1681,7 @@ EXPORT_SYMBOL(pagecache_get_page); * Any shadow entries of evicted pages, or swap entries from * shmem/tmpfs, are included in the returned array. * - * find_get_entries() returns the number of pages and shadow entries - * which were found. + * Return: the number of pages and shadow entries which were found. */ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, @@ -1727,8 +1751,8 @@ retry: * indexes. There may be holes in the indices due to not-present pages. * We also update @start to index the next page for the traversal. * - * find_get_pages_range() returns the number of pages which were found. If this - * number is smaller than @nr_pages, the end of specified range has been + * Return: the number of pages which were found. If this number is + * smaller than @nr_pages, the end of specified range has been * reached. */ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, @@ -1765,7 +1789,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, pages[ret] = page; if (++ret == nr_pages) { - *start = page->index + 1; + *start = xas.xa_index + 1; goto out; } continue; @@ -1801,7 +1825,7 @@ out: * find_get_pages_contig() works exactly like find_get_pages(), except * that the returned number of pages are guaranteed to be contiguous. * - * find_get_pages_contig() returns the number of pages which were found. + * Return: the number of pages which were found. */ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) @@ -1837,16 +1861,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, if (unlikely(page != xas_reload(&xas))) goto put_page; - /* - * must check mapping and index after taking the ref. - * otherwise we can get both false positives and false - * negatives, which is just confusing to the caller. - */ - if (!page->mapping || page_to_pgoff(page) != xas.xa_index) { - put_page(page); - break; - } - pages[ret] = page; if (++ret == nr_pages) break; @@ -1872,6 +1886,8 @@ EXPORT_SYMBOL(find_get_pages_contig); * * Like find_get_pages, except we only return pages which are tagged with * @tag. We update @index to index the next page for the traversal. + * + * Return: the number of pages which were found. */ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, pgoff_t end, xa_mark_t tag, unsigned int nr_pages, @@ -1911,7 +1927,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, pages[ret] = page; if (++ret == nr_pages) { - *index = page->index + 1; + *index = xas.xa_index + 1; goto out; } continue; @@ -1949,6 +1965,8 @@ EXPORT_SYMBOL(find_get_pages_range_tag); * * Like find_get_entries, except we only return entries which are tagged with * @tag. + * + * Return: the number of entries which were found. */ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, xa_mark_t tag, unsigned int nr_entries, @@ -2034,6 +2052,10 @@ static void shrink_readahead_size_eio(struct file *filp, * * This is really ugly. But the goto's actually try to clarify some * of the logic when it comes to error handling etc. + * + * Return: + * * total number of bytes copied, including those the were already @written + * * negative error code if nothing was copied */ static ssize_t generic_file_buffered_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t written) @@ -2295,6 +2317,9 @@ out: * * This is the "read_iter()" routine for all filesystems * that can use the page cache directly. + * Return: + * * number of bytes copied, even for partial reads + * * negative error code if nothing was read */ ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) @@ -2362,6 +2387,8 @@ EXPORT_SYMBOL(generic_file_read_iter); * * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. + * + * Return: %0 on success, negative error code otherwise. */ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) { @@ -2476,6 +2503,8 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, * has not been released. * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. + * + * Return: bitwise-OR of %VM_FAULT_ codes. */ vm_fault_t filemap_fault(struct vm_fault *vmf) { @@ -2861,6 +2890,8 @@ out: * not set, try to fill the page and wait for it to become unlocked. * * If the page does not get brought uptodate, return -EIO. + * + * Return: up to date page on success, ERR_PTR() on failure. */ struct page *read_cache_page(struct address_space *mapping, pgoff_t index, @@ -2881,6 +2912,8 @@ EXPORT_SYMBOL(read_cache_page); * any new page allocations done using the specified allocation flags. * * If the page does not get brought uptodate, return -EIO. + * + * Return: up to date page on success, ERR_PTR() on failure. */ struct page *read_cache_page_gfp(struct address_space *mapping, pgoff_t index, @@ -3081,7 +3114,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_NOWAIT) { /* If there are pages to writeback, return */ if (filemap_range_has_page(inode->i_mapping, pos, - pos + write_len)) + pos + write_len - 1)) return -EAGAIN; } else { written = filemap_write_and_wait_range(mapping, pos, @@ -3264,6 +3297,10 @@ EXPORT_SYMBOL(generic_perform_write); * This function does *not* take care of syncing data in case of O_SYNC write. * A caller has to handle it. This is mainly due to the fact that we want to * avoid syncing under i_mutex. + * + * Return: + * * number of bytes written, even for truncated writes + * * negative error code if no data has been written at all */ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { @@ -3348,6 +3385,10 @@ EXPORT_SYMBOL(__generic_file_write_iter); * This is a wrapper around __generic_file_write_iter() to be used by most * filesystems. It takes care of syncing the file in case of O_SYNC file * and acquires i_mutex as needed. + * Return: + * * negative error code if no data has been written at all of + * vfs_fsync_range() failed for a synchronous write + * * number of bytes written, even for truncated writes */ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { @@ -3374,8 +3415,7 @@ EXPORT_SYMBOL(generic_file_write_iter); * @gfp_mask: memory allocation flags (and I/O mode) * * The address_space is to try to release any data against the page - * (presumably at page->private). If the release was successful, return '1'. - * Otherwise return zero. + * (presumably at page->private). * * This may also be called if PG_fscache is set on a page, indicating that the * page is known to the local caching routines. @@ -3383,6 +3423,7 @@ EXPORT_SYMBOL(generic_file_write_iter); * The @gfp_mask argument specifies whether I/O may be performed to release * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). * + * Return: %1 if the release was successful, otherwise return zero. */ int try_to_release_page(struct page *page, gfp_t gfp_mask) { diff --git a/mm/gup.c b/mm/gup.c @@ -13,6 +13,9 @@ #include <linux/sched/signal.h> #include <linux/rwsem.h> #include <linux/hugetlb.h> +#include <linux/migrate.h> +#include <linux/mm_inline.h> +#include <linux/sched/mm.h> #include <asm/mmu_context.h> #include <asm/pgtable.h> @@ -1126,7 +1129,167 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages); +#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) + #ifdef CONFIG_FS_DAX +static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) +{ + long i; + struct vm_area_struct *vma_prev = NULL; + + for (i = 0; i < nr_pages; i++) { + struct vm_area_struct *vma = vmas[i]; + + if (vma == vma_prev) + continue; + + vma_prev = vma; + + if (vma_is_fsdax(vma)) + return true; + } + return false; +} +#else +static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) +{ + return false; +} +#endif + +#ifdef CONFIG_CMA +static struct page *new_non_cma_page(struct page *page, unsigned long private) +{ + /* + * We want to make sure we allocate the new page from the same node + * as the source page. + */ + int nid = page_to_nid(page); + /* + * Trying to allocate a page for migration. Ignore allocation + * failure warnings. We don't force __GFP_THISNODE here because + * this node here is the node where we have CMA reservation and + * in some case these nodes will have really less non movable + * allocation memory. + */ + gfp_t gfp_mask = GFP_USER | __GFP_NOWARN; + + if (PageHighMem(page)) + gfp_mask |= __GFP_HIGHMEM; + +#ifdef CONFIG_HUGETLB_PAGE + if (PageHuge(page)) { + struct hstate *h = page_hstate(page); + /* + * We don't want to dequeue from the pool because pool pages will + * mostly be from the CMA region. + */ + return alloc_migrate_huge_page(h, gfp_mask, nid, NULL); + } +#endif + if (PageTransHuge(page)) { + struct page *thp; + /* + * ignore allocation failure warnings + */ + gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN; + + /* + * Remove the movable mask so that we don't allocate from + * CMA area again. + */ + thp_gfpmask &= ~__GFP_MOVABLE; + thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER); + if (!thp) + return NULL; + prep_transhuge_page(thp); + return thp; + } + + return __alloc_pages_node(nid, gfp_mask, 0); +} + +static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, + unsigned int gup_flags, + struct page **pages, + struct vm_area_struct **vmas) +{ + long i; + bool drain_allow = true; + bool migrate_allow = true; + LIST_HEAD(cma_page_list); + +check_again: + for (i = 0; i < nr_pages; i++) { + /* + * If we get a page from the CMA zone, since we are going to + * be pinning these entries, we might as well move them out + * of the CMA zone if possible. + */ + if (is_migrate_cma_page(pages[i])) { + + struct page *head = compound_head(pages[i]); + + if (PageHuge(head)) { + isolate_huge_page(head, &cma_page_list); + } else { + if (!PageLRU(head) && drain_allow) { + lru_add_drain_all(); + drain_allow = false; + } + + if (!isolate_lru_page(head)) { + list_add_tail(&head->lru, &cma_page_list); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + + page_is_file_cache(head), + hpage_nr_pages(head)); + } + } + } + } + + if (!list_empty(&cma_page_list)) { + /* + * drop the above get_user_pages reference. + */ + for (i = 0; i < nr_pages; i++) + put_page(pages[i]); + + if (migrate_pages(&cma_page_list, new_non_cma_page, + NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) { + /* + * some of the pages failed migration. Do get_user_pages + * without migration. + */ + migrate_allow = false; + + if (!list_empty(&cma_page_list)) + putback_movable_pages(&cma_page_list); + } + /* + * We did migrate all the pages, Try to get the page references again + * migrating any new CMA pages which we failed to isolate earlier. + */ + nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas); + if ((nr_pages > 0) && migrate_allow) { + drain_allow = true; + goto check_again; + } + } + + return nr_pages; +} +#else +static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages, + unsigned int gup_flags, + struct page **pages, + struct vm_area_struct **vmas) +{ + return nr_pages; +} +#endif + /* * This is the same as get_user_pages() in that it assumes we are * operating on the current task's mm, but it goes further to validate @@ -1140,11 +1303,11 @@ EXPORT_SYMBOL(get_user_pages); * Contrast this to iov_iter_get_pages() usages which are transient. */ long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas_arg) + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas_arg) { struct vm_area_struct **vmas = vmas_arg; - struct vm_area_struct *vma_prev = NULL; + unsigned long flags; long rc, i; if (!pages) @@ -1157,31 +1320,20 @@ long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, return -ENOMEM; } + flags = memalloc_nocma_save(); rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); + memalloc_nocma_restore(flags); + if (rc < 0) + goto out; - for (i = 0; i < rc; i++) { - struct vm_area_struct *vma = vmas[i]; - - if (vma == vma_prev) - continue; - - vma_prev = vma; - - if (vma_is_fsdax(vma)) - break; - } - - /* - * Either get_user_pages() failed, or the vma validation - * succeeded, in either case we don't need to put_page() before - * returning. - */ - if (i >= rc) + if (check_dax_vmas(vmas, rc)) { + for (i = 0; i < rc; i++) + put_page(pages[i]); + rc = -EOPNOTSUPP; goto out; + } - for (i = 0; i < rc; i++) - put_page(pages[i]); - rc = -EOPNOTSUPP; + rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas); out: if (vmas != vmas_arg) kfree(vmas); diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c @@ -122,12 +122,8 @@ static const struct file_operations gup_benchmark_fops = { static int gup_benchmark_init(void) { - void *ret; - - ret = debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL, - &gup_benchmark_fops); - if (!ret) - pr_warn("Failed to create gup_benchmark in debugfs"); + debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL, + &gup_benchmark_fops); return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c @@ -33,6 +33,7 @@ #include <linux/page_idle.h> #include <linux/shmem_fs.h> #include <linux/oom.h> +#include <linux/numa.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -616,6 +617,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); + count_memcg_events(memcg, THP_FAULT_ALLOC, 1); } return 0; @@ -1337,6 +1339,7 @@ alloc: } count_vm_event(THP_FAULT_ALLOC); + count_memcg_events(memcg, THP_FAULT_ALLOC, 1); if (!page) clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); @@ -1475,7 +1478,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) struct anon_vma *anon_vma = NULL; struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - int page_nid = -1, this_nid = numa_node_id(); + int page_nid = NUMA_NO_NODE, this_nid = numa_node_id(); int target_nid, last_cpupid = -1; bool page_locked; bool migrated = false; @@ -1520,7 +1523,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) */ page_locked = trylock_page(page); target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == -1) { + if (target_nid == NUMA_NO_NODE) { /* If the page was locked, there are no parallel migrations */ if (page_locked) goto clear_pmdnuma; @@ -1528,7 +1531,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { - page_nid = -1; + page_nid = NUMA_NO_NODE; if (!get_page_unless_zero(page)) goto out_unlock; spin_unlock(vmf->ptl); @@ -1549,14 +1552,14 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) if (unlikely(!pmd_same(pmd, *vmf->pmd))) { unlock_page(page); put_page(page); - page_nid = -1; + page_nid = NUMA_NO_NODE; goto out_unlock; } /* Bail if we fail to protect against THP splits for any reason */ if (unlikely(!anon_vma)) { put_page(page); - page_nid = -1; + page_nid = NUMA_NO_NODE; goto clear_pmdnuma; } @@ -1618,7 +1621,7 @@ out: if (anon_vma) page_unlock_anon_vma_read(anon_vma); - if (page_nid != -1) + if (page_nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); @@ -1979,7 +1982,6 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr) { - pud_t orig_pud; spinlock_t *ptl; ptl = __pud_trans_huge_lock(pud, vma); @@ -1991,8 +1993,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, * pgtable_trans_huge_withdraw after finishing pudp related * operations. */ - orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud, - tlb->fullmm); + pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm); tlb_remove_pud_tlb_entry(tlb, pud, addr); if (vma_is_dax(vma)) { spin_unlock(ptl); @@ -2437,11 +2438,11 @@ static void __split_huge_page(struct page *page, struct list_head *list, pgoff_t end, unsigned long flags) { struct page *head = compound_head(page); - struct zone *zone = page_zone(head); + pg_data_t *pgdat = page_pgdat(head); struct lruvec *lruvec; int i; - lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat); + lruvec = mem_cgroup_page_lruvec(head, pgdat); /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); @@ -2472,7 +2473,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, xa_unlock(&head->mapping->i_pages); } - spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); + spin_unlock_irqrestore(&pgdat->lru_lock, flags); remap_page(head); @@ -2683,7 +2684,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) lru_add_drain(); /* prevent PageLRU to go away from under us, and freeze lru stats */ - spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags); + spin_lock_irqsave(&pgdata->lru_lock, flags); if (mapping) { XA_STATE(xas, &mapping->i_pages, page_index(head)); @@ -2728,7 +2729,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) spin_unlock(&pgdata->split_queue_lock); fail: if (mapping) xa_unlock(&mapping->i_pages); - spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); + spin_unlock_irqrestore(&pgdata->lru_lock, flags); remap_page(head); ret = -EBUSY; } @@ -2886,12 +2887,8 @@ DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, static int __init split_huge_pages_debugfs(void) { - void *ret; - - ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL, - &split_huge_pages_fops); - if (!ret) - pr_warn("Failed to create split_huge_pages in debugfs"); + debugfs_create_file("split_huge_pages", 0200, NULL, NULL, + &split_huge_pages_fops); return 0; } late_initcall(split_huge_pages_debugfs); diff --git a/mm/hugetlb.c b/mm/hugetlb.c @@ -25,6 +25,7 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/jhash.h> +#include <linux/numa.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -887,7 +888,7 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, struct zonelist *zonelist; struct zone *zone; struct zoneref *z; - int node = -1; + int node = NUMA_NO_NODE; zonelist = node_zonelist(nid, gfp_mask); @@ -919,7 +920,7 @@ retry_cpuset: /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { - if (hugepage_migration_supported(h)) + if (hugepage_movable_supported(h)) return GFP_HIGHUSER_MOVABLE; else return GFP_HIGHUSER; @@ -1586,8 +1587,8 @@ out_unlock: return page; } -static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nmask) +struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nmask) { struct page *page; @@ -4398,10 +4399,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, continue; } if (!huge_pte_none(pte)) { - pte = huge_ptep_get_and_clear(mm, address, ptep); - pte = pte_mkhuge(huge_pte_modify(pte, newprot)); + pte_t old_pte; + + old_pte = huge_ptep_modify_prot_start(vma, address, ptep); + pte = pte_mkhuge(huge_pte_modify(old_pte, newprot)); pte = arch_make_huge_pte(pte, vma, NULL, 0); - set_huge_pte_at(mm, address, ptep, pte); + huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; } spin_unlock(ptl); diff --git a/mm/internal.h b/mm/internal.h @@ -163,6 +163,7 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, extern int __isolate_free_page(struct page *page, unsigned int order); extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); +extern void __free_pages_core(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); @@ -183,14 +184,16 @@ extern int user_min_free_kbytes; struct compact_control { struct list_head freepages; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ + unsigned int nr_freepages; /* Number of isolated free pages */ + unsigned int nr_migratepages; /* Number of pages to migrate */ + unsigned long free_pfn; /* isolate_freepages search base */ + unsigned long migrate_pfn; /* isolate_migratepages search base */ + unsigned long fast_start_pfn; /* a pfn to start linear scan from */ struct zone *zone; - unsigned long nr_freepages; /* Number of isolated free pages */ - unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long total_migrate_scanned; unsigned long total_free_scanned; - unsigned long free_pfn; /* isolate_freepages search base */ - unsigned long migrate_pfn; /* isolate_migratepages search base */ - unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ + unsigned short fast_search_fail;/* failures to use free list searches */ + short search_order; /* order to start a fast search at */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ int order; /* order a direct compactor needs */ int migratetype; /* migratetype of direct compactor */ @@ -203,7 +206,16 @@ struct compact_control { bool direct_compaction; /* False from kcompactd or /proc/... */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock or sched contention */ - bool finishing_block; /* Finishing current pageblock */ + bool rescan; /* Rescanning the same pageblock */ +}; + +/* + * Used in direct compaction when a page should be taken from the freelists + * immediately when one is created during the free path. + */ +struct capture_control { + struct compact_control *cc; + struct page *page; }; unsigned long diff --git a/mm/kasan/common.c b/mm/kasan/common.c @@ -14,6 +14,8 @@ * */ +#define __KASAN_INTERNAL + #include <linux/export.h> #include <linux/interrupt.h> #include <linux/init.h> diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c @@ -275,25 +275,6 @@ EXPORT_SYMBOL(__asan_storeN_noabort); void __asan_handle_no_return(void) {} EXPORT_SYMBOL(__asan_handle_no_return); -/* Emitted by compiler to poison large objects when they go out of scope. */ -void __asan_poison_stack_memory(const void *addr, size_t size) -{ - /* - * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded - * by redzones, so we simply round up size to simplify logic. - */ - kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE), - KASAN_USE_AFTER_SCOPE); -} -EXPORT_SYMBOL(__asan_poison_stack_memory); - -/* Emitted by compiler to unpoison large objects when they go into scope. */ -void __asan_unpoison_stack_memory(const void *addr, size_t size) -{ - kasan_unpoison_shadow(addr, size); -} -EXPORT_SYMBOL(__asan_unpoison_stack_memory); - /* Emitted by compiler to poison alloca()ed objects. */ void __asan_alloca_poison(unsigned long addr, size_t size) { diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c @@ -82,9 +82,6 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info) case KASAN_KMALLOC_FREE: bug_type = "use-after-free"; break; - case KASAN_USE_AFTER_SCOPE: - bug_type = "use-after-scope"; - break; case KASAN_ALLOCA_LEFT: case KASAN_ALLOCA_RIGHT: bug_type = "alloca-out-of-bounds"; diff --git a/mm/kasan/init.c b/mm/kasan/init.c @@ -42,7 +42,7 @@ static inline bool kasan_p4d_table(pgd_t pgd) #else static inline bool kasan_p4d_table(pgd_t pgd) { - return 0; + return false; } #endif #if CONFIG_PGTABLE_LEVELS > 3 @@ -54,7 +54,7 @@ static inline bool kasan_pud_table(p4d_t p4d) #else static inline bool kasan_pud_table(p4d_t p4d) { - return 0; + return false; } #endif #if CONFIG_PGTABLE_LEVELS > 2 @@ -66,7 +66,7 @@ static inline bool kasan_pmd_table(pud_t pud) #else static inline bool kasan_pmd_table(pud_t pud) { - return 0; + return false; } #endif pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h @@ -34,7 +34,6 @@ #define KASAN_STACK_MID 0xF2 #define KASAN_STACK_RIGHT 0xF3 #define KASAN_STACK_PARTIAL 0xF4 -#define KASAN_USE_AFTER_SCOPE 0xF8 /* * alloca redzone shadow values @@ -187,8 +186,6 @@ void __asan_unregister_globals(struct kasan_global *globals, size_t size); void __asan_loadN(unsigned long addr, size_t size); void __asan_storeN(unsigned long addr, size_t size); void __asan_handle_no_return(void); -void __asan_poison_stack_memory(const void *addr, size_t size); -void __asan_unpoison_stack_memory(const void *addr, size_t size); void __asan_alloca_poison(unsigned long addr, size_t size); void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom); diff --git a/mm/khugepaged.c b/mm/khugepaged.c @@ -1074,6 +1074,7 @@ static void collapse_huge_page(struct mm_struct *mm, BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address, true); mem_cgroup_commit_charge(new_page, memcg, false, true); + count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); @@ -1502,6 +1503,7 @@ xa_unlocked: page_ref_add(new_page, HPAGE_PMD_NR - 1); set_page_dirty(new_page); mem_cgroup_commit_charge(new_page, memcg, false, true); + count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); lru_cache_add_anon(new_page); /* diff --git a/mm/ksm.c b/mm/ksm.c @@ -598,7 +598,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, chain->chain_prune_time = jiffies; chain->rmap_hlist_len = STABLE_NODE_CHAIN; #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) - chain->nid = -1; /* debug */ + chain->nid = NUMA_NO_NODE; /* debug */ #endif ksm_stable_node_chains++; @@ -667,6 +667,12 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) free_stable_node(stable_node); } +enum get_ksm_page_flags { + GET_KSM_PAGE_NOLOCK, + GET_KSM_PAGE_LOCK, + GET_KSM_PAGE_TRYLOCK +}; + /* * get_ksm_page: checks if the page indicated by the stable node * is still its ksm page, despite having held no reference to it. @@ -686,7 +692,8 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) * a page to put something that might look like our key in page->mapping. * is on its way to being freed; but it is an anomaly to bear in mind. */ -static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) +static struct page *get_ksm_page(struct stable_node *stable_node, + enum get_ksm_page_flags flags) { struct page *page; void *expected_mapping; @@ -706,8 +713,9 @@ again: * case this node is no longer referenced, and should be freed; * however, it might mean that the page is under page_ref_freeze(). * The __remove_mapping() case is easy, again the node is now stale; - * but if page is swapcache in migrate_page_move_mapping(), it might - * still be our page, in which case it's essential to keep the node. + * the same is in reuse_ksm_page() case; but if page is swapcache + * in migrate_page_move_mapping(), it might still be our page, + * in which case it's essential to keep the node. */ while (!get_page_unless_zero(page)) { /* @@ -728,8 +736,15 @@ again: goto stale; } - if (lock_it) { + if (flags == GET_KSM_PAGE_TRYLOCK) { + if (!trylock_page(page)) { + put_page(page); + return ERR_PTR(-EBUSY); + } + } else if (flags == GET_KSM_PAGE_LOCK) lock_page(page); + + if (flags != GET_KSM_PAGE_NOLOCK) { if (READ_ONCE(page->mapping) != expected_mapping) { unlock_page(page); put_page(page); @@ -763,7 +778,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) struct page *page; stable_node = rmap_item->head; - page = get_ksm_page(stable_node, true); + page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); if (!page) goto out; @@ -863,7 +878,7 @@ static int remove_stable_node(struct stable_node *stable_node) struct page *page; int err; - page = get_ksm_page(stable_node, true); + page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); if (!page) { /* * get_ksm_page did remove_node_from_stable_tree itself. @@ -1385,7 +1400,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup, * stable_node parameter itself will be freed from * under us if it returns NULL. */ - _tree_page = get_ksm_page(dup, false); + _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK); if (!_tree_page) continue; nr += 1; @@ -1508,7 +1523,7 @@ static struct page *__stable_node_chain(struct stable_node **_stable_node_dup, if (!is_stable_node_chain(stable_node)) { if (is_page_sharing_candidate(stable_node)) { *_stable_node_dup = stable_node; - return get_ksm_page(stable_node, false); + return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK); } /* * _stable_node_dup set to NULL means the stable_node @@ -1613,7 +1628,8 @@ again: * wrprotected at all times. Any will work * fine to continue the walk. */ - tree_page = get_ksm_page(stable_node_any, false); + tree_page = get_ksm_page(stable_node_any, + GET_KSM_PAGE_NOLOCK); } VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); if (!tree_page) { @@ -1673,7 +1689,12 @@ again: * It would be more elegant to return stable_node * than kpage, but that involves more changes. */ - tree_page = get_ksm_page(stable_node_dup, true); + tree_page = get_ksm_page(stable_node_dup, + GET_KSM_PAGE_TRYLOCK); + + if (PTR_ERR(tree_page) == -EBUSY) + return ERR_PTR(-EBUSY); + if (unlikely(!tree_page)) /* * The tree may have been rebalanced, @@ -1842,7 +1863,8 @@ again: * wrprotected at all times. Any will work * fine to continue the walk. */ - tree_page = get_ksm_page(stable_node_any, false); + tree_page = get_ksm_page(stable_node_any, + GET_KSM_PAGE_NOLOCK); } VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); if (!tree_page) { @@ -2068,6 +2090,9 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) remove_rmap_item_from_tree(rmap_item); if (kpage) { + if (PTR_ERR(kpage) == -EBUSY) + return; + err = try_to_merge_with_ksm_page(rmap_item, page, kpage); if (!err) { /* @@ -2242,7 +2267,8 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { - page = get_ksm_page(stable_node, false); + page = get_ksm_page(stable_node, + GET_KSM_PAGE_NOLOCK); if (page) put_page(page); cond_resched(); @@ -2642,6 +2668,31 @@ again: goto again; } +bool reuse_ksm_page(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ +#ifdef CONFIG_DEBUG_VM + if (WARN_ON(is_zero_pfn(page_to_pfn(page))) || + WARN_ON(!page_mapped(page)) || + WARN_ON(!PageLocked(page))) { + dump_page(page, "reuse_ksm_page"); + return false; + } +#endif + + if (PageSwapCache(page) || !page_stable_node(page)) + return false; + /* Prohibit parallel get_ksm_page() */ + if (!page_ref_freeze(page, 1)) + return false; + + page_move_anon_rmap(page, vma); + page->index = linear_page_index(vma, address); + page_ref_unfreeze(page, 1); + + return true; +} #ifdef CONFIG_MIGRATION void ksm_migrate_page(struct page *newpage, struct page *oldpage) { diff --git a/mm/list_lru.c b/mm/list_lru.c @@ -601,7 +601,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct lock_class_key *key, struct shrinker *shrinker) { int i; - size_t size = sizeof(*lru->node) * nr_node_ids; int err = -ENOMEM; #ifdef CONFIG_MEMCG_KMEM @@ -612,7 +611,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, #endif memcg_get_cache_ids(); - lru->node = kzalloc(size, GFP_KERNEL); + lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL); if (!lru->node) goto out; diff --git a/mm/memblock.c b/mm/memblock.c @@ -2005,8 +2005,7 @@ DEFINE_SHOW_ATTRIBUTE(memblock_debug); static int __init memblock_init_debugfs(void) { struct dentry *root = debugfs_create_dir("memblock", NULL); - if (!root) - return -ENXIO; + debugfs_create_file("memory", 0444, root, &memblock.memory, &memblock_debug_fops); debugfs_create_file("reserved", 0444, root, diff --git a/mm/memcontrol.c b/mm/memcontrol.c @@ -39,6 +39,7 @@ #include <linux/shmem_fs.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> +#include <linux/vm_event_item.h> #include <linux/smp.h> #include <linux/page-flags.h> #include <linux/backing-dev.h> @@ -248,6 +249,12 @@ enum res_type { iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) +static inline bool should_force_charge(void) +{ + return tsk_is_oom_victim(current) || fatal_signal_pending(current) || + (current->flags & PF_EXITING); +} + /* Some nice accessors for the vmpressure. */ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) { @@ -1389,8 +1396,13 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, }; bool ret; - mutex_lock(&oom_lock); - ret = out_of_memory(&oc); + if (mutex_lock_killable(&oom_lock)) + return true; + /* + * A few threads which were not waiting at mutex_lock_killable() can + * fail to bail out. Therefore, check again after holding oom_lock. + */ + ret = should_force_charge() || out_of_memory(&oc); mutex_unlock(&oom_lock); return ret; } @@ -2209,9 +2221,7 @@ retry: * bypass the last charges so that they can exit quickly and * free their memory. */ - if (unlikely(tsk_is_oom_victim(current) || - fatal_signal_pending(current) || - current->flags & PF_EXITING)) + if (unlikely(should_force_charge())) goto force; /* @@ -2352,13 +2362,13 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) static void lock_page_lru(struct page *page, int *isolated) { - struct zone *zone = page_zone(page); + pg_data_t *pgdat = page_pgdat(page); - spin_lock_irq(zone_lru_lock(zone)); + spin_lock_irq(&pgdat->lru_lock); if (PageLRU(page)) { struct lruvec *lruvec; - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); + lruvec = mem_cgroup_page_lruvec(page, pgdat); ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_lru(page)); *isolated = 1; @@ -2368,17 +2378,17 @@ static void lock_page_lru(struct page *page, int *isolated) static void unlock_page_lru(struct page *page, int isolated) { - struct zone *zone = page_zone(page); + pg_data_t *pgdat = page_pgdat(page); if (isolated) { struct lruvec *lruvec; - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); + lruvec = mem_cgroup_page_lruvec(page, pgdat); VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); add_page_to_lru_list(page, lruvec, page_lru(page)); } - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); } static void commit_charge(struct page *page, struct mem_cgroup *memcg, @@ -2573,7 +2583,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep) } /** - * memcg_kmem_charge_memcg: charge a kmem page + * __memcg_kmem_charge_memcg: charge a kmem page * @page: page to charge * @gfp: reclaim mode * @order: allocation order @@ -2581,7 +2591,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep) * * Returns 0 on success, an error code on failure. */ -int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, +int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct mem_cgroup *memcg) { unsigned int nr_pages = 1 << order; @@ -2604,24 +2614,24 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, } /** - * memcg_kmem_charge: charge a kmem page to the current memory cgroup + * __memcg_kmem_charge: charge a kmem page to the current memory cgroup * @page: page to charge * @gfp: reclaim mode * @order: allocation order * * Returns 0 on success, an error code on failure. */ -int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { struct mem_cgroup *memcg; int ret = 0; - if (mem_cgroup_disabled() || memcg_kmem_bypass()) + if (memcg_kmem_bypass()) return 0; memcg = get_mem_cgroup_from_current(); if (!mem_cgroup_is_root(memcg)) { - ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); + ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); if (!ret) __SetPageKmemcg(page); } @@ -2629,11 +2639,11 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) return ret; } /** - * memcg_kmem_uncharge: uncharge a kmem page + * __memcg_kmem_uncharge: uncharge a kmem page * @page: page to uncharge * @order: allocation order */ -void memcg_kmem_uncharge(struct page *page, int order) +void __memcg_kmem_uncharge(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; unsigned int nr_pages = 1 << order; @@ -2664,7 +2674,7 @@ void memcg_kmem_uncharge(struct page *page, int order) /* * Because tail pages are not marked as "used", set it. We're under - * zone_lru_lock and migration entries setup in all page mappings. + * pgdat->lru_lock and migration entries setup in all page mappings. */ void mem_cgroup_split_huge_fixup(struct page *head) { @@ -3337,7 +3347,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) const struct numa_stat *stat; int nid; unsigned long nr; - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); @@ -3388,7 +3398,7 @@ static const char *const memcg1_event_names[] = { static int memcg_stat_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); unsigned long memory, memsw; struct mem_cgroup *mi; unsigned int i; @@ -3626,8 +3636,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, size = thresholds->primary ? thresholds->primary->size + 1 : 1; /* Allocate memory for new array of thresholds */ - new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), - GFP_KERNEL); + new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); if (!new) { ret = -ENOMEM; goto unlock; @@ -3821,7 +3830,7 @@ static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); @@ -4420,7 +4429,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; - size_t size; + unsigned int size; int node; size = sizeof(struct mem_cgroup); @@ -5354,6 +5363,16 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) root_mem_cgroup->use_hierarchy = false; } +static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) +{ + if (value == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); + + return 0; +} + static u64 memory_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -5364,15 +5383,8 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, static int memory_min_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long min = READ_ONCE(memcg->memory.min); - - if (min == PAGE_COUNTER_MAX) - seq_puts(m, "max\n"); - else - seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE); - - return 0; + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); } static ssize_t memory_min_write(struct kernfs_open_file *of, @@ -5394,15 +5406,8 @@ static ssize_t memory_min_write(struct kernfs_open_file *of, static int memory_low_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long low = READ_ONCE(memcg->memory.low); - - if (low == PAGE_COUNTER_MAX) - seq_puts(m, "max\n"); - else - seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); - - return 0; + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); } static ssize_t memory_low_write(struct kernfs_open_file *of, @@ -5424,15 +5429,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, static int memory_high_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long high = READ_ONCE(memcg->high); - - if (high == PAGE_COUNTER_MAX) - seq_puts(m, "max\n"); - else - seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); - - return 0; + return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high)); } static ssize_t memory_high_write(struct kernfs_open_file *of, @@ -5461,15 +5458,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, static int memory_max_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = READ_ONCE(memcg->memory.max); - - if (max == PAGE_COUNTER_MAX) - seq_puts(m, "max\n"); - else - seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); - - return 0; + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); } static ssize_t memory_max_write(struct kernfs_open_file *of, @@ -5523,7 +5513,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, static int memory_events_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); seq_printf(m, "low %lu\n", atomic_long_read(&memcg->memory_events[MEMCG_LOW])); @@ -5541,7 +5531,7 @@ static int memory_events_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); struct accumulated_stats acc; int i; @@ -5582,6 +5572,15 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "file_writeback %llu\n", (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE); + /* + * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter + * with the NR_ANON_THP vm counter, but right now it's a pain in the + * arse because it requires migrating the work out of rmap to a place + * where the page->mem_cgroup is set up and stable. + */ + seq_printf(m, "anon_thp %llu\n", + (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE); + for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], (u64)acc.lru_pages[i] * PAGE_SIZE); @@ -5613,12 +5612,18 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]); + seq_printf(m, "thp_collapse_alloc %lu\n", + acc.events[THP_COLLAPSE_ALLOC]); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + return 0; } static int memory_oom_group_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); seq_printf(m, "%d\n", memcg->oom_group); @@ -5747,7 +5752,7 @@ struct cgroup_subsys memory_cgrp_subsys = { * * | memory.current, if memory.current < memory.low * low_usage = | - | 0, otherwise. + * | 0, otherwise. * * * Such definition of the effective memory.low provides the expected @@ -6601,15 +6606,8 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, static int swap_max_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = READ_ONCE(memcg->swap.max); - - if (max == PAGE_COUNTER_MAX) - seq_puts(m, "max\n"); - else - seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); - - return 0; + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->swap.max)); } static ssize_t swap_max_write(struct kernfs_open_file *of, @@ -6631,7 +6629,7 @@ static ssize_t swap_max_write(struct kernfs_open_file *of, static int swap_events_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); seq_printf(m, "max %lu\n", atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); diff --git a/mm/memfd.c b/mm/memfd.c @@ -131,7 +131,8 @@ static unsigned int *memfd_file_seals_ptr(struct file *file) #define F_ALL_SEALS (F_SEAL_SEAL | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ - F_SEAL_WRITE) + F_SEAL_WRITE | \ + F_SEAL_FUTURE_WRITE) static int memfd_add_seals(struct file *file, unsigned int seals) { diff --git a/mm/memory-failure.c b/mm/memory-failure.c @@ -1825,19 +1825,17 @@ static int soft_offline_in_use_page(struct page *page, int flags) struct page *hpage = compound_head(page); if (!PageHuge(page) && PageTransHuge(hpage)) { - lock_page(hpage); - if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { - unlock_page(hpage); - if (!PageAnon(hpage)) + lock_page(page); + if (!PageAnon(page) || unlikely(split_huge_page(page))) { + unlock_page(page); + if (!PageAnon(page)) pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page)); else pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page)); - put_hwpoison_page(hpage); + put_hwpoison_page(page); return -EBUSY; } - unlock_page(hpage); - get_hwpoison_page(page); - put_hwpoison_page(hpage); + unlock_page(page); } /* diff --git a/mm/memory.c b/mm/memory.c @@ -69,6 +69,7 @@ #include <linux/userfaultfd_k.h> #include <linux/dax.h> #include <linux/oom.h> +#include <linux/numa.h> #include <asm/io.h> #include <asm/mmu_context.h> @@ -1451,7 +1452,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, spinlock_t *ptl; retval = -EINVAL; - if (PageAnon(page)) + if (PageAnon(page) || PageSlab(page) || page_has_type(page)) goto out; retval = -ENOMEM; flush_dcache_page(page); @@ -1503,6 +1504,8 @@ out: * under mm->mmap_sem write-lock, so it can change vma->vm_flags. * Caller must set VM_MIXEDMAP on vma if it wants to call this * function from other places, for example from page-fault handler. + * + * Return: %0 on success, negative error code otherwise. */ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) @@ -1830,7 +1833,9 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, * @size: size of map area * @prot: page protection flags for this mapping * - * Note: this is only safe if the mm semaphore is held when called. + * Note: this is only safe if the mm semaphore is held when called. + * + * Return: %0 on success, negative error code otherwise. */ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) @@ -1903,6 +1908,8 @@ EXPORT_SYMBOL(remap_pfn_range); * * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get * whatever write-combining details or similar. + * + * Return: %0 on success, negative error code otherwise. */ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) { @@ -2381,12 +2388,13 @@ oom: * * This function handles all that is needed to finish a write page fault in a * shared mapping due to PTE being read-only once the mapped page is prepared. - * It handles locking of PTE and modifying it. The function returns - * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE - * lock. + * It handles locking of PTE and modifying it. * * The function expects the page to be locked or other protection against * concurrent faults / writeback (such as DAX radix tree locks). + * + * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before + * we acquired PTE lock. */ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) { @@ -2504,8 +2512,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ - if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { + if (PageAnon(vmf->page)) { int total_map_swapcount; + if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) || + page_count(vmf->page) != 1)) + goto copy; if (!trylock_page(vmf->page)) { get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2520,6 +2531,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) } put_page(vmf->page); } + if (PageKsm(vmf->page)) { + bool reused = reuse_ksm_page(vmf->page, vmf->vma, + vmf->address); + unlock_page(vmf->page); + if (!reused) + goto copy; + wp_page_reuse(vmf); + return VM_FAULT_WRITE; + } if (reuse_swap_page(vmf->page, &total_map_swapcount)) { if (total_map_swapcount == 1) { /* @@ -2540,7 +2560,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) (VM_WRITE|VM_SHARED))) { return wp_page_shared(vmf); } - +copy: /* * Ok, we need to copy. Oh, well.. */ @@ -3201,6 +3221,8 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * * Target users are page handler itself and implementations of * vm_ops->map_pages. + * + * Return: %0 on success, %VM_FAULT_ code in case of error. */ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page) @@ -3261,11 +3283,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, * This function handles all that is needed to finish a page fault once the * page to fault in is prepared. It handles locking of PTEs, inserts PTE for * given page, adds reverse page mapping, handles memcg charges and LRU - * addition. The function returns 0 on success, VM_FAULT_ code in case of - * error. + * addition. * * The function expects the page to be locked and on success it consumes a * reference of a page being mapped (for the PTE which maps it). + * + * Return: %0 on success, %VM_FAULT_ code in case of error. */ vm_fault_t finish_fault(struct vm_fault *vmf) { @@ -3321,12 +3344,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops, static int __init fault_around_debugfs(void) { - void *ret; - - ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, - &fault_around_bytes_fops); - if (!ret) - pr_warn("Failed to create fault_around_bytes in debugfs"); + debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, + &fault_around_bytes_fops); return 0; } late_initcall(fault_around_debugfs); @@ -3517,10 +3536,13 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) * but allow concurrent faults). * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). + * If mmap_sem is released, vma may become invalid (for example + * by other thread calling munmap()). */ static vm_fault_t do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + struct mm_struct *vm_mm = vma->vm_mm; vm_fault_t ret; /* @@ -3561,7 +3583,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf) /* preallocated pagetable is unused: free it */ if (vmf->prealloc_pte) { - pte_free(vma->vm_mm, vmf->prealloc_pte); + pte_free(vm_mm, vmf->prealloc_pte); vmf->prealloc_pte = NULL; } return ret; @@ -3586,11 +3608,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; - int page_nid = -1; + int page_nid = NUMA_NO_NODE; int last_cpupid; int target_nid; bool migrated = false; - pte_t pte; + pte_t pte, old_pte; bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; @@ -3610,12 +3632,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * Make it present again, Depending on how arch implementes non * accessible ptes, some can allow access by kernel mode. */ - pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte); - pte = pte_modify(pte, vma->vm_page_prot); + old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); + pte = pte_modify(old_pte, vma->vm_page_prot); pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); - ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte); + ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); update_mmu_cache(vma, vmf->address, vmf->pte); page = vm_normal_page(vma, vmf->address, pte); @@ -3653,7 +3675,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, &flags); pte_unmap_unlock(vmf->pte, vmf->ptl); - if (target_nid == -1) { + if (target_nid == NUMA_NO_NODE) { put_page(page); goto out; } @@ -3667,7 +3689,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) flags |= TNF_MIGRATE_FAIL; out: - if (page_nid != -1) + if (page_nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, page_nid, 1, flags); return 0; } @@ -4150,7 +4172,7 @@ EXPORT_SYMBOL(follow_pte_pmd); * * Only IO mappings and raw PFN mappings are allowed. * - * Returns zero and the pfn at @pfn on success, -ve otherwise. + * Return: zero and the pfn at @pfn on success, -ve otherwise. */ int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn) @@ -4300,6 +4322,8 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. + * + * Return: number of bytes copied from source to destination. */ int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c @@ -47,7 +47,7 @@ * and restore_online_page_callback() for generic callback restore. */ -static void generic_online_page(struct page *page); +static void generic_online_page(struct page *page, unsigned int order); static online_page_callback_t online_page_callback = generic_online_page; static DEFINE_MUTEX(online_page_callback_lock); @@ -656,26 +656,40 @@ void __online_page_free(struct page *page) } EXPORT_SYMBOL_GPL(__online_page_free); -static void generic_online_page(struct page *page) +static void generic_online_page(struct page *page, unsigned int order) { - __online_page_set_limits(page); - __online_page_increment_counters(page); - __online_page_free(page); + kernel_map_pages(page, 1 << order, 1); + __free_pages_core(page, order); + totalram_pages_add(1UL << order); +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages_add(1UL << order); +#endif +} + +static int online_pages_blocks(unsigned long start, unsigned long nr_pages) +{ + unsigned long end = start + nr_pages; + int order, onlined_pages = 0; + + while (start < end) { + order = min(MAX_ORDER - 1, + get_order(PFN_PHYS(end) - PFN_PHYS(start))); + (*online_page_callback)(pfn_to_page(start), order); + + onlined_pages += (1UL << order); + start += (1UL << order); + } + return onlined_pages; } static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, void *arg) { - unsigned long i; unsigned long onlined_pages = *(unsigned long *)arg; - struct page *page; if (PageReserved(pfn_to_page(start_pfn))) - for (i = 0; i < nr_pages; i++) { - page = pfn_to_page(start_pfn + i); - (*online_page_callback)(page); - onlined_pages++; - } + onlined_pages += online_pages_blocks(start_pfn, nr_pages); online_mem_sections(start_pfn, start_pfn + nr_pages); @@ -689,9 +703,9 @@ static void node_states_check_changes_online(unsigned long nr_pages, { int nid = zone_to_nid(zone); - arg->status_change_nid = -1; - arg->status_change_nid_normal = -1; - arg->status_change_nid_high = -1; + arg->status_change_nid = NUMA_NO_NODE; + arg->status_change_nid_normal = NUMA_NO_NODE; + arg->status_change_nid_high = NUMA_NO_NODE; if (!node_state(nid, N_MEMORY)) arg->status_change_nid = nid; @@ -1365,12 +1379,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (PageHuge(page)) { struct page *head = compound_head(page); - pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; if (compound_order(head) > PFN_SECTION_SHIFT) { ret = -EBUSY; break; } - isolate_huge_page(page, &source); + pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; + isolate_huge_page(head, &source); continue; } else if (PageTransHuge(page)) pfn = page_to_pfn(compound_head(page)) @@ -1496,9 +1510,9 @@ static void node_states_check_changes_offline(unsigned long nr_pages, unsigned long present_pages = 0; enum zone_type zt; - arg->status_change_nid = -1; - arg->status_change_nid_normal = -1; - arg->status_change_nid_high = -1; + arg->status_change_nid = NUMA_NO_NODE; + arg->status_change_nid_normal = NUMA_NO_NODE; + arg->status_change_nid_high = NUMA_NO_NODE; /* * Check whether node_states[N_NORMAL_MEMORY] will be changed. @@ -1612,7 +1626,6 @@ static int __ref __offline_pages(unsigned long start_pfn, cond_resched(); lru_add_drain_all(); - drain_all_pages(zone); pfn = scan_movable_pages(pfn, end_pfn); if (pfn) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c @@ -350,7 +350,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && + if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; @@ -2304,7 +2304,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long unsigned long pgoff; int thiscpu = raw_smp_processor_id(); int thisnid = cpu_to_node(thiscpu); - int polnid = -1; + int polnid = NUMA_NO_NODE; int ret = -1; pol = get_vma_policy(vma, addr); diff --git a/mm/mempool.c b/mm/mempool.c @@ -222,6 +222,8 @@ EXPORT_SYMBOL(mempool_init_node); * * Like mempool_create(), but initializes the pool in (i.e. embedded in another * structure). + * + * Return: %0 on success, negative error code otherwise. */ int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data) @@ -245,6 +247,8 @@ EXPORT_SYMBOL(mempool_init); * functions. This function might sleep. Both the alloc_fn() and the free_fn() * functions might sleep - as long as the mempool_alloc() function is not called * from IRQ contexts. + * + * Return: pointer to the created memory pool object or %NULL on error. */ mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data) @@ -289,6 +293,8 @@ EXPORT_SYMBOL(mempool_create_node); * Note, the caller must guarantee that no mempool_destroy is called * while this function is running. mempool_alloc() & mempool_free() * might be called (eg. from IRQ contexts) while this function executes. + * + * Return: %0 on success, negative error code otherwise. */ int mempool_resize(mempool_t *pool, int new_min_nr) { @@ -363,6 +369,8 @@ EXPORT_SYMBOL(mempool_resize); * *never* fails when called from process contexts. (it might * fail if called from an IRQ context.) * Note: using __GFP_ZERO is not supported. + * + * Return: pointer to the allocated element or %NULL on error. */ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) { diff --git a/mm/migrate.c b/mm/migrate.c @@ -100,7 +100,7 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) /* * Check PageMovable before holding a PG_lock because page's owner * assumes anybody doesn't touch PG_lock of newly allocated page - * so unconditionally grapping the lock ruins page's owner side. + * so unconditionally grabbing the lock ruins page's owner side. */ if (unlikely(!__PageMovable(page))) goto out_putpage; @@ -374,7 +374,7 @@ unlock: } #endif -static int expected_page_refs(struct page *page) +static int expected_page_refs(struct address_space *mapping, struct page *page) { int expected_count = 1; @@ -384,7 +384,7 @@ static int expected_page_refs(struct page *page) */ expected_count += is_device_private_page(page); expected_count += is_device_public_page(page); - if (page_mapping(page)) + if (mapping) expected_count += hpage_nr_pages(page) + page_has_private(page); return expected_count; @@ -405,7 +405,7 @@ int migrate_page_move_mapping(struct address_space *mapping, XA_STATE(xas, &mapping->i_pages, page_index(page)); struct zone *oldzone, *newzone; int dirty; - int expected_count = expected_page_refs(page) + extra_count; + int expected_count = expected_page_refs(mapping, page) + extra_count; if (!mapping) { /* Anonymous page without mapping */ @@ -750,7 +750,7 @@ static int __buffer_migrate_page(struct address_space *mapping, return migrate_page(mapping, newpage, page, mode); /* Check whether page does not have extra refs before we do more work */ - expected_count = expected_page_refs(page); + expected_count = expected_page_refs(mapping, page); if (page_count(page) != expected_count) return -EAGAIN; @@ -911,7 +911,7 @@ static int fallback_migrate_page(struct address_space *mapping, */ if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) - return -EAGAIN; + return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; return migrate_page(mapping, newpage, page, mode); } @@ -1287,7 +1287,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, struct anon_vma *anon_vma = NULL; /* - * Movability of hugepages depends on architectures and hugepage size. + * Migratability of hugepages depends on architectures and their size. * This check is necessary because some callers of hugepage migration * like soft offline and memory hotremove don't walk through page * tables or check whether the hugepage is pmd-based or not before diff --git a/mm/mlock.c b/mm/mlock.c @@ -182,7 +182,7 @@ static void __munlock_isolation_failed(struct page *page) unsigned int munlock_vma_page(struct page *page) { int nr_pages; - struct zone *zone = page_zone(page); + pg_data_t *pgdat = page_pgdat(page); /* For try_to_munlock() and to serialize with page migration */ BUG_ON(!PageLocked(page)); @@ -194,7 +194,7 @@ unsigned int munlock_vma_page(struct page *page) * might otherwise copy PageMlocked to part of the tail pages before * we clear it in the head page. It also stabilizes hpage_nr_pages(). */ - spin_lock_irq(zone_lru_lock(zone)); + spin_lock_irq(&pgdat->lru_lock); if (!TestClearPageMlocked(page)) { /* Potentially, PTE-mapped THP: do not skip the rest PTEs */ @@ -203,17 +203,17 @@ unsigned int munlock_vma_page(struct page *page) } nr_pages = hpage_nr_pages(page); - __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); if (__munlock_isolate_lru_page(page, true)) { - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); __munlock_isolated_page(page); goto out; } __munlock_isolation_failed(page); unlock_out: - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); out: return nr_pages - 1; @@ -298,7 +298,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) pagevec_init(&pvec_putback); /* Phase 1: page isolation */ - spin_lock_irq(zone_lru_lock(zone)); + spin_lock_irq(&zone->zone_pgdat->lru_lock); for (i = 0; i < nr; i++) { struct page *page = pvec->pages[i]; @@ -325,7 +325,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) pvec->pages[i] = NULL; } __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&zone->zone_pgdat->lru_lock); /* Now we can release pins of pages that we are not munlocking */ pagevec_release(&pvec_putback); diff --git a/mm/mmap.c b/mm/mmap.c @@ -438,7 +438,7 @@ static void vma_gap_update(struct vm_area_struct *vma) { /* * As it turns out, RB_DECLARE_CALLBACKS() already created a callback - * function that does exacltly what we want. + * function that does exactly what we want. */ vma_gap_callbacks_propagate(&vma->vm_rb, NULL); } @@ -1012,7 +1012,7 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, * VM_SOFTDIRTY should not prevent from VMA merging, if we * match the flags but dirty bit -- the caller should mark * merged VMA as dirty. If dirty bit won't be excluded from - * comparison, we increase pressue on the memory system forcing + * comparison, we increase pressure on the memory system forcing * the kernel to generate new VMAs when old one could be * extended instead. */ @@ -1115,7 +1115,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN * might become case 1 below case 2 below case 3 below * - * It is important for case 8 that the the vma NNNN overlapping the + * It is important for case 8 that the vma NNNN overlapping the * region AAAA is never going to extended over XXXX. Instead XXXX must * be extended in region AAAA and NNNN must be removed. This way in * all cases where vma_merge succeeds, the moment vma_adjust drops the @@ -1645,7 +1645,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) #endif /* __ARCH_WANT_SYS_OLD_MMAP */ /* - * Some shared mappigns will want the pages marked read-only + * Some shared mappings will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot * to the private version (using protection_map[] without the * VM_SHARED bit). @@ -2126,13 +2126,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, */ #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN unsigned long -arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, - const unsigned long len, const unsigned long pgoff, - const unsigned long flags) +arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; - unsigned long addr = addr0; struct vm_unmapped_area_info info; const unsigned long mmap_end = arch_get_mmap_end(addr); diff --git a/mm/mprotect.c b/mm/mprotect.c @@ -110,8 +110,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, continue; } - ptent = ptep_modify_prot_start(mm, addr, pte); - ptent = pte_modify(ptent, newprot); + oldpte = ptep_modify_prot_start(vma, addr, pte); + ptent = pte_modify(oldpte, newprot); if (preserve_write) ptent = pte_mk_savedwrite(ptent); @@ -121,7 +121,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, !(vma->vm_flags & VM_SOFTDIRTY))) { ptent = pte_mkwrite(ptent); } - ptep_modify_prot_commit(mm, addr, pte, ptent); + ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); pages++; } else if (IS_ENABLED(CONFIG_MIGRATION)) { swp_entry_t entry = pte_to_swp_entry(oldpte); diff --git a/mm/mremap.c b/mm/mremap.c @@ -516,6 +516,23 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (addr + old_len > new_addr && new_addr + new_len > addr) goto out; + /* + * move_vma() need us to stay 4 maps below the threshold, otherwise + * it will bail out at the very beginning. + * That is a problem if we have already unmaped the regions here + * (new_addr, and old_addr), because userspace will not know the + * state of the vma's after it gets -ENOMEM. + * So, to avoid such scenario we can pre-compute if the whole + * operation has high chances to success map-wise. + * Worst-scenario case is when both vma's (new_addr and old_addr) get + * split in 3 before unmaping it. + * That means 2 more maps (1 for each) to the ones we already hold. + * Check whether current map count plus 2 still leads us to 4 maps below + * the threshold, otherwise return -ENOMEM here to be more safe. + */ + if ((mm->map_count + 2) >= sysctl_max_map_count - 3) + return -ENOMEM; + ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); if (ret) goto out; diff --git a/mm/oom_kill.c b/mm/oom_kill.c @@ -843,7 +843,7 @@ static bool task_will_free_mem(struct task_struct *task) return ret; } -static void __oom_kill_process(struct task_struct *victim) +static void __oom_kill_process(struct task_struct *victim, const char *message) { struct task_struct *p; struct mm_struct *mm; @@ -874,8 +874,9 @@ static void __oom_kill_process(struct task_struct *victim) */ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); mark_oom_victim(victim); - pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", - task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), + pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", + message, task_pid_nr(victim), victim->comm, + K(victim->mm->total_vm), K(get_mm_counter(victim->mm, MM_ANONPAGES)), K(get_mm_counter(victim->mm, MM_FILEPAGES)), K(get_mm_counter(victim->mm, MM_SHMEMPAGES))); @@ -926,24 +927,20 @@ static void __oom_kill_process(struct task_struct *victim) * Kill provided task unless it's secured by setting * oom_score_adj to OOM_SCORE_ADJ_MIN. */ -static int oom_kill_memcg_member(struct task_struct *task, void *unused) +static int oom_kill_memcg_member(struct task_struct *task, void *message) { - if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { + if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN && + !is_global_init(task)) { get_task_struct(task); - __oom_kill_process(task); + __oom_kill_process(task, message); } return 0; } static void oom_kill_process(struct oom_control *oc, const char *message) { - struct task_struct *p = oc->chosen; - unsigned int points = oc->chosen_points; - struct task_struct *victim = p; - struct task_struct *child; - struct task_struct *t; + struct task_struct *victim = oc->chosen; struct mem_cgroup *oom_group; - unsigned int victim_points = 0; static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -952,57 +949,18 @@ static void oom_kill_process(struct oom_control *oc, const char *message) * its children or threads, just give it access to memory reserves * so it can die quickly */ - task_lock(p); - if (task_will_free_mem(p)) { - mark_oom_victim(p); - wake_oom_reaper(p); - task_unlock(p); - put_task_struct(p); + task_lock(victim); + if (task_will_free_mem(victim)) { + mark_oom_victim(victim); + wake_oom_reaper(victim); + task_unlock(victim); + put_task_struct(victim); return; } - task_unlock(p); + task_unlock(victim); if (__ratelimit(&oom_rs)) - dump_header(oc, p); - - pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", - message, task_pid_nr(p), p->comm, points); - - /* - * If any of p's children has a different mm and is eligible for kill, - * the one with the highest oom_badness() score is sacrificed for its - * parent. This attempts to lose the minimal amount of work done while - * still freeing memory. - */ - read_lock(&tasklist_lock); - - /* - * The task 'p' might have already exited before reaching here. The - * put_task_struct() will free task_struct 'p' while the loop still try - * to access the field of 'p', so, get an extra reference. - */ - get_task_struct(p); - for_each_thread(p, t) { - list_for_each_entry(child, &t->children, sibling) { - unsigned int child_points; - - if (process_shares_mm(child, p->mm)) - continue; - /* - * oom_badness() returns 0 if the thread is unkillable - */ - child_points = oom_badness(child, - oc->memcg, oc->nodemask, oc->totalpages); - if (child_points > victim_points) { - put_task_struct(victim); - victim = child; - victim_points = child_points; - get_task_struct(victim); - } - } - } - put_task_struct(p); - read_unlock(&tasklist_lock); + dump_header(oc, victim); /* * Do we need to kill the entire memory cgroup? @@ -1011,14 +969,15 @@ static void oom_kill_process(struct oom_control *oc, const char *message) */ oom_group = mem_cgroup_get_oom_group(victim, oc->memcg); - __oom_kill_process(victim); + __oom_kill_process(victim, message); /* * If necessary, kill all tasks in the selected memory cgroup. */ if (oom_group) { mem_cgroup_print_oom_group(oom_group); - mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL); + mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, + (void*)message); mem_cgroup_put(oom_group); } } diff --git a/mm/page-writeback.c b/mm/page-writeback.c @@ -270,7 +270,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, * node_dirtyable_memory - number of dirtyable pages in a node * @pgdat: the node * - * Returns the node's number of pages potentially available for dirty + * Return: the node's number of pages potentially available for dirty * page cache. This is the base value for the per-node dirty limits. */ static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) @@ -355,7 +355,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) /** * global_dirtyable_memory - number of globally dirtyable pages * - * Returns the global number of pages potentially available for dirty + * Return: the global number of pages potentially available for dirty * page cache. This is the base value for the global dirty limits. */ static unsigned long global_dirtyable_memory(void) @@ -470,7 +470,7 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) * node_dirty_limit - maximum number of dirty pages allowed in a node * @pgdat: the node * - * Returns the maximum number of dirty pages allowed in a node, based + * Return: the maximum number of dirty pages allowed in a node, based * on the node's dirtyable memory. */ static unsigned long node_dirty_limit(struct pglist_data *pgdat) @@ -495,7 +495,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat) * node_dirty_ok - tells whether a node is within its dirty limits * @pgdat: the node to check * - * Returns %true when the dirty pages in @pgdat are within the node's + * Return: %true when the dirty pages in @pgdat are within the node's * dirty limit, %false if the limit is exceeded. */ bool node_dirty_ok(struct pglist_data *pgdat) @@ -743,9 +743,6 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, * __wb_calc_thresh - @wb's share of dirty throttling threshold * @dtc: dirty_throttle_context of interest * - * Returns @wb's dirty limit in pages. The term "dirty" in the context of - * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. - * * Note that balance_dirty_pages() will only seriously take it as a hard limit * when sleeping max_pause per page is not enough to keep the dirty pages under * control. For example, when the device is completely stalled due to some error @@ -759,6 +756,9 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, * * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. + * + * Return: @wb's dirty limit in pages. The term "dirty" in the context of + * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. */ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) { @@ -1918,7 +1918,9 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited); * @wb: bdi_writeback of interest * * Determines whether background writeback should keep writing @wb or it's - * clean enough. Returns %true if writeback should continue. + * clean enough. + * + * Return: %true if writeback should continue. */ bool wb_over_bg_thresh(struct bdi_writeback *wb) { @@ -2147,6 +2149,8 @@ EXPORT_SYMBOL(tag_pages_for_writeback); * lock/page writeback access order inversion - we should only ever lock * multiple pages in ascending page->index order, and looping back to the start * of the file violates that rule and causes deadlocks. + * + * Return: %0 on success, negative error code otherwise */ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, @@ -2305,6 +2309,8 @@ static int __writepage(struct page *page, struct writeback_control *wbc, * * This is a library function, which implements the writepages() * address_space_operation. + * + * Return: %0 on success, negative error code otherwise */ int generic_writepages(struct address_space *mapping, struct writeback_control *wbc) @@ -2351,6 +2357,8 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) * * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this * function returns. + * + * Return: %0 on success, negative error code otherwise */ int write_one_page(struct page *page) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c @@ -289,8 +289,8 @@ EXPORT_SYMBOL(movable_zone); #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #if MAX_NUMNODES > 1 -int nr_node_ids __read_mostly = MAX_NUMNODES; -int nr_online_nodes __read_mostly = 1; +unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; +unsigned int nr_online_nodes __read_mostly = 1; EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); #endif @@ -789,6 +789,57 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, return 0; } +#ifdef CONFIG_COMPACTION +static inline struct capture_control *task_capc(struct zone *zone) +{ + struct capture_control *capc = current->capture_control; + + return capc && + !(current->flags & PF_KTHREAD) && + !capc->page && + capc->cc->zone == zone && + capc->cc->direct_compaction ? capc : NULL; +} + +static inline bool +compaction_capture(struct capture_control *capc, struct page *page, + int order, int migratetype) +{ + if (!capc || order != capc->cc->order) + return false; + + /* Do not accidentally pollute CMA or isolated regions*/ + if (is_migrate_cma(migratetype) || + is_migrate_isolate(migratetype)) + return false; + + /* + * Do not let lower order allocations polluate a movable pageblock. + * This might let an unmovable request use a reclaimable pageblock + * and vice-versa but no more than normal fallback logic which can + * have trouble finding a high-order free page. + */ + if (order < pageblock_order && migratetype == MIGRATE_MOVABLE) + return false; + + capc->page = page; + return true; +} + +#else +static inline struct capture_control *task_capc(struct zone *zone) +{ + return NULL; +} + +static inline bool +compaction_capture(struct capture_control *capc, struct page *page, + int order, int migratetype) +{ + return false; +} +#endif /* CONFIG_COMPACTION */ + /* * Freeing function for a buddy system allocator. * @@ -822,6 +873,7 @@ static inline void __free_one_page(struct page *page, unsigned long uninitialized_var(buddy_pfn); struct page *buddy; unsigned int max_order; + struct capture_control *capc = task_capc(zone); max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); @@ -837,6 +889,11 @@ static inline void __free_one_page(struct page *page, continue_merging: while (order < max_order - 1) { + if (compaction_capture(capc, page, order, migratetype)) { + __mod_zone_freepage_state(zone, -(1 << order), + migratetype); + return; + } buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); @@ -1056,7 +1113,7 @@ static __always_inline bool free_pages_prepare(struct page *page, if (PageMappingFlags(page)) page->mapping = NULL; if (memcg_kmem_enabled() && PageKmemcg(page)) - memcg_kmem_uncharge(page, order); + __memcg_kmem_uncharge(page, order); if (check_free) bad += free_pages_check(page); if (bad) @@ -1303,7 +1360,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -static void __init __free_pages_boot_core(struct page *page, unsigned int order) +void __free_pages_core(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; @@ -1382,7 +1439,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, { if (early_page_uninitialised(pfn)) return; - return __free_pages_boot_core(page, order); + __free_pages_core(page, order); } /* @@ -1472,14 +1529,14 @@ static void __init deferred_free_range(unsigned long pfn, if (nr_pages == pageblock_nr_pages && (pfn & (pageblock_nr_pages - 1)) == 0) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, pageblock_order); + __free_pages_core(page, pageblock_order); return; } for (i = 0; i < nr_pages; i++, page++, pfn++) { if ((pfn & (pageblock_nr_pages - 1)) == 0) set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, 0); + __free_pages_core(page, 0); } } @@ -1945,8 +2002,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, arch_alloc_page(page, order); kernel_map_pages(page, 1 << order, 1); - kernel_poison_pages(page, 1 << order, 1); kasan_alloc_pages(page, order); + kernel_poison_pages(page, 1 << order, 1); set_page_owner(page, order, gfp_flags); } @@ -2962,7 +3019,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * watermark, because we already know our high-order page * exists. */ - watermark = min_wmark_pages(zone) + (1UL << order); + watermark = zone->_watermark[WMARK_MIN] + (1UL << order); if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) return 0; @@ -3173,24 +3230,14 @@ static int __init fail_page_alloc_debugfs(void) dir = fault_create_debugfs_attr("fail_page_alloc", NULL, &fail_page_alloc.attr); - if (IS_ERR(dir)) - return PTR_ERR(dir); - - if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, - &fail_page_alloc.ignore_gfp_reclaim)) - goto fail; - if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, - &fail_page_alloc.ignore_gfp_highmem)) - goto fail; - if (!debugfs_create_u32("min-order", mode, dir, - &fail_page_alloc.min_order)) - goto fail; - return 0; -fail: - debugfs_remove_recursive(dir); + debugfs_create_bool("ignore-gfp-wait", mode, dir, + &fail_page_alloc.ignore_gfp_reclaim); + debugfs_create_bool("ignore-gfp-highmem", mode, dir, + &fail_page_alloc.ignore_gfp_highmem); + debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); - return -ENOMEM; + return 0; } late_initcall(fail_page_alloc_debugfs); @@ -3710,7 +3757,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio, enum compact_result *compact_result) { - struct page *page; + struct page *page = NULL; unsigned long pflags; unsigned int noreclaim_flag; @@ -3721,13 +3768,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, noreclaim_flag = memalloc_noreclaim_save(); *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, - prio); + prio, &page); memalloc_noreclaim_restore(noreclaim_flag); psi_memstall_leave(&pflags); - if (*compact_result <= COMPACT_INACTIVE) + if (*compact_result <= COMPACT_INACTIVE) { + WARN_ON_ONCE(page); return NULL; + } /* * At least in one zone compaction wasn't deferred or skipped, so let's @@ -3735,7 +3784,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTSTALL); - page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); + /* Prep a captured page if available */ + if (page) + prep_new_page(page, order, gfp_mask, alloc_flags); + + /* Try get a page from the freelist if available */ + if (!page) + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); if (page) { struct zone *zone = page_zone(page); @@ -4568,7 +4623,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, out: if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && - unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) { + unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) { __free_pages(page, order); page = NULL; } @@ -4761,6 +4816,8 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, * This function is also limited by MAX_ORDER. * * Memory allocated by this function must be released by free_pages_exact(). + * + * Return: pointer to the allocated area or %NULL in case of error. */ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) { @@ -4781,6 +4838,8 @@ EXPORT_SYMBOL(alloc_pages_exact); * * Like alloc_pages_exact(), but try to allocate on node nid first before falling * back. + * + * Return: pointer to the allocated area or %NULL in case of error. */ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { @@ -4814,11 +4873,13 @@ EXPORT_SYMBOL(free_pages_exact); * nr_free_zone_pages - count number of pages beyond high watermark * @offset: The zone index of the highest zone * - * nr_free_zone_pages() counts the number of counts pages which are beyond the + * nr_free_zone_pages() counts the number of pages which are beyond the * high watermark within all zones at or below a given zone index. For each * zone, the number of pages is calculated as: * * nr_free_zone_pages = managed_pages - high_pages + * + * Return: number of pages beyond high watermark. */ static unsigned long nr_free_zone_pages(int offset) { @@ -4845,6 +4906,9 @@ static unsigned long nr_free_zone_pages(int offset) * * nr_free_buffer_pages() counts the number of pages which are beyond the high * watermark within ZONE_DMA and ZONE_NORMAL. + * + * Return: number of pages beyond high watermark within ZONE_DMA and + * ZONE_NORMAL. */ unsigned long nr_free_buffer_pages(void) { @@ -4857,6 +4921,8 @@ EXPORT_SYMBOL_GPL(nr_free_buffer_pages); * * nr_free_pagecache_pages() counts the number of pages which are beyond the * high watermark within all zones. + * + * Return: number of pages beyond high watermark within all zones. */ unsigned long nr_free_pagecache_pages(void) { @@ -5303,7 +5369,8 @@ static int node_load[MAX_NUMNODES]; * from each node to each node in the system), and should also prefer nodes * with no CPUs, since presumably they'll have very little allocation pressure * on them otherwise. - * It returns -1 if no node is found. + * + * Return: node id of the found node or %NUMA_NO_NODE if no node is found. */ static int find_next_best_node(int node, nodemask_t *used_node_mask) { @@ -5609,7 +5676,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat) else page_group_by_mobility_disabled = 0; - pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n", + pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n", nr_online_nodes, page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); @@ -6016,7 +6083,7 @@ int __meminit __early_pfn_to_nid(unsigned long pfn, return state->last_nid; nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); - if (nid != -1) { + if (nid != NUMA_NO_NODE) { state->last_start = start_pfn; state->last_end = end_pfn; state->last_nid = nid; @@ -6214,7 +6281,7 @@ unsigned long __init __absent_pages_in_range(int nid, * @start_pfn: The start PFN to start searching for holes * @end_pfn: The end PFN to stop searching for holes * - * It returns the number of pages frames in memory holes within a range. + * Return: the number of pages frames in memory holes within a range. */ unsigned long __init absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn) @@ -6376,10 +6443,14 @@ static void __ref setup_usemap(struct pglist_data *pgdat, { unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); zone->pageblock_flags = NULL; - if (usemapsize) + if (usemapsize) { zone->pageblock_flags = memblock_alloc_node_nopanic(usemapsize, pgdat->node_id); + if (!zone->pageblock_flags) + panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n", + usemapsize, zone->name, pgdat->node_id); + } } #else static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, @@ -6609,6 +6680,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); map = memblock_alloc_node_nopanic(size, pgdat->node_id); + if (!map) + panic("Failed to allocate %ld bytes for node %d memory map\n", + size, pgdat->node_id); pgdat->node_mem_map = map + offset; } pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", @@ -6764,14 +6838,14 @@ void __init setup_nr_node_ids(void) * model has fine enough granularity to avoid incorrect mapping for the * populated node map. * - * Returns the determined alignment in pfn's. 0 if there is no alignment + * Return: the determined alignment in pfn's. 0 if there is no alignment * requirement (single node). */ unsigned long __init node_map_pfn_alignment(void) { unsigned long accl_mask = 0, last_end = 0; unsigned long start, end, mask; - int last_nid = -1; + int last_nid = NUMA_NO_NODE; int i, nid; for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { @@ -6819,7 +6893,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) /** * find_min_pfn_with_active_regions - Find the minimum PFN registered * - * It returns the minimum PFN based on information provided via + * Return: the minimum PFN based on information provided via * memblock_set_node(). */ unsigned long __init find_min_pfn_with_active_regions(void) @@ -7267,7 +7341,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char return pages; } -EXPORT_SYMBOL(free_reserved_area); #ifdef CONFIG_HIGHMEM void free_highmem_page(struct page *page) @@ -7496,7 +7569,7 @@ static void __setup_per_zone_wmarks(void) * value here. * * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) - * deltas control asynch page reclaim, and so should + * deltas control async page reclaim, and so should * not be capped for highmem. */ unsigned long min_pages; @@ -7973,7 +8046,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, /* * Hugepages are not in LRU lists, but they're movable. - * We need not scan over tail pages bacause we don't + * We need not scan over tail pages because we don't * handle each tail page individually in migration. */ if (PageHuge(page)) { @@ -8112,7 +8185,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * pageblocks in the range. Once isolated, the pageblocks should not * be modified by others. * - * Returns zero on success or negative error code. On success all + * Return: zero on success or negative error code. On success all * pages which PFN is in [start, end) are allocated for the caller and * need to be freed with free_contig_range(). */ @@ -8196,7 +8269,6 @@ int alloc_contig_range(unsigned long start, unsigned long end, */ lru_add_drain_all(); - drain_all_pages(cc.zone); order = 0; outer_start = start; diff --git a/mm/page_ext.c b/mm/page_ext.c @@ -273,6 +273,7 @@ static void free_page_ext(void *addr) table_size = get_entry_size() * PAGES_PER_SECTION; BUG_ON(PageReserved(page)); + kmemleak_free(addr); free_pages_exact(addr, table_size); } } @@ -300,7 +301,7 @@ static int __meminit online_page_ext(unsigned long start_pfn, start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); - if (nid == -1) { + if (nid == NUMA_NO_NODE) { /* * In this case, "nid" already exists and contains valid memory. * "start_pfn" passed to us is a pfn which is an arg for diff --git a/mm/page_idle.c b/mm/page_idle.c @@ -31,7 +31,7 @@ static struct page *page_idle_get_page(unsigned long pfn) { struct page *page; - struct zone *zone; + pg_data_t *pgdat; if (!pfn_valid(pfn)) return NULL; @@ -41,13 +41,13 @@ static struct page *page_idle_get_page(unsigned long pfn) !get_page_unless_zero(page)) return NULL; - zone = page_zone(page); - spin_lock_irq(zone_lru_lock(zone)); + pgdat = page_pgdat(page); + spin_lock_irq(&pgdat->lru_lock); if (unlikely(!PageLRU(page))) { put_page(page); page = NULL; } - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); return page; } diff --git a/mm/page_owner.c b/mm/page_owner.c @@ -625,16 +625,14 @@ static const struct file_operations proc_page_owner_operations = { static int __init pageowner_init(void) { - struct dentry *dentry; - if (!static_branch_unlikely(&page_owner_inited)) { pr_info("page_owner is disabled\n"); return 0; } - dentry = debugfs_create_file("page_owner", 0400, NULL, - NULL, &proc_page_owner_operations); + debugfs_create_file("page_owner", 0400, NULL, NULL, + &proc_page_owner_operations); - return PTR_ERR_OR_ZERO(dentry); + return 0; } late_initcall(pageowner_init) diff --git a/mm/page_poison.c b/mm/page_poison.c @@ -6,6 +6,7 @@ #include <linux/page_ext.h> #include <linux/poison.h> #include <linux/ratelimit.h> +#include <linux/kasan.h> static bool want_page_poisoning __read_mostly; @@ -40,7 +41,10 @@ static void poison_page(struct page *page) { void *addr = kmap_atomic(page); + /* KASAN still think the page is in-use, so skip it. */ + kasan_disable_current(); memset(addr, PAGE_POISON, PAGE_SIZE); + kasan_enable_current(); kunmap_atomic(addr); } diff --git a/mm/readahead.c b/mm/readahead.c @@ -81,6 +81,8 @@ static void read_cache_pages_invalidate_pages(struct address_space *mapping, * @data: private data for the callback routine. * * Hides the details of the LRU cache etc from the filesystems. + * + * Returns: %0 on success, error return by @filler otherwise */ int read_cache_pages(struct address_space *mapping, struct list_head *pages, int (*filler)(void *, struct page *), void *data) diff --git a/mm/rmap.c b/mm/rmap.c @@ -27,7 +27,7 @@ * mapping->i_mmap_rwsem * anon_vma->rwsem * mm->page_table_lock or pte_lock - * zone_lru_lock (in mark_page_accessed, isolate_lru_page) + * pgdat->lru_lock (in mark_page_accessed, isolate_lru_page) * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) diff --git a/mm/shmem.c b/mm/shmem.c @@ -36,6 +36,7 @@ #include <linux/uio.h> #include <linux/khugepaged.h> #include <linux/hugetlb.h> +#include <linux/frontswap.h> #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */ @@ -123,6 +124,10 @@ static unsigned long shmem_default_max_inodes(void) static bool shmem_should_replace_page(struct page *page, gfp_t gfp); static int shmem_replace_page(struct page **pagep, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index); +static int shmem_swapin_page(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, + gfp_t gfp, struct vm_area_struct *vma, + vm_fault_t *fault_type); static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, @@ -1089,159 +1094,184 @@ static void shmem_evict_inode(struct inode *inode) clear_inode(inode); } -static unsigned long find_swap_entry(struct xarray *xa, void *item) +extern struct swap_info_struct *swap_info[]; + +static int shmem_find_swap_entries(struct address_space *mapping, + pgoff_t start, unsigned int nr_entries, + struct page **entries, pgoff_t *indices, + bool frontswap) { - XA_STATE(xas, xa, 0); - unsigned int checked = 0; - void *entry; + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; + unsigned int ret = 0; + + if (!nr_entries) + return 0; rcu_read_lock(); - xas_for_each(&xas, entry, ULONG_MAX) { - if (xas_retry(&xas, entry)) + xas_for_each(&xas, page, ULONG_MAX) { + if (xas_retry(&xas, page)) continue; - if (entry == item) - break; - checked++; - if ((checked % XA_CHECK_SCHED) != 0) + + if (!xa_is_value(page)) continue; - xas_pause(&xas); - cond_resched_rcu(); + + if (frontswap) { + swp_entry_t entry = radix_to_swp_entry(page); + + if (!frontswap_test(swap_info[swp_type(entry)], + swp_offset(entry))) + continue; + } + + indices[ret] = xas.xa_index; + entries[ret] = page; + + if (need_resched()) { + xas_pause(&xas); + cond_resched_rcu(); + } + if (++ret == nr_entries) + break; } rcu_read_unlock(); - return entry ? xas.xa_index : -1; + return ret; } /* - * If swap found in inode, free it and move page from swapcache to filecache. + * Move the swapped pages for an inode to page cache. Returns the count + * of pages swapped in, or the error in case of failure. */ -static int shmem_unuse_inode(struct shmem_inode_info *info, - swp_entry_t swap, struct page **pagep) +static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec, + pgoff_t *indices) { - struct address_space *mapping = info->vfs_inode.i_mapping; - void *radswap; - pgoff_t index; - gfp_t gfp; + int i = 0; + int ret = 0; int error = 0; + struct address_space *mapping = inode->i_mapping; - radswap = swp_to_radix_entry(swap); - index = find_swap_entry(&mapping->i_pages, radswap); - if (index == -1) - return -EAGAIN; /* tell shmem_unuse we found nothing */ + for (i = 0; i < pvec.nr; i++) { + struct page *page = pvec.pages[i]; - /* - * Move _head_ to start search for next from here. - * But be careful: shmem_evict_inode checks list_empty without taking - * mutex, and there's an instant in list_move_tail when info->swaplist - * would appear empty, if it were the only one on shmem_swaplist. - */ - if (shmem_swaplist.next != &info->swaplist) - list_move_tail(&shmem_swaplist, &info->swaplist); - - gfp = mapping_gfp_mask(mapping); - if (shmem_should_replace_page(*pagep, gfp)) { - mutex_unlock(&shmem_swaplist_mutex); - error = shmem_replace_page(pagep, gfp, info, index); - mutex_lock(&shmem_swaplist_mutex); - /* - * We needed to drop mutex to make that restrictive page - * allocation, but the inode might have been freed while we - * dropped it: although a racing shmem_evict_inode() cannot - * complete without emptying the page cache, our page lock - * on this swapcache page is not enough to prevent that - - * free_swap_and_cache() of our swap entry will only - * trylock_page(), removing swap from page cache whatever. - * - * We must not proceed to shmem_add_to_page_cache() if the - * inode has been freed, but of course we cannot rely on - * inode or mapping or info to check that. However, we can - * safely check if our swap entry is still in use (and here - * it can't have got reused for another page): if it's still - * in use, then the inode cannot have been freed yet, and we - * can safely proceed (if it's no longer in use, that tells - * nothing about the inode, but we don't need to unuse swap). - */ - if (!page_swapcount(*pagep)) - error = -ENOENT; + if (!xa_is_value(page)) + continue; + error = shmem_swapin_page(inode, indices[i], + &page, SGP_CACHE, + mapping_gfp_mask(mapping), + NULL, NULL); + if (error == 0) { + unlock_page(page); + put_page(page); + ret++; + } + if (error == -ENOMEM) + break; + error = 0; } + return error ? error : ret; +} - /* - * We rely on shmem_swaplist_mutex, not only to protect the swaplist, - * but also to hold up shmem_evict_inode(): so inode cannot be freed - * beneath us (pagelock doesn't help until the page is in pagecache). - */ - if (!error) - error = shmem_add_to_page_cache(*pagep, mapping, index, - radswap, gfp); - if (error != -ENOMEM) { - /* - * Truncation and eviction use free_swap_and_cache(), which - * only does trylock page: if we raced, best clean up here. - */ - delete_from_swap_cache(*pagep); - set_page_dirty(*pagep); - if (!error) { - spin_lock_irq(&info->lock); - info->swapped--; - spin_unlock_irq(&info->lock); - swap_free(swap); +/* + * If swap found in inode, free it and move page from swapcache to filecache. + */ +static int shmem_unuse_inode(struct inode *inode, unsigned int type, + bool frontswap, unsigned long *fs_pages_to_unuse) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t start = 0; + struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; + bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0); + int ret = 0; + + pagevec_init(&pvec); + do { + unsigned int nr_entries = PAGEVEC_SIZE; + + if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE) + nr_entries = *fs_pages_to_unuse; + + pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries, + pvec.pages, indices, + frontswap); + if (pvec.nr == 0) { + ret = 0; + break; } - } - return error; + + ret = shmem_unuse_swap_entries(inode, pvec, indices); + if (ret < 0) + break; + + if (frontswap_partial) { + *fs_pages_to_unuse -= ret; + if (*fs_pages_to_unuse == 0) { + ret = FRONTSWAP_PAGES_UNUSED; + break; + } + } + + start = indices[pvec.nr - 1]; + } while (true); + + return ret; } /* - * Search through swapped inodes to find and replace swap by page. + * Read all the shared memory data that resides in the swap + * device 'type' back into memory, so the swap device can be + * unused. */ -int shmem_unuse(swp_entry_t swap, struct page *page) +int shmem_unuse(unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { - struct list_head *this, *next; - struct shmem_inode_info *info; - struct mem_cgroup *memcg; + struct shmem_inode_info *info, *next; + struct inode *inode; + struct inode *prev_inode = NULL; int error = 0; - /* - * There's a faint possibility that swap page was replaced before - * caller locked it: caller will come back later with the right page. - */ - if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) - goto out; + if (list_empty(&shmem_swaplist)) + return 0; + + mutex_lock(&shmem_swaplist_mutex); /* - * Charge page using GFP_KERNEL while we can wait, before taking - * the shmem_swaplist_mutex which might hold up shmem_writepage(). - * Charged back to the user (not to caller) when swap account is used. + * The extra refcount on the inode is necessary to safely dereference + * p->next after re-acquiring the lock. New shmem inodes with swap + * get added to the end of the list and we will scan them all. */ - error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL, - &memcg, false); - if (error) - goto out; - /* No memory allocation: swap entry occupies the slot for the page */ - error = -EAGAIN; - - mutex_lock(&shmem_swaplist_mutex); - list_for_each_safe(this, next, &shmem_swaplist) { - info = list_entry(this, struct shmem_inode_info, swaplist); - if (info->swapped) - error = shmem_unuse_inode(info, swap, &page); - else + list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { + if (!info->swapped) { list_del_init(&info->swaplist); + continue; + } + + inode = igrab(&info->vfs_inode); + if (!inode) + continue; + + mutex_unlock(&shmem_swaplist_mutex); + if (prev_inode) + iput(prev_inode); + prev_inode = inode; + + error = shmem_unuse_inode(inode, type, frontswap, + fs_pages_to_unuse); cond_resched(); - if (error != -EAGAIN) + + mutex_lock(&shmem_swaplist_mutex); + next = list_next_entry(info, swaplist); + if (!info->swapped) + list_del_init(&info->swaplist); + if (error) break; - /* found nothing in this: move on to search the next */ } mutex_unlock(&shmem_swaplist_mutex); - if (error) { - if (error != -ENOMEM) - error = 0; - mem_cgroup_cancel_charge(page, memcg, false); - } else - mem_cgroup_commit_charge(page, memcg, true, false); -out: - unlock_page(page); - put_page(page); + if (prev_inode) + iput(prev_inode); + return error; } @@ -1325,7 +1355,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) */ mutex_lock(&shmem_swaplist_mutex); if (list_empty(&info->swaplist)) - list_add_tail(&info->swaplist, &shmem_swaplist); + list_add(&info->swaplist, &shmem_swaplist); if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { spin_lock_irq(&info->lock); @@ -1576,6 +1606,116 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, } /* + * Swap in the page pointed to by *pagep. + * Caller has to make sure that *pagep contains a valid swapped page. + * Returns 0 and the page in pagep if success. On failure, returns the + * the error code and NULL in *pagep. + */ +static int shmem_swapin_page(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, + gfp_t gfp, struct vm_area_struct *vma, + vm_fault_t *fault_type) +{ + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm; + struct mem_cgroup *memcg; + struct page *page; + swp_entry_t swap; + int error; + + VM_BUG_ON(!*pagep || !xa_is_value(*pagep)); + swap = radix_to_swp_entry(*pagep); + *pagep = NULL; + + /* Look it up and read it in.. */ + page = lookup_swap_cache(swap, NULL, 0); + if (!page) { + /* Or update major stats only when swapin succeeds?? */ + if (fault_type) { + *fault_type |= VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + count_memcg_event_mm(charge_mm, PGMAJFAULT); + } + /* Here we actually start the io */ + page = shmem_swapin(swap, gfp, info, index); + if (!page) { + error = -ENOMEM; + goto failed; + } + } + + /* We have to do this with page locked to prevent races */ + lock_page(page); + if (!PageSwapCache(page) || page_private(page) != swap.val || + !shmem_confirm_swap(mapping, index, swap)) { + error = -EEXIST; + goto unlock; + } + if (!PageUptodate(page)) { + error = -EIO; + goto failed; + } + wait_on_page_writeback(page); + + if (shmem_should_replace_page(page, gfp)) { + error = shmem_replace_page(&page, gfp, info, index); + if (error) + goto failed; + } + + error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, + false); + if (!error) { + error = shmem_add_to_page_cache(page, mapping, index, + swp_to_radix_entry(swap), gfp); + /* + * We already confirmed swap under page lock, and make + * no memory allocation here, so usually no possibility + * of error; but free_swap_and_cache() only trylocks a + * page, so it is just possible that the entry has been + * truncated or holepunched since swap was confirmed. + * shmem_undo_range() will have done some of the + * unaccounting, now delete_from_swap_cache() will do + * the rest. + */ + if (error) { + mem_cgroup_cancel_charge(page, memcg, false); + delete_from_swap_cache(page); + } + } + if (error) + goto failed; + + mem_cgroup_commit_charge(page, memcg, true, false); + + spin_lock_irq(&info->lock); + info->swapped--; + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + + if (sgp == SGP_WRITE) + mark_page_accessed(page); + + delete_from_swap_cache(page); + set_page_dirty(page); + swap_free(swap); + + *pagep = page; + return 0; +failed: + if (!shmem_confirm_swap(mapping, index, swap)) + error = -EEXIST; +unlock: + if (page) { + unlock_page(page); + put_page(page); + } + + return error; +} + +/* * shmem_getpage_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the @@ -1596,7 +1736,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct mm_struct *charge_mm; struct mem_cgroup *memcg; struct page *page; - swp_entry_t swap; enum sgp_type sgp_huge = sgp; pgoff_t hindex = index; int error; @@ -1608,17 +1747,23 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) sgp = SGP_CACHE; repeat: - swap.val = 0; + if (sgp <= SGP_CACHE && + ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { + return -EINVAL; + } + + sbinfo = SHMEM_SB(inode->i_sb); + charge_mm = vma ? vma->vm_mm : current->mm; + page = find_lock_entry(mapping, index); if (xa_is_value(page)) { - swap = radix_to_swp_entry(page); - page = NULL; - } + error = shmem_swapin_page(inode, index, &page, + sgp, gfp, vma, fault_type); + if (error == -EEXIST) + goto repeat; - if (sgp <= SGP_CACHE && - ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { - error = -EINVAL; - goto unlock; + *pagep = page; + return error; } if (page && sgp == SGP_WRITE) @@ -1632,7 +1777,7 @@ repeat: put_page(page); page = NULL; } - if (page || (sgp == SGP_READ && !swap.val)) { + if (page || sgp == SGP_READ) { *pagep = page; return 0; } @@ -1641,215 +1786,138 @@ repeat: * Fast cache lookup did not find it: * bring it back from swap or allocate. */ - sbinfo = SHMEM_SB(inode->i_sb); - charge_mm = vma ? vma->vm_mm : current->mm; - - if (swap.val) { - /* Look it up and read it in.. */ - page = lookup_swap_cache(swap, NULL, 0); - if (!page) { - /* Or update major stats only when swapin succeeds?? */ - if (fault_type) { - *fault_type |= VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - count_memcg_event_mm(charge_mm, PGMAJFAULT); - } - /* Here we actually start the io */ - page = shmem_swapin(swap, gfp, info, index); - if (!page) { - error = -ENOMEM; - goto failed; - } - } - - /* We have to do this with page locked to prevent races */ - lock_page(page); - if (!PageSwapCache(page) || page_private(page) != swap.val || - !shmem_confirm_swap(mapping, index, swap)) { - error = -EEXIST; /* try again */ - goto unlock; - } - if (!PageUptodate(page)) { - error = -EIO; - goto failed; - } - wait_on_page_writeback(page); - - if (shmem_should_replace_page(page, gfp)) { - error = shmem_replace_page(&page, gfp, info, index); - if (error) - goto failed; - } - - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - false); - if (!error) { - error = shmem_add_to_page_cache(page, mapping, index, - swp_to_radix_entry(swap), gfp); - /* - * We already confirmed swap under page lock, and make - * no memory allocation here, so usually no possibility - * of error; but free_swap_and_cache() only trylocks a - * page, so it is just possible that the entry has been - * truncated or holepunched since swap was confirmed. - * shmem_undo_range() will have done some of the - * unaccounting, now delete_from_swap_cache() will do - * the rest. - * Reset swap.val? No, leave it so "failed" goes back to - * "repeat": reading a hole and writing should succeed. - */ - if (error) { - mem_cgroup_cancel_charge(page, memcg, false); - delete_from_swap_cache(page); - } - } - if (error) - goto failed; - - mem_cgroup_commit_charge(page, memcg, true, false); - - spin_lock_irq(&info->lock); - info->swapped--; - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); - - if (sgp == SGP_WRITE) - mark_page_accessed(page); - delete_from_swap_cache(page); - set_page_dirty(page); - swap_free(swap); - - } else { - if (vma && userfaultfd_missing(vma)) { - *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); - return 0; - } + if (vma && userfaultfd_missing(vma)) { + *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); + return 0; + } - /* shmem_symlink() */ - if (mapping->a_ops != &shmem_aops) - goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) - goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_FORCE) + /* shmem_symlink() */ + if (mapping->a_ops != &shmem_aops) + goto alloc_nohuge; + if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) + goto alloc_nohuge; + if (shmem_huge == SHMEM_HUGE_FORCE) + goto alloc_huge; + switch (sbinfo->huge) { + loff_t i_size; + pgoff_t off; + case SHMEM_HUGE_NEVER: + goto alloc_nohuge; + case SHMEM_HUGE_WITHIN_SIZE: + off = round_up(index, HPAGE_PMD_NR); + i_size = round_up(i_size_read(inode), PAGE_SIZE); + if (i_size >= HPAGE_PMD_SIZE && + i_size >> PAGE_SHIFT >= off) goto alloc_huge; - switch (sbinfo->huge) { - loff_t i_size; - pgoff_t off; - case SHMEM_HUGE_NEVER: - goto alloc_nohuge; - case SHMEM_HUGE_WITHIN_SIZE: - off = round_up(index, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >= HPAGE_PMD_SIZE && - i_size >> PAGE_SHIFT >= off) - goto alloc_huge; - /* fallthrough */ - case SHMEM_HUGE_ADVISE: - if (sgp_huge == SGP_HUGE) - goto alloc_huge; - /* TODO: implement fadvise() hints */ - goto alloc_nohuge; - } + /* fallthrough */ + case SHMEM_HUGE_ADVISE: + if (sgp_huge == SGP_HUGE) + goto alloc_huge; + /* TODO: implement fadvise() hints */ + goto alloc_nohuge; + } alloc_huge: - page = shmem_alloc_and_acct_page(gfp, inode, index, true); - if (IS_ERR(page)) { -alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, - index, false); - } - if (IS_ERR(page)) { - int retry = 5; - error = PTR_ERR(page); - page = NULL; - if (error != -ENOSPC) - goto failed; - /* - * Try to reclaim some spece by splitting a huge page - * beyond i_size on the filesystem. - */ - while (retry--) { - int ret; - ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); - if (ret == SHRINK_STOP) - break; - if (ret) - goto alloc_nohuge; - } - goto failed; - } - - if (PageTransHuge(page)) - hindex = round_down(index, HPAGE_PMD_NR); - else - hindex = index; + page = shmem_alloc_and_acct_page(gfp, inode, index, true); + if (IS_ERR(page)) { +alloc_nohuge: + page = shmem_alloc_and_acct_page(gfp, inode, + index, false); + } + if (IS_ERR(page)) { + int retry = 5; - if (sgp == SGP_WRITE) - __SetPageReferenced(page); + error = PTR_ERR(page); + page = NULL; + if (error != -ENOSPC) + goto unlock; + /* + * Try to reclaim some space by splitting a huge page + * beyond i_size on the filesystem. + */ + while (retry--) { + int ret; - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - PageTransHuge(page)); - if (error) - goto unacct; - error = shmem_add_to_page_cache(page, mapping, hindex, - NULL, gfp & GFP_RECLAIM_MASK); - if (error) { - mem_cgroup_cancel_charge(page, memcg, - PageTransHuge(page)); - goto unacct; + ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); + if (ret == SHRINK_STOP) + break; + if (ret) + goto alloc_nohuge; } - mem_cgroup_commit_charge(page, memcg, false, - PageTransHuge(page)); - lru_cache_add_anon(page); + goto unlock; + } - spin_lock_irq(&info->lock); - info->alloced += 1 << compound_order(page); - inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); - alloced = true; + if (PageTransHuge(page)) + hindex = round_down(index, HPAGE_PMD_NR); + else + hindex = index; - if (PageTransHuge(page) && - DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < - hindex + HPAGE_PMD_NR - 1) { - /* - * Part of the huge page is beyond i_size: subject - * to shrink under memory pressure. - */ - spin_lock(&sbinfo->shrinklist_lock); - /* - * _careful to defend against unlocked access to - * ->shrink_list in shmem_unused_huge_shrink() - */ - if (list_empty_careful(&info->shrinklist)) { - list_add_tail(&info->shrinklist, - &sbinfo->shrinklist); - sbinfo->shrinklist_len++; - } - spin_unlock(&sbinfo->shrinklist_lock); - } + if (sgp == SGP_WRITE) + __SetPageReferenced(page); + + error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, + PageTransHuge(page)); + if (error) + goto unacct; + error = shmem_add_to_page_cache(page, mapping, hindex, + NULL, gfp & GFP_RECLAIM_MASK); + if (error) { + mem_cgroup_cancel_charge(page, memcg, + PageTransHuge(page)); + goto unacct; + } + mem_cgroup_commit_charge(page, memcg, false, + PageTransHuge(page)); + lru_cache_add_anon(page); + spin_lock_irq(&info->lock); + info->alloced += 1 << compound_order(page); + inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + alloced = true; + + if (PageTransHuge(page) && + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < + hindex + HPAGE_PMD_NR - 1) { /* - * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. + * Part of the huge page is beyond i_size: subject + * to shrink under memory pressure. */ - if (sgp == SGP_FALLOC) - sgp = SGP_WRITE; -clear: + spin_lock(&sbinfo->shrinklist_lock); /* - * Let SGP_WRITE caller clear ends if write does not fill page; - * but SGP_FALLOC on a page fallocated earlier must initialize - * it now, lest undo on failure cancel our earlier guarantee. + * _careful to defend against unlocked access to + * ->shrink_list in shmem_unused_huge_shrink() */ - if (sgp != SGP_WRITE && !PageUptodate(page)) { - struct page *head = compound_head(page); - int i; + if (list_empty_careful(&info->shrinklist)) { + list_add_tail(&info->shrinklist, + &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + } + spin_unlock(&sbinfo->shrinklist_lock); + } - for (i = 0; i < (1 << compound_order(head)); i++) { - clear_highpage(head + i); - flush_dcache_page(head + i); - } - SetPageUptodate(head); + /* + * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. + */ + if (sgp == SGP_FALLOC) + sgp = SGP_WRITE; +clear: + /* + * Let SGP_WRITE caller clear ends if write does not fill page; + * but SGP_FALLOC on a page fallocated earlier must initialize + * it now, lest undo on failure cancel our earlier guarantee. + */ + if (sgp != SGP_WRITE && !PageUptodate(page)) { + struct page *head = compound_head(page); + int i; + + for (i = 0; i < (1 << compound_order(head)); i++) { + clear_highpage(head + i); + flush_dcache_page(head + i); } + SetPageUptodate(head); } /* Perhaps the file has been truncated since we checked */ @@ -1879,9 +1947,6 @@ unacct: put_page(page); goto alloc_nohuge; } -failed: - if (swap.val && !shmem_confirm_swap(mapping, index, swap)) - error = -EEXIST; unlock: if (page) { unlock_page(page); @@ -2125,6 +2190,24 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { + struct shmem_inode_info *info = SHMEM_I(file_inode(file)); + + if (info->seals & F_SEAL_FUTURE_WRITE) { + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * "future write" seal active. + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return -EPERM; + + /* + * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED + * read-only mapping, take care to not allow mprotect to revert + * protections. + */ + vma->vm_flags &= ~(VM_MAYWRITE); + } + file_accessed(file); vma->vm_ops = &shmem_vm_ops; if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && @@ -2375,8 +2458,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = pos >> PAGE_SHIFT; /* i_mutex is held by caller */ - if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) { - if (info->seals & F_SEAL_WRITE) + if (unlikely(info->seals & (F_SEAL_GROW | + F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) return -EPERM; if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) return -EPERM; @@ -2639,7 +2723,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); /* protected by i_mutex */ - if (info->seals & F_SEAL_WRITE) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { error = -EPERM; goto out; } @@ -3847,7 +3931,8 @@ int __init shmem_init(void) return 0; } -int shmem_unuse(swp_entry_t swap, struct page *page) +int shmem_unuse(unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { return 0; } diff --git a/mm/slab.c b/mm/slab.c @@ -550,14 +550,6 @@ static void start_cpu_timer(int cpu) static void init_arraycache(struct array_cache *ac, int limit, int batch) { - /* - * The array_cache structures contain pointers to free object. - * However, when such objects are allocated or transferred to another - * cache the pointers are not cleared and they could be counted as - * valid references during a kmemleak scan. Therefore, kmemleak must - * not scan such objects. - */ - kmemleak_no_scan(ac); if (ac) { ac->avail = 0; ac->limit = limit; @@ -573,6 +565,14 @@ static struct array_cache *alloc_arraycache(int node, int entries, struct array_cache *ac = NULL; ac = kmalloc_node(memsize, gfp, node); + /* + * The array_cache structures contain pointers to free object. + * However, when such objects are allocated or transferred to another + * cache the pointers are not cleared and they could be counted as + * valid references during a kmemleak scan. Therefore, kmemleak must + * not scan such objects. + */ + kmemleak_no_scan(ac); init_arraycache(ac, entries, batchcount); return ac; } @@ -667,6 +667,7 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries, alc = kmalloc_node(memsize, gfp, node); if (alc) { + kmemleak_no_scan(alc); init_arraycache(&alc->ac, entries, batch); spin_lock_init(&alc->lock); } @@ -676,12 +677,11 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries, static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { struct alien_cache **alc_ptr; - size_t memsize = sizeof(void *) * nr_node_ids; int i; if (limit > 1) limit = 12; - alc_ptr = kzalloc_node(memsize, gfp, node); + alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node); if (!alc_ptr) return NULL; @@ -1727,6 +1727,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) * This could be made much more intelligent. For now, try to avoid using * high order pages for slabs. When the gfp() functions are more friendly * towards high-order requests, this should be changed. + * + * Return: number of left-over bytes in a slab */ static size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, slab_flags_t flags) @@ -1975,6 +1977,8 @@ static bool set_on_slab_cache(struct kmem_cache *cachep, * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware * cacheline. This can be beneficial if you're counting cycles as closely * as davem. + * + * Return: a pointer to the created cache or %NULL in case of error */ int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) { @@ -3542,6 +3546,8 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, * * Allocate an object from this cache. The flags are only relevant * if the cache has no available objects. + * + * Return: pointer to the new object or %NULL in case of error */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { @@ -3631,6 +3637,8 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); * node, which can improve the performance for cpu bound structures. * * Fallback to other node is possible if __GFP_THISNODE is not set. + * + * Return: pointer to the new object or %NULL in case of error */ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { @@ -3699,6 +3707,8 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller); * @size: how many bytes of memory are required. * @flags: the type of memory to allocate (see kmalloc). * @caller: function caller for debug tracking of the caller + * + * Return: pointer to the allocated memory or %NULL in case of error */ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, unsigned long caller) @@ -4164,6 +4174,8 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) * @buffer: user buffer * @count: data length * @ppos: unused + * + * Return: %0 on success, negative error code otherwise. */ ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) @@ -4457,6 +4469,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, * The caller must guarantee that objp points to a valid object previously * allocated with either kmalloc() or kmem_cache_alloc(). The object * must not be freed during the duration of the call. + * + * Return: size of the actual memory used by @objp in bytes */ size_t ksize(const void *objp) { diff --git a/mm/slab.h b/mm/slab.h @@ -276,8 +276,6 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { - if (!memcg_kmem_enabled()) - return 0; if (is_root_cache(s)) return 0; return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); @@ -286,8 +284,6 @@ static __always_inline int memcg_charge_slab(struct page *page, static __always_inline void memcg_uncharge_slab(struct page *page, int order, struct kmem_cache *s) { - if (!memcg_kmem_enabled()) - return; memcg_kmem_uncharge(page, order); } diff --git a/mm/slab_common.c b/mm/slab_common.c @@ -939,6 +939,8 @@ EXPORT_SYMBOL(kmem_cache_destroy); * * Releases as many slabs as possible for a cache. * To help debugging, a zero exit status indicates all slabs were released. + * + * Return: %0 if all slabs were released, non-zero otherwise */ int kmem_cache_shrink(struct kmem_cache *cachep) { @@ -1425,7 +1427,7 @@ void dump_unreclaimable_slab(void) #if defined(CONFIG_MEMCG) void *memcg_slab_start(struct seq_file *m, loff_t *pos) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); mutex_lock(&slab_mutex); return seq_list_start(&memcg->kmem_caches, *pos); @@ -1433,7 +1435,7 @@ void *memcg_slab_start(struct seq_file *m, loff_t *pos) void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); return seq_list_next(p, &memcg->kmem_caches, pos); } @@ -1447,7 +1449,7 @@ int memcg_slab_show(struct seq_file *m, void *p) { struct kmem_cache *s = list_entry(p, struct kmem_cache, memcg_params.kmem_caches_node); - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); if (p == memcg->kmem_caches.next) print_slabinfo_header(m); @@ -1528,6 +1530,8 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, * This function is like krealloc() except it never frees the originally * allocated buffer. Use this if you don't want to free the buffer immediately * like, for example, with RCU. + * + * Return: pointer to the allocated memory or %NULL in case of error */ void *__krealloc(const void *p, size_t new_size, gfp_t flags) { @@ -1549,6 +1553,8 @@ EXPORT_SYMBOL(__krealloc); * lesser of the new and old sizes. If @p is %NULL, krealloc() * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a * %NULL pointer, the object pointed to is freed. + * + * Return: pointer to the allocated memory or %NULL in case of error */ void *krealloc(const void *p, size_t new_size, gfp_t flags) { diff --git a/mm/slub.c b/mm/slub.c @@ -1093,8 +1093,7 @@ static void setup_page_debug(struct kmem_cache *s, void *addr, int order) } static inline int alloc_consistency_checks(struct kmem_cache *s, - struct page *page, - void *object, unsigned long addr) + struct page *page, void *object) { if (!check_slab(s, page)) return 0; @@ -1115,7 +1114,7 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, void *object, unsigned long addr) { if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!alloc_consistency_checks(s, page, object, addr)) + if (!alloc_consistency_checks(s, page, object)) goto bad; } @@ -2130,7 +2129,7 @@ redo: if (!lock) { lock = 1; /* - * Taking the spinlock removes the possiblity + * Taking the spinlock removes the possibility * that acquire_slab() will see a slab page that * is frozen */ @@ -2254,8 +2253,8 @@ static void unfreeze_partials(struct kmem_cache *s, } /* - * Put a page that was just frozen (in __slab_free) into a partial page - * slot if available. + * Put a page that was just frozen (in __slab_free|get_partial_node) into a + * partial page slot if available. * * If we did not find a slot then simply move all the partials to the * per node partial list. @@ -2482,8 +2481,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, stat(s, ALLOC_SLAB); c->page = page; *pc = c; - } else - freelist = NULL; + } return freelist; } @@ -4264,7 +4262,7 @@ void __init kmem_cache_init(void) cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, slub_cpu_dead); - pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n", + pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n", cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); diff --git a/mm/sparse.c b/mm/sparse.c @@ -197,7 +197,7 @@ static inline int next_present_section_nr(int section_nr) } #define for_each_present_section_nr(start, section_nr) \ for (section_nr = next_present_section_nr(start-1); \ - ((section_nr >= 0) && \ + ((section_nr != -1) && \ (section_nr <= __highest_present_section_nr)); \ section_nr = next_present_section_nr(section_nr)) diff --git a/mm/swap.c b/mm/swap.c @@ -58,16 +58,16 @@ static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); static void __page_cache_release(struct page *page) { if (PageLRU(page)) { - struct zone *zone = page_zone(page); + pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; unsigned long flags; - spin_lock_irqsave(zone_lru_lock(zone), flags); - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); + spin_lock_irqsave(&pgdat->lru_lock, flags); + lruvec = mem_cgroup_page_lruvec(page, pgdat); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); - spin_unlock_irqrestore(zone_lru_lock(zone), flags); + spin_unlock_irqrestore(&pgdat->lru_lock, flags); } __ClearPageWaiters(page); mem_cgroup_uncharge(page); @@ -322,12 +322,12 @@ static inline void activate_page_drain(int cpu) void activate_page(struct page *page) { - struct zone *zone = page_zone(page); + pg_data_t *pgdat = page_pgdat(page); page = compound_head(page); - spin_lock_irq(zone_lru_lock(zone)); - __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL); - spin_unlock_irq(zone_lru_lock(zone)); + spin_lock_irq(&pgdat->lru_lock); + __activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL); + spin_unlock_irq(&pgdat->lru_lock); } #endif diff --git a/mm/swap_state.c b/mm/swap_state.c @@ -523,7 +523,7 @@ static unsigned long swapin_nr_pages(unsigned long offset) * This has been extended to use the NUMA policies from the mm triggering * the readahead. * - * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL. + * Caller must hold read mmap_sem if vmf->vma is not NULL. */ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_fault *vmf) @@ -543,6 +543,13 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, if (!mask) goto skip; + /* Test swap type to make sure the dereference is safe */ + if (likely(si->flags & (SWP_BLKDEV | SWP_FS))) { + struct inode *inode = si->swap_file->f_mapping->host; + if (inode_read_congested(inode)) + goto skip; + } + do_poll = false; /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; @@ -691,6 +698,20 @@ static void swap_ra_info(struct vm_fault *vmf, pte_unmap(orig_pte); } +/** + * swap_vma_readahead - swap in pages in hope we need them soon + * @entry: swap entry of this memory + * @gfp_mask: memory allocation flags + * @vmf: fault information + * + * Returns the struct page for entry and addr, after queueing swapin. + * + * Primitive swap readahead code. We simply read in a few pages whoes + * virtual addresses are around the fault address in the same vma. + * + * Caller must hold read mmap_sem if vmf->vma is not NULL. + * + */ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, struct vm_fault *vmf) { diff --git a/mm/swapfile.c b/mm/swapfile.c @@ -98,6 +98,15 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0); atomic_t nr_rotate_swap = ATOMIC_INIT(0); +static struct swap_info_struct *swap_type_to_swap_info(int type) +{ + if (type >= READ_ONCE(nr_swapfiles)) + return NULL; + + smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */ + return READ_ONCE(swap_info[type]); +} + static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ @@ -1044,12 +1053,14 @@ noswap: /* The only caller of this function is now suspend routine */ swp_entry_t get_swap_page_of_type(int type) { - struct swap_info_struct *si; + struct swap_info_struct *si = swap_type_to_swap_info(type); pgoff_t offset; - si = swap_info[type]; + if (!si) + goto fail; + spin_lock(&si->lock); - if (si && (si->flags & SWP_WRITEOK)) { + if (si->flags & SWP_WRITEOK) { atomic_long_dec(&nr_swap_pages); /* This is called for allocating swap entry, not cache */ offset = scan_swap_map(si, 1); @@ -1060,6 +1071,7 @@ swp_entry_t get_swap_page_of_type(int type) atomic_long_inc(&nr_swap_pages); } spin_unlock(&si->lock); +fail: return (swp_entry_t) {0}; } @@ -1071,9 +1083,9 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry) if (!entry.val) goto out; type = swp_type(entry); - if (type >= nr_swapfiles) + p = swap_type_to_swap_info(type); + if (!p) goto bad_nofile; - p = swap_info[type]; if (!(p->flags & SWP_USED)) goto bad_device; offset = swp_offset(entry); @@ -1697,10 +1709,9 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) sector_t swapdev_block(int type, pgoff_t offset) { struct block_device *bdev; + struct swap_info_struct *si = swap_type_to_swap_info(type); - if ((unsigned int)type >= nr_swapfiles) - return 0; - if (!(swap_info[type]->flags & SWP_WRITEOK)) + if (!si || !(si->flags & SWP_WRITEOK)) return 0; return map_swap_entry(swp_entry(type, offset), &bdev); } @@ -1799,44 +1810,77 @@ out_nolock: } static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + unsigned long addr, unsigned long end, + unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { - pte_t swp_pte = swp_entry_to_pte(entry); + struct page *page; + swp_entry_t entry; pte_t *pte; + struct swap_info_struct *si; + unsigned long offset; int ret = 0; + volatile unsigned char *swap_map; - /* - * We don't actually need pte lock while scanning for swp_pte: since - * we hold page lock and mmap_sem, swp_pte cannot be inserted into the - * page table while we're scanning; though it could get zapped, and on - * some architectures (e.g. x86_32 with PAE) we might catch a glimpse - * of unmatched parts which look like swp_pte, so unuse_pte must - * recheck under pte lock. Scanning without pte lock lets it be - * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. - */ + si = swap_info[type]; pte = pte_offset_map(pmd, addr); do { - /* - * swapoff spends a _lot_ of time in this loop! - * Test inline before going to call unuse_pte. - */ - if (unlikely(pte_same_as_swp(*pte, swp_pte))) { - pte_unmap(pte); - ret = unuse_pte(vma, pmd, addr, entry, page); - if (ret) - goto out; - pte = pte_offset_map(pmd, addr); + struct vm_fault vmf; + + if (!is_swap_pte(*pte)) + continue; + + entry = pte_to_swp_entry(*pte); + if (swp_type(entry) != type) + continue; + + offset = swp_offset(entry); + if (frontswap && !frontswap_test(si, offset)) + continue; + + pte_unmap(pte); + swap_map = &si->swap_map[offset]; + vmf.vma = vma; + vmf.address = addr; + vmf.pmd = pmd; + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); + if (!page) { + if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) + goto try_next; + return -ENOMEM; + } + + lock_page(page); + wait_on_page_writeback(page); + ret = unuse_pte(vma, pmd, addr, entry, page); + if (ret < 0) { + unlock_page(page); + put_page(page); + goto out; } + + try_to_free_swap(page); + unlock_page(page); + put_page(page); + + if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) { + ret = FRONTSWAP_PAGES_UNUSED; + goto out; + } +try_next: + pte = pte_offset_map(pmd, addr); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap(pte - 1); + + ret = 0; out: return ret; } static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { pmd_t *pmd; unsigned long next; @@ -1848,7 +1892,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; - ret = unuse_pte_range(vma, pmd, addr, next, entry, page); + ret = unuse_pte_range(vma, pmd, addr, next, type, + frontswap, fs_pages_to_unuse); if (ret) return ret; } while (pmd++, addr = next, addr != end); @@ -1857,7 +1902,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { pud_t *pud; unsigned long next; @@ -1868,7 +1914,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - ret = unuse_pmd_range(vma, pud, addr, next, entry, page); + ret = unuse_pmd_range(vma, pud, addr, next, type, + frontswap, fs_pages_to_unuse); if (ret) return ret; } while (pud++, addr = next, addr != end); @@ -1877,7 +1924,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { p4d_t *p4d; unsigned long next; @@ -1888,78 +1936,66 @@ static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; - ret = unuse_pud_range(vma, p4d, addr, next, entry, page); + ret = unuse_pud_range(vma, p4d, addr, next, type, + frontswap, fs_pages_to_unuse); if (ret) return ret; } while (p4d++, addr = next, addr != end); return 0; } -static int unuse_vma(struct vm_area_struct *vma, - swp_entry_t entry, struct page *page) +static int unuse_vma(struct vm_area_struct *vma, unsigned int type, + bool frontswap, unsigned long *fs_pages_to_unuse) { pgd_t *pgd; unsigned long addr, end, next; int ret; - if (page_anon_vma(page)) { - addr = page_address_in_vma(page, vma); - if (addr == -EFAULT) - return 0; - else - end = addr + PAGE_SIZE; - } else { - addr = vma->vm_start; - end = vma->vm_end; - } + addr = vma->vm_start; + end = vma->vm_end; pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - ret = unuse_p4d_range(vma, pgd, addr, next, entry, page); + ret = unuse_p4d_range(vma, pgd, addr, next, type, + frontswap, fs_pages_to_unuse); if (ret) return ret; } while (pgd++, addr = next, addr != end); return 0; } -static int unuse_mm(struct mm_struct *mm, - swp_entry_t entry, struct page *page) +static int unuse_mm(struct mm_struct *mm, unsigned int type, + bool frontswap, unsigned long *fs_pages_to_unuse) { struct vm_area_struct *vma; int ret = 0; - if (!down_read_trylock(&mm->mmap_sem)) { - /* - * Activate page so shrink_inactive_list is unlikely to unmap - * its ptes while lock is dropped, so swapoff can make progress. - */ - activate_page(page); - unlock_page(page); - down_read(&mm->mmap_sem); - lock_page(page); - } + down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) - break; + if (vma->anon_vma) { + ret = unuse_vma(vma, type, frontswap, + fs_pages_to_unuse); + if (ret) + break; + } cond_resched(); } up_read(&mm->mmap_sem); - return (ret < 0)? ret: 0; + return ret; } /* * Scan swap_map (or frontswap_map if frontswap parameter is true) - * from current position to next entry still in use. - * Recycle to start on reaching the end, returning 0 when empty. + * from current position to next entry still in use. Return 0 + * if there are no inuse entries after prev till end of the map. */ static unsigned int find_next_to_unuse(struct swap_info_struct *si, unsigned int prev, bool frontswap) { - unsigned int max = si->max; - unsigned int i = prev; + unsigned int i; unsigned char count; /* @@ -1968,20 +2004,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, * hits are okay, and sys_swapoff() has already prevented new * allocations from this area (while holding swap_lock). */ - for (;;) { - if (++i >= max) { - if (!prev) { - i = 0; - break; - } - /* - * No entries in use at top of swap_map, - * loop back to start and recheck there. - */ - max = prev + 1; - prev = 0; - i = 1; - } + for (i = prev + 1; i < si->max; i++) { count = READ_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) if (!frontswap || frontswap_test(si, i)) @@ -1989,240 +2012,121 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, if ((i % LATENCY_LIMIT) == 0) cond_resched(); } + + if (i == si->max) + i = 0; + return i; } /* - * We completely avoid races by reading each swap page in advance, - * and then search for the process using it. All the necessary - * page table adjustments can then be made atomically. - * - * if the boolean frontswap is true, only unuse pages_to_unuse pages; + * If the boolean frontswap is true, only unuse pages_to_unuse pages; * pages_to_unuse==0 means all pages; ignored if frontswap is false */ +#define SWAP_UNUSE_MAX_TRIES 3 int try_to_unuse(unsigned int type, bool frontswap, unsigned long pages_to_unuse) { + struct mm_struct *prev_mm; + struct mm_struct *mm; + struct list_head *p; + int retval = 0; struct swap_info_struct *si = swap_info[type]; - struct mm_struct *start_mm; - volatile unsigned char *swap_map; /* swap_map is accessed without - * locking. Mark it as volatile - * to prevent compiler doing - * something odd. - */ - unsigned char swcount; struct page *page; swp_entry_t entry; - unsigned int i = 0; - int retval = 0; + unsigned int i; + int retries = 0; - /* - * When searching mms for an entry, a good strategy is to - * start at the first mm we freed the previous entry from - * (though actually we don't notice whether we or coincidence - * freed the entry). Initialize this start_mm with a hold. - * - * A simpler strategy would be to start at the last mm we - * freed the previous entry from; but that would take less - * advantage of mmlist ordering, which clusters forked mms - * together, child after parent. If we race with dup_mmap(), we - * prefer to resolve parent before child, lest we miss entries - * duplicated after we scanned child: using last mm would invert - * that. - */ - start_mm = &init_mm; - mmget(&init_mm); + if (!si->inuse_pages) + return 0; - /* - * Keep on scanning until all entries have gone. Usually, - * one pass through swap_map is enough, but not necessarily: - * there are races when an instance of an entry might be missed. - */ - while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { + if (!frontswap) + pages_to_unuse = 0; + +retry: + retval = shmem_unuse(type, frontswap, &pages_to_unuse); + if (retval) + goto out; + + prev_mm = &init_mm; + mmget(prev_mm); + + spin_lock(&mmlist_lock); + p = &init_mm.mmlist; + while ((p = p->next) != &init_mm.mmlist) { if (signal_pending(current)) { retval = -EINTR; break; } - /* - * Get a page for the entry, using the existing swap - * cache page if there is one. Otherwise, get a clean - * page and read the swap into it. - */ - swap_map = &si->swap_map[i]; - entry = swp_entry(type, i); - page = read_swap_cache_async(entry, - GFP_HIGHUSER_MOVABLE, NULL, 0, false); - if (!page) { - /* - * Either swap_duplicate() failed because entry - * has been freed independently, and will not be - * reused since sys_swapoff() already disabled - * allocation from here, or alloc_page() failed. - */ - swcount = *swap_map; - /* - * We don't hold lock here, so the swap entry could be - * SWAP_MAP_BAD (when the cluster is discarding). - * Instead of fail out, We can just skip the swap - * entry because swapoff will wait for discarding - * finish anyway. - */ - if (!swcount || swcount == SWAP_MAP_BAD) - continue; - retval = -ENOMEM; - break; - } + mm = list_entry(p, struct mm_struct, mmlist); + if (!mmget_not_zero(mm)) + continue; + spin_unlock(&mmlist_lock); + mmput(prev_mm); + prev_mm = mm; + retval = unuse_mm(mm, type, frontswap, &pages_to_unuse); - /* - * Don't hold on to start_mm if it looks like exiting. - */ - if (atomic_read(&start_mm->mm_users) == 1) { - mmput(start_mm); - start_mm = &init_mm; - mmget(&init_mm); + if (retval) { + mmput(prev_mm); + goto out; } /* - * Wait for and lock page. When do_swap_page races with - * try_to_unuse, do_swap_page can handle the fault much - * faster than try_to_unuse can locate the entry. This - * apparently redundant "wait_on_page_locked" lets try_to_unuse - * defer to do_swap_page in such a case - in some tests, - * do_swap_page and try_to_unuse repeatedly compete. - */ - wait_on_page_locked(page); - wait_on_page_writeback(page); - lock_page(page); - wait_on_page_writeback(page); - - /* - * Remove all references to entry. + * Make sure that we aren't completely killing + * interactive performance. */ - swcount = *swap_map; - if (swap_count(swcount) == SWAP_MAP_SHMEM) { - retval = shmem_unuse(entry, page); - /* page has already been unlocked and released */ - if (retval < 0) - break; - continue; - } - if (swap_count(swcount) && start_mm != &init_mm) - retval = unuse_mm(start_mm, entry, page); - - if (swap_count(*swap_map)) { - int set_start_mm = (*swap_map >= swcount); - struct list_head *p = &start_mm->mmlist; - struct mm_struct *new_start_mm = start_mm; - struct mm_struct *prev_mm = start_mm; - struct mm_struct *mm; - - mmget(new_start_mm); - mmget(prev_mm); - spin_lock(&mmlist_lock); - while (swap_count(*swap_map) && !retval && - (p = p->next) != &start_mm->mmlist) { - mm = list_entry(p, struct mm_struct, mmlist); - if (!mmget_not_zero(mm)) - continue; - spin_unlock(&mmlist_lock); - mmput(prev_mm); - prev_mm = mm; + cond_resched(); + spin_lock(&mmlist_lock); + } + spin_unlock(&mmlist_lock); - cond_resched(); + mmput(prev_mm); - swcount = *swap_map; - if (!swap_count(swcount)) /* any usage ? */ - ; - else if (mm == &init_mm) - set_start_mm = 1; - else - retval = unuse_mm(mm, entry, page); - - if (set_start_mm && *swap_map < swcount) { - mmput(new_start_mm); - mmget(mm); - new_start_mm = mm; - set_start_mm = 0; - } - spin_lock(&mmlist_lock); - } - spin_unlock(&mmlist_lock); - mmput(prev_mm); - mmput(start_mm); - start_mm = new_start_mm; - } - if (retval) { - unlock_page(page); - put_page(page); - break; - } + i = 0; + while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { - /* - * If a reference remains (rare), we would like to leave - * the page in the swap cache; but try_to_unmap could - * then re-duplicate the entry once we drop page lock, - * so we might loop indefinitely; also, that page could - * not be swapped out to other storage meanwhile. So: - * delete from cache even if there's another reference, - * after ensuring that the data has been saved to disk - - * since if the reference remains (rarer), it will be - * read from disk into another page. Splitting into two - * pages would be incorrect if swap supported "shared - * private" pages, but they are handled by tmpfs files. - * - * Given how unuse_vma() targets one particular offset - * in an anon_vma, once the anon_vma has been determined, - * this splitting happens to be just what is needed to - * handle where KSM pages have been swapped out: re-reading - * is unnecessarily slow, but we can fix that later on. - */ - if (swap_count(*swap_map) && - PageDirty(page) && PageSwapCache(page)) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - }; - - swap_writepage(compound_head(page), &wbc); - lock_page(page); - wait_on_page_writeback(page); - } + entry = swp_entry(type, i); + page = find_get_page(swap_address_space(entry), i); + if (!page) + continue; /* * It is conceivable that a racing task removed this page from - * swap cache just before we acquired the page lock at the top, - * or while we dropped it in unuse_mm(). The page might even - * be back in swap cache on another swap area: that we must not - * delete, since it may not have been written out to swap yet. - */ - if (PageSwapCache(page) && - likely(page_private(page) == entry.val) && - (!PageTransCompound(page) || - !swap_page_trans_huge_swapped(si, entry))) - delete_from_swap_cache(compound_head(page)); - - /* - * So we could skip searching mms once swap count went - * to 1, we did not mark any present ptes as dirty: must - * mark page dirty so shrink_page_list will preserve it. + * swap cache just before we acquired the page lock. The page + * might even be back in swap cache on another swap area. But + * that is okay, try_to_free_swap() only removes stale pages. */ - SetPageDirty(page); + lock_page(page); + wait_on_page_writeback(page); + try_to_free_swap(page); unlock_page(page); put_page(page); /* - * Make sure that we aren't completely killing - * interactive performance. + * For frontswap, we just need to unuse pages_to_unuse, if + * it was specified. Need not check frontswap again here as + * we already zeroed out pages_to_unuse if not frontswap. */ - cond_resched(); - if (frontswap && pages_to_unuse > 0) { - if (!--pages_to_unuse) - break; - } + if (pages_to_unuse && --pages_to_unuse == 0) + goto out; } - mmput(start_mm); - return retval; + /* + * Lets check again to see if there are still swap entries in the map. + * If yes, we would need to do retry the unuse logic again. + * Under global memory pressure, swap entries can be reinserted back + * into process space after the mmlist loop above passes over them. + * Its not worth continuosuly retrying to unuse the swap in this case. + * So we try SWAP_UNUSE_MAX_TRIES times. + */ + if (++retries >= SWAP_UNUSE_MAX_TRIES) + retval = -EBUSY; + else if (si->inuse_pages) + goto retry; + +out: + return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval; } /* @@ -2258,7 +2162,7 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) struct swap_extent *se; pgoff_t offset; - sis = swap_info[swp_type(entry)]; + sis = swp_swap_info(entry); *bdev = sis->bdev; offset = swp_offset(entry); @@ -2700,9 +2604,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) if (!l) return SEQ_START_TOKEN; - for (type = 0; type < nr_swapfiles; type++) { - smp_rmb(); /* read nr_swapfiles before swap_info[type] */ - si = swap_info[type]; + for (type = 0; (si = swap_type_to_swap_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; if (!--l) @@ -2722,9 +2624,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) else type = si->type + 1; - for (; type < nr_swapfiles; type++) { - smp_rmb(); /* read nr_swapfiles before swap_info[type] */ - si = swap_info[type]; + for (; (si = swap_type_to_swap_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; ++*pos; @@ -2813,9 +2713,8 @@ static struct swap_info_struct *alloc_swap_info(void) struct swap_info_struct *p; unsigned int type; int i; - int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node); - p = kvzalloc(size, GFP_KERNEL); + p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); @@ -2831,14 +2730,14 @@ static struct swap_info_struct *alloc_swap_info(void) } if (type >= nr_swapfiles) { p->type = type; - swap_info[type] = p; + WRITE_ONCE(swap_info[type], p); /* * Write swap_info[type] before nr_swapfiles, in case a * racing procfs swap_start() or swap_next() is reading them. * (We never shrink nr_swapfiles, we never free this entry.) */ smp_wmb(); - nr_swapfiles++; + WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1); } else { kvfree(p); p = swap_info[type]; @@ -3358,7 +3257,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) { struct swap_info_struct *p; struct swap_cluster_info *ci; - unsigned long offset, type; + unsigned long offset; unsigned char count; unsigned char has_cache; int err = -EINVAL; @@ -3366,10 +3265,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) if (non_swap_entry(entry)) goto out; - type = swp_type(entry); - if (type >= nr_swapfiles) + p = swp_swap_info(entry); + if (!p) goto bad_file; - p = swap_info[type]; + offset = swp_offset(entry); if (unlikely(offset >= p->max)) goto out; @@ -3466,7 +3365,7 @@ int swapcache_prepare(swp_entry_t entry) struct swap_info_struct *swp_swap_info(swp_entry_t entry) { - return swap_info[swp_type(entry)]; + return swap_type_to_swap_info(swp_type(entry)); } struct swap_info_struct *page_swap_info(struct page *page) diff --git a/mm/truncate.c b/mm/truncate.c @@ -539,6 +539,8 @@ EXPORT_SYMBOL(truncate_inode_pages_final); * invalidate_mapping_pages() will not block on IO activity. It will not * invalidate pages which are dirty, locked, under writeback or mapped into * pagetables. + * + * Return: the number of the pages that were invalidated */ unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) @@ -664,7 +666,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * - * Returns -EBUSY if any pages could not be invalidated. + * Return: -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) @@ -761,7 +763,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * - * Returns -EBUSY if any pages could not be invalidated. + * Return: -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2(struct address_space *mapping) { diff --git a/mm/util.c b/mm/util.c @@ -36,6 +36,8 @@ EXPORT_SYMBOL(kfree_const); * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Return: newly allocated copy of @s or %NULL in case of error */ char *kstrdup(const char *s, gfp_t gfp) { @@ -58,9 +60,10 @@ EXPORT_SYMBOL(kstrdup); * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * - * Function returns source string if it is in .rodata section otherwise it - * fallbacks to kstrdup. - * Strings allocated by kstrdup_const should be freed by kfree_const. + * Note: Strings allocated by kstrdup_const should be freed by kfree_const. + * + * Return: source string if it is in .rodata section otherwise + * fallback to kstrdup. */ const char *kstrdup_const(const char *s, gfp_t gfp) { @@ -78,6 +81,8 @@ EXPORT_SYMBOL(kstrdup_const); * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Note: Use kmemdup_nul() instead if the size is known exactly. + * + * Return: newly allocated copy of @s or %NULL in case of error */ char *kstrndup(const char *s, size_t max, gfp_t gfp) { @@ -103,6 +108,8 @@ EXPORT_SYMBOL(kstrndup); * @src: memory region to duplicate * @len: memory region length * @gfp: GFP mask to use + * + * Return: newly allocated copy of @src or %NULL in case of error */ void *kmemdup(const void *src, size_t len, gfp_t gfp) { @@ -120,6 +127,9 @@ EXPORT_SYMBOL(kmemdup); * @s: The data to stringify * @len: The size of the data * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Return: newly allocated copy of @s with NUL-termination or %NULL in + * case of error */ char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) { @@ -143,7 +153,7 @@ EXPORT_SYMBOL(kmemdup_nul); * @src: source address in user space * @len: number of bytes to copy * - * Returns an ERR_PTR() on failure. Result is physically + * Return: an ERR_PTR() on failure. Result is physically * contiguous, to be freed by kfree(). */ void *memdup_user(const void __user *src, size_t len) @@ -169,7 +179,7 @@ EXPORT_SYMBOL(memdup_user); * @src: source address in user space * @len: number of bytes to copy * - * Returns an ERR_PTR() on failure. Result may be not + * Return: an ERR_PTR() on failure. Result may be not * physically contiguous. Use kvfree() to free. */ void *vmemdup_user(const void __user *src, size_t len) @@ -193,6 +203,8 @@ EXPORT_SYMBOL(vmemdup_user); * strndup_user - duplicate an existing string from user space * @s: The string to duplicate * @n: Maximum number of bytes to copy, including the trailing NUL. + * + * Return: newly allocated copy of @s or %NULL in case of error */ char *strndup_user(const char __user *s, long n) { @@ -224,7 +236,7 @@ EXPORT_SYMBOL(strndup_user); * @src: source address in user space * @len: number of bytes to copy * - * Returns an ERR_PTR() on failure. + * Return: an ERR_PTR() on failure. */ void *memdup_user_nul(const void __user *src, size_t len) { @@ -310,10 +322,6 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - * * get_user_pages_fast provides equivalent functionality to get_user_pages, * operating on current and current->mm, with force=0 and vma=NULL. However * unlike get_user_pages, it must be called without mmap_sem held. @@ -325,6 +333,10 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * pages have to be faulted in, it may turn out to be slightly slower so * callers need to carefully consider what to use. On many architectures, * get_user_pages_fast simply falls back to get_user_pages. + * + * Return: number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. */ int __weak get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) @@ -386,6 +398,8 @@ EXPORT_SYMBOL(vm_mmap); * * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not * fall back to vmalloc. + * + * Return: pointer to the allocated memory of %NULL in case of failure */ void *kvmalloc_node(size_t size, gfp_t flags, int node) { @@ -729,7 +743,8 @@ error: * @buffer: the buffer to copy to. * @buflen: the length of the buffer. Larger cmdline values are truncated * to this length. - * Returns the size of the cmdline field copied. Note that the copy does + * + * Return: the size of the cmdline field copied. Note that the copy does * not guarantee an ending NULL byte. */ int get_cmdline(struct task_struct *task, char *buffer, int buflen) diff --git a/mm/vmalloc.c b/mm/vmalloc.c @@ -498,7 +498,11 @@ nocache: } found: - if (addr + size > vend) + /* + * Check also calculated address against the vstart, + * because it can be 0 because of big align request. + */ + if (addr + size > vend || addr < vstart) goto overflow; va->va_start = addr; @@ -840,7 +844,7 @@ static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off) * @order: how many 2^order pages should be occupied in newly allocated block * @gfp_mask: flags for the page level allocator * - * Returns: virtual address in a newly allocated block or ERR_PTR(-errno) + * Return: virtual address in a newly allocated block or ERR_PTR(-errno) */ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) { @@ -1187,6 +1191,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro EXPORT_SYMBOL(vm_map_ram); static struct vm_struct *vmlist __initdata; + /** * vm_area_add_early - add vmap area early during boot * @vm: vm_struct to add @@ -1421,13 +1426,15 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, } /** - * get_vm_area - reserve a contiguous kernel virtual area - * @size: size of the area - * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC + * get_vm_area - reserve a contiguous kernel virtual area + * @size: size of the area + * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC * - * Search an area of @size in the kernel virtual mapping area, - * and reserved it for out purposes. Returns the area descriptor - * on success or %NULL on failure. + * Search an area of @size in the kernel virtual mapping area, + * and reserved it for out purposes. Returns the area descriptor + * on success or %NULL on failure. + * + * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { @@ -1444,12 +1451,14 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, } /** - * find_vm_area - find a continuous kernel virtual area - * @addr: base address + * find_vm_area - find a continuous kernel virtual area + * @addr: base address + * + * Search for the kernel VM area starting at @addr, and return it. + * It is up to the caller to do all required locking to keep the returned + * pointer valid. * - * Search for the kernel VM area starting at @addr, and return it. - * It is up to the caller to do all required locking to keep the returned - * pointer valid. + * Return: pointer to the found area or %NULL on faulure */ struct vm_struct *find_vm_area(const void *addr) { @@ -1463,12 +1472,14 @@ struct vm_struct *find_vm_area(const void *addr) } /** - * remove_vm_area - find and remove a continuous kernel virtual area - * @addr: base address + * remove_vm_area - find and remove a continuous kernel virtual area + * @addr: base address * - * Search for the kernel VM area starting at @addr, and remove it. - * This function returns the found VM area, but using it is NOT safe - * on SMP machines, except for its size or flags. + * Search for the kernel VM area starting at @addr, and remove it. + * This function returns the found VM area, but using it is NOT safe + * on SMP machines, except for its size or flags. + * + * Return: pointer to the found area or %NULL on faulure */ struct vm_struct *remove_vm_area(const void *addr) { @@ -1505,7 +1516,7 @@ static void __vunmap(const void *addr, int deallocate_pages) addr)) return; - area = find_vmap_area((unsigned long)addr)->vm; + area = find_vm_area(addr); if (unlikely(!area)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); @@ -1548,11 +1559,11 @@ static inline void __vfree_deferred(const void *addr) } /** - * vfree_atomic - release memory allocated by vmalloc() - * @addr: memory base address + * vfree_atomic - release memory allocated by vmalloc() + * @addr: memory base address * - * This one is just like vfree() but can be called in any atomic context - * except NMIs. + * This one is just like vfree() but can be called in any atomic context + * except NMIs. */ void vfree_atomic(const void *addr) { @@ -1565,21 +1576,29 @@ void vfree_atomic(const void *addr) __vfree_deferred(addr); } +static void __vfree(const void *addr) +{ + if (unlikely(in_interrupt())) + __vfree_deferred(addr); + else + __vunmap(addr, 1); +} + /** - * vfree - release memory allocated by vmalloc() - * @addr: memory base address + * vfree - release memory allocated by vmalloc() + * @addr: memory base address * - * Free the virtually continuous memory area starting at @addr, as - * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is - * NULL, no operation is performed. + * Free the virtually continuous memory area starting at @addr, as + * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is + * NULL, no operation is performed. * - * Must not be called in NMI context (strictly speaking, only if we don't - * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling - * conventions for vfree() arch-depenedent would be a really bad idea) + * Must not be called in NMI context (strictly speaking, only if we don't + * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling + * conventions for vfree() arch-depenedent would be a really bad idea) * - * May sleep if called *not* from interrupt context. + * May sleep if called *not* from interrupt context. * - * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) + * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) */ void vfree(const void *addr) { @@ -1591,21 +1610,19 @@ void vfree(const void *addr) if (!addr) return; - if (unlikely(in_interrupt())) - __vfree_deferred(addr); - else - __vunmap(addr, 1); + + __vfree(addr); } EXPORT_SYMBOL(vfree); /** - * vunmap - release virtual mapping obtained by vmap() - * @addr: memory base address + * vunmap - release virtual mapping obtained by vmap() + * @addr: memory base address * - * Free the virtually contiguous memory area starting at @addr, - * which was created from the page array passed to vmap(). + * Free the virtually contiguous memory area starting at @addr, + * which was created from the page array passed to vmap(). * - * Must not be called in interrupt context. + * Must not be called in interrupt context. */ void vunmap(const void *addr) { @@ -1617,17 +1634,19 @@ void vunmap(const void *addr) EXPORT_SYMBOL(vunmap); /** - * vmap - map an array of pages into virtually contiguous space - * @pages: array of page pointers - * @count: number of pages to map - * @flags: vm_area->flags - * @prot: page protection for the mapping - * - * Maps @count pages from @pages into contiguous kernel virtual - * space. + * vmap - map an array of pages into virtually contiguous space + * @pages: array of page pointers + * @count: number of pages to map + * @flags: vm_area->flags + * @prot: page protection for the mapping + * + * Maps @count pages from @pages into contiguous kernel virtual + * space. + * + * Return: the address of the area or %NULL on failure */ void *vmap(struct page **pages, unsigned int count, - unsigned long flags, pgprot_t prot) + unsigned long flags, pgprot_t prot) { struct vm_struct *area; unsigned long size; /* In bytes */ @@ -1709,25 +1728,27 @@ fail: warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure, allocated %ld of %ld bytes", (area->nr_pages*PAGE_SIZE), area->size); - vfree(area->addr); + __vfree(area->addr); return NULL; } /** - * __vmalloc_node_range - allocate virtually contiguous memory - * @size: allocation size - * @align: desired alignment - * @start: vm area range start - * @end: vm area range end - * @gfp_mask: flags for the page level allocator - * @prot: protection mask for the allocated pages - * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) - * @node: node to use for allocation or NUMA_NO_NODE - * @caller: caller's return address - * - * Allocate enough pages to cover @size from the page level - * allocator with @gfp_mask flags. Map them into contiguous - * kernel virtual space, using a pagetable protection of @prot. + * __vmalloc_node_range - allocate virtually contiguous memory + * @size: allocation size + * @align: desired alignment + * @start: vm area range start + * @end: vm area range end + * @gfp_mask: flags for the page level allocator + * @prot: protection mask for the allocated pages + * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) + * @node: node to use for allocation or NUMA_NO_NODE + * @caller: caller's return address + * + * Allocate enough pages to cover @size from the page level + * allocator with @gfp_mask flags. Map them into contiguous + * kernel virtual space, using a pagetable protection of @prot. + * + * Return: the address of the area or %NULL on failure */ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, @@ -1768,25 +1789,35 @@ fail: return NULL; } +/* + * This is only for performance analysis of vmalloc and stress purpose. + * It is required by vmalloc test module, therefore do not use it other + * than that. + */ +#ifdef CONFIG_TEST_VMALLOC_MODULE +EXPORT_SYMBOL_GPL(__vmalloc_node_range); +#endif + /** - * __vmalloc_node - allocate virtually contiguous memory - * @size: allocation size - * @align: desired alignment - * @gfp_mask: flags for the page level allocator - * @prot: protection mask for the allocated pages - * @node: node to use for allocation or NUMA_NO_NODE - * @caller: caller's return address + * __vmalloc_node - allocate virtually contiguous memory + * @size: allocation size + * @align: desired alignment + * @gfp_mask: flags for the page level allocator + * @prot: protection mask for the allocated pages + * @node: node to use for allocation or NUMA_NO_NODE + * @caller: caller's return address * - * Allocate enough pages to cover @size from the page level - * allocator with @gfp_mask flags. Map them into contiguous - * kernel virtual space, using a pagetable protection of @prot. + * Allocate enough pages to cover @size from the page level + * allocator with @gfp_mask flags. Map them into contiguous + * kernel virtual space, using a pagetable protection of @prot. * - * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL - * and __GFP_NOFAIL are not supported + * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL + * and __GFP_NOFAIL are not supported * - * Any use of gfp flags outside of GFP_KERNEL should be consulted - * with mm people. + * Any use of gfp flags outside of GFP_KERNEL should be consulted + * with mm people. * + * Return: pointer to the allocated memory or %NULL on error */ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, @@ -1818,13 +1849,16 @@ void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, } /** - * vmalloc - allocate virtually contiguous memory - * @size: allocation size - * Allocate enough pages to cover @size from the page level - * allocator and map them into contiguous kernel virtual space. + * vmalloc - allocate virtually contiguous memory + * @size: allocation size + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. + * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc(unsigned long size) { @@ -1834,14 +1868,17 @@ void *vmalloc(unsigned long size) EXPORT_SYMBOL(vmalloc); /** - * vzalloc - allocate virtually contiguous memory with zero fill - * @size: allocation size - * Allocate enough pages to cover @size from the page level - * allocator and map them into contiguous kernel virtual space. - * The memory allocated is set to zero. - * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. + * vzalloc - allocate virtually contiguous memory with zero fill + * @size: allocation size + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + * + * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc(unsigned long size) { @@ -1856,34 +1893,30 @@ EXPORT_SYMBOL(vzalloc); * * The resulting memory area is zeroed so it can be mapped to userspace * without leaking data. + * + * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_user(unsigned long size) { - struct vm_struct *area; - void *ret; - - ret = __vmalloc_node(size, SHMLBA, - GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL, NUMA_NO_NODE, - __builtin_return_address(0)); - if (ret) { - area = find_vm_area(ret); - area->flags |= VM_USERMAP; - } - return ret; + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, + VM_USERMAP, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_user); /** - * vmalloc_node - allocate memory on a specific node - * @size: allocation size - * @node: numa node + * vmalloc_node - allocate memory on a specific node + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. * - * Allocate enough pages to cover @size from the page level - * allocator and map them into contiguous kernel virtual space. + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. + * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_node(unsigned long size, int node) { @@ -1903,6 +1936,8 @@ EXPORT_SYMBOL(vmalloc_node); * * For tight control over page level allocator and protection flags * use __vmalloc_node() instead. + * + * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc_node(unsigned long size, int node) { @@ -1912,17 +1947,18 @@ void *vzalloc_node(unsigned long size, int node) EXPORT_SYMBOL(vzalloc_node); /** - * vmalloc_exec - allocate virtually contiguous, executable memory - * @size: allocation size + * vmalloc_exec - allocate virtually contiguous, executable memory + * @size: allocation size * - * Kernel-internal function to allocate enough pages to cover @size - * the page level allocator and map them into contiguous and - * executable kernel virtual space. + * Kernel-internal function to allocate enough pages to cover @size + * the page level allocator and map them into contiguous and + * executable kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. + * Return: pointer to the allocated memory or %NULL on error */ - void *vmalloc_exec(unsigned long size) { return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, @@ -1942,11 +1978,13 @@ void *vmalloc_exec(unsigned long size) #endif /** - * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) - * @size: allocation size + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) + * @size: allocation size * - * Allocate enough 32bit PA addressable pages to cover @size from the - * page level allocator and map them into contiguous kernel virtual space. + * Allocate enough 32bit PA addressable pages to cover @size from the + * page level allocator and map them into contiguous kernel virtual space. + * + * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_32(unsigned long size) { @@ -1957,23 +1995,19 @@ EXPORT_SYMBOL(vmalloc_32); /** * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory - * @size: allocation size + * @size: allocation size * * The resulting memory area is 32bit addressable and zeroed so it can be * mapped to userspace without leaking data. + * + * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_32_user(unsigned long size) { - struct vm_struct *area; - void *ret; - - ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, - NUMA_NO_NODE, __builtin_return_address(0)); - if (ret) { - area = find_vm_area(ret); - area->flags |= VM_USERMAP; - } - return ret; + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, + VM_USERMAP, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32_user); @@ -2059,31 +2093,29 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) } /** - * vread() - read vmalloc area in a safe way. - * @buf: buffer for reading data - * @addr: vm address. - * @count: number of bytes to be read. - * - * Returns # of bytes which addr and buf should be increased. - * (same number to @count). Returns 0 if [addr...addr+count) doesn't - * includes any intersect with alive vmalloc area. - * - * This function checks that addr is a valid vmalloc'ed area, and - * copy data from that area to a given buffer. If the given memory range - * of [addr...addr+count) includes some valid address, data is copied to - * proper area of @buf. If there are memory holes, they'll be zero-filled. - * IOREMAP area is treated as memory hole and no copy is done. - * - * If [addr...addr+count) doesn't includes any intersects with alive - * vm_struct area, returns 0. @buf should be kernel's buffer. - * - * Note: In usual ops, vread() is never necessary because the caller - * should know vmalloc() area is valid and can use memcpy(). - * This is for routines which have to access vmalloc area without - * any informaion, as /dev/kmem. - * + * vread() - read vmalloc area in a safe way. + * @buf: buffer for reading data + * @addr: vm address. + * @count: number of bytes to be read. + * + * This function checks that addr is a valid vmalloc'ed area, and + * copy data from that area to a given buffer. If the given memory range + * of [addr...addr+count) includes some valid address, data is copied to + * proper area of @buf. If there are memory holes, they'll be zero-filled. + * IOREMAP area is treated as memory hole and no copy is done. + * + * If [addr...addr+count) doesn't includes any intersects with alive + * vm_struct area, returns 0. @buf should be kernel's buffer. + * + * Note: In usual ops, vread() is never necessary because the caller + * should know vmalloc() area is valid and can use memcpy(). + * This is for routines which have to access vmalloc area without + * any informaion, as /dev/kmem. + * + * Return: number of bytes for which addr and buf should be increased + * (same number as @count) or %0 if [addr...addr+count) doesn't + * include any intersection with valid vmalloc area */ - long vread(char *buf, char *addr, unsigned long count) { struct vmap_area *va; @@ -2140,31 +2172,29 @@ finished: } /** - * vwrite() - write vmalloc area in a safe way. - * @buf: buffer for source data - * @addr: vm address. - * @count: number of bytes to be read. - * - * Returns # of bytes which addr and buf should be incresed. - * (same number to @count). - * If [addr...addr+count) doesn't includes any intersect with valid - * vmalloc area, returns 0. - * - * This function checks that addr is a valid vmalloc'ed area, and - * copy data from a buffer to the given addr. If specified range of - * [addr...addr+count) includes some valid address, data is copied from - * proper area of @buf. If there are memory holes, no copy to hole. - * IOREMAP area is treated as memory hole and no copy is done. - * - * If [addr...addr+count) doesn't includes any intersects with alive - * vm_struct area, returns 0. @buf should be kernel's buffer. - * - * Note: In usual ops, vwrite() is never necessary because the caller - * should know vmalloc() area is valid and can use memcpy(). - * This is for routines which have to access vmalloc area without - * any informaion, as /dev/kmem. + * vwrite() - write vmalloc area in a safe way. + * @buf: buffer for source data + * @addr: vm address. + * @count: number of bytes to be read. + * + * This function checks that addr is a valid vmalloc'ed area, and + * copy data from a buffer to the given addr. If specified range of + * [addr...addr+count) includes some valid address, data is copied from + * proper area of @buf. If there are memory holes, no copy to hole. + * IOREMAP area is treated as memory hole and no copy is done. + * + * If [addr...addr+count) doesn't includes any intersects with alive + * vm_struct area, returns 0. @buf should be kernel's buffer. + * + * Note: In usual ops, vwrite() is never necessary because the caller + * should know vmalloc() area is valid and can use memcpy(). + * This is for routines which have to access vmalloc area without + * any informaion, as /dev/kmem. + * + * Return: number of bytes for which addr and buf should be + * increased (same number as @count) or %0 if [addr...addr+count) + * doesn't include any intersection with valid vmalloc area */ - long vwrite(char *buf, char *addr, unsigned long count) { struct vmap_area *va; @@ -2216,20 +2246,20 @@ finished: } /** - * remap_vmalloc_range_partial - map vmalloc pages to userspace - * @vma: vma to cover - * @uaddr: target user address to start at - * @kaddr: virtual address of vmalloc kernel memory - * @size: size of map area + * remap_vmalloc_range_partial - map vmalloc pages to userspace + * @vma: vma to cover + * @uaddr: target user address to start at + * @kaddr: virtual address of vmalloc kernel memory + * @size: size of map area * - * Returns: 0 for success, -Exxx on failure + * Returns: 0 for success, -Exxx on failure * - * This function checks that @kaddr is a valid vmalloc'ed area, - * and that it is big enough to cover the range starting at - * @uaddr in @vma. Will return failure if that criteria isn't - * met. + * This function checks that @kaddr is a valid vmalloc'ed area, + * and that it is big enough to cover the range starting at + * @uaddr in @vma. Will return failure if that criteria isn't + * met. * - * Similar to remap_pfn_range() (see mm/memory.c) + * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, void *kaddr, unsigned long size) @@ -2248,7 +2278,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, if (!(area->flags & VM_USERMAP)) return -EINVAL; - if (kaddr + size > area->addr + area->size) + if (kaddr + size > area->addr + get_vm_area_size(area)) return -EINVAL; do { @@ -2271,18 +2301,18 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, EXPORT_SYMBOL(remap_vmalloc_range_partial); /** - * remap_vmalloc_range - map vmalloc pages to userspace - * @vma: vma to cover (map full range of vma) - * @addr: vmalloc memory - * @pgoff: number of pages into addr before first page to map + * remap_vmalloc_range - map vmalloc pages to userspace + * @vma: vma to cover (map full range of vma) + * @addr: vmalloc memory + * @pgoff: number of pages into addr before first page to map * - * Returns: 0 for success, -Exxx on failure + * Returns: 0 for success, -Exxx on failure * - * This function checks that addr is a valid vmalloc'ed area, and - * that it is big enough to cover the vma. Will return failure if - * that criteria isn't met. + * This function checks that addr is a valid vmalloc'ed area, and + * that it is big enough to cover the vma. Will return failure if + * that criteria isn't met. * - * Similar to remap_pfn_range() (see mm/memory.c) + * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff) @@ -2314,18 +2344,18 @@ static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) } /** - * alloc_vm_area - allocate a range of kernel address space - * @size: size of the area - * @ptes: returns the PTEs for the address space + * alloc_vm_area - allocate a range of kernel address space + * @size: size of the area + * @ptes: returns the PTEs for the address space * - * Returns: NULL on failure, vm_struct on success + * Returns: NULL on failure, vm_struct on success * - * This function reserves a range of kernel address space, and - * allocates pagetables to map that range. No actual mappings - * are created. + * This function reserves a range of kernel address space, and + * allocates pagetables to map that range. No actual mappings + * are created. * - * If @ptes is non-NULL, pointers to the PTEs (in init_mm) - * allocated for the VM area are returned. + * If @ptes is non-NULL, pointers to the PTEs (in init_mm) + * allocated for the VM area are returned. */ struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { @@ -2751,4 +2781,3 @@ static int __init proc_vmalloc_init(void) module_init(proc_vmalloc_init); #endif - diff --git a/mm/vmscan.c b/mm/vmscan.c @@ -374,7 +374,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone */ int prealloc_shrinker(struct shrinker *shrinker) { - size_t size = sizeof(*shrinker->nr_deferred); + unsigned int size = sizeof(*shrinker->nr_deferred); if (shrinker->flags & SHRINKER_NUMA_AWARE) size *= nr_node_ids; @@ -952,7 +952,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, */ if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) - shadow = workingset_eviction(mapping, page); + shadow = workingset_eviction(page); __delete_from_page_cache(page, shadow); xa_unlock_irqrestore(&mapping->i_pages, flags); @@ -1106,16 +1106,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); - int pgactivate = 0; - unsigned nr_unqueued_dirty = 0; - unsigned nr_dirty = 0; - unsigned nr_congested = 0; unsigned nr_reclaimed = 0; - unsigned nr_writeback = 0; - unsigned nr_immediate = 0; - unsigned nr_ref_keep = 0; - unsigned nr_unmap_fail = 0; + memset(stat, 0, sizeof(*stat)); cond_resched(); while (!list_empty(page_list)) { @@ -1159,10 +1152,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ page_check_dirty_writeback(page, &dirty, &writeback); if (dirty || writeback) - nr_dirty++; + stat->nr_dirty++; if (dirty && !writeback) - nr_unqueued_dirty++; + stat->nr_unqueued_dirty++; /* * Treat this page as congested if the underlying BDI is or if @@ -1174,7 +1167,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (((dirty || writeback) && mapping && inode_write_congested(mapping->host)) || (writeback && PageReclaim(page))) - nr_congested++; + stat->nr_congested++; /* * If a page at the tail of the LRU is under writeback, there @@ -1223,7 +1216,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (current_is_kswapd() && PageReclaim(page) && test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { - nr_immediate++; + stat->nr_immediate++; goto activate_locked; /* Case 2 above */ @@ -1241,7 +1234,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * and it's also appropriate in global reclaim. */ SetPageReclaim(page); - nr_writeback++; + stat->nr_writeback++; goto activate_locked; /* Case 3 above */ @@ -1261,7 +1254,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, case PAGEREF_ACTIVATE: goto activate_locked; case PAGEREF_KEEP: - nr_ref_keep++; + stat->nr_ref_keep++; goto keep_locked; case PAGEREF_RECLAIM: case PAGEREF_RECLAIM_CLEAN: @@ -1326,7 +1319,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (unlikely(PageTransHuge(page))) flags |= TTU_SPLIT_HUGE_PMD; if (!try_to_unmap(page, flags)) { - nr_unmap_fail++; + stat->nr_unmap_fail++; goto activate_locked; } } @@ -1474,7 +1467,7 @@ activate_locked: VM_BUG_ON_PAGE(PageActive(page), page); if (!PageMlocked(page)) { SetPageActive(page); - pgactivate++; + stat->nr_activate++; count_memcg_page_event(page, PGACTIVATE); } keep_locked: @@ -1489,18 +1482,8 @@ keep: free_unref_page_list(&free_pages); list_splice(&ret_pages, page_list); - count_vm_events(PGACTIVATE, pgactivate); - - if (stat) { - stat->nr_dirty = nr_dirty; - stat->nr_congested = nr_congested; - stat->nr_unqueued_dirty = nr_unqueued_dirty; - stat->nr_writeback = nr_writeback; - stat->nr_immediate = nr_immediate; - stat->nr_activate = pgactivate; - stat->nr_ref_keep = nr_ref_keep; - stat->nr_unmap_fail = nr_unmap_fail; - } + count_vm_events(PGACTIVATE, stat->nr_activate); + return nr_reclaimed; } @@ -1512,6 +1495,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, .priority = DEF_PRIORITY, .may_unmap = 1, }; + struct reclaim_stat dummy_stat; unsigned long ret; struct page *page, *next; LIST_HEAD(clean_pages); @@ -1525,7 +1509,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_IGNORE_ACCESS, NULL, true); + TTU_IGNORE_ACCESS, &dummy_stat, true); list_splice(&clean_pages, page_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); return ret; @@ -1630,8 +1614,8 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, } -/* - * zone_lru_lock is heavily contended. Some of the functions that +/** + * pgdat->lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages * and working on them outside the LRU lock. * @@ -1653,7 +1637,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, static unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct lruvec *lruvec, struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, - isolate_mode_t mode, enum lru_list lru) + enum lru_list lru) { struct list_head *src = &lruvec->lists[lru]; unsigned long nr_taken = 0; @@ -1662,6 +1646,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long skipped = 0; unsigned long scan, total_scan, nr_pages; LIST_HEAD(pages_skipped); + isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED); scan = 0; for (total_scan = 0; @@ -1765,11 +1750,11 @@ int isolate_lru_page(struct page *page) WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); if (PageLRU(page)) { - struct zone *zone = page_zone(page); + pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; - spin_lock_irq(zone_lru_lock(zone)); - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); + spin_lock_irq(&pgdat->lru_lock); + lruvec = mem_cgroup_page_lruvec(page, pgdat); if (PageLRU(page)) { int lru = page_lru(page); get_page(page); @@ -1777,7 +1762,7 @@ int isolate_lru_page(struct page *page) del_page_from_lru_list(page, lruvec, lru); ret = 0; } - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); } return ret; } @@ -1899,8 +1884,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_scanned; unsigned long nr_reclaimed = 0; unsigned long nr_taken; - struct reclaim_stat stat = {}; - isolate_mode_t isolate_mode = 0; + struct reclaim_stat stat; int file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; @@ -1921,13 +1905,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, lru_add_drain(); - if (!sc->may_unmap) - isolate_mode |= ISOLATE_UNMAPPED; - spin_lock_irq(&pgdat->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, - &nr_scanned, sc, isolate_mode, lru); + &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; @@ -2009,9 +1990,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * processes, from rmap. * * If the pages are mostly unmapped, the processing is fast and it is - * appropriate to hold zone_lru_lock across the whole operation. But if + * appropriate to hold pgdat->lru_lock across the whole operation. But if * the pages are mapped, the processing is slow (page_referenced()) so we - * should drop zone_lru_lock around each page. It's impossible to balance + * should drop pgdat->lru_lock around each page. It's impossible to balance * this, so instead we remove the pages from the LRU while processing them. * It is safe to rely on PG_active against the non-LRU pages in here because * nobody will play with that bit on a non-LRU page. @@ -2084,19 +2065,15 @@ static void shrink_active_list(unsigned long nr_to_scan, struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; unsigned nr_deactivate, nr_activate; unsigned nr_rotated = 0; - isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); lru_add_drain(); - if (!sc->may_unmap) - isolate_mode |= ISOLATE_UNMAPPED; - spin_lock_irq(&pgdat->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, - &nr_scanned, sc, isolate_mode, lru); + &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; @@ -2754,16 +2731,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) sc->nr_reclaimed - reclaimed); /* - * Direct reclaim and kswapd have to scan all memory - * cgroups to fulfill the overall scan target for the - * node. + * Kswapd have to scan all memory cgroups to fulfill + * the overall scan target for the node. * * Limit reclaim, on the other hand, only cares about * nr_to_reclaim pages to be reclaimed and it will * retry with decreasing priority if one round over the * whole hierarchy is not sufficient. */ - if (!global_reclaim(sc) && + if (!current_is_kswapd() && sc->nr_reclaimed >= sc->nr_to_reclaim) { mem_cgroup_iter_break(root, memcg); break; @@ -3527,7 +3503,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, * * kswapd scans the zones in the highmem->normal->dma direction. It skips * zones which have free_pages > high_wmark_pages(zone), but once a zone is - * found to have free_pages <= high_wmark_pages(zone), any page is that zone + * found to have free_pages <= high_wmark_pages(zone), any page in that zone * or lower is eligible for reclaim until at least one usable zone is * balanced. */ diff --git a/mm/vmstat.c b/mm/vmstat.c @@ -2121,21 +2121,14 @@ static int __init extfrag_debug_init(void) struct dentry *extfrag_debug_root; extfrag_debug_root = debugfs_create_dir("extfrag", NULL); - if (!extfrag_debug_root) - return -ENOMEM; - if (!debugfs_create_file("unusable_index", 0444, - extfrag_debug_root, NULL, &unusable_file_ops)) - goto fail; + debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL, + &unusable_file_ops); - if (!debugfs_create_file("extfrag_index", 0444, - extfrag_debug_root, NULL, &extfrag_file_ops)) - goto fail; + debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL, + &extfrag_file_ops); return 0; -fail: - debugfs_remove_recursive(extfrag_debug_root); - return -ENOMEM; } module_init(extfrag_debug_init); diff --git a/mm/workingset.c b/mm/workingset.c @@ -215,13 +215,12 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, /** * workingset_eviction - note the eviction of a page from memory - * @mapping: address space the page was backing * @page: the page being evicted * - * Returns a shadow entry to be stored in @mapping->i_pages in place + * Returns a shadow entry to be stored in @page->mapping->i_pages in place * of the evicted @page so that a later refault can be detected. */ -void *workingset_eviction(struct address_space *mapping, struct page *page) +void *workingset_eviction(struct page *page) { struct pglist_data *pgdat = page_pgdat(page); struct mem_cgroup *memcg = page_memcg(page); diff --git a/net/core/pktgen.c b/net/core/pktgen.c @@ -158,6 +158,7 @@ #include <linux/etherdevice.h> #include <linux/kthread.h> #include <linux/prefetch.h> +#include <linux/mmzone.h> #include <net/net_namespace.h> #include <net/checksum.h> #include <net/ipv6.h> @@ -3625,7 +3626,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->svlan_cfi = 0; pkt_dev->svlan_id = 0xffff; pkt_dev->burst = 1; - pkt_dev->node = -1; + pkt_dev->node = NUMA_NO_NODE; err = pktgen_setup_dev(t->net, pkt_dev, ifname); if (err) diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c @@ -15,6 +15,7 @@ #include <linux/netlink.h> #include <linux/qrtr.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ +#include <linux/numa.h> #include <net/sock.h> @@ -101,7 +102,7 @@ static inline struct qrtr_sock *qrtr_sk(struct sock *sk) return container_of(sk, struct qrtr_sock, sk); } -static unsigned int qrtr_local_nid = -1; +static unsigned int qrtr_local_nid = NUMA_NO_NODE; /* for node ids */ static RADIX_TREE(qrtr_nodes, GFP_KERNEL); diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan @@ -27,14 +27,9 @@ else $(call cc-param,asan-globals=1) \ $(call cc-param,asan-instrumentation-with-call-threshold=$(call_threshold)) \ $(call cc-param,asan-stack=$(CONFIG_KASAN_STACK)) \ - $(call cc-param,asan-use-after-scope=1) \ $(call cc-param,asan-instrument-allocas=1) endif -ifdef CONFIG_KASAN_EXTRA -CFLAGS_KASAN += $(call cc-option, -fsanitize-address-use-after-scope) -endif - endif # CONFIG_KASAN_GENERIC ifdef CONFIG_KASAN_SW_TAGS diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh @@ -37,6 +37,13 @@ parse_symbol() { symbol=${symbol#\(} symbol=${symbol%\)} + # Strip segment + local segment + if [[ $symbol == *:* ]] ; then + segment=${symbol%%:*}: + symbol=${symbol#*:} + fi + # Strip the symbol name so that we could look it up local name=${symbol%+*} @@ -84,7 +91,7 @@ parse_symbol() { code=${code//$'\n'/' '} # Replace old address with pretty line numbers - symbol="$name ($code)" + symbol="$segment$name ($code)" } decode_code() { diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig @@ -68,10 +68,6 @@ config GCC_PLUGIN_LATENT_ENTROPY config GCC_PLUGIN_STRUCTLEAK bool "Force initialization of variables containing userspace addresses" - # Currently STRUCTLEAK inserts initialization out of live scope of - # variables from KASAN point of view. This leads to KASAN false - # positive reports. Prohibit this combination for now. - depends on !KASAN_EXTRA help This plugin zero-initializes any structures containing a __user attribute. This can prevent some classes of information diff --git a/tools/include/linux/numa.h b/tools/include/linux/numa.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NUMA_H +#define _LINUX_NUMA_H + + +#ifdef CONFIG_NODES_SHIFT +#define NODES_SHIFT CONFIG_NODES_SHIFT +#else +#define NODES_SHIFT 0 +#endif + +#define MAX_NUMNODES (1 << NODES_SHIFT) + +#define NUMA_NO_NODE (-1) + +#endif /* _LINUX_NUMA_H */ diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c @@ -34,6 +34,7 @@ #include <sys/types.h> #include <linux/kernel.h> #include <linux/time64.h> +#include <linux/numa.h> #include <numa.h> #include <numaif.h> @@ -298,7 +299,7 @@ static cpu_set_t bind_to_node(int target_node) CPU_ZERO(&mask); - if (target_node == -1) { + if (target_node == NUMA_NO_NODE) { for (cpu = 0; cpu < g->p.nr_cpus; cpu++) CPU_SET(cpu, &mask); } else { @@ -339,7 +340,7 @@ static void bind_to_memnode(int node) unsigned long nodemask; int ret; - if (node == -1) + if (node == NUMA_NO_NODE) return; BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)*8); @@ -1363,7 +1364,7 @@ static void init_thread_data(void) int cpu; /* Allow all nodes by default: */ - td->bind_node = -1; + td->bind_node = NUMA_NO_NODE; /* Allow all CPUs by default: */ CPU_ZERO(&td->bind_cpumask); diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile @@ -48,6 +48,7 @@ TARGETS += sysctl ifneq (1, $(quicktest)) TARGETS += timers endif +TARGETS += tmpfs TARGETS += user TARGETS += vm TARGETS += x86 diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c @@ -54,6 +54,22 @@ static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) return fd; } +static int mfd_assert_reopen_fd(int fd_in) +{ + int r, fd; + char path[100]; + + sprintf(path, "/proc/self/fd/%d", fd_in); + + fd = open(path, O_RDWR); + if (fd < 0) { + printf("re-open of existing fd %d failed\n", fd_in); + abort(); + } + + return fd; +} + static void mfd_fail_new(const char *name, unsigned int flags) { int r; @@ -255,6 +271,25 @@ static void mfd_assert_read(int fd) munmap(p, mfd_def_size); } +/* Test that PROT_READ + MAP_SHARED mappings work. */ +static void mfd_assert_read_shared(int fd) +{ + void *p; + + /* verify PROT_READ and MAP_SHARED *is* allowed */ + p = mmap(NULL, + mfd_def_size, + PROT_READ, + MAP_SHARED, + fd, + 0); + if (p == MAP_FAILED) { + printf("mmap() failed: %m\n"); + abort(); + } + munmap(p, mfd_def_size); +} + static void mfd_assert_write(int fd) { ssize_t l; @@ -693,6 +728,44 @@ static void test_seal_write(void) } /* + * Test SEAL_FUTURE_WRITE + * Test whether SEAL_FUTURE_WRITE actually prevents modifications. + */ +static void test_seal_future_write(void) +{ + int fd, fd2; + void *p; + + printf("%s SEAL-FUTURE-WRITE\n", memfd_str); + + fd = mfd_assert_new("kern_memfd_seal_future_write", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + p = mfd_assert_mmap_shared(fd); + + mfd_assert_has_seals(fd, 0); + + mfd_assert_add_seals(fd, F_SEAL_FUTURE_WRITE); + mfd_assert_has_seals(fd, F_SEAL_FUTURE_WRITE); + + /* read should pass, writes should fail */ + mfd_assert_read(fd); + mfd_assert_read_shared(fd); + mfd_fail_write(fd); + + fd2 = mfd_assert_reopen_fd(fd); + /* read should pass, writes should still fail */ + mfd_assert_read(fd2); + mfd_assert_read_shared(fd2); + mfd_fail_write(fd2); + + munmap(p, mfd_def_size); + close(fd2); + close(fd); +} + +/* * Test SEAL_SHRINK * Test whether SEAL_SHRINK actually prevents shrinking */ @@ -945,6 +1018,7 @@ int main(int argc, char **argv) test_basic(); test_seal_write(); + test_seal_future_write(); test_seal_shrink(); test_seal_grow(); test_seal_resize(); diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore @@ -2,6 +2,7 @@ /fd-002-posix-eq /fd-003-kthread /proc-loadavg-001 +/proc-pid-vm /proc-self-map-files-001 /proc-self-map-files-002 /proc-self-syscall diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile @@ -6,6 +6,7 @@ TEST_GEN_PROGS += fd-001-lookup TEST_GEN_PROGS += fd-002-posix-eq TEST_GEN_PROGS += fd-003-kthread TEST_GEN_PROGS += proc-loadavg-001 +TEST_GEN_PROGS += proc-pid-vm TEST_GEN_PROGS += proc-self-map-files-001 TEST_GEN_PROGS += proc-self-map-files-002 TEST_GEN_PROGS += proc-self-syscall diff --git a/tools/testing/selftests/proc/proc-loadavg-001.c b/tools/testing/selftests/proc/proc-loadavg-001.c @@ -30,7 +30,7 @@ int main(void) if (unshare(CLONE_NEWPID) == -1) { if (errno == ENOSYS || errno == EPERM) - return 2; + return 4; return 1; } diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c @@ -0,0 +1,406 @@ +/* + * Copyright (c) 2019 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* + * Fork and exec tiny 1 page executable which precisely controls its VM. + * Test /proc/$PID/maps + * Test /proc/$PID/smaps + * Test /proc/$PID/smaps_rollup + * Test /proc/$PID/statm + * + * FIXME require CONFIG_TMPFS which can be disabled + * FIXME test other values from "smaps" + * FIXME support other archs + */ +#undef NDEBUG +#include <assert.h> +#include <errno.h> +#include <sched.h> +#include <signal.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <sys/mount.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/syscall.h> +#include <sys/uio.h> +#include <linux/kdev_t.h> + +static inline long sys_execveat(int dirfd, const char *pathname, char **argv, char **envp, int flags) +{ + return syscall(SYS_execveat, dirfd, pathname, argv, envp, flags); +} + +static void make_private_tmp(void) +{ + if (unshare(CLONE_NEWNS) == -1) { + if (errno == ENOSYS || errno == EPERM) { + exit(4); + } + exit(1); + } + if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) { + exit(1); + } + if (mount(NULL, "/tmp", "tmpfs", 0, NULL) == -1) { + exit(1); + } +} + +static pid_t pid = -1; +static void ate(void) +{ + if (pid > 0) { + kill(pid, SIGTERM); + } +} + +struct elf64_hdr { + uint8_t e_ident[16]; + uint16_t e_type; + uint16_t e_machine; + uint32_t e_version; + uint64_t e_entry; + uint64_t e_phoff; + uint64_t e_shoff; + uint32_t e_flags; + uint16_t e_ehsize; + uint16_t e_phentsize; + uint16_t e_phnum; + uint16_t e_shentsize; + uint16_t e_shnum; + uint16_t e_shstrndx; +}; + +struct elf64_phdr { + uint32_t p_type; + uint32_t p_flags; + uint64_t p_offset; + uint64_t p_vaddr; + uint64_t p_paddr; + uint64_t p_filesz; + uint64_t p_memsz; + uint64_t p_align; +}; + +#ifdef __x86_64__ +#define PAGE_SIZE 4096 +#define VADDR (1UL << 32) +#define MAPS_OFFSET 73 + +#define syscall 0x0f, 0x05 +#define mov_rdi(x) \ + 0x48, 0xbf, \ + (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff, \ + ((x)>>32)&0xff, ((x)>>40)&0xff, ((x)>>48)&0xff, ((x)>>56)&0xff + +#define mov_rsi(x) \ + 0x48, 0xbe, \ + (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff, \ + ((x)>>32)&0xff, ((x)>>40)&0xff, ((x)>>48)&0xff, ((x)>>56)&0xff + +#define mov_eax(x) \ + 0xb8, (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff + +static const uint8_t payload[] = { + /* Casually unmap stack, vDSO and everything else. */ + /* munmap */ + mov_rdi(VADDR + 4096), + mov_rsi((1ULL << 47) - 4096 - VADDR - 4096), + mov_eax(11), + syscall, + + /* Ping parent. */ + /* write(0, &c, 1); */ + 0x31, 0xff, /* xor edi, edi */ + 0x48, 0x8d, 0x35, 0x00, 0x00, 0x00, 0x00, /* lea rsi, [rip] */ + 0xba, 0x01, 0x00, 0x00, 0x00, /* mov edx, 1 */ + mov_eax(1), + syscall, + + /* 1: pause(); */ + mov_eax(34), + syscall, + + 0xeb, 0xf7, /* jmp 1b */ +}; + +static int make_exe(const uint8_t *payload, size_t len) +{ + struct elf64_hdr h; + struct elf64_phdr ph; + + struct iovec iov[3] = { + {&h, sizeof(struct elf64_hdr)}, + {&ph, sizeof(struct elf64_phdr)}, + {(void *)payload, len}, + }; + int fd, fd1; + char buf[64]; + + memset(&h, 0, sizeof(h)); + h.e_ident[0] = 0x7f; + h.e_ident[1] = 'E'; + h.e_ident[2] = 'L'; + h.e_ident[3] = 'F'; + h.e_ident[4] = 2; + h.e_ident[5] = 1; + h.e_ident[6] = 1; + h.e_ident[7] = 0; + h.e_type = 2; + h.e_machine = 0x3e; + h.e_version = 1; + h.e_entry = VADDR + sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr); + h.e_phoff = sizeof(struct elf64_hdr); + h.e_shoff = 0; + h.e_flags = 0; + h.e_ehsize = sizeof(struct elf64_hdr); + h.e_phentsize = sizeof(struct elf64_phdr); + h.e_phnum = 1; + h.e_shentsize = 0; + h.e_shnum = 0; + h.e_shstrndx = 0; + + memset(&ph, 0, sizeof(ph)); + ph.p_type = 1; + ph.p_flags = (1<<2)|1; + ph.p_offset = 0; + ph.p_vaddr = VADDR; + ph.p_paddr = 0; + ph.p_filesz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + sizeof(payload); + ph.p_memsz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + sizeof(payload); + ph.p_align = 4096; + + fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_EXCL|O_TMPFILE, 0700); + if (fd == -1) { + exit(1); + } + + if (writev(fd, iov, 3) != sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + len) { + exit(1); + } + + /* Avoid ETXTBSY on exec. */ + snprintf(buf, sizeof(buf), "/proc/self/fd/%u", fd); + fd1 = open(buf, O_RDONLY|O_CLOEXEC); + close(fd); + + return fd1; +} +#endif + +#ifdef __x86_64__ +int main(void) +{ + int pipefd[2]; + int exec_fd; + + atexit(ate); + + make_private_tmp(); + + /* Reserve fd 0 for 1-byte pipe ping from child. */ + close(0); + if (open("/", O_RDONLY|O_DIRECTORY|O_PATH) != 0) { + return 1; + } + + exec_fd = make_exe(payload, sizeof(payload)); + + if (pipe(pipefd) == -1) { + return 1; + } + if (dup2(pipefd[1], 0) != 0) { + return 1; + } + + pid = fork(); + if (pid == -1) { + return 1; + } + if (pid == 0) { + sys_execveat(exec_fd, "", NULL, NULL, AT_EMPTY_PATH); + return 1; + } + + char _; + if (read(pipefd[0], &_, 1) != 1) { + return 1; + } + + struct stat st; + if (fstat(exec_fd, &st) == -1) { + return 1; + } + + /* Generate "head -n1 /proc/$PID/maps" */ + char buf0[256]; + memset(buf0, ' ', sizeof(buf0)); + int len = snprintf(buf0, sizeof(buf0), + "%08lx-%08lx r-xp 00000000 %02lx:%02lx %llu", + VADDR, VADDR + PAGE_SIZE, + MAJOR(st.st_dev), MINOR(st.st_dev), + (unsigned long long)st.st_ino); + buf0[len] = ' '; + snprintf(buf0 + MAPS_OFFSET, sizeof(buf0) - MAPS_OFFSET, + "/tmp/#%llu (deleted)\n", (unsigned long long)st.st_ino); + + + /* Test /proc/$PID/maps */ + { + char buf[256]; + ssize_t rv; + int fd; + + snprintf(buf, sizeof(buf), "/proc/%u/maps", pid); + fd = open(buf, O_RDONLY); + if (fd == -1) { + return 1; + } + rv = read(fd, buf, sizeof(buf)); + assert(rv == strlen(buf0)); + assert(memcmp(buf, buf0, strlen(buf0)) == 0); + } + + /* Test /proc/$PID/smaps */ + { + char buf[1024]; + ssize_t rv; + int fd; + + snprintf(buf, sizeof(buf), "/proc/%u/smaps", pid); + fd = open(buf, O_RDONLY); + if (fd == -1) { + return 1; + } + rv = read(fd, buf, sizeof(buf)); + assert(0 <= rv && rv <= sizeof(buf)); + + assert(rv >= strlen(buf0)); + assert(memcmp(buf, buf0, strlen(buf0)) == 0); + +#define RSS1 "Rss: 4 kB\n" +#define RSS2 "Rss: 0 kB\n" +#define PSS1 "Pss: 4 kB\n" +#define PSS2 "Pss: 0 kB\n" + assert(memmem(buf, rv, RSS1, strlen(RSS1)) || + memmem(buf, rv, RSS2, strlen(RSS2))); + assert(memmem(buf, rv, PSS1, strlen(PSS1)) || + memmem(buf, rv, PSS2, strlen(PSS2))); + + static const char *S[] = { + "Size: 4 kB\n", + "KernelPageSize: 4 kB\n", + "MMUPageSize: 4 kB\n", + "Anonymous: 0 kB\n", + "AnonHugePages: 0 kB\n", + "Shared_Hugetlb: 0 kB\n", + "Private_Hugetlb: 0 kB\n", + "Locked: 0 kB\n", + }; + int i; + + for (i = 0; i < sizeof(S)/sizeof(S[0]); i++) { + assert(memmem(buf, rv, S[i], strlen(S[i]))); + } + } + + /* Test /proc/$PID/smaps_rollup */ + { + char bufr[256]; + memset(bufr, ' ', sizeof(bufr)); + len = snprintf(bufr, sizeof(bufr), + "%08lx-%08lx ---p 00000000 00:00 0", + VADDR, VADDR + PAGE_SIZE); + bufr[len] = ' '; + snprintf(bufr + MAPS_OFFSET, sizeof(bufr) - MAPS_OFFSET, + "[rollup]\n"); + + char buf[1024]; + ssize_t rv; + int fd; + + snprintf(buf, sizeof(buf), "/proc/%u/smaps_rollup", pid); + fd = open(buf, O_RDONLY); + if (fd == -1) { + return 1; + } + rv = read(fd, buf, sizeof(buf)); + assert(0 <= rv && rv <= sizeof(buf)); + + assert(rv >= strlen(bufr)); + assert(memcmp(buf, bufr, strlen(bufr)) == 0); + + assert(memmem(buf, rv, RSS1, strlen(RSS1)) || + memmem(buf, rv, RSS2, strlen(RSS2))); + assert(memmem(buf, rv, PSS1, strlen(PSS1)) || + memmem(buf, rv, PSS2, strlen(PSS2))); + + static const char *S[] = { + "Anonymous: 0 kB\n", + "AnonHugePages: 0 kB\n", + "Shared_Hugetlb: 0 kB\n", + "Private_Hugetlb: 0 kB\n", + "Locked: 0 kB\n", + }; + int i; + + for (i = 0; i < sizeof(S)/sizeof(S[0]); i++) { + assert(memmem(buf, rv, S[i], strlen(S[i]))); + } + } + + /* Test /proc/$PID/statm */ + { + char buf[64]; + ssize_t rv; + int fd; + + snprintf(buf, sizeof(buf), "/proc/%u/statm", pid); + fd = open(buf, O_RDONLY); + if (fd == -1) { + return 1; + } + rv = read(fd, buf, sizeof(buf)); + assert(rv == 7 * 2); + + assert(buf[0] == '1'); /* ->total_vm */ + assert(buf[1] == ' '); + assert(buf[2] == '0' || buf[2] == '1'); /* rss */ + assert(buf[3] == ' '); + assert(buf[4] == '0' || buf[2] == '1'); /* file rss */ + assert(buf[5] == ' '); + assert(buf[6] == '1'); /* ELF executable segments */ + assert(buf[7] == ' '); + assert(buf[8] == '0'); + assert(buf[9] == ' '); + assert(buf[10] == '0'); /* ->data_vm + ->stack_vm */ + assert(buf[11] == ' '); + assert(buf[12] == '0'); + assert(buf[13] == '\n'); + } + + return 0; +} +#else +int main(void) +{ + return 4; +} +#endif diff --git a/tools/testing/selftests/proc/proc-self-map-files-002.c b/tools/testing/selftests/proc/proc-self-map-files-002.c @@ -63,7 +63,7 @@ int main(void) p = mmap((void *)va, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE|MAP_FIXED, fd, 0); if (p == MAP_FAILED) { if (errno == EPERM) - return 2; + return 4; return 1; } diff --git a/tools/testing/selftests/proc/proc-self-syscall.c b/tools/testing/selftests/proc/proc-self-syscall.c @@ -20,7 +20,6 @@ #include <sys/stat.h> #include <fcntl.h> #include <errno.h> -#include <unistd.h> #include <string.h> #include <stdio.h> @@ -39,7 +38,7 @@ int main(void) fd = open("/proc/self/syscall", O_RDONLY); if (fd == -1) { if (errno == ENOENT) - return 2; + return 4; return 1; } diff --git a/tools/testing/selftests/proc/proc-self-wchan.c b/tools/testing/selftests/proc/proc-self-wchan.c @@ -27,7 +27,7 @@ int main(void) fd = open("/proc/self/wchan", O_RDONLY); if (fd == -1) { if (errno == ENOENT) - return 2; + return 4; return 1; } diff --git a/tools/testing/selftests/proc/read.c b/tools/testing/selftests/proc/read.c @@ -26,8 +26,10 @@ #include <dirent.h> #include <stdbool.h> #include <stdlib.h> +#include <stdio.h> #include <string.h> #include <sys/stat.h> +#include <sys/vfs.h> #include <fcntl.h> #include <unistd.h> @@ -123,10 +125,22 @@ static void f(DIR *d, unsigned int level) int main(void) { DIR *d; + struct statfs sfs; d = opendir("/proc"); if (!d) + return 4; + + /* Ensure /proc is proc. */ + if (fstatfs(dirfd(d), &sfs) == -1) { + return 1; + } + if (sfs.f_type != 0x9fa0) { + fprintf(stderr, "error: unexpected f_type %lx\n", (long)sfs.f_type); return 2; + } + f(d, 0); + return 0; } diff --git a/tools/testing/selftests/tmpfs/.gitignore b/tools/testing/selftests/tmpfs/.gitignore @@ -0,0 +1 @@ +/bug-link-o-tmpfile diff --git a/tools/testing/selftests/tmpfs/Makefile b/tools/testing/selftests/tmpfs/Makefile @@ -0,0 +1,7 @@ +CFLAGS += -Wall -O2 +CFLAGS += -D_GNU_SOURCE + +TEST_GEN_PROGS := +TEST_GEN_PROGS += bug-link-o-tmpfile + +include ../lib.mk diff --git a/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c b/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2019 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* Test that open(O_TMPFILE), linkat() doesn't screw accounting. */ +#include <errno.h> +#include <sched.h> +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/mount.h> +#include <unistd.h> + +int main(void) +{ + int fd; + + if (unshare(CLONE_NEWNS) == -1) { + if (errno == ENOSYS || errno == EPERM) { + fprintf(stderr, "error: unshare, errno %d\n", errno); + return 4; + } + fprintf(stderr, "error: unshare, errno %d\n", errno); + return 1; + } + if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) { + fprintf(stderr, "error: mount '/', errno %d\n", errno); + return 1; + } + + /* Our heroes: 1 root inode, 1 O_TMPFILE inode, 1 permanent inode. */ + if (mount(NULL, "/tmp", "tmpfs", 0, "nr_inodes=3") == -1) { + fprintf(stderr, "error: mount tmpfs, errno %d\n", errno); + return 1; + } + + fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_TMPFILE, 0600); + if (fd == -1) { + fprintf(stderr, "error: open 1, errno %d\n", errno); + return 1; + } + if (linkat(fd, "", AT_FDCWD, "/tmp/1", AT_EMPTY_PATH) == -1) { + fprintf(stderr, "error: linkat, errno %d\n", errno); + return 1; + } + close(fd); + + fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_TMPFILE, 0600); + if (fd == -1) { + fprintf(stderr, "error: open 2, errno %d\n", errno); + return 1; + } + + return 0; +} diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests @@ -211,4 +211,20 @@ else echo "[PASS]" fi +echo "------------------------------------" +echo "running vmalloc stability smoke test" +echo "------------------------------------" +./test_vmalloc.sh smoke +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + exit $exitcode diff --git a/tools/testing/selftests/vm/test_vmalloc.sh b/tools/testing/selftests/vm/test_vmalloc.sh @@ -0,0 +1,176 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com> +# +# This is a test script for the kernel test driver to analyse vmalloc +# allocator. Therefore it is just a kernel module loader. You can specify +# and pass different parameters in order to: +# a) analyse performance of vmalloc allocations; +# b) stressing and stability check of vmalloc subsystem. + +TEST_NAME="vmalloc" +DRIVER="test_${TEST_NAME}" + +# 1 if fails +exitcode=1 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +# +# Static templates for performance, stressing and smoke tests. +# Also it is possible to pass any supported parameters manualy. +# +PERF_PARAM="single_cpu_test=1 sequential_test_order=1 test_repeat_count=3" +SMOKE_PARAM="single_cpu_test=1 test_loop_count=10000 test_repeat_count=10" +STRESS_PARAM="test_repeat_count=20" + +check_test_requirements() +{ + uid=$(id -u) + if [ $uid -ne 0 ]; then + echo "$0: Must be run as root" + exit $ksft_skip + fi + + if ! which modprobe > /dev/null 2>&1; then + echo "$0: You need modprobe installed" + exit $ksft_skip + fi + + if ! modinfo $DRIVER > /dev/null 2>&1; then + echo "$0: You must have the following enabled in your kernel:" + echo "CONFIG_TEST_VMALLOC=m" + exit $ksft_skip + fi +} + +run_perfformance_check() +{ + echo "Run performance tests to evaluate how fast vmalloc allocation is." + echo "It runs all test cases on one single CPU with sequential order." + + modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1 + echo "Done." + echo "Ccheck the kernel message buffer to see the summary." +} + +run_stability_check() +{ + echo "Run stability tests. In order to stress vmalloc subsystem we run" + echo "all available test cases on all available CPUs simultaneously." + echo "It will take time, so be patient." + + modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1 + echo "Done." + echo "Check the kernel ring buffer to see the summary." +} + +run_smoke_check() +{ + echo "Run smoke test. Note, this test provides basic coverage." + echo "Please check $0 output how it can be used" + echo "for deep performance analysis as well as stress testing." + + modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1 + echo "Done." + echo "Check the kernel ring buffer to see the summary." +} + +usage() +{ + echo -n "Usage: $0 [ performance ] | [ stress ] | | [ smoke ] | " + echo "manual parameters" + echo + echo "Valid tests and parameters:" + echo + modinfo $DRIVER + echo + echo "Example usage:" + echo + echo "# Shows help message" + echo "./${DRIVER}.sh" + echo + echo "# Runs 1 test(id_1), repeats it 5 times on all online CPUs" + echo "./${DRIVER}.sh run_test_mask=1 test_repeat_count=5" + echo + echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with " + echo "sequential order" + echo -n "./${DRIVER}.sh single_cpu_test=1 sequential_test_order=1 " + echo "run_test_mask=23" + echo + echo -n "# Runs all tests on all online CPUs, shuffled order, repeats " + echo "20 times" + echo "./${DRIVER}.sh test_repeat_count=20" + echo + echo "# Performance analysis" + echo "./${DRIVER}.sh performance" + echo + echo "# Stress testing" + echo "./${DRIVER}.sh stress" + echo + exit 0 +} + +function validate_passed_args() +{ + VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'` + + # + # Something has been passed, check it. + # + for passed_arg in $@; do + key=${passed_arg//=*/} + val="${passed_arg:$((${#key}+1))}" + valid=0 + + for valid_arg in $VALID_ARGS; do + if [[ $key = $valid_arg ]] && [[ $val -gt 0 ]]; then + valid=1 + break + fi + done + + if [[ $valid -ne 1 ]]; then + echo "Error: key or value is not correct: ${key} $val" + exit $exitcode + fi + done +} + +function run_manual_check() +{ + # + # Validate passed parameters. If there is wrong one, + # the script exists and does not execute further. + # + validate_passed_args $@ + + echo "Run the test with following parameters: $@" + modprobe $DRIVER $@ > /dev/null 2>&1 + echo "Done." + echo "Check the kernel ring buffer to see the summary." +} + +function run_test() +{ + if [ $# -eq 0 ]; then + usage + else + if [[ "$1" = "performance" ]]; then + run_perfformance_check + elif [[ "$1" = "stress" ]]; then + run_stability_check + elif [[ "$1" = "smoke" ]]; then + run_smoke_check + else + run_manual_check $@ + fi + fi +} + +check_test_requirements +run_test $@ + +exit 0 diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c @@ -133,7 +133,7 @@ static const char * const page_flag_names[] = { [KPF_NOPAGE] = "n:nopage", [KPF_KSM] = "x:ksm", [KPF_THP] = "t:thp", - [KPF_BALLOON] = "o:balloon", + [KPF_OFFLINE] = "o:offline", [KPF_PGTABLE] = "g:pgtable", [KPF_ZERO_PAGE] = "z:zero_page", [KPF_IDLE] = "i:idle_page", diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c @@ -110,39 +110,42 @@ static void fatal(const char *x, ...) static void usage(void) { printf("slabinfo 4/15/2011. (c) 2007 sgi/(c) 2011 Linux Foundation.\n\n" - "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n" + "slabinfo [-aADefhilnosrStTvz1LXBU] [N=K] [-dafzput] [slab-regexp]\n" "-a|--aliases Show aliases\n" "-A|--activity Most active slabs first\n" - "-d<options>|--debug=<options> Set/Clear Debug options\n" + "-B|--Bytes Show size in bytes\n" "-D|--display-active Switch line format to activity\n" "-e|--empty Show empty slabs\n" "-f|--first-alias Show first alias\n" "-h|--help Show usage information\n" "-i|--inverted Inverted list\n" "-l|--slabs Show slabs\n" + "-L|--Loss Sort by loss\n" "-n|--numa Show NUMA information\n" - "-o|--ops Show kmem_cache_ops\n" + "-N|--lines=K Show the first K slabs\n" + "-o|--ops Show kmem_cache_ops\n" + "-r|--report Detailed report on single slabs\n" "-s|--shrink Shrink slabs\n" - "-r|--report Detailed report on single slabs\n" "-S|--Size Sort by size\n" "-t|--tracking Show alloc/free information\n" "-T|--Totals Show summary information\n" + "-U|--Unreclaim Show unreclaimable slabs only\n" "-v|--validate Validate slabs\n" "-z|--zero Include empty slabs\n" "-1|--1ref Single reference\n" - "-N|--lines=K Show the first K slabs\n" - "-L|--Loss Sort by loss\n" "-X|--Xtotals Show extended summary information\n" - "-B|--Bytes Show size in bytes\n" - "-U|--Unreclaim Show unreclaimable slabs only\n" - "\nValid debug options (FZPUT may be combined)\n" - "a / A Switch on all debug options (=FZUP)\n" - "- Switch off all debug options\n" - "f / F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n" - "z / Z Redzoning\n" - "p / P Poisoning\n" - "u / U Tracking\n" - "t / T Tracing\n" + + "\n" + "-d | --debug Switch off all debug options\n" + "-da | --debug=a Switch on all debug options (--debug=FZPU)\n" + + "\n" + "-d[afzput] | --debug=[afzput]\n" + " f | F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n" + " z | Z Redzoning\n" + " p | P Poisoning\n" + " u | U Tracking\n" + " t | T Tracing\n" ); }