whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 345671ea0f9258f410eb057b9ced9cefbbe5dc78
parent 4904008165c8a1c48602b8316139691b8c735e6e
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Fri, 26 Oct 2018 19:33:41 -0700

Merge branch 'akpm' (patches from Andrew)

Merge updates from Andrew Morton:

 - a few misc things

 - ocfs2 updates

 - most of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (132 commits)
  hugetlbfs: dirty pages as they are added to pagecache
  mm: export add_swap_extent()
  mm: split SWP_FILE into SWP_ACTIVATED and SWP_FS
  tools/testing/selftests/vm/map_fixed_noreplace.c: add test for MAP_FIXED_NOREPLACE
  mm: thp: relocate flush_cache_range() in migrate_misplaced_transhuge_page()
  mm: thp: fix mmu_notifier in migrate_misplaced_transhuge_page()
  mm: thp: fix MADV_DONTNEED vs migrate_misplaced_transhuge_page race condition
  mm/kasan/quarantine.c: make quarantine_lock a raw_spinlock_t
  mm/gup: cache dev_pagemap while pinning pages
  Revert "x86/e820: put !E820_TYPE_RAM regions into memblock.reserved"
  mm: return zero_resv_unavail optimization
  mm: zero remaining unavailable struct pages
  tools/testing/selftests/vm/gup_benchmark.c: add MAP_HUGETLB option
  tools/testing/selftests/vm/gup_benchmark.c: add MAP_SHARED option
  tools/testing/selftests/vm/gup_benchmark.c: allow user specified file
  tools/testing/selftests/vm/gup_benchmark.c: fix 'write' flag usage
  mm/gup_benchmark.c: add additional pinning methods
  mm/gup_benchmark.c: time put_page()
  mm: don't raise MEMCG_OOM event due to failed high-order allocation
  mm/page-writeback.c: fix range_cyclic writeback vs writepages deadlock
  ...

Diffstat:
ADocumentation/accounting/psi.txt | 73+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
MDocumentation/admin-guide/cgroup-v2.rst | 22++++++++++++++++++++++
MDocumentation/admin-guide/kernel-parameters.txt | 12++++++++++++
MDocumentation/filesystems/proc.txt | 4++++
MDocumentation/vm/slub.rst | 12+++++++++---
MDocumentation/x86/pat.txt | 4++--
March/alpha/Kconfig | 2++
March/alpha/kernel/core_irongate.c | 4++--
March/alpha/kernel/setup.c | 98++++++++++---------------------------------------------------------------------
March/alpha/mm/numa.c | 113+++++++++----------------------------------------------------------------------
March/arm/include/asm/hugetlb-3level.h | 32+-------------------------------
March/arm/include/asm/hugetlb.h | 33+--------------------------------
March/arm64/include/asm/hugetlb.h | 39+++++++--------------------------------
March/arm64/include/asm/string.h | 14++++++++------
March/arm64/kernel/arm64ksyms.c | 7+++++--
March/arm64/lib/memchr.S | 2+-
March/arm64/lib/memcmp.S | 2+-
March/arm64/lib/strchr.S | 2+-
March/arm64/lib/strcmp.S | 2+-
March/arm64/lib/strlen.S | 2+-
March/arm64/lib/strncmp.S | 2+-
March/arm64/lib/strnlen.S | 2+-
March/arm64/lib/strrchr.S | 2+-
March/hexagon/Kconfig | 3+++
March/hexagon/mm/init.c | 20++++++++------------
March/ia64/include/asm/hugetlb.h | 47+++++------------------------------------------
March/ia64/include/asm/pgtable.h | 1-
March/mips/include/asm/hugetlb.h | 40+++++++---------------------------------
March/nios2/Kconfig | 3+++
March/nios2/kernel/prom.c | 17-----------------
March/nios2/kernel/setup.c | 39+++++++--------------------------------
March/parisc/include/asm/hugetlb.h | 33++++++++-------------------------
March/powerpc/include/asm/book3s/32/pgtable.h | 6------
March/powerpc/include/asm/book3s/64/pgtable.h | 1+
March/powerpc/include/asm/hugetlb.h | 43++++++-------------------------------------
March/powerpc/include/asm/nohash/32/pgtable.h | 6------
March/powerpc/include/asm/nohash/64/pgtable.h | 1+
March/powerpc/platforms/cell/cpufreq_spudemand.c | 2+-
March/powerpc/platforms/cell/spufs/sched.c | 9+++------
March/s390/appldata/appldata_os.c | 4----
March/sh/include/asm/hugetlb.h | 54++++--------------------------------------------------
March/sparc/include/asm/hugetlb.h | 40++++++++--------------------------------
March/um/Kconfig | 2++
March/um/kernel/physmem.c | 22++++++++++------------
March/unicore32/Kconfig | 1+
March/unicore32/mm/init.c | 54+-----------------------------------------------------
March/x86/entry/vdso/vma.c | 24+++++++++---------------
March/x86/include/asm/hugetlb.h | 69---------------------------------------------------------------------
March/x86/kernel/e820.c | 15+++------------
March/xtensa/include/asm/Kbuild | 1+
Darch/xtensa/include/asm/vga.h | 19-------------------
Mblock/blk-iolatency.c | 8+++++---
Mdrivers/base/node.c | 19++++++++++++-------
Mdrivers/cpuidle/governors/menu.c | 4----
Mdrivers/infiniband/hw/hfi1/mmu_rb.c | 1-
Mdrivers/iommu/amd_iommu_v2.c | 1-
Mdrivers/iommu/intel-svm.c | 1-
Mdrivers/misc/sgi-gru/grutlbpurge.c | 1-
Mdrivers/of/fdt.c | 11++++++-----
Mdrivers/staging/android/ion/ion_page_pool.c | 8++++----
Mfs/cramfs/inode.c | 5++++-
Mfs/dcache.c | 38+++++++++-----------------------------
Mfs/iomap.c | 2+-
Mfs/kernfs/mount.c | 3+++
Mfs/ocfs2/alloc.c | 4----
Mfs/ocfs2/aops.c | 3+--
Mfs/ocfs2/dlm/dlmdebug.c | 2+-
Mfs/ocfs2/dlm/dlmthread.c | 2+-
Mfs/ocfs2/refcounttree.c | 2--
Mfs/proc/inode.c | 3+++
Mfs/proc/loadavg.c | 3---
Mfs/proc/meminfo.c | 16++++++++--------
Mfs/proc/task_mmu.c | 4+++-
Mfs/userfaultfd.c | 8++++----
Minclude/asm-generic/hugetlb.h | 88++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Minclude/asm-generic/pgtable.h | 4++--
Minclude/linux/cgroup-defs.h | 4++++
Minclude/linux/cgroup.h | 15+++++++++++++++
Minclude/linux/delayacct.h | 23+++++++++++++++++++++++
Minclude/linux/hmm.h | 2+-
Minclude/linux/huge_mm.h | 8++++----
Minclude/linux/iomap.h | 4+++-
Minclude/linux/linkage.h | 1+
Minclude/linux/math64.h | 3+++
Minclude/linux/memblock.h | 15---------------
Minclude/linux/memcontrol.h | 15+++++++++++++--
Minclude/linux/mm.h | 48+++++++++---------------------------------------
Minclude/linux/mmu_notifier.h | 27+++------------------------
Minclude/linux/mmzone.h | 4+++-
Minclude/linux/page-flags.h | 14+++++++++++++-
Minclude/linux/pfn_t.h | 4+++-
Ainclude/linux/psi.h | 53+++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/linux/psi_types.h | 92+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/linux/sched.h | 13++++++++++---
Minclude/linux/sched/loadavg.h | 24++++++++++++++++++++----
Minclude/linux/slab.h | 56+++++++++++++++++++++++++++++++++++++++++++++-----------
Minclude/linux/swap.h | 15++++++++-------
Minclude/trace/events/mmflags.h | 1+
Minclude/uapi/linux/taskstats.h | 6+++++-
Minit/Kconfig | 19+++++++++++++++++++
Mkernel/cgroup/cgroup.c | 45+++++++++++++++++++++++++++++++++++++++++++--
Mkernel/debug/kdb/kdb_main.c | 7+------
Mkernel/delayacct.c | 15+++++++++++++++
Mkernel/fork.c | 59+++++++++++++++++++++++++++++++++++++++++++++++++++++------
Mkernel/memremap.c | 25++++++++++---------------
Mkernel/sched/Makefile | 1+
Mkernel/sched/core.c | 16+++++++++++-----
Mkernel/sched/loadavg.c | 139+++++++++++++++++++++++++++++++++++--------------------------------------------
Akernel/sched/psi.c | 759+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mkernel/sched/sched.h | 178+++++++++++++++++++++++++++++++++++++++++++------------------------------------
Mkernel/sched/stats.h | 86+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mlib/test_kasan.c | 70++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mmm/compaction.c | 5+++++
Mmm/debug.c | 46++++++++++++++++++++++++++++++++++++++++++++++
Mmm/filemap.c | 37+++++++++++++++++++++++++------------
Mmm/gup.c | 115++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Mmm/gup_benchmark.c | 37+++++++++++++++++++++++++++++++++----
Mmm/hmm.c | 12+++++++-----
Mmm/huge_memory.c | 31++++++++++++++++++++-----------
Mmm/hugetlb.c | 6++++++
Mmm/kasan/quarantine.c | 18+++++++++---------
Mmm/kmemleak.c | 42+++++++++++++++++++++++++++++++++++-------
Mmm/memblock.c | 5++---
Mmm/memcontrol.c | 54++++++++++++++++--------------------------------------
Mmm/memory.c | 156++++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Mmm/memory_hotplug.c | 146++++++++++++++++++++++---------------------------------------------------------
Mmm/mempolicy.c | 35++++++++++++++++++++++-------------
Mmm/migrate.c | 44+++++++++++++++++++++++++-------------------
Mmm/mmap.c | 96+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
Mmm/mmu_notifier.c | 31-------------------------------
Mmm/mremap.c | 20++++++++++++++++----
Mmm/nommu.c | 6++----
Mmm/page-writeback.c | 33+++++++++++++++------------------
Mmm/page_alloc.c | 362++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
Mmm/page_io.c | 6+++---
Mmm/slab.c | 8++++++--
Mmm/slab_common.c | 115+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Mmm/slub.c | 83+++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
Mmm/sparse.c | 4+---
Mmm/swap.c | 1-
Mmm/swap_state.c | 1+
Mmm/swapfile.c | 83++++++++++++++++++++++++++++++++++++-------------------------------------------
Mmm/util.c | 5++---
Mmm/vmalloc.c | 4++++
Mmm/vmscan.c | 31++++++++++++++++++++++++++-----
Mmm/vmstat.c | 10+++++-----
Mmm/workingset.c | 135++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
Mmm/zsmalloc.c | 2+-
Mscripts/tags.sh | 2+-
Mtools/accounting/getdelays.c | 8+++++++-
Mtools/testing/selftests/vm/.gitignore | 1+
Mtools/testing/selftests/vm/Makefile | 1+
Mtools/testing/selftests/vm/gup_benchmark.c | 42+++++++++++++++++++++++++++++++++++-------
Atools/testing/selftests/vm/map_fixed_noreplace.c | 206+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtools/testing/selftests/vm/userfaultfd.c | 134+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Mvirt/kvm/kvm_main.c | 1-
156 files changed, 3400 insertions(+), 1988 deletions(-)

diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt @@ -0,0 +1,73 @@ +================================ +PSI - Pressure Stall Information +================================ + +:Date: April, 2018 +:Author: Johannes Weiner <hannes@cmpxchg.org> + +When CPU, memory or IO devices are contended, workloads experience +latency spikes, throughput losses, and run the risk of OOM kills. + +Without an accurate measure of such contention, users are forced to +either play it safe and under-utilize their hardware resources, or +roll the dice and frequently suffer the disruptions resulting from +excessive overcommit. + +The psi feature identifies and quantifies the disruptions caused by +such resource crunches and the time impact it has on complex workloads +or even entire systems. + +Having an accurate measure of productivity losses caused by resource +scarcity aids users in sizing workloads to hardware--or provisioning +hardware according to workload demand. + +As psi aggregates this information in realtime, systems can be managed +dynamically using techniques such as load shedding, migrating jobs to +other systems or data centers, or strategically pausing or killing low +priority or restartable batch jobs. + +This allows maximizing hardware utilization without sacrificing +workload health or risking major disruptions such as OOM kills. + +Pressure interface +================== + +Pressure information for each resource is exported through the +respective file in /proc/pressure/ -- cpu, memory, and io. + +The format for CPU is as such: + +some avg10=0.00 avg60=0.00 avg300=0.00 total=0 + +and for memory and IO: + +some avg10=0.00 avg60=0.00 avg300=0.00 total=0 +full avg10=0.00 avg60=0.00 avg300=0.00 total=0 + +The "some" line indicates the share of time in which at least some +tasks are stalled on a given resource. + +The "full" line indicates the share of time in which all non-idle +tasks are stalled on a given resource simultaneously. In this state +actual CPU cycles are going to waste, and a workload that spends +extended time in this state is considered to be thrashing. This has +severe impact on performance, and it's useful to distinguish this +situation from a state where some tasks are stalled but the CPU is +still doing productive work. As such, time spent in this subset of the +stall state is tracked separately and exported in the "full" averages. + +The ratios are tracked as recent trends over ten, sixty, and three +hundred second windows, which gives insight into short term events as +well as medium and long term trends. The total absolute stall time is +tracked and exported as well, to allow detection of latency spikes +which wouldn't necessarily make a dent in the time averages, or to +average trends over custom time frames. + +Cgroup2 interface +================= + +In a system with a CONFIG_CGROUP=y kernel and the cgroup2 filesystem +mounted, pressure stall information is also tracked for tasks grouped +into cgroups. Each subdirectory in the cgroupfs mountpoint contains +cpu.pressure, memory.pressure, and io.pressure files; the format is +the same as the /proc/pressure/ files. diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst @@ -966,6 +966,12 @@ All time durations are in microseconds. $PERIOD duration. "max" for $MAX indicates no limit. If only one number is written, $MAX is updated. + cpu.pressure + A read-only nested-key file which exists on non-root cgroups. + + Shows pressure stall information for CPU. See + Documentation/accounting/psi.txt for details. + Memory ------ @@ -1127,6 +1133,10 @@ PAGE_SIZE multiple when read back. disk readahead. For now OOM in memory cgroup kills tasks iff shortage has happened inside page fault. + This event is not raised if the OOM killer is not + considered as an option, e.g. for failed high-order + allocations. + oom_kill The number of processes belonging to this cgroup killed by any kind of OOM killer. @@ -1271,6 +1281,12 @@ PAGE_SIZE multiple when read back. higher than the limit for an extended period of time. This reduces the impact on the workload and memory management. + memory.pressure + A read-only nested-key file which exists on non-root cgroups. + + Shows pressure stall information for memory. See + Documentation/accounting/psi.txt for details. + Usage Guidelines ~~~~~~~~~~~~~~~~ @@ -1408,6 +1424,12 @@ IO Interface Files 8:16 rbps=2097152 wbps=max riops=max wiops=max + io.pressure + A read-only nested-key file which exists on non-root cgroups. + + Shows pressure stall information for IO. See + Documentation/accounting/psi.txt for details. + Writeback ~~~~~~~~~ diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt @@ -4851,6 +4851,18 @@ This is actually a boot loader parameter; the value is passed to the kernel using a special protocol. + vm_debug[=options] [KNL] Available with CONFIG_DEBUG_VM=y. + May slow down system boot speed, especially when + enabled on systems with a large amount of memory. + All options are enabled by default, and this + interface is meant to allow for selectively + enabling or disabling specific virtual memory + debugging features. + + Available options are: + P Enable page structure init time poisoning + - Disable all of the above options + vmalloc=nn[KMG] [KNL,BOOT] Forces the vmalloc area to have an exact size of <nn>. This can be used to increase the minimum size (128MB on x86). It can also be used to diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt @@ -858,6 +858,7 @@ Writeback: 0 kB AnonPages: 861800 kB Mapped: 280372 kB Shmem: 644 kB +KReclaimable: 168048 kB Slab: 284364 kB SReclaimable: 159856 kB SUnreclaim: 124508 kB @@ -925,6 +926,9 @@ AnonHugePages: Non-file backed huge pages mapped into userspace page tables ShmemHugePages: Memory used by shared memory (shmem) and tmpfs allocated with huge pages ShmemPmdMapped: Shared memory mapped into userspace with huge pages +KReclaimable: Kernel allocations that the kernel will attempt to reclaim + under memory pressure. Includes SReclaimable (below), and other + direct allocations with a shrinker. Slab: in-kernel data structures cache SReclaimable: Part of Slab, that might be reclaimed, such as caches SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst @@ -36,9 +36,10 @@ debugging is enabled. Format: slub_debug=<Debug-Options> Enable options for all slabs -slub_debug=<Debug-Options>,<slab name> - Enable options only for select slabs +slub_debug=<Debug-Options>,<slab name1>,<slab name2>,... + Enable options only for select slabs (no spaces + after a comma) Possible debug options are:: @@ -62,7 +63,12 @@ Trying to find an issue in the dentry cache? Try:: slub_debug=,dentry -to only enable debugging on the dentry cache. +to only enable debugging on the dentry cache. You may use an asterisk at the +end of the slab name, in order to cover all slabs with the same prefix. For +example, here's how you can poison the dentry cache as well as all kmalloc +slabs: + + slub_debug=P,kmalloc-*,dentry Red zoning and tracking may realign the slab. We can just apply sanity checks to the dentry cache with:: diff --git a/Documentation/x86/pat.txt b/Documentation/x86/pat.txt @@ -90,12 +90,12 @@ pci proc | -- | -- | WC | Advanced APIs for drivers ------------------------- A. Exporting pages to users with remap_pfn_range, io_remap_pfn_range, -vm_insert_pfn +vmf_insert_pfn Drivers wanting to export some pages to userspace do it by using mmap interface and a combination of 1) pgprot_noncached() -2) io_remap_pfn_range() or remap_pfn_range() or vm_insert_pfn() +2) io_remap_pfn_range() or remap_pfn_range() or vmf_insert_pfn() With PAT support, a new API pgprot_writecombine is being added. So, drivers can continue to use the above sequence, with either pgprot_noncached() or diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig @@ -31,6 +31,8 @@ config ALPHA select ODD_RT_SIGACTION select OLD_SIGSUSPEND select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67 + select HAVE_MEMBLOCK + select NO_BOOTMEM help The Alpha is a 64-bit general-purpose processor designed and marketed by the Digital Equipment Corporation of blessed memory, diff --git a/arch/alpha/kernel/core_irongate.c b/arch/alpha/kernel/core_irongate.c @@ -21,6 +21,7 @@ #include <linux/init.h> #include <linux/initrd.h> #include <linux/bootmem.h> +#include <linux/memblock.h> #include <asm/ptrace.h> #include <asm/cacheflush.h> @@ -241,8 +242,7 @@ albacore_init_arch(void) size / 1024); } #endif - reserve_bootmem_node(NODE_DATA(0), pci_mem, memtop - - pci_mem, BOOTMEM_DEFAULT); + memblock_reserve(pci_mem, memtop - pci_mem); printk("irongate_init_arch: temporarily reserving " "region %08lx-%08lx for PCI\n", pci_mem, memtop - 1); } diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c @@ -30,6 +30,7 @@ #include <linux/ioport.h> #include <linux/platform_device.h> #include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/pci.h> #include <linux/seq_file.h> #include <linux/root_dev.h> @@ -312,9 +313,7 @@ setup_memory(void *kernel_end) { struct memclust_struct * cluster; struct memdesc_struct * memdesc; - unsigned long start_kernel_pfn, end_kernel_pfn; - unsigned long bootmap_size, bootmap_pages, bootmap_start; - unsigned long start, end; + unsigned long kernel_size; unsigned long i; /* Find free clusters, and init and free the bootmem accordingly. */ @@ -322,6 +321,8 @@ setup_memory(void *kernel_end) (hwrpb->mddt_offset + (unsigned long) hwrpb); for_each_mem_cluster(memdesc, cluster, i) { + unsigned long end; + printk("memcluster %lu, usage %01lx, start %8lu, end %8lu\n", i, cluster->usage, cluster->start_pfn, cluster->start_pfn + cluster->numpages); @@ -335,6 +336,9 @@ setup_memory(void *kernel_end) end = cluster->start_pfn + cluster->numpages; if (end > max_low_pfn) max_low_pfn = end; + + memblock_add(PFN_PHYS(cluster->start_pfn), + cluster->numpages << PAGE_SHIFT); } /* @@ -363,87 +367,9 @@ setup_memory(void *kernel_end) max_low_pfn = mem_size_limit; } - /* Find the bounds of kernel memory. */ - start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS); - end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end)); - bootmap_start = -1; - - try_again: - if (max_low_pfn <= end_kernel_pfn) - panic("not enough memory to boot"); - - /* We need to know how many physically contiguous pages - we'll need for the bootmap. */ - bootmap_pages = bootmem_bootmap_pages(max_low_pfn); - - /* Now find a good region where to allocate the bootmap. */ - for_each_mem_cluster(memdesc, cluster, i) { - if (cluster->usage & 3) - continue; - - start = cluster->start_pfn; - end = start + cluster->numpages; - if (start >= max_low_pfn) - continue; - if (end > max_low_pfn) - end = max_low_pfn; - if (start < start_kernel_pfn) { - if (end > end_kernel_pfn - && end - end_kernel_pfn >= bootmap_pages) { - bootmap_start = end_kernel_pfn; - break; - } else if (end > start_kernel_pfn) - end = start_kernel_pfn; - } else if (start < end_kernel_pfn) - start = end_kernel_pfn; - if (end - start >= bootmap_pages) { - bootmap_start = start; - break; - } - } - - if (bootmap_start == ~0UL) { - max_low_pfn >>= 1; - goto try_again; - } - - /* Allocate the bootmap and mark the whole MM as reserved. */ - bootmap_size = init_bootmem(bootmap_start, max_low_pfn); - - /* Mark the free regions. */ - for_each_mem_cluster(memdesc, cluster, i) { - if (cluster->usage & 3) - continue; - - start = cluster->start_pfn; - end = cluster->start_pfn + cluster->numpages; - if (start >= max_low_pfn) - continue; - if (end > max_low_pfn) - end = max_low_pfn; - if (start < start_kernel_pfn) { - if (end > end_kernel_pfn) { - free_bootmem(PFN_PHYS(start), - (PFN_PHYS(start_kernel_pfn) - - PFN_PHYS(start))); - printk("freeing pages %ld:%ld\n", - start, start_kernel_pfn); - start = end_kernel_pfn; - } else if (end > start_kernel_pfn) - end = start_kernel_pfn; - } else if (start < end_kernel_pfn) - start = end_kernel_pfn; - if (start >= end) - continue; - - free_bootmem(PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start)); - printk("freeing pages %ld:%ld\n", start, end); - } - - /* Reserve the bootmap memory. */ - reserve_bootmem(PFN_PHYS(bootmap_start), bootmap_size, - BOOTMEM_DEFAULT); - printk("reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size)); + /* Reserve the kernel memory. */ + kernel_size = virt_to_phys(kernel_end) - KERNEL_START_PHYS; + memblock_reserve(KERNEL_START_PHYS, kernel_size); #ifdef CONFIG_BLK_DEV_INITRD initrd_start = INITRD_START; @@ -459,8 +385,8 @@ setup_memory(void *kernel_end) initrd_end, phys_to_virt(PFN_PHYS(max_low_pfn))); } else { - reserve_bootmem(virt_to_phys((void *)initrd_start), - INITRD_SIZE, BOOTMEM_DEFAULT); + memblock_reserve(virt_to_phys((void *)initrd_start), + INITRD_SIZE); } } #endif /* CONFIG_BLK_DEV_INITRD */ diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/swap.h> #include <linux/initrd.h> #include <linux/pfn.h> @@ -59,12 +60,10 @@ setup_memory_node(int nid, void *kernel_end) struct memclust_struct * cluster; struct memdesc_struct * memdesc; unsigned long start_kernel_pfn, end_kernel_pfn; - unsigned long bootmap_size, bootmap_pages, bootmap_start; unsigned long start, end; unsigned long node_pfn_start, node_pfn_end; unsigned long node_min_pfn, node_max_pfn; int i; - unsigned long node_datasz = PFN_UP(sizeof(pg_data_t)); int show_init = 0; /* Find the bounds of current node */ @@ -134,24 +133,14 @@ setup_memory_node(int nid, void *kernel_end) /* Cute trick to make sure our local node data is on local memory */ node_data[nid] = (pg_data_t *)(__va(node_min_pfn << PAGE_SHIFT)); #endif - /* Quasi-mark the pg_data_t as in-use */ - node_min_pfn += node_datasz; - if (node_min_pfn >= node_max_pfn) { - printk(" not enough mem to reserve NODE_DATA"); - return; - } - NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; - printk(" Detected node memory: start %8lu, end %8lu\n", node_min_pfn, node_max_pfn); DBGDCONT(" DISCONTIG: node_data[%d] is at 0x%p\n", nid, NODE_DATA(nid)); - DBGDCONT(" DISCONTIG: NODE_DATA(%d)->bdata is at 0x%p\n", nid, NODE_DATA(nid)->bdata); /* Find the bounds of kernel memory. */ start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS); end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end)); - bootmap_start = -1; if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn)) panic("kernel loaded out of ram"); @@ -161,89 +150,11 @@ setup_memory_node(int nid, void *kernel_end) has much larger alignment than 8Mb, so it's safe. */ node_min_pfn &= ~((1UL << (MAX_ORDER-1))-1); - /* We need to know how many physically contiguous pages - we'll need for the bootmap. */ - bootmap_pages = bootmem_bootmap_pages(node_max_pfn-node_min_pfn); - - /* Now find a good region where to allocate the bootmap. */ - for_each_mem_cluster(memdesc, cluster, i) { - if (cluster->usage & 3) - continue; - - start = cluster->start_pfn; - end = start + cluster->numpages; - - if (start >= node_max_pfn || end <= node_min_pfn) - continue; - - if (end > node_max_pfn) - end = node_max_pfn; - if (start < node_min_pfn) - start = node_min_pfn; - - if (start < start_kernel_pfn) { - if (end > end_kernel_pfn - && end - end_kernel_pfn >= bootmap_pages) { - bootmap_start = end_kernel_pfn; - break; - } else if (end > start_kernel_pfn) - end = start_kernel_pfn; - } else if (start < end_kernel_pfn) - start = end_kernel_pfn; - if (end - start >= bootmap_pages) { - bootmap_start = start; - break; - } - } - - if (bootmap_start == -1) - panic("couldn't find a contiguous place for the bootmap"); - - /* Allocate the bootmap and mark the whole MM as reserved. */ - bootmap_size = init_bootmem_node(NODE_DATA(nid), bootmap_start, - node_min_pfn, node_max_pfn); - DBGDCONT(" bootmap_start %lu, bootmap_size %lu, bootmap_pages %lu\n", - bootmap_start, bootmap_size, bootmap_pages); + memblock_add(PFN_PHYS(node_min_pfn), + (node_max_pfn - node_min_pfn) << PAGE_SHIFT); - /* Mark the free regions. */ - for_each_mem_cluster(memdesc, cluster, i) { - if (cluster->usage & 3) - continue; - - start = cluster->start_pfn; - end = cluster->start_pfn + cluster->numpages; - - if (start >= node_max_pfn || end <= node_min_pfn) - continue; - - if (end > node_max_pfn) - end = node_max_pfn; - if (start < node_min_pfn) - start = node_min_pfn; - - if (start < start_kernel_pfn) { - if (end > end_kernel_pfn) { - free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), - (PFN_PHYS(start_kernel_pfn) - - PFN_PHYS(start))); - printk(" freeing pages %ld:%ld\n", - start, start_kernel_pfn); - start = end_kernel_pfn; - } else if (end > start_kernel_pfn) - end = start_kernel_pfn; - } else if (start < end_kernel_pfn) - start = end_kernel_pfn; - if (start >= end) - continue; - - free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start)); - printk(" freeing pages %ld:%ld\n", start, end); - } - - /* Reserve the bootmap memory. */ - reserve_bootmem_node(NODE_DATA(nid), PFN_PHYS(bootmap_start), - bootmap_size, BOOTMEM_DEFAULT); - printk(" reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size)); + NODE_DATA(nid)->node_start_pfn = node_min_pfn; + NODE_DATA(nid)->node_present_pages = node_max_pfn - node_min_pfn; node_set_online(nid); } @@ -251,6 +162,7 @@ setup_memory_node(int nid, void *kernel_end) void __init setup_memory(void *kernel_end) { + unsigned long kernel_size; int nid; show_mem_layout(); @@ -262,6 +174,9 @@ setup_memory(void *kernel_end) for (nid = 0; nid < MAX_NUMNODES; nid++) setup_memory_node(nid, kernel_end); + kernel_size = virt_to_phys(kernel_end) - KERNEL_START_PHYS; + memblock_reserve(KERNEL_START_PHYS, kernel_size); + #ifdef CONFIG_BLK_DEV_INITRD initrd_start = INITRD_START; if (initrd_start) { @@ -279,9 +194,8 @@ setup_memory(void *kernel_end) phys_to_virt(PFN_PHYS(max_low_pfn))); } else { nid = kvaddr_to_nid(initrd_start); - reserve_bootmem_node(NODE_DATA(nid), - virt_to_phys((void *)initrd_start), - INITRD_SIZE, BOOTMEM_DEFAULT); + memblock_reserve(virt_to_phys((void *)initrd_start), + INITRD_SIZE); } } #endif /* CONFIG_BLK_DEV_INITRD */ @@ -303,9 +217,8 @@ void __init paging_init(void) dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; for_each_online_node(nid) { - bootmem_data_t *bdata = &bootmem_node_data[nid]; - unsigned long start_pfn = bdata->node_min_pfn; - unsigned long end_pfn = bdata->node_low_pfn; + unsigned long start_pfn = NODE_DATA(nid)->node_start_pfn; + unsigned long end_pfn = start_pfn + NODE_DATA(nid)->node_present_pages; if (dma_local_pfn >= end_pfn - start_pfn) zones_size[ZONE_DMA] = end_pfn - start_pfn; diff --git a/arch/arm/include/asm/hugetlb-3level.h b/arch/arm/include/asm/hugetlb-3level.h @@ -29,6 +29,7 @@ * ptes. * (The valid bit is automatically cleared by set_pte_at for PROT_NONE ptes). */ +#define __HAVE_ARCH_HUGE_PTEP_GET static inline pte_t huge_ptep_get(pte_t *ptep) { pte_t retval = *ptep; @@ -37,35 +38,4 @@ static inline pte_t huge_ptep_get(pte_t *ptep) return retval; } -static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - set_pte_at(mm, addr, ptep, pte); -} - -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - ptep_clear_flush(vma, addr, ptep); -} - -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} - -static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - return ptep_get_and_clear(mm, addr, ptep); -} - -static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - pte_t pte, int dirty) -{ - return ptep_set_access_flags(vma, addr, ptep, pte, dirty); -} - #endif /* _ASM_ARM_HUGETLB_3LEVEL_H */ diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h @@ -23,18 +23,8 @@ #define _ASM_ARM_HUGETLB_H #include <asm/page.h> -#include <asm-generic/hugetlb.h> - #include <asm/hugetlb-3level.h> - -static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, - unsigned long addr, unsigned long end, - unsigned long floor, - unsigned long ceiling) -{ - free_pgd_range(tlb, addr, end, floor, ceiling); -} - +#include <asm-generic/hugetlb.h> static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) @@ -42,27 +32,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, return 0; } -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - struct hstate *h = hstate_file(file); - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; - return 0; -} - -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h @@ -20,48 +20,18 @@ #include <asm/page.h> +#define __HAVE_ARCH_HUGE_PTEP_GET static inline pte_t huge_ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); } - - -static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, - unsigned long addr, unsigned long end, - unsigned long floor, - unsigned long ceiling) -{ - free_pgd_range(tlb, addr, end, floor, ceiling); -} - static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { return 0; } -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - struct hstate *h = hstate_file(file); - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; - return 0; -} - -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); @@ -70,20 +40,25 @@ static inline void arch_clear_hugepage_flags(struct page *page) extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, struct page *page, int writable); #define arch_make_huge_pte arch_make_huge_pte +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT extern void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH extern void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +#define __HAVE_ARCH_HUGE_PTE_CLEAR extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz); -#define huge_pte_clear huge_pte_clear extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz); #define set_huge_swap_pte_at set_huge_swap_pte_at diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h @@ -16,6 +16,7 @@ #ifndef __ASM_STRING_H #define __ASM_STRING_H +#ifndef CONFIG_KASAN #define __HAVE_ARCH_STRRCHR extern char *strrchr(const char *, int c); @@ -34,6 +35,13 @@ extern __kernel_size_t strlen(const char *); #define __HAVE_ARCH_STRNLEN extern __kernel_size_t strnlen(const char *, __kernel_size_t); +#define __HAVE_ARCH_MEMCMP +extern int memcmp(const void *, const void *, size_t); + +#define __HAVE_ARCH_MEMCHR +extern void *memchr(const void *, int, __kernel_size_t); +#endif + #define __HAVE_ARCH_MEMCPY extern void *memcpy(void *, const void *, __kernel_size_t); extern void *__memcpy(void *, const void *, __kernel_size_t); @@ -42,16 +50,10 @@ extern void *__memcpy(void *, const void *, __kernel_size_t); extern void *memmove(void *, const void *, __kernel_size_t); extern void *__memmove(void *, const void *, __kernel_size_t); -#define __HAVE_ARCH_MEMCHR -extern void *memchr(const void *, int, __kernel_size_t); - #define __HAVE_ARCH_MEMSET extern void *memset(void *, int, __kernel_size_t); extern void *__memset(void *, int, __kernel_size_t); -#define __HAVE_ARCH_MEMCMP -extern int memcmp(const void *, const void *, size_t); - #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE #define __HAVE_ARCH_MEMCPY_FLUSHCACHE void memcpy_flushcache(void *dst, const void *src, size_t cnt); diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c @@ -44,20 +44,23 @@ EXPORT_SYMBOL(__arch_copy_in_user); EXPORT_SYMBOL(memstart_addr); /* string / mem functions */ +#ifndef CONFIG_KASAN EXPORT_SYMBOL(strchr); EXPORT_SYMBOL(strrchr); EXPORT_SYMBOL(strcmp); EXPORT_SYMBOL(strncmp); EXPORT_SYMBOL(strlen); EXPORT_SYMBOL(strnlen); +EXPORT_SYMBOL(memcmp); +EXPORT_SYMBOL(memchr); +#endif + EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(__memset); EXPORT_SYMBOL(__memcpy); EXPORT_SYMBOL(__memmove); -EXPORT_SYMBOL(memchr); -EXPORT_SYMBOL(memcmp); /* atomic bitops */ EXPORT_SYMBOL(set_bit); diff --git a/arch/arm64/lib/memchr.S b/arch/arm64/lib/memchr.S @@ -30,7 +30,7 @@ * Returns: * x0 - address of first occurrence of 'c' or 0 */ -ENTRY(memchr) +WEAK(memchr) and w1, w1, #0xff 1: subs x2, x2, #1 b.mi 2f diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S @@ -58,7 +58,7 @@ pos .req x11 limit_wd .req x12 mask .req x13 -ENTRY(memcmp) +WEAK(memcmp) cbz limit, .Lret0 eor tmp1, src1, src2 tst tmp1, #7 diff --git a/arch/arm64/lib/strchr.S b/arch/arm64/lib/strchr.S @@ -29,7 +29,7 @@ * Returns: * x0 - address of first occurrence of 'c' or 0 */ -ENTRY(strchr) +WEAK(strchr) and w1, w1, #0xff 1: ldrb w2, [x0], #1 cmp w2, w1 diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S @@ -60,7 +60,7 @@ tmp3 .req x9 zeroones .req x10 pos .req x11 -ENTRY(strcmp) +WEAK(strcmp) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S @@ -56,7 +56,7 @@ pos .req x12 #define REP8_7f 0x7f7f7f7f7f7f7f7f #define REP8_80 0x8080808080808080 -ENTRY(strlen) +WEAK(strlen) mov zeroones, #REP8_01 bic src, srcin, #15 ands tmp1, srcin, #15 diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S @@ -64,7 +64,7 @@ limit_wd .req x13 mask .req x14 endloop .req x15 -ENTRY(strncmp) +WEAK(strncmp) cbz limit, .Lret0 eor tmp1, src1, src2 mov zeroones, #REP8_01 diff --git a/arch/arm64/lib/strnlen.S b/arch/arm64/lib/strnlen.S @@ -59,7 +59,7 @@ limit_wd .req x14 #define REP8_7f 0x7f7f7f7f7f7f7f7f #define REP8_80 0x8080808080808080 -ENTRY(strnlen) +WEAK(strnlen) cbz limit, .Lhit_limit mov zeroones, #REP8_01 bic src, srcin, #15 diff --git a/arch/arm64/lib/strrchr.S b/arch/arm64/lib/strrchr.S @@ -29,7 +29,7 @@ * Returns: * x0 - address of last occurrence of 'c' or 0 */ -ENTRY(strrchr) +WEAK(strrchr) mov x3, #0 and w1, w1, #0xff 1: ldrb w2, [x0], #1 diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig @@ -21,6 +21,9 @@ config HEXAGON select GENERIC_IRQ_SHOW select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK + select HAVE_MEMBLOCK + select ARCH_DISCARD_MEMBLOCK + select NO_BOOTMEM select NEED_SG_DMA_LENGTH select NO_IOPORT_MAP select GENERIC_IOMAP diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c @@ -21,6 +21,7 @@ #include <linux/init.h> #include <linux/mm.h> #include <linux/bootmem.h> +#include <linux/memblock.h> #include <asm/atomic.h> #include <linux/highmem.h> #include <asm/tlb.h> @@ -176,7 +177,6 @@ size_t hexagon_coherent_pool_size = (size_t) (DMA_RESERVE << 22); void __init setup_arch_memory(void) { - int bootmap_size; /* XXX Todo: this probably should be cleaned up */ u32 *segtable = (u32 *) &swapper_pg_dir[0]; u32 *segtable_end; @@ -195,18 +195,22 @@ void __init setup_arch_memory(void) bootmem_lastpg = PFN_DOWN((bootmem_lastpg << PAGE_SHIFT) & ~((BIG_KERNEL_PAGE_SIZE) - 1)); + memblock_add(PHYS_OFFSET, + (bootmem_lastpg - ARCH_PFN_OFFSET) << PAGE_SHIFT); + + /* Reserve kernel text/data/bss */ + memblock_reserve(PHYS_OFFSET, + (bootmem_startpg - ARCH_PFN_OFFSET) << PAGE_SHIFT); /* * Reserve the top DMA_RESERVE bytes of RAM for DMA (uncached) * memory allocation */ - max_low_pfn = bootmem_lastpg - PFN_DOWN(DMA_RESERVED_BYTES); min_low_pfn = ARCH_PFN_OFFSET; - bootmap_size = init_bootmem_node(NODE_DATA(0), bootmem_startpg, min_low_pfn, max_low_pfn); + memblock_reserve(PFN_PHYS(max_low_pfn), DMA_RESERVED_BYTES); printk(KERN_INFO "bootmem_startpg: 0x%08lx\n", bootmem_startpg); printk(KERN_INFO "bootmem_lastpg: 0x%08lx\n", bootmem_lastpg); - printk(KERN_INFO "bootmap_size: %d\n", bootmap_size); printk(KERN_INFO "min_low_pfn: 0x%08lx\n", min_low_pfn); printk(KERN_INFO "max_low_pfn: 0x%08lx\n", max_low_pfn); @@ -257,14 +261,6 @@ void __init setup_arch_memory(void) #endif /* - * Free all the memory that wasn't taken up by the bootmap, the DMA - * reserve, or kernel itself. - */ - free_bootmem(PFN_PHYS(bootmem_startpg) + bootmap_size, - PFN_PHYS(bootmem_lastpg - bootmem_startpg) - bootmap_size - - DMA_RESERVED_BYTES); - - /* * The bootmem allocator seemingly just lives to feed memory * to the paging system */ diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h @@ -3,13 +3,13 @@ #define _ASM_IA64_HUGETLB_H #include <asm/page.h> -#include <asm-generic/hugetlb.h> - +#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); +#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len); @@ -21,53 +21,16 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE); } -static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - set_pte_at(mm, addr, ptep, pte); -} - -static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - return ptep_get_and_clear(mm, addr, ptep); -} - +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} - -static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - pte_t pte, int dirty) -{ - return ptep_set_access_flags(vma, addr, ptep, pte, dirty); -} - -static inline pte_t huge_ptep_get(pte_t *ptep) -{ - return *ptep; -} - static inline void arch_clear_hugepage_flags(struct page *page) { } +#include <asm-generic/hugetlb.h> + #endif /* _ASM_IA64_HUGETLB_H */ diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h @@ -544,7 +544,6 @@ extern struct page *zero_page_memmap_ptr; # ifdef CONFIG_VIRTUAL_MEM_MAP /* arch mem_map init routine is needed due to holes in a virtual mem_map */ -# define __HAVE_ARCH_MEMMAP_INIT extern void memmap_init (unsigned long size, int nid, unsigned long zone, unsigned long start_pfn); # endif /* CONFIG_VIRTUAL_MEM_MAP */ diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h @@ -10,8 +10,6 @@ #define __ASM_HUGETLB_H #include <asm/page.h> -#include <asm-generic/hugetlb.h> - static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, @@ -20,6 +18,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, return 0; } +#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) @@ -38,21 +37,7 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, - unsigned long addr, - unsigned long end, - unsigned long floor, - unsigned long ceiling) -{ - free_pgd_range(tlb, addr, end, floor, ceiling); -} - -static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - set_pte_at(mm, addr, ptep, pte); -} - +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -64,29 +49,21 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, return pte; } +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { flush_tlb_page(vma, addr & huge_page_mask(hstate_vma(vma))); } +#define __HAVE_ARCH_HUGE_PTE_NONE static inline int huge_pte_none(pte_t pte) { unsigned long val = pte_val(pte) & ~_PAGE_GLOBAL; return !val || (val == (unsigned long)invalid_pte_table); } -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} - +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, @@ -105,13 +82,10 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, return changed; } -static inline pte_t huge_ptep_get(pte_t *ptep) -{ - return *ptep; -} - static inline void arch_clear_hugepage_flags(struct page *page) { } +#include <asm-generic/hugetlb.h> + #endif /* __ASM_HUGETLB_H */ diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig @@ -23,6 +23,9 @@ config NIOS2 select SPARSE_IRQ select USB_ARCH_HAS_HCD if USB_SUPPORT select CPU_NO_EFFICIENT_FFS + select HAVE_MEMBLOCK + select ARCH_DISCARD_MEMBLOCK + select NO_BOOTMEM config GENERIC_CSUM def_bool y diff --git a/arch/nios2/kernel/prom.c b/arch/nios2/kernel/prom.c @@ -32,23 +32,6 @@ #include <asm/sections.h> -void __init early_init_dt_add_memory_arch(u64 base, u64 size) -{ - u64 kernel_start = (u64)virt_to_phys(_text); - - if (!memory_size && - (kernel_start >= base) && (kernel_start < (base + size))) - memory_size = size; - -} - -int __init early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size, - bool nomap) -{ - reserve_bootmem(base, size, BOOTMEM_DEFAULT); - return 0; -} - void __init early_init_devtree(void *params) { __be32 *dtb = (u32 *)__dtb_start; diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c @@ -17,6 +17,7 @@ #include <linux/sched/task.h> #include <linux/console.h> #include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/initrd.h> #include <linux/of_fdt.h> #include <linux/screen_info.h> @@ -143,10 +144,12 @@ asmlinkage void __init nios2_boot_init(unsigned r4, unsigned r5, unsigned r6, void __init setup_arch(char **cmdline_p) { - int bootmap_size; + int dram_start; console_verbose(); + dram_start = memblock_start_of_DRAM(); + memory_size = memblock_phys_mem_size(); memory_start = PAGE_ALIGN((unsigned long)__pa(_end)); memory_end = (unsigned long) CONFIG_NIOS2_MEM_BASE + memory_size; @@ -163,39 +166,11 @@ void __init setup_arch(char **cmdline_p) max_low_pfn = PFN_DOWN(memory_end); max_mapnr = max_low_pfn; - /* - * give all the memory to the bootmap allocator, tell it to put the - * boot mem_map at the start of memory - */ - pr_debug("init_bootmem_node(?,%#lx, %#x, %#lx)\n", - min_low_pfn, PFN_DOWN(PHYS_OFFSET), max_low_pfn); - bootmap_size = init_bootmem_node(NODE_DATA(0), - min_low_pfn, PFN_DOWN(PHYS_OFFSET), - max_low_pfn); - - /* - * free the usable memory, we have to make sure we do not free - * the bootmem bitmap so we then reserve it after freeing it :-) - */ - pr_debug("free_bootmem(%#lx, %#lx)\n", - memory_start, memory_end - memory_start); - free_bootmem(memory_start, memory_end - memory_start); - - /* - * Reserve the bootmem bitmap itself as well. We do this in two - * steps (first step was init_bootmem()) because this catches - * the (very unlikely) case of us accidentally initializing the - * bootmem allocator with an invalid RAM area. - * - * Arguments are start, size - */ - pr_debug("reserve_bootmem(%#lx, %#x)\n", memory_start, bootmap_size); - reserve_bootmem(memory_start, bootmap_size, BOOTMEM_DEFAULT); - + memblock_reserve(dram_start, memory_start - dram_start); #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start) { - reserve_bootmem(virt_to_phys((void *)initrd_start), - initrd_end - initrd_start, BOOTMEM_DEFAULT); + memblock_reserve(virt_to_phys((void *)initrd_start), + initrd_end - initrd_start); } #endif /* CONFIG_BLK_DEV_INITRD */ diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h @@ -3,12 +3,12 @@ #define _ASM_PARISC64_HUGETLB_H #include <asm/page.h> -#include <asm-generic/hugetlb.h> - +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); @@ -22,6 +22,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. */ +#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { @@ -32,43 +33,25 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, - unsigned long addr, unsigned long end, - unsigned long floor, - unsigned long ceiling) -{ - free_pgd_range(tlb, addr, end, floor, ceiling); -} - +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); -static inline pte_t huge_ptep_get(pte_t *ptep) -{ - return *ptep; -} - static inline void arch_clear_hugepage_flags(struct page *page) { } +#include <asm-generic/hugetlb.h> + #endif /* _ASM_PARISC64_HUGETLB_H */ diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -311,12 +311,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, { pte_update(ptep, _PAGE_RW, 0); } -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} - static inline void __ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, pte_t entry, diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -426,6 +426,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 0); } +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h @@ -4,7 +4,6 @@ #ifdef CONFIG_HUGETLB_PAGE #include <asm/page.h> -#include <asm-generic/hugetlb.h> extern struct kmem_cache *hugepte_cache; @@ -110,31 +109,12 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma, void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); #endif +#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); -/* - * If the arch doesn't supply something else, assume that hugepage - * size aligned regions are ok without further preparation. - */ -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - struct hstate *h = hstate_file(file); - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; - return 0; -} - -static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - set_pte_at(mm, addr, ptep, pte); -} - +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -145,6 +125,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, #endif } +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -153,29 +134,17 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, flush_hugetlb_page(vma, addr); } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); -static inline pte_t huge_ptep_get(pte_t *ptep) -{ - return *ptep; -} - static inline void arch_clear_hugepage_flags(struct page *page) { } +#include <asm-generic/hugetlb.h> + #else /* ! CONFIG_HUGETLB_PAGE */ static inline void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -300,12 +300,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_update(ptep, clr, set); } -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} - static inline void __ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, pte_t entry, diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -275,6 +275,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); } +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/powerpc/platforms/cell/cpufreq_spudemand.c b/arch/powerpc/platforms/cell/cpufreq_spudemand.c @@ -49,7 +49,7 @@ static int calc_freq(struct spu_gov_info_struct *info) cpu = info->policy->cpu; busy_spus = atomic_read(&cbe_spu_info[cpu_to_node(cpu)].busy_spus); - CALC_LOAD(info->busy_spus, EXP, busy_spus * FIXED_1); + info->busy_spus = calc_load(info->busy_spus, EXP, busy_spus * FIXED_1); pr_debug("cpu %d: busy_spus=%d, info->busy_spus=%ld\n", cpu, busy_spus, info->busy_spus); diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c @@ -987,9 +987,9 @@ static void spu_calc_load(void) unsigned long active_tasks; /* fixed-point */ active_tasks = count_active_contexts() * FIXED_1; - CALC_LOAD(spu_avenrun[0], EXP_1, active_tasks); - CALC_LOAD(spu_avenrun[1], EXP_5, active_tasks); - CALC_LOAD(spu_avenrun[2], EXP_15, active_tasks); + spu_avenrun[0] = calc_load(spu_avenrun[0], EXP_1, active_tasks); + spu_avenrun[1] = calc_load(spu_avenrun[1], EXP_5, active_tasks); + spu_avenrun[2] = calc_load(spu_avenrun[2], EXP_15, active_tasks); } static void spusched_wake(struct timer_list *unused) @@ -1071,9 +1071,6 @@ void spuctx_switch_state(struct spu_context *ctx, } } -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - static int show_spu_loadavg(struct seq_file *s, void *private) { int a, b, c; diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c @@ -25,10 +25,6 @@ #include "appldata.h" - -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - /* * OS data * diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h @@ -4,8 +4,6 @@ #include <asm/cacheflush.h> #include <asm/page.h> -#include <asm-generic/hugetlb.h> - static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, @@ -17,6 +15,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. */ +#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { @@ -27,62 +26,17 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, - unsigned long addr, unsigned long end, - unsigned long floor, - unsigned long ceiling) -{ - free_pgd_range(tlb, addr, end, floor, ceiling); -} - -static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - set_pte_at(mm, addr, ptep, pte); -} - -static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - return ptep_get_and_clear(mm, addr, ptep); -} - +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} - -static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - pte_t pte, int dirty) -{ - return ptep_set_access_flags(vma, addr, ptep, pte, dirty); -} - -static inline pte_t huge_ptep_get(pte_t *ptep) -{ - return *ptep; -} - static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); } +#include <asm-generic/hugetlb.h> + #endif /* _ASM_SH_HUGETLB_H */ diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h @@ -3,7 +3,6 @@ #define _ASM_SPARC64_HUGETLB_H #include <asm/page.h> -#include <asm-generic/hugetlb.h> #ifdef CONFIG_HUGETLB_PAGE struct pud_huge_patch_entry { @@ -13,9 +12,11 @@ struct pud_huge_patch_entry { extern struct pud_huge_patch_entry __pud_huge_patch, __pud_huge_patch_end; #endif +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); @@ -25,37 +26,13 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, return 0; } -/* - * If the arch doesn't supply something else, assume that hugepage - * size aligned regions are ok without further preparation. - */ -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - struct hstate *h = hstate_file(file); - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; - return 0; -} - +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -63,6 +40,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, set_huge_pte_at(mm, addr, ptep, pte_wrprotect(old_pte)); } +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) @@ -75,17 +53,15 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, return changed; } -static inline pte_t huge_ptep_get(pte_t *ptep) -{ - return *ptep; -} - static inline void arch_clear_hugepage_flags(struct page *page) { } +#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); +#include <asm-generic/hugetlb.h> + #endif /* _ASM_SPARC64_HUGETLB_H */ diff --git a/arch/um/Kconfig b/arch/um/Kconfig @@ -12,6 +12,8 @@ config UML select HAVE_UID16 select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_DEBUG_KMEMLEAK + select HAVE_MEMBLOCK + select NO_BOOTMEM select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES select GENERIC_CLOCKEVENTS diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c @@ -5,6 +5,7 @@ #include <linux/module.h> #include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/mm.h> #include <linux/pfn.h> #include <asm/page.h> @@ -80,28 +81,23 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end, unsigned long len, unsigned long long highmem) { unsigned long reserve = reserve_end - start; - unsigned long pfn = PFN_UP(__pa(reserve_end)); - unsigned long delta = (len - reserve) >> PAGE_SHIFT; - unsigned long offset, bootmap_size; - long map_size; + long map_size = len - reserve; int err; - offset = uml_reserved - uml_physmem; - map_size = len - offset; if(map_size <= 0) { os_warn("Too few physical memory! Needed=%lu, given=%lu\n", - offset, len); + reserve, len); exit(1); } physmem_fd = create_mem_file(len + highmem); - err = os_map_memory((void *) uml_reserved, physmem_fd, offset, + err = os_map_memory((void *) reserve_end, physmem_fd, reserve, map_size, 1, 1, 1); if (err < 0) { os_warn("setup_physmem - mapping %ld bytes of memory at 0x%p " "failed - errno = %d\n", map_size, - (void *) uml_reserved, err); + (void *) reserve_end, err); exit(1); } @@ -113,9 +109,11 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end, os_write_file(physmem_fd, __syscall_stub_start, PAGE_SIZE); os_fsync_file(physmem_fd); - bootmap_size = init_bootmem(pfn, pfn + delta); - free_bootmem(__pa(reserve_end) + bootmap_size, - len - bootmap_size - reserve); + memblock_add(__pa(start), len + highmem); + memblock_reserve(__pa(start), reserve); + + min_low_pfn = PFN_UP(__pa(reserve_end)); + max_low_pfn = min_low_pfn + (map_size >> PAGE_SHIFT); } int phys_mapping(unsigned long phys, unsigned long long *offset_out) diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig @@ -6,6 +6,7 @@ config UNICORE32 select ARCH_MIGHT_HAVE_PC_SERIO select DMA_DIRECT_OPS select HAVE_MEMBLOCK + select NO_BOOTMEM select HAVE_GENERIC_DMA_COHERENT select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c @@ -84,58 +84,6 @@ static void __init find_limits(unsigned long *min, unsigned long *max_low, } } -static void __init uc32_bootmem_init(unsigned long start_pfn, - unsigned long end_pfn) -{ - struct memblock_region *reg; - unsigned int boot_pages; - phys_addr_t bitmap; - pg_data_t *pgdat; - - /* - * Allocate the bootmem bitmap page. This must be in a region - * of memory which has already been mapped. - */ - boot_pages = bootmem_bootmap_pages(end_pfn - start_pfn); - bitmap = memblock_alloc_base(boot_pages << PAGE_SHIFT, L1_CACHE_BYTES, - __pfn_to_phys(end_pfn)); - - /* - * Initialise the bootmem allocator, handing the - * memory banks over to bootmem. - */ - node_set_online(0); - pgdat = NODE_DATA(0); - init_bootmem_node(pgdat, __phys_to_pfn(bitmap), start_pfn, end_pfn); - - /* Free the lowmem regions from memblock into bootmem. */ - for_each_memblock(memory, reg) { - unsigned long start = memblock_region_memory_base_pfn(reg); - unsigned long end = memblock_region_memory_end_pfn(reg); - - if (end >= end_pfn) - end = end_pfn; - if (start >= end) - break; - - free_bootmem(__pfn_to_phys(start), (end - start) << PAGE_SHIFT); - } - - /* Reserve the lowmem memblock reserved regions in bootmem. */ - for_each_memblock(reserved, reg) { - unsigned long start = memblock_region_reserved_base_pfn(reg); - unsigned long end = memblock_region_reserved_end_pfn(reg); - - if (end >= end_pfn) - end = end_pfn; - if (start >= end) - break; - - reserve_bootmem(__pfn_to_phys(start), - (end - start) << PAGE_SHIFT, BOOTMEM_DEFAULT); - } -} - static void __init uc32_bootmem_free(unsigned long min, unsigned long max_low, unsigned long max_high) { @@ -232,7 +180,7 @@ void __init bootmem_init(void) find_limits(&min, &max_low, &max_high); - uc32_bootmem_init(min, max_low); + node_set_online(0); /* * Sparsemem tries to allocate bootmem in memory_present(), diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c @@ -39,7 +39,7 @@ void __init init_vdso_image(const struct vdso_image *image) struct linux_binprm; -static int vdso_fault(const struct vm_special_mapping *sm, +static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { const struct vdso_image *image = vma->vm_mm->context.vdso_image; @@ -84,12 +84,11 @@ static int vdso_mremap(const struct vm_special_mapping *sm, return 0; } -static int vvar_fault(const struct vm_special_mapping *sm, +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { const struct vdso_image *image = vma->vm_mm->context.vdso_image; long sym_offset; - int ret = -EFAULT; if (!image) return VM_FAULT_SIGBUS; @@ -108,29 +107,24 @@ static int vvar_fault(const struct vm_special_mapping *sm, return VM_FAULT_SIGBUS; if (sym_offset == image->sym_vvar_page) { - ret = vm_insert_pfn(vma, vmf->address, - __pa_symbol(&__vvar_page) >> PAGE_SHIFT); + return vmf_insert_pfn(vma, vmf->address, + __pa_symbol(&__vvar_page) >> PAGE_SHIFT); } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = pvclock_get_pvti_cpu0_va(); if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { - ret = vm_insert_pfn_prot( - vma, - vmf->address, - __pa(pvti) >> PAGE_SHIFT, - pgprot_decrypted(vma->vm_page_prot)); + return vmf_insert_pfn_prot(vma, vmf->address, + __pa(pvti) >> PAGE_SHIFT, + pgprot_decrypted(vma->vm_page_prot)); } } else if (sym_offset == image->sym_hvclock_page) { struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page(); if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK)) - ret = vm_insert_pfn(vma, vmf->address, - vmalloc_to_pfn(tsc_pg)); + return vmf_insert_pfn(vma, vmf->address, + vmalloc_to_pfn(tsc_pg)); } - if (ret == 0 || ret == -EBUSY) - return VM_FAULT_NOPAGE; - return VM_FAULT_SIGBUS; } diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h @@ -13,75 +13,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, return 0; } -/* - * If the arch doesn't supply something else, assume that hugepage - * size aligned regions are ok without further preparation. - */ -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - struct hstate *h = hstate_file(file); - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; - return 0; -} - -static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, - unsigned long addr, unsigned long end, - unsigned long floor, - unsigned long ceiling) -{ - free_pgd_range(tlb, addr, end, floor, ceiling); -} - -static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - set_pte_at(mm, addr, ptep, pte); -} - -static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - return ptep_get_and_clear(mm, addr, ptep); -} - -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - ptep_clear_flush(vma, addr, ptep); -} - -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} - -static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - pte_t pte, int dirty) -{ - return ptep_set_access_flags(vma, addr, ptep, pte, dirty); -} - -static inline pte_t huge_ptep_get(pte_t *ptep) -{ - return *ptep; -} - static inline void arch_clear_hugepage_flags(struct page *page) { } diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c @@ -1248,7 +1248,6 @@ void __init e820__memblock_setup(void) { int i; u64 end; - u64 addr = 0; /* * The bootstrap memblock region count maximum is 128 entries @@ -1265,21 +1264,13 @@ void __init e820__memblock_setup(void) struct e820_entry *entry = &e820_table->entries[i]; end = entry->addr + entry->size; - if (addr < entry->addr) - memblock_reserve(addr, entry->addr - addr); - addr = end; if (end != (resource_size_t)end) continue; - /* - * all !E820_TYPE_RAM ranges (including gap ranges) are put - * into memblock.reserved to make sure that struct pages in - * such regions are not left uninitialized after bootup. - */ if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) - memblock_reserve(entry->addr, entry->size); - else - memblock_add(entry->addr, entry->size); + continue; + + memblock_add(entry->addr, entry->size); } /* Throw away partial pages: */ diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild @@ -26,5 +26,6 @@ generic-y += rwsem.h generic-y += sections.h generic-y += topology.h generic-y += trace_clock.h +generic-y += vga.h generic-y += word-at-a-time.h generic-y += xor.h diff --git a/arch/xtensa/include/asm/vga.h b/arch/xtensa/include/asm/vga.h @@ -1,19 +0,0 @@ -/* - * include/asm-xtensa/vga.h - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2001 - 2005 Tensilica Inc. - */ - -#ifndef _XTENSA_VGA_H -#define _XTENSA_VGA_H - -#define VGA_MAP_MEM(x,s) (unsigned long)phys_to_virt(x) - -#define vga_readb(x) (*(x)) -#define vga_writeb(x,y) (*(y) = (x)) - -#endif diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c @@ -153,7 +153,7 @@ struct iolatency_grp { #define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC /* * These are the constants used to fake the fixed-point moving average - * calculation just like load average. The call to CALC_LOAD folds + * calculation just like load average. The call to calc_load() folds * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. The sampling * window size is bucketed to try to approximately calculate average * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows @@ -248,7 +248,7 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, return; /* - * CALC_LOAD takes in a number stored in fixed point representation. + * calc_load() takes in a number stored in fixed point representation. * Because we are using this for IO time in ns, the values stored * are significantly larger than the FIXED_1 denominator (2048). * Therefore, rounding errors in the calculation are negligible and @@ -257,7 +257,9 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, div64_u64(iolat->cur_win_nsec, BLKIOLATENCY_EXP_BUCKET_SIZE)); - CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat->rqs.mean); + iolat->lat_avg = calc_load(iolat->lat_avg, + iolatency_exp_factors[exp_idx], + stat->rqs.mean); } static inline bool iolatency_may_queue(struct iolatency_grp *iolat, diff --git a/drivers/base/node.c b/drivers/base/node.c @@ -67,8 +67,11 @@ static ssize_t node_read_meminfo(struct device *dev, int nid = dev->id; struct pglist_data *pgdat = NODE_DATA(nid); struct sysinfo i; + unsigned long sreclaimable, sunreclaimable; si_meminfo_node(&i, nid); + sreclaimable = node_page_state(pgdat, NR_SLAB_RECLAIMABLE); + sunreclaimable = node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE); n = sprintf(buf, "Node %d MemTotal: %8lu kB\n" "Node %d MemFree: %8lu kB\n" @@ -118,6 +121,7 @@ static ssize_t node_read_meminfo(struct device *dev, "Node %d NFS_Unstable: %8lu kB\n" "Node %d Bounce: %8lu kB\n" "Node %d WritebackTmp: %8lu kB\n" + "Node %d KReclaimable: %8lu kB\n" "Node %d Slab: %8lu kB\n" "Node %d SReclaimable: %8lu kB\n" "Node %d SUnreclaim: %8lu kB\n" @@ -138,20 +142,21 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)), nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), - nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE) + - node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)), - nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE)), + nid, K(sreclaimable + + node_page_state(pgdat, NR_KERNEL_MISC_RECLAIMABLE)), + nid, K(sreclaimable + sunreclaimable), + nid, K(sreclaimable), + nid, K(sunreclaimable) #ifdef CONFIG_TRANSPARENT_HUGEPAGE - nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)), + , nid, K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * - HPAGE_PMD_NR)); -#else - nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE))); + HPAGE_PMD_NR) #endif + ); n += hugetlb_report_node_meminfo(nid, buf + n); return n; } diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c @@ -130,10 +130,6 @@ struct menu_device { int interval_ptr; }; - -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - static inline int get_loadavg(unsigned long load) { return LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10; diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -77,7 +77,6 @@ static void do_remove(struct mmu_rb_handler *handler, static void handle_remove(struct work_struct *work); static const struct mmu_notifier_ops mn_opts = { - .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = mmu_notifier_range_start, }; diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c @@ -427,7 +427,6 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) } static const struct mmu_notifier_ops iommu_mn = { - .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .release = mn_release, .clear_flush_young = mn_clear_flush_young, .invalidate_range = mn_invalidate_range, diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c @@ -273,7 +273,6 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) } static const struct mmu_notifier_ops intel_mmuops = { - .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .release = intel_mm_release, .change_pte = intel_change_pte, .invalidate_range = intel_invalidate_range, diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c @@ -261,7 +261,6 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm) static const struct mmu_notifier_ops gru_mmuops = { - .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = gru_invalidate_range_start, .invalidate_range_end = gru_invalidate_range_end, .release = gru_release, diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c @@ -1127,12 +1127,13 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size) { const u64 phys_offset = MIN_MEMBLOCK_ADDR; + if (size < PAGE_SIZE - (base & ~PAGE_MASK)) { + pr_warn("Ignoring memory block 0x%llx - 0x%llx\n", + base, base + size); + return; + } + if (!PAGE_ALIGNED(base)) { - if (size < PAGE_SIZE - (base & ~PAGE_MASK)) { - pr_warn("Ignoring memory block 0x%llx - 0x%llx\n", - base, base + size); - return; - } size -= PAGE_SIZE - (base & ~PAGE_MASK); base = PAGE_ALIGN(base); } diff --git a/drivers/staging/android/ion/ion_page_pool.c b/drivers/staging/android/ion/ion_page_pool.c @@ -33,8 +33,8 @@ static void ion_page_pool_add(struct ion_page_pool *pool, struct page *page) pool->low_count++; } - mod_node_page_state(page_pgdat(page), NR_INDIRECTLY_RECLAIMABLE_BYTES, - (1 << (PAGE_SHIFT + pool->order))); + mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE, + 1 << pool->order); mutex_unlock(&pool->mutex); } @@ -53,8 +53,8 @@ static struct page *ion_page_pool_remove(struct ion_page_pool *pool, bool high) } list_del(&page->lru); - mod_node_page_state(page_pgdat(page), NR_INDIRECTLY_RECLAIMABLE_BYTES, - -(1 << (PAGE_SHIFT + pool->order))); + mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE, + -(1 << pool->order)); return page; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c @@ -418,9 +418,12 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) int i; vma->vm_flags |= VM_MIXEDMAP; for (i = 0; i < pages && !ret; i++) { + vm_fault_t vmf; unsigned long off = i * PAGE_SIZE; pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV); - ret = vm_insert_mixed(vma, vma->vm_start + off, pfn); + vmf = vmf_insert_mixed(vma, vma->vm_start + off, pfn); + if (vmf & VM_FAULT_ERROR) + ret = vm_fault_to_errno(vmf, 0); } } diff --git a/fs/dcache.c b/fs/dcache.c @@ -257,24 +257,10 @@ static void __d_free(struct rcu_head *head) kmem_cache_free(dentry_cache, dentry); } -static void __d_free_external_name(struct rcu_head *head) -{ - struct external_name *name = container_of(head, struct external_name, - u.head); - - mod_node_page_state(page_pgdat(virt_to_page(name)), - NR_INDIRECTLY_RECLAIMABLE_BYTES, - -ksize(name)); - - kfree(name); -} - static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); - - __d_free_external_name(&external_name(dentry)->u.head); - + kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); } @@ -306,7 +292,7 @@ void release_dentry_name_snapshot(struct name_snapshot *name) struct external_name *p; p = container_of(name->name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->u.count))) - call_rcu(&p->u.head, __d_free_external_name); + kfree_rcu(p, u.head); } } EXPORT_SYMBOL(release_dentry_name_snapshot); @@ -1606,7 +1592,6 @@ EXPORT_SYMBOL(d_invalidate); struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { - struct external_name *ext = NULL; struct dentry *dentry; char *dname; int err; @@ -1627,14 +1612,15 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dname = dentry->d_iname; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); - - ext = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT); - if (!ext) { + struct external_name *p = kmalloc(size + name->len, + GFP_KERNEL_ACCOUNT | + __GFP_RECLAIMABLE); + if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; } - atomic_set(&ext->u.count, 1); - dname = ext->name; + atomic_set(&p->u.count, 1); + dname = p->name; } else { dname = dentry->d_iname; } @@ -1673,12 +1659,6 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) } } - if (unlikely(ext)) { - pg_data_t *pgdat = page_pgdat(virt_to_page(ext)); - mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES, - ksize(ext)); - } - this_cpu_inc(nr_dentry); return dentry; @@ -2707,7 +2687,7 @@ static void copy_name(struct dentry *dentry, struct dentry *target) dentry->d_name.hash_len = target->d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) - call_rcu(&old_name->u.head, __d_free_external_name); + kfree_rcu(old_name, u.head); } /* diff --git a/fs/iomap.c b/fs/iomap.c @@ -1057,7 +1057,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, return length; } -int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c @@ -236,6 +236,9 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) sb->s_export_op = &kernfs_export_ops; sb->s_time_gran = 1; + /* sysfs dentries and inodes don't require IO to create */ + sb->s_shrink.seeks = 0; + /* get root inode, initialize and unlock it */ mutex_lock(&kernfs_mutex); inode = kernfs_get_inode(sb, info->root->kn); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c @@ -5106,8 +5106,6 @@ int ocfs2_split_extent(handle_t *handle, * rightmost extent list. */ if (path->p_tree_depth) { - struct ocfs2_extent_block *eb; - ret = ocfs2_read_extent_block(et->et_ci, ocfs2_et_get_last_eb_blk(et), &last_eb_bh); @@ -5115,8 +5113,6 @@ int ocfs2_split_extent(handle_t *handle, mlog_errno(ret); goto out; } - - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; } if (rec->e_cpos == split_rec->e_cpos && diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c @@ -1392,8 +1392,7 @@ retry: unlock: spin_unlock(&oi->ip_lock); out: - if (new) - kfree(new); + kfree(new); return ret; } diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c @@ -329,7 +329,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle) { char *buf; - buf = (char *) get_zeroed_page(GFP_NOFS); + buf = (char *) get_zeroed_page(GFP_ATOMIC); if (buf) { dump_mle(mle, buf, PAGE_SIZE - 1); free_page((unsigned long)buf); diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c @@ -531,7 +531,7 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) assert_spin_locked(&res->spinlock); /* don't shuffle secondary queues */ - if ((res->owner == dlm->node_num)) { + if (res->owner == dlm->node_num) { if (res->state & (DLM_LOCK_RES_MIGRATING | DLM_LOCK_RES_BLOCK_DIRTY)) return; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c @@ -4135,7 +4135,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, struct buffer_head *ref_root_bh = NULL; struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); - struct ocfs2_refcount_block *rb; struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data; struct ocfs2_refcount_tree *ref_tree; @@ -4162,7 +4161,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, mlog_errno(ret); goto out; } - rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh, &ref_tree->rf_ci, ref_root_bh, diff --git a/fs/proc/inode.c b/fs/proc/inode.c @@ -516,6 +516,9 @@ int proc_fill_super(struct super_block *s, void *data, int silent) */ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + /* procfs dentries and inodes don't require IO to create */ + s->s_shrink.seeks = 0; + pde_get(&proc_root); root_inode = proc_get_inode(s, &proc_root); if (!root_inode) { diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c @@ -10,9 +10,6 @@ #include <linux/seqlock.h> #include <linux/time.h> -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - static int loadavg_proc_show(struct seq_file *m, void *v) { unsigned long avnrun[3]; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c @@ -38,6 +38,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) long cached; long available; unsigned long pages[NR_LRU_LISTS]; + unsigned long sreclaimable, sunreclaim; int lru; si_meminfo(&i); @@ -53,6 +54,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) pages[lru] = global_node_page_state(NR_LRU_BASE + lru); available = si_mem_available(); + sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE); + sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE); show_val_kb(m, "MemTotal: ", i.totalram); show_val_kb(m, "MemFree: ", i.freeram); @@ -94,14 +97,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "Mapped: ", global_node_page_state(NR_FILE_MAPPED)); show_val_kb(m, "Shmem: ", i.sharedram); - show_val_kb(m, "Slab: ", - global_node_page_state(NR_SLAB_RECLAIMABLE) + - global_node_page_state(NR_SLAB_UNRECLAIMABLE)); - - show_val_kb(m, "SReclaimable: ", - global_node_page_state(NR_SLAB_RECLAIMABLE)); - show_val_kb(m, "SUnreclaim: ", - global_node_page_state(NR_SLAB_UNRECLAIMABLE)); + show_val_kb(m, "KReclaimable: ", sreclaimable + + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE)); + show_val_kb(m, "Slab: ", sreclaimable + sunreclaim); + show_val_kb(m, "SReclaimable: ", sreclaimable); + show_val_kb(m, "SUnreclaim: ", sunreclaim); seq_printf(m, "KernelStack: %8lu kB\n", global_zone_page_state(NR_KERNEL_STACK_KB)); show_val_kb(m, "PageTables: ", diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c @@ -713,6 +713,8 @@ static void smap_gather_stats(struct vm_area_struct *vma, smaps_walk.private = mss; #ifdef CONFIG_SHMEM + /* In case of smaps_rollup, reset the value from previous vma */ + mss->check_shmem_swap = false; if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { /* * For shared or readonly shmem mappings we know that all @@ -728,7 +730,7 @@ static void smap_gather_stats(struct vm_area_struct *vma, if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || !(vma->vm_flags & VM_WRITE)) { - mss->swap = shmem_swapped; + mss->swap += shmem_swapped; } else { mss->check_shmem_swap = true; smaps_walk.pte_hole = smaps_pte_hole; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c @@ -1026,7 +1026,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, struct userfaultfd_ctx *fork_nctx = NULL; /* always take the fd_wqh lock before the fault_pending_wqh lock */ - spin_lock(&ctx->fd_wqh.lock); + spin_lock_irq(&ctx->fd_wqh.lock); __add_wait_queue(&ctx->fd_wqh, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); @@ -1112,13 +1112,13 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, ret = -EAGAIN; break; } - spin_unlock(&ctx->fd_wqh.lock); + spin_unlock_irq(&ctx->fd_wqh.lock); schedule(); - spin_lock(&ctx->fd_wqh.lock); + spin_lock_irq(&ctx->fd_wqh.lock); } __remove_wait_queue(&ctx->fd_wqh, &wait); __set_current_state(TASK_RUNNING); - spin_unlock(&ctx->fd_wqh.lock); + spin_unlock_irq(&ctx->fd_wqh.lock); if (!ret && msg->event == UFFD_EVENT_FORK) { ret = resolve_userfault_fork(ctx, fork_nctx, msg); diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h @@ -32,7 +32,7 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } -#ifndef huge_pte_clear +#ifndef __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { @@ -40,4 +40,90 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, } #endif +#ifndef __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE +static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + free_pgd_range(tlb, addr, end, floor, ceiling); +} +#endif + +#ifndef __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT +static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + set_pte_at(mm, addr, ptep, pte); +} +#endif + +#ifndef __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR +static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + return ptep_get_and_clear(mm, addr, ptep); +} +#endif + +#ifndef __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH +static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + ptep_clear_flush(vma, addr, ptep); +} +#endif + +#ifndef __HAVE_ARCH_HUGE_PTE_NONE +static inline int huge_pte_none(pte_t pte) +{ + return pte_none(pte); +} +#endif + +#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT +static inline pte_t huge_pte_wrprotect(pte_t pte) +{ + return pte_wrprotect(pte); +} +#endif + +#ifndef __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE +static inline int prepare_hugepage_range(struct file *file, + unsigned long addr, unsigned long len) +{ + struct hstate *h = hstate_file(file); + + if (len & ~huge_page_mask(h)) + return -EINVAL; + if (addr & ~huge_page_mask(h)) + return -EINVAL; + + return 0; +} +#endif + +#ifndef __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT +static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + ptep_set_wrprotect(mm, addr, ptep); +} +#endif + +#ifndef __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS +static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t pte, int dirty) +{ + return ptep_set_access_flags(vma, addr, ptep, pte, dirty); +} +#endif + +#ifndef __HAVE_ARCH_HUGE_PTEP_GET +static inline pte_t huge_ptep_get(pte_t *ptep) +{ + return *ptep; +} +#endif + #endif /* _ASM_GENERIC_HUGETLB_H */ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h @@ -757,7 +757,7 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) /* * Interfaces that can be used by architecture code to keep track of * memory type of pfn mappings specified by the remap_pfn_range, - * vm_insert_pfn. + * vmf_insert_pfn. */ /* @@ -773,7 +773,7 @@ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, /* * track_pfn_insert is called when a _new_ single pfn is established - * by vm_insert_pfn(). + * by vmf_insert_pfn(). */ static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h @@ -20,6 +20,7 @@ #include <linux/u64_stats_sync.h> #include <linux/workqueue.h> #include <linux/bpf-cgroup.h> +#include <linux/psi_types.h> #ifdef CONFIG_CGROUPS @@ -436,6 +437,9 @@ struct cgroup { /* used to schedule release agent */ struct work_struct release_agent_work; + /* used to track pressure stalls */ + struct psi_group psi; + /* used to store eBPF programs */ struct cgroup_bpf bpf; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h @@ -650,6 +650,11 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } +static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) +{ + return &cgrp->psi; +} + static inline void cgroup_init_kthreadd(void) { /* @@ -703,6 +708,16 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) return NULL; } +static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) +{ + return NULL; +} + +static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) +{ + return NULL; +} + static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h @@ -57,7 +57,12 @@ struct task_delay_info { u64 freepages_start; u64 freepages_delay; /* wait for memory reclaim */ + + u64 thrashing_start; + u64 thrashing_delay; /* wait for thrashing page */ + u32 freepages_count; /* total count of memory reclaim */ + u32 thrashing_count; /* total count of thrash waits */ }; #endif @@ -76,6 +81,8 @@ extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); extern __u64 __delayacct_blkio_ticks(struct task_struct *); extern void __delayacct_freepages_start(void); extern void __delayacct_freepages_end(void); +extern void __delayacct_thrashing_start(void); +extern void __delayacct_thrashing_end(void); static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) { @@ -156,6 +163,18 @@ static inline void delayacct_freepages_end(void) __delayacct_freepages_end(); } +static inline void delayacct_thrashing_start(void) +{ + if (current->delays) + __delayacct_thrashing_start(); +} + +static inline void delayacct_thrashing_end(void) +{ + if (current->delays) + __delayacct_thrashing_end(); +} + #else static inline void delayacct_set_flag(int flag) {} @@ -182,6 +201,10 @@ static inline void delayacct_freepages_start(void) {} static inline void delayacct_freepages_end(void) {} +static inline void delayacct_thrashing_start(void) +{} +static inline void delayacct_thrashing_end(void) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ diff --git a/include/linux/hmm.h b/include/linux/hmm.h @@ -107,7 +107,7 @@ enum hmm_pfn_flag_e { * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory * HMM_PFN_NONE: corresponding CPU page table entry is pte_none() * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the - * result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not + * result of vmf_insert_pfn() or vm_insert_page(). Therefore, it should not * be mirrored by a device, because the entry will never have HMM_PFN_VALID * set and the pfn value is undefined. * diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h @@ -213,9 +213,9 @@ static inline int hpage_nr_pages(struct page *page) } struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, int flags); + pmd_t *pmd, int flags, struct dev_pagemap **pgmap); struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, int flags); + pud_t *pud, int flags, struct dev_pagemap **pgmap); extern vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); @@ -344,13 +344,13 @@ static inline void mm_put_huge_zero_page(struct mm_struct *mm) } static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmd, int flags) + unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap) { return NULL; } static inline struct page *follow_devmap_pud(struct vm_area_struct *vma, - unsigned long addr, pud_t *pud, int flags) + unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap) { return NULL; } diff --git a/include/linux/iomap.h b/include/linux/iomap.h @@ -6,6 +6,7 @@ #include <linux/bitmap.h> #include <linux/mm.h> #include <linux/types.h> +#include <linux/mm_types.h> struct address_space; struct fiemap_extent_info; @@ -141,7 +142,8 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); -int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, + const struct iomap_ops *ops); int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, loff_t start, loff_t len, const struct iomap_ops *ops); loff_t iomap_seek_hole(struct inode *inode, loff_t offset, diff --git a/include/linux/linkage.h b/include/linux/linkage.h @@ -90,6 +90,7 @@ #ifndef WEAK #define WEAK(name) \ .weak name ASM_NL \ + ALIGN ASM_NL \ name: #endif diff --git a/include/linux/math64.h b/include/linux/math64.h @@ -281,4 +281,7 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor) } #endif /* mul_u64_u32_div */ +#define DIV64_U64_ROUND_UP(ll, d) \ + ({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); }) + #endif /* _LINUX_MATH64_H */ diff --git a/include/linux/memblock.h b/include/linux/memblock.h @@ -265,21 +265,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ nid, flags, p_start, p_end, p_nid) -/** - * for_each_resv_unavail_range - iterate through reserved and unavailable memory - * @i: u64 used as loop variable - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * - * Walks over unavailable but reserved (reserved && !memory) areas of memblock. - * Available as soon as memblock is initialized. - * Note: because this memory does not belong to any physical node, flags and - * nid arguments do not make sense and thus not exported as arguments. - */ -#define for_each_resv_unavail_range(i, p_start, p_end) \ - for_each_mem_range(i, &memblock.reserved, &memblock.memory, \ - NUMA_NO_NODE, MEMBLOCK_NONE, p_start, p_end, NULL) - static inline void memblock_set_region_flags(struct memblock_region *r, enum memblock_flags flags) { diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h @@ -78,7 +78,7 @@ struct mem_cgroup_reclaim_cookie { struct mem_cgroup_id { int id; - atomic_t ref; + refcount_t ref; }; /* @@ -1268,10 +1268,11 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); void memcg_kmem_put_cache(struct kmem_cache *cachep); int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct mem_cgroup *memcg); + +#ifdef CONFIG_MEMCG_KMEM int memcg_kmem_charge(struct page *page, gfp_t gfp, int order); void memcg_kmem_uncharge(struct page *page, int order); -#ifdef CONFIG_MEMCG_KMEM extern struct static_key_false memcg_kmem_enabled_key; extern struct workqueue_struct *memcg_kmem_cache_wq; @@ -1307,6 +1308,16 @@ extern int memcg_expand_shrinker_maps(int new_id); extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); #else + +static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +{ + return 0; +} + +static inline void memcg_kmem_uncharge(struct page *page, int order) +{ +} + #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) diff --git a/include/linux/mm.h b/include/linux/mm.h @@ -848,6 +848,8 @@ static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } +extern void memmap_init_zone_device(struct zone *, unsigned long, + unsigned long, struct dev_pagemap *); #else static inline bool is_zone_device_page(const struct page *page) { @@ -2304,6 +2306,8 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); +extern int __do_munmap(struct mm_struct *, unsigned long, size_t, + struct list_head *uf, bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); @@ -2502,11 +2506,11 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); -int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, +vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); -int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, +vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); -int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, +vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); @@ -2525,32 +2529,6 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } -static inline vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, - unsigned long addr, pfn_t pfn) -{ - int err = vm_insert_mixed(vma, addr, pfn); - - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err < 0 && err != -EBUSY) - return VM_FAULT_SIGBUS; - - return VM_FAULT_NOPAGE; -} - -static inline vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn) -{ - int err = vm_insert_pfn(vma, addr, pfn); - - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err < 0 && err != -EBUSY) - return VM_FAULT_SIGBUS; - - return VM_FAULT_NOPAGE; -} - static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) @@ -2558,16 +2536,8 @@ static inline vm_fault_t vmf_error(int err) return VM_FAULT_SIGBUS; } -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int foll_flags, - unsigned int *page_mask); - -static inline struct page *follow_page(struct vm_area_struct *vma, - unsigned long address, unsigned int foll_flags) -{ - unsigned int unused_page_mask; - return follow_page_mask(vma, address, foll_flags, &unused_page_mask); -} +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + unsigned int foll_flags); #define FOLL_WRITE 0x01 /* check pte is writable */ #define FOLL_TOUCH 0x02 /* mark page accessed */ diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h @@ -2,7 +2,6 @@ #ifndef _LINUX_MMU_NOTIFIER_H #define _LINUX_MMU_NOTIFIER_H -#include <linux/types.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/mm_types.h> @@ -11,9 +10,6 @@ struct mmu_notifier; struct mmu_notifier_ops; -/* mmu_notifier_ops flags */ -#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01) - #ifdef CONFIG_MMU_NOTIFIER /* @@ -31,15 +27,6 @@ struct mmu_notifier_mm { struct mmu_notifier_ops { /* - * Flags to specify behavior of callbacks for this MMU notifier. - * Used to determine which context an operation may be called. - * - * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not - * block - */ - int flags; - - /* * Called either by mmu_notifier_unregister or when the mm is * being destroyed by exit_mmap, always before all pages are * freed. This can run concurrently with other mmu notifier @@ -153,7 +140,9 @@ struct mmu_notifier_ops { * * If blockable argument is set to false then the callback cannot * sleep and has to return with -EAGAIN. 0 should be returned - * otherwise. + * otherwise. Please note that if invalidate_range_start approves + * a non-blocking behavior then the same applies to + * invalidate_range_end. * */ int (*invalidate_range_start)(struct mmu_notifier *mn, @@ -181,10 +170,6 @@ struct mmu_notifier_ops { * Note that this function might be called with just a sub-range * of what was passed to invalidate_range_start()/end(), if * called between those functions. - * - * If this callback cannot block, and invalidate_range_{start,end} - * cannot block, mmu_notifier_ops.flags should have - * MMU_INVALIDATE_DOES_NOT_BLOCK set. */ void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); @@ -239,7 +224,6 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); -extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm); static inline void mmu_notifier_release(struct mm_struct *mm) { @@ -493,11 +477,6 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, { } -static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) -{ - return false; -} - static inline void mmu_notifier_mm_init(struct mm_struct *mm) { } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h @@ -161,8 +161,10 @@ enum node_stat_item { NR_SLAB_UNRECLAIMABLE, NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ + WORKINGSET_NODES, WORKINGSET_REFAULT, WORKINGSET_ACTIVATE, + WORKINGSET_RESTORE, WORKINGSET_NODERECLAIM, NR_ANON_MAPPED, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. @@ -180,7 +182,7 @@ enum node_stat_item { NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ - NR_INDIRECTLY_RECLAIMABLE_BYTES, /* measured in bytes */ + NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_VM_NODE_STAT_ITEMS }; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h @@ -69,13 +69,14 @@ */ enum pageflags { PG_locked, /* Page is locked. Don't touch. */ - PG_error, PG_referenced, PG_uptodate, PG_dirty, PG_lru, PG_active, + PG_workingset, PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ + PG_error, PG_slab, PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/ PG_arch_1, @@ -162,6 +163,14 @@ static inline int PagePoisoned(const struct page *page) return page->flags == PAGE_POISON_PATTERN; } +#ifdef CONFIG_DEBUG_VM +void page_init_poison(struct page *page, size_t size); +#else +static inline void page_init_poison(struct page *page, size_t size) +{ +} +#endif + /* * Page flags policies wrt compound pages * @@ -280,6 +289,8 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) TESTCLEARFLAG(Active, active, PF_HEAD) +PAGEFLAG(Workingset, workingset, PF_HEAD) + TESTCLEARFLAG(Workingset, workingset, PF_HEAD) __PAGEFLAG(Slab, slab, PF_NO_TAIL) __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ @@ -292,6 +303,7 @@ PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) + __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h @@ -9,8 +9,10 @@ * PFN_SG_LAST - pfn references a page and is the last scatterlist entry * PFN_DEV - pfn is not covered by system memmap by default * PFN_MAP - pfn has a dynamic page mapping established by a device driver + * PFN_SPECIAL - for CONFIG_FS_DAX_LIMITED builds to allow XIP, but not + * get_user_pages */ -#define PFN_FLAGS_MASK (((u64) ~PAGE_MASK) << (BITS_PER_LONG_LONG - PAGE_SHIFT)) +#define PFN_FLAGS_MASK (((u64) (~PAGE_MASK)) << (BITS_PER_LONG_LONG - PAGE_SHIFT)) #define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1)) #define PFN_SG_LAST (1ULL << (BITS_PER_LONG_LONG - 2)) #define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3)) diff --git a/include/linux/psi.h b/include/linux/psi.h @@ -0,0 +1,53 @@ +#ifndef _LINUX_PSI_H +#define _LINUX_PSI_H + +#include <linux/psi_types.h> +#include <linux/sched.h> + +struct seq_file; +struct css_set; + +#ifdef CONFIG_PSI + +extern bool psi_disabled; + +void psi_init(void); + +void psi_task_change(struct task_struct *task, int clear, int set); + +void psi_memstall_tick(struct task_struct *task, int cpu); +void psi_memstall_enter(unsigned long *flags); +void psi_memstall_leave(unsigned long *flags); + +int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); + +#ifdef CONFIG_CGROUPS +int psi_cgroup_alloc(struct cgroup *cgrp); +void psi_cgroup_free(struct cgroup *cgrp); +void cgroup_move_task(struct task_struct *p, struct css_set *to); +#endif + +#else /* CONFIG_PSI */ + +static inline void psi_init(void) {} + +static inline void psi_memstall_enter(unsigned long *flags) {} +static inline void psi_memstall_leave(unsigned long *flags) {} + +#ifdef CONFIG_CGROUPS +static inline int psi_cgroup_alloc(struct cgroup *cgrp) +{ + return 0; +} +static inline void psi_cgroup_free(struct cgroup *cgrp) +{ +} +static inline void cgroup_move_task(struct task_struct *p, struct css_set *to) +{ + rcu_assign_pointer(p->cgroups, to); +} +#endif + +#endif /* CONFIG_PSI */ + +#endif /* _LINUX_PSI_H */ diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h @@ -0,0 +1,92 @@ +#ifndef _LINUX_PSI_TYPES_H +#define _LINUX_PSI_TYPES_H + +#include <linux/seqlock.h> +#include <linux/types.h> + +#ifdef CONFIG_PSI + +/* Tracked task states */ +enum psi_task_count { + NR_IOWAIT, + NR_MEMSTALL, + NR_RUNNING, + NR_PSI_TASK_COUNTS, +}; + +/* Task state bitmasks */ +#define TSK_IOWAIT (1 << NR_IOWAIT) +#define TSK_MEMSTALL (1 << NR_MEMSTALL) +#define TSK_RUNNING (1 << NR_RUNNING) + +/* Resources that workloads could be stalled on */ +enum psi_res { + PSI_IO, + PSI_MEM, + PSI_CPU, + NR_PSI_RESOURCES, +}; + +/* + * Pressure states for each resource: + * + * SOME: Stalled tasks & working tasks + * FULL: Stalled tasks & no working tasks + */ +enum psi_states { + PSI_IO_SOME, + PSI_IO_FULL, + PSI_MEM_SOME, + PSI_MEM_FULL, + PSI_CPU_SOME, + /* Only per-CPU, to weigh the CPU in the global average: */ + PSI_NONIDLE, + NR_PSI_STATES, +}; + +struct psi_group_cpu { + /* 1st cacheline updated by the scheduler */ + + /* Aggregator needs to know of concurrent changes */ + seqcount_t seq ____cacheline_aligned_in_smp; + + /* States of the tasks belonging to this group */ + unsigned int tasks[NR_PSI_TASK_COUNTS]; + + /* Period time sampling buckets for each state of interest (ns) */ + u32 times[NR_PSI_STATES]; + + /* Time of last task change in this group (rq_clock) */ + u64 state_start; + + /* 2nd cacheline updated by the aggregator */ + + /* Delta detection against the sampling buckets */ + u32 times_prev[NR_PSI_STATES] ____cacheline_aligned_in_smp; +}; + +struct psi_group { + /* Protects data updated during an aggregation */ + struct mutex stat_lock; + + /* Per-cpu task state & time tracking */ + struct psi_group_cpu __percpu *pcpu; + + /* Periodic aggregation state */ + u64 total_prev[NR_PSI_STATES - 1]; + u64 last_update; + u64 next_update; + struct delayed_work clock_work; + + /* Total stall times and sampled pressure averages */ + u64 total[NR_PSI_STATES - 1]; + unsigned long avg[NR_PSI_STATES - 1][3]; +}; + +#else /* CONFIG_PSI */ + +struct psi_group { }; + +#endif /* CONFIG_PSI */ + +#endif /* _LINUX_PSI_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h @@ -25,6 +25,7 @@ #include <linux/latencytop.h> #include <linux/sched/prio.h> #include <linux/signal_types.h> +#include <linux/psi_types.h> #include <linux/mm_types_task.h> #include <linux/task_io_accounting.h> #include <linux/rseq.h> @@ -706,6 +707,10 @@ struct task_struct { unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; unsigned sched_remote_wakeup:1; +#ifdef CONFIG_PSI + unsigned sched_psi_wake_requeue:1; +#endif + /* Force alignment to the next boundary: */ unsigned :0; @@ -719,9 +724,6 @@ struct task_struct { #endif #ifdef CONFIG_MEMCG unsigned in_user_fault:1; -#ifdef CONFIG_MEMCG_KMEM - unsigned memcg_kmem_skip_account:1; -#endif #endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; @@ -965,6 +967,10 @@ struct task_struct { kernel_siginfo_t *last_siginfo; struct task_io_accounting ioac; +#ifdef CONFIG_PSI + /* Pressure stall state */ + unsigned int psi_flags; +#endif #ifdef CONFIG_TASK_XACCT /* Accumulated RSS usage: */ u64 acct_rss_mem1; @@ -1391,6 +1397,7 @@ extern struct pid *cad_pid; #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ +#define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h @@ -22,10 +22,26 @@ extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); #define EXP_5 2014 /* 1/exp(5sec/5min) */ #define EXP_15 2037 /* 1/exp(5sec/15min) */ -#define CALC_LOAD(load,exp,n) \ - load *= exp; \ - load += n*(FIXED_1-exp); \ - load >>= FSHIFT; +/* + * a1 = a0 * e + a * (1 - e) + */ +static inline unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + unsigned long newload; + + newload = load * exp + active * (FIXED_1 - exp); + if (active >= load) + newload += FIXED_1-1; + + return newload / FIXED_1; +} + +extern unsigned long calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n); + +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) extern void calc_global_load(unsigned long ticks); diff --git a/include/linux/slab.h b/include/linux/slab.h @@ -295,12 +295,43 @@ static inline void __check_heap_object(const void *ptr, unsigned long n, #define SLAB_OBJ_MIN_SIZE (KMALLOC_MIN_SIZE < 16 ? \ (KMALLOC_MIN_SIZE) : 16) +/* + * Whenever changing this, take care of that kmalloc_type() and + * create_kmalloc_caches() still work as intended. + */ +enum kmalloc_cache_type { + KMALLOC_NORMAL = 0, + KMALLOC_RECLAIM, +#ifdef CONFIG_ZONE_DMA + KMALLOC_DMA, +#endif + NR_KMALLOC_TYPES +}; + #ifndef CONFIG_SLOB -extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; +extern struct kmem_cache * +kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; + +static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags) +{ + int is_dma = 0; + int type_dma = 0; + int is_reclaimable; + #ifdef CONFIG_ZONE_DMA -extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; + is_dma = !!(flags & __GFP_DMA); + type_dma = is_dma * KMALLOC_DMA; #endif + is_reclaimable = !!(flags & __GFP_RECLAIMABLE); + + /* + * If an allocation is both __GFP_DMA and __GFP_RECLAIMABLE, return + * KMALLOC_DMA and effectively ignore __GFP_RECLAIMABLE + */ + return type_dma + (is_reclaimable & !is_dma) * KMALLOC_RECLAIM; +} + /* * Figure out which kmalloc slab an allocation of a certain size * belongs to. @@ -501,18 +532,20 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) static __always_inline void *kmalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size)) { +#ifndef CONFIG_SLOB + unsigned int index; +#endif if (size > KMALLOC_MAX_CACHE_SIZE) return kmalloc_large(size, flags); #ifndef CONFIG_SLOB - if (!(flags & GFP_DMA)) { - unsigned int index = kmalloc_index(size); + index = kmalloc_index(size); - if (!index) - return ZERO_SIZE_PTR; + if (!index) + return ZERO_SIZE_PTR; - return kmem_cache_alloc_trace(kmalloc_caches[index], - flags, size); - } + return kmem_cache_alloc_trace( + kmalloc_caches[kmalloc_type(flags)][index], + flags, size); #endif } return __kmalloc(size, flags); @@ -542,13 +575,14 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { #ifndef CONFIG_SLOB if (__builtin_constant_p(size) && - size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) { + size <= KMALLOC_MAX_CACHE_SIZE) { unsigned int i = kmalloc_index(size); if (!i) return ZERO_SIZE_PTR; - return kmem_cache_alloc_node_trace(kmalloc_caches[i], + return kmem_cache_alloc_node_trace( + kmalloc_caches[kmalloc_type(flags)][i], flags, node, size); } #endif diff --git a/include/linux/swap.h b/include/linux/swap.h @@ -167,13 +167,14 @@ enum { SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ - SWP_FILE = (1 << 7), /* set after swap_activate success */ - SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ - SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ - SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ - SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */ + SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ + SWP_FS = (1 << 8), /* swap file goes through fs */ + SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */ + SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ + SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ + SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ /* add others here before... */ - SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL @@ -296,7 +297,7 @@ struct vma_swap_readahead { /* linux/mm/workingset.c */ void *workingset_eviction(struct address_space *mapping, struct page *page); -bool workingset_refault(void *shadow); +void workingset_refault(struct page *page, void *shadow); void workingset_activation(struct page *page); /* Do not use directly, use workingset_lookup_update */ diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h @@ -88,6 +88,7 @@ {1UL << PG_dirty, "dirty" }, \ {1UL << PG_lru, "lru" }, \ {1UL << PG_active, "active" }, \ + {1UL << PG_workingset, "workingset" }, \ {1UL << PG_slab, "slab" }, \ {1UL << PG_owner_priv_1, "owner_priv_1" }, \ {1UL << PG_arch_1, "arch_1" }, \ diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */ -#define TASKSTATS_VERSION 8 +#define TASKSTATS_VERSION 9 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -164,6 +164,10 @@ struct taskstats { /* Delay waiting for memory reclaim */ __u64 freepages_count; __u64 freepages_delay_total; + + /* Delay waiting for thrashing page */ + __u64 thrashing_count; + __u64 thrashing_delay_total; }; diff --git a/init/Kconfig b/init/Kconfig @@ -490,6 +490,25 @@ config TASK_IO_ACCOUNTING Say N if unsure. +config PSI + bool "Pressure stall information tracking" + help + Collect metrics that indicate how overcommitted the CPU, memory, + and IO capacity are in the system. + + If you say Y here, the kernel will create /proc/pressure/ with the + pressure statistics files cpu, memory, and io. These will indicate + the share of walltime in which some or all tasks in the system are + delayed due to contention of the respective resource. + + In kernels with cgroup support, cgroups (cgroup2 only) will + have cpu.pressure, memory.pressure, and io.pressure files, + which aggregate pressure stalls for the grouped tasks only. + + For more details see Documentation/accounting/psi.txt. + + Say N if unsure. + endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c @@ -55,6 +55,7 @@ #include <linux/nsproxy.h> #include <linux/file.h> #include <linux/sched/cputime.h> +#include <linux/psi.h> #include <net/sock.h> #define CREATE_TRACE_POINTS @@ -862,7 +863,7 @@ static void css_set_move_task(struct task_struct *task, */ WARN_ON_ONCE(task->flags & PF_EXITING); - rcu_assign_pointer(task->cgroups, to_cset); + cgroup_move_task(task, to_cset); list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); } @@ -3446,6 +3447,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) return ret; } +#ifdef CONFIG_PSI +static int cgroup_io_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); +} +static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); +} +static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); +} +#endif + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; @@ -4576,6 +4592,23 @@ static struct cftype cgroup_base_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_stat_show, }, +#ifdef CONFIG_PSI + { + .name = "io.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_io_pressure_show, + }, + { + .name = "memory.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_memory_pressure_show, + }, + { + .name = "cpu.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_cpu_pressure_show, + }, +#endif { } /* terminate */ }; @@ -4636,6 +4669,7 @@ static void css_free_rwork_fn(struct work_struct *work) */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); + psi_cgroup_free(cgrp); if (cgroup_on_dfl(cgrp)) cgroup_rstat_exit(cgrp); kfree(cgrp); @@ -4892,10 +4926,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; - ret = cgroup_bpf_inherit(cgrp); + + ret = psi_cgroup_alloc(cgrp); if (ret) goto out_idr_free; + ret = cgroup_bpf_inherit(cgrp); + if (ret) + goto out_psi_free; + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; @@ -4933,6 +4972,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) return cgrp; +out_psi_free: + psi_cgroup_free(cgrp); out_idr_free: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_stat_exit: diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c @@ -2556,16 +2556,11 @@ static int kdb_summary(int argc, const char **argv) } kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60); - /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */ - -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n", LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]), LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]), LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2])); -#undef LOAD_INT -#undef LOAD_FRAC + /* Display in kilobytes */ #define K(x) ((x) << (PAGE_SHIFT - 10)) kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" diff --git a/kernel/delayacct.c b/kernel/delayacct.c @@ -135,9 +135,12 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; tmp = d->freepages_delay_total + tsk->delays->freepages_delay; d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; + tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; + d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; + d->thrashing_count += tsk->delays->thrashing_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -169,3 +172,15 @@ void __delayacct_freepages_end(void) &current->delays->freepages_count); } +void __delayacct_thrashing_start(void) +{ + current->delays->thrashing_start = ktime_get_ns(); +} + +void __delayacct_thrashing_end(void) +{ + delayacct_end(&current->delays->lock, + &current->delays->thrashing_start, + &current->delays->thrashing_delay, + &current->delays->thrashing_count); +} diff --git a/kernel/fork.c b/kernel/fork.c @@ -223,9 +223,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) return s->addr; } + /* + * Allocated stacks are cached and later reused by new threads, + * so memcg accounting is performed manually on assigning/releasing + * stacks to tasks. Drop __GFP_ACCOUNT. + */ stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, VMALLOC_START, VMALLOC_END, - THREADINFO_GFP, + THREADINFO_GFP & ~__GFP_ACCOUNT, PAGE_KERNEL, 0, node, __builtin_return_address(0)); @@ -248,9 +253,19 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) static inline void free_thread_stack(struct task_struct *tsk) { #ifdef CONFIG_VMAP_STACK - if (task_stack_vm_area(tsk)) { + struct vm_struct *vm = task_stack_vm_area(tsk); + + if (vm) { int i; + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + mod_memcg_page_state(vm->pages[i], + MEMCG_KERNEL_STACK_KB, + -(int)(PAGE_SIZE / 1024)); + + memcg_kmem_uncharge(vm->pages[i], 0); + } + for (i = 0; i < NR_CACHED_STACKS; i++) { if (this_cpu_cmpxchg(cached_stacks[i], NULL, tsk->stack_vm_area) != NULL) @@ -351,10 +366,6 @@ static void account_kernel_stack(struct task_struct *tsk, int account) NR_KERNEL_STACK_KB, PAGE_SIZE / 1024 * account); } - - /* All stack pages belong to the same memcg. */ - mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); } else { /* * All stack pages are in the same zone and belong to the @@ -370,6 +381,35 @@ static void account_kernel_stack(struct task_struct *tsk, int account) } } +static int memcg_charge_kernel_stack(struct task_struct *tsk) +{ +#ifdef CONFIG_VMAP_STACK + struct vm_struct *vm = task_stack_vm_area(tsk); + int ret; + + if (vm) { + int i; + + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + /* + * If memcg_kmem_charge() fails, page->mem_cgroup + * pointer is NULL, and both memcg_kmem_uncharge() + * and mod_memcg_page_state() in free_thread_stack() + * will ignore this page. So it's safe. + */ + ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0); + if (ret) + return ret; + + mod_memcg_page_state(vm->pages[i], + MEMCG_KERNEL_STACK_KB, + PAGE_SIZE / 1024); + } + } +#endif + return 0; +} + static void release_task_stack(struct task_struct *tsk) { if (WARN_ON(tsk->state != TASK_DEAD)) @@ -807,6 +847,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!stack) goto free_tsk; + if (memcg_charge_kernel_stack(tsk)) + goto free_stack; + stack_vm_area = task_stack_vm_area(tsk); err = arch_dup_task_struct(tsk, orig); @@ -1779,6 +1822,10 @@ static __latent_entropy struct task_struct *copy_process( p->default_timer_slack_ns = current->timer_slack_ns; +#ifdef CONFIG_PSI + p->psi_flags = 0; +#endif + task_io_accounting_init(&p->ioac); acct_clear_integrals(p); diff --git a/kernel/memremap.c b/kernel/memremap.c @@ -175,10 +175,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) struct vmem_altmap *altmap = pgmap->altmap_valid ? &pgmap->altmap : NULL; struct resource *res = &pgmap->res; - unsigned long pfn, pgoff, order; + struct dev_pagemap *conflict_pgmap; pgprot_t pgprot = PAGE_KERNEL; + unsigned long pgoff, order; int error, nid, is_ram; - struct dev_pagemap *conflict_pgmap; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -256,19 +256,14 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) if (error) goto err_add_memory; - for_each_device_pfn(pfn, pgmap) { - struct page *page = pfn_to_page(pfn); - - /* - * ZONE_DEVICE pages union ->lru with a ->pgmap back - * pointer. It is a bug if a ZONE_DEVICE page is ever - * freed or placed on a driver-private list. Seed the - * storage with LIST_POISON* values. - */ - list_del(&page->lru); - page->pgmap = pgmap; - percpu_ref_get(pgmap->ref); - } + /* + * Initialization of the pages has been deferred until now in order + * to allow us to do the work while not holding the hotplug lock. + */ + memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, pgmap); + percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); devm_add_action(dev, devm_memremap_pages_release, pgmap); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile @@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o +obj-$(CONFIG_PSI) += psi.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c @@ -722,8 +722,10 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & ENQUEUE_RESTORE)) + if (!(flags & ENQUEUE_RESTORE)) { sched_info_queued(rq, p); + psi_enqueue(p, flags & ENQUEUE_WAKEUP); + } p->sched_class->enqueue_task(rq, p, flags); } @@ -733,8 +735,10 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & DEQUEUE_SAVE)) + if (!(flags & DEQUEUE_SAVE)) { sched_info_dequeued(rq, p); + psi_dequeue(p, flags & DEQUEUE_SLEEP); + } p->sched_class->dequeue_task(rq, p, flags); } @@ -2037,6 +2041,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; + psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } @@ -3051,6 +3056,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); cpu_load_update_active(rq); calc_global_load_tick(rq); + psi_task_tick(rq); rq_unlock(rq, &rf); @@ -4933,9 +4939,7 @@ static void do_sched_yield(void) struct rq_flags rf; struct rq *rq; - local_irq_disable(); - rq = this_rq(); - rq_lock(rq, &rf); + rq = this_rq_lock_irq(&rf); schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); @@ -6069,6 +6073,8 @@ void __init sched_init(void) init_schedstats(); + psi_init(); + scheduler_running = 1; } diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c @@ -91,19 +91,73 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) return delta; } -/* - * a1 = a0 * e + a * (1 - e) +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. */ static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) { - unsigned long newload; + unsigned long result = 1UL << frac_bits; + + if (n) { + for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + } - newload = load * exp + active * (FIXED_1 - exp); - if (active >= load) - newload += FIXED_1-1; + return result; +} - return newload / FIXED_1; +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); } #ifdef CONFIG_NO_HZ_COMMON @@ -225,75 +279,6 @@ static long calc_load_nohz_fold(void) return delta; } -/** - * fixed_power_int - compute: x^n, in O(log n) time - * - * @x: base of the power - * @frac_bits: fractional bits of @x - * @n: power to raise @x to. - * - * By exploiting the relation between the definition of the natural power - * function: x^n := x*x*...*x (x multiplied by itself for n times), and - * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, - * (where: n_i \elem {0, 1}, the binary vector representing n), - * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is - * of course trivially computable in O(log_2 n), the length of our binary - * vector. - */ -static unsigned long -fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -{ - unsigned long result = 1UL << frac_bits; - - if (n) { - for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; - } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; - } - } - - return result; -} - -/* - * a1 = a0 * e + a * (1 - e) - * - * a2 = a1 * e + a * (1 - e) - * = (a0 * e + a * (1 - e)) * e + a * (1 - e) - * = a0 * e^2 + a * (1 - e) * (1 + e) - * - * a3 = a2 * e + a * (1 - e) - * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) - * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) - * - * ... - * - * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] - * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) - * = a0 * e^n + a * (1 - e^n) - * - * [1] application of the geometric series: - * - * n 1 - x^(n+1) - * S_n := \Sum x^i = ------------- - * i=0 1 - x - */ -static unsigned long -calc_load_n(unsigned long load, unsigned long exp, - unsigned long active, unsigned int n) -{ - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -} - /* * NO_HZ can leave us missing all per-CPU ticks calling * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c @@ -0,0 +1,759 @@ +/* + * Pressure stall information for CPU, memory and IO + * + * Copyright (c) 2018 Facebook, Inc. + * Author: Johannes Weiner <hannes@cmpxchg.org> + * + * When CPU, memory and IO are contended, tasks experience delays that + * reduce throughput and introduce latencies into the workload. Memory + * and IO contention, in addition, can cause a full loss of forward + * progress in which the CPU goes idle. + * + * This code aggregates individual task delays into resource pressure + * metrics that indicate problems with both workload health and + * resource utilization. + * + * Model + * + * The time in which a task can execute on a CPU is our baseline for + * productivity. Pressure expresses the amount of time in which this + * potential cannot be realized due to resource contention. + * + * This concept of productivity has two components: the workload and + * the CPU. To measure the impact of pressure on both, we define two + * contention states for a resource: SOME and FULL. + * + * In the SOME state of a given resource, one or more tasks are + * delayed on that resource. This affects the workload's ability to + * perform work, but the CPU may still be executing other tasks. + * + * In the FULL state of a given resource, all non-idle tasks are + * delayed on that resource such that nobody is advancing and the CPU + * goes idle. This leaves both workload and CPU unproductive. + * + * (Naturally, the FULL state doesn't exist for the CPU resource.) + * + * SOME = nr_delayed_tasks != 0 + * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 + * + * The percentage of wallclock time spent in those compound stall + * states gives pressure numbers between 0 and 100 for each resource, + * where the SOME percentage indicates workload slowdowns and the FULL + * percentage indicates reduced CPU utilization: + * + * %SOME = time(SOME) / period + * %FULL = time(FULL) / period + * + * Multiple CPUs + * + * The more tasks and available CPUs there are, the more work can be + * performed concurrently. This means that the potential that can go + * unrealized due to resource contention *also* scales with non-idle + * tasks and CPUs. + * + * Consider a scenario where 257 number crunching tasks are trying to + * run concurrently on 256 CPUs. If we simply aggregated the task + * states, we would have to conclude a CPU SOME pressure number of + * 100%, since *somebody* is waiting on a runqueue at all + * times. However, that is clearly not the amount of contention the + * workload is experiencing: only one out of 256 possible exceution + * threads will be contended at any given time, or about 0.4%. + * + * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any + * given time *one* of the tasks is delayed due to a lack of memory. + * Again, looking purely at the task state would yield a memory FULL + * pressure number of 0%, since *somebody* is always making forward + * progress. But again this wouldn't capture the amount of execution + * potential lost, which is 1 out of 4 CPUs, or 25%. + * + * To calculate wasted potential (pressure) with multiple processors, + * we have to base our calculation on the number of non-idle tasks in + * conjunction with the number of available CPUs, which is the number + * of potential execution threads. SOME becomes then the proportion of + * delayed tasks to possibe threads, and FULL is the share of possible + * threads that are unproductive due to delays: + * + * threads = min(nr_nonidle_tasks, nr_cpus) + * SOME = min(nr_delayed_tasks / threads, 1) + * FULL = (threads - min(nr_running_tasks, threads)) / threads + * + * For the 257 number crunchers on 256 CPUs, this yields: + * + * threads = min(257, 256) + * SOME = min(1 / 256, 1) = 0.4% + * FULL = (256 - min(257, 256)) / 256 = 0% + * + * For the 1 out of 4 memory-delayed tasks, this yields: + * + * threads = min(4, 4) + * SOME = min(1 / 4, 1) = 25% + * FULL = (4 - min(3, 4)) / 4 = 25% + * + * [ Substitute nr_cpus with 1, and you can see that it's a natural + * extension of the single-CPU model. ] + * + * Implementation + * + * To assess the precise time spent in each such state, we would have + * to freeze the system on task changes and start/stop the state + * clocks accordingly. Obviously that doesn't scale in practice. + * + * Because the scheduler aims to distribute the compute load evenly + * among the available CPUs, we can track task state locally to each + * CPU and, at much lower frequency, extrapolate the global state for + * the cumulative stall times and the running averages. + * + * For each runqueue, we track: + * + * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0) + * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu]) + * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0) + * + * and then periodically aggregate: + * + * tNONIDLE = sum(tNONIDLE[i]) + * + * tSOME = sum(tSOME[i] * tNONIDLE[i]) / tNONIDLE + * tFULL = sum(tFULL[i] * tNONIDLE[i]) / tNONIDLE + * + * %SOME = tSOME / period + * %FULL = tFULL / period + * + * This gives us an approximation of pressure that is practical + * cost-wise, yet way more sensitive and accurate than periodic + * sampling of the aggregate task states would be. + */ + +#include <linux/sched/loadavg.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> +#include <linux/seqlock.h> +#include <linux/cgroup.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/psi.h> +#include "sched.h" + +static int psi_bug __read_mostly; + +bool psi_disabled __read_mostly; +core_param(psi_disabled, psi_disabled, bool, 0644); + +/* Running averages - we need to be higher-res than loadavg */ +#define PSI_FREQ (2*HZ+1) /* 2 sec intervals */ +#define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */ +#define EXP_60s 1981 /* 1/exp(2s/60s) */ +#define EXP_300s 2034 /* 1/exp(2s/300s) */ + +/* Sampling frequency in nanoseconds */ +static u64 psi_period __read_mostly; + +/* System-level pressure and stall tracking */ +static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); +static struct psi_group psi_system = { + .pcpu = &system_group_pcpu, +}; + +static void psi_update_work(struct work_struct *work); + +static void group_init(struct psi_group *group) +{ + int cpu; + + for_each_possible_cpu(cpu) + seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); + group->next_update = sched_clock() + psi_period; + INIT_DELAYED_WORK(&group->clock_work, psi_update_work); + mutex_init(&group->stat_lock); +} + +void __init psi_init(void) +{ + if (psi_disabled) + return; + + psi_period = jiffies_to_nsecs(PSI_FREQ); + group_init(&psi_system); +} + +static bool test_state(unsigned int *tasks, enum psi_states state) +{ + switch (state) { + case PSI_IO_SOME: + return tasks[NR_IOWAIT]; + case PSI_IO_FULL: + return tasks[NR_IOWAIT] && !tasks[NR_RUNNING]; + case PSI_MEM_SOME: + return tasks[NR_MEMSTALL]; + case PSI_MEM_FULL: + return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; + case PSI_CPU_SOME: + return tasks[NR_RUNNING] > 1; + case PSI_NONIDLE: + return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || + tasks[NR_RUNNING]; + default: + return false; + } +} + +static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +{ + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + unsigned int tasks[NR_PSI_TASK_COUNTS]; + u64 now, state_start; + unsigned int seq; + int s; + + /* Snapshot a coherent view of the CPU state */ + do { + seq = read_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); + memcpy(times, groupc->times, sizeof(groupc->times)); + memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_start = groupc->state_start; + } while (read_seqcount_retry(&groupc->seq, seq)); + + /* Calculate state time deltas against the previous snapshot */ + for (s = 0; s < NR_PSI_STATES; s++) { + u32 delta; + /* + * In addition to already concluded states, we also + * incorporate currently active states on the CPU, + * since states may last for many sampling periods. + * + * This way we keep our delta sampling buckets small + * (u32) and our reported pressure close to what's + * actually happening. + */ + if (test_state(tasks, s)) + times[s] += now - state_start; + + delta = times[s] - groupc->times_prev[s]; + groupc->times_prev[s] = times[s]; + + times[s] = delta; + } +} + +static void calc_avgs(unsigned long avg[3], int missed_periods, + u64 time, u64 period) +{ + unsigned long pct; + + /* Fill in zeroes for periods of no activity */ + if (missed_periods) { + avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods); + avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods); + avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods); + } + + /* Sample the most recent active period */ + pct = div_u64(time * 100, period); + pct *= FIXED_1; + avg[0] = calc_load(avg[0], EXP_10s, pct); + avg[1] = calc_load(avg[1], EXP_60s, pct); + avg[2] = calc_load(avg[2], EXP_300s, pct); +} + +static bool update_stats(struct psi_group *group) +{ + u64 deltas[NR_PSI_STATES - 1] = { 0, }; + unsigned long missed_periods = 0; + unsigned long nonidle_total = 0; + u64 now, expires, period; + int cpu; + int s; + + mutex_lock(&group->stat_lock); + + /* + * Collect the per-cpu time buckets and average them into a + * single time sample that is normalized to wallclock time. + * + * For averaging, each CPU is weighted by its non-idle time in + * the sampling period. This eliminates artifacts from uneven + * loading, or even entirely idle CPUs. + */ + for_each_possible_cpu(cpu) { + u32 times[NR_PSI_STATES]; + u32 nonidle; + + get_recent_times(group, cpu, times); + + nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); + nonidle_total += nonidle; + + for (s = 0; s < PSI_NONIDLE; s++) + deltas[s] += (u64)times[s] * nonidle; + } + + /* + * Integrate the sample into the running statistics that are + * reported to userspace: the cumulative stall times and the + * decaying averages. + * + * Pressure percentages are sampled at PSI_FREQ. We might be + * called more often when the user polls more frequently than + * that; we might be called less often when there is no task + * activity, thus no data, and clock ticks are sporadic. The + * below handles both. + */ + + /* total= */ + for (s = 0; s < NR_PSI_STATES - 1; s++) + group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + + /* avgX= */ + now = sched_clock(); + expires = group->next_update; + if (now < expires) + goto out; + if (now - expires > psi_period) + missed_periods = div_u64(now - expires, psi_period); + + /* + * The periodic clock tick can get delayed for various + * reasons, especially on loaded systems. To avoid clock + * drift, we schedule the clock in fixed psi_period intervals. + * But the deltas we sample out of the per-cpu buckets above + * are based on the actual time elapsing between clock ticks. + */ + group->next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->last_update + (missed_periods * psi_period)); + group->last_update = now; + + for (s = 0; s < NR_PSI_STATES - 1; s++) { + u32 sample; + + sample = group->total[s] - group->total_prev[s]; + /* + * Due to the lockless sampling of the time buckets, + * recorded time deltas can slip into the next period, + * which under full pressure can result in samples in + * excess of the period length. + * + * We don't want to report non-sensical pressures in + * excess of 100%, nor do we want to drop such events + * on the floor. Instead we punt any overage into the + * future until pressure subsides. By doing this we + * don't underreport the occurring pressure curve, we + * just report it delayed by one period length. + * + * The error isn't cumulative. As soon as another + * delta slips from a period P to P+1, by definition + * it frees up its time T in P. + */ + if (sample > period) + sample = period; + group->total_prev[s] += sample; + calc_avgs(group->avg[s], missed_periods, sample, period); + } +out: + mutex_unlock(&group->stat_lock); + return nonidle_total; +} + +static void psi_update_work(struct work_struct *work) +{ + struct delayed_work *dwork; + struct psi_group *group; + bool nonidle; + + dwork = to_delayed_work(work); + group = container_of(dwork, struct psi_group, clock_work); + + /* + * If there is task activity, periodically fold the per-cpu + * times and feed samples into the running averages. If things + * are idle and there is no data to process, stop the clock. + * Once restarted, we'll catch up the running averages in one + * go - see calc_avgs() and missed_periods. + */ + + nonidle = update_stats(group); + + if (nonidle) { + unsigned long delay = 0; + u64 now; + + now = sched_clock(); + if (group->next_update > now) + delay = nsecs_to_jiffies(group->next_update - now) + 1; + schedule_delayed_work(dwork, delay); + } +} + +static void record_times(struct psi_group_cpu *groupc, int cpu, + bool memstall_tick) +{ + u32 delta; + u64 now; + + now = cpu_clock(cpu); + delta = now - groupc->state_start; + groupc->state_start = now; + + if (test_state(groupc->tasks, PSI_IO_SOME)) { + groupc->times[PSI_IO_SOME] += delta; + if (test_state(groupc->tasks, PSI_IO_FULL)) + groupc->times[PSI_IO_FULL] += delta; + } + + if (test_state(groupc->tasks, PSI_MEM_SOME)) { + groupc->times[PSI_MEM_SOME] += delta; + if (test_state(groupc->tasks, PSI_MEM_FULL)) + groupc->times[PSI_MEM_FULL] += delta; + else if (memstall_tick) { + u32 sample; + /* + * Since we care about lost potential, a + * memstall is FULL when there are no other + * working tasks, but also when the CPU is + * actively reclaiming and nothing productive + * could run even if it were runnable. + * + * When the timer tick sees a reclaiming CPU, + * regardless of runnable tasks, sample a FULL + * tick (or less if it hasn't been a full tick + * since the last state change). + */ + sample = min(delta, (u32)jiffies_to_nsecs(1)); + groupc->times[PSI_MEM_FULL] += sample; + } + } + + if (test_state(groupc->tasks, PSI_CPU_SOME)) + groupc->times[PSI_CPU_SOME] += delta; + + if (test_state(groupc->tasks, PSI_NONIDLE)) + groupc->times[PSI_NONIDLE] += delta; +} + +static void psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set) +{ + struct psi_group_cpu *groupc; + unsigned int t, m; + + groupc = per_cpu_ptr(group->pcpu, cpu); + + /* + * First we assess the aggregate resource states this CPU's + * tasks have been in since the last change, and account any + * SOME and FULL time these may have resulted in. + * + * Then we update the task counts according to the state + * change requested through the @clear and @set bits. + */ + write_seqcount_begin(&groupc->seq); + + record_times(groupc, cpu, false); + + for (t = 0, m = clear; m; m &= ~(1 << t), t++) { + if (!(m & (1 << t))) + continue; + if (groupc->tasks[t] == 0 && !psi_bug) { + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n", + cpu, t, groupc->tasks[0], + groupc->tasks[1], groupc->tasks[2], + clear, set); + psi_bug = 1; + } + groupc->tasks[t]--; + } + + for (t = 0; set; set &= ~(1 << t), t++) + if (set & (1 << t)) + groupc->tasks[t]++; + + write_seqcount_end(&groupc->seq); + + if (!delayed_work_pending(&group->clock_work)) + schedule_delayed_work(&group->clock_work, PSI_FREQ); +} + +static struct psi_group *iterate_groups(struct task_struct *task, void **iter) +{ +#ifdef CONFIG_CGROUPS + struct cgroup *cgroup = NULL; + + if (!*iter) + cgroup = task->cgroups->dfl_cgrp; + else if (*iter == &psi_system) + return NULL; + else + cgroup = cgroup_parent(*iter); + + if (cgroup && cgroup_parent(cgroup)) { + *iter = cgroup; + return cgroup_psi(cgroup); + } +#else + if (*iter) + return NULL; +#endif + *iter = &psi_system; + return &psi_system; +} + +void psi_task_change(struct task_struct *task, int clear, int set) +{ + int cpu = task_cpu(task); + struct psi_group *group; + void *iter = NULL; + + if (!task->pid) + return; + + if (((task->psi_flags & set) || + (task->psi_flags & clear) != clear) && + !psi_bug) { + printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", + task->pid, task->comm, cpu, + task->psi_flags, clear, set); + psi_bug = 1; + } + + task->psi_flags &= ~clear; + task->psi_flags |= set; + + while ((group = iterate_groups(task, &iter))) + psi_group_change(group, cpu, clear, set); +} + +void psi_memstall_tick(struct task_struct *task, int cpu) +{ + struct psi_group *group; + void *iter = NULL; + + while ((group = iterate_groups(task, &iter))) { + struct psi_group_cpu *groupc; + + groupc = per_cpu_ptr(group->pcpu, cpu); + write_seqcount_begin(&groupc->seq); + record_times(groupc, cpu, true); + write_seqcount_end(&groupc->seq); + } +} + +/** + * psi_memstall_enter - mark the beginning of a memory stall section + * @flags: flags to handle nested sections + * + * Marks the calling task as being stalled due to a lack of memory, + * such as waiting for a refault or performing reclaim. + */ +void psi_memstall_enter(unsigned long *flags) +{ + struct rq_flags rf; + struct rq *rq; + + if (psi_disabled) + return; + + *flags = current->flags & PF_MEMSTALL; + if (*flags) + return; + /* + * PF_MEMSTALL setting & accounting needs to be atomic wrt + * changes to the task's scheduling state, otherwise we can + * race with CPU migration. + */ + rq = this_rq_lock_irq(&rf); + + current->flags |= PF_MEMSTALL; + psi_task_change(current, 0, TSK_MEMSTALL); + + rq_unlock_irq(rq, &rf); +} + +/** + * psi_memstall_leave - mark the end of an memory stall section + * @flags: flags to handle nested memdelay sections + * + * Marks the calling task as no longer stalled due to lack of memory. + */ +void psi_memstall_leave(unsigned long *flags) +{ + struct rq_flags rf; + struct rq *rq; + + if (psi_disabled) + return; + + if (*flags) + return; + /* + * PF_MEMSTALL clearing & accounting needs to be atomic wrt + * changes to the task's scheduling state, otherwise we could + * race with CPU migration. + */ + rq = this_rq_lock_irq(&rf); + + current->flags &= ~PF_MEMSTALL; + psi_task_change(current, TSK_MEMSTALL, 0); + + rq_unlock_irq(rq, &rf); +} + +#ifdef CONFIG_CGROUPS +int psi_cgroup_alloc(struct cgroup *cgroup) +{ + if (psi_disabled) + return 0; + + cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); + if (!cgroup->psi.pcpu) + return -ENOMEM; + group_init(&cgroup->psi); + return 0; +} + +void psi_cgroup_free(struct cgroup *cgroup) +{ + if (psi_disabled) + return; + + cancel_delayed_work_sync(&cgroup->psi.clock_work); + free_percpu(cgroup->psi.pcpu); +} + +/** + * cgroup_move_task - move task to a different cgroup + * @task: the task + * @to: the target css_set + * + * Move task to a new cgroup and safely migrate its associated stall + * state between the different groups. + * + * This function acquires the task's rq lock to lock out concurrent + * changes to the task's scheduling state and - in case the task is + * running - concurrent changes to its stall state. + */ +void cgroup_move_task(struct task_struct *task, struct css_set *to) +{ + bool move_psi = !psi_disabled; + unsigned int task_flags = 0; + struct rq_flags rf; + struct rq *rq; + + if (move_psi) { + rq = task_rq_lock(task, &rf); + + if (task_on_rq_queued(task)) + task_flags = TSK_RUNNING; + else if (task->in_iowait) + task_flags = TSK_IOWAIT; + + if (task->flags & PF_MEMSTALL) + task_flags |= TSK_MEMSTALL; + + if (task_flags) + psi_task_change(task, task_flags, 0); + } + + /* + * Lame to do this here, but the scheduler cannot be locked + * from the outside, so we move cgroups from inside sched/. + */ + rcu_assign_pointer(task->cgroups, to); + + if (move_psi) { + if (task_flags) + psi_task_change(task, 0, task_flags); + + task_rq_unlock(rq, task, &rf); + } +} +#endif /* CONFIG_CGROUPS */ + +int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) +{ + int full; + + if (psi_disabled) + return -EOPNOTSUPP; + + update_stats(group); + + for (full = 0; full < 2 - (res == PSI_CPU); full++) { + unsigned long avg[3]; + u64 total; + int w; + + for (w = 0; w < 3; w++) + avg[w] = group->avg[res * 2 + full][w]; + total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); + + seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", + full ? "full" : "some", + LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), + LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), + LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), + total); + } + + return 0; +} + +static int psi_io_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IO); +} + +static int psi_memory_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_MEM); +} + +static int psi_cpu_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_CPU); +} + +static int psi_io_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_io_show, NULL); +} + +static int psi_memory_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_memory_show, NULL); +} + +static int psi_cpu_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_cpu_show, NULL); +} + +static const struct file_operations psi_io_fops = { + .open = psi_io_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations psi_memory_fops = { + .open = psi_memory_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations psi_cpu_fops = { + .open = psi_cpu_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init psi_proc_init(void) +{ + proc_mkdir("pressure", NULL); + proc_create("pressure/io", 0, NULL, &psi_io_fops); + proc_create("pressure/memory", 0, NULL, &psi_memory_fops); + proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + return 0; +} +module_init(psi_proc_init); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h @@ -54,6 +54,7 @@ #include <linux/proc_fs.h> #include <linux/prefetch.h> #include <linux/profile.h> +#include <linux/psi.h> #include <linux/rcupdate_wait.h> #include <linux/security.h> #include <linux/stop_machine.h> @@ -319,6 +320,7 @@ extern bool dl_cpu_busy(unsigned int cpu); #ifdef CONFIG_CGROUP_SCHED #include <linux/cgroup.h> +#include <linux/psi.h> struct cfs_rq; struct rt_rq; @@ -957,6 +959,8 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +extern void update_rq_clock(struct rq *rq); + static inline u64 __rq_clock_broken(struct rq *rq) { return READ_ONCE(rq->clock); @@ -1075,6 +1079,98 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) #endif } +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock); + +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(p->pi_lock) + __acquires(rq->lock); + +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) + __releases(rq->lock) + __releases(p->pi_lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); +} + +static inline void +rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irqsave(&rq->lock, rf->flags); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock_irq(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_relock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_repin_lock(rq, rf); +} + +static inline void +rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irqrestore(&rq->lock, rf->flags); +} + +static inline void +rq_unlock_irq(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irq(&rq->lock); +} + +static inline void +rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + +static inline struct rq * +this_rq_lock_irq(struct rq_flags *rf) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + rq_lock(rq, rf); + return rq; +} + #ifdef CONFIG_NUMA enum numa_topology_type { NUMA_DIRECT, @@ -1717,8 +1813,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) sched_update_tick_dependency(rq); } -extern void update_rq_clock(struct rq *rq); - extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); @@ -1783,86 +1877,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) #endif #endif -struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) - __acquires(rq->lock); - -struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) - __acquires(p->pi_lock) - __acquires(rq->lock); - -static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); -} - -static inline void -task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) - __releases(rq->lock) - __releases(p->pi_lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -} - -static inline void -rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock_irqsave(&rq->lock, rf->flags); - rq_pin_lock(rq, rf); -} - -static inline void -rq_lock_irq(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock_irq(&rq->lock); - rq_pin_lock(rq, rf); -} - -static inline void -rq_lock(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock(&rq->lock); - rq_pin_lock(rq, rf); -} - -static inline void -rq_relock(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock(&rq->lock); - rq_repin_lock(rq, rf); -} - -static inline void -rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock_irqrestore(&rq->lock, rf->flags); -} - -static inline void -rq_unlock_irq(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock_irq(&rq->lock); -} - -static inline void -rq_unlock(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); -} - #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h @@ -55,6 +55,92 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define schedstat_val_or_zero(var) 0 #endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_PSI +/* + * PSI tracks state that persists across sleeps, such as iowaits and + * memory stalls. As a result, it has to distinguish between sleeps, + * where a task's runnable state changes, and requeues, where a task + * and its state are being moved between CPUs and runqueues. + */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) +{ + int clear = 0, set = TSK_RUNNING; + + if (psi_disabled) + return; + + if (!wakeup || p->sched_psi_wake_requeue) { + if (p->flags & PF_MEMSTALL) + set |= TSK_MEMSTALL; + if (p->sched_psi_wake_requeue) + p->sched_psi_wake_requeue = 0; + } else { + if (p->in_iowait) + clear |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_dequeue(struct task_struct *p, bool sleep) +{ + int clear = TSK_RUNNING, set = 0; + + if (psi_disabled) + return; + + if (!sleep) { + if (p->flags & PF_MEMSTALL) + clear |= TSK_MEMSTALL; + } else { + if (p->in_iowait) + set |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_ttwu_dequeue(struct task_struct *p) +{ + if (psi_disabled) + return; + /* + * Is the task being migrated during a wakeup? Make sure to + * deregister its sleep-persistent psi states from the old + * queue, and let psi_enqueue() know it has to requeue. + */ + if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) { + struct rq_flags rf; + struct rq *rq; + int clear = 0; + + if (p->in_iowait) + clear |= TSK_IOWAIT; + if (p->flags & PF_MEMSTALL) + clear |= TSK_MEMSTALL; + + rq = __task_rq_lock(p, &rf); + psi_task_change(p, clear, 0); + p->sched_psi_wake_requeue = 1; + __task_rq_unlock(rq, &rf); + } +} + +static inline void psi_task_tick(struct rq *rq) +{ + if (psi_disabled) + return; + + if (unlikely(rq->curr->flags & PF_MEMSTALL)) + psi_memstall_tick(rq->curr, cpu_of(rq)); +} +#else /* CONFIG_PSI */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} +static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_ttwu_dequeue(struct task_struct *p) {} +static inline void psi_task_tick(struct rq *rq) {} +#endif /* CONFIG_PSI */ + #ifdef CONFIG_SCHED_INFO static inline void sched_info_reset_dequeued(struct task_struct *t) { diff --git a/lib/test_kasan.c b/lib/test_kasan.c @@ -579,6 +579,73 @@ static noinline void __init kmem_cache_invalid_free(void) kmem_cache_destroy(cache); } +static noinline void __init kasan_memchr(void) +{ + char *ptr; + size_t size = 24; + + pr_info("out-of-bounds in memchr\n"); + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); + if (!ptr) + return; + + memchr(ptr, '1', size + 1); + kfree(ptr); +} + +static noinline void __init kasan_memcmp(void) +{ + char *ptr; + size_t size = 24; + int arr[9]; + + pr_info("out-of-bounds in memcmp\n"); + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); + if (!ptr) + return; + + memset(arr, 0, sizeof(arr)); + memcmp(ptr, arr, size+1); + kfree(ptr); +} + +static noinline void __init kasan_strings(void) +{ + char *ptr; + size_t size = 24; + + pr_info("use-after-free in strchr\n"); + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); + if (!ptr) + return; + + kfree(ptr); + + /* + * Try to cause only 1 invalid access (less spam in dmesg). + * For that we need ptr to point to zeroed byte. + * Skip metadata that could be stored in freed object so ptr + * will likely point to zeroed byte. + */ + ptr += 16; + strchr(ptr, '1'); + + pr_info("use-after-free in strrchr\n"); + strrchr(ptr, '1'); + + pr_info("use-after-free in strcmp\n"); + strcmp(ptr, "2"); + + pr_info("use-after-free in strncmp\n"); + strncmp(ptr, "2", 1); + + pr_info("use-after-free in strlen\n"); + strlen(ptr); + + pr_info("use-after-free in strnlen\n"); + strnlen(ptr, 1); +} + static int __init kmalloc_tests_init(void) { /* @@ -618,6 +685,9 @@ static int __init kmalloc_tests_init(void) use_after_scope_test(); kmem_cache_double_free(); kmem_cache_invalid_free(); + kasan_memchr(); + kasan_memcmp(); + kasan_strings(); kasan_restore_multi_shot(multishot); diff --git a/mm/compaction.c b/mm/compaction.c @@ -22,6 +22,7 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/page_owner.h> +#include <linux/psi.h> #include "internal.h" #ifdef CONFIG_COMPACTION @@ -2068,11 +2069,15 @@ static int kcompactd(void *p) pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; while (!kthread_should_stop()) { + unsigned long pflags; + trace_mm_compaction_kcompactd_sleep(pgdat->node_id); wait_event_freezable(pgdat->kcompactd_wait, kcompactd_work_requested(pgdat)); + psi_memstall_enter(&pflags); kcompactd_do_work(pgdat); + psi_memstall_leave(&pflags); } return 0; diff --git a/mm/debug.c b/mm/debug.c @@ -13,6 +13,7 @@ #include <trace/events/mmflags.h> #include <linux/migrate.h> #include <linux/page_owner.h> +#include <linux/ctype.h> #include "internal.h" @@ -175,4 +176,49 @@ void dump_mm(const struct mm_struct *mm) ); } +static bool page_init_poisoning __read_mostly = true; + +static int __init setup_vm_debug(char *str) +{ + bool __page_init_poisoning = true; + + /* + * Calling vm_debug with no arguments is equivalent to requesting + * to enable all debugging options we can control. + */ + if (*str++ != '=' || !*str) + goto out; + + __page_init_poisoning = false; + if (*str == '-') + goto out; + + while (*str) { + switch (tolower(*str)) { + case'p': + __page_init_poisoning = true; + break; + default: + pr_err("vm_debug option '%c' unknown. skipped\n", + *str); + } + + str++; + } +out: + if (page_init_poisoning && !__page_init_poisoning) + pr_warn("Page struct poisoning disabled by kernel command line option 'vm_debug'\n"); + + page_init_poisoning = __page_init_poisoning; + + return 1; +} +__setup("vm_debug", setup_vm_debug); + +void page_init_poison(struct page *page, size_t size) +{ + if (page_init_poisoning) + memset(page, PAGE_POISON_PATTERN, size); +} +EXPORT_SYMBOL_GPL(page_init_poison); #endif /* CONFIG_DEBUG_VM */ diff --git a/mm/filemap.c b/mm/filemap.c @@ -36,6 +36,8 @@ #include <linux/cleancache.h> #include <linux/shmem_fs.h> #include <linux/rmap.h> +#include <linux/delayacct.h> +#include <linux/psi.h> #include "internal.h" #define CREATE_TRACE_POINTS @@ -915,12 +917,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, * data from the working set, only to cache data that will * get overwritten with something else, is a waste of memory. */ - if (!(gfp_mask & __GFP_WRITE) && - shadow && workingset_refault(shadow)) { - SetPageActive(page); - workingset_activation(page); - } else - ClearPageActive(page); + WARN_ON_ONCE(PageActive(page)); + if (!(gfp_mask & __GFP_WRITE) && shadow) + workingset_refault(page, shadow); lru_cache_add(page); } return ret; @@ -1076,8 +1075,18 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; + bool thrashing = false; + unsigned long pflags; int ret = 0; + if (bit_nr == PG_locked && + !PageUptodate(page) && PageWorkingset(page)) { + if (!PageSwapBacked(page)) + delayacct_thrashing_start(); + psi_memstall_enter(&pflags); + thrashing = true; + } + init_wait(wait); wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; wait->func = wake_page_function; @@ -1116,6 +1125,12 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, finish_wait(q, wait); + if (thrashing) { + if (!PageSwapBacked(page)) + delayacct_thrashing_end(); + psi_memstall_leave(&pflags); + } + /* * A signal could leave PageWaiters set. Clearing it here if * !waitqueue_active would be possible (by open-coding finish_wait), @@ -2581,9 +2596,7 @@ no_cached_page: * system is low on memory, or a problem occurs while trying * to schedule I/O. */ - if (error == -ENOMEM) - return VM_FAULT_OOM; - return VM_FAULT_SIGBUS; + return vmf_error(error); page_not_uptodate: /* @@ -2748,9 +2761,9 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) return generic_file_mmap(file, vma); } #else -int filemap_page_mkwrite(struct vm_fault *vmf) +vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { - return -ENOSYS; + return VM_FAULT_SIGBUS; } int generic_file_mmap(struct file * file, struct vm_area_struct * vma) { @@ -3012,7 +3025,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_NOWAIT) { /* If there are pages to writeback, return */ if (filemap_range_has_page(inode->i_mapping, pos, - pos + iov_iter_count(from))) + pos + write_len)) return -EAGAIN; } else { written = filemap_write_and_wait_range(mapping, pos, diff --git a/mm/gup.c b/mm/gup.c @@ -20,6 +20,11 @@ #include "internal.h" +struct follow_page_context { + struct dev_pagemap *pgmap; + unsigned int page_mask; +}; + static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags) { @@ -71,10 +76,10 @@ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) } static struct page *follow_page_pte(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, unsigned int flags) + unsigned long address, pmd_t *pmd, unsigned int flags, + struct dev_pagemap **pgmap) { struct mm_struct *mm = vma->vm_mm; - struct dev_pagemap *pgmap = NULL; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; @@ -116,8 +121,8 @@ retry: * Only return device mapping pages in the FOLL_GET case since * they are only valid while holding the pgmap reference. */ - pgmap = get_dev_pagemap(pte_pfn(pte), NULL); - if (pgmap) + *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); + if (*pgmap) page = pte_page(pte); else goto no_page; @@ -152,15 +157,8 @@ retry: goto retry; } - if (flags & FOLL_GET) { + if (flags & FOLL_GET) get_page(page); - - /* drop the pgmap reference now that we hold the page */ - if (pgmap) { - put_dev_pagemap(pgmap); - pgmap = NULL; - } - } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) @@ -210,7 +208,8 @@ no_page: static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, - unsigned int flags, unsigned int *page_mask) + unsigned int flags, + struct follow_page_context *ctx) { pmd_t *pmd, pmdval; spinlock_t *ptl; @@ -258,13 +257,13 @@ retry: } if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); - page = follow_devmap_pmd(vma, address, pmd, flags); + page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; } if (likely(!pmd_trans_huge(pmdval))) - return follow_page_pte(vma, address, pmd, flags); + return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) return no_page_table(vma, flags); @@ -284,7 +283,7 @@ retry_locked: } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); - return follow_page_pte(vma, address, pmd, flags); + return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } if (flags & FOLL_SPLIT) { int ret; @@ -307,18 +306,18 @@ retry_locked: } return ret ? ERR_PTR(ret) : - follow_page_pte(vma, address, pmd, flags); + follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(ptl); - *page_mask = HPAGE_PMD_NR - 1; + ctx->page_mask = HPAGE_PMD_NR - 1; return page; } - static struct page *follow_pud_mask(struct vm_area_struct *vma, unsigned long address, p4d_t *p4dp, - unsigned int flags, unsigned int *page_mask) + unsigned int flags, + struct follow_page_context *ctx) { pud_t *pud; spinlock_t *ptl; @@ -344,7 +343,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, } if (pud_devmap(*pud)) { ptl = pud_lock(mm, pud); - page = follow_devmap_pud(vma, address, pud, flags); + page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; @@ -352,13 +351,13 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, if (unlikely(pud_bad(*pud))) return no_page_table(vma, flags); - return follow_pmd_mask(vma, address, pud, flags, page_mask); + return follow_pmd_mask(vma, address, pud, flags, ctx); } - static struct page *follow_p4d_mask(struct vm_area_struct *vma, unsigned long address, pgd_t *pgdp, - unsigned int flags, unsigned int *page_mask) + unsigned int flags, + struct follow_page_context *ctx) { p4d_t *p4d; struct page *page; @@ -378,7 +377,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } - return follow_pud_mask(vma, address, p4d, flags, page_mask); + return follow_pud_mask(vma, address, p4d, flags, ctx); } /** @@ -396,13 +395,13 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, */ struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, - unsigned int *page_mask) + struct follow_page_context *ctx) { pgd_t *pgd; struct page *page; struct mm_struct *mm = vma->vm_mm; - *page_mask = 0; + ctx->page_mask = 0; /* make this handle hugepd */ page = follow_huge_addr(mm, address, flags & FOLL_WRITE); @@ -431,7 +430,19 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return no_page_table(vma, flags); } - return follow_p4d_mask(vma, address, pgd, flags, page_mask); + return follow_p4d_mask(vma, address, pgd, flags, ctx); +} + +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + unsigned int foll_flags) +{ + struct follow_page_context ctx = { NULL }; + struct page *page; + + page = follow_page_mask(vma, address, foll_flags, &ctx); + if (ctx.pgmap) + put_dev_pagemap(ctx.pgmap); + return page; } static int get_gate_page(struct mm_struct *mm, unsigned long address, @@ -659,9 +670,9 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking) { - long i = 0; - unsigned int page_mask; + long ret = 0, i = 0; struct vm_area_struct *vma = NULL; + struct follow_page_context ctx = { NULL }; if (!nr_pages) return 0; @@ -691,12 +702,14 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pages ? &pages[i] : NULL); if (ret) return i ? : ret; - page_mask = 0; + ctx.page_mask = 0; goto next_page; } - if (!vma || check_vma_flags(vma, gup_flags)) - return i ? : -EFAULT; + if (!vma || check_vma_flags(vma, gup_flags)) { + ret = -EFAULT; + goto out; + } if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, @@ -709,23 +722,26 @@ retry: * If we have a pending SIGKILL, don't keep faulting pages and * potentially allocating memory. */ - if (unlikely(fatal_signal_pending(current))) - return i ? i : -ERESTARTSYS; + if (unlikely(fatal_signal_pending(current))) { + ret = -ERESTARTSYS; + goto out; + } cond_resched(); - page = follow_page_mask(vma, start, foll_flags, &page_mask); + + page = follow_page_mask(vma, start, foll_flags, &ctx); if (!page) { - int ret; ret = faultin_page(tsk, vma, start, &foll_flags, nonblocking); switch (ret) { case 0: goto retry; + case -EBUSY: + ret = 0; + /* FALLTHRU */ case -EFAULT: case -ENOMEM: case -EHWPOISON: - return i ? i : ret; - case -EBUSY: - return i; + goto out; case -ENOENT: goto next_page; } @@ -737,27 +753,31 @@ retry: */ goto next_page; } else if (IS_ERR(page)) { - return i ? i : PTR_ERR(page); + ret = PTR_ERR(page); + goto out; } if (pages) { pages[i] = page; flush_anon_page(vma, page, start); flush_dcache_page(page); - page_mask = 0; + ctx.page_mask = 0; } next_page: if (vmas) { vmas[i] = vma; - page_mask = 0; + ctx.page_mask = 0; } - page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); + page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; i += page_increm; start += page_increm * PAGE_SIZE; nr_pages -= page_increm; } while (nr_pages); - return i; +out: + if (ctx.pgmap) + put_dev_pagemap(ctx.pgmap); + return i ? i : ret; } static bool vma_permits_fault(struct vm_area_struct *vma, @@ -1780,12 +1800,11 @@ bool gup_fast_permitted(unsigned long start, int nr_pages, int write) int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { - unsigned long addr, len, end; + unsigned long len, end; unsigned long flags; int nr = 0; start &= PAGE_MASK; - addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; @@ -1807,7 +1826,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, if (gup_fast_permitted(start, nr_pages, write)) { local_irq_save(flags); - gup_pgd_range(addr, end, write, pages, &nr); + gup_pgd_range(start, end, write, pages, &nr); local_irq_restore(flags); } diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c @@ -6,13 +6,17 @@ #include <linux/debugfs.h> #define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) +#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark) +#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark) struct gup_benchmark { - __u64 delta_usec; + __u64 get_delta_usec; + __u64 put_delta_usec; __u64 addr; __u64 size; __u32 nr_pages_per_call; __u32 flags; + __u64 expansion[10]; /* For future use */ }; static int __gup_benchmark_ioctl(unsigned int cmd, @@ -41,21 +45,40 @@ static int __gup_benchmark_ioctl(unsigned int cmd, nr = (next - addr) / PAGE_SIZE; } - nr = get_user_pages_fast(addr, nr, gup->flags & 1, pages + i); + switch (cmd) { + case GUP_FAST_BENCHMARK: + nr = get_user_pages_fast(addr, nr, gup->flags & 1, + pages + i); + break; + case GUP_LONGTERM_BENCHMARK: + nr = get_user_pages_longterm(addr, nr, gup->flags & 1, + pages + i, NULL); + break; + case GUP_BENCHMARK: + nr = get_user_pages(addr, nr, gup->flags & 1, pages + i, + NULL); + break; + default: + return -1; + } + if (nr <= 0) break; i += nr; } end_time = ktime_get(); - gup->delta_usec = ktime_us_delta(end_time, start_time); + gup->get_delta_usec = ktime_us_delta(end_time, start_time); gup->size = addr - gup->addr; + start_time = ktime_get(); for (i = 0; i < nr_pages; i++) { if (!pages[i]) break; put_page(pages[i]); } + end_time = ktime_get(); + gup->put_delta_usec = ktime_us_delta(end_time, start_time); kvfree(pages); return 0; @@ -67,8 +90,14 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, struct gup_benchmark gup; int ret; - if (cmd != GUP_FAST_BENCHMARK) + switch (cmd) { + case GUP_FAST_BENCHMARK: + case GUP_LONGTERM_BENCHMARK: + case GUP_BENCHMARK: + break; + default: return -EINVAL; + } if (copy_from_user(&gup, (void __user *)arg, sizeof(gup))) return -EFAULT; diff --git a/mm/hmm.c b/mm/hmm.c @@ -1024,7 +1024,6 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) resource_size_t key, align_start, align_size, align_end; struct device *device = devmem->device; int ret, nid, is_ram; - unsigned long pfn; align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); align_size = ALIGN(devmem->resource->start + @@ -1109,11 +1108,14 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) align_size >> PAGE_SHIFT, NULL); mem_hotplug_done(); - for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { - struct page *page = pfn_to_page(pfn); + /* + * Initialization of the pages has been deferred until now in order + * to allow us to do the work while not holding the hotplug lock. + */ + memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, &devmem->pagemap); - page->pgmap = &devmem->pagemap; - } return 0; error_add_memory: diff --git a/mm/huge_memory.c b/mm/huge_memory.c @@ -852,11 +852,10 @@ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, } struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, int flags) + pmd_t *pmd, int flags, struct dev_pagemap **pgmap) { unsigned long pfn = pmd_pfn(*pmd); struct mm_struct *mm = vma->vm_mm; - struct dev_pagemap *pgmap; struct page *page; assert_spin_locked(pmd_lockptr(mm, pmd)); @@ -886,12 +885,11 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, return ERR_PTR(-EEXIST); pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; - pgmap = get_dev_pagemap(pfn, NULL); - if (!pgmap) + *pgmap = get_dev_pagemap(pfn, *pgmap); + if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); get_page(page); - put_dev_pagemap(pgmap); return page; } @@ -1000,11 +998,10 @@ static void touch_pud(struct vm_area_struct *vma, unsigned long addr, } struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, int flags) + pud_t *pud, int flags, struct dev_pagemap **pgmap) { unsigned long pfn = pud_pfn(*pud); struct mm_struct *mm = vma->vm_mm; - struct dev_pagemap *pgmap; struct page *page; assert_spin_locked(pud_lockptr(mm, pud)); @@ -1028,12 +1025,11 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, return ERR_PTR(-EEXIST); pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; - pgmap = get_dev_pagemap(pfn, NULL); - if (!pgmap) + *pgmap = get_dev_pagemap(pfn, *pgmap); + if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); get_page(page); - put_dev_pagemap(pgmap); return page; } @@ -1562,8 +1558,20 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) * We are not sure a pending tlb flush here is for a huge page * mapping or not. Hence use the tlb range variant */ - if (mm_tlb_flush_pending(vma->vm_mm)) + if (mm_tlb_flush_pending(vma->vm_mm)) { flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); + /* + * change_huge_pmd() released the pmd lock before + * invalidating the secondary MMUs sharing the primary + * MMU pagetables (with ->invalidate_range()). The + * mmu_notifier_invalidate_range_end() (which + * internally calls ->invalidate_range()) in + * change_pmd_range() will run after us, so we can't + * rely on it here and we need an explicit invalidate. + */ + mmu_notifier_invalidate_range(vma->vm_mm, haddr, + haddr + HPAGE_PMD_SIZE); + } /* * Migrate the THP to the requested node, returns with page unlocked @@ -2369,6 +2377,7 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_mlocked) | (1L << PG_uptodate) | (1L << PG_active) | + (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | (1L << PG_dirty))); diff --git a/mm/hugetlb.c b/mm/hugetlb.c @@ -3690,6 +3690,12 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, return err; ClearPagePrivate(page); + /* + * set page dirty so that it will not be removed from cache/file + * by non-hugetlbfs specific code paths. + */ + set_page_dirty(page); + spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c @@ -103,7 +103,7 @@ static int quarantine_head; static int quarantine_tail; /* Total size of all objects in global_quarantine across all batches. */ static unsigned long quarantine_size; -static DEFINE_SPINLOCK(quarantine_lock); +static DEFINE_RAW_SPINLOCK(quarantine_lock); DEFINE_STATIC_SRCU(remove_cache_srcu); /* Maximum size of the global queue. */ @@ -190,7 +190,7 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) { qlist_move_all(q, &temp); - spin_lock(&quarantine_lock); + raw_spin_lock(&quarantine_lock); WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes); qlist_move_all(&temp, &global_quarantine[quarantine_tail]); if (global_quarantine[quarantine_tail].bytes >= @@ -203,7 +203,7 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) if (new_tail != quarantine_head) quarantine_tail = new_tail; } - spin_unlock(&quarantine_lock); + raw_spin_unlock(&quarantine_lock); } local_irq_restore(flags); @@ -230,7 +230,7 @@ void quarantine_reduce(void) * expected case). */ srcu_idx = srcu_read_lock(&remove_cache_srcu); - spin_lock_irqsave(&quarantine_lock, flags); + raw_spin_lock_irqsave(&quarantine_lock, flags); /* * Update quarantine size in case of hotplug. Allocate a fraction of @@ -254,7 +254,7 @@ void quarantine_reduce(void) quarantine_head = 0; } - spin_unlock_irqrestore(&quarantine_lock, flags); + raw_spin_unlock_irqrestore(&quarantine_lock, flags); qlist_free_all(&to_free, NULL); srcu_read_unlock(&remove_cache_srcu, srcu_idx); @@ -310,17 +310,17 @@ void quarantine_remove_cache(struct kmem_cache *cache) */ on_each_cpu(per_cpu_remove_cache, cache, 1); - spin_lock_irqsave(&quarantine_lock, flags); + raw_spin_lock_irqsave(&quarantine_lock, flags); for (i = 0; i < QUARANTINE_BATCHES; i++) { if (qlist_empty(&global_quarantine[i])) continue; qlist_move_cache(&global_quarantine[i], &to_free, cache); /* Scanning whole quarantine can take a while. */ - spin_unlock_irqrestore(&quarantine_lock, flags); + raw_spin_unlock_irqrestore(&quarantine_lock, flags); cond_resched(); - spin_lock_irqsave(&quarantine_lock, flags); + raw_spin_lock_irqsave(&quarantine_lock, flags); } - spin_unlock_irqrestore(&quarantine_lock, flags); + raw_spin_unlock_irqrestore(&quarantine_lock, flags); qlist_free_all(&to_free, cache); diff --git a/mm/kmemleak.c b/mm/kmemleak.c @@ -86,6 +86,7 @@ #include <linux/seq_file.h> #include <linux/cpumask.h> #include <linux/spinlock.h> +#include <linux/module.h> #include <linux/mutex.h> #include <linux/rcupdate.h> #include <linux/stacktrace.h> @@ -181,6 +182,7 @@ struct kmemleak_object { /* flag set to not scan the object */ #define OBJECT_NO_SCAN (1 << 2) +#define HEX_PREFIX " " /* number of bytes to print per line; must be 16 or 32 */ #define HEX_ROW_SIZE 16 /* number of bytes to print at a time (1, 2, 4, 8) */ @@ -235,6 +237,9 @@ static int kmemleak_skip_disable; /* If there are leaks that can be reported */ static bool kmemleak_found_leaks; +static bool kmemleak_verbose; +module_param_named(verbose, kmemleak_verbose, bool, 0600); + /* * Early object allocation/freeing logging. Kmemleak is initialized after the * kernel allocator. However, both the kernel allocator and kmemleak may @@ -299,6 +304,25 @@ static void kmemleak_disable(void); kmemleak_disable(); \ } while (0) +#define warn_or_seq_printf(seq, fmt, ...) do { \ + if (seq) \ + seq_printf(seq, fmt, ##__VA_ARGS__); \ + else \ + pr_warn(fmt, ##__VA_ARGS__); \ +} while (0) + +static void warn_or_seq_hex_dump(struct seq_file *seq, int prefix_type, + int rowsize, int groupsize, const void *buf, + size_t len, bool ascii) +{ + if (seq) + seq_hex_dump(seq, HEX_PREFIX, prefix_type, rowsize, groupsize, + buf, len, ascii); + else + print_hex_dump(KERN_WARNING, pr_fmt(HEX_PREFIX), prefix_type, + rowsize, groupsize, buf, len, ascii); +} + /* * Printing of the objects hex dump to the seq file. The number of lines to be * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The @@ -314,10 +338,10 @@ static void hex_dump_object(struct seq_file *seq, /* limit the number of lines to HEX_MAX_LINES */ len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE); - seq_printf(seq, " hex dump (first %zu bytes):\n", len); + warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len); kasan_disable_current(); - seq_hex_dump(seq, " ", DUMP_PREFIX_NONE, HEX_ROW_SIZE, - HEX_GROUP_SIZE, ptr, len, HEX_ASCII); + warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE, + HEX_GROUP_SIZE, ptr, len, HEX_ASCII); kasan_enable_current(); } @@ -365,17 +389,17 @@ static void print_unreferenced(struct seq_file *seq, int i; unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies); - seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", + warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", object->pointer, object->size); - seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", + warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", object->comm, object->pid, object->jiffies, msecs_age / 1000, msecs_age % 1000); hex_dump_object(seq, object); - seq_printf(seq, " backtrace:\n"); + warn_or_seq_printf(seq, " backtrace:\n"); for (i = 0; i < object->trace_len; i++) { void *ptr = (void *)object->trace[i]; - seq_printf(seq, " [<%p>] %pS\n", ptr, ptr); + warn_or_seq_printf(seq, " [<%p>] %pS\n", ptr, ptr); } } @@ -1598,6 +1622,10 @@ static void kmemleak_scan(void) if (unreferenced_object(object) && !(object->flags & OBJECT_REPORTED)) { object->flags |= OBJECT_REPORTED; + + if (kmemleak_verbose) + print_unreferenced(NULL, object); + new_leaks++; } spin_unlock_irqrestore(&object->lock, flags); diff --git a/mm/memblock.c b/mm/memblock.c @@ -1444,10 +1444,9 @@ void * __init memblock_virt_alloc_try_nid_raw( ptr = memblock_virt_alloc_internal(size, align, min_addr, max_addr, nid); -#ifdef CONFIG_DEBUG_VM if (ptr && size > 0) - memset(ptr, PAGE_POISON_PATTERN, size); -#endif + page_init_poison(ptr, size); + return ptr; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c @@ -1669,6 +1669,8 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int if (order > PAGE_ALLOC_COSTLY_ORDER) return OOM_SKIPPED; + memcg_memory_event(memcg, MEMCG_OOM); + /* * We are in the middle of the charge context here, so we * don't want to block when potentially sitting on a callstack @@ -2250,8 +2252,6 @@ retry: if (fatal_signal_pending(current)) goto force; - memcg_memory_event(mem_over_limit, MEMCG_OOM); - /* * keep retrying as long as the memcg oom killer is able to make * a forward progress or bypass the charge if the oom killer @@ -2460,7 +2460,7 @@ static void memcg_kmem_cache_create_func(struct work_struct *w) /* * Enqueue the creation of a per-memcg kmem_cache. */ -static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, +static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, struct kmem_cache *cachep) { struct memcg_kmem_cache_create_work *cw; @@ -2478,25 +2478,6 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, queue_work(memcg_kmem_cache_wq, &cw->work); } -static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, - struct kmem_cache *cachep) -{ - /* - * We need to stop accounting when we kmalloc, because if the - * corresponding kmalloc cache is not yet created, the first allocation - * in __memcg_schedule_kmem_cache_create will recurse. - * - * However, it is better to enclose the whole function. Depending on - * the debugging options enabled, INIT_WORK(), for instance, can - * trigger an allocation. This too, will make us recurse. Because at - * this point we can't allow ourselves back into memcg_kmem_get_cache, - * the safest choice is to do it like this, wrapping the whole function. - */ - current->memcg_kmem_skip_account = 1; - __memcg_schedule_kmem_cache_create(memcg, cachep); - current->memcg_kmem_skip_account = 0; -} - static inline bool memcg_kmem_bypass(void) { if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) @@ -2531,9 +2512,6 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) if (memcg_kmem_bypass()) return cachep; - if (current->memcg_kmem_skip_account) - return cachep; - memcg = get_mem_cgroup_from_current(); kmemcg_id = READ_ONCE(memcg->kmemcg_id); if (kmemcg_id < 0) @@ -4321,14 +4299,12 @@ static void mem_cgroup_id_remove(struct mem_cgroup *memcg) static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { - VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); - atomic_add(n, &memcg->id.ref); + refcount_add(n, &memcg->id.ref); } static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { - VM_BUG_ON(atomic_read(&memcg->id.ref) < n); - if (atomic_sub_and_test(n, &memcg->id.ref)) { + if (refcount_sub_and_test(n, &memcg->id.ref)) { mem_cgroup_id_remove(memcg); /* Memcg ID pins CSS */ @@ -4545,7 +4521,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) } /* Online state pins memcg ID, memcg ID pins CSS */ - atomic_set(&memcg->id.ref, 1); + refcount_set(&memcg->id.ref, 1); css_get(css); return 0; } @@ -4573,6 +4549,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_offline_kmem(memcg); wb_memcg_offline(memcg); + drain_all_stock(memcg); + mem_cgroup_id_put(memcg); } @@ -5595,6 +5573,13 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]); + seq_printf(m, "workingset_refault %lu\n", + acc.stat[WORKINGSET_REFAULT]); + seq_printf(m, "workingset_activate %lu\n", + acc.stat[WORKINGSET_ACTIVATE]); + seq_printf(m, "workingset_nodereclaim %lu\n", + acc.stat[WORKINGSET_NODERECLAIM]); + seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + acc.events[PGSCAN_DIRECT]); @@ -5605,13 +5590,6 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); - seq_printf(m, "workingset_refault %lu\n", - acc.stat[WORKINGSET_REFAULT]); - seq_printf(m, "workingset_activate %lu\n", - acc.stat[WORKINGSET_ACTIVATE]); - seq_printf(m, "workingset_nodereclaim %lu\n", - acc.stat[WORKINGSET_NODERECLAIM]); - return 0; } @@ -6377,7 +6355,7 @@ subsys_initcall(mem_cgroup_init); #ifdef CONFIG_MEMCG_SWAP static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) { - while (!atomic_inc_not_zero(&memcg->id.ref)) { + while (!refcount_inc_not_zero(&memcg->id.ref)) { /* * The root cgroup cannot be destroyed, so it's refcount must * always be >= 1. diff --git a/mm/memory.c b/mm/memory.c @@ -1520,19 +1520,16 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_page); -static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, +static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; - int retval; pte_t *pte, entry; spinlock_t *ptl; - retval = -ENOMEM; pte = get_locked_pte(mm, addr, &ptl); if (!pte) - goto out; - retval = -EBUSY; + return VM_FAULT_OOM; if (!pte_none(*pte)) { if (mkwrite) { /* @@ -1565,56 +1562,32 @@ out_mkwrite: set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ - retval = 0; out_unlock: pte_unmap_unlock(pte, ptl); -out: - return retval; -} - -/** - * vm_insert_pfn - insert single pfn into user vma - * @vma: user vma to map to - * @addr: target user address of this page - * @pfn: source kernel pfn - * - * Similar to vm_insert_page, this allows drivers to insert individual pages - * they've allocated into a user vma. Same comments apply. - * - * This function should only be called from a vm_ops->fault handler, and - * in that case the handler should return NULL. - * - * vma cannot be a COW mapping. - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - */ -int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn) -{ - return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); + return VM_FAULT_NOPAGE; } -EXPORT_SYMBOL(vm_insert_pfn); /** - * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot + * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot * @vma: user vma to map to * @addr: target user address of this page * @pfn: source kernel pfn * @pgprot: pgprot flags for the inserted page * - * This is exactly like vm_insert_pfn, except that it allows drivers to + * This is exactly like vmf_insert_pfn(), except that it allows drivers to * to override pgprot on a per-page basis. * * This only makes sense for IO mappings, and it makes no sense for - * cow mappings. In general, using multiple vmas is preferable; - * vm_insert_pfn_prot should only be used if using multiple VMAs is + * COW mappings. In general, using multiple vmas is preferable; + * vmf_insert_pfn_prot should only be used if using multiple VMAs is * impractical. + * + * Context: Process context. May allocate using %GFP_KERNEL. + * Return: vm_fault_t value. */ -int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, +vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot) { - int ret; /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like @@ -1628,19 +1601,44 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) - return -EFAULT; + return VM_FAULT_SIGBUS; if (!pfn_modify_allowed(pfn, pgprot)) - return -EACCES; + return VM_FAULT_SIGBUS; track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); - ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, + return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, false); +} +EXPORT_SYMBOL(vmf_insert_pfn_prot); - return ret; +/** + * vmf_insert_pfn - insert single pfn into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * + * Similar to vm_insert_page, this allows drivers to insert individual pages + * they've allocated into a user vma. Same comments apply. + * + * This function should only be called from a vm_ops->fault handler, and + * in that case the handler should return the result of this function. + * + * vma cannot be a COW mapping. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * Context: Process context. May allocate using %GFP_KERNEL. + * Return: vm_fault_t value. + */ +vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); } -EXPORT_SYMBOL(vm_insert_pfn_prot); +EXPORT_SYMBOL(vmf_insert_pfn); static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) { @@ -1656,20 +1654,21 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) return false; } -static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn, bool mkwrite) +static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, + unsigned long addr, pfn_t pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; + int err; BUG_ON(!vm_mixed_ok(vma, pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) - return -EFAULT; + return VM_FAULT_SIGBUS; track_pfn_insert(vma, &pgprot, pfn); if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) - return -EACCES; + return VM_FAULT_SIGBUS; /* * If we don't have pte special, then we have to use the pfn_valid() @@ -1688,36 +1687,35 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * result in pfn_t_has_page() == false. */ page = pfn_to_page(pfn_t_to_pfn(pfn)); - return insert_page(vma, addr, page, pgprot); + err = insert_page(vma, addr, page, pgprot); + } else { + return insert_pfn(vma, addr, pfn, pgprot, mkwrite); } - return insert_pfn(vma, addr, pfn, pgprot, mkwrite); + + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; } -int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) +vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn) { return __vm_insert_mixed(vma, addr, pfn, false); - } -EXPORT_SYMBOL(vm_insert_mixed); +EXPORT_SYMBOL(vmf_insert_mixed); /* * If the insertion of PTE failed because someone else already added a * different entry in the mean time, we treat that as success as we assume * the same entry was actually inserted. */ - vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) { - int err; - - err = __vm_insert_mixed(vma, addr, pfn, true); - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err < 0 && err != -EBUSY) - return VM_FAULT_SIGBUS; - return VM_FAULT_NOPAGE; + return __vm_insert_mixed(vma, addr, pfn, true); } EXPORT_SYMBOL(vmf_insert_mixed_mkwrite); @@ -3498,10 +3496,36 @@ static vm_fault_t do_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; - /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ - if (!vma->vm_ops->fault) - ret = VM_FAULT_SIGBUS; - else if (!(vmf->flags & FAULT_FLAG_WRITE)) + /* + * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND + */ + if (!vma->vm_ops->fault) { + /* + * If we find a migration pmd entry or a none pmd entry, which + * should never happen, return SIGBUS + */ + if (unlikely(!pmd_present(*vmf->pmd))) + ret = VM_FAULT_SIGBUS; + else { + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, + vmf->pmd, + vmf->address, + &vmf->ptl); + /* + * Make sure this is not a temporary clearing of pte + * by holding ptl and checking again. A R/M/W update + * of pte involves: take ptl, clearing the pte so that + * we don't have concurrent modification by hardware + * followed by an update. + */ + if (unlikely(pte_none(*vmf->pte))) + ret = VM_FAULT_SIGBUS; + else + ret = VM_FAULT_NOPAGE; + + pte_unmap_unlock(vmf->pte, vmf->ptl); + } + } else if (!(vmf->flags & FAULT_FLAG_WRITE)) ret = do_read_fault(vmf); else if (!(vma->vm_flags & VM_SHARED)) ret = do_cow_fault(vmf); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c @@ -687,62 +687,19 @@ static void node_states_check_changes_online(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) { int nid = zone_to_nid(zone); - enum zone_type zone_last = ZONE_NORMAL; - /* - * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_NORMAL, - * set zone_last to ZONE_NORMAL. - * - * If we don't have HIGHMEM nor movable node, - * node_states[N_NORMAL_MEMORY] contains nodes which have zones of - * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. - */ - if (N_MEMORY == N_NORMAL_MEMORY) - zone_last = ZONE_MOVABLE; + arg->status_change_nid = -1; + arg->status_change_nid_normal = -1; + arg->status_change_nid_high = -1; - /* - * if the memory to be online is in a zone of 0...zone_last, and - * the zones of 0...zone_last don't have memory before online, we will - * need to set the node to node_states[N_NORMAL_MEMORY] after - * the memory is online. - */ - if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) + if (!node_state(nid, N_MEMORY)) + arg->status_change_nid = nid; + if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) arg->status_change_nid_normal = nid; - else - arg->status_change_nid_normal = -1; - #ifdef CONFIG_HIGHMEM - /* - * If we have movable node, node_states[N_HIGH_MEMORY] - * contains nodes which have zones of 0...ZONE_HIGHMEM, - * set zone_last to ZONE_HIGHMEM. - * - * If we don't have movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_MOVABLE, - * set zone_last to ZONE_MOVABLE. - */ - zone_last = ZONE_HIGHMEM; - if (N_MEMORY == N_HIGH_MEMORY) - zone_last = ZONE_MOVABLE; - - if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) + if (zone_idx(zone) <= N_HIGH_MEMORY && !node_state(nid, N_HIGH_MEMORY)) arg->status_change_nid_high = nid; - else - arg->status_change_nid_high = -1; -#else - arg->status_change_nid_high = arg->status_change_nid_normal; #endif - - /* - * if the node don't have memory befor online, we will need to - * set the node to node_states[N_MEMORY] after the memory - * is online. - */ - if (!node_state(nid, N_MEMORY)) - arg->status_change_nid = nid; - else - arg->status_change_nid = -1; } static void node_states_set_node(int node, struct memory_notify *arg) @@ -753,7 +710,8 @@ static void node_states_set_node(int node, struct memory_notify *arg) if (arg->status_change_nid_high >= 0) node_set_state(node, N_HIGH_MEMORY); - node_set_state(node, N_MEMORY); + if (arg->status_change_nid >= 0) + node_set_state(node, N_MEMORY); } static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, @@ -1505,75 +1463,53 @@ static void node_states_check_changes_offline(unsigned long nr_pages, { struct pglist_data *pgdat = zone->zone_pgdat; unsigned long present_pages = 0; - enum zone_type zt, zone_last = ZONE_NORMAL; + enum zone_type zt; - /* - * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_NORMAL, - * set zone_last to ZONE_NORMAL. - * - * If we don't have HIGHMEM nor movable node, - * node_states[N_NORMAL_MEMORY] contains nodes which have zones of - * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. - */ - if (N_MEMORY == N_NORMAL_MEMORY) - zone_last = ZONE_MOVABLE; + arg->status_change_nid = -1; + arg->status_change_nid_normal = -1; + arg->status_change_nid_high = -1; /* - * check whether node_states[N_NORMAL_MEMORY] will be changed. - * If the memory to be offline is in a zone of 0...zone_last, - * and it is the last present memory, 0...zone_last will - * become empty after offline , thus we can determind we will - * need to clear the node from node_states[N_NORMAL_MEMORY]. + * Check whether node_states[N_NORMAL_MEMORY] will be changed. + * If the memory to be offline is within the range + * [0..ZONE_NORMAL], and it is the last present memory there, + * the zones in that range will become empty after the offlining, + * thus we can determine that we need to clear the node from + * node_states[N_NORMAL_MEMORY]. */ - for (zt = 0; zt <= zone_last; zt++) + for (zt = 0; zt <= ZONE_NORMAL; zt++) present_pages += pgdat->node_zones[zt].present_pages; - if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) + if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages) arg->status_change_nid_normal = zone_to_nid(zone); - else - arg->status_change_nid_normal = -1; #ifdef CONFIG_HIGHMEM /* - * If we have movable node, node_states[N_HIGH_MEMORY] - * contains nodes which have zones of 0...ZONE_HIGHMEM, - * set zone_last to ZONE_HIGHMEM. - * - * If we don't have movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_MOVABLE, - * set zone_last to ZONE_MOVABLE. + * node_states[N_HIGH_MEMORY] contains nodes which + * have normal memory or high memory. + * Here we add the present_pages belonging to ZONE_HIGHMEM. + * If the zone is within the range of [0..ZONE_HIGHMEM), and + * we determine that the zones in that range become empty, + * we need to clear the node for N_HIGH_MEMORY. */ - zone_last = ZONE_HIGHMEM; - if (N_MEMORY == N_HIGH_MEMORY) - zone_last = ZONE_MOVABLE; - - for (; zt <= zone_last; zt++) - present_pages += pgdat->node_zones[zt].present_pages; - if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) + present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages; + if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages) arg->status_change_nid_high = zone_to_nid(zone); - else - arg->status_change_nid_high = -1; -#else - arg->status_change_nid_high = arg->status_change_nid_normal; #endif /* - * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE + * We have accounted the pages from [0..ZONE_NORMAL), and + * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM + * as well. + * Here we count the possible pages from ZONE_MOVABLE. + * If after having accounted all the pages, we see that the nr_pages + * to be offlined is over or equal to the accounted pages, + * we know that the node will become empty, and so, we can clear + * it for N_MEMORY as well. */ - zone_last = ZONE_MOVABLE; + present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages; - /* - * check whether node_states[N_HIGH_MEMORY] will be changed - * If we try to offline the last present @nr_pages from the node, - * we can determind we will need to clear the node from - * node_states[N_HIGH_MEMORY]. - */ - for (; zt <= zone_last; zt++) - present_pages += pgdat->node_zones[zt].present_pages; if (nr_pages >= present_pages) arg->status_change_nid = zone_to_nid(zone); - else - arg->status_change_nid = -1; } static void node_states_clear_node(int node, struct memory_notify *arg) @@ -1581,12 +1517,10 @@ static void node_states_clear_node(int node, struct memory_notify *arg) if (arg->status_change_nid_normal >= 0) node_clear_state(node, N_NORMAL_MEMORY); - if ((N_MEMORY != N_NORMAL_MEMORY) && - (arg->status_change_nid_high >= 0)) + if (arg->status_change_nid_high >= 0) node_clear_state(node, N_HIGH_MEMORY); - if ((N_MEMORY != N_HIGH_MEMORY) && - (arg->status_change_nid >= 0)) + if (arg->status_change_nid >= 0) node_clear_state(node, N_MEMORY); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c @@ -797,16 +797,19 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) } } -static int lookup_node(unsigned long addr) +static int lookup_node(struct mm_struct *mm, unsigned long addr) { struct page *p; int err; - err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL); + int locked = 1; + err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked); if (err >= 0) { err = page_to_nid(p); put_page(p); } + if (locked) + up_read(&mm->mmap_sem); return err; } @@ -817,7 +820,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, int err; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; - struct mempolicy *pol = current->mempolicy; + struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) @@ -857,7 +860,16 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { - err = lookup_node(addr); + /* + * Take a refcount on the mpol, lookup_node() + * wil drop the mmap_sem, so after calling + * lookup_node() only "pol" remains valid, "vma" + * is stale. + */ + pol_refcount = pol; + vma = NULL; + mpol_get(pol); + err = lookup_node(mm, addr); if (err < 0) goto out; *policy = err; @@ -892,7 +904,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, out: mpol_cond_put(pol); if (vma) - up_read(&current->mm->mmap_sem); + up_read(&mm->mmap_sem); + if (pol_refcount) + mpol_put(pol_refcount); return err; } @@ -2697,12 +2711,11 @@ static const char * const policy_modes[] = int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; - unsigned short mode; unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); - int err = 1; + int err = 1, mode; if (nodelist) { /* NUL-terminate mode or flags string */ @@ -2717,12 +2730,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) if (flags) *flags++ = '\0'; /* terminate mode string */ - for (mode = 0; mode < MPOL_MAX; mode++) { - if (!strcmp(str, policy_modes[mode])) { - break; - } - } - if (mode >= MPOL_MAX) + mode = match_string(policy_modes, MPOL_MAX, str); + if (mode < 0) goto out; switch (mode) { diff --git a/mm/migrate.c b/mm/migrate.c @@ -685,6 +685,8 @@ void migrate_page_states(struct page *newpage, struct page *page) SetPageActive(newpage); } else if (TestClearPageUnevictable(page)) SetPageUnevictable(newpage); + if (PageWorkingset(page)) + SetPageWorkingset(newpage); if (PageChecked(page)) SetPageChecked(newpage); if (PageMappedToDisk(page)) @@ -1973,8 +1975,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, int isolated = 0; struct page *new_page = NULL; int page_lru = page_is_file_cache(page); - unsigned long mmun_start = address & HPAGE_PMD_MASK; - unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; + unsigned long start = address & HPAGE_PMD_MASK; new_page = alloc_pages_node(node, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), @@ -1997,15 +1998,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, /* anon mapping, we can simply copy page->mapping to the new page: */ new_page->mapping = page->mapping; new_page->index = page->index; + /* flush the cache before copying using the kernel virtual address */ + flush_cache_range(vma, start, start + HPAGE_PMD_SIZE); migrate_page_copy(new_page, page); WARN_ON(PageLRU(new_page)); /* Recheck the target PMD */ - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) { spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); /* Reverse changes made by migrate_page_copy() */ if (TestClearPageActive(new_page)) @@ -2029,16 +2030,26 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); /* - * Clear the old entry under pagetable lock and establish the new PTE. - * Any parallel GUP will either observe the old page blocking on the - * page lock, block on the page table lock or observe the new page. - * The SetPageUptodate on the new page and page_add_new_anon_rmap - * guarantee the copy is visible before the pagetable update. + * Overwrite the old entry under pagetable lock and establish + * the new PTE. Any parallel GUP will either observe the old + * page blocking on the page lock, block on the page table + * lock or observe the new page. The SetPageUptodate on the + * new page and page_add_new_anon_rmap guarantee the copy is + * visible before the pagetable update. */ - flush_cache_range(vma, mmun_start, mmun_end); - page_add_anon_rmap(new_page, vma, mmun_start, true); - pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); - set_pmd_at(mm, mmun_start, pmd, entry); + page_add_anon_rmap(new_page, vma, start, true); + /* + * At this point the pmd is numa/protnone (i.e. non present) and the TLB + * has already been flushed globally. So no TLB can be currently + * caching this non present pmd mapping. There's no need to clear the + * pmd before doing set_pmd_at(), nor to flush the TLB after + * set_pmd_at(). Clearing the pmd here would introduce a race + * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the + * mmap_sem for reading. If the pmd is set to NULL at any given time, + * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this + * pmd. + */ + set_pmd_at(mm, start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); page_ref_unfreeze(page, 2); @@ -2047,11 +2058,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); spin_unlock(ptl); - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above pmdp_huge_clear_flush_notify() did already call it. - */ - mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); /* Take an "isolate" reference and put new page on the LRU. */ get_page(new_page); @@ -2075,7 +2081,7 @@ out_fail: ptl = pmd_lock(mm, pmd); if (pmd_same(*pmd, entry)) { entry = pmd_modify(entry, vma->vm_page_prot); - set_pmd_at(mm, mmun_start, pmd, entry); + set_pmd_at(mm, start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); } spin_unlock(ptl); diff --git a/mm/mmap.c b/mm/mmap.c @@ -191,16 +191,19 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long retval; - unsigned long newbrk, oldbrk; + unsigned long newbrk, oldbrk, origbrk; struct mm_struct *mm = current->mm; struct vm_area_struct *next; unsigned long min_brk; bool populate; + bool downgraded = false; LIST_HEAD(uf); if (down_write_killable(&mm->mmap_sem)) return -EINTR; + origbrk = mm->brk; + #ifdef CONFIG_COMPAT_BRK /* * CONFIG_COMPAT_BRK can still be overridden by setting @@ -229,14 +232,32 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) newbrk = PAGE_ALIGN(brk); oldbrk = PAGE_ALIGN(mm->brk); - if (oldbrk == newbrk) - goto set_brk; + if (oldbrk == newbrk) { + mm->brk = brk; + goto success; + } - /* Always allow shrinking brk. */ + /* + * Always allow shrinking brk. + * __do_munmap() may downgrade mmap_sem to read. + */ if (brk <= mm->brk) { - if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf)) - goto set_brk; - goto out; + int ret; + + /* + * mm->brk must to be protected by write mmap_sem so update it + * before downgrading mmap_sem. When __do_munmap() fails, + * mm->brk will be restored from origbrk. + */ + mm->brk = brk; + ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true); + if (ret < 0) { + mm->brk = origbrk; + goto out; + } else if (ret == 1) { + downgraded = true; + } + goto success; } /* Check against existing mmap mappings. */ @@ -247,18 +268,21 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* Ok, looks good - let it rip. */ if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) goto out; - -set_brk: mm->brk = brk; + +success: populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; - up_write(&mm->mmap_sem); + if (downgraded) + up_read(&mm->mmap_sem); + else + up_write(&mm->mmap_sem); userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; out: - retval = mm->brk; + retval = origbrk; up_write(&mm->mmap_sem); return retval; } @@ -2687,8 +2711,8 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, * work. This now handles partial unmappings. * Jeremy Fitzhardinge <jeremy@goop.org> */ -int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, - struct list_head *uf) +int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, + struct list_head *uf, bool downgrade) { unsigned long end; struct vm_area_struct *vma, *prev, *last; @@ -2770,25 +2794,38 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, mm->locked_vm -= vma_pages(tmp); munlock_vma_pages_all(tmp); } + tmp = tmp->vm_next; } } - /* - * Remove the vma's, and unmap the actual pages - */ + /* Detach vmas from rbtree */ detach_vmas_to_be_unmapped(mm, vma, prev, end); - unmap_region(mm, vma, prev, start, end); + /* + * mpx unmap needs to be called with mmap_sem held for write. + * It is safe to call it before unmap_region(). + */ arch_unmap(mm, vma, start, end); + if (downgrade) + downgrade_write(&mm->mmap_sem); + + unmap_region(mm, vma, prev, start, end); + /* Fix up all other VM information */ remove_vma_list(mm, vma); - return 0; + return downgrade ? 1 : 0; } -int vm_munmap(unsigned long start, size_t len) +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, + struct list_head *uf) +{ + return __do_munmap(mm, start, len, uf, false); +} + +static int __vm_munmap(unsigned long start, size_t len, bool downgrade) { int ret; struct mm_struct *mm = current->mm; @@ -2797,17 +2834,32 @@ int vm_munmap(unsigned long start, size_t len) if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = do_munmap(mm, start, len, &uf); - up_write(&mm->mmap_sem); + ret = __do_munmap(mm, start, len, &uf, downgrade); + /* + * Returning 1 indicates mmap_sem is downgraded. + * But 1 is not legal return value of vm_munmap() and munmap(), reset + * it to 0 before return. + */ + if (ret == 1) { + up_read(&mm->mmap_sem); + ret = 0; + } else + up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); return ret; } + +int vm_munmap(unsigned long start, size_t len) +{ + return __vm_munmap(start, len, false); +} EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { profile_munmap(addr); - return vm_munmap(addr, len); + return __vm_munmap(addr, len, true); } diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c @@ -247,37 +247,6 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range); -/* - * Must be called while holding mm->mmap_sem for either read or write. - * The result is guaranteed to be valid until mm->mmap_sem is dropped. - */ -bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) -{ - struct mmu_notifier *mn; - int id; - bool ret = false; - - WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem)); - - if (!mm_has_notifiers(mm)) - return ret; - - id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (!mn->ops->invalidate_range && - !mn->ops->invalidate_range_start && - !mn->ops->invalidate_range_end) - continue; - - if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) { - ret = true; - break; - } - } - srcu_read_unlock(&srcu, id); - return ret; -} - static int do_mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm, int take_mmap_sem) diff --git a/mm/mremap.c b/mm/mremap.c @@ -521,6 +521,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long ret = -EINVAL; unsigned long charged = 0; bool locked = false; + bool downgraded = false; struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; LIST_HEAD(uf_unmap_early); LIST_HEAD(uf_unmap); @@ -557,12 +558,20 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. - * do_munmap does all the needed commit accounting + * __do_munmap does all the needed commit accounting, and + * downgrades mmap_sem to read if so directed. */ if (old_len >= new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap); - if (ret && old_len != new_len) + int retval; + + retval = __do_munmap(mm, addr+new_len, old_len - new_len, + &uf_unmap, true); + if (retval < 0 && old_len != new_len) { + ret = retval; goto out; + /* Returning 1 indicates mmap_sem is downgraded to read. */ + } else if (retval == 1) + downgraded = true; ret = addr; goto out; } @@ -627,7 +636,10 @@ out: vm_unacct_memory(charged); locked = 0; } - up_write(&current->mm->mmap_sem); + if (downgraded) + up_read(&current->mm->mmap_sem); + else + up_write(&current->mm->mmap_sem); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); userfaultfd_unmap_complete(mm, &uf_unmap_early); diff --git a/mm/nommu.c b/mm/nommu.c @@ -1709,11 +1709,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, return ret; } -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + unsigned int foll_flags) { - *page_mask = 0; return NULL; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c @@ -2149,6 +2149,13 @@ EXPORT_SYMBOL(tag_pages_for_writeback); * not miss some pages (e.g., because some other process has cleared TOWRITE * tag we set). The rule we follow is that TOWRITE tag can be cleared only * by the process clearing the DIRTY tag (and submitting the page for IO). + * + * To avoid deadlocks between range_cyclic writeback and callers that hold + * pages in PageWriteback to aggregate IO until write_cache_pages() returns, + * we do not loop back to the start of the file. Doing so causes a page + * lock/page writeback access order inversion - we should only ever lock + * multiple pages in ascending page->index order, and looping back to the start + * of the file violates that rule and causes deadlocks. */ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, @@ -2162,7 +2169,6 @@ int write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; - int cycled; int range_whole = 0; int tag; @@ -2170,23 +2176,17 @@ int write_cache_pages(struct address_space *mapping, if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; - if (index == 0) - cycled = 1; - else - cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - cycled = 1; /* ignore range_cyclic tests */ } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; -retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; @@ -2272,17 +2272,14 @@ continue_unlock: pagevec_release(&pvec); cond_resched(); } - if (!cycled && !done) { - /* - * range_cyclic: - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - cycled = 1; - index = 0; - end = writeback_index - 1; - goto retry; - } + + /* + * If we hit the last page and there is more work to be done: wrap + * back the index back to the start of the file for the next + * time we are called. + */ + if (wbc->range_cyclic && !done) + done_index = 0; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; diff --git a/mm/page_alloc.c b/mm/page_alloc.c @@ -66,6 +66,7 @@ #include <linux/ftrace.h> #include <linux/lockdep.h> #include <linux/nmi.h> +#include <linux/psi.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -306,24 +307,33 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn) } /* - * Returns false when the remaining initialisation should be deferred until + * Returns true when the remaining initialisation should be deferred until * later in the boot cycle when it can be parallelised. */ -static inline bool update_defer_init(pg_data_t *pgdat, - unsigned long pfn, unsigned long zone_end, - unsigned long *nr_initialised) +static bool __meminit +defer_init(int nid, unsigned long pfn, unsigned long end_pfn) { + static unsigned long prev_end_pfn, nr_initialised; + + /* + * prev_end_pfn static that contains the end of previous zone + * No need to protect because called very early in boot before smp_init. + */ + if (prev_end_pfn != end_pfn) { + prev_end_pfn = end_pfn; + nr_initialised = 0; + } + /* Always populate low zones for address-constrained allocations */ - if (zone_end < pgdat_end_pfn(pgdat)) - return true; - (*nr_initialised)++; - if ((*nr_initialised > pgdat->static_init_pgcnt) && - (pfn & (PAGES_PER_SECTION - 1)) == 0) { - pgdat->first_deferred_pfn = pfn; + if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) return false; + nr_initialised++; + if ((nr_initialised > NODE_DATA(nid)->static_init_pgcnt) && + (pfn & (PAGES_PER_SECTION - 1)) == 0) { + NODE_DATA(nid)->first_deferred_pfn = pfn; + return true; } - - return true; + return false; } #else static inline bool early_page_uninitialised(unsigned long pfn) @@ -331,11 +341,9 @@ static inline bool early_page_uninitialised(unsigned long pfn) return false; } -static inline bool update_defer_init(pg_data_t *pgdat, - unsigned long pfn, unsigned long zone_end, - unsigned long *nr_initialised) +static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) { - return true; + return false; } #endif @@ -1231,7 +1239,12 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) /* Avoid false-positive PageTail() */ INIT_LIST_HEAD(&page->lru); - SetPageReserved(page); + /* + * no need for atomic set_bit because the struct + * page is not visible yet so nobody should + * access it yet. + */ + __SetPageReserved(page); } } } @@ -2015,10 +2028,6 @@ static int move_freepages(struct zone *zone, pfn_valid(page_to_pfn(end_page)) && page_zone(start_page) != page_zone(end_page)); #endif - - if (num_movable) - *num_movable = 0; - for (page = start_page; page <= end_page;) { if (!pfn_valid_within(page_to_pfn(page))) { page++; @@ -2058,6 +2067,9 @@ int move_freepages_block(struct zone *zone, struct page *page, unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; + if (num_movable) + *num_movable = 0; + start_pfn = page_to_pfn(page); start_pfn = start_pfn & ~(pageblock_nr_pages-1); start_page = pfn_to_page(start_pfn); @@ -3366,26 +3378,12 @@ try_this_zone: return NULL; } -/* - * Large machines with many possible nodes should not always dump per-node - * meminfo in irq context. - */ -static inline bool should_suppress_show_mem(void) -{ - bool ret = false; - -#if NODES_SHIFT > 8 - ret = in_interrupt(); -#endif - return ret; -} - static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) { unsigned int filter = SHOW_MEM_FILTER_NODES; static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); - if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) + if (!__ratelimit(&show_mem_rs)) return; /* @@ -3549,15 +3547,20 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, enum compact_priority prio, enum compact_result *compact_result) { struct page *page; + unsigned long pflags; unsigned int noreclaim_flag; if (!order) return NULL; + psi_memstall_enter(&pflags); noreclaim_flag = memalloc_noreclaim_save(); + *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, prio); + memalloc_noreclaim_restore(noreclaim_flag); + psi_memstall_leave(&pflags); if (*compact_result <= COMPACT_INACTIVE) return NULL; @@ -3756,11 +3759,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct reclaim_state reclaim_state; int progress; unsigned int noreclaim_flag; + unsigned long pflags; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); + psi_memstall_enter(&pflags); fs_reclaim_acquire(gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); reclaim_state.reclaimed_slab = 0; @@ -3772,6 +3777,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, current->reclaim_state = NULL; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(gfp_mask); + psi_memstall_leave(&pflags); cond_resched(); @@ -3922,6 +3928,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, { struct zone *zone; struct zoneref *z; + bool ret = false; /* * Costly allocations might have made a progress but this doesn't mean @@ -3985,25 +3992,24 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, } } - /* - * Memory allocation/reclaim might be called from a WQ - * context and the current implementation of the WQ - * concurrency control doesn't recognize that - * a particular WQ is congested if the worker thread is - * looping without ever sleeping. Therefore we have to - * do a short sleep here rather than calling - * cond_resched(). - */ - if (current->flags & PF_WQ_WORKER) - schedule_timeout_uninterruptible(1); - else - cond_resched(); - - return true; + ret = true; + goto out; } } - return false; +out: + /* + * Memory allocation/reclaim might be called from a WQ context and the + * current implementation of the WQ concurrency control doesn't + * recognize that a particular WQ is congested if the worker thread is + * looping without ever sleeping. Therefore we have to do a short sleep + * here rather than calling cond_resched(). + */ + if (current->flags & PF_WQ_WORKER) + schedule_timeout_uninterruptible(1); + else + cond_resched(); + return ret; } static inline bool @@ -4701,6 +4707,7 @@ long si_mem_available(void) unsigned long pagecache; unsigned long wmark_low = 0; unsigned long pages[NR_LRU_LISTS]; + unsigned long reclaimable; struct zone *zone; int lru; @@ -4726,19 +4733,13 @@ long si_mem_available(void) available += pagecache; /* - * Part of the reclaimable slab consists of items that are in use, - * and cannot be freed. Cap this estimate at the low watermark. + * Part of the reclaimable slab and other kernel memory consists of + * items that are in use, and cannot be freed. Cap this estimate at the + * low watermark. */ - available += global_node_page_state(NR_SLAB_RECLAIMABLE) - - min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, - wmark_low); - - /* - * Part of the kernel memory, which can be released under memory - * pressure. - */ - available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >> - PAGE_SHIFT; + reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) + + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); + available += reclaimable - min(reclaimable / 2, wmark_low); if (available < 0) available = 0; @@ -5449,6 +5450,30 @@ void __ref build_all_zonelists(pg_data_t *pgdat) #endif } +/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */ +static bool __meminit +overlap_memmap_init(unsigned long zone, unsigned long *pfn) +{ +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + static struct memblock_region *r; + + if (mirrored_kernelcore && zone == ZONE_MOVABLE) { + if (!r || *pfn >= memblock_region_memory_end_pfn(r)) { + for_each_memblock(memory, r) { + if (*pfn < memblock_region_memory_end_pfn(r)) + break; + } + } + if (*pfn >= memblock_region_memory_base_pfn(r) && + memblock_is_mirror(r)) { + *pfn = memblock_region_memory_end_pfn(r); + return true; + } + } +#endif + return false; +} + /* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is @@ -5458,67 +5483,118 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, enum memmap_context context, struct vmem_altmap *altmap) { - unsigned long end_pfn = start_pfn + size; - pg_data_t *pgdat = NODE_DATA(nid); - unsigned long pfn; - unsigned long nr_initialised = 0; + unsigned long pfn, end_pfn = start_pfn + size; struct page *page; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - struct memblock_region *r = NULL, *tmp; -#endif if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; +#ifdef CONFIG_ZONE_DEVICE /* * Honor reservation requested by the driver for this ZONE_DEVICE - * memory + * memory. We limit the total number of pages to initialize to just + * those that might contain the memory mapping. We will defer the + * ZONE_DEVICE page initialization until after we have released + * the hotplug lock. */ - if (altmap && start_pfn == altmap->base_pfn) - start_pfn += altmap->reserve; + if (zone == ZONE_DEVICE) { + if (!altmap) + return; + + if (start_pfn == altmap->base_pfn) + start_pfn += altmap->reserve; + end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); + } +#endif for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* * There can be holes in boot-time mem_map[]s handed to this * function. They do not exist on hotplugged memory. */ - if (context != MEMMAP_EARLY) - goto not_early; - - if (!early_pfn_valid(pfn)) - continue; - if (!early_pfn_in_nid(pfn, nid)) - continue; - if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) - break; - -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - /* - * Check given memblock attribute by firmware which can affect - * kernel memory layout. If zone==ZONE_MOVABLE but memory is - * mirrored, it's an overlapped memmap init. skip it. - */ - if (mirrored_kernelcore && zone == ZONE_MOVABLE) { - if (!r || pfn >= memblock_region_memory_end_pfn(r)) { - for_each_memblock(memory, tmp) - if (pfn < memblock_region_memory_end_pfn(tmp)) - break; - r = tmp; - } - if (pfn >= memblock_region_memory_base_pfn(r) && - memblock_is_mirror(r)) { - /* already initialized as NORMAL */ - pfn = memblock_region_memory_end_pfn(r); + if (context == MEMMAP_EARLY) { + if (!early_pfn_valid(pfn)) continue; - } + if (!early_pfn_in_nid(pfn, nid)) + continue; + if (overlap_memmap_init(zone, &pfn)) + continue; + if (defer_init(nid, pfn, end_pfn)) + break; } -#endif -not_early: page = pfn_to_page(pfn); __init_single_page(page, pfn, zone, nid); if (context == MEMMAP_HOTPLUG) - SetPageReserved(page); + __SetPageReserved(page); + + /* + * Mark the block movable so that blocks are reserved for + * movable at startup. This will force kernel allocations + * to reserve their blocks rather than leaking throughout + * the address space during boot when many long-lived + * kernel allocations are made. + * + * bitmap is created for zone's valid pfn range. but memmap + * can be created for invalid pages (for alignment) + * check here not to call set_pageblock_migratetype() against + * pfn out of zone. + */ + if (!(pfn & (pageblock_nr_pages - 1))) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + cond_resched(); + } + } +} + +#ifdef CONFIG_ZONE_DEVICE +void __ref memmap_init_zone_device(struct zone *zone, + unsigned long start_pfn, + unsigned long size, + struct dev_pagemap *pgmap) +{ + unsigned long pfn, end_pfn = start_pfn + size; + struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long zone_idx = zone_idx(zone); + unsigned long start = jiffies; + int nid = pgdat->node_id; + + if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone))) + return; + + /* + * The call to memmap_init_zone should have already taken care + * of the pages reserved for the memmap, so we can just jump to + * the end of that region and start processing the device pages. + */ + if (pgmap->altmap_valid) { + struct vmem_altmap *altmap = &pgmap->altmap; + + start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); + size = end_pfn - start_pfn; + } + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + struct page *page = pfn_to_page(pfn); + + __init_single_page(page, pfn, zone_idx, nid); + + /* + * Mark page reserved as it will need to wait for onlining + * phase for it to be fully associated with a zone. + * + * We can use the non-atomic __set_bit operation for setting + * the flag as we are still initializing the pages. + */ + __SetPageReserved(page); + + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back + * pointer and hmm_data. It is a bug if a ZONE_DEVICE + * page is ever freed or placed on a driver-private list. + */ + page->pgmap = pgmap; + page->hmm_data = 0; /* * Mark the block movable so that blocks are reserved for @@ -5540,8 +5616,12 @@ not_early: cond_resched(); } } + + pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev), + size, jiffies_to_msecs(jiffies - start)); } +#endif static void __meminit zone_init_free_lists(struct zone *zone) { unsigned int order, t; @@ -5551,10 +5631,11 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } -#ifndef __HAVE_ARCH_MEMMAP_INIT -#define memmap_init(size, nid, zone, start_pfn) \ - memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL) -#endif +void __meminit __weak memmap_init(unsigned long size, int nid, + unsigned long zone, unsigned long start_pfn) +{ + memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL); +} static int zone_batchsize(struct zone *zone) { @@ -6428,45 +6509,65 @@ void __init free_area_init_node(int nid, unsigned long *zones_size, } #if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) + +/* + * Zero all valid struct pages in range [spfn, epfn), return number of struct + * pages zeroed + */ +static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn) +{ + unsigned long pfn; + u64 pgcnt = 0; + + for (pfn = spfn; pfn < epfn; pfn++) { + if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { + pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) + + pageblock_nr_pages - 1; + continue; + } + mm_zero_struct_page(pfn_to_page(pfn)); + pgcnt++; + } + + return pgcnt; +} + /* * Only struct pages that are backed by physical memory are zeroed and * initialized by going through __init_single_page(). But, there are some * struct pages which are reserved in memblock allocator and their fields * may be accessed (for example page_to_pfn() on some configuration accesses * flags). We must explicitly zero those struct pages. + * + * This function also addresses a similar issue where struct pages are left + * uninitialized because the physical address range is not covered by + * memblock.memory or memblock.reserved. That could happen when memblock + * layout is manually configured via memmap=. */ void __init zero_resv_unavail(void) { phys_addr_t start, end; - unsigned long pfn; u64 i, pgcnt; + phys_addr_t next = 0; /* - * Loop through ranges that are reserved, but do not have reported - * physical memory backing. + * Loop through unavailable ranges not covered by memblock.memory. */ pgcnt = 0; - for_each_resv_unavail_range(i, &start, &end) { - for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) { - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { - pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) - + pageblock_nr_pages - 1; - continue; - } - mm_zero_struct_page(pfn_to_page(pfn)); - pgcnt++; - } + for_each_mem_range(i, &memblock.memory, NULL, + NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) { + if (next < start) + pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start)); + next = end; } + pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn); /* * Struct pages that do not have backing memory. This could be because * firmware is using some of this memory, or for some other reasons. - * Once memblock is changed so such behaviour is not allowed: i.e. - * list of "reserved" memory must be a subset of list of "memory", then - * this code can be removed. */ if (pgcnt) - pr_info("Reserved but unavailable: %lld pages", pgcnt); + pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt); } #endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */ @@ -6803,15 +6904,12 @@ static void check_for_memory(pg_data_t *pgdat, int nid) { enum zone_type zone_type; - if (N_MEMORY == N_NORMAL_MEMORY) - return; - for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; if (populated_zone(zone)) { - node_set_state(nid, N_HIGH_MEMORY); - if (N_NORMAL_MEMORY != N_HIGH_MEMORY && - zone_type <= ZONE_NORMAL) + if (IS_ENABLED(CONFIG_HIGHMEM)) + node_set_state(nid, N_HIGH_MEMORY); + if (zone_type <= ZONE_NORMAL) node_set_state(nid, N_NORMAL_MEMORY); break; } diff --git a/mm/page_io.c b/mm/page_io.c @@ -283,7 +283,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, struct swap_info_struct *sis = page_swap_info(page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_FS) { struct kiocb kiocb; struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; @@ -365,7 +365,7 @@ int swap_readpage(struct page *page, bool synchronous) goto out; } - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_FS) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; @@ -423,7 +423,7 @@ int swap_set_page_dirty(struct page *page) { struct swap_info_struct *sis = page_swap_info(page); - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_FS) { struct address_space *mapping = sis->swap_file->f_mapping; VM_BUG_ON_PAGE(!PageSwapCache(page), page); diff --git a/mm/slab.c b/mm/slab.c @@ -1288,7 +1288,7 @@ void __init kmem_cache_init(void) * Initialize the caches that provide memory for the kmem_cache_node * structures first. Without this, further allocations will bug. */ - kmalloc_caches[INDEX_NODE] = create_kmalloc_cache( + kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache( kmalloc_info[INDEX_NODE].name, kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS, 0, kmalloc_size(INDEX_NODE)); @@ -1304,7 +1304,7 @@ void __init kmem_cache_init(void) for_each_online_node(nid) { init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); - init_list(kmalloc_caches[INDEX_NODE], + init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE], &init_kmem_cache_node[SIZE_NODE + nid], nid); } } @@ -3675,6 +3675,8 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) struct kmem_cache *cachep; void *ret; + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) + return NULL; cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; @@ -3710,6 +3712,8 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, struct kmem_cache *cachep; void *ret; + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) + return NULL; cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; diff --git a/mm/slab_common.c b/mm/slab_common.c @@ -973,14 +973,10 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, return s; } -struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init; +struct kmem_cache * +kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init; EXPORT_SYMBOL(kmalloc_caches); -#ifdef CONFIG_ZONE_DMA -struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init; -EXPORT_SYMBOL(kmalloc_dma_caches); -#endif - /* * Conversion table for small slabs sizes / 8 to the index in the * kmalloc array. This is necessary for slabs < 192 since we have non power @@ -1027,25 +1023,20 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) { unsigned int index; - if (unlikely(size > KMALLOC_MAX_SIZE)) { - WARN_ON_ONCE(!(flags & __GFP_NOWARN)); - return NULL; - } - if (size <= 192) { if (!size) return ZERO_SIZE_PTR; index = size_index[size_index_elem(size)]; - } else + } else { + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { + WARN_ON(1); + return NULL; + } index = fls(size - 1); + } -#ifdef CONFIG_ZONE_DMA - if (unlikely((flags & GFP_DMA))) - return kmalloc_dma_caches[index]; - -#endif - return kmalloc_caches[index]; + return kmalloc_caches[kmalloc_type(flags)][index]; } /* @@ -1059,15 +1050,15 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = { {"kmalloc-16", 16}, {"kmalloc-32", 32}, {"kmalloc-64", 64}, {"kmalloc-128", 128}, {"kmalloc-256", 256}, {"kmalloc-512", 512}, - {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048}, - {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192}, - {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768}, - {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072}, - {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288}, - {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152}, - {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608}, - {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432}, - {"kmalloc-67108864", 67108864} + {"kmalloc-1k", 1024}, {"kmalloc-2k", 2048}, + {"kmalloc-4k", 4096}, {"kmalloc-8k", 8192}, + {"kmalloc-16k", 16384}, {"kmalloc-32k", 32768}, + {"kmalloc-64k", 65536}, {"kmalloc-128k", 131072}, + {"kmalloc-256k", 262144}, {"kmalloc-512k", 524288}, + {"kmalloc-1M", 1048576}, {"kmalloc-2M", 2097152}, + {"kmalloc-4M", 4194304}, {"kmalloc-8M", 8388608}, + {"kmalloc-16M", 16777216}, {"kmalloc-32M", 33554432}, + {"kmalloc-64M", 67108864} }; /* @@ -1117,9 +1108,36 @@ void __init setup_kmalloc_cache_index_table(void) } } -static void __init new_kmalloc_cache(int idx, slab_flags_t flags) +static const char * +kmalloc_cache_name(const char *prefix, unsigned int size) +{ + + static const char units[3] = "\0kM"; + int idx = 0; + + while (size >= 1024 && (size % 1024 == 0)) { + size /= 1024; + idx++; + } + + return kasprintf(GFP_NOWAIT, "%s-%u%c", prefix, size, units[idx]); +} + +static void __init +new_kmalloc_cache(int idx, int type, slab_flags_t flags) { - kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, + const char *name; + + if (type == KMALLOC_RECLAIM) { + flags |= SLAB_RECLAIM_ACCOUNT; + name = kmalloc_cache_name("kmalloc-rcl", + kmalloc_info[idx].size); + BUG_ON(!name); + } else { + name = kmalloc_info[idx].name; + } + + kmalloc_caches[type][idx] = create_kmalloc_cache(name, kmalloc_info[idx].size, flags, 0, kmalloc_info[idx].size); } @@ -1131,21 +1149,25 @@ static void __init new_kmalloc_cache(int idx, slab_flags_t flags) */ void __init create_kmalloc_caches(slab_flags_t flags) { - int i; + int i, type; - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { - if (!kmalloc_caches[i]) - new_kmalloc_cache(i, flags); + for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) { + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { + if (!kmalloc_caches[type][i]) + new_kmalloc_cache(i, type, flags); - /* - * Caches that are not of the two-to-the-power-of size. - * These have to be created immediately after the - * earlier power of two caches - */ - if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) - new_kmalloc_cache(1, flags); - if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) - new_kmalloc_cache(2, flags); + /* + * Caches that are not of the two-to-the-power-of size. + * These have to be created immediately after the + * earlier power of two caches + */ + if (KMALLOC_MIN_SIZE <= 32 && i == 6 && + !kmalloc_caches[type][1]) + new_kmalloc_cache(1, type, flags); + if (KMALLOC_MIN_SIZE <= 64 && i == 7 && + !kmalloc_caches[type][2]) + new_kmalloc_cache(2, type, flags); + } } /* Kmalloc array is now usable */ @@ -1153,16 +1175,15 @@ void __init create_kmalloc_caches(slab_flags_t flags) #ifdef CONFIG_ZONE_DMA for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { - struct kmem_cache *s = kmalloc_caches[i]; + struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i]; if (s) { unsigned int size = kmalloc_size(i); - char *n = kasprintf(GFP_NOWAIT, - "dma-kmalloc-%u", size); + const char *n = kmalloc_cache_name("dma-kmalloc", size); BUG_ON(!n); - kmalloc_dma_caches[i] = create_kmalloc_cache(n, - size, SLAB_CACHE_DMA | flags, 0, 0); + kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache( + n, size, SLAB_CACHE_DMA | flags, 0, 0); } } #endif diff --git a/mm/slub.c b/mm/slub.c @@ -1276,16 +1276,54 @@ out: __setup("slub_debug", setup_slub_debug); +/* + * kmem_cache_flags - apply debugging options to the cache + * @object_size: the size of an object without meta data + * @flags: flags to set + * @name: name of the cache + * @ctor: constructor function + * + * Debug option(s) are applied to @flags. In addition to the debug + * option(s), if a slab name (or multiple) is specified i.e. + * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ... + * then only the select slabs will receive the debug option(s). + */ slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name, void (*ctor)(void *)) { - /* - * Enable debugging if selected on the kernel commandline. - */ - if (slub_debug && (!slub_debug_slabs || (name && - !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))) - flags |= slub_debug; + char *iter; + size_t len; + + /* If slub_debug = 0, it folds into the if conditional. */ + if (!slub_debug_slabs) + return flags | slub_debug; + + len = strlen(name); + iter = slub_debug_slabs; + while (*iter) { + char *end, *glob; + size_t cmplen; + + end = strchr(iter, ','); + if (!end) + end = iter + strlen(iter); + + glob = strnchr(iter, end - iter, '*'); + if (glob) + cmplen = glob - iter; + else + cmplen = max_t(size_t, len, (end - iter)); + + if (!strncmp(name, iter, cmplen)) { + flags |= slub_debug; + break; + } + + if (!*end) + break; + iter = end + 1; + } return flags; } @@ -3621,9 +3659,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); void *p; - unsigned long *map = kcalloc(BITS_TO_LONGS(page->objects), - sizeof(long), - GFP_ATOMIC); + unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC); if (!map) return; slab_err(s, page, text, s->name); @@ -3638,7 +3674,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, } } slab_unlock(page); - kfree(map); + bitmap_free(map); #endif } @@ -4411,10 +4447,8 @@ static long validate_slab_cache(struct kmem_cache *s) { int node; unsigned long count = 0; - unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)), - sizeof(unsigned long), - GFP_KERNEL); struct kmem_cache_node *n; + unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); if (!map) return -ENOMEM; @@ -4422,7 +4456,7 @@ static long validate_slab_cache(struct kmem_cache *s) flush_all(s); for_each_kmem_cache_node(s, node, n) count += validate_slab_node(s, n, map); - kfree(map); + bitmap_free(map); return count; } /* @@ -4573,14 +4607,12 @@ static int list_locations(struct kmem_cache *s, char *buf, unsigned long i; struct loc_track t = { 0, 0, NULL }; int node; - unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)), - sizeof(unsigned long), - GFP_KERNEL); struct kmem_cache_node *n; + unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) { - kfree(map); + bitmap_free(map); return sprintf(buf, "Out of memory\n"); } /* Push back cpu slabs */ @@ -4646,7 +4678,7 @@ static int list_locations(struct kmem_cache *s, char *buf, } free_loc_track(&t); - kfree(map); + bitmap_free(map); if (!t.count) len += sprintf(buf, "No data\n"); return len; @@ -4657,6 +4689,7 @@ static int list_locations(struct kmem_cache *s, char *buf, static void __init resiliency_test(void) { u8 *p; + int type = KMALLOC_NORMAL; BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); @@ -4669,7 +4702,7 @@ static void __init resiliency_test(void) pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", p + 16); - validate_slab_cache(kmalloc_caches[4]); + validate_slab_cache(kmalloc_caches[type][4]); /* Hmmm... The next two are dangerous */ p = kzalloc(32, GFP_KERNEL); @@ -4678,33 +4711,33 @@ static void __init resiliency_test(void) p); pr_err("If allocated object is overwritten then not detectable\n\n"); - validate_slab_cache(kmalloc_caches[5]); + validate_slab_cache(kmalloc_caches[type][5]); p = kzalloc(64, GFP_KERNEL); p += 64 + (get_cycles() & 0xff) * sizeof(void *); *p = 0x56; pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", p); pr_err("If allocated object is overwritten then not detectable\n\n"); - validate_slab_cache(kmalloc_caches[6]); + validate_slab_cache(kmalloc_caches[type][6]); pr_err("\nB. Corruption after free\n"); p = kzalloc(128, GFP_KERNEL); kfree(p); *p = 0x78; pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[7]); + validate_slab_cache(kmalloc_caches[type][7]); p = kzalloc(256, GFP_KERNEL); kfree(p); p[50] = 0x9a; pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[8]); + validate_slab_cache(kmalloc_caches[type][8]); p = kzalloc(512, GFP_KERNEL); kfree(p); p[512] = 0xab; pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[9]); + validate_slab_cache(kmalloc_caches[type][9]); } #else #ifdef CONFIG_SYSFS diff --git a/mm/sparse.c b/mm/sparse.c @@ -696,13 +696,11 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat, goto out; } -#ifdef CONFIG_DEBUG_VM /* * Poison uninitialized struct pages in order to catch invalid flags * combinations. */ - memset(memmap, PAGE_POISON_PATTERN, sizeof(struct page) * PAGES_PER_SECTION); -#endif + page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION); section_mark_present(ms); sparse_init_one_section(ms, section_nr, memmap, usemap); diff --git a/mm/swap.c b/mm/swap.c @@ -29,7 +29,6 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/backing-dev.h> -#include <linux/memremap.h> #include <linux/memcontrol.h> #include <linux/gfp.h> #include <linux/uio.h> diff --git a/mm/swap_state.c b/mm/swap_state.c @@ -448,6 +448,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Initiate read into locked page and return. */ + SetPageWorkingset(new_page); lru_cache_add_anon(new_page); *new_page_allocated = true; return new_page; diff --git a/mm/swapfile.c b/mm/swapfile.c @@ -103,26 +103,39 @@ static inline unsigned char swap_count(unsigned char ent) return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ } +/* Reclaim the swap entry anyway if possible */ +#define TTRS_ANYWAY 0x1 +/* + * Reclaim the swap entry if there are no more mappings of the + * corresponding page + */ +#define TTRS_UNMAPPED 0x2 +/* Reclaim the swap entry if swap is getting full*/ +#define TTRS_FULL 0x4 + /* returns 1 if swap entry is freed */ -static int -__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) +static int __try_to_reclaim_swap(struct swap_info_struct *si, + unsigned long offset, unsigned long flags) { swp_entry_t entry = swp_entry(si->type, offset); struct page *page; int ret = 0; - page = find_get_page(swap_address_space(entry), swp_offset(entry)); + page = find_get_page(swap_address_space(entry), offset); if (!page) return 0; /* - * This function is called from scan_swap_map() and it's called - * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here. - * We have to use trylock for avoiding deadlock. This is a special + * When this function is called from scan_swap_map_slots() and it's + * called by vmscan.c at reclaiming pages. So, we hold a lock on a page, + * here. We have to use trylock for avoiding deadlock. This is a special * case and you should use try_to_free_swap() with explicit lock_page() * in usual operations. */ if (trylock_page(page)) { - ret = try_to_free_swap(page); + if ((flags & TTRS_ANYWAY) || + ((flags & TTRS_UNMAPPED) && !page_mapped(page)) || + ((flags & TTRS_FULL) && mem_cgroup_swap_full(page))) + ret = try_to_free_swap(page); unlock_page(page); } put_page(page); @@ -780,7 +793,7 @@ checks: int swap_was_freed; unlock_cluster(ci); spin_unlock(&si->lock); - swap_was_freed = __try_to_reclaim_swap(si, offset); + swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ if (swap_was_freed) @@ -919,6 +932,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) struct swap_cluster_info *ci; ci = lock_cluster(si, offset); + memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); cluster_set_count_flag(ci, 0, 0); free_cluster(si, idx); unlock_cluster(ci); @@ -989,7 +1003,7 @@ start_over: goto nextsi; } if (size == SWAPFILE_CLUSTER) { - if (!(si->flags & SWP_FILE)) + if (!(si->flags & SWP_FS)) n_ret = swap_alloc_cluster(si, swp_entries); } else n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, @@ -1169,6 +1183,8 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p, ci = lock_cluster_or_swap_info(p, offset); usage = __swap_entry_free_locked(p, offset, usage); unlock_cluster_or_swap_info(p, ci); + if (!usage) + free_swap_slot(entry); return usage; } @@ -1199,10 +1215,8 @@ void swap_free(swp_entry_t entry) struct swap_info_struct *p; p = _swap_info_get(entry); - if (p) { - if (!__swap_entry_free(p, entry, 1)) - free_swap_slot(entry); - } + if (p) + __swap_entry_free(p, entry, 1); } /* @@ -1237,9 +1251,6 @@ void put_swap_page(struct page *page, swp_entry_t entry) if (free_entries == SWAPFILE_CLUSTER) { unlock_cluster_or_swap_info(si, ci); spin_lock(&si->lock); - ci = lock_cluster(si, offset); - memset(map, 0, SWAPFILE_CLUSTER); - unlock_cluster(ci); mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); swap_free_cluster(si, idx); spin_unlock(&si->lock); @@ -1612,7 +1623,6 @@ int try_to_free_swap(struct page *page) int free_swap_and_cache(swp_entry_t entry) { struct swap_info_struct *p; - struct page *page = NULL; unsigned char count; if (non_swap_entry(entry)) @@ -1622,30 +1632,9 @@ int free_swap_and_cache(swp_entry_t entry) if (p) { count = __swap_entry_free(p, entry, 1); if (count == SWAP_HAS_CACHE && - !swap_page_trans_huge_swapped(p, entry)) { - page = find_get_page(swap_address_space(entry), - swp_offset(entry)); - if (page && !trylock_page(page)) { - put_page(page); - page = NULL; - } - } else if (!count) - free_swap_slot(entry); - } - if (page) { - /* - * Not mapped elsewhere, or swap space full? Free it! - * Also recheck PageSwapCache now page is locked (above). - */ - if (PageSwapCache(page) && !PageWriteback(page) && - (!page_mapped(page) || mem_cgroup_swap_full(page)) && - !swap_page_trans_huge_swapped(p, entry)) { - page = compound_head(page); - delete_from_swap_cache(page); - SetPageDirty(page); - } - unlock_page(page); - put_page(page); + !swap_page_trans_huge_swapped(p, entry)) + __try_to_reclaim_swap(p, swp_offset(entry), + TTRS_UNMAPPED | TTRS_FULL); } return p != NULL; } @@ -2310,12 +2299,13 @@ static void destroy_swap_extents(struct swap_info_struct *sis) kfree(se); } - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_ACTIVATED) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; - sis->flags &= ~SWP_FILE; - mapping->a_ops->swap_deactivate(swap_file); + sis->flags &= ~SWP_ACTIVATED; + if (mapping->a_ops->swap_deactivate) + mapping->a_ops->swap_deactivate(swap_file); } } @@ -2364,6 +2354,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, list_add_tail(&new_se->list, &sis->first_swap_extent.list); return 1; } +EXPORT_SYMBOL_GPL(add_swap_extent); /* * A `swap extent' is a simple thing which maps a contiguous range of pages @@ -2411,8 +2402,10 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) if (mapping->a_ops->swap_activate) { ret = mapping->a_ops->swap_activate(sis, swap_file, span); + if (ret >= 0) + sis->flags |= SWP_ACTIVATED; if (!ret) { - sis->flags |= SWP_FILE; + sis->flags |= SWP_FS; ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; } diff --git a/mm/util.c b/mm/util.c @@ -435,7 +435,7 @@ EXPORT_SYMBOL(kvmalloc_node); * It is slightly more efficient to use kfree() or vfree() if you are certain * that you know which one to use. * - * Context: Any context except NMI. + * Context: Either preemptible task context or not-NMI interrupt. */ void kvfree(const void *addr) { @@ -678,8 +678,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) * Part of the kernel memory, which can be released * under memory pressure. */ - free += global_node_page_state( - NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT; + free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); /* * Leave reserved pages. The pages are not for anonymous pages. diff --git a/mm/vmalloc.c b/mm/vmalloc.c @@ -1577,6 +1577,8 @@ void vfree_atomic(const void *addr) * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling * conventions for vfree() arch-depenedent would be a really bad idea) * + * May sleep if called *not* from interrupt context. + * * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) */ void vfree(const void *addr) @@ -1585,6 +1587,8 @@ void vfree(const void *addr) kmemleak_free(addr); + might_sleep_if(!in_interrupt()); + if (!addr) return; if (unlikely(in_interrupt())) diff --git a/mm/vmscan.c b/mm/vmscan.c @@ -49,6 +49,7 @@ #include <linux/prefetch.h> #include <linux/printk.h> #include <linux/dax.h> +#include <linux/psi.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -473,9 +474,18 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); total_scan = nr; - delta = freeable >> priority; - delta *= 4; - do_div(delta, shrinker->seeks); + if (shrinker->seeks) { + delta = freeable >> priority; + delta *= 4; + do_div(delta, shrinker->seeks); + } else { + /* + * These objects don't require any IO to create. Trim + * them aggressively under memory pressure to keep + * them from causing refetches in the IO caches. + */ + delta = freeable / 2; + } /* * Make sure we apply some minimal pressure on default priority @@ -2145,6 +2155,7 @@ static void shrink_active_list(unsigned long nr_to_scan, } ClearPageActive(page); /* we are de-activating */ + SetPageWorkingset(page); list_add(&page->lru, &l_inactive); } @@ -2456,9 +2467,11 @@ out: /* * Scan types proportional to swappiness and * their relative recent reclaim efficiency. + * Make sure we don't miss the last page + * because of a round-off error. */ - scan = div64_u64(scan * fraction[file], - denominator); + scan = DIV64_U64_ROUND_UP(scan * fraction[file], + denominator); break; case SCAN_FILE: case SCAN_ANON: @@ -3302,6 +3315,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, { struct zonelist *zonelist; unsigned long nr_reclaimed; + unsigned long pflags; int nid; unsigned int noreclaim_flag; struct scan_control sc = { @@ -3330,9 +3344,13 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.gfp_mask, sc.reclaim_idx); + psi_memstall_enter(&pflags); noreclaim_flag = memalloc_noreclaim_save(); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + memalloc_noreclaim_restore(noreclaim_flag); + psi_memstall_leave(&pflags); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -3497,6 +3515,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) int i; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; + unsigned long pflags; struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@ -3507,6 +3526,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) .may_swap = 1, }; + psi_memstall_enter(&pflags); __fs_reclaim_acquire(); count_vm_event(PAGEOUTRUN); @@ -3608,6 +3628,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) out: snapshot_refaults(NULL, pgdat); __fs_reclaim_release(); + psi_memstall_leave(&pflags); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller diff --git a/mm/vmstat.c b/mm/vmstat.c @@ -1143,8 +1143,10 @@ const char * const vmstat_text[] = { "nr_slab_unreclaimable", "nr_isolated_anon", "nr_isolated_file", + "workingset_nodes", "workingset_refault", "workingset_activate", + "workingset_restore", "workingset_nodereclaim", "nr_anon_pages", "nr_mapped", @@ -1161,7 +1163,7 @@ const char * const vmstat_text[] = { "nr_vmscan_immediate_reclaim", "nr_dirtied", "nr_written", - "", /* nr_indirectly_reclaimable */ + "nr_kernel_misc_reclaimable", /* enum writeback_stat_item counters */ "nr_dirty_threshold", @@ -1663,6 +1665,8 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) stat_items_size += sizeof(struct vm_event_state); #endif + BUILD_BUG_ON(stat_items_size != + ARRAY_SIZE(vmstat_text) * sizeof(unsigned long)); v = kmalloc(stat_items_size, GFP_KERNEL); m->private = v; if (!v) @@ -1706,10 +1710,6 @@ static int vmstat_show(struct seq_file *m, void *arg) unsigned long *l = arg; unsigned long off = l - (unsigned long *)m->private; - /* Skip hidden vmstat items. */ - if (*vmstat_text[off] == '\0') - return 0; - seq_puts(m, vmstat_text[off]); seq_put_decimal_ull(m, " ", *l); seq_putc(m, '\n'); diff --git a/mm/workingset.c b/mm/workingset.c @@ -121,7 +121,7 @@ * the only thing eating into inactive list space is active pages. * * - * Activating refaulting pages + * Refaulting inactive pages * * All that is known about the active list is that the pages have been * accessed more than once in the past. This means that at any given @@ -134,6 +134,10 @@ * used less frequently than the refaulting page - or even not used at * all anymore. * + * That means if inactive cache is refaulting with a suitable refault + * distance, we assume the cache workingset is transitioning and put + * pressure on the current active list. + * * If this is wrong and demotion kicks in, the pages which are truly * used more frequently will be reactivated while the less frequently * used once will be evicted from memory. @@ -141,6 +145,14 @@ * But if this is right, the stale pages will be pushed out of memory * and the used pages get to stay in cache. * + * Refaulting active pages + * + * If on the other hand the refaulting pages have recently been + * deactivated, it means that the active list is no longer protecting + * actively used cache from reclaim. The cache is NOT transitioning to + * a different workingset; the existing workingset is thrashing in the + * space allocated to the page cache. + * * * Implementation * @@ -156,8 +168,7 @@ */ #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ - NODES_SHIFT + \ - MEM_CGROUP_ID_SHIFT) + 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* @@ -170,23 +181,28 @@ */ static unsigned int bucket_order __read_mostly; -static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) +static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, + bool workingset) { eviction >>= bucket_order; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; + eviction = (eviction << 1) | workingset; eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, - unsigned long *evictionp) + unsigned long *evictionp, bool *workingsetp) { unsigned long entry = (unsigned long)shadow; int memcgid, nid; + bool workingset; entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; + workingset = entry & 1; + entry >>= 1; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); @@ -195,6 +211,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, *memcgidp = memcgid; *pgdat = NODE_DATA(nid); *evictionp = entry << bucket_order; + *workingsetp = workingset; } /** @@ -207,8 +224,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, */ void *workingset_eviction(struct address_space *mapping, struct page *page) { - struct mem_cgroup *memcg = page_memcg(page); struct pglist_data *pgdat = page_pgdat(page); + struct mem_cgroup *memcg = page_memcg(page); int memcgid = mem_cgroup_id(memcg); unsigned long eviction; struct lruvec *lruvec; @@ -220,30 +237,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) lruvec = mem_cgroup_lruvec(pgdat, memcg); eviction = atomic_long_inc_return(&lruvec->inactive_age); - return pack_shadow(memcgid, pgdat, eviction); + return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } /** * workingset_refault - evaluate the refault of a previously evicted page + * @page: the freshly allocated replacement page * @shadow: shadow entry of the evicted page * * Calculates and evaluates the refault distance of the previously * evicted page in the context of the node it was allocated in. - * - * Returns %true if the page should be activated, %false otherwise. */ -bool workingset_refault(void *shadow) +void workingset_refault(struct page *page, void *shadow) { unsigned long refault_distance; + struct pglist_data *pgdat; unsigned long active_file; struct mem_cgroup *memcg; unsigned long eviction; struct lruvec *lruvec; unsigned long refault; - struct pglist_data *pgdat; + bool workingset; int memcgid; - unpack_shadow(shadow, &memcgid, &pgdat, &eviction); + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); rcu_read_lock(); /* @@ -263,41 +280,51 @@ bool workingset_refault(void *shadow) * configurations instead. */ memcg = mem_cgroup_from_id(memcgid); - if (!mem_cgroup_disabled() && !memcg) { - rcu_read_unlock(); - return false; - } + if (!mem_cgroup_disabled() && !memcg) + goto out; lruvec = mem_cgroup_lruvec(pgdat, memcg); refault = atomic_long_read(&lruvec->inactive_age); active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); /* - * The unsigned subtraction here gives an accurate distance - * across inactive_age overflows in most cases. + * Calculate the refault distance * - * There is a special case: usually, shadow entries have a - * short lifetime and are either refaulted or reclaimed along - * with the inode before they get too old. But it is not - * impossible for the inactive_age to lap a shadow entry in - * the field, which can then can result in a false small - * refault distance, leading to a false activation should this - * old entry actually refault again. However, earlier kernels - * used to deactivate unconditionally with *every* reclaim - * invocation for the longest time, so the occasional - * inappropriate activation leading to pressure on the active - * list is not a problem. + * The unsigned subtraction here gives an accurate distance + * across inactive_age overflows in most cases. There is a + * special case: usually, shadow entries have a short lifetime + * and are either refaulted or reclaimed along with the inode + * before they get too old. But it is not impossible for the + * inactive_age to lap a shadow entry in the field, which can + * then result in a false small refault distance, leading to a + * false activation should this old entry actually refault + * again. However, earlier kernels used to deactivate + * unconditionally with *every* reclaim invocation for the + * longest time, so the occasional inappropriate activation + * leading to pressure on the active list is not a problem. */ refault_distance = (refault - eviction) & EVICTION_MASK; inc_lruvec_state(lruvec, WORKINGSET_REFAULT); - if (refault_distance <= active_file) { - inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); - rcu_read_unlock(); - return true; + /* + * Compare the distance to the existing workingset size. We + * don't act on pages that couldn't stay resident even if all + * the memory was available to the page cache. + */ + if (refault_distance > active_file) + goto out; + + SetPageActive(page); + atomic_long_inc(&lruvec->inactive_age); + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); + + /* Page was active prior to eviction */ + if (workingset) { + SetPageWorkingset(page); + inc_lruvec_state(lruvec, WORKINGSET_RESTORE); } +out: rcu_read_unlock(); - return false; } /** @@ -350,12 +377,20 @@ void workingset_update_node(struct radix_tree_node *node) * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ + VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */ + if (node->count && node->count == node->exceptional) { - if (list_empty(&node->private_list)) + if (list_empty(&node->private_list)) { list_lru_add(&shadow_nodes, &node->private_list); + __inc_lruvec_page_state(virt_to_page(node), + WORKINGSET_NODES); + } } else { - if (!list_empty(&node->private_list)) + if (!list_empty(&node->private_list)) { list_lru_del(&shadow_nodes, &node->private_list); + __dec_lruvec_page_state(virt_to_page(node), + WORKINGSET_NODES); + } } } @@ -364,7 +399,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, { unsigned long max_nodes; unsigned long nodes; - unsigned long cache; + unsigned long pages; nodes = list_lru_shrink_count(&shadow_nodes, sc); @@ -390,14 +425,20 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, * * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE */ +#ifdef CONFIG_MEMCG if (sc->memcg) { - cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, - LRU_ALL_FILE); - } else { - cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + - node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); - } - max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); + struct lruvec *lruvec; + + pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, + LRU_ALL); + lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg); + pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE); + pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE); + } else +#endif + pages = node_present_pages(sc->nid); + + max_nodes = pages >> (RADIX_TREE_MAP_SHIFT - 3); if (!nodes) return SHRINK_EMPTY; @@ -440,6 +481,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } list_lru_isolate(lru, item); + __dec_lruvec_page_state(virt_to_page(node), WORKINGSET_NODES); + spin_unlock(lru_lock); /* @@ -467,7 +510,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; - inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); + __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->i_pages, node, workingset_lookup_update(mapping)); @@ -491,7 +534,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, static struct shrinker workingset_shadow_shrinker = { .count_objects = count_shadow_nodes, .scan_objects = scan_shadow_nodes, - .seeks = DEFAULT_SEEKS, + .seeks = 0, /* ->count reports only fully expendable nodes */ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, }; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c @@ -418,7 +418,7 @@ static void *zs_zpool_map(void *pool, unsigned long handle, case ZPOOL_MM_WO: zs_mm = ZS_MM_WO; break; - case ZPOOL_MM_RW: /* fallthru */ + case ZPOOL_MM_RW: /* fall through */ default: zs_mm = ZS_MM_RW; break; diff --git a/scripts/tags.sh b/scripts/tags.sh @@ -203,7 +203,7 @@ regex_c=( '/\<DECLARE_\(TASKLET\|WORK\|DELAYED_WORK\)(\([[:alnum:]_]*\)/\2/v/' '/\(^\s\)OFFSET(\([[:alnum:]_]*\)/\2/v/' '/\(^\s\)DEFINE(\([[:alnum:]_]*\)/\2/v/' - '/\<DEFINE_HASHTABLE(\([[:alnum:]_]*\)/\1/v/' + '/\<\(DEFINE\|DECLARE\)_HASHTABLE(\([[:alnum:]_]*\)/\2/v/' ) regex_kconfig=( '/^[[:blank:]]*\(menu\|\)config[[:blank:]]\+\([[:alnum:]_]\+\)/\2/' diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c @@ -203,6 +203,8 @@ static void print_delayacct(struct taskstats *t) "SWAP %15s%15s%15s\n" " %15llu%15llu%15llums\n" "RECLAIM %12s%15s%15s\n" + " %15llu%15llu%15llums\n" + "THRASHING%12s%15s%15s\n" " %15llu%15llu%15llums\n", "count", "real total", "virtual total", "delay total", "delay average", @@ -222,7 +224,11 @@ static void print_delayacct(struct taskstats *t) "count", "delay total", "delay average", (unsigned long long)t->freepages_count, (unsigned long long)t->freepages_delay_total, - average_ms(t->freepages_delay_total, t->freepages_count)); + average_ms(t->freepages_delay_total, t->freepages_count), + "count", "delay total", "delay average", + (unsigned long long)t->thrashing_count, + (unsigned long long)t->thrashing_delay_total, + average_ms(t->thrashing_delay_total, t->thrashing_count)); } static void task_context_switch_counts(struct taskstats *t) diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore @@ -13,3 +13,4 @@ mlock-random-test virtual_address_range gup_benchmark va_128TBswitch +map_fixed_noreplace diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile @@ -12,6 +12,7 @@ TEST_GEN_FILES += gup_benchmark TEST_GEN_FILES += hugepage-mmap TEST_GEN_FILES += hugepage-shm TEST_GEN_FILES += map_hugetlb +TEST_GEN_FILES += map_fixed_noreplace TEST_GEN_FILES += map_populate TEST_GEN_FILES += mlock-random-test TEST_GEN_FILES += mlock2-tests diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c @@ -15,9 +15,12 @@ #define PAGE_SIZE sysconf(_SC_PAGESIZE) #define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) +#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark) +#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark) struct gup_benchmark { - __u64 delta_usec; + __u64 get_delta_usec; + __u64 put_delta_usec; __u64 addr; __u64 size; __u32 nr_pages_per_call; @@ -28,10 +31,12 @@ int main(int argc, char **argv) { struct gup_benchmark gup; unsigned long size = 128 * MB; - int i, fd, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0; + int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0; + int cmd = GUP_FAST_BENCHMARK, flags = MAP_PRIVATE; + char *file = "/dev/zero"; char *p; - while ((opt = getopt(argc, argv, "m:r:n:tT")) != -1) { + while ((opt = getopt(argc, argv, "m:r:n:f:tTLUSH")) != -1) { switch (opt) { case 'm': size = atoi(optarg) * MB; @@ -48,13 +53,36 @@ int main(int argc, char **argv) case 'T': thp = 0; break; + case 'L': + cmd = GUP_LONGTERM_BENCHMARK; + break; + case 'U': + cmd = GUP_BENCHMARK; + break; case 'w': write = 1; + break; + case 'f': + file = optarg; + break; + case 'S': + flags &= ~MAP_PRIVATE; + flags |= MAP_SHARED; + break; + case 'H': + flags |= MAP_HUGETLB; + break; default: return -1; } } + filed = open(file, O_RDWR|O_CREAT); + if (filed < 0) { + perror("open"); + exit(filed); + } + gup.nr_pages_per_call = nr_pages; gup.flags = write; @@ -62,8 +90,7 @@ int main(int argc, char **argv) if (fd == -1) perror("open"), exit(1); - p = mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0); if (p == MAP_FAILED) perror("mmap"), exit(1); gup.addr = (unsigned long)p; @@ -78,10 +105,11 @@ int main(int argc, char **argv) for (i = 0; i < repeats; i++) { gup.size = size; - if (ioctl(fd, GUP_FAST_BENCHMARK, &gup)) + if (ioctl(fd, cmd, &gup)) perror("ioctl"), exit(1); - printf("Time: %lld us", gup.delta_usec); + printf("Time: get:%lld put:%lld us", gup.get_delta_usec, + gup.put_delta_usec); if (gup.size != size) printf(", truncated (size: %lld)", gup.size); printf("\n"); diff --git a/tools/testing/selftests/vm/map_fixed_noreplace.c b/tools/testing/selftests/vm/map_fixed_noreplace.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Test that MAP_FIXED_NOREPLACE works. + * + * Copyright 2018, Jann Horn <jannh@google.com> + * Copyright 2018, Michael Ellerman, IBM Corporation. + */ + +#include <sys/mman.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#ifndef MAP_FIXED_NOREPLACE +#define MAP_FIXED_NOREPLACE 0x100000 +#endif + +#define BASE_ADDRESS (256ul * 1024 * 1024) + + +static void dump_maps(void) +{ + char cmd[32]; + + snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid()); + system(cmd); +} + +int main(void) +{ + unsigned long flags, addr, size, page_size; + char *p; + + page_size = sysconf(_SC_PAGE_SIZE); + + flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE; + + // Check we can map all the areas we need below + errno = 0; + addr = BASE_ADDRESS; + size = 5 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p == MAP_FAILED) { + dump_maps(); + printf("Error: couldn't map the space we need for the test\n"); + return 1; + } + + errno = 0; + if (munmap((void *)addr, 5 * page_size) != 0) { + dump_maps(); + printf("Error: munmap failed!?\n"); + return 1; + } + printf("unmap() successful\n"); + + errno = 0; + addr = BASE_ADDRESS + page_size; + size = 3 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p == MAP_FAILED) { + dump_maps(); + printf("Error: first mmap() failed unexpectedly\n"); + return 1; + } + + /* + * Exact same mapping again: + * base | free | new + * +1 | mapped | new + * +2 | mapped | new + * +3 | mapped | new + * +4 | free | new + */ + errno = 0; + addr = BASE_ADDRESS; + size = 5 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:1: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Second mapping contained within first: + * + * base | free | + * +1 | mapped | + * +2 | mapped | new + * +3 | mapped | + * +4 | free | + */ + errno = 0; + addr = BASE_ADDRESS + (2 * page_size); + size = page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:2: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Overlap end of existing mapping: + * base | free | + * +1 | mapped | + * +2 | mapped | + * +3 | mapped | new + * +4 | free | new + */ + errno = 0; + addr = BASE_ADDRESS + (3 * page_size); + size = 2 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:3: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Overlap start of existing mapping: + * base | free | new + * +1 | mapped | new + * +2 | mapped | + * +3 | mapped | + * +4 | free | + */ + errno = 0; + addr = BASE_ADDRESS; + size = 2 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:4: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Adjacent to start of existing mapping: + * base | free | new + * +1 | mapped | + * +2 | mapped | + * +3 | mapped | + * +4 | free | + */ + errno = 0; + addr = BASE_ADDRESS; + size = page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p == MAP_FAILED) { + dump_maps(); + printf("Error:5: mmap() failed when it shouldn't have\n"); + return 1; + } + + /* + * Adjacent to end of existing mapping: + * base | free | + * +1 | mapped | + * +2 | mapped | + * +3 | mapped | + * +4 | free | new + */ + errno = 0; + addr = BASE_ADDRESS + (4 * page_size); + size = page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p == MAP_FAILED) { + dump_maps(); + printf("Error:6: mmap() failed when it shouldn't have\n"); + return 1; + } + + addr = BASE_ADDRESS; + size = 5 * page_size; + if (munmap((void *)addr, size) != 0) { + dump_maps(); + printf("Error: munmap failed!?\n"); + return 1; + } + printf("unmap() successful\n"); + + printf("OK\n"); + return 0; +} diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c @@ -34,18 +34,6 @@ * per-CPU threads 1 by triggering userfaults inside * pthread_mutex_lock will also verify the atomicity of the memory * transfer (UFFDIO_COPY). - * - * The program takes two parameters: the amounts of physical memory in - * megabytes (MiB) of the area and the number of bounces to execute. - * - * # 100MiB 99999 bounces - * ./userfaultfd 100 99999 - * - * # 1GiB 99 bounces - * ./userfaultfd 1000 99 - * - * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers - * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done */ #define _GNU_SOURCE @@ -115,6 +103,30 @@ pthread_attr_t attr; ~(unsigned long)(sizeof(unsigned long long) \ - 1))) +const char *examples = + "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" + "./userfaultfd anon 100 99999\n\n" + "# Run share memory test on 1GiB region with 99 bounces:\n" + "./userfaultfd shmem 1000 99\n\n" + "# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n" + "./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n" + "# Run the same hugetlb test but using shmem:\n" + "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n" + "# 10MiB-~6GiB 999 bounces anonymous test, " + "continue forever unless an error triggers\n" + "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; + +static void usage(void) +{ + fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> " + "[hugetlbfs_file]\n\n"); + fprintf(stderr, "Supported <test type>: anon, hugetlb, " + "hugetlb_shared, shmem\n\n"); + fprintf(stderr, "Examples:\n\n"); + fprintf(stderr, examples); + exit(1); +} + static int anon_release_pages(char *rel_area) { int ret = 0; @@ -439,6 +451,43 @@ static int copy_page(int ufd, unsigned long offset) return __copy_page(ufd, offset, false); } +static int uffd_read_msg(int ufd, struct uffd_msg *msg) +{ + int ret = read(uffd, msg, sizeof(*msg)); + + if (ret != sizeof(*msg)) { + if (ret < 0) { + if (errno == EAGAIN) + return 1; + else + perror("blocking read error"), exit(1); + } else { + fprintf(stderr, "short read\n"), exit(1); + } + } + + return 0; +} + +/* Return 1 if page fault handled by us; otherwise 0 */ +static int uffd_handle_page_fault(struct uffd_msg *msg) +{ + unsigned long offset; + + if (msg->event != UFFD_EVENT_PAGEFAULT) + fprintf(stderr, "unexpected msg event %u\n", + msg->event), exit(1); + + if (bounces & BOUNCE_VERIFY && + msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) + fprintf(stderr, "unexpected write fault\n"), exit(1); + + offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; + offset &= ~(page_size-1); + + return copy_page(uffd, offset); +} + static void *uffd_poll_thread(void *arg) { unsigned long cpu = (unsigned long) arg; @@ -446,7 +495,6 @@ static void *uffd_poll_thread(void *arg) struct uffd_msg msg; struct uffdio_register uffd_reg; int ret; - unsigned long offset; char tmp_chr; unsigned long userfaults = 0; @@ -470,25 +518,15 @@ static void *uffd_poll_thread(void *arg) if (!(pollfd[0].revents & POLLIN)) fprintf(stderr, "pollfd[0].revents %d\n", pollfd[0].revents), exit(1); - ret = read(uffd, &msg, sizeof(msg)); - if (ret < 0) { - if (errno == EAGAIN) - continue; - perror("nonblocking read error"), exit(1); - } + if (uffd_read_msg(uffd, &msg)) + continue; switch (msg.event) { default: fprintf(stderr, "unexpected msg event %u\n", msg.event), exit(1); break; case UFFD_EVENT_PAGEFAULT: - if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) - fprintf(stderr, "unexpected write fault\n"), exit(1); - offset = (char *)(unsigned long)msg.arg.pagefault.address - - area_dst; - offset &= ~(page_size-1); - if (copy_page(uffd, offset)) - userfaults++; + userfaults += uffd_handle_page_fault(&msg); break; case UFFD_EVENT_FORK: close(uffd); @@ -516,8 +554,6 @@ static void *uffd_read_thread(void *arg) { unsigned long *this_cpu_userfaults; struct uffd_msg msg; - unsigned long offset; - int ret; this_cpu_userfaults = (unsigned long *) arg; *this_cpu_userfaults = 0; @@ -526,24 +562,9 @@ static void *uffd_read_thread(void *arg) /* from here cancellation is ok */ for (;;) { - ret = read(uffd, &msg, sizeof(msg)); - if (ret != sizeof(msg)) { - if (ret < 0) - perror("blocking read error"), exit(1); - else - fprintf(stderr, "short read\n"), exit(1); - } - if (msg.event != UFFD_EVENT_PAGEFAULT) - fprintf(stderr, "unexpected msg event %u\n", - msg.event), exit(1); - if (bounces & BOUNCE_VERIFY && - msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) - fprintf(stderr, "unexpected write fault\n"), exit(1); - offset = (char *)(unsigned long)msg.arg.pagefault.address - - area_dst; - offset &= ~(page_size-1); - if (copy_page(uffd, offset)) - (*this_cpu_userfaults)++; + if (uffd_read_msg(uffd, &msg)) + continue; + (*this_cpu_userfaults) += uffd_handle_page_fault(&msg); } return (void *)NULL; } @@ -605,6 +626,12 @@ static int stress(unsigned long *userfaults) if (uffd_test_ops->release_pages(area_src)) return 1; + + finished = 1; + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pthread_join(locking_threads[cpu], NULL)) + return 1; + for (cpu = 0; cpu < nr_cpus; cpu++) { char c; if (bounces & BOUNCE_POLL) { @@ -622,11 +649,6 @@ static int stress(unsigned long *userfaults) } } - finished = 1; - for (cpu = 0; cpu < nr_cpus; cpu++) - if (pthread_join(locking_threads[cpu], NULL)) - return 1; - return 0; } @@ -1272,8 +1294,7 @@ static void sigalrm(int sig) int main(int argc, char **argv) { if (argc < 4) - fprintf(stderr, "Usage: <test type> <MiB> <bounces> [hugetlbfs_file]\n"), - exit(1); + usage(); if (signal(SIGALRM, sigalrm) == SIG_ERR) fprintf(stderr, "failed to arm SIGALRM"), exit(1); @@ -1286,20 +1307,19 @@ int main(int argc, char **argv) nr_cpus; if (!nr_pages_per_cpu) { fprintf(stderr, "invalid MiB\n"); - fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); + usage(); } bounces = atoi(argv[3]); if (bounces <= 0) { fprintf(stderr, "invalid bounces\n"); - fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); + usage(); } nr_pages = nr_pages_per_cpu * nr_cpus; if (test_type == TEST_HUGETLB) { if (argc < 5) - fprintf(stderr, "Usage: hugetlb <MiB> <bounces> <hugetlbfs_file>\n"), - exit(1); + usage(); huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755); if (huge_fd < 0) { fprintf(stderr, "Open of %s failed", argv[3]); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c @@ -497,7 +497,6 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { - .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young,