whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit f654f0fc0bd3f1b0ec76e654bf1cc21f33382241
parent 4f1cbe078546914538d8aabba04db984da68dcbf
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Fri,  5 Apr 2019 17:08:55 -1000

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
 "14 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  kernel/sysctl.c: fix out-of-bounds access when setting file-max
  mm/util.c: fix strndup_user() comment
  sh: fix multiple function definition build errors
  MAINTAINERS: add maintainer and replacing reviewer ARM/NUVOTON NPCM
  MAINTAINERS: fix bad pattern in ARM/NUVOTON NPCM
  mm: writeback: use exact memcg dirty counts
  psi: clarify the units used in pressure files
  mm/huge_memory.c: fix modifying of page protection by insert_pfn_pmd()
  hugetlbfs: fix memory leak for resv_map
  mm: fix vm_fault_t cast in VM_FAULT_GET_HINDEX()
  lib/lzo: fix bugs for very short or empty input
  include/linux/bitrev.h: fix constant bitrev
  kmemleak: powerpc: skip scanning holes in the .bss section
  lib/string.c: implement a basic bcmp

Diffstat:
MDocumentation/accounting/psi.txt | 12++++++------
MDocumentation/lzo.txt | 8+++++---
MMAINTAINERS | 5+++--
March/powerpc/kernel/kvm.c | 7+++++++
March/sh/boards/of-generic.c | 4++--
Mfs/hugetlbfs/inode.c | 20++++++++++++++------
Minclude/linux/bitrev.h | 46+++++++++++++++++++++++-----------------------
Minclude/linux/memcontrol.h | 5++++-
Minclude/linux/mm_types.h | 2+-
Minclude/linux/string.h | 3+++
Mkernel/sysctl.c | 3++-
Mlib/lzo/lzo1x_compress.c | 9++++++---
Mlib/lzo/lzo1x_decompress_safe.c | 4+---
Mlib/string.c | 20++++++++++++++++++++
Mmm/huge_memory.c | 36++++++++++++++++++++++++++++++++++++
Mmm/kmemleak.c | 16+++++++++++-----
Mmm/memcontrol.c | 20++++++++++++++++++--
Mmm/util.c | 2+-
18 files changed, 163 insertions(+), 59 deletions(-)

diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt @@ -56,12 +56,12 @@ situation from a state where some tasks are stalled but the CPU is still doing productive work. As such, time spent in this subset of the stall state is tracked separately and exported in the "full" averages. -The ratios are tracked as recent trends over ten, sixty, and three -hundred second windows, which gives insight into short term events as -well as medium and long term trends. The total absolute stall time is -tracked and exported as well, to allow detection of latency spikes -which wouldn't necessarily make a dent in the time averages, or to -average trends over custom time frames. +The ratios (in %) are tracked as recent trends over ten, sixty, and +three hundred second windows, which gives insight into short term events +as well as medium and long term trends. The total absolute stall time +(in us) is tracked and exported as well, to allow detection of latency +spikes which wouldn't necessarily make a dent in the time averages, +or to average trends over custom time frames. Cgroup2 interface ================= diff --git a/Documentation/lzo.txt b/Documentation/lzo.txt @@ -102,9 +102,11 @@ Byte sequences dictionary which is empty, and that it will always be invalid at this place. - 17 : bitstream version. If the first byte is 17, the next byte - gives the bitstream version (version 1 only). If the first byte - is not 17, the bitstream version is 0. + 17 : bitstream version. If the first byte is 17, and compressed + stream length is at least 5 bytes (length of shortest possible + versioned bitstream), the next byte gives the bitstream version + (version 1 only). + Otherwise, the bitstream version is 0. 18..21 : copy 0..3 literals state = (byte - 17) = 0..3 [ copy <state> literals ] diff --git a/MAINTAINERS b/MAINTAINERS @@ -1893,14 +1893,15 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-nomadik.git ARM/NUVOTON NPCM ARCHITECTURE M: Avi Fishman <avifishman70@gmail.com> M: Tomer Maimon <tmaimon77@gmail.com> +M: Tali Perry <tali.perry1@gmail.com> R: Patrick Venture <venture@google.com> R: Nancy Yuen <yuenn@google.com> -R: Brendan Higgins <brendanhiggins@google.com> +R: Benjamin Fair <benjaminfair@google.com> L: openbmc@lists.ozlabs.org (moderated for non-subscribers) S: Supported F: arch/arm/mach-npcm/ F: arch/arm/boot/dts/nuvoton-npcm* -F: include/dt-bindings/clock/nuvoton,npcm7xx-clks.h +F: include/dt-bindings/clock/nuvoton,npcm7xx-clock.h F: drivers/*/*npcm* F: Documentation/devicetree/bindings/*/*npcm* F: Documentation/devicetree/bindings/*/*/*npcm* diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c @@ -22,6 +22,7 @@ #include <linux/kvm_host.h> #include <linux/init.h> #include <linux/export.h> +#include <linux/kmemleak.h> #include <linux/kvm_para.h> #include <linux/slab.h> #include <linux/of.h> @@ -712,6 +713,12 @@ static void kvm_use_magic_page(void) static __init void kvm_free_tmp(void) { + /* + * Inform kmemleak about the hole in the .bss section since the + * corresponding pages will be unmapped with DEBUG_PAGEALLOC=y. + */ + kmemleak_free_part(&kvm_tmp[kvm_tmp_index], + ARRAY_SIZE(kvm_tmp) - kvm_tmp_index); free_reserved_area(&kvm_tmp[kvm_tmp_index], &kvm_tmp[ARRAY_SIZE(kvm_tmp)], -1, NULL); } diff --git a/arch/sh/boards/of-generic.c b/arch/sh/boards/of-generic.c @@ -164,10 +164,10 @@ static struct sh_machine_vector __initmv sh_of_generic_mv = { struct sh_clk_ops; -void __init arch_init_clk_ops(struct sh_clk_ops **ops, int idx) +void __init __weak arch_init_clk_ops(struct sh_clk_ops **ops, int idx) { } -void __init plat_irq_setup(void) +void __init __weak plat_irq_setup(void) { } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c @@ -755,11 +755,17 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev) { struct inode *inode; - struct resv_map *resv_map; + struct resv_map *resv_map = NULL; - resv_map = resv_map_alloc(); - if (!resv_map) - return NULL; + /* + * Reserve maps are only needed for inodes that can have associated + * page allocations. + */ + if (S_ISREG(mode) || S_ISLNK(mode)) { + resv_map = resv_map_alloc(); + if (!resv_map) + return NULL; + } inode = new_inode(sb); if (inode) { @@ -794,8 +800,10 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, break; } lockdep_annotate_inode_mutex_key(inode); - } else - kref_put(&resv_map->refs, resv_map_release); + } else { + if (resv_map) + kref_put(&resv_map->refs, resv_map_release); + } return inode; } diff --git a/include/linux/bitrev.h b/include/linux/bitrev.h @@ -34,41 +34,41 @@ static inline u32 __bitrev32(u32 x) #define __constant_bitrev32(x) \ ({ \ - u32 __x = x; \ - __x = (__x >> 16) | (__x << 16); \ - __x = ((__x & (u32)0xFF00FF00UL) >> 8) | ((__x & (u32)0x00FF00FFUL) << 8); \ - __x = ((__x & (u32)0xF0F0F0F0UL) >> 4) | ((__x & (u32)0x0F0F0F0FUL) << 4); \ - __x = ((__x & (u32)0xCCCCCCCCUL) >> 2) | ((__x & (u32)0x33333333UL) << 2); \ - __x = ((__x & (u32)0xAAAAAAAAUL) >> 1) | ((__x & (u32)0x55555555UL) << 1); \ - __x; \ + u32 ___x = x; \ + ___x = (___x >> 16) | (___x << 16); \ + ___x = ((___x & (u32)0xFF00FF00UL) >> 8) | ((___x & (u32)0x00FF00FFUL) << 8); \ + ___x = ((___x & (u32)0xF0F0F0F0UL) >> 4) | ((___x & (u32)0x0F0F0F0FUL) << 4); \ + ___x = ((___x & (u32)0xCCCCCCCCUL) >> 2) | ((___x & (u32)0x33333333UL) << 2); \ + ___x = ((___x & (u32)0xAAAAAAAAUL) >> 1) | ((___x & (u32)0x55555555UL) << 1); \ + ___x; \ }) #define __constant_bitrev16(x) \ ({ \ - u16 __x = x; \ - __x = (__x >> 8) | (__x << 8); \ - __x = ((__x & (u16)0xF0F0U) >> 4) | ((__x & (u16)0x0F0FU) << 4); \ - __x = ((__x & (u16)0xCCCCU) >> 2) | ((__x & (u16)0x3333U) << 2); \ - __x = ((__x & (u16)0xAAAAU) >> 1) | ((__x & (u16)0x5555U) << 1); \ - __x; \ + u16 ___x = x; \ + ___x = (___x >> 8) | (___x << 8); \ + ___x = ((___x & (u16)0xF0F0U) >> 4) | ((___x & (u16)0x0F0FU) << 4); \ + ___x = ((___x & (u16)0xCCCCU) >> 2) | ((___x & (u16)0x3333U) << 2); \ + ___x = ((___x & (u16)0xAAAAU) >> 1) | ((___x & (u16)0x5555U) << 1); \ + ___x; \ }) #define __constant_bitrev8x4(x) \ ({ \ - u32 __x = x; \ - __x = ((__x & (u32)0xF0F0F0F0UL) >> 4) | ((__x & (u32)0x0F0F0F0FUL) << 4); \ - __x = ((__x & (u32)0xCCCCCCCCUL) >> 2) | ((__x & (u32)0x33333333UL) << 2); \ - __x = ((__x & (u32)0xAAAAAAAAUL) >> 1) | ((__x & (u32)0x55555555UL) << 1); \ - __x; \ + u32 ___x = x; \ + ___x = ((___x & (u32)0xF0F0F0F0UL) >> 4) | ((___x & (u32)0x0F0F0F0FUL) << 4); \ + ___x = ((___x & (u32)0xCCCCCCCCUL) >> 2) | ((___x & (u32)0x33333333UL) << 2); \ + ___x = ((___x & (u32)0xAAAAAAAAUL) >> 1) | ((___x & (u32)0x55555555UL) << 1); \ + ___x; \ }) #define __constant_bitrev8(x) \ ({ \ - u8 __x = x; \ - __x = (__x >> 4) | (__x << 4); \ - __x = ((__x & (u8)0xCCU) >> 2) | ((__x & (u8)0x33U) << 2); \ - __x = ((__x & (u8)0xAAU) >> 1) | ((__x & (u8)0x55U) << 1); \ - __x; \ + u8 ___x = x; \ + ___x = (___x >> 4) | (___x << 4); \ + ___x = ((___x & (u8)0xCCU) >> 2) | ((___x & (u8)0x33U) << 2); \ + ___x = ((___x & (u8)0xAAU) >> 1) | ((___x & (u8)0x55U) << 1); \ + ___x; \ }) #define bitrev32(x) \ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h @@ -566,7 +566,10 @@ struct mem_cgroup *lock_page_memcg(struct page *page); void __unlock_page_memcg(struct mem_cgroup *memcg); void unlock_page_memcg(struct page *page); -/* idx can be of type enum memcg_stat_item or node_stat_item */ +/* + * idx can be of type enum memcg_stat_item or node_stat_item. + * Keep in sync with memcg_exact_page_state(). + */ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h @@ -671,7 +671,7 @@ enum vm_fault_reason { /* Encode hstate index for a hwpoisoned large page */ #define VM_FAULT_SET_HINDEX(x) ((__force vm_fault_t)((x) << 16)) -#define VM_FAULT_GET_HINDEX(x) (((x) >> 16) & 0xf) +#define VM_FAULT_GET_HINDEX(x) (((__force unsigned int)(x) >> 16) & 0xf) #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | \ VM_FAULT_SIGSEGV | VM_FAULT_HWPOISON | \ diff --git a/include/linux/string.h b/include/linux/string.h @@ -150,6 +150,9 @@ extern void * memscan(void *,int,__kernel_size_t); #ifndef __HAVE_ARCH_MEMCMP extern int memcmp(const void *,const void *,__kernel_size_t); #endif +#ifndef __HAVE_ARCH_BCMP +extern int bcmp(const void *,const void *,__kernel_size_t); +#endif #ifndef __HAVE_ARCH_MEMCHR extern void * memchr(const void *,int,__kernel_size_t); #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c @@ -128,6 +128,7 @@ static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; static int __maybe_unused four = 4; +static unsigned long zero_ul; static unsigned long one_ul = 1; static unsigned long long_max = LONG_MAX; static int one_hundred = 100; @@ -1750,7 +1751,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(files_stat.max_files), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &zero, + .extra1 = &zero_ul, .extra2 = &long_max, }, { diff --git a/lib/lzo/lzo1x_compress.c b/lib/lzo/lzo1x_compress.c @@ -291,13 +291,14 @@ int lzogeneric1x_1_compress(const unsigned char *in, size_t in_len, { const unsigned char *ip = in; unsigned char *op = out; + unsigned char *data_start; size_t l = in_len; size_t t = 0; signed char state_offset = -2; unsigned int m4_max_offset; - // LZO v0 will never write 17 as first byte, - // so this is used to version the bitstream + // LZO v0 will never write 17 as first byte (except for zero-length + // input), so this is used to version the bitstream if (bitstream_version > 0) { *op++ = 17; *op++ = bitstream_version; @@ -306,6 +307,8 @@ int lzogeneric1x_1_compress(const unsigned char *in, size_t in_len, m4_max_offset = M4_MAX_OFFSET_V0; } + data_start = op; + while (l > 20) { size_t ll = l <= (m4_max_offset + 1) ? l : (m4_max_offset + 1); uintptr_t ll_end = (uintptr_t) ip + ll; @@ -324,7 +327,7 @@ int lzogeneric1x_1_compress(const unsigned char *in, size_t in_len, if (t > 0) { const unsigned char *ii = in + in_len - t; - if (op == out && t <= 238) { + if (op == data_start && t <= 238) { *op++ = (17 + t); } else if (t <= 3) { op[state_offset] |= t; diff --git a/lib/lzo/lzo1x_decompress_safe.c b/lib/lzo/lzo1x_decompress_safe.c @@ -54,11 +54,9 @@ int lzo1x_decompress_safe(const unsigned char *in, size_t in_len, if (unlikely(in_len < 3)) goto input_overrun; - if (likely(*ip == 17)) { + if (likely(in_len >= 5) && likely(*ip == 17)) { bitstream_version = ip[1]; ip += 2; - if (unlikely(in_len < 5)) - goto input_overrun; } else { bitstream_version = 0; } diff --git a/lib/string.c b/lib/string.c @@ -866,6 +866,26 @@ __visible int memcmp(const void *cs, const void *ct, size_t count) EXPORT_SYMBOL(memcmp); #endif +#ifndef __HAVE_ARCH_BCMP +/** + * bcmp - returns 0 if and only if the buffers have identical contents. + * @a: pointer to first buffer. + * @b: pointer to second buffer. + * @len: size of buffers. + * + * The sign or magnitude of a non-zero return value has no particular + * meaning, and architectures may implement their own more efficient bcmp(). So + * while this particular implementation is a simple (tail) call to memcmp, do + * not rely on anything but whether the return value is zero or non-zero. + */ +#undef bcmp +int bcmp(const void *a, const void *b, size_t len) +{ + return memcmp(a, b, len); +} +EXPORT_SYMBOL(bcmp); +#endif + #ifndef __HAVE_ARCH_MEMSCAN /** * memscan - Find a character in an area of memory. diff --git a/mm/huge_memory.c b/mm/huge_memory.c @@ -755,6 +755,21 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, spinlock_t *ptl; ptl = pmd_lock(mm, pmd); + if (!pmd_none(*pmd)) { + if (write) { + if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_huge_zero_pmd(*pmd)); + goto out_unlock; + } + entry = pmd_mkyoung(*pmd); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + if (pmdp_set_access_flags(vma, addr, pmd, entry, 1)) + update_mmu_cache_pmd(vma, addr, pmd); + } + + goto out_unlock; + } + entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); if (pfn_t_devmap(pfn)) entry = pmd_mkdevmap(entry); @@ -766,11 +781,16 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, if (pgtable) { pgtable_trans_huge_deposit(mm, pmd, pgtable); mm_inc_nr_ptes(mm); + pgtable = NULL; } set_pmd_at(mm, addr, pmd, entry); update_mmu_cache_pmd(vma, addr, pmd); + +out_unlock: spin_unlock(ptl); + if (pgtable) + pte_free(mm, pgtable); } vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -821,6 +841,20 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, spinlock_t *ptl; ptl = pud_lock(mm, pud); + if (!pud_none(*pud)) { + if (write) { + if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_huge_zero_pud(*pud)); + goto out_unlock; + } + entry = pud_mkyoung(*pud); + entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); + if (pudp_set_access_flags(vma, addr, pud, entry, 1)) + update_mmu_cache_pud(vma, addr, pud); + } + goto out_unlock; + } + entry = pud_mkhuge(pfn_t_pud(pfn, prot)); if (pfn_t_devmap(pfn)) entry = pud_mkdevmap(entry); @@ -830,6 +864,8 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, } set_pud_at(mm, addr, pud, entry); update_mmu_cache_pud(vma, addr, pud); + +out_unlock: spin_unlock(ptl); } diff --git a/mm/kmemleak.c b/mm/kmemleak.c @@ -1529,11 +1529,6 @@ static void kmemleak_scan(void) } rcu_read_unlock(); - /* data/bss scanning */ - scan_large_block(_sdata, _edata); - scan_large_block(__bss_start, __bss_stop); - scan_large_block(__start_ro_after_init, __end_ro_after_init); - #ifdef CONFIG_SMP /* per-cpu sections scanning */ for_each_possible_cpu(i) @@ -2071,6 +2066,17 @@ void __init kmemleak_init(void) } local_irq_restore(flags); + /* register the data/bss sections */ + create_object((unsigned long)_sdata, _edata - _sdata, + KMEMLEAK_GREY, GFP_ATOMIC); + create_object((unsigned long)__bss_start, __bss_stop - __bss_start, + KMEMLEAK_GREY, GFP_ATOMIC); + /* only register .data..ro_after_init if not within .data */ + if (__start_ro_after_init < _sdata || __end_ro_after_init > _edata) + create_object((unsigned long)__start_ro_after_init, + __end_ro_after_init - __start_ro_after_init, + KMEMLEAK_GREY, GFP_ATOMIC); + /* * This is the point where tracking allocations is safe. Automatic * scanning is started during the late initcall. Add the early logged diff --git a/mm/memcontrol.c b/mm/memcontrol.c @@ -3882,6 +3882,22 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) return &memcg->cgwb_domain; } +/* + * idx can be of type enum memcg_stat_item or node_stat_item. + * Keep in sync with memcg_exact_page(). + */ +static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = atomic_long_read(&memcg->stat[idx]); + int cpu; + + for_each_online_cpu(cpu) + x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx]; + if (x < 0) + x = 0; + return x; +} + /** * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg * @wb: bdi_writeback in question @@ -3907,10 +3923,10 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); + *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY); /* this should eventually include NR_UNSTABLE_NFS */ - *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); + *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | (1 << LRU_ACTIVE_FILE)); *pheadroom = PAGE_COUNTER_MAX; diff --git a/mm/util.c b/mm/util.c @@ -204,7 +204,7 @@ EXPORT_SYMBOL(vmemdup_user); * @s: The string to duplicate * @n: Maximum number of bytes to copy, including the trailing NUL. * - * Return: newly allocated copy of @s or %NULL in case of error + * Return: newly allocated copy of @s or an ERR_PTR() in case of error */ char *strndup_user(const char __user *s, long n) {