whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit b9de6efed25cb713c1648e71302f4af83bd14ee6
parent cd984a5be21549273a3f13b52a8b7b84097b32a7
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Sat,  2 Feb 2019 09:32:58 -0800

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
 "24 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (24 commits)
  autofs: fix error return in autofs_fill_super()
  autofs: drop dentry reference only when it is never used
  fs/drop_caches.c: avoid softlockups in drop_pagecache_sb()
  mm: migrate: don't rely on __PageMovable() of newpage after unlocking it
  psi: clarify the Kconfig text for the default-disable option
  mm, memory_hotplug: __offline_pages fix wrong locking
  mm: hwpoison: use do_send_sig_info() instead of force_sig()
  kasan: mark file common so ftrace doesn't trace it
  init/Kconfig: fix grammar by moving a closing parenthesis
  lib/test_kmod.c: potential double free in error handling
  mm, oom: fix use-after-free in oom_kill_process
  mm/hotplug: invalid PFNs from pfn_to_online_page()
  mm,memory_hotplug: fix scan_movable_pages() for gigantic hugepages
  psi: fix aggregation idle shut-off
  mm, memory_hotplug: test_pages_in_a_zone do not pass the end of zone
  mm, memory_hotplug: is_mem_section_removable do not pass the end of a zone
  oom, oom_reaper: do not enqueue same task twice
  mm: migrate: make buffer_migrate_page_norefs() actually succeed
  kernel/exit.c: release ptraced tasks before zap_pid_ns_processes
  x86_64: increase stack size for KASAN_EXTRA
  ...

Diffstat:
March/c6x/include/asm/Kbuild | 1+
March/c6x/include/uapi/asm/Kbuild | 1-
March/h8300/include/asm/Kbuild | 1+
March/h8300/include/uapi/asm/Kbuild | 1-
March/hexagon/include/asm/Kbuild | 1+
March/hexagon/include/uapi/asm/Kbuild | 1-
March/m68k/include/asm/Kbuild | 1+
March/m68k/include/uapi/asm/Kbuild | 1-
March/microblaze/include/asm/Kbuild | 1+
March/microblaze/include/uapi/asm/Kbuild | 1-
March/openrisc/include/asm/Kbuild | 1+
March/openrisc/include/uapi/asm/Kbuild | 1-
March/unicore32/include/asm/Kbuild | 1+
March/unicore32/include/uapi/asm/Kbuild | 1-
March/x86/include/asm/page_64_types.h | 4++++
Mfs/autofs/expire.c | 3++-
Mfs/autofs/inode.c | 4+++-
Mfs/drop_caches.c | 8+++++++-
Mfs/proc/generic.c | 4+++-
Mfs/proc/internal.h | 1+
Mfs/proc/proc_net.c | 20++++++++++++++++++++
Minclude/linux/memory_hotplug.h | 18++++++++++--------
Minclude/linux/sched/coredump.h | 1+
Minit/Kconfig | 13++++++++++++-
Mkernel/exit.c | 12++++++++++--
Mkernel/sched/psi.c | 21+++++++++++++++++----
Mkernel/workqueue.c | 23+++++++++++++++++++++++
Mkernel/workqueue_internal.h | 6+++++-
Mlib/test_kmod.c | 2+-
Mmm/hugetlb.c | 3++-
Mmm/kasan/Makefile | 1+
Mmm/memory-failure.c | 3++-
Mmm/memory_hotplug.c | 62+++++++++++++++++++++++++++-----------------------------------
Mmm/migrate.c | 12+++++-------
Mmm/oom_kill.c | 12++++++++++--
Mtools/testing/selftests/proc/.gitignore | 1+
Mtools/testing/selftests/proc/Makefile | 1+
Atools/testing/selftests/proc/setns-dcache.c | 129+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
38 files changed, 304 insertions(+), 74 deletions(-)

diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild @@ -30,6 +30,7 @@ generic-y += pgalloc.h generic-y += preempt.h generic-y += segment.h generic-y += serial.h +generic-y += shmparam.h generic-y += tlbflush.h generic-y += topology.h generic-y += trace_clock.h diff --git a/arch/c6x/include/uapi/asm/Kbuild b/arch/c6x/include/uapi/asm/Kbuild @@ -1,5 +1,4 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += kvm_para.h -generic-y += shmparam.h generic-y += ucontext.h diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild @@ -40,6 +40,7 @@ generic-y += preempt.h generic-y += scatterlist.h generic-y += sections.h generic-y += serial.h +generic-y += shmparam.h generic-y += sizes.h generic-y += spinlock.h generic-y += timex.h diff --git a/arch/h8300/include/uapi/asm/Kbuild b/arch/h8300/include/uapi/asm/Kbuild @@ -1,5 +1,4 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += kvm_para.h -generic-y += shmparam.h generic-y += ucontext.h diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild @@ -30,6 +30,7 @@ generic-y += rwsem.h generic-y += sections.h generic-y += segment.h generic-y += serial.h +generic-y += shmparam.h generic-y += sizes.h generic-y += topology.h generic-y += trace_clock.h diff --git a/arch/hexagon/include/uapi/asm/Kbuild b/arch/hexagon/include/uapi/asm/Kbuild @@ -1,4 +1,3 @@ include include/uapi/asm-generic/Kbuild.asm -generic-y += shmparam.h generic-y += ucontext.h diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild @@ -20,6 +20,7 @@ generic-y += mm-arch-hooks.h generic-y += percpu.h generic-y += preempt.h generic-y += sections.h +generic-y += shmparam.h generic-y += spinlock.h generic-y += topology.h generic-y += trace_clock.h diff --git a/arch/m68k/include/uapi/asm/Kbuild b/arch/m68k/include/uapi/asm/Kbuild @@ -2,4 +2,3 @@ include include/uapi/asm-generic/Kbuild.asm generated-y += unistd_32.h generic-y += kvm_para.h -generic-y += shmparam.h diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild @@ -26,6 +26,7 @@ generic-y += parport.h generic-y += percpu.h generic-y += preempt.h generic-y += serial.h +generic-y += shmparam.h generic-y += syscalls.h generic-y += topology.h generic-y += trace_clock.h diff --git a/arch/microblaze/include/uapi/asm/Kbuild b/arch/microblaze/include/uapi/asm/Kbuild @@ -2,5 +2,4 @@ include include/uapi/asm-generic/Kbuild.asm generated-y += unistd_32.h generic-y += kvm_para.h -generic-y += shmparam.h generic-y += ucontext.h diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild @@ -34,6 +34,7 @@ generic-y += qrwlock_types.h generic-y += qrwlock.h generic-y += sections.h generic-y += segment.h +generic-y += shmparam.h generic-y += string.h generic-y += switch_to.h generic-y += topology.h diff --git a/arch/openrisc/include/uapi/asm/Kbuild b/arch/openrisc/include/uapi/asm/Kbuild @@ -1,5 +1,4 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += kvm_para.h -generic-y += shmparam.h generic-y += ucontext.h diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild @@ -28,6 +28,7 @@ generic-y += preempt.h generic-y += sections.h generic-y += segment.h generic-y += serial.h +generic-y += shmparam.h generic-y += sizes.h generic-y += syscalls.h generic-y += topology.h diff --git a/arch/unicore32/include/uapi/asm/Kbuild b/arch/unicore32/include/uapi/asm/Kbuild @@ -1,5 +1,4 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += kvm_para.h -generic-y += shmparam.h generic-y += ucontext.h diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h @@ -7,7 +7,11 @@ #endif #ifdef CONFIG_KASAN +#ifdef CONFIG_KASAN_EXTRA +#define KASAN_STACK_ORDER 2 +#else #define KASAN_STACK_ORDER 1 +#endif #else #define KASAN_STACK_ORDER 0 #endif diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c @@ -596,7 +596,6 @@ int autofs_expire_run(struct super_block *sb, pkt.len = dentry->d_name.len; memcpy(pkt.name, dentry->d_name.name, pkt.len); pkt.name[pkt.len] = '\0'; - dput(dentry); if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire))) ret = -EFAULT; @@ -609,6 +608,8 @@ int autofs_expire_run(struct super_block *sb, complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); + dput(dentry); + return ret; } diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c @@ -266,8 +266,10 @@ int autofs_fill_super(struct super_block *s, void *data, int silent) } root_inode = autofs_get_inode(s, S_IFDIR | 0755); root = d_make_root(root_inode); - if (!root) + if (!root) { + ret = -ENOMEM; goto fail_ino; + } pipe = NULL; root->d_fsdata = ino; diff --git a/fs/drop_caches.c b/fs/drop_caches.c @@ -21,8 +21,13 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); + /* + * We must skip inodes in unusual state. We may also skip + * inodes without pages but we deliberately won't in case + * we need to reschedule to avoid softlockups. + */ if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || - (inode->i_mapping->nrpages == 0)) { + (inode->i_mapping->nrpages == 0 && !need_resched())) { spin_unlock(&inode->i_lock); continue; } @@ -30,6 +35,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); + cond_resched(); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; diff --git a/fs/proc/generic.c b/fs/proc/generic.c @@ -256,7 +256,7 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); - d_set_d_op(dentry, &proc_misc_dentry_ops); + d_set_d_op(dentry, de->proc_dops); return d_splice_alias(inode, dentry); } read_unlock(&proc_subdir_lock); @@ -429,6 +429,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, INIT_LIST_HEAD(&ent->pde_openers); proc_set_user(ent, (*parent)->uid, (*parent)->gid); + ent->proc_dops = &proc_misc_dentry_ops; + out: return ent; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h @@ -44,6 +44,7 @@ struct proc_dir_entry { struct completion *pde_unload_completion; const struct inode_operations *proc_iops; const struct file_operations *proc_fops; + const struct dentry_operations *proc_dops; union { const struct seq_operations *seq_ops; int (*single_show)(struct seq_file *, void *); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c @@ -38,6 +38,22 @@ static struct net *get_proc_net(const struct inode *inode) return maybe_get_net(PDE_NET(PDE(inode))); } +static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + return 0; +} + +static const struct dentry_operations proc_net_dentry_ops = { + .d_revalidate = proc_net_d_revalidate, + .d_delete = always_delete_dentry, +}; + +static void pde_force_lookup(struct proc_dir_entry *pde) +{ + /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ + pde->proc_dops = &proc_net_dentry_ops; +} + static int seq_open_net(struct inode *inode, struct file *file) { unsigned int state_size = PDE(inode)->state_size; @@ -90,6 +106,7 @@ struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_seq_fops; p->seq_ops = ops; p->state_size = state_size; @@ -133,6 +150,7 @@ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_seq_fops; p->seq_ops = ops; p->state_size = state_size; @@ -181,6 +199,7 @@ struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_single_fops; p->single_show = show; return proc_register(parent, p); @@ -223,6 +242,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_single_fops; p->single_show = show; p->write = write; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h @@ -21,14 +21,16 @@ struct vmem_altmap; * walkers which rely on the fully initialized page->flags and others * should use this rather than pfn_valid && pfn_to_page */ -#define pfn_to_online_page(pfn) \ -({ \ - struct page *___page = NULL; \ - unsigned long ___nr = pfn_to_section_nr(pfn); \ - \ - if (___nr < NR_MEM_SECTIONS && online_section_nr(___nr))\ - ___page = pfn_to_page(pfn); \ - ___page; \ +#define pfn_to_online_page(pfn) \ +({ \ + struct page *___page = NULL; \ + unsigned long ___pfn = pfn; \ + unsigned long ___nr = pfn_to_section_nr(___pfn); \ + \ + if (___nr < NR_MEM_SECTIONS && online_section_nr(___nr) && \ + pfn_valid_within(___pfn)) \ + ___page = pfn_to_page(___pfn); \ + ___page; \ }) /* diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h @@ -71,6 +71,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ #define MMF_OOM_VICTIM 25 /* mm is the oom victim */ +#define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ diff --git a/init/Kconfig b/init/Kconfig @@ -512,6 +512,17 @@ config PSI_DEFAULT_DISABLED per default but can be enabled through passing psi=1 on the kernel commandline during boot. + This feature adds some code to the task wakeup and sleep + paths of the scheduler. The overhead is too low to affect + common scheduling-intense workloads in practice (such as + webservers, memcache), but it does show up in artificial + scheduler stress tests, such as hackbench. + + If you are paranoid and not sure what the kernel will be + used for, say Y. + + Say N if unsure. + endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION @@ -825,7 +836,7 @@ config CGROUP_PIDS PIDs controller is designed to stop this from happening. It should be noted that organisational operations (such as attaching - to a cgroup hierarchy will *not* be blocked by the PIDs controller), + to a cgroup hierarchy) will *not* be blocked by the PIDs controller, since the PIDs limit only affects a process's ability to fork, not to attach to a cgroup. diff --git a/kernel/exit.c b/kernel/exit.c @@ -558,12 +558,14 @@ static struct task_struct *find_alive_thread(struct task_struct *p) return NULL; } -static struct task_struct *find_child_reaper(struct task_struct *father) +static struct task_struct *find_child_reaper(struct task_struct *father, + struct list_head *dead) __releases(&tasklist_lock) __acquires(&tasklist_lock) { struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *reaper = pid_ns->child_reaper; + struct task_struct *p, *n; if (likely(reaper != father)) return reaper; @@ -579,6 +581,12 @@ static struct task_struct *find_child_reaper(struct task_struct *father) panic("Attempted to kill init! exitcode=0x%08x\n", father->signal->group_exit_code ?: father->exit_code); } + + list_for_each_entry_safe(p, n, dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } + zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); @@ -668,7 +676,7 @@ static void forget_original_parent(struct task_struct *father, exit_ptrace(father, dead); /* Can drop and reacquire tasklist_lock */ - reaper = find_child_reaper(father); + reaper = find_child_reaper(father, dead); if (list_empty(&father->children)) return; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c @@ -124,6 +124,7 @@ * sampling of the aggregate task states would be. */ +#include "../workqueue_internal.h" #include <linux/sched/loadavg.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> @@ -480,9 +481,6 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc->tasks[t]++; write_seqcount_end(&groupc->seq); - - if (!delayed_work_pending(&group->clock_work)) - schedule_delayed_work(&group->clock_work, PSI_FREQ); } static struct psi_group *iterate_groups(struct task_struct *task, void **iter) @@ -513,6 +511,7 @@ void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); struct psi_group *group; + bool wake_clock = true; void *iter = NULL; if (!task->pid) @@ -530,8 +529,22 @@ void psi_task_change(struct task_struct *task, int clear, int set) task->psi_flags &= ~clear; task->psi_flags |= set; - while ((group = iterate_groups(task, &iter))) + /* + * Periodic aggregation shuts off if there is a period of no + * task changes, so we wake it back up if necessary. However, + * don't do this if the task change is the aggregation worker + * itself going to sleep, or we'll ping-pong forever. + */ + if (unlikely((clear & TSK_RUNNING) && + (task->flags & PF_WQ_WORKER) && + wq_worker_last_func(task) == psi_update_work)) + wake_clock = false; + + while ((group = iterate_groups(task, &iter))) { psi_group_change(group, cpu, clear, set); + if (wake_clock && !delayed_work_pending(&group->clock_work)) + schedule_delayed_work(&group->clock_work, PSI_FREQ); + } } void psi_memstall_tick(struct task_struct *task, int cpu) diff --git a/kernel/workqueue.c b/kernel/workqueue.c @@ -910,6 +910,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) } /** + * wq_worker_last_func - retrieve worker's last work function + * + * Determine the last function a worker executed. This is called from + * the scheduler to get a worker's last known identity. + * + * CONTEXT: + * spin_lock_irq(rq->lock) + * + * Return: + * The last work function %current executed as a worker, NULL if it + * hasn't executed any work yet. + */ +work_func_t wq_worker_last_func(struct task_struct *task) +{ + struct worker *worker = kthread_data(task); + + return worker->last_func; +} + +/** * worker_set_flags - set worker flags and adjust nr_running accordingly * @worker: self * @flags: flags to set @@ -2184,6 +2204,9 @@ __acquires(&pool->lock) if (unlikely(cpu_intensive)) worker_clr_flags(worker, WORKER_CPU_INTENSIVE); + /* tag the worker for identification in schedule() */ + worker->last_func = worker->current_func; + /* we're done with it, release */ hash_del(&worker->hentry); worker->current_work = NULL; diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h @@ -53,6 +53,9 @@ struct worker { /* used only by rescuers to point to the target workqueue */ struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ + + /* used by the scheduler to determine a worker's last known identity */ + work_func_t last_func; }; /** @@ -67,9 +70,10 @@ static inline struct worker *current_wq_worker(void) /* * Scheduler hooks for concurrency managed workqueue. Only to be used from - * sched/core.c and workqueue.c. + * sched/ and workqueue.c. */ void wq_worker_waking_up(struct task_struct *task, int cpu); struct task_struct *wq_worker_sleeping(struct task_struct *task); +work_func_t wq_worker_last_func(struct task_struct *task); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ diff --git a/lib/test_kmod.c b/lib/test_kmod.c @@ -632,7 +632,7 @@ static void __kmod_config_free(struct test_config *config) config->test_driver = NULL; kfree_const(config->test_fs); - config->test_driver = NULL; + config->test_fs = NULL; } static void kmod_config_free(struct kmod_test_device *test_dev) diff --git a/mm/hugetlb.c b/mm/hugetlb.c @@ -4268,7 +4268,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } if (ret & VM_FAULT_RETRY) { - if (nonblocking) + if (nonblocking && + !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) *nonblocking = 0; *nr_pages = 0; /* diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile @@ -5,6 +5,7 @@ UBSAN_SANITIZE_generic.o := n UBSAN_SANITIZE_tags.o := n KCOV_INSTRUMENT := n +CFLAGS_REMOVE_common.o = -pg CFLAGS_REMOVE_generic.o = -pg # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 diff --git a/mm/memory-failure.c b/mm/memory-failure.c @@ -372,7 +372,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, if (fail || tk->addr_valid == 0) { pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", pfn, tk->tsk->comm, tk->tsk->pid); - force_sig(SIGKILL, tk->tsk); + do_send_sig_info(SIGKILL, SEND_SIG_PRIV, + tk->tsk, PIDTYPE_PID); } /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c @@ -1233,7 +1233,8 @@ static bool is_pageblock_removable_nolock(struct page *page) bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { struct page *page = pfn_to_page(start_pfn); - struct page *end_page = page + nr_pages; + unsigned long end_pfn = min(start_pfn + nr_pages, zone_end_pfn(page_zone(page))); + struct page *end_page = pfn_to_page(end_pfn); /* Check the starting page of each pageblock within the range */ for (; page < end_page; page = next_active_pageblock(page)) { @@ -1273,6 +1274,9 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, i++; if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn) continue; + /* Check if we got outside of the zone */ + if (zone && !zone_spans_pfn(zone, pfn + i)) + return 0; page = pfn_to_page(pfn + i); if (zone && page_zone(page) != zone) return 0; @@ -1301,23 +1305,27 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, static unsigned long scan_movable_pages(unsigned long start, unsigned long end) { unsigned long pfn; - struct page *page; + for (pfn = start; pfn < end; pfn++) { - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - if (PageLRU(page)) - return pfn; - if (__PageMovable(page)) - return pfn; - if (PageHuge(page)) { - if (hugepage_migration_supported(page_hstate(page)) && - page_huge_active(page)) - return pfn; - else - pfn = round_up(pfn + 1, - 1 << compound_order(page)) - 1; - } - } + struct page *page, *head; + unsigned long skip; + + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + if (PageLRU(page)) + return pfn; + if (__PageMovable(page)) + return pfn; + + if (!PageHuge(page)) + continue; + head = compound_head(page); + if (hugepage_migration_supported(page_hstate(head)) && + page_huge_active(head)) + return pfn; + skip = (1 << compound_order(head)) - (page - head); + pfn += skip - 1; } return 0; } @@ -1344,7 +1352,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; struct page *page; - int not_managed = 0; int ret = 0; LIST_HEAD(source); @@ -1392,7 +1399,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) else ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); if (!ret) { /* Success */ - put_page(page); list_add_tail(&page->lru, &source); if (!__PageMovable(page)) inc_node_page_state(page, NR_ISOLATED_ANON + @@ -1401,22 +1407,10 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) } else { pr_warn("failed to isolate pfn %lx\n", pfn); dump_page(page, "isolation failed"); - put_page(page); - /* Because we don't have big zone->lock. we should - check this again here. */ - if (page_count(page)) { - not_managed++; - ret = -EBUSY; - break; - } } + put_page(page); } if (!list_empty(&source)) { - if (not_managed) { - putback_movable_pages(&source); - goto out; - } - /* Allocate a new page from the nearest neighbor node */ ret = migrate_pages(&source, new_node_page, NULL, 0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); @@ -1429,7 +1423,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) putback_movable_pages(&source); } } -out: + return ret; } @@ -1576,7 +1570,6 @@ static int __ref __offline_pages(unsigned long start_pfn, we assume this for now. .*/ if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) { - mem_hotplug_done(); ret = -EINVAL; reason = "multizone range"; goto failed_removal; @@ -1591,7 +1584,6 @@ static int __ref __offline_pages(unsigned long start_pfn, MIGRATE_MOVABLE, SKIP_HWPOISON | REPORT_FAILURE); if (ret) { - mem_hotplug_done(); reason = "failure to isolate range"; goto failed_removal; } diff --git a/mm/migrate.c b/mm/migrate.c @@ -709,7 +709,6 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head, /* Simple case, sync compaction */ if (mode != MIGRATE_ASYNC) { do { - get_bh(bh); lock_buffer(bh); bh = bh->b_this_page; @@ -720,18 +719,15 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head, /* async case, we cannot block on lock_buffer so use trylock_buffer */ do { - get_bh(bh); if (!trylock_buffer(bh)) { /* * We failed to lock the buffer and cannot stall in * async migration. Release the taken locks */ struct buffer_head *failed_bh = bh; - put_bh(failed_bh); bh = head; while (bh != failed_bh) { unlock_buffer(bh); - put_bh(bh); bh = bh->b_this_page; } return false; @@ -818,7 +814,6 @@ unlock_buffers: bh = head; do { unlock_buffer(bh); - put_bh(bh); bh = bh->b_this_page; } while (bh != head); @@ -1135,10 +1130,13 @@ out: * If migration is successful, decrease refcount of the newpage * which will not free the page because new page owner increased * refcounter. As well, if it is LRU page, add the page to LRU - * list in here. + * list in here. Use the old state of the isolated source page to + * determine if we migrated a LRU page. newpage was already unlocked + * and possibly modified by its owner - don't rely on the page + * state. */ if (rc == MIGRATEPAGE_SUCCESS) { - if (unlikely(__PageMovable(newpage))) + if (unlikely(!is_lru)) put_page(newpage); else putback_lru_page(newpage); diff --git a/mm/oom_kill.c b/mm/oom_kill.c @@ -647,8 +647,8 @@ static int oom_reaper(void *unused) static void wake_oom_reaper(struct task_struct *tsk) { - /* tsk is already queued? */ - if (tsk == oom_reaper_list || tsk->oom_reaper_list) + /* mm is already queued? */ + if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) return; get_task_struct(tsk); @@ -975,6 +975,13 @@ static void oom_kill_process(struct oom_control *oc, const char *message) * still freeing memory. */ read_lock(&tasklist_lock); + + /* + * The task 'p' might have already exited before reaching here. The + * put_task_struct() will free task_struct 'p' while the loop still try + * to access the field of 'p', so, get an extra reference. + */ + get_task_struct(p); for_each_thread(p, t) { list_for_each_entry(child, &t->children, sibling) { unsigned int child_points; @@ -994,6 +1001,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) } } } + put_task_struct(p); read_unlock(&tasklist_lock); /* diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore @@ -10,4 +10,5 @@ /proc-uptime-002 /read /self +/setns-dcache /thread-self diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile @@ -14,6 +14,7 @@ TEST_GEN_PROGS += proc-uptime-001 TEST_GEN_PROGS += proc-uptime-002 TEST_GEN_PROGS += read TEST_GEN_PROGS += self +TEST_GEN_PROGS += setns-dcache TEST_GEN_PROGS += thread-self include ../lib.mk diff --git a/tools/testing/selftests/proc/setns-dcache.c b/tools/testing/selftests/proc/setns-dcache.c @@ -0,0 +1,129 @@ +/* + * Copyright © 2019 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* + * Test that setns(CLONE_NEWNET) points to new /proc/net content even + * if old one is in dcache. + * + * FIXME /proc/net/unix is under CONFIG_UNIX which can be disabled. + */ +#undef NDEBUG +#include <assert.h> +#include <errno.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/socket.h> + +static pid_t pid = -1; + +static void f(void) +{ + if (pid > 0) { + kill(pid, SIGTERM); + } +} + +int main(void) +{ + int fd[2]; + char _ = 0; + int nsfd; + + atexit(f); + + /* Check for priviledges and syscall availability straight away. */ + if (unshare(CLONE_NEWNET) == -1) { + if (errno == ENOSYS || errno == EPERM) { + return 4; + } + return 1; + } + /* Distinguisher between two otherwise empty net namespaces. */ + if (socket(AF_UNIX, SOCK_STREAM, 0) == -1) { + return 1; + } + + if (pipe(fd) == -1) { + return 1; + } + + pid = fork(); + if (pid == -1) { + return 1; + } + + if (pid == 0) { + if (unshare(CLONE_NEWNET) == -1) { + return 1; + } + + if (write(fd[1], &_, 1) != 1) { + return 1; + } + + pause(); + + return 0; + } + + if (read(fd[0], &_, 1) != 1) { + return 1; + } + + { + char buf[64]; + snprintf(buf, sizeof(buf), "/proc/%u/ns/net", pid); + nsfd = open(buf, O_RDONLY); + if (nsfd == -1) { + return 1; + } + } + + /* Reliably pin dentry into dcache. */ + (void)open("/proc/net/unix", O_RDONLY); + + if (setns(nsfd, CLONE_NEWNET) == -1) { + return 1; + } + + kill(pid, SIGTERM); + pid = 0; + + { + char buf[4096]; + ssize_t rv; + int fd; + + fd = open("/proc/net/unix", O_RDONLY); + if (fd == -1) { + return 1; + } + +#define S "Num RefCount Protocol Flags Type St Inode Path\n" + rv = read(fd, buf, sizeof(buf)); + + assert(rv == strlen(S)); + assert(memcmp(buf, S, strlen(S)) == 0); + } + + return 0; +}