whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit aa5b1054badb60191f6a09e7ef65beacf837c5d4
parent e1dbc5a41051d4791160727829903ec5169c7152
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Fri, 24 Aug 2018 09:34:23 -0700

Merge tag 'powerpc-4.19-2' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux

Pull powerpc fixes from Michael Ellerman:

 - An implementation for the newly added hv_ops->flush() for the OPAL
   hvc console driver backends. I forgot to apply this after merging the
   hvc driver changes before the merge window.

 - Enable all PCI bridges at boot on powernv, to avoid races when
   multiple children of a bridge try to enable it simultaneously. This
   is a workaround until the PCI core can be enhanced to fix the races.

 - A fix to query PowerVM for the correct system topology at boot before
   initialising sched domains, seen in some configurations to cause
   broken scheduling etc.

 - A fix for pte_access_permitted() on "nohash" platforms.

 - Two commits to fix SIGBUS when using remap_pfn_range() seen on Power9
   due to a workaround when using the nest MMU (GPUs, accelerators).

 - Another fix to the VFIO code used by KVM; the previous fix had some
   bugs which caused guests to not start in some configurations.

 - A handful of other minor fixes.

Thanks to: Aneesh Kumar K.V, Benjamin Herrenschmidt, Christophe Leroy,
Hari Bathini, Luke Dashjr, Mahesh Salgaonkar, Nicholas Piggin, Paul
Mackerras, Srikar Dronamraju.

* tag 'powerpc-4.19-2' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux:
  powerpc/mce: Fix SLB rebolting during MCE recovery path.
  KVM: PPC: Book3S: Fix guest DMA when guest partially backed by THP pages
  powerpc/mm/radix: Only need the Nest MMU workaround for R -> RW transition
  powerpc/mm/books3s: Add new pte bit to mark pte temporarily invalid.
  powerpc/nohash: fix pte_access_permitted()
  powerpc/topology: Get topology for shared processors at boot
  powerpc64/ftrace: Include ftrace.h needed for enable/disable calls
  powerpc/powernv/pci: Work around races in PCI bridge enabling
  powerpc/fadump: cleanup crash memory ranges support
  powerpc/powernv: provide a console flush operation for opal hvc driver
  powerpc/traps: Avoid rate limit messages from show unhandled signals
  powerpc/64s: Fix PACA_IRQ_HARD_DIS accounting in idle_power4()

Diffstat:
M arch/powerpc/include/asm/book3s/64/pgtable.h | 18 +++++++++++++++++-
M arch/powerpc/include/asm/nohash/pgtable.h | 9 +++------
M arch/powerpc/include/asm/opal.h | 1 +
M arch/powerpc/include/asm/topology.h | 5 +++++
M arch/powerpc/kernel/fadump.c | 8 +-------
M arch/powerpc/kernel/idle_power4.S | 16 ++++++++++++++--
M arch/powerpc/kernel/smp.c | 5 +++++
M arch/powerpc/kernel/traps.c | 13 ++++++-------
M arch/powerpc/kvm/book3s_hv.c | 1 +
M arch/powerpc/mm/mmu_context_iommu.c | 17 ++++++++++-------
M arch/powerpc/mm/numa.c | 20 ++++++++++----------
M arch/powerpc/mm/pgtable-radix.c | 8 +++++---
M arch/powerpc/mm/slb.c | 2 +-
M arch/powerpc/platforms/powernv/opal.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
M arch/powerpc/platforms/powernv/pci-ioda.c | 37 +++++++++++++++++++++++++++++++++++++
M drivers/tty/hvc/hvc_opal.c | 2 ++
16 files changed, 170 insertions(+), 75 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -44,6 +44,16 @@ #define _PAGE_PTE 0x4000000000000000UL /* distinguishes PTEs from pointers */ #define _PAGE_PRESENT 0x8000000000000000UL /* pte contains a translation */ +/* + * We need to mark a pmd pte invalid while splitting. We can do that by clearing + * the _PAGE_PRESENT bit. But then that will be taken as a swap pte. In order to + * differentiate between two use a SW field when invalidating. + * + * We do that temporary invalidate for regular pte entry in ptep_set_access_flags + * + * This is used only when _PAGE_PRESENT is cleared. + */ +#define _PAGE_INVALID _RPAGE_SW0 /* * Top and bottom bits of RPN which can be used by hash @@ -568,7 +578,13 @@ static inline pte_t pte_clear_savedwrite(pte_t pte) static inline int pte_present(pte_t pte) { - return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT)); + /* + * A pte is considerent present if _PAGE_PRESENT is set. + * We also need to consider the pte present which is marked + * invalid during ptep_set_access_flags. Hence we look for _PAGE_INVALID + * if we find _PAGE_PRESENT cleared. + */ + return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID)); } #ifdef CONFIG_PPC_MEM_KEYS diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h @@ -51,17 +51,14 @@ static inline int pte_present(pte_t pte) #define pte_access_permitted pte_access_permitted static inline bool pte_access_permitted(pte_t pte, bool write) { - unsigned long pteval = pte_val(pte); /* * A read-only access is controlled by _PAGE_USER bit. 
* We have _PAGE_READ set for WRITE and EXECUTE */ - unsigned long need_pte_bits = _PAGE_PRESENT | _PAGE_USER; - - if (write) - need_pte_bits |= _PAGE_WRITE; + if (!pte_present(pte) || !pte_user(pte) || !pte_read(pte)) + return false; - if ((pteval & need_pte_bits) != need_pte_bits) + if (write && !pte_write(pte)) return false; return true; diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h @@ -308,6 +308,7 @@ extern void opal_configure_cores(void); extern int opal_get_chars(uint32_t vtermno, char *buf, int count); extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len); extern int opal_put_chars_atomic(uint32_t vtermno, const char *buf, int total_len); +extern int opal_flush_chars(uint32_t vtermno, bool wait); extern int opal_flush_console(uint32_t vtermno); extern void hvc_opal_init_early(void); diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h @@ -92,6 +92,7 @@ extern int stop_topology_update(void); extern int prrn_is_enabled(void); extern int find_and_online_cpu_nid(int cpu); extern int timed_topology_update(int nsecs); +extern void __init shared_proc_topology_init(void); #else static inline int start_topology_update(void) { @@ -113,6 +114,10 @@ static inline int timed_topology_update(int nsecs) { return 0; } + +#ifdef CONFIG_SMP +static inline void shared_proc_topology_init(void) {} +#endif #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */ #include <asm-generic/topology.h> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c @@ -34,6 +34,7 @@ #include <linux/crash_dump.h> #include <linux/kobject.h> #include <linux/sysfs.h> +#include <linux/slab.h> #include <asm/debugfs.h> #include <asm/page.h> @@ -1019,13 +1020,6 @@ static int fadump_setup_crash_memory_ranges(void) pr_debug("Setup crash memory ranges.\n"); crash_mem_ranges = 0; - /* allocate memory for crash memory ranges for the first time */ - if (!max_crash_mem_ranges) { - ret = 
allocate_crash_memory_ranges(); - if (ret) - return ret; - } - /* * add the first memory chunk (RMA_START through boot_memory_size) as * a separate memory chunk. The reason is, at the time crash firmware diff --git a/arch/powerpc/kernel/idle_power4.S b/arch/powerpc/kernel/idle_power4.S @@ -32,6 +32,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP) cmpwi 0,r4,0 beqlr + /* This sequence is similar to prep_irq_for_idle() */ + /* Hard disable interrupts */ mfmsr r7 rldicl r0,r7,48,1 @@ -41,10 +43,15 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP) /* Check if something happened while soft-disabled */ lbz r0,PACAIRQHAPPENED(r13) cmpwi cr0,r0,0 - bnelr + bne- 2f - /* Soft-enable interrupts */ + /* + * Soft-enable interrupts. This will make power4_fixup_nap return + * to our caller with interrupts enabled (soft and hard). The caller + * can cope with either interrupts disabled or enabled upon return. + */ #ifdef CONFIG_TRACE_IRQFLAGS + /* Tell the tracer interrupts are on, because idle responds to them. */ mflr r0 std r0,16(r1) stdu r1,-128(r1) @@ -73,3 +80,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) isync b 1b +2: /* Return if an interrupt had happened while soft disabled */ + /* Set the HARD_DIS flag because interrupts are now hard disabled */ + ori r0,r0,PACA_IRQ_HARD_DIS + stb r0,PACAIRQHAPPENED(r13) + blr diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c @@ -1160,6 +1160,11 @@ void __init smp_cpus_done(unsigned int max_cpus) if (smp_ops && smp_ops->bringup_done) smp_ops->bringup_done(); + /* + * On a shared LPAR, associativity needs to be requested. 
+ * Hence, get numa topology before dumping cpu topology + */ + shared_proc_topology_init(); dump_numa_cpu_topology(); /* diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c @@ -315,22 +315,21 @@ void user_single_step_siginfo(struct task_struct *tsk, info->si_addr = (void __user *)regs->nip; } -static bool show_unhandled_signals_ratelimited(void) +static void show_signal_msg(int signr, struct pt_regs *regs, int code, + unsigned long addr) { static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - return show_unhandled_signals && __ratelimit(&rs); -} -static void show_signal_msg(int signr, struct pt_regs *regs, int code, - unsigned long addr) -{ - if (!show_unhandled_signals_ratelimited()) + if (!show_unhandled_signals) return; if (!unhandled_signal(current, signr)) return; + if (!__ratelimit(&rs)) + return; + pr_info("%s[%d]: %s (%d) at %lx nip %lx lr %lx code %x", current->comm, current->pid, signame(signr), signr, addr, regs->nip, regs->link, code); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c @@ -46,6 +46,7 @@ #include <linux/compiler.h> #include <linux/of.h> +#include <asm/ftrace.h> #include <asm/reg.h> #include <asm/ppc-opcode.h> #include <asm/asm-prototypes.h> diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c @@ -129,6 +129,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries, long i, j, ret = 0, locked_entries = 0; unsigned int pageshift; unsigned long flags; + unsigned long cur_ua; struct page *page = NULL; mutex_lock(&mem_list_mutex); @@ -177,7 +178,8 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries, } for (i = 0; i < entries; ++i) { - if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT), + cur_ua = ua + (i << PAGE_SHIFT); + if (1 != get_user_pages_fast(cur_ua, 1/* pages */, 1/* iswrite */, &page)) { ret = -EFAULT; for (j = 0; j < i; ++j) @@ -196,7 +198,7 @@ long 
mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries, if (is_migrate_cma_page(page)) { if (mm_iommu_move_page_from_cma(page)) goto populate; - if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT), + if (1 != get_user_pages_fast(cur_ua, 1/* pages */, 1/* iswrite */, &page)) { ret = -EFAULT; @@ -210,20 +212,21 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries, } populate: pageshift = PAGE_SHIFT; - if (PageCompound(page)) { + if (mem->pageshift > PAGE_SHIFT && PageCompound(page)) { pte_t *pte; struct page *head = compound_head(page); unsigned int compshift = compound_order(head); + unsigned int pteshift; local_irq_save(flags); /* disables as well */ - pte = find_linux_pte(mm->pgd, ua, NULL, &pageshift); - local_irq_restore(flags); + pte = find_linux_pte(mm->pgd, cur_ua, NULL, &pteshift); /* Double check it is still the same pinned page */ if (pte && pte_page(*pte) == head && - pageshift == compshift) - pageshift = max_t(unsigned int, pageshift, + pteshift == compshift + PAGE_SHIFT) + pageshift = max_t(unsigned int, pteshift, PAGE_SHIFT); + local_irq_restore(flags); } mem->pageshift = min(mem->pageshift, pageshift); mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT; diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c @@ -1078,7 +1078,6 @@ static int prrn_enabled; static void reset_topology_timer(void); static int topology_timer_secs = 1; static int topology_inited; -static int topology_update_needed; /* * Change polling interval for associativity changes. 
@@ -1306,11 +1305,8 @@ int numa_update_cpu_topology(bool cpus_locked) struct device *dev; int weight, new_nid, i = 0; - if (!prrn_enabled && !vphn_enabled) { - if (!topology_inited) - topology_update_needed = 1; + if (!prrn_enabled && !vphn_enabled && topology_inited) return 0; - } weight = cpumask_weight(&cpu_associativity_changes_mask); if (!weight) @@ -1423,7 +1419,6 @@ int numa_update_cpu_topology(bool cpus_locked) out: kfree(updates); - topology_update_needed = 0; return changed; } @@ -1551,6 +1546,15 @@ int prrn_is_enabled(void) return prrn_enabled; } +void __init shared_proc_topology_init(void) +{ + if (lppaca_shared_proc(get_lppaca())) { + bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask), + nr_cpumask_bits); + numa_update_cpu_topology(false); + } +} + static int topology_read(struct seq_file *file, void *v) { if (vphn_enabled || prrn_enabled) @@ -1608,10 +1612,6 @@ static int topology_update_init(void) return -ENOMEM; topology_inited = 1; - if (topology_update_needed) - bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask), - nr_cpumask_bits); - return 0; } device_initcall(topology_update_init); diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c @@ -1045,20 +1045,22 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, struct mm_struct *mm = vma->vm_mm; unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC); + + unsigned long change = pte_val(entry) ^ pte_val(*ptep); /* * To avoid NMMU hang while relaxing access, we need mark * the pte invalid in between. 
*/ - if (atomic_read(&mm->context.copros) > 0) { + if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) { unsigned long old_pte, new_pte; - old_pte = __radix_pte_update(ptep, ~0, 0); + old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID); /* * new value of pte */ new_pte = old_pte | set; radix__flush_tlb_page_psize(mm, address, psize); - __radix_pte_update(ptep, 0, new_pte); + __radix_pte_update(ptep, _PAGE_INVALID, new_pte); } else { __radix_pte_update(ptep, 0, set); /* diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c @@ -70,7 +70,7 @@ static inline void slb_shadow_update(unsigned long ea, int ssize, static inline void slb_shadow_clear(enum slb_index index) { - WRITE_ONCE(get_slb_shadow()->save_area[index].esid, 0); + WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index)); } static inline void create_shadowed_slbe(unsigned long ea, int ssize, diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c @@ -370,12 +370,8 @@ static int __opal_put_chars(uint32_t vtermno, const char *data, int total_len, b olen = cpu_to_be64(total_len); rc = opal_console_write(vtermno, &olen, data); if (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { - if (rc == OPAL_BUSY_EVENT) { - mdelay(OPAL_BUSY_DELAY_MS); + if (rc == OPAL_BUSY_EVENT) opal_poll_events(NULL); - } else if (rc == OPAL_BUSY_EVENT) { - mdelay(OPAL_BUSY_DELAY_MS); - } written = -EAGAIN; goto out; } @@ -401,15 +397,6 @@ out: if (atomic) spin_unlock_irqrestore(&opal_write_lock, flags); - /* In the -EAGAIN case, callers loop, so we have to flush the console - * here in case they have interrupts off (and we don't want to wait - * for async flushing if we can make immediate progress here). If - * necessary the API could be made entirely non-flushing if the - * callers had a ->flush API to use. 
- */ - if (written == -EAGAIN) - opal_flush_console(vtermno); - return written; } @@ -429,40 +416,74 @@ int opal_put_chars_atomic(uint32_t vtermno, const char *data, int total_len) return __opal_put_chars(vtermno, data, total_len, true); } -int opal_flush_console(uint32_t vtermno) +static s64 __opal_flush_console(uint32_t vtermno) { s64 rc; if (!opal_check_token(OPAL_CONSOLE_FLUSH)) { __be64 evt; - WARN_ONCE(1, "opal: OPAL_CONSOLE_FLUSH missing.\n"); /* * If OPAL_CONSOLE_FLUSH is not implemented in the firmware, * the console can still be flushed by calling the polling * function while it has OPAL_EVENT_CONSOLE_OUTPUT events. */ - do { - opal_poll_events(&evt); - } while (be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT); + WARN_ONCE(1, "opal: OPAL_CONSOLE_FLUSH missing.\n"); + + opal_poll_events(&evt); + if (!(be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT)) + return OPAL_SUCCESS; + return OPAL_BUSY; - return OPAL_SUCCESS; + } else { + rc = opal_console_flush(vtermno); + if (rc == OPAL_BUSY_EVENT) { + opal_poll_events(NULL); + rc = OPAL_BUSY; + } + return rc; } - do { - rc = OPAL_BUSY; - while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { - rc = opal_console_flush(vtermno); - if (rc == OPAL_BUSY_EVENT) { - mdelay(OPAL_BUSY_DELAY_MS); - opal_poll_events(NULL); - } else if (rc == OPAL_BUSY) { - mdelay(OPAL_BUSY_DELAY_MS); +} + +/* + * opal_flush_console spins until the console is flushed + */ +int opal_flush_console(uint32_t vtermno) +{ + for (;;) { + s64 rc = __opal_flush_console(vtermno); + + if (rc == OPAL_BUSY || rc == OPAL_PARTIAL) { + mdelay(1); + continue; + } + + return opal_error_code(rc); + } +} + +/* + * opal_flush_chars is an hvc interface that sleeps until the console is + * flushed if wait, otherwise it will return -EBUSY if the console has data, + * -EAGAIN if it has data and some of it was flushed. 
+ */ +int opal_flush_chars(uint32_t vtermno, bool wait) +{ + for (;;) { + s64 rc = __opal_flush_console(vtermno); + + if (rc == OPAL_BUSY || rc == OPAL_PARTIAL) { + if (wait) { + msleep(OPAL_BUSY_DELAY_MS); + continue; } + if (rc == OPAL_PARTIAL) + return -EAGAIN; } - } while (rc == OPAL_PARTIAL); /* More to flush */ - return opal_error_code(rc); + return opal_error_code(rc); + } } static int opal_recover_mce(struct pt_regs *regs, diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3228,12 +3228,49 @@ static void pnv_pci_ioda_create_dbgfs(void) #endif /* CONFIG_DEBUG_FS */ } +static void pnv_pci_enable_bridge(struct pci_bus *bus) +{ + struct pci_dev *dev = bus->self; + struct pci_bus *child; + + /* Empty bus ? bail */ + if (list_empty(&bus->devices)) + return; + + /* + * If there's a bridge associated with that bus enable it. This works + * around races in the generic code if the enabling is done during + * parallel probing. This can be removed once those races have been + * fixed. 
+ */ + if (dev) { + int rc = pci_enable_device(dev); + if (rc) + pci_err(dev, "Error enabling bridge (%d)\n", rc); + pci_set_master(dev); + } + + /* Perform the same to child busses */ + list_for_each_entry(child, &bus->children, node) + pnv_pci_enable_bridge(child); +} + +static void pnv_pci_enable_bridges(void) +{ + struct pci_controller *hose; + + list_for_each_entry(hose, &hose_list, list_node) + pnv_pci_enable_bridge(hose->bus); +} + static void pnv_pci_ioda_fixup(void) { pnv_pci_ioda_setup_PEs(); pnv_pci_ioda_setup_iommu_api(); pnv_pci_ioda_create_dbgfs(); + pnv_pci_enable_bridges(); + #ifdef CONFIG_EEH pnv_eeh_post_init(); #endif diff --git a/drivers/tty/hvc/hvc_opal.c b/drivers/tty/hvc/hvc_opal.c @@ -52,6 +52,7 @@ static u32 hvc_opal_boot_termno; static const struct hv_ops hvc_opal_raw_ops = { .get_chars = opal_get_chars, .put_chars = opal_put_chars, + .flush = opal_flush_chars, .notifier_add = notifier_add_irq, .notifier_del = notifier_del_irq, .notifier_hangup = notifier_hangup_irq, @@ -141,6 +142,7 @@ static int hvc_opal_hvsi_tiocmset(struct hvc_struct *hp, unsigned int set, static const struct hv_ops hvc_opal_hvsi_ops = { .get_chars = hvc_opal_hvsi_get_chars, .put_chars = hvc_opal_hvsi_put_chars, + .flush = opal_flush_chars, .notifier_add = hvc_opal_hvsi_open, .notifier_del = hvc_opal_hvsi_close, .notifier_hangup = hvc_opal_hvsi_hangup,