whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 6c3ac1134371b51c9601171af2c32153ccb11100
parent d72cb8c7d9dbd9ce820c80f3fddb56b296ba96fc
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Thu,  7 Mar 2019 12:56:26 -0800

Merge tag 'powerpc-5.1-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux

Pull powerpc updates from Michael Ellerman:
 "Notable changes:

   - Enable THREAD_INFO_IN_TASK to move thread_info off the stack.

   - A big series from Christoph reworking our DMA code to use more of
     the generic infrastructure, as he said:
       "This series switches the powerpc port to use the generic swiotlb
        and noncoherent dma ops, and to use more generic code for the
        coherent direct mapping, as well as removing a lot of dead
        code."

   - Increase our vmalloc space to 512T with the Hash MMU on modern
     CPUs, allowing us to support machines with larger amounts of total
     RAM or distance between nodes.

   - Two series from Christophe, one to optimise TLB miss handlers on
     6xx, and another to optimise the way STRICT_KERNEL_RWX is
     implemented on some 32-bit CPUs.

   - Support for KCOV coverage instrumentation which means we can run
     syzkaller and discover even more bugs in our code.

  And as always many clean-ups, reworks and minor fixes etc.

  Thanks to: Alan Modra, Alexey Kardashevskiy, Alistair Popple, Andrea
  Arcangeli, Andrew Donnellan, Aneesh Kumar K.V, Aravinda Prasad, Balbir
  Singh, Brajeswar Ghosh, Breno Leitao, Christian Lamparter, Christian
  Zigotzky, Christophe Leroy, Christoph Hellwig, Corentin Labbe, Daniel
  Axtens, David Gibson, Diana Craciun, Firoz Khan, Gustavo A. R. Silva,
  Igor Stoppa, Joe Lawrence, Joel Stanley, Jonathan Neuschäfer, Jordan
  Niethe, Laurent Dufour, Madhavan Srinivasan, Mahesh Salgaonkar, Mark
  Cave-Ayland, Masahiro Yamada, Mathieu Malaterre, Matteo Croce, Meelis
  Roos, Michael W. Bringmann, Nathan Chancellor, Nathan Fontenot,
  Nicholas Piggin, Nick Desaulniers, Nicolai Stange, Oliver O'Halloran,
  Paul Mackerras, Peter Xu, PrasannaKumar Muralidharan, Qian Cai,
  Rashmica Gupta, Reza Arbab, Robert P. J. Day, Russell Currey,
  Sabyasachi Gupta, Sam Bobroff, Sandipan Das, Sergey Senozhatsky,
  Souptick Joarder, Stewart Smith, Tyrel Datwyler, Vaibhav Jain,
  YueHaibing"

* tag 'powerpc-5.1-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (200 commits)
  powerpc/32: Clear on-stack exception marker upon exception return
  powerpc: Remove export of save_stack_trace_tsk_reliable()
  powerpc/mm: fix "section_base" set but not used
  powerpc/mm: Fix "sz" set but not used warning
  powerpc/mm: Check secondary hash page table
  powerpc: remove nargs from __SYSCALL
  powerpc/64s: Fix unrelocated interrupt trampoline address test
  powerpc/powernv/ioda: Fix locked_vm counting for memory used by IOMMU tables
  powerpc/fsl: Fix the flush of branch predictor.
  powerpc/powernv: Make opal log only readable by root
  powerpc/xmon: Fix opcode being uninitialized in print_insn_powerpc
  powerpc/powernv: move OPAL call wrapper tracing and interrupt handling to C
  powerpc/64s: Fix data interrupts vs d-side MCE reentrancy
  powerpc/64s: Prepare to handle data interrupts vs d-side MCE reentrancy
  powerpc/64s: system reset interrupt preserve HSRRs
  powerpc/64s: Fix HV NMI vs HV interrupt recoverability test
  powerpc/mm/hash: Handle mmap_min_addr correctly in get_unmapped_area topdown search
  powerpc/hugetlb: Handle mmap_min_addr correctly in get_unmapped_area callback
  selftests/powerpc: Remove duplicate header
  powerpc sstep: Add support for modsd, modud instructions
  ...

Diffstat:
March/powerpc/Kconfig | 79++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
March/powerpc/Kconfig.debug | 4----
March/powerpc/Makefile | 11+++++++++--
March/powerpc/boot/dts/Makefile | 1+
March/powerpc/boot/dts/akebono.dts | 2+-
March/powerpc/boot/dts/bluestone.dts | 2+-
March/powerpc/boot/dts/currituck.dts | 2+-
March/powerpc/boot/dts/iss4xx-mpic.dts | 2+-
March/powerpc/boot/dts/wii.dts | 22++++++++++++++++++++++
March/powerpc/include/asm/asm-prototypes.h | 14++++++--------
March/powerpc/include/asm/book3s/32/mmu-hash.h | 2++
March/powerpc/include/asm/book3s/32/pgtable.h | 11+++++++++++
March/powerpc/include/asm/book3s/64/hash.h | 32+++++++++++++++++++++++---------
March/powerpc/include/asm/book3s/64/mmu-hash.h | 2+-
March/powerpc/include/asm/book3s/64/pgalloc.h | 8++++----
March/powerpc/include/asm/book3s/64/pgtable.h | 16+---------------
March/powerpc/include/asm/book3s/64/tlbflush-radix.h | 30++++++++++++++++++++++++------
March/powerpc/include/asm/checksum.h | 4----
March/powerpc/include/asm/device.h | 10+++++-----
March/powerpc/include/asm/dma-direct.h | 18++++++++----------
March/powerpc/include/asm/dma-mapping.h | 92-------------------------------------------------------------------------------
March/powerpc/include/asm/eeh.h | 10+++++++---
March/powerpc/include/asm/eeh_event.h | 1+
March/powerpc/include/asm/exception-64s.h | 4++--
March/powerpc/include/asm/hvsi.h | 2+-
March/powerpc/include/asm/iommu.h | 17+++++++++++++++++
March/powerpc/include/asm/ipic.h | 3---
March/powerpc/include/asm/irq.h | 18+++++++-----------
March/powerpc/include/asm/kvm_ppc.h | 3++-
March/powerpc/include/asm/livepatch.h | 7++++---
March/powerpc/include/asm/machdep.h | 4+---
March/powerpc/include/asm/mce.h | 2+-
March/powerpc/include/asm/mmu.h | 13+++++++++++++
March/powerpc/include/asm/nmi.h | 2++
March/powerpc/include/asm/nohash/32/mmu-8xx.h | 3++-
March/powerpc/include/asm/page.h | 14++------------
March/powerpc/include/asm/pci-bridge.h | 7++++---
March/powerpc/include/asm/pci.h | 2--
March/powerpc/include/asm/pgtable.h | 1-
March/powerpc/include/asm/powernv.h | 3++-
March/powerpc/include/asm/ppc-opcode.h | 16+++++++++++++++-
March/powerpc/include/asm/ppc-pci.h | 4++--
March/powerpc/include/asm/processor.h | 108++++++++-----------------------------------------------------------------------
March/powerpc/include/asm/ptrace.h | 2+-
March/powerpc/include/asm/reg.h | 9+++++++--
March/powerpc/include/asm/sections.h | 7+++++++
March/powerpc/include/asm/smp.h | 17++++++++++++++++-
March/powerpc/include/asm/swiotlb.h | 5-----
Aarch/powerpc/include/asm/task_size_32.h | 21+++++++++++++++++++++
Aarch/powerpc/include/asm/task_size_64.h | 79+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
March/powerpc/include/asm/thread_info.h | 19-------------------
March/powerpc/include/asm/topology.h | 2++
March/powerpc/kernel/Makefile | 15+++++++++++++--
March/powerpc/kernel/asm-offsets.c | 15+++++++++++----
March/powerpc/kernel/cpu_setup_6xx.S | 4++++
March/powerpc/kernel/dma-iommu.c | 75++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
Aarch/powerpc/kernel/dma-mask.c | 12++++++++++++
March/powerpc/kernel/dma-swiotlb.c | 89-------------------------------------------------------------------------------
Darch/powerpc/kernel/dma.c | 362-------------------------------------------------------------------------------
March/powerpc/kernel/dt_cpu_ftrs.c | 17+++++++----------
March/powerpc/kernel/eeh.c | 190++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
March/powerpc/kernel/eeh_cache.c | 36+++++++++++++++++++++++++++++-------
March/powerpc/kernel/eeh_driver.c | 86+++++++++++++++++++++++++++++++------------------------------------------------
March/powerpc/kernel/eeh_event.c | 16+++++++++++++++-
March/powerpc/kernel/eeh_pe.c | 68+++++++++++++++++++++++++++++---------------------------------------
March/powerpc/kernel/eeh_sysfs.c | 3++-
March/powerpc/kernel/entry_32.S | 97+++++++++++++++++++++++++++++++++----------------------------------------------
March/powerpc/kernel/entry_64.S | 53+++++++++++++++++++++++++++--------------------------
March/powerpc/kernel/epapr_hcalls.S | 5++---
March/powerpc/kernel/exceptions-64e.S | 14++------------
March/powerpc/kernel/exceptions-64s.S | 94++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
March/powerpc/kernel/head_32.S | 160++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
March/powerpc/kernel/head_40x.S | 9++++-----
March/powerpc/kernel/head_44x.S | 8++++----
March/powerpc/kernel/head_64.S | 20+++++++++-----------
March/powerpc/kernel/head_8xx.S | 124+++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
March/powerpc/kernel/head_booke.h | 12+++---------
March/powerpc/kernel/head_fsl_booke.S | 16++++++++--------
March/powerpc/kernel/idle_6xx.S | 8+++-----
March/powerpc/kernel/idle_book3e.S | 2+-
March/powerpc/kernel/idle_e500.S | 8+++-----
March/powerpc/kernel/idle_power4.S | 2+-
March/powerpc/kernel/irq.c | 119++++++++++---------------------------------------------------------------------
March/powerpc/kernel/kgdb.c | 28----------------------------
March/powerpc/kernel/machine_kexec_64.c | 6++----
March/powerpc/kernel/mce.c | 11++++++++---
March/powerpc/kernel/misc_32.S | 17+++++++++--------
March/powerpc/kernel/pci-common.c | 21+++++++++++++--------
March/powerpc/kernel/process.c | 68+++++++++++++++++++++++++++++++++++++++++---------------------------
March/powerpc/kernel/ptrace.c | 18+++++++++++++++---
March/powerpc/kernel/setup-common.c | 5++---
March/powerpc/kernel/setup_32.c | 26++++++++++++++++----------
March/powerpc/kernel/setup_64.c | 51++++++++++-----------------------------------------
March/powerpc/kernel/smp.c | 109++++++++++++++++++++++++++++---------------------------------------------------
March/powerpc/kernel/stacktrace.c | 102+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
March/powerpc/kernel/syscalls.c | 2+-
March/powerpc/kernel/syscalls/syscalltbl.sh | 4++--
March/powerpc/kernel/systbl.S | 6+++---
March/powerpc/kernel/time.c | 1-
March/powerpc/kernel/trace/Makefile | 3++-
March/powerpc/kernel/trace/ftrace_64_mprofile.S | 6+++---
March/powerpc/kernel/traps.c | 133++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
March/powerpc/kernel/udbg.c | 2+-
March/powerpc/kernel/vdso32/Makefile | 1+
March/powerpc/kernel/vdso64/Makefile | 1+
March/powerpc/kernel/vmlinux.lds.S | 14++++++--------
March/powerpc/kvm/Makefile | 5-----
March/powerpc/kvm/book3s.c | 7+++++++
March/powerpc/kvm/book3s_hv.c | 25+++++++++++++++++++++----
March/powerpc/kvm/book3s_hv_hmi.c | 1+
March/powerpc/kvm/book3s_hv_ras.c | 58++++++++++++++--------------------------------------------
March/powerpc/kvm/book3s_hv_rmhandlers.S | 66++++++++++++++++++++----------------------------------------------
March/powerpc/lib/Makefile | 3++-
March/powerpc/lib/sstep.c | 114+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
March/powerpc/lib/test_emulate_step.c | 535++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Aarch/powerpc/lib/test_emulate_step_exec_instr.S | 150+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
March/powerpc/math-emu/Makefile | 2+-
March/powerpc/mm/40x_mmu.c | 2+-
March/powerpc/mm/44x_mmu.c | 2+-
March/powerpc/mm/8xx_mmu.c | 91+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
March/powerpc/mm/Makefile | 15++++++---------
March/powerpc/mm/dma-noncoherent.c | 40+++++++++++++++++++++++++++-------------
Darch/powerpc/mm/dump_hashpagetable.c | 550-------------------------------------------------------------------------------
Darch/powerpc/mm/dump_linuxpagetables-8xx.c | 82-------------------------------------------------------------------------------
Darch/powerpc/mm/dump_linuxpagetables-book3s64.c | 120-------------------------------------------------------------------------------
Darch/powerpc/mm/dump_linuxpagetables-generic.c | 80-------------------------------------------------------------------------------
Darch/powerpc/mm/dump_linuxpagetables.c | 373-------------------------------------------------------------------------------
March/powerpc/mm/fsl_booke_mmu.c | 2+-
March/powerpc/mm/hash_low_32.S | 76+++++++++++++++++++++++++++++++---------------------------------------------
March/powerpc/mm/hash_utils_64.c | 6+++---
March/powerpc/mm/hugetlbpage-hash64.c | 3+--
March/powerpc/mm/hugetlbpage-radix.c | 5+++--
March/powerpc/mm/init_32.c | 6+-----
March/powerpc/mm/init_64.c | 2--
March/powerpc/mm/mem.c | 61++++++-------------------------------------------------------
March/powerpc/mm/mmu_decl.h | 10+++++++++-
March/powerpc/mm/numa.c | 9+--------
March/powerpc/mm/pgtable_32.c | 42++++++++++++++++++++++--------------------
March/powerpc/mm/ppc_mmu_32.c | 186+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
Aarch/powerpc/mm/ptdump/8xx.c | 82+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/powerpc/mm/ptdump/Makefile | 9+++++++++
Rarch/powerpc/mm/dump_bats.c -> arch/powerpc/mm/ptdump/bats.c | 0
Aarch/powerpc/mm/ptdump/book3s64.c | 120+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/powerpc/mm/ptdump/hashpagetable.c | 550+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/powerpc/mm/ptdump/ptdump.c | 379+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Rarch/powerpc/mm/dump_linuxpagetables.h -> arch/powerpc/mm/ptdump/ptdump.h | 0
Rarch/powerpc/mm/dump_sr.c -> arch/powerpc/mm/ptdump/segment_regs.c | 0
Aarch/powerpc/mm/ptdump/shared.c | 80+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
March/powerpc/mm/slb.c | 5+++++
March/powerpc/mm/slice.c | 10++++++----
March/powerpc/mm/tlb_nohash.c | 2+-
March/powerpc/net/bpf_jit32.h | 5++---
March/powerpc/perf/power9-events-list.h | 24++++++++++++++++++++++++
March/powerpc/perf/power9-pmu.c | 4++++
March/powerpc/platforms/44x/Kconfig | 1+
March/powerpc/platforms/44x/ppc476.c | 1+
March/powerpc/platforms/44x/warp.c | 2+-
March/powerpc/platforms/83xx/suspend-asm.S | 34+++++++++++++++++++++++++++-------
March/powerpc/platforms/85xx/corenet_generic.c | 5+----
March/powerpc/platforms/85xx/ge_imp3a.c | 2--
March/powerpc/platforms/85xx/mpc8536_ds.c | 2--
March/powerpc/platforms/85xx/mpc85xx_ds.c | 4----
March/powerpc/platforms/85xx/mpc85xx_mds.c | 4----
March/powerpc/platforms/85xx/p1010rdb.c | 1-
March/powerpc/platforms/85xx/p1022_ds.c | 2--
March/powerpc/platforms/85xx/p1022_rdk.c | 2--
March/powerpc/platforms/85xx/qemu_e500.c | 1+
March/powerpc/platforms/86xx/mpc86xx_hpcn.c | 1-
March/powerpc/platforms/Kconfig.cputype | 9+++++++++
March/powerpc/platforms/cell/iommu.c | 172++++++++++++-------------------------------------------------------------------
March/powerpc/platforms/cell/spu_callbacks.c | 2+-
March/powerpc/platforms/cell/spu_syscalls.c | 1-
March/powerpc/platforms/cell/spufs/file.c | 5++---
March/powerpc/platforms/embedded6xx/wii.c | 24------------------------
March/powerpc/platforms/pasemi/iommu.c | 2+-
March/powerpc/platforms/pasemi/setup.c | 51---------------------------------------------------
March/powerpc/platforms/powernv/Makefile | 5++---
March/powerpc/platforms/powernv/idle.c | 27++-------------------------
March/powerpc/platforms/powernv/npu-dma.c | 16+---------------
Aarch/powerpc/platforms/powernv/opal-call.c | 283+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
March/powerpc/platforms/powernv/opal-msglog.c | 2+-
March/powerpc/platforms/powernv/opal-wrappers.S | 344+++++++++----------------------------------------------------------------------
March/powerpc/platforms/powernv/opal.c | 3+--
March/powerpc/platforms/powernv/pci-ioda-tce.c | 1-
March/powerpc/platforms/powernv/pci-ioda.c | 146+++++++++++++++++++++----------------------------------------------------------
March/powerpc/platforms/powernv/smp.c | 25+++++++++++++++++++++++++
March/powerpc/platforms/ps3/device-init.c | 4+---
March/powerpc/platforms/ps3/os-area.c | 4++--
March/powerpc/platforms/ps3/system-bus.c | 4++--
March/powerpc/platforms/pseries/hotplug-cpu.c | 19+++++++++++++++++++
March/powerpc/platforms/pseries/iommu.c | 99++++++++++++++++++++-----------------------------------------------------------
March/powerpc/platforms/pseries/lparcfg.c | 1+
March/powerpc/platforms/pseries/vio.c | 95++++++++++++++++++++++++++++++++++++-------------------------------------------
March/powerpc/sysdev/6xx-suspend.S | 5++---
March/powerpc/sysdev/dart_iommu.c | 58++++++++++++++++++----------------------------------------
March/powerpc/sysdev/fsl_pci.c | 25+++++++++++++------------
March/powerpc/sysdev/ipic.c | 35-----------------------------------
March/powerpc/sysdev/tsi108_dev.c | 2+-
March/powerpc/sysdev/xive/common.c | 2+-
March/powerpc/xmon/Makefile | 1+
March/powerpc/xmon/ppc-dis.c | 2+-
March/powerpc/xmon/xmon.c | 2+-
Mdrivers/misc/cxl/guest.c | 2++
Mdrivers/misc/cxl/pci.c | 39++++++++++++++++++++++++++++++---------
Mdrivers/misc/cxl/vphb.c | 3+--
Mdrivers/net/ethernet/pasemi/pasemi_mac.c | 1+
Mdrivers/vfio/vfio_spapr_eeh.c | 6+++---
Minclude/linux/swiotlb.h | 3---
Mkernel/dma/Kconfig | 3+++
Mkernel/dma/direct.c | 3+--
Mkernel/dma/mapping.c | 11+++++++----
Mkernel/dma/swiotlb.c | 12------------
Mkernel/resource.c | 4----
Mtools/testing/selftests/powerpc/benchmarks/null_syscall.c | 2+-
Mtools/testing/selftests/powerpc/include/reg.h | 8++++++++
Mtools/testing/selftests/powerpc/include/utils.h | 2++
Mtools/testing/selftests/powerpc/pmu/ebb/fork_cleanup_test.c | 1-
Mtools/testing/selftests/powerpc/tm/.gitignore | 1+
Mtools/testing/selftests/powerpc/tm/Makefile | 4+++-
Atools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c | 184+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtools/testing/selftests/vm/map_hugetlb.c | 29+++++++++++++++++++++++++++--
221 files changed, 4813 insertions(+), 4185 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig @@ -119,9 +119,6 @@ config GENERIC_HWEIGHT bool default y -config ARCH_HAS_DMA_SET_COHERENT_MASK - bool - config PPC bool default y @@ -131,10 +128,10 @@ config PPC select ARCH_32BIT_OFF_T if PPC32 select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED - select ARCH_HAS_DMA_SET_COHERENT_MASK select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_KCOV select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PMEM_API if PPC64 select ARCH_HAS_PTE_SPECIAL @@ -203,7 +200,7 @@ config PPC select HAVE_IOREMAP_PROT select HAVE_IRQ_EXIT_ON_IRQ_STACK select HAVE_KERNEL_GZIP - select HAVE_KERNEL_XZ if PPC_BOOK3S + select HAVE_KERNEL_XZ if PPC_BOOK3S || 44x select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE select HAVE_KRETPROBES @@ -222,7 +219,7 @@ config PPC select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if PPC64 && CPU_LITTLE_ENDIAN + select HAVE_RELIABLE_STACKTRACE if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING select HAVE_IRQ_TIME_ACCOUNTING @@ -243,6 +240,7 @@ config PPC select RTC_LIB select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE + select THREAD_INFO_IN_TASK select VIRT_TO_BUS if !PPC64 # # Please keep this list sorted alphabetically. @@ -253,9 +251,6 @@ config PPC_BARRIER_NOSPEC default y depends on PPC_BOOK3S_64 || PPC_FSL_BOOK3E -config GENERIC_CSUM - def_bool n - config EARLY_PRINTK bool default y @@ -475,9 +470,6 @@ config ARCH_CPU_PROBE_RELEASE config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y -config ARCH_HAS_WALK_MEMORY - def_bool y - config ARCH_ENABLE_MEMORY_HOTREMOVE def_bool y @@ -693,7 +685,7 @@ config PPC_16K_PAGES config PPC_64K_PAGES bool "64k page size" - depends on !PPC_FSL_BOOK3E && (44x || PPC_BOOK3S_64 || PPC_BOOK3E_64) + depends on 44x || PPC_BOOK3S_64 select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64 config PPC_256K_PAGES @@ -711,6 +703,13 @@ config PPC_256K_PAGES endchoice +config PPC_PAGE_SHIFT + int + default 18 if PPC_256K_PAGES + default 16 if PPC_64K_PAGES + default 14 if PPC_16K_PAGES + default 12 + config THREAD_SHIFT int "Thread shift" if EXPERT range 13 15 @@ -721,6 +720,59 @@ config THREAD_SHIFT Used to define the stack size. The default is almost always what you want. Only change this if you know what you are doing. +config ETEXT_SHIFT_BOOL + bool "Set custom etext alignment" if STRICT_KERNEL_RWX && \ + (PPC_BOOK3S_32 || PPC_8xx) + depends on ADVANCED_OPTIONS + help + This option allows you to set the kernel end of text alignment. When + RAM is mapped by blocks, the alignment needs to fit the size and + number of possible blocks. The default should be OK for most configs. + + Say N here unless you know what you are doing. + +config ETEXT_SHIFT + int "_etext shift" if ETEXT_SHIFT_BOOL + range 17 28 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 + range 19 23 if STRICT_KERNEL_RWX && PPC_8xx + default 17 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 + default 19 if STRICT_KERNEL_RWX && PPC_8xx + default PPC_PAGE_SHIFT + help + On Book3S 32 (603+), IBATs are used to map kernel text. + Smaller is the alignment, greater is the number of necessary IBATs. + + On 8xx, large pages (512kb or 8M) are used to map kernel linear + memory. Aligning to 8M reduces TLB misses as only 8M pages are used + in that case. + +config DATA_SHIFT_BOOL + bool "Set custom data alignment" if STRICT_KERNEL_RWX && \ + (PPC_BOOK3S_32 || PPC_8xx) + depends on ADVANCED_OPTIONS + help + This option allows you to set the kernel data alignment. When + RAM is mapped by blocks, the alignment needs to fit the size and + number of possible blocks. The default should be OK for most configs. + + Say N here unless you know what you are doing. + +config DATA_SHIFT + int "Data shift" if DATA_SHIFT_BOOL + default 24 if STRICT_KERNEL_RWX && PPC64 + range 17 28 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 + range 19 23 if STRICT_KERNEL_RWX && PPC_8xx + default 22 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 + default 23 if STRICT_KERNEL_RWX && PPC_8xx + default PPC_PAGE_SHIFT + help + On Book3S 32 (603+), DBATs are used to map kernel text and rodata RO. + Smaller is the alignment, greater is the number of necessary DBATs. + + On 8xx, large pages (512kb or 8M) are used to map kernel linear + memory. Aligning to 8M reduces TLB misses as only 8M pages are used + in that case. + config FORCE_MAX_ZONEORDER int "Maximum zone order" range 8 9 if PPC64 && PPC_64K_PAGES @@ -887,6 +939,7 @@ config FSL_SOC config FSL_PCI bool + select ARCH_HAS_DMA_SET_MASK select PPC_INDIRECT_PCI select PCI_QUIRKS diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug @@ -361,10 +361,6 @@ config PPC_PTDUMP If you are unsure, say N. -config PPC_HTDUMP - def_bool y - depends on PPC_PTDUMP && PPC_BOOK3S_64 - config PPC_FAST_ENDIAN_SWITCH bool "Deprecated fast endian-switch syscall" depends on DEBUG_KERNEL && PPC_BOOK3S_64 diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile @@ -213,9 +213,9 @@ endif asinstr := $(call as-instr,lis 9$(comma)foo@high,-DHAVE_AS_ATHIGH=1) KBUILD_CPPFLAGS += -Iarch/$(ARCH) $(asinstr) -KBUILD_AFLAGS += -Iarch/$(ARCH) $(AFLAGS-y) +KBUILD_AFLAGS += $(AFLAGS-y) KBUILD_CFLAGS += $(call cc-option,-msoft-float) -KBUILD_CFLAGS += -pipe -Iarch/$(ARCH) $(CFLAGS-y) +KBUILD_CFLAGS += -pipe $(CFLAGS-y) CPP = $(CC) -E $(KBUILD_CFLAGS) CHECKFLAGS += -m$(BITS) -D__powerpc__ -D__powerpc$(BITS)__ @@ -427,6 +427,13 @@ else endif endif +ifdef CONFIG_SMP +prepare: task_cpu_prepare + +task_cpu_prepare: prepare0 + $(eval KBUILD_CFLAGS += -D_TASK_CPU=$(shell awk '{if ($$2 == "TASK_CPU") print $$3;}' include/generated/asm-offsets.h)) +endif + # Check toolchain versions: # - gcc-4.6 is the minimum kernel-wide version so nothing required. checkbin: diff --git a/arch/powerpc/boot/dts/Makefile b/arch/powerpc/boot/dts/Makefile @@ -4,3 +4,4 @@ subdir-y += fsl dtstree := $(srctree)/$(src) dtb-$(CONFIG_OF_ALL_DTBS) := $(patsubst $(dtstree)/%.dts,%.dtb, $(wildcard $(dtstree)/*.dts)) +dtb-$(CONFIG_XILINX_VIRTEX440_GENERIC_BOARD) += virtex440-ml507.dtb virtex440-ml510.dtb diff --git a/arch/powerpc/boot/dts/akebono.dts b/arch/powerpc/boot/dts/akebono.dts @@ -40,7 +40,7 @@ d-cache-size = <32768>; dcr-controller; dcr-access-method = "native"; - status = "ok"; + status = "okay"; }; cpu@1 { device_type = "cpu"; diff --git a/arch/powerpc/boot/dts/bluestone.dts b/arch/powerpc/boot/dts/bluestone.dts @@ -109,7 +109,7 @@ OCM: ocm@400040000 { compatible = "ibm,ocm"; - status = "ok"; + status = "okay"; cell-index = <1>; /* configured in U-Boot */ reg = <4 0x00040000 0x8000>; /* 32K */ diff --git a/arch/powerpc/boot/dts/currituck.dts b/arch/powerpc/boot/dts/currituck.dts @@ -39,7 +39,7 @@ d-cache-size = <32768>; dcr-controller; dcr-access-method = "native"; - status = "ok"; + status = "okay"; }; cpu@1 { device_type = "cpu"; diff --git a/arch/powerpc/boot/dts/iss4xx-mpic.dts b/arch/powerpc/boot/dts/iss4xx-mpic.dts @@ -43,7 +43,7 @@ d-cache-size = <32768>; dcr-controller; dcr-access-method = "native"; - status = "ok"; + status = "okay"; }; cpu@1 { device_type = "cpu"; diff --git a/arch/powerpc/boot/dts/wii.dts b/arch/powerpc/boot/dts/wii.dts @@ -14,6 +14,7 @@ /dts-v1/; #include <dt-bindings/gpio/gpio.h> +#include <dt-bindings/input/input.h> /* * This is commented-out for now. @@ -187,6 +188,11 @@ "DEBUG0", "DEBUG1", "DEBUG2", "DEBUG3", "DEBUG4", "DEBUG5", "DEBUG6", "DEBUG7"; + interrupt-controller; + #interrupt-cells = <2>; + interrupts = <10>; + interrupt-parent = <&PIC1>; + /* * This is commented out while a standard binding * for i2c over gpio is defined. @@ -235,5 +241,21 @@ panic-indicator; }; }; + + gpio-keys { + compatible = "gpio-keys"; + + power { + label = "Power Button"; + gpios = <&GPIO 0 GPIO_ACTIVE_HIGH>; + linux,code = <KEY_POWER>; + }; + + eject { + label = "Eject Button"; + gpios = <&GPIO 6 GPIO_ACTIVE_HIGH>; + linux,code = <KEY_EJECTCD>; + }; + }; }; diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h @@ -23,8 +23,8 @@ #include <uapi/asm/ucontext.h> /* SMP */ -extern struct thread_info *current_set[NR_CPUS]; -extern struct thread_info *secondary_ti; +extern struct task_struct *current_set[NR_CPUS]; +extern struct task_struct *secondary_current; void start_secondary(void *unused); /* kexec */ @@ -37,13 +37,11 @@ void kexec_copy_flush(struct kimage *image); extern struct static_key hcall_tracepoint_key; void __trace_hcall_entry(unsigned long opcode, unsigned long *args); void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf); -/* OPAL tracing */ -#ifdef CONFIG_JUMP_LABEL -extern struct static_key opal_tracepoint_key; -#endif -void __trace_opal_entry(unsigned long opcode, unsigned long *args); -void __trace_opal_exit(long opcode, unsigned long retval); +/* OPAL */ +int64_t __opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3, + int64_t a4, int64_t a5, int64_t a6, int64_t a7, + int64_t opcode, uint64_t msr); /* VMX copying */ int enter_vmx_usercopy(void); diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -92,6 +92,8 @@ typedef struct { unsigned long vdso_base; } mm_context_t; +void update_bats(void); + /* patch sites */ extern s32 patch__hash_page_A0, patch__hash_page_A1, patch__hash_page_A2; extern s32 patch__hash_page_B, patch__hash_page_C; diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -174,7 +174,18 @@ static inline bool pte_user(pte_t pte) * of RAM. -- Cort */ #define VMALLOC_OFFSET (0x1000000) /* 16M */ + +/* + * With CONFIG_STRICT_KERNEL_RWX, kernel segments are set NX. But when modules + * are used, NX cannot be set on VMALLOC space. So vmalloc VM space and linear + * memory shall not share segments. + */ +#if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_MODULES) +#define VMALLOC_START ((_ALIGN((long)high_memory, 256L << 20) + VMALLOC_OFFSET) & \ + ~(VMALLOC_OFFSET - 1)) +#else #define VMALLOC_START ((((long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))) +#endif #define VMALLOC_END ioremap_bot #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h @@ -40,22 +40,36 @@ #else #define H_PUD_CACHE_INDEX (H_PUD_INDEX_SIZE) #endif + /* - * Define the address range of the kernel non-linear virtual area + * Define the address range of the kernel non-linear virtual area. In contrast + * to the linear mapping, this is managed using the kernel page tables and then + * inserted into the hash page table to actually take effect, similarly to user + * mappings. */ #define H_KERN_VIRT_START ASM_CONST(0xD000000000000000) -#define H_KERN_VIRT_SIZE ASM_CONST(0x0000400000000000) /* 64T */ /* - * The vmalloc space starts at the beginning of that region, and - * occupies half of it on hash CPUs and a quarter of it on Book3E - * (we keep a quarter for the virtual memmap) + * Allow virtual mapping of one context size. + * 512TB for 64K page size + * 64TB for 4K page size + */ +#define H_KERN_VIRT_SIZE (1UL << MAX_EA_BITS_PER_CONTEXT) + +/* + * 8TB IO mapping size + */ +#define H_KERN_IO_SIZE ASM_CONST(0x80000000000) /* 8T */ + +/* + * The vmalloc space starts at the beginning of the kernel non-linear virtual + * region, and occupies 504T (64K) or 56T (4K) */ -#define H_VMALLOC_START H_KERN_VIRT_START -#define H_VMALLOC_SIZE ASM_CONST(0x380000000000) /* 56T */ -#define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) +#define H_VMALLOC_START H_KERN_VIRT_START +#define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE - H_KERN_IO_SIZE) +#define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) -#define H_KERN_IO_START H_VMALLOC_END +#define H_KERN_IO_START H_VMALLOC_END /* * Region IDs diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -23,7 +23,7 @@ */ #include <asm/book3s/64/pgtable.h> #include <asm/bug.h> -#include <asm/processor.h> +#include <asm/task_size_64.h> #include <asm/cpu_has_feature.h> /* diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -111,7 +111,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) { - pgd_set(pgd, __pgtable_ptr_val(pud) | PGD_VAL_BITS); + *pgd = __pgd(__pgtable_ptr_val(pud) | PGD_VAL_BITS); } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) @@ -138,7 +138,7 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { - pud_set(pud, __pgtable_ptr_val(pmd) | PUD_VAL_BITS); + *pud = __pud(__pgtable_ptr_val(pmd) | PUD_VAL_BITS); } static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, @@ -176,13 +176,13 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { - pmd_set(pmd, __pgtable_ptr_val(pte) | PMD_VAL_BITS); + *pmd = __pmd(__pgtable_ptr_val(pte) | PMD_VAL_BITS); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_page) { - pmd_set(pmd, __pgtable_ptr_val(pte_page) | PMD_VAL_BITS); + *pmd = __pmd(__pgtable_ptr_val(pte_page) | PMD_VAL_BITS); } static inline pgtable_t pmd_pgtable(pmd_t pmd) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -811,7 +811,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, return hash__set_pte_at(mm, addr, ptep, pte, percpu); } -#define _PAGE_CACHE_CTL (_PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT) +#define _PAGE_CACHE_CTL (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT) #define pgprot_noncached pgprot_noncached static inline pgprot_t pgprot_noncached(pgprot_t prot) @@ -851,11 +851,6 @@ static inline bool pte_ci(pte_t pte) return false; } -static inline void pmd_set(pmd_t *pmdp, unsigned long val) -{ - *pmdp = __pmd(val); -} - static inline void pmd_clear(pmd_t *pmdp) { *pmdp = __pmd(0); @@ -887,11 +882,6 @@ static inline int pmd_bad(pmd_t pmd) return hash__pmd_bad(pmd); } -static inline void pud_set(pud_t *pudp, unsigned long val) -{ - *pudp = __pud(val); -} - static inline void pud_clear(pud_t *pudp) { *pudp = __pud(0); @@ -934,10 +924,6 @@ static inline bool pud_access_permitted(pud_t pud, bool write) } #define pgd_write(pgd) pte_write(pgd_pte(pgd)) -static inline void pgd_set(pgd_t *pgdp, unsigned long val) -{ - *pgdp = __pgd(val); -} static inline void pgd_clear(pgd_t *pgdp) { diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -13,8 +13,32 @@ static inline int mmu_get_ap(int psize) #ifdef CONFIG_PPC_RADIX_MMU extern void radix__tlbiel_all(unsigned int action); +extern void radix__flush_tlb_lpid_page(unsigned int lpid, + unsigned long addr, + unsigned long page_size); +extern void radix__flush_pwc_lpid(unsigned int lpid); +extern void radix__flush_tlb_lpid(unsigned int lpid); +extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid); #else static inline void radix__tlbiel_all(unsigned int action) { WARN_ON(1); }; +static inline void radix__flush_tlb_lpid_page(unsigned int lpid, + unsigned long addr, + unsigned long page_size) +{ + WARN_ON(1); +} +static inline void radix__flush_pwc_lpid(unsigned int lpid) +{ + WARN_ON(1); +} +static inline void radix__flush_tlb_lpid(unsigned int lpid) +{ + WARN_ON(1); +} +static inline void radix__local_flush_tlb_lpid_guest(unsigned int lpid) +{ + WARN_ON(1); +} #endif extern void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, @@ -49,12 +73,6 @@ extern void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr); extern void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr); extern void radix__flush_tlb_all(void); -extern void radix__flush_tlb_lpid_page(unsigned int lpid, - unsigned long addr, - unsigned long page_size); -extern void radix__flush_pwc_lpid(unsigned int lpid); -extern void radix__flush_tlb_lpid(unsigned int lpid); extern void radix__local_flush_tlb_lpid(unsigned int lpid); -extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid); #endif diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h @@ -9,9 +9,6 @@ * 2 of the License, or (at your option) any later version. */ -#ifdef CONFIG_GENERIC_CSUM -#include <asm-generic/checksum.h> -#else #include <linux/bitops.h> #include <linux/in6.h> /* @@ -217,6 +214,5 @@ __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __u32 len, __u8 proto, __wsum sum); -#endif #endif /* __KERNEL__ */ #endif diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h @@ -20,6 +20,11 @@ struct iommu_table; */ struct dev_archdata { /* + * Set to %true if the dma_iommu_ops are requested to use a direct + * window instead of dynamically mapping memory. + */ + bool iommu_bypass : 1; + /* * These two used to be a union. However, with the hybrid ops we need * both so here we store both a DMA offset for direct mappings and * an iommu_table for remapped DMA. @@ -33,9 +38,6 @@ struct dev_archdata { #ifdef CONFIG_IOMMU_API void *iommu_domain; #endif -#ifdef CONFIG_SWIOTLB - dma_addr_t max_direct_dma_addr; -#endif #ifdef CONFIG_PPC64 struct pci_dn *pci_data; #endif @@ -54,6 +56,4 @@ struct pdev_archdata { u64 dma_mask; }; -#define ARCH_HAS_DMA_GET_REQUIRED_MASK - #endif /* _ASM_POWERPC_DEVICE_H */ diff --git a/arch/powerpc/include/asm/dma-direct.h b/arch/powerpc/include/asm/dma-direct.h @@ -4,26 +4,24 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { -#ifdef CONFIG_SWIOTLB - struct dev_archdata *sd = &dev->archdata; - - if (sd->max_direct_dma_addr && addr + size > sd->max_direct_dma_addr) - return false; -#endif - if (!dev->dma_mask) return false; - return addr + size - 1 <= *dev->dma_mask; + return addr + size - 1 <= + min_not_zero(*dev->dma_mask, dev->bus_dma_mask); } static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) { - return paddr + get_dma_offset(dev); + if (!dev) + return paddr + PCI_DRAM_OFFSET; + return paddr + dev->archdata.dma_offset; } static inline phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t daddr) { - return daddr - get_dma_offset(dev); + if (!dev) + return daddr - PCI_DRAM_OFFSET; + return daddr - dev->archdata.dma_offset; } #endif /* ASM_POWERPC_DMA_DIRECT_H */ diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h @@ -1,74 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2004 IBM - * - * Implements the generic device dma API for powerpc. - * the pci and vio busses */ #ifndef _ASM_DMA_MAPPING_H #define _ASM_DMA_MAPPING_H -#ifdef __KERNEL__ - -#include <linux/types.h> -#include <linux/cache.h> -/* need struct page definitions */ -#include <linux/mm.h> -#include <linux/scatterlist.h> -#include <linux/dma-debug.h> -#include <asm/io.h> -#include <asm/swiotlb.h> - -/* Some dma direct funcs must be visible for use in other dma_ops */ -extern void *__dma_nommu_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - unsigned long attrs); -extern void __dma_nommu_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - unsigned long attrs); -extern int dma_nommu_mmap_coherent(struct device *dev, - struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t handle, - size_t size, unsigned long attrs); - -#ifdef CONFIG_NOT_COHERENT_CACHE -/* - * DMA-consistent mapping functions for PowerPCs that don't support - * cache snooping. These allocate/free a region of uncached mapped - * memory space for use with DMA devices. Alternatively, you could - * allocate the space "normally" and use the cache management functions - * to ensure it is consistent. - */ -struct device; -extern void __dma_sync(void *vaddr, size_t size, int direction); -extern void __dma_sync_page(struct page *page, unsigned long offset, - size_t size, int direction); -extern unsigned long __dma_get_coherent_pfn(unsigned long cpu_addr); - -#else /* ! CONFIG_NOT_COHERENT_CACHE */ -/* - * Cache coherent cores. - */ - -#define __dma_sync(addr, size, rw) ((void)0) -#define __dma_sync_page(pg, off, sz, rw) ((void)0) - -#endif /* ! CONFIG_NOT_COHERENT_CACHE */ - -static inline unsigned long device_to_mask(struct device *dev) -{ - if (dev->dma_mask && *dev->dma_mask) - return *dev->dma_mask; - /* Assume devices without mask can take 32 bit addresses */ - return 0xfffffffful; -} - -/* - * Available generic sets of operations - */ -#ifdef CONFIG_PPC64 -extern struct dma_map_ops dma_iommu_ops; -#endif -extern const struct dma_map_ops dma_nommu_ops; static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { @@ -80,31 +15,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return NULL; } -/* - * get_dma_offset() - * - * Get the dma offset on configurations where the dma address can be determined - * from the physical address by looking at a simple offset. Direct dma and - * swiotlb use this function, but it is typically not used by implementations - * with an iommu. - */ -static inline dma_addr_t get_dma_offset(struct device *dev) -{ - if (dev) - return dev->archdata.dma_offset; - - return PCI_DRAM_OFFSET; -} - -static inline void set_dma_offset(struct device *dev, dma_addr_t off) -{ - if (dev) - dev->archdata.dma_offset = off; -} - -#define HAVE_ARCH_DMA_SET_MASK 1 - -extern u64 __dma_get_required_mask(struct device *dev); - -#endif /* __KERNEL__ */ #endif /* _ASM_DMA_MAPPING_H */ diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h @@ -219,7 +219,8 @@ struct eeh_ops { }; extern int eeh_subsystem_flags; -extern int eeh_max_freezes; +extern u32 eeh_max_freezes; +extern bool eeh_debugfs_no_recover; extern struct eeh_ops *eeh_ops; extern raw_spinlock_t confirm_error_lock; @@ -293,14 +294,14 @@ void eeh_add_device_late(struct pci_dev *); void eeh_add_device_tree_late(struct pci_bus *); void eeh_add_sysfs_files(struct pci_bus *); void eeh_remove_device(struct pci_dev *); -int eeh_unfreeze_pe(struct eeh_pe *pe, bool sw_state); +int eeh_unfreeze_pe(struct eeh_pe *pe); int eeh_pe_reset_and_recover(struct eeh_pe *pe); int eeh_dev_open(struct pci_dev *pdev); void eeh_dev_release(struct pci_dev *pdev); struct eeh_pe *eeh_iommu_group_to_pe(struct iommu_group *group); int eeh_pe_set_option(struct eeh_pe *pe, int option); int eeh_pe_get_state(struct eeh_pe *pe); -int eeh_pe_reset(struct eeh_pe *pe, int option); +int eeh_pe_reset(struct eeh_pe *pe, int option, bool include_passed); int eeh_pe_configure(struct eeh_pe *pe); int eeh_pe_inject_err(struct eeh_pe *pe, int type, int func, unsigned long addr, unsigned long mask); @@ -460,6 +461,9 @@ static inline void eeh_readsl(const volatile void __iomem *addr, void * buf, eeh_check_failure(addr); } + +void eeh_cache_debugfs_init(void); + #endif /* CONFIG_PPC64 */ #endif /* __KERNEL__ */ #endif /* _POWERPC_EEH_H */ diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h @@ -33,6 +33,7 @@ struct eeh_event { int eeh_event_init(void); int eeh_send_failure_event(struct eeh_pe *pe); +int __eeh_send_failure_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force); void eeh_handle_normal_event(struct eeh_pe *pe); void eeh_handle_special_event(void); diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h @@ -671,7 +671,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define RUNLATCH_ON \ BEGIN_FTR_SECTION \ - CURRENT_THREAD_INFO(r3, r1); \ + ld r3, PACA_THREAD_INFO(r13); \ ld r4,TI_LOCAL_FLAGS(r3); \ andi. r0,r4,_TLF_RUNLATCH; \ beql ppc64_runlatch_on_trampoline; \ @@ -721,7 +721,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CTRL) #ifdef CONFIG_PPC_970_NAP #define FINISH_NAP \ BEGIN_FTR_SECTION \ - CURRENT_THREAD_INFO(r11, r1); \ + ld r11, PACA_THREAD_INFO(r13); \ ld r9,TI_LOCAL_FLAGS(r11); \ andi. r10,r9,_TLF_NAPPING; \ bnel power4_fixup_nap; \ diff --git a/arch/powerpc/include/asm/hvsi.h b/arch/powerpc/include/asm/hvsi.h @@ -64,7 +64,7 @@ struct hvsi_priv { unsigned int inbuf_len; /* data in input buffer */ unsigned char inbuf[HVSI_INBUF_SIZE]; unsigned int inbuf_cur; /* Cursor in input buffer */ - unsigned int inbuf_pktlen; /* packet lenght from cursor */ + unsigned int inbuf_pktlen; /* packet length from cursor */ atomic_t seqno; /* packet sequence number */ unsigned int opened:1; /* driver opened */ unsigned int established:1; /* protocol established */ diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h @@ -237,6 +237,7 @@ static inline void iommu_del_device(struct device *dev) } #endif /* !CONFIG_IOMMU_API */ +u64 dma_iommu_get_required_mask(struct device *dev); #else static inline void *get_iommu_table_base(struct device *dev) @@ -318,5 +319,21 @@ extern void iommu_release_ownership(struct iommu_table *tbl); extern enum dma_data_direction iommu_tce_direction(unsigned long tce); extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir); +#ifdef CONFIG_PPC_CELL_NATIVE +extern bool iommu_fixed_is_weak; +#else +#define iommu_fixed_is_weak false +#endif + +extern const struct dma_map_ops dma_iommu_ops; + +static inline unsigned long device_to_mask(struct device *dev) +{ + if (dev->dma_mask && *dev->dma_mask) + return *dev->dma_mask; + /* Assume devices without mask can take 32 bit addresses */ + return 0xfffffffful; +} + #endif /* __KERNEL__ */ #endif /* _ASM_IOMMU_H */ diff --git a/arch/powerpc/include/asm/ipic.h b/arch/powerpc/include/asm/ipic.h @@ -69,10 +69,7 @@ enum ipic_mcp_irq { IPIC_MCP_MU = 7, }; -extern void ipic_set_highest_priority(unsigned int irq); extern void ipic_set_default_priority(void); -extern void ipic_enable_mcp(enum ipic_mcp_irq mcp_irq); -extern void ipic_disable_mcp(enum ipic_mcp_irq mcp_irq); extern u32 ipic_get_mcp_status(void); extern void ipic_clear_mcp_status(u32 mask); diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h @@ -48,23 +48,19 @@ struct pt_regs; * Per-cpu stacks for handling critical, debug and machine check * level interrupts. */ -extern struct thread_info *critirq_ctx[NR_CPUS]; -extern struct thread_info *dbgirq_ctx[NR_CPUS]; -extern struct thread_info *mcheckirq_ctx[NR_CPUS]; -extern void exc_lvl_ctx_init(void); -#else -#define exc_lvl_ctx_init() +extern void *critirq_ctx[NR_CPUS]; +extern void *dbgirq_ctx[NR_CPUS]; +extern void *mcheckirq_ctx[NR_CPUS]; #endif /* * Per-cpu stacks for handling hard and soft interrupts. */ -extern struct thread_info *hardirq_ctx[NR_CPUS]; -extern struct thread_info *softirq_ctx[NR_CPUS]; +extern void *hardirq_ctx[NR_CPUS]; +extern void *softirq_ctx[NR_CPUS]; -extern void irq_ctx_init(void); -extern void call_do_softirq(struct thread_info *tp); -extern void call_do_irq(struct pt_regs *regs, struct thread_info *tp); +void call_do_softirq(void *sp); +void call_do_irq(struct pt_regs *regs, void *sp); extern void do_IRQ(struct pt_regs *regs); extern void __init init_IRQ(void); extern void __do_irq(struct pt_regs *regs); diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h @@ -141,6 +141,7 @@ extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu); extern int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu); extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu); +extern void kvmppc_core_queue_machine_check(struct kvm_vcpu *vcpu, ulong flags); extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags); extern void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu); @@ -632,7 +633,7 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target, unsigned int yield_count); long kvmppc_h_random(struct kvm_vcpu *vcpu); void kvmhv_commence_exit(int trap); -long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu); +void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu); void kvmppc_subcore_enter_guest(void); void kvmppc_subcore_exit_guest(void); long kvmppc_realmode_hmi_handler(void); diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h @@ -21,6 +21,7 @@ #include <linux/module.h> #include <linux/ftrace.h> +#include <linux/sched/task_stack.h> #ifdef CONFIG_LIVEPATCH static inline int klp_check_compiler_support(void) @@ -43,13 +44,13 @@ static inline unsigned long klp_get_ftrace_location(unsigned long faddr) return ftrace_location_range(faddr, faddr + 16); } -static inline void klp_init_thread_info(struct thread_info *ti) +static inline void klp_init_thread_info(struct task_struct *p) { /* + 1 to account for STACK_END_MAGIC */ - ti->livepatch_sp = (unsigned long *)(ti + 1) + 1; + task_thread_info(p)->livepatch_sp = end_of_stack(p) + 1; } #else -static void klp_init_thread_info(struct thread_info *ti) { } +static inline void klp_init_thread_info(struct task_struct *p) { } #endif /* CONFIG_LIVEPATCH */ #endif /* _ASM_POWERPC_LIVEPATCH_H */ diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h @@ -47,9 +47,7 @@ struct machdep_calls { #endif #endif /* CONFIG_PPC64 */ - /* Platform set_dma_mask and dma_get_required_mask overrides */ - int (*dma_set_mask)(struct device *dev, u64 dma_mask); - u64 (*dma_get_required_mask)(struct device *dev); + void (*dma_set_mask)(struct device *dev, u64 dma_mask); int (*probe)(void); void (*setup_arch)(void); /* Optional, may be NULL */ diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h @@ -209,7 +209,7 @@ extern int get_mce_event(struct machine_check_event *mce, bool release); extern void release_mce_event(void); extern void machine_check_queue_event(void); extern void machine_check_print_event_info(struct machine_check_event *evt, - bool user_mode); + bool user_mode, bool in_guest); #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); #endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h @@ -289,6 +289,17 @@ static inline u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address) } #endif /* CONFIG_PPC_MEM_KEYS */ +#ifdef CONFIG_STRICT_KERNEL_RWX +static inline bool strict_kernel_rwx_enabled(void) +{ + return rodata_enabled; +} +#else +static inline bool strict_kernel_rwx_enabled(void) +{ + return false; +} +#endif #endif /* !__ASSEMBLY__ */ /* The kernel use the constants below to index in the page sizes array. @@ -356,6 +367,8 @@ extern void early_init_mmu_secondary(void); extern void setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size); static inline void mmu_early_init_devtree(void) { } + +extern void *abatron_pteptrs[2]; #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h @@ -14,4 +14,6 @@ extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask, #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace #endif +extern void hv_nmi_check_nonrecoverable(struct pt_regs *regs); + #endif /* _ASM_NMI_H */ diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -231,9 +231,10 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) } /* patch sites */ -extern s32 patch__itlbmiss_linmem_top; +extern s32 patch__itlbmiss_linmem_top, patch__itlbmiss_linmem_top8; extern s32 patch__dtlbmiss_linmem_top, patch__dtlbmiss_immr_jmp; extern s32 patch__fixupdar_linmem_top; +extern s32 patch__dtlbmiss_romem_top, patch__dtlbmiss_romem_top8; extern s32 patch__itlbmiss_exit_1, patch__itlbmiss_exit_2; extern s32 patch__dtlbmiss_exit_1, patch__dtlbmiss_exit_2, patch__dtlbmiss_exit_3; diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h @@ -20,20 +20,11 @@ /* * On regular PPC32 page size is 4K (but we support 4K/16K/64K/256K pages - * on PPC44x). For PPC64 we support either 4K or 64K software + * on PPC44x and 4K/16K on 8xx). For PPC64 we support either 4K or 64K software * page size. When using 64K pages however, whether we are really supporting * 64K pages in HW or not is irrelevant to those definitions. */ -#if defined(CONFIG_PPC_256K_PAGES) -#define PAGE_SHIFT 18 -#elif defined(CONFIG_PPC_64K_PAGES) -#define PAGE_SHIFT 16 -#elif defined(CONFIG_PPC_16K_PAGES) -#define PAGE_SHIFT 14 -#else -#define PAGE_SHIFT 12 -#endif - +#define PAGE_SHIFT CONFIG_PPC_PAGE_SHIFT #define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT) #ifndef __ASSEMBLY__ @@ -326,7 +317,6 @@ struct page; extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg); extern void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *p); -extern int page_is_ram(unsigned long pfn); extern int devmem_is_allowed(unsigned long pfn); #ifdef CONFIG_PPC_SMLPAR diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h @@ -20,6 +20,8 @@ struct device_node; struct pci_controller_ops { void (*dma_dev_setup)(struct pci_dev *pdev); void (*dma_bus_setup)(struct pci_bus *bus); + bool (*iommu_bypass_supported)(struct pci_dev *pdev, + u64 mask); int (*probe_mode)(struct pci_bus *bus); @@ -44,9 +46,6 @@ struct pci_controller_ops { void (*teardown_msi_irqs)(struct pci_dev *pdev); #endif - int (*dma_set_mask)(struct pci_dev *pdev, u64 dma_mask); - u64 (*dma_get_required_mask)(struct pci_dev *pdev); - void (*shutdown)(struct pci_controller *hose); }; @@ -275,6 +274,8 @@ extern int pcibios_map_io_space(struct pci_bus *bus); extern struct pci_controller *pci_find_hose_for_OF_device( struct device_node* node); +extern struct pci_controller *pci_find_controller_for_domain(int domain_nr); + /* Fill up host controller resources from the OF node */ extern void pci_process_bridge_OF_ranges(struct pci_controller *hose, struct device_node *dev, int primary); diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h @@ -52,10 +52,8 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) #ifdef CONFIG_PCI extern void set_pci_dma_ops(const struct dma_map_ops *dma_ops); -extern const struct dma_map_ops *get_pci_dma_ops(void); #else /* CONFIG_PCI */ #define set_pci_dma_ops(d) -#define get_pci_dma_ops() NULL #endif #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h @@ -66,7 +66,6 @@ extern unsigned long empty_zero_page[]; extern pgd_t swapper_pg_dir[]; -int dma_pfn_limit_to_zone(u64 pfn_limit); extern void paging_init(void); /* diff --git a/arch/powerpc/include/asm/powernv.h b/arch/powerpc/include/asm/powernv.h @@ -23,6 +23,8 @@ extern int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, unsigned long *flags, unsigned long *status, int count); +void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val); + void pnv_tm_init(void); #else static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { } @@ -40,7 +42,6 @@ static inline int pnv_npu2_handle_fault(struct npu_context *context, } static inline void pnv_tm_init(void) { } -static inline void pnv_power9_force_smt4(void) { } #endif #endif /* _ASM_POWERNV_H */ diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h @@ -326,6 +326,7 @@ #define PPC_INST_ADDI 0x38000000 #define PPC_INST_ADDIS 0x3c000000 #define PPC_INST_ADD 0x7c000214 +#define PPC_INST_ADDC 0x7c000014 #define PPC_INST_SUB 0x7c000050 #define PPC_INST_BLR 0x4e800020 #define PPC_INST_BLRL 0x4e800021 @@ -334,6 +335,9 @@ #define PPC_INST_MULLW 0x7c0001d6 #define PPC_INST_MULHWU 0x7c000016 #define PPC_INST_MULLI 0x1c000000 +#define PPC_INST_MADDHD 0x10000030 +#define PPC_INST_MADDHDU 0x10000031 +#define PPC_INST_MADDLD 0x10000033 #define PPC_INST_DIVWU 0x7c000396 #define PPC_INST_DIVD 0x7c0003d2 #define PPC_INST_RLWINM 0x54000000 @@ -377,6 +381,7 @@ /* macros to insert fields into opcodes */ #define ___PPC_RA(a) (((a) & 0x1f) << 16) #define ___PPC_RB(b) (((b) & 0x1f) << 11) +#define ___PPC_RC(c) (((c) & 0x1f) << 6) #define ___PPC_RS(s) (((s) & 0x1f) << 21) #define ___PPC_RT(t) ___PPC_RS(t) #define ___PPC_R(r) (((r) & 0x1) << 16) @@ -396,7 +401,7 @@ #define __PPC_WS(w) (((w) & 0x1f) << 11) #define __PPC_SH(s) __PPC_WS(s) #define __PPC_SH64(s) (__PPC_SH(s) | (((s) & 0x20) >> 4)) -#define __PPC_MB(s) (((s) & 0x1f) << 6) +#define __PPC_MB(s) ___PPC_RC(s) #define __PPC_ME(s) (((s) & 0x1f) << 1) #define __PPC_MB64(s) (__PPC_MB(s) | ((s) & 0x20)) #define __PPC_ME64(s) __PPC_MB64(s) @@ -438,6 +443,15 @@ #define PPC_STQCX(t, a, b) stringify_in_c(.long PPC_INST_STQCX | \ ___PPC_RT(t) | ___PPC_RA(a) | \ ___PPC_RB(b)) +#define PPC_MADDHD(t, a, b, c) stringify_in_c(.long PPC_INST_MADDHD | \ + ___PPC_RT(t) | ___PPC_RA(a) | \ + ___PPC_RB(b) | ___PPC_RC(c)) +#define PPC_MADDHDU(t, a, b, c) stringify_in_c(.long PPC_INST_MADDHDU | \ + ___PPC_RT(t) | ___PPC_RA(a) | \ + ___PPC_RB(b) | ___PPC_RC(c)) +#define PPC_MADDLD(t, a, b, c) stringify_in_c(.long PPC_INST_MADDLD | \ + ___PPC_RT(t) | ___PPC_RA(a) | \ + ___PPC_RB(b) | ___PPC_RC(c)) #define PPC_MSGSND(b) stringify_in_c(.long PPC_INST_MSGSND | \ ___PPC_RB(b)) #define PPC_MSGSYNC stringify_in_c(.long PPC_INST_MSGSYNC) diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h @@ -53,13 +53,13 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev); struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr); void eeh_slot_error_detail(struct eeh_pe *pe, int severity); int eeh_pci_enable(struct eeh_pe *pe, int function); -int eeh_pe_reset_full(struct eeh_pe *pe); +int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed); void eeh_save_bars(struct eeh_dev *edev); int rtas_write_config(struct pci_dn *, int where, int size, u32 val); int rtas_read_config(struct pci_dn *, int where, int size, u32 *val); void eeh_pe_state_mark(struct eeh_pe *pe, int state); void eeh_pe_mark_isolated(struct eeh_pe *pe); -void eeh_pe_state_clear(struct eeh_pe *pe, int state); +void eeh_pe_state_clear(struct eeh_pe *pe, int state, bool include_passed); void eeh_pe_state_mark_with_cfg(struct eeh_pe *pe, int state); void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode); diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h @@ -40,7 +40,7 @@ #ifndef __ASSEMBLY__ #include <linux/types.h> -#include <asm/thread_info.h> +#include <linux/thread_info.h> #include <asm/ptrace.h> #include <asm/hw_breakpoint.h> @@ -77,105 +77,15 @@ extern int _chrp_type; #ifdef __KERNEL__ -struct task_struct; -void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp); -void release_thread(struct task_struct *); - -#ifdef CONFIG_PPC32 - -#if CONFIG_TASK_SIZE > CONFIG_KERNEL_START -#error User TASK_SIZE overlaps with KERNEL_START address -#endif -#define TASK_SIZE (CONFIG_TASK_SIZE) - -/* This decides where the kernel will search for a free chunk of vm - * space during mmap's. - */ -#define TASK_UNMAPPED_BASE (TASK_SIZE / 8 * 3) -#endif - #ifdef CONFIG_PPC64 -/* - * 64-bit user address space can have multiple limits - * For now supported values are: - */ -#define TASK_SIZE_64TB (0x0000400000000000UL) -#define TASK_SIZE_128TB (0x0000800000000000UL) -#define TASK_SIZE_512TB (0x0002000000000000UL) -#define TASK_SIZE_1PB (0x0004000000000000UL) -#define TASK_SIZE_2PB (0x0008000000000000UL) -/* - * With 52 bits in the address we can support - * upto 4PB of range. - */ -#define TASK_SIZE_4PB (0x0010000000000000UL) - -/* - * For now 512TB is only supported with book3s and 64K linux page size. - */ -#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_64K_PAGES) -/* - * Max value currently used: - */ -#define TASK_SIZE_USER64 TASK_SIZE_4PB -#define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_128TB -#define TASK_CONTEXT_SIZE TASK_SIZE_512TB +#include <asm/task_size_64.h> #else -#define TASK_SIZE_USER64 TASK_SIZE_64TB -#define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_64TB -/* - * We don't need to allocate extended context ids for 4K page size, because - * we limit the max effective address on this config to 64TB. - */ -#define TASK_CONTEXT_SIZE TASK_SIZE_64TB +#include <asm/task_size_32.h> #endif -/* - * 32-bit user address space is 4GB - 1 page - * (this 1 page is needed so referencing of 0xFFFFFFFF generates EFAULT - */ -#define TASK_SIZE_USER32 (0x0000000100000000UL - (1*PAGE_SIZE)) - -#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \ - TASK_SIZE_USER32 : TASK_SIZE_USER64) -#define TASK_SIZE TASK_SIZE_OF(current) -/* This decides where the kernel will search for a free chunk of vm - * space during mmap's. - */ -#define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4)) -#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(DEFAULT_MAP_WINDOW_USER64 / 4)) - -#define TASK_UNMAPPED_BASE ((is_32bit_task()) ? \ - TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64 ) -#endif - -/* - * Initial task size value for user applications. For book3s 64 we start - * with 128TB and conditionally enable upto 512TB - */ -#ifdef CONFIG_PPC_BOOK3S_64 -#define DEFAULT_MAP_WINDOW ((is_32bit_task()) ? \ - TASK_SIZE_USER32 : DEFAULT_MAP_WINDOW_USER64) -#else -#define DEFAULT_MAP_WINDOW TASK_SIZE -#endif - -#ifdef __powerpc64__ - -#define STACK_TOP_USER64 DEFAULT_MAP_WINDOW_USER64 -#define STACK_TOP_USER32 TASK_SIZE_USER32 - -#define STACK_TOP (is_32bit_task() ? \ - STACK_TOP_USER32 : STACK_TOP_USER64) - -#define STACK_TOP_MAX TASK_SIZE_USER64 - -#else /* __powerpc64__ */ - -#define STACK_TOP TASK_SIZE -#define STACK_TOP_MAX STACK_TOP - -#endif /* __powerpc64__ */ +struct task_struct; +void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp); +void release_thread(struct task_struct *); typedef struct { unsigned long seg; @@ -250,6 +160,9 @@ struct thread_struct { #ifdef CONFIG_PPC32 void *pgdir; /* root of page-table tree */ unsigned long ksp_limit; /* if ksp <= ksp_limit stack overflow */ +#ifdef CONFIG_PPC_RTAS + unsigned long rtas_sp; /* stack pointer for when in RTAS */ +#endif #endif /* Debug Registers */ struct debug_reg debug; @@ -357,8 +270,7 @@ struct thread_struct { #define ARCH_MIN_TASKALIGN 16 #define INIT_SP (sizeof(init_stack) + (unsigned long) &init_stack) -#define INIT_SP_LIMIT \ - (_ALIGN_UP(sizeof(init_thread_info), 16) + (unsigned long) &init_stack) +#define INIT_SP_LIMIT ((unsigned long)&init_stack) #ifdef CONFIG_SPE #define SPEFSCR_INIT \ diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h @@ -157,7 +157,7 @@ extern int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data); #define current_pt_regs() \ - ((struct pt_regs *)((unsigned long)current_thread_info() + THREAD_SIZE) - 1) + ((struct pt_regs *)((unsigned long)task_stack_page(current) + THREAD_SIZE) - 1) /* * We use the least-significant bit of the trap field to indicate * whether we have saved the full set of registers, or only a diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h @@ -1062,7 +1062,7 @@ * - SPRG9 debug exception scratch * * All 32-bit: - * - SPRG3 current thread_info pointer + * - SPRG3 current thread_struct physical addr pointer * (virtual on BookE, physical on others) * * 32-bit classic: @@ -1167,7 +1167,7 @@ #ifdef CONFIG_PPC_BOOK3S_32 #define SPRN_SPRG_SCRATCH0 SPRN_SPRG0 #define SPRN_SPRG_SCRATCH1 SPRN_SPRG1 -#define SPRN_SPRG_RTAS SPRN_SPRG2 +#define SPRN_SPRG_PGDIR SPRN_SPRG2 #define SPRN_SPRG_603_LRU SPRN_SPRG4 #endif @@ -1425,6 +1425,11 @@ static inline void msr_check_and_clear(unsigned long bits) #define mfsrin(v) ({unsigned int rval; \ asm volatile("mfsrin %0,%1" : "=r" (rval) : "r" (v)); \ rval;}) + +static inline void mtsrin(u32 val, u32 idx) +{ + asm volatile("mtsrin %0, %1" : : "r" (val), "r" (idx)); +} #endif #define proc_trap() asm volatile("trap") diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h @@ -17,6 +17,13 @@ extern char __end_interrupts[]; extern char __prom_init_toc_start[]; extern char __prom_init_toc_end[]; +#ifdef CONFIG_PPC_POWERNV +extern char start_real_trampolines[]; +extern char end_real_trampolines[]; +extern char start_virt_trampolines[]; +extern char end_virt_trampolines[]; +#endif + static inline int in_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr < (unsigned long)__init_end) diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h @@ -83,7 +83,22 @@ int is_cpu_dead(unsigned int cpu); /* 32-bit */ extern int smp_hw_index[]; -#define raw_smp_processor_id() (current_thread_info()->cpu) +/* + * This is particularly ugly: it appears we can't actually get the definition + * of task_struct here, but we need access to the CPU this task is running on. + * Instead of using task_struct we're using _TASK_CPU which is extracted from + * asm-offsets.h by kbuild to get the current processor ID. + * + * This also needs to be safeguarded when building asm-offsets.s because at + * that time _TASK_CPU is not defined yet. It could have been guarded by + * _TASK_CPU itself, but we want the build to fail if _TASK_CPU is missing + * when building something else than asm-offsets.s + */ +#ifdef GENERATING_ASM_OFFSETS +#define raw_smp_processor_id() (0) +#else +#define raw_smp_processor_id() (*(unsigned int *)((void *)current + _TASK_CPU)) +#endif #define hard_smp_processor_id() (smp_hw_index[smp_processor_id()]) static inline int get_hard_smp_processor_id(int cpu) diff --git a/arch/powerpc/include/asm/swiotlb.h b/arch/powerpc/include/asm/swiotlb.h @@ -13,12 +13,7 @@ #include <linux/swiotlb.h> -extern const struct dma_map_ops powerpc_swiotlb_dma_ops; - extern unsigned int ppc_swiotlb_enable; -int __init swiotlb_setup_bus_notifier(void); - -extern void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev); #ifdef CONFIG_SWIOTLB void swiotlb_detect_4g(void); diff --git a/arch/powerpc/include/asm/task_size_32.h b/arch/powerpc/include/asm/task_size_32.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_TASK_SIZE_32_H +#define _ASM_POWERPC_TASK_SIZE_32_H + +#if CONFIG_TASK_SIZE > CONFIG_KERNEL_START +#error User TASK_SIZE overlaps with KERNEL_START address +#endif + +#define TASK_SIZE (CONFIG_TASK_SIZE) + +/* + * This decides where the kernel will search for a free chunk of vm space during + * mmap's. + */ +#define TASK_UNMAPPED_BASE (TASK_SIZE / 8 * 3) + +#define DEFAULT_MAP_WINDOW TASK_SIZE +#define STACK_TOP TASK_SIZE +#define STACK_TOP_MAX STACK_TOP + +#endif /* _ASM_POWERPC_TASK_SIZE_32_H */ diff --git a/arch/powerpc/include/asm/task_size_64.h b/arch/powerpc/include/asm/task_size_64.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_TASK_SIZE_64_H +#define _ASM_POWERPC_TASK_SIZE_64_H + +/* + * 64-bit user address space can have multiple limits + * For now supported values are: + */ +#define TASK_SIZE_64TB (0x0000400000000000UL) +#define TASK_SIZE_128TB (0x0000800000000000UL) +#define TASK_SIZE_512TB (0x0002000000000000UL) +#define TASK_SIZE_1PB (0x0004000000000000UL) +#define TASK_SIZE_2PB (0x0008000000000000UL) + +/* + * With 52 bits in the address we can support up to 4PB of range. + */ +#define TASK_SIZE_4PB (0x0010000000000000UL) + +/* + * For now 512TB is only supported with book3s and 64K linux page size. + */ +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_64K_PAGES) +/* + * Max value currently used: + */ +#define TASK_SIZE_USER64 TASK_SIZE_4PB +#define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_128TB +#define TASK_CONTEXT_SIZE TASK_SIZE_512TB +#else +#define TASK_SIZE_USER64 TASK_SIZE_64TB +#define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_64TB + +/* + * We don't need to allocate extended context ids for 4K page size, because we + * limit the max effective address on this config to 64TB. + */ +#define TASK_CONTEXT_SIZE TASK_SIZE_64TB +#endif + +/* + * 32-bit user address space is 4GB - 1 page + * (this 1 page is needed so referencing of 0xFFFFFFFF generates EFAULT + */ +#define TASK_SIZE_USER32 (0x0000000100000000UL - (1 * PAGE_SIZE)) + +#define TASK_SIZE_OF(tsk) \ + (test_tsk_thread_flag(tsk, TIF_32BIT) ? TASK_SIZE_USER32 : \ + TASK_SIZE_USER64) + +#define TASK_SIZE TASK_SIZE_OF(current) + +#define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4)) +#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(DEFAULT_MAP_WINDOW_USER64 / 4)) + +/* + * This decides where the kernel will search for a free chunk of vm space during + * mmap's. + */ +#define TASK_UNMAPPED_BASE \ + ((is_32bit_task()) ? TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64) + +/* + * Initial task size value for user applications. For book3s 64 we start + * with 128TB and conditionally enable upto 512TB + */ +#ifdef CONFIG_PPC_BOOK3S_64 +#define DEFAULT_MAP_WINDOW \ + ((is_32bit_task()) ? TASK_SIZE_USER32 : DEFAULT_MAP_WINDOW_USER64) +#else +#define DEFAULT_MAP_WINDOW TASK_SIZE +#endif + +#define STACK_TOP_USER64 DEFAULT_MAP_WINDOW_USER64 +#define STACK_TOP_USER32 TASK_SIZE_USER32 +#define STACK_TOP_MAX TASK_SIZE_USER64 +#define STACK_TOP (is_32bit_task() ? STACK_TOP_USER32 : STACK_TOP_USER64) + +#endif /* _ASM_POWERPC_TASK_SIZE_64_H */ diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h @@ -17,12 +17,6 @@ #define THREAD_SIZE (1 << THREAD_SHIFT) -#ifdef CONFIG_PPC64 -#define CURRENT_THREAD_INFO(dest, sp) stringify_in_c(clrrdi dest, sp, THREAD_SHIFT) -#else -#define CURRENT_THREAD_INFO(dest, sp) stringify_in_c(rlwinm dest, sp, 0, 0, 31-THREAD_SHIFT) -#endif - #ifndef __ASSEMBLY__ #include <linux/cache.h> #include <asm/processor.h> @@ -34,8 +28,6 @@ * low level task data. */ struct thread_info { - struct task_struct *task; /* main task structure */ - int cpu; /* cpu we're on */ int preempt_count; /* 0 => preemptable, <0 => BUG */ unsigned long local_flags; /* private flags for thread */ @@ -58,8 +50,6 @@ struct thread_info { */ #define INIT_THREAD_INFO(tsk) \ { \ - .task = &tsk, \ - .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .flags = 0, \ } @@ -67,15 +57,6 @@ struct thread_info { #define THREAD_SIZE_ORDER (THREAD_SHIFT - PAGE_SHIFT) /* how to get the thread information struct from C */ -static inline struct thread_info *current_thread_info(void) -{ - unsigned long val; - - asm (CURRENT_THREAD_INFO(%0,1) : "=r" (val)); - - return (struct thread_info *)val; -} - extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h @@ -132,6 +132,8 @@ static inline void shared_proc_topology_init(void) {} #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_core_id(cpu) (cpu_to_core_id(cpu)) + +int dlpar_cpu_readd(int cpu); #endif #endif diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile @@ -36,7 +36,7 @@ obj-y := cputable.o ptrace.o syscalls.o \ process.o systbl.o idle.o \ signal.o sysfs.o cacheinfo.o time.o \ prom.o traps.o setup-common.o \ - udbg.o misc.o io.o dma.o misc_$(BITS).o \ + udbg.o misc.o io.o misc_$(BITS).o \ of_platform.o prom_parse.o obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ @@ -105,6 +105,7 @@ obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_SWIOTLB) += dma-swiotlb.o +obj-$(CONFIG_ARCH_HAS_DMA_SET_MASK) += dma-mask.o pci64-$(CONFIG_PPC64) += pci_dn.o pci-hotplug.o isa-bridge.o obj-$(CONFIG_PCI) += pci_$(BITS).o $(pci64-y) \ @@ -142,19 +143,29 @@ endif obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o -# Disable GCOV & sanitizers in odd or sensitive code +# Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_prom_init.o := n +KCOV_INSTRUMENT_prom_init.o := n UBSAN_SANITIZE_prom_init.o := n GCOV_PROFILE_machine_kexec_64.o := n +KCOV_INSTRUMENT_machine_kexec_64.o := n UBSAN_SANITIZE_machine_kexec_64.o := n GCOV_PROFILE_machine_kexec_32.o := n +KCOV_INSTRUMENT_machine_kexec_32.o := n UBSAN_SANITIZE_machine_kexec_32.o := n GCOV_PROFILE_kprobes.o := n +KCOV_INSTRUMENT_kprobes.o := n UBSAN_SANITIZE_kprobes.o := n GCOV_PROFILE_kprobes-ftrace.o := n +KCOV_INSTRUMENT_kprobes-ftrace.o := n UBSAN_SANITIZE_kprobes-ftrace.o := n UBSAN_SANITIZE_vdso.o := n +# Necessary for booting with kcov enabled on book3e machines +KCOV_INSTRUMENT_cputable.o := n +KCOV_INSTRUMENT_setup_64.o := n +KCOV_INSTRUMENT_paca.o := n + extra-$(CONFIG_PPC_FPU) += fpu.o extra-$(CONFIG_ALTIVEC) += vector.o extra-$(CONFIG_PPC64) += entry_64.o diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c @@ -13,6 +13,8 @@ * 2 of the License, or (at your option) any later version. */ +#define GENERATING_ASM_OFFSETS /* asm/smp.h */ + #include <linux/compat.h> #include <linux/signal.h> #include <linux/sched.h> @@ -90,10 +92,15 @@ int main(void) DEFINE(SIGSEGV, SIGSEGV); DEFINE(NMI_MASK, NMI_MASK); #else - OFFSET(THREAD_INFO, task_struct, stack); - DEFINE(THREAD_INFO_GAP, _ALIGN_UP(sizeof(struct thread_info), 16)); OFFSET(KSP_LIMIT, thread_struct, ksp_limit); +#ifdef CONFIG_PPC_RTAS + OFFSET(RTAS_SP, thread_struct, rtas_sp); +#endif #endif /* CONFIG_PPC64 */ + OFFSET(TASK_STACK, task_struct, stack); +#ifdef CONFIG_SMP + OFFSET(TASK_CPU, task_struct, cpu); +#endif #ifdef CONFIG_LIVEPATCH OFFSET(TI_livepatch_sp, thread_info, livepatch_sp); @@ -161,8 +168,6 @@ int main(void) OFFSET(TI_FLAGS, thread_info, flags); OFFSET(TI_LOCAL_FLAGS, thread_info, local_flags); OFFSET(TI_PREEMPT, thread_info, preempt_count); - OFFSET(TI_TASK, thread_info, task); - OFFSET(TI_CPU, thread_info, cpu); #ifdef CONFIG_PPC64 OFFSET(DCACHEL1BLOCKSIZE, ppc64_caches, l1d.block_size); @@ -177,6 +182,8 @@ int main(void) OFFSET(PACAPROCSTART, paca_struct, cpu_start); OFFSET(PACAKSAVE, paca_struct, kstack); OFFSET(PACACURRENT, paca_struct, __current); + DEFINE(PACA_THREAD_INFO, offsetof(struct paca_struct, __current) + + offsetof(struct task_struct, thread_info)); OFFSET(PACASAVEDMSR, paca_struct, saved_msr); OFFSET(PACAR1, paca_struct, saved_r1); OFFSET(PACATOC, paca_struct, kernel_toc); diff --git a/arch/powerpc/kernel/cpu_setup_6xx.S b/arch/powerpc/kernel/cpu_setup_6xx.S @@ -24,6 +24,10 @@ BEGIN_MMU_FTR_SECTION li r10,0 mtspr SPRN_SPRG_603_LRU,r10 /* init SW LRU tracking */ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) + lis r10, (swapper_pg_dir - PAGE_OFFSET)@h + ori r10, r10, (swapper_pg_dir - PAGE_OFFSET)@l + mtspr SPRN_SPRG_PGDIR, r10 + BEGIN_FTR_SECTION bl __init_fpu_registers END_FTR_SECTION_IFCLR(CPU_FTR_FPU_UNAVAILABLE) diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c @@ -6,12 +6,31 @@ * busses using the iommu infrastructure */ +#include <linux/dma-direct.h> +#include <linux/pci.h> #include <asm/iommu.h> /* * Generic iommu implementation */ +/* + * The coherent mask may be smaller than the real mask, check if we can + * really use a direct window. + */ +static inline bool dma_iommu_alloc_bypass(struct device *dev) +{ + return dev->archdata.iommu_bypass && !iommu_fixed_is_weak && + dma_direct_supported(dev, dev->coherent_dma_mask); +} + +static inline bool dma_iommu_map_bypass(struct device *dev, + unsigned long attrs) +{ + return dev->archdata.iommu_bypass && + (!iommu_fixed_is_weak || (attrs & DMA_ATTR_WEAK_ORDERING)); +} + /* Allocates a contiguous real buffer and creates mappings over it. * Returns the virtual address of the buffer and sets dma_handle * to the dma address (mapping) of the first page. @@ -20,6 +39,8 @@ static void *dma_iommu_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { + if (dma_iommu_alloc_bypass(dev)) + return dma_direct_alloc(dev, size, dma_handle, flag, attrs); return iommu_alloc_coherent(dev, get_iommu_table_base(dev), size, dma_handle, dev->coherent_dma_mask, flag, dev_to_node(dev)); @@ -29,7 +50,11 @@ static void dma_iommu_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs) { - iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, dma_handle); + if (dma_iommu_alloc_bypass(dev)) + dma_direct_free(dev, size, vaddr, dma_handle, attrs); + else + iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, + dma_handle); } /* Creates TCEs for a user provided buffer. The user buffer must be @@ -42,6 +67,9 @@ static dma_addr_t dma_iommu_map_page(struct device *dev, struct page *page, enum dma_data_direction direction, unsigned long attrs) { + if (dma_iommu_map_bypass(dev, attrs)) + return dma_direct_map_page(dev, page, offset, size, direction, + attrs); return iommu_map_page(dev, get_iommu_table_base(dev), page, offset, size, device_to_mask(dev), direction, attrs); } @@ -51,8 +79,9 @@ static void dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction direction, unsigned long attrs) { - iommu_unmap_page(get_iommu_table_base(dev), dma_handle, size, direction, - attrs); + if (!dma_iommu_map_bypass(dev, attrs)) + iommu_unmap_page(get_iommu_table_base(dev), dma_handle, size, + direction, attrs); } @@ -60,6 +89,8 @@ static int dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, enum dma_data_direction direction, unsigned long attrs) { + if (dma_iommu_map_bypass(dev, attrs)) + return dma_direct_map_sg(dev, sglist, nelems, direction, attrs); return ppc_iommu_map_sg(dev, get_iommu_table_base(dev), sglist, nelems, device_to_mask(dev), direction, attrs); } @@ -68,10 +99,20 @@ static void dma_iommu_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, enum dma_data_direction direction, unsigned long attrs) { - ppc_iommu_unmap_sg(get_iommu_table_base(dev), sglist, nelems, + if (!dma_iommu_map_bypass(dev, attrs)) + ppc_iommu_unmap_sg(get_iommu_table_base(dev), sglist, nelems, direction, attrs); } +static bool dma_iommu_bypass_supported(struct device *dev, u64 mask) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct pci_controller *phb = pci_bus_to_host(pdev->bus); + + return phb->controller_ops.iommu_bypass_supported && + phb->controller_ops.iommu_bypass_supported(pdev, mask); +} + /* We support DMA to/from any memory page via the iommu */ int dma_iommu_dma_supported(struct device *dev, u64 mask) { @@ -83,32 +124,48 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask) return 0; } + if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) { + dev->archdata.iommu_bypass = true; + dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n"); + return 1; + } + if (tbl->it_offset > (mask >> tbl->it_page_shift)) { dev_info(dev, "Warning: IOMMU offset too big for device mask\n"); dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n", mask, tbl->it_offset << tbl->it_page_shift); return 0; - } else - return 1; + } + + dev_dbg(dev, "iommu: not 64-bit, using default ops\n"); + dev->archdata.iommu_bypass = false; + return 1; } -static u64 dma_iommu_get_required_mask(struct device *dev) +u64 dma_iommu_get_required_mask(struct device *dev) { struct iommu_table *tbl = get_iommu_table_base(dev); u64 mask; + if (!tbl) return 0; + if (dev_is_pci(dev)) { + u64 bypass_mask = dma_direct_get_required_mask(dev); + + if (dma_iommu_bypass_supported(dev, bypass_mask)) + return bypass_mask; + } + mask = 1ULL < (fls_long(tbl->it_offset + tbl->it_size) - 1); mask += mask - 1; return mask; } -struct dma_map_ops dma_iommu_ops = { +const struct dma_map_ops dma_iommu_ops = { .alloc = dma_iommu_alloc_coherent, .free = dma_iommu_free_coherent, - .mmap = dma_nommu_mmap_coherent, .map_sg = dma_iommu_map_sg, .unmap_sg = dma_iommu_unmap_sg, .dma_supported = dma_iommu_dma_supported, diff --git a/arch/powerpc/kernel/dma-mask.c b/arch/powerpc/kernel/dma-mask.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/dma-mapping.h> +#include <linux/export.h> +#include <asm/machdep.h> + +void arch_dma_set_mask(struct device *dev, u64 dma_mask) +{ + if (ppc_md.dma_set_mask) + ppc_md.dma_set_mask(dev, dma_mask); +} +EXPORT_SYMBOL(arch_dma_set_mask); diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c @@ -10,101 +10,12 @@ * option) any later version. * */ - -#include <linux/dma-direct.h> #include <linux/memblock.h> -#include <linux/pfn.h> -#include <linux/of_platform.h> -#include <linux/platform_device.h> -#include <linux/pci.h> - #include <asm/machdep.h> #include <asm/swiotlb.h> -#include <asm/dma.h> unsigned int ppc_swiotlb_enable; -static u64 swiotlb_powerpc_get_required(struct device *dev) -{ - u64 end, mask, max_direct_dma_addr = dev->archdata.max_direct_dma_addr; - - end = memblock_end_of_DRAM(); - if (max_direct_dma_addr && end > max_direct_dma_addr) - end = max_direct_dma_addr; - end += get_dma_offset(dev); - - mask = 1ULL << (fls64(end) - 1); - mask += mask - 1; - - return mask; -} - -/* - * At the moment, all platforms that use this code only require - * swiotlb to be used if we're operating on HIGHMEM. Since - * we don't ever call anything other than map_sg, unmap_sg, - * map_page, and unmap_page on highmem, use normal dma_ops - * for everything else. - */ -const struct dma_map_ops powerpc_swiotlb_dma_ops = { - .alloc = __dma_nommu_alloc_coherent, - .free = __dma_nommu_free_coherent, - .mmap = dma_nommu_mmap_coherent, - .map_sg = dma_direct_map_sg, - .unmap_sg = dma_direct_unmap_sg, - .dma_supported = swiotlb_dma_supported, - .map_page = dma_direct_map_page, - .unmap_page = dma_direct_unmap_page, - .sync_single_for_cpu = dma_direct_sync_single_for_cpu, - .sync_single_for_device = dma_direct_sync_single_for_device, - .sync_sg_for_cpu = dma_direct_sync_sg_for_cpu, - .sync_sg_for_device = dma_direct_sync_sg_for_device, - .get_required_mask = swiotlb_powerpc_get_required, -}; - -void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) -{ - struct pci_controller *hose; - struct dev_archdata *sd; - - hose = pci_bus_to_host(pdev->bus); - sd = &pdev->dev.archdata; - sd->max_direct_dma_addr = - hose->dma_window_base_cur + hose->dma_window_size; -} - -static int ppc_swiotlb_bus_notify(struct notifier_block *nb, - unsigned long action, void *data) -{ - struct device *dev = data; - struct dev_archdata *sd; - - /* We are only intereted in device addition */ - if (action != BUS_NOTIFY_ADD_DEVICE) - return 0; - - sd = &dev->archdata; - sd->max_direct_dma_addr = 0; - - /* May need to bounce if the device can't address all of DRAM */ - if ((dma_get_mask(dev) + 1) < memblock_end_of_DRAM()) - set_dma_ops(dev, &powerpc_swiotlb_dma_ops); - - return NOTIFY_DONE; -} - -static struct notifier_block ppc_swiotlb_plat_bus_notifier = { - .notifier_call = ppc_swiotlb_bus_notify, - .priority = 0, -}; - -int __init swiotlb_setup_bus_notifier(void) -{ - bus_register_notifier(&platform_bus_type, - &ppc_swiotlb_plat_bus_notifier); - return 0; -} - void __init swiotlb_detect_4g(void) { if ((memblock_end_of_DRAM() - 1) > 0xffffffff) diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c @@ -1,362 +0,0 @@ -/* - * Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corporation - * - * Provide default implementations of the DMA mapping callbacks for - * directly mapped busses. - */ - -#include <linux/device.h> -#include <linux/dma-mapping.h> -#include <linux/dma-debug.h> -#include <linux/gfp.h> -#include <linux/memblock.h> -#include <linux/export.h> -#include <linux/pci.h> -#include <asm/vio.h> -#include <asm/bug.h> -#include <asm/machdep.h> -#include <asm/swiotlb.h> -#include <asm/iommu.h> - -/* - * Generic direct DMA implementation - * - * This implementation supports a per-device offset that can be applied if - * the address at which memory is visible to devices is not 0. Platform code - * can set archdata.dma_data to an unsigned long holding the offset. By - * default the offset is PCI_DRAM_OFFSET. - */ - -static u64 __maybe_unused get_pfn_limit(struct device *dev) -{ - u64 pfn = (dev->coherent_dma_mask >> PAGE_SHIFT) + 1; - struct dev_archdata __maybe_unused *sd = &dev->archdata; - -#ifdef CONFIG_SWIOTLB - if (sd->max_direct_dma_addr && dev->dma_ops == &powerpc_swiotlb_dma_ops) - pfn = min_t(u64, pfn, sd->max_direct_dma_addr >> PAGE_SHIFT); -#endif - - return pfn; -} - -static int dma_nommu_dma_supported(struct device *dev, u64 mask) -{ -#ifdef CONFIG_PPC64 - u64 limit = get_dma_offset(dev) + (memblock_end_of_DRAM() - 1); - - /* Limit fits in the mask, we are good */ - if (mask >= limit) - return 1; - -#ifdef CONFIG_FSL_SOC - /* - * Freescale gets another chance via ZONE_DMA, however - * that will have to be refined if/when they support iommus - */ - return 1; -#endif - /* Sorry ... */ - return 0; -#else - return 1; -#endif -} - -#ifndef CONFIG_NOT_COHERENT_CACHE -void *__dma_nommu_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - unsigned long attrs) -{ - void *ret; - struct page *page; - int node = dev_to_node(dev); -#ifdef CONFIG_FSL_SOC - u64 pfn = get_pfn_limit(dev); - int zone; - - /* - * This code should be OK on other platforms, but we have drivers that - * don't set coherent_dma_mask. As a workaround we just ifdef it. This - * whole routine needs some serious cleanup. - */ - - zone = dma_pfn_limit_to_zone(pfn); - if (zone < 0) { - dev_err(dev, "%s: No suitable zone for pfn %#llx\n", - __func__, pfn); - return NULL; - } - - switch (zone) { -#ifdef CONFIG_ZONE_DMA - case ZONE_DMA: - flag |= GFP_DMA; - break; -#endif - }; -#endif /* CONFIG_FSL_SOC */ - - page = alloc_pages_node(node, flag, get_order(size)); - if (page == NULL) - return NULL; - ret = page_address(page); - memset(ret, 0, size); - *dma_handle = __pa(ret) + get_dma_offset(dev); - - return ret; -} - -void __dma_nommu_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - unsigned long attrs) -{ - free_pages((unsigned long)vaddr, get_order(size)); -} -#endif /* !CONFIG_NOT_COHERENT_CACHE */ - -static void *dma_nommu_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - unsigned long attrs) -{ - struct iommu_table *iommu; - - /* The coherent mask may be smaller than the real mask, check if - * we can really use the direct ops - */ - if (dma_nommu_dma_supported(dev, dev->coherent_dma_mask)) - return __dma_nommu_alloc_coherent(dev, size, dma_handle, - flag, attrs); - - /* Ok we can't ... do we have an iommu ? If not, fail */ - iommu = get_iommu_table_base(dev); - if (!iommu) - return NULL; - - /* Try to use the iommu */ - return iommu_alloc_coherent(dev, iommu, size, dma_handle, - dev->coherent_dma_mask, flag, - dev_to_node(dev)); -} - -static void dma_nommu_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - unsigned long attrs) -{ - struct iommu_table *iommu; - - /* See comments in dma_nommu_alloc_coherent() */ - if (dma_nommu_dma_supported(dev, dev->coherent_dma_mask)) - return __dma_nommu_free_coherent(dev, size, vaddr, dma_handle, - attrs); - /* Maybe we used an iommu ... */ - iommu = get_iommu_table_base(dev); - - /* If we hit that we should have never allocated in the first - * place so how come we are freeing ? - */ - if (WARN_ON(!iommu)) - return; - iommu_free_coherent(iommu, size, vaddr, dma_handle); -} - -int dma_nommu_mmap_coherent(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t handle, size_t size, - unsigned long attrs) -{ - unsigned long pfn; - -#ifdef CONFIG_NOT_COHERENT_CACHE - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - pfn = __dma_get_coherent_pfn((unsigned long)cpu_addr); -#else - pfn = page_to_pfn(virt_to_page(cpu_addr)); -#endif - return remap_pfn_range(vma, vma->vm_start, - pfn + vma->vm_pgoff, - vma->vm_end - vma->vm_start, - vma->vm_page_prot); -} - -static int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction direction, - unsigned long attrs) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nents, i) { - sg->dma_address = sg_phys(sg) + get_dma_offset(dev); - sg->dma_length = sg->length; - - if (attrs & DMA_ATTR_SKIP_CPU_SYNC) - continue; - - __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); - } - - return nents; -} - -static void dma_nommu_unmap_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction direction, - unsigned long attrs) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nents, i) - __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); -} - -static u64 dma_nommu_get_required_mask(struct device *dev) -{ - u64 end, mask; - - end = memblock_end_of_DRAM() + get_dma_offset(dev); - - mask = 1ULL << (fls64(end) - 1); - mask += mask - 1; - - return mask; -} - -static inline dma_addr_t dma_nommu_map_page(struct device *dev, - struct page *page, - unsigned long offset, - size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - __dma_sync_page(page, offset, size, dir); - - return page_to_phys(page) + offset + get_dma_offset(dev); -} - -static inline void dma_nommu_unmap_page(struct device *dev, - dma_addr_t dma_address, - size_t size, - enum dma_data_direction direction, - unsigned long attrs) -{ - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - __dma_sync(bus_to_virt(dma_address), size, direction); -} - -#ifdef CONFIG_NOT_COHERENT_CACHE -static inline void dma_nommu_sync_sg(struct device *dev, - struct scatterlist *sgl, int nents, - enum dma_data_direction direction) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nents, i) - __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); -} - -static inline void dma_nommu_sync_single(struct device *dev, - dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - __dma_sync(bus_to_virt(dma_handle), size, direction); -} -#endif - -const struct dma_map_ops dma_nommu_ops = { - .alloc = dma_nommu_alloc_coherent, - .free = dma_nommu_free_coherent, - .mmap = dma_nommu_mmap_coherent, - .map_sg = dma_nommu_map_sg, - .unmap_sg = dma_nommu_unmap_sg, - .dma_supported = dma_nommu_dma_supported, - .map_page = dma_nommu_map_page, - .unmap_page = dma_nommu_unmap_page, - .get_required_mask = dma_nommu_get_required_mask, -#ifdef CONFIG_NOT_COHERENT_CACHE - .sync_single_for_cpu = dma_nommu_sync_single, - .sync_single_for_device = dma_nommu_sync_single, - .sync_sg_for_cpu = dma_nommu_sync_sg, - .sync_sg_for_device = dma_nommu_sync_sg, -#endif -}; -EXPORT_SYMBOL(dma_nommu_ops); - -int dma_set_coherent_mask(struct device *dev, u64 mask) -{ - if (!dma_supported(dev, mask)) { - /* - * We need to special case the direct DMA ops which can - * support a fallback for coherent allocations. There - * is no dma_op->set_coherent_mask() so we have to do - * things the hard way: - */ - if (get_dma_ops(dev) != &dma_nommu_ops || - get_iommu_table_base(dev) == NULL || - !dma_iommu_dma_supported(dev, mask)) - return -EIO; - } - dev->coherent_dma_mask = mask; - return 0; -} -EXPORT_SYMBOL(dma_set_coherent_mask); - -int dma_set_mask(struct device *dev, u64 dma_mask) -{ - if (ppc_md.dma_set_mask) - return ppc_md.dma_set_mask(dev, dma_mask); - - if (dev_is_pci(dev)) { - struct pci_dev *pdev = to_pci_dev(dev); - struct pci_controller *phb = pci_bus_to_host(pdev->bus); - if (phb->controller_ops.dma_set_mask) - return phb->controller_ops.dma_set_mask(pdev, dma_mask); - } - - if (!dev->dma_mask || !dma_supported(dev, dma_mask)) - return -EIO; - *dev->dma_mask = dma_mask; - return 0; -} -EXPORT_SYMBOL(dma_set_mask); - -u64 __dma_get_required_mask(struct device *dev) -{ - const struct dma_map_ops *dma_ops = get_dma_ops(dev); - - if (unlikely(dma_ops == NULL)) - return 0; - - if (dma_ops->get_required_mask) - return dma_ops->get_required_mask(dev); - - return DMA_BIT_MASK(8 * sizeof(dma_addr_t)); -} - -u64 dma_get_required_mask(struct device *dev) -{ - if (ppc_md.dma_get_required_mask) - return ppc_md.dma_get_required_mask(dev); - - if (dev_is_pci(dev)) { - struct pci_dev *pdev = to_pci_dev(dev); - struct pci_controller *phb = pci_bus_to_host(pdev->bus); - if (phb->controller_ops.dma_get_required_mask) - return phb->controller_ops.dma_get_required_mask(pdev); - } - - return __dma_get_required_mask(dev); -} -EXPORT_SYMBOL_GPL(dma_get_required_mask); - -static int __init dma_init(void) -{ -#ifdef CONFIG_IBMVIO - dma_debug_add_bus(&vio_bus_type); -#endif - - return 0; -} -fs_initcall(dma_init); - diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -666,8 +666,10 @@ static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f) m = &dt_cpu_feature_match_table[i]; if (!strcmp(f->name, m->name)) { known = true; - if (m->enable(f)) + if (m->enable(f)) { + cur_cpu_spec->cpu_features |= m->cpu_ftr_bit_mask; break; + } pr_info("not enabling: %s (disabled or unsupported by kernel)\n", f->name); @@ -675,17 +677,12 @@ static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f) } } - if (!known && enable_unknown) { - if (!feat_try_enable_unknown(f)) { - pr_info("not enabling: %s (unknown and unsupported by kernel)\n", - f->name); - return false; - } + if (!known && (!enable_unknown || !feat_try_enable_unknown(f))) { + pr_info("not enabling: %s (unknown and unsupported by kernel)\n", + f->name); + return false; } - if (m->cpu_ftr_bit_mask) - cur_cpu_spec->cpu_features |= m->cpu_ftr_bit_mask; - if (known) pr_debug("enabling: %s\n", f->name); else diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c @@ -109,7 +109,14 @@ EXPORT_SYMBOL(eeh_subsystem_flags); * frozen count in last hour exceeds this limit, the PE will * be forced to be offline permanently. */ -int eeh_max_freezes = 5; +u32 eeh_max_freezes = 5; + +/* + * Controls whether a recovery event should be scheduled when an + * isolated device is discovered. This is only really useful for + * debugging problems with the EEH core. + */ +bool eeh_debugfs_no_recover; /* Platform dependent EEH operations */ struct eeh_ops *eeh_ops = NULL; @@ -823,15 +830,15 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat switch (state) { case pcie_deassert_reset: eeh_ops->reset(pe, EEH_RESET_DEACTIVATE); - eeh_unfreeze_pe(pe, false); + eeh_unfreeze_pe(pe); if (!(pe->type & EEH_PE_VF)) - eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, true); eeh_pe_dev_traverse(pe, eeh_restore_dev_state, dev); - eeh_pe_state_clear(pe, EEH_PE_ISOLATED); + eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true); break; case pcie_hot_reset: eeh_pe_mark_isolated(pe); - eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, true); eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE); eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev); if (!(pe->type & EEH_PE_VF)) @@ -840,7 +847,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat break; case pcie_warm_reset: eeh_pe_mark_isolated(pe); - eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, true); eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE); eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev); if (!(pe->type & EEH_PE_VF)) @@ -848,7 +855,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL); break; default: - eeh_pe_state_clear(pe, EEH_PE_ISOLATED | EEH_PE_CFG_BLOCKED); + eeh_pe_state_clear(pe, EEH_PE_ISOLATED | EEH_PE_CFG_BLOCKED, true); return -EINVAL; }; @@ -877,6 +884,24 @@ static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag) return NULL; } +static void eeh_pe_refreeze_passed(struct eeh_pe *root) +{ + struct eeh_pe *pe; + int state; + + eeh_for_each_pe(root, pe) { + if (eeh_pe_passed(pe)) { + state = eeh_ops->get_state(pe, NULL); + if (state & + (EEH_STATE_MMIO_ACTIVE | EEH_STATE_MMIO_ENABLED)) { + pr_info("EEH: Passed-through PE PHB#%x-PE#%x was thawed by reset, re-freezing for safety.\n", + pe->phb->global_number, pe->addr); + eeh_pe_set_option(pe, EEH_OPT_FREEZE_PE); + } + } + } +} + /** * eeh_pe_reset_full - Complete a full reset process on the indicated PE * @pe: EEH PE @@ -889,12 +914,12 @@ static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag) * * This function will attempt to reset a PE three times before failing. */ -int eeh_pe_reset_full(struct eeh_pe *pe) +int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed) { int reset_state = (EEH_PE_RESET | EEH_PE_CFG_BLOCKED); int type = EEH_RESET_HOT; unsigned int freset = 0; - int i, state, ret; + int i, state = 0, ret; /* * Determine the type of reset to perform - hot or fundamental. @@ -911,32 +936,42 @@ int eeh_pe_reset_full(struct eeh_pe *pe) /* Make three attempts at resetting the bus */ for (i = 0; i < 3; i++) { - ret = eeh_pe_reset(pe, type); - if (ret) - break; - - ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE); - if (ret) - break; + ret = eeh_pe_reset(pe, type, include_passed); + if (!ret) + ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, + include_passed); + if (ret) { + ret = -EIO; + pr_warn("EEH: Failure %d resetting PHB#%x-PE#%x (attempt %d)\n\n", + state, pe->phb->global_number, pe->addr, i + 1); + continue; + } + if (i) + pr_warn("EEH: PHB#%x-PE#%x: Successful reset (attempt %d)\n", + pe->phb->global_number, pe->addr, i + 1); /* Wait until the PE is in a functioning state */ state = eeh_wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); if (state < 0) { - pr_warn("%s: Unrecoverable slot failure on PHB#%x-PE#%x", - __func__, pe->phb->global_number, pe->addr); + pr_warn("EEH: Unrecoverable slot failure on PHB#%x-PE#%x", + pe->phb->global_number, pe->addr); ret = -ENOTRECOVERABLE; break; } if (eeh_state_active(state)) break; - - /* Set error in case this is our last attempt */ - ret = -EIO; - pr_warn("%s: Failure %d resetting PHB#%x-PE#%x\n (%d)\n", - __func__, state, pe->phb->global_number, pe->addr, (i + 1)); + else + pr_warn("EEH: PHB#%x-PE#%x: Slot inactive after reset: 0x%x (attempt %d)\n", + pe->phb->global_number, pe->addr, state, i + 1); } - eeh_pe_state_clear(pe, reset_state); + /* Resetting the PE may have unfrozen child PEs. If those PEs have been + * (potentially) passed through to a guest, re-freeze them: + */ + if (!include_passed) + eeh_pe_refreeze_passed(pe); + + eeh_pe_state_clear(pe, reset_state, true); return ret; } @@ -1309,7 +1344,7 @@ void eeh_remove_device(struct pci_dev *dev) edev->mode &= ~EEH_DEV_SYSFS; } -int eeh_unfreeze_pe(struct eeh_pe *pe, bool sw_state) +int eeh_unfreeze_pe(struct eeh_pe *pe) { int ret; @@ -1327,10 +1362,6 @@ int eeh_unfreeze_pe(struct eeh_pe *pe, bool sw_state) return ret; } - /* Clear software isolated state */ - if (sw_state && (pe->state & EEH_PE_ISOLATED)) - eeh_pe_state_clear(pe, EEH_PE_ISOLATED); - return ret; } @@ -1382,7 +1413,10 @@ static int eeh_pe_change_owner(struct eeh_pe *pe) } } - return eeh_unfreeze_pe(pe, true); + ret = eeh_unfreeze_pe(pe); + if (!ret) + eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true); + return ret; } /** @@ -1612,13 +1646,12 @@ int eeh_pe_get_state(struct eeh_pe *pe) } EXPORT_SYMBOL_GPL(eeh_pe_get_state); -static int eeh_pe_reenable_devices(struct eeh_pe *pe) +static int eeh_pe_reenable_devices(struct eeh_pe *pe, bool include_passed) { struct eeh_dev *edev, *tmp; struct pci_dev *pdev; int ret = 0; - /* Restore config space */ eeh_pe_restore_bars(pe); /* @@ -1639,7 +1672,14 @@ static int eeh_pe_reenable_devices(struct eeh_pe *pe) } /* The PE is still in frozen state */ - return eeh_unfreeze_pe(pe, true); + if (include_passed || !eeh_pe_passed(pe)) { + ret = eeh_unfreeze_pe(pe); + } else + pr_info("EEH: Note: Leaving passthrough PHB#%x-PE#%x frozen.\n", + pe->phb->global_number, pe->addr); + if (!ret) + eeh_pe_state_clear(pe, EEH_PE_ISOLATED, include_passed); + return ret; } @@ -1652,7 +1692,7 @@ static int eeh_pe_reenable_devices(struct eeh_pe *pe) * indicated type, either fundamental reset or hot reset. * PE reset is the most important part for error recovery. */ -int eeh_pe_reset(struct eeh_pe *pe, int option) +int eeh_pe_reset(struct eeh_pe *pe, int option, bool include_passed) { int ret = 0; @@ -1666,11 +1706,11 @@ int eeh_pe_reset(struct eeh_pe *pe, int option) switch (option) { case EEH_RESET_DEACTIVATE: ret = eeh_ops->reset(pe, option); - eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, include_passed); if (ret) break; - ret = eeh_pe_reenable_devices(pe); + ret = eeh_pe_reenable_devices(pe, include_passed); break; case EEH_RESET_HOT: case EEH_RESET_FUNDAMENTAL: @@ -1796,22 +1836,64 @@ static int eeh_enable_dbgfs_get(void *data, u64 *val) return 0; } -static int eeh_freeze_dbgfs_set(void *data, u64 val) -{ - eeh_max_freezes = val; - return 0; -} +DEFINE_DEBUGFS_ATTRIBUTE(eeh_enable_dbgfs_ops, eeh_enable_dbgfs_get, + eeh_enable_dbgfs_set, "0x%llx\n"); -static int eeh_freeze_dbgfs_get(void *data, u64 *val) +static ssize_t eeh_force_recover_write(struct file *filp, + const char __user *user_buf, + size_t count, loff_t *ppos) { - *val = eeh_max_freezes; - return 0; + struct pci_controller *hose; + uint32_t phbid, pe_no; + struct eeh_pe *pe; + char buf[20]; + int ret; + + ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count); + if (!ret) + return -EFAULT; + + /* + * When PE is NULL the event is a "special" event. Rather than + * recovering a specific PE it forces the EEH core to scan for failed + * PHBs and recovers each. This needs to be done before any device + * recoveries can occur. + */ + if (!strncmp(buf, "hwcheck", 7)) { + __eeh_send_failure_event(NULL); + return count; + } + + ret = sscanf(buf, "%x:%x", &phbid, &pe_no); + if (ret != 2) + return -EINVAL; + + hose = pci_find_controller_for_domain(phbid); + if (!hose) + return -ENODEV; + + /* Retrieve PE */ + pe = eeh_pe_get(hose, pe_no, 0); + if (!pe) + return -ENODEV; + + /* + * We don't do any state checking here since the detection + * process is async to the recovery process. The recovery + * thread *should* not break even if we schedule a recovery + * from an odd state (e.g. PE removed, or recovery of a + * non-isolated PE) + */ + __eeh_send_failure_event(pe); + + return ret < 0 ? ret : count; } -DEFINE_DEBUGFS_ATTRIBUTE(eeh_enable_dbgfs_ops, eeh_enable_dbgfs_get, - eeh_enable_dbgfs_set, "0x%llx\n"); -DEFINE_DEBUGFS_ATTRIBUTE(eeh_freeze_dbgfs_ops, eeh_freeze_dbgfs_get, - eeh_freeze_dbgfs_set, "0x%llx\n"); +static const struct file_operations eeh_force_recover_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = eeh_force_recover_write, +}; #endif static int __init eeh_init_proc(void) @@ -1822,9 +1904,15 @@ static int __init eeh_init_proc(void) debugfs_create_file_unsafe("eeh_enable", 0600, powerpc_debugfs_root, NULL, &eeh_enable_dbgfs_ops); - debugfs_create_file_unsafe("eeh_max_freezes", 0600, - powerpc_debugfs_root, NULL, - &eeh_freeze_dbgfs_ops); + debugfs_create_u32("eeh_max_freezes", 0600, + powerpc_debugfs_root, &eeh_max_freezes); + debugfs_create_bool("eeh_disable_recovery", 0600, + powerpc_debugfs_root, + &eeh_debugfs_no_recover); + debugfs_create_file_unsafe("eeh_force_recover", 0600, + powerpc_debugfs_root, NULL, + &eeh_force_recover_fops); + eeh_cache_debugfs_init(); #endif } diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c @@ -26,6 +26,7 @@ #include <linux/spinlock.h> #include <linux/atomic.h> #include <asm/pci-bridge.h> +#include <asm/debugfs.h> #include <asm/ppc-pci.h> @@ -113,7 +114,7 @@ static void eeh_addr_cache_print(struct pci_io_addr_cache *cache) while (n) { struct pci_io_addr_range *piar; piar = rb_entry(n, struct pci_io_addr_range, rb_node); - pr_debug("PCI: %s addr range %d [%pap-%pap]: %s\n", + pr_info("PCI: %s addr range %d [%pap-%pap]: %s\n", (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt, &piar->addr_lo, &piar->addr_hi, pci_name(piar->pcidev)); cnt++; @@ -157,10 +158,8 @@ eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo, piar->pcidev = dev; piar->flags = flags; -#ifdef DEBUG pr_debug("PIAR: insert range=[%pap:%pap] dev=%s\n", &alo, &ahi, pci_name(dev)); -#endif rb_link_node(&piar->rb_node, parent, p); rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root); @@ -240,6 +239,8 @@ restart: piar = rb_entry(n, struct pci_io_addr_range, rb_node); if (piar->pcidev == dev) { + pr_debug("PIAR: remove range=[%pap:%pap] dev=%s\n", + &piar->addr_lo, &piar->addr_hi, pci_name(dev)); rb_erase(n, &pci_io_addr_cache_root.rb_root); kfree(piar); goto restart; @@ -298,9 +299,30 @@ void eeh_addr_cache_build(void) eeh_addr_cache_insert_dev(dev); eeh_sysfs_add_device(dev); } +} -#ifdef DEBUG - /* Verify tree built up above, echo back the list of addrs. */ - eeh_addr_cache_print(&pci_io_addr_cache_root); -#endif +static int eeh_addr_cache_show(struct seq_file *s, void *v) +{ + struct pci_io_addr_range *piar; + struct rb_node *n; + + spin_lock(&pci_io_addr_cache_root.piar_lock); + for (n = rb_first(&pci_io_addr_cache_root.rb_root); n; n = rb_next(n)) { + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + + seq_printf(s, "%s addr range [%pap-%pap]: %s\n", + (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", + &piar->addr_lo, &piar->addr_hi, pci_name(piar->pcidev)); + } + spin_unlock(&pci_io_addr_cache_root.piar_lock); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(eeh_addr_cache); + +void eeh_cache_debugfs_init(void) +{ + debugfs_create_file_unsafe("eeh_address_cache", 0400, + powerpc_debugfs_root, NULL, + &eeh_addr_cache_fops); } diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c @@ -510,22 +510,11 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) * support EEH. So we just care about PCI devices for * simplicity here. */ - if (!dev || (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) - return NULL; - - /* - * We rely on count-based pcibios_release_device() to - * detach permanently offlined PEs. Unfortunately, that's - * not reliable enough. We might have the permanently - * offlined PEs attached, but we needn't take care of - * them and their child devices. - */ - if (eeh_dev_removed(edev)) + if (!eeh_edev_actionable(edev) || + (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) return NULL; if (rmv_data) { - if (eeh_pe_passed(edev->pe)) - return NULL; driver = eeh_pcid_get(dev); if (driver) { if (driver->err_handler && @@ -539,8 +528,8 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) } /* Remove it from PCI subsystem */ - pr_debug("EEH: Removing %s without EEH sensitive driver\n", - pci_name(dev)); + pr_info("EEH: Removing %s without EEH sensitive driver\n", + pci_name(dev)); edev->mode |= EEH_DEV_DISCONNECTED; if (rmv_data) rmv_data->removed_dev_count++; @@ -591,34 +580,22 @@ static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata) * PE reset (for 3 times), we try to clear the frozen state * for 3 times as well. */ -static void *__eeh_clear_pe_frozen_state(struct eeh_pe *pe, void *flag) +static int eeh_clear_pe_frozen_state(struct eeh_pe *root, bool include_passed) { - bool clear_sw_state = *(bool *)flag; - int i, rc = 1; - - for (i = 0; rc && i < 3; i++) - rc = eeh_unfreeze_pe(pe, clear_sw_state); + struct eeh_pe *pe; + int i; - /* Stop immediately on any errors */ - if (rc) { - pr_warn("%s: Failure %d unfreezing PHB#%x-PE#%x\n", - __func__, rc, pe->phb->global_number, pe->addr); - return (void *)pe; + eeh_for_each_pe(root, pe) { + if (include_passed || !eeh_pe_passed(pe)) { + for (i = 0; i < 3; i++) + if (!eeh_unfreeze_pe(pe)) + break; + if (i >= 3) + return -EIO; + } } - - return NULL; -} - -static int eeh_clear_pe_frozen_state(struct eeh_pe *pe, - bool clear_sw_state) -{ - void *rc; - - rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, &clear_sw_state); - if (!rc) - eeh_pe_state_clear(pe, EEH_PE_ISOLATED); - - return rc ? -EIO : 0; + eeh_pe_state_clear(root, EEH_PE_ISOLATED, include_passed); + return 0; } int eeh_pe_reset_and_recover(struct eeh_pe *pe) @@ -636,16 +613,16 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe) eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL); /* Issue reset */ - ret = eeh_pe_reset_full(pe); + ret = eeh_pe_reset_full(pe, true); if (ret) { - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); return ret; } /* Unfreeze the PE */ ret = eeh_clear_pe_frozen_state(pe, true); if (ret) { - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); return ret; } @@ -653,7 +630,7 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe) eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL); /* Clear recovery mode */ - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); return 0; } @@ -676,6 +653,11 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, time64_t tstamp; int cnt, rc; struct eeh_dev *edev; + struct eeh_pe *tmp_pe; + bool any_passed = false; + + eeh_for_each_pe(pe, tmp_pe) + any_passed |= eeh_pe_passed(tmp_pe); /* pcibios will clear the counter; save the value */ cnt = pe->freeze_count; @@ -688,7 +670,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * into pci_hp_add_devices(). */ eeh_pe_state_mark(pe, EEH_PE_KEEP); - if (driver_eeh_aware || (pe->type & EEH_PE_VF)) { + if (any_passed || driver_eeh_aware || (pe->type & EEH_PE_VF)) { eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data); } else { pci_lock_rescan_remove(); @@ -705,7 +687,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * config accesses. So we prefer to block them. However, controlled * PCI config accesses initiated from EEH itself are allowed. */ - rc = eeh_pe_reset_full(pe); + rc = eeh_pe_reset_full(pe, false); if (rc) return rc; @@ -744,11 +726,11 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, eeh_add_virt_device(edev); } else { if (!driver_eeh_aware) - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); pci_hp_add_devices(bus); } } - eeh_pe_state_clear(pe, EEH_PE_KEEP); + eeh_pe_state_clear(pe, EEH_PE_KEEP, true); pe->tstamp = tstamp; pe->freeze_count = cnt; @@ -900,7 +882,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) * is still in frozen state. Clear it before * resuming the PE. */ - eeh_pe_state_clear(pe, EEH_PE_ISOLATED); + eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true); result = PCI_ERS_RESULT_RECOVERED; } } @@ -977,7 +959,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); } else { - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); pci_lock_rescan_remove(); @@ -987,7 +969,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) return; } } - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); } /** @@ -1069,7 +1051,7 @@ void eeh_handle_special_event(void) continue; /* Notify all devices to be down */ - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); eeh_set_channel_state(pe, pci_channel_io_perm_failure); eeh_pe_report( "error_detected(permanent failure)", pe, diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c @@ -121,7 +121,7 @@ int eeh_event_init(void) * the actual event will be delivered in a normal context * (from a workqueue). */ -int eeh_send_failure_event(struct eeh_pe *pe) +int __eeh_send_failure_event(struct eeh_pe *pe) { unsigned long flags; struct eeh_event *event; @@ -144,6 +144,20 @@ int eeh_send_failure_event(struct eeh_pe *pe) return 0; } +int eeh_send_failure_event(struct eeh_pe *pe) +{ + /* + * If we've manually supressed recovery events via debugfs + * then just drop it on the floor. + */ + if (eeh_debugfs_no_recover) { + pr_err("EEH: Event dropped due to no_recover setting\n"); + return 0; + } + + return __eeh_send_failure_event(pe); +} + /** * eeh_remove_event - Remove EEH event from the queue * @pe: Event binding to the PE diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c @@ -657,62 +657,52 @@ void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode) } /** - * __eeh_pe_state_clear - Clear state for the PE + * eeh_pe_state_clear - Clear state for the PE * @data: EEH PE - * @flag: state + * @state: state + * @include_passed: include passed-through devices? * * The function is used to clear the indicated state from the * given PE. Besides, we also clear the check count of the PE * as well. */ -static void *__eeh_pe_state_clear(struct eeh_pe *pe, void *flag) +void eeh_pe_state_clear(struct eeh_pe *root, int state, bool include_passed) { - int state = *((int *)flag); + struct eeh_pe *pe; struct eeh_dev *edev, *tmp; struct pci_dev *pdev; - /* Keep the state of permanently removed PE intact */ - if (pe->state & EEH_PE_REMOVED) - return NULL; + eeh_for_each_pe(root, pe) { + /* Keep the state of permanently removed PE intact */ + if (pe->state & EEH_PE_REMOVED) + continue; - pe->state &= ~state; + if (!include_passed && eeh_pe_passed(pe)) + continue; - /* - * Special treatment on clearing isolated state. Clear - * check count since last isolation and put all affected - * devices to normal state. - */ - if (!(state & EEH_PE_ISOLATED)) - return NULL; + pe->state &= ~state; - pe->check_count = 0; - eeh_pe_for_each_dev(pe, edev, tmp) { - pdev = eeh_dev_to_pci_dev(edev); - if (!pdev) + /* + * Special treatment on clearing isolated state. Clear + * check count since last isolation and put all affected + * devices to normal state. + */ + if (!(state & EEH_PE_ISOLATED)) continue; - pdev->error_state = pci_channel_io_normal; - } - - /* Unblock PCI config access if required */ - if (pe->state & EEH_PE_CFG_RESTRICTED) - pe->state &= ~EEH_PE_CFG_BLOCKED; + pe->check_count = 0; + eeh_pe_for_each_dev(pe, edev, tmp) { + pdev = eeh_dev_to_pci_dev(edev); + if (!pdev) + continue; - return NULL; -} + pdev->error_state = pci_channel_io_normal; + } -/** - * eeh_pe_state_clear - Clear state for the PE and its children - * @pe: PE - * @state: state to be cleared - * - * When the PE and its children has been recovered from error, - * we need clear the error state for that. The function is used - * for the purpose. - */ -void eeh_pe_state_clear(struct eeh_pe *pe, int state) -{ - eeh_pe_traverse(pe, __eeh_pe_state_clear, &state); + /* Unblock PCI config access if required */ + if (pe->state & EEH_PE_CFG_RESTRICTED) + pe->state &= ~EEH_PE_CFG_BLOCKED; + } } /* diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c @@ -82,8 +82,9 @@ static ssize_t eeh_pe_state_store(struct device *dev, if (!(edev->pe->state & EEH_PE_ISOLATED)) return count; - if (eeh_unfreeze_pe(edev->pe, true)) + if (eeh_unfreeze_pe(edev->pe)) return -EIO; + eeh_pe_state_clear(edev->pe, EEH_PE_ISOLATED, true); return count; } diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S @@ -97,14 +97,11 @@ crit_transfer_to_handler: mfspr r0,SPRN_SRR1 stw r0,_SRR1(r11) - /* set the stack limit to the current stack - * and set the limit to protect the thread_info - * struct - */ + /* set the stack limit to the current stack */ mfspr r8,SPRN_SPRG_THREAD lwz r0,KSP_LIMIT(r8) stw r0,SAVED_KSP_LIMIT(r11) - rlwimi r0,r1,0,0,(31-THREAD_SHIFT) + rlwinm r0,r1,0,0,(31 - THREAD_SHIFT) stw r0,KSP_LIMIT(r8) /* fall through */ #endif @@ -121,14 +118,11 @@ crit_transfer_to_handler: mfspr r0,SPRN_SRR1 stw r0,crit_srr1@l(0) - /* set the stack limit to the current stack - * and set the limit to protect the thread_info - * struct - */ + /* set the stack limit to the current stack */ mfspr r8,SPRN_SPRG_THREAD lwz r0,KSP_LIMIT(r8) stw r0,saved_ksp_limit@l(0) - rlwimi r0,r1,0,0,(31-THREAD_SHIFT) + rlwinm r0,r1,0,0,(31 - THREAD_SHIFT) stw r0,KSP_LIMIT(r8) /* fall through */ #endif @@ -157,7 +151,6 @@ transfer_to_handler: stw r2,_XER(r11) mfspr r12,SPRN_SPRG_THREAD addi r2,r12,-THREAD - tovirt(r2,r2) /* set r2 to current */ beq 2f /* if from user, fix up THREAD.regs */ addi r11,r1,STACK_FRAME_OVERHEAD stw r11,PT_REGS(r12) @@ -166,6 +159,9 @@ transfer_to_handler: internal debug mode bit to do this. */ lwz r12,THREAD_DBCR0(r12) andis. r12,r12,DBCR0_IDM@h +#endif + ACCOUNT_CPU_USER_ENTRY(r2, r11, r12) +#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) beq+ 3f /* From user and task is ptraced - load up global dbcr0 */ li r12,-1 /* clear all pending debug events */ @@ -174,8 +170,7 @@ transfer_to_handler: tophys(r11,r11) addi r11,r11,global_dbcr0@l #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r9, r1) - lwz r9,TI_CPU(r9) + lwz r9,TASK_CPU(r2) slwi r9,r9,3 add r11,r11,r9 #endif @@ -185,11 +180,6 @@ transfer_to_handler: addi r12,r12,-1 stw r12,4(r11) #endif -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - CURRENT_THREAD_INFO(r9, r1) - tophys(r9, r9) - ACCOUNT_CPU_USER_ENTRY(r9, r11, r12) -#endif b 3f @@ -201,9 +191,7 @@ transfer_to_handler: ble- stack_ovf /* then the kernel stack overflowed */ 5: #if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) - CURRENT_THREAD_INFO(r9, r1) - tophys(r9,r9) /* check local flags */ - lwz r12,TI_LOCAL_FLAGS(r9) + lwz r12,TI_LOCAL_FLAGS(r2) mtcrf 0x01,r12 bt- 31-TLF_NAPPING,4f bt- 31-TLF_SLEEPING,7f @@ -212,6 +200,7 @@ transfer_to_handler: transfer_to_handler_cont: 3: mflr r9 + tovirt(r2, r2) /* set r2 to current */ lwz r11,0(r9) /* virtual address of handler */ lwz r9,4(r9) /* where to go when done */ #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) @@ -275,11 +264,11 @@ reenable_mmu: /* re-enable mmu so we can */ #if defined (CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) 4: rlwinm r12,r12,0,~_TLF_NAPPING - stw r12,TI_LOCAL_FLAGS(r9) + stw r12,TI_LOCAL_FLAGS(r2) b power_save_ppc32_restore 7: rlwinm r12,r12,0,~_TLF_SLEEPING - stw r12,TI_LOCAL_FLAGS(r9) + stw r12,TI_LOCAL_FLAGS(r2) lwz r9,_MSR(r11) /* if sleeping, clear MSR.EE */ rlwinm r9,r9,0,~MSR_EE lwz r12,_LINK(r11) /* and return to address in LR */ @@ -351,8 +340,7 @@ _GLOBAL(DoSyscall) mtmsr r11 1: #endif /* CONFIG_TRACE_IRQFLAGS */ - CURRENT_THREAD_INFO(r10, r1) - lwz r11,TI_FLAGS(r10) + lwz r11,TI_FLAGS(r2) andi. r11,r11,_TIF_SYSCALL_DOTRACE bne- syscall_dotrace syscall_dotrace_cont: @@ -385,13 +373,12 @@ ret_from_syscall: lwz r3,GPR3(r1) #endif mr r6,r3 - CURRENT_THREAD_INFO(r12, r1) /* disable interrupts so current_thread_info()->flags can't change */ LOAD_MSR_KERNEL(r10,MSR_KERNEL) /* doesn't include MSR_EE */ /* Note: We don't bother telling lockdep about it */ SYNC MTMSRD(r10) - lwz r9,TI_FLAGS(r12) + lwz r9,TI_FLAGS(r2) li r8,-MAX_ERRNO andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) bne- syscall_exit_work @@ -438,8 +425,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE andi. r4,r8,MSR_PR beq 3f - CURRENT_THREAD_INFO(r4, r1) - ACCOUNT_CPU_USER_EXIT(r4, r5, r7) + ACCOUNT_CPU_USER_EXIT(r2, r5, r7) 3: #endif lwz r4,_LINK(r1) @@ -532,7 +518,7 @@ syscall_exit_work: /* Clear per-syscall TIF flags if any are set. */ li r11,_TIF_PERSYSCALL_MASK - addi r12,r12,TI_FLAGS + addi r12,r2,TI_FLAGS 3: lwarx r8,0,r12 andc r8,r8,r11 #ifdef CONFIG_IBM405_ERR77 @@ -540,7 +526,6 @@ syscall_exit_work: #endif stwcx. r8,0,r12 bne- 3b - subi r12,r12,TI_FLAGS 4: /* Anything which requires enabling interrupts? */ andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP) @@ -745,6 +730,9 @@ fast_exception_return: mtcr r10 lwz r10,_LINK(r11) mtlr r10 + /* Clear the exception_marker on the stack to avoid confusing stacktrace */ + li r10, 0 + stw r10, 8(r11) REST_GPR(10, r11) #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) mtspr SPRN_NRI, r0 @@ -819,8 +807,7 @@ ret_from_except: user_exc_return: /* r10 contains MSR_KERNEL here */ /* Check current_thread_info()->flags */ - CURRENT_THREAD_INFO(r9, r1) - lwz r9,TI_FLAGS(r9) + lwz r9,TI_FLAGS(r2) andi. r0,r9,_TIF_USER_WORK_MASK bne do_work @@ -832,18 +819,14 @@ restore_user: andis. r10,r0,DBCR0_IDM@h bnel- load_dbcr0 #endif -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - CURRENT_THREAD_INFO(r9, r1) - ACCOUNT_CPU_USER_EXIT(r9, r10, r11) -#endif + ACCOUNT_CPU_USER_EXIT(r2, r10, r11) b restore /* N.B. the only way to get here is from the beq following ret_from_except. */ resume_kernel: /* check current_thread_info, _TIF_EMULATE_STACK_STORE */ - CURRENT_THREAD_INFO(r9, r1) - lwz r8,TI_FLAGS(r9) + lwz r8,TI_FLAGS(r2) andis. r0,r8,_TIF_EMULATE_STACK_STORE@h beq+ 1f @@ -869,7 +852,7 @@ resume_kernel: /* Clear _TIF_EMULATE_STACK_STORE flag */ lis r11,_TIF_EMULATE_STACK_STORE@h - addi r5,r9,TI_FLAGS + addi r5,r2,TI_FLAGS 0: lwarx r8,0,r5 andc r8,r8,r11 #ifdef CONFIG_IBM405_ERR77 @@ -881,7 +864,7 @@ resume_kernel: #ifdef CONFIG_PREEMPT /* check current_thread_info->preempt_count */ - lwz r0,TI_PREEMPT(r9) + lwz r0,TI_PREEMPT(r2) cmpwi 0,r0,0 /* if non-zero, just restore regs and return */ bne restore andi. r8,r8,_TIF_NEED_RESCHED @@ -897,8 +880,7 @@ resume_kernel: bl trace_hardirqs_off #endif 1: bl preempt_schedule_irq - CURRENT_THREAD_INFO(r9, r1) - lwz r3,TI_FLAGS(r9) + lwz r3,TI_FLAGS(r2) andi. r0,r3,_TIF_NEED_RESCHED bne- 1b #ifdef CONFIG_TRACE_IRQFLAGS @@ -982,6 +964,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) mtcrf 0xFF,r10 mtlr r11 + /* Clear the exception_marker on the stack to avoid confusing stacktrace */ + li r10, 0 + stw r10, 8(r1) /* * Once we put values in SRR0 and SRR1, we are in a state * where exceptions are not recoverable, since taking an @@ -997,9 +982,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) .globl exc_exit_restart exc_exit_restart: lwz r12,_NIP(r1) -#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) - mtspr SPRN_NRI, r0 -#endif mtspr SPRN_SRR0,r12 mtspr SPRN_SRR1,r9 REST_4GPRS(9, r1) @@ -1021,6 +1003,9 @@ exc_exit_restart_end: mtlr r11 lwz r10,_CCR(r1) mtcrf 0xff,r10 + /* Clear the exception_marker on the stack to avoid confusing stacktrace */ + li r10, 0 + stw r10, 8(r1) REST_2GPRS(9, r1) .globl exc_exit_restart exc_exit_restart: @@ -1166,10 +1151,6 @@ ret_from_debug_exc: mfspr r9,SPRN_SPRG_THREAD lwz r10,SAVED_KSP_LIMIT(r1) stw r10,KSP_LIMIT(r9) - lwz r9,THREAD_INFO-THREAD(r9) - CURRENT_THREAD_INFO(r10, r1) - lwz r10,TI_PREEMPT(r10) - stw r10,TI_PREEMPT(r9) RESTORE_xSRR(SRR0,SRR1); RESTORE_xSRR(CSRR0,CSRR1); RESTORE_MMU_REGS; @@ -1201,8 +1182,7 @@ load_dbcr0: lis r11,global_dbcr0@ha addi r11,r11,global_dbcr0@l #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r9, r1) - lwz r9,TI_CPU(r9) + lwz r9,TASK_CPU(r2) slwi r9,r9,3 add r11,r11,r9 #endif @@ -1242,8 +1222,7 @@ recheck: LOAD_MSR_KERNEL(r10,MSR_KERNEL) SYNC MTMSRD(r10) /* disable interrupts */ - CURRENT_THREAD_INFO(r9, r1) - lwz r9,TI_FLAGS(r9) + lwz r9,TI_FLAGS(r2) andi. r0,r9,_TIF_NEED_RESCHED bne- do_resched andi. r0,r9,_TIF_USER_WORK_MASK @@ -1292,10 +1271,13 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_601) lwz r3,_TRAP(r1) andi. r0,r3,1 - beq 4f + beq 5f SAVE_NVGPRS(r1) rlwinm r3,r3,0,0,30 stw r3,_TRAP(r1) +5: mfspr r2,SPRN_SPRG_THREAD + addi r2,r2,-THREAD + tovirt(r2,r2) /* set back r2 to current */ 4: addi r3,r1,STACK_FRAME_OVERHEAD bl unrecoverable_exception /* shouldn't return */ @@ -1335,7 +1317,7 @@ _GLOBAL(enter_rtas) MTMSRD(r0) /* don't get trashed */ li r9,MSR_KERNEL & ~(MSR_IR|MSR_DR) mtlr r6 - mtspr SPRN_SPRG_RTAS,r7 + stw r7, THREAD + RTAS_SP(r2) mtspr SPRN_SRR0,r8 mtspr SPRN_SRR1,r9 RFI @@ -1344,7 +1326,8 @@ _GLOBAL(enter_rtas) lwz r9,8(r9) /* original msr value */ addi r1,r1,INT_FRAME_SIZE li r0,0 - mtspr SPRN_SPRG_RTAS,r0 + tophys(r7, r2) + stw r0, THREAD + RTAS_SP(r7) mtspr SPRN_SRR0,r8 mtspr SPRN_SRR1,r9 RFI /* return to caller */ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S @@ -166,7 +166,7 @@ system_call: /* label this so stack traces look sane */ li r10,IRQS_ENABLED std r10,SOFTE(r1) - CURRENT_THREAD_INFO(r11, r1) + ld r11, PACA_THREAD_INFO(r13) ld r10,TI_FLAGS(r11) andi. r11,r10,_TIF_SYSCALL_DOTRACE bne .Lsyscall_dotrace /* does not return */ @@ -213,7 +213,7 @@ system_call: /* label this so stack traces look sane */ ld r3,RESULT(r1) #endif - CURRENT_THREAD_INFO(r12, r1) + ld r12, PACA_THREAD_INFO(r13) ld r8,_MSR(r1) #ifdef CONFIG_PPC_BOOK3S @@ -236,18 +236,14 @@ system_call_exit: /* * Disable interrupts so current_thread_info()->flags can't change, * and so that we don't get interrupted after loading SRR0/1. + * + * Leave MSR_RI enabled for now, because with THREAD_INFO_IN_TASK we + * could fault on the load of the TI_FLAGS below. */ #ifdef CONFIG_PPC_BOOK3E wrteei 0 #else - /* - * For performance reasons we clear RI the same time that we - * clear EE. We only need to clear RI just before we restore r13 - * below, but batching it with EE saves us one expensive mtmsrd call. - * We have to be careful to restore RI if we branch anywhere from - * here (eg syscall_exit_work). - */ - li r11,0 + li r11,MSR_RI mtmsrd r11,1 #endif /* CONFIG_PPC_BOOK3E */ @@ -263,15 +259,7 @@ system_call_exit: bne 3f #endif 2: addi r3,r1,STACK_FRAME_OVERHEAD -#ifdef CONFIG_PPC_BOOK3S - li r10,MSR_RI - mtmsrd r10,1 /* Restore RI */ -#endif bl restore_math -#ifdef CONFIG_PPC_BOOK3S - li r11,0 - mtmsrd r11,1 -#endif ld r8,_MSR(r1) ld r3,RESULT(r1) li r11,-MAX_ERRNO @@ -287,6 +275,16 @@ END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) andi. r6,r8,MSR_PR ld r4,_LINK(r1) +#ifdef CONFIG_PPC_BOOK3S + /* + * Clear MSR_RI, MSR_EE is already and remains disabled. We could do + * this later, but testing shows that doing it here causes less slow + * down than doing it closer to the rfid. + */ + li r11,0 + mtmsrd r11,1 +#endif + beq- 1f ACCOUNT_CPU_USER_EXIT(r13, r11, r12) @@ -348,7 +346,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) /* Repopulate r9 and r10 for the syscall path */ addi r9,r1,STACK_FRAME_OVERHEAD - CURRENT_THREAD_INFO(r10, r1) + ld r10, PACA_THREAD_INFO(r13) ld r10,TI_FLAGS(r10) cmpldi r0,NR_syscalls @@ -363,10 +361,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) b .Lsyscall_exit .Lsyscall_exit_work: -#ifdef CONFIG_PPC_BOOK3S - li r10,MSR_RI - mtmsrd r10,1 /* Restore RI */ -#endif /* If TIF_RESTOREALL is set, don't scribble on either r3 or ccr. If TIF_NOERROR is set, just save r3 as it is. */ @@ -695,7 +689,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) 2: #endif /* CONFIG_PPC_BOOK3S_64 */ - CURRENT_THREAD_INFO(r7, r8) /* base of new stack */ + clrrdi r7, r8, THREAD_SHIFT /* base of new stack */ /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE because we don't need to leave the 288-byte ABI gap at the top of the kernel stack. */ @@ -746,7 +740,7 @@ _GLOBAL(ret_from_except_lite) mtmsrd r10,1 /* Update machine state */ #endif /* CONFIG_PPC_BOOK3E */ - CURRENT_THREAD_INFO(r9, r1) + ld r9, PACA_THREAD_INFO(r13) ld r3,_MSR(r1) #ifdef CONFIG_PPC_BOOK3E ld r10,PACACURRENT(r13) @@ -860,7 +854,7 @@ resume_kernel: 1: bl preempt_schedule_irq /* Re-test flags and eventually loop */ - CURRENT_THREAD_INFO(r9, r1) + ld r9, PACA_THREAD_INFO(r13) ld r4,TI_FLAGS(r9) andi. r0,r4,_TIF_NEED_RESCHED bne 1b @@ -1002,6 +996,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r2,_NIP(r1) mtspr SPRN_SRR0,r2 + /* + * Leaving a stale exception_marker on the stack can confuse + * the reliable stack unwinder later on. Clear it. + */ + li r2,0 + std r2,STACK_FRAME_OVERHEAD-16(r1) + ld r0,GPR0(r1) ld r2,GPR2(r1) ld r3,GPR3(r1) diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S @@ -21,10 +21,9 @@ #ifndef CONFIG_PPC64 /* epapr_ev_idle() was derived from e500_idle() */ _GLOBAL(epapr_ev_idle) - CURRENT_THREAD_INFO(r3, r1) - PPC_LL r4, TI_LOCAL_FLAGS(r3) /* set napping bit */ + PPC_LL r4, TI_LOCAL_FLAGS(r2) /* set napping bit */ ori r4, r4,_TLF_NAPPING /* so when we take an exception */ - PPC_STL r4, TI_LOCAL_FLAGS(r3) /* it will return to our caller */ + PPC_STL r4, TI_LOCAL_FLAGS(r2) /* it will return to our caller */ wrteei 1 diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S @@ -77,17 +77,6 @@ special_reg_save: andi. r3,r3,MSR_PR bnelr - /* Copy info into temporary exception thread info */ - ld r11,PACAKSAVE(r13) - CURRENT_THREAD_INFO(r11, r11) - CURRENT_THREAD_INFO(r12, r1) - ld r10,TI_FLAGS(r11) - std r10,TI_FLAGS(r12) - ld r10,TI_PREEMPT(r11) - std r10,TI_PREEMPT(r12) - ld r10,TI_TASK(r11) - std r10,TI_TASK(r12) - /* * Advance to the next TLB exception frame for handler * types that don't do it automatically. @@ -349,6 +338,7 @@ ret_from_mc_except: #define GEN_BTB_FLUSH #define CRIT_BTB_FLUSH #define DBG_BTB_FLUSH +#define MC_BTB_FLUSH #define GDBELL_BTB_FLUSH #endif @@ -504,7 +494,7 @@ exc_##n##_bad_stack: \ * interrupts happen before the wait instruction. */ #define CHECK_NAPPING() \ - CURRENT_THREAD_INFO(r11, r1); \ + ld r11, PACA_THREAD_INFO(r13); \ ld r10,TI_LOCAL_FLAGS(r11); \ andi. r9,r10,_TLF_NAPPING; \ beq+ 1f; \ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S @@ -68,6 +68,14 @@ OPEN_FIXED_SECTION(real_vectors, 0x0100, 0x1900) OPEN_FIXED_SECTION(real_trampolines, 0x1900, 0x4000) OPEN_FIXED_SECTION(virt_vectors, 0x4000, 0x5900) OPEN_FIXED_SECTION(virt_trampolines, 0x5900, 0x7000) + +#ifdef CONFIG_PPC_POWERNV + .globl start_real_trampolines + .globl end_real_trampolines + .globl start_virt_trampolines + .globl end_virt_trampolines +#endif + #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* * Data area reserved for FWNMI option. @@ -566,8 +574,36 @@ EXC_COMMON_BEGIN(mce_return) RFI_TO_KERNEL b . -EXC_REAL(data_access, 0x300, 0x80) -EXC_VIRT(data_access, 0x4300, 0x80, 0x300) +EXC_REAL_BEGIN(data_access, 0x300, 0x80) +SET_SCRATCH0(r13) /* save r13 */ +EXCEPTION_PROLOG_0(PACA_EXGEN) + b tramp_real_data_access +EXC_REAL_END(data_access, 0x300, 0x80) + +TRAMP_REAL_BEGIN(tramp_real_data_access) +EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_PR, 0x300) + /* + * DAR/DSISR must be read before setting MSR[RI], because + * a d-side MCE will clobber those registers so is not + * recoverable if they are live. + */ + mfspr r10,SPRN_DAR + mfspr r11,SPRN_DSISR + std r10,PACA_EXGEN+EX_DAR(r13) + stw r11,PACA_EXGEN+EX_DSISR(r13) +EXCEPTION_PROLOG_2(data_access_common, EXC_STD) + +EXC_VIRT_BEGIN(data_access, 0x4300, 0x80) +SET_SCRATCH0(r13) /* save r13 */ +EXCEPTION_PROLOG_0(PACA_EXGEN) +EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x300) + mfspr r10,SPRN_DAR + mfspr r11,SPRN_DSISR + std r10,PACA_EXGEN+EX_DAR(r13) + stw r11,PACA_EXGEN+EX_DSISR(r13) +EXCEPTION_PROLOG_2_RELON(data_access_common, EXC_STD) +EXC_VIRT_END(data_access, 0x4300, 0x80) + TRAMP_KVM_SKIP(PACA_EXGEN, 0x300) EXC_COMMON_BEGIN(data_access_common) @@ -575,11 +611,8 @@ EXC_COMMON_BEGIN(data_access_common) * Here r13 points to the paca, r9 contains the saved CR, * SRR0 and SRR1 are saved in r11 and r12, * r9 - r13 are saved in paca->exgen. + * EX_DAR and EX_DSISR have saved DAR/DSISR */ - mfspr r10,SPRN_DAR - std r10,PACA_EXGEN+EX_DAR(r13) - mfspr r10,SPRN_DSISR - stw r10,PACA_EXGEN+EX_DSISR(r13) EXCEPTION_PROLOG_COMMON(0x300, PACA_EXGEN) RECONCILE_IRQ_STATE(r10, r11) ld r12,_MSR(r1) @@ -596,18 +629,29 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) -EXCEPTION_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, KVMTEST_PR, 0x380); +SET_SCRATCH0(r13) /* save r13 */ +EXCEPTION_PROLOG_0(PACA_EXSLB) + b tramp_real_data_access_slb EXC_REAL_END(data_access_slb, 0x380, 0x80) +TRAMP_REAL_BEGIN(tramp_real_data_access_slb) +EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380) + mfspr r10,SPRN_DAR + std r10,PACA_EXSLB+EX_DAR(r13) +EXCEPTION_PROLOG_2(data_access_slb_common, EXC_STD) + EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) -EXCEPTION_RELON_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, NOTEST, 0x380); +SET_SCRATCH0(r13) /* save r13 */ +EXCEPTION_PROLOG_0(PACA_EXSLB) +EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380) + mfspr r10,SPRN_DAR + std r10,PACA_EXSLB+EX_DAR(r13) +EXCEPTION_PROLOG_2_RELON(data_access_slb_common, EXC_STD) EXC_VIRT_END(data_access_slb, 0x4380, 0x80) TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) EXC_COMMON_BEGIN(data_access_slb_common) - mfspr r10,SPRN_DAR - std r10,PACA_EXSLB+EX_DAR(r13) EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB) ld r4,PACA_EXSLB+EX_DAR(r13) std r4,_DAR(r1) @@ -703,14 +747,30 @@ TRAMP_KVM_HV(PACA_EXGEN, 0x500) EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ) -EXC_REAL(alignment, 0x600, 0x100) -EXC_VIRT(alignment, 0x4600, 0x100, 0x600) -TRAMP_KVM(PACA_EXGEN, 0x600) -EXC_COMMON_BEGIN(alignment_common) +EXC_REAL_BEGIN(alignment, 0x600, 0x100) +SET_SCRATCH0(r13) /* save r13 */ +EXCEPTION_PROLOG_0(PACA_EXGEN) +EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_PR, 0x600) mfspr r10,SPRN_DAR + mfspr r11,SPRN_DSISR std r10,PACA_EXGEN+EX_DAR(r13) - mfspr r10,SPRN_DSISR - stw r10,PACA_EXGEN+EX_DSISR(r13) + stw r11,PACA_EXGEN+EX_DSISR(r13) +EXCEPTION_PROLOG_2(alignment_common, EXC_STD) +EXC_REAL_END(alignment, 0x600, 0x100) + +EXC_VIRT_BEGIN(alignment, 0x4600, 0x100) +SET_SCRATCH0(r13) /* save r13 */ +EXCEPTION_PROLOG_0(PACA_EXGEN) +EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x600) + mfspr r10,SPRN_DAR + mfspr r11,SPRN_DSISR + std r10,PACA_EXGEN+EX_DAR(r13) + stw r11,PACA_EXGEN+EX_DSISR(r13) +EXCEPTION_PROLOG_2_RELON(alignment_common, EXC_STD) +EXC_VIRT_END(alignment, 0x4600, 0x100) + +TRAMP_KVM(PACA_EXGEN, 0x600) +EXC_COMMON_BEGIN(alignment_common) EXCEPTION_PROLOG_COMMON(0x600, PACA_EXGEN) ld r3,PACA_EXGEN+EX_DAR(r13) lwz r4,PACA_EXGEN+EX_DSISR(r13) @@ -1629,7 +1689,7 @@ do_hash_page: ori r0,r0,DSISR_BAD_FAULT_64S@l and. r0,r4,r0 /* weird error? */ bne- handle_page_fault /* if not, try to insert a HPTE */ - CURRENT_THREAD_INFO(r11, r1) + ld r11, PACA_THREAD_INFO(r13) lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */ andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */ bne 77f /* then don't call hash_page now */ diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S @@ -261,7 +261,7 @@ __secondary_hold_acknowledge: tophys(r11,r1); /* use tophys(r1) if kernel */ \ beq 1f; \ mfspr r11,SPRN_SPRG_THREAD; \ - lwz r11,THREAD_INFO-THREAD(r11); \ + lwz r11,TASK_STACK-THREAD(r11); \ addi r11,r11,THREAD_SIZE; \ tophys(r11,r11); \ 1: subi r11,r11,INT_FRAME_SIZE /* alloc exc. frame */ @@ -352,9 +352,8 @@ i##n: \ * registers that might have bad values includes all the GPRs * and all the BATs. We indicate that we are in RTAS by putting * a non-zero value, the address of the exception frame to use, - * in SPRG2. The machine check handler checks SPRG2 and uses its - * value if it is non-zero. If we ever needed to free up SPRG2, - * we could use a field in the thread_info or thread_struct instead. + * in thread.rtas_sp. The machine check handler checks thread.rtas_sp + * and uses its value if it is non-zero. * (Other exception handlers assume that r1 is a valid kernel stack * pointer when we take an exception from supervisor mode.) * -- paulus. @@ -365,16 +364,15 @@ i##n: \ mtspr SPRN_SPRG_SCRATCH1,r11 mfcr r10 #ifdef CONFIG_PPC_CHRP - mfspr r11,SPRN_SPRG_RTAS - cmpwi 0,r11,0 - bne 7f + mfspr r11, SPRN_SPRG_THREAD + lwz r11, RTAS_SP(r11) + cmpwi cr1, r11, 0 + bne cr1, 7f #endif /* CONFIG_PPC_CHRP */ EXCEPTION_PROLOG_1 7: EXCEPTION_PROLOG_2 addi r3,r1,STACK_FRAME_OVERHEAD #ifdef CONFIG_PPC_CHRP - mfspr r4,SPRN_SPRG_RTAS - cmpwi cr1,r4,0 bne cr1,1f #endif EXC_XFER_STD(0x200, machine_check_exception) @@ -500,18 +498,22 @@ InstructionTLBMiss: */ /* Get PTE (linux-style) and check access */ mfspr r3,SPRN_IMISS +#if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC) lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 - mfspr r2,SPRN_SPRG_THREAD - li r1,_PAGE_USER|_PAGE_PRESENT|_PAGE_EXEC /* low addresses tested as user */ - lwz r2,PGDIR(r2) +#endif + mfspr r2, SPRN_SPRG_PGDIR +#ifdef CONFIG_SWAP + li r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC +#else + li r1,_PAGE_PRESENT | _PAGE_EXEC +#endif +#if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC) bge- 112f - mfspr r2,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ - rlwimi r1,r2,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ - lis r2,swapper_pg_dir@ha /* if kernel address, use */ - addi r2,r2,swapper_pg_dir@l /* kernel page table */ -112: tophys(r2,r2) - rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ + lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ +#endif +112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ rlwinm. r2,r2,0,0,19 /* extract address of pte page */ beq- InstructionAddressInvalid /* return if no mapping */ @@ -519,20 +521,10 @@ InstructionTLBMiss: lwz r0,0(r2) /* get linux-style pte */ andc. r1,r1,r0 /* check access & ~permission */ bne- InstructionAddressInvalid /* return if access not permitted */ - ori r0,r0,_PAGE_ACCESSED /* set _PAGE_ACCESSED in pte */ - /* - * NOTE! We are assuming this is not an SMP system, otherwise - * we would need to update the pte atomically with lwarx/stwcx. - */ - stw r0,0(r2) /* update PTE (accessed bit) */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwinm r1,r0,32-10,31,31 /* _PAGE_RW -> PP lsb */ - rlwinm r2,r0,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */ - and r1,r1,r2 /* writable if _RW and _DIRTY */ rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ - rlwimi r0,r0,32-1,31,31 /* _PAGE_USER -> PP lsb */ - ori r1,r1,0xe04 /* clear out reserved bits */ - andc r1,r0,r1 /* PP = user? (rw&dirty? 2: 3): 0 */ + ori r1, r1, 0xe05 /* clear out reserved bits */ + andc r1, r0, r1 /* PP = user? 2 : 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -576,16 +568,16 @@ DataLoadTLBMiss: mfspr r3,SPRN_DMISS lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 - mfspr r2,SPRN_SPRG_THREAD - li r1,_PAGE_USER|_PAGE_PRESENT /* low addresses tested as user */ - lwz r2,PGDIR(r2) + mfspr r2, SPRN_SPRG_PGDIR +#ifdef CONFIG_SWAP + li r1, _PAGE_PRESENT | _PAGE_ACCESSED +#else + li r1, _PAGE_PRESENT +#endif bge- 112f - mfspr r2,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ - rlwimi r1,r2,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ - lis r2,swapper_pg_dir@ha /* if kernel address, use */ - addi r2,r2,swapper_pg_dir@l /* kernel page table */ -112: tophys(r2,r2) - rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ + lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ +112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ rlwinm. r2,r2,0,0,19 /* extract address of pte page */ beq- DataAddressInvalid /* return if no mapping */ @@ -593,20 +585,16 @@ DataLoadTLBMiss: lwz r0,0(r2) /* get linux-style pte */ andc. r1,r1,r0 /* check access & ~permission */ bne- DataAddressInvalid /* return if access not permitted */ - ori r0,r0,_PAGE_ACCESSED /* set _PAGE_ACCESSED in pte */ /* * NOTE! We are assuming this is not an SMP system, otherwise * we would need to update the pte atomically with lwarx/stwcx. */ - stw r0,0(r2) /* update PTE (accessed bit) */ /* Convert linux-style PTE to low word of PPC-style PTE */ rlwinm r1,r0,32-10,31,31 /* _PAGE_RW -> PP lsb */ - rlwinm r2,r0,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */ - and r1,r1,r2 /* writable if _RW and _DIRTY */ rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ rlwimi r0,r0,32-1,31,31 /* _PAGE_USER -> PP lsb */ ori r1,r1,0xe04 /* clear out reserved bits */ - andc r1,r0,r1 /* PP = user? (rw&dirty? 2: 3): 0 */ + andc r1,r0,r1 /* PP = user? rw? 2: 3: 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -660,16 +648,16 @@ DataStoreTLBMiss: mfspr r3,SPRN_DMISS lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 - mfspr r2,SPRN_SPRG_THREAD - li r1,_PAGE_RW|_PAGE_USER|_PAGE_PRESENT /* access flags */ - lwz r2,PGDIR(r2) + mfspr r2, SPRN_SPRG_PGDIR +#ifdef CONFIG_SWAP + li r1, _PAGE_RW | _PAGE_PRESENT | _PAGE_ACCESSED +#else + li r1, _PAGE_RW | _PAGE_PRESENT +#endif bge- 112f - mfspr r2,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ - rlwimi r1,r2,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ - lis r2,swapper_pg_dir@ha /* if kernel address, use */ - addi r2,r2,swapper_pg_dir@l /* kernel page table */ -112: tophys(r2,r2) - rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ + lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ +112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ rlwinm. r2,r2,0,0,19 /* extract address of pte page */ beq- DataAddressInvalid /* return if no mapping */ @@ -677,12 +665,10 @@ DataStoreTLBMiss: lwz r0,0(r2) /* get linux-style pte */ andc. r1,r1,r0 /* check access & ~permission */ bne- DataAddressInvalid /* return if access not permitted */ - ori r0,r0,_PAGE_ACCESSED|_PAGE_DIRTY /* * NOTE! We are assuming this is not an SMP system, otherwise * we would need to update the pte atomically with lwarx/stwcx. */ - stw r0,0(r2) /* update PTE (accessed/dirty bits) */ /* Convert linux-style PTE to low word of PPC-style PTE */ rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ li r1,0xe05 /* clear out reserved bits & PP lsb */ @@ -845,12 +831,12 @@ __secondary_start: bl init_idle_6xx #endif /* CONFIG_PPC_BOOK3S_32 */ - /* get current_thread_info and current */ - lis r1,secondary_ti@ha - tophys(r1,r1) - lwz r1,secondary_ti@l(r1) - tophys(r2,r1) - lwz r2,TI_TASK(r2) + /* get current's stack and current */ + lis r2,secondary_current@ha + tophys(r2,r2) + lwz r2,secondary_current@l(r2) + tophys(r1,r2) + lwz r1,TASK_STACK(r1) /* stack */ addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD @@ -865,8 +851,10 @@ __secondary_start: tophys(r4,r2) addi r4,r4,THREAD /* phys address of our thread_struct */ mtspr SPRN_SPRG_THREAD,r4 +#ifdef CONFIG_PPC_RTAS li r3,0 - mtspr SPRN_SPRG_RTAS,r3 /* 0 => not in RTAS */ + stw r3, RTAS_SP(r4) /* 0 => not in RTAS */ +#endif /* enable MMU and jump to start_secondary */ li r4,MSR_KERNEL @@ -950,8 +938,10 @@ start_here: tophys(r4,r2) addi r4,r4,THREAD /* init task's THREAD */ mtspr SPRN_SPRG_THREAD,r4 +#ifdef CONFIG_PPC_RTAS li r3,0 - mtspr SPRN_SPRG_RTAS,r3 /* 0 => not in RTAS */ + stw r3, RTAS_SP(r4) /* 0 => not in RTAS */ +#endif /* stack */ lis r1,init_thread_union@ha @@ -1022,15 +1012,16 @@ _ENTRY(switch_mmu_context) li r0,NUM_USER_SEGMENTS mtctr r0 + lwz r4, MM_PGD(r4) #ifdef CONFIG_BDI_SWITCH /* Context switch the PTE pointer for the Abatron BDI2000. * The PGDIR is passed as second argument. */ - lwz r4,MM_PGD(r4) - lis r5, KERNELBASE@h - lwz r5, 0xf0(r5) - stw r4, 0x4(r5) + lis r5, abatron_pteptrs@ha + stw r4, abatron_pteptrs@l + 0x4(r5) #endif + tophys(r4, r4) + mtspr SPRN_SPRG_PGDIR, r4 li r4,0 isync 3: @@ -1105,6 +1096,41 @@ BEGIN_MMU_FTR_SECTION END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) blr +_ENTRY(update_bats) + lis r4, 1f@h + ori r4, r4, 1f@l + tophys(r4, r4) + mfmsr r6 + mflr r7 + li r3, MSR_KERNEL & ~(MSR_IR | MSR_DR) + rlwinm r0, r6, 0, ~MSR_RI + rlwinm r0, r0, 0, ~MSR_EE + mtmsr r0 + mtspr SPRN_SRR0, r4 + mtspr SPRN_SRR1, r3 + SYNC + RFI +1: bl clear_bats + lis r3, BATS@ha + addi r3, r3, BATS@l + tophys(r3, r3) + LOAD_BAT(0, r3, r4, r5) + LOAD_BAT(1, r3, r4, r5) + LOAD_BAT(2, r3, r4, r5) + LOAD_BAT(3, r3, r4, r5) +BEGIN_MMU_FTR_SECTION + LOAD_BAT(4, r3, r4, r5) + LOAD_BAT(5, r3, r4, r5) + LOAD_BAT(6, r3, r4, r5) + LOAD_BAT(7, r3, r4, r5) +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) + li r3, MSR_KERNEL & ~(MSR_IR | MSR_DR | MSR_RI) + mtmsr r3 + mtspr SPRN_SRR0, r7 + mtspr SPRN_SRR1, r6 + SYNC + RFI + flush_tlbs: lis r10, 0x40 1: addic. r10, r10, -0x1000 diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S @@ -115,7 +115,7 @@ _ENTRY(saved_ksp_limit) andi. r11,r11,MSR_PR; \ beq 1f; \ mfspr r1,SPRN_SPRG_THREAD; /* if from user, start at top of */\ - lwz r1,THREAD_INFO-THREAD(r1); /* this thread's kernel stack */\ + lwz r1,TASK_STACK-THREAD(r1); /* this thread's kernel stack */\ addi r1,r1,THREAD_SIZE; \ 1: subi r1,r1,INT_FRAME_SIZE; /* Allocate an exception frame */\ tophys(r11,r1); \ @@ -158,7 +158,7 @@ _ENTRY(saved_ksp_limit) beq 1f; \ /* COMING FROM USER MODE */ \ mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ - lwz r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\ + lwz r11,TASK_STACK-THREAD(r11); /* this thread's kernel stack */\ 1: addi r11,r11,THREAD_SIZE-INT_FRAME_SIZE; /* Alloc an excpt frm */\ tophys(r11,r11); \ stw r10,_CCR(r11); /* save various registers */\ @@ -953,9 +953,8 @@ _GLOBAL(set_context) /* Context switch the PTE pointer for the Abatron BDI2000. * The PGDIR is the second parameter. */ - lis r5, KERNELBASE@h - lwz r5, 0xf0(r5) - stw r4, 0x4(r5) + lis r5, abatron_pteptrs@ha + stw r4, abatron_pteptrs@l + 0x4(r5) #endif sync mtspr SPRN_PID,r3 diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S @@ -1019,10 +1019,10 @@ _GLOBAL(start_secondary_47x) /* Now we can get our task struct and real stack pointer */ - /* Get current_thread_info and current */ - lis r1,secondary_ti@ha - lwz r1,secondary_ti@l(r1) - lwz r2,TI_TASK(r1) + /* Get current's stack and current */ + lis r2,secondary_current@ha + lwz r2,secondary_current@l(r2) + lwz r1,TASK_STACK(r2) /* Current stack pointer */ addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S @@ -801,21 +801,19 @@ __secondary_start: /* Set thread priority to MEDIUM */ HMT_MEDIUM - /* Initialize the kernel stack */ - LOAD_REG_ADDR(r3, current_set) - sldi r28,r24,3 /* get current_set[cpu#] */ - ldx r14,r3,r28 - addi r14,r14,THREAD_SIZE-STACK_FRAME_OVERHEAD - std r14,PACAKSAVE(r13) - - /* Do early setup for that CPU (SLB and hash table pointer) */ + /* + * Do early setup for this CPU, in particular initialising the MMU so we + * can turn it on below. This is a call to C, which is OK, we're still + * running on the emergency stack. + */ bl early_setup_secondary /* - * setup the new stack pointer, but *don't* use this until - * translation is on. + * The primary has initialized our kernel stack for us in the paca, grab + * it and put it in r1. We must *not* use it until we turn on the MMU + * below, because it may not be inside the RMO. */ - mr r1, r14 + ld r1, PACAKSAVE(r13) /* Clear backchain so we get nice backtraces */ li r7,0 diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S @@ -142,7 +142,7 @@ instruction_counter: tophys(r11,r1); /* use tophys(r1) if kernel */ \ beq 1f; \ mfspr r11,SPRN_SPRG_THREAD; \ - lwz r11,THREAD_INFO-THREAD(r11); \ + lwz r11,TASK_STACK-THREAD(r11); \ addi r11,r11,THREAD_SIZE; \ tophys(r11,r11); \ 1: subi r11,r11,INT_FRAME_SIZE /* alloc exc. frame */ @@ -292,6 +292,17 @@ SystemCall: */ EXCEPTION(0x1000, SoftEmu, program_check_exception, EXC_XFER_STD) +/* Called from DataStoreTLBMiss when perf TLB misses events are activated */ +#ifdef CONFIG_PERF_EVENTS + patch_site 0f, patch__dtlbmiss_perf +0: lwz r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) + addi r10, r10, 1 + stw r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) + mfspr r10, SPRN_SPRG_SCRATCH0 + mfspr r11, SPRN_SPRG_SCRATCH1 + rfi +#endif + . = 0x1100 /* * For the MPC8xx, this is a software tablewalk to load the instruction @@ -337,8 +348,8 @@ InstructionTLBMiss: rlwinm r10, r10, 16, 0xfff8 cmpli cr0, r10, PAGE_OFFSET@h #ifndef CONFIG_PIN_TLB_TEXT - /* It is assumed that kernel code fits into the first 8M page */ -0: cmpli cr7, r10, (PAGE_OFFSET + 0x0800000)@h + /* It is assumed that kernel code fits into the first 32M */ +0: cmpli cr7, r10, (PAGE_OFFSET + 0x2000000)@h patch_site 0b, patch__itlbmiss_linmem_top #endif #endif @@ -405,10 +416,20 @@ InstructionTLBMiss: #ifndef CONFIG_PIN_TLB_TEXT ITLBMissLinear: mtcr r11 +#if defined(CONFIG_STRICT_KERNEL_RWX) && CONFIG_ETEXT_SHIFT < 23 + patch_site 0f, patch__itlbmiss_linmem_top8 + + mfspr r10, SPRN_SRR0 +0: subis r11, r10, (PAGE_OFFSET - 0x80000000)@ha + rlwinm r11, r11, 4, MI_PS8MEG ^ MI_PS512K + ori r11, r11, MI_PS512K | MI_SVALID + rlwinm r10, r10, 0, 0x0ff80000 /* 8xx supports max 256Mb RAM */ +#else /* Set 8M byte page and mark it valid */ li r11, MI_PS8MEG | MI_SVALID - mtspr SPRN_MI_TWC, r11 rlwinm r10, r10, 20, 0x0f800000 /* 8xx supports max 256Mb RAM */ +#endif + mtspr SPRN_MI_TWC, r11 ori r10, r10, 0xf0 | MI_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ _PAGE_PRESENT mtspr SPRN_MI_RPN, r10 /* Update TLB entry */ @@ -434,7 +455,7 @@ DataStoreTLBMiss: #ifndef CONFIG_PIN_TLB_IMMR cmpli cr6, r10, VIRT_IMMR_BASE@h #endif -0: cmpli cr7, r10, (PAGE_OFFSET + 0x1800000)@h +0: cmpli cr7, r10, (PAGE_OFFSET + 0x2000000)@h patch_site 0b, patch__dtlbmiss_linmem_top mfspr r10, SPRN_M_TWB /* Get level 1 table */ @@ -494,16 +515,6 @@ DataStoreTLBMiss: rfi patch_site 0b, patch__dtlbmiss_exit_1 -#ifdef CONFIG_PERF_EVENTS - patch_site 0f, patch__dtlbmiss_perf -0: lwz r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) - addi r10, r10, 1 - stw r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) - mfspr r10, SPRN_SPRG_SCRATCH0 - mfspr r11, SPRN_SPRG_SCRATCH1 - rfi -#endif - DTLBMissIMMR: mtcr r11 /* Set 512k byte guarded page and mark it valid */ @@ -525,10 +536,29 @@ DTLBMissIMMR: DTLBMissLinear: mtcr r11 + rlwinm r10, r10, 20, 0x0f800000 /* 8xx supports max 256Mb RAM */ +#if defined(CONFIG_STRICT_KERNEL_RWX) && CONFIG_DATA_SHIFT < 23 + patch_site 0f, patch__dtlbmiss_romem_top8 + +0: subis r11, r10, (PAGE_OFFSET - 0x80000000)@ha + rlwinm r11, r11, 0, 0xff800000 + neg r10, r11 + or r11, r11, r10 + rlwinm r11, r11, 4, MI_PS8MEG ^ MI_PS512K + ori r11, r11, MI_PS512K | MI_SVALID + mfspr r10, SPRN_MD_EPN + rlwinm r10, r10, 0, 0x0ff80000 /* 8xx supports max 256Mb RAM */ +#else /* Set 8M byte page and mark it valid */ li r11, MD_PS8MEG | MD_SVALID +#endif mtspr SPRN_MD_TWC, r11 - rlwinm r10, r10, 20, 0x0f800000 /* 8xx supports max 256Mb RAM */ +#ifdef CONFIG_STRICT_KERNEL_RWX + patch_site 0f, patch__dtlbmiss_romem_top + +0: subis r11, r10, 0 + rlwimi r10, r11, 11, _PAGE_RO +#endif ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ _PAGE_PRESENT mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ @@ -551,11 +581,11 @@ InstructionTLBError: mr r4,r12 andis. r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */ andis. r10,r9,SRR1_ISI_NOPT@h - beq+ 1f + beq+ .Litlbie tlbie r4 -itlbie: /* 0x400 is InstructionAccess exception, needed by bad_page_fault() */ -1: EXC_XFER_LITE(0x400, handle_page_fault) +.Litlbie: + EXC_XFER_LITE(0x400, handle_page_fault) /* This is the data TLB error on the MPC8xx. This could be due to * many reasons, including a dirty update to a pte. We bail out to @@ -577,10 +607,10 @@ DARFixed:/* Return from dcbx instruction bug workaround */ stw r5,_DSISR(r11) mfspr r4,SPRN_DAR andis. r10,r5,DSISR_NOHPTE@h - beq+ 1f + beq+ .Ldtlbie tlbie r4 -dtlbie: -1: li r10,RPN_PATTERN +.Ldtlbie: + li r10,RPN_PATTERN mtspr SPRN_DAR,r10 /* Tag DAR, to be used in DTLB Error */ /* 0x300 is DataAccess exception, needed by bad_page_fault() */ EXC_XFER_LITE(0x300, handle_page_fault) @@ -603,8 +633,8 @@ DataBreakpoint: mtspr SPRN_SPRG_SCRATCH1, r11 mfcr r10 mfspr r11, SPRN_SRR0 - cmplwi cr0, r11, (dtlbie - PAGE_OFFSET)@l - cmplwi cr7, r11, (itlbie - PAGE_OFFSET)@l + cmplwi cr0, r11, (.Ldtlbie - PAGE_OFFSET)@l + cmplwi cr7, r11, (.Litlbie - PAGE_OFFSET)@l beq- cr0, 11f beq- cr7, 11f EXCEPTION_PROLOG_1 @@ -886,28 +916,11 @@ initial_mmu: mtspr SPRN_MD_CTR, r10 /* remove PINNED DTLB entries */ tlbia /* Invalidate all TLB entries */ -#ifdef CONFIG_PIN_TLB_TEXT - lis r8, MI_RSV4I@h - ori r8, r8, 0x1c00 - - mtspr SPRN_MI_CTR, r8 /* Set instruction MMU control */ -#endif - #ifdef CONFIG_PIN_TLB_DATA oris r10, r10, MD_RSV4I@h mtspr SPRN_MD_CTR, r10 /* Set data TLB control */ #endif - /* Now map the lower 8 Meg into the ITLB. */ - lis r8, KERNELBASE@h /* Create vaddr for TLB */ - ori r8, r8, MI_EVALID /* Mark it valid */ - mtspr SPRN_MI_EPN, r8 - li r8, MI_PS8MEG /* Set 8M byte page */ - ori r8, r8, MI_SVALID /* Make it valid */ - mtspr SPRN_MI_TWC, r8 - li r8, MI_BOOTINIT /* Create RPN for address 0 */ - mtspr SPRN_MI_RPN, r8 /* Store TLB entry */ - lis r8, MI_APG_INIT@h /* Set protection modes */ ori r8, r8, MI_APG_INIT@l mtspr SPRN_MI_AP, r8 @@ -937,6 +950,34 @@ initial_mmu: mtspr SPRN_MD_RPN, r8 #endif + /* Now map the lower RAM (up to 32 Mbytes) into the ITLB. */ +#ifdef CONFIG_PIN_TLB_TEXT + lis r8, MI_RSV4I@h + ori r8, r8, 0x1c00 +#endif + li r9, 4 /* up to 4 pages of 8M */ + mtctr r9 + lis r9, KERNELBASE@h /* Create vaddr for TLB */ + li r10, MI_PS8MEG | MI_SVALID /* Set 8M byte page */ + li r11, MI_BOOTINIT /* Create RPN for address 0 */ + lis r12, _einittext@h + ori r12, r12, _einittext@l +1: +#ifdef CONFIG_PIN_TLB_TEXT + mtspr SPRN_MI_CTR, r8 /* Set instruction MMU control */ + addi r8, r8, 0x100 +#endif + + ori r0, r9, MI_EVALID /* Mark it valid */ + mtspr SPRN_MI_EPN, r0 + mtspr SPRN_MI_TWC, r10 + mtspr SPRN_MI_RPN, r11 /* Store TLB entry */ + addis r9, r9, 0x80 + addis r11, r11, 0x80 + + cmpl cr0, r9, r12 + bdnzf gt, 1b + /* Since the cache is enabled according to the information we * just loaded into the TLB, invalidate and enable the caches here. * We should probably check/set other modes....later. @@ -989,5 +1030,6 @@ swapper_pg_dir: /* Room for two PTE table poiners, usually the kernel and current user * pointer to their respective root page table (pgdir). */ + .globl abatron_pteptrs abatron_pteptrs: .space 8 diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h @@ -55,7 +55,7 @@ END_BTB_FLUSH_SECTION beq 1f; \ BOOKE_CLEAR_BTB(r11) \ /* if from user, start at top of this thread's kernel stack */ \ - lwz r11, THREAD_INFO-THREAD(r10); \ + lwz r11, TASK_STACK - THREAD(r10); \ ALLOC_STACK_FRAME(r11, THREAD_SIZE); \ 1 : subi r11, r11, INT_FRAME_SIZE; /* Allocate exception frame */ \ stw r13, _CCR(r11); /* save various registers */ \ @@ -142,7 +142,7 @@ END_BTB_FLUSH_SECTION BOOKE_CLEAR_BTB(r10) \ andi. r11,r11,MSR_PR; \ mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ - lwz r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\ + lwz r11, TASK_STACK - THREAD(r11); /* this thread's kernel stack */\ addi r11,r11,EXC_LVL_FRAME_OVERHEAD; /* allocate stack frame */\ beq 1f; \ /* COMING FROM USER MODE */ \ @@ -155,13 +155,7 @@ END_BTB_FLUSH_SECTION stw r10,GPR11(r11); \ b 2f; \ /* COMING FROM PRIV MODE */ \ -1: lwz r9,TI_FLAGS-EXC_LVL_FRAME_OVERHEAD(r11); \ - lwz r10,TI_PREEMPT-EXC_LVL_FRAME_OVERHEAD(r11); \ - stw r9,TI_FLAGS-EXC_LVL_FRAME_OVERHEAD(r8); \ - stw r10,TI_PREEMPT-EXC_LVL_FRAME_OVERHEAD(r8); \ - lwz r9,TI_TASK-EXC_LVL_FRAME_OVERHEAD(r11); \ - stw r9,TI_TASK-EXC_LVL_FRAME_OVERHEAD(r8); \ - mr r11,r8; \ +1: mr r11, r8; \ 2: mfspr r8,SPRN_SPRG_RSCRATCH_##exc_level; \ stw r12,GPR12(r11); /* save various registers */\ mflr r10; \ diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S @@ -243,8 +243,9 @@ set_ivor: li r0,0 stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) - CURRENT_THREAD_INFO(r22, r1) - stw r24, TI_CPU(r22) +#ifdef CONFIG_SMP + stw r24, TASK_CPU(r2) +#endif bl early_init @@ -717,8 +718,7 @@ finish_tlb_load: /* Get the next_tlbcam_idx percpu var */ #ifdef CONFIG_SMP - lwz r12, THREAD_INFO-THREAD(r12) - lwz r15, TI_CPU(r12) + lwz r15, TASK_CPU-THREAD(r12) lis r14, __per_cpu_offset@h ori r14, r14, __per_cpu_offset@l rlwinm r15, r15, 2, 0, 29 @@ -1089,10 +1089,10 @@ __secondary_start: mr r4,r24 /* Why? */ bl call_setup_cpu - /* get current_thread_info and current */ - lis r1,secondary_ti@ha - lwz r1,secondary_ti@l(r1) - lwz r2,TI_TASK(r1) + /* get current's stack and current */ + lis r2,secondary_current@ha + lwz r2,secondary_current@l(r2) + lwz r1,TASK_STACK(r2) /* stack */ addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD diff --git a/arch/powerpc/kernel/idle_6xx.S b/arch/powerpc/kernel/idle_6xx.S @@ -136,10 +136,9 @@ BEGIN_FTR_SECTION DSSALL sync END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) - CURRENT_THREAD_INFO(r9, r1) - lwz r8,TI_LOCAL_FLAGS(r9) /* set napping bit */ + lwz r8,TI_LOCAL_FLAGS(r2) /* set napping bit */ ori r8,r8,_TLF_NAPPING /* so when we take an exception */ - stw r8,TI_LOCAL_FLAGS(r9) /* it will return to our caller */ + stw r8,TI_LOCAL_FLAGS(r2) /* it will return to our caller */ mfmsr r7 ori r7,r7,MSR_EE oris r7,r7,MSR_POW@h @@ -159,8 +158,7 @@ _GLOBAL(power_save_ppc32_restore) stw r9,_NIP(r11) /* make it do a blr */ #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r12, r11) - lwz r11,TI_CPU(r12) /* get cpu number * 4 */ + lwz r11,TASK_CPU(r2) /* get cpu number * 4 */ slwi r11,r11,2 #else li r11,0 diff --git a/arch/powerpc/kernel/idle_book3e.S b/arch/powerpc/kernel/idle_book3e.S @@ -63,7 +63,7 @@ _GLOBAL(\name) 1: /* Let's set the _TLF_NAPPING flag so interrupts make us return * to the right spot */ - CURRENT_THREAD_INFO(r11, r1) + ld r11, PACACURRENT(r13) ld r10,TI_LOCAL_FLAGS(r11) ori r10,r10,_TLF_NAPPING std r10,TI_LOCAL_FLAGS(r11) diff --git a/arch/powerpc/kernel/idle_e500.S b/arch/powerpc/kernel/idle_e500.S @@ -22,10 +22,9 @@ .text _GLOBAL(e500_idle) - CURRENT_THREAD_INFO(r3, r1) - lwz r4,TI_LOCAL_FLAGS(r3) /* set napping bit */ + lwz r4,TI_LOCAL_FLAGS(r2) /* set napping bit */ ori r4,r4,_TLF_NAPPING /* so when we take an exception */ - stw r4,TI_LOCAL_FLAGS(r3) /* it will return to our caller */ + stw r4,TI_LOCAL_FLAGS(r2) /* it will return to our caller */ #ifdef CONFIG_PPC_E500MC wrteei 1 @@ -88,8 +87,7 @@ _GLOBAL(power_save_ppc32_restore) stw r9,_NIP(r11) /* make it do a blr */ #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r12, r1) - lwz r11,TI_CPU(r12) /* get cpu number * 4 */ + lwz r11,TASK_CPU(r2) /* get cpu number * 4 */ slwi r11,r11,2 #else li r11,0 diff --git a/arch/powerpc/kernel/idle_power4.S b/arch/powerpc/kernel/idle_power4.S @@ -68,7 +68,7 @@ BEGIN_FTR_SECTION DSSALL sync END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) - CURRENT_THREAD_INFO(r9, r1) + ld r9, PACA_THREAD_INFO(r13) ld r8,TI_LOCAL_FLAGS(r9) /* set napping bit */ ori r8,r8,_TLF_NAPPING /* so when we take an exception */ std r8,TI_LOCAL_FLAGS(r9) /* it will return to our caller */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c @@ -618,9 +618,8 @@ static inline void check_stack_overflow(void) sp = current_stack_pointer() & (THREAD_SIZE-1); /* check for stack overflow: is there less than 2KB free? */ - if (unlikely(sp < (sizeof(struct thread_info) + 2048))) { - pr_err("do_IRQ: stack overflow: %ld\n", - sp - sizeof(struct thread_info)); + if (unlikely(sp < 2048)) { + pr_err("do_IRQ: stack overflow: %ld\n", sp); dump_stack(); } #endif @@ -660,36 +659,21 @@ void __do_irq(struct pt_regs *regs) void do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - struct thread_info *curtp, *irqtp, *sirqtp; + void *cursp, *irqsp, *sirqsp; /* Switch to the irq stack to handle this */ - curtp = current_thread_info(); - irqtp = hardirq_ctx[raw_smp_processor_id()]; - sirqtp = softirq_ctx[raw_smp_processor_id()]; + cursp = (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); + irqsp = hardirq_ctx[raw_smp_processor_id()]; + sirqsp = softirq_ctx[raw_smp_processor_id()]; /* Already there ? */ - if (unlikely(curtp == irqtp || curtp == sirqtp)) { + if (unlikely(cursp == irqsp || cursp == sirqsp)) { __do_irq(regs); set_irq_regs(old_regs); return; } - - /* Prepare the thread_info in the irq stack */ - irqtp->task = curtp->task; - irqtp->flags = 0; - - /* Copy the preempt_count so that the [soft]irq checks work. */ - irqtp->preempt_count = curtp->preempt_count; - /* Switch stack and call */ - call_do_irq(regs, irqtp); - - /* Restore stack limit */ - irqtp->task = NULL; - - /* Copy back updates to the thread_info */ - if (irqtp->flags) - set_bits(irqtp->flags, &curtp->flags); + call_do_irq(regs, irqsp); set_irq_regs(old_regs); } @@ -698,90 +682,20 @@ void __init init_IRQ(void) { if (ppc_md.init_IRQ) ppc_md.init_IRQ(); - - exc_lvl_ctx_init(); - - irq_ctx_init(); } #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) -struct thread_info *critirq_ctx[NR_CPUS] __read_mostly; -struct thread_info *dbgirq_ctx[NR_CPUS] __read_mostly; -struct thread_info *mcheckirq_ctx[NR_CPUS] __read_mostly; - -void exc_lvl_ctx_init(void) -{ - struct thread_info *tp; - int i, cpu_nr; - - for_each_possible_cpu(i) { -#ifdef CONFIG_PPC64 - cpu_nr = i; -#else -#ifdef CONFIG_SMP - cpu_nr = get_hard_smp_processor_id(i); -#else - cpu_nr = 0; -#endif +void *critirq_ctx[NR_CPUS] __read_mostly; +void *dbgirq_ctx[NR_CPUS] __read_mostly; +void *mcheckirq_ctx[NR_CPUS] __read_mostly; #endif - memset((void *)critirq_ctx[cpu_nr], 0, THREAD_SIZE); - tp = critirq_ctx[cpu_nr]; - tp->cpu = cpu_nr; - tp->preempt_count = 0; - -#ifdef CONFIG_BOOKE - memset((void *)dbgirq_ctx[cpu_nr], 0, THREAD_SIZE); - tp = dbgirq_ctx[cpu_nr]; - tp->cpu = cpu_nr; - tp->preempt_count = 0; - - memset((void *)mcheckirq_ctx[cpu_nr], 0, THREAD_SIZE); - tp = mcheckirq_ctx[cpu_nr]; - tp->cpu = cpu_nr; - tp->preempt_count = HARDIRQ_OFFSET; -#endif - } -} -#endif - -struct thread_info *softirq_ctx[NR_CPUS] __read_mostly; -struct thread_info *hardirq_ctx[NR_CPUS] __read_mostly; - -void irq_ctx_init(void) -{ - struct thread_info *tp; - int i; - - for_each_possible_cpu(i) { - memset((void *)softirq_ctx[i], 0, THREAD_SIZE); - tp = softirq_ctx[i]; - tp->cpu = i; - klp_init_thread_info(tp); - - memset((void *)hardirq_ctx[i], 0, THREAD_SIZE); - tp = hardirq_ctx[i]; - tp->cpu = i; - klp_init_thread_info(tp); - } -} +void *softirq_ctx[NR_CPUS] __read_mostly; +void *hardirq_ctx[NR_CPUS] __read_mostly; void do_softirq_own_stack(void) { - struct thread_info *curtp, *irqtp; - - curtp = current_thread_info(); - irqtp = softirq_ctx[smp_processor_id()]; - irqtp->task = curtp->task; - irqtp->flags = 0; - call_do_softirq(irqtp); - irqtp->task = NULL; - - /* Set any flag that may have been set on the - * alternate stack - */ - if (irqtp->flags) - set_bits(irqtp->flags, &curtp->flags); + call_do_softirq(softirq_ctx[smp_processor_id()]); } irq_hw_number_t virq_to_hw(unsigned int virq) @@ -827,11 +741,6 @@ int irq_choose_cpu(const struct cpumask *mask) } #endif -int arch_early_irq_init(void) -{ - return 0; -} - #ifdef CONFIG_PPC64 static int __init setup_noirqdistrib(char *str) { diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c @@ -151,41 +151,13 @@ static int kgdb_handle_breakpoint(struct pt_regs *regs) return 1; } -static DEFINE_PER_CPU(struct thread_info, kgdb_thread_info); static int kgdb_singlestep(struct pt_regs *regs) { - struct thread_info *thread_info, *exception_thread_info; - struct thread_info *backup_current_thread_info = - this_cpu_ptr(&kgdb_thread_info); - if (user_mode(regs)) return 0; - /* - * On Book E and perhaps other processors, singlestep is handled on - * the critical exception stack. This causes current_thread_info() - * to fail, since it it locates the thread_info by masking off - * the low bits of the current stack pointer. We work around - * this issue by copying the thread_info from the kernel stack - * before calling kgdb_handle_exception, and copying it back - * afterwards. On most processors the copy is avoided since - * exception_thread_info == thread_info. - */ - thread_info = (struct thread_info *)(regs->gpr[1] & ~(THREAD_SIZE-1)); - exception_thread_info = current_thread_info(); - - if (thread_info != exception_thread_info) { - /* Save the original current_thread_info. */ - memcpy(backup_current_thread_info, exception_thread_info, sizeof *thread_info); - memcpy(exception_thread_info, thread_info, sizeof *thread_info); - } - kgdb_handle_exception(0, SIGTRAP, 0, regs); - if (thread_info != exception_thread_info) - /* Restore current_thread_info lastly. */ - memcpy(exception_thread_info, backup_current_thread_info, sizeof *thread_info); - return 1; } diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c @@ -317,10 +317,8 @@ void default_machine_kexec(struct kimage *image) * We setup preempt_count to avoid using VMX in memcpy. * XXX: the task struct will likely be invalid once we do the copy! */ - kexec_stack.thread_info.task = current_thread_info()->task; - kexec_stack.thread_info.flags = 0; - kexec_stack.thread_info.preempt_count = HARDIRQ_OFFSET; - kexec_stack.thread_info.cpu = current_thread_info()->cpu; + current_thread_info()->flags = 0; + current_thread_info()->preempt_count = HARDIRQ_OFFSET; /* We need a static PACA, too; copy this CPU's PACA over and switch to * it. Also poison per_cpu_offset and NULL lppaca to catch anyone using diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c @@ -31,6 +31,7 @@ #include <asm/machdep.h> #include <asm/mce.h> +#include <asm/nmi.h> static DEFINE_PER_CPU(int, mce_nest_count); static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); @@ -301,13 +302,13 @@ static void machine_check_process_queued_event(struct irq_work *work) while (__this_cpu_read(mce_queue_count) > 0) { index = __this_cpu_read(mce_queue_count) - 1; evt = this_cpu_ptr(&mce_event_queue[index]); - machine_check_print_event_info(evt, false); + machine_check_print_event_info(evt, false, false); __this_cpu_dec(mce_queue_count); } } void machine_check_print_event_info(struct machine_check_event *evt, - bool user_mode) + bool user_mode, bool in_guest) { const char *level, *sevstr, *subtype; static const char *mc_ue_types[] = { @@ -387,7 +388,9 @@ void machine_check_print_event_info(struct machine_check_event *evt, evt->disposition == MCE_DISPOSITION_RECOVERED ? "Recovered" : "Not recovered"); - if (user_mode) { + if (in_guest) { + printk("%s Guest NIP: %016llx\n", level, evt->srr0); + } else if (user_mode) { printk("%s NIP: [%016llx] PID: %d Comm: %s\n", level, evt->srr0, current->pid, current->comm); } else { @@ -488,6 +491,8 @@ long machine_check_early(struct pt_regs *regs) { long handled = 0; + hv_nmi_check_nonrecoverable(regs); + /* * See if platform is capable of handling machine check. */ diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S @@ -46,11 +46,10 @@ _GLOBAL(call_do_softirq) mflr r0 stw r0,4(r1) lwz r10,THREAD+KSP_LIMIT(r2) - addi r11,r3,THREAD_INFO_GAP + stw r3, THREAD+KSP_LIMIT(r2) stwu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3) mr r1,r3 stw r10,8(r1) - stw r11,THREAD+KSP_LIMIT(r2) bl __do_softirq lwz r10,8(r1) lwz r1,0(r1) @@ -60,17 +59,16 @@ _GLOBAL(call_do_softirq) blr /* - * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp); + * void call_do_irq(struct pt_regs *regs, void *sp); */ _GLOBAL(call_do_irq) mflr r0 stw r0,4(r1) lwz r10,THREAD+KSP_LIMIT(r2) - addi r11,r4,THREAD_INFO_GAP + stw r4, THREAD+KSP_LIMIT(r2) stwu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4) mr r1,r4 stw r10,8(r1) - stw r11,THREAD+KSP_LIMIT(r2) bl __do_irq lwz r10,8(r1) lwz r1,0(r1) @@ -183,10 +181,13 @@ _GLOBAL(low_choose_750fx_pll) or r4,r4,r5 mtspr SPRN_HID1,r4 +#ifdef CONFIG_SMP /* Store new HID1 image */ - CURRENT_THREAD_INFO(r6, r1) - lwz r6,TI_CPU(r6) + lwz r6,TASK_CPU(r2) slwi r6,r6,2 +#else + li r6, 0 +#endif addis r6,r6,nap_save_hid1@ha stw r4,nap_save_hid1@l(r6) @@ -599,7 +600,7 @@ EXPORT_SYMBOL(__bswapdi2) #ifdef CONFIG_SMP _GLOBAL(start_secondary_resume) /* Reset stack */ - CURRENT_THREAD_INFO(r1, r1) + rlwinm r1, r1, 0, 0, 31 - THREAD_SHIFT addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD li r3,0 stw r3,0(r1) /* Zero the stack frame pointer */ diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c @@ -63,19 +63,13 @@ resource_size_t isa_mem_base; EXPORT_SYMBOL(isa_mem_base); -static const struct dma_map_ops *pci_dma_ops = &dma_nommu_ops; +static const struct dma_map_ops *pci_dma_ops; void set_pci_dma_ops(const struct dma_map_ops *dma_ops) { pci_dma_ops = dma_ops; } -const struct dma_map_ops *get_pci_dma_ops(void) -{ - return pci_dma_ops; -} -EXPORT_SYMBOL(get_pci_dma_ops); - /* * This function should run under locking protection, specifically * hose_spinlock. @@ -358,6 +352,17 @@ struct pci_controller* pci_find_hose_for_OF_device(struct device_node* node) return NULL; } +struct pci_controller *pci_find_controller_for_domain(int domain_nr) +{ + struct pci_controller *hose; + + list_for_each_entry(hose, &hose_list, list_node) + if (hose->global_number == domain_nr) + return hose; + + return NULL; +} + /* * Reads the interrupt pin to determine if interrupt is use by card. * If the interrupt is used, then gets the interrupt line from the @@ -973,7 +978,7 @@ static void pcibios_setup_device(struct pci_dev *dev) /* Hook up default DMA ops */ set_dma_ops(&dev->dev, pci_dma_ops); - set_dma_offset(&dev->dev, PCI_DRAM_OFFSET); + dev->dev.archdata.dma_offset = PCI_DRAM_OFFSET; /* Additional platform DMA/iommu setup */ phb = pci_bus_to_host(dev->bus); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c @@ -176,7 +176,7 @@ static void __giveup_fpu(struct task_struct *tsk) save_fpu(tsk); msr = tsk->thread.regs->msr; - msr &= ~MSR_FP; + msr &= ~(MSR_FP|MSR_FE0|MSR_FE1); #ifdef CONFIG_VSX if (cpu_has_feature(CPU_FTR_VSX)) msr &= ~MSR_VSX; @@ -1231,8 +1231,8 @@ struct task_struct *__switch_to(struct task_struct *prev, batch->active = 1; } - if (current_thread_info()->task->thread.regs) { - restore_math(current_thread_info()->task->thread.regs); + if (current->thread.regs) { + restore_math(current->thread.regs); /* * The copy-paste buffer can only store into foreign real @@ -1242,7 +1242,7 @@ struct task_struct *__switch_to(struct task_struct *prev, * mappings, we must issue a cp_abort to clear any state and * prevent snooping, corruption or a covert channel. */ - if (current_thread_info()->task->thread.used_vas) + if (current->thread.used_vas) asm volatile(PPC_CP_ABORT); } #endif /* CONFIG_PPC_BOOK3S_64 */ @@ -1634,7 +1634,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE; struct thread_info *ti = task_thread_info(p); - klp_init_thread_info(ti); + klp_init_thread_info(p); /* Copy registers */ sp -= sizeof(struct pt_regs); @@ -1691,8 +1691,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, sp -= STACK_FRAME_OVERHEAD; p->thread.ksp = sp; #ifdef CONFIG_PPC32 - p->thread.ksp_limit = (unsigned long)task_stack_page(p) + - _ALIGN_UP(sizeof(struct thread_info), 16); + p->thread.ksp_limit = (unsigned long)end_of_stack(p); #endif #ifdef CONFIG_HAVE_HW_BREAKPOINT p->thread.ptrace_bps[0] = NULL; @@ -1995,21 +1994,14 @@ static inline int valid_irq_stack(unsigned long sp, struct task_struct *p, unsigned long stack_page; unsigned long cpu = task_cpu(p); - /* - * Avoid crashing if the stack has overflowed and corrupted - * task_cpu(p), which is in the thread_info struct. - */ - if (cpu < NR_CPUS && cpu_possible(cpu)) { - stack_page = (unsigned long) hardirq_ctx[cpu]; - if (sp >= stack_page + sizeof(struct thread_struct) - && sp <= stack_page + THREAD_SIZE - nbytes) - return 1; - - stack_page = (unsigned long) softirq_ctx[cpu]; - if (sp >= stack_page + sizeof(struct thread_struct) - && sp <= stack_page + THREAD_SIZE - nbytes) - return 1; - } + stack_page = (unsigned long)hardirq_ctx[cpu]; + if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes) + return 1; + + stack_page = (unsigned long)softirq_ctx[cpu]; + if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes) + return 1; + return 0; } @@ -2018,8 +2010,10 @@ int validate_sp(unsigned long sp, struct task_struct *p, { unsigned long stack_page = (unsigned long)task_stack_page(p); - if (sp >= stack_page + sizeof(struct thread_struct) - && sp <= stack_page + THREAD_SIZE - nbytes) + if (sp < THREAD_SIZE) + return 0; + + if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes) return 1; return valid_irq_stack(sp, p, nbytes); @@ -2027,7 +2021,7 @@ int validate_sp(unsigned long sp, struct task_struct *p, EXPORT_SYMBOL(validate_sp); -unsigned long get_wchan(struct task_struct *p) +static unsigned long __get_wchan(struct task_struct *p) { unsigned long ip, sp; int count = 0; @@ -2053,6 +2047,20 @@ unsigned long get_wchan(struct task_struct *p) return 0; } +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long ret; + + if (!try_get_task_stack(p)) + return 0; + + ret = __get_wchan(p); + + put_task_stack(p); + + return ret; +} + static int kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH; void show_stack(struct task_struct *tsk, unsigned long *stack) @@ -2067,9 +2075,13 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) int curr_frame = 0; #endif - sp = (unsigned long) stack; if (tsk == NULL) tsk = current; + + if (!try_get_task_stack(tsk)) + return; + + sp = (unsigned long) stack; if (sp == 0) { if (tsk == current) sp = current_stack_pointer(); @@ -2081,7 +2093,7 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) printk("Call Trace:\n"); do { if (!validate_sp(sp, tsk, STACK_FRAME_OVERHEAD)) - return; + break; stack = (unsigned long *) sp; newsp = stack[0]; @@ -2121,6 +2133,8 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) sp = newsp; } while (count++ < kstack_depth_to_print); + + put_task_stack(tsk); } #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c @@ -33,6 +33,7 @@ #include <linux/hw_breakpoint.h> #include <linux/perf_event.h> #include <linux/context_tracking.h> +#include <linux/nospec.h> #include <linux/uaccess.h> #include <linux/pkeys.h> @@ -274,6 +275,8 @@ static int set_user_trap(struct task_struct *task, unsigned long trap) */ int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data) { + unsigned int regs_max; + if ((task->thread.regs == NULL) || !data) return -EIO; @@ -297,7 +300,9 @@ int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data) } #endif - if (regno < (sizeof(struct user_pt_regs) / sizeof(unsigned long))) { + regs_max = sizeof(struct user_pt_regs) / sizeof(unsigned long); + if (regno < regs_max) { + regno = array_index_nospec(regno, regs_max); *data = ((unsigned long *)task->thread.regs)[regno]; return 0; } @@ -321,6 +326,7 @@ int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data) return set_user_dscr(task, data); if (regno <= PT_MAX_PUT_REG) { + regno = array_index_nospec(regno, PT_MAX_PUT_REG + 1); ((unsigned long *)task->thread.regs)[regno] = data; return 0; } @@ -561,6 +567,7 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset, /* * Copy out only the low-order word of vrsave. */ + int start, end; union { elf_vrreg_t reg; u32 word; @@ -569,8 +576,10 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset, vrsave.word = target->thread.vrsave; + start = 33 * sizeof(vector128); + end = start + sizeof(vrsave); ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &vrsave, - 33 * sizeof(vector128), -1); + start, end); } return ret; @@ -608,6 +617,7 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset, /* * We use only the first word of vrsave. */ + int start, end; union { elf_vrreg_t reg; u32 word; @@ -616,8 +626,10 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset, vrsave.word = target->thread.vrsave; + start = 33 * sizeof(vector128); + end = start + sizeof(vrsave); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &vrsave, - 33 * sizeof(vector128), -1); + start, end); if (!ret) target->thread.vrsave = vrsave.word; } diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c @@ -634,7 +634,7 @@ void probe_machine(void) } /* What can we do if we didn't find ? */ if (machine_id >= &__machine_desc_end) { - DBG("No suitable machine found !\n"); + pr_err("No suitable machine description found !\n"); for (;;); } @@ -791,7 +791,6 @@ void arch_setup_pdev_archdata(struct platform_device *pdev) { pdev->archdata.dma_mask = DMA_BIT_MASK(32); pdev->dev.dma_mask = &pdev->archdata.dma_mask; - set_dma_ops(&pdev->dev, &dma_nommu_ops); } static __init void print_system_info(void) @@ -938,7 +937,7 @@ void __init setup_arch(char **cmdline_p) /* Reserve large chunks of memory for use by CMA for KVM. */ kvm_cma_reserve(); - klp_init_thread_info(&init_thread_info); + klp_init_thread_info(&init_task); init_mm.start_code = (unsigned long)_stext; init_mm.end_code = (unsigned long) _etext; diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c @@ -162,6 +162,17 @@ static int __init ppc_init(void) } arch_initcall(ppc_init); +static void *__init alloc_stack(void) +{ + void *ptr = memblock_alloc(THREAD_SIZE, THREAD_SIZE); + + if (!ptr) + panic("cannot allocate %d bytes for stack at %pS\n", + THREAD_SIZE, (void *)_RET_IP_); + + return ptr; +} + void __init irqstack_early_init(void) { unsigned int i; @@ -169,10 +180,8 @@ void __init irqstack_early_init(void) /* interrupt stacks must be in lowmem, we get that for free on ppc32 * as the memblock is limited to lowmem by default */ for_each_possible_cpu(i) { - softirq_ctx[i] = (struct thread_info *) - __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE)); - hardirq_ctx[i] = (struct thread_info *) - __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE)); + softirq_ctx[i] = alloc_stack(); + hardirq_ctx[i] = alloc_stack(); } } @@ -190,13 +199,10 @@ void __init exc_lvl_early_init(void) hw_cpu = 0; #endif - critirq_ctx[hw_cpu] = (struct thread_info *) - __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE)); + critirq_ctx[hw_cpu] = alloc_stack(); #ifdef CONFIG_BOOKE - dbgirq_ctx[hw_cpu] = (struct thread_info *) - __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE)); - mcheckirq_ctx[hw_cpu] = (struct thread_info *) - __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE)); + dbgirq_ctx[hw_cpu] = alloc_stack(); + mcheckirq_ctx[hw_cpu] = alloc_stack(); #endif } } diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c @@ -634,19 +634,17 @@ __init u64 ppc64_bolted_size(void) static void *__init alloc_stack(unsigned long limit, int cpu) { - unsigned long pa; + void *ptr; BUILD_BUG_ON(STACK_INT_FRAME_SIZE % 16); - pa = memblock_alloc_base_nid(THREAD_SIZE, THREAD_SIZE, limit, - early_cpu_to_node(cpu), MEMBLOCK_NONE); - if (!pa) { - pa = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit); - if (!pa) - panic("cannot allocate stacks"); - } + ptr = memblock_alloc_try_nid(THREAD_SIZE, THREAD_SIZE, + MEMBLOCK_LOW_LIMIT, limit, + early_cpu_to_node(cpu)); + if (!ptr) + panic("cannot allocate stacks"); - return __va(pa); + return ptr; } void __init irqstack_early_init(void) @@ -692,24 +690,6 @@ void __init exc_lvl_early_init(void) #endif /* - * Emergency stacks are used for a range of things, from asynchronous - * NMIs (system reset, machine check) to synchronous, process context. - * We set preempt_count to zero, even though that isn't necessarily correct. To - * get the right value we'd need to copy it from the previous thread_info, but - * doing that might fault causing more problems. - * TODO: what to do with accounting? - */ -static void emerg_stack_init_thread_info(struct thread_info *ti, int cpu) -{ - ti->task = NULL; - ti->cpu = cpu; - ti->preempt_count = 0; - ti->local_flags = 0; - ti->flags = 0; - klp_init_thread_info(ti); -} - -/* * Stack space used when we detect a bad kernel stack pointer, and * early in SMP boots before relocation is enabled. Exclusive emergency * stack for machine checks. @@ -736,25 +716,14 @@ void __init emergency_stack_init(void) limit = min(ppc64_bolted_size(), ppc64_rma_size); for_each_possible_cpu(i) { - struct thread_info *ti; - - ti = alloc_stack(limit, i); - memset(ti, 0, THREAD_SIZE); - emerg_stack_init_thread_info(ti, i); - paca_ptrs[i]->emergency_sp = (void *)ti + THREAD_SIZE; + paca_ptrs[i]->emergency_sp = alloc_stack(limit, i) + THREAD_SIZE; #ifdef CONFIG_PPC_BOOK3S_64 /* emergency stack for NMI exception handling. */ - ti = alloc_stack(limit, i); - memset(ti, 0, THREAD_SIZE); - emerg_stack_init_thread_info(ti, i); - paca_ptrs[i]->nmi_emergency_sp = (void *)ti + THREAD_SIZE; + paca_ptrs[i]->nmi_emergency_sp = alloc_stack(limit, i) + THREAD_SIZE; /* emergency stack for machine check exception handling. */ - ti = alloc_stack(limit, i); - memset(ti, 0, THREAD_SIZE); - emerg_stack_init_thread_info(ti, i); - paca_ptrs[i]->mc_emergency_sp = (void *)ti + THREAD_SIZE; + paca_ptrs[i]->mc_emergency_sp = alloc_stack(limit, i) + THREAD_SIZE; #endif } } diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c @@ -20,6 +20,7 @@ #include <linux/kernel.h> #include <linux/export.h> #include <linux/sched/mm.h> +#include <linux/sched/task_stack.h> #include <linux/sched/topology.h> #include <linux/smp.h> #include <linux/interrupt.h> @@ -75,7 +76,7 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 }; #endif -struct thread_info *secondary_ti; +struct task_struct *secondary_current; bool has_big_cores; DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); @@ -358,13 +359,12 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask) * NMI IPIs may not be recoverable, so should not be used as ongoing part of * a running system. They can be used for crash, debug, halt/reboot, etc. * - * NMI IPIs are globally single threaded. No more than one in progress at - * any time. - * * The IPI call waits with interrupts disabled until all targets enter the - * NMI handler, then the call returns. + * NMI handler, then returns. Subsequent IPIs can be issued before targets + * have returned from their handlers, so there is no guarantee about + * concurrency or re-entrancy. * - * No new NMI can be initiated until targets exit the handler. + * A new NMI can be issued before all targets exit the handler. * * The IPI call may time out without all targets entering the NMI handler. * In that case, there is some logic to recover (and ignore subsequent @@ -375,7 +375,7 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask) static atomic_t __nmi_ipi_lock = ATOMIC_INIT(0); static struct cpumask nmi_ipi_pending_mask; -static int nmi_ipi_busy_count = 0; +static bool nmi_ipi_busy = false; static void (*nmi_ipi_function)(struct pt_regs *) = NULL; static void nmi_ipi_lock_start(unsigned long *flags) @@ -414,7 +414,7 @@ static void nmi_ipi_unlock_end(unsigned long *flags) */ int smp_handle_nmi_ipi(struct pt_regs *regs) { - void (*fn)(struct pt_regs *); + void (*fn)(struct pt_regs *) = NULL; unsigned long flags; int me = raw_smp_processor_id(); int ret = 0; @@ -425,29 +425,17 @@ int smp_handle_nmi_ipi(struct pt_regs *regs) * because the caller may have timed out. */ nmi_ipi_lock_start(&flags); - if (!nmi_ipi_busy_count) - goto out; - if (!cpumask_test_cpu(me, &nmi_ipi_pending_mask)) - goto out; - - fn = nmi_ipi_function; - if (!fn) - goto out; - - cpumask_clear_cpu(me, &nmi_ipi_pending_mask); - nmi_ipi_busy_count++; - nmi_ipi_unlock(); - - ret = 1; - - fn(regs); - - nmi_ipi_lock(); - if (nmi_ipi_busy_count > 1) /* Can race with caller time-out */ - nmi_ipi_busy_count--; -out: + if (cpumask_test_cpu(me, &nmi_ipi_pending_mask)) { + cpumask_clear_cpu(me, &nmi_ipi_pending_mask); + fn = READ_ONCE(nmi_ipi_function); + WARN_ON_ONCE(!fn); + ret = 1; + } nmi_ipi_unlock_end(&flags); + if (fn) + fn(regs); + return ret; } @@ -473,9 +461,10 @@ static void do_smp_send_nmi_ipi(int cpu, bool safe) * - cpu is the target CPU (must not be this CPU), or NMI_IPI_ALL_OTHERS. * - fn is the target callback function. * - delay_us > 0 is the delay before giving up waiting for targets to - * complete executing the handler, == 0 specifies indefinite delay. + * begin executing the handler, == 0 specifies indefinite delay. */ -int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us, bool safe) +static int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), + u64 delay_us, bool safe) { unsigned long flags; int me = raw_smp_processor_id(); @@ -487,31 +476,33 @@ int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us, bool if (unlikely(!smp_ops)) return 0; - /* Take the nmi_ipi_busy count/lock with interrupts hard disabled */ nmi_ipi_lock_start(&flags); - while (nmi_ipi_busy_count) { + while (nmi_ipi_busy) { nmi_ipi_unlock_end(&flags); - spin_until_cond(nmi_ipi_busy_count == 0); + spin_until_cond(!nmi_ipi_busy); nmi_ipi_lock_start(&flags); } - + nmi_ipi_busy = true; nmi_ipi_function = fn; + WARN_ON_ONCE(!cpumask_empty(&nmi_ipi_pending_mask)); + if (cpu < 0) { /* ALL_OTHERS */ cpumask_copy(&nmi_ipi_pending_mask, cpu_online_mask); cpumask_clear_cpu(me, &nmi_ipi_pending_mask); } else { - /* cpumask starts clear */ cpumask_set_cpu(cpu, &nmi_ipi_pending_mask); } - nmi_ipi_busy_count++; + nmi_ipi_unlock(); + /* Interrupts remain hard disabled */ + do_smp_send_nmi_ipi(cpu, safe); nmi_ipi_lock(); - /* nmi_ipi_busy_count is held here, so unlock/lock is okay */ + /* nmi_ipi_busy is set here, so unlock/lock is okay */ while (!cpumask_empty(&nmi_ipi_pending_mask)) { nmi_ipi_unlock(); udelay(1); @@ -523,29 +514,15 @@ int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us, bool } } - while (nmi_ipi_busy_count > 1) { - nmi_ipi_unlock(); - udelay(1); - nmi_ipi_lock(); - if (delay_us) { - delay_us--; - if (!delay_us) - break; - } - } - if (!cpumask_empty(&nmi_ipi_pending_mask)) { /* Timeout waiting for CPUs to call smp_handle_nmi_ipi */ ret = 0; cpumask_clear(&nmi_ipi_pending_mask); } - if (nmi_ipi_busy_count > 1) { - /* Timeout waiting for CPUs to execute fn */ - ret = 0; - nmi_ipi_busy_count = 1; - } - nmi_ipi_busy_count--; + nmi_ipi_function = NULL; + nmi_ipi_busy = false; + nmi_ipi_unlock_end(&flags); return ret; @@ -613,17 +590,8 @@ void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)) static void nmi_stop_this_cpu(struct pt_regs *regs) { /* - * This is a special case because it never returns, so the NMI IPI - * handling would never mark it as done, which makes any later - * smp_send_nmi_ipi() call spin forever. Mark it done now. - * * IRQs are already hard disabled by the smp_handle_nmi_ipi. */ - nmi_ipi_lock(); - if (nmi_ipi_busy_count > 1) - nmi_ipi_busy_count--; - nmi_ipi_unlock(); - spin_begin(); while (1) spin_cpu_relax(); @@ -663,7 +631,7 @@ void smp_send_stop(void) } #endif /* CONFIG_NMI_IPI */ -struct thread_info *current_set[NR_CPUS]; +struct task_struct *current_set[NR_CPUS]; static void smp_store_cpu_info(int id) { @@ -928,7 +896,7 @@ void smp_prepare_boot_cpu(void) paca_ptrs[boot_cpuid]->__current = current; #endif set_numa_node(numa_cpu_lookup_table[boot_cpuid]); - current_set[boot_cpuid] = task_thread_info(current); + current_set[boot_cpuid] = current; } #ifdef CONFIG_HOTPLUG_CPU @@ -1013,14 +981,13 @@ static bool secondaries_inhibited(void) static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) { - struct thread_info *ti = task_thread_info(idle); - #ifdef CONFIG_PPC64 paca_ptrs[cpu]->__current = idle; - paca_ptrs[cpu]->kstack = (unsigned long)ti + THREAD_SIZE - STACK_FRAME_OVERHEAD; + paca_ptrs[cpu]->kstack = (unsigned long)task_stack_page(idle) + + THREAD_SIZE - STACK_FRAME_OVERHEAD; #endif - ti->cpu = cpu; - secondary_ti = current_set[cpu] = ti; + idle->cpu = cpu; + secondary_current = current_set[cpu] = idle; } int __cpu_up(unsigned int cpu, struct task_struct *tidle) diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c @@ -67,12 +67,17 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { unsigned long sp; + if (!try_get_task_stack(tsk)) + return; + if (tsk == current) sp = current_stack_pointer(); else sp = tsk->thread.ksp; save_context_stack(trace, sp, tsk, 0); + + put_task_stack(tsk); } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); @@ -84,25 +89,21 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) EXPORT_SYMBOL_GPL(save_stack_trace_regs); #ifdef CONFIG_HAVE_RELIABLE_STACKTRACE -int -save_stack_trace_tsk_reliable(struct task_struct *tsk, - struct stack_trace *trace) +/* + * This function returns an error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is reliable. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. + */ +static int __save_stack_trace_tsk_reliable(struct task_struct *tsk, + struct stack_trace *trace) { unsigned long sp; + unsigned long newsp; unsigned long stack_page = (unsigned long)task_stack_page(tsk); unsigned long stack_end; int graph_idx = 0; - - /* - * The last frame (unwinding first) may not yet have saved - * its LR onto the stack. - */ - int firstframe = 1; - - if (tsk == current) - sp = current_stack_pointer(); - else - sp = tsk->thread.ksp; + bool firstframe; stack_end = stack_page + THREAD_SIZE; if (!is_idle_task(tsk)) { @@ -129,40 +130,53 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, stack_end -= STACK_FRAME_OVERHEAD; } + if (tsk == current) + sp = current_stack_pointer(); + else + sp = tsk->thread.ksp; + if (sp < stack_page + sizeof(struct thread_struct) || sp > stack_end - STACK_FRAME_MIN_SIZE) { - return 1; + return -EINVAL; } - for (;;) { + for (firstframe = true; sp != stack_end; + firstframe = false, sp = newsp) { unsigned long *stack = (unsigned long *) sp; - unsigned long newsp, ip; + unsigned long ip; /* sanity check: ABI requires SP to be aligned 16 bytes. */ if (sp & 0xF) - return 1; - - /* Mark stacktraces with exception frames as unreliable. */ - if (sp <= stack_end - STACK_INT_FRAME_SIZE && - stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { - return 1; - } + return -EINVAL; newsp = stack[0]; /* Stack grows downwards; unwinder may only go up. */ if (newsp <= sp) - return 1; + return -EINVAL; if (newsp != stack_end && newsp > stack_end - STACK_FRAME_MIN_SIZE) { - return 1; /* invalid backlink, too far up. */ + return -EINVAL; /* invalid backlink, too far up. */ + } + + /* + * We can only trust the bottom frame's backlink, the + * rest of the frame may be uninitialized, continue to + * the next. + */ + if (firstframe) + continue; + + /* Mark stacktraces with exception frames as unreliable. */ + if (sp <= stack_end - STACK_INT_FRAME_SIZE && + stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { + return -EINVAL; } /* Examine the saved LR: it must point into kernel code. */ ip = stack[STACK_FRAME_LR_SAVE]; - if (!firstframe && !__kernel_text_address(ip)) - return 1; - firstframe = 0; + if (!__kernel_text_address(ip)) + return -EINVAL; /* * FIXME: IMHO these tests do not belong in @@ -175,25 +189,37 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, * as unreliable. */ if (ip == (unsigned long)kretprobe_trampoline) - return 1; + return -EINVAL; #endif + if (trace->nr_entries >= trace->max_entries) + return -E2BIG; if (!trace->skip) trace->entries[trace->nr_entries++] = ip; else trace->skip--; + } + return 0; +} - if (newsp == stack_end) - break; +int save_stack_trace_tsk_reliable(struct task_struct *tsk, + struct stack_trace *trace) +{ + int ret; - if (trace->nr_entries >= trace->max_entries) - return -E2BIG; + /* + * If the task doesn't have a stack (e.g., a zombie), the stack is + * "reliably" empty. + */ + if (!try_get_task_stack(tsk)) + return 0; - sp = newsp; - } - return 0; + ret = __save_stack_trace_tsk_reliable(tsk, trace); + + put_task_stack(tsk); + + return ret; } -EXPORT_SYMBOL_GPL(save_stack_trace_tsk_reliable); #endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */ #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_NMI_IPI) diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c @@ -123,7 +123,7 @@ long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, (u64)len_high << 32 | len_low, advice); } -long sys_switch_endian(void) +SYSCALL_DEFINE0(switch_endian) { struct thread_info *ti; diff --git a/arch/powerpc/kernel/syscalls/syscalltbl.sh b/arch/powerpc/kernel/syscalls/syscalltbl.sh @@ -13,10 +13,10 @@ emit() { t_entry="$3" while [ $t_nxt -lt $t_nr ]; do - printf "__SYSCALL(%s,sys_ni_syscall, )\n" "${t_nxt}" + printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}" t_nxt=$((t_nxt+1)) done - printf "__SYSCALL(%s,%s, )\n" "${t_nxt}" "${t_entry}" + printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}" } grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S @@ -25,11 +25,11 @@ .globl sys_call_table sys_call_table: #ifdef CONFIG_PPC64 -#define __SYSCALL(nr, entry, nargs) .8byte DOTSYM(entry) +#define __SYSCALL(nr, entry) .8byte DOTSYM(entry) #include <asm/syscall_table_64.h> #undef __SYSCALL #else -#define __SYSCALL(nr, entry, nargs) .long entry +#define __SYSCALL(nr, entry) .long entry #include <asm/syscall_table_32.h> #undef __SYSCALL #endif @@ -38,7 +38,7 @@ sys_call_table: .globl compat_sys_call_table compat_sys_call_table: #define compat_sys_sigsuspend sys_sigsuspend -#define __SYSCALL(nr, entry, nargs) .8byte DOTSYM(entry) +#define __SYSCALL(nr, entry) .8byte DOTSYM(entry) #include <asm/syscall_table_c32.h> #undef __SYSCALL #endif diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c @@ -57,7 +57,6 @@ #include <linux/irq_work.h> #include <linux/clk-provider.h> #include <linux/suspend.h> -#include <linux/rtc.h> #include <linux/sched/cputime.h> #include <linux/processor.h> #include <asm/trace.h> diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_PPC64) += $(obj64-y) obj-$(CONFIG_PPC32) += $(obj32-y) -# Disable GCOV & sanitizers in odd or sensitive code +# Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_ftrace.o := n +KCOV_INSTRUMENT_ftrace.o := n UBSAN_SANITIZE_ftrace.o := n diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -229,7 +229,7 @@ ftrace_call: * - r0, r11 & r12 are free */ livepatch_handler: - CURRENT_THREAD_INFO(r12, r1) + ld r12, PACA_THREAD_INFO(r13) /* Allocate 3 x 8 bytes */ ld r11, TI_livepatch_sp(r12) @@ -256,7 +256,7 @@ livepatch_handler: * restore it. */ - CURRENT_THREAD_INFO(r12, r1) + ld r12, PACA_THREAD_INFO(r13) ld r11, TI_livepatch_sp(r12) @@ -273,7 +273,7 @@ livepatch_handler: ld r2, -24(r11) /* Pop livepatch stack frame */ - CURRENT_THREAD_INFO(r12, r1) + ld r12, PACA_THREAD_INFO(r13) subi r11, r11, 24 std r11, TI_livepatch_sp(r12) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c @@ -257,24 +257,17 @@ static int __die(const char *str, struct pt_regs *regs, long err) { printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); - if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN)) - printk("LE "); - else - printk("BE "); - - if (IS_ENABLED(CONFIG_PREEMPT)) - pr_cont("PREEMPT "); - - if (IS_ENABLED(CONFIG_SMP)) - pr_cont("SMP NR_CPUS=%d ", NR_CPUS); - - if (debug_pagealloc_enabled()) - pr_cont("DEBUG_PAGEALLOC "); - - if (IS_ENABLED(CONFIG_NUMA)) - pr_cont("NUMA "); - - pr_cont("%s\n", ppc_md.name ? ppc_md.name : ""); + printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s%s %s\n", + IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", + PAGE_SIZE / 1024, + early_radix_enabled() ? " MMU=Radix" : "", + early_mmu_has_feature(MMU_FTR_HPTE_TABLE) ? " MMU=Hash" : "", + IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", + IS_ENABLED(CONFIG_SMP) ? " SMP" : "", + IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", + debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", + IS_ENABLED(CONFIG_NUMA) ? " NUMA" : "", + ppc_md.name ? ppc_md.name : ""); if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV) == NOTIFY_STOP) return 1; @@ -376,16 +369,101 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) force_sig_fault(signr, code, (void __user *)addr, current); } +/* + * The interrupt architecture has a quirk in that the HV interrupts excluding + * the NMIs (0x100 and 0x200) do not clear MSR[RI] at entry. The first thing + * that an interrupt handler must do is save off a GPR into a scratch register, + * and all interrupts on POWERNV (HV=1) use the HSPRG1 register as scratch. + * Therefore an NMI can clobber an HV interrupt's live HSPRG1 without noticing + * that it is non-reentrant, which leads to random data corruption. + * + * The solution is for NMI interrupts in HV mode to check if they originated + * from these critical HV interrupt regions. If so, then mark them not + * recoverable. + * + * An alternative would be for HV NMIs to use SPRG for scratch to avoid the + * HSPRG1 clobber, however this would cause guest SPRG to be clobbered. Linux + * guests should always have MSR[RI]=0 when its scratch SPRG is in use, so + * that would work. However any other guest OS that may have the SPRG live + * and MSR[RI]=1 could encounter silent corruption. + * + * Builds that do not support KVM could take this second option to increase + * the recoverability of NMIs. + */ +void hv_nmi_check_nonrecoverable(struct pt_regs *regs) +{ +#ifdef CONFIG_PPC_POWERNV + unsigned long kbase = (unsigned long)_stext; + unsigned long nip = regs->nip; + + if (!(regs->msr & MSR_RI)) + return; + if (!(regs->msr & MSR_HV)) + return; + if (regs->msr & MSR_PR) + return; + + /* + * Now test if the interrupt has hit a range that may be using + * HSPRG1 without having RI=0 (i.e., an HSRR interrupt). The + * problem ranges all run un-relocated. Test real and virt modes + * at the same time by droping the high bit of the nip (virt mode + * entry points still have the +0x4000 offset). + */ + nip &= ~0xc000000000000000ULL; + if ((nip >= 0x500 && nip < 0x600) || (nip >= 0x4500 && nip < 0x4600)) + goto nonrecoverable; + if ((nip >= 0x980 && nip < 0xa00) || (nip >= 0x4980 && nip < 0x4a00)) + goto nonrecoverable; + if ((nip >= 0xe00 && nip < 0xec0) || (nip >= 0x4e00 && nip < 0x4ec0)) + goto nonrecoverable; + if ((nip >= 0xf80 && nip < 0xfa0) || (nip >= 0x4f80 && nip < 0x4fa0)) + goto nonrecoverable; + + /* Trampoline code runs un-relocated so subtract kbase. */ + if (nip >= (unsigned long)(start_real_trampolines - kbase) && + nip < (unsigned long)(end_real_trampolines - kbase)) + goto nonrecoverable; + if (nip >= (unsigned long)(start_virt_trampolines - kbase) && + nip < (unsigned long)(end_virt_trampolines - kbase)) + goto nonrecoverable; + return; + +nonrecoverable: + regs->msr &= ~MSR_RI; +#endif +} + void system_reset_exception(struct pt_regs *regs) { + unsigned long hsrr0, hsrr1; + bool nested = in_nmi(); + bool saved_hsrrs = false; + /* * Avoid crashes in case of nested NMI exceptions. Recoverability * is determined by RI and in_nmi */ - bool nested = in_nmi(); if (!nested) nmi_enter(); + /* + * System reset can interrupt code where HSRRs are live and MSR[RI]=1. + * The system reset interrupt itself may clobber HSRRs (e.g., to call + * OPAL), so save them here and restore them before returning. + * + * Machine checks don't need to save HSRRs, as the real mode handler + * is careful to avoid them, and the regular handler is not delivered + * as an NMI. + */ + if (cpu_has_feature(CPU_FTR_HVMODE)) { + hsrr0 = mfspr(SPRN_HSRR0); + hsrr1 = mfspr(SPRN_HSRR1); + saved_hsrrs = true; + } + + hv_nmi_check_nonrecoverable(regs); + __this_cpu_inc(irq_stat.sreset_irqs); /* See if any machine dependent calls */ @@ -433,6 +511,11 @@ out: if (!(regs->msr & MSR_RI)) nmi_panic(regs, "Unrecoverable System Reset"); + if (saved_hsrrs) { + mtspr(SPRN_HSRR0, hsrr0); + mtspr(SPRN_HSRR1, hsrr1); + } + if (!nested) nmi_exit(); @@ -763,15 +846,15 @@ void machine_check_exception(struct pt_regs *regs) if (check_io_access(regs)) goto bail; - /* Must die if the interrupt is not recoverable */ - if (!(regs->msr & MSR_RI)) - nmi_panic(regs, "Unrecoverable Machine check"); - if (!nested) nmi_exit(); die("Machine check", regs, SIGBUS); + /* Must die if the interrupt is not recoverable */ + if (!(regs->msr & MSR_RI)) + nmi_panic(regs, "Unrecoverable Machine check"); + return; bail: @@ -1542,8 +1625,8 @@ bail: void StackOverflow(struct pt_regs *regs) { - printk(KERN_CRIT "Kernel stack overflow in process %p, r1=%lx\n", - current, regs->gpr[1]); + pr_crit("Kernel stack overflow in process %s[%d], r1=%lx\n", + current->comm, task_pid_nr(current), regs->gpr[1]); debugger(regs); show_regs(regs); panic("kernel stack overflow"); diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c @@ -74,7 +74,7 @@ void __init udbg_early_init(void) #endif #ifdef CONFIG_PPC_EARLY_DEBUG - console_loglevel = 10; + console_loglevel = CONSOLE_LOGLEVEL_DEBUG; register_early_udbg_console(); #endif diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile @@ -23,6 +23,7 @@ targets := $(obj-vdso32) vdso32.so vdso32.so.dbg obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) GCOV_PROFILE := n +KCOV_INSTRUMENT := n UBSAN_SANITIZE := n ccflags-y := -shared -fno-common -fno-builtin diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile @@ -9,6 +9,7 @@ targets := $(obj-vdso64) vdso64.so vdso64.so.dbg obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64)) GCOV_PROFILE := n +KCOV_INSTRUMENT := n UBSAN_SANITIZE := n ccflags-y := -shared -fno-common -fno-builtin diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S @@ -12,11 +12,8 @@ #include <asm/cache.h> #include <asm/thread_info.h> -#if defined(CONFIG_STRICT_KERNEL_RWX) && !defined(CONFIG_PPC32) -#define STRICT_ALIGN_SIZE (1 << 24) -#else -#define STRICT_ALIGN_SIZE PAGE_SIZE -#endif +#define STRICT_ALIGN_SIZE (1 << CONFIG_DATA_SHIFT) +#define ETEXT_ALIGN_SIZE (1 << CONFIG_ETEXT_SHIFT) ENTRY(_stext) @@ -86,11 +83,11 @@ SECTIONS #ifdef CONFIG_PPC64 /* - * BLOCK(0) overrides the default output section alignment because + * ALIGN(0) overrides the default output section alignment because * this needs to start right after .head.text in order for fixed * section placement to work. */ - .text BLOCK(0) : AT(ADDR(.text) - LOAD_OFFSET) { + .text ALIGN(0) : AT(ADDR(.text) - LOAD_OFFSET) { #ifdef CONFIG_LD_HEAD_STUB_CATCH KEEP(*(.linker_stub_catch)); . = . ; @@ -131,7 +128,7 @@ SECTIONS } :kernel - . = ALIGN(PAGE_SIZE); + . = ALIGN(ETEXT_ALIGN_SIZE); _etext = .; PROVIDE32 (etext = .); @@ -319,6 +316,7 @@ SECTIONS *(.sdata2) *(.got.plt) *(.got) *(.plt) + *(.branch_lt) } #else .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile @@ -10,11 +10,6 @@ common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o -CFLAGS_e500_mmu.o := -I. -CFLAGS_e500_mmu_host.o := -I. -CFLAGS_emulate.o := -I. -CFLAGS_emulate_loadstore.o := -I. - common-objs-y += powerpc.o emulate_loadstore.o obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c @@ -192,6 +192,13 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) } EXPORT_SYMBOL_GPL(kvmppc_book3s_queue_irqprio); +void kvmppc_core_queue_machine_check(struct kvm_vcpu *vcpu, ulong flags) +{ + /* might as well deliver this straight away */ + kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_MACHINE_CHECK, flags); +} +EXPORT_SYMBOL_GPL(kvmppc_core_queue_machine_check); + void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags) { /* might as well deliver this straight away */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c @@ -1215,6 +1215,22 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_MACHINE_CHECK: + /* Print the MCE event to host console. */ + machine_check_print_event_info(&vcpu->arch.mce_evt, false, true); + + /* + * If the guest can do FWNMI, exit to userspace so it can + * deliver a FWNMI to the guest. + * Otherwise we synthesize a machine check for the guest + * so that it knows that the machine check occurred. + */ + if (!vcpu->kvm->arch.fwnmi_enabled) { + ulong flags = vcpu->arch.shregs.msr & 0x083c0000; + kvmppc_core_queue_machine_check(vcpu, flags); + r = RESUME_GUEST; + break; + } + /* Exit to guest with KVM_EXIT_NMI as exit reason */ run->exit_reason = KVM_EXIT_NMI; run->hw.hardware_exit_reason = vcpu->arch.trap; @@ -1227,8 +1243,6 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV; r = RESUME_HOST; - /* Print the MCE event to host console. */ - machine_check_print_event_info(&vcpu->arch.mce_evt, false); break; case BOOK3S_INTERRUPT_PROGRAM: { @@ -1392,7 +1406,7 @@ static int kvmppc_handle_nested_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) /* Pass the machine check to the L1 guest */ r = RESUME_HOST; /* Print the MCE event to host console. */ - machine_check_print_event_info(&vcpu->arch.mce_evt, false); + machine_check_print_event_info(&vcpu->arch.mce_evt, false, true); break; /* * We get these next two if the guest accesses a page which it thinks @@ -3455,6 +3469,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long host_dscr = mfspr(SPRN_DSCR); unsigned long host_tidr = mfspr(SPRN_TIDR); unsigned long host_iamr = mfspr(SPRN_IAMR); + unsigned long host_amr = mfspr(SPRN_AMR); s64 dec; u64 tb; int trap, save_pmu; @@ -3571,13 +3586,15 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, mtspr(SPRN_PSPB, 0); mtspr(SPRN_WORT, 0); - mtspr(SPRN_AMR, 0); mtspr(SPRN_UAMOR, 0); mtspr(SPRN_DSCR, host_dscr); mtspr(SPRN_TIDR, host_tidr); mtspr(SPRN_IAMR, host_iamr); mtspr(SPRN_PSPB, 0); + if (host_amr != vcpu->arch.amr) + mtspr(SPRN_AMR, host_amr); + msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); store_fp_state(&vcpu->arch.fp); #ifdef CONFIG_ALTIVEC diff --git a/arch/powerpc/kvm/book3s_hv_hmi.c b/arch/powerpc/kvm/book3s_hv_hmi.c @@ -24,6 +24,7 @@ #include <linux/compiler.h> #include <asm/paca.h> #include <asm/hmi.h> +#include <asm/processor.h> void wait_for_subcore_guest_exit(void) { diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c @@ -66,10 +66,8 @@ static void reload_slb(struct kvm_vcpu *vcpu) /* * On POWER7, see if we can handle a machine check that occurred inside * the guest in real mode, without switching to the host partition. - * - * Returns: 0 => exit guest, 1 => deliver machine check to guest */ -static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) +static void kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) { unsigned long srr1 = vcpu->arch.shregs.msr; struct machine_check_event mce_evt; @@ -111,52 +109,24 @@ static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) } /* - * See if we have already handled the condition in the linux host. - * We assume that if the condition is recovered then linux host - * will have generated an error log event that we will pick - * up and log later. - * Don't release mce event now. We will queue up the event so that - * we can log the MCE event info on host console. + * Now get the event and stash it in the vcpu struct so it can + * be handled by the primary thread in virtual mode. We can't + * call machine_check_queue_event() here if we are running on + * an offline secondary thread. */ - if (!get_mce_event(&mce_evt, MCE_EVENT_DONTRELEASE)) - goto out; - - if (mce_evt.version == MCE_V1 && - (mce_evt.severity == MCE_SEV_NO_ERROR || - mce_evt.disposition == MCE_DISPOSITION_RECOVERED)) - handled = 1; - -out: - /* - * For guest that supports FWNMI capability, hook the MCE event into - * vcpu structure. We are going to exit the guest with KVM_EXIT_NMI - * exit reason. On our way to exit we will pull this event from vcpu - * structure and print it from thread 0 of the core/subcore. - * - * For guest that does not support FWNMI capability (old QEMU): - * We are now going enter guest either through machine check - * interrupt (for unhandled errors) or will continue from - * current HSRR0 (for handled errors) in guest. Hence - * queue up the event so that we can log it from host console later. - */ - if (vcpu->kvm->arch.fwnmi_enabled) { - /* - * Hook up the mce event on to vcpu structure. - * First clear the old event. - */ - memset(&vcpu->arch.mce_evt, 0, sizeof(vcpu->arch.mce_evt)); - if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) { - vcpu->arch.mce_evt = mce_evt; - } - } else - machine_check_queue_event(); + if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) { + if (handled && mce_evt.version == MCE_V1) + mce_evt.disposition = MCE_DISPOSITION_RECOVERED; + } else { + memset(&mce_evt, 0, sizeof(mce_evt)); + } - return handled; + vcpu->arch.mce_evt = mce_evt; } -long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu) +void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu) { - return kvmppc_realmode_mc_power7(vcpu); + kvmppc_realmode_mc_power7(vcpu); } /* Check if dynamic split is in force and return subcore size accordingly. */ diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -58,6 +58,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #define STACK_SLOT_DAWR (SFS-56) #define STACK_SLOT_DAWRX (SFS-64) #define STACK_SLOT_HFSCR (SFS-72) +#define STACK_SLOT_AMR (SFS-80) +#define STACK_SLOT_UAMOR (SFS-88) /* the following is used by the P9 short path */ #define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */ @@ -726,11 +728,9 @@ BEGIN_FTR_SECTION mfspr r5, SPRN_TIDR mfspr r6, SPRN_PSSCR mfspr r7, SPRN_PID - mfspr r8, SPRN_IAMR std r5, STACK_SLOT_TID(r1) std r6, STACK_SLOT_PSSCR(r1) std r7, STACK_SLOT_PID(r1) - std r8, STACK_SLOT_IAMR(r1) mfspr r5, SPRN_HFSCR std r5, STACK_SLOT_HFSCR(r1) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) @@ -738,11 +738,18 @@ BEGIN_FTR_SECTION mfspr r5, SPRN_CIABR mfspr r6, SPRN_DAWR mfspr r7, SPRN_DAWRX + mfspr r8, SPRN_IAMR std r5, STACK_SLOT_CIABR(r1) std r6, STACK_SLOT_DAWR(r1) std r7, STACK_SLOT_DAWRX(r1) + std r8, STACK_SLOT_IAMR(r1) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mfspr r5, SPRN_AMR + std r5, STACK_SLOT_AMR(r1) + mfspr r6, SPRN_UAMOR + std r6, STACK_SLOT_UAMOR(r1) + BEGIN_FTR_SECTION /* Set partition DABR */ /* Do this before re-enabling PMU to avoid P7 DABR corruption bug */ @@ -1631,22 +1638,25 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) mtspr SPRN_PSPB, r0 mtspr SPRN_WORT, r0 BEGIN_FTR_SECTION - mtspr SPRN_IAMR, r0 mtspr SPRN_TCSCR, r0 /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */ li r0, 1 sldi r0, r0, 31 mtspr SPRN_MMCRS, r0 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) -8: - /* Save and reset AMR and UAMOR before turning on the MMU */ + /* Save and restore AMR, IAMR and UAMOR before turning on the MMU */ + ld r8, STACK_SLOT_IAMR(r1) + mtspr SPRN_IAMR, r8 + +8: /* Power7 jumps back in here */ mfspr r5,SPRN_AMR mfspr r6,SPRN_UAMOR std r5,VCPU_AMR(r9) std r6,VCPU_UAMOR(r9) - li r6,0 - mtspr SPRN_AMR,r6 + ld r5,STACK_SLOT_AMR(r1) + ld r6,STACK_SLOT_UAMOR(r1) + mtspr SPRN_AMR, r5 mtspr SPRN_UAMOR, r6 /* Switch DSCR back to host value */ @@ -1746,11 +1756,9 @@ BEGIN_FTR_SECTION ld r5, STACK_SLOT_TID(r1) ld r6, STACK_SLOT_PSSCR(r1) ld r7, STACK_SLOT_PID(r1) - ld r8, STACK_SLOT_IAMR(r1) mtspr SPRN_TIDR, r5 mtspr SPRN_PSSCR, r6 mtspr SPRN_PID, r7 - mtspr SPRN_IAMR, r8 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) #ifdef CONFIG_PPC_RADIX_MMU @@ -2826,49 +2834,15 @@ kvm_cede_exit: #endif /* CONFIG_KVM_XICS */ 3: b guest_exit_cont - /* Try to handle a machine check in real mode */ + /* Try to do machine check recovery in real mode */ machine_check_realmode: mr r3, r9 /* get vcpu pointer */ bl kvmppc_realmode_machine_check nop + /* all machine checks go to virtual mode for further handling */ ld r9, HSTATE_KVM_VCPU(r13) li r12, BOOK3S_INTERRUPT_MACHINE_CHECK - /* - * For the guest that is FWNMI capable, deliver all the MCE errors - * (handled/unhandled) by exiting the guest with KVM_EXIT_NMI exit - * reason. This new approach injects machine check errors in guest - * address space to guest with additional information in the form - * of RTAS event, thus enabling guest kernel to suitably handle - * such errors. - * - * For the guest that is not FWNMI capable (old QEMU) fallback - * to old behaviour for backward compatibility: - * Deliver unhandled/fatal (e.g. UE) MCE errors to guest either - * through machine check interrupt (set HSRR0 to 0x200). - * For handled errors (no-fatal), just go back to guest execution - * with current HSRR0. - * if we receive machine check with MSR(RI=0) then deliver it to - * guest as machine check causing guest to crash. - */ - ld r11, VCPU_MSR(r9) - rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */ - bne guest_exit_cont /* if so, exit to host */ - /* Check if guest is capable of handling NMI exit */ - ld r10, VCPU_KVM(r9) - lbz r10, KVM_FWNMI(r10) - cmpdi r10, 1 /* FWNMI capable? */ - beq guest_exit_cont /* if so, exit with KVM_EXIT_NMI. */ - - /* if not, fall through for backward compatibility. */ - andi. r10, r11, MSR_RI /* check for unrecoverable exception */ - beq 1f /* Deliver a machine check to guest */ - ld r10, VCPU_PC(r9) - cmpdi r3, 0 /* Did we handle MCE ? */ - bne 2f /* Continue guest execution. */ - /* If not, deliver a machine check. SRR0/1 are already set */ -1: li r10, BOOK3S_INTERRUPT_MACHINE_CHECK - bl kvmppc_msr_interrupt -2: b fast_interrupt_c_return + b guest_exit_cont /* * Call C code to handle a HMI in real mode. diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile @@ -30,7 +30,8 @@ obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \ obj64-$(CONFIG_SMP) += locks.o obj64-$(CONFIG_ALTIVEC) += vmx-helper.o -obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o +obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o \ + test_emulate_step_exec_instr.o obj-y += checksum_$(BITS).o checksum_wrappers.o \ string_$(BITS).o memcmp_$(BITS).o diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c @@ -1169,7 +1169,7 @@ static nokprobe_inline int trap_compare(long v1, long v2) int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, unsigned int instr) { - unsigned int opcode, ra, rb, rd, spr, u; + unsigned int opcode, ra, rb, rc, rd, spr, u; unsigned long int imm; unsigned long int val, val2; unsigned int mb, me, sh; @@ -1292,6 +1292,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, rd = (instr >> 21) & 0x1f; ra = (instr >> 16) & 0x1f; rb = (instr >> 11) & 0x1f; + rc = (instr >> 6) & 0x1f; switch (opcode) { #ifdef __powerpc64__ @@ -1305,6 +1306,38 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, goto trap; return 1; +#ifdef __powerpc64__ + case 4: + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + + switch (instr & 0x3f) { + case 48: /* maddhd */ + asm volatile(PPC_MADDHD(%0, %1, %2, %3) : + "=r" (op->val) : "r" (regs->gpr[ra]), + "r" (regs->gpr[rb]), "r" (regs->gpr[rc])); + goto compute_done; + + case 49: /* maddhdu */ + asm volatile(PPC_MADDHDU(%0, %1, %2, %3) : + "=r" (op->val) : "r" (regs->gpr[ra]), + "r" (regs->gpr[rb]), "r" (regs->gpr[rc])); + goto compute_done; + + case 51: /* maddld */ + asm volatile(PPC_MADDLD(%0, %1, %2, %3) : + "=r" (op->val) : "r" (regs->gpr[ra]), + "r" (regs->gpr[rb]), "r" (regs->gpr[rc])); + goto compute_done; + } + + /* + * There are other instructions from ISA 3.0 with the same + * primary opcode which do not have emulation support yet. + */ + return -1; +#endif + case 7: /* mulli */ op->val = regs->gpr[ra] * (short) instr; goto compute_done; @@ -1671,10 +1704,23 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, (int) regs->gpr[rb]; goto arith_done; - +#ifdef __powerpc64__ + case 265: /* modud */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + op->val = regs->gpr[ra] % regs->gpr[rb]; + goto compute_done; +#endif case 266: /* add */ op->val = regs->gpr[ra] + regs->gpr[rb]; goto arith_done; + + case 267: /* moduw */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + op->val = (unsigned int) regs->gpr[ra] % + (unsigned int) regs->gpr[rb]; + goto compute_done; #ifdef __powerpc64__ case 457: /* divdu */ op->val = regs->gpr[ra] / regs->gpr[rb]; @@ -1695,6 +1741,42 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, (int) regs->gpr[rb]; goto arith_done; + case 755: /* darn */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + switch (ra & 0x3) { + case 0: + /* 32-bit conditioned */ + asm volatile(PPC_DARN(%0, 0) : "=r" (op->val)); + goto compute_done; + + case 1: + /* 64-bit conditioned */ + asm volatile(PPC_DARN(%0, 1) : "=r" (op->val)); + goto compute_done; + + case 2: + /* 64-bit raw */ + asm volatile(PPC_DARN(%0, 2) : "=r" (op->val)); + goto compute_done; + } + + return -1; +#ifdef __powerpc64__ + case 777: /* modsd */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + op->val = (long int) regs->gpr[ra] % + (long int) regs->gpr[rb]; + goto compute_done; +#endif + case 779: /* modsw */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + op->val = (int) regs->gpr[ra] % + (int) regs->gpr[rb]; + goto compute_done; + /* * Logical instructions @@ -1765,6 +1847,20 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, do_popcnt(regs, op, regs->gpr[rd], 64); goto logical_done_nocc; #endif + case 538: /* cnttzw */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + val = (unsigned int) regs->gpr[rd]; + op->val = (val ? __builtin_ctz(val) : 32); + goto logical_done; +#ifdef __powerpc64__ + case 570: /* cnttzd */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + val = regs->gpr[rd]; + op->val = (val ? __builtin_ctzl(val) : 64); + goto logical_done; +#endif case 922: /* extsh */ op->val = (signed short) regs->gpr[rd]; goto logical_done; @@ -1866,6 +1962,20 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->xerval &= ~XER_CA; set_ca32(op, op->xerval & XER_CA); goto logical_done; + + case 890: /* extswsli with sh_5 = 0 */ + case 891: /* extswsli with sh_5 = 1 */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + op->type = COMPUTE + SETREG; + sh = rb | ((instr & 2) << 4); + val = (signed int) regs->gpr[rd]; + if (sh) + op->val = ROTATE(val, sh) & MASK64(0, 63 - sh); + else + op->val = val; + goto logical_done; + #endif /* __powerpc64__ */ /* diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c @@ -1,5 +1,5 @@ /* - * Simple sanity test for emulate_step load/store instructions. + * Simple sanity tests for instruction emulation infrastructure. * * Copyright IBM Corp. 2016 * @@ -14,6 +14,7 @@ #include <linux/ptrace.h> #include <asm/sstep.h> #include <asm/ppc-opcode.h> +#include <asm/code-patching.h> #define IMM_L(i) ((uintptr_t)(i) & 0xffff) @@ -48,7 +49,20 @@ ___PPC_RA(a) | ___PPC_RB(b)) #define TEST_LXVD2X(s, a, b) (PPC_INST_LXVD2X | VSX_XX1((s), R##a, R##b)) #define TEST_STXVD2X(s, a, b) (PPC_INST_STXVD2X | VSX_XX1((s), R##a, R##b)) +#define TEST_ADD(t, a, b) (PPC_INST_ADD | ___PPC_RT(t) | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define TEST_ADD_DOT(t, a, b) (PPC_INST_ADD | ___PPC_RT(t) | \ + ___PPC_RA(a) | ___PPC_RB(b) | 0x1) +#define TEST_ADDC(t, a, b) (PPC_INST_ADDC | ___PPC_RT(t) | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define TEST_ADDC_DOT(t, a, b) (PPC_INST_ADDC | ___PPC_RT(t) | \ + ___PPC_RA(a) | ___PPC_RB(b) | 0x1) + +#define MAX_SUBTESTS 16 +#define IGNORE_GPR(n) (0x1UL << (n)) +#define IGNORE_XER (0x1UL << 32) +#define IGNORE_CCR (0x1UL << 33) static void __init init_pt_regs(struct pt_regs *regs) { @@ -72,9 +86,15 @@ static void __init init_pt_regs(struct pt_regs *regs) msr_cached = true; } -static void __init show_result(char *ins, char *result) +static void __init show_result(char *mnemonic, char *result) { - pr_info("%-14s : %s\n", ins, result); + pr_info("%-14s : %s\n", mnemonic, result); +} + +static void __init show_result_with_descr(char *mnemonic, char *descr, + char *result) +{ + pr_info("%-14s : %-50s %s\n", mnemonic, descr, result); } static void __init test_ld(void) @@ -426,7 +446,7 @@ static void __init test_lxvd2x_stxvd2x(void) } #endif /* CONFIG_VSX */ -static int __init test_emulate_step(void) +static void __init run_tests_load_store(void) { test_ld(); test_lwz(); @@ -437,6 +457,513 @@ static int __init test_emulate_step(void) test_lfdx_stfdx(); test_lvx_stvx(); test_lxvd2x_stxvd2x(); +} + +struct compute_test { + char *mnemonic; + struct { + char *descr; + unsigned long flags; + unsigned int instr; + struct pt_regs regs; + } subtests[MAX_SUBTESTS + 1]; +}; + +static struct compute_test compute_tests[] = { + { + .mnemonic = "nop", + .subtests = { + { + .descr = "R0 = LONG_MAX", + .instr = PPC_INST_NOP, + .regs = { + .gpr[0] = LONG_MAX, + } + } + } + }, + { + .mnemonic = "add", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MAX, RB = LONG_MAX", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = LONG_MAX, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = ULONG_MAX", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = ULONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = 0x1", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MIN", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MIN, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MAX", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = INT_MAX, RB = INT_MAX", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = INT_MAX, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = UINT_MAX", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = UINT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = 0x1", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = 0x1, + } + } + } + }, + { + .mnemonic = "add.", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .flags = IGNORE_CCR, + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MAX, RB = LONG_MAX", + .flags = IGNORE_CCR, + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MAX, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = ULONG_MAX", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = ULONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = 0x1", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MIN", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MIN, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MAX", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = INT_MAX, RB = INT_MAX", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = INT_MAX, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = UINT_MAX", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = UINT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = 0x1", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = 0x1, + } + } + } + }, + { + .mnemonic = "addc", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MAX, RB = LONG_MAX", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = LONG_MAX, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = ULONG_MAX", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = ULONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = 0x1", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MIN", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MIN, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MAX", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = INT_MAX, RB = INT_MAX", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = INT_MAX, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = UINT_MAX", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = UINT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = 0x1", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = LONG_MIN | INT_MIN, RB = LONG_MIN | INT_MIN", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN | (uint)INT_MIN, + .gpr[22] = LONG_MIN | (uint)INT_MIN, + } + } + } + }, + { + .mnemonic = "addc.", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .flags = IGNORE_CCR, + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MAX, RB = LONG_MAX", + .flags = IGNORE_CCR, + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MAX, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = ULONG_MAX", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = ULONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = 0x1", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MIN", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MIN, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MAX", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = INT_MAX, RB = INT_MAX", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = INT_MAX, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = UINT_MAX", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = UINT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = 0x1", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = LONG_MIN | INT_MIN, RB = LONG_MIN | INT_MIN", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN | (uint)INT_MIN, + .gpr[22] = LONG_MIN | (uint)INT_MIN, + } + } + } + } +}; + +static int __init emulate_compute_instr(struct pt_regs *regs, + unsigned int instr) +{ + struct instruction_op op; + + if (!regs || !instr) + return -EINVAL; + + if (analyse_instr(&op, regs, instr) != 1 || + GETTYPE(op.type) != COMPUTE) { + pr_info("emulation failed, instruction = 0x%08x\n", instr); + return -EFAULT; + } + + emulate_update_regs(regs, &op); + return 0; +} + +static int __init execute_compute_instr(struct pt_regs *regs, + unsigned int instr) +{ + extern int exec_instr(struct pt_regs *regs); + extern s32 patch__exec_instr; + + if (!regs || !instr) + return -EINVAL; + + /* Patch the NOP with the actual instruction */ + patch_instruction_site(&patch__exec_instr, instr); + if (exec_instr(regs)) { + pr_info("execution failed, instruction = 0x%08x\n", instr); + return -EFAULT; + } + + return 0; +} + +#define gpr_mismatch(gprn, exp, got) \ + pr_info("GPR%u mismatch, exp = 0x%016lx, got = 0x%016lx\n", \ + gprn, exp, got) + +#define reg_mismatch(name, exp, got) \ + pr_info("%s mismatch, exp = 0x%016lx, got = 0x%016lx\n", \ + name, exp, got) + +static void __init run_tests_compute(void) +{ + unsigned long flags; + struct compute_test *test; + struct pt_regs *regs, exp, got; + unsigned int i, j, k, instr; + bool ignore_gpr, ignore_xer, ignore_ccr, passed; + + for (i = 0; i < ARRAY_SIZE(compute_tests); i++) { + test = &compute_tests[i]; + + for (j = 0; j < MAX_SUBTESTS && test->subtests[j].descr; j++) { + instr = test->subtests[j].instr; + flags = test->subtests[j].flags; + regs = &test->subtests[j].regs; + ignore_xer = flags & IGNORE_XER; + ignore_ccr = flags & IGNORE_CCR; + passed = true; + + memcpy(&exp, regs, sizeof(struct pt_regs)); + memcpy(&got, regs, sizeof(struct pt_regs)); + + /* + * Set a compatible MSR value explicitly to ensure + * that XER and CR bits are updated appropriately + */ + exp.msr = MSR_KERNEL; + got.msr = MSR_KERNEL; + + if (emulate_compute_instr(&got, instr) || + execute_compute_instr(&exp, instr)) { + passed = false; + goto print; + } + + /* Verify GPR values */ + for (k = 0; k < 32; k++) { + ignore_gpr = flags & IGNORE_GPR(k); + if (!ignore_gpr && exp.gpr[k] != got.gpr[k]) { + passed = false; + gpr_mismatch(k, exp.gpr[k], got.gpr[k]); + } + } + + /* Verify LR value */ + if (exp.link != got.link) { + passed = false; + reg_mismatch("LR", exp.link, got.link); + } + + /* Verify XER value */ + if (!ignore_xer && exp.xer != got.xer) { + passed = false; + reg_mismatch("XER", exp.xer, got.xer); + } + + /* Verify CR value */ + if (!ignore_ccr && exp.ccr != got.ccr) { + passed = false; + reg_mismatch("CR", exp.ccr, got.ccr); + } + +print: + show_result_with_descr(test->mnemonic, + test->subtests[j].descr, + passed ? "PASS" : "FAIL"); + } + } +} + +static int __init test_emulate_step(void) +{ + printk(KERN_INFO "Running instruction emulation self-tests ...\n"); + run_tests_load_store(); + run_tests_compute(); return 0; } diff --git a/arch/powerpc/lib/test_emulate_step_exec_instr.S b/arch/powerpc/lib/test_emulate_step_exec_instr.S @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Non-emulated single-stepping support (currently limited to basic integer + * computations) used to validate the instruction emulation infrastructure. + * + * Copyright (C) 2019 IBM Corporation + */ + +#include <asm/asm-offsets.h> +#include <asm/ppc_asm.h> +#include <asm/code-patching-asm.h> +#include <linux/errno.h> + +/* int exec_instr(struct pt_regs *regs) */ +_GLOBAL(exec_instr) + + /* + * Stack frame layout (INT_FRAME_SIZE bytes) + * In-memory pt_regs (SP + STACK_FRAME_OVERHEAD) + * Scratch space (SP + 8) + * Back chain (SP + 0) + */ + + /* + * Allocate a new stack frame with enough space to hold the register + * states in an in-memory pt_regs and also create the back chain to + * the caller's stack frame. + */ + stdu r1, -INT_FRAME_SIZE(r1) + + /* + * Save non-volatile GPRs on stack. This includes TOC pointer (GPR2) + * and local variables (GPR14 to GPR31). The register for the pt_regs + * parameter (GPR3) is saved additionally to ensure that the resulting + * register state can still be saved even if GPR3 gets overwritten + * when loading the initial register state for the test instruction. + * The stack pointer (GPR1) and the thread pointer (GPR13) are not + * saved as these should not be modified anyway. + */ + SAVE_2GPRS(2, r1) + SAVE_NVGPRS(r1) + + /* + * Save LR on stack to ensure that the return address is available + * even if it gets overwritten by the test instruction. + */ + mflr r0 + std r0, _LINK(r1) + + /* + * Save CR on stack. For simplicity, the entire register is saved + * even though only fields 2 to 4 are non-volatile. + */ + mfcr r0 + std r0, _CCR(r1) + + /* + * Load register state for the test instruction without touching the + * critical non-volatile registers. The register state is passed as a + * pointer to a pt_regs instance. + */ + subi r31, r3, GPR0 + + /* Load LR from pt_regs */ + ld r0, _LINK(r31) + mtlr r0 + + /* Load CR from pt_regs */ + ld r0, _CCR(r31) + mtcr r0 + + /* Load XER from pt_regs */ + ld r0, _XER(r31) + mtxer r0 + + /* Load GPRs from pt_regs */ + REST_GPR(0, r31) + REST_10GPRS(2, r31) + REST_GPR(12, r31) + REST_NVGPRS(r31) + + /* Placeholder for the test instruction */ +1: nop + patch_site 1b patch__exec_instr + + /* + * Since GPR3 is overwritten, temporarily restore it back to its + * original state, i.e. the pointer to pt_regs, to ensure that the + * resulting register state can be saved. Before doing this, a copy + * of it is created in the scratch space which is used later on to + * save it to pt_regs. + */ + std r3, 8(r1) + REST_GPR(3, r1) + + /* Save resulting GPR state to pt_regs */ + subi r3, r3, GPR0 + SAVE_GPR(0, r3) + SAVE_GPR(2, r3) + SAVE_8GPRS(4, r3) + SAVE_GPR(12, r3) + SAVE_NVGPRS(r3) + + /* Save resulting LR to pt_regs */ + mflr r0 + std r0, _LINK(r3) + + /* Save resulting CR to pt_regs */ + mfcr r0 + std r0, _CCR(r3) + + /* Save resulting XER to pt_regs */ + mfxer r0 + std r0, _XER(r3) + + /* Restore resulting GPR3 from scratch space and save it to pt_regs */ + ld r0, 8(r1) + std r0, GPR3(r3) + + /* Set return value to denote execution success */ + li r3, 0 + + /* Continue */ + b 3f + + /* Set return value to denote execution failure */ +2: li r3, -EFAULT + + /* Restore the non-volatile GPRs from stack */ +3: REST_GPR(2, r1) + REST_NVGPRS(r1) + + /* Restore LR from stack to be able to return */ + ld r0, _LINK(r1) + mtlr r0 + + /* Restore CR from stack */ + ld r0, _CCR(r1) + mtcr r0 + + /* Tear down stack frame */ + addi r1, r1, INT_FRAME_SIZE + + /* Return */ + blr + + /* Setup exception table */ + EX_TABLE(1b, 2b) + +_ASM_NOKPROBE_SYMBOL(exec_instr) diff --git a/arch/powerpc/math-emu/Makefile b/arch/powerpc/math-emu/Makefile @@ -17,4 +17,4 @@ obj-$(CONFIG_SPE) += math_efp.o CFLAGS_fabs.o = -fno-builtin-fabs CFLAGS_math.o = -fno-builtin-fabs -ccflags-y = -I. -Iinclude/math-emu -w +ccflags-y = -w diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c @@ -93,7 +93,7 @@ void __init MMU_init_hw(void) #define LARGE_PAGE_SIZE_16M (1<<24) #define LARGE_PAGE_SIZE_4M (1<<22) -unsigned long __init mmu_mapin_ram(unsigned long top) +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { unsigned long v, s, mapped; phys_addr_t p; diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c @@ -170,7 +170,7 @@ void __init MMU_init_hw(void) flush_instruction_cache(); } -unsigned long __init mmu_mapin_ram(unsigned long top) +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { unsigned long addr; unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c @@ -66,26 +66,22 @@ unsigned long p_block_mapped(phys_addr_t pa) void __init MMU_init_hw(void) { /* PIN up to the 3 first 8Mb after IMMR in DTLB table */ -#ifdef CONFIG_PIN_TLB_DATA - unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000; - unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY; -#ifdef CONFIG_PIN_TLB_IMMR - int i = 29; -#else - int i = 28; -#endif - unsigned long addr = 0; - unsigned long mem = total_lowmem; - - for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) { - mtspr(SPRN_MD_CTR, ctr | (i << 8)); - mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID); - mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID); - mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT); - addr += LARGE_PAGE_SIZE_8M; - mem -= LARGE_PAGE_SIZE_8M; + if (IS_ENABLED(CONFIG_PIN_TLB_DATA)) { + unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000; + unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY; + int i = IS_ENABLED(CONFIG_PIN_TLB_IMMR) ? 29 : 28; + unsigned long addr = 0; + unsigned long mem = total_lowmem; + + for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) { + mtspr(SPRN_MD_CTR, ctr | (i << 8)); + mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID); + mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID); + mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT); + addr += LARGE_PAGE_SIZE_8M; + mem -= LARGE_PAGE_SIZE_8M; + } } -#endif } static void __init mmu_mapin_immr(void) @@ -98,26 +94,36 @@ static void __init mmu_mapin_immr(void) map_kernel_page(v + offset, p + offset, PAGE_KERNEL_NCG); } -static void __init mmu_patch_cmp_limit(s32 *site, unsigned long mapped) +static void mmu_patch_cmp_limit(s32 *site, unsigned long mapped) { modify_instruction_site(site, 0xffff, (unsigned long)__va(mapped) >> 16); } -unsigned long __init mmu_mapin_ram(unsigned long top) +static void mmu_patch_addis(s32 *site, long simm) +{ + unsigned int instr = *(unsigned int *)patch_site_addr(site); + + instr &= 0xffff0000; + instr |= ((unsigned long)simm) >> 16; + patch_instruction_site(site, instr); +} + +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { unsigned long mapped; if (__map_without_ltlbs) { mapped = 0; mmu_mapin_immr(); -#ifndef CONFIG_PIN_TLB_IMMR - patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP); -#endif -#ifndef CONFIG_PIN_TLB_TEXT - mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0); -#endif + if (!IS_ENABLED(CONFIG_PIN_TLB_IMMR)) + patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP); + if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0); } else { mapped = top & ~(LARGE_PAGE_SIZE_8M - 1); + if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, + _ALIGN(__pa(_einittext), 8 << 20)); } mmu_patch_cmp_limit(&patch__dtlbmiss_linmem_top, mapped); @@ -138,6 +144,26 @@ unsigned long __init mmu_mapin_ram(unsigned long top) return mapped; } +void mmu_mark_initmem_nx(void) +{ + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && CONFIG_ETEXT_SHIFT < 23) + mmu_patch_addis(&patch__itlbmiss_linmem_top8, + -((long)_etext & ~(LARGE_PAGE_SIZE_8M - 1))); + if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, __pa(_etext)); +} + +#ifdef CONFIG_STRICT_KERNEL_RWX +void mmu_mark_rodata_ro(void) +{ + if (CONFIG_DATA_SHIFT < 23) + mmu_patch_addis(&patch__dtlbmiss_romem_top8, + -__pa(((unsigned long)_sinittext) & + ~(LARGE_PAGE_SIZE_8M - 1))); + mmu_patch_addis(&patch__dtlbmiss_romem_top, -__pa(_sinittext)); +} +#endif + void __init setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { @@ -146,8 +172,8 @@ void __init setup_initial_memory_limit(phys_addr_t first_memblock_base, */ BUG_ON(first_memblock_base != 0); - /* 8xx can only access 24MB at the moment */ - memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01800000)); + /* 8xx can only access 32MB at the moment */ + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x02000000)); } /* @@ -162,14 +188,11 @@ void set_context(unsigned long id, pgd_t *pgd) { s16 offset = (s16)(__pa(swapper_pg_dir)); -#ifdef CONFIG_BDI_SWITCH - pgd_t **ptr = *(pgd_t ***)(KERNELBASE + 0xf0); - /* Context switch the PTE pointer for the Abatron BDI2000. * The PGDIR is passed as second argument. */ - *(ptr + 1) = pgd; -#endif + if (IS_ENABLED(CONFIG_BDI_SWITCH)) + abatron_pteptrs[1] = pgd; /* Register M_TWB will contain base address of level 1 table minus the * lower part of the kernel PGDIR base address, so that all accesses to diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile @@ -45,13 +45,10 @@ obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o -obj-$(CONFIG_PPC_PTDUMP) += dump_linuxpagetables.o -ifdef CONFIG_PPC_PTDUMP -obj-$(CONFIG_4xx) += dump_linuxpagetables-generic.o -obj-$(CONFIG_PPC_8xx) += dump_linuxpagetables-8xx.o -obj-$(CONFIG_PPC_BOOK3E_MMU) += dump_linuxpagetables-generic.o -obj-$(CONFIG_PPC_BOOK3S_32) += dump_linuxpagetables-generic.o dump_bats.o dump_sr.o -obj-$(CONFIG_PPC_BOOK3S_64) += dump_linuxpagetables-book3s64.o -endif -obj-$(CONFIG_PPC_HTDUMP) += dump_hashpagetable.o +obj-$(CONFIG_PPC_PTDUMP) += ptdump/ obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o + +# Disable kcov instrumentation on sensitive code +# This is necessary for booting with kcov enabled on book3e machines +KCOV_INSTRUMENT_tlb_nohash.o := n +KCOV_INSTRUMENT_fsl_booke_mmu.o := n diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c @@ -30,6 +30,7 @@ #include <linux/types.h> #include <linux/highmem.h> #include <linux/dma-direct.h> +#include <linux/dma-noncoherent.h> #include <linux/export.h> #include <asm/tlbflush.h> @@ -151,8 +152,8 @@ static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsi * Allocate DMA-coherent memory space and return both the kernel remapped * virtual and bus address for that space. */ -void *__dma_nommu_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) +void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, unsigned long attrs) { struct page *page; struct ppc_vm_region *c; @@ -253,7 +254,7 @@ void *__dma_nommu_alloc_coherent(struct device *dev, size_t size, /* * free a page as defined by the above mapping. */ -void __dma_nommu_free_coherent(struct device *dev, size_t size, void *vaddr, +void arch_dma_free(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs) { struct ppc_vm_region *c; @@ -313,7 +314,7 @@ void __dma_nommu_free_coherent(struct device *dev, size_t size, void *vaddr, /* * make an area consistent. */ -void __dma_sync(void *vaddr, size_t size, int direction) +static void __dma_sync(void *vaddr, size_t size, int direction) { unsigned long start = (unsigned long)vaddr; unsigned long end = start + size; @@ -339,7 +340,6 @@ void __dma_sync(void *vaddr, size_t size, int direction) break; } } -EXPORT_SYMBOL(__dma_sync); #ifdef CONFIG_HIGHMEM /* @@ -386,28 +386,42 @@ static inline void __dma_sync_page_highmem(struct page *page, * __dma_sync_page makes memory consistent. identical to __dma_sync, but * takes a struct page instead of a virtual address */ -void __dma_sync_page(struct page *page, unsigned long offset, - size_t size, int direction) +static void __dma_sync_page(phys_addr_t paddr, size_t size, int dir) { + struct page *page = pfn_to_page(paddr >> PAGE_SHIFT); + unsigned offset = paddr & ~PAGE_MASK; + #ifdef CONFIG_HIGHMEM - __dma_sync_page_highmem(page, offset, size, direction); + __dma_sync_page_highmem(page, offset, size, dir); #else unsigned long start = (unsigned long)page_address(page) + offset; - __dma_sync((void *)start, size, direction); + __dma_sync((void *)start, size, dir); #endif } -EXPORT_SYMBOL(__dma_sync_page); + +void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, + size_t size, enum dma_data_direction dir) +{ + __dma_sync_page(paddr, size, dir); +} + +void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, + size_t size, enum dma_data_direction dir) +{ + __dma_sync_page(paddr, size, dir); +} /* - * Return the PFN for a given cpu virtual address returned by - * __dma_nommu_alloc_coherent. This is used by dma_mmap_coherent() + * Return the PFN for a given cpu virtual address returned by arch_dma_alloc. */ -unsigned long __dma_get_coherent_pfn(unsigned long cpu_addr) +long arch_dma_coherent_to_pfn(struct device *dev, void *vaddr, + dma_addr_t dma_addr) { /* This should always be populated, so we don't test every * level. If that fails, we'll have a nice crash which * will be as good as a BUG_ON() */ + unsigned long cpu_addr = (unsigned long)vaddr; pgd_t *pgd = pgd_offset_k(cpu_addr); pud_t *pud = pud_offset(pgd, cpu_addr); pmd_t *pmd = pmd_offset(pud, cpu_addr); diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c @@ -1,550 +0,0 @@ -/* - * Copyright 2016, Rashmica Gupta, IBM Corp. - * - * This traverses the kernel virtual memory and dumps the pages that are in - * the hash pagetable, along with their flags to - * /sys/kernel/debug/kernel_hash_pagetable. - * - * If radix is enabled then there is no hash page table and so no debugfs file - * is generated. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ -#include <linux/debugfs.h> -#include <linux/fs.h> -#include <linux/io.h> -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/seq_file.h> -#include <asm/pgtable.h> -#include <linux/const.h> -#include <asm/page.h> -#include <asm/pgalloc.h> -#include <asm/plpar_wrappers.h> -#include <linux/memblock.h> -#include <asm/firmware.h> - -struct pg_state { - struct seq_file *seq; - const struct addr_marker *marker; - unsigned long start_address; - unsigned int level; - u64 current_flags; -}; - -struct addr_marker { - unsigned long start_address; - const char *name; -}; - -static struct addr_marker address_markers[] = { - { 0, "Start of kernel VM" }, - { 0, "vmalloc() Area" }, - { 0, "vmalloc() End" }, - { 0, "isa I/O start" }, - { 0, "isa I/O end" }, - { 0, "phb I/O start" }, - { 0, "phb I/O end" }, - { 0, "I/O remap start" }, - { 0, "I/O remap end" }, - { 0, "vmemmap start" }, - { -1, NULL }, -}; - -struct flag_info { - u64 mask; - u64 val; - const char *set; - const char *clear; - bool is_val; - int shift; -}; - -static const struct flag_info v_flag_array[] = { - { - .mask = SLB_VSID_B, - .val = SLB_VSID_B_256M, - .set = "ssize: 256M", - .clear = "ssize: 1T ", - }, { - .mask = HPTE_V_SECONDARY, - .val = HPTE_V_SECONDARY, - .set = "secondary", - .clear = "primary ", - }, { - .mask = HPTE_V_VALID, - .val = HPTE_V_VALID, - .set = "valid ", - .clear = "invalid", - }, { - .mask = HPTE_V_BOLTED, - .val = HPTE_V_BOLTED, - .set = "bolted", - .clear = "", - } -}; - -static const struct flag_info r_flag_array[] = { - { - .mask = HPTE_R_PP0 | HPTE_R_PP, - .val = PP_RWXX, - .set = "prot:RW--", - }, { - .mask = HPTE_R_PP0 | HPTE_R_PP, - .val = PP_RWRX, - .set = "prot:RWR-", - }, { - .mask = HPTE_R_PP0 | HPTE_R_PP, - .val = PP_RWRW, - .set = "prot:RWRW", - }, { - .mask = HPTE_R_PP0 | HPTE_R_PP, - .val = PP_RXRX, - .set = "prot:R-R-", - }, { - .mask = HPTE_R_PP0 | HPTE_R_PP, - .val = PP_RXXX, - .set = "prot:R---", - }, { - .mask = HPTE_R_KEY_HI | HPTE_R_KEY_LO, - .val = HPTE_R_KEY_HI | HPTE_R_KEY_LO, - .set = "key", - .clear = "", - .is_val = true, - }, { - .mask = HPTE_R_R, - .val = HPTE_R_R, - .set = "ref", - .clear = " ", - }, { - .mask = HPTE_R_C, - .val = HPTE_R_C, - .set = "changed", - .clear = " ", - }, { - .mask = HPTE_R_N, - .val = HPTE_R_N, - .set = "no execute", - }, { - .mask = HPTE_R_WIMG, - .val = HPTE_R_W, - .set = "writethru", - }, { - .mask = HPTE_R_WIMG, - .val = HPTE_R_I, - .set = "no cache", - }, { - .mask = HPTE_R_WIMG, - .val = HPTE_R_G, - .set = "guarded", - } -}; - -static int calculate_pagesize(struct pg_state *st, int ps, char s[]) -{ - static const char units[] = "BKMGTPE"; - const char *unit = units; - - while (ps > 9 && unit[1]) { - ps -= 10; - unit++; - } - seq_printf(st->seq, " %s_ps: %i%c\t", s, 1<<ps, *unit); - return ps; -} - -static void dump_flag_info(struct pg_state *st, const struct flag_info - *flag, u64 pte, int num) -{ - unsigned int i; - - for (i = 0; i < num; i++, flag++) { - const char *s = NULL; - u64 val; - - /* flag not defined so don't check it */ - if (flag->mask == 0) - continue; - /* Some 'flags' are actually values */ - if (flag->is_val) { - val = pte & flag->val; - if (flag->shift) - val = val >> flag->shift; - seq_printf(st->seq, " %s:%llx", flag->set, val); - } else { - if ((pte & flag->mask) == flag->val) - s = flag->set; - else - s = flag->clear; - if (s) - seq_printf(st->seq, " %s", s); - } - } -} - -static void dump_hpte_info(struct pg_state *st, unsigned long ea, u64 v, u64 r, - unsigned long rpn, int bps, int aps, unsigned long lp) -{ - int aps_index; - - while (ea >= st->marker[1].start_address) { - st->marker++; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); - } - seq_printf(st->seq, "0x%lx:\t", ea); - seq_printf(st->seq, "AVPN:%llx\t", HPTE_V_AVPN_VAL(v)); - dump_flag_info(st, v_flag_array, v, ARRAY_SIZE(v_flag_array)); - seq_printf(st->seq, " rpn: %lx\t", rpn); - dump_flag_info(st, r_flag_array, r, ARRAY_SIZE(r_flag_array)); - - calculate_pagesize(st, bps, "base"); - aps_index = calculate_pagesize(st, aps, "actual"); - if (aps_index != 2) - seq_printf(st->seq, "LP enc: %lx", lp); - seq_putc(st->seq, '\n'); -} - - -static int native_find(unsigned long ea, int psize, bool primary, u64 *v, u64 - *r) -{ - struct hash_pte *hptep; - unsigned long hash, vsid, vpn, hpte_group, want_v, hpte_v; - int i, ssize = mmu_kernel_ssize; - unsigned long shift = mmu_psize_defs[psize].shift; - - /* calculate hash */ - vsid = get_kernel_vsid(ea, ssize); - vpn = hpt_vpn(ea, vsid, ssize); - hash = hpt_hash(vpn, shift, ssize); - want_v = hpte_encode_avpn(vpn, psize, ssize); - - /* to check in the secondary hash table, we invert the hash */ - if (!primary) - hash = ~hash; - hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; - for (i = 0; i < HPTES_PER_GROUP; i++) { - hptep = htab_address + hpte_group; - hpte_v = be64_to_cpu(hptep->v); - - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { - /* HPTE matches */ - *v = be64_to_cpu(hptep->v); - *r = be64_to_cpu(hptep->r); - return 0; - } - ++hpte_group; - } - return -1; -} - -#ifdef CONFIG_PPC_PSERIES -static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *r) -{ - struct hash_pte ptes[4]; - unsigned long vsid, vpn, hash, hpte_group, want_v; - int i, j, ssize = mmu_kernel_ssize; - long lpar_rc = 0; - unsigned long shift = mmu_psize_defs[psize].shift; - - /* calculate hash */ - vsid = get_kernel_vsid(ea, ssize); - vpn = hpt_vpn(ea, vsid, ssize); - hash = hpt_hash(vpn, shift, ssize); - want_v = hpte_encode_avpn(vpn, psize, ssize); - - /* to check in the secondary hash table, we invert the hash */ - if (!primary) - hash = ~hash; - hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; - /* see if we can find an entry in the hpte with this hash */ - for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) { - lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes); - - if (lpar_rc != H_SUCCESS) - continue; - for (j = 0; j < 4; j++) { - if (HPTE_V_COMPARE(ptes[j].v, want_v) && - (ptes[j].v & HPTE_V_VALID)) { - /* HPTE matches */ - *v = ptes[j].v; - *r = ptes[j].r; - return 0; - } - } - } - return -1; -} -#endif - -static void decode_r(int bps, unsigned long r, unsigned long *rpn, int *aps, - unsigned long *lp_bits) -{ - struct mmu_psize_def entry; - unsigned long arpn, mask, lp; - int penc = -2, idx = 0, shift; - - /*. - * The LP field has 8 bits. Depending on the actual page size, some of - * these bits are concatenated with the APRN to get the RPN. The rest - * of the bits in the LP field is the LP value and is an encoding for - * the base page size and the actual page size. - * - * - find the mmu entry for our base page size - * - go through all page encodings and use the associated mask to - * find an encoding that matches our encoding in the LP field. - */ - arpn = (r & HPTE_R_RPN) >> HPTE_R_RPN_SHIFT; - lp = arpn & 0xff; - - entry = mmu_psize_defs[bps]; - while (idx < MMU_PAGE_COUNT) { - penc = entry.penc[idx]; - if ((penc != -1) && (mmu_psize_defs[idx].shift)) { - shift = mmu_psize_defs[idx].shift - HPTE_R_RPN_SHIFT; - mask = (0x1 << (shift)) - 1; - if ((lp & mask) == penc) { - *aps = mmu_psize_to_shift(idx); - *lp_bits = lp & mask; - *rpn = arpn >> shift; - return; - } - } - idx++; - } -} - -static int base_hpte_find(unsigned long ea, int psize, bool primary, u64 *v, - u64 *r) -{ -#ifdef CONFIG_PPC_PSERIES - if (firmware_has_feature(FW_FEATURE_LPAR)) - return pseries_find(ea, psize, primary, v, r); -#endif - return native_find(ea, psize, primary, v, r); -} - -static unsigned long hpte_find(struct pg_state *st, unsigned long ea, int psize) -{ - unsigned long slot; - u64 v = 0, r = 0; - unsigned long rpn, lp_bits; - int base_psize = 0, actual_psize = 0; - - if (ea < PAGE_OFFSET) - return -1; - - /* Look in primary table */ - slot = base_hpte_find(ea, psize, true, &v, &r); - - /* Look in secondary table */ - if (slot == -1) - slot = base_hpte_find(ea, psize, true, &v, &r); - - /* No entry found */ - if (slot == -1) - return -1; - - /* - * We found an entry in the hash page table: - * - check that this has the same base page - * - find the actual page size - * - find the RPN - */ - base_psize = mmu_psize_to_shift(psize); - - if ((v & HPTE_V_LARGE) == HPTE_V_LARGE) { - decode_r(psize, r, &rpn, &actual_psize, &lp_bits); - } else { - /* 4K actual page size */ - actual_psize = 12; - rpn = (r & HPTE_R_RPN) >> HPTE_R_RPN_SHIFT; - /* In this case there are no LP bits */ - lp_bits = -1; - } - /* - * We didn't find a matching encoding, so the PTE we found isn't for - * this address. - */ - if (actual_psize == -1) - return -1; - - dump_hpte_info(st, ea, v, r, rpn, base_psize, actual_psize, lp_bits); - return 0; -} - -static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start) -{ - pte_t *pte = pte_offset_kernel(pmd, 0); - unsigned long addr, pteval, psize; - int i, status; - - for (i = 0; i < PTRS_PER_PTE; i++, pte++) { - addr = start + i * PAGE_SIZE; - pteval = pte_val(*pte); - - if (addr < VMALLOC_END) - psize = mmu_vmalloc_psize; - else - psize = mmu_io_psize; -#ifdef CONFIG_PPC_64K_PAGES - /* check for secret 4K mappings */ - if (((pteval & H_PAGE_COMBO) == H_PAGE_COMBO) || - ((pteval & H_PAGE_4K_PFN) == H_PAGE_4K_PFN)) - psize = mmu_io_psize; -#endif - /* check for hashpte */ - status = hpte_find(st, addr, psize); - - if (((pteval & H_PAGE_HASHPTE) != H_PAGE_HASHPTE) - && (status != -1)) { - /* found a hpte that is not in the linux page tables */ - seq_printf(st->seq, "page probably bolted before linux" - " pagetables were set: addr:%lx, pteval:%lx\n", - addr, pteval); - } - } -} - -static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) -{ - pmd_t *pmd = pmd_offset(pud, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PMD; i++, pmd++) { - addr = start + i * PMD_SIZE; - if (!pmd_none(*pmd)) - /* pmd exists */ - walk_pte(st, pmd, addr); - } -} - -static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) -{ - pud_t *pud = pud_offset(pgd, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PUD; i++, pud++) { - addr = start + i * PUD_SIZE; - if (!pud_none(*pud)) - /* pud exists */ - walk_pmd(st, pud, addr); - } -} - -static void walk_pagetables(struct pg_state *st) -{ - pgd_t *pgd = pgd_offset_k(0UL); - unsigned int i; - unsigned long addr; - - /* - * Traverse the linux pagetable structure and dump pages that are in - * the hash pagetable. - */ - for (i = 0; i < PTRS_PER_PGD; i++, pgd++) { - addr = KERN_VIRT_START + i * PGDIR_SIZE; - if (!pgd_none(*pgd)) - /* pgd exists */ - walk_pud(st, pgd, addr); - } -} - - -static void walk_linearmapping(struct pg_state *st) -{ - unsigned long addr; - - /* - * Traverse the linear mapping section of virtual memory and dump pages - * that are in the hash pagetable. - */ - unsigned long psize = 1 << mmu_psize_defs[mmu_linear_psize].shift; - - for (addr = PAGE_OFFSET; addr < PAGE_OFFSET + - memblock_end_of_DRAM(); addr += psize) - hpte_find(st, addr, mmu_linear_psize); -} - -static void walk_vmemmap(struct pg_state *st) -{ -#ifdef CONFIG_SPARSEMEM_VMEMMAP - struct vmemmap_backing *ptr = vmemmap_list; - - /* - * Traverse the vmemmaped memory and dump pages that are in the hash - * pagetable. - */ - while (ptr->list) { - hpte_find(st, ptr->virt_addr, mmu_vmemmap_psize); - ptr = ptr->list; - } - seq_puts(st->seq, "---[ vmemmap end ]---\n"); -#endif -} - -static void populate_markers(void) -{ - address_markers[0].start_address = PAGE_OFFSET; - address_markers[1].start_address = VMALLOC_START; - address_markers[2].start_address = VMALLOC_END; - address_markers[3].start_address = ISA_IO_BASE; - address_markers[4].start_address = ISA_IO_END; - address_markers[5].start_address = PHB_IO_BASE; - address_markers[6].start_address = PHB_IO_END; - address_markers[7].start_address = IOREMAP_BASE; - address_markers[8].start_address = IOREMAP_END; -#ifdef CONFIG_PPC_BOOK3S_64 - address_markers[9].start_address = H_VMEMMAP_BASE; -#else - address_markers[9].start_address = VMEMMAP_BASE; -#endif -} - -static int ptdump_show(struct seq_file *m, void *v) -{ - struct pg_state st = { - .seq = m, - .start_address = PAGE_OFFSET, - .marker = address_markers, - }; - /* - * Traverse the 0xc, 0xd and 0xf areas of the kernel virtual memory and - * dump pages that are in the hash pagetable. - */ - walk_linearmapping(&st); - walk_pagetables(&st); - walk_vmemmap(&st); - return 0; -} - -static int ptdump_open(struct inode *inode, struct file *file) -{ - return single_open(file, ptdump_show, NULL); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int ptdump_init(void) -{ - struct dentry *debugfs_file; - - if (!radix_enabled()) { - populate_markers(); - debugfs_file = debugfs_create_file("kernel_hash_pagetable", - 0400, NULL, NULL, &ptdump_fops); - return debugfs_file ? 0 : -ENOMEM; - } - return 0; -} -device_initcall(ptdump_init); diff --git a/arch/powerpc/mm/dump_linuxpagetables-8xx.c b/arch/powerpc/mm/dump_linuxpagetables-8xx.c @@ -1,82 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * From split of dump_linuxpagetables.c - * Copyright 2016, Rashmica Gupta, IBM Corp. - * - */ -#include <linux/kernel.h> -#include <asm/pgtable.h> - -#include "dump_linuxpagetables.h" - -static const struct flag_info flag_array[] = { - { - .mask = _PAGE_SH, - .val = 0, - .set = "user", - .clear = " ", - }, { - .mask = _PAGE_RO | _PAGE_NA, - .val = 0, - .set = "rw", - }, { - .mask = _PAGE_RO | _PAGE_NA, - .val = _PAGE_RO, - .set = "r ", - }, { - .mask = _PAGE_RO | _PAGE_NA, - .val = _PAGE_NA, - .set = " ", - }, { - .mask = _PAGE_EXEC, - .val = _PAGE_EXEC, - .set = " X ", - .clear = " ", - }, { - .mask = _PAGE_PRESENT, - .val = _PAGE_PRESENT, - .set = "present", - .clear = " ", - }, { - .mask = _PAGE_GUARDED, - .val = _PAGE_GUARDED, - .set = "guarded", - .clear = " ", - }, { - .mask = _PAGE_DIRTY, - .val = _PAGE_DIRTY, - .set = "dirty", - .clear = " ", - }, { - .mask = _PAGE_ACCESSED, - .val = _PAGE_ACCESSED, - .set = "accessed", - .clear = " ", - }, { - .mask = _PAGE_NO_CACHE, - .val = _PAGE_NO_CACHE, - .set = "no cache", - .clear = " ", - }, { - .mask = _PAGE_SPECIAL, - .val = _PAGE_SPECIAL, - .set = "special", - } -}; - -struct pgtable_level pg_level[5] = { - { - }, { /* pgd */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, { /* pud */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, { /* pmd */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, { /* pte */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, -}; diff --git a/arch/powerpc/mm/dump_linuxpagetables-book3s64.c b/arch/powerpc/mm/dump_linuxpagetables-book3s64.c @@ -1,120 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * From split of dump_linuxpagetables.c - * Copyright 2016, Rashmica Gupta, IBM Corp. - * - */ -#include <linux/kernel.h> -#include <asm/pgtable.h> - -#include "dump_linuxpagetables.h" - -static const struct flag_info flag_array[] = { - { - .mask = _PAGE_PRIVILEGED, - .val = 0, - .set = "user", - .clear = " ", - }, { - .mask = _PAGE_READ, - .val = _PAGE_READ, - .set = "r", - .clear = " ", - }, { - .mask = _PAGE_WRITE, - .val = _PAGE_WRITE, - .set = "w", - .clear = " ", - }, { - .mask = _PAGE_EXEC, - .val = _PAGE_EXEC, - .set = " X ", - .clear = " ", - }, { - .mask = _PAGE_PTE, - .val = _PAGE_PTE, - .set = "pte", - .clear = " ", - }, { - .mask = _PAGE_PRESENT, - .val = _PAGE_PRESENT, - .set = "valid", - .clear = " ", - }, { - .mask = _PAGE_PRESENT | _PAGE_INVALID, - .val = 0, - .set = " ", - .clear = "present", - }, { - .mask = H_PAGE_HASHPTE, - .val = H_PAGE_HASHPTE, - .set = "hpte", - .clear = " ", - }, { - .mask = _PAGE_DIRTY, - .val = _PAGE_DIRTY, - .set = "dirty", - .clear = " ", - }, { - .mask = _PAGE_ACCESSED, - .val = _PAGE_ACCESSED, - .set = "accessed", - .clear = " ", - }, { - .mask = _PAGE_NON_IDEMPOTENT, - .val = _PAGE_NON_IDEMPOTENT, - .set = "non-idempotent", - .clear = " ", - }, { - .mask = _PAGE_TOLERANT, - .val = _PAGE_TOLERANT, - .set = "tolerant", - .clear = " ", - }, { - .mask = H_PAGE_BUSY, - .val = H_PAGE_BUSY, - .set = "busy", - }, { -#ifdef CONFIG_PPC_64K_PAGES - .mask = H_PAGE_COMBO, - .val = H_PAGE_COMBO, - .set = "combo", - }, { - .mask = H_PAGE_4K_PFN, - .val = H_PAGE_4K_PFN, - .set = "4K_pfn", - }, { -#else /* CONFIG_PPC_64K_PAGES */ - .mask = H_PAGE_F_GIX, - .val = H_PAGE_F_GIX, - .set = "f_gix", - .is_val = true, - .shift = H_PAGE_F_GIX_SHIFT, - }, { - .mask = H_PAGE_F_SECOND, - .val = H_PAGE_F_SECOND, - .set = "f_second", - }, { -#endif /* CONFIG_PPC_64K_PAGES */ - .mask = _PAGE_SPECIAL, - .val = _PAGE_SPECIAL, - .set = "special", - } -}; - -struct pgtable_level pg_level[5] = { - { - }, { /* pgd */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, { /* pud */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, { /* pmd */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, { /* pte */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, -}; diff --git a/arch/powerpc/mm/dump_linuxpagetables-generic.c b/arch/powerpc/mm/dump_linuxpagetables-generic.c @@ -1,80 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * From split of dump_linuxpagetables.c - * Copyright 2016, Rashmica Gupta, IBM Corp. - * - */ -#include <linux/kernel.h> -#include <asm/pgtable.h> - -#include "dump_linuxpagetables.h" - -static const struct flag_info flag_array[] = { - { - .mask = _PAGE_USER, - .val = _PAGE_USER, - .set = "user", - .clear = " ", - }, { - .mask = _PAGE_RW, - .val = _PAGE_RW, - .set = "rw", - .clear = "r ", - }, { - .mask = _PAGE_EXEC, - .val = _PAGE_EXEC, - .set = " X ", - .clear = " ", - }, { - .mask = _PAGE_PRESENT, - .val = _PAGE_PRESENT, - .set = "present", - .clear = " ", - }, { - .mask = _PAGE_GUARDED, - .val = _PAGE_GUARDED, - .set = "guarded", - .clear = " ", - }, { - .mask = _PAGE_DIRTY, - .val = _PAGE_DIRTY, - .set = "dirty", - .clear = " ", - }, { - .mask = _PAGE_ACCESSED, - .val = _PAGE_ACCESSED, - .set = "accessed", - .clear = " ", - }, { - .mask = _PAGE_WRITETHRU, - .val = _PAGE_WRITETHRU, - .set = "write through", - .clear = " ", - }, { - .mask = _PAGE_NO_CACHE, - .val = _PAGE_NO_CACHE, - .set = "no cache", - .clear = " ", - }, { - .mask = _PAGE_SPECIAL, - .val = _PAGE_SPECIAL, - .set = "special", - } -}; - -struct pgtable_level pg_level[5] = { - { - }, { /* pgd */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, { /* pud */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, { /* pmd */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, { /* pte */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, -}; diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c @@ -1,373 +0,0 @@ -/* - * Copyright 2016, Rashmica Gupta, IBM Corp. - * - * This traverses the kernel pagetables and dumps the - * information about the used sections of memory to - * /sys/kernel/debug/kernel_pagetables. - * - * Derived from the arm64 implementation: - * Copyright (c) 2014, The Linux Foundation, Laura Abbott. - * (C) Copyright 2008 Intel Corporation, Arjan van de Ven. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ -#include <linux/debugfs.h> -#include <linux/fs.h> -#include <linux/hugetlb.h> -#include <linux/io.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/sched.h> -#include <linux/seq_file.h> -#include <asm/fixmap.h> -#include <asm/pgtable.h> -#include <linux/const.h> -#include <asm/page.h> -#include <asm/pgalloc.h> - -#include "dump_linuxpagetables.h" - -#ifdef CONFIG_PPC32 -#define KERN_VIRT_START 0 -#endif - -/* - * To visualise what is happening, - * - * - PTRS_PER_P** = how many entries there are in the corresponding P** - * - P**_SHIFT = how many bits of the address we use to index into the - * corresponding P** - * - P**_SIZE is how much memory we can access through the table - not the - * size of the table itself. - * P**={PGD, PUD, PMD, PTE} - * - * - * Each entry of the PGD points to a PUD. Each entry of a PUD points to a - * PMD. Each entry of a PMD points to a PTE. And every PTE entry points to - * a page. - * - * In the case where there are only 3 levels, the PUD is folded into the - * PGD: every PUD has only one entry which points to the PMD. - * - * The page dumper groups page table entries of the same type into a single - * description. It uses pg_state to track the range information while - * iterating over the PTE entries. When the continuity is broken it then - * dumps out a description of the range - ie PTEs that are virtually contiguous - * with the same PTE flags are chunked together. This is to make it clear how - * different areas of the kernel virtual memory are used. - * - */ -struct pg_state { - struct seq_file *seq; - const struct addr_marker *marker; - unsigned long start_address; - unsigned long start_pa; - unsigned long last_pa; - unsigned int level; - u64 current_flags; -}; - -struct addr_marker { - unsigned long start_address; - const char *name; -}; - -static struct addr_marker address_markers[] = { - { 0, "Start of kernel VM" }, - { 0, "vmalloc() Area" }, - { 0, "vmalloc() End" }, -#ifdef CONFIG_PPC64 - { 0, "isa I/O start" }, - { 0, "isa I/O end" }, - { 0, "phb I/O start" }, - { 0, "phb I/O end" }, - { 0, "I/O remap start" }, - { 0, "I/O remap end" }, - { 0, "vmemmap start" }, -#else - { 0, "Early I/O remap start" }, - { 0, "Early I/O remap end" }, -#ifdef CONFIG_NOT_COHERENT_CACHE - { 0, "Consistent mem start" }, - { 0, "Consistent mem end" }, -#endif -#ifdef CONFIG_HIGHMEM - { 0, "Highmem PTEs start" }, - { 0, "Highmem PTEs end" }, -#endif - { 0, "Fixmap start" }, - { 0, "Fixmap end" }, -#endif - { -1, NULL }, -}; - -static void dump_flag_info(struct pg_state *st, const struct flag_info - *flag, u64 pte, int num) -{ - unsigned int i; - - for (i = 0; i < num; i++, flag++) { - const char *s = NULL; - u64 val; - - /* flag not defined so don't check it */ - if (flag->mask == 0) - continue; - /* Some 'flags' are actually values */ - if (flag->is_val) { - val = pte & flag->val; - if (flag->shift) - val = val >> flag->shift; - seq_printf(st->seq, " %s:%llx", flag->set, val); - } else { - if ((pte & flag->mask) == flag->val) - s = flag->set; - else - s = flag->clear; - if (s) - seq_printf(st->seq, " %s", s); - } - st->current_flags &= ~flag->mask; - } - if (st->current_flags != 0) - seq_printf(st->seq, " unknown flags:%llx", st->current_flags); -} - -static void dump_addr(struct pg_state *st, unsigned long addr) -{ - static const char units[] = "KMGTPE"; - const char *unit = units; - unsigned long delta; - -#ifdef CONFIG_PPC64 - seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1); - seq_printf(st->seq, "0x%016lx ", st->start_pa); -#else - seq_printf(st->seq, "0x%08lx-0x%08lx ", st->start_address, addr - 1); - seq_printf(st->seq, "0x%08lx ", st->start_pa); -#endif - - delta = (addr - st->start_address) >> 10; - /* Work out what appropriate unit to use */ - while (!(delta & 1023) && unit[1]) { - delta >>= 10; - unit++; - } - seq_printf(st->seq, "%9lu%c", delta, *unit); - -} - -static void note_page(struct pg_state *st, unsigned long addr, - unsigned int level, u64 val) -{ - u64 flag = val & pg_level[level].mask; - u64 pa = val & PTE_RPN_MASK; - - /* At first no level is set */ - if (!st->level) { - st->level = level; - st->current_flags = flag; - st->start_address = addr; - st->start_pa = pa; - st->last_pa = pa; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); - /* - * Dump the section of virtual memory when: - * - the PTE flags from one entry to the next differs. - * - we change levels in the tree. - * - the address is in a different section of memory and is thus - * used for a different purpose, regardless of the flags. - * - the pa of this page is not adjacent to the last inspected page - */ - } else if (flag != st->current_flags || level != st->level || - addr >= st->marker[1].start_address || - pa != st->last_pa + PAGE_SIZE) { - - /* Check the PTE flags */ - if (st->current_flags) { - dump_addr(st, addr); - - /* Dump all the flags */ - if (pg_level[st->level].flag) - dump_flag_info(st, pg_level[st->level].flag, - st->current_flags, - pg_level[st->level].num); - - seq_putc(st->seq, '\n'); - } - - /* - * Address indicates we have passed the end of the - * current section of virtual memory - */ - while (addr >= st->marker[1].start_address) { - st->marker++; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); - } - st->start_address = addr; - st->start_pa = pa; - st->last_pa = pa; - st->current_flags = flag; - st->level = level; - } else { - st->last_pa = pa; - } -} - -static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start) -{ - pte_t *pte = pte_offset_kernel(pmd, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PTE; i++, pte++) { - addr = start + i * PAGE_SIZE; - note_page(st, addr, 4, pte_val(*pte)); - - } -} - -static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) -{ - pmd_t *pmd = pmd_offset(pud, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PMD; i++, pmd++) { - addr = start + i * PMD_SIZE; - if (!pmd_none(*pmd) && !pmd_huge(*pmd)) - /* pmd exists */ - walk_pte(st, pmd, addr); - else - note_page(st, addr, 3, pmd_val(*pmd)); - } -} - -static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) -{ - pud_t *pud = pud_offset(pgd, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PUD; i++, pud++) { - addr = start + i * PUD_SIZE; - if (!pud_none(*pud) && !pud_huge(*pud)) - /* pud exists */ - walk_pmd(st, pud, addr); - else - note_page(st, addr, 2, pud_val(*pud)); - } -} - -static void walk_pagetables(struct pg_state *st) -{ - pgd_t *pgd = pgd_offset_k(0UL); - unsigned int i; - unsigned long addr; - - addr = st->start_address; - - /* - * Traverse the linux pagetable structure and dump pages that are in - * the hash pagetable. - */ - for (i = 0; i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { - if (!pgd_none(*pgd) && !pgd_huge(*pgd)) - /* pgd exists */ - walk_pud(st, pgd, addr); - else - note_page(st, addr, 1, pgd_val(*pgd)); - } -} - -static void populate_markers(void) -{ - int i = 0; - - address_markers[i++].start_address = PAGE_OFFSET; - address_markers[i++].start_address = VMALLOC_START; - address_markers[i++].start_address = VMALLOC_END; -#ifdef CONFIG_PPC64 - address_markers[i++].start_address = ISA_IO_BASE; - address_markers[i++].start_address = ISA_IO_END; - address_markers[i++].start_address = PHB_IO_BASE; - address_markers[i++].start_address = PHB_IO_END; - address_markers[i++].start_address = IOREMAP_BASE; - address_markers[i++].start_address = IOREMAP_END; -#ifdef CONFIG_PPC_BOOK3S_64 - address_markers[i++].start_address = H_VMEMMAP_BASE; -#else - address_markers[i++].start_address = VMEMMAP_BASE; -#endif -#else /* !CONFIG_PPC64 */ - address_markers[i++].start_address = ioremap_bot; - address_markers[i++].start_address = IOREMAP_TOP; -#ifdef CONFIG_NOT_COHERENT_CACHE - address_markers[i++].start_address = IOREMAP_TOP; - address_markers[i++].start_address = IOREMAP_TOP + - CONFIG_CONSISTENT_SIZE; -#endif -#ifdef CONFIG_HIGHMEM - address_markers[i++].start_address = PKMAP_BASE; - address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP); -#endif - address_markers[i++].start_address = FIXADDR_START; - address_markers[i++].start_address = FIXADDR_TOP; -#endif /* CONFIG_PPC64 */ -} - -static int ptdump_show(struct seq_file *m, void *v) -{ - struct pg_state st = { - .seq = m, - .marker = address_markers, - }; - - if (radix_enabled()) - st.start_address = PAGE_OFFSET; - else - st.start_address = KERN_VIRT_START; - - /* Traverse kernel page tables */ - walk_pagetables(&st); - note_page(&st, 0, 0, 0); - return 0; -} - - -static int ptdump_open(struct inode *inode, struct file *file) -{ - return single_open(file, ptdump_show, NULL); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static void build_pgtable_complete_mask(void) -{ - unsigned int i, j; - - for (i = 0; i < ARRAY_SIZE(pg_level); i++) - if (pg_level[i].flag) - for (j = 0; j < pg_level[i].num; j++) - pg_level[i].mask |= pg_level[i].flag[j].mask; -} - -static int ptdump_init(void) -{ - struct dentry *debugfs_file; - - populate_markers(); - build_pgtable_complete_mask(); - debugfs_file = debugfs_create_file("kernel_page_tables", 0400, NULL, - NULL, &ptdump_fops); - return debugfs_file ? 0 : -ENOMEM; -} -device_initcall(ptdump_init); diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c @@ -221,7 +221,7 @@ unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun) #error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" #endif -unsigned long __init mmu_mapin_ram(unsigned long top) +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1; } diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S @@ -47,14 +47,13 @@ mmu_hash_lock: * Returns to the caller if the access is illegal or there is no * mapping for the address. Otherwise it places an appropriate PTE * in the hash table and returns from the exception. - * Uses r0, r3 - r8, r10, ctr, lr. + * Uses r0, r3 - r6, r8, r10, ctr, lr. */ .text _GLOBAL(hash_page) - tophys(r7,0) /* gets -KERNELBASE into r7 */ #ifdef CONFIG_SMP - addis r8,r7,mmu_hash_lock@h - ori r8,r8,mmu_hash_lock@l + lis r8, (mmu_hash_lock - PAGE_OFFSET)@h + ori r8, r8, (mmu_hash_lock - PAGE_OFFSET)@l lis r0,0x0fff b 10f 11: lwz r6,0(r8) @@ -70,14 +69,13 @@ _GLOBAL(hash_page) /* Get PTE (linux-style) and check access */ lis r0,KERNELBASE@h /* check if kernel address */ cmplw 0,r4,r0 - mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ - lwz r5,PGDIR(r8) /* virt page-table root */ + mfspr r5, SPRN_SPRG_PGDIR /* virt page-table root */ blt+ 112f /* assume user more likely */ lis r5,swapper_pg_dir@ha /* if kernel address, use */ addi r5,r5,swapper_pg_dir@l /* kernel page table */ rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ -112: add r5,r5,r7 /* convert to phys addr */ +112: tophys(r5, r5) #ifndef CONFIG_PTE_64BIT rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */ lwz r8,0(r5) /* get pmd entry */ @@ -144,25 +142,24 @@ retry: #ifdef CONFIG_SMP eieio - addis r8,r7,mmu_hash_lock@ha + lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha li r0,0 - stw r0,mmu_hash_lock@l(r8) + stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) #endif /* Return from the exception */ lwz r5,_CTR(r11) mtctr r5 lwz r0,GPR0(r11) - lwz r7,GPR7(r11) lwz r8,GPR8(r11) b fast_exception_return #ifdef CONFIG_SMP hash_page_out: eieio - addis r8,r7,mmu_hash_lock@ha + lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha li r0,0 - stw r0,mmu_hash_lock@l(r8) + stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) blr #endif /* CONFIG_SMP */ @@ -186,8 +183,7 @@ _GLOBAL(add_hash_page) add r3,r3,r0 /* note create_hpte trims to 24 bits */ #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r8, r1) /* use cpu number to make tag */ - lwz r8,TI_CPU(r8) /* to go in mmu_hash_lock */ + lwz r8,TASK_CPU(r2) /* to go in mmu_hash_lock */ oris r8,r8,12 #endif /* CONFIG_SMP */ @@ -208,11 +204,9 @@ _GLOBAL(add_hash_page) SYNC_601 isync - tophys(r7,0) - #ifdef CONFIG_SMP - addis r6,r7,mmu_hash_lock@ha - addi r6,r6,mmu_hash_lock@l + lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha + addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l 10: lwarx r0,0,r6 /* take the mmu_hash_lock */ cmpi 0,r0,0 bne- 11f @@ -257,8 +251,8 @@ _GLOBAL(add_hash_page) 9: #ifdef CONFIG_SMP - addis r6,r7,mmu_hash_lock@ha - addi r6,r6,mmu_hash_lock@l + lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha + addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l eieio li r0,0 stw r0,0(r6) /* clear mmu_hash_lock */ @@ -278,10 +272,8 @@ _GLOBAL(add_hash_page) * It is designed to be called with the MMU either on or off. * r3 contains the VSID, r4 contains the virtual address, * r5 contains the linux PTE, r6 contains the old value of the - * linux PTE (before setting _PAGE_HASHPTE) and r7 contains the - * offset to be added to addresses (0 if the MMU is on, - * -KERNELBASE if it is off). r10 contains the upper half of - * the PTE if CONFIG_PTE_64BIT. + * linux PTE (before setting _PAGE_HASHPTE). r10 contains the + * upper half of the PTE if CONFIG_PTE_64BIT. * On SMP, the caller should have the mmu_hash_lock held. * We assume that the caller has (or will) set the _PAGE_HASHPTE * bit in the linux PTE in memory. The value passed in r6 should @@ -342,7 +334,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) patch_site 1f, patch__hash_page_A1 patch_site 2f, patch__hash_page_A2 /* Get the address of the primary PTE group in the hash table (r3) */ -0: addis r0,r7,Hash_base@h /* base address of hash table */ +0: lis r0, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */ 1: rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ 2: rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ xor r3,r3,r0 /* make primary hash */ @@ -356,10 +348,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) beq+ 10f /* no PTE: go look for an empty slot */ tlbie r4 - addis r4,r7,htab_hash_searches@ha - lwz r6,htab_hash_searches@l(r4) + lis r4, (htab_hash_searches - PAGE_OFFSET)@ha + lwz r6, (htab_hash_searches - PAGE_OFFSET)@l(r4) addi r6,r6,1 /* count how many searches we do */ - stw r6,htab_hash_searches@l(r4) + stw r6, (htab_hash_searches - PAGE_OFFSET)@l(r4) /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ mtctr r0 @@ -391,10 +383,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) beq+ found_empty /* update counter of times that the primary PTEG is full */ - addis r4,r7,primary_pteg_full@ha - lwz r6,primary_pteg_full@l(r4) + lis r4, (primary_pteg_full - PAGE_OFFSET)@ha + lwz r6, (primary_pteg_full - PAGE_OFFSET)@l(r4) addi r6,r6,1 - stw r6,primary_pteg_full@l(r4) + stw r6, (primary_pteg_full - PAGE_OFFSET)@l(r4) patch_site 0f, patch__hash_page_C /* Search the secondary PTEG for an empty slot */ @@ -428,8 +420,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) * lockup here but that shouldn't happen */ -1: addis r4,r7,next_slot@ha /* get next evict slot */ - lwz r6,next_slot@l(r4) +1: lis r4, (next_slot - PAGE_OFFSET)@ha /* get next evict slot */ + lwz r6, (next_slot - PAGE_OFFSET)@l(r4) addi r6,r6,HPTE_SIZE /* search for candidate */ andi. r6,r6,7*HPTE_SIZE stw r6,next_slot@l(r4) @@ -501,8 +493,6 @@ htab_hash_searches: * We assume that there is a hash table in use (Hash != 0). */ _GLOBAL(flush_hash_pages) - tophys(r7,0) - /* * We disable interrupts here, even on UP, because we want * the _PAGE_HASHPTE bit to be a reliable indication of @@ -547,11 +537,9 @@ _GLOBAL(flush_hash_pages) SET_V(r11) /* set V (valid) bit */ #ifdef CONFIG_SMP - addis r9,r7,mmu_hash_lock@ha - addi r9,r9,mmu_hash_lock@l - CURRENT_THREAD_INFO(r8, r1) - add r8,r8,r7 - lwz r8,TI_CPU(r8) + lis r9, (mmu_hash_lock - PAGE_OFFSET)@ha + addi r9, r9, (mmu_hash_lock - PAGE_OFFSET)@l + lwz r8,TASK_CPU(r2) oris r8,r8,9 10: lwarx r0,0,r9 cmpi 0,r0,0 @@ -584,7 +572,7 @@ _GLOBAL(flush_hash_pages) patch_site 1f, patch__flush_hash_A1 patch_site 2f, patch__flush_hash_A2 /* Get the address of the primary PTE group in the hash table (r3) */ -0: addis r8,r7,Hash_base@h /* base address of hash table */ +0: lis r8, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */ 1: rlwimi r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ 2: rlwinm r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ xor r8,r0,r8 /* make primary hash */ @@ -646,8 +634,7 @@ EXPORT_SYMBOL(flush_hash_pages) */ _GLOBAL(_tlbie) #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r8, r1) - lwz r8,TI_CPU(r8) + lwz r8,TASK_CPU(r2) oris r8,r8,11 mfmsr r10 SYNC @@ -684,8 +671,7 @@ _GLOBAL(_tlbie) */ _GLOBAL(_tlbia) #if defined(CONFIG_SMP) - CURRENT_THREAD_INFO(r8, r1) - lwz r8,TI_CPU(r8) + lwz r8,TASK_CPU(r2) oris r8,r8,10 mfmsr r10 SYNC diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c @@ -1889,12 +1889,12 @@ static int hpt_order_set(void *data, u64 val) return mmu_hash_ops.resize_hpt(val); } -DEFINE_SIMPLE_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); static int __init hash64_debugfs(void) { - if (!debugfs_create_file("hpt_order", 0600, powerpc_debugfs_root, - NULL, &fops_hpt_order)) { + if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root, + NULL, &fops_hpt_order)) { pr_err("lpar: unable to create hpt_order debugsfs file\n"); } diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -26,7 +26,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, real_pte_t rpte; unsigned long vpn; unsigned long old_pte, new_pte; - unsigned long rflags, pa, sz; + unsigned long rflags, pa; long slot, offset; BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); @@ -73,7 +73,6 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, offset = PTRS_PER_PMD; rpte = __real_pte(__pte(old_pte), ptep, offset); - sz = ((1UL) << shift); if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) /* No CPU has hugepages but lacks no execute, so we * don't need to worry about that case */ diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/hugetlb.h> +#include <linux/security.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/cacheflush.h> @@ -73,7 +74,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) { addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); - if (high_limit - len >= addr && + if (high_limit - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } @@ -83,7 +84,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, */ info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = PAGE_SIZE; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c @@ -108,12 +108,8 @@ static void __init MMU_setup(void) __map_without_bats = 1; __map_without_ltlbs = 1; } -#ifdef CONFIG_STRICT_KERNEL_RWX - if (rodata_enabled) { - __map_without_bats = 1; + if (strict_kernel_rwx_enabled() && !IS_ENABLED(CONFIG_PPC_8xx)) __map_without_ltlbs = 1; - } -#endif } /* diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c @@ -274,7 +274,6 @@ void __ref vmemmap_free(unsigned long start, unsigned long end, for (; start < end; start += page_size) { unsigned long nr_pages, addr; - struct page *section_base; struct page *page; /* @@ -290,7 +289,6 @@ void __ref vmemmap_free(unsigned long start, unsigned long end, continue; page = pfn_to_page(addr >> PAGE_SHIFT); - section_base = pfn_to_page(vmemmap_section_start(start)); nr_pages = 1 << page_order; base_pfn = PHYS_PFN(addr); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c @@ -69,22 +69,14 @@ pte_t *kmap_pte; EXPORT_SYMBOL(kmap_pte); pgprot_t kmap_prot; EXPORT_SYMBOL(kmap_prot); -#define TOP_ZONE ZONE_HIGHMEM static inline pte_t *virt_to_kpte(unsigned long vaddr) { return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), vaddr), vaddr); } -#else -#define TOP_ZONE ZONE_NORMAL #endif -int page_is_ram(unsigned long pfn) -{ - return memblock_is_memory(__pfn_to_phys(pfn)); -} - pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { @@ -176,34 +168,6 @@ int __meminit arch_remove_memory(int nid, u64 start, u64 size, #endif #endif /* CONFIG_MEMORY_HOTPLUG */ -/* - * walk_memory_resource() needs to make sure there is no holes in a given - * memory range. PPC64 does not maintain the memory layout in /proc/iomem. - * Instead it maintains it in memblock.memory structures. Walk through the - * memory regions, find holes and callback for contiguous regions. - */ -int -walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, - void *arg, int (*func)(unsigned long, unsigned long, void *)) -{ - struct memblock_region *reg; - unsigned long end_pfn = start_pfn + nr_pages; - unsigned long tstart, tend; - int ret = -1; - - for_each_memblock(memory, reg) { - tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); - tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); - if (tstart >= tend) - continue; - ret = (*func)(tstart, tend - tstart, arg); - if (ret) - break; - } - return ret; -} -EXPORT_SYMBOL_GPL(walk_system_ram_range); - #ifndef CONFIG_NEED_MULTIPLE_NODES void __init mem_topology_setup(void) { @@ -262,25 +226,6 @@ static int __init mark_nonram_nosave(void) static unsigned long max_zone_pfns[MAX_NR_ZONES]; /* - * Find the least restrictive zone that is entirely below the - * specified pfn limit. Returns < 0 if no suitable zone is found. - * - * pfn_limit must be u64 because it can exceed 32 bits even on 32-bit - * systems -- the DMA limit can be higher than any possible real pfn. - */ -int dma_pfn_limit_to_zone(u64 pfn_limit) -{ - int i; - - for (i = TOP_ZONE; i >= 0; i--) { - if (max_zone_pfns[i] <= pfn_limit) - return i; - } - - return -EPERM; -} - -/* * paging_init() sets up the page tables - in fact we've already done this. */ void __init paging_init(void) @@ -585,3 +530,9 @@ int devmem_is_allowed(unsigned long pfn) return 0; } #endif /* CONFIG_STRICT_DEVMEM */ + +/* + * This is defined in kernel/resource.c but only powerpc needs to export it, for + * the EHEA driver. Drop this when drivers/net/ethernet/ibm/ehea is removed. + */ +EXPORT_SYMBOL_GPL(walk_system_ram_range); diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h @@ -130,7 +130,7 @@ extern void wii_memory_fixups(void); */ #ifdef CONFIG_PPC32 extern void MMU_init_hw(void); -extern unsigned long mmu_mapin_ram(unsigned long top); +unsigned long mmu_mapin_ram(unsigned long base, unsigned long top); #endif #ifdef CONFIG_PPC_FSL_BOOK3E @@ -165,3 +165,11 @@ unsigned long p_block_mapped(phys_addr_t pa); static inline phys_addr_t v_block_mapped(unsigned long va) { return 0; } static inline unsigned long p_block_mapped(phys_addr_t pa) { return 0; } #endif + +#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx) +void mmu_mark_initmem_nx(void); +void mmu_mark_rodata_ro(void); +#else +static inline void mmu_mark_initmem_nx(void) { } +static inline void mmu_mark_rodata_ro(void) { } +#endif diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c @@ -1460,13 +1460,6 @@ static void reset_topology_timer(void) #ifdef CONFIG_SMP -static void stage_topology_update(int core_id) -{ - cpumask_or(&cpu_associativity_changes_mask, - &cpu_associativity_changes_mask, cpu_sibling_mask(core_id)); - reset_topology_timer(); -} - static int dt_update_callback(struct notifier_block *nb, unsigned long action, void *data) { @@ -1479,7 +1472,7 @@ static int dt_update_callback(struct notifier_block *nb, !of_prop_cmp(update->prop->name, "ibm,associativity")) { u32 core_id; of_property_read_u32(update->dn, "reg", &core_id); - stage_topology_update(core_id); + rc = dlpar_cpu_readd(core_id); rc = NOTIFY_OK; } break; diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c @@ -254,26 +254,20 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) void __init mapin_ram(void) { - unsigned long s, top; - -#ifndef CONFIG_WII - top = total_lowmem; - s = mmu_mapin_ram(top); - __mapin_ram_chunk(s, top); -#else - if (!wii_hole_size) { - s = mmu_mapin_ram(total_lowmem); - __mapin_ram_chunk(s, total_lowmem); - } else { - top = wii_hole_start; - s = mmu_mapin_ram(top); - __mapin_ram_chunk(s, top); - - top = memblock_end_of_DRAM(); - s = wii_mmu_mapin_mem2(top); - __mapin_ram_chunk(s, top); + struct memblock_region *reg; + + for_each_memblock(memory, reg) { + phys_addr_t base = reg->base; + phys_addr_t top = min(base + reg->size, total_lowmem); + + if (base >= top) + continue; + base = mmu_mapin_ram(base, top); + if (IS_ENABLED(CONFIG_BDI_SWITCH)) + __mapin_ram_chunk(reg->base, top); + else + __mapin_ram_chunk(base, top); } -#endif } /* Scan the real Linux page tables and return a PTE pointer for @@ -359,7 +353,10 @@ void mark_initmem_nx(void) unsigned long numpages = PFN_UP((unsigned long)_einittext) - PFN_DOWN((unsigned long)_sinittext); - change_page_attr(page, numpages, PAGE_KERNEL); + if (v_block_mapped((unsigned long)_stext) + 1) + mmu_mark_initmem_nx(); + else + change_page_attr(page, numpages, PAGE_KERNEL); } #ifdef CONFIG_STRICT_KERNEL_RWX @@ -368,6 +365,11 @@ void mark_rodata_ro(void) struct page *page; unsigned long numpages; + if (v_block_mapped((unsigned long)_sinittext)) { + mmu_mark_rodata_ro(); + return; + } + page = virt_to_page(_stext); numpages = PFN_UP((unsigned long)_etext) - PFN_DOWN((unsigned long)_stext); diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c @@ -32,6 +32,7 @@ #include <asm/mmu.h> #include <asm/machdep.h> #include <asm/code-patching.h> +#include <asm/sections.h> #include "mmu_decl.h" @@ -73,45 +74,171 @@ unsigned long p_block_mapped(phys_addr_t pa) return 0; } -unsigned long __init mmu_mapin_ram(unsigned long top) +static int find_free_bat(void) { - unsigned long tot, bl, done; - unsigned long max_size = (256<<20); + int b; + + if (cpu_has_feature(CPU_FTR_601)) { + for (b = 0; b < 4; b++) { + struct ppc_bat *bat = BATS[b]; + + if (!(bat[0].batl & 0x40)) + return b; + } + } else { + int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; + + for (b = 0; b < n; b++) { + struct ppc_bat *bat = BATS[b]; + + if (!(bat[1].batu & 3)) + return b; + } + } + return -1; +} + +static unsigned int block_size(unsigned long base, unsigned long top) +{ + unsigned int max_size = (cpu_has_feature(CPU_FTR_601) ? 8 : 256) << 20; + unsigned int base_shift = (fls(base) - 1) & 31; + unsigned int block_shift = (fls(top - base) - 1) & 31; + + return min3(max_size, 1U << base_shift, 1U << block_shift); +} + +/* + * Set up one of the IBAT (block address translation) register pairs. + * The parameters are not checked; in particular size must be a power + * of 2 between 128k and 256M. + * Only for 603+ ... + */ +static void setibat(int index, unsigned long virt, phys_addr_t phys, + unsigned int size, pgprot_t prot) +{ + unsigned int bl = (size >> 17) - 1; + int wimgxpp; + struct ppc_bat *bat = BATS[index]; + unsigned long flags = pgprot_val(prot); + + if (!cpu_has_feature(CPU_FTR_NEED_COHERENT)) + flags &= ~_PAGE_COHERENT; + + wimgxpp = (flags & _PAGE_COHERENT) | (_PAGE_EXEC ? BPP_RX : BPP_XX); + bat[0].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ + bat[0].batl = BAT_PHYS_ADDR(phys) | wimgxpp; + if (flags & _PAGE_USER) + bat[0].batu |= 1; /* Vp = 1 */ +} + +static void clearibat(int index) +{ + struct ppc_bat *bat = BATS[index]; + + bat[0].batu = 0; + bat[0].batl = 0; +} + +static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long top) +{ + int idx; + + while ((idx = find_free_bat()) != -1 && base != top) { + unsigned int size = block_size(base, top); + + if (size < 128 << 10) + break; + setbat(idx, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X); + base += size; + } + + return base; +} + +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) +{ + int done; + unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; if (__map_without_bats) { - printk(KERN_DEBUG "RAM mapped without BATs\n"); - return 0; + pr_debug("RAM mapped without BATs\n"); + return base; + } + + if (!strict_kernel_rwx_enabled() || base >= border || top <= border) + return __mmu_mapin_ram(base, top); + + done = __mmu_mapin_ram(base, border); + if (done != border - base) + return done; + + return done + __mmu_mapin_ram(border, top); +} + +void mmu_mark_initmem_nx(void) +{ + int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; + int i; + unsigned long base = (unsigned long)_stext - PAGE_OFFSET; + unsigned long top = (unsigned long)_etext - PAGE_OFFSET; + unsigned long size; + + if (cpu_has_feature(CPU_FTR_601)) + return; + + for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) { + size = block_size(base, top); + setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); + base += size; } + if (base < top) { + size = block_size(base, top); + size = max(size, 128UL << 10); + if ((top - base) > size) { + if (strict_kernel_rwx_enabled()) + pr_warn("Kernel _etext not properly aligned\n"); + size <<= 1; + } + setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); + base += size; + } + for (; i < nb; i++) + clearibat(i); - /* Set up BAT2 and if necessary BAT3 to cover RAM. */ + update_bats(); - /* Make sure we don't map a block larger than the - smallest alignment of the physical address. */ - tot = top; - for (bl = 128<<10; bl < max_size; bl <<= 1) { - if (bl * 2 > tot) + for (i = TASK_SIZE >> 28; i < 16; i++) { + /* Do not set NX on VM space for modules */ + if (IS_ENABLED(CONFIG_MODULES) && + (VMALLOC_START & 0xf0000000) == i << 28) break; + mtsrin(mfsrin(i << 28) | 0x10000000, i << 28); } +} + +void mmu_mark_rodata_ro(void) +{ + int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; + int i; + + if (cpu_has_feature(CPU_FTR_601)) + return; + + for (i = 0; i < nb; i++) { + struct ppc_bat *bat = BATS[i]; - setbat(2, PAGE_OFFSET, 0, bl, PAGE_KERNEL_X); - done = (unsigned long)bat_addrs[2].limit - PAGE_OFFSET + 1; - if ((done < tot) && !bat_addrs[3].limit) { - /* use BAT3 to cover a bit more */ - tot -= done; - for (bl = 128<<10; bl < max_size; bl <<= 1) - if (bl * 2 > tot) - break; - setbat(3, PAGE_OFFSET+done, done, bl, PAGE_KERNEL_X); - done = (unsigned long)bat_addrs[3].limit - PAGE_OFFSET + 1; + if (bat_addrs[i].start < (unsigned long)__init_begin) + bat[1].batl = (bat[1].batl & ~BPP_RW) | BPP_RX; } - return done; + update_bats(); } /* * Set up one of the I/D BAT (block address translation) register pairs. * The parameters are not checked; in particular size must be a power * of 2 between 128k and 256M. + * On 603+, only set IBAT when _PAGE_EXEC is set */ void __init setbat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, pgprot_t prot) @@ -138,11 +265,12 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, bat[1].batu |= 1; /* Vp = 1 */ if (flags & _PAGE_GUARDED) { /* G bit must be zero in IBATs */ - bat[0].batu = bat[0].batl = 0; - } else { - /* make IBAT same as DBAT */ - bat[0] = bat[1]; + flags &= ~_PAGE_EXEC; } + if (flags & _PAGE_EXEC) + bat[0] = bat[1]; + else + bat[0].batu = bat[0].batl = 0; } else { /* 601 cpu */ if (bl > BL_8M) @@ -231,7 +359,8 @@ void __init MMU_init_hw(void) if (lg_n_hpteg > 16) mb2 = 16 - LG_HPTEG_SIZE; - modify_instruction_site(&patch__hash_page_A0, 0xffff, (unsigned int)Hash >> 16); + modify_instruction_site(&patch__hash_page_A0, 0xffff, + ((unsigned int)Hash - PAGE_OFFSET) >> 16); modify_instruction_site(&patch__hash_page_A1, 0x7c0, mb << 6); modify_instruction_site(&patch__hash_page_A2, 0x7c0, mb2 << 6); modify_instruction_site(&patch__hash_page_B, 0xffff, hmask); @@ -240,7 +369,8 @@ void __init MMU_init_hw(void) /* * Patch up the instructions in hashtable.S:flush_hash_page */ - modify_instruction_site(&patch__flush_hash_A0, 0xffff, (unsigned int)Hash >> 16); + modify_instruction_site(&patch__flush_hash_A0, 0xffff, + ((unsigned int)Hash - PAGE_OFFSET) >> 16); modify_instruction_site(&patch__flush_hash_A1, 0x7c0, mb << 6); modify_instruction_site(&patch__flush_hash_A2, 0x7c0, mb2 << 6); modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask); diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * From split of dump_linuxpagetables.c + * Copyright 2016, Rashmica Gupta, IBM Corp. + * + */ +#include <linux/kernel.h> +#include <asm/pgtable.h> + +#include "ptdump.h" + +static const struct flag_info flag_array[] = { + { + .mask = _PAGE_SH, + .val = 0, + .set = "user", + .clear = " ", + }, { + .mask = _PAGE_RO | _PAGE_NA, + .val = 0, + .set = "rw", + }, { + .mask = _PAGE_RO | _PAGE_NA, + .val = _PAGE_RO, + .set = "r ", + }, { + .mask = _PAGE_RO | _PAGE_NA, + .val = _PAGE_NA, + .set = " ", + }, { + .mask = _PAGE_EXEC, + .val = _PAGE_EXEC, + .set = " X ", + .clear = " ", + }, { + .mask = _PAGE_PRESENT, + .val = _PAGE_PRESENT, + .set = "present", + .clear = " ", + }, { + .mask = _PAGE_GUARDED, + .val = _PAGE_GUARDED, + .set = "guarded", + .clear = " ", + }, { + .mask = _PAGE_DIRTY, + .val = _PAGE_DIRTY, + .set = "dirty", + .clear = " ", + }, { + .mask = _PAGE_ACCESSED, + .val = _PAGE_ACCESSED, + .set = "accessed", + .clear = " ", + }, { + .mask = _PAGE_NO_CACHE, + .val = _PAGE_NO_CACHE, + .set = "no cache", + .clear = " ", + }, { + .mask = _PAGE_SPECIAL, + .val = _PAGE_SPECIAL, + .set = "special", + } +}; + +struct pgtable_level pg_level[5] = { + { + }, { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pud */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pmd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pte */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, +}; diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y += ptdump.o + +obj-$(CONFIG_4xx) += shared.o +obj-$(CONFIG_PPC_8xx) += 8xx.o +obj-$(CONFIG_PPC_BOOK3E_MMU) += shared.o +obj-$(CONFIG_PPC_BOOK3S_32) += shared.o bats.o segment_regs.o +obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o hashpagetable.o diff --git a/arch/powerpc/mm/dump_bats.c b/arch/powerpc/mm/ptdump/bats.c diff --git a/arch/powerpc/mm/ptdump/book3s64.c b/arch/powerpc/mm/ptdump/book3s64.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * From split of dump_linuxpagetables.c + * Copyright 2016, Rashmica Gupta, IBM Corp. + * + */ +#include <linux/kernel.h> +#include <asm/pgtable.h> + +#include "ptdump.h" + +static const struct flag_info flag_array[] = { + { + .mask = _PAGE_PRIVILEGED, + .val = 0, + .set = "user", + .clear = " ", + }, { + .mask = _PAGE_READ, + .val = _PAGE_READ, + .set = "r", + .clear = " ", + }, { + .mask = _PAGE_WRITE, + .val = _PAGE_WRITE, + .set = "w", + .clear = " ", + }, { + .mask = _PAGE_EXEC, + .val = _PAGE_EXEC, + .set = " X ", + .clear = " ", + }, { + .mask = _PAGE_PTE, + .val = _PAGE_PTE, + .set = "pte", + .clear = " ", + }, { + .mask = _PAGE_PRESENT, + .val = _PAGE_PRESENT, + .set = "valid", + .clear = " ", + }, { + .mask = _PAGE_PRESENT | _PAGE_INVALID, + .val = 0, + .set = " ", + .clear = "present", + }, { + .mask = H_PAGE_HASHPTE, + .val = H_PAGE_HASHPTE, + .set = "hpte", + .clear = " ", + }, { + .mask = _PAGE_DIRTY, + .val = _PAGE_DIRTY, + .set = "dirty", + .clear = " ", + }, { + .mask = _PAGE_ACCESSED, + .val = _PAGE_ACCESSED, + .set = "accessed", + .clear = " ", + }, { + .mask = _PAGE_NON_IDEMPOTENT, + .val = _PAGE_NON_IDEMPOTENT, + .set = "non-idempotent", + .clear = " ", + }, { + .mask = _PAGE_TOLERANT, + .val = _PAGE_TOLERANT, + .set = "tolerant", + .clear = " ", + }, { + .mask = H_PAGE_BUSY, + .val = H_PAGE_BUSY, + .set = "busy", + }, { +#ifdef CONFIG_PPC_64K_PAGES + .mask = H_PAGE_COMBO, + .val = H_PAGE_COMBO, + .set = "combo", + }, { + .mask = H_PAGE_4K_PFN, + .val = H_PAGE_4K_PFN, + .set = "4K_pfn", + }, { +#else /* CONFIG_PPC_64K_PAGES */ + .mask = H_PAGE_F_GIX, + .val = H_PAGE_F_GIX, + .set = "f_gix", + .is_val = true, + .shift = H_PAGE_F_GIX_SHIFT, + }, { + .mask = H_PAGE_F_SECOND, + .val = H_PAGE_F_SECOND, + .set = "f_second", + }, { +#endif /* CONFIG_PPC_64K_PAGES */ + .mask = _PAGE_SPECIAL, + .val = _PAGE_SPECIAL, + .set = "special", + } +}; + +struct pgtable_level pg_level[5] = { + { + }, { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pud */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pmd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pte */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, +}; diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -0,0 +1,550 @@ +/* + * Copyright 2016, Rashmica Gupta, IBM Corp. + * + * This traverses the kernel virtual memory and dumps the pages that are in + * the hash pagetable, along with their flags to + * /sys/kernel/debug/kernel_hash_pagetable. + * + * If radix is enabled then there is no hash page table and so no debugfs file + * is generated. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include <linux/debugfs.h> +#include <linux/fs.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <asm/pgtable.h> +#include <linux/const.h> +#include <asm/page.h> +#include <asm/pgalloc.h> +#include <asm/plpar_wrappers.h> +#include <linux/memblock.h> +#include <asm/firmware.h> + +struct pg_state { + struct seq_file *seq; + const struct addr_marker *marker; + unsigned long start_address; + unsigned int level; + u64 current_flags; +}; + +struct addr_marker { + unsigned long start_address; + const char *name; +}; + +static struct addr_marker address_markers[] = { + { 0, "Start of kernel VM" }, + { 0, "vmalloc() Area" }, + { 0, "vmalloc() End" }, + { 0, "isa I/O start" }, + { 0, "isa I/O end" }, + { 0, "phb I/O start" }, + { 0, "phb I/O end" }, + { 0, "I/O remap start" }, + { 0, "I/O remap end" }, + { 0, "vmemmap start" }, + { -1, NULL }, +}; + +struct flag_info { + u64 mask; + u64 val; + const char *set; + const char *clear; + bool is_val; + int shift; +}; + +static const struct flag_info v_flag_array[] = { + { + .mask = SLB_VSID_B, + .val = SLB_VSID_B_256M, + .set = "ssize: 256M", + .clear = "ssize: 1T ", + }, { + .mask = HPTE_V_SECONDARY, + .val = HPTE_V_SECONDARY, + .set = "secondary", + .clear = "primary ", + }, { + .mask = HPTE_V_VALID, + .val = HPTE_V_VALID, + .set = "valid ", + .clear = "invalid", + }, { + .mask = HPTE_V_BOLTED, + .val = HPTE_V_BOLTED, + .set = "bolted", + .clear = "", + } +}; + +static const struct flag_info r_flag_array[] = { + { + .mask = HPTE_R_PP0 | HPTE_R_PP, + .val = PP_RWXX, + .set = "prot:RW--", + }, { + .mask = HPTE_R_PP0 | HPTE_R_PP, + .val = PP_RWRX, + .set = "prot:RWR-", + }, { + .mask = HPTE_R_PP0 | HPTE_R_PP, + .val = PP_RWRW, + .set = "prot:RWRW", + }, { + .mask = HPTE_R_PP0 | HPTE_R_PP, + .val = PP_RXRX, + .set = "prot:R-R-", + }, { + .mask = HPTE_R_PP0 | HPTE_R_PP, + .val = PP_RXXX, + .set = "prot:R---", + }, { + .mask = HPTE_R_KEY_HI | HPTE_R_KEY_LO, + .val = HPTE_R_KEY_HI | HPTE_R_KEY_LO, + .set = "key", + .clear = "", + .is_val = true, + }, { + .mask = HPTE_R_R, + .val = HPTE_R_R, + .set = "ref", + .clear = " ", + }, { + .mask = HPTE_R_C, + .val = HPTE_R_C, + .set = "changed", + .clear = " ", + }, { + .mask = HPTE_R_N, + .val = HPTE_R_N, + .set = "no execute", + }, { + .mask = HPTE_R_WIMG, + .val = HPTE_R_W, + .set = "writethru", + }, { + .mask = HPTE_R_WIMG, + .val = HPTE_R_I, + .set = "no cache", + }, { + .mask = HPTE_R_WIMG, + .val = HPTE_R_G, + .set = "guarded", + } +}; + +static int calculate_pagesize(struct pg_state *st, int ps, char s[]) +{ + static const char units[] = "BKMGTPE"; + const char *unit = units; + + while (ps > 9 && unit[1]) { + ps -= 10; + unit++; + } + seq_printf(st->seq, " %s_ps: %i%c\t", s, 1<<ps, *unit); + return ps; +} + +static void dump_flag_info(struct pg_state *st, const struct flag_info + *flag, u64 pte, int num) +{ + unsigned int i; + + for (i = 0; i < num; i++, flag++) { + const char *s = NULL; + u64 val; + + /* flag not defined so don't check it */ + if (flag->mask == 0) + continue; + /* Some 'flags' are actually values */ + if (flag->is_val) { + val = pte & flag->val; + if (flag->shift) + val = val >> flag->shift; + seq_printf(st->seq, " %s:%llx", flag->set, val); + } else { + if ((pte & flag->mask) == flag->val) + s = flag->set; + else + s = flag->clear; + if (s) + seq_printf(st->seq, " %s", s); + } + } +} + +static void dump_hpte_info(struct pg_state *st, unsigned long ea, u64 v, u64 r, + unsigned long rpn, int bps, int aps, unsigned long lp) +{ + int aps_index; + + while (ea >= st->marker[1].start_address) { + st->marker++; + seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + } + seq_printf(st->seq, "0x%lx:\t", ea); + seq_printf(st->seq, "AVPN:%llx\t", HPTE_V_AVPN_VAL(v)); + dump_flag_info(st, v_flag_array, v, ARRAY_SIZE(v_flag_array)); + seq_printf(st->seq, " rpn: %lx\t", rpn); + dump_flag_info(st, r_flag_array, r, ARRAY_SIZE(r_flag_array)); + + calculate_pagesize(st, bps, "base"); + aps_index = calculate_pagesize(st, aps, "actual"); + if (aps_index != 2) + seq_printf(st->seq, "LP enc: %lx", lp); + seq_putc(st->seq, '\n'); +} + + +static int native_find(unsigned long ea, int psize, bool primary, u64 *v, u64 + *r) +{ + struct hash_pte *hptep; + unsigned long hash, vsid, vpn, hpte_group, want_v, hpte_v; + int i, ssize = mmu_kernel_ssize; + unsigned long shift = mmu_psize_defs[psize].shift; + + /* calculate hash */ + vsid = get_kernel_vsid(ea, ssize); + vpn = hpt_vpn(ea, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); + + /* to check in the secondary hash table, we invert the hash */ + if (!primary) + hash = ~hash; + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + for (i = 0; i < HPTES_PER_GROUP; i++) { + hptep = htab_address + hpte_group; + hpte_v = be64_to_cpu(hptep->v); + + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { + /* HPTE matches */ + *v = be64_to_cpu(hptep->v); + *r = be64_to_cpu(hptep->r); + return 0; + } + ++hpte_group; + } + return -1; +} + +#ifdef CONFIG_PPC_PSERIES +static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *r) +{ + struct hash_pte ptes[4]; + unsigned long vsid, vpn, hash, hpte_group, want_v; + int i, j, ssize = mmu_kernel_ssize; + long lpar_rc = 0; + unsigned long shift = mmu_psize_defs[psize].shift; + + /* calculate hash */ + vsid = get_kernel_vsid(ea, ssize); + vpn = hpt_vpn(ea, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); + + /* to check in the secondary hash table, we invert the hash */ + if (!primary) + hash = ~hash; + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + /* see if we can find an entry in the hpte with this hash */ + for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) { + lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes); + + if (lpar_rc != H_SUCCESS) + continue; + for (j = 0; j < 4; j++) { + if (HPTE_V_COMPARE(ptes[j].v, want_v) && + (ptes[j].v & HPTE_V_VALID)) { + /* HPTE matches */ + *v = ptes[j].v; + *r = ptes[j].r; + return 0; + } + } + } + return -1; +} +#endif + +static void decode_r(int bps, unsigned long r, unsigned long *rpn, int *aps, + unsigned long *lp_bits) +{ + struct mmu_psize_def entry; + unsigned long arpn, mask, lp; + int penc = -2, idx = 0, shift; + + /*. + * The LP field has 8 bits. Depending on the actual page size, some of + * these bits are concatenated with the APRN to get the RPN. The rest + * of the bits in the LP field is the LP value and is an encoding for + * the base page size and the actual page size. + * + * - find the mmu entry for our base page size + * - go through all page encodings and use the associated mask to + * find an encoding that matches our encoding in the LP field. + */ + arpn = (r & HPTE_R_RPN) >> HPTE_R_RPN_SHIFT; + lp = arpn & 0xff; + + entry = mmu_psize_defs[bps]; + while (idx < MMU_PAGE_COUNT) { + penc = entry.penc[idx]; + if ((penc != -1) && (mmu_psize_defs[idx].shift)) { + shift = mmu_psize_defs[idx].shift - HPTE_R_RPN_SHIFT; + mask = (0x1 << (shift)) - 1; + if ((lp & mask) == penc) { + *aps = mmu_psize_to_shift(idx); + *lp_bits = lp & mask; + *rpn = arpn >> shift; + return; + } + } + idx++; + } +} + +static int base_hpte_find(unsigned long ea, int psize, bool primary, u64 *v, + u64 *r) +{ +#ifdef CONFIG_PPC_PSERIES + if (firmware_has_feature(FW_FEATURE_LPAR)) + return pseries_find(ea, psize, primary, v, r); +#endif + return native_find(ea, psize, primary, v, r); +} + +static unsigned long hpte_find(struct pg_state *st, unsigned long ea, int psize) +{ + unsigned long slot; + u64 v = 0, r = 0; + unsigned long rpn, lp_bits; + int base_psize = 0, actual_psize = 0; + + if (ea < PAGE_OFFSET) + return -1; + + /* Look in primary table */ + slot = base_hpte_find(ea, psize, true, &v, &r); + + /* Look in secondary table */ + if (slot == -1) + slot = base_hpte_find(ea, psize, false, &v, &r); + + /* No entry found */ + if (slot == -1) + return -1; + + /* + * We found an entry in the hash page table: + * - check that this has the same base page + * - find the actual page size + * - find the RPN + */ + base_psize = mmu_psize_to_shift(psize); + + if ((v & HPTE_V_LARGE) == HPTE_V_LARGE) { + decode_r(psize, r, &rpn, &actual_psize, &lp_bits); + } else { + /* 4K actual page size */ + actual_psize = 12; + rpn = (r & HPTE_R_RPN) >> HPTE_R_RPN_SHIFT; + /* In this case there are no LP bits */ + lp_bits = -1; + } + /* + * We didn't find a matching encoding, so the PTE we found isn't for + * this address. + */ + if (actual_psize == -1) + return -1; + + dump_hpte_info(st, ea, v, r, rpn, base_psize, actual_psize, lp_bits); + return 0; +} + +static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start) +{ + pte_t *pte = pte_offset_kernel(pmd, 0); + unsigned long addr, pteval, psize; + int i, status; + + for (i = 0; i < PTRS_PER_PTE; i++, pte++) { + addr = start + i * PAGE_SIZE; + pteval = pte_val(*pte); + + if (addr < VMALLOC_END) + psize = mmu_vmalloc_psize; + else + psize = mmu_io_psize; +#ifdef CONFIG_PPC_64K_PAGES + /* check for secret 4K mappings */ + if (((pteval & H_PAGE_COMBO) == H_PAGE_COMBO) || + ((pteval & H_PAGE_4K_PFN) == H_PAGE_4K_PFN)) + psize = mmu_io_psize; +#endif + /* check for hashpte */ + status = hpte_find(st, addr, psize); + + if (((pteval & H_PAGE_HASHPTE) != H_PAGE_HASHPTE) + && (status != -1)) { + /* found a hpte that is not in the linux page tables */ + seq_printf(st->seq, "page probably bolted before linux" + " pagetables were set: addr:%lx, pteval:%lx\n", + addr, pteval); + } + } +} + +static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) +{ + pmd_t *pmd = pmd_offset(pud, 0); + unsigned long addr; + unsigned int i; + + for (i = 0; i < PTRS_PER_PMD; i++, pmd++) { + addr = start + i * PMD_SIZE; + if (!pmd_none(*pmd)) + /* pmd exists */ + walk_pte(st, pmd, addr); + } +} + +static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) +{ + pud_t *pud = pud_offset(pgd, 0); + unsigned long addr; + unsigned int i; + + for (i = 0; i < PTRS_PER_PUD; i++, pud++) { + addr = start + i * PUD_SIZE; + if (!pud_none(*pud)) + /* pud exists */ + walk_pmd(st, pud, addr); + } +} + +static void walk_pagetables(struct pg_state *st) +{ + pgd_t *pgd = pgd_offset_k(0UL); + unsigned int i; + unsigned long addr; + + /* + * Traverse the linux pagetable structure and dump pages that are in + * the hash pagetable. + */ + for (i = 0; i < PTRS_PER_PGD; i++, pgd++) { + addr = KERN_VIRT_START + i * PGDIR_SIZE; + if (!pgd_none(*pgd)) + /* pgd exists */ + walk_pud(st, pgd, addr); + } +} + + +static void walk_linearmapping(struct pg_state *st) +{ + unsigned long addr; + + /* + * Traverse the linear mapping section of virtual memory and dump pages + * that are in the hash pagetable. + */ + unsigned long psize = 1 << mmu_psize_defs[mmu_linear_psize].shift; + + for (addr = PAGE_OFFSET; addr < PAGE_OFFSET + + memblock_end_of_DRAM(); addr += psize) + hpte_find(st, addr, mmu_linear_psize); +} + +static void walk_vmemmap(struct pg_state *st) +{ +#ifdef CONFIG_SPARSEMEM_VMEMMAP + struct vmemmap_backing *ptr = vmemmap_list; + + /* + * Traverse the vmemmaped memory and dump pages that are in the hash + * pagetable. + */ + while (ptr->list) { + hpte_find(st, ptr->virt_addr, mmu_vmemmap_psize); + ptr = ptr->list; + } + seq_puts(st->seq, "---[ vmemmap end ]---\n"); +#endif +} + +static void populate_markers(void) +{ + address_markers[0].start_address = PAGE_OFFSET; + address_markers[1].start_address = VMALLOC_START; + address_markers[2].start_address = VMALLOC_END; + address_markers[3].start_address = ISA_IO_BASE; + address_markers[4].start_address = ISA_IO_END; + address_markers[5].start_address = PHB_IO_BASE; + address_markers[6].start_address = PHB_IO_END; + address_markers[7].start_address = IOREMAP_BASE; + address_markers[8].start_address = IOREMAP_END; +#ifdef CONFIG_PPC_BOOK3S_64 + address_markers[9].start_address = H_VMEMMAP_BASE; +#else + address_markers[9].start_address = VMEMMAP_BASE; +#endif +} + +static int ptdump_show(struct seq_file *m, void *v) +{ + struct pg_state st = { + .seq = m, + .start_address = PAGE_OFFSET, + .marker = address_markers, + }; + /* + * Traverse the 0xc, 0xd and 0xf areas of the kernel virtual memory and + * dump pages that are in the hash pagetable. + */ + walk_linearmapping(&st); + walk_pagetables(&st); + walk_vmemmap(&st); + return 0; +} + +static int ptdump_open(struct inode *inode, struct file *file) +{ + return single_open(file, ptdump_show, NULL); +} + +static const struct file_operations ptdump_fops = { + .open = ptdump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int ptdump_init(void) +{ + struct dentry *debugfs_file; + + if (!radix_enabled()) { + populate_markers(); + debugfs_file = debugfs_create_file("kernel_hash_pagetable", + 0400, NULL, NULL, &ptdump_fops); + return debugfs_file ? 0 : -ENOMEM; + } + return 0; +} +device_initcall(ptdump_init); diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c @@ -0,0 +1,379 @@ +/* + * Copyright 2016, Rashmica Gupta, IBM Corp. + * + * This traverses the kernel pagetables and dumps the + * information about the used sections of memory to + * /sys/kernel/debug/kernel_pagetables. + * + * Derived from the arm64 implementation: + * Copyright (c) 2014, The Linux Foundation, Laura Abbott. + * (C) Copyright 2008 Intel Corporation, Arjan van de Ven. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include <linux/debugfs.h> +#include <linux/fs.h> +#include <linux/hugetlb.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <asm/fixmap.h> +#include <asm/pgtable.h> +#include <linux/const.h> +#include <asm/page.h> +#include <asm/pgalloc.h> + +#include "ptdump.h" + +#ifdef CONFIG_PPC32 +#define KERN_VIRT_START 0 +#endif + +/* + * To visualise what is happening, + * + * - PTRS_PER_P** = how many entries there are in the corresponding P** + * - P**_SHIFT = how many bits of the address we use to index into the + * corresponding P** + * - P**_SIZE is how much memory we can access through the table - not the + * size of the table itself. + * P**={PGD, PUD, PMD, PTE} + * + * + * Each entry of the PGD points to a PUD. Each entry of a PUD points to a + * PMD. Each entry of a PMD points to a PTE. And every PTE entry points to + * a page. + * + * In the case where there are only 3 levels, the PUD is folded into the + * PGD: every PUD has only one entry which points to the PMD. + * + * The page dumper groups page table entries of the same type into a single + * description. It uses pg_state to track the range information while + * iterating over the PTE entries. When the continuity is broken it then + * dumps out a description of the range - ie PTEs that are virtually contiguous + * with the same PTE flags are chunked together. This is to make it clear how + * different areas of the kernel virtual memory are used. + * + */ +struct pg_state { + struct seq_file *seq; + const struct addr_marker *marker; + unsigned long start_address; + unsigned long start_pa; + unsigned long last_pa; + unsigned int level; + u64 current_flags; +}; + +struct addr_marker { + unsigned long start_address; + const char *name; +}; + +static struct addr_marker address_markers[] = { + { 0, "Start of kernel VM" }, + { 0, "vmalloc() Area" }, + { 0, "vmalloc() End" }, +#ifdef CONFIG_PPC64 + { 0, "isa I/O start" }, + { 0, "isa I/O end" }, + { 0, "phb I/O start" }, + { 0, "phb I/O end" }, + { 0, "I/O remap start" }, + { 0, "I/O remap end" }, + { 0, "vmemmap start" }, +#else + { 0, "Early I/O remap start" }, + { 0, "Early I/O remap end" }, +#ifdef CONFIG_NOT_COHERENT_CACHE + { 0, "Consistent mem start" }, + { 0, "Consistent mem end" }, +#endif +#ifdef CONFIG_HIGHMEM + { 0, "Highmem PTEs start" }, + { 0, "Highmem PTEs end" }, +#endif + { 0, "Fixmap start" }, + { 0, "Fixmap end" }, +#endif + { -1, NULL }, +}; + +static void dump_flag_info(struct pg_state *st, const struct flag_info + *flag, u64 pte, int num) +{ + unsigned int i; + + for (i = 0; i < num; i++, flag++) { + const char *s = NULL; + u64 val; + + /* flag not defined so don't check it */ + if (flag->mask == 0) + continue; + /* Some 'flags' are actually values */ + if (flag->is_val) { + val = pte & flag->val; + if (flag->shift) + val = val >> flag->shift; + seq_printf(st->seq, " %s:%llx", flag->set, val); + } else { + if ((pte & flag->mask) == flag->val) + s = flag->set; + else + s = flag->clear; + if (s) + seq_printf(st->seq, " %s", s); + } + st->current_flags &= ~flag->mask; + } + if (st->current_flags != 0) + seq_printf(st->seq, " unknown flags:%llx", st->current_flags); +} + +static void dump_addr(struct pg_state *st, unsigned long addr) +{ + static const char units[] = "KMGTPE"; + const char *unit = units; + unsigned long delta; + +#ifdef CONFIG_PPC64 +#define REG "0x%016lx" +#else +#define REG "0x%08lx" +#endif + + seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1); + if (st->start_pa == st->last_pa && st->start_address + PAGE_SIZE != addr) { + seq_printf(st->seq, "[" REG "]", st->start_pa); + delta = PAGE_SIZE >> 10; + } else { + seq_printf(st->seq, " " REG " ", st->start_pa); + delta = (addr - st->start_address) >> 10; + } + /* Work out what appropriate unit to use */ + while (!(delta & 1023) && unit[1]) { + delta >>= 10; + unit++; + } + seq_printf(st->seq, "%9lu%c", delta, *unit); + +} + +static void note_page(struct pg_state *st, unsigned long addr, + unsigned int level, u64 val) +{ + u64 flag = val & pg_level[level].mask; + u64 pa = val & PTE_RPN_MASK; + + /* At first no level is set */ + if (!st->level) { + st->level = level; + st->current_flags = flag; + st->start_address = addr; + st->start_pa = pa; + st->last_pa = pa; + seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + /* + * Dump the section of virtual memory when: + * - the PTE flags from one entry to the next differs. + * - we change levels in the tree. + * - the address is in a different section of memory and is thus + * used for a different purpose, regardless of the flags. + * - the pa of this page is not adjacent to the last inspected page + */ + } else if (flag != st->current_flags || level != st->level || + addr >= st->marker[1].start_address || + (pa != st->last_pa + PAGE_SIZE && + (pa != st->start_pa || st->start_pa != st->last_pa))) { + + /* Check the PTE flags */ + if (st->current_flags) { + dump_addr(st, addr); + + /* Dump all the flags */ + if (pg_level[st->level].flag) + dump_flag_info(st, pg_level[st->level].flag, + st->current_flags, + pg_level[st->level].num); + + seq_putc(st->seq, '\n'); + } + + /* + * Address indicates we have passed the end of the + * current section of virtual memory + */ + while (addr >= st->marker[1].start_address) { + st->marker++; + seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + } + st->start_address = addr; + st->start_pa = pa; + st->last_pa = pa; + st->current_flags = flag; + st->level = level; + } else { + st->last_pa = pa; + } +} + +static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start) +{ + pte_t *pte = pte_offset_kernel(pmd, 0); + unsigned long addr; + unsigned int i; + + for (i = 0; i < PTRS_PER_PTE; i++, pte++) { + addr = start + i * PAGE_SIZE; + note_page(st, addr, 4, pte_val(*pte)); + + } +} + +static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) +{ + pmd_t *pmd = pmd_offset(pud, 0); + unsigned long addr; + unsigned int i; + + for (i = 0; i < PTRS_PER_PMD; i++, pmd++) { + addr = start + i * PMD_SIZE; + if (!pmd_none(*pmd) && !pmd_huge(*pmd)) + /* pmd exists */ + walk_pte(st, pmd, addr); + else + note_page(st, addr, 3, pmd_val(*pmd)); + } +} + +static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) +{ + pud_t *pud = pud_offset(pgd, 0); + unsigned long addr; + unsigned int i; + + for (i = 0; i < PTRS_PER_PUD; i++, pud++) { + addr = start + i * PUD_SIZE; + if (!pud_none(*pud) && !pud_huge(*pud)) + /* pud exists */ + walk_pmd(st, pud, addr); + else + note_page(st, addr, 2, pud_val(*pud)); + } +} + +static void walk_pagetables(struct pg_state *st) +{ + pgd_t *pgd = pgd_offset_k(0UL); + unsigned int i; + unsigned long addr; + + addr = st->start_address; + + /* + * Traverse the linux pagetable structure and dump pages that are in + * the hash pagetable. + */ + for (i = 0; i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { + if (!pgd_none(*pgd) && !pgd_huge(*pgd)) + /* pgd exists */ + walk_pud(st, pgd, addr); + else + note_page(st, addr, 1, pgd_val(*pgd)); + } +} + +static void populate_markers(void) +{ + int i = 0; + + address_markers[i++].start_address = PAGE_OFFSET; + address_markers[i++].start_address = VMALLOC_START; + address_markers[i++].start_address = VMALLOC_END; +#ifdef CONFIG_PPC64 + address_markers[i++].start_address = ISA_IO_BASE; + address_markers[i++].start_address = ISA_IO_END; + address_markers[i++].start_address = PHB_IO_BASE; + address_markers[i++].start_address = PHB_IO_END; + address_markers[i++].start_address = IOREMAP_BASE; + address_markers[i++].start_address = IOREMAP_END; +#ifdef CONFIG_PPC_BOOK3S_64 + address_markers[i++].start_address = H_VMEMMAP_BASE; +#else + address_markers[i++].start_address = VMEMMAP_BASE; +#endif +#else /* !CONFIG_PPC64 */ + address_markers[i++].start_address = ioremap_bot; + address_markers[i++].start_address = IOREMAP_TOP; +#ifdef CONFIG_NOT_COHERENT_CACHE + address_markers[i++].start_address = IOREMAP_TOP; + address_markers[i++].start_address = IOREMAP_TOP + + CONFIG_CONSISTENT_SIZE; +#endif +#ifdef CONFIG_HIGHMEM + address_markers[i++].start_address = PKMAP_BASE; + address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP); +#endif + address_markers[i++].start_address = FIXADDR_START; + address_markers[i++].start_address = FIXADDR_TOP; +#endif /* CONFIG_PPC64 */ +} + +static int ptdump_show(struct seq_file *m, void *v) +{ + struct pg_state st = { + .seq = m, + .marker = address_markers, + }; + + if (radix_enabled()) + st.start_address = PAGE_OFFSET; + else + st.start_address = KERN_VIRT_START; + + /* Traverse kernel page tables */ + walk_pagetables(&st); + note_page(&st, 0, 0, 0); + return 0; +} + + +static int ptdump_open(struct inode *inode, struct file *file) +{ + return single_open(file, ptdump_show, NULL); +} + +static const struct file_operations ptdump_fops = { + .open = ptdump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void build_pgtable_complete_mask(void) +{ + unsigned int i, j; + + for (i = 0; i < ARRAY_SIZE(pg_level); i++) + if (pg_level[i].flag) + for (j = 0; j < pg_level[i].num; j++) + pg_level[i].mask |= pg_level[i].flag[j].mask; +} + +static int ptdump_init(void) +{ + struct dentry *debugfs_file; + + populate_markers(); + build_pgtable_complete_mask(); + debugfs_file = debugfs_create_file("kernel_page_tables", 0400, NULL, + NULL, &ptdump_fops); + return debugfs_file ? 0 : -ENOMEM; +} +device_initcall(ptdump_init); diff --git a/arch/powerpc/mm/dump_linuxpagetables.h b/arch/powerpc/mm/ptdump/ptdump.h diff --git a/arch/powerpc/mm/dump_sr.c b/arch/powerpc/mm/ptdump/segment_regs.c diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * From split of dump_linuxpagetables.c + * Copyright 2016, Rashmica Gupta, IBM Corp. + * + */ +#include <linux/kernel.h> +#include <asm/pgtable.h> + +#include "ptdump.h" + +static const struct flag_info flag_array[] = { + { + .mask = _PAGE_USER, + .val = _PAGE_USER, + .set = "user", + .clear = " ", + }, { + .mask = _PAGE_RW, + .val = _PAGE_RW, + .set = "rw", + .clear = "r ", + }, { + .mask = _PAGE_EXEC, + .val = _PAGE_EXEC, + .set = " X ", + .clear = " ", + }, { + .mask = _PAGE_PRESENT, + .val = _PAGE_PRESENT, + .set = "present", + .clear = " ", + }, { + .mask = _PAGE_GUARDED, + .val = _PAGE_GUARDED, + .set = "guarded", + .clear = " ", + }, { + .mask = _PAGE_DIRTY, + .val = _PAGE_DIRTY, + .set = "dirty", + .clear = " ", + }, { + .mask = _PAGE_ACCESSED, + .val = _PAGE_ACCESSED, + .set = "accessed", + .clear = " ", + }, { + .mask = _PAGE_WRITETHRU, + .val = _PAGE_WRITETHRU, + .set = "write through", + .clear = " ", + }, { + .mask = _PAGE_NO_CACHE, + .val = _PAGE_NO_CACHE, + .set = "no cache", + .clear = " ", + }, { + .mask = _PAGE_SPECIAL, + .val = _PAGE_SPECIAL, + .set = "special", + } +}; + +struct pgtable_level pg_level[5] = { + { + }, { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pud */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pmd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pte */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, +}; diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c @@ -69,6 +69,11 @@ static void assert_slb_presence(bool present, unsigned long ea) if (!cpu_has_feature(CPU_FTR_ARCH_206)) return; + /* + * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware + * ignores all other bits from 0-27, so just clear them all. + */ + ea &= ~((1UL << 28) - 1); asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0"); WARN_ON(present == (tmp == 0)); diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c @@ -32,6 +32,7 @@ #include <linux/export.h> #include <linux/hugetlb.h> #include <linux/sched/mm.h> +#include <linux/security.h> #include <asm/mman.h> #include <asm/mmu.h> #include <asm/copro.h> @@ -377,6 +378,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); unsigned long addr, found, prev; struct vm_unmapped_area_info info; + unsigned long min_addr = max(PAGE_SIZE, mmap_min_addr); info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; @@ -393,7 +395,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, if (high_limit > DEFAULT_MAP_WINDOW) addr += mm->context.slb_addr_limit - DEFAULT_MAP_WINDOW; - while (addr > PAGE_SIZE) { + while (addr > min_addr) { info.high_limit = addr; if (!slice_scan_available(addr - 1, available, 0, &addr)) continue; @@ -405,8 +407,8 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, * Check if we need to reduce the range, or if we can * extend it to cover the previous available slice. */ - if (addr < PAGE_SIZE) - addr = PAGE_SIZE; + if (addr < min_addr) + addr = min_addr; else if (slice_scan_available(addr - 1, available, 0, &prev)) { addr = prev; goto prev_slice; @@ -528,7 +530,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, addr = _ALIGN_UP(addr, page_size); slice_dbg(" aligned addr=%lx\n", addr); /* Ignore hint if it's too large or overlaps a VMA */ - if (addr > high_limit - len || + if (addr > high_limit - len || addr < mmap_min_addr || !slice_area_is_free(mm, addr, len)) addr = 0; } diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c @@ -302,7 +302,7 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, * This function as well as __local_flush_tlb_page() must only be called * for user contexts. */ - if (unlikely(WARN_ON(!mm))) + if (WARN_ON(!mm)) return; preempt_disable(); diff --git a/arch/powerpc/net/bpf_jit32.h b/arch/powerpc/net/bpf_jit32.h @@ -106,9 +106,8 @@ DECLARE_LOAD_FUNC(sk_load_byte_msh); } while (0) #else #define PPC_BPF_LOAD_CPU(r) \ - do { BUILD_BUG_ON(FIELD_SIZEOF(struct thread_info, cpu) != 4); \ - PPC_LHZ_OFFS(r, (1 & ~(THREAD_SIZE - 1)), \ - offsetof(struct thread_info, cpu)); \ + do { BUILD_BUG_ON(FIELD_SIZEOF(struct task_struct, cpu) != 4); \ + PPC_LHZ_OFFS(r, 2, offsetof(struct task_struct, cpu)); \ } while(0) #endif #else diff --git a/arch/powerpc/perf/power9-events-list.h b/arch/powerpc/perf/power9-events-list.h @@ -97,3 +97,27 @@ EVENT(PM_MRK_DTLB_MISS_64K, 0x3d156) EVENT(PM_DTLB_MISS_16M, 0x4c056) EVENT(PM_DTLB_MISS_1G, 0x4c05a) EVENT(PM_MRK_DTLB_MISS_16M, 0x4c15e) + +/* + * Memory Access Events + * + * Primary PMU event used here is PM_MRK_INST_CMPL (0x401e0) + * To enable capturing of memory profiling, these MMCRA bits + * needs to be programmed and corresponding raw event format + * encoding. + * + * MMCRA bits encoding needed are + * SM (Sampling Mode) + * EM (Eligibility for Random Sampling) + * TECE (Threshold Event Counter Event) + * TS (Threshold Start Event) + * TE (Threshold End Event) + * + * Corresponding Raw Encoding bits: + * sample [EM,SM] + * thresh_sel (TECE) + * thresh start (TS) + * thresh end (TE) + */ +EVENT(MEM_LOADS, 0x34340401e0) +EVENT(MEM_STORES, 0x343c0401e0) diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c @@ -160,6 +160,8 @@ GENERIC_EVENT_ATTR(branch-instructions, PM_BR_CMPL); GENERIC_EVENT_ATTR(branch-misses, PM_BR_MPRED_CMPL); GENERIC_EVENT_ATTR(cache-references, PM_LD_REF_L1); GENERIC_EVENT_ATTR(cache-misses, PM_LD_MISS_L1_FIN); +GENERIC_EVENT_ATTR(mem-loads, MEM_LOADS); +GENERIC_EVENT_ATTR(mem-stores, MEM_STORES); CACHE_EVENT_ATTR(L1-dcache-load-misses, PM_LD_MISS_L1_FIN); CACHE_EVENT_ATTR(L1-dcache-loads, PM_LD_REF_L1); @@ -185,6 +187,8 @@ static struct attribute *power9_events_attr[] = { GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL), GENERIC_EVENT_PTR(PM_LD_REF_L1), GENERIC_EVENT_PTR(PM_LD_MISS_L1_FIN), + GENERIC_EVENT_PTR(MEM_LOADS), + GENERIC_EVENT_PTR(MEM_STORES), CACHE_EVENT_PTR(PM_LD_MISS_L1_FIN), CACHE_EVENT_PTR(PM_LD_REF_L1), CACHE_EVENT_PTR(PM_L1_PREF), diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig @@ -180,6 +180,7 @@ config CURRITUCK depends on PPC_47x select SWIOTLB select 476FPE + select FORCE_PCI select PPC4xx_PCI_EXPRESS help This option enables support for the IBM Currituck (476fpe) evaluation board diff --git a/arch/powerpc/platforms/44x/ppc476.c b/arch/powerpc/platforms/44x/ppc476.c @@ -34,6 +34,7 @@ #include <asm/ppc4xx.h> #include <asm/mpic.h> #include <asm/mmu.h> +#include <asm/swiotlb.h> #include <linux/pci.h> #include <linux/i2c.h> diff --git a/arch/powerpc/platforms/44x/warp.c b/arch/powerpc/platforms/44x/warp.c @@ -47,7 +47,7 @@ static int __init warp_probe(void) if (!of_machine_is_compatible("pika,warp")) return 0; - /* For __dma_nommu_alloc_coherent */ + /* For arch_dma_alloc */ ISA_DMA_THRESHOLD = ~0L; return 1; diff --git a/arch/powerpc/platforms/83xx/suspend-asm.S b/arch/powerpc/platforms/83xx/suspend-asm.S @@ -26,13 +26,13 @@ #define SS_MSR 0x74 #define SS_SDR1 0x78 #define SS_LR 0x7c -#define SS_SPRG 0x80 /* 4 SPRGs */ -#define SS_DBAT 0x90 /* 8 DBATs */ -#define SS_IBAT 0xd0 /* 8 IBATs */ -#define SS_TB 0x110 -#define SS_CR 0x118 -#define SS_GPREG 0x11c /* r12-r31 */ -#define STATE_SAVE_SIZE 0x16c +#define SS_SPRG 0x80 /* 8 SPRGs */ +#define SS_DBAT 0xa0 /* 8 DBATs */ +#define SS_IBAT 0xe0 /* 8 IBATs */ +#define SS_TB 0x120 +#define SS_CR 0x128 +#define SS_GPREG 0x12c /* r12-r31 */ +#define STATE_SAVE_SIZE 0x17c .section .data .align 5 @@ -103,6 +103,16 @@ _GLOBAL(mpc83xx_enter_deep_sleep) stw r7, SS_SPRG+12(r3) stw r8, SS_SDR1(r3) + mfspr r4, SPRN_SPRG4 + mfspr r5, SPRN_SPRG5 + mfspr r6, SPRN_SPRG6 + mfspr r7, SPRN_SPRG7 + + stw r4, SS_SPRG+16(r3) + stw r5, SS_SPRG+20(r3) + stw r6, SS_SPRG+24(r3) + stw r7, SS_SPRG+28(r3) + mfspr r4, SPRN_DBAT0U mfspr r5, SPRN_DBAT0L mfspr r6, SPRN_DBAT1U @@ -493,6 +503,16 @@ mpc83xx_deep_resume: mtspr SPRN_IBAT7U, r6 mtspr SPRN_IBAT7L, r7 + lwz r4, SS_SPRG+16(r3) + lwz r5, SS_SPRG+20(r3) + lwz r6, SS_SPRG+24(r3) + lwz r7, SS_SPRG+28(r3) + + mtspr SPRN_SPRG4, r4 + mtspr SPRN_SPRG5, r5 + mtspr SPRN_SPRG6, r6 + mtspr SPRN_SPRG7, r7 + lwz r4, SS_SPRG+0(r3) lwz r5, SS_SPRG+4(r3) lwz r6, SS_SPRG+8(r3) diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c @@ -27,6 +27,7 @@ #include <asm/udbg.h> #include <asm/mpic.h> #include <asm/ehv_pic.h> +#include <asm/swiotlb.h> #include <soc/fsl/qe/qe_ic.h> #include <linux/of_platform.h> @@ -223,7 +224,3 @@ define_machine(corenet_generic) { }; machine_arch_initcall(corenet_generic, corenet_gen_publish_devices); - -#ifdef CONFIG_SWIOTLB -machine_arch_initcall(corenet_generic, swiotlb_setup_bus_notifier); -#endif diff --git a/arch/powerpc/platforms/85xx/ge_imp3a.c b/arch/powerpc/platforms/85xx/ge_imp3a.c @@ -202,8 +202,6 @@ static int __init ge_imp3a_probe(void) machine_arch_initcall(ge_imp3a, mpc85xx_common_publish_devices); -machine_arch_initcall(ge_imp3a, swiotlb_setup_bus_notifier); - define_machine(ge_imp3a) { .name = "GE_IMP3A", .probe = ge_imp3a_probe, diff --git a/arch/powerpc/platforms/85xx/mpc8536_ds.c b/arch/powerpc/platforms/85xx/mpc8536_ds.c @@ -57,8 +57,6 @@ static void __init mpc8536_ds_setup_arch(void) machine_arch_initcall(mpc8536_ds, mpc85xx_common_publish_devices); -machine_arch_initcall(mpc8536_ds, swiotlb_setup_bus_notifier); - /* * Called very early, device-tree isn't unflattened */ diff --git a/arch/powerpc/platforms/85xx/mpc85xx_ds.c b/arch/powerpc/platforms/85xx/mpc85xx_ds.c @@ -174,10 +174,6 @@ machine_arch_initcall(mpc8544_ds, mpc85xx_common_publish_devices); machine_arch_initcall(mpc8572_ds, mpc85xx_common_publish_devices); machine_arch_initcall(p2020_ds, mpc85xx_common_publish_devices); -machine_arch_initcall(mpc8544_ds, swiotlb_setup_bus_notifier); -machine_arch_initcall(mpc8572_ds, swiotlb_setup_bus_notifier); -machine_arch_initcall(p2020_ds, swiotlb_setup_bus_notifier); - /* * Called very early, device-tree isn't unflattened */ diff --git a/arch/powerpc/platforms/85xx/mpc85xx_mds.c b/arch/powerpc/platforms/85xx/mpc85xx_mds.c @@ -367,10 +367,6 @@ machine_arch_initcall(mpc8568_mds, mpc85xx_publish_devices); machine_arch_initcall(mpc8569_mds, mpc85xx_publish_devices); machine_arch_initcall(p1021_mds, mpc85xx_common_publish_devices); -machine_arch_initcall(mpc8568_mds, swiotlb_setup_bus_notifier); -machine_arch_initcall(mpc8569_mds, swiotlb_setup_bus_notifier); -machine_arch_initcall(p1021_mds, swiotlb_setup_bus_notifier); - static void __init mpc85xx_mds_pic_init(void) { struct mpic *mpic = mpic_alloc(NULL, 0, MPIC_BIG_ENDIAN | diff --git a/arch/powerpc/platforms/85xx/p1010rdb.c b/arch/powerpc/platforms/85xx/p1010rdb.c @@ -55,7 +55,6 @@ static void __init p1010_rdb_setup_arch(void) } machine_arch_initcall(p1010_rdb, mpc85xx_common_publish_devices); -machine_arch_initcall(p1010_rdb, swiotlb_setup_bus_notifier); /* * Called very early, device-tree isn't unflattened diff --git a/arch/powerpc/platforms/85xx/p1022_ds.c b/arch/powerpc/platforms/85xx/p1022_ds.c @@ -548,8 +548,6 @@ static void __init p1022_ds_setup_arch(void) machine_arch_initcall(p1022_ds, mpc85xx_common_publish_devices); -machine_arch_initcall(p1022_ds, swiotlb_setup_bus_notifier); - /* * Called very early, device-tree isn't unflattened */ diff --git a/arch/powerpc/platforms/85xx/p1022_rdk.c b/arch/powerpc/platforms/85xx/p1022_rdk.c @@ -128,8 +128,6 @@ static void __init p1022_rdk_setup_arch(void) machine_arch_initcall(p1022_rdk, mpc85xx_common_publish_devices); -machine_arch_initcall(p1022_rdk, swiotlb_setup_bus_notifier); - /* * Called very early, device-tree isn't unflattened */ diff --git a/arch/powerpc/platforms/85xx/qemu_e500.c b/arch/powerpc/platforms/85xx/qemu_e500.c @@ -22,6 +22,7 @@ #include <asm/time.h> #include <asm/udbg.h> #include <asm/mpic.h> +#include <asm/swiotlb.h> #include <sysdev/fsl_soc.h> #include <sysdev/fsl_pci.h> #include "smp.h" diff --git a/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c b/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c @@ -121,7 +121,6 @@ static int __init declare_of_platform_devices(void) return 0; } machine_arch_initcall(mpc86xx_hpcn, declare_of_platform_devices); -machine_arch_initcall(mpc86xx_hpcn, swiotlb_setup_bus_notifier); define_machine(mpc86xx_hpcn) { .name = "MPC86xx HPCN", diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype @@ -153,6 +153,11 @@ config E300C3_CPU bool "e300c3 (831x)" depends on PPC_BOOK3S_32 +config G4_CPU + bool "G4 (74xx)" + depends on PPC_BOOK3S_32 + select ALTIVEC + endchoice config TARGET_CPU_BOOL @@ -171,6 +176,7 @@ config TARGET_CPU default "860" if 860_CPU default "e300c2" if E300C2_CPU default "e300c3" if E300C3_CPU + default "G4" if G4_CPU config PPC_BOOK3S def_bool y @@ -402,6 +408,9 @@ config NOT_COHERENT_CACHE bool depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || \ GAMECUBE_COMMON || AMIGAONE + select ARCH_HAS_DMA_COHERENT_TO_PFN + select ARCH_HAS_SYNC_DMA_FOR_DEVICE + select ARCH_HAS_SYNC_DMA_FOR_CPU default n if PPC_47x default y diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c @@ -544,9 +544,10 @@ static struct cbe_iommu *cell_iommu_for_node(int nid) static unsigned long cell_dma_nommu_offset; static unsigned long dma_iommu_fixed_base; +static bool cell_iommu_enabled; /* iommu_fixed_is_weak is set if booted with iommu_fixed=weak */ -static int iommu_fixed_is_weak; +bool iommu_fixed_is_weak; static struct iommu_table *cell_get_iommu_table(struct device *dev) { @@ -568,102 +569,19 @@ static struct iommu_table *cell_get_iommu_table(struct device *dev) return &window->table; } -/* A coherent allocation implies strong ordering */ - -static void *dma_fixed_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - unsigned long attrs) -{ - if (iommu_fixed_is_weak) - return iommu_alloc_coherent(dev, cell_get_iommu_table(dev), - size, dma_handle, - device_to_mask(dev), flag, - dev_to_node(dev)); - else - return dma_nommu_ops.alloc(dev, size, dma_handle, flag, - attrs); -} - -static void dma_fixed_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - unsigned long attrs) -{ - if (iommu_fixed_is_weak) - iommu_free_coherent(cell_get_iommu_table(dev), size, vaddr, - dma_handle); - else - dma_nommu_ops.free(dev, size, vaddr, dma_handle, attrs); -} - -static dma_addr_t dma_fixed_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction, - unsigned long attrs) -{ - if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING)) - return dma_nommu_ops.map_page(dev, page, offset, size, - direction, attrs); - else - return iommu_map_page(dev, cell_get_iommu_table(dev), page, - offset, size, device_to_mask(dev), - direction, attrs); -} - -static void dma_fixed_unmap_page(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction direction, - unsigned long attrs) -{ - if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING)) - dma_nommu_ops.unmap_page(dev, dma_addr, size, direction, - attrs); - else - iommu_unmap_page(cell_get_iommu_table(dev), dma_addr, size, - direction, attrs); -} - -static int dma_fixed_map_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction, - unsigned long attrs) -{ - if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING)) - return dma_nommu_ops.map_sg(dev, sg, nents, direction, attrs); - else - return ppc_iommu_map_sg(dev, cell_get_iommu_table(dev), sg, - nents, device_to_mask(dev), - direction, attrs); -} - -static void dma_fixed_unmap_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction, - unsigned long attrs) -{ - if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING)) - dma_nommu_ops.unmap_sg(dev, sg, nents, direction, attrs); - else - ppc_iommu_unmap_sg(cell_get_iommu_table(dev), sg, nents, - direction, attrs); -} - -static int dma_suported_and_switch(struct device *dev, u64 dma_mask); - -static const struct dma_map_ops dma_iommu_fixed_ops = { - .alloc = dma_fixed_alloc_coherent, - .free = dma_fixed_free_coherent, - .map_sg = dma_fixed_map_sg, - .unmap_sg = dma_fixed_unmap_sg, - .dma_supported = dma_suported_and_switch, - .map_page = dma_fixed_map_page, - .unmap_page = dma_fixed_unmap_page, -}; +static u64 cell_iommu_get_fixed_address(struct device *dev); static void cell_dma_dev_setup(struct device *dev) { - if (get_pci_dma_ops() == &dma_iommu_ops) + if (cell_iommu_enabled) { + u64 addr = cell_iommu_get_fixed_address(dev); + + if (addr != OF_BAD_ADDR) + dev->archdata.dma_offset = addr + dma_iommu_fixed_base; set_iommu_table_base(dev, cell_get_iommu_table(dev)); - else if (get_pci_dma_ops() == &dma_nommu_ops) - set_dma_offset(dev, cell_dma_nommu_offset); - else - BUG(); + } else { + dev->archdata.dma_offset = cell_dma_nommu_offset; + } } static void cell_pci_dma_dev_setup(struct pci_dev *dev) @@ -680,11 +598,9 @@ static int cell_of_bus_notify(struct notifier_block *nb, unsigned long action, if (action != BUS_NOTIFY_ADD_DEVICE) return 0; - /* We use the PCI DMA ops */ - dev->dma_ops = get_pci_dma_ops(); - + if (cell_iommu_enabled) + dev->dma_ops = &dma_iommu_ops; cell_dma_dev_setup(dev); - return 0; } @@ -809,7 +725,6 @@ static int __init cell_iommu_init_disabled(void) unsigned long base = 0, size; /* When no iommu is present, we use direct DMA ops */ - set_pci_dma_ops(&dma_nommu_ops); /* First make sure all IOC translation is turned off */ cell_disable_iommus(); @@ -894,7 +809,11 @@ static u64 cell_iommu_get_fixed_address(struct device *dev) const u32 *ranges = NULL; int i, len, best, naddr, nsize, pna, range_size; + /* We can be called for platform devices that have no of_node */ np = of_node_get(dev->of_node); + if (!np) + goto out; + while (1) { naddr = of_n_addr_cells(np); nsize = of_n_size_cells(np); @@ -945,27 +864,10 @@ out: return dev_addr; } -static int dma_suported_and_switch(struct device *dev, u64 dma_mask) +static bool cell_pci_iommu_bypass_supported(struct pci_dev *pdev, u64 mask) { - if (dma_mask == DMA_BIT_MASK(64) && - cell_iommu_get_fixed_address(dev) != OF_BAD_ADDR) { - u64 addr = cell_iommu_get_fixed_address(dev) + - dma_iommu_fixed_base; - dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n"); - dev_dbg(dev, "iommu: fixed addr = %llx\n", addr); - set_dma_ops(dev, &dma_iommu_fixed_ops); - set_dma_offset(dev, addr); - return 1; - } - - if (dma_iommu_dma_supported(dev, dma_mask)) { - dev_dbg(dev, "iommu: not 64-bit, using default ops\n"); - set_dma_ops(dev, get_pci_dma_ops()); - cell_dma_dev_setup(dev); - return 1; - } - - return 0; + return mask == DMA_BIT_MASK(64) && + cell_iommu_get_fixed_address(&pdev->dev) != OF_BAD_ADDR; } static void insert_16M_pte(unsigned long addr, unsigned long *ptab, @@ -1119,9 +1021,8 @@ static int __init cell_iommu_fixed_mapping_init(void) cell_iommu_setup_window(iommu, np, dbase, dsize, 0); } - dma_iommu_ops.dma_supported = dma_suported_and_switch; - set_pci_dma_ops(&dma_iommu_ops); - + cell_pci_controller_ops.iommu_bypass_supported = + cell_pci_iommu_bypass_supported; return 0; } @@ -1142,7 +1043,7 @@ static int __init setup_iommu_fixed(char *str) pciep = of_find_node_by_type(NULL, "pcie-endpoint"); if (strcmp(str, "weak") == 0 || (pciep && strcmp(str, "strong") != 0)) - iommu_fixed_is_weak = DMA_ATTR_WEAK_ORDERING; + iommu_fixed_is_weak = true; of_node_put(pciep); @@ -1150,26 +1051,6 @@ static int __init setup_iommu_fixed(char *str) } __setup("iommu_fixed=", setup_iommu_fixed); -static u64 cell_dma_get_required_mask(struct device *dev) -{ - const struct dma_map_ops *dma_ops; - - if (!dev->dma_mask) - return 0; - - if (!iommu_fixed_disabled && - cell_iommu_get_fixed_address(dev) != OF_BAD_ADDR) - return DMA_BIT_MASK(64); - - dma_ops = get_dma_ops(dev); - if (dma_ops->get_required_mask) - return dma_ops->get_required_mask(dev); - - WARN_ONCE(1, "no get_required_mask in %p ops", dma_ops); - - return DMA_BIT_MASK(64); -} - static int __init cell_iommu_init(void) { struct device_node *np; @@ -1186,10 +1067,9 @@ static int __init cell_iommu_init(void) /* Setup various callbacks */ cell_pci_controller_ops.dma_dev_setup = cell_pci_dma_dev_setup; - ppc_md.dma_get_required_mask = cell_dma_get_required_mask; if (!iommu_fixed_disabled && cell_iommu_fixed_mapping_init() == 0) - goto bail; + goto done; /* Create an iommu for each /axon node. */ for_each_node_by_name(np, "axon") { @@ -1206,10 +1086,10 @@ static int __init cell_iommu_init(void) continue; cell_iommu_init_one(np, SPIDER_DMA_OFFSET); } - + done: /* Setup default PCI iommu ops */ set_pci_dma_ops(&dma_iommu_ops); - + cell_iommu_enabled = true; bail: /* Register callbacks on OF platform device addition/removal * to handle linking them to the right DMA operations diff --git a/arch/powerpc/platforms/cell/spu_callbacks.c b/arch/powerpc/platforms/cell/spu_callbacks.c @@ -34,7 +34,7 @@ */ static void *spu_syscall_table[] = { -#define __SYSCALL(nr, entry, nargs) entry, +#define __SYSCALL(nr, entry) entry, #include <asm/syscall_table_spu.h> #undef __SYSCALL }; diff --git a/arch/powerpc/platforms/cell/spu_syscalls.c b/arch/powerpc/platforms/cell/spu_syscalls.c @@ -26,7 +26,6 @@ #include <linux/syscalls.h> #include <linux/rcupdate.h> #include <linux/binfmts.h> -#include <linux/syscalls.h> #include <asm/spu.h> diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c @@ -2338,9 +2338,8 @@ static int spufs_switch_log_open(struct inode *inode, struct file *file) goto out; } - ctx->switch_log = kmalloc(sizeof(struct switch_log) + - SWITCH_LOG_BUFSIZE * sizeof(struct switch_log_entry), - GFP_KERNEL); + ctx->switch_log = kmalloc(struct_size(ctx->switch_log, log, + SWITCH_LOG_BUFSIZE), GFP_KERNEL); if (!ctx->switch_log) { rc = -ENOMEM; diff --git a/arch/powerpc/platforms/embedded6xx/wii.c b/arch/powerpc/platforms/embedded6xx/wii.c @@ -54,10 +54,6 @@ static void __iomem *hw_ctrl; static void __iomem *hw_gpio; -unsigned long wii_hole_start; -unsigned long wii_hole_size; - - static int __init page_aligned(unsigned long x) { return !(x & (PAGE_SIZE-1)); @@ -69,26 +65,6 @@ void __init wii_memory_fixups(void) BUG_ON(memblock.memory.cnt != 2); BUG_ON(!page_aligned(p[0].base) || !page_aligned(p[1].base)); - - /* determine hole */ - wii_hole_start = ALIGN(p[0].base + p[0].size, PAGE_SIZE); - wii_hole_size = p[1].base - wii_hole_start; -} - -unsigned long __init wii_mmu_mapin_mem2(unsigned long top) -{ - unsigned long delta, size, bl; - unsigned long max_size = (256<<20); - - /* MEM2 64MB@0x10000000 */ - delta = wii_hole_start + wii_hole_size; - size = top - delta; - for (bl = 128<<10; bl < max_size; bl <<= 1) { - if (bl * 2 > size) - break; - } - setbat(4, PAGE_OFFSET+delta, delta, bl, PAGE_KERNEL_X); - return delta + bl; } static void __noreturn wii_spin(void) diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c @@ -186,7 +186,7 @@ static void pci_dma_dev_setup_pasemi(struct pci_dev *dev) */ if (dev->vendor == 0x1959 && dev->device == 0xa007 && !firmware_has_feature(FW_FEATURE_LPAR)) { - dev->dev.dma_ops = &dma_nommu_ops; + dev->dev.dma_ops = NULL; /* * Set the coherent DMA mask to prevent the iommu * being used unnecessarily diff --git a/arch/powerpc/platforms/pasemi/setup.c b/arch/powerpc/platforms/pasemi/setup.c @@ -411,55 +411,6 @@ out: return !!(srr1 & 0x2); } -#ifdef CONFIG_PCMCIA -static int pcmcia_notify(struct notifier_block *nb, unsigned long action, - void *data) -{ - struct device *dev = data; - struct device *parent; - struct pcmcia_device *pdev = to_pcmcia_dev(dev); - - /* We are only intereted in device addition */ - if (action != BUS_NOTIFY_ADD_DEVICE) - return 0; - - parent = pdev->socket->dev.parent; - - /* We know electra_cf devices will always have of_node set, since - * electra_cf is an of_platform driver. - */ - if (!parent->of_node) - return 0; - - if (!of_device_is_compatible(parent->of_node, "electra-cf")) - return 0; - - /* We use the direct ops for localbus */ - dev->dma_ops = &dma_nommu_ops; - - return 0; -} - -static struct notifier_block pcmcia_notifier = { - .notifier_call = pcmcia_notify, -}; - -static inline void pasemi_pcmcia_init(void) -{ - extern struct bus_type pcmcia_bus_type; - - bus_register_notifier(&pcmcia_bus_type, &pcmcia_notifier); -} - -#else - -static inline void pasemi_pcmcia_init(void) -{ -} - -#endif - - static const struct of_device_id pasemi_bus_ids[] = { /* Unfortunately needed for legacy firmwares */ { .type = "localbus", }, @@ -472,8 +423,6 @@ static const struct of_device_id pasemi_bus_ids[] = { static int __init pasemi_publish_devices(void) { - pasemi_pcmcia_init(); - /* Publish OF platform devices for SDC and other non-PCI devices */ of_platform_bus_probe(NULL, pasemi_bus_ids, NULL); diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += setup.o opal-wrappers.o opal.o opal-async.o idle.o -obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o +obj-y += setup.o opal-call.o opal-wrappers.o opal.o opal-async.o +obj-y += idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o @@ -11,7 +11,6 @@ obj-$(CONFIG_CXL_BASE) += pci-cxl.o obj-$(CONFIG_EEH) += eeh-powernv.o obj-$(CONFIG_PPC_SCOM) += opal-xscom.o obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o -obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o obj-$(CONFIG_OPAL_PRD) += opal-prd.o obj-$(CONFIG_PERF_EVENTS) += opal-imc.o obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c @@ -458,7 +458,8 @@ EXPORT_SYMBOL_GPL(pnv_power9_force_smt4_release); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_HOTPLUG_CPU -static void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) + +void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) { u64 pir = get_hard_smp_processor_id(cpu); @@ -481,20 +482,6 @@ unsigned long pnv_cpu_offline(unsigned int cpu) { unsigned long srr1; u32 idle_states = pnv_get_supported_cpuidle_states(); - u64 lpcr_val; - - /* - * We don't want to take decrementer interrupts while we are - * offline, so clear LPCR:PECE1. We keep PECE2 (and - * LPCR_PECE_HVEE on P9) enabled as to let IPIs in. - * - * If the CPU gets woken up by a special wakeup, ensure that - * the SLW engine sets LPCR with decrementer bit cleared, else - * the CPU will come back to the kernel due to a spurious - * wakeup. - */ - lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1; - pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); __ppc64_runlatch_off(); @@ -526,16 +513,6 @@ unsigned long pnv_cpu_offline(unsigned int cpu) __ppc64_runlatch_on(); - /* - * Re-enable decrementer interrupts in LPCR. - * - * Further, we want stop states to be woken up by decrementer - * for non-hotplug cases. So program the LPCR via stop api as - * well. - */ - lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1; - pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); - return srr1; } #endif diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c @@ -28,10 +28,6 @@ */ static DEFINE_SPINLOCK(npu_context_lock); -/* - * Other types of TCE cache invalidation are not functional in the - * hardware. - */ static struct pci_dev *get_pci_dev(struct device_node *dn) { struct pci_dn *pdn = PCI_DN(dn); @@ -220,7 +216,7 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe) * their parent device so drivers shouldn't be doing DMA * operations directly on these devices. */ - set_dma_ops(&npe->pdev->dev, NULL); + set_dma_ops(&npe->pdev->dev, &dma_dummy_ops); } /* @@ -917,15 +913,6 @@ static void pnv_npu2_mn_release(struct mmu_notifier *mn, mmio_invalidate(npu_context, 0, ~0UL); } -static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address, - pte_t pte) -{ - struct npu_context *npu_context = mn_to_npu_context(mn); - mmio_invalidate(npu_context, address, PAGE_SIZE); -} - static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) @@ -936,7 +923,6 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { .release = pnv_npu2_mn_release, - .change_pte = pnv_npu2_mn_change_pte, .invalidate_range = pnv_npu2_mn_invalidate_range, }; diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/percpu.h> +#include <linux/jump_label.h> +#include <asm/opal-api.h> +#include <asm/trace.h> +#include <asm/asm-prototypes.h> + +#ifdef CONFIG_TRACEPOINTS +/* + * Since the tracing code might execute OPAL calls we need to guard against + * recursion. + */ +static DEFINE_PER_CPU(unsigned int, opal_trace_depth); + +static void __trace_opal_entry(s64 a0, s64 a1, s64 a2, s64 a3, + s64 a4, s64 a5, s64 a6, s64 a7, + unsigned long opcode) +{ + unsigned int *depth; + unsigned long args[8]; + + depth = this_cpu_ptr(&opal_trace_depth); + + if (*depth) + return; + + args[0] = a0; + args[1] = a1; + args[2] = a2; + args[3] = a3; + args[4] = a4; + args[5] = a5; + args[6] = a6; + args[7] = a7; + + (*depth)++; + trace_opal_entry(opcode, &args[0]); + (*depth)--; +} + +static void __trace_opal_exit(unsigned long opcode, unsigned long retval) +{ + unsigned int *depth; + + depth = this_cpu_ptr(&opal_trace_depth); + + if (*depth) + return; + + (*depth)++; + trace_opal_exit(opcode, retval); + (*depth)--; +} + +static DEFINE_STATIC_KEY_FALSE(opal_tracepoint_key); + +int opal_tracepoint_regfunc(void) +{ + static_branch_inc(&opal_tracepoint_key); + return 0; +} + +void opal_tracepoint_unregfunc(void) +{ + static_branch_dec(&opal_tracepoint_key); +} + +static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3, + s64 a4, s64 a5, s64 a6, s64 a7, + unsigned long opcode, unsigned long msr) +{ + s64 ret; + + __trace_opal_entry(a0, a1, a2, a3, a4, a5, a6, a7, opcode); + ret = __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr); + __trace_opal_exit(opcode, ret); + + return ret; +} + +#define DO_TRACE (static_branch_unlikely(&opal_tracepoint_key)) + +#else /* CONFIG_TRACEPOINTS */ + +static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3, + s64 a4, s64 a5, s64 a6, s64 a7, + unsigned long opcode, unsigned long msr) +{ +} + +#define DO_TRACE false +#endif /* CONFIG_TRACEPOINTS */ + +static int64_t opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3, + int64_t a4, int64_t a5, int64_t a6, int64_t a7, int64_t opcode) +{ + unsigned long flags; + unsigned long msr = mfmsr(); + bool mmu = (msr & (MSR_IR|MSR_DR)); + int64_t ret; + + msr &= ~MSR_EE; + + if (unlikely(!mmu)) + return __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr); + + local_save_flags(flags); + hard_irq_disable(); + + if (DO_TRACE) { + ret = __opal_call_trace(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr); + } else { + ret = __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr); + } + + local_irq_restore(flags); + + return ret; +} + +#define OPAL_CALL(name, opcode) \ +int64_t name(int64_t a0, int64_t a1, int64_t a2, int64_t a3, \ + int64_t a4, int64_t a5, int64_t a6, int64_t a7) \ +{ \ + return opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode); \ +} + +OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL); +OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); +OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ); +OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE); +OPAL_CALL(opal_rtc_read, OPAL_RTC_READ); +OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE); +OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN); +OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT); +OPAL_CALL(opal_cec_reboot2, OPAL_CEC_REBOOT2); +OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM); +OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM); +OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT); +OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS); +OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY); +OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY); +OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE); +OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD); +OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD); +OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE); +OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD); +OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD); +OPAL_CALL(opal_set_xive, OPAL_SET_XIVE); +OPAL_CALL(opal_get_xive, OPAL_GET_XIVE); +OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER); +OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS); +OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR); +OPAL_CALL(opal_pci_eeh_freeze_set, OPAL_PCI_EEH_FREEZE_SET); +OPAL_CALL(opal_pci_err_inject, OPAL_PCI_ERR_INJECT); +OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC); +OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE); +OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW); +OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW); +OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY); +OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE); +OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV); +OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE); +OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE); +OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE); +OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE); +OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE); +OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE); +OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32); +OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64); +OPAL_CALL(opal_start_cpu, OPAL_START_CPU); +OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS); +OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL); +OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW); +OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL); +OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET); +OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA); +OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA); +OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB); +OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT); +OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR); +OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS); +OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS); +OPAL_CALL(opal_get_dpo_status, OPAL_GET_DPO_STATUS); +OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED); +OPAL_CALL(opal_pci_next_error, OPAL_PCI_NEXT_ERROR); +OPAL_CALL(opal_pci_poll, OPAL_PCI_POLL); +OPAL_CALL(opal_pci_msi_eoi, OPAL_PCI_MSI_EOI); +OPAL_CALL(opal_pci_get_phb_diag_data2, OPAL_PCI_GET_PHB_DIAG_DATA2); +OPAL_CALL(opal_xscom_read, OPAL_XSCOM_READ); +OPAL_CALL(opal_xscom_write, OPAL_XSCOM_WRITE); +OPAL_CALL(opal_lpc_read, OPAL_LPC_READ); +OPAL_CALL(opal_lpc_write, OPAL_LPC_WRITE); +OPAL_CALL(opal_return_cpu, OPAL_RETURN_CPU); +OPAL_CALL(opal_reinit_cpus, OPAL_REINIT_CPUS); +OPAL_CALL(opal_read_elog, OPAL_ELOG_READ); +OPAL_CALL(opal_send_ack_elog, OPAL_ELOG_ACK); +OPAL_CALL(opal_get_elog_size, OPAL_ELOG_SIZE); +OPAL_CALL(opal_resend_pending_logs, OPAL_ELOG_RESEND); +OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE); +OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE); +OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE); +OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE); +OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE); +OPAL_CALL(opal_check_token, OPAL_CHECK_TOKEN); +OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT); +OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO); +OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2); +OPAL_CALL(opal_dump_read, OPAL_DUMP_READ); +OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK); +OPAL_CALL(opal_get_msg, OPAL_GET_MSG); +OPAL_CALL(opal_write_oppanel_async, OPAL_WRITE_OPPANEL_ASYNC); +OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION); +OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND); +OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT); +OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ); +OPAL_CALL(opal_get_param, OPAL_GET_PARAM); +OPAL_CALL(opal_set_param, OPAL_SET_PARAM); +OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI); +OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE); +OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG); +OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION); +OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION); +OPAL_CALL(opal_pci_set_phb_cxl_mode, OPAL_PCI_SET_PHB_CAPI_MODE); +OPAL_CALL(opal_tpo_write, OPAL_WRITE_TPO); +OPAL_CALL(opal_tpo_read, OPAL_READ_TPO); +OPAL_CALL(opal_ipmi_send, OPAL_IPMI_SEND); +OPAL_CALL(opal_ipmi_recv, OPAL_IPMI_RECV); +OPAL_CALL(opal_i2c_request, OPAL_I2C_REQUEST); +OPAL_CALL(opal_flash_read, OPAL_FLASH_READ); +OPAL_CALL(opal_flash_write, OPAL_FLASH_WRITE); +OPAL_CALL(opal_flash_erase, OPAL_FLASH_ERASE); +OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG); +OPAL_CALL(opal_leds_get_ind, OPAL_LEDS_GET_INDICATOR); +OPAL_CALL(opal_leds_set_ind, OPAL_LEDS_SET_INDICATOR); +OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH); +OPAL_CALL(opal_get_device_tree, OPAL_GET_DEVICE_TREE); +OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE); +OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE); +OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE); +OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR); +OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR); +OPAL_CALL(opal_int_eoi, OPAL_INT_EOI); +OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR); +OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL); +OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR); +OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET); +OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO); +OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG); +OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG); +OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO); +OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO); +OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE); +OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK); +OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK); +OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ); +OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ); +OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); +OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); +OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); +OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); +OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET); +OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); +OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); +OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); +OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT); +OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START); +OPAL_CALL(opal_imc_counters_stop, OPAL_IMC_COUNTERS_STOP); +OPAL_CALL(opal_pci_set_p2p, OPAL_PCI_SET_P2P); +OPAL_CALL(opal_get_powercap, OPAL_GET_POWERCAP); +OPAL_CALL(opal_set_powercap, OPAL_SET_POWERCAP); +OPAL_CALL(opal_get_power_shift_ratio, OPAL_GET_POWER_SHIFT_RATIO); +OPAL_CALL(opal_set_power_shift_ratio, OPAL_SET_POWER_SHIFT_RATIO); +OPAL_CALL(opal_sensor_group_clear, OPAL_SENSOR_GROUP_CLEAR); +OPAL_CALL(opal_quiesce, OPAL_QUIESCE); +OPAL_CALL(opal_npu_spa_setup, OPAL_NPU_SPA_SETUP); +OPAL_CALL(opal_npu_spa_clear_cache, OPAL_NPU_SPA_CLEAR_CACHE); +OPAL_CALL(opal_npu_tl_set, OPAL_NPU_TL_SET); +OPAL_CALL(opal_pci_get_pbcq_tunnel_bar, OPAL_PCI_GET_PBCQ_TUNNEL_BAR); +OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR); +OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64); +OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE); +OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT); diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c @@ -98,7 +98,7 @@ static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj, } static struct bin_attribute opal_msglog_attr = { - .attr = {.name = "msglog", .mode = 0444}, + .attr = {.name = "msglog", .mode = 0400}, .read = opal_msglog_read }; diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -17,317 +17,51 @@ #include <asm/asm-compat.h> #include <asm/feature-fixups.h> - .section ".text" - -#ifdef CONFIG_TRACEPOINTS -#ifdef CONFIG_JUMP_LABEL -#define OPAL_BRANCH(LABEL) \ - ARCH_STATIC_BRANCH(LABEL, opal_tracepoint_key) -#else - - .section ".toc","aw" - - .globl opal_tracepoint_refcount -opal_tracepoint_refcount: - .8byte 0 - - .section ".text" - -/* - * We branch around this in early init by using an unconditional cpu - * feature. - */ -#define OPAL_BRANCH(LABEL) \ -BEGIN_FTR_SECTION; \ - b 1f; \ -END_FTR_SECTION(0, 1); \ - ld r11,opal_tracepoint_refcount@toc(r2); \ - cmpdi r11,0; \ - bne- LABEL; \ -1: - -#endif - -#else -#define OPAL_BRANCH(LABEL) -#endif + .section ".text" /* - * DO_OPAL_CALL assumes: - * r0 = opal call token - * r12 = msr - * LR has been saved + * r3-r10 - OPAL call arguments + * STK_PARAM(R11) - OPAL opcode + * STK_PARAM(R12) - MSR to restore */ -#define DO_OPAL_CALL() \ - mfcr r11; \ - stw r11,8(r1); \ - li r11,0; \ - ori r11,r11,MSR_EE; \ - std r12,PACASAVEDMSR(r13); \ - andc r12,r12,r11; \ - mtmsrd r12,1; \ - LOAD_REG_ADDR(r11,opal_return); \ - mtlr r11; \ - li r11,MSR_DR|MSR_IR|MSR_LE;\ - andc r12,r12,r11; \ - mtspr SPRN_HSRR1,r12; \ - LOAD_REG_ADDR(r11,opal); \ - ld r12,8(r11); \ - ld r2,0(r11); \ - mtspr SPRN_HSRR0,r12; \ +_GLOBAL_TOC(__opal_call) + mflr r0 + std r0,PPC_LR_STKOFF(r1) + ld r12,STK_PARAM(R12)(r1) + li r0,MSR_IR|MSR_DR|MSR_LE + andc r12,r12,r0 + LOAD_REG_ADDR(r11, opal_return) + mtlr r11 + LOAD_REG_ADDR(r11, opal) + ld r2,0(r11) + ld r11,8(r11) + mtspr SPRN_HSRR0,r11 + mtspr SPRN_HSRR1,r12 + /* set token to r0 */ + ld r0,STK_PARAM(R11)(r1) hrfid - -#define OPAL_CALL(name, token) \ - _GLOBAL_TOC(name); \ - mfmsr r12; \ - mflr r0; \ - andi. r11,r12,MSR_IR|MSR_DR; \ - std r0,PPC_LR_STKOFF(r1); \ - li r0,token; \ - beq opal_real_call; \ - OPAL_BRANCH(opal_tracepoint_entry) \ - DO_OPAL_CALL() - - opal_return: /* - * Fixup endian on OPAL return... we should be able to simplify - * this by instead converting the below trampoline to a set of - * bytes (always BE) since MSR:LE will end up fixed up as a side - * effect of the rfid. + * Restore MSR on OPAL return. The MSR is set to big-endian. */ - FIXUP_ENDIAN_HV - ld r2,PACATOC(r13); - lwz r4,8(r1); - ld r5,PPC_LR_STKOFF(r1); - ld r6,PACASAVEDMSR(r13); - mtcr r4; - mtspr SPRN_HSRR0,r5; - mtspr SPRN_HSRR1,r6; - hrfid - -opal_real_call: - mfcr r11 - stw r11,8(r1) - /* Set opal return address */ - LOAD_REG_ADDR(r11, opal_return_realmode) - mtlr r11 - li r11,MSR_LE - andc r12,r12,r11 - mtspr SPRN_HSRR1,r12 - LOAD_REG_ADDR(r11,opal) - ld r12,8(r11) - ld r2,0(r11) - mtspr SPRN_HSRR0,r12 - hrfid - -opal_return_realmode: - FIXUP_ENDIAN_HV - ld r2,PACATOC(r13); - lwz r11,8(r1); - ld r12,PPC_LR_STKOFF(r1) - mtcr r11; - mtlr r12 - blr - -#ifdef CONFIG_TRACEPOINTS -opal_tracepoint_entry: - stdu r1,-STACKFRAMESIZE(r1) - std r0,STK_REG(R23)(r1) - std r3,STK_REG(R24)(r1) - std r4,STK_REG(R25)(r1) - std r5,STK_REG(R26)(r1) - std r6,STK_REG(R27)(r1) - std r7,STK_REG(R28)(r1) - std r8,STK_REG(R29)(r1) - std r9,STK_REG(R30)(r1) - std r10,STK_REG(R31)(r1) - mr r3,r0 - addi r4,r1,STK_REG(R24) - bl __trace_opal_entry - ld r0,STK_REG(R23)(r1) - ld r3,STK_REG(R24)(r1) - ld r4,STK_REG(R25)(r1) - ld r5,STK_REG(R26)(r1) - ld r6,STK_REG(R27)(r1) - ld r7,STK_REG(R28)(r1) - ld r8,STK_REG(R29)(r1) - ld r9,STK_REG(R30)(r1) - ld r10,STK_REG(R31)(r1) - - /* setup LR so we return via tracepoint_return */ - LOAD_REG_ADDR(r11,opal_tracepoint_return) - std r11,16(r1) - - mfmsr r12 - DO_OPAL_CALL() - -opal_tracepoint_return: - std r3,STK_REG(R31)(r1) - mr r4,r3 - ld r3,STK_REG(R23)(r1) - bl __trace_opal_exit - ld r3,STK_REG(R31)(r1) - addi r1,r1,STACKFRAMESIZE - ld r0,16(r1) +#ifdef __BIG_ENDIAN__ + ld r11,STK_PARAM(R12)(r1) + mtmsrd r11 +#else + /* Endian can only be switched with rfi, must byte reverse MSR load */ + .short 0x4039 /* li r10,STK_PARAM(R12) */ + .byte (STK_PARAM(R12) >> 8) & 0xff + .byte STK_PARAM(R12) & 0xff + + .long 0x280c6a7d /* ldbrx r11,r10,r1 */ + .long 0x05009f42 /* bcl 20,31,$+4 */ + .long 0xa602487d /* mflr r10 */ + .long 0x14004a39 /* addi r10,r10,20 */ + .long 0xa64b5a7d /* mthsrr0 r10 */ + .long 0xa64b7b7d /* mthsrr1 r11 */ + .long 0x2402004c /* hrfid */ +#endif + ld r2,PACATOC(r13) + ld r0,PPC_LR_STKOFF(r1) mtlr r0 blr -#endif - - -OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL); -OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); -OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ); -OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE); -OPAL_CALL(opal_rtc_read, OPAL_RTC_READ); -OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE); -OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN); -OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT); -OPAL_CALL(opal_cec_reboot2, OPAL_CEC_REBOOT2); -OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM); -OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM); -OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT); -OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS); -OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY); -OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY); -OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE); -OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD); -OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD); -OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE); -OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD); -OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD); -OPAL_CALL(opal_set_xive, OPAL_SET_XIVE); -OPAL_CALL(opal_get_xive, OPAL_GET_XIVE); -OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER); -OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS); -OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR); -OPAL_CALL(opal_pci_eeh_freeze_set, OPAL_PCI_EEH_FREEZE_SET); -OPAL_CALL(opal_pci_err_inject, OPAL_PCI_ERR_INJECT); -OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC); -OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE); -OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW); -OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW); -OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY); -OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE); -OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV); -OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE); -OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE); -OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE); -OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE); -OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE); -OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE); -OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32); -OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64); -OPAL_CALL(opal_start_cpu, OPAL_START_CPU); -OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS); -OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL); -OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW); -OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL); -OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET); -OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA); -OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA); -OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB); -OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT); -OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR); -OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS); -OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS); -OPAL_CALL(opal_get_dpo_status, OPAL_GET_DPO_STATUS); -OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED); -OPAL_CALL(opal_pci_next_error, OPAL_PCI_NEXT_ERROR); -OPAL_CALL(opal_pci_poll, OPAL_PCI_POLL); -OPAL_CALL(opal_pci_msi_eoi, OPAL_PCI_MSI_EOI); -OPAL_CALL(opal_pci_get_phb_diag_data2, OPAL_PCI_GET_PHB_DIAG_DATA2); -OPAL_CALL(opal_xscom_read, OPAL_XSCOM_READ); -OPAL_CALL(opal_xscom_write, OPAL_XSCOM_WRITE); -OPAL_CALL(opal_lpc_read, OPAL_LPC_READ); -OPAL_CALL(opal_lpc_write, OPAL_LPC_WRITE); -OPAL_CALL(opal_return_cpu, OPAL_RETURN_CPU); -OPAL_CALL(opal_reinit_cpus, OPAL_REINIT_CPUS); -OPAL_CALL(opal_read_elog, OPAL_ELOG_READ); -OPAL_CALL(opal_send_ack_elog, OPAL_ELOG_ACK); -OPAL_CALL(opal_get_elog_size, OPAL_ELOG_SIZE); -OPAL_CALL(opal_resend_pending_logs, OPAL_ELOG_RESEND); -OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE); -OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE); -OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE); -OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE); -OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE); -OPAL_CALL(opal_check_token, OPAL_CHECK_TOKEN); -OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT); -OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO); -OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2); -OPAL_CALL(opal_dump_read, OPAL_DUMP_READ); -OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK); -OPAL_CALL(opal_get_msg, OPAL_GET_MSG); -OPAL_CALL(opal_write_oppanel_async, OPAL_WRITE_OPPANEL_ASYNC); -OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION); -OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND); -OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT); -OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ); -OPAL_CALL(opal_get_param, OPAL_GET_PARAM); -OPAL_CALL(opal_set_param, OPAL_SET_PARAM); -OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI); -OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE); -OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG); -OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION); -OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION); -OPAL_CALL(opal_pci_set_phb_cxl_mode, OPAL_PCI_SET_PHB_CAPI_MODE); -OPAL_CALL(opal_tpo_write, OPAL_WRITE_TPO); -OPAL_CALL(opal_tpo_read, OPAL_READ_TPO); -OPAL_CALL(opal_ipmi_send, OPAL_IPMI_SEND); -OPAL_CALL(opal_ipmi_recv, OPAL_IPMI_RECV); -OPAL_CALL(opal_i2c_request, OPAL_I2C_REQUEST); -OPAL_CALL(opal_flash_read, OPAL_FLASH_READ); -OPAL_CALL(opal_flash_write, OPAL_FLASH_WRITE); -OPAL_CALL(opal_flash_erase, OPAL_FLASH_ERASE); -OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG); -OPAL_CALL(opal_leds_get_ind, OPAL_LEDS_GET_INDICATOR); -OPAL_CALL(opal_leds_set_ind, OPAL_LEDS_SET_INDICATOR); -OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH); -OPAL_CALL(opal_get_device_tree, OPAL_GET_DEVICE_TREE); -OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE); -OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE); -OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE); -OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR); -OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR); -OPAL_CALL(opal_int_eoi, OPAL_INT_EOI); -OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR); -OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL); -OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR); -OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET); -OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO); -OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG); -OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG); -OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO); -OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO); -OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE); -OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK); -OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK); -OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ); -OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ); -OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); -OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); -OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); -OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); -OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET); -OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); -OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); -OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); -OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT); -OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START); -OPAL_CALL(opal_imc_counters_stop, OPAL_IMC_COUNTERS_STOP); -OPAL_CALL(opal_pci_set_p2p, OPAL_PCI_SET_P2P); -OPAL_CALL(opal_get_powercap, OPAL_GET_POWERCAP); -OPAL_CALL(opal_set_powercap, OPAL_SET_POWERCAP); -OPAL_CALL(opal_get_power_shift_ratio, OPAL_GET_POWER_SHIFT_RATIO); -OPAL_CALL(opal_set_power_shift_ratio, OPAL_SET_POWER_SHIFT_RATIO); -OPAL_CALL(opal_sensor_group_clear, OPAL_SENSOR_GROUP_CLEAR); -OPAL_CALL(opal_quiesce, OPAL_QUIESCE); -OPAL_CALL(opal_npu_spa_setup, OPAL_NPU_SPA_SETUP); -OPAL_CALL(opal_npu_spa_clear_cache, OPAL_NPU_SPA_CLEAR_CACHE); -OPAL_CALL(opal_npu_tl_set, OPAL_NPU_TL_SET); -OPAL_CALL(opal_pci_get_pbcq_tunnel_bar, OPAL_PCI_GET_PBCQ_TUNNEL_BAR); -OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR); -OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64); -OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE); -OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c @@ -26,7 +26,6 @@ #include <linux/memblock.h> #include <linux/kthread.h> #include <linux/freezer.h> -#include <linux/printk.h> #include <linux/kmsg_dump.h> #include <linux/console.h> #include <linux/sched/debug.h> @@ -587,7 +586,7 @@ int opal_machine_check(struct pt_regs *regs) evt.version); return 0; } - machine_check_print_event_info(&evt, user_mode(regs)); + machine_check_print_event_info(&evt, user_mode(regs), false); if (opal_recover_mce(regs, &evt)) return 1; diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -313,7 +313,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, page_shift); tbl->it_level_size = 1ULL << (level_shift - 3); tbl->it_indirect_levels = levels - 1; - tbl->it_allocated_size = total_allocated; tbl->it_userspace = uas; tbl->it_nid = nid; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1748,7 +1748,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev pe = &phb->ioda.pe_array[pdn->pe_number]; WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); - set_dma_offset(&pdev->dev, pe->tce_bypass_base); + pdev->dev.archdata.dma_offset = pe->tce_bypass_base; set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]); /* * Note: iommu_add_device() will fail here as @@ -1758,31 +1758,6 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev */ } -static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe) -{ - unsigned short vendor = 0; - struct pci_dev *pdev; - - if (pe->device_count == 1) - return true; - - /* pe->pdev should be set if it's a single device, pe->pbus if not */ - if (!pe->pbus) - return true; - - list_for_each_entry(pdev, &pe->pbus->devices, bus_list) { - if (!vendor) { - vendor = pdev->vendor; - continue; - } - - if (pdev->vendor != vendor) - return false; - } - - return true; -} - /* * Reconfigure TVE#0 to be usable as 64-bit DMA space. * @@ -1852,88 +1827,45 @@ err: return -EIO; } -static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) +static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev, + u64 dma_mask) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); struct pnv_phb *phb = hose->private_data; struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *pe; - uint64_t top; - bool bypass = false; - s64 rc; if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) return -ENODEV; pe = &phb->ioda.pe_array[pdn->pe_number]; if (pe->tce_bypass_enabled) { - top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; - bypass = (dma_mask >= top); + u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; + if (dma_mask >= top) + return true; } - if (bypass) { - dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n"); - set_dma_ops(&pdev->dev, &dma_nommu_ops); - } else { - /* - * If the device can't set the TCE bypass bit but still wants - * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to - * bypass the 32-bit region and be usable for 64-bit DMAs. - * The device needs to be able to address all of this space. - */ - if (dma_mask >> 32 && - dma_mask > (memory_hotplug_max() + (1ULL << 32)) && - pnv_pci_ioda_pe_single_vendor(pe) && - phb->model == PNV_PHB_MODEL_PHB3) { - /* Configure the bypass mode */ - rc = pnv_pci_ioda_dma_64bit_bypass(pe); - if (rc) - return rc; - /* 4GB offset bypasses 32-bit space */ - set_dma_offset(&pdev->dev, (1ULL << 32)); - set_dma_ops(&pdev->dev, &dma_nommu_ops); - } else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) { - /* - * Fail the request if a DMA mask between 32 and 64 bits - * was requested but couldn't be fulfilled. Ideally we - * would do this for 64-bits but historically we have - * always fallen back to 32-bits. - */ - return -ENOMEM; - } else { - dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); - set_dma_ops(&pdev->dev, &dma_iommu_ops); - } + /* + * If the device can't set the TCE bypass bit but still wants + * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to + * bypass the 32-bit region and be usable for 64-bit DMAs. + * The device needs to be able to address all of this space. + */ + if (dma_mask >> 32 && + dma_mask > (memory_hotplug_max() + (1ULL << 32)) && + /* pe->pdev should be set if it's a single device, pe->pbus if not */ + (pe->device_count == 1 || !pe->pbus) && + phb->model == PNV_PHB_MODEL_PHB3) { + /* Configure the bypass mode */ + s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe); + if (rc) + return rc; + /* 4GB offset bypasses 32-bit space */ + pdev->dev.archdata.dma_offset = (1ULL << 32); + return true; } - *pdev->dev.dma_mask = dma_mask; - /* Update peer npu devices */ - pnv_npu_try_dma_set_bypass(pdev, bypass); - - return 0; -} - -static u64 pnv_pci_ioda_dma_get_required_mask(struct pci_dev *pdev) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; - struct pci_dn *pdn = pci_get_pdn(pdev); - struct pnv_ioda_pe *pe; - u64 end, mask; - - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return 0; - - pe = &phb->ioda.pe_array[pdn->pe_number]; - if (!pe->tce_bypass_enabled) - return __dma_get_required_mask(&pdev->dev); - - - end = pe->tce_bypass_base + memblock_end_of_DRAM(); - mask = 1ULL << (fls64(end) - 1); - mask += mask - 1; - - return mask; + return false; } static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) @@ -1942,7 +1874,7 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) list_for_each_entry(dev, &bus->devices, bus_list) { set_iommu_table_base(&dev->dev, pe->table_group.tables[0]); - set_dma_offset(&dev->dev, pe->tce_bypass_base); + dev->dev.archdata.dma_offset = pe->tce_bypass_base; if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) pnv_ioda_setup_bus_dma(pe, dev->subordinate); @@ -2594,8 +2526,13 @@ static long pnv_pci_ioda2_create_table_userspace( int num, __u32 page_shift, __u64 window_size, __u32 levels, struct iommu_table **ptbl) { - return pnv_pci_ioda2_create_table(table_group, + long ret = pnv_pci_ioda2_create_table(table_group, num, page_shift, window_size, levels, true, ptbl); + + if (!ret) + (*ptbl)->it_allocated_size = pnv_pci_ioda2_get_table_size( + page_shift, window_size, levels); + return ret; } static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) @@ -3661,6 +3598,7 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose) static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .dma_dev_setup = pnv_pci_dma_dev_setup, .dma_bus_setup = pnv_pci_dma_bus_setup, + .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported, .setup_msi_irqs = pnv_setup_msi_irqs, .teardown_msi_irqs = pnv_teardown_msi_irqs, .enable_device_hook = pnv_pci_enable_device_hook, @@ -3668,19 +3606,9 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .window_alignment = pnv_pci_window_alignment, .setup_bridge = pnv_pci_setup_bridge, .reset_secondary_bus = pnv_pci_reset_secondary_bus, - .dma_set_mask = pnv_pci_ioda_dma_set_mask, - .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask, .shutdown = pnv_pci_ioda_shutdown, }; -static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask) -{ - dev_err_once(&npdev->dev, - "%s operation unsupported for NVLink devices\n", - __func__); - return -EPERM; -} - static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { .dma_dev_setup = pnv_pci_dma_dev_setup, .setup_msi_irqs = pnv_setup_msi_irqs, @@ -3688,7 +3616,6 @@ static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { .enable_device_hook = pnv_pci_enable_device_hook, .window_alignment = pnv_pci_window_alignment, .reset_secondary_bus = pnv_pci_reset_secondary_bus, - .dma_set_mask = pnv_npu_dma_set_mask, .shutdown = pnv_pci_ioda_shutdown, .disable_device = pnv_npu_disable_device, }; @@ -3946,9 +3873,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, * shutdown PCI devices correctly. We already got IODA table * cleaned out. So we have to issue PHB reset to stop all PCI * transactions from previous kernel. The ppc_pci_reset_phbs - * kernel parameter will force this reset too. + * kernel parameter will force this reset too. Additionally, + * if the IODA reset above failed then use a bigger hammer. + * This can happen if we get a PHB fatal error in very early + * boot. */ - if (is_kdump_kernel() || pci_reset_phbs) { + if (is_kdump_kernel() || pci_reset_phbs || rc) { pr_info(" Issue PHB reset ...\n"); pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL); pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE); diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c @@ -39,6 +39,7 @@ #include <asm/cpuidle.h> #include <asm/kexec.h> #include <asm/reg.h> +#include <asm/powernv.h> #include "powernv.h" @@ -153,6 +154,7 @@ static void pnv_smp_cpu_kill_self(void) { unsigned int cpu; unsigned long srr1, wmask; + u64 lpcr_val; /* Standard hot unplug procedure */ /* @@ -174,6 +176,19 @@ static void pnv_smp_cpu_kill_self(void) if (cpu_has_feature(CPU_FTR_ARCH_207S)) wmask = SRR1_WAKEMASK_P8; + /* + * We don't want to take decrementer interrupts while we are + * offline, so clear LPCR:PECE1. We keep PECE2 (and + * LPCR_PECE_HVEE on P9) enabled so as to let IPIs in. + * + * If the CPU gets woken up by a special wakeup, ensure that + * the SLW engine sets LPCR with decrementer bit cleared, else + * the CPU will come back to the kernel due to a spurious + * wakeup. + */ + lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1; + pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); + while (!generic_check_cpu_restart(cpu)) { /* * Clear IPI flag, since we don't handle IPIs while @@ -246,6 +261,16 @@ static void pnv_smp_cpu_kill_self(void) } + /* + * Re-enable decrementer interrupts in LPCR. + * + * Further, we want stop states to be woken up by decrementer + * for non-hotplug cases. So program the LPCR via stop api as + * well. + */ + lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1; + pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); + DBG("CPU%d coming online...\n", cpu); } diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c @@ -354,9 +354,7 @@ static int ps3_setup_storage_dev(const struct ps3_repository_device *repo, repo->dev_index, repo->dev_type, port, blk_size, num_blocks, num_regions); - p = kzalloc(sizeof(struct ps3_storage_device) + - num_regions * sizeof(struct ps3_storage_region), - GFP_KERNEL); + p = kzalloc(struct_size(p, regions, num_regions), GFP_KERNEL); if (!p) { result = -ENOMEM; goto fail_malloc; diff --git a/arch/powerpc/platforms/ps3/os-area.c b/arch/powerpc/platforms/ps3/os-area.c @@ -205,11 +205,11 @@ static const struct os_area_db_id os_area_db_id_rtc_diff = { * 3) The number of seconds from 1970 to 2000. */ -struct saved_params { +static struct saved_params { unsigned int valid; s64 rtc_diff; unsigned int av_multi_out; -} static saved_params; +} saved_params; static struct property property_rtc_diff = { .name = "linux,rtc_diff", diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c @@ -37,12 +37,12 @@ static struct device ps3_system_bus = { }; /* FIXME: need device usage counters! */ -struct { +static struct { struct mutex mutex; int sb_11; /* usb 0 */ int sb_12; /* usb 0 */ int gpu; -} static usage_hack; +} usage_hack; static int ps3_is_device(struct ps3_system_bus_device *dev, u64 bus_id, u64 dev_id) diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -802,6 +802,25 @@ static int dlpar_cpu_add_by_count(u32 cpus_to_add) return rc; } +int dlpar_cpu_readd(int cpu) +{ + struct device_node *dn; + struct device *dev; + u32 drc_index; + int rc; + + dev = get_cpu_device(cpu); + dn = dev->of_node; + + rc = of_property_read_u32(dn, "ibm,my-drc-index", &drc_index); + + rc = dlpar_cpu_remove_by_index(drc_index); + if (!rc) + rc = dlpar_cpu_add(drc_index); + + return rc; +} + int dlpar_cpu(struct pseries_hp_errorlog *hp_elog) { u32 count, drc_index; diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c @@ -978,7 +978,7 @@ static phys_addr_t ddw_memory_hotplug_max(void) * pdn: the parent pe node with the ibm,dma_window property * Future: also check if we can remap the base window for our base page size * - * returns the dma offset for use by dma_set_mask + * returns the dma offset for use by the direct mapped DMA code. */ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) { @@ -1198,87 +1198,37 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) iommu_add_device(pci->table_group, &dev->dev); } -static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) +static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask) { - bool ddw_enabled = false; - struct device_node *pdn, *dn; - struct pci_dev *pdev; + struct device_node *dn = pci_device_to_OF_node(pdev), *pdn; const __be32 *dma_window = NULL; - u64 dma_offset; - - if (!dev->dma_mask) - return -EIO; - - if (!dev_is_pci(dev)) - goto check_mask; - - pdev = to_pci_dev(dev); /* only attempt to use a new window if 64-bit DMA is requested */ - if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) { - dn = pci_device_to_OF_node(pdev); - dev_dbg(dev, "node is %pOF\n", dn); + if (dma_mask < DMA_BIT_MASK(64)) + return false; - /* - * the device tree might contain the dma-window properties - * per-device and not necessarily for the bus. So we need to - * search upwards in the tree until we either hit a dma-window - * property, OR find a parent with a table already allocated. - */ - for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group; - pdn = pdn->parent) { - dma_window = of_get_property(pdn, "ibm,dma-window", NULL); - if (dma_window) - break; - } - if (pdn && PCI_DN(pdn)) { - dma_offset = enable_ddw(pdev, pdn); - if (dma_offset != 0) { - dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset); - set_dma_offset(dev, dma_offset); - set_dma_ops(dev, &dma_nommu_ops); - ddw_enabled = true; - } - } - } + dev_dbg(&pdev->dev, "node is %pOF\n", dn); - /* fall back on iommu ops */ - if (!ddw_enabled && get_dma_ops(dev) != &dma_iommu_ops) { - dev_info(dev, "Restoring 32-bit DMA via iommu\n"); - set_dma_ops(dev, &dma_iommu_ops); + /* + * the device tree might contain the dma-window properties + * per-device and not necessarily for the bus. So we need to + * search upwards in the tree until we either hit a dma-window + * property, OR find a parent with a table already allocated. + */ + for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group; + pdn = pdn->parent) { + dma_window = of_get_property(pdn, "ibm,dma-window", NULL); + if (dma_window) + break; } -check_mask: - if (!dma_supported(dev, dma_mask)) - return -EIO; - - *dev->dma_mask = dma_mask; - return 0; -} - -static u64 dma_get_required_mask_pSeriesLP(struct device *dev) -{ - if (!dev->dma_mask) - return 0; - - if (!disable_ddw && dev_is_pci(dev)) { - struct pci_dev *pdev = to_pci_dev(dev); - struct device_node *dn; - - dn = pci_device_to_OF_node(pdev); - - /* search upwards for ibm,dma-window */ - for (; dn && PCI_DN(dn) && !PCI_DN(dn)->table_group; - dn = dn->parent) - if (of_get_property(dn, "ibm,dma-window", NULL)) - break; - /* if there is a ibm,ddw-applicable property require 64 bits */ - if (dn && PCI_DN(dn) && - of_get_property(dn, "ibm,ddw-applicable", NULL)) - return DMA_BIT_MASK(64); + if (pdn && PCI_DN(pdn)) { + pdev->dev.archdata.dma_offset = enable_ddw(pdev, pdn); + if (pdev->dev.archdata.dma_offset) + return true; } - return dma_iommu_ops.get_required_mask(dev); + return false; } static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, @@ -1373,8 +1323,9 @@ void iommu_init_early_pSeries(void) if (firmware_has_feature(FW_FEATURE_LPAR)) { pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP; pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP; - ppc_md.dma_set_mask = dma_set_mask_pSeriesLP; - ppc_md.dma_get_required_mask = dma_get_required_mask_pSeriesLP; + if (!disable_ddw) + pseries_pci_controller_ops.iommu_bypass_supported = + iommu_bypass_supported_pSeriesLP; } else { pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries; pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries; diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c @@ -475,6 +475,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) splpar_dispatch_data(m); seq_printf(m, "purr=%ld\n", get_purr()); + seq_printf(m, "tbr=%ld\n", mftb()); } else { /* non SPLPAR case */ seq_printf(m, "system_active_processors=%d\n", diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c @@ -492,7 +492,9 @@ static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size, return NULL; } - ret = dma_iommu_ops.alloc(dev, size, dma_handle, flag, attrs); + ret = iommu_alloc_coherent(dev, get_iommu_table_base(dev), size, + dma_handle, dev->coherent_dma_mask, flag, + dev_to_node(dev)); if (unlikely(ret == NULL)) { vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE)); atomic_inc(&viodev->cmo.allocs_failed); @@ -507,8 +509,7 @@ static void vio_dma_iommu_free_coherent(struct device *dev, size_t size, { struct vio_dev *viodev = to_vio_dev(dev); - dma_iommu_ops.free(dev, size, vaddr, dma_handle, attrs); - + iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, dma_handle); vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE)); } @@ -518,22 +519,22 @@ static dma_addr_t vio_dma_iommu_map_page(struct device *dev, struct page *page, unsigned long attrs) { struct vio_dev *viodev = to_vio_dev(dev); - struct iommu_table *tbl; + struct iommu_table *tbl = get_iommu_table_base(dev); dma_addr_t ret = DMA_MAPPING_ERROR; - tbl = get_iommu_table_base(dev); - if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)))) { - atomic_inc(&viodev->cmo.allocs_failed); - return ret; - } - - ret = dma_iommu_ops.map_page(dev, page, offset, size, direction, attrs); - if (unlikely(dma_mapping_error(dev, ret))) { - vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))); - atomic_inc(&viodev->cmo.allocs_failed); - } - + if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)))) + goto out_fail; + ret = iommu_map_page(dev, tbl, page, offset, size, device_to_mask(dev), + direction, attrs); + if (unlikely(ret == DMA_MAPPING_ERROR)) + goto out_deallocate; return ret; + +out_deallocate: + vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))); +out_fail: + atomic_inc(&viodev->cmo.allocs_failed); + return DMA_MAPPING_ERROR; } static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle, @@ -542,11 +543,9 @@ static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle, unsigned long attrs) { struct vio_dev *viodev = to_vio_dev(dev); - struct iommu_table *tbl; - - tbl = get_iommu_table_base(dev); - dma_iommu_ops.unmap_page(dev, dma_handle, size, direction, attrs); + struct iommu_table *tbl = get_iommu_table_base(dev); + iommu_unmap_page(tbl, dma_handle, size, direction, attrs); vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))); } @@ -555,34 +554,32 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, unsigned long attrs) { struct vio_dev *viodev = to_vio_dev(dev); - struct iommu_table *tbl; + struct iommu_table *tbl = get_iommu_table_base(dev); struct scatterlist *sgl; int ret, count; size_t alloc_size = 0; - tbl = get_iommu_table_base(dev); for_each_sg(sglist, sgl, nelems, count) alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl)); - if (vio_cmo_alloc(viodev, alloc_size)) { - atomic_inc(&viodev->cmo.allocs_failed); - return 0; - } - - ret = dma_iommu_ops.map_sg(dev, sglist, nelems, direction, attrs); - - if (unlikely(!ret)) { - vio_cmo_dealloc(viodev, alloc_size); - atomic_inc(&viodev->cmo.allocs_failed); - return ret; - } + if (vio_cmo_alloc(viodev, alloc_size)) + goto out_fail; + ret = ppc_iommu_map_sg(dev, tbl, sglist, nelems, device_to_mask(dev), + direction, attrs); + if (unlikely(!ret)) + goto out_deallocate; for_each_sg(sglist, sgl, ret, count) alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); if (alloc_size) vio_cmo_dealloc(viodev, alloc_size); - return ret; + +out_deallocate: + vio_cmo_dealloc(viodev, alloc_size); +out_fail: + atomic_inc(&viodev->cmo.allocs_failed); + return 0; } static void vio_dma_iommu_unmap_sg(struct device *dev, @@ -591,40 +588,27 @@ static void vio_dma_iommu_unmap_sg(struct device *dev, unsigned long attrs) { struct vio_dev *viodev = to_vio_dev(dev); - struct iommu_table *tbl; + struct iommu_table *tbl = get_iommu_table_base(dev); struct scatterlist *sgl; size_t alloc_size = 0; int count; - tbl = get_iommu_table_base(dev); for_each_sg(sglist, sgl, nelems, count) alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); - dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs); - + ppc_iommu_unmap_sg(tbl, sglist, nelems, direction, attrs); vio_cmo_dealloc(viodev, alloc_size); } -static int vio_dma_iommu_dma_supported(struct device *dev, u64 mask) -{ - return dma_iommu_ops.dma_supported(dev, mask); -} - -static u64 vio_dma_get_required_mask(struct device *dev) -{ - return dma_iommu_ops.get_required_mask(dev); -} - static const struct dma_map_ops vio_dma_mapping_ops = { .alloc = vio_dma_iommu_alloc_coherent, .free = vio_dma_iommu_free_coherent, - .mmap = dma_nommu_mmap_coherent, .map_sg = vio_dma_iommu_map_sg, .unmap_sg = vio_dma_iommu_unmap_sg, .map_page = vio_dma_iommu_map_page, .unmap_page = vio_dma_iommu_unmap_page, - .dma_supported = vio_dma_iommu_dma_supported, - .get_required_mask = vio_dma_get_required_mask, + .dma_supported = dma_iommu_dma_supported, + .get_required_mask = dma_iommu_get_required_mask, }; /** @@ -1715,3 +1699,10 @@ int vio_disable_interrupts(struct vio_dev *dev) } EXPORT_SYMBOL(vio_disable_interrupts); #endif /* CONFIG_PPC_PSERIES */ + +static int __init vio_init(void) +{ + dma_debug_add_bus(&vio_bus_type); + return 0; +} +fs_initcall(vio_init); diff --git a/arch/powerpc/sysdev/6xx-suspend.S b/arch/powerpc/sysdev/6xx-suspend.S @@ -29,10 +29,9 @@ _GLOBAL(mpc6xx_enter_standby) ori r5, r5, ret_from_standby@l mtlr r5 - CURRENT_THREAD_INFO(r5, r1) - lwz r6, TI_LOCAL_FLAGS(r5) + lwz r6, TI_LOCAL_FLAGS(r2) ori r6, r6, _TLF_SLEEPING - stw r6, TI_LOCAL_FLAGS(r5) + stw r6, TI_LOCAL_FLAGS(r2) mfmsr r5 ori r5, r5, MSR_EE diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c @@ -360,13 +360,6 @@ static void iommu_table_dart_setup(void) set_bit(iommu_table_dart.it_size - 1, iommu_table_dart.it_map); } -static void pci_dma_dev_setup_dart(struct pci_dev *dev) -{ - if (dart_is_u4) - set_dma_offset(&dev->dev, DART_U4_BYPASS_BASE); - set_iommu_table_base(&dev->dev, &iommu_table_dart); -} - static void pci_dma_bus_setup_dart(struct pci_bus *bus) { if (!iommu_table_dart_inited) { @@ -390,27 +383,18 @@ static bool dart_device_on_pcie(struct device *dev) return false; } -static int dart_dma_set_mask(struct device *dev, u64 dma_mask) +static void pci_dma_dev_setup_dart(struct pci_dev *dev) { - if (!dev->dma_mask || !dma_supported(dev, dma_mask)) - return -EIO; - - /* U4 supports a DART bypass, we use it for 64-bit capable - * devices to improve performances. However, that only works - * for devices connected to U4 own PCIe interface, not bridged - * through hypertransport. We need the device to support at - * least 40 bits of addresses. - */ - if (dart_device_on_pcie(dev) && dma_mask >= DMA_BIT_MASK(40)) { - dev_info(dev, "Using 64-bit DMA iommu bypass\n"); - set_dma_ops(dev, &dma_nommu_ops); - } else { - dev_info(dev, "Using 32-bit DMA via iommu\n"); - set_dma_ops(dev, &dma_iommu_ops); - } + if (dart_is_u4 && dart_device_on_pcie(&dev->dev)) + dev->dev.archdata.dma_offset = DART_U4_BYPASS_BASE; + set_iommu_table_base(&dev->dev, &iommu_table_dart); +} - *dev->dma_mask = dma_mask; - return 0; +static bool iommu_bypass_supported_dart(struct pci_dev *dev, u64 mask) +{ + return dart_is_u4 && + dart_device_on_pcie(&dev->dev) && + mask >= DMA_BIT_MASK(40); } void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops) @@ -428,26 +412,20 @@ void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops) /* Initialize the DART HW */ if (dart_init(dn) != 0) - goto bail; - - /* Setup bypass if supported */ - if (dart_is_u4) - ppc_md.dma_set_mask = dart_dma_set_mask; + return; + /* + * U4 supports a DART bypass, we use it for 64-bit capable devices to + * improve performance. However, that only works for devices connected + * to the U4 own PCIe interface, not bridged through hypertransport. + * We need the device to support at least 40 bits of addresses. + */ controller_ops->dma_dev_setup = pci_dma_dev_setup_dart; controller_ops->dma_bus_setup = pci_dma_bus_setup_dart; + controller_ops->iommu_bypass_supported = iommu_bypass_supported_dart; /* Setup pci_dma ops */ set_pci_dma_ops(&dma_iommu_ops); - return; - - bail: - /* If init failed, use direct iommu and null setup functions */ - controller_ops->dma_dev_setup = NULL; - controller_ops->dma_bus_setup = NULL; - - /* Setup pci_dma ops */ - set_pci_dma_ops(&dma_nommu_ops); } #ifdef CONFIG_PM diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c @@ -40,6 +40,7 @@ #include <asm/mpc85xx.h> #include <asm/disassemble.h> #include <asm/ppc-opcode.h> +#include <asm/swiotlb.h> #include <sysdev/fsl_soc.h> #include <sysdev/fsl_pci.h> @@ -114,33 +115,33 @@ static struct pci_ops fsl_indirect_pcie_ops = static u64 pci64_dma_offset; #ifdef CONFIG_SWIOTLB +static void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + + pdev->dev.bus_dma_mask = + hose->dma_window_base_cur + hose->dma_window_size; +} + static void setup_swiotlb_ops(struct pci_controller *hose) { - if (ppc_swiotlb_enable) { + if (ppc_swiotlb_enable) hose->controller_ops.dma_dev_setup = pci_dma_dev_setup_swiotlb; - set_pci_dma_ops(&powerpc_swiotlb_dma_ops); - } } #else static inline void setup_swiotlb_ops(struct pci_controller *hose) {} #endif -static int fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask) +static void fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask) { - if (!dev->dma_mask || !dma_supported(dev, dma_mask)) - return -EIO; - /* * Fix up PCI devices that are able to DMA to the large inbound * mapping that allows addressing any RAM address from across PCI. */ if (dev_is_pci(dev) && dma_mask >= pci64_dma_offset * 2 - 1) { - set_dma_ops(dev, &dma_nommu_ops); - set_dma_offset(dev, pci64_dma_offset); + dev->bus_dma_mask = 0; + dev->archdata.dma_offset = pci64_dma_offset; } - - *dev->dma_mask = dma_mask; - return 0; } static int setup_one_atmu(struct ccsr_pci __iomem *pci, diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c @@ -771,21 +771,6 @@ struct ipic * __init ipic_init(struct device_node *node, unsigned int flags) return ipic; } -void ipic_set_highest_priority(unsigned int virq) -{ - struct ipic *ipic = ipic_from_irq(virq); - unsigned int src = virq_to_hw(virq); - u32 temp; - - temp = ipic_read(ipic->regs, IPIC_SICFR); - - /* clear and set HPI */ - temp &= 0x7f000000; - temp |= (src & 0x7f) << 24; - - ipic_write(ipic->regs, IPIC_SICFR, temp); -} - void ipic_set_default_priority(void) { ipic_write(primary_ipic->regs, IPIC_SIPRR_A, IPIC_PRIORITY_DEFAULT); @@ -796,26 +781,6 @@ void ipic_set_default_priority(void) ipic_write(primary_ipic->regs, IPIC_SMPRR_B, IPIC_PRIORITY_DEFAULT); } -void ipic_enable_mcp(enum ipic_mcp_irq mcp_irq) -{ - struct ipic *ipic = primary_ipic; - u32 temp; - - temp = ipic_read(ipic->regs, IPIC_SERMR); - temp |= (1 << (31 - mcp_irq)); - ipic_write(ipic->regs, IPIC_SERMR, temp); -} - -void ipic_disable_mcp(enum ipic_mcp_irq mcp_irq) -{ - struct ipic *ipic = primary_ipic; - u32 temp; - - temp = ipic_read(ipic->regs, IPIC_SERMR); - temp &= (1 << (31 - mcp_irq)); - ipic_write(ipic->regs, IPIC_SERMR, temp); -} - u32 ipic_get_mcp_status(void) { return primary_ipic ? ipic_read(primary_ipic->regs, IPIC_SERSR) : 0; diff --git a/arch/powerpc/sysdev/tsi108_dev.c b/arch/powerpc/sysdev/tsi108_dev.c @@ -51,7 +51,7 @@ phys_addr_t get_csrbase(void) const void *prop = of_get_property(tsi, "reg", &size); tsi108_csr_base = of_translate_address(tsi, prop); of_node_put(tsi); - }; + } return tsi108_csr_base; } diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c @@ -442,7 +442,7 @@ static void xive_dec_target_count(int cpu) struct xive_cpu *xc = per_cpu(xive_cpu, cpu); struct xive_q *q = &xc->queue[xive_irq_priority]; - if (unlikely(WARN_ON(cpu < 0 || !xc))) { + if (WARN_ON(cpu < 0 || !xc)) { pr_err("%s: cpu=%d xc=%p\n", __func__, cpu, xc); return; } diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile @@ -5,6 +5,7 @@ subdir-ccflags-y := $(call cc-disable-warning, builtin-requires-header) GCOV_PROFILE := n +KCOV_INSTRUMENT := n UBSAN_SANITIZE := n # Disable ftrace for the entire directory diff --git a/arch/powerpc/xmon/ppc-dis.c b/arch/powerpc/xmon/ppc-dis.c @@ -158,7 +158,7 @@ int print_insn_powerpc (unsigned long insn, unsigned long memaddr) dialect |= (PPC_OPCODE_POWER5 | PPC_OPCODE_POWER6 | PPC_OPCODE_POWER7 | PPC_OPCODE_POWER8 | PPC_OPCODE_POWER9 | PPC_OPCODE_HTM | PPC_OPCODE_ALTIVEC | PPC_OPCODE_ALTIVEC2 - | PPC_OPCODE_VSX | PPC_OPCODE_VSX3), + | PPC_OPCODE_VSX | PPC_OPCODE_VSX3); /* Get the major opcode of the insn. */ opcode = NULL; diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c @@ -2997,7 +2997,7 @@ static void show_task(struct task_struct *tsk) printf("%px %016lx %6d %6d %c %2d %s\n", tsk, tsk->thread.ksp, tsk->pid, rcu_dereference(tsk->parent)->pid, - state, task_thread_info(tsk)->cpu, + state, task_cpu(tsk), tsk->comm); } diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c @@ -267,6 +267,7 @@ static int guest_reset(struct cxl *adapter) int i, rc; pr_devel("Adapter reset request\n"); + spin_lock(&adapter->afu_list_lock); for (i = 0; i < adapter->slices; i++) { if ((afu = adapter->afu[i])) { pci_error_handlers(afu, CXL_ERROR_DETECTED_EVENT, @@ -283,6 +284,7 @@ static int guest_reset(struct cxl *adapter) pci_error_handlers(afu, CXL_RESUME_EVENT, 0); } } + spin_unlock(&adapter->afu_list_lock); return rc; } diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c @@ -1805,7 +1805,7 @@ static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu, /* There should only be one entry, but go through the list * anyway */ - if (afu->phb == NULL) + if (afu == NULL || afu->phb == NULL) return result; list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) { @@ -1832,7 +1832,8 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev, { struct cxl *adapter = pci_get_drvdata(pdev); struct cxl_afu *afu; - pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET, afu_result; + pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET; + pci_ers_result_t afu_result = PCI_ERS_RESULT_NEED_RESET; int i; /* At this point, we could still have an interrupt pending. @@ -1843,6 +1844,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev, /* If we're permanently dead, give up. */ if (state == pci_channel_io_perm_failure) { + spin_lock(&adapter->afu_list_lock); for (i = 0; i < adapter->slices; i++) { afu = adapter->afu[i]; /* @@ -1851,6 +1853,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev, */ cxl_vphb_error_detected(afu, state); } + spin_unlock(&adapter->afu_list_lock); return PCI_ERS_RESULT_DISCONNECT; } @@ -1932,11 +1935,17 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev, * * In slot_reset, free the old resources and allocate new ones. * * In resume, clear the flag to allow things to start. */ + + /* Make sure no one else changes the afu list */ + spin_lock(&adapter->afu_list_lock); + for (i = 0; i < adapter->slices; i++) { afu = adapter->afu[i]; - afu_result = cxl_vphb_error_detected(afu, state); + if (afu == NULL) + continue; + afu_result = cxl_vphb_error_detected(afu, state); cxl_context_detach_all(afu); cxl_ops->afu_deactivate_mode(afu, afu->current_mode); pci_deconfigure_afu(afu); @@ -1948,6 +1957,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev, (result == PCI_ERS_RESULT_NEED_RESET)) result = PCI_ERS_RESULT_NONE; } + spin_unlock(&adapter->afu_list_lock); /* should take the context lock here */ if (cxl_adapter_context_lock(adapter) != 0) @@ -1980,14 +1990,18 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev) */ cxl_adapter_context_unlock(adapter); + spin_lock(&adapter->afu_list_lock); for (i = 0; i < adapter->slices; i++) { afu = adapter->afu[i]; + if (afu == NULL) + continue; + if (pci_configure_afu(afu, adapter, pdev)) - goto err; + goto err_unlock; if (cxl_afu_select_best_mode(afu)) - goto err; + goto err_unlock; if (afu->phb == NULL) continue; @@ -1999,16 +2013,16 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev) ctx = cxl_get_context(afu_dev); if (ctx && cxl_release_context(ctx)) - goto err; + goto err_unlock; ctx = cxl_dev_context_init(afu_dev); if (IS_ERR(ctx)) - goto err; + goto err_unlock; afu_dev->dev.archdata.cxl_ctx = ctx; if (cxl_ops->afu_check_and_enable(afu)) - goto err; + goto err_unlock; afu_dev->error_state = pci_channel_io_normal; @@ -2029,8 +2043,13 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev) result = PCI_ERS_RESULT_DISCONNECT; } } + + spin_unlock(&adapter->afu_list_lock); return result; +err_unlock: + spin_unlock(&adapter->afu_list_lock); + err: /* All the bits that happen in both error_detected and cxl_remove * should be idempotent, so we don't need to worry about leaving a mix @@ -2051,10 +2070,11 @@ static void cxl_pci_resume(struct pci_dev *pdev) * This is not the place to be checking if everything came back up * properly, because there's no return value: do that in slot_reset. */ + spin_lock(&adapter->afu_list_lock); for (i = 0; i < adapter->slices; i++) { afu = adapter->afu[i]; - if (afu->phb == NULL) + if (afu == NULL || afu->phb == NULL) continue; list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) { @@ -2063,6 +2083,7 @@ static void cxl_pci_resume(struct pci_dev *pdev) afu_dev->driver->err_handler->resume(afu_dev); } } + spin_unlock(&adapter->afu_list_lock); } static const struct pci_error_handlers cxl_err_handler = { diff --git a/drivers/misc/cxl/vphb.c b/drivers/misc/cxl/vphb.c @@ -43,8 +43,7 @@ static bool cxl_pci_enable_device_hook(struct pci_dev *dev) return false; } - set_dma_ops(&dev->dev, &dma_nommu_ops); - set_dma_offset(&dev->dev, PAGE_OFFSET); + dev->dev.archdata.dma_offset = PAGE_OFFSET; /* * Allocate a context to do cxl things too. If we eventually do real diff --git a/drivers/net/ethernet/pasemi/pasemi_mac.c b/drivers/net/ethernet/pasemi/pasemi_mac.c @@ -1716,6 +1716,7 @@ pasemi_mac_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err = -ENODEV; goto out; } + dma_set_mask(&mac->dma_pdev->dev, DMA_BIT_MASK(64)); mac->iob_pdev = pci_get_device(PCI_VENDOR_ID_PASEMI, 0xa001, NULL); if (!mac->iob_pdev) { diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c @@ -74,13 +74,13 @@ long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, ret = eeh_pe_get_state(pe); break; case VFIO_EEH_PE_RESET_DEACTIVATE: - ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE); + ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true); break; case VFIO_EEH_PE_RESET_HOT: - ret = eeh_pe_reset(pe, EEH_RESET_HOT); + ret = eeh_pe_reset(pe, EEH_RESET_HOT, true); break; case VFIO_EEH_PE_RESET_FUNDAMENTAL: - ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL); + ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true); break; case VFIO_EEH_PE_CONFIGURE: ret = eeh_pe_configure(pe); diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h @@ -60,9 +60,6 @@ extern void swiotlb_tbl_sync_single(struct device *hwdev, size_t size, enum dma_data_direction dir, enum dma_sync_target target); -extern int -swiotlb_dma_supported(struct device *hwdev, u64 mask); - #ifdef CONFIG_SWIOTLB extern enum swiotlb_force swiotlb_force; extern phys_addr_t io_tlb_start, io_tlb_end; diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig @@ -16,6 +16,9 @@ config ARCH_DMA_ADDR_T_64BIT config ARCH_HAS_DMA_COHERENCE_H bool +config ARCH_HAS_DMA_SET_MASK + bool + config HAVE_GENERIC_DMA_COHERENT bool diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c @@ -132,8 +132,7 @@ again: goto again; } - if (IS_ENABLED(CONFIG_ZONE_DMA) && - phys_mask < DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) { + if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) { gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c @@ -207,7 +207,6 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, } EXPORT_SYMBOL(dma_mmap_attrs); -#ifndef ARCH_HAS_DMA_GET_REQUIRED_MASK static u64 dma_default_get_required_mask(struct device *dev) { u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT); @@ -238,7 +237,6 @@ u64 dma_get_required_mask(struct device *dev) return dma_default_get_required_mask(dev); } EXPORT_SYMBOL_GPL(dma_get_required_mask); -#endif #ifndef arch_dma_alloc_attrs #define arch_dma_alloc_attrs(dev) (true) @@ -318,18 +316,23 @@ int dma_supported(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_supported); -#ifndef HAVE_ARCH_DMA_SET_MASK +#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK +void arch_dma_set_mask(struct device *dev, u64 mask); +#else +#define arch_dma_set_mask(dev, mask) do { } while (0) +#endif + int dma_set_mask(struct device *dev, u64 mask) { if (!dev->dma_mask || !dma_supported(dev, mask)) return -EIO; + arch_dma_set_mask(dev, mask); dma_check_mask(dev, mask); *dev->dma_mask = mask; return 0; } EXPORT_SYMBOL(dma_set_mask); -#endif #ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK int dma_set_coherent_mask(struct device *dev, u64 mask) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c @@ -650,15 +650,3 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, return true; } - -/* - * Return whether the given device DMA address mask can be supported - * properly. For example, if your device can only drive the low 24-bits - * during bus mastering, then you would pass 0x00ffffff as the mask to - * this function. - */ -int -swiotlb_dma_supported(struct device *hwdev, u64 mask) -{ - return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask; -} diff --git a/kernel/resource.c b/kernel/resource.c @@ -448,8 +448,6 @@ int walk_mem_res(u64 start, u64 end, void *arg, arg, func); } -#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) - /* * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. @@ -481,8 +479,6 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, return ret; } -#endif - static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) { return 1; diff --git a/tools/testing/selftests/powerpc/benchmarks/null_syscall.c b/tools/testing/selftests/powerpc/benchmarks/null_syscall.c @@ -25,7 +25,7 @@ unsigned long long clock_frequency; unsigned long long timebase_frequency; double timebase_multiplier; -static inline unsigned long long mftb(void) +static inline unsigned long mftb(void) { unsigned long low; diff --git a/tools/testing/selftests/powerpc/include/reg.h b/tools/testing/selftests/powerpc/include/reg.h @@ -77,6 +77,14 @@ #define TEXASR_TE 0x0000000004000000 #define TEXASR_ROT 0x0000000002000000 +/* MSR register bits */ +#define MSR_TS_S_LG 33 /* Trans Mem state: Suspended */ + +#define __MASK(X) (1UL<<(X)) + +/* macro to check TM MSR bits */ +#define MSR_TS_S __MASK(MSR_TS_S_LG) /* Transaction Suspended */ + /* Vector Instructions */ #define VSX_XX1(xs, ra, rb) (((xs) & 0x1f) << 21 | ((ra) << 16) | \ ((rb) << 11) | (((xs) >> 5))) diff --git a/tools/testing/selftests/powerpc/include/utils.h b/tools/testing/selftests/powerpc/include/utils.h @@ -102,8 +102,10 @@ do { \ #if defined(__powerpc64__) #define UCONTEXT_NIA(UC) (UC)->uc_mcontext.gp_regs[PT_NIP] +#define UCONTEXT_MSR(UC) (UC)->uc_mcontext.gp_regs[PT_MSR] #elif defined(__powerpc__) #define UCONTEXT_NIA(UC) (UC)->uc_mcontext.uc_regs->gregs[PT_NIP] +#define UCONTEXT_MSR(UC) (UC)->uc_mcontext.uc_regs->gregs[PT_MSR] #else #error implement UCONTEXT_NIA #endif diff --git a/tools/testing/selftests/powerpc/pmu/ebb/fork_cleanup_test.c b/tools/testing/selftests/powerpc/pmu/ebb/fork_cleanup_test.c @@ -11,7 +11,6 @@ #include <sys/wait.h> #include <unistd.h> #include <setjmp.h> -#include <signal.h> #include "ebb.h" diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore @@ -11,6 +11,7 @@ tm-signal-context-chk-fpu tm-signal-context-chk-gpr tm-signal-context-chk-vmx tm-signal-context-chk-vsx +tm-signal-context-force-tm tm-signal-sigreturn-nt tm-vmx-unavail tm-unavailable diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile @@ -4,7 +4,8 @@ SIGNAL_CONTEXT_CHK_TESTS := tm-signal-context-chk-gpr tm-signal-context-chk-fpu TEST_GEN_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack \ tm-vmxcopy tm-fork tm-tar tm-tmspr tm-vmx-unavail tm-unavailable tm-trap \ - $(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn tm-signal-sigreturn-nt + $(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn tm-signal-sigreturn-nt \ + tm-signal-context-force-tm top_srcdir = ../../../../.. include ../../lib.mk @@ -20,6 +21,7 @@ $(OUTPUT)/tm-vmx-unavail: CFLAGS += -pthread -m64 $(OUTPUT)/tm-resched-dscr: ../pmu/lib.c $(OUTPUT)/tm-unavailable: CFLAGS += -O0 -pthread -m64 -Wno-error=uninitialized -mvsx $(OUTPUT)/tm-trap: CFLAGS += -O0 -pthread -m64 +$(OUTPUT)/tm-signal-context-force-tm: CFLAGS += -pthread -m64 SIGNAL_CONTEXT_CHK_TESTS := $(patsubst %,$(OUTPUT)/%,$(SIGNAL_CONTEXT_CHK_TESTS)) $(SIGNAL_CONTEXT_CHK_TESTS): tm-signal.S diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c @@ -0,0 +1,184 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2018, Breno Leitao, Gustavo Romero, IBM Corp. + * + * This test raises a SIGUSR1 signal, and toggle the MSR[TS] + * fields at the signal handler. With MSR[TS] being set, the kernel will + * force a recheckpoint, which may cause a segfault when returning to + * user space. Since the test needs to re-run, the segfault needs to be + * caught and handled. + * + * In order to continue the test even after a segfault, the context is + * saved prior to the signal being raised, and it is restored when there is + * a segmentation fault. This happens for COUNT_MAX times. + * + * This test never fails (as returning EXIT_FAILURE). It either succeeds, + * or crash the kernel (on a buggy kernel). + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <string.h> +#include <ucontext.h> +#include <unistd.h> +#include <sys/mman.h> + +#include "tm.h" +#include "utils.h" +#include "reg.h" + +#define COUNT_MAX 5000 /* Number of interactions */ + +/* + * This test only runs on 64 bits system. Unsetting MSR_TS_S to avoid + * compilation issue on 32 bits system. There is no side effect, since the + * whole test will be skipped if it is not running on 64 bits system. + */ +#ifndef __powerpc64__ +#undef MSR_TS_S +#define MSR_TS_S 0 +#endif + +/* Setting contexts because the test will crash and we want to recover */ +ucontext_t init_context, main_context; + +static int count, first_time; + +void usr_signal_handler(int signo, siginfo_t *si, void *uc) +{ + ucontext_t *ucp = uc; + int ret; + + /* + * Allocating memory in a signal handler, and never freeing it on + * purpose, forcing the heap increase, so, the memory leak is what + * we want here. + */ + ucp->uc_link = mmap(NULL, sizeof(ucontext_t), + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (ucp->uc_link == (void *)-1) { + perror("Mmap failed"); + exit(-1); + } + + /* Forcing the page to be allocated in a page fault */ + ret = madvise(ucp->uc_link, sizeof(ucontext_t), MADV_DONTNEED); + if (ret) { + perror("madvise failed"); + exit(-1); + } + + memcpy(&ucp->uc_link->uc_mcontext, &ucp->uc_mcontext, + sizeof(ucp->uc_mcontext)); + + /* Forcing to enable MSR[TM] */ + UCONTEXT_MSR(ucp) |= MSR_TS_S; + + /* + * A fork inside a signal handler seems to be more efficient than a + * fork() prior to the signal being raised. + */ + if (fork() == 0) { + /* + * Both child and parent will return, but, child returns + * with count set so it will exit in the next segfault. + * Parent will continue to loop. + */ + count = COUNT_MAX; + } + + /* + * If the change above does not hit the bug, it will cause a + * segmentation fault, since the ck structures are NULL. + */ +} + +void seg_signal_handler(int signo, siginfo_t *si, void *uc) +{ + if (count == COUNT_MAX) { + /* Return to tm_signal_force_msr() and exit */ + setcontext(&main_context); + } + + count++; + + /* Reexecute the test */ + setcontext(&init_context); +} + +void tm_trap_test(void) +{ + struct sigaction usr_sa, seg_sa; + stack_t ss; + + usr_sa.sa_flags = SA_SIGINFO | SA_ONSTACK; + usr_sa.sa_sigaction = usr_signal_handler; + + seg_sa.sa_flags = SA_SIGINFO; + seg_sa.sa_sigaction = seg_signal_handler; + + /* + * Set initial context. Will get back here from + * seg_signal_handler() + */ + getcontext(&init_context); + + /* Allocated an alternative signal stack area */ + ss.ss_sp = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + ss.ss_size = SIGSTKSZ; + ss.ss_flags = 0; + + if (ss.ss_sp == (void *)-1) { + perror("mmap error\n"); + exit(-1); + } + + /* Force the allocation through a page fault */ + if (madvise(ss.ss_sp, SIGSTKSZ, MADV_DONTNEED)) { + perror("madvise\n"); + exit(-1); + } + + /* Setting an alternative stack to generate a page fault when + * the signal is raised. + */ + if (sigaltstack(&ss, NULL)) { + perror("sigaltstack\n"); + exit(-1); + } + + /* The signal handler will enable MSR_TS */ + sigaction(SIGUSR1, &usr_sa, NULL); + /* If it does not crash, it will segfault, avoid it to retest */ + sigaction(SIGSEGV, &seg_sa, NULL); + + raise(SIGUSR1); +} + +int tm_signal_context_force_tm(void) +{ + SKIP_IF(!have_htm()); + /* + * Skipping if not running on 64 bits system, since I think it is + * not possible to set mcontext's [MSR] with TS, due to it being 32 + * bits. + */ + SKIP_IF(!is_ppc64le()); + + /* Will get back here after COUNT_MAX interactions */ + getcontext(&main_context); + + if (!first_time++) + tm_trap_test(); + + return EXIT_SUCCESS; +} + +int main(int argc, char **argv) +{ + test_harness(tm_signal_context_force_tm, "tm_signal_context_force_tm"); +} diff --git a/tools/testing/selftests/vm/map_hugetlb.c b/tools/testing/selftests/vm/map_hugetlb.c @@ -23,6 +23,14 @@ #define MAP_HUGETLB 0x40000 /* arch specific */ #endif +#ifndef MAP_HUGE_SHIFT +#define MAP_HUGE_SHIFT 26 +#endif + +#ifndef MAP_HUGE_MASK +#define MAP_HUGE_MASK 0x3f +#endif + /* Only ia64 requires this */ #ifdef __ia64__ #define ADDR (void *)(0x8000000000000000UL) @@ -58,12 +66,29 @@ static int read_bytes(char *addr) return 0; } -int main(void) +int main(int argc, char **argv) { void *addr; int ret; + size_t length = LENGTH; + int flags = FLAGS; + int shift = 0; + + if (argc > 1) + length = atol(argv[1]) << 20; + if (argc > 2) { + shift = atoi(argv[2]); + if (shift) + flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; + } + + if (shift) + printf("%u kB hugepages\n", 1 << shift); + else + printf("Default size hugepages\n"); + printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20); - addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, -1, 0); + addr = mmap(ADDR, length, PROTECTION, flags, -1, 0); if (addr == MAP_FAILED) { perror("mmap"); exit(1);