whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 889bb74302e5aba85d987b4093344150984d7cda
parent 903b77c631673eeec9e9114e9524171cdf9a2646
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Sat, 29 Dec 2018 09:37:03 -0800

Merge tag 'nds32-for-linus-4.21' of git://git.kernel.org/pub/scm/linux/kernel/git/greentime/linux

Pull nds32 updates from Greentime Hu:

 - Perf support

 - Power management support

 - FPU support

 - Hardware prefetcher support

 - Build error fixed

 - Performance enhancement

* tag 'nds32-for-linus-4.21' of git://git.kernel.org/pub/scm/linux/kernel/git/greentime/linux:
  nds32: support hardware prefetcher
  nds32: Fix the items of hwcap_str ordering issue.
  math-emu/soft-fp.h: (_FP_ROUND_ZERO) cast 0 to void to fix warning
  math-emu/op-2.h: Use statement expressions to prevent negative constant shift
  nds32: support denormalized result through FP emulator
  nds32: Support FP emulation
  nds32: nds32 FPU port
  nds32: Remove duplicated include from pm.c
  nds32: Power management for nds32
  nds32: Add document for NDS32 PMU.
  nds32: Add perf call-graph support.
  nds32: Perf porting
  nds32: Fix bug in bitfield.h
  nds32: Fix gcc 8.0 compiler option incompatible.
  nds32: Fill all TLB entries with kernel image mapping
  nds32: Remove the redundant assignment

Diffstat:
ADocumentation/devicetree/bindings/perf/nds32v3-pmu.txt | 17+++++++++++++++++
March/nds32/Kconfig | 12++++++++++++
March/nds32/Kconfig.cpu | 41+++++++++++++++++++++++++++++++++++++++++
March/nds32/Makefile | 5+++++
March/nds32/boot/dts/ae3xx.dts | 5+++++
March/nds32/include/asm/Kbuild | 1+
March/nds32/include/asm/bitfield.h | 25+++++++++++++++++++++++--
March/nds32/include/asm/elf.h | 11+++++++++++
Aarch/nds32/include/asm/fpu.h | 126+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/nds32/include/asm/fpuemu.h | 32++++++++++++++++++++++++++++++++
Aarch/nds32/include/asm/nds32_fpu_inst.h | 109+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/nds32/include/asm/perf_event.h | 16++++++++++++++++
Aarch/nds32/include/asm/pmu.h | 386+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
March/nds32/include/asm/processor.h | 7+++++++
Aarch/nds32/include/asm/sfp-machine.h | 158+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/nds32/include/asm/stacktrace.h | 39+++++++++++++++++++++++++++++++++++++++
Aarch/nds32/include/asm/suspend.h | 11+++++++++++
March/nds32/include/asm/syscalls.h | 1+
March/nds32/include/uapi/asm/auxvec.h | 7+++++++
March/nds32/include/uapi/asm/sigcontext.h | 14++++++++++++++
Aarch/nds32/include/uapi/asm/udftrap.h | 13+++++++++++++
March/nds32/include/uapi/asm/unistd.h | 2++
March/nds32/kernel/Makefile | 7+++++--
March/nds32/kernel/ex-entry.S | 24++++++++++++++++++++++--
March/nds32/kernel/ex-exit.S | 13+++++++++++--
March/nds32/kernel/ex-scall.S | 8+++++---
Aarch/nds32/kernel/fpu.c | 269+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
March/nds32/kernel/head.S | 13++-----------
Aarch/nds32/kernel/perf_event_cpu.c | 1522+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/nds32/kernel/pm.c | 78++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
March/nds32/kernel/process.c | 64+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
March/nds32/kernel/setup.c | 22+++++++++++++++++++---
March/nds32/kernel/signal.c | 62+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Aarch/nds32/kernel/sleep.S | 131+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
March/nds32/kernel/sys_nds32.c | 32++++++++++++++++++++++++++++++++
March/nds32/kernel/traps.c | 16++++++++++++++++
Aarch/nds32/math-emu/Makefile | 7+++++++
Aarch/nds32/math-emu/faddd.c | 24++++++++++++++++++++++++
Aarch/nds32/math-emu/fadds.c | 24++++++++++++++++++++++++
Aarch/nds32/math-emu/fcmpd.c | 24++++++++++++++++++++++++
Aarch/nds32/math-emu/fcmps.c | 24++++++++++++++++++++++++
Aarch/nds32/math-emu/fd2s.c | 22++++++++++++++++++++++
Aarch/nds32/math-emu/fdivd.c | 27+++++++++++++++++++++++++++
Aarch/nds32/math-emu/fdivs.c | 26++++++++++++++++++++++++++
Aarch/nds32/math-emu/fmuld.c | 23+++++++++++++++++++++++
Aarch/nds32/math-emu/fmuls.c | 23+++++++++++++++++++++++
Aarch/nds32/math-emu/fnegd.c | 21+++++++++++++++++++++
Aarch/nds32/math-emu/fnegs.c | 21+++++++++++++++++++++
Aarch/nds32/math-emu/fpuemu.c | 357+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/nds32/math-emu/fs2d.c | 23+++++++++++++++++++++++
Aarch/nds32/math-emu/fsqrtd.c | 21+++++++++++++++++++++
Aarch/nds32/math-emu/fsqrts.c | 21+++++++++++++++++++++
Aarch/nds32/math-emu/fsubd.c | 27+++++++++++++++++++++++++++
Aarch/nds32/math-emu/fsubs.c | 27+++++++++++++++++++++++++++
March/nds32/mm/Makefile | 6+++++-
March/nds32/mm/fault.c | 13+++++++++----
Mdrivers/irqchip/irq-ativic32.c | 31+++++++++++++++++++++++++++++++
Minclude/math-emu/op-2.h | 99++++++++++++++++++++++++++++++++++++++-----------------------------------------
Minclude/math-emu/soft-fp.h | 2+-
Mtools/include/asm/barrier.h | 2++
Atools/perf/arch/nds32/Build | 1+
Atools/perf/arch/nds32/util/Build | 1+
Atools/perf/arch/nds32/util/header.c | 29+++++++++++++++++++++++++++++
Atools/perf/pmu-events/arch/nds32/mapfile.csv | 15+++++++++++++++
Atools/perf/pmu-events/arch/nds32/n13/atcpmu.json | 290+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
65 files changed, 4441 insertions(+), 89 deletions(-)

diff --git a/Documentation/devicetree/bindings/perf/nds32v3-pmu.txt b/Documentation/devicetree/bindings/perf/nds32v3-pmu.txt @@ -0,0 +1,17 @@ +* NDS32 Performance Monitor Units + +NDS32 core have a PMU for counting cpu and cache events like cache misses. +The NDS32 PMU representation in the device tree should be done as under: + +Required properties: + +- compatible : + "andestech,nds32v3-pmu" + +- interrupts : The interrupt number for NDS32 PMU is 13. + +Example: +pmu{ + compatible = "andestech,nds32v3-pmu"; + interrupts = <13>; +} diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig @@ -28,7 +28,9 @@ config NDS32 select HANDLE_DOMAIN_IRQ select HAVE_ARCH_TRACEHOOK select HAVE_DEBUG_KMEMLEAK + select HAVE_EXIT_THREAD select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_PERF_EVENTS select IRQ_DOMAIN select LOCKDEP_SUPPORT select MODULES_USE_ELF_RELA @@ -91,3 +93,13 @@ endmenu menu "Kernel Features" source "kernel/Kconfig.hz" endmenu + +menu "Power management options" +config SYS_SUPPORTS_APM_EMULATION + bool + +config ARCH_SUSPEND_POSSIBLE + def_bool y + +source "kernel/power/Kconfig" +endmenu diff --git a/arch/nds32/Kconfig.cpu b/arch/nds32/Kconfig.cpu @@ -7,6 +7,40 @@ config CPU_LITTLE_ENDIAN bool "Little endian" default y +config FPU + bool "FPU support" + default n + help + If FPU ISA is used in user space, this configuration shall be Y to + enable required support in kerenl such as fpu context switch and + fpu exception handler. + + If no FPU ISA is used in user space, say N. + +config LAZY_FPU + bool "lazy FPU support" + depends on FPU + default y + help + Say Y here to enable the lazy FPU scheme. The lazy FPU scheme can + enhance system performance by reducing the context switch + frequency of the FPU register. + + For nomal case, say Y. + +config SUPPORT_DENORMAL_ARITHMETIC + bool "Denormal arithmetic support" + depends on FPU + default n + help + Say Y here to enable arithmetic of denormalized number. Enabling + this feature can enhance the precision for tininess number. + However, performance loss in float pointe calculations is + possibly significant due to additional FPU exception. + + If the calculated tolerance for tininess number is not critical, + say N to prevent performance loss. + config HWZOL bool "hardware zero overhead loop support" depends on CPU_D10 || CPU_D15 @@ -143,6 +177,13 @@ config CACHE_L2 Say Y here to enable L2 cache if your SoC are integrated with L2CC. If unsure, say N. +config HW_PRE + bool "Enable hardware prefetcher" + default y + help + Say Y here to enable hardware prefetcher feature. + Only when CPU_VER.REV >= 0x09 can support. + menu "Memory configuration" choice diff --git a/arch/nds32/Makefile b/arch/nds32/Makefile @@ -5,10 +5,14 @@ KBUILD_DEFCONFIG := defconfig comma = , + ifdef CONFIG_FUNCTION_TRACER arch-y += -malways-save-lp -mno-relax endif +# Avoid generating FPU instructions +arch-y += -mno-ext-fpu-sp -mno-ext-fpu-dp -mfloat-abi=soft + KBUILD_CFLAGS += $(call cc-option, -mno-sched-prolog-epilog) KBUILD_CFLAGS += -mcmodel=large @@ -26,6 +30,7 @@ export TEXTADDR # If we have a machine-specific directory, then include it in the build. core-y += arch/nds32/kernel/ arch/nds32/mm/ +core-$(CONFIG_FPU) += arch/nds32/math-emu/ libs-y += arch/nds32/lib/ ifneq '$(CONFIG_NDS32_BUILTIN_DTB)' '""' diff --git a/arch/nds32/boot/dts/ae3xx.dts b/arch/nds32/boot/dts/ae3xx.dts @@ -82,4 +82,9 @@ interrupts = <18>; }; }; + + pmu { + compatible = "andestech,nds32v3-pmu"; + interrupts= <13>; + }; }; diff --git a/arch/nds32/include/asm/Kbuild b/arch/nds32/include/asm/Kbuild @@ -36,6 +36,7 @@ generic-y += kprobes.h generic-y += kvm_para.h generic-y += limits.h generic-y += local.h +generic-y += local64.h generic-y += mm-arch-hooks.h generic-y += mman.h generic-y += parport.h diff --git a/arch/nds32/include/asm/bitfield.h b/arch/nds32/include/asm/bitfield.h @@ -251,6 +251,11 @@ #define ITYPE_mskSTYPE ( 0xF << ITYPE_offSTYPE ) #define ITYPE_mskCPID ( 0x3 << ITYPE_offCPID ) +/* Additional definitions of ITYPE register for FPU */ +#define FPU_DISABLE_EXCEPTION (0x1 << ITYPE_offSTYPE) +#define FPU_EXCEPTION (0x2 << ITYPE_offSTYPE) +#define FPU_CPID 0 /* FPU Co-Processor ID is 0 */ + #define NDS32_VECTOR_mskNONEXCEPTION 0x78 #define NDS32_VECTOR_offEXCEPTION 8 #define NDS32_VECTOR_offINTERRUPT 9 @@ -692,8 +697,8 @@ #define PFM_CTL_offKU1 13 /* Enable user mode event counting for PFMC1 */ #define PFM_CTL_offKU2 14 /* Enable user mode event counting for PFMC2 */ #define PFM_CTL_offSEL0 15 /* The event selection for PFMC0 */ -#define PFM_CTL_offSEL1 21 /* The event selection for PFMC1 */ -#define PFM_CTL_offSEL2 27 /* The event selection for PFMC2 */ +#define PFM_CTL_offSEL1 16 /* The event selection for PFMC1 */ +#define PFM_CTL_offSEL2 22 /* The event selection for PFMC2 */ /* bit 28:31 reserved */ #define PFM_CTL_mskEN0 ( 0x01 << PFM_CTL_offEN0 ) @@ -735,14 +740,20 @@ #define N13MISC_CTL_offRTP 1 /* Disable Return Target Predictor */ #define N13MISC_CTL_offPTEPF 2 /* Disable HPTWK L2 PTE pefetch */ #define N13MISC_CTL_offSP_SHADOW_EN 4 /* Enable shadow stack pointers */ +#define MISC_CTL_offHWPRE 11 /* Enable HardWare PREFETCH */ /* bit 6, 9:31 reserved */ #define N13MISC_CTL_makBTB ( 0x1 << N13MISC_CTL_offBTB ) #define N13MISC_CTL_makRTP ( 0x1 << N13MISC_CTL_offRTP ) #define N13MISC_CTL_makPTEPF ( 0x1 << N13MISC_CTL_offPTEPF ) #define N13MISC_CTL_makSP_SHADOW_EN ( 0x1 << N13MISC_CTL_offSP_SHADOW_EN ) +#define MISC_CTL_makHWPRE_EN ( 0x1 << MISC_CTL_offHWPRE ) +#ifdef CONFIG_HW_PRE +#define MISC_init (N13MISC_CTL_makBTB|N13MISC_CTL_makRTP|N13MISC_CTL_makSP_SHADOW_EN|MISC_CTL_makHWPRE_EN) +#else #define MISC_init (N13MISC_CTL_makBTB|N13MISC_CTL_makRTP|N13MISC_CTL_makSP_SHADOW_EN) +#endif /****************************************************************************** * PRUSR_ACC_CTL (Privileged Resource User Access Control Registers) @@ -926,6 +937,7 @@ #define FPCSR_mskDNIT ( 0x1 << FPCSR_offDNIT ) #define FPCSR_mskRIT ( 0x1 << FPCSR_offRIT ) #define FPCSR_mskALL (FPCSR_mskIVO | FPCSR_mskDBZ | FPCSR_mskOVF | FPCSR_mskUDF | FPCSR_mskIEX) +#define FPCSR_mskALLE_NO_UDFE (FPCSR_mskIVOE | FPCSR_mskDBZE | FPCSR_mskOVFE | FPCSR_mskIEXE) #define FPCSR_mskALLE (FPCSR_mskIVOE | FPCSR_mskDBZE | FPCSR_mskOVFE | FPCSR_mskUDFE | FPCSR_mskIEXE) #define FPCSR_mskALLT (FPCSR_mskIVOT | FPCSR_mskDBZT | FPCSR_mskOVFT | FPCSR_mskUDFT | FPCSR_mskIEXT |FPCSR_mskDNIT | FPCSR_mskRIT) @@ -946,6 +958,15 @@ #define FPCFG_mskIMVER ( 0x1F << FPCFG_offIMVER ) #define FPCFG_mskAVER ( 0x1F << FPCFG_offAVER ) +/* 8 Single precision or 4 double precision registers are available */ +#define SP8_DP4_reg 0 +/* 16 Single precision or 8 double precision registers are available */ +#define SP16_DP8_reg 1 +/* 32 Single precision or 16 double precision registers are available */ +#define SP32_DP16_reg 2 +/* 32 Single precision or 32 double precision registers are available */ +#define SP32_DP32_reg 3 + /****************************************************************************** * fucpr: FUCOP_CTL (FPU and Coprocessor Enable Control Register) *****************************************************************************/ diff --git a/arch/nds32/include/asm/elf.h b/arch/nds32/include/asm/elf.h @@ -9,6 +9,7 @@ */ #include <asm/ptrace.h> +#include <asm/fpu.h> typedef unsigned long elf_greg_t; typedef unsigned long elf_freg_t[3]; @@ -159,8 +160,18 @@ struct elf32_hdr; #endif + +#if IS_ENABLED(CONFIG_FPU) +#define FPU_AUX_ENT NEW_AUX_ENT(AT_FPUCW, FPCSR_INIT) +#else +#define FPU_AUX_ENT NEW_AUX_ENT(AT_IGNORE, 0) +#endif + #define ARCH_DLINFO \ do { \ + /* Optional FPU initialization */ \ + FPU_AUX_ENT; \ + \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (elf_addr_t)current->mm->context.vdso); \ } while (0) diff --git a/arch/nds32/include/asm/fpu.h b/arch/nds32/include/asm/fpu.h @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2005-2018 Andes Technology Corporation */ + +#ifndef __ASM_NDS32_FPU_H +#define __ASM_NDS32_FPU_H + +#if IS_ENABLED(CONFIG_FPU) +#ifndef __ASSEMBLY__ +#include <linux/sched/task_stack.h> +#include <linux/preempt.h> +#include <asm/ptrace.h> + +extern bool has_fpu; + +extern void save_fpu(struct task_struct *__tsk); +extern void load_fpu(const struct fpu_struct *fpregs); +extern bool do_fpu_exception(unsigned int subtype, struct pt_regs *regs); +extern int do_fpuemu(struct pt_regs *regs, struct fpu_struct *fpu); + +#define test_tsk_fpu(regs) (regs->fucop_ctl & FUCOP_CTL_mskCP0EN) + +/* + * Initially load the FPU with signalling NANS. This bit pattern + * has the property that no matter whether considered as single or as + * double precision, it still represents a signalling NAN. + */ + +#define sNAN64 0xFFFFFFFFFFFFFFFFULL +#define sNAN32 0xFFFFFFFFUL + +#if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) +/* + * Denormalized number is unsupported by nds32 FPU. Hence the operation + * is treated as underflow cases when the final result is a denormalized + * number. To enhance precision, underflow exception trap should be + * enabled by default and kerenl will re-execute it by fpu emulator + * when getting underflow exception. + */ +#define FPCSR_INIT FPCSR_mskUDFE +#else +#define FPCSR_INIT 0x0UL +#endif + +extern const struct fpu_struct init_fpuregs; + +static inline void disable_ptreg_fpu(struct pt_regs *regs) +{ + regs->fucop_ctl &= ~FUCOP_CTL_mskCP0EN; +} + +static inline void enable_ptreg_fpu(struct pt_regs *regs) +{ + regs->fucop_ctl |= FUCOP_CTL_mskCP0EN; +} + +static inline void enable_fpu(void) +{ + unsigned long fucop_ctl; + + fucop_ctl = __nds32__mfsr(NDS32_SR_FUCOP_CTL) | FUCOP_CTL_mskCP0EN; + __nds32__mtsr(fucop_ctl, NDS32_SR_FUCOP_CTL); + __nds32__isb(); +} + +static inline void disable_fpu(void) +{ + unsigned long fucop_ctl; + + fucop_ctl = __nds32__mfsr(NDS32_SR_FUCOP_CTL) & ~FUCOP_CTL_mskCP0EN; + __nds32__mtsr(fucop_ctl, NDS32_SR_FUCOP_CTL); + __nds32__isb(); +} + +static inline void lose_fpu(void) +{ + preempt_disable(); +#if IS_ENABLED(CONFIG_LAZY_FPU) + if (last_task_used_math == current) { + last_task_used_math = NULL; +#else + if (test_tsk_fpu(task_pt_regs(current))) { +#endif + save_fpu(current); + } + disable_ptreg_fpu(task_pt_regs(current)); + preempt_enable(); +} + +static inline void own_fpu(void) +{ + preempt_disable(); +#if IS_ENABLED(CONFIG_LAZY_FPU) + if (last_task_used_math != current) { + if (last_task_used_math != NULL) + save_fpu(last_task_used_math); + load_fpu(&current->thread.fpu); + last_task_used_math = current; + } +#else + if (!test_tsk_fpu(task_pt_regs(current))) { + load_fpu(&current->thread.fpu); + } +#endif + enable_ptreg_fpu(task_pt_regs(current)); + preempt_enable(); +} + +#if !IS_ENABLED(CONFIG_LAZY_FPU) +static inline void unlazy_fpu(struct task_struct *tsk) +{ + preempt_disable(); + if (test_tsk_fpu(task_pt_regs(tsk))) + save_fpu(tsk); + preempt_enable(); +} +#endif /* !CONFIG_LAZY_FPU */ +static inline void clear_fpu(struct pt_regs *regs) +{ + preempt_disable(); + if (test_tsk_fpu(regs)) + disable_ptreg_fpu(regs); + preempt_enable(); +} +#endif /* CONFIG_FPU */ +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_NDS32_FPU_H */ diff --git a/arch/nds32/include/asm/fpuemu.h b/arch/nds32/include/asm/fpuemu.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2005-2018 Andes Technology Corporation */ + +#ifndef __ARCH_NDS32_FPUEMU_H +#define __ARCH_NDS32_FPUEMU_H + +/* + * single precision + */ + +void fadds(void *ft, void *fa, void *fb); +void fsubs(void *ft, void *fa, void *fb); +void fmuls(void *ft, void *fa, void *fb); +void fdivs(void *ft, void *fa, void *fb); +void fs2d(void *ft, void *fa); +void fsqrts(void *ft, void *fa); +void fnegs(void *ft, void *fa); +int fcmps(void *ft, void *fa, void *fb, int cop); + +/* + * double precision + */ +void faddd(void *ft, void *fa, void *fb); +void fsubd(void *ft, void *fa, void *fb); +void fmuld(void *ft, void *fa, void *fb); +void fdivd(void *ft, void *fa, void *fb); +void fsqrtd(void *ft, void *fa); +void fd2s(void *ft, void *fa); +void fnegd(void *ft, void *fa); +int fcmpd(void *ft, void *fa, void *fb, int cop); + +#endif /* __ARCH_NDS32_FPUEMU_H */ diff --git a/arch/nds32/include/asm/nds32_fpu_inst.h b/arch/nds32/include/asm/nds32_fpu_inst.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2005-2018 Andes Technology Corporation */ + +#ifndef __NDS32_FPU_INST_H +#define __NDS32_FPU_INST_H + +#define cop0_op 0x35 + +/* + * COP0 field of opcodes. + */ +#define fs1_op 0x0 +#define fs2_op 0x4 +#define fd1_op 0x8 +#define fd2_op 0xc + +/* + * FS1 opcode. + */ +enum fs1 { + fadds_op, fsubs_op, fcpynss_op, fcpyss_op, + fmadds_op, fmsubs_op, fcmovns_op, fcmovzs_op, + fnmadds_op, fnmsubs_op, + fmuls_op = 0xc, fdivs_op, + fs1_f2op_op = 0xf +}; + +/* + * FS1/F2OP opcode. + */ +enum fs1_f2 { + fs2d_op, fsqrts_op, + fui2s_op = 0x8, fsi2s_op = 0xc, + fs2ui_op = 0x10, fs2ui_z_op = 0x14, + fs2si_op = 0x18, fs2si_z_op = 0x1c +}; + +/* + * FS2 opcode. + */ +enum fs2 { + fcmpeqs_op, fcmpeqs_e_op, fcmplts_op, fcmplts_e_op, + fcmples_op, fcmples_e_op, fcmpuns_op, fcmpuns_e_op +}; + +/* + * FD1 opcode. + */ +enum fd1 { + faddd_op, fsubd_op, fcpynsd_op, fcpysd_op, + fmaddd_op, fmsubd_op, fcmovnd_op, fcmovzd_op, + fnmaddd_op, fnmsubd_op, + fmuld_op = 0xc, fdivd_op, fd1_f2op_op = 0xf +}; + +/* + * FD1/F2OP opcode. + */ +enum fd1_f2 { + fd2s_op, fsqrtd_op, + fui2d_op = 0x8, fsi2d_op = 0xc, + fd2ui_op = 0x10, fd2ui_z_op = 0x14, + fd2si_op = 0x18, fd2si_z_op = 0x1c +}; + +/* + * FD2 opcode. + */ +enum fd2 { + fcmpeqd_op, fcmpeqd_e_op, fcmpltd_op, fcmpltd_e_op, + fcmpled_op, fcmpled_e_op, fcmpund_op, fcmpund_e_op +}; + +#define NDS32Insn(x) x + +#define I_OPCODE_off 25 +#define NDS32Insn_OPCODE(x) (NDS32Insn(x) >> I_OPCODE_off) + +#define I_OPCODE_offRt 20 +#define I_OPCODE_mskRt (0x1fUL << I_OPCODE_offRt) +#define NDS32Insn_OPCODE_Rt(x) \ + ((NDS32Insn(x) & I_OPCODE_mskRt) >> I_OPCODE_offRt) + +#define I_OPCODE_offRa 15 +#define I_OPCODE_mskRa (0x1fUL << I_OPCODE_offRa) +#define NDS32Insn_OPCODE_Ra(x) \ + ((NDS32Insn(x) & I_OPCODE_mskRa) >> I_OPCODE_offRa) + +#define I_OPCODE_offRb 10 +#define I_OPCODE_mskRb (0x1fUL << I_OPCODE_offRb) +#define NDS32Insn_OPCODE_Rb(x) \ + ((NDS32Insn(x) & I_OPCODE_mskRb) >> I_OPCODE_offRb) + +#define I_OPCODE_offbit1014 10 +#define I_OPCODE_mskbit1014 (0x1fUL << I_OPCODE_offbit1014) +#define NDS32Insn_OPCODE_BIT1014(x) \ + ((NDS32Insn(x) & I_OPCODE_mskbit1014) >> I_OPCODE_offbit1014) + +#define I_OPCODE_offbit69 6 +#define I_OPCODE_mskbit69 (0xfUL << I_OPCODE_offbit69) +#define NDS32Insn_OPCODE_BIT69(x) \ + ((NDS32Insn(x) & I_OPCODE_mskbit69) >> I_OPCODE_offbit69) + +#define I_OPCODE_offCOP0 0 +#define I_OPCODE_mskCOP0 (0x3fUL << I_OPCODE_offCOP0) +#define NDS32Insn_OPCODE_COP0(x) \ + ((NDS32Insn(x) & I_OPCODE_mskCOP0) >> I_OPCODE_offCOP0) + +#endif /* __NDS32_FPU_INST_H */ diff --git a/arch/nds32/include/asm/perf_event.h b/arch/nds32/include/asm/perf_event.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2008-2018 Andes Technology Corporation */ + +#ifndef __ASM_PERF_EVENT_H +#define __ASM_PERF_EVENT_H + +/* + * This file is request by Perf, + * please refer to tools/perf/design.txt for more details + */ +struct pt_regs; +unsigned long perf_instruction_pointer(struct pt_regs *regs); +unsigned long perf_misc_flags(struct pt_regs *regs); +#define perf_misc_flags(regs) perf_misc_flags(regs) + +#endif diff --git a/arch/nds32/include/asm/pmu.h b/arch/nds32/include/asm/pmu.h @@ -0,0 +1,386 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2008-2018 Andes Technology Corporation */ + +#ifndef __ASM_PMU_H +#define __ASM_PMU_H + +#include <linux/interrupt.h> +#include <linux/perf_event.h> +#include <asm/unistd.h> +#include <asm/bitfield.h> + +/* Has special meaning for perf core implementation */ +#define HW_OP_UNSUPPORTED 0x0 +#define C(_x) PERF_COUNT_HW_CACHE_##_x +#define CACHE_OP_UNSUPPORTED 0x0 + +/* Enough for both software and hardware defined events */ +#define SOFTWARE_EVENT_MASK 0xFF + +#define PFM_OFFSET_MAGIC_0 2 /* DO NOT START FROM 0 */ +#define PFM_OFFSET_MAGIC_1 (PFM_OFFSET_MAGIC_0 + 36) +#define PFM_OFFSET_MAGIC_2 (PFM_OFFSET_MAGIC_1 + 36) + +enum { PFMC0, PFMC1, PFMC2, MAX_COUNTERS }; + +u32 PFM_CTL_OVF[3] = { PFM_CTL_mskOVF0, PFM_CTL_mskOVF1, + PFM_CTL_mskOVF2 }; +u32 PFM_CTL_EN[3] = { PFM_CTL_mskEN0, PFM_CTL_mskEN1, + PFM_CTL_mskEN2 }; +u32 PFM_CTL_OFFSEL[3] = { PFM_CTL_offSEL0, PFM_CTL_offSEL1, + PFM_CTL_offSEL2 }; +u32 PFM_CTL_IE[3] = { PFM_CTL_mskIE0, PFM_CTL_mskIE1, PFM_CTL_mskIE2 }; +u32 PFM_CTL_KS[3] = { PFM_CTL_mskKS0, PFM_CTL_mskKS1, PFM_CTL_mskKS2 }; +u32 PFM_CTL_KU[3] = { PFM_CTL_mskKU0, PFM_CTL_mskKU1, PFM_CTL_mskKU2 }; +u32 PFM_CTL_SEL[3] = { PFM_CTL_mskSEL0, PFM_CTL_mskSEL1, PFM_CTL_mskSEL2 }; +/* + * Perf Events' indices + */ +#define NDS32_IDX_CYCLE_COUNTER 0 +#define NDS32_IDX_COUNTER0 1 +#define NDS32_IDX_COUNTER1 2 + +/* The events for a given PMU register set. */ +struct pmu_hw_events { + /* + * The events that are active on the PMU for the given index. + */ + struct perf_event *events[MAX_COUNTERS]; + + /* + * A 1 bit for an index indicates that the counter is being used for + * an event. A 0 means that the counter can be used. + */ + unsigned long used_mask[BITS_TO_LONGS(MAX_COUNTERS)]; + + /* + * Hardware lock to serialize accesses to PMU registers. Needed for the + * read/modify/write sequences. + */ + raw_spinlock_t pmu_lock; +}; + +struct nds32_pmu { + struct pmu pmu; + cpumask_t active_irqs; + char *name; + irqreturn_t (*handle_irq)(int irq_num, void *dev); + void (*enable)(struct perf_event *event); + void (*disable)(struct perf_event *event); + int (*get_event_idx)(struct pmu_hw_events *hw_events, + struct perf_event *event); + int (*set_event_filter)(struct hw_perf_event *evt, + struct perf_event_attr *attr); + u32 (*read_counter)(struct perf_event *event); + void (*write_counter)(struct perf_event *event, u32 val); + void (*start)(struct nds32_pmu *nds32_pmu); + void (*stop)(struct nds32_pmu *nds32_pmu); + void (*reset)(void *data); + int (*request_irq)(struct nds32_pmu *nds32_pmu, irq_handler_t handler); + void (*free_irq)(struct nds32_pmu *nds32_pmu); + int (*map_event)(struct perf_event *event); + int num_events; + atomic_t active_events; + u64 max_period; + struct platform_device *plat_device; + struct pmu_hw_events *(*get_hw_events)(void); +}; + +#define to_nds32_pmu(p) (container_of(p, struct nds32_pmu, pmu)) + +int nds32_pmu_register(struct nds32_pmu *nds32_pmu, int type); + +u64 nds32_pmu_event_update(struct perf_event *event); + +int nds32_pmu_event_set_period(struct perf_event *event); + +/* + * Common NDS32 SPAv3 event types + * + * Note: An implementation may not be able to count all of these events + * but the encodings are considered to be `reserved' in the case that + * they are not available. + * + * SEL_TOTAL_CYCLES will add an offset is due to ZERO is defined as + * NOT_SUPPORTED EVENT mapping in generic perf code. + * You will need to deal it in the event writing implementation. + */ +enum spav3_counter_0_perf_types { + SPAV3_0_SEL_BASE = -1 + PFM_OFFSET_MAGIC_0, /* counting symbol */ + SPAV3_0_SEL_TOTAL_CYCLES = 0 + PFM_OFFSET_MAGIC_0, + SPAV3_0_SEL_COMPLETED_INSTRUCTION = 1 + PFM_OFFSET_MAGIC_0, + SPAV3_0_SEL_LAST /* counting symbol */ +}; + +enum spav3_counter_1_perf_types { + SPAV3_1_SEL_BASE = -1 + PFM_OFFSET_MAGIC_1, /* counting symbol */ + SPAV3_1_SEL_TOTAL_CYCLES = 0 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_COMPLETED_INSTRUCTION = 1 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_CONDITIONAL_BRANCH = 2 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_TAKEN_CONDITIONAL_BRANCH = 3 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_PREFETCH_INSTRUCTION = 4 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_RET_INST = 5 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_JR_INST = 6 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_JAL_JRAL_INST = 7 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_NOP_INST = 8 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_SCW_INST = 9 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_ISB_DSB_INST = 10 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_CCTL_INST = 11 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_TAKEN_INTERRUPTS = 12 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_LOADS_COMPLETED = 13 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_UITLB_ACCESS = 14 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_UDTLB_ACCESS = 15 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_MTLB_ACCESS = 16 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_CODE_CACHE_ACCESS = 17 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_DATA_DEPENDENCY_STALL_CYCLES = 18 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_DATA_CACHE_MISS_STALL_CYCLES = 19 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_DATA_CACHE_ACCESS = 20 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_DATA_CACHE_MISS = 21 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_LOAD_DATA_CACHE_ACCESS = 22 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_STORE_DATA_CACHE_ACCESS = 23 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_ILM_ACCESS = 24 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_LSU_BIU_CYCLES = 25 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_HPTWK_BIU_CYCLES = 26 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_DMA_BIU_CYCLES = 27 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_CODE_CACHE_FILL_BIU_CYCLES = 28 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_LEGAL_UNALIGN_DCACHE_ACCESS = 29 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_PUSH25 = 30 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_SYSCALLS_INST = 31 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_LAST /* counting symbol */ +}; + +enum spav3_counter_2_perf_types { + SPAV3_2_SEL_BASE = -1 + PFM_OFFSET_MAGIC_2, /* counting symbol */ + SPAV3_2_SEL_TOTAL_CYCLES = 0 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_COMPLETED_INSTRUCTION = 1 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_CONDITIONAL_BRANCH_MISPREDICT = 2 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_TAKEN_CONDITIONAL_BRANCH_MISPREDICT = + 3 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_PREFETCH_INSTRUCTION_CACHE_HIT = 4 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_RET_MISPREDICT = 5 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_IMMEDIATE_J_INST = 6 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_MULTIPLY_INST = 7 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_16_BIT_INST = 8 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_FAILED_SCW_INST = 9 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_LD_AFTER_ST_CONFLICT_REPLAYS = 10 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_TAKEN_EXCEPTIONS = 12 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_STORES_COMPLETED = 13 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_UITLB_MISS = 14 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_UDTLB_MISS = 15 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_MTLB_MISS = 16 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_CODE_CACHE_MISS = 17 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_EMPTY_INST_QUEUE_STALL_CYCLES = 18 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_DATA_WRITE_BACK = 19 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_DATA_CACHE_MISS = 21 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_LOAD_DATA_CACHE_MISS = 22 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_STORE_DATA_CACHE_MISS = 23 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_DLM_ACCESS = 24 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_LSU_BIU_REQUEST = 25 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_HPTWK_BIU_REQUEST = 26 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_DMA_BIU_REQUEST = 27 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_CODE_CACHE_FILL_BIU_REQUEST = 28 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_EXTERNAL_EVENTS = 29 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_POP25 = 30 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_LAST /* counting symbol */ +}; + +/* Get converted event counter index */ +static inline int get_converted_event_idx(unsigned long event) +{ + int idx; + + if ((event) > SPAV3_0_SEL_BASE && event < SPAV3_0_SEL_LAST) { + idx = 0; + } else if ((event) > SPAV3_1_SEL_BASE && event < SPAV3_1_SEL_LAST) { + idx = 1; + } else if ((event) > SPAV3_2_SEL_BASE && event < SPAV3_2_SEL_LAST) { + idx = 2; + } else { + pr_err("GET_CONVERTED_EVENT_IDX PFM counter range error\n"); + return -EPERM; + } + + return idx; +} + +/* Get converted hardware event number */ +static inline u32 get_converted_evet_hw_num(u32 event) +{ + if (event > SPAV3_0_SEL_BASE && event < SPAV3_0_SEL_LAST) + event -= PFM_OFFSET_MAGIC_0; + else if (event > SPAV3_1_SEL_BASE && event < SPAV3_1_SEL_LAST) + event -= PFM_OFFSET_MAGIC_1; + else if (event > SPAV3_2_SEL_BASE && event < SPAV3_2_SEL_LAST) + event -= PFM_OFFSET_MAGIC_2; + else if (event != 0) + pr_err("GET_CONVERTED_EVENT_HW_NUM PFM counter range error\n"); + + return event; +} + +/* + * NDS32 HW events mapping + * + * The hardware events that we support. We do support cache operations but + * we have harvard caches and no way to combine instruction and data + * accesses/misses in hardware. + */ +static const unsigned int nds32_pfm_perf_map[PERF_COUNT_HW_MAX] = { + [PERF_COUNT_HW_CPU_CYCLES] = SPAV3_0_SEL_TOTAL_CYCLES, + [PERF_COUNT_HW_INSTRUCTIONS] = SPAV3_1_SEL_COMPLETED_INSTRUCTION, + [PERF_COUNT_HW_CACHE_REFERENCES] = SPAV3_1_SEL_DATA_CACHE_ACCESS, + [PERF_COUNT_HW_CACHE_MISSES] = SPAV3_2_SEL_DATA_CACHE_MISS, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = HW_OP_UNSUPPORTED, + [PERF_COUNT_HW_BRANCH_MISSES] = HW_OP_UNSUPPORTED, + [PERF_COUNT_HW_BUS_CYCLES] = HW_OP_UNSUPPORTED, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = HW_OP_UNSUPPORTED, + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = HW_OP_UNSUPPORTED, + [PERF_COUNT_HW_REF_CPU_CYCLES] = HW_OP_UNSUPPORTED +}; + +static const unsigned int nds32_pfm_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = + SPAV3_1_SEL_LOAD_DATA_CACHE_ACCESS, + [C(RESULT_MISS)] = + SPAV3_2_SEL_LOAD_DATA_CACHE_MISS, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = + SPAV3_1_SEL_STORE_DATA_CACHE_ACCESS, + [C(RESULT_MISS)] = + SPAV3_2_SEL_STORE_DATA_CACHE_MISS, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = + CACHE_OP_UNSUPPORTED, + }, + }, + [C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = + SPAV3_1_SEL_CODE_CACHE_ACCESS, + [C(RESULT_MISS)] = + SPAV3_2_SEL_CODE_CACHE_MISS, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = + SPAV3_1_SEL_CODE_CACHE_ACCESS, + [C(RESULT_MISS)] = + SPAV3_2_SEL_CODE_CACHE_MISS, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, + /* TODO: L2CC */ + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, + /* NDS32 PMU does not support TLB read/write hit/miss, + * However, it can count access/miss, which mixed with read and write. + * Therefore, only READ counter will use it. + * We do as possible as we can. + */ + [C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = + SPAV3_1_SEL_UDTLB_ACCESS, + [C(RESULT_MISS)] = + SPAV3_2_SEL_UDTLB_MISS, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = + CACHE_OP_UNSUPPORTED, + }, + }, + [C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = + SPAV3_1_SEL_UITLB_ACCESS, + [C(RESULT_MISS)] = + SPAV3_2_SEL_UITLB_MISS, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = + CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = + CACHE_OP_UNSUPPORTED, + }, + }, + [C(BPU)] = { /* What is BPU? */ + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = + CACHE_OP_UNSUPPORTED, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = + CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = + CACHE_OP_UNSUPPORTED, + }, + }, + [C(NODE)] = { /* What is NODE? */ + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = + CACHE_OP_UNSUPPORTED, + }, + }, +}; + +int nds32_pmu_map_event(struct perf_event *event, + const unsigned int (*event_map)[PERF_COUNT_HW_MAX], + const unsigned int (*cache_map)[PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX], u32 raw_event_mask); + +#endif /* __ASM_PMU_H */ diff --git a/arch/nds32/include/asm/processor.h b/arch/nds32/include/asm/processor.h @@ -35,6 +35,8 @@ struct thread_struct { unsigned long address; unsigned long trap_no; unsigned long error_code; + + struct fpu_struct fpu; }; #define INIT_THREAD { } @@ -72,6 +74,11 @@ struct task_struct; /* Free all resources held by a thread. */ #define release_thread(thread) do { } while(0) +#if IS_ENABLED(CONFIG_FPU) +#if !IS_ENABLED(CONFIG_UNLAZU_FPU) +extern struct task_struct *last_task_used_math; +#endif +#endif /* Prepare to copy thread state - unlazy all lazy status */ #define prepare_to_copy(tsk) do { } while (0) diff --git a/arch/nds32/include/asm/sfp-machine.h b/arch/nds32/include/asm/sfp-machine.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2005-2018 Andes Technology Corporation */ + +#include <asm/bitfield.h> + +#define _FP_W_TYPE_SIZE 32 +#define _FP_W_TYPE unsigned long +#define _FP_WS_TYPE signed long +#define _FP_I_TYPE long + +#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2)) +#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1)) +#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2)) + +#define _FP_MUL_MEAT_S(R, X, Y) \ + _FP_MUL_MEAT_1_wide(_FP_WFRACBITS_S, R, X, Y, umul_ppmm) +#define _FP_MUL_MEAT_D(R, X, Y) \ + _FP_MUL_MEAT_2_wide(_FP_WFRACBITS_D, R, X, Y, umul_ppmm) +#define _FP_MUL_MEAT_Q(R, X, Y) \ + _FP_MUL_MEAT_4_wide(_FP_WFRACBITS_Q, R, X, Y, umul_ppmm) + +#define _FP_MUL_MEAT_DW_S(R, X, Y) \ + _FP_MUL_MEAT_DW_1_wide(_FP_WFRACBITS_S, R, X, Y, umul_ppmm) +#define _FP_MUL_MEAT_DW_D(R, X, Y) \ + _FP_MUL_MEAT_DW_2_wide(_FP_WFRACBITS_D, R, X, Y, umul_ppmm) + +#define _FP_DIV_MEAT_S(R, X, Y) _FP_DIV_MEAT_1_udiv_norm(S, R, X, Y) +#define _FP_DIV_MEAT_D(R, X, Y) _FP_DIV_MEAT_2_udiv(D, R, X, Y) + +#define _FP_NANFRAC_S ((_FP_QNANBIT_S << 1) - 1) +#define _FP_NANFRAC_D ((_FP_QNANBIT_D << 1) - 1), -1 +#define _FP_NANFRAC_Q ((_FP_QNANBIT_Q << 1) - 1), -1, -1, -1 +#define _FP_NANSIGN_S 0 +#define _FP_NANSIGN_D 0 +#define _FP_NANSIGN_Q 0 + +#define _FP_KEEPNANFRACP 1 +#define _FP_QNANNEGATEDP 0 + +#define _FP_CHOOSENAN(fs, wc, R, X, Y, OP) \ +do { \ + if ((_FP_FRAC_HIGH_RAW_##fs(X) & _FP_QNANBIT_##fs) \ + && !(_FP_FRAC_HIGH_RAW_##fs(Y) & _FP_QNANBIT_##fs)) { \ + R##_s = Y##_s; \ + _FP_FRAC_COPY_##wc(R, Y); \ + } else { \ + R##_s = X##_s; \ + _FP_FRAC_COPY_##wc(R, X); \ + } \ + R##_c = FP_CLS_NAN; \ +} while (0) + +#define __FPU_FPCSR (current->thread.fpu.fpcsr) + +/* Obtain the current rounding mode. */ +#define FP_ROUNDMODE \ +({ \ + __FPU_FPCSR & FPCSR_mskRM; \ +}) + +#define FP_RND_NEAREST 0 +#define FP_RND_PINF 1 +#define FP_RND_MINF 2 +#define FP_RND_ZERO 3 + +#define FP_EX_INVALID FPCSR_mskIVO +#define FP_EX_DIVZERO FPCSR_mskDBZ +#define FP_EX_OVERFLOW FPCSR_mskOVF +#define FP_EX_UNDERFLOW FPCSR_mskUDF +#define FP_EX_INEXACT FPCSR_mskIEX + +#define SF_CEQ 2 +#define SF_CLT 1 +#define SF_CGT 3 +#define SF_CUN 4 + +#include <asm/byteorder.h> + +#ifdef __BIG_ENDIAN__ +#define __BYTE_ORDER __BIG_ENDIAN +#define __LITTLE_ENDIAN 0 +#else +#define __BYTE_ORDER __LITTLE_ENDIAN +#define __BIG_ENDIAN 0 +#endif + +#define abort() do { } while (0) +#define umul_ppmm(w1, w0, u, v) \ +do { \ + UWtype __x0, __x1, __x2, __x3; \ + UHWtype __ul, __vl, __uh, __vh; \ + \ + __ul = __ll_lowpart(u); \ + __uh = __ll_highpart(u); \ + __vl = __ll_lowpart(v); \ + __vh = __ll_highpart(v); \ + \ + __x0 = (UWtype) __ul * __vl; \ + __x1 = (UWtype) __ul * __vh; \ + __x2 = (UWtype) __uh * __vl; \ + __x3 = (UWtype) __uh * __vh; \ + \ + __x1 += __ll_highpart(__x0); \ + __x1 += __x2; \ + if (__x1 < __x2) \ + __x3 += __ll_B; \ + \ + (w1) = __x3 + __ll_highpart(__x1); \ + (w0) = __ll_lowpart(__x1) * __ll_B + __ll_lowpart(__x0); \ +} while (0) + +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ +do { \ + UWtype __x; \ + __x = (al) + (bl); \ + (sh) = (ah) + (bh) + (__x < (al)); \ + (sl) = __x; \ +} while (0) + +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ +do { \ + UWtype __x; \ + __x = (al) - (bl); \ + (sh) = (ah) - (bh) - (__x > (al)); \ + (sl) = __x; \ +} while (0) + +#define udiv_qrnnd(q, r, n1, n0, d) \ +do { \ + UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \ + __d1 = __ll_highpart(d); \ + __d0 = __ll_lowpart(d); \ + \ + __r1 = (n1) % __d1; \ + __q1 = (n1) / __d1; \ + __m = (UWtype) __q1 * __d0; \ + __r1 = __r1 * __ll_B | __ll_highpart(n0); \ + if (__r1 < __m) { \ + __q1--, __r1 += (d); \ + if (__r1 >= (d)) \ + if (__r1 < __m) \ + __q1--, __r1 += (d); \ + } \ + __r1 -= __m; \ + __r0 = __r1 % __d1; \ + __q0 = __r1 / __d1; \ + __m = (UWtype) __q0 * __d0; \ + __r0 = __r0 * __ll_B | __ll_lowpart(n0); \ + if (__r0 < __m) { \ + __q0--, __r0 += (d); \ + if (__r0 >= (d)) \ + if (__r0 < __m) \ + __q0--, __r0 += (d); \ + } \ + __r0 -= __m; \ + (q) = (UWtype) __q1 * __ll_B | __q0; \ + (r) = __r0; \ +} while (0) diff --git a/arch/nds32/include/asm/stacktrace.h b/arch/nds32/include/asm/stacktrace.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2008-2018 Andes Technology Corporation */ + +#ifndef __ASM_STACKTRACE_H +#define __ASM_STACKTRACE_H + +/* Kernel callchain */ +struct stackframe { + unsigned long fp; + unsigned long sp; + unsigned long lp; +}; + +/* + * struct frame_tail: User callchain + * IMPORTANT: + * This struct is used for call-stack walking, + * the order and types matters. + * Do not use array, it only stores sizeof(pointer) + * + * The details can refer to arch/arm/kernel/perf_event.c + */ +struct frame_tail { + unsigned long stack_fp; + unsigned long stack_lp; +}; + +/* For User callchain with optimize for size */ +struct frame_tail_opt_size { + unsigned long stack_r6; + unsigned long stack_fp; + unsigned long stack_gp; + unsigned long stack_lp; +}; + +extern void +get_real_ret_addr(unsigned long *addr, struct task_struct *tsk, int *graph); + +#endif /* __ASM_STACKTRACE_H */ diff --git a/arch/nds32/include/asm/suspend.h b/arch/nds32/include/asm/suspend.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (C) 2008-2017 Andes Technology Corporation + +#ifndef __ASM_NDS32_SUSPEND_H +#define __ASM_NDS32_SUSPEND_H + +extern void suspend2ram(void); +extern void cpu_resume(void); +extern unsigned long wake_mask; + +#endif diff --git a/arch/nds32/include/asm/syscalls.h b/arch/nds32/include/asm/syscalls.h @@ -7,6 +7,7 @@ asmlinkage long sys_cacheflush(unsigned long addr, unsigned long len, unsigned int op); asmlinkage long sys_fadvise64_64_wrapper(int fd, int advice, loff_t offset, loff_t len); asmlinkage long sys_rt_sigreturn_wrapper(void); +asmlinkage long sys_udftrap(int option); #include <asm-generic/syscalls.h> diff --git a/arch/nds32/include/uapi/asm/auxvec.h b/arch/nds32/include/uapi/asm/auxvec.h @@ -4,6 +4,13 @@ #ifndef __ASM_AUXVEC_H #define __ASM_AUXVEC_H +/* + * This entry gives some information about the FPU initialization + * performed by the kernel. + */ +#define AT_FPUCW 18 /* Used FPU control word. */ + + /* VDSO location */ #define AT_SYSINFO_EHDR 33 diff --git a/arch/nds32/include/uapi/asm/sigcontext.h b/arch/nds32/include/uapi/asm/sigcontext.h @@ -9,6 +9,19 @@ * before the signal handler was invoked. Note: only add new entries * to the end of the structure. */ +struct fpu_struct { + unsigned long long fd_regs[32]; + unsigned long fpcsr; + /* + * UDF_trap is used to recognize whether underflow trap is enabled + * or not. When UDF_trap == 1, this process will be traped and then + * get a SIGFPE signal when encountering an underflow exception. + * UDF_trap is only modified through setfputrap syscall. Therefore, + * UDF_trap needn't be saved or loaded to context in each context + * switch. + */ + unsigned long UDF_trap; +}; struct zol_struct { unsigned long nds32_lc; /* $LC */ @@ -54,6 +67,7 @@ struct sigcontext { unsigned long fault_address; unsigned long used_math_flag; /* FPU Registers */ + struct fpu_struct fpu; struct zol_struct zol; }; diff --git a/arch/nds32/include/uapi/asm/udftrap.h b/arch/nds32/include/uapi/asm/udftrap.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2005-2018 Andes Technology Corporation */ +#ifndef _ASM_SETFPUTRAP +#define _ASM_SETFPUTRAP + +/* + * Options for setfputrap system call + */ +#define DISABLE_UDFTRAP 0 /* disable underflow exception trap */ +#define ENABLE_UDFTRAP 1 /* enable undeflos exception trap */ +#define GET_UDFTRAP 2 /* only get undeflos exception trap status */ + +#endif /* _ASM_CACHECTL */ diff --git a/arch/nds32/include/uapi/asm/unistd.h b/arch/nds32/include/uapi/asm/unistd.h @@ -9,4 +9,6 @@ /* Additional NDS32 specific syscalls. */ #define __NR_cacheflush (__NR_arch_specific_syscall) +#define __NR_udftrap (__NR_arch_specific_syscall + 1) __SYSCALL(__NR_cacheflush, sys_cacheflush) +__SYSCALL(__NR_udftrap, sys_udftrap) diff --git a/arch/nds32/kernel/Makefile b/arch/nds32/kernel/Makefile @@ -4,7 +4,6 @@ CPPFLAGS_vmlinux.lds := -DTEXTADDR=$(TEXTADDR) AFLAGS_head.o := -DTEXTADDR=$(TEXTADDR) - # Object file lists. obj-y := ex-entry.o ex-exit.o ex-scall.o irq.o \ @@ -14,11 +13,15 @@ obj-y := ex-entry.o ex-exit.o ex-scall.o irq.o \ obj-$(CONFIG_MODULES) += nds32_ksyms.o module.o obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-$(CONFIG_FPU) += fpu.o obj-$(CONFIG_OF) += devtree.o obj-$(CONFIG_CACHE_L2) += atl2c.o - +obj-$(CONFIG_PERF_EVENTS) += perf_event_cpu.o +obj-$(CONFIG_PM) += pm.o sleep.o extra-y := head.o vmlinux.lds +CFLAGS_fpu.o += -mext-fpu-sp -mext-fpu-dp + obj-y += vdso/ diff --git a/arch/nds32/kernel/ex-entry.S b/arch/nds32/kernel/ex-entry.S @@ -7,6 +7,7 @@ #include <asm/errno.h> #include <asm/asm-offsets.h> #include <asm/page.h> +#include <asm/fpu.h> #ifdef CONFIG_HWZOL .macro push_zol @@ -15,12 +16,31 @@ mfusr $r16, $LC .endm #endif + .macro skip_save_fucop_ctl +#if defined(CONFIG_FPU) +skip_fucop_ctl: + smw.adm $p0, [$sp], $p0, #0x1 + j fucop_ctl_done +#endif + .endm .macro save_user_regs - +#if defined(CONFIG_FPU) + sethi $p0, hi20(has_fpu) + lbsi $p0, [$p0+lo12(has_fpu)] + beqz $p0, skip_fucop_ctl + mfsr $p0, $FUCOP_CTL + smw.adm $p0, [$sp], $p0, #0x1 + bclr $p0, $p0, #FUCOP_CTL_offCP0EN + mtsr $p0, $FUCOP_CTL +fucop_ctl_done: + /* move $SP to the bottom of pt_regs */ + addi $sp, $sp, -FUCOP_CTL_OFFSET +#else smw.adm $sp, [$sp], $sp, #0x1 /* move $SP to the bottom of pt_regs */ addi $sp, $sp, -OSP_OFFSET +#endif /* push $r0 ~ $r25 */ smw.bim $r0, [$sp], $r25 @@ -79,6 +99,7 @@ exception_handlers: .long eh_syscall !Syscall .long asm_do_IRQ !IRQ + skip_save_fucop_ctl common_exception_handler: save_user_regs mfsr $p0, $ITYPE @@ -103,7 +124,6 @@ common_exception_handler: mtsr $r21, $PSW dsb jr $p1 - /* syscall */ 1: addi $p1, $p0, #-NDS32_VECTOR_offEXCEPTION diff --git a/arch/nds32/kernel/ex-exit.S b/arch/nds32/kernel/ex-exit.S @@ -8,6 +8,7 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/current.h> +#include <asm/fpu.h> @@ -22,10 +23,18 @@ .macro restore_user_regs_first setgie.d isb - +#if defined(CONFIG_FPU) + addi $sp, $sp, OSP_OFFSET + lmw.adm $r12, [$sp], $r25, #0x0 + sethi $p0, hi20(has_fpu) + lbsi $p0, [$p0+lo12(has_fpu)] + beqz $p0, 2f + mtsr $r25, $FUCOP_CTL +2: +#else addi $sp, $sp, FUCOP_CTL_OFFSET - lmw.adm $r12, [$sp], $r24, #0x0 +#endif mtsr $r12, $SP_USR mtsr $r13, $IPC #ifdef CONFIG_HWZOL diff --git a/arch/nds32/kernel/ex-scall.S b/arch/nds32/kernel/ex-scall.S @@ -19,11 +19,13 @@ ENTRY(__switch_to) la $p0, __entry_task sw $r1, [$p0] - move $p1, $r0 - addi $p1, $p1, #THREAD_CPU_CONTEXT + addi $p1, $r0, #THREAD_CPU_CONTEXT smw.bi $r6, [$p1], $r14, #0xb ! push r6~r14, fp, lp, sp move $r25, $r1 - addi $r1, $r1, #THREAD_CPU_CONTEXT +#if defined(CONFIG_FPU) + call _switch_fpu +#endif + addi $r1, $r25, #THREAD_CPU_CONTEXT lmw.bi $r6, [$r1], $r14, #0xb ! pop r6~r14, fp, lp, sp ret diff --git a/arch/nds32/kernel/fpu.c b/arch/nds32/kernel/fpu.c @@ -0,0 +1,269 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation + +#include <linux/sched.h> +#include <linux/signal.h> +#include <linux/sched/signal.h> +#include <asm/processor.h> +#include <asm/user.h> +#include <asm/io.h> +#include <asm/bitfield.h> +#include <asm/fpu.h> + +const struct fpu_struct init_fpuregs = { + .fd_regs = {[0 ... 31] = sNAN64}, + .fpcsr = FPCSR_INIT, +#if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) + .UDF_trap = 0 +#endif +}; + +void save_fpu(struct task_struct *tsk) +{ + unsigned int fpcfg, fpcsr; + + enable_fpu(); + fpcfg = ((__nds32__fmfcfg() & FPCFG_mskFREG) >> FPCFG_offFREG); + switch (fpcfg) { + case SP32_DP32_reg: + asm volatile ("fsdi $fd31, [%0+0xf8]\n\t" + "fsdi $fd30, [%0+0xf0]\n\t" + "fsdi $fd29, [%0+0xe8]\n\t" + "fsdi $fd28, [%0+0xe0]\n\t" + "fsdi $fd27, [%0+0xd8]\n\t" + "fsdi $fd26, [%0+0xd0]\n\t" + "fsdi $fd25, [%0+0xc8]\n\t" + "fsdi $fd24, [%0+0xc0]\n\t" + "fsdi $fd23, [%0+0xb8]\n\t" + "fsdi $fd22, [%0+0xb0]\n\t" + "fsdi $fd21, [%0+0xa8]\n\t" + "fsdi $fd20, [%0+0xa0]\n\t" + "fsdi $fd19, [%0+0x98]\n\t" + "fsdi $fd18, [%0+0x90]\n\t" + "fsdi $fd17, [%0+0x88]\n\t" + "fsdi $fd16, [%0+0x80]\n\t" + : /* no output */ + : "r" (&tsk->thread.fpu) + : "memory"); + /* fall through */ + case SP32_DP16_reg: + asm volatile ("fsdi $fd15, [%0+0x78]\n\t" + "fsdi $fd14, [%0+0x70]\n\t" + "fsdi $fd13, [%0+0x68]\n\t" + "fsdi $fd12, [%0+0x60]\n\t" + "fsdi $fd11, [%0+0x58]\n\t" + "fsdi $fd10, [%0+0x50]\n\t" + "fsdi $fd9, [%0+0x48]\n\t" + "fsdi $fd8, [%0+0x40]\n\t" + : /* no output */ + : "r" (&tsk->thread.fpu) + : "memory"); + /* fall through */ + case SP16_DP8_reg: + asm volatile ("fsdi $fd7, [%0+0x38]\n\t" + "fsdi $fd6, [%0+0x30]\n\t" + "fsdi $fd5, [%0+0x28]\n\t" + "fsdi $fd4, [%0+0x20]\n\t" + : /* no output */ + : "r" (&tsk->thread.fpu) + : "memory"); + /* fall through */ + case SP8_DP4_reg: + asm volatile ("fsdi $fd3, [%1+0x18]\n\t" + "fsdi $fd2, [%1+0x10]\n\t" + "fsdi $fd1, [%1+0x8]\n\t" + "fsdi $fd0, [%1+0x0]\n\t" + "fmfcsr %0\n\t" + "swi %0, [%1+0x100]\n\t" + : "=&r" (fpcsr) + : "r"(&tsk->thread.fpu) + : "memory"); + } + disable_fpu(); +} + +void load_fpu(const struct fpu_struct *fpregs) +{ + unsigned int fpcfg, fpcsr; + + enable_fpu(); + fpcfg = ((__nds32__fmfcfg() & FPCFG_mskFREG) >> FPCFG_offFREG); + switch (fpcfg) { + case SP32_DP32_reg: + asm volatile ("fldi $fd31, [%0+0xf8]\n\t" + "fldi $fd30, [%0+0xf0]\n\t" + "fldi $fd29, [%0+0xe8]\n\t" + "fldi $fd28, [%0+0xe0]\n\t" + "fldi $fd27, [%0+0xd8]\n\t" + "fldi $fd26, [%0+0xd0]\n\t" + "fldi $fd25, [%0+0xc8]\n\t" + "fldi $fd24, [%0+0xc0]\n\t" + "fldi $fd23, [%0+0xb8]\n\t" + "fldi $fd22, [%0+0xb0]\n\t" + "fldi $fd21, [%0+0xa8]\n\t" + "fldi $fd20, [%0+0xa0]\n\t" + "fldi $fd19, [%0+0x98]\n\t" + "fldi $fd18, [%0+0x90]\n\t" + "fldi $fd17, [%0+0x88]\n\t" + "fldi $fd16, [%0+0x80]\n\t" + : /* no output */ + : "r" (fpregs)); + /* fall through */ + case SP32_DP16_reg: + asm volatile ("fldi $fd15, [%0+0x78]\n\t" + "fldi $fd14, [%0+0x70]\n\t" + "fldi $fd13, [%0+0x68]\n\t" + "fldi $fd12, [%0+0x60]\n\t" + "fldi $fd11, [%0+0x58]\n\t" + "fldi $fd10, [%0+0x50]\n\t" + "fldi $fd9, [%0+0x48]\n\t" + "fldi $fd8, [%0+0x40]\n\t" + : /* no output */ + : "r" (fpregs)); + /* fall through */ + case SP16_DP8_reg: + asm volatile ("fldi $fd7, [%0+0x38]\n\t" + "fldi $fd6, [%0+0x30]\n\t" + "fldi $fd5, [%0+0x28]\n\t" + "fldi $fd4, [%0+0x20]\n\t" + : /* no output */ + : "r" (fpregs)); + /* fall through */ + case SP8_DP4_reg: + asm volatile ("fldi $fd3, [%1+0x18]\n\t" + "fldi $fd2, [%1+0x10]\n\t" + "fldi $fd1, [%1+0x8]\n\t" + "fldi $fd0, [%1+0x0]\n\t" + "lwi %0, [%1+0x100]\n\t" + "fmtcsr %0\n\t":"=&r" (fpcsr) + : "r"(fpregs)); + } + disable_fpu(); +} +void store_fpu_for_suspend(void) +{ +#ifdef CONFIG_LAZY_FPU + if (last_task_used_math != NULL) + save_fpu(last_task_used_math); + last_task_used_math = NULL; +#else + if (!used_math()) + return; + unlazy_fpu(current); +#endif + clear_fpu(task_pt_regs(current)); +} +inline void do_fpu_context_switch(struct pt_regs *regs) +{ + /* Enable to use FPU. */ + + if (!user_mode(regs)) { + pr_err("BUG: FPU is used in kernel mode.\n"); + BUG(); + return; + } + + enable_ptreg_fpu(regs); +#ifdef CONFIG_LAZY_FPU //Lazy FPU is used + if (last_task_used_math == current) + return; + if (last_task_used_math != NULL) + /* Other processes fpu state, save away */ + save_fpu(last_task_used_math); + last_task_used_math = current; +#endif + if (used_math()) { + load_fpu(&current->thread.fpu); + } else { + /* First time FPU user. */ + load_fpu(&init_fpuregs); +#if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) + current->thread.fpu.UDF_trap = init_fpuregs.UDF_trap; +#endif + set_used_math(); + } + +} + +inline void fill_sigfpe_signo(unsigned int fpcsr, int *signo) +{ + if (fpcsr & FPCSR_mskOVFT) + *signo = FPE_FLTOVF; +#ifndef CONFIG_SUPPORT_DENORMAL_ARITHMETIC + else if (fpcsr & FPCSR_mskUDFT) + *signo = FPE_FLTUND; +#endif + else if (fpcsr & FPCSR_mskIVOT) + *signo = FPE_FLTINV; + else if (fpcsr & FPCSR_mskDBZT) + *signo = FPE_FLTDIV; + else if (fpcsr & FPCSR_mskIEXT) + *signo = FPE_FLTRES; +} + +inline void handle_fpu_exception(struct pt_regs *regs) +{ + unsigned int fpcsr; + int si_code = 0, si_signo = SIGFPE; +#if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) + unsigned long redo_except = FPCSR_mskDNIT|FPCSR_mskUDFT; +#else + unsigned long redo_except = FPCSR_mskDNIT; +#endif + + lose_fpu(); + fpcsr = current->thread.fpu.fpcsr; + + if (fpcsr & redo_except) { +#if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) + if (fpcsr & FPCSR_mskUDFT) + current->thread.fpu.fpcsr &= ~FPCSR_mskIEX; +#endif + si_signo = do_fpuemu(regs, &current->thread.fpu); + fpcsr = current->thread.fpu.fpcsr; + if (!si_signo) + goto done; + } else if (fpcsr & FPCSR_mskRIT) { + if (!user_mode(regs)) + do_exit(SIGILL); + si_signo = SIGILL; + } + + + switch (si_signo) { + case SIGFPE: + fill_sigfpe_signo(fpcsr, &si_code); + break; + case SIGILL: + show_regs(regs); + si_code = ILL_COPROC; + break; + case SIGBUS: + si_code = BUS_ADRERR; + break; + default: + break; + } + + force_sig_fault(si_signo, si_code, + (void __user *)instruction_pointer(regs), current); +done: + own_fpu(); +} + +bool do_fpu_exception(unsigned int subtype, struct pt_regs *regs) +{ + int done = true; + /* Coprocessor disabled exception */ + if (subtype == FPU_DISABLE_EXCEPTION) { + preempt_disable(); + do_fpu_context_switch(regs); + preempt_enable(); + } + /* Coprocessor exception such as underflow and overflow */ + else if (subtype == FPU_EXCEPTION) + handle_fpu_exception(regs); + else + done = false; + return done; +} diff --git a/arch/nds32/kernel/head.S b/arch/nds32/kernel/head.S @@ -123,21 +123,12 @@ _image_size_check: andi $r0, $r0, MMU_CFG_mskTBS srli $r6, $r6, MMU_CFG_offTBW srli $r0, $r0, MMU_CFG_offTBS - /* - * we just map the kernel to the maximum way - 1 of tlb - * reserver one way for UART VA mapping - * it will cause page fault if UART mapping cover the kernel mapping - * - * direct mapping is not supported now. - */ - li $r2, 't' - beqz $r6, __error ! MMU_CFG.TBW = 0 is direct mappin + addi $r6, $r6, #0x1 ! MMU_CFG.TBW value -> meaning addi $r0, $r0, #0x2 ! MMU_CFG.TBS value -> meaning sll $r0, $r6, $r0 ! entries = k-way * n-set mul $r6, $r0, $r5 ! max size = entries * page size /* check kernel image size */ la $r3, (_end - PAGE_OFFSET) - li $r2, 's' bgt $r3, $r6, __error li $r2, #(PHYS_OFFSET + TLB_DATA_kernel_text_attr) @@ -160,7 +151,7 @@ _tlb: #endif mtsr $r3, $TLB_MISC - mfsr $r0, $MISC_CTL ! Enable BTB and RTP and shadow sp + mfsr $r0, $MISC_CTL ! Enable BTB, RTP, shadow sp, and HW_PRE ori $r0, $r0, #MISC_init mtsr $r0, $MISC_CTL diff --git a/arch/nds32/kernel/perf_event_cpu.c b/arch/nds32/kernel/perf_event_cpu.c @@ -0,0 +1,1522 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2008-2017 Andes Technology Corporation + * + * Reference ARMv7: Jean Pihet <jpihet@mvista.com> + * 2010 (c) MontaVista Software, LLC. + */ + +#include <linux/perf_event.h> +#include <linux/bitmap.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/of.h> +#include <linux/platform_device.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/pm_runtime.h> +#include <linux/ftrace.h> +#include <linux/uaccess.h> +#include <linux/sched/clock.h> +#include <linux/percpu-defs.h> + +#include <asm/pmu.h> +#include <asm/irq_regs.h> +#include <asm/nds32.h> +#include <asm/stacktrace.h> +#include <asm/perf_event.h> +#include <nds32_intrinsic.h> + +/* Set at runtime when we know what CPU type we are. */ +static struct nds32_pmu *cpu_pmu; + +static DEFINE_PER_CPU(struct pmu_hw_events, cpu_hw_events); +static void nds32_pmu_start(struct nds32_pmu *cpu_pmu); +static void nds32_pmu_stop(struct nds32_pmu *cpu_pmu); +static struct platform_device_id cpu_pmu_plat_device_ids[] = { + {.name = "nds32-pfm"}, + {}, +}; + +static int nds32_pmu_map_cache_event(const unsigned int (*cache_map) + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX], u64 config) +{ + unsigned int cache_type, cache_op, cache_result, ret; + + cache_type = (config >> 0) & 0xff; + if (cache_type >= PERF_COUNT_HW_CACHE_MAX) + return -EINVAL; + + cache_op = (config >> 8) & 0xff; + if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) + return -EINVAL; + + cache_result = (config >> 16) & 0xff; + if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + ret = (int)(*cache_map)[cache_type][cache_op][cache_result]; + + if (ret == CACHE_OP_UNSUPPORTED) + return -ENOENT; + + return ret; +} + +static int +nds32_pmu_map_hw_event(const unsigned int (*event_map)[PERF_COUNT_HW_MAX], + u64 config) +{ + int mapping; + + if (config >= PERF_COUNT_HW_MAX) + return -ENOENT; + + mapping = (*event_map)[config]; + return mapping == HW_OP_UNSUPPORTED ? -ENOENT : mapping; +} + +static int nds32_pmu_map_raw_event(u32 raw_event_mask, u64 config) +{ + int ev_type = (int)(config & raw_event_mask); + int idx = config >> 8; + + switch (idx) { + case 0: + ev_type = PFM_OFFSET_MAGIC_0 + ev_type; + if (ev_type >= SPAV3_0_SEL_LAST || ev_type <= SPAV3_0_SEL_BASE) + return -ENOENT; + break; + case 1: + ev_type = PFM_OFFSET_MAGIC_1 + ev_type; + if (ev_type >= SPAV3_1_SEL_LAST || ev_type <= SPAV3_1_SEL_BASE) + return -ENOENT; + break; + case 2: + ev_type = PFM_OFFSET_MAGIC_2 + ev_type; + if (ev_type >= SPAV3_2_SEL_LAST || ev_type <= SPAV3_2_SEL_BASE) + return -ENOENT; + break; + default: + return -ENOENT; + } + + return ev_type; +} + +int +nds32_pmu_map_event(struct perf_event *event, + const unsigned int (*event_map)[PERF_COUNT_HW_MAX], + const unsigned int (*cache_map) + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX], u32 raw_event_mask) +{ + u64 config = event->attr.config; + + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + return nds32_pmu_map_hw_event(event_map, config); + case PERF_TYPE_HW_CACHE: + return nds32_pmu_map_cache_event(cache_map, config); + case PERF_TYPE_RAW: + return nds32_pmu_map_raw_event(raw_event_mask, config); + } + + return -ENOENT; +} + +static int nds32_spav3_map_event(struct perf_event *event) +{ + return nds32_pmu_map_event(event, &nds32_pfm_perf_map, + &nds32_pfm_perf_cache_map, SOFTWARE_EVENT_MASK); +} + +static inline u32 nds32_pfm_getreset_flags(void) +{ + /* Read overflow status */ + u32 val = __nds32__mfsr(NDS32_SR_PFM_CTL); + u32 old_val = val; + + /* Write overflow bit to clear status, and others keep it 0 */ + u32 ov_flag = PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]; + + __nds32__mtsr(val | ov_flag, NDS32_SR_PFM_CTL); + + return old_val; +} + +static inline int nds32_pfm_has_overflowed(u32 pfm) +{ + u32 ov_flag = PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]; + + return pfm & ov_flag; +} + +static inline int nds32_pfm_counter_has_overflowed(u32 pfm, int idx) +{ + u32 mask = 0; + + switch (idx) { + case 0: + mask = PFM_CTL_OVF[0]; + break; + case 1: + mask = PFM_CTL_OVF[1]; + break; + case 2: + mask = PFM_CTL_OVF[2]; + break; + default: + pr_err("%s index wrong\n", __func__); + break; + } + return pfm & mask; +} + +/* + * Set the next IRQ period, based on the hwc->period_left value. + * To be called with the event disabled in hw: + */ +int nds32_pmu_event_set_period(struct perf_event *event) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + s64 left = local64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int ret = 0; + + /* The period may have been changed by PERF_EVENT_IOC_PERIOD */ + if (unlikely(period != hwc->last_period)) + left = period - (hwc->last_period - left); + + if (unlikely(left <= -period)) { + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + + if (unlikely(left <= 0)) { + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + + if (left > (s64)nds32_pmu->max_period) + left = nds32_pmu->max_period; + + /* + * The hw event starts counting from this event offset, + * mark it to be able to extract future "deltas": + */ + local64_set(&hwc->prev_count, (u64)(-left)); + + nds32_pmu->write_counter(event, (u64)(-left) & nds32_pmu->max_period); + + perf_event_update_userpage(event); + + return ret; +} + +static irqreturn_t nds32_pmu_handle_irq(int irq_num, void *dev) +{ + u32 pfm; + struct perf_sample_data data; + struct nds32_pmu *cpu_pmu = (struct nds32_pmu *)dev; + struct pmu_hw_events *cpuc = cpu_pmu->get_hw_events(); + struct pt_regs *regs; + int idx; + /* + * Get and reset the IRQ flags + */ + pfm = nds32_pfm_getreset_flags(); + + /* + * Did an overflow occur? + */ + if (!nds32_pfm_has_overflowed(pfm)) + return IRQ_NONE; + + /* + * Handle the counter(s) overflow(s) + */ + regs = get_irq_regs(); + + nds32_pmu_stop(cpu_pmu); + for (idx = 0; idx < cpu_pmu->num_events; ++idx) { + struct perf_event *event = cpuc->events[idx]; + struct hw_perf_event *hwc; + + /* Ignore if we don't have an event. */ + if (!event) + continue; + + /* + * We have a single interrupt for all counters. Check that + * each counter has overflowed before we process it. + */ + if (!nds32_pfm_counter_has_overflowed(pfm, idx)) + continue; + + hwc = &event->hw; + nds32_pmu_event_update(event); + perf_sample_data_init(&data, 0, hwc->last_period); + if (!nds32_pmu_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + cpu_pmu->disable(event); + } + nds32_pmu_start(cpu_pmu); + /* + * Handle the pending perf events. + * + * Note: this call *must* be run with interrupts disabled. For + * platforms that can have the PMU interrupts raised as an NMI, this + * will not work. + */ + irq_work_run(); + + return IRQ_HANDLED; +} + +static inline int nds32_pfm_counter_valid(struct nds32_pmu *cpu_pmu, int idx) +{ + return ((idx >= 0) && (idx < cpu_pmu->num_events)); +} + +static inline int nds32_pfm_disable_counter(int idx) +{ + unsigned int val = __nds32__mfsr(NDS32_SR_PFM_CTL); + u32 mask = 0; + + mask = PFM_CTL_EN[idx]; + val &= ~mask; + val &= ~(PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]); + __nds32__mtsr_isb(val, NDS32_SR_PFM_CTL); + return idx; +} + +/* + * Add an event filter to a given event. + */ +static int nds32_pmu_set_event_filter(struct hw_perf_event *event, + struct perf_event_attr *attr) +{ + unsigned long config_base = 0; + int idx = event->idx; + unsigned long no_kernel_tracing = 0; + unsigned long no_user_tracing = 0; + /* If index is -1, do not do anything */ + if (idx == -1) + return 0; + + no_kernel_tracing = PFM_CTL_KS[idx]; + no_user_tracing = PFM_CTL_KU[idx]; + /* + * Default: enable both kernel and user mode tracing. + */ + if (attr->exclude_user) + config_base |= no_user_tracing; + + if (attr->exclude_kernel) + config_base |= no_kernel_tracing; + + /* + * Install the filter into config_base as this is used to + * construct the event type. + */ + event->config_base |= config_base; + return 0; +} + +static inline void nds32_pfm_write_evtsel(int idx, u32 evnum) +{ + u32 offset = 0; + u32 ori_val = __nds32__mfsr(NDS32_SR_PFM_CTL); + u32 ev_mask = 0; + u32 no_kernel_mask = 0; + u32 no_user_mask = 0; + u32 val; + + offset = PFM_CTL_OFFSEL[idx]; + /* Clear previous mode selection, and write new one */ + no_kernel_mask = PFM_CTL_KS[idx]; + no_user_mask = PFM_CTL_KU[idx]; + ori_val &= ~no_kernel_mask; + ori_val &= ~no_user_mask; + if (evnum & no_kernel_mask) + ori_val |= no_kernel_mask; + + if (evnum & no_user_mask) + ori_val |= no_user_mask; + + /* Clear previous event selection */ + ev_mask = PFM_CTL_SEL[idx]; + ori_val &= ~ev_mask; + evnum &= SOFTWARE_EVENT_MASK; + + /* undo the linear mapping */ + evnum = get_converted_evet_hw_num(evnum); + val = ori_val | (evnum << offset); + val &= ~(PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]); + __nds32__mtsr_isb(val, NDS32_SR_PFM_CTL); +} + +static inline int nds32_pfm_enable_counter(int idx) +{ + unsigned int val = __nds32__mfsr(NDS32_SR_PFM_CTL); + u32 mask = 0; + + mask = PFM_CTL_EN[idx]; + val |= mask; + val &= ~(PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]); + __nds32__mtsr_isb(val, NDS32_SR_PFM_CTL); + return idx; +} + +static inline int nds32_pfm_enable_intens(int idx) +{ + unsigned int val = __nds32__mfsr(NDS32_SR_PFM_CTL); + u32 mask = 0; + + mask = PFM_CTL_IE[idx]; + val |= mask; + val &= ~(PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]); + __nds32__mtsr_isb(val, NDS32_SR_PFM_CTL); + return idx; +} + +static inline int nds32_pfm_disable_intens(int idx) +{ + unsigned int val = __nds32__mfsr(NDS32_SR_PFM_CTL); + u32 mask = 0; + + mask = PFM_CTL_IE[idx]; + val &= ~mask; + val &= ~(PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]); + __nds32__mtsr_isb(val, NDS32_SR_PFM_CTL); + return idx; +} + +static int event_requires_mode_exclusion(struct perf_event_attr *attr) +{ + /* Other modes NDS32 does not support */ + return attr->exclude_user || attr->exclude_kernel; +} + +static void nds32_pmu_enable_event(struct perf_event *event) +{ + unsigned long flags; + unsigned int evnum = 0; + struct hw_perf_event *hwc = &event->hw; + struct nds32_pmu *cpu_pmu = to_nds32_pmu(event->pmu); + struct pmu_hw_events *events = cpu_pmu->get_hw_events(); + int idx = hwc->idx; + + if (!nds32_pfm_counter_valid(cpu_pmu, idx)) { + pr_err("CPU enabling wrong pfm counter IRQ enable\n"); + return; + } + + /* + * Enable counter and interrupt, and set the counter to count + * the event that we're interested in. + */ + raw_spin_lock_irqsave(&events->pmu_lock, flags); + + /* + * Disable counter + */ + nds32_pfm_disable_counter(idx); + + /* + * Check whether we need to exclude the counter from certain modes. + */ + if ((!cpu_pmu->set_event_filter || + cpu_pmu->set_event_filter(hwc, &event->attr)) && + event_requires_mode_exclusion(&event->attr)) { + pr_notice + ("NDS32 performance counters do not support mode exclusion\n"); + hwc->config_base = 0; + } + /* Write event */ + evnum = hwc->config_base; + nds32_pfm_write_evtsel(idx, evnum); + + /* + * Enable interrupt for this counter + */ + nds32_pfm_enable_intens(idx); + + /* + * Enable counter + */ + nds32_pfm_enable_counter(idx); + + raw_spin_unlock_irqrestore(&events->pmu_lock, flags); +} + +static void nds32_pmu_disable_event(struct perf_event *event) +{ + unsigned long flags; + struct hw_perf_event *hwc = &event->hw; + struct nds32_pmu *cpu_pmu = to_nds32_pmu(event->pmu); + struct pmu_hw_events *events = cpu_pmu->get_hw_events(); + int idx = hwc->idx; + + if (!nds32_pfm_counter_valid(cpu_pmu, idx)) { + pr_err("CPU disabling wrong pfm counter IRQ enable %d\n", idx); + return; + } + + /* + * Disable counter and interrupt + */ + raw_spin_lock_irqsave(&events->pmu_lock, flags); + + /* + * Disable counter + */ + nds32_pfm_disable_counter(idx); + + /* + * Disable interrupt for this counter + */ + nds32_pfm_disable_intens(idx); + + raw_spin_unlock_irqrestore(&events->pmu_lock, flags); +} + +static inline u32 nds32_pmu_read_counter(struct perf_event *event) +{ + struct nds32_pmu *cpu_pmu = to_nds32_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + u32 count = 0; + + if (!nds32_pfm_counter_valid(cpu_pmu, idx)) { + pr_err("CPU reading wrong counter %d\n", idx); + } else { + switch (idx) { + case PFMC0: + count = __nds32__mfsr(NDS32_SR_PFMC0); + break; + case PFMC1: + count = __nds32__mfsr(NDS32_SR_PFMC1); + break; + case PFMC2: + count = __nds32__mfsr(NDS32_SR_PFMC2); + break; + default: + pr_err + ("%s: CPU has no performance counters %d\n", + __func__, idx); + } + } + return count; +} + +static inline void nds32_pmu_write_counter(struct perf_event *event, u32 value) +{ + struct nds32_pmu *cpu_pmu = to_nds32_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + if (!nds32_pfm_counter_valid(cpu_pmu, idx)) { + pr_err("CPU writing wrong counter %d\n", idx); + } else { + switch (idx) { + case PFMC0: + __nds32__mtsr_isb(value, NDS32_SR_PFMC0); + break; + case PFMC1: + __nds32__mtsr_isb(value, NDS32_SR_PFMC1); + break; + case PFMC2: + __nds32__mtsr_isb(value, NDS32_SR_PFMC2); + break; + default: + pr_err + ("%s: CPU has no performance counters %d\n", + __func__, idx); + } + } +} + +static int nds32_pmu_get_event_idx(struct pmu_hw_events *cpuc, + struct perf_event *event) +{ + int idx; + struct hw_perf_event *hwc = &event->hw; + /* + * Current implementation maps cycles, instruction count and cache-miss + * to specific counter. + * However, multiple of the 3 counters are able to count these events. + * + * + * SOFTWARE_EVENT_MASK mask for getting event num , + * This is defined by Jia-Rung, you can change the polocies. + * However, do not exceed 8 bits. This is hardware specific. + * The last number is SPAv3_2_SEL_LAST. + */ + unsigned long evtype = hwc->config_base & SOFTWARE_EVENT_MASK; + + idx = get_converted_event_idx(evtype); + /* + * Try to get the counter for correpsonding event + */ + if (evtype == SPAV3_0_SEL_TOTAL_CYCLES) { + if (!test_and_set_bit(idx, cpuc->used_mask)) + return idx; + if (!test_and_set_bit(NDS32_IDX_COUNTER0, cpuc->used_mask)) + return NDS32_IDX_COUNTER0; + if (!test_and_set_bit(NDS32_IDX_COUNTER1, cpuc->used_mask)) + return NDS32_IDX_COUNTER1; + } else if (evtype == SPAV3_1_SEL_COMPLETED_INSTRUCTION) { + if (!test_and_set_bit(idx, cpuc->used_mask)) + return idx; + else if (!test_and_set_bit(NDS32_IDX_COUNTER1, cpuc->used_mask)) + return NDS32_IDX_COUNTER1; + else if (!test_and_set_bit + (NDS32_IDX_CYCLE_COUNTER, cpuc->used_mask)) + return NDS32_IDX_CYCLE_COUNTER; + } else { + if (!test_and_set_bit(idx, cpuc->used_mask)) + return idx; + } + return -EAGAIN; +} + +static void nds32_pmu_start(struct nds32_pmu *cpu_pmu) +{ + unsigned long flags; + unsigned int val; + struct pmu_hw_events *events = cpu_pmu->get_hw_events(); + + raw_spin_lock_irqsave(&events->pmu_lock, flags); + + /* Enable all counters , NDS PFM has 3 counters */ + val = __nds32__mfsr(NDS32_SR_PFM_CTL); + val |= (PFM_CTL_EN[0] | PFM_CTL_EN[1] | PFM_CTL_EN[2]); + val &= ~(PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]); + __nds32__mtsr_isb(val, NDS32_SR_PFM_CTL); + + raw_spin_unlock_irqrestore(&events->pmu_lock, flags); +} + +static void nds32_pmu_stop(struct nds32_pmu *cpu_pmu) +{ + unsigned long flags; + unsigned int val; + struct pmu_hw_events *events = cpu_pmu->get_hw_events(); + + raw_spin_lock_irqsave(&events->pmu_lock, flags); + + /* Disable all counters , NDS PFM has 3 counters */ + val = __nds32__mfsr(NDS32_SR_PFM_CTL); + val &= ~(PFM_CTL_EN[0] | PFM_CTL_EN[1] | PFM_CTL_EN[2]); + val &= ~(PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]); + __nds32__mtsr_isb(val, NDS32_SR_PFM_CTL); + + raw_spin_unlock_irqrestore(&events->pmu_lock, flags); +} + +static void nds32_pmu_reset(void *info) +{ + u32 val = 0; + + val |= (PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]); + __nds32__mtsr(val, NDS32_SR_PFM_CTL); + __nds32__mtsr(0, NDS32_SR_PFM_CTL); + __nds32__mtsr(0, NDS32_SR_PFMC0); + __nds32__mtsr(0, NDS32_SR_PFMC1); + __nds32__mtsr(0, NDS32_SR_PFMC2); +} + +static void nds32_pmu_init(struct nds32_pmu *cpu_pmu) +{ + cpu_pmu->handle_irq = nds32_pmu_handle_irq; + cpu_pmu->enable = nds32_pmu_enable_event; + cpu_pmu->disable = nds32_pmu_disable_event; + cpu_pmu->read_counter = nds32_pmu_read_counter; + cpu_pmu->write_counter = nds32_pmu_write_counter; + cpu_pmu->get_event_idx = nds32_pmu_get_event_idx; + cpu_pmu->start = nds32_pmu_start; + cpu_pmu->stop = nds32_pmu_stop; + cpu_pmu->reset = nds32_pmu_reset; + cpu_pmu->max_period = 0xFFFFFFFF; /* Maximum counts */ +}; + +static u32 nds32_read_num_pfm_events(void) +{ + /* NDS32 SPAv3 PMU support 3 counter */ + return 3; +} + +static int device_pmu_init(struct nds32_pmu *cpu_pmu) +{ + nds32_pmu_init(cpu_pmu); + /* + * This name should be devive-specific name, whatever you like :) + * I think "PMU" will be a good generic name. + */ + cpu_pmu->name = "nds32v3-pmu"; + cpu_pmu->map_event = nds32_spav3_map_event; + cpu_pmu->num_events = nds32_read_num_pfm_events(); + cpu_pmu->set_event_filter = nds32_pmu_set_event_filter; + return 0; +} + +/* + * CPU PMU identification and probing. + */ +static int probe_current_pmu(struct nds32_pmu *pmu) +{ + int ret; + + get_cpu(); + ret = -ENODEV; + /* + * If ther are various CPU types with its own PMU, initialize with + * + * the corresponding one + */ + device_pmu_init(pmu); + put_cpu(); + return ret; +} + +static void nds32_pmu_enable(struct pmu *pmu) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(pmu); + struct pmu_hw_events *hw_events = nds32_pmu->get_hw_events(); + int enabled = bitmap_weight(hw_events->used_mask, + nds32_pmu->num_events); + + if (enabled) + nds32_pmu->start(nds32_pmu); +} + +static void nds32_pmu_disable(struct pmu *pmu) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(pmu); + + nds32_pmu->stop(nds32_pmu); +} + +static void nds32_pmu_release_hardware(struct nds32_pmu *nds32_pmu) +{ + nds32_pmu->free_irq(nds32_pmu); + pm_runtime_put_sync(&nds32_pmu->plat_device->dev); +} + +static irqreturn_t nds32_pmu_dispatch_irq(int irq, void *dev) +{ + struct nds32_pmu *nds32_pmu = (struct nds32_pmu *)dev; + int ret; + u64 start_clock, finish_clock; + + start_clock = local_clock(); + ret = nds32_pmu->handle_irq(irq, dev); + finish_clock = local_clock(); + + perf_sample_event_took(finish_clock - start_clock); + return ret; +} + +static int nds32_pmu_reserve_hardware(struct nds32_pmu *nds32_pmu) +{ + int err; + struct platform_device *pmu_device = nds32_pmu->plat_device; + + if (!pmu_device) + return -ENODEV; + + pm_runtime_get_sync(&pmu_device->dev); + err = nds32_pmu->request_irq(nds32_pmu, nds32_pmu_dispatch_irq); + if (err) { + nds32_pmu_release_hardware(nds32_pmu); + return err; + } + + return 0; +} + +static int +validate_event(struct pmu *pmu, struct pmu_hw_events *hw_events, + struct perf_event *event) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(event->pmu); + + if (is_software_event(event)) + return 1; + + if (event->pmu != pmu) + return 0; + + if (event->state < PERF_EVENT_STATE_OFF) + return 1; + + if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec) + return 1; + + return nds32_pmu->get_event_idx(hw_events, event) >= 0; +} + +static int validate_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + struct pmu_hw_events fake_pmu; + DECLARE_BITMAP(fake_used_mask, MAX_COUNTERS); + /* + * Initialize the fake PMU. We only need to populate the + * used_mask for the purposes of validation. + */ + memset(fake_used_mask, 0, sizeof(fake_used_mask)); + + if (!validate_event(event->pmu, &fake_pmu, leader)) + return -EINVAL; + + for_each_sibling_event(sibling, leader) { + if (!validate_event(event->pmu, &fake_pmu, sibling)) + return -EINVAL; + } + + if (!validate_event(event->pmu, &fake_pmu, event)) + return -EINVAL; + + return 0; +} + +static int __hw_perf_event_init(struct perf_event *event) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int mapping; + + mapping = nds32_pmu->map_event(event); + + if (mapping < 0) { + pr_debug("event %x:%llx not supported\n", event->attr.type, + event->attr.config); + return mapping; + } + + /* + * We don't assign an index until we actually place the event onto + * hardware. Use -1 to signify that we haven't decided where to put it + * yet. For SMP systems, each core has it's own PMU so we can't do any + * clever allocation or constraints checking at this point. + */ + hwc->idx = -1; + hwc->config_base = 0; + hwc->config = 0; + hwc->event_base = 0; + + /* + * Check whether we need to exclude the counter from certain modes. + */ + if ((!nds32_pmu->set_event_filter || + nds32_pmu->set_event_filter(hwc, &event->attr)) && + event_requires_mode_exclusion(&event->attr)) { + pr_debug + ("NDS performance counters do not support mode exclusion\n"); + return -EOPNOTSUPP; + } + + /* + * Store the event encoding into the config_base field. + */ + hwc->config_base |= (unsigned long)mapping; + + if (!hwc->sample_period) { + /* + * For non-sampling runs, limit the sample_period to half + * of the counter width. That way, the new counter value + * is far less likely to overtake the previous one unless + * you have some serious IRQ latency issues. + */ + hwc->sample_period = nds32_pmu->max_period >> 1; + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + } + + if (event->group_leader != event) { + if (validate_group(event) != 0) + return -EINVAL; + } + + return 0; +} + +static int nds32_pmu_event_init(struct perf_event *event) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(event->pmu); + int err = 0; + atomic_t *active_events = &nds32_pmu->active_events; + + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + if (nds32_pmu->map_event(event) == -ENOENT) + return -ENOENT; + + if (!atomic_inc_not_zero(active_events)) { + if (atomic_read(active_events) == 0) { + /* Register irq handler */ + err = nds32_pmu_reserve_hardware(nds32_pmu); + } + + if (!err) + atomic_inc(active_events); + } + + if (err) + return err; + + err = __hw_perf_event_init(event); + + return err; +} + +static void nds32_start(struct perf_event *event, int flags) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + /* + * NDS pmu always has to reprogram the period, so ignore + * PERF_EF_RELOAD, see the comment below. + */ + if (flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + + hwc->state = 0; + /* Set the period for the event. */ + nds32_pmu_event_set_period(event); + + nds32_pmu->enable(event); +} + +static int nds32_pmu_add(struct perf_event *event, int flags) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(event->pmu); + struct pmu_hw_events *hw_events = nds32_pmu->get_hw_events(); + struct hw_perf_event *hwc = &event->hw; + int idx; + int err = 0; + + perf_pmu_disable(event->pmu); + + /* If we don't have a space for the counter then finish early. */ + idx = nds32_pmu->get_event_idx(hw_events, event); + if (idx < 0) { + err = idx; + goto out; + } + + /* + * If there is an event in the counter we are going to use then make + * sure it is disabled. + */ + event->hw.idx = idx; + nds32_pmu->disable(event); + hw_events->events[idx] = event; + + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + if (flags & PERF_EF_START) + nds32_start(event, PERF_EF_RELOAD); + + /* Propagate our changes to the userspace mapping. */ + perf_event_update_userpage(event); + +out: + perf_pmu_enable(event->pmu); + return err; +} + +u64 nds32_pmu_event_update(struct perf_event *event) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + u64 delta, prev_raw_count, new_raw_count; + +again: + prev_raw_count = local64_read(&hwc->prev_count); + new_raw_count = nds32_pmu->read_counter(event); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) { + goto again; + } + /* + * Whether overflow or not, "unsigned substraction" + * will always get their delta + */ + delta = (new_raw_count - prev_raw_count) & nds32_pmu->max_period; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); + + return new_raw_count; +} + +static void nds32_stop(struct perf_event *event, int flags) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + /* + * NDS pmu always has to update the counter, so ignore + * PERF_EF_UPDATE, see comments in nds32_start(). + */ + if (!(hwc->state & PERF_HES_STOPPED)) { + nds32_pmu->disable(event); + nds32_pmu_event_update(event); + hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; + } +} + +static void nds32_pmu_del(struct perf_event *event, int flags) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(event->pmu); + struct pmu_hw_events *hw_events = nds32_pmu->get_hw_events(); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + nds32_stop(event, PERF_EF_UPDATE); + hw_events->events[idx] = NULL; + clear_bit(idx, hw_events->used_mask); + + perf_event_update_userpage(event); +} + +static void nds32_pmu_read(struct perf_event *event) +{ + nds32_pmu_event_update(event); +} + +/* Please refer to SPAv3 for more hardware specific details */ +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *nds32_arch_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group nds32_pmu_format_group = { + .name = "format", + .attrs = nds32_arch_formats_attr, +}; + +static ssize_t nds32_pmu_cpumask_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return 0; +} + +static DEVICE_ATTR(cpus, 0444, nds32_pmu_cpumask_show, NULL); + +static struct attribute *nds32_pmu_common_attrs[] = { + &dev_attr_cpus.attr, + NULL, +}; + +static struct attribute_group nds32_pmu_common_group = { + .attrs = nds32_pmu_common_attrs, +}; + +static const struct attribute_group *nds32_pmu_attr_groups[] = { + &nds32_pmu_format_group, + &nds32_pmu_common_group, + NULL, +}; + +static void nds32_init(struct nds32_pmu *nds32_pmu) +{ + atomic_set(&nds32_pmu->active_events, 0); + + nds32_pmu->pmu = (struct pmu) { + .pmu_enable = nds32_pmu_enable, + .pmu_disable = nds32_pmu_disable, + .attr_groups = nds32_pmu_attr_groups, + .event_init = nds32_pmu_event_init, + .add = nds32_pmu_add, + .del = nds32_pmu_del, + .start = nds32_start, + .stop = nds32_stop, + .read = nds32_pmu_read, + }; +} + +int nds32_pmu_register(struct nds32_pmu *nds32_pmu, int type) +{ + nds32_init(nds32_pmu); + pm_runtime_enable(&nds32_pmu->plat_device->dev); + pr_info("enabled with %s PMU driver, %d counters available\n", + nds32_pmu->name, nds32_pmu->num_events); + return perf_pmu_register(&nds32_pmu->pmu, nds32_pmu->name, type); +} + +static struct pmu_hw_events *cpu_pmu_get_cpu_events(void) +{ + return this_cpu_ptr(&cpu_hw_events); +} + +static int cpu_pmu_request_irq(struct nds32_pmu *cpu_pmu, irq_handler_t handler) +{ + int err, irq, irqs; + struct platform_device *pmu_device = cpu_pmu->plat_device; + + if (!pmu_device) + return -ENODEV; + + irqs = min(pmu_device->num_resources, num_possible_cpus()); + if (irqs < 1) { + pr_err("no irqs for PMUs defined\n"); + return -ENODEV; + } + + irq = platform_get_irq(pmu_device, 0); + err = request_irq(irq, handler, IRQF_NOBALANCING, "nds32-pfm", + cpu_pmu); + if (err) { + pr_err("unable to request IRQ%d for NDS PMU counters\n", + irq); + return err; + } + return 0; +} + +static void cpu_pmu_free_irq(struct nds32_pmu *cpu_pmu) +{ + int irq; + struct platform_device *pmu_device = cpu_pmu->plat_device; + + irq = platform_get_irq(pmu_device, 0); + if (irq >= 0) + free_irq(irq, cpu_pmu); +} + +static void cpu_pmu_init(struct nds32_pmu *cpu_pmu) +{ + int cpu; + struct pmu_hw_events *events = &per_cpu(cpu_hw_events, cpu); + + raw_spin_lock_init(&events->pmu_lock); + + cpu_pmu->get_hw_events = cpu_pmu_get_cpu_events; + cpu_pmu->request_irq = cpu_pmu_request_irq; + cpu_pmu->free_irq = cpu_pmu_free_irq; + + /* Ensure the PMU has sane values out of reset. */ + if (cpu_pmu->reset) + on_each_cpu(cpu_pmu->reset, cpu_pmu, 1); +} + +const static struct of_device_id cpu_pmu_of_device_ids[] = { + {.compatible = "andestech,nds32v3-pmu", + .data = device_pmu_init}, + {}, +}; + +static int cpu_pmu_device_probe(struct platform_device *pdev) +{ + const struct of_device_id *of_id; + int (*init_fn)(struct nds32_pmu *nds32_pmu); + struct device_node *node = pdev->dev.of_node; + struct nds32_pmu *pmu; + int ret = -ENODEV; + + if (cpu_pmu) { + pr_notice("[perf] attempt to register multiple PMU devices!\n"); + return -ENOSPC; + } + + pmu = kzalloc(sizeof(*pmu), GFP_KERNEL); + if (!pmu) + return -ENOMEM; + + of_id = of_match_node(cpu_pmu_of_device_ids, pdev->dev.of_node); + if (node && of_id) { + init_fn = of_id->data; + ret = init_fn(pmu); + } else { + ret = probe_current_pmu(pmu); + } + + if (ret) { + pr_notice("[perf] failed to probe PMU!\n"); + goto out_free; + } + + cpu_pmu = pmu; + cpu_pmu->plat_device = pdev; + cpu_pmu_init(cpu_pmu); + ret = nds32_pmu_register(cpu_pmu, PERF_TYPE_RAW); + + if (!ret) + return 0; + +out_free: + pr_notice("[perf] failed to register PMU devices!\n"); + kfree(pmu); + return ret; +} + +static struct platform_driver cpu_pmu_driver = { + .driver = { + .name = "nds32-pfm", + .of_match_table = cpu_pmu_of_device_ids, + }, + .probe = cpu_pmu_device_probe, + .id_table = cpu_pmu_plat_device_ids, +}; + +static int __init register_pmu_driver(void) +{ + int err = 0; + + err = platform_driver_register(&cpu_pmu_driver); + if (err) + pr_notice("[perf] PMU initialization failed\n"); + else + pr_notice("[perf] PMU initialization done\n"); + + return err; +} + +device_initcall(register_pmu_driver); + +/* + * References: arch/nds32/kernel/traps.c:__dump() + * You will need to know the NDS ABI first. + */ +static int unwind_frame_kernel(struct stackframe *frame) +{ + int graph = 0; +#ifdef CONFIG_FRAME_POINTER + /* 0x3 means misalignment */ + if (!kstack_end((void *)frame->fp) && + !((unsigned long)frame->fp & 0x3) && + ((unsigned long)frame->fp >= TASK_SIZE)) { + /* + * The array index is based on the ABI, the below graph + * illustrate the reasons. + * Function call procedure: "smw" and "lmw" will always + * update SP and FP for you automatically. + * + * Stack Relative Address + * | | 0 + * ---- + * |LP| <-- SP(before smw) <-- FP(after smw) -1 + * ---- + * |FP| -2 + * ---- + * | | <-- SP(after smw) -3 + */ + frame->lp = ((unsigned long *)frame->fp)[-1]; + frame->fp = ((unsigned long *)frame->fp)[FP_OFFSET]; + /* make sure CONFIG_FUNCTION_GRAPH_TRACER is turned on */ + if (__kernel_text_address(frame->lp)) + frame->lp = ftrace_graph_ret_addr + (NULL, &graph, frame->lp, NULL); + + return 0; + } else { + return -EPERM; + } +#else + /* + * You can refer to arch/nds32/kernel/traps.c:__dump() + * Treat "sp" as "fp", but the "sp" is one frame ahead of "fp". + * And, the "sp" is not always correct. + * + * Stack Relative Address + * | | 0 + * ---- + * |LP| <-- SP(before smw) -1 + * ---- + * | | <-- SP(after smw) -2 + * ---- + */ + if (!kstack_end((void *)frame->sp)) { + frame->lp = ((unsigned long *)frame->sp)[1]; + /* TODO: How to deal with the value in first + * "sp" is not correct? + */ + if (__kernel_text_address(frame->lp)) + frame->lp = ftrace_graph_ret_addr + (tsk, &graph, frame->lp, NULL); + + frame->sp = ((unsigned long *)frame->sp) + 1; + + return 0; + } else { + return -EPERM; + } +#endif +} + +static void notrace +walk_stackframe(struct stackframe *frame, + int (*fn_record)(struct stackframe *, void *), + void *data) +{ + while (1) { + int ret; + + if (fn_record(frame, data)) + break; + + ret = unwind_frame_kernel(frame); + if (ret < 0) + break; + } +} + +/* + * Gets called by walk_stackframe() for every stackframe. This will be called + * whist unwinding the stackframe and is like a subroutine return so we use + * the PC. + */ +static int callchain_trace(struct stackframe *fr, void *data) +{ + struct perf_callchain_entry_ctx *entry = data; + + perf_callchain_store(entry, fr->lp); + return 0; +} + +/* + * Get the return address for a single stackframe and return a pointer to the + * next frame tail. + */ +static unsigned long +user_backtrace(struct perf_callchain_entry_ctx *entry, unsigned long fp) +{ + struct frame_tail buftail; + unsigned long lp = 0; + unsigned long *user_frame_tail = + (unsigned long *)(fp - (unsigned long)sizeof(buftail)); + + /* Check accessibility of one struct frame_tail beyond */ + if (!access_ok(VERIFY_READ, user_frame_tail, sizeof(buftail))) + return 0; + if (__copy_from_user_inatomic + (&buftail, user_frame_tail, sizeof(buftail))) + return 0; + + /* + * Refer to unwind_frame_kernel() for more illurstration + */ + lp = buftail.stack_lp; /* ((unsigned long *)fp)[-1] */ + fp = buftail.stack_fp; /* ((unsigned long *)fp)[FP_OFFSET] */ + perf_callchain_store(entry, lp); + return fp; +} + +static unsigned long +user_backtrace_opt_size(struct perf_callchain_entry_ctx *entry, + unsigned long fp) +{ + struct frame_tail_opt_size buftail; + unsigned long lp = 0; + + unsigned long *user_frame_tail = + (unsigned long *)(fp - (unsigned long)sizeof(buftail)); + + /* Check accessibility of one struct frame_tail beyond */ + if (!access_ok(VERIFY_READ, user_frame_tail, sizeof(buftail))) + return 0; + if (__copy_from_user_inatomic + (&buftail, user_frame_tail, sizeof(buftail))) + return 0; + + /* + * Refer to unwind_frame_kernel() for more illurstration + */ + lp = buftail.stack_lp; /* ((unsigned long *)fp)[-1] */ + fp = buftail.stack_fp; /* ((unsigned long *)fp)[FP_OFFSET] */ + + perf_callchain_store(entry, lp); + return fp; +} + +/* + * This will be called when the target is in user mode + * This function will only be called when we use + * "PERF_SAMPLE_CALLCHAIN" in + * kernel/events/core.c:perf_prepare_sample() + * + * How to trigger perf_callchain_[user/kernel] : + * $ perf record -e cpu-clock --call-graph fp ./program + * $ perf report --call-graph + */ +unsigned long leaf_fp; +void +perf_callchain_user(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) +{ + unsigned long fp = 0; + unsigned long gp = 0; + unsigned long lp = 0; + unsigned long sp = 0; + unsigned long *user_frame_tail; + + leaf_fp = 0; + + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* We don't support guest os callchain now */ + return; + } + + perf_callchain_store(entry, regs->ipc); + fp = regs->fp; + gp = regs->gp; + lp = regs->lp; + sp = regs->sp; + if (entry->nr < PERF_MAX_STACK_DEPTH && + (unsigned long)fp && !((unsigned long)fp & 0x7) && fp > sp) { + user_frame_tail = + (unsigned long *)(fp - (unsigned long)sizeof(fp)); + + if (!access_ok(VERIFY_READ, user_frame_tail, sizeof(fp))) + return; + + if (__copy_from_user_inatomic + (&leaf_fp, user_frame_tail, sizeof(fp))) + return; + + if (leaf_fp == lp) { + /* + * Maybe this is non leaf function + * with optimize for size, + * or maybe this is the function + * with optimize for size + */ + struct frame_tail buftail; + + user_frame_tail = + (unsigned long *)(fp - + (unsigned long)sizeof(buftail)); + + if (!access_ok + (VERIFY_READ, user_frame_tail, sizeof(buftail))) + return; + + if (__copy_from_user_inatomic + (&buftail, user_frame_tail, sizeof(buftail))) + return; + + if (buftail.stack_fp == gp) { + /* non leaf function with optimize + * for size condition + */ + struct frame_tail_opt_size buftail_opt_size; + + user_frame_tail = + (unsigned long *)(fp - (unsigned long) + sizeof(buftail_opt_size)); + + if (!access_ok(VERIFY_READ, user_frame_tail, + sizeof(buftail_opt_size))) + return; + + if (__copy_from_user_inatomic + (&buftail_opt_size, user_frame_tail, + sizeof(buftail_opt_size))) + return; + + perf_callchain_store(entry, lp); + fp = buftail_opt_size.stack_fp; + + while ((entry->nr < PERF_MAX_STACK_DEPTH) && + (unsigned long)fp && + !((unsigned long)fp & 0x7) && + fp > sp) { + sp = fp; + fp = user_backtrace_opt_size(entry, fp); + } + + } else { + /* this is the function + * without optimize for size + */ + fp = buftail.stack_fp; + perf_callchain_store(entry, lp); + while ((entry->nr < PERF_MAX_STACK_DEPTH) && + (unsigned long)fp && + !((unsigned long)fp & 0x7) && + fp > sp) { + sp = fp; + fp = user_backtrace(entry, fp); + } + } + } else { + /* this is leaf function */ + fp = leaf_fp; + perf_callchain_store(entry, lp); + + /* previous function callcahin */ + while ((entry->nr < PERF_MAX_STACK_DEPTH) && + (unsigned long)fp && + !((unsigned long)fp & 0x7) && fp > sp) { + sp = fp; + fp = user_backtrace(entry, fp); + } + } + return; + } +} + +/* This will be called when the target is in kernel mode */ +void +perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) +{ + struct stackframe fr; + + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* We don't support guest os callchain now */ + return; + } + fr.fp = regs->fp; + fr.lp = regs->lp; + fr.sp = regs->sp; + walk_stackframe(&fr, callchain_trace, entry); +} + +unsigned long perf_instruction_pointer(struct pt_regs *regs) +{ + /* However, NDS32 does not support virtualization */ + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) + return perf_guest_cbs->get_guest_ip(); + + return instruction_pointer(regs); +} + +unsigned long perf_misc_flags(struct pt_regs *regs) +{ + int misc = 0; + + /* However, NDS32 does not support virtualization */ + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (perf_guest_cbs->is_user_mode()) + misc |= PERF_RECORD_MISC_GUEST_USER; + else + misc |= PERF_RECORD_MISC_GUEST_KERNEL; + } else { + if (user_mode(regs)) + misc |= PERF_RECORD_MISC_USER; + else + misc |= PERF_RECORD_MISC_KERNEL; + } + + return misc; +} diff --git a/arch/nds32/kernel/pm.c b/arch/nds32/kernel/pm.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2008-2017 Andes Technology Corporation + +#include <linux/init.h> +#include <linux/suspend.h> +#include <linux/device.h> +#include <linux/printk.h> +#include <asm/suspend.h> +#include <nds32_intrinsic.h> + +unsigned int resume_addr; +unsigned int *phy_addr_sp_tmp; + +static void nds32_suspend2ram(void) +{ + pgd_t *pgdv; + pud_t *pudv; + pmd_t *pmdv; + pte_t *ptev; + + pgdv = (pgd_t *)__va((__nds32__mfsr(NDS32_SR_L1_PPTB) & + L1_PPTB_mskBASE)) + pgd_index((unsigned int)cpu_resume); + + pudv = pud_offset(pgdv, (unsigned int)cpu_resume); + pmdv = pmd_offset(pudv, (unsigned int)cpu_resume); + ptev = pte_offset_map(pmdv, (unsigned int)cpu_resume); + + resume_addr = ((*ptev) & TLB_DATA_mskPPN) + | ((unsigned int)cpu_resume & 0x00000fff); + + suspend2ram(); +} + +static void nds32_suspend_cpu(void) +{ + while (!(__nds32__mfsr(NDS32_SR_INT_PEND) & wake_mask)) + __asm__ volatile ("standby no_wake_grant\n\t"); +} + +static int nds32_pm_valid(suspend_state_t state) +{ + switch (state) { + case PM_SUSPEND_ON: + case PM_SUSPEND_STANDBY: + case PM_SUSPEND_MEM: + return 1; + default: + return 0; + } +} + +static int nds32_pm_enter(suspend_state_t state) +{ + pr_debug("%s:state:%d\n", __func__, state); + switch (state) { + case PM_SUSPEND_STANDBY: + nds32_suspend_cpu(); + return 0; + case PM_SUSPEND_MEM: + nds32_suspend2ram(); + return 0; + default: + return -EINVAL; + } +} + +static const struct platform_suspend_ops nds32_pm_ops = { + .valid = nds32_pm_valid, + .enter = nds32_pm_enter, +}; + +static int __init nds32_pm_init(void) +{ + pr_debug("Enter %s\n", __func__); + suspend_set_ops(&nds32_pm_ops); + return 0; +} +late_initcall(nds32_pm_init); diff --git a/arch/nds32/kernel/process.c b/arch/nds32/kernel/process.c @@ -9,15 +9,16 @@ #include <linux/uaccess.h> #include <asm/elf.h> #include <asm/proc-fns.h> +#include <asm/fpu.h> #include <linux/ptrace.h> #include <linux/reboot.h> -extern void setup_mm_for_reboot(char mode); -#ifdef CONFIG_PROC_FS -struct proc_dir_entry *proc_dir_cpu; -EXPORT_SYMBOL(proc_dir_cpu); +#if IS_ENABLED(CONFIG_LAZY_FPU) +struct task_struct *last_task_used_math; #endif +extern void setup_mm_for_reboot(char mode); + extern inline void arch_reset(char mode) { if (mode == 's') { @@ -125,15 +126,31 @@ void show_regs(struct pt_regs *regs) EXPORT_SYMBOL(show_regs); +void exit_thread(struct task_struct *tsk) +{ +#if defined(CONFIG_FPU) && defined(CONFIG_LAZY_FPU) + if (last_task_used_math == tsk) + last_task_used_math = NULL; +#endif +} + void flush_thread(void) { +#if defined(CONFIG_FPU) + clear_fpu(task_pt_regs(current)); + clear_used_math(); +# ifdef CONFIG_LAZY_FPU + if (last_task_used_math == current) + last_task_used_math = NULL; +# endif +#endif } DEFINE_PER_CPU(struct task_struct *, __entry_task); asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); int copy_thread(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p) + unsigned long stk_sz, struct task_struct *p) { struct pt_regs *childregs = task_pt_regs(p); @@ -159,6 +176,22 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, p->thread.cpu_context.pc = (unsigned long)ret_from_fork; p->thread.cpu_context.sp = (unsigned long)childregs; +#if IS_ENABLED(CONFIG_FPU) + if (used_math()) { +# if !IS_ENABLED(CONFIG_LAZY_FPU) + unlazy_fpu(current); +# else + preempt_disable(); + if (last_task_used_math == current) + save_fpu(current); + preempt_enable(); +# endif + p->thread.fpu = current->thread.fpu; + clear_fpu(task_pt_regs(p)); + set_stopped_child_used_math(p); + } +#endif + #ifdef CONFIG_HWZOL childregs->lb = 0; childregs->le = 0; @@ -168,12 +201,33 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, return 0; } +#if IS_ENABLED(CONFIG_FPU) +struct task_struct *_switch_fpu(struct task_struct *prev, struct task_struct *next) +{ +#if !IS_ENABLED(CONFIG_LAZY_FPU) + unlazy_fpu(prev); +#endif + if (!(next->flags & PF_KTHREAD)) + clear_fpu(task_pt_regs(next)); + return prev; +} +#endif + /* * fill in the fpe structure for a core dump... */ int dump_fpu(struct pt_regs *regs, elf_fpregset_t * fpu) { int fpvalid = 0; +#if IS_ENABLED(CONFIG_FPU) + struct task_struct *tsk = current; + + fpvalid = tsk_used_math(tsk); + if (fpvalid) { + lose_fpu(); + memcpy(fpu, &tsk->thread.fpu, sizeof(*fpu)); + } +#endif return fpvalid; } diff --git a/arch/nds32/kernel/setup.c b/arch/nds32/kernel/setup.c @@ -15,6 +15,7 @@ #include <asm/proc-fns.h> #include <asm/cache_info.h> #include <asm/elf.h> +#include <asm/fpu.h> #include <nds32_intrinsic.h> #define HWCAP_MFUSR_PC 0x000001 @@ -38,8 +39,10 @@ #define HWCAP_FPU_DP 0x040000 #define HWCAP_V2 0x080000 #define HWCAP_DX_REGS 0x100000 +#define HWCAP_HWPRE 0x200000 unsigned long cpu_id, cpu_rev, cpu_cfgid; +bool has_fpu = false; char cpu_series; char *endianness = NULL; @@ -70,8 +73,10 @@ static const char *hwcap_str[] = { "div", "mac", "l2c", - "dx_regs", + "fpu_dp", "v2", + "dx_regs", + "hw_pre", NULL, }; @@ -136,6 +141,11 @@ static void __init dump_cpu_info(int cpu) (aliasing_num - 1) << PAGE_SHIFT; } #endif +#ifdef CONFIG_FPU + /* Disable fpu and enable when it is used. */ + if (has_fpu) + disable_fpu(); +#endif } static void __init setup_cpuinfo(void) @@ -180,9 +190,10 @@ static void __init setup_cpuinfo(void) if (cpu_cfgid & 0x0004) elf_hwcap |= HWCAP_EXT2; - if (cpu_cfgid & 0x0008) + if (cpu_cfgid & 0x0008) { elf_hwcap |= HWCAP_FPU; - + has_fpu = true; + } if (cpu_cfgid & 0x0010) elf_hwcap |= HWCAP_STRING; @@ -212,6 +223,11 @@ static void __init setup_cpuinfo(void) if (__nds32__mfsr(NDS32_SR_MSC_CFG) & MSC_CFG_mskL2C) elf_hwcap |= HWCAP_L2C; +#ifdef CONFIG_HW_PRE + if (__nds32__mfsr(NDS32_SR_MISC_CTL) & MISC_CTL_makHWPRE_EN) + elf_hwcap |= HWCAP_HWPRE; +#endif + tmp = __nds32__mfsr(NDS32_SR_CACHE_CTL); if (!IS_ENABLED(CONFIG_CPU_DCACHE_DISABLE)) tmp |= CACHE_CTL_mskDC_EN; diff --git a/arch/nds32/kernel/signal.c b/arch/nds32/kernel/signal.c @@ -12,6 +12,7 @@ #include <asm/cacheflush.h> #include <asm/ucontext.h> #include <asm/unistd.h> +#include <asm/fpu.h> #include <asm/ptrace.h> #include <asm/vdso.h> @@ -20,6 +21,60 @@ struct rt_sigframe { struct siginfo info; struct ucontext uc; }; +#if IS_ENABLED(CONFIG_FPU) +static inline int restore_sigcontext_fpu(struct pt_regs *regs, + struct sigcontext __user *sc) +{ + struct task_struct *tsk = current; + unsigned long used_math_flag; + int ret = 0; + + clear_used_math(); + __get_user_error(used_math_flag, &sc->used_math_flag, ret); + + if (!used_math_flag) + return 0; + set_used_math(); + +#if IS_ENABLED(CONFIG_LAZY_FPU) + preempt_disable(); + if (current == last_task_used_math) { + last_task_used_math = NULL; + disable_ptreg_fpu(regs); + } + preempt_enable(); +#else + clear_fpu(regs); +#endif + + return __copy_from_user(&tsk->thread.fpu, &sc->fpu, + sizeof(struct fpu_struct)); +} + +static inline int setup_sigcontext_fpu(struct pt_regs *regs, + struct sigcontext __user *sc) +{ + struct task_struct *tsk = current; + int ret = 0; + + __put_user_error(used_math(), &sc->used_math_flag, ret); + + if (!used_math()) + return ret; + + preempt_disable(); +#if IS_ENABLED(CONFIG_LAZY_FPU) + if (last_task_used_math == tsk) + save_fpu(last_task_used_math); +#else + unlazy_fpu(tsk); +#endif + ret = __copy_to_user(&sc->fpu, &tsk->thread.fpu, + sizeof(struct fpu_struct)); + preempt_enable(); + return ret; +} +#endif static int restore_sigframe(struct pt_regs *regs, struct rt_sigframe __user * sf) @@ -69,7 +124,9 @@ static int restore_sigframe(struct pt_regs *regs, __get_user_error(regs->le, &sf->uc.uc_mcontext.zol.nds32_le, err); __get_user_error(regs->lb, &sf->uc.uc_mcontext.zol.nds32_lb, err); #endif - +#if IS_ENABLED(CONFIG_FPU) + err |= restore_sigcontext_fpu(regs, &sf->uc.uc_mcontext); +#endif /* * Avoid sys_rt_sigreturn() restarting. */ @@ -153,6 +210,9 @@ setup_sigframe(struct rt_sigframe __user * sf, struct pt_regs *regs, __put_user_error(regs->le, &sf->uc.uc_mcontext.zol.nds32_le, err); __put_user_error(regs->lb, &sf->uc.uc_mcontext.zol.nds32_lb, err); #endif +#if IS_ENABLED(CONFIG_FPU) + err |= setup_sigcontext_fpu(regs, &sf->uc.uc_mcontext); +#endif __put_user_error(current->thread.trap_no, &sf->uc.uc_mcontext.trap_no, err); diff --git a/arch/nds32/kernel/sleep.S b/arch/nds32/kernel/sleep.S @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2017 Andes Technology Corporation */ + +#include <asm/memory.h> + +.data +.global sp_tmp +sp_tmp: +.long + +.text +.globl suspend2ram +.globl cpu_resume + +suspend2ram: + pushm $r0, $r31 +#if defined(CONFIG_HWZOL) + mfusr $r0, $lc + mfusr $r1, $le + mfusr $r2, $lb +#endif + mfsr $r3, $mr0 + mfsr $r4, $mr1 + mfsr $r5, $mr4 + mfsr $r6, $mr6 + mfsr $r7, $mr7 + mfsr $r8, $mr8 + mfsr $r9, $ir0 + mfsr $r10, $ir1 + mfsr $r11, $ir2 + mfsr $r12, $ir3 + mfsr $r13, $ir9 + mfsr $r14, $ir10 + mfsr $r15, $ir12 + mfsr $r16, $ir13 + mfsr $r17, $ir14 + mfsr $r18, $ir15 + pushm $r0, $r19 +#if defined(CONFIG_FPU) + jal store_fpu_for_suspend +#endif + tlbop FlushAll + isb + + // transfer $sp from va to pa + sethi $r0, hi20(PAGE_OFFSET) + ori $r0, $r0, lo12(PAGE_OFFSET) + movi $r2, PHYS_OFFSET + sub $r1, $sp, $r0 + add $r2, $r1, $r2 + + // store pa($sp) to sp_tmp + sethi $r1, hi20(sp_tmp) + swi $r2, [$r1 + lo12(sp_tmp)] + + pushm $r16, $r25 + pushm $r29, $r30 +#ifdef CONFIG_CACHE_L2 + jal dcache_wb_all_level +#else + jal cpu_dcache_wb_all +#endif + popm $r29, $r30 + popm $r16, $r25 + + // get wake_mask and loop in standby + la $r1, wake_mask + lwi $r1, [$r1] +self_loop: + standby wake_grant + mfsr $r2, $ir15 + and $r2, $r1, $r2 + beqz $r2, self_loop + + // set ipc to resume address + la $r1, resume_addr + lwi $r1, [$r1] + mtsr $r1, $ipc + isb + + // reset psw, turn off the address translation + li $r2, 0x7000a + mtsr $r2, $ipsw + isb + + iret +cpu_resume: + // translate the address of sp_tmp variable to pa + la $r1, sp_tmp + sethi $r0, hi20(PAGE_OFFSET) + ori $r0, $r0, lo12(PAGE_OFFSET) + movi $r2, PHYS_OFFSET + sub $r1, $r1, $r0 + add $r1, $r1, $r2 + + // access the sp_tmp to get stack pointer + lwi $sp, [$r1] + + popm $r0, $r19 +#if defined(CONFIG_HWZOL) + mtusr $r0, $lb + mtusr $r1, $lc + mtusr $r2, $le +#endif + mtsr $r3, $mr0 + mtsr $r4, $mr1 + mtsr $r5, $mr4 + mtsr $r6, $mr6 + mtsr $r7, $mr7 + mtsr $r8, $mr8 + // set original psw to ipsw + mtsr $r9, $ir1 + + mtsr $r11, $ir2 + mtsr $r12, $ir3 + + // set ipc to RR + la $r13, RR + mtsr $r13, $ir9 + + mtsr $r14, $ir10 + mtsr $r15, $ir12 + mtsr $r16, $ir13 + mtsr $r17, $ir14 + mtsr $r18, $ir15 + popm $r0, $r31 + + isb + iret +RR: + ret diff --git a/arch/nds32/kernel/sys_nds32.c b/arch/nds32/kernel/sys_nds32.c @@ -6,6 +6,8 @@ #include <asm/cachectl.h> #include <asm/proc-fns.h> +#include <asm/udftrap.h> +#include <asm/fpu.h> SYSCALL_DEFINE6(mmap2, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, @@ -48,3 +50,33 @@ SYSCALL_DEFINE3(cacheflush, unsigned int, start, unsigned int, end, int, cache) return 0; } + +SYSCALL_DEFINE1(udftrap, int, option) +{ +#if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) + int old_udftrap; + + if (!used_math()) { + load_fpu(&init_fpuregs); + current->thread.fpu.UDF_trap = init_fpuregs.UDF_trap; + set_used_math(); + } + + old_udftrap = current->thread.fpu.UDF_trap; + switch (option) { + case DISABLE_UDFTRAP: + current->thread.fpu.UDF_trap = 0; + break; + case ENABLE_UDFTRAP: + current->thread.fpu.UDF_trap = FPCSR_mskUDFE; + break; + case GET_UDFTRAP: + break; + default: + return -EINVAL; + } + return old_udftrap; +#else + return -ENOTSUPP; +#endif +} diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c @@ -12,6 +12,7 @@ #include <asm/proc-fns.h> #include <asm/unistd.h> +#include <asm/fpu.h> #include <linux/ptrace.h> #include <nds32_intrinsic.h> @@ -357,6 +358,21 @@ void do_dispatch_general(unsigned long entry, unsigned long addr, } else if (type == ETYPE_RESERVED_INSTRUCTION) { /* Reserved instruction */ do_revinsn(regs); + } else if (type == ETYPE_COPROCESSOR) { + /* Coprocessor */ +#if IS_ENABLED(CONFIG_FPU) + unsigned int fucop_exist = __nds32__mfsr(NDS32_SR_FUCOP_EXIST); + unsigned int cpid = ((itype & ITYPE_mskCPID) >> ITYPE_offCPID); + + if ((cpid == FPU_CPID) && + (fucop_exist & FUCOP_EXIST_mskCP0ISFPU)) { + unsigned int subtype = (itype & ITYPE_mskSTYPE); + + if (true == do_fpu_exception(subtype, regs)) + return; + } +#endif + unhandled_exceptions(entry, addr, type, regs); } else if (type == ETYPE_TRAP && swid == SWID_RAISE_INTERRUPT_LEVEL) { /* trap, used on v3 EDM target debugging workaround */ /* diff --git a/arch/nds32/math-emu/Makefile b/arch/nds32/math-emu/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux/nds32 kernel FPU emulation. +# + +obj-y := fpuemu.o \ + fdivd.o fmuld.o fsubd.o faddd.o fs2d.o fsqrtd.o fcmpd.o fnegs.o \ + fdivs.o fmuls.o fsubs.o fadds.o fd2s.o fsqrts.o fcmps.o fnegd.o diff --git a/arch/nds32/math-emu/faddd.c b/arch/nds32/math-emu/faddd.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include <linux/uaccess.h> + +#include <asm/sfp-machine.h> +#include <math-emu/soft-fp.h> +#include <math-emu/double.h> +void faddd(void *ft, void *fa, void *fb) +{ + FP_DECL_D(A); + FP_DECL_D(B); + FP_DECL_D(R); + FP_DECL_EX; + + FP_UNPACK_DP(A, fa); + FP_UNPACK_DP(B, fb); + + FP_ADD_D(R, A, B); + + FP_PACK_DP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; + +} diff --git a/arch/nds32/math-emu/fadds.c b/arch/nds32/math-emu/fadds.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include <linux/uaccess.h> + +#include <asm/sfp-machine.h> +#include <math-emu/soft-fp.h> +#include <math-emu/single.h> +void fadds(void *ft, void *fa, void *fb) +{ + FP_DECL_S(A); + FP_DECL_S(B); + FP_DECL_S(R); + FP_DECL_EX; + + FP_UNPACK_SP(A, fa); + FP_UNPACK_SP(B, fb); + + FP_ADD_S(R, A, B); + + FP_PACK_SP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; + +} diff --git a/arch/nds32/math-emu/fcmpd.c b/arch/nds32/math-emu/fcmpd.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include <asm/sfp-machine.h> +#include <math-emu/soft-fp.h> +#include <math-emu/double.h> +int fcmpd(void *ft, void *fa, void *fb, int cmpop) +{ + FP_DECL_D(A); + FP_DECL_D(B); + FP_DECL_EX; + long cmp; + + FP_UNPACK_DP(A, fa); + FP_UNPACK_DP(B, fb); + + FP_CMP_D(cmp, A, B, SF_CUN); + cmp += 2; + if (cmp == SF_CGT) + *(long *)ft = 0; + else + *(long *)ft = (cmp & cmpop) ? 1 : 0; + + return 0; +} diff --git a/arch/nds32/math-emu/fcmps.c b/arch/nds32/math-emu/fcmps.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include <asm/sfp-machine.h> +#include <math-emu/soft-fp.h> +#include <math-emu/single.h> +int fcmps(void *ft, void *fa, void *fb, int cmpop) +{ + FP_DECL_S(A); + FP_DECL_S(B); + FP_DECL_EX; + long cmp; + + FP_UNPACK_SP(A, fa); + FP_UNPACK_SP(B, fb); + + FP_CMP_S(cmp, A, B, SF_CUN); + cmp += 2; + if (cmp == SF_CGT) + *(int *)ft = 0x0; + else + *(int *)ft = (cmp & cmpop) ? 0x1 : 0x0; + + return 0; +} diff --git a/arch/nds32/math-emu/fd2s.c b/arch/nds32/math-emu/fd2s.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include <linux/uaccess.h> + +#include <asm/sfp-machine.h> +#include <math-emu/double.h> +#include <math-emu/single.h> +#include <math-emu/soft-fp.h> +void fd2s(void *ft, void *fa) +{ + FP_DECL_D(A); + FP_DECL_S(R); + FP_DECL_EX; + + FP_UNPACK_DP(A, fa); + + FP_CONV(S, D, 1, 2, R, A); + + FP_PACK_SP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fdivd.c b/arch/nds32/math-emu/fdivd.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation + +#include <linux/uaccess.h> +#include <asm/sfp-machine.h> +#include <math-emu/soft-fp.h> +#include <math-emu/double.h> + +void fdivd(void *ft, void *fa, void *fb) +{ + FP_DECL_D(A); + FP_DECL_D(B); + FP_DECL_D(R); + FP_DECL_EX; + + FP_UNPACK_DP(A, fa); + FP_UNPACK_DP(B, fb); + + if (B_c == FP_CLS_ZERO && A_c != FP_CLS_ZERO) + FP_SET_EXCEPTION(FP_EX_DIVZERO); + + FP_DIV_D(R, A, B); + + FP_PACK_DP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fdivs.c b/arch/nds32/math-emu/fdivs.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include <linux/uaccess.h> + +#include <asm/sfp-machine.h> +#include <math-emu/soft-fp.h> +#include <math-emu/single.h> +void fdivs(void *ft, void *fa, void *fb) +{ + FP_DECL_S(A); + FP_DECL_S(B); + FP_DECL_S(R); + FP_DECL_EX; + + FP_UNPACK_SP(A, fa); + FP_UNPACK_SP(B, fb); + + if (B_c == FP_CLS_ZERO && A_c != FP_CLS_ZERO) + FP_SET_EXCEPTION(FP_EX_DIVZERO); + + FP_DIV_S(R, A, B); + + FP_PACK_SP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fmuld.c b/arch/nds32/math-emu/fmuld.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include <linux/uaccess.h> + +#include <asm/sfp-machine.h> +#include <math-emu/soft-fp.h> +#include <math-emu/double.h> +void fmuld(void *ft, void *fa, void *fb) +{ + FP_DECL_D(A); + FP_DECL_D(B); + FP_DECL_D(R); + FP_DECL_EX; + + FP_UNPACK_DP(A, fa); + FP_UNPACK_DP(B, fb); + + FP_MUL_D(R, A, B); + + FP_PACK_DP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fmuls.c b/arch/nds32/math-emu/fmuls.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include <linux/uaccess.h> + +#include <asm/sfp-machine.h> +#include <math-emu/soft-fp.h> +#include <math-emu/single.h> +void fmuls(void *ft, void *fa, void *fb) +{ + FP_DECL_S(A); + FP_DECL_S(B); + FP_DECL_S(R); + FP_DECL_EX; + + FP_UNPACK_SP(A, fa); + FP_UNPACK_SP(B, fb); + + FP_MUL_S(R, A, B); + + FP_PACK_SP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fnegd.c b/arch/nds32/math-emu/fnegd.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include <linux/uaccess.h> + +#include <asm/sfp-machine.h> +#include <math-emu/soft-fp.h> +#include <math-emu/double.h> +void fnegd(void *ft, void *fa) +{ + FP_DECL_D(A); + FP_DECL_D(R); + FP_DECL_EX; + + FP_UNPACK_DP(A, fa); + + FP_NEG_D(R, A); + + FP_PACK_DP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fnegs.c b/arch/nds32/math-emu/fnegs.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include <linux/uaccess.h> + +#include <asm/sfp-machine.h> +#include <math-emu/soft-fp.h> +#include <math-emu/single.h> +void fnegs(void *ft, void *fa) +{ + FP_DECL_S(A); + FP_DECL_S(R); + FP_DECL_EX; + + FP_UNPACK_SP(A, fa); + + FP_NEG_S(R, A); + + FP_PACK_SP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fpuemu.c b/arch/nds32/math-emu/fpuemu.c @@ -0,0 +1,357 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation + +#include <asm/bitfield.h> +#include <asm/uaccess.h> +#include <asm/sfp-machine.h> +#include <asm/fpuemu.h> +#include <asm/nds32_fpu_inst.h> + +#define DPFROMREG(dp, x) (dp = (void *)((unsigned long *)fpu_reg + 2*x)) +#ifdef __NDS32_EL__ +#define SPFROMREG(sp, x)\ + ((sp) = (void *)((unsigned long *)fpu_reg + (x^1))) +#else +#define SPFROMREG(sp, x) ((sp) = (void *)((unsigned long *)fpu_reg + x)) +#endif + +#define DEF3OP(name, p, f1, f2) \ +void fpemu_##name##p(void *ft, void *fa, void *fb) \ +{ \ + f1(fa, fa, fb); \ + f2(ft, ft, fa); \ +} + +#define DEF3OPNEG(name, p, f1, f2, f3) \ +void fpemu_##name##p(void *ft, void *fa, void *fb) \ +{ \ + f1(fa, fa, fb); \ + f2(ft, ft, fa); \ + f3(ft, ft); \ +} +DEF3OP(fmadd, s, fmuls, fadds); +DEF3OP(fmsub, s, fmuls, fsubs); +DEF3OP(fmadd, d, fmuld, faddd); +DEF3OP(fmsub, d, fmuld, fsubd); +DEF3OPNEG(fnmadd, s, fmuls, fadds, fnegs); +DEF3OPNEG(fnmsub, s, fmuls, fsubs, fnegs); +DEF3OPNEG(fnmadd, d, fmuld, faddd, fnegd); +DEF3OPNEG(fnmsub, d, fmuld, fsubd, fnegd); + +static const unsigned char cmptab[8] = { + SF_CEQ, + SF_CEQ, + SF_CLT, + SF_CLT, + SF_CLT | SF_CEQ, + SF_CLT | SF_CEQ, + SF_CUN, + SF_CUN +}; + +enum ARGTYPE { + S1S = 1, + S2S, + S1D, + CS, + D1D, + D2D, + D1S, + CD +}; +union func_t { + void (*t)(void *ft, void *fa, void *fb); + void (*b)(void *ft, void *fa); +}; +/* + * Emulate a single FPU arithmetic instruction. + */ +static int fpu_emu(struct fpu_struct *fpu_reg, unsigned long insn) +{ + int rfmt; /* resulting format */ + union func_t func; + int ftype = 0; + + switch (rfmt = NDS32Insn_OPCODE_COP0(insn)) { + case fs1_op:{ + switch (NDS32Insn_OPCODE_BIT69(insn)) { + case fadds_op: + func.t = fadds; + ftype = S2S; + break; + case fsubs_op: + func.t = fsubs; + ftype = S2S; + break; + case fmadds_op: + func.t = fpemu_fmadds; + ftype = S2S; + break; + case fmsubs_op: + func.t = fpemu_fmsubs; + ftype = S2S; + break; + case fnmadds_op: + func.t = fpemu_fnmadds; + ftype = S2S; + break; + case fnmsubs_op: + func.t = fpemu_fnmsubs; + ftype = S2S; + break; + case fmuls_op: + func.t = fmuls; + ftype = S2S; + break; + case fdivs_op: + func.t = fdivs; + ftype = S2S; + break; + case fs1_f2op_op: + switch (NDS32Insn_OPCODE_BIT1014(insn)) { + case fs2d_op: + func.b = fs2d; + ftype = S1D; + break; + case fsqrts_op: + func.b = fsqrts; + ftype = S1S; + break; + default: + return SIGILL; + } + break; + default: + return SIGILL; + } + break; + } + case fs2_op: + switch (NDS32Insn_OPCODE_BIT69(insn)) { + case fcmpeqs_op: + case fcmpeqs_e_op: + case fcmplts_op: + case fcmplts_e_op: + case fcmples_op: + case fcmples_e_op: + case fcmpuns_op: + case fcmpuns_e_op: + ftype = CS; + break; + default: + return SIGILL; + } + break; + case fd1_op:{ + switch (NDS32Insn_OPCODE_BIT69(insn)) { + case faddd_op: + func.t = faddd; + ftype = D2D; + break; + case fsubd_op: + func.t = fsubd; + ftype = D2D; + break; + case fmaddd_op: + func.t = fpemu_fmaddd; + ftype = D2D; + break; + case fmsubd_op: + func.t = fpemu_fmsubd; + ftype = D2D; + break; + case fnmaddd_op: + func.t = fpemu_fnmaddd; + ftype = D2D; + break; + case fnmsubd_op: + func.t = fpemu_fnmsubd; + ftype = D2D; + break; + case fmuld_op: + func.t = fmuld; + ftype = D2D; + break; + case fdivd_op: + func.t = fdivd; + ftype = D2D; + break; + case fd1_f2op_op: + switch (NDS32Insn_OPCODE_BIT1014(insn)) { + case fd2s_op: + func.b = fd2s; + ftype = D1S; + break; + case fsqrtd_op: + func.b = fsqrtd; + ftype = D1D; + break; + default: + return SIGILL; + } + break; + default: + return SIGILL; + + } + break; + } + + case fd2_op: + switch (NDS32Insn_OPCODE_BIT69(insn)) { + case fcmpeqd_op: + case fcmpeqd_e_op: + case fcmpltd_op: + case fcmpltd_e_op: + case fcmpled_op: + case fcmpled_e_op: + case fcmpund_op: + case fcmpund_e_op: + ftype = CD; + break; + default: + return SIGILL; + } + break; + + default: + return SIGILL; + } + + switch (ftype) { + case S1S:{ + void *ft, *fa; + + SPFROMREG(ft, NDS32Insn_OPCODE_Rt(insn)); + SPFROMREG(fa, NDS32Insn_OPCODE_Ra(insn)); + func.b(ft, fa); + break; + } + case S2S:{ + void *ft, *fa, *fb; + + SPFROMREG(ft, NDS32Insn_OPCODE_Rt(insn)); + SPFROMREG(fa, NDS32Insn_OPCODE_Ra(insn)); + SPFROMREG(fb, NDS32Insn_OPCODE_Rb(insn)); + func.t(ft, fa, fb); + break; + } + case S1D:{ + void *ft, *fa; + + DPFROMREG(ft, NDS32Insn_OPCODE_Rt(insn)); + SPFROMREG(fa, NDS32Insn_OPCODE_Ra(insn)); + func.b(ft, fa); + break; + } + case CS:{ + unsigned int cmpop = NDS32Insn_OPCODE_BIT69(insn); + void *ft, *fa, *fb; + + SPFROMREG(ft, NDS32Insn_OPCODE_Rt(insn)); + SPFROMREG(fa, NDS32Insn_OPCODE_Ra(insn)); + SPFROMREG(fb, NDS32Insn_OPCODE_Rb(insn)); + if (cmpop < 0x8) { + cmpop = cmptab[cmpop]; + fcmps(ft, fa, fb, cmpop); + } else + return SIGILL; + break; + } + case D1D:{ + void *ft, *fa; + + DPFROMREG(ft, NDS32Insn_OPCODE_Rt(insn)); + DPFROMREG(fa, NDS32Insn_OPCODE_Ra(insn)); + func.b(ft, fa); + break; + } + case D2D:{ + void *ft, *fa, *fb; + + DPFROMREG(ft, NDS32Insn_OPCODE_Rt(insn)); + DPFROMREG(fa, NDS32Insn_OPCODE_Ra(insn)); + DPFROMREG(fb, NDS32Insn_OPCODE_Rb(insn)); + func.t(ft, fa, fb); + break; + } + case D1S:{ + void *ft, *fa; + + SPFROMREG(ft, NDS32Insn_OPCODE_Rt(insn)); + DPFROMREG(fa, NDS32Insn_OPCODE_Ra(insn)); + func.b(ft, fa); + break; + } + case CD:{ + unsigned int cmpop = NDS32Insn_OPCODE_BIT69(insn); + void *ft, *fa, *fb; + + SPFROMREG(ft, NDS32Insn_OPCODE_Rt(insn)); + DPFROMREG(fa, NDS32Insn_OPCODE_Ra(insn)); + DPFROMREG(fb, NDS32Insn_OPCODE_Rb(insn)); + if (cmpop < 0x8) { + cmpop = cmptab[cmpop]; + fcmpd(ft, fa, fb, cmpop); + } else + return SIGILL; + break; + } + default: + return SIGILL; + } + + /* + * If an exception is required, generate a tidy SIGFPE exception. + */ +#if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) + if (((fpu_reg->fpcsr << 5) & fpu_reg->fpcsr & FPCSR_mskALLE_NO_UDFE) || + ((fpu_reg->fpcsr & FPCSR_mskUDF) && (fpu_reg->UDF_trap))) +#else + if ((fpu_reg->fpcsr << 5) & fpu_reg->fpcsr & FPCSR_mskALLE) +#endif + return SIGFPE; + return 0; +} + + +int do_fpuemu(struct pt_regs *regs, struct fpu_struct *fpu) +{ + unsigned long insn = 0, addr = regs->ipc; + unsigned long emulpc, contpc; + unsigned char *pc = (void *)&insn; + char c; + int i = 0, ret; + + for (i = 0; i < 4; i++) { + if (__get_user(c, (unsigned char *)addr++)) + return SIGBUS; + *pc++ = c; + } + + insn = be32_to_cpu(insn); + + emulpc = regs->ipc; + contpc = regs->ipc + 4; + + if (NDS32Insn_OPCODE(insn) != cop0_op) + return SIGILL; + switch (NDS32Insn_OPCODE_COP0(insn)) { + case fs1_op: + case fs2_op: + case fd1_op: + case fd2_op: + { + /* a real fpu computation instruction */ + ret = fpu_emu(fpu, insn); + if (!ret) + regs->ipc = contpc; + } + break; + + default: + return SIGILL; + } + + return ret; +} diff --git a/arch/nds32/math-emu/fs2d.c b/arch/nds32/math-emu/fs2d.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation + +#include <linux/uaccess.h> +#include <asm/sfp-machine.h> +#include <math-emu/double.h> +#include <math-emu/single.h> +#include <math-emu/soft-fp.h> + +void fs2d(void *ft, void *fa) +{ + FP_DECL_S(A); + FP_DECL_D(R); + FP_DECL_EX; + + FP_UNPACK_SP(A, fa); + + FP_CONV(D, S, 2, 1, R, A); + + FP_PACK_DP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fsqrtd.c b/arch/nds32/math-emu/fsqrtd.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation + +#include <linux/uaccess.h> +#include <asm/sfp-machine.h> +#include <math-emu/soft-fp.h> +#include <math-emu/double.h> +void fsqrtd(void *ft, void *fa) +{ + FP_DECL_D(A); + FP_DECL_D(R); + FP_DECL_EX; + + FP_UNPACK_DP(A, fa); + + FP_SQRT_D(R, A); + + FP_PACK_DP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fsqrts.c b/arch/nds32/math-emu/fsqrts.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation + +#include <linux/uaccess.h> +#include <asm/sfp-machine.h> +#include <math-emu/soft-fp.h> +#include <math-emu/single.h> +void fsqrts(void *ft, void *fa) +{ + FP_DECL_S(A); + FP_DECL_S(R); + FP_DECL_EX; + + FP_UNPACK_SP(A, fa); + + FP_SQRT_S(R, A); + + FP_PACK_SP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fsubd.c b/arch/nds32/math-emu/fsubd.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include <linux/uaccess.h> + +#include <asm/sfp-machine.h> +#include <math-emu/soft-fp.h> +#include <math-emu/double.h> +void fsubd(void *ft, void *fa, void *fb) +{ + + FP_DECL_D(A); + FP_DECL_D(B); + FP_DECL_D(R); + FP_DECL_EX; + + FP_UNPACK_DP(A, fa); + FP_UNPACK_DP(B, fb); + + if (B_c != FP_CLS_NAN) + B_s ^= 1; + + FP_ADD_D(R, A, B); + + FP_PACK_DP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fsubs.c b/arch/nds32/math-emu/fsubs.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include <linux/uaccess.h> + +#include <asm/sfp-machine.h> +#include <math-emu/soft-fp.h> +#include <math-emu/single.h> +void fsubs(void *ft, void *fa, void *fb) +{ + + FP_DECL_S(A); + FP_DECL_S(B); + FP_DECL_S(R); + FP_DECL_EX; + + FP_UNPACK_SP(A, fa); + FP_UNPACK_SP(B, fb); + + if (B_c != FP_CLS_NAN) + B_s ^= 1; + + FP_ADD_S(R, A, B); + + FP_PACK_SP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/mm/Makefile b/arch/nds32/mm/Makefile @@ -4,4 +4,8 @@ obj-y := extable.o tlb.o \ obj-$(CONFIG_ALIGNMENT_TRAP) += alignment.o obj-$(CONFIG_HIGHMEM) += highmem.o -CFLAGS_proc-n13.o += -fomit-frame-pointer + +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_proc.o = $(CC_FLAGS_FTRACE) +endif +CFLAGS_proc.o += -fomit-frame-pointer diff --git a/arch/nds32/mm/fault.c b/arch/nds32/mm/fault.c @@ -9,6 +9,7 @@ #include <linux/init.h> #include <linux/hardirq.h> #include <linux/uaccess.h> +#include <linux/perf_event.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -169,8 +170,6 @@ good_area: mask = VM_EXEC; else { mask = VM_READ | VM_WRITE; - if (vma->vm_flags & VM_WRITE) - flags |= FAULT_FLAG_WRITE; } } else if (entry == ENTRY_TLB_MISC) { switch (error_code & ITYPE_mskETYPE) { @@ -231,11 +230,17 @@ good_area: * attempt. If we go through a retry, it is extremely likely that the * page will be found in page cache at that point. */ + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) + if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - else + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, + 1, regs, addr); + } else { tsk->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, + 1, regs, addr); + } if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; diff --git a/drivers/irqchip/irq-ativic32.c b/drivers/irqchip/irq-ativic32.c @@ -10,6 +10,8 @@ #include <linux/irqchip.h> #include <nds32_intrinsic.h> +unsigned long wake_mask; + static void ativic32_ack_irq(struct irq_data *data) { __nds32__mtsr_dsb(BIT(data->hwirq), NDS32_SR_INT_PEND2); @@ -27,11 +29,40 @@ static void ativic32_unmask_irq(struct irq_data *data) __nds32__mtsr_dsb(int_mask2 | (BIT(data->hwirq)), NDS32_SR_INT_MASK2); } +static int nointc_set_wake(struct irq_data *data, unsigned int on) +{ + unsigned long int_mask = __nds32__mfsr(NDS32_SR_INT_MASK); + static unsigned long irq_orig_bit; + u32 bit = 1 << data->hwirq; + + if (on) { + if (int_mask & bit) + __assign_bit(data->hwirq, &irq_orig_bit, true); + else + __assign_bit(data->hwirq, &irq_orig_bit, false); + + __assign_bit(data->hwirq, &int_mask, true); + __assign_bit(data->hwirq, &wake_mask, true); + + } else { + if (!(irq_orig_bit & bit)) + __assign_bit(data->hwirq, &int_mask, false); + + __assign_bit(data->hwirq, &wake_mask, false); + __assign_bit(data->hwirq, &irq_orig_bit, false); + } + + __nds32__mtsr_dsb(int_mask, NDS32_SR_INT_MASK); + + return 0; +} + static struct irq_chip ativic32_chip = { .name = "ativic32", .irq_ack = ativic32_ack_irq, .irq_mask = ativic32_mask_irq, .irq_unmask = ativic32_unmask_irq, + .irq_set_wake = nointc_set_wake, }; static unsigned int __initdata nivic_map[6] = { 6, 2, 10, 16, 24, 32 }; diff --git a/include/math-emu/op-2.h b/include/math-emu/op-2.h @@ -31,61 +31,56 @@ #define _FP_FRAC_HIGH_2(X) (X##_f1) #define _FP_FRAC_LOW_2(X) (X##_f0) #define _FP_FRAC_WORD_2(X,w) (X##_f##w) +#define _FP_FRAC_SLL_2(X, N) ( \ + (void) (((N) < _FP_W_TYPE_SIZE) \ + ? ({ \ + if (__builtin_constant_p(N) && (N) == 1) { \ + X##_f1 = X##_f1 + X##_f1 + \ + (((_FP_WS_TYPE) (X##_f0)) < 0); \ + X##_f0 += X##_f0; \ + } else { \ + X##_f1 = X##_f1 << (N) | X##_f0 >> \ + (_FP_W_TYPE_SIZE - (N)); \ + X##_f0 <<= (N); \ + } \ + 0; \ + }) \ + : ({ \ + X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE); \ + X##_f0 = 0; \ + }))) + + +#define _FP_FRAC_SRL_2(X, N) ( \ + (void) (((N) < _FP_W_TYPE_SIZE) \ + ? ({ \ + X##_f0 = X##_f0 >> (N) | X##_f1 << (_FP_W_TYPE_SIZE - (N)); \ + X##_f1 >>= (N); \ + }) \ + : ({ \ + X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE); \ + X##_f1 = 0; \ + }))) -#define _FP_FRAC_SLL_2(X,N) \ - do { \ - if ((N) < _FP_W_TYPE_SIZE) \ - { \ - if (__builtin_constant_p(N) && (N) == 1) \ - { \ - X##_f1 = X##_f1 + X##_f1 + (((_FP_WS_TYPE)(X##_f0)) < 0); \ - X##_f0 += X##_f0; \ - } \ - else \ - { \ - X##_f1 = X##_f1 << (N) | X##_f0 >> (_FP_W_TYPE_SIZE - (N)); \ - X##_f0 <<= (N); \ - } \ - } \ - else \ - { \ - X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE); \ - X##_f0 = 0; \ - } \ - } while (0) - -#define _FP_FRAC_SRL_2(X,N) \ - do { \ - if ((N) < _FP_W_TYPE_SIZE) \ - { \ - X##_f0 = X##_f0 >> (N) | X##_f1 << (_FP_W_TYPE_SIZE - (N)); \ - X##_f1 >>= (N); \ - } \ - else \ - { \ - X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE); \ - X##_f1 = 0; \ - } \ - } while (0) /* Right shift with sticky-lsb. */ -#define _FP_FRAC_SRS_2(X,N,sz) \ - do { \ - if ((N) < _FP_W_TYPE_SIZE) \ - { \ - X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N) | \ - (__builtin_constant_p(N) && (N) == 1 \ - ? X##_f0 & 1 \ - : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0)); \ - X##_f1 >>= (N); \ - } \ - else \ - { \ - X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE) | \ - (((X##_f1 << (2*_FP_W_TYPE_SIZE - (N))) | X##_f0) != 0)); \ - X##_f1 = 0; \ - } \ - } while (0) +#define _FP_FRAC_SRS_2(X, N, sz) ( \ + (void) (((N) < _FP_W_TYPE_SIZE) \ + ? ({ \ + X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N) \ + | (__builtin_constant_p(N) && (N) == 1 \ + ? X##_f0 & 1 \ + : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0)); \ + X##_f1 >>= (N); \ + }) \ + : ({ \ + X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE) \ + | ((((N) == _FP_W_TYPE_SIZE \ + ? 0 \ + : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N)))) \ + | X##_f0) != 0)); \ + X##_f1 = 0; \ + }))) #define _FP_FRAC_ADDI_2(X,I) \ __FP_FRAC_ADDI_2(X##_f1, X##_f0, I) diff --git a/include/math-emu/soft-fp.h b/include/math-emu/soft-fp.h @@ -138,7 +138,7 @@ do { \ _FP_FRAC_ADDI_##wc(X, _FP_WORK_ROUND); \ } while (0) -#define _FP_ROUND_ZERO(wc, X) 0 +#define _FP_ROUND_ZERO(wc, X) (void)0 #define _FP_ROUND_PINF(wc, X) \ do { \ diff --git a/tools/include/asm/barrier.h b/tools/include/asm/barrier.h @@ -24,6 +24,8 @@ #include "../../arch/ia64/include/asm/barrier.h" #elif defined(__xtensa__) #include "../../arch/xtensa/include/asm/barrier.h" +#elif defined(__nds32__) +#include "../../arch/nds32/include/asm/barrier.h" #else #include <asm-generic/barrier.h> #endif diff --git a/tools/perf/arch/nds32/Build b/tools/perf/arch/nds32/Build @@ -0,0 +1 @@ +libperf-y += util/ diff --git a/tools/perf/arch/nds32/util/Build b/tools/perf/arch/nds32/util/Build @@ -0,0 +1 @@ +libperf-y += header.o diff --git a/tools/perf/arch/nds32/util/header.c b/tools/perf/arch/nds32/util/header.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2017 Andes Technology Corporation + +#include <stdio.h> +#include <stdlib.h> +#include <api/fs/fs.h> +#include "header.h" + +#define STR_LEN 1024 + +char *get_cpuid_str(struct perf_pmu *pmu) +{ + /* In nds32, we only have one cpu */ + char *buf = NULL; + struct cpu_map *cpus; + const char *sysfs = sysfs__mountpoint(); + + if (!sysfs || !pmu || !pmu->cpus) + return NULL; + + buf = malloc(STR_LEN); + if (!buf) + return NULL; + + cpus = cpu_map__get(pmu->cpus); + sprintf(buf, "0x%x", cpus->nr - 1); + cpu_map__put(cpus); + return buf; +} diff --git a/tools/perf/pmu-events/arch/nds32/mapfile.csv b/tools/perf/pmu-events/arch/nds32/mapfile.csv @@ -0,0 +1,15 @@ +# Format: +# MIDR,Version,JSON/file/pathname,Type +# +# where +# MIDR Processor version +# Variant[23:20] and Revision [3:0] should be zero. +# Version could be used to track version of of JSON file +# but currently unused. +# JSON/file/pathname is the path to JSON file, relative +# to tools/perf/pmu-events/arch/arm64/. +# Type is core, uncore etc +# +# +#Family-model,Version,Filename,EventType +0x0,v3,n13,core diff --git a/tools/perf/pmu-events/arch/nds32/n13/atcpmu.json b/tools/perf/pmu-events/arch/nds32/n13/atcpmu.json @@ -0,0 +1,290 @@ +[ + { + "PublicDescription": "Conditional branch", + "EventCode": "0x102", + "EventName": "cond_br", + "BriefDescription": "V3 Conditional branch" + }, + { + "PublicDescription": "Taken conditional branches", + "EventCode": "0x103", + "EventName": "taken_cond_br", + "BriefDescription": "V3 Taken Conditional branch" + }, + { + "PublicDescription": "Prefetch Instruction", + "EventCode": "0x104", + "EventName": "prefetch_inst", + "BriefDescription": "V3 Prefetch Instruction" + }, + { + "PublicDescription": "RET Inst", + "EventCode": "0x105", + "EventName": "ret_inst", + "BriefDescription": "V3 RET Inst" + }, + { + "PublicDescription": "JR(non-RET) instructions", + "EventCode": "0x106", + "EventName": "jr_inst", + "BriefDescription": "V3 JR(non-RET) instructions" + }, + { + "PublicDescription": "JAL/JRAL instructions", + "EventCode": "0x107", + "EventName": "jal_jral_inst", + "BriefDescription": "V3 JAL/JRAL instructions" + }, + { + "PublicDescription": "NOP instructions", + "EventCode": "0x108", + "EventName": "nop_inst", + "BriefDescription": "V3 NOP instructions" + }, + { + "PublicDescription": "SCW instructions", + "EventCode": "0x109", + "EventName": "scw_inst", + "BriefDescription": "V3 SCW instructions" + }, + { + "PublicDescription": "ISB/DSB instructions", + "EventCode": "0x10a", + "EventName": "isb_dsb_inst", + "BriefDescription": "V3 ISB/DSB instructions" + }, + { + "PublicDescription": "CCTL instructions", + "EventCode": "0x10b", + "EventName": "cctl_inst", + "BriefDescription": "V3 CCTL instructions" + }, + { + "PublicDescription": "Taken Interrupts", + "EventCode": "0x10c", + "EventName": "taken_interrupts", + "BriefDescription": "V3 Taken Interrupts" + }, + { + "PublicDescription": "Loads Completed", + "EventCode": "0x10d", + "EventName": "load_completed", + "BriefDescription": "V3 Loads Completed" + }, + { + "PublicDescription": "uITLB accesses", + "EventCode": "0x10e", + "EventName": "uitlb_access", + "BriefDescription": "V3 uITLB accesses" + }, + { + "PublicDescription": "uDTLB accesses", + "EventCode": "0x10f", + "EventName": "udtlb_access", + "BriefDescription": "V3 uDTLB accesses" + }, + { + "PublicDescription": "MTLB accesses", + "EventCode": "0x110", + "EventName": "mtlb_access", + "BriefDescription": "V3 MTLB accesses" + }, + { + "PublicDescription": "DATA_DEPENDENCY_STALL_CYCLES", + "EventCode": "0x112", + "EventName": "data_dependency_stall", + "BriefDescription": "V3 DATA_DEPENDENCY_STALL_CYCLES" + }, + { + "PublicDescription": "DATA_CACHE_MISS_STALL_CYCLES", + "EventCode": "0x113", + "EventName": "dcache_miss_stall", + "BriefDescription": "V3 DATA_CACHE_MISS_STALL_CYCLES" + }, + { + "PublicDescription": "ILM access", + "EventCode": "0x118", + "EventName": "ilm_access", + "BriefDescription": "V3 ILM accesses" + }, + { + "PublicDescription": "LSU BIU CYCLES", + "EventCode": "0x119", + "EventName": "lsu_biu_cycles", + "BriefDescription": "V3 LSU BIU CYCLES" + }, + { + "PublicDescription": "HPTWK BIU CYCLES", + "EventCode": "0x11a", + "EventName": "hptwk_biu_cycles", + "BriefDescription": "V3 HPTWK BIU CYCLES" + }, + { + "PublicDescription": "DMA BIU CYCLES", + "EventCode": "0x11b", + "EventName": "dma_biu_cycles", + "BriefDescription": "V3 DMA BIU CYCLES" + }, + { + "PublicDescription": "CODE CACHE FILL BIU CYCLES", + "EventCode": "0x11c", + "EventName": "icache_fill_biu_cycles", + "BriefDescription": "V3 CODE CACHE FILL BIU CYCLES" + }, + { + "PublicDescription": "LEAGAL UNALIGN DCACHE ACCESS", + "EventCode": "0x11d", + "EventName": "legal_unalined_dcache_access", + "BriefDescription": "V3 LEAGAL UNALIGN DCACHE ACCESS" + }, + { + "PublicDescription": "PUSH25 instructions", + "EventCode": "0x11e", + "EventName": "push25_inst", + "BriefDescription": "V3 PUSH25 instructions" + }, + { + "PublicDescription": "SYSCALL instructions", + "EventCode": "0x11f", + "EventName": "syscall_inst", + "BriefDescription": "V3 SYSCALL instructions" + }, + { + "PublicDescription": "conditional branch miss", + "EventCode": "0x202", + "EventName": "cond_br_miss", + "BriefDescription": "V3 conditional branch miss" + }, + { + "PublicDescription": "taken conditional branch miss", + "EventCode": "0x203", + "EventName": "taken_cond_br_miss", + "BriefDescription": "V3 taken conditional branch miss" + }, + { + "PublicDescription": "Prefetch Instructions with cache hit", + "EventCode": "0x204", + "EventName": "prefetch_icache_hit", + "BriefDescription": "V3 Prefetch Instructions with cache hit" + }, + { + "PublicDescription": "RET mispredict", + "EventCode": "0x205", + "EventName": "ret_mispredict", + "BriefDescription": "V3 RET mispredict" + }, + { + "PublicDescription": "Immediate J instructions", + "EventCode": "0x206", + "EventName": "imm_j_inst", + "BriefDescription": "V3 Immediate J instructions" + }, + { + "PublicDescription": "Multiply instructions", + "EventCode": "0x207", + "EventName": "mul_inst", + "BriefDescription": "V3 Multiply instructions" + }, + { + "PublicDescription": "16 bits instructions", + "EventCode": "0x208", + "EventName": "sixteen_bits_inst", + "BriefDescription": "V3 16 bits instructions" + }, + { + "PublicDescription": "Failed SCW instructions", + "EventCode": "0x209", + "EventName": "fail_scw_inst", + "BriefDescription": "V3 Failed SCW instructions" + }, + { + "PublicDescription": "ld-after-st conflict replays", + "EventCode": "0x20a", + "EventName": "ld_af_st_conflict", + "BriefDescription": "V3 ld-after-st conflict replays" + }, + { + "PublicDescription": "Exception taken", + "EventCode": "0x20c", + "EventName": "exception_taken", + "BriefDescription": "V3 Exception taken" + }, + { + "PublicDescription": "Stores completed", + "EventCode": "0x20d", + "EventName": "store_completed", + "BriefDescription": "V3 Stores completed" + }, + { + "PublicDescription": "uITLB miss", + "EventCode": "0x20e", + "EventName": "uitlb_miss", + "BriefDescription": "V3 uITLB miss" + }, + { + "PublicDescription": "uDTLB miss", + "EventCode": "0x20f", + "EventName": "udtlb_miss", + "BriefDescription": "V3 uDTLB miss" + }, + { + "PublicDescription": "MTLB miss", + "EventCode": "0x210", + "EventName": "mtlb_miss", + "BriefDescription": "V3 MTLB miss" + }, + { + "PublicDescription": "Empty instructions queue stall cycles", + "EventCode": "0x212", + "EventName": "empty_inst_q_stall", + "BriefDescription": "V3 Empty instructions queue stall cycles" + }, + { + "PublicDescription": "Data write back", + "EventCode": "0x213", + "EventName": "data_wb", + "BriefDescription": "V3 Data write back" + }, + { + "PublicDescription": "DLM access", + "EventCode": "0x218", + "EventName": "dlm_access", + "BriefDescription": "V3 DLM access" + }, + { + "PublicDescription": "LSU BIU request", + "EventCode": "0x219", + "EventName": "lsu_biu_req", + "BriefDescription": "V3 LSU BIU request" + }, + { + "PublicDescription": "HPTWK BIU request", + "EventCode": "0x21a", + "EventName": "hptwk_biu_req", + "BriefDescription": "V3 HPTWK BIU request" + }, + { + "PublicDescription": "DMA BIU request", + "EventCode": "0x21b", + "EventName": "dma_biu_req", + "BriefDescription": "V3 DMA BIU request" + }, + { + "PublicDescription": "Icache fill BIU request", + "EventCode": "0x21c", + "EventName": "icache_fill_biu_req", + "BriefDescription": "V3 Icache fill BIU request" + }, + { + "PublicDescription": "External events", + "EventCode": "0x21d", + "EventName": "external_events", + "BriefDescription": "V3 External events" + }, + { + "PublicDescription": "POP25 instructions", + "EventCode": "0x21e", + "EventName": "pop25_inst", + "BriefDescription": "V3 POP25 instructions" + }, +]