whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 034bda1cd5abbe7b170ce76b618768d164030bbd
parent d82924c3b8d0607094b94fab290a33c5ad7d586c
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Tue, 23 Oct 2018 19:07:25 +0100

Merge branch 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 vdso updates from Ingo Molnar:
 "Two main changes:

   - Cleanups, simplifications and CLOCK_TAI support (Thomas Gleixner)

   - Improve code generation (Andy Lutomirski)"

* 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/vdso: Rearrange do_hres() to improve code generation
  x86/vdso: Document vgtod_ts better
  x86/vdso: Remove "memory" clobbers in the vDSO syscall fallbacks
  x66/vdso: Add CLOCK_TAI support
  x86/vdso: Move cycle_last handling into the caller
  x86/vdso: Simplify the invalid vclock case
  x86/vdso: Replace the clockid switch case
  x86/vdso: Collapse coarse functions
  x86/vdso: Collapse high resolution functions
  x86/vdso: Introduce and use vgtod_ts
  x86/vdso: Use unsigned int consistently for vsyscall_gtod_data:: Seq
  x86/vdso: Enforce 64bit clocksource
  x86/time: Implement clocksource_arch_init()
  clocksource: Provide clocksource_arch_init()

Diffstat:
March/x86/Kconfig | 1+
March/x86/entry/vdso/vclock_gettime.c | 210+++++++++++++++++++++++--------------------------------------------------------
March/x86/entry/vsyscall/vsyscall_gtod.c | 51++++++++++++++++++++++++++++-----------------------
March/x86/include/asm/vgtod.h | 53+++++++++++++++++++++++++++++++++--------------------
March/x86/kernel/time.c | 22++++++++++++++++++++++
Minclude/linux/clocksource.h | 5+++++
Mkernel/time/Kconfig | 4++++
Mkernel/time/clocksource.c | 2++
8 files changed, 155 insertions(+), 193 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig @@ -48,6 +48,7 @@ config X86 select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ANON_INODES select ARCH_CLOCKSOURCE_DATA + select ARCH_CLOCKSOURCE_INIT select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_DEBUG_VIRTUAL diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c @@ -45,21 +45,10 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) long ret; asm ("syscall" : "=a" (ret), "=m" (*ts) : "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : - "memory", "rcx", "r11"); + "rcx", "r11"); return ret; } -notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) -{ - long ret; - - asm ("syscall" : "=a" (ret), "=m" (*tv), "=m" (*tz) : - "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : - "memory", "rcx", "r11"); - return ret; -} - - #else notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) @@ -73,22 +62,7 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) "mov %%edx, %%ebx \n" : "=a" (ret), "=m" (*ts) : "0" (__NR_clock_gettime), [clock] "g" (clock), "c" (ts) - : "memory", "edx"); - return ret; -} - -notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) -{ - long ret; - - asm ( - "mov %%ebx, %%edx \n" - "mov %[tv], %%ebx \n" - "call __kernel_vsyscall \n" - "mov %%edx, %%ebx \n" - : "=a" (ret), "=m" (*tv), "=m" (*tz) - : "0" (__NR_gettimeofday), [tv] "g" (tv), "c" (tz) - : "memory", "edx"); + : "edx"); return ret; } @@ -100,12 +74,11 @@ static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void) return (const struct pvclock_vsyscall_time_info *)&pvclock_page; } -static notrace u64 vread_pvclock(int *mode) +static notrace u64 vread_pvclock(void) { const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti; - u64 ret; - u64 last; u32 version; + u64 ret; /* * Note: The kernel and hypervisor must guarantee that cpu ID @@ -132,175 +105,112 @@ static notrace u64 vread_pvclock(int *mode) do { version = pvclock_read_begin(pvti); - if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) { - *mode = VCLOCK_NONE; - return 0; - } + if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) + return U64_MAX; ret = __pvclock_read_cycles(pvti, rdtsc_ordered()); } while (pvclock_read_retry(pvti, version)); - /* refer to vread_tsc() comment for rationale */ - last = gtod->cycle_last; - - if (likely(ret >= last)) - return ret; - - return last; + return ret; } #endif #ifdef CONFIG_HYPERV_TSCPAGE -static notrace u64 vread_hvclock(int *mode) +static notrace u64 vread_hvclock(void) { const struct ms_hyperv_tsc_page *tsc_pg = (const struct ms_hyperv_tsc_page *)&hvclock_page; - u64 current_tick = hv_read_tsc_page(tsc_pg); - - if (current_tick != U64_MAX) - return current_tick; - *mode = VCLOCK_NONE; - return 0; + return hv_read_tsc_page(tsc_pg); } #endif -notrace static u64 vread_tsc(void) +notrace static inline u64 vgetcyc(int mode) { - u64 ret = (u64)rdtsc_ordered(); - u64 last = gtod->cycle_last; - - if (likely(ret >= last)) - return ret; - - /* - * GCC likes to generate cmov here, but this branch is extremely - * predictable (it's just a function of time and the likely is - * very likely) and there's a data dependence, so force GCC - * to generate a branch instead. I don't barrier() because - * we don't actually need a barrier, and if this function - * ever gets inlined it will generate worse code. - */ - asm volatile (""); - return last; -} - -notrace static inline u64 vgetsns(int *mode) -{ - u64 v; - cycles_t cycles; - - if (gtod->vclock_mode == VCLOCK_TSC) - cycles = vread_tsc(); + if (mode == VCLOCK_TSC) + return (u64)rdtsc_ordered(); #ifdef CONFIG_PARAVIRT_CLOCK - else if (gtod->vclock_mode == VCLOCK_PVCLOCK) - cycles = vread_pvclock(mode); + else if (mode == VCLOCK_PVCLOCK) + return vread_pvclock(); #endif #ifdef CONFIG_HYPERV_TSCPAGE - else if (gtod->vclock_mode == VCLOCK_HVCLOCK) - cycles = vread_hvclock(mode); + else if (mode == VCLOCK_HVCLOCK) + return vread_hvclock(); #endif - else - return 0; - v = (cycles - gtod->cycle_last) & gtod->mask; - return v * gtod->mult; + return U64_MAX; } -/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */ -notrace static int __always_inline do_realtime(struct timespec *ts) +notrace static int do_hres(clockid_t clk, struct timespec *ts) { - unsigned long seq; - u64 ns; - int mode; + struct vgtod_ts *base = &gtod->basetime[clk]; + u64 cycles, last, sec, ns; + unsigned int seq; do { seq = gtod_read_begin(gtod); - mode = gtod->vclock_mode; - ts->tv_sec = gtod->wall_time_sec; - ns = gtod->wall_time_snsec; - ns += vgetsns(&mode); + cycles = vgetcyc(gtod->vclock_mode); + ns = base->nsec; + last = gtod->cycle_last; + if (unlikely((s64)cycles < 0)) + return vdso_fallback_gettime(clk, ts); + if (cycles > last) + ns += (cycles - last) * gtod->mult; ns >>= gtod->shift; + sec = base->sec; } while (unlikely(gtod_read_retry(gtod, seq))); - ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + /* + * Do this outside the loop: a race inside the loop could result + * in __iter_div_u64_rem() being extremely slow. + */ + ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); ts->tv_nsec = ns; - return mode; + return 0; } -notrace static int __always_inline do_monotonic(struct timespec *ts) +notrace static void do_coarse(clockid_t clk, struct timespec *ts) { - unsigned long seq; - u64 ns; - int mode; + struct vgtod_ts *base = &gtod->basetime[clk]; + unsigned int seq; do { seq = gtod_read_begin(gtod); - mode = gtod->vclock_mode; - ts->tv_sec = gtod->monotonic_time_sec; - ns = gtod->monotonic_time_snsec; - ns += vgetsns(&mode); - ns >>= gtod->shift; + ts->tv_sec = base->sec; + ts->tv_nsec = base->nsec; } while (unlikely(gtod_read_retry(gtod, seq))); - - ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); - ts->tv_nsec = ns; - - return mode; } -notrace static void do_realtime_coarse(struct timespec *ts) +notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { - unsigned long seq; - do { - seq = gtod_read_begin(gtod); - ts->tv_sec = gtod->wall_time_coarse_sec; - ts->tv_nsec = gtod->wall_time_coarse_nsec; - } while (unlikely(gtod_read_retry(gtod, seq))); -} + unsigned int msk; -notrace static void do_monotonic_coarse(struct timespec *ts) -{ - unsigned long seq; - do { - seq = gtod_read_begin(gtod); - ts->tv_sec = gtod->monotonic_time_coarse_sec; - ts->tv_nsec = gtod->monotonic_time_coarse_nsec; - } while (unlikely(gtod_read_retry(gtod, seq))); -} + /* Sort out negative (CPU/FD) and invalid clocks */ + if (unlikely((unsigned int) clock >= MAX_CLOCKS)) + return vdso_fallback_gettime(clock, ts); -notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) -{ - switch (clock) { - case CLOCK_REALTIME: - if (do_realtime(ts) == VCLOCK_NONE) - goto fallback; - break; - case CLOCK_MONOTONIC: - if (do_monotonic(ts) == VCLOCK_NONE) - goto fallback; - break; - case CLOCK_REALTIME_COARSE: - do_realtime_coarse(ts); - break; - case CLOCK_MONOTONIC_COARSE: - do_monotonic_coarse(ts); - break; - default: - goto fallback; + /* + * Convert the clockid to a bitmask and use it to check which + * clocks are handled in the VDSO directly. + */ + msk = 1U << clock; + if (likely(msk & VGTOD_HRES)) { + return do_hres(clock, ts); + } else if (msk & VGTOD_COARSE) { + do_coarse(clock, ts); + return 0; } - - return 0; -fallback: return vdso_fallback_gettime(clock, ts); } + int clock_gettime(clockid_t, struct timespec *) __attribute__((weak, alias("__vdso_clock_gettime"))); notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { if (likely(tv != NULL)) { - if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE)) - return vdso_fallback_gtod(tv, tz); + struct timespec *ts = (struct timespec *) tv; + + do_hres(CLOCK_REALTIME, ts); tv->tv_usec /= 1000; } if (unlikely(tz != NULL)) { @@ -320,7 +230,7 @@ int gettimeofday(struct timeval *, struct timezone *) notrace time_t __vdso_time(time_t *t) { /* This is atomic on x86 so we don't need any locks. */ - time_t result = READ_ONCE(gtod->wall_time_sec); + time_t result = READ_ONCE(gtod->basetime[CLOCK_REALTIME].sec); if (t) *t = result; diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c @@ -31,6 +31,8 @@ void update_vsyscall(struct timekeeper *tk) { int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; + struct vgtod_ts *base; + u64 nsec; /* Mark the new vclock used. */ BUILD_BUG_ON(VCLOCK_MAX >= 32); @@ -45,34 +47,37 @@ void update_vsyscall(struct timekeeper *tk) vdata->mult = tk->tkr_mono.mult; vdata->shift = tk->tkr_mono.shift; - vdata->wall_time_sec = tk->xtime_sec; - vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; + base = &vdata->basetime[CLOCK_REALTIME]; + base->sec = tk->xtime_sec; + base->nsec = tk->tkr_mono.xtime_nsec; - vdata->monotonic_time_sec = tk->xtime_sec - + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec - + ((u64)tk->wall_to_monotonic.tv_nsec - << tk->tkr_mono.shift); - while (vdata->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { - vdata->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; - vdata->monotonic_time_sec++; - } + base = &vdata->basetime[CLOCK_TAI]; + base->sec = tk->xtime_sec + (s64)tk->tai_offset; + base->nsec = tk->tkr_mono.xtime_nsec; - vdata->wall_time_coarse_sec = tk->xtime_sec; - vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> - tk->tkr_mono.shift); + base = &vdata->basetime[CLOCK_MONOTONIC]; + base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + nsec = tk->tkr_mono.xtime_nsec; + nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); + while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { + nsec -= ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; + base->sec++; + } + base->nsec = nsec; - vdata->monotonic_time_coarse_sec = - vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_coarse_nsec = - vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; + base = &vdata->basetime[CLOCK_REALTIME_COARSE]; + base->sec = tk->xtime_sec; + base->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; - while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { - vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; - vdata->monotonic_time_coarse_sec++; + base = &vdata->basetime[CLOCK_MONOTONIC_COARSE]; + base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec += tk->wall_to_monotonic.tv_nsec; + while (nsec >= NSEC_PER_SEC) { + nsec -= NSEC_PER_SEC; + base->sec++; } + base->nsec = nsec; gtod_write_end(vdata); } diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h @@ -5,33 +5,46 @@ #include <linux/compiler.h> #include <linux/clocksource.h> +#include <uapi/linux/time.h> + #ifdef BUILD_VDSO32_64 typedef u64 gtod_long_t; #else typedef unsigned long gtod_long_t; #endif + +/* + * There is one of these objects in the vvar page for each + * vDSO-accelerated clockid. For high-resolution clocks, this encodes + * the time corresponding to vsyscall_gtod_data.cycle_last. For coarse + * clocks, this encodes the actual time. + * + * To confuse the reader, for high-resolution clocks, nsec is left-shifted + * by vsyscall_gtod_data.shift. + */ +struct vgtod_ts { + u64 sec; + u64 nsec; +}; + +#define VGTOD_BASES (CLOCK_TAI + 1) +#define VGTOD_HRES (BIT(CLOCK_REALTIME) | BIT(CLOCK_MONOTONIC) | BIT(CLOCK_TAI)) +#define VGTOD_COARSE (BIT(CLOCK_REALTIME_COARSE) | BIT(CLOCK_MONOTONIC_COARSE)) + /* * vsyscall_gtod_data will be accessed by 32 and 64 bit code at the same time * so be carefull by modifying this structure. */ struct vsyscall_gtod_data { - unsigned seq; - - int vclock_mode; - u64 cycle_last; - u64 mask; - u32 mult; - u32 shift; - - /* open coded 'struct timespec' */ - u64 wall_time_snsec; - gtod_long_t wall_time_sec; - gtod_long_t monotonic_time_sec; - u64 monotonic_time_snsec; - gtod_long_t wall_time_coarse_sec; - gtod_long_t wall_time_coarse_nsec; - gtod_long_t monotonic_time_coarse_sec; - gtod_long_t monotonic_time_coarse_nsec; + unsigned int seq; + + int vclock_mode; + u64 cycle_last; + u64 mask; + u32 mult; + u32 shift; + + struct vgtod_ts basetime[VGTOD_BASES]; int tz_minuteswest; int tz_dsttime; @@ -44,9 +57,9 @@ static inline bool vclock_was_used(int vclock) return READ_ONCE(vclocks_used) & (1 << vclock); } -static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s) +static inline unsigned int gtod_read_begin(const struct vsyscall_gtod_data *s) { - unsigned ret; + unsigned int ret; repeat: ret = READ_ONCE(s->seq); @@ -59,7 +72,7 @@ repeat: } static inline int gtod_read_retry(const struct vsyscall_gtod_data *s, - unsigned start) + unsigned int start) { smp_rmb(); return unlikely(s->seq != start); diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c @@ -10,6 +10,7 @@ * */ +#include <linux/clocksource.h> #include <linux/clockchips.h> #include <linux/interrupt.h> #include <linux/irq.h> @@ -105,3 +106,24 @@ void __init time_init(void) { late_time_init = x86_late_time_init; } + +/* + * Sanity check the vdso related archdata content. + */ +void clocksource_arch_init(struct clocksource *cs) +{ + if (cs->archdata.vclock_mode == VCLOCK_NONE) + return; + + if (cs->archdata.vclock_mode > VCLOCK_MAX) { + pr_warn("clocksource %s registered with invalid vclock_mode %d. Disabling vclock.\n", + cs->name, cs->archdata.vclock_mode); + cs->archdata.vclock_mode = VCLOCK_NONE; + } + + if (cs->mask != CLOCKSOURCE_MASK(64)) { + pr_warn("clocksource %s registered with invalid mask %016llx. Disabling vclock.\n", + cs->name, cs->mask); + cs->archdata.vclock_mode = VCLOCK_NONE; + } +} diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h @@ -241,6 +241,11 @@ static inline void __clocksource_update_freq_khz(struct clocksource *cs, u32 khz __clocksource_update_freq_scale(cs, 1000, khz); } +#ifdef CONFIG_ARCH_CLOCKSOURCE_INIT +extern void clocksource_arch_init(struct clocksource *cs); +#else +static inline void clocksource_arch_init(struct clocksource *cs) { } +#endif extern int timekeeping_notify(struct clocksource *clock); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig @@ -12,6 +12,10 @@ config CLOCKSOURCE_WATCHDOG config ARCH_CLOCKSOURCE_DATA bool +# Architecture has extra clocksource init called from registration +config ARCH_CLOCKSOURCE_INIT + bool + # Clocksources require validation of the clocksource against the last # cycle update - x86/TSC misfeature config CLOCKSOURCE_VALIDATE_LAST_CYCLE diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c @@ -937,6 +937,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) { unsigned long flags; + clocksource_arch_init(cs); + /* Initialize mult/shift and max_idle_ns */ __clocksource_update_freq_scale(cs, scale, freq);