/*	$NetBSD: clock.c,v 1.70 2018/06/30 14:59:38 riastradh Exp $	*/

/*-
 * Copyright (c) 2017, 2018 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Taylor R. Campbell.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.70 2018/06/30 14:59:38 riastradh Exp $");

/*
 * XXX The original header names were lost; this list is reconstructed
 * to cover what the file uses (atomics, evcnt, percpu, timecounters,
 * device_xname, SPINLOCK_BACKOFF_HOOK) and may not match the original
 * exactly.  A hypervisor header supplying struct vcpu_time_info is
 * also required.
 */
#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/device.h>
#include <sys/evcnt.h>
#include <sys/lock.h>
#include <sys/percpu.h>
#include <sys/systm.h>
#include <sys/timetc.h>

#include <machine/cpu.h>

/*
 * struct pvclock_cpu
 *
 *	Per-CPU state for pvclock.
 */
struct pvclock_cpu {
	volatile struct vcpu_time_info *pvc_vcputime;
	struct evcnt	pvc_tsc_backwards_evcnt;
	struct evcnt	pvc_tsc_delta_negative_evcnt;
	struct evcnt	pvc_raw_systime_wraparound_evcnt;
	struct evcnt	pvc_raw_systime_backwards_evcnt;
	uint64_t	pvc_last_systime_ns;
	uint64_t	pvc_systime_ns_skew;
};

/*
 * struct pvclock_ticket
 *
 *	State for a read section, during which a caller may read from
 *	fields of a struct vcpu_time_info and call pvclock_rdtsc.
 *	Caller must enter with pvclock_enter, exit with pvclock_exit,
 *	and be prepared to retry if pvclock_exit fails.
 */
struct pvclock_ticket {
	uint64_t version;
};

static uint64_t	pvclock_systime_ns(struct pvclock_cpu *);
static uint64_t	pvclock_global_systime_ns(void);
static unsigned	pvclock_get_timecount(struct timecounter *);

/*
 * pvclock_percpu
 *
 *	Per-CPU state for pvclock.
 */
static struct percpu *pvclock_percpu __read_mostly; /* struct pvclock_cpu */

/*
 * pvclock timecounter:
 *
 *	pvclock system time, plus an adjustment with rdtsc.
 */
static struct timecounter pvclock_timecounter = {
	.tc_get_timecount = pvclock_get_timecount,
	.tc_poll_pps = NULL,
	.tc_counter_mask = ~0U,
	.tc_frequency = 1000000000ULL,	/* 1 GHz, i.e. units of nanoseconds */
	.tc_name = "pvclock",
	.tc_quality = 10000,
};

/*
 * pvclock_global_systime_ns_stamp
 *
 *	The latest pvclock system time that has been observed on any
 *	CPU, for a global monotonic view of the pvclock.
 */
static volatile uint64_t pvclock_global_systime_ns_stamp __cacheline_aligned;

/*
 * pvclock_alloc()
 *
 *	Allocate resources for pvclock.  Must precede any use of
 *	pvclock_cpu_init and pvclock_init.
 */
void
pvclock_alloc(void)
{

	pvclock_percpu = percpu_alloc(sizeof(struct pvclock_cpu));
}
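/*
 * Usage sketch (hypothetical machine-dependent attachment code, not
 * part of this file): the expected boot-time sequence is
 *
 *	pvclock_alloc();
 *	for each struct cpu_info *ci, with vt mapped from the hypervisor:
 *		pvclock_cpu_init(ci, vt);
 *	pvclock_init();
 */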
/*
 * pvclock_cpu_init(ci, vt)
 *
 *	Initialize CPU ci with the specified struct vcpu_time_info
 *	pointer.  Caller must guarantee that, for any particular CPU,
 *	only one CPU at a time does this, and before pvclock_init().
 */
void
pvclock_cpu_init(struct cpu_info *ci, volatile struct vcpu_time_info *vt)
{
	const char *xname = device_xname(ci->ci_dev);
	struct pvclock_cpu *pvc = percpu_getptr_remote(pvclock_percpu, ci);

	KASSERT(pvc != NULL);

	pvc->pvc_vcputime = vt;

	evcnt_attach_dynamic(&pvc->pvc_tsc_backwards_evcnt,
	    EVCNT_TYPE_INTR, NULL, xname, "cpu tsc ran backwards");
	evcnt_attach_dynamic(&pvc->pvc_tsc_delta_negative_evcnt,
	    EVCNT_TYPE_INTR, NULL, xname, "tsc delta went negative");
	evcnt_attach_dynamic(&pvc->pvc_raw_systime_wraparound_evcnt,
	    EVCNT_TYPE_INTR, NULL, xname, "raw systime wrapped around");
	evcnt_attach_dynamic(&pvc->pvc_raw_systime_backwards_evcnt,
	    EVCNT_TYPE_INTR, NULL, xname, "raw systime went backwards");
}

/*
 * pvclock_init()
 *
 *	Initialize the pvclock.  Must follow pvclock_alloc and
 *	pvclock_cpu_init on all CPUs.
 */
void
pvclock_init(void)
{

	tc_init(&pvclock_timecounter);
}

/*
 * pvclock_rdtsc()
 *
 *	Read the local pCPU's tsc.
 */
static inline uint64_t
pvclock_rdtsc(void)
{
	uint32_t lo, hi;

	asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
	return ((uint64_t)hi << 32) | lo;
}

/*
 * pvclock_enter(pvc, tp)
 *
 *	Enter a pvclock read section and store a ticket in *tp, which
 *	the caller must use with pvclock_exit.  Return a pointer to the
 *	current CPU's vcpu_time_info structure.  Caller must already be
 *	bound to the CPU.
 */
static inline volatile struct vcpu_time_info *
pvclock_enter(struct pvclock_cpu *pvc, struct pvclock_ticket *tp)
{
	volatile struct vcpu_time_info *vt = pvc->pvc_vcputime;

	/* Spin until the version is even, i.e. no update is in progress. */
	while (__predict_false(1 & (tp->version = vt->version)))
		SPINLOCK_BACKOFF_HOOK;

	/*
	 * Must read the version before reading the tsc on the local
	 * pCPU.  We are racing only with interruption by the
	 * hypervisor, so no need for a stronger memory barrier.
	 */
	__insn_barrier();

	return vt;
}

/*
 * pvclock_exit(pvc, vt, tp)
 *
 *	Exit a pvclock read section with the ticket in *tp from
 *	pvclock_enter.  Return true on success, false if caller must
 *	retry.
 */
static inline bool
pvclock_exit(struct pvclock_cpu *pvc, volatile struct vcpu_time_info *vt,
    struct pvclock_ticket *tp)
{

	KASSERT(vt == pvc->pvc_vcputime);

	/*
	 * Must read the tsc before re-reading the version on the local
	 * pCPU.  We are racing only with interruption by the
	 * hypervisor, so no need for a stronger memory barrier.
	 */
	__insn_barrier();

	return tp->version == vt->version;
}

/*
 * pvclock_tsc_to_ns_delta(delta_tsc, tsc_to_system_mul, tsc_shift)
 *
 *	Convert a difference in tsc units to a difference in
 *	nanoseconds, given a multiplier and shift for the unit
 *	conversion.
 */
static inline uint64_t
pvclock_tsc_to_ns_delta(uint64_t delta_tsc, uint32_t tsc_to_system_mul,
    int8_t tsc_shift)
{
	uint32_t delta_tsc_hi, delta_tsc_lo;

	if (tsc_shift < 0)
		delta_tsc >>= -tsc_shift;
	else
		delta_tsc <<= tsc_shift;

	delta_tsc_hi = delta_tsc >> 32;
	delta_tsc_lo = delta_tsc & 0xffffffffUL;

	/* d*m/2^32 = (2^32 d_h + d_l)*m/2^32 = d_h*m + (d_l*m)/2^32 */
	return ((uint64_t)delta_tsc_hi * tsc_to_system_mul) +
	    (((uint64_t)delta_tsc_lo * tsc_to_system_mul) >> 32);
}
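/*
 * Worked example with made-up numbers (not from any real hypervisor):
 * for a 4 GHz tsc the hypervisor might advertise tsc_shift = -1 and
 * tsc_to_system_mul = 0x80000000, i.e. 1/2 as a 32.32 fixed-point
 * fraction.  A delta of 4000 tsc cycles then converts as
 *
 *	(4000 >> 1) * 0x80000000 / 2^32 = 2000 / 2 = 1000 ns,
 *
 * which is the expected 1 us for 4000 cycles at 4 GHz.
 */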
/*
 * pvclock_systime_ns(pvc)
 *
 *	Return a snapshot of the pvclock system time plus an adjustment
 *	from the tsc, in units of nanoseconds.  Caller must be bound to
 *	the current CPU.
 */
static uint64_t
pvclock_systime_ns(struct pvclock_cpu *pvc)
{
	volatile struct vcpu_time_info *vt;
	struct pvclock_ticket ticket;
	uint64_t raw_systime_ns, tsc_timestamp, tsc, delta_tsc, delta_ns;
	uint32_t tsc_to_system_mul;
	int8_t tsc_shift;
	uint64_t systime_ns;

	/*
	 * Repeatedly try to read the system time, corresponding tsc
	 * timestamp, and tsc frequency until we get a consistent view.
	 */
	do {
		vt = pvclock_enter(pvc, &ticket);

		/* Grab hypervisor's snapshot of raw system time and tsc. */
		raw_systime_ns = vt->system_time;
		tsc_timestamp = vt->tsc_timestamp;

		/* Get hypervisor's idea of how fast the tsc is counting. */
		tsc_to_system_mul = vt->tsc_to_system_mul;
		tsc_shift = vt->tsc_shift;

		/* Read the CPU's tsc. */
		tsc = pvclock_rdtsc();
	} while (!pvclock_exit(pvc, vt, &ticket));

	/*
	 * Out of paranoia, check whether the tsc has gone backwards
	 * since the pvclock timestamp.
	 *
	 * This shouldn't happen because the hypervisor is supposed to
	 * have read the tsc _before_ writing to the vcpu_time_info
	 * page, _before_ we read the tsc.
	 *
	 * Further, if we switched pCPUs after reading the tsc
	 * timestamp but before reading the CPU's tsc, the hypervisor
	 * had better notify us by updating the version too and forcing
	 * us to retry the vCPU time read.
	 */
	if (__predict_false(tsc < tsc_timestamp)) {
		/*
		 * Notify the console that the CPU's tsc appeared to
		 * run behind the hypervisor's idea of it, and pretend
		 * it hadn't.
		 */
#if PVCLOCK_CLOCK_DEBUG
		/* XXX dtrace hook */
		printf("pvclock cpu tsc %"PRIu64
		    " ran backwards from timestamp %"PRIu64
		    " by %"PRIu64"\n",
		    tsc, tsc_timestamp, tsc_timestamp - tsc);
#endif
		pvc->pvc_tsc_backwards_evcnt.ev_count++;
		delta_ns = delta_tsc = 0;
	} else {
		/* Find how far the CPU's tsc has advanced. */
		delta_tsc = tsc - tsc_timestamp;

		/* Convert the tsc delta to a nanosecond delta. */
		delta_ns = pvclock_tsc_to_ns_delta(delta_tsc,
		    tsc_to_system_mul, tsc_shift);
	}

	/*
	 * Notify the console if the delta computation yielded a
	 * negative, and pretend it hadn't.
	 *
	 * This doesn't make sense but I include it out of paranoia.
	 */
	if (__predict_false((int64_t)delta_ns < 0)) {
#if PVCLOCK_CLOCK_DEBUG
		/* XXX dtrace hook */
		printf("pvclock tsc delta in ns went negative: %"PRId64"\n",
		    (int64_t)delta_ns);
#endif
		pvc->pvc_tsc_delta_negative_evcnt.ev_count++;
		delta_ns = 0;
	}

	/*
	 * Compute the TSC-adjusted system time.
	 */
	systime_ns = raw_systime_ns + delta_ns;

	/*
	 * Notify the console if the addition wrapped around.
	 *
	 * This shouldn't happen because system time should be relative
	 * to a reasonable reference point, not centuries in the past.
	 * (2^64 ns is approximately half a millennium.)
	 */
	if (__predict_false(systime_ns < raw_systime_ns)) {
#if PVCLOCK_CLOCK_DEBUG
		/* XXX dtrace hook */
		printf("pvclock raw systime + tsc delta wrapped around:"
		    " %"PRIu64" + %"PRIu64" = %"PRIu64"\n",
		    raw_systime_ns, delta_ns, systime_ns);
#endif
		pvc->pvc_raw_systime_wraparound_evcnt.ev_count++;
	}

	/*
	 * Notify the console if the TSC-adjusted pvclock system time
	 * appears to have gone backwards, and pretend we had gone
	 * forward.  This seems to happen pretty regularly under load.
	 */
	if (__predict_false(pvc->pvc_last_systime_ns > systime_ns)) {
#if PVCLOCK_CLOCK_DEBUG
		/* XXX dtrace hook */
		printf("pvclock raw systime + tsc delta went backwards:"
		    " %"PRIu64" > %"PRIu64"\n",
		    pvc->pvc_last_systime_ns, systime_ns);
		printf(" raw_systime_ns=%"PRIu64"\n tsc_timestamp=%"PRIu64"\n"
		    " tsc=%"PRIu64"\n tsc_to_system_mul=%"PRIu32"\n"
		    " tsc_shift=%"PRId8"\n delta_tsc=%"PRIu64"\n"
		    " delta_ns=%"PRIu64"\n",
		    raw_systime_ns, tsc_timestamp, tsc, tsc_to_system_mul,
		    tsc_shift, delta_tsc, delta_ns);
#endif
		pvc->pvc_raw_systime_backwards_evcnt.ev_count++;
		systime_ns = pvc->pvc_last_systime_ns + 1;
	}

	/* Remember the TSC-adjusted pvclock system time. */
	pvc->pvc_last_systime_ns = systime_ns;

	/* And we're done: return the TSC-adjusted systime in nanoseconds. */
	return systime_ns;
}

/*
 * pvclock_raw_systime_ns()
 *
 *	Return a snapshot of the current pvclock system time to the
 *	resolution of the hypervisor tick, in units of nanoseconds.
 */
uint64_t
pvclock_raw_systime_ns(void)
{
	struct pvclock_cpu *pvc;
	volatile struct vcpu_time_info *vt;
	struct pvclock_ticket ticket;
	uint64_t raw_systime_ns;

	pvc = percpu_getref(pvclock_percpu);
	do {
		vt = pvclock_enter(pvc, &ticket);
		raw_systime_ns = vt->system_time;
	} while (!pvclock_exit(pvc, vt, &ticket));
	percpu_putref(pvclock_percpu);

	return raw_systime_ns;
}

/*
 * pvclock_global_systime_ns()
 *
 *	Return a global monotonic view of the system time in
 *	nanoseconds, computed by the per-CPU pvclock raw system time
 *	plus an rdtsc adjustment, and advance the view of the system
 *	time for all other CPUs.
 */
static uint64_t
pvclock_global_systime_ns(void)
{
	struct pvclock_cpu *pvc;
	uint64_t local, global, result;

	/*
	 * Find the local timecount on this CPU, and make sure it does
	 * not precede the latest global timecount witnessed so far by
	 * any CPU.  If it does, add to the local CPU's skew from the
	 * fastest CPU.
	 *
	 * XXX Can we avoid retrying if the CAS fails?
	 */
	pvc = percpu_getref(pvclock_percpu);
	do {
		local = pvclock_systime_ns(pvc);
		local += pvc->pvc_systime_ns_skew;
		global = pvclock_global_systime_ns_stamp;
		if (__predict_false(local < global + 1)) {
			result = global + 1;
			pvc->pvc_systime_ns_skew += global + 1 - local;
		} else {
			result = local;
		}
	} while (atomic_cas_64(&pvclock_global_systime_ns_stamp, global,
	    result) != global);
	percpu_putref(pvclock_percpu);

	return result;
}
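/*
 * Worked example with made-up numbers: suppose the global stamp is
 * 1000 and this CPU, with zero accumulated skew, reads a local time of
 * 990.  Since 990 < 1001, we publish result = 1001, record a skew of
 * 11, and CAS the stamp from 1000 to 1001; later reads on this CPU add
 * the skew so they stay ahead of 1000 even if the local pvclock lags.
 */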
/*
 * pvclock_get_timecount(tc)
 *
 *	Return the low 32 bits of a global monotonic view of the
 *	pvclock system time.
 */
static unsigned
pvclock_get_timecount(struct timecounter *tc)
{

	KASSERT(tc == &pvclock_timecounter);

	return (unsigned)pvclock_global_systime_ns();
}
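/*
 * Note on truncation to 32 bits: with tc_frequency = 1 GHz and
 * tc_counter_mask = ~0U, the timecount wraps every 2^32 ns, roughly
 * 4.3 seconds.  The timecounter(9) framework copes with that as long
 * as tc_windup runs more often than the wrap period, which the normal
 * hardclock rate comfortably guarantees.
 */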