/*
 * Xen time implementation.
 *
 * This is implemented in terms of a clocksource driver which uses
 * the hypervisor clock as a nanosecond timebase, and a clockevent
 * driver which uses the hypervisor's timer mechanism.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>
#include <linux/math64.h>

#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

#define XEN_SHIFT 22

/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP	100000
#define NS_PER_TICK	(1000000000LL / HZ)

static cycle_t xen_clocksource_read(void);

/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);

/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, residual_stolen);
static DEFINE_PER_CPU(u64, residual_blocked);

/* return a consistent snapshot of 64-bit time/counter value */
static u64 get64(const u64 *p)
{
	u64 ret;

	if (BITS_PER_LONG < 64) {
		u32 *p32 = (u32 *)p;
		u32 h, l;

		/*
		 * Read high then low, and then make sure high is
		 * still the same; this will only loop if low wraps
		 * and carries into high.
		 * XXX some clean way to make this endian-proof?
		 */
		do {
			h = p32[1];
			barrier();
			l = p32[0];
			barrier();
		} while (p32[1] != h);

		ret = (((u64)h) << 32) | l;
	} else
		ret = *p;

	return ret;
}

/*
 * Runstate accounting
 */
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
	u64 state_time;
	struct vcpu_runstate_info *state;

	BUG_ON(preemptible());

	state = &__get_cpu_var(runstate);

	/*
	 * The runstate info is always updated by the hypervisor on
	 * the current CPU, so there's no need to use anything
	 * stronger than a compiler barrier when fetching it.
	 */
	do {
		state_time = get64(&state->state_entry_time);
		barrier();
		*res = *state;
		barrier();
	} while (get64(&state->state_entry_time) != state_time);
}

/* return true when a vcpu could run but has no real cpu to run on */
bool xen_vcpu_stolen(int vcpu)
{
	return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
}

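/*
 * Tell Xen where to keep this vcpu's runstate info, so the hypervisor
 * updates it in place and get_runstate_snapshot() can read it without
 * a hypercall.
 */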
static void setup_runstate_info(int cpu)
{
	struct vcpu_register_runstate_memory_area area;

	area.addr.v = &per_cpu(runstate, cpu);

	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
			       cpu, &area))
		BUG();
}

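/*
 * Called from the timer interrupt: convert the growth in runnable +
 * offline runstate time since the last snapshot into "stolen" ticks,
 * and the growth in blocked time into idle ticks.  Sub-tick remainders
 * are carried over in residual_stolen/residual_blocked.
 */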
static void do_stolen_accounting(void)
{
	struct vcpu_runstate_info state;
	struct vcpu_runstate_info *snap;
	s64 blocked, runnable, offline, stolen;
	cputime_t ticks;

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	snap = &__get_cpu_var(runstate_snapshot);

	/* work out how much time the VCPU has spent not running */
	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];

	*snap = state;

	/* Add the appropriate number of ticks of stolen time,
	   including any left-overs from last time.  Passing NULL to
	   account_steal_time accounts the time as stolen. */
	stolen = runnable + offline + __get_cpu_var(residual_stolen);

	if (stolen < 0)
		stolen = 0;

	ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
	__get_cpu_var(residual_stolen) = stolen;
	account_steal_time(NULL, ticks);

	/* Add the appropriate number of ticks of blocked time,
	   including any left-overs from last time.  Passing idle to
	   account_steal_time accounts the time as idle/wait. */
	blocked += __get_cpu_var(residual_blocked);

	if (blocked < 0)
		blocked = 0;

	ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
	__get_cpu_var(residual_blocked) = blocked;
	account_steal_time(idle_task(smp_processor_id()), ticks);
}

/*
 * Xen sched_clock implementation.  Returns the number of unstolen
 * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
 * states.
 */
unsigned long long xen_sched_clock(void)
{
	struct vcpu_runstate_info state;
	cycle_t now;
	u64 ret;
	s64 offset;

	/*
	 * Ideally sched_clock should be called on a per-cpu basis
	 * anyway, so preempt should already be disabled, but that's
	 * not current practice at the moment.
	 */
	preempt_disable();

	now = xen_clocksource_read();

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

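	/*
	 * state_entry_time is when this vcpu entered its current state
	 * (RUNNING, per the WARN_ON above), so "now - entry" is the part
	 * of the current running period not yet folded into
	 * state.time[RUNSTATE_running].
	 */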
	offset = now - state.state_entry_time;
	if (offset < 0)
		offset = 0;

	ret = state.time[RUNSTATE_blocked] +
		state.time[RUNSTATE_running] +
		offset;

	preempt_enable();

	return ret;
}


/* Get the CPU speed from Xen */
unsigned long xen_cpu_khz(void)
{
	u64 xen_khz = 1000000ULL << 32;
	const struct pvclock_vcpu_time_info *info =
		&HYPERVISOR_shared_info->vcpu_info[0].time;

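	/*
	 * pvclock converts TSC deltas to ns roughly as
	 * ns = (tsc << tsc_shift) * tsc_to_system_mul >> 32, so the TSC
	 * rate in kHz is (10^6 << 32) / tsc_to_system_mul, corrected by
	 * tsc_shift in the opposite direction.
	 */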
	do_div(xen_khz, info->tsc_to_system_mul);
	if (info->tsc_shift < 0)
		xen_khz <<= -info->tsc_shift;
	else
		xen_khz >>= info->tsc_shift;

	return xen_khz;
}

static cycle_t xen_clocksource_read(void)
{
	struct pvclock_vcpu_time_info *src;
	cycle_t ret;

	src = &get_cpu_var(xen_vcpu)->time;
	ret = pvclock_clocksource_read(src);
	put_cpu_var(xen_vcpu);
	return ret;
}

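/*
 * Xen publishes the wall-clock time as of system time zero in the
 * shared info page; pvclock_read_wallclock() adds the current per-vcpu
 * system time to that base to produce the current time of day.
 */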
static void xen_read_wallclock(struct timespec *ts)
{
	struct shared_info *s = HYPERVISOR_shared_info;
	struct pvclock_wall_clock *wall_clock = &(s->wc);
	struct pvclock_vcpu_time_info *vcpu_time;

	vcpu_time = &get_cpu_var(xen_vcpu)->time;
	pvclock_read_wallclock(wall_clock, vcpu_time, ts);
	put_cpu_var(xen_vcpu);
}

unsigned long xen_get_wallclock(void)
{
	struct timespec ts;

	xen_read_wallclock(&ts);
	return ts.tv_sec;
}

int xen_set_wallclock(unsigned long now)
{
	/* do nothing for domU */
	return -1;
}

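/*
 * The generic timekeeping code computes ns = (cycles * mult) >> shift.
 * xen_clocksource_read() already returns nanoseconds, so mult = 2^shift
 * makes that conversion an identity while leaving NTP some headroom to
 * adjust mult.
 */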
static struct clocksource xen_clocksource __read_mostly = {
	.name = "xen",
	.rating = 400,
	.read = xen_clocksource_read,
	.mask = ~0,
	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */
	.shift = XEN_SHIFT,
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};

/*
  Xen clockevent implementation

  Xen has two clockevent implementations:

  The old timer_op one works with all released versions of Xen prior
  to version 3.0.4.  This version of the hypervisor provides a
  single-shot timer with nanosecond resolution.  However, a 100Hz
  tick, delivered while the vcpu is running, shares the same event
  channel.  We don't care about or use this tick, but it will cause
  the core time code to think the timer fired too soon, and it will
  end up resetting it each time.  It could be filtered, but doing so
  has complications when the ktime clocksource is not yet the xen
  clocksource (ie, at boot time).

  The new vcpu_op-based timer interface allows the tick timer period
  to be changed or turned off.  The tick timer is not useful as a
  periodic timer because events are only delivered to running vcpus.
  The one-shot timer can report when a timeout is in the past, so
  set_next_event is capable of returning -ETIME when appropriate.
  This interface is used when available.
*/


/*
  Get a hypervisor absolute time.  In theory we could maintain an
  offset between the kernel's time and the hypervisor's time, and
  apply that to a kernel's absolute timeout.  Unfortunately the
  hypervisor and kernel times can drift even if the kernel is using
  the Xen clocksource, because ntp can warp the kernel's clocksource.
*/
static s64 get_abs_timeout(unsigned long delta)
{
	return xen_clocksource_read() + delta;
}

static void xen_timerop_set_mode(enum clock_event_mode mode,
				 struct clock_event_device *evt)
{
	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		/* unsupported */
		WARN_ON(1);
		break;

	case CLOCK_EVT_MODE_ONESHOT:
	case CLOCK_EVT_MODE_RESUME:
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		HYPERVISOR_set_timer_op(0);  /* cancel timeout */
		break;
	}
}

static int xen_timerop_set_next_event(unsigned long delta,
				      struct clock_event_device *evt)
{
	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
		BUG();

	/* We may have missed the deadline, but there's no real way of
	   knowing for sure.  If the event was in the past, then we'll
	   get an immediate interrupt. */

	return 0;
}

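/*
 * The clockevent core converts a requested nanosecond delta to device
 * units as (ns * mult) >> shift, so mult = 1 and shift = 0 mean
 * set_next_event() is handed its delta directly in nanoseconds;
 * min_delta_ns = TIMER_SLOP matches how early Xen may fire (see above).
 */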
static const struct clock_event_device xen_timerop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_timerop_set_mode,
	.set_next_event = xen_timerop_set_next_event,
};



static void xen_vcpuop_set_mode(enum clock_event_mode mode,
				struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		WARN_ON(1);	/* unsupported */
		break;

	case CLOCK_EVT_MODE_ONESHOT:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
		    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;
	case CLOCK_EVT_MODE_RESUME:
		break;
	}
}

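/*
 * VCPU_SSHOTTMR_future asks Xen to reject a timeout that is already in
 * the past with -ETIME rather than silently dropping it, which is the
 * contract set_next_event() has with the clockevent core.
 */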
static int xen_vcpuop_set_next_event(unsigned long delta,
				     struct clock_event_device *evt)
{
	int cpu = smp_processor_id();
	struct vcpu_set_singleshot_timer single;
	int ret;

	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	single.timeout_abs_ns = get_abs_timeout(delta);
	single.flags = VCPU_SSHOTTMR_future;

	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);

	BUG_ON(ret != 0 && ret != -ETIME);

	return ret;
}

static const struct clock_event_device xen_vcpuop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_vcpuop_set_mode,
	.set_next_event = xen_vcpuop_set_next_event,
};

static const struct clock_event_device *xen_clockevent =
	&xen_timerop_clockevent;
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);

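/*
 * Per-cpu VIRQ_TIMER handler: run whatever handler the clockevent core
 * has registered, then fold the time stolen since the last interrupt
 * into the scheduler statistics.
 */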
static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
	irqreturn_t ret;

	ret = IRQ_NONE;
	if (evt->event_handler) {
		evt->event_handler(evt);
		ret = IRQ_HANDLED;
	}

	do_stolen_accounting();

	return ret;
}

void xen_setup_timer(int cpu)
{
	const char *name;
	struct clock_event_device *evt;
	int irq;

	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

	name = kasprintf(GFP_KERNEL, "timer%d", cpu);
	if (!name)
		name = "<timer kasprintf failed>";

	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
				      name, NULL);

	evt = &per_cpu(xen_clock_events, cpu);
	memcpy(evt, xen_clockevent, sizeof(*evt));

	evt->cpumask = cpumask_of_cpu(cpu);
	evt->irq = irq;

	setup_runstate_info(cpu);
}

void xen_setup_cpu_clockevents(void)
{
	BUG_ON(preemptible());

	clockevents_register_device(&__get_cpu_var(xen_clock_events));
}

__init void xen_time_init(void)
{
	int cpu = smp_processor_id();

	clocksource_register(&xen_clocksource);

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
		/* Successfully turned off 100Hz tick, so we have the
		   vcpuop-based timer interface */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;
	}

	/* Set initial system time with full resolution */
	xen_read_wallclock(&xtime);
	set_normalized_timespec(&wall_to_monotonic,
				-xtime.tv_sec, -xtime.tv_nsec);

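	/*
	 * pvclock is driven by the raw TSC, so make sure the TSC feature
	 * stays visible to the rest of the kernel.
	 */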
	setup_force_cpu_cap(X86_FEATURE_TSC);

	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();
}