Commit | Line | Data |
---|---|---|
15c84731 JF |
1 | /* |
2 | * Xen time implementation. | |
3 | * | |
4 | * This is implemented in terms of a clocksource driver which uses | |
5 | * the hypervisor clock as a nanosecond timebase, and a clockevent | |
6 | * driver which uses the hypervisor's timer mechanism. | |
7 | * | |
8 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 | |
9 | */ | |
10 | #include <linux/kernel.h> | |
11 | #include <linux/interrupt.h> | |
12 | #include <linux/clocksource.h> | |
13 | #include <linux/clockchips.h> | |
f91a8b44 | 14 | #include <linux/kernel_stat.h> |
f595ec96 | 15 | #include <linux/math64.h> |
5a0e3ad6 | 16 | #include <linux/gfp.h> |
c9d76a24 | 17 | #include <linux/slab.h> |
5584880e | 18 | #include <linux/pvclock_gtod.h> |
15c84731 | 19 | |
1c7b67f7 | 20 | #include <asm/pvclock.h> |
15c84731 JF |
21 | #include <asm/xen/hypervisor.h> |
22 | #include <asm/xen/hypercall.h> | |
23 | ||
24 | #include <xen/events.h> | |
409771d2 | 25 | #include <xen/features.h> |
15c84731 JF |
26 | #include <xen/interface/xen.h> |
27 | #include <xen/interface/vcpu.h> | |
28 | ||
29 | #include "xen-ops.h" | |
30 | ||
/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP	100000

/* Length of one scheduler tick, in nanoseconds. */
#define NS_PER_TICK	(1000000000LL / HZ)

/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);

/* unused ns of stolen time */
static DEFINE_PER_CPU(u64, xen_residual_stolen);
f91a8b44 | 40 | |
/*
 * Account time "stolen" from this VCPU by the hypervisor.
 *
 * Compares the current runstate times against the previous per-cpu
 * snapshot: time spent runnable-but-not-running or offline since then
 * was stolen from us.  Whole ticks are reported via
 * account_steal_ticks(); the sub-tick remainder is carried over in
 * xen_residual_stolen for the next call.
 */
static void do_stolen_accounting(void)
{
	struct vcpu_runstate_info state;
	struct vcpu_runstate_info *snap;
	s64 runnable, offline, stolen;
	cputime_t ticks;

	xen_get_runstate_snapshot(&state);

	/* We are only expected to run this while the VCPU is running. */
	WARN_ON(state.state != RUNSTATE_running);

	snap = this_cpu_ptr(&xen_runstate_snapshot);

	/* work out how much time the VCPU has not been runn*ing* */
	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];

	*snap = state;

	/* Add the appropriate number of ticks of stolen time,
	   including any left-overs from last time. */
	stolen = runnable + offline + __this_cpu_read(xen_residual_stolen);

	/* Defensive clamp: never account a negative amount. */
	if (stolen < 0)
		stolen = 0;

	/* Split into whole ticks; the remainder is written back into
	   'stolen' by iter_div_u64_rem and saved for next time. */
	ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
	__this_cpu_write(xen_residual_stolen, stolen);
	account_steal_ticks(ticks);
}
71 | ||
e93ef949 | 72 | /* Get the TSC speed from Xen */ |
409771d2 | 73 | static unsigned long xen_tsc_khz(void) |
15c84731 | 74 | { |
3807f345 | 75 | struct pvclock_vcpu_time_info *info = |
15c84731 JF |
76 | &HYPERVISOR_shared_info->vcpu_info[0].time; |
77 | ||
3807f345 | 78 | return pvclock_tsc_khz(info); |
15c84731 JF |
79 | } |
80 | ||
/*
 * Read the current time from the Xen pvclock, as a nanosecond count.
 *
 * Also installed as pv sched_clock (see xen_time_ops), so the notrace
 * preempt primitives are used; preemption is disabled so this CPU's
 * xen_vcpu time info is read consistently.
 */
cycle_t xen_clocksource_read(void)
{
	struct pvclock_vcpu_time_info *src;
	cycle_t ret;

	preempt_disable_notrace();
	src = &__this_cpu_read(xen_vcpu)->time;
	ret = pvclock_clocksource_read(src);
	preempt_enable_notrace();
	return ret;
}
92 | ||
/* Clocksource ->read callback; the clocksource argument is unused. */
static cycle_t xen_clocksource_get_cycles(struct clocksource *cs)
{
	return xen_clocksource_read();
}
97 | ||
15c84731 JF |
98 | static void xen_read_wallclock(struct timespec *ts) |
99 | { | |
1c7b67f7 GH |
100 | struct shared_info *s = HYPERVISOR_shared_info; |
101 | struct pvclock_wall_clock *wall_clock = &(s->wc); | |
102 | struct pvclock_vcpu_time_info *vcpu_time; | |
15c84731 | 103 | |
1c7b67f7 GH |
104 | vcpu_time = &get_cpu_var(xen_vcpu)->time; |
105 | pvclock_read_wallclock(wall_clock, vcpu_time, ts); | |
106 | put_cpu_var(xen_vcpu); | |
15c84731 JF |
107 | } |
108 | ||
/* x86_platform.get_wallclock hook: report Xen's wallclock. */
static void xen_get_wallclock(struct timespec *now)
{
	xen_read_wallclock(now);
}
15c84731 | 113 | |
/*
 * x86_platform.set_wallclock hook: setting the RTC this way is not
 * supported, so always report failure.  (Dom0 keeps the native
 * set_wallclock and writes the hardware RTC directly.)
 */
static int xen_set_wallclock(const struct timespec *now)
{
	return -1;
}
118 | ||
/*
 * pvclock_gtod notifier: push the kernel's current time of day to the
 * Xen wallclock via the XENPF_settime platform hypercall.
 *
 * @was_set is non-zero when the clock was explicitly set; otherwise
 * the hypercall is rate-limited to once every 11 minutes, mirroring
 * the kernel's periodic RTC synchronization.
 */
static int xen_pvclock_gtod_notify(struct notifier_block *nb,
				   unsigned long was_set, void *priv)
{
	/* Protected by the calling core code serialization */
	static struct timespec next_sync;

	struct xen_platform_op op;
	struct timespec now;

	now = __current_kernel_time();

	/*
	 * We only take the expensive HV call when the clock was set
	 * or when the 11 minutes RTC synchronization time elapsed.
	 */
	if (!was_set && timespec_compare(&now, &next_sync) < 0)
		return NOTIFY_OK;

	op.cmd = XENPF_settime;
	op.u.settime.secs = now.tv_sec;
	op.u.settime.nsecs = now.tv_nsec;
	op.u.settime.system_time = xen_clocksource_read();

	/* Best effort: the result of the hypercall is ignored. */
	(void)HYPERVISOR_platform_op(&op);

	/*
	 * Move the next drift compensation time 11 minutes
	 * ahead. That's emulating the sync_cmos_clock() update for
	 * the hardware RTC.
	 */
	next_sync = now;
	next_sync.tv_sec += 11 * 60;

	return NOTIFY_OK;
}
154 | ||
/* Registered (for the initial domain only, see xen_time_init) to keep
   Xen's wallclock in sync with kernel time. */
static struct notifier_block xen_pvclock_gtod_notifier = {
	.notifier_call = xen_pvclock_gtod_notify,
};
158 | ||
/* Clocksource backed by the Xen pvclock.  Rating 400 by default;
   lowered to 275 for dom0 in xen_time_init. */
static struct clocksource xen_clocksource __read_mostly = {
	.name = "xen",
	.rating = 400,
	.read = xen_clocksource_get_cycles,
	.mask = ~0,
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
166 | ||
167 | /* | |
168 | Xen clockevent implementation | |
169 | ||
170 | Xen has two clockevent implementations: | |
171 | ||
172 | The old timer_op one works with all released versions of Xen prior | |
173 | to version 3.0.4. This version of the hypervisor provides a | |
174 | single-shot timer with nanosecond resolution. However, sharing the | |
175 | same event channel is a 100Hz tick which is delivered while the | |
176 | vcpu is running. We don't care about or use this tick, but it will | |
177 | cause the core time code to think the timer fired too soon, and | |
178 | will end up resetting it each time. It could be filtered, but | |
179 | doing so has complications when the ktime clocksource is not yet | |
180 | the xen clocksource (ie, at boot time). | |
181 | ||
182 | The new vcpu_op-based timer interface allows the tick timer period | |
183 | to be changed or turned off. The tick timer is not useful as a | |
184 | periodic timer because events are only delivered to running vcpus. | |
185 | The one-shot timer can report when a timeout is in the past, so | |
186 | set_next_event is capable of returning -ETIME when appropriate. | |
187 | This interface is used when available. | |
188 | */ | |
189 | ||
190 | ||
191 | /* | |
192 | Get a hypervisor absolute time. In theory we could maintain an | |
193 | offset between the kernel's time and the hypervisor's time, and | |
194 | apply that to a kernel's absolute timeout. Unfortunately the | |
195 | hypervisor and kernel times can drift even if the kernel is using | |
196 | the Xen clocksource, because ntp can warp the kernel's clocksource. | |
197 | */ | |
198 | static s64 get_abs_timeout(unsigned long delta) | |
199 | { | |
200 | return xen_clocksource_read() + delta; | |
201 | } | |
202 | ||
/* Clockevent shutdown for the timer_op interface: a timeout of 0
   cancels any pending singleshot timer. */
static int xen_timerop_shutdown(struct clock_event_device *evt)
{
	/* cancel timeout */
	HYPERVISOR_set_timer_op(0);

	return 0;
}
210 | ||
/*
 * Program the next event using the old timer_op hypercall.
 * Only meaningful in oneshot mode; a failed hypercall is fatal.
 */
static int xen_timerop_set_next_event(unsigned long delta,
				      struct clock_event_device *evt)
{
	WARN_ON(!clockevent_state_oneshot(evt));

	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
		BUG();

	/* We may have missed the deadline, but there's no real way of
	   knowing for sure.  If the event was in the past, then we'll
	   get an immediate interrupt. */

	return 0;
}
225 | ||
/* Clockevent device using the old timer_op interface (hypervisors
   prior to Xen 3.0.4 — see the comment block above). */
static const struct clock_event_device xen_timerop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	/* mult=1/shift=0: deltas are already in nanoseconds. */
	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_state_shutdown = xen_timerop_shutdown,
	.set_next_event = xen_timerop_set_next_event,
};
240 | ||
955381dd VK |
241 | static int xen_vcpuop_shutdown(struct clock_event_device *evt) |
242 | { | |
243 | int cpu = smp_processor_id(); | |
15c84731 | 244 | |
955381dd VK |
245 | if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) || |
246 | HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) | |
247 | BUG(); | |
15c84731 | 248 | |
955381dd VK |
249 | return 0; |
250 | } | |
251 | ||
/* Enter oneshot mode: turn off the hypervisor's periodic tick for
   this CPU; the singleshot timer is armed via set_next_event. */
static int xen_vcpuop_set_oneshot(struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
		BUG();

	return 0;
}
261 | ||
/*
 * Program the next event via VCPUOP_set_singleshot_timer.  May return
 * -ETIME when the timeout is already in the past (see the interface
 * comment above); any other hypercall failure is fatal.
 */
static int xen_vcpuop_set_next_event(unsigned long delta,
				     struct clock_event_device *evt)
{
	int cpu = smp_processor_id();
	struct vcpu_set_singleshot_timer single;
	int ret;

	WARN_ON(!clockevent_state_oneshot(evt));

	single.timeout_abs_ns = get_abs_timeout(delta);
	/* Report a timeout in the past as -ETIME instead of firing. */
	single.flags = VCPU_SSHOTTMR_future;

	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);

	/* -ETIME is an expected outcome; anything else is a bug. */
	BUG_ON(ret != 0 && ret != -ETIME);

	return ret;
}
280 | ||
/* Clockevent device using the newer vcpu_op timer interface;
   preferred when the hypervisor supports it (see xen_time_init). */
static const struct clock_event_device xen_vcpuop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	/* mult=1/shift=0: deltas are already in nanoseconds. */
	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_state_shutdown = xen_vcpuop_shutdown,
	.set_state_oneshot = xen_vcpuop_set_oneshot,
	.set_next_event = xen_vcpuop_set_next_event,
};
296 | ||
/* Template for per-cpu clockevent devices.  Defaults to the timer_op
   implementation; xen_time_init switches to vcpuop when available. */
static const struct clock_event_device *xen_clockevent =
	&xen_timerop_clockevent;

/* Per-cpu clockevent device plus stable storage for its IRQ name. */
struct xen_clock_event_device {
	struct clock_event_device evt;
	char name[16];
};
/* evt.irq == -1 means "no timer IRQ bound yet" (see xen_setup_timer). */
static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 };
15c84731 JF |
305 | |
/*
 * VIRQ_TIMER handler: run this CPU's clockevent handler (if one has
 * been registered) and update stolen-time accounting.
 */
static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = this_cpu_ptr(&xen_clock_events.evt);
	irqreturn_t ret;

	ret = IRQ_NONE;
	if (evt->event_handler) {
		evt->event_handler(evt);
		ret = IRQ_HANDLED;
	}

	do_stolen_accounting();

	return ret;
}
321 | ||
/*
 * Unbind @cpu's timer IRQ and mark the slot free.  The boot CPU's
 * timer is never torn down.
 */
void xen_teardown_timer(int cpu)
{
	struct clock_event_device *evt;
	BUG_ON(cpu == 0);
	evt = &per_cpu(xen_clock_events, cpu).evt;

	if (evt->irq >= 0) {
		unbind_from_irqhandler(evt->irq, NULL);
		evt->irq = -1;	/* allow a later xen_setup_timer() */
	}
}
333 | ||
/*
 * Bind VIRQ_TIMER for @cpu and initialise its per-cpu clockevent
 * device from the selected template (xen_clockevent).  Registration
 * with the clockevents core happens separately in
 * xen_setup_cpu_clockevents().
 */
void xen_setup_timer(int cpu)
{
	struct xen_clock_event_device *xevt = &per_cpu(xen_clock_events, cpu);
	struct clock_event_device *evt = &xevt->evt;
	int irq;

	/* Double setup is a bug; recover by tearing down the old IRQ. */
	WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu);
	if (evt->irq >= 0)
		xen_teardown_timer(cpu);

	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

	snprintf(xevt->name, sizeof(xevt->name), "timer%d", cpu);

	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
				      IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
				      IRQF_FORCE_RESUME|IRQF_EARLY_RESUME,
				      xevt->name, NULL);
	/* Best effort: give timer events the highest event priority. */
	(void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);

	memcpy(evt, xen_clockevent, sizeof(*evt));

	evt->cpumask = cpumask_of(cpu);
	evt->irq = irq;
}
359 | ||
d68d82af | 360 | |
/* Register the current CPU's clockevent device with the core. */
void xen_setup_cpu_clockevents(void)
{
	clockevents_register_device(this_cpu_ptr(&xen_clock_events.evt));
}
365 | ||
/*
 * Resume hook: restart the pvclock and, when the vcpuop clockevent is
 * in use, re-stop the hypervisor's periodic tick on every online CPU
 * (the timers were implicitly reset across suspend).
 */
void xen_timer_resume(void)
{
	int cpu;

	pvclock_resume();

	/* Nothing more to do for the timer_op implementation. */
	if (xen_clockevent != &xen_vcpuop_clockevent)
		return;

	for_each_online_cpu(cpu) {
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
	}
}
380 | ||
/* pv_ops time hooks: use the Xen pvclock as sched_clock. */
static const struct pv_time_ops xen_time_ops __initconst = {
	.sched_clock = xen_clocksource_read,
};
384 | ||
/*
 * Boot-time time setup: register the Xen clocksource, detect which
 * clockevent interface the hypervisor supports, set the initial
 * system time from Xen's wallclock and bring up the boot CPU's timer.
 */
static void __init xen_time_init(void)
{
	int cpu = smp_processor_id();
	struct timespec tp;

	/* As Dom0 is never moved, no penalty on using TSC there */
	if (xen_initial_domain())
		xen_clocksource.rating = 275;

	clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
		/* Successfully turned off 100Hz tick, so we have the
		   vcpuop-based timer interface */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;
	}

	/* Set initial system time with full resolution */
	xen_read_wallclock(&tp);
	do_settimeofday(&tp);

	/* Xen time is TSC-based (see xen_tsc_khz), so force the TSC
	   feature bit on. */
	setup_force_cpu_cap(X86_FEATURE_TSC);

	xen_setup_runstate_info(cpu);
	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();

	/* Only dom0 pushes kernel time back to the Xen wallclock. */
	if (xen_initial_domain())
		pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
}
409771d2 | 416 | |
/*
 * Install the Xen time hooks for PV guests: timer init, sched_clock,
 * TSC calibration and the wallclock accessors.
 */
void __init xen_init_time_ops(void)
{
	pv_time_ops = xen_time_ops;

	x86_init.timers.timer_init = xen_time_init;
	/* Per-cpu clockevent setup is done explicitly (xen_time_init
	   covers the boot CPU), so stub out the generic hooks. */
	x86_init.timers.setup_percpu_clockev = x86_init_noop;
	x86_cpuinit.setup_percpu_clockev = x86_init_noop;

	x86_platform.calibrate_tsc = xen_tsc_khz;
	x86_platform.get_wallclock = xen_get_wallclock;
	/* Dom0 uses the native method to set the hardware RTC. */
	if (!xen_initial_domain())
		x86_platform.set_wallclock = xen_set_wallclock;
}
431 | ||
#ifdef CONFIG_XEN_PVHVM
/* Per-cpu clockevent registration for PVHVM guests; called via
   x86_cpuinit.setup_percpu_clockev (see xen_hvm_init_time_ops). */
static void xen_hvm_setup_cpu_clockevents(void)
{
	int cpu = smp_processor_id();
	xen_setup_runstate_info(cpu);
	/*
	 * xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence
	 * doing it xen_hvm_cpu_notify (which gets called by smp_init during
	 * early bootup and also during CPU hotplug events).
	 */
	xen_setup_cpu_clockevents();
}
444 | ||
fb6ce5de | 445 | void __init xen_hvm_init_time_ops(void) |
409771d2 SS |
446 | { |
447 | /* vector callback is needed otherwise we cannot receive interrupts | |
31e7e931 SS |
448 | * on cpu > 0 and at this point we don't know how many cpus are |
449 | * available */ | |
450 | if (!xen_have_vector_callback) | |
409771d2 SS |
451 | return; |
452 | if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { | |
453 | printk(KERN_INFO "Xen doesn't support pvclock on HVM," | |
454 | "disable pv timer\n"); | |
455 | return; | |
456 | } | |
457 | ||
458 | pv_time_ops = xen_time_ops; | |
459 | x86_init.timers.setup_percpu_clockev = xen_time_init; | |
460 | x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents; | |
461 | ||
462 | x86_platform.calibrate_tsc = xen_tsc_khz; | |
463 | x86_platform.get_wallclock = xen_get_wallclock; | |
464 | x86_platform.set_wallclock = xen_set_wallclock; | |
465 | } | |
ca65f9fc | 466 | #endif |