/*
 * Xen time implementation.
 *
 * This is implemented in terms of a clocksource driver which uses
 * the hypervisor clock as a nanosecond timebase, and a clockevent
 * driver which uses the hypervisor's timer mechanism.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>
#include <linux/math64.h>

#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

#define XEN_SHIFT 22

/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP	100000
#define NS_PER_TICK	(1000000000LL / HZ)

static cycle_t xen_clocksource_read(void);

/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);

/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, residual_stolen);
static DEFINE_PER_CPU(u64, residual_blocked);

/* return a consistent snapshot of 64-bit time/counter value */
static u64 get64(const u64 *p)
{
	u64 ret;

	if (BITS_PER_LONG < 64) {
		u32 *p32 = (u32 *)p;
		u32 h, l;

		/*
		 * Read high then low, and then make sure high is
		 * still the same; this will only loop if low wraps
		 * and carries into high.
		 * XXX some clean way to make this endian-proof?
		 */
		do {
			h = p32[1];
			barrier();
			l = p32[0];
			barrier();
		} while (p32[1] != h);

		ret = (((u64)h) << 32) | l;
	} else
		ret = *p;

	return ret;
}

/*
 * Runstate accounting
 */
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
	u64 state_time;
	struct vcpu_runstate_info *state;

	BUG_ON(preemptible());

	state = &__get_cpu_var(runstate);

	/*
	 * The runstate info is always updated by the hypervisor on
	 * the current CPU, so there's no need to use anything
	 * stronger than a compiler barrier when fetching it.
	 */
	do {
		state_time = get64(&state->state_entry_time);
		barrier();
		*res = *state;
		barrier();
	} while (get64(&state->state_entry_time) != state_time);
}

/* return true when a vcpu could run but has no real cpu to run on */
bool xen_vcpu_stolen(int vcpu)
{
	return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
}

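/*
 * Tell Xen where to keep this vcpu's runstate info, so the hypervisor
 * updates it in place and get_runstate_snapshot() can read it without
 * a hypercall.
 */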
static void setup_runstate_info(int cpu)
{
	struct vcpu_register_runstate_memory_area area;

	area.addr.v = &per_cpu(runstate, cpu);

	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
			       cpu, &area))
		BUG();
}

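/*
 * Called from the timer interrupt: convert the growth in runnable +
 * offline runstate time since the last snapshot into "stolen" ticks,
 * and the growth in blocked time into idle ticks.  Sub-tick remainders
 * are carried over in residual_stolen/residual_blocked.
 */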
static void do_stolen_accounting(void)
{
	struct vcpu_runstate_info state;
	struct vcpu_runstate_info *snap;
	s64 blocked, runnable, offline, stolen;
	cputime_t ticks;

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	snap = &__get_cpu_var(runstate_snapshot);

	/* work out how much time the VCPU has spent not running */
	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];

	*snap = state;

	/* Add the appropriate number of ticks of stolen time,
	   including any left-overs from last time.  Passing NULL to
	   account_steal_time accounts the time as stolen. */
	stolen = runnable + offline + __get_cpu_var(residual_stolen);

	if (stolen < 0)
		stolen = 0;

	ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
	__get_cpu_var(residual_stolen) = stolen;
	account_steal_time(NULL, ticks);

	/* Add the appropriate number of ticks of blocked time,
	   including any left-overs from last time.  Passing idle to
	   account_steal_time accounts the time as idle/wait. */
	blocked += __get_cpu_var(residual_blocked);

	if (blocked < 0)
		blocked = 0;

	ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
	__get_cpu_var(residual_blocked) = blocked;
	account_steal_time(idle_task(smp_processor_id()), ticks);
}

/*
 * Xen sched_clock implementation.  Returns the number of unstolen
 * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
 * states.
 */
unsigned long long xen_sched_clock(void)
{
	struct vcpu_runstate_info state;
	cycle_t now;
	u64 ret;
	s64 offset;

	/*
	 * Ideally sched_clock should be called on a per-cpu basis
	 * anyway, so preempt should already be disabled, but that's
	 * not current practice at the moment.
	 */
	preempt_disable();

	now = xen_clocksource_read();

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

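	/*
	 * state_entry_time is when this vcpu entered its current state
	 * (RUNNING, per the WARN_ON above), so "now - entry" is the part
	 * of the current running period not yet folded into
	 * state.time[RUNSTATE_running].
	 */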
	offset = now - state.state_entry_time;
	if (offset < 0)
		offset = 0;

	ret = state.time[RUNSTATE_blocked] +
		state.time[RUNSTATE_running] +
		offset;

	preempt_enable();

	return ret;
}


/* Get the CPU speed from Xen */
unsigned long xen_cpu_khz(void)
{
	u64 xen_khz = 1000000ULL << 32;
	const struct pvclock_vcpu_time_info *info =
		&HYPERVISOR_shared_info->vcpu_info[0].time;

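	/*
	 * pvclock converts TSC deltas to ns roughly as
	 * ns = (tsc << tsc_shift) * tsc_to_system_mul >> 32, so the TSC
	 * rate in kHz is (10^6 << 32) / tsc_to_system_mul, corrected by
	 * tsc_shift in the opposite direction.
	 */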
	do_div(xen_khz, info->tsc_to_system_mul);
	if (info->tsc_shift < 0)
		xen_khz <<= -info->tsc_shift;
	else
		xen_khz >>= info->tsc_shift;

	return xen_khz;
}

static cycle_t xen_clocksource_read(void)
{
	struct pvclock_vcpu_time_info *src;
	cycle_t ret;

	src = &get_cpu_var(xen_vcpu)->time;
	ret = pvclock_clocksource_read(src);
	put_cpu_var(xen_vcpu);
	return ret;
}

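/*
 * Xen publishes the wall-clock time as of system time zero in the
 * shared info page; pvclock_read_wallclock() adds the current per-vcpu
 * system time to that base to produce the current time of day.
 */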
static void xen_read_wallclock(struct timespec *ts)
{
	struct shared_info *s = HYPERVISOR_shared_info;
	struct pvclock_wall_clock *wall_clock = &(s->wc);
	struct pvclock_vcpu_time_info *vcpu_time;

	vcpu_time = &get_cpu_var(xen_vcpu)->time;
	pvclock_read_wallclock(wall_clock, vcpu_time, ts);
	put_cpu_var(xen_vcpu);
}

unsigned long xen_get_wallclock(void)
{
	struct timespec ts;

	xen_read_wallclock(&ts);
	return ts.tv_sec;
}

int xen_set_wallclock(unsigned long now)
{
	/* do nothing for domU */
	return -1;
}

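/*
 * The generic timekeeping code computes ns = (cycles * mult) >> shift.
 * xen_clocksource_read() already returns nanoseconds, so mult = 2^shift
 * makes that conversion an identity while leaving NTP some headroom to
 * adjust mult.
 */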
static struct clocksource xen_clocksource __read_mostly = {
	.name = "xen",
	.rating = 400,
	.read = xen_clocksource_read,
	.mask = ~0,
	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */
	.shift = XEN_SHIFT,
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};

/*
  Xen clockevent implementation

  Xen has two clockevent implementations:

  The old timer_op one works with all released versions of Xen prior
  to version 3.0.4.  This version of the hypervisor provides a
  single-shot timer with nanosecond resolution.  However, a 100Hz
  tick, delivered while the vcpu is running, shares the same event
  channel.  We don't care about or use this tick, but it will cause
  the core time code to think the timer fired too soon, and it will
  end up resetting it each time.  It could be filtered, but doing so
  has complications when the ktime clocksource is not yet the xen
  clocksource (ie, at boot time).

  The new vcpu_op-based timer interface allows the tick timer period
  to be changed or turned off.  The tick timer is not useful as a
  periodic timer because events are only delivered to running vcpus.
  The one-shot timer can report when a timeout is in the past, so
  set_next_event is capable of returning -ETIME when appropriate.
  This interface is used when available.
*/


/*
  Get a hypervisor absolute time.  In theory we could maintain an
  offset between the kernel's time and the hypervisor's time, and
  apply that to a kernel's absolute timeout.  Unfortunately the
  hypervisor and kernel times can drift even if the kernel is using
  the Xen clocksource, because ntp can warp the kernel's clocksource.
*/
static s64 get_abs_timeout(unsigned long delta)
{
	return xen_clocksource_read() + delta;
}

static void xen_timerop_set_mode(enum clock_event_mode mode,
				 struct clock_event_device *evt)
{
	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		/* unsupported */
		WARN_ON(1);
		break;

	case CLOCK_EVT_MODE_ONESHOT:
	case CLOCK_EVT_MODE_RESUME:
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		HYPERVISOR_set_timer_op(0);  /* cancel timeout */
		break;
	}
}

static int xen_timerop_set_next_event(unsigned long delta,
				      struct clock_event_device *evt)
{
	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
		BUG();

	/* We may have missed the deadline, but there's no real way of
	   knowing for sure.  If the event was in the past, then we'll
	   get an immediate interrupt. */

	return 0;
}

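/*
 * The clockevent core converts a requested nanosecond delta to device
 * units as (ns * mult) >> shift, so mult = 1 and shift = 0 mean
 * set_next_event() is handed its delta directly in nanoseconds;
 * min_delta_ns = TIMER_SLOP matches how early Xen may fire (see above).
 */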
static const struct clock_event_device xen_timerop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_timerop_set_mode,
	.set_next_event = xen_timerop_set_next_event,
};



static void xen_vcpuop_set_mode(enum clock_event_mode mode,
				struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		WARN_ON(1);	/* unsupported */
		break;

	case CLOCK_EVT_MODE_ONESHOT:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
		    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;
	case CLOCK_EVT_MODE_RESUME:
		break;
	}
}

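/*
 * VCPU_SSHOTTMR_future asks Xen to reject a timeout that is already in
 * the past with -ETIME rather than silently dropping it, which is the
 * contract set_next_event() has with the clockevent core.
 */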
static int xen_vcpuop_set_next_event(unsigned long delta,
				     struct clock_event_device *evt)
{
	int cpu = smp_processor_id();
	struct vcpu_set_singleshot_timer single;
	int ret;

	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	single.timeout_abs_ns = get_abs_timeout(delta);
	single.flags = VCPU_SSHOTTMR_future;

	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);

	BUG_ON(ret != 0 && ret != -ETIME);

	return ret;
}

static const struct clock_event_device xen_vcpuop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_vcpuop_set_mode,
	.set_next_event = xen_vcpuop_set_next_event,
};

static const struct clock_event_device *xen_clockevent =
	&xen_timerop_clockevent;
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);

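/*
 * Per-cpu VIRQ_TIMER handler: run whatever handler the clockevent core
 * has registered, then fold the time stolen since the last interrupt
 * into the scheduler statistics.
 */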
static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
	irqreturn_t ret;

	ret = IRQ_NONE;
	if (evt->event_handler) {
		evt->event_handler(evt);
		ret = IRQ_HANDLED;
	}

	do_stolen_accounting();

	return ret;
}

void xen_setup_timer(int cpu)
{
	const char *name;
	struct clock_event_device *evt;
	int irq;

	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

	name = kasprintf(GFP_KERNEL, "timer%d", cpu);
	if (!name)
		name = "<timer kasprintf failed>";

	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
				      name, NULL);

	evt = &per_cpu(xen_clock_events, cpu);
	memcpy(evt, xen_clockevent, sizeof(*evt));

	evt->cpumask = cpumask_of_cpu(cpu);
	evt->irq = irq;

	setup_runstate_info(cpu);
}

void xen_setup_cpu_clockevents(void)
{
	BUG_ON(preemptible());

	clockevents_register_device(&__get_cpu_var(xen_clock_events));
}

__init void xen_time_init(void)
{
	int cpu = smp_processor_id();

	clocksource_register(&xen_clocksource);

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
		/* Successfully turned off 100Hz tick, so we have the
		   vcpuop-based timer interface */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;
	}

	/* Set initial system time with full resolution */
	xen_read_wallclock(&xtime);
	set_normalized_timespec(&wall_to_monotonic,
				-xtime.tv_sec, -xtime.tv_nsec);

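	/*
	 * pvclock is driven by the raw TSC, so make sure the TSC feature
	 * stays visible to the rest of the kernel.
	 */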
	setup_force_cpu_cap(X86_FEATURE_TSC);

	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();
}