/* arch/x86/kernel/vsyscall_64.c */
/*
 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 * Copyright 2003 Andi Kleen, SuSE Labs.
 *
 * Thanks to hpa@transmeta.com for some useful hints.
 * Special thanks to Ingo Molnar for his early experience with
 * a different vsyscall implementation for Linux/IA32 and for the name.
 *
 * The first vsyscall is located at -10Mbyte, the second at virtual
 * address -10Mbyte+1024bytes, etc... There are at most 4 vsyscalls.
 * One vsyscall can reserve more than 1 slot to avoid jumping out of
 * line if necessary. We cannot add more with this mechanism because
 * older kernels won't return -ENOSYS.
 * If we want more than four we need a vDSO.
 *
 * Note: the concept clashes with user mode linux. If you use UML and
 * want per-guest time just set the kernel.vsyscall64 sysctl to 0.
 */
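
/*
 * Layout illustration (an added sketch; the exact constants come from
 * VSYSCALL_START and VSYSCALL_SIZE in <asm/vsyscall.h>, i.e. a -10MB
 * base and 1024-byte slots):
 *
 *	VSYSCALL_ADDR(0) == 0xffffffffff600000	-> vgettimeofday
 *	VSYSCALL_ADDR(1) == 0xffffffffff600400	-> vtime
 *	VSYSCALL_ADDR(2) == 0xffffffffff600800	-> vgetcpu
 *	VSYSCALL_ADDR(3) == 0xffffffffff600c00	-> venosys_1 (-ENOSYS)
 */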

/* Disable profiling for userspace code: */
#define DISABLE_BRANCH_PROFILING

#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
#include <linux/clocksource.h>
#include <linux/getcpu.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/notifier.h>

#include <asm/vsyscall.h>
#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/errno.h>
#include <asm/io.h>
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
#include <asm/vgtod.h>

#define __vsyscall(nr) \
	__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
#define __syscall_clobber "r11","cx","memory"
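
/*
 * Note on the clobber list (added for clarity): the "syscall" instruction
 * itself overwrites %rcx with the return RIP and %r11 with the saved
 * rflags, so both must be listed as clobbered; "memory" covers whatever
 * the kernel writes through the pointer arguments.
 */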

/*
 * vsyscall_gtod_data contains data that is:
 *  - readonly from vsyscalls
 *  - written by timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
 * Try to keep this structure as small as possible to avoid cache line
 * ping pongs.
 */
int __vgetcpu_mode __section_vgetcpu_mode;

struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
{
	.lock = SEQLOCK_UNLOCKED,
	.sysctl_enabled = 1,
};

void update_vsyscall_tz(void)
{
	unsigned long flags;

	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
	/* sys_tz has changed */
	vsyscall_gtod_data.sys_tz = sys_tz;
	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}

void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
{
	unsigned long flags;

	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
	/* copy vsyscall data */
	vsyscall_gtod_data.clock.vread = clock->vread;
	vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
	vsyscall_gtod_data.clock.mask = clock->mask;
	vsyscall_gtod_data.clock.mult = clock->mult;
	vsyscall_gtod_data.clock.shift = clock->shift;
	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
	vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
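
/*
 * Note (added for clarity): the reader side of vsyscall_gtod_data.lock is
 * do_vgettimeofday() below; read_seqbegin()/read_seqretry() make readers
 * loop until they observe a snapshot no writer touched, so the updates
 * above never hand out a half-written time.
 */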

/* RED-PEN may want to re-add seq locking, but then the variable should
 * be write-once.
 */
static __always_inline void do_get_tz(struct timezone *tz)
{
	*tz = __vsyscall_gtod_data.sys_tz;
}

static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
{
	int ret;
	asm volatile("syscall"
		: "=a" (ret)
		: "0" (__NR_gettimeofday), "D" (tv), "S" (tz)
		: __syscall_clobber);
	return ret;
}

static __always_inline long time_syscall(long *t)
{
	long secs;
	asm volatile("syscall"
		: "=a" (secs)
		: "0" (__NR_time), "D" (t)
		: __syscall_clobber);
	return secs;
}
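
/*
 * For reference: the two fallbacks above are open-coded versions of
 * syscall(__NR_gettimeofday, ...) and syscall(__NR_time, ...). They must
 * be open-coded because this code, although it runs in user mode, is
 * built into the kernel image and cannot call into libc.
 */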

static __always_inline void do_vgettimeofday(struct timeval *tv)
{
	cycle_t now, base, mask, cycle_delta;
	unsigned seq;
	unsigned long mult, shift, nsec;
	cycle_t (*vread)(void);
	do {
		seq = read_seqbegin(&__vsyscall_gtod_data.lock);

		vread = __vsyscall_gtod_data.clock.vread;
		if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
			gettimeofday(tv, NULL);
			return;
		}

		now = vread();
		base = __vsyscall_gtod_data.clock.cycle_last;
		mask = __vsyscall_gtod_data.clock.mask;
		mult = __vsyscall_gtod_data.clock.mult;
		shift = __vsyscall_gtod_data.clock.shift;

		tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
		nsec = __vsyscall_gtod_data.wall_time_nsec;
	} while (read_seqretry(&__vsyscall_gtod_data.lock, seq));

	/* calculate interval: */
	cycle_delta = (now - base) & mask;
	/* convert to nsecs: */
	nsec += (cycle_delta * mult) >> shift;

	while (nsec >= NSEC_PER_SEC) {
		tv->tv_sec += 1;
		nsec -= NSEC_PER_SEC;
	}
	tv->tv_usec = nsec / NSEC_PER_USEC;
}
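
/*
 * Worked example (illustrative numbers, not taken from a real
 * clocksource): mult and shift encode nanoseconds-per-cycle as
 * mult / 2^shift. For an exactly 1 GHz counter one cycle is one
 * nanosecond, so with shift == 22 the clocksource code would pick
 * mult == 1 << 22 and the conversion above degenerates to
 * nsec += cycle_delta; a 2 GHz counter (0.5 ns per cycle) would get
 * mult == 1 << 21 instead.
 */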

int __vsyscall(0) vgettimeofday(struct timeval *tv, struct timezone *tz)
{
	if (tv)
		do_vgettimeofday(tv);
	if (tz)
		do_get_tz(tz);
	return 0;
}

/* This will break when the xtime seconds get inaccurate, but that is
 * unlikely */
time_t __vsyscall(1) vtime(time_t *t)
{
	struct timeval tv;
	time_t result;
	if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
		return time_syscall(t);

	vgettimeofday(&tv, NULL);
	result = tv.tv_sec;
	if (t)
		*t = result;
	return result;
}
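
/*
 * Hypothetical userspace sketch (not part of this file): because the
 * slots sit at fixed addresses, even libc-free code can call them
 * directly. vtime is slot 1, i.e. -10MB + 1024 == 0xffffffffff600400:
 *
 *	time_t (*vtime_p)(time_t *) = (void *)0xffffffffff600400UL;
 *	time_t now = vtime_p(NULL);
 *
 * Glibc historically used this entry point for time(2) on x86-64.
 */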

/* Fast way to get current CPU and node.
   This helps to do per node and per CPU caches in user space.
   The result is not guaranteed without CPU affinity, but usually
   works out because the scheduler tries to keep a thread on the same
   CPU.

   tcache must point to an array of at least two unsigned longs.
   All arguments can be NULL. */
long __vsyscall(2)
vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
{
	unsigned int p;
	unsigned long j = 0;

	/* Fast cache - only recompute value once per jiffy and avoid
	   relatively costly rdtscp/cpuid otherwise.
	   This works because the scheduler usually keeps the process
	   on the same CPU and this syscall doesn't guarantee its
	   results anyway.
	   We do this here because otherwise user space would do it on
	   its own in a likely inferior way (no access to jiffies).
	   If you don't like it pass NULL. */
	if (tcache && tcache->blob[0] == (j = __jiffies)) {
		p = tcache->blob[1];
	} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
		/* Load per CPU data from RDTSCP */
		native_read_tscp(&p);
	} else {
		/* Load per CPU data from GDT */
		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
	}
	if (tcache) {
		tcache->blob[0] = j;
		tcache->blob[1] = p;
	}
	if (cpu)
		*cpu = p & 0xfff;
	if (node)
		*node = p >> 12;
	return 0;
}
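
/*
 * Hypothetical userspace sketch: vgetcpu is slot 2, i.e.
 * -10MB + 2*1024 == 0xffffffffff600800. Both the RDTSCP and GDT paths
 * above yield the same packed value p == (node << 12) | cpu, which the
 * tail of vgetcpu() unpacks:
 *
 *	long (*vgetcpu_p)(unsigned *, unsigned *, struct getcpu_cache *) =
 *		(void *)0xffffffffff600800UL;
 *	unsigned cpu, node;
 *	vgetcpu_p(&cpu, &node, NULL);
 *
 * afterwards cpu holds p & 0xfff and node holds p >> 12.
 */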

static long __vsyscall(3) venosys_1(void)
{
	return -ENOSYS;
}

#ifdef CONFIG_SYSCTL

static int
vsyscall_sysctl_change(ctl_table *ctl, int write, struct file *filp,
		       void __user *buffer, size_t *lenp, loff_t *ppos)
{
	return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
}

static ctl_table kernel_table2[] = {
	{ .procname = "vsyscall64",
	  .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
	  .mode = 0644,
	  .proc_handler = vsyscall_sysctl_change },
	{}
};

static ctl_table kernel_root_table2[] = {
	{ .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
	  .child = kernel_table2 },
	{}
};
#endif
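
/*
 * Usage note (added for clarity): the tables above expose
 * /proc/sys/kernel/vsyscall64. Writing 0 clears sysctl_enabled, which
 * makes vgettimeofday()/vtime() above take the syscall fallback:
 *
 *	echo 0 > /proc/sys/kernel/vsyscall64
 */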

/* Assume __initcall executes before all user space. Hopefully kmod
   doesn't violate that. We'll find out if it does. */
static void __cpuinit vsyscall_set_cpu(int cpu)
{
	unsigned long d;
	unsigned long node = 0;
#ifdef CONFIG_NUMA
	node = cpu_to_node(cpu);
#endif
	if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
		write_rdtscp_aux((node << 12) | cpu);

	/* Store cpu number in limit so that it can be loaded quickly
	   in user space in vgetcpu.
	   12 bits for the CPU and 8 bits for the node. */
	d = 0x0f40000000000ULL;
	d |= cpu;
	d |= (node & 0xf) << 12;
	d |= (node >> 4) << 48;
	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
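
/*
 * Descriptor decoding (a sketch, following the standard x86 descriptor
 * format): 0x0f40000000000ULL marks the entry present with DPL 3, so
 * unprivileged code may "lsl" it. The packed value is scattered into the
 * 20-bit segment limit: cpu into limit[11:0], node[3:0] into
 * limit[15:12], node[7:4] into limit[19:16] (bits 48-51 of d). With the
 * granularity bit clear, lsl returns the raw limit, i.e. exactly the
 * (node << 12) | cpu value that vgetcpu() unpacks.
 */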

static void __cpuinit cpu_vsyscall_init(void *arg)
{
	/* preemption should be already off */
	vsyscall_set_cpu(raw_smp_processor_id());
}

static int __cpuinit
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
	long cpu = (long)arg;
	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
	return NOTIFY_DONE;
}

void __init map_vsyscall(void)
{
	extern char __vsyscall_0;
	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);

	/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
}
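
/*
 * Note (added for clarity): VSYSCALL_FIRST_PAGE is the fixmap slot for
 * the fixed address -10MB (0xffffffffff600000), so this mapping is what
 * makes the kernel's .vsyscall_* sections visible to user space at the
 * addresses described in the comment at the top of this file.
 */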

static int __init vsyscall_init(void)
{
	BUG_ON(((unsigned long) &vgettimeofday !=
		VSYSCALL_ADDR(__NR_vgettimeofday)));
	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
	BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
#ifdef CONFIG_SYSCTL
	register_sysctl_table(kernel_root_table2);
#endif
	on_each_cpu(cpu_vsyscall_init, NULL, 1);
	hotcpu_notifier(cpu_vsyscall_notifier, 0);
	return 0;
}

__initcall(vsyscall_init);