/*
 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 * Copyright 2003 Andi Kleen, SuSE Labs.
 *
 * Thanks to hpa@transmeta.com for some useful hints.
 * Special thanks to Ingo Molnar for his early experience with
 * a different vsyscall implementation for Linux/IA32 and for the name.
 *
 * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
 * at virtual address -10Mbyte+1024bytes etc... There are at most 4
 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
 * jumping out of line if necessary. We cannot add more with this
 * mechanism because older kernels won't return -ENOSYS.
 * If we want more than four we need a vDSO.
 *
 * Note: the concept clashes with user mode linux. If you use UML and
 * want per-guest time just set the kernel.vsyscall64 sysctl to 0.
 */
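/*
 * Concretely (illustrative arithmetic, not part of the original
 * comment): VSYSCALL_START is -10MB = 0xffffffffff600000 and each slot
 * is 1024 bytes, so vgettimeofday (slot 0) sits at 0xffffffffff600000,
 * vtime (slot 1) at 0xffffffffff600400 and vgetcpu (slot 2) at
 * 0xffffffffff600800.
 */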

#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
#include <linux/clocksource.h>
#include <linux/getcpu.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/notifier.h>

#include <asm/vsyscall.h>
#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/errno.h>
#include <asm/io.h>
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
#include <asm/vgtod.h>

#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
#define __syscall_clobber "r11","cx","memory"
#define __pa_vsymbol(x)						\
        ({unsigned long v;					\
        extern char __vsyscall_0;				\
        asm("" : "=r" (v) : "0" (x));				\
        ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); })

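/*
 * The empty asm in __pa_vsymbol() launders the symbol address through
 * a register so gcc cannot constant-fold the address arithmetic;
 * compare the "gcc has some trouble with __va(__pa())" comment at the
 * ioremap() call sites in vsyscall_sysctl_change() below.
 */
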
/*
 * vsyscall_gtod_data contains data that is:
 * - read-only from vsyscalls
 * - written by timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
 * Try to keep this structure as small as possible to avoid cache line
 * ping-pongs.
 */
int __vgetcpu_mode __section_vgetcpu_mode;

struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
{
        .lock = SEQLOCK_UNLOCKED,
        .sysctl_enabled = 1,
};

void update_vsyscall_tz(void)
{
        unsigned long flags;

        write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
        /* sys_tz has changed */
        vsyscall_gtod_data.sys_tz = sys_tz;
        write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}

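/*
 * Both update paths take vsyscall_gtod_data.lock as writers; the
 * lockless readers in do_vgettimeofday() below detect a concurrent
 * update via read_seqretry() and retry, so updates must stay short.
 */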
void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
{
        unsigned long flags;

        write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
        /* copy vsyscall data */
        vsyscall_gtod_data.clock.vread = clock->vread;
        vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
        vsyscall_gtod_data.clock.mask = clock->mask;
        vsyscall_gtod_data.clock.mult = clock->mult;
        vsyscall_gtod_data.clock.shift = clock->shift;
        vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
        vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
        vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
        write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}

/* RED-PEN may want to re-add seq locking, but then the variable should
 * be write-once.
 */
static __always_inline void do_get_tz(struct timezone *tz)
{
        *tz = __vsyscall_gtod_data.sys_tz;
}

static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
{
        int ret;
        asm volatile("vsysc2: syscall"
                : "=a" (ret)
                : "0" (__NR_gettimeofday), "D" (tv), "S" (tz)
                : __syscall_clobber);
        return ret;
}

static __always_inline long time_syscall(long *t)
{
        long secs;
        asm volatile("vsysc1: syscall"
                : "=a" (secs)
                : "0" (__NR_time), "D" (t)
                : __syscall_clobber);
        return secs;
}

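/*
 * The vsysc1/vsysc2 labels above mark the 2-byte syscall instructions;
 * vsyscall_sysctl_change() below rewrites them in place (a live
 * SYSCALL vs. a pair of NOPs) when the kernel.vsyscall64 sysctl is
 * toggled.
 */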
static __always_inline void do_vgettimeofday(struct timeval *tv)
{
        cycle_t now, base, mask, cycle_delta;
        unsigned seq;
        unsigned long mult, shift, nsec;
        cycle_t (*vread)(void);
        do {
                seq = read_seqbegin(&__vsyscall_gtod_data.lock);

                vread = __vsyscall_gtod_data.clock.vread;
                if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
                        gettimeofday(tv, NULL);
                        return;
                }
                now = vread();
                base = __vsyscall_gtod_data.clock.cycle_last;
                mask = __vsyscall_gtod_data.clock.mask;
                mult = __vsyscall_gtod_data.clock.mult;
                shift = __vsyscall_gtod_data.clock.shift;

                tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
                nsec = __vsyscall_gtod_data.wall_time_nsec;
        } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));

        /* calculate interval: */
        cycle_delta = (now - base) & mask;
        /* convert to nsecs: */
        nsec += (cycle_delta * mult) >> shift;

        while (nsec >= NSEC_PER_SEC) {
                tv->tv_sec += 1;
                nsec -= NSEC_PER_SEC;
        }
        tv->tv_usec = nsec / NSEC_PER_USEC;
}

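/*
 * Illustrative numbers for the mult/shift scaling above: a clocksource
 * picks mult roughly as (NSEC_PER_SEC << shift) / freq. For a
 * hypothetical 1 GHz TSC with shift = 22 that gives mult = 1 << 22 =
 * 4194304, so 1000 cycles convert to (1000 * 4194304) >> 22 = 1000 ns.
 */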
int __vsyscall(0) vgettimeofday(struct timeval *tv, struct timezone *tz)
{
        if (tv)
                do_vgettimeofday(tv);
        if (tz)
                do_get_tz(tz);
        return 0;
}

/* This will break when the xtime seconds get inaccurate, but that is
 * unlikely. */
time_t __vsyscall(1) vtime(time_t *t)
{
        struct timeval tv;
        time_t result;
        if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
                return time_syscall(t);

        vgettimeofday(&tv, NULL);
        result = tv.tv_sec;
        if (t)
                *t = result;
        return result;
}

/* Fast way to get the current CPU and node.
   This helps to do per-node and per-CPU caches in user space.
   The result is not guaranteed without CPU affinity, but it usually
   works out because the scheduler tries to keep a thread on the same
   CPU.

   tcache must point to a two-element long array.
   All arguments can be NULL. */
long __vsyscall(2)
vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
{
        unsigned int p;
        unsigned long j = 0;

        /* Fast cache - only recompute the value once per jiffy and avoid
           the relatively costly rdtscp/cpuid otherwise.
           This works because the scheduler usually keeps the process
           on the same CPU and this syscall doesn't guarantee its
           results anyway.
           We do this here because otherwise user space would do it on
           its own in a likely inferior way (no access to jiffies).
           If you don't like it pass NULL. */
        if (tcache && tcache->blob[0] == (j = __jiffies)) {
                p = tcache->blob[1];
        } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
                /* Load per-CPU data from RDTSCP */
                native_read_tscp(&p);
        } else {
                /* Load per-CPU data from the GDT */
                asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
        }
        if (tcache) {
                tcache->blob[0] = j;
                tcache->blob[1] = p;
        }
        if (cpu)
                *cpu = p & 0xfff;
        if (node)
                *node = p >> 12;
        return 0;
}

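/*
 * p packs the node in bits 12 and up and the CPU in bits 0-11, so a
 * (hypothetical) task on CPU 5 of node 2 sees p = (2 << 12) | 5 =
 * 0x2005, giving *cpu = 5 and *node = 2. A sketch of a user-space
 * call sequence, assuming a prototype matching the vsyscall above:
 *
 *	struct getcpu_cache cache = {};
 *	unsigned cpu, node;
 *	vgetcpu(&cpu, &node, &cache);
 */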
long __vsyscall(3) venosys_1(void)
{
        return -ENOSYS;
}

#ifdef CONFIG_SYSCTL

#define SYSCALL 0x050f
#define NOP2 0x9090

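/*
 * These are little-endian u16 instruction encodings: the 2-byte
 * SYSCALL opcode 0f 05 reads back as 0x050f, and 0x9090 is a pair of
 * 1-byte NOPs (0x90).
 */
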
/*
 * NOP out the syscall in the vsyscall page when it is not needed.
 */
static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file *filp,
                                  void __user *buffer, size_t *lenp, loff_t *ppos)
{
        extern u16 vsysc1, vsysc2;
        u16 __iomem *map1;
        u16 __iomem *map2;
        int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
        if (!write)
                return ret;
        /* gcc has some trouble with __va(__pa()), so just do it this
           way. */
        map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
        if (!map1)
                return -ENOMEM;
        map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
        if (!map2) {
                ret = -ENOMEM;
                goto out;
        }
        if (!vsyscall_gtod_data.sysctl_enabled) {
                writew(SYSCALL, map1);
                writew(SYSCALL, map2);
        } else {
                writew(NOP2, map1);
                writew(NOP2, map2);
        }
        iounmap(map2);
out:
        iounmap(map1);
        return ret;
}

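/*
 * For example, from user space (the sysctl path follows from the
 * tables below and the header comment at the top of this file):
 *
 *	echo 0 > /proc/sys/kernel/vsyscall64	# route through real syscalls
 *	echo 1 > /proc/sys/kernel/vsyscall64	# use the vsyscall fast path
 */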
static ctl_table kernel_table2[] = {
        { .procname = "vsyscall64",
          .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
          .mode = 0644,
          .proc_handler = vsyscall_sysctl_change },
        {}
};

static ctl_table kernel_root_table2[] = {
        { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
          .child = kernel_table2 },
        {}
};

#endif

/* Assume __initcall executes before all user space. Hopefully kmod
   doesn't violate that. We'll find out if it does. */
static void __cpuinit vsyscall_set_cpu(int cpu)
{
        unsigned long *d;
        unsigned long node = 0;
#ifdef CONFIG_NUMA
        node = cpu_to_node(cpu);
#endif
        if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
                write_rdtscp_aux((node << 12) | cpu);

        /* Store the cpu number in the segment limit so that it can be
           loaded quickly in user space in vgetcpu. 12 bits for the CPU
           and 8 bits for the node. */
        d = (unsigned long *)(get_cpu_gdt_table(cpu) + GDT_ENTRY_PER_CPU);
        *d = 0x0f40000000000ULL;
        *d |= cpu;
        *d |= (node & 0xf) << 12;
        *d |= (node >> 4) << 48;
}
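
/*
 * How the lsl in vgetcpu() decodes this: the descriptor's limit field
 * holds (node << 12) | cpu (limit[15:0] lives in descriptor bits 0-15,
 * limit[19:16] in bits 48-51), while 0x0f40000000000ULL supplies the
 * access bits (present, DPL 3) that make the limit readable from ring
 * 3. E.g. CPU 5 on node 2 yields a limit of 0x2005.
 */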

static void __cpuinit cpu_vsyscall_init(void *arg)
{
        /* preemption should already be off */
        vsyscall_set_cpu(raw_smp_processor_id());
}

static int __cpuinit
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
        long cpu = (long)arg;
        if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
                smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
        return NOTIFY_DONE;
}

void __init map_vsyscall(void)
{
        extern char __vsyscall_0;
        unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);

        /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
        __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
}

static int __init vsyscall_init(void)
{
        BUG_ON(((unsigned long) &vgettimeofday !=
                VSYSCALL_ADDR(__NR_vgettimeofday)));
        BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
        BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
        BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
#ifdef CONFIG_SYSCTL
        register_sysctl_table(kernel_root_table2);
#endif
        on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
        hotcpu_notifier(cpu_vsyscall_notifier, 0);
        return 0;
}

__initcall(vsyscall_init);