Commit | Line | Data |
---|---|---|
1da177e4 | 1 | /* |
1da177e4 LT |
2 | * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE |
3 | * Copyright 2003 Andi Kleen, SuSE Labs. | |
4 | * | |
5cec93c2 AL |
5 | * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ] |
6 | * | |
1da177e4 LT |
7 | * Thanks to hpa@transmeta.com for some useful hint. |
8 | * Special thanks to Ingo Molnar for his early experience with | |
9 | * a different vsyscall implementation for Linux/IA32 and for the name. | |
10 | * | |
11 | * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located | |
12 | * at virtual address -10Mbyte+1024bytes etc... There are at max 4 | |
13 | * vsyscalls. One vsyscall can reserve more than 1 slot to avoid | |
14 | * jumping out of line if necessary. We cannot add more with this | |
15 | * mechanism because older kernels won't return -ENOSYS. | |
1da177e4 | 16 | * |
5cec93c2 AL |
17 | * Note: the concept clashes with user mode linux. UML users should |
18 | * use the vDSO. | |
1da177e4 LT |
19 | */ |
20 | ||
21 | #include <linux/time.h> | |
22 | #include <linux/init.h> | |
23 | #include <linux/kernel.h> | |
24 | #include <linux/timer.h> | |
25 | #include <linux/seqlock.h> | |
26 | #include <linux/jiffies.h> | |
27 | #include <linux/sysctl.h> | |
7460ed28 | 28 | #include <linux/clocksource.h> |
c08c8205 | 29 | #include <linux/getcpu.h> |
8c131af1 AK |
30 | #include <linux/cpu.h> |
31 | #include <linux/smp.h> | |
32 | #include <linux/notifier.h> | |
5cec93c2 AL |
33 | #include <linux/syscalls.h> |
34 | #include <linux/ratelimit.h> | |
1da177e4 LT |
35 | |
36 | #include <asm/vsyscall.h> | |
37 | #include <asm/pgtable.h> | |
c9712944 | 38 | #include <asm/compat.h> |
1da177e4 | 39 | #include <asm/page.h> |
7460ed28 | 40 | #include <asm/unistd.h> |
1da177e4 LT |
41 | #include <asm/fixmap.h> |
42 | #include <asm/errno.h> | |
43 | #include <asm/io.h> | |
c08c8205 VP |
44 | #include <asm/segment.h> |
45 | #include <asm/desc.h> | |
46 | #include <asm/topology.h> | |
2aae950b | 47 | #include <asm/vgtod.h> |
5cec93c2 | 48 | #include <asm/traps.h> |
1da177e4 | 49 | |
c149a665 AL |
50 | #define CREATE_TRACE_POINTS |
51 | #include "vsyscall_trace.h" | |
52 | ||
/* Selector for the vgetcpu fast path; lives in the vvar area (DEFINE_VVAR). */
DEFINE_VVAR(int, vgetcpu_mode);

/*
 * gettimeofday data published to userspace readers; writers (see
 * update_vsyscall()/update_vsyscall_tz() below) serialize via .lock.
 */
DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
{
	.lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
};
1da177e4 | 58 | |
3ae36655 AL |
59 | static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; |
60 | ||
61 | static int __init vsyscall_setup(char *str) | |
62 | { | |
63 | if (str) { | |
64 | if (!strcmp("emulate", str)) | |
65 | vsyscall_mode = EMULATE; | |
66 | else if (!strcmp("native", str)) | |
67 | vsyscall_mode = NATIVE; | |
68 | else if (!strcmp("none", str)) | |
69 | vsyscall_mode = NONE; | |
70 | else | |
71 | return -EINVAL; | |
72 | ||
73 | return 0; | |
74 | } | |
75 | ||
76 | return -EINVAL; | |
77 | } | |
78 | early_param("vsyscall", vsyscall_setup); | |
79 | ||
2c622148 TB |
80 | void update_vsyscall_tz(void) |
81 | { | |
82 | unsigned long flags; | |
83 | ||
84 | write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); | |
85 | /* sys_tz has changed */ | |
86 | vsyscall_gtod_data.sys_tz = sys_tz; | |
87 | write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); | |
88 | } | |
89 | ||
7615856e JS |
90 | void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, |
91 | struct clocksource *clock, u32 mult) | |
1da177e4 | 92 | { |
7460ed28 | 93 | unsigned long flags; |
1da177e4 | 94 | |
7460ed28 | 95 | write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); |
5cec93c2 | 96 | |
7460ed28 | 97 | /* copy vsyscall data */ |
98d0ac38 | 98 | vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode; |
5cec93c2 AL |
99 | vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; |
100 | vsyscall_gtod_data.clock.mask = clock->mask; | |
101 | vsyscall_gtod_data.clock.mult = mult; | |
102 | vsyscall_gtod_data.clock.shift = clock->shift; | |
103 | vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; | |
104 | vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; | |
105 | vsyscall_gtod_data.wall_to_monotonic = *wtm; | |
106 | vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); | |
107 | ||
7460ed28 | 108 | write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); |
1da177e4 LT |
109 | } |
110 | ||
5cec93c2 AL |
111 | static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, |
112 | const char *message) | |
1da177e4 | 113 | { |
5cec93c2 AL |
114 | static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); |
115 | struct task_struct *tsk; | |
1da177e4 | 116 | |
5cec93c2 AL |
117 | if (!show_unhandled_signals || !__ratelimit(&rs)) |
118 | return; | |
1da177e4 | 119 | |
5cec93c2 | 120 | tsk = current; |
7460ed28 | 121 | |
c9712944 | 122 | printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", |
5cec93c2 | 123 | level, tsk->comm, task_pid_nr(tsk), |
3ae36655 | 124 | message, regs->ip, regs->cs, |
c9712944 AL |
125 | regs->sp, regs->ax, regs->si, regs->di); |
126 | } | |
127 | ||
128 | static int addr_to_vsyscall_nr(unsigned long addr) | |
129 | { | |
130 | int nr; | |
131 | ||
132 | if ((addr & ~0xC00UL) != VSYSCALL_START) | |
133 | return -EINVAL; | |
134 | ||
135 | nr = (addr & 0xC00UL) >> 10; | |
136 | if (nr >= 3) | |
137 | return -EINVAL; | |
138 | ||
139 | return nr; | |
1da177e4 LT |
140 | } |
141 | ||
/*
 * Emulate a call into the legacy vsyscall page.
 *
 * Invoked on a user-mode fault at @address (expected to equal regs->ip).
 * Returns true when the fault was consumed here -- either the vsyscall
 * was emulated or a SIGSEGV was forced -- and false only in
 * vsyscall_mode == NONE, where the caller should handle it as an
 * ordinary fault.
 */
bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
{
	struct task_struct *tsk;
	unsigned long caller;
	int vsyscall_nr;
	long ret;

	/*
	 * No point in checking CS -- the only way to get here is a user mode
	 * trap to a high address, which means that we're in 64-bit user code.
	 */

	WARN_ON_ONCE(address != regs->ip);

	if (vsyscall_mode == NONE) {
		warn_bad_vsyscall(KERN_INFO, regs,
				  "vsyscall attempted with vsyscall=none");
		return false;
	}

	/* -EINVAL here means a fault that was not at a valid entry point. */
	vsyscall_nr = addr_to_vsyscall_nr(address);

	trace_emulate_vsyscall(vsyscall_nr);

	if (vsyscall_nr < 0) {
		warn_bad_vsyscall(KERN_WARNING, regs,
				  "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
		goto sigsegv;
	}

	/*
	 * A real vsyscall would return via a ret, so fetch the saved
	 * return address from the user stack before doing anything else.
	 */
	if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
		warn_bad_vsyscall(KERN_WARNING, regs,
				  "vsyscall with bad stack (exploit attempt?)");
		goto sigsegv;
	}

	/*
	 * This emulation path bypasses normal syscall entry, so a
	 * seccomp'd task must not be allowed through it at all.
	 */
	tsk = current;
	if (seccomp_mode(&tsk->seccomp))
		do_exit(SIGKILL);

	/* vsyscall_nr is known to be 0, 1 or 2 here, so ret is always set. */
	switch (vsyscall_nr) {
	case 0:
		ret = sys_gettimeofday(
			(struct timeval __user *)regs->di,
			(struct timezone __user *)regs->si);
		break;

	case 1:
		ret = sys_time((time_t __user *)regs->di);
		break;

	case 2:
		ret = sys_getcpu((unsigned __user *)regs->di,
				 (unsigned __user *)regs->si,
				 0);
		break;
	}

	if (ret == -EFAULT) {
		/*
		 * Bad news -- userspace fed a bad pointer to a vsyscall.
		 *
		 * With a real vsyscall, that would have caused SIGSEGV.
		 * To make writing reliable exploits using the emulated
		 * vsyscalls harder, generate SIGSEGV here as well.
		 */
		warn_bad_vsyscall(KERN_INFO, regs,
				  "vsyscall fault (exploit attempt?)");
		goto sigsegv;
	}

	regs->ax = ret;

	/* Emulate a ret instruction. */
	regs->ip = caller;
	regs->sp += 8;

	return true;

sigsegv:
	force_sig(SIGSEGV, current);
	return true;
}
225 | ||
5cec93c2 AL |
226 | /* |
227 | * Assume __initcall executes before all user space. Hopefully kmod | |
228 | * doesn't violate that. We'll find out if it does. | |
229 | */ | |
8c131af1 | 230 | static void __cpuinit vsyscall_set_cpu(int cpu) |
c08c8205 | 231 | { |
fc8b8a60 | 232 | unsigned long d; |
c08c8205 VP |
233 | unsigned long node = 0; |
234 | #ifdef CONFIG_NUMA | |
98c9e27a | 235 | node = cpu_to_node(cpu); |
c08c8205 | 236 | #endif |
92cb7612 | 237 | if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) |
8c131af1 | 238 | write_rdtscp_aux((node << 12) | cpu); |
c08c8205 | 239 | |
5cec93c2 AL |
240 | /* |
241 | * Store cpu number in limit so that it can be loaded quickly | |
242 | * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node) | |
243 | */ | |
fc8b8a60 JF |
244 | d = 0x0f40000000000ULL; |
245 | d |= cpu; | |
246 | d |= (node & 0xf) << 12; | |
247 | d |= (node >> 4) << 48; | |
5cec93c2 | 248 | |
fc8b8a60 | 249 | write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); |
c08c8205 VP |
250 | } |
251 | ||
8c131af1 AK |
252 | static void __cpuinit cpu_vsyscall_init(void *arg) |
253 | { | |
254 | /* preemption should be already off */ | |
255 | vsyscall_set_cpu(raw_smp_processor_id()); | |
256 | } | |
257 | ||
258 | static int __cpuinit | |
259 | cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) | |
260 | { | |
261 | long cpu = (long)arg; | |
5cec93c2 | 262 | |
8bb78442 | 263 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) |
8691e5a8 | 264 | smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); |
5cec93c2 | 265 | |
8c131af1 AK |
266 | return NOTIFY_DONE; |
267 | } | |
268 | ||
e4026440 | 269 | void __init map_vsyscall(void) |
1da177e4 | 270 | { |
3ae36655 AL |
271 | extern char __vsyscall_page; |
272 | unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); | |
9fd67b4e AL |
273 | extern char __vvar_page; |
274 | unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page); | |
1da177e4 | 275 | |
3ae36655 AL |
276 | __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall, |
277 | vsyscall_mode == NATIVE | |
278 | ? PAGE_KERNEL_VSYSCALL | |
279 | : PAGE_KERNEL_VVAR); | |
280 | BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) != | |
281 | (unsigned long)VSYSCALL_START); | |
282 | ||
9fd67b4e | 283 | __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR); |
3ae36655 AL |
284 | BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != |
285 | (unsigned long)VVAR_ADDRESS); | |
1da177e4 LT |
286 | } |
287 | ||
/*
 * Boot-time bring-up: program the vgetcpu GDT entry on every online CPU
 * and register a hotplug notifier so later CPUs get the same setup.
 */
static int __init vsyscall_init(void)
{
	/* The fixmap slot must coincide with the legacy vsyscall address. */
	BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));

	on_each_cpu(cpu_vsyscall_init, NULL, 1);
	/* notifier priority > KVM */
	hotcpu_notifier(cpu_vsyscall_notifier, 30);

	return 0;
}
__initcall(vsyscall_init);