Commit | Line | Data |
---|---|---|
1da177e4 | 1 | /* |
1da177e4 LT |
2 | * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE |
3 | * Copyright 2003 Andi Kleen, SuSE Labs. | |
4 | * | |
5cec93c2 AL |
5 | * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ] |
6 | * | |
1da177e4 LT |
7 | * Thanks to hpa@transmeta.com for some useful hint. |
8 | * Special thanks to Ingo Molnar for his early experience with | |
9 | * a different vsyscall implementation for Linux/IA32 and for the name. | |
10 | * | |
11 | * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located | |
12 | * at virtual address -10Mbyte+1024bytes etc... There are at max 4 | |
13 | * vsyscalls. One vsyscall can reserve more than 1 slot to avoid | |
14 | * jumping out of line if necessary. We cannot add more with this | |
15 | * mechanism because older kernels won't return -ENOSYS. | |
1da177e4 | 16 | * |
5cec93c2 AL |
17 | * Note: the concept clashes with user mode linux. UML users should |
18 | * use the vDSO. | |
1da177e4 LT |
19 | */ |
20 | ||
c767a54b JP |
21 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
22 | ||
1da177e4 LT |
23 | #include <linux/time.h> |
24 | #include <linux/init.h> | |
25 | #include <linux/kernel.h> | |
26 | #include <linux/timer.h> | |
27 | #include <linux/seqlock.h> | |
28 | #include <linux/jiffies.h> | |
29 | #include <linux/sysctl.h> | |
29574022 | 30 | #include <linux/topology.h> |
189374ae | 31 | #include <linux/timekeeper_internal.h> |
c08c8205 | 32 | #include <linux/getcpu.h> |
8c131af1 AK |
33 | #include <linux/cpu.h> |
34 | #include <linux/smp.h> | |
35 | #include <linux/notifier.h> | |
5cec93c2 AL |
36 | #include <linux/syscalls.h> |
37 | #include <linux/ratelimit.h> | |
1da177e4 LT |
38 | |
39 | #include <asm/vsyscall.h> | |
40 | #include <asm/pgtable.h> | |
c9712944 | 41 | #include <asm/compat.h> |
1da177e4 | 42 | #include <asm/page.h> |
7460ed28 | 43 | #include <asm/unistd.h> |
1da177e4 LT |
44 | #include <asm/fixmap.h> |
45 | #include <asm/errno.h> | |
46 | #include <asm/io.h> | |
c08c8205 VP |
47 | #include <asm/segment.h> |
48 | #include <asm/desc.h> | |
49 | #include <asm/topology.h> | |
2aae950b | 50 | #include <asm/vgtod.h> |
5cec93c2 | 51 | #include <asm/traps.h> |
1da177e4 | 52 | |
c149a665 AL |
53 | #define CREATE_TRACE_POINTS |
54 | #include "vsyscall_trace.h" | |
55 | ||
8c49d9a7 | 56 | DEFINE_VVAR(int, vgetcpu_mode); |
2ab51657 | 57 | DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); |
1da177e4 | 58 | |
2e57ae05 | 59 | static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; |
3ae36655 AL |
60 | |
61 | static int __init vsyscall_setup(char *str) | |
62 | { | |
63 | if (str) { | |
64 | if (!strcmp("emulate", str)) | |
65 | vsyscall_mode = EMULATE; | |
66 | else if (!strcmp("native", str)) | |
67 | vsyscall_mode = NATIVE; | |
68 | else if (!strcmp("none", str)) | |
69 | vsyscall_mode = NONE; | |
70 | else | |
71 | return -EINVAL; | |
72 | ||
73 | return 0; | |
74 | } | |
75 | ||
76 | return -EINVAL; | |
77 | } | |
78 | early_param("vsyscall", vsyscall_setup); | |
79 | ||
2c622148 TB |
80 | void update_vsyscall_tz(void) |
81 | { | |
2c622148 | 82 | vsyscall_gtod_data.sys_tz = sys_tz; |
2c622148 TB |
83 | } |
84 | ||
650ea024 | 85 | void update_vsyscall(struct timekeeper *tk) |
1da177e4 | 86 | { |
650ea024 | 87 | struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; |
5cec93c2 | 88 | |
650ea024 | 89 | write_seqcount_begin(&vdata->seq); |
68fe7b23 | 90 | |
7460ed28 | 91 | /* copy vsyscall data */ |
650ea024 JS |
92 | vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode; |
93 | vdata->clock.cycle_last = tk->clock->cycle_last; | |
94 | vdata->clock.mask = tk->clock->mask; | |
95 | vdata->clock.mult = tk->mult; | |
96 | vdata->clock.shift = tk->shift; | |
97 | ||
98 | vdata->wall_time_sec = tk->xtime_sec; | |
99 | vdata->wall_time_snsec = tk->xtime_nsec; | |
100 | ||
101 | vdata->monotonic_time_sec = tk->xtime_sec | |
102 | + tk->wall_to_monotonic.tv_sec; | |
103 | vdata->monotonic_time_snsec = tk->xtime_nsec | |
104 | + (tk->wall_to_monotonic.tv_nsec | |
105 | << tk->shift); | |
106 | while (vdata->monotonic_time_snsec >= | |
107 | (((u64)NSEC_PER_SEC) << tk->shift)) { | |
108 | vdata->monotonic_time_snsec -= | |
109 | ((u64)NSEC_PER_SEC) << tk->shift; | |
110 | vdata->monotonic_time_sec++; | |
111 | } | |
91ec87d5 | 112 | |
650ea024 JS |
113 | vdata->wall_time_coarse.tv_sec = tk->xtime_sec; |
114 | vdata->wall_time_coarse.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); | |
91ec87d5 | 115 | |
650ea024 JS |
116 | vdata->monotonic_time_coarse = timespec_add(vdata->wall_time_coarse, |
117 | tk->wall_to_monotonic); | |
5cec93c2 | 118 | |
650ea024 | 119 | write_seqcount_end(&vdata->seq); |
1da177e4 LT |
120 | } |
121 | ||
5cec93c2 AL |
122 | static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, |
123 | const char *message) | |
1da177e4 | 124 | { |
c767a54b | 125 | if (!show_unhandled_signals) |
5cec93c2 | 126 | return; |
1da177e4 | 127 | |
c767a54b JP |
128 | pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", |
129 | level, current->comm, task_pid_nr(current), | |
130 | message, regs->ip, regs->cs, | |
131 | regs->sp, regs->ax, regs->si, regs->di); | |
c9712944 AL |
132 | } |
133 | ||
134 | static int addr_to_vsyscall_nr(unsigned long addr) | |
135 | { | |
136 | int nr; | |
137 | ||
138 | if ((addr & ~0xC00UL) != VSYSCALL_START) | |
139 | return -EINVAL; | |
140 | ||
141 | nr = (addr & 0xC00UL) >> 10; | |
142 | if (nr >= 3) | |
143 | return -EINVAL; | |
144 | ||
145 | return nr; | |
1da177e4 LT |
146 | } |
147 | ||
4fc34901 AL |
148 | static bool write_ok_or_segv(unsigned long ptr, size_t size) |
149 | { | |
150 | /* | |
151 | * XXX: if access_ok, get_user, and put_user handled | |
152 | * sig_on_uaccess_error, this could go away. | |
153 | */ | |
154 | ||
155 | if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { | |
156 | siginfo_t info; | |
157 | struct thread_struct *thread = ¤t->thread; | |
158 | ||
159 | thread->error_code = 6; /* user fault, no page, write */ | |
160 | thread->cr2 = ptr; | |
51e7dc70 | 161 | thread->trap_nr = X86_TRAP_PF; |
4fc34901 AL |
162 | |
163 | memset(&info, 0, sizeof(info)); | |
164 | info.si_signo = SIGSEGV; | |
165 | info.si_errno = 0; | |
166 | info.si_code = SEGV_MAPERR; | |
167 | info.si_addr = (void __user *)ptr; | |
168 | ||
169 | force_sig_info(SIGSEGV, &info, current); | |
170 | return false; | |
171 | } else { | |
172 | return true; | |
173 | } | |
174 | } | |
175 | ||
3ae36655 | 176 | bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) |
1da177e4 | 177 | { |
5cec93c2 AL |
178 | struct task_struct *tsk; |
179 | unsigned long caller; | |
87b526d3 | 180 | int vsyscall_nr, syscall_nr, tmp; |
4fc34901 | 181 | int prev_sig_on_uaccess_error; |
5cec93c2 AL |
182 | long ret; |
183 | ||
3ae36655 AL |
184 | /* |
185 | * No point in checking CS -- the only way to get here is a user mode | |
186 | * trap to a high address, which means that we're in 64-bit user code. | |
187 | */ | |
5cec93c2 | 188 | |
3ae36655 | 189 | WARN_ON_ONCE(address != regs->ip); |
c9712944 | 190 | |
3ae36655 AL |
191 | if (vsyscall_mode == NONE) { |
192 | warn_bad_vsyscall(KERN_INFO, regs, | |
193 | "vsyscall attempted with vsyscall=none"); | |
194 | return false; | |
c9712944 AL |
195 | } |
196 | ||
3ae36655 | 197 | vsyscall_nr = addr_to_vsyscall_nr(address); |
c149a665 AL |
198 | |
199 | trace_emulate_vsyscall(vsyscall_nr); | |
200 | ||
c9712944 AL |
201 | if (vsyscall_nr < 0) { |
202 | warn_bad_vsyscall(KERN_WARNING, regs, | |
3ae36655 | 203 | "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); |
5cec93c2 AL |
204 | goto sigsegv; |
205 | } | |
d0aff6e6 | 206 | |
5cec93c2 | 207 | if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { |
3ae36655 AL |
208 | warn_bad_vsyscall(KERN_WARNING, regs, |
209 | "vsyscall with bad stack (exploit attempt?)"); | |
5cec93c2 AL |
210 | goto sigsegv; |
211 | } | |
8c73626a | 212 | |
5cec93c2 | 213 | tsk = current; |
4fc34901 AL |
214 | |
215 | /* | |
87b526d3 AL |
216 | * Check for access_ok violations and find the syscall nr. |
217 | * | |
46ed99d1 | 218 | * NULL is a valid user pointer (in the access_ok sense) on 32-bit and |
4fc34901 | 219 | * 64-bit, so we don't need to special-case it here. For all the |
46ed99d1 | 220 | * vsyscalls, NULL means "don't write anything" not "write it at |
4fc34901 AL |
221 | * address 0". |
222 | */ | |
5cec93c2 AL |
223 | switch (vsyscall_nr) { |
224 | case 0: | |
4fc34901 | 225 | if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || |
87b526d3 AL |
226 | !write_ok_or_segv(regs->si, sizeof(struct timezone))) { |
227 | ret = -EFAULT; | |
228 | goto check_fault; | |
229 | } | |
4fc34901 | 230 | |
87b526d3 AL |
231 | syscall_nr = __NR_gettimeofday; |
232 | break; | |
233 | ||
234 | case 1: | |
235 | if (!write_ok_or_segv(regs->di, sizeof(time_t))) { | |
236 | ret = -EFAULT; | |
237 | goto check_fault; | |
238 | } | |
239 | ||
240 | syscall_nr = __NR_time; | |
241 | break; | |
242 | ||
243 | case 2: | |
244 | if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || | |
245 | !write_ok_or_segv(regs->si, sizeof(unsigned))) { | |
246 | ret = -EFAULT; | |
247 | goto check_fault; | |
248 | } | |
249 | ||
250 | syscall_nr = __NR_getcpu; | |
251 | break; | |
252 | } | |
253 | ||
254 | /* | |
255 | * Handle seccomp. regs->ip must be the original value. | |
256 | * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. | |
257 | * | |
258 | * We could optimize the seccomp disabled case, but performance | |
259 | * here doesn't matter. | |
260 | */ | |
261 | regs->orig_ax = syscall_nr; | |
262 | regs->ax = -ENOSYS; | |
263 | tmp = secure_computing(syscall_nr); | |
264 | if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { | |
265 | warn_bad_vsyscall(KERN_DEBUG, regs, | |
266 | "seccomp tried to change syscall nr or ip"); | |
267 | do_exit(SIGSYS); | |
268 | } | |
269 | if (tmp) | |
270 | goto do_ret; /* skip requested */ | |
271 | ||
272 | /* | |
273 | * With a real vsyscall, page faults cause SIGSEGV. We want to | |
274 | * preserve that behavior to make writing exploits harder. | |
275 | */ | |
276 | prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; | |
277 | current_thread_info()->sig_on_uaccess_error = 1; | |
278 | ||
279 | ret = -EFAULT; | |
280 | switch (vsyscall_nr) { | |
281 | case 0: | |
5cec93c2 AL |
282 | ret = sys_gettimeofday( |
283 | (struct timeval __user *)regs->di, | |
284 | (struct timezone __user *)regs->si); | |
285 | break; | |
286 | ||
287 | case 1: | |
5cec93c2 AL |
288 | ret = sys_time((time_t __user *)regs->di); |
289 | break; | |
290 | ||
291 | case 2: | |
5cec93c2 AL |
292 | ret = sys_getcpu((unsigned __user *)regs->di, |
293 | (unsigned __user *)regs->si, | |
46ed99d1 | 294 | NULL); |
5cec93c2 | 295 | break; |
5cec93c2 | 296 | } |
8c73626a | 297 | |
4fc34901 AL |
298 | current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; |
299 | ||
87b526d3 | 300 | check_fault: |
5cec93c2 | 301 | if (ret == -EFAULT) { |
4fc34901 | 302 | /* Bad news -- userspace fed a bad pointer to a vsyscall. */ |
5cec93c2 AL |
303 | warn_bad_vsyscall(KERN_INFO, regs, |
304 | "vsyscall fault (exploit attempt?)"); | |
4fc34901 AL |
305 | |
306 | /* | |
307 | * If we failed to generate a signal for any reason, | |
308 | * generate one here. (This should be impossible.) | |
309 | */ | |
310 | if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && | |
311 | !sigismember(&tsk->pending.signal, SIGSEGV))) | |
312 | goto sigsegv; | |
313 | ||
314 | return true; /* Don't emulate the ret. */ | |
5cec93c2 | 315 | } |
8c73626a | 316 | |
5cec93c2 | 317 | regs->ax = ret; |
1da177e4 | 318 | |
5651721e | 319 | do_ret: |
5cec93c2 AL |
320 | /* Emulate a ret instruction. */ |
321 | regs->ip = caller; | |
322 | regs->sp += 8; | |
3ae36655 | 323 | return true; |
5cec93c2 AL |
324 | |
325 | sigsegv: | |
5cec93c2 | 326 | force_sig(SIGSEGV, current); |
3ae36655 | 327 | return true; |
1da177e4 LT |
328 | } |
329 | ||
5cec93c2 AL |
330 | /* |
331 | * Assume __initcall executes before all user space. Hopefully kmod | |
332 | * doesn't violate that. We'll find out if it does. | |
333 | */ | |
148f9bb8 | 334 | static void vsyscall_set_cpu(int cpu) |
c08c8205 | 335 | { |
fc8b8a60 | 336 | unsigned long d; |
c08c8205 VP |
337 | unsigned long node = 0; |
338 | #ifdef CONFIG_NUMA | |
98c9e27a | 339 | node = cpu_to_node(cpu); |
c08c8205 | 340 | #endif |
92cb7612 | 341 | if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) |
8c131af1 | 342 | write_rdtscp_aux((node << 12) | cpu); |
c08c8205 | 343 | |
5cec93c2 AL |
344 | /* |
345 | * Store cpu number in limit so that it can be loaded quickly | |
346 | * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node) | |
347 | */ | |
fc8b8a60 JF |
348 | d = 0x0f40000000000ULL; |
349 | d |= cpu; | |
350 | d |= (node & 0xf) << 12; | |
351 | d |= (node >> 4) << 48; | |
5cec93c2 | 352 | |
fc8b8a60 | 353 | write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); |
c08c8205 VP |
354 | } |
355 | ||
/*
 * Cross-call callback: run vsyscall_set_cpu() for the CPU this is
 * executing on.  @arg is unused.
 */
static void cpu_vsyscall_init(void *arg)
{
	/* preemption should be already off */
	int this_cpu = raw_smp_processor_id();

	vsyscall_set_cpu(this_cpu);
}
361 | ||
148f9bb8 | 362 | static int |
8c131af1 AK |
363 | cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) |
364 | { | |
365 | long cpu = (long)arg; | |
5cec93c2 | 366 | |
8bb78442 | 367 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) |
8691e5a8 | 368 | smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); |
5cec93c2 | 369 | |
8c131af1 AK |
370 | return NOTIFY_DONE; |
371 | } | |
372 | ||
e4026440 | 373 | void __init map_vsyscall(void) |
1da177e4 | 374 | { |
3ae36655 AL |
375 | extern char __vsyscall_page; |
376 | unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); | |
9fd67b4e AL |
377 | extern char __vvar_page; |
378 | unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page); | |
1da177e4 | 379 | |
3ae36655 AL |
380 | __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall, |
381 | vsyscall_mode == NATIVE | |
382 | ? PAGE_KERNEL_VSYSCALL | |
383 | : PAGE_KERNEL_VVAR); | |
384 | BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) != | |
385 | (unsigned long)VSYSCALL_START); | |
386 | ||
9fd67b4e | 387 | __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR); |
3ae36655 AL |
388 | BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != |
389 | (unsigned long)VVAR_ADDRESS); | |
1da177e4 LT |
390 | } |
391 | ||
392 | static int __init vsyscall_init(void) | |
393 | { | |
5cec93c2 AL |
394 | BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)); |
395 | ||
15c8b6c1 | 396 | on_each_cpu(cpu_vsyscall_init, NULL, 1); |
be43f83d SY |
397 | /* notifier priority > KVM */ |
398 | hotcpu_notifier(cpu_vsyscall_notifier, 30); | |
5cec93c2 | 399 | |
1da177e4 LT |
400 | return 0; |
401 | } | |
1da177e4 | 402 | __initcall(vsyscall_init); |