x86, um: switch to generic fork/vfork/clone
arch/x86/kernel/process_64.c
/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * X86-64 port
 * Andi Kleen.
 *
 * CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>

asmlinkage extern void ret_from_fork(void);

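/*
 * Per-CPU cache of the current task's user-mode stack pointer.  The
 * 64-bit SYSCALL entry path stashes the user %rsp here rather than in
 * pt_regs, and __switch_to() swaps it with thread.usersp on every
 * context switch.
 */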
DEFINE_PER_CPU(unsigned long, old_rsp);

/* Also prints some state that isn't saved in pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        show_regs_common();
        printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
        printk_address(regs->ip, 1);
        printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
               regs->sp, regs->flags);
        printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->ax, regs->bx, regs->cx);
        printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->dx, regs->si, regs->di);
        printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->bp, regs->r8, regs->r9);
        printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        if (!all)
                return;

        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = read_cr3();
        cr4 = read_cr4();

        printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
               es, cr0);
        printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
               cr4);

        get_debugreg(d0, 0);
        get_debugreg(d1, 1);
        get_debugreg(d2, 2);
        printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
        get_debugreg(d3, 3);
        get_debugreg(d6, 6);
        get_debugreg(d7, 7);
        printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

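/*
 * Called when the task is finally reaped.  Any LDT attached to the mm
 * must already have been torn down by the mm teardown path by this
 * point; finding one here is a bug.
 */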
void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                                dead_task->comm,
                                dead_task->mm->context.ldt,
                                dead_task->mm->context.size);
                        BUG();
                }
        }
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct desc_struct *desc = t->thread.tls_array;
        desc += tls;
        fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        return get_desc_base(&t->thread.tls_array[tls]);
}

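/*
 * Note: with the generic fork/vfork/clone paths, the regs argument is
 * no longer used here; the child's register frame is copied from
 * current_pt_regs() below.
 */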
int copy_thread(unsigned long clone_flags, unsigned long sp,
                unsigned long arg,
                struct task_struct *p, struct pt_regs *regs)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
        childregs = task_pt_regs(p);
        p->thread.sp = (unsigned long) childregs;
        p->thread.usersp = me->thread.usersp;
        set_tsk_thread_flag(p, TIF_FORK);
        p->fpu_counter = 0;
        p->thread.io_bitmap_ptr = NULL;

        savesegment(gs, p->thread.gsindex);
        p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
        savesegment(fs, p->thread.fsindex);
        p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
        savesegment(es, p->thread.es);
        savesegment(ds, p->thread.ds);
        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

        if (unlikely(p->flags & PF_KTHREAD)) {
                /* kernel thread */
                memset(childregs, 0, sizeof(struct pt_regs));
                childregs->sp = (unsigned long)childregs;
                childregs->ss = __KERNEL_DS;
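                /*
                 * ret_from_fork (entry_64.S) detects a kernel thread by
                 * the lack of user bits in the saved CS, moves %rbp into
                 * %rdi and calls *%rbx, so bx and bp below carry the
                 * thread function and its argument.
                 */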
                childregs->bx = sp; /* function */
                childregs->bp = arg;
                childregs->orig_ax = -1;
                childregs->cs = __KERNEL_CS | get_kernel_rpl();
                childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
                return 0;
        }
        *childregs = *current_pt_regs();

        childregs->ax = 0;
        if (sp)
                childregs->sp = sp;

        err = -ENOMEM;

        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
                                                  IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
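        /*
         * The tls pointer arrives in whichever register held clone()'s
         * tls argument: %esi for 32-bit children entering through the
         * compat path, %r8 for native 64-bit children.
         */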
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = do_set_thread_area(p, -1,
                                (struct user_desc __user *)childregs->si, 0);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }

        return err;
}

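/*
 * Reset the segment registers and build the user-mode register frame so
 * that the first return to user space lands at new_ip with stack new_sp.
 */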
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
                    unsigned long new_sp,
                    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
        loadsegment(fs, 0);
        loadsegment(es, _ds);
        loadsegment(ds, _ds);
        load_gs_index(0);
        current->thread.usersp = new_sp;
        regs->ip = new_ip;
        regs->sp = new_sp;
        this_cpu_write(old_rsp, new_sp);
        regs->cs = _cs;
        regs->ss = _ss;
        regs->flags = X86_EFLAGS_IF;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            test_thread_flag(TIF_X32)
                            ? __USER_CS : __USER32_CS,
                            __USER_DS, __USER_DS);
}
#endif

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here; set the probe on schedule instead.
 * The function graph tracer is not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread;
        struct thread_struct *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        unsigned fsindex, gsindex;
        fpu_switch_t fpu;

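        /*
         * Decide up front whether the incoming task's FPU state will be
         * restored eagerly or lazily; switch_fpu_finish() below
         * completes whatever this choice set up.
         */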
        fpu = switch_fpu_prepare(prev_p, next_p, cpu);

        /*
         * Reload sp0: the kernel stack pointer the CPU loads on ring
         * transitions into this task.
         */
        load_sp0(tss, next);

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        savesegment(es, prev->es);
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        savesegment(ds, prev->ds);
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);

        /*
         * We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS().
         *
         * (e.g. xen_load_tls())
         */
        savesegment(fs, fsindex);
        savesegment(gs, gsindex);

        load_TLS(next, cpu);

        /*
         * Leave lazy mode, flushing any hypercalls made here.
         * This must be done before restoring TLS segments so
         * the GDT and LDT are properly updated, and must be
         * done before math_state_restore, so the TS bit is up
         * to date.
         */
        arch_end_context_switch(next_p);

        /*
         * Switch FS and GS.
         *
         * A nonzero segment register always requires a reload, as does
         * a changed selector.  When the previous task used a 64-bit
         * base, always reload to avoid leaking its base address.
         */
        if (unlikely(fsindex | next->fsindex | prev->fs)) {
                loadsegment(fs, next->fsindex);
                /*
                 * If the user loaded a selector != 0, clear the saved
                 * 64-bit base: the base is only meaningful together
                 * with a null selector.
                 */
                if (fsindex)
                        prev->fs = 0;
        }
        /* When the next task has a 64-bit base, use it. */
        if (next->fs)
                wrmsrl(MSR_FS_BASE, next->fs);
        prev->fsindex = fsindex;

        if (unlikely(gsindex | next->gsindex | prev->gs)) {
                load_gs_index(next->gsindex);
                if (gsindex)
                        prev->gs = 0;
        }
        if (next->gs)
                wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
        prev->gsindex = gsindex;

        switch_fpu_finish(next_p, fpu);

        /*
         * Switch the per-CPU bookkeeping: the cached user stack pointer
         * and the current task pointer.
         */
        prev->usersp = this_cpu_read(old_rsp);
        this_cpu_write(old_rsp, next->usersp);
        this_cpu_write(current_task, next_p);

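        /*
         * kernel_stack is read by the SYSCALL entry code to locate the
         * top of the incoming task's kernel stack.
         */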
        this_cpu_write(kernel_stack,
                       (unsigned long)task_stack_page(next_p) +
                       THREAD_SIZE - KERNEL_STACK_OFFSET);

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
                     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
                __switch_to_xtra(prev_p, next_p, tss);

        return prev_p;
}

void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);
        clear_thread_flag(TIF_ADDR32);
        clear_thread_flag(TIF_X32);

        /* Ensure the corresponding mm is not marked. */
        if (current->mm)
                current->mm->context.ia32_compat = 0;

        /*
         * TBD: this overwrites the user's setup; there should really be
         * two bits.  But 64-bit processes have always behaved this way,
         * so it's not too bad.  The main problem is just that 32-bit
         * children are affected again.
         */
        current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(bool x32)
{
        /* inherit personality from parent */

        /* Make sure to be in 32bit mode */
        set_thread_flag(TIF_ADDR32);

        /* Mark the associated mm as containing 32-bit tasks. */
        if (current->mm)
                current->mm->context.ia32_compat = 1;

        if (x32) {
                clear_thread_flag(TIF_IA32);
                set_thread_flag(TIF_X32);
                current->personality &= ~READ_IMPLIES_EXEC;
                /*
                 * is_compat_task() uses the presence of the x32 syscall
                 * bit flag to determine compat status.
                 */
                current_thread_info()->status &= ~TS_COMPAT;
        } else {
                set_thread_flag(TIF_IA32);
                clear_thread_flag(TIF_X32);
                current->personality |= force_personality32;
                /* Prepare the first "return" to user space */
                current_thread_info()->status |= TS_COMPAT;
        }
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, ip;
        int count = 0;

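        /*
         * Walk the saved frame pointers on the sleeping task's kernel
         * stack (at most 16 frames) until we find a return address
         * outside the scheduler; that is the function the task is
         * blocked in.
         */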
        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.sp);
        do {
                if (fp < (unsigned long)stack ||
                    fp >= (unsigned long)stack+THREAD_SIZE)
                        return 0;
                ip = *(u64 *)(fp+8);
                if (!in_sched_functions(ip))
                        return ip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /*
                 * Handle small bases via the GDT because that's faster
                 * to switch.
                 */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /*
                 * Not strictly needed for fs, but do it for symmetry
                 * with gs.
                 */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /*
                 * Handle small bases via the GDT because that's faster
                 * to switch.
                 */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                loadsegment(fs, FS_TLS_SEL);
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /*
                                 * Set the selector to 0 so as not to
                                 * confuse __switch_to.
                                 */
                                loadsegment(fs, 0);
                                ret = wrmsrl_safe(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        savesegment(gs, gsindex);
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

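/*
 * Syscall entry point.  From user space this is reached through the
 * arch_prctl(2) wrapper; a minimal sketch of a caller:
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);
 */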
long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}

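/*
 * The 64-bit SYSCALL fast path keeps the user stack pointer in the
 * per-cpu old_rsp / thread.usersp rather than in pt_regs, so only for
 * compat (TIF_IA32) tasks is pt_regs->sp the authoritative value.
 */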
unsigned long KSTK_ESP(struct task_struct *task)
{
        return (test_tsk_thread_flag(task, TIF_IA32)) ?
                (task_pt_regs(task)->sp) : ((task)->thread.usersp);
}