/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */
19 #include <linux/stackprotector.h>
20 #include <linux/cpu.h>
21 #include <linux/errno.h>
22 #include <linux/sched.h>
24 #include <linux/kernel.h>
26 #include <linux/elfcore.h>
27 #include <linux/smp.h>
28 #include <linux/slab.h>
29 #include <linux/user.h>
30 #include <linux/interrupt.h>
31 #include <linux/utsname.h>
32 #include <linux/delay.h>
33 #include <linux/module.h>
34 #include <linux/ptrace.h>
35 #include <linux/random.h>
36 #include <linux/notifier.h>
37 #include <linux/kprobes.h>
38 #include <linux/kdebug.h>
39 #include <linux/tick.h>
40 #include <linux/prctl.h>
42 #include <asm/uaccess.h>
43 #include <asm/pgtable.h>
44 #include <asm/system.h>
46 #include <asm/processor.h>
48 #include <asm/mmu_context.h>
50 #include <asm/prctl.h>
52 #include <asm/proto.h>
56 asmlinkage
extern void ret_from_fork(void);
58 unsigned long kernel_thread_flags
= CLONE_VM
| CLONE_UNTRACED
;
60 unsigned long boot_option_idle_override
= 0;
61 EXPORT_SYMBOL(boot_option_idle_override
);
64 * Powermanagement idle function, if any..
66 void (*pm_idle
)(void);
67 EXPORT_SYMBOL(pm_idle
);
69 static ATOMIC_NOTIFIER_HEAD(idle_notifier
);
71 void idle_notifier_register(struct notifier_block
*n
)
73 atomic_notifier_chain_register(&idle_notifier
, n
);
79 atomic_notifier_call_chain(&idle_notifier
, IDLE_START
, NULL
);
82 static void __exit_idle(void)
84 if (test_and_clear_bit_pda(0, isidle
) == 0)
86 atomic_notifier_call_chain(&idle_notifier
, IDLE_END
, NULL
);
89 /* Called from interrupts to signify idle end */
92 /* idle loop has pid 0 */
99 * We use this if we don't have any better
102 void default_idle(void)
104 current_thread_info()->status
&= ~TS_POLLING
;
106 * TS_POLLING-cleared state must be visible before we
111 safe_halt(); /* enables interrupts racelessly */
114 current_thread_info()->status
|= TS_POLLING
;
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	mb();
	/* Ack it: the hotplug code polls cpu_state for CPU_DEAD */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	/* mask all interrupts, flush any and all caches, and halt */
	wbinvd_halt();
}
#else
static inline void play_dead(void)
{
	/* Without CPU hotplug an offline CPU should never reach here. */
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
142 * The idle thread. There's no useful work to be
143 * done, so just try to conserve power and have a
144 * low exit latency (ie sit in a loop waiting for
145 * somebody to say that they'd like to reschedule)
149 current_thread_info()->status
|= TS_POLLING
;
152 * If we're the non-boot CPU, nothing set the PDA stack
153 * canary up for us - and if we are the boot CPU we have
154 * a 0 stack canary. This is a good place for updating
155 * it, as we wont ever return from this function (so the
156 * invalid canaries already on the stack wont ever
159 boot_init_stack_canary();
161 /* endless idle loop with no priority at all */
163 tick_nohz_stop_sched_tick();
164 while (!need_resched()) {
171 if (cpu_is_offline(smp_processor_id()))
174 * Idle routines should keep interrupts disabled
175 * from here on, until they go to idle.
176 * Otherwise, idle callbacks can misfire.
181 /* In many cases the interrupt that ended idle
182 has already called exit_idle. But some idle
183 loops can be woken up without interrupt. */
187 tick_nohz_restart_sched_tick();
188 preempt_enable_no_resched();
194 /* Prints also some state that isn't saved in the pt_regs */
195 void __show_regs(struct pt_regs
* regs
)
197 unsigned long cr0
= 0L, cr2
= 0L, cr3
= 0L, cr4
= 0L, fs
, gs
, shadowgs
;
198 unsigned long d0
, d1
, d2
, d3
, d6
, d7
;
199 unsigned int fsindex
, gsindex
;
200 unsigned int ds
, cs
, es
;
204 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
205 current
->pid
, current
->comm
, print_tainted(),
206 init_utsname()->release
,
207 (int)strcspn(init_utsname()->version
, " "),
208 init_utsname()->version
);
209 printk("RIP: %04lx:[<%016lx>] ", regs
->cs
& 0xffff, regs
->ip
);
210 printk_address(regs
->ip
, 1);
211 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs
->ss
, regs
->sp
,
213 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
214 regs
->ax
, regs
->bx
, regs
->cx
);
215 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
216 regs
->dx
, regs
->si
, regs
->di
);
217 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
218 regs
->bp
, regs
->r8
, regs
->r9
);
219 printk("R10: %016lx R11: %016lx R12: %016lx\n",
220 regs
->r10
, regs
->r11
, regs
->r12
);
221 printk("R13: %016lx R14: %016lx R15: %016lx\n",
222 regs
->r13
, regs
->r14
, regs
->r15
);
224 asm("movl %%ds,%0" : "=r" (ds
));
225 asm("movl %%cs,%0" : "=r" (cs
));
226 asm("movl %%es,%0" : "=r" (es
));
227 asm("movl %%fs,%0" : "=r" (fsindex
));
228 asm("movl %%gs,%0" : "=r" (gsindex
));
230 rdmsrl(MSR_FS_BASE
, fs
);
231 rdmsrl(MSR_GS_BASE
, gs
);
232 rdmsrl(MSR_KERNEL_GS_BASE
, shadowgs
);
239 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
240 fs
,fsindex
,gs
,gsindex
,shadowgs
);
241 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs
, ds
, es
, cr0
);
242 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2
, cr3
, cr4
);
247 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0
, d1
, d2
);
251 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3
, d6
, d7
);
254 void show_regs(struct pt_regs
*regs
)
256 printk("CPU %d:", smp_processor_id());
258 show_trace(NULL
, regs
, (void *)(regs
+ 1), regs
->bp
);
262 * Free current thread data structures etc..
264 void exit_thread(void)
266 struct task_struct
*me
= current
;
267 struct thread_struct
*t
= &me
->thread
;
269 if (me
->thread
.io_bitmap_ptr
) {
270 struct tss_struct
*tss
= &per_cpu(init_tss
, get_cpu());
272 kfree(t
->io_bitmap_ptr
);
273 t
->io_bitmap_ptr
= NULL
;
274 clear_thread_flag(TIF_IO_BITMAP
);
276 * Careful, clear this in the TSS too:
278 memset(tss
->io_bitmap
, 0xff, t
->io_bitmap_max
);
279 t
->io_bitmap_max
= 0;
284 void flush_thread(void)
286 struct task_struct
*tsk
= current
;
288 if (test_tsk_thread_flag(tsk
, TIF_ABI_PENDING
)) {
289 clear_tsk_thread_flag(tsk
, TIF_ABI_PENDING
);
290 if (test_tsk_thread_flag(tsk
, TIF_IA32
)) {
291 clear_tsk_thread_flag(tsk
, TIF_IA32
);
293 set_tsk_thread_flag(tsk
, TIF_IA32
);
294 current_thread_info()->status
|= TS_COMPAT
;
297 clear_tsk_thread_flag(tsk
, TIF_DEBUG
);
299 tsk
->thread
.debugreg0
= 0;
300 tsk
->thread
.debugreg1
= 0;
301 tsk
->thread
.debugreg2
= 0;
302 tsk
->thread
.debugreg3
= 0;
303 tsk
->thread
.debugreg6
= 0;
304 tsk
->thread
.debugreg7
= 0;
305 memset(tsk
->thread
.tls_array
, 0, sizeof(tsk
->thread
.tls_array
));
307 * Forget coprocessor state..
313 void release_thread(struct task_struct
*dead_task
)
316 if (dead_task
->mm
->context
.size
) {
317 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
319 dead_task
->mm
->context
.ldt
,
320 dead_task
->mm
->context
.size
);
326 static inline void set_32bit_tls(struct task_struct
*t
, int tls
, u32 addr
)
328 struct user_desc ud
= {
335 struct desc_struct
*desc
= t
->thread
.tls_array
;
340 static inline u32
read_32bit_tls(struct task_struct
*t
, int tls
)
342 return get_desc_base(&t
->thread
.tls_array
[tls
]);
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	/* Flush lazy FPU state into the task struct so the child
	 * copies a consistent snapshot. */
	unlazy_fpu(tsk);
}
354 int copy_thread(int nr
, unsigned long clone_flags
, unsigned long sp
,
355 unsigned long unused
,
356 struct task_struct
* p
, struct pt_regs
* regs
)
359 struct pt_regs
* childregs
;
360 struct task_struct
*me
= current
;
362 childregs
= ((struct pt_regs
*)
363 (THREAD_SIZE
+ task_stack_page(p
))) - 1;
369 childregs
->sp
= (unsigned long)childregs
;
371 p
->thread
.sp
= (unsigned long) childregs
;
372 p
->thread
.sp0
= (unsigned long) (childregs
+1);
373 p
->thread
.usersp
= me
->thread
.usersp
;
375 set_tsk_thread_flag(p
, TIF_FORK
);
377 p
->thread
.fs
= me
->thread
.fs
;
378 p
->thread
.gs
= me
->thread
.gs
;
380 asm("mov %%gs,%0" : "=m" (p
->thread
.gsindex
));
381 asm("mov %%fs,%0" : "=m" (p
->thread
.fsindex
));
382 asm("mov %%es,%0" : "=m" (p
->thread
.es
));
383 asm("mov %%ds,%0" : "=m" (p
->thread
.ds
));
385 if (unlikely(test_tsk_thread_flag(me
, TIF_IO_BITMAP
))) {
386 p
->thread
.io_bitmap_ptr
= kmalloc(IO_BITMAP_BYTES
, GFP_KERNEL
);
387 if (!p
->thread
.io_bitmap_ptr
) {
388 p
->thread
.io_bitmap_max
= 0;
391 memcpy(p
->thread
.io_bitmap_ptr
, me
->thread
.io_bitmap_ptr
,
393 set_tsk_thread_flag(p
, TIF_IO_BITMAP
);
397 * Set a new TLS for the child thread?
399 if (clone_flags
& CLONE_SETTLS
) {
400 #ifdef CONFIG_IA32_EMULATION
401 if (test_thread_flag(TIF_IA32
))
402 err
= do_set_thread_area(p
, -1,
403 (struct user_desc __user
*)childregs
->si
, 0);
406 err
= do_arch_prctl(p
, ARCH_SET_FS
, childregs
->r8
);
412 if (err
&& p
->thread
.io_bitmap_ptr
) {
413 kfree(p
->thread
.io_bitmap_ptr
);
414 p
->thread
.io_bitmap_max
= 0;
420 start_thread(struct pt_regs
*regs
, unsigned long new_ip
, unsigned long new_sp
)
422 asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
426 write_pda(oldrsp
, new_sp
);
427 regs
->cs
= __USER_CS
;
428 regs
->ss
= __USER_DS
;
432 * Free the old FP and other extended state
434 free_thread_xstate(current
);
436 EXPORT_SYMBOL_GPL(start_thread
);
438 static void hard_disable_TSC(void)
440 write_cr4(read_cr4() | X86_CR4_TSD
);
443 void disable_TSC(void)
446 if (!test_and_set_thread_flag(TIF_NOTSC
))
448 * Must flip the CPU state synchronously with
449 * TIF_NOTSC in the current running context.
455 static void hard_enable_TSC(void)
457 write_cr4(read_cr4() & ~X86_CR4_TSD
);
460 static void enable_TSC(void)
463 if (test_and_clear_thread_flag(TIF_NOTSC
))
465 * Must flip the CPU state synchronously with
466 * TIF_NOTSC in the current running context.
472 int get_tsc_mode(unsigned long adr
)
476 if (test_thread_flag(TIF_NOTSC
))
477 val
= PR_TSC_SIGSEGV
;
481 return put_user(val
, (unsigned int __user
*)adr
);
484 int set_tsc_mode(unsigned int val
)
486 if (val
== PR_TSC_SIGSEGV
)
488 else if (val
== PR_TSC_ENABLE
)
497 * This special macro can be used to load a debugging register
499 #define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
501 static inline void __switch_to_xtra(struct task_struct
*prev_p
,
502 struct task_struct
*next_p
,
503 struct tss_struct
*tss
)
505 struct thread_struct
*prev
, *next
;
506 unsigned long debugctl
;
508 prev
= &prev_p
->thread
,
509 next
= &next_p
->thread
;
511 debugctl
= prev
->debugctlmsr
;
512 if (next
->ds_area_msr
!= prev
->ds_area_msr
) {
513 /* we clear debugctl to make sure DS
514 * is not in use when we change it */
516 update_debugctlmsr(0);
517 wrmsrl(MSR_IA32_DS_AREA
, next
->ds_area_msr
);
520 if (next
->debugctlmsr
!= debugctl
)
521 update_debugctlmsr(next
->debugctlmsr
);
523 if (test_tsk_thread_flag(next_p
, TIF_DEBUG
)) {
533 if (test_tsk_thread_flag(prev_p
, TIF_NOTSC
) ^
534 test_tsk_thread_flag(next_p
, TIF_NOTSC
)) {
535 /* prev and next are different */
536 if (test_tsk_thread_flag(next_p
, TIF_NOTSC
))
542 if (test_tsk_thread_flag(next_p
, TIF_IO_BITMAP
)) {
544 * Copy the relevant range of the IO bitmap.
545 * Normally this is 128 bytes or less:
547 memcpy(tss
->io_bitmap
, next
->io_bitmap_ptr
,
548 max(prev
->io_bitmap_max
, next
->io_bitmap_max
));
549 } else if (test_tsk_thread_flag(prev_p
, TIF_IO_BITMAP
)) {
551 * Clear any possible leftover bits:
553 memset(tss
->io_bitmap
, 0xff, prev
->io_bitmap_max
);
557 if (test_tsk_thread_flag(prev_p
, TIF_BTS_TRACE_TS
))
558 ptrace_bts_take_timestamp(prev_p
, BTS_TASK_DEPARTS
);
560 if (test_tsk_thread_flag(next_p
, TIF_BTS_TRACE_TS
))
561 ptrace_bts_take_timestamp(next_p
, BTS_TASK_ARRIVES
);
566 * switch_to(x,y) should switch tasks from x to y.
568 * This could still be optimized:
569 * - fold all the options into a flag word and test it with a single test.
570 * - could test fs/gs bitsliced
572 * Kprobes not supported here. Set the probe on schedule instead.
575 __switch_to(struct task_struct
*prev_p
, struct task_struct
*next_p
)
577 struct thread_struct
*prev
= &prev_p
->thread
,
578 *next
= &next_p
->thread
;
579 int cpu
= smp_processor_id();
580 struct tss_struct
*tss
= &per_cpu(init_tss
, cpu
);
582 /* we're going to use this soon, after a few expensive things */
583 if (next_p
->fpu_counter
>5)
584 prefetch(next
->xstate
);
587 * Reload esp0, LDT and the page table pointer:
593 * This won't pick up thread selector changes, but I guess that is ok.
595 asm volatile("mov %%es,%0" : "=m" (prev
->es
));
596 if (unlikely(next
->es
| prev
->es
))
597 loadsegment(es
, next
->es
);
599 asm volatile ("mov %%ds,%0" : "=m" (prev
->ds
));
600 if (unlikely(next
->ds
| prev
->ds
))
601 loadsegment(ds
, next
->ds
);
610 asm volatile("movl %%fs,%0" : "=r" (fsindex
));
611 /* segment register != 0 always requires a reload.
612 also reload when it has changed.
613 when prev process used 64bit base always reload
614 to avoid an information leak. */
615 if (unlikely(fsindex
| next
->fsindex
| prev
->fs
)) {
616 loadsegment(fs
, next
->fsindex
);
617 /* check if the user used a selector != 0
618 * if yes clear 64bit base, since overloaded base
619 * is always mapped to the Null selector
624 /* when next process has a 64bit base use it */
626 wrmsrl(MSR_FS_BASE
, next
->fs
);
627 prev
->fsindex
= fsindex
;
631 asm volatile("movl %%gs,%0" : "=r" (gsindex
));
632 if (unlikely(gsindex
| next
->gsindex
| prev
->gs
)) {
633 load_gs_index(next
->gsindex
);
638 wrmsrl(MSR_KERNEL_GS_BASE
, next
->gs
);
639 prev
->gsindex
= gsindex
;
642 /* Must be after DS reload */
646 * Switch the PDA and FPU contexts.
648 prev
->usersp
= read_pda(oldrsp
);
649 write_pda(oldrsp
, next
->usersp
);
650 write_pda(pcurrent
, next_p
);
652 write_pda(kernelstack
,
653 (unsigned long)task_stack_page(next_p
) + THREAD_SIZE
- PDA_STACKOFFSET
);
654 #ifdef CONFIG_CC_STACKPROTECTOR
656 * Build time only check to make sure the stack_canary is at
657 * offset 40 in the pda; this is a gcc ABI requirement
659 BUILD_BUG_ON(offsetof(struct x8664_pda
, stack_canary
) != 40);
663 * Now maybe reload the debug registers and handle I/O bitmaps
665 if (unlikely(task_thread_info(next_p
)->flags
& _TIF_WORK_CTXSW_NEXT
||
666 task_thread_info(prev_p
)->flags
& _TIF_WORK_CTXSW_PREV
))
667 __switch_to_xtra(prev_p
, next_p
, tss
);
669 /* If the task has used fpu the last 5 timeslices, just do a full
670 * restore of the math state immediately to avoid the trap; the
671 * chances of needing FPU soon are obviously high now
673 if (next_p
->fpu_counter
>5)
674 math_state_restore();
679 * sys_execve() executes a new program.
682 long sys_execve(char __user
*name
, char __user
* __user
*argv
,
683 char __user
* __user
*envp
, struct pt_regs
*regs
)
688 filename
= getname(name
);
689 error
= PTR_ERR(filename
);
690 if (IS_ERR(filename
))
692 error
= do_execve(filename
, argv
, envp
, regs
);
697 void set_personality_64bit(void)
699 /* inherit personality from parent */
701 /* Make sure to be in 64bit mode */
702 clear_thread_flag(TIF_IA32
);
704 /* TBD: overwrites user setup. Should have two bits.
705 But 64bit processes have always behaved this way,
706 so it's not too bad. The main problem is just that
707 32bit childs are affected again. */
708 current
->personality
&= ~READ_IMPLIES_EXEC
;
711 asmlinkage
long sys_fork(struct pt_regs
*regs
)
713 return do_fork(SIGCHLD
, regs
->sp
, regs
, 0, NULL
, NULL
);
717 sys_clone(unsigned long clone_flags
, unsigned long newsp
,
718 void __user
*parent_tid
, void __user
*child_tid
, struct pt_regs
*regs
)
722 return do_fork(clone_flags
, newsp
, regs
, 0, parent_tid
, child_tid
);
726 * This is trivial, and on the face of it looks like it
727 * could equally well be done in user mode.
729 * Not so, for quite unobvious reasons - register pressure.
730 * In user mode vfork() cannot have a stack frame, and if
731 * done by calling the "clone()" system call directly, you
732 * do not have enough call-clobbered registers to hold all
733 * the information you need.
735 asmlinkage
long sys_vfork(struct pt_regs
*regs
)
737 return do_fork(CLONE_VFORK
| CLONE_VM
| SIGCHLD
, regs
->sp
, regs
, 0,
741 unsigned long get_wchan(struct task_struct
*p
)
747 if (!p
|| p
== current
|| p
->state
==TASK_RUNNING
)
749 stack
= (unsigned long)task_stack_page(p
);
750 if (p
->thread
.sp
< stack
|| p
->thread
.sp
> stack
+THREAD_SIZE
)
752 fp
= *(u64
*)(p
->thread
.sp
);
754 if (fp
< (unsigned long)stack
||
755 fp
> (unsigned long)stack
+THREAD_SIZE
)
758 if (!in_sched_functions(ip
))
761 } while (count
++ < 16);
765 long do_arch_prctl(struct task_struct
*task
, int code
, unsigned long addr
)
768 int doit
= task
== current
;
773 if (addr
>= TASK_SIZE_OF(task
))
776 /* handle small bases via the GDT because that's faster to
778 if (addr
<= 0xffffffff) {
779 set_32bit_tls(task
, GS_TLS
, addr
);
781 load_TLS(&task
->thread
, cpu
);
782 load_gs_index(GS_TLS_SEL
);
784 task
->thread
.gsindex
= GS_TLS_SEL
;
787 task
->thread
.gsindex
= 0;
788 task
->thread
.gs
= addr
;
791 ret
= checking_wrmsrl(MSR_KERNEL_GS_BASE
, addr
);
797 /* Not strictly needed for fs, but do it for symmetry
799 if (addr
>= TASK_SIZE_OF(task
))
802 /* handle small bases via the GDT because that's faster to
804 if (addr
<= 0xffffffff) {
805 set_32bit_tls(task
, FS_TLS
, addr
);
807 load_TLS(&task
->thread
, cpu
);
808 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL
));
810 task
->thread
.fsindex
= FS_TLS_SEL
;
813 task
->thread
.fsindex
= 0;
814 task
->thread
.fs
= addr
;
816 /* set the selector to 0 to not confuse
818 asm volatile("movl %0,%%fs" :: "r" (0));
819 ret
= checking_wrmsrl(MSR_FS_BASE
, addr
);
826 if (task
->thread
.fsindex
== FS_TLS_SEL
)
827 base
= read_32bit_tls(task
, FS_TLS
);
829 rdmsrl(MSR_FS_BASE
, base
);
831 base
= task
->thread
.fs
;
832 ret
= put_user(base
, (unsigned long __user
*)addr
);
838 if (task
->thread
.gsindex
== GS_TLS_SEL
)
839 base
= read_32bit_tls(task
, GS_TLS
);
841 asm("movl %%gs,%0" : "=r" (gsindex
));
843 rdmsrl(MSR_KERNEL_GS_BASE
, base
);
845 base
= task
->thread
.gs
;
848 base
= task
->thread
.gs
;
849 ret
= put_user(base
, (unsigned long __user
*)addr
);
861 long sys_arch_prctl(int code
, unsigned long addr
)
863 return do_arch_prctl(current
, code
, addr
);
866 unsigned long arch_align_stack(unsigned long sp
)
868 if (!(current
->personality
& ADDR_NO_RANDOMIZE
) && randomize_va_space
)
869 sp
-= get_random_int() % 8192;
873 unsigned long arch_randomize_brk(struct mm_struct
*mm
)
875 unsigned long range_end
= mm
->brk
+ 0x02000000;
876 return randomize_range(mm
->brk
, range_end
, 0) ? : mm
->brk
;