/*
 * Copyright (C) 1995  Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * CPU hotplug support - ashok.raj@intel.com
 *
 * This file handles the architecture-dependent parts of process handling..
 */
19 #include <linux/cpu.h>
20 #include <linux/errno.h>
21 #include <linux/sched.h>
23 #include <linux/kernel.h>
25 #include <linux/elfcore.h>
26 #include <linux/smp.h>
27 #include <linux/slab.h>
28 #include <linux/user.h>
29 #include <linux/interrupt.h>
30 #include <linux/utsname.h>
31 #include <linux/delay.h>
32 #include <linux/module.h>
33 #include <linux/ptrace.h>
34 #include <linux/random.h>
35 #include <linux/notifier.h>
36 #include <linux/kprobes.h>
37 #include <linux/kdebug.h>
38 #include <linux/tick.h>
39 #include <linux/prctl.h>
41 #include <asm/uaccess.h>
42 #include <asm/pgtable.h>
43 #include <asm/system.h>
45 #include <asm/processor.h>
47 #include <asm/mmu_context.h>
49 #include <asm/prctl.h>
51 #include <asm/proto.h>
55 asmlinkage
extern void ret_from_fork(void);
57 unsigned long kernel_thread_flags
= CLONE_VM
| CLONE_UNTRACED
;
59 unsigned long boot_option_idle_override
= 0;
60 EXPORT_SYMBOL(boot_option_idle_override
);
63 * Powermanagement idle function, if any..
65 void (*pm_idle
)(void);
66 EXPORT_SYMBOL(pm_idle
);
68 static ATOMIC_NOTIFIER_HEAD(idle_notifier
);
70 void idle_notifier_register(struct notifier_block
*n
)
72 atomic_notifier_chain_register(&idle_notifier
, n
);
78 atomic_notifier_call_chain(&idle_notifier
, IDLE_START
, NULL
);
81 static void __exit_idle(void)
83 if (test_and_clear_bit_pda(0, isidle
) == 0)
85 atomic_notifier_call_chain(&idle_notifier
, IDLE_END
, NULL
);
88 /* Called from interrupts to signify idle end */
91 /* idle loop has pid 0 */
98 * We use this if we don't have any better
101 void default_idle(void)
103 current_thread_info()->status
&= ~TS_POLLING
;
105 * TS_POLLING-cleared state must be visible before we
110 safe_halt(); /* enables interrupts racelessly */
113 current_thread_info()->status
|= TS_POLLING
;
116 #ifdef CONFIG_HOTPLUG_CPU
117 DECLARE_PER_CPU(int, cpu_state
);
120 /* We halt the CPU with physical CPU hotplug */
121 static inline void play_dead(void)
127 __get_cpu_var(cpu_state
) = CPU_DEAD
;
134 static inline void play_dead(void)
138 #endif /* CONFIG_HOTPLUG_CPU */
141 * The idle thread. There's no useful work to be
142 * done, so just try to conserve power and have a
143 * low exit latency (ie sit in a loop waiting for
144 * somebody to say that they'd like to reschedule)
148 current_thread_info()->status
|= TS_POLLING
;
151 * If we're the non-boot CPU, nothing set the PDA stack
152 * canary up for us - and if we are the boot CPU we have
153 * a 0 stack canary. This is a good place for updating
154 * it, as we wont ever return from this function (so the
155 * invalid canaries already on the stack wont ever
158 boot_init_stack_canary();
160 /* endless idle loop with no priority at all */
162 tick_nohz_stop_sched_tick();
163 while (!need_resched()) {
170 if (cpu_is_offline(smp_processor_id()))
173 * Idle routines should keep interrupts disabled
174 * from here on, until they go to idle.
175 * Otherwise, idle callbacks can misfire.
180 /* In many cases the interrupt that ended idle
181 has already called exit_idle. But some idle
182 loops can be woken up without interrupt. */
186 tick_nohz_restart_sched_tick();
187 preempt_enable_no_resched();
193 /* Prints also some state that isn't saved in the pt_regs */
194 void __show_regs(struct pt_regs
* regs
)
196 unsigned long cr0
= 0L, cr2
= 0L, cr3
= 0L, cr4
= 0L, fs
, gs
, shadowgs
;
197 unsigned long d0
, d1
, d2
, d3
, d6
, d7
;
198 unsigned int fsindex
, gsindex
;
199 unsigned int ds
, cs
, es
;
203 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
204 current
->pid
, current
->comm
, print_tainted(),
205 init_utsname()->release
,
206 (int)strcspn(init_utsname()->version
, " "),
207 init_utsname()->version
);
208 printk("RIP: %04lx:[<%016lx>] ", regs
->cs
& 0xffff, regs
->ip
);
209 printk_address(regs
->ip
, 1);
210 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs
->ss
, regs
->sp
,
212 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
213 regs
->ax
, regs
->bx
, regs
->cx
);
214 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
215 regs
->dx
, regs
->si
, regs
->di
);
216 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
217 regs
->bp
, regs
->r8
, regs
->r9
);
218 printk("R10: %016lx R11: %016lx R12: %016lx\n",
219 regs
->r10
, regs
->r11
, regs
->r12
);
220 printk("R13: %016lx R14: %016lx R15: %016lx\n",
221 regs
->r13
, regs
->r14
, regs
->r15
);
223 asm("movl %%ds,%0" : "=r" (ds
));
224 asm("movl %%cs,%0" : "=r" (cs
));
225 asm("movl %%es,%0" : "=r" (es
));
226 asm("movl %%fs,%0" : "=r" (fsindex
));
227 asm("movl %%gs,%0" : "=r" (gsindex
));
229 rdmsrl(MSR_FS_BASE
, fs
);
230 rdmsrl(MSR_GS_BASE
, gs
);
231 rdmsrl(MSR_KERNEL_GS_BASE
, shadowgs
);
238 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
239 fs
,fsindex
,gs
,gsindex
,shadowgs
);
240 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs
, ds
, es
, cr0
);
241 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2
, cr3
, cr4
);
246 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0
, d1
, d2
);
250 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3
, d6
, d7
);
253 void show_regs(struct pt_regs
*regs
)
255 printk("CPU %d:", smp_processor_id());
257 show_trace(NULL
, regs
, (void *)(regs
+ 1), regs
->bp
);
261 * Free current thread data structures etc..
263 void exit_thread(void)
265 struct task_struct
*me
= current
;
266 struct thread_struct
*t
= &me
->thread
;
268 if (me
->thread
.io_bitmap_ptr
) {
269 struct tss_struct
*tss
= &per_cpu(init_tss
, get_cpu());
271 kfree(t
->io_bitmap_ptr
);
272 t
->io_bitmap_ptr
= NULL
;
273 clear_thread_flag(TIF_IO_BITMAP
);
275 * Careful, clear this in the TSS too:
277 memset(tss
->io_bitmap
, 0xff, t
->io_bitmap_max
);
278 t
->io_bitmap_max
= 0;
283 void flush_thread(void)
285 struct task_struct
*tsk
= current
;
287 if (test_tsk_thread_flag(tsk
, TIF_ABI_PENDING
)) {
288 clear_tsk_thread_flag(tsk
, TIF_ABI_PENDING
);
289 if (test_tsk_thread_flag(tsk
, TIF_IA32
)) {
290 clear_tsk_thread_flag(tsk
, TIF_IA32
);
292 set_tsk_thread_flag(tsk
, TIF_IA32
);
293 current_thread_info()->status
|= TS_COMPAT
;
296 clear_tsk_thread_flag(tsk
, TIF_DEBUG
);
298 tsk
->thread
.debugreg0
= 0;
299 tsk
->thread
.debugreg1
= 0;
300 tsk
->thread
.debugreg2
= 0;
301 tsk
->thread
.debugreg3
= 0;
302 tsk
->thread
.debugreg6
= 0;
303 tsk
->thread
.debugreg7
= 0;
304 memset(tsk
->thread
.tls_array
, 0, sizeof(tsk
->thread
.tls_array
));
306 * Forget coprocessor state..
312 void release_thread(struct task_struct
*dead_task
)
315 if (dead_task
->mm
->context
.size
) {
316 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
318 dead_task
->mm
->context
.ldt
,
319 dead_task
->mm
->context
.size
);
325 static inline void set_32bit_tls(struct task_struct
*t
, int tls
, u32 addr
)
327 struct user_desc ud
= {
334 struct desc_struct
*desc
= t
->thread
.tls_array
;
339 static inline u32
read_32bit_tls(struct task_struct
*t
, int tls
)
341 return get_desc_base(&t
->thread
.tls_array
[tls
]);
345 * This gets called before we allocate a new thread and copy
346 * the current task into it.
348 void prepare_to_copy(struct task_struct
*tsk
)
353 int copy_thread(int nr
, unsigned long clone_flags
, unsigned long sp
,
354 unsigned long unused
,
355 struct task_struct
* p
, struct pt_regs
* regs
)
358 struct pt_regs
* childregs
;
359 struct task_struct
*me
= current
;
361 childregs
= ((struct pt_regs
*)
362 (THREAD_SIZE
+ task_stack_page(p
))) - 1;
368 childregs
->sp
= (unsigned long)childregs
;
370 p
->thread
.sp
= (unsigned long) childregs
;
371 p
->thread
.sp0
= (unsigned long) (childregs
+1);
372 p
->thread
.usersp
= me
->thread
.usersp
;
374 set_tsk_thread_flag(p
, TIF_FORK
);
376 p
->thread
.fs
= me
->thread
.fs
;
377 p
->thread
.gs
= me
->thread
.gs
;
379 asm("mov %%gs,%0" : "=m" (p
->thread
.gsindex
));
380 asm("mov %%fs,%0" : "=m" (p
->thread
.fsindex
));
381 asm("mov %%es,%0" : "=m" (p
->thread
.es
));
382 asm("mov %%ds,%0" : "=m" (p
->thread
.ds
));
384 if (unlikely(test_tsk_thread_flag(me
, TIF_IO_BITMAP
))) {
385 p
->thread
.io_bitmap_ptr
= kmalloc(IO_BITMAP_BYTES
, GFP_KERNEL
);
386 if (!p
->thread
.io_bitmap_ptr
) {
387 p
->thread
.io_bitmap_max
= 0;
390 memcpy(p
->thread
.io_bitmap_ptr
, me
->thread
.io_bitmap_ptr
,
392 set_tsk_thread_flag(p
, TIF_IO_BITMAP
);
396 * Set a new TLS for the child thread?
398 if (clone_flags
& CLONE_SETTLS
) {
399 #ifdef CONFIG_IA32_EMULATION
400 if (test_thread_flag(TIF_IA32
))
401 err
= do_set_thread_area(p
, -1,
402 (struct user_desc __user
*)childregs
->si
, 0);
405 err
= do_arch_prctl(p
, ARCH_SET_FS
, childregs
->r8
);
411 if (err
&& p
->thread
.io_bitmap_ptr
) {
412 kfree(p
->thread
.io_bitmap_ptr
);
413 p
->thread
.io_bitmap_max
= 0;
419 start_thread(struct pt_regs
*regs
, unsigned long new_ip
, unsigned long new_sp
)
421 asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
425 write_pda(oldrsp
, new_sp
);
426 regs
->cs
= __USER_CS
;
427 regs
->ss
= __USER_DS
;
431 * Free the old FP and other extended state
433 free_thread_xstate(current
);
435 EXPORT_SYMBOL_GPL(start_thread
);
437 static void hard_disable_TSC(void)
439 write_cr4(read_cr4() | X86_CR4_TSD
);
442 void disable_TSC(void)
445 if (!test_and_set_thread_flag(TIF_NOTSC
))
447 * Must flip the CPU state synchronously with
448 * TIF_NOTSC in the current running context.
454 static void hard_enable_TSC(void)
456 write_cr4(read_cr4() & ~X86_CR4_TSD
);
459 static void enable_TSC(void)
462 if (test_and_clear_thread_flag(TIF_NOTSC
))
464 * Must flip the CPU state synchronously with
465 * TIF_NOTSC in the current running context.
471 int get_tsc_mode(unsigned long adr
)
475 if (test_thread_flag(TIF_NOTSC
))
476 val
= PR_TSC_SIGSEGV
;
480 return put_user(val
, (unsigned int __user
*)adr
);
483 int set_tsc_mode(unsigned int val
)
485 if (val
== PR_TSC_SIGSEGV
)
487 else if (val
== PR_TSC_ENABLE
)
496 * This special macro can be used to load a debugging register
498 #define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
500 static inline void __switch_to_xtra(struct task_struct
*prev_p
,
501 struct task_struct
*next_p
,
502 struct tss_struct
*tss
)
504 struct thread_struct
*prev
, *next
;
505 unsigned long debugctl
;
507 prev
= &prev_p
->thread
,
508 next
= &next_p
->thread
;
510 debugctl
= prev
->debugctlmsr
;
511 if (next
->ds_area_msr
!= prev
->ds_area_msr
) {
512 /* we clear debugctl to make sure DS
513 * is not in use when we change it */
515 update_debugctlmsr(0);
516 wrmsrl(MSR_IA32_DS_AREA
, next
->ds_area_msr
);
519 if (next
->debugctlmsr
!= debugctl
)
520 update_debugctlmsr(next
->debugctlmsr
);
522 if (test_tsk_thread_flag(next_p
, TIF_DEBUG
)) {
532 if (test_tsk_thread_flag(prev_p
, TIF_NOTSC
) ^
533 test_tsk_thread_flag(next_p
, TIF_NOTSC
)) {
534 /* prev and next are different */
535 if (test_tsk_thread_flag(next_p
, TIF_NOTSC
))
541 if (test_tsk_thread_flag(next_p
, TIF_IO_BITMAP
)) {
543 * Copy the relevant range of the IO bitmap.
544 * Normally this is 128 bytes or less:
546 memcpy(tss
->io_bitmap
, next
->io_bitmap_ptr
,
547 max(prev
->io_bitmap_max
, next
->io_bitmap_max
));
548 } else if (test_tsk_thread_flag(prev_p
, TIF_IO_BITMAP
)) {
550 * Clear any possible leftover bits:
552 memset(tss
->io_bitmap
, 0xff, prev
->io_bitmap_max
);
556 if (test_tsk_thread_flag(prev_p
, TIF_BTS_TRACE_TS
))
557 ptrace_bts_take_timestamp(prev_p
, BTS_TASK_DEPARTS
);
559 if (test_tsk_thread_flag(next_p
, TIF_BTS_TRACE_TS
))
560 ptrace_bts_take_timestamp(next_p
, BTS_TASK_ARRIVES
);
565 * switch_to(x,y) should switch tasks from x to y.
567 * This could still be optimized:
568 * - fold all the options into a flag word and test it with a single test.
569 * - could test fs/gs bitsliced
571 * Kprobes not supported here. Set the probe on schedule instead.
574 __switch_to(struct task_struct
*prev_p
, struct task_struct
*next_p
)
576 struct thread_struct
*prev
= &prev_p
->thread
,
577 *next
= &next_p
->thread
;
578 int cpu
= smp_processor_id();
579 struct tss_struct
*tss
= &per_cpu(init_tss
, cpu
);
581 /* we're going to use this soon, after a few expensive things */
582 if (next_p
->fpu_counter
>5)
583 prefetch(next
->xstate
);
586 * Reload esp0, LDT and the page table pointer:
592 * This won't pick up thread selector changes, but I guess that is ok.
594 asm volatile("mov %%es,%0" : "=m" (prev
->es
));
595 if (unlikely(next
->es
| prev
->es
))
596 loadsegment(es
, next
->es
);
598 asm volatile ("mov %%ds,%0" : "=m" (prev
->ds
));
599 if (unlikely(next
->ds
| prev
->ds
))
600 loadsegment(ds
, next
->ds
);
609 asm volatile("movl %%fs,%0" : "=r" (fsindex
));
610 /* segment register != 0 always requires a reload.
611 also reload when it has changed.
612 when prev process used 64bit base always reload
613 to avoid an information leak. */
614 if (unlikely(fsindex
| next
->fsindex
| prev
->fs
)) {
615 loadsegment(fs
, next
->fsindex
);
616 /* check if the user used a selector != 0
617 * if yes clear 64bit base, since overloaded base
618 * is always mapped to the Null selector
623 /* when next process has a 64bit base use it */
625 wrmsrl(MSR_FS_BASE
, next
->fs
);
626 prev
->fsindex
= fsindex
;
630 asm volatile("movl %%gs,%0" : "=r" (gsindex
));
631 if (unlikely(gsindex
| next
->gsindex
| prev
->gs
)) {
632 load_gs_index(next
->gsindex
);
637 wrmsrl(MSR_KERNEL_GS_BASE
, next
->gs
);
638 prev
->gsindex
= gsindex
;
641 /* Must be after DS reload */
645 * Switch the PDA and FPU contexts.
647 prev
->usersp
= read_pda(oldrsp
);
648 write_pda(oldrsp
, next
->usersp
);
649 write_pda(pcurrent
, next_p
);
651 write_pda(kernelstack
,
652 (unsigned long)task_stack_page(next_p
) + THREAD_SIZE
- PDA_STACKOFFSET
);
653 #ifdef CONFIG_CC_STACKPROTECTOR
655 * Build time only check to make sure the stack_canary is at
656 * offset 40 in the pda; this is a gcc ABI requirement
658 BUILD_BUG_ON(offsetof(struct x8664_pda
, stack_canary
) != 40);
662 * Now maybe reload the debug registers and handle I/O bitmaps
664 if (unlikely(task_thread_info(next_p
)->flags
& _TIF_WORK_CTXSW_NEXT
||
665 task_thread_info(prev_p
)->flags
& _TIF_WORK_CTXSW_PREV
))
666 __switch_to_xtra(prev_p
, next_p
, tss
);
668 /* If the task has used fpu the last 5 timeslices, just do a full
669 * restore of the math state immediately to avoid the trap; the
670 * chances of needing FPU soon are obviously high now
672 if (next_p
->fpu_counter
>5)
673 math_state_restore();
678 * sys_execve() executes a new program.
681 long sys_execve(char __user
*name
, char __user
* __user
*argv
,
682 char __user
* __user
*envp
, struct pt_regs
*regs
)
687 filename
= getname(name
);
688 error
= PTR_ERR(filename
);
689 if (IS_ERR(filename
))
691 error
= do_execve(filename
, argv
, envp
, regs
);
696 void set_personality_64bit(void)
698 /* inherit personality from parent */
700 /* Make sure to be in 64bit mode */
701 clear_thread_flag(TIF_IA32
);
703 /* TBD: overwrites user setup. Should have two bits.
704 But 64bit processes have always behaved this way,
705 so it's not too bad. The main problem is just that
706 32bit childs are affected again. */
707 current
->personality
&= ~READ_IMPLIES_EXEC
;
710 asmlinkage
long sys_fork(struct pt_regs
*regs
)
712 return do_fork(SIGCHLD
, regs
->sp
, regs
, 0, NULL
, NULL
);
716 sys_clone(unsigned long clone_flags
, unsigned long newsp
,
717 void __user
*parent_tid
, void __user
*child_tid
, struct pt_regs
*regs
)
721 return do_fork(clone_flags
, newsp
, regs
, 0, parent_tid
, child_tid
);
725 * This is trivial, and on the face of it looks like it
726 * could equally well be done in user mode.
728 * Not so, for quite unobvious reasons - register pressure.
729 * In user mode vfork() cannot have a stack frame, and if
730 * done by calling the "clone()" system call directly, you
731 * do not have enough call-clobbered registers to hold all
732 * the information you need.
734 asmlinkage
long sys_vfork(struct pt_regs
*regs
)
736 return do_fork(CLONE_VFORK
| CLONE_VM
| SIGCHLD
, regs
->sp
, regs
, 0,
740 unsigned long get_wchan(struct task_struct
*p
)
746 if (!p
|| p
== current
|| p
->state
==TASK_RUNNING
)
748 stack
= (unsigned long)task_stack_page(p
);
749 if (p
->thread
.sp
< stack
|| p
->thread
.sp
> stack
+THREAD_SIZE
)
751 fp
= *(u64
*)(p
->thread
.sp
);
753 if (fp
< (unsigned long)stack
||
754 fp
> (unsigned long)stack
+THREAD_SIZE
)
757 if (!in_sched_functions(ip
))
760 } while (count
++ < 16);
764 long do_arch_prctl(struct task_struct
*task
, int code
, unsigned long addr
)
767 int doit
= task
== current
;
772 if (addr
>= TASK_SIZE_OF(task
))
775 /* handle small bases via the GDT because that's faster to
777 if (addr
<= 0xffffffff) {
778 set_32bit_tls(task
, GS_TLS
, addr
);
780 load_TLS(&task
->thread
, cpu
);
781 load_gs_index(GS_TLS_SEL
);
783 task
->thread
.gsindex
= GS_TLS_SEL
;
786 task
->thread
.gsindex
= 0;
787 task
->thread
.gs
= addr
;
790 ret
= checking_wrmsrl(MSR_KERNEL_GS_BASE
, addr
);
796 /* Not strictly needed for fs, but do it for symmetry
798 if (addr
>= TASK_SIZE_OF(task
))
801 /* handle small bases via the GDT because that's faster to
803 if (addr
<= 0xffffffff) {
804 set_32bit_tls(task
, FS_TLS
, addr
);
806 load_TLS(&task
->thread
, cpu
);
807 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL
));
809 task
->thread
.fsindex
= FS_TLS_SEL
;
812 task
->thread
.fsindex
= 0;
813 task
->thread
.fs
= addr
;
815 /* set the selector to 0 to not confuse
817 asm volatile("movl %0,%%fs" :: "r" (0));
818 ret
= checking_wrmsrl(MSR_FS_BASE
, addr
);
825 if (task
->thread
.fsindex
== FS_TLS_SEL
)
826 base
= read_32bit_tls(task
, FS_TLS
);
828 rdmsrl(MSR_FS_BASE
, base
);
830 base
= task
->thread
.fs
;
831 ret
= put_user(base
, (unsigned long __user
*)addr
);
837 if (task
->thread
.gsindex
== GS_TLS_SEL
)
838 base
= read_32bit_tls(task
, GS_TLS
);
840 asm("movl %%gs,%0" : "=r" (gsindex
));
842 rdmsrl(MSR_KERNEL_GS_BASE
, base
);
844 base
= task
->thread
.gs
;
847 base
= task
->thread
.gs
;
848 ret
= put_user(base
, (unsigned long __user
*)addr
);
860 long sys_arch_prctl(int code
, unsigned long addr
)
862 return do_arch_prctl(current
, code
, addr
);
865 unsigned long arch_align_stack(unsigned long sp
)
867 if (!(current
->personality
& ADDR_NO_RANDOMIZE
) && randomize_va_space
)
868 sp
-= get_random_int() % 8192;
872 unsigned long arch_randomize_brk(struct mm_struct
*mm
)
874 unsigned long range_end
= mm
->brk
+ 0x02000000;
875 return randomize_range(mm
->brk
, range_end
, 0) ? : mm
->brk
;