[deliverable/linux.git] / arch / i386 / kernel / smp.c

/*
 *	Intel SMP support routines.
 *
 *	(c) 1995 Alan Cox, Building #3 <alan@redhat.com>
 *	(c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
 *
 *	This code is released under the GNU General Public License version 2 or
 *	later.
 */

#include <linux/init.h>

#include <linux/mm.h>
#include <linux/irq.h>
#include <linux/delay.h>
#include <linux/spinlock.h>
#include <linux/smp_lock.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/cache.h>
#include <linux/interrupt.h>

#include <asm/mtrr.h>
#include <asm/tlbflush.h>
#include <mach_apic.h>

/*
 *	Some notes on x86 processor bugs affecting SMP operation:
 *
 *	Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
 *	The Linux implications for SMP are handled as follows:
 *
 *	Pentium III / [Xeon]
 *		None of the E1AP-E3AP errata are visible to the user.
 *
 *	E1AP.	see PII A1AP
 *	E2AP.	see PII A2AP
 *	E3AP.	see PII A3AP
 *
 *	Pentium II / [Xeon]
 *		None of the A1AP-A3AP errata are visible to the user.
 *
 *	A1AP.	see PPro 1AP
 *	A2AP.	see PPro 2AP
 *	A3AP.	see PPro 7AP
 *
 *	Pentium Pro
 *		None of 1AP-9AP errata are visible to the normal user,
 *	except occasional delivery of 'spurious interrupt' as trap #15.
 *	This is very rare and a non-problem.
 *
 *	1AP.	Linux maps APIC as non-cacheable
 *	2AP.	worked around in hardware
 *	3AP.	fixed in C0 and above steppings microcode update.
 *		Linux does not use excessive STARTUP_IPIs.
 *	4AP.	worked around in hardware
 *	5AP.	symmetric IO mode (normal Linux operation) not affected.
 *		'noapic' mode has vector 0xf filled out properly.
 *	6AP.	'noapic' mode might be affected - fixed in later steppings
 *	7AP.	We do not assume writes to the LVT deasserting IRQs
 *	8AP.	We do not enable low power mode (deep sleep) during MP bootup
 *	9AP.	We do not use mixed mode
 *
 *	Pentium
 *		There is a marginal case where REP MOVS on 100MHz SMP
 *	machines with B stepping processors can fail. XXX should provide
 *	an L1cache=Writethrough or L1cache=off option.
 *
 *		B stepping CPUs may hang. There are hardware work arounds
 *	for this. We warn about it in case your board doesn't have the work
 *	arounds. Basically that's so I can tell anyone with a B stepping
 *	CPU and SMP problems "tough".
 *
 *	Specific items [From Pentium Processor Specification Update]
 *
 *	1AP.	Linux doesn't use remote read
 *	2AP.	Linux doesn't trust APIC errors
 *	3AP.	We work around this
 *	4AP.	Linux never generated 3 interrupts of the same priority
 *		to cause a lost local interrupt.
 *	5AP.	Remote read is never used
 *	6AP.	not affected - worked around in hardware
 *	7AP.	not affected - worked around in hardware
 *	8AP.	worked around in hardware - we get explicit CS errors if not
 *	9AP.	only 'noapic' mode affected. Might generate spurious
 *		interrupts, we log only the first one and count the
 *		rest silently.
 *	10AP.	not affected - worked around in hardware
 *	11AP.	Linux reads the APIC between writes to avoid this, as per
 *		the documentation. Make sure you preserve this as it affects
 *		the C stepping chips too.
 *	12AP.	not affected - worked around in hardware
 *	13AP.	not affected - worked around in hardware
 *	14AP.	we always deassert INIT during bootup
 *	15AP.	not affected - worked around in hardware
 *	16AP.	not affected - worked around in hardware
 *	17AP.	not affected - worked around in hardware
 *	18AP.	not affected - worked around in hardware
 *	19AP.	not affected - worked around in BIOS
 *
 *	If this sounds worrying believe me these bugs are either ___RARE___,
 *	or are signal timing bugs worked around in hardware and there's
 *	about nothing of note with C stepping upwards.
 */

/*
 * Per-CPU TLB bookkeeping, cacheline aligned.  The rest of this file reads
 * its .state and .active_mm members through per_cpu(cpu_tlbstate, cpu).
 * The initialiser is positional: &init_mm and 0.
 * NOTE(review): presumably these land in active_mm and state respectively;
 * confirm against the declaration of struct tlb_state.
 */
DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };

/*
 * the following functions deal with sending IPIs between CPUs.
 *
 * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
 */

/*
 * Build the value written to APIC_ICR: APIC_DM_FIXED and APIC_DEST_LOGICAL
 * combined with the caller's destination shortcut bits and the vector.
 */
static inline int __prepare_ICR (unsigned int shortcut, int vector)
{
	unsigned int icr = APIC_DM_FIXED | APIC_DEST_LOGICAL;

	icr |= shortcut;
	icr |= vector;

	return icr;
}

/*
 * Build the value written to APIC_ICR2 by passing the destination mask
 * through SET_APIC_DEST_FIELD().
 */
static inline int __prepare_ICR2 (unsigned int mask)
{
	int icr2 = SET_APIC_DEST_FIELD(mask);

	return icr2;
}

/*
 * Send the IPI 'vector' using the destination shortcut 'shortcut'.
 * Only APIC_ICR is written; the target field (APIC_ICR2) is not touched.
 */
void __send_IPI_shortcut(unsigned int shortcut, int vector)
{
	/*
	 * The ICR word depends only on the arguments, so build it up front.
	 * No need to touch the target chip field when a shortcut is used.
	 */
	const unsigned int icr = __prepare_ICR(shortcut, vector);

	/*
	 * Subtle. In the case of the 'never do double writes' workaround
	 * we have to lock out interrupts to be safe.  As we don't care
	 * about the value read we use an atomic rmw access to avoid costly
	 * cli/sti.  Otherwise we use an even cheaper single atomic write
	 * to the APIC.
	 */

	/* The previous IPI must have left the ICR before we reuse it. */
	apic_wait_icr_idle();

	/* The write to APIC_ICR is what fires the IPI off. */
	apic_write_around(APIC_ICR, icr);
}

/*
 * Send the IPI 'vector' to the calling CPU itself, using the
 * APIC_DEST_SELF shortcut.
 */
void fastcall send_IPI_self(int vector)
{
	__send_IPI_shortcut(APIC_DEST_SELF, vector);
}

/*
 * Send the IPI 'vector' to every CPU in 'cpumask' with a single message.
 *
 * Only the first word of the cpumask is used as the destination, which
 * is why this is only used on smaller machines.
 */
void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
{
	unsigned long dest = cpus_addr(cpumask)[0];
	unsigned long icr2, icr;
	unsigned long flags;

	/* Both register values depend only on the arguments. */
	icr2 = __prepare_ICR2(dest);
	icr = __prepare_ICR(0, vector);

	/* The ICR2/ICR pair must be programmed without being interrupted. */
	local_irq_save(flags);

	/* Wait for the previous IPI to leave the ICR. */
	apic_wait_icr_idle();

	/* Target chip field first ... */
	apic_write_around(APIC_ICR2, icr2);

	/* ... then the ICR itself; this write fires the IPI off. */
	apic_write_around(APIC_ICR, icr);

	local_irq_restore(flags);
}

/*
 * Send the IPI 'vector' to every CPU set in 'mask', one unicast message
 * per CPU.
 *
 * Hack. The clustered APIC addressing mode doesn't allow us to send to
 * an arbitrary mask, so a unicast is sent to each CPU instead. This
 * should be modified to do 1 message per cluster ID - mbligh
 */
void send_IPI_mask_sequence(cpumask_t mask, int vector)
{
	unsigned long icr, icr2, flags;
	unsigned int cpu;

	/* The ICR word is the same for every target; build it once. */
	icr = __prepare_ICR(0, vector);

	local_irq_save(flags);

	for (cpu = 0; cpu < NR_CPUS; ++cpu) {
		if (!cpu_isset(cpu, mask))
			continue;

		/* Wait for the previous IPI to leave the ICR. */
		apic_wait_icr_idle();

		/* Target chip field: this CPU's logical APIC id. */
		icr2 = __prepare_ICR2(cpu_to_logical_apicid(cpu));
		apic_write_around(APIC_ICR2, icr2);

		/* The write to APIC_ICR fires the IPI off. */
		apic_write_around(APIC_ICR, icr);
	}

	local_irq_restore(flags);
}

#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */

/*
 *	Smarter SMP flushing macros. 
 *		c/o Linus Torvalds.
 *
 *	These mean you can really definitely utterly forget about
 *	writing to user space from interrupts. (It's not allowed anyway).
 *
 *	Optimizations Manfred Spraul <manfred@colorfullife.com>
 */

/*
 * Parameters of the TLB flush request currently in flight.  They are
 * written by flush_tlb_others() while it holds tlbstate_lock and read by
 * smp_invalidate_interrupt() on the target CPUs.
 */
/* CPUs that still have to handle the request; each clears its own bit. */
static cpumask_t flush_cpumask;
/* The mm whose TLB entries are being flushed. */
static struct mm_struct * flush_mm;
/* Address to flush, or FLUSH_ALL for a full local TLB flush. */
static unsigned long flush_va;
/* Serialises flush requests: only one may be outstanding at a time. */
static DEFINE_SPINLOCK(tlbstate_lock);
/* Sentinel flush_va value requesting local_flush_tlb() instead of a single-address flush. */
#define FLUSH_ALL	0xffffffff

/*
 * Detach 'cpu' from the mm it is lazily holding on to.
 *
 * We cannot call mmdrop() because we are in interrupt context, 
 * instead update mm->cpu_vm_mask.
 *
 * We need to reload %cr3 since the page tables may be going
 * away from under us..
 *
 * Must not be called while the CPU's tlb state is TLBSTATE_OK; that case
 * hits BUG().
 */
static inline void leave_mm (unsigned long cpu)
{
	if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
		BUG();
	/* Drop this cpu from the mm's cpu_vm_mask. */
	cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
	/* Switch to the kernel page directory. */
	load_cr3(swapper_pg_dir);
}

/*
 *
 * The flush IPI assumes that a thread switch happens in this order:
 * [cpu0: the cpu that switches]
 * 1) switch_mm() either 1a) or 1b)
 * 1a) thread switch to a different mm
 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
 * 	Stop ipi delivery for the old mm. This is not synchronized with
 * 	the other cpus, but smp_invalidate_interrupt ignores flush ipis
 * 	for the wrong mm, and in the worst case we perform a superfluous
 * 	tlb flush.
 * 1a2) set cpu_tlbstate to TLBSTATE_OK
 * 	Now the smp_invalidate_interrupt won't call leave_mm if cpu0
 *	was in lazy tlb mode.
 * 1a3) update cpu_tlbstate[].active_mm
 * 	Now cpu0 accepts tlb flushes for the new mm.
 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
 * 	Now the other cpus will send tlb flush ipis.
 * 1a5) change cr3.
 * 1b) thread switch without mm change
 *	cpu_tlbstate[].active_mm is correct, cpu0 already handles
 *	flush ipis.
 * 1b1) set cpu_tlbstate to TLBSTATE_OK
 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
 * 	Atomically set the bit [other cpus will start sending flush ipis],
 * 	and test the bit.
 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
 * 2) switch %esp, ie current
 *
 * The interrupt must handle 2 special cases:
 * - cr3 is changed before %esp, ie. it cannot use current->{active_,}mm.
 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
 *   runs in kernel space, the cpu could load tlb entries for user space
 *   pages.
 *
 * The good news is that cpu_tlbstate is local to each cpu, no
 * write/read ordering problems.
 */

/*
 * TLB flush IPI:
 *
 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
 * 2) Leave the mm if we are in the lazy tlb mode.
 *
 * The request (flush_mm, flush_va, flush_cpumask) is published by
 * flush_tlb_others(), which spins until every targeted CPU has cleared
 * its bit in flush_cpumask.
 *
 * The local APIC is acknowledged on every path: returning from the
 * handler without an EOI would leave this vector in service.
 */

fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
{
	unsigned long cpu;

	cpu = get_cpu();

	if (!cpu_isset(cpu, flush_cpumask)) {
		/* 
		 * This was a BUG() but until someone can quote me the
		 * line from the intel manual that guarantees an IPI to
		 * multiple CPUs is retried _only_ on the erroring CPUs
		 * its staying as a return
		 *
		 * BUG();
		 *
		 * The interrupt was delivered all the same, so it still
		 * has to be acknowledged before we bail out.
		 */
		ack_APIC_irq();
		goto out;
	}

	/* Requests for an mm this cpu is not using are ignored. */
	if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
		if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
			if (flush_va == FLUSH_ALL)
				local_flush_tlb();
			else
				__flush_tlb_one(flush_va);
		} else
			/* lazy tlb mode: drop the mm instead of flushing */
			leave_mm(cpu);
	}
	ack_APIC_irq();
	/*
	 * Clearing our bit is what releases the requester, so the flush
	 * above must be ordered before it.
	 */
	smp_mb__before_clear_bit();
	cpu_clear(cpu, flush_cpumask);
	smp_mb__after_clear_bit();
out:
	put_cpu_no_resched();
}

/*
 * Ask the CPUs in 'cpumask' to flush TLB entries of 'mm' and wait until
 * all of them have done so.
 *
 * cpumask: target CPUs; must be non-empty, all online, and must not
 *          contain the calling CPU (each violation is a BUG_ON).
 * mm:      address space being flushed; must not be NULL.
 * va:      address to flush, or FLUSH_ALL.
 *
 * The callers in this file invoke this with preemption disabled.
 */
static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
						unsigned long va)
{
	cpumask_t tmp;
	/*
	 * A couple of (to be removed) sanity checks:
	 *
	 * - we do not send IPIs to not-yet booted CPUs.
	 * - current CPU must not be in mask
	 * - mask must exist :)
	 */
	BUG_ON(cpus_empty(cpumask));

	cpus_and(tmp, cpumask, cpu_online_map);
	BUG_ON(!cpus_equal(cpumask, tmp));
	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
	BUG_ON(!mm);

	/*
	 * i'm not happy about this global shared spinlock in the
	 * MM hot path, but we'll see how contended it is.
	 *
	 * NOTE(review): this comment used to say the lock "temporarily
	 * turns IRQs off, so that lockups are detected by the NMI
	 * watchdog", but plain spin_lock() is used here and nothing in
	 * this function disables interrupts - confirm which is intended.
	 */
	spin_lock(&tlbstate_lock);
	
	/* Publish the request before any target can see its bit set. */
	flush_mm = mm;
	flush_va = va;
#if NR_CPUS <= BITS_PER_LONG
	atomic_set_mask(cpumask, &flush_cpumask);
#else
	{
		/* The mask spans several longs: set it one word at a time. */
		int k;
		unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
		unsigned long *cpu_mask = (unsigned long *)&cpumask;
		for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
			atomic_set_mask(cpu_mask[k], &flush_mask[k]);
	}
#endif
	/*
	 * We have to send the IPI only to
	 * CPUs affected.
	 */
	send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);

	/* Each target clears its own bit in smp_invalidate_interrupt(). */
	while (!cpus_empty(flush_cpumask))
		/* nothing. lockup detection does not belong here */
		mb();

	/* Request complete: reset the shared parameters. */
	flush_mm = NULL;
	flush_va = 0;
	spin_unlock(&tlbstate_lock);
}
	
/*
 * Flush the whole TLB for the current task's mm: locally on this CPU,
 * and via flush_tlb_others() on every other CPU in mm->cpu_vm_mask.
 */
void flush_tlb_current_task(void)
{
	struct mm_struct *mm = current->mm;
	cpumask_t remote;

	/* Stay on this CPU while it is being excluded from the mask. */
	preempt_disable();

	remote = mm->cpu_vm_mask;
	cpu_clear(smp_processor_id(), remote);

	local_flush_tlb();

	if (cpus_empty(remote))
		goto done;

	flush_tlb_others(remote, mm, FLUSH_ALL);
done:
	preempt_enable();
}

/*
 * Flush all TLB entries belonging to 'mm' on every CPU listed in
 * mm->cpu_vm_mask.
 *
 * On the local CPU, if 'mm' is the active mm: a task that owns an mm
 * flushes its TLB, while one that has no mm of its own (current->mm is
 * NULL) simply leaves the mm.
 */
void flush_tlb_mm (struct mm_struct * mm)
{
	cpumask_t remote;
	unsigned long self;

	preempt_disable();

	remote = mm->cpu_vm_mask;
	self = smp_processor_id();
	cpu_clear(self, remote);

	if (current->active_mm == mm) {
		if (!current->mm)
			leave_mm(self);
		else
			local_flush_tlb();
	}

	/* Everybody else in the mask gets a flush IPI. */
	if (!cpus_empty(remote))
		flush_tlb_others(remote, mm, FLUSH_ALL);

	preempt_enable();
}

/*
 * Flush the TLB entry for address 'va' in the mm that owns 'vma', on
 * every CPU listed in that mm's cpu_vm_mask.
 *
 * On the local CPU, if that mm is the active mm: a task that owns an mm
 * flushes the single entry, while one that has no mm of its own
 * (current->mm is NULL) simply leaves the mm.
 */
void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
{
	struct mm_struct *mm = vma->vm_mm;
	cpumask_t remote;
	unsigned long self;

	preempt_disable();

	remote = mm->cpu_vm_mask;
	self = smp_processor_id();
	cpu_clear(self, remote);

	if (current->active_mm == mm) {
		if (!current->mm)
			leave_mm(self);
		else
			__flush_tlb_one(va);
	}

	/* Everybody else in the mask gets a flush IPI for this address. */
	if (!cpus_empty(remote))
		flush_tlb_others(remote, mm, va);

	preempt_enable();
}

/*
 * Per-CPU worker for flush_tlb_all(): flush the whole local TLB and, if
 * this CPU is in lazy tlb mode, leave the mm as well.  'info' is unused.
 */
static void do_flush_tlb_all(void* info)
{
	unsigned long self = smp_processor_id();

	__flush_tlb_all();

	if (per_cpu(cpu_tlbstate, self).state != TLBSTATE_LAZY)
		return;

	leave_mm(self);
}

/*
 * Run do_flush_tlb_all() on every CPU via on_each_cpu().
 * NOTE(review): the trailing 1, 1 presumably mirror the nonatomic/wait
 * arguments of smp_call_function(), i.e. the call waits for completion;
 * confirm against the on_each_cpu() prototype.
 */
void flush_tlb_all(void)
{
	on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
}

/*
 * this function sends a 'reschedule' IPI to another CPU.
 * it goes straight through and wastes no time serializing
 * anything. Worst case is that we lose a reschedule ...
 *
 * cpu: the single target CPU; RESCHEDULE_VECTOR is sent to a mask
 *      containing only that CPU.
 */
void smp_send_reschedule(int cpu)
{
	send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
}

/*
 * Structure and data for smp_call_function(). This is designed to minimise
 * static memory requirements. It also looks cleaner.
 *
 * The request itself lives on the initiator's stack; only a pointer to it
 * is global.
 */
/* Held by smp_call_function() for the whole duration of a request. */
static DEFINE_SPINLOCK(call_lock);

struct call_data_struct {
	/* function each remote CPU runs, and the argument it is given */
	void (*func) (void *info);
	void *info;
	/* incremented by each remote CPU once it has copied func/info/wait */
	atomic_t started;
	/* incremented by each remote CPU after func returns (only if wait) */
	atomic_t finished;
	/* non-zero: the initiator also waits for 'finished' */
	int wait;
};

/* The request in flight; set by smp_call_function() under call_lock. */
static struct call_data_struct * call_data;

/*
 * this function sends a 'generic call function' IPI to all other CPUs
 * in the system.
 *
 * [SUMMARY] Run a function on all other CPUs.
 * <func> The function to run. This must be fast and non-blocking.
 * <info> An arbitrary pointer to pass to the function.
 * <nonatomic> currently unused.
 * <wait> If true, wait (atomically) until function has completed on other CPUs.
 * [RETURNS] This implementation always returns 0. Does not return until
 * remote CPUs are nearly ready to execute <<func>> or are or have executed.
 *
 * You must not call this function with disabled interrupts or from a
 * hardware interrupt handler or from a bottom half handler.
 */

int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
			int wait)
{
	struct call_data_struct data;
	/* number of remote CPUs expected to respond */
	int cpus = num_online_cpus()-1;

	/* nobody else online: nothing to do */
	if (!cpus)
		return 0;

	/* Can deadlock when called with interrupts disabled */
	WARN_ON(irqs_disabled());

	data.func = func;
	data.info = info;
	atomic_set(&data.started, 0);
	data.wait = wait;
	/* 'finished' is only ever looked at when waiting */
	if (wait)
		atomic_set(&data.finished, 0);

	spin_lock(&call_lock);
	call_data = &data;
	/* make the request visible before the IPI goes out */
	mb();
	
	/* Send a message to all other CPUs and wait for them to respond */
	send_IPI_allbutself(CALL_FUNCTION_VECTOR);

	/*
	 * Wait for response: 'data' lives on this stack, so every remote
	 * CPU must have copied it before we may return.
	 */
	while (atomic_read(&data.started) != cpus)
		cpu_relax();

	if (wait)
		while (atomic_read(&data.finished) != cpus)
			cpu_relax();
	spin_unlock(&call_lock);

	return 0;
}

/*
 * Take the calling CPU out of service; never returns.  'dummy' is unused.
 */
static void stop_this_cpu (void * dummy)
{
	/* Remove this CPU from the online map. */
	cpu_clear(smp_processor_id(), cpu_online_map);

	local_irq_disable();
	disable_local_APIC();

	/* Halt forever when hlt is known to work on this CPU ... */
	if (cpu_data[smp_processor_id()].hlt_works_ok) {
		while (1)
			__asm__("hlt");
	}

	/* ... otherwise just spin. */
	while (1)
		;
}

/*
 * this function calls the 'stop' function on all other CPUs in the system.
 *
 * The call is made with wait == 0: stop_this_cpu() never returns, so
 * there is no completion to wait for.  Afterwards the local APIC of the
 * calling CPU is disabled with interrupts off, and interrupts are then
 * re-enabled.
 */

void smp_send_stop(void)
{
	smp_call_function(stop_this_cpu, NULL, 1, 0);

	local_irq_disable();
	disable_local_APIC();
	local_irq_enable();
}

/*
 * Reschedule call back. Nothing to do,
 * all the work is done automatically when
 * we return from the interrupt.
 *
 * The handler only acknowledges the local APIC; 'regs' is unused.
 */
fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
}

/*
 * Handler for the 'call function' IPI sent by smp_call_function(): run
 * the requested function on this CPU and report progress back through
 * the started/finished counters of the request.  'regs' is unused.
 */
fastcall void smp_call_function_interrupt(struct pt_regs *regs)
{
	/*
	 * Take private copies first: the request lives on the initiator's
	 * stack and may disappear once 'started' has been bumped.
	 */
	void (*func) (void *info) = call_data->func;
	void *info = call_data->info;
	int wait = call_data->wait;

	ack_APIC_irq();
	/*
	 * Notify initiating CPU that I've grabbed the data and am
	 * about to execute the function
	 */
	mb();
	atomic_inc(&call_data->started);
	/*
	 * At this point the info structure may be out of scope unless wait==1
	 */
	irq_enter();
	(*func)(info);
	irq_exit();

	/*
	 * With wait set the initiator is still spinning on 'finished', so
	 * call_data is still valid here.
	 */
	if (wait) {
		mb();
		atomic_inc(&call_data->finished);
	}
}
Commit	Line	Data
1da177e4 LT	1	/*
	2	* Intel SMP support routines.
	3	*
	4	* (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
	5	* (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
	6	*
	7	* This code is released under the GNU General Public License version 2 or
	8	* later.
	9	*/
	10
	11	#include <linux/init.h>
	12
	13	#include <linux/mm.h>
	14	#include <linux/irq.h>
	15	#include <linux/delay.h>
	16	#include <linux/spinlock.h>
	17	#include <linux/smp_lock.h>
	18	#include <linux/kernel_stat.h>
	19	#include <linux/mc146818rtc.h>
	20	#include <linux/cache.h>
	21	#include <linux/interrupt.h>
	22
	23	#include <asm/mtrr.h>
	24	#include <asm/tlbflush.h>
	25	#include <mach_apic.h>
	26
	27	/*
	28	* Some notes on x86 processor bugs affecting SMP operation:
	29	*
	30	* Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
	31	* The Linux implications for SMP are handled as follows:
	32	*
	33	* Pentium III / [Xeon]
	34	* None of the E1AP-E3AP errata are visible to the user.
	35	*
	36	* E1AP. see PII A1AP
	37	* E2AP. see PII A2AP
	38	* E3AP. see PII A3AP
	39	*
	40	* Pentium II / [Xeon]
	41	* None of the A1AP-A3AP errata are visible to the user.
	42	*
	43	* A1AP. see PPro 1AP
	44	* A2AP. see PPro 2AP
	45	* A3AP. see PPro 7AP
	46	*
	47	* Pentium Pro
	48	* None of 1AP-9AP errata are visible to the normal user,
	49	* except occasional delivery of 'spurious interrupt' as trap #15.
	50	* This is very rare and a non-problem.
	51	*
	52	* 1AP. Linux maps APIC as non-cacheable
	53	* 2AP. worked around in hardware
	54	* 3AP. fixed in C0 and above steppings microcode update.
	55	* Linux does not use excessive STARTUP_IPIs.
	56	* 4AP. worked around in hardware
	57	* 5AP. symmetric IO mode (normal Linux operation) not affected.
	58	* 'noapic' mode has vector 0xf filled out properly.
	59	* 6AP. 'noapic' mode might be affected - fixed in later steppings
	60	* 7AP. We do not assume writes to the LVT deassering IRQs
	61	* 8AP. We do not enable low power mode (deep sleep) during MP bootup
	62	* 9AP. We do not use mixed mode
	63	*
	64	* Pentium
65	* There is a marginal case where REP MOVS on 100MHz SMP
66	* machines with B stepping processors can fail. XXX should provide
67	* an L1cache=Writethrough or L1cache=off option.
68	*
69	* B stepping CPUs may hang. There are hardware work arounds
70	* for this. We warn about it in case your board doesn't have the work
71	* arounds. Basically thats so I can tell anyone with a B stepping
72	* CPU and SMP problems "tough".
73	*
74	* Specific items [From Pentium Processor Specification Update]
75	*
76	* 1AP. Linux doesn't use remote read
77	* 2AP. Linux doesn't trust APIC errors
78	* 3AP. We work around this
79	* 4AP. Linux never generated 3 interrupts of the same priority
80	* to cause a lost local interrupt.
81	* 5AP. Remote read is never used
82	* 6AP. not affected - worked around in hardware
83	* 7AP. not affected - worked around in hardware
84	* 8AP. worked around in hardware - we get explicit CS errors if not
85	* 9AP. only 'noapic' mode affected. Might generate spurious
86	* interrupts, we log only the first one and count the
87	* rest silently.
88	* 10AP. not affected - worked around in hardware
89	* 11AP. Linux reads the APIC between writes to avoid this, as per
90	* the documentation. Make sure you preserve this as it affects
91	* the C stepping chips too.
92	* 12AP. not affected - worked around in hardware
93	* 13AP. not affected - worked around in hardware
94	* 14AP. we always deassert INIT during bootup
95	* 15AP. not affected - worked around in hardware
96	* 16AP. not affected - worked around in hardware
97	* 17AP. not affected - worked around in hardware
98	* 18AP. not affected - worked around in hardware
99	* 19AP. not affected - worked around in BIOS
100	*
101	* If this sounds worrying believe me these bugs are either ___RARE___,
102	* or are signal timing bugs worked around in hardware and there's
103	* about nothing of note with C stepping upwards.
104	*/
105
106	DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
107
108	/*
109	* the following functions deal with sending IPIs between CPUs.
110	*
111	* We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
112	*/
113
114	static inline int __prepare_ICR (unsigned int shortcut, int vector)
115	{
116	return APIC_DM_FIXED | shortcut | vector | APIC_DEST_LOGICAL;
117	}
118
119	static inline int __prepare_ICR2 (unsigned int mask)
120	{
121	return SET_APIC_DEST_FIELD(mask);
122	}
123
124	void __send_IPI_shortcut(unsigned int shortcut, int vector)
125	{
126	/*
127	* Subtle. In the case of the 'never do double writes' workaround
128	* we have to lock out interrupts to be safe. As we don't care
129	* of the value read we use an atomic rmw access to avoid costly
130	* cli/sti. Otherwise we use an even cheaper single atomic write
131	* to the APIC.
132	*/
133	unsigned int cfg;
134
135	/*
136	* Wait for idle.
137	*/
138	apic_wait_icr_idle();
139
140	/*
141	* No need to touch the target chip field
142	*/
143	cfg = __prepare_ICR(shortcut, vector);
144
145	/*
146	* Send the IPI. The write to APIC_ICR fires this off.
147	*/
148	apic_write_around(APIC_ICR, cfg);
149	}
150
151	void fastcall send_IPI_self(int vector)
152	{
153	__send_IPI_shortcut(APIC_DEST_SELF, vector);
154	}
155
156	/*
157	* This is only used on smaller machines.
158	*/
159	void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
160	{
161	unsigned long mask = cpus_addr(cpumask)[0];
162	unsigned long cfg;
163	unsigned long flags;
164
165	local_irq_save(flags);
166
167	/*
168	* Wait for idle.
169	*/
170	apic_wait_icr_idle();
171
172	/*
173	* prepare target chip field
174	*/
175	cfg = __prepare_ICR2(mask);
176	apic_write_around(APIC_ICR2, cfg);
177
178	/*
179	* program the ICR
180	*/
181	cfg = __prepare_ICR(0, vector);
182
183	/*
184	* Send the IPI. The write to APIC_ICR fires this off.
185	*/
186	apic_write_around(APIC_ICR, cfg);
187
188	local_irq_restore(flags);
189	}
190
191	void send_IPI_mask_sequence(cpumask_t mask, int vector)
192	{
193	unsigned long cfg, flags;
194	unsigned int query_cpu;
195
196	/*
197	* Hack. The clustered APIC addressing mode doesn't allow us to send
198	* to an arbitrary mask, so I do a unicasts to each CPU instead. This
199	* should be modified to do 1 message per cluster ID - mbligh
200	*/
201
202	local_irq_save(flags);
203
204	for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
205	if (cpu_isset(query_cpu, mask)) {
206
207	/*
208	* Wait for idle.
209	*/
210	apic_wait_icr_idle();
211
212	/*
213	* prepare target chip field
214	*/
215	cfg = __prepare_ICR2(cpu_to_logical_apicid(query_cpu));
216	apic_write_around(APIC_ICR2, cfg);
217
218	/*
219	* program the ICR
220	*/
221	cfg = __prepare_ICR(0, vector);
222
223	/*
224	* Send the IPI. The write to APIC_ICR fires this off.
225	*/
226	apic_write_around(APIC_ICR, cfg);
227	}
228	}
229	local_irq_restore(flags);
230	}
231
232	#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
233
234	/*
235	* Smarter SMP flushing macros.
236	* c/o Linus Torvalds.
237	*
238	* These mean you can really definitely utterly forget about
239	* writing to user space from interrupts. (Its not allowed anyway).
240	*
241	* Optimizations Manfred Spraul <manfred@colorfullife.com>
242	*/
243
244	static cpumask_t flush_cpumask;
245	static struct mm_struct * flush_mm;
246	static unsigned long flush_va;
247	static DEFINE_SPINLOCK(tlbstate_lock);
248	#define FLUSH_ALL 0xffffffff
249
250	/*
251	* We cannot call mmdrop() because we are in interrupt context,
252	* instead update mm->cpu_vm_mask.
253	*
254	* We need to reload %cr3 since the page tables may be going
255	* away from under us..
256	*/
257	static inline void leave_mm (unsigned long cpu)
258	{
259	if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
260	BUG();
261	cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
262	load_cr3(swapper_pg_dir);
263	}
264
265	/*
266	*
267	* The flush IPI assumes that a thread switch happens in this order:
268	* [cpu0: the cpu that switches]
269	* 1) switch_mm() either 1a) or 1b)
270	* 1a) thread switch to a different mm
271	* 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
272	* Stop ipi delivery for the old mm. This is not synchronized with
273	* the other cpus, but smp_invalidate_interrupt ignore flush ipis
274	* for the wrong mm, and in the worst case we perform a superflous
275	* tlb flush.
276	* 1a2) set cpu_tlbstate to TLBSTATE_OK
277	* Now the smp_invalidate_interrupt won't call leave_mm if cpu0
278	* was in lazy tlb mode.
279	* 1a3) update cpu_tlbstate[].active_mm
280	* Now cpu0 accepts tlb flushes for the new mm.
281	* 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
282	* Now the other cpus will send tlb flush ipis.
283	* 1a4) change cr3.
284	* 1b) thread switch without mm change
285	* cpu_tlbstate[].active_mm is correct, cpu0 already handles
286	* flush ipis.
287	* 1b1) set cpu_tlbstate to TLBSTATE_OK
288	* 1b2) test_and_set the cpu bit in cpu_vm_mask.
289	* Atomically set the bit [other cpus will start sending flush ipis],
290	* and test the bit.
291	* 1b3) if the bit was 0: leave_mm was called, flush the tlb.
292	* 2) switch %%esp, ie current
293	*
294	* The interrupt must handle 2 special cases:
295	* - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
296	* - the cpu performs speculative tlb reads, i.e. even if the cpu only
297	* runs in kernel space, the cpu could load tlb entries for user space
298	* pages.
299	*
300	* The good news is that cpu_tlbstate is local to each cpu, no
301	* write/read ordering problems.
302	*/
303
304	/*
305	* TLB flush IPI:
306	*
307	* 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
308	* 2) Leave the mm if we are in the lazy tlb mode.
309	*/
310
311	fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
312	{
313	unsigned long cpu;
314
315	cpu = get_cpu();
316
317	if (!cpu_isset(cpu, flush_cpumask))
318	goto out;
319	/*
320	* This was a BUG() but until someone can quote me the
321	* line from the intel manual that guarantees an IPI to
322	* multiple CPUs is retried _only_ on the erroring CPUs
323	* its staying as a return
324	*
325	* BUG();
326	*/
327
328	if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
329	if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
330	if (flush_va == FLUSH_ALL)
331	local_flush_tlb();
332	else
333	__flush_tlb_one(flush_va);
334	} else
335	leave_mm(cpu);
336	}
337	ack_APIC_irq();
338	smp_mb__before_clear_bit();
339	cpu_clear(cpu, flush_cpumask);
340	smp_mb__after_clear_bit();
341	out:
342	put_cpu_no_resched();
343	}
344
345	static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
346	unsigned long va)
347	{
348	cpumask_t tmp;
349	/*
350	* A couple of (to be removed) sanity checks:
351	*
352	* - we do not send IPIs to not-yet booted CPUs.
353	* - current CPU must not be in mask
354	* - mask must exist :)
355	*/
356	BUG_ON(cpus_empty(cpumask));
357
358	cpus_and(tmp, cpumask, cpu_online_map);
359	BUG_ON(!cpus_equal(cpumask, tmp));
360	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
361	BUG_ON(!mm);
362
363	/*
364	* i'm not happy about this global shared spinlock in the
365	* MM hot path, but we'll see how contended it is.
366	* Temporarily this turns IRQs off, so that lockups are
367	* detected by the NMI watchdog.
368	*/
369	spin_lock(&tlbstate_lock);
370
371	flush_mm = mm;
372	flush_va = va;
373	#if NR_CPUS <= BITS_PER_LONG
374	atomic_set_mask(cpumask, &flush_cpumask);
375	#else
376	{
377	int k;
378	unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
379	unsigned long *cpu_mask = (unsigned long *)&cpumask;
380	for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
381	atomic_set_mask(cpu_mask[k], &flush_mask[k]);
382	}
383	#endif
384	/*
385	* We have to send the IPI only to
386	* CPUs affected.
387	*/
388	send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
389
390	while (!cpus_empty(flush_cpumask))
391	/* nothing. lockup detection does not belong here */
392	mb();
393
394	flush_mm = NULL;
395	flush_va = 0;
396	spin_unlock(&tlbstate_lock);
397	}
398
399	void flush_tlb_current_task(void)
400	{
401	struct mm_struct *mm = current->mm;
402	cpumask_t cpu_mask;
403
404	preempt_disable();
405	cpu_mask = mm->cpu_vm_mask;
406	cpu_clear(smp_processor_id(), cpu_mask);
407
408	local_flush_tlb();
409	if (!cpus_empty(cpu_mask))
410	flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
411	preempt_enable();
412	}
413
414	void flush_tlb_mm (struct mm_struct * mm)
415	{
416	cpumask_t cpu_mask;
417
418	preempt_disable();
419	cpu_mask = mm->cpu_vm_mask;
420	cpu_clear(smp_processor_id(), cpu_mask);
421
422	if (current->active_mm == mm) {
423	if (current->mm)
424	local_flush_tlb();
425	else
426	leave_mm(smp_processor_id());
427	}
428	if (!cpus_empty(cpu_mask))
429	flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
430
431	preempt_enable();
432	}
433
434	void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
435	{
436	struct mm_struct *mm = vma->vm_mm;
437	cpumask_t cpu_mask;
438
439	preempt_disable();
440	cpu_mask = mm->cpu_vm_mask;
441	cpu_clear(smp_processor_id(), cpu_mask);
442
443	if (current->active_mm == mm) {
444	if(current->mm)
445	__flush_tlb_one(va);
446	else
447	leave_mm(smp_processor_id());
448	}
449
450	if (!cpus_empty(cpu_mask))
451	flush_tlb_others(cpu_mask, mm, va);
452
453	preempt_enable();
454	}
455
456	static void do_flush_tlb_all(void* info)
457	{
458	unsigned long cpu = smp_processor_id();
459
460	__flush_tlb_all();
461	if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
462	leave_mm(cpu);
463	}
464
465	void flush_tlb_all(void)
466	{
467	on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
468	}
469
470	/*
471	* this function sends a 'reschedule' IPI to another CPU.
472	* it goes straight through and wastes no time serializing
473	* anything. Worst case is that we lose a reschedule ...
474	*/
475	void smp_send_reschedule(int cpu)
476	{
477	send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
478	}
479
480	/*
481	* Structure and data for smp_call_function(). This is designed to minimise
482	* static memory requirements. It also looks cleaner.
483	*/
484	static DEFINE_SPINLOCK(call_lock);
485
486	struct call_data_struct {
487	void (*func) (void *info);
488	void *info;
489	atomic_t started;
490	atomic_t finished;
491	int wait;
492	};
493
494	static struct call_data_struct * call_data;
495
496	/*
497	* this function sends a 'generic call function' IPI to all other CPUs
498	* in the system.
499	*/
500
501	int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
502	int wait)
503	/*
504	* [SUMMARY] Run a function on all other CPUs.
505	* <func> The function to run. This must be fast and non-blocking.
506	* <info> An arbitrary pointer to pass to the function.
507	* <nonatomic> currently unused.
508	* <wait> If true, wait (atomically) until function has completed on other CPUs.
509	* [RETURNS] 0 on success, else a negative status code. Does not return until
510	* remote CPUs are nearly ready to execute <<func>> or are or have executed.
511	*
512	* You must not call this function with disabled interrupts or from a
513	* hardware interrupt handler or from a bottom half handler.
514	*/
515	{
516	struct call_data_struct data;
517	int cpus = num_online_cpus()-1;
518
519	if (!cpus)
520	return 0;
521
522	/* Can deadlock when called with interrupts disabled */
523	WARN_ON(irqs_disabled());
524
525	data.func = func;
526	data.info = info;
527	atomic_set(&data.started, 0);
528	data.wait = wait;
529	if (wait)
530	atomic_set(&data.finished, 0);
531
532	spin_lock(&call_lock);
533	call_data = &data;
534	mb();
535
536	/* Send a message to all other CPUs and wait for them to respond */
537	send_IPI_allbutself(CALL_FUNCTION_VECTOR);
538
539	/* Wait for response */
540	while (atomic_read(&data.started) != cpus)
541	cpu_relax();
542
543	if (wait)
544	while (atomic_read(&data.finished) != cpus)
545	cpu_relax();
546	spin_unlock(&call_lock);
547
548	return 0;
549	}
550
551	static void stop_this_cpu (void * dummy)
552	{
553	/*
554	* Remove this CPU:
555	*/
556	cpu_clear(smp_processor_id(), cpu_online_map);
557	local_irq_disable();
558	disable_local_APIC();
559	if (cpu_data[smp_processor_id()].hlt_works_ok)
560	for(;;) __asm__("hlt");
561	for (;;);
562	}
563
564	/*
565	* this function calls the 'stop' function on all other CPUs in the system.
566	*/
567
568	void smp_send_stop(void)
569	{
570	smp_call_function(stop_this_cpu, NULL, 1, 0);
571
572	local_irq_disable();
573	disable_local_APIC();
574	local_irq_enable();
575	}
576
577	/*
578	* Reschedule call back. Nothing to do,
579	* all the work is done automatically when
580	* we return from the interrupt.
581	*/
582	fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
583	{
584	ack_APIC_irq();
585	}
586
587	fastcall void smp_call_function_interrupt(struct pt_regs *regs)
588	{
589	void (*func) (void *info) = call_data->func;
590	void *info = call_data->info;
591	int wait = call_data->wait;
592
593	ack_APIC_irq();
594	/*
595	* Notify initiating CPU that I've grabbed the data and am
596	* about to execute the function
597	*/
598	mb();
599	atomic_inc(&call_data->started);
600	/*
601	* At this point the info structure may be out of scope unless wait==1
602	*/
603	irq_enter();
604	(*func)(info);
605	irq_exit();
606
607	if (wait) {
608	mb();
609	atomic_inc(&call_data->finished);
610	}
611	}
612