/*
 * Asm versions of Xen pv-ops, suitable for either direct use or
 * inlining. The inline versions are the same as the direct-use
 * versions, with the pre- and post-amble chopped off.
 *
 * This code is encoded for size rather than absolute efficiency, with
 * a view to being able to inline as much as possible.
 *
 * We only bother with direct forms (ie, vcpu in pda) of the
 * operations here; the indirect forms are better handled in C, since
 * they're generally too large to inline anyway.
 */

#include <asm/thread_info.h>
#include <asm/processor-flags.h>
#include <asm/segment.h>
#include <asm/asm.h>

#include <xen/interface/xen.h>

#include "xen-asm.h"

/*
 * Force an event check by making a hypercall, but preserve regs
 * before making the call.
 */
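/*
 * %eax, %ecx and %edx are the registers the 32-bit C calling
 * convention lets the callee clobber, so they are the only ones that
 * need saving around the call; callers of check_events expect every
 * register to survive.
 */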
check_events:
	push %eax
	push %ecx
	push %edx
	call xen_force_evtchn_callback
	pop %edx
	pop %ecx
	pop %eax
	ret

/*
 * We can't use sysexit directly, because we're not running in ring0.
 * But we can easily fake it up using iret. Assuming xen_sysexit is
 * jumped to with a standard stack frame, we can just strip it back to
 * a standard iret frame and use iret.
 */
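/*
 * On 32-bit the tail of struct pt_regs is eip, cs, eflags (plus
 * esp/ss for a return to usermode), so pointing %esp at PT_EIP leaves
 * exactly the frame that iret expects to pop.
 */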
ENTRY(xen_sysexit)
	movl PT_EAX(%esp), %eax		/* Shouldn't be necessary? */
	orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
	lea PT_EIP(%esp), %esp

	jmp xen_iret
ENDPROC(xen_sysexit)

/*
 * This is run where a normal iret would be run, with the same stack setup:
 *	8: eflags
 *	4: cs
 *	esp-> 0: eip
 *
 * This attempts to make sure that any pending events are dealt with
 * on return to usermode, but there is a small window in which an
 * event can happen just before entering usermode. If the nested
 * interrupt ends up setting one of the TIF_WORK_MASK pending work
 * flags, they will not be tested again before returning to
 * usermode. This means that a process can end up with pending work,
 * which will be unprocessed until the process enters and leaves the
 * kernel again, which could be an unbounded amount of time. This
 * means that a pending signal or reschedule event could be
 * indefinitely delayed.
 *
 * The fix is to notice a nested interrupt in the critical window, and
 * if one occurs, then fold the nested interrupt into the current
 * interrupt stack frame, and re-process it iteratively rather than
 * recursively. This means that it will exit via the normal path, and
 * all pending work will be dealt with appropriately.
 *
 * Because the nested interrupt handler needs to deal with the current
 * stack state in whatever form it's in, we keep things simple by only
 * using a single register which is pushed/popped on the stack.
 */

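/*
 * If the popw below faults because the saved %fs selector is no
 * longer valid, the fixup overwrites the word on the stack with a
 * null selector and retries, so the pop always succeeds.
 */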
.macro POP_FS
1:
	popw %fs
.pushsection .fixup, "ax"
2:	movw $0, (%esp)
	jmp 1b
.popsection
	_ASM_EXTABLE(1b,2b)
.endm

ENTRY(xen_iret)
	/* test eflags for special cases */
	testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
	jnz hyper_iret

	push %eax
	ESP_OFFSET=4	# bytes pushed onto stack

	/* Store vcpu_info pointer for easy access */
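	/*
	 * On SMP, xen_vcpu is a per-cpu variable, so %fs is briefly
	 * pointed at the kernel per-cpu segment to reach it; on UP it
	 * is a plain variable and a single %ss-relative load is enough.
	 */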
#ifdef CONFIG_SMP
	pushw %fs
	movl $(__KERNEL_PERCPU), %eax
	movl %eax, %fs
	movl %fs:xen_vcpu, %eax
	POP_FS
#else
	movl %ss:xen_vcpu, %eax
#endif

	/* check IF state we're restoring */
	testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)

	/*
	 * Maybe enable events. Once this happens we could get a
	 * recursive event, so the critical region starts immediately
	 * afterwards. However, if that happens we don't end up
	 * resuming the code, so we don't have to be worried about
	 * being preempted to another CPU.
	 */
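	/*
	 * ZF here comes from the testb above: if the frame being
	 * restored had IF set the result was non-zero (ZF clear) and
	 * setz writes 0, unmasking events; if IF was clear, setz
	 * writes 1 and events stay masked.
	 */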
	setz %ss:XEN_vcpu_info_mask(%eax)
xen_iret_start_crit:

	/* check for unmasked and pending */
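	/*
	 * evtchn_upcall_pending and evtchn_upcall_mask are adjacent
	 * bytes in struct vcpu_info, so one little-endian word compare
	 * against 0x0001 tests "pending set and mask clear" at once.
	 */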
	cmpw $0x0001, %ss:XEN_vcpu_info_pending(%eax)

	/*
	 * If there's something pending, mask events again so we can
	 * jump back into xen_hypervisor_callback. Otherwise do not
	 * touch XEN_vcpu_info_mask.
	 */
	jne 1f
	movb $1, %ss:XEN_vcpu_info_mask(%eax)

1:	popl %eax

	/*
	 * From this point on the registers are restored and the stack
	 * updated, so we don't need to worry about it if we're
	 * preempted
	 */
iret_restore_end:

	/*
	 * Jump to hypervisor_callback after fixing up the stack.
	 * Events are masked, so jumping out of the critical region is
	 * OK.
	 */
	je xen_hypervisor_callback

1:	iret
xen_iret_end_crit:
	_ASM_EXTABLE(1b, iret_exc)

hyper_iret:
	/* put this out of line since it's very rarely used */
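	/*
	 * Each hypercall has a 32-byte stub in the hypercall page;
	 * this jumps directly into the stub for HYPERVISOR_iret.
	 */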
	jmp hypercall_page + __HYPERVISOR_iret * 32

.globl xen_iret_start_crit, xen_iret_end_crit

/*
 * This is called by xen_hypervisor_callback in entry.S when it sees
 * that the EIP at the time of interrupt was between
 * xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in
 * %eax so we can do a more refined determination of what to do.
 *
 * The stack format at this point is:
 *	----------------
 *	ss		: (ss/esp may be present if we came from usermode)
 *	esp		:
 *	eflags		}  outer exception info
 *	cs		}
 *	eip		}
 *	---------------- <- edi (copy dest)
 *	eax		:  outer eax if it hasn't been restored
 *	----------------
 *	eflags		}  nested exception info
 *	cs		}   (no ss/esp because we're nested
 *	eip		}    from the same ring)
 *	orig_eax	}<- esi (copy src)
 *	- - - - - - - -
 *	fs		}
 *	es		}
 *	ds		}  SAVE_ALL state
 *	eax		}
 *	 :		:
 *	ebx		}<- esp
 *	----------------
 *
 * In order to deliver the nested exception properly, we need to shift
 * everything from the return addr up to the error code so it sits
 * just under the outer exception info. This means that when we
 * handle the exception, we do it in the context of the outer
 * exception rather than starting a new one.
 *
 * The only caveat is that if the outer eax hasn't been restored yet
 * (ie, it's still on stack), we need to insert its value into the
 * SAVE_ALL state before going on, since it's usermode state which we
 * eventually need to restore.
 */
ENTRY(xen_iret_crit_fixup)
	/*
	 * Paranoia: Make sure we're really coming from kernel space.
	 * One could imagine a case where userspace jumps into the
	 * critical range address, but just before the CPU delivers a
	 * GP, it decides to deliver an interrupt instead. Unlikely?
	 * Definitely. Easy to avoid? Yes. The Intel documents
	 * explicitly say that the reported EIP for a bad jump is the
	 * jump instruction itself, not the destination, but some
	 * virtual environments get this wrong.
	 */
	movl PT_CS(%esp), %ecx
	andl $SEGMENT_RPL_MASK, %ecx
	cmpl $USER_RPL, %ecx
	je 2f

	lea PT_ORIG_EAX(%esp), %esi
	lea PT_EFLAGS(%esp), %edi

	/*
	 * If eip is before iret_restore_end then stack
	 * hasn't been restored yet.
	 */
	cmp $iret_restore_end, %eax
	jae 1f

	movl 0+4(%edi), %eax		/* copy EAX (just above top of frame) */
	movl %eax, PT_EAX(%esp)

	lea ESP_OFFSET(%edi), %edi	/* move dest up over saved regs */

	/* set up the copy */
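	/*
	 * The destination overlaps and lies above the source, so the
	 * copy has to run downwards (std) to avoid overwriting source
	 * words before they have been read.
	 */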
1:	std
	mov $PT_EIP / 4, %ecx		/* saved regs up to orig_eax */
	rep movsl
	cld

	lea 4(%edi), %esp		/* point esp to new frame */
2:	jmp xen_do_upcall