Commit | Line | Data |
---|---|---|
2e04ef76 RR |
1 | /*P:200 |
2 | * This contains all the /dev/lguest code, whereby the userspace launcher | |
f938d2c8 | 3 | * controls and communicates with the Guest. For example, the first write will |
3c6b5bfa RR |
4 | * tell us the Guest's memory layout, pagetable, entry point and kernel address |
5 | * offset. A read will run the Guest until something happens, such as a signal | |
2e04ef76 RR |
6 | * or the Guest doing a NOTIFY out to the Launcher. |
7 | :*/ | |
d7e28ffe RR |
8 | #include <linux/uaccess.h> |
9 | #include <linux/miscdevice.h> | |
10 | #include <linux/fs.h> | |
ca94f2bd | 11 | #include <linux/sched.h> |
df60aeef RR |
12 | #include <linux/eventfd.h> |
13 | #include <linux/file.h> | |
d7e28ffe RR |
14 | #include "lg.h" |
15 | ||
df60aeef RR |
16 | bool send_notify_to_eventfd(struct lg_cpu *cpu) |
17 | { | |
18 | unsigned int i; | |
19 | struct lg_eventfd_map *map; | |
20 | ||
21 | /* lg->eventfds is RCU-protected */ | |
22 | rcu_read_lock(); | |
23 | map = rcu_dereference(cpu->lg->eventfds); | |
24 | for (i = 0; i < map->num; i++) { | |
25 | if (map->map[i].addr == cpu->pending_notify) { | |
26 | eventfd_signal(map->map[i].event, 1); | |
27 | cpu->pending_notify = 0; | |
28 | break; | |
29 | } | |
30 | } | |
31 | rcu_read_unlock(); | |
32 | return cpu->pending_notify == 0; | |
33 | } | |
34 | ||
35 | static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) | |
36 | { | |
37 | struct lg_eventfd_map *new, *old = lg->eventfds; | |
38 | ||
39 | if (!addr) | |
40 | return -EINVAL; | |
41 | ||
2e04ef76 RR |
42 | /* |
43 | * Replace the old array with the new one, carefully: others can | |
44 | * be accessing it at the same time. | |
45 | */ | |
df60aeef RR |
46 | new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), |
47 | GFP_KERNEL); | |
48 | if (!new) | |
49 | return -ENOMEM; | |
50 | ||
51 | /* First make identical copy. */ | |
52 | memcpy(new->map, old->map, sizeof(old->map[0]) * old->num); | |
53 | new->num = old->num; | |
54 | ||
55 | /* Now append new entry. */ | |
56 | new->map[new->num].addr = addr; | |
13389010 | 57 | new->map[new->num].event = eventfd_ctx_fdget(fd); |
df60aeef | 58 | if (IS_ERR(new->map[new->num].event)) { |
f2945262 | 59 | int err = PTR_ERR(new->map[new->num].event); |
df60aeef | 60 | kfree(new); |
f2945262 | 61 | return err; |
df60aeef RR |
62 | } |
63 | new->num++; | |
64 | ||
65 | /* Now put new one in place. */ | |
66 | rcu_assign_pointer(lg->eventfds, new); | |
67 | ||
2e04ef76 RR |
68 | /* |
69 | * We're not in a big hurry. Wait until noone's looking at old | |
70 | * version, then delete it. | |
71 | */ | |
df60aeef RR |
72 | synchronize_rcu(); |
73 | kfree(old); | |
74 | ||
75 | return 0; | |
76 | } | |
77 | ||
78 | static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) | |
79 | { | |
80 | unsigned long addr, fd; | |
81 | int err; | |
82 | ||
83 | if (get_user(addr, input) != 0) | |
84 | return -EFAULT; | |
85 | input++; | |
86 | if (get_user(fd, input) != 0) | |
87 | return -EFAULT; | |
88 | ||
89 | mutex_lock(&lguest_lock); | |
90 | err = add_eventfd(lg, addr, fd); | |
91 | mutex_unlock(&lguest_lock); | |
92 | ||
f2945262 | 93 | return err; |
df60aeef RR |
94 | } |
95 | ||
2e04ef76 RR |
96 | /*L:050 |
97 | * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt | |
98 | * number to /dev/lguest. | |
99 | */ | |
177e449d | 100 | static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) |
d7e28ffe | 101 | { |
511801dc | 102 | unsigned long irq; |
d7e28ffe RR |
103 | |
104 | if (get_user(irq, input) != 0) | |
105 | return -EFAULT; | |
106 | if (irq >= LGUEST_IRQS) | |
107 | return -EINVAL; | |
9f155a9b RR |
108 | |
109 | set_interrupt(cpu, irq); | |
d7e28ffe RR |
110 | return 0; |
111 | } | |
112 | ||
2e04ef76 RR |
113 | /*L:040 |
114 | * Once our Guest is initialized, the Launcher makes it run by reading | |
115 | * from /dev/lguest. | |
116 | */ | |
d7e28ffe RR |
117 | static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) |
118 | { | |
119 | struct lguest *lg = file->private_data; | |
d0953d42 GOC |
120 | struct lg_cpu *cpu; |
121 | unsigned int cpu_id = *o; | |
d7e28ffe | 122 | |
dde79789 | 123 | /* You must write LHREQ_INITIALIZE first! */ |
d7e28ffe RR |
124 | if (!lg) |
125 | return -EINVAL; | |
126 | ||
d0953d42 GOC |
127 | /* Watch out for arbitrary vcpu indexes! */ |
128 | if (cpu_id >= lg->nr_cpus) | |
129 | return -EINVAL; | |
130 | ||
131 | cpu = &lg->cpus[cpu_id]; | |
132 | ||
e1e72965 | 133 | /* If you're not the task which owns the Guest, go away. */ |
66686c2a | 134 | if (current != cpu->tsk) |
d7e28ffe RR |
135 | return -EPERM; |
136 | ||
a6bd8e13 | 137 | /* If the Guest is already dead, we indicate why */ |
d7e28ffe RR |
138 | if (lg->dead) { |
139 | size_t len; | |
140 | ||
dde79789 | 141 | /* lg->dead either contains an error code, or a string. */ |
d7e28ffe RR |
142 | if (IS_ERR(lg->dead)) |
143 | return PTR_ERR(lg->dead); | |
144 | ||
dde79789 | 145 | /* We can only return as much as the buffer they read with. */ |
d7e28ffe RR |
146 | len = min(size, strlen(lg->dead)+1); |
147 | if (copy_to_user(user, lg->dead, len) != 0) | |
148 | return -EFAULT; | |
149 | return len; | |
150 | } | |
151 | ||
2e04ef76 RR |
152 | /* |
153 | * If we returned from read() last time because the Guest sent I/O, | |
154 | * clear the flag. | |
155 | */ | |
5e232f4f GOC |
156 | if (cpu->pending_notify) |
157 | cpu->pending_notify = 0; | |
d7e28ffe | 158 | |
dde79789 | 159 | /* Run the Guest until something interesting happens. */ |
d0953d42 | 160 | return run_guest(cpu, (unsigned long __user *)user); |
d7e28ffe RR |
161 | } |
162 | ||
2e04ef76 RR |
163 | /*L:025 |
164 | * This actually initializes a CPU. For the moment, a Guest is only | |
165 | * uniprocessor, so "id" is always 0. | |
166 | */ | |
4dcc53da GOC |
167 | static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) |
168 | { | |
a6bd8e13 | 169 | /* We have a limited number the number of CPUs in the lguest struct. */ |
24adf127 | 170 | if (id >= ARRAY_SIZE(cpu->lg->cpus)) |
4dcc53da GOC |
171 | return -EINVAL; |
172 | ||
a6bd8e13 | 173 | /* Set up this CPU's id, and pointer back to the lguest struct. */ |
4dcc53da GOC |
174 | cpu->id = id; |
175 | cpu->lg = container_of((cpu - id), struct lguest, cpus[0]); | |
176 | cpu->lg->nr_cpus++; | |
a6bd8e13 RR |
177 | |
178 | /* Each CPU has a timer it can set. */ | |
ad8d8f3b | 179 | init_clockdev(cpu); |
4dcc53da | 180 | |
2e04ef76 RR |
181 | /* |
182 | * We need a complete page for the Guest registers: they are accessible | |
183 | * to the Guest and we can only grant it access to whole pages. | |
184 | */ | |
a53a35a8 GOC |
185 | cpu->regs_page = get_zeroed_page(GFP_KERNEL); |
186 | if (!cpu->regs_page) | |
187 | return -ENOMEM; | |
188 | ||
189 | /* We actually put the registers at the bottom of the page. */ | |
190 | cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); | |
191 | ||
2e04ef76 RR |
192 | /* |
193 | * Now we initialize the Guest's registers, handing it the start | |
194 | * address. | |
195 | */ | |
a53a35a8 GOC |
196 | lguest_arch_setup_regs(cpu, start_ip); |
197 | ||
2e04ef76 RR |
198 | /* |
199 | * We keep a pointer to the Launcher task (ie. current task) for when | |
200 | * other Guests want to wake this one (eg. console input). | |
201 | */ | |
66686c2a GOC |
202 | cpu->tsk = current; |
203 | ||
2e04ef76 RR |
204 | /* |
205 | * We need to keep a pointer to the Launcher's memory map, because if | |
66686c2a | 206 | * the Launcher dies we need to clean it up. If we don't keep a |
2e04ef76 RR |
207 | * reference, it is destroyed before close() is called. |
208 | */ | |
66686c2a GOC |
209 | cpu->mm = get_task_mm(cpu->tsk); |
210 | ||
2e04ef76 RR |
211 | /* |
212 | * We remember which CPU's pages this Guest used last, for optimization | |
213 | * when the same Guest runs on the same CPU twice. | |
214 | */ | |
f34f8c5f GOC |
215 | cpu->last_pages = NULL; |
216 | ||
a6bd8e13 | 217 | /* No error == success. */ |
4dcc53da GOC |
218 | return 0; |
219 | } | |
220 | ||
2e04ef76 RR |
221 | /*L:020 |
222 | * The initialization write supplies 3 pointer sized (32 or 64 bit) values (in | |
223 | * addition to the LHREQ_INITIALIZE value). These are: | |
dde79789 | 224 | * |
3c6b5bfa RR |
225 | * base: The start of the Guest-physical memory inside the Launcher memory. |
226 | * | |
dde79789 | 227 | * pfnlimit: The highest (Guest-physical) page number the Guest should be |
e1e72965 RR |
228 | * allowed to access. The Guest memory lives inside the Launcher, so it sets |
229 | * this to ensure the Guest can only reach its own memory. | |
dde79789 | 230 | * |
dde79789 | 231 | * start: The first instruction to execute ("eip" in x86-speak). |
dde79789 | 232 | */ |
511801dc | 233 | static int initialize(struct file *file, const unsigned long __user *input) |
d7e28ffe | 234 | { |
2e04ef76 | 235 | /* "struct lguest" contains all we (the Host) know about a Guest. */ |
d7e28ffe | 236 | struct lguest *lg; |
48245cc0 | 237 | int err; |
58a24566 | 238 | unsigned long args[3]; |
d7e28ffe | 239 | |
2e04ef76 RR |
240 | /* |
241 | * We grab the Big Lguest lock, which protects against multiple | |
242 | * simultaneous initializations. | |
243 | */ | |
d7e28ffe | 244 | mutex_lock(&lguest_lock); |
dde79789 | 245 | /* You can't initialize twice! Close the device and start again... */ |
d7e28ffe RR |
246 | if (file->private_data) { |
247 | err = -EBUSY; | |
248 | goto unlock; | |
249 | } | |
250 | ||
251 | if (copy_from_user(args, input, sizeof(args)) != 0) { | |
252 | err = -EFAULT; | |
253 | goto unlock; | |
254 | } | |
255 | ||
48245cc0 RR |
256 | lg = kzalloc(sizeof(*lg), GFP_KERNEL); |
257 | if (!lg) { | |
258 | err = -ENOMEM; | |
d7e28ffe RR |
259 | goto unlock; |
260 | } | |
dde79789 | 261 | |
df60aeef RR |
262 | lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL); |
263 | if (!lg->eventfds) { | |
264 | err = -ENOMEM; | |
265 | goto free_lg; | |
266 | } | |
267 | lg->eventfds->num = 0; | |
268 | ||
dde79789 | 269 | /* Populate the easy fields of our "struct lguest" */ |
74dbf719 | 270 | lg->mem_base = (void __user *)args[0]; |
3c6b5bfa | 271 | lg->pfn_limit = args[1]; |
dde79789 | 272 | |
58a24566 MZ |
273 | /* This is the first cpu (cpu 0) and it will start booting at args[2] */ |
274 | err = lg_cpu_start(&lg->cpus[0], 0, args[2]); | |
4dcc53da | 275 | if (err) |
df60aeef | 276 | goto free_eventfds; |
4dcc53da | 277 | |
2e04ef76 RR |
278 | /* |
279 | * Initialize the Guest's shadow page tables, using the toplevel | |
280 | * address the Launcher gave us. This allocates memory, so can fail. | |
281 | */ | |
58a24566 | 282 | err = init_guest_pagetable(lg); |
d7e28ffe RR |
283 | if (err) |
284 | goto free_regs; | |
285 | ||
dde79789 | 286 | /* We keep our "struct lguest" in the file's private_data. */ |
d7e28ffe RR |
287 | file->private_data = lg; |
288 | ||
289 | mutex_unlock(&lguest_lock); | |
290 | ||
dde79789 | 291 | /* And because this is a write() call, we return the length used. */ |
d7e28ffe RR |
292 | return sizeof(args); |
293 | ||
294 | free_regs: | |
a53a35a8 GOC |
295 | /* FIXME: This should be in free_vcpu */ |
296 | free_page(lg->cpus[0].regs_page); | |
df60aeef RR |
297 | free_eventfds: |
298 | kfree(lg->eventfds); | |
299 | free_lg: | |
43054412 | 300 | kfree(lg); |
d7e28ffe RR |
301 | unlock: |
302 | mutex_unlock(&lguest_lock); | |
303 | return err; | |
304 | } | |
305 | ||
2e04ef76 RR |
306 | /*L:010 |
307 | * The first operation the Launcher does must be a write. All writes | |
e1e72965 | 308 | * start with an unsigned long number: for the first write this must be |
dde79789 | 309 | * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use |
a6bd8e13 RR |
310 | * writes of other values to send interrupts. |
311 | * | |
312 | * Note that we overload the "offset" in the /dev/lguest file to indicate what | |
313 | * CPU number we're dealing with. Currently this is always 0, since we only | |
314 | * support uniprocessor Guests, but you can see the beginnings of SMP support | |
2e04ef76 RR |
315 | * here. |
316 | */ | |
511801dc | 317 | static ssize_t write(struct file *file, const char __user *in, |
d7e28ffe RR |
318 | size_t size, loff_t *off) |
319 | { | |
2e04ef76 RR |
320 | /* |
321 | * Once the Guest is initialized, we hold the "struct lguest" in the | |
322 | * file private data. | |
323 | */ | |
d7e28ffe | 324 | struct lguest *lg = file->private_data; |
511801dc JS |
325 | const unsigned long __user *input = (const unsigned long __user *)in; |
326 | unsigned long req; | |
177e449d | 327 | struct lg_cpu *uninitialized_var(cpu); |
7ea07a15 | 328 | unsigned int cpu_id = *off; |
d7e28ffe | 329 | |
a6bd8e13 | 330 | /* The first value tells us what this request is. */ |
d7e28ffe RR |
331 | if (get_user(req, input) != 0) |
332 | return -EFAULT; | |
511801dc | 333 | input++; |
d7e28ffe | 334 | |
dde79789 | 335 | /* If you haven't initialized, you must do that first. */ |
7ea07a15 GOC |
336 | if (req != LHREQ_INITIALIZE) { |
337 | if (!lg || (cpu_id >= lg->nr_cpus)) | |
338 | return -EINVAL; | |
339 | cpu = &lg->cpus[cpu_id]; | |
dde79789 | 340 | |
f73d1e6c ET |
341 | /* Once the Guest is dead, you can only read() why it died. */ |
342 | if (lg->dead) | |
343 | return -ENOENT; | |
f73d1e6c | 344 | } |
d7e28ffe RR |
345 | |
346 | switch (req) { | |
347 | case LHREQ_INITIALIZE: | |
511801dc | 348 | return initialize(file, input); |
d7e28ffe | 349 | case LHREQ_IRQ: |
177e449d | 350 | return user_send_irq(cpu, input); |
df60aeef RR |
351 | case LHREQ_EVENTFD: |
352 | return attach_eventfd(lg, input); | |
d7e28ffe RR |
353 | default: |
354 | return -EINVAL; | |
355 | } | |
356 | } | |
357 | ||
2e04ef76 RR |
358 | /*L:060 |
359 | * The final piece of interface code is the close() routine. It reverses | |
dde79789 RR |
360 | * everything done in initialize(). This is usually called because the |
361 | * Launcher exited. | |
362 | * | |
363 | * Note that the close routine returns 0 or a negative error number: it can't | |
364 | * really fail, but it can whine. I blame Sun for this wart, and K&R C for | |
2e04ef76 RR |
365 | * letting them do it. |
366 | :*/ | |
d7e28ffe RR |
367 | static int close(struct inode *inode, struct file *file) |
368 | { | |
369 | struct lguest *lg = file->private_data; | |
ad8d8f3b | 370 | unsigned int i; |
d7e28ffe | 371 | |
dde79789 | 372 | /* If we never successfully initialized, there's nothing to clean up */ |
d7e28ffe RR |
373 | if (!lg) |
374 | return 0; | |
375 | ||
2e04ef76 RR |
376 | /* |
377 | * We need the big lock, to protect from inter-guest I/O and other | |
378 | * Launchers initializing guests. | |
379 | */ | |
d7e28ffe | 380 | mutex_lock(&lguest_lock); |
66686c2a GOC |
381 | |
382 | /* Free up the shadow page tables for the Guest. */ | |
383 | free_guest_pagetable(lg); | |
384 | ||
a53a35a8 | 385 | for (i = 0; i < lg->nr_cpus; i++) { |
ad8d8f3b GOC |
386 | /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ |
387 | hrtimer_cancel(&lg->cpus[i].hrt); | |
a53a35a8 GOC |
388 | /* We can free up the register page we allocated. */ |
389 | free_page(lg->cpus[i].regs_page); | |
2e04ef76 RR |
390 | /* |
391 | * Now all the memory cleanups are done, it's safe to release | |
392 | * the Launcher's memory management structure. | |
393 | */ | |
66686c2a | 394 | mmput(lg->cpus[i].mm); |
a53a35a8 | 395 | } |
df60aeef RR |
396 | |
397 | /* Release any eventfds they registered. */ | |
398 | for (i = 0; i < lg->eventfds->num; i++) | |
13389010 | 399 | eventfd_ctx_put(lg->eventfds->map[i].event); |
df60aeef RR |
400 | kfree(lg->eventfds); |
401 | ||
2e04ef76 RR |
402 | /* |
403 | * If lg->dead doesn't contain an error code it will be NULL or a | |
404 | * kmalloc()ed string, either of which is ok to hand to kfree(). | |
405 | */ | |
d7e28ffe RR |
406 | if (!IS_ERR(lg->dead)) |
407 | kfree(lg->dead); | |
05dfdbbd MW |
408 | /* Free the memory allocated to the lguest_struct */ |
409 | kfree(lg); | |
dde79789 | 410 | /* Release lock and exit. */ |
d7e28ffe | 411 | mutex_unlock(&lguest_lock); |
dde79789 | 412 | |
d7e28ffe RR |
413 | return 0; |
414 | } | |
415 | ||
dde79789 RR |
416 | /*L:000 |
417 | * Welcome to our journey through the Launcher! | |
418 | * | |
419 | * The Launcher is the Host userspace program which sets up, runs and services | |
420 | * the Guest. In fact, many comments in the Drivers which refer to "the Host" | |
421 | * doing things are inaccurate: the Launcher does all the device handling for | |
e1e72965 | 422 | * the Guest, but the Guest can't know that. |
dde79789 RR |
423 | * |
424 | * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we | |
425 | * shall see more of that later. | |
426 | * | |
427 | * We begin our understanding with the Host kernel interface which the Launcher | |
428 | * uses: reading and writing a character device called /dev/lguest. All the | |
2e04ef76 RR |
429 | * work happens in the read(), write() and close() routines: |
430 | */ | |
d7e28ffe RR |
431 | static struct file_operations lguest_fops = { |
432 | .owner = THIS_MODULE, | |
433 | .release = close, | |
434 | .write = write, | |
435 | .read = read, | |
436 | }; | |
dde79789 | 437 | |
2e04ef76 RR |
438 | /* |
439 | * This is a textbook example of a "misc" character device. Populate a "struct | |
440 | * miscdevice" and register it with misc_register(). | |
441 | */ | |
d7e28ffe RR |
442 | static struct miscdevice lguest_dev = { |
443 | .minor = MISC_DYNAMIC_MINOR, | |
444 | .name = "lguest", | |
445 | .fops = &lguest_fops, | |
446 | }; | |
447 | ||
448 | int __init lguest_device_init(void) | |
449 | { | |
450 | return misc_register(&lguest_dev); | |
451 | } | |
452 | ||
453 | void __exit lguest_device_remove(void) | |
454 | { | |
455 | misc_deregister(&lguest_dev); | |
456 | } |