Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* |
2 | * linux/fs/file.c | |
3 | * | |
4 | * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes | |
5 | * | |
6 | * Manage the dynamic fd arrays in the process files_struct. | |
7 | */ | |
8 | ||
9 | #include <linux/fs.h> | |
10 | #include <linux/mm.h> | |
11 | #include <linux/time.h> | |
12 | #include <linux/slab.h> | |
13 | #include <linux/vmalloc.h> | |
14 | #include <linux/file.h> | |
15 | #include <linux/bitops.h> | |
ab2af1f5 DS |
16 | #include <linux/interrupt.h> |
17 | #include <linux/spinlock.h> | |
18 | #include <linux/rcupdate.h> | |
19 | #include <linux/workqueue.h> | |
20 | ||
/*
 * Per-cpu state used to defer freeing of fdtables whose sets/arrays
 * may have been vmalloc()ed (vfree() must not run from the RCU
 * callback's softirq context, so the work is pushed to a workqueue).
 */
struct fdtable_defer {
	spinlock_t lock;		/* protects @next */
	struct work_struct wq;		/* runs free_fdtable_work() */
	struct timer_list timer;	/* retries scheduling @wq when busy */
	struct fdtable *next;		/* singly-linked list of fdtables to free */
};
27 | ||
/*
 * We use this list to defer freeing fdtables that have vmalloced
 * sets/arrays. By keeping a per-cpu list, we avoid having to embed
 * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in
 * this per-task structure.
 */
static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
1da177e4 LT |
35 | |
36 | ||
37 | /* | |
38 | * Allocate an fd array, using kmalloc or vmalloc. | |
39 | * Note: the array isn't cleared at allocation time. | |
40 | */ | |
41 | struct file ** alloc_fd_array(int num) | |
42 | { | |
43 | struct file **new_fds; | |
44 | int size = num * sizeof(struct file *); | |
45 | ||
46 | if (size <= PAGE_SIZE) | |
47 | new_fds = (struct file **) kmalloc(size, GFP_KERNEL); | |
48 | else | |
49 | new_fds = (struct file **) vmalloc(size); | |
50 | return new_fds; | |
51 | } | |
52 | ||
53 | void free_fd_array(struct file **array, int num) | |
54 | { | |
55 | int size = num * sizeof(struct file *); | |
56 | ||
57 | if (!array) { | |
58 | printk (KERN_ERR "free_fd_array: array = 0 (num = %d)\n", num); | |
59 | return; | |
60 | } | |
61 | ||
62 | if (num <= NR_OPEN_DEFAULT) /* Don't free the embedded fd array! */ | |
63 | return; | |
64 | else if (size <= PAGE_SIZE) | |
65 | kfree(array); | |
66 | else | |
67 | vfree(array); | |
68 | } | |
69 | ||
ab2af1f5 | 70 | static void __free_fdtable(struct fdtable *fdt) |
1da177e4 | 71 | { |
ab2af1f5 | 72 | int fdset_size, fdarray_size; |
1da177e4 | 73 | |
ab2af1f5 DS |
74 | fdset_size = fdt->max_fdset / 8; |
75 | fdarray_size = fdt->max_fds * sizeof(struct file *); | |
76 | free_fdset(fdt->open_fds, fdset_size); | |
77 | free_fdset(fdt->close_on_exec, fdset_size); | |
78 | free_fd_array(fdt->fd, fdarray_size); | |
79 | kfree(fdt); | |
80 | } | |
1da177e4 | 81 | |
ab2af1f5 DS |
82 | static void fdtable_timer(unsigned long data) |
83 | { | |
84 | struct fdtable_defer *fddef = (struct fdtable_defer *)data; | |
1da177e4 | 85 | |
ab2af1f5 DS |
86 | spin_lock(&fddef->lock); |
87 | /* | |
88 | * If someone already emptied the queue return. | |
1da177e4 | 89 | */ |
ab2af1f5 DS |
90 | if (!fddef->next) |
91 | goto out; | |
92 | if (!schedule_work(&fddef->wq)) | |
93 | mod_timer(&fddef->timer, 5); | |
94 | out: | |
95 | spin_unlock(&fddef->lock); | |
96 | } | |
1da177e4 | 97 | |
ab2af1f5 DS |
98 | static void free_fdtable_work(struct fdtable_defer *f) |
99 | { | |
100 | struct fdtable *fdt; | |
1da177e4 | 101 | |
ab2af1f5 DS |
102 | spin_lock_bh(&f->lock); |
103 | fdt = f->next; | |
104 | f->next = NULL; | |
105 | spin_unlock_bh(&f->lock); | |
106 | while(fdt) { | |
107 | struct fdtable *next = fdt->next; | |
108 | __free_fdtable(fdt); | |
109 | fdt = next; | |
110 | } | |
111 | } | |
1da177e4 | 112 | |
ab2af1f5 DS |
113 | static void free_fdtable_rcu(struct rcu_head *rcu) |
114 | { | |
115 | struct fdtable *fdt = container_of(rcu, struct fdtable, rcu); | |
116 | int fdset_size, fdarray_size; | |
117 | struct fdtable_defer *fddef; | |
1da177e4 | 118 | |
ab2af1f5 DS |
119 | BUG_ON(!fdt); |
120 | fdset_size = fdt->max_fdset / 8; | |
121 | fdarray_size = fdt->max_fds * sizeof(struct file *); | |
122 | ||
123 | if (fdt->free_files) { | |
124 | /* | |
125 | * The this fdtable was embedded in the files structure | |
126 | * and the files structure itself was getting destroyed. | |
127 | * It is now safe to free the files structure. | |
128 | */ | |
129 | kmem_cache_free(files_cachep, fdt->free_files); | |
130 | return; | |
131 | } | |
132 | if (fdt->max_fdset <= __FD_SETSIZE && fdt->max_fds <= NR_OPEN_DEFAULT) { | |
133 | /* | |
134 | * The fdtable was embedded | |
135 | */ | |
136 | return; | |
137 | } | |
138 | if (fdset_size <= PAGE_SIZE && fdarray_size <= PAGE_SIZE) { | |
139 | kfree(fdt->open_fds); | |
140 | kfree(fdt->close_on_exec); | |
141 | kfree(fdt->fd); | |
142 | kfree(fdt); | |
1da177e4 | 143 | } else { |
ab2af1f5 DS |
144 | fddef = &get_cpu_var(fdtable_defer_list); |
145 | spin_lock(&fddef->lock); | |
146 | fdt->next = fddef->next; | |
147 | fddef->next = fdt; | |
148 | /* | |
149 | * vmallocs are handled from the workqueue context. | |
150 | * If the per-cpu workqueue is running, then we | |
151 | * defer work scheduling through a timer. | |
152 | */ | |
153 | if (!schedule_work(&fddef->wq)) | |
154 | mod_timer(&fddef->timer, 5); | |
155 | spin_unlock(&fddef->lock); | |
156 | put_cpu_var(fdtable_defer_list); | |
1da177e4 | 157 | } |
ab2af1f5 DS |
158 | } |
159 | ||
160 | void free_fdtable(struct fdtable *fdt) | |
161 | { | |
162 | if (fdt->free_files || fdt->max_fdset > __FD_SETSIZE || | |
163 | fdt->max_fds > NR_OPEN_DEFAULT) | |
164 | call_rcu(&fdt->rcu, free_fdtable_rcu); | |
165 | } | |
166 | ||
/*
 * Expand the fdset in the files_struct. Called with the files spinlock
 * held for write.
 *
 * Copies the open/close-on-exec bitmaps and the fd array from @fdt
 * into the strictly-larger @nfdt, zeroing the newly added tail of
 * each destination area.
 */
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt)
{
	int i;
	int count;

	BUG_ON(nfdt->max_fdset < fdt->max_fdset);
	BUG_ON(nfdt->max_fds < fdt->max_fds);
	/* Copy the existing tables and install the new pointers */

	/* i = old fdset length in longs; count = new tail length in bytes */
	i = fdt->max_fdset / (sizeof(unsigned long) * 8);
	count = (nfdt->max_fdset - fdt->max_fdset) / 8;

	/*
	 * Don't copy the entire array if the current fdset is
	 * not yet initialised.
	 */
	if (i) {
		/* max_fdset is a bit count, so /8 converts to bytes */
		memcpy (nfdt->open_fds, fdt->open_fds,
						fdt->max_fdset/8);
		memcpy (nfdt->close_on_exec, fdt->close_on_exec,
						fdt->max_fdset/8);
		/* &fds_bits[i] is the first long past the old bitmap */
		memset (&nfdt->open_fds->fds_bits[i], 0, count);
		memset (&nfdt->close_on_exec->fds_bits[i], 0, count);
	}

	/* Don't copy/clear the array if we are creating a new
	   fd array for fork() */
	if (fdt->max_fds) {
		memcpy(nfdt->fd, fdt->fd,
			fdt->max_fds * sizeof(struct file *));
		/* clear the remainder of the array */
		memset(&nfdt->fd[fdt->max_fds], 0,
			(nfdt->max_fds - fdt->max_fds) *
					sizeof(struct file *));
	}
	/* Preserve the next-fd search hint across the resize. */
	nfdt->next_fd = fdt->next_fd;
}
208 | ||
209 | /* | |
210 | * Allocate an fdset array, using kmalloc or vmalloc. | |
211 | * Note: the array isn't cleared at allocation time. | |
212 | */ | |
213 | fd_set * alloc_fdset(int num) | |
214 | { | |
215 | fd_set *new_fdset; | |
216 | int size = num / 8; | |
217 | ||
218 | if (size <= PAGE_SIZE) | |
219 | new_fdset = (fd_set *) kmalloc(size, GFP_KERNEL); | |
220 | else | |
221 | new_fdset = (fd_set *) vmalloc(size); | |
222 | return new_fdset; | |
223 | } | |
224 | ||
225 | void free_fdset(fd_set *array, int num) | |
226 | { | |
227 | int size = num / 8; | |
228 | ||
229 | if (num <= __FD_SETSIZE) /* Don't free an embedded fdset */ | |
230 | return; | |
231 | else if (size <= PAGE_SIZE) | |
232 | kfree(array); | |
233 | else | |
234 | vfree(array); | |
235 | } | |
236 | ||
ab2af1f5 | 237 | static struct fdtable *alloc_fdtable(int nr) |
1da177e4 | 238 | { |
ab2af1f5 DS |
239 | struct fdtable *fdt = NULL; |
240 | int nfds = 0; | |
241 | fd_set *new_openset = NULL, *new_execset = NULL; | |
242 | struct file **new_fds; | |
1da177e4 | 243 | |
ab2af1f5 DS |
244 | fdt = kmalloc(sizeof(*fdt), GFP_KERNEL); |
245 | if (!fdt) | |
246 | goto out; | |
247 | memset(fdt, 0, sizeof(*fdt)); | |
1da177e4 | 248 | |
ab2af1f5 DS |
249 | nfds = __FD_SETSIZE; |
250 | /* Expand to the max in easy steps */ | |
251 | do { | |
1da177e4 LT |
252 | if (nfds < (PAGE_SIZE * 8)) |
253 | nfds = PAGE_SIZE * 8; | |
254 | else { | |
255 | nfds = nfds * 2; | |
256 | if (nfds > NR_OPEN) | |
257 | nfds = NR_OPEN; | |
258 | } | |
259 | } while (nfds <= nr); | |
260 | ||
ab2af1f5 DS |
261 | new_openset = alloc_fdset(nfds); |
262 | new_execset = alloc_fdset(nfds); | |
263 | if (!new_openset || !new_execset) | |
264 | goto out; | |
265 | fdt->open_fds = new_openset; | |
266 | fdt->close_on_exec = new_execset; | |
267 | fdt->max_fdset = nfds; | |
268 | ||
269 | nfds = NR_OPEN_DEFAULT; | |
270 | /* | |
271 | * Expand to the max in easy steps, and keep expanding it until | |
272 | * we have enough for the requested fd array size. | |
273 | */ | |
274 | do { | |
275 | #if NR_OPEN_DEFAULT < 256 | |
276 | if (nfds < 256) | |
277 | nfds = 256; | |
278 | else | |
279 | #endif | |
280 | if (nfds < (PAGE_SIZE / sizeof(struct file *))) | |
281 | nfds = PAGE_SIZE / sizeof(struct file *); | |
282 | else { | |
283 | nfds = nfds * 2; | |
284 | if (nfds > NR_OPEN) | |
285 | nfds = NR_OPEN; | |
286 | } | |
287 | } while (nfds <= nr); | |
288 | new_fds = alloc_fd_array(nfds); | |
289 | if (!new_fds) | |
1da177e4 | 290 | goto out; |
ab2af1f5 DS |
291 | fdt->fd = new_fds; |
292 | fdt->max_fds = nfds; | |
293 | fdt->free_files = NULL; | |
294 | return fdt; | |
295 | out: | |
296 | if (new_openset) | |
297 | free_fdset(new_openset, nfds); | |
298 | if (new_execset) | |
299 | free_fdset(new_execset, nfds); | |
300 | kfree(fdt); | |
301 | return NULL; | |
302 | } | |
1da177e4 | 303 | |
ab2af1f5 DS |
/*
 * Expands the file descriptor table - it will allocate a new fdtable and
 * both fd array and fdset. It is expected to be called with the
 * files_lock held.
 *
 * Returns 0 on success, -ENOMEM if the new table could not be
 * allocated.  The lock is dropped while allocating (which may sleep)
 * and re-taken before returning.
 */
static int expand_fdtable(struct files_struct *files, int nr)
	__releases(files->file_lock)
	__acquires(files->file_lock)
{
	int error = 0;
	struct fdtable *fdt;
	struct fdtable *nfdt = NULL;

	/* alloc_fdtable() may sleep; drop the spinlock around it. */
	spin_unlock(&files->file_lock);
	nfdt = alloc_fdtable(nr);
	if (!nfdt) {
		error = -ENOMEM;
		spin_lock(&files->file_lock);
		goto out;
	}

	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	/*
	 * Check again since another task may have expanded the
	 * fd table while we dropped the lock
	 */
	if (nr >= fdt->max_fds || nr >= fdt->max_fdset) {
		copy_fdtable(nfdt, fdt);
	} else {
		/* Somebody expanded while we dropped file_lock */
		/* __free_fdtable() may vfree(); nfdt is not yet visible
		 * to anyone else, so it is safe to unlock around it. */
		spin_unlock(&files->file_lock);
		__free_fdtable(nfdt);
		spin_lock(&files->file_lock);
		goto out;
	}
	/* Publish the new table to RCU readers, then retire the old one. */
	rcu_assign_pointer(files->fdt, nfdt);
	free_fdtable(fdt);
out:
	return error;
}
345 | ||
346 | /* | |
347 | * Expand files. | |
348 | * Return <0 on error; 0 nothing done; 1 files expanded, we may have blocked. | |
349 | * Should be called with the files->file_lock spinlock held for write. | |
350 | */ | |
351 | int expand_files(struct files_struct *files, int nr) | |
352 | { | |
353 | int err, expand = 0; | |
badf1662 | 354 | struct fdtable *fdt; |
1da177e4 | 355 | |
badf1662 | 356 | fdt = files_fdtable(files); |
ab2af1f5 DS |
357 | if (nr >= fdt->max_fdset || nr >= fdt->max_fds) { |
358 | if (fdt->max_fdset >= NR_OPEN || | |
359 | fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) { | |
360 | err = -EMFILE; | |
1da177e4 | 361 | goto out; |
ab2af1f5 | 362 | } |
1da177e4 | 363 | expand = 1; |
ab2af1f5 | 364 | if ((err = expand_fdtable(files, nr))) |
1da177e4 LT |
365 | goto out; |
366 | } | |
367 | err = expand; | |
368 | out: | |
369 | return err; | |
370 | } | |
ab2af1f5 DS |
371 | |
/*
 * Initialise the per-cpu deferred-free state for @cpu: empty fdtable
 * list, work item bound to free_fdtable_work(), and a retry timer
 * whose callback is fdtable_timer() with this fddef as its data.
 */
static void __devinit fdtable_defer_list_init(int cpu)
{
	struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
	spin_lock_init(&fddef->lock);
	/* old 2-arg INIT_WORK: handler takes the fddef as its argument */
	INIT_WORK(&fddef->wq, (void (*)(void *))free_fdtable_work, fddef);
	init_timer(&fddef->timer);
	fddef->timer.data = (unsigned long)fddef;
	fddef->timer.function = fdtable_timer;
	fddef->next = NULL;
}
382 | ||
383 | void __init files_defer_init(void) | |
384 | { | |
385 | int i; | |
386 | /* Really early - can't use for_each_cpu */ | |
387 | for (i = 0; i < NR_CPUS; i++) | |
388 | fdtable_defer_list_init(i); | |
389 | } |