aio: kill batch allocation
[deliverable/linux.git] / fs / read_write.c
CommitLineData
1da177e4
LT
1/*
2 * linux/fs/read_write.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
6
7#include <linux/slab.h>
8#include <linux/stat.h>
9#include <linux/fcntl.h>
10#include <linux/file.h>
11#include <linux/uio.h>
0eeca283 12#include <linux/fsnotify.h>
1da177e4 13#include <linux/security.h>
630d9c47 14#include <linux/export.h>
1da177e4 15#include <linux/syscalls.h>
e28cc715 16#include <linux/pagemap.h>
d6b29d7c 17#include <linux/splice.h>
561c6731 18#include <linux/compat.h>
06ae43f3 19#include "internal.h"
1da177e4
LT
20
21#include <asm/uaccess.h>
22#include <asm/unistd.h>
23
c0bd14af
AV
24typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
25typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
26 unsigned long, loff_t);
27
4b6f5d20 28const struct file_operations generic_ro_fops = {
1da177e4 29 .llseek = generic_file_llseek,
543ade1f
BP
30 .read = do_sync_read,
31 .aio_read = generic_file_aio_read,
1da177e4 32 .mmap = generic_file_readonly_mmap,
534f2aaa 33 .splice_read = generic_file_splice_read,
1da177e4
LT
34};
35
36EXPORT_SYMBOL(generic_ro_fops);
37
cccb5a1e 38static inline int unsigned_offsets(struct file *file)
4a3956c7 39{
cccb5a1e 40 return file->f_mode & FMODE_UNSIGNED_OFFSET;
4a3956c7
KH
41}
42
ef3d0fd2
AK
43static loff_t lseek_execute(struct file *file, struct inode *inode,
44 loff_t offset, loff_t maxsize)
45{
46 if (offset < 0 && !unsigned_offsets(file))
47 return -EINVAL;
48 if (offset > maxsize)
49 return -EINVAL;
50
51 if (offset != file->f_pos) {
52 file->f_pos = offset;
53 file->f_version = 0;
54 }
55 return offset;
56}
57
3a8cff4f 58/**
5760495a 59 * generic_file_llseek_size - generic llseek implementation for regular files
3a8cff4f
CH
60 * @file: file structure to seek on
61 * @offset: file offset to seek to
965c8e59 62 * @whence: type of seek
e8b96eb5
ES
63 * @size: max size of this file in file system
64 * @eof: offset used for SEEK_END position
3a8cff4f 65 *
5760495a 66 * This is a variant of generic_file_llseek that allows passing in a custom
e8b96eb5 67 * maximum file size and a custom EOF position, for e.g. hashed directories
ef3d0fd2
AK
68 *
69 * Synchronization:
5760495a 70 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
ef3d0fd2
AK
71 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
72 * read/writes behave like SEEK_SET against seeks.
3a8cff4f 73 */
9465efc9 74loff_t
965c8e59 75generic_file_llseek_size(struct file *file, loff_t offset, int whence,
e8b96eb5 76 loff_t maxsize, loff_t eof)
1da177e4 77{
1da177e4
LT
78 struct inode *inode = file->f_mapping->host;
79
965c8e59 80 switch (whence) {
3a8cff4f 81 case SEEK_END:
e8b96eb5 82 offset += eof;
3a8cff4f
CH
83 break;
84 case SEEK_CUR:
5b6f1eb9
AK
85 /*
86 * Here we special-case the lseek(fd, 0, SEEK_CUR)
87 * position-querying operation. Avoid rewriting the "same"
88 * f_pos value back to the file because a concurrent read(),
89 * write() or lseek() might have altered it
90 */
91 if (offset == 0)
92 return file->f_pos;
ef3d0fd2
AK
93 /*
94 * f_lock protects against read/modify/write race with other
95 * SEEK_CURs. Note that parallel writes and reads behave
96 * like SEEK_SET.
97 */
98 spin_lock(&file->f_lock);
99 offset = lseek_execute(file, inode, file->f_pos + offset,
5760495a 100 maxsize);
ef3d0fd2
AK
101 spin_unlock(&file->f_lock);
102 return offset;
982d8165
JB
103 case SEEK_DATA:
104 /*
105 * In the generic case the entire file is data, so as long as
106 * offset isn't at the end of the file then the offset is data.
107 */
e8b96eb5 108 if (offset >= eof)
982d8165
JB
109 return -ENXIO;
110 break;
111 case SEEK_HOLE:
112 /*
113 * There is a virtual hole at the end of the file, so as long as
114 * offset isn't i_size or larger, return i_size.
115 */
e8b96eb5 116 if (offset >= eof)
982d8165 117 return -ENXIO;
e8b96eb5 118 offset = eof;
982d8165 119 break;
1da177e4 120 }
3a8cff4f 121
5760495a
AK
122 return lseek_execute(file, inode, offset, maxsize);
123}
124EXPORT_SYMBOL(generic_file_llseek_size);
125
126/**
127 * generic_file_llseek - generic llseek implementation for regular files
128 * @file: file structure to seek on
129 * @offset: file offset to seek to
965c8e59 130 * @whence: type of seek
5760495a
AK
131 *
132 * This is a generic implemenation of ->llseek useable for all normal local
133 * filesystems. It just updates the file offset to the value specified by
546ae2d2 134 * @offset and @whence.
5760495a 135 */
965c8e59 136loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
5760495a
AK
137{
138 struct inode *inode = file->f_mapping->host;
139
965c8e59 140 return generic_file_llseek_size(file, offset, whence,
e8b96eb5
ES
141 inode->i_sb->s_maxbytes,
142 i_size_read(inode));
1da177e4 143}
9465efc9 144EXPORT_SYMBOL(generic_file_llseek);
1da177e4 145
ae6afc3f
B
146/**
147 * noop_llseek - No Operation Performed llseek implementation
148 * @file: file structure to seek on
149 * @offset: file offset to seek to
965c8e59 150 * @whence: type of seek
ae6afc3f
B
151 *
152 * This is an implementation of ->llseek useable for the rare special case when
153 * userspace expects the seek to succeed but the (device) file is actually not
154 * able to perform the seek. In this case you use noop_llseek() instead of
155 * falling back to the default implementation of ->llseek.
156 */
965c8e59 157loff_t noop_llseek(struct file *file, loff_t offset, int whence)
ae6afc3f
B
158{
159 return file->f_pos;
160}
161EXPORT_SYMBOL(noop_llseek);
162
965c8e59 163loff_t no_llseek(struct file *file, loff_t offset, int whence)
1da177e4
LT
164{
165 return -ESPIPE;
166}
167EXPORT_SYMBOL(no_llseek);
168
965c8e59 169loff_t default_llseek(struct file *file, loff_t offset, int whence)
1da177e4 170{
496ad9aa 171 struct inode *inode = file_inode(file);
16abef0e 172 loff_t retval;
1da177e4 173
982d8165 174 mutex_lock(&inode->i_mutex);
965c8e59 175 switch (whence) {
7b8e8924 176 case SEEK_END:
982d8165 177 offset += i_size_read(inode);
1da177e4 178 break;
7b8e8924 179 case SEEK_CUR:
5b6f1eb9
AK
180 if (offset == 0) {
181 retval = file->f_pos;
182 goto out;
183 }
1da177e4 184 offset += file->f_pos;
982d8165
JB
185 break;
186 case SEEK_DATA:
187 /*
188 * In the generic case the entire file is data, so as
189 * long as offset isn't at the end of the file then the
190 * offset is data.
191 */
bacb2d81
DC
192 if (offset >= inode->i_size) {
193 retval = -ENXIO;
194 goto out;
195 }
982d8165
JB
196 break;
197 case SEEK_HOLE:
198 /*
199 * There is a virtual hole at the end of the file, so
200 * as long as offset isn't i_size or larger, return
201 * i_size.
202 */
bacb2d81
DC
203 if (offset >= inode->i_size) {
204 retval = -ENXIO;
205 goto out;
206 }
982d8165
JB
207 offset = inode->i_size;
208 break;
1da177e4
LT
209 }
210 retval = -EINVAL;
cccb5a1e 211 if (offset >= 0 || unsigned_offsets(file)) {
1da177e4
LT
212 if (offset != file->f_pos) {
213 file->f_pos = offset;
214 file->f_version = 0;
215 }
216 retval = offset;
217 }
5b6f1eb9 218out:
982d8165 219 mutex_unlock(&inode->i_mutex);
1da177e4
LT
220 return retval;
221}
222EXPORT_SYMBOL(default_llseek);
223
965c8e59 224loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
1da177e4
LT
225{
226 loff_t (*fn)(struct file *, loff_t, int);
227
228 fn = no_llseek;
229 if (file->f_mode & FMODE_LSEEK) {
1da177e4
LT
230 if (file->f_op && file->f_op->llseek)
231 fn = file->f_op->llseek;
232 }
965c8e59 233 return fn(file, offset, whence);
1da177e4
LT
234}
235EXPORT_SYMBOL(vfs_llseek);
236
965c8e59 237SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
1da177e4
LT
238{
239 off_t retval;
2903ff01
AV
240 struct fd f = fdget(fd);
241 if (!f.file)
242 return -EBADF;
1da177e4
LT
243
244 retval = -EINVAL;
965c8e59
AM
245 if (whence <= SEEK_MAX) {
246 loff_t res = vfs_llseek(f.file, offset, whence);
1da177e4
LT
247 retval = res;
248 if (res != (loff_t)retval)
249 retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
250 }
2903ff01 251 fdput(f);
1da177e4
LT
252 return retval;
253}
254
561c6731
AV
#ifdef CONFIG_COMPAT
/* Compat entry point: forwards to sys_lseek() with the compat_off_t offset. */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset,
		       unsigned int, whence)
{
	return sys_lseek(fd, offset, whence);
}
#endif
261
#ifdef __ARCH_WANT_SYS_LLSEEK
/*
 * llseek: seek to the 64-bit offset built from @offset_high/@offset_low and
 * store the resulting position in *@result.
 *
 * Returns 0 on success, -EBADF, -EINVAL (bad @whence), -EFAULT (result
 * could not be copied out) or the error returned by the seek itself.
 */
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		unsigned long, offset_low, loff_t __user *, result,
		unsigned int, whence)
{
	struct fd f = fdget(fd);
	loff_t offset;
	int retval;

	if (!f.file)
		return -EBADF;

	if (whence > SEEK_MAX) {
		retval = -EINVAL;
		goto out_putf;
	}

	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
			    whence);

	if (offset < 0)
		retval = (int)offset;
	else if (copy_to_user(result, &offset, sizeof(offset)))
		retval = -EFAULT;
	else
		retval = 0;

out_putf:
	fdput(f);
	return retval;
}
#endif
292
e28cc715
LT
293/*
294 * rw_verify_area doesn't like huge counts. We limit
295 * them to something that fits in "int" so that others
296 * won't have to do range checks all the time.
297 */
1da177e4
LT
298int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
299{
300 struct inode *inode;
301 loff_t pos;
c43e259c 302 int retval = -EINVAL;
1da177e4 303
496ad9aa 304 inode = file_inode(file);
e28cc715 305 if (unlikely((ssize_t) count < 0))
c43e259c 306 return retval;
1da177e4 307 pos = *ppos;
cccb5a1e
AV
308 if (unlikely(pos < 0)) {
309 if (!unsigned_offsets(file))
310 return retval;
311 if (count >= -pos) /* both values are in 0..LLONG_MAX */
312 return -EOVERFLOW;
313 } else if (unlikely((loff_t) (pos + count) < 0)) {
314 if (!unsigned_offsets(file))
4a3956c7
KH
315 return retval;
316 }
1da177e4 317
a16877ca 318 if (unlikely(inode->i_flock && mandatory_lock(inode))) {
c43e259c 319 retval = locks_mandatory_area(
e28cc715
LT
320 read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
321 inode, file, pos, count);
322 if (retval < 0)
323 return retval;
324 }
c43e259c
JM
325 retval = security_file_permission(file,
326 read_write == READ ? MAY_READ : MAY_WRITE);
327 if (retval)
328 return retval;
e28cc715 329 return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
1da177e4
LT
330}
331
332ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
333{
027445c3 334 struct iovec iov = { .iov_base = buf, .iov_len = len };
1da177e4
LT
335 struct kiocb kiocb;
336 ssize_t ret;
337
338 init_sync_kiocb(&kiocb, filp);
339 kiocb.ki_pos = *ppos;
027445c3 340 kiocb.ki_left = len;
61964eba 341 kiocb.ki_nbytes = len;
027445c3 342
41003a7b 343 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1da177e4
LT
344 if (-EIOCBQUEUED == ret)
345 ret = wait_on_sync_kiocb(&kiocb);
346 *ppos = kiocb.ki_pos;
347 return ret;
348}
349
350EXPORT_SYMBOL(do_sync_read);
351
352ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
353{
354 ssize_t ret;
355
356 if (!(file->f_mode & FMODE_READ))
357 return -EBADF;
358 if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
359 return -EINVAL;
360 if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
361 return -EFAULT;
362
363 ret = rw_verify_area(READ, file, pos, count);
e28cc715
LT
364 if (ret >= 0) {
365 count = ret;
c43e259c
JM
366 if (file->f_op->read)
367 ret = file->f_op->read(file, buf, count, pos);
368 else
369 ret = do_sync_read(file, buf, count, pos);
370 if (ret > 0) {
2a12a9d7 371 fsnotify_access(file);
c43e259c 372 add_rchar(current, ret);
1da177e4 373 }
c43e259c 374 inc_syscr(current);
1da177e4
LT
375 }
376
377 return ret;
378}
379
380EXPORT_SYMBOL(vfs_read);
381
382ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
383{
027445c3 384 struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
1da177e4
LT
385 struct kiocb kiocb;
386 ssize_t ret;
387
388 init_sync_kiocb(&kiocb, filp);
389 kiocb.ki_pos = *ppos;
027445c3 390 kiocb.ki_left = len;
61964eba 391 kiocb.ki_nbytes = len;
027445c3 392
41003a7b 393 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1da177e4
LT
394 if (-EIOCBQUEUED == ret)
395 ret = wait_on_sync_kiocb(&kiocb);
396 *ppos = kiocb.ki_pos;
397 return ret;
398}
399
400EXPORT_SYMBOL(do_sync_write);
401
06ae43f3
AV
402ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
403{
404 mm_segment_t old_fs;
405 const char __user *p;
406 ssize_t ret;
407
3e84f48e
AV
408 if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
409 return -EINVAL;
410
06ae43f3
AV
411 old_fs = get_fs();
412 set_fs(get_ds());
413 p = (__force const char __user *)buf;
414 if (count > MAX_RW_COUNT)
415 count = MAX_RW_COUNT;
416 if (file->f_op->write)
417 ret = file->f_op->write(file, p, count, pos);
418 else
419 ret = do_sync_write(file, p, count, pos);
420 set_fs(old_fs);
421 if (ret > 0) {
422 fsnotify_modify(file);
423 add_wchar(current, ret);
424 }
425 inc_syscw(current);
426 return ret;
427}
428
1da177e4
LT
429ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
430{
431 ssize_t ret;
432
433 if (!(file->f_mode & FMODE_WRITE))
434 return -EBADF;
435 if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
436 return -EINVAL;
437 if (unlikely(!access_ok(VERIFY_READ, buf, count)))
438 return -EFAULT;
439
440 ret = rw_verify_area(WRITE, file, pos, count);
e28cc715
LT
441 if (ret >= 0) {
442 count = ret;
03d95eb2 443 file_start_write(file);
c43e259c
JM
444 if (file->f_op->write)
445 ret = file->f_op->write(file, buf, count, pos);
446 else
447 ret = do_sync_write(file, buf, count, pos);
448 if (ret > 0) {
2a12a9d7 449 fsnotify_modify(file);
c43e259c 450 add_wchar(current, ret);
1da177e4 451 }
c43e259c 452 inc_syscw(current);
03d95eb2 453 file_end_write(file);
1da177e4
LT
454 }
455
456 return ret;
457}
458
459EXPORT_SYMBOL(vfs_write);
460
461static inline loff_t file_pos_read(struct file *file)
462{
463 return file->f_pos;
464}
465
466static inline void file_pos_write(struct file *file, loff_t pos)
467{
468 file->f_pos = pos;
469}
470
3cdad428 471SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
1da177e4 472{
2903ff01 473 struct fd f = fdget(fd);
1da177e4 474 ssize_t ret = -EBADF;
1da177e4 475
2903ff01
AV
476 if (f.file) {
477 loff_t pos = file_pos_read(f.file);
478 ret = vfs_read(f.file, buf, count, &pos);
479 file_pos_write(f.file, pos);
480 fdput(f);
1da177e4 481 }
1da177e4
LT
482 return ret;
483}
1da177e4 484
3cdad428
HC
485SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
486 size_t, count)
1da177e4 487{
2903ff01 488 struct fd f = fdget(fd);
1da177e4 489 ssize_t ret = -EBADF;
1da177e4 490
2903ff01
AV
491 if (f.file) {
492 loff_t pos = file_pos_read(f.file);
493 ret = vfs_write(f.file, buf, count, &pos);
494 file_pos_write(f.file, pos);
495 fdput(f);
1da177e4
LT
496 }
497
498 return ret;
499}
500
4a0fd5bf
AV
501SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
502 size_t, count, loff_t, pos)
1da177e4 503{
2903ff01 504 struct fd f;
1da177e4 505 ssize_t ret = -EBADF;
1da177e4
LT
506
507 if (pos < 0)
508 return -EINVAL;
509
2903ff01
AV
510 f = fdget(fd);
511 if (f.file) {
1da177e4 512 ret = -ESPIPE;
2903ff01
AV
513 if (f.file->f_mode & FMODE_PREAD)
514 ret = vfs_read(f.file, buf, count, &pos);
515 fdput(f);
1da177e4
LT
516 }
517
518 return ret;
519}
520
4a0fd5bf
AV
521SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
522 size_t, count, loff_t, pos)
1da177e4 523{
2903ff01 524 struct fd f;
1da177e4 525 ssize_t ret = -EBADF;
1da177e4
LT
526
527 if (pos < 0)
528 return -EINVAL;
529
2903ff01
AV
530 f = fdget(fd);
531 if (f.file) {
1da177e4 532 ret = -ESPIPE;
2903ff01
AV
533 if (f.file->f_mode & FMODE_PWRITE)
534 ret = vfs_write(f.file, buf, count, &pos);
535 fdput(f);
1da177e4
LT
536 }
537
538 return ret;
539}
540
/*
 * Trim an iovec array in place so that it describes at most @to bytes.
 *
 * The first segment at which the running total reaches @to has its length
 * cut to the remainder and ends the scan.  Returns the number of segments
 * that remain in use (nr_segs if the total never reaches @to).
 */
unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
{
	size_t kept = 0;
	unsigned long i;

	for (i = 0; i < nr_segs; i++) {
		if (kept + iov[i].iov_len >= to) {
			iov[i].iov_len = to - kept;
			return i + 1;
		}
		kept += iov[i].iov_len;
	}
	return nr_segs;
}
19295529 560EXPORT_SYMBOL(iov_shorten);
1da177e4 561
72ec3516 562static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
ee0b3e67
BP
563 unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
564{
565 struct kiocb kiocb;
566 ssize_t ret;
567
568 init_sync_kiocb(&kiocb, filp);
569 kiocb.ki_pos = *ppos;
570 kiocb.ki_left = len;
571 kiocb.ki_nbytes = len;
572
41003a7b 573 ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
ee0b3e67
BP
574 if (ret == -EIOCBQUEUED)
575 ret = wait_on_sync_kiocb(&kiocb);
576 *ppos = kiocb.ki_pos;
577 return ret;
578}
579
580/* Do it by hand, with file-ops */
72ec3516 581static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
ee0b3e67
BP
582 unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
583{
584 struct iovec *vector = iov;
585 ssize_t ret = 0;
586
587 while (nr_segs > 0) {
588 void __user *base;
589 size_t len;
590 ssize_t nr;
591
592 base = vector->iov_base;
593 len = vector->iov_len;
594 vector++;
595 nr_segs--;
596
597 nr = fn(filp, base, len, ppos);
598
599 if (nr < 0) {
600 if (!ret)
601 ret = nr;
602 break;
603 }
604 ret += nr;
605 if (nr != len)
606 break;
607 }
608
609 return ret;
610}
611
1da177e4
LT
/*
 * A write operation does a read from user space and vice versa, so the
 * access_ok() direction is the opposite of the I/O direction.
 */
#define vrfy_dir(type) (((type) == READ) ? VERIFY_WRITE : VERIFY_READ)
614
eed4e51f
BP
615ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
616 unsigned long nr_segs, unsigned long fast_segs,
617 struct iovec *fast_pointer,
ac34ebb3 618 struct iovec **ret_pointer)
435f49a5 619{
eed4e51f 620 unsigned long seg;
435f49a5 621 ssize_t ret;
eed4e51f
BP
622 struct iovec *iov = fast_pointer;
623
435f49a5
LT
624 /*
625 * SuS says "The readv() function *may* fail if the iovcnt argument
626 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
627 * traditionally returned zero for zero segments, so...
628 */
eed4e51f
BP
629 if (nr_segs == 0) {
630 ret = 0;
435f49a5 631 goto out;
eed4e51f
BP
632 }
633
435f49a5
LT
634 /*
635 * First get the "struct iovec" from user memory and
636 * verify all the pointers
637 */
eed4e51f
BP
638 if (nr_segs > UIO_MAXIOV) {
639 ret = -EINVAL;
435f49a5 640 goto out;
eed4e51f
BP
641 }
642 if (nr_segs > fast_segs) {
435f49a5 643 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
eed4e51f
BP
644 if (iov == NULL) {
645 ret = -ENOMEM;
435f49a5 646 goto out;
eed4e51f 647 }
435f49a5 648 }
eed4e51f
BP
649 if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
650 ret = -EFAULT;
435f49a5 651 goto out;
eed4e51f
BP
652 }
653
435f49a5 654 /*
eed4e51f
BP
655 * According to the Single Unix Specification we should return EINVAL
656 * if an element length is < 0 when cast to ssize_t or if the
657 * total length would overflow the ssize_t return value of the
658 * system call.
435f49a5
LT
659 *
660 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
661 * overflow case.
662 */
eed4e51f 663 ret = 0;
435f49a5
LT
664 for (seg = 0; seg < nr_segs; seg++) {
665 void __user *buf = iov[seg].iov_base;
666 ssize_t len = (ssize_t)iov[seg].iov_len;
eed4e51f
BP
667
668 /* see if we we're about to use an invalid len or if
669 * it's about to overflow ssize_t */
435f49a5 670 if (len < 0) {
eed4e51f 671 ret = -EINVAL;
435f49a5 672 goto out;
eed4e51f 673 }
ac34ebb3 674 if (type >= 0
fcf63409 675 && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
eed4e51f 676 ret = -EFAULT;
435f49a5
LT
677 goto out;
678 }
679 if (len > MAX_RW_COUNT - ret) {
680 len = MAX_RW_COUNT - ret;
681 iov[seg].iov_len = len;
eed4e51f 682 }
eed4e51f 683 ret += len;
435f49a5 684 }
eed4e51f
BP
685out:
686 *ret_pointer = iov;
687 return ret;
688}
689
1da177e4
LT
690static ssize_t do_readv_writev(int type, struct file *file,
691 const struct iovec __user * uvector,
692 unsigned long nr_segs, loff_t *pos)
693{
1da177e4
LT
694 size_t tot_len;
695 struct iovec iovstack[UIO_FASTIOV];
ee0b3e67 696 struct iovec *iov = iovstack;
1da177e4 697 ssize_t ret;
1da177e4
LT
698 io_fn_t fn;
699 iov_fn_t fnv;
700
eed4e51f
BP
701 if (!file->f_op) {
702 ret = -EINVAL;
1da177e4 703 goto out;
1da177e4 704 }
1da177e4 705
eed4e51f 706 ret = rw_copy_check_uvector(type, uvector, nr_segs,
ac34ebb3 707 ARRAY_SIZE(iovstack), iovstack, &iov);
eed4e51f 708 if (ret <= 0)
1da177e4 709 goto out;
1da177e4 710
eed4e51f 711 tot_len = ret;
1da177e4 712 ret = rw_verify_area(type, file, pos, tot_len);
e28cc715 713 if (ret < 0)
411b67b4 714 goto out;
1da177e4
LT
715
716 fnv = NULL;
717 if (type == READ) {
718 fn = file->f_op->read;
ee0b3e67 719 fnv = file->f_op->aio_read;
1da177e4
LT
720 } else {
721 fn = (io_fn_t)file->f_op->write;
ee0b3e67 722 fnv = file->f_op->aio_write;
03d95eb2 723 file_start_write(file);
1da177e4
LT
724 }
725
ee0b3e67
BP
726 if (fnv)
727 ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
728 pos, fnv);
729 else
730 ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
1da177e4 731
03d95eb2
AV
732 if (type != READ)
733 file_end_write(file);
734
1da177e4
LT
735out:
736 if (iov != iovstack)
737 kfree(iov);
0eeca283
RL
738 if ((ret + (type == READ)) > 0) {
739 if (type == READ)
2a12a9d7 740 fsnotify_access(file);
0eeca283 741 else
2a12a9d7 742 fsnotify_modify(file);
0eeca283 743 }
1da177e4 744 return ret;
1da177e4
LT
745}
746
747ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
748 unsigned long vlen, loff_t *pos)
749{
750 if (!(file->f_mode & FMODE_READ))
751 return -EBADF;
ee0b3e67 752 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
1da177e4
LT
753 return -EINVAL;
754
755 return do_readv_writev(READ, file, vec, vlen, pos);
756}
757
758EXPORT_SYMBOL(vfs_readv);
759
760ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
761 unsigned long vlen, loff_t *pos)
762{
763 if (!(file->f_mode & FMODE_WRITE))
764 return -EBADF;
ee0b3e67 765 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
1da177e4
LT
766 return -EINVAL;
767
768 return do_readv_writev(WRITE, file, vec, vlen, pos);
769}
770
771EXPORT_SYMBOL(vfs_writev);
772
3cdad428
HC
773SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
774 unsigned long, vlen)
1da177e4 775{
2903ff01 776 struct fd f = fdget(fd);
1da177e4 777 ssize_t ret = -EBADF;
1da177e4 778
2903ff01
AV
779 if (f.file) {
780 loff_t pos = file_pos_read(f.file);
781 ret = vfs_readv(f.file, vec, vlen, &pos);
782 file_pos_write(f.file, pos);
783 fdput(f);
1da177e4
LT
784 }
785
786 if (ret > 0)
4b98d11b
AD
787 add_rchar(current, ret);
788 inc_syscr(current);
1da177e4
LT
789 return ret;
790}
791
3cdad428
HC
792SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
793 unsigned long, vlen)
1da177e4 794{
2903ff01 795 struct fd f = fdget(fd);
1da177e4 796 ssize_t ret = -EBADF;
1da177e4 797
2903ff01
AV
798 if (f.file) {
799 loff_t pos = file_pos_read(f.file);
800 ret = vfs_writev(f.file, vec, vlen, &pos);
801 file_pos_write(f.file, pos);
802 fdput(f);
1da177e4
LT
803 }
804
805 if (ret > 0)
4b98d11b
AD
806 add_wchar(current, ret);
807 inc_syscw(current);
1da177e4
LT
808 return ret;
809}
810
601cc11d
LT
811static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
812{
813#define HALF_LONG_BITS (BITS_PER_LONG / 2)
814 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
815}
816
f3554f4b 817SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
601cc11d 818 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
f3554f4b 819{
601cc11d 820 loff_t pos = pos_from_hilo(pos_h, pos_l);
2903ff01 821 struct fd f;
f3554f4b 822 ssize_t ret = -EBADF;
f3554f4b
GH
823
824 if (pos < 0)
825 return -EINVAL;
826
2903ff01
AV
827 f = fdget(fd);
828 if (f.file) {
f3554f4b 829 ret = -ESPIPE;
2903ff01
AV
830 if (f.file->f_mode & FMODE_PREAD)
831 ret = vfs_readv(f.file, vec, vlen, &pos);
832 fdput(f);
f3554f4b
GH
833 }
834
835 if (ret > 0)
836 add_rchar(current, ret);
837 inc_syscr(current);
838 return ret;
839}
840
841SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
601cc11d 842 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
f3554f4b 843{
601cc11d 844 loff_t pos = pos_from_hilo(pos_h, pos_l);
2903ff01 845 struct fd f;
f3554f4b 846 ssize_t ret = -EBADF;
f3554f4b
GH
847
848 if (pos < 0)
849 return -EINVAL;
850
2903ff01
AV
851 f = fdget(fd);
852 if (f.file) {
f3554f4b 853 ret = -ESPIPE;
2903ff01
AV
854 if (f.file->f_mode & FMODE_PWRITE)
855 ret = vfs_writev(f.file, vec, vlen, &pos);
856 fdput(f);
f3554f4b
GH
857 }
858
859 if (ret > 0)
860 add_wchar(current, ret);
861 inc_syscw(current);
862 return ret;
863}
864
72ec3516
AV
865#ifdef CONFIG_COMPAT
866
867static ssize_t compat_do_readv_writev(int type, struct file *file,
868 const struct compat_iovec __user *uvector,
869 unsigned long nr_segs, loff_t *pos)
870{
871 compat_ssize_t tot_len;
872 struct iovec iovstack[UIO_FASTIOV];
873 struct iovec *iov = iovstack;
874 ssize_t ret;
875 io_fn_t fn;
876 iov_fn_t fnv;
877
878 ret = -EINVAL;
879 if (!file->f_op)
880 goto out;
881
882 ret = -EFAULT;
883 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
884 goto out;
885
886 ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
887 UIO_FASTIOV, iovstack, &iov);
888 if (ret <= 0)
889 goto out;
890
891 tot_len = ret;
892 ret = rw_verify_area(type, file, pos, tot_len);
893 if (ret < 0)
894 goto out;
895
896 fnv = NULL;
897 if (type == READ) {
898 fn = file->f_op->read;
899 fnv = file->f_op->aio_read;
900 } else {
901 fn = (io_fn_t)file->f_op->write;
902 fnv = file->f_op->aio_write;
03d95eb2 903 file_start_write(file);
72ec3516
AV
904 }
905
03d95eb2 906 if (fnv)
72ec3516
AV
907 ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
908 pos, fnv);
03d95eb2 909 else
72ec3516
AV
910 ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
911
03d95eb2
AV
912 if (type != READ)
913 file_end_write(file);
914
72ec3516
AV
915out:
916 if (iov != iovstack)
917 kfree(iov);
918 if ((ret + (type == READ)) > 0) {
919 if (type == READ)
920 fsnotify_access(file);
921 else
922 fsnotify_modify(file);
923 }
924 return ret;
925}
926
927static size_t compat_readv(struct file *file,
928 const struct compat_iovec __user *vec,
929 unsigned long vlen, loff_t *pos)
930{
931 ssize_t ret = -EBADF;
932
933 if (!(file->f_mode & FMODE_READ))
934 goto out;
935
936 ret = -EINVAL;
937 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
938 goto out;
939
940 ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
941
942out:
943 if (ret > 0)
944 add_rchar(current, ret);
945 inc_syscr(current);
946 return ret;
947}
948
949COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
950 const struct compat_iovec __user *,vec,
951 unsigned long, vlen)
952{
953 struct fd f = fdget(fd);
954 ssize_t ret;
955 loff_t pos;
956
957 if (!f.file)
958 return -EBADF;
959 pos = f.file->f_pos;
960 ret = compat_readv(f.file, vec, vlen, &pos);
961 f.file->f_pos = pos;
962 fdput(f);
963 return ret;
964}
965
966COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
967 const struct compat_iovec __user *,vec,
968 unsigned long, vlen, loff_t, pos)
969{
970 struct fd f;
971 ssize_t ret;
972
973 if (pos < 0)
974 return -EINVAL;
975 f = fdget(fd);
976 if (!f.file)
977 return -EBADF;
978 ret = -ESPIPE;
979 if (f.file->f_mode & FMODE_PREAD)
980 ret = compat_readv(f.file, vec, vlen, &pos);
981 fdput(f);
982 return ret;
983}
984
985COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd,
986 const struct compat_iovec __user *,vec,
987 unsigned long, vlen, u32, pos_low, u32, pos_high)
988{
989 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
990 return compat_sys_preadv64(fd, vec, vlen, pos);
991}
992
993static size_t compat_writev(struct file *file,
994 const struct compat_iovec __user *vec,
995 unsigned long vlen, loff_t *pos)
996{
997 ssize_t ret = -EBADF;
998
999 if (!(file->f_mode & FMODE_WRITE))
1000 goto out;
1001
1002 ret = -EINVAL;
1003 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
1004 goto out;
1005
1006 ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
1007
1008out:
1009 if (ret > 0)
1010 add_wchar(current, ret);
1011 inc_syscw(current);
1012 return ret;
1013}
1014
1015COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
1016 const struct compat_iovec __user *, vec,
1017 unsigned long, vlen)
1018{
1019 struct fd f = fdget(fd);
1020 ssize_t ret;
1021 loff_t pos;
1022
1023 if (!f.file)
1024 return -EBADF;
1025 pos = f.file->f_pos;
1026 ret = compat_writev(f.file, vec, vlen, &pos);
1027 f.file->f_pos = pos;
1028 fdput(f);
1029 return ret;
1030}
1031
1032COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1033 const struct compat_iovec __user *,vec,
1034 unsigned long, vlen, loff_t, pos)
1035{
1036 struct fd f;
1037 ssize_t ret;
1038
1039 if (pos < 0)
1040 return -EINVAL;
1041 f = fdget(fd);
1042 if (!f.file)
1043 return -EBADF;
1044 ret = -ESPIPE;
1045 if (f.file->f_mode & FMODE_PWRITE)
1046 ret = compat_writev(f.file, vec, vlen, &pos);
1047 fdput(f);
1048 return ret;
1049}
1050
1051COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd,
1052 const struct compat_iovec __user *,vec,
1053 unsigned long, vlen, u32, pos_low, u32, pos_high)
1054{
1055 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1056 return compat_sys_pwritev64(fd, vec, vlen, pos);
1057}
1058#endif
1059
19f4fc3a
AV
1060static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1061 size_t count, loff_t max)
1da177e4 1062{
2903ff01
AV
1063 struct fd in, out;
1064 struct inode *in_inode, *out_inode;
1da177e4
LT
1065 loff_t pos;
1066 ssize_t retval;
2903ff01 1067 int fl;
1da177e4
LT
1068
1069 /*
1070 * Get input file, and verify that it is ok..
1071 */
1072 retval = -EBADF;
2903ff01
AV
1073 in = fdget(in_fd);
1074 if (!in.file)
1da177e4 1075 goto out;
2903ff01 1076 if (!(in.file->f_mode & FMODE_READ))
1da177e4 1077 goto fput_in;
1da177e4
LT
1078 retval = -ESPIPE;
1079 if (!ppos)
2903ff01 1080 ppos = &in.file->f_pos;
1da177e4 1081 else
2903ff01 1082 if (!(in.file->f_mode & FMODE_PREAD))
1da177e4 1083 goto fput_in;
2903ff01 1084 retval = rw_verify_area(READ, in.file, ppos, count);
e28cc715 1085 if (retval < 0)
1da177e4 1086 goto fput_in;
e28cc715 1087 count = retval;
1da177e4 1088
1da177e4
LT
1089 /*
1090 * Get output file, and verify that it is ok..
1091 */
1092 retval = -EBADF;
2903ff01
AV
1093 out = fdget(out_fd);
1094 if (!out.file)
1da177e4 1095 goto fput_in;
2903ff01 1096 if (!(out.file->f_mode & FMODE_WRITE))
1da177e4
LT
1097 goto fput_out;
1098 retval = -EINVAL;
496ad9aa
AV
1099 in_inode = file_inode(in.file);
1100 out_inode = file_inode(out.file);
2903ff01 1101 retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count);
e28cc715 1102 if (retval < 0)
1da177e4 1103 goto fput_out;
e28cc715 1104 count = retval;
1da177e4 1105
1da177e4
LT
1106 if (!max)
1107 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1108
1109 pos = *ppos;
1da177e4
LT
1110 if (unlikely(pos + count > max)) {
1111 retval = -EOVERFLOW;
1112 if (pos >= max)
1113 goto fput_out;
1114 count = max - pos;
1115 }
1116
d96e6e71 1117 fl = 0;
534f2aaa 1118#if 0
d96e6e71
JA
1119 /*
1120 * We need to debate whether we can enable this or not. The
1121 * man page documents EAGAIN return for the output at least,
1122 * and the application is arguably buggy if it doesn't expect
1123 * EAGAIN on a non-blocking file descriptor.
1124 */
2903ff01 1125 if (in.file->f_flags & O_NONBLOCK)
d96e6e71 1126 fl = SPLICE_F_NONBLOCK;
534f2aaa 1127#endif
2903ff01 1128 retval = do_splice_direct(in.file, ppos, out.file, count, fl);
1da177e4
LT
1129
1130 if (retval > 0) {
4b98d11b
AD
1131 add_rchar(current, retval);
1132 add_wchar(current, retval);
a68c2f12
SW
1133 fsnotify_access(in.file);
1134 fsnotify_modify(out.file);
1da177e4 1135 }
1da177e4 1136
4b98d11b
AD
1137 inc_syscr(current);
1138 inc_syscw(current);
1da177e4
LT
1139 if (*ppos > max)
1140 retval = -EOVERFLOW;
1141
1142fput_out:
2903ff01 1143 fdput(out);
1da177e4 1144fput_in:
2903ff01 1145 fdput(in);
1da177e4
LT
1146out:
1147 return retval;
1148}
1149
002c8976 1150SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1da177e4
LT
1151{
1152 loff_t pos;
1153 off_t off;
1154 ssize_t ret;
1155
1156 if (offset) {
1157 if (unlikely(get_user(off, offset)))
1158 return -EFAULT;
1159 pos = off;
1160 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1161 if (unlikely(put_user(pos, offset)))
1162 return -EFAULT;
1163 return ret;
1164 }
1165
1166 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1167}
1168
002c8976 1169SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1da177e4
LT
1170{
1171 loff_t pos;
1172 ssize_t ret;
1173
1174 if (offset) {
1175 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1176 return -EFAULT;
1177 ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1178 if (unlikely(put_user(pos, offset)))
1179 return -EFAULT;
1180 return ret;
1181 }
1182
1183 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1184}
19f4fc3a
AV
1185
1186#ifdef CONFIG_COMPAT
1187COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1188 compat_off_t __user *, offset, compat_size_t, count)
1189{
1190 loff_t pos;
1191 off_t off;
1192 ssize_t ret;
1193
1194 if (offset) {
1195 if (unlikely(get_user(off, offset)))
1196 return -EFAULT;
1197 pos = off;
1198 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1199 if (unlikely(put_user(pos, offset)))
1200 return -EFAULT;
1201 return ret;
1202 }
1203
1204 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1205}
1206
1207COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1208 compat_loff_t __user *, offset, compat_size_t, count)
1209{
1210 loff_t pos;
1211 ssize_t ret;
1212
1213 if (offset) {
1214 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1215 return -EFAULT;
1216 ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1217 if (unlikely(put_user(pos, offset)))
1218 return -EFAULT;
1219 return ret;
1220 }
1221
1222 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1223}
1224#endif
This page took 0.691866 seconds and 5 git commands to generate.