[PATCH] splice: fix unlocking of page on error ->prepare_write()
[deliverable/linux.git] / fs / splice.c
CommitLineData
5274f052
JA
1/*
2 * "splice": joining two ropes together by interweaving their strands.
3 *
4 * This is the "extended pipe" functionality, where a pipe is used as
5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
6 * buffer that you can use to transfer data from one end to the other.
7 *
8 * The traditional unix read/write is extended with a "splice()" operation
9 * that transfers data buffers to or from a pipe buffer.
10 *
11 * Named by Larry McVoy, original implementation from Linus, extended by
c2058e06
JA
12 * Jens to support splicing to files, network, direct splicing, etc and
13 * fixing lots of bugs.
5274f052 14 *
c2058e06
JA
15 * Copyright (C) 2005-2006 Jens Axboe <axboe@suse.de>
16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
5274f052
JA
18 *
19 */
20#include <linux/fs.h>
21#include <linux/file.h>
22#include <linux/pagemap.h>
23#include <linux/pipe_fs_i.h>
24#include <linux/mm_inline.h>
5abc97aa 25#include <linux/swap.h>
4f6f0bd2
JA
26#include <linux/writeback.h>
27#include <linux/buffer_head.h>
a0f06780 28#include <linux/module.h>
4f6f0bd2 29#include <linux/syscalls.h>
912d35f8 30#include <linux/uio.h>
5274f052 31
912d35f8
JA
/*
 * Describes the used part of a single page: where the data begins inside
 * the page and how many bytes of it there are.
 */
struct partial_page {
	unsigned int offset;	/* byte offset of the data within the page */
	unsigned int len;	/* number of bytes of data at that offset */
};
36
/*
 * Describes the batch of pages passed to splice_to_pipe().
 */
struct splice_pipe_desc {
	/* page map */
	struct page **pages;
	/* offset/length for each entry of pages[]; pages[] may not be contig */
	struct partial_page *partial;
	/* number of pages in map */
	int nr_pages;
	/* splice flags */
	unsigned int flags;
	/* ops associated with output pipe */
	struct pipe_buf_operations *ops;
};
47
83f9135b
JA
48/*
49 * Attempt to steal a page from a pipe buffer. This should perhaps go into
50 * a vm helper function, it's already simplified quite a bit by the
51 * addition of remove_mapping(). If success is returned, the caller may
52 * attempt to reuse this page for another destination.
53 */
5abc97aa
JA
54static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
55 struct pipe_buffer *buf)
56{
57 struct page *page = buf->page;
4f6f0bd2 58 struct address_space *mapping = page_mapping(page);
5abc97aa 59
9e0267c2
JA
60 lock_page(page);
61
5abc97aa
JA
62 WARN_ON(!PageUptodate(page));
63
ad8d6f0a
JA
64 /*
65 * At least for ext2 with nobh option, we need to wait on writeback
66 * completing on this page, since we'll remove it from the pagecache.
67 * Otherwise truncate wont wait on the page, allowing the disk
68 * blocks to be reused by someone else before we actually wrote our
69 * data to them. fs corruption ensues.
70 */
71 wait_on_page_writeback(page);
72
4f6f0bd2
JA
73 if (PagePrivate(page))
74 try_to_release_page(page, mapping_gfp_mask(mapping));
75
9e0267c2
JA
76 if (!remove_mapping(mapping, page)) {
77 unlock_page(page);
5abc97aa 78 return 1;
9e0267c2 79 }
5abc97aa 80
5abc97aa
JA
81 return 0;
82}
83
5274f052
JA
84static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
85 struct pipe_buffer *buf)
86{
87 page_cache_release(buf->page);
5274f052
JA
88}
89
f84d7519
JA
90static int page_cache_pipe_buf_pin(struct pipe_inode_info *info,
91 struct pipe_buffer *buf)
5274f052
JA
92{
93 struct page *page = buf->page;
49d0b21b 94 int err;
5274f052
JA
95
96 if (!PageUptodate(page)) {
49d0b21b
JA
97 lock_page(page);
98
99 /*
100 * Page got truncated/unhashed. This will cause a 0-byte
73d62d83 101 * splice, if this is the first page.
49d0b21b
JA
102 */
103 if (!page->mapping) {
104 err = -ENODATA;
105 goto error;
106 }
5274f052 107
49d0b21b 108 /*
73d62d83 109 * Uh oh, read-error from disk.
49d0b21b
JA
110 */
111 if (!PageUptodate(page)) {
112 err = -EIO;
113 goto error;
114 }
115
116 /*
f84d7519 117 * Page is ok afterall, we are done.
49d0b21b 118 */
5274f052 119 unlock_page(page);
5274f052
JA
120 }
121
f84d7519 122 return 0;
49d0b21b
JA
123error:
124 unlock_page(page);
f84d7519 125 return err;
70524490
JA
126}
127
5274f052
JA
128static struct pipe_buf_operations page_cache_pipe_buf_ops = {
129 .can_merge = 0,
f84d7519
JA
130 .map = generic_pipe_buf_map,
131 .unmap = generic_pipe_buf_unmap,
132 .pin = page_cache_pipe_buf_pin,
5274f052 133 .release = page_cache_pipe_buf_release,
5abc97aa 134 .steal = page_cache_pipe_buf_steal,
f84d7519 135 .get = generic_pipe_buf_get,
5274f052
JA
136};
137
912d35f8
JA
138static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
139 struct pipe_buffer *buf)
140{
7afa6fd0
JA
141 if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
142 return 1;
143
330ab716 144 return generic_pipe_buf_steal(pipe, buf);
912d35f8
JA
145}
146
147static struct pipe_buf_operations user_page_pipe_buf_ops = {
148 .can_merge = 0,
f84d7519
JA
149 .map = generic_pipe_buf_map,
150 .unmap = generic_pipe_buf_unmap,
151 .pin = generic_pipe_buf_pin,
912d35f8
JA
152 .release = page_cache_pipe_buf_release,
153 .steal = user_page_pipe_buf_steal,
f84d7519 154 .get = generic_pipe_buf_get,
912d35f8
JA
155};
156
83f9135b
JA
157/*
158 * Pipe output worker. This sets up our pipe format with the page cache
159 * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
160 */
00522fb4
JA
161static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
162 struct splice_pipe_desc *spd)
5274f052 163{
912d35f8 164 int ret, do_wakeup, page_nr;
5274f052
JA
165
166 ret = 0;
167 do_wakeup = 0;
912d35f8 168 page_nr = 0;
5274f052 169
3a326a2c
IM
170 if (pipe->inode)
171 mutex_lock(&pipe->inode->i_mutex);
5274f052 172
5274f052 173 for (;;) {
3a326a2c 174 if (!pipe->readers) {
5274f052
JA
175 send_sig(SIGPIPE, current, 0);
176 if (!ret)
177 ret = -EPIPE;
178 break;
179 }
180
6f767b04
JA
181 if (pipe->nrbufs < PIPE_BUFFERS) {
182 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
3a326a2c 183 struct pipe_buffer *buf = pipe->bufs + newbuf;
5274f052 184
912d35f8
JA
185 buf->page = spd->pages[page_nr];
186 buf->offset = spd->partial[page_nr].offset;
187 buf->len = spd->partial[page_nr].len;
188 buf->ops = spd->ops;
7afa6fd0
JA
189 if (spd->flags & SPLICE_F_GIFT)
190 buf->flags |= PIPE_BUF_FLAG_GIFT;
191
6f767b04 192 pipe->nrbufs++;
912d35f8
JA
193 page_nr++;
194 ret += buf->len;
195
6f767b04
JA
196 if (pipe->inode)
197 do_wakeup = 1;
5274f052 198
912d35f8 199 if (!--spd->nr_pages)
5274f052 200 break;
6f767b04 201 if (pipe->nrbufs < PIPE_BUFFERS)
5274f052
JA
202 continue;
203
204 break;
205 }
206
912d35f8 207 if (spd->flags & SPLICE_F_NONBLOCK) {
29e35094
LT
208 if (!ret)
209 ret = -EAGAIN;
210 break;
211 }
212
5274f052
JA
213 if (signal_pending(current)) {
214 if (!ret)
215 ret = -ERESTARTSYS;
216 break;
217 }
218
219 if (do_wakeup) {
c0bd1f65 220 smp_mb();
3a326a2c
IM
221 if (waitqueue_active(&pipe->wait))
222 wake_up_interruptible_sync(&pipe->wait);
223 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
5274f052
JA
224 do_wakeup = 0;
225 }
226
3a326a2c
IM
227 pipe->waiting_writers++;
228 pipe_wait(pipe);
229 pipe->waiting_writers--;
5274f052
JA
230 }
231
3a326a2c
IM
232 if (pipe->inode)
233 mutex_unlock(&pipe->inode->i_mutex);
5274f052
JA
234
235 if (do_wakeup) {
c0bd1f65 236 smp_mb();
3a326a2c
IM
237 if (waitqueue_active(&pipe->wait))
238 wake_up_interruptible(&pipe->wait);
239 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
5274f052
JA
240 }
241
912d35f8
JA
242 while (page_nr < spd->nr_pages)
243 page_cache_release(spd->pages[page_nr++]);
5274f052
JA
244
245 return ret;
246}
247
3a326a2c 248static int
cbb7e577
JA
249__generic_file_splice_read(struct file *in, loff_t *ppos,
250 struct pipe_inode_info *pipe, size_t len,
251 unsigned int flags)
5274f052
JA
252{
253 struct address_space *mapping = in->f_mapping;
912d35f8 254 unsigned int loff, nr_pages;
16c523dd 255 struct page *pages[PIPE_BUFFERS];
912d35f8 256 struct partial_page partial[PIPE_BUFFERS];
5274f052 257 struct page *page;
91ad66ef
JA
258 pgoff_t index, end_index;
259 loff_t isize;
912d35f8 260 size_t total_len;
eb20796b 261 int error, page_nr;
912d35f8
JA
262 struct splice_pipe_desc spd = {
263 .pages = pages,
264 .partial = partial,
265 .flags = flags,
266 .ops = &page_cache_pipe_buf_ops,
267 };
5274f052 268
cbb7e577 269 index = *ppos >> PAGE_CACHE_SHIFT;
912d35f8
JA
270 loff = *ppos & ~PAGE_CACHE_MASK;
271 nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
5274f052
JA
272
273 if (nr_pages > PIPE_BUFFERS)
274 nr_pages = PIPE_BUFFERS;
275
276 /*
73d62d83 277 * Initiate read-ahead on this page range. however, don't call into
0b749ce3
JA
278 * read-ahead if this is a non-zero offset (we are likely doing small
279 * chunk splice and the page is already there) for a single page.
5274f052 280 */
eb645a24
JA
281 if (!loff || nr_pages > 1)
282 page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);
5274f052 283
5274f052 284 /*
73d62d83 285 * Now fill in the holes:
5274f052 286 */
7480a904 287 error = 0;
912d35f8 288 total_len = 0;
82aa5d61 289
eb20796b
JA
290 /*
291 * Lookup the (hopefully) full range of pages we need.
292 */
293 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
82aa5d61 294
eb20796b
JA
295 /*
296 * If find_get_pages_contig() returned fewer pages than we needed,
297 * allocate the rest.
298 */
299 index += spd.nr_pages;
300 while (spd.nr_pages < nr_pages) {
82aa5d61 301 /*
eb20796b
JA
302 * Page could be there, find_get_pages_contig() breaks on
303 * the first hole.
5274f052 304 */
7480a904
JA
305 page = find_get_page(mapping, index);
306 if (!page) {
e27dedd8
JA
307 /*
308 * Make sure the read-ahead engine is notified
309 * about this failure.
310 */
311 handle_ra_miss(mapping, &in->f_ra, index);
312
7480a904 313 /*
eb20796b 314 * page didn't exist, allocate one.
7480a904
JA
315 */
316 page = page_cache_alloc_cold(mapping);
317 if (!page)
318 break;
319
320 error = add_to_page_cache_lru(page, mapping, index,
eb20796b 321 mapping_gfp_mask(mapping));
7480a904
JA
322 if (unlikely(error)) {
323 page_cache_release(page);
324 break;
325 }
eb20796b
JA
326 /*
327 * add_to_page_cache() locks the page, unlock it
328 * to avoid convoluting the logic below even more.
329 */
330 unlock_page(page);
7480a904
JA
331 }
332
eb20796b
JA
333 pages[spd.nr_pages++] = page;
334 index++;
335 }
336
337 /*
338 * Now loop over the map and see if we need to start IO on any
339 * pages, fill in the partial map, etc.
340 */
341 index = *ppos >> PAGE_CACHE_SHIFT;
342 nr_pages = spd.nr_pages;
343 spd.nr_pages = 0;
344 for (page_nr = 0; page_nr < nr_pages; page_nr++) {
345 unsigned int this_len;
346
347 if (!len)
348 break;
349
350 /*
351 * this_len is the max we'll use from this page
352 */
353 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
354 page = pages[page_nr];
355
7480a904
JA
356 /*
357 * If the page isn't uptodate, we may need to start io on it
358 */
359 if (!PageUptodate(page)) {
c4f895cb
JA
360 /*
361 * If in nonblock mode then dont block on waiting
362 * for an in-flight io page
363 */
364 if (flags & SPLICE_F_NONBLOCK)
365 break;
366
7480a904
JA
367 lock_page(page);
368
369 /*
370 * page was truncated, stop here. if this isn't the
371 * first page, we'll just complete what we already
372 * added
373 */
374 if (!page->mapping) {
375 unlock_page(page);
7480a904
JA
376 break;
377 }
378 /*
379 * page was already under io and is now done, great
380 */
381 if (PageUptodate(page)) {
382 unlock_page(page);
383 goto fill_it;
384 }
5274f052 385
7480a904
JA
386 /*
387 * need to read in the page
388 */
389 error = mapping->a_ops->readpage(in, page);
5274f052 390 if (unlikely(error)) {
eb20796b
JA
391 /*
392 * We really should re-lookup the page here,
393 * but it complicates things a lot. Instead
394 * lets just do what we already stored, and
395 * we'll get it the next time we are called.
396 */
7480a904 397 if (error == AOP_TRUNCATED_PAGE)
eb20796b
JA
398 error = 0;
399
5274f052
JA
400 break;
401 }
91ad66ef
JA
402
403 /*
404 * i_size must be checked after ->readpage().
405 */
406 isize = i_size_read(mapping->host);
407 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
eb20796b 408 if (unlikely(!isize || index > end_index))
91ad66ef 409 break;
91ad66ef
JA
410
411 /*
412 * if this is the last page, see if we need to shrink
413 * the length and stop
414 */
415 if (end_index == index) {
416 loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
eb20796b 417 if (total_len + loff > isize)
91ad66ef 418 break;
91ad66ef
JA
419 /*
420 * force quit after adding this page
421 */
eb20796b 422 len = this_len;
82aa5d61 423 this_len = min(this_len, loff);
912d35f8 424 loff = 0;
91ad66ef 425 }
5274f052 426 }
7480a904 427fill_it:
eb20796b
JA
428 partial[page_nr].offset = loff;
429 partial[page_nr].len = this_len;
82aa5d61 430 len -= this_len;
912d35f8 431 total_len += this_len;
91ad66ef 432 loff = 0;
eb20796b
JA
433 spd.nr_pages++;
434 index++;
5274f052
JA
435 }
436
eb20796b
JA
437 /*
438 * Release any pages at the end, if we quit early. 'i' is how far
439 * we got, 'nr_pages' is how many pages are in the map.
440 */
441 while (page_nr < nr_pages)
442 page_cache_release(pages[page_nr++]);
443
912d35f8 444 if (spd.nr_pages)
00522fb4 445 return splice_to_pipe(pipe, &spd);
5274f052 446
7480a904 447 return error;
5274f052
JA
448}
449
83f9135b
JA
450/**
451 * generic_file_splice_read - splice data from file to a pipe
452 * @in: file to splice from
453 * @pipe: pipe to splice to
454 * @len: number of bytes to splice
455 * @flags: splice modifier flags
456 *
457 * Will read pages from given file and fill them into a pipe.
83f9135b 458 */
cbb7e577
JA
459ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
460 struct pipe_inode_info *pipe, size_t len,
461 unsigned int flags)
5274f052
JA
462{
463 ssize_t spliced;
464 int ret;
465
466 ret = 0;
467 spliced = 0;
3a326a2c 468
5274f052 469 while (len) {
cbb7e577 470 ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
5274f052 471
c4f895cb 472 if (ret < 0)
5274f052 473 break;
c4f895cb
JA
474 else if (!ret) {
475 if (spliced)
476 break;
477 if (flags & SPLICE_F_NONBLOCK) {
478 ret = -EAGAIN;
479 break;
480 }
481 }
5274f052 482
cbb7e577 483 *ppos += ret;
5274f052
JA
484 len -= ret;
485 spliced += ret;
486 }
487
488 if (spliced)
489 return spliced;
490
491 return ret;
492}
493
059a8f37
JA
494EXPORT_SYMBOL(generic_file_splice_read);
495
5274f052 496/*
4f6f0bd2 497 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
016b661e 498 * using sendpage(). Return the number of bytes sent.
5274f052
JA
499 */
500static int pipe_to_sendpage(struct pipe_inode_info *info,
501 struct pipe_buffer *buf, struct splice_desc *sd)
502{
503 struct file *file = sd->file;
504 loff_t pos = sd->pos;
f84d7519 505 int ret, more;
5274f052 506
f84d7519
JA
507 ret = buf->ops->pin(info, buf);
508 if (!ret) {
509 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
5274f052 510
f84d7519
JA
511 ret = file->f_op->sendpage(file, buf->page, buf->offset,
512 sd->len, &pos, more);
513 }
5274f052 514
016b661e 515 return ret;
5274f052
JA
516}
517
518/*
519 * This is a little more tricky than the file -> pipe splicing. There are
520 * basically three cases:
521 *
522 * - Destination page already exists in the address space and there
523 * are users of it. For that case we have no other option that
524 * copying the data. Tough luck.
525 * - Destination page already exists in the address space, but there
526 * are no users of it. Make sure it's uptodate, then drop it. Fall
527 * through to last case.
528 * - Destination page does not exist, we can add the pipe page to
529 * the page cache and avoid the copy.
530 *
83f9135b
JA
531 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
532 * sd->flags), we attempt to migrate pages from the pipe to the output
533 * file address space page cache. This is possible if no one else has
534 * the pipe page referenced outside of the pipe and page cache. If
535 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
536 * a new page in the output file page cache and fill/dirty that.
5274f052
JA
537 */
538static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
539 struct splice_desc *sd)
540{
541 struct file *file = sd->file;
542 struct address_space *mapping = file->f_mapping;
3e7ee3e7 543 gfp_t gfp_mask = mapping_gfp_mask(mapping);
016b661e 544 unsigned int offset, this_len;
5274f052 545 struct page *page;
5274f052 546 pgoff_t index;
3e7ee3e7 547 int ret;
5274f052
JA
548
549 /*
49d0b21b 550 * make sure the data in this buffer is uptodate
5274f052 551 */
f84d7519
JA
552 ret = buf->ops->pin(info, buf);
553 if (unlikely(ret))
554 return ret;
5274f052
JA
555
556 index = sd->pos >> PAGE_CACHE_SHIFT;
557 offset = sd->pos & ~PAGE_CACHE_MASK;
558
016b661e
JA
559 this_len = sd->len;
560 if (this_len + offset > PAGE_CACHE_SIZE)
561 this_len = PAGE_CACHE_SIZE - offset;
562
5274f052 563 /*
0568b409
JA
564 * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full
565 * page.
5274f052 566 */
0568b409 567 if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) {
83f9135b
JA
568 /*
569 * If steal succeeds, buf->page is now pruned from the vm
a893b99b
JA
570 * side (page cache) and we can reuse it. The page will also
571 * be locked on successful return.
83f9135b 572 */
5abc97aa
JA
573 if (buf->ops->steal(info, buf))
574 goto find_page;
575
576 page = buf->page;
a893b99b
JA
577 page_cache_get(page);
578
579 /*
580 * page must be on the LRU for adding to the pagecache.
581 * Check this without grabbing the zone lock, if it isn't
582 * the do grab the zone lock, recheck, and add if necessary.
583 */
584 if (!PageLRU(page)) {
585 struct zone *zone = page_zone(page);
586
587 spin_lock_irq(&zone->lru_lock);
588 if (!PageLRU(page)) {
589 SetPageLRU(page);
590 add_page_to_inactive_list(zone, page);
591 }
592 spin_unlock_irq(&zone->lru_lock);
593 }
594
46e678c9 595 if (add_to_page_cache(page, mapping, index, gfp_mask)) {
a893b99b 596 page_cache_release(page);
46e678c9 597 unlock_page(page);
5abc97aa 598 goto find_page;
46e678c9 599 }
5abc97aa
JA
600 } else {
601find_page:
9e0267c2
JA
602 page = find_lock_page(mapping, index);
603 if (!page) {
604 ret = -ENOMEM;
605 page = page_cache_alloc_cold(mapping);
606 if (unlikely(!page))
607 goto out_nomem;
608
609 /*
610 * This will also lock the page
611 */
612 ret = add_to_page_cache_lru(page, mapping, index,
613 gfp_mask);
614 if (unlikely(ret))
615 goto out;
616 }
5abc97aa
JA
617
618 /*
9e0267c2
JA
619 * We get here with the page locked. If the page is also
620 * uptodate, we don't need to do more. If it isn't, we
621 * may need to bring it in if we are not going to overwrite
622 * the full page.
5abc97aa
JA
623 */
624 if (!PageUptodate(page)) {
016b661e 625 if (this_len < PAGE_CACHE_SIZE) {
5abc97aa
JA
626 ret = mapping->a_ops->readpage(file, page);
627 if (unlikely(ret))
628 goto out;
629
630 lock_page(page);
631
632 if (!PageUptodate(page)) {
633 /*
73d62d83 634 * Page got invalidated, repeat.
5abc97aa
JA
635 */
636 if (!page->mapping) {
637 unlock_page(page);
638 page_cache_release(page);
639 goto find_page;
640 }
641 ret = -EIO;
642 goto out;
5274f052 643 }
9e0267c2 644 } else
5abc97aa 645 SetPageUptodate(page);
5274f052
JA
646 }
647 }
648
016b661e 649 ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
bfc4ee39
JA
650 if (unlikely(ret)) {
651 loff_t isize = i_size_read(mapping->host);
652
653 if (ret != AOP_TRUNCATED_PAGE)
654 unlock_page(page);
4f6f0bd2 655 page_cache_release(page);
bfc4ee39
JA
656 if (ret == AOP_TRUNCATED_PAGE)
657 goto find_page;
658
659 /*
660 * prepare_write() may have instantiated a few blocks
661 * outside i_size. Trim these off again.
662 */
663 if (sd->pos + this_len > isize)
664 vmtruncate(mapping->host, isize);
665
5274f052 666 goto out;
bfc4ee39 667 }
5274f052 668
0568b409 669 if (buf->page != page) {
f84d7519
JA
670 /*
671 * Careful, ->map() uses KM_USER0!
672 */
f6762b7a 673 char *src = buf->ops->map(info, buf, 1);
f84d7519 674 char *dst = kmap_atomic(page, KM_USER1);
5abc97aa 675
016b661e 676 memcpy(dst + offset, src + buf->offset, this_len);
5abc97aa 677 flush_dcache_page(page);
f84d7519 678 kunmap_atomic(dst, KM_USER1);
f6762b7a 679 buf->ops->unmap(info, buf, src);
5abc97aa 680 }
5274f052 681
016b661e 682 ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
0568b409
JA
683 if (!ret) {
684 /*
685 * Return the number of bytes written and mark page as
686 * accessed, we are now done!
687 */
688 ret = this_len;
689 mark_page_accessed(page);
690 balance_dirty_pages_ratelimited(mapping);
691 } else if (ret == AOP_TRUNCATED_PAGE) {
4f6f0bd2
JA
692 page_cache_release(page);
693 goto find_page;
0568b409 694 }
5274f052 695out:
0568b409 696 page_cache_release(page);
9e0267c2 697 unlock_page(page);
9aefe431 698out_nomem:
5274f052
JA
699 return ret;
700}
701
83f9135b
JA
702/*
703 * Pipe input worker. Most of this logic works like a regular pipe, the
704 * key here is the 'actor' worker passed in that actually moves the data
705 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
706 */
00522fb4
JA
707ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
708 loff_t *ppos, size_t len, unsigned int flags,
709 splice_actor *actor)
5274f052 710{
5274f052
JA
711 int ret, do_wakeup, err;
712 struct splice_desc sd;
713
714 ret = 0;
715 do_wakeup = 0;
716
717 sd.total_len = len;
718 sd.flags = flags;
719 sd.file = out;
cbb7e577 720 sd.pos = *ppos;
5274f052 721
3a326a2c
IM
722 if (pipe->inode)
723 mutex_lock(&pipe->inode->i_mutex);
5274f052 724
5274f052 725 for (;;) {
6f767b04
JA
726 if (pipe->nrbufs) {
727 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
5274f052
JA
728 struct pipe_buf_operations *ops = buf->ops;
729
730 sd.len = buf->len;
731 if (sd.len > sd.total_len)
732 sd.len = sd.total_len;
733
3a326a2c 734 err = actor(pipe, buf, &sd);
016b661e 735 if (err <= 0) {
5274f052
JA
736 if (!ret && err != -ENODATA)
737 ret = err;
738
739 break;
740 }
741
016b661e
JA
742 ret += err;
743 buf->offset += err;
744 buf->len -= err;
745
746 sd.len -= err;
747 sd.pos += err;
748 sd.total_len -= err;
749 if (sd.len)
750 continue;
73d62d83 751
5274f052
JA
752 if (!buf->len) {
753 buf->ops = NULL;
3a326a2c 754 ops->release(pipe, buf);
6f767b04
JA
755 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
756 pipe->nrbufs--;
757 if (pipe->inode)
758 do_wakeup = 1;
5274f052
JA
759 }
760
5274f052
JA
761 if (!sd.total_len)
762 break;
763 }
764
6f767b04 765 if (pipe->nrbufs)
5274f052 766 continue;
3a326a2c 767 if (!pipe->writers)
5274f052 768 break;
3a326a2c 769 if (!pipe->waiting_writers) {
5274f052
JA
770 if (ret)
771 break;
772 }
773
29e35094
LT
774 if (flags & SPLICE_F_NONBLOCK) {
775 if (!ret)
776 ret = -EAGAIN;
777 break;
778 }
779
5274f052
JA
780 if (signal_pending(current)) {
781 if (!ret)
782 ret = -ERESTARTSYS;
783 break;
784 }
785
786 if (do_wakeup) {
c0bd1f65 787 smp_mb();
3a326a2c
IM
788 if (waitqueue_active(&pipe->wait))
789 wake_up_interruptible_sync(&pipe->wait);
790 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
5274f052
JA
791 do_wakeup = 0;
792 }
793
3a326a2c 794 pipe_wait(pipe);
5274f052
JA
795 }
796
3a326a2c
IM
797 if (pipe->inode)
798 mutex_unlock(&pipe->inode->i_mutex);
5274f052
JA
799
800 if (do_wakeup) {
c0bd1f65 801 smp_mb();
3a326a2c
IM
802 if (waitqueue_active(&pipe->wait))
803 wake_up_interruptible(&pipe->wait);
804 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
5274f052
JA
805 }
806
5274f052 807 return ret;
5274f052
JA
808}
809
83f9135b
JA
810/**
811 * generic_file_splice_write - splice data from a pipe to a file
3a326a2c 812 * @pipe: pipe info
83f9135b
JA
813 * @out: file to write to
814 * @len: number of bytes to splice
815 * @flags: splice modifier flags
816 *
817 * Will either move or copy pages (determined by @flags options) from
818 * the given pipe inode to the given file.
819 *
820 */
3a326a2c
IM
821ssize_t
822generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
cbb7e577 823 loff_t *ppos, size_t len, unsigned int flags)
5274f052 824{
4f6f0bd2 825 struct address_space *mapping = out->f_mapping;
3a326a2c
IM
826 ssize_t ret;
827
00522fb4 828 ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
a4514ebd 829 if (ret > 0) {
4f6f0bd2 830 struct inode *inode = mapping->host;
4f6f0bd2 831
a4514ebd
JA
832 *ppos += ret;
833
834 /*
835 * If file or inode is SYNC and we actually wrote some data,
836 * sync it.
837 */
838 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
839 int err;
840
841 mutex_lock(&inode->i_mutex);
842 err = generic_osync_inode(inode, mapping,
843 OSYNC_METADATA|OSYNC_DATA);
844 mutex_unlock(&inode->i_mutex);
4f6f0bd2 845
a4514ebd
JA
846 if (err)
847 ret = err;
848 }
4f6f0bd2
JA
849 }
850
851 return ret;
5274f052
JA
852}
853
059a8f37
JA
854EXPORT_SYMBOL(generic_file_splice_write);
855
83f9135b
JA
856/**
857 * generic_splice_sendpage - splice data from a pipe to a socket
858 * @inode: pipe inode
859 * @out: socket to write to
860 * @len: number of bytes to splice
861 * @flags: splice modifier flags
862 *
863 * Will send @len bytes from the pipe to a network socket. No data copying
864 * is involved.
865 *
866 */
3a326a2c 867ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
cbb7e577 868 loff_t *ppos, size_t len, unsigned int flags)
5274f052 869{
00522fb4 870 return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
5274f052
JA
871}
872
059a8f37 873EXPORT_SYMBOL(generic_splice_sendpage);
a0f06780 874
83f9135b
JA
875/*
876 * Attempt to initiate a splice from pipe to file.
877 */
3a326a2c 878static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
cbb7e577 879 loff_t *ppos, size_t len, unsigned int flags)
5274f052 880{
5274f052
JA
881 int ret;
882
49570e9b 883 if (unlikely(!out->f_op || !out->f_op->splice_write))
5274f052
JA
884 return -EINVAL;
885
49570e9b 886 if (unlikely(!(out->f_mode & FMODE_WRITE)))
5274f052
JA
887 return -EBADF;
888
cbb7e577 889 ret = rw_verify_area(WRITE, out, ppos, len);
5274f052
JA
890 if (unlikely(ret < 0))
891 return ret;
892
cbb7e577 893 return out->f_op->splice_write(pipe, out, ppos, len, flags);
5274f052
JA
894}
895
83f9135b
JA
896/*
897 * Attempt to initiate a splice from a file to a pipe.
898 */
cbb7e577
JA
899static long do_splice_to(struct file *in, loff_t *ppos,
900 struct pipe_inode_info *pipe, size_t len,
901 unsigned int flags)
5274f052 902{
cbb7e577 903 loff_t isize, left;
5274f052
JA
904 int ret;
905
49570e9b 906 if (unlikely(!in->f_op || !in->f_op->splice_read))
5274f052
JA
907 return -EINVAL;
908
49570e9b 909 if (unlikely(!(in->f_mode & FMODE_READ)))
5274f052
JA
910 return -EBADF;
911
cbb7e577 912 ret = rw_verify_area(READ, in, ppos, len);
5274f052
JA
913 if (unlikely(ret < 0))
914 return ret;
915
916 isize = i_size_read(in->f_mapping->host);
cbb7e577 917 if (unlikely(*ppos >= isize))
5274f052
JA
918 return 0;
919
cbb7e577 920 left = isize - *ppos;
49570e9b 921 if (unlikely(left < len))
5274f052
JA
922 len = left;
923
cbb7e577 924 return in->f_op->splice_read(in, ppos, pipe, len, flags);
5274f052
JA
925}
926
cbb7e577
JA
927long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
928 size_t len, unsigned int flags)
b92ce558
JA
929{
930 struct pipe_inode_info *pipe;
931 long ret, bytes;
cbb7e577 932 loff_t out_off;
b92ce558
JA
933 umode_t i_mode;
934 int i;
935
936 /*
937 * We require the input being a regular file, as we don't want to
938 * randomly drop data for eg socket -> socket splicing. Use the
939 * piped splicing for that!
940 */
941 i_mode = in->f_dentry->d_inode->i_mode;
942 if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
943 return -EINVAL;
944
945 /*
946 * neither in nor out is a pipe, setup an internal pipe attached to
947 * 'out' and transfer the wanted data from 'in' to 'out' through that
948 */
949 pipe = current->splice_pipe;
49570e9b 950 if (unlikely(!pipe)) {
b92ce558
JA
951 pipe = alloc_pipe_info(NULL);
952 if (!pipe)
953 return -ENOMEM;
954
955 /*
956 * We don't have an immediate reader, but we'll read the stuff
00522fb4 957 * out of the pipe right after the splice_to_pipe(). So set
b92ce558
JA
958 * PIPE_READERS appropriately.
959 */
960 pipe->readers = 1;
961
962 current->splice_pipe = pipe;
963 }
964
965 /*
73d62d83 966 * Do the splice.
b92ce558
JA
967 */
968 ret = 0;
969 bytes = 0;
cbb7e577 970 out_off = 0;
b92ce558
JA
971
972 while (len) {
973 size_t read_len, max_read_len;
974
975 /*
976 * Do at most PIPE_BUFFERS pages worth of transfer:
977 */
978 max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));
979
cbb7e577 980 ret = do_splice_to(in, ppos, pipe, max_read_len, flags);
b92ce558
JA
981 if (unlikely(ret < 0))
982 goto out_release;
983
984 read_len = ret;
985
986 /*
987 * NOTE: nonblocking mode only applies to the input. We
988 * must not do the output in nonblocking mode as then we
989 * could get stuck data in the internal pipe:
990 */
cbb7e577 991 ret = do_splice_from(pipe, out, &out_off, read_len,
b92ce558
JA
992 flags & ~SPLICE_F_NONBLOCK);
993 if (unlikely(ret < 0))
994 goto out_release;
995
996 bytes += ret;
997 len -= ret;
998
999 /*
1000 * In nonblocking mode, if we got back a short read then
1001 * that was due to either an IO error or due to the
1002 * pagecache entry not being there. In the IO error case
1003 * the _next_ splice attempt will produce a clean IO error
1004 * return value (not a short read), so in both cases it's
1005 * correct to break out of the loop here:
1006 */
1007 if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
1008 break;
1009 }
1010
1011 pipe->nrbufs = pipe->curbuf = 0;
1012
1013 return bytes;
1014
1015out_release:
1016 /*
1017 * If we did an incomplete transfer we must release
1018 * the pipe buffers in question:
1019 */
1020 for (i = 0; i < PIPE_BUFFERS; i++) {
1021 struct pipe_buffer *buf = pipe->bufs + i;
1022
1023 if (buf->ops) {
1024 buf->ops->release(pipe, buf);
1025 buf->ops = NULL;
1026 }
1027 }
1028 pipe->nrbufs = pipe->curbuf = 0;
1029
1030 /*
1031 * If we transferred some data, return the number of bytes:
1032 */
1033 if (bytes > 0)
1034 return bytes;
1035
1036 return ret;
1037}
1038
1039EXPORT_SYMBOL(do_splice_direct);
1040
83f9135b
JA
1041/*
1042 * Determine where to splice to/from.
1043 */
529565dc
IM
1044static long do_splice(struct file *in, loff_t __user *off_in,
1045 struct file *out, loff_t __user *off_out,
1046 size_t len, unsigned int flags)
5274f052 1047{
3a326a2c 1048 struct pipe_inode_info *pipe;
cbb7e577 1049 loff_t offset, *off;
a4514ebd 1050 long ret;
5274f052 1051
3a326a2c 1052 pipe = in->f_dentry->d_inode->i_pipe;
529565dc
IM
1053 if (pipe) {
1054 if (off_in)
1055 return -ESPIPE;
b92ce558
JA
1056 if (off_out) {
1057 if (out->f_op->llseek == no_llseek)
1058 return -EINVAL;
cbb7e577 1059 if (copy_from_user(&offset, off_out, sizeof(loff_t)))
b92ce558 1060 return -EFAULT;
cbb7e577
JA
1061 off = &offset;
1062 } else
1063 off = &out->f_pos;
529565dc 1064
a4514ebd
JA
1065 ret = do_splice_from(pipe, out, off, len, flags);
1066
1067 if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
1068 ret = -EFAULT;
1069
1070 return ret;
529565dc 1071 }
5274f052 1072
3a326a2c 1073 pipe = out->f_dentry->d_inode->i_pipe;
529565dc
IM
1074 if (pipe) {
1075 if (off_out)
1076 return -ESPIPE;
b92ce558
JA
1077 if (off_in) {
1078 if (in->f_op->llseek == no_llseek)
1079 return -EINVAL;
cbb7e577 1080 if (copy_from_user(&offset, off_in, sizeof(loff_t)))
b92ce558 1081 return -EFAULT;
cbb7e577
JA
1082 off = &offset;
1083 } else
1084 off = &in->f_pos;
529565dc 1085
a4514ebd
JA
1086 ret = do_splice_to(in, off, pipe, len, flags);
1087
1088 if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
1089 ret = -EFAULT;
1090
1091 return ret;
529565dc 1092 }
5274f052
JA
1093
1094 return -EINVAL;
1095}
1096
912d35f8
JA
1097/*
1098 * Map an iov into an array of pages and offset/length tuples. With the
1099 * partial_page structure, we can map several non-contiguous ranges into
1100 * our ones pages[] map instead of splitting that operation into pieces.
1101 * Could easily be exported as a generic helper for other users, in which
1102 * case one would probably want to add a 'max_nr_pages' parameter as well.
1103 */
1104static int get_iovec_page_array(const struct iovec __user *iov,
1105 unsigned int nr_vecs, struct page **pages,
7afa6fd0 1106 struct partial_page *partial, int aligned)
912d35f8
JA
1107{
1108 int buffers = 0, error = 0;
1109
1110 /*
1111 * It's ok to take the mmap_sem for reading, even
1112 * across a "get_user()".
1113 */
1114 down_read(&current->mm->mmap_sem);
1115
1116 while (nr_vecs) {
1117 unsigned long off, npages;
1118 void __user *base;
1119 size_t len;
1120 int i;
1121
1122 /*
1123 * Get user address base and length for this iovec.
1124 */
1125 error = get_user(base, &iov->iov_base);
1126 if (unlikely(error))
1127 break;
1128 error = get_user(len, &iov->iov_len);
1129 if (unlikely(error))
1130 break;
1131
1132 /*
1133 * Sanity check this iovec. 0 read succeeds.
1134 */
1135 if (unlikely(!len))
1136 break;
1137 error = -EFAULT;
1138 if (unlikely(!base))
1139 break;
1140
1141 /*
1142 * Get this base offset and number of pages, then map
1143 * in the user pages.
1144 */
1145 off = (unsigned long) base & ~PAGE_MASK;
7afa6fd0
JA
1146
1147 /*
1148 * If asked for alignment, the offset must be zero and the
1149 * length a multiple of the PAGE_SIZE.
1150 */
1151 error = -EINVAL;
1152 if (aligned && (off || len & ~PAGE_MASK))
1153 break;
1154
912d35f8
JA
1155 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1156 if (npages > PIPE_BUFFERS - buffers)
1157 npages = PIPE_BUFFERS - buffers;
1158
1159 error = get_user_pages(current, current->mm,
1160 (unsigned long) base, npages, 0, 0,
1161 &pages[buffers], NULL);
1162
1163 if (unlikely(error <= 0))
1164 break;
1165
1166 /*
1167 * Fill this contiguous range into the partial page map.
1168 */
1169 for (i = 0; i < error; i++) {
7591489a 1170 const int plen = min_t(size_t, len, PAGE_SIZE - off);
912d35f8
JA
1171
1172 partial[buffers].offset = off;
1173 partial[buffers].len = plen;
1174
1175 off = 0;
1176 len -= plen;
1177 buffers++;
1178 }
1179
1180 /*
1181 * We didn't complete this iov, stop here since it probably
1182 * means we have to move some of this into a pipe to
1183 * be able to continue.
1184 */
1185 if (len)
1186 break;
1187
1188 /*
1189 * Don't continue if we mapped fewer pages than we asked for,
1190 * or if we mapped the max number of pages that we have
1191 * room for.
1192 */
1193 if (error < npages || buffers == PIPE_BUFFERS)
1194 break;
1195
1196 nr_vecs--;
1197 iov++;
1198 }
1199
1200 up_read(&current->mm->mmap_sem);
1201
1202 if (buffers)
1203 return buffers;
1204
1205 return error;
1206}
1207
1208/*
1209 * vmsplice splices a user address range into a pipe. It can be thought of
1210 * as splice-from-memory, where the regular splice is splice-from-file (or
1211 * to file). In both cases the output is a pipe, naturally.
1212 *
1213 * Note that vmsplice only supports splicing _from_ user memory to a pipe,
1214 * not the other way around. Splicing from user memory is a simple operation
1215 * that can be supported without any funky alignment restrictions or nasty
1216 * vm tricks. We simply map in the user memory and fill them into a pipe.
1217 * The reverse isn't quite as easy, though. There are two possible solutions
1218 * for that:
1219 *
1220 * - memcpy() the data internally, at which point we might as well just
1221 * do a regular read() on the buffer anyway.
1222 * - Lots of nasty vm tricks, that are neither fast nor flexible (it
1223 * has restriction limitations on both ends of the pipe).
1224 *
1225 * Alas, it isn't here.
1226 *
1227 */
1228static long do_vmsplice(struct file *file, const struct iovec __user *iov,
1229 unsigned long nr_segs, unsigned int flags)
1230{
1231 struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe;
1232 struct page *pages[PIPE_BUFFERS];
1233 struct partial_page partial[PIPE_BUFFERS];
1234 struct splice_pipe_desc spd = {
1235 .pages = pages,
1236 .partial = partial,
1237 .flags = flags,
1238 .ops = &user_page_pipe_buf_ops,
1239 };
1240
1241 if (unlikely(!pipe))
1242 return -EBADF;
1243 if (unlikely(nr_segs > UIO_MAXIOV))
1244 return -EINVAL;
1245 else if (unlikely(!nr_segs))
1246 return 0;
1247
7afa6fd0
JA
1248 spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
1249 flags & SPLICE_F_GIFT);
912d35f8
JA
1250 if (spd.nr_pages <= 0)
1251 return spd.nr_pages;
1252
00522fb4 1253 return splice_to_pipe(pipe, &spd);
912d35f8
JA
1254}
1255
1256asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
1257 unsigned long nr_segs, unsigned int flags)
1258{
1259 struct file *file;
1260 long error;
1261 int fput;
1262
1263 error = -EBADF;
1264 file = fget_light(fd, &fput);
1265 if (file) {
1266 if (file->f_mode & FMODE_WRITE)
1267 error = do_vmsplice(file, iov, nr_segs, flags);
1268
1269 fput_light(file, fput);
1270 }
1271
1272 return error;
1273}
1274
529565dc
IM
1275asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
1276 int fd_out, loff_t __user *off_out,
1277 size_t len, unsigned int flags)
5274f052
JA
1278{
1279 long error;
1280 struct file *in, *out;
1281 int fput_in, fput_out;
1282
1283 if (unlikely(!len))
1284 return 0;
1285
1286 error = -EBADF;
529565dc 1287 in = fget_light(fd_in, &fput_in);
5274f052
JA
1288 if (in) {
1289 if (in->f_mode & FMODE_READ) {
529565dc 1290 out = fget_light(fd_out, &fput_out);
5274f052
JA
1291 if (out) {
1292 if (out->f_mode & FMODE_WRITE)
529565dc
IM
1293 error = do_splice(in, off_in,
1294 out, off_out,
1295 len, flags);
5274f052
JA
1296 fput_light(out, fput_out);
1297 }
1298 }
1299
1300 fput_light(in, fput_in);
1301 }
1302
1303 return error;
1304}
70524490
JA
1305
1306/*
1307 * Link contents of ipipe to opipe.
1308 */
1309static int link_pipe(struct pipe_inode_info *ipipe,
1310 struct pipe_inode_info *opipe,
1311 size_t len, unsigned int flags)
1312{
1313 struct pipe_buffer *ibuf, *obuf;
2a27250e
JA
1314 int ret, do_wakeup, i, ipipe_first;
1315
1316 ret = do_wakeup = ipipe_first = 0;
70524490
JA
1317
1318 /*
1319 * Potential ABBA deadlock, work around it by ordering lock
1320 * grabbing by inode address. Otherwise two different processes
1321 * could deadlock (one doing tee from A -> B, the other from B -> A).
1322 */
1323 if (ipipe->inode < opipe->inode) {
2a27250e 1324 ipipe_first = 1;
70524490
JA
1325 mutex_lock(&ipipe->inode->i_mutex);
1326 mutex_lock(&opipe->inode->i_mutex);
1327 } else {
1328 mutex_lock(&opipe->inode->i_mutex);
1329 mutex_lock(&ipipe->inode->i_mutex);
1330 }
1331
1332 for (i = 0;; i++) {
1333 if (!opipe->readers) {
1334 send_sig(SIGPIPE, current, 0);
1335 if (!ret)
1336 ret = -EPIPE;
1337 break;
1338 }
1339 if (ipipe->nrbufs - i) {
1340 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
1341
1342 /*
1343 * If we have room, fill this buffer
1344 */
1345 if (opipe->nrbufs < PIPE_BUFFERS) {
1346 int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
1347
1348 /*
1349 * Get a reference to this pipe buffer,
1350 * so we can copy the contents over.
1351 */
1352 ibuf->ops->get(ipipe, ibuf);
1353
1354 obuf = opipe->bufs + nbuf;
1355 *obuf = *ibuf;
1356
7afa6fd0
JA
1357 /*
1358 * Don't inherit the gift flag, we need to
1359 * prevent multiple steals of this page.
1360 */
1361 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1362
70524490
JA
1363 if (obuf->len > len)
1364 obuf->len = len;
1365
1366 opipe->nrbufs++;
1367 do_wakeup = 1;
1368 ret += obuf->len;
1369 len -= obuf->len;
1370
1371 if (!len)
1372 break;
1373 if (opipe->nrbufs < PIPE_BUFFERS)
1374 continue;
1375 }
1376
1377 /*
1378 * We have input available, but no output room.
2a27250e
JA
1379 * If we already copied data, return that. If we
1380 * need to drop the opipe lock, it must be ordered
1381 * last to avoid deadlocks.
70524490 1382 */
2a27250e 1383 if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) {
70524490
JA
1384 if (!ret)
1385 ret = -EAGAIN;
1386 break;
1387 }
1388 if (signal_pending(current)) {
1389 if (!ret)
1390 ret = -ERESTARTSYS;
1391 break;
1392 }
1393 if (do_wakeup) {
1394 smp_mb();
1395 if (waitqueue_active(&opipe->wait))
1396 wake_up_interruptible(&opipe->wait);
1397 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1398 do_wakeup = 0;
1399 }
1400
1401 opipe->waiting_writers++;
1402 pipe_wait(opipe);
1403 opipe->waiting_writers--;
1404 continue;
1405 }
1406
1407 /*
1408 * No input buffers, do the usual checks for available
1409 * writers and blocking and wait if necessary
1410 */
1411 if (!ipipe->writers)
1412 break;
1413 if (!ipipe->waiting_writers) {
1414 if (ret)
1415 break;
1416 }
2a27250e
JA
1417 /*
1418 * pipe_wait() drops the ipipe mutex. To avoid deadlocks
1419 * with another process, we can only safely do that if
1420 * the ipipe lock is ordered last.
1421 */
1422 if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) {
70524490
JA
1423 if (!ret)
1424 ret = -EAGAIN;
1425 break;
1426 }
1427 if (signal_pending(current)) {
1428 if (!ret)
1429 ret = -ERESTARTSYS;
1430 break;
1431 }
1432
1433 if (waitqueue_active(&ipipe->wait))
1434 wake_up_interruptible_sync(&ipipe->wait);
1435 kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT);
1436
1437 pipe_wait(ipipe);
1438 }
1439
1440 mutex_unlock(&ipipe->inode->i_mutex);
1441 mutex_unlock(&opipe->inode->i_mutex);
1442
1443 if (do_wakeup) {
1444 smp_mb();
1445 if (waitqueue_active(&opipe->wait))
1446 wake_up_interruptible(&opipe->wait);
1447 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1448 }
1449
1450 return ret;
1451}
1452
1453/*
1454 * This is a tee(1) implementation that works on pipes. It doesn't copy
1455 * any data, it simply references the 'in' pages on the 'out' pipe.
1456 * The 'flags' used are the SPLICE_F_* variants, currently the only
1457 * applicable one is SPLICE_F_NONBLOCK.
1458 */
1459static long do_tee(struct file *in, struct file *out, size_t len,
1460 unsigned int flags)
1461{
1462 struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe;
1463 struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe;
1464
1465 /*
1466 * Link ipipe to the two output pipes, consuming as we go along.
1467 */
1468 if (ipipe && opipe)
1469 return link_pipe(ipipe, opipe, len, flags);
1470
1471 return -EINVAL;
1472}
1473
1474asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
1475{
1476 struct file *in;
1477 int error, fput_in;
1478
1479 if (unlikely(!len))
1480 return 0;
1481
1482 error = -EBADF;
1483 in = fget_light(fdin, &fput_in);
1484 if (in) {
1485 if (in->f_mode & FMODE_READ) {
1486 int fput_out;
1487 struct file *out = fget_light(fdout, &fput_out);
1488
1489 if (out) {
1490 if (out->f_mode & FMODE_WRITE)
1491 error = do_tee(in, out, len, flags);
1492 fput_light(out, fput_out);
1493 }
1494 }
1495 fput_light(in, fput_in);
1496 }
1497
1498 return error;
1499}
This page took 0.141606 seconds and 5 git commands to generate.