Merge branches 'pm-core', 'pm-qos', 'pm-domains' and 'pm-opp'
[sfrench/cifs-2.6.git] / fs / splice.c
1 /*
2  * "splice": joining two ropes together by interweaving their strands.
3  *
4  * This is the "extended pipe" functionality, where a pipe is used as
5  * an arbitrary in-memory buffer. Think of a pipe as a small kernel
6  * buffer that you can use to transfer data from one end to the other.
7  *
8  * The traditional unix read/write is extended with a "splice()" operation
9  * that transfers data buffers to or from a pipe buffer.
10  *
11  * Named by Larry McVoy, original implementation from Linus, extended by
12  * Jens to support splicing to files, network, direct splicing, etc and
13  * fixing lots of bugs.
14  *
15  * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
16  * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
17  * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
18  *
19  */
20 #include <linux/bvec.h>
21 #include <linux/fs.h>
22 #include <linux/file.h>
23 #include <linux/pagemap.h>
24 #include <linux/splice.h>
25 #include <linux/memcontrol.h>
26 #include <linux/mm_inline.h>
27 #include <linux/swap.h>
28 #include <linux/writeback.h>
29 #include <linux/export.h>
30 #include <linux/syscalls.h>
31 #include <linux/uio.h>
32 #include <linux/security.h>
33 #include <linux/gfp.h>
34 #include <linux/socket.h>
35 #include <linux/compat.h>
36 #include "internal.h"
37
38 /*
39  * Attempt to steal a page from a pipe buffer. This should perhaps go into
40  * a vm helper function, it's already simplified quite a bit by the
41  * addition of remove_mapping(). If success is returned, the caller may
42  * attempt to reuse this page for another destination.
43  */
44 static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
45                                      struct pipe_buffer *buf)
46 {
47         struct page *page = buf->page;
48         struct address_space *mapping;
49
50         lock_page(page);
51
52         mapping = page_mapping(page);
53         if (mapping) {
54                 WARN_ON(!PageUptodate(page));
55
56                 /*
57                  * At least for ext2 with nobh option, we need to wait on
58                  * writeback completing on this page, since we'll remove it
59                  * from the pagecache.  Otherwise truncate wont wait on the
60                  * page, allowing the disk blocks to be reused by someone else
61                  * before we actually wrote our data to them. fs corruption
62                  * ensues.
63                  */
64                 wait_on_page_writeback(page);
65
66                 if (page_has_private(page) &&
67                     !try_to_release_page(page, GFP_KERNEL))
68                         goto out_unlock;
69
70                 /*
71                  * If we succeeded in removing the mapping, set LRU flag
72                  * and return good.
73                  */
74                 if (remove_mapping(mapping, page)) {
75                         buf->flags |= PIPE_BUF_FLAG_LRU;
76                         return 0;
77                 }
78         }
79
80         /*
81          * Raced with truncate or failed to remove page from current
82          * address space, unlock and return failure.
83          */
84 out_unlock:
85         unlock_page(page);
86         return 1;
87 }
88
89 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
90                                         struct pipe_buffer *buf)
91 {
92         put_page(buf->page);
93         buf->flags &= ~PIPE_BUF_FLAG_LRU;
94 }
95
96 /*
97  * Check whether the contents of buf is OK to access. Since the content
98  * is a page cache page, IO may be in flight.
99  */
100 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
101                                        struct pipe_buffer *buf)
102 {
103         struct page *page = buf->page;
104         int err;
105
106         if (!PageUptodate(page)) {
107                 lock_page(page);
108
109                 /*
110                  * Page got truncated/unhashed. This will cause a 0-byte
111                  * splice, if this is the first page.
112                  */
113                 if (!page->mapping) {
114                         err = -ENODATA;
115                         goto error;
116                 }
117
118                 /*
119                  * Uh oh, read-error from disk.
120                  */
121                 if (!PageUptodate(page)) {
122                         err = -EIO;
123                         goto error;
124                 }
125
126                 /*
127                  * Page is ok afterall, we are done.
128                  */
129                 unlock_page(page);
130         }
131
132         return 0;
133 error:
134         unlock_page(page);
135         return err;
136 }
137
138 const struct pipe_buf_operations page_cache_pipe_buf_ops = {
139         .can_merge = 0,
140         .confirm = page_cache_pipe_buf_confirm,
141         .release = page_cache_pipe_buf_release,
142         .steal = page_cache_pipe_buf_steal,
143         .get = generic_pipe_buf_get,
144 };
145
146 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
147                                     struct pipe_buffer *buf)
148 {
149         if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
150                 return 1;
151
152         buf->flags |= PIPE_BUF_FLAG_LRU;
153         return generic_pipe_buf_steal(pipe, buf);
154 }
155
156 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
157         .can_merge = 0,
158         .confirm = generic_pipe_buf_confirm,
159         .release = page_cache_pipe_buf_release,
160         .steal = user_page_pipe_buf_steal,
161         .get = generic_pipe_buf_get,
162 };
163
164 static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
165 {
166         smp_mb();
167         if (waitqueue_active(&pipe->wait))
168                 wake_up_interruptible(&pipe->wait);
169         kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
170 }
171
172 /**
173  * splice_to_pipe - fill passed data into a pipe
174  * @pipe:       pipe to fill
175  * @spd:        data to fill
176  *
177  * Description:
178  *    @spd contains a map of pages and len/offset tuples, along with
179  *    the struct pipe_buf_operations associated with these pages. This
180  *    function will link that data to the pipe.
181  *
182  */
183 ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
184                        struct splice_pipe_desc *spd)
185 {
186         unsigned int spd_pages = spd->nr_pages;
187         int ret = 0, page_nr = 0;
188
189         if (!spd_pages)
190                 return 0;
191
192         if (unlikely(!pipe->readers)) {
193                 send_sig(SIGPIPE, current, 0);
194                 ret = -EPIPE;
195                 goto out;
196         }
197
198         while (pipe->nrbufs < pipe->buffers) {
199                 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
200                 struct pipe_buffer *buf = pipe->bufs + newbuf;
201
202                 buf->page = spd->pages[page_nr];
203                 buf->offset = spd->partial[page_nr].offset;
204                 buf->len = spd->partial[page_nr].len;
205                 buf->private = spd->partial[page_nr].private;
206                 buf->ops = spd->ops;
207                 buf->flags = 0;
208
209                 pipe->nrbufs++;
210                 page_nr++;
211                 ret += buf->len;
212
213                 if (!--spd->nr_pages)
214                         break;
215         }
216
217         if (!ret)
218                 ret = -EAGAIN;
219
220 out:
221         while (page_nr < spd_pages)
222                 spd->spd_release(spd, page_nr++);
223
224         return ret;
225 }
226 EXPORT_SYMBOL_GPL(splice_to_pipe);
227
228 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
229 {
230         int ret;
231
232         if (unlikely(!pipe->readers)) {
233                 send_sig(SIGPIPE, current, 0);
234                 ret = -EPIPE;
235         } else if (pipe->nrbufs == pipe->buffers) {
236                 ret = -EAGAIN;
237         } else {
238                 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
239                 pipe->bufs[newbuf] = *buf;
240                 pipe->nrbufs++;
241                 return buf->len;
242         }
243         pipe_buf_release(pipe, buf);
244         return ret;
245 }
246 EXPORT_SYMBOL(add_to_pipe);
247
248 void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
249 {
250         put_page(spd->pages[i]);
251 }
252
253 /*
254  * Check if we need to grow the arrays holding pages and partial page
255  * descriptions.
256  */
257 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
258 {
259         unsigned int buffers = ACCESS_ONCE(pipe->buffers);
260
261         spd->nr_pages_max = buffers;
262         if (buffers <= PIPE_DEF_BUFFERS)
263                 return 0;
264
265         spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL);
266         spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL);
267
268         if (spd->pages && spd->partial)
269                 return 0;
270
271         kfree(spd->pages);
272         kfree(spd->partial);
273         return -ENOMEM;
274 }
275
276 void splice_shrink_spd(struct splice_pipe_desc *spd)
277 {
278         if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
279                 return;
280
281         kfree(spd->pages);
282         kfree(spd->partial);
283 }
284
285 /**
286  * generic_file_splice_read - splice data from file to a pipe
287  * @in:         file to splice from
288  * @ppos:       position in @in
289  * @pipe:       pipe to splice to
290  * @len:        number of bytes to splice
291  * @flags:      splice modifier flags
292  *
293  * Description:
294  *    Will read pages from given file and fill them into a pipe. Can be
295  *    used as long as it has more or less sane ->read_iter().
296  *
297  */
298 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
299                                  struct pipe_inode_info *pipe, size_t len,
300                                  unsigned int flags)
301 {
302         struct iov_iter to;
303         struct kiocb kiocb;
304         int idx, ret;
305
306         iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len);
307         idx = to.idx;
308         init_sync_kiocb(&kiocb, in);
309         kiocb.ki_pos = *ppos;
310         ret = in->f_op->read_iter(&kiocb, &to);
311         if (ret > 0) {
312                 *ppos = kiocb.ki_pos;
313                 file_accessed(in);
314         } else if (ret < 0) {
315                 to.idx = idx;
316                 to.iov_offset = 0;
317                 iov_iter_advance(&to, 0); /* to free what was emitted */
318                 /*
319                  * callers of ->splice_read() expect -EAGAIN on
320                  * "can't put anything in there", rather than -EFAULT.
321                  */
322                 if (ret == -EFAULT)
323                         ret = -EAGAIN;
324         }
325
326         return ret;
327 }
328 EXPORT_SYMBOL(generic_file_splice_read);
329
330 const struct pipe_buf_operations default_pipe_buf_ops = {
331         .can_merge = 0,
332         .confirm = generic_pipe_buf_confirm,
333         .release = generic_pipe_buf_release,
334         .steal = generic_pipe_buf_steal,
335         .get = generic_pipe_buf_get,
336 };
337
/*
 * Steal callback that always refuses: unconditionally returns 1, the
 * value the other steal callbacks in this file use for "not stolen".
 */
static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	return 1;
}
343
344 /* Pipe buffer operations for a socket and similar. */
345 const struct pipe_buf_operations nosteal_pipe_buf_ops = {
346         .can_merge = 0,
347         .confirm = generic_pipe_buf_confirm,
348         .release = generic_pipe_buf_release,
349         .steal = generic_pipe_buf_nosteal,
350         .get = generic_pipe_buf_get,
351 };
352 EXPORT_SYMBOL(nosteal_pipe_buf_ops);
353
354 static ssize_t kernel_readv(struct file *file, const struct kvec *vec,
355                             unsigned long vlen, loff_t offset)
356 {
357         mm_segment_t old_fs;
358         loff_t pos = offset;
359         ssize_t res;
360
361         old_fs = get_fs();
362         set_fs(get_ds());
363         /* The cast to a user pointer is valid due to the set_fs() */
364         res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
365         set_fs(old_fs);
366
367         return res;
368 }
369
370 ssize_t kernel_write(struct file *file, const char *buf, size_t count,
371                             loff_t pos)
372 {
373         mm_segment_t old_fs;
374         ssize_t res;
375
376         old_fs = get_fs();
377         set_fs(get_ds());
378         /* The cast to a user pointer is valid due to the set_fs() */
379         res = vfs_write(file, (__force const char __user *)buf, count, &pos);
380         set_fs(old_fs);
381
382         return res;
383 }
384 EXPORT_SYMBOL(kernel_write);
385
386 static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
387                                  struct pipe_inode_info *pipe, size_t len,
388                                  unsigned int flags)
389 {
390         struct kvec *vec, __vec[PIPE_DEF_BUFFERS];
391         struct iov_iter to;
392         struct page **pages;
393         unsigned int nr_pages;
394         size_t offset, dummy, copied = 0;
395         ssize_t res;
396         int i;
397
398         if (pipe->nrbufs == pipe->buffers)
399                 return -EAGAIN;
400
401         /*
402          * Try to keep page boundaries matching to source pagecache ones -
403          * it probably won't be much help, but...
404          */
405         offset = *ppos & ~PAGE_MASK;
406
407         iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset);
408
409         res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &dummy);
410         if (res <= 0)
411                 return -ENOMEM;
412
413         BUG_ON(dummy);
414         nr_pages = DIV_ROUND_UP(res, PAGE_SIZE);
415
416         vec = __vec;
417         if (nr_pages > PIPE_DEF_BUFFERS) {
418                 vec = kmalloc(nr_pages * sizeof(struct kvec), GFP_KERNEL);
419                 if (unlikely(!vec)) {
420                         res = -ENOMEM;
421                         goto out;
422                 }
423         }
424
425         pipe->bufs[to.idx].offset = offset;
426         pipe->bufs[to.idx].len -= offset;
427
428         for (i = 0; i < nr_pages; i++) {
429                 size_t this_len = min_t(size_t, len, PAGE_SIZE - offset);
430                 vec[i].iov_base = page_address(pages[i]) + offset;
431                 vec[i].iov_len = this_len;
432                 len -= this_len;
433                 offset = 0;
434         }
435
436         res = kernel_readv(in, vec, nr_pages, *ppos);
437         if (res > 0) {
438                 copied = res;
439                 *ppos += res;
440         }
441
442         if (vec != __vec)
443                 kfree(vec);
444 out:
445         for (i = 0; i < nr_pages; i++)
446                 put_page(pages[i]);
447         kvfree(pages);
448         iov_iter_advance(&to, copied);  /* truncates and discards */
449         return res;
450 }
451
452 /*
453  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
454  * using sendpage(). Return the number of bytes sent.
455  */
456 static int pipe_to_sendpage(struct pipe_inode_info *pipe,
457                             struct pipe_buffer *buf, struct splice_desc *sd)
458 {
459         struct file *file = sd->u.file;
460         loff_t pos = sd->pos;
461         int more;
462
463         if (!likely(file->f_op->sendpage))
464                 return -EINVAL;
465
466         more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
467
468         if (sd->len < sd->total_len && pipe->nrbufs > 1)
469                 more |= MSG_SENDPAGE_NOTLAST;
470
471         return file->f_op->sendpage(file, buf->page, buf->offset,
472                                     sd->len, &pos, more);
473 }
474
475 static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
476 {
477         smp_mb();
478         if (waitqueue_active(&pipe->wait))
479                 wake_up_interruptible(&pipe->wait);
480         kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
481 }
482
483 /**
484  * splice_from_pipe_feed - feed available data from a pipe to a file
485  * @pipe:       pipe to splice from
486  * @sd:         information to @actor
487  * @actor:      handler that splices the data
488  *
489  * Description:
490  *    This function loops over the pipe and calls @actor to do the
491  *    actual moving of a single struct pipe_buffer to the desired
492  *    destination.  It returns when there's no more buffers left in
493  *    the pipe or if the requested number of bytes (@sd->total_len)
494  *    have been copied.  It returns a positive number (one) if the
495  *    pipe needs to be filled with more data, zero if the required
496  *    number of bytes have been copied and -errno on error.
497  *
498  *    This, together with splice_from_pipe_{begin,end,next}, may be
499  *    used to implement the functionality of __splice_from_pipe() when
500  *    locking is required around copying the pipe buffers to the
501  *    destination.
502  */
503 static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
504                           splice_actor *actor)
505 {
506         int ret;
507
508         while (pipe->nrbufs) {
509                 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
510
511                 sd->len = buf->len;
512                 if (sd->len > sd->total_len)
513                         sd->len = sd->total_len;
514
515                 ret = pipe_buf_confirm(pipe, buf);
516                 if (unlikely(ret)) {
517                         if (ret == -ENODATA)
518                                 ret = 0;
519                         return ret;
520                 }
521
522                 ret = actor(pipe, buf, sd);
523                 if (ret <= 0)
524                         return ret;
525
526                 buf->offset += ret;
527                 buf->len -= ret;
528
529                 sd->num_spliced += ret;
530                 sd->len -= ret;
531                 sd->pos += ret;
532                 sd->total_len -= ret;
533
534                 if (!buf->len) {
535                         pipe_buf_release(pipe, buf);
536                         pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
537                         pipe->nrbufs--;
538                         if (pipe->files)
539                                 sd->need_wakeup = true;
540                 }
541
542                 if (!sd->total_len)
543                         return 0;
544         }
545
546         return 1;
547 }
548
549 /**
550  * splice_from_pipe_next - wait for some data to splice from
551  * @pipe:       pipe to splice from
552  * @sd:         information about the splice operation
553  *
554  * Description:
555  *    This function will wait for some data and return a positive
556  *    value (one) if pipe buffers are available.  It will return zero
557  *    or -errno if no more data needs to be spliced.
558  */
559 static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
560 {
561         /*
562          * Check for signal early to make process killable when there are
563          * always buffers available
564          */
565         if (signal_pending(current))
566                 return -ERESTARTSYS;
567
568         while (!pipe->nrbufs) {
569                 if (!pipe->writers)
570                         return 0;
571
572                 if (!pipe->waiting_writers && sd->num_spliced)
573                         return 0;
574
575                 if (sd->flags & SPLICE_F_NONBLOCK)
576                         return -EAGAIN;
577
578                 if (signal_pending(current))
579                         return -ERESTARTSYS;
580
581                 if (sd->need_wakeup) {
582                         wakeup_pipe_writers(pipe);
583                         sd->need_wakeup = false;
584                 }
585
586                 pipe_wait(pipe);
587         }
588
589         return 1;
590 }
591
592 /**
593  * splice_from_pipe_begin - start splicing from pipe
594  * @sd:         information about the splice operation
595  *
596  * Description:
597  *    This function should be called before a loop containing
598  *    splice_from_pipe_next() and splice_from_pipe_feed() to
599  *    initialize the necessary fields of @sd.
600  */
601 static void splice_from_pipe_begin(struct splice_desc *sd)
602 {
603         sd->num_spliced = 0;
604         sd->need_wakeup = false;
605 }
606
607 /**
608  * splice_from_pipe_end - finish splicing from pipe
609  * @pipe:       pipe to splice from
610  * @sd:         information about the splice operation
611  *
612  * Description:
613  *    This function will wake up pipe writers if necessary.  It should
614  *    be called after a loop containing splice_from_pipe_next() and
615  *    splice_from_pipe_feed().
616  */
617 static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
618 {
619         if (sd->need_wakeup)
620                 wakeup_pipe_writers(pipe);
621 }
622
623 /**
624  * __splice_from_pipe - splice data from a pipe to given actor
625  * @pipe:       pipe to splice from
626  * @sd:         information to @actor
627  * @actor:      handler that splices the data
628  *
629  * Description:
630  *    This function does little more than loop over the pipe and call
631  *    @actor to do the actual moving of a single struct pipe_buffer to
632  *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
633  *    pipe_to_user.
634  *
635  */
636 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
637                            splice_actor *actor)
638 {
639         int ret;
640
641         splice_from_pipe_begin(sd);
642         do {
643                 cond_resched();
644                 ret = splice_from_pipe_next(pipe, sd);
645                 if (ret > 0)
646                         ret = splice_from_pipe_feed(pipe, sd, actor);
647         } while (ret > 0);
648         splice_from_pipe_end(pipe, sd);
649
650         return sd->num_spliced ? sd->num_spliced : ret;
651 }
652 EXPORT_SYMBOL(__splice_from_pipe);
653
654 /**
655  * splice_from_pipe - splice data from a pipe to a file
656  * @pipe:       pipe to splice from
657  * @out:        file to splice to
658  * @ppos:       position in @out
659  * @len:        how many bytes to splice
660  * @flags:      splice modifier flags
661  * @actor:      handler that splices the data
662  *
663  * Description:
664  *    See __splice_from_pipe. This function locks the pipe inode,
665  *    otherwise it's identical to __splice_from_pipe().
666  *
667  */
668 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
669                          loff_t *ppos, size_t len, unsigned int flags,
670                          splice_actor *actor)
671 {
672         ssize_t ret;
673         struct splice_desc sd = {
674                 .total_len = len,
675                 .flags = flags,
676                 .pos = *ppos,
677                 .u.file = out,
678         };
679
680         pipe_lock(pipe);
681         ret = __splice_from_pipe(pipe, &sd, actor);
682         pipe_unlock(pipe);
683
684         return ret;
685 }
686
687 /**
688  * iter_file_splice_write - splice data from a pipe to a file
689  * @pipe:       pipe info
690  * @out:        file to write to
691  * @ppos:       position in @out
692  * @len:        number of bytes to splice
693  * @flags:      splice modifier flags
694  *
695  * Description:
696  *    Will either move or copy pages (determined by @flags options) from
697  *    the given pipe inode to the given file.
698  *    This one is ->write_iter-based.
699  *
700  */
701 ssize_t
702 iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
703                           loff_t *ppos, size_t len, unsigned int flags)
704 {
705         struct splice_desc sd = {
706                 .total_len = len,
707                 .flags = flags,
708                 .pos = *ppos,
709                 .u.file = out,
710         };
711         int nbufs = pipe->buffers;
712         struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
713                                         GFP_KERNEL);
714         ssize_t ret;
715
716         if (unlikely(!array))
717                 return -ENOMEM;
718
719         pipe_lock(pipe);
720
721         splice_from_pipe_begin(&sd);
722         while (sd.total_len) {
723                 struct iov_iter from;
724                 size_t left;
725                 int n, idx;
726
727                 ret = splice_from_pipe_next(pipe, &sd);
728                 if (ret <= 0)
729                         break;
730
731                 if (unlikely(nbufs < pipe->buffers)) {
732                         kfree(array);
733                         nbufs = pipe->buffers;
734                         array = kcalloc(nbufs, sizeof(struct bio_vec),
735                                         GFP_KERNEL);
736                         if (!array) {
737                                 ret = -ENOMEM;
738                                 break;
739                         }
740                 }
741
742                 /* build the vector */
743                 left = sd.total_len;
744                 for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
745                         struct pipe_buffer *buf = pipe->bufs + idx;
746                         size_t this_len = buf->len;
747
748                         if (this_len > left)
749                                 this_len = left;
750
751                         if (idx == pipe->buffers - 1)
752                                 idx = -1;
753
754                         ret = pipe_buf_confirm(pipe, buf);
755                         if (unlikely(ret)) {
756                                 if (ret == -ENODATA)
757                                         ret = 0;
758                                 goto done;
759                         }
760
761                         array[n].bv_page = buf->page;
762                         array[n].bv_len = this_len;
763                         array[n].bv_offset = buf->offset;
764                         left -= this_len;
765                 }
766
767                 iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
768                               sd.total_len - left);
769                 ret = vfs_iter_write(out, &from, &sd.pos);
770                 if (ret <= 0)
771                         break;
772
773                 sd.num_spliced += ret;
774                 sd.total_len -= ret;
775                 *ppos = sd.pos;
776
777                 /* dismiss the fully eaten buffers, adjust the partial one */
778                 while (ret) {
779                         struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
780                         if (ret >= buf->len) {
781                                 ret -= buf->len;
782                                 buf->len = 0;
783                                 pipe_buf_release(pipe, buf);
784                                 pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
785                                 pipe->nrbufs--;
786                                 if (pipe->files)
787                                         sd.need_wakeup = true;
788                         } else {
789                                 buf->offset += ret;
790                                 buf->len -= ret;
791                                 ret = 0;
792                         }
793                 }
794         }
795 done:
796         kfree(array);
797         splice_from_pipe_end(pipe, &sd);
798
799         pipe_unlock(pipe);
800
801         if (sd.num_spliced)
802                 ret = sd.num_spliced;
803
804         return ret;
805 }
806
807 EXPORT_SYMBOL(iter_file_splice_write);
808
809 static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
810                           struct splice_desc *sd)
811 {
812         int ret;
813         void *data;
814         loff_t tmp = sd->pos;
815
816         data = kmap(buf->page);
817         ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
818         kunmap(buf->page);
819
820         return ret;
821 }
822
823 static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
824                                          struct file *out, loff_t *ppos,
825                                          size_t len, unsigned int flags)
826 {
827         ssize_t ret;
828
829         ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
830         if (ret > 0)
831                 *ppos += ret;
832
833         return ret;
834 }
835
836 /**
837  * generic_splice_sendpage - splice data from a pipe to a socket
838  * @pipe:       pipe to splice from
839  * @out:        socket to write to
840  * @ppos:       position in @out
841  * @len:        number of bytes to splice
842  * @flags:      splice modifier flags
843  *
844  * Description:
845  *    Will send @len bytes from the pipe to a network socket. No data copying
846  *    is involved.
847  *
848  */
849 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
850                                 loff_t *ppos, size_t len, unsigned int flags)
851 {
852         return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
853 }
854
855 EXPORT_SYMBOL(generic_splice_sendpage);
856
857 /*
858  * Attempt to initiate a splice from pipe to file.
859  */
860 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
861                            loff_t *ppos, size_t len, unsigned int flags)
862 {
863         ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
864                                 loff_t *, size_t, unsigned int);
865
866         if (out->f_op->splice_write)
867                 splice_write = out->f_op->splice_write;
868         else
869                 splice_write = default_file_splice_write;
870
871         return splice_write(pipe, out, ppos, len, flags);
872 }
873
874 /*
875  * Attempt to initiate a splice from a file to a pipe.
876  */
877 static long do_splice_to(struct file *in, loff_t *ppos,
878                          struct pipe_inode_info *pipe, size_t len,
879                          unsigned int flags)
880 {
881         ssize_t (*splice_read)(struct file *, loff_t *,
882                                struct pipe_inode_info *, size_t, unsigned int);
883         int ret;
884
885         if (unlikely(!(in->f_mode & FMODE_READ)))
886                 return -EBADF;
887
888         ret = rw_verify_area(READ, in, ppos, len);
889         if (unlikely(ret < 0))
890                 return ret;
891
892         if (unlikely(len > MAX_RW_COUNT))
893                 len = MAX_RW_COUNT;
894
895         if (in->f_op->splice_read)
896                 splice_read = in->f_op->splice_read;
897         else
898                 splice_read = default_file_splice_read;
899
900         return splice_read(in, ppos, pipe, len, flags);
901 }
902
903 /**
904  * splice_direct_to_actor - splices data directly between two non-pipes
905  * @in:         file to splice from
906  * @sd:         actor information on where to splice to
907  * @actor:      handles the data splicing
908  *
909  * Description:
910  *    This is a special case helper to splice directly between two
911  *    points, without requiring an explicit pipe. Internally an allocated
912  *    pipe is cached in the process, and reused during the lifetime of
913  *    that process.
914  *
915  */
916 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
917                                splice_direct_actor *actor)
918 {
919         struct pipe_inode_info *pipe;
920         long ret, bytes;
921         umode_t i_mode;
922         size_t len;
923         int i, flags, more;
924
925         /*
926          * We require the input being a regular file, as we don't want to
927          * randomly drop data for eg socket -> socket splicing. Use the
928          * piped splicing for that!
929          */
930         i_mode = file_inode(in)->i_mode;
931         if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
932                 return -EINVAL;
933
934         /*
935          * neither in nor out is a pipe, setup an internal pipe attached to
936          * 'out' and transfer the wanted data from 'in' to 'out' through that
937          */
938         pipe = current->splice_pipe;
939         if (unlikely(!pipe)) {
940                 pipe = alloc_pipe_info();
941                 if (!pipe)
942                         return -ENOMEM;
943
944                 /*
945                  * We don't have an immediate reader, but we'll read the stuff
946                  * out of the pipe right after the splice_to_pipe(). So set
947                  * PIPE_READERS appropriately.
948                  */
949                 pipe->readers = 1;
950
951                 current->splice_pipe = pipe;
952         }
953
954         /*
955          * Do the splice.
956          */
957         ret = 0;
958         bytes = 0;
959         len = sd->total_len;
960         flags = sd->flags;
961
962         /*
963          * Don't block on output, we have to drain the direct pipe.
964          */
965         sd->flags &= ~SPLICE_F_NONBLOCK;
966         more = sd->flags & SPLICE_F_MORE;
967
968         while (len) {
969                 size_t read_len;
970                 loff_t pos = sd->pos, prev_pos = pos;
971
972                 ret = do_splice_to(in, &pos, pipe, len, flags);
973                 if (unlikely(ret <= 0))
974                         goto out_release;
975
976                 read_len = ret;
977                 sd->total_len = read_len;
978
979                 /*
980                  * If more data is pending, set SPLICE_F_MORE
981                  * If this is the last data and SPLICE_F_MORE was not set
982                  * initially, clears it.
983                  */
984                 if (read_len < len)
985                         sd->flags |= SPLICE_F_MORE;
986                 else if (!more)
987                         sd->flags &= ~SPLICE_F_MORE;
988                 /*
989                  * NOTE: nonblocking mode only applies to the input. We
990                  * must not do the output in nonblocking mode as then we
991                  * could get stuck data in the internal pipe:
992                  */
993                 ret = actor(pipe, sd);
994                 if (unlikely(ret <= 0)) {
995                         sd->pos = prev_pos;
996                         goto out_release;
997                 }
998
999                 bytes += ret;
1000                 len -= ret;
1001                 sd->pos = pos;
1002
1003                 if (ret < read_len) {
1004                         sd->pos = prev_pos + ret;
1005                         goto out_release;
1006                 }
1007         }
1008
1009 done:
1010         pipe->nrbufs = pipe->curbuf = 0;
1011         file_accessed(in);
1012         return bytes;
1013
1014 out_release:
1015         /*
1016          * If we did an incomplete transfer we must release
1017          * the pipe buffers in question:
1018          */
1019         for (i = 0; i < pipe->buffers; i++) {
1020                 struct pipe_buffer *buf = pipe->bufs + i;
1021
1022                 if (buf->ops)
1023                         pipe_buf_release(pipe, buf);
1024         }
1025
1026         if (!bytes)
1027                 bytes = ret;
1028
1029         goto done;
1030 }
1031 EXPORT_SYMBOL(splice_direct_to_actor);
1032
1033 static int direct_splice_actor(struct pipe_inode_info *pipe,
1034                                struct splice_desc *sd)
1035 {
1036         struct file *file = sd->u.file;
1037
1038         return do_splice_from(pipe, file, sd->opos, sd->total_len,
1039                               sd->flags);
1040 }
1041
1042 /**
1043  * do_splice_direct - splices data directly between two files
1044  * @in:         file to splice from
1045  * @ppos:       input file offset
1046  * @out:        file to splice to
1047  * @opos:       output file offset
1048  * @len:        number of bytes to splice
1049  * @flags:      splice modifier flags
1050  *
1051  * Description:
1052  *    For use by do_sendfile(). splice can easily emulate sendfile, but
1053  *    doing it in the application would incur an extra system call
1054  *    (splice in + splice out, as compared to just sendfile()). So this helper
1055  *    can splice directly through a process-private pipe.
1056  *
1057  */
1058 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1059                       loff_t *opos, size_t len, unsigned int flags)
1060 {
1061         struct splice_desc sd = {
1062                 .len            = len,
1063                 .total_len      = len,
1064                 .flags          = flags,
1065                 .pos            = *ppos,
1066                 .u.file         = out,
1067                 .opos           = opos,
1068         };
1069         long ret;
1070
1071         if (unlikely(!(out->f_mode & FMODE_WRITE)))
1072                 return -EBADF;
1073
1074         if (unlikely(out->f_flags & O_APPEND))
1075                 return -EINVAL;
1076
1077         ret = rw_verify_area(WRITE, out, opos, len);
1078         if (unlikely(ret < 0))
1079                 return ret;
1080
1081         ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1082         if (ret > 0)
1083                 *ppos = sd.pos;
1084
1085         return ret;
1086 }
1087 EXPORT_SYMBOL(do_splice_direct);
1088
1089 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
1090 {
1091         for (;;) {
1092                 if (unlikely(!pipe->readers)) {
1093                         send_sig(SIGPIPE, current, 0);
1094                         return -EPIPE;
1095                 }
1096                 if (pipe->nrbufs != pipe->buffers)
1097                         return 0;
1098                 if (flags & SPLICE_F_NONBLOCK)
1099                         return -EAGAIN;
1100                 if (signal_pending(current))
1101                         return -ERESTARTSYS;
1102                 pipe->waiting_writers++;
1103                 pipe_wait(pipe);
1104                 pipe->waiting_writers--;
1105         }
1106 }
1107
1108 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1109                                struct pipe_inode_info *opipe,
1110                                size_t len, unsigned int flags);
1111
1112 /*
1113  * Determine where to splice to/from.
1114  */
1115 static long do_splice(struct file *in, loff_t __user *off_in,
1116                       struct file *out, loff_t __user *off_out,
1117                       size_t len, unsigned int flags)
1118 {
1119         struct pipe_inode_info *ipipe;
1120         struct pipe_inode_info *opipe;
1121         loff_t offset;
1122         long ret;
1123
1124         ipipe = get_pipe_info(in);
1125         opipe = get_pipe_info(out);
1126
1127         if (ipipe && opipe) {
1128                 if (off_in || off_out)
1129                         return -ESPIPE;
1130
1131                 if (!(in->f_mode & FMODE_READ))
1132                         return -EBADF;
1133
1134                 if (!(out->f_mode & FMODE_WRITE))
1135                         return -EBADF;
1136
1137                 /* Splicing to self would be fun, but... */
1138                 if (ipipe == opipe)
1139                         return -EINVAL;
1140
1141                 return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1142         }
1143
1144         if (ipipe) {
1145                 if (off_in)
1146                         return -ESPIPE;
1147                 if (off_out) {
1148                         if (!(out->f_mode & FMODE_PWRITE))
1149                                 return -EINVAL;
1150                         if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1151                                 return -EFAULT;
1152                 } else {
1153                         offset = out->f_pos;
1154                 }
1155
1156                 if (unlikely(!(out->f_mode & FMODE_WRITE)))
1157                         return -EBADF;
1158
1159                 if (unlikely(out->f_flags & O_APPEND))
1160                         return -EINVAL;
1161
1162                 ret = rw_verify_area(WRITE, out, &offset, len);
1163                 if (unlikely(ret < 0))
1164                         return ret;
1165
1166                 file_start_write(out);
1167                 ret = do_splice_from(ipipe, out, &offset, len, flags);
1168                 file_end_write(out);
1169
1170                 if (!off_out)
1171                         out->f_pos = offset;
1172                 else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
1173                         ret = -EFAULT;
1174
1175                 return ret;
1176         }
1177
1178         if (opipe) {
1179                 if (off_out)
1180                         return -ESPIPE;
1181                 if (off_in) {
1182                         if (!(in->f_mode & FMODE_PREAD))
1183                                 return -EINVAL;
1184                         if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1185                                 return -EFAULT;
1186                 } else {
1187                         offset = in->f_pos;
1188                 }
1189
1190                 pipe_lock(opipe);
1191                 ret = wait_for_space(opipe, flags);
1192                 if (!ret)
1193                         ret = do_splice_to(in, &offset, opipe, len, flags);
1194                 pipe_unlock(opipe);
1195                 if (ret > 0)
1196                         wakeup_pipe_readers(opipe);
1197                 if (!off_in)
1198                         in->f_pos = offset;
1199                 else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
1200                         ret = -EFAULT;
1201
1202                 return ret;
1203         }
1204
1205         return -EINVAL;
1206 }
1207
1208 static int iter_to_pipe(struct iov_iter *from,
1209                         struct pipe_inode_info *pipe,
1210                         unsigned flags)
1211 {
1212         struct pipe_buffer buf = {
1213                 .ops = &user_page_pipe_buf_ops,
1214                 .flags = flags
1215         };
1216         size_t total = 0;
1217         int ret = 0;
1218         bool failed = false;
1219
1220         while (iov_iter_count(from) && !failed) {
1221                 struct page *pages[16];
1222                 ssize_t copied;
1223                 size_t start;
1224                 int n;
1225
1226                 copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start);
1227                 if (copied <= 0) {
1228                         ret = copied;
1229                         break;
1230                 }
1231
1232                 for (n = 0; copied; n++, start = 0) {
1233                         int size = min_t(int, copied, PAGE_SIZE - start);
1234                         if (!failed) {
1235                                 buf.page = pages[n];
1236                                 buf.offset = start;
1237                                 buf.len = size;
1238                                 ret = add_to_pipe(pipe, &buf);
1239                                 if (unlikely(ret < 0)) {
1240                                         failed = true;
1241                                 } else {
1242                                         iov_iter_advance(from, ret);
1243                                         total += ret;
1244                                 }
1245                         } else {
1246                                 put_page(pages[n]);
1247                         }
1248                         copied -= size;
1249                 }
1250         }
1251         return total ? total : ret;
1252 }
1253
1254 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1255                         struct splice_desc *sd)
1256 {
1257         int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1258         return n == sd->len ? n : -EFAULT;
1259 }
1260
1261 /*
1262  * For lack of a better implementation, implement vmsplice() to userspace
1263  * as a simple copy of the pipes pages to the user iov.
1264  */
1265 static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
1266                              unsigned long nr_segs, unsigned int flags)
1267 {
1268         struct pipe_inode_info *pipe;
1269         struct splice_desc sd;
1270         long ret;
1271         struct iovec iovstack[UIO_FASTIOV];
1272         struct iovec *iov = iovstack;
1273         struct iov_iter iter;
1274
1275         pipe = get_pipe_info(file);
1276         if (!pipe)
1277                 return -EBADF;
1278
1279         ret = import_iovec(READ, uiov, nr_segs,
1280                            ARRAY_SIZE(iovstack), &iov, &iter);
1281         if (ret < 0)
1282                 return ret;
1283
1284         sd.total_len = iov_iter_count(&iter);
1285         sd.len = 0;
1286         sd.flags = flags;
1287         sd.u.data = &iter;
1288         sd.pos = 0;
1289
1290         if (sd.total_len) {
1291                 pipe_lock(pipe);
1292                 ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1293                 pipe_unlock(pipe);
1294         }
1295
1296         kfree(iov);
1297         return ret;
1298 }
1299
1300 /*
1301  * vmsplice splices a user address range into a pipe. It can be thought of
1302  * as splice-from-memory, where the regular splice is splice-from-file (or
1303  * to file). In both cases the output is a pipe, naturally.
1304  */
1305 static long vmsplice_to_pipe(struct file *file, const struct iovec __user *uiov,
1306                              unsigned long nr_segs, unsigned int flags)
1307 {
1308         struct pipe_inode_info *pipe;
1309         struct iovec iovstack[UIO_FASTIOV];
1310         struct iovec *iov = iovstack;
1311         struct iov_iter from;
1312         long ret;
1313         unsigned buf_flag = 0;
1314
1315         if (flags & SPLICE_F_GIFT)
1316                 buf_flag = PIPE_BUF_FLAG_GIFT;
1317
1318         pipe = get_pipe_info(file);
1319         if (!pipe)
1320                 return -EBADF;
1321
1322         ret = import_iovec(WRITE, uiov, nr_segs,
1323                            ARRAY_SIZE(iovstack), &iov, &from);
1324         if (ret < 0)
1325                 return ret;
1326
1327         pipe_lock(pipe);
1328         ret = wait_for_space(pipe, flags);
1329         if (!ret)
1330                 ret = iter_to_pipe(&from, pipe, buf_flag);
1331         pipe_unlock(pipe);
1332         if (ret > 0)
1333                 wakeup_pipe_readers(pipe);
1334         kfree(iov);
1335         return ret;
1336 }
1337
1338 /*
1339  * Note that vmsplice only really supports true splicing _from_ user memory
1340  * to a pipe, not the other way around. Splicing from user memory is a simple
1341  * operation that can be supported without any funky alignment restrictions
1342  * or nasty vm tricks. We simply map in the user memory and fill them into
1343  * a pipe. The reverse isn't quite as easy, though. There are two possible
1344  * solutions for that:
1345  *
1346  *      - memcpy() the data internally, at which point we might as well just
1347  *        do a regular read() on the buffer anyway.
1348  *      - Lots of nasty vm tricks, that are neither fast nor flexible (it
1349  *        has restriction limitations on both ends of the pipe).
1350  *
1351  * Currently we punt and implement it as a normal copy, see pipe_to_user().
1352  *
1353  */
1354 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
1355                 unsigned long, nr_segs, unsigned int, flags)
1356 {
1357         struct fd f;
1358         long error;
1359
1360         if (unlikely(nr_segs > UIO_MAXIOV))
1361                 return -EINVAL;
1362         else if (unlikely(!nr_segs))
1363                 return 0;
1364
1365         error = -EBADF;
1366         f = fdget(fd);
1367         if (f.file) {
1368                 if (f.file->f_mode & FMODE_WRITE)
1369                         error = vmsplice_to_pipe(f.file, iov, nr_segs, flags);
1370                 else if (f.file->f_mode & FMODE_READ)
1371                         error = vmsplice_to_user(f.file, iov, nr_segs, flags);
1372
1373                 fdput(f);
1374         }
1375
1376         return error;
1377 }
1378
#ifdef CONFIG_COMPAT
/*
 * Compat entry point for vmsplice(): every compat_iovec in @iov32 is
 * rewritten as a native struct iovec in the area returned by
 * compat_alloc_user_space(), and the native syscall is then invoked on
 * that converted array. Returns -EINVAL if @nr_segs exceeds UIO_MAXIOV
 * and -EFAULT if any user access fails.
 */
COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
		    unsigned int, nr_segs, unsigned int, flags)
{
	struct iovec __user *iov;
	unsigned i;

	if (nr_segs > UIO_MAXIOV)
		return -EINVAL;

	iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));

	for (i = 0; i < nr_segs; i++) {
		struct compat_iovec v;

		/* Read the compat entry ... */
		if (get_user(v.iov_base, &iov32[i].iov_base) ||
		    get_user(v.iov_len, &iov32[i].iov_len))
			return -EFAULT;

		/* ... and store it in native layout. */
		if (put_user(compat_ptr(v.iov_base), &iov[i].iov_base) ||
		    put_user(v.iov_len, &iov[i].iov_len))
			return -EFAULT;
	}

	return sys_vmsplice(fd, iov, nr_segs, flags);
}
#endif
1399
1400 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1401                 int, fd_out, loff_t __user *, off_out,
1402                 size_t, len, unsigned int, flags)
1403 {
1404         struct fd in, out;
1405         long error;
1406
1407         if (unlikely(!len))
1408                 return 0;
1409
1410         error = -EBADF;
1411         in = fdget(fd_in);
1412         if (in.file) {
1413                 if (in.file->f_mode & FMODE_READ) {
1414                         out = fdget(fd_out);
1415                         if (out.file) {
1416                                 if (out.file->f_mode & FMODE_WRITE)
1417                                         error = do_splice(in.file, off_in,
1418                                                           out.file, off_out,
1419                                                           len, flags);
1420                                 fdput(out);
1421                         }
1422                 }
1423                 fdput(in);
1424         }
1425         return error;
1426 }
1427
1428 /*
1429  * Make sure there's data to read. Wait for input if we can, otherwise
1430  * return an appropriate error.
1431  */
1432 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1433 {
1434         int ret;
1435
1436         /*
1437          * Check ->nrbufs without the inode lock first. This function
1438          * is speculative anyways, so missing one is ok.
1439          */
1440         if (pipe->nrbufs)
1441                 return 0;
1442
1443         ret = 0;
1444         pipe_lock(pipe);
1445
1446         while (!pipe->nrbufs) {
1447                 if (signal_pending(current)) {
1448                         ret = -ERESTARTSYS;
1449                         break;
1450                 }
1451                 if (!pipe->writers)
1452                         break;
1453                 if (!pipe->waiting_writers) {
1454                         if (flags & SPLICE_F_NONBLOCK) {
1455                                 ret = -EAGAIN;
1456                                 break;
1457                         }
1458                 }
1459                 pipe_wait(pipe);
1460         }
1461
1462         pipe_unlock(pipe);
1463         return ret;
1464 }
1465
1466 /*
1467  * Make sure there's writeable room. Wait for room if we can, otherwise
1468  * return an appropriate error.
1469  */
1470 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1471 {
1472         int ret;
1473
1474         /*
1475          * Check ->nrbufs without the inode lock first. This function
1476          * is speculative anyways, so missing one is ok.
1477          */
1478         if (pipe->nrbufs < pipe->buffers)
1479                 return 0;
1480
1481         ret = 0;
1482         pipe_lock(pipe);
1483
1484         while (pipe->nrbufs >= pipe->buffers) {
1485                 if (!pipe->readers) {
1486                         send_sig(SIGPIPE, current, 0);
1487                         ret = -EPIPE;
1488                         break;
1489                 }
1490                 if (flags & SPLICE_F_NONBLOCK) {
1491                         ret = -EAGAIN;
1492                         break;
1493                 }
1494                 if (signal_pending(current)) {
1495                         ret = -ERESTARTSYS;
1496                         break;
1497                 }
1498                 pipe->waiting_writers++;
1499                 pipe_wait(pipe);
1500                 pipe->waiting_writers--;
1501         }
1502
1503         pipe_unlock(pipe);
1504         return ret;
1505 }
1506
1507 /*
1508  * Splice contents of ipipe to opipe.
1509  */
1510 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1511                                struct pipe_inode_info *opipe,
1512                                size_t len, unsigned int flags)
1513 {
1514         struct pipe_buffer *ibuf, *obuf;
1515         int ret = 0, nbuf;
1516         bool input_wakeup = false;
1517
1518
1519 retry:
1520         ret = ipipe_prep(ipipe, flags);
1521         if (ret)
1522                 return ret;
1523
1524         ret = opipe_prep(opipe, flags);
1525         if (ret)
1526                 return ret;
1527
1528         /*
1529          * Potential ABBA deadlock, work around it by ordering lock
1530          * grabbing by pipe info address. Otherwise two different processes
1531          * could deadlock (one doing tee from A -> B, the other from B -> A).
1532          */
1533         pipe_double_lock(ipipe, opipe);
1534
1535         do {
1536                 if (!opipe->readers) {
1537                         send_sig(SIGPIPE, current, 0);
1538                         if (!ret)
1539                                 ret = -EPIPE;
1540                         break;
1541                 }
1542
1543                 if (!ipipe->nrbufs && !ipipe->writers)
1544                         break;
1545
1546                 /*
1547                  * Cannot make any progress, because either the input
1548                  * pipe is empty or the output pipe is full.
1549                  */
1550                 if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
1551                         /* Already processed some buffers, break */
1552                         if (ret)
1553                                 break;
1554
1555                         if (flags & SPLICE_F_NONBLOCK) {
1556                                 ret = -EAGAIN;
1557                                 break;
1558                         }
1559
1560                         /*
1561                          * We raced with another reader/writer and haven't
1562                          * managed to process any buffers.  A zero return
1563                          * value means EOF, so retry instead.
1564                          */
1565                         pipe_unlock(ipipe);
1566                         pipe_unlock(opipe);
1567                         goto retry;
1568                 }
1569
1570                 ibuf = ipipe->bufs + ipipe->curbuf;
1571                 nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1572                 obuf = opipe->bufs + nbuf;
1573
1574                 if (len >= ibuf->len) {
1575                         /*
1576                          * Simply move the whole buffer from ipipe to opipe
1577                          */
1578                         *obuf = *ibuf;
1579                         ibuf->ops = NULL;
1580                         opipe->nrbufs++;
1581                         ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
1582                         ipipe->nrbufs--;
1583                         input_wakeup = true;
1584                 } else {
1585                         /*
1586                          * Get a reference to this pipe buffer,
1587                          * so we can copy the contents over.
1588                          */
1589                         pipe_buf_get(ipipe, ibuf);
1590                         *obuf = *ibuf;
1591
1592                         /*
1593                          * Don't inherit the gift flag, we need to
1594                          * prevent multiple steals of this page.
1595                          */
1596                         obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1597
1598                         obuf->len = len;
1599                         opipe->nrbufs++;
1600                         ibuf->offset += obuf->len;
1601                         ibuf->len -= obuf->len;
1602                 }
1603                 ret += obuf->len;
1604                 len -= obuf->len;
1605         } while (len);
1606
1607         pipe_unlock(ipipe);
1608         pipe_unlock(opipe);
1609
1610         /*
1611          * If we put data in the output pipe, wakeup any potential readers.
1612          */
1613         if (ret > 0)
1614                 wakeup_pipe_readers(opipe);
1615
1616         if (input_wakeup)
1617                 wakeup_pipe_writers(ipipe);
1618
1619         return ret;
1620 }
1621
1622 /*
1623  * Link contents of ipipe to opipe.
1624  */
1625 static int link_pipe(struct pipe_inode_info *ipipe,
1626                      struct pipe_inode_info *opipe,
1627                      size_t len, unsigned int flags)
1628 {
1629         struct pipe_buffer *ibuf, *obuf;
1630         int ret = 0, i = 0, nbuf;
1631
1632         /*
1633          * Potential ABBA deadlock, work around it by ordering lock
1634          * grabbing by pipe info address. Otherwise two different processes
1635          * could deadlock (one doing tee from A -> B, the other from B -> A).
1636          */
1637         pipe_double_lock(ipipe, opipe);
1638
1639         do {
1640                 if (!opipe->readers) {
1641                         send_sig(SIGPIPE, current, 0);
1642                         if (!ret)
1643                                 ret = -EPIPE;
1644                         break;
1645                 }
1646
1647                 /*
1648                  * If we have iterated all input buffers or ran out of
1649                  * output room, break.
1650                  */
1651                 if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
1652                         break;
1653
1654                 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
1655                 nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1656
1657                 /*
1658                  * Get a reference to this pipe buffer,
1659                  * so we can copy the contents over.
1660                  */
1661                 pipe_buf_get(ipipe, ibuf);
1662
1663                 obuf = opipe->bufs + nbuf;
1664                 *obuf = *ibuf;
1665
1666                 /*
1667                  * Don't inherit the gift flag, we need to
1668                  * prevent multiple steals of this page.
1669                  */
1670                 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1671
1672                 if (obuf->len > len)
1673                         obuf->len = len;
1674
1675                 opipe->nrbufs++;
1676                 ret += obuf->len;
1677                 len -= obuf->len;
1678                 i++;
1679         } while (len);
1680
1681         /*
1682          * return EAGAIN if we have the potential of some data in the
1683          * future, otherwise just return 0
1684          */
1685         if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1686                 ret = -EAGAIN;
1687
1688         pipe_unlock(ipipe);
1689         pipe_unlock(opipe);
1690
1691         /*
1692          * If we put data in the output pipe, wakeup any potential readers.
1693          */
1694         if (ret > 0)
1695                 wakeup_pipe_readers(opipe);
1696
1697         return ret;
1698 }
1699
1700 /*
1701  * This is a tee(1) implementation that works on pipes. It doesn't copy
1702  * any data, it simply references the 'in' pages on the 'out' pipe.
1703  * The 'flags' used are the SPLICE_F_* variants, currently the only
1704  * applicable one is SPLICE_F_NONBLOCK.
1705  */
1706 static long do_tee(struct file *in, struct file *out, size_t len,
1707                    unsigned int flags)
1708 {
1709         struct pipe_inode_info *ipipe = get_pipe_info(in);
1710         struct pipe_inode_info *opipe = get_pipe_info(out);
1711         int ret = -EINVAL;
1712
1713         /*
1714          * Duplicate the contents of ipipe to opipe without actually
1715          * copying the data.
1716          */
1717         if (ipipe && opipe && ipipe != opipe) {
1718                 /*
1719                  * Keep going, unless we encounter an error. The ipipe/opipe
1720                  * ordering doesn't really matter.
1721                  */
1722                 ret = ipipe_prep(ipipe, flags);
1723                 if (!ret) {
1724                         ret = opipe_prep(opipe, flags);
1725                         if (!ret)
1726                                 ret = link_pipe(ipipe, opipe, len, flags);
1727                 }
1728         }
1729
1730         return ret;
1731 }
1732
1733 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1734 {
1735         struct fd in;
1736         int error;
1737
1738         if (unlikely(!len))
1739                 return 0;
1740
1741         error = -EBADF;
1742         in = fdget(fdin);
1743         if (in.file) {
1744                 if (in.file->f_mode & FMODE_READ) {
1745                         struct fd out = fdget(fdout);
1746                         if (out.file) {
1747                                 if (out.file->f_mode & FMODE_WRITE)
1748                                         error = do_tee(in.file, out.file,
1749                                                         len, flags);
1750                                 fdput(out);
1751                         }
1752                 }
1753                 fdput(in);
1754         }
1755
1756         return error;
1757 }