| From: |
| Evgeniy Polyakov <s0mbre@tservice.net.ru> |
| To: |
| Evgeniy Polyakov <johnpol@2ka.mipt.ru> |
| Subject: |
| [PATCH] zero-copy receiving sendfile(). |
| Date: |
| Tue, 20 Sep 2005 10:26:30 +0400 |
| Cc: |
| "David S. Miller" <davem@davemloft.net>, netdev@vger.kernel.org |
Hello, developers.
This patch adds zero-copy receiving sock_sendfile(),
which can be used for socket->file data transfers.
Tests with sendfile() on the server side and
recv()/write() vs. sendfile() on the client side over TCP
show about a 5-10% performance improvement and
decreased CPU usage (a CPU usage graph is attached,
which contains a comparison of the
recv()/write() vs. receiving sendfile()
test over loopback, running with different numbers of threads
on 2.6.13-rc6 and 2.6.14-rc1-git-friday).
Oprofile does not show copy_from_user() anymore,
which was third in the recv()/write() case.
The implementation is built on top of the VFS layer:
the system grabs a page and receives data
directly into it using recvmsg(); when the page is filled,
it is committed back to the VFS.
Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
diff --git a/fs/read_write.c b/fs/read_write.c
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -14,6 +14,15 @@
#include <linux/security.h>
#include <linux/module.h>
#include <linux/syscalls.h>
+#include <linux/mm.h>
+#include <linux/aio.h>
+#include <linux/swap.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+
+#include <net/sock.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -667,8 +751,15 @@ static ssize_t do_sendfile(int out_fd, i
if (!(out_file->f_mode & FMODE_WRITE))
goto fput_out;
retval = -EINVAL;
- if (!out_file->f_op || !out_file->f_op->sendpage)
+ if (!out_file->f_op)
goto fput_out;
+
+ if (!SOCKET_I(in_file->f_dentry->d_inode) && !out_file->f_op->sendpage) {
+ printk("%s: sock=%p, sendpage=%p.\n", __func__,
+ SOCKET_I(in_file->f_dentry->d_inode), out_file->f_op->sendpage);
+ goto fput_out;
+ }
+
out_inode = out_file->f_dentry->d_inode;
retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
if (retval)
@@ -685,7 +776,7 @@ static ssize_t do_sendfile(int out_fd, i
retval = -EINVAL;
if (unlikely(pos < 0))
goto fput_out;
- if (unlikely(pos + count > max)) {
+ if (unlikely((unsigned long long)(pos + count) > (unsigned long long)max)) {
retval = -EOVERFLOW;
if (pos >= max)
goto fput_out;
diff --git a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1663,7 +1663,7 @@ EXPORT_SYMBOL(read_cache_page);
* caller's lru-buffering pagevec. This function is specifically for
* generic_file_write().
*/
-static inline struct page *
+struct page *
__grab_cache_page(struct address_space *mapping, unsigned long index,
struct page **cached_page, struct pagevec *lru_pvec)
{
@@ -1692,6 +1692,8 @@ repeat:
return page;
}
+EXPORT_SYMBOL_GPL(__grab_cache_page);
+
/*
* The logic we want is
*
diff --git a/net/socket.c b/net/socket.c
--- a/net/socket.c
+++ b/net/socket.c
@@ -44,6 +44,7 @@
* Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0)
* Tigran Aivazian : Made listen(2) backlog sanity checks
* protocol-independent
+ * Evgeniy Polyakov: Added sock_sendfile().
*
*
* This program is free software; you can redistribute it and/or
@@ -84,6 +85,10 @@
#include <linux/compat.h>
#include <linux/kmod.h>
#include <linux/audit.h>
+#include <linux/pagevec.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
#ifdef CONFIG_NET_RADIO
#include <linux/wireless.h> /* Note : will define WIRELESS_EXT */
@@ -116,6 +121,7 @@ static ssize_t sock_writev(struct file *
unsigned long count, loff_t *ppos);
static ssize_t sock_sendpage(struct file *file, struct page *page,
int offset, size_t size, loff_t *ppos, int more);
+ssize_t sock_sendfile(struct file *file, loff_t *ppos, size_t count, read_actor_t actor, void
*target);
/*
@@ -136,7 +142,8 @@ static struct file_operations socket_fil
.fasync = sock_fasync,
.readv = sock_readv,
.writev = sock_writev,
- .sendpage = sock_sendpage
+ .sendpage = sock_sendpage,
+ .sendfile = sock_sendfile,
};
/*
@@ -726,6 +733,117 @@ static ssize_t sock_aio_write(struct kio
return __sock_sendmsg(iocb, sock, &x->async_msg, size);
}
+extern struct page * __grab_cache_page(struct address_space *mapping, unsigned long index,
+ struct page **cached_page, struct pagevec *lru_pvec);
+
+/*
+ * sock_sendfile - receive data from a socket directly into a file's pages.
+ *
+ * @in_file: file whose inode is converted to the source socket via SOCKET_I()
+ * @ppos:    read as the starting offset in the target file; advanced by the
+ *           number of bytes stored before returning
+ * @count:   maximum number of bytes to transfer
+ * @actor:   not used by this implementation
+ * @target:  destination, interpreted as a struct file *
+ *
+ * For each page-sized piece of the destination range the page is obtained
+ * with __grab_cache_page(), prepared with ->prepare_write(), filled by
+ * kernel_recvmsg() and handed back with ->commit_write().
+ *
+ * Returns the number of bytes stored if any were stored; otherwise 0 or a
+ * negative errno describing why nothing could be transferred.
+ */
+ssize_t sock_sendfile(struct file *in_file, loff_t *ppos, size_t count,
+		      read_actor_t actor, void *target)
+{
+	struct file *file = target;
+	struct address_space *mapping = file->f_mapping;
+	struct address_space_operations *a_ops = mapping->a_ops;
+	struct inode *inode = mapping->host;
+	struct socket *sock;
+	struct page *page;
+	struct page *cached_page = NULL;
+	struct pagevec lru_pvec;
+	struct msghdr msg;
+	struct kvec iov;
+	loff_t pos = *ppos;
+	size_t written = 0;
+	unsigned long index;
+	unsigned long page_offset;
+	unsigned long bytes, recv;
+	int status = 0;		/* first failure seen, reported if nothing was stored */
+	int stop;
+	int err;
+
+	if (!count)
+		return 0;
+
+	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+	if (err)
+		return err;	/* nothing acquired yet, no cleanup needed */
+
+	pagevec_init(&lru_pvec, 0);
+
+	sock = SOCKET_I(in_file->f_dentry->d_inode);
+
+	while (count) {
+		page_offset = pos & (PAGE_CACHE_SIZE - 1);
+		index = pos >> PAGE_CACHE_SHIFT;
+		bytes = PAGE_CACHE_SIZE - page_offset;
+		if (bytes > count)
+			bytes = count;
+
+		page = __grab_cache_page(mapping, index, &cached_page, &lru_pvec);
+		if (!page) {
+			status = -ENOMEM;
+			break;
+		}
+
+		err = a_ops->prepare_write(file, page, page_offset, page_offset + bytes);
+		if (unlikely(err)) {
+			unlock_page(page);
+			page_cache_release(page);
+			status = err;
+			break;
+		}
+
+		/* Keep receiving until this page's share is full or the socket stops. */
+		recv = bytes;
+		while (recv) {
+			/* NOTE(review): OR-ing GFP_NOIO into sk_allocation is kept as-is; confirm intent. */
+			sock->sk->sk_allocation |= GFP_NOIO;
+			/*
+			 * NOTE(review): page_address() is used without kmap(); confirm the
+			 * page always has a kernel mapping here (highmem configurations).
+			 */
+			iov.iov_base = page_address(page) + page_offset + bytes - recv;
+			iov.iov_len = recv;
+			msg.msg_name = NULL;
+			msg.msg_namelen = 0;
+			msg.msg_control = NULL;
+			msg.msg_controllen = 0;
+			msg.msg_flags = MSG_NOSIGNAL;
+
+			err = kernel_recvmsg(sock, &msg, &iov, 1, recv, 0);
+
+			if (err > 0)
+				recv -= err;
+
+			if (signal_pending(current))
+				err = -ERESTARTSYS;
+
+			if (err <= 0)
+				break;
+		}
+
+		/*
+		 * Remember why receiving stopped before err is reused for
+		 * ->commit_write(); otherwise a receive error would be reported
+		 * to the caller as a plain 0.
+		 */
+		stop = (err <= 0);
+		if (err < 0)
+			status = err;
+
+		/* Only the bytes actually received are committed. */
+		bytes -= recv;
+
+		flush_dcache_page(page);
+		err = a_ops->commit_write(file, page, page_offset, page_offset + bytes);
+		unlock_page(page);
+		mark_page_accessed(page);
+		page_cache_release(page);
+
+		if (err < 0) {
+			status = err;
+			break;
+		}
+
+		if (bytes) {
+			balance_dirty_pages_ratelimited(mapping);
+
+			count -= bytes;
+			written += bytes;
+			pos += bytes;
+		}
+
+		if (stop)
+			break;
+	}
+
+	/*
+	 * Common exit for success and failure: the spare page and the pages
+	 * queued in the pagevec must be released on every path.
+	 */
+	if (cached_page)
+		page_cache_release(cached_page);
+
+	pagevec_lru_add(&lru_pvec);
+
+	/* Bytes already stored were consumed from the socket, so always account for them. */
+	*ppos += written;
+
+	return written ? (ssize_t)written : status;
+}
+
+
static ssize_t sock_sendpage(struct file *file, struct page *page,
int offset, size_t size, loff_t *ppos, int more)
{
--
Evgeniy Polyakov