|
|
Subscribe / Log in / New account

zero-copy receiving sendfile().

From:  Evgeniy Polyakov <s0mbre@tservice.net.ru>
To:  Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Subject:  [PATCH] zero-copy receiving sendfile().
Date:  Tue, 20 Sep 2005 10:26:30 +0400
Cc:  "David S. Miller" <davem@davemloft.net>, netdev@vger.kernel.org

Hello, developers.

This patch adds zero-copy receiving sock_sendfile(), 
which can be used for socket->file data transfers.
Tests with sendfile() on the server side and
recv()/write() vs. sendfile() on the client side over TCP
show about a 5-10% performance improvement and
decreased CPU usage (a CPU usage graph is attached,
which contains a comparison of
recv()/write() vs. receiving sendfile()
tests over loopback running with different numbers of threads
on 2.6.13-rc6 and 2.6.14-rc1-git-friday).
Oprofile no longer shows copy_from_user(),
which ranked third in the recv()/write() case.

The implementation is built on top of the VFS layer:
the system grabs a page and receives data
directly into it using recvmsg(); when the page is filled,
it is committed back to the VFS.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>


diff --git a/fs/read_write.c b/fs/read_write.c
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -14,6 +14,15 @@
 #include <linux/security.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/mm.h>
+#include <linux/aio.h>
+#include <linux/swap.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+
+#include <net/sock.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -667,8 +751,15 @@ static ssize_t do_sendfile(int out_fd, i
 	if (!(out_file->f_mode & FMODE_WRITE))
 		goto fput_out;
 	retval = -EINVAL;
-	if (!out_file->f_op || !out_file->f_op->sendpage)
+	if (!out_file->f_op)
 		goto fput_out;
+	       	
+	if (!SOCKET_I(in_file->f_dentry->d_inode) && !out_file->f_op->sendpage) {
+		printk("%s: sock=%p, sendpage=%p.\n", __func__, 
+				SOCKET_I(in_file->f_dentry->d_inode), out_file->f_op->sendpage);
+		goto fput_out;
+	}
+	
 	out_inode = out_file->f_dentry->d_inode;
 	retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
 	if (retval)
@@ -685,7 +776,7 @@ static ssize_t do_sendfile(int out_fd, i
 	retval = -EINVAL;
 	if (unlikely(pos < 0))
 		goto fput_out;
-	if (unlikely(pos + count > max)) {
+	if (unlikely((unsigned long long)(pos + count) > (unsigned long long)max)) {
 		retval = -EOVERFLOW;
 		if (pos >= max)
 			goto fput_out;
diff --git a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1663,7 +1663,7 @@ EXPORT_SYMBOL(read_cache_page);
  * caller's lru-buffering pagevec.  This function is specifically for
  * generic_file_write().
  */
-static inline struct page *
+struct page *
 __grab_cache_page(struct address_space *mapping, unsigned long index,
 			struct page **cached_page, struct pagevec *lru_pvec)
 {
@@ -1692,6 +1692,8 @@ repeat:
 	return page;
 }
 
+EXPORT_SYMBOL_GPL(__grab_cache_page);
+
 /*
  * The logic we want is
  *
diff --git a/net/socket.c b/net/socket.c
--- a/net/socket.c
+++ b/net/socket.c
@@ -44,6 +44,7 @@
  *		Tigran Aivazian	:	sys_send(args) calls sys_sendto(args, NULL, 0)
  *		Tigran Aivazian	:	Made listen(2) backlog sanity checks 
  *					protocol-independent
+ *		Evgeniy Polyakov:	Added sock_sendfile().
  *
  *
  *		This program is free software; you can redistribute it and/or
@@ -84,6 +85,10 @@
 #include <linux/compat.h>
 #include <linux/kmod.h>
 #include <linux/audit.h>
+#include <linux/pagevec.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
 
 #ifdef CONFIG_NET_RADIO
 #include <linux/wireless.h>		/* Note : will define WIRELESS_EXT */
@@ -116,6 +121,7 @@ static ssize_t sock_writev(struct file *
 			  unsigned long count, loff_t *ppos);
 static ssize_t sock_sendpage(struct file *file, struct page *page,
 			     int offset, size_t size, loff_t *ppos, int more);
+ssize_t sock_sendfile(struct file *file, loff_t *ppos, size_t count, read_actor_t actor, void *target);
 
 
 /*
@@ -136,7 +142,8 @@ static struct file_operations socket_fil
 	.fasync =	sock_fasync,
 	.readv =	sock_readv,
 	.writev =	sock_writev,
-	.sendpage =	sock_sendpage
+	.sendpage =	sock_sendpage,
+	.sendfile =	sock_sendfile,
 };
 
 /*
@@ -726,6 +733,117 @@ static ssize_t sock_aio_write(struct kio
 	return __sock_sendmsg(iocb, sock, &x->async_msg, size);
 }
 
+extern struct page * __grab_cache_page(struct address_space *mapping, unsigned long index,
+			struct page **cached_page, struct pagevec *lru_pvec);
+
+ssize_t sock_sendfile(struct file *in_file, loff_t *ppos, size_t count, read_actor_t actor, void *target)
+{
+	struct socket *sock;
+	struct page *page;
+	int err = 0;
+	struct msghdr msg;
+	struct kvec iov;
+	size_t written = 0;
+	struct file *file = target;
+	struct address_space *mapping = file->f_mapping;
+	struct address_space_operations *a_ops = mapping->a_ops;
+	struct inode *inode = mapping->host;
+	loff_t pos = *ppos;
+	struct page *cached_page = NULL;
+	struct pagevec lru_pvec;
+	unsigned long index;
+	unsigned long page_offset;
+	unsigned long bytes, recv;
+
+	if (!count)
+		return 0;
+
+	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+	if (err)
+		goto err_out_exit;
+	
+	pagevec_init(&lru_pvec, 0);
+
+	sock = SOCKET_I(in_file->f_dentry->d_inode);
+	
+	while (count) {
+		page_offset = (pos & (PAGE_CACHE_SIZE -1));
+		index = pos >> PAGE_CACHE_SHIFT;
+		bytes = PAGE_CACHE_SIZE - page_offset;
+		if (bytes > count)
+			bytes = count;
+		
+		page = __grab_cache_page(mapping, index, &cached_page, &lru_pvec);
+		if (!page) {
+			err = -ENOMEM;
+			goto err_out_exit;
+		}
+
+		err = a_ops->prepare_write(file, page, page_offset, page_offset+bytes);
+		if (unlikely(err))
+			goto err_out_unlock;
+		
+		recv = bytes;
+
+		while (recv) {
+			sock->sk->sk_allocation |= GFP_NOIO;
+			iov.iov_base = page_address(page)+page_offset+bytes-recv;
+			iov.iov_len = recv;
+			msg.msg_name = NULL;
+			msg.msg_namelen = 0;
+			msg.msg_control = NULL;
+			msg.msg_controllen = 0;
+			msg.msg_namelen = 0;
+			msg.msg_flags = MSG_NOSIGNAL;
+			
+			err = kernel_recvmsg(sock, &msg, &iov, 1, recv, 0);
+
+			if (err > 0)
+				recv -= err;
+
+			if (signal_pending(current))
+				err = -ERESTARTSYS;
+
+			if (err <= 0)
+				break;
+		}
+
+		bytes -= recv;
+
+		flush_dcache_page(page);
+		err = a_ops->commit_write(file, page, page_offset, page_offset+bytes);
+		unlock_page(page);
+		mark_page_accessed(page);
+		page_cache_release(page);
+
+		if (bytes == 0)
+			break;
+		if (err < 0)
+			goto err_out_exit;
+
+		balance_dirty_pages_ratelimited(mapping);
+
+		count -= bytes;
+		written += bytes;
+		pos += bytes;
+	}
+	
+	if (cached_page)
+		page_cache_release(cached_page);
+	
+	pagevec_lru_add(&lru_pvec);
+	*ppos += written;
+
+	return written;
+
+err_out_unlock:
+	unlock_page(page);
+	page_cache_release(page);
+err_out_exit:
+	
+	return err;
+}
+
 static ssize_t sock_sendpage(struct file *file, struct page *page,
 			     int offset, size_t size, loff_t *ppos, int more)
 {


-- 
	Evgeniy Polyakov



Copyright © 2005, Eklektix, Inc.
Comments and public postings are copyrighted by their creators.
Linux is a registered trademark of Linus Torvalds