zero-copy receiving sendfile().
From: | Evgeniy Polyakov <s0mbre@tservice.net.ru> | |
To: | Evgeniy Polyakov <johnpol@2ka.mipt.ru> | |
Subject: | [PATCH] zero-copy receiving sendfile(). | |
Date: | Tue, 20 Sep 2005 10:26:30 +0400 | |
Cc: | "David S. Miller" <davem@davemloft.net>, netdev@vger.kernel.org |
Hello, developers. This patch adds zero-copy receiving sock_sendfile(), which can be used for socket->file data transfers. Tests with sendfile() on the server side and recv()/write() vs. sendfile() on the client side over TCP show about a 5-10% performance improvement and decreased CPU usage (a CPU usage graph is attached; it compares recv()/write() against receiving sendfile() over loopback, running with different numbers of threads on 2.6.13-rc6 and 2.6.14-rc1-git-friday). Oprofile does not show copy_from_user() anymore, which was third in the recv()/write() case. The implementation is built on top of the VFS layer: the system grabs a page and receives data directly into it using recvmsg(); when the page is filled, it is committed back to the VFS. Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru> diff --git a/fs/read_write.c b/fs/read_write.c --- a/fs/read_write.c +++ b/fs/read_write.c @@ -14,6 +14,15 @@ #include <linux/security.h> #include <linux/module.h> #include <linux/syscalls.h> +#include <linux/mm.h> +#include <linux/aio.h> +#include <linux/swap.h> +#include <linux/mman.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> + +#include <net/sock.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -667,8 +751,15 @@ static ssize_t do_sendfile(int out_fd, i if (!(out_file->f_mode & FMODE_WRITE)) goto fput_out; retval = -EINVAL; - if (!out_file->f_op || !out_file->f_op->sendpage) + if (!out_file->f_op) goto fput_out; + + if (!SOCKET_I(in_file->f_dentry->d_inode) && !out_file->f_op->sendpage) { + printk("%s: sock=%p, sendpage=%p.\n", __func__, + SOCKET_I(in_file->f_dentry->d_inode), out_file->f_op->sendpage); + goto fput_out; + } + out_inode = out_file->f_dentry->d_inode; retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); if (retval) @@ -685,7 +776,7 @@ static ssize_t do_sendfile(int out_fd, i retval = -EINVAL; if (unlikely(pos < 0)) goto fput_out; - if (unlikely(pos + count > max)) { + if (unlikely((unsigned long long)(pos + 
count) > (unsigned long long)max)) { retval = -EOVERFLOW; if (pos >= max) goto fput_out; diff --git a/mm/filemap.c b/mm/filemap.c --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1663,7 +1663,7 @@ EXPORT_SYMBOL(read_cache_page); * caller's lru-buffering pagevec. This function is specifically for * generic_file_write(). */ -static inline struct page * +struct page * __grab_cache_page(struct address_space *mapping, unsigned long index, struct page **cached_page, struct pagevec *lru_pvec) { @@ -1692,6 +1692,8 @@ repeat: return page; } +EXPORT_SYMBOL_GPL(__grab_cache_page); + /* * The logic we want is * diff --git a/net/socket.c b/net/socket.c --- a/net/socket.c +++ b/net/socket.c @@ -44,6 +44,7 @@ * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0) * Tigran Aivazian : Made listen(2) backlog sanity checks * protocol-independent + * Evgeniy Polyakov: Added sock_sendfile(). * * * This program is free software; you can redistribute it and/or @@ -84,6 +85,10 @@ #include <linux/compat.h> #include <linux/kmod.h> #include <linux/audit.h> +#include <linux/pagevec.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/writeback.h> #ifdef CONFIG_NET_RADIO #include <linux/wireless.h> /* Note : will define WIRELESS_EXT */ @@ -116,6 +121,7 @@ static ssize_t sock_writev(struct file * unsigned long count, loff_t *ppos); static ssize_t sock_sendpage(struct file *file, struct page *page, int offset, size_t size, loff_t *ppos, int more); +ssize_t sock_sendfile(struct file *file, loff_t *ppos, size_t count, read_actor_t actor, void *target); /* @@ -136,7 +142,8 @@ static struct file_operations socket_fil .fasync = sock_fasync, .readv = sock_readv, .writev = sock_writev, - .sendpage = sock_sendpage + .sendpage = sock_sendpage, + .sendfile = sock_sendfile, }; /* @@ -726,6 +733,117 @@ static ssize_t sock_aio_write(struct kio return __sock_sendmsg(iocb, sock, &x->async_msg, size); } +extern struct page * __grab_cache_page(struct address_space *mapping, unsigned long 
index, + struct page **cached_page, struct pagevec *lru_pvec); + +/* sock_sendfile(): receive up to @count bytes from the socket behind @in_file straight into page-cache pages of the file passed as @target, starting at file offset *@ppos. @actor is never referenced. On the normal exit path *@ppos is advanced by the number of bytes committed and that number is returned; the err_out_* exits return the error code held in err instead. */ ssize_t sock_sendfile(struct file *in_file, loff_t *ppos, size_t count, read_actor_t actor, void *target) +{ + struct socket *sock; + struct page *page; + int err = 0; + struct msghdr msg; + struct kvec iov; + size_t written = 0; + struct file *file = target; + struct address_space *mapping = file->f_mapping; + struct address_space_operations *a_ops = mapping->a_ops; + struct inode *inode = mapping->host; + loff_t pos = *ppos; + struct page *cached_page = NULL; + struct pagevec lru_pvec; + unsigned long index; + unsigned long page_offset; + unsigned long bytes, recv; + + if (!count) + return 0; + + /* pos and count are passed by pointer, so generic_write_checks() may adjust them for the target file before any page is touched. */ err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto err_out_exit; + + pagevec_init(&lru_pvec, 0); + + /* The source socket is taken from the inode behind in_file. */ sock = SOCKET_I(in_file->f_dentry->d_inode); + + /* One iteration per destination page: page_offset is the offset inside the page, index the page number, bytes the part of count that fits in this page. */ while (count) { + page_offset = (pos & (PAGE_CACHE_SIZE -1)); + index = pos >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - page_offset; + if (bytes > count) + bytes = count; + + page = __grab_cache_page(mapping, index, &cached_page, &lru_pvec); + if (!page) { + err = -ENOMEM; + goto err_out_exit; + } + + err = a_ops->prepare_write(file, page, page_offset, page_offset+bytes); + if (unlikely(err)) + goto err_out_unlock; + + recv = bytes; + + /* Fill the prepared span of the page: each pass receives into the not-yet-filled tail (recv bytes remaining) until the span is full, kernel_recvmsg() returns <= 0, or a signal is pending. */ while (recv) { + /* NOTE(review): GFP_NOIO is OR-ed into sk_allocation on every pass and never restored - confirm this is intended. */ sock->sk->sk_allocation |= GFP_NOIO; + /* NOTE(review): page_address() is used without a kmap(); presumably this breaks for highmem pages - TODO confirm. */ iov.iov_base = page_address(page)+page_offset+bytes-recv; + iov.iov_len = recv; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = NULL; + msg.msg_controllen = 0; + /* NOTE(review): msg_namelen is assigned a second time here; redundant. */ msg.msg_namelen = 0; + msg.msg_flags = MSG_NOSIGNAL; + + err = kernel_recvmsg(sock, &msg, &iov, 1, recv, 0); + + if (err > 0) + recv -= err; + + if (signal_pending(current)) + err = -ERESTARTSYS; + + if (err <= 0) + break; + } + + /* recv is the unreceived remainder, so bytes becomes the amount actually received into this page. */ bytes -= recv; + + flush_dcache_page(page); + /* NOTE(review): this assignment overwrites whatever error the receive loop left in err. */ err = a_ops->commit_write(file, page, page_offset, page_offset+bytes); + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + + /* Nothing was received for this page: stop and return what has been written so far. */ if (bytes == 0) + break; + 
if (err < 0) + goto err_out_exit; + + balance_dirty_pages_ratelimited(mapping); + + /* Account for the bytes committed to this page and move on to the next one. */ count -= bytes; + written += bytes; + pos += bytes; + } + + if (cached_page) + page_cache_release(cached_page); + + pagevec_lru_add(&lru_pvec); + *ppos += written; + + return written; + +err_out_unlock: + unlock_page(page); + page_cache_release(page); +err_out_exit: + + /* NOTE(review): the error exits return err without releasing cached_page (if one is held), without the pagevec_lru_add() done on the normal path, and without reporting bytes already written or updating *ppos. */ return err; +} + static ssize_t sock_sendpage(struct file *file, struct page *page, int offset, size_t size, loff_t *ppos, int more) { -- Evgeniy Polyakov