| From: |
| Evgeniy Polyakov <johnpol@2ka.mipt.ru> |
| To: |
| netdev@vger.kernel.org |
| Subject: |
| [PATCH] Kevent, network AIO system calls implementation. |
| Date: |
| Sat, 18 Feb 2006 19:11:40 +0300 |
| Cc: |
| David Miller <davem@davemloft.net>,
Jamal Hadi Salim <hadi@cyberus.ca> |
Hello developers.
I'm pleased to announce combined patch of kevent and network AIO subsytems,
which are implemented using three system calls:
* kevent_ctl(), which fully controls kevent subsystem.
It allows to add/modify/remove kevents and waiting for either given
number or at least one kevent is ready.
* aio_send().
This system call allows to asynchronously transfer userspace buffer
to the remote host using zero-copy mechanism.
* aio_recv().
Asynchronous receiving support.
Next step is to implement aio_sendfile() support.
Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 9b21a31..3ae20b2 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -294,3 +294,7 @@ ENTRY(sys_call_table)
.long sys_inotify_init
.long sys_inotify_add_watch
.long sys_inotify_rm_watch
+ .long sys_aio_recv
+ .long sys_aio_send
+ .long sys_aio_sendfile
+ .long sys_kevent_ctl
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index e0eb0c7..dc6924d 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -643,6 +643,10 @@ ia32_sys_call_table:
.quad sys_inotify_init
.quad sys_inotify_add_watch
.quad sys_inotify_rm_watch
+ .quad sys_aio_recv
+ .quad sys_aio_send
+ .quad sys_aio_sendfile
+ .quad sys_kevent_ctl
ia32_syscall_end:
.rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
.quad ni_syscall
diff --git a/fs/file_table.c b/fs/file_table.c
index c3a5e2f..7d73a2b 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -89,6 +89,9 @@ struct file *get_empty_filp(void)
if (security_file_alloc(f))
goto fail_sec;
+#ifdef CONFIG_KEVENT_POLL
+ kevent_storage_init(KEVENT_POLL, KEVENT_MASK_EMPTY, f, &f->st);
+#endif
eventpoll_init_file(f);
atomic_set(&f->f_count, 1);
f->f_uid = current->fsuid;
@@ -135,6 +138,9 @@ void fastcall __fput(struct file *file)
might_sleep();
fsnotify_close(file);
+#ifdef CONFIG_KEVENT_POLL
+ kevent_storage_fini(&file->st);
+#endif
/*
* The function eventpoll_release() should be the first called
* in the file cleanup chain.
diff --git a/fs/inode.c b/fs/inode.c
index d8d04bd..185bd67 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -22,6 +22,7 @@
#include <linux/cdev.h>
#include <linux/bootmem.h>
#include <linux/inotify.h>
+#include <linux/kevent.h>
/*
* This is needed for the following functions:
@@ -165,6 +166,9 @@ static struct inode *alloc_inode(struct
}
memset(&inode->u, 0, sizeof(inode->u));
inode->i_mapping = mapping;
+#if defined CONFIG_KEVENT_INODE || defined CONFIG_KEVENT_SOCKET
+ kevent_storage_init(KEVENT_INODE, KEVENT_MASK_EMPTY, inode, &inode->st);
+#endif
}
return inode;
}
@@ -173,6 +177,9 @@ void destroy_inode(struct inode *inode)
{
if (inode_has_buffers(inode))
BUG();
+#if defined CONFIG_KEVENT_INODE || defined CONFIG_KEVENT_SOCKET
+ kevent_storage_fini(&inode->st);
+#endif
security_inode_free(inode);
if (inode->i_sb->s_op->destroy_inode)
inode->i_sb->s_op->destroy_inode(inode);
diff --git a/include/asm-i386/socket.h b/include/asm-i386/socket.h
index 802ae76..3473f5c 100644
--- a/include/asm-i386/socket.h
+++ b/include/asm-i386/socket.h
@@ -49,4 +49,6 @@
#define SO_PEERSEC 31
+#define SO_ASYNC_SOCK 34
+
#endif /* _ASM_SOCKET_H */
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 0f92e78..2075f4b 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -299,8 +299,12 @@
#define __NR_inotify_init 291
#define __NR_inotify_add_watch 292
#define __NR_inotify_rm_watch 293
+#define __NR_aio_recv 294
+#define __NR_aio_send 295
+#define __NR_aio_sendfile 296
+#define __NR_kevent_ctl 297
-#define NR_syscalls 294
+#define NR_syscalls 298
/*
* user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-x86_64/ia32_unistd.h b/include/asm-x86_64/ia32_unistd.h
index d5166ec..9fba3a0 100644
--- a/include/asm-x86_64/ia32_unistd.h
+++ b/include/asm-x86_64/ia32_unistd.h
@@ -299,7 +299,8 @@
#define __NR_ia32_inotify_init 291
#define __NR_ia32_inotify_add_watch 292
#define __NR_ia32_inotify_rm_watch 293
+#define __NR_ia32_aio_recv 294
-#define IA32_NR_syscalls 294 /* must be > than biggest syscall! */
+#define IA32_NR_syscalls 295 /* must be > than biggest syscall! */
#endif /* _ASM_X86_64_IA32_UNISTD_H_ */
diff --git a/include/asm-x86_64/socket.h b/include/asm-x86_64/socket.h
index f2cdbea..1f31f86 100644
--- a/include/asm-x86_64/socket.h
+++ b/include/asm-x86_64/socket.h
@@ -49,4 +49,6 @@
#define SO_PEERSEC 31
+#define SO_ASYNC_SOCK 34
+
#endif /* _ASM_SOCKET_H */
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 2c42150..1f23f91 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -571,8 +571,16 @@ __SYSCALL(__NR_inotify_init, sys_inotify
__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch)
#define __NR_inotify_rm_watch 255
__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
+#define __NR_aio_recv 256
+__SYSCALL(__NR_aio_recv, sys_aio_recv)
+#define __NR_aio_send 257
+__SYSCALL(__NR_aio_send, sys_aio_send)
+#define __NR_aio_sendfile 258
+__SYSCALL(__NR_aio_sendfile, sys_aio_sendfile)
+#define __NR_kevent_ctl 259
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
-#define __NR_syscall_max __NR_inotify_rm_watch
+#define __NR_syscall_max __NR_kevent_ctl
#ifndef __NO_STUBS
/* user-visible error numbers are in the range -1 - -4095 */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cc35b6a..6566600 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -224,6 +224,9 @@ extern int dir_notify_enable;
#include <asm/atomic.h>
#include <asm/semaphore.h>
#include <asm/byteorder.h>
+#ifdef CONFIG_KEVENT
+#include <linux/kevent_storage.h>
+#endif
struct iovec;
struct nameidata;
@@ -483,6 +486,10 @@ struct inode {
struct semaphore inotify_sem; /* protects the watches list */
#endif
+#ifdef CONFIG_KEVENT_INODE
+ struct kevent_storage st;
+#endif
+
unsigned long i_state;
unsigned long dirtied_when; /* jiffies of first dirtying */
@@ -616,6 +623,9 @@ struct file {
struct list_head f_ep_links;
spinlock_t f_ep_lock;
#endif /* #ifdef CONFIG_EPOLL */
+#ifdef CONFIG_KEVENT_POLL
+ struct kevent_storage st;
+#endif
struct address_space *f_mapping;
};
extern spinlock_t files_lock;
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 03b8e79..85449ac 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -15,6 +15,7 @@
#include <linux/dnotify.h>
#include <linux/inotify.h>
+#include <linux/kevent.h>
/*
* fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
@@ -56,6 +57,7 @@ static inline void fsnotify_nameremove(s
isdir = IN_ISDIR;
dnotify_parent(dentry, DN_DELETE);
inotify_dentry_parent_queue_event(dentry, IN_DELETE|isdir, 0, dentry->d_name.name);
+ kevent_inode_notify_parent(dentry, KEVENT_INODE_REMOVE);
}
/*
@@ -65,6 +67,7 @@ static inline void fsnotify_inoderemove(
{
inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL);
inotify_inode_is_dead(inode);
+ kevent_inode_remove(inode);
}
/*
@@ -74,6 +77,7 @@ static inline void fsnotify_create(struc
{
inode_dir_notify(inode, DN_CREATE);
inotify_inode_queue_event(inode, IN_CREATE, 0, name);
+ kevent_inode_notify(inode, KEVENT_INODE_CREATE);
}
/*
@@ -83,6 +87,7 @@ static inline void fsnotify_mkdir(struct
{
inode_dir_notify(inode, DN_CREATE);
inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, name);
+ kevent_inode_notify(inode, KEVENT_INODE_CREATE);
}
/*
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 0000000..64926bc
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,268 @@
+/*
+ * kevent.h
+ *
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+
+/*
+ * Kevent request flags.
+ */
+
+#define KEVENT_REQ_ONESHOT 0x1 /* Process this event only once and then dequeue. */
+
+/*
+ * Kevent return flags.
+ */
+#define KEVENT_RET_BROKEN 0x1 /* Kevent is broken. */
+#define KEVENT_RET_DONE 0x2 /* Kevent processing was finished successfully. */
+
+/*
+ * Kevent type set.
+ */
+enum {
+ KEVENT_SOCKET = 0,
+ KEVENT_INODE,
+ KEVENT_TIMER,
+ KEVENT_POLL,
+ KEVENT_NAIO,
+
+ KEVENT_MAX,
+};
+
+/*
+ * Per-type event sets.
+ * Number of per-event sets should be exactly as number of kevent types.
+ */
+
+/*
+ * Timer events.
+ */
+enum {
+ KEVENT_TIMER_FIRED = 0x1,
+};
+
+/*
+ * Socket/network asynchronous IO events.
+ */
+enum {
+ KEVENT_SOCKET_RECV = 0x1,
+ KEVENT_SOCKET_ACCEPT = 0x2,
+ KEVENT_SOCKET_SEND = 0x4,
+};
+
+/*
+ * Inode events.
+ */
+enum {
+ KEVENT_INODE_CREATE = 0x1,
+ KEVENT_INODE_REMOVE = 0x2,
+};
+
+/*
+ * Poll events.
+ */
+enum {
+ KEVENT_POLL_POLLIN = 0x0001,
+ KEVENT_POLL_POLLPRI = 0x0002,
+ KEVENT_POLL_POLLOUT = 0x0004,
+ KEVENT_POLL_POLLERR = 0x0008,
+ KEVENT_POLL_POLLHUP = 0x0010,
+ KEVENT_POLL_POLLNVAL = 0x0020,
+
+ KEVENT_POLL_POLLRDNORM = 0x0040,
+ KEVENT_POLL_POLLRDBAND = 0x0080,
+ KEVENT_POLL_POLLWRNORM = 0x0100,
+ KEVENT_POLL_POLLWRBAND = 0x0200,
+ KEVENT_POLL_POLLMSG = 0x0400,
+ KEVENT_POLL_POLLREMOVE = 0x1000,
+};
+
+#define KEVENT_MASK_ALL 0xffffffff /* Mask of all possible event values. */
+#define KEVENT_MASK_EMPTY 0x0 /* Empty mask of ready events. */
+
+struct kevent_id
+{
+ __u32 raw[2];
+};
+
+struct ukevent
+{
+ struct kevent_id id; /* Id of this request, e.g. socket number, file descriptor and so on...
*/
+ __u32 type; /* Event type, e.g. KEVENT_SOCK, KEVENT_INODE, KEVENT_TIMER and so on... */
+ __u32 event; /* Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED... */
+ __u32 req_flags; /* Per-event request flags */
+ __u32 ret_flags; /* Per-event return flags */
+ __u32 ret_data[2]; /* Event return data. Event originator fills it with anything it likes. */
+ union {
+ __u32 user[2]; /* User's data. It is not used, just copied to/from user. */
+ void *ptr;
+ };
+};
+
+enum {
+ KEVENT_CTL_ADD = 0,
+ KEVENT_CTL_REMOVE,
+ KEVENT_CTL_MODIFY,
+ KEVENT_CTL_WAIT,
+ KEVENT_CTL_INIT,
+};
+
+struct kevent_user_control
+{
+ unsigned int cmd; /* Control command, e.g. KEVENT_ADD, KEVENT_REMOVE... */
+ unsigned int num; /* Number of ukevents this strucutre controls. */
+ unsigned int timeout; /* Timeout in milliseconds waiting for "num" events to become ready. */
+};
+
+#define KEVENT_USER_SYMBOL 'K'
+#define KEVENT_USER_CTL _IOWR(KEVENT_USER_SYMBOL, 0, struct kevent_user_control)
+#define KEVENT_USER_WAIT _IOWR(KEVENT_USER_SYMBOL, 1, struct kevent_user_control)
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/kevent_storage.h>
+#include <asm/semaphore.h>
+
+struct inode;
+struct dentry;
+struct sock;
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+struct kevent
+{
+ struct ukevent event;
+ spinlock_t lock; /* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+
+ struct list_head kevent_entry; /* Entry of user's queue. */
+ struct list_head storage_entry; /* Entry of origin's queue. */
+ struct list_head ready_entry; /* Entry of user's ready. */
+
+ struct kevent_user *user; /* User who requested this kevent. */
+ struct kevent_storage *st; /* Kevent container. */
+
+ kevent_callback_t callback; /* Is called each time new event has been caught. */
+ kevent_callback_t enqueue; /* Is called each time new event is queued. */
+ kevent_callback_t dequeue; /* Is called each time event is dequeued. */
+
+ void *priv; /* Private data for different storages.
+ * poll()/select storage has a list of wait_queue_t containers
+ * for each ->poll() { poll_wait()' } here.
+ */
+};
+
+#define KEVENT_HASH_MASK 0xff
+
+struct kevent_list
+{
+ struct list_head kevent_list; /* List of all kevents. */
+ spinlock_t kevent_lock; /* Protects all manipulations with queue of kevents. */
+};
+
+struct kevent_user
+{
+ struct kevent_list kqueue[KEVENT_HASH_MASK+1];
+ unsigned int kevent_num; /* Number of queued kevents. */
+
+ struct list_head ready_list; /* List of ready kevents. */
+ unsigned int ready_num; /* Number of ready kevents. */
+ spinlock_t ready_lock; /* Protects all manipulations with ready queue. */
+
+ unsigned int max_ready_num; /* Requested number of kevents. */
+
+ struct semaphore ctl_mutex; /* Protects against simultaneous kevent_user control manipulations.
*/
+ struct semaphore wait_mutex; /* Protects against simultaneous kevent_user waits. */
+ wait_queue_head_t wait; /* Wait until some events are ready. */
+
+ atomic_t refcnt; /* Reference counter, increased for each new kevent. */
+};
+
+#define KEVENT_MAX_REQUESTS PAGE_SIZE/sizeof(struct kevent)
+
+struct kevent *kevent_alloc(gfp_t mask);
+void kevent_free(struct kevent *k);
+int kevent_enqueue(struct kevent *k);
+int kevent_dequeue(struct kevent *k);
+int kevent_init(struct kevent *k);
+void kevent_requeue(struct kevent *k);
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k);
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k);
+
+#define list_for_each_entry_reverse_safe(pos, n, head, member) \
+ for (pos = list_entry((head)->prev, typeof(*pos), member), \
+ n = list_entry(pos->member.prev, typeof(*pos), member); \
+ prefetch(pos->member.prev), &pos->member != (head); \
+ pos = n, n = list_entry(pos->member.prev, typeof(*pos), member))
+
+int kevent_break(struct kevent *k);
+int kevent_init(struct kevent *k);
+
+int kevent_init_socket(struct kevent *k);
+int kevent_init_inode(struct kevent *k);
+int kevent_init_timer(struct kevent *k);
+int kevent_init_poll(struct kevent *k);
+int kevent_init_naio(struct kevent *k);
+
+void kevent_storage_ready(struct kevent_storage *st, kevent_callback_t ready_callback, u32
event);
+int kevent_storage_init(__u32 type, __u32 event, void *origin, struct kevent_storage *st);
+void kevent_storage_fini(struct kevent_storage *st);
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u);
+
+#ifdef CONFIG_KEVENT_INODE
+void kevent_inode_notify(struct inode *inode, u32 event);
+void kevent_inode_notify_parent(struct dentry *dentry, u32 event);
+void kevent_inode_remove(struct inode *inode);
+#else
+static inline void kevent_inode_notify(struct inode *inode, u32 event)
+{
+}
+static inline void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
+{
+}
+static inline void kevent_inode_remove(struct inode *inode)
+{
+}
+#endif /* CONFIG_KEVENT_INODE */
+#ifdef CONFIG_KEVENT_SOCKET
+
+enum {
+ KEVENT_SOCKET_FLAGS_ASYNC = 0,
+ KEVENT_SOCKET_FLAGS_INUSE,
+};
+
+void kevent_socket_notify(struct sock *sock, u32 event);
+int kevent_socket_dequeue(struct kevent *k);
+int kevent_socket_enqueue(struct kevent *k);
+#define sock_async(__sk) test_bit(KEVENT_SOCKET_FLAGS_ASYNC, &__sk->sk_kevent_flags)
+#else
+static inline void kevent_socket_notify(struct sock *sock, u32 event)
+{
+}
+#define sock_async(__sk) 0
+#endif
+#endif /* __KERNEL__ */
+#endif /* __KEVENT_H */
diff --git a/include/linux/kevent_storage.h b/include/linux/kevent_storage.h
new file mode 100644
index 0000000..7c68170
--- /dev/null
+++ b/include/linux/kevent_storage.h
@@ -0,0 +1,21 @@
+#ifndef __KEVENT_STORAGE_H
+#define __KEVENT_STORAGE_H
+
+struct kevent_storage
+{
+ __u32 type; /* Event type, e.g. KEVENT_SOCK, KEVENT_INODE, KEVENT_TIMER and so on... */
+ __u32 event; /* Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED,
+ * which were NOT updated by any kevent in origin's queue,
+ * i.e. when new event happens and there are no requests in
+ * origin's queue for that event, it will be placed here.
+ * New events are ORed with old one, so when new kevent is being added
+ * into origin's queue, it just needs to check if requested event
+ * is in this mask, and if so, return positive value from ->enqueu()
+ */
+ void *origin; /* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */
+ struct list_head list; /* List of queued kevents. */
+ unsigned int qlen; /* Number of queued kevents. */
+ spinlock_t lock; /* Protects users queue. */
+};
+
+#endif /* __KEVENT_STORAGE_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8c5d600..8dbd67d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1232,6 +1232,8 @@ extern struct sk_buff *skb_recv_datagram
int noblock, int *err);
extern unsigned int datagram_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait);
+extern int skb_copy_datagram(const struct sk_buff *from,
+ int offset, void *dst, int size);
extern int skb_copy_datagram_iovec(const struct sk_buff *from,
int offset, struct iovec *to,
int size);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c7007b1..5e1e872 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -512,4 +512,8 @@ asmlinkage long sys_ioprio_get(int which
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
unsigned long maxnode);
+asmlinkage long sys_aio_recv(int ctl_fd, int s, void __user *buf, size_t size, unsigned flags);
+asmlinkage long sys_aio_send(int ctl_fd, int s, void __user *buf, size_t size, unsigned flags);
+asmlinkage long sys_aio_sendfile(int ctl_fd, int fd, int s, size_t size, unsigned flags);
+asmlinkage long sys_kevent_ctl(int ctl_fd, void __user *buf);
#endif
diff --git a/include/net/sock.h b/include/net/sock.h
index 982b4ec..d571c3f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -48,6 +48,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h> /* struct sk_buff */
#include <linux/security.h>
+#include <linux/kevent.h>
#include <linux/filter.h>
@@ -212,6 +213,9 @@ struct sock {
int sk_route_caps;
unsigned long sk_flags;
unsigned long sk_lingertime;
+#ifdef CONFIG_KEVENT_SOCKET
+ unsigned long sk_kevent_flags;
+#endif
/*
* The backlog queue is special, it is always used with
* the per-socket spinlock held and requires low latency
@@ -444,6 +448,21 @@ static inline int sk_stream_memory_free(
extern void sk_stream_rfree(struct sk_buff *skb);
+struct socket_alloc {
+ struct socket socket;
+ struct inode vfs_inode;
+};
+
+static inline struct socket *SOCKET_I(struct inode *inode)
+{
+ return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
+}
+
+static inline struct inode *SOCK_INODE(struct socket *socket)
+{
+ return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
+}
+
static inline void sk_stream_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
skb->sk = sk;
@@ -470,6 +489,7 @@ static inline void sk_add_backlog(struct
sk->sk_backlog.tail = skb;
}
skb->next = NULL;
+ kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
}
#define sk_wait_event(__sk, __timeo, __condition) \
@@ -532,6 +552,12 @@ struct proto {
int (*backlog_rcv) (struct sock *sk,
struct sk_buff *skb);
+
+ int (*async_recv) (struct sock *sk,
+ void *dst, size_t size);
+ int (*async_send) (struct sock *sk,
+ struct page **pages, unsigned int poffset,
+ size_t size);
/* Keeping track of sk's, looking them up, and port selection methods. */
void (*hash)(struct sock *sk);
@@ -664,21 +690,6 @@ static inline struct kiocb *siocb_to_kio
return si->kiocb;
}
-struct socket_alloc {
- struct socket socket;
- struct inode vfs_inode;
-};
-
-static inline struct socket *SOCKET_I(struct inode *inode)
-{
- return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
-}
-
-static inline struct inode *SOCK_INODE(struct socket *socket)
-{
- return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
-}
-
extern void __sk_stream_mem_reclaim(struct sock *sk);
extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d78025f..8788625 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -392,6 +392,8 @@ extern int tcp_setsockopt(struct sock
int optname, char __user *optval,
int optlen);
extern void tcp_set_keepalive(struct sock *sk, int val);
+extern int tcp_async_recv(struct sock *sk, void *dst, size_t size);
+extern int tcp_async_send(struct sock *sk, struct page **pages, unsigned int poffset, size_t
size);
extern int tcp_recvmsg(struct kiocb *iocb, struct sock *sk,
struct msghdr *msg,
size_t len, int nonblock,
@@ -950,6 +952,7 @@ static __inline__ int tcp_prequeue(struc
tp->ucopy.memory = 0;
} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
wake_up_interruptible(sk->sk_sleep);
+ kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
if (!inet_csk_ack_scheduled(sk))
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
(3 * TCP_RTO_MIN) / 4,
diff --git a/init/Kconfig b/init/Kconfig
index 9fc0759..4a4f26c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -224,6 +224,8 @@ config KOBJECT_UEVENT
Say Y, unless you are building a system requiring minimal memory
consumption.
+source "kernel/kevent/Kconfig"
+
config IKCONFIG
bool "Kernel .config support"
---help---
diff --git a/kernel/Makefile b/kernel/Makefile
index 4f5a145..7c5aa88 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_KEVENT) += kevent/
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/kevent/Kconfig b/kernel/kevent/Kconfig
new file mode 100644
index 0000000..a52a86f
--- /dev/null
+++ b/kernel/kevent/Kconfig
@@ -0,0 +1,39 @@
+config KEVENT
+ bool "Kernel event notification mechanism"
+ help
+ This option enables event queue mechanism.
+ It can be used as replacement for poll()/select(), AIO callback invocations,
+ advanced timer notifications and other kernel object status changes.
+
+config KEVENT_SOCKET
+ bool "Kernel event notifications for sockets"
+ depends on NET && KEVENT
+ help
+ This option enables notifications through KEVENT subsystem of
+ sockets operations, like new packet receiving conditions, ready for accept
+ conditions and so on.
+
+config KEVENT_INODE
+ bool "Kernel event notifications for inodes"
+ depends on KEVENT
+ help
+ This option enables notifications through KEVENT subsystem of
+ inode operations, like file creation, removal and so on.
+
+config KEVENT_TIMER
+ bool "Kernel event notifications for timers"
+ depends on KEVENT
+ help
+ This option allows to use timers through KEVENT subsystem.
+
+config KEVENT_POLL
+ bool "Kernel event notifications for poll()/select()"
+ depends on KEVENT
+ help
+ This option allows to use kevent subsystem for poll()/select() notifications.
+
+config KEVENT_NAIO
+ bool "Network asynchronous IO"
+ depends on KEVENT_SOCKET
+ help
+ This option enables kevent based network asynchronous IO subsystem.
diff --git a/kernel/kevent/Makefile b/kernel/kevent/Makefile
new file mode 100644
index 0000000..2bc7135
--- /dev/null
+++ b/kernel/kevent/Makefile
@@ -0,0 +1,6 @@
+obj-y := kevent.o kevent_user.o kevent_init.o
+obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o
+obj-$(CONFIG_KEVENT_INODE) += kevent_inode.o
+obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o
+obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o
+obj-$(CONFIG_KEVENT_NAIO) += kevent_naio.o
diff --git a/kernel/kevent/kevent.c b/kernel/kevent/kevent.c
new file mode 100644
index 0000000..5c3a141
--- /dev/null
+++ b/kernel/kevent/kevent.c
@@ -0,0 +1,301 @@
+/*
+ * kevent.c
+ *
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/kevent.h>
+
+static kmem_cache_t *kevent_cache;
+
+/*
+ * Attempts to add an event into appropriate origin's queue.
+ * Returns positive value if this event is ready immediately,
+ * negative value in case of error and zero if event has been queued.
+ * ->enqueue() callback must increase origin's reference counter.
+ */
+int kevent_enqueue(struct kevent *k)
+{
+ if (k->event.type >= KEVENT_MAX)
+ return -E2BIG;
+
+ if (!k->enqueue) {
+ kevent_break(k);
+ return -EINVAL;
+ }
+
+ return k->enqueue(k);
+}
+
+/*
+ * Remove event from the appropriate queue.
+ * ->dequeue() callback must decrease origin's reference counter.
+ */
+int kevent_dequeue(struct kevent *k)
+{
+ if (k->event.type >= KEVENT_MAX)
+ return -E2BIG;
+
+ if (!k->dequeue) {
+ kevent_break(k);
+ return -EINVAL;
+ }
+
+ return k->dequeue(k);
+}
+
+/*
+ * Must be called before event is going to be added into some origin's queue.
+ * Initializes ->enqueue(), ->dequeue() and ->callback() callbacks.
+ * If failed, kevent should not be used or kevent_enqueue() will fail to add
+ * this kevent into origin's queue with setting
+ * KEVENT_RET_BROKEN flag in kevent->event.ret_flags.
+ */
+int kevent_init(struct kevent *k)
+{
+ int err;
+
+ spin_lock_init(&k->lock);
+ k->kevent_entry.next = LIST_POISON1;
+ k->storage_entry.next = LIST_POISON1;
+ k->ready_entry.next = LIST_POISON1;
+
+ if (k->event.type >= KEVENT_MAX)
+ return -E2BIG;
+
+ switch (k->event.type) {
+ case KEVENT_NAIO:
+ err = kevent_init_naio(k);
+ break;
+ case KEVENT_SOCKET:
+ err = kevent_init_socket(k);
+ break;
+ case KEVENT_INODE:
+ err = kevent_init_inode(k);
+ break;
+ case KEVENT_TIMER:
+ err = kevent_init_timer(k);
+ break;
+ case KEVENT_POLL:
+ err = kevent_init_poll(k);
+ break;
+ default:
+ err = -ENODEV;
+ }
+
+ return err;
+}
+
+/*
+ * Checks if "event" is requested by given kevent and if so setup kevent's ret_data.
+ * Also updates storage's mask of pending events.
+ * Returns set of events requested by user which are ready.
+ */
+static inline u32 kevent_set_event(struct kevent_storage *st, struct kevent *k, u32 event)
+{
+ u32 ev = event & k->event.event;
+
+ st->event &= ~ev;
+#if 0
+ if (ev)
+ k->event.ret_data[1] = ev;
+#endif
+ return ev;
+}
+
+/*
+ * Called from ->enqueue() callback when reference counter for given
+ * origin (socket, inode...) has been increased.
+ */
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k)
+{
+ unsigned long flags;
+ u32 ev;
+
+ k->st = st;
+ spin_lock_irqsave(&st->lock, flags);
+
+ spin_lock(&k->lock);
+ ev = kevent_set_event(st, k, st->event);
+ spin_unlock(&k->lock);
+
+ list_add_tail(&k->storage_entry, &st->list);
+ st->qlen++;
+
+ spin_unlock_irqrestore(&st->lock, flags);
+#if 0
+ if (ev) {
+ spin_lock_irqsave(&k->user->ready_lock, flags);
+ list_add_tail(&k->ready_entry, &k->user->ready_list);
+ k->user->ready_num++;
+ spin_unlock_irqrestore(&k->user->ready_lock, flags);
+ wake_up(&k->user->wait);
+ }
+#endif
+ return !!ev;
+}
+
+/*
+ * Dequeue kevent from origin's queue.
+ * It does not decrease origin's reference counter in any way and must be called before it,
+ * so storage itself must be valid.
+ * It is called from ->dequeue() callback.
+ */
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&st->lock, flags);
+ if (k->storage_entry.next != LIST_POISON1) {
+ list_del(&k->storage_entry);
+ st->qlen--;
+ }
+ spin_unlock_irqrestore(&st->lock, flags);
+}
+
+static void __kevent_requeue(struct kevent *k, u32 event)
+{
+ int err, rem = 0;
+
+ wake_up(&k->user->wait);
+
+ err = k->callback(k);
+
+ spin_lock(&k->lock);
+ if (err > 0) {
+ k->event.ret_flags |= KEVENT_RET_DONE;
+ } else if (err < 0) {
+ k->event.ret_flags |= KEVENT_RET_BROKEN;
+ k->event.ret_flags |= KEVENT_RET_DONE;
+ }
+ rem = (k->event.req_flags & KEVENT_REQ_ONESHOT);
+ spin_unlock(&k->lock);
+
+ if (err) {
+ if (rem) {
+ list_del(&k->storage_entry);
+ k->st->qlen--;
+ }
+ spin_lock(&k->user->ready_lock);
+ if (k->ready_entry.next == LIST_POISON1) {
+ list_add_tail(&k->ready_entry, &k->user->ready_list);
+ k->user->ready_num++;
+ }
+ spin_unlock(&k->user->ready_lock);
+ }
+}
+
+void kevent_requeue(struct kevent *k)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&k->st->lock, flags);
+ __kevent_requeue(k, 0);
+ spin_unlock_irqrestore(&k->st->lock, flags);
+}
+
+/*
+ * Called each time some activity in origin (socket, inode...) is noticed.
+ */
+void kevent_storage_ready(struct kevent_storage *st, kevent_callback_t ready_callback, u32 event)
+{
+ //unsigned long flags;
+ struct kevent *k, *n;
+ unsigned int qlen;
+ u32 ev = 0;
+
+ spin_lock_bh(&st->lock);
+ st->event |= event;
+ qlen = st->qlen;
+
+ if (qlen) {
+ list_for_each_entry_safe(k, n, &st->list, storage_entry) {
+ if (qlen-- <= 0)
+ break;
+
+ if (ready_callback)
+ ready_callback(k);
+
+ ev |= (event & k->event.event);
+
+ if (event & k->event.event)
+ __kevent_requeue(k, event);
+ }
+ }
+
+ st->event &= ~ev;
+ spin_unlock_bh(&st->lock);
+}
+
+int kevent_storage_init(__u32 type, __u32 event, void *origin, struct kevent_storage *st)
+{
+ spin_lock_init(&st->lock);
+ st->type = type;
+ st->event = event;
+ st->origin = origin;
+ st->qlen = 0;
+ INIT_LIST_HEAD(&st->list);
+ return 0;
+}
+
+void kevent_storage_fini(struct kevent_storage *st)
+{
+ kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL);
+}
+
+struct kevent *kevent_alloc(gfp_t mask)
+{
+ struct kevent *k;
+
+ if (kevent_cache)
+ k = kmem_cache_alloc(kevent_cache, mask);
+ else
+ k = kzalloc(sizeof(struct kevent), mask);
+
+ return k;
+}
+
+void kevent_free(struct kevent *k)
+{
+ if (kevent_cache)
+ kmem_cache_free(kevent_cache, k);
+ else
+ kfree(k);
+}
+
+int __init kevent_sys_init(void)
+{
+ int err = 0;
+
+ kevent_cache = kmem_cache_create("kevent_cache", sizeof(struct kevent),
+ 0, 0, NULL, NULL);
+ if (!kevent_cache)
+ err = -ENOMEM;
+
+ return err;
+}
+
+late_initcall(kevent_sys_init);
diff --git a/kernel/kevent/kevent_init.c b/kernel/kevent/kevent_init.c
new file mode 100644
index 0000000..74659df
--- /dev/null
+++ b/kernel/kevent/kevent_init.c
@@ -0,0 +1,77 @@
+/*
+ * kevent_init.c
+ *
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/kevent.h>
+
+int kevent_break(struct kevent *k)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&k->lock, flags);
+ k->event.ret_flags |= KEVENT_RET_BROKEN;
+ spin_unlock_irqrestore(&k->lock, flags);
+ return 0;
+}
+
+#ifndef CONFIG_KEVENT_SOCKET
+int kevent_init_socket(struct kevent *k)
+{
+ kevent_break(k);
+ return -ENODEV;
+}
+#endif
+
+#ifndef CONFIG_KEVENT_INODE
+int kevent_init_inode(struct kevent *k)
+{
+ kevent_break(k);
+ return -ENODEV;
+}
+#endif
+
+#ifndef CONFIG_KEVENT_TIMER
+int kevent_init_timer(struct kevent *k)
+{
+ kevent_break(k);
+ return -ENODEV;
+}
+#endif
+
+#ifndef CONFIG_KEVENT_POLL
+int kevent_init_poll(struct kevent *k)
+{
+ kevent_break(k);
+ return -ENODEV;
+}
+#endif
+
+#ifndef CONFIG_KEVENT_NAIO
+int kevent_init_naio(struct kevent *k)
+{
+ kevent_break(k);
+ return -ENODEV;
+}
+#endif
diff --git a/kernel/kevent/kevent_inode.c b/kernel/kevent/kevent_inode.c
new file mode 100644
index 0000000..b6d12f6
--- /dev/null
+++ b/kernel/kevent/kevent_inode.c
@@ -0,0 +1,110 @@
+/*
+ * kevent_inode.c
+ *
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/fs.h>
+
+static int kevent_inode_enqueue(struct kevent *k)
+{
+ struct file *file;
+ struct inode *inode;
+ int err, fput_needed;
+
+ file = fget_light(k->event.id.raw[0], &fput_needed);
+ if (!file)
+ return -ENODEV;
+
+ err = -EINVAL;
+ if (!file->f_dentry || !file->f_dentry->d_inode)
+ goto err_out_fput;
+
+ inode = igrab(file->f_dentry->d_inode);
+ if (!inode)
+ goto err_out_fput;
+
+ err = kevent_storage_enqueue(&inode->st, k);
+ if (err < 0)
+ goto err_out_iput;
+
+ fput_light(file, fput_needed);
+ return 0;
+
+err_out_iput:
+ iput(inode);
+err_out_fput:
+ fput_light(file, fput_needed);
+ return err;
+}
+
+static int kevent_inode_dequeue(struct kevent *k)
+{
+ struct inode *inode = k->st->origin;
+
+ kevent_storage_dequeue(k->st, k);
+ iput(inode);
+
+ return 0;
+}
+
+static int kevent_inode_callback(struct kevent *k)
+{
+ return 1;
+}
+
+int kevent_init_inode(struct kevent *k)
+{
+ k->enqueue = &kevent_inode_enqueue;
+ k->dequeue = &kevent_inode_dequeue;
+ k->callback = &kevent_inode_callback;
+ return 0;
+}
+
+void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
+{
+ struct dentry *parent;
+ struct inode *inode;
+
+ spin_lock(&dentry->d_lock);
+ parent = dentry->d_parent;
+ inode = parent->d_inode;
+
+ dget(parent);
+ spin_unlock(&dentry->d_lock);
+ kevent_inode_notify(inode, KEVENT_INODE_REMOVE);
+ dput(parent);
+}
+
+void kevent_inode_remove(struct inode *inode)
+{
+ kevent_storage_fini(&inode->st);
+}
+
+void kevent_inode_notify(struct inode *inode, u32 event)
+{
+ kevent_storage_ready(&inode->st, NULL, event);
+}
diff --git a/kernel/kevent/kevent_naio.c b/kernel/kevent/kevent_naio.c
new file mode 100644
index 0000000..004b292
--- /dev/null
+++ b/kernel/kevent/kevent_naio.c
@@ -0,0 +1,238 @@
+/*
+ * kevent_naio.c
+ *
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+static int kevent_naio_enqueue(struct kevent *k);
+static int kevent_naio_dequeue(struct kevent *k);
+static int kevent_naio_callback(struct kevent *k);
+
+static int kevent_naio_setup_aio(int ctl_fd, int s, void __user *buf, size_t size, u32 event)
+{
+ struct kevent_user *u;
+ struct file *file;
+ int err, fput_needed;
+ struct ukevent uk;
+
+ file = fget_light(ctl_fd, &fput_needed);
+ if (!file)
+ return -ENODEV;
+
+ u = file->private_data;
+ if (!u) {
+ err = -EINVAL;
+ goto err_out_fput;
+ }
+
+ memset(&uk, 0, sizeof(struct ukevent));
+ uk.type = KEVENT_NAIO;
+ uk.ptr = buf;
+ uk.req_flags = KEVENT_REQ_ONESHOT;
+ uk.event = event;
+ uk.id.raw[0] = s;
+ uk.id.raw[1] = size;
+
+ err = kevent_user_add_ukevent(&uk, u);
+
+err_out_fput:
+ fput_light(file, fput_needed);
+ return err;
+}
+
+asmlinkage long sys_aio_recv(int ctl_fd, int s, void __user *buf, size_t size, unsigned flags)
+{
+ return kevent_naio_setup_aio(ctl_fd, s, buf, size, KEVENT_SOCKET_RECV);
+}
+
+asmlinkage long sys_aio_send(int ctl_fd, int s, void __user *buf, size_t size, unsigned flags)
+{
+ return kevent_naio_setup_aio(ctl_fd, s, buf, size, KEVENT_SOCKET_SEND);
+}
+
+asmlinkage long sys_aio_sendfile(int ctl_fd, int fd, int s, size_t size, unsigned flags)
+{
+ return 0;
+}
+
+static int kevent_naio_enqueue(struct kevent *k)
+{
+ int err, i;
+ struct page **page;
+ void *addr;
+ unsigned int size = k->event.id.raw[1];
+ int num = size/PAGE_SIZE + 1;
+ struct file *file;
+ struct sock *sk = NULL;
+ int fput_needed;
+
+ file = fget_light(k->event.id.raw[0], &fput_needed);
+ if (!file)
+ return -ENODEV;
+
+ err = -EINVAL;
+ if (!file->f_dentry || !file->f_dentry->d_inode)
+ goto err_out_fput;
+
+ sk = SOCKET_I(file->f_dentry->d_inode)->sk;
+
+ err = -ESOCKTNOSUPPORT;
+ if (!sk || !sk->sk_prot->async_recv || !sk->sk_prot->async_send ||
+ !test_bit(KEVENT_SOCKET_FLAGS_ASYNC, &sk->sk_kevent_flags))
+ goto err_out_fput;
+
+ page = kmalloc(sizeof(struct page *) * num, GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ addr = k->event.ptr;
+
+ down_read(¤t->mm->mmap_sem);
+ err = get_user_pages(current, current->mm, (unsigned long)addr, num, 1, 0, page, NULL);
+ up_read(¤t->mm->mmap_sem);
+ if (err <= 0)
+ goto err_out_free;
+ num = err;
+
+ k->event.ret_data[0] = num;
+ k->event.ret_data[1] = offset_in_page(k->event.ptr);
+ k->priv = page;
+
+ sk->sk_allocation = GFP_ATOMIC;
+
+ spin_lock_bh(&sk->sk_lock.slock);
+ err = kevent_socket_enqueue(k);
+ spin_unlock_bh(&sk->sk_lock.slock);
+ if (err)
+ goto err_out_put_pages;
+
+ fput_light(file, fput_needed);
+
+ return err;
+
+err_out_put_pages:
+ for (i=0; i<num; ++i)
+ page_cache_release(page[i]);
+err_out_free:
+ kfree(page);
+err_out_fput:
+ fput_light(file, fput_needed);
+
+ return err;
+}
+
+static int kevent_naio_dequeue(struct kevent *k)
+{
+ int err, i, num;
+ struct page **page = k->priv;
+
+ num = k->event.ret_data[0];
+
+ err = kevent_socket_dequeue(k);
+
+ for (i=0; i<num; ++i)
+ page_cache_release(page[i]);
+
+ kfree(k->priv);
+ k->priv = NULL;
+
+ return err;
+}
+
+static int kevent_naio_callback(struct kevent *k)
+{
+ struct inode *inode = k->st->origin;
+ struct sock *sk = SOCKET_I(inode)->sk;
+ unsigned int size = k->event.id.raw[1];
+ unsigned int off = k->event.ret_data[1];
+ struct page **pages = k->priv, *page;
+ int ready = 0, num = off/PAGE_SIZE, err = 0, send = 0;
+ void *ptr, *optr;
+ unsigned int len;
+
+ if (!test_bit(KEVENT_SOCKET_FLAGS_ASYNC, &sk->sk_kevent_flags))
+ return -1;
+
+ if (k->event.event & KEVENT_SOCKET_SEND)
+ send = 1;
+ else if (!(k->event.event & KEVENT_SOCKET_RECV))
+ return -EINVAL;
+
+ /*
+ * sk_prot->async_*() can return either number of bytes processed,
+ * or negative error value, or zero if socket is closed.
+ */
+
+ if (!send) {
+ page = pages[num];
+
+ optr = ptr = kmap_atomic(page, KM_IRQ0);
+ if (!ptr)
+ return -ENOMEM;
+
+ ptr += off % PAGE_SIZE;
+ len = min_t(unsigned int, PAGE_SIZE - (ptr - optr), size);
+
+ err = sk->sk_prot->async_recv(sk, ptr, len);
+
+ kunmap_atomic(optr, KM_IRQ0);
+ } else {
+ len = size;
+ err = sk->sk_prot->async_send(sk, pages, off, size);
+ }
+
+ if (err > 0) {
+ num++;
+ size -= err;
+ off += err;
+ }
+
+ k->event.ret_data[1] = off;
+ k->event.id.raw[1] = size;
+
+ if (err == 0 || (err < 0 && err != -EAGAIN))
+ ready = -1;
+
+ if (!size)
+ ready = 1;
+#if 0
+ printk("%s: sk=%p, k=%p, size=%4u, off=%4u, err=%3d, ready=%1d.\n",
+ __func__, sk, k, size, off, err, ready);
+#endif
+
+ return ready;
+}
+
+int kevent_init_naio(struct kevent *k)
+{
+ k->enqueue = &kevent_naio_enqueue;
+ k->dequeue = &kevent_naio_dequeue;
+ k->callback = &kevent_naio_callback;
+ return 0;
+}
diff --git a/kernel/kevent/kevent_poll.c b/kernel/kevent/kevent_poll.c
new file mode 100644
index 0000000..12b06bb
--- /dev/null
+++ b/kernel/kevent/kevent_poll.c
@@ -0,0 +1,213 @@
+/*
+ * kevent_poll.c
+ *
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+
+static kmem_cache_t *kevent_poll_container_cache;
+static kmem_cache_t *kevent_poll_priv_cache;
+
+struct kevent_poll_ctl
+{
+ struct poll_table_struct pt;
+ struct kevent *k;
+};
+
+struct kevent_poll_wait_container
+{
+ struct list_head container_entry;
+ wait_queue_head_t *whead;
+ wait_queue_t wait;
+ struct kevent *k;
+};
+
+struct kevent_poll_private
+{
+ struct list_head container_list;
+ spinlock_t container_lock;
+};
+
+static int kevent_poll_wait_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ struct kevent_poll_wait_container *cont = container_of(wait, struct kevent_poll_wait_container,
wait);
+ struct kevent *k = cont->k;
+ struct file *file = k->st->origin;
+ unsigned long flags;
+ u32 revents, event;
+
+ revents = file->f_op->poll(file, NULL);
+ spin_lock_irqsave(&k->lock, flags);
+ event = k->event.event;
+ spin_unlock_irqrestore(&k->lock, flags);
+
+ kevent_storage_ready(k->st, NULL, revents);
+
+ return 0;
+}
+
+static void kevent_poll_qproc(struct file *file, wait_queue_head_t *whead, struct
poll_table_struct *poll_table)
+{
+ struct kevent *k = container_of(poll_table, struct kevent_poll_ctl, pt)->k;
+ struct kevent_poll_private *priv = k->priv;
+ struct kevent_poll_wait_container *cont;
+ unsigned long flags;
+
+ cont = kmem_cache_alloc(kevent_poll_container_cache, SLAB_KERNEL);
+ if (!cont) {
+ kevent_break(k);
+ return;
+ }
+
+ cont->k = k;
+ init_waitqueue_func_entry(&cont->wait, kevent_poll_wait_callback);
+ cont->whead = whead;
+
+ spin_lock_irqsave(&priv->container_lock, flags);
+ list_add_tail(&cont->container_entry, &priv->container_list);
+ spin_unlock_irqrestore(&priv->container_lock, flags);
+
+ add_wait_queue(whead, &cont->wait);
+}
+
+static int kevent_poll_enqueue(struct kevent *k)
+{
+ struct file *file;
+ int err, ready = 0;
+ unsigned int revents;
+ struct kevent_poll_ctl ctl;
+ struct kevent_poll_private *priv;
+
+ file = fget(k->event.id.raw[0]);
+ if (!file)
+ return -ENODEV;
+
+ err = -EINVAL;
+ if (!file->f_op || !file->f_op->poll)
+ goto err_out_fput;
+
+ err = -ENOMEM;
+ priv = kmem_cache_alloc(kevent_poll_priv_cache, SLAB_KERNEL);
+ if (!priv)
+ goto err_out_fput;
+
+ spin_lock_init(&priv->container_lock);
+ INIT_LIST_HEAD(&priv->container_list);
+
+ k->priv = priv;
+
+ ctl.k = k;
+ init_poll_funcptr(&ctl.pt, &kevent_poll_qproc);
+
+ revents = file->f_op->poll(file, &ctl.pt);
+ if (revents & k->event.event)
+ ready = 1;
+
+ err = kevent_storage_enqueue(&file->st, k);
+ if (err < 0)
+ goto err_out_free;
+
+ return ready;
+
+err_out_free:
+ kmem_cache_free(kevent_poll_priv_cache, priv);
+err_out_fput:
+ fput(file);
+ return err;
+}
+
+static int kevent_poll_dequeue(struct kevent *k)
+{
+ struct file *file = k->st->origin;
+ struct kevent_poll_private *priv = k->priv;
+ struct kevent_poll_wait_container *w, *n;
+ unsigned long flags;
+
+ kevent_storage_dequeue(k->st, k);
+
+ spin_lock_irqsave(&priv->container_lock, flags);
+ list_for_each_entry_safe(w, n, &priv->container_list, container_entry) {
+ list_del(&w->container_entry);
+ remove_wait_queue(w->whead, &w->wait);
+ kmem_cache_free(kevent_poll_container_cache, w);
+ }
+ spin_unlock_irqrestore(&priv->container_lock, flags);
+
+ kmem_cache_free(kevent_poll_priv_cache, priv);
+ k->priv = NULL;
+
+ fput(file);
+
+ return 0;
+}
+
+static int kevent_poll_callback(struct kevent *k)
+{
+ struct file *file = k->st->origin;
+ unsigned int revents = file->f_op->poll(file, NULL);
+ return (revents & k->event.event);
+}
+
+int kevent_init_poll(struct kevent *k)
+{
+ if (!kevent_poll_container_cache || !kevent_poll_priv_cache)
+ return -ENOMEM;
+
+ k->enqueue = &kevent_poll_enqueue;
+ k->dequeue = &kevent_poll_dequeue;
+ k->callback = &kevent_poll_callback;
+ return 0;
+}
+
+
+static int __init kevent_poll_sys_init(void)
+{
+ kevent_poll_container_cache = kmem_cache_create("kevent_poll_container_cache", sizeof(struct
kevent_poll_wait_container),
+ 0, 0, NULL, NULL);
+ if (!kevent_poll_container_cache) {
+ printk(KERN_ERR "Failed to create kevent poll container cache.\n");
+ return -ENOMEM;
+ }
+
+ kevent_poll_priv_cache = kmem_cache_create("kevent_poll_priv_cache", sizeof(struct
kevent_poll_private),
+ 0, 0, NULL, NULL);
+ if (!kevent_poll_priv_cache) {
+ printk(KERN_ERR "Failed to create kevent poll private data cache.\n");
+ kmem_cache_destroy(kevent_poll_container_cache);
+ kevent_poll_container_cache = NULL;
+ return -ENOMEM;
+ }
+
+ printk(KERN_INFO "Kevent poll()/select() subsystem has been initialized.\n");
+ return 0;
+}
+
+static void __exit kevent_poll_sys_fini(void)
+{
+ kmem_cache_destroy(kevent_poll_priv_cache);
+ kmem_cache_destroy(kevent_poll_container_cache);
+}
+
+module_init(kevent_poll_sys_init);
+module_exit(kevent_poll_sys_fini);
diff --git a/kernel/kevent/kevent_socket.c b/kernel/kevent/kevent_socket.c
new file mode 100644
index 0000000..d179ca8
--- /dev/null
+++ b/kernel/kevent/kevent_socket.c
@@ -0,0 +1,124 @@
+/*
+ * kevent_socket.c
+ *
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/tcp.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+#include <net/request_sock.h>
+#include <net/inet_connection_sock.h>
+
+static int kevent_socket_callback(struct kevent *k)
+{
+ struct inode *inode = k->st->origin;
+ struct sock *sk = SOCKET_I(inode)->sk;
+ int rmem;
+
+ if (k->event.event & KEVENT_SOCKET_RECV) {
+ int ret = 0;
+
+ if ((rmem = atomic_read(&sk->sk_rmem_alloc)) > 0 || !skb_queue_empty(&sk->sk_receive_queue))
+ ret = 1;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ ret = 1;
+ if (ret)
+ return ret;
+ }
+ if ((k->event.event & KEVENT_SOCKET_ACCEPT) &&
+ (!reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ||
+ reqsk_queue_len_young(&inet_csk(sk)->icsk_accept_queue))) {
+ k->event.ret_data[1] = reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);
+ return 1;
+ }
+
+ return 0;
+}
+
+int kevent_socket_enqueue(struct kevent *k)
+{
+ struct file *file;
+ struct inode *inode;
+ int err, fput_needed;
+
+ file = fget_light(k->event.id.raw[0], &fput_needed);
+ if (!file)
+ return -ENODEV;
+
+ err = -EINVAL;
+ if (!file->f_dentry || !file->f_dentry->d_inode)
+ goto err_out_fput;
+
+ inode = igrab(file->f_dentry->d_inode);
+ if (!inode)
+ goto err_out_fput;
+
+ err = kevent_storage_enqueue(&inode->st, k);
+ if (err < 0)
+ goto err_out_iput;
+
+ err = k->callback(k);
+ if (err)
+ goto err_out_dequeue;
+
+ fput_light(file, fput_needed);
+ return err;
+
+err_out_dequeue:
+ kevent_storage_dequeue(k->st, k);
+err_out_iput:
+ iput(inode);
+err_out_fput:
+ fput_light(file, fput_needed);
+ return err;
+}
+
+int kevent_socket_dequeue(struct kevent *k)
+{
+ struct inode *inode = k->st->origin;
+
+ kevent_storage_dequeue(k->st, k);
+ iput(inode);
+
+ return 0;
+}
+
+int kevent_init_socket(struct kevent *k)
+{
+ k->enqueue = &kevent_socket_enqueue;
+ k->dequeue = &kevent_socket_dequeue;
+ k->callback = &kevent_socket_callback;
+ return 0;
+}
+
+void kevent_socket_notify(struct sock *sk, u32 event)
+{
+ if (sk->sk_socket && !test_and_set_bit(KEVENT_SOCKET_FLAGS_INUSE, &sk->sk_kevent_flags)) {
+ kevent_storage_ready(&SOCK_INODE(sk->sk_socket)->st, NULL, event);
+ clear_bit(KEVENT_SOCKET_FLAGS_INUSE, &sk->sk_kevent_flags);
+ }
+}
diff --git a/kernel/kevent/kevent_timer.c b/kernel/kevent/kevent_timer.c
new file mode 100644
index 0000000..3ae05c2
--- /dev/null
+++ b/kernel/kevent/kevent_timer.c
@@ -0,0 +1,111 @@
+/*
+ * kevent_timer.c
+ *
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/jiffies.h>
+#include <linux/kevent.h>
+
+static void kevent_timer_func(unsigned long data)
+{
+ struct kevent *k = (struct kevent *)data;
+ struct timer_list *t = k->st->origin;
+
+ kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+ mod_timer(t, jiffies + msecs_to_jiffies(k->event.id.raw[0]));
+}
+
+static int kevent_timer_enqueue(struct kevent *k)
+{
+ struct timer_list *t;
+ struct kevent_storage *st;
+ int err;
+
+ t = kmalloc(sizeof(struct timer_list) + sizeof(struct kevent_storage), GFP_KERNEL);
+ if (!t)
+ return -ENOMEM;
+
+ init_timer(t);
+ t->function = kevent_timer_func;
+ t->expires = jiffies + msecs_to_jiffies(k->event.id.raw[0]);
+ t->data = (unsigned long)k;
+
+ st = (struct kevent_storage *)(t+1);
+ err = kevent_storage_init(k->event.type, KEVENT_MASK_EMPTY, t, st);
+ if (err)
+ goto err_out_free;
+
+ err = kevent_storage_enqueue(st, k);
+ if (err < 0)
+ goto err_out_st_fini;
+
+ add_timer(t);
+
+ return 0;
+
+err_out_st_fini:
+ kevent_storage_fini(st);
+err_out_free:
+ kfree(t);
+
+ return err;
+}
+
+static int kevent_timer_dequeue(struct kevent *k)
+{
+ struct kevent_storage *st = k->st;
+ struct timer_list *t = st->origin;
+
+ if (!t)
+ return -ENODEV;
+
+ del_timer_sync(t);
+
+ kevent_storage_dequeue(st, k);
+
+ kfree(t);
+
+ return 0;
+}
+
+static int kevent_timer_callback(struct kevent *k)
+{
+ struct kevent_storage *st = k->st;
+ struct timer_list *t = st->origin;
+
+ if (!t)
+ return -ENODEV;
+
+ k->event.ret_data[0] = (__u32)jiffies;
+ return 1;
+}
+
+int kevent_init_timer(struct kevent *k)
+{
+ k->enqueue = &kevent_timer_enqueue;
+ k->dequeue = &kevent_timer_dequeue;
+ k->callback = &kevent_timer_callback;
+ return 0;
+}
diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
new file mode 100644
index 0000000..6a0eedd
--- /dev/null
+++ b/kernel/kevent/kevent_user.c
@@ -0,0 +1,655 @@
+/*
+ * kevent_user.c
+ *
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/poll.h>
+#include <linux/kevent.h>
+#include <linux/jhash.h>
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+
+static struct class *kevent_user_class;
+static char kevent_name[] = "kevent";
+static int kevent_user_major;
+
+static int kevent_user_open(struct inode *, struct file *);
+static int kevent_user_release(struct inode *, struct file *);
+static int kevent_user_ioctl(struct inode *, struct file *, unsigned int, unsigned long);
+static unsigned int kevent_user_poll(struct file *, struct poll_table_struct *);
+
+static struct file_operations kevent_user_fops = {
+ .open = kevent_user_open,
+ .release = kevent_user_release,
+ .ioctl = kevent_user_ioctl,
+ .poll = kevent_user_poll,
+ .owner = THIS_MODULE,
+};
+
+static struct super_block *kevent_get_sb(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data)
+{
+ return get_sb_pseudo(fs_type, kevent_name, NULL, 0xabcdef); /* So original magic... */
+}
+
+static struct file_system_type kevent_fs_type = {
+ .name = kevent_name,
+ .get_sb = kevent_get_sb,
+ .kill_sb = kill_anon_super,
+};
+
+static struct vfsmount *kevent_mnt;
+
+static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
+{
+ struct kevent_user *u = file->private_data;
+ unsigned int mask;
+
+ poll_wait(file, &u->wait, wait);
+ mask = 0;
+
+ if (u->ready_num)
+ mask |= POLLIN | POLLRDNORM;
+
+ return mask;
+}
+
+static struct kevent_user *kevent_user_alloc(void)
+{
+ struct kevent_user *u;
+ int i;
+
+ u = kzalloc(sizeof(struct kevent_user), GFP_KERNEL);
+ if (!u)
+ return NULL;
+
+ INIT_LIST_HEAD(&u->ready_list);
+ spin_lock_init(&u->ready_lock);
+ u->ready_num = 0;
+
+ for (i=0; i<KEVENT_HASH_MASK+1; ++i) {
+ INIT_LIST_HEAD(&u->kqueue[i].kevent_list);
+ spin_lock_init(&u->kqueue[i].kevent_lock);
+ }
+ u->kevent_num = 0;
+
+ init_MUTEX(&u->ctl_mutex);
+ init_MUTEX(&u->wait_mutex);
+ init_waitqueue_head(&u->wait);
+ u->max_ready_num = 0;
+
+ atomic_set(&u->refcnt, 1);
+
+ return u;
+}
+
+static int kevent_user_open(struct inode *inode, struct file *file)
+{
+ struct kevent_user *u = kevent_user_alloc();
+
+ if (!u)
+ return -ENOMEM;
+
+ file->private_data = u;
+
+ return 0;
+}
+
+static inline void kevent_user_get(struct kevent_user *u)
+{
+ atomic_inc(&u->refcnt);
+}
+
+static inline void kevent_user_put(struct kevent_user *u)
+{
+ if (atomic_dec_and_test(&u->refcnt))
+ kfree(u);
+}
+
+#if 0
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+ unsigned int h = (uk->user[0] ^ uk->user[1]) ^ (uk->id.raw[0] ^ uk->id.raw[1]);
+
+ h = (((h >> 16) & 0xffff) ^ (h & 0xffff)) & 0xffff;
+ h = (((h >> 8) & 0xff) ^ (h & 0xff)) & KEVENT_HASH_MASK;
+
+ return h;
+}
+#else
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+ return jhash_1word(uk->id.raw[0], uk->id.raw[1]) & KEVENT_HASH_MASK;
+}
+#endif
+
+/*
+ * Remove kevent from user's list of all events,
+ * dequeue it from storage and decrease user's reference counter,
+ * since this kevent does not exist anymore. That is why it is freed here.
+ */
+static void kevent_finish_user(struct kevent *k, int lock)
+{
+ struct kevent_user *u = k->user;
+ unsigned long flags;
+
+ if (lock) {
+ unsigned int hash = kevent_user_hash(&k->event);
+ struct kevent_list *l = &u->kqueue[hash];
+
+ spin_lock_irqsave(&l->kevent_lock, flags);
+ list_del(&k->kevent_entry);
+ u->kevent_num--;
+ spin_unlock_irqrestore(&l->kevent_lock, flags);
+ } else {
+ list_del(&k->kevent_entry);
+ u->kevent_num--;
+ }
+ kevent_dequeue(k);
+ kevent_user_put(u);
+ kevent_free(k);
+}
+
+/*
+ * Dequeue one entry from user's ready queue.
+ */
+static struct kevent *__kqueue_dequeue_one_ready(struct list_head *q, unsigned int *qlen)
+{
+ struct kevent *k = NULL;
+ unsigned int len = *qlen;
+
+ if (len) {
+ k = list_entry(q->next, struct kevent, ready_entry);
+ list_del(&k->ready_entry);
+ *qlen = len - 1;
+ }
+
+ return k;
+}
+
+static struct kevent *kqueue_dequeue_ready(struct kevent_user *u)
+{
+ unsigned long flags;
+ struct kevent *k;
+
+ spin_lock_irqsave(&u->ready_lock, flags);
+ k = __kqueue_dequeue_one_ready(&u->ready_list, &u->ready_num);
+ spin_unlock_irqrestore(&u->ready_lock, flags);
+
+ return k;
+}
+
+struct kevent *kevent_search(struct ukevent *uk, struct kevent_user *u)
+{
+ struct kevent *k;
+ unsigned int hash = kevent_user_hash(uk);
+ struct kevent_list *l = &u->kqueue[hash];
+ int found = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&l->kevent_lock, flags);
+ list_for_each_entry(k, &l->kevent_list, kevent_entry) {
+ spin_lock(&k->lock);
+ if (k->event.user[0] == uk->user[0] && k->event.user[1] == uk->user[1] &&
+ k->event.id.raw[0] == uk->id.raw[0] && k->event.id.raw[1] == uk->id.raw[1]) {
+ found = 1;
+ spin_unlock(&k->lock);
+ break;
+ }
+ spin_unlock(&k->lock);
+ }
+ spin_unlock_irqrestore(&l->kevent_lock, flags);
+
+ return (found)?k:NULL;
+}
+
+/*
+ * No new entry can be added or removed from any list at this point.
+ * It is not permitted to call ->ioctl() and ->release() in parallel.
+ */
+static int kevent_user_release(struct inode *inode, struct file *file)
+{
+ struct kevent_user *u = file->private_data;
+ struct kevent *k, *n;
+ int i;
+
+ for (i=0; i<KEVENT_HASH_MASK+1; ++i) {
+ struct kevent_list *l = &u->kqueue[i];
+
+ list_for_each_entry_safe(k, n, &l->kevent_list, kevent_entry)
+ kevent_finish_user(k, 1);
+ }
+
+ kevent_user_put(u);
+ file->private_data = NULL;
+
+ return 0;
+}
+
+static int kevent_user_ctl_modify(struct kevent_user *u, struct kevent_user_control *ctl, void
__user *arg)
+{
+ int err = 0, i;
+ struct kevent *k;
+ unsigned long flags;
+ struct ukevent uk;
+
+ if (down_interruptible(&u->ctl_mutex))
+ return -ERESTARTSYS;
+
+ for (i=0; i<ctl->num; ++i) {
+ if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+ err = -EINVAL;
+ break;
+ }
+
+ k = kevent_search(&uk, u);
+ if (k) {
+ spin_lock_irqsave(&k->lock, flags);
+ k->event.event = uk.event;
+ k->event.req_flags = uk.req_flags;
+ k->event.ret_flags = 0;
+ spin_unlock_irqrestore(&k->lock, flags);
+ kevent_requeue(k);
+ } else
+ uk.ret_flags |= KEVENT_RET_BROKEN;
+
+ uk.ret_flags |= KEVENT_RET_DONE;
+
+ if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+ err = -EINVAL;
+ break;
+ }
+
+ arg += sizeof(struct ukevent);
+ }
+
+ up(&u->ctl_mutex);
+
+ return err;
+}
+
+static int kevent_user_ctl_remove(struct kevent_user *u, struct kevent_user_control *ctl, void
__user *arg)
+{
+ int err = 0, i;
+ struct kevent *k;
+ struct ukevent uk;
+
+ if (down_interruptible(&u->ctl_mutex))
+ return -ERESTARTSYS;
+
+ for (i=0; i<ctl->num; ++i) {
+ if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+ err = -EINVAL;
+ break;
+ }
+
+ k = kevent_search(&uk, u);
+ if (k) {
+ kevent_finish_user(k, 1);
+ } else
+ uk.ret_flags |= KEVENT_RET_BROKEN;
+
+ uk.ret_flags |= KEVENT_RET_DONE;
+
+ if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+ err = -EINVAL;
+ break;
+ }
+
+ arg += sizeof(struct ukevent);
+ }
+
+ up(&u->ctl_mutex);
+
+ return err;
+}
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u)
+{
+ struct kevent *k;
+ int err;
+
+ k = kevent_alloc(GFP_KERNEL);
+ if (!k) {
+ err = -ENOMEM;
+ goto err_out_exit;
+ }
+
+ memcpy(&k->event, uk, sizeof(struct ukevent));
+
+ k->event.ret_flags = 0;
+ k->event.ret_data[0] = 0;
+ k->event.ret_data[1] = 0;
+
+ err = kevent_init(k);
+ if (err) {
+ kevent_free(k);
+ goto err_out_exit;
+ }
+ k->user = u;
+
+ err = kevent_enqueue(k);
+ if (err) {
+ if (err < 0)
+ uk->ret_flags |= KEVENT_RET_BROKEN;
+ uk->ret_flags |= KEVENT_RET_DONE;
+ kevent_free(k);
+ } else {
+ unsigned long flags;
+ unsigned int hash = kevent_user_hash(&k->event);
+ struct kevent_list *l = &u->kqueue[hash];
+
+ spin_lock_irqsave(&l->kevent_lock, flags);
+ list_add_tail(&k->kevent_entry, &l->kevent_list);
+ u->kevent_num++;
+ kevent_user_get(u);
+ spin_unlock_irqrestore(&l->kevent_lock, flags);
+ }
+
+err_out_exit:
+ return err;
+}
+
+/*
+ * Copy all ukevents from userspace, allocate kevent for each one and add them
+ * into appropriate kevent_storages, e.g. sockets, inodes and so on...
+ * If something goes wrong, all events will be dequeued and negative error will be returned.
+ * On success zero is returned and ctl->num will be a number of finished events, either completed
+ * or failed. Array of finished events (struct ukevent) will be placed behind kevent_user_control
structure.
+ * User must run through that array and check ret_flags field of each ukevent structure
+ * to determine if it is fired or failed event.
+ */
+static int kevent_user_ctl_add(struct kevent_user *u, struct kevent_user_control *ctl, void __user
*arg)
+{
+ int err = 0, cerr = 0, num = 0, knum = 0, i;
+ void __user *orig, *ctl_addr;
+ struct ukevent uk;
+
+ if (down_interruptible(&u->ctl_mutex))
+ return -ERESTARTSYS;
+
+ orig = arg;
+ ctl_addr = arg - sizeof(struct kevent_user_control);
+#if 1
+ err = -ENFILE;
+ if (u->kevent_num + ctl->num >= 1024)
+ goto err_out_remove;
+#endif
+ for (i=0; i<ctl->num; ++i) {
+ if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+ cerr = -EINVAL;
+ break;
+ }
+ arg += sizeof(struct ukevent);
+
+ err = kevent_user_add_ukevent(&uk, u);
+ if (err) {
+ if (copy_to_user(orig, &uk, sizeof(struct ukevent)))
+ cerr = -EINVAL;
+ orig += sizeof(struct ukevent);
+ num++;
+ } else
+ knum++;
+ }
+
+ if (cerr < 0)
+ goto err_out_remove;
+
+ ctl->num = num;
+ if (copy_to_user(ctl_addr, ctl, sizeof(struct kevent_user_control)))
+ cerr = -EINVAL;
+
+ if (cerr)
+ err = cerr;
+ if (!err)
+ err = num;
+
+err_out_remove:
+ up(&u->ctl_mutex);
+
+ return err;
+}
+
+/*
+ * Waits until at least ctl->ready_num events are ready or timeout and returns
+ * number of ready events (in case of timeout) or number of requested events.
+ */
+static int kevent_user_wait(struct file *file, struct kevent_user *u, struct kevent_user_control
*ctl, void __user *arg)
+{
+ struct kevent *k;
+ int cerr = 0, num = 0;
+ void __user *ptr = arg + sizeof(struct kevent_user_control);
+
+ if (down_interruptible(&u->wait_mutex))
+ return -ERESTARTSYS;
+
+ if (!(file->f_flags & O_NONBLOCK)) {
+ if (ctl->timeout)
+ wait_event_interruptible_timeout(u->wait,
+ u->ready_num >= ctl->num, msecs_to_jiffies(ctl->timeout));
+ else
+ wait_event_interruptible_timeout(u->wait,
+ u->ready_num > 0, msecs_to_jiffies(1000));
+ }
+ while (num < ctl->num && (k = kqueue_dequeue_ready(u))) {
+ if (copy_to_user(ptr + num*sizeof(struct ukevent), &k->event, sizeof(struct ukevent)))
+ cerr = -EINVAL;
+ /*
+ * If it is one-shot kevent, it has been removed already from
+ * origin's queue, so we can easily free it here.
+ */
+ if (k->event.req_flags & KEVENT_REQ_ONESHOT)
+ kevent_finish_user(k, 1);
+ ++num;
+ }
+
+ ctl->num = num;
+ if (copy_to_user(arg, ctl, sizeof(struct kevent_user_control)))
+ cerr = -EINVAL;
+
+ up(&u->wait_mutex);
+
+ return (cerr)?cerr:num;
+}
+
+static int kevent_ctl_init(void)
+{
+ struct kevent_user *u;
+ struct file *file;
+ int fd, ret;
+
+ fd = get_unused_fd();
+ if (fd < 0)
+ return fd;
+
+ file = get_empty_filp();
+ if (!file) {
+ ret = -ENFILE;
+ goto out_put_fd;
+ }
+
+ u = kevent_user_alloc();
+ if (unlikely(!u)) {
+ ret = -ENOMEM;
+ goto out_put_file;
+ }
+
+ file->f_op = &kevent_user_fops;
+ file->f_vfsmnt = mntget(kevent_mnt);
+ file->f_dentry = dget(kevent_mnt->mnt_root);
+ file->f_mapping = file->f_dentry->d_inode->i_mapping;
+ file->f_mode = FMODE_READ;
+ file->f_flags = O_RDONLY;
+ file->private_data = u;
+
+ fd_install(fd, file);
+
+ return fd;
+
+out_put_file:
+ put_filp(file);
+out_put_fd:
+ put_unused_fd(fd);
+ return ret;
+}
+
+static int kevent_ctl_process(struct file *file, struct kevent_user_control *ctl, void __user
*arg)
+{
+ int err;
+ struct kevent_user *u = file->private_data;
+
+ if (!u)
+ return -EINVAL;
+
+ switch (ctl->cmd) {
+ case KEVENT_CTL_ADD:
+ err = kevent_user_ctl_add(u, ctl, arg+sizeof(struct kevent_user_control));
+ break;
+ case KEVENT_CTL_REMOVE:
+ err = kevent_user_ctl_remove(u, ctl, arg+sizeof(struct kevent_user_control));
+ break;
+ case KEVENT_CTL_MODIFY:
+ err = kevent_user_ctl_modify(u, ctl, arg+sizeof(struct kevent_user_control));
+ break;
+ case KEVENT_CTL_WAIT:
+ err = kevent_user_wait(file, u, ctl, arg);
+ break;
+ case KEVENT_CTL_INIT:
+ err = kevent_ctl_init();
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ return err;
+}
+
+asmlinkage long sys_kevent_ctl(int fd, void __user *arg)
+{
+ int err, fput_needed;
+ struct kevent_user_control ctl;
+ struct file *file;
+
+ if (copy_from_user(&ctl, arg, sizeof(struct kevent_user_control)))
+ return -EINVAL;
+
+ if (ctl.cmd == KEVENT_CTL_INIT)
+ return kevent_ctl_init();
+
+ file = fget_light(fd, &fput_needed);
+ if (!file)
+ return -ENODEV;
+
+ err = kevent_ctl_process(file, &ctl, arg);
+
+ fput_light(file, fput_needed);
+ return err;
+}
+
+static int kevent_user_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned
long arg)
+{
+ int err = -ENODEV;
+ struct kevent_user_control ctl;
+ struct kevent_user *u = file->private_data;
+ void __user *ptr = (void __user *)arg;
+
+ if (copy_from_user(&ctl, ptr, sizeof(struct kevent_user_control)))
+ return -EINVAL;
+
+ switch (cmd) {
+ case KEVENT_USER_CTL:
+ err = kevent_ctl_process(file, &ctl, ptr);
+ break;
+ case KEVENT_USER_WAIT:
+ err = kevent_user_wait(file, u, &ctl, ptr);
+ break;
+ default:
+ break;
+ }
+
+ return err;
+}
+
+static int __devinit kevent_user_init(void)
+{
+ struct class_device *dev;
+ int err = 0;
+
+ err = register_filesystem(&kevent_fs_type);
+ if (err)
+ panic("%s: failed to register filesystem: err=%d.\n", kevent_name, err);
+
+ kevent_mnt = kern_mount(&kevent_fs_type);
+ if (IS_ERR(kevent_mnt))
+ panic("%s: failed to mount silesystem: err=%ld.\n", kevent_name, PTR_ERR(kevent_mnt));
+
+ kevent_user_major = register_chrdev(0, kevent_name, &kevent_user_fops);
+ if (kevent_user_major < 0) {
+ printk(KERN_ERR "Failed to register \"%s\" char device: err=%d.\n", kevent_name,
kevent_user_major);
+ return -ENODEV;
+ }
+
+ kevent_user_class = class_create(THIS_MODULE, "kevent");
+ if (IS_ERR(kevent_user_class)) {
+ printk(KERN_ERR "Failed to register \"%s\" class: err=%ld.\n", kevent_name,
PTR_ERR(kevent_user_class));
+ err = PTR_ERR(kevent_user_class);
+ goto err_out_unregister;
+ }
+
+ dev = class_device_create(kevent_user_class, NULL, MKDEV(kevent_user_major, 0), NULL,
kevent_name);
+ if (IS_ERR(dev)) {
+ printk(KERN_ERR "Failed to create %d.%d class device in \"%s\" class: err=%ld.\n",
+ kevent_user_major, 0, kevent_name, PTR_ERR(dev));
+ err = PTR_ERR(dev);
+ goto err_out_class_destroy;
+ }
+
+ printk("KEVENT subsystem: chardev helper: major=%d.\n", kevent_user_major);
+
+ return 0;
+
+err_out_class_destroy:
+ class_destroy(kevent_user_class);
+err_out_unregister:
+ unregister_chrdev(kevent_user_major, kevent_name);
+
+ return err;
+}
+
+static void __devexit kevent_user_fini(void)
+{
+ class_device_destroy(kevent_user_class, MKDEV(kevent_user_major, 0));
+ class_destroy(kevent_user_class);
+ unregister_chrdev(kevent_user_major, kevent_name);
+ mntput(kevent_mnt);
+ unregister_filesystem(&kevent_fs_type);
+}
+
+module_init(kevent_user_init);
+module_exit(kevent_user_fini);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1ab2370..2053809 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -90,3 +90,8 @@ cond_syscall(sys_pciconfig_iobase);
cond_syscall(sys32_ipc);
cond_syscall(sys32_sysctl);
cond_syscall(ppc_rtas);
+
+cond_syscall(sys_aio_recv);
+cond_syscall(sys_aio_send);
+cond_syscall(sys_aio_sendfile);
+cond_syscall(sys_kevent_ctl);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 1bcfef5..b2f19a7 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -200,6 +200,60 @@ void skb_free_datagram(struct sock *sk,
}
/**
+ * skb_copy_datagram - Copy a datagram.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: pointer to copy to
+ * @len: amount of data to copy from buffer to iovec
+ */
+int skb_copy_datagram(const struct sk_buff *skb, int offset,
+ void *to, int len)
+{
+ int i, fraglen, end = 0;
+ struct sk_buff *next = skb_shinfo(skb)->frag_list;
+
+ if (!len)
+ return 0;
+
+next_skb:
+ fraglen = skb_headlen(skb);
+ i = -1;
+
+ while (1) {
+ int start = end;
+
+ if ((end += fraglen) > offset) {
+ int copy = end - offset, o = offset - start;
+
+ if (copy > len)
+ copy = len;
+ if (i == -1)
+ memcpy(to, skb->data + o, copy);
+ else {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct page *page = frag->page;
+ void *p = kmap(page) + frag->page_offset + o;
+ memcpy(to, p, copy);
+ kunmap(page);
+ }
+ if (!(len -= copy))
+ return 0;
+ offset += copy;
+ }
+ if (++i >= skb_shinfo(skb)->nr_frags)
+ break;
+ fraglen = skb_shinfo(skb)->frags[i].size;
+ }
+ if (next) {
+ skb = next;
+ BUG_ON(skb_shinfo(skb)->frag_list);
+ next = skb->next;
+ goto next_skb;
+ }
+ return -EFAULT;
+}
+
+/**
* skb_copy_datagram_iovec - Copy a datagram to an iovec.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying from
@@ -467,6 +521,7 @@ unsigned int datagram_poll(struct file *
EXPORT_SYMBOL(datagram_poll);
EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
+EXPORT_SYMBOL(skb_copy_datagram);
EXPORT_SYMBOL(skb_copy_datagram_iovec);
EXPORT_SYMBOL(skb_free_datagram);
EXPORT_SYMBOL(skb_recv_datagram);
diff --git a/net/core/sock.c b/net/core/sock.c
index 13cc3be..3d32c5c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -455,6 +455,16 @@ set_rcvbuf:
spin_unlock_bh(&sk->sk_lock.slock);
ret = -ENONET;
break;
+#ifdef CONFIG_KEVENT_SOCKET
+ case SO_ASYNC_SOCK:
+ spin_lock_bh(&sk->sk_lock.slock);
+ if (valbool)
+ set_bit(KEVENT_SOCKET_FLAGS_ASYNC, &sk->sk_kevent_flags);
+ else
+ clear_bit(KEVENT_SOCKET_FLAGS_ASYNC, &sk->sk_kevent_flags);
+ spin_unlock_bh(&sk->sk_lock.slock);
+ break;
+#endif
/* We implement the SO_SNDLOWAT etc to
not be settable (1003.1g 5.3) */
@@ -1201,6 +1211,7 @@ static void sock_def_wakeup(struct sock
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
wake_up_interruptible_all(sk->sk_sleep);
read_unlock(&sk->sk_callback_lock);
+ kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
}
static void sock_def_error_report(struct sock *sk)
@@ -1210,6 +1221,7 @@ static void sock_def_error_report(struct
wake_up_interruptible(sk->sk_sleep);
sk_wake_async(sk,0,POLL_ERR);
read_unlock(&sk->sk_callback_lock);
+ kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
}
static void sock_def_readable(struct sock *sk, int len)
@@ -1219,6 +1231,7 @@ static void sock_def_readable(struct soc
wake_up_interruptible(sk->sk_sleep);
sk_wake_async(sk,1,POLL_IN);
read_unlock(&sk->sk_callback_lock);
+ kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
}
static void sock_def_write_space(struct sock *sk)
@@ -1235,6 +1248,8 @@ static void sock_def_write_space(struct
/* Should agree with poll, otherwise some programs break */
if (sock_writeable(sk))
sk_wake_async(sk, 2, POLL_OUT);
+
+ kevent_socket_notify(sk, KEVENT_SOCKET_SEND);
}
read_unlock(&sk->sk_callback_lock);
diff --git a/net/core/stream.c b/net/core/stream.c
index 15bfd03..745a07f 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -36,6 +36,7 @@ void sk_stream_write_space(struct sock *
wake_up_interruptible(sk->sk_sleep);
if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
sock_wake_async(sock, 2, POLL_OUT);
+ kevent_socket_notify(sk, KEVENT_SOCKET_SEND);
}
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ef98b14..c2ed578 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -206,6 +206,7 @@
* lingertime == 0 (RFC 793 ABORT Call)
* Hirokazu Takahashi : Use copy_from_user() instead of
* csum_and_copy_from_user() if possible.
+ * Evgeniy Polyakov : Network asynchronous IO.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -1090,6 +1091,275 @@ int tcp_read_sock(struct sock *sk, read_
}
/*
+ * Must be called with locked sock.
+ */
+int tcp_async_send(struct sock *sk, struct page **pages, unsigned int poffset, size_t len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mss_now, size_goal;
+ int err = -EAGAIN;
+ ssize_t copied;
+
+ /* Wait for a connection to finish. */
+ if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+ goto out_err;
+
+ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+ mss_now = tcp_current_mss(sk, 1);
+ size_goal = tp->xmit_size_goal;
+ copied = 0;
+
+ err = -EPIPE;
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || sock_flag(sk, SOCK_DONE) ||
+ (sk->sk_state == TCP_CLOSE) || (atomic_read(&sk->sk_refcnt) == 1))
+ goto do_error;
+
+ while (len > 0) {
+ struct sk_buff *skb = sk->sk_write_queue.prev;
+ struct page *page = pages[poffset / PAGE_SIZE];
+ int copy, i, can_coalesce;
+ int offset = poffset % PAGE_SIZE;
+ int size = min_t(size_t, len, PAGE_SIZE - offset);
+
+ if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
+new_segment:
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_sndbuf;
+
+ skb = sk_stream_alloc_pskb(sk, 0, 0,
+ sk->sk_allocation);
+ if (!skb)
+ goto wait_for_memory;
+
+ skb_entail(sk, tp, skb);
+ copy = size_goal;
+ }
+
+ if (copy > size)
+ copy = size;
+
+ i = skb_shinfo(skb)->nr_frags;
+ can_coalesce = skb_can_coalesce(skb, i, page, offset);
+ if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+ tcp_mark_push(tp, skb);
+ goto new_segment;
+ }
+ if (!sk_stream_wmem_schedule(sk, copy))
+ goto wait_for_memory;
+
+ if (can_coalesce) {
+ skb_shinfo(skb)->frags[i - 1].size += copy;
+ } else {
+ get_page(page);
+ skb_fill_page_desc(skb, i, page, offset, copy);
+ }
+
+ skb->len += copy;
+ skb->data_len += copy;
+ skb->truesize += copy;
+ sk->sk_wmem_queued += copy;
+ sk->sk_forward_alloc -= copy;
+ skb->ip_summed = CHECKSUM_HW;
+ tp->write_seq += copy;
+ TCP_SKB_CB(skb)->end_seq += copy;
+ skb_shinfo(skb)->tso_segs = 0;
+
+ if (!copied)
+ TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+
+ copied += copy;
+ poffset += copy;
+ if (!(len -= copy))
+ goto out;
+
+ if (skb->len < mss_now)
+ continue;
+
+ if (forced_push(tp)) {
+ tcp_mark_push(tp, skb);
+ __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
+ } else if (skb == sk->sk_send_head)
+ tcp_push_one(sk, mss_now);
+ continue;
+
+wait_for_sndbuf:
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+ if (copied)
+ tcp_push(sk, tp, 0, mss_now, TCP_NAGLE_PUSH);
+
+ err = -EAGAIN;
+ goto do_error;
+ }
+
+out:
+ if (copied)
+ tcp_push(sk, tp, 0, mss_now, tp->nonagle);
+ return copied;
+
+do_error:
+ if (copied)
+ goto out;
+out_err:
+ return sk_stream_error(sk, 0, err);
+}
+
+/*
+ * Must be called with locked sock.
+ */
+int tcp_async_recv(struct sock *sk, void *dst, size_t len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int copied = 0;
+ u32 *seq;
+ unsigned long used;
+ int err;
+ int target; /* Read at least this many bytes */
+
+ TCP_CHECK_TIMER(sk);
+
+ err = -ENOTCONN;
+ if (sk->sk_state == TCP_LISTEN)
+ goto out;
+
+ seq = &tp->copied_seq;
+
+ target = sock_rcvlowat(sk, 0, len);
+
+ do {
+ struct sk_buff *skb;
+ u32 offset;
+
+ /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
+ if (tp->urg_data && tp->urg_seq == *seq) {
+ if (copied)
+ break;
+ }
+
+ /* Next get a buffer. */
+
+ skb = skb_peek(&sk->sk_receive_queue);
+ do {
+ if (!skb)
+ break;
+
+ /* Now that we have two receive queues this
+ * shouldn't happen.
+ */
+ if (before(*seq, TCP_SKB_CB(skb)->seq)) {
+ printk(KERN_INFO "async_recv bug: copied %X "
+ "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
+ break;
+ }
+ offset = *seq - TCP_SKB_CB(skb)->seq;
+ if (skb->h.th->syn)
+ offset--;
+ if (offset < skb->len)
+ goto found_ok_skb;
+ if (skb->h.th->fin)
+ goto found_fin_ok;
+ skb = skb->next;
+ } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
+
+ if (copied)
+ break;
+
+ if (sock_flag(sk, SOCK_DONE))
+ break;
+
+ if (sk->sk_err) {
+ copied = sock_error(sk);
+ break;
+ }
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+
+ if (sk->sk_state == TCP_CLOSE) {
+ if (!sock_flag(sk, SOCK_DONE)) {
+ /* This occurs when user tries to read
+ * from never connected socket.
+ */
+ copied = -ENOTCONN;
+ break;
+ }
+ break;
+ }
+
+ copied = -EAGAIN;
+ break;
+
+ found_ok_skb:
+ /* Ok so how much can we use? */
+ used = skb->len - offset;
+ if (len < used)
+ used = len;
+
+ /* Do we have urgent data here? */
+ if (tp->urg_data) {
+ u32 urg_offset = tp->urg_seq - *seq;
+ if (urg_offset < used) {
+ if (!urg_offset) {
+ if (!sock_flag(sk, SOCK_URGINLINE)) {
+ ++*seq;
+ offset++;
+ used--;
+ if (!used)
+ goto skip_copy;
+ }
+ } else
+ used = urg_offset;
+ }
+ }
+
+ err = skb_copy_datagram(skb, offset, dst, used);
+ if (err) {
+ /* Exception. Bailout! */
+ if (!copied)
+ copied = -EFAULT;
+ break;
+ }
+
+ *seq += used;
+ copied += used;
+ len -= used;
+ dst += used;
+
+ tcp_rcv_space_adjust(sk);
+
+skip_copy:
+ if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
+ tp->urg_data = 0;
+ tcp_fast_path_check(sk, tp);
+ }
+ if (used + offset < skb->len)
+ continue;
+
+ if (skb->h.th->fin)
+ goto found_fin_ok;
+ sk_eat_skb(sk, skb);
+ continue;
+
+ found_fin_ok:
+ /* Process the FIN. */
+ ++*seq;
+ sk_eat_skb(sk, skb);
+ break;
+ } while (len > 0);
+
+ /* Clean up data we have read: This will do ACK frames. */
+ cleanup_rbuf(sk, copied);
+
+ TCP_CHECK_TIMER(sk);
+ return copied;
+
+out:
+ TCP_CHECK_TIMER(sk);
+ return err;
+}
+
+/*
* This routine copies from a sock struct into the user buffer.
*
* Technical note: in 2.3 we work on _locked_ socket, so that
@@ -2136,6 +2406,8 @@ EXPORT_SYMBOL(tcp_getsockopt);
EXPORT_SYMBOL(tcp_ioctl);
EXPORT_SYMBOL(tcp_poll);
EXPORT_SYMBOL(tcp_read_sock);
+EXPORT_SYMBOL(tcp_async_recv);
+EXPORT_SYMBOL(tcp_async_send);
EXPORT_SYMBOL(tcp_recvmsg);
EXPORT_SYMBOL(tcp_sendmsg);
EXPORT_SYMBOL(tcp_sendpage);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bf2e230..6d652fa 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3016,6 +3016,7 @@ static void tcp_ofo_queue(struct sock *s
__skb_unlink(skb, &tp->out_of_order_queue);
__skb_queue_tail(&sk->sk_receive_queue, skb);
+ kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if(skb->h.th->fin)
tcp_fin(skb, sk, skb->h.th);
@@ -3079,8 +3080,8 @@ queue_and_out:
!sk_stream_rmem_schedule(sk, skb))
goto drop;
}
- sk_stream_set_owner_r(skb, sk);
__skb_queue_tail(&sk->sk_receive_queue, skb);
+ sk_stream_set_owner_r(skb, sk);
}
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if(skb->len)
@@ -3819,7 +3820,7 @@ int tcp_rcv_established(struct sock *sk,
if (tp->ucopy.task == current &&
tp->copied_seq == tp->rcv_nxt &&
len - tcp_header_len <= tp->ucopy.len &&
- sock_owned_by_user(sk)) {
+ sock_owned_by_user(sk) && !sock_async(sk)) {
__set_current_state(TASK_RUNNING);
if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4d5021e..be0d3dc 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -62,6 +62,7 @@
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
+#include <linux/kevent.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
@@ -1007,6 +1008,7 @@ int tcp_v4_conn_request(struct sock *sk,
reqsk_free(req);
} else {
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+ kevent_socket_notify(sk, KEVENT_SOCKET_ACCEPT);
}
return 0;
@@ -1244,14 +1246,20 @@ process:
skb->dev = NULL;
- bh_lock_sock(sk);
ret = 0;
- if (!sock_owned_by_user(sk)) {
- if (!tcp_prequeue(sk, skb))
- ret = tcp_v4_do_rcv(sk, skb);
- } else
- sk_add_backlog(sk, skb);
- bh_unlock_sock(sk);
+ if (sock_async(sk)) {
+ spin_lock_bh(&sk->sk_lock.slock);
+ ret = tcp_v4_do_rcv(sk, skb);
+ spin_unlock_bh(&sk->sk_lock.slock);
+ } else {
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ if (!tcp_prequeue(sk, skb))
+ ret = tcp_v4_do_rcv(sk, skb);
+ } else
+ sk_add_backlog(sk, skb);
+ bh_unlock_sock(sk);
+ }
sock_put(sk);
@@ -1975,6 +1983,8 @@ struct proto tcp_prot = {
.getsockopt = tcp_getsockopt,
.sendmsg = tcp_sendmsg,
.recvmsg = tcp_recvmsg,
+ .async_recv = tcp_async_recv,
+ .async_send = tcp_async_send,
.backlog_rcv = tcp_v4_do_rcv,
.hash = tcp_v4_hash,
.unhash = tcp_unhash,
--
Evgeniy Polyakov
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html