
Kevent, network AIO system calls implementation.

From:  Evgeniy Polyakov <johnpol@2ka.mipt.ru>
To:  netdev@vger.kernel.org
Subject:  [PATCH] Kevent, network AIO system calls implementation.
Date:  Sat, 18 Feb 2006 19:11:40 +0300
Cc:  David Miller <davem@davemloft.net>, Jamal Hadi Salim <hadi@cyberus.ca>

Hello developers.

I'm pleased to announce a combined patch for the kevent and network AIO
subsystems, which are implemented using three system calls:
 * kevent_ctl(), which fully controls the kevent subsystem.
   It allows adding, modifying and removing kevents, and waiting until
   either a given number of kevents or at least one kevent is ready.
 * aio_send().
   This system call asynchronously transfers a userspace buffer to the
   remote host using a zero-copy mechanism.
 * aio_recv().
   Asynchronous receive support.

The next step is to implement aio_sendfile() support.
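
Below is a minimal userspace sketch of how these calls are intended to fit
together. It is an illustration only, not part of the patch: it assumes the
kevent control descriptor comes from opening a /dev/kevent character device,
that a KEVENT_CTL_WAIT command passes just the struct kevent_user_control
header, and that SO_ASYNC_SOCK marks a socket for asynchronous IO; the
syscall numbers are the i386 assignments from this patch.

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <sys/syscall.h>
	#include <linux/kevent.h>

	#define __NR_aio_recv	294	/* i386 numbers added by this patch */
	#define __NR_kevent_ctl	297
	#ifndef SO_ASYNC_SOCK
	#define SO_ASYNC_SOCK	34	/* from this patch's asm/socket.h */
	#endif

	int main(void)
	{
		char buf[4096];
		struct kevent_user_control ctl;
		int err, one = 1;
		int s = socket(AF_INET, SOCK_STREAM, 0);	/* would be connected in real use */
		int ctl_fd = open("/dev/kevent", O_RDWR);	/* assumed device node */

		if (s < 0 || ctl_fd < 0)
			return 1;

		/* Mark the socket asynchronous; presumably this sets
		 * KEVENT_SOCKET_FLAGS_ASYNC (the setsockopt handler is not
		 * visible in this posting). */
		setsockopt(s, SOL_SOCKET, SO_ASYNC_SOCK, &one, sizeof(one));

		/* Queue a zero-copy asynchronous receive into buf. */
		err = syscall(__NR_aio_recv, ctl_fd, s, buf, sizeof(buf), 0);
		if (err < 0)
			return 1;

		/* Wait up to one second for at least one kevent to become ready. */
		memset(&ctl, 0, sizeof(ctl));
		ctl.cmd = KEVENT_CTL_WAIT;
		ctl.num = 1;
		ctl.timeout = 1000;	/* milliseconds */
		err = syscall(__NR_kevent_ctl, ctl_fd, &ctl);

		close(ctl_fd);
		close(s);
		return err < 0 ? 1 : 0;
	}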

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 9b21a31..3ae20b2 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -294,3 +294,7 @@ ENTRY(sys_call_table)
 	.long sys_inotify_init
 	.long sys_inotify_add_watch
 	.long sys_inotify_rm_watch
+	.long sys_aio_recv
+	.long sys_aio_send
+	.long sys_aio_sendfile
+	.long sys_kevent_ctl
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index e0eb0c7..dc6924d 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -643,6 +643,10 @@ ia32_sys_call_table:
 	.quad sys_inotify_init
 	.quad sys_inotify_add_watch
 	.quad sys_inotify_rm_watch
+	.quad sys_aio_recv
+	.quad sys_aio_send
+	.quad sys_aio_sendfile
+	.quad sys_kevent_ctl
 ia32_syscall_end:		
 	.rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
 		.quad ni_syscall
diff --git a/fs/file_table.c b/fs/file_table.c
index c3a5e2f..7d73a2b 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -89,6 +89,9 @@ struct file *get_empty_filp(void)
 	if (security_file_alloc(f))
 		goto fail_sec;
 
+#ifdef CONFIG_KEVENT_POLL
+	kevent_storage_init(KEVENT_POLL, KEVENT_MASK_EMPTY, f, &f->st);
+#endif
 	eventpoll_init_file(f);
 	atomic_set(&f->f_count, 1);
 	f->f_uid = current->fsuid;
@@ -135,6 +138,9 @@ void fastcall __fput(struct file *file)
 	might_sleep();
 
 	fsnotify_close(file);
+#ifdef CONFIG_KEVENT_POLL
+	kevent_storage_fini(&file->st);
+#endif
 	/*
 	 * The function eventpoll_release() should be the first called
 	 * in the file cleanup chain.
diff --git a/fs/inode.c b/fs/inode.c
index d8d04bd..185bd67 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -22,6 +22,7 @@
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
 #include <linux/inotify.h>
+#include <linux/kevent.h>
 
 /*
  * This is needed for the following functions:
@@ -165,6 +166,9 @@ static struct inode *alloc_inode(struct 
 		}
 		memset(&inode->u, 0, sizeof(inode->u));
 		inode->i_mapping = mapping;
+#if defined CONFIG_KEVENT_INODE || defined CONFIG_KEVENT_SOCKET
+		kevent_storage_init(KEVENT_INODE, KEVENT_MASK_EMPTY, inode, &inode->st);
+#endif
 	}
 	return inode;
 }
@@ -173,6 +177,9 @@ void destroy_inode(struct inode *inode) 
 {
 	if (inode_has_buffers(inode))
 		BUG();
+#if defined CONFIG_KEVENT_INODE || defined CONFIG_KEVENT_SOCKET
+	kevent_storage_fini(&inode->st);
+#endif
 	security_inode_free(inode);
 	if (inode->i_sb->s_op->destroy_inode)
 		inode->i_sb->s_op->destroy_inode(inode);
diff --git a/include/asm-i386/socket.h b/include/asm-i386/socket.h
index 802ae76..3473f5c 100644
--- a/include/asm-i386/socket.h
+++ b/include/asm-i386/socket.h
@@ -49,4 +49,6 @@
 
 #define SO_PEERSEC		31
 
+#define SO_ASYNC_SOCK		34
+
 #endif /* _ASM_SOCKET_H */
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 0f92e78..2075f4b 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -299,8 +299,12 @@
 #define __NR_inotify_init	291
 #define __NR_inotify_add_watch	292
 #define __NR_inotify_rm_watch	293
+#define __NR_aio_recv		294
+#define __NR_aio_send		295
+#define __NR_aio_sendfile	296
+#define __NR_kevent_ctl		297
 
-#define NR_syscalls 294
+#define NR_syscalls 298
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-x86_64/ia32_unistd.h b/include/asm-x86_64/ia32_unistd.h
index d5166ec..9fba3a0 100644
--- a/include/asm-x86_64/ia32_unistd.h
+++ b/include/asm-x86_64/ia32_unistd.h
@@ -299,7 +299,11 @@
 #define __NR_ia32_inotify_init		291
 #define __NR_ia32_inotify_add_watch	292
 #define __NR_ia32_inotify_rm_watch	293
+#define __NR_ia32_aio_recv		294
+#define __NR_ia32_aio_send		295
+#define __NR_ia32_aio_sendfile		296
+#define __NR_ia32_kevent_ctl		297
 
-#define IA32_NR_syscalls 294	/* must be > than biggest syscall! */
+#define IA32_NR_syscalls 298	/* must be > than biggest syscall! */
 
 #endif /* _ASM_X86_64_IA32_UNISTD_H_ */
diff --git a/include/asm-x86_64/socket.h b/include/asm-x86_64/socket.h
index f2cdbea..1f31f86 100644
--- a/include/asm-x86_64/socket.h
+++ b/include/asm-x86_64/socket.h
@@ -49,4 +49,6 @@
 
 #define SO_PEERSEC             31
 
+#define SO_ASYNC_SOCK		34
+
 #endif /* _ASM_SOCKET_H */
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 2c42150..1f23f91 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -571,8 +571,16 @@ __SYSCALL(__NR_inotify_init, sys_inotify
 __SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch)
 #define __NR_inotify_rm_watch	255
 __SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
+#define __NR_aio_recv		256
+__SYSCALL(__NR_aio_recv, sys_aio_recv)
+#define __NR_aio_send		257
+__SYSCALL(__NR_aio_send, sys_aio_send)
+#define __NR_aio_sendfile	258
+__SYSCALL(__NR_aio_sendfile, sys_aio_sendfile)
+#define __NR_kevent_ctl		259
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
 
-#define __NR_syscall_max __NR_inotify_rm_watch
+#define __NR_syscall_max __NR_kevent_ctl
 #ifndef __NO_STUBS
 
 /* user-visible error numbers are in the range -1 - -4095 */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cc35b6a..6566600 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -224,6 +224,9 @@ extern int dir_notify_enable;
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
 #include <asm/byteorder.h>
+#ifdef CONFIG_KEVENT
+#include <linux/kevent_storage.h>
+#endif
 
 struct iovec;
 struct nameidata;
@@ -483,6 +486,10 @@ struct inode {
 	struct semaphore	inotify_sem;	/* protects the watches list */
 #endif
 
+#ifdef CONFIG_KEVENT_INODE
+	struct kevent_storage	st;
+#endif
+
 	unsigned long		i_state;
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
 
@@ -616,6 +623,9 @@ struct file {
 	struct list_head	f_ep_links;
 	spinlock_t		f_ep_lock;
 #endif /* #ifdef CONFIG_EPOLL */
+#ifdef CONFIG_KEVENT_POLL
+	struct kevent_storage	st;
+#endif
 	struct address_space	*f_mapping;
 };
 extern spinlock_t files_lock;
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 03b8e79..85449ac 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -15,6 +15,7 @@
 
 #include <linux/dnotify.h>
 #include <linux/inotify.h>
+#include <linux/kevent.h>
 
 /*
  * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
@@ -56,6 +57,7 @@ static inline void fsnotify_nameremove(s
 		isdir = IN_ISDIR;
 	dnotify_parent(dentry, DN_DELETE);
 	inotify_dentry_parent_queue_event(dentry, IN_DELETE|isdir, 0, dentry->d_name.name);
+	kevent_inode_notify_parent(dentry, KEVENT_INODE_REMOVE);
 }
 
 /*
@@ -65,6 +67,7 @@ static inline void fsnotify_inoderemove(
 {
 	inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL);
 	inotify_inode_is_dead(inode);
+	kevent_inode_remove(inode);
 }
 
 /*
@@ -74,6 +77,7 @@ static inline void fsnotify_create(struc
 {
 	inode_dir_notify(inode, DN_CREATE);
 	inotify_inode_queue_event(inode, IN_CREATE, 0, name);
+	kevent_inode_notify(inode, KEVENT_INODE_CREATE);
 }
 
 /*
@@ -83,6 +87,7 @@ static inline void fsnotify_mkdir(struct
 {
 	inode_dir_notify(inode, DN_CREATE);
 	inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, name);
+	kevent_inode_notify(inode, KEVENT_INODE_CREATE);
 }
 
 /*
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 0000000..64926bc
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,268 @@
+/*
+ * 	kevent.h
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+
+/*
+ * Kevent request flags.
+ */
+
+#define KEVENT_REQ_ONESHOT	0x1		/* Process this event only once and then dequeue. */
+
+/*
+ * Kevent return flags.
+ */
+#define KEVENT_RET_BROKEN	0x1		/* Kevent is broken. */
+#define KEVENT_RET_DONE		0x2		/* Kevent processing was finished successfully. */
+
+/*
+ * Kevent type set.
+ */
+enum {
+	KEVENT_SOCKET = 0,
+	KEVENT_INODE,
+	KEVENT_TIMER,
+	KEVENT_POLL,
+	KEVENT_NAIO,
+
+	KEVENT_MAX,
+};
+
+/*
+ * Per-type event sets.
+ * The number of per-type event sets must exactly match the number of kevent types.
+ */
+
+/*
+ * Timer events.
+ */
+enum {
+	KEVENT_TIMER_FIRED 	= 0x1,
+};
+
+/*
+ * Socket/network asynchronous IO events.
+ */
+enum {
+	KEVENT_SOCKET_RECV 	= 0x1,
+	KEVENT_SOCKET_ACCEPT	= 0x2,
+	KEVENT_SOCKET_SEND	= 0x4,
+};
+
+/*
+ * Inode events.
+ */
+enum {
+	KEVENT_INODE_CREATE	= 0x1,
+	KEVENT_INODE_REMOVE	= 0x2,
+};
+
+/*
+ * Poll events.
+ */
+enum {
+	KEVENT_POLL_POLLIN	= 0x0001,
+	KEVENT_POLL_POLLPRI	= 0x0002,
+	KEVENT_POLL_POLLOUT	= 0x0004,
+	KEVENT_POLL_POLLERR	= 0x0008,
+	KEVENT_POLL_POLLHUP	= 0x0010,
+	KEVENT_POLL_POLLNVAL	= 0x0020,
+
+	KEVENT_POLL_POLLRDNORM	= 0x0040,
+	KEVENT_POLL_POLLRDBAND	= 0x0080,
+	KEVENT_POLL_POLLWRNORM	= 0x0100,
+	KEVENT_POLL_POLLWRBAND	= 0x0200,
+	KEVENT_POLL_POLLMSG	= 0x0400,
+	KEVENT_POLL_POLLREMOVE	= 0x1000,
+};
+
+#define KEVENT_MASK_ALL		0xffffffff	/* Mask of all possible event values. */
+#define KEVENT_MASK_EMPTY	0x0		/* Empty mask of ready events. */
+
+struct kevent_id
+{
+	__u32		raw[2];
+};
+
+struct ukevent
+{
+	struct kevent_id	id;			/* Id of this request, e.g. socket number, file descriptor and so on... */
+	__u32			type;			/* Event type, e.g. KEVENT_SOCKET, KEVENT_INODE, KEVENT_TIMER and so on... */
+	__u32			event;			/* Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED... */
+	__u32			req_flags;		/* Per-event request flags */
+	__u32			ret_flags;		/* Per-event return flags */
+	__u32			ret_data[2];		/* Event return data. Event originator fills it with anything it likes. */
+	union {
+		__u32		user[2];		/* User's data. It is not used, just copied to/from user. */
+		void		*ptr;
+	};
+};
+
+enum {
+	KEVENT_CTL_ADD = 0,
+	KEVENT_CTL_REMOVE,
+	KEVENT_CTL_MODIFY,
+	KEVENT_CTL_WAIT,
+	KEVENT_CTL_INIT,
+};
+
+struct kevent_user_control
+{
+	unsigned int		cmd;			/* Control command, e.g. KEVENT_ADD, KEVENT_REMOVE... */
+	unsigned int		num;			/* Number of ukevents this structure controls. */
+	unsigned int		timeout;		/* Timeout in milliseconds waiting for "num" events to become ready. */
+};
+
+#define KEVENT_USER_SYMBOL	'K'
+#define KEVENT_USER_CTL		_IOWR(KEVENT_USER_SYMBOL, 0, struct kevent_user_control)
+#define KEVENT_USER_WAIT	_IOWR(KEVENT_USER_SYMBOL, 1, struct kevent_user_control)
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/kevent_storage.h>
+#include <asm/semaphore.h>
+
+struct inode;
+struct dentry;
+struct sock;
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+struct kevent
+{
+	struct ukevent		event;
+	spinlock_t		lock;			/* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+
+	struct list_head	kevent_entry;		/* Entry of user's queue. */
+	struct list_head	storage_entry;		/* Entry of origin's queue. */
+	struct list_head	ready_entry;		/* Entry of user's ready queue. */
+
+	struct kevent_user	*user;			/* User who requested this kevent. */
+	struct kevent_storage	*st;			/* Kevent container. */
+
+	kevent_callback_t	callback;		/* Called each time a new event has been caught. */
+	kevent_callback_t	enqueue;		/* Called each time a new kevent is queued. */
+	kevent_callback_t	dequeue;		/* Called each time a kevent is dequeued. */
+
+	void			*priv;			/* Private data for different storages.
+							 * The poll()/select() storage keeps a list of wait_queue_t
+							 * containers here, one for each poll_wait() call made from ->poll().
+							 */
+};
+
+#define KEVENT_HASH_MASK	0xff
+
+struct kevent_list
+{
+	struct list_head	kevent_list;		/* List of all kevents. */
+	spinlock_t 		kevent_lock;		/* Protects all manipulations with queue of kevents. */
+};
+
+struct kevent_user
+{
+	struct kevent_list	kqueue[KEVENT_HASH_MASK+1];
+	unsigned int		kevent_num;		/* Number of queued kevents. */
+
+	struct list_head	ready_list;		/* List of ready kevents. */
+	unsigned int		ready_num;		/* Number of ready kevents. */
+	spinlock_t 		ready_lock;		/* Protects all manipulations with ready queue. */
+
+	unsigned int		max_ready_num;		/* Requested number of kevents. */
+
+	struct semaphore	ctl_mutex;		/* Protects against simultaneous kevent_user control manipulations. */
+	struct semaphore	wait_mutex;		/* Protects against simultaneous kevent_user waits. */
+	wait_queue_head_t	wait;			/* Wait until some events are ready. */
+
+	atomic_t		refcnt;			/* Reference counter, increased for each new kevent. */
+};
+
+#define KEVENT_MAX_REQUESTS		(PAGE_SIZE/sizeof(struct kevent))
+
+struct kevent *kevent_alloc(gfp_t mask);
+void kevent_free(struct kevent *k);
+int kevent_enqueue(struct kevent *k);
+int kevent_dequeue(struct kevent *k);
+int kevent_init(struct kevent *k);
+void kevent_requeue(struct kevent *k);
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k);
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k);
+
+#define list_for_each_entry_reverse_safe(pos, n, head, member)			\
+	for (pos = list_entry((head)->prev, typeof(*pos), member),	\
+		n = list_entry(pos->member.prev, typeof(*pos), member);	\
+	     prefetch(pos->member.prev), &pos->member != (head); 	\
+	     pos = n, n = list_entry(pos->member.prev, typeof(*pos), member))
+
+int kevent_break(struct kevent *k);
+int kevent_init(struct kevent *k);
+
+int kevent_init_socket(struct kevent *k);
+int kevent_init_inode(struct kevent *k);
+int kevent_init_timer(struct kevent *k);
+int kevent_init_poll(struct kevent *k);
+int kevent_init_naio(struct kevent *k);
+
+void kevent_storage_ready(struct kevent_storage *st, kevent_callback_t ready_callback, u32 event);
+int kevent_storage_init(__u32 type, __u32 event, void *origin, struct kevent_storage *st);
+void kevent_storage_fini(struct kevent_storage *st);
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u);
+
+#ifdef CONFIG_KEVENT_INODE
+void kevent_inode_notify(struct inode *inode, u32 event);
+void kevent_inode_notify_parent(struct dentry *dentry, u32 event);
+void kevent_inode_remove(struct inode *inode);
+#else
+static inline void kevent_inode_notify(struct inode *inode, u32 event)
+{
+}
+static inline void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
+{
+}
+static inline void kevent_inode_remove(struct inode *inode)
+{
+}
+#endif /* CONFIG_KEVENT_INODE */
+#ifdef CONFIG_KEVENT_SOCKET
+
+enum {
+	KEVENT_SOCKET_FLAGS_ASYNC = 0,
+	KEVENT_SOCKET_FLAGS_INUSE,
+};
+
+void kevent_socket_notify(struct sock *sock, u32 event);
+int kevent_socket_dequeue(struct kevent *k);
+int kevent_socket_enqueue(struct kevent *k);
+#define sock_async(__sk)	test_bit(KEVENT_SOCKET_FLAGS_ASYNC, &__sk->sk_kevent_flags)
+#else
+static inline void kevent_socket_notify(struct sock *sock, u32 event)
+{
+}
+#define sock_async(__sk)	0
+#endif
+#endif /* __KERNEL__ */
+#endif /* __KEVENT_H */
diff --git a/include/linux/kevent_storage.h b/include/linux/kevent_storage.h
new file mode 100644
index 0000000..7c68170
--- /dev/null
+++ b/include/linux/kevent_storage.h
@@ -0,0 +1,21 @@
+#ifndef __KEVENT_STORAGE_H
+#define __KEVENT_STORAGE_H
+
+struct kevent_storage
+{
+	__u32			type;			/* Event type, e.g. KEVENT_SOCKET, KEVENT_INODE, KEVENT_TIMER and so on... */
+	__u32			event;			/* Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED, 
+							 * which were NOT updated by any kevent in origin's queue,
+							 * i.e. when new event happens and there are no requests in
+							 * origin's queue for that event, it will be placed here.
+							 * New events are ORed with the old ones, so when a new kevent is being added
+							 * into origin's queue, it just needs to check whether the requested event
+							 * is in this mask, and if so, return a positive value from ->enqueue().
+							 */
+	void			*origin;		/* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */
+	struct list_head	list;			/* List of queued kevents. */
+	unsigned int		qlen;			/* Number of queued kevents. */
+	spinlock_t		lock;			/* Protects users queue. */
+};
+
+#endif /* __KEVENT_STORAGE_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8c5d600..8dbd67d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1232,6 +1232,8 @@ extern struct sk_buff *skb_recv_datagram
 					 int noblock, int *err);
 extern unsigned int    datagram_poll(struct file *file, struct socket *sock,
 				     struct poll_table_struct *wait);
+extern int	       skb_copy_datagram(const struct sk_buff *from, 
+					 int offset, void *dst, int size);
 extern int	       skb_copy_datagram_iovec(const struct sk_buff *from,
 					       int offset, struct iovec *to,
 					       int size);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c7007b1..5e1e872 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -512,4 +512,8 @@ asmlinkage long sys_ioprio_get(int which
 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 					unsigned long maxnode);
 
+asmlinkage long sys_aio_recv(int ctl_fd, int s, void __user *buf, size_t size, unsigned flags);
+asmlinkage long sys_aio_send(int ctl_fd, int s, void __user *buf, size_t size, unsigned flags);
+asmlinkage long sys_aio_sendfile(int ctl_fd, int fd, int s, size_t size, unsigned flags);
+asmlinkage long sys_kevent_ctl(int ctl_fd, void __user *buf);
 #endif
diff --git a/include/net/sock.h b/include/net/sock.h
index 982b4ec..d571c3f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -48,6 +48,7 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>	/* struct sk_buff */
 #include <linux/security.h>
+#include <linux/kevent.h>
 
 #include <linux/filter.h>
 
@@ -212,6 +213,9 @@ struct sock {
 	int			sk_route_caps;
 	unsigned long 		sk_flags;
 	unsigned long	        sk_lingertime;
+#ifdef CONFIG_KEVENT_SOCKET
+	unsigned long		sk_kevent_flags;
+#endif
 	/*
 	 * The backlog queue is special, it is always used with
 	 * the per-socket spinlock held and requires low latency
@@ -444,6 +448,21 @@ static inline int sk_stream_memory_free(
 
 extern void sk_stream_rfree(struct sk_buff *skb);
 
+struct socket_alloc {
+	struct socket socket;
+	struct inode vfs_inode;
+};
+
+static inline struct socket *SOCKET_I(struct inode *inode)
+{
+	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
+}
+
+static inline struct inode *SOCK_INODE(struct socket *socket)
+{
+	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
+}
+
 static inline void sk_stream_set_owner_r(struct sk_buff *skb, struct sock *sk)
 {
 	skb->sk = sk;
@@ -470,6 +489,7 @@ static inline void sk_add_backlog(struct
 		sk->sk_backlog.tail = skb;
 	}
 	skb->next = NULL;
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 }
 
 #define sk_wait_event(__sk, __timeo, __condition)		\
@@ -532,6 +552,12 @@ struct proto {
 
 	int			(*backlog_rcv) (struct sock *sk, 
 						struct sk_buff *skb);
+	
+	int			(*async_recv) (struct sock *sk, 
+						void *dst, size_t size);
+	int			(*async_send) (struct sock *sk, 
+						struct page **pages, unsigned int poffset, 
+						size_t size);
 
 	/* Keeping track of sk's, looking them up, and port selection methods. */
 	void			(*hash)(struct sock *sk);
@@ -664,21 +690,6 @@ static inline struct kiocb *siocb_to_kio
 	return si->kiocb;
 }
 
-struct socket_alloc {
-	struct socket socket;
-	struct inode vfs_inode;
-};
-
-static inline struct socket *SOCKET_I(struct inode *inode)
-{
-	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
-}
-
-static inline struct inode *SOCK_INODE(struct socket *socket)
-{
-	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
-}
-
 extern void __sk_stream_mem_reclaim(struct sock *sk);
 extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d78025f..8788625 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -392,6 +392,8 @@ extern int			tcp_setsockopt(struct sock 
 					       int optname, char __user *optval, 
 					       int optlen);
 extern void			tcp_set_keepalive(struct sock *sk, int val);
+extern int			tcp_async_recv(struct sock *sk, void *dst, size_t size);
+extern int			tcp_async_send(struct sock *sk, struct page **pages, unsigned int poffset, size_t size);
 extern int			tcp_recvmsg(struct kiocb *iocb, struct sock *sk,
 					    struct msghdr *msg,
 					    size_t len, int nonblock, 
@@ -950,6 +952,7 @@ static __inline__ int tcp_prequeue(struc
 			tp->ucopy.memory = 0;
 		} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
 			wake_up_interruptible(sk->sk_sleep);
+			kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 			if (!inet_csk_ack_scheduled(sk))
 				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
 						          (3 * TCP_RTO_MIN) / 4,
diff --git a/init/Kconfig b/init/Kconfig
index 9fc0759..4a4f26c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -224,6 +224,8 @@ config KOBJECT_UEVENT
 	  Say Y, unless you are building a system requiring minimal memory
 	  consumption.
 
+source "kernel/kevent/Kconfig"
+
 config IKCONFIG
 	bool "Kernel .config support"
 	---help---
diff --git a/kernel/Makefile b/kernel/Makefile
index 4f5a145..7c5aa88 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_KEVENT) += kevent/
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/kevent/Kconfig b/kernel/kevent/Kconfig
new file mode 100644
index 0000000..a52a86f
--- /dev/null
+++ b/kernel/kevent/Kconfig
@@ -0,0 +1,39 @@
+config KEVENT
+	bool "Kernel event notification mechanism"
+	help
+	  This option enables the event queue mechanism.
+	  It can be used as a replacement for poll()/select(), for AIO callback
+	  invocations, advanced timer notifications and notifications of other
+	  kernel object status changes.
+
+config KEVENT_SOCKET
+	bool "Kernel event notifications for sockets"
+	depends on NET && KEVENT
+	help
+	  This option enables notifications through the KEVENT subsystem of
+	  socket operations, such as new packet arrival and ready-for-accept
+	  conditions.
+
+config KEVENT_INODE
+	bool "Kernel event notifications for inodes"
+	depends on KEVENT
+	help
+	  This option enables notifications through the KEVENT subsystem of
+	  inode operations, such as file creation and removal.
+
+config KEVENT_TIMER
+	bool "Kernel event notifications for timers"
+	depends on KEVENT
+	help
+	  This option allows timers to be used through the KEVENT subsystem.
+
+config KEVENT_POLL
+	bool "Kernel event notifications for poll()/select()"
+	depends on KEVENT
+	help
+	  This option allows the kevent subsystem to be used for poll()/select()
+	  notifications.
+
+config KEVENT_NAIO
+	bool "Network asynchronous IO"
+	depends on KEVENT_SOCKET
+	help
+	  This option enables the kevent-based network asynchronous IO subsystem.
diff --git a/kernel/kevent/Makefile b/kernel/kevent/Makefile
new file mode 100644
index 0000000..2bc7135
--- /dev/null
+++ b/kernel/kevent/Makefile
@@ -0,0 +1,6 @@
+obj-y := kevent.o kevent_user.o kevent_init.o
+obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o
+obj-$(CONFIG_KEVENT_INODE) += kevent_inode.o
+obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o
+obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o
+obj-$(CONFIG_KEVENT_NAIO) += kevent_naio.o
diff --git a/kernel/kevent/kevent.c b/kernel/kevent/kevent.c
new file mode 100644
index 0000000..5c3a141
--- /dev/null
+++ b/kernel/kevent/kevent.c
@@ -0,0 +1,301 @@
+/*
+ * 	kevent.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/kevent.h>
+
+static kmem_cache_t *kevent_cache;
+
+/*
+ * Attempts to add an event into the appropriate origin's queue.
+ * Returns a positive value if this event is ready immediately,
+ * a negative value in case of error, and zero if the event has been queued.
+ * ->enqueue() callback must increase origin's reference counter.
+ */
+int kevent_enqueue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -E2BIG;
+
+	if (!k->enqueue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+	
+	return k->enqueue(k);
+}
+
+/*
+ * Remove event from the appropriate queue.
+ * ->dequeue() callback must decrease origin's reference counter.
+ */
+int kevent_dequeue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -E2BIG;
+	
+	if (!k->dequeue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+
+	return k->dequeue(k);
+}
+
+/*
+ * Must be called before the event is added into some origin's queue.
+ * Initializes the ->enqueue(), ->dequeue() and ->callback() callbacks.
+ * If it fails, the kevent must not be used; kevent_enqueue() would fail to add
+ * this kevent into the origin's queue and would set the
+ * KEVENT_RET_BROKEN flag in kevent->event.ret_flags.
+ */
+int kevent_init(struct kevent *k)
+{
+	int err;
+
+	spin_lock_init(&k->lock);
+	k->kevent_entry.next = LIST_POISON1;
+	k->storage_entry.next = LIST_POISON1;
+	k->ready_entry.next = LIST_POISON1;
+
+	if (k->event.type >= KEVENT_MAX)
+		return -E2BIG;
+	
+	switch (k->event.type) {
+		case KEVENT_NAIO:
+			err = kevent_init_naio(k);
+			break;
+		case KEVENT_SOCKET:
+			err = kevent_init_socket(k);
+			break;
+		case KEVENT_INODE:
+			err = kevent_init_inode(k);
+			break;
+		case KEVENT_TIMER:
+			err = kevent_init_timer(k);
+			break;
+		case KEVENT_POLL:
+			err = kevent_init_poll(k);
+			break;
+		default:
+			err = -ENODEV;
+	}
+
+	return err;
+}
+
+/*
+ * Checks whether "event" is requested by the given kevent and, if so, sets up
+ * the kevent's ret_data. Also updates the storage's mask of pending events.
+ * Returns the set of user-requested events which are ready.
+ */
+static inline u32 kevent_set_event(struct kevent_storage *st, struct kevent *k, u32 event)
+{
+	u32 ev = event & k->event.event;
+
+	st->event &= ~ev;
+#if 0
+	if (ev)
+		k->event.ret_data[1] = ev;
+#endif
+	return ev;
+}
+
+/*
+ * Called from ->enqueue() callback when reference counter for given
+ * origin (socket, inode...) has been increased.
+ */
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+	u32 ev;
+
+	k->st = st;
+	spin_lock_irqsave(&st->lock, flags);
+	
+	spin_lock(&k->lock);
+	ev = kevent_set_event(st, k, st->event);
+	spin_unlock(&k->lock);
+	
+	list_add_tail(&k->storage_entry, &st->list);
+	st->qlen++;
+	
+	spin_unlock_irqrestore(&st->lock, flags);
+#if 0
+	if (ev) {
+		spin_lock_irqsave(&k->user->ready_lock, flags);
+		list_add_tail(&k->ready_entry, &k->user->ready_list);
+		k->user->ready_num++;
+		spin_unlock_irqrestore(&k->user->ready_lock, flags);
+		wake_up(&k->user->wait);
+	}
+#endif	
+	return !!ev;
+}
+
+/*
+ * Dequeue a kevent from the origin's queue.
+ * It does not decrease the origin's reference counter in any way; it must be
+ * called before the counter is dropped, so the storage itself is still valid.
+ * It is called from the ->dequeue() callback.
+ */
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&st->lock, flags);
+	if (k->storage_entry.next != LIST_POISON1) {
+		list_del(&k->storage_entry);
+		st->qlen--;
+	}
+	spin_unlock_irqrestore(&st->lock, flags);
+}
+
+static void __kevent_requeue(struct kevent *k, u32 event)
+{
+	int err, rem = 0;
+		
+	wake_up(&k->user->wait);
+
+	err = k->callback(k);
+
+	spin_lock(&k->lock);
+	if (err > 0) {
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	} else if (err < 0) {
+		k->event.ret_flags |= KEVENT_RET_BROKEN;
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	}
+	rem = (k->event.req_flags & KEVENT_REQ_ONESHOT);
+	spin_unlock(&k->lock);
+
+	if (err) {
+		if (rem) {
+			list_del(&k->storage_entry);
+			k->st->qlen--;
+		}
+		spin_lock(&k->user->ready_lock);
+		if (k->ready_entry.next == LIST_POISON1) {
+			list_add_tail(&k->ready_entry, &k->user->ready_list);
+			k->user->ready_num++;
+		}
+		spin_unlock(&k->user->ready_lock);
+	}
+}
+
+void kevent_requeue(struct kevent *k)
+{
+	unsigned long flags;
+	
+	spin_lock_irqsave(&k->st->lock, flags);
+	__kevent_requeue(k, 0);
+	spin_unlock_irqrestore(&k->st->lock, flags);
+}
+
+/*
+ * Called each time some activity in origin (socket, inode...) is noticed.
+ */
+void kevent_storage_ready(struct kevent_storage *st, kevent_callback_t ready_callback, u32 event)
+{
+	//unsigned long flags;
+	struct kevent *k, *n;
+	unsigned int qlen;
+	u32 ev = 0;
+
+	spin_lock_bh(&st->lock);
+	st->event |= event;
+	qlen = st->qlen;
+	
+	if (qlen) {
+		list_for_each_entry_safe(k, n, &st->list, storage_entry) {
+			if (qlen-- <= 0)
+				break;
+
+			if (ready_callback)
+				ready_callback(k);
+
+			ev |= (event & k->event.event);
+
+			if (event & k->event.event)
+				__kevent_requeue(k, event);
+		}
+	}
+	
+	st->event &= ~ev;
+	spin_unlock_bh(&st->lock);
+}
+
+int kevent_storage_init(__u32 type, __u32 event, void *origin, struct kevent_storage *st)
+{
+	spin_lock_init(&st->lock);
+	st->type = type;
+	st->event = event;
+	st->origin = origin;
+	st->qlen = 0;
+	INIT_LIST_HEAD(&st->list);
+	return 0;
+}
+
+void kevent_storage_fini(struct kevent_storage *st)
+{
+	kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL);
+}
+
+struct kevent *kevent_alloc(gfp_t mask)
+{
+	struct kevent *k;
+	
+	if (kevent_cache)
+		k = kmem_cache_alloc(kevent_cache, mask);
+	else
+		k = kzalloc(sizeof(struct kevent), mask);
+
+	return k;
+}
+
+void kevent_free(struct kevent *k)
+{
+	if (kevent_cache)
+		kmem_cache_free(kevent_cache, k);
+	else
+		kfree(k);
+}
+
+int __init kevent_sys_init(void)
+{
+	int err = 0;
+
+	kevent_cache = kmem_cache_create("kevent_cache", sizeof(struct kevent),
+			0, 0, NULL, NULL);
+	if (!kevent_cache)
+		err = -ENOMEM;
+	
+	return err;
+}
+
+late_initcall(kevent_sys_init);
diff --git a/kernel/kevent/kevent_init.c b/kernel/kevent/kevent_init.c
new file mode 100644
index 0000000..74659df
--- /dev/null
+++ b/kernel/kevent/kevent_init.c
@@ -0,0 +1,77 @@
+/*
+ * 	kevent_init.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/kevent.h>
+
+int kevent_break(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->lock, flags);
+	k->event.ret_flags |= KEVENT_RET_BROKEN;
+	spin_unlock_irqrestore(&k->lock, flags);
+	return 0;
+}
+
+#ifndef CONFIG_KEVENT_SOCKET
+int kevent_init_socket(struct kevent *k)
+{
+	kevent_break(k);
+	return -ENODEV;
+}
+#endif
+
+#ifndef CONFIG_KEVENT_INODE
+int kevent_init_inode(struct kevent *k)
+{
+	kevent_break(k);
+	return -ENODEV;
+}
+#endif
+
+#ifndef CONFIG_KEVENT_TIMER
+int kevent_init_timer(struct kevent *k)
+{
+	kevent_break(k);
+	return -ENODEV;
+}
+#endif
+
+#ifndef CONFIG_KEVENT_POLL
+int kevent_init_poll(struct kevent *k)
+{
+	kevent_break(k);
+	return -ENODEV;
+}
+#endif
+
+#ifndef CONFIG_KEVENT_NAIO
+int kevent_init_naio(struct kevent *k)
+{
+	kevent_break(k);
+	return -ENODEV;
+}
+#endif
diff --git a/kernel/kevent/kevent_inode.c b/kernel/kevent/kevent_inode.c
new file mode 100644
index 0000000..b6d12f6
--- /dev/null
+++ b/kernel/kevent/kevent_inode.c
@@ -0,0 +1,110 @@
+/*
+ * 	kevent_inode.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/fs.h>
+
+static int kevent_inode_enqueue(struct kevent *k)
+{
+	struct file *file;
+	struct inode *inode;
+	int err, fput_needed;
+
+	file = fget_light(k->event.id.raw[0], &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_dentry || !file->f_dentry->d_inode)
+		goto err_out_fput;
+	
+	inode = igrab(file->f_dentry->d_inode);
+	if (!inode)
+		goto err_out_fput;
+
+	err = kevent_storage_enqueue(&inode->st, k);
+	if (err < 0)
+		goto err_out_iput;
+
+	fput_light(file, fput_needed);
+	return 0;
+
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	fput_light(file, fput_needed);
+	return err;
+}
+
+static int kevent_inode_dequeue(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+
+	kevent_storage_dequeue(k->st, k);
+	iput(inode);
+
+	return 0;
+}
+
+static int kevent_inode_callback(struct kevent *k)
+{
+	return 1;
+}
+
+int kevent_init_inode(struct kevent *k)
+{
+	k->enqueue = &kevent_inode_enqueue;
+	k->dequeue = &kevent_inode_dequeue;
+	k->callback = &kevent_inode_callback;
+	return 0;
+}
+
+void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
+{
+	struct dentry *parent;
+	struct inode *inode;
+	
+	spin_lock(&dentry->d_lock);
+	parent = dentry->d_parent;
+	inode = parent->d_inode;
+
+	dget(parent);
+	spin_unlock(&dentry->d_lock);
+	kevent_inode_notify(inode, KEVENT_INODE_REMOVE);
+	dput(parent);
+}
+	
+void kevent_inode_remove(struct inode *inode)
+{
+	kevent_storage_fini(&inode->st);
+}
+	
+void kevent_inode_notify(struct inode *inode, u32 event)
+{
+	kevent_storage_ready(&inode->st, NULL, event);
+}
diff --git a/kernel/kevent/kevent_naio.c b/kernel/kevent/kevent_naio.c
new file mode 100644
index 0000000..004b292
--- /dev/null
+++ b/kernel/kevent/kevent_naio.c
@@ -0,0 +1,238 @@
+/*
+ * 	kevent_naio.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+static int kevent_naio_enqueue(struct kevent *k);
+static int kevent_naio_dequeue(struct kevent *k);
+static int kevent_naio_callback(struct kevent *k);
+
+static int kevent_naio_setup_aio(int ctl_fd, int s, void __user *buf, size_t size, u32 event)
+{
+	struct kevent_user *u;
+	struct file *file;
+	int err, fput_needed;
+	struct ukevent uk;
+
+	file = fget_light(ctl_fd, &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	u = file->private_data;
+	if (!u) {
+		err = -EINVAL;
+		goto err_out_fput;
+	}
+
+	memset(&uk, 0, sizeof(struct ukevent));
+	uk.type = KEVENT_NAIO;
+	uk.ptr = buf;
+	uk.req_flags = KEVENT_REQ_ONESHOT;
+	uk.event = event;
+	uk.id.raw[0] = s;
+	uk.id.raw[1] = size;
+
+	err = kevent_user_add_ukevent(&uk, u);
+
+err_out_fput:
+	fput_light(file, fput_needed);
+	return err;
+}
+
+asmlinkage long sys_aio_recv(int ctl_fd, int s, void __user *buf, size_t size, unsigned flags)
+{
+	return kevent_naio_setup_aio(ctl_fd, s, buf, size, KEVENT_SOCKET_RECV);
+}
+
+asmlinkage long sys_aio_send(int ctl_fd, int s, void __user *buf, size_t size, unsigned flags)
+{
+	return kevent_naio_setup_aio(ctl_fd, s, buf, size, KEVENT_SOCKET_SEND);
+}
+
+asmlinkage long sys_aio_sendfile(int ctl_fd, int fd, int s, size_t size, unsigned flags)
+{
+	return 0;
+}
+
+static int kevent_naio_enqueue(struct kevent *k)
+{
+	int err, i;
+	struct page **page;
+	void *addr;
+	unsigned int size = k->event.id.raw[1];
+	int num = size/PAGE_SIZE + 1;
+	struct file *file;
+	struct sock *sk = NULL;
+	int fput_needed;
+
+	file = fget_light(k->event.id.raw[0], &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_dentry || !file->f_dentry->d_inode)
+		goto err_out_fput;
+
+	sk = SOCKET_I(file->f_dentry->d_inode)->sk;
+
+	err = -ESOCKTNOSUPPORT;
+	if (!sk || !sk->sk_prot->async_recv || !sk->sk_prot->async_send || 
+			!test_bit(KEVENT_SOCKET_FLAGS_ASYNC, &sk->sk_kevent_flags))
+		goto err_out_fput;
+
+	err = -ENOMEM;
+	page = kmalloc(sizeof(struct page *) * num, GFP_KERNEL);
+	if (!page)
+		goto err_out_fput;	/* drop the file reference taken above */
+
+	addr = k->event.ptr;
+
+	down_read(&current->mm->mmap_sem);
+	err = get_user_pages(current, current->mm, (unsigned long)addr, num, 1, 0, page, NULL);
+	up_read(&current->mm->mmap_sem);
+	if (err <= 0)
+		goto err_out_free;
+	num = err;
+
+	k->event.ret_data[0] = num;
+	k->event.ret_data[1] = offset_in_page(k->event.ptr);
+	k->priv = page;
+
+	sk->sk_allocation = GFP_ATOMIC;
+
+	spin_lock_bh(&sk->sk_lock.slock);
+	err = kevent_socket_enqueue(k);
+	spin_unlock_bh(&sk->sk_lock.slock);
+	if (err)
+		goto err_out_put_pages;
+
+	fput_light(file, fput_needed);
+
+	return err;
+
+err_out_put_pages:
+	for (i=0; i<num; ++i)
+		page_cache_release(page[i]);
+err_out_free:
+	kfree(page);
+err_out_fput:
+	fput_light(file, fput_needed);
+
+	return err;
+}
+
+static int kevent_naio_dequeue(struct kevent *k)
+{
+	int err, i, num;
+	struct page **page = k->priv;
+
+	num = k->event.ret_data[0];
+
+	err = kevent_socket_dequeue(k);
+
+	for (i=0; i<num; ++i)
+		page_cache_release(page[i]);
+
+	kfree(k->priv);
+	k->priv = NULL;
+
+	return err;
+}
+
+static int kevent_naio_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct sock *sk = SOCKET_I(inode)->sk;
+	unsigned int size = k->event.id.raw[1];
+	unsigned int off = k->event.ret_data[1];
+	struct page **pages = k->priv, *page;
+	int ready = 0, num = off/PAGE_SIZE, err = 0, send = 0;
+	void *ptr, *optr;
+	unsigned int len;
+
+	if (!test_bit(KEVENT_SOCKET_FLAGS_ASYNC, &sk->sk_kevent_flags))
+		return -1;
+
+	if (k->event.event & KEVENT_SOCKET_SEND)
+		send = 1;
+	else if (!(k->event.event & KEVENT_SOCKET_RECV))
+		return -EINVAL;
+
+	/*
+	 * sk_prot->async_*() can return either number of bytes processed,
+	 * or negative error value, or zero if socket is closed.
+	 */
+
+	if (!send) {
+		page = pages[num];
+
+		optr = ptr = kmap_atomic(page, KM_IRQ0);
+		if (!ptr)
+			return -ENOMEM;
+
+		ptr += off % PAGE_SIZE;
+		len = min_t(unsigned int, PAGE_SIZE - (ptr - optr), size);
+
+		err = sk->sk_prot->async_recv(sk, ptr, len);
+
+		kunmap_atomic(optr, KM_IRQ0);
+	} else {
+		len = size;
+		err = sk->sk_prot->async_send(sk, pages, off, size);
+	}
+
+	if (err > 0) {
+		num++;
+		size -= err;
+		off += err;
+	}
+
+	k->event.ret_data[1] = off;
+	k->event.id.raw[1] = size;
+
+	if (err == 0 || (err < 0 && err != -EAGAIN))
+		ready = -1;
+
+	if (!size)
+		ready = 1;
+#if 0
+	printk("%s: sk=%p, k=%p, size=%4u, off=%4u, err=%3d, ready=%1d.\n",
+			__func__, sk, k, size, off, err, ready);
+#endif
+
+	return ready;
+}
+
+int kevent_init_naio(struct kevent *k)
+{
+	k->enqueue = &kevent_naio_enqueue;
+	k->dequeue = &kevent_naio_dequeue;
+	k->callback = &kevent_naio_callback;
+	return 0;
+}
diff --git a/kernel/kevent/kevent_poll.c b/kernel/kevent/kevent_poll.c
new file mode 100644
index 0000000..12b06bb
--- /dev/null
+++ b/kernel/kevent/kevent_poll.c
@@ -0,0 +1,213 @@
+/*
+ * 	kevent_poll.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+
+static kmem_cache_t *kevent_poll_container_cache;
+static kmem_cache_t *kevent_poll_priv_cache;
+
+struct kevent_poll_ctl
+{
+	struct poll_table_struct 	pt;
+	struct kevent			*k;
+};
+
+struct kevent_poll_wait_container
+{
+	struct list_head		container_entry;
+	wait_queue_head_t		*whead;
+	wait_queue_t			wait;
+	struct kevent			*k;
+};
+
+struct kevent_poll_private
+{
+	struct list_head		container_list;
+	spinlock_t			container_lock;
+};
+
+static int kevent_poll_wait_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct kevent_poll_wait_container *cont = container_of(wait, struct kevent_poll_wait_container, wait);
+	struct kevent *k = cont->k;
+	struct file *file = k->st->origin;
+	unsigned long flags;
+	u32 revents, event;
+
+	revents = file->f_op->poll(file, NULL);
+	spin_lock_irqsave(&k->lock, flags);
+	event = k->event.event;
+	spin_unlock_irqrestore(&k->lock, flags);
+
+	kevent_storage_ready(k->st, NULL, revents);
+
+	return 0;
+}
+
+static void kevent_poll_qproc(struct file *file, wait_queue_head_t *whead, struct poll_table_struct *poll_table)
+{
+	struct kevent *k = container_of(poll_table, struct kevent_poll_ctl, pt)->k;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *cont;
+	unsigned long flags;
+
+	cont = kmem_cache_alloc(kevent_poll_container_cache, SLAB_KERNEL);
+	if (!cont) {
+		kevent_break(k);
+		return;
+	}
+		
+	cont->k = k;
+	init_waitqueue_func_entry(&cont->wait, kevent_poll_wait_callback);
+	cont->whead = whead;
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_add_tail(&cont->container_entry, &priv->container_list);
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	add_wait_queue(whead, &cont->wait);
+}
+
+static int kevent_poll_enqueue(struct kevent *k)
+{
+	struct file *file;
+	int err, ready = 0;
+	unsigned int revents;
+	struct kevent_poll_ctl ctl;
+	struct kevent_poll_private *priv;
+
+	file = fget(k->event.id.raw[0]);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_op || !file->f_op->poll)
+		goto err_out_fput;
+
+	err = -ENOMEM;
+	priv = kmem_cache_alloc(kevent_poll_priv_cache, SLAB_KERNEL);
+	if (!priv)
+		goto err_out_fput;
+
+	spin_lock_init(&priv->container_lock);
+	INIT_LIST_HEAD(&priv->container_list);
+
+	k->priv = priv;
+
+	ctl.k = k;
+	init_poll_funcptr(&ctl.pt, &kevent_poll_qproc);
+
+	revents = file->f_op->poll(file, &ctl.pt);
+	if (revents & k->event.event)
+		ready = 1;
+
+	err = kevent_storage_enqueue(&file->st, k);
+	if (err < 0)
+		goto err_out_free;
+	
+	return ready;
+
+err_out_free:
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+static int kevent_poll_dequeue(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *w, *n;
+	unsigned long flags;
+
+	kevent_storage_dequeue(k->st, k);
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_for_each_entry_safe(w, n, &priv->container_list, container_entry) {
+		list_del(&w->container_entry);
+		remove_wait_queue(w->whead, &w->wait);
+		kmem_cache_free(kevent_poll_container_cache, w);
+	}
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+	
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+	k->priv = NULL;
+	
+	fput(file);
+
+	return 0;
+}
+
+static int kevent_poll_callback(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	unsigned int revents = file->f_op->poll(file, NULL);
+	return (revents & k->event.event);
+}
+
+int kevent_init_poll(struct kevent *k)
+{
+	if (!kevent_poll_container_cache || !kevent_poll_priv_cache)
+		return -ENOMEM;
+
+	k->enqueue = &kevent_poll_enqueue;
+	k->dequeue = &kevent_poll_dequeue;
+	k->callback = &kevent_poll_callback;
+	return 0;
+}
+
+
+static int __init kevent_poll_sys_init(void)
+{
+	kevent_poll_container_cache = kmem_cache_create("kevent_poll_container_cache", sizeof(struct kevent_poll_wait_container),
+			0, 0, NULL, NULL);
+	if (!kevent_poll_container_cache) {
+		printk(KERN_ERR "Failed to create kevent poll container cache.\n");
+		return -ENOMEM;
+	}
+	
+	kevent_poll_priv_cache = kmem_cache_create("kevent_poll_priv_cache", sizeof(struct kevent_poll_private),
+			0, 0, NULL, NULL);
+	if (!kevent_poll_priv_cache) {
+		printk(KERN_ERR "Failed to create kevent poll private data cache.\n");
+		kmem_cache_destroy(kevent_poll_container_cache);
+		kevent_poll_container_cache = NULL;
+		return -ENOMEM;
+	}
+
+	printk(KERN_INFO "Kevent poll()/select() subsystem has been initialized.\n");
+	return 0;
+}
+
+static void __exit kevent_poll_sys_fini(void)
+{
+	kmem_cache_destroy(kevent_poll_priv_cache);
+	kmem_cache_destroy(kevent_poll_container_cache);
+}
+
+module_init(kevent_poll_sys_init);
+module_exit(kevent_poll_sys_fini);
diff --git a/kernel/kevent/kevent_socket.c b/kernel/kevent/kevent_socket.c
new file mode 100644
index 0000000..d179ca8
--- /dev/null
+++ b/kernel/kevent/kevent_socket.c
@@ -0,0 +1,124 @@
+/*
+ * 	kevent_socket.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/tcp.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+#include <net/request_sock.h>
+#include <net/inet_connection_sock.h>
+
+static int kevent_socket_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct sock *sk = SOCKET_I(inode)->sk;
+	int rmem;
+	
+	if (k->event.event & KEVENT_SOCKET_RECV) {
+		int ret = 0;
+		
+		if ((rmem = atomic_read(&sk->sk_rmem_alloc)) > 0 || !skb_queue_empty(&sk->sk_receive_queue))
+			ret = 1;
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			ret = 1;
+		if (ret)
+			return ret;
+	}
+	if ((k->event.event & KEVENT_SOCKET_ACCEPT) && 
+			(!reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) || 
+			 	reqsk_queue_len_young(&inet_csk(sk)->icsk_accept_queue))) {
+		k->event.ret_data[1] = reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);
+		return 1;
+	}
+
+	return 0;
+}
+
+int kevent_socket_enqueue(struct kevent *k)
+{
+	struct file *file;
+	struct inode *inode;
+	int err, fput_needed;
+
+	file = fget_light(k->event.id.raw[0], &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_dentry || !file->f_dentry->d_inode)
+		goto err_out_fput;
+
+	inode = igrab(file->f_dentry->d_inode);
+	if (!inode)
+		goto err_out_fput;
+
+	err = kevent_storage_enqueue(&inode->st, k);
+	if (err < 0)
+		goto err_out_iput;
+
+	err = k->callback(k);
+	if (err)
+		goto err_out_dequeue;
+
+	fput_light(file, fput_needed);
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	fput_light(file, fput_needed);
+	return err;
+}
+
+int kevent_socket_dequeue(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+
+	kevent_storage_dequeue(k->st, k);
+	iput(inode);
+
+	return 0;
+}
+
+int kevent_init_socket(struct kevent *k)
+{
+	k->enqueue = &kevent_socket_enqueue;
+	k->dequeue = &kevent_socket_dequeue;
+	k->callback = &kevent_socket_callback;
+	return 0;
+}
+
+void kevent_socket_notify(struct sock *sk, u32 event)
+{
+	if (sk->sk_socket && !test_and_set_bit(KEVENT_SOCKET_FLAGS_INUSE, &sk->sk_kevent_flags)) {
+		kevent_storage_ready(&SOCK_INODE(sk->sk_socket)->st, NULL, event);
+		clear_bit(KEVENT_SOCKET_FLAGS_INUSE, &sk->sk_kevent_flags);
+	}
+}
diff --git a/kernel/kevent/kevent_timer.c b/kernel/kevent/kevent_timer.c
new file mode 100644
index 0000000..3ae05c2
--- /dev/null
+++ b/kernel/kevent/kevent_timer.c
@@ -0,0 +1,111 @@
+/*
+ * 	kevent_timer.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/jiffies.h>
+#include <linux/kevent.h>
+
+static void kevent_timer_func(unsigned long data)
+{
+	struct kevent *k = (struct kevent *)data;
+	struct timer_list *t = k->st->origin;
+
+	kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+	mod_timer(t, jiffies + msecs_to_jiffies(k->event.id.raw[0]));
+}
+
+static int kevent_timer_enqueue(struct kevent *k)
+{
+	struct timer_list *t;
+	struct kevent_storage *st;
+	int err;
+
+	t = kmalloc(sizeof(struct timer_list) + sizeof(struct kevent_storage), GFP_KERNEL);
+	if (!t)
+		return -ENOMEM;
+
+	init_timer(t);
+	t->function = kevent_timer_func;
+	t->expires = jiffies + msecs_to_jiffies(k->event.id.raw[0]);
+	t->data = (unsigned long)k;
+
+	st = (struct kevent_storage *)(t+1);
+	err = kevent_storage_init(k->event.type, KEVENT_MASK_EMPTY, t, st);
+	if (err)
+		goto err_out_free;
+
+	err = kevent_storage_enqueue(st, k);
+	if (err < 0)
+		goto err_out_st_fini;
+	
+	add_timer(t);
+
+	return 0;
+
+err_out_st_fini:	
+	kevent_storage_fini(st);
+err_out_free:
+	kfree(t);
+
+	return err;
+}
+
+static int kevent_timer_dequeue(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct timer_list *t = st->origin;
+
+	if (!t)
+		return -ENODEV;
+
+	del_timer_sync(t);
+	
+	kevent_storage_dequeue(st, k);
+	
+	kfree(t);
+
+	return 0;
+}
+
+static int kevent_timer_callback(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct timer_list *t = st->origin;
+
+	if (!t)
+		return -ENODEV;
+	
+	k->event.ret_data[0] = (__u32)jiffies;
+	return 1;
+}
+
+int kevent_init_timer(struct kevent *k)
+{
+	k->enqueue = &kevent_timer_enqueue;
+	k->dequeue = &kevent_timer_dequeue;
+	k->callback = &kevent_timer_callback;
+	return 0;
+}
diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
new file mode 100644
index 0000000..6a0eedd
--- /dev/null
+++ b/kernel/kevent/kevent_user.c
@@ -0,0 +1,655 @@
+/*
+ * 	kevent_user.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/poll.h>
+#include <linux/kevent.h>
+#include <linux/jhash.h>
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+
+static struct class *kevent_user_class;
+static char kevent_name[] = "kevent";
+static int kevent_user_major;
+
+static int kevent_user_open(struct inode *, struct file *);
+static int kevent_user_release(struct inode *, struct file *);
+static int kevent_user_ioctl(struct inode *, struct file *, unsigned int, unsigned long);
+static unsigned int kevent_user_poll(struct file *, struct poll_table_struct *);
+
+static struct file_operations kevent_user_fops = {
+	.open		= kevent_user_open,
+	.release	= kevent_user_release,
+	.ioctl		= kevent_user_ioctl,
+	.poll		= kevent_user_poll,
+	.owner		= THIS_MODULE,
+};
+
+static struct super_block *kevent_get_sb(struct file_system_type *fs_type, int flags,
+	       const char *dev_name, void *data)
+{
+	return get_sb_pseudo(fs_type, kevent_name, NULL, 0xabcdef);	/* Arbitrary magic number. */
+}
+
+static struct file_system_type kevent_fs_type = {
+	.name		= kevent_name,
+	.get_sb		= kevent_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static struct vfsmount *kevent_mnt;
+
+static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct kevent_user *u = file->private_data;
+	unsigned int mask;
+	
+	poll_wait(file, &u->wait, wait);
+	mask = 0;
+
+	if (u->ready_num)
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+static struct kevent_user *kevent_user_alloc(void)
+{
+	struct kevent_user *u;
+	int i;
+
+	u = kzalloc(sizeof(struct kevent_user), GFP_KERNEL);
+	if (!u)
+		return NULL;
+
+	INIT_LIST_HEAD(&u->ready_list);
+	spin_lock_init(&u->ready_lock);
+	u->ready_num = 0;
+	
+	for (i=0; i<KEVENT_HASH_MASK+1; ++i) {
+		INIT_LIST_HEAD(&u->kqueue[i].kevent_list);
+		spin_lock_init(&u->kqueue[i].kevent_lock);
+	}
+	u->kevent_num = 0;
+	
+	init_MUTEX(&u->ctl_mutex);
+	init_MUTEX(&u->wait_mutex);
+	init_waitqueue_head(&u->wait);
+	u->max_ready_num = 0;
+
+	atomic_set(&u->refcnt, 1);
+
+	return u;
+}
+
+static int kevent_user_open(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = kevent_user_alloc();
+	
+	if (!u)
+		return -ENOMEM;
+
+	file->private_data = u;
+	
+	return 0;
+}
+
+static inline void kevent_user_get(struct kevent_user *u)
+{
+	atomic_inc(&u->refcnt);
+}
+
+static inline void kevent_user_put(struct kevent_user *u)
+{
+	if (atomic_dec_and_test(&u->refcnt))
+		kfree(u);
+}
+
+#if 0
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	unsigned int h = (uk->user[0] ^ uk->user[1]) ^ (uk->id.raw[0] ^ uk->id.raw[1]);
+	
+	h = (((h >> 16) & 0xffff) ^ (h & 0xffff)) & 0xffff;
+	h = (((h >> 8) & 0xff) ^ (h & 0xff)) & KEVENT_HASH_MASK;
+
+	return h;
+}
+#else
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	return jhash_1word(uk->id.raw[0], uk->id.raw[1]) & KEVENT_HASH_MASK;
+}
+#endif
+
+/*
+ * Remove the kevent from the user's list of all events,
+ * dequeue it from its storage and drop the user's reference counter.
+ * The kevent ceases to exist at that point, which is why it is freed here.
+ */
+static void kevent_finish_user(struct kevent *k, int lock)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+	
+	if (lock) {
+		unsigned int hash = kevent_user_hash(&k->event);
+		struct kevent_list *l = &u->kqueue[hash];
+		
+		spin_lock_irqsave(&l->kevent_lock, flags);
+		list_del(&k->kevent_entry);
+		u->kevent_num--;
+		spin_unlock_irqrestore(&l->kevent_lock, flags);
+	} else {
+		list_del(&k->kevent_entry);
+		u->kevent_num--;
+	}
+	kevent_dequeue(k);
+	kevent_user_put(u);
+	kevent_free(k);
+}
+
+/*
+ * Dequeue one entry from user's ready queue.
+ */
+static struct kevent *__kqueue_dequeue_one_ready(struct list_head *q, unsigned int *qlen)
+{
+	struct kevent *k = NULL;
+	unsigned int len = *qlen;
+	
+	if (len) {
+		k = list_entry(q->next, struct kevent, ready_entry);
+		list_del(&k->ready_entry);
+		*qlen = len - 1;
+	}
+	
+	return k;
+}
+
+static struct kevent *kqueue_dequeue_ready(struct kevent_user *u)
+{
+	unsigned long flags;
+	struct kevent *k;
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	k = __kqueue_dequeue_one_ready(&u->ready_list, &u->ready_num);
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+
+	return k;
+}
+
+struct kevent *kevent_search(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	struct kevent_list *l = &u->kqueue[hash];
+	int found = 0;
+	unsigned long flags;
+	
+	spin_lock_irqsave(&l->kevent_lock, flags);
+	list_for_each_entry(k, &l->kevent_list, kevent_entry) {
+		spin_lock(&k->lock);
+		if (k->event.user[0] == uk->user[0] && k->event.user[1] == uk->user[1] &&
+				k->event.id.raw[0] == uk->id.raw[0] && k->event.id.raw[1] == uk->id.raw[1]) {
+			found = 1;
+			spin_unlock(&k->lock);
+			break;
+		}
+		spin_unlock(&k->lock);
+	}
+	spin_unlock_irqrestore(&l->kevent_lock, flags);
+
+	return found ? k : NULL;
+}
+
+/*
+ * No new entry can be added or removed from any list at this point.
+ * It is not permitted to call ->ioctl() and ->release() in parallel.
+ */
+static int kevent_user_release(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = file->private_data;
+	struct kevent *k, *n;
+	int i;
+
+	for (i=0; i<KEVENT_HASH_MASK+1; ++i) {
+		struct kevent_list *l = &u->kqueue[i];
+		
+		list_for_each_entry_safe(k, n, &l->kevent_list, kevent_entry)
+			kevent_finish_user(k, 1);
+	}
+
+	kevent_user_put(u);
+	file->private_data = NULL;
+
+	return 0;
+}
+
+static int kevent_user_ctl_modify(struct kevent_user *u, struct kevent_user_control *ctl, void __user *arg)
+{
+	int err = 0, i;
+	struct kevent *k;
+	unsigned long flags;
+	struct ukevent uk;
+
+	if (down_interruptible(&u->ctl_mutex))
+		return -ERESTARTSYS;
+
+	for (i=0; i<ctl->num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		k = kevent_search(&uk, u);
+		if (k) {
+			spin_lock_irqsave(&k->lock, flags);
+			k->event.event = uk.event;
+			k->event.req_flags = uk.req_flags;
+			k->event.ret_flags = 0;
+			spin_unlock_irqrestore(&k->lock, flags);
+			kevent_requeue(k);
+		} else
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+
+	up(&u->ctl_mutex);
+
+	return err;
+}
+
+static int kevent_user_ctl_remove(struct kevent_user *u, struct kevent_user_control *ctl, void __user *arg)
+{
+	int err = 0, i;
+	struct kevent *k;
+	struct ukevent uk;
+
+	if (down_interruptible(&u->ctl_mutex))
+		return -ERESTARTSYS;
+
+	for (i=0; i<ctl->num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		k = kevent_search(&uk, u);
+		if (k) {
+			kevent_finish_user(k, 1);
+		} else
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+
+	up(&u->ctl_mutex);
+
+	return err;
+}
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err;
+
+	k = kevent_alloc(GFP_KERNEL);
+	if (!k) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	memcpy(&k->event, uk, sizeof(struct ukevent));
+
+	k->event.ret_flags = 0;
+	k->event.ret_data[0] = 0;
+	k->event.ret_data[1] = 0;
+
+	err = kevent_init(k);
+	if (err) {
+		kevent_free(k);
+		goto err_out_exit;
+	}
+	k->user = u;
+
+	err = kevent_enqueue(k);
+	if (err) {
+		if (err < 0)
+			uk->ret_flags |= KEVENT_RET_BROKEN;
+		uk->ret_flags |= KEVENT_RET_DONE;
+		kevent_free(k);
+	} else {
+		unsigned long flags;
+		unsigned int hash = kevent_user_hash(&k->event);
+		struct kevent_list *l = &u->kqueue[hash];
+		
+		spin_lock_irqsave(&l->kevent_lock, flags);
+		list_add_tail(&k->kevent_entry, &l->kevent_list);
+		u->kevent_num++;
+		kevent_user_get(u);
+		spin_unlock_irqrestore(&l->kevent_lock, flags);
+	}
+
+err_out_exit:
+	return err;
+}
+
+/*
+ * Copy all ukevents from userspace, allocate a kevent for each one and add
+ * them to the appropriate kevent_storages: sockets, inodes and so on.
+ * If copying to or from userspace fails, a negative error is returned.
+ * Otherwise the number of immediately finished events, either completed or
+ * failed, is returned and ctl->num is set to that count; the array of
+ * finished events (struct ukevent) is placed behind the kevent_user_control
+ * structure. The user must walk that array and check the ret_flags field of
+ * each ukevent to determine whether it fired or failed.
+ */
+static int kevent_user_ctl_add(struct kevent_user *u, struct kevent_user_control *ctl, void __user *arg)
+{
+	int err = 0, cerr = 0, num = 0, knum = 0, i;
+	void __user *orig, *ctl_addr;
+	struct ukevent uk;
+
+	if (down_interruptible(&u->ctl_mutex))
+		return -ERESTARTSYS;
+
+	orig = arg;
+	ctl_addr = arg - sizeof(struct kevent_user_control);
+#if 1
+	err = -ENFILE;
+	if (u->kevent_num + ctl->num >= 1024)	/* arbitrary per-queue limit */
+		goto err_out_remove;
+#endif
+	for (i=0; i<ctl->num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EINVAL;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		err = kevent_user_add_ukevent(&uk, u);
+		if (err) {
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent)))
+				cerr = -EINVAL;
+			orig += sizeof(struct ukevent);
+			num++;
+		} else
+			knum++;
+	}
+
+	if (cerr < 0)
+		goto err_out_remove;
+
+	ctl->num = num;
+	if (copy_to_user(ctl_addr, ctl, sizeof(struct kevent_user_control)))
+		cerr = -EINVAL;
+
+	if (cerr)
+		err = cerr;
+	if (!err)
+		err = num;
+
+err_out_remove:
+	up(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Waits until at least ctl->num events are ready or the timeout expires, and
+ * returns the number of ready events copied to userspace (at most ctl->num).
+ */
+static int kevent_user_wait(struct file *file, struct kevent_user *u, struct kevent_user_control *ctl, void __user *arg)
+{
+	struct kevent *k;
+	int cerr = 0, num = 0;
+	void __user *ptr = arg + sizeof(struct kevent_user_control);
+
+	if (down_interruptible(&u->wait_mutex))
+		return -ERESTARTSYS;
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		if (ctl->timeout)
+			wait_event_interruptible_timeout(u->wait, 
+					u->ready_num >= ctl->num, msecs_to_jiffies(ctl->timeout));
+		else
+			wait_event_interruptible_timeout(u->wait, 
+					u->ready_num > 0, msecs_to_jiffies(1000));
+	}
+	while (num < ctl->num && (k = kqueue_dequeue_ready(u))) {
+		if (copy_to_user(ptr + num*sizeof(struct ukevent), &k->event, sizeof(struct ukevent)))
+			cerr = -EINVAL;
+		/*
+		 * A one-shot kevent has already been removed from its
+		 * origin's queue, so it can safely be freed here.
+		 */
+		if (k->event.req_flags & KEVENT_REQ_ONESHOT)
+			kevent_finish_user(k, 1);
+		++num;
+	}
+
+	ctl->num = num;
+	if (copy_to_user(arg, ctl, sizeof(struct kevent_user_control)))
+		cerr = -EINVAL;
+
+	up(&u->wait_mutex);
+
+	return cerr ? cerr : num;
+}
+
+static int kevent_ctl_init(void)
+{
+	struct kevent_user *u;
+	struct file *file;	
+	int fd, ret;
+
+	fd = get_unused_fd();
+	if (fd < 0)
+		return fd;
+
+	file = get_empty_filp();
+	if (!file) {
+		ret = -ENFILE;
+		goto out_put_fd;
+	}
+
+	u = kevent_user_alloc();
+	if (unlikely(!u)) {
+		ret = -ENOMEM;
+		goto out_put_file;
+	}
+
+	file->f_op = &kevent_user_fops;
+	file->f_vfsmnt = mntget(kevent_mnt);
+	file->f_dentry = dget(kevent_mnt->mnt_root);
+	file->f_mapping = file->f_dentry->d_inode->i_mapping;
+	file->f_mode = FMODE_READ;
+	file->f_flags = O_RDONLY;
+	file->private_data = u;
+	
+	fd_install(fd, file);
+
+	return fd;
+
+out_put_file:
+	put_filp(file);
+out_put_fd:
+	put_unused_fd(fd);
+	return ret;
+}
+
+static int kevent_ctl_process(struct file *file, struct kevent_user_control *ctl, void __user *arg)
+{
+	int err;
+	struct kevent_user *u = file->private_data;
+
+	if (!u)
+		return -EINVAL;
+
+	switch (ctl->cmd) {
+		case KEVENT_CTL_ADD:
+			err = kevent_user_ctl_add(u, ctl, arg+sizeof(struct kevent_user_control));
+			break;
+		case KEVENT_CTL_REMOVE:
+			err = kevent_user_ctl_remove(u, ctl, arg+sizeof(struct kevent_user_control));
+			break;
+		case KEVENT_CTL_MODIFY:
+			err = kevent_user_ctl_modify(u, ctl, arg+sizeof(struct kevent_user_control));
+			break;
+		case KEVENT_CTL_WAIT:
+			err = kevent_user_wait(file, u, ctl, arg);
+			break;
+		case KEVENT_CTL_INIT:
+			err = kevent_ctl_init();
+			break;
+		default:
+			err = -EINVAL;
+	}
+
+	return err;
+}
+
+asmlinkage long sys_kevent_ctl(int fd, void __user *arg)
+{
+	int err, fput_needed;
+	struct kevent_user_control ctl;
+	struct file *file;
+
+	if (copy_from_user(&ctl, arg, sizeof(struct kevent_user_control)))
+		return -EINVAL;
+
+	if (ctl.cmd == KEVENT_CTL_INIT)
+		return kevent_ctl_init();
+
+	file = fget_light(fd, &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	err = kevent_ctl_process(file, &ctl, arg);
+
+	fput_light(file, fput_needed);
+	return err;
+}
+
+static int kevent_user_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int err = -ENODEV;
+	struct kevent_user_control ctl;
+	struct kevent_user *u = file->private_data;
+	void __user *ptr = (void __user *)arg;
+
+	if (copy_from_user(&ctl, ptr, sizeof(struct kevent_user_control)))
+		return -EINVAL;
+
+	switch (cmd) {
+		case KEVENT_USER_CTL:
+			err = kevent_ctl_process(file, &ctl, ptr);
+			break;
+		case KEVENT_USER_WAIT:
+			err = kevent_user_wait(file, u, &ctl, ptr);
+			break;
+		default:
+			break;
+	}
+
+	return err;
+}
+
+static int __devinit kevent_user_init(void)
+{
+	struct class_device *dev;
+	int err = 0;
+	
+	err = register_filesystem(&kevent_fs_type);
+	if (err)
+		panic("%s: failed to register filesystem: err=%d.\n", kevent_name, err);
+
+	kevent_mnt = kern_mount(&kevent_fs_type);
+	if (IS_ERR(kevent_mnt))
+		panic("%s: failed to mount silesystem: err=%ld.\n", kevent_name, PTR_ERR(kevent_mnt));
+	
+	kevent_user_major = register_chrdev(0, kevent_name, &kevent_user_fops);
+	if (kevent_user_major < 0) {
+		printk(KERN_ERR "Failed to register \"%s\" char device: err=%d.\n", kevent_name,
kevent_user_major);
+		return -ENODEV;
+	}
+
+	kevent_user_class = class_create(THIS_MODULE, "kevent");
+	if (IS_ERR(kevent_user_class)) {
+		printk(KERN_ERR "Failed to register \"%s\" class: err=%ld.\n", kevent_name,
PTR_ERR(kevent_user_class));
+		err = PTR_ERR(kevent_user_class);
+		goto err_out_unregister;
+	}
+
+	dev = class_device_create(kevent_user_class, NULL, MKDEV(kevent_user_major, 0), NULL, kevent_name);
+	if (IS_ERR(dev)) {
+		printk(KERN_ERR "Failed to create %d.%d class device in \"%s\" class: err=%ld.\n", 
+				kevent_user_major, 0, kevent_name, PTR_ERR(dev));
+		err = PTR_ERR(dev);
+		goto err_out_class_destroy;
+	}
+
+	printk("KEVENT subsystem: chardev helper: major=%d.\n", kevent_user_major);
+
+	return 0;
+
+err_out_class_destroy:
+	class_destroy(kevent_user_class);
+err_out_unregister:
+	unregister_chrdev(kevent_user_major, kevent_name);
+
+	return err;
+}
+
+static void __devexit kevent_user_fini(void)
+{
+	class_device_destroy(kevent_user_class, MKDEV(kevent_user_major, 0));
+	class_destroy(kevent_user_class);
+	unregister_chrdev(kevent_user_major, kevent_name);
+	mntput(kevent_mnt);
+	unregister_filesystem(&kevent_fs_type);
+}
+
+module_init(kevent_user_init);
+module_exit(kevent_user_fini);
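
Taken together, kevent_ctl_init(), kevent_ctl_process() and kevent_user_wait()
define the userspace calling convention: the argument always begins with a
struct kevent_user_control, and for the ADD/REMOVE/MODIFY commands an array of
ctl.num ukevents follows it directly; WAIT copies ready events back into that
same trailing area. A minimal caller might look like the sketch below, under
the assumptions that __NR_kevent_ctl is whatever syscall number the target
architecture assigned and that both structures are exported to userspace
unchanged.

#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

struct kevent_request {
	struct kevent_user_control ctl;
	struct ukevent uk[1];		/* trailing ukevent array, here of size 1 */
};

static int kevent_example(const struct ukevent *ev)
{
	struct kevent_user_control init_ctl;
	struct kevent_request req;
	int kfd, err;

	/* KEVENT_CTL_INIT ignores the fd argument and returns a new queue fd. */
	memset(&init_ctl, 0, sizeof(init_ctl));
	init_ctl.cmd = KEVENT_CTL_INIT;
	kfd = syscall(__NR_kevent_ctl, -1, &init_ctl);
	if (kfd < 0)
		return kfd;

	/* Queue one event: control header immediately followed by the ukevents. */
	memset(&req, 0, sizeof(req));
	req.ctl.cmd = KEVENT_CTL_ADD;
	req.ctl.num = 1;
	req.uk[0] = *ev;
	err = syscall(__NR_kevent_ctl, kfd, &req);
	if (err < 0)
		return err;

	/* Wait; on return req.ctl.num holds the number of ready events copied
	 * back behind the control structure. */
	req.ctl.cmd = KEVENT_CTL_WAIT;
	req.ctl.num = 1;
	req.ctl.timeout = 2000;		/* msecs, per kevent_user_wait() */
	err = syscall(__NR_kevent_ctl, kfd, &req);

	return err < 0 ? err : (int)req.uk[0].ret_flags;
}
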
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1ab2370..2053809 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -90,3 +90,8 @@ cond_syscall(sys_pciconfig_iobase);
 cond_syscall(sys32_ipc);
 cond_syscall(sys32_sysctl);
 cond_syscall(ppc_rtas);
+
+cond_syscall(sys_aio_recv);
+cond_syscall(sys_aio_send);
+cond_syscall(sys_aio_sendfile);
+cond_syscall(sys_kevent_ctl);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 1bcfef5..b2f19a7 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -200,6 +200,60 @@ void skb_free_datagram(struct sock *sk, 
 }
 
 /**
+ *	skb_copy_datagram - Copy a datagram.
+ *	@skb: buffer to copy
+ *	@offset: offset in the buffer to start copying from
+ *	@to: destination buffer to copy to
+ *	@len: amount of data to copy from the buffer
+ */
+int skb_copy_datagram(const struct sk_buff *skb, int offset,
+			    void *to, int len)
+{
+	int i, fraglen, end = 0;
+	struct sk_buff *next = skb_shinfo(skb)->frag_list;
+
+	if (!len)
+		return 0;
+
+next_skb:
+	fraglen = skb_headlen(skb);
+	i = -1;
+
+	while (1) {
+		int start = end;
+
+		if ((end += fraglen) > offset) {
+			int copy = end - offset, o = offset - start;
+
+			if (copy > len)
+				copy = len;
+			if (i == -1)
+				memcpy(to, skb->data + o, copy);
+			else {
+				skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+				struct page *page = frag->page;
+				void *p = kmap(page) + frag->page_offset + o;
+				memcpy(to, p, copy);
+				kunmap(page);
+			}
+			if (!(len -= copy))
+				return 0;
+			offset += copy;
+		}
+		if (++i >= skb_shinfo(skb)->nr_frags)
+			break;
+		fraglen = skb_shinfo(skb)->frags[i].size;
+	}
+	if (next) {
+		skb = next;
+		BUG_ON(skb_shinfo(skb)->frag_list);
+		next = skb->next;
+		goto next_skb;
+	}
+	return -EFAULT;
+}
+
+/**
  *	skb_copy_datagram_iovec - Copy a datagram to an iovec.
  *	@skb: buffer to copy
  *	@offset: offset in the buffer to start copying from
@@ -467,6 +521,7 @@ unsigned int datagram_poll(struct file *
 
 EXPORT_SYMBOL(datagram_poll);
 EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
+EXPORT_SYMBOL(skb_copy_datagram);
 EXPORT_SYMBOL(skb_copy_datagram_iovec);
 EXPORT_SYMBOL(skb_free_datagram);
 EXPORT_SYMBOL(skb_recv_datagram);
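
For reference, the new skb_copy_datagram() is a standard running-window walk:
it tracks the [start, end) byte range covered by each piece of the skb (the
linear head, then each page fragment, then the frag_list) and copies only the
part that overlaps the caller's offset. Stripped of the skb and kmap()
details, the technique reduces to the following standalone illustration,
which is not part of the patch:

#include <string.h>
#include <stddef.h>

struct chunk {
	const void *data;
	size_t size;
};

/* Copy len bytes starting at offset from a buffer made up of nr chunks. */
static int copy_from_chunks(const struct chunk *c, int nr,
			    size_t offset, char *to, size_t len)
{
	size_t end = 0;
	int i;

	for (i = 0; i < nr && len; i++) {
		size_t start = end;

		end += c[i].size;
		if (end > offset) {
			size_t o = offset - start;	/* position inside this chunk */
			size_t copy = end - offset;	/* bytes available from there */

			if (copy > len)
				copy = len;
			memcpy(to, (const char *)c[i].data + o, copy);
			to += copy;
			offset += copy;
			len -= copy;
		}
	}
	return len ? -1 : 0;	/* ran out of chunks: the -EFAULT case above */
}
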
diff --git a/net/core/sock.c b/net/core/sock.c
index 13cc3be..3d32c5c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -455,6 +455,16 @@ set_rcvbuf:
 			spin_unlock_bh(&sk->sk_lock.slock);
 			ret = -ENONET;
 			break;
+#ifdef CONFIG_KEVENT_SOCKET
+		case SO_ASYNC_SOCK:
+			spin_lock_bh(&sk->sk_lock.slock);
+			if (valbool)
+				set_bit(KEVENT_SOCKET_FLAGS_ASYNC, &sk->sk_kevent_flags);
+			else
+				clear_bit(KEVENT_SOCKET_FLAGS_ASYNC, &sk->sk_kevent_flags);
+			spin_unlock_bh(&sk->sk_lock.slock);
+			break;
+#endif
 
 		/* We implement the SO_SNDLOWAT etc to
 		   not be settable (1003.1g 5.3) */
@@ -1201,6 +1211,7 @@ static void sock_def_wakeup(struct sock 
 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
 		wake_up_interruptible_all(sk->sk_sleep);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 }
 
 static void sock_def_error_report(struct sock *sk)
@@ -1210,6 +1221,7 @@ static void sock_def_error_report(struct
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,0,POLL_ERR); 
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 }
 
 static void sock_def_readable(struct sock *sk, int len)
@@ -1219,6 +1231,7 @@ static void sock_def_readable(struct soc
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,1,POLL_IN);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 }
 
 static void sock_def_write_space(struct sock *sk)
@@ -1235,6 +1248,8 @@ static void sock_def_write_space(struct 
 		/* Should agree with poll, otherwise some programs break */
 		if (sock_writeable(sk))
 			sk_wake_async(sk, 2, POLL_OUT);
+	
+		kevent_socket_notify(sk, KEVENT_SOCKET_SEND);
 	}
 
 	read_unlock(&sk->sk_callback_lock);
diff --git a/net/core/stream.c b/net/core/stream.c
index 15bfd03..745a07f 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -36,6 +36,7 @@ void sk_stream_write_space(struct sock *
 			wake_up_interruptible(sk->sk_sleep);
 		if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
 			sock_wake_async(sock, 2, POLL_OUT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_SEND);
 	}
 }
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ef98b14..c2ed578 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -206,6 +206,7 @@
  *					lingertime == 0 (RFC 793 ABORT Call)
  *	Hirokazu Takahashi	:	Use copy_from_user() instead of
  *					csum_and_copy_from_user() if possible.
+ *	Evgeniy Polyakov	:	Network asynchronous IO.
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -1090,6 +1091,275 @@ int tcp_read_sock(struct sock *sk, read_
 }
 
 /*
+ * Must be called with the socket locked.
+ */
+int tcp_async_send(struct sock *sk, struct page **pages, unsigned int poffset, size_t len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mss_now, size_goal;
+	int err = -EAGAIN;
+	ssize_t copied;
+
+	/* Wait for a connection to finish. */
+	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+		goto out_err;
+
+	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+	mss_now = tcp_current_mss(sk, 1);
+	size_goal = tp->xmit_size_goal;
+	copied = 0;
+
+	err = -EPIPE;
+	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || sock_flag(sk, SOCK_DONE) ||
+			(sk->sk_state == TCP_CLOSE) || (atomic_read(&sk->sk_refcnt) == 1))
+		goto do_error;
+
+	while (len > 0) {
+		struct sk_buff *skb = sk->sk_write_queue.prev;
+		struct page *page = pages[poffset / PAGE_SIZE];
+		int copy, i, can_coalesce;
+		int offset = poffset % PAGE_SIZE;
+		int size = min_t(size_t, len, PAGE_SIZE - offset);
+
+		if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
+new_segment:
+			if (!sk_stream_memory_free(sk))
+				goto wait_for_sndbuf;
+
+			skb = sk_stream_alloc_pskb(sk, 0, 0,
+						   sk->sk_allocation);
+			if (!skb)
+				goto wait_for_memory;
+
+			skb_entail(sk, tp, skb);
+			copy = size_goal;
+		}
+
+		if (copy > size)
+			copy = size;
+
+		i = skb_shinfo(skb)->nr_frags;
+		can_coalesce = skb_can_coalesce(skb, i, page, offset);
+		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+			tcp_mark_push(tp, skb);
+			goto new_segment;
+		}
+		if (!sk_stream_wmem_schedule(sk, copy))
+			goto wait_for_memory;
+		
+		if (can_coalesce) {
+			skb_shinfo(skb)->frags[i - 1].size += copy;
+		} else {
+			get_page(page);
+			skb_fill_page_desc(skb, i, page, offset, copy);
+		}
+
+		skb->len += copy;
+		skb->data_len += copy;
+		skb->truesize += copy;
+		sk->sk_wmem_queued += copy;
+		sk->sk_forward_alloc -= copy;
+		skb->ip_summed = CHECKSUM_HW;
+		tp->write_seq += copy;
+		TCP_SKB_CB(skb)->end_seq += copy;
+		skb_shinfo(skb)->tso_segs = 0;
+
+		if (!copied)
+			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+
+		copied += copy;
+		poffset += copy;
+		if (!(len -= copy))
+			goto out;
+
+		if (skb->len < mss_now)
+			continue;
+
+		if (forced_push(tp)) {
+			tcp_mark_push(tp, skb);
+			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
+		} else if (skb == sk->sk_send_head)
+			tcp_push_one(sk, mss_now);
+		continue;
+
+wait_for_sndbuf:
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+		if (copied)
+			tcp_push(sk, tp, 0, mss_now, TCP_NAGLE_PUSH);
+
+		err = -EAGAIN;
+		goto do_error;
+	}
+
+out:
+	if (copied)
+		tcp_push(sk, tp, 0, mss_now, tp->nonagle);
+	return copied;
+
+do_error:
+	if (copied)
+		goto out;
+out_err:
+	return sk_stream_error(sk, 0, err);
+}
+
+/*
+ * Must be called with the socket locked.
+ */
+int tcp_async_recv(struct sock *sk, void *dst, size_t len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int copied = 0;
+	u32 *seq;
+	unsigned long used;
+	int err;
+	int target;		/* Read at least this many bytes */
+
+	TCP_CHECK_TIMER(sk);
+
+	err = -ENOTCONN;
+	if (sk->sk_state == TCP_LISTEN)
+		goto out;
+
+	seq = &tp->copied_seq;
+
+	target = sock_rcvlowat(sk, 0, len);
+
+	do {
+		struct sk_buff *skb;
+		u32 offset;
+
+		/* Are we at urgent data? Stop if we have already read anything. */
+		if (tp->urg_data && tp->urg_seq == *seq) {
+			if (copied)
+				break;
+		}
+
+		/* Next get a buffer. */
+
+		skb = skb_peek(&sk->sk_receive_queue);
+		do {
+			if (!skb)
+				break;
+
+			/* Now that we have two receive queues this
+			 * shouldn't happen.
+			 */
+			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
+				printk(KERN_INFO "async_recv bug: copied %X "
+				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
+				break;
+			}
+			offset = *seq - TCP_SKB_CB(skb)->seq;
+			if (skb->h.th->syn)
+				offset--;
+			if (offset < skb->len)
+				goto found_ok_skb;
+			if (skb->h.th->fin)
+				goto found_fin_ok;
+			skb = skb->next;
+		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
+
+		if (copied)
+			break;
+
+		if (sock_flag(sk, SOCK_DONE))
+			break;
+
+		if (sk->sk_err) {
+			copied = sock_error(sk);
+			break;
+		}
+
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			break;
+
+		if (sk->sk_state == TCP_CLOSE) {
+			if (!sock_flag(sk, SOCK_DONE)) {
+				/* This occurs when user tries to read
+				 * from never connected socket.
+				 */
+				copied = -ENOTCONN;
+				break;
+			}
+			break;
+		}
+
+		copied = -EAGAIN;
+		break;
+
+	found_ok_skb:
+		/* Ok so how much can we use? */
+		used = skb->len - offset;
+		if (len < used)
+			used = len;
+
+		/* Do we have urgent data here? */
+		if (tp->urg_data) {
+			u32 urg_offset = tp->urg_seq - *seq;
+			if (urg_offset < used) {
+				if (!urg_offset) {
+					if (!sock_flag(sk, SOCK_URGINLINE)) {
+						++*seq;
+						offset++;
+						used--;
+						if (!used)
+							goto skip_copy;
+					}
+				} else
+					used = urg_offset;
+			}
+		}
+
+		err = skb_copy_datagram(skb, offset, dst, used);
+		if (err) {
+			/* Exception. Bailout! */
+			if (!copied)
+				copied = -EFAULT;
+			break;
+		}
+
+		*seq += used;
+		copied += used;
+		len -= used;
+		dst += used;
+
+		tcp_rcv_space_adjust(sk);
+
+skip_copy:
+		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
+			tp->urg_data = 0;
+			tcp_fast_path_check(sk, tp);
+		}
+		if (used + offset < skb->len)
+			continue;
+
+		if (skb->h.th->fin)
+			goto found_fin_ok;
+		sk_eat_skb(sk, skb);
+		continue;
+
+	found_fin_ok:
+		/* Process the FIN. */
+		++*seq;
+		sk_eat_skb(sk, skb);
+		break;
+	} while (len > 0);
+
+	/* Clean up data we have read: This will do ACK frames. */
+	cleanup_rbuf(sk, copied);
+
+	TCP_CHECK_TIMER(sk);
+	return copied;
+
+out:
+	TCP_CHECK_TIMER(sk);
+	return err;
+}
+
+/*
  *	This routine copies from a sock struct into the user buffer.
  *
  *	Technical note: in 2.3 we work on _locked_ socket, so that
@@ -2136,6 +2406,8 @@ EXPORT_SYMBOL(tcp_getsockopt);
 EXPORT_SYMBOL(tcp_ioctl);
 EXPORT_SYMBOL(tcp_poll);
 EXPORT_SYMBOL(tcp_read_sock);
+EXPORT_SYMBOL(tcp_async_recv);
+EXPORT_SYMBOL(tcp_async_send);
 EXPORT_SYMBOL(tcp_recvmsg);
 EXPORT_SYMBOL(tcp_sendmsg);
 EXPORT_SYMBOL(tcp_sendpage);
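
Note that, unlike tcp_recvmsg() and tcp_sendmsg(), neither helper waits for
incoming data or for socket memory: both return whatever they managed to
transfer, or -EAGAIN, leaving it to the asynchronous engine to retry when the
next KEVENT_SOCKET_RECV/KEVENT_SOCKET_SEND notification arrives. The implied
calling convention is roughly the following; this is a hypothetical sketch,
the real dispatcher lives in the kevent/naio code not shown in this excerpt.

/* Caller holds the socket lock, as both helpers require. */
static int naio_try_recv(struct sock *sk, char *buf, size_t len, size_t *done)
{
	int ret;

	/* ->async_recv is the proto hook this patch adds (tcp_async_recv). */
	ret = sk->sk_prot->async_recv(sk, buf + *done, len - *done);
	if (ret < 0)
		return ret;	/* -EAGAIN: rearm and retry on the next event;
				 * any other error: fail the request */
	*done += ret;
	return (*done == len) ? 0 : -EAGAIN;	/* 0: complete, mark kevent ready */
}
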
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bf2e230..6d652fa 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3016,6 +3016,7 @@ static void tcp_ofo_queue(struct sock *s
 
 		__skb_unlink(skb, &tp->out_of_order_queue);
 		__skb_queue_tail(&sk->sk_receive_queue, skb);
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if(skb->h.th->fin)
 			tcp_fin(skb, sk, skb->h.th);
@@ -3079,8 +3080,8 @@ queue_and_out:
 				    !sk_stream_rmem_schedule(sk, skb))
 					goto drop;
 			}
-			sk_stream_set_owner_r(skb, sk);
 			__skb_queue_tail(&sk->sk_receive_queue, skb);
+			sk_stream_set_owner_r(skb, sk);
 		}
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if(skb->len)
@@ -3819,7 +3820,7 @@ int tcp_rcv_established(struct sock *sk,
 			if (tp->ucopy.task == current &&
 			    tp->copied_seq == tp->rcv_nxt &&
 			    len - tcp_header_len <= tp->ucopy.len &&
-			    sock_owned_by_user(sk)) {
+			    sock_owned_by_user(sk) && !sock_async(sk)) {
 				__set_current_state(TASK_RUNNING);
 
 				if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4d5021e..be0d3dc 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -62,6 +62,7 @@
 #include <linux/jhash.h>
 #include <linux/init.h>
 #include <linux/times.h>
+#include <linux/kevent.h>
 
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
@@ -1007,6 +1008,7 @@ int tcp_v4_conn_request(struct sock *sk,
 	   	reqsk_free(req);
 	} else {
 		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_ACCEPT);
 	}
 	return 0;
 
@@ -1244,14 +1246,20 @@ process:
 
 	skb->dev = NULL;
 
-	bh_lock_sock(sk);
 	ret = 0;
-	if (!sock_owned_by_user(sk)) {
-		if (!tcp_prequeue(sk, skb))
-			ret = tcp_v4_do_rcv(sk, skb);
-	} else
-		sk_add_backlog(sk, skb);
-	bh_unlock_sock(sk);
+	if (sock_async(sk)) {
+		spin_lock_bh(&sk->sk_lock.slock);
+		ret = tcp_v4_do_rcv(sk, skb);
+		spin_unlock_bh(&sk->sk_lock.slock);
+	} else {
+		bh_lock_sock(sk);
+		if (!sock_owned_by_user(sk)) {
+			if (!tcp_prequeue(sk, skb))
+				ret = tcp_v4_do_rcv(sk, skb);
+		} else
+			sk_add_backlog(sk, skb);
+		bh_unlock_sock(sk);
+	}
 
 	sock_put(sk);
 
@@ -1975,6 +1983,8 @@ struct proto tcp_prot = {
 	.getsockopt		= tcp_getsockopt,
 	.sendmsg		= tcp_sendmsg,
 	.recvmsg		= tcp_recvmsg,
+	.async_recv		= tcp_async_recv,
+	.async_send		= tcp_async_send,
 	.backlog_rcv		= tcp_v4_do_rcv,
 	.hash			= tcp_v4_hash,
 	.unhash			= tcp_unhash,


-- 
	Evgeniy Polyakov