TCP Offload (TOE) - Chelsio
From: | Scott Bardone <sbardone@chelsio.com> | |
To: | netdev@vger.kernel.org | |
Subject: | [PATCH] TCP Offload (TOE) - Chelsio | |
Date: | Thu, 11 Aug 2005 23:16:14 -0700 | |
Cc: | opentoe <opentoe@chelsio.com> |
"OPEN TOE" submission from Chelsio Communications. The following items have been addressed: - cleaned up indentation. - cleaned up comments. - cleaned up c-styles. - using EXPORT_SYMBOL_GPL instead of EXPORT_SYMBOL - removed 2.4 compatibility. - created TCP_OFFLOAD config option. - moved #defines to appropriate files. - removed obfuscating macros. - included necessary definitions instead of struct. - made IS_OFFLOADED an inline function instead of macro. The following items are currently being worked on: - use sysfs instead of procfs. - addressing the use of semaphores in 'register_tom'. - use RCU, need to look at this. - use inline function instead of TOEDEV macro, requires some work. Comments: - static was removed from functions '__tcp_inherit_port' & '__tcp_v4_hash' because these are called outside of tcp_ipv4.c from the TOM driver. Signed-off-by: Scott Bardone <sbardone@chelsio.com> diff -Naur linux-2.6.13-rc6-git3/include/linux/netdevice.h linux-2.6.13-rc6-git3.patched/include/linux/netdevice.h --- linux-2.6.13-rc6-git3/include/linux/netdevice.h 2005-08-07 11:18:56.000000000 -0700 +++ linux-2.6.13-rc6-git3.patched/include/linux/netdevice.h 2005-08-11 21:28:36.000000000 -0700 @@ -408,6 +408,9 @@ #define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */ #define NETIF_F_TSO 2048 /* Can offload TCP/IP segmentation */ #define NETIF_F_LLTX 4096 /* LockLess TX */ +#ifdef CONFIG_TCP_OFFLOAD +#define NETIF_F_TCPIP_OFFLOAD 65536 /* Can offload TCP/IP */ +#endif /* Called after device is detached from network. 
*/ void (*uninit)(struct net_device *dev); diff -Naur linux-2.6.13-rc6-git3/include/linux/tcp_diag.h linux-2.6.13-rc6-git3.patched/include/linux/tcp_diag.h --- linux-2.6.13-rc6-git3/include/linux/tcp_diag.h 2005-08-07 11:18:56.000000000 -0700 +++ linux-2.6.13-rc6-git3.patched/include/linux/tcp_diag.h 2005-08-11 21:28:36.000000000 -0700 @@ -4,6 +4,11 @@ /* Just some random number */ #define TCPDIAG_GETSOCK 18 +/* TOE API */ +#ifdef CONFIG_TCP_OFFLOAD +#define TCPDIAG_OFFLOAD 5 +#endif + /* Socket identity */ struct tcpdiag_sockid { diff -Naur linux-2.6.13-rc6-git3/include/linux/tcp.h linux-2.6.13-rc6-git3.patched/include/linux/tcp.h --- linux-2.6.13-rc6-git3/include/linux/tcp.h 2005-08-07 11:18:56.000000000 -0700 +++ linux-2.6.13-rc6-git3.patched/include/linux/tcp.h 2005-08-11 21:28:36.000000000 -0700 @@ -235,6 +235,10 @@ return (struct tcp_request_sock *)req; } +#ifdef CONFIG_TCP_OFFLOAD +struct toe_funcs; +#endif + struct tcp_sock { /* inet_sock has to be the first member of tcp_sock */ struct inet_sock inet; @@ -342,6 +346,10 @@ struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */ +#ifdef CONFIG_TCP_OFFLOAD + struct toe_funcs *toe_specific; /* Operations overriden by TOEs */ +#endif + __u32 rcv_wnd; /* Current receiver window */ __u32 rcv_wup; /* rcv_nxt on last window update sent */ __u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ diff -Naur linux-2.6.13-rc6-git3/include/linux/toedev.h linux-2.6.13-rc6-git3.patched/include/linux/toedev.h --- linux-2.6.13-rc6-git3/include/linux/toedev.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.13-rc6-git3.patched/include/linux/toedev.h 2005-08-11 22:37:03.947800000 -0700 @@ -0,0 +1,126 @@ +/***************************************************************************** + * * + * File: * + * toedev.h * + * * + * Description: * + * TOE device definitions. 
* + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License, version 2, as * + * published by the Free Software Foundation. * + * * + * You should have received a copy of the GNU General Public License along * + * with this program; if not, write to the Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + * * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED * + * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF * + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. * + * * + * http://www.chelsio.com * + * * + * Copyright (c) 2003 - 2005 Chelsio Communications, Inc. * + * All rights reserved. * + * * + * Maintainers: maintainers@chelsio.com * + * * + * Authors: Dimitrios Michailidis <dm@chelsio.com> * + * * + * History: * + * * + ****************************************************************************/ +/* $Date: 2005/08/09 18:58:21 $ $RCSfile: toedev.h,v $ $Revision: 1.4 $ */ + +#ifndef _TOEDEV_H_ +#define _TOEDEV_H_ + +#include <linux/list.h> +#include <asm/atomic.h> +#include <linux/netdevice.h> +#include <linux/proc_fs.h> +#include <linux/skbuff.h> +#include <net/neighbour.h> +#include <net/sock.h> + +#define TOENAMSIZ 16 + +/* Get the toedev associated with a net_device */ +#define TOEDEV(netdev) ((struct toedev *)(netdev)->ec_ptr) + +/* TOE type ids */ +enum { + TOE_ID_CHELSIO_T1 = 1, + TOE_ID_CHELSIO_T1C, + TOE_ID_CHELSIO_T3, +}; + +struct toe_id { + unsigned int id; + unsigned long data; +}; + +struct tom_info; + +struct toedev { + char name[TOENAMSIZ]; /* TOE device name */ + struct list_head toe_list; /* for list linking */ + int toe_index; /* unique TOE device index */ + unsigned int ttid; /* TOE type id */ + unsigned long flags; /* device flags */ + unsigned int mtu; /* max size of TX offloaded data */ + unsigned int nconn; /* max # of offloaded connections */ + struct 
net_device *lldev; /* LL device associated with TOE messages */ + const struct tom_info *offload_mod; /* attached TCP offload module */ + struct proc_dir_entry *proc_dir; /* root of proc dir for this TOE */ + int (*open)(struct toedev *dev); + int (*close)(struct toedev *dev); + int (*can_offload)(struct toedev *dev, struct sock *sk); + int (*connect)(struct toedev *dev, struct sock *sk); + int (*send)(struct toedev *dev, struct sk_buff *skb); + int (*recv)(struct toedev *dev, struct sk_buff **skb, int n); + int (*ctl)(struct toedev *dev, unsigned int req, void *data); + void (*neigh_update)(struct net_device *lldev, + struct toedev *dev, + struct neighbour *neigh, int fl); + void *priv; /* driver private data */ + void *l2opt; /* optional layer 2 data */ + void *l3opt; /* optional layer 3 data */ + void *l4opt; /* optional layer 4 data */ + void *ulp; /* ulp stuff */ + atomic_t refcnt; /* reference count */ +}; + +struct tom_info { + int (*attach)(struct toedev *dev, const struct toe_id *entry); + int (*detach)(struct toedev *dev); + const char *name; + struct toe_id *id_table; + struct list_head list_node; +}; + +/* Flags for toe_neigh_update() */ +enum { + NEIGH_ADDR_CHANGED = 1 +}; + +static inline void toedev_hold(struct toedev *dev) +{ + atomic_inc(&dev->refcnt); +} + +static inline void toedev_put(struct toedev *dev) +{ + atomic_dec(&dev->refcnt); +} + +int register_tom(struct tom_info *t); +int unregister_tom(struct tom_info *t); +int register_toedev(struct toedev *dev, const char *name); +int activate_toedev(struct toedev *dev); +struct toedev *alloc_toedev(void); +void toe_set_lldev(struct toedev *dev, struct net_device *lldev); +int toe_send(struct toedev *dev, struct sk_buff *skb); +int toe_receive_skb(struct toedev *dev, struct sk_buff **skb, int n); +void toe_neigh_update(struct neighbour *neigh, int flags); +#endif /* _TOEDEV_H_ */ diff -Naur linux-2.6.13-rc6-git3/include/net/offload.h linux-2.6.13-rc6-git3.patched/include/net/offload.h --- 
linux-2.6.13-rc6-git3/include/net/offload.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.13-rc6-git3.patched/include/net/offload.h 2005-08-11 22:37:03.998792000 -0700 @@ -0,0 +1,87 @@ +/***************************************************************************** + * * + * File: * + * offload.h * + * * + * Description: * + * TCP offload support. * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License, version 2, as * + * published by the Free Software Foundation. * + * * + * You should have received a copy of the GNU General Public License along * + * with this program; if not, write to the Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + * * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED * + * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF * + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. * + * * + * http://www.chelsio.com * + * * + * Copyright (c) 2003 - 2005 Chelsio Communications, Inc. * + * All rights reserved. * + * * + * Maintainers: maintainers@chelsio.com * + * * + * Authors: Dimitrios Michailidis <dm@chelsio.com> * + * * + * History: * + * * + ****************************************************************************/ +/* $Date: 2005/08/09 18:58:21 $ $RCSfile: offload.h,v $ $Revision: 1.4 $ */ + +#ifndef _NET_OFFLOAD_H +#define _NET_OFFLOAD_H +#include <linux/skbuff.h> +#include <net/sock.h> +#include <linux/toedev.h> + +enum { + OFFLOAD_LISTEN_START, + OFFLOAD_LISTEN_STOP +}; + +/* Returns true if sk is an offloaded IPv4 TCP socket. */ +static inline int is_offloaded(struct sock *sk) +{ + return ((sk)->sk_family == AF_INET && (sk)->sk_prot != &tcp_prot); +} + +/* Per-skb backlog handler. Run when a socket's backlog is processed. 
*/ +struct blog_skb_cb { + void (*backlog_rcv)(struct sock *sk, struct sk_buff *skb); + struct toedev *dev; +}; + +#define BLOG_SKB_CB(skb) ((struct blog_skb_cb *)(skb)->cb) + +/* Offload structure */ +struct tcpdiag_offload { + unsigned int offload_dev_idx; + unsigned int offload_cookie; + unsigned int mem; +}; + +struct notifier_block; + +/* + * TCP operations that a TOE wants to override but cannot through existing + * means. + */ +struct toe_funcs { + void (*rcv_consumed)(struct sock *sk, int consumed); + void (*pmtu_changed)(struct sock *sk); + void (*set_keepalive)(struct sock *sk, int on_off); + void (*tcpdiag_offload_info)(const struct sock *sk, + struct tcpdiag_offload *oinfo); + int (*sendskb)(struct sock *sk, struct sk_buff *skb, int flags); +}; + +extern int register_listen_offload_notifier(struct notifier_block *nb); +extern int unregister_listen_offload_notifier(struct notifier_block *nb); +extern int tcp_listen_offload_stop(struct sock *sk); +extern int tcp_listen_offload(struct sock *sk); +extern int tcp_connect_offload(struct sock *sk); +#endif /* _NET_OFFLOAD_H */ diff -Naur linux-2.6.13-rc6-git3/net/core/Makefile linux-2.6.13-rc6-git3.patched/net/core/Makefile --- linux-2.6.13-rc6-git3/net/core/Makefile 2005-08-07 11:18:56.000000000 -0700 +++ linux-2.6.13-rc6-git3.patched/net/core/Makefile 2005-08-11 21:28:37.000000000 -0700 @@ -17,3 +17,4 @@ obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NET_RADIO) += wireless.o obj-$(CONFIG_NETPOLL) += netpoll.o +obj-$(CONFIG_TCP_OFFLOAD) += toedev.o diff -Naur linux-2.6.13-rc6-git3/net/core/neighbour.c linux-2.6.13-rc6-git3.patched/net/core/neighbour.c --- linux-2.6.13-rc6-git3/net/core/neighbour.c 2005-08-07 11:18:56.000000000 -0700 +++ linux-2.6.13-rc6-git3.patched/net/core/neighbour.c 2005-08-11 21:28:37.000000000 -0700 @@ -32,6 +32,9 @@ #include <net/sock.h> #include <linux/rtnetlink.h> #include <linux/random.h> +#ifdef CONFIG_TCP_OFFLOAD +#include <linux/toedev.h> +#endif #include <linux/string.h> 
#define NEIGH_DEBUG 1 @@ -763,6 +766,9 @@ NEIGH_PRINTK2("neigh %p is suspected.\n", neigh); neigh->nud_state = NUD_STALE; neigh_suspect(neigh); +#ifdef CONFIG_TCP_OFFLOAD + toe_neigh_update(neigh, 0); +#endif } } else if (state & NUD_DELAY) { if (time_before_eq(now, @@ -770,6 +776,9 @@ NEIGH_PRINTK2("neigh %p is now reachable.\n", neigh); neigh->nud_state = NUD_REACHABLE; neigh_connect(neigh); +#ifdef CONFIG_TCP_OFFLOAD + toe_neigh_update(neigh, 0); +#endif next = neigh->confirmed + neigh->parms->reachable_time; } else { NEIGH_PRINTK2("neigh %p is probed.\n", neigh); @@ -788,6 +797,9 @@ neigh->nud_state = NUD_FAILED; notify = 1; +#ifdef CONFIG_TCP_OFFLOAD + toe_neigh_update(neigh, 0); +#endif NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); NEIGH_PRINTK2("neigh %p is failed.\n", neigh); @@ -952,6 +964,9 @@ if (old & NUD_CONNECTED) neigh_suspect(neigh); neigh->nud_state = new; +#ifdef CONFIG_TCP_OFFLOAD + toe_neigh_update(neigh, 0); +#endif err = 0; #ifdef CONFIG_ARPD notify = old & NUD_VALID; @@ -1031,6 +1046,9 @@ notify = 1; #endif } +#ifdef CONFIG_TCP_OFFLOAD + toe_neigh_update(neigh, lladdr != neigh->ha ? NEIGH_ADDR_CHANGED : 0); +#endif if (new == old) goto out; if (new & NUD_CONNECTED) diff -Naur linux-2.6.13-rc6-git3/net/core/toedev.c linux-2.6.13-rc6-git3.patched/net/core/toedev.c --- linux-2.6.13-rc6-git3/net/core/toedev.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.13-rc6-git3.patched/net/core/toedev.c 2005-08-11 22:37:03.965797000 -0700 @@ -0,0 +1,485 @@ +/***************************************************************************** + * * + * File: * + * toedev.c * + * * + * Description: * + * TOE device support infrastructure. * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License, version 2, as * + * published by the Free Software Foundation. 
* + * * + * You should have received a copy of the GNU General Public License along * + * with this program; if not, write to the Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + * * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED * + * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF * + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. * + * * + * http://www.chelsio.com * + * * + * Copyright (c) 2003 - 2005 Chelsio Communications, Inc. * + * All rights reserved. * + * * + * Maintainers: maintainers@chelsio.com * + * * + * Authors: Dimitrios Michailidis <dm@chelsio.com> * + * * + * History: * + * * + ****************************************************************************/ +/* $Date: 2005/08/10 01:33:18 $ $RCSfile: toedev.c,v $ $Revision: 1.5 $ */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/netdevice.h> +#include <linux/toedev.h> +#include <net/neighbour.h> +#include <asm/semaphore.h> + +#define boot_phase 0 + +#ifndef __raise_softirq_irqoff +#define __raise_softirq_irqoff(nr) __cpu_raise_softirq(smp_processor_id(), nr) +#endif + +static DECLARE_MUTEX(toedev_db_lock); +static LIST_HEAD(toedev_list); +static LIST_HEAD(tom_list); + +static int toedev_init(void); + +/* + * Returns the entry in the TOE id table 'table' that has a given id, or NULL + * if the id is not found. + */ +static const struct toe_id *id_find(unsigned int id, + const struct toe_id *table) +{ + const struct toe_id *p; + + for (p = table; p->id; ++p) + if (p->id == id) + return p; + return NULL; +} + +/* + * Returns true if a TOE device is presently attached to an offload module. 
+ */ +static inline int is_attached(const struct toedev *dev) +{ + return dev->offload_mod != NULL; +} + +/* + * Try to attach a new TOE device to an existing TCP offload module that can + * handle the device's TOE id. Returns 0 if it succeeds. + * + * Must be called with the toedev_db_lock held. + */ +static int toedev_attach(struct toedev *dev) +{ + struct tom_info *t; + + list_for_each_entry(t, &tom_list, list_node) { + const struct toe_id *entry; + + entry = id_find(dev->ttid, t->id_table); + if (entry && t->attach(dev, entry) == 0) { + dev->offload_mod = t; + return 0; + } + } + return -ENOPROTOOPT; +} + +/* + * Register a TCP Offload Module (TOM). + */ +int register_tom(struct tom_info *t) +{ + down(&toedev_db_lock); + list_add(&t->list_node, &tom_list); + up(&toedev_db_lock); + return 0; +} + +/* + * Unregister a TCP Offload Module (TOM). Note that this does not affect any + * TOE devices to which the TOM is already attached. + */ +int unregister_tom(struct tom_info *t) +{ + down(&toedev_db_lock); + list_del(&t->list_node); + up(&toedev_db_lock); + return 0; +} + +/* + * Find a TOE device by name. Must be called with toedev_db_lock held. + */ +static struct toedev *__find_toedev_by_name(const char *name) +{ + struct toedev *dev; + + list_for_each_entry(dev, &toedev_list, toe_list) { + if (!strncmp(dev->name, name, TOENAMSIZ)) + return dev; + } + return NULL; +} + +#if 0 +/* + * Find a TOE device by name. + */ +static struct toedev *find_toedev_by_name(const char *name) +{ + struct toedev *dev; + + down(&toedev_db_lock); + dev = __find_toedev_by_name(name); + if (dev) + toedev_hold(dev); + up(&toedev_db_lock); + return dev; +} +#endif + +/* + * Find a TOE device by index. Must be called with toedev_db_lock held. 
+ */ +static struct toedev *__find_toedev_by_index(int index) +{ + struct toedev *dev; + + list_for_each_entry(dev, &toedev_list, toe_list) { + if (dev->toe_index == index) + return dev; + } + return NULL; +} + +/* + * Return true if a TOE device is already registered. + * Must be called with the toedev_db_lock held. + */ +static int toedev_registered(const struct toedev *dev) +{ + struct toedev *d; + + list_for_each_entry(d, &toedev_list, toe_list) { + if (d == dev) + return 1; + } + return 0; +} + +/* + * Finalize the name of a TOE device by assigning values to any format strings + * in its name. + */ +static int toedev_assign_name(struct toedev *dev, const char *name, int limit) +{ + int i; + + for (i = 0; i < limit; ++i) { + char s[TOENAMSIZ]; + + snprintf(s, sizeof(s), name, i); + if (!__find_toedev_by_name(s)) { + strcpy(dev->name, s); + return 0; + } + } + return -1; +} + +/* + * Allocate a unique index for a TOE device. We keep the index within 30 bits + * to allow it to be used as a sysctl index, which uses signed IDs. + * + * We don't handle index exhaustion. Guess why. + */ +static int toedev_new_index(void) +{ + static int toe_index; + + for (;;) { + if (++toe_index & 0xc0000000) + toe_index = 1; + if (!__find_toedev_by_index(toe_index)) + return toe_index; + } +} + +#ifdef CONFIG_PROC_FS +#include <linux/proc_fs.h> + +static struct proc_dir_entry *toedev_proc_root; + +/* XXX This doesn't handle module unloading properly. Do we need to? */ + +static int devices_read_proc(char *buf, char **start, off_t offset, + int length, int *eof, void *data) +{ + int len; + struct toedev *dev; + struct net_device *ndev; + + len = sprintf(buf, + "Device Offload Module Interfaces\n"); + + down(&toedev_db_lock); + list_for_each_entry(dev, &toedev_list, toe_list) { + len += sprintf(buf + len, "%-16s %-20s", dev->name, + is_attached(dev) ? 
dev->offload_mod->name : "<None>"); + read_lock(&dev_base_lock); + for (ndev = dev_base; ndev; ndev = ndev->next) { + if (TOEDEV(ndev) == dev) + len += sprintf(buf + len, " %s", ndev->name); + } + read_unlock(&dev_base_lock); + len += sprintf(buf + len, "\n"); + if (len >= length) + break; + } + up(&toedev_db_lock); + + if (len > length) + len = length; + *eof = 1; + return len; +} + +static void toe_proc_cleanup(void) +{ + remove_proc_entry("devices", toedev_proc_root); + remove_proc_entry("net/toe", NULL); + toedev_proc_root = NULL; +} + +static struct proc_dir_entry *create_toe_proc_dir(const char *name) +{ + struct proc_dir_entry *d; + + if (!toedev_proc_root) + return NULL; + + d = proc_mkdir(name, toedev_proc_root); + if (d) + d->owner = THIS_MODULE; + return d; +} + +#if 0 +static void delete_toe_proc_dir(struct toedev *dev) +{ + if (dev->proc_dir) { + remove_proc_entry(dev->name, toedev_proc_root); + dev->proc_dir = NULL; + } +} +#endif + +static int __init toe_proc_init(void) +{ + struct proc_dir_entry *d; + + toedev_proc_root = proc_mkdir("net/toe", NULL); + if (!toedev_proc_root) + return -ENOMEM; + toedev_proc_root->owner = THIS_MODULE; + + d = create_proc_read_entry("devices", 0, toedev_proc_root, + devices_read_proc, NULL); + if (!d) + goto cleanup; + d->owner = THIS_MODULE; + return 0; + +cleanup: + toe_proc_cleanup(); + return -ENOMEM; +} +#else +#define toe_proc_init() 0 +#define create_toe_proc_dir(name) NULL +#define delete_toe_proc_dir(dev) +#endif /* CONFIG_PROC_FS */ + +/* + * Register a TOE device and try to attach an appropriate TCP offload module + * to it. 'name' is a template that may contain at most one %d format + * specifier. + */ +int register_toedev(struct toedev *dev, const char *name) +{ + int ret; + char *p; + + if (boot_phase) + toedev_init(); + + /* Validate the name template. Only one %d allowed. 
*/ + if (!name || !*name) + return -EINVAL; + p = strchr(name, '%'); + if (p && (p[1] != 'd' || strchr(p + 2, '%'))) + return -EINVAL; + + down(&toedev_db_lock); + if (toedev_registered(dev)) { /* device already registered */ + ret = -EEXIST; + goto out; + } + + if ((ret = toedev_assign_name(dev, name, 32)) != 0) + goto out; + + dev->proc_dir = create_toe_proc_dir(dev->name); + dev->toe_index = toedev_new_index(); + dev->offload_mod = NULL; + list_add_tail(&dev->toe_list, &toedev_list); + toedev_hold(dev); +out: + up(&toedev_db_lock); + return ret; +} + +/* + * Allocate and initialize a toedev structure. + */ +struct toedev *alloc_toedev(void) +{ + struct toedev *dev = kmalloc(sizeof(struct toedev), GFP_KERNEL); + + if (dev) { + memset(dev, 0, sizeof(struct toedev)); + atomic_set(&dev->refcnt, 0); + INIT_LIST_HEAD(&dev->toe_list); + } + return dev; +} + +/* + * Activate a TOE device. + */ +int activate_toedev(struct toedev *dev) +{ + int ret = 0; + + down(&toedev_db_lock); + if (!toedev_registered(dev)) + ret = -ENODEV; + else if (!is_attached(dev)) + ret = toedev_attach(dev); + up(&toedev_db_lock); + return ret; +} + +/* + * Set the link-layer device associated with a TOE. For sniffing purposes any + * messages sent to/received from the TOE will be associated with this device. + */ +void toe_set_lldev(struct toedev *dev, struct net_device *lldev) +{ + struct net_device *olddev = dev->lldev; + + if (lldev) + dev_hold(lldev); + dev->lldev = lldev; + if (olddev) + dev_put(olddev); +} + +/* + * Sends an sk_buff to a TOE driver after dealing with any active network taps. 
+ */ +int toe_send(struct toedev *dev, struct sk_buff *skb) +{ + int r; + + local_bh_disable(); + if (unlikely(netdev_nit)) { /* deal with active taps */ + skb->nh.raw = skb->data; + skb->dev = dev->lldev; + dev_queue_xmit_nit(skb, skb->dev); + } + r = dev->send(dev, skb); + local_bh_enable(); + return r; +} + +/* + * toe_receive_skb - process n received TOE packets + * @dev: the toe device + * @skb: an array of offload packets + * @n: the number of offload packets + * + * Process an array of ingress offload packets. Each packet is forwarded + * to any active network taps and then passed to the toe device's receive + * method. We optimize passing packets to the receive method by passing + * it the whole array at once except when there are active taps. + */ +int toe_receive_skb(struct toedev *dev, struct sk_buff **skb, int n) +{ + int i; + + if (likely(!netdev_nit)) + return dev->recv(dev, skb, n); + + for (i = 0; i < n; ++i) { + struct sk_buff *p = *skb++; + + p->dev = dev->lldev; + skb_get(p); + netif_receive_skb(p); + p->dev = NULL; + dev->recv(dev, &p, 1); + } + return 0; +} + +void toe_neigh_update(struct neighbour *neigh, int flags) +{ + struct net_device *dev = neigh->dev; + + if (dev && (dev->features & NETIF_F_TCPIP_OFFLOAD)) { + struct toedev *tdev = TOEDEV(dev); + + if (tdev && tdev->neigh_update) + tdev->neigh_update(neigh->dev, tdev, neigh, flags); + } +} + +static int __init toedev_init(void) +{ +#ifndef boot_phase + boot_phase = 0; +#endif + + /* We tolerate proc failures */ + if (toe_proc_init()) + printk(KERN_WARNING "Unable to create /proc/net/toe entries\n"); + + return 0; +} + +subsys_initcall(toedev_init); + +EXPORT_SYMBOL_GPL(register_tom); +EXPORT_SYMBOL_GPL(unregister_tom); +EXPORT_SYMBOL_GPL(register_toedev); +EXPORT_SYMBOL_GPL(alloc_toedev); +EXPORT_SYMBOL_GPL(activate_toedev); +EXPORT_SYMBOL_GPL(toe_set_lldev); +EXPORT_SYMBOL_GPL(toe_send); +EXPORT_SYMBOL_GPL(toe_receive_skb); diff -Naur linux-2.6.13-rc6-git3/net/ipv4/Makefile 
linux-2.6.13-rc6-git3.patched/net/ipv4/Makefile --- linux-2.6.13-rc6-git3/net/ipv4/Makefile 2005-08-07 11:18:56.000000000 -0700 +++ linux-2.6.13-rc6-git3.patched/net/ipv4/Makefile 2005-08-11 21:28:37.000000000 -0700 @@ -10,6 +10,7 @@ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ sysctl_net_ipv4.o fib_frontend.o fib_semantics.o +obj-$(CONFIG_TCP_OFFLOAD) += offload.o obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o obj-$(CONFIG_PROC_FS) += proc.o diff -Naur linux-2.6.13-rc6-git3/net/ipv4/offload.c linux-2.6.13-rc6-git3.patched/net/ipv4/offload.c --- linux-2.6.13-rc6-git3/net/ipv4/offload.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.13-rc6-git3.patched/net/ipv4/offload.c 2005-08-11 22:37:04.015789000 -0700 @@ -0,0 +1,161 @@ +/***************************************************************************** + * * + * File: * + * offload.c * + * * + * Description: * + * TCP offload support. * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License, version 2, as * + * published by the Free Software Foundation. * + * * + * You should have received a copy of the GNU General Public License along * + * with this program; if not, write to the Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + * * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED * + * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF * + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. * + * * + * http://www.chelsio.com * + * * + * Copyright (c) 2003 - 2005 Chelsio Communications, Inc. * + * All rights reserved. 
* + * * + * Maintainers: maintainers@chelsio.com * + * * + * Authors: Dimitrios Michailidis <dm@chelsio.com> * + * * + * History: * + * * + ****************************************************************************/ +/* $Date: 2005/08/09 18:58:21 $ $RCSfile: offload.c,v $ $Revision: 1.3 $ */ + +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/notifier.h> +#include <linux/toedev.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <net/offload.h> +#include <asm/semaphore.h> +#include <linux/netdevice.h> + +# define inet_addr_info(sk) inet_sk(sk) + +/* Replace this with a R/W semaphore someday. See kernel/profile.c */ +static DECLARE_MUTEX(notify_mutex); +static struct notifier_block *listen_offload_notify_list; + +int register_listen_offload_notifier(struct notifier_block *nb) +{ + int err; + + down(&notify_mutex); + err = notifier_chain_register(&listen_offload_notify_list, nb); + up(&notify_mutex); + return err; +} + +int unregister_listen_offload_notifier(struct notifier_block *nb) +{ + int err; + + down(&notify_mutex); + err = notifier_chain_unregister(&listen_offload_notify_list, nb); + up(&notify_mutex); + return err; +} + +/* + * Called when an active open has been requested through connect(2). Decides + * if the connection may be offloaded based on the system's offload policies + * and the capabilities of the egress interface. + * + * Returns 1 if the connection is offloaded and 0 otherwise. + */ +int tcp_connect_offload(struct sock *sk) +{ + if (sk->sk_route_caps & NETIF_F_TCPIP_OFFLOAD) { + struct toedev *dev = TOEDEV(__sk_dst_get(sk)->dev); + + if (!dev || !dev->can_offload(dev, sk)) + return 0; + /* XXX check offload policies */ + if (dev->connect(dev, sk) == 0) + return 1; + } + return 0; +} + +/* + * TOE capable backlog handler. This is used for offloaded listening sockets + * so they can deal with non-IP (TOE) packets queued in their backlogs. We + * distinguish TOE from IP packets easily as the former lack network headers. 
+ * Such TOE packets are fed to a TOE-specific backlog handler. + */ +static int listen_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + if (likely(skb->h.raw != skb->nh.raw)) + return tcp_v4_do_rcv(sk, skb); + BLOG_SKB_CB(skb)->backlog_rcv(sk, skb); + return 0; +} + +/* + * Called when the SW stack has transitioned a socket to listen state. + * We check if the socket should be offloaded according to the current + * offloading policies, and if so, publish an OFFLOAD_LISTEN_START event. + */ +int tcp_listen_offload(struct sock *sk) +{ + /* IPv4 only for now */ + if (sk->sk_family != PF_INET) + return 0; + + /* filter out loopback listens */ + if (LOOPBACK(inet_addr_info(sk)->rcv_saddr)) + return 0; + + /* if (nf_sock_hook(PF_INET, NF_IP_OFFLOAD, sk) != NF_ACCEPT) + return 0; */ + + /* Install a TOE capable backlog handler */ + sk->sk_backlog_rcv = listen_backlog_rcv; + + down(&notify_mutex); + notifier_call_chain(&listen_offload_notify_list, + OFFLOAD_LISTEN_START, sk); + up(&notify_mutex); + return 1; +} + +/* + * Called through a netfilter hook when a socket starts listening. + * Publishes an OFFLOAD_LISTEN_START event. + */ +static int tcp_listen_offload_start(struct sk_buff *skb) +{ + down(&notify_mutex); + notifier_call_chain(&listen_offload_notify_list, OFFLOAD_LISTEN_START, + skb->sk); + up(&notify_mutex); + return 0; +} + +/* + * Called when the SW stack is preparing to close an existing listening socket. + * We publish an OFFLOAD_LISTEN_STOP event. 
+ */ +int tcp_listen_offload_stop(struct sock *sk) +{ + down(&notify_mutex); + notifier_call_chain(&listen_offload_notify_list, + OFFLOAD_LISTEN_STOP, sk); + up(&notify_mutex); + return 0; +} + +EXPORT_SYMBOL_GPL(register_listen_offload_notifier); +EXPORT_SYMBOL_GPL(unregister_listen_offload_notifier); diff -Naur linux-2.6.13-rc6-git3/net/ipv4/tcp.c linux-2.6.13-rc6-git3.patched/net/ipv4/tcp.c --- linux-2.6.13-rc6-git3/net/ipv4/tcp.c 2005-08-07 11:18:56.000000000 -0700 +++ linux-2.6.13-rc6-git3.patched/net/ipv4/tcp.c 2005-08-11 21:28:37.000000000 -0700 @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $ + * Version: $Id: tcp.c,v 1.2 2005/08/12 04:28:37 sbardone Exp $ * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -262,7 +262,9 @@ #include <net/tcp.h> #include <net/xfrm.h> #include <net/ip.h> - +#ifdef CONFIG_TCP_OFFLOAD +#include <net/offload.h> +#endif #include <asm/uaccess.h> #include <asm/ioctls.h> @@ -483,6 +485,9 @@ sk_dst_reset(sk); sk->sk_prot->hash(sk); +#ifdef CONFIG_TCP_OFFLOAD + tcp_listen_offload(sk); +#endif return 0; } @@ -735,6 +740,11 @@ ssize_t res; struct sock *sk = sock->sk; +#ifdef CONFIG_TCP_OFFLOAD + if (sk->sk_prot->sendpage) + return sk->sk_prot->sendpage(sk, page, offset, size, flags); +#endif + #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM) if (!(sk->sk_route_caps & NETIF_F_SG) || @@ -994,7 +1004,10 @@ * this, no blocking and very strange errors 8) */ -static int tcp_recv_urg(struct sock *sk, long timeo, +#ifndef CONFIG_TCP_OFFLOAD +static +#endif +int tcp_recv_urg(struct sock *sk, long timeo, struct msghdr *msg, int len, int flags, int *addr_len) { @@ -1193,8 +1206,14 @@ tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. 
*/ - if (copied) - cleanup_rbuf(sk, copied); + if (copied) { +#ifdef CONFIG_TCP_OFFLOAD + if (tp->toe_specific) + tp->toe_specific->rcv_consumed(sk, copied); + else +#endif + cleanup_rbuf(sk, copied); + } return copied; } @@ -1615,6 +1634,9 @@ sk->sk_shutdown = SHUTDOWN_MASK; if (sk->sk_state == TCP_LISTEN) { +#ifdef CONFIG_TCP_OFFLOAD + tcp_listen_offload_stop(sk); +#endif tcp_set_state(sk, TCP_CLOSE); /* Special case. */ diff -Naur linux-2.6.13-rc6-git3/net/ipv4/tcp_diag.c linux-2.6.13-rc6-git3.patched/net/ipv4/tcp_diag.c --- linux-2.6.13-rc6-git3/net/ipv4/tcp_diag.c 2005-08-07 11:18:56.000000000 -0700 +++ linux-2.6.13-rc6-git3.patched/net/ipv4/tcp_diag.c 2005-08-11 21:28:37.000000000 -0700 @@ -1,7 +1,7 @@ /* * tcp_diag.c Module for monitoring TCP sockets. * - * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ + * Version: $Id: tcp_diag.c,v 1.2 2005/08/12 04:28:37 sbardone Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -24,6 +24,9 @@ #include <net/tcp.h> #include <net/ipv6.h> #include <net/inet_common.h> +#ifdef CONFIG_TCP_OFFLOAD +#include <net/offload.h> +#endif #include <linux/inet.h> #include <linux/stddef.h> @@ -54,6 +57,9 @@ struct nlmsghdr *nlh; struct tcp_info *info = NULL; struct tcpdiag_meminfo *minfo = NULL; +#ifdef CONFIG_TCP_OFFLOAD + struct tcpdiag_offload *oinfo = NULL; +#endif unsigned char *b = skb->tail; nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); @@ -70,6 +76,11 @@ strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1), tp->ca_ops->name); } +#ifdef CONFIG_TCP_OFFLOAD + if ((ext & (1 << (TCPDIAG_OFFLOAD - 1))) && + tp->toe_specific && tp->toe_specific->tcpdiag_offload_info) + oinfo = TCPDIAG_PUT(skb, TCPDIAG_OFFLOAD, sizeof(*oinfo)); +#endif } r->tcpdiag_family = sk->sk_family; r->tcpdiag_state = sk->sk_state; @@ -163,6 +174,11 @@ if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info) tp->ca_ops->get_info(tp, ext, skb); +#ifdef CONFIG_TCP_OFFLOAD + if (oinfo) + tp->toe_specific->tcpdiag_offload_info(sk, 
oinfo); +#endif + nlh->nlmsg_len = skb->tail - b; return skb->len; diff -Naur linux-2.6.13-rc6-git3/net/ipv4/tcp_ipv4.c linux-2.6.13-rc6-git3.patched/net/ipv4/tcp_ipv4.c --- linux-2.6.13-rc6-git3/net/ipv4/tcp_ipv4.c 2005-08-11 16:57:48.000000000 -0700 +++ linux-2.6.13-rc6-git3.patched/net/ipv4/tcp_ipv4.c 2005-08-11 21:28:37.000000000 -0700 @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $ + * Version: $Id: tcp_ipv4.c,v 1.2 2005/08/12 04:28:37 sbardone Exp $ * * IPv4 specific functions * @@ -68,6 +68,9 @@ #include <net/ipv6.h> #include <net/inet_common.h> #include <net/xfrm.h> +#ifdef CONFIG_TCP_OFFLOAD +#include <net/offload.h> +#endif #include <linux/inet.h> #include <linux/ipv6.h> @@ -151,7 +154,10 @@ } /* Caller must disable local BH processing. */ -static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child) +#ifndef CONFIG_TCP_OFFLOAD +static +#endif +__inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child) { struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)]; @@ -351,7 +357,10 @@ } } -static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible) +#ifndef CONFIG_TCP_OFFLOAD +static +#endif +__inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible) { struct hlist_head *list; rwlock_t *lock; @@ -835,6 +844,11 @@ __sk_dst_set(sk, &rt->u.dst); tcp_v4_setup_caps(sk, &rt->u.dst); +#ifdef CONFIG_TCP_OFFLOAD + if (tcp_connect_offload(sk)) + return 0; +#endif + if (!tp->write_seq) tp->write_seq = secure_tcp_sequence_number(inet->saddr, inet->daddr, @@ -1494,11 +1508,20 @@ * to destinations, already remembered * to the moment of synflood. */ +#ifndef CONFIG_TCP_OFFLOAD LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open " "request from %u.%u." 
"%u.%u/%u\n", NIPQUAD(saddr), ntohs(skb->h.th->source))); +#else + NETDEBUG(if (net_ratelimit()) \ + printk(KERN_DEBUG "TCP: drop open " + "request from %u.%u." + "%u.%u/%u\n", \ + NIPQUAD(saddr), + ntohs(skb->h.th->source))); +#endif dst_release(dst); goto drop_and_free; } @@ -1626,7 +1649,12 @@ skb->nh.iph->daddr, skb->csum)) return 0; +#ifndef CONFIG_TCP_OFFLOAD LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n")); +#else + NETDEBUG(if (net_ratelimit()) + printk(KERN_DEBUG "hw tcp v4 csum failed\n")); +#endif skb->ip_summed = CHECKSUM_NONE; } if (skb->len <= 76) { diff -Naur linux-2.6.13-rc6-git3/net/ipv4/tcp_timer.c linux-2.6.13-rc6-git3.patched/net/ipv4/tcp_timer.c --- linux-2.6.13-rc6-git3/net/ipv4/tcp_timer.c 2005-08-07 11:18:56.000000000 -0700 +++ linux-2.6.13-rc6-git3.patched/net/ipv4/tcp_timer.c 2005-08-11 21:28:37.000000000 -0700 @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $ + * Version: $Id: tcp_timer.c,v 1.2 2005/08/12 04:28:37 sbardone Exp $ * * Authors: Ross Biro * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -22,6 +22,9 @@ #include <linux/module.h> #include <net/tcp.h> +#ifdef CONFIG_TCP_OFFLOAD +#include <net/offload.h> +#endif int sysctl_tcp_syn_retries = TCP_SYN_RETRIES; int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; @@ -559,6 +562,13 @@ if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return; +#ifdef CONFIG_TCP_OFFLOAD + if (tcp_sk(sk)->toe_specific) { + tcp_sk(sk)->toe_specific->set_keepalive(sk, val); + return; + } +#endif + if (val && !sock_flag(sk, SOCK_KEEPOPEN)) tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); else if (!val) diff -Naur linux-2.6.13-rc6-git3/net/Kconfig linux-2.6.13-rc6-git3.patched/net/Kconfig --- linux-2.6.13-rc6-git3/net/Kconfig 2005-08-07 11:18:56.000000000 -0700 +++ linux-2.6.13-rc6-git3.patched/net/Kconfig 2005-08-11 21:41:53.000000000 -0700 @@ -59,6 +59,22 @@ endif # if INET +config TCP_OFFLOAD + bool "TCP Offload support" + depends on NET + default y + help + TOE (TCP Offload Engine) places the TCP protocol stack in hardware + to reduce the burden on the host processor, thereby freeing up CPU + cycles to improve application performance and sustain high-speed + data transmission. + + You should say Y here if you have an Ethernet device with TOE + hardware, such as Chelsio's 10Gb T110/T210/T204 cards. + + If you don't have TOE hardware, saying Y here will not change the + way TCP works; all packets will go through the host TCP stack. + menuconfig NETFILTER bool "Network packet filtering (replaces ipchains)" ---help---