| From: |
| Arthur Kepner <akepner@sgi.com> |
| To: |
| netdev@oss.sgi.com |
| Subject: |
| [RFC/PATCH] "safer ipv4 reassembly" (fwd) |
| Date: |
| Thu, 23 Jun 2005 09:33:35 -0700 (PDT) |
| Cc: |
| Rick Jones <rick.jones2@hp.com>, Herbert Xu <herbert@gondor.apana.org.au> |
What with the recent migration to vger.kernel.org, I'm
forwarding this to oss.sgi.com, just in case any interested
parties missed it.
---------- Forwarded message ----------
Date: Wed, 22 Jun 2005 16:00:55 -0700 (PDT)
From: Arthur Kepner <akepner@sgi.com>
To: netdev@vger.kernel.org
Subject: [RFC/PATCH] "safer ipv4 reassembly"
A little more than a month ago I sent an RFC/PATCH for
something I called "strict ipv4 reassembly". This was
an attempt to make it much less likely that IP fragments
from different IP datagrams were reassembled together
when the IP id wraps. That patch was considered
unacceptable because it required fragments to arrive in
order or they'd be dropped.
One idea that resulted from that thread was to keep a
count of IP datagrams for a (src,dst,proto) and use that
as a kind of sequence number to check that a fragment is
valid. (I believe that Rick Jones and Herbert Xu each
independently came up with this idea, or something very
close to it.)
Following is a patch which implements that idea.
A new sysctl "sysctl_ip_reassembly_count" is used to control
how much reordering of IP fragments we'll tolerate. If it's
zero, the patch is a no-op. If sysctl_ip_reassembly_count is
non-zero, it defines a "window size" for IP fragments. When
a new fragment queue is made, the "bottom" of the window is
defined by the number of IP packets which have been received
for the associated (src,dst,proto), and each time a fragment
is added to the queue, the bottom of the window is advanced.
But before adding a fragment to the queue, a check is made
that the number of IP fragments in the queue falls within the
window. If not, the queue is dropped.
Comments?
include/linux/sysctl.h | 1
include/net/ip.h | 1
net/ipv4/ip_fragment.c | 206 +++++++++++++++++++++++++++++++++++++++++++++
net/ipv4/ip_input.c | 24 ++++-
net/ipv4/sysctl_net_ipv4.c | 11 ++
5 files changed, 240 insertions(+), 3 deletions(-)
Signed-off-by: Arthur Kepner <akepner@sgi.com>
diff -rup linux.orig/include/linux/sysctl.h linux.new/include/linux/sysctl.h
--- linux.orig/include/linux/sysctl.h 2005-06-14 11:35:18.611069887 -0700
+++ linux.new/include/linux/sysctl.h 2005-06-22 14:04:17.384853993 -0700
@@ -347,6 +347,7 @@ enum
NET_TCP_MODERATE_RCVBUF=106,
NET_TCP_TSO_WIN_DIVISOR=107,
NET_TCP_BIC_BETA=108,
+ NET_IPV4_REASM_COUNT=109,
};
enum {
diff -rup linux.orig/include/net/ip.h linux.new/include/net/ip.h
--- linux.orig/include/net/ip.h 2005-06-14 11:52:09.878700520 -0700
+++ linux.new/include/net/ip.h 2005-06-22 14:04:33.508057469 -0700
@@ -300,6 +300,7 @@ enum ip_defrag_users
};
struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user);
+void ip_count(u32 saddr, u32 daddr, u8 protocol);
extern int ip_frag_nqueues;
extern atomic_t ip_frag_mem;
diff -rup linux.orig/net/ipv4/ip_fragment.c linux.new/net/ipv4/ip_fragment.c
--- linux.orig/net/ipv4/ip_fragment.c 2005-06-13 16:49:55.290992303 -0700
+++ linux.new/net/ipv4/ip_fragment.c 2005-06-22 14:17:54.136940893 -0700
@@ -56,6 +56,8 @@
int sysctl_ipfrag_high_thresh = 256*1024;
int sysctl_ipfrag_low_thresh = 192*1024;
+extern int sysctl_ip_reassembly_count;
+
/* Important NOTE! Fragment queue must be destroyed before MSL expires.
* RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL.
*/
@@ -69,6 +71,25 @@ struct ipfrag_skb_cb
#define FRAG_CB(skb) ((struct ipfrag_skb_cb*)((skb)->cb))
+/* struct ipc counts the IP datagrams received for a
+ * (saddr, daddr, protocol) tuple. An entry is created by
+ * ipc_find() when a fragment queue is created for that 3-tuple
+ * while sysctl_ip_reassembly_count is non-zero. It is not freed
+ * the moment its last queue goes away: ipq_kill() drops the
+ * queue's reference and (re)arms the timer, and ipc_destroy()
+ * unlinks the entry only if refcnt is zero when the timer fires.
+ */
+struct ipc {
+ struct hlist_node node; /* chain in ipc_hash[].head */
+ u32 saddr;
+ u32 daddr;
+ u8 protocol;
+ atomic_t refcnt; /* how many ipqs hold refs to us */
+ atomic_t seq; /* how many ip datagrams for this
+ * (saddr,daddr,protocol) since we
+ * were created; bumped by ip_count() */
+ struct timer_list timer; /* runs ipc_destroy(); armed by ipq_kill() */
+ struct rcu_head rcu; /* used by call_rcu() to defer the kfree() */
+};
+
/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
struct ipq *next; /* linked list pointers */
@@ -92,6 +113,14 @@ struct ipq {
struct ipq **pprev;
int iif;
struct timeval stamp;
+ struct ipc *ipc;
+ atomic_t seq;
+ /* ipq->seq defines the "bottom" of the window of sequence numbers
+ * that are valid for this fragment - the "top" of the window is
+ * (ipq->seq + sysctl_ip_reassembly_count). ipq->seq is initialized
+ * to the value in the associated ipc when the fragment queue is
+ * created, and incremented each time a fragment is added to the
+ * queue */
};
/* Hash table. */
@@ -105,6 +134,12 @@ static u32 ipfrag_hash_rnd;
static LIST_HEAD(ipq_lru_list);
int ip_frag_nqueues = 0;
+/* Hash table of struct ipc entries, indexed by ipchashfn(). Each
+ * bucket carries its own spinlock, which ipc_find() and ipc_destroy()
+ * take around list updates; ip_count() walks a bucket under
+ * rcu_read_lock() only. The buckets are set up in ipfrag_init().
+ */
+#define IPC_HASHSZ IPQ_HASHSZ
+static struct {
+ struct hlist_head head;
+ spinlock_t lock;
+} ipc_hash[IPC_HASHSZ];
+
static __inline__ void __ipq_unlink(struct ipq *qp)
{
if(qp->next)
@@ -121,6 +156,11 @@ static __inline__ void ipq_unlink(struct
write_unlock(&ipfrag_lock);
}
+/*
+ * Map a (saddr, daddr, protocol) tuple to a bucket index in ipc_hash[].
+ * The jhash initval is the constant 0. The mask assumes IPC_HASHSZ is a
+ * power of two - TODO confirm against IPQ_HASHSZ.
+ */
+static unsigned int ipchashfn(u32 saddr, u32 daddr, u8 prot)
+{
+	const u32 mixed = jhash_3words(prot, saddr, daddr, 0);
+
+	return mixed & (IPC_HASHSZ - 1);
+}
+
static unsigned int ipqhashfn(u16 id, u32 saddr, u32 daddr, u8 prot)
{
return jhash_3words((u32)id << 16 | prot, saddr, daddr,
@@ -231,8 +271,16 @@ static __inline__ void ipq_put(struct ip
*/
static void ipq_kill(struct ipq *ipq)
{
+ struct ipc *cp = ipq->ipc;
+
if (del_timer(&ipq->timer))
atomic_dec(&ipq->refcnt);
+ if (cp) {
+ atomic_dec(&cp->refcnt);
+ /* no particular reason to use sysctl_ipfrag_time
+ * for this timer */
+ mod_timer(&cp->timer, jiffies + sysctl_ipfrag_time);
+ }
if (!(ipq->last_in & COMPLETE)) {
ipq_unlink(ipq);
@@ -348,10 +396,109 @@ static struct ipq *ip_frag_intern(unsign
return qp;
}
+/* RCU callback: free the struct ipc that embeds the given rcu_head. */
+static inline void __ipc_destroy(struct rcu_head *head)
+{
+	struct ipc *cp = container_of(head, struct ipc, rcu);
+
+	kfree(cp);
+}
+
+/*
+ * Timer handler for a struct ipc ('arg' is the struct's address).
+ * Under the bucket lock: if no fragment queue references the entry any
+ * more, unlink it from its hash chain and hand it to call_rcu() for
+ * freeing. An entry that is still referenced is left untouched.
+ */
+static void ipc_destroy(unsigned long arg)
+{
+	struct ipc *cp = (struct ipc *) arg;
+	unsigned int hash = ipchashfn(cp->saddr, cp->daddr, cp->protocol);
+	spinlock_t *bucket_lock = &ipc_hash[hash].lock;
+
+	spin_lock(bucket_lock);
+	BUG_ON(atomic_read(&cp->refcnt) < 0);
+	if (!atomic_read(&cp->refcnt)) {
+		hlist_del_rcu(&cp->node);
+		call_rcu(&cp->rcu, __ipc_destroy);
+	}
+	spin_unlock(bucket_lock);
+}
+
+/*
+ * Allocate a counter entry for iph's (saddr, daddr, protocol) and link
+ * it into bucket 'hash' of ipc_hash[].
+ *
+ * The caller must hold ipc_hash[hash].lock. Returns the new entry with
+ * refcnt 1 and seq 0, or NULL if the GFP_ATOMIC allocation fails. The
+ * timer is initialised here but not armed.
+ */
+static inline struct ipc *__ipc_create(struct iphdr *iph,
+				       const unsigned int hash)
+{
+	struct ipc *cp = kmalloc(sizeof(*cp), GFP_ATOMIC);
+
+	/* XXX should we account size to ip_frag_mem ??? */
+	if (cp) {
+		cp->saddr = iph->saddr;
+		cp->daddr = iph->daddr;
+		cp->protocol = iph->protocol;
+		atomic_set(&cp->seq, 0);
+		atomic_set(&cp->refcnt, 1);
+		init_timer(&cp->timer);
+		cp->timer.data = (unsigned long) cp;
+		cp->timer.function = ipc_destroy;
+		INIT_HLIST_NODE(&cp->node);
+		/*
+		 * Publish last: RCU readers can reach the entry as soon
+		 * as it is on the list, so every field - the timer
+		 * included - must be set up before this point.
+		 */
+		hlist_add_head_rcu(&cp->node, &ipc_hash[hash].head);
+	} else {
+		NETDEBUG(if (net_ratelimit())
+			printk(KERN_ERR "__ipc_create: no memory left !\n"));
+	}
+	return cp;
+}
+
+/*
+ * Look up the struct ipc for (saddr, daddr, protocol) in bucket 'hash'
+ * of ipc_hash[]. Returns the entry, or NULL if there is none. No
+ * reference is taken on the returned entry.
+ *
+ * must be "rcu safe" when __ipc_find is called - either use
+ * rcu_read_lock (if you intend only to read the returned struct)
+ * or grab the spinlock for the appropriate hash list head (if
+ * you might modify the returned struct)
+ */
+static inline struct ipc *__ipc_find(u32 saddr, u32 daddr, u8 protocol,
+				     const unsigned int hash)
+{
+	struct hlist_node *p;
+
+	hlist_for_each_rcu(p, &ipc_hash[hash].head) {
+		/* hlist_entry() rather than a raw cast, so the lookup
+		 * does not depend on where 'node' sits in struct ipc */
+		struct ipc *cp = hlist_entry(p, struct ipc, node);
+
+		if (cp->saddr == saddr &&
+		    cp->daddr == daddr &&
+		    cp->protocol == protocol)
+			return cp;
+	}
+	return NULL;
+}
+
+/*
+ * Return the struct ipc for iph's (saddr, daddr, protocol), creating it
+ * if none exists. On success the caller owns one reference (refcnt has
+ * been incremented, or is 1 for a fresh entry). Returns NULL only when
+ * a new entry was needed and could not be allocated.
+ *
+ * The lookup and the refcnt increment are both done under the bucket
+ * lock. ipc_destroy() tests refcnt == 0 and unlinks under that same
+ * lock, so taking a reference under rcu_read_lock() alone could hand
+ * out an entry that is already queued for kfree().
+ *
+ * NOTE(review): ip_frag_create() appears not to drop this reference
+ * when frag_alloc_queue() fails - confirm against its out_nomem path.
+ */
+static struct ipc *ipc_find(struct iphdr *iph)
+{
+	struct ipc *cp;
+	unsigned int hash = ipchashfn(iph->saddr, iph->daddr, iph->protocol);
+
+	spin_lock(&ipc_hash[hash].lock);
+	cp = __ipc_find(iph->saddr, iph->daddr, iph->protocol, hash);
+	if (cp != NULL)
+		atomic_inc(&cp->refcnt);
+	else
+		cp = __ipc_create(iph, hash);
+	spin_unlock(&ipc_hash[hash].lock);
+	return cp;
+}
+
+
/* Add an entry to the 'ipq' queue for a newly received IP datagram. */
static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
{
struct ipq *qp;
+ struct ipc *cp = NULL;
+
+ if (sysctl_ip_reassembly_count && (cp = ipc_find(iph)) == NULL)
+ return NULL;
if ((qp = frag_alloc_queue()) == NULL)
goto out_nomem;
@@ -366,6 +513,10 @@ static struct ipq *ip_frag_create(unsign
qp->meat = 0;
qp->fragments = NULL;
qp->iif = 0;
+ qp->ipc = cp;
+ if (sysctl_ip_reassembly_count && cp) {
+ atomic_set(&qp->seq, atomic_read(&cp->seq));
+ }
/* Initialize a timer for this entry. */
init_timer(&qp->timer);
@@ -381,6 +532,51 @@ out_nomem:
return NULL;
}
+/*
+ * Account one received IP datagram for (saddr, daddr, protocol): if a
+ * struct ipc exists for the tuple, increment its seq counter. Nothing
+ * happens when there is no entry. The lookup runs under
+ * rcu_read_lock() and takes no reference.
+ */
+void ip_count(u32 saddr, u32 daddr, u8 protocol)
+{
+	const unsigned int bucket = ipchashfn(saddr, daddr, protocol);
+	struct ipc *entry;
+
+	rcu_read_lock();
+	entry = __ipc_find(saddr, daddr, protocol, bucket);
+	if (entry != NULL)
+		atomic_inc(&entry->seq);
+	rcu_read_unlock();
+}
+
+/*
+ * Return 1 if 'seq' lies in the half-open window [bottom, bottom + size),
+ * else 0. An empty window (size <= 0) contains nothing.
+ *
+ * The distance is computed in unsigned arithmetic so that the test stays
+ * well defined, and still correct, when the counters wrap around; the
+ * equivalent signed subtraction would overflow.
+ */
+static inline int in_window(int bottom, int size, int seq)
+{
+	unsigned int delta = (unsigned int)seq - (unsigned int)bottom;
+
+	return size > 0 && delta < (unsigned int)size;
+}
+
+/*
+ * Check whether the datagram count of qp's (saddr, daddr, protocol)
+ * tuple still lies inside qp's window. Every call advances the bottom
+ * of the window (qp->seq) by one before the comparison.
+ *
+ * Called from ip_find() with ipfrag_lock read-held. Returns 0 if the
+ * queue may be used (also when qp has no ipc attached). Returns 1 if
+ * the count is outside the window: the queue has then been killed
+ * (unless it was already COMPLETE), REASMFAILS has been bumped, and
+ * ipfrag_lock has been dropped and re-taken, so the caller must stop
+ * walking the hash chain. The iph argument is not used.
+ */
+static int __ip_reassembly_count_check(const struct iphdr *iph, struct ipq *qp)
+{
+ struct ipc *cp = qp->ipc;
+ int cseq, qseq;
+
+ /* qp->ipc may be NULL if sysctl_ip_reassembly_count was off
+ * at the time the fragment queue was created */
+ if (cp == NULL)
+ return 0;
+
+ cseq = atomic_read(&cp->seq); /* datagrams seen for the tuple */
+ qseq = atomic_inc_return(&qp->seq); /* new bottom of the window */
+
+ if (!in_window(qseq, sysctl_ip_reassembly_count, cseq)) {
+ /* hold a reference on qp while ipfrag_lock is released */
+ atomic_inc(&qp->refcnt);
+ /* ipq_kill() -> ipq_unlink() takes ipfrag_lock for writing,
+ * so the read lock has to be given up first */
+ read_unlock(&ipfrag_lock);
+ spin_lock(&qp->lock);
+ if (!(qp->last_in&COMPLETE))
+ ipq_kill(qp);
+ spin_unlock(&qp->lock);
+ ipq_put(qp, NULL);
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+ /* re-take the lock so the caller's read_unlock() balances */
+ read_lock(&ipfrag_lock);
+ return 1;
+ }
+ return 0;
+}
+
+
/* Find the correct entry in the "incomplete datagrams" queue for
* this IP datagram, and create new one, if nothing is found.
*/
@@ -400,6 +596,10 @@ static inline struct ipq *ip_find(struct
qp->daddr == daddr &&
qp->protocol == protocol &&
qp->user == user) {
+ if (sysctl_ip_reassembly_count &&
+ __ip_reassembly_count_check(iph, qp)) {
+ break;
+ }
atomic_inc(&qp->refcnt);
read_unlock(&ipfrag_lock);
return qp;
@@ -679,9 +879,15 @@ struct sk_buff *ip_defrag(struct sk_buff
void ipfrag_init(void)
{
+ int i;
ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
(jiffies ^ (jiffies >> 6)));
+ for (i = 0; i < IPC_HASHSZ; i++ ) {
+ INIT_HLIST_HEAD(&ipc_hash[i].head);
+ spin_lock_init(&ipc_hash[i].lock);
+ }
+
init_timer(&ipfrag_secret_timer);
ipfrag_secret_timer.function = ipfrag_secret_rebuild;
ipfrag_secret_timer.expires = jiffies + sysctl_ipfrag_secret_interval;
diff -rup linux.orig/net/ipv4/ip_input.c linux.new/net/ipv4/ip_input.c
--- linux.orig/net/ipv4/ip_input.c 2005-06-13 16:23:41.824620856 -0700
+++ linux.new/net/ipv4/ip_input.c 2005-06-22 14:02:35.705155734 -0700
@@ -146,6 +146,14 @@
#include <linux/mroute.h>
#include <linux/netlink.h>
+/*
+ * A non-zero value for sysctl_ip_reassembly_count defines the
+ * size of the window of ip fragments that are considered valid.
+ * This is useful for preventing reassembly of fragments from
+ * different IP datagrams when the 16-bit IP id wraps.
+ * A value of zero means the window is unlimited.
+ */
+int sysctl_ip_reassembly_count = 0;
/*
* SNMP management statistics
*/
@@ -286,13 +294,17 @@ static inline int ip_rcv_finish(struct s
{
struct net_device *dev = skb->dev;
struct iphdr *iph = skb->nh.iph;
+ __u32 saddr = iph->saddr;
+ __u32 daddr = iph->daddr;
+ __u8 proto = iph->protocol;
+ int ret;
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
if (skb->dst == NULL) {
- if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
+ if (ip_route_input(skb, daddr, saddr, iph->tos, dev))
goto drop;
}
@@ -334,7 +346,7 @@ static inline int ip_rcv_finish(struct s
if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n",
- NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+ NIPQUAD(saddr), NIPQUAD(iph->daddr));
in_dev_put(in_dev);
goto drop;
}
@@ -345,7 +357,13 @@ static inline int ip_rcv_finish(struct s
}
}
- return dst_input(skb);
+ ret = dst_input(skb);
+
+ if (sysctl_ip_reassembly_count) {
+ ip_count(saddr, daddr, proto);
+ }
+
+ return ret;
inhdr_error:
IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
diff -rup linux.orig/net/ipv4/sysctl_net_ipv4.c linux.new/net/ipv4/sysctl_net_ipv4.c
--- linux.orig/net/ipv4/sysctl_net_ipv4.c 2005-06-14 11:36:29.923218508 -0700
+++ linux.new/net/ipv4/sysctl_net_ipv4.c 2005-06-22 14:03:50.869948048 -0700
@@ -29,6 +29,7 @@ extern int sysctl_ipfrag_low_thresh;
extern int sysctl_ipfrag_high_thresh;
extern int sysctl_ipfrag_time;
extern int sysctl_ipfrag_secret_interval;
+extern int sysctl_ip_reassembly_count;
/* From ip_output.c */
extern int sysctl_ip_dynaddr;
@@ -49,6 +50,7 @@ extern int inet_peer_gc_mintime;
extern int inet_peer_gc_maxtime;
#ifdef CONFIG_SYSCTL
+static int zero;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -595,6 +597,15 @@ ctl_table ipv4_table[] = {
.strategy = &sysctl_jiffies
},
{
+ .ctl_name = NET_IPV4_REASM_COUNT,
+ .procname = "ip_reassembly_count",
+ .data = &sysctl_ip_reassembly_count,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &zero
+ },
+ {
.ctl_name = NET_TCP_NO_METRICS_SAVE,
.procname = "tcp_no_metrics_save",
.data = &sysctl_tcp_nometrics_save,
--
Arthur
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html