LWN.net Logo

Asynchronous IPI and e1000 Multiple Queues

From:  "cramerj" <cramerj@intel.com>
To:  <linux-net@vger.kernel.org>, <linux-netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>
Subject:  [RFC] Asynchronous IPI and e1000 Multiple Queues
Date:  Fri, 23 Sep 2005 15:25:12 -0700

With our latest submittal of e1000 patches, we introduced code to enable
multiple transmit and receive queues for the 82571 adapter.  All of the
code is wrapped with CONFIG_E1000_MQ with the intention that it not be
enabled until the patchset within this email is reviewed and (in some
form) released.  So we'd like to gather some feedback on this patchset
and get an idea if this is the correct approach.

Multiple queues serve a couple of purposes (probably more): Receive-Side
Scaling - Share the interrupt processing across multiple CPUs.  We've
got hyper-threaded/multi-core processors, let's use them; Priority
Queuing (e.g., TOS) - Queue 0 transmits X more/less packets than queue 1
due to <insert arbitration scheme here>.  The single-queue (qdisc)
implementation for transmits doesn't make multiple Tx queues all
that exciting, and it means the arbitration scheme resides in the
driver, but it's possible that could change over time.  So most benefits
of multiple queues are seen on receives.  NAPI helps this effort (with
per-CPU processing), but this means netif_rx_schedule is CPU-bound.  So
we needed a way to schedule receive processing per-CPU context.  The one
way we came up with was designing a new asynchronous IPI vector.  The
helper function is exported to drivers to queue up the work, then inform
the other CPUs of this pending work.

In smp_call_async.2.6.13.patch, we create an asynchronous IPI with an
associated queue.  Drivers fill out the call_async_data_struct and call
the "smp_call_function"-like routine smp_call_async_mask.  If the mask
contains the currently running CPU, the routine specified in the data
struct is called directly on that CPU.  If any other CPUs remain in the
mask, the task is added to the call_async_queue and an IPI is sent to
those CPUs.  The async interrupt handler then processes each task in the
queue that targets the CPU it runs on.

Each CPU can now take care of its own work (essentially calling
netif_rx_schedule) without messy locks around the NAPI threads.

In e1000_mq_Kconfig.patch, we simply add the option to enable multiple
queues during kernel configuration.

Is this the right approach?  Any input, fixes and testing would be
greatly appreciated.

Thanks,
-Jeb

 <<e1000_mq_Kconfig.patch>>  <<smp_call_async.2.6.13.patch>> 

diff -u
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -1841,6 +1841,23 @@
 
 	  If in doubt, say N.
 
+config E1000_MQ
+	bool "Enable Multiple Transmit/Receive Queues (EXPERIMENTAL)"
+	depends on E1000_NAPI && EXPERIMENTAL
+	help
+	  Starting with the 82571 (PCI-Express) network adapter, the option
+	  to specify multiple queues has been supported.  Multiple queues
+	  allow for Receive-Side Scaling, priority queues, etc. all within
+	  a single network controller.  The implementation is such that each
+	  receive queue is mapped to a logical processor; and each packet
+	  queued for transmission is assigned to the appropriate transmit
+	  queue based on a bit in its socket address.
+
+	  This feature, however, requires a kernel patch for asynchronous
+	  IPI calls.  Without that patch, compilation of the driver will fail.
+
+	  If in doubt, say N.
+
diff -u
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -629,3 +629,83 @@
 	}
 }
 
+LIST_HEAD(call_async_queue);
+rwlock_t call_async_queue_lock = RW_LOCK_UNLOCKED;
+
+/*
+ * Run func(info) on each CPU in cpumask without waiting for the remote CPUs.
+ * Caller must set func, info, cpumask and count.  Returns 0, or -EINVAL if
+ * cpumask is not a subset of cpu_online_map.
+ */
+int smp_call_async_mask(struct call_async_data_struct *call_data)
+{
+	unsigned long flags;
+	int ret = 0, this_cpu = get_cpu();
+
+	/* reject offline CPUs; all exits go via "out" to undo get_cpu() */
+	if (unlikely(!cpus_subset(call_data->cpumask, cpu_online_map))) {
+		ret = -EINVAL;
+		goto out;
+	}
+	/* run the local CPU's share directly instead of queueing it */
+	if (cpu_isset(this_cpu, call_data->cpumask)) {
+		if (call_data->func)
+			call_data->func(call_data->info);
+		cpu_clear(this_cpu, call_data->cpumask);
+		atomic_dec(&call_data->count);
+	}
+	if (unlikely(cpus_empty(call_data->cpumask)))
+		goto out;
+
+	write_lock_irqsave(&call_async_queue_lock, flags);
+	list_add_tail(&call_data->node, &call_async_queue);
+	write_unlock_irqrestore(&call_async_queue_lock, flags);
+	mb();	/* publish the queue entry before raising the IPI */
+	send_IPI_mask(call_data->cpumask, CALL_ASYNC_VECTOR);
+out:
+	put_cpu_no_resched();
+	return ret;
+}
+EXPORT_SYMBOL(smp_call_async_mask);
+
+/* IPI handler: runs each queued item whose cpumask includes this CPU. */
+/* Locks use irqsave so unlocking never re-enables interrupts in here. */
+fastcall void smp_call_async_interrupt(void)
+{
+	struct call_async_data_struct *call_data = NULL, *last_entry = NULL;
+	unsigned long flags;
+	int this_cpu;
+
+	ack_APIC_irq();
+	irq_enter();
+	this_cpu = smp_processor_id();
+	call_data = list_prepare_entry(call_data, &call_async_queue, node);
+	do {
+		/* find the next work item on the list for this CPU */
+		read_lock_irqsave(&call_async_queue_lock, flags);
+		list_for_each_entry_continue(call_data, &call_async_queue, node) {
+			if (cpu_isset(this_cpu, call_data->cpumask))
+				break;
+		}
+		if (&call_data->node == &call_async_queue)
+			call_data = NULL;
+		read_unlock_irqrestore(&call_async_queue_lock, flags);
+
+		/* retire the previous item only after the walk resumed from it */
+		if (last_entry) {
+			cpu_clear(this_cpu, last_entry->cpumask);
+			mb();
+			if (atomic_dec_and_test(&last_entry->count)) {
+				write_lock_irqsave(&call_async_queue_lock, flags);
+				list_del(&last_entry->node);
+				write_unlock_irqrestore(&call_async_queue_lock, flags);
+			}
+		}
+
+		if (call_data && call_data->func)
+			call_data->func(call_data->info);
+		last_entry = call_data;
+	} while (call_data);
+
+	irq_exit();
+}
diff -u
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -1399,4 +1399,6 @@
 
 	/* IPI for generic function call */
 	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+
+	set_intr_gate(CALL_ASYNC_VECTOR, call_async_interrupt);
 }
diff -u
--- a/include/asm-i386/hw_irq.h
+++ b/include/asm-i386/hw_irq.h
@@ -35,6 +35,7 @@
 fastcall void reschedule_interrupt(void);
 fastcall void invalidate_interrupt(void);
 fastcall void call_function_interrupt(void);
+fastcall void call_async_interrupt(void);
 #endif
 
 #ifdef CONFIG_X86_LOCAL_APIC
diff -u
--- a/include/asm-i386/mach-default/entry_arch.h
+++ b/include/asm-i386/mach-default/entry_arch.h
@@ -13,6 +13,7 @@
 BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
 BUILD_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
 BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
+BUILD_INTERRUPT(call_async_interrupt,CALL_ASYNC_VECTOR)
 #endif
 
 /*
diff -u
--- a/include/asm-i386/mach-default/irq_vectors.h
+++ b/include/asm-i386/mach-default/irq_vectors.h
@@ -48,6 +48,7 @@
 #define INVALIDATE_TLB_VECTOR	0xfd
 #define RESCHEDULE_VECTOR	0xfc
 #define CALL_FUNCTION_VECTOR	0xfb
+#define CALL_ASYNC_VECTOR	0xfa
 
 #define THERMAL_APIC_VECTOR	0xf0
 /*
diff -u
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -16,8 +16,11 @@
 #include <linux/kernel.h>
 #include <linux/compiler.h>
 #include <linux/thread_info.h>
+#include <linux/list.h>
+#include <linux/cpumask.h>
 #include <asm/smp.h>
 #include <asm/bug.h>
+#include <asm/atomic.h>
 
 /*
  * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc.
@@ -71,6 +74,16 @@
 	return ret;
 }
 
+struct call_async_data_struct {	/* one queued asynchronous cross-CPU call */
+	void (*func) (void *info);	/* callback invoked on each target CPU */
+	void *info;			/* argument handed to func */
+	cpumask_t cpumask;		/* CPUs that have not yet run func */
+	atomic_t count;			/* pending CPUs; entry unlinked at zero */
+	struct list_head node;		/* link on call_async_queue */
+};
+
+extern int smp_call_async_mask(struct call_async_data_struct *call_data);
+
 #define MSG_ALL_BUT_SELF	0x8000	/* Assume <32768 CPU's */
 #define MSG_ALL			0x8001
 

Copyright © 2005, Eklektix, Inc.
Comments and public postings are copyrighted by their creators.
Linux is a registered trademark of Linus Torvalds
Powered by Rackspace Managed Hosting.