User: Password:
|
|
Subscribe / Log in / New account

Intel x86-64 support merge

From:  Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
To:  bk-commits-head@vger.kernel.org
Subject:  [PATCH] Intel x86-64 support merge
Date:  Wed, 18 Feb 2004 19:44:21 +0000

ChangeSet 1.1562, 2004/02/18 11:44:21-08:00, ak@suse.de

	[PATCH] Intel x86-64 support merge
	
	This has all the x86-64 specific changes for Intel Prescott/Nocona
	support.
	
	It requires a few minor changes outside arch/x86_64, which I am sending
	separately.
	
	This patch is needed to boot a 64-bit kernel on a 64-bit capable
	Prescott machine.
	
	The ugliest part is probably the swiotlb code.  In fact the code for
	that is not even included, but just reused from IA64.  swiotlb
	implements the PCI DMA API using bounce buffering.  I don't like this at
	all, but there was no other way to support non DAC capable hardware
	(like IDE or USB) on machines with >3GB.  Please redirect all flames for
	that to the Intel chipset designers.
	
	ChangeLog:
	- Add Kconfig options for MPSC
	- Add support to reuse microcode driver from i386 (Suresh B Siddha)
	- Try to optimize for the selected CPU
	- Fix early CPUID check for Intel CPUs (Suresh B Siddha)
	- Fix GDT to use the configured cache line size for padding
	- Support monitor/mwait idle loop
	- Support HyperThreading
	- Support Intel CPUID flags
	- Remove all 3dnow prefetches
	- Add alternative() for the prefetchw prefetch inline.
	- Include P4 driver in oprofile
	- Support Intel NOPs in alternative


# This patch includes the following deltas:
#	           ChangeSet	1.1561  -> 1.1562 
#	include/asm-x86_64/smp.h	1.15    -> 1.16   
#	arch/x86_64/kernel/setup.c	1.26    -> 1.27   
#	arch/x86_64/kernel/x8664_ksyms.c	1.21    -> 1.22   
#	arch/x86_64/mm/init.c	1.23    -> 1.24   
#	arch/x86_64/oprofile/Makefile	1.4     -> 1.5    
#	arch/x86_64/lib/copy_page.S	1.5     -> 1.6    
#	include/asm-x86_64/pci.h	1.14    -> 1.15   
#	arch/x86_64/kernel/pci-gart.c	1.28    -> 1.29   
#	include/asm-x86_64/cpufeature.h	1.5     -> 1.6    
#	arch/x86_64/kernel/process.c	1.23    -> 1.24   
#	arch/x86_64/kernel/head.S	1.14    -> 1.15   
#	arch/x86_64/kernel/Makefile	1.28    -> 1.29   
#	arch/x86_64/kernel/aperture.c	1.6     -> 1.7    
#	 arch/x86_64/Kconfig	1.40    -> 1.41   
#	arch/x86_64/lib/csum-copy.S	1.4     -> 1.5    
#	include/asm-x86_64/system.h	1.18    -> 1.19   
#	arch/x86_64/Makefile	1.33    -> 1.34   
#	include/asm-x86_64/proto.h	1.16    -> 1.17   
#	include/asm-x86_64/msr.h	1.6     -> 1.7    
#	include/asm-x86_64/segment.h	1.8     -> 1.9    
#	arch/x86_64/boot/setup.S	1.6     -> 1.7    
#	include/asm-x86_64/processor.h	1.26    -> 1.27   
#	arch/x86_64/kernel/smpboot.c	1.21    -> 1.22   
#

 arch/x86_64/Kconfig              |   37 ++++++
 arch/x86_64/Makefile             |    4 
 arch/x86_64/boot/setup.S         |   22 ++-
 arch/x86_64/kernel/Makefile      |    5 
 arch/x86_64/kernel/aperture.c    |    4 
 arch/x86_64/kernel/head.S        |   19 +--
 arch/x86_64/kernel/pci-gart.c    |   32 +++++
 arch/x86_64/kernel/process.c     |   46 ++++++++
 arch/x86_64/kernel/setup.c       |  217 ++++++++++++++++++++++++++++++++++++++-
 arch/x86_64/kernel/smpboot.c     |   34 ++++++
 arch/x86_64/kernel/x8664_ksyms.c |    2 
 arch/x86_64/lib/copy_page.S      |    9 -
 arch/x86_64/lib/csum-copy.S      |   13 --
 arch/x86_64/mm/init.c            |    7 +
 arch/x86_64/oprofile/Makefile    |    6 -
 include/asm-x86_64/cpufeature.h  |    9 +
 include/asm-x86_64/msr.h         |  122 +++++++++++++++++++++
 include/asm-x86_64/pci.h         |   32 +++++
 include/asm-x86_64/processor.h   |  121 +++++++++++++++------
 include/asm-x86_64/proto.h       |    5 
 include/asm-x86_64/segment.h     |    4 
 include/asm-x86_64/smp.h         |    3 
 include/asm-x86_64/system.h      |   50 ++++++++
 23 files changed, 712 insertions(+), 91 deletions(-)


diff -Nru a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
--- a/arch/x86_64/Kconfig	Wed Feb 18 12:06:21 2004
+++ b/arch/x86_64/Kconfig	Wed Feb 18 12:06:21 2004
@@ -89,6 +89,9 @@
 	help
 	  Optimize for AMD Opteron/Athlon64/Hammer/K8 CPUs. 
 
+config MPSC
+       bool "Prescott/Nocona" 
+       
 config GENERIC_CPU
 	bool "Generic-x86-64"
 	help
@@ -101,11 +104,13 @@
 #
 config X86_L1_CACHE_BYTES
 	int
-	default "64"
+	default "128" if GENERIC_CPU || MPSC
+	default "64" if MK8
 
 config X86_L1_CACHE_SHIFT
 	int
-	default "6"
+	default "7" if GENERIC_CPU || MPSC
+	default "6" if MK8
 
 config X86_TSC
 	bool
@@ -115,6 +120,23 @@
 	bool
 	default y
 
+config MICROCODE
+	tristate "/dev/cpu/microcode - Intel CPU microcode support"
+	---help---
+	  If you say Y here the 'File systems' section, you will be
+	  able to update the microcode on Intel processors. You will 
+	  obviously need the actual microcode binary data itself which is 
+	  not shipped with the Linux kernel.
+
+	  For latest news and information on obtaining all the required
+	  ingredients for this driver, check:
+	  <http://www.urbanmyth.org/microcode/>.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called microcode.
+	  If you use modprobe or kmod you may also want to add the line
+	  'alias char-major-10-184 microcode' to your /etc/modules.conf file.
+
 config X86_MSR
 	tristate "/dev/cpu/*/msr - Model-specific register support"
 	help
@@ -132,6 +154,11 @@
 	  with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
 	  /dev/cpu/31/cpuid.
 
+config X86_HT
+	bool
+	depends on SMP
+	default y
+       
 config MATH_EMULATION
 	bool
 
@@ -256,9 +283,13 @@
 	  Normally the kernel will take the right choice by itself.
 	  If unsure say Y 
 
+config SWIOTLB
+       select GART_IOMMU
+       bool "Software IOTLB support"
+
 config DUMMY_IOMMU
 	bool
-	depends on !GART_IOMMU
+	depends on !GART_IOMMU && !SWIOTLB
 	default y
 	help
 	  Don't use IOMMU code. This will cause problems when you have more than 4GB 
diff -Nru a/arch/x86_64/Makefile b/arch/x86_64/Makefile
--- a/arch/x86_64/Makefile	Wed Feb 18 12:06:21 2004
+++ b/arch/x86_64/Makefile	Wed Feb 18 12:06:21 2004
@@ -39,6 +39,10 @@
 
 check_gcc = $(shell if $(CC) $(1) -S -o /dev/null -xc /dev/null > /dev/null 2>&1 ; then echo "$(1)"; else echo "$(2)"; fi)
 
+cflags-$(CONFIG_MK8) += $(call check_gcc,-march=k8,)
+cflags-$(CONFIG_MPSC) += $(call check_gcc,-march=pentium4,)
+CFLAGS += $(cflags-y)
+
 CFLAGS += -mno-red-zone
 CFLAGS += -mcmodel=kernel
 CFLAGS += -pipe
diff -Nru a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S
--- a/arch/x86_64/boot/setup.S	Wed Feb 18 12:06:21 2004
+++ b/arch/x86_64/boot/setup.S	Wed Feb 18 12:06:21 2004
@@ -292,8 +292,9 @@
 	/* minimum CPUID flags for x86-64 */
 	/* see http://www.x86-64.org/lists/discuss/msg02971.html */		
 #define SSE_MASK ((1<<25)|(1<<26))
-#define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|(1<<11)| \
-					   (1<<13)|(1<<15)|(1<<24)|(1<<29))
+#define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|\
+					   (1<<13)|(1<<15)|(1<<24))
+#define REQUIRED_MASK2 (1<<29)
 
 	pushfl				/* standard way to check for cpuid */
 	popl	%eax
@@ -305,10 +306,10 @@
 	popl	%eax
 	cmpl	%eax,%ebx
 	jz	no_longmode		/* cpu has no cpuid */
-	movl	$0x80000000,%eax
+	movl	$0x0,%eax
 	cpuid
-	cmpl	$0x80000001,%eax
-	jb	no_longmode		/* no extended cpuid */
+	cmpl	$0x1,%eax
+	jb	no_longmode		/* no cpuid 1 */
 	xor	%di,%di
 	cmpl	$0x68747541,%ebx	/* AuthenticAMD */
 	jnz	noamd
@@ -318,11 +319,20 @@
 	jnz	noamd
 	mov	$1,%di			/* cpu is from AMD */
 noamd:		
-	movl	$0x80000001,%eax		
+	movl    $0x1,%eax
 	cpuid
 	andl	$REQUIRED_MASK1,%edx
 	xorl	$REQUIRED_MASK1,%edx
 	jnz	no_longmode
+	movl    $0x80000000,%eax
+	cpuid
+	cmpl    $0x80000001,%eax
+	jb      no_longmode             /* no extended cpuid */
+	movl    $0x80000001,%eax
+	cpuid
+	andl    $REQUIRED_MASK2,%edx
+	xorl    $REQUIRED_MASK2,%edx
+	jnz     no_longmode
 sse_test:		
 	movl	$1,%eax
 	cpuid
diff -Nru a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
--- a/arch/x86_64/kernel/Makefile	Wed Feb 18 12:06:21 2004
+++ b/arch/x86_64/kernel/Makefile	Wed Feb 18 12:06:21 2004
@@ -12,6 +12,7 @@
 obj-$(CONFIG_MTRR)		+= ../../i386/kernel/cpu/mtrr/
 obj-$(CONFIG_ACPI)		+= acpi/
 obj-$(CONFIG_X86_MSR)		+= msr.o
+obj-$(CONFIG_MICROCODE)		+= microcode.o
 obj-$(CONFIG_X86_CPUID)		+= cpuid.o
 obj-$(CONFIG_SMP)		+= smp.o smpboot.o trampoline.o
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o  nmi.o
@@ -22,6 +23,7 @@
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_GART_IOMMU)	+= pci-gart.o aperture.o
 obj-$(CONFIG_DUMMY_IOMMU)	+= pci-nommu.o pci-dma.o
+obj-$(CONFIG_SWIOTLB)		+= swiotlb.o
 
 obj-$(CONFIG_MODULES)		+= module.o
 
@@ -30,4 +32,5 @@
 bootflag-y			+= ../../i386/kernel/bootflag.o
 cpuid-$(subst m,y,$(CONFIG_X86_CPUID))  += ../../i386/kernel/cpuid.o
 topology-y                     += ../../i386/mach-default/topology.o
-
+swiotlb-$(CONFIG_SWIOTLB)      += ../../ia64/lib/swiotlb.o
+microcode-$(CONFIG_MICROCODE)  += ../../i386/kernel/microcode.o
diff -Nru a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
--- a/arch/x86_64/kernel/aperture.c	Wed Feb 18 12:06:21 2004
+++ b/arch/x86_64/kernel/aperture.c	Wed Feb 18 12:06:21 2004
@@ -24,6 +24,8 @@
 #include <asm/proto.h>
 #include <asm/pci-direct.h>
 
+int iommu_aperture;
+
 int fallback_aper_order __initdata = 1; /* 64MB */
 int fallback_aper_force __initdata = 0; 
 
@@ -205,6 +207,8 @@
 		char name[30];
 		if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) 
 			continue;	
+
+		iommu_aperture = 1;; 
 
 		aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; 
 		aper_size = (32 * 1024 * 1024) << aper_order; 
diff -Nru a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
--- a/arch/x86_64/kernel/head.S	Wed Feb 18 12:06:21 2004
+++ b/arch/x86_64/kernel/head.S	Wed Feb 18 12:06:21 2004
@@ -16,6 +16,7 @@
 #include <asm/segment.h>
 #include <asm/page.h>
 #include <asm/msr.h>
+#include <asm/cache.h>
 	
 /* we are not able to switch in one step to the final KERNEL ADRESS SPACE
  * because we need identity-mapped pages on setup so define __START_KERNEL to
@@ -322,7 +323,6 @@
 	.endr
 #endif
 
-.align 64 /* cacheline aligned */
 ENTRY(gdt_table32)
 	.quad	0x0000000000000000	/* This one is magic */
 	.quad	0x0000000000000000	/* unused */
@@ -334,7 +334,7 @@
  * Also sysret mandates a special GDT layout 
  */
 		 		
-.align 64 /* cacheline aligned, keep this synchronized with asm/desc.h */
+.align L1_CACHE_BYTES
 
 /* The TLS descriptors are currently at a different place compared to i386.
    Hopefully nobody expects them at a fixed place (Wine?) */
@@ -354,18 +354,13 @@
 	.quad	0			/* unused now */
 	.quad	0x00009a000000ffff	/* __KERNEL16_CS - 16bit PM for S3 wakeup. */
 					/* base must be patched for real base address. */
-	/* This should be a multiple of the cache line size */
 gdt_end:	
-	.globl gdt_end
-
-	/* GDTs of other CPUs */	
-#ifdef CONFIG_SMP
-	.rept NR_CPUS-1
-	.quad 0,0,0,0,0,0,0,0,0,0,0
-	.endr	
-#endif
+	/* asm/segment.h:GDT_ENTRIES must match this */	
+	/* This should be a multiple of the cache line size */
+	/* GDTs of other CPUs: */	
+	.fill (L1_CACHE_BYTES * NR_CPUS) - (gdt_end - cpu_gdt_table) 
 
-	.align  64
+	.align  L1_CACHE_BYTES
 ENTRY(idt_table)	
 	.rept   256
 	.quad   0
diff -Nru a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
--- a/arch/x86_64/kernel/pci-gart.c	Wed Feb 18 12:06:21 2004
+++ b/arch/x86_64/kernel/pci-gart.c	Wed Feb 18 12:06:21 2004
@@ -354,6 +354,11 @@
 
 	BUG_ON(dir == PCI_DMA_NONE);
 
+#ifdef CONFIG_SWIOTLB
+	if (swiotlb)
+		return swiotlb_map_single(&dev->dev,addr,size,dir);
+#endif
+
 	phys_mem = virt_to_phys(addr); 
 	if (!need_iommu(dev, phys_mem, size))
 		return phys_mem; 
@@ -460,6 +465,12 @@
 	BUG_ON(dir == PCI_DMA_NONE);
 	if (nents == 0) 
 		return 0;
+
+#ifdef CONFIG_SWIOTLB
+	if (swiotlb)
+		return swiotlb_map_sg(&dev->dev,sg,nents,dir);
+#endif
+
 	out = 0;
 	start = 0;
 	for (i = 0; i < nents; i++) {
@@ -520,6 +531,14 @@
 	unsigned long iommu_page; 
 	int npages;
 	int i;
+
+#ifdef CONFIG_SWIOTLB
+	if (swiotlb) {
+		swiotlb_unmap_single(&hwdev->dev,dma_addr,size,direction);
+		return;
+	}
+#endif
+
 	if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || 
 	    dma_addr >= iommu_bus_base + iommu_size)
 		return;
@@ -570,7 +589,7 @@
 		return 0; 
 	}
 
-	if (no_iommu && (mask < (end_pfn << PAGE_SHIFT)))
+	if (no_iommu && (mask < (end_pfn << PAGE_SHIFT)) && !swiotlb)
 		return 0;
 
 	return 1;
@@ -680,6 +699,7 @@
 	return 0;
 
  nommu:
+ 	/* Should not happen anymore */
 	printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
 	       KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction."); 
 	return -1; 
@@ -694,6 +714,7 @@
 	unsigned long iommu_start;
 	struct pci_dev *dev;
 		
+
 #ifndef CONFIG_AGP_AMD64
 	no_agp = 1; 
 #else
@@ -704,7 +725,14 @@
 		(agp_copy_info(&info) < 0); 
 #endif	
 
-	if (no_iommu || (!force_iommu && end_pfn < 0xffffffff>>PAGE_SHIFT)) { 
+	if (swiotlb) { 
+		no_iommu = 1;
+		printk(KERN_INFO "PCI-DMA: Using SWIOTLB :-(\n"); 
+		return -1; 
+	} 
+	
+	if (no_iommu || (!force_iommu && end_pfn < 0xffffffff>>PAGE_SHIFT) || 
+	    !iommu_aperture) {
 		printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n"); 
 		no_iommu = 1;
 		return -1;
diff -Nru a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
--- a/arch/x86_64/kernel/process.c	Wed Feb 18 12:06:21 2004
+++ b/arch/x86_64/kernel/process.c	Wed Feb 18 12:06:21 2004
@@ -140,6 +140,52 @@
 	}
 }
 
+/*
+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate IPI to trigger checking of need_resched.
+ * We execute MONITOR against need_resched and enter optimized wait state
+ * through MWAIT. Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ */
+static void mwait_idle(void)
+{
+	local_irq_enable();
+
+	if (!need_resched()) {
+		set_thread_flag(TIF_POLLING_NRFLAG);
+		do {
+			__monitor((void *)&current_thread_info()->flags, 0, 0);
+			if (need_resched())
+				break;
+			__mwait(0, 0);
+		} while (!need_resched());
+		clear_thread_flag(TIF_POLLING_NRFLAG);
+	}
+}
+
+void __init select_idle_routine(const struct cpuinfo_x86 *c)
+{
+	static int printed;
+	if (cpu_has(c, X86_FEATURE_MWAIT)) {
+		/*
+		 * Skip, if setup has overridden idle.
+		 * Also, take care of system with asymmetric CPUs.
+		 * Use, mwait_idle only if all cpus support it.
+		 * If not, we fallback to default_idle()
+		 */
+		if (!pm_idle) {
+			if (!printed) {
+				printk("using mwait in idle threads.\n");
+				printed = 1;
+			}
+			pm_idle = mwait_idle;
+		}
+		return;
+	}
+	pm_idle = default_idle;
+	return;
+}
+
 static int __init idle_setup (char *str)
 {
 	if (!strncmp(str, "poll", 4)) {
diff -Nru a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
--- a/arch/x86_64/kernel/setup.c	Wed Feb 18 12:06:21 2004
+++ b/arch/x86_64/kernel/setup.c	Wed Feb 18 12:06:21 2004
@@ -76,6 +76,9 @@
 
 unsigned long saved_video_mode;
 
+int swiotlb;
+EXPORT_SYMBOL(swiotlb);
+
 /*
  * Setup options
  */
@@ -440,7 +443,6 @@
 		}
 	}
 #endif
-
 	paging_init();
 
 #ifndef CONFIG_SMP
@@ -584,6 +586,191 @@
 	return r;
 }
 
+static void __init detect_ht(void)
+{
+	extern	int phys_proc_id[NR_CPUS];
+	
+	u32 	eax, ebx, ecx, edx;
+	int 	index_lsb, index_msb, tmp;
+	int	initial_apic_id;
+	int 	cpu = smp_processor_id();
+	
+	cpuid(1, &eax, &ebx, &ecx, &edx);
+	smp_num_siblings = (ebx & 0xff0000) >> 16;
+	
+	if (smp_num_siblings == 1) {
+		printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
+	} else if (smp_num_siblings > 1) {
+		index_lsb = 0;
+		index_msb = 31;
+		/*
+		 * At this point we only support two siblings per
+		 * processor package.
+		 */
+#define NR_SIBLINGS	2
+		if (smp_num_siblings != NR_SIBLINGS) {
+			printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
+			smp_num_siblings = 1;
+			return;
+		}
+		tmp = smp_num_siblings;
+		while ((tmp & 1) == 0) {
+			tmp >>=1 ;
+			index_lsb++;
+		}
+		tmp = smp_num_siblings;
+		while ((tmp & 0x80000000 ) == 0) {
+			tmp <<=1 ;
+			index_msb--;
+		}
+		if (index_lsb != index_msb )
+			index_msb++;
+		initial_apic_id = ebx >> 24 & 0xff;
+		phys_proc_id[cpu] = initial_apic_id >> index_msb;
+		
+		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
+		       phys_proc_id[cpu]);
+	}
+}
+	
+#define LVL_1_INST	1
+#define LVL_1_DATA	2
+#define LVL_2		3
+#define LVL_3		4
+#define LVL_TRACE	5
+
+struct _cache_table
+{
+	unsigned char descriptor;
+	char cache_type;
+	short size;
+};
+
+/* all the cache descriptor types we care about (no TLB or trace cache entries) */
+static struct _cache_table cache_table[] __initdata =
+{
+	{ 0x06, LVL_1_INST, 8 },
+	{ 0x08, LVL_1_INST, 16 },
+	{ 0x0a, LVL_1_DATA, 8 },
+	{ 0x0c, LVL_1_DATA, 16 },
+	{ 0x22, LVL_3,      512 },
+	{ 0x23, LVL_3,      1024 },
+	{ 0x25, LVL_3,      2048 },
+	{ 0x29, LVL_3,      4096 },
+	{ 0x2c, LVL_1_DATA, 32 },
+	{ 0x30, LVL_1_INST, 32 },
+	{ 0x39, LVL_2,      128 },
+	{ 0x3b, LVL_2,      128 },
+	{ 0x3c, LVL_2,      256 },
+	{ 0x41, LVL_2,      128 },
+	{ 0x42, LVL_2,      256 },
+	{ 0x43, LVL_2,      512 },
+	{ 0x44, LVL_2,      1024 },
+	{ 0x45, LVL_2,      2048 },
+	{ 0x66, LVL_1_DATA, 8 },
+	{ 0x67, LVL_1_DATA, 16 },
+	{ 0x68, LVL_1_DATA, 32 },
+	{ 0x70, LVL_TRACE,  12 },
+	{ 0x71, LVL_TRACE,  16 },
+	{ 0x72, LVL_TRACE,  32 },
+	{ 0x79, LVL_2,      128 },
+	{ 0x7a, LVL_2,      256 },
+	{ 0x7b, LVL_2,      512 },
+	{ 0x7c, LVL_2,      1024 },
+	{ 0x82, LVL_2,      256 },
+	{ 0x83, LVL_2,      512 },
+	{ 0x84, LVL_2,      1024 },
+	{ 0x85, LVL_2,      2048 },
+	{ 0x86, LVL_2,      512 },
+	{ 0x87, LVL_2,      1024 },
+	{ 0x00, 0, 0}
+};
+
+static void __init init_intel(struct cpuinfo_x86 *c)
+{
+	/* Cache sizes */
+	unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; 
+	unsigned n;
+
+	select_idle_routine(c);
+	if (c->cpuid_level > 1) {
+		/* supports eax=2  call */
+		int i, j, n;
+		int regs[4];
+		unsigned char *dp = (unsigned char *)regs;
+
+		/* Number of times to iterate */
+		n = cpuid_eax(2) & 0xFF;
+
+		for ( i = 0 ; i < n ; i++ ) {
+			cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
+			
+			/* If bit 31 is set, this is an unknown format */
+			for ( j = 0 ; j < 3 ; j++ ) {
+				if ( regs[j] < 0 ) regs[j] = 0;
+			}
+
+			/* Byte 0 is level count, not a descriptor */
+			for ( j = 1 ; j < 16 ; j++ ) {
+				unsigned char des = dp[j];
+				unsigned char k = 0;
+
+				/* look up this descriptor in the table */
+				while (cache_table[k].descriptor != 0)
+				{
+					if (cache_table[k].descriptor == des) {
+						switch (cache_table[k].cache_type) {
+						case LVL_1_INST:
+							l1i += cache_table[k].size;
+							break;
+						case LVL_1_DATA:
+							l1d += cache_table[k].size;
+							break;
+						case LVL_2:
+							l2 += cache_table[k].size;
+							break;
+						case LVL_3:
+							l3 += cache_table[k].size;
+							break;
+						case LVL_TRACE:
+							trace += cache_table[k].size;
+							break;
+						}
+
+						break;
+					}
+
+					k++;
+				}
+			}
+		}
+
+		if (trace)
+			printk (KERN_INFO "CPU: Trace cache: %dK uops", trace);
+		else if (l1i)
+			printk (KERN_INFO "CPU: L1 I cache: %dK", l1i);
+		if (l1d)
+			printk(", L1 D cache: %dK\n", l1d);
+		else
+			printk("\n"); 
+		if (l2)
+			printk(KERN_INFO "CPU: L2 cache: %dK\n", l2);
+		if (l3)
+			printk(KERN_INFO "CPU: L3 cache: %dK\n", l3);
+
+		c->x86_cache_size = l2 ? l2 : (l1i+l1d);
+	}
+
+	if (cpu_has(c, X86_FEATURE_HT))
+		detect_ht(); 
+
+	n = cpuid_eax(0x80000000);
+	if (n >= 0x80000008) {
+		unsigned eax = cpuid_eax(0x80000008);
+		c->x86_virt_bits = (eax >> 8) & 0xff;
+		c->x86_phys_bits = eax & 0xff;
+	}
+}
 
 void __init get_cpu_vendor(struct cpuinfo_x86 *c)
 {
@@ -591,6 +778,8 @@
 
 	if (!strcmp(v, "AuthenticAMD"))
 		c->x86_vendor = X86_VENDOR_AMD;
+	else if (!strcmp(v, "GenuineIntel"))
+		c->x86_vendor = X86_VENDOR_INTEL;
 	else
 		c->x86_vendor = X86_VENDOR_UNKNOWN;
 }
@@ -606,7 +795,7 @@
  */
 void __init identify_cpu(struct cpuinfo_x86 *c)
 {
-	int junk, i;
+	int i;
 	u32 xlvl, tfms;
 
 	c->loops_per_jiffy = loops_per_jiffy;
@@ -630,7 +819,7 @@
 	/* Intel-defined flags: level 0x00000001 */
 	if (c->cpuid_level >= 0x00000001) {
 		__u32 misc;
-		cpuid(0x00000001, &tfms, &misc, &junk,
+		cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
 		      &c->x86_capability[0]);
 		c->x86 = (tfms >> 8) & 0xf;
 		c->x86_model = (tfms >> 4) & 0xf;
@@ -679,9 +868,13 @@
 			init_amd(c);
 			break;
 
+		case X86_VENDOR_INTEL:
+			init_intel(c); 
+			break; 
+
 		case X86_VENDOR_UNKNOWN:
 		default:
-			/* Not much we can do here... */
+			display_cacheinfo(c);
 			break;
 	}
 	
@@ -732,7 +925,7 @@
 	        "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
 	        "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
 	        "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
-	        "fxsr", "sse", "sse2", "ss", NULL, "tm", "ia64", NULL,
+	        "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL,
 
 		/* AMD-defined */
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
@@ -751,6 +944,12 @@
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+
+		/* Intel-defined (#2) */
+		"pni", NULL, NULL, "monitor", "ds_cpl", NULL, NULL, "tm2",
+		"est", NULL, "cid", NULL, NULL, NULL, NULL, NULL,
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 	};
 	static char *x86_power_flags[] = { 
 		"ts",	/* temperature sensor */
@@ -790,6 +989,14 @@
 	if (c->x86_cache_size >= 0) 
 		seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
 	
+#ifdef CONFIG_X86_HT
+	if (cpu_has_ht) {
+		extern int phys_proc_id[NR_CPUS];
+		seq_printf(m, "physical id\t: %d\n", phys_proc_id[c - cpu_data]);
+		seq_printf(m, "siblings\t: %d\n", smp_num_siblings);
+	}
+#endif	
+
 	seq_printf(m,
 	        "fpu\t\t: yes\n"
 	        "fpu_exception\t: yes\n"
diff -Nru a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
--- a/arch/x86_64/kernel/smpboot.c	Wed Feb 18 12:06:21 2004
+++ b/arch/x86_64/kernel/smpboot.c	Wed Feb 18 12:06:21 2004
@@ -53,6 +53,10 @@
 #include <asm/tlbflush.h>
 #include <asm/proto.h>
 
+/* Number of siblings per CPU package */
+int smp_num_siblings = 1;
+int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */
+
 /* Bitmask of currently online CPUs */
 cpumask_t cpu_online_map;
 
@@ -66,6 +70,8 @@
 /* Set when the idlers are all forked */
 int smp_threads_ready;
 
+int cpu_sibling_map[NR_CPUS] __cacheline_aligned;
+
 /*
  * Trampoline 80x86 program as an array.
  */
@@ -855,6 +861,34 @@
 			bogosum/(500000/HZ),
 			(bogosum/(5000/HZ))%100);
 		Dprintk("Before bogocount - setting activated=1.\n");
+	}
+
+	/*
+	 * If Hyper-Threading is avaialble, construct cpu_sibling_map[], so
+	 * that we can tell the sibling CPU efficiently.
+	 */
+	if (cpu_has_ht && smp_num_siblings > 1) {
+		for (cpu = 0; cpu < NR_CPUS; cpu++)
+			cpu_sibling_map[cpu] = NO_PROC_ID;
+		
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			int 	i;
+			if (!cpu_isset(cpu, cpu_callout_map))
+				continue;
+
+			for (i = 0; i < NR_CPUS; i++) {
+				if (i == cpu || !cpu_isset(i, cpu_callout_map))
+					continue;
+				if (phys_proc_id[cpu] == phys_proc_id[i]) {
+					cpu_sibling_map[cpu] = i;
+					break;
+				}
+			}
+			if (cpu_sibling_map[cpu] == NO_PROC_ID) {
+				smp_num_siblings = 1;
+				printk(KERN_WARNING "WARNING: No sibling found for CPU %d.\n", cpu);
+			}
+		}
 	}
 
 	Dprintk("Boot done.\n");
diff -Nru a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
--- a/arch/x86_64/kernel/x8664_ksyms.c	Wed Feb 18 12:06:21 2004
+++ b/arch/x86_64/kernel/x8664_ksyms.c	Wed Feb 18 12:06:21 2004
@@ -194,6 +194,8 @@
 
 EXPORT_SYMBOL(die_chain);
 
+EXPORT_SYMBOL(cpu_sibling_map);
+
 extern void do_softirq_thunk(void);
 EXPORT_SYMBOL_NOVERS(do_softirq_thunk);
 
diff -Nru a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
--- a/arch/x86_64/lib/copy_page.S	Wed Feb 18 12:06:21 2004
+++ b/arch/x86_64/lib/copy_page.S	Wed Feb 18 12:06:21 2004
@@ -8,11 +8,6 @@
 	.globl copy_page
 	.p2align 4
 copy_page:
-	prefetch (%rsi) 
-	prefetch 1*64(%rsi)
-	prefetchw (%rdi) 
-	prefetchw 1*64(%rdi) 
-
 	subq	$3*8,%rsp
 	movq	%rbx,(%rsp)
 	movq	%r12,1*8(%rsp)
@@ -32,7 +27,7 @@
 	movq     48 (%rsi), %r11
 	movq     56 (%rsi), %r12
 
-	prefetch 5*64(%rsi)
+	prefetcht0 5*64(%rsi)
 
 	movq     %rax,    (%rdi)
 	movq     %rbx,  8 (%rdi)
@@ -42,8 +37,6 @@
 	movq     %r10, 40 (%rdi)
 	movq     %r11, 48 (%rdi)
 	movq     %r12, 56 (%rdi)
-
-	prefetchw 5*64(%rdi)
 
 	leaq    64 (%rsi), %rsi
 	leaq    64 (%rdi), %rdi
diff -Nru a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S
--- a/arch/x86_64/lib/csum-copy.S	Wed Feb 18 12:06:21 2004
+++ b/arch/x86_64/lib/csum-copy.S	Wed Feb 18 12:06:21 2004
@@ -59,15 +59,6 @@
 	cmpl	 $3*64,%edx
 	jle	 .Lignore
 
-	ignore
-	prefetch (%rdi)
-	ignore
-	prefetch 1*64(%rdi)
-	ignore
-	prefetchw (%rsi)
-	ignore
-	prefetchw 1*64(%rsi)
-
 .Lignore:		
 	subq  $7*8,%rsp
 	movq  %rbx,2*8(%rsp)
@@ -115,7 +106,7 @@
 	movq  56(%rdi),%r13
 		
 	ignore 2f
-	prefetch 5*64(%rdi)
+	prefetcht0 5*64(%rdi)
 2:							
 	adcq  %rbx,%rax
 	adcq  %r8,%rax
@@ -146,8 +137,6 @@
 	dest
 	movq %r13,56(%rsi)
 	
-	ignore 3f
-	prefetchw 5*64(%rsi)
 3:
 	
 	leaq 64(%rdi),%rdi
diff -Nru a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
--- a/arch/x86_64/mm/init.c	Wed Feb 18 12:06:21 2004
+++ b/arch/x86_64/mm/init.c	Wed Feb 18 12:06:21 2004
@@ -402,6 +402,13 @@
 	int codesize, reservedpages, datasize, initsize;
 	int tmp;
 
+#ifdef CONFIG_SWIOTLB
+       if (!iommu_aperture && end_pfn >= 0xffffffff>>PAGE_SHIFT) { 
+	       swiotlb_init();
+	       swiotlb = 1;
+       }
+#endif
+
 	/* How many end-of-memory variables you have, grandma! */
 	max_low_pfn = end_pfn;
 	max_pfn = end_pfn;
diff -Nru a/arch/x86_64/oprofile/Makefile b/arch/x86_64/oprofile/Makefile
--- a/arch/x86_64/oprofile/Makefile	Wed Feb 18 12:06:21 2004
+++ b/arch/x86_64/oprofile/Makefile	Wed Feb 18 12:06:21 2004
@@ -1,7 +1,6 @@
 #
 # oprofile for x86-64.
-# Just reuse the one from i386. The Hammer performance counters 
-# are similar to Athlon.
+# Just reuse the one from i386. 
 #
 
 obj-$(CONFIG_OPROFILE) += oprofile.o
@@ -13,7 +12,8 @@
 	timer_int.o )
 
 OPROFILE-y := init.o
-OPROFILE-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o
+OPROFILE-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o op_model_p4.o \
+				     op_model_ppro.o
 OPROFILE-$(CONFIG_X86_IO_APIC)    += nmi_timer_int.o 
 
 oprofile-y = $(DRIVER_OBJS) $(addprefix ../../i386/oprofile/, $(OPROFILE-y))
diff -Nru a/include/asm-x86_64/cpufeature.h b/include/asm-x86_64/cpufeature.h
--- a/include/asm-x86_64/cpufeature.h	Wed Feb 18 12:06:21 2004
+++ b/include/asm-x86_64/cpufeature.h	Wed Feb 18 12:06:21 2004
@@ -7,7 +7,7 @@
 #ifndef __ASM_X8664_CPUFEATURE_H
 #define __ASM_X8664_CPUFEATURE_H
 
-#define NCAPINTS	4	/* Currently we have 4 32-bit words worth of info */
+#define NCAPINTS	5	/* Currently we have 4 32-bit words worth of info */
 
 /* Intel-defined CPU features, CPUID level 0x00000001, word 0 */
 #define X86_FEATURE_FPU		(0*32+ 0) /* Onboard FPU */
@@ -37,6 +37,7 @@
 #define X86_FEATURE_XMM		(0*32+25) /* Streaming SIMD Extensions */
 #define X86_FEATURE_XMM2	(0*32+26) /* Streaming SIMD Extensions-2 */
 #define X86_FEATURE_SELFSNOOP	(0*32+27) /* CPU self snoop */
+#define X86_FEATURE_HT		(0*32+28) /* Hyper-Threading */
 #define X86_FEATURE_ACC		(0*32+29) /* Automatic clock control */
 #define X86_FEATURE_IA64	(0*32+30) /* IA-64 processor */
 
@@ -61,6 +62,10 @@
 #define X86_FEATURE_CENTAUR_MCR	(3*32+ 3) /* Centaur MCRs (= MTRRs) */
 #define X86_FEATURE_K8_C	(3*32+ 4) /* C stepping K8 */
 
+/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
+#define X86_FEATURE_EST		(4*32+ 7) /* Enhanced SpeedStep */
+#define X86_FEATURE_MWAIT	(4*32+ 3) /* Monitor/Mwait support */
+
 #define cpu_has(c, bit)                test_bit(bit, (c)->x86_capability)
 #define boot_cpu_has(bit)      test_bit(bit, boot_cpu_data.x86_capability)
 
@@ -76,7 +81,7 @@
 #define cpu_has_mmx            1
 #define cpu_has_fxsr           1
 #define cpu_has_xmm            1
-#define cpu_has_ht             0 /* you need to report the support from i386. sorry */
+#define cpu_has_ht             boot_cpu_has(X86_FEATURE_HT)
 #define cpu_has_mp             1 /* XXX */
 #define cpu_has_k6_mtrr        0
 #define cpu_has_cyrix_arr      0
diff -Nru a/include/asm-x86_64/msr.h b/include/asm-x86_64/msr.h
--- a/include/asm-x86_64/msr.h	Wed Feb 18 12:06:21 2004
+++ b/include/asm-x86_64/msr.h	Wed Feb 18 12:06:21 2004
@@ -121,6 +121,9 @@
 	return edx;
 }
 
+#define MSR_IA32_UCODE_WRITE		0x79
+#define MSR_IA32_UCODE_REV		0x8b
+
 
 #endif
 
@@ -242,5 +245,124 @@
 #define MSR_IA32_APICBASE_BSP           (1<<8)
 #define MSR_IA32_APICBASE_ENABLE        (1<<11)
 #define MSR_IA32_APICBASE_BASE          (0xfffff<<12)
+
+/* P4/Xeon+ specific */
+#define MSR_IA32_MCG_EAX		0x180
+#define MSR_IA32_MCG_EBX		0x181
+#define MSR_IA32_MCG_ECX		0x182
+#define MSR_IA32_MCG_EDX		0x183
+#define MSR_IA32_MCG_ESI		0x184
+#define MSR_IA32_MCG_EDI		0x185
+#define MSR_IA32_MCG_EBP		0x186
+#define MSR_IA32_MCG_ESP		0x187
+#define MSR_IA32_MCG_EFLAGS		0x188
+#define MSR_IA32_MCG_EIP		0x189
+#define MSR_IA32_MCG_RESERVED		0x18A
+
+#define MSR_P6_EVNTSEL0			0x186
+#define MSR_P6_EVNTSEL1			0x187
+
+#define MSR_IA32_PERF_STATUS		0x198
+#define MSR_IA32_PERF_CTL		0x199
+
+#define MSR_IA32_THERM_CONTROL		0x19a
+#define MSR_IA32_THERM_INTERRUPT	0x19b
+#define MSR_IA32_THERM_STATUS		0x19c
+#define MSR_IA32_MISC_ENABLE		0x1a0
+
+#define MSR_IA32_DEBUGCTLMSR		0x1d9
+#define MSR_IA32_LASTBRANCHFROMIP	0x1db
+#define MSR_IA32_LASTBRANCHTOIP		0x1dc
+#define MSR_IA32_LASTINTFROMIP		0x1dd
+#define MSR_IA32_LASTINTTOIP		0x1de
+
+#define MSR_IA32_MC0_CTL		0x400
+#define MSR_IA32_MC0_STATUS		0x401
+#define MSR_IA32_MC0_ADDR		0x402
+#define MSR_IA32_MC0_MISC		0x403
+
+/* Pentium IV performance counter MSRs */
+#define MSR_P4_BPU_PERFCTR0 		0x300
+#define MSR_P4_BPU_PERFCTR1 		0x301
+#define MSR_P4_BPU_PERFCTR2 		0x302
+#define MSR_P4_BPU_PERFCTR3 		0x303
+#define MSR_P4_MS_PERFCTR0 		0x304
+#define MSR_P4_MS_PERFCTR1 		0x305
+#define MSR_P4_MS_PERFCTR2 		0x306
+#define MSR_P4_MS_PERFCTR3 		0x307
+#define MSR_P4_FLAME_PERFCTR0 		0x308
+#define MSR_P4_FLAME_PERFCTR1 		0x309
+#define MSR_P4_FLAME_PERFCTR2 		0x30a
+#define MSR_P4_FLAME_PERFCTR3 		0x30b
+#define MSR_P4_IQ_PERFCTR0 		0x30c
+#define MSR_P4_IQ_PERFCTR1 		0x30d
+#define MSR_P4_IQ_PERFCTR2 		0x30e
+#define MSR_P4_IQ_PERFCTR3 		0x30f
+#define MSR_P4_IQ_PERFCTR4 		0x310
+#define MSR_P4_IQ_PERFCTR5 		0x311
+#define MSR_P4_BPU_CCCR0 		0x360
+#define MSR_P4_BPU_CCCR1 		0x361
+#define MSR_P4_BPU_CCCR2 		0x362
+#define MSR_P4_BPU_CCCR3 		0x363
+#define MSR_P4_MS_CCCR0 		0x364
+#define MSR_P4_MS_CCCR1 		0x365
+#define MSR_P4_MS_CCCR2 		0x366
+#define MSR_P4_MS_CCCR3 		0x367
+#define MSR_P4_FLAME_CCCR0 		0x368
+#define MSR_P4_FLAME_CCCR1 		0x369
+#define MSR_P4_FLAME_CCCR2 		0x36a
+#define MSR_P4_FLAME_CCCR3 		0x36b
+#define MSR_P4_IQ_CCCR0 		0x36c
+#define MSR_P4_IQ_CCCR1 		0x36d
+#define MSR_P4_IQ_CCCR2 		0x36e
+#define MSR_P4_IQ_CCCR3 		0x36f
+#define MSR_P4_IQ_CCCR4 		0x370
+#define MSR_P4_IQ_CCCR5 		0x371
+#define MSR_P4_ALF_ESCR0 		0x3ca
+#define MSR_P4_ALF_ESCR1 		0x3cb
+#define MSR_P4_BPU_ESCR0 		0x3b2
+#define MSR_P4_BPU_ESCR1 		0x3b3
+#define MSR_P4_BSU_ESCR0 		0x3a0
+#define MSR_P4_BSU_ESCR1 		0x3a1
+#define MSR_P4_CRU_ESCR0 		0x3b8
+#define MSR_P4_CRU_ESCR1 		0x3b9
+#define MSR_P4_CRU_ESCR2 		0x3cc
+#define MSR_P4_CRU_ESCR3 		0x3cd
+#define MSR_P4_CRU_ESCR4 		0x3e0
+#define MSR_P4_CRU_ESCR5 		0x3e1
+#define MSR_P4_DAC_ESCR0 		0x3a8
+#define MSR_P4_DAC_ESCR1 		0x3a9
+#define MSR_P4_FIRM_ESCR0 		0x3a4
+#define MSR_P4_FIRM_ESCR1 		0x3a5
+#define MSR_P4_FLAME_ESCR0 		0x3a6
+#define MSR_P4_FLAME_ESCR1 		0x3a7
+#define MSR_P4_FSB_ESCR0 		0x3a2
+#define MSR_P4_FSB_ESCR1 		0x3a3
+#define MSR_P4_IQ_ESCR0 		0x3ba
+#define MSR_P4_IQ_ESCR1 		0x3bb
+#define MSR_P4_IS_ESCR0 		0x3b4
+#define MSR_P4_IS_ESCR1 		0x3b5
+#define MSR_P4_ITLB_ESCR0 		0x3b6
+#define MSR_P4_ITLB_ESCR1 		0x3b7
+#define MSR_P4_IX_ESCR0 		0x3c8
+#define MSR_P4_IX_ESCR1 		0x3c9
+#define MSR_P4_MOB_ESCR0 		0x3aa
+#define MSR_P4_MOB_ESCR1 		0x3ab
+#define MSR_P4_MS_ESCR0 		0x3c0
+#define MSR_P4_MS_ESCR1 		0x3c1
+#define MSR_P4_PMH_ESCR0 		0x3ac
+#define MSR_P4_PMH_ESCR1 		0x3ad
+#define MSR_P4_RAT_ESCR0 		0x3bc
+#define MSR_P4_RAT_ESCR1 		0x3bd
+#define MSR_P4_SAAT_ESCR0 		0x3ae
+#define MSR_P4_SAAT_ESCR1 		0x3af
+#define MSR_P4_SSU_ESCR0 		0x3be
+#define MSR_P4_SSU_ESCR1 		0x3bf    /* guess: not defined in manual */
+#define MSR_P4_TBPU_ESCR0 		0x3c2
+#define MSR_P4_TBPU_ESCR1 		0x3c3
+#define MSR_P4_TC_ESCR0 		0x3c4
+#define MSR_P4_TC_ESCR1 		0x3c5
+#define MSR_P4_U2L_ESCR0 		0x3b0
+#define MSR_P4_U2L_ESCR1 		0x3b1
 
 #endif
diff -Nru a/include/asm-x86_64/pci.h b/include/asm-x86_64/pci.h
--- a/include/asm-x86_64/pci.h	Wed Feb 18 12:06:21 2004
+++ b/include/asm-x86_64/pci.h	Wed Feb 18 12:06:21 2004
@@ -72,6 +72,23 @@
 extern void pci_free_consistent(struct pci_dev *hwdev, size_t size,
 				void *vaddr, dma_addr_t dma_handle);
 
+#ifdef CONFIG_SWIOTLB
+extern int swiotlb; 
+extern dma_addr_t swiotlb_map_single (struct device *hwdev, void *ptr, size_t size, 
+				      int dir);
+extern void swiotlb_unmap_single (struct device *hwdev, dma_addr_t dev_addr,
+				  size_t size, int dir);
+extern void swiotlb_sync_single (struct device *hwdev, dma_addr_t dev_addr, 
+				 size_t size, int dir);
+extern void swiotlb_sync_sg (struct device *hwdev, struct scatterlist *sg, int nelems, 
+			     int dir);
+extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
+		      int nents, int direction);
+extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
+			 int nents, int direction);
+
+#endif
+
 #ifdef CONFIG_GART_IOMMU
 
 /* Map a single buffer of the indicated size for DMA in streaming mode.
@@ -113,6 +130,13 @@
 				       size_t size, int direction)
 {
 	BUG_ON(direction == PCI_DMA_NONE); 
+
+#ifdef CONFIG_SWIOTLB
+	if (swiotlb)
+		return swiotlb_sync_single(&hwdev->dev,dma_handle,size,direction);
+#endif
+
+	flush_write_buffers();
 } 
 
 static inline void pci_dma_sync_sg(struct pci_dev *hwdev, 
@@ -120,6 +144,12 @@
 				   int nelems, int direction)
 { 
 	BUG_ON(direction == PCI_DMA_NONE); 
+
+#ifdef CONFIG_SWIOTLB
+	if (swiotlb)
+		return swiotlb_sync_sg(&hwdev->dev,sg,nelems,direction);
+#endif
+	flush_write_buffers();
 } 
 
 /* The PCI address space does equal the physical memory
@@ -271,5 +301,7 @@
 
 /* generic pci stuff */
 #include <asm-generic/pci.h>
+
+#include <linux/dma-mapping.h>
 
 #endif /* __x8664_PCI_H */
diff -Nru a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
--- a/include/asm-x86_64/processor.h	Wed Feb 18 12:06:21 2004
+++ b/include/asm-x86_64/processor.h	Wed Feb 18 12:06:21 2004
@@ -303,6 +303,67 @@
 	(((struct pt_regs *)(tsk->thread.rsp0 - sizeof(struct pt_regs)))->rip)
 #define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
 
+
+struct microcode_header {
+	unsigned int hdrver;
+	unsigned int rev;
+	unsigned int date;
+	unsigned int sig;
+	unsigned int cksum;
+	unsigned int ldrver;
+	unsigned int pf;
+	unsigned int datasize;
+	unsigned int totalsize;
+	unsigned int reserved[3];
+};
+
+struct microcode {
+	struct microcode_header hdr;
+	unsigned int bits[0];
+};
+
+typedef struct microcode microcode_t;
+typedef struct microcode_header microcode_header_t;
+
+/* microcode format is extended from prescott processors */
+struct extended_signature {
+	unsigned int sig;
+	unsigned int pf;
+	unsigned int cksum;
+};
+
+struct extended_sigtable {
+	unsigned int count;
+	unsigned int cksum;
+	unsigned int reserved[3];
+	struct extended_signature sigs[0];
+};
+
+/* '6' because it used to be for P6 only (but now covers Pentium 4 as well) */
+#define MICROCODE_IOCFREE	_IO('6',0)
+
+
+#define ASM_NOP1 K8_NOP1
+#define ASM_NOP2 K8_NOP2
+#define ASM_NOP3 K8_NOP3
+#define ASM_NOP4 K8_NOP4
+#define ASM_NOP5 K8_NOP5
+#define ASM_NOP6 K8_NOP6
+#define ASM_NOP7 K8_NOP7
+#define ASM_NOP8 K8_NOP8
+
+/* Opteron nops */
+#define K8_NOP1 ".byte 0x90\n"
+#define K8_NOP2	".byte 0x66,0x90\n" 
+#define K8_NOP3	".byte 0x66,0x66,0x90\n" 
+#define K8_NOP4	".byte 0x66,0x66,0x66,0x90\n" 
+#define K8_NOP5	K8_NOP3 K8_NOP2 
+#define K8_NOP6	K8_NOP3 K8_NOP3
+#define K8_NOP7	K8_NOP4 K8_NOP3
+#define K8_NOP8	K8_NOP4 K8_NOP4
+
+#define ASM_NOP_MAX 8
+
 /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
 extern inline void rep_nop(void)
 {
@@ -318,31 +379,25 @@
 
 #define cpu_has_fpu 1
 
-/* Some early Opteron versions incorrectly fault on prefetch (errata #91). 
-   If this happens just jump back. */
 #define ARCH_HAS_PREFETCH
 static inline void prefetch(void *x) 
 { 
-	asm volatile("2: prefetcht0 %0\n1:\t" 
-		    ".section __ex_table,\"a\"\n\t"
-		    "  .align 8\n\t"
-		    "  .quad  2b,1b\n\t"
-		    ".previous" :: "m" (*(unsigned long *)x));
+	asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
 } 
 
-#define ARCH_HAS_PREFETCHW
+#define ARCH_HAS_PREFETCHW 1
 static inline void prefetchw(void *x) 
 { 
-	asm volatile("2: prefetchw %0\n1:\t" 
-		    ".section __ex_table,\"a\"\n\t"
-		    "  .align 8\n\t"
-		    "  .quad  2b,1b\n\t"
-		    ".previous" :: "m" (*(unsigned long *)x));
+	alternative_input(ASM_NOP4,
+			  "prefetchw (%1)",
+			  X86_FEATURE_3DNOW,
+			  "r" (x));
 } 
 
-#define ARCH_HAS_SPINLOCK_PREFETCH
+#define ARCH_HAS_SPINLOCK_PREFETCH 1
 
 #define spin_lock_prefetch(x)  prefetchw(x)
+
 #define cpu_relax()   rep_nop()
 
 /*
@@ -372,32 +427,28 @@
 	outb((data), 0x23); \
 } while (0)
 
+static inline void __monitor(const void *eax, unsigned long ecx,
+		unsigned long edx)
+{
+	/* "monitor %eax,%ecx,%edx;" */
+	asm volatile(
+		".byte 0x0f,0x01,0xc8;"
+		: :"a" (eax), "c" (ecx), "d"(edx));
+}
+
+static inline void __mwait(unsigned long eax, unsigned long ecx)
+{
+	/* "mwait %eax,%ecx;" */
+	asm volatile(
+		".byte 0x0f,0x01,0xc9;"
+		: :"a" (eax), "c" (ecx));
+}
+
 #define stack_current() \
 ({								\
 	struct thread_info *ti;					\
 	asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));	\
 	ti->task;					\
 })
-
-#define ASM_NOP1 K8_NOP1
-#define ASM_NOP2 K8_NOP2
-#define ASM_NOP3 K8_NOP3
-#define ASM_NOP4 K8_NOP4
-#define ASM_NOP5 K8_NOP5
-#define ASM_NOP6 K8_NOP6
-#define ASM_NOP7 K8_NOP7
-#define ASM_NOP8 K8_NOP8
-
-/* Opteron nops */
-#define K8_NOP1 ".byte 0x90\n"
-#define K8_NOP2	".byte 0x66,0x90\n" 
-#define K8_NOP3	".byte 0x66,0x66,0x90\n" 
-#define K8_NOP4	".byte 0x66,0x66,0x66,0x90\n" 
-#define K8_NOP5	K8_NOP3 K8_NOP2 
-#define K8_NOP6	K8_NOP3 K8_NOP3
-#define K8_NOP7	K8_NOP4 K8_NOP3
-#define K8_NOP8	K8_NOP4 K8_NOP4
-
-#define ASM_NOP_MAX 8
 
 #endif /* __ASM_X86_64_PROCESSOR_H */
diff -Nru a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
--- a/include/asm-x86_64/proto.h	Wed Feb 18 12:06:21 2004
+++ b/include/asm-x86_64/proto.h	Wed Feb 18 12:06:21 2004
@@ -76,6 +76,10 @@
 
 extern int unhandled_signal(struct task_struct *tsk, int sig);
 
+extern void select_idle_routine(const struct cpuinfo_x86 *c);
+extern void swiotlb_init(void);
+extern int swiotlb;
+
 extern unsigned long max_mapnr;
 extern unsigned long end_pfn; 
 extern unsigned long table_start, table_end;
@@ -92,6 +96,7 @@
 
 extern int fallback_aper_order;
 extern int fallback_aper_force;
+extern int iommu_aperture;
 
 extern void smp_local_timer_interrupt(struct pt_regs * regs);
 
diff -Nru a/include/asm-x86_64/segment.h b/include/asm-x86_64/segment.h
--- a/include/asm-x86_64/segment.h	Wed Feb 18 12:06:21 2004
+++ b/include/asm-x86_64/segment.h	Wed Feb 18 12:06:21 2004
@@ -1,6 +1,8 @@
 #ifndef _ASM_SEGMENT_H
 #define _ASM_SEGMENT_H
 
+#include <asm/cache.h>
+
 #define __KERNEL_CS	0x10
 #define __KERNEL_DS	0x18
 
@@ -38,7 +40,7 @@
 #define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
 
 #define IDT_ENTRIES 256
-#define GDT_ENTRIES 16
+#define GDT_ENTRIES (L1_CACHE_BYTES / 8) 
 #define GDT_SIZE (GDT_ENTRIES * 8)
 #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) 
 
diff -Nru a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
--- a/include/asm-x86_64/smp.h	Wed Feb 18 12:06:21 2004
+++ b/include/asm-x86_64/smp.h	Wed Feb 18 12:06:21 2004
@@ -39,6 +39,7 @@
 extern cpumask_t cpu_online_map;
 extern volatile unsigned long smp_invalidate_needed;
 extern int pic_mode;
+extern int smp_num_siblings;
 extern void smp_flush_tlb(void);
 extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
 extern void smp_send_reschedule(int cpu);
@@ -46,7 +47,7 @@
 extern void (*mtrr_hook) (void);
 extern void zap_low_mappings(void);
 void smp_stop_cpu(void);
-
+extern int cpu_sibling_map[];
 
 #define SMP_TRAMPOLINE_BASE 0x6000
 
diff -Nru a/include/asm-x86_64/system.h b/include/asm-x86_64/system.h
--- a/include/asm-x86_64/system.h	Wed Feb 18 12:06:21 2004
+++ b/include/asm-x86_64/system.h	Wed Feb 18 12:06:21 2004
@@ -88,6 +88,56 @@
 #endif
 
 /*
+ * Alternative instructions for different CPU types or capabilities.
+ * 
+ * This allows the use of optimized instructions even on generic binary
+ * kernels.
+ * 
+ * The length of oldinstr must be greater than or equal to the length of newinstr.
+ * It can be padded with nops as needed.
+ * 
+ * For non barrier like inlines please define new variants
+ * without volatile and memory clobber.
+ */
+#define alternative(oldinstr, newinstr, feature) 	\
+	asm volatile ("661:\n\t" oldinstr "\n662:\n" 		     \
+		      ".section .altinstructions,\"a\"\n"     	     \
+		      "  .align 8\n"				       \
+		      "  .quad 661b\n"            /* label */          \
+		      "  .quad 663f\n"		  /* new instruction */ \
+		      "  .byte %c0\n"             /* feature bit */    \
+		      "  .byte 662b-661b\n"       /* sourcelen */      \
+		      "  .byte 664f-663f\n"       /* replacementlen */ \
+		      ".previous\n"					\
+		      ".section .altinstr_replacement,\"ax\"\n"		\
+		      "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
+		      ".previous" :: "i" (feature) : "memory")  
+
+/*
+ * Alternative inline assembly with input.
+ * 
+ * Peculiarities:
+ * No memory clobber here. 
+ * Argument numbers start with 1.
+ * Best is to use constraints that are fixed size (like (%1) ... "r")
+ * If you use variable sized constraints like "m" or "g" in the 
+ * replacement make sure to pad to the worst case length.
+ */
+#define alternative_input(oldinstr, newinstr, feature, input)		\
+	asm volatile ("661:\n\t" oldinstr "\n662:\n"			\
+		      ".section .altinstructions,\"a\"\n"		\
+		      "  .align 8\n"					\
+		      "  .quad 661b\n"            /* label */		\
+		      "  .quad 663f\n"		  /* new instruction */	\
+		      "  .byte %c0\n"             /* feature bit */	\
+		      "  .byte 662b-661b\n"       /* sourcelen */	\
+		      "  .byte 664f-663f\n"       /* replacementlen */	\
+		      ".previous\n"					\
+		      ".section .altinstr_replacement,\"ax\"\n"		\
+		      "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
+		      ".previous" :: "i" (feature), input)
+
+/*
  * Clear and set 'TS' bit respectively
  */
 #define clts() __asm__ __volatile__ ("clts")
-
To unsubscribe from this list: send the line "unsubscribe bk-commits-head" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Copyright © 2004, Eklektix, Inc.
Comments and public postings are copyrighted by their creators.
Linux is a registered trademark of Linus Torvalds