| From: |
| Naohiko Shimizu <nshimizu@keyaki.cc.u-tokai.ac.jp> |
| To: |
| linux-kernel@vger.kernel.org |
| Subject: |
| [PATCH]Super Page patch for 2.4.18 |
| Date: |
| Mon, 8 Jul 2002 19:02:20 +0900 |
Hi,
Attached file is the Super Page patch for 2.4.18.
It activates the super page feature of Alpha processor.
(It was previously known as the page granularity hint support patch, but
"super page" is a clearer name for most people. I added a TLB flush to the
two-year-old patch to work around an Alpha architectural problem.)
When activated, a large matrix transpose benchmark runs up to 5 times faster
(the benchmark is also attached for convenience).
I have been running a random malloc/free test program for a whole day,
and it seems quite stable. However, I don't have an SMP machine,
so I have not tested it on SMP.
I also have an IA32 port, and I will merge it when it seems stable enough.
I am not subscribed to the kernel list, so please cc me on any comments about this mail.
Regards,
--
Naohiko Shimizu
Department of Communications Engineering,
School of Information Technology and Electronics, Tokai University
1117 Kitakaname Hiratsuka 259-1292 Japan
TEL.+81-463-58-1211(ext. 4084) FAX.+81-463-58-8320
http://shimizu-lab.dt.u-tokai.ac.jp/
diff -urN linux-2.4.18/Documentation/super_page.txt linux/Documentation/super_page.txt
--- linux-2.4.18/Documentation/super_page.txt Thu Jan 1 09:00:00 1970
+++ linux/Documentation/super_page.txt Fri Jun 28 14:40:25 2002
@@ -0,0 +1,24 @@
+The Alpha architecture defines Granularity Hint (GH) bits in the
+Page Table Entry (PTE). If these bits are set to a non-zero value,
+they supply a hint to translation buffer implementations that
+a block of pages can be treated as a single larger page.
+This means that even without a variable-length page mechanism,
+we have the opportunity to reduce translation misses.
+For HPC applications with large working sets, the performance
+degradation caused by translation misses should be avoided.
+So if we can use this feature, many HPC applications will
+benefit.
+
+In this release there is a configuration option to support this
+feature on the Alpha architecture. You can turn on this feature by
+enabling the CONFIG_SUPER_PAGE option. With this release, GH bits are set in
+your page table when you call the brk system call or the mmap system call with
+the MAP_PRIVATE flag set. So if you want your program to run at
+full speed, you should allocate memory with these calls (or, more
+generally, you can use the malloc library call).
+The GH bits are only set at heap allocation time; after that, if any process
+changes the table, the bits are dropped for safety. You may want to
+disable swapping for sustained high-performance operation.
+
+
+Naohiko Shimizu<nshimizu@keyaki.cc.u-tokai.ac.jp>
diff -urN linux-2.4.18/arch/alpha/config.in linux/arch/alpha/config.in
--- linux-2.4.18/arch/alpha/config.in Wed Nov 21 08:49:31 2001
+++ linux/arch/alpha/config.in Fri Jun 28 16:26:11 2002
@@ -231,6 +231,8 @@
# LARGE_VMALLOC is racy, if you *really* need it then fix it first
define_bool CONFIG_ALPHA_LARGE_VMALLOC n
+bool 'SuperPage Support' CONFIG_SUPER_PAGE
+
source drivers/pci/Config.in
bool 'Support for hot-pluggable devices' CONFIG_HOTPLUG
diff -urN linux-2.4.18/arch/alpha/mm/init.c linux/arch/alpha/mm/init.c
--- linux-2.4.18/arch/alpha/mm/init.c Fri Sep 21 12:02:03 2001
+++ linux/arch/alpha/mm/init.c Fri Jun 28 14:40:25 2002
@@ -5,6 +5,7 @@
*/
/* 2.3.x zone allocator, 1999 Andrea Arcangeli <andrea@suse.de> */
+/* SUPER_PAGE support, 2000 Naohiko Shimizu <nshimizu@keyaki.cc.u-tokai.ac.jp> */
#include <linux/config.h>
#include <linux/signal.h>
@@ -44,6 +45,11 @@
#ifndef CONFIG_SMP
struct pgtable_cache_struct quicklists;
+#endif
+
+#ifdef CONFIG_SUPER_PAGE
+int super_page_order[] = {0,3,6,9};
+pgprot_t super_page_prot[] = {0x0000,0x0020,0x0040,0x0060};
#endif
pgd_t *
diff -urN linux-2.4.18/include/asm-alpha/pgtable.h linux/include/asm-alpha/pgtable.h
--- linux-2.4.18/include/asm-alpha/pgtable.h Fri Oct 26 06:00:32 2001
+++ linux/include/asm-alpha/pgtable.h Sat Jul 6 16:47:16 2002
@@ -19,7 +19,10 @@
* within a page table are directly modified. Thus, the following
* hook is made available.
*/
-#define set_pte(pteptr, pteval) ((*(pteptr)) = (pteval))
+/*
+ * SUPER_PAGE support added by Naohiko Shimizu
+ <nshimizu@keyaki.cc.u-tokai.ac.jp>
+ */
/* PMD_SHIFT determines the size of the area a second-level page table can map */
#define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3))
@@ -254,9 +257,7 @@
extern inline unsigned long pgd_page(pgd_t pgd)
{ return PAGE_OFFSET + ((pgd_val(pgd) & _PFN_MASK) >> (32-PAGE_SHIFT)); }
-extern inline int pte_none(pte_t pte) { return !pte_val(pte); }
extern inline int pte_present(pte_t pte) { return pte_val(pte) & _PAGE_VALID; }
-extern inline void pte_clear(pte_t *ptep) { pte_val(*ptep) = 0; }
extern inline int pmd_none(pmd_t pmd) { return !pmd_val(pmd); }
extern inline int pmd_bad(pmd_t pmd) { return (pmd_val(pmd) & ~_PFN_MASK) != _PAGE_TABLE; }
@@ -269,6 +270,58 @@
extern inline void pgd_clear(pgd_t * pgdp) { pgd_val(*pgdp) = 0; }
#define page_address(page) ((page)->virtual)
+
+#ifndef CONFIG_SUPER_PAGE
+#define set_pte(pteptr, pteval) ((*(pteptr)) = (pteval))
+extern inline int pte_none(pte_t pte) { return !(pte_val(pte)); }
+extern inline void pte_clear(pte_t *ptep) { pte_val(*ptep)=0; }
+#else
+#define SUPER_PAGE_MASK 0x0060
+#define SUPER_PAGE_MASK_SHIFT 5
+#define SUPER_PAGE_NR 4
+static inline void flush_tlb(void);
+extern int super_page_order[];
+extern pgprot_t super_page_prot[];
+extern inline int pte_none(pte_t pte) { return !(pte_val(pte) & ~SUPER_PAGE_MASK); }
+#define pte_to_sp_index(x) ((pte_val(x) & SUPER_PAGE_MASK) >> SUPER_PAGE_MASK_SHIFT)
+extern inline pte_t mk_pte_sp_clean(pte_t pte) {pte_val(pte) &= ~SUPER_PAGE_MASK; return pte;}
+extern inline void down_pte_sp(pte_t *pteptr, int index) {
+ int i,order;
+ pte_t *addr;
+ order = super_page_order[index];
+ addr = (pte_t *)((unsigned long) pteptr & ~((1UL<<(order + SIZEOF_PTR_LOG2)) - 1));
+ for ( i=0; i < 1<<order; i++) {
+ pte_val(*(addr+i)) = (pte_val(*(addr+i)) & ~SUPER_PAGE_MASK) | pgprot_val(super_page_prot[index -1]);
+ }
+ }
+extern inline void clear_pte_sp(pte_t *pteptr, int index) {
+ int i,order;
+ pte_t *addr;
+ order = super_page_order[index];
+ addr = (pte_t *)((unsigned long) pteptr & ~((1UL<<(order + SIZEOF_PTR_LOG2)) - 1));
+ for ( i=0; i < 1<<order; i++) {
+ pte_val(*(addr+i)) &= ~SUPER_PAGE_MASK;
+ }
+ }
+extern inline void set_pte_raw(pte_t *pteptr, pte_t pteval) {
+ int super_page=0;
+ retry:
+ if ( pte_present(*pteptr) && ( pte_val(*pteptr) & SUPER_PAGE_MASK )) {
+ down_pte_sp(pteptr, pte_to_sp_index(*pteptr));
+ super_page++;
+ goto retry;
+ }
+ if(super_page) flush_tlb();
+ *pteptr = pteval;
+}
+extern inline void set_pte(pte_t *pteptr, pte_t pteval) {
+ if ( pte_none(*pteptr) && ( pte_val(*pteptr) & SUPER_PAGE_MASK )) {
+ clear_pte_sp(pteptr, pte_to_sp_index(*pteptr));
+ }
+ set_pte_raw(pteptr, mk_pte_sp_clean(pteval));
+}
+extern inline void pte_clear(pte_t *ptep) { pte_t pte; pte_val(pte)=0; set_pte(ptep, pte); }
+#endif
/*
* The following only work if pte_present() is true.
diff -urN linux-2.4.18/include/linux/mm.h linux/include/linux/mm.h
--- linux-2.4.18/include/linux/mm.h Sat Dec 22 02:42:03 2001
+++ linux/include/linux/mm.h Sat Jul 6 16:47:16 2002
@@ -336,6 +336,15 @@
#define PageHighMem(page) 0 /* needed to optimize away at compile time */
#endif
+#ifndef CONFIG_SUPER_PAGE
+#define SUPER_PAGE_NR 1
+#define SUPER_PAGE_MASK (pgprot_t)0
+#define pte_to_sp_index(x) (0)
+#define down_pte_sp(pte, order)
+#define set_pte_raw(ptep, pte) set_pte(ptep, pte)
+#endif
+
+
#define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags)
#define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags)
@@ -414,6 +423,7 @@
extern pte_t *FASTCALL(pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
extern int make_pages_present(unsigned long addr, unsigned long end);
+extern int make_ptes_present(unsigned long addr, unsigned long end);
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char *dst, int len);
extern int ptrace_writedata(struct task_struct *tsk, char * src, unsigned long dst, int len);
@@ -448,6 +458,12 @@
extern void show_mem(void);
extern void si_meminfo(struct sysinfo * val);
extern void swapin_readahead(swp_entry_t);
+
+extern void __break_area (struct page *page, unsigned long order);
+static inline void break_area(struct page *page, unsigned long order)
+{
+ __break_area(page, order);
+}
extern struct address_space swapper_space;
#define PageSwapCache(page) ((page)->mapping == &swapper_space)
diff -urN linux-2.4.18/mm/memory.c linux/mm/memory.c
--- linux-2.4.18/mm/memory.c Tue Feb 26 04:38:13 2002
+++ linux/mm/memory.c Sun Jul 7 12:50:38 2002
@@ -34,6 +34,8 @@
*
* 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
* (Gerhard.Wichert@pdb.siemens.de)
+ * 13.06.00 - Support of SUPER_PAGE added by Naohiko Shimizu, Tokai Univ.
+ <nshimizu@keyaki.cc.u-tokai.ac.jp>
*/
#include <linux/mm.h>
@@ -54,6 +56,11 @@
void * high_memory;
struct page *highmem_start_page;
+#ifndef CONFIG_SUPER_PAGE
+int super_page_order[] = {0};
+pgprot_t super_page_prot[] = {0x00};
+#endif
+
/*
* We special-case the C-O-W ZERO_PAGE, because it's such
* a common occurrence (no need to read the page to know
@@ -310,7 +317,14 @@
for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) {
pte_t pte = *ptep;
if (pte_none(pte))
- continue;
+ {
+ if(pte_val(pte)&SUPER_PAGE_MASK) {
+ struct mm_struct *mm = current->mm;
+ spin_lock(&mm->page_table_lock);
+ pte_clear(ptep);
+ spin_unlock(&mm->page_table_lock);
+ }
+ continue; }
if (pte_present(pte)) {
struct page *page = pte_page(pte);
if (VALID_PAGE(page) && !PageReserved(page))
@@ -1184,34 +1198,78 @@
static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
{
pte_t entry;
-
+ int i;
+ unsigned long spaddr;
+ pte_t oldpte;
+ int order;
+ pte_t *wktable;
+retry:
/* Read-only mapping of ZERO_PAGE. */
entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
/* ..except if it's a write access */
if (write_access) {
struct page *page;
+ oldpte = *page_table;
+ order = super_page_order[pte_to_sp_index(oldpte)];
+ wktable =
+ (pte_t *)((unsigned long)page_table &
+ ~((1UL << (order + SIZEOF_PTR_LOG2)) -1));
+
+ for (i=0; i < 1 << order; i++) {
+ if(!pte_none(*(wktable+i))) {
+ down_pte_sp(page_table, pte_to_sp_index(oldpte));
+ goto retry;
+ }
+ }
/* Allocate our own private page. */
spin_unlock(&mm->page_table_lock);
- page = alloc_page(GFP_HIGHUSER);
- if (!page)
- goto no_mem;
- clear_user_highpage(page, addr);
-
+ page = alloc_pages(GFP_HIGHUSER, order);
+ if (!page) {
+ if (order) {
+ down_pte_sp(page_table, pte_to_sp_index(oldpte));
+ goto retry;
+ }
+ else goto no_mem;
+ }
+ spaddr = addr & ~((PAGE_SIZE << order) - 1);
+ for (i=0; i < 1 << order; i++) {
+ clear_user_highpage(page+i, spaddr);
+ spaddr += PAGE_SIZE;
+ }
+ if (order) {
+ break_area(page, order);
+ spin_lock(&mm->page_table_lock);
+ spaddr = addr & ~((PAGE_SIZE << order) - 1);
+ for (i=0; i < 1 << order; i++) {
+ entry = pte_mkwrite(pte_mkdirty(
+ mk_pte(page+i, __pgprot(pgprot_val(vma->vm_page_prot)|
+ pgprot_val(super_page_prot[pte_to_sp_index(oldpte)])))));
+ mm->rss++;
+ flush_page_to_ram(page+i);
+ lru_cache_add(page+i);
+ mark_page_accessed(page+i);
+ set_pte_raw(wktable+i, entry);
+ spaddr += PAGE_SIZE;
+ }
+ } else{
spin_lock(&mm->page_table_lock);
if (!pte_none(*page_table)) {
page_cache_release(page);
spin_unlock(&mm->page_table_lock);
return 1;
}
- mm->rss++;
- flush_page_to_ram(page);
- entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+ mm->rss++;
+ flush_page_to_ram(page);
+ entry = pte_mkwrite(pte_mkdirty(
+ mk_pte(page, vma->vm_page_prot)));
lru_cache_add(page);
mark_page_accessed(page);
- }
+ set_pte(page_table, entry);
+ }
+ } else
set_pte(page_table, entry);
@@ -1467,3 +1525,67 @@
len, write, 0, NULL, NULL);
return ret == len ? 0 : -1;
}
+ /*
+ * Allocating PTEs for future fault handling.
+ */
+
+ int set_sp_range(unsigned long address, int order, pgprot_t prot)
+ {
+ int i;
+ pgd_t * dir;
+ pmd_t *pmd;
+ pte_t * pte;
+
+ dir = pgd_offset(current->mm, address);
+ pmd = pmd_alloc(current->mm, dir, address);
+ if (!pmd) return -ENOMEM;
+ address &= ~PGDIR_MASK;
+ pte = pte_alloc(current->mm, pmd, address);
+ if (!pte)
+ return -ENOMEM;
+ for (i = 0; i < 1<<order; i ++)
+ if(!pte_none(*(pte+i))) return 0;
+ for (i = 0; i < 1<<order; i ++) {
+ set_pte_raw(pte, pte_modify(*pte, prot));
+ pte++;
+ }
+ return 0;
+ }
+
+ /*
+ * Simplistic new page table allocation for sys_brk..
+ * Only GH bit != 0 tables will be allocated.
+ * At this time, we will not allocate real storage, it remains
+ * for the page_fault handler.
+ */
+ int make_ptes_present(unsigned long addr, unsigned long end)
+ {
+ int i;
+ unsigned long rem;
+ if (addr >= end)
+ BUG();
+ /*
+ * The first order (i=0) is the ordinary PTE (1 page), so we skip
+ * setting up PTEs for it.
+ */
+ for (i = 0; i < SUPER_PAGE_NR - 1; i++) {
+ rem = (~addr + 1) & ((PAGE_SIZE << super_page_order[i+1]) - 1);
+ while (rem &&
+ (addr & ((PAGE_SIZE << super_page_order[i]) - 1)) == 0UL &&
+ ((end - addr ) >= (PAGE_SIZE << super_page_order[i]))) {
+ if(i) set_sp_range(addr, super_page_order[i], super_page_prot[i]);
+ addr += PAGE_SIZE << super_page_order[i];
+ rem -= PAGE_SIZE << super_page_order[i];
+ };
+ }
+ for (i = SUPER_PAGE_NR - 1; i > 0; i--) {
+ while (
+ (addr & ((PAGE_SIZE << super_page_order[i]) - 1)) == 0UL &&
+ ((end - addr ) >= (PAGE_SIZE << super_page_order[i]))) {
+ set_sp_range(addr, super_page_order[i], super_page_prot[i]);
+ addr += PAGE_SIZE << super_page_order[i];
+ };
+ }
+ return 0;
+ }
+
diff -urN linux-2.4.18/mm/mmap.c linux/mm/mmap.c
--- linux-2.4.18/mm/mmap.c Tue Feb 26 04:38:14 2002
+++ linux/mm/mmap.c Sat Jun 29 11:39:19 2002
@@ -560,6 +560,17 @@
mm->locked_vm += len >> PAGE_SHIFT;
make_pages_present(addr, addr + len);
}
+ /*
+ * mmap may provide new pages, giving us the chance to set the
+ * SUPER_PAGE bits in the TLB entries, so we call make_ptes_present.
+ * The MAP_PRIVATE flag tells us
+ * that it is a plain allocation.
+ * <N.Shimizu>
+ */
+#ifdef CONFIG_SUPER_PAGE
+ if (vm_flags & MAP_PRIVATE)
+ make_ptes_present(addr, addr + len);
+#endif
return addr;
unmap_and_free_vma:
@@ -1080,6 +1091,14 @@
mm->locked_vm += len >> PAGE_SHIFT;
make_pages_present(addr, addr + len);
}
+ /*
+ * brk provides new pages, giving us the chance to set the
+ * SUPER_PAGE bits for the TLB, so we call make_ptes_present.
+ * <N.Shimizu>
+ */
+#ifdef CONFIG_SUPER_PAGE
+ make_ptes_present(addr, addr + len);
+#endif
return addr;
}
diff -urN linux-2.4.18/mm/page_alloc.c linux/mm/page_alloc.c
--- linux-2.4.18/mm/page_alloc.c Tue Feb 26 04:38:14 2002
+++ linux/mm/page_alloc.c Fri Jun 28 15:52:12 2002
@@ -7,6 +7,7 @@
* Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
* Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
* Zone balancing, Kanoj Sarcar, SGI, Jan 2000
+ * Page Granularity Hint support, Naohiko Shimizu, Tokai Univ., Jul 2000
*/
#include <linux/config.h>
@@ -175,6 +176,17 @@
if (BAD_RANGE(zone,page))
BUG();
return page;
+}
+
+void __break_area (struct page *page, unsigned long order) {
+ int i;
+ unsigned long flags;
+ unsigned long size = 1 << order;
+
+ for ( i = 0; i < size; i++ ) {
+ set_page_count(page + i, 1);
+ }
+ return;
}
static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
/*
This program is for testing large stride data transfer
I put this program under the GPL.
Contact:Naohiko Shimizu,
School of Engineering, Tokai University.
1117 Kitakaname, Kanagawa 259-12 Japan
email:nshimizu@keyaki.cc.u-tokai.ac.jp
TEL: +81-463-58-1211(ext.4084)
FAX: +81-463-58-8320
<URL:http://shimizu-lab.et.u-tokai.ac.jp/>
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <string.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/resource.h>
/* Edge length of the square matrices; override with -DDIMENSION=<n>. */
#ifndef DIMENSION
#define DIMENSION 1000
#endif
/*
 * Number of full-matrix copy passes per timed section.  It shrinks as
 * DIMENSION grows so that ITR * DIMENSION * DIMENSION stays near
 * 10 * 4000 * 4000 elements.
 *
 * The expansion and each DIMENSION use are parenthesized so the macro keeps
 * its value wherever it appears (e.g. `1000 / ITR`) and when DIMENSION is
 * itself defined as an expression.
 */
#define ITR (10*4000*4000/((DIMENSION)*(DIMENSION)))
/* Destination (a) and source (b) matrices for the transposing copy. */
double a[DIMENSION][DIMENSION];
double b[DIMENSION][DIMENSION];
/* Resource-usage record; refreshed by every call to dtime(). */
struct rusage rusage;

/*
 * Query getrusage(RUSAGE_SELF) and return the ru_utime field (user CPU time
 * of the calling process) converted to seconds as a double.
 */
double dtime()
{
    getrusage(RUSAGE_SELF, &rusage);
    return (double)(rusage.ru_utime.tv_sec)
         + (double)(rusage.ru_utime.tv_usec) * 1.0e-06;
}
/*
 * Strided-copy benchmark driver.
 *
 * Copies matrix b into a transposed (a[j][i] = b[i][j]) ITR times with each
 * of the two possible loop nestings, timing each variant with dtime(), and
 * prints one line: the dimension followed by the two transfer rates in
 * units of 1e6 bytes per second (strided-store variant first, strided-load
 * variant second).  argc and argv are unused.
 */
int main(int argc, char **argv)
{
    int dim = DIMENSION;
    int row, col, pass;
    double t_begin, t_end;
    double store_secs, load_secs;
    double mbytes;

    /* Zero both matrices. */
    for (row = 0; row < dim; row++) {
        for (col = 0; col < dim; col++) {
            a[col][row] = b[row][col] = 0.0;
        }
    }

    /* Variant 1: the inner loop advances a's first index (strided stores). */
    t_begin = dtime();
    for (pass = 0; pass < ITR; pass++) {
        for (row = 0; row < dim; row++) {
            for (col = 0; col < dim; col++) {
                a[col][row] = b[row][col];
            }
        }
    }
    t_end = dtime();
    store_secs = t_end - t_begin;

    /* Variant 2: the inner loop advances b's first index (strided loads). */
    t_begin = dtime();
    for (pass = 0; pass < ITR; pass++) {
        for (col = 0; col < dim; col++) {
            for (row = 0; row < dim; row++) {
                a[col][row] = b[row][col];
            }
        }
    }
    t_end = dtime();
    load_secs = t_end - t_begin;

    /* Data volume moved per timed section, in units of 1e6 bytes. */
    mbytes = ITR*dim*dim*sizeof(double)*1e-6;
    printf("%d %5.2f %5.2f\n", dim, mbytes/store_secs, mbytes/load_secs);
    return 0;
}