| From: |
| Dave Hansen <haveblue@us.ibm.com> |
| To: |
| "Martin J. Bligh" <mbligh@aracnet.com> |
| Subject: |
| [Lse-tech] [RFC] NUMA replication of user data |
| Date: |
| 13 Aug 2003 16:43:24 -0700 |
| Cc: |
| Zwane Mwaikambo <zwane@linuxpower.ca>,
LSE <lse-tech@lists.sourceforge.net> |
After my last patch to replicate kernel text on x86 NUMA
(http://lwn.net/Articles/36602/), I've turned to user text: executables
and libraries. This is significantly more invasive with significantly
more code, but the performance results are quite promising, plus it's
arch-independent.
DISCLAIMER: SPEC(tm) and the benchmark name SDET(tm) are registered
trademarks of the Standard Performance Evaluation Corporation. This
benchmarking was performed for research purposes only, and the run
results are non-compliant and not-comparable with any published results.
SDET Average Throughput (NUMA-Q):
2.6.0-test3 100.0%
2.6.0-test3+urepl 143.1%
SDET Average Throughput (16-way P4):
2.6.0-test3 100.0%
2.6.0-test3+urepl 108.8%
The patch isn't really just user text. It targets anything that's
mmap'd with the MAP_PRIVATE flag. What uses that flag? Any executable
or shared library is mapped that way. Plus, executables already have a
mechanism to deny writes to the file while it's open. I hijack that
mechanism a bit to somewhat cover everything that I try and replicate.
These mechanisms could conceivably cover just about any data which is
read-only.
This is still pretty experimental, so don't give it to your bank or
anything. I've lightly corrupted data playing with it, although not in
at least a week :)
--
Dave Hansen
haveblue@us.ibm.com
diff -urp linux-2.6.0-test3-clean/drivers/serial/8250.c linux-2.6.0-test3-textrepl/drivers/serial/8250.c
--- linux-2.6.0-test3-clean/drivers/serial/8250.c Sun Jul 27 09:56:59 2003
+++ linux-2.6.0-test3-textrepl/drivers/serial/8250.c Mon Aug 11 20:17:37 2003
@@ -1983,7 +1983,7 @@ static struct console serial8250_console
.data = &serial8250_reg,
};
-static int __init serial8250_console_init(void)
+int __init serial8250_console_init(void)
{
serial8250_isa_init_ports();
register_console(&serial8250_console);
diff -urp linux-2.6.0-test3-clean/fs/fs-writeback.c linux-2.6.0-test3-textrepl/fs/fs-writeback.c
--- linux-2.6.0-test3-clean/fs/fs-writeback.c Sun Jul 27 10:12:08 2003
+++ linux-2.6.0-test3-textrepl/fs/fs-writeback.c Wed Aug 13 02:46:45 2003
@@ -100,7 +100,11 @@ void __mark_inode_dirty(struct inode *in
*/
if (!was_dirty) {
mapping->dirtied_when = jiffies|1; /* 0 is special */
- list_move(&inode->i_list, &sb->s_dirty);
+ /* Replication does not need to be collapsed here
+ * This gets called when things like update_atime()
+ * occur, and those don't touch anything but metadata
+ * and replication doesn't interfere with that
+ */
}
}
out:
diff -urp linux-2.6.0-test3-clean/fs/inode.c linux-2.6.0-test3-textrepl/fs/inode.c
--- linux-2.6.0-test3-clean/fs/inode.c Sat Aug 9 13:09:03 2003
+++ linux-2.6.0-test3-textrepl/fs/inode.c Mon Aug 11 20:28:30 2003
@@ -182,6 +182,8 @@ void inode_init_once(struct inode *inode
INIT_LIST_HEAD(&inode->i_devices);
sema_init(&inode->i_sem, 1);
INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
+ inode->i_data.replicate = 0;
+ spin_lock_init(&inode->i_data.replication_lock);
spin_lock_init(&inode->i_data.page_lock);
init_MUTEX(&inode->i_data.i_shared_sem);
atomic_set(&inode->i_data.truncate_count, 0);
diff -urp linux-2.6.0-test3-clean/fs/namei.c linux-2.6.0-test3-textrepl/fs/namei.c
--- linux-2.6.0-test3-clean/fs/namei.c Sat Aug 9 13:09:03 2003
+++ linux-2.6.0-test3-textrepl/fs/namei.c Wed Aug 13 03:42:26 2003
@@ -240,15 +240,40 @@ int permission(struct inode * inode,int
* who will try to move it in struct inode - just leave it here.
*/
static spinlock_t arbitration_lock = SPIN_LOCK_UNLOCKED;
+/*
+ * Helper for get_write_access(), which calls it with arbitration_lock
+ * held after seeing a negative i_writecount.
+ *
+ * If the mapping is not being replicated, the refusal has some other
+ * cause: return 0 and leave arbitration_lock held.
+ *
+ * If the mapping is being replicated, drop arbitration_lock, collapse
+ * the replication under replication_lock, reset ->replicate to 0, bump
+ * i_writecount once and return 1.  The caller is expected to take
+ * arbitration_lock again and retry.
+ */
+static inline int inode_try_replication_disable(struct inode *inode)
+{
+	struct address_space *as = inode->i_mapping;
+
+	if (!mapping_replicate(as))
+		return 0;
+
+	spin_unlock(&arbitration_lock);
+	printk("get_write_access() trying to collapse replication\n");
+	dump_stack();
+
+	spin_lock(&as->replication_lock);
+	collapse_replication(as, NULL);
+	as->replicate = 0;
+	atomic_inc(&inode->i_writecount);
+	spin_unlock(&as->replication_lock);
+
+	return 1;
+}
int get_write_access(struct inode * inode)
{
+retry:
spin_lock(&arbitration_lock);
if (atomic_read(&inode->i_writecount) < 0) {
+ if (inode_try_replication_disable(inode))
+ goto retry;
spin_unlock(&arbitration_lock);
return -ETXTBSY;
}
atomic_inc(&inode->i_writecount);
spin_unlock(&arbitration_lock);
+ inode->i_mapping->replicate--;
return 0;
}
int deny_write_access(struct file * file)
diff -urp linux-2.6.0-test3-clean/fs/proc/task_mmu.c linux-2.6.0-test3-textrepl/fs/proc/task_mmu.c
--- linux-2.6.0-test3-clean/fs/proc/task_mmu.c Sun Jul 27 09:57:47 2003
+++ linux-2.6.0-test3-textrepl/fs/proc/task_mmu.c Sat Aug 9 20:21:59 2003
@@ -83,20 +83,27 @@ static int show_map(struct seq_file *m,
unsigned long ino = 0;
dev_t dev = 0;
int len;
+ struct address_space *mapping = NULL;
if (file) {
struct inode *inode = map->vm_file->f_dentry->d_inode;
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
+ mapping = inode->i_mapping;
}
- seq_printf(m, "%0*lx-%0*lx %c%c%c%c %0*lx %02x:%02x %lu %n",
+ seq_printf(m, "%0*lx-%0*lx %c%c%c|%c%c|%c|%c%c|deny:%c %0*lx %02x:%02x %lu %n",
(int) (2*sizeof(void*)), map->vm_start,
(int) (2*sizeof(void*)), map->vm_end,
flags & VM_READ ? 'r' : '-',
flags & VM_WRITE ? 'w' : '-',
flags & VM_EXEC ? 'x' : '-',
flags & VM_MAYSHARE ? 's' : 'p',
+ flags & VM_SHARED ? 's' : 'p',
+ flags & VM_MAYREAD ? 'r' : '-',
+ flags & VM_MAYWRITE ? 'w' : '-',
+ flags & VM_DENYWRITE ? 'd' : '-',
+ mapping ? (mapping->replicate+'0') : '_',
(int) (2*sizeof(void*)), map->vm_pgoff << PAGE_SHIFT,
MAJOR(dev), MINOR(dev), ino, &len);
diff -urp linux-2.6.0-test3-clean/include/asm-i386/mmzone.h linux-2.6.0-test3-textrepl/include/asm-i386/mmzone.h
--- linux-2.6.0-test3-clean/include/asm-i386/mmzone.h Sun Jul 27 10:02:04 2003
+++ linux-2.6.0-test3-textrepl/include/asm-i386/mmzone.h Wed Aug 13 04:23:50 2003
@@ -83,6 +83,8 @@ extern struct pglist_data *node_data[];
(unsigned long)(__page - __zone->zone_mem_map) \
+ __zone->zone_start_pfn; \
})
+#define page_to_nid(page) pfn_to_nid(page_to_pfn(page)) /* node id that pfn_to_nid() reports for this page's pfn */
+#define page_is_local(page) (page_to_nid(page) == numa_node_id()) /* non-zero when that node id equals numa_node_id() */
#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
/*
* pfn_valid should be made as fast as possible, and the current definition
diff -urp linux-2.6.0-test3-clean/include/linux/fs.h linux-2.6.0-test3-textrepl/include/linux/fs.h
--- linux-2.6.0-test3-clean/include/linux/fs.h Sat Aug 9 13:09:04 2003
+++ linux-2.6.0-test3-textrepl/include/linux/fs.h Wed Aug 13 04:51:36 2003
@@ -315,6 +315,10 @@ struct backing_dev_info;
struct address_space {
struct inode *host; /* owner: inode, block_device */
struct radix_tree_root page_tree; /* radix tree of all pages */
+ int replicate;
+ spinlock_t replication_lock; /* gets rid of races while
+ turning replication on
+ and off */
spinlock_t page_lock; /* and spinlock protecting it */
struct list_head clean_pages; /* list of clean pages */
struct list_head dirty_pages; /* list of dirty pages */
@@ -334,6 +338,8 @@ struct address_space {
struct address_space *assoc_mapping; /* ditto */
};
+#define mapping_replicate(mapping) ((mapping)->replicate > 0) /* true only for a positive ->replicate; zero and negative values both read as "not replicating" */
+
struct block_device {
struct list_head bd_hash;
atomic_t bd_count;
@@ -1177,6 +1183,8 @@ unsigned long invalidate_mapping_pages(s
unsigned long invalidate_inode_pages(struct address_space *mapping);
extern void invalidate_inode_pages2(struct address_space *mapping);
extern void write_inode_now(struct inode *, int);
+extern int collapse_replication(struct address_space *mapping,
+ struct file *file);
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_flush(struct address_space *);
extern int filemap_fdatawait(struct address_space *);
@@ -1195,6 +1203,28 @@ extern int get_write_access(struct inode
extern int deny_write_access(struct file *);
static inline void put_write_access(struct inode * inode)
{
+ /* you can reenable replication here
+ if ((atomic_read(&inode->i_writecount) == 1) && // if this is the last
+ // write access
+ (inode->i_mapping->replicate < 0)) { // and this was previously
+ // disabled
+ struct file *f;
+ file_list_lock();
+ list_for_each_entry(f, &inode->i_sb->s_files, f_list) {
+ if (f && f->f_dentry && f->f_dentry->d_inode &&
+ f->f_dentry->d_inode == inode) {
+ char buf[100];
+ file_list_unlock();
+ printk("might have had chance to redo replication on: ");
+ printk ("%s\n", d_path(f->f_dentry,
+ f->f_vfsmnt, &buf[0], 99));
+ goto out;
+ }
+ }
+ file_list_unlock();
+ }
+ inode->i_mapping->replicate++;
+out:*/
atomic_dec(&inode->i_writecount);
}
static inline void allow_write_access(struct file *file)
diff -urp linux-2.6.0-test3-clean/include/linux/gfp.h linux-2.6.0-test3-textrepl/include/linux/gfp.h
--- linux-2.6.0-test3-clean/include/linux/gfp.h Sun Jul 27 09:59:24 2003
+++ linux-2.6.0-test3-textrepl/include/linux/gfp.h Wed Aug 13 03:11:05 2003
@@ -32,6 +32,7 @@
#define __GFP_NOFAIL 0x800 /* Retry for ever. Cannot fail */
#define __GFP_NORETRY 0x1000 /* Do not retry. Might fail */
#define __GFP_NO_GROW 0x2000 /* Slab internal usage */
+#define __GFP_NODE_STRICT 0x4000 /* Do not fall back to other nodes */
#define GFP_ATOMIC (__GFP_HIGH)
#define GFP_NOIO (__GFP_WAIT)
diff -urp linux-2.6.0-test3-clean/include/linux/pagemap.h linux-2.6.0-test3-textrepl/include/linux/pagemap.h
--- linux-2.6.0-test3-clean/include/linux/pagemap.h Sun Jul 27 09:56:03 2003
+++ linux-2.6.0-test3-textrepl/include/linux/pagemap.h Wed Aug 13 03:05:20 2003
@@ -73,6 +73,7 @@ int add_to_page_cache_lru(struct page *p
unsigned long index, int gfp_mask);
extern void remove_from_page_cache(struct page *page);
extern void __remove_from_page_cache(struct page *page);
+extern struct page* __page_cache_lookup(struct address_space *mapping, pgoff_t offset);
extern atomic_t nr_pagecache;
diff -urp linux-2.6.0-test3-clean/include/linux/radix-tree.h linux-2.6.0-test3-textrepl/include/linux/radix-tree.h
--- linux-2.6.0-test3-clean/include/linux/radix-tree.h Sun Jul 27 09:59:36 2003
+++ linux-2.6.0-test3-textrepl/include/linux/radix-tree.h Sat Aug 9 20:21:59 2003
@@ -41,7 +41,7 @@ do { \
(root)->rnode = NULL; \
} while (0)
-extern int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+extern void *radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
extern void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
extern void *radix_tree_delete(struct radix_tree_root *, unsigned long);
extern unsigned int
diff -urp linux-2.6.0-test3-clean/init/main.c linux-2.6.0-test3-textrepl/init/main.c
--- linux-2.6.0-test3-clean/init/main.c Sun Jul 27 09:57:49 2003
+++ linux-2.6.0-test3-textrepl/init/main.c Wed Aug 13 03:39:18 2003
@@ -81,6 +81,7 @@ extern void pidhash_init(void);
extern void pidmap_init(void);
extern void pte_chain_init(void);
extern void radix_tree_init(void);
+extern void page_cache_leaf_init(void);
extern void free_initmem(void);
extern void populate_rootfs(void);
extern void driver_init(void);
@@ -442,6 +443,7 @@ asmlinkage void __init start_kernel(void
security_scaffolding_startup();
vfs_caches_init(num_physpages);
radix_tree_init();
+ page_cache_leaf_init();
signals_init();
/* rootfs populating might need page-writeback */
page_writeback_init();
diff -urp linux-2.6.0-test3-clean/lib/radix-tree.c linux-2.6.0-test3-textrepl/lib/radix-tree.c
--- linux-2.6.0-test3-clean/lib/radix-tree.c Sun Jul 27 09:58:50 2003
+++ linux-2.6.0-test3-textrepl/lib/radix-tree.c Wed Aug 13 03:07:31 2003
@@ -18,6 +18,7 @@
*/
#include <linux/errno.h>
+#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -168,8 +169,11 @@ static int radix_tree_extend(struct radi
* @item: item to insert
*
* Insert an item into the radix tree at position @index.
+ *
+ * If the insertion fails because a duplicate element is present,
+ * return that element.
*/
-int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item)
+void *radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item)
{
struct radix_tree_node *node = NULL, *tmp, **slot;
unsigned int height, shift;
@@ -179,7 +183,7 @@ int radix_tree_insert(struct radix_tree_
if (index > radix_tree_maxindex(root->height)) {
error = radix_tree_extend(root, index);
if (error)
- return error;
+ return ERR_PTR(error);
}
slot = &root->rnode;
@@ -190,7 +194,7 @@ int radix_tree_insert(struct radix_tree_
if (*slot == NULL) {
/* Have to add a child node. */
if (!(tmp = radix_tree_node_alloc(root)))
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
*slot = tmp;
if (node)
node->count++;
@@ -205,7 +209,7 @@ int radix_tree_insert(struct radix_tree_
}
if (*slot != NULL)
- return -EEXIST;
+ return *slot; //-EEXIST;
if (node)
node->count++;
diff -urp linux-2.6.0-test3-clean/mm/filemap.c linux-2.6.0-test3-textrepl/mm/filemap.c
--- linux-2.6.0-test3-clean/mm/filemap.c Sat Aug 9 13:09:04 2003
+++ linux-2.6.0-test3-textrepl/mm/filemap.c Wed Aug 13 04:47:03 2003
@@ -10,6 +10,7 @@
* the NFS filesystem used to do this differently, for example)
*/
#include <linux/config.h>
+#include <linux/err.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/compiler.h>
@@ -72,6 +73,150 @@
* ->page_lock (try_to_unmap_one)
*/
+struct page_cache_leaf { /* item stored in the mapping's radix tree for one offset: one page slot per node */
+ struct page* pages[MAX_NR_NODES]; /* indexed by node id; NULL where that node has no page for this offset */
+ int count; /* number of filled slots in pages[]; the leaf is removed from the tree when it drops to 0 */
+ /* The duplicate_lock is not here to prevent any harmful races; it
+ * keeps collision overhead to a minimum.
+ *
+ * When 2 CPUs on the same node get into find_get_page() together, they
+ * can both try to make a copy at the same time. One is bound to get
+ * -EEXIST and back off properly, but copying that page is expensive.
+ * Better to just spin here and wait for it to happen.
+ *
+ * This lock could be per-node.
+ */
+ spinlock_t duplicate_lock;
+};
+static kmem_cache_t *page_cache_leaf_cachep; /* slab cache that page_cache_leaf objects are allocated from */
+
+/*
+ * Slab constructor for page_cache_leaf objects: clear the whole leaf
+ * (every page slot NULL, count 0) and initialise its duplicate_lock.
+ * The cachep and flags arguments are not used.
+ */
+void page_cache_leaf_ctor(void *node, kmem_cache_t *cachep, unsigned long flags)
+{
+	struct page_cache_leaf *fresh = node;
+
+	memset(fresh, 0, sizeof(*fresh));
+	spin_lock_init(&fresh->duplicate_lock);
+}
+
+DEFINE_PER_CPU(struct page_cache_leaf *, page_cache_leaf_preloads) = { 0, };
+/*
+ * Make sure the current CPU's page_cache_leaf_preloads slot holds a
+ * spare leaf, allocating one with gfp_mask if the slot is empty.
+ *
+ * Returns 0 when the slot is filled on exit, -ENOMEM when the
+ * allocation failed.  The per-CPU reference is dropped again before
+ * returning, so this function does not itself keep the caller on the
+ * CPU whose slot was filled.
+ */
+int page_cache_leaf_preload(int gfp_mask)
+{
+	struct page_cache_leaf **slot = &get_cpu_var(page_cache_leaf_preloads);
+	int ret = 0;
+
+	if (*slot == NULL) {
+		*slot = kmem_cache_alloc(page_cache_leaf_cachep, gfp_mask);
+		if (*slot == NULL)
+			ret = -ENOMEM;
+	}
+	put_cpu_var(page_cache_leaf_preloads);
+
+	return ret;
+}
+
+/*
+ * Boot-time setup, called once from start_kernel(): create the slab
+ * cache that page_cache_leaf objects come from, with
+ * page_cache_leaf_ctor() as its constructor.  Panics if the cache
+ * cannot be created.
+ */
+void __init page_cache_leaf_init(void)
+{
+	page_cache_leaf_cachep = kmem_cache_create("page_cache_leaf",
+			sizeof(struct page_cache_leaf), 0,
+			0, page_cache_leaf_ctor, NULL);
+	if (!page_cache_leaf_cachep)
+		/* name the cache that actually failed, not radix_tree_node */
+		panic("Failed to create page_cache_leaf cache\n");
+}
+
+/*
+ * Selects how page_cache_leaf_to_page() picks a page out of a leaf
+ * while replication is on for the mapping:
+ *
+ * PAGE_LOCAL - only the node-local page will be returned. If there is
+ *              not a local page, it will not find anything.
+ * PAGE_ANY   - if there is no local page, a search over all nodes is
+ *              done even though replication is on. This is useful when
+ *              we're trying to make a local copy of the page and we
+ *              just want any old copy of it.
+ *
+ * When replication is not on, all nodes are searched either way.
+ */
+enum page_search {
+ PAGE_LOCAL,
+ PAGE_ANY
+};
+
+/*
+ * Pick the page that a lookup on this leaf should see.
+ *
+ * While the mapping is replicating, the current node's page wins if it
+ * exists; without one, PAGE_LOCAL searches give up and return NULL.
+ * In every other case the first page found on any node is returned,
+ * or NULL if the leaf holds none.
+ *
+ * (For the non-NUMA case this could just cast *leaf to a page and
+ * return.)
+ */
+struct page *page_cache_leaf_to_page(struct page_cache_leaf *leaf,
+		struct address_space *mapping, enum page_search search_type)
+{
+	int here = numa_node_id();
+	int replicating = mapping_replicate(mapping);
+	int nid;
+
+	if (replicating && leaf->pages[here])
+		return leaf->pages[here];
+
+	if (replicating && search_type != PAGE_ANY)
+		return NULL;
+
+	for (nid = 0; nid < numnodes; nid++) {
+		if (leaf->pages[nid])
+			return leaf->pages[nid];
+	}
+	return NULL;
+}
+
+/*
+ * Record @page in @mapping's radix tree at @offset, in the leaf slot
+ * that belongs to the node the page lives on.
+ *
+ * If this CPU has a preloaded leaf, it is offered to
+ * radix_tree_insert(); when the offset was empty that leaf becomes the
+ * tree entry and the preload slot is cleared.  Without a preloaded
+ * leaf, only an already existing leaf can be used.
+ *
+ * Returns 0 on success, or
+ *   -EEXIST  if a leaf already exists and the mapping is not
+ *            replicating, or if this node's slot in the leaf is taken;
+ *   -ENOMEM  if there is neither a preloaded nor an existing leaf;
+ *   any error radix_tree_insert() reports through ERR_PTR().
+ *
+ * The callers visible in this patch invoke it with mapping->page_lock
+ * held.
+ */
+int __insert_into_page_cache(struct page *page, struct address_space *mapping,
+		pgoff_t offset)
+{
+	int error = 0;
+	int nid = page_to_nid(page);
+	struct page_cache_leaf *leaf, **newleaf;
+
+	newleaf = &get_cpu_var(page_cache_leaf_preloads);
+	/*
+	 * newleaf is the address of the per-CPU slot and can never be
+	 * NULL; what has to be tested is whether the slot holds a leaf.
+	 * Offering a NULL item to radix_tree_insert() would "succeed" and
+	 * leave us dereferencing a NULL leaf below.
+	 */
+	if (*newleaf)
+		leaf = radix_tree_insert(&mapping->page_tree, offset, *newleaf);
+	else {
+		leaf = radix_tree_lookup(&mapping->page_tree, offset);
+		if (!leaf)
+			leaf = ERR_PTR(-ENOMEM);
+	}
+
+	if (IS_ERR(leaf)) {
+		error = PTR_ERR(leaf);
+		goto out;
+	}
+
+	/* there's already a leaf node there */
+	if (!mapping_replicate(mapping) && leaf) {
+		error = -EEXIST;
+		goto out;
+	}
+
+	/* successful insertion: the preloaded leaf now lives in the tree */
+	if (!leaf) {
+		leaf = *newleaf;
+		*newleaf = NULL;
+	}
+
+	if (leaf->pages[nid]) {
+		error = -EEXIST;
+	} else {
+		leaf->pages[nid] = page;
+		leaf->count++;
+	}
+out:
+	put_cpu_var(page_cache_leaf_preloads);
+	return error;
+}
+
+/*
+ * Find the leaf stored at @offset in @mapping and let
+ * page_cache_leaf_to_page() choose a page from it with a PAGE_LOCAL
+ * search.  Returns NULL when there is no leaf or no page is chosen.
+ * No reference is taken on the returned page.
+ */
+struct page*
+__page_cache_lookup(struct address_space *mapping, pgoff_t offset)
+{
+	struct page_cache_leaf *leaf;
+
+	leaf = radix_tree_lookup(&mapping->page_tree, offset);
+	return leaf ? page_cache_leaf_to_page(leaf, mapping, PAGE_LOCAL) : NULL;
+}
+
/*
* Remove a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
@@ -80,8 +225,25 @@
void __remove_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
-
- radix_tree_delete(&mapping->page_tree, page->index);
+ struct page_cache_leaf *leaf;
+
+ leaf = radix_tree_lookup(&mapping->page_tree, page->index);
+ leaf->pages[page_to_nid(page)] = NULL;
+ if (--leaf->count == 0) {
+ struct page_cache_leaf **preload;
+ radix_tree_delete(&mapping->page_tree, page->index);
+ /*
+ * if there is a free preload slot for this CPU, put the
+ * leaf back there instead of freeing it
+ */
+ preload = &get_cpu_var(page_cache_leaf_preloads);
+ if (!*preload)
+ *preload = leaf;
+ else
+ kmem_cache_free(page_cache_leaf_cachep, leaf);
+ put_cpu_var(page_cache_leaf_preloads);
+ }
+
list_del(&page->list);
page->mapping = NULL;
@@ -110,6 +272,36 @@ static inline int sync_page(struct page
return 0;
}
+static void __collapse_replication(struct address_space *mapping)
+{
+ /*
+ * Stop replicating this mapping.  Existing replicas are not merged
+ * back here; later on, you can worry about collapsing everything
+ * back.  For now, just tell everything that this is not currently
+ * replicated and *don't* try to in the future.
+ */
+ printk("collapsing replication\n");
+
+ mapping->replicate = -1; /* negative: mapping_replicate() is false, and the "replicate == 0" checks in do_mmap_pgoff() will not switch replication back on */
+}
+
+/*
+ * Turn replication off for @mapping if it is currently on.
+ *
+ * Takes mapping->page_lock for the check and the change.  When the
+ * mapping was replicating, logs the replicate value, the path of
+ * @file ("nofile" when @file is NULL) and a stack dump, then calls
+ * __collapse_replication().
+ *
+ * Always returns 0.
+ */
+inline int collapse_replication(struct address_space *mapping,
+		struct file *file)
+{
+	spin_lock(&mapping->page_lock);
+	if (mapping->replicate > 0) {
+		char buf[100];
+		const char *name = "nofile";
+
+		if (file) {
+			memset(buf, 0, sizeof(buf));
+			name = d_path(file->f_dentry, file->f_vfsmnt,
+					buf, sizeof(buf) - 1);
+			/*
+			 * d_path() reports failure (e.g. a path that does
+			 * not fit in buf) as an ERR_PTR, which must not be
+			 * handed to "%s".
+			 */
+			if (IS_ERR(name))
+				name = "(d_path failed)";
+		}
+		printk("collapsed: [%d] %s\n", mapping->replicate, name);
+		dump_stack();
+		__collapse_replication(mapping);
+	}
+
+	spin_unlock(&mapping->page_lock);
+	return 0;
+}
+
/**
* filemap_fdatawrite - start writeback against all of a mapping's dirty pages
* @mapping: address space structure to write
@@ -222,10 +414,16 @@ int add_to_page_cache(struct page *page,
{
int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+ if (error != 0)
+ goto err;
+
+ /* this benefits from the radix_tree_preload()'s preempt_disable() */
+ error = page_cache_leaf_preload(gfp_mask & ~__GFP_HIGHMEM);
+
if (error == 0) {
page_cache_get(page);
spin_lock(&mapping->page_lock);
- error = radix_tree_insert(&mapping->page_tree, offset, page);
+ error = __insert_into_page_cache(page, mapping, offset);
if (!error) {
SetPageLocked(page);
___add_to_page_cache(page, mapping, offset);
@@ -235,10 +433,16 @@ int add_to_page_cache(struct page *page,
spin_unlock(&mapping->page_lock);
radix_tree_preload_end();
}
+err:
return error;
}
EXPORT_SYMBOL(add_to_page_cache);
+/*
+ * The pages will *not* be added to the LRU immediately. They're only
+ * added after the entire pagevec is filled up. Don't worry, they'll
+ * get there eventually.
+ */
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t offset, int gfp_mask)
{
@@ -350,22 +554,162 @@ void __lock_page(struct page *page)
EXPORT_SYMBOL(__lock_page);
/*
+ * This is fairly lazy with preemption to make the code simpler. It doesn't
+ * need to be perfect. Making a local replica is by no means required. If the
+ * replica page allocation fails, one of two things happens:
+ * 1. page cache returns non-local page, which gets mapped in somewhere.
+ * things are slightly slower
+ * 2. page cache returns NULL, when there was a page in the cache.
+ * I/O is resubmitted for the page, and a replica is made with
+ * the new data.
+ */
+DEFINE_PER_CPU(struct page *, replica_preloads) = { NULL, }; /* per-CPU spare page that make_local_replica() takes as the destination of a copy */
+void refill_replica_page_cpu(void)
+{ /*
+ * Top up the current CPU's replica_preloads slot: if it is empty, try
+ * to allocate one order-0 page on this CPU's node, without falling
+ * back to other nodes (__GFP_NODE_STRICT).  Nothing is returned; if
+ * the allocation fails the slot simply stays NULL.
+ *
+ * NOTE(review): alloc_pages_node() is called with GFP_KERNEL between
+ * get_cpu() and put_cpu() -- confirm that this allocation is allowed
+ * to run inside that window.
+ */
+ int cpu = get_cpu();
+ int nid = cpu_to_node(cpu);
+ unsigned int gfp_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_NODE_STRICT;
+ struct page **page = &__get_cpu_var(replica_preloads);
+
+ if (!*page)
+ *page = alloc_pages_node(nid, gfp_mask, 0);
+
+ put_cpu();
+}
+/*
+ * Copy the contents of src into dst with copy_highpage().  This is a
+ * function of its own because I want to see this in the profiles.
+ */
+void make_local_replica_copy(struct page *dst, struct page *src)
+{
+ copy_highpage(dst, src);
+}
+/*
+ * Try to give the current node its own copy of @page in @mapping.
+ *
+ * @page itself is returned, and no replica is made, when @page is
+ * NULL, already lives on this node, the mapping is not replicating,
+ * the page is locked by someone else, this CPU has no preallocated
+ * replica page, or the copy could not be added to the page cache.
+ *
+ * Otherwise the preallocated page is filled from @page, added to the
+ * page cache at the same index, marked uptodate and mapped-to-disk,
+ * unlocked, and returned.
+ *
+ * The reference count of @page is never changed here; a caller that
+ * gets back a different page drops its own reference on @page.
+ */
+static struct page *make_local_replica(struct address_space *mapping, struct page *page)
+{
+	struct page *copy = NULL;
+	struct page **prealloc;
+	int err;
+
+	if (!page)
+		goto out;
+
+	/* it's already local */
+	if (page_to_nid(page) == numa_node_id())
+		goto out;
+
+	if (!mapping_replicate(mapping))
+		goto out;
+
+	/* something is probably writing into the source page */
+	if (TestSetPageLocked(page))
+		goto out;
+
+	/* take this CPU's spare page, if there is one */
+	prealloc = &get_cpu_var(replica_preloads);
+	copy = *prealloc;
+	*prealloc = NULL;
+	put_cpu_var(replica_preloads);
+
+	/*
+	 * No spare page: give the source page back untouched.  It was
+	 * locked above, so it has to be unlocked before bailing out.
+	 */
+	if (!copy) {
+		unlock_page(page);
+		goto out;
+	}
+
+	make_local_replica_copy(copy, page);
+	/*
+	 * This should never actually have to allocate memory. It will
+	 * be able to add the page to the already existing leaf. The
+	 * leaf can't go away because we hold a ref count on the source
+	 * page
+	 */
+	err = add_to_page_cache_lru(copy, mapping, page->index, GFP_ATOMIC);
+	unlock_page(page);
+	switch (err) {
+	case 0:
+		SetPageUptodate(copy);
+		SetPageMappedToDisk(copy);
+		unlock_page(copy);
+		return copy;
+	case -EEXIST:
+		/* somebody else got a local page in first; drop ours */
+		printk("%s(): -EEXIST\n", __FUNCTION__);
+		page_cache_release(copy);
+		break;
+	default:
+		printk("%s(): ?? %d\n", __FUNCTION__, err);
+		page_cache_release(copy);
+		dump_stack();
+		break;
+	}
+out:
+	return page;
+}
+
+static struct page *make_local_replica_lock(struct address_space *mapping, struct page *page) { /* replica-making step for a page that came from find_lock_page(); used by find_or_create_page() */
+ struct page *copy;
+ return page; /* replication is switched off on this path: the argument is handed straight back and everything below is unreachable */
+
+ refill_replica_page_cpu();
+ copy = make_local_replica(mapping, page);
+
+ /*
+ * this is the cowardly way to do it. add the new copy, and pray
+ * that it shows up :) If the replication appears to have worked,
+ * drop the references to the source page. If the new page
+ * got removed in the meantime, find_lock_page() will just
+ * redo the locking anyway.
+ */
+ if (copy != page) {
+ unlock_page(page);
+ page_cache_release(page);
+ copy = find_lock_page(mapping, page->index); /* NOTE(review): page->index is read after the reference on page was dropped -- confirm this is safe before re-enabling this path */
+ }
+
+ return copy;
+}
+/*
* a rather lightweight function, finding and getting a reference to a
* hashed page atomically.
*/
struct page * find_get_page(struct address_space *mapping, unsigned long offset)
{
- struct page *page;
-
+ struct page_cache_leaf *leaf;
+ struct page *page, *copy;
/*
* We scan the hash list read-only. Addition to and removal from
* the hash-list needs a held write-lock.
*/
+
+repeat:
spin_lock(&mapping->page_lock);
- page = radix_tree_lookup(&mapping->page_tree, offset);
- if (page)
- page_cache_get(page);
+ leaf = radix_tree_lookup(&mapping->page_tree, offset);
+ /* nothing found */
+ if (!leaf) {
+ spin_unlock(&mapping->page_lock);
+ return NULL;
+ }
+
+ page = page_cache_leaf_to_page(leaf, mapping, PAGE_ANY);
+ page_cache_get(page);
spin_unlock(&mapping->page_lock);
+
+ refill_replica_page_cpu();
+
+ spin_lock(&leaf->duplicate_lock);
+ /*
+ * now that we have the lock, do a crude check to see if anyone
+ * else has filled in the page we were looking for
+ */
+ if (mapping_replicate(mapping) &&
+ !page_is_local(page) &&
+ leaf->pages[numa_node_id()]) {
+ spin_unlock(&leaf->duplicate_lock);
+ page_cache_release(page);
+ goto repeat;
+ }
+ copy = make_local_replica(mapping, page);
+ spin_unlock(&leaf->duplicate_lock);
+
+ if (copy != page) {
+ page_cache_release(page);
+ return copy;
+ }
return page;
}
@@ -377,7 +721,7 @@ struct page *find_trylock_page(struct ad
struct page *page;
spin_lock(&mapping->page_lock);
- page = radix_tree_lookup(&mapping->page_tree, offset);
+ page = __page_cache_lookup(mapping, offset);
if (page && TestSetPageLocked(page))
page = NULL;
spin_unlock(&mapping->page_lock);
@@ -402,7 +746,7 @@ struct page *find_lock_page(struct addre
spin_lock(&mapping->page_lock);
repeat:
- page = radix_tree_lookup(&mapping->page_tree, offset);
+ page = __page_cache_lookup(mapping, offset);
if (page) {
page_cache_get(page);
if (TestSetPageLocked(page)) {
@@ -447,6 +791,7 @@ struct page *find_or_create_page(struct
int err;
repeat:
page = find_lock_page(mapping, index);
+ page = make_local_replica_lock(mapping, page);
if (!page) {
if (!cached_page) {
cached_page = alloc_page(gfp_mask);
@@ -482,8 +827,9 @@ repeat:
*
* find_get_pages() returns the number of pages which were found.
*/
-unsigned int find_get_pages(struct address_space *mapping, pgoff_t start,
- unsigned int nr_pages, struct page **pages)
+unsigned int find_get_pages(struct address_space *mapping,
+ pgoff_t start, unsigned int nr_pages,
+ struct page **pages)
{
unsigned int i;
unsigned int ret;
@@ -491,8 +837,15 @@ unsigned int find_get_pages(struct addre
spin_lock(&mapping->page_lock);
ret = radix_tree_gang_lookup(&mapping->page_tree,
(void **)pages, start, nr_pages);
- for (i = 0; i < ret; i++)
+
+ /*
+ * The radix tree lookups return leaves, which must be converted
+ * to pages
+ */
+ for (i = 0; i < ret; i++) {
+ pages[i] = page_cache_leaf_to_page((struct page_cache_leaf *)pages[i], mapping, 1);
page_cache_get(pages[i]);
+ }
spin_unlock(&mapping->page_lock);
return ret;
}
@@ -1728,6 +2081,7 @@ generic_file_aio_write_nolock(struct kio
*/
fault_in_pages_readable(buf, bytes);
+ collapse_replication(mapping, file);
page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
if (!page) {
status = -ENOMEM;
diff -urp linux-2.6.0-test3-clean/mm/memory.c linux-2.6.0-test3-textrepl/mm/memory.c
--- linux-2.6.0-test3-clean/mm/memory.c Sat Aug 9 13:09:04 2003
+++ linux-2.6.0-test3-textrepl/mm/memory.c Sat Aug 9 20:22:18 2003
@@ -959,6 +959,7 @@ static inline void break_cow(struct vm_a
invalidate_vcache(address, vma->vm_mm, new_page);
flush_cache_page(vma, address);
establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
+ BUG_ON(new_page->mapping && (new_page->mapping->replicate > 0));
}
/*
@@ -1452,8 +1453,13 @@ retry:
++mm->rss;
flush_icache_page(vma, new_page);
entry = mk_pte(new_page, vma->vm_page_prot);
- if (write_access)
+ if (write_access) {
entry = pte_mkwrite(pte_mkdirty(entry));
+ if (new_page->mapping &&
+ (new_page->mapping->replicate > 0)) {
+ BUG();
+ }
+ }
set_pte(page_table, entry);
pte_chain = page_add_rmap(new_page, page_table, pte_chain);
pte_unmap(page_table);
diff -urp linux-2.6.0-test3-clean/mm/mmap.c linux-2.6.0-test3-textrepl/mm/mmap.c
--- linux-2.6.0-test3-clean/mm/mmap.c Sun Jul 27 10:04:01 2003
+++ linux-2.6.0-test3-textrepl/mm/mmap.c Wed Aug 13 03:20:55 2003
@@ -523,7 +523,6 @@ unsigned long do_mmap_pgoff(struct file
case MAP_SHARED:
if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
return -EACCES;
-
/*
* Make sure we don't allow writing to an append-only
* file..
@@ -545,6 +544,32 @@ unsigned long do_mmap_pgoff(struct file
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
+ /*
+ * Most of this code is duplicated about a hundred
+ * lines down. It needs to be consolidated
+ */
+ if (inode->i_data.replicate == 0) {
+ char buf[100];
+ int error = 1;
+ spin_lock(&inode->i_data.replication_lock);
+ if (inode->i_data.replicate == 0) {
+ inode->i_data.replicate = 1;
+ error = deny_write_access(file);
+ }
+ spin_unlock(&inode->i_data.replication_lock);
+ if (error)
+ break;
+ printk("%s: doing replication i_writecount: %d ",
+ current->comm,
+ atomic_read(&inode->i_writecount));
+ printk ("%s\n", d_path(file->f_dentry,
+ file->f_vfsmnt, &buf[0], 99));
+ if (!list_empty(&inode->i_data.dirty_pages)) {
+ printk("dirty list not empty\n");
+ inode->i_data.replicate = 0;
+ allow_write_access(file);
+ }
+ }
break;
default:
@@ -634,6 +659,22 @@ munmap_back:
if (error)
goto free_vma;
correct_wcount = 1;
+
+ if (inode->i_data.replicate == 0) {
+ char buf[100];
+ printk("%s: doing replication i_writecount: %d ",
+ current->comm,
+ atomic_read(&inode->i_writecount));
+ memset(&buf[0], 0, 100);
+ printk ("%s\n", d_path(file->f_dentry,
+ file->f_vfsmnt, &buf[0], 99));
+ inode->i_data.replicate = 1;
+ deny_write_access(file);
+ if (!list_empty(&inode->i_data.dirty_pages)) {
+ printk("dirty list not empty\n");
+ inode->i_data.replicate = 0;
+ }
+ }
}
vma->vm_file = file;
get_file(file);
diff -urp linux-2.6.0-test3-clean/mm/page_alloc.c linux-2.6.0-test3-textrepl/mm/page_alloc.c
--- linux-2.6.0-test3-clean/mm/page_alloc.c Sat Aug 9 13:09:04 2003
+++ linux-2.6.0-test3-textrepl/mm/page_alloc.c Tue Aug 12 21:14:19 2003
@@ -558,7 +558,11 @@ __alloc_pages(unsigned int gfp_mask, uns
min = 1UL << order;
for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i];
-
+
+ if ((__GFP_NODE_STRICT & gfp_mask) &&
+ (pfn_to_nid(z->zone_start_pfn) != numa_node_id()))
+ continue;
+
min += z->pages_low;
if (z->free_pages >= min ||
(!wait && z->free_pages >= z->pages_high)) {
diff -urp linux-2.6.0-test3-clean/mm/readahead.c linux-2.6.0-test3-textrepl/mm/readahead.c
--- linux-2.6.0-test3-clean/mm/readahead.c Sat Aug 9 13:09:04 2003
+++ linux-2.6.0-test3-textrepl/mm/readahead.c Mon Aug 11 21:46:52 2003
@@ -229,7 +229,7 @@ __do_page_cache_readahead(struct address
if (page_offset > end_index)
break;
- page = radix_tree_lookup(&mapping->page_tree, page_offset);
+ page = __page_cache_lookup(mapping, page_offset);
if (page)
continue;
diff -urp linux-2.6.0-test3-clean/mm/swap.c linux-2.6.0-test3-textrepl/mm/swap.c
--- linux-2.6.0-test3-clean/mm/swap.c Sun Jul 27 10:00:40 2003
+++ linux-2.6.0-test3-textrepl/mm/swap.c Wed Aug 13 03:22:15 2003
@@ -197,6 +197,7 @@ void release_pages(struct page **pages,
}
if (TestClearPageLRU(page))
del_page_from_lru(zone, page);
+
if (page_count(page) == 0) {
if (!pagevec_add(&pages_to_free, page)) {
spin_unlock_irq(&zone->lru_lock);
diff -urp linux-2.6.0-test3-clean/mm/swap_state.c linux-2.6.0-test3-textrepl/mm/swap_state.c
--- linux-2.6.0-test3-clean/mm/swap_state.c Sat Aug 9 13:09:04 2003
+++ linux-2.6.0-test3-textrepl/mm/swap_state.c Wed Aug 13 03:22:29 2003
@@ -25,6 +25,8 @@ extern struct address_space_operations s
struct address_space swapper_space = {
.page_tree = RADIX_TREE_INIT(GFP_ATOMIC),
+ .replicate = 0,
+ .replication_lock = SPIN_LOCK_UNLOCKED,
.page_lock = SPIN_LOCK_UNLOCKED,
.clean_pages = LIST_HEAD_INIT(swapper_space.clean_pages),
.dirty_pages = LIST_HEAD_INIT(swapper_space.dirty_pages),
@@ -198,7 +200,7 @@ int move_to_swap_cache(struct page *page
spin_lock(&swapper_space.page_lock);
spin_lock(&mapping->page_lock);
- err = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
+ err = __insert_into_page_cache(page, &swapper_space, entry.val);
if (!err) {
__remove_from_page_cache(page);
___add_to_page_cache(page, &swapper_space, entry.val);
@@ -234,7 +236,7 @@ int move_from_swap_cache(struct page *pa
spin_lock(&swapper_space.page_lock);
spin_lock(&mapping->page_lock);
- err = radix_tree_insert(&mapping->page_tree, index, page);
+ err = __insert_into_page_cache(page, mapping, index);
if (!err) {
__delete_from_swap_cache(page);
___add_to_page_cache(page, mapping, index);