LWN.net Logo

[RFC] NUMA replication of user data

From:  Dave Hansen <haveblue@us.ibm.com>
To:  "Martin J. Bligh" <mbligh@aracnet.com>
Subject:  [Lse-tech] [RFC] NUMA replication of user data
Date:  13 Aug 2003 16:43:24 -0700
Cc:  Zwane Mwaikambo <zwane@linuxpower.ca>, LSE <lse-tech@lists.sourceforge.net>

After my last patch to replicate kernel text on x86 NUMA
(http://lwn.net/Articles/36602/), I've turned to user text: executables
and libraries.  This is significantly more invasive and takes significantly
more code, but the performance results are quite promising, and it's
arch-independent.

DISCLAIMER: SPEC(tm) and the benchmark name SDET(tm) are registered 
trademarks of the Standard Performance Evaluation Corporation. This 
benchmarking was performed for research purposes only, and the run
results are non-compliant and not-comparable with any published results.

SDET Average Throughput (NUMA-Q): 
            2.6.0-test3 100.0%
      2.6.0-test3+urepl 143.1%

SDET Average Throughput (16-way P4): 
            2.6.0-test3 100.0%
      2.6.0-test3+urepl 108.8%

The patch isn't really limited to user text.  It targets anything that's
mmap'd with the MAP_PRIVATE flag.  What uses that flag?  Any executable
or shared library is mapped that way.  Plus, executables already have a
mechanism to deny writes to the file while it's open.  I hijack that
mechanism a bit so that it more or less covers everything that I try to
replicate.  The same mechanisms could conceivably cover just about any
data that is read-only.

This is still pretty experimental, so don't give it to your bank or
anything.  I've lightly corrupted data while playing with it, although
not for at least a week :)  
-- 
Dave Hansen
haveblue@us.ibm.com
diff -urp linux-2.6.0-test3-clean/drivers/serial/8250.c linux-2.6.0-test3-textrepl/drivers/serial/8250.c
--- linux-2.6.0-test3-clean/drivers/serial/8250.c	Sun Jul 27 09:56:59 2003
+++ linux-2.6.0-test3-textrepl/drivers/serial/8250.c	Mon Aug 11 20:17:37 2003
@@ -1983,7 +1983,7 @@ static struct console serial8250_console
 	.data		= &serial8250_reg,
 };
 
-static int __init serial8250_console_init(void)
+int __init serial8250_console_init(void)
 {
 	serial8250_isa_init_ports();
 	register_console(&serial8250_console);
diff -urp linux-2.6.0-test3-clean/fs/fs-writeback.c linux-2.6.0-test3-textrepl/fs/fs-writeback.c
--- linux-2.6.0-test3-clean/fs/fs-writeback.c	Sun Jul 27 10:12:08 2003
+++ linux-2.6.0-test3-textrepl/fs/fs-writeback.c	Wed Aug 13 02:46:45 2003
@@ -100,7 +100,11 @@ void __mark_inode_dirty(struct inode *in
 		 */
 		if (!was_dirty) {
 			mapping->dirtied_when = jiffies|1; /* 0 is special */
-			list_move(&inode->i_list, &sb->s_dirty);
+			/* Replication does not need to be collapsed here
+			 * This gets called when things like update_atime()
+			 * occur, and those don't touch anything but metadata
+			 * and replication doesn't interfere with that
+			 */
 		}
 	}
 out:
diff -urp linux-2.6.0-test3-clean/fs/inode.c linux-2.6.0-test3-textrepl/fs/inode.c
--- linux-2.6.0-test3-clean/fs/inode.c	Sat Aug  9 13:09:03 2003
+++ linux-2.6.0-test3-textrepl/fs/inode.c	Mon Aug 11 20:28:30 2003
@@ -182,6 +182,8 @@ void inode_init_once(struct inode *inode
 	INIT_LIST_HEAD(&inode->i_devices);
 	sema_init(&inode->i_sem, 1);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
+	inode->i_data.replicate = 0;
+	spin_lock_init(&inode->i_data.replication_lock);
 	spin_lock_init(&inode->i_data.page_lock);
 	init_MUTEX(&inode->i_data.i_shared_sem);
 	atomic_set(&inode->i_data.truncate_count, 0);
diff -urp linux-2.6.0-test3-clean/fs/namei.c linux-2.6.0-test3-textrepl/fs/namei.c
--- linux-2.6.0-test3-clean/fs/namei.c	Sat Aug  9 13:09:03 2003
+++ linux-2.6.0-test3-textrepl/fs/namei.c	Wed Aug 13 03:42:26 2003
@@ -240,15 +240,40 @@ int permission(struct inode * inode,int 
  * who will try to move it in struct inode - just leave it here.
  */
 static spinlock_t arbitration_lock = SPIN_LOCK_UNLOCKED;
+/* 
+ * if the inability to get_write_access() is because
+ * of replication going on, collapse the replication
+ * and try again
+ */
+static inline int inode_try_replication_disable(struct inode *inode)
+{
+	struct address_space *mapping = inode->i_mapping;
+	if (mapping_replicate(inode->i_mapping)) {
+		spin_unlock(&arbitration_lock);
+		printk("get_write_access() trying to collapse replication\n");
+		dump_stack();
+		spin_lock(&mapping->replication_lock);
+		collapse_replication(mapping, NULL);
+		mapping->replicate = 0;
+		atomic_inc(&inode->i_writecount);
+		spin_unlock(&mapping->replication_lock);
+		return 1;
+	}
+	return 0;
+}
 int get_write_access(struct inode * inode)
 {
+retry:
 	spin_lock(&arbitration_lock);
 	if (atomic_read(&inode->i_writecount) < 0) {
+		if (inode_try_replication_disable(inode))
+			goto retry;
 		spin_unlock(&arbitration_lock);
 		return -ETXTBSY;
 	}
 	atomic_inc(&inode->i_writecount);
 	spin_unlock(&arbitration_lock);
+	inode->i_mapping->replicate--;
 	return 0;
 }
 int deny_write_access(struct file * file)
diff -urp linux-2.6.0-test3-clean/fs/proc/task_mmu.c linux-2.6.0-test3-textrepl/fs/proc/task_mmu.c
--- linux-2.6.0-test3-clean/fs/proc/task_mmu.c	Sun Jul 27 09:57:47 2003
+++ linux-2.6.0-test3-textrepl/fs/proc/task_mmu.c	Sat Aug  9 20:21:59 2003
@@ -83,20 +83,27 @@ static int show_map(struct seq_file *m, 
 	unsigned long ino = 0;
 	dev_t dev = 0;
 	int len;
+	struct address_space *mapping = NULL;
 
 	if (file) {
 		struct inode *inode = map->vm_file->f_dentry->d_inode;
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
+		mapping = inode->i_mapping;
 	}
 
-	seq_printf(m, "%0*lx-%0*lx %c%c%c%c %0*lx %02x:%02x %lu %n",
+	seq_printf(m, "%0*lx-%0*lx %c%c%c|%c%c|%c|%c%c|deny:%c %0*lx %02x:%02x %lu %n",
 			(int) (2*sizeof(void*)), map->vm_start,
 			(int) (2*sizeof(void*)), map->vm_end,
 			flags & VM_READ ? 'r' : '-',
 			flags & VM_WRITE ? 'w' : '-',
 			flags & VM_EXEC ? 'x' : '-',
 			flags & VM_MAYSHARE ? 's' : 'p',
+			flags & VM_SHARED ? 's' : 'p',
+			flags & VM_MAYREAD ? 'r' : '-',
+			flags & VM_MAYWRITE ? 'w' : '-',
+			flags & VM_DENYWRITE ? 'd' : '-',
+			mapping ? (mapping->replicate+'0') : '_',
 			(int) (2*sizeof(void*)), map->vm_pgoff << PAGE_SHIFT,
 			MAJOR(dev), MINOR(dev), ino, &len);
 
diff -urp linux-2.6.0-test3-clean/include/asm-i386/mmzone.h linux-2.6.0-test3-textrepl/include/asm-i386/mmzone.h
--- linux-2.6.0-test3-clean/include/asm-i386/mmzone.h	Sun Jul 27 10:02:04 2003
+++ linux-2.6.0-test3-textrepl/include/asm-i386/mmzone.h	Wed Aug 13 04:23:50 2003
@@ -83,6 +83,8 @@ extern struct pglist_data *node_data[];
 	(unsigned long)(__page - __zone->zone_mem_map)			\
 		+ __zone->zone_start_pfn;				\
 })
+#define page_to_nid(page)	pfn_to_nid(page_to_pfn(page))
+#define page_is_local(page)	(page_to_nid(page) == numa_node_id())
 #define pmd_page(pmd)		(pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
 /*
  * pfn_valid should be made as fast as possible, and the current definition 
diff -urp linux-2.6.0-test3-clean/include/linux/fs.h linux-2.6.0-test3-textrepl/include/linux/fs.h
--- linux-2.6.0-test3-clean/include/linux/fs.h	Sat Aug  9 13:09:04 2003
+++ linux-2.6.0-test3-textrepl/include/linux/fs.h	Wed Aug 13 04:51:36 2003
@@ -315,6 +315,10 @@ struct backing_dev_info;
 struct address_space {
 	struct inode		*host;		/* owner: inode, block_device */
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
+	int replicate;
+	spinlock_t		replication_lock; /* gets rid of races while 
+						     turning replication on
+						     and off */
 	spinlock_t		page_lock;	/* and spinlock protecting it */
 	struct list_head	clean_pages;	/* list of clean pages */
 	struct list_head	dirty_pages;	/* list of dirty pages */
@@ -334,6 +338,8 @@ struct address_space {
 	struct address_space	*assoc_mapping;	/* ditto */
 };
 
+#define mapping_replicate(mapping)	((mapping)->replicate > 0)
+
 struct block_device {
 	struct list_head	bd_hash;
 	atomic_t		bd_count;
@@ -1177,6 +1183,8 @@ unsigned long invalidate_mapping_pages(s
 unsigned long invalidate_inode_pages(struct address_space *mapping);
 extern void invalidate_inode_pages2(struct address_space *mapping);
 extern void write_inode_now(struct inode *, int);
+extern int collapse_replication(struct address_space *mapping,
+		                struct file *file);
 extern int filemap_fdatawrite(struct address_space *);
 extern int filemap_flush(struct address_space *);
 extern int filemap_fdatawait(struct address_space *);
@@ -1195,6 +1203,28 @@ extern int get_write_access(struct inode
 extern int deny_write_access(struct file *);
 static inline void put_write_access(struct inode * inode)
 {
+	/* you can reenable replication here 
+	if ((atomic_read(&inode->i_writecount) == 1) && // if this is the last
+							// write access
+	    (inode->i_mapping->replicate < 0)) {  // and this was previously
+						  // disabled
+		struct file *f;
+		file_list_lock();
+		list_for_each_entry(f, &inode->i_sb->s_files, f_list) {
+			if (f && f->f_dentry && f->f_dentry->d_inode && 
+				f->f_dentry->d_inode == inode) {
+				char buf[100];
+				file_list_unlock();
+				printk("might have had chance to redo replication on: ");
+				printk ("%s\n", d_path(f->f_dentry,
+					f->f_vfsmnt, &buf[0], 99));
+				goto out;
+			}
+		}
+		file_list_unlock();
+	}
+	inode->i_mapping->replicate++;
+out:*/
 	atomic_dec(&inode->i_writecount);
 }
 static inline void allow_write_access(struct file *file)
diff -urp linux-2.6.0-test3-clean/include/linux/gfp.h linux-2.6.0-test3-textrepl/include/linux/gfp.h
--- linux-2.6.0-test3-clean/include/linux/gfp.h	Sun Jul 27 09:59:24 2003
+++ linux-2.6.0-test3-textrepl/include/linux/gfp.h	Wed Aug 13 03:11:05 2003
@@ -32,6 +32,7 @@
 #define __GFP_NOFAIL	0x800	/* Retry for ever.  Cannot fail */
 #define __GFP_NORETRY	0x1000	/* Do not retry.  Might fail */
 #define __GFP_NO_GROW	0x2000	/* Slab internal usage */
+#define __GFP_NODE_STRICT 0x4000 /* Do not fall back to other nodes */
 
 #define GFP_ATOMIC	(__GFP_HIGH)
 #define GFP_NOIO	(__GFP_WAIT)
diff -urp linux-2.6.0-test3-clean/include/linux/pagemap.h linux-2.6.0-test3-textrepl/include/linux/pagemap.h
--- linux-2.6.0-test3-clean/include/linux/pagemap.h	Sun Jul 27 09:56:03 2003
+++ linux-2.6.0-test3-textrepl/include/linux/pagemap.h	Wed Aug 13 03:05:20 2003
@@ -73,6 +73,7 @@ int add_to_page_cache_lru(struct page *p
 				unsigned long index, int gfp_mask);
 extern void remove_from_page_cache(struct page *page);
 extern void __remove_from_page_cache(struct page *page);
+extern struct page* __page_cache_lookup(struct address_space *mapping, pgoff_t offset);
 
 extern atomic_t nr_pagecache;
 
diff -urp linux-2.6.0-test3-clean/include/linux/radix-tree.h linux-2.6.0-test3-textrepl/include/linux/radix-tree.h
--- linux-2.6.0-test3-clean/include/linux/radix-tree.h	Sun Jul 27 09:59:36 2003
+++ linux-2.6.0-test3-textrepl/include/linux/radix-tree.h	Sat Aug  9 20:21:59 2003
@@ -41,7 +41,7 @@ do {					\
 	(root)->rnode = NULL;		\
 } while (0)
 
-extern int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+extern void *radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
 extern void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
 extern void *radix_tree_delete(struct radix_tree_root *, unsigned long);
 extern unsigned int
diff -urp linux-2.6.0-test3-clean/init/main.c linux-2.6.0-test3-textrepl/init/main.c
--- linux-2.6.0-test3-clean/init/main.c	Sun Jul 27 09:57:49 2003
+++ linux-2.6.0-test3-textrepl/init/main.c	Wed Aug 13 03:39:18 2003
@@ -81,6 +81,7 @@ extern void pidhash_init(void);
 extern void pidmap_init(void);
 extern void pte_chain_init(void);
 extern void radix_tree_init(void);
+extern void page_cache_leaf_init(void);
 extern void free_initmem(void);
 extern void populate_rootfs(void);
 extern void driver_init(void);
@@ -442,6 +443,7 @@ asmlinkage void __init start_kernel(void
 	security_scaffolding_startup();
 	vfs_caches_init(num_physpages);
 	radix_tree_init();
+	page_cache_leaf_init();
 	signals_init();
 	/* rootfs populating might need page-writeback */
 	page_writeback_init();
diff -urp linux-2.6.0-test3-clean/lib/radix-tree.c linux-2.6.0-test3-textrepl/lib/radix-tree.c
--- linux-2.6.0-test3-clean/lib/radix-tree.c	Sun Jul 27 09:58:50 2003
+++ linux-2.6.0-test3-textrepl/lib/radix-tree.c	Wed Aug 13 03:07:31 2003
@@ -18,6 +18,7 @@
  */
 
 #include <linux/errno.h>
+#include <linux/err.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -168,8 +169,11 @@ static int radix_tree_extend(struct radi
  *	@item:		item to insert
  *
  *	Insert an item into the radix tree at position @index.
+ *
+ *	If the insertion fails because a duplicate element is present,
+ *	return that element.
  */
-int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item)
+void *radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item)
 {
 	struct radix_tree_node *node = NULL, *tmp, **slot;
 	unsigned int height, shift;
@@ -179,7 +183,7 @@ int radix_tree_insert(struct radix_tree_
 	if (index > radix_tree_maxindex(root->height)) {
 		error = radix_tree_extend(root, index);
 		if (error)
-			return error;
+			return ERR_PTR(error);
 	}
     
 	slot = &root->rnode;
@@ -190,7 +194,7 @@ int radix_tree_insert(struct radix_tree_
 		if (*slot == NULL) {
 			/* Have to add a child node.  */
 			if (!(tmp = radix_tree_node_alloc(root)))
-				return -ENOMEM;
+				return ERR_PTR(-ENOMEM);
 			*slot = tmp;
 			if (node)
 				node->count++;
@@ -205,7 +209,7 @@ int radix_tree_insert(struct radix_tree_
 	}
 
 	if (*slot != NULL)
-		return -EEXIST;
+		return *slot; //-EEXIST;
 	if (node)
 		node->count++;
 
diff -urp linux-2.6.0-test3-clean/mm/filemap.c linux-2.6.0-test3-textrepl/mm/filemap.c
--- linux-2.6.0-test3-clean/mm/filemap.c	Sat Aug  9 13:09:04 2003
+++ linux-2.6.0-test3-textrepl/mm/filemap.c	Wed Aug 13 04:47:03 2003
@@ -10,6 +10,7 @@
  * the NFS filesystem used to do this differently, for example)
  */
 #include <linux/config.h>
+#include <linux/err.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/compiler.h>
@@ -72,6 +73,150 @@
  *    ->page_lock		(try_to_unmap_one)
  */
 
+struct page_cache_leaf {
+	struct page* pages[MAX_NR_NODES];
+	int count;
+	/* the duplicate_lock is not here to prevent any harmful races, it
+	 * keeps collision overhead to a minimum.  
+	 *
+	 * When 2 CPUs on the same node getinto find_get_page() together, they
+	 * can both try to make a copy at the same time.  One is bound to get 
+	 * -EEXIST and back off properly, but copying that page is expensive.  
+	 * Better to just spin here and wait for it to happen.
+	 * 
+	 * This lock could be per-node.
+	 */
+	spinlock_t duplicate_lock;
+};
+static kmem_cache_t *page_cache_leaf_cachep;
+
+void page_cache_leaf_ctor(void *node, kmem_cache_t *cachep, unsigned long flags)
+{
+	struct page_cache_leaf *leaf = node;
+	memset(node, 0, sizeof(struct page_cache_leaf));
+	spin_lock_init(&leaf->duplicate_lock);
+}
+
+DEFINE_PER_CPU(struct page_cache_leaf *, page_cache_leaf_preloads) = { 0, };
+int page_cache_leaf_preload(int gfp_mask)
+{
+	struct page_cache_leaf **preload;
+	int error = 0;
+	
+	preload = &get_cpu_var(page_cache_leaf_preloads);
+	if (!*preload)
+		*preload = kmem_cache_alloc(page_cache_leaf_cachep, gfp_mask);
+	if (!*preload)
+		error = -ENOMEM;
+	put_cpu_var(page_cache_leaf_preloads);
+
+	return error;
+}
+
+void __init page_cache_leaf_init(void)
+{
+	page_cache_leaf_cachep = kmem_cache_create("page_cache_leaf",
+			sizeof(struct page_cache_leaf), 0,
+			0, page_cache_leaf_ctor, NULL);
+	if (!page_cache_leaf_cachep)
+		panic ("Failed to create radix_tree_node cache\n");
+}
+
+/*
+ * If replication is on, only the node-local page will be returned.  If 
+ * there is not a local page, it will not find anything.
+ *
+ * If find_any is set, a search for all pages will be done even if
+ * replication is on.  This is useful when we're trying to make a 
+ * local copy of the page and we just want any old copy of it.
+ */
+enum page_search {
+	PAGE_LOCAL,
+	PAGE_ANY
+};
+
+/*
+ * for the non-numa case, this can just cast *leaf to a page and return
+ */
+struct page *page_cache_leaf_to_page(struct page_cache_leaf *leaf, 
+		struct address_space *mapping, enum page_search search_type)
+{
+	struct page *page = NULL;
+	int nid = numa_node_id();
+	
+	if (mapping_replicate(mapping))
+		page = leaf->pages[nid];
+
+	if (!page && ((mapping->replicate <= 0) || (search_type == PAGE_ANY))) 
+		for (nid = 0; nid < numnodes; nid++) {
+			page = leaf->pages[nid];
+			if (page)
+				break;
+		}
+	return page;
+}
+
+int __insert_into_page_cache(struct page *page, struct address_space *mapping,
+			pgoff_t offset)
+{
+	int error = 0;
+	int nid;
+	struct page_cache_leaf *leaf, **newleaf;
+	
+	nid = page_to_nid(page);
+	
+	newleaf = &get_cpu_var(page_cache_leaf_preloads);
+	if (newleaf)
+		leaf = radix_tree_insert(&mapping->page_tree, offset, *newleaf);
+	else {
+		leaf = radix_tree_lookup(&mapping->page_tree, offset);
+		if (!leaf)
+			leaf = ERR_PTR(-ENOMEM);
+	}
+
+	if (IS_ERR(leaf)) {
+		error = PTR_ERR(leaf);
+		goto out;
+	}
+	
+	/* there's already a leaf node there */
+	if (!mapping_replicate(mapping) && leaf) {
+		error = -EEXIST;
+		goto out;
+	}
+	
+	/* successful insertion */
+	if (!leaf) {
+		leaf = *newleaf;
+		*newleaf = NULL;
+	}
+	
+	if (leaf->pages[nid]) {
+		error = -EEXIST;
+	} else {
+		leaf->pages[nid] = page;
+		leaf->count++;
+	}
+out:
+	put_cpu_var(page_cache_leaf_preloads);
+	return error;
+}
+
+struct page* 
+__page_cache_lookup(struct address_space *mapping, pgoff_t offset)
+{
+	struct page *page = NULL;
+	struct page_cache_leaf *leaf;
+
+	leaf = radix_tree_lookup(&mapping->page_tree, offset);
+	if (!leaf)
+		goto out;
+
+	page = page_cache_leaf_to_page(leaf, mapping, 0);
+out:	
+	return page;	
+}
+
 /*
  * Remove a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
@@ -80,8 +225,25 @@
 void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
-
-	radix_tree_delete(&mapping->page_tree, page->index);
+	struct page_cache_leaf *leaf;
+	
+	leaf = radix_tree_lookup(&mapping->page_tree, page->index);
+	leaf->pages[page_to_nid(page)] = NULL;
+	if (--leaf->count == 0) {
+		struct page_cache_leaf **preload;
+		radix_tree_delete(&mapping->page_tree, page->index);
+		/* 
+		 * if there is a free preload slot for this CPU, put the
+		 * leaf back there instead of freeing it
+		 */
+		preload = &get_cpu_var(page_cache_leaf_preloads);
+		if (!*preload)
+			*preload = leaf;
+		else
+			kmem_cache_free(page_cache_leaf_cachep, leaf);
+		put_cpu_var(page_cache_leaf_preloads);
+	}
+	
 	list_del(&page->list);
 	page->mapping = NULL;
 
@@ -110,6 +272,36 @@ static inline int sync_page(struct page 
 	return 0;
 }
 
+static void __collapse_replication(struct address_space *mapping)
+{
+	/* 
+	 * later on, you can worry about collapsing everything back
+	 * but, for now, just tell everything that this is not currently
+	 * replicated and *don't* try to in the future
+	 */
+	printk("collapsing replication\n");
+	
+	mapping->replicate = -1;
+}
+
+inline int collapse_replication(struct address_space *mapping,
+		struct file *file)
+{
+	spin_lock(&mapping->page_lock);
+	if (mapping->replicate > 0) {
+		char buf[100];
+		memset(&buf[0], 0, 100);
+		printk ("collapsed: [%d] %s\n", mapping->replicate,
+				file ? d_path(file->f_dentry,
+				file->f_vfsmnt, &buf[0], 99) : "nofile");
+		dump_stack();
+		__collapse_replication(mapping);
+	} 
+	
+	spin_unlock(&mapping->page_lock);
+	return 0;
+}
+
 /**
  * filemap_fdatawrite - start writeback against all of a mapping's dirty pages
  * @mapping: address space structure to write
@@ -222,10 +414,16 @@ int add_to_page_cache(struct page *page,
 {
 	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 
+	if (error != 0)
+		goto err;
+	
+	/* this benefits from the radix_tree_preload()'s preempt_disable() */
+	error = page_cache_leaf_preload(gfp_mask & ~__GFP_HIGHMEM);
+	
 	if (error == 0) {
 		page_cache_get(page);
 		spin_lock(&mapping->page_lock);
-		error = radix_tree_insert(&mapping->page_tree, offset, page);
+		error = __insert_into_page_cache(page, mapping, offset);
 		if (!error) {
 			SetPageLocked(page);
 			___add_to_page_cache(page, mapping, offset);
@@ -235,10 +433,16 @@ int add_to_page_cache(struct page *page,
 		spin_unlock(&mapping->page_lock);
 		radix_tree_preload_end();
 	}
+err:
 	return error;
 }
 EXPORT_SYMBOL(add_to_page_cache);
 
+/*
+ * The pages will *not* be added to the LRU immediately.  They're only
+ * added after the entire pagevec is filled up.  Don't worry, they'll
+ * get there eventually.
+ */
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t offset, int gfp_mask)
 {
@@ -350,22 +554,162 @@ void __lock_page(struct page *page)
 EXPORT_SYMBOL(__lock_page);
 
 /*
+ * This is fairly lazy with preemption to make the code simpler.  It doesn't
+ * need to be perfect.  Making a local replica is by no means required.  If the
+ * replica page allocation fails, one of two things happens:
+ * 1. page cache returns non-local page, which gets mapped in somewhere.
+ *    things are slightly slower
+ * 2. page cache returns NULL, when there was a page in the cache.  
+ *    I/O is resubmitted for the page, and a replica is made with
+ *    the new data.
+ */
+DEFINE_PER_CPU(struct page *, replica_preloads) = { NULL, };
+void refill_replica_page_cpu(void)
+{
+	int cpu = get_cpu();
+	int nid = cpu_to_node(cpu);
+	unsigned int gfp_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_NODE_STRICT;
+	struct page **page = &__get_cpu_var(replica_preloads);
+	
+	if (!*page)
+		*page = alloc_pages_node(nid, gfp_mask, 0);
+
+	put_cpu();
+}
+/* I want to see this in the profiles */
+void make_local_replica_copy(struct page *dst, struct page *src)
+{
+	copy_highpage(dst, src);
+}
+static struct page *make_local_replica(struct address_space *mapping, struct page *page) {
+	struct page *copy = page;
+	struct page **prealloc;
+	int nid = numa_node_id();
+	int err;
+
+	if (!page)
+		goto out;
+
+	/* it's already local */
+	if (page_to_nid(page) == nid) 
+		goto out;
+	
+	if (!mapping_replicate(mapping))
+		goto out;
+
+	/* something is probably writing into the source page */
+	if (TestSetPageLocked(page))
+		goto out;
+	
+	prealloc = &get_cpu_var(replica_preloads);
+	if (*prealloc) {
+		copy = *prealloc;
+		*prealloc = NULL;
+	}
+	put_cpu_var(replica_preloads);
+
+	if (!copy)
+		goto out;
+	
+	make_local_replica_copy(copy, page);
+	/*
+	 * This should never actually have to allocate memory.  It will
+	 * be able to add the page to the already existing leaf.  The
+	 * leaf can't go away because we hold a ref count on the source 
+	 * page
+	 */
+	err = add_to_page_cache_lru(copy, mapping, page->index, GFP_ATOMIC);
+	unlock_page(page);
+	switch (err) {
+		case 0:
+			SetPageUptodate(copy);
+			SetPageMappedToDisk(copy);
+			unlock_page(copy);
+			break;
+		case -EEXIST:
+			printk("%s(): -EEXIST\n", __FUNCTION__);
+			page_cache_release(copy);
+			goto out;
+		default:
+			printk("__FUNCTION__(): ?? %d\n", err);
+			page_cache_release(copy);
+			dump_stack();
+			goto out;
+	}
+	return copy;
+out:
+	return page;
+}
+
+static struct page *make_local_replica_lock(struct address_space *mapping, struct page *page) {
+	struct page *copy;
+	return page;
+
+	refill_replica_page_cpu();
+	copy = make_local_replica(mapping, page);
+	
+	/* 
+	 * this is the cowardly way to do it.  add the new copy, and pray
+	 * that it shows up :)  If the replication appears to have worked,
+	 * drop the references to the source page.  If the new page
+	 * got removed in the meantime, find_lock_page() will just
+	 * redo the locking anyway.
+	 */
+	if (copy != page) {
+		unlock_page(page);
+		page_cache_release(page);
+		copy = find_lock_page(mapping, page->index);
+	}
+	
+	return copy;
+}
+/*
  * a rather lightweight function, finding and getting a reference to a
  * hashed page atomically.
  */
 struct page * find_get_page(struct address_space *mapping, unsigned long offset)
 {
-	struct page *page;
-
+	struct page_cache_leaf *leaf;
+	struct page *page, *copy;
 	/*
 	 * We scan the hash list read-only. Addition to and removal from
 	 * the hash-list needs a held write-lock.
 	 */
+
+repeat:
 	spin_lock(&mapping->page_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page)
-		page_cache_get(page);
+	leaf = radix_tree_lookup(&mapping->page_tree, offset);
+	/* nothing found */
+	if (!leaf) {
+		spin_unlock(&mapping->page_lock);
+		return NULL;
+	}
+
+	page = page_cache_leaf_to_page(leaf, mapping, PAGE_ANY);
+	page_cache_get(page);
 	spin_unlock(&mapping->page_lock);
+	
+	refill_replica_page_cpu();
+
+	spin_lock(&leaf->duplicate_lock);
+	/* 
+	 * now that we have the lock, do a crude check to see if anyone
+	 * else has filled in the page we were looking for
+	 */
+	if (mapping_replicate(mapping) &&
+	    !page_is_local(page) &&
+	    leaf->pages[numa_node_id()]) {
+		spin_unlock(&leaf->duplicate_lock);
+		page_cache_release(page);
+		goto repeat;
+	}
+	copy = make_local_replica(mapping, page);
+	spin_unlock(&leaf->duplicate_lock);
+
+	if (copy != page) {
+		page_cache_release(page);
+		return copy;
+	}
 	return page;
 }
 
@@ -377,7 +721,7 @@ struct page *find_trylock_page(struct ad
 	struct page *page;
 
 	spin_lock(&mapping->page_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	page = __page_cache_lookup(mapping, offset);
 	if (page && TestSetPageLocked(page))
 		page = NULL;
 	spin_unlock(&mapping->page_lock);
@@ -402,7 +746,7 @@ struct page *find_lock_page(struct addre
 
 	spin_lock(&mapping->page_lock);
 repeat:
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	page = __page_cache_lookup(mapping, offset);
 	if (page) {
 		page_cache_get(page);
 		if (TestSetPageLocked(page)) {
@@ -447,6 +791,7 @@ struct page *find_or_create_page(struct 
 	int err;
 repeat:
 	page = find_lock_page(mapping, index);
+	page = make_local_replica_lock(mapping, page);
 	if (!page) {
 		if (!cached_page) {
 			cached_page = alloc_page(gfp_mask);
@@ -482,8 +827,9 @@ repeat:
  *
  * find_get_pages() returns the number of pages which were found.
  */
-unsigned int find_get_pages(struct address_space *mapping, pgoff_t start,
-			    unsigned int nr_pages, struct page **pages)
+unsigned int find_get_pages(struct address_space *mapping,
+			    pgoff_t start, unsigned int nr_pages, 
+			    struct page **pages)
 {
 	unsigned int i;
 	unsigned int ret;
@@ -491,8 +837,15 @@ unsigned int find_get_pages(struct addre
 	spin_lock(&mapping->page_lock);
 	ret = radix_tree_gang_lookup(&mapping->page_tree,
 				(void **)pages, start, nr_pages);
-	for (i = 0; i < ret; i++)
+	
+	/* 
+	 * The radix tree lookups return leaves, which must be converted
+	 * to pages 
+	 */
+	for (i = 0; i < ret; i++) {
+		pages[i] = page_cache_leaf_to_page((struct page_cache_leaf *)pages[i], mapping, 1);
 		page_cache_get(pages[i]);
+	}
 	spin_unlock(&mapping->page_lock);
 	return ret;
 }
@@ -1728,6 +2081,7 @@ generic_file_aio_write_nolock(struct kio
 		 */
 		fault_in_pages_readable(buf, bytes);
 
+		collapse_replication(mapping, file);
 		page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
 		if (!page) {
 			status = -ENOMEM;
diff -urp linux-2.6.0-test3-clean/mm/memory.c linux-2.6.0-test3-textrepl/mm/memory.c
--- linux-2.6.0-test3-clean/mm/memory.c	Sat Aug  9 13:09:04 2003
+++ linux-2.6.0-test3-textrepl/mm/memory.c	Sat Aug  9 20:22:18 2003
@@ -959,6 +959,7 @@ static inline void break_cow(struct vm_a
 	invalidate_vcache(address, vma->vm_mm, new_page);
 	flush_cache_page(vma, address);
 	establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
+	BUG_ON(new_page->mapping && (new_page->mapping->replicate > 0));
 }
 
 /*
@@ -1452,8 +1453,13 @@ retry:
 		++mm->rss;
 		flush_icache_page(vma, new_page);
 		entry = mk_pte(new_page, vma->vm_page_prot);
-		if (write_access)
+		if (write_access) {
 			entry = pte_mkwrite(pte_mkdirty(entry));
+			if (new_page->mapping && 
+				(new_page->mapping->replicate > 0)) {
+				BUG();
+			}
+		}
 		set_pte(page_table, entry);
 		pte_chain = page_add_rmap(new_page, page_table, pte_chain);
 		pte_unmap(page_table);
diff -urp linux-2.6.0-test3-clean/mm/mmap.c linux-2.6.0-test3-textrepl/mm/mmap.c
--- linux-2.6.0-test3-clean/mm/mmap.c	Sun Jul 27 10:04:01 2003
+++ linux-2.6.0-test3-textrepl/mm/mmap.c	Wed Aug 13 03:20:55 2003
@@ -523,7 +523,6 @@ unsigned long do_mmap_pgoff(struct file 
 		case MAP_SHARED:
 			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
 				return -EACCES;
-
 			/*
 			 * Make sure we don't allow writing to an append-only
 			 * file..
@@ -545,6 +544,32 @@ unsigned long do_mmap_pgoff(struct file 
 		case MAP_PRIVATE:
 			if (!(file->f_mode & FMODE_READ))
 				return -EACCES;
+			/*
+			 * Most of this code is duplicated about a hundred 
+			 * lines down.  It needs to be consolidated
+			 */
+			if (inode->i_data.replicate == 0) {
+				char buf[100];
+				int error = 1;
+				spin_lock(&inode->i_data.replication_lock);
+				if (inode->i_data.replicate == 0) {
+					inode->i_data.replicate = 1;
+					error = deny_write_access(file);
+				}
+				spin_unlock(&inode->i_data.replication_lock);
+				if (error)
+					break;
+				printk("%s: doing replication i_writecount: %d ",
+						current->comm,
+						atomic_read(&inode->i_writecount));
+				printk ("%s\n", d_path(file->f_dentry,
+					file->f_vfsmnt, &buf[0], 99));
+				if (!list_empty(&inode->i_data.dirty_pages)) {
+					printk("dirty list not empty\n");
+					inode->i_data.replicate = 0;
+					allow_write_access(file);
+				}
+			}
 			break;
 
 		default:
@@ -634,6 +659,22 @@ munmap_back:
 			if (error)
 				goto free_vma;
 			correct_wcount = 1;
+
+			if (inode->i_data.replicate == 0) {
+				char buf[100];
+				printk("%s: doing replication i_writecount: %d ",
+						current->comm,
+						atomic_read(&inode->i_writecount));
+				memset(&buf[0], 0, 100);
+				printk ("%s\n", d_path(file->f_dentry,
+					file->f_vfsmnt, &buf[0], 99));
+				inode->i_data.replicate = 1;
+				deny_write_access(file);
+				if (!list_empty(&inode->i_data.dirty_pages)) {
+					printk("dirty list not empty\n");
+					inode->i_data.replicate = 0;
+				}
+			}
 		}
 		vma->vm_file = file;
 		get_file(file);
diff -urp linux-2.6.0-test3-clean/mm/page_alloc.c linux-2.6.0-test3-textrepl/mm/page_alloc.c
--- linux-2.6.0-test3-clean/mm/page_alloc.c	Sat Aug  9 13:09:04 2003
+++ linux-2.6.0-test3-textrepl/mm/page_alloc.c	Tue Aug 12 21:14:19 2003
@@ -558,7 +558,11 @@ __alloc_pages(unsigned int gfp_mask, uns
 	min = 1UL << order;
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
-
+		
+		if ((__GFP_NODE_STRICT & gfp_mask) && 
+		    (pfn_to_nid(z->zone_start_pfn) != numa_node_id()))
+			continue;
+				
 		min += z->pages_low;
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
diff -urp linux-2.6.0-test3-clean/mm/readahead.c linux-2.6.0-test3-textrepl/mm/readahead.c
--- linux-2.6.0-test3-clean/mm/readahead.c	Sat Aug  9 13:09:04 2003
+++ linux-2.6.0-test3-textrepl/mm/readahead.c	Mon Aug 11 21:46:52 2003
@@ -229,7 +229,7 @@ __do_page_cache_readahead(struct address
 		if (page_offset > end_index)
 			break;
 
-		page = radix_tree_lookup(&mapping->page_tree, page_offset);
+		page = __page_cache_lookup(mapping, page_offset);
 		if (page)
 			continue;
 
diff -urp linux-2.6.0-test3-clean/mm/swap.c linux-2.6.0-test3-textrepl/mm/swap.c
--- linux-2.6.0-test3-clean/mm/swap.c	Sun Jul 27 10:00:40 2003
+++ linux-2.6.0-test3-textrepl/mm/swap.c	Wed Aug 13 03:22:15 2003
@@ -197,6 +197,7 @@ void release_pages(struct page **pages, 
 		}
 		if (TestClearPageLRU(page))
 			del_page_from_lru(zone, page);
+
 		if (page_count(page) == 0) {
 			if (!pagevec_add(&pages_to_free, page)) {
 				spin_unlock_irq(&zone->lru_lock);
diff -urp linux-2.6.0-test3-clean/mm/swap_state.c linux-2.6.0-test3-textrepl/mm/swap_state.c
--- linux-2.6.0-test3-clean/mm/swap_state.c	Sat Aug  9 13:09:04 2003
+++ linux-2.6.0-test3-textrepl/mm/swap_state.c	Wed Aug 13 03:22:29 2003
@@ -25,6 +25,8 @@ extern struct address_space_operations s
 
 struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
+	.replicate	= 0,
+	.replication_lock	= SPIN_LOCK_UNLOCKED,
 	.page_lock	= SPIN_LOCK_UNLOCKED,
 	.clean_pages	= LIST_HEAD_INIT(swapper_space.clean_pages),
 	.dirty_pages	= LIST_HEAD_INIT(swapper_space.dirty_pages),
@@ -198,7 +200,7 @@ int move_to_swap_cache(struct page *page
 	spin_lock(&swapper_space.page_lock);
 	spin_lock(&mapping->page_lock);
 
-	err = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
+	err = __insert_into_page_cache(page, &swapper_space, entry.val);
 	if (!err) {
 		__remove_from_page_cache(page);
 		___add_to_page_cache(page, &swapper_space, entry.val);
@@ -234,7 +236,7 @@ int move_from_swap_cache(struct page *pa
 	spin_lock(&swapper_space.page_lock);
 	spin_lock(&mapping->page_lock);
 
-	err = radix_tree_insert(&mapping->page_tree, index, page);
+	err = __insert_into_page_cache(page, mapping, index);
 	if (!err) {
 		__delete_from_swap_cache(page);
 		___add_to_page_cache(page, mapping, index);

Copyright © 2003, Eklektix, Inc.
Comments and public postings are copyrighted by their creators.
Linux is a registered trademark of Linus Torvalds