From: Dave Hansen <haveblue@us.ibm.com>
To: LSE <lse-tech@lists.sourceforge.net>
Subject: [Lse-tech] [RFC][PATCH] NUMA user page replication
Date: 11 Dec 2003 17:18:27 -0800
Cc: "Martin J. Bligh" <mbligh@aracnet.com>
This is a followup to a previous patch, covered here:
http://lwn.net/Articles/45082/ and here http://lwn.net/Articles/44523/
This version fixes quite a few locking problems with the previous
release. There were some races as replication was turned on and off, as
well as one that would cause unnecessary ETXTBSY errors.
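To illustrate the ETXTBSY fix: while a mapping is replicated, the inode
holds a write deny (i_writecount below zero), so a would-be writer used
to fail outright. The new get_write_access() path instead collapses the
replication and retries. The following is a condensed sketch of that
logic, using the names from the patch below (not the literal hunk):

	spin_lock(&arbitration_lock);
retry:
	if (atomic_read(&inode->i_writecount) < 0) {
		/* if the deny is only due to replication, collapse the
		 * replicas and check again; this can drop and retake
		 * arbitration_lock */
		if (inode_try_replication_disable(inode))
			goto retry;
		spin_unlock(&arbitration_lock);
		return -ETXTBSY;	/* a genuine deny, e.g. a running executable */
	}
	atomic_inc(&inode->i_writecount);
	spin_unlock(&arbitration_lock);
	return 0;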
The readahead code was also determined to be causing problems. Here's
what happened:
* a read-only page is brought in from disk and put into the page cache
on node A
* a task on node B then attempts readahead for that same page
* the readahead code is allowed to insert a !Uptodate() page
* I/O is submitted to fill in node B's copy, duplicating the read
To work around this for now, I simply return -EEXIST to the readahead
code, which is exactly what happened before replication came along. An
alternate approach might be to fill in the copy that node B tried to
insert, from the already existing node A copy.
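Concretely, the guard sits in the replicated version of
__insert_into_page_cache() in the mm/filemap.c hunk below. This is a
simplified sketch of the intent (not the literal hunk): a !Uptodate
readahead page is refused whenever the leaf already holds a copy, so
the duplicate I/O never gets submitted.

	/* nid is the node the incoming page was allocated on */
	if (leaf->pages[nid]) {
		error = -EEXIST;	/* this node already has its copy */
	} else if (leaf->count && !PageUptodate(page)) {
		/* a copy exists elsewhere and this page still needs I/O:
		 * refuse it, exactly as the pre-replication code did */
		error = -EEXIST;
	} else {
		leaf->pages[nid] = page;	/* accept the new (replica) page */
		leaf->count++;
	}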
The patch is against 2.6.0-test11-mjb2:
ftp://ftp.kernel.org/pub/linux/kernel/people/mbligh/2.6.0-test11/patch-2.6.0-test11-mjb2.bz2
--
Dave Hansen
haveblue@us.ibm.com
diff -urp linux-2.6.0-test11-mjb2-clean/arch/i386/Kconfig linux-2.6.0-test11-mjb2-pagerepl/arch/i386/Kconfig
--- linux-2.6.0-test11-mjb2-clean/arch/i386/Kconfig 2003-12-11 16:04:02.000000000 -0800
+++ linux-2.6.0-test11-mjb2-pagerepl/arch/i386/Kconfig 2003-12-11 16:15:14.000000000 -0800
@@ -792,6 +792,17 @@ comment "NUMA (NUMA-Q) requires SMP, 64G
comment "NUMA (Summit) requires SMP, 64GB highmem support, full ACPI"
depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI || ACPI_HT_ONLY)
+config MAPPING_REPLICATE
+ bool " NUMA user text replication"
+ depends on NUMA
+ default y
+ help
+ Selecting this option will allow the NUMA code to make node-local copies
+ of some kinds of read-only files, including executables and shared
+ libraries.
+
+ If unsure, say "n".
+
config DISCONTIGMEM
bool
depends on NUMA
diff -urp linux-2.6.0-test11-mjb2-clean/fs/inode.c linux-2.6.0-test11-mjb2-pagerepl/fs/inode.c
--- linux-2.6.0-test11-mjb2-clean/fs/inode.c 2003-12-11 16:04:23.000000000 -0800
+++ linux-2.6.0-test11-mjb2-pagerepl/fs/inode.c 2003-12-11 16:24:03.000000000 -0800
@@ -196,6 +196,9 @@ void inode_init_once(struct inode *inode
INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
spin_lock_init(&inode->i_lock);
i_size_ordered_init(inode);
+#ifdef CONFIG_MAPPING_REPLICATE
+ atomic_set(&inode->i_data.replicate, 0);
+#endif
}
EXPORT_SYMBOL(inode_init_once);
@@ -993,6 +996,7 @@ void generic_delete_inode(struct inode *
if (inode->i_data.nrpages)
truncate_inode_pages(&inode->i_data, 0);
+ clear_replication(inode);
security_inode_delete(inode);
@@ -1039,6 +1043,8 @@ static void generic_forget_inode(struct
spin_unlock(&inode_lock);
if (inode->i_data.nrpages)
truncate_inode_pages(&inode->i_data, 0);
+
+ clear_replication(inode);
clear_inode(inode);
destroy_inode(inode);
}
diff -urp linux-2.6.0-test11-mjb2-clean/fs/namei.c linux-2.6.0-test11-mjb2-pagerepl/fs/namei.c
--- linux-2.6.0-test11-mjb2-clean/fs/namei.c 2003-12-11 16:03:24.000000000 -0800
+++ linux-2.6.0-test11-mjb2-pagerepl/fs/namei.c 2003-12-11 16:20:44.000000000 -0800
@@ -241,29 +241,80 @@ int permission(struct inode * inode,int
* who will try to move it in struct inode - just leave it here.
*/
static spinlock_t arbitration_lock = SPIN_LOCK_UNLOCKED;
+/*
+ * if the inability to get_write_access() is because
+ * of replication going on, collapse the replication
+ * and try again
+ */
+static int inode_try_replication_disable(struct inode *inode)
+{
+ struct address_space *mapping = inode->i_mapping;
+ if (unlikely(mapping_replicate(inode->i_mapping))) {
+ spin_unlock(&arbitration_lock);
+
+ /* the collapsing is like truncating, and is protected
+ * by i_sem */
+ down(&inode->i_sem);
+ collapse_replication(mapping, NULL);
+ spin_lock(&arbitration_lock);
+ up(&inode->i_sem);
+
+ return 1;
+ }
+ return 0;
+}
int get_write_access(struct inode * inode)
{
spin_lock(&arbitration_lock);
+retry:
if (atomic_read(&inode->i_writecount) < 0) {
+ /* this can drop and reacquire the arbitration_lock */
+ if (inode_try_replication_disable(inode))
+ goto retry;
+
spin_unlock(&arbitration_lock);
return -ETXTBSY;
}
atomic_inc(&inode->i_writecount);
+ BUG_ON(mapping_replicate(inode->i_mapping));
spin_unlock(&arbitration_lock);
return 0;
}
-int deny_write_access(struct file * file)
+int __deny_write_access(struct file * file, int set_replicate)
{
+ struct inode *inode = file->f_dentry->d_inode;
+
spin_lock(&arbitration_lock);
- if (atomic_read(&file->f_dentry->d_inode->i_writecount) > 0) {
+ if (atomic_read(&inode->i_writecount) > 0) {
spin_unlock(&arbitration_lock);
+ printk("%s() failed replicate: %d\n",
+ __func__,
+ mapping_replicate(inode->i_mapping));
+ dump_stack();
return -ETXTBSY;
}
- atomic_dec(&file->f_dentry->d_inode->i_writecount);
+ atomic_dec(&inode->i_writecount);
+
+ /*
+ * this is done under the arbitration_lock to prevent any
+ * races where a potential writer might not see that
+ * writing is denied because of replication, and not just
+ * a normal write deny.
+ */
+#ifdef CONFIG_MAPPING_REPLICATE
+ if (set_replicate && !mapping_replicate(inode->i_mapping))
+ atomic_inc(&inode->i_data.replicate);
+#endif
+
spin_unlock(&arbitration_lock);
return 0;
}
+int deny_write_access(struct file * file)
+{
+ return __deny_write_access(file, 0);
+}
+
void path_release(struct nameidata *nd)
{
dput(nd->dentry);
diff -urp linux-2.6.0-test11-mjb2-clean/include/asm-i386/mmzone.h linux-2.6.0-test11-mjb2-pagerepl/include/asm-i386/mmzone.h
--- linux-2.6.0-test11-mjb2-clean/include/asm-i386/mmzone.h 2003-12-11 16:04:04.000000000 -0800
+++ linux-2.6.0-test11-mjb2-pagerepl/include/asm-i386/mmzone.h 2003-12-11 16:17:49.000000000 -0800
@@ -149,5 +149,7 @@ static inline void get_memcfg_numa(void)
get_memcfg_numa_flat();
}
+#define page_is_local(page) (page_to_nid(page) == numa_node_id())
+
#endif /* CONFIG_DISCONTIGMEM */
#endif /* _ASM_MMZONE_H_ */
diff -urp linux-2.6.0-test11-mjb2-clean/include/linux/fs.h linux-2.6.0-test11-mjb2-pagerepl/include/linux/fs.h
--- linux-2.6.0-test11-mjb2-clean/include/linux/fs.h 2003-12-11 16:04:23.000000000 -0800
+++ linux-2.6.0-test11-mjb2-pagerepl/include/linux/fs.h 2003-12-11 16:29:31.000000000 -0800
@@ -339,8 +339,22 @@ struct address_space {
#ifdef CONFIG_NUMA
struct binding *binding; /* for memory bindings */
#endif
+#ifdef CONFIG_MAPPING_REPLICATE
+ atomic_t replicate;
+#endif
};
+#ifdef CONFIG_MAPPING_REPLICATE
+ #define mapping_replicate(mapping) (atomic_read(&(mapping)->replicate) > 0)
+ #define clear_replication(inode) do { \
+ if (atomic_read(&inode->i_data.replicate)) \
+ atomic_dec(&inode->i_data.replicate); \
+ } while (0)
+#else
+ #define mapping_replicate(mapping) (0)
+ #define clear_replication(inode) do {} while(0)
+#endif
+
struct block_device {
dev_t bd_dev; /* not a kdev_t - it's a search key */
struct inode * bd_inode; /* will die */
@@ -1202,9 +1216,11 @@ static inline void invalidate_remote_ino
}
extern void invalidate_inode_pages2(struct address_space *mapping);
extern void write_inode_now(struct inode *, int);
+extern int file_try_replicate(struct file *file);
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_flush(struct address_space *);
extern int filemap_fdatawait(struct address_space *);
+extern void collapse_replication(struct address_space *mapping, struct file *file);
extern void sync_supers(void);
extern void sync_filesystems(int wait);
extern void emergency_sync(void);
@@ -1218,6 +1234,7 @@ extern int permission(struct inode *, in
extern int vfs_permission(struct inode *, int);
extern int get_write_access(struct inode *);
extern int deny_write_access(struct file *);
+extern int __deny_write_access(struct file *, int);
static inline void put_write_access(struct inode * inode)
{
atomic_dec(&inode->i_writecount);
diff -urp linux-2.6.0-test11-mjb2-clean/include/linux/pagemap.h linux-2.6.0-test11-mjb2-pagerepl/include/linux/pagemap.h
--- linux-2.6.0-test11-mjb2-clean/include/linux/pagemap.h 2003-12-11 16:04:04.000000000 -0800
+++ linux-2.6.0-test11-mjb2-pagerepl/include/linux/pagemap.h 2003-12-11 16:15:14.000000000 -0800
@@ -96,6 +96,9 @@ extern struct page * find_or_create_page
extern unsigned int find_get_pages(struct address_space *mapping,
pgoff_t start, unsigned int nr_pages,
struct page **pages);
+extern int find_get_replica_pages(struct address_space *mapping,
+ pgoff_t start, unsigned int nr_pages,
+ struct page **pages);
/*
* Returns locked page at given index in given cache, creating it if needed.
@@ -118,7 +121,10 @@ int add_to_page_cache(struct page *page,
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
unsigned long index, int gfp_mask);
extern void remove_from_page_cache(struct page *page);
+extern int __insert_into_page_cache(struct page *page, struct address_space *mapping,
+ pgoff_t offset);
extern void __remove_from_page_cache(struct page *page);
+extern struct page *__page_cache_lookup(struct address_space *mapping, pgoff_t offset);
extern atomic_t nr_pagecache;
diff -urp linux-2.6.0-test11-mjb2-clean/include/linux/pagevec.h linux-2.6.0-test11-mjb2-pagerepl/include/linux/pagevec.h
--- linux-2.6.0-test11-mjb2-clean/include/linux/pagevec.h 2003-12-11 16:03:14.000000000 -0800
+++ linux-2.6.0-test11-mjb2-pagerepl/include/linux/pagevec.h 2003-12-11 16:15:14.000000000 -0800
@@ -24,6 +24,8 @@ void __pagevec_lru_add_active(struct pag
void pagevec_strip(struct pagevec *pvec);
unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
pgoff_t start, unsigned int nr_pages);
+unsigned int pagevec_lookup_replicas(struct pagevec *pvec,
+ struct address_space *mapping, unsigned int nr_pages);
static inline void pagevec_init(struct pagevec *pvec, int cold)
{
diff -urp linux-2.6.0-test11-mjb2-clean/include/linux/radix-tree.h linux-2.6.0-test11-mjb2-pagerepl/include/linux/radix-tree.h
--- linux-2.6.0-test11-mjb2-clean/include/linux/radix-tree.h 2003-12-11 16:03:14.000000000 -0800
+++ linux-2.6.0-test11-mjb2-pagerepl/include/linux/radix-tree.h 2003-12-11 16:15:14.000000000 -0800
@@ -41,7 +41,7 @@ do { \
(root)->rnode = NULL; \
} while (0)
-extern int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+extern void *radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
extern void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
extern void *radix_tree_delete(struct radix_tree_root *, unsigned long);
extern unsigned int
diff -urp linux-2.6.0-test11-mjb2-clean/init/main.c linux-2.6.0-test11-mjb2-pagerepl/init/main.c
--- linux-2.6.0-test11-mjb2-clean/init/main.c 2003-12-11 16:04:04.000000000 -0800
+++ linux-2.6.0-test11-mjb2-pagerepl/init/main.c 2003-12-11 16:15:14.000000000 -0800
@@ -83,6 +83,7 @@ extern void pidhash_init(void);
extern void pidmap_init(void);
extern void pte_chain_init(void);
extern void radix_tree_init(void);
+extern void page_cache_leaf_init(void);
extern void free_initmem(void);
extern void populate_rootfs(void);
extern void driver_init(void);
@@ -456,6 +457,7 @@ asmlinkage void __init start_kernel(void
security_scaffolding_startup();
vfs_caches_init(num_physpages);
radix_tree_init();
+ page_cache_leaf_init();
signals_init();
/* rootfs populating might need page-writeback */
page_writeback_init();
diff -urp linux-2.6.0-test11-mjb2-clean/kernel/sched.c linux-2.6.0-test11-mjb2-pagerepl/kernel/sched.c
--- linux-2.6.0-test11-mjb2-clean/kernel/sched.c 2003-12-11 16:04:04.000000000 -0800
+++ linux-2.6.0-test11-mjb2-pagerepl/kernel/sched.c 2003-12-11 16:15:14.000000000 -0800
@@ -2529,6 +2529,8 @@ asmlinkage long sys_sched_setaffinity(pi
int retval;
task_t *p;
+ printk("%s(%d, %d, %08lx)\n", __func__, pid, len, user_mask_ptr ? (unsigned long)-1 : *user_mask_ptr);
+
if (len < sizeof(new_mask))
return -EINVAL;
diff -urp linux-2.6.0-test11-mjb2-clean/lib/radix-tree.c linux-2.6.0-test11-mjb2-pagerepl/lib/radix-tree.c
--- linux-2.6.0-test11-mjb2-clean/lib/radix-tree.c 2003-12-11 16:03:28.000000000 -0800
+++ linux-2.6.0-test11-mjb2-pagerepl/lib/radix-tree.c 2003-12-11 16:15:14.000000000 -0800
@@ -18,6 +18,7 @@
*/
#include <linux/errno.h>
+#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -168,8 +169,11 @@ static int radix_tree_extend(struct radi
* @item: item to insert
*
* Insert an item into the radix tree at position @index.
+ *
+ * If the insertion fails because a duplicate element is present,
+ * return that element.
*/
-int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item)
+void *radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item)
{
struct radix_tree_node *node = NULL, *tmp, **slot;
unsigned int height, shift;
@@ -179,7 +183,7 @@ int radix_tree_insert(struct radix_tree_
if (index > radix_tree_maxindex(root->height)) {
error = radix_tree_extend(root, index);
if (error)
- return error;
+ return ERR_PTR(error);
}
slot = &root->rnode;
@@ -190,7 +194,7 @@ int radix_tree_insert(struct radix_tree_
if (*slot == NULL) {
/* Have to add a child node. */
if (!(tmp = radix_tree_node_alloc(root)))
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
*slot = tmp;
if (node)
node->count++;
@@ -205,7 +209,7 @@ int radix_tree_insert(struct radix_tree_
}
if (*slot != NULL)
- return -EEXIST;
+ return *slot; /* used to be -EEXIST */
if (node)
node->count++;
diff -urp linux-2.6.0-test11-mjb2-clean/mm/filemap.c linux-2.6.0-test11-mjb2-pagerepl/mm/filemap.c
--- linux-2.6.0-test11-mjb2-clean/mm/filemap.c 2003-12-11 16:04:23.000000000 -0800
+++ linux-2.6.0-test11-mjb2-pagerepl/mm/filemap.c 2003-12-11 16:27:58.000000000 -0800
@@ -10,6 +10,7 @@
* the NFS filesystem used to do this differently, for example)
*/
#include <linux/config.h>
+#include <linux/err.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/compiler.h>
@@ -91,6 +92,254 @@
*/
/*
+ * If replication is on, only the node-local page will be returned. If
+ * there is not a local page, it will not find anything.
+ *
+ * If search_type is PAGE_ANY, a search for all pages will be done even if
+ * replication is on. This is useful when we're trying to make a
+ * local copy of the page and we just want any old copy of it.
+ */
+enum page_search {
+ PAGE_LOCAL,
+ PAGE_ANY
+};
+
+#ifndef CONFIG_MAPPING_REPLICATE
+/*
+ * This is an attempt to keep the overhead when not doing replication
+ * to a bare minimum. Instead of storing a real page_cache_leaf in
+ * the radix tree, a plain page pointer is stored.
+ *
+ * This abstraction allows more common code to be used for both the
+ * replicated, and non-replicated cases.
+ */
+struct page_cache_leaf {
+ struct page page;
+};
+
+struct page *page_cache_leaf_to_page(struct page_cache_leaf *leaf,
+ struct address_space *mapping, enum page_search search_type)
+{
+ return &leaf->page;
+}
+
+#define leaf_free(leaf) do {} while (0)
+#define leaf_preload(gfpflags) (0)
+
+static inline struct page *make_local_replica_lock(struct address_space *mapping,
+ struct page *page)
+{
+ return page;
+}
+
+static inline void drop_replica_pages(struct address_space *mapping)
+{
+}
+void collapse_replication(struct address_space *mapping,
+ struct file *file)
+{
+}
+static inline struct page *make_local_replica(struct address_space *mapping, struct page *page, struct page_cache_leaf *leaf)
+{
+ return page;
+}
+
+#else /* CONFIG_MAPPING_REPLICATE */
+
+struct page_cache_leaf {
+ struct page* pages[MAX_NUMNODES];
+ /*
+ * This doesn't need to be an atomic because it's always
+ * modified under mapping->page_lock
+ */
+ int count;
+ /*
+ * the duplicate_lock is not here to prevent any harmful races; it
+ * keeps collision overhead to a minimum.
+ *
+ * When 2 CPUs on the same node get into find_get_page() together, they
+ * can both try to make a copy at the same time. One is bound to get
+ * -EEXIST and back off properly, but copying that page is expensive.
+ * Better to just spin on this and wait for the other cpu to do the copy.
+ *
+ * This lock could be per-node.
+ */
+ spinlock_t duplicate_lock;
+};
+
+DEFINE_PER_CPU(struct page_cache_leaf *, page_cache_leaf_preloads) = { 0, };
+static kmem_cache_t *page_cache_leaf_cachep;
+
+static inline void leaf_free(struct page_cache_leaf *leaf)
+{
+ struct page_cache_leaf **preload;
+ preload = &get_cpu_var(page_cache_leaf_preloads);
+ if (!*preload)
+ *preload = leaf;
+ else
+ kmem_cache_free(page_cache_leaf_cachep, leaf);
+ put_cpu_var(page_cache_leaf_preloads);
+}
+
+void page_cache_leaf_ctor(void *node, kmem_cache_t *cachep, unsigned long flags)
+{
+ struct page_cache_leaf *leaf = node;
+ memset(node, 0, sizeof(struct page_cache_leaf));
+ spin_lock_init(&leaf->duplicate_lock);
+}
+
+int leaf_preload(int gfp_mask)
+{
+ struct page_cache_leaf **preload;
+ int error = 0;
+
+ preload = &get_cpu_var(page_cache_leaf_preloads);
+ if (!*preload)
+ *preload = kmem_cache_alloc(page_cache_leaf_cachep, gfp_mask);
+ if (!*preload)
+ error = -ENOMEM;
+
+ put_cpu_var(page_cache_leaf_preloads);
+
+ return error;
+}
+
+/*
+ * for the non-numa case, this can just cast *leaf to a page and return
+ */
+struct page *page_cache_leaf_to_page(struct page_cache_leaf *leaf,
+ struct address_space *mapping, enum page_search search_type)
+{
+ struct page *page = NULL;
+ int nid = numa_node_id();
+
+ /* Always look for a local copy first */
+ if (mapping_replicate(mapping))
+ page = leaf->pages[nid];
+
+ if (!page && (!mapping_replicate(mapping) || (search_type == PAGE_ANY)))
+ for (nid = 0; nid < numnodes; nid++) {
+ page = leaf->pages[nid];
+ if (page)
+ break;
+ }
+ return page;
+}
+#endif
+
+void __init page_cache_leaf_init(void)
+{
+#ifdef CONFIG_MAPPING_REPLICATE
+ page_cache_leaf_cachep = kmem_cache_create("page_cache_leaf",
+ sizeof(struct page_cache_leaf), 0,
+ 0, page_cache_leaf_ctor, NULL);
+ if (!page_cache_leaf_cachep)
+ panic ("Failed to create page_cache_leaf cache\n");
+#endif
+}
+
+#ifndef CONFIG_MAPPING_REPLICATE
+int __insert_into_page_cache(struct page *page, struct address_space *mapping,
+ pgoff_t offset)
+{
+ struct page_cache_leaf *leaf, *errptr;
+ int error = 0;
+
+ leaf = container_of(page, struct page_cache_leaf, page);
+ errptr = radix_tree_insert(&mapping->page_tree, offset, leaf);
+
+ if (IS_ERR(errptr))
+ error = PTR_ERR(errptr);
+
+ return error;
+}
+#else
+int __insert_into_page_cache(struct page *page, struct address_space *mapping,
+ pgoff_t offset)
+{
+ int error = 0;
+ int nid;
+ struct page_cache_leaf *leaf, **newleaf;
+
+ nid = page_to_nid(page);
+
+ /*
+ * If the leaf preload allocation failed, then at least check the
+ * tree to see if a leaf is already present. If one is present,
+ * then we got lucky and didn't really need to allocate anything.
+ *
+ * If that lookup *fails*, then we were really out of memory and
+ * error out.
+ */
+ newleaf = &get_cpu_var(page_cache_leaf_preloads);
+ if (*newleaf)
+ leaf = radix_tree_insert(&mapping->page_tree, offset, *newleaf);
+ else {
+ leaf = radix_tree_lookup(&mapping->page_tree, offset);
+ if (!leaf)
+ leaf = ERR_PTR(-ENOMEM);
+ }
+
+ if (IS_ERR(leaf)) {
+ error = PTR_ERR(leaf);
+ goto out;
+ }
+
+ /* there's already a leaf node there */
+ if (!mapping_replicate(mapping) && leaf) {
+ error = -EEXIST;
+ goto out;
+ }
+
+ /* successful insertion, absorb the preloaded leaf */
+ if (!leaf) {
+ leaf = *newleaf;
+ *newleaf = NULL;
+ }
+
+ /*
+ * A !PageUptodate() page will have some I/O done on it shortly.
+ * The readahead code puts pages like that in here. If
+ * there's a replica available, don't bother putting the
+ * page in, because the I/O is a duplicate.
+ */
+ if (leaf->pages[nid]) {
+ error = -EEXIST;
+ } else {
+ /*
+ * Instead of -EEXIST, we could look for an
+ * Uptodate copy, and use that to make this
+ * page Uptodate, making a local replica.
+ */
+ if (leaf->count > 1 && !PageUptodate(page)) {
+ error = -EEXIST;
+ } else {
+ leaf->pages[nid] = page;
+ leaf->count++;
+ }
+ }
+out:
+ put_cpu_var(page_cache_leaf_preloads);
+ return error;
+}
+#endif
+
+struct page*
+__page_cache_lookup(struct address_space *mapping, pgoff_t offset)
+{
+ struct page *page = NULL;
+ struct page_cache_leaf *leaf;
+
+ leaf = radix_tree_lookup(&mapping->page_tree, offset);
+ if (!leaf)
+ goto out;
+
+ page = page_cache_leaf_to_page(leaf, mapping, PAGE_ANY);
+out:
+ return page;
+}
+
+/*
* Remove a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
* is safe. The caller must hold a write_lock on the mapping's page_lock.
@@ -98,8 +347,21 @@
void __remove_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
-
- radix_tree_delete(&mapping->page_tree, page->index);
+#ifdef CONFIG_MAPPING_REPLICATE
+ struct page_cache_leaf *leaf;
+ leaf = radix_tree_lookup(&mapping->page_tree, page->index);
+ leaf->pages[page_to_nid(page)] = NULL;
+ if (--leaf->count == 0) {
+#endif
+ radix_tree_delete(&mapping->page_tree, page->index);
+#ifdef CONFIG_MAPPING_REPLICATE
+ /*
+ * if there is a free preload slot for this CPU, put the
+ * leaf back there instead of freeing it
+ */
+ leaf_free(leaf);
+ }
+#endif
list_del(&page->list);
page->mapping = NULL;
@@ -128,6 +390,22 @@ static inline int sync_page(struct page
return 0;
}
+#ifdef CONFIG_MAPPING_REPLICATE
+/*
+ * synchronized by i_sem
+ */
+extern void drop_replica_pages(struct address_space *mapping);
+inline void collapse_replication(struct address_space *mapping,
+ struct file *file)
+{
+ if (mapping_replicate(mapping)) {
+ atomic_dec(&mapping->replicate);
+ drop_replica_pages(mapping);
+ atomic_inc(&mapping->host->i_writecount);
+ }
+}
+#endif
+
/**
* filemap_fdatawrite - start writeback against all of a mapping's dirty pages
* @mapping: address space structure to write
@@ -251,10 +529,16 @@ int add_to_page_cache(struct page *page,
{
int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+ if (error != 0)
+ goto err;
+
+ /* this benefits from the radix_tree_preload()'s preempt_disable() */
+ error = leaf_preload(gfp_mask & ~__GFP_HIGHMEM);
+
if (error == 0) {
page_cache_get(page);
spin_lock(&mapping->page_lock);
- error = radix_tree_insert(&mapping->page_tree, offset, page);
+ error = __insert_into_page_cache(page, mapping, offset);
if (!error) {
SetPageLocked(page);
___add_to_page_cache(page, mapping, offset);
@@ -264,11 +548,17 @@ int add_to_page_cache(struct page *page,
spin_unlock(&mapping->page_lock);
radix_tree_preload_end();
}
+err:
return error;
}
EXPORT_SYMBOL(add_to_page_cache);
+/*
+ * The pages will *not* be added to the LRU immediately. They're only
+ * added after the entire pagevec is filled up. Don't worry, they'll
+ * get there eventually.
+ */
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t offset, int gfp_mask)
{
@@ -385,24 +675,236 @@ void __lock_page(struct page *page)
EXPORT_SYMBOL(__lock_page);
+#ifdef CONFIG_MAPPING_REPLICATE
/*
- * a rather lightweight function, finding and getting a reference to a
- * hashed page atomically.
+ * This is fairly lazy with preemption to make the code simpler. It doesn't
+ * need to be perfect. Making a local replica is by no means required. If the
+ * replica page allocation fails, one of two things happens:
+ * 1. page cache returns non-local page, which gets mapped in somewhere.
+ * things are slightly slower
+ * 2. page cache returns NULL even though there was a page in the cache.
+ * I/O is resubmitted for the page, and a replica is made with
+ * the new data.
+ */
+DEFINE_PER_CPU(struct page *, replica_preloads) = { NULL, };
+void refill_replica_page_cpu(void)
+{
+ int cpu = get_cpu();
+ int nid = cpu_to_node(cpu);
+ unsigned int gfp_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_NODE_STRICT;
+ struct page **page = &__get_cpu_var(replica_preloads);
+
+ if (!*page)
+ *page = alloc_pages_node(nid, gfp_mask, 0);
+
+ put_cpu();
+}
+
+/* I want to see this in the profiles */
+void make_local_replica_copy(struct page *dst, struct page *src)
+{
+ if (!page_is_local(dst)) {
+ printk("%s(): %d dst not local: %08lx src: %08lx\n",
+ __func__, smp_processor_id(),
+ page_to_pfn(dst), page_to_pfn(src));
+ }
+ BUG_ON(!PageUptodate(src));
+ copy_highpage(dst, src);
+}
+
+static struct page *__make_local_replica(struct address_space *mapping, struct page *page) {
+ struct page *copy = page;
+ struct page **prealloc;
+ int err;
+
+ if (!page)
+ goto out;
+
+ if (!mapping_replicate(mapping))
+ goto out;
+
+ /* something is probably writing into the source page
+ * do *not* wait for this to get unlocked. We're under
+ * a lock here. Just punt on the copy. */
+ if (TestSetPageLocked(page))
+ goto out;
+
+ /* the old page got unhashed since we pulled it out */
+ if (page->mapping != mapping) {
+ unlock_page(page);
+ goto out;
+ }
+
+ prealloc = &get_cpu_var(replica_preloads);
+ if (*prealloc) {
+ copy = *prealloc;
+ *prealloc = NULL;
+ }
+ put_cpu_var(replica_preloads);
+
+ if (!copy)
+ goto out;
+
+ make_local_replica_copy(copy, page);
+ /*
+ * Do this now so that add_to_page_cache_lru() won't confuse this
+ * with a readahead page that should get -EEXIST instead of just
+ * getting added.
+ */
+ SetPageUptodate(copy);
+
+ /*
+ * This should never actually have to allocate memory. It will
+ * be able to add the page to the already existing leaf. The
+ * leaf can't go away because we hold a ref count on the source
+ * page.
+ */
+ err = add_to_page_cache_lru(copy, mapping, page->index, GFP_ATOMIC);
+ unlock_page(page);
+ switch (err) {
+ case 0:
+ unlock_page(copy);
+ break;
+ case -EEXIST:
+ page_cache_release(copy);
+ goto out;
+ default:
+ printk("%s(): ?? %d\n", __FUNCTION__, err);
+ page_cache_release(copy);
+ dump_stack();
+ goto out;
+ }
+ return copy;
+out:
+ return page;
+}
+
+
+/*
+ * We can not be making copies of pages that aren't up to date yet,
+ * so this function makes sure of that.
+ *
+ * Instead of just returning the information that the page is
+ * unusable, it could go looking for other sources for the page, perhaps
+ * another node.
+ *
+ * The logic for this was taken from read_cache_page()
*/
-struct page * find_get_page(struct address_space *mapping, unsigned long offset)
+static inline int replica_source_uptodate(struct address_space *mapping, struct page *page)
{
- struct page *page;
+ int ret = 1;
+
+ if (likely(PageUptodate(page)))
+ goto out;
+
+ lock_page(page);
+ if (!PageUptodate(page) || !page->mapping)
+ ret = 0;
+ unlock_page(page);
+out:
+ return ret;
+}
+/*
+ * This needs to be called without mapping->page_lock held
+ */
+static inline struct page *make_local_replica(struct address_space *mapping, struct page *page, struct page_cache_leaf *leaf)
+{
+ struct page *copy;
+
+ if (page_is_local(page))
+ return page;
+
+ /*
+ * if there's a problem with the source page, don't make a copy
+ * of it. The caller will fix this up.
+ */
+ if (!replica_source_uptodate(mapping, page))
+ return page;
+
+ refill_replica_page_cpu();
+
+ spin_lock(&leaf->duplicate_lock);
+ /*
+ * now that we have the lock, do a crude check to see if anyone
+ * else has filled in the page we were looking for
+ */
+ if (mapping_replicate(mapping) &&
+ leaf->pages[numa_node_id()]) {
+ spin_unlock(&leaf->duplicate_lock);
+ page_cache_release(page);
+ return NULL;
+ }
+ copy = __make_local_replica(mapping, page);
+ spin_unlock(&leaf->duplicate_lock);
+
+ if (copy != page) {
+ page_cache_release(page);
+ return copy;
+ }
+
+ return page;
+}
+
+
+static struct page *make_local_replica_lock(struct address_space *mapping, struct page *page) {
+ struct page *copy;
+ return page;
+
+ refill_replica_page_cpu();
+ copy = __make_local_replica(mapping, page);
+
+ /*
+ * this is the cowardly way to do it. Add the new copy, and pray
+ * that it shows up :) If the replication appears to have worked,
+ * drop the references to the source page. If the new page
+ * got removed in the meantime, find_lock_page() will just
+ * redo the locking anyway.
+ */
+ if (copy != page) {
+ unlock_page(page);
+ page_cache_release(page);
+ copy = find_lock_page(mapping, page->index);
+ }
+
+ return copy;
+}
+#endif
+/*
+ * With no page replication, this is a rather lightweight function for
+ * finding and getting a reference to a hashed page atomically.
+ *
+ * When replicating pages, this becomes the place where the source for
+ * copies is found and the new copy is made.
+ */
+struct page * find_get_page(struct address_space *mapping, unsigned long offset)
+{
+ struct page_cache_leaf *leaf;
+ struct page *page, *copy;
/*
* We scan the hash list read-only. Addition to and removal from
* the hash-list needs a held write-lock.
*/
+
+repeat:
spin_lock(&mapping->page_lock);
- page = radix_tree_lookup(&mapping->page_tree, offset);
- if (page)
- page_cache_get(page);
+ leaf = radix_tree_lookup(&mapping->page_tree, offset);
+ /* nothing found */
+ if (!leaf) {
+ spin_unlock(&mapping->page_lock);
+ return NULL;
+ }
+
+ page = page_cache_leaf_to_page(leaf, mapping, PAGE_ANY);
+ page_cache_get(page);
spin_unlock(&mapping->page_lock);
- return page;
+
+ /* A NULL in this context is like -EEXIST. Try again. */
+ copy = make_local_replica(mapping, page, leaf);
+ if (!copy)
+ goto repeat;
+
+ return copy;
}
EXPORT_SYMBOL(find_get_page);
@@ -415,10 +917,11 @@ struct page *find_trylock_page(struct ad
struct page *page;
spin_lock(&mapping->page_lock);
- page = radix_tree_lookup(&mapping->page_tree, offset);
+ page = __page_cache_lookup(mapping, offset);
if (page && TestSetPageLocked(page))
page = NULL;
spin_unlock(&mapping->page_lock);
+ page = make_local_replica_lock(mapping, page);
return page;
}
@@ -442,12 +945,13 @@ struct page *find_lock_page(struct addre
spin_lock(&mapping->page_lock);
repeat:
- page = radix_tree_lookup(&mapping->page_tree, offset);
+ page = __page_cache_lookup(mapping, offset);
if (page) {
page_cache_get(page);
if (TestSetPageLocked(page)) {
spin_unlock(&mapping->page_lock);
lock_page(page);
+ page = make_local_replica_lock(mapping, page);
spin_lock(&mapping->page_lock);
/* Has the page been truncated while we slept? */
@@ -489,6 +993,8 @@ struct page *find_or_create_page(struct
int err;
repeat:
page = find_lock_page(mapping, index);
+ /* this only locks if a replica is made */
+ page = make_local_replica_lock(mapping, page);
if (!page) {
if (!cached_page) {
cached_page = alloc_page(gfp_mask);
@@ -526,22 +1032,85 @@ EXPORT_SYMBOL(find_or_create_page);
*
* find_get_pages() returns the number of pages which were found.
*/
-unsigned int find_get_pages(struct address_space *mapping, pgoff_t start,
- unsigned int nr_pages, struct page **pages)
+unsigned int find_get_pages(struct address_space *mapping,
+ pgoff_t start, unsigned int nr_pages,
+ struct page **pages)
{
- unsigned int i;
unsigned int ret;
+ int i;
spin_lock(&mapping->page_lock);
+
ret = radix_tree_gang_lookup(&mapping->page_tree,
(void **)pages, start, nr_pages);
- for (i = 0; i < ret; i++)
+
+ for (i = 0; i < ret; i++) {
+ /*
+ * The radix tree lookups return leaves, which
+ * must be converted to pages
+ */
+ struct page_cache_leaf * leaf = (struct page_cache_leaf *)pages[i];
+ pages[i] = page_cache_leaf_to_page(leaf, mapping, PAGE_ANY);
page_cache_get(pages[i]);
+ }
spin_unlock(&mapping->page_lock);
return ret;
}
/*
+ * This is used to find _just_ the replicated pages. It's
+ * used when we need to write to something where replication
+ * is active.
+ */
+int find_get_replica_pages(struct address_space *mapping,
+ pgoff_t start, unsigned int nr_pages,
+ struct page **pages)
+{
+#ifdef CONFIG_MAPPING_REPLICATE
+ unsigned int nid = numa_node_id();
+ unsigned int nr_leaves;
+ struct page_cache_leaf *leaf;
+ struct page_cache_leaf **leaves = (struct page_cache_leaf **)pages;
+ int pages_seen;
+ int i, j;
+
+ /*
+ * this is the number of leaves which have been converted
+ * to pages to be returned. Any array index below this
+ * number holds a page; any at or above it still holds a leaf
+ */
+ int nr_ret_pages = 0;
+
+ spin_lock(&mapping->page_lock);
+
+ nr_leaves = radix_tree_gang_lookup(&mapping->page_tree,
+ (void **)leaves, start, nr_pages);
+ for (i = 0; i < nr_leaves; i++) {
+ leaf = leaves[i];
+ if (leaf->count <= 1)
+ continue;
+
+ for (j=0, pages_seen = 0;
+ j < MAX_NUMNODES && pages_seen < leaf->count;
+ j++) {
+ if (j == nid || !leaf->pages[j])
+ continue;
+ pages[nr_ret_pages] = leaf->pages[j];
+ page_cache_get(pages[nr_ret_pages]);
+ pages_seen++;
+ nr_ret_pages++;
+ }
+ if (i < nr_ret_pages)
+ i = nr_ret_pages; /* don't forget i++ */
+ }
+ spin_unlock(&mapping->page_lock);
+ return nr_ret_pages;
+#else
+ return 0;
+#endif
+}
+
+/*
* Same as grab_cache_page, but do not wait if the page is unavailable.
* This is intended for speculative data generators, where the data can
* be regenerated if the page couldn't be grabbed. This routine should
@@ -1806,6 +2375,7 @@ generic_file_aio_write_nolock(struct kio
*/
fault_in_pages_readable(buf, bytes);
+ collapse_replication(mapping, file);
page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
if (!page) {
status = -ENOMEM;
@@ -1987,3 +2557,38 @@ out:
}
EXPORT_SYMBOL_GPL(generic_file_direct_IO);
+
+/*
+ * Some of this code is a bit redundant in the case where we're replicating
+ * an executable. It does a deny_write_access() just before this is called
+ * so this deny_write_access()'s error checking is unnecessary in that case.
+ *
+ * For overall reduction of code and cleanliness, we do a little extra here
+ */
+
+int file_try_replicate(struct file *file)
+{
+#ifdef CONFIG_MAPPING_REPLICATE
+ struct inode *inode = file ? file->f_dentry->d_inode : NULL;
+ int error = 1;
+ down(&inode->i_sem);
+ if (!mapping_replicate(inode->i_mapping)) {
+ error = __deny_write_access(file, 1);
+ if (error)
+ goto out_fail;
+
+ /*
+ * there used to be a check here for dirty pages. it
+ * was incorrect. dirty pages are allowed, the only
+ * real problem is !Uptodate pages.
+ */
+ BUG_ON(atomic_read(&inode->i_writecount) >= 0);
+ up(&inode->i_sem);
+ return 1;
+ }
+
+out_fail:
+ up(&inode->i_sem);
+#endif
+ return 0;
+}
diff -urp linux-2.6.0-test11-mjb2-clean/mm/memory.c linux-2.6.0-test11-mjb2-pagerepl/mm/memory.c
--- linux-2.6.0-test11-mjb2-clean/mm/memory.c 2003-12-11 16:04:04.000000000 -0800
+++ linux-2.6.0-test11-mjb2-pagerepl/mm/memory.c 2003-12-11 16:15:15.000000000 -0800
@@ -1496,8 +1496,11 @@ retry:
inc_rss(mm, new_page);
flush_icache_page(vma, new_page);
entry = mk_pte(new_page, vma->vm_page_prot);
- if (write_access)
+ if (write_access) {
entry = pte_mkwrite(pte_mkdirty(entry));
+ BUG_ON(new_page->mapping &&
+ mapping_replicate(new_page->mapping));
+ }
set_pte(page_table, entry);
pte_chain = page_add_rmap(new_page, page_table, pte_chain);
pte_unmap(page_table);
diff -urp linux-2.6.0-test11-mjb2-clean/mm/mmap.c linux-2.6.0-test11-mjb2-pagerepl/mm/mmap.c
--- linux-2.6.0-test11-mjb2-clean/mm/mmap.c 2003-12-11 16:04:04.000000000 -0800
+++ linux-2.6.0-test11-mjb2-pagerepl/mm/mmap.c 2003-12-11 16:15:15.000000000 -0800
@@ -543,6 +543,7 @@ unsigned long do_mmap_pgoff(struct file
inode = file ? file->f_dentry->d_inode : NULL;
if (file) {
+ int try_to_replicate = 1;
switch (flags & MAP_TYPE) {
case MAP_SHARED:
if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
@@ -564,11 +565,19 @@ unsigned long do_mmap_pgoff(struct file
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
-
+ /*
+ * If this is set, there is a possibility of a conversion
+ * to a writeable area later. Do not replicate
+ */
+ if (vm_flags & VM_MAYWRITE)
+ try_to_replicate = 0;
+
/* fall through */
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
+ if (try_to_replicate)
+ file_try_replicate(file);
break;
default:
@@ -661,6 +670,7 @@ munmap_back:
if (error)
goto free_vma;
correct_wcount = 1;
+ file_try_replicate(file);
}
vma->vm_file = file;
get_file(file);
diff -urp linux-2.6.0-test11-mjb2-clean/mm/readahead.c linux-2.6.0-test11-mjb2-pagerepl/mm/readahead.c
--- linux-2.6.0-test11-mjb2-clean/mm/readahead.c 2003-12-11 16:04:23.000000000 -0800
+++ linux-2.6.0-test11-mjb2-pagerepl/mm/readahead.c 2003-12-11 16:15:15.000000000 -0800
@@ -236,7 +236,7 @@ __do_page_cache_readahead(struct address
if (page_offset > end_index)
break;
- page = radix_tree_lookup(&mapping->page_tree, page_offset);
+ page = __page_cache_lookup(mapping, page_offset);
if (page)
continue;
diff -urp linux-2.6.0-test11-mjb2-clean/mm/swap.c linux-2.6.0-test11-mjb2-pagerepl/mm/swap.c
--- linux-2.6.0-test11-mjb2-clean/mm/swap.c 2003-12-11 16:03:28.000000000 -0800
+++ linux-2.6.0-test11-mjb2-pagerepl/mm/swap.c 2003-12-11 16:15:15.000000000 -0800
@@ -357,6 +357,12 @@ unsigned int pagevec_lookup(struct pagev
return pagevec_count(pvec);
}
+unsigned int pagevec_lookup_replicas(struct pagevec *pvec, struct address_space *mapping, unsigned int nr_pages)
+{
+ pvec->nr = find_get_replica_pages(mapping, 0, nr_pages, pvec->pages);
+ return pagevec_count(pvec);
+}
+
#ifdef CONFIG_SMP
/*
diff -urp linux-2.6.0-test11-mjb2-clean/mm/swap_state.c linux-2.6.0-test11-mjb2-pagerepl/mm/swap_state.c
--- linux-2.6.0-test11-mjb2-clean/mm/swap_state.c 2003-12-11 16:04:23.000000000 -0800
+++ linux-2.6.0-test11-mjb2-pagerepl/mm/swap_state.c 2003-12-11 16:15:15.000000000 -0800
@@ -38,6 +38,9 @@ struct address_space swapper_space = {
.truncate_count = ATOMIC_INIT(0),
.private_lock = SPIN_LOCK_UNLOCKED,
.private_list = LIST_HEAD_INIT(swapper_space.private_list),
+#ifdef CONFIG_MAPPING_REPLICATE
+ .replicate = ATOMIC_INIT(0),
+#endif
};
#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
@@ -198,7 +201,7 @@ int move_to_swap_cache(struct page *page
spin_lock(&swapper_space.page_lock);
spin_lock(&mapping->page_lock);
- err = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
+ err = __insert_into_page_cache(page, &swapper_space, entry.val);
if (!err) {
__remove_from_page_cache(page);
___add_to_page_cache(page, &swapper_space, entry.val);
@@ -234,7 +237,7 @@ int move_from_swap_cache(struct page *pa
spin_lock(&swapper_space.page_lock);
spin_lock(&mapping->page_lock);
- err = radix_tree_insert(&mapping->page_tree, index, page);
+ err = __insert_into_page_cache(page, mapping, index);
if (!err) {
__delete_from_swap_cache(page);
___add_to_page_cache(page, mapping, index);
diff -urp linux-2.6.0-test11-mjb2-clean/mm/truncate.c linux-2.6.0-test11-mjb2-pagerepl/mm/truncate.c
--- linux-2.6.0-test11-mjb2-clean/mm/truncate.c 2003-12-11 16:04:23.000000000 -0800
+++ linux-2.6.0-test11-mjb2-pagerepl/mm/truncate.c 2003-12-11 16:15:15.000000000 -0800
@@ -178,6 +178,33 @@ void truncate_inode_pages(struct address
EXPORT_SYMBOL(truncate_inode_pages);
+
+/**
+ * drop_replica_pages - remove all replicated pages from a mapping
+ * @mapping: mapping to remove replication from
+ *
+ * Called under (and serialised by) inode->i_sem.
+ */
+void drop_replica_pages(struct address_space *mapping)
+{
+ struct pagevec pvec;
+ int num;
+ int i;
+
+ pagevec_init(&pvec, 0);
+ while ((num = pagevec_lookup_replicas(&pvec, mapping, PAGEVEC_SIZE))) {
+ for (i=0; i<num; i++) {
+ struct page *page = pvec.pages[i];
+
+ lock_page(page);
+ wait_on_page_writeback(page);
+ truncate_complete_page(mapping, page);
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ }
+}
+
/**
* invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
* @mapping: the address_space which holds the pages to invalidate