From: Nick Piggin <nickpiggin@yahoo.com.au>
To: Andrew Morton <akpm@osdl.org>, Linux Memory Management <linux-mm@kvack.org>
Subject: [RFC][PATCH 2/2] perzone slab LRUs
Date: Wed, 28 Jul 2004 21:13:01 +1000
Oops, forgot to CC linux-mm.
Nick Piggin wrote:
> This patch is only intended for comments.
>
> This implements (crappy?) infrastructure for per-zone slab LRUs for
> reclaimable slabs, and moves dcache.c over to use that.
>
> The global unused list is retained to reduce intrusiveness, and a
> per-zone LRU list is added (these are still protected by the global
> dcache lock). This is an attempt to make slab scanning more robust on
> highmem and NUMA systems.
>
> One concern is that off-zone dentries might be pinning inodes in the zone
> we're trying to free memory for. I wonder if this can be solved?
>
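[Editor's note: to make the idea concrete before the patch itself, here is a
minimal user-space toy model of what the patch does -- each zone keeps its own
LRU of reclaimable objects plus a count, objects go on the LRU of the zone
that backs their memory, and reclaim only walks the LRU of the zone under
pressure. This is an illustrative sketch only; the struct and function names
here are invented and do not match the kernel code below.]

/*
 * Toy user-space model of per-zone slab LRUs.  Illustrative only;
 * not the kernel implementation in the patch below.
 */
#include <stdio.h>
#include <stdlib.h>

struct object {
	struct object *prev, *next;	/* links on its zone's LRU */
	int zone_id;			/* which zone backs this object */
};

struct zone_lru {
	struct object head;		/* circular list sentinel */
	unsigned long nr;		/* objects on this zone's LRU */
};

static void lru_init(struct zone_lru *z)
{
	z->head.prev = z->head.next = &z->head;
	z->nr = 0;
}

/* Add an object to the LRU of the zone its memory came from. */
static void lru_add(struct zone_lru *zones, struct object *obj)
{
	struct zone_lru *z = &zones[obj->zone_id];

	obj->next = z->head.next;
	obj->prev = &z->head;
	z->head.next->prev = obj;
	z->head.next = obj;
	z->nr++;
}

/* Reclaim up to 'count' objects, but only from the zone under pressure. */
static unsigned long lru_shrink(struct zone_lru *z, unsigned long count)
{
	unsigned long freed = 0;

	while (count-- && z->head.prev != &z->head) {
		struct object *victim = z->head.prev;	/* oldest first */

		victim->prev->next = &z->head;
		z->head.prev = victim->prev;
		z->nr--;
		free(victim);
		freed++;
	}
	return freed;
}

int main(void)
{
	struct zone_lru zones[2];
	int i;

	lru_init(&zones[0]);
	lru_init(&zones[1]);

	for (i = 0; i < 8; i++) {
		struct object *obj = malloc(sizeof(*obj));
		obj->zone_id = i & 1;	/* pretend the allocator alternates zones */
		lru_add(zones, obj);
	}

	/* Memory pressure in zone 0 never touches zone 1's LRU. */
	printf("freed %lu from zone 0, zone 1 still holds %lu\n",
	       lru_shrink(&zones[0], 3), zones[1].nr);
	return 0;
}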
---
linux-2.6-npiggin/fs/dcache.c | 120 ++++++++++++++++++++------
linux-2.6-npiggin/include/linux/dcache.h | 1
linux-2.6-npiggin/include/linux/mm.h | 19 ++++
linux-2.6-npiggin/include/linux/mmzone.h | 4
linux-2.6-npiggin/mm/page_alloc.c | 1
linux-2.6-npiggin/mm/vmscan.c | 139 ++++++++++++++++++++++++-------
6 files changed, 227 insertions(+), 57 deletions(-)
diff -puN fs/dcache.c~perzone-slab fs/dcache.c
--- linux-2.6/fs/dcache.c~perzone-slab 2004-07-28 20:54:53.000000000 +1000
+++ linux-2.6-npiggin/fs/dcache.c 2004-07-28 20:54:53.000000000 +1000
@@ -60,6 +60,7 @@ static unsigned int d_hash_mask;
static unsigned int d_hash_shift;
static struct hlist_head *dentry_hashtable;
static LIST_HEAD(dentry_unused);
+static int zone_shrinker;
/* Statistics gathering. */
struct dentry_stat_t dentry_stat = {
@@ -86,6 +87,22 @@ static void d_free(struct dentry *dentry
call_rcu(&dentry->d_rcu, d_callback);
}
+static void dentry_add_lru(struct dentry *dentry)
+{
+ struct zone_shrinker *zs;
+ zs = get_zone_shrinker(page_zone(virt_to_page(dentry)), zone_shrinker);
+ list_add(&dentry->d_lru, &zs->lru);
+ zs->nr++;
+}
+
+static void dentry_del_lru(struct dentry *dentry)
+{
+ struct zone_shrinker *zs;
+ zs = get_zone_shrinker(page_zone(virt_to_page(dentry)), zone_shrinker);
+ list_del(&dentry->d_lru);
+ zs->nr--;
+}
+
/*
* Release the dentry's inode, using the filesystem
* d_iput() operation if defined.
@@ -153,7 +170,7 @@ repeat:
spin_unlock(&dcache_lock);
return;
}
-
+
/*
* AV: ->d_delete() is _NOT_ allowed to block now.
*/
@@ -164,9 +181,9 @@ repeat:
/* Unreachable? Get rid of it */
if (d_unhashed(dentry))
goto kill_it;
- if (list_empty(&dentry->d_lru)) {
- dentry->d_flags |= DCACHE_REFERENCED;
- list_add(&dentry->d_lru, &dentry_unused);
+ dentry->d_flags |= DCACHE_REFERENCED;
+ if (list_empty(&dentry->d_unused)) {
+ list_add(&dentry->d_unused, &dentry_unused);
dentry_stat.nr_unused++;
}
spin_unlock(&dentry->d_lock);
@@ -179,11 +196,12 @@ unhash_it:
kill_it: {
struct dentry *parent;
- /* If dentry was on d_lru list
+ /* If dentry was on d_unused list
* delete it from there
*/
- if (!list_empty(&dentry->d_lru)) {
- list_del(&dentry->d_lru);
+ dentry_del_lru(dentry);
+ if (!list_empty(&dentry->d_unused)) {
+ list_del(&dentry->d_unused);
dentry_stat.nr_unused--;
}
list_del(&dentry->d_child);
@@ -261,9 +279,9 @@ int d_invalidate(struct dentry * dentry)
static inline struct dentry * __dget_locked(struct dentry *dentry)
{
atomic_inc(&dentry->d_count);
- if (!list_empty(&dentry->d_lru)) {
+ if (!list_empty(&dentry->d_unused)) {
dentry_stat.nr_unused--;
- list_del_init(&dentry->d_lru);
+ list_del_init(&dentry->d_unused);
}
return dentry;
}
@@ -348,6 +366,7 @@ static inline void prune_one_dentry(stru
{
struct dentry * parent;
+ dentry_del_lru(dentry);
__d_drop(dentry);
list_del(&dentry->d_child);
dentry_stat.nr_dentry--; /* For d_free, below */
@@ -385,6 +404,37 @@ static void prune_dcache(int count)
list_del_init(tmp);
prefetch(dentry_unused.prev);
dentry_stat.nr_unused--;
+ dentry = list_entry(tmp, struct dentry, d_unused);
+
+ spin_lock(&dentry->d_lock);
+ /*
+ * We found an inuse dentry which was not removed from
+ * dentry_unused because of laziness during lookup. Do not free
+ * it - just keep it off the dentry_unused list.
+ */
+ if (atomic_read(&dentry->d_count)) {
+ spin_unlock(&dentry->d_lock);
+ continue;
+ }
+ if (dentry->d_flags & DCACHE_REFERENCED)
+ dentry->d_flags &= ~DCACHE_REFERENCED;
+ prune_one_dentry(dentry);
+ }
+ spin_unlock(&dcache_lock);
+}
+
+static unsigned long prune_dcache_lru(struct list_head *list, unsigned long count)
+{
+ unsigned long pruned = 0;
+ spin_lock(&dcache_lock);
+ for (; count ; count--) {
+ struct dentry *dentry;
+ struct list_head *tmp;
+
+ tmp = list->prev;
+ if (tmp == list)
+ break;
+ prefetch(tmp->prev);
dentry = list_entry(tmp, struct dentry, d_lru);
spin_lock(&dentry->d_lock);
@@ -394,22 +444,32 @@ static void prune_dcache(int count)
* it - just keep it off the dentry_unused list.
*/
if (atomic_read(&dentry->d_count)) {
+ if (!list_empty(&dentry->d_unused)) {
+ list_del_init(&dentry->d_unused);
+ dentry_stat.nr_unused--;
+ }
spin_unlock(&dentry->d_lock);
continue;
}
/* If the dentry was recently referenced, don't free it. */
if (dentry->d_flags & DCACHE_REFERENCED) {
dentry->d_flags &= ~DCACHE_REFERENCED;
- list_add(&dentry->d_lru, &dentry_unused);
- dentry_stat.nr_unused++;
+ list_del(&dentry->d_lru);
+ list_add(&dentry->d_lru, list);
spin_unlock(&dentry->d_lock);
continue;
}
+ list_del_init(&dentry->d_unused);
+ dentry_stat.nr_unused--;
prune_one_dentry(dentry);
+ pruned++;
}
spin_unlock(&dcache_lock);
+
+ return pruned;
}
+
/*
* Shrink the dcache for the specified super block.
* This allows us to unmount a device without disturbing
@@ -446,7 +506,7 @@ void shrink_dcache_sb(struct super_block
while (next != &dentry_unused) {
tmp = next;
next = tmp->next;
- dentry = list_entry(tmp, struct dentry, d_lru);
+ dentry = list_entry(tmp, struct dentry, d_unused);
if (dentry->d_sb != sb)
continue;
list_del(tmp);
@@ -461,7 +521,7 @@ repeat:
while (next != &dentry_unused) {
tmp = next;
next = tmp->next;
- dentry = list_entry(tmp, struct dentry, d_lru);
+ dentry = list_entry(tmp, struct dentry, d_unused);
if (dentry->d_sb != sb)
continue;
dentry_stat.nr_unused--;
@@ -551,16 +611,16 @@ resume:
struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
next = tmp->next;
- if (!list_empty(&dentry->d_lru)) {
+ if (!list_empty(&dentry->d_unused)) {
dentry_stat.nr_unused--;
- list_del_init(&dentry->d_lru);
+ list_del_init(&dentry->d_unused);
}
/*
* move only zero ref count dentries to the end
* of the unused list for prune_dcache
*/
if (!atomic_read(&dentry->d_count)) {
- list_add(&dentry->d_lru, dentry_unused.prev);
+ list_add(&dentry->d_unused, dentry_unused.prev);
dentry_stat.nr_unused++;
found++;
}
@@ -626,9 +686,9 @@ void shrink_dcache_anon(struct hlist_hea
spin_lock(&dcache_lock);
hlist_for_each(lp, head) {
struct dentry *this = hlist_entry(lp, struct dentry, d_hash);
- if (!list_empty(&this->d_lru)) {
+ if (!list_empty(&this->d_unused)) {
dentry_stat.nr_unused--;
- list_del(&this->d_lru);
+ list_del(&this->d_unused);
}
/*
@@ -636,7 +696,7 @@ void shrink_dcache_anon(struct hlist_hea
* of the unused list for prune_dcache
*/
if (!atomic_read(&this->d_count)) {
- list_add_tail(&this->d_lru, &dentry_unused);
+ list_add_tail(&this->d_unused, &dentry_unused);
dentry_stat.nr_unused++;
found++;
}
@@ -658,14 +718,16 @@ void shrink_dcache_anon(struct hlist_hea
*
* In this case we return -1 to tell the caller that we baled.
*/
-static int shrink_dcache_memory(int nr, unsigned int gfp_mask)
+static unsigned long shrink_dcache_memory(struct zone_shrinker *zs,
+ unsigned long nr,
+ unsigned int gfp_mask)
{
if (nr) {
if (!(gfp_mask & __GFP_FS))
return -1;
- prune_dcache(nr);
+ zs->nr -= prune_dcache_lru(&zs->lru, nr);
}
- return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+ return (zs->nr / 100) * sysctl_vfs_cache_pressure;
}
/**
@@ -695,7 +757,7 @@ struct dentry *d_alloc(struct dentry * p
}
} else {
dname = dentry->d_iname;
- }
+ }
dentry->d_name.name = dname;
dentry->d_name.len = name->len;
@@ -716,6 +778,7 @@ struct dentry *d_alloc(struct dentry * p
dentry->d_bucket = NULL;
INIT_HLIST_NODE(&dentry->d_hash);
INIT_LIST_HEAD(&dentry->d_lru);
+ INIT_LIST_HEAD(&dentry->d_unused);
INIT_LIST_HEAD(&dentry->d_subdirs);
INIT_LIST_HEAD(&dentry->d_alias);
@@ -727,6 +790,7 @@ struct dentry *d_alloc(struct dentry * p
}
spin_lock(&dcache_lock);
+ dentry_add_lru(dentry);
if (parent)
list_add(&dentry->d_child, &parent->d_subdirs);
dentry_stat.nr_dentry++;
@@ -831,7 +895,7 @@ struct dentry * d_alloc_anon(struct inod
return NULL;
tmp->d_parent = tmp; /* make sure dput doesn't croak */
-
+
spin_lock(&dcache_lock);
if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) {
/* A directory can only have one dentry.
@@ -969,7 +1033,7 @@ struct dentry * __d_lookup(struct dentry
struct hlist_node *node;
rcu_read_lock();
-
+
hlist_for_each (node, head) {
struct dentry *dentry;
struct qstr *qstr;
@@ -1592,8 +1656,10 @@ static void __init dcache_init(unsigned
0,
SLAB_RECLAIM_ACCOUNT|SLAB_PANIC,
NULL, NULL);
-
- set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory);
+
+ zone_shrinker = set_zone_shrinker(shrink_dcache_memory, DEFAULT_SEEKS);
+ if (zone_shrinker < 0)
+ BUG();
}
/* SLAB cache for __getname() consumers */
diff -puN include/linux/mmzone.h~perzone-slab include/linux/mmzone.h
--- linux-2.6/include/linux/mmzone.h~perzone-slab 2004-07-28 20:54:53.000000000 +1000
+++ linux-2.6-npiggin/include/linux/mmzone.h 2004-07-28 20:54:53.000000000 +1000
@@ -142,7 +142,7 @@ struct zone {
ZONE_PADDING(_pad1_)
- spinlock_t lru_lock;
+ spinlock_t lru_lock;
struct list_head active_list;
struct list_head inactive_list;
unsigned long nr_scan_active;
@@ -152,6 +152,8 @@ struct zone {
int all_unreclaimable; /* All pages pinned */
unsigned long pages_scanned; /* since last reclaim */
+ struct list_head zone_shrinker_list;
+
ZONE_PADDING(_pad2_)
/*
diff -puN mm/page_alloc.c~perzone-slab mm/page_alloc.c
--- linux-2.6/mm/page_alloc.c~perzone-slab 2004-07-28 20:54:53.000000000 +1000
+++ linux-2.6-npiggin/mm/page_alloc.c 2004-07-28 20:54:53.000000000 +1000
@@ -1495,6 +1495,7 @@ static void __init free_area_init_core(s
zone->nr_scan_inactive = 0;
zone->nr_active = 0;
zone->nr_inactive = 0;
+ INIT_LIST_HEAD(&zone->zone_shrinker_list);
if (!size)
continue;
diff -puN include/linux/mm.h~perzone-slab include/linux/mm.h
--- linux-2.6/include/linux/mm.h~perzone-slab 2004-07-28 20:54:53.000000000 +1000
+++ linux-2.6-npiggin/include/linux/mm.h 2004-07-28 20:54:53.000000000 +1000
@@ -575,6 +575,25 @@ struct shrinker;
extern struct shrinker *set_shrinker(int, shrinker_t);
extern void remove_shrinker(struct shrinker *shrinker);
+struct zone_shrinker;
+typedef unsigned long (*zone_shrinker_fn)(struct zone_shrinker *zs,
+ unsigned long nr_to_scan,
+ unsigned int gfp_mask);
+struct zone_shrinker {
+ struct list_head lru;
+ unsigned long nr;
+ zone_shrinker_fn shrinker;
+ unsigned long nr_scan;
+ int seeks;
+
+ int idx;
+ struct list_head list;
+};
+
+int set_zone_shrinker(zone_shrinker_fn, int);
+struct zone_shrinker *get_zone_shrinker(struct zone *, int);
+void remove_zone_shrinker(int);
+
/*
* On a two-level page table, this ends up being trivial. Thus the
* inlining and the symmetry break with pte_alloc_map() that does all
diff -puN mm/vmscan.c~perzone-slab mm/vmscan.c
--- linux-2.6/mm/vmscan.c~perzone-slab 2004-07-28 20:54:53.000000000 +1000
+++ linux-2.6-npiggin/mm/vmscan.c 2004-07-28 20:59:02.000000000 +1000
@@ -130,16 +130,16 @@ static DECLARE_RWSEM(shrinker_rwsem);
*/
struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
{
- struct shrinker *shrinker;
+ struct shrinker *shrinker;
- shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
- if (shrinker) {
- shrinker->shrinker = theshrinker;
- shrinker->seeks = seeks;
- shrinker->nr = 0;
- down_write(&shrinker_rwsem);
- list_add(&shrinker->list, &shrinker_list);
- up_write(&shrinker_rwsem);
+ shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
+ if (shrinker) {
+ shrinker->shrinker = theshrinker;
+ shrinker->seeks = seeks;
+ shrinker->nr = 0;
+ down_write(&shrinker_rwsem);
+ list_add(&shrinker->list, &shrinker_list);
+ up_write(&shrinker_rwsem);
}
return shrinker;
}
@@ -157,6 +157,81 @@ void remove_shrinker(struct shrinker *sh
}
EXPORT_SYMBOL(remove_shrinker);
+static unsigned int zone_shrinker_idx;
+
+/*
+ * Add a shrinker callback to be called from the vm
+ */
+int set_zone_shrinker(zone_shrinker_fn fn, int seeks)
+{
+ int idx;
+ struct zone_shrinker *zs;
+ struct zone *zone;
+
+ down_write(&shrinker_rwsem);
+ idx = zone_shrinker_idx++;
+
+ for_each_zone(zone) {
+ zs = kmalloc(sizeof(*zs), GFP_KERNEL);
+ if (!zs) {
+ up_write(&shrinker_rwsem);
+ remove_zone_shrinker(idx);
+ return -ENOMEM;
+ }
+ INIT_LIST_HEAD(&zs->lru);
+ zs->shrinker = fn;
+ zs->seeks = seeks;
+ zs->nr = 0;
+ zs->idx = idx;
+ spin_lock_irq(&zone->lru_lock);
+ list_add(&zs->list, &zone->zone_shrinker_list);
+ spin_unlock_irq(&zone->lru_lock);
+ }
+ up_write(&shrinker_rwsem);
+ return idx;
+}
+EXPORT_SYMBOL(set_zone_shrinker);
+
+struct zone_shrinker *get_zone_shrinker(struct zone *zone, int idx)
+{
+ struct zone_shrinker *zs;
+ struct zone_shrinker *ret = NULL;
+
+ spin_lock_irq(&zone->lru_lock);
+ list_for_each_entry(zs, &zone->zone_shrinker_list, list) {
+ if (zs->idx == idx) {
+ ret = zs;
+ break;
+ }
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ return ret;
+}
+EXPORT_SYMBOL(get_zone_shrinker);
+
+/*
+ * Remove one
+ */
+void remove_zone_shrinker(int idx)
+{
+ struct zone *zone;
+
+ down_write(&shrinker_rwsem);
+ for_each_zone(zone) {
+ struct zone_shrinker *zs;
+ list_for_each_entry(zs, &zone->zone_shrinker_list, list) {
+ if (zs->idx == idx) {
+ spin_lock_irq(&zone->lru_lock);
+ list_del(&zs->list);
+ spin_unlock_irq(&zone->lru_lock);
+ kfree(zs);
+ }
+ }
+ }
+ up_write(&shrinker_rwsem);
+}
+EXPORT_SYMBOL(remove_zone_shrinker);
+
#define SHRINK_BATCH 128
/*
* Call the shrink functions to age shrinkable caches
@@ -171,8 +246,9 @@ EXPORT_SYMBOL(remove_shrinker);
*
* We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
*/
-static int shrink_slab(unsigned long scanned, unsigned int gfp_mask)
+static int shrink_slab(struct zone *zone, unsigned long scanned, unsigned int gfp_mask)
{
+ struct zone_shrinker *zs;
struct shrinker *shrinker;
long pages;
@@ -182,26 +258,25 @@ static int shrink_slab(unsigned long sca
if (!down_read_trylock(&shrinker_rwsem))
return 0;
- pages = nr_used_zone_pages();
- list_for_each_entry(shrinker, &shrinker_list, list) {
+ list_for_each_entry(zs, &zone->zone_shrinker_list, list) {
unsigned long long delta;
unsigned long total_scan;
- delta = (4 * scanned) / shrinker->seeks;
- delta *= (*shrinker->shrinker)(0, gfp_mask);
- do_div(delta, pages + 1);
- shrinker->nr += delta;
- if (shrinker->nr < 0)
- shrinker->nr = LONG_MAX; /* It wrapped! */
+ delta = (4 * scanned) / zs->seeks;
+ delta *= (*zs->shrinker)(zs, 0, gfp_mask);
+ do_div(delta, zone->nr_inactive + zone->nr_active + 1);
+ zs->nr_scan += delta;
+ if (zs->nr_scan < 0)
+ zs->nr_scan = LONG_MAX; /* It wrapped! */
- total_scan = shrinker->nr;
- shrinker->nr = 0;
+ total_scan = zs->nr_scan;
+ zs->nr_scan = 0;
while (total_scan >= SHRINK_BATCH) {
long this_scan = SHRINK_BATCH;
int shrink_ret;
- shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
+ shrink_ret = (*zs->shrinker)(zs, this_scan, gfp_mask);
if (shrink_ret == -1)
break;
mod_page_state(slabs_scanned, this_scan);
@@ -210,8 +285,9 @@ static int shrink_slab(unsigned long sca
cond_resched();
}
- shrinker->nr += total_scan;
+ zs->nr_scan += total_scan;
}
+
up_read(&shrinker_rwsem);
return 0;
}
@@ -866,6 +942,8 @@ shrink_zone(struct zone *zone, struct sc
static void
shrink_caches(struct zone **zones, struct scan_control *sc)
{
+ struct reclaim_state *reclaim_state = current->reclaim_state;
+ unsigned long total_scanned = 0;
int i;
for (i = 0; zones[i] != NULL; i++) {
@@ -878,8 +956,17 @@ shrink_caches(struct zone **zones, struc
if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
+ sc->nr_scanned = 0;
shrink_zone(zone, sc);
+ total_scanned += sc->nr_scanned;
+ shrink_slab(zone, sc->nr_scanned, sc->gfp_mask);
+ if (reclaim_state) {
+ sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+ reclaim_state->reclaimed_slab = 0;
+ }
}
+
+ sc->nr_scanned = total_scanned;
}
/*
@@ -901,7 +988,6 @@ int try_to_free_pages(struct zone **zone
int priority;
int ret = 0;
int total_scanned = 0, total_reclaimed = 0;
- struct reclaim_state *reclaim_state = current->reclaim_state;
struct scan_control sc;
int i;
@@ -919,11 +1005,6 @@ int try_to_free_pages(struct zone **zone
sc.nr_reclaimed = 0;
sc.priority = priority;
shrink_caches(zones, &sc);
- shrink_slab(sc.nr_scanned, gfp_mask);
- if (reclaim_state) {
- sc.nr_reclaimed += reclaim_state->reclaimed_slab;
- reclaim_state->reclaimed_slab = 0;
- }
if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) {
ret = 1;
goto out;
@@ -1055,7 +1136,7 @@ scan:
sc.priority = priority;
shrink_zone(zone, &sc);
reclaim_state->reclaimed_slab = 0;
- shrink_slab(sc.nr_scanned, GFP_KERNEL);
+ shrink_slab(zone, sc.nr_scanned, GFP_KERNEL);
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_reclaimed += sc.nr_reclaimed;
if (zone->all_unreclaimable)
diff -puN include/linux/dcache.h~perzone-slab include/linux/dcache.h
--- linux-2.6/include/linux/dcache.h~perzone-slab 2004-07-28 20:54:53.000000000 +1000
+++ linux-2.6-npiggin/include/linux/dcache.h 2004-07-28 20:54:53.000000000 +1000
@@ -95,6 +95,7 @@ struct dentry {
struct qstr d_name;
struct list_head d_lru; /* LRU list */
+ struct list_head d_unused; /* unused list */
struct list_head d_child; /* child of parent list */
struct list_head d_subdirs; /* our children */
struct list_head d_alias; /* inode alias list */
_
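[Editor's note: for reference, the per-zone pressure computed in the
shrink_slab() hunk above scans the cache in proportion to how hard the zone's
page LRUs were scanned. A rough worked example follows; all numbers are
invented for illustration and do not come from the patch.]

/* Rough worked example of the per-zone pressure calculation in
 * shrink_slab() above (all values made up for illustration). */
#include <stdio.h>

int main(void)
{
	unsigned long scanned = 1024;		/* sc->nr_scanned for this zone */
	int seeks = 2;				/* e.g. DEFAULT_SEEKS */
	unsigned long lru_objects = 50000;	/* what the callback reports for nr_to_scan == 0 */
	unsigned long zone_pages = 100000;	/* zone->nr_inactive + zone->nr_active */

	unsigned long long delta = (4ULL * scanned) / seeks;	/* 2048 */
	delta *= lru_objects;					/* 102,400,000 */
	delta /= zone_pages + 1;				/* ~1023 objects queued to scan */

	printf("objects queued for scanning: %llu\n", delta);
	return 0;
}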