LWN.net Logo

O(1) vm (rmap15) for 2.4.20-rc2-ac1

From:  Paul P Komkoff Jr <i@stingr.net>
To:  Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Subject:  [CFT][RFC][PATCH] O(1) vm (rmap15) for 2.4.20-rc2-ac1
Date:  Wed, 20 Nov 2002 17:23:27 +0300

Here it is. It compiles and boots, and has been through a couple of LTP runs
with "dd if=/dev/zero of=zzz bs=256M count=1" looping in the background.

# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
#	           ChangeSet	v2.4.20-rc2-ac1 -> 1.793  
#	include/linux/mmzone.h	1.10    -> 1.11   
#	include/linux/list.h	1.9     -> 1.10   
#	include/linux/pagemap.h	1.19    -> 1.20   
#	      kernel/ksyms.c	1.65    -> 1.66   
#	include/linux/swap.h	1.36    -> 1.37   
#	  include/linux/mm.h	1.40    -> 1.41   
#	     mm/page_alloc.c	1.57    -> 1.58   
#	           mm/rmap.c	1.1     -> 1.2    
#	         mm/vmscan.c	1.63    -> 1.64   
#	 fs/proc/proc_misc.c	1.19    -> 1.20   
#	        mm/filemap.c	1.73    -> 1.74   
#	include/linux/brlock.h	1.4     -> 1.5    
#	include/linux/mm_inline.h	1.1     -> 1.2    
#	           mm/swap.c	1.18    -> 1.19   
#	          mm/shmem.c	1.46    -> 1.47   
#	     mm/swap_state.c	1.19    -> 1.20   
#	         fs/buffer.c	1.76    -> 1.77   
#	include/linux/module.h	1.12    -> 1.13   
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 02/11/20	stingray@boxster.stingr.net	1.792
# 2.4.20-rc2-ac1
# --------------------------------------------
# 02/11/20	stingray@proxy.sgu.ru	1.793
# rmap15
# --------------------------------------------
#
diff -Nru a/fs/buffer.c b/fs/buffer.c
--- a/fs/buffer.c	Wed Nov 20 17:19:24 2002
+++ b/fs/buffer.c	Wed Nov 20 17:19:24 2002
@@ -2915,6 +2915,30 @@
 	}
 }
 
+
+/*
+ * IO post-processing: refile buffers on the BUF_LOCKED list that are no longer locked.
+ */
+void do_io_postprocessing(void)
+{
+	int i;
+	struct buffer_head *bh, *next;
+
+	spin_lock(&lru_list_lock);
+	bh = lru_list[BUF_LOCKED];
+	if (bh) {
+		for (i = nr_buffers_type[BUF_LOCKED]; i-- > 0; bh = next) {
+			next = bh->b_next_free;	/* read before bh is possibly refiled */
+
+			if (!buffer_locked(bh)) 
+				__refile_buffer(bh);
+			else 
+				break;	/* stop at the first buffer that is still locked */
+		}
+	}
+	spin_unlock(&lru_list_lock);
+}
+
 /*
  * This is the kernel update daemon. It was used to live in userspace
  * but since it's need to run safely we want it unkillable by mistake.
@@ -2966,6 +2990,7 @@
 #ifdef DEBUG
 		printk(KERN_DEBUG "kupdate() activated...\n");
 #endif
+		do_io_postprocessing();
 		sync_old_buffers();
 		run_task_queue(&tq_disk);
 	}
diff -Nru a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
--- a/fs/proc/proc_misc.c	Wed Nov 20 17:19:24 2002
+++ b/fs/proc/proc_misc.c	Wed Nov 20 17:19:24 2002
@@ -192,6 +192,7 @@
 		"SwapCached:   %8lu kB\n"
 		"Active:       %8u kB\n"
 		"Inact_dirty:  %8u kB\n"
+		"Inact_laundry:%8u kB\n"
 		"Inact_clean:  %8u kB\n"
 		"Inact_target: %8u kB\n"
 		"HighTotal:    %8lu kB\n"
@@ -209,6 +210,7 @@
 		K(swapper_space.nrpages),
 		K(nr_active_pages),
 		K(nr_inactive_dirty_pages),
+		K(nr_inactive_laundry_pages),
 		K(nr_inactive_clean_pages),
 		K(inactive_target()),
 		K(i.totalhigh),
diff -Nru a/include/linux/brlock.h b/include/linux/brlock.h
--- a/include/linux/brlock.h	Wed Nov 20 17:19:24 2002
+++ b/include/linux/brlock.h	Wed Nov 20 17:19:24 2002
@@ -37,6 +37,8 @@
 	BR_GLOBALIRQ_LOCK,
 	BR_NETPROTO_LOCK,
 	BR_LLC_LOCK,
+	BR_LRU_LOCK,
+
 	__BR_END
 };
 
diff -Nru a/include/linux/list.h b/include/linux/list.h
--- a/include/linux/list.h	Wed Nov 20 17:19:24 2002
+++ b/include/linux/list.h	Wed Nov 20 17:19:24 2002
@@ -137,8 +137,7 @@
 	return head->next == head;
 }
 
-static inline void __list_splice(struct list_head *list,
-				 struct list_head *head)
+static inline void __list_splice(struct list_head *list, struct list_head *head)
 {
 	struct list_head *first = list->next;
 	struct list_head *last = list->prev;
diff -Nru a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h	Wed Nov 20 17:19:24 2002
+++ b/include/linux/mm.h	Wed Nov 20 17:19:24 2002
@@ -1,5 +1,23 @@
 #ifndef _LINUX_MM_H
 #define _LINUX_MM_H
+/*
+ * Copyright (c) 2002. All rights reserved.
+ *
+ * This software may be freely redistributed under the terms of the
+ * GNU General Public License.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Authors: 
+ *	Linus Torvalds
+ *	Stephen Tweedie
+ *	Andrea Arcangeli
+ *	Rik van Riel
+ *	Arjan van de Ven
+ *	and others
+ */
 
 #include <linux/sched.h>
 #include <linux/errno.h>
@@ -168,7 +186,7 @@
 	unsigned long flags;		/* atomic flags, some possibly
 					   updated asynchronously */
 	struct list_head lru;		/* Pageout list, eg. active_list;
-					   protected by pagemap_lru_lock !! */
+					   protected by the lru lock !! */
 	unsigned char age;		/* Page aging counter. */
 	struct pte_chain * pte_chain;	/* Reverse pte mapping pointer.
 					 * protected by PG_chainlock
@@ -279,7 +297,7 @@
  *
  * Note that the referenced bit, the page->lru list_head and the
  * active, inactive_dirty and inactive_clean lists are protected by
- * the pagemap_lru_lock, and *NOT* by the usual PG_locked bit!
+ * the lru lock, and *NOT* by the usual PG_locked bit!
  *
  * PG_skip is used on sparc/sparc64 architectures to "skip" certain
  * parts of the address space.
@@ -300,18 +318,19 @@
 #define PG_referenced		 2
 #define PG_uptodate		 3
 #define PG_dirty		 4
-#define PG_inactive_clean	 5
-#define PG_active		 6
+#define PG_active		 5
 #define PG_inactive_dirty	 7
-#define PG_slab			 8
-#define PG_skip			10
-#define PG_highmem		11
-#define PG_checked		12	/* kill me in 2.5.<early>. */
-#define PG_arch_1		13
-#define PG_reserved		14
-#define PG_launder		15	/* written out by VM pressure.. */
-#define PG_chainlock		16	/* lock bit for ->pte_chain */
-#define PG_lru			17
+#define PG_inactive_laundry	 8
+#define PG_inactive_clean	 9
+#define PG_slab			10
+#define PG_skip			11
+#define PG_highmem		12
+#define PG_checked		13	/* kill me in 2.5.<early>. */
+#define PG_arch_1		14
+#define PG_reserved		15
+#define PG_launder		16	/* written out by VM pressure.. */
+#define PG_chainlock		17	/* lock bit for ->pte_chain */
+#define PG_lru			18
 /* Don't you dare to use high bits, they seem to be used for something else! */
 
 
@@ -429,11 +448,16 @@
 #define PageClearSlab(page)	clear_bit(PG_slab, &(page)->flags)
 #define PageReserved(page)	test_bit(PG_reserved, &(page)->flags)
 
-#define PageActive(page)	test_bit(PG_active, &(page)->flags)
+#define PageActive(page)		test_bit(PG_active, &(page)->flags)
 #define SetPageActive(page)	set_bit(PG_active, &(page)->flags)
 #define ClearPageActive(page)	clear_bit(PG_active, &(page)->flags)
 #define TestandSetPageActive(page)	test_and_set_bit(PG_active, &(page)->flags)
 #define TestandClearPageActive(page)	test_and_clear_bit(PG_active, &(page)->flags)
+
+
+#define PageInactiveLaundry(page)	test_bit(PG_inactive_laundry, &(page)->flags)
+#define SetPageInactiveLaundry(page)	set_bit(PG_inactive_laundry, &(page)->flags)
+#define ClearPageInactiveLaundry(page)	clear_bit(PG_inactive_laundry, &(page)->flags)
 
 #define PageInactiveDirty(page)	test_bit(PG_inactive_dirty, &(page)->flags)
 #define SetPageInactiveDirty(page)	set_bit(PG_inactive_dirty, &(page)->flags)
diff -Nru a/include/linux/mm_inline.h b/include/linux/mm_inline.h
--- a/include/linux/mm_inline.h	Wed Nov 20 17:19:24 2002
+++ b/include/linux/mm_inline.h	Wed Nov 20 17:19:24 2002
@@ -2,21 +2,77 @@
 #define _LINUX_MM_INLINE_H
 
 #include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/brlock.h>
+
+
+/*
+ * Copyright (c) 2002. All rights reserved.
+ *
+ * This software may be freely redistributed under the terms of the
+ * GNU General Public License.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Authors: 
+ *	Linus Torvalds
+ *	Stephen Tweedie
+ *	Andrea Arcangeli
+ *	Rik van Riel
+ *	Arjan van de Ven
+ *	and others
+ */
+
+GPL_HEADER()
+
+extern unsigned char active_age_bias;
 
 /*
  * These inline functions tend to need bits and pieces of all the
  * other VM include files, meaning they cannot be defined inside
  * one of the other VM include files.
+ * 
+ */
+ 
+/**
+ * page_dirty - do we need to write the data out to disk
+ * @page: page to test
  *
- * The include file mess really needs to be cleaned up...
+ * Returns true if the page contains data which needs to
+ * be written to disk.  Doesn't test the page tables (yet?).
  */
+static inline int page_dirty(struct page *page)
+{
+	struct buffer_head *tmp, *bh;
+
+	if (PageDirty(page))
+		return 1;
+
+	if (page->mapping && !page->buffers)
+		return 0;
+
+	tmp = bh = page->buffers;
+
+	do {
+		if (tmp->b_state & ((1<<BH_Dirty) | (1<<BH_Lock)))
+			return 1;
+		tmp = tmp->b_this_page;
+	} while (tmp != bh);
 
-static inline void add_page_to_active_list(struct page * page)
+	return 0;
+}
+
+
+
+static inline void add_page_to_active_list(struct page * page, int age)
 {
 	struct zone_struct * zone = page_zone(page);
 	DEBUG_LRU_PAGE(page);
 	SetPageActive(page);
-	list_add(&page->lru, &zone->active_list);
+	list_add(&page->lru, &zone->active_list[age]);
+	page->age = age + active_age_bias;
 	zone->active_pages++;
 	nr_active_pages++;
 }
@@ -31,6 +87,16 @@
 	nr_inactive_dirty_pages++;
 }
 
+static inline void add_page_to_inactive_laundry_list(struct page * page)
+{
+	struct zone_struct * zone = page_zone(page);
+	DEBUG_LRU_PAGE(page);
+	SetPageInactiveLaundry(page);
+	list_add(&page->lru, &zone->inactive_laundry_list);
+	zone->inactive_laundry_pages++;
+	nr_inactive_laundry_pages++;
+}
+
 static inline void add_page_to_inactive_clean_list(struct page * page)
 {
 	struct zone_struct * zone = page_zone(page);
@@ -44,10 +110,14 @@
 static inline void del_page_from_active_list(struct page * page)
 {
 	struct zone_struct * zone = page_zone(page);
+	unsigned char age;
 	list_del(&page->lru);
 	ClearPageActive(page);
 	nr_active_pages--;
 	zone->active_pages--;
+	age = page->age - active_age_bias;
+	if (age<=MAX_AGE)
+		zone->active_count[age]--;
 	DEBUG_LRU_PAGE(page);
 }
 
@@ -61,6 +131,16 @@
 	DEBUG_LRU_PAGE(page);
 }
 
+static inline void del_page_from_inactive_laundry_list(struct page * page)
+{
+	struct zone_struct * zone = page_zone(page);
+	list_del(&page->lru);
+	ClearPageInactiveLaundry(page);
+	nr_inactive_laundry_pages--;
+	zone->inactive_laundry_pages--;
+	DEBUG_LRU_PAGE(page);
+}
+
 static inline void del_page_from_inactive_clean_list(struct page * page)
 {
 	struct zone_struct * zone = page_zone(page);
@@ -184,7 +264,7 @@
 {
 	int inactive, target, inactive_base;
 
-	inactive_base = zone->active_pages + zone->inactive_dirty_pages;
+	inactive_base = zone->active_pages +  zone->inactive_dirty_pages;
 	inactive_base /= INACTIVE_FACTOR;
 
 	/* GCC should optimise this away completely. */
@@ -253,7 +333,12 @@
  */
 static inline int inactive_high(struct zone_struct * zone)
 {
-	return inactive_limit(zone, VM_HIGH);
+	unsigned long active, inactive;
+	active = zone->active_pages + zone->free_pages;
+	inactive = zone->inactive_dirty_pages + zone->inactive_clean_pages + zone->inactive_laundry_pages;
+	if (inactive * 5 >   (active+inactive))
+		return -1;
+	return 1;
 }
 
 /*
@@ -264,11 +349,31 @@
 	int target;
 
 	target = nr_active_pages + nr_inactive_dirty_pages
-			+ nr_inactive_clean_pages;
+			+ nr_inactive_clean_pages + nr_inactive_laundry_pages;
 
 	target /= INACTIVE_FACTOR;
 
 	return target;
+}
+
+static inline void lru_lock(struct zone_struct *zone)
+{
+	if (zone) {	/* zone given: BR_LRU_LOCK for reading, then that zone's lru_lock */
+		br_read_lock(BR_LRU_LOCK);
+		spin_lock(&zone->lru_lock);
+	} else {	/* no zone (NULL): BR_LRU_LOCK for writing */
+		br_write_lock(BR_LRU_LOCK);
+	}
+}
+
+static inline void lru_unlock(struct zone_struct *zone)
+{
+	if (zone) {	/* zone given: drop the zone's lru_lock, then the BR_LRU_LOCK read side */
+		spin_unlock(&zone->lru_lock);
+		br_read_unlock(BR_LRU_LOCK);
+	} else {	/* no zone (NULL): drop the BR_LRU_LOCK write side */
+		br_write_unlock(BR_LRU_LOCK);
+	}
 }
 
 #endif /* _LINUX_MM_INLINE_H */
diff -Nru a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h	Wed Nov 20 17:19:24 2002
+++ b/include/linux/mmzone.h	Wed Nov 20 17:19:24 2002
@@ -13,11 +13,7 @@
  * Free memory management - zoned buddy allocator.
  */
 
-#ifndef CONFIG_FORCE_MAX_ZONEORDER
 #define MAX_ORDER 10
-#else
-#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
-#endif
 
 typedef struct free_area_struct {
 	struct list_head	free_list;
@@ -29,6 +25,9 @@
 
 #define MAX_CHUNKS_PER_NODE 8
 
+#define MAX_AGE 15
+#define INITIAL_AGE 3
+
 #define MAX_PER_CPU_PAGES 512
 typedef struct per_cpu_pages_s {
 	int			nr_pages, max_nr_pages;
@@ -52,17 +51,22 @@
 	unsigned long		free_pages;
 	unsigned long		active_pages;
 	unsigned long		inactive_dirty_pages;
+	unsigned long		inactive_laundry_pages;
 	unsigned long		inactive_clean_pages;
 	unsigned long		pages_min, pages_low, pages_high, pages_plenty;
 	int			need_balance;
+	int			need_scan;
+	int			active_count[MAX_AGE+1];
 
 	/*
 	 * free areas of different sizes
 	 */
-	struct list_head	active_list;
+	struct list_head	active_list[MAX_AGE+1];
 	struct list_head	inactive_dirty_list;
+	struct list_head	inactive_laundry_list;
 	struct list_head	inactive_clean_list;
 	free_area_t		free_area[MAX_ORDER];
+	spinlock_t		lru_lock;
 
 	/*
 	 * wait_table           -- the array holding the hash table
diff -Nru a/include/linux/module.h b/include/linux/module.h
--- a/include/linux/module.h	Wed Nov 20 17:19:24 2002
+++ b/include/linux/module.h	Wed Nov 20 17:19:24 2002
@@ -287,6 +287,9 @@
 static const char __module_license[] __attribute__((section(".modinfo"))) =   \
 "license=" license
 
+#define GPL_HEADER() /* emits a file-local GPL notice string; the expansion already ends in ';' */ \
+static const char cpyright[]="This software may be freely redistributed under the terms of the GNU General Public License.";
+
 /* Define the module variable, and usage macros.  */
 extern struct module __this_module;
 
@@ -302,7 +305,6 @@
 static const char __module_using_checksums[] __attribute__((section(".modinfo"))) =
 "using_checksums=1";
 #endif
-
 #else /* MODULE */
 
 #define MODULE_AUTHOR(name)
@@ -311,6 +313,7 @@
 #define MODULE_SUPPORTED_DEVICE(name)
 #define MODULE_PARM(var,type)
 #define MODULE_PARM_DESC(var,desc)
+#define GPL_HEADER()
 
 /* Create a dummy reference to the table to suppress gcc unused warnings.  Put
  * the reference in the .data.exit section which is discarded when code is built
diff -Nru a/include/linux/pagemap.h b/include/linux/pagemap.h
--- a/include/linux/pagemap.h	Wed Nov 20 17:19:24 2002
+++ b/include/linux/pagemap.h	Wed Nov 20 17:19:24 2002
@@ -70,10 +70,6 @@
 
 #define page_hash(mapping,index) (page_hash_table+_page_hashfn(mapping,index))
 
-extern struct page * __find_get_page(struct address_space *mapping,
-				unsigned long index, struct page **hash);
-#define find_get_page(mapping, index) \
-	__find_get_page(mapping, index, page_hash(mapping, index))
 extern struct page * __find_lock_page (struct address_space * mapping,
 				unsigned long index, struct page **hash);
 extern struct page * find_or_create_page(struct address_space *mapping,
@@ -90,6 +86,13 @@
 extern int add_to_page_cache_unique(struct page * page, struct address_space *mapping, unsigned long index, struct page **hash);
 
 extern void ___wait_on_page(struct page *);
+extern int wait_on_page_timeout(struct page *page, int timeout);
+
+
+extern struct page * __find_pagecache_page(struct address_space *mapping,
+				unsigned long index, struct page **hash);
+#define find_pagecache_page(mapping, index) \
+	__find_pagecache_page(mapping, index, page_hash(mapping, index))
 
 static inline void wait_on_page(struct page * page)
 {
diff -Nru a/include/linux/swap.h b/include/linux/swap.h
--- a/include/linux/swap.h	Wed Nov 20 17:19:24 2002
+++ b/include/linux/swap.h	Wed Nov 20 17:19:24 2002
@@ -87,6 +87,7 @@
 extern unsigned int nr_free_buffer_pages(void);
 extern int nr_active_pages;
 extern int nr_inactive_dirty_pages;
+extern int nr_inactive_laundry_pages;
 extern int nr_inactive_clean_pages;
 extern atomic_t page_cache_size;
 extern atomic_t buffermem_pages;
@@ -115,6 +116,7 @@
 
 /* linux/mm/swap.c */
 extern void FASTCALL(lru_cache_add(struct page *));
+extern void FASTCALL(lru_cache_add_dirty(struct page *));
 extern void FASTCALL(__lru_cache_del(struct page *));
 extern void FASTCALL(lru_cache_del(struct page *));
 
@@ -175,8 +177,6 @@
 asmlinkage long sys_swapoff(const char *);
 asmlinkage long sys_swapon(const char *, int);
 
-extern spinlock_cacheline_t pagemap_lru_lock_cacheline;
-#define pagemap_lru_lock pagemap_lru_lock_cacheline.lock
 
 extern void FASTCALL(mark_page_accessed(struct page *));
 
@@ -191,7 +191,7 @@
 
 /*
  * List add/del helper macros. These must be called
- * with the pagemap_lru_lock held!
+ * with the lru lock held!
  */
 #define DEBUG_LRU_PAGE(page)			\
 do {						\
diff -Nru a/kernel/ksyms.c b/kernel/ksyms.c
--- a/kernel/ksyms.c	Wed Nov 20 17:19:24 2002
+++ b/kernel/ksyms.c	Wed Nov 20 17:19:24 2002
@@ -262,7 +262,6 @@
 EXPORT_SYMBOL(__pollwait);
 EXPORT_SYMBOL(poll_freewait);
 EXPORT_SYMBOL(ROOT_DEV);
-EXPORT_SYMBOL(__find_get_page);
 EXPORT_SYMBOL(__find_lock_page);
 EXPORT_SYMBOL(find_or_create_page);
 EXPORT_SYMBOL(grab_cache_page_nowait);
diff -Nru a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c	Wed Nov 20 17:19:24 2002
+++ b/mm/filemap.c	Wed Nov 20 17:19:24 2002
@@ -55,15 +55,14 @@
 
 spinlock_cacheline_t pagecache_lock_cacheline  = {SPIN_LOCK_UNLOCKED};
 /*
- * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock 
+ * NOTE: to avoid deadlocking you must never acquire the lru lock 
  *	with the pagecache_lock held.
  *
  * Ordering:
  *	swap_lock ->
- *		pagemap_lru_lock ->
+ *		   lru lock ->
  *			pagecache_lock
  */
-spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED};
 
 #define CLUSTER_PAGES		(1 << page_cluster)
 #define CLUSTER_OFFSET(x)	(((x) >> page_cluster) << page_cluster)
@@ -183,7 +182,7 @@
 
 	head = &inode->i_mapping->clean_pages;
 
-	spin_lock(&pagemap_lru_lock);
+	lru_lock(ALL_ZONES);
 	spin_lock(&pagecache_lock);
 	curr = head->next;
 
@@ -216,7 +215,7 @@
 	}
 
 	spin_unlock(&pagecache_lock);
-	spin_unlock(&pagemap_lru_lock);
+	lru_unlock(ALL_ZONES);
 }
 
 static int do_flushpage(struct page *page, unsigned long offset)
@@ -880,6 +879,32 @@
 		wake_up_all(waitqueue);
 }
 
+
+/* Like wait_on_page(), but gives up once @timeout jiffies have elapsed.
+ * Returns 0 if the page is unlocked on return, non-zero if it is still locked.
+ */
+int wait_on_page_timeout(struct page *page, int timeout)
+{
+	wait_queue_head_t *waitqueue = page_waitqueue(page);
+	struct task_struct *tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+	
+	if (!PageLocked(page))
+		return 0;
+
+	add_wait_queue(waitqueue, &wait);
+	do {
+		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+		if (!PageLocked(page))
+			break;
+		sync_page(page);
+		timeout = schedule_timeout(timeout);
+	} while (PageLocked(page) && timeout);
+	__set_task_state(tsk, TASK_RUNNING);
+	remove_wait_queue(waitqueue, &wait);
+	return PageLocked(page);
+}
+
 /*
  * Get a lock on the page, assuming we need to sleep
  * to get it..
@@ -914,26 +939,6 @@
 		__lock_page(page);
 }
 
-/*
- * a rather lightweight function, finding and getting a reference to a
- * hashed page atomically.
- */
-struct page * __find_get_page(struct address_space *mapping,
-			      unsigned long offset, struct page **hash)
-{
-	struct page *page;
-
-	/*
-	 * We scan the hash list read-only. Addition to and removal from
-	 * the hash-list needs a held write-lock.
-	 */
-	spin_lock(&pagecache_lock);
-	page = __find_page_nolock(mapping, offset, *hash);
-	if (page)
-		page_cache_get(page);
-	spin_unlock(&pagecache_lock);
-	return page;
-}
 
 /*
  * Same as above, but trylock it instead of incrementing the count.
@@ -1069,7 +1074,7 @@
 	 * been increased since the last time we were called, we
 	 * stop when the page isn't there.
 	 */
-	spin_lock(&pagemap_lru_lock);
+	lru_lock(ALL_ZONES);
 	while (--index >= start) {
 		struct page **hash = page_hash(mapping, index);
 		spin_lock(&pagecache_lock);
@@ -1079,9 +1084,31 @@
 			break;
 		drop_page(page);
 	}
-	spin_unlock(&pagemap_lru_lock);
+	lru_unlock(ALL_ZONES);
+}
+
+/*
+ * Look up a page in the pagecache and return that page with
+ * a reference held.
+ */
+struct page * __find_pagecache_page(struct address_space *mapping,
+			      unsigned long offset, struct page **hash)
+{
+	struct page *page;
+
+	/*
+	 * We scan the hash list read-only. Addition to and removal from
+	 * the hash-list needs a held write-lock.
+	 */
+	spin_lock(&pagecache_lock);
+	page = __find_page_nolock(mapping, offset, *hash);
+	if (page)
+		page_cache_get(page);
+	spin_unlock(&pagecache_lock);
+	return page;
 }
 
+
 /* Same as grab_cache_page, but do not wait if the page is unavailable.
  * This is intended for speculative data generators, where the data can
  * be regenerated if the page couldn't be grabbed.  This routine should
@@ -1092,7 +1119,7 @@
 	struct page *page, **hash;
 
 	hash = page_hash(mapping, index);
-	page = __find_get_page(mapping, index, hash);
+	page = __find_pagecache_page(mapping, index, hash);
 
 	if ( page ) {
 		if ( !TryLockPage(page) ) {
@@ -2022,7 +2049,7 @@
 	 */
 	hash = page_hash(mapping, pgoff);
 retry_find:
-	page = __find_get_page(mapping, pgoff, hash);
+	page = __find_pagecache_page(mapping, pgoff, hash);
 	if (!page)
 		goto no_cached_page;
 
@@ -2885,7 +2912,7 @@
 	struct page *page, *cached_page = NULL;
 	int err;
 repeat:
-	page = __find_get_page(mapping, index, hash);
+	page = __find_pagecache_page(mapping, index, hash);
 	if (!page) {
 		if (!cached_page) {
 			cached_page = page_cache_alloc(mapping);
diff -Nru a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c	Wed Nov 20 17:19:24 2002
+++ b/mm/page_alloc.c	Wed Nov 20 17:19:24 2002
@@ -28,6 +28,7 @@
 int nr_swap_pages;
 int nr_active_pages;
 int nr_inactive_dirty_pages;
+int nr_inactive_laundry_pages;
 int nr_inactive_clean_pages;
 pg_data_t *pgdat_list;
 
@@ -113,12 +114,13 @@
 		BUG();
 	if (PageInactiveDirty(page))
 		BUG();
+	if (PageInactiveLaundry(page))
+		BUG();
 	if (PageInactiveClean(page))
 		BUG();
 	if (page->pte_chain)
 		BUG();
 	page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
-	page->age = PAGE_AGE_START;
 	
 	zone = page_zone(page);
 
@@ -728,9 +730,10 @@
 		nr_free_pages() << (PAGE_SHIFT-10),
 		nr_free_highpages() << (PAGE_SHIFT-10));
 
-	printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d )\n",
+	printk("( Active: %d, inactive_dirty: %d, inactive_laundry: %d, inactive_clean: %d, free: %d )\n",
 		nr_active_pages,
 		nr_inactive_dirty_pages,
+		nr_inactive_laundry_pages,
 		nr_inactive_clean_pages,
 		nr_free_pages());
 
@@ -941,12 +944,20 @@
 		zone->lock = SPIN_LOCK_UNLOCKED;
 		zone->zone_pgdat = pgdat;
 		zone->free_pages = 0;
+		zone->active_pages = 0;
 		zone->inactive_clean_pages = 0;
+		zone->inactive_laundry_pages = 0;
 		zone->inactive_dirty_pages = 0;
 		zone->need_balance = 0;
-		INIT_LIST_HEAD(&zone->active_list);
+		zone->need_scan = 0;
+		for (k = 0; k <= MAX_AGE ; k++) {
+			INIT_LIST_HEAD(&zone->active_list[k]);
+			zone->active_count[k] = 0;
+		}
 		INIT_LIST_HEAD(&zone->inactive_dirty_list);
+		INIT_LIST_HEAD(&zone->inactive_laundry_list);
 		INIT_LIST_HEAD(&zone->inactive_clean_list);
+		spin_lock_init(&zone->lru_lock);
 
 		if (!size)
 			continue;
diff -Nru a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c	Wed Nov 20 17:19:24 2002
+++ b/mm/rmap.c	Wed Nov 20 17:19:24 2002
@@ -14,7 +14,7 @@
 /*
  * Locking:
  * - the page->pte_chain is protected by the PG_chainlock bit,
- *   which nests within the pagemap_lru_lock, then the
+ *   which nests within the lru lock, then the
  *   mm->page_table_lock, and then the page lock.
  * - because swapout locking is opposite to the locking order
  *   in the page fault path, the swapout path uses trylocks
@@ -195,7 +195,7 @@
  * table entry mapping a page. Because locking order here is opposite
  * to the locking order used by the page fault path, we use trylocks.
  * Locking:
- *	pagemap_lru_lock		page_launder()
+ *	   lru lock			page_launder()
  *	    page lock			page_launder(), trylock
  *		pte_chain_lock		page_launder()
  *		    mm->page_table_lock	try_to_unmap_one(), trylock
@@ -263,7 +263,7 @@
  * @page: the page to get unmapped
  *
  * Tries to remove all the page table entries which are mapping this
- * page, used in the pageout path.  Caller must hold pagemap_lru_lock
+ * page, used in the pageout path.  Caller must hold lru lock
  * and the page lock.  Return values are:
  *
  * SWAP_SUCCESS	- we succeeded in removing all mappings
diff -Nru a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c	Wed Nov 20 17:19:24 2002
+++ b/mm/shmem.c	Wed Nov 20 17:19:24 2002
@@ -581,7 +581,7 @@
 	 * cache and swap cache.  We need to recheck the page cache
 	 * under the protection of the info->lock spinlock. */
 
-	page = find_get_page(mapping, idx);
+	page = find_pagecache_page(mapping, idx);
 	if (page) {
 		if (TryLockPage(page))
 			goto wait_retry;
diff -Nru a/mm/swap.c b/mm/swap.c
--- a/mm/swap.c	Wed Nov 20 17:19:24 2002
+++ b/mm/swap.c	Wed Nov 20 17:19:24 2002
@@ -36,7 +36,6 @@
 /**
  * (de)activate_page - move pages from/to active and inactive lists
  * @page: the page we want to move
- * @nolock - are we already holding the pagemap_lru_lock?
  *
  * Deactivate_page will move an active page to the right
  * inactive list, while activate_page will move a page back
@@ -51,7 +50,6 @@
 	 * (some pages aren't on any list at all)
 	 */
 	ClearPageReferenced(page);
-	page->age = 0;
 	if (PageActive(page)) {
 		del_page_from_active_list(page);
 		add_page_to_inactive_dirty_list(page);
@@ -60,9 +58,9 @@
 
 void deactivate_page(struct page * page)
 {
-	spin_lock(&pagemap_lru_lock);
+	lru_lock(page_zone(page));
 	deactivate_page_nolock(page);
-	spin_unlock(&pagemap_lru_lock);
+	lru_unlock(page_zone(page));
 }
 
 /**
@@ -74,16 +72,16 @@
  * on the inactive_clean list it is placed on the inactive_dirty list
  * instead.
  *
- * Note: this function gets called with the pagemap_lru_lock held.
+ * Note: this function gets called with the lru lock held.
  */
-void drop_page(struct page * page)
+void drop_page_zone(struct zone_struct *zone, struct page * page)
 {
 	if (!TryLockPage(page)) {
 		if (page->mapping && page->buffers) {
 			page_cache_get(page);
-			spin_unlock(&pagemap_lru_lock);
+			lru_unlock(zone);
 			try_to_release_page(page, GFP_NOIO);
-			spin_lock(&pagemap_lru_lock);
+			lru_lock(zone);
 			page_cache_release(page);
 		}
 		UnlockPage(page);
@@ -97,12 +95,49 @@
 
 	else if (page_count(page) == 1) {
 		ClearPageReferenced(page);
-		page->age = 0;
 		if (PageActive(page)) {
 			del_page_from_active_list(page);
+			add_page_to_inactive_dirty_list(page);
+		} else if (PageInactiveDirty(page)) {
+			del_page_from_inactive_dirty_list(page);
+			add_page_to_inactive_laundry_list(page);
+		} else if (PageInactiveLaundry(page)) {
+			del_page_from_inactive_laundry_list(page);
 			add_page_to_inactive_clean_list(page);
+		}
+	}
+	pte_chain_unlock(page);
+}
+
+void drop_page(struct page * page)
+{
+	if (!TryLockPage(page)) {
+		if (page->mapping && page->buffers) {
+			page_cache_get(page);
+			lru_unlock(ALL_ZONES);
+			try_to_release_page(page, GFP_NOIO);
+			lru_lock(ALL_ZONES);
+			page_cache_release(page);
+		}
+		UnlockPage(page);
+	}
+
+	/* Make sure the page really is reclaimable. */
+	pte_chain_lock(page);
+	if (!page->mapping || PageDirty(page) || page->pte_chain ||
+			page->buffers || page_count(page) > 1)
+		deactivate_page_nolock(page);
+
+	else if (page_count(page) == 1) {
+		ClearPageReferenced(page);
+		if (PageActive(page)) {
+			del_page_from_active_list(page);
+			add_page_to_inactive_dirty_list(page);
 		} else if (PageInactiveDirty(page)) {
 			del_page_from_inactive_dirty_list(page);
+			add_page_to_inactive_laundry_list(page);
+		} else if (PageInactiveLaundry(page)) {
+			del_page_from_inactive_laundry_list(page);
 			add_page_to_inactive_clean_list(page);
 		}
 	}
@@ -116,21 +151,22 @@
 {
 	if (PageInactiveDirty(page)) {
 		del_page_from_inactive_dirty_list(page);
-		add_page_to_active_list(page);
+		add_page_to_active_list(page, INITIAL_AGE);
+	} else
+	if (PageInactiveLaundry(page)) {
+		del_page_from_inactive_laundry_list(page);
+		add_page_to_active_list(page, INITIAL_AGE);
 	} else if (PageInactiveClean(page)) {
 		del_page_from_inactive_clean_list(page);
-		add_page_to_active_list(page);
+		add_page_to_active_list(page, INITIAL_AGE);
 	}
-
-	/* Make sure the page gets a fair chance at staying active. */
-	page->age = max((int)page->age, PAGE_AGE_START);
 }
 
 void activate_page(struct page * page)
 {
-	spin_lock(&pagemap_lru_lock);
+	lru_lock(page_zone(page));
 	activate_page_nolock(page);
-	spin_unlock(&pagemap_lru_lock);
+	lru_unlock(page_zone(page));
 }
 
 /**
@@ -140,10 +176,10 @@
 void lru_cache_add(struct page * page)
 {
 	if (!PageLRU(page)) {
-		spin_lock(&pagemap_lru_lock);
+		lru_lock(page_zone(page));
 		SetPageLRU(page);
-		add_page_to_active_list(page);
-		spin_unlock(&pagemap_lru_lock);
+		add_page_to_active_list(page, INITIAL_AGE);
+		lru_unlock(page_zone(page));
 	}
 }
 
@@ -152,7 +188,7 @@
  * @page: the page to add
  *
  * This function is for when the caller already holds
- * the pagemap_lru_lock.
+ * the lru lock.
  */
 void __lru_cache_del(struct page * page)
 {
@@ -160,6 +196,8 @@
 		del_page_from_active_list(page);
 	} else if (PageInactiveDirty(page)) {
 		del_page_from_inactive_dirty_list(page);
+	} else if (PageInactiveLaundry(page)) {
+		del_page_from_inactive_laundry_list(page);
 	} else if (PageInactiveClean(page)) {
 		del_page_from_inactive_clean_list(page);
 	}
@@ -172,9 +210,9 @@
  */
 void lru_cache_del(struct page * page)
 {
-	spin_lock(&pagemap_lru_lock);
+	lru_lock(page_zone(page));
 	__lru_cache_del(page);
-	spin_unlock(&pagemap_lru_lock);
+	lru_unlock(page_zone(page));
 }
 
 /*
diff -Nru a/mm/swap_state.c b/mm/swap_state.c
--- a/mm/swap_state.c	Wed Nov 20 17:19:24 2002
+++ b/mm/swap_state.c	Wed Nov 20 17:19:24 2002
@@ -196,7 +196,7 @@
 {
 	struct page *found;
 
-	found = find_get_page(&swapper_space, entry.val);
+	found = find_pagecache_page(&swapper_space, entry.val);
 	/*
 	 * Unsafe to assert PageSwapCache and mapping on page found:
 	 * if SMP nothing prevents swapoff from deleting this page from
@@ -224,10 +224,10 @@
 		/*
 		 * First check the swap cache.  Since this is normally
 		 * called after lookup_swap_cache() failed, re-calling
-		 * that would confuse statistics: use find_get_page()
+		 * that would confuse statistics: use find_pagecache_page()
 		 * directly.
 		 */
-		found_page = find_get_page(&swapper_space, entry.val);
+		found_page = find_pagecache_page(&swapper_space, entry.val);
 		if (found_page)
 			break;
 
diff -Nru a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c	Wed Nov 20 17:19:24 2002
+++ b/mm/vmscan.c	Wed Nov 20 17:19:24 2002
@@ -12,6 +12,7 @@
  *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
  *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
  *  Multiqueue VM started 5.8.00, Rik van Riel.
+ *  O(1) rmap vm, Arjan van de Ven <arjanv@redhat.com>
  */
 
 #include <linux/slab.h>
@@ -37,16 +38,30 @@
  */
 #define DEF_PRIORITY (6)
 
-static inline void age_page_up(struct page *page)
+static inline void age_page_up_nolock(struct page *page, int old_age)
 {
-	page->age = min((int) (page->age + PAGE_AGE_ADV), PAGE_AGE_MAX); 
-}
+	int new_age;
+	
+	new_age = old_age+4;
+	if (new_age < 0)
+		new_age = 0;
+	if (new_age > MAX_AGE)
+		new_age = MAX_AGE;	
+		
+	if (PageActive(page)) {
+		del_page_from_active_list(page);
+	} else if (PageInactiveDirty(page)) {
+		del_page_from_inactive_dirty_list(page);
+	} else if (PageInactiveLaundry(page)) {
+		del_page_from_inactive_laundry_list(page);
+	} else if (PageInactiveClean(page)) {
+		del_page_from_inactive_clean_list(page);
+	} else return;
 
-static inline void age_page_down(struct page *page)
-{
-	page->age -= min(PAGE_AGE_DECL, (int)page->age);
+	add_page_to_active_list(page, new_age);	
 }
 
+
 /* Must be called with page's pte_chain_lock held. */
 static inline int page_mapping_inuse(struct page * page)
 {
@@ -84,9 +99,9 @@
 
 	/*
 	 * We need to hold the pagecache_lock around all tests to make sure
-	 * reclaim_page() cannot race with find_get_page() and friends.
+	 * reclaim_page() doesn't race with other pagecache users
 	 */
-	spin_lock(&pagemap_lru_lock);
+	lru_lock(zone);
 	spin_lock(&pagecache_lock);
 	maxscan = zone->inactive_clean_pages;
 	while (maxscan-- && !list_empty(&zone->inactive_clean_list)) {
@@ -94,12 +109,7 @@
 		page = list_entry(page_lru, struct page, lru);
 
 		/* Wrong page on list?! (list corruption, should not happen) */
-		if (unlikely(!PageInactiveClean(page))) {
-			printk("VM: reclaim_page, wrong page on list.\n");
-			list_del(page_lru);
-			page_zone(page)->inactive_clean_pages--;
-			continue;
-		}
+		BUG_ON(unlikely(!PageInactiveClean(page)));
 
 		/* Page is being freed */
 		if (unlikely(page_count(page)) == 0) {
@@ -144,7 +154,7 @@
 		UnlockPage(page);
 	}
 	spin_unlock(&pagecache_lock);
-	spin_unlock(&pagemap_lru_lock);
+	lru_unlock(zone);
 	return NULL;
 
 
@@ -152,11 +162,10 @@
 	__lru_cache_del(page);
 	pte_chain_unlock(page);
 	spin_unlock(&pagecache_lock);
-	spin_unlock(&pagemap_lru_lock);
+	lru_unlock(zone);
 	if (entry.val)
 		swap_free(entry);
 	UnlockPage(page);
-	page->age = PAGE_AGE_START;
 	if (page_count(page) != 1)
 		printk("VM: reclaim_page, found page with count %d!\n",
 				page_count(page));
@@ -164,338 +173,259 @@
 }
 
 /**
- * page_dirty - do we need to write the data out to disk
- * @page: page to test
+ * need_rebalance_dirty - do we need to write inactive stuff to disk?
+ * @zone: the zone in question
  *
- * Returns true if the page contains data which needs to
- * be written to disk.  Doesn't test the page tables (yet?).
+ * Returns true if the zone in question has an imbalance between inactive
+ * dirty on one side and inactive laundry + inactive clean on the other
+ * Right now set the balance at 50%; may need tuning later on
  */
-static inline int page_dirty(struct page *page)
+static inline int need_rebalance_dirty(zone_t * zone)
 {
-	struct buffer_head *tmp, *bh;
-
-	if (PageDirty(page))
+	if (zone->inactive_dirty_pages > zone->inactive_laundry_pages + zone->inactive_clean_pages)
 		return 1;
 
-	if (page->mapping && !page->buffers)
-		return 0;
-
-	tmp = bh = page->buffers;
-
-	do {
-		if (tmp->b_state & ((1<<BH_Dirty) | (1<<BH_Lock)))
-			return 1;
-		tmp = tmp->b_this_page;
-	} while (tmp != bh);
+	return 0;
+}
 
+/**
+ * need_rebalance_laundry - does the zone have too few inactive_clean pages?
+ * @zone: the zone in question
+ *
+ * Returns true if the zone in question has too few pages in inactive clean
+ * + free
+ */
+static inline int need_rebalance_laundry(zone_t * zone)
+{
+	if (free_low(zone) >= 0)
+		return 1;
 	return 0;
 }
 
 /**
- * page_launder_zone - clean dirty inactive pages, move to inactive_clean list
+ * launder_page - clean dirty page, move to inactive_laundry list
  * @zone: zone to free pages in
  * @gfp_mask: what operations we are allowed to do
- * @full_flush: full-out page flushing, if we couldn't get enough clean pages
+ * @page: the page at hand, must be on the inactive dirty list
  *
- * This function is called when we are low on free / inactive_clean
- * pages, its purpose is to refill the free/clean list as efficiently
- * as possible.
- *
- * This means we do writes asynchronously as long as possible and will
- * only sleep on IO when we don't have another option. Since writeouts
- * cause disk seeks and make read IO slower, we skip writes alltogether
- * when the amount of dirty pages is small.
- *
- * This code is heavily inspired by the FreeBSD source code. Thanks
- * go out to Matthew Dillon.
- */
-int page_launder_zone(zone_t * zone, int gfp_mask, int full_flush)
-{
-	int maxscan, cleaned_pages, target, maxlaunder, iopages, over_rsslimit;
-	struct list_head * entry, * next;
-
-	target = max_t(int, free_plenty(zone), zone->pages_min);
-	cleaned_pages = iopages = 0;
-
-	/* If we can get away with it, only flush 2 MB worth of dirty pages */
-	if (full_flush)
-		maxlaunder = 1000000;
-	else {
-		maxlaunder = min_t(int, 512, zone->inactive_dirty_pages / 4);
-		maxlaunder = max(maxlaunder, free_plenty(zone) * 4);
-	}
-	
-	/* The main launder loop. */
-	spin_lock(&pagemap_lru_lock);
-rescan:
-	maxscan = zone->inactive_dirty_pages;
-	entry = zone->inactive_dirty_list.prev;
-	next = entry->prev;
-	while (maxscan-- && !list_empty(&zone->inactive_dirty_list) &&
-			next != &zone->inactive_dirty_list) {
-		struct page * page;
-		
-		/* Low latency reschedule point */
-		if (current->need_resched) {
-			spin_unlock(&pagemap_lru_lock);
-			schedule();
-			spin_lock(&pagemap_lru_lock);
-			continue;
-		}
-
-		entry = next;
-		next = entry->prev;
-		page = list_entry(entry, struct page, lru);
-
-		/* This page was removed while we looked the other way. */
-		if (!PageInactiveDirty(page))
-			goto rescan;
+ * per-zone lru lock is assumed to be held, but this function can drop
+ * it and sleep, so no other locks are allowed to be held.
+ *
+ * returns 0 for failure; 1 for success
+ */
+int launder_page(zone_t * zone, int gfp_mask, struct page *page)
+{
+	int over_rsslimit;
 
-		if (cleaned_pages > target)
-			break;
+	/*
+	 * Page is being freed, don't worry about it, but report progress.
+	 */
+	if (unlikely(page_count(page) == 0))
+		return 1;
 
-		/* Stop doing IO if we've laundered too many pages already. */
-		if (maxlaunder < 0)
-			gfp_mask &= ~(__GFP_IO|__GFP_FS);
+	BUG_ON(!PageInactiveDirty(page));
+	del_page_from_inactive_dirty_list(page);
+	add_page_to_inactive_laundry_list(page);
+	/* store the time we start IO */
+	page->age = (jiffies/HZ)&255;
+	/*
+	 * The page is locked. IO in progress?
+	 * If so, move to laundry and report progress
+	 * Acquire PG_locked early in order to safely
+	 * access page->mapping.
+	 */
+	if (unlikely(TryLockPage(page))) {
+		return 1;
+	}
 
-		/*
-		 * Page is being freed, don't worry about it.
-		 */
-		if (unlikely(page_count(page)) == 0)
-			continue;
+	/*
+	 * The page is in active use or really unfreeable. Move to
+	 * the active list and adjust the page age if needed.
+	 */
+	pte_chain_lock(page);
+	if (page_referenced(page, &over_rsslimit) && !over_rsslimit &&
+			page_mapping_inuse(page)) {
+		del_page_from_inactive_laundry_list(page);
+		add_page_to_active_list(page, INITIAL_AGE);
+		pte_chain_unlock(page);
+		UnlockPage(page);
+		return 1;
+	}
 
+	/*
+	 * Anonymous process memory without backing store. Try to
+	 * allocate it some swap space here.
+	 *
+	 * XXX: implement swap clustering ?
+	 */
+	if (page->pte_chain && !page->mapping && !page->buffers) {
 		/*
-		 * The page is locked. IO in progress?
-		 * Acquire PG_locked early in order to safely
-		 * access page->mapping.
+		 * exception to strict LRU ordering:
+		 * If we now have plenty of ram free,
+		 * don't allocate swap but move the page
+		 * to the active list instead. This can
+		 * happen with bursty workloads
 		 */
-		if (unlikely(TryLockPage(page))) {
-			iopages++;
-			continue;
-		}
 
-		/*
-		 * The page is in active use or really unfreeable. Move to
-		 * the active list and adjust the page age if needed.
-		 */
-		pte_chain_lock(page);
-		if (page_referenced(page, &over_rsslimit) && !over_rsslimit &&
-				page_mapping_inuse(page)) {
-			del_page_from_inactive_dirty_list(page);
-			add_page_to_active_list(page);
-			page->age = max((int)page->age, PAGE_AGE_START);
+		if (free_high(zone) <= 0) {
+			del_page_from_inactive_laundry_list(page);
+			add_page_to_active_list(page, 0);
 			pte_chain_unlock(page);
 			UnlockPage(page);
-			continue;
+			return 0;
 		}
-
-		/*
-		 * Anonymous process memory without backing store. Try to
-		 * allocate it some swap space here.
-		 *
-		 * XXX: implement swap clustering ?
-		 */
-		if (page->pte_chain && !page->mapping && !page->buffers) {
-			/* Don't bother if we can't swap it out now. */
-			if (maxlaunder < 0) {
-				pte_chain_unlock(page);
-				UnlockPage(page);
-				list_del(entry);
-				list_add(entry, &zone->inactive_dirty_list);
-				continue;
-			}
-			page_cache_get(page);
-			pte_chain_unlock(page);
-			spin_unlock(&pagemap_lru_lock);
-			if (!add_to_swap(page)) {
-				activate_page(page);
-				UnlockPage(page);
-				page_cache_release(page);
-				spin_lock(&pagemap_lru_lock);
-				continue;
-			}
+		page_cache_get(page);
+		pte_chain_unlock(page);
+		lru_unlock(zone);
+		if (!add_to_swap(page)) {
+			activate_page(page);
+			lru_lock(zone);
+			UnlockPage(page);
 			page_cache_release(page);
-			spin_lock(&pagemap_lru_lock);
-			pte_chain_lock(page);
+			return 0;
 		}
-
-		/*
-		 * The page is mapped into the page tables of one or more
-		 * processes. Try to unmap it here.
-		 */
-		if (page->pte_chain && page->mapping) {
-			switch (try_to_unmap(page)) {
-				case SWAP_ERROR:
-				case SWAP_FAIL:
-					goto page_active;
-				case SWAP_AGAIN:
-					pte_chain_unlock(page);
-					UnlockPage(page);
-					continue;
-				case SWAP_SUCCESS:
-					; /* try to free the page below */
-			}
+		lru_lock(zone);
+		page_cache_release(page);
+		/* Note: may be on another list ! */
+		if (!PageInactiveLaundry(page)) {
+			UnlockPage(page);
+			return 1;
 		}
-		pte_chain_unlock(page);
-
-		if (PageDirty(page) && page->mapping) {
-			/*
-			 * It is not critical here to write it only if
-			 * the page is unmapped beause any direct writer
-			 * like O_DIRECT would set the PG_dirty bitflag
-			 * on the physical page after having successfully
-			 * pinned it and after the I/O to the page is finished,
-			 * so the direct writes to the page cannot get lost.
-			 */
-			int (*writepage)(struct page *);
-
-			writepage = page->mapping->a_ops->writepage;
-			if ((gfp_mask & __GFP_FS) && writepage) {
-				ClearPageDirty(page);
-				SetPageLaunder(page);
-				page_cache_get(page);
-				spin_unlock(&pagemap_lru_lock);
-
-				writepage(page);
-				maxlaunder--;
-				iopages++;
-				page_cache_release(page);
+		if (unlikely(page_count(page) == 0)) {
+			UnlockPage(page);
+			return 1;
+		}
+		pte_chain_lock(page);
+	}
 
-				spin_lock(&pagemap_lru_lock);
-				continue;
-			} else {
+	/*
+	 * The page is mapped into the page tables of one or more
+	 * processes. Try to unmap it here.
+	 */
+	if (page->pte_chain && page->mapping) {
+		switch (try_to_unmap(page)) {
+			case SWAP_ERROR:
+			case SWAP_FAIL:
+				goto page_active;
+			case SWAP_AGAIN:
+				pte_chain_unlock(page);
 				UnlockPage(page);
-				list_del(entry);
-				list_add(entry, &zone->inactive_dirty_list);
-				continue;
-			}
+				return 0;
+			case SWAP_SUCCESS:
+				; /* fall through, try freeing the page below */
+			/* fixme: add a SWAP_MLOCK case */
 		}
+	}
+	pte_chain_unlock(page);
 
+	if (PageDirty(page) && page->mapping) {
 		/*
-		 * If the page has buffers, try to free the buffer mappings
-		 * associated with this page. If we succeed we try to free
-		 * the page as well.
+		 * The page can be dirtied after we start writing, but
+		 * in that case the dirty bit will simply be set again
+		 * and we'll need to write it again.
 		 */
-		if (page->buffers) {
-			/* To avoid freeing our page before we're done. */
+		int (*writepage)(struct page *);
+
+		writepage = page->mapping->a_ops->writepage;
+		if ((gfp_mask & __GFP_FS) && writepage) {
+			ClearPageDirty(page);
+			SetPageLaunder(page);
 			page_cache_get(page);
+			lru_unlock(zone);
 
-			spin_unlock(&pagemap_lru_lock);
+			writepage(page);
 
-			if (try_to_release_page(page, gfp_mask)) {
-				if (!page->mapping) {
-					/*
-					 * We must not allow an anon page
-					 * with no buffers to be visible on
-					 * the LRU, so we unlock the page after
-					 * taking the lru lock
-					 */
-					spin_lock(&pagemap_lru_lock);
-					UnlockPage(page);
-					__lru_cache_del(page);
+			page_cache_release(page);
+			lru_lock(zone);
+			return 1;
+		} else {
+			del_page_from_inactive_laundry_list(page);
+			add_page_to_inactive_dirty_list(page);
+			/* FIXME: this is wrong for !__GFP_FS !!! */
+			UnlockPage(page);
+			return 0;
+		}
+	}
 
-					/* effectively free the page here */
-					page_cache_release(page);
+	/*
+	 * If the page has buffers, try to free the buffer mappings
+	 * associated with this page. If we succeed we try to free
+	 * the page as well.
+	 */
+	if (page->buffers) {
+		/* To avoid freeing our page before we're done. */
+		page_cache_get(page);
+		lru_unlock(zone);
 
-					cleaned_pages++;
-					continue;
-				} else {
-					/*
-					 * We freed the buffers but may have
-					 * slept; undo the stuff we did before
-					 * try_to_release_page and fall through
-					 * to the next step.
-					 * But only if the page is still on the inact. dirty 
-					 * list.
-					 */
-
-					spin_lock(&pagemap_lru_lock);
-					/* Check if the page was removed from the list
-					 * while we looked the other way. 
-					 */
-					if (!PageInactiveDirty(page)) {
-						page_cache_release(page);
-						continue;
-					}
-					page_cache_release(page);
-				}
-			} else {
-				/* failed to drop the buffers so stop here */
-				UnlockPage(page);
-				page_cache_release(page);
-				maxlaunder--;
-				iopages++;
+		try_to_release_page(page, gfp_mask);
+		UnlockPage(page);
 
-				spin_lock(&pagemap_lru_lock);
-				continue;
-			}
-		}
+		/* 
+		 * If the buffers were the last user of the page we free
+		 * the page here. Because of that we shouldn't hold the
+		 * lru lock yet.
+		 */
+		page_cache_release(page);
 
+		lru_lock(zone);
+		return 1;
+	}
 
+	/*
+	 * If the page is really freeable now, move it to the
+	 * inactive_laundry list to keep LRU order.
+	 *
+	 * We re-test everything since the page could have been
+	 * used by somebody else while we waited on IO above.
+	 * This test is not safe from races; only the one in
+	 * reclaim_page() needs to be.
+	 */
+	pte_chain_lock(page);
+	if (page->mapping && !PageDirty(page) && !page->pte_chain &&
+			page_count(page) == 1) {
+		pte_chain_unlock(page);
+		UnlockPage(page);
+		return 1;
+	} else {
 		/*
-		 * If the page is really freeable now, move it to the
-		 * inactive_clean list.
-		 *
-		 * We re-test everything since the page could have been
-		 * used by somebody else while we waited on IO above.
-		 * This test is not safe from races, but only the one
-		 * in reclaim_page() needs to be.
+		 * OK, we don't know what to do with the page.
+		 * It's no use keeping it here, so we move it
+		 * back to the active list.
 		 */
-		pte_chain_lock(page);
-		if (page->mapping && !PageDirty(page) && !page->pte_chain &&
-				page_count(page) == 1) {
-			del_page_from_inactive_dirty_list(page);
-			add_page_to_inactive_clean_list(page);
-			pte_chain_unlock(page);
-			UnlockPage(page);
-			cleaned_pages++;
-		} else {
-			/*
-			 * OK, we don't know what to do with the page.
-			 * It's no use keeping it here, so we move it to
-			 * the active list.
-			 */
-page_active:
-			del_page_from_inactive_dirty_list(page);
-			add_page_to_active_list(page);
-			pte_chain_unlock(page);
-			UnlockPage(page);
-		}
+ page_active:
+		activate_page_nolock(page);
+		pte_chain_unlock(page);
+		UnlockPage(page);
 	}
-	spin_unlock(&pagemap_lru_lock);
-
-	/* Return the number of pages moved to the inactive_clean list. */
-	return cleaned_pages + iopages;
+	return 0;
 }
 
-/**
- * page_launder - clean dirty inactive pages, move to inactive_clean list
- * @gfp_mask: what operations we are allowed to do
- *
- * This function iterates over all zones and calls page_launder_zone(),
- * balancing still needs to be added...
- */
-int page_launder(int gfp_mask)
+
+unsigned char active_age_bias = 0;
+
+/* Ages down all pages on the active list */
+/* assumes the lru lock held */
+static inline void kachunk(struct zone_struct * zone)
 {
-	struct zone_struct * zone;
-	int freed = 0;
+	int k;
+	if (!list_empty(&zone->active_list[0]))
+		return;
+	if (!zone->active_pages)
+		return;
 
-	/* Global balancing while we have a global shortage. */
-	if (free_high(ALL_ZONES) >= 0)
-		for_each_zone(zone)
-			if (free_plenty(zone) >= 0)
-				freed += page_launder_zone(zone, gfp_mask, 0);
-	
-	/* Clean up the remaining zones with a serious shortage, if any. */
-	for_each_zone(zone)
-		if (free_low(zone) >= 0) {
-			int fullflush = free_min(zone) > 0;
-			freed += page_launder_zone(zone, gfp_mask, fullflush);
-		}
+	for (k = 0; k < MAX_AGE; k++)  {
+		list_splice_init(&zone->active_list[k+1], &zone->active_list[k]);
+		zone->active_count[k] = zone->active_count[k+1];
+		zone->active_count[k+1] = 0;
+	}
 
-	return freed;
+	active_age_bias++;
+	/* flag this zone as having had activity -> rescan to age up is desired */
+	zone->need_scan = 1;
 }
 
+#define BATCH_WORK_AMOUNT	64
+
 /**
  * refill_inactive_zone - scan the active list and find pages to deactivate
  * @priority: how much are we allowed to scan
@@ -503,119 +433,265 @@
  * This function will scan a portion of the active list of a zone to find
  * unused pages, those pages will then be moved to the inactive list.
  */
-int refill_inactive_zone(struct zone_struct * zone, int priority)
+int refill_inactive_zone(struct zone_struct * zone, int priority, int target)
 {
 	int maxscan = zone->active_pages >> priority;
-	int nr_deactivated = 0, over_rsslimit;
-	int target = inactive_high(zone);
 	struct list_head * page_lru;
 	struct page * page;
+	int over_rsslimit;
+	int progress = 0;
 
 	/* Take the lock while messing with the list... */
-	spin_lock(&pagemap_lru_lock);
-	while (maxscan-- && !list_empty(&zone->active_list)) {
-		page_lru = zone->active_list.prev;
+	lru_lock(zone);
+	if (target < BATCH_WORK_AMOUNT)
+		target = BATCH_WORK_AMOUNT;
+
+	while (maxscan-- && zone->active_pages > 0 && target > 0) {
+		if (list_empty(&zone->active_list[0])) {
+			kachunk(zone);
+			continue;
+		}
+		page_lru = zone->active_list[0].prev;
 		page = list_entry(page_lru, struct page, lru);
 
 		/* Wrong page on list?! (list corruption, should not happen) */
-		if (unlikely(!PageActive(page))) {
-			printk("VM: refill_inactive, wrong page on list.\n");
-			list_del(page_lru);
-			nr_active_pages--;
-			continue;
-		}
+		BUG_ON(unlikely(!PageActive(page)));
 		
 		/* Needed to follow page->mapping */
 		if (TryLockPage(page)) {
-			list_del(page_lru);
-			list_add(page_lru, &zone->active_list);
+			/* The page is already locked. This for sure means
+			 * someone is doing stuff with it which makes it
+			 * active by definition ;)
+			 */
+			del_page_from_active_list(page);
+			add_page_to_active_list(page, INITIAL_AGE);
 			continue;
 		}
 
 		/*
-		 * If the object the page is in is not in use we don't
-		 * bother with page aging.  If the page is touched again
-		 * while on the inactive_clean list it'll be reactivated.
-		 * From here until the end of the current iteration
-		 * both PG_locked and the pte_chain_lock are held.
+		 * Do aging on the pages.
 		 */
-		pte_chain_lock(page);
-		if (!page_mapping_inuse(page)) {
-			pte_chain_unlock(page);
+		if (page_referenced(page, &over_rsslimit) && !over_rsslimit) {
+			age_page_up_nolock(page, 0);
 			UnlockPage(page);
-			drop_page(page);
 			continue;
 		}
 
+		deactivate_page_nolock(page);
+		target--;
+		progress++;
+		UnlockPage(page);
+	}
+	lru_unlock(zone);
+
+	return progress;
+}
+
+static int need_active_scan(struct zone_struct * zone)
+{	/* Used by kscand to decide whether to scan this zone's active lists. */
+	int low = 0, high = 0;	/* page totals: ages [0, MAX_AGE/2) and [MAX_AGE/2, MAX_AGE] */
+	int k;
+	for (k=0; k < MAX_AGE/2; k++)
+		low += zone->active_count[k];
+	/* the upper range is inclusive of MAX_AGE */
+	for (k=MAX_AGE/2; k <= MAX_AGE; k++)
+		high += zone->active_count[k];
+	/* returns 1 only when the low-age half holds more pages than the high-age half */
+	if (high<low)
+		return 1;
+	return 0;
+}
+
+static int scan_active_list(struct zone_struct * zone, int age)
+{
+	struct list_head * list, *page_lru , *next;
+	struct page * page;
+	int over_rsslimit;
+	/* Ages up the referenced pages on active_list[age]; always returns 0. */
+	list = &zone->active_list[age];
+	/* _safe iteration: age_page_up_nolock() unlinks the page from this list */
+	/* Take the lock while messing with the list... */
+	lru_lock(zone);
+	list_for_each_safe(page_lru, next, list) {
+		page = list_entry(page_lru, struct page, lru);
+		pte_chain_lock(page);
+		if (page_referenced(page, &over_rsslimit) && !over_rsslimit)
+			age_page_up_nolock(page, age);
+		pte_chain_unlock(page);
+	}
+	lru_unlock(zone);
+	return 0;
+}
+
+/*
+ * Move pages from the inactive laundry list to the inactive clean list while
+ * there is a need; returns the number moved. May sleep for IO if gfp_mask allows.
+ */
+int rebalance_laundry_zone(struct zone_struct * zone, int max_work, unsigned int gfp_mask)
+{
+	struct list_head * page_lru;
+	int max_loop;
+	int work_done = 0;
+	struct page * page;
+
+	max_loop = max_work;
+	if (max_loop < BATCH_WORK_AMOUNT)
+		max_loop = BATCH_WORK_AMOUNT;
+	/* Take the lock while messing with the list... */
+	lru_lock(zone);
+	while (max_loop-- && !list_empty(&zone->inactive_laundry_list)) {
+		page_lru = zone->inactive_laundry_list.prev;
+		page = list_entry(page_lru, struct page, lru);
+
+		/* Wrong page on list?! (list corruption, should not happen) */
+		BUG_ON(unlikely(!PageInactiveLaundry(page)));
+
+		/* TryLock to see if the page IO is done */
+		if (TryLockPage(page)) {
+			/*
+			 * Page is locked (IO in progress?). If we can sleep,
+			 * wait for it to finish, except when we've already
+			 * done enough work.
+			 */
+			if ((gfp_mask & __GFP_WAIT) && (work_done < max_work)) {
+				int timed_out;
+
+				page_cache_get(page);
+				lru_unlock(zone);
+				run_task_queue(&tq_disk);
+				timed_out = wait_on_page_timeout(page, 5 * HZ);
+				lru_lock(zone);
+				page_cache_release(page);
+				/*
+				 * If we timed out and the page has been in
+				 * flight for over 30 seconds, stop waiting on
+				 * it and move it back to the dirty list. The
+				 * stamp in page->age is seconds modulo 256.
+				 */
+				if (timed_out && PageInactiveLaundry(page)) {
+					unsigned char now;
+					now = (jiffies/HZ)&255;
+					if ((unsigned char)(now - page->age) > 30) {
+						del_page_from_inactive_laundry_list(page);
+						add_page_to_inactive_dirty_list(page);
+					}
+					continue;
+				}
+				/* We didn't make any progress for our caller,
+				 * but we are actively avoiding a livelock
+				 * so undo the decrement and wait on this page
+				 * some more, until IO finishes or we timeout.
+				 */
+				max_loop++;
+				continue;
+			} else
+				/* No dice, we can't wait for IO */
+				break;
+		}
+		UnlockPage(page);
+
 		/*
-		 * Do aging on the pages.
+		 * If we get here either the IO on the page is done or
+		 * IO never happened because it was clean. Either way
+		 * move it to the inactive clean list.
 		 */
-		if (page_referenced(page, &over_rsslimit)) {
-			age_page_up(page);
-		} else {
-			age_page_down(page);
-		}
 
-		/* 
-		 * If the page age is 'hot' and the process using the
-		 * page doesn't exceed its RSS limit we keep the page.
-		 * Otherwise we move it to the inactive_dirty list.
+		/* FIXME: check if the page is still clean or is accessed ? */
+
+		del_page_from_inactive_laundry_list(page);
+		add_page_to_inactive_clean_list(page);
+		work_done++;
+
+		/*
+		 * If we've done the minimal batch of work and there's
+		 * no longer a need to rebalance, abort now.
 		 */
-		if (page->age && !over_rsslimit) {
-			list_del(page_lru);
-			list_add(page_lru, &zone->active_list);
-		} else {
-			deactivate_page_nolock(page);
-			if (++nr_deactivated > target) {
-				pte_chain_unlock(page);
-				UnlockPage(page);
-				goto done;
-			}
-		}
-		pte_chain_unlock(page);
-		UnlockPage(page);
+		if ((work_done > BATCH_WORK_AMOUNT) && (!need_rebalance_laundry(zone)))
+			break;
+	}
 
-		/* Low latency reschedule point */
-		if (current->need_resched) {
-			spin_unlock(&pagemap_lru_lock);
-			schedule();
-			spin_lock(&pagemap_lru_lock);
-		}
+	lru_unlock(zone);
+	return work_done;
+}
+
+/*
+ * Move max_work pages from the dirty list as long as there is a need.
+ * Start IO if the gfp_mask allows it.
+ */
+int rebalance_dirty_zone(struct zone_struct * zone, int max_work, unsigned int gfp_mask)
+{
+	struct list_head * page_lru;
+	int max_loop;
+	int work_done = 0;
+	struct page * page;
+
+	max_loop = max_work;
+	if (max_loop < BATCH_WORK_AMOUNT)
+		max_loop = BATCH_WORK_AMOUNT;
+	/* Take the lock while messing with the list... */
+	lru_lock(zone);
+	while (max_loop-- && !list_empty(&zone->inactive_dirty_list)) {
+		page_lru = zone->inactive_dirty_list.prev;
+		page = list_entry(page_lru, struct page, lru);
+
+		/* Wrong page on list?! (list corruption, should not happen) */
+		BUG_ON(unlikely(!PageInactiveDirty(page)));
+
+		/*
+		 * Note: launder_page() sleeps so we can't safely look at
+		 * the page after this point!
+		 *
+		 * If we fail (only happens if we can't do IO) we just try
+		 * again on another page; launder_page makes sure we won't
+		 * see the same page over and over again.
+		 */
+		if (!launder_page(zone, gfp_mask, page))
+			continue;
+
+		work_done++;
+
+		/*
+		 * If we've done the minimal batch of work and there's
+		 * no longer any need to rebalance, abort now.
+		 */
+		if ((work_done > BATCH_WORK_AMOUNT) && (!need_rebalance_dirty(zone)))
+			break;
 	}
+	lru_unlock(zone);
 
-done:
-	spin_unlock(&pagemap_lru_lock);
+	return work_done;
+}
 
-	return nr_deactivated;
+/* goal_percentage: target share of the zone size for laundry + clean + free pages */
+int rebalance_inactive_zone(struct zone_struct * zone, int max_work, unsigned int gfp_mask, int goal_percentage)
+{
+	int ret = 0;
+	/* first deactivate memory */
+	if (((zone->inactive_laundry_pages + zone->inactive_clean_pages + zone->free_pages)*100 < zone->size * goal_percentage) &&
+			(inactive_high(zone) > 0))
+		ret += refill_inactive_zone(zone, 0, max_work + BATCH_WORK_AMOUNT);
+
+	if (need_rebalance_dirty(zone))
+		ret += rebalance_dirty_zone(zone, max_work, gfp_mask);
+	if (need_rebalance_laundry(zone))
+		ret += rebalance_laundry_zone(zone, max_work, gfp_mask);
+	return ret;
 }
 
-/**
- * refill_inactive - checks all zones and refills the inactive list as needed
- *
- * This function tries to balance page eviction from all zones by aging
- * the pages from each zone in the same ratio until the global inactive
- * shortage is resolved. After that it does one last "clean-up" scan to
- * fix up local inactive shortages.
- */
-int refill_inactive(void)
+int rebalance_inactive(unsigned int gfp_mask, int percentage)
 {
-	int maxtry = 1 << DEF_PRIORITY;
-	zone_t * zone;
+	struct zone_struct * zone;
+	int max_work;
 	int ret = 0;
 
-	/* Global balancing while we have a global shortage. */
-	while (maxtry-- && inactive_low(ALL_ZONES) >= 0) {
-		for_each_zone(zone) {
-			if (inactive_high(zone) >= 0)
-				ret += refill_inactive_zone(zone, DEF_PRIORITY);
-		}
-	}
+	max_work = 4 * BATCH_WORK_AMOUNT;
+	/* If we're in deeper trouble, do more work */
+	if (percentage >= 50)
+		max_work = 8 * BATCH_WORK_AMOUNT;
 
-	/* Local balancing for zones which really need it. */
-	for_each_zone(zone) {
-		if (inactive_min(zone) >= 0)
-			ret += refill_inactive_zone(zone, 0);
-	}
+	for_each_zone(zone)
+		ret += rebalance_inactive_zone(zone, max_work, gfp_mask, percentage);
+		/* 4 * BATCH_WORK_AMOUNT needs tuning */
 
 	return ret;
 }
@@ -636,7 +712,9 @@
 
 	for_each_zone(zone)
 		if (inactive_high(zone) > 0)
-			refill_inactive_zone(zone, priority);
+			refill_inactive_zone(zone, priority, BATCH_WORK_AMOUNT);
+	for_each_zone(zone)
+			rebalance_dirty_zone(zone, BATCH_WORK_AMOUNT, GFP_KSWAPD);
 }
 
 /*
@@ -655,18 +733,13 @@
 	 * Eat memory from filesystem page cache, buffer cache,
 	 * dentry, inode and filesystem quota caches.
 	 */
-	ret += page_launder(gfp_mask);
+	ret += rebalance_inactive(gfp_mask, 100);
 	ret += shrink_dcache_memory(DEF_PRIORITY, gfp_mask);
 	ret += shrink_icache_memory(1, gfp_mask);
 #ifdef CONFIG_QUOTA
 	ret += shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
 #endif
 
-	/*
-	 * Move pages from the active list to the inactive list.
-	 */
-	refill_inactive();
-
 	/* 	
 	 * Reclaim unused slab cache memory.
 	 */
@@ -682,12 +755,54 @@
 	 * Hmm.. Cache shrink failed - time to kill something?
 	 * Mhwahahhaha! This is the part I really like. Giggle.
 	 */
-	if (ret < free_low(ANY_ZONE))
+	if (ret < free_low(ANY_ZONE) && (gfp_mask&__GFP_WAIT))
 		out_of_memory();
 
 	return ret;
 }
 
+/*
+ * kswapd flavour of do_try_to_free_pages(): shrinks the dcache,
+ * icache and (with CONFIG_QUOTA) dquot caches, rebalances the
+ * inactive lists, then refills the free list.
+ *
+ * Returns only what the cache shrinkers reported; the results of
+ * the rebalance_*() calls are not added to the return value.
+ */
+static int do_try_to_free_pages_kswapd(unsigned int gfp_mask)
+{
+	int ret = 0;
+	struct zone_struct * zone;
+
+	ret += shrink_dcache_memory(DEF_PRIORITY, gfp_mask);
+	ret += shrink_icache_memory(DEF_PRIORITY, gfp_mask);
+#ifdef CONFIG_QUOTA
+	ret += shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
+#endif
+
+	/*
+	 * Rebalance the inactive lists of every zone, using a goal
+	 * of 5 percent for laundry + clean + free pages per zone.
+	 */
+	rebalance_inactive(gfp_mask, 5);
+	/* NOTE(review): loops until need_rebalance_dirty() clears -- confirm it always makes progress */
+	for_each_zone(zone)
+		while (need_rebalance_dirty(zone))
+			rebalance_dirty_zone(zone,  16 * BATCH_WORK_AMOUNT,  gfp_mask);
+	/* gfp_mask 0: rebalance_laundry_zone() will not sleep waiting for page IO */
+	for_each_zone(zone)
+		if (free_high(zone)>0)
+			rebalance_laundry_zone(zone, BATCH_WORK_AMOUNT, 0);
+
+	refill_freelist();
+
+	/* Start IO when needed. */
+	if (free_plenty(ALL_ZONES) > 0 || free_low(ANY_ZONE) > 0)
+		run_task_queue(&tq_disk);
+
+	return ret;
+}
+
 /**
  * refill_freelist - move inactive_clean pages to free list if needed
  *
@@ -764,7 +879,7 @@
 		 * zone is very short on free pages.
 		 */
 		if (free_high(ALL_ZONES) >= 0 || free_low(ANY_ZONE) > 0)
-			do_try_to_free_pages(GFP_KSWAPD);
+			do_try_to_free_pages_kswapd(GFP_KSWAPD);
 
 		refill_freelist();
 
@@ -846,7 +961,7 @@
 	/* OK, the VM is very loaded. Sleep instead of using all CPU. */
 	kswapd_overloaded = 1;
 	set_current_state(TASK_UNINTERRUPTIBLE);
-	schedule_timeout(HZ / 4);
+	schedule_timeout(HZ / 40);
 	kswapd_overloaded = 0;
 	return;
 }
@@ -888,6 +1003,7 @@
 void rss_free_pages(unsigned int gfp_mask)
 {
 	long pause = 0;
+	struct zone_struct * zone;
 
 	if (current->flags & PF_MEMALLOC)
 		return;
@@ -895,7 +1011,10 @@
 	current->flags |= PF_MEMALLOC;
 
 	do {
-		page_launder(gfp_mask);
+		rebalance_inactive(gfp_mask, 100);
+		for_each_zone(zone)
+			if (free_plenty(zone) >= 0)
+				rebalance_laundry_zone(zone, BATCH_WORK_AMOUNT, 0);
 
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule_timeout(pause);
@@ -907,11 +1026,52 @@
 	return;
 }
 
+/*
+ * The background page scanning daemon, started as a kernel thread
+ * from kswapd_init().
+ *
+ * This is the part that scans the active lists in the background to find
+ * pages that are referenced and increases their age score.
+ * It is important that this scan rate is not proportional to vm pressure
+ * per se otherwise cpu usage becomes unbounded. On the other hand, if there's
+ * no VM pressure at all it shouldn't age stuff either otherwise everything
+ * ends up at the maximum age.
+ */
+int kscand(void *unused)
+{
+	struct task_struct *tsk = current;
+	struct zone_struct * zone;
+	int age;
+
+	daemonize();
+	strcpy(tsk->comm, "kscand");
+	sigfillset(&tsk->blocked);
+	/* Never returns: sleep, then run one scan pass over all zones, forever. */
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(10*HZ);	/* sleep for 10*HZ jiffies between passes */
+		for_each_zone(zone) {
+
+			/* skip zones where need_active_scan() reports no need */
+			if (!need_active_scan(zone))
+				continue;
+			for (age = 0; age < MAX_AGE; age++)  {
+				scan_active_list(zone, age);
+				if (current->need_resched)
+					schedule();
+			}
+			/* active_list[MAX_AGE] itself is not scanned by the loop above */
+		}
+	}
+}
+
+
 static int __init kswapd_init(void)
 {
 	printk("Starting kswapd\n");
 	swap_setup();
 	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+	kernel_thread(kscand, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
 	return 0;
 }
 



-- 
Paul P 'Stingray' Komkoff 'Greatest' Jr /// (icq)23200764 /// (http)stingr.net
  When you're invisible, the only one really watching you is you (my keychain)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Copyright © 2002, Eklektix, Inc.
Comments and public postings are copyrighted by their creators.
Linux is a registered trademark of Linus Torvalds