|
|
Subscribe / Log in / New account

second try for swap prefetch (does Oops!)

From:  Thomas Schlichter <schlicht@uni-mannheim.de>
To:  Andrew Morton <akpm@digeo.com>
Subject:  [RFC] second try for swap prefetch (does Oops!)
Date:  Thu, 17 Apr 2003 18:02:13 +0200
Cc:  linux-kernel@vger.kernel.org

diff -urP linux-2.5.67/arch/i386/Kconfig linux-2.5.67_patched/arch/i386/Kconfig
--- linux-2.5.67/arch/i386/Kconfig	Thu Apr 10 19:25:40 2003
+++ linux-2.5.67_patched/arch/i386/Kconfig	Thu Apr 10 17:47:36 2003
@@ -373,6 +373,13 @@
 	depends on MK8 || MPENTIUM4
 	default y
 
+config SWAP_PREFETCH
+	tristate "Prefetch swapped memory"
+	depends on SWAP
+	help
+	  This option enables the kernel to periodically prefetch swapped
+	  memory pages while enough free memory is available.
+
 config HUGETLB_PAGE
 	bool "Huge TLB Page Support"
 	help
diff -urP linux-2.5.67/fs/inode.c linux-2.5.67_patched/fs/inode.c
--- linux-2.5.67/fs/inode.c	Mon Apr  7 19:32:58 2003
+++ linux-2.5.67_patched/fs/inode.c	Sat Apr 12 03:30:39 2003
@@ -180,6 +180,7 @@
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
 	sema_init(&inode->i_sem, 1);
+	INIT_RADIX_TREE(&inode->i_data.swap_tree, GFP_ATOMIC);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	rwlock_init(&inode->i_data.page_lock);
 	init_MUTEX(&inode->i_data.i_shared_sem);
diff -urP linux-2.5.67/include/linux/fs.h linux-2.5.67_patched/include/linux/fs.h
--- linux-2.5.67/include/linux/fs.h	Mon Apr  7 19:30:58 2003
+++ linux-2.5.67_patched/include/linux/fs.h	Sat Apr 12 03:31:24 2003
@@ -312,6 +312,7 @@
 struct backing_dev_info;
 struct address_space {
 	struct inode		*host;		/* owner: inode, block_device */
+	struct radix_tree_root	swap_tree;	/* radix tree of swapped pages */
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
 	rwlock_t		page_lock;	/* and rwlock protecting it */
 	struct list_head	clean_pages;	/* list of clean pages */
diff -urP linux-2.5.67/include/linux/swap.h linux-2.5.67_patched/include/linux/swap.h
--- linux-2.5.67/include/linux/swap.h	Thu Apr 10 19:25:40 2003
+++ linux-2.5.67_patched/include/linux/swap.h	Thu Apr 10 18:36:33 2003
@@ -155,6 +155,8 @@
 extern unsigned int nr_free_pages_pgdat(pg_data_t *pgdat);
 extern unsigned int nr_free_buffer_pages(void);
 extern unsigned int nr_free_pagecache_pages(void);
+extern unsigned int nr_avail_buffer_pages(void);
+extern unsigned int nr_avail_pagecache_pages(void);
 
 /* linux/mm/swap.c */
 extern void FASTCALL(lru_cache_add(struct page *));
diff -urP linux-2.5.67/include/linux/swap_prefetch.h linux-2.5.67_patched/include/linux/swap_prefetch.h
--- linux-2.5.67/include/linux/swap_prefetch.h	Thu Jan  1 01:00:00 1970
+++ linux-2.5.67_patched/include/linux/swap_prefetch.h	Wed Apr 16 16:00:09 2003
@@ -0,0 +1,57 @@
+#ifndef _LINUX_SWAP_PREFETCH_H
+#define _LINUX_SWAP_PREFETCH_H
+
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/radix-tree.h>
+
+struct swapped_entry_t {	/* one (mapping, index) pair queued for prefetch */
+	struct list_head	head;	/* link in swapped_list.head */
+	swp_entry_t		swp_entry;	/* .val holds the index given to add_to_swapped_list() */
+	struct address_space	*mapping;	/* mapping given to add_to_swapped_list() */
+};
+
+struct swapped_list_t {	/* global queue of prefetch candidates */
+	spinlock_t		lock;	/* held around every list and swap_tree update in this header */
+	struct list_head	head;	/* queued swapped_entry_t, newest first (list_add) */
+	kmem_cache_t		*cache;	/* slab cache the entries are allocated from */
+};
+
+extern struct swapped_list_t	swapped_list;
+
+/* Queue (mapping, index) for a later prefetch; does nothing on failure. */
+static inline void add_to_swapped_list(struct address_space *mapping,
+							unsigned long index)
+{
+	struct swapped_entry_t *item;
+
+	item = kmem_cache_alloc(swapped_list.cache, GFP_ATOMIC);
+	if (!item)
+		return;
+	item->mapping       = mapping;
+	item->swp_entry.val = index;
+
+	spin_lock(&swapped_list.lock);
+	/* index the entry per mapping; give it back if the insert reports an error */
+	if (radix_tree_insert(&mapping->swap_tree, index, item) == 0)
+		list_add(&item->head, &swapped_list.head);
+	else
+		kmem_cache_free(swapped_list.cache, item);
+	spin_unlock(&swapped_list.lock);
+}
+
+/* Drop and free the queued entry for (mapping, index), if there is one. */
+static inline void remove_from_swapped_list(struct address_space *mapping,
+							unsigned long index)
+{
+	struct swapped_entry_t *stale;
+	spin_lock(&swapped_list.lock);
+	stale = radix_tree_delete(&mapping->swap_tree, index);
+	if (stale != NULL) {
+		list_del(&stale->head);
+		kmem_cache_free(swapped_list.cache, stale);
+	}
+	spin_unlock(&swapped_list.lock);
+}
+
+#endif /* _LINUX_SWAP_PREFETCH_H */
diff -urP linux-2.5.67/kernel/ksyms.c linux-2.5.67_patched/kernel/ksyms.c
--- linux-2.5.67/kernel/ksyms.c	Thu Apr 10 19:25:40 2003
+++ linux-2.5.67_patched/kernel/ksyms.c	Mon Apr 14 01:51:51 2003
@@ -58,6 +58,7 @@
 #include <linux/ptrace.h>
 #include <linux/time.h>
 #include <linux/backing-dev.h>
+#include <linux/swap_prefetch.h>
 #include <asm/checksum.h>
 
 #if defined(CONFIG_PROC_FS)
@@ -70,6 +71,13 @@
 extern struct timezone sys_tz;
 
 extern int panic_timeout;
+
+/* needed for swap prefetch support */
+EXPORT_SYMBOL(swapped_list);
+EXPORT_SYMBOL(swapper_space);
+EXPORT_SYMBOL(swapin_readahead);
+EXPORT_SYMBOL(do_page_cache_readahead);
+EXPORT_SYMBOL(nr_avail_pagecache_pages);
 
 /* process memory management */
 EXPORT_SYMBOL(do_mmap_pgoff);
diff -urP linux-2.5.67/mm/Makefile linux-2.5.67_patched/mm/Makefile
--- linux-2.5.67/mm/Makefile	Thu Apr 10 19:25:40 2003
+++ linux-2.5.67_patched/mm/Makefile	Thu Apr 10 17:47:36 2003
@@ -12,3 +12,5 @@
 			   slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y)
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
+
+obj-$(CONFIG_SWAP_PREFETCH)	+= swap_prefetch.o
diff -urP linux-2.5.67/mm/filemap.c linux-2.5.67_patched/mm/filemap.c
--- linux-2.5.67/mm/filemap.c	Mon Apr  7 19:31:02 2003
+++ linux-2.5.67_patched/mm/filemap.c	Wed Apr 16 16:04:40 2003
@@ -16,8 +16,7 @@
 #include <linux/fs.h>
 #include <linux/aio.h>
 #include <linux/kernel_stat.h>
-#include <linux/mm.h>
-#include <linux/swap.h>
+#include <linux/swap_prefetch.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
 #include <linux/file.h>
@@ -84,6 +83,7 @@
 
 	BUG_ON(PageDirty(page) && !PageSwapCache(page));
 
+	remove_from_swapped_list(mapping, page->index);
 	radix_tree_delete(&mapping->page_tree, page->index);
 	list_del(&page->list);
 	page->mapping = NULL;
@@ -223,8 +223,11 @@
 int add_to_page_cache(struct page *page, struct address_space *mapping,
 		pgoff_t offset, int gfp_mask)
 {
-	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+	int error;
 
+	remove_from_swapped_list(mapping, offset);
+
+	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 	if (error == 0) {
 		page_cache_get(page);
 		write_lock(&mapping->page_lock);
diff -urP linux-2.5.67/mm/page_alloc.c linux-2.5.67_patched/mm/page_alloc.c
--- linux-2.5.67/mm/page_alloc.c	Thu Apr 10 19:25:40 2003
+++ linux-2.5.67_patched/mm/page_alloc.c	Thu Apr 10 17:47:36 2003
@@ -787,6 +787,48 @@
 }
 #endif
 
+/* Largest per-zone surplus of free pages over the running low-watermark sum. */
+static unsigned int nr_avail_zone_pages(int offset)
+{
+	unsigned long best = 0;
+	pg_data_t *node;
+
+	for_each_pgdat(node) {
+		struct zone **cursor = node->node_zonelists[offset].zones;
+		unsigned long threshold = 0;
+		struct zone *z;
+
+		while ((z = *cursor++) != NULL) {
+			unsigned long nr_free = z->free_pages;
+			unsigned long nr_low = z->pages_low;
+
+			threshold += nr_low;
+			/* only zones above the running threshold contribute */
+			if (nr_free > threshold)
+				best = max(best, nr_free - threshold);
+			threshold += nr_low * sysctl_lower_zone_protection;
+		}
+	}
+
+	return best;
+}
+
+/* Available RAM allocatable within ZONE_DMA and ZONE_NORMAL (GFP_USER zonelist). */
+unsigned int nr_avail_buffer_pages(void)
+{
+	const int zonelist_index = GFP_USER & GFP_ZONEMASK;
+
+	return nr_avail_zone_pages(zonelist_index);
+}
+
+/* Available RAM allocatable within all zones (GFP_HIGHUSER zonelist). */
+unsigned int nr_avail_pagecache_pages(void)
+{
+	const int zonelist_index = GFP_HIGHUSER & GFP_ZONEMASK;
+
+	return nr_avail_zone_pages(zonelist_index);
+}
+
 #ifdef CONFIG_NUMA
 static void show_node(struct zone *zone)
 {
diff -urP linux-2.5.67/mm/swap.c linux-2.5.67_patched/mm/swap.c
--- linux-2.5.67/mm/swap.c	Mon Apr  7 19:31:05 2003
+++ linux-2.5.67_patched/mm/swap.c	Sat Apr 12 03:19:53 2003
@@ -13,9 +13,8 @@
  * Buffermem limits added 12.3.98, Rik van Riel.
  */
 
-#include <linux/mm.h>
 #include <linux/kernel_stat.h>
-#include <linux/swap.h>
+#include <linux/swap_prefetch.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
 #include <linux/pagevec.h>
@@ -24,6 +23,11 @@
 #include <linux/buffer_head.h>
 #include <linux/percpu.h>
 
+struct swapped_list_t swapped_list = {	/* .cache is not set here; it stays zero-initialised */
+	.lock = SPIN_LOCK_UNLOCKED,		/* starts out unlocked */
+	.head = LIST_HEAD_INIT(swapped_list.head),	/* starts out as an empty list */
+};
+
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
 
@@ -390,4 +394,12 @@
 	 * Right now other parts of the system means that we
 	 * _really_ don't want to cluster much more
 	 */
+
+	/*
+	 * Create kmem cache for swapped entries
+	 */
+ 	swapped_list.cache = kmem_cache_create("swapped_entry",
+		sizeof(struct swapped_entry_t), 0, 0, NULL, NULL);
+	if(!swapped_list.cache)
+		panic("swap_setup(): cannot create swapped_entry SLAB cache");
 }
diff -urP linux-2.5.67/mm/swap_prefetch.c linux-2.5.67_patched/mm/swap_prefetch.c
--- linux-2.5.67/mm/swap_prefetch.c	Thu Jan  1 01:00:00 1970
+++ linux-2.5.67_patched/mm/swap_prefetch.c	Thu Apr 17 00:29:40 2003
@@ -0,0 +1,119 @@
+/*
+ *	Periodic prefetcher for the entries queued on swapped_list.
+ *
+ *	A self re-arming timer fires every "interval" seconds and queues
+ *	prefetch_work; the work handler then drains swapped_list while more
+ *	than "reserved_pages" pagecache pages are available.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+#include <linux/swap_prefetch.h>
+
+#define RESERVED_PAGES	50		/* keep 200 kByte of pagecache free */
+#define INTERVAL	60		/* (secs) Default is 1 minute */
+
+static int reserved_pages = RESERVED_PAGES;
+static int interval       = INTERVAL;
+
+MODULE_PARM(reserved_pages,"i");
+MODULE_PARM_DESC(reserved_pages,
+	"count of pagecache pages to keep free (default 50)");
+
+MODULE_PARM(interval,"i");
+MODULE_PARM_DESC(interval,
+	"delay in seconds to wait between memory checks (default 60)");
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Thomas Schlichter <thomas.schlichter@web.de>");
+MODULE_DESCRIPTION("prefetches swap pages when there is free memory");
+
+/*
+ *	Our timer
+ */
+static void prefetch_timer_handler(unsigned long data);
+static struct timer_list prefetch_timer =
+		TIMER_INITIALIZER(prefetch_timer_handler, 0, 0);
+
+/*
+ *	Our work
+ */
+static void prefetch_work_handler(void *data);
+static DECLARE_WORK(prefetch_work, prefetch_work_handler, 0);
+
+/*
+ *	If the timer expires, queue the work and re-arm the timer so that
+ *	it fires again "interval" seconds from now.
+ */
+static void prefetch_timer_handler(unsigned long data)
+{
+	schedule_work(&prefetch_work);
+	prefetch_timer.expires = jiffies + interval * HZ;
+	add_timer(&prefetch_timer);
+}
+
+/*
+ *	..do the work: pop entries off swapped_list one at a time and start
+ *	readahead for each, until no more than reserved_pages pages are
+ *	available or the list is empty.  The list lock is dropped before the
+ *	readahead calls.
+ */
+static void prefetch_work_handler(void *data)
+{
+	/* nr_avail_pagecache_pages() returns unsigned int, hence %u */
+	printk(KERN_INFO "Available pages before: %u\n", nr_avail_pagecache_pages());
+
+	while(nr_avail_pagecache_pages() > reserved_pages) {
+		struct swapped_entry_t *entry;
+
+		spin_lock(&swapped_list.lock);
+		if(list_empty(&swapped_list.head)) {
+			spin_unlock(&swapped_list.lock);
+			break;
+		}
+		entry = list_entry(swapped_list.head.next, struct swapped_entry_t, head);
+		/*
+		 * NOTE(review): entry->mapping is not pinned.  If the
+		 * address_space was freed or re-initialised while this entry
+		 * was queued, this walks a stale radix tree -- verify lifetime.
+		 */
+		radix_tree_delete(&entry->mapping->swap_tree, entry->swp_entry.val);
+		list_del(&entry->head);
+		spin_unlock(&swapped_list.lock);
+
+		/* swapper_space entries use swap readahead, all others pagecache readahead */
+		if(entry->mapping == &swapper_space)
+			swapin_readahead(entry->swp_entry);
+		else
+			do_page_cache_readahead(entry->mapping, NULL, entry->swp_entry.val, 1);
+		kmem_cache_free(swapped_list.cache, entry);
+	}
+
+	printk(KERN_INFO "Available pages after: %u\n", nr_avail_pagecache_pages());
+}
+
+/*
+ *	Module load: run the timer handler once by hand, which queues the
+ *	first prefetch pass and arms the timer.
+ */
+static int __init prefetch_init(void)
+{
+	prefetch_timer_handler(0);
+	return 0;
+}
+
+/*
+ *	Module unload: stop the timer and wait for queued work, so that
+ *	neither handler can still run once the module is gone.
+ */
+static void __exit prefetch_exit(void)
+{
+	/* the handler re-arms the timer, so del_timer() alone can lose the race */
+	del_timer_sync(&prefetch_timer);
+	/* the timer may already have queued prefetch_work; let it finish */
+	flush_scheduled_work();
+}
+
+module_init(prefetch_init);
+module_exit(prefetch_exit);
diff -urP linux-2.5.67/mm/swap_state.c linux-2.5.67_patched/mm/swap_state.c
--- linux-2.5.67/mm/swap_state.c	Thu Apr 10 19:25:40 2003
+++ linux-2.5.67_patched/mm/swap_state.c	Sat Apr 12 03:29:59 2003
@@ -33,6 +33,7 @@
 extern struct address_space_operations swap_aops;
 
 struct address_space swapper_space = {
+	.swap_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
 	.page_lock	= RW_LOCK_UNLOCKED,
 	.clean_pages	= LIST_HEAD_INIT(swapper_space.clean_pages),
diff -urP linux-2.5.67/mm/vmscan.c linux-2.5.67_patched/mm/vmscan.c
--- linux-2.5.67/mm/vmscan.c	Thu Apr 10 19:25:40 2003
+++ linux-2.5.67_patched/mm/vmscan.c	Thu Apr 17 14:41:17 2003
@@ -11,10 +11,9 @@
  *  Multiqueue VM started 5.8.00, Rik van Riel.
  */
 
-#include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
-#include <linux/swap.h>
+#include <linux/swap_prefetch.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/highmem.h>
@@ -417,6 +416,9 @@
 		ret++;
 		if (!pagevec_add(&freed_pvec, page))
 			__pagevec_release_nonlru(&freed_pvec);
+		if (mapping)
+//		if (mapping == &swapper_space)
+			add_to_swapped_list(mapping, page->index);
 		continue;
 
 activate_locked:
ksymoops 2.4.2 on i586 2.5.67.  Options used
     -V (default)
     -k /proc/ksyms (default)
     -l /proc/modules (default)
     -o /lib/modules/2.5.67/ (default)
     -m /boot/System.map-2.5.67 (default)

Warning: You did not tell me where to find symbol information.  I will
assume that the log matches the kernel and modules that are running
right now and I'll use the default options above for symbol resolution.
If the current kernel and/or modules do not match the log, you can get
more accurate output by telling me the kernel version and where to find
map, modules, ksyms etc.  ksymoops -h explains the options.

Error (regular_file): read_ksyms stat /proc/ksyms failed
No modules in ksyms, skipping objects
No ksyms, skipping lsmod
Unable to handle kernel paging request at virtual address 64815c28
c02018e8
*pde = 00000000
Oops: 0000 [#1]
CPU:    0
EIP:    0060:[<c02018e8>]    Tainted: P  
Using defaults from ksymoops -t elf32-i386 -a i386
EFLAGS: 00010283
eax: 64815c28   ebx: c12b9f3c   ecx: 8b21ca84   edx: 64815c24
esi: 8b21ca7e   edi: 00000001   ebp: c12b9f6c   esp: c12b9f20
ds: 007b   es: 007b   ss: 0068
Stack: c2e34c6c c03840e0 c12b8000 c1daf716 00000000 00000000 ceeb5570 64815c24 
       64815c28 00000282 c2e34c50 c2e34c50 c03840e0 c12b8000 00000000 00000000 
       000003d4 00002a76 c12b9f74 c12b9f88 d4b0415f ceeb5568 00000001 c12b8000 
 [<d4b0415f>] prefetch_work_handler+0x12f/0x210 [swap_prefetch]
 [<d4b04d80>] prefetch_work+0x0/0x60 [swap_prefetch]
 [<c0139dcf>] worker_thread+0x28f/0x438
 [<c0139b40>] worker_thread+0x0/0x438
 [<d4b04030>] prefetch_work_handler+0x0/0x210 [swap_prefetch]
 [<c0122d38>] default_wake_function+0x0/0x18
 [<c0122d38>] default_wake_function+0x0/0x18
 [<c01081e5>] kernel_thread_helper+0x5/0xc
Code: 8b 10 85 d2 74 6a 89 f8 89 f1 d3 e8 83 e0 3f 8d 44 82 04 89 

>>EIP; c02018e8 <radix_tree_delete+4c/c8>   <=====
Code;  c02018e8 <radix_tree_delete+4c/c8>
00000000 <_EIP>:
Code;  c02018e8 <radix_tree_delete+4c/c8>   <=====
   0:   8b 10                     mov    (%eax),%edx   <=====
Code;  c02018ea <radix_tree_delete+4e/c8>
   2:   85 d2                     test   %edx,%edx
Code;  c02018ec <radix_tree_delete+50/c8>
   4:   74 6a                     je     70 <_EIP+0x70> c0201958 <radix_tree_delete+bc/c8>
Code;  c02018ee <radix_tree_delete+52/c8>
   6:   89 f8                     mov    %edi,%eax
Code;  c02018f0 <radix_tree_delete+54/c8>
   8:   89 f1                     mov    %esi,%ecx
Code;  c02018f2 <radix_tree_delete+56/c8>
   a:   d3 e8                     shr    %cl,%eax
Code;  c02018f4 <radix_tree_delete+58/c8>
   c:   83 e0 3f                  and    $0x3f,%eax
Code;  c02018f6 <radix_tree_delete+5a/c8>
   f:   8d 44 82 04               lea    0x4(%edx,%eax,4),%eax
Code;  c02018fa <radix_tree_delete+5e/c8>
  13:   89 00                     mov    %eax,(%eax)


1 warning and 1 error issued.  Results may not be reliable.


Copyright © 2003, Eklektix, Inc.
Comments and public postings are copyrighted by their creators.
Linux is a registered trademark of Linus Torvalds