|
|
Subscribe / Log in / New account

2.5.46 AIO support for raw/O_DIRECT

From:  Badari Pulavarty <pbadari@us.ibm.com>
To:  linux-aio@kvack.org, linux-kernel@vger.kernel.org (lkml)
Subject:  [PATCH 2/2] 2.5.46 AIO support for raw/O_DIRECT
Date:  Tue, 5 Nov 2002 17:03:36 -0800 (PST)
Cc:  akpm@digeo.com, bcrl@redhat.com, pbadari@us.ibm.com (Badari Pulavarty)

Hi,

This is part 2/2 of the 2.5.46 patch to support AIO for raw/O_DIRECT.

This patch adds AIO support to the DIO code path. It also includes
a workaround for the problem of calling set_page_dirty() from
interrupt context.

Andrew, could you please check whether I implemented the "set_page_dirty()"
hack you suggested correctly, and in the right place?

Ben, could you pick up these patches and push them to Linus?

NOTE: You need part 1/2 to use this patch.

Thanks,
Badari

diff -Naur -X dontdiff linux-2.5.46/fs/direct-io.c linux-2.5.46.aio/fs/direct-io.c
--- linux-2.5.46/fs/direct-io.c	Tue Nov  5 16:01:18 2002
+++ linux-2.5.46.aio/fs/direct-io.c	Tue Nov  5 14:57:36 2002
@@ -13,6 +13,7 @@
 #include <linux/types.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/bio.h>
@@ -100,6 +101,11 @@
 	spinlock_t bio_list_lock;	/* protects bio_list */
 	struct bio *bio_list;		/* singly linked via bi_private */
 	struct task_struct *waiter;	/* waiting task (NULL if none) */
+
+	/* AIO related stuff */
+	struct kiocb *iocb;		/* kiocb */
+	int is_async;			/* is IO async ? */
+	int result;			/* IO result */
 };
 
 /*
@@ -176,6 +182,43 @@
 	return dio->pages[dio->head++];
 }
 
+static void dio_bio_count(struct dio *dio)
+{
+	if (atomic_dec_and_test(&dio->bio_count)) {
+		if(dio->is_async) {
+			aio_complete(dio->iocb, dio->result, 0);
+			kfree(dio);
+		}
+	}
+}
+
+static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error)
+{
+	struct dio *dio = bio->bi_private;
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec;
+	int page_no;
+
+	if (bio->bi_size)
+		return 1;
+
+	for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
+		struct page *page = bvec[page_no].bv_page;
+
+		if (dio->rw == READ) {
+			SetPageDirty(page);
+			SetPageWrongList(page);
+		}
+		page_cache_release(page);
+	}
+	if (!uptodate) 
+		dio->result = -EIO;
+
+	dio_bio_count(dio);
+	bio_put(bio);
+	return 0;
+}
+
 /*
  * The BIO completion handler simply queues the BIO up for the process-context
  * handler.
@@ -212,7 +255,10 @@
 
 	bio->bi_bdev = bdev;
 	bio->bi_sector = first_sector;
-	bio->bi_end_io = dio_bio_end_io;
+	if (dio->is_async)
+		bio->bi_end_io = dio_bio_end_aio;
+	else
+		bio->bi_end_io = dio_bio_end_io;
 
 	dio->bio = bio;
 	return 0;
@@ -745,73 +791,84 @@
 }
 
 static int
-direct_io_worker(int rw, struct inode *inode, const struct iovec *iov, 
-	loff_t offset, unsigned long nr_segs, unsigned blkbits,
-	get_blocks_t get_blocks)
+direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
+	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
+	unsigned blkbits, get_blocks_t get_blocks)
 {
 	unsigned long user_addr; 
 	int seg, ret2, ret = 0;
-	struct dio dio;
-	size_t bytes, tot_bytes = 0;
+	struct dio local_dio, *dio;
+	size_t bytes;
 
-	dio.bio = NULL;
-	dio.inode = inode;
-	dio.rw = rw;
-	dio.blkbits = blkbits;
-	dio.blkfactor = inode->i_blkbits - blkbits;
-	dio.start_zero_done = 0;
-	dio.block_in_file = offset >> blkbits;
-	dio.blocks_available = 0;
-
-	dio.cur_page = NULL;
-
-	dio.boundary = 0;
-	dio.reap_counter = 0;
-	dio.get_blocks = get_blocks;
-	dio.final_block_in_bio = -1;
-	dio.next_block_for_io = -1;
+	if (is_sync_kiocb(iocb)) {
+		dio = &local_dio;
+		dio->is_async = 0;
+	} else {
+		dio = (struct dio *)kmalloc(sizeof(struct dio), GFP_KERNEL);
+		if (!dio)
+			return -ENOMEM;
+		dio->is_async = 1;
+	}
+	dio->bio = NULL;
+	dio->inode = inode;
+	dio->rw = rw;
+	dio->blkbits = blkbits;
+	dio->blkfactor = inode->i_blkbits - blkbits;
+	dio->start_zero_done = 0;
+	dio->block_in_file = offset >> blkbits;
+	dio->blocks_available = 0;
 
-	dio.page_errors = 0;
+	dio->cur_page = NULL;
+
+	dio->boundary = 0;
+	dio->reap_counter = 0;
+	dio->get_blocks = get_blocks;
+	dio->final_block_in_bio = -1;
+	dio->next_block_for_io = -1;
+
+	dio->page_errors = 0;
+	dio->result = 0;
+	dio->iocb = iocb;
 
 	/* BIO completion state */
-	atomic_set(&dio.bio_count, 0);
-	spin_lock_init(&dio.bio_list_lock);
-	dio.bio_list = NULL;
-	dio.waiter = NULL;
-	dio.pages_in_io = 0;
+	atomic_set(&dio->bio_count, 1);
+	spin_lock_init(&dio->bio_list_lock);
+	dio->bio_list = NULL;
+	dio->waiter = NULL;
+	dio->pages_in_io = 0;
 
 	for (seg = 0; seg < nr_segs; seg++) 
-		dio.pages_in_io += (iov[seg].iov_len >> blkbits) + 2; 
+		dio->pages_in_io += (iov[seg].iov_len >> blkbits) + 2; 
 
 	for (seg = 0; seg < nr_segs; seg++) {
 		user_addr = (unsigned long)iov[seg].iov_base;
 		bytes = iov[seg].iov_len;
 
 		/* Index into the first page of the first block */
-		dio.first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
-		dio.final_block_in_request = dio.block_in_file + (bytes >> blkbits);
+		dio->first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
+		dio->final_block_in_request = dio->block_in_file + (bytes >> blkbits);
 		/* Page fetching state */
-		dio.head = 0;
-		dio.tail = 0;
-		dio.curr_page = 0;
+		dio->head = 0;
+		dio->tail = 0;
+		dio->curr_page = 0;
 
-		dio.total_pages = 0;
+		dio->total_pages = 0;
 		if (user_addr & (PAGE_SIZE-1)) {
-			dio.total_pages++;
+			dio->total_pages++;
 			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
 		}
-		dio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
-		dio.curr_user_address = user_addr;
+		dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+		dio->curr_user_address = user_addr;
 	
-		ret = do_direct_IO(&dio);
+		ret = do_direct_IO(dio);
 
 		if (ret) {
-			dio_cleanup(&dio);
+			dio_cleanup(dio);
 			break;
 		}
 
-		tot_bytes += iov[seg].iov_len - ((dio.final_block_in_request -
-					dio.block_in_file) << blkbits);
+		dio->result += iov[seg].iov_len - ((dio->final_block_in_request -
+					dio->block_in_file) << blkbits);
 
 	} /* end iovec loop */
 
@@ -819,22 +876,32 @@
 	 * There may be some unwritten disk at the end of a part-written
 	 * fs-block-sized block.  Go zero that now.
 	 */
-	dio_zero_block(&dio, 1);
+	dio_zero_block(dio, 1);
 
-	if (dio.cur_page) {
-		ret2 = dio_send_cur_page(&dio);
-		page_cache_release(dio.cur_page);
+	if (dio->cur_page) {
+		ret2 = dio_send_cur_page(dio);
+		page_cache_release(dio->cur_page);
 		if (ret == 0)
 			ret = ret2;
 	}
-	ret2 = dio_await_completion(&dio);
+
+	if (dio->is_async) {
+		dio_bio_count(dio);
+		if (ret == 0)
+			ret = -EIOCBQUEUED;
+		goto out;
+	}
+
+	dio_bio_count(dio);
+	ret2 = dio_await_completion(dio);
 	if (ret == 0)
 		ret = ret2;
 	if (ret == 0)
-		ret = dio.page_errors;
+		ret = dio->page_errors;
 	if (ret == 0)
-		ret = tot_bytes; 
+		ret = dio->result; 
 
+out:
 	return ret;
 }
 
@@ -878,7 +945,7 @@
 		}
 	}
 
-	retval = direct_io_worker(rw, inode, iov, offset, 
+	retval = direct_io_worker(rw, iocb, inode, iov, offset, 
 				nr_segs, blkbits, get_blocks);
 out:
 	return retval;
diff -Naur -X dontdiff linux-2.5.46/include/linux/page-flags.h linux-2.5.46.aio/include/linux/page-flags.h
--- linux-2.5.46/include/linux/page-flags.h	Mon Nov  4 14:30:37 2002
+++ linux-2.5.46.aio/include/linux/page-flags.h	Tue Nov  5 14:56:44 2002
@@ -70,6 +70,7 @@
 #define PG_chainlock		15	/* lock bit for ->pte_chain */
 
 #define PG_direct		16	/* ->pte_chain points directly at pte */
+#define PG_wronglist		17	/* page is on wrong list */
 
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
@@ -233,6 +234,10 @@
 #define ClearPageDirect(page)		clear_bit(PG_direct, &(page)->flags)
 #define TestClearPageDirect(page)	test_and_clear_bit(PG_direct, &(page)->flags)
 
+#define SetPageWrongList(page)	set_bit(PG_wronglist, &(page)->flags)
+#define PageWrongList(page)	test_bit(PG_wronglist, &(page)->flags)
+#define ClearPageWrongList(page)	clear_bit(PG_wronglist, &(page)->flags)
+
 /*
  * The PageSwapCache predicate doesn't use a PG_flag at this time,
  * but it may again do so one day.
diff -Naur -X dontdiff linux-2.5.46/mm/vmscan.c linux-2.5.46.aio/mm/vmscan.c
--- linux-2.5.46/mm/vmscan.c	Mon Nov  4 14:30:07 2002
+++ linux-2.5.46.aio/mm/vmscan.c	Tue Nov  5 14:58:56 2002
@@ -378,6 +378,12 @@
 			goto keep_locked;
 		}
 
+		if (PageWrongList(page)) {
+			if (TestClearPageDirty(page))
+				set_page_dirty(page);
+			ClearPageWrongList(page);
+		}
+
 #ifdef CONFIG_SWAP
 		if (PageSwapCache(page)) {
 			swp_entry_t swap = { .val = page->index };

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Copyright © 2002, Eklektix, Inc.
Comments and public postings are copyrighted by their creators.
Linux is a registered trademark of Linus Torvalds