| From: |
| Jan Kara <jack@suse.cz> |
| To: |
| linux-fsdevel@vger.kernel.org |
| Subject: |
| [RFC] Ext3 online defrag |
| Date: |
| Mon, 23 Oct 2006 14:27:10 +0200 |
| Archive-link: |
| Article,
Thread
|
Hello,
I've written a simple patch implementing an ext3 ioctl for file
relocation. Basically, you call the ioctl on a file, give it a list of
blocks, and it relocates the file into the given blocks (provided they are
still free). The idea is to use it as the kernel part of an ext3 online
defragmenter (or, more generally, a disk access optimizer). I don't yet
have the user space part that finds larger runs of free blocks and so on,
which is needed before it can really be used as a defragmenter. I'm just
sending this as a kind of proof-of-concept to hear some comments. Also
attached is a simple program that demonstrates the use of the ioctl.
Thanks for suggestions/comments in advance.
Honza
--
Jan Kara <jack@suse.cz>
SuSE CR Labs
Implement an ext3 ioctl for relocating a file into a given set of blocks. The
function also allocates those blocks (provided they are still free). We simply
build a new indirect tree in the given blocks, copy the data to it, and at the
end swap the block pointers stored in the inode.
Signed-off-by: Jan Kara <jack@suse.cz>
diff -rupX /home/jack/.kerndiffexclude linux-2.6.18/fs/ext3/inode.c linux-2.6.18-1-defragment-ext3/fs/ext3/inode.c
--- linux-2.6.18/fs/ext3/inode.c 2006-09-27 13:08:35.000000000 +0200
+++ linux-2.6.18-1-defragment-ext3/fs/ext3/inode.c 2006-10-20 17:54:57.000000000 +0200
@@ -3219,3 +3219,360 @@ int ext3_change_inode_journal_flag(struc
return err;
}
+
+/*
+ * Make sure @ext still has blocks to hand out, refilling it from the
+ * user-supplied array when the current extent has been used up.
+ *
+ * @ext:      kernel copy of the extent currently being consumed
+ * @ext_user: user-space array of extents
+ * @act_ext:  index of the current extent in @ext_user; advanced on refill
+ * @extents:  number of entries in @ext_user
+ *
+ * Returns 0 when @ext has blocks left, -ENOSPC when the array is exhausted,
+ * -EFAULT when the next entry cannot be copied from user space and -EINVAL
+ * when an extent does not have a positive length.
+ */
+static inline int get_next_reloc_extent(struct ext3_reloc_extent *ext,
+	struct ext3_reloc_extent __user *ext_user, int *act_ext, int extents)
+{
+	/*
+	 * The length is a signed value that comes from user space. Consuming
+	 * an extent never takes it below zero, so a negative length here can
+	 * only be bogus input.
+	 */
+	if (ext->len < 0)
+		return -EINVAL;
+	/* Still some blocks in the current extent? */
+	if (ext->len)
+		return 0;
+	/* Not enough extents? */
+	if (++(*act_ext) >= extents)
+		return -ENOSPC;
+	if (copy_from_user(ext, ext_user+*act_ext, sizeof(*ext)))
+		return -EFAULT;
+	/* Invalid extent? Zero and negative lengths are both unusable. */
+	if (ext->len <= 0)
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * Allocate up to *@blocks blocks from the front of the extent @ext.
+ *
+ * The request is first trimmed to what is left in @ext. The allocation is
+ * accepted only when it starts exactly at @ext->start; in that case @ext is
+ * advanced past the allocated blocks and the first block is returned.
+ *
+ * Returns 0 on failure. When the allocator hands back a different block
+ * than requested, that allocation is released and *@err is set to -ENOSPC.
+ * On return *@blocks holds the count that was passed to the allocator.
+ */
+static ext3_fsblk_t alloc_reloc_extent(handle_t *handle, struct inode *inode,
+	struct ext3_reloc_extent *ext, unsigned long *blocks, int *err)
+{
+	ext3_fsblk_t first;
+
+	/* Never ask for more than the extent still offers */
+	if (*blocks > ext->len)
+		*blocks = ext->len;
+
+	first = ext3_new_blocks(handle, inode, ext->start, blocks, err);
+	if (!first)
+		return 0;
+
+	if (first == ext->start) {
+		/* Got exactly the wanted blocks: consume them from the extent */
+		ext->start += *blocks;
+		ext->len -= *blocks;
+		return first;
+	}
+
+	/* The wanted block was not free; give back what we got instead */
+	ext3_free_blocks(handle, inode, first, *blocks);
+	*err = -ENOSPC;
+	return 0;
+}
+
+/*
+ * Build the relocated counterpart of one indirect block and of everything
+ * below it.
+ *
+ * @inode:    inode the new blocks are allocated for
+ * @depth:    0 when the entries of the old block point to data blocks,
+ *            otherwise the number of indirect levels still below it
+ * @pos:      file block index covered by the first entry of the old block
+ * @ext, @ext_user, @act_ext, @extents: state of the user-supplied extent
+ *            list, as consumed by get_next_reloc_extent()
+ * @oblk:     on-disk number of the old indirect block; 0 means there is
+ *            nothing to relocate and *@nblk is set to 0
+ * @nblk:     slot that receives the number of the new indirect block
+ * @nblk_bh:  buffer containing @nblk, or NULL when the slot is not in a
+ *            buffer (the inode is marked dirty instead)
+ *
+ * Only block pointers are set up here; no file data is copied.
+ * Returns 0 on success or the error of the step that failed.
+ */
+static int reloc_tree(struct inode *inode, int depth, loff_t pos,
+ struct ext3_reloc_extent *ext, struct ext3_reloc_extent __user *ext_user,
+ int *act_ext, int extents, __le32 oblk, __le32 *nblk,
+ struct buffer_head *nblk_bh)
+{
+ struct buffer_head *obh, *nbh = NULL;
+ handle_t *handle;
+ ext3_fsblk_t newblock;
+ unsigned long count = 1;
+ int ret, i, j;
+ /* File size rounded up to whole blocks; entries at or past it are skipped */
+ loff_t blocks = (i_size_read(inode) + inode->i_sb->s_blocksize-1) >>
+ inode->i_sb->s_blocksize_bits;
+
+ if (!oblk) {
+ *nblk = 0;
+ return 0;
+ }
+ obh = sb_bread(inode->i_sb, le32_to_cpu(oblk));
+ if (!obh)
+ return -EIO;
+ ret = get_next_reloc_extent(ext, ext_user, act_ext, extents);
+ if (ret < 0)
+ goto out_bh;
+ /* We modify nblk, bitmap, sb, descriptor, buffer */
+ /* NOTE(review): five items are listed above but 4 credits are requested - confirm */
+ handle = ext3_journal_start(inode, 4);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_bh;
+ }
+ /* First we allocate indirect block */
+ newblock = alloc_reloc_extent(handle, inode, ext, &count, &ret);
+ if (!newblock)
+ goto out_trans;
+ nbh = sb_getblk(inode->i_sb, newblock);
+ if (!nbh) {
+ ret = -EIO;
+ goto out_alloc;
+ }
+ /* The new indirect block starts out with all pointers zero */
+ lock_buffer(nbh);
+ ret = ext3_journal_get_create_access(handle, nbh);
+ if (ret) {
+ unlock_buffer(nbh);
+ goto out_alloc;
+ }
+ memset(nbh->b_data, 0, nbh->b_size);
+ set_buffer_uptodate(nbh);
+ unlock_buffer(nbh);
+ /* Publish the new block in the parent slot and dirty whatever holds it */
+ *nblk = cpu_to_le32(newblock);
+ if (nblk_bh)
+ ret = ext3_journal_dirty_metadata(handle, nblk_bh);
+ else
+ ret = ext3_mark_inode_dirty(handle, inode);
+ if (ret) {
+ *nblk = 0;
+ goto out_alloc;
+ }
+ ext3_journal_stop(handle);
+
+ printk("Indirect block allocated.\n");
+ /* Now it's time to allocate further data/indirect blocks */
+ if (depth) {
+ /*
+ * Walk the old block entry by entry and recurse into every child that
+ * is present. pos advances by 2^(depth * address bits per block) file
+ * blocks for each entry.
+ * NOTE(review): when a child fails we leave through out_bh, so blocks
+ * allocated so far are not freed here - presumably cleaned up together
+ * with the inode by the caller; confirm.
+ */
+ for (i = 0; i < EXT3_ADDR_PER_BLOCK(inode->i_sb) &&
+ pos < blocks; i++, pos +=
+ 1 << depth*EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb)) {
+ if (!((__le32 *)obh->b_data)[i])
+ continue;
+ ret = reloc_tree(inode, depth-1, pos, ext, ext_user,
+ act_ext, extents, ((__le32 *)obh->b_data)[i],
+ ((__le32 *)nbh->b_data)+i, nbh);
+ if (ret)
+ goto out_bh;
+ }
+ } else {
+ /* Leaf level: allocate a new data block for every mapped entry */
+ for (i = 0; i < EXT3_ADDR_PER_BLOCK(inode->i_sb) &&
+ pos < blocks; i++, pos++) {
+ if (!((__le32 *)obh->b_data)[i])
+ continue;
+ ret = get_next_reloc_extent(ext, ext_user, act_ext,
+ extents);
+ if (ret < 0)
+ goto out_bh;
+ /* We modify sb+descriptor+bitmap+nbh */
+ handle = ext3_journal_start(inode, 4);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_bh;
+ }
+ /* Compute size of continuous extent we may have */
+ /* i.e. the run of mapped entries starting at i, bounded by block and file end */
+ for (count = 0;
+ i+count < EXT3_ADDR_PER_BLOCK(inode->i_sb) &&
+ pos+count < blocks &&
+ ((__le32 *)obh->b_data)[i+count];
+ count++);
+ printk("Going to allocated extent of lenght %lu\n", count);
+ /* Allocate extent and store block pointers */
+ /* alloc_reloc_extent() may lower count to what the current extent still holds */
+ newblock = alloc_reloc_extent(handle, inode, ext,
+ &count, &ret);
+ printk("Got extent from block %lu, lenght %lu\n", newblock, count);
+ if (!newblock)
+ goto out_trans;
+ for (j = 0; j < count; j++)
+ ((__le32 *)nbh->b_data)[i+j] =
+ cpu_to_le32(newblock+j);
+ printk("Pointers stored, going to dirty metadata.\n");
+ ret = ext3_journal_dirty_metadata(handle, nbh);
+ if (ret) {
+ /* Undo the pointers just written before freeing the blocks */
+ memset(((__le32 *)nbh->b_data)+i, 0,
+ sizeof(__le32)*count);
+ goto out_alloc;
+ }
+ ext3_journal_stop(handle);
+ /* Skip the entries just handled; the loop header adds the last step */
+ i += count-1;
+ pos += count-1;
+ }
+ }
+ ret = 0;
+ goto out_bh;
+ /* Error unwinding: free the last allocation, close the handle, drop buffers */
+out_alloc:
+ ext3_free_blocks(handle, inode, newblock, count);
+out_trans:
+ ext3_journal_stop(handle);
+out_bh:
+ brelse(obh);
+ if (nbh)
+ brelse(nbh);
+ return ret;
+}
+
+/*
+ * Move file into new blocks
+ *
+ * @inode:    file whose blocks are relocated
+ * @extents:  number of entries in @ext_user
+ * @ext_user: user-space array of target extents
+ *
+ * A temporary inode without links is created and put on the orphan list.
+ * New direct blocks and new indirect trees are allocated for it from the
+ * given extents, the file data is copied over block by block, and finally
+ * the i_data arrays of the two inodes are swapped so that @inode refers to
+ * the new blocks. The reference to the temporary inode is dropped on exit.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int ext3_move_file_blocks(struct inode *inode, int extents,
+		struct ext3_reloc_extent __user *ext_user)
+{
+	int ret, act_ext = 0, j;
+	unsigned long count = 1;
+	struct ext3_reloc_extent ext;
+	struct inode *tmp_inode = NULL;
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	struct ext3_inode_info *tmp_ei;
+	struct super_block *sb = inode->i_sb;
+	loff_t blocks = (i_size_read(inode) + sb->s_blocksize-1) >>
+			sb->s_blocksize_bits;
+	loff_t i;
+	handle_t *handle;
+	__le32 tmp_i_data[EXT3_N_BLOCKS];
+
+	/* ext_user[0] is read below, so the array must not be empty */
+	if (extents <= 0)
+		return -EINVAL;
+
+	/* FIXME: Maybe rewrite like in the style of direct IO? */
+	if (inode->i_state & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+		struct writeback_control wbc = {
+			/* We don't need to wait for data, but we need to wait for I_LOCK */
+			.sync_mode = WB_SYNC_ALL,
+			.nr_to_write = 0,
+		};
+		ret = sync_inode(inode, &wbc);
+		if (ret)
+			return ret;
+	}
+	if (copy_from_user(&ext, ext_user, sizeof(ext)))
+		return -EFAULT;
+	/* The length is signed: zero and negative values are both unusable */
+	if (ext.len <= 0)
+		return -EINVAL;
+
+	/* We modify sb+inode+bitmap+descriptor */
+	handle = ext3_journal_start(inode, 4);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	tmp_inode = ext3_new_inode(handle, sb->s_root->d_inode, S_IFREG);
+	if (IS_ERR(tmp_inode)) {
+		/* Inode allocation failed; only the handle has to be closed */
+		ret = PTR_ERR(tmp_inode);
+		ext3_journal_stop(handle);
+		return ret;
+	}
+	i_size_write(tmp_inode, i_size_read(inode));
+	tmp_inode->i_nlink = 0;
+	/* Add inode to the orphan list in case we crash so that replay
+	 * takes care of it */
+	ret = ext3_orphan_add(handle, tmp_inode);
+	ext3_journal_stop(handle);
+	if (ret) {
+		/* Do not allocate blocks to an inode that is not on the list */
+		iput(tmp_inode);
+		return ret;
+	}
+
+	tmp_ei = EXT3_I(tmp_inode);
+	mutex_lock(&inode->i_mutex);
+	/* Direct blocks: one new block for every mapped slot */
+	for (i = 0; i < EXT3_NDIR_BLOCKS && i < blocks; i++) {
+		if (ei->i_data[i] == 0) {
+			tmp_ei->i_data[i] = 0;
+			continue;
+		}
+		ret = get_next_reloc_extent(&ext, ext_user, &act_ext, extents);
+		if (ret < 0)
+			goto out;
+		/* We modify inode, bitmap, sb, descriptor */
+		handle = ext3_journal_start(inode, 4);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+		tmp_ei->i_data[i] = cpu_to_le32(
+			alloc_reloc_extent(handle, tmp_inode, &ext, &count, &ret));
+		ext3_mark_inode_dirty(handle, tmp_inode);
+		ext3_journal_stop(handle);
+		/* A zero block means the allocation failed; ret holds the error */
+		if (!tmp_ei->i_data[i])
+			goto out;
+	}
+	if (i >= blocks)
+		goto copy_data;
+
+	/* Single, double and triple indirect trees, as far as the file reaches */
+	ret = reloc_tree(tmp_inode, 0, i, &ext, ext_user, &act_ext, extents,
+		ei->i_data[EXT3_IND_BLOCK], tmp_ei->i_data+EXT3_IND_BLOCK,
+		NULL);
+	if (ret < 0)
+		goto out;
+	i += EXT3_ADDR_PER_BLOCK(sb);
+	if (blocks <= i)
+		goto copy_data;
+	ret = reloc_tree(tmp_inode, 1, i, &ext, ext_user, &act_ext, extents,
+		ei->i_data[EXT3_DIND_BLOCK], tmp_ei->i_data+EXT3_DIND_BLOCK,
+		NULL);
+	if (ret < 0)
+		goto out;
+	i += 1 << 2*EXT3_ADDR_PER_BLOCK_BITS(sb);
+	if (blocks <= i)
+		goto copy_data;
+	ret = reloc_tree(tmp_inode, 2, i, &ext, ext_user, &act_ext, extents,
+		ei->i_data[EXT3_TIND_BLOCK], tmp_ei->i_data+EXT3_TIND_BLOCK,
+		NULL);
+	if (ret < 0)
+		goto out;
+copy_data:
+	/* Currently simple, later we may do something more clever */
+	for (i = 0; i < blocks; i += EXT3_MAX_TRANS_DATA) {
+		struct buffer_head *in_bh[EXT3_MAX_TRANS_DATA];
+		struct buffer_head *out_bh[EXT3_MAX_TRANS_DATA];
+
+		/* Prepare all buffers for copying */
+		count = min(EXT3_MAX_TRANS_DATA, (unsigned)(blocks-i));
+		for (j = 0; j < count; j++) {
+			in_bh[j] = ext3_bread(NULL, inode, i+j, 0, &ret);
+			if (!in_bh[j]) {
+				/*
+				 * NOTE(review): if ext3_bread() can return
+				 * NULL with ret == 0 (e.g. for a hole), we
+				 * report success without having relocated
+				 * the file - confirm.
+				 */
+				journal_brelse_array(in_bh, j);
+				goto out;
+			}
+		}
+		for (j = 0; j < count; j++) {
+			out_bh[j] = ext3_getblk(NULL, tmp_inode, i+j, 0, &ret);
+			if (!out_bh[j]) {
+				journal_brelse_array(out_bh, j);
+				journal_brelse_array(in_bh, count);
+				goto out;
+			}
+		}
+		/* Copy data */
+		if (ext3_should_journal_data(inode))
+			handle = ext3_journal_start(inode, count);
+		else
+			/* No metadata, just data => no credits needed */
+			handle = ext3_journal_start(inode, 1);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			/* Both buffer arrays are fully populated here */
+			journal_brelse_array(in_bh, count);
+			journal_brelse_array(out_bh, count);
+			goto out;
+		}
+		for (j = 0; j < count; j++) {
+			if (ext3_should_journal_data(inode)) {
+				ret = ext3_journal_get_write_access(handle,
+					out_bh[j]);
+				if (ret)
+					goto release_buffers;
+			}
+			lock_buffer(out_bh[j]);
+			memcpy(out_bh[j]->b_data, in_bh[j]->b_data,
+				out_bh[j]->b_size);
+			set_buffer_uptodate(out_bh[j]);
+			unlock_buffer(out_bh[j]);
+			ret = 0;
+			if (ext3_should_journal_data(inode)) {
+				ret = ext3_journal_dirty_metadata(handle,
+					out_bh[j]);
+			}
+			else {
+				if (ext3_should_order_data(inode))
+					ret = ext3_journal_dirty_data(handle,
+						out_bh[j]);
+				mark_buffer_dirty(out_bh[j]);
+			}
+			if (ret)
+				goto release_buffers;
+
+		}
+release_buffers:
+		ext3_journal_stop(handle);
+		journal_brelse_array(in_bh, count);
+		journal_brelse_array(out_bh, count);
+		if (ret)
+			goto out;
+	}
+
+	/* Need to modify 2 inodes + superblock */
+	handle = ext3_journal_start(inode, 3);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+	/* Finally file is duplicated. Just swap blocks... */
+	memcpy(tmp_i_data, ei->i_data, sizeof(__le32)*EXT3_N_BLOCKS);
+	memcpy(ei->i_data, tmp_ei->i_data, sizeof(__le32)*EXT3_N_BLOCKS);
+	memcpy(tmp_ei->i_data, tmp_i_data, sizeof(__le32)*EXT3_N_BLOCKS);
+	ext3_mark_inode_dirty(handle, inode);
+	ext3_mark_inode_dirty(handle, tmp_inode);
+	ext3_orphan_del(handle, tmp_inode);
+	ext3_journal_stop(handle);
+	/* We know that there are no writers and all data has been written */
+	ret = invalidate_inode_pages2(inode->i_mapping);
+	if (ret) {
+		printk(KERN_WARNING "Cannot invalidate inode pages in ext3_move_file_blocks!\n");
+		goto out;
+	}
+
+out:
+	mutex_unlock(&inode->i_mutex);
+	if (tmp_inode)
+		iput(tmp_inode);
+	return ret;
+}
diff -rupX /home/jack/.kerndiffexclude linux-2.6.18/fs/ext3/ioctl.c linux-2.6.18-1-defragment-ext3/fs/ext3/ioctl.c
--- linux-2.6.18/fs/ext3/ioctl.c 2006-09-27 13:08:35.000000000 +0200
+++ linux-2.6.18-1-defragment-ext3/fs/ext3/ioctl.c 2006-10-20 01:51:59.000000000 +0200
@@ -246,7 +246,26 @@ flags_err:
return err;
}
+ /*
+ * Relocate the file into the blocks listed by the caller. The argument
+ * points to a struct ext3_file_move_data in user space.
+ */
+ case EXT3_IOC_FILE_RELOC: {
+ struct ext3_file_move_data input;
+ int err;
+ /* Only callers with CAP_SYS_RESOURCE may pick blocks */
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+ if (IS_RDONLY(inode))
+ return -EROFS;
+ if (copy_from_user(&input, (struct ext3_file_move_data __user *)arg, sizeof(input)))
+ return -EFAULT;
+ /*
+ * Check that the whole extent array is readable.
+ * NOTE(review): input.extents is a signed int taken from user space and
+ * is not range-checked here - confirm negative values are handled.
+ */
+ if (!access_ok(VERIFY_READ, input.ext_array, input.extents * sizeof(struct ext3_reloc_extent)))
+ return -EFAULT;
+
+ /*
+ * Refuse writers for the duration of the move.
+ * NOTE(review): nothing here restricts the request to regular files -
+ * confirm whether other inode types should be rejected.
+ */
+ err = deny_write_access(filp);
+ if (err)
+ return err;
+ err = ext3_move_file_blocks(inode, input.extents, input.ext_array);
+ allow_write_access(filp);
+ return err;
+ }
default:
return -ENOTTY;
diff -rupX /home/jack/.kerndiffexclude linux-2.6.18/include/linux/ext3_fs.h linux-2.6.18-1-defragment-ext3/include/linux/ext3_fs.h
--- linux-2.6.18/include/linux/ext3_fs.h 2006-09-27 13:09:04.000000000 +0200
+++ linux-2.6.18-1-defragment-ext3/include/linux/ext3_fs.h 2006-10-20 01:51:47.000000000 +0200
@@ -233,6 +233,7 @@ struct ext3_new_group_data {
#endif
#define EXT3_IOC_GETRSVSZ _IOR('f', 5, long)
#define EXT3_IOC_SETRSVSZ _IOW('f', 6, long)
+#define EXT3_IOC_FILE_RELOC _IOR('f', 9, struct ext3_file_move_data)
/*
* Mount options
@@ -598,6 +599,22 @@ static inline int ext3_valid_inum(struct
#define EXT3_DEFM_JMODE_WBACK 0x0060
/*
+ * File relocation structures
+ */
+
+/*
+ * One run of target blocks for relocation, copied in from user space.
+ * NOTE(review): user space has to declare this with identical field
+ * widths - confirm against the callers of the ioctl.
+ */
+struct ext3_reloc_extent {
+ ext3_fsblk_t start; /* first block of the run not yet handed out */
+ loff_t len; /* blocks left in the run; 0 means used up */
+};
+
+/* Argument of EXT3_IOC_FILE_RELOC */
+struct ext3_file_move_data {
+ int extents; /* number of entries in ext_array */
+ struct ext3_reloc_extent __user *ext_array; /* extents to relocate into */
+};
+
+/* Relocate the blocks of @inode into the @extents extents listed at @ext_user */
+int ext3_move_file_blocks(struct inode *inode, int extents, struct ext3_reloc_extent __user *ext_user);
+
+/*
* Structure of a directory entry
*/
#define EXT3_NAME_LEN 255
#include <stdio.h>
#include <sys/ioctl.h>
#include <asm/ioctl.h>
#include <fcntl.h>
#include <string.h>
#include <sys/stat.h>
#include <linux/fs.h>
#include <unistd.h>
#define BLOCKSIZE 1024
#define BLOCKS 1024
#define FILENAME "reloc_test_file"
#define BUFSIZE 64
/*
 * User-space mirror of the extent structure read by the relocation ioctl.
 * The kernel side declares start as ext3_fsblk_t, which the patch prints
 * with %lu, so it is declared unsigned long here to keep both layouts
 * identical on 32-bit and 64-bit builds.
 */
struct ext3_reloc_extent {
	unsigned long start;	/* first block of the extent */
	loff_t len;		/* number of blocks in the extent */
};
/* Argument block passed to the relocation ioctl */
struct ext3_file_move_data {
int extents; /* number of entries in ext_array */
struct ext3_reloc_extent *ext_array; /* extents the file should move into */
};
/*
 * Demonstrate the ext3 relocation ioctl.
 *
 * A scratch file of BLOCKS blocks is written and immediately unlinked. Its
 * block numbers are collected with FIBMAP and merged into extents, and the
 * scratch file is then closed. Finally the file named on the command line
 * is asked to move into those extents.
 *
 * NOTE(review): the block arithmetic assumes the file system block size
 * equals BLOCKSIZE - confirm for the file system under test.
 *
 * Returns 0 on success and 1 on any failure.
 */
int main(int argc, char **argv)
{
	char buf[BUFSIZE*BLOCKSIZE];
	int fd;
	int i, extents;
	unsigned int blockmap[BLOCKS];
	struct ext3_reloc_extent ext[BLOCKS];
	struct ext3_file_move_data data;

	/* Check the arguments before touching the file system */
	if (argc != 2) {
		puts("Usage: ext3_reloc_test filename");
		return 1;
	}

	/*
	 * The kernel copies these structures in whole, padding included, so
	 * do not hand it uninitialised stack contents.
	 */
	memset(ext, 0, sizeof(ext));
	memset(&data, 0, sizeof(data));

	fd = open(FILENAME, O_CREAT | O_TRUNC | O_WRONLY, S_IRWXU);
	if (fd < 0) {
		perror("Cannot open file for block allocation");
		return 1;
	}
	/* The name is not needed any more; the open descriptor keeps the file */
	unlink(FILENAME);

	memset(buf, 1, sizeof(buf));
	for (i = 0; i < BLOCKS; i += BUFSIZE)
		if (write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf)) {
			perror("Cannot write data");
			return 1;
		}

	/* Map every block and merge consecutive block numbers into extents */
	for (i = 0, extents = 0; i < BLOCKS; i++) {
		/* FIBMAP takes the file block index and returns the disk block */
		blockmap[i] = i;
		if (ioctl(fd, FIBMAP, &(blockmap[i])) < 0) {
			perror("ioctl");
			return 1;
		}
		/* Block 0 means "not mapped" and must never become a target */
		if (!blockmap[i]) {
			fputs("FIBMAP returned an unmapped block\n", stderr);
			return 1;
		}
		if (!i)
			ext[0].start = blockmap[0];
		else if (blockmap[i-1]+1 != blockmap[i]) {
			/* Discontinuity: close the current extent, open a new one */
			ext[extents].len = blockmap[i-1]-ext[extents].start+1;
			extents++;
			ext[extents].start = blockmap[i];
		}
	}
	/* Close the last extent */
	ext[extents].len = blockmap[BLOCKS-1]-ext[extents].start+1;
	extents++;

	/* Drop the scratch file so that its blocks can be reused */
	close(fd);
	sync();

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("Cannot open file to relocate");
		return 1;
	}
	data.extents = extents;
	data.ext_array = ext;
	if (ioctl(fd, _IOR('f', 9, struct ext3_file_move_data), &data) < 0) {
		perror("Move ioctl");
		return 1;
	}
	close(fd);
	return 0;
}