diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ed11b70 --- /dev/null +++ b/.gitignore @@ -0,0 +1,97 @@ +# specific files +mkfs.chunkfs +write_pattern + +# +# NOTE! Don't add files that are generated in specific +# subdirectories here. Add them in the ".gitignore" file +# in that subdirectory instead. +# +# NOTE! Please use 'git ls-files -i --exclude-standard' +# command after changing this file, to see if there are +# any tracked files which get ignored after the change. +# +# Normal rules +# +.* +*.o +*.o.* +*.a +*.s +*.ko +*.so +*.so.dbg +*.mod.c +*.i +*.lst +*.symtypes +*.order +modules.builtin +*.elf +*.bin +*.gz +*.bz2 +*.lzma +*.xz +*.lzo +*.patch +*.gcno + +# +# Top-level generic files +# +/tags +/TAGS +/linux +/vmlinux +/vmlinuz +/System.map +/Module.markers +/Module.symvers + +# +# Debian directory (make deb-pkg) +# +/debian/ + +# +# git files that we don't want to ignore even it they are dot-files +# +!.gitignore +!.mailmap + +# +# Generated include files +# +include/config +include/generated +arch/*/include/generated + +# stgit generated dirs +patches-* + +# quilt's files +patches +series + +# cscope files +cscope.* +ncscope.* + +# gnu global files +GPATH +GRTAGS +GSYMS +GTAGS + +*.orig +*~ +\#*# + +# +# Leavings from module signing +# +extra_certificates +signing_key.priv +signing_key.x509 +x509.genkey diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..f0fa677 --- /dev/null +++ b/Makefile @@ -0,0 +1,16 @@ +# +# Makefile for chunkfs. +# + +obj-$(CONFIG_CHUNK_FS) += chunkfs.o + +chunkfs-y := super.o inode.o dir.o file.o namei.o symlink.o cont.o + +# +# Temporarily keep utilities in this dir too. 
+# +HOST_EXTRACFLAGS += -I$(src)/../../include -static + +hostprogs-$(CONFIG_CHUNK_FS) := mkfs.chunkfs mount_chunkfs write_pattern + +always := $(hostprogs-y) $(hostprogs-m) diff --git a/README b/README new file mode 100644 index 0000000..e2bc180 --- /dev/null +++ b/README @@ -0,0 +1,70 @@ +Chunkfs README + +Val Henson + +Summary +------- + +Chunkfs is an experimental local file system designed to quickly +recover from file system corruption. Each file system is divided up +into many small chunks, each of which can be checked and repaired with +very few references to other chunks. In most cases, only a small part +of the file system must be checked and repaired before it can be +brought back online, requiring minutes instead of hours of downtime to +recover from a file system error. + +Status +------ + +Chunkfs development began in February 2007. Growing a file into two +chunks is supported but not much else. + +See the project web site for the current status: + +http://chunkfs.org + +License +------- + +Chunkfs is licensed under the GNU General Public License version 2. +Chunkfs is released as a patch against the Linux kernel, which contains +a copy of the GPLv2. + +Funding +------- + +Development of chunkfs was funded by: + +Intel +EMC Centera +VAH Consulting + +How it works +------------ + +Each file system is divided up into many small chunks. Each chunk is +marked as dirty or clean. Things that make a chunk dirty are +in-progress writes to metadata (creat(), chmod(), extending a file, +etc.), I/O errors reported by the disk, and any data integrity errors +observed by the file system code (bad checksums, wrong magic number, +etc.). At mount time, each dirty chunk is checked with fsck and +repaired if necessary, with limited references to other chunks. Clean +chunks are not checked unless specifically requested. + +The metadata inside a chunk is structured so that nearly all +references are within the chunk only. 
No block pointers or hard links +may cross chunk boundaries. Only one kind of reference crosses the +chunk boundary: inode continuations. When an inode needs to grow out +of a chunk (either its data has outgrown the free space or we want to +link to an inode outside its chunk), we allocate a new inode in the +appropriate chunk and link the two inodes together with forward and +back pointers, creating a continuation. Logically, the two inodes are +parts of a single file or directory. When checking a chunk containing +a continuation inode, the forward and back pointers allow us to +quickly find the relevant information in any other chunks, without +reading all the metadata in the entire chunk. Each chunk also keeps a +bitmap of all inodes with continuations, as in some circumstances all +continuations must be checked. + +For more information, see the documentation section on the project web +site. diff --git a/chunkfs.h b/chunkfs.h new file mode 100644 index 0000000..2fb5806 --- /dev/null +++ b/chunkfs.h @@ -0,0 +1,204 @@ +/* + * Chunkfs header file + * + * (C) 2007-2008 Valerie Henson + * + */ + +#ifndef _LINUX_CHUNKFS_FS_H +#define _LINUX_CHUNKFS_FS_H + +/* XXX Do above _H stuff for other header files */ + +#include +#include +#include + +/* + * NOTE: Most on disk structures need: + * + * Magic number (32 bit is really big but easy) + * Checksum (32 bit for some kind of sanity) + * + * These go first (magic, then checksum) in all on-disk structures so + * that even if we have the type of the structure wrong, we're checking + * the correct spot. + * + * XXX File system generation number should be included - perhaps high + * 16 of magic? + */ + +/* XXX Should have magic, checksum, version, and generation in one struct */ + +/* + * Some useful typedefs to possibly prevent endian mixups. Use c_* + * for on disk, ci_* for in memory. 
+ */
+
+typedef __le64 c_blk_t;
+typedef __u64 ci_blk_t;
+typedef __le64 c_byte_t;
+typedef __u64 ci_byte_t;
+typedef __le64 c_inode_num_t;
+typedef __u64 ci_inode_num_t;
+
+/*
+ * XXX Block size shouldn't have much meaning, and it will probably
+ * vary by chunk. Figure out what Linux VFS thinks this means.
+ *
+ * I think this means that when you use sb_read(), this is the block
+ * size used.
+ */
+
+#define CHUNKFS_BLK_SIZE 4096
+/* log2 of CHUNKFS_BLK_SIZE (1 << 12 == 4096); keep the two in sync. */
+#define CHUNKFS_BLK_BITS 12
+
+/*
+ * Rev me! Lots! Whenever on-disk structures change! Mainly for
+ * development.
+ *
+ * Note that 0 is never acceptable.
+ */
+
+#define CHUNKFS_VERSION 1
+
+/*
+ * XXX On-disk structures probably aren't correctly padded at any
+ * given moment in time.
+ */
+
+/*
+ * Locating a device has two parts. First, we try a cached path name
+ * which is a hint only, since paths may change. Then we check for
+ * the correct UUID; if it is wrong, we go search each device.
+ */
+
+#define CHUNKFS_DEV_PATH_LEN 1024
+
+/* On-disk reference to a device: last known path plus its UUID. */
+struct chunkfs_dev_desc {
+	/*
+	 * The path of the device when last opened. It may have
+	 * changed, therefore it is only a hint.
+	 */
+	char d_hint[CHUNKFS_DEV_PATH_LEN];
+	/* Is this the device we're looking for? */
+	__le64 d_uuid;
+};
+
+/*
+ * Dummy struct to force us to check the "official" position of the
+ * checksum and magic number (at the beginning of the struct).
+ *
+ * XXX Actually use this struct in other structs. Never access
+ * directly.
+ */
+
+struct chunkfs_chkmagic {
+	__le32 x_magic;		/* compared against the expected magic by check_magic() */
+	__le32 x_chksum;	/* written by write_chksum(), tested by check_chksum() */
+};
+
+/* XXX use e2fsprogs/dev uuid and crc32 lib functions */
+/* XXX using __cpu_to_* so userland can share */
+/* #ifdef KERNEL? How does Jeff do it? 
*/
+
+/*
+ * Stamp the checksum field at the front of an on-disk structure.
+ * A real crc32 is not hooked up yet, so a fixed placeholder value is
+ * stored and 'size' is currently unused.
+ */
+static inline void write_chksum(void *buf, unsigned int size)
+{
+	struct chunkfs_chkmagic *hdr = buf;
+
+	/* Intended: hdr->x_chksum = __cpu_to_le32(crc32(buf, size)); */
+	hdr->x_chksum = __cpu_to_le32(0x32323232);
+}
+
+/*
+ * Returns 0 when the stored checksum equals the placeholder written by
+ * write_chksum(), non-zero otherwise. 'size' is currently unused.
+ */
+static inline int check_chksum(void *buf, unsigned int size)
+{
+	struct chunkfs_chkmagic *hdr = buf;
+	__u32 stored = __le32_to_cpu(hdr->x_chksum);
+
+	/* Intended: compare against __cpu_to_le32(crc32(buf, size)) */
+	return (stored != 0x32323232);
+}
+
+/* Returns 0 when the leading magic equals expected_magic, non-zero otherwise. */
+static inline int check_magic(void *buf, __u32 expected_magic)
+{
+	struct chunkfs_chkmagic *hdr = buf;
+	__u32 found = __le32_to_cpu(hdr->x_magic);
+
+	return (found != expected_magic);
+}
+
+/*
+ * Generic check for a piece of metadata just read off disk. Magic
+ * number and checksum are -always- at the same location in all
+ * metadata, so any on-disk structure can be passed in.
+ *
+ * Returns 0 if both match, 1 on a magic mismatch, 2 on a checksum
+ * mismatch. The magic is tested first.
+ */
+static inline int check_metadata(void *buf, unsigned int size, __u32 expected_magic)
+{
+	if (check_magic(buf, expected_magic) != 0)
+		return 1;
+	return check_chksum(buf, size) ? 2 : 0;
+}
+
+#ifdef __KERNEL__
+
+/* dir.c */
+extern struct file_operations chunkfs_dir_fops;
+
+/* inode.c */
+extern struct file_operations chunkfs_file_fops;
+extern struct inode_operations chunkfs_file_iops;
+int chunkfs_new_inode(struct super_block *, struct inode **);
+void chunkfs_start_inode(struct inode *inode, struct inode *client_inode,
+			 u64 chunk_id);
+void chunkfs_read_inode(struct inode *);
+int chunkfs_write_inode(struct inode *, int);
+void chunkfs_copy_up_inode(struct inode *, struct inode *);
+
+/* symlink.c */
+
+extern struct inode_operations chunkfs_symlink_iops;
+
+/* namei.c */
+
+extern struct inode_operations chunkfs_dir_iops;
+extern struct inode_operations chunkfs_special_iops;
+
+struct chunkfs_dlist_node *chunkfs_alloc_dlist_node(struct dentry *);
+void chunkfs_add_dentry(struct dentry *, struct dentry *, struct vfsmount *);
+int chunkfs_init_dentry(struct dentry *);
+void chunkfs_free_dentry(struct dentry *);
+void 
chunkfs_init_nd(struct inode *dir, struct dentry *dentry, + struct dentry *client_dentry, u64 chunk_id); +void chunkfs_copy_up_nd(struct nameidata *nd, struct nameidata *client_nd); +void chunkfs_copy_down_nd(struct nameidata *nd, struct nameidata *client_nd); + +/* file.c */ + +int chunkfs_setattr(struct dentry *dentry, struct iattr *attr); +int chunkfs_permission(struct inode *, int, struct nameidata *); +int chunkfs_open(struct inode *, struct file *); + +struct chunkfs_continuation; + +int chunkfs_open_cont_file(struct file *file, loff_t *ppos, + struct file **client_file, + struct chunkfs_continuation **ret_cont); +void chunkfs_close_cont_file(struct file *file, struct file *client_file, + struct chunkfs_continuation *cont); +void chunkfs_copy_down_file(struct file *file, loff_t *ppos, + struct file *client_file, u64 client_start); + +/* cont.c */ + +int chunkfs_get_next_inode(struct inode *head_inode, + struct inode *prev_inode, struct inode **ret_inode); +int chunkfs_get_cont_at_offset(struct dentry *dentry, loff_t offset, + struct chunkfs_continuation **ret_cont); +int chunkfs_get_next_cont(struct dentry *head_dentry, + struct chunkfs_continuation *prev_cont, + struct chunkfs_continuation **next_cont); +int chunkfs_create_continuation(struct file *file, loff_t *ppos, + struct file **client_file, + struct chunkfs_continuation **ret_cont); +void chunkfs_put_continuation(struct chunkfs_continuation *cont); +int chunkfs_init_cont_data(struct dentry *client_dentry); + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_CHUNKFS_FS_H */ diff --git a/chunkfs_chunk.h b/chunkfs_chunk.h new file mode 100644 index 0000000..8b2b2d0 --- /dev/null +++ b/chunkfs_chunk.h @@ -0,0 +1,85 @@ +/* + * Chunkfs chunk definitions + * + * (C) 2007-2008 Valerie Henson + */ + +/* + * Chunk summary. This is just a wrapper defining the chunk size and + * giving us enough information to identify the client file system + * living inside this chunk and use its routines to mount it. 
+ *
+ * XXX Question: what about df? Will it require us to df each
+ * individual chunk? Maybe a higher level summary is a good idea.
+ *
+ */
+
+#define CHUNKFS_CHUNK_MAGIC 0xf00df00d
+
+/* XXX super_block s_id len is 32, should #define it */
+
+#define CHUNKFS_CLIENT_NAME_LEN 32
+
+/*
+ * On-disk chunk header. c_magic and c_chksum must stay the first two
+ * fields: check_chunk() hands this struct to check_metadata(), which
+ * reads them through struct chunkfs_chkmagic.
+ */
+struct chunkfs_chunk {
+	__le32 c_magic;
+	__le32 c_chksum;
+	__le64 c_flags;		/* CHUNKFS_ROOT, see "Chunk flags" below */
+	__le64 c_chunk_id;
+	c_byte_t c_begin;
+	c_byte_t c_end;
+	c_byte_t c_innards_begin;
+	c_byte_t c_innards_end;
+	c_byte_t c_next_chunk;
+	char c_client_fs[CHUNKFS_CLIENT_NAME_LEN];
+};
+
+/* The first chunk header lives in the block right after the dev summary. */
+#define CHUNKFS_CHUNK_BLK (CHUNKFS_DEV_BLK + 1)
+#define CHUNKFS_CHUNK_OFFSET (CHUNKFS_CHUNK_BLK * CHUNKFS_BLK_SIZE)
+#define CHUNKFS_CHUNK_SIZE (10 * 1024 * 1024) /* XXX should be dynamic */
+
+/*
+ * Validate a chunk header: 0 if OK, 1 on bad magic, 2 on bad checksum
+ * (the return values of check_metadata()).
+ */
+static inline int check_chunk(struct chunkfs_chunk *chunk)
+{
+	return check_metadata(chunk, sizeof(*chunk), CHUNKFS_CHUNK_MAGIC);
+}
+
+/*
+ * Chunk flags
+ */
+
+#define CHUNKFS_ROOT 0x00000001ULL
+
+#ifdef __KERNEL__
+
+/*
+ * XXX Audit client file systems for start-from-zero block address bugs
+ *
+ * XXX Root inode location? Copy to all chunks? O(n chunk) space usage...
+ */
+
+/* In-memory state kept for each chunk. */
+struct chunkfs_chunk_info {
+	struct chunkfs_dev_info *ci_dev; /* Parent device */
+	struct list_head ci_clist; /* Member of list of chunks */
+	struct buffer_head *ci_bh; /* b_data is the on-disk header, see CHUNKFS_CHUNK() */
+	struct super_block *ci_sb; /* Superblock of client fs in memory */
+	struct vfsmount *ci_mnt;
+	__u64 ci_flags;
+	__u64 ci_chunk_id;
+	char ci_client_fs[CHUNKFS_CLIENT_NAME_LEN];
+	/* The rest of the on-disk data is not normally used. 
*/
+};
+
+/*
+ * Non-zero if this chunk carries the CHUNKFS_ROOT flag. The argument is
+ * parenthesized so that any pointer expression can be passed safely.
+ */
+#define CHUNKFS_IS_ROOT(ci) ((ci)->ci_flags & CHUNKFS_ROOT)
+
+/* The on-disk chunk header as held in the chunk's buffer head. */
+static inline struct chunkfs_chunk * CHUNKFS_CHUNK(struct chunkfs_chunk_info *ci)
+{
+	return (struct chunkfs_chunk *) ci->ci_bh->b_data;
+}
+
+/* Client superblock of the root chunk on the pool's root device. */
+static inline struct super_block * CHUNKFS_ROOT_SB(struct chunkfs_pool_info *pi)
+{
+	return pi->pi_root_dev->di_root_chunk->ci_sb;
+}
+
+struct chunkfs_chunk_info * chunkfs_find_chunk(struct chunkfs_pool_info *, u64);
+
+#endif /* __KERNEL__ */
diff --git a/chunkfs_dev.h b/chunkfs_dev.h
new file mode 100644
index 0000000..406ccdd
--- /dev/null
+++ b/chunkfs_dev.h
@@ -0,0 +1,66 @@
+/*
+ * Chunkfs device definitions
+ *
+ * (C) 2007-2008 Valerie Henson
+ */
+
+/*
+ * Device summary. This contains:
+ *
+ * - Information about which part of the device we manage
+ * - Pointer to the first chunk header (root chunk is flagged)
+ *
+ * Again, free/used information is known only by chunks, so we do not
+ * keep summary info in the dev summary unless we find some
+ * performance reason to keep it on disk.
+ */
+
+#define CHUNKFS_DEV_MAGIC 0xdeeddeed
+
+struct chunkfs_dev {
+	__le32 d_magic;
+	__le32 d_chksum;
+	__le64 d_flags; /* Clean unmounted, etc. 
*/
+	__le64 d_uuid;
+	c_byte_t d_begin; /* Total space we manage */
+	c_byte_t d_end;
+	c_byte_t d_innards_begin; /* Space for chunks */
+	c_byte_t d_innards_end;
+	c_byte_t d_root_chunk; /* Offset of chunk containing root, if here */
+	struct chunkfs_dev_desc d_next_dev; /* Next device in pool */
+};
+
+/*
+ * Dev flags
+ */
+
+#define CHUNKFS_ROOT_DEV 0x00000001ULL
+
+/*
+ * Non-zero if this device carries the CHUNKFS_ROOT_DEV flag. Takes a
+ * struct chunkfs_dev_info pointer: the in-memory dev flags live in
+ * di_flags (ci_flags belongs to struct chunkfs_chunk_info).
+ *
+ * NOTE(review): previously this read ->ci_flags; confirm no caller
+ * was passing a chunk_info here.
+ */
+#define CHUNKFS_IS_ROOT_DEV(di) ((di)->di_flags & CHUNKFS_ROOT_DEV)
+
+/* The dev summary lives in the block right after the pool summary. */
+#define CHUNKFS_DEV_BLK (CHUNKFS_POOL_BLK + 1)
+#define CHUNKFS_DEV_OFFSET (CHUNKFS_DEV_BLK * CHUNKFS_BLK_SIZE)
+
+/*
+ * Validate a dev summary: 0 if OK, 1 on bad magic, 2 on bad checksum
+ * (the return values of check_metadata()).
+ */
+static inline int check_dev(struct chunkfs_dev *dev)
+{
+	return check_metadata(dev, sizeof(*dev), CHUNKFS_DEV_MAGIC);
+}
+
+#ifdef __KERNEL__
+
+struct chunkfs_dev_info {
+	struct chunkfs_pool_info *di_pool;
+	struct list_head di_dlist; /* Member of list of devs */
+	struct list_head di_clist_head; /* Pointer to list of chunks */
+	struct chunkfs_chunk_info *di_root_chunk;
+	struct buffer_head *di_bh;
+	__u64 di_flags;
+	/* The rest of the on-disk data is not normally used. */
+};
+
+/* The on-disk dev summary as held in the device's buffer head. */
+static inline struct chunkfs_dev * CHUNKFS_DEV(struct chunkfs_dev_info *di)
+{
+	return (struct chunkfs_dev *) di->di_bh->b_data;
+}
+
+#endif /* __KERNEL__ */
diff --git a/chunkfs_i.h b/chunkfs_i.h
new file mode 100644
index 0000000..830c916
--- /dev/null
+++ b/chunkfs_i.h
@@ -0,0 +1,113 @@
+/*
+ * Chunkfs inode definitions
+ *
+ * (C) 2007-2008 Valerie Henson
+ */
+
+#define CHUNKFS_INODE_MAGIC 0x10de10de
+
+/*
+ * The on-disk version of the chunkfs continuation data is stored as
+ * strings in name/value pairs. 
They are:
+ *
+ * "next" "<num>" - next inode in the file
+ * "prev" "<num>" - ditto
+ * "start" "<num>" - byte offset of file data in this inode
+ * "len" "<num>" - length of file data stored in this inode
+ *
+ * Each value is the decimal text of a 64-bit number, as written by
+ * get_set_cont_data() in cont.c.
+ */
+
+/*
+ * Inode/chunk number and back again
+ *
+ * A "uino" packs the chunk id into bits 28-31 and the client inode
+ * number into the low 28 bits. Arguments are parenthesized so that
+ * expressions such as (a | b) expand correctly, and MAKE_UINO widens
+ * the chunk id before shifting so a 32-bit argument cannot overflow.
+ */
+
+#define UINO_TO_CHUNK_ID(ino) (((ino) & 0xF0000000ULL) >> 28)
+#define UINO_TO_INO(ino) ((ino) & 0x0FFFFFFFULL)
+#define MAKE_UINO(chunk_id, ino) \
+	(((unsigned long long) (chunk_id) << 28) | (ino))
+
+#ifdef __KERNEL__
+
+/* In-memory copy of the four continuation xattrs listed above. */
+struct chunkfs_cont_data {
+	ci_inode_num_t cd_next;
+	ci_inode_num_t cd_prev;
+	ci_byte_t cd_start;
+	ci_byte_t cd_len;
+};
+
+/*
+ * This is the information that must be maintained in memory in
+ * addition to the client fs's in-memory inode and the VFS's inode.
+ */
+
+struct chunkfs_inode_info {
+	/* VFS inode */
+	struct inode ii_vnode;
+	/* Head client inode - keeps our inode state */
+	struct inode *ii_client_inode;
+	/* Protects on-disk continuation list */
+	spinlock_t ii_continuations_lock;
+};
+
+/*
+ * Info for each continuation in the file. Loaded as needed and not
+ * cached because that's hard, mkay.
+ */
+
+struct chunkfs_continuation {
+	struct inode *co_inode;
+	struct dentry *co_dentry;
+	struct vfsmount *co_mnt;
+	struct chunkfs_cont_data co_cd;
+	u64 co_chunk_id;
+	/* Can be reconstructed */
+	u64 co_uino;
+};
+
+/*
+ * We need a single client dentry hanging off the parent dentry, as
+ * well as a client version of the nameidata. 
+ */
+
+struct chunkfs_dentry_priv {
+	struct dentry *dp_client_dentry;
+	struct nameidata *dp_client_nd;
+};
+
+/* Map a VFS inode back to the chunkfs_inode_info that embeds it. */
+static inline struct chunkfs_inode_info *CHUNKFS_I(struct inode * inode)
+{
+	return container_of(inode, struct chunkfs_inode_info, ii_vnode);
+}
+
+/* Head client inode recorded in the chunkfs inode info. */
+static inline struct inode *get_client_inode(struct inode *inode)
+{
+	return CHUNKFS_I(inode)->ii_client_inode;
+}
+
+/* Per-dentry private data stored in d_fsdata. */
+static inline struct chunkfs_dentry_priv *CHUNKFS_D(struct dentry *dentry)
+{
+	struct chunkfs_dentry_priv *priv;
+
+	priv = (struct chunkfs_dentry_priv *) dentry->d_fsdata;
+	return priv;
+}
+
+/* Client dentry hanging off this chunkfs dentry. */
+static inline struct dentry *get_client_dentry(struct dentry *dentry)
+{
+	return CHUNKFS_D(dentry)->dp_client_dentry;
+}
+
+/*
+ * Client nameidata hanging off this chunkfs dentry.
+ *
+ * XXX locking. Can we have more than one operation going
+ * forward using a nameidata at the same time? My first guess
+ * is no.
+ */
+static inline struct nameidata *get_client_nd(struct dentry *dentry)
+{
+	return CHUNKFS_D(dentry)->dp_client_nd;
+}
+
+/* Mount recorded in the client nameidata. */
+static inline struct vfsmount *get_client_mnt(struct dentry *dentry)
+{
+	struct nameidata *client_nd = CHUNKFS_D(dentry)->dp_client_nd;
+
+	return client_nd->mnt;
+}
+
+#endif /* __KERNEL__ */
diff --git a/chunkfs_pool.h b/chunkfs_pool.h
new file mode 100644
index 0000000..cd5ea41
--- /dev/null
+++ b/chunkfs_pool.h
@@ -0,0 +1,76 @@
+/*
+ * Chunkfs pool summary definitions.
+ *
+ * (C) 2007-2008 Valerie Henson
+ */
+
+/*
+ * A wee little summary of the whole pool. There should be one copy
+ * of this summary on every device.
+ *
+ * There is no size or block summary in this structure. Information
+ * about the usable size of the file system is only known by the
+ * client file systems inside each chunk. We have to query them
+ * individually to find out this information. Currently, I see no
+ * compelling reason to store a summary on disk - it only allows for
+ * it to be out of sync with the real accounting information. 
+ */
+
+/* Pool (superblock) magic number goes in linux/magic.h */
+#include
+
+/*
+ * On-disk pool summary. p_magic and p_chksum must stay the first two
+ * fields: check_pool() hands this struct to check_metadata().
+ */
+struct chunkfs_pool {
+	__le32 p_magic;
+	__le32 p_chksum;
+	__le64 p_flags;
+	struct chunkfs_dev_desc p_root_desc; /* Device containing root */
+};
+
+/*
+ * Offset from beginning of partition of the pool summary/superblock.
+ * A large initial offset avoids MBR, boot blocks, etc.
+ *
+ * XXX Any problems from picking this location?
+ */
+
+#define CHUNKFS_POOL_BLK 8
+#define CHUNKFS_POOL_OFFSET (CHUNKFS_POOL_BLK * CHUNKFS_BLK_SIZE)
+
+/*
+ * Validate a pool summary: 0 if OK, 1 on bad magic, 2 on bad checksum
+ * (the return values of check_metadata()).
+ */
+static inline int check_pool(struct chunkfs_pool *pool)
+{
+	return check_metadata(pool, sizeof(*pool), CHUNKFS_SUPER_MAGIC);
+}
+
+#ifdef __KERNEL__
+
+#include
+
+/* In-memory state kept for the whole pool. */
+struct chunkfs_pool_info {
+	struct list_head pi_dlist_head; /* List of devices in this pool */
+	struct chunkfs_dev_info *pi_root_dev;
+	struct buffer_head *pi_bh; /* b_data is the on-disk summary, see CHUNKFS_POOL() */
+	/* Use bytes instead of blocks - block size may vary */
+	/*
+	 * Note that with shared storage or dynamically allocated
+	 * inodes, you don't want to assume that total = used + free
+	 */
+	__u64 pi_bytes_total;
+	__u64 pi_bytes_free;
+	__u64 pi_bytes_used;
+	__u64 pi_inodes_total;
+	__u64 pi_inodes_free;
+	__u64 pi_inodes_used;
+	__u64 pi_flags;
+};
+
+/* Pool info stored in the superblock's s_fs_info. */
+static inline struct chunkfs_pool_info * CHUNKFS_PI(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+/* The on-disk pool summary as held in the pool's buffer head. */
+static inline struct chunkfs_pool * CHUNKFS_POOL(struct chunkfs_pool_info *pi)
+{
+	return (struct chunkfs_pool *) pi->pi_bh->b_data;
+}
+
+#endif /* __KERNEL__ */
diff --git a/cont.c b/cont.c
new file mode 100644
index 0000000..7ac0c42
--- /dev/null
+++ b/cont.c
@@ -0,0 +1,442 @@
+/*
+ * Chunkfs continuation routines
+ *
+ * (C) 2007-2008 Valerie Henson
+ *
+ */
+
+#include
+#include "chunkfs.h"
+#include "chunkfs_pool.h"
+#include "chunkfs_dev.h"
+#include "chunkfs_chunk.h"
+#include "chunkfs_i.h"
+
+/*
+ * Wow... all hack, all the time. Don't try this at home, kids. 
+ */
+
+/*
+ * Read (type == 0) or write (type != 0) one continuation field stored
+ * as a decimal string in the "user.<name>" xattr of the client inode.
+ *
+ * On read the parsed number is stored in *ret_value; on write 'value'
+ * is stored. Returns 0 on success or a negative errno.
+ */
+static int
+get_set_cont_data(struct dentry *dentry, char *name, u64 value,
+		  u64 *ret_value, int type)
+{
+	char value_str[50]; /* XXX */
+	char full_name[50]; /* XXX */
+	/* Our continuation data is stored in the "user" xattr namespace */
+	char prefix[] = "user.";
+	ssize_t size;
+	int err = 0;
+
+	/* Make our "user.name" xattr name; refuse rather than overflow */
+	if (snprintf(full_name, sizeof(full_name), "%s%s", prefix, name) >=
+	    (int) sizeof(full_name))
+		return -ENAMETOOLONG;
+
+	if (type == 0) {
+		/*
+		 * Leave room for the terminator: the xattr value is not
+		 * null terminated and may fill the whole buffer.
+		 */
+		size = generic_getxattr(dentry, full_name, value_str,
+					sizeof(value_str) - 1);
+		if (size >= 0) {
+			value_str[size] = '\0';
+			*ret_value = simple_strtoull(value_str, NULL, 10);
+		} else {
+			err = size;
+		}
+	} else {
+		/* XXX Pad out to max number of characters to avoid ENOSPC */
+		snprintf(value_str, sizeof(value_str), "%llu",
+			 (unsigned long long) value);
+		err = generic_setxattr(dentry, full_name, value_str,
+				       strlen(value_str) + 1, 0);
+	}
+	/* XXX ENOSPC handling */
+	return err;
+}
+
+/*
+ * Write all four continuation fields of 'cd' to the client inode behind
+ * 'dentry' and mark that inode dirty. Stops at the first failure and
+ * returns its errno; fields written before the failure stay written.
+ */
+static int
+set_cont_data(struct dentry *dentry, struct chunkfs_cont_data *cd)
+{
+	int err;
+
+	err = get_set_cont_data(dentry, "next", cd->cd_next, NULL, 1);
+	if (err)
+		goto out;
+	err = get_set_cont_data(dentry, "prev", cd->cd_prev, NULL, 1);
+	if (err)
+		goto out;
+	err = get_set_cont_data(dentry, "start", cd->cd_start, NULL, 1);
+	if (err)
+		goto out;
+	err = get_set_cont_data(dentry, "len", cd->cd_len, NULL, 1);
+	if (err)
+		goto out;
+
+	mark_inode_dirty(dentry->d_inode);
+ out:
+	printk(KERN_ERR "%s: inode %lu err %d next %llu prev %llu "
+	       "start %llu len %llu\n",
+	       __FUNCTION__, dentry->d_inode->i_ino, err,
+	       cd->cd_next, cd->cd_prev,
+	       cd->cd_start, cd->cd_len);
+	return err;
+}
+
+/*
+ * Get the continuation info out of the underlying client inode and
+ * stick it into the continuation info 
for an element of the inode
+ * list for a chunkfs inode. Currently stored in an xattr, so can use
+ * nice pretty fs-independent xattr routines.
+ *
+ * Returns 0 with all four fields of 'cd' filled in, or the errno of
+ * the first field that could not be read.
+ */
+
+static int
+get_cont_data(struct dentry *dentry, struct chunkfs_cont_data *cd)
+{
+	int err;
+
+	err = get_set_cont_data(dentry, "next", 0, &cd->cd_next, 0);
+	if (err)
+		return err;
+	err = get_set_cont_data(dentry, "prev", 0, &cd->cd_prev, 0);
+	if (err)
+		return err;
+	err = get_set_cont_data(dentry, "start", 0, &cd->cd_start, 0);
+	if (err)
+		return err;
+	err = get_set_cont_data(dentry, "len", 0, &cd->cd_len, 0);
+	if (err)
+		return err;
+
+	printk(KERN_ERR "%s: inode %lu err %d next %llu prev %llu "
+	       "start %llu len %llu\n",
+	       __FUNCTION__, dentry->d_inode->i_ino, err,
+	       cd->cd_next, cd->cd_prev,
+	       cd->cd_start, cd->cd_len);
+
+	return 0;
+}
+
+/*
+ * inode based interface to get cont data
+ *
+ * Wraps the inode in a throwaway dentry for the xattr helpers. Every
+ * field other than d_inode is zeroed so the helpers never see stack
+ * garbage.
+ */
+
+static int
+get_cont_data_inode(struct inode *inode, struct chunkfs_cont_data *cd)
+{
+	struct dentry fake_dentry = { .d_inode = inode };
+
+	return get_cont_data(&fake_dentry, cd);
+}
+
+/*
+ * Read an existing continuation into memory.
+ *
+ * On success *ret_cont is a newly allocated continuation that now owns
+ * the caller's reference on client_dentry (dropped again by
+ * chunkfs_put_continuation()). On failure nothing is stored and the
+ * caller still owns that reference.
+ *
+ * XXX - dget/iget on client?
+ */
+
+static int
+load_continuation(struct inode *head_inode, struct dentry *client_dentry,
+		  u64 chunk_id, struct chunkfs_continuation **ret_cont)
+{
+	struct chunkfs_pool_info *pi = CHUNKFS_PI(head_inode->i_sb);
+	struct chunkfs_continuation *cont;
+	struct chunkfs_chunk_info *ci;
+	int err;
+
+	printk(KERN_ERR "%s() chunk_id %llu\n", __FUNCTION__, chunk_id);
+
+	cont = kzalloc(sizeof(*cont), GFP_KERNEL);
+	if (cont == NULL)
+		return -ENOMEM;
+
+	cont->co_inode = client_dentry->d_inode;
+	cont->co_dentry = client_dentry;
+	cont->co_chunk_id = chunk_id;
+	/* Hm. Think I could pass in the mnt, too... 
*/
+	ci = chunkfs_find_chunk(pi, chunk_id);
+	if (ci == NULL) {
+		/* The chunk id came off disk; fail the load, don't BUG */
+		err = -EIO;
+		goto out;
+	}
+	cont->co_mnt = ci->ci_mnt;
+	cont->co_uino = MAKE_UINO(chunk_id, cont->co_inode->i_ino);
+
+	err = get_cont_data(cont->co_dentry, &cont->co_cd);
+	if (err)
+		goto out;
+
+	*ret_cont = cont;
+	return 0;
+ out:
+	kfree(cont);
+	return err;
+}
+
+/*
+ * Release a continuation obtained from chunkfs_get_next_cont() and
+ * friends: drops its dentry reference and frees it.
+ */
+void
+chunkfs_put_continuation(struct chunkfs_continuation *cont)
+{
+	dput(cont->co_dentry);
+	/* Should be doing mntput but don't do mntget either */
+	kfree(cont);
+}
+
+/*
+ * Load the continuation following prev_cont, or the head continuation
+ * when prev_cont is NULL.
+ *
+ * Returns 0 with *next_cont set to the loaded continuation, or to NULL
+ * at the end of the list. Returns a negative errno, with *next_cont
+ * NULL, on failure. prev_cont is not released; the caller still owns it.
+ *
+ * Inode list lock must be held.
+ *
+ * Huuuuuge simplification - only load a continuation into memory
+ * while it's being used. No in-memory linked list.
+ *
+ */
+
+int
+chunkfs_get_next_cont(struct dentry *head_dentry,
+		      struct chunkfs_continuation *prev_cont,
+		      struct chunkfs_continuation **next_cont)
+{
+	struct inode *head_inode = head_dentry->d_inode;
+	struct chunkfs_cont_data *cd;
+	struct dentry *client_dentry;
+	struct nameidata nd;
+	/* "/chunk<id>/<id>/<ino>": three u64s of at most 20 digits each */
+	char path[sizeof("/chunk//") + 3 * 20];
+	u64 from_chunk_id;
+	u64 chunk_id;
+	u64 from_ino;
+	u64 next_uino;
+	int err;
+
+	printk(KERN_ERR "%s() prev_cont %p\n", __FUNCTION__, prev_cont);
+
+	/* Every early return below reports "no continuation" */
+	*next_cont = NULL;
+
+	/*
+	 * Get the dentry for the continuation we want.
+	 */
+
+	if (prev_cont == NULL) {
+		client_dentry = dget(get_client_dentry(head_dentry));
+		chunk_id = UINO_TO_CHUNK_ID(head_inode->i_ino);
+	} else {
+		cd = &prev_cont->co_cd;
+		/* End of list: no next, or we wrapped around to the head */
+		if (cd->cd_next == 0 || cd->cd_next == head_inode->i_ino)
+			return 0;
+		/* Laboriously construct the path and look it up */
+		next_uino = cd->cd_next;
+		chunk_id = UINO_TO_CHUNK_ID(next_uino);
+		from_chunk_id = prev_cont->co_chunk_id;
+		from_ino = UINO_TO_INO(prev_cont->co_uino);
+		snprintf(path, sizeof(path), "/chunk%llu/%llu/%llu",
+			 chunk_id, from_chunk_id, from_ino);
+		err = path_lookup(path, 0, &nd);
+		if (err)
+			return -ENOENT;
+		client_dentry = dget(nd.dentry);
+		path_release(&nd);
+	}
+
+	/* Now we know the dentry of the continuation we want. */
+
+	err = load_continuation(head_inode, client_dentry, chunk_id,
+				next_cont);
+	if (err)
+		dput(client_dentry);	/* nobody took over our reference */
+
+	printk(KERN_ERR "%s() returning err %d\n", __FUNCTION__, err);
+
+	return err;
+}
+
+/*
+ * Find the continuation whose [start, start + len) range contains
+ * 'offset'.
+ *
+ * Returns 0 with *ret_cont set (release it with
+ * chunkfs_put_continuation()), -ENOENT with *ret_cont NULL if no
+ * continuation covers the offset, or another negative errno with
+ * *ret_cont NULL if the list could not be walked.
+ *
+ * XXX The walk allocates memory and does path lookups while holding
+ * ii_continuations_lock, which is a spinlock. It probably needs to be
+ * a sleeping lock.
+ */
+
+int
+chunkfs_get_cont_at_offset(struct dentry *dentry, loff_t offset,
+			   struct chunkfs_continuation **ret_cont)
+{
+	struct chunkfs_inode_info *ii = CHUNKFS_I(dentry->d_inode);
+	struct chunkfs_continuation *prev_cont = NULL;
+	struct chunkfs_continuation *next_cont = NULL;
+	struct chunkfs_cont_data *cd;
+	int err;
+
+	printk(KERN_ERR "%s() reading ino %0lx offset %llu\n",
+	       __FUNCTION__, dentry->d_inode->i_ino, offset);
+
+	spin_lock(&ii->ii_continuations_lock);
+	while (1) {
+		err = chunkfs_get_next_cont(dentry, prev_cont, &next_cont);
+		/* Done with the one we stepped from, whatever happened */
+		if (prev_cont != NULL)
+			chunkfs_put_continuation(prev_cont);
+		prev_cont = NULL;
+		if (err) {
+			next_cont = NULL;
+			break;
+		}
+		if (next_cont == NULL)
+			break;
+		cd = &next_cont->co_cd;
+		printk(KERN_ERR "offset %llu start %llu len %llu err %d\n",
+		       offset, cd->cd_start, cd->cd_len, err);
+		if ((offset >= cd->cd_start) &&
+		    (offset < (cd->cd_start + cd->cd_len))) {
+			printk(KERN_ERR "found it!\n");
+			break;
+		}
+		printk(KERN_ERR "not this one\n");
+		prev_cont = next_cont;
+	}
+	spin_unlock(&ii->ii_continuations_lock);
+	/* If we didn't find a cont at all, return -ENOENT */
+	if (!err && next_cont == NULL)
+		err = -ENOENT;
+	*ret_cont = next_cont;
+	return err;
+}
+
+/*
+ * Traverse the list of continuations using iget() only.
+ *
+ * Pass prev_inode == NULL to start: *ret_inode is then a new reference
+ * on the head client inode. Otherwise the caller's reference on
+ * prev_inode is dropped and *ret_inode is the next inode in the list,
+ * or NULL at the end. Returns 0 or a negative errno.
+ */
+
+int
+chunkfs_get_next_inode(struct inode *head_inode,
+		       struct inode *prev_inode, struct inode **ret_inode)
+{
+	struct chunkfs_pool_info *pi = CHUNKFS_PI(head_inode->i_sb);
+	struct chunkfs_chunk_info *ci;
+	struct chunkfs_cont_data cd;
+	struct inode *next_inode;
+	u64 next_uino;
+	ino_t next_ino;
+	u64 chunk_id;
+	int err;
+
+	/* Starting the list... */
+	if (prev_inode == NULL) {
+		prev_inode = get_client_inode(head_inode);
+		next_inode = iget(prev_inode->i_sb, prev_inode->i_ino);
+		goto found_inode;
+	}
+	/* Find the superblock and inode for the next one */
+	err = get_cont_data_inode(prev_inode, &cd);
+	/* Drop the caller's reference only after we are done reading it */
+	iput(prev_inode);
+	if (err)
+		return err;
+	next_uino = cd.cd_next;
+	if (next_uino == 0) {
+		*ret_inode = NULL;
+		return 0;
+	}
+	next_ino = UINO_TO_INO(next_uino);
+	chunk_id = UINO_TO_CHUNK_ID(next_uino);
+	printk(KERN_ERR "next_uino %llu next_ino %lu, next chunk_id %llu\n",
+	       next_uino, next_ino, chunk_id);
+	ci = chunkfs_find_chunk(pi, chunk_id);
+	if (ci == NULL)
+		return -EIO;	/* chunk id came off disk; don't BUG */
+	next_inode = iget(ci->ci_sb, next_ino);
+ found_inode:
+	/* NOTE(review): assumes iget() can return NULL on allocation failure */
+	if (next_inode == NULL)
+		return -ENOMEM;
+	if (is_bad_inode(next_inode)) {
+		iput(next_inode);
+		return -EIO;
+	}
+	*ret_inode = next_inode;
+	return 0;
+}
+
+/*
+ * Create a new continuation in this chunk. Never called on the head.
+ * Length is set arbitrarily so be sure to write continuously.
+ *
+ * We have to bootstrap ourselves up, starting with a dentry. We are,
+ * in fact, creating a file from the kernel. Bleah. 
+ *
+ * On success returns 0 with *client_file set to the opened client file
+ * and *ret_cont to the new continuation (release the latter with
+ * chunkfs_put_continuation()). On failure returns a negative errno and
+ * leaves both untouched.
+ */
+
+int
+chunkfs_create_continuation(struct file *file, loff_t *ppos,
+			    struct file **client_file,
+			    struct chunkfs_continuation **ret_cont)
+{
+	struct chunkfs_continuation *prev_cont = NULL;
+	struct chunkfs_continuation *next_cont;
+	struct chunkfs_continuation *new_cont;
+	struct file *new_file;
+	u64 from_chunk_id;
+	u64 to_chunk_id;
+	u64 from_ino;
+	struct nameidata file_nd;
+	/* "/chunk<id>/<id>/<ino>": three u64s of at most 20 digits each */
+	char path[sizeof("/chunk//") + 3 * 20];
+	struct dentry *dentry;
+	/* Zeroed so the exit printk never reads garbage */
+	struct chunkfs_cont_data cd = { .cd_next = 0 };
+	int err;
+
+	printk(KERN_ERR "%s()\n", __FUNCTION__);
+
+	/* Get the last continuation, releasing the ones we step over */
+	while (1) {
+		err = chunkfs_get_next_cont(file->f_dentry, prev_cont,
+					    &next_cont);
+		if (err)
+			goto out;
+		if (next_cont == NULL)
+			break;
+		if (prev_cont != NULL)
+			chunkfs_put_continuation(prev_cont);
+		prev_cont = next_cont;
+	}
+
+	/* Figure out what chunk and inode we are continuing from. */
+	from_chunk_id = prev_cont->co_chunk_id;
+	from_ino = UINO_TO_INO(prev_cont->co_uino);
+	/* Temporary hack, do the next chunk for creation. */
+	to_chunk_id = from_chunk_id + 1;
+	printk(KERN_ERR "%s() to chunk %llu\n", __FUNCTION__, to_chunk_id);
+
+	/* Now we need the filename for the continuation inode. */
+	snprintf(path, sizeof(path), "/chunk%llu/%llu/%llu", to_chunk_id,
+		 from_chunk_id, from_ino);
+
+	/* Create the file */
+	err = open_namei(0 /* XXX */, path, O_CREAT | O_RDWR,
+			 (MAY_WRITE | MAY_READ | MAY_APPEND), &file_nd);
+	printk(KERN_ERR "open_namei for %s: err %d\n", path, err);
+	if (err)
+		goto out;
+
+	/* Extra reference for the continuation we are about to load */
+	dentry = dget(file_nd.dentry);
+
+	/* Fill in next/prev/etc. data */
+	cd.cd_next = 0;
+	cd.cd_prev = prev_cont->co_uino;
+	cd.cd_start = prev_cont->co_cd.cd_start + prev_cont->co_cd.cd_len;
+	cd.cd_len = 10 * 4096;
+	err = set_cont_data(dentry, &cd);
+	if (err)
+		goto out_release;
+	/* Now update prev */
+	prev_cont->co_cd.cd_next = MAKE_UINO(to_chunk_id,
+					     dentry->d_inode->i_ino);
+	err = set_cont_data(prev_cont->co_dentry, &prev_cont->co_cd);
+	if (err)
+		goto out_release;
+	/* Now! It's all in the inode and we can load it like normal. */
+	err = load_continuation(file->f_dentry->d_inode, dentry,
+				to_chunk_id, &new_cont);
+	if (err)
+		goto out_release;
+	/*
+	 * NOTE(review): assumes dentry_open() takes over the nameidata's
+	 * dentry/mnt references and drops them itself on failure -
+	 * confirm against the target kernel.
+	 */
+	new_file = dentry_open(dentry, file_nd.mnt, file->f_flags);
+	if (IS_ERR(new_file)) {
+		err = PTR_ERR(new_file);
+		printk(KERN_ERR "dentry_open: err %d\n", err);
+		/* Drops our extra dentry reference and frees new_cont */
+		chunkfs_put_continuation(new_cont);
+		goto out;
+	}
+	chunkfs_copy_down_file(file, ppos, new_file, new_cont->co_cd.cd_start);
+
+	*client_file = new_file;
+	*ret_cont = new_cont;
+
+	/* The previous continuation was only needed to link the new one in */
+	chunkfs_put_continuation(prev_cont);
+
+	printk(KERN_ERR "%s(): start %llu returning %d\n",
+	       __FUNCTION__, cd.cd_start, err);
+	return 0;
+ out_release:
+	dput(dentry);
+	path_release(&file_nd);
+ out:
+	if (prev_cont != NULL)
+		chunkfs_put_continuation(prev_cont);
+	printk(KERN_ERR "%s(): start %llu returning %d\n",
+	       __FUNCTION__, cd.cd_start, err);
+	return err;
+}
+
+/*
+ * Write the initial continuation data for a head inode: no neighbours,
+ * data starting at offset 0, and the same arbitrary length used for
+ * new continuations. Returns 0 or the errno from set_cont_data().
+ */
+int
+chunkfs_init_cont_data(struct dentry *client_dentry)
+{
+	struct chunkfs_cont_data cd;
+	int err;
+
+	cd.cd_prev = 0;
+	cd.cd_next = 0;
+	cd.cd_start = 0;
+	cd.cd_len = 10 * 4096;
+	err = set_cont_data(client_dentry, &cd);
+	return err;
+}
diff --git a/cross.sh b/cross.sh
new file mode 100644
index 0000000..1224a79
--- /dev/null
+++ b/cross.sh
@@ -0,0 +1,33 @@
+#!/bin/bash -x
+#
+# Run simple cross-chunk checks.
+#
+
+mount /dev/loop1 /chunk1
+mount /dev/loop2 /chunk2
+mount /dev/loop3 /chunk3
+
+# For each file in the continued-from-directory, check to see if it
+# exists in the original chunk.
+
+# For every possible continuation file...
+
+for from_chunk in 1 2 3; do
+	for to_chunk in 1 2 3; do
+		for i in `ls /chunk${to_chunk}/${from_chunk}/*` ; do
+			# Check to see if the previous inode exists
+			echo testi \<${i}\> | debugfs /dev/loop${from_chunk} | grep marked
+			# Returns 1 if not allocated (and it should be)
+			if [ "$?" == "1" ]; then
+				echo "Orphan continuation ${i}, removing"
+				rm ${i}
+			fi
+		done
+	done
+done
+
+umount /dev/loop1
+umount /dev/loop2
+umount /dev/loop3
+
+exit 0
diff --git a/demo.sh b/demo.sh
new file mode 100644
index 0000000..cfd4eda
--- /dev/null
+++ b/demo.sh
@@ -0,0 +1,55 @@
+#!/bin/bash -x
+#
+# Demo script for chunkfs to be run from inside UML. 
+# + +# This is where the chunkfs user binaries are located. +BINPATH=/chunkfs_bin +MNT=/mnt + +# Name of the file backing the loop device +ORIG=/loop/saved_disk +FILE=/loop/test_disk +cp ${ORIG} ${FILE} + +# Unmount chunkfs and chunk file systems before stomping +umount ${MNT} +umount /dev/loop1 +umount /dev/loop2 +umount /dev/loop3 + +losetup -d /dev/loop0 +losetup -d /dev/loop1 +losetup -d /dev/loop2 +losetup -d /dev/loop3 + +losetup /dev/loop0 ${FILE} +losetup -o 45056 /dev/loop1 ${FILE} +losetup -o 10530816 /dev/loop2 ${FILE} +losetup -o 21016576 /dev/loop3 ${FILE} + +# Stomp head inode + +OFFSET=$(((4096 * 4) + 0x0e00)) +dd if=/dev/zero of=/dev/loop1 seek=${OFFSET} bs=1 count=128 + +# Repair individual chunks + +fsck -f /dev/loop1 +fsck -f /dev/loop2 +fsck -f /dev/loop3 + +# Cross-chunk repair + +${BINPATH}/cross.sh + +for i in 1 2 3; do + mount -t ext2 -o user_xattr /dev/loop${i} /chunk${i} +done + +${BINPATH}/mount_chunkfs /dev/loop0 ${MNT} + +ls /mnt/big +ls /chunk2/1/29 + +exit 0 diff --git a/dir.c b/dir.c new file mode 100644 index 0000000..71ce865 --- /dev/null +++ b/dir.c @@ -0,0 +1,68 @@ +/* + * Chunkfs directory routines + * + * (C) 2007-2008 Valerie Henson + */ + +#include + +#include "chunkfs.h" +#include "chunkfs_i.h" + +/* + * Currently we're reusing the client directory ops. We'll probably + * have to implement our own directories on top. + */ + +static loff_t +chunkfs_dir_llseek(struct file *file, loff_t offset, int origin) +{ + struct file *client_file; + struct chunkfs_continuation *cont; + int err; + + /* XXX... should only do top-level file struct? 
*/ + printk(KERN_ERR "%s()\n", __FUNCTION__); + + err = chunkfs_open_cont_file(file, &offset, &client_file, &cont); + if (err) + return err; + + if (client_file->f_op->llseek) + err = client_file->f_op->llseek(client_file, offset, origin); + else + err = default_llseek(client_file, offset, origin); + + chunkfs_close_cont_file(file, client_file, cont); + return err; +} + +static int +chunkfs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + struct file *client_file; + struct chunkfs_continuation *cont; + int err; + + printk(KERN_ERR "%s()\n", __FUNCTION__); + + err = chunkfs_open_cont_file(file, &file->f_pos, &client_file, &cont); + if (err) + return err; + + err = client_file->f_op->readdir(client_file, dirent, filldir); + /* If we read off the end, no problemo */ + if (err == -ENODATA) + err = 0; + + chunkfs_close_cont_file(file, client_file, cont); + + return err; +} + +struct file_operations chunkfs_dir_fops = { + .llseek = chunkfs_dir_llseek, + .read = generic_read_dir, + .open = chunkfs_open, + .readdir = chunkfs_readdir, +}; diff --git a/file.c b/file.c new file mode 100644 index 0000000..e601544 --- /dev/null +++ b/file.c @@ -0,0 +1,338 @@ +/* + * Chunkfs file routines + * + * (C) 2007-2008 Valerie Henson + */ + +#include +#include +#include +#include + +#include "chunkfs.h" +#include "chunkfs_pool.h" +#include "chunkfs_dev.h" +#include "chunkfs_chunk.h" +#include "chunkfs_i.h" + +/* + * The point of all these wrapper functions is the following: + * + * We need to get set the right file ops in the file struct for the + * area of the file being altered. + * + * For ops which affect the entire file (like fsync), we need to fan + * out to all the parts of the file. + * + * It would be cool if we could set the file operations depending on + * where in the file the I/O is happening. But I don't think we have + * that ability in the VFS right now. 
+ */
+
+/*
+ * Set the client file position to be relative to the start of the
+ * client file and copy down the main file struct's data in to the
+ * client file struct.
+ */
+
+void
+chunkfs_copy_down_file(struct file *file, loff_t *ppos,
+		       struct file *client_file, u64 client_start)
+{
+	client_file->f_pos = *ppos - client_start;
+	*ppos = client_file->f_pos;
+
+	printk(KERN_ERR "%s(): client f_pos set to %llu "
+	       "(parent %llu, client_start %llu)\n",
+	       __FUNCTION__, client_file->f_pos, file->f_pos,
+	       client_start);
+}
+
+/*
+ * Aaaand reverse the pos conversion.
+ */
+
+static void
+copy_up_file(struct file *file, struct file *client_file, u64 client_start)
+{
+	file->f_pos = client_file->f_pos + client_start;
+
+	printk(KERN_ERR "%s(): file f_pos set to %llu (client f_pos %llu "
+	       "client_start %llu)\n", __FUNCTION__, file->f_pos,
+	       client_file->f_pos, client_start);
+}
+
+/*
+ * Open the client inode at offset and return the file struct.
+ *
+ * On success *client_file and *ret_cont are set and must be handed
+ * back to chunkfs_close_cont_file(). On failure the continuation is
+ * released here and nothing is returned to the caller.
+ */
+
+int
+chunkfs_open_cont_file(struct file *file, loff_t *ppos,
+		       struct file **client_file,
+		       struct chunkfs_continuation **ret_cont)
+{
+	struct chunkfs_continuation *cont;
+	struct chunkfs_cont_data *cd;
+	struct file *new_file;
+	int err;
+
+	printk(KERN_ERR "%s() pos %llu\n", __FUNCTION__, *ppos);
+
+	err = chunkfs_get_cont_at_offset(file->f_dentry, *ppos, &cont);
+	if (err)
+		return err;
+
+	/*
+	 * dentry_open() takes over one reference on the dentry and the
+	 * mount (and drops them itself on failure), so give it its own
+	 * instead of the ones the continuation holds.
+	 */
+	new_file = dentry_open(dget(cont->co_dentry), mntget(cont->co_mnt),
+			       file->f_flags);
+	if (IS_ERR(new_file)) {
+		err = PTR_ERR(new_file);
+		printk(KERN_ERR "dentry_open: err %d\n", err);
+		chunkfs_put_continuation(cont);
+		goto out;
+	}
+	cd = &cont->co_cd;
+	chunkfs_copy_down_file(file, ppos, new_file, cd->cd_start);
+
+	*ret_cont = cont;
+	*client_file = new_file;
+ out:
+	printk(KERN_ERR "%s(): returning %d\n", __FUNCTION__, err);
+	return err;
+}
+
+/*
+ * Copy position and inode state back up, then close the client file
+ * and release the continuation.
+ */
+void
+chunkfs_close_cont_file(struct file *file, struct file *client_file,
+			struct chunkfs_continuation *cont)
+{
+	struct chunkfs_cont_data *cd = &cont->co_cd;
+	/* XXX... sys_close does a lot more than this. */
+	printk(KERN_ERR "%s()\n", __FUNCTION__);
+	copy_up_file(file, client_file, cd->cd_start);
+	chunkfs_copy_up_inode(file->f_dentry->d_inode,
+			      client_file->f_dentry->d_inode);
+	/* Without this every read/write leaks a struct file. */
+	filp_close(client_file, NULL);
+	chunkfs_put_continuation(cont);
+}
+
+/*
+ * lseek only affects the top-level file struct's fpos.
+ */
+
+static loff_t
+chunkfs_llseek_file(struct file *file, loff_t offset, int origin)
+{
+	printk(KERN_ERR "%s()\n", __FUNCTION__);
+
+	/* XXX right generic llseek? */
+	return default_llseek(file, offset, origin);
+}
+
+/*
+ * Find the right inode for the offset and read from it. Opens and
+ * closes the client file struct every time because I'm lazy.
+ */
+
+static ssize_t
+chunkfs_read(struct file *file, char __user *buf, size_t len, loff_t *ppos)
+{
+	struct file *client_file;
+	struct chunkfs_continuation *cont;
+	ssize_t size;
+	int err;
+
+	printk(KERN_ERR "%s()\n", __FUNCTION__);
+
+	err = chunkfs_open_cont_file(file, ppos, &client_file, &cont);
+	/* Read off the end of the file */
+	/* XXX distinguish between this and EIO */
+	if (err == -ENOENT)
+		return 0;
+	if (err)
+		return err;
+
+	/* XXX assume not longer than len */
+	if (client_file->f_op->read)
+		size = client_file->f_op->read(client_file, buf, len, ppos);
+	else
+		size = do_sync_read(client_file, buf, len, ppos);
+
+	/* If we read off the end, no problemo */
+	if (size == -ENODATA)
+		size = 0;
+
+	chunkfs_close_cont_file(file, client_file, cont);
+	return size;
+}
+
+/*
+ * Write into the continuation covering *ppos, creating a new
+ * continuation if none covers it yet.
+ */
+static ssize_t
+chunkfs_write(struct file *file, const char __user *buf, size_t len,
+	      loff_t *ppos)
+{
+	struct chunkfs_continuation *cont;
+	struct file *client_file;
+	ssize_t size;
+	int err;
+
+	printk(KERN_ERR "%s() pos %llu len %zu\n",
+	       __FUNCTION__, *ppos, len);
+
+	err = chunkfs_open_cont_file(file, ppos, &client_file, &cont);
+	if (err == -ENOENT) {
+		err = chunkfs_create_continuation(file, ppos, &client_file,
+						  &cont);
+	}
+	if (err)
+		return err;
+
+	/* XXX assume not longer than len */
+	if (client_file->f_op->write)
+		size = client_file->f_op->write(client_file, buf, len, ppos);
+	else
+		size = do_sync_write(client_file, buf, len, ppos);
+
+	chunkfs_close_cont_file(file, client_file, cont);
+
+	printk(KERN_ERR "%s() pos %llu len %zu, returning size %zd\n",
+	       __FUNCTION__, *ppos, len, size);
+
+	return size;
+}
+
+/*
+ * Open only affects the top-level chunkfs file struct. Do an open of
+ * the underlying head client inode just to see that we can, then
+ * close it again.
+ */
+
+int
+chunkfs_open(struct inode * inode, struct file * file)
+{
+	struct file *client_file;
+	struct chunkfs_continuation *cont;
+	loff_t dummy_pos = 0;
+	int err;
+
+	printk(KERN_ERR "%s()\n", __FUNCTION__);
+
+	err = chunkfs_open_cont_file(file, &dummy_pos, &client_file, &cont);
+	if (err)
+		goto out;
+	chunkfs_close_cont_file(file, client_file, cont);
+	return 0;
+ out:
+	printk(KERN_ERR "%s() returning %d\n", __FUNCTION__, err);
+	return err;
+}
+
+/*
+ * Apparently, file may be null at this point. Uh. Whatever.
+ *
+ * So only the dentry is used to find the inode. Every continuation is
+ * synced; the first fsync error seen is the one returned.
+ *
+ * NOTE(review): the client fsync is called with
+ * ii_continuations_lock (a spinlock) held; confirm it cannot sleep.
+ */
+
+static int
+chunkfs_fsync_file(struct file *file, struct dentry *dentry, int datasync)
+{
+	struct chunkfs_inode_info *ii = CHUNKFS_I(dentry->d_inode);
+	struct chunkfs_continuation *prev_cont = NULL;
+	struct chunkfs_continuation *next_cont;
+	struct dentry *client_dentry;
+	struct inode *client_inode;
+	int first_err = 0;
+	int err;
+
+	printk(KERN_ERR "%s()\n", __FUNCTION__);
+
+	/* XXX syncs all inodes instead of just ones in mem */
+	spin_lock(&ii->ii_continuations_lock);
+	while (1) {
+		err = chunkfs_get_next_cont(dentry, prev_cont, &next_cont);
+		if (err || (next_cont == NULL))
+			break;
+		client_dentry = next_cont->co_dentry;
+		client_inode = client_dentry->d_inode;
+		if (client_inode->i_fop && client_inode->i_fop->fsync) {
+			err = client_inode->i_fop->fsync(NULL, client_dentry,
+							 datasync);
+			if (err && !first_err)
+				first_err = err;
+		}
+		prev_cont = next_cont;
+	}
+	spin_unlock(&ii->ii_continuations_lock);
+	if (!err)
+		err = first_err;
+	printk(KERN_ERR "%s() err %d\n", __FUNCTION__, err);
+	return err;
+}
+
+/*
+ * Truncate every client inode of this file that has a truncate op.
+ *
+ * NOTE(review): the client truncate runs under a spinlock; confirm it
+ * cannot sleep.
+ */
+static void chunkfs_truncate(struct inode *inode)
+{
+	struct chunkfs_inode_info *ii = CHUNKFS_I(inode);
+	struct inode *prev_inode = NULL;
+	struct inode *next_inode;
+	int err;
+
+	printk(KERN_ERR "%s()\n", __FUNCTION__);
+
+	spin_lock(&ii->ii_continuations_lock);
+	/* XXX completely delete continuations */
+	while (1) {
+		err = chunkfs_get_next_inode(inode, prev_inode, &next_inode);
+		if (err || (next_inode == NULL))
+			break;
+		if (next_inode->i_op && next_inode->i_op->truncate)
+			next_inode->i_op->truncate(next_inode);
+		prev_inode = next_inode;
+	}
+	spin_unlock(&ii->ii_continuations_lock);
+}
+
+/*
+ * Apply the attribute change to the client inode, using the client's
+ * own setattr if it has one, then copy the result up.
+ */
+int chunkfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *client_inode = get_client_inode(dentry->d_inode);
+	struct dentry *client_dentry = get_client_dentry(dentry);
+	unsigned int ia_valid = attr->ia_valid;
+	int error;
+
+	printk(KERN_ERR "%s()\n", __FUNCTION__);
+
+	if (client_inode->i_op->setattr) {
+		error = client_inode->i_op->setattr(client_dentry, attr);
+	} else {
+		/* Arrrrrgh gross argh */
+		error = inode_change_ok(client_inode, attr);
+		if (!error)
+			error = security_inode_setattr(client_dentry, attr);
+		if (!error) {
+			if ((ia_valid & ATTR_UID && attr->ia_uid != client_inode->i_uid) ||
+			    (ia_valid & ATTR_GID && attr->ia_gid != client_inode->i_gid))
+				error = DQUOT_TRANSFER(client_inode, attr) ? -EDQUOT : 0;
+			if (!error)
+				error = inode_setattr(client_inode, attr);
+		}
+	}
+	if (!error)
+		chunkfs_copy_up_inode(dentry->d_inode, client_inode);
+	return error;
+}
+
+/*
+ * XXX probably need to change the nd.
+ */
+
+int chunkfs_permission(struct inode *inode, int submask,
+		       struct nameidata *nd)
+{
+	struct inode *client_inode = get_client_inode(inode);
+	int err;
+
+	if (client_inode->i_op->permission)
+		err = client_inode->i_op->permission(client_inode,
+						     submask, nd);
+	else
+		err = generic_permission(client_inode, submask, NULL);
+	return err;
+}
+
+struct file_operations chunkfs_file_fops = {
+	.llseek		= chunkfs_llseek_file,
+	.read		= chunkfs_read,
+	.write		= chunkfs_write,
+	.open		= chunkfs_open,
+	.fsync		= chunkfs_fsync_file,
+};
+
+struct inode_operations chunkfs_file_iops = {
+	.truncate	= chunkfs_truncate,
+	.setattr	= chunkfs_setattr,
+	.permission	= chunkfs_permission,
+};
diff --git a/fsck.chunkfs.c b/fsck.chunkfs.c
new file mode 100644
index 0000000..f340b96
--- /dev/null
+++ b/fsck.chunkfs.c
@@ -0,0 +1,212 @@
+/*
+ * Check and repair a chunkfs file system.
+ *
+ * (C) 2007-2008 Val Henson
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "chunkfs.h"
+
+static char * cmd;
+
+static void usage (void)
+{
+	fprintf(stderr, "Usage: %s \n", cmd);
+	exit(1);
+}
+
+/*
+ * Read size bytes at offset into buf; exits on seek failure or a
+ * short read.
+ */
+static void read_data(void *buf, int size, int fd, size_t offset)
+{
+	bzero(buf, size);
+
+	if (lseek(fd, offset, SEEK_SET) < 0)
+		error(1, errno, "Cannot seek");
+
+	if (read(fd, buf, size) < size)
+		error(1, errno, "Cannot read");
+}
+
+/*
+ * Construct a "superblock." In chunkfs, this means a pool summary.
+ */
+static void create_pool_summary(char *dev_name, struct chunkfs_pool *pool)
+{
+	struct chunkfs_dev_desc *root_dev = &pool->p_root;
+	int hint_len = sizeof (root_dev->d_hint);
+
+	bzero(pool, sizeof(*pool));
+	pool->p_magic = __cpu_to_le32(CHUNKFS_SUPER_MAGIC);
+	/* Fill in root device description. */
+	strncpy(root_dev->d_hint, dev_name, hint_len);
+	root_dev->d_hint[hint_len - 1] = '\0';
+	pool->p_flags = 0;
+	/* XXX need userland generated uuid... ask kernel to do it on mount? */
+	root_dev->d_uuid = __cpu_to_le64(0x001d001d);
+
+	write_chksum(pool, sizeof(*pool), &pool->p_chksum);
+}
+
+static void create_dev_summary(char *dev_name, int devfd,
+			       struct chunkfs_pool *pool,
+			       struct chunkfs_dev *dev)
+{
+	struct stat stat_buf;
+	struct chunkfs_dev_desc *root_dev = &pool->p_root;
+
+	if (fstat(devfd, &stat_buf) != 0)
+		error(1, errno, "Cannot stat device %s", dev_name);
+
+	bzero(dev, sizeof(*dev));
+	dev->d_uuid = root_dev->d_uuid; /* Already endian swapped */
+	dev->d_bytes_total = __cpu_to_le64(stat_buf.st_size);
+	dev->d_magic = __cpu_to_le32(CHUNKFS_DEV_MAGIC);
+	/* Checksum the structure, not the pointer to it. */
+	write_chksum(dev, sizeof (*dev), &dev->d_chksum);
+}
+
+static void create_chunk_summary(struct chunkfs_pool *pool,
+				 struct chunkfs_dev *dev,
+				 struct chunkfs_chunk *chunk,
+				 __u64 start, __u64 size, __u64 next_chunk)
+{
+	__u64 end = start + size - 1;
+	/* XXX use chunk->ci_blk_bits; */
+	__u64 start_blk = (start >> CHUNKFS_BLK_BITS) + 2;
+	__u64 end_blk = end >> CHUNKFS_BLK_BITS;
+	__u64 start_inode;
+	__u64 end_inode;
+
+	bzero(chunk, sizeof(*chunk));
+	chunk->c_next_chunk = __cpu_to_le64(next_chunk);
+	chunk->c_blk_size = __cpu_to_le64(CHUNKFS_BLK_SIZE);
+	chunk->c_blk_bits = __cpu_to_le64(CHUNKFS_BLK_BITS);
+	chunk->c_blk_start = __cpu_to_le64(start_blk);
+	chunk->c_blk_end = __cpu_to_le64(end_blk);
+	/* XXX subtract space used for bitmaps and chunk summary
+	 * For now assume only takes two blocks
+	 * XXX put data at one end of the chunk and metadata at other */
+	chunk->c_blks_free = __cpu_to_le64((end_blk - start_blk) - 2);
+	start_inode = blk_to_inode(start_blk);
+	chunk->c_inode_begin = __cpu_to_le64(start_inode);
+	/* Get the last inode in the last blk, not the first */
+	end_inode = blk_to_inode(end_blk + 1) - 1;
+	chunk->c_inode_end = __cpu_to_le64(end_inode);
+	chunk->c_magic = __cpu_to_le32(CHUNKFS_CHUNK_MAGIC);
+	/* Checksum the structure, not the pointer to it. */
+	write_chksum(chunk, sizeof (*chunk), &chunk->c_chksum);
+}
+
+/*
+ * Write one chunk summary per chunk across the device; the first
+ * chunk is also copied out to *root_chunk.
+ */
+static void write_chunk_summaries(int fd, struct chunkfs_pool *pool,
+				  struct chunkfs_dev *dev,
+				  struct chunkfs_chunk *root_chunk)
+{
+	struct chunkfs_chunk chunk;
+	__u64 dev_size = __le64_to_cpu(dev->d_bytes_total);
+	__u64 chunk_size = CHUNKFS_CHUNK_SIZE;
+	__u64 chunk_start = CHUNKFS_CHUNK_OFFSET;
+	__u64 next_chunk_offset;
+	unsigned int chunk_id = 0;
+	int root = 1;
+
+	while (chunk_start < dev_size) {
+		/* XXX What is chunk min size? */
+		if ((chunk_start + chunk_size) > dev_size) {
+			chunk_size = dev_size - chunk_start;
+			next_chunk_offset = 0;
+		} else {
+			next_chunk_offset = chunk_start + chunk_size;
+		}
+		create_chunk_summary(pool, dev, &chunk, chunk_start,
+				     chunk_size, next_chunk_offset);
+		if (root) {
+			/* Make root the first inode */
+			chunk.c_root_inode = chunk.c_inode_begin;
+			*root_chunk = chunk;
+			root = 0;
+		}
+
+		/* Report only values this function actually has. */
+		printf("Writing chunk %u (bytes %llu-%llu, blocks %llu-%llu, "
+		       "inodes %llu-%llu)\n", chunk_id,
+		       (unsigned long long) chunk_start,
+		       (unsigned long long) (chunk_start + chunk_size - 1),
+		       (unsigned long long) __le64_to_cpu(chunk.c_blk_start),
+		       (unsigned long long) __le64_to_cpu(chunk.c_blk_end),
+		       (unsigned long long) __le64_to_cpu(chunk.c_inode_begin),
+		       (unsigned long long) __le64_to_cpu(chunk.c_inode_end));
+
+		write_block(&chunk, sizeof(chunk), fd, chunk_start);
+		chunk_start += chunk_size;
+		chunk_id++;
+	}
+}
+
+static void write_root(int fd, struct chunkfs_pool *pool,
+		       struct chunkfs_dev *dev,
+		       struct chunkfs_chunk *root_chunk,
+		       struct chunkfs_inode *root_inode)
+{
+	/* XXX This is zero presently. */
+	c_inode_num_t inode_num = __le64_to_cpu(root_chunk->c_root_inode);
+	c_byte_t root_offset = inode_num;
+
+	printf("root inode number %llu, offset %llu\n", inode_num, root_offset);
+	root_inode->i_self = __cpu_to_le64(inode_num);
+	/* uid, gid, size, etc. all 0 */
+	root_inode->i_nlink = __cpu_to_le32(2);
+	root_inode->i_atime.t_sec =
+		root_inode->i_mtime.t_sec =
+		root_inode->i_ctime.t_sec = __cpu_to_le32(time(NULL));
+	root_inode->i_mode = __cpu_to_le16(S_IFDIR | 0755); /* XXX */
+	root_inode->i_type = __cpu_to_le16(CHUNKFS_PUBLIC_INODE);
+	root_inode->i_magic = __cpu_to_le32(CHUNKFS_INODE_MAGIC);
+	write_chksum(root_inode, sizeof(*root_inode), &root_inode->i_chksum);
+
+	write_block(root_inode, sizeof(*root_inode), fd, root_offset);
+}
+
+int main (int argc, char * argv[])
+{
+	int fd;
+	char * dev_name;
+	struct chunkfs_pool pool = { 0 };
+	struct chunkfs_dev root_dev = { 0 };
+	struct chunkfs_chunk root_chunk = { 0 };
+	struct chunkfs_inode root_inode = { 0 };
+
+	cmd = argv[0];
+
+	if (argc != 2)
+		usage();
+
+	dev_name = argv[1];
+
+	if ((fd = open(dev_name, O_RDWR)) < 0)
+		error(1, errno, "Cannot open device %s", dev_name);
+	/* XXX combine create and write_block */
+	/* The chunkfs equivalent of a superblock is the pool summary. */
+
+	create_pool_summary(dev_name, &pool);
+	write_block(&pool, sizeof(pool), fd, CHUNKFS_POOL_OFFSET);
+
+	/* We need one device summary per device. */
+
+	create_dev_summary(dev_name, fd, &pool, &root_dev);
+	write_block(&root_dev, sizeof(root_dev), fd, CHUNKFS_DEV_OFFSET);
+
+	/* Now we get to the meaty bit: chunk summaries. */
+
+	write_chunk_summaries(fd, &pool, &root_dev, &root_chunk);
+
+	/* Write root inode */
+
+	write_root(fd, &pool, &root_dev, &root_chunk, &root_inode);
+
+	close(fd);
+
+	return 0;
+}
diff --git a/gaze.sh b/gaze.sh
new file mode 100644
index 0000000..44f0ada
--- /dev/null
+++ b/gaze.sh
@@ -0,0 +1,27 @@
+#!/bin/bash -x
+#
+# Give me a naked eyeball look at the on-disk format as created by mkfs.
+#
+
+# Name of the loopback device we're creating
+DEV=/tmp/disk0
+# This is where the chunkfs user binaries are located.
+BINPATH=.
+
+${BINPATH}/write_pattern ${DEV}
+if [ "$?" != "0" ]; then
+    echo "write_pattern failed"
+    exit 1
+fi
+
+${BINPATH}/mkfs.chunkfs ${DEV}
+if [ "$?" != "0" ]; then
+    echo "mkfs.chunkfs failed"
+    exit 1
+fi
+
+# Now dump it for us.
+
+od -A d -t x4 ${DEV}
+
+exit 0
diff --git a/inode.c b/inode.c
new file mode 100644
index 0000000..97550bb
--- /dev/null
+++ b/inode.c
@@ -0,0 +1,173 @@
+/*
+ * Chunkfs inode routines
+ *
+ * (C) 2007-2008 Valerie Henson
+ */
+
+#include
+#include
+
+#include "chunkfs.h"
+#include "chunkfs_pool.h"
+#include "chunkfs_dev.h"
+#include "chunkfs_chunk.h"
+#include "chunkfs_i.h"
+
+int
+chunkfs_get_nlinks(struct inode *inode)
+{
+	/* XXX go through all client inodes */
+	return inode->i_nlink;
+}
+
+static void
+__copy_inode(struct inode *dst, struct inode *src)
+{
+	/* Copy data from one inode to another */
+	fsstack_copy_attr_all(dst, src, chunkfs_get_nlinks);
+}
+
+/*
+ * Copy the client inode's attributes into the chunkfs inode and set
+ * i_size to the sum of the sizes of all client inodes found by
+ * chunkfs_get_next_inode(). The walk stops at the first error.
+ */
+void
+chunkfs_copy_up_inode(struct inode *inode, struct inode *client_inode)
+{
+	struct inode *prev_inode = NULL;
+	struct inode *next_inode;
+	loff_t total_size = 0;
+	int err;
+
+	__copy_inode(inode, client_inode);
+
+	while (1) {
+		err = chunkfs_get_next_inode(inode, prev_inode, &next_inode);
+		/* next_inode cannot be trusted if the lookup failed. */
+		if (err || next_inode == NULL)
+			break;
+		/* XXX doesn't do holey files right */
+		printk(KERN_ERR "adding %llu\n", next_inode->i_size);
+		total_size += next_inode->i_size;
+		prev_inode = next_inode;
+	}
+	inode->i_size = total_size;
+	printk(KERN_ERR "%s() ino %lu size %llu\n", __FUNCTION__,
+	       inode->i_ino, inode->i_size);
+
+	mark_inode_dirty(inode);
+}
+
+static void
+copy_down_inode(struct inode *inode, struct inode *client_inode)
+{
+	__copy_inode(client_inode, inode);
+}
+
+static void
+set_inode_ops(struct inode *inode, struct inode *client_inode)
+{
+	/* Pick inode ops */
+	if (S_ISLNK(client_inode->i_mode))
+		inode->i_op = &chunkfs_symlink_iops;
+	else if (S_ISDIR(client_inode->i_mode))
+		inode->i_op = &chunkfs_dir_iops;
+	else if (S_ISREG(client_inode->i_mode))
+		inode->i_op = &chunkfs_file_iops;
+	else
+		inode->i_op = &chunkfs_special_iops;
+
+	/* Use different set of file ops for directories */
+	if (S_ISDIR(client_inode->i_mode))
+		inode->i_fop = &chunkfs_dir_fops;
+	else if (S_ISREG(client_inode->i_mode))
+		inode->i_fop = &chunkfs_file_fops;
+
+	/* properly initialize special inodes */
+	if (S_ISBLK(client_inode->i_mode) || S_ISCHR(client_inode->i_mode) ||
+	    S_ISFIFO(client_inode->i_mode) || S_ISSOCK(client_inode->i_mode))
+		init_special_inode(inode, client_inode->i_mode,
+				   client_inode->i_rdev);
+}
+
+/*
+ * Allocate a new inode and do any extra bits to it that aren't
+ * covered by the alloc_inode() op (currently none).
+ *
+ * Returns 0 with *inodep set, -ENOMEM if no inode could be allocated,
+ * or -EIO if the new inode is marked bad.
+ */
+
+int
+chunkfs_new_inode(struct super_block *sb, struct inode **inodep)
+{
+	*inodep = new_inode(sb);
+
+	/* new_inode() returns NULL when allocation fails. */
+	if (*inodep == NULL)
+		return -ENOMEM;
+	if (is_bad_inode(*inodep))
+		/* XXX hate the inode error return conventions */
+		return -EIO;
+	return 0;
+}
+
+/*
+ * We've just read in a client inode. Fill in the chunkfs inode.
+ * Wait to fill in the continuation until the file is opened.
+ */
+
+void
+chunkfs_start_inode(struct inode *inode, struct inode *client_inode,
+		    u64 chunk_id)
+{
+	struct chunkfs_inode_info *ii = CHUNKFS_I(inode);
+
+	BUG_ON(!client_inode);
+
+	ii->ii_client_inode = client_inode;
+	inode->i_ino = MAKE_UINO(chunk_id, client_inode->i_ino);
+	/* XXX i_mapping? */
+	/* XXX check inode checksum, etc. */
+	set_inode_ops(inode, client_inode);
+	chunkfs_copy_up_inode(inode, client_inode);
+
+	printk(KERN_ERR "%s(): inode %p ino %0lx mode %0x client %p\n",
+	       __FUNCTION__, inode, inode->i_ino, inode->i_mode,
+	       ii->ii_client_inode);
+}
+
+/*
+ * Come in with the chunkfs inode. Fill it in and get the client
+ * inode too.
+ */
+
+void
+chunkfs_read_inode(struct inode *inode)
+{
+	struct chunkfs_pool_info *pi = CHUNKFS_PI(inode->i_sb);
+	struct chunkfs_chunk_info *ci;
+	struct inode *client_inode;
+	struct super_block *client_sb;
+	u64 chunk_id = UINO_TO_CHUNK_ID(inode->i_ino);
+	unsigned long client_ino = UINO_TO_INO(inode->i_ino);
+
+	printk (KERN_ERR "%s() reading ino %0lx client ino %0lx chunk_id "
+		"%0llx count %d\n",
+		__FUNCTION__, inode->i_ino, client_ino, chunk_id,
+		atomic_read(&inode->i_count));
+
+	/* XXX should be chunkfs_get_sb */
+	ci = chunkfs_find_chunk(pi, chunk_id);
+	BUG_ON(ci == NULL); /* XXX */
+
+	client_sb = ci->ci_sb;
+	client_inode = iget(client_sb, client_ino);
+	if (client_inode == NULL || is_bad_inode(client_inode)) {
+		/*
+		 * Don't leave a half-filled inode behind: drop the
+		 * client reference and mark our inode bad so callers
+		 * can see the failure.
+		 */
+		if (client_inode)
+			iput(client_inode);
+		make_bad_inode(inode);
+		return;
+	}
+	chunkfs_start_inode(inode, client_inode, chunk_id);
+	return;
+}
+
+/*
+ * Copy our attributes down to the client inode and ask the client
+ * file system to write it, if it has a write_inode op.
+ */
+int chunkfs_write_inode(struct inode *inode, int wait)
+{
+	struct inode *client_inode = get_client_inode(inode);
+	int err = 0;
+
+	copy_down_inode(inode, client_inode);
+
+	/* XXX will client inodes be written when evicted? think so */
+	if (client_inode->i_sb->s_op->write_inode)
+		err = client_inode->i_sb->s_op->write_inode(client_inode,
+							     wait);
+
+	return err;
+}
diff --git a/mkfs.chunkfs.c b/mkfs.chunkfs.c
new file mode 100644
index 0000000..e773bec
--- /dev/null
+++ b/mkfs.chunkfs.c
@@ -0,0 +1,192 @@
+/*
+ * Create a chunkfs file system.
+ *
+ * (C) 2007-2008 Val Henson
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "chunkfs.h"
+#include "chunkfs_pool.h"
+#include "chunkfs_dev.h"
+#include "chunkfs_chunk.h"
+#include "chunkfs_i.h"
+
+/* Compile time test that structures have not outgrown blocks. */
+
+static char canary_buf1[CHUNKFS_BLK_SIZE -
+			sizeof(struct chunkfs_pool)] __attribute__((unused));
+static char canary_buf2[CHUNKFS_BLK_SIZE -
+			sizeof(struct chunkfs_dev)] __attribute__((unused));
+
+static char * cmd;
+
+static void usage (void)
+{
+	fprintf(stderr, "Usage: %s \n", cmd);
+	exit(1);
+}
+
+/*
+ * Copy size bytes of metadata into a zeroed block, checksum it and
+ * write the whole block at offset. Exits on any failure.
+ */
+static void write_block(void *metadata, int size, int fd, __u64 offset)
+{
+	char buf[CHUNKFS_BLK_SIZE];
+	int buf_size = sizeof (buf);
+	struct chunkfs_chkmagic *x = (struct chunkfs_chkmagic *) buf;
+
+	/* The metadata must fit in the one block we write. */
+	if (size < 0 || size > buf_size)
+		error(1, 0, "Metadata size %d does not fit in a block", size);
+
+	bzero(buf, buf_size);
+	memcpy(buf, metadata, size);
+	write_chksum(buf, size);
+
+	printf("Writing magic %0x chksum %0x to offset %llu\n",
+	       __le32_to_cpu(x->x_magic), __le32_to_cpu(x->x_chksum),
+	       (unsigned long long) offset);
+
+	if (lseek(fd, offset, SEEK_SET) < 0)
+		error(1, errno, "Cannot seek");
+
+	if (write(fd, buf, buf_size) < buf_size)
+		error(1, errno, "Cannot write metadata at offset %llu",
+		      (unsigned long long) offset);
+}
+
+/*
+ * Create and write a pool summary (superblock)
+ */
+static void create_pool_summary(char *dev_name, struct chunkfs_pool *pool)
+{
+	struct chunkfs_dev_desc *dev_desc = &pool->p_root_desc;
+
+	/* Zero first; zeroing afterwards would wipe the description. */
+	bzero(pool, sizeof(*pool));
+
+	/* Fill in device description, truncating an over-long name. */
+	strncpy(dev_desc->d_hint, dev_name, sizeof(dev_desc->d_hint) - 1);
+	dev_desc->d_hint[sizeof(dev_desc->d_hint) - 1] = '\0';
+	/* XXX need userland generated uuid */
+	dev_desc->d_uuid = __cpu_to_le64(0x001d001d);
+
+	pool->p_magic = __cpu_to_le32(CHUNKFS_SUPER_MAGIC);
+}
+
+static void create_dev_summary(struct chunkfs_pool *pool,
+			       struct chunkfs_dev *dev,
+			       __u64 dev_begin,
+			       __u64 dev_size)
+{
+	struct chunkfs_dev_desc *dev_desc = &pool->p_root_desc;
+
+	bzero(dev, sizeof(*dev));
+	dev->d_uuid = dev_desc->d_uuid; /* Already swapped */
+	dev->d_begin = __cpu_to_le64(dev_begin);
+	dev->d_end = __cpu_to_le64(dev_begin + dev_size - 1); /* Starting counting from zero */
+	dev->d_innards_begin = __cpu_to_le64(dev_begin + CHUNKFS_BLK_SIZE);
+	dev->d_innards_end = dev->d_end; /* Already swapped */
+	dev->d_root_chunk = dev->d_innards_begin; /* Already swapped */
+	dev->d_magic = __cpu_to_le32(CHUNKFS_DEV_MAGIC);
+}
+
+static void create_chunk_summary(struct chunkfs_chunk *chunk,
+				 __u64 chunk_start, __u64 chunk_size,
+				 __u64 chunk_id)
+{
+	bzero(chunk, sizeof(*chunk));
+	chunk->c_begin = __cpu_to_le64(chunk_start);
+	chunk->c_end = __cpu_to_le64(chunk_start + chunk_size - 1);
+	chunk->c_innards_begin = __cpu_to_le64(chunk_start + CHUNKFS_BLK_SIZE);
+	chunk->c_innards_end = chunk->c_end; /* Already swapped */
+	chunk->c_chunk_id = __cpu_to_le64(chunk_id);
+	chunk->c_magic = __cpu_to_le32(CHUNKFS_CHUNK_MAGIC);
+}
+
+/*
+ * Write a summary for every whole chunk that fits on the device,
+ * reusing *chunk as scratch space.
+ *
+ * NOTE(review): the "< dev_end" tests look one byte stricter than
+ * needed; left as-is because changing them alters the disk layout.
+ */
+static void write_chunk_summaries(struct chunkfs_dev *dev,
+				  struct chunkfs_chunk *chunk,
+				  int fd)
+{
+	__u64 chunk_id = 1; /* 0 is not a valid chunk id */
+	__u64 chunk_start = __le64_to_cpu(dev->d_root_chunk);
+	__u64 chunk_size = CHUNKFS_CHUNK_SIZE;
+	__u64 dev_end = __le64_to_cpu(dev->d_end);
+
+	while ((chunk_start + chunk_size - 1) < dev_end) {
+		/* XXX Throwing away disk if not multiple of chunk size */
+		create_chunk_summary(chunk, chunk_start, chunk_size,
+				     chunk_id);
+		if (chunk_id == 1)
+			chunk->c_flags |= __cpu_to_le64(CHUNKFS_ROOT);
+		/* Can we get another chunk in? Then point to it */
+		if ((__le64_to_cpu(chunk->c_end) + chunk_size - 1) < dev_end)
+			/* c_end is little-endian on disk; convert first. */
+			chunk->c_next_chunk =
+				__cpu_to_le64(__le64_to_cpu(chunk->c_end) + 1);
+
+		printf("Writing chunk %llu: start %llu end %llu)\n",
+		       (unsigned long long) __le64_to_cpu(chunk->c_chunk_id),
+		       (unsigned long long) __le64_to_cpu(chunk->c_begin),
+		       (unsigned long long) __le64_to_cpu(chunk->c_end));
+
+		printf("clientfs: start %llu\n",
+		       (unsigned long long) __le64_to_cpu(chunk->c_innards_begin));
+
+		write_block(chunk, sizeof(*chunk), fd, chunk_start);
+		chunk_start += chunk_size;
+		chunk_id++;
+	}
+}
+
+int main (int argc, char * argv[])
+{
+	int fd;
+	char * dev_name;
+	struct stat stat_buf;
+	off_t raw_dev_size;
+	struct chunkfs_pool pool = { 0 };
+	struct chunkfs_dev root_dev = { 0 };
+	struct chunkfs_chunk root_chunk = { 0 };
+
+	cmd = argv[0];
+
+	if (argc != 2)
+		usage();
+
+	dev_name = argv[1];
+
+	/*
+	 * Get some info about the device.
+	 */
+
+	if ((fd = open(dev_name, O_RDWR)) < 0)
+		error(1, errno, "Cannot open device %s", dev_name);
+
+	if (fstat(fd, &stat_buf) != 0)
+		error(1, errno, "Cannot stat device %s", dev_name);
+	raw_dev_size = stat_buf.st_size;
+	/*
+	 * The size arithmetic below is unsigned, so a device no larger
+	 * than the device summary offset would wrap around.
+	 * XXX Sanity check size - big enough for at least one chunk?
+	 */
+	if (raw_dev_size <= (off_t) (CHUNKFS_DEV_OFFSET + 1))
+		error(1, 0, "Device %s is too small", dev_name);
+
+	/*
+	 * Create structures and write them out
+	 */
+
+	create_pool_summary(dev_name, &pool);
+	write_block(&pool, sizeof(pool), fd, CHUNKFS_POOL_OFFSET);
+
+	/* XXX handle multiple devs */
+
+	create_dev_summary(&pool, &root_dev, CHUNKFS_DEV_OFFSET,
+			   raw_dev_size - CHUNKFS_DEV_OFFSET - 1);
+	write_block(&root_dev, sizeof(root_dev), fd, CHUNKFS_DEV_OFFSET);
+
+	/* Now we get to the meaty bit: chunk summaries. */
+
+	write_chunk_summaries(&root_dev, &root_chunk, fd);
+
+	close(fd);
+
+	return 0;
+}
diff --git a/namei.c b/namei.c
new file mode 100644
index 0000000..7ccc193
--- /dev/null
+++ b/namei.c
@@ -0,0 +1,538 @@
+/*
+ * Chunkfs
+ *
+ * Chunks is a file system designed to be checked and repaired in
+ * small, mostly independent chunks. This allows quick recovery from
+ * file system corruption.
+ *
+ * (C) 2007-2008 Valerie Henson
+ *
+ */
+
+#include "chunkfs.h"
+#include "chunkfs_pool.h"
+#include "chunkfs_dev.h"
+#include "chunkfs_chunk.h"
+#include "chunkfs_i.h"
+
+/*
+ * Drop the dentry and mount references held by the client nameidata
+ * attached to this dentry.
+ */
+void
+chunkfs_release_nd(struct dentry *dentry)
+{
+	struct nameidata *client_nd = get_client_nd(dentry);
+
+	dput(client_nd->dentry);
+	mntput(client_nd->mnt);
+}
+
+/*
+ * Call this to initialize our client nameidata.
+ *
+ * Points the client nameidata at client_dentry and at the mount of
+ * the chunk identified by chunk_id, taking a reference on each.
+ */
+
+void
+chunkfs_init_nd(struct inode *dir, struct dentry *dentry,
+		struct dentry *client_dentry, u64 chunk_id)
+{
+	struct nameidata *client_nd = get_client_nd(dentry);
+	struct chunkfs_chunk_info *ci =
+		chunkfs_find_chunk(CHUNKFS_PI(dir->i_sb), chunk_id);
+
+	BUG_ON(!ci); /* XXX */
+	/* Probably don't need dget/mntget */
+	client_nd->dentry = dget(client_dentry);
+	client_nd->mnt = mntget(ci->ci_mnt);
+	printk(KERN_ERR "%s(): dentry %p name %s client_dentry %p mnt %s\n",
+	       __FUNCTION__, dentry, dentry->d_iname, client_dentry,
+	       client_nd->mnt->mnt_sb->s_type->name);
+}
+
+/*
+ * The client file system may read the following parts of the nameidata:
+ *
+ * In open, it reads the intent's mode or flags.
+ *
+ * The client file system may alter the nameidata in the following cases:
+ *
+ * When following symbolic links (up to N levels of links saved in
+ * nd), it may set the saved_names (using the depth) with nd_set_link.
+ */
+
+static void
+__chunkfs_copy_nd(struct nameidata *dst, struct nameidata *src)
+{
+	dst->intent.open.flags = src->intent.open.flags;
+	dst->intent.open.create_mode = src->intent.open.create_mode;
+	dst->depth = src->depth;
+	dst->saved_names[dst->depth] = src->saved_names[dst->depth];
+}
+
+void
+chunkfs_copy_up_nd(struct nameidata *nd, struct nameidata *client_nd)
+{
+	__chunkfs_copy_nd(nd, client_nd);
+}
+
+void
+chunkfs_copy_down_nd(struct nameidata *nd, struct nameidata *client_nd)
+{
+	__chunkfs_copy_nd(client_nd, nd);
+}
+
+/* Drop our reference on the client dentry. */
+static void
+chunkfs_remove_dentry(struct dentry *dentry)
+{
+	struct chunkfs_dentry_priv *dp = CHUNKFS_D(dentry);
+	dput(dp->dp_client_dentry);
+}
+
+/*
+ * Free the chunkfs private data hanging off a dentry. Safe to call
+ * when no private data was ever attached, which happens when
+ * chunkfs_init_dentry() fails in chunkfs_lookup().
+ */
+void
+chunkfs_free_dentry(struct dentry *dentry)
+{
+	struct chunkfs_dentry_priv *dp = CHUNKFS_D(dentry);
+
+	if (dp == NULL)
+		return;
+	kfree(dp->dp_client_nd);
+	kfree(dp);
+	dentry->d_fsdata = NULL;
+}
+
+/*
+ * Called when a dentry is evicted from cache.
+ */
+
+void
+chunkfs_release_dentry(struct dentry *dentry)
+{
+	printk(KERN_ERR "%s(): name %s\n", __FUNCTION__, dentry->d_name.name);
+	/*
+	 * Root dentry can be legitimately released on umount, but is
+	 * also a common manifestation of refcounting problems. Catch
+	 * for debugging.
+	 */
+	WARN_ON(strcmp(dentry->d_name.name, "/") == 0);
+	chunkfs_release_nd(dentry);
+	/*
+	 * Negative dentries need client dentries too, so they can be
+	 * easily converted into responsible positive dentries. We
+	 * should never have a dentry without a client dentry.
+	 */
+	chunkfs_remove_dentry(dentry);
+	chunkfs_free_dentry(dentry);
+}
+
+struct dentry_operations chunkfs_dops = {
+	.d_release	= chunkfs_release_dentry,
+};
+
+/*
+ * Initialize a new chunkfs dentry.
+ */
+
+int
+chunkfs_init_dentry(struct dentry *dentry)
+{
+	struct chunkfs_dentry_priv *dp;
+	struct nameidata *nd;
+
+	BUG_ON(dentry->d_fsdata);
+	dp = kzalloc(sizeof(*dp), GFP_KERNEL);
+	if (!dp)
+		return -ENOMEM;
+	nd = kzalloc(sizeof(*nd), GFP_KERNEL);
+	if (!nd)
+		goto out;
+	dp->dp_client_nd = nd;
+	dentry->d_fsdata = dp;
+	dentry->d_op = &chunkfs_dops;
+	return 0;
+ out:
+	kfree(dp);
+	return -ENOMEM;
+}
+
+/*
+ * This function takes a chunkfs dentry and constructs a new dentry
+ * for the client fs.
+ */
+
+static struct dentry *
+chunkfs_clone_dentry(struct dentry *dentry)
+{
+	struct dentry *client_parent = get_client_dentry(dentry->d_parent);
+	struct dentry *client_dentry;
+
+	client_dentry = d_alloc_name(client_parent, dentry->d_name.name);
+
+	if (!client_dentry)
+		return ERR_PTR(-ENOMEM);
+	return client_dentry;
+}
+
+/* Record the client dentry in the chunkfs dentry; mnt is unused. */
+void
+chunkfs_add_dentry(struct dentry *dentry, struct dentry *client_dentry,
+		   struct vfsmount *mnt)
+{
+	struct chunkfs_dentry_priv *dp = CHUNKFS_D(dentry);
+	dp->dp_client_dentry = client_dentry;
+}
+
+static int
+chunkfs_create(struct inode *dir, struct dentry *dentry, int mode,
+	       struct nameidata *nd)
+{
+	struct inode *client_dir = get_client_inode(dir);
+	struct dentry *client_dentry = get_client_dentry(dentry);
+	struct nameidata *client_nd = get_client_nd(dentry);
+	u64 chunk_id = UINO_TO_CHUNK_ID(dir->i_ino);
+	struct inode *inode;
+	int err;
+
+	printk(KERN_ERR "%s(): dir ino %0lx i_count %d\n",
+	       __FUNCTION__, dir->i_ino, atomic_read(&dir->i_count));
+
+	err = chunkfs_new_inode(dir->i_sb, &inode);
+	if (err)
+		goto out;
+
+	chunkfs_copy_down_nd(nd, client_nd);
+
+	err = client_dir->i_op->create(client_dir, client_dentry, mode,
+				       client_nd);
+	if (err)
+		goto out_inode;
+
+	err = chunkfs_init_cont_data(client_dentry);
+	if (err)
+		goto out_inode;
+	chunkfs_start_inode(inode, client_dentry->d_inode, chunk_id);
+	chunkfs_copy_up_inode(dir, client_dir);
+	chunkfs_copy_up_nd(nd, client_nd);
+
+	/* Now put our new inode into the dentry */
+	d_instantiate(dentry, inode);
+
+	printk(KERN_ERR "dentry %p name %s inode %p ino %0lx\n",
+	       dentry, dentry->d_iname, dentry->d_inode,
+	       dentry->d_inode->i_ino);
+
+	printk(KERN_ERR "client dentry %p name %s inode %p ino %0lx\n",
+	       client_dentry, client_dentry->d_iname, client_dentry->d_inode,
+	       client_dentry->d_inode->i_ino);
+	return 0;
+ out_inode:
+	iput(inode);
+ out:
+	return err;
+}
+
+/*
+ * Look the name up in the client directory and build the matching
+ * chunkfs dentry (and inode, if the client found one). Returns the
+ * result of d_splice_alias() on success or an ERR_PTR on failure.
+ *
+ * NOTE(review): chunkfs_init_cont_data() is called for an inode that
+ * already exists; confirm this does not reset the continuation data
+ * of a file that already has continuations.
+ */
+static struct dentry *
+chunkfs_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
+{
+	struct inode *client_dir = get_client_inode(dir);
+	u64 chunk_id = UINO_TO_CHUNK_ID(dir->i_ino);
+	struct dentry *client_dentry;
+	struct dentry *new_dentry;
+	struct nameidata *client_nd;
+	struct inode *inode;
+	int err;
+
+	printk(KERN_ERR "%s(): name %s dir ino %0lx i_count %d\n",
+	       __FUNCTION__, dentry->d_iname, dir->i_ino,
+	       atomic_read(&dir->i_count));
+
+	err = chunkfs_init_dentry(dentry);
+	/* Nothing was attached to the dentry, so nothing to free. */
+	if (err)
+		goto out;
+
+	client_dentry = chunkfs_clone_dentry(dentry);
+	if (IS_ERR(client_dentry)) {
+		/* Without this we would return ERR_PTR(0), i.e. NULL. */
+		err = PTR_ERR(client_dentry);
+		goto out_dentry;
+	}
+
+	chunkfs_init_nd(dir, dentry, client_dentry, chunk_id);
+	client_nd = get_client_nd(dentry);
+	/*
+	 * Fill out the client dentry.
+	 */
+	new_dentry = client_dir->i_op->lookup(client_dir, client_dentry,
+					      client_nd);
+	/*
+	 * Possible return values:
+	 *
+	 * NULL: Nothing went wrong with lookup, you may or may not
+	 * have found a matching inode and attached it. If the inode
+	 * is NULL, we still have to create a negative dentry.
+	 *
+	 * Address of a dentry: The dentry already existed (and was
+	 * root and disconnected - something about knfsd), so the
+	 * dentry we passed in needs to be thrown away and we should
+	 * use the one returned.
+	 *
+	 * IS_ERR(): Something went wrong, return the error.
+	 */
+	if (IS_ERR(new_dentry)) {
+		err = PTR_ERR(new_dentry);
+		goto out_dput;
+	} else if (new_dentry) {
+		dput(client_dentry);
+		client_dentry = new_dentry;
+	}
+
+	/*
+	 * If the client found an inode, fill in the chunkfs inode.
+	 */
+	if (client_dentry->d_inode) {
+		err = chunkfs_new_inode(dir->i_sb, &inode);
+		if (err)
+			goto out_dput;
+		err = chunkfs_init_cont_data(client_dentry);
+		if (err) {
+			/* Don't leak the inode we just allocated. */
+			iput(inode);
+			goto out_dput;
+		}
+		chunkfs_start_inode(inode, client_dentry->d_inode,
+				    chunk_id);
+	} else {
+		inode = NULL;
+	}
+	/* Hook up the client and parent dentries. */
+	chunkfs_add_dentry(dentry, client_dentry, client_nd->mnt);
+
+	printk(KERN_ERR "dentry %p name %s inode %p\n",
+	       dentry, dentry->d_iname, dentry->d_inode);
+	printk(KERN_ERR "client dentry %p name %s inode %p\n", client_dentry,
+	       client_dentry->d_iname, client_dentry->d_inode);
+
+	return d_splice_alias(inode, dentry);
+ out_dput:
+	dput(client_dentry);
+	chunkfs_release_nd(dentry);
+ out_dentry:
+	chunkfs_remove_dentry(dentry);
+	chunkfs_free_dentry(dentry);
+ out:
+	printk(KERN_ERR "%s(): name %s returning %d\n",
+	       __FUNCTION__, dentry->d_iname, err);
+
+	return ERR_PTR(err);
+}
+
+static int
+chunkfs_link(struct dentry *old_dentry, struct inode *dir,
+	     struct dentry *new_dentry)
+{
+	struct inode *client_dir = get_client_inode(dir);
+	struct inode *old_inode = old_dentry->d_inode;
+	struct inode *client_old_inode = get_client_inode(old_inode);
+	struct dentry *client_old_dentry = get_client_dentry(old_dentry);
+	struct dentry *client_new_dentry = get_client_dentry(new_dentry);
+	int err = 0;
+
+	printk(KERN_ERR "%s()\n", __FUNCTION__);
+
+	err = client_dir->i_op->link(client_old_dentry, client_dir,
+				     client_new_dentry);
+	if (err)
+		goto out;
+	/* Copy up inode takes care of link counts */
+	chunkfs_copy_up_inode(old_inode, client_old_inode);
+	/*
+	 * For some reason, this is the one place where the VFS
+	 * doesn't increment the inode ref count for us.
+ */ + atomic_inc(&old_inode->i_count); /* d_instantiate() below attaches old_inode, so the extra reference belongs on it, not on dir */ + d_instantiate(new_dentry, old_inode); + out: + return err; +} + +/* + * Remove a name via the client directory's ->unlink(), then refresh + * the chunkfs directory and file inodes from their client inodes. + */ +static int +chunkfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *client_dir = get_client_inode(dir); + struct dentry *client_dentry = get_client_dentry(dentry); + struct inode *inode = dentry->d_inode; + struct inode *client_inode = get_client_inode(inode); + int err = 0; + + printk(KERN_ERR "%s()\n", __FUNCTION__); + + err = client_dir->i_op->unlink(client_dir, client_dentry); + if (err) + goto out; + chunkfs_copy_up_inode(dir, client_dir); + chunkfs_copy_up_inode(inode, client_inode); + out: + return err; +} + +/* + * Create a symlink via the client directory's ->symlink() and attach + * a new chunkfs inode to dentry. Returns 0 or a negative errno. + */ +static int +chunkfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) +{ + struct inode *client_dir = get_client_inode(dir); + struct dentry *client_dentry = get_client_dentry(dentry); + u64 chunk_id = UINO_TO_CHUNK_ID(dir->i_ino); + struct inode *inode; + int err; + + printk(KERN_ERR "%s(): dir ino %0lx i_count %d\n", + __FUNCTION__, dir->i_ino, atomic_read(&dir->i_count)); + + err = chunkfs_new_inode(dir->i_sb, &inode); + if (err) + goto out; + + err = client_dir->i_op->symlink(client_dir, client_dentry, oldname); + if (err) + goto out_inode; + + err = chunkfs_init_cont_data(client_dentry); + if (err) + goto out_inode; + chunkfs_start_inode(inode, client_dentry->d_inode, chunk_id); + chunkfs_copy_up_inode(dir, client_dir); + + /* Now put our new inode into the dentry */ + d_instantiate(dentry, inode); + + printk(KERN_ERR "dentry %p name %s inode %p ino %0lx\n", + dentry, dentry->d_iname, dentry->d_inode, + dentry->d_inode->i_ino); + printk(KERN_ERR "client dentry %p name %s inode %p ino %0lx\n", + client_dentry, client_dentry->d_iname, client_dentry->d_inode, + client_dentry->d_inode->i_ino); + return 0; + out_inode: + iput(inode); + out: + return err; +} + +static int +chunkfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *client_dir = get_client_inode(dir); + struct inode
*client_inode; + struct dentry *client_dentry = get_client_dentry(dentry); + u64 chunk_id = UINO_TO_CHUNK_ID(dir->i_ino); /* chunk id is derived from the parent directory's inode number */ + struct inode *inode; + int err; + + printk(KERN_ERR "%s(): name %s dir ino %0lx i_count %d\n", + __FUNCTION__, dentry->d_iname, dir->i_ino, + atomic_read(&dir->i_count)); + + err = chunkfs_new_inode(dir->i_sb, &inode); + if (err) + goto out; + + err = client_dir->i_op->mkdir(client_dir, client_dentry, mode); /* the client fs creates the real directory */ + if (err) + goto out_inode; + client_inode = client_dentry->d_inode; + + err = chunkfs_init_cont_data(client_dentry); + if (err) + goto out_inode; + chunkfs_start_inode(inode, client_inode, chunk_id); + chunkfs_copy_up_inode(dir, client_dir); /* refresh the parent's chunkfs inode from its client inode */ + d_instantiate(dentry, inode); + return 0; + out_inode: + iput(inode); /* drop the chunkfs inode allocated above */ + out: + printk(KERN_ERR "%s(): name %s returning %d\n", + __FUNCTION__, dentry->d_iname, err); + return err; +} + +static int +chunkfs_rmdir(struct inode *dir, struct dentry *dentry) +{ /* Remove the directory via the client directory's ->rmdir(), then refresh both chunkfs inodes from their client inodes. */ + struct inode *client_dir = get_client_inode(dir); + struct dentry *client_dentry = get_client_dentry(dentry); + struct inode *inode = dentry->d_inode; + int err; + + printk(KERN_ERR "%s()\n", __FUNCTION__); + err = client_dir->i_op->rmdir(client_dir, client_dentry); + if (err) + return err; + chunkfs_copy_up_inode(dir, client_dir); + chunkfs_copy_up_inode(inode, client_dentry->d_inode); + return 0; +} + +static int +chunkfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +{ /* Create a special file via the client directory's ->mknod() and attach a new chunkfs inode to dentry. */ + struct inode *client_dir = get_client_inode(dir); + struct dentry *client_dentry = get_client_dentry(dentry); + u64 chunk_id = UINO_TO_CHUNK_ID(dir->i_ino); + struct inode *inode; + int err; + + printk(KERN_ERR "%s(): name %s dir ino %0lx i_count %d\n", + __FUNCTION__, dentry->d_iname, dir->i_ino, + atomic_read(&dir->i_count)); + + err = chunkfs_new_inode(dir->i_sb, &inode); + if (err) + goto out; + + err = client_dir->i_op->mknod(client_dir, client_dentry, mode, dev); + if (err) + goto out_inode; + + err = chunkfs_init_cont_data(client_dentry); + if
(err) + goto out_inode; + chunkfs_start_inode(inode, client_dentry->d_inode, chunk_id); + chunkfs_copy_up_inode(dir, client_dir); + d_instantiate(dentry, inode); + + return 0; + out_inode: + iput(inode); /* drop the chunkfs inode allocated above */ + out: + printk(KERN_ERR "%s(): name %s returning %d\n", + __FUNCTION__, dentry->d_iname, err); + return err; +} + +static int +chunkfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ /* Rename is not supported: always returns -ENOSYS. The pass-through code after the return is kept but never runs. */ + struct inode *client_old_dir = get_client_inode(old_dir); + struct inode *client_new_dir = get_client_inode(new_dir); + struct dentry *client_old_dentry = get_client_dentry(old_dentry); + struct dentry *client_new_dentry = get_client_dentry(new_dentry); + int err = 0; + + return -ENOSYS; + /* Not reached */ + err = client_old_dir->i_op->rename(client_old_dir, + client_old_dentry, + client_new_dir, + client_new_dentry); + if (err) + goto out; + chunkfs_copy_up_inode(old_dir, client_old_dir); + chunkfs_copy_up_inode(new_dir, client_new_dir); + out: + return err; +} + +struct inode_operations chunkfs_dir_iops = { /* operations for chunkfs directories */ + .create = chunkfs_create, + .lookup = chunkfs_lookup, + .link = chunkfs_link, + .unlink = chunkfs_unlink, + .symlink = chunkfs_symlink, + .mkdir = chunkfs_mkdir, + .rmdir = chunkfs_rmdir, + .mknod = chunkfs_mknod, + .rename = chunkfs_rename, + .setattr = chunkfs_setattr, + .permission = chunkfs_permission, +}; + +struct inode_operations chunkfs_special_iops = { /* only setattr and permission are provided here */ + .setattr = chunkfs_setattr, + .permission = chunkfs_permission, +}; diff --git a/start_uml.sh b/start_uml.sh new file mode 100644 index 0000000..c06bd21 --- /dev/null +++ b/start_uml.sh @@ -0,0 +1,21 @@ +#!/bin/bash -x +# +# Start UML. The real test script is run from inside the UML +# partition; I suggest using hostfs to mount it.
+# +ROOT=/home/val/root_fs_philips + +# Set up some gdb commands + +cat > /tmp/gdb_commands < + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "chunkfs.h" +#include "chunkfs_pool.h" +#include "chunkfs_dev.h" +#include "chunkfs_chunk.h" +#include "chunkfs_i.h" + +static struct inode *chunkfs_alloc_inode(struct super_block *sb) +{ + /* XXX Make a kmem_cache */ + struct chunkfs_inode_info *ii; + struct inode *inode; + + ii = kzalloc(sizeof (*ii), GFP_KERNEL); + if (!ii) + return NULL; + /* XXX should be done in cache constructor */ + spin_lock_init(&ii->ii_continuations_lock); + /* Don't load head continuation until file open */ + inode = &ii->ii_vnode; + inode_init_once(inode); + inode->i_version = 1; + + return inode; +} + +static void chunkfs_destroy_inode(struct inode *inode) +{ + struct chunkfs_inode_info *ii = CHUNKFS_I(inode); + + printk(KERN_ERR "%s(): ino %0lx i_count %d\n", __FUNCTION__, + inode->i_ino, atomic_read(&inode->i_count)); + + kfree(ii); +} + +static void chunkfs_clear_inode(struct inode *inode) +{ + struct chunkfs_inode_info *ii = CHUNKFS_I(inode); + + printk(KERN_ERR "%s(): ino %0lx i_count %d\n", + __FUNCTION__, inode->i_ino, atomic_read(&inode->i_count)); + iput(ii->ii_client_inode); +} + +static int +chunkfs_read_client_sb(struct chunkfs_chunk_info *ci) +{ + /* XXX XXX XXX There aren't enough XXX's in the world XXX XXX */ + char *path_prefix = "/chunk"; + char mount_path[strlen(path_prefix + 10)]; + struct nameidata nd; + int retval; + + /* + * Userland has kindly mounted our client fs's in particular + * locations. Look up the path and grab the superblock for + * each chunk. 
+ * + * XXX Yuckity yuckity yuck yuck + */ + sprintf(mount_path, "%s%llu", path_prefix, ci->ci_chunk_id); + retval = path_lookup(mount_path, LOOKUP_FOLLOW, &nd); + if (retval) { + printk(KERN_ERR "path_lookup for %s failed: %d\n", + mount_path, retval); + return retval; + } + /* XXX locking XXX prevent unmount XXX ref count XXX XXX */ + ci->ci_mnt = mntget(nd.mnt); + ci->ci_sb = nd.mnt->mnt_sb; + path_release(&nd); + + return 0; +} + +/* Return the chunk with the given id from any device in the pool, or NULL. */ +struct chunkfs_chunk_info * +chunkfs_find_chunk(struct chunkfs_pool_info *pi, u64 chunk_id) +{ + struct chunkfs_dev_info *di; + struct chunkfs_chunk_info *ci; + + list_for_each_entry(di, &pi->pi_dlist_head, di_dlist) { + list_for_each_entry(ci, &di->di_clist_head, ci_clist) { + if(ci->ci_chunk_id == chunk_id) + return ci; + } + } + return NULL; +} + +/* Release the chunk's buffer head and client mount, then free it. */ +static void chunkfs_free_chunk(struct chunkfs_chunk_info *ci) +{ + brelse(ci->ci_bh); + mntput(ci->ci_mnt); + kfree(ci); +} + +/* Free every chunk on the device's list, then the device itself. */ +static void chunkfs_free_dev(struct chunkfs_dev_info *di) +{ + struct chunkfs_chunk_info *ci, *ci_next; + + list_for_each_entry_safe(ci, ci_next, &di->di_clist_head, ci_clist) { + list_del(&ci->ci_clist); + chunkfs_free_chunk(ci); + } + brelse(di->di_bh); + kfree(di); +} + +/* Free every device on the pool's list, then the pool itself. */ +static void chunkfs_free_pool(struct chunkfs_pool_info *pi) +{ + struct chunkfs_dev_info *di, *di_next; + + list_for_each_entry_safe(di, di_next, &pi->pi_dlist_head, di_dlist) { + list_del(&di->di_dlist); + chunkfs_free_dev(di); + } + brelse(pi->pi_bh); + kfree(pi); +} + +/* + * Read and check the chunk summary at byte offset chunk_offset, fill + * in a new chunk_info and look up its client fs. On success stores + * the new chunk in *chunk_info and the next chunk's offset in + * *next_chunk_offset and returns 0; otherwise returns a negative errno. + */ +static int chunkfs_read_chunk(struct super_block *sb, + struct chunkfs_dev_info *dev, + struct chunkfs_chunk_info **chunk_info, + ci_byte_t chunk_offset, + ci_byte_t *next_chunk_offset) +{ + struct chunkfs_chunk_info *ci; + struct chunkfs_chunk *chunk; + struct buffer_head *bh; + int retval = -EIO; + int err; + + ci = kzalloc(sizeof(*ci), GFP_KERNEL); + if (!ci) + return -ENOMEM; + + /* XXX assumes offset is multiple of underlying block size */ + + if (!(bh = sb_bread(sb, chunk_offset/CHUNKFS_BLK_SIZE))) { + printk (KERN_ERR
"chunkfs: unable to read chunk summary at %llu\n", + chunk_offset); + goto out_nobh; + } + + ci->ci_bh = bh; + chunk = CHUNKFS_CHUNK(ci); + + if ((err = check_chunk(chunk)) != 0) { + printk (KERN_ERR "chunkfs: Invalid chunk summary, err %d, chksum %0x\n", + err, le32_to_cpu(chunk->c_chksum)); + goto out; + } + + /* Fill in on-disk info: on-disk fields are converted to CPU order */ + ci->ci_flags = le64_to_cpu(chunk->c_flags); + *next_chunk_offset = le64_to_cpu(chunk->c_next_chunk); + ci->ci_chunk_id = le64_to_cpu(chunk->c_chunk_id); + memcpy(ci->ci_client_fs, chunk->c_client_fs, CHUNKFS_CLIENT_NAME_LEN); + + /* Init non-disk stuff */ + ci->ci_dev = dev; + + /* Mount the client file system */ + retval = chunkfs_read_client_sb(ci); + if (retval) + goto out; + + *chunk_info = ci; + return 0; + out: + brelse(bh); + ci->ci_bh = NULL; + out_nobh: + kfree(ci); + BUG_ON(retval == 0); + return retval; +} + +/* + * Read and check the dev summary, then read every chunk on the + * device by following the chain of next-chunk offsets until it is 0. + */ +static int chunkfs_read_dev(struct super_block *sb, + struct chunkfs_pool_info *pool_info, + struct chunkfs_dev_info **dev_info) +{ + struct chunkfs_dev_info *di; + struct chunkfs_dev *dev; + struct buffer_head * bh; + struct chunkfs_chunk_info *ci, *ci_next; + ci_byte_t chunk_offset, next_chunk_offset; + int retval = -EIO; + int err; + + di = kzalloc(sizeof(*di), GFP_KERNEL); + if (!di) + return -ENOMEM; + + /* XXX assumes sb offset is multiple of underlying block size */ + + if (!(bh = sb_bread(sb, CHUNKFS_DEV_BLK))) { + printk (KERN_ERR "chunkfs: unable to read dev summary\n"); + goto out_nobh; + } + + di->di_bh = bh; + dev = CHUNKFS_DEV(di); + + if ((err = check_dev(dev)) != 0) { + printk (KERN_ERR "chunkfs: Invalid dev summary err %d chksum %0x\n", + err, le32_to_cpu(dev->d_chksum)); + goto out_bh; + } + /* Fill in on-disk info: on-disk fields are converted to CPU order */ + di->di_flags = le64_to_cpu(dev->d_flags); + chunk_offset = le64_to_cpu(dev->d_innards_begin); + + /* Init non-disk stuff */ + INIT_LIST_HEAD(&di->di_clist_head); + di->di_pool = pool_info; + + /* XXX would like to sanity check dev size here */ + + while (chunk_offset != 0) { +
retval = chunkfs_read_chunk(sb, di, &ci, chunk_offset, + &next_chunk_offset); + if (retval) + goto out_free_chunks; + list_add_tail(&ci->ci_clist, &di->di_clist_head); + if (CHUNKFS_IS_ROOT(ci)) { + BUG_ON(di->di_pool->pi_root_dev); + di->di_pool->pi_root_dev = di; + BUG_ON(di->di_root_chunk); + di->di_root_chunk = ci; + } + chunk_offset = next_chunk_offset; + } + + /* Did we find root? */ + if (!di->di_root_chunk) { + printk(KERN_ERR "chunkfs: did not find root\n"); + goto out_free_chunks; + } + *dev_info = di; + return 0; + out_free_chunks: + list_for_each_entry_safe(ci, ci_next, &di->di_clist_head, ci_clist) { + list_del(&ci->ci_clist); + chunkfs_free_chunk(ci); + } + out_bh: + brelse(bh); + di->di_bh = NULL; + out_nobh: + kfree(di); + return retval; +} + +static int chunkfs_read_pool(struct super_block *sb, + struct chunkfs_pool_info **pool_info) +{ + struct chunkfs_pool_info *pi; + struct chunkfs_pool *pool; + struct buffer_head * bh; + struct chunkfs_dev_info *di; + int retval = -EIO; + int err; + + pi = kzalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) + return -ENOMEM; + + /* XXX assumes sb offset is multiple of underlying block size */ + + if (!(bh = sb_bread(sb, CHUNKFS_POOL_BLK))) { + printk (KERN_ERR "chunkfs: unable to read pool summary\n"); + goto out_nobh; + } + + pi->pi_bh = bh; + pool = CHUNKFS_POOL(pi); + + if ((err = check_pool(pool)) != 0) { + printk (KERN_ERR "chunkfs: Invalid pool summary, err %d chksum %0x magic %0x\n", + err, le32_to_cpu(pool->p_chksum), le32_to_cpu(pool->p_magic)); + goto out; + } + /* Fill in on-disk info */ + pi->pi_flags = cpu_to_le64(pool->p_flags); + + /* Init non-disk stuff */ + INIT_LIST_HEAD(&pi->pi_dlist_head); + + /* XXX read multiple devs */ + /* For now, we just read at a particular offset on this dev */ + retval = chunkfs_read_dev(sb, pi, &di); + if (retval) + goto out; + list_add_tail(&di->di_dlist, &pi->pi_dlist_head); + + *pool_info = pi; + return 0; + out: + brelse(bh); + pi->pi_bh = NULL; + out_nobh: + 
kfree(pi); + return retval; +} + +static void chunkfs_commit_super (struct super_block *sb, int sync) +{ /* Mark the pool summary buffer dirty and, if sync is set, write it out synchronously. */ + struct buffer_head *sbh = CHUNKFS_PI(sb)->pi_bh; + + if (!sbh) + return; /* no pool summary buffer: nothing to write */ + mark_buffer_dirty(sbh); + if (sync) + sync_dirty_buffer(sbh); +} + +static void chunkfs_put_super (struct super_block *sb) +{ /* Write the pool summary back unless mounted read-only, then free the pool. */ + struct chunkfs_pool_info *pi = CHUNKFS_PI(sb); + + if (!(sb->s_flags & MS_RDONLY)) { + /* XXX should mark super block as clean unmounted */ + chunkfs_commit_super(sb, 1); + } + chunkfs_free_pool(pi); + sb->s_fs_info = NULL; + + return; +} + +static void +chunkfs_write_super (struct super_block * sb) +{ + if (mutex_trylock(&sb->s_lock) != 0) /* NOTE(review): presumably asserts that the caller already holds s_lock - confirm */ + BUG(); + chunkfs_commit_super(sb, 1); + sb->s_dirt = 0; +} + + +static struct super_operations chunkfs_sops = { + .alloc_inode = chunkfs_alloc_inode, + .destroy_inode = chunkfs_destroy_inode, + .read_inode = chunkfs_read_inode, + .write_inode = chunkfs_write_inode, +#if 0 /* XXX Totally unimplemented at present */ + .dirty_inode = chunkfs_dirty_inode, + .delete_inode = chunkfs_delete_inode, +#endif + .put_super = chunkfs_put_super, + .write_super = chunkfs_write_super, +#if 0 + .sync_fs = chunkfs_sync_fs, + .write_super_lockfs = chunkfs_write_super_lockfs, + .unlockfs = chunkfs_unlockfs, + .statfs = chunkfs_statfs, + .remount_fs = chunkfs_remount, +#endif + .clear_inode = chunkfs_clear_inode, +#if 0 + .show_options = chunkfs_show_options, +#endif +}; + +/* + * The file system in the root chunk has already been mounted, so the + * chunk root inode is already loaded and stored in the superblock. + * However, we really want to have the root directory in terms of the + * chunkfs namespace, which is presently named "/root" and inode 12.
+ */ + +static int chunkfs_read_root(struct super_block *sb) +{ + struct chunkfs_chunk_info *ci = CHUNKFS_PI(sb)->pi_root_dev->di_root_chunk; + ino_t ino = MAKE_UINO(ci->ci_chunk_id, 12); /* XXX */ + struct inode *inode; + struct nameidata nd; + struct dentry *dentry; + int retval; + + inode = iget(sb, ino); + sb->s_root = d_alloc_root(inode); + if (!sb->s_root) { + retval = -ENOMEM; + goto out_iput; + } + retval = chunkfs_init_dentry(sb->s_root); + if (retval) + goto out_dput; + retval = path_lookup("/chunk1/root/", LOOKUP_FOLLOW, &nd); + if (retval) + goto out_dentry; + dentry = dget(nd.dentry); + chunkfs_init_nd(inode, sb->s_root, dentry, ci->ci_chunk_id); + chunkfs_add_dentry(sb->s_root, dentry, nd.mnt); + path_release(&nd); + return 0; + out_dentry: + chunkfs_free_dentry(sb->s_root); + out_dput: + /* + * The root dentry owns the inode reference taken by iget(), so + * dput() releases it. Clear both pointers so that neither the + * iput() below nor the VFS teardown touches them again. + */ + dput(sb->s_root); + sb->s_root = NULL; + inode = NULL; + out_iput: + iput(inode); + printk(KERN_ERR "%s() path lookup failed\n", __FUNCTION__); + return retval; +} + +/* + * chunkfs_setup_super does all things that are shared between mount + * and remount. At moment, I'm not sure what they are. + */ + +static int chunkfs_setup_super(struct super_block *sb, + struct chunkfs_pool_info *pi, + int read_only) +{ + return 0; +} + +/* + * Get the superblock off the disk and check to see if it is sane. + * + * Note that VFS code has a generic routine to find alternate superblocks. + * + * XXX todo, put dev summary copies in chunk summaries. + * + * Returns 0 on success or a negative errno. On failure nothing + * allocated here is left attached to sb. + */ + +static int chunkfs_fill_super (struct super_block *sb, void *data, int silent) +{ + struct chunkfs_pool_info *pi; + int retval = -EINVAL; + + unlock_kernel(); + + printk(KERN_ERR "%s\n", __FUNCTION__); + + /* We must set blocksize before we can read blocks.
*/ + + if (sb_set_blocksize(sb, CHUNKFS_BLK_SIZE) == 0) + goto out; + + retval = chunkfs_read_pool(sb, &pi); + if (retval) + goto out; + sb->s_fs_info = pi; + + sb->s_maxbytes = ~0ULL; + sb->s_op = &chunkfs_sops; + + retval = chunkfs_read_root(sb); + if (retval) + goto out_pool; + /* If fail after this, dput sb->s_root */ + + chunkfs_setup_super (sb, pi, sb->s_flags & MS_RDONLY); + + printk(KERN_ERR "chunkfs: mounted file system\n"); + lock_kernel(); + return 0; + out_pool: + /* + * chunkfs_read_root() has already dropped any root dentry it + * created. With s_root NULL the VFS does not call ->put_super(), + * so the pool has to be freed here. + */ + sb->s_root = NULL; + chunkfs_free_pool(pi); + sb->s_fs_info = NULL; + out: + lock_kernel(); + BUG_ON(retval == 0); + printk(KERN_ERR "%s() failed! err %d\n", __FUNCTION__, retval); + return retval; +} + +static int chunkfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) +{ + return get_sb_bdev(fs_type, flags, dev_name, data, chunkfs_fill_super, mnt); /* block-device mount; chunkfs_fill_super sets up the superblock */ +} + +static struct file_system_type chunkfs_fs_type = { + .owner = THIS_MODULE, + .name = "chunkfs", + .get_sb = chunkfs_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_chunkfs_fs(void) +{ + int err = register_filesystem(&chunkfs_fs_type); + if (!err) + printk(KERN_INFO "chunkfs (C) 2007 Valerie Henson " + "\n"); + return err; +} + +static void __exit exit_chunkfs_fs(void) +{ + unregister_filesystem(&chunkfs_fs_type); +} + +MODULE_AUTHOR("Val Henson"); +MODULE_DESCRIPTION("Chunkfs"); +MODULE_LICENSE("GPL"); +module_init(init_chunkfs_fs) +module_exit(exit_chunkfs_fs) diff --git a/symlink.c b/symlink.c new file mode 100644 index 0000000..a650e53 --- /dev/null +++ b/symlink.c @@ -0,0 +1,62 @@ +/* + * Chunkfs symlinks + * + * (C) 2007-2008 Valerie Henson + */ + +#include "chunkfs.h" +#include "chunkfs_i.h" + +static int +chunkfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + struct inode *client_inode = get_client_inode(dentry->d_inode); + struct dentry *client_dentry = get_client_dentry(dentry); + int err; + + printk (KERN_ERR "%s()\n", __FUNCTION__); + + err =
client_inode->i_op->readlink(client_dentry, buffer, buflen); /* pass straight through to the client inode's ->readlink() */ + + return err; +} + +static void * +chunkfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ /* Call the client inode's ->follow_link() with the client nameidata, syncing nd down before and up after; returns the client's cookie. */ + struct inode *client_inode = get_client_inode(dentry->d_inode); + struct dentry *client_dentry = get_client_dentry(dentry); + struct nameidata *client_nd = get_client_nd(dentry); + void *cookie; + + printk (KERN_ERR "%s()\n", __FUNCTION__); + + chunkfs_copy_down_nd(nd, client_nd); + + cookie = client_inode->i_op->follow_link(client_dentry, client_nd); + + chunkfs_copy_up_nd(nd, client_nd); + + return cookie; +} + +static void +chunkfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +{ /* Forward to the client inode's ->put_link() when the client provides one. */ + struct inode *client_inode = get_client_inode(dentry->d_inode); + struct dentry *client_dentry = get_client_dentry(dentry); + struct nameidata *client_nd = get_client_nd(dentry); + + printk (KERN_ERR "%s()\n", __FUNCTION__); + if (client_inode->i_op->put_link) { + chunkfs_copy_down_nd(nd, client_nd); + client_inode->i_op->put_link(client_dentry, client_nd, cookie); + chunkfs_copy_up_nd(nd, client_nd); + } +} + +struct inode_operations chunkfs_symlink_iops = { + .readlink = chunkfs_readlink, + .follow_link = chunkfs_follow_link, + .put_link = chunkfs_put_link, +}; diff --git a/test.sh b/test.sh new file mode 100644 index 0000000..1364729 --- /dev/null +++ b/test.sh @@ -0,0 +1,114 @@ +#!/bin/bash -x +# +# Test script for chunkfs, to be run from inside UML. +# + +# Name of the file backing the loop device +FILE=/loop/disk0 +# This is where the chunkfs user binaries are located. +BINPATH=/chunkfs_bin +MNT=/mnt + +# Clean up from last iteration if necessary (chunkfs is mounted from /dev/loop0 by the mount_chunkfs call further down). + +MOUNTED=`cat /proc/mounts | grep "/dev/loop0 ${MNT} chunkfs"` +if [ -n "$MOUNTED" ]; then + umount ${MNT} + if [ "$?"
!= "0" ]; then + echo "umount failed" + exit 1 + fi +fi + +# Tear down old mounts and loop devices: walk /dev/loop0, /dev/loop1, ... until losetup fails on one +loop_num=$((0)) +while (losetup /dev/loop$loop_num); do + umount /chunk${loop_num} + losetup -d /dev/loop$loop_num + loop_num=$((loop_num + 1)) +done + +${BINPATH}/write_pattern ${FILE} +if [ "$?" != "0" ]; then + echo "write_pattern failed" + exit 1 +fi + +${BINPATH}/mkfs.chunkfs ${FILE} > /tmp/offsetlist +if [ "$?" != "0" ]; then + echo "mkfs.chunkfs failed" + exit 1 +fi + +# Create primary loop device (covers the whole backing file) +losetup /dev/loop0 ${FILE} +if [ "$?" != "0" ]; then + echo "Create loop device failed" + exit 1 +fi + +# XXX Wow, like, such a hack. Set up a bunch of block devices +# starting at different offsets in order to create ext2 file systems +# inside chunks. losetup doesn't include an end argument, so there's +# no protection against one file system having a bug that scribbles +# over the following file systems. Also, mkfs should do this +# directly. +# +# XXX More hackery. Mount all our client fs's so that chunkfs kernel +# side can lookup the path and grab the superblocks. +# The offsets are the third field of the "clientfs: start" lines that mkfs.chunkfs wrote to /tmp/offsetlist. + +OFFSETS="`awk '/clientfs: start/ {print $3}' /tmp/offsetlist`" +loop_num=$((1)) +for offset in ${OFFSETS}; do + losetup -o $offset /dev/loop$loop_num ${FILE} + mke2fs -b 4096 /dev/loop$loop_num 2559 > /dev/null + mkdir -p /chunk${loop_num} + mount -t ext2 -o user_xattr /dev/loop${loop_num} /chunk${loop_num} + if [ "$?" != "0" ]; then + echo "mount client fs failed" + exit 1 + fi + # Only the root chunk has the root directory + if [ "$loop_num" == "1" ]; then + mkdir -p /chunk${loop_num}/root + # Create continuation data + # Hackity hack. Just create it on /root if it doesn't already + # exist.
+ /usr/local/usr/bin/setfattr -n user.next -v 0 /chunk${loop_num}/root + /usr/local/usr/bin/setfattr -n user.prev -v 0 /chunk${loop_num}/root + /usr/local/usr/bin/setfattr -n user.start -v 0 /chunk${loop_num}/root + /usr/local/usr/bin/setfattr -n user.len -v 40960 /chunk${loop_num}/root + fi + for i in 0 1 2 3 4 5 6 7 8 9 private; do + mkdir -p /chunk${loop_num}/$i + done + loop_num=$((loop_num + 1)) +done + +${BINPATH}/mount_chunkfs /dev/loop0 ${MNT} +if [ "$?" != "0" ]; then + echo "mount_chunkfs failed" + exit 1 +fi + +# Now test a few more things (create, write, read, unlink, mkdir, rmdir, symlink, then a larger file written with dd) + +ls /mnt +touch /mnt/a_file +touch /mnt/another_file +echo "file data" > /mnt/a_file_with_data +cat /mnt/a_file_with_data +rm /mnt/another_file +mkdir /mnt/a_dir +mkdir /mnt/a_dir/a_dir +mkdir /mnt/a_dir/another_dir +rmdir /mnt/a_dir/another_dir +ln -s /mnt/a_file /mnt/a_symlink +ls -l /mnt/a_symlink +cat /mnt/a_symlink +dd if=/dev/zero of=/mnt/big bs=4096 count=11 +ls -l /mnt/big +ls -l /chunk1/root/big +ls -l /chunk2/1/29 + +exit 0 diff --git a/write_pattern.c b/write_pattern.c new file mode 100644 index 0000000..f0c66b2 --- /dev/null +++ b/write_pattern.c @@ -0,0 +1,61 @@ +/* + * Write a pattern to a file.
+ * + * (C) 2007 Valerie Henson + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define FILE_SIZE (32 * 1024 * 1024) + +static char * cmd; /* argv[0], used in the usage message */ + +static void usage (void) +{ /* Print the usage line to stderr and exit with status 1. */ + fprintf(stderr, "Usage: %s \n", cmd); + exit(1); +} + +int main (int argc, char * argv[]) +{ /* Fill the file named by argv[1] with the character '5' until at least FILE_SIZE bytes have been written. */ + int fd; + char * file; + char buf[4096]; + int n = 0; + int written = 0; + + cmd = argv[0]; + + if (argc != 2) + usage(); + + file = argv[1]; + + if ((fd = open(file, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR)) < 0) { + error(0, errno, "Cannot open file %s", file); + usage(); /* does not return: usage() exits */ + } + + memset(buf, '5', sizeof(buf)); + + while (written < FILE_SIZE) { + n = write(fd, buf, sizeof(buf)); /* may write fewer bytes than requested; the count actually written is added below */ + if (n == -1) { + error(0, errno, "Cannot write file %s", file); + usage(); + } + written += n; + } + + close(fd); + + return 0; +}