From 35c59d50f04c89b030908f88bf6d8a4456b3aa61 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Tue, 21 Jan 2014 01:18:47 +0400
Subject: [PATCH] chunkfs: implementation of chunkfs by Valerie Henson

From: http://valerieaurora.org/chunkfs/lca_release.diff

$ patch -p1 < patches/chunkfs_lca_release.diff
patching file fs/Kconfig
Hunk #1 FAILED at 272.
1 out of 1 hunk FAILED -- saving rejects to file fs/Kconfig.rej
patching file fs/Makefile
Hunk #1 FAILED at 73.
1 out of 1 hunk FAILED -- saving rejects to file fs/Makefile.rej
patching file fs/chunkfs/Makefile
patching file fs/chunkfs/chunkfs.h
patching file fs/chunkfs/mkfs.chunkfs.c
patching file fs/chunkfs/super.c
can't find file to patch at input line 981
Perhaps you used the wrong -p or --strip option?
The text leading up to this was:
--------------------------
|--- chunkfs-2.6-mm-linux.orig/include/linux/magic.h
|+++ chunkfs-2.6-mm-linux/include/linux/magic.h
--------------------------
File to patch: include/uapi/linux/magic.h
patching file include/uapi/linux/magic.h
Hunk #1 succeeded at 5 with fuzz 2.
patching file fs/chunkfs/README
patching file fs/chunkfs/chunkfs_chunk.h
patching file fs/chunkfs/chunkfs_dev.h
patching file fs/chunkfs/chunkfs_i.h
patching file fs/chunkfs/chunkfs_pool.h
patching file fs/chunkfs/dir.c
patching file fs/chunkfs/file.c
patching file fs/chunkfs/fsck.chunkfs.c
patching file fs/chunkfs/gaze.sh
patching file fs/chunkfs/inode.c
patching file fs/chunkfs/namei.c
patching file fs/chunkfs/start_uml.sh
patching file fs/chunkfs/symlink.c
patching file fs/chunkfs/test.sh
patching file fs/chunkfs/write_pattern.c
patching file fs/namespace.c
Hunk #1 FAILED at 340.
Hunk #2 FAILED at 351.
2 out of 2 hunks FAILED -- saving rejects to file fs/namespace.c.rej
patching file include/linux/mount.h
Hunk #1 FAILED at 67.
1 out of 1 hunk FAILED -- saving rejects to file include/linux/mount.h.rej
patching file fs/chunkfs/cont.c
patching file fs/chunkfs/cross.sh
patching file fs/chunkfs/demo.sh
---
 .gitignore      |  97 +++++++++
 Makefile        |  16 ++
 README          |  70 +++++++
 chunkfs.h       | 204 ++++++++++++++++++
 chunkfs_chunk.h |  85 ++++++++
 chunkfs_dev.h   |  66 ++++++
 chunkfs_i.h     | 113 ++++++++++
 chunkfs_pool.h  |  76 +++++++
 cont.c          | 442 +++++++++++++++++++++++++++++++++++++++
 cross.sh        |  33 +++
 demo.sh         |  55 +++++
 dir.c           |  68 ++++++
 file.c          | 338 ++++++++++++++++++++++++++++++
 fsck.chunkfs.c  | 212 +++++++++++++++++++
 gaze.sh         |  27 +++
 inode.c         | 173 ++++++++++++++++
 mkfs.chunkfs.c  | 192 +++++++++++++++++
 namei.c         | 538 ++++++++++++++++++++++++++++++++++++++++++++++++
 start_uml.sh    |  21 ++
 super.c         | 526 ++++++++++++++++++++++++++++++++++++++++++++++
 symlink.c       |  62 ++++++
 test.sh         | 114 ++++++++++
 write_pattern.c |  61 ++++++
 23 files changed, 3589 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Makefile
 create mode 100644 README
 create mode 100644 chunkfs.h
 create mode 100644 chunkfs_chunk.h
 create mode 100644 chunkfs_dev.h
 create mode 100644 chunkfs_i.h
 create mode 100644 chunkfs_pool.h
 create mode 100644 cont.c
 create mode 100644 cross.sh
 create mode 100644 demo.sh
 create mode 100644 dir.c
 create mode 100644 file.c
 create mode 100644 fsck.chunkfs.c
 create mode 100644 gaze.sh
 create mode 100644 inode.c
 create mode 100644 mkfs.chunkfs.c
 create mode 100644 namei.c
 create mode 100644 start_uml.sh
 create mode 100644 super.c
 create mode 100644 symlink.c
 create mode 100644 test.sh
 create mode 100644 write_pattern.c

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ed11b70
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,97 @@
+# specific files
+mkfs.chunkfs
+write_pattern
+
+#
+# NOTE! Don't add files that are generated in specific
+# subdirectories here. Add them in the ".gitignore" file
+# in that subdirectory instead.
+#
+# NOTE! Please use 'git ls-files -i --exclude-standard'
+# command after changing this file, to see if there are
+# any tracked files which get ignored after the change.
+#
+# Normal rules
+#
+.*
+*.o
+*.o.*
+*.a
+*.s
+*.ko
+*.so
+*.so.dbg
+*.mod.c
+*.i
+*.lst
+*.symtypes
+*.order
+modules.builtin
+*.elf
+*.bin
+*.gz
+*.bz2
+*.lzma
+*.xz
+*.lzo
+*.patch
+*.gcno
+
+#
+# Top-level generic files
+#
+/tags
+/TAGS
+/linux
+/vmlinux
+/vmlinuz
+/System.map
+/Module.markers
+/Module.symvers
+
+#
+# Debian directory (make deb-pkg)
+#
+/debian/
+
+#
+# git files that we don't want to ignore even it they are dot-files
+#
+!.gitignore
+!.mailmap
+
+#
+# Generated include files
+#
+include/config
+include/generated
+arch/*/include/generated
+
+# stgit generated dirs
+patches-*
+
+# quilt's files
+patches
+series
+
+# cscope files
+cscope.*
+ncscope.*
+
+# gnu global files
+GPATH
+GRTAGS
+GSYMS
+GTAGS
+
+*.orig
+*~
+\#*#
+
+#
+# Leavings from module signing
+#
+extra_certificates
+signing_key.priv
+signing_key.x509
+x509.genkey
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..f0fa677
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,16 @@
+#
+# Makefile for chunkfs.
+#
+
+obj-$(CONFIG_CHUNK_FS) += chunkfs.o
+
+chunkfs-y := super.o inode.o dir.o file.o namei.o symlink.o cont.o
+
+#
+# Temporarily keep utilities in this dir too.
+#
+HOST_EXTRACFLAGS += -I$(src)/../../include -static
+
+hostprogs-$(CONFIG_CHUNK_FS) := mkfs.chunkfs mount_chunkfs write_pattern
+
+always := $(hostprogs-y) $(hostprogs-m)
diff --git a/README b/README
new file mode 100644
index 0000000..e2bc180
--- /dev/null
+++ b/README
@@ -0,0 +1,70 @@
+Chunkfs README
+
+Val Henson
+
+Summary
+-------
+
+Chunkfs is an experimental local file system designed to quickly
+recover from file system corruption. Each file system is divided up
+into many small chunks, each of which can be checked and repaired with
+very few references to other chunks. In most cases, only a small part
+of the file system must be checked and repaired before it can be
+brought back online, requiring minutes instead of hours of downtime to
+recover from a file system error.
+
+Status
+------
+
+Chunkfs development began in February 2007. Growing a file into two
+chunks is supported but not much else.
+
+See the project web site for the current status:
+
+http://chunkfs.org
+
+License
+-------
+
+Chunkfs is licensed under the GNU General Public License version 2.
+Chunkfs is released as a patch against the Linux kernel, which contains
+a copy of the GPLv2.
+
+Funding
+-------
+
+Development of chunkfs was funded by:
+
+Intel
+EMC Centera
+VAH Consulting
+
+How it works
+------------
+
+Each file system is divided up into many small chunks. Each chunk is
+marked as dirty or clean. Things that make a chunk dirty are
+in-progress writes to metadata (creat(), chmod(), extending a file,
+etc.), I/O errors reported by the disk, and any data integrity errors
+observed by the file system code (bad checksums, wrong magic number,
+etc.). At mount time, each dirty chunk is checked with fsck and
+repaired if necessary, with limited references to other chunks. Clean
+chunks are not checked unless specifically requested.
+
+The metadata inside a chunk is structured so that nearly all
+references are within the chunk only. No block pointers or hard links
+may cross chunk boundaries. Only one kind of reference crosses the
+chunk boundary: inode continuations. When an inode needs to grow out
+of a chunk (either its data has outgrown the free space or we want to
+link to an inode outside its chunk), we allocate a new inode in the
+appropriate chunk and link the two inodes together with forward and
+back pointers, creating a continuation. Logically, the two inodes are
+parts of a single file or directory. When checking a chunk containing
+a continuation inode, the forward and back pointers allow us to
+quickly find the relevant information in any other chunks, without
+reading all the metadata in the entire chunk. Each chunk also keeps a
+bitmap of all inodes with continuations, as in some circumstances all
+continuations must be checked.
+
+For more information, see the documentation section on the project web
+site.
diff --git a/chunkfs.h b/chunkfs.h
new file mode 100644
index 0000000..2fb5806
--- /dev/null
+++ b/chunkfs.h
@@ -0,0 +1,204 @@
+/*
+ * Chunkfs header file
+ *
+ * (C) 2007-2008 Valerie Henson
+ *
+ */
+
+#ifndef _LINUX_CHUNKFS_FS_H
+#define _LINUX_CHUNKFS_FS_H
+
+/* XXX Do above _H stuff for other header files */
+
+#include
+#include
+#include
+
+/*
+ * NOTE: Most on disk structures need:
+ *
+ * Magic number (32 bit is really big but easy)
+ * Checksum (32 bit for some kind of sanity)
+ *
+ * These go first (magic, then checksum) in all on-disk structures so
+ * that even if we have the type of the structure wrong, we're checking
+ * the correct spot.
+ * + * XXX File system generation number should be included - perhaps high + * 16 of magic? + */ + +/* XXX Should have magic, checksum, version, and generation in one struct */ + +/* + * Some useful typedefs to possibly prevent endian mixups. Use c_* + * for on disk, ci_* for in memory. + */ + +typedef __le64 c_blk_t; +typedef __u64 ci_blk_t; +typedef __le64 c_byte_t; +typedef __u64 ci_byte_t; +typedef __le64 c_inode_num_t; +typedef __u64 ci_inode_num_t; + +/* + * XXX Block size shouldn't have much meaning, and it will probably + * vary by chunk. Figure out what Linux VFS thinks this means. + * + * I think this means that when you use sb_read(), this is the block + * size used. + */ + +#define CHUNKFS_BLK_SIZE 4096 +#define CHUNKFS_BLK_BITS 12 + +/* + * Rev me! Lots! Whenever on-disk structures change! Mainly for + * development. + * + * Note that 0 is never acceptable. + */ + +#define CHUNKFS_VERSION 1 + +/* + * XXX On-disk structures probably aren't correctly padded at any + * given moment in time. + */ + +/* + * Locating a device has two parts. First, we try a cached path name + * which is a hint only, since paths may change. Then we check for + * the correct UUID; if it is wrong, we go search each device. + */ + +#define CHUNKFS_DEV_PATH_LEN 1024 + +struct chunkfs_dev_desc { + /* + * The path of the device when last opened. It may have + * changed, therefore it is only a hint. + */ + char d_hint[CHUNKFS_DEV_PATH_LEN]; + /* Is this the device we're looking for? */ + __le64 d_uuid; +}; + +/* + * Dummy struct to force us to check the "official" position of the + * checksum and magic number (at the beginning of the struct). + * + * XXX Actually use this struct in other structs. Never access + * directly. + */ + +struct chunkfs_chkmagic { + __le32 x_magic; + __le32 x_chksum; +}; + +/* XXX use e2fsprogs/dev uuid and crc32 lib functions */ +/* XXX using __cpu_to_* so userland can share */ +/* #ifdef KERNEL? How does Jeff do it? */ + +static inline void write_chksum(void *buf, unsigned int size) +{ + struct chunkfs_chkmagic *x = (struct chunkfs_chkmagic *) buf; +/* x->x_chksum = __cpu_to_le32(crc32(buf, size)); */ + x->x_chksum = __cpu_to_le32(0x32323232); +} + +static inline int check_chksum(void *buf, unsigned int size) +{ + struct chunkfs_chkmagic *x = (struct chunkfs_chkmagic *) buf; +/* return !(x->x_chksum == __cpu_to_le32(crc32(buf, size))); */ + return (__le32_to_cpu(x->x_chksum) != 0x32323232); +} + +static inline int check_magic(void *buf, __u32 expected_magic) { + struct chunkfs_chkmagic *x = (struct chunkfs_chkmagic *) buf; + return (__le32_to_cpu(x->x_magic) != expected_magic); +} +/* + * Generic function to check a piece of metadata just read off disk. + * Checksum and magic number are -always- in the same location in all + * metadata. 
+ */ + +static inline int check_metadata(void *buf, unsigned int size, __u32 expected_magic) +{ + if (check_magic(buf, expected_magic)) + return 1; + if (check_chksum(buf, size)) + return 2; + return 0; +} + +#ifdef __KERNEL__ + +/* dir.c */ +extern struct file_operations chunkfs_dir_fops; + +/* inode.c */ +extern struct file_operations chunkfs_file_fops; +extern struct inode_operations chunkfs_file_iops; +int chunkfs_new_inode(struct super_block *, struct inode **); +void chunkfs_start_inode(struct inode *inode, struct inode *client_inode, + u64 chunk_id); +void chunkfs_read_inode(struct inode *); +int chunkfs_write_inode(struct inode *, int); +void chunkfs_copy_up_inode(struct inode *, struct inode *); + +/* symlink.c */ + +extern struct inode_operations chunkfs_symlink_iops; + +/* namei.c */ + +extern struct inode_operations chunkfs_dir_iops; +extern struct inode_operations chunkfs_special_iops; + +struct chunkfs_dlist_node *chunkfs_alloc_dlist_node(struct dentry *); +void chunkfs_add_dentry(struct dentry *, struct dentry *, struct vfsmount *); +int chunkfs_init_dentry(struct dentry *); +void chunkfs_free_dentry(struct dentry *); +void chunkfs_init_nd(struct inode *dir, struct dentry *dentry, + struct dentry *client_dentry, u64 chunk_id); +void chunkfs_copy_up_nd(struct nameidata *nd, struct nameidata *client_nd); +void chunkfs_copy_down_nd(struct nameidata *nd, struct nameidata *client_nd); + +/* file.c */ + +int chunkfs_setattr(struct dentry *dentry, struct iattr *attr); +int chunkfs_permission(struct inode *, int, struct nameidata *); +int chunkfs_open(struct inode *, struct file *); + +struct chunkfs_continuation; + +int chunkfs_open_cont_file(struct file *file, loff_t *ppos, + struct file **client_file, + struct chunkfs_continuation **ret_cont); +void chunkfs_close_cont_file(struct file *file, struct file *client_file, + struct chunkfs_continuation *cont); +void chunkfs_copy_down_file(struct file *file, loff_t *ppos, + struct file *client_file, u64 client_start); + +/* cont.c */ + +int chunkfs_get_next_inode(struct inode *head_inode, + struct inode *prev_inode, struct inode **ret_inode); +int chunkfs_get_cont_at_offset(struct dentry *dentry, loff_t offset, + struct chunkfs_continuation **ret_cont); +int chunkfs_get_next_cont(struct dentry *head_dentry, + struct chunkfs_continuation *prev_cont, + struct chunkfs_continuation **next_cont); +int chunkfs_create_continuation(struct file *file, loff_t *ppos, + struct file **client_file, + struct chunkfs_continuation **ret_cont); +void chunkfs_put_continuation(struct chunkfs_continuation *cont); +int chunkfs_init_cont_data(struct dentry *client_dentry); + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_CHUNKFS_FS_H */ diff --git a/chunkfs_chunk.h b/chunkfs_chunk.h new file mode 100644 index 0000000..8b2b2d0 --- /dev/null +++ b/chunkfs_chunk.h @@ -0,0 +1,85 @@ +/* + * Chunkfs chunk definitions + * + * (C) 2007-2008 Valerie Henson + */ + +/* + * Chunk summary. This is just a wrapper defining the chunk size and + * giving us enough information to identify the client file system + * living inside this chunk and use its routines to mount it. + * + * XXX Question: what about df? Will it require us to df each + * individual chunk? Maybe a higher level summary is a good idea. 
+ * + */ + +#define CHUNKFS_CHUNK_MAGIC 0xf00df00d + +/* XXX super_block s_id len is 32, should #define it */ + +#define CHUNKFS_CLIENT_NAME_LEN 32 + +struct chunkfs_chunk { + __le32 c_magic; + __le32 c_chksum; + __le64 c_flags; + __le64 c_chunk_id; + c_byte_t c_begin; + c_byte_t c_end; + c_byte_t c_innards_begin; + c_byte_t c_innards_end; + c_byte_t c_next_chunk; + char c_client_fs[CHUNKFS_CLIENT_NAME_LEN]; +}; + +#define CHUNKFS_CHUNK_BLK (CHUNKFS_DEV_BLK + 1) +#define CHUNKFS_CHUNK_OFFSET (CHUNKFS_CHUNK_BLK * CHUNKFS_BLK_SIZE) +#define CHUNKFS_CHUNK_SIZE (10 * 1024 * 1024) /* XXX should be dynamic */ + +static inline int check_chunk(struct chunkfs_chunk *chunk) +{ + return check_metadata(chunk, sizeof(*chunk), CHUNKFS_CHUNK_MAGIC); +} + +/* + * Chunk flags + */ + +#define CHUNKFS_ROOT 0x00000001ULL + +#ifdef __KERNEL__ + +/* + * XXX Audit client file systems for start-from-zero block address bugs + * + * XXX Root inode location? Copy to all chunks? O(n chunk) space usage... + */ + +struct chunkfs_chunk_info { + struct chunkfs_dev_info *ci_dev; /* Parent device */ + struct list_head ci_clist; /* Member of list of chunks */ + struct buffer_head *ci_bh; + struct super_block *ci_sb; /* Superblock of client fs in memory */ + struct vfsmount *ci_mnt; + __u64 ci_flags; + __u64 ci_chunk_id; + char ci_client_fs[CHUNKFS_CLIENT_NAME_LEN]; + /* The rest of the on-disk data is not normally used. */ +}; + +#define CHUNKFS_IS_ROOT(ci) (ci->ci_flags & CHUNKFS_ROOT) + +static inline struct chunkfs_chunk * CHUNKFS_CHUNK(struct chunkfs_chunk_info *ci) +{ + return (struct chunkfs_chunk *) ci->ci_bh->b_data; +} + +static inline struct super_block * CHUNKFS_ROOT_SB(struct chunkfs_pool_info *pi) +{ + return pi->pi_root_dev->di_root_chunk->ci_sb; +} + +struct chunkfs_chunk_info * chunkfs_find_chunk(struct chunkfs_pool_info *, u64); + +#endif /* __KERNEL__ */ diff --git a/chunkfs_dev.h b/chunkfs_dev.h new file mode 100644 index 0000000..406ccdd --- /dev/null +++ b/chunkfs_dev.h @@ -0,0 +1,66 @@ +/* + * Chunkfs device definitions + * + * (C) 2007-2008 Valerie Henson + */ + +/* + * Device summary. This contains: + * + * - Information about which part of the device we manage + * - Pointer to the first chunk header (root chunk is flagged) + * + * Again, free/used information is known only by chunks, so we do not + * keep summary info in the dev summary unless we find some + * performance reason to keep it on disk. + */ + +#define CHUNKFS_DEV_MAGIC 0xdeeddeed + +struct chunkfs_dev { + __le32 d_magic; + __le32 d_chksum; + __le64 d_flags; /* Clean unmounted, etc. 
*/ + __le64 d_uuid; + c_byte_t d_begin; /* Total space we manage */ + c_byte_t d_end; + c_byte_t d_innards_begin; /* Space for chunks */ + c_byte_t d_innards_end; + c_byte_t d_root_chunk; /* Offset of chunk containing root, if here */ + struct chunkfs_dev_desc d_next_dev; /* Next device in pool */ +}; + +/* + * Dev flags + */ + +#define CHUNKFS_ROOT_DEV 0x00000001ULL + +#define CHUNKFS_IS_ROOT_DEV(ci) (ci->ci_flags & CHUNKFS_ROOT_DEV) + +#define CHUNKFS_DEV_BLK (CHUNKFS_POOL_BLK + 1) +#define CHUNKFS_DEV_OFFSET (CHUNKFS_DEV_BLK * CHUNKFS_BLK_SIZE) + +static inline int check_dev(struct chunkfs_dev *dev) +{ + return check_metadata(dev, sizeof(*dev), CHUNKFS_DEV_MAGIC); +} + +#ifdef __KERNEL__ + +struct chunkfs_dev_info { + struct chunkfs_pool_info *di_pool; + struct list_head di_dlist; /* Member of list of devs */ + struct list_head di_clist_head; /* Pointer to list of chunks */ + struct chunkfs_chunk_info *di_root_chunk; + struct buffer_head *di_bh; + __u64 di_flags; + /* The rest of the on-disk data is not normally used. */ +}; + +static inline struct chunkfs_dev * CHUNKFS_DEV(struct chunkfs_dev_info *di) +{ + return (struct chunkfs_dev *) di->di_bh->b_data; +} + +#endif /* __KERNEL__ */ diff --git a/chunkfs_i.h b/chunkfs_i.h new file mode 100644 index 0000000..830c916 --- /dev/null +++ b/chunkfs_i.h @@ -0,0 +1,113 @@ +/* + * Chunkfs inode definitions + * + * (C) 2007-2008 Valerie Henson + */ + +#define CHUNKFS_INODE_MAGIC 0x10de10de + +/* + * The on-disk version of the chunkfs continuation data is stored as + * strings inname/value pairs. They are: + * + * "next" "" - next inode in the file + * "prev" - ditto + * "start" "" - byte offset of file data in this inode + * "len" "" - length of file data stored in this inode + */ + +/* + * Inode/chunk number and back again + */ + +#define UINO_TO_CHUNK_ID(ino) ((ino & 0xF0000000ULL) >> 28) +#define UINO_TO_INO(ino) (ino & 0x0FFFFFFFULL) +#define MAKE_UINO(chunk_id, ino) ((chunk_id << 28) | ino) + +#ifdef __KERNEL__ + +struct chunkfs_cont_data { + ci_inode_num_t cd_next; + ci_inode_num_t cd_prev; + ci_byte_t cd_start; + ci_byte_t cd_len; +}; + +/* + * This is the information that must be maintained in memory in + * addition to the client fs's in-memory inode and the VFS's inode. + */ + +struct chunkfs_inode_info { + /* VFS inode */ + struct inode ii_vnode; + /* Head client inode - keeps our inode state */ + struct inode *ii_client_inode; + /* Protects on-disk continuation list */ + spinlock_t ii_continuations_lock; +}; + +/* + * Info for each continuation in the file. Loaded as needed and not + * cached because that's hard, mkay. + */ + +struct chunkfs_continuation { + struct inode *co_inode; + struct dentry *co_dentry; + struct vfsmount *co_mnt; + struct chunkfs_cont_data co_cd; + u64 co_chunk_id; + /* Can be reconstructed */ + u64 co_uino; +}; + +/* + * We need a single client dentry hanging off the parent dentry, as + * well as a client version of the nameidata. 
+ */ + +struct chunkfs_dentry_priv { + struct dentry *dp_client_dentry; + struct nameidata *dp_client_nd; +}; + +static inline struct chunkfs_inode_info *CHUNKFS_I(struct inode * inode) +{ + return container_of(inode, struct chunkfs_inode_info, ii_vnode); +} + +static inline struct inode *get_client_inode(struct inode *inode) +{ + struct chunkfs_inode_info *ii = CHUNKFS_I(inode); + return ii->ii_client_inode; +} + +static inline struct chunkfs_dentry_priv *CHUNKFS_D(struct dentry *dentry) { + return (struct chunkfs_dentry_priv *) dentry->d_fsdata; +} + +static inline struct dentry *get_client_dentry(struct dentry *dentry) +{ + struct chunkfs_dentry_priv *dp = CHUNKFS_D(dentry); + return dp->dp_client_dentry; +} + +static inline struct nameidata *get_client_nd(struct dentry *dentry) +{ + struct chunkfs_dentry_priv *dp = CHUNKFS_D(dentry); + /* + * XXX locking. Can we have more than one operation going + * forward using a nameidata at the same time? My first guess + * is no. + */ + return dp->dp_client_nd; +} + +static inline struct vfsmount *get_client_mnt(struct dentry *dentry) +{ + struct chunkfs_dentry_priv *dp = CHUNKFS_D(dentry); + return dp->dp_client_nd->mnt; +} + +#endif /* __KERNEL__ */ diff --git a/chunkfs_pool.h b/chunkfs_pool.h new file mode 100644 index 0000000..cd5ea41 --- /dev/null +++ b/chunkfs_pool.h @@ -0,0 +1,76 @@ +/* + * Chunkfs pool summary definitions. + * + * (C) 2007-2008 Valerie Henson + */ + +/* + * A wee little summary of the whole pool. There should be one copy + * of this summary every device. + * + * There is no size or block summary in this structure. Information + * about the usable size of the file system is only known by the + * client file systems inside each chunk. We have to query them + * individually to find out this information. Currently, I see no + * compelling reason to store a summary on disk - it only allows for + * it to be out of sync with the real accounting information. + */ + +/* Pool (superblock) magic number goes in linux/magic.h */ +#include + +struct chunkfs_pool { + __le32 p_magic; + __le32 p_chksum; + __le64 p_flags; + struct chunkfs_dev_desc p_root_desc; /* Device containing root */ +}; + +/* + * Offset from beginning of partition of the pool summary/superblock. + * A large initial offset avoids MBR, boot blocks, etc. + * + * XXX Any problems from picking this location? 
+ */ + +#define CHUNKFS_POOL_BLK 8 +#define CHUNKFS_POOL_OFFSET (CHUNKFS_POOL_BLK * CHUNKFS_BLK_SIZE) + +static inline int check_pool(struct chunkfs_pool *pool) +{ + return check_metadata(pool, sizeof(*pool), CHUNKFS_SUPER_MAGIC); +} + +#ifdef __KERNEL__ + +#include + +struct chunkfs_pool_info { + struct list_head pi_dlist_head; /* List of devices in this pool */ + struct chunkfs_dev_info *pi_root_dev; + struct buffer_head *pi_bh; + /* Use bytes instead of blocks - block size may vary */ + /* + * Note that with shared storage or dynamically allocated + * inodes, you don't want to assume that total = used + free + */ + __u64 pi_bytes_total; + __u64 pi_bytes_free; + __u64 pi_bytes_used; + __u64 pi_inodes_total; + __u64 pi_inodes_free; + __u64 pi_inodes_used; + __u64 pi_flags; +}; + +static inline struct chunkfs_pool_info * CHUNKFS_PI(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct chunkfs_pool * CHUNKFS_POOL(struct chunkfs_pool_info *pi) +{ + return (struct chunkfs_pool *) pi->pi_bh->b_data; +} + +#endif /* __KERNEL__ */ diff --git a/cont.c b/cont.c new file mode 100644 index 0000000..7ac0c42 --- /dev/null +++ b/cont.c @@ -0,0 +1,442 @@ +/* + * Chunkfs continuation routines + * + * (C) 2007-2008 Valerie Henson + * + */ + +#include +#include "chunkfs.h" +#include "chunkfs_pool.h" +#include "chunkfs_dev.h" +#include "chunkfs_chunk.h" +#include "chunkfs_i.h" + +/* + * Wow... all hack, all the time. Don't try this at home, kids. + */ + +static int +get_set_cont_data(struct dentry *dentry, char *name, u64 value, + u64 *ret_value, int type) +{ + /* Yaaaaaaay potential buffer overflow */ + char value_str[50]; /* XXX */ + char full_name[50]; /* XXX */ + /* Our continuation data is stored in the "user" xattr namespace */ + char prefix[] = "user."; + ssize_t size; + int err = 0; + /* printk(KERN_ERR "%s(%s): inode %lu ", __FUNCTION__, + type ? "set" : "get", client_inode->i_ino, name); */ + /* Make our "user.name" xattr name */ + sprintf(full_name, "%s%s", prefix, name); + if (type == 0) { + size = generic_getxattr(dentry, full_name, value_str, + sizeof(value_str)); + if (size >= 0) { + /* No automatic null termination... */ + value_str[size] = '\0'; + *ret_value = simple_strtoull(value_str, NULL, 10); + } else { + err = size; + } + /* printk("%s=%llu ", name, *ret_value); */ + } else { + /* XXX Pad out to max number of characters to avoid ENOSPC */ + sprintf(value_str, "%llu", value); + err = generic_setxattr(dentry, full_name, value_str, + strlen(value_str) + 1, 0); + /* printk("%s=%s ", name, value_str); */ + } + /* XXX ENOSPC handling */ + /* printk("err %d\n", err); */ + return err; +} + +static int +set_cont_data(struct dentry *dentry, struct chunkfs_cont_data *cd) +{ + int err; + + err = get_set_cont_data(dentry, "next", cd->cd_next, NULL, 1); + if (err) + goto out; + err = get_set_cont_data(dentry, "prev", cd->cd_prev, NULL, 1); + if (err) + goto out; + err = get_set_cont_data(dentry, "start", cd->cd_start, NULL, 1); + if (err) + goto out; + err = get_set_cont_data(dentry, "len", cd->cd_len, NULL, 1); + if (err) + goto out; + + mark_inode_dirty(dentry->d_inode); + out: + printk(KERN_ERR "%s: inode %lu err %d next %llu prev %llu " + "start %llu len %llu\n", + __FUNCTION__, dentry->d_inode->i_ino, err, + cd->cd_next, cd->cd_prev, + cd->cd_start, cd->cd_len); + return err; +} + +/* + * Get the continuation info out of the underlying client inode and + * stick it into the continuation info for an element of the inode + * list for a chunkfs inode. 
Currently stored in an xattr, so can use + * nice pretty fs-independent xattr routines. + */ + +static int +get_cont_data(struct dentry *dentry, struct chunkfs_cont_data *cd) +{ + int err; + + err = get_set_cont_data(dentry, "next", 0, &cd->cd_next, 0); + if (err) + return err; + err = get_set_cont_data(dentry, "prev", 0, &cd->cd_prev, 0); + if (err) + return err; + err = get_set_cont_data(dentry, "start", 0, &cd->cd_start, 0); + if (err) + return err; + err = get_set_cont_data(dentry, "len", 0, &cd->cd_len, 0); + if (err) + return err; + + printk(KERN_ERR "%s: inode %lu err %d next %llu prev %llu " + "start %llu len %llu\n", + __FUNCTION__, dentry->d_inode->i_ino, err, + cd->cd_next, cd->cd_prev, + cd->cd_start, cd->cd_len); + + return 0; +} + +/* + * inode based interface to get cont data + */ + +static int +get_cont_data_inode(struct inode *inode, struct chunkfs_cont_data *cd) +{ + struct dentry fake_dentry; + int err; + + fake_dentry.d_inode = inode; + err = get_cont_data(&fake_dentry, cd); + return err; +} + +/* + * Read an existing continuation into memory. + * + * XXX - dget/iget on client? + */ + +static int +load_continuation(struct inode *head_inode, struct dentry *client_dentry, + u64 chunk_id, struct chunkfs_continuation **ret_cont) +{ + struct chunkfs_pool_info *pi = CHUNKFS_PI(head_inode->i_sb); + struct chunkfs_continuation *cont; + struct chunkfs_chunk_info *ci; + int err; + + printk(KERN_ERR "%s() chunk_id %llu\n", __FUNCTION__, chunk_id); + + cont = kzalloc(sizeof(*cont), GFP_KERNEL); + if (cont == NULL) + return -ENOMEM; + + cont->co_inode = client_dentry->d_inode; + cont->co_dentry = client_dentry; + cont->co_chunk_id = chunk_id; + /* Hm. Think I could pass in the mnt, too... */ + ci = chunkfs_find_chunk(pi, chunk_id); + BUG_ON(ci == NULL); /* XXX */ + cont->co_mnt = ci->ci_mnt; + cont->co_uino = MAKE_UINO(chunk_id, cont->co_inode->i_ino); + + err = get_cont_data(cont->co_dentry, &cont->co_cd); + if (err) + goto out; + + *ret_cont = cont; + return 0; + out: + kfree(cont); + return err; +} + +void +chunkfs_put_continuation(struct chunkfs_continuation *cont) +{ + dput(cont->co_dentry); + /* Should be doing mntput but don't do mntget either */ + kfree(cont); +} + +/* + * Inode list lock must be held. + * + * Huuuuuge simplification - only load a continuation into memory + * while it's being used. No in-memory linked list. + * + */ + +int +chunkfs_get_next_cont(struct dentry *head_dentry, + struct chunkfs_continuation *prev_cont, + struct chunkfs_continuation **next_cont) +{ + struct inode *head_inode = head_dentry->d_inode; + struct chunkfs_cont_data *cd; + struct dentry *client_dentry; + struct nameidata nd; + char path[PATH_MAX]; + u64 from_chunk_id; + u64 chunk_id; + u64 from_ino; + u64 next_uino; + int err; + + printk(KERN_ERR "%s() prev_cont %p\n", __FUNCTION__, prev_cont); + + /* + * Get the dentry for the continuation we want. 
+ */ + + if (prev_cont == NULL) { + client_dentry = dget(get_client_dentry(head_dentry)); + chunk_id = UINO_TO_CHUNK_ID(head_inode->i_ino); + } else { + cd = &prev_cont->co_cd; + /* If it's the head inode again, return */ + if (cd->cd_next == head_inode->i_ino) { + *next_cont = NULL; + return 0; + } + /* If there is no next continuation, return */ + if (cd->cd_next == 0) { + *next_cont = NULL; + return 0; + } + /* Laboriously construct the path and look it up */ + next_uino = cd->cd_next; + chunk_id = UINO_TO_CHUNK_ID(next_uino); + from_chunk_id = prev_cont->co_chunk_id; + from_ino = UINO_TO_INO(prev_cont->co_uino); + sprintf(path, "/chunk%llu/%llu/%llu", + chunk_id, from_chunk_id, from_ino); + err = path_lookup(path, 0, &nd); + if (err) + return -ENOENT; + client_dentry = dget(nd.dentry); + path_release(&nd); + } + + /* Now we know the dentry of the continuation we want. */ + + err = load_continuation(head_inode, client_dentry, chunk_id, + next_cont); + + printk(KERN_ERR "%s() returning err %d\n", __FUNCTION__, err); + + return err; +} + +int +chunkfs_get_cont_at_offset(struct dentry *dentry, loff_t offset, + struct chunkfs_continuation **ret_cont) +{ + struct chunkfs_inode_info *ii = CHUNKFS_I(dentry->d_inode); + struct chunkfs_continuation *prev_cont = NULL; + struct chunkfs_continuation *next_cont; + struct chunkfs_cont_data *cd; + int err; + + printk(KERN_ERR "%s() reading ino %0lx offset %llu\n", + __FUNCTION__, dentry->d_inode->i_ino, offset); + + spin_lock(&ii->ii_continuations_lock); + while (1) { + err = chunkfs_get_next_cont(dentry, prev_cont, &next_cont); + if (err || (next_cont == NULL)) + break; + cd = &next_cont->co_cd; + printk(KERN_ERR "offset %llu start %llu len %llu err %d\n", + offset, cd->cd_start, cd->cd_len, err); + if ((offset >= cd->cd_start) && + (offset < (cd->cd_start + cd->cd_len))) { + printk(KERN_ERR "found it!\n"); + *ret_cont = next_cont; + break; + } + printk(KERN_ERR "not this one\n"); + prev_cont = next_cont; + } + spin_unlock(&ii->ii_continuations_lock); + /* If we didn't find a cont at all, return -ENOENT */ + if (next_cont == NULL) + err = -ENOENT; + *ret_cont = next_cont; + return err; +} + +/* + * Traverse the list of continuations using iget() only. + */ + +int +chunkfs_get_next_inode(struct inode *head_inode, + struct inode *prev_inode, struct inode **ret_inode) +{ + struct chunkfs_pool_info *pi = CHUNKFS_PI(head_inode->i_sb); + struct chunkfs_chunk_info *ci; + struct chunkfs_cont_data cd; + struct inode *next_inode; + u64 next_uino; + ino_t next_ino; + u64 chunk_id; + int err; + + /* Starting the list... */ + if (prev_inode == NULL) { + prev_inode = get_client_inode(head_inode); + next_inode = iget(prev_inode->i_sb, prev_inode->i_ino); + goto found_inode; + } else + iput(prev_inode); + /* Find the superblock and inode for the next one */ + err = get_cont_data_inode(prev_inode, &cd); + if (err) + return err; + next_uino = cd.cd_next; + if (next_uino == 0) { + *ret_inode = NULL; + return 0; + } + next_ino = UINO_TO_INO(next_uino); + chunk_id = UINO_TO_CHUNK_ID(next_uino); + printk(KERN_ERR "next_uino %llu next_ino %lu, next chunk_id %llu\n", + next_uino, next_ino, chunk_id); + ci = chunkfs_find_chunk(pi, chunk_id); + BUG_ON(ci == NULL); /* XXX */ + next_inode = iget(ci->ci_sb, next_ino); + found_inode: + if (is_bad_inode(next_inode)) + return -EIO; + *ret_inode = next_inode; + return 0; +} + +/* + * Create a new continuation in this chunk. Never called on the head. + * Length is set arbitrarily so be sure to write continuously. 
+ * + * We have to bootstrap ourselves up, starting with a dentry. We are, + * in fact, creating a file from the kernel. Bleah. + */ + +int +chunkfs_create_continuation(struct file *file, loff_t *ppos, + struct file **client_file, + struct chunkfs_continuation **ret_cont) +{ + struct chunkfs_continuation *prev_cont = NULL; + struct chunkfs_continuation *next_cont; + struct chunkfs_continuation *new_cont; + struct file *new_file; + u64 from_chunk_id; + u64 to_chunk_id; + u64 from_ino; + struct nameidata file_nd; + char path[PATH_MAX]; + struct dentry *dentry; + struct chunkfs_cont_data cd; + int err; + + printk(KERN_ERR "%s()\n", __FUNCTION__); + + /* Get the last continuation */ + while (1) { + err = chunkfs_get_next_cont(file->f_dentry, prev_cont, + &next_cont); + if (err) + return err; + if (next_cont == NULL) + break; + prev_cont = next_cont; + } + + /* Figure out what chunk and inode we are continuing from. */ + from_chunk_id = prev_cont->co_chunk_id; + from_ino = UINO_TO_INO(prev_cont->co_uino); + /* Temporary hack, do the next chunk for creation. */ + to_chunk_id = from_chunk_id + 1; + printk(KERN_ERR "%s() to chunk %llu\n", __FUNCTION__, to_chunk_id); + + /* Now we need the filename for the continuation inode. */ + sprintf(path, "/chunk%llu/%llu/%llu", to_chunk_id, from_chunk_id, + from_ino); + + /* Create the file */ + err = open_namei(0 /* XXX */, path, O_CREAT | O_RDWR, + (MAY_WRITE | MAY_READ | MAY_APPEND), &file_nd); + printk(KERN_ERR "open_namei for %s: err %d\n", path, err); + if (err) + goto out; + + dentry = dget(file_nd.dentry); + + /* Fill in next/prev/etc. data */ + cd.cd_next = 0; + cd.cd_prev = prev_cont->co_uino; + cd.cd_start = prev_cont->co_cd.cd_start + prev_cont->co_cd.cd_len; + cd.cd_len = 10 * 4096; + set_cont_data(dentry, &cd); + /* Now update prev */ + prev_cont->co_cd.cd_next = MAKE_UINO(to_chunk_id, + dentry->d_inode->i_ino); + set_cont_data(prev_cont->co_dentry, &prev_cont->co_cd); + /* Now! It's all in the inode and we can load it like normal. */ + err = load_continuation(file->f_dentry->d_inode, dentry, + to_chunk_id, &new_cont); + new_file = dentry_open(dentry, file_nd.mnt, file->f_flags); + if (IS_ERR(new_file)) { + err = PTR_ERR(new_file); + printk(KERN_ERR "dentry_open: err %d\n", err); + goto out; + } + chunkfs_copy_down_file(file, ppos, new_file, new_cont->co_cd.cd_start); + + *client_file = new_file; + *ret_cont = new_cont; + + printk(KERN_ERR "%s(): start %llu returning %d\n", + __FUNCTION__, cd.cd_start, err); + return 0; + out: + chunkfs_put_continuation(prev_cont); + printk(KERN_ERR "%s(): start %llu returning %d\n", + __FUNCTION__, cd.cd_start, err); + return err; +} + +int +chunkfs_init_cont_data(struct dentry *client_dentry) +{ + struct chunkfs_cont_data cd; + int err; + + cd.cd_prev = 0; + cd.cd_next = 0; + cd.cd_start = 0; + cd.cd_len = 10 * 4096; + err = set_cont_data(client_dentry, &cd); + return err; +} diff --git a/cross.sh b/cross.sh new file mode 100644 index 0000000..1224a79 --- /dev/null +++ b/cross.sh @@ -0,0 +1,33 @@ +#!/bin/bash -x +# +# Run simple cross-chunk checks. +# + +mount /dev/loop1 /chunk1 +mount /dev/loop2 /chunk2 +mount /dev/loop3 /chunk3 + +# For each file in the continued-from-directory, check to see if it +# exists in the original chunk. + +# For every possible continuation file... 
+ +for from_chunk in 1 2 3; do + for to_chunk in 1 2 3; do + for i in `ls /chunk${to_chunk}/${from_chunk}/*` ; do + # Check to see if the previous inode exists + echo testi \<${i}\> | debugfs /dev/loop${from_chunk} | grep marked + # Returns 1 if not allocated (and it should be) + if [ "$?" == "1" ]; then + echo "Orphan continuation ${i}, removing" + rm ${i} + fi + done + done +done + +umount /dev/loop1 +umount /dev/loop2 +umount /dev/loop3 + +exit 0 diff --git a/demo.sh b/demo.sh new file mode 100644 index 0000000..cfd4eda --- /dev/null +++ b/demo.sh @@ -0,0 +1,55 @@ +#!/bin/bash -x +# +# Demo script for chunkfs to be run from inside UML. +# + +# This is where the chunkfs user binaries are located. +BINPATH=/chunkfs_bin +MNT=/mnt + +# Name of the file backing the loop device +ORIG=/loop/saved_disk +FILE=/loop/test_disk +cp ${ORIG} ${FILE} + +# Unmount chunkfs and chunk file systems before stomping +umount ${MNT} +umount /dev/loop1 +umount /dev/loop2 +umount /dev/loop3 + +losetup -d /dev/loop0 +losetup -d /dev/loop1 +losetup -d /dev/loop2 +losetup -d /dev/loop3 + +losetup /dev/loop0 ${FILE} +losetup -o 45056 /dev/loop1 ${FILE} +losetup -o 10530816 /dev/loop2 ${FILE} +losetup -o 21016576 /dev/loop3 ${FILE} + +# Stomp head inode + +OFFSET=$(((4096 * 4) + 0x0e00)) +dd if=/dev/zero of=/dev/loop1 seek=${OFFSET} bs=1 count=128 + +# Repair individual chunks + +fsck -f /dev/loop1 +fsck -f /dev/loop2 +fsck -f /dev/loop3 + +# Cross-chunk repair + +${BINPATH}/cross.sh + +for i in 1 2 3; do + mount -t ext2 -o user_xattr /dev/loop${i} /chunk${i} +done + +${BINPATH}/mount_chunkfs /dev/loop0 ${MNT} + +ls /mnt/big +ls /chunk2/1/29 + +exit 0 diff --git a/dir.c b/dir.c new file mode 100644 index 0000000..71ce865 --- /dev/null +++ b/dir.c @@ -0,0 +1,68 @@ +/* + * Chunkfs directory routines + * + * (C) 2007-2008 Valerie Henson + */ + +#include + +#include "chunkfs.h" +#include "chunkfs_i.h" + +/* + * Currently we're reusing the client directory ops. We'll probably + * have to implement our own directories on top. + */ + +static loff_t +chunkfs_dir_llseek(struct file *file, loff_t offset, int origin) +{ + struct file *client_file; + struct chunkfs_continuation *cont; + int err; + + /* XXX... should only do top-level file struct? 
*/ + printk(KERN_ERR "%s()\n", __FUNCTION__); + + err = chunkfs_open_cont_file(file, &offset, &client_file, &cont); + if (err) + return err; + + if (client_file->f_op->llseek) + err = client_file->f_op->llseek(client_file, offset, origin); + else + err = default_llseek(client_file, offset, origin); + + chunkfs_close_cont_file(file, client_file, cont); + return err; +} + +static int +chunkfs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + struct file *client_file; + struct chunkfs_continuation *cont; + int err; + + printk(KERN_ERR "%s()\n", __FUNCTION__); + + err = chunkfs_open_cont_file(file, &file->f_pos, &client_file, &cont); + if (err) + return err; + + err = client_file->f_op->readdir(client_file, dirent, filldir); + /* If we read off the end, no problemo */ + if (err == -ENODATA) + err = 0; + + chunkfs_close_cont_file(file, client_file, cont); + + return err; +} + +struct file_operations chunkfs_dir_fops = { + .llseek = chunkfs_dir_llseek, + .read = generic_read_dir, + .open = chunkfs_open, + .readdir = chunkfs_readdir, +}; diff --git a/file.c b/file.c new file mode 100644 index 0000000..e601544 --- /dev/null +++ b/file.c @@ -0,0 +1,338 @@ +/* + * Chunkfs file routines + * + * (C) 2007-2008 Valerie Henson + */ + +#include +#include +#include +#include + +#include "chunkfs.h" +#include "chunkfs_pool.h" +#include "chunkfs_dev.h" +#include "chunkfs_chunk.h" +#include "chunkfs_i.h" + +/* + * The point of all these wrapper functions is the following: + * + * We need to get set the right file ops in the file struct for the + * area of the file being altered. + * + * For ops which affect the entire file (like fsync), we need to fan + * out to all the parts of the file. + * + * It would be cool if we could set the file operations depending on + * where in the file the I/O is happening. But I don't think we have + * that ability in the VFS right now. + */ + +/* + * Set the client file position to be relative to the start of the + * client file and copy down the main file struct's data in to the + * client file struct. + */ + +void +chunkfs_copy_down_file(struct file *file, loff_t *ppos, + struct file *client_file, u64 client_start) +{ + client_file->f_pos = *ppos - client_start; + *ppos = client_file->f_pos; + + printk(KERN_ERR "%s(): client f_pos set to %llu " + "(parent %llu, client_start %llu)\n", + __FUNCTION__, client_file->f_pos, file->f_pos, + client_start); +} + +/* + * Aaaand reverse the pos conversion. + */ + +static void +copy_up_file(struct file *file, struct file *client_file, u64 client_start) +{ + file->f_pos = client_file->f_pos + client_start; + + printk(KERN_ERR "%s(): file f_pos set to %llu (client f_pos %llu " + "client_start %llu)\n", __FUNCTION__, file->f_pos, + client_file->f_pos, client_start); +} + +/* + * Open the client inode at offset and return the file struct. 
+ */ + +int +chunkfs_open_cont_file(struct file *file, loff_t *ppos, + struct file **client_file, + struct chunkfs_continuation **ret_cont) +{ + struct chunkfs_continuation *cont; + struct chunkfs_cont_data *cd; + struct file *new_file; + int err; + + printk(KERN_ERR "%s() pos %llu\n", __FUNCTION__, *ppos); + + err = chunkfs_get_cont_at_offset(file->f_dentry, *ppos, &cont); + if (err) + return err; + + new_file = dentry_open(cont->co_dentry, cont->co_mnt, + file->f_flags); + if (IS_ERR(new_file)) { + err = PTR_ERR(new_file); + printk(KERN_ERR "dentry_open: err %d\n", err); + goto out; + } + cd = &cont->co_cd; + chunkfs_copy_down_file(file, ppos, new_file, cd->cd_start); + + *ret_cont = cont; + *client_file = new_file; + out: + printk(KERN_ERR "%s(): returning %d\n", __FUNCTION__, err); + return err; +} + +void +chunkfs_close_cont_file(struct file *file, struct file *client_file, + struct chunkfs_continuation *cont) +{ + struct chunkfs_cont_data *cd = &cont->co_cd; + /* XXX... sys_close does a lot more than this. */ + printk(KERN_ERR "%s()\n", __FUNCTION__); + copy_up_file(file, client_file, cd->cd_start); + chunkfs_copy_up_inode(file->f_dentry->d_inode, + client_file->f_dentry->d_inode); + chunkfs_put_continuation(cont); +} + +/* + * lseek only affects the top-level file struct's fpos. + */ + +static loff_t +chunkfs_llseek_file(struct file *file, loff_t offset, int origin) +{ + printk(KERN_ERR "%s()\n", __FUNCTION__); + + /* XXX right generic llseek? */ + return default_llseek(file, offset, origin); +} + +/* + * Find the right inode for the offset and read from it. Opens and + * closes the client file struct every time because I'm lazy. + */ + +static ssize_t +chunkfs_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) +{ + struct file *client_file; + struct chunkfs_continuation *cont; + int err; + + printk(KERN_ERR "%s()\n", __FUNCTION__); + + err = chunkfs_open_cont_file(file, ppos, &client_file, &cont); + /* Read off the end of the file */ + /* XXX distinguish between this and EIO */ + if (err == -ENOENT) + return 0; + if (err) + return err; + + /* XXX assume not longer than len */ + if (client_file->f_op->read) + err = client_file->f_op->read(client_file, buf, len, ppos); + else + err = do_sync_read(client_file, buf, len, ppos); + + /* If we read off the end, no problemo */ + if (err == -ENODATA) + err = 0; + + chunkfs_close_cont_file(file, client_file, cont); + return err; +} + +static ssize_t +chunkfs_write(struct file *file, const char __user *buf, size_t len, + loff_t *ppos) +{ + struct chunkfs_continuation *cont; + struct file *client_file; + ssize_t size; + int err; + + printk(KERN_ERR "%s() pos %llu len %u\n", + __FUNCTION__, *ppos, len); + + err = chunkfs_open_cont_file(file, ppos, &client_file, &cont); + if (err == -ENOENT) { + err = chunkfs_create_continuation(file, ppos, &client_file, + &cont); + } + if (err) + return err; + + /* XXX assume not longer than len */ + if (client_file->f_op->write) + size = client_file->f_op->write(client_file, buf, len, ppos); + else + size = do_sync_write(client_file, buf, len, ppos); + + chunkfs_close_cont_file(file, client_file, cont); + + printk(KERN_ERR "%s() pos %llu len %u, returning size %u\n", + __FUNCTION__, *ppos, len, size); + + return size; +} + +/* + * Open only affects the top-level chunkfs file struct. Do an open of + * the underlying head client inode just to see that we can, then + * close it again. 
+ */ + +int +chunkfs_open(struct inode * inode, struct file * file) +{ + struct file *client_file; + struct chunkfs_continuation *cont; + loff_t dummy_pos = 0; + int err; + + printk(KERN_ERR "%s()\n", __FUNCTION__); + + err = chunkfs_open_cont_file(file, &dummy_pos, &client_file, &cont); + if (err) + goto out; + chunkfs_close_cont_file(file, client_file, cont); + return 0; + out: + printk(KERN_ERR "%s() returning %d\n", __FUNCTION__, err); + return err; +} + +/* + * Apparently, file may be null at this point. Uh. Whatever. + */ + +static int +chunkfs_fsync_file(struct file *file, struct dentry *dentry, int datasync) +{ + struct chunkfs_inode_info *ii = CHUNKFS_I(file->f_dentry->d_inode); + struct chunkfs_continuation *prev_cont = NULL; + struct chunkfs_continuation *next_cont; + struct dentry *client_dentry; + struct inode *client_inode; + int err = -EIO; + + printk(KERN_ERR "%s()\n", __FUNCTION__); + + /* XXX syncs all inodes instead of just ones in mem */ + spin_lock(&ii->ii_continuations_lock); + while (1) { + err = chunkfs_get_next_cont(dentry, prev_cont, &next_cont); + if (err || (next_cont == NULL)) + break; + client_dentry = next_cont->co_dentry; + client_inode = client_dentry->d_inode; + /* XXX error propagation */ + err = client_inode->i_fop->fsync(NULL, client_dentry, + datasync); + prev_cont = next_cont; + } + spin_unlock(&ii->ii_continuations_lock); + printk(KERN_ERR "%s() err %d\n", __FUNCTION__, err); + return err; +} + +static void chunkfs_truncate(struct inode *inode) +{ + struct chunkfs_inode_info *ii = CHUNKFS_I(inode); + struct inode *prev_inode = NULL; + struct inode *next_inode; + int err; + + printk(KERN_ERR "%s()\n", __FUNCTION__); + + spin_lock(&ii->ii_continuations_lock); + /* XXX completely delete continuations */ + while (1) { + err = chunkfs_get_next_inode(inode, prev_inode, &next_inode); + if (err || (next_inode == NULL)) + break; + next_inode->i_op->truncate(next_inode); + prev_inode = next_inode; + } + spin_unlock(&ii->ii_continuations_lock); +} + +int chunkfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *client_inode = get_client_inode(dentry->d_inode); + struct dentry *client_dentry = get_client_dentry(dentry); + unsigned int ia_valid = attr->ia_valid; + int error; + + printk(KERN_ERR "%s()\n", __FUNCTION__); + + if (client_inode->i_op->setattr) { + error = client_inode->i_op->setattr(client_dentry, attr); + } else { + /* Arrrrrgh gross argh */ + error = inode_change_ok(client_inode, attr); + if (!error) + error = security_inode_setattr(client_dentry, attr); + if (!error) { + if ((ia_valid & ATTR_UID && attr->ia_uid != client_inode->i_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != client_inode->i_gid)) + error = DQUOT_TRANSFER(client_inode, attr) ? -EDQUOT : 0; + if (!error) + error = inode_setattr(client_inode, attr); + } + } + if (!error) + chunkfs_copy_up_inode(dentry->d_inode, client_inode); + return error; +} + +/* + * XXX probably need to change the nd. 
+ */ + +int chunkfs_permission(struct inode *inode, int submask, + struct nameidata *nd) +{ + struct inode *client_inode = get_client_inode(inode); + int err; + + if (client_inode->i_op->permission) + err = client_inode->i_op->permission(client_inode, + submask, nd); + else + err = generic_permission(client_inode, submask, NULL); + return err; +} + +struct file_operations chunkfs_file_fops = { + .llseek = chunkfs_llseek_file, + .read = chunkfs_read, + .write = chunkfs_write, + .open = chunkfs_open, + .fsync = chunkfs_fsync_file, +}; + +struct inode_operations chunkfs_file_iops = { + .truncate = chunkfs_truncate, + .setattr = chunkfs_setattr, + .permission = chunkfs_permission, +}; diff --git a/fsck.chunkfs.c b/fsck.chunkfs.c new file mode 100644 index 0000000..f340b96 --- /dev/null +++ b/fsck.chunkfs.c @@ -0,0 +1,212 @@ +/* + * Check and repair a chunkfs file system. + * + * (C) 2007-2008 Val Henson + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "chunkfs.h" + +static char * cmd; + +static void usage (void) +{ + fprintf(stderr, "Usage: %s \n", cmd); + exit(1); +} + +static void read_data(void *buf, int size, int fd, size_t offset) +{ + bzero(buf, size); + + if (lseek(fd, block, SEEK_SET) < 0) + error(1, errno, "Cannot seek"); + + if (read(fd, buf, size) < size) + error(1, errno, "Cannot read +} + +/* + * Construct a "superblock." In chunkfs, this means a pool summary. + */ +static void create_pool_summary(char *dev_name, struct chunkfs_pool *pool) +{ + struct chunkfs_dev_desc *root_dev = &pool->p_root; + int hint_len = sizeof (root_dev->d_hint); + + bzero(pool, sizeof(*pool)); + pool->p_magic = __cpu_to_le32(CHUNKFS_SUPER_MAGIC); + /* Fill in root device description. */ + strncpy(root_dev->d_hint, dev_name, hint_len); + root_dev->d_hint[hint_len - 1] = '\0'; + pool->p_flags = 0; + /* XXX need userland generated uuid... ask kernel to do it on mount? 
*/ + root_dev->d_uuid = __cpu_to_le64(0x001d001d); + + write_chksum(pool, sizeof(*pool), &pool->p_chksum); +} + +static void create_dev_summary(char *dev_name, int devfd, + struct chunkfs_pool *pool, + struct chunkfs_dev *dev) +{ + struct stat stat_buf; + struct chunkfs_dev_desc *root_dev = &pool->p_root; + + if (fstat(devfd, &stat_buf) != 0) + error(1, errno, "Cannot stat device %s", dev_name); + + bzero(dev, sizeof(*dev)); + dev->d_uuid = root_dev->d_uuid; /* Already endian swapped */ + dev->d_bytes_total = __cpu_to_le64(stat_buf.st_size); + dev->d_magic = __cpu_to_le32(CHUNKFS_DEV_MAGIC); + write_chksum(dev, sizeof (dev), &dev->d_chksum); +} + +static void create_chunk_summary(struct chunkfs_pool *pool, + struct chunkfs_dev *dev, + struct chunkfs_chunk *chunk, + __u64 start, __u64 size, __u64 next_chunk) +{ + __u64 end = start + size - 1; + /* XXX use chunk->ci_blk_bits; */ + __u64 start_blk = (start >> CHUNKFS_BLK_BITS) + 2; + __u64 end_blk = end >> CHUNKFS_BLK_BITS; + __u64 start_inode; + __u64 end_inode; + + bzero(chunk, sizeof(*chunk)); + chunk->c_next_chunk = __cpu_to_le64(next_chunk); + chunk->c_blk_size = __cpu_to_le64(CHUNKFS_BLK_SIZE); + chunk->c_blk_bits = __cpu_to_le64(CHUNKFS_BLK_BITS); + chunk->c_blk_start = __cpu_to_le64(start_blk); + chunk->c_blk_end = __cpu_to_le64(end_blk); + /* XXX subtract space used for bitmaps and chunk summary + * For now assume only takes two blocks + * XXX put data at one end of the chunk and metadata at other */ + chunk->c_blks_free = __cpu_to_le64((end_blk - start_blk) - 2); + start_inode = blk_to_inode(start_blk); + chunk->c_inode_begin = __cpu_to_le64(start_inode); + /* Get the last inode in the last blk, not the first */ + end_inode = blk_to_inode(end_blk + 1) - 1; + chunk->c_inode_end = __cpu_to_le64(end_inode); + chunk->c_magic = __cpu_to_le32(CHUNKFS_CHUNK_MAGIC); + write_chksum(chunk, sizeof (chunk), &chunk->c_chksum); +} + +static void write_chunk_summaries(int fd, struct chunkfs_pool *pool, + struct chunkfs_dev *dev, + struct chunkfs_chunk *root_chunk) +{ + struct chunkfs_chunk chunk; + __u64 dev_size = __le64_to_cpu(dev->d_bytes_total); + __u64 chunk_size = CHUNKFS_CHUNK_SIZE; + __u64 chunk_start = CHUNKFS_CHUNK_OFFSET;; + __u64 next_chunk_offset; + unsigned int chunk_id = 0; + int root = 1; + + while (chunk_start < dev_size) { + /* XXX What is chunk min size? */ + if ((chunk_start + chunk_size) > dev_size) { + chunk_size = dev_size - chunk_start; + next_chunk_offset = 0; + } else { + next_chunk_offset = chunk_start + chunk_size; + } + create_chunk_summary(pool, dev, &chunk, chunk_start, + chunk_size, next_chunk_offset); + if (root) { + /* Make root the first inode */ + chunk.c_root_inode = chunk.c_inode_begin; + *root_chunk = chunk; + root = 0; + } + + printf("Writing chunk %d (bytes %llu-%llu, blocks %llu-%llu, " + "inodes %llu-%llu)\n", chunk.c_id, chunk.c_start, + chunk_end, start_blk, end_blk,start_inode, end_inode); + + write_block(chunk, sizeof(*chunk), fd, chunk_start); + chunk_start += chunk_size; + *chunk_id++; + } +} + +static void write_root(int fd, struct chunkfs_pool *pool, + struct chunkfs_dev *dev, + struct chunkfs_chunk *root_chunk, + struct chunkfs_inode *root_inode) +{ + /* XXX This is zero presently. */ + c_inode_num_t inode_num = __le64_to_cpu(root_chunk->c_root_inode); + c_byte_t root_offset = inode_num; + + printf("root inode number %llu, offset %llu\n", inode_num, root_offset); + root_inode->i_self = __cpu_to_le64(inode_num); + /* uid, gid, size, etc. 
all 0 */ + root_inode->i_nlink = __cpu_to_le32(2); + root_inode->i_atime.t_sec = + root_inode->i_mtime.t_sec = + root_inode->i_ctime.t_sec = __cpu_to_le32(time(NULL)); + root_inode->i_mode = __cpu_to_le16(S_IFDIR | 0755); /* XXX */ + root_inode->i_type = __cpu_to_le16(CHUNKFS_PUBLIC_INODE); + root_inode->i_magic = __cpu_to_le32(CHUNKFS_INODE_MAGIC); + write_chksum(root_inode, sizeof(*root_inode), &root_inode->i_chksum); + + write_block(root_inode, sizeof(*root_inode), fd, root_offset); +} + +int main (int argc, char * argv[]) +{ + int fd; + char * dev_name; + struct chunkfs_pool pool = { 0 }; + struct chunkfs_dev root_dev = { 0 }; + struct chunkfs_chunk root_chunk = { 0 }; + struct chunkfs_inode root_inode = { 0 }; + + cmd = argv[0]; + + if (argc != 2) + usage(); + + dev_name = argv[1]; + + if ((fd = open(dev_name, O_RDWR)) < 0) + error(1, errno, "Cannot open device %s", dev_name); + /* XXX combine create and write_block */ + /* The chunkfs equivalent of a superblock is the pool summary. */ + + create_pool_summary(dev_name, &pool); + write_block(&pool, sizeof(pool), fd, CHUNKFS_POOL_OFFSET); + + /* We need one device summary per device. */ + + create_dev_summary(dev_name, fd, &pool, &root_dev); + write_block(&root_dev, sizeof(root_dev), fd, CHUNKFS_DEV_OFFSET); + + /* Now we get to the meaty bit: chunk summaries. */ + + write_chunk_summaries(fd, &pool, &root_dev, &root_chunk); + + /* Write root inode */ + + write_root(fd, &pool, &root_dev, &root_chunk, &root_inode); + + close(fd); + + return 0; +} diff --git a/gaze.sh b/gaze.sh new file mode 100644 index 0000000..44f0ada --- /dev/null +++ b/gaze.sh @@ -0,0 +1,27 @@ +#!/bin/bash -x +# +# Give me a naked eyeball look at the on-disk format as created by mkfs. +# + +# Name of the loopback device we're creating +DEV=/tmp/disk0 +# This is where the chunkfs user binaries are located. +BINPATH=. + +${BINPATH}/write_pattern ${DEV} +if [ "$?" != "0" ]; then + echo "write_pattern failed" + exit 1 +fi + +${BINPATH}/mkfs.chunkfs ${DEV} +if [ "$?" != "0" ]; then + echo "mkfs.chunkfs failed" + exit 1 +fi + +# Now dump it for us. 
+ +od -A d -t x4 ${DEV} + +exit 0 diff --git a/inode.c b/inode.c new file mode 100644 index 0000000..97550bb --- /dev/null +++ b/inode.c @@ -0,0 +1,173 @@ +/* + * Chunkfs inode routines + * + * (C) 2007-2008 Valerie Henson + */ + +#include +#include + +#include "chunkfs.h" +#include "chunkfs_pool.h" +#include "chunkfs_dev.h" +#include "chunkfs_chunk.h" +#include "chunkfs_i.h" + +int +chunkfs_get_nlinks(struct inode *inode) +{ + /* XXX go through all client inodes */ + return inode->i_nlink; +} + +static void +__copy_inode(struct inode *dst, struct inode *src) +{ + /* Copy data from one inode to another */ + fsstack_copy_attr_all(dst, src, chunkfs_get_nlinks); +} + +void +chunkfs_copy_up_inode(struct inode *inode, struct inode *client_inode) +{ + struct inode *prev_inode = NULL; + struct inode *next_inode; + loff_t total_size = 0; + + __copy_inode(inode, client_inode); + + while (1) { + chunkfs_get_next_inode(inode, prev_inode, &next_inode); + if (next_inode == NULL) + break; + /* XXX doesn't do holey files right */ + printk(KERN_ERR "adding %llu\n", next_inode->i_size); + total_size += next_inode->i_size; + prev_inode = next_inode; + } + inode->i_size = total_size; + printk(KERN_ERR "%s() ino %lu size %llu\n", __FUNCTION__, + inode->i_ino, inode->i_size); + + mark_inode_dirty(inode); +} + +static void +copy_down_inode(struct inode *inode, struct inode *client_inode) +{ + __copy_inode(client_inode, inode); +} + +static void +set_inode_ops(struct inode *inode, struct inode *client_inode) +{ + /* Pick inode ops */ + if (S_ISLNK(client_inode->i_mode)) + inode->i_op = &chunkfs_symlink_iops; + else if (S_ISDIR(client_inode->i_mode)) + inode->i_op = &chunkfs_dir_iops; + else if (S_ISREG(client_inode->i_mode)) + inode->i_op = &chunkfs_file_iops; + else + inode->i_op = &chunkfs_special_iops; + + /* Use different set of file ops for directories */ + if (S_ISDIR(client_inode->i_mode)) + inode->i_fop = &chunkfs_dir_fops; + else if (S_ISREG(client_inode->i_mode)) + inode->i_fop = &chunkfs_file_fops; + + /* properly initialize special inodes */ + if (S_ISBLK(client_inode->i_mode) || S_ISCHR(client_inode->i_mode) || + S_ISFIFO(client_inode->i_mode) || S_ISSOCK(client_inode->i_mode)) + init_special_inode(inode, client_inode->i_mode, + client_inode->i_rdev); +} + +/* + * Allocate a new inode and do any extra bits to it that aren't + * covered by the alloc_inode() op (currently none). + */ + +int +chunkfs_new_inode(struct super_block *sb, struct inode **inodep) +{ + *inodep = new_inode(sb); + + if (is_bad_inode(*inodep)) + /* XXX hate the inode error return conventions */ + return -EIO; + return 0; +} + +/* + * We've just read in a client inode. Fill in the chunkfs inode. + * Wait to fill in the continuation until the file is opened. + */ + +void +chunkfs_start_inode(struct inode *inode, struct inode *client_inode, + u64 chunk_id) +{ + struct chunkfs_inode_info *ii = CHUNKFS_I(inode); + + BUG_ON(!client_inode); + + ii->ii_client_inode = client_inode; + inode->i_ino = MAKE_UINO(chunk_id, client_inode->i_ino); + /* XXX i_mapping? */ + /* XXX check inode checksum, etc. */ + set_inode_ops(inode, client_inode); + chunkfs_copy_up_inode(inode, client_inode); + + printk(KERN_ERR "%s(): inode %p ino %0lx mode %0x client %p\n", + __FUNCTION__, inode, inode->i_ino, inode->i_mode, + ii->ii_client_inode); +} + +/* + * Come in with the chunkfs inode. Fill it in and get the client + * inode too. 
+ */ + +void +chunkfs_read_inode(struct inode *inode) +{ + struct chunkfs_pool_info *pi = CHUNKFS_PI(inode->i_sb); + struct chunkfs_chunk_info *ci; + struct inode *client_inode; + struct super_block *client_sb; + u64 chunk_id = UINO_TO_CHUNK_ID(inode->i_ino); + unsigned long client_ino = UINO_TO_INO(inode->i_ino); + + printk (KERN_ERR "%s() reading ino %0lx client ino %0lx chunk_id " + "%0llx count %d\n", + __FUNCTION__, inode->i_ino, client_ino, chunk_id, + atomic_read(&inode->i_count)); + + /* XXX should be chunkfs_get_sb */ + ci = chunkfs_find_chunk(pi, chunk_id); + BUG_ON(ci == NULL); /* XXX */ + + client_sb = ci->ci_sb; + client_inode = iget(client_sb, client_ino); + if (is_bad_inode(client_inode)) { + /* XXX should do something here */ + return; + } + chunkfs_start_inode(inode, client_inode, chunk_id); + return; +} + +int chunkfs_write_inode(struct inode *inode, int wait) +{ + struct inode *client_inode = get_client_inode(inode); + int err; + + copy_down_inode(inode, client_inode); + + /* XXX will client inodes be written when evicted? think so */ + err = client_inode->i_sb->s_op->write_inode(client_inode, wait); + + return err; +} diff --git a/mkfs.chunkfs.c b/mkfs.chunkfs.c new file mode 100644 index 0000000..e773bec --- /dev/null +++ b/mkfs.chunkfs.c @@ -0,0 +1,192 @@ +/* + * Create a chunkfs file system. + * + * (C) 2007-2008 Val Henson + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "chunkfs.h" +#include "chunkfs_pool.h" +#include "chunkfs_dev.h" +#include "chunkfs_chunk.h" +#include "chunkfs_i.h" + +/* Compile time test that structures have not outgrown blocks. */ + +static char canary_buf1[CHUNKFS_BLK_SIZE - + sizeof(struct chunkfs_pool)] __attribute__((unused)); +static char canary_buf2[CHUNKFS_BLK_SIZE - + sizeof(struct chunkfs_dev)] __attribute__((unused)); + +static char * cmd; + +static void usage (void) +{ + fprintf(stderr, "Usage: %s \n", cmd); + exit(1); +} + +static void write_block(void *metadata, int size, int fd, __u64 offset) +{ + char buf[CHUNKFS_BLK_SIZE]; + int buf_size = sizeof (buf); + struct chunkfs_chkmagic *x = (struct chunkfs_chkmagic *) buf; + + bzero(buf, buf_size); + memcpy(buf, metadata, size); + write_chksum(buf, size); + + printf("Writing magic %0x chksum %0x to offset %llu\n", + __le32_to_cpu(x->x_magic), __le32_to_cpu(x->x_chksum), + offset); + + if (lseek(fd, offset, SEEK_SET) < 0) + error(1, errno, "Cannot seek"); + + if (write(fd, buf, buf_size) < buf_size) + error(1, errno, "Cannot write metadata at offset %llu", + (unsigned long long) offset); +} + +/* + * Create and write a pool summary (superblock) + */ +static void create_pool_summary(char *dev_name, struct chunkfs_pool *pool) +{ + struct chunkfs_dev_desc *dev_desc = &pool->p_root_desc; + + /* Fill in device description. 
*/ + strcpy(dev_desc->d_hint, dev_name); + /* XXX need userland generated uuid */ + dev_desc->d_uuid = __cpu_to_le64(0x001d001d); + + bzero(pool, sizeof(*pool)); + pool->p_magic = __cpu_to_le32(CHUNKFS_SUPER_MAGIC); +} + +static void create_dev_summary(struct chunkfs_pool *pool, + struct chunkfs_dev *dev, + __u64 dev_begin, + __u64 dev_size) +{ + struct chunkfs_dev_desc *dev_desc = &pool->p_root_desc; + + bzero(dev, sizeof(*dev)); + dev->d_uuid = dev_desc->d_uuid; /* Already swapped */ + dev->d_begin = __cpu_to_le64(dev_begin); + dev->d_end = __cpu_to_le64(dev_begin + dev_size - 1); /* Starting counting from zero */ + dev->d_innards_begin = __cpu_to_le64(dev_begin + CHUNKFS_BLK_SIZE); + dev->d_innards_end = dev->d_end; /* Already swapped */ + dev->d_root_chunk = dev->d_innards_begin; /* Already swapped */ + dev->d_magic = __cpu_to_le32(CHUNKFS_DEV_MAGIC); +} + +static void create_chunk_summary(struct chunkfs_chunk *chunk, + __u64 chunk_start, __u64 chunk_size, + __u64 chunk_id) +{ + bzero(chunk, sizeof(*chunk)); + chunk->c_begin = __cpu_to_le64(chunk_start); + chunk->c_end = __cpu_to_le64(chunk_start + chunk_size - 1); + chunk->c_innards_begin = __cpu_to_le64(chunk_start + CHUNKFS_BLK_SIZE); + chunk->c_innards_end = chunk->c_end; /* Already swapped */ + chunk->c_chunk_id = __cpu_to_le64(chunk_id); + chunk->c_magic = __cpu_to_le32(CHUNKFS_CHUNK_MAGIC); +} + +static void write_chunk_summaries(struct chunkfs_dev *dev, + struct chunkfs_chunk *chunk, + int fd) +{ + __u64 chunk_id = 1; /* 0 is not a valid chunk id */ + __u64 chunk_start = __le64_to_cpu(dev->d_root_chunk); + __u64 chunk_size = CHUNKFS_CHUNK_SIZE; + __u64 dev_end = __le64_to_cpu(dev->d_end); + + while ((chunk_start + chunk_size - 1) < dev_end) { + /* XXX Throwing away disk if not multiple of chunk size */ + create_chunk_summary(chunk, chunk_start, chunk_size, + chunk_id); + if (chunk_id == 1) + chunk->c_flags |= __cpu_to_le64(CHUNKFS_ROOT); + /* Can we get another chunk in? Then point to it */ + if ((__le64_to_cpu(chunk->c_end) + chunk_size - 1) < dev_end) + chunk->c_next_chunk = __cpu_to_le64(chunk->c_end + 1); + + printf("Writing chunk %llu: start %llu end %llu)\n", + __le64_to_cpu(chunk->c_chunk_id), + __le64_to_cpu(chunk->c_begin), + __le64_to_cpu(chunk->c_end)); + + printf("clientfs: start %llu\n", __le64_to_cpu(chunk->c_innards_begin)); + + write_block(chunk, sizeof(*chunk), fd, chunk_start); + chunk_start += chunk_size; + chunk_id++; + } +} + +int main (int argc, char * argv[]) +{ + int fd; + char * dev_name; + struct stat stat_buf; + off_t raw_dev_size; + struct chunkfs_pool pool = { 0 }; + struct chunkfs_dev root_dev = { 0 }; + struct chunkfs_chunk root_chunk = { 0 }; + + cmd = argv[0]; + + if (argc != 2) + usage(); + + dev_name = argv[1]; + + /* + * Get some info about the device. + */ + + if ((fd = open(dev_name, O_RDWR)) < 0) + error(1, errno, "Cannot open device %s", dev_name); + + if (fstat(fd, &stat_buf) != 0) + error(1, errno, "Cannot stat device %s", dev_name); + raw_dev_size = stat_buf.st_size; + /* + * XXX Sanity check size - big enough? + */ + + /* + * Create structures and write them out + */ + + create_pool_summary(dev_name, &pool); + write_block(&pool, sizeof(pool), fd, CHUNKFS_POOL_OFFSET); + + /* XXX handle multiple devs */ + + create_dev_summary(&pool, &root_dev, CHUNKFS_DEV_OFFSET, + raw_dev_size - CHUNKFS_DEV_OFFSET - 1); + write_block(&root_dev, sizeof(root_dev), fd, CHUNKFS_DEV_OFFSET); + + /* Now we get to the meaty bit: chunk summaries. 
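+	 * (Editor's note: each summary is written into the first block of
+	 * its CHUNKFS_CHUNK_SIZE-sized chunk, and c_next_chunk points at the
+	 * following chunk, so the kernel can walk the whole list at mount
+	 * time.)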
*/ + + write_chunk_summaries(&root_dev, &root_chunk, fd); + + close(fd); + + return 0; +} diff --git a/namei.c b/namei.c new file mode 100644 index 0000000..7ccc193 --- /dev/null +++ b/namei.c @@ -0,0 +1,538 @@ +/* + * Chunkfs + * + * Chunks is a file system designed to be checked and repaired in + * small, mostly independent chunks. This allows quick recovery from + * file system corruption. + * + * (C) 2007-2008 Valerie Henson + * + */ + +#include "chunkfs.h" +#include "chunkfs_pool.h" +#include "chunkfs_dev.h" +#include "chunkfs_chunk.h" +#include "chunkfs_i.h" + +void +chunkfs_release_nd(struct dentry *dentry) +{ + struct nameidata *nd = get_client_nd(dentry); + dput(nd->dentry); + mntput(nd->mnt); +} + +/* + * Call this to initialize our client nameidata. + */ + +void +chunkfs_init_nd(struct inode *dir, struct dentry *dentry, + struct dentry *client_dentry, u64 chunk_id) +{ + struct nameidata *nd = get_client_nd(dentry); + struct chunkfs_chunk_info *chunk; + + chunk = chunkfs_find_chunk(CHUNKFS_PI(dir->i_sb), chunk_id); + BUG_ON(!chunk); /* XXX */ + /* Probably don't need dget/mntget */ + nd->dentry = dget(client_dentry); + nd->mnt = mntget(chunk->ci_mnt); + printk(KERN_ERR "%s(): dentry %p name %s client_dentry %p mnt %s\n", + __FUNCTION__, dentry, dentry->d_iname, client_dentry, + nd->mnt->mnt_sb->s_type->name); +} + +/* + * The client file system may read the following parts of the nameidata: + * + * In open, it read the intent's mode or flags. + * + * The client file system may alter the nameidata in the following cases: + * + * When following symbolic links (up to N levels of links saved in + * nd), it may set the saved_names (using the depth) with nd_set_link. + */ + +static void +__chunkfs_copy_nd(struct nameidata *dst, struct nameidata *src) +{ + dst->intent.open.flags = src->intent.open.flags; + dst->intent.open.create_mode = src->intent.open.create_mode; + dst->depth = src->depth; + dst->saved_names[dst->depth] = src->saved_names[dst->depth]; +} + +void +chunkfs_copy_up_nd(struct nameidata *nd, struct nameidata *client_nd) +{ + __chunkfs_copy_nd(nd, client_nd); +} + +void +chunkfs_copy_down_nd(struct nameidata *nd, struct nameidata *client_nd) +{ + __chunkfs_copy_nd(client_nd, nd); +} + +static void +chunkfs_remove_dentry(struct dentry *dentry) +{ + struct chunkfs_dentry_priv *dp = CHUNKFS_D(dentry); + dput(dp->dp_client_dentry); +} + +void +chunkfs_free_dentry(struct dentry *dentry) +{ + struct chunkfs_dentry_priv *dp = CHUNKFS_D(dentry); + kfree(dp->dp_client_nd); + kfree(dp); + dentry->d_fsdata = NULL; +} + +/* + * Called when a dentry is evicted from cache. + */ + +void +chunkfs_release_dentry(struct dentry *dentry) +{ + printk(KERN_ERR "%s(): name %s\n", __FUNCTION__, dentry->d_name.name); + /* + * Root dentry can be legitimately released on umount, but is + * also a common manifestation of refcounting problems. Catch + * for debugging. + */ + WARN_ON(strcmp(dentry->d_name.name, "/") == 0); + chunkfs_release_nd(dentry); + /* + * Negative dentries need client dentries too, so they can be + * easily converted into responsible positive dentries. We + * should never have a dentry without a client dentry. + */ + chunkfs_remove_dentry(dentry); + chunkfs_free_dentry(dentry); +} + +struct dentry_operations chunkfs_dops = { + .d_release = chunkfs_release_dentry, +}; + +/* + * Initialize a new chunkfs dentry. 
+ */ + +int +chunkfs_init_dentry(struct dentry *dentry) +{ + struct chunkfs_dentry_priv *dp; + struct nameidata *nd; + + BUG_ON(dentry->d_fsdata); + dp = kzalloc(sizeof(*dp), GFP_KERNEL); + if (!dp) + return -ENOMEM; + nd = kzalloc(sizeof(*nd), GFP_KERNEL); + if (!nd) + goto out; + dp->dp_client_nd = nd; + dentry->d_fsdata = dp; + dentry->d_op = &chunkfs_dops; + return 0; + out: + kfree(dp); + return -ENOMEM; +} + +/* + * This function takes a chunkfs dentry and constructs a new dentry + * for the client fs. + */ + +static struct dentry * +chunkfs_clone_dentry(struct dentry *dentry) +{ + struct dentry *client_parent = get_client_dentry(dentry->d_parent); + struct dentry *client_dentry; + + client_dentry = d_alloc_name(client_parent, dentry->d_name.name); + + if (!client_dentry) + return ERR_PTR(-ENOMEM); + return client_dentry; +} + +void +chunkfs_add_dentry(struct dentry *dentry, struct dentry *client_dentry, + struct vfsmount *mnt) +{ + struct chunkfs_dentry_priv *dp = CHUNKFS_D(dentry); + dp->dp_client_dentry = client_dentry; +} + +static int +chunkfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *client_dir = get_client_inode(dir); + struct dentry *client_dentry = get_client_dentry(dentry); + struct nameidata *client_nd = get_client_nd(dentry); + u64 chunk_id = UINO_TO_CHUNK_ID(dir->i_ino); + struct inode *inode; + int err; + + printk(KERN_ERR "%s(): dir ino %0lx i_count %d\n", + __FUNCTION__, dir->i_ino, atomic_read(&dir->i_count)); + + err = chunkfs_new_inode(dir->i_sb, &inode); + if (err) + goto out; + + chunkfs_copy_down_nd(nd, client_nd); + + err = client_dir->i_op->create(client_dir, client_dentry, mode, + client_nd); + if (err) + goto out_inode; + + err = chunkfs_init_cont_data(client_dentry); + if (err) + goto out_inode; + chunkfs_start_inode(inode, client_dentry->d_inode, chunk_id); + chunkfs_copy_up_inode(dir, client_dir); + chunkfs_copy_up_nd(nd, client_nd); + + /* Now put our new inode into the dentry */ + d_instantiate(dentry, inode); + + printk(KERN_ERR "dentry %p name %s inode %p ino %0lx\n", + dentry, dentry->d_iname, dentry->d_inode, + dentry->d_inode->i_ino); + + printk(KERN_ERR "client dentry %p name %s inode %p ino %0lx\n", + client_dentry, client_dentry->d_iname, client_dentry->d_inode, + client_dentry->d_inode->i_ino); + return 0; + out_inode: + iput(inode); + out: + return err; +} + +static struct dentry * +chunkfs_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode *client_dir = get_client_inode(dir); + u64 chunk_id = UINO_TO_CHUNK_ID(dir->i_ino); + struct dentry *client_dentry; + struct dentry *new_dentry; + struct nameidata *client_nd; + struct inode *inode; + int err; + + printk(KERN_ERR "%s(): name %s dir ino %0lx i_count %d\n", + __FUNCTION__, dentry->d_iname, dir->i_ino, + atomic_read(&dir->i_count)); + + err = chunkfs_init_dentry(dentry); + if (err) + goto out; + + client_dentry = chunkfs_clone_dentry(dentry); + if (IS_ERR(client_dentry)) + goto out_dentry; + + chunkfs_init_nd(dir, dentry, client_dentry, chunk_id); + client_nd = get_client_nd(dentry); + /* + * Fill out the client dentry. + */ + new_dentry = client_dir->i_op->lookup(client_dir, client_dentry, + client_nd); + /* + * Possible return values: + * + * NULL: Nothing went wrong with lookup, you may or may not + * have found a matching inode and attached it. If the inode + * is NULL, we still have to create a negative dentry. 
+ * + * Address of a dentry: The dentry already existed (and was + * root and disconnected - something about knfsd), so the + * dentry we passed in needs to be thrown away and we should + * use the one returned. + * + * IS_ERR(): Something went wrong, return the error. + */ + if (IS_ERR(new_dentry)) { + err = PTR_ERR(new_dentry); + goto out_dput; + } else if (new_dentry) { + dput(client_dentry); + client_dentry = new_dentry; + } + + /* + * If the client found an inode, fill in the chunkfs inode. + */ + if (client_dentry->d_inode) { + err = chunkfs_new_inode(dir->i_sb, &inode); + if (err) + goto out_dput; + err = chunkfs_init_cont_data(client_dentry); + if (err) + goto out_dput; + chunkfs_start_inode(inode, client_dentry->d_inode, + chunk_id); + } else { + inode = NULL; + } + /* Hook up the client and parent dentries. */ + chunkfs_add_dentry(dentry, client_dentry, client_nd->mnt); + + printk(KERN_ERR "dentry %p name %s inode %p\n", + dentry, dentry->d_iname, dentry->d_inode); + printk(KERN_ERR "client dentry %p name %s inode %p\n", client_dentry, + client_dentry->d_iname, client_dentry->d_inode); + + return d_splice_alias(inode, dentry); + out_dput: + dput(client_dentry); + chunkfs_release_nd(dentry); + out_dentry: + chunkfs_remove_dentry(dentry); + out: + chunkfs_free_dentry(dentry); + + printk(KERN_ERR "%s(): name %s returning %d\n", + __FUNCTION__, dentry->d_iname, err); + + return ERR_PTR(err); +} + +static int +chunkfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + struct inode *client_dir = get_client_inode(dir); + struct inode *old_inode = old_dentry->d_inode; + struct inode *client_old_inode = get_client_inode(old_inode); + struct dentry *client_old_dentry = get_client_dentry(old_dentry); + struct dentry *client_new_dentry = get_client_dentry(new_dentry); + int err = 0; + + printk(KERN_ERR "%s()\n", __FUNCTION__); + + err = client_dir->i_op->link(client_old_dentry, client_dir, + client_new_dentry); + if (err) + goto out; + /* Copy up inode takes care of link counts */ + chunkfs_copy_up_inode(old_inode, client_old_inode); + /* + * For some reason, this is the one place where the VFS + * doesn't increment the inode ref count for us. 
+ */ + atomic_inc(&dir->i_count); + d_instantiate(new_dentry, old_inode); + out: + return err; +} + +static int +chunkfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *client_dir = get_client_inode(dir); + struct dentry *client_dentry = get_client_dentry(dentry); + struct inode *inode = dentry->d_inode; + struct inode *client_inode = get_client_inode(inode); + int err = 0; + + printk(KERN_ERR "%s()\n", __FUNCTION__); + + err = client_dir->i_op->unlink(client_dir, client_dentry); + if (err) + goto out; + chunkfs_copy_up_inode(dir, client_dir); + chunkfs_copy_up_inode(inode, client_inode); + out: + return err; +} + +static int +chunkfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) +{ + struct inode *client_dir = get_client_inode(dir); + struct dentry *client_dentry = get_client_dentry(dentry); + u64 chunk_id = UINO_TO_CHUNK_ID(dir->i_ino); + struct inode *inode; + int err; + + printk(KERN_ERR "%s(): dir ino %0lx i_count %d\n", + __FUNCTION__, dir->i_ino, atomic_read(&dir->i_count)); + + err = chunkfs_new_inode(dir->i_sb, &inode); + if (err) + goto out; + + err = client_dir->i_op->symlink(client_dir, client_dentry, oldname); + if (err) + goto out_inode; + + err = chunkfs_init_cont_data(client_dentry); + if (err) + goto out_inode; + chunkfs_start_inode(inode, client_dentry->d_inode, chunk_id); + chunkfs_copy_up_inode(dir, client_dir); + + /* Now put our new inode into the dentry */ + d_instantiate(dentry, inode); + + printk(KERN_ERR "dentry %p name %s inode %p ino %0lx\n", + dentry, dentry->d_iname, dentry->d_inode, + dentry->d_inode->i_ino); + printk(KERN_ERR "client dentry %p name %s inode %p ino %0lx\n", + client_dentry, client_dentry->d_iname, client_dentry->d_inode, + client_dentry->d_inode->i_ino); + return 0; + out_inode: + iput(inode); + out: + return err; +} + +static int +chunkfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *client_dir = get_client_inode(dir); + struct inode *client_inode; + struct dentry *client_dentry = get_client_dentry(dentry); + u64 chunk_id = UINO_TO_CHUNK_ID(dir->i_ino); + struct inode *inode; + int err; + + printk(KERN_ERR "%s(): name %s dir ino %0lx i_count %d\n", + __FUNCTION__, dentry->d_iname, dir->i_ino, + atomic_read(&dir->i_count)); + + err = chunkfs_new_inode(dir->i_sb, &inode); + if (err) + goto out; + + err = client_dir->i_op->mkdir(client_dir, client_dentry, mode); + if (err) + goto out_inode; + client_inode = client_dentry->d_inode; + + err = chunkfs_init_cont_data(client_dentry); + if (err) + goto out_inode; + chunkfs_start_inode(inode, client_inode, chunk_id); + chunkfs_copy_up_inode(dir, client_dir); + d_instantiate(dentry, inode); + return 0; + out_inode: + iput(inode); + out: + printk(KERN_ERR "%s(): name %s returning %d\n", + __FUNCTION__, dentry->d_iname, err); + return err; +} + +static int +chunkfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *client_dir = get_client_inode(dir); + struct dentry *client_dentry = get_client_dentry(dentry); + struct inode *inode = dentry->d_inode; + int err; + + printk(KERN_ERR "%s()\n", __FUNCTION__); + err = client_dir->i_op->rmdir(client_dir, client_dentry); + if (err) + return err; + chunkfs_copy_up_inode(dir, client_dir); + chunkfs_copy_up_inode(inode, client_dentry->d_inode); + return 0; +} + +static int +chunkfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +{ + struct inode *client_dir = get_client_inode(dir); + struct dentry *client_dentry = get_client_dentry(dentry); + u64 
chunk_id = UINO_TO_CHUNK_ID(dir->i_ino); + struct inode *inode; + int err; + + printk(KERN_ERR "%s(): name %s dir ino %0lx i_count %d\n", + __FUNCTION__, dentry->d_iname, dir->i_ino, + atomic_read(&dir->i_count)); + + err = chunkfs_new_inode(dir->i_sb, &inode); + if (err) + goto out; + + err = client_dir->i_op->mknod(client_dir, client_dentry, mode, dev); + if (err) + goto out_inode; + + err = chunkfs_init_cont_data(client_dentry); + if (err) + goto out_inode; + chunkfs_start_inode(inode, client_dentry->d_inode, chunk_id); + chunkfs_copy_up_inode(dir, client_dir); + d_instantiate(dentry, inode); + + return 0; + out_inode: + iput(inode); + out: + printk(KERN_ERR "%s(): name %s returning %d\n", + __FUNCTION__, dentry->d_iname, err); + return err; +} + +static int +chunkfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *client_old_dir = get_client_inode(old_dir); + struct inode *client_new_dir = get_client_inode(new_dir); + struct dentry *client_old_dentry = get_client_dentry(old_dentry); + struct dentry *client_new_dentry = get_client_dentry(new_dentry); + int err = 0; + + return -ENOSYS; + /* Not reached */ + err = client_old_dir->i_op->rename(client_old_dir, + client_old_dentry, + client_new_dir, + client_new_dentry); + if (err) + goto out; + chunkfs_copy_up_inode(old_dir, client_old_dir); + chunkfs_copy_up_inode(new_dir, client_new_dir); + out: + return err; +} + +struct inode_operations chunkfs_dir_iops = { + .create = chunkfs_create, + .lookup = chunkfs_lookup, + .link = chunkfs_link, + .unlink = chunkfs_unlink, + .symlink = chunkfs_symlink, + .mkdir = chunkfs_mkdir, + .rmdir = chunkfs_rmdir, + .mknod = chunkfs_mknod, + .rename = chunkfs_rename, + .setattr = chunkfs_setattr, + .permission = chunkfs_permission, +}; + +struct inode_operations chunkfs_special_iops = { + .setattr = chunkfs_setattr, + .permission = chunkfs_permission, +}; diff --git a/start_uml.sh b/start_uml.sh new file mode 100644 index 0000000..c06bd21 --- /dev/null +++ b/start_uml.sh @@ -0,0 +1,21 @@ +#!/bin/bash -x +# +# Start UML. The real test script is run from inside the UML +# partition; I suggest using hostfs to mount it. 
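+# For example (editor's note, not part of the original script), inside the
+# UML guest something like
+#
+#	mount none /chunkfs_bin -t hostfs -o /path/to/chunkfs
+#
+# exposes the host-side chunkfs binaries and test.sh at the BINPATH that
+# test.sh expects; /path/to/chunkfs is a placeholder for the build
+# directory on the host.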
+# +ROOT=/home/val/root_fs_philips + +# Set up some gdb commands + +cat > /tmp/gdb_commands < + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "chunkfs.h" +#include "chunkfs_pool.h" +#include "chunkfs_dev.h" +#include "chunkfs_chunk.h" +#include "chunkfs_i.h" + +static struct inode *chunkfs_alloc_inode(struct super_block *sb) +{ + /* XXX Make a kmem_cache */ + struct chunkfs_inode_info *ii; + struct inode *inode; + + ii = kzalloc(sizeof (*ii), GFP_KERNEL); + if (!ii) + return NULL; + /* XXX should be done in cache constructor */ + spin_lock_init(&ii->ii_continuations_lock); + /* Don't load head continuation until file open */ + inode = &ii->ii_vnode; + inode_init_once(inode); + inode->i_version = 1; + + return inode; +} + +static void chunkfs_destroy_inode(struct inode *inode) +{ + struct chunkfs_inode_info *ii = CHUNKFS_I(inode); + + printk(KERN_ERR "%s(): ino %0lx i_count %d\n", __FUNCTION__, + inode->i_ino, atomic_read(&inode->i_count)); + + kfree(ii); +} + +static void chunkfs_clear_inode(struct inode *inode) +{ + struct chunkfs_inode_info *ii = CHUNKFS_I(inode); + + printk(KERN_ERR "%s(): ino %0lx i_count %d\n", + __FUNCTION__, inode->i_ino, atomic_read(&inode->i_count)); + iput(ii->ii_client_inode); +} + +static int +chunkfs_read_client_sb(struct chunkfs_chunk_info *ci) +{ + /* XXX XXX XXX There aren't enough XXX's in the world XXX XXX */ + char *path_prefix = "/chunk"; + char mount_path[strlen(path_prefix + 10)]; + struct nameidata nd; + int retval; + + /* + * Userland has kindly mounted our client fs's in particular + * locations. Look up the path and grab the superblock for + * each chunk. + * + * XXX Yuckity yuckity yuck yuck + */ + sprintf(mount_path, "%s%llu", path_prefix, ci->ci_chunk_id); + retval = path_lookup(mount_path, LOOKUP_FOLLOW, &nd); + if (retval) { + printk(KERN_ERR "path_lookup for %s failed: %d\n", + mount_path, retval); + return retval; + } + /* XXX locking XXX prevent unmount XXX ref count XXX XXX */ + ci->ci_mnt = mntget(nd.mnt); + ci->ci_sb = nd.mnt->mnt_sb; + path_release(&nd); + + return 0; +} + +struct chunkfs_chunk_info * +chunkfs_find_chunk(struct chunkfs_pool_info *pi, u64 chunk_id) +{ + struct chunkfs_dev_info *di; + struct chunkfs_chunk_info *ci; + + list_for_each_entry(di, &pi->pi_dlist_head, di_dlist) { + list_for_each_entry(ci, &di->di_clist_head, ci_clist) { + if(ci->ci_chunk_id == chunk_id) + return ci; + } + } + return NULL; +} + +static void chunkfs_free_chunk(struct chunkfs_chunk_info *ci) +{ + brelse(ci->ci_bh); + mntput(ci->ci_mnt); + kfree(ci); +} + +static void chunkfs_free_dev(struct chunkfs_dev_info *di) +{ + struct chunkfs_chunk_info *ci, *ci_next; + + list_for_each_entry_safe(ci, ci_next, &di->di_clist_head, ci_clist) { + list_del(&ci->ci_clist); + chunkfs_free_chunk(ci); + } + brelse(di->di_bh); + kfree(di); +} + +static void chunkfs_free_pool(struct chunkfs_pool_info *pi) +{ + struct chunkfs_dev_info *di, *di_next; + + list_for_each_entry_safe(di, di_next, &pi->pi_dlist_head, di_dlist) { + list_del(&di->di_dlist); + chunkfs_free_dev(di); + } + brelse(pi->pi_bh); + kfree(pi); +} + +static int chunkfs_read_chunk(struct super_block *sb, + struct chunkfs_dev_info *dev, + struct chunkfs_chunk_info **chunk_info, + ci_byte_t chunk_offset, + ci_byte_t *next_chunk_offset) +{ + struct chunkfs_chunk_info *ci; + struct chunkfs_chunk *chunk; + struct buffer_head *bh; + int retval = -EIO; + int err; + + ci = kzalloc(sizeof(*ci), GFP_KERNEL); + if (!ci) 
+ return -ENOMEM; + + /* XXX assumes offset is multiple of underlying block size */ + + if (!(bh = sb_bread(sb, chunk_offset/CHUNKFS_BLK_SIZE))) { + printk (KERN_ERR "chunkfs: unable to read chunk summary at %llu", + chunk_offset); + goto out_nobh; + } + + ci->ci_bh = bh; + chunk = CHUNKFS_CHUNK(ci); + + if ((err = check_chunk(chunk)) != 0) { + printk (KERN_ERR "chunkfs: Invalid chunk summary, err %d, chksum %0x\n", + err, le32_to_cpu(chunk->c_chksum)); + goto out; + } + + /* Fill in on-disk info */ + ci->ci_flags = cpu_to_le64(chunk->c_flags); + *next_chunk_offset = cpu_to_le64(chunk->c_next_chunk); + ci->ci_chunk_id = cpu_to_le64(chunk->c_chunk_id); + memcpy(ci->ci_client_fs, chunk->c_client_fs, CHUNKFS_CLIENT_NAME_LEN); + + /* Init non-disk stuff */ + ci->ci_dev = dev; + + /* Mount the client file system */ + retval = chunkfs_read_client_sb(ci); + if (retval) + goto out; + + *chunk_info = ci; + return 0; + out: + brelse(bh); + ci->ci_bh = NULL; + out_nobh: + kfree(ci); + BUG_ON(retval == 0); + return retval; +} + +static int chunkfs_read_dev(struct super_block *sb, + struct chunkfs_pool_info *pool_info, + struct chunkfs_dev_info **dev_info) +{ + struct chunkfs_dev_info *di; + struct chunkfs_dev *dev; + struct buffer_head * bh; + struct chunkfs_chunk_info *ci, *ci_next; + ci_byte_t chunk_offset, next_chunk_offset; + int retval = -EIO; + int err; + + di = kzalloc(sizeof(*di), GFP_KERNEL); + if (!di) + return -ENOMEM; + + /* XXX assumes sb offset is multiple of underlying block size */ + + if (!(bh = sb_bread(sb, CHUNKFS_DEV_BLK))) { + printk (KERN_ERR "chunkfs: unable to read dev summary\n"); + goto out_nobh; + } + + di->di_bh = bh; + dev = CHUNKFS_DEV(di); + + if ((err = check_dev(dev)) != 0) { + printk (KERN_ERR "chunkfs: Invalid dev summary err %d chksum %0x\n", + err, le32_to_cpu(dev->d_chksum)); + goto out_bh; + } + /* Fill in on-disk info */ + di->di_flags = cpu_to_le64(dev->d_flags); + chunk_offset = cpu_to_le64(dev->d_innards_begin); + + /* Init non-disk stuff */ + INIT_LIST_HEAD(&di->di_clist_head); + di->di_pool = pool_info; + + /* XXX would like to sanity check dev size here */ + + while (chunk_offset != 0) { + retval = chunkfs_read_chunk(sb, di, &ci, chunk_offset, + &next_chunk_offset); + if (retval) + goto out_free_chunks; + list_add_tail(&ci->ci_clist, &di->di_clist_head); + if (CHUNKFS_IS_ROOT(ci)) { + BUG_ON(di->di_pool->pi_root_dev); + di->di_pool->pi_root_dev = di; + BUG_ON(di->di_root_chunk); + di->di_root_chunk = ci; + } + chunk_offset = next_chunk_offset; + } + + /* Did we find root? 
*/ + if (!di->di_root_chunk) { + printk(KERN_ERR "chunkfs: did not find root\n"); + goto out_free_chunks; + } + *dev_info = di; + return 0; + out_free_chunks: + list_for_each_entry_safe(ci, ci_next, &di->di_clist_head, ci_clist) { + list_del(&ci->ci_clist); + chunkfs_free_chunk(ci); + } + out_bh: + brelse(bh); + di->di_bh = NULL; + out_nobh: + kfree(di); + return retval; +} + +static int chunkfs_read_pool(struct super_block *sb, + struct chunkfs_pool_info **pool_info) +{ + struct chunkfs_pool_info *pi; + struct chunkfs_pool *pool; + struct buffer_head * bh; + struct chunkfs_dev_info *di; + int retval = -EIO; + int err; + + pi = kzalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) + return -ENOMEM; + + /* XXX assumes sb offset is multiple of underlying block size */ + + if (!(bh = sb_bread(sb, CHUNKFS_POOL_BLK))) { + printk (KERN_ERR "chunkfs: unable to read pool summary\n"); + goto out_nobh; + } + + pi->pi_bh = bh; + pool = CHUNKFS_POOL(pi); + + if ((err = check_pool(pool)) != 0) { + printk (KERN_ERR "chunkfs: Invalid pool summary, err %d chksum %0x magic %0x\n", + err, le32_to_cpu(pool->p_chksum), le32_to_cpu(pool->p_magic)); + goto out; + } + /* Fill in on-disk info */ + pi->pi_flags = cpu_to_le64(pool->p_flags); + + /* Init non-disk stuff */ + INIT_LIST_HEAD(&pi->pi_dlist_head); + + /* XXX read multiple devs */ + /* For now, we just read at a particular offset on this dev */ + retval = chunkfs_read_dev(sb, pi, &di); + if (retval) + goto out; + list_add_tail(&di->di_dlist, &pi->pi_dlist_head); + + *pool_info = pi; + return 0; + out: + brelse(bh); + pi->pi_bh = NULL; + out_nobh: + kfree(pi); + return retval; +} + +static void chunkfs_commit_super (struct super_block *sb, int sync) +{ + struct buffer_head *sbh = CHUNKFS_PI(sb)->pi_bh; + + if (!sbh) + return; + mark_buffer_dirty(sbh); + if (sync) + sync_dirty_buffer(sbh); +} + +static void chunkfs_put_super (struct super_block *sb) +{ + struct chunkfs_pool_info *pi = CHUNKFS_PI(sb); + + if (!(sb->s_flags & MS_RDONLY)) { + /* XXX should mark super block as clean unmounted */ + chunkfs_commit_super(sb, 1); + } + chunkfs_free_pool(pi); + sb->s_fs_info = NULL; + + return; +} + +static void +chunkfs_write_super (struct super_block * sb) +{ + if (mutex_trylock(&sb->s_lock) != 0) + BUG(); + chunkfs_commit_super(sb, 1); + sb->s_dirt = 0; +} + + +static struct super_operations chunkfs_sops = { + .alloc_inode = chunkfs_alloc_inode, + .destroy_inode = chunkfs_destroy_inode, + .read_inode = chunkfs_read_inode, + .write_inode = chunkfs_write_inode, +#if 0 /* XXX Totally unimplemented at present */ + .dirty_inode = chunkfs_dirty_inode, + .delete_inode = chunkfs_delete_inode, +#endif + .put_super = chunkfs_put_super, + .write_super = chunkfs_write_super, +#if 0 + .sync_fs = chunkfs_sync_fs, + .write_super_lockfs = chunkfs_write_super_lockfs, + .unlockfs = chunkfs_unlockfs, + .statfs = chunkfs_statfs, + .remount_fs = chunkfs_remount, +#endif + .clear_inode = chunkfs_clear_inode, +#if 0 + .show_options = chunkfs_show_options, +#endif +}; + +/* + * The file system in the root chunk has already been mounted, so the + * chunk root inode is already loaded and stored in the superblock. + * However, we really want to have the root directory in terms of the + * chunkfs namespace, which is presently named "/root" and inode 12. 
+ */ + +static int chunkfs_read_root(struct super_block *sb) +{ + struct chunkfs_chunk_info *ci = CHUNKFS_PI(sb)->pi_root_dev->di_root_chunk; + ino_t ino = MAKE_UINO(ci->ci_chunk_id, 12); /* XXX */ + struct inode *inode; + struct nameidata nd; + struct dentry *dentry; + int retval; + + inode = iget(sb, ino); + sb->s_root = d_alloc_root(inode); + if (!sb->s_root) { + retval = -ENOMEM; + goto out_iput; + } + retval = chunkfs_init_dentry(sb->s_root); + if (retval) + goto out_dput; + retval = path_lookup("/chunk1/root/", LOOKUP_FOLLOW, &nd); + if (retval) + goto out_dentry; + dentry = dget(nd.dentry); + chunkfs_init_nd(inode, sb->s_root, dentry, ci->ci_chunk_id); + chunkfs_add_dentry(sb->s_root, dentry, nd.mnt); + path_release(&nd); + return 0; + out_dentry: + chunkfs_free_dentry(sb->s_root); + out_dput: + dput(sb->s_root); + out_iput: + iput(inode); + printk(KERN_ERR "%s() path lookup failed\n", __FUNCTION__); + return retval; +} + +/* + * chunkfs_setup_super does all things that are shared between mount + * and remount. At moment, I'm not sure what they are. + */ + +static int chunkfs_setup_super(struct super_block *sb, + struct chunkfs_pool_info *pi, + int read_only) +{ + return 0; +} + +/* + * Get the superblock off the disk and check to see if it is sane. + * + * Note that VFS code has a generic routine to find alternate superblocks. + * + * XXX todo, put dev summary copies in chunk summaries. + */ + +static int chunkfs_fill_super (struct super_block *sb, void *data, int silent) +{ + struct chunkfs_pool_info *pi; + int retval = -EINVAL; + + unlock_kernel(); + + printk(KERN_ERR "%s\n", __FUNCTION__); + + /* We must set blocksize before we can read blocks. */ + + if (sb_set_blocksize(sb, CHUNKFS_BLK_SIZE) == 0) + goto out; + + retval = chunkfs_read_pool(sb, &pi); + if (retval) + goto out; + sb->s_fs_info = pi; + + sb->s_maxbytes = ~0ULL; + sb->s_op = &chunkfs_sops; + + retval = chunkfs_read_root(sb); + if (retval) + goto out; + /* If fail after this, dput sb->s_root */ + + chunkfs_setup_super (sb, pi, sb->s_flags & MS_RDONLY); + + printk(KERN_ERR "chunkfs: mounted file system\n"); + lock_kernel(); + return 0; + out: + lock_kernel(); + BUG_ON(retval == 0); + printk(KERN_ERR "%s() failed! 
err %d\n", __FUNCTION__, retval); + return retval; +} + +static int chunkfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) +{ + return get_sb_bdev(fs_type, flags, dev_name, data, chunkfs_fill_super, mnt); +} + +static struct file_system_type chunkfs_fs_type = { + .owner = THIS_MODULE, + .name = "chunkfs", + .get_sb = chunkfs_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_chunkfs_fs(void) +{ + int err = register_filesystem(&chunkfs_fs_type); + if (!err) + printk(KERN_INFO "chunkfs (C) 2007 Valerie Henson " + "\n"); + return err; +} + +static void __exit exit_chunkfs_fs(void) +{ + unregister_filesystem(&chunkfs_fs_type); +} + +MODULE_AUTHOR("Val Henson"); +MODULE_DESCRIPTION("Chunkfs"); +MODULE_LICENSE("GPL"); +module_init(init_chunkfs_fs) +module_exit(exit_chunkfs_fs) diff --git a/symlink.c b/symlink.c new file mode 100644 index 0000000..a650e53 --- /dev/null +++ b/symlink.c @@ -0,0 +1,62 @@ +/* + * Chunkfs symlinks + * + * (C) 2007-2008 Valerie Henson + */ + +#include "chunkfs.h" +#include "chunkfs_i.h" + +static int +chunkfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + struct inode *client_inode = get_client_inode(dentry->d_inode); + struct dentry *client_dentry = get_client_dentry(dentry); + int err; + + printk (KERN_ERR "%s()\n", __FUNCTION__); + + err = client_inode->i_op->readlink(client_dentry, buffer, buflen); + + return err; +} + +static void * +chunkfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *client_inode = get_client_inode(dentry->d_inode); + struct dentry *client_dentry = get_client_dentry(dentry); + struct nameidata *client_nd = get_client_nd(dentry); + void *cookie; + + printk (KERN_ERR "%s()\n", __FUNCTION__); + + chunkfs_copy_down_nd(nd, client_nd); + + cookie = client_inode->i_op->follow_link(client_dentry, client_nd); + + chunkfs_copy_up_nd(nd, client_nd); + + return cookie; +} + +static void +chunkfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +{ + struct inode *client_inode = get_client_inode(dentry->d_inode); + struct dentry *client_dentry = get_client_dentry(dentry); + struct nameidata *client_nd = get_client_nd(dentry); + + printk (KERN_ERR "%s()\n", __FUNCTION__); + if (client_inode->i_op->put_link) { + chunkfs_copy_down_nd(nd, client_nd); + client_inode->i_op->put_link(client_dentry, client_nd, cookie); + chunkfs_copy_up_nd(nd, client_nd); + } +} + +struct inode_operations chunkfs_symlink_iops = { + .readlink = chunkfs_readlink, + .follow_link = chunkfs_follow_link, + .put_link = chunkfs_put_link, +}; diff --git a/test.sh b/test.sh new file mode 100644 index 0000000..1364729 --- /dev/null +++ b/test.sh @@ -0,0 +1,114 @@ +#!/bin/bash -x +# +# Test script for chunkfs, to be run from inside UML. +# + +# Name of the file backing the loop device +FILE=/loop/disk0 +# This is where the chunkfs user binaries are located. +BINPATH=/chunkfs_bin +MNT=/mnt + +# Clean up from last iteration if necessary. + +MOUNTED=`cat /proc/mounts | grep "${DEV} ${MNT} chunkfs"` +if [ -n "$MOUNTED" ]; then + umount ${MNT} + if [ "$?" != "0" ]; then + echo "umount failed" + exit 1 + fi +fi + +# Tear down old mounts and loop devices +loop_num=$((0)) +while (losetup /dev/loop$loop_num); do + umount /chunk${loop_num} + losetup -d /dev/loop$loop_num + loop_num=$((loop_num + 1)) +done + +${BINPATH}/write_pattern ${FILE} +if [ "$?" 
!= "0" ]; then + echo "write_pattern failed" + exit 1 +fi + +${BINPATH}/mkfs.chunkfs ${FILE} > /tmp/offsetlist +if [ "$?" != "0" ]; then + echo "mkfs.chunkfs failed" + exit 1 +fi + +# Create primary loop device +losetup /dev/loop0 ${FILE} +if [ "$?" != "0" ]; then + echo "Create loop device failed" + exit 1 +fi + +# XXX Wow, like, such a hack. Set up a bunch of block devices +# starting at different offsets in order to create ext2 file systems +# inside chunks. losetup doesn't include an end argument, so there's +# no protection against one file system having a bug that scribbles +# over the following file systems. Also, mkfs should do this +# directly. +# +# XXX More hackery. Mount all our client fs's so that chunkfs kernel +# side can lookup the path and grab the superblocks. + +OFFSETS="`awk '/clientfs: start/ {print $3}' /tmp/offsetlist`" +loop_num=$((1)) +for offset in ${OFFSETS}; do + losetup -o $offset /dev/loop$loop_num ${FILE} + mke2fs -b 4096 /dev/loop$loop_num 2559 > /dev/null + mkdir -p /chunk${loop_num} + mount -t ext2 -o user_xattr /dev/loop${loop_num} /chunk${loop_num} + if [ "$?" != "0" ]; then + echo "mount client fs failed" + exit 1 + fi + # Only the root chunk has the root directory + if [ "$loop_num" == "1" ]; then + mkdir -p /chunk${loop_num}/root + # Create continuation data + # Hackity hack. Just create it on /root if it doesn't already + # exist. + /usr/local/usr/bin/setfattr -n user.next -v 0 /chunk${loop_num}/root + /usr/local/usr/bin/setfattr -n user.prev -v 0 /chunk${loop_num}/root + /usr/local/usr/bin/setfattr -n user.start -v 0 /chunk${loop_num}/root + /usr/local/usr/bin/setfattr -n user.len -v 40960 /chunk${loop_num}/root + fi + for i in 0 1 2 3 4 5 6 7 8 9 private; do + mkdir -p /chunk${loop_num}/$i + done + loop_num=$((loop_num + 1)) +done + +${BINPATH}/mount_chunkfs /dev/loop0 ${MNT} +if [ "$?" != "0" ]; then + echo "mount_chunkfs failed" + exit 1 +fi + +# Now test a few more things + +ls /mnt +touch /mnt/a_file +touch /mnt/another_file +echo "file data" > /mnt/a_file_with_data +cat /mnt/a_file_with_data +rm /mnt/another_file +mkdir /mnt/a_dir +mkdir /mnt/a_dir/a_dir +mkdir /mnt/a_dir/another_dir +rmdir /mnt/a_dir/another_dir +ln -s /mnt/a_file /mnt/a_symlink +ls -l /mnt/a_symlink +cat /mnt/a_symlink +dd if=/dev/zero of=/mnt/big bs=4096 count=11 +ls -l /mnt/big +ls -l /chunk1/root/big +ls -l /chunk2/1/29 + +exit 0 diff --git a/write_pattern.c b/write_pattern.c new file mode 100644 index 0000000..f0c66b2 --- /dev/null +++ b/write_pattern.c @@ -0,0 +1,61 @@ +/* + * Write a pattern to a file. + * + * (C) 2007 Valerie Henson + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define FILE_SIZE (32 * 1024 * 1024) + +static char * cmd; + +static void usage (void) +{ + fprintf(stderr, "Usage: %s \n", cmd); + exit(1); +} + +int main (int argc, char * argv[]) +{ + int fd; + char * file; + char buf[4096]; + int n = 0; + int written = 0; + + cmd = argv[0]; + + if (argc != 2) + usage(); + + file = argv[1]; + + if ((fd = open(file, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR)) < 0) { + error(0, errno, "Cannot open file %s", file); + usage(); + } + + memset(buf, '5', sizeof(buf)); + + while (written < FILE_SIZE) { + n = write(fd, buf, sizeof(buf)); + if (n == -1) { + error(0, errno, "Cannot write file %s", file); + usage(); + } + written += n; + } + + close(fd); + + return 0; +}