diff --git a/CHANGELOG.md b/CHANGELOG.md index a0443f3..4b51808 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,16 +1,27 @@ # v0.3.0 (2024-05-03) + - MSRV is now 1.76 stable - added support for more Tar archives - - 256 character long filename support (prefix + name) - - add support for space terminated numbers - - non-null terminated names - - iterate over directories: read regular files from directories - - more info: + - 256 character long filename support (prefix + name) + - add support for space terminated numbers + - non-null terminated names + - iterate over directories: read regular files from directories + - more info: - `TarArchive[Ref]::new` now returns a result - added `unstable` feature with enhanced functionality for `nightly` compilers - - error types implement `core::error::Error` + - error types implement `core::error::Error` +- various bug fixes and code improvements +- better error reporting / less panics + +Special thanks to the following external contributors or helpers: + +- https://github.com/thenhnn: provide me with a bunch of Tar archives coming + from a fuzzer +- https://github.com/schnoberts1 implemented 256 character long filenames (ustar + Tar format) # v0.2.0 (2023-04-11) + - MSRV is 1.60.0 - bitflags bump: 1.x -> 2.x - few internal code improvements (less possible panics) diff --git a/README.md b/README.md index eb75e5f..fd6af30 100644 --- a/README.md +++ b/README.md @@ -1,59 +1,65 @@ # `tar-no-std` - Parse Tar Archives (Tarballs) -_Due to historical reasons, there are several formats of tar archives. All of them are based on the same principles, -but have some subtle differences that often make them incompatible with each other._ [0] +_Due to historical reasons, there are several formats of Tar archives. 
All of +them are based on the same principles, but have some subtle differences that +often make them incompatible with each other._ [(reference)](https://www.gnu.org/software/tar/manual/html_section/Formats.html) -Library to read Tar archives (by GNU Tar) in `no_std` contexts with zero allocations. If you have a standard -environment and need full feature support, I recommend the use of instead. +Library to read Tar archives in `no_std` environments with zero allocations. If +you have a standard environment and need full feature support, I recommend the +use of instead. ## Limitations -The crate is simple and only supports reading of "basic" archives, therefore no extensions, such -as GNU Longname. The maximum supported file name length is 256 characters excluding the NULL-byte (using the tar name/prefix longname implementation). The maximum supported file size is 8GiB. Directories are supported, but only regular fields are yielded in iteration. +This crate is simple and focuses on reading files and their content from a Tar +archive. Historic basic Tar and ustar [formats](https://www.gnu.org/software/tar/manual/html_section/Formats.html) +are supported. Other formats may work, but likely without all supported +features. GNU Extensions such as sparse files, incremental archives, and long +filename extension are not supported. + +The maximum supported file name length is 256 characters excluding the +NULL-byte (using the Tar name/prefix longname implementation of ustar). The +maximum supported file size is 8GiB. Directories are supported, but only regular +files are yielded in iteration. The path is reflected in their file name. ## Use Case -This library is useful, if you write a kernel or a similar low-level application, which needs -"a bunch of files" from an archive ("init ramdisk"). The Tar file could for example come -as a Multiboot2 boot module provided by the bootloader. 
+This library is useful, if you write a kernel or a similar low-level +application, which needs "a bunch of files" from an archive (like an +"init ramdisk"). The Tar file could for example come as a Multiboot2 boot module +provided by the bootloader. -This crate focuses on extracting files from uncompressed Tar archives created with default options by **GNU Tar**. -GNU Extensions such as sparse files, incremental archives, and long filename extension are not supported yet. -[This link](https://www.gnu.org/software/tar/manual/html_section/Formats.html) gives a good overview over possible -archive formats and their limitations. +## Example -## Example (without `alloc`-feature) ```rust use tar_no_std::TarArchiveRef; fn main() { - // log: not mandatory + // init a logger (optional) std::env::set_var("RUST_LOG", "trace"); env_logger::init(); // also works in no_std environment (except the println!, of course) let archive = include_bytes!("../tests/gnu_tar_default.tar"); - let archive = TarArchiveRef::new(archive); + let archive = TarArchiveRef::new(archive).unwrap(); // Vec needs an allocator of course, but the library itself doesn't need one let entries = archive.entries().collect::>(); println!("{:#?}", entries); - println!("content of last file:"); - println!("{:#?}", entries[2].data_as_str().expect("Should be valid UTF-8")); } ``` -## Alloc Feature -This crate allows the usage of the additional Cargo build time feature `alloc`. When this is used, -the crate also provides the type `TarArchive`, which owns the data on the heap. +## Cargo Feature + +This crate allows the usage of the additional Cargo build time feature `alloc`. +When this is active, the crate also provides the type `TarArchive`, which owns +the data on the heap. The `unstable` feature provides additional convenience +only available on the nightly channel. ## Compression (`tar.gz`) -If your tar file is compressed, e.g. by `.tar.gz`/`gzip`, you need to uncompress the bytes first -(e.g. 
by a *gzip* library). Afterwards, this crate can read the Tar archive format from the uncompressed -bytes. -## MSRV -The MSRV is 1.76.0 stable. +If your Tar file is compressed, e.g. by `.tar.gz`/`gzip`, you need to uncompress +the bytes first (e.g. by a *gzip* library). Afterwards, this crate can read the +Tar archive format from the uncompressed bytes. +## MSRV -## References -[0]\: https://www.gnu.org/software/tar/manual/html_section/Formats.html +The MSRV is 1.76.0 stable. diff --git a/examples/alloc_feature.rs b/examples/alloc_feature.rs index a631c3e..599e649 100644 --- a/examples/alloc_feature.rs +++ b/examples/alloc_feature.rs @@ -27,6 +27,7 @@ use tar_no_std::TarArchive; fn main() { // log: not mandatory std::env::set_var("RUST_LOG", "trace"); + std::env::set_var("RUST_LOG_STYLE", "always"); env_logger::init(); // also works in no_std environment (except the println!, of course) diff --git a/examples/minimal.rs b/examples/minimal.rs index e0140d5..a8ced6b 100644 --- a/examples/minimal.rs +++ b/examples/minimal.rs @@ -26,6 +26,7 @@ use tar_no_std::TarArchiveRef; fn main() { // log: not mandatory std::env::set_var("RUST_LOG", "trace"); + std::env::set_var("RUST_LOG_STYLE", "always"); env_logger::init(); // also works in no_std environment (except the println!, of course) diff --git a/src/archive.rs b/src/archive.rs index f57f771..1a19d21 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -101,8 +101,7 @@ impl Display for CorruptDataError { impl core::error::Error for CorruptDataError {} /// Type that owns bytes on the heap, that represents a Tar archive. -/// Unlike [`TarArchiveRef`], this type is useful, if you need to own the -/// data as long as you need the archive, but no longer. +/// Unlike [`TarArchiveRef`], this type takes ownership of the data. /// /// This is only available with the `alloc` feature of this crate. 
#[cfg(feature = "alloc")] @@ -144,8 +143,8 @@ impl From for Box<[u8]> { } } -/// Wrapper type around bytes, which represents a Tar archive. -/// Unlike [`TarArchive`], this uses only a reference to the data. +/// Wrapper type around bytes, which represents a Tar archive. To iterate the +/// entries, use [`TarArchiveRef::entries`]. #[derive(Clone, Debug, PartialEq, Eq)] pub struct TarArchiveRef<'a> { data: &'a [u8], @@ -162,9 +161,7 @@ impl<'a> TarArchiveRef<'a> { .ok_or(CorruptDataError) } - /// Iterates over all entries of the Tar archive. - /// Returns items of type [`ArchiveEntry`]. - /// See also [`ArchiveEntryIterator`]. + /// Creates an [`ArchiveEntryIterator`]. pub fn entries(&self) -> ArchiveEntryIterator { ArchiveEntryIterator::new(self.data) } @@ -174,7 +171,7 @@ impl<'a> TarArchiveRef<'a> { #[derive(Debug)] pub struct ArchiveHeaderIterator<'a> { archive_data: &'a [u8], - block_index: usize, + next_hdr_block_index: usize, } impl<'a> ArchiveHeaderIterator<'a> { @@ -183,7 +180,7 @@ impl<'a> ArchiveHeaderIterator<'a> { assert_eq!(archive.len() % BLOCKSIZE, 0); Self { archive_data: archive, - block_index: 0, + next_hdr_block_index: 0, } } @@ -211,25 +208,31 @@ impl<'a> Iterator for ArchiveHeaderIterator<'a> { /// This returns `None` if either no further headers are found or if a /// header can't be parsed. fn next(&mut self) -> Option { - // TODO better check for two end zero blocks here? - assert!(self.block_index < self.archive_data.len() / BLOCKSIZE); + let total_block_count = self.archive_data.len() / BLOCKSIZE; + if self.next_hdr_block_index >= total_block_count { + warn!("Invalid block index. Probably the Tar is corrupt: an header had an invalid payload size"); + return None; + } - let hdr = self.block_as_header(self.block_index); - let block_index = self.block_index; + let hdr = self.block_as_header(self.next_hdr_block_index); + let block_index = self.next_hdr_block_index; // Start at next block on next iteration. 
- self.block_index += 1; - log::info!("{:#?}, {:#?}", hdr.name, hdr.typeflag); - - let block_count = hdr - .payload_block_count() - .inspect_err(|e| { - log::error!("Unparsable size ({e:?}) in header {hdr:#?}"); - }) - .ok()?; - - if !hdr.is_zero_block() { - self.block_index += block_count; + self.next_hdr_block_index += 1; + + // We only update the block index for types that have a payload. + // In directory entries, for example, the size field has other + // semantics. See spec. + if let Ok(typeflag) = hdr.typeflag.try_to_type_flag() { + if typeflag.is_regular_file() { + let payload_block_count = hdr + .payload_block_count() + .inspect_err(|e| { + log::error!("Unparsable size ({e:?}) in header {hdr:#?}"); + }) + .ok()?; + self.next_hdr_block_index += payload_block_count; + } } Some((block_index, hdr)) @@ -239,11 +242,15 @@ impl<'a> Iterator for ArchiveHeaderIterator<'a> { impl<'a> ExactSizeIterator for ArchiveEntryIterator<'a> {} /// Iterator over the files of the archive. +/// +/// Only regular files are supported, but not directories, links, or other +/// special types ([`crate::TypeFlag`]). The full path to files is reflected +/// in their file name. #[derive(Debug)] pub struct ArchiveEntryIterator<'a>(ArchiveHeaderIterator<'a>); impl<'a> ArchiveEntryIterator<'a> { - pub fn new(archive: &'a [u8]) -> Self { + fn new(archive: &'a [u8]) -> Self { Self(ArchiveHeaderIterator::new(archive)) } @@ -260,7 +267,13 @@ impl<'a> Iterator for ArchiveEntryIterator<'a> { // Ignore directory entries, i.e. yield only regular files. Works as // filenames in tarballs are fully specified, e.g. dirA/dirB/file1 - while !hdr.typeflag.is_regular_file() { + while !hdr + .typeflag + .try_to_type_flag() + .inspect_err(|e| error!("Invalid TypeFlag: {e:?}")) + .ok()? 
+ .is_regular_file() + { warn!( "Skipping entry of type {:?} (not supported yet)", hdr.typeflag @@ -289,13 +302,25 @@ impl<'a> Iterator for ArchiveEntryIterator<'a> { let idx_first_data_block = block_index + 1; let idx_begin = idx_first_data_block * BLOCKSIZE; let idx_end_exclusive = idx_begin + payload_size; + + let max_data_end_index_exclusive = self.0.archive_data.len() - 2 * BLOCKSIZE; + if idx_end_exclusive > max_data_end_index_exclusive { + warn!("Invalid Tar. The size of the payload ({payload_size}) is larger than what is valid"); + return None; + } + let file_bytes = &self.0.archive_data[idx_begin..idx_end_exclusive]; let mut filename: TarFormatString<256> = TarFormatString::::new([0; POSIX_1003_MAX_FILENAME_LEN]); - if hdr.magic.as_str().unwrap() == "ustar" - && hdr.version.as_str().unwrap() == "00" - && !hdr.prefix.is_empty() + + // POSIX_1003 long filename check + // https://docs.scinet.utoronto.ca/index.php/(POSIX_1003.1_USTAR) + if ( + hdr.magic.as_str(), + hdr.version.as_str(), + hdr.prefix.is_empty(), + ) == (Ok("ustar"), Ok("00"), false) { filename.append(&hdr.prefix); filename.append(&TarFormatString::<1>::new([b'/'])); @@ -332,6 +357,7 @@ mod tests { let names = iter .map(|(_i, hdr)| hdr.name.as_str().unwrap()) .collect::>(); + assert_eq!( names.as_slice(), &[ @@ -340,21 +366,56 @@ mod tests { "hello_world.txt", ] ) + } - /*for hdr in iter { - dbg!(hdr); - }*/ + /// The test here is that no panics occur. + #[test] + fn test_print_archive_headers() { + let data = include_bytes!("../tests/gnu_tar_default.tar"); - // TODO make PartialEq - //assert_eq!(ArchiveHeaderIterator::new(archive).collect::>().as_slice(), &[]); + let iter = ArchiveHeaderIterator::new(data); + let entries = iter.map(|(_, hdr)| hdr).collect::>(); + println!("{:#?}", entries); } + /// The test here is that no panics occur. 
#[test] - fn test_archive_list() { + fn test_print_archive_list() { let archive = TarArchiveRef::new(include_bytes!("../tests/gnu_tar_default.tar")).unwrap(); let entries = archive.entries().collect::>(); println!("{:#?}", entries); } + + /// Tests various weird (= invalid, corrupt) tarballs that are bundled + /// within this file. The tarball(s) originate from a fuzzing process from a + /// GitHub contributor [0]. + /// + /// The test succeeds if no panics occur. + /// + /// [0] https://github.com/phip1611/tar-no-std/issues/12#issuecomment-2092632090 + #[test] + fn test_weird_fuzzing_tarballs() { + /*std::env::set_var("RUST_LOG", "trace"); + std::env::set_var("RUST_LOG_STYLE", "always"); + env_logger::init();*/ + + let main_tarball = + TarArchiveRef::new(include_bytes!("../tests/weird_fuzzing_tarballs.tar")).unwrap(); + + let mut all_entries = vec![]; + for tarball in main_tarball.entries() { + let tarball = TarArchiveRef::new(tarball.data()).unwrap(); + for entry in tarball.entries() { + all_entries.push(entry.filename()); + } + } + + // Test succeeds if this works without a panic. + for entry in all_entries { + eprintln!("\"{entry:?}\","); + } + } + /// Tests to read the entries from existing archives in various Tar flavors. 
#[test] fn test_archive_entries() { @@ -418,27 +479,25 @@ mod tests { } #[test] - fn test_archive_with_dir_entries() { + fn test_default_archive_with_dir_entries() { // tarball created with: // $ gtar -cf tests/gnu_tar_default_with_dir.tar --exclude '*.tar' --exclude '012345678*' tests - { - let archive = - TarArchiveRef::new(include_bytes!("../tests/gnu_tar_default_with_dir.tar")) - .unwrap(); - let entries = archive.entries().collect::>(); + let archive = + TarArchiveRef::new(include_bytes!("../tests/gnu_tar_default_with_dir.tar")).unwrap(); + let entries = archive.entries().collect::>(); - assert_archive_with_dir_content(&entries); - } + assert_archive_with_dir_content(&entries); + } + #[test] + fn test_ustar_archive_with_dir_entries() { // tarball created with: // $(osx) tar -cf tests/mac_tar_ustar_with_dir.tar --format=ustar --exclude '*.tar' --exclude '012345678*' tests - { - let archive = - TarArchiveRef::new(include_bytes!("../tests/mac_tar_ustar_with_dir.tar")).unwrap(); - let entries = archive.entries().collect::>(); + let archive = + TarArchiveRef::new(include_bytes!("../tests/mac_tar_ustar_with_dir.tar")).unwrap(); + let entries = archive.entries().collect::>(); - assert_archive_with_dir_content(&entries); - } + assert_archive_with_dir_content(&entries); } /// Like [`test_archive_entries`] but with additional `alloc` functionality. diff --git a/src/header.rs b/src/header.rs index 696fd42..9d2d5de 100644 --- a/src/header.rs +++ b/src/header.rs @@ -31,9 +31,10 @@ SOFTWARE. #![allow(non_upper_case_globals)] use crate::{TarFormatDecimal, TarFormatOctal, TarFormatString, BLOCKSIZE, NAME_LEN, PREFIX_LEN}; -use core::fmt::{Debug, Formatter}; +use core::fmt::{Debug, Display, Formatter}; use core::num::ParseIntError; +/// Errors that may happen when parsing the [`ModeFlags`]. 
#[derive(Debug)] pub enum ModeError { ParseInt(ParseIntError), @@ -55,68 +56,36 @@ impl Mode { impl Debug for Mode { fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result { - let mut debug = f.debug_tuple("Mode"); - debug.field(&self.to_flags()); - debug.finish() + Debug::fmt(&self.to_flags(), f) } } -/// Header of the TAR format as specified by POSIX (POSIX 1003.1-1990. -/// "New" (version?) GNU Tar versions use this archive format by default. -/// (). -/// -/// Each file is started by such a header, that describes the size and -/// the file name. After that, the file content stands in chunks of 512 bytes. -/// The number of bytes can be derived from the file size. -/// -/// This is also mostly compatible with the "Ustar"-header and the "GNU format". -/// Because this library only needs to fetch data and filename, we don't need -/// further checks. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -#[repr(C, packed)] -pub struct PosixHeader { - pub name: TarFormatString, - pub mode: Mode, - pub uid: TarFormatOctal<8>, - pub gid: TarFormatOctal<8>, - // confusing; size is stored as ASCII string - pub size: TarFormatOctal<12>, - pub mtime: TarFormatDecimal<12>, - pub cksum: TarFormatOctal<8>, - pub typeflag: TypeFlag, - /// Name. There is always a null byte, therefore - /// the max len is 99. - pub linkname: TarFormatString, - pub magic: TarFormatString<6>, - pub version: TarFormatString<2>, - /// Username. There is always a null byte, therefore - /// the max len is N-1. - pub uname: TarFormatString<32>, - /// Groupname. There is always a null byte, therefore - /// the max len is N-1. 
- pub gname: TarFormatString<32>, - pub dev_major: TarFormatOctal<8>, - pub dev_minor: TarFormatOctal<8>, - pub prefix: TarFormatString, - // padding => to BLOCKSIZE bytes - pub _pad: [u8; 12], +#[derive(Copy, Clone, Debug, PartialOrd, PartialEq, Eq)] +pub struct InvalidTypeFlagError(u8); + +impl Display for InvalidTypeFlagError { + fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result { + f.write_fmt(format_args!("{:x} is not a valid TypeFlag", self.0)) + } } -impl PosixHeader { - /// Returns the number of blocks that are required to read the whole file - /// content. Returns an error, if the file size can't be parsed from the - /// header. - pub fn payload_block_count(&self) -> Result { - let parsed_size = self.size.as_number::()?; - Ok(parsed_size.div_ceil(BLOCKSIZE)) +#[cfg(feature = "unstable")] +impl core::error::Error for InvalidTypeFlagError {} + +#[derive(Copy, Clone, PartialOrd, PartialEq, Eq)] +pub struct TypeFlagRaw(u8); + +impl TypeFlagRaw { + /// Tries to parse the underlying value as [`TypeFlag`]. This fails if the + /// Tar file is corrupt and the type is invalid. + pub fn try_to_type_flag(self) -> Result { + TypeFlag::try_from(self) } +} - /// A Tar archive is terminated, if an end-of-archive entry, which consists - /// of two 512 blocks of zero bytes, is found. 
- pub fn is_zero_block(&self) -> bool { - let ptr = self as *const Self as *const u8; - let self_bytes = unsafe { core::slice::from_raw_parts(ptr, BLOCKSIZE) }; - self_bytes.iter().filter(|x| **x == 0).count() == BLOCKSIZE +impl Debug for TypeFlagRaw { + fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result { + Debug::fmt(&self.try_to_type_flag(), f) } } @@ -183,6 +152,27 @@ impl TypeFlag { } } +impl TryFrom for TypeFlag { + type Error = InvalidTypeFlagError; + + fn try_from(value: TypeFlagRaw) -> Result { + match value.0 { + b'0' => Ok(Self::REGTYPE), + b'\0' => Ok(Self::AREGTYPE), + b'1' => Ok(Self::LINK), + b'2' => Ok(Self::SYMTYPE), + b'3' => Ok(Self::CHRTYPE), + b'4' => Ok(Self::BLKTYPE), + b'5' => Ok(Self::DIRTYPE), + b'6' => Ok(Self::FIFOTYPE), + b'7' => Ok(Self::CONTTYPE), + b'x' => Ok(Self::XHDTYPE), + b'g' => Ok(Self::XGLTYPE), + e => Err(InvalidTypeFlagError(e)), + } + } +} + bitflags::bitflags! { /// UNIX file permissions in octal format. #[repr(transparent)] @@ -215,6 +205,65 @@ bitflags::bitflags! { } } +/// Header of the TAR format as specified by POSIX (POSIX 1003.1-1990. +/// "New" (version?) GNU Tar versions use this archive format by default. +/// (). +/// +/// Each file is started by such a header, that describes the size and +/// the file name. After that, the file content stands in chunks of 512 bytes. +/// The number of bytes can be derived from the file size. +/// +/// This is also mostly compatible with the "Ustar"-header and the "GNU format". +/// Because this library only needs to fetch data and filename, we don't need +/// further checks. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[repr(C, packed)] +pub struct PosixHeader { + pub name: TarFormatString, + pub mode: Mode, + pub uid: TarFormatOctal<8>, + pub gid: TarFormatOctal<8>, + // confusing; size is stored as ASCII string + pub size: TarFormatOctal<12>, + pub mtime: TarFormatDecimal<12>, + pub cksum: TarFormatOctal<8>, + pub typeflag: TypeFlagRaw, + /// Name. 
There is always a null byte, therefore + /// the max len is 99. + pub linkname: TarFormatString, + pub magic: TarFormatString<6>, + pub version: TarFormatString<2>, + /// Username. There is always a null byte, therefore + /// the max len is N-1. + pub uname: TarFormatString<32>, + /// Groupname. There is always a null byte, therefore + /// the max len is N-1. + pub gname: TarFormatString<32>, + pub dev_major: TarFormatOctal<8>, + pub dev_minor: TarFormatOctal<8>, + pub prefix: TarFormatString, + // padding => to BLOCKSIZE bytes + pub _pad: [u8; 12], +} + +impl PosixHeader { + /// Returns the number of blocks that are required to read the whole file + /// content. Returns an error, if the file size can't be parsed from the + /// header. + pub fn payload_block_count(&self) -> Result { + let parsed_size = self.size.as_number::()?; + Ok(parsed_size.div_ceil(BLOCKSIZE)) + } + + /// A Tar archive is terminated, if an end-of-archive entry, which consists + /// of two 512 blocks of zero bytes, is found. + pub fn is_zero_block(&self) -> bool { + let ptr = self as *const Self as *const u8; + let self_bytes = unsafe { core::slice::from_raw_parts(ptr, BLOCKSIZE) }; + self_bytes.iter().filter(|x| **x == 0).count() == BLOCKSIZE + } +} + #[cfg(test)] mod tests { use crate::header::{PosixHeader, TypeFlag}; @@ -283,24 +332,24 @@ mod tests { fn test_parse_tar_header_filename() { let archive = bytes_to_archive(include_bytes!("../tests/gnu_tar_default.tar")); assert_eq!( - archive.typeflag, - TypeFlag::REGTYPE, + archive.typeflag.try_to_type_flag(), + Ok(TypeFlag::REGTYPE), "the first entry is a regular file!" ); assert_eq!(archive.name.as_str(), Ok("bye_world_513b.txt")); let archive = bytes_to_archive(include_bytes!("../tests/gnu_tar_gnu.tar")); assert_eq!( - archive.typeflag, - TypeFlag::REGTYPE, + archive.typeflag.try_to_type_flag(), + Ok(TypeFlag::REGTYPE), "the first entry is a regular file!" 
); assert_eq!(archive.name.as_str(), Ok("bye_world_513b.txt")); let archive = bytes_to_archive(include_bytes!("../tests/gnu_tar_oldgnu.tar")); assert_eq!( - archive.typeflag, - TypeFlag::REGTYPE, + archive.typeflag.try_to_type_flag(), + Ok(TypeFlag::REGTYPE), "the first entry is a regular file!" ); assert_eq!(archive.name.as_str(), Ok("bye_world_513b.txt")); @@ -317,8 +366,8 @@ mod tests { let archive = bytes_to_archive(include_bytes!("../tests/gnu_tar_ustar.tar")); assert_eq!( - archive.typeflag, - TypeFlag::REGTYPE, + archive.typeflag.try_to_type_flag(), + Ok(TypeFlag::REGTYPE), "the first entry is a regular file!" ); assert_eq!(archive.name.as_str(), Ok("bye_world_513b.txt")); @@ -326,8 +375,8 @@ mod tests { let archive = bytes_to_archive(include_bytes!("../tests/gnu_tar_v7.tar")); // ARegType: legacy assert_eq!( - archive.typeflag, - TypeFlag::AREGTYPE, + archive.typeflag.try_to_type_flag(), + Ok(TypeFlag::AREGTYPE), "the first entry is a regular file!" ); assert_eq!(archive.name.as_str(), Ok("bye_world_513b.txt")); diff --git a/src/lib.rs b/src/lib.rs index f25fef6..54f6612 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,42 +21,73 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -//! Library to read Tar archives (by GNU Tar) in `no_std` contexts with zero -//! allocations. If you have a standard environment and need full feature -//! support, I recommend the use of instead. +//! # `tar-no-std` - Parse Tar Archives (Tarballs) //! -//! The crate is simple and only supports reading of "basic" archives, therefore -//! no extensions, such as GNU Longname. The maximum supported file name length -//! is 100 characters including the NULL-byte. The maximum supported file size -//! is 8 GiB. Also, directories are not supported yet but only flat collections -//! of files. +//! _Due to historical reasons, there are several formats of Tar archives. 
All of +//! them are based on the same principles, but have some subtle differences that +//! often make them incompatible with each other._ [(reference)](https://www.gnu.org/software/tar/manual/html_section/Formats.html) +//! +//! Library to read Tar archives in `no_std` environments with zero allocations. If +//! you have a standard environment and need full feature support, I recommend the +//! use of instead. +//! +//! ## TL;DR +//! +//! Look at the [`TarArchiveRef`] type. +//! +//! ## Limitations +//! +//! This crate is simple and focuses on reading files and their content from a Tar +//! archive. Historic basic Tar and ustar [formats](https://www.gnu.org/software/tar/manual/html_section/Formats.html) +//! are supported. Other formats may work, but likely without all supported +//! features. GNU Extensions such as sparse files, incremental archives, and +//! long filename extension are not supported. +//! +//! The maximum supported file name length is 256 characters excluding the +//! NULL-byte (using the Tar name/prefix longname implementation of ustar). The +//! maximum supported file size is 8GiB. Directories are supported, but only regular +//! files are yielded in iteration. The path is reflected in their file name. +//! +//! ## Use Case //! //! This library is useful, if you write a kernel or a similar low-level -//! application, which needs "a bunch of files" from an archive ("init ram -//! disk"). The Tar file could for example come as a Multiboot2 boot module +//! application, which needs "a bunch of files" from an archive (like an +//! "init ramdisk"). The Tar file could for example come as a Multiboot2 boot module //! provided by the bootloader. //! -//! This crate focuses on extracting files from uncompressed Tar archives -//! created with default options by **GNU Tar**. GNU Extensions such as sparse -//! files, incremental archives, and long filename extension are not supported -//! yet. 
[gnu.org](https://www.gnu.org/software/tar/manual/html_section/Formats.html) -//! provides a good overview over possible archive formats and their -//! limitations. +//! ## Example //! -//! # Example //! ```rust //! use tar_no_std::TarArchiveRef; //! +//! // init a logger (optional) +//! std::env::set_var("RUST_LOG", "trace"); +//! env_logger::init(); +//! //! // also works in no_std environment (except the println!, of course) //! let archive = include_bytes!("../tests/gnu_tar_default.tar"); //! let archive = TarArchiveRef::new(archive).unwrap(); //! // Vec needs an allocator of course, but the library itself doesn't need one //! let entries = archive.entries().collect::>(); //! println!("{:#?}", entries); -//! println!("content of last file:"); -//! let last_file_content = unsafe { core::str::from_utf8_unchecked(entries[2].data()) }; -//! println!("{:#?}", last_file_content); //! ``` +//! +//! ## Cargo Feature +//! +//! This crate allows the usage of the additional Cargo build time feature `alloc`. +//! When this is active, the crate also provides the type `TarArchive`, which owns +//! the data on the heap. The `unstable` feature provides additional convenience +//! only available on the nightly channel. +//! +//! ## Compression (`tar.gz`) +//! +//! If your Tar file is compressed, e.g. by `.tar.gz`/`gzip`, you need to uncompress +//! the bytes first (e.g. by a *gzip* library). Afterwards, this crate can read the +//! Tar archive format from the uncompressed bytes. +//! +//! ## MSRV +//! +//! The MSRV is 1.76.0 stable. #![cfg_attr(feature = "unstable", feature(error_in_core))] #![cfg_attr(not(test), no_std)] diff --git a/src/tar_format_types.rs b/src/tar_format_types.rs index 53d51c2..7b0ccbf 100644 --- a/src/tar_format_types.rs +++ b/src/tar_format_types.rs @@ -7,14 +7,12 @@ use core::str::{from_utf8, Utf8Error}; use num_traits::Num; /// Base type for strings embedded in a Tar header. The length depends on the -/// context. The returned string +/// context. 
The returned string is likely to be UTF-8/ASCII, which is verified +/// by getters, such as [`TarFormatString::as_str`]. /// /// An optionally null terminated string. The contents are either: /// 1. A fully populated string with no null termination or /// 2. A partially populated string where the unused bytes are zero. -/// -/// The content is likely to be UTF-8/ASCII, but that is not verified by this -/// type. The #[derive(Copy, Clone, PartialEq, Eq)] #[repr(C)] pub struct TarFormatString { @@ -37,8 +35,8 @@ impl TarFormatString { self.bytes[0] == 0 } - /// Returns the length of the bytes. This is either the full capacity `N` - /// or the data until the first NULL byte. + /// Returns the length of the payload in bytes. This is either the full + /// capacity `N` or the data until the first NULL byte. pub fn size(&self) -> usize { memchr::memchr(0, &self.bytes).unwrap_or(N) } @@ -50,6 +48,17 @@ impl TarFormatString { from_utf8(&self.bytes[0..self.size()]) } + /// Wrapper around [`Self::as_str`] that stops as soon as the first space + /// is found. This is necessary to properly parse certain Tar-style encoded + /// numbers. Some ustar implementations pad spaces which prevents the proper + /// parsing as number. + pub fn as_str_until_first_space(&self) -> Result<&str, Utf8Error> { + from_utf8(&self.bytes[0..self.size()]).map(|str| { + let end_index_exclusive = str.find(' ').unwrap_or(str.len()); + &str[0..end_index_exclusive] + }) + } + /// Append to end of string. Panics if there is not enough capacity. 
pub fn append(&mut self, other: &TarFormatString) { let resulting_length = self.size() + other.size(); @@ -74,8 +83,8 @@ impl Debug for TarFormatString { let sub_array = &self.bytes[0..self.size()]; write!( f, - "str='{}',byte_usage={}/{}", - from_utf8(sub_array).unwrap(), + "str='{:?}',byte_usage={}/{}", + from_utf8(sub_array), self.size(), N ) @@ -98,24 +107,22 @@ pub struct TarFormatOctal(TarFormatNumber); pub struct TarFormatDecimal(TarFormatNumber); impl TarFormatNumber { + #[cfg(test)] + const fn new(bytes: [u8; N]) -> Self { + Self(TarFormatString:: { bytes }) + } + pub fn as_number(&self) -> core::result::Result where T: num_traits::Num, { - memchr::memchr2(32, 0, &self.0.bytes).map_or_else( - || T::from_str_radix(self.0.as_str().expect("Should be valid Tar archive"), R), - |idx| { - T::from_str_radix( - from_utf8(&self.0.bytes[..idx]).expect("byte array is not UTF-8"), - 8, - ) - }, - ) + let str = self.0.as_str_until_first_space().unwrap_or("0"); + T::from_str_radix(str, R) } - /// Returns the raw string describing this type. - pub fn as_raw_str(&self) -> core::result::Result<&str, Utf8Error> { - self.0.as_str() + /// Returns the underlying [`TarFormatString`]. + pub const fn as_inner(&self) -> &TarFormatString { + &self.0 } } @@ -149,8 +156,9 @@ impl TarFormatDecimal { self.0.as_number::() } - pub fn as_raw_str(&self) -> core::result::Result<&str, Utf8Error> { - self.0.as_raw_str() + /// Returns the underlying [`TarFormatString`]. + pub const fn as_inner(&self) -> &TarFormatString { + self.0.as_inner() } } @@ -162,12 +170,14 @@ impl TarFormatOctal { self.0.as_number::() } - pub fn as_raw_str(&self) -> core::result::Result<&str, Utf8Error> { - self.0.as_raw_str() + /// Returns the underlying [`TarFormatString`]. 
+ pub const fn as_inner(&self) -> &TarFormatString { + self.0.as_inner() } } -mod tests { +#[cfg(test)] +mod tar_format_string_tests { use super::TarFormatString; use core::mem::size_of_val; @@ -183,7 +193,7 @@ mod tests { #[test] fn test_one_byte_string() { - let s = TarFormatString::new([65]); + let s = TarFormatString::new([b'A']); assert_eq!(size_of_val(&s), 1); assert!(!s.is_empty()); assert_eq!(s.size(), 1); @@ -192,13 +202,23 @@ mod tests { #[test] fn test_two_byte_string_nul_terminated() { - let s = TarFormatString::new([65, 0]); - assert_eq!(size_of_val(&s), 2); + let s = TarFormatString::new([b'A', 0, b'B']); + assert_eq!(size_of_val(&s), 3); assert!(!s.is_empty()); assert_eq!(s.size(), 1); assert_eq!(s.as_str(), Ok("A")); } + #[test] + fn test_str_until_first_space() { + let s = TarFormatString::new([b'A', b'B', b' ', b'X', 0]); + assert_eq!(size_of_val(&s), 5); + assert!(!s.is_empty()); + assert_eq!(s.size(), 4); + assert_eq!(s.as_str(), Ok("AB X")); + assert_eq!(s.as_str_until_first_space(), Ok("AB")); + } + #[test] #[allow(clippy::cognitive_complexity)] fn test_append() { @@ -213,14 +233,14 @@ mod tests { assert_eq!(s.as_str(), Ok("")); // When adding ABC - s.append(&TarFormatString::new([65, 66, 67])); + s.append(&TarFormatString::new([b'A', b'B', b'C'])); // Then the string contains the additional 3 chars assert_eq!(size_of_val(&s), 20); assert!(!s.is_empty()); assert_eq!(s.size(), 3); assert_eq!(s.as_str(), Ok("ABC")); - s.append(&TarFormatString::new([68, 69, 70])); + s.append(&TarFormatString::new([b'D', b'E', b'F'])); // Then the string contains the additional 3 chars assert_eq!(size_of_val(&s), 20); assert!(!s.is_empty()); @@ -249,3 +269,15 @@ mod tests { assert_eq!(s.as_str(), Ok("ABCDEFAAAAAAAAAAAAAZ")); } } + +#[cfg(test)] +mod tar_format_number_tests { + use crate::{TarFormatDecimal, TarFormatNumber, TarFormatString}; + + #[test] + fn test_as_number_with_space_in_string() { + let str = [b'0', b'1', b'0', b' ', 0]; + let str = 
TarFormatNumber::<5, 10>::new(str); + assert_eq!(str.as_number::(), Ok(10)); + } +} diff --git a/tests/weird_fuzzing_tarballs.tar b/tests/weird_fuzzing_tarballs.tar new file mode 100644 index 0000000..1d5b9aa Binary files /dev/null and b/tests/weird_fuzzing_tarballs.tar differ