Skip to content

Commit

Permalink
ArchiveIterator: split into ArchiveHeaderIterator and ArchiveEntryIte…
Browse files Browse the repository at this point in the history
…rator
  • Loading branch information
phip1611 committed May 3, 2024
1 parent 9c1cd95 commit b6d8dee
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 91 deletions.
201 changes: 121 additions & 80 deletions src/archive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,12 @@ SOFTWARE.
use crate::header::PosixHeader;
use crate::tar_format_types::TarFormatString;
use crate::{TypeFlag, BLOCKSIZE, POSIX_1003_MAX_FILENAME_LEN};
use crate::{BLOCKSIZE, POSIX_1003_MAX_FILENAME_LEN};
#[cfg(feature = "alloc")]
use alloc::boxed::Box;
use core::fmt::{Debug, Display, Formatter};
use core::str::Utf8Error;
use log::warn;
use log::{error, warn};

/// Describes an entry in an archive.
/// Currently only supports files but no directories.
Expand Down Expand Up @@ -124,9 +124,9 @@ impl TarArchive {

/// Iterates over all entries of the Tar archive.
/// Returns items of type [`ArchiveEntry`].
/// See also [`ArchiveIterator`].
pub fn entries(&self) -> ArchiveIterator {
ArchiveIterator::new(self.data.as_ref())
/// See also [`ArchiveEntryIterator`].
pub fn entries(&self) -> ArchiveEntryIterator {
ArchiveEntryIterator::new(self.data.as_ref())
}
}

Expand Down Expand Up @@ -164,116 +164,133 @@ impl<'a> TarArchiveRef<'a> {

/// Iterates over all entries of the Tar archive.
/// Returns items of type [`ArchiveEntry`].
/// See also [`ArchiveIterator`].
pub const fn entries(&self) -> ArchiveIterator {
ArchiveIterator::new(self.data)
/// See also [`ArchiveEntryIterator`].
pub fn entries(&self) -> ArchiveEntryIterator {
ArchiveEntryIterator::new(self.data)
}
}

/// Iterates over the headers of the Tar archive.
///
/// Each iteration interprets the block at the current block index as a
/// [`PosixHeader`].
#[derive(Debug)]
pub struct ArchiveHeaderIterator<'a> {
    /// Raw bytes of the whole archive.
    archive_data: &'a [u8],
    /// Index (in units of `BLOCKSIZE`) of the block that is read as a header
    /// on the next iteration.
    block_index: usize,
}

impl<'a> ArchiveIterator<'a> {
pub const fn new(archive: &'a [u8]) -> Self {
impl<'a> ArchiveHeaderIterator<'a> {
pub fn new(archive: &'a [u8]) -> Self {
assert!(!archive.is_empty());
assert_eq!(archive.len() % BLOCKSIZE, 0);
Self {
archive_data: archive,
block_index: 0,
}
}

/// Returns a reference to the next Header.
fn next_hdr(&self, block_index: usize) -> &'a PosixHeader {
/// Parse the memory at the given block as [`PosixHeader`].
fn block_as_header(&self, block_index: usize) -> &'a PosixHeader {
let hdr_ptr = &self.archive_data[block_index * BLOCKSIZE];
unsafe { (hdr_ptr as *const u8).cast::<PosixHeader>().as_ref() }.unwrap()
}
}

impl<'a> Iterator for ArchiveIterator<'a> {
type Item = ArchiveEntry<'a>;
type BlockIndex = usize;

impl<'a> Iterator for ArchiveHeaderIterator<'a> {
type Item = (BlockIndex, &'a PosixHeader);

/// Returns the next header. Internally, it updates the necessary data
/// structures to not read the same header multiple times.
///
/// This returns `None` if either no further headers are found or if a
/// header can't be parsed.
fn next(&mut self) -> Option<Self::Item> {
if self.block_index * BLOCKSIZE >= self.archive_data.len() {
warn!("Reached end of Tar archive data without finding zero/end blocks!");
return None;
}
// TODO better check for two end zero blocks here?
assert!(self.block_index < self.archive_data.len() / BLOCKSIZE);

let mut hdr = self.next_hdr(self.block_index);

loop {
// check if we found end of archive
if hdr.is_zero_block() {
let next_hdr = self.next_hdr(self.block_index + 1);
if next_hdr.is_zero_block() {
// gracefully terminated Archive
log::debug!("End of Tar archive with two zero blocks!");
} else {
log::warn!(
"Zero block found at end of Tar archive, but only one instead of two!"
);
}
// end of archive
return None;
}
let hdr = self.block_as_header(self.block_index);
let block_index = self.block_index;

// Ignore directory entries, i.e. yield only regular files. Works as
// filenames in tarballs are fully specified, e.g. dirA/dirB/file1
if hdr.typeflag != TypeFlag::DIRTYPE {
break;
}
// Start at next block on next iteration.
self.block_index += 1;
log::info!("{:#?}, {:#?}", hdr.name, hdr.typeflag);

// in next iteration: start at next Archive entry header
// +1 for current hdr block itself + all data blocks
let data_block_count: usize = hdr.payload_block_count().unwrap();
self.block_index += data_block_count + 1;
hdr = self.next_hdr(self.block_index);
let block_count = hdr
.payload_block_count()
.inspect_err(|e| {
log::error!("Unparseable size ({e:?}) in header {hdr:#?}");
})
.ok()?;

if !hdr.is_zero_block() {
self.block_index += block_count;
}

if hdr.typeflag != TypeFlag::AREGTYPE && hdr.typeflag != TypeFlag::REGTYPE {
log::warn!(
"Found entry of type={:?}, but only files are supported",
Some((block_index, hdr))
}
}

// NOTE(review): `ExactSizeIterator` requires `size_hint` to report the exact
// number of remaining items. Confirm that the `Iterator` impl for
// `ArchiveEntryIterator` overrides `size_hint` accordingly; with the default
// `size_hint`, calling `len()` panics.
impl<'a> ExactSizeIterator for ArchiveEntryIterator<'a> {}

/// Iterator over the files of the archive.
///
/// Wraps an [`ArchiveHeaderIterator`] and pulls headers from it.
#[derive(Debug)]
pub struct ArchiveEntryIterator<'a>(ArchiveHeaderIterator<'a>);

impl<'a> ArchiveEntryIterator<'a> {
    /// Creates an entry iterator over the raw bytes of a Tar archive.
    pub fn new(archive: &'a [u8]) -> Self {
        let headers = ArchiveHeaderIterator::new(archive);
        Self(headers)
    }

    /// Advances the wrapped header iterator and returns what it yields.
    fn next_hdr(&mut self) -> Option<(BlockIndex, &'a PosixHeader)> {
        let Self(headers) = self;
        headers.next()
    }
}

impl<'a> Iterator for ArchiveEntryIterator<'a> {
type Item = ArchiveEntry<'a>;

fn next(&mut self) -> Option<Self::Item> {
let (mut block_index, mut hdr) = self.next_hdr()?;

// Ignore directory entries, i.e. yield only regular files. Works as
// filenames in tarballs are fully specified, e.g. dirA/dirB/file1
while !hdr.typeflag.is_regular_file() {
warn!(
"Skipping entry of type {:?} (not supported yet)",
hdr.typeflag
);
return None;
}

if hdr.name.is_empty() {
warn!("Found empty file name",);
// Update properties.
(block_index, hdr) = self.next_hdr()?;
}

let hdr_size = hdr.size.as_number::<usize>();
if let Err(e) = hdr_size {
warn!("Can't parse the file size from the header block. Stop iterating Tar archive. {e:#?}");
return None;
// check if we found end of archive (two zero blocks)
if hdr.is_zero_block() {
if self.next_hdr()?.1.is_zero_block() {
// found end
return None;
} else {
panic!("Never expected to have a situation where self.next_hdr() returns a zero block and the next one is not a zero block, as we should never point to an 'end zero block of a regular file'");
}
}
let hdr_size = hdr_size.unwrap();

// Fetch data of file from next block(s).
// .unwrap() is fine as we checked that hdr.size().val() is valid
// above
let data_block_count = hdr.payload_block_count().unwrap();

// +1: skip hdr block itself and start at data!
// i_begin is the byte begin index of this file in the array of the whole archive
let i_begin = (self.block_index + 1) * BLOCKSIZE;
// i_end is the exclusive byte end index of the data of the current file
let i_end = i_begin + data_block_count * BLOCKSIZE;
let file_block_bytes = &self.archive_data[i_begin..i_end];
// Each block is 512 bytes long, but the file size is not necessarily a
// multiple of 512.
let file_bytes = &file_block_bytes[0..hdr_size];

// in next iteration: start at next Archive entry header
// +1 for current hdr block itself + all data blocks
self.block_index += data_block_count + 1;

let payload_size: usize = hdr
.size
.as_number()
.inspect_err(|e| error!("Can't parse the file size from the header. {e:#?}"))
.ok()?;

let idx_first_data_block = block_index + 1;
let idx_begin = idx_first_data_block * BLOCKSIZE;
let idx_end_exclusive = idx_begin + payload_size;
let file_bytes = &self.0.archive_data[idx_begin..idx_end_exclusive];

let mut filename: TarFormatString<256> =
TarFormatString::<POSIX_1003_MAX_FILENAME_LEN>::new([0; POSIX_1003_MAX_FILENAME_LEN]);
if hdr.magic.as_str().unwrap() == "ustar" && hdr.version.as_str().unwrap() == "00" && !hdr.prefix.is_empty() {
if hdr.magic.as_str().unwrap() == "ustar"
&& hdr.version.as_str().unwrap() == "00"
&& !hdr.prefix.is_empty()
{
filename.append(&hdr.prefix);
filename.append(&TarFormatString::<1>::new([b'/']));
}
Expand Down Expand Up @@ -302,6 +319,30 @@ mod tests {
};
}

#[test]
fn test_header_iterator() {
    // Names of the headers in the order the iterator is expected to yield them.
    let expected = [
        "bye_world_513b.txt",
        "hello_world_513b.txt",
        "hello_world.txt",
    ];

    let archive = include_bytes!("../tests/gnu_tar_default.tar");
    let names: Vec<&str> = ArchiveHeaderIterator::new(archive)
        .map(|(_block_index, header)| header.name.as_str().unwrap())
        .collect();

    assert_eq!(names, expected);

    // TODO make PosixHeader PartialEq so that the yielded headers can be
    // compared directly.
}

#[test]
fn test_archive_list() {
let archive = TarArchiveRef::new(include_bytes!("../tests/gnu_tar_default.tar")).unwrap();
Expand Down
25 changes: 16 additions & 9 deletions src/header.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ impl Debug for Mode {
/// This is also mostly compatible with the "Ustar"-header and the "GNU format".
/// Because this library only needs to fetch data and filename, we don't need
/// further checks.
// TODO make PartialEq?
#[derive(Debug, Copy, Clone)]
#[repr(C, packed)]
pub struct PosixHeader {
Expand Down Expand Up @@ -107,14 +108,12 @@ impl PosixHeader {
/// content. Returns an error, if the file size can't be parsed from the
/// header.
pub fn payload_block_count(&self) -> Result<usize, ParseIntError> {
    // Parse the size once, then round up to whole blocks: a payload that is
    // not a multiple of BLOCKSIZE still occupies a full final block.
    let parsed_size = self.size.as_number::<usize>()?;
    Ok(parsed_size.div_ceil(BLOCKSIZE))
}

/// A Tar archive is terminated, if an end-of-archive entry, which consists
/// of two 512-byte blocks of zero bytes, is found.
pub fn is_zero_block(&self) -> bool {
let ptr = self as *const Self as *const u8;
let self_bytes = unsafe { core::slice::from_raw_parts(ptr, BLOCKSIZE) };
Expand Down Expand Up @@ -223,17 +222,25 @@ mod tests {
use crate::BLOCKSIZE;
use std::mem::size_of;

/// Casts the bytes to a reference to a PosixhHeader.
fn bytes_to_archive(bytes: &[u8]) -> &PosixHeader {
unsafe { (bytes.as_ptr() as *const PosixHeader).as_ref() }.unwrap()
/// Returns the PosixHeader at the beginning of the Tar archive.
fn bytes_to_archive(tar_archive_data: &[u8]) -> &PosixHeader {
unsafe { (tar_archive_data.as_ptr() as *const PosixHeader).as_ref() }.unwrap()
}

#[test]
fn test_display_header() {
    // The first header of the test archive belongs to "bye_world_513b.txt".
    let archive = bytes_to_archive(include_bytes!("../tests/gnu_tar_default.tar"));
    assert_eq!(archive.name.as_str(), Ok("bye_world_513b.txt"));
    // Exercises the Debug implementation of the header.
    println!("{:#?}", archive);
}

#[test]
fn test_payload_block_count() {
    // The archive starts with "bye_world_513b.txt", whose payload is expected
    // to occupy two data blocks.
    let first_header = bytes_to_archive(include_bytes!("../tests/gnu_tar_default.tar"));
    let block_count = first_header.payload_block_count();
    assert_eq!(block_count, Ok(2));
}

#[test]
fn test_show_tar_header_magics() {
let archive = bytes_to_archive(include_bytes!("../tests/gnu_tar_default.tar"));
Expand Down
17 changes: 15 additions & 2 deletions src/tar_format_types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use num_traits::Num;
/// 2. A partially populated string where the unused bytes are zero.
///
/// The content is likely to be UTF-8/ASCII, but that is not verified by this
/// type.
#[derive(Copy, Clone)]
#[repr(C)]
pub struct TarFormatString<const N: usize> {
Expand Down Expand Up @@ -82,7 +82,7 @@ impl<const N: usize> Debug for TarFormatString<N> {
}
}

/// A number with a specified base. Trailing spaces in the string are ignored.
#[derive(Copy, Clone)]
#[repr(C)]
pub struct TarFormatNumber<const N: usize, const R: u32>(TarFormatString<N>);
Expand Down Expand Up @@ -112,6 +112,11 @@ impl<const N: usize, const R: u32> TarFormatNumber<N, R> {
},
)
}

/// Returns the raw string describing this type.
pub fn as_raw_str(&self) -> core::result::Result<&str, Utf8Error> {
self.0.as_str()
}
}

impl<const N: usize, const R: u32> Debug for TarFormatNumber<N, R> {
Expand Down Expand Up @@ -143,6 +148,10 @@ impl<const N: usize> TarFormatDecimal<N> {
{
self.0.as_number::<T>()
}

/// Returns the raw, unparsed string backing this number.
///
/// Delegates to `as_raw_str` of the wrapped value and passes its result
/// through unchanged.
pub fn as_raw_str(&self) -> core::result::Result<&str, Utf8Error> {
self.0.as_raw_str()
}
}

impl<const N: usize> TarFormatOctal<N> {
Expand All @@ -152,6 +161,10 @@ impl<const N: usize> TarFormatOctal<N> {
{
self.0.as_number::<T>()
}

/// Returns the raw, unparsed string backing this number.
///
/// Delegates to `as_raw_str` of the wrapped value and passes its result
/// through unchanged.
pub fn as_raw_str(&self) -> core::result::Result<&str, Utf8Error> {
self.0.as_raw_str()
}
}

mod tests {
Expand Down

0 comments on commit b6d8dee

Please sign in to comment.