bitbottle 0.10.0

a modern archive file format
Documentation
use std::cell::RefCell;
use std::collections::HashMap;
use std::fmt::Debug;
use std::path::{Component, PathBuf};
use std::rc::Rc;
use crate::bottle_cap::BottleCap;
use crate::file_atlas::FileAtlasRef;
use crate::hashing::HashingOutput;


/// Hash of some data, stored as a `GenericArray`, which is a kind of
/// flat array like `[u8; N]` with the exact dimension of the output of this
/// hash operation.
///
/// This is an alias for [`HashingOutput`], so the concrete digest type is
/// chosen by the `hashing` module rather than here.
pub type BlockHash = HashingOutput;


/// Info about a data block delineated by `Scanner`: size and hash.
///
/// `Block` is `Copy`, so it is stored and handed around by value. Its
/// `Debug` output is hand-written (rather than derived) so that the hash is
/// printed as a hex string.
#[derive(Clone, Copy, Default, Eq, Hash, PartialEq)]
pub struct Block {
    /// Size of the block, in bytes.
    pub size: usize,
    /// The hash of this block (as 32 bytes).
    pub hash: BlockHash,
}

impl Debug for Block {
    /// Render as `Block { size: <bytes>, hash: <hex> }`, hex-encoding the
    /// hash instead of dumping it as an array of bytes.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let hash_hex = hex::encode(self.hash);
        f.write_fmt(format_args!("Block {{ size: {}, hash: {} }}", self.size, hash_hex))
    }
}


/// A block's size & hash, and the actual data.
#[derive(Debug, Eq, PartialEq)]
pub struct BlockData {
    /// The size and hash describing `data`.
    pub block: Block,
    /// The actual bytes of the block.
    pub data: Vec<u8>,
    /// NOTE(review): presumably the cap of the bottle this block was read
    /// from -- confirm against the code that constructs `BlockData`.
    pub bottle_cap: BottleCap,
}


/// Info about an entire file: its list of `Block`s, and its total hash.
///
/// `Debug` is hand-written (rather than derived) so that the file hash is
/// printed as a hex string.
#[derive(Clone, Default, Eq, PartialEq)]
pub struct FileBlocks {
    /// The blocks making up the file, in file order: `offsets_of` computes
    /// each block's offset as the sum of the sizes of the blocks before it.
    pub blocks: Vec<Block>,
    /// The hash of the file as a whole.
    pub hash: BlockHash,
}

impl FileBlocks {
    /// Find every occurrence of a block (identified by its hash) in this file.
    ///
    /// Returns one `(offset, block)` pair per match, in file order, where
    /// `offset` is the sum of the sizes of all the blocks that come before
    /// it. The result is empty if the file doesn't contain the block.
    pub fn offsets_of(&self, hash: &BlockHash) -> Vec<(u64, Block)> {
        self.blocks
            .iter()
            // pair each block with the running offset at which it starts
            .scan(0u64, |next_offset, block| {
                let start = *next_offset;
                *next_offset += block.size as u64;
                Some((start, *block))
            })
            .filter(|(_, block)| block.hash == *hash)
            .collect()
    }
}

impl Debug for FileBlocks {
    /// Render as `FileBlocks { blocks: [...], hash: <hex> }`: the blocks use
    /// their own `Debug` form and the file hash is hex-encoded.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let hash_hex = hex::encode(self.hash);
        f.write_fmt(format_args!("FileBlocks {{ blocks: {:?}, hash: {} }}", self.blocks, hash_hex))
    }
}


/// The source and target of a symlink, for use when dropping invalid symlinks.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Symlink {
    /// Path of the symlink itself. The target is resolved relative to this
    /// path's parent folder.
    pub source: PathBuf,
    /// The path the symlink points to, as stored in the link (it may contain
    /// `.` and `..` components).
    pub target: PathBuf,
}

/// List of files and their metadata (including block hashes). It also
/// includes a table of the unique blocks, to easily determine the overall
/// archive size.
///
/// `blocks` and `file_map` are derived from the blocks of each file: they
/// are filled in by `add_block`, or regenerated from `files` by
/// `build_file_map`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct FileList {
    /// The list of files actually stored in this archive. Folders and
    /// symlinks are entries here too, alongside plain files.
    pub files: Vec<FileAtlasRef>,

    /// A map of the actual unique blocks. Some blocks may appear in multiple
    /// files, so this is really treated as a set.
    pub blocks: HashMap<BlockHash, Block>,

    /// A map of which files contain each actual unique block. Some blocks
    /// may appear in multiple files.
    pub file_map: HashMap<BlockHash, Vec<FileAtlasRef>>,
}

/// Shared reference to a FileList.
///
/// Being an `Rc<RefCell<_>>`, it is reference counted and borrow-checked at
/// runtime, and can't be sent across threads. `FileList::to_shared` creates one.
pub type FileListRef = Rc<RefCell<FileList>>;

impl FileList {
    pub fn new() -> FileList {
        FileList { files: Vec::new(), blocks: HashMap::new(), file_map: HashMap::new() }
    }

    pub fn clear(&mut self) {
        self.files.clear();
        self.blocks.clear();
        self.file_map.clear();
    }

    /// Consume this list and wrap it in a `FileListRef`, so that it can be
    /// shared and mutated through several handles (non-thread-safe, runtime
    /// reference counted).
    pub fn to_shared(self) -> FileListRef {
        let cell = RefCell::new(self);
        Rc::new(cell)
    }

    /// Iterate over (clones of the handles of) every entry that is not a folder.
    pub fn plain_files(&self) -> impl Iterator<Item = FileAtlasRef> + '_ {
        self.files.iter().filter_map(|atlas| {
            let is_folder = atlas.borrow().is_folder;
            if is_folder {
                None
            } else {
                Some(atlas.clone())
            }
        })
    }

    /// Number of entries that are not folders.
    pub fn total_file_count(&self) -> usize {
        self.files.iter().filter(|atlas| !atlas.borrow().is_folder).count()
    }

    /// Combined size of every non-folder entry. This can exceed the sum of
    /// the block sizes, since a block that appears in several files is
    /// counted once per file here.
    pub fn total_size(&self) -> u64 {
        let mut total = 0;
        for atlas in self.plain_files() {
            total += atlas.borrow().size;
        }
        total
    }

    /// Combined size of the unique blocks in this archive (each block is
    /// counted once, no matter how many files contain it).
    pub fn total_block_size(&self) -> u64 {
        self.blocks.values().map(|block| block.size as u64).sum()
    }

    // when reading the file list from an archive, the file_map must be generated from files & blocks.
    pub fn build_file_map(&mut self) {
        self.blocks.clear();
        for atlas in &self.files {
            for block in &atlas.borrow().contents.blocks {
                // must duplicate add_block because rust gets grumpy.
                if self.blocks.insert(block.hash, *block).iter().any(|ob| ob.size != block.size) {
                    panic!("Two blocks with the same SHA-256 but different sizes!");
                }
                self.file_map.entry(block.hash).or_default().push(atlas.clone());
            }
        }
    }

    /// Record that the file `atlas` contains `block`.
    ///
    /// The block is entered into the table of unique blocks, and `atlas` is
    /// appended to the list of files that contain it.
    ///
    /// # Panics
    ///
    /// Panics if a block with the same hash but a different size was
    /// already recorded.
    pub fn add_block(&mut self, atlas: &FileAtlasRef, block: &Block) {
        let previous = self.blocks.insert(block.hash, *block);
        if matches!(previous, Some(old) if old.size != block.size) {
            panic!("Two blocks with the same SHA-256 but different sizes!");
        }
        self.file_map.entry(block.hash).or_default().push(atlas.clone());
    }

    /// Drop any symlinks that smell fishy, and return them.
    pub fn drop_stray_symlinks(&mut self) -> Vec<Symlink> {
        let (normal, symlinks): (Vec<_>, Vec<_>) = self.files.iter().partition(|atlas| {
            atlas.borrow().symlink_target.is_none()
        });
        let path_names = normal.iter().map(|atlas| atlas.borrow().normalized_path.clone()).collect::<Vec<_>>();
        let mut symlinks = symlinks.iter().map(|atlas| {
            let atlas = atlas.borrow();
            Symlink { source: atlas.path.clone(), target: atlas.symlink_target.as_ref().unwrap().clone() }
        }).collect::<Vec<_>>();

        let reject = FileList::drop_stray_symlinks_from(&mut symlinks, &path_names);
        let reject_paths: Vec<PathBuf> = reject.iter().map(|symlink| symlink.source.clone()).collect();
        self.files.retain(|atlas| reject_paths.iter().all(|path| &atlas.borrow().path != path));
        reject
    }

    /// Split `symlinks` into the ones that are safe to keep and the ones to drop.
    ///
    /// On return, `symlinks` holds only the accepted links and the rejected
    /// ones are returned; both keep their original relative order. A symlink
    /// is dropped if its target:
    /// - is absolute (or has any component besides plain names, `.`, and `..`)
    /// - refers to a target that isn't in the path list
    /// - attempts to pass through any intermediate outside the path list
    /// - climbs above the top of the archive, or ends up exactly there
    ///
    /// Targets are resolved lexically, starting from the parent folder of the
    /// symlink's `source`. The empty path stands for the top of the archive:
    /// a target may pass through it, but not stop there.
    pub fn drop_stray_symlinks_from(symlinks: &mut Vec<Symlink>, path_names: &[PathBuf]) -> Vec<Symlink> {
        // Index the known paths once, so that each component check is a hash
        // lookup instead of a linear scan over `path_names`.
        let known_paths: std::collections::HashSet<&PathBuf> = path_names.iter().collect();

        let is_safe = |symlink: &Symlink| -> bool {
            // symlink can't be absolute or have any shenanigans
            let is_relative = symlink.target.components().all(|c| {
                matches!(c, Component::Normal(_) | Component::ParentDir | Component::CurDir)
            });
            if !is_relative {
                return false;
            }

            // follow the target one component at a time, starting from the
            // folder that holds the symlink
            let mut anchor = symlink.source.parent().map(|parent| parent.to_path_buf()).unwrap_or_default();
            for component in symlink.target.components() {
                match component {
                    Component::Normal(segment) => anchor.push(segment),
                    Component::ParentDir => {
                        // nothing left to pop: the link escaped the archive
                        if !anchor.pop() {
                            return false;
                        }
                    },
                    _ => (),
                }

                // every stop along the way must be present in the archive
                // (or be the top of the archive itself)
                if !anchor.as_os_str().is_empty() && !known_paths.contains(&anchor) {
                    return false;
                }
            }

            // it can't just point into liminal space (tho it can go thru it to a real place)
            !anchor.as_os_str().is_empty()
        };

        let (keep, reject): (Vec<Symlink>, Vec<Symlink>) = symlinks.drain(..).partition(is_safe);
        symlinks.extend(keep);
        reject
    }
}

impl Default for FileList {
    fn default() -> Self {
        FileList::new()
    }
}


#[cfg(test)]
mod test {
    use std::path::PathBuf;
    use super::{FileList, Symlink};

    /// The paths that every test treats as being present in the archive.
    fn archive_paths() -> Vec<PathBuf> {
        [ "src", "test", "test/data", "test/data/file1" ].map(PathBuf::from).to_vec()
    }

    /// Vet a single symlink that lives at `test/file1` and points at
    /// `target`. Returns `(kept, rejected)`.
    fn vet(target: &str) -> (Vec<Symlink>, Vec<Symlink>) {
        let mut symlinks = vec![
            Symlink { source: PathBuf::from("test/file1"), target: PathBuf::from(target) },
        ];
        let rejected = FileList::drop_stray_symlinks_from(&mut symlinks, &archive_paths());
        (symlinks, rejected)
    }

    /// Just the targets of a list of symlinks, for easy comparison.
    fn targets(symlinks: &[Symlink]) -> Vec<&PathBuf> {
        symlinks.iter().map(|s| &s.target).collect()
    }

    /// Assert that the symlink to `target` is kept, and nothing is rejected.
    fn assert_kept(target: &str) {
        let (kept, rejected) = vet(target);
        assert_eq!(rejected.len(), 0);
        assert_eq!(targets(&kept), vec![ &PathBuf::from(target) ]);
    }

    /// Assert that the symlink to `target` is rejected, and nothing is kept.
    fn assert_rejected(target: &str) {
        let (kept, rejected) = vet(target);
        assert_eq!(kept.len(), 0);
        assert_eq!(targets(&rejected), vec![ &PathBuf::from(target) ]);
    }


    #[test]
    fn relative_symlinks_ok() {
        assert_kept("data/file1");
        assert_kept("../src");
    }

    #[test]
    fn no_symlink_to_missing_path() {
        assert_rejected("./docs");
    }

    #[test]
    fn no_symlink_to_space() {
        assert_rejected("../");
    }

    #[test]
    fn no_escaping_symlinks() {
        assert_rejected("../../test");
    }

    #[test]
    fn no_absolute_symlinks() {
        assert_rejected("/etc/passwd");
    }
}