bitbottle 0.10.0

a modern archive file format
Documentation
use std::fs;
use std::path::{Component, Path, PathBuf};
use crate::FileAtlasRef;
use crate::bottle_error::{BottleError, BottleResult};
use crate::file_atlas::FileAtlas;
use crate::file_list::{Block, FileBlocks, FileList};
use crate::hashing::{Hashing, HashType};
use crate::scanner::Scanner;


/// `FileScanner` progress: scanning paths or blocks.
///
/// A value of this type is handed to the scanner's updater callback every
/// time a file is added to the file list, and every time a block is scanned.
#[derive(Clone, PartialEq, Eq)]
pub enum ScanState {
    /// Emitted while walking paths: `atlas` is the entry that was just added,
    /// `file_count` and `bytes` are the file list's running totals
    /// (`total_file_count()` and `total_size()`).
    FileList { atlas: FileAtlasRef, file_count: usize, bytes: u64 },
    /// Emitted after each scanned block: `blocks` is the number of entries in
    /// the file list's block table, `bytes` is the sum of block sizes scanned
    /// so far, `total_bytes` is the file list's `total_size()`, and
    /// `unique_bytes` is its `total_block_size()`.
    Blocks { blocks: usize, file_count: usize, bytes: u64, total_bytes: u64, unique_bytes: u64 },
}


/// Return the common prefix for a list of paths.
///
/// The comparison is done component by component (not byte by byte), so
/// `/home/ab` and `/home/ac` share `/home`, not `/home/a`.
///
/// A single path is its own common prefix. An empty list has no components
/// in common, so the result is an empty `PathBuf` (this case used to panic).
fn find_common_prefix(paths: &[PathBuf]) -> PathBuf {
    match paths.split_first() {
        None => PathBuf::new(),
        // fold across the paths, combining each pair of paths into their common prefix:
        Some((first_path, rest)) => rest.iter().fold(first_path.clone(), |prefix, path| {
            prefix
                .components()
                .zip(path.components())
                .take_while(|(c1, c2)| c1 == c2)
                .map(|(c1, _)| c1)
                .collect()
        }),
    }
}

/// One path to scan, paired with the information needed to derive the
/// normalized path recorded for it: stripping `hidden_prefix` (when present)
/// from `path` yields the normalized form.
#[derive(Clone, Debug, Eq, PartialEq)]
struct NormalizedScanPath {
    /// the actual path to scan (absolute or relative)
    pub path: PathBuf,
    /// for absolute paths, the prefix to omit in the normalized form
    pub hidden_prefix: Option<PathBuf>,
}

/// Fix up incoming paths so the FileScanner knows how to create a normalized
/// path for each discovered folder/file.
fn normalize_paths(paths: &[PathBuf]) -> BottleResult<Vec<NormalizedScanPath>> {
    let paths: Vec<PathBuf> = paths.iter().map(|path| {
        let mut components: Vec<Component> = path.components().collect();

        // trim initial "./"
        while let Some(Component::CurDir) = components.first() {
            components.remove(0);
        };

        // ensure paths don't contain sneaky crap in the middle (. or ..)
        if components.iter().any(|c| {
            c == &Component::CurDir || c == &Component::ParentDir
        }) {
            return Err(BottleError::InvalidAddPath(path.to_path_buf()));
        }

        Ok(components.iter().collect())
    }).collect::<BottleResult<_>>()?;

    // track the index so we can put them back in order later.
    let (absolute_paths, relative_paths) = paths.iter().enumerate().partition::<Vec<_>, _>(|(_, p)| p.has_root());
    let relative_paths: Vec<_> = relative_paths.iter().map(|(i, path)| {
        (i, NormalizedScanPath { path: path.to_path_buf(), hidden_prefix: None })
    }).collect();

    // find the common prefix of all absolute paths, and attach it to each one as the "hidden prefix".
    let absolute_paths: Vec<_> = if !absolute_paths.is_empty() {
        let hidden_prefix = if absolute_paths.len() == 1 {
            absolute_paths[0].1.parent().map(|p| p.to_path_buf())
        } else {
            Some(find_common_prefix(
                &absolute_paths.iter().map(|(_, p)| p.to_path_buf()).collect::<Vec<_>>()
            ))
        };

        absolute_paths.iter().map(|(i, path)| {
            (i, NormalizedScanPath { path: path.to_path_buf(), hidden_prefix: hidden_prefix.clone() })
        }).collect()
    } else {
        vec![]
    };

    let mut rv = [ relative_paths, absolute_paths ].concat();
    rv.sort_by(|(i, _), (j, _)| i.cmp(j));
    let rv = rv.iter().map(|(_, scan_path)| scan_path.clone()).collect::<Vec<_>>();

    // complain if the shortened paths would be duplicate
    let normalized_paths = rv.iter().map(|normalized_path| {
        match &normalized_path.hidden_prefix {
            None => &normalized_path.path,
            Some(prefix) => normalized_path.path.strip_prefix(prefix).unwrap(),
        }
    }).collect::<Vec<_>>();

    // we could be fancy here with sets, but we want to return the duplicate paths, to be helpful.
    for i in 0..normalized_paths.len() - 1 {
        for j in i + 1 .. normalized_paths.len() {
            if normalized_paths[i] == normalized_paths[j] {
                return Err(
                    BottleError::DuplicatePaths(
                        rv[i].path.to_path_buf(),
                        rv[j].path.to_path_buf(),
                    )
                );
            }
        }
    }

    Ok(rv)
}


/// Wrap `Scanner` to traverse recursively through a list of paths, scan each
/// file to figure out block boundaries, and generate a `FileList`.
///
/// `F` is the progress callback type: it receives a `ScanState` after every
/// file added to the list and after every block scanned.
pub struct FileScanner<F: FnMut (ScanState)> {
    /// the block scanner, used to iterate over the blocks of each file
    scanner: Scanner,
    /// progress callback
    updater: F,
    /// caller-supplied buffer, lent to the scanner while reading each file
    buffer: Vec<u8>,
    /// the files (and, after `build_block_list`, blocks) discovered so far
    file_list: FileList,
}

impl<F: FnMut (ScanState)> FileScanner<F> {
    /// Create a scanner with an empty file list.
    ///
    /// `hash_type` and the four `*_bits` settings are passed straight to
    /// `Scanner::new`. `buffer` is lent to the scanner whenever a file is
    /// read, and `updater` is called with a `ScanState` on every step of
    /// progress.
    pub fn new(
        hash_type: HashType,
        min_bits: u8,
        pref_bits: u8,
        max_bits: u8,
        window_bits: u8,
        buffer: Vec<u8>,
        updater: F,
    ) -> FileScanner<F> {
        let scanner = Scanner::new(hash_type, min_bits, pref_bits, max_bits, window_bits);
        let file_list = FileList::new();
        Self { scanner, updater, buffer, file_list }
    }

    /// Scan every entry of the folder at `path`, stopping at the first error.
    fn build_file_list(&mut self, path: &Path, hidden_prefix: &Option<PathBuf>) -> BottleResult<()> {
        fs::read_dir(path)?.try_for_each(|entry| self.scan_path(&entry?.path(), hidden_prefix))
    }

    /// Normalize `paths`, then scan each of them in order, stopping at the
    /// first error.
    pub fn scan_paths(&mut self, paths: &[PathBuf]) -> BottleResult<()> {
        normalize_paths(paths)?
            .iter()
            .try_for_each(|target| self.scan_path(&target.path, &target.hidden_prefix))
    }

    /// Add `path` to the file list, report it to the updater, and recurse
    /// into it if it is a folder.
    ///
    /// Symlinks are recorded (with size 0 and their target) but not followed.
    /// Anything that is not a symlink, folder, or regular file is silently
    /// skipped. The normalized path is `path` with `hidden_prefix` removed,
    /// when a prefix is given.
    pub fn scan_path(&mut self, path: &Path, hidden_prefix: &Option<PathBuf>) -> BottleResult<()> {
        let metadata = fs::symlink_metadata(path)?;
        let normalized_path = if let Some(prefix) = hidden_prefix {
            // shouldn't actually be possible for this to error:
            path.strip_prefix(prefix).map_err(|_| BottleError::BadPath)?.to_path_buf()
        } else {
            path.to_path_buf()
        };

        let file_type = metadata.file_type();
        let mut atlas: FileAtlas = if file_type.is_symlink() {
            let target = fs::read_link(path)?;
            let mut link_atlas: FileAtlas = (&metadata).try_into()?;
            link_atlas.size = 0;
            link_atlas.symlink_target = Some(target);
            link_atlas
        } else if file_type.is_dir() || file_type.is_file() {
            (&metadata).try_into()?
        } else {
            // not something we know how to archive
            return Ok(());
        };

        atlas.path = path.to_path_buf();
        atlas.normalized_path = normalized_path;
        let atlas = atlas.bobble();

        self.file_list.files.push(atlas.clone());
        let progress = ScanState::FileList {
            atlas: atlas.clone(),
            file_count: self.file_list.total_file_count(),
            bytes: self.file_list.total_size(),
        };
        (self.updater)(progress);

        let is_folder = atlas.borrow().is_folder;
        if is_folder {
            self.build_file_list(path, hidden_prefix)?;
        }
        Ok(())
    }

    /// Read every regular file in the file list, split it into blocks,
    /// record those blocks, and store each file's block list and overall
    /// hash in its atlas. Folders and symlinks are skipped. The updater is
    /// called after every block.
    ///
    /// Returns the completed file list, or the first I/O error encountered.
    pub fn build_block_list(&mut self, hash_type: HashType) -> std::io::Result<&FileList> {
        let mut scanned_size = 0u64;
        // iterate over a copy of the vec, because the file list itself is
        // modified (blocks are added) while we walk the files.
        let files = self.file_list.files.clone();
        for atlas in &files {
            let is_folder = atlas.borrow().is_folder;
            let is_symlink = atlas.borrow().symlink_target.is_some();
            if is_folder || is_symlink {
                continue;
            }

            let mut f = fs::File::open(&atlas.borrow().path)?;
            let mut blocks: Vec<Block> = Vec::new();
            let mut digest = Hashing::new(hash_type);
            for scanned in self.scanner.reader_iter(&mut f, &mut self.buffer, &mut digest) {
                let block = scanned?;
                scanned_size += block.size as u64;
                self.file_list.add_block(atlas, &block);
                blocks.push(block);

                let progress = ScanState::Blocks {
                    blocks: self.file_list.blocks.len(),
                    file_count: self.file_list.total_file_count(),
                    bytes: scanned_size,
                    total_bytes: self.file_list.total_size(),
                    unique_bytes: self.file_list.total_block_size(),
                };
                (self.updater)(progress);
            }
            atlas.borrow_mut().contents = FileBlocks { hash: digest.finalize_reset(), blocks };
        }

        Ok(&self.file_list)
    }
}


impl<F: FnMut (ScanState)> From<FileScanner<F>> for FileList {
    fn from(file_scanner: FileScanner<F>) -> Self {
        file_scanner.file_list
    }
}


// Unit tests for the path helpers (`find_common_prefix`, `normalize_paths`).
// They only manipulate path values; nothing here touches the filesystem.
#[cfg(test)]
mod test {
    use std::path::{Path, PathBuf};
    use super::{find_common_prefix, NormalizedScanPath, normalize_paths};

    /// Run `normalize_paths` on string paths, panicking if it returns an error.
    fn normalize(paths: &[&str]) -> Vec<NormalizedScanPath> {
        let paths: Vec<PathBuf> = paths.iter().map(PathBuf::from).collect();
        normalize_paths(&paths).unwrap()
    }

    /// Build the expected result list from `(path, hidden_prefix)` pairs.
    fn scan_paths(paths: &[(&str, Option<&str>)]) -> Vec<NormalizedScanPath> {
        paths.iter().map(|(path, hidden_prefix)| {
            NormalizedScanPath {
                path: Path::new(path).to_path_buf(),
                hidden_prefix: hidden_prefix.map(|p| Path::new(p).to_path_buf()),
            }
        }).collect()
    }

    /// Compute the normalized form of each scan path: the path with its
    /// hidden prefix (if any) stripped.
    fn without_prefix(paths: &[NormalizedScanPath]) -> Vec<PathBuf> {
        paths.iter().map(|p| {
            match &p.hidden_prefix {
                None => p.path.to_path_buf(),
                Some(prefix) => p.path.strip_prefix(prefix).unwrap().to_path_buf(),
            }
        }).collect()
    }


    #[test]
    fn common_prefix() {
        // several paths sharing a deep prefix
        assert_eq!(
            find_common_prefix(&[
                "/home/robey/projects/rust/bitbottle/tests",
                "/home/robey/projects/rust/bitbottle/src",
                "/home/robey/projects/rust/mwgc",
            ].map(PathBuf::from)),
            PathBuf::from("/home/robey/projects/rust")
        );

        // paths with nothing in common but the root
        assert_eq!(
            find_common_prefix(&[
                "/home/robey/projects/rust/bitbottle/tests",
                "/etc/init.d/rc5.d",
            ].map(PathBuf::from)),
            PathBuf::from("/")
        );
    }

    #[test]
    fn normalize_relative() {
        // trailing slashes are dropped; relative paths get no hidden prefix
        assert_eq!(
            normalize(&[ "src/", "docs/" ]),
            scan_paths(&[ ("src", None), ("docs", None) ]),
        );

        // a leading "./" is trimmed
        assert_eq!(
            normalize(&[ "./src/", "docs/" ]),
            scan_paths(&[ ("src", None), ("docs", None) ]),
        );
    }

    #[test]
    fn normalize_disallow_dots() {
        // ".." in the middle of a path is rejected, and the error carries the
        // path exactly as it was given
        let rv = normalize_paths(&[ PathBuf::from("./src/../src") ]);
        assert!(rv.is_err());
        assert_eq!(format!("{:?}", rv.unwrap_err()), "InvalidAddPath(\"./src/../src\")");
    }

    #[test]
    fn normalize_absolute() {
        // multiple absolute paths: their common prefix is hidden
        let paths = normalize(&[ "/home/robey/src/", "/home/robey/docs" ]);
        assert_eq!(
            paths,
            scan_paths(&[ ("/home/robey/src", Some("/home/robey")), ("/home/robey/docs", Some("/home/robey")) ]),
        );

        assert_eq!(without_prefix(&paths), [ "src", "docs" ].map(PathBuf::from));
    }

    #[test]
    fn normalize_one_prefix() {
        // a single absolute path: its parent is hidden
        let paths = normalize(&[ "/home/robey/src/" ]);
        assert_eq!(
            paths,
            scan_paths(&[ ("/home/robey/src", Some("/home/robey")) ]),
        );

        assert_eq!(without_prefix(&paths), [ "src" ].map(PathBuf::from));
    }

    #[test]
    fn normalize_keep_ordering() {
        // mixing relative and absolute paths must not reorder them
        let paths = normalize(&[ "target", "/home/robey/src/", "./tests", "/home/robey/docs" ]);
        assert_eq!(
            paths,
            scan_paths(&[
                ("target", None),
                ("/home/robey/src", Some("/home/robey")),
                ("tests", None),
                ("/home/robey/docs", Some("/home/robey"))
            ]),
        );

        assert_eq!(without_prefix(&paths), [ "target", "src", "tests", "docs" ].map(PathBuf::from));
    }

    #[test]
    fn duplicate_paths() {
        // "target" and "/home/robey/target" both normalize to "target"
        let rv = normalize_paths(&[ "target", "/home/robey/src", "/home/robey/target/" ].map(PathBuf::from));
        assert!(rv.is_err());
        assert_eq!(format!("{:?}", rv.unwrap_err()), "DuplicatePaths(\"target\", \"/home/robey/target\")");
    }
}