//! Chunk-based file deduplication: files are split into fixed-size chunks,
//! each unique chunk is stored once under its hash, and a cache maps files
//! back to their chunk lists so they can be rebuilt later.
use std::cell::OnceCell;
use std::collections::hash_map::IntoIter;
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{BufReader, BufWriter, Read, Seek, SeekFrom, Write};
use std::path::{Path, PathBuf};
use std::time::SystemTime;
use file_declutter::FileDeclutter;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use thiserror::Error;
use walkdir::WalkDir;
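/// (De)serialization of the chunk cache to and from disk.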
mod cache;
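/// Crate-wide error type; currently every failure is an I/O error.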
#[derive(Debug, Error)]
pub enum Error {
#[error(transparent)]
Io(#[from] std::io::Error),
}
type Result<R> = std::result::Result<R, Error>;
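/// Reads up to `len` bytes from `file` at `offset`, retrying on short reads.
/// On Unix this uses positioned reads (`pread`), which leave the file cursor
/// untouched, so many chunks can be read from one handle in parallel.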
#[cfg(unix)]
fn read_at_chunk(file: &File, offset: u64, len: usize) -> std::io::Result<Vec<u8>> {
use std::os::unix::fs::FileExt;
let mut buf = vec![0u8; len];
let mut pos = 0usize;
while pos < len {
let read = file.read_at(&mut buf[pos..], offset + pos as u64)?;
if read == 0 {
break;
}
pos += read;
}
buf.truncate(pos);
Ok(buf)
}
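/// Windows counterpart of the Unix version above. `seek_read` takes an
/// explicit offset, so parallel reads from a single handle stay correct, but
/// unlike `pread` it also moves the file cursor as a side effect.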
#[cfg(windows)]
fn read_at_chunk(file: &File, offset: u64, len: usize) -> std::io::Result<Vec<u8>> {
use std::os::windows::fs::FileExt;
let mut buf = vec![0u8; len];
let mut pos = 0usize;
while pos < len {
let read = file.seek_read(&mut buf[pos..], offset + pos as u64)?;
if read == 0 {
break;
}
pos += read;
}
buf.truncate(pos);
Ok(buf)
}
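/// Digest used to content-address chunks. SHA-1 is the default.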
#[derive(Clone, Copy, Debug, Default, Deserialize, Serialize)]
pub enum HashingAlgorithm {
MD5,
#[default]
SHA1,
SHA256,
SHA512,
}
impl HashingAlgorithm {
fn select_hasher(&self) -> Box<dyn sha2::digest::DynDigest> {
match self {
Self::MD5 => Box::new(md5::Md5::default()),
Self::SHA1 => Box::new(sha1::Sha1::default()),
Self::SHA256 => Box::new(sha2::Sha256::default()),
Self::SHA512 => Box::new(sha2::Sha512::default()),
}
}
}
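/// A file under the source tree, identified by its path relative to `base`,
/// plus its lazily computed chunk list.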
#[derive(Clone, Debug)]
pub struct FileWithChunks {
base: PathBuf,
pub path: String,
pub size: u64,
pub mtime: SystemTime,
chunks: OnceCell<Vec<FileChunk>>,
hashing_algorithm: HashingAlgorithm,
}
impl PartialEq for FileWithChunks {
fn eq(&self, other: &Self) -> bool {
self.path == other.path && self.size == other.size && self.mtime == other.mtime
}
}
impl Eq for FileWithChunks {}
impl FileWithChunks {
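/// Builds an entry for `path`, stored relative to `source_path`; chunks are
/// computed lazily on first use.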
pub fn try_new(
source_path: impl Into<PathBuf>,
path: impl Into<PathBuf>,
hashing_algorithm: HashingAlgorithm,
) -> Result<Self> {
let base = source_path.into();
let path = path.into();
let metadata = path.metadata()?;
let path = path
.strip_prefix(&base)
.unwrap()
.to_string_lossy()
.to_string();
let size = metadata.len();
let mtime = metadata.modified()?;
Ok(Self {
base,
path,
size,
mtime,
chunks: Default::default(),
hashing_algorithm,
})
}
pub fn get_chunks(&self) -> Option<&Vec<FileChunk>> {
self.chunks.get()
}
pub fn take_chunks(&mut self) -> Option<Vec<FileChunk>> {
self.chunks.take()
}
pub fn get_or_calculate_chunks(&self) -> Result<&Vec<FileChunk>> {
if self.chunks.get().is_none() {
let chunks = self.calculate_chunks()?;
self.chunks.set(chunks).unwrap();
}
Ok(self.chunks.get().unwrap())
}
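/// Hashes the file in fixed 1 MiB chunks, fanning the per-chunk work out
/// over rayon's thread pool via positioned reads on a shared handle. Empty
/// files yield a single zero-length chunk so they still round-trip.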
fn calculate_chunks(&self) -> Result<Vec<FileChunk>> {
let path = self.base.join(&self.path);
let size = path.metadata()?.len();
let hashing_algorithm = self.hashing_algorithm;
let chunk_size = 1024 * 1024;
if size == 0 {
let hasher = hashing_algorithm.select_hasher();
let hash = hasher.finalize();
let hash = base16ct::lower::encode_string(&hash);
Ok(vec![FileChunk::new(0, 0, hash)])
} else {
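// One handle is shared by every rayon worker; positioned reads keep
// the parallel reads independent of each other.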
let file = File::open(&path)?;
let total_chunks = size.div_ceil(chunk_size);
(0..total_chunks)
.into_par_iter()
.map(|chunk_idx| {
let offset = chunk_idx * chunk_size;
let len = chunk_size.min(size.saturating_sub(offset)) as usize;
let data = read_at_chunk(&file, offset, len)?;
let mut hasher = hashing_algorithm.select_hasher();
hasher.update(&data);
let hash = hasher.finalize();
let hash = base16ct::lower::encode_string(&hash);
Ok::<FileChunk, Error>(FileChunk::new(offset, data.len() as u64, hash))
})
.collect()
}
}
}
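/// One contiguous, hashed slice of a file. `path` is filled in when the
/// chunk is handed out by [`DedupCache::get_chunks`].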
#[derive(Clone, Debug)]
pub struct FileChunk {
pub start: u64,
pub size: u64,
pub hash: String,
pub path: Option<String>,
}
impl FileChunk {
pub fn new(start: u64, size: u64, hash: String) -> Self {
Self {
start,
size,
hash,
path: None,
}
}
}
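/// Map from relative file path to its [`FileWithChunks`] entry.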
pub struct DedupCache(HashMap<String, FileWithChunks>);
impl DedupCache {
fn new() -> Self {
Self(HashMap::new())
}
fn from_hashmap(hash_map: HashMap<String, FileWithChunks>) -> Self {
Self(hash_map)
}
fn read_from_file(&mut self, path: impl AsRef<Path>) {
let cache_from_file = cache::read_from_file(path);
for x in cache_from_file {
self.insert(x.path.clone(), x);
}
}
fn write_to_file(&self, path: impl AsRef<Path>) {
cache::write_to_file(path, self);
}
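/// Iterates over every chunk of every file as `(hash, chunk, dirty)`; each
/// chunk carries its owning file's relative path.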
pub fn get_chunks(&self) -> Result<impl Iterator<Item = (String, FileChunk, bool)> + '_> {
Ok(self.values().flat_map(|fwc| {
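// `dirty` is true only for the first chunk of a file whose chunks had
// to be (re)computed in this run rather than loaded from a cache file.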
let mut dirty = fwc.get_chunks().is_none();
fwc.get_or_calculate_chunks()
.expect("failed to hash file chunks") // panics: errors cannot cross flat_map
.iter()
.map(move |chunk| {
let result = (
chunk.hash.clone(),
FileChunk {
path: Some(fwc.path.clone()),
..chunk.clone()
},
dirty,
);
dirty = false;
result
})
}))
}
pub fn get(&self, path: &str) -> Option<&FileWithChunks> {
self.0.get(path)
}
pub fn get_mut(&mut self, path: &str) -> Option<&mut FileWithChunks> {
self.0.get_mut(path)
}
fn insert(&mut self, path: String, fwc: FileWithChunks) {
self.0.insert(path, fwc);
}
pub fn contains_key(&self, path: &str) -> bool {
self.0.contains_key(path)
}
pub fn into_iter(self) -> IntoIter<String, FileWithChunks> {
self.0.into_iter()
}
pub fn values(&self) -> impl Iterator<Item = &FileWithChunks> {
self.0.values()
}
pub fn len(&self) -> usize {
self.0.len()
}
}
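/// Splits every regular file under `source_path` into chunks and stores each
/// unique chunk under its hash below `<target>/data`.
///
/// A minimal sketch of the intended flow (paths are illustrative, not part
/// of the API):
/// ```ignore
/// let mut deduper = Deduper::new(
///     "/data/photos",             // tree to deduplicate
///     vec!["/backup/cache.json"], // first path wins and is written back
///     HashingAlgorithm::SHA256,
///     true,                       // do not cross file-system boundaries
/// );
/// deduper.write_chunks("/backup", 3)?; // fan chunks into 3 directory levels
/// deduper.write_cache();
/// ```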
pub struct Deduper {
source_path: PathBuf,
cache_path: PathBuf,
pub cache: DedupCache,
}
impl Deduper {
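/// Scans `source_path`, merging with any existing cache files: unchanged
/// files (same path, size and mtime) keep their cached chunk hashes.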
pub fn new(
source_path: impl Into<PathBuf>,
cache_paths: Vec<impl Into<PathBuf>>,
hashing_algorithm: HashingAlgorithm,
same_file_system: bool,
) -> Self {
let source_path = source_path.into();
let mut cache = DedupCache::new();
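// Cache files are read in reverse order so that entries from earlier
// paths override later ones; the first path is also the one
// `write_cache` writes back to.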
let cache_path = {
let mut cache_path = Default::default();
for cache_path_from_iter in cache_paths.into_iter().rev() {
cache_path = cache_path_from_iter.into();
cache.read_from_file(&cache_path);
}
cache_path
};
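// Only regular files count; symlinks are skipped entirely.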
let valid_entry = |path: &PathBuf| path.is_file() && !path.is_symlink();
cache = DedupCache::from_hashmap(
cache
.into_iter()
.filter(|(path, _)| valid_entry(&source_path.join(path)))
.collect(),
);
let dir_walker = WalkDir::new(&source_path)
.min_depth(1)
.same_file_system(same_file_system);
for entry in dir_walker {
let entry = entry.unwrap().into_path();
if !valid_entry(&entry) {
continue;
}
let fwc = FileWithChunks::try_new(&source_path, &entry, hashing_algorithm).unwrap();
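// If path, size and mtime all match the cached entry, keep its
// chunk hashes and only refresh the base directory it resolves
// against.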
if let Some(fwc_cache) = cache.get_mut(&fwc.path) {
if fwc == *fwc_cache {
fwc_cache.base = source_path.clone();
continue;
}
}
cache.insert(fwc.path.clone(), fwc);
}
Self {
source_path,
cache_path,
cache,
}
}
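/// Persists the cache via a temp file followed by an atomic rename, so a
/// crash mid-write never leaves a truncated cache behind.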
pub fn write_cache(&self) {
if self.cache_path.file_name().is_none() {
return;
}
let temp_path = self.cache_path.with_extension(format!(
"tmp.{}.{}",
SystemTime::now()
.duration_since(SystemTime::UNIX_EPOCH)
.unwrap()
.as_millis(),
self.cache_path
.extension()
.unwrap_or("ext".as_ref())
.to_str()
.unwrap()
));
self.cache.write_to_file(&temp_path);
std::fs::rename(temp_path, &self.cache_path).unwrap();
}
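/// Writes every chunk that is not already present in `<target>/data`,
/// spreading files across `declutter_levels` of nested subdirectories so no
/// single directory grows too large.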
pub fn write_chunks(
&mut self,
target_path: impl Into<PathBuf>,
declutter_levels: usize,
) -> Result<()> {
let target_path = target_path.into();
let data_dir = target_path.join("data");
std::fs::create_dir_all(&data_dir)?;
for (_, chunk, _) in self.cache.get_chunks()? {
let mut chunk_file = PathBuf::from(&chunk.hash);
if declutter_levels > 0 {
chunk_file = FileDeclutter::oneshot(chunk_file, declutter_levels);
}
chunk_file = data_dir.join(chunk_file);
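// Content-addressed store: a file named by this hash already holds
// identical bytes, so existing chunks are never rewritten.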
if !chunk_file.exists() {
std::fs::create_dir_all(chunk_file.parent().unwrap())?;
let mut out = File::create(chunk_file)?;
let mut src = BufReader::new(File::open(
self.source_path.join(chunk.path.as_ref().unwrap()),
)?);
src.seek(SeekFrom::Start(chunk.start))?;
let mut limited = src.take(chunk.size);
std::io::copy(&mut limited, &mut out)?;
}
}
Ok(())
}
}
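/// Rebuilds original files from a chunk store and cache produced by
/// [`Deduper`].
///
/// A sketch of the restore flow (paths are illustrative):
/// ```ignore
/// let hydrator = Hydrator::new("/backup", vec!["/backup/cache.json"]);
/// assert!(hydrator.check_cache(3)); // all chunks present and sized correctly
/// hydrator.restore_files("/restored", 3);
/// ```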
pub struct Hydrator {
source_path: PathBuf,
pub cache: DedupCache,
}
impl Hydrator {
pub fn new(source_path: impl Into<PathBuf>, cache_paths: Vec<impl Into<PathBuf>>) -> Self {
let source_path = source_path.into();
let mut cache = DedupCache::new();
for cache_path in cache_paths.into_iter().rev() {
let cache_path = cache_path.into();
cache.read_from_file(&cache_path);
}
Self { source_path, cache }
}
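/// Recreates every cached file under `target_path` by concatenating its
/// chunks in order, then restores the original modification time.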
pub fn restore_files(&self, target_path: impl Into<PathBuf>, declutter_levels: usize) {
let data_dir = self.source_path.join("data");
let target_path = target_path.into();
std::fs::create_dir_all(&target_path).unwrap();
for fwc in self.cache.values() {
let target = target_path.join(&fwc.path);
std::fs::create_dir_all(target.parent().unwrap()).unwrap();
let target_file = File::create(&target).unwrap();
let mut target = BufWriter::new(&target_file);
for chunk in fwc.get_chunks().unwrap() {
let mut chunk_file = PathBuf::from(&chunk.hash);
if declutter_levels > 0 {
chunk_file = FileDeclutter::oneshot(chunk_file, declutter_levels);
}
chunk_file = data_dir.join(chunk_file);
let mut source = File::open(chunk_file).unwrap();
std::io::copy(&mut source, &mut target).unwrap();
}
target.flush().unwrap();
target_file.set_modified(fwc.mtime).unwrap();
}
}
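/// Yields `(path, reason)` for every unique chunk that is absent from the
/// store or whose on-disk size does not match the cache.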
pub fn list_missing_chunks(
&self,
declutter_levels: usize,
) -> impl Iterator<Item = (PathBuf, String)> {
let mut hashes_and_chunks = self
.cache
.get_chunks()
.unwrap()
.map(|(hash, chunk, ..)| (PathBuf::from(hash), chunk))
.collect::<Vec<_>>();
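// Sort and dedup by hash so each unique chunk is verified exactly once.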
hashes_and_chunks.sort_by(|a, b| a.0.cmp(&b.0));
hashes_and_chunks.dedup_by(|a, b| a.0 == b.0);
let (hashes, chunks): (Vec<_>, Vec<_>) = hashes_and_chunks.into_iter().unzip();
let files_in_cache = FileDeclutter::new_from_iter(hashes.into_iter())
.base(&self.source_path.join("data"))
.levels(declutter_levels)
.map(|(_, path)| path);
files_in_cache
.zip(chunks)
.filter_map(|(path, chunk)| {
if !path.exists() {
Some((path, "Does not exist".to_string()))
} else if path.metadata().unwrap().len() != chunk.size {
Some((
path,
format!("Does not have expected size of {}", chunk.size),
))
} else {
None
}
})
}
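/// Returns `true` when every cached chunk exists in the store with the
/// expected size.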
pub fn check_cache(&self, declutter_levels: usize) -> bool {
self.list_missing_chunks(declutter_levels).next().is_none()
}
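/// Walks `<source>/data` and yields every file that no cached chunk refers
/// to, e.g. leftovers from files that have since changed or been removed.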
pub fn list_extra_files(&self, declutter_levels: usize) -> impl Iterator<Item = PathBuf> {
let files_in_cache = FileDeclutter::new_from_iter(
self.cache
.get_chunks()
.unwrap()
.map(|(hash, ..)| PathBuf::from(hash)),
)
.base(&self.source_path.join("data"))
.levels(declutter_levels)
.map(|(_, path)| path)
.collect::<HashSet<_>>();
WalkDir::new(self.source_path.join("data"))
.min_depth(1)
.same_file_system(false)
.into_iter()
// Walk errors are silently skipped, as before.
.filter_map(|entry| entry.ok())
.filter(move |entry| {
entry.file_type().is_file() && !files_in_cache.contains(entry.path())
})
.map(|entry| entry.into_path())
}
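/// Removes every file reported by [`Self::list_extra_files`].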
pub fn delete_extra_files(&self, declutter_levels: usize) -> Result<()> {
for path in self.list_extra_files(declutter_levels) {
std::fs::remove_file(&path)?;
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use std::fs::OpenOptions;
use assert_fs::fixture::ChildPath;
use assert_fs::prelude::*;
use assert_fs::{NamedTempFile, TempDir};
use super::*;
fn setup() -> anyhow::Result<(TempDir, ChildPath, ChildPath, ChildPath)> {
let temp = TempDir::new()?;
let origin = temp.child("origin");
origin.create_dir_all()?;
origin.child("README.md").write_str("Hello, world!")?;
let deduped = temp.child("deduped");
deduped.create_dir_all()?;
let cache = temp.child("cache.json");
{
let mut deduper = Deduper::new(
origin.to_path_buf(),
vec![cache.to_path_buf()],
HashingAlgorithm::MD5,
true,
);
deduper.write_chunks(deduped.to_path_buf(), 3)?;
deduper.write_cache();
}
Ok((temp, origin, deduped, cache))
}
#[test]
fn compare_filechunk_objects() -> anyhow::Result<()> {
let temp = TempDir::new()?;
let file_1 = temp.child("file_1");
std::fs::write(&file_1, "content_1")?;
let file_2 = temp.child("file_2");
std::fs::write(&file_2, "content_2")?;
let fwc_1 = FileWithChunks::try_new(temp.path(), file_1.path(), HashingAlgorithm::MD5)?;
let fwc_1_same =
FileWithChunks::try_new(temp.path(), file_1.path(), HashingAlgorithm::MD5)?;
let fwc_2 = FileWithChunks::try_new(temp.path(), file_2.path(), HashingAlgorithm::MD5)?;
assert_eq!(fwc_1, fwc_1);
assert_eq!(fwc_1, fwc_1_same);
assert_ne!(fwc_1, fwc_2);
OpenOptions::new()
.write(true)
.open(&file_1)?
.set_modified(SystemTime::now())?;
let fwc_1_new =
FileWithChunks::try_new(temp.path(), file_1.path(), HashingAlgorithm::MD5)?;
assert_ne!(fwc_1, fwc_1_new);
Ok(())
}
#[test]
fn check_all_hashing_algorithms() -> anyhow::Result<()> {
let algorithms = &[
(HashingAlgorithm::MD5, "0fb073cd346f46f60c15e719f3820482"),
(
HashingAlgorithm::SHA1,
"5503f5edc1bba66a7733c5ec38f4e9d449021be9",
),
(
HashingAlgorithm::SHA256,
"e8c73ac958a87f17906b092bd99f37038788ee23b271574aad6d5bf1c76cc61c",
),
(
HashingAlgorithm::SHA512,
"e6eda213df25f96ca380dd07640df530574e380c1b93d5d863fec05d5908a4880a3075fef4a438cfb1023cc51affb4624002f54b4790fe8362c7de032eb39aaa",
),
];
let temp = TempDir::new()?;
let file = temp.child("file");
std::fs::write(&file, "hello rust")?;
for (algorithm, expected_hash) in algorithms.iter().copied() {
let cache_file = NamedTempFile::new("cache.json")?;
let chunks = Deduper::new(temp.path(), vec![cache_file.path()], algorithm, true)
.cache
.get_chunks()?
.collect::<Vec<_>>();
assert_eq!(chunks.len(), 1, "Too many chunks");
let (hash, _, _) = &chunks[0];
assert_eq!(
hash, &expected_hash,
"Algorithm {:?} does not produce expected hash",
algorithm
);
}
Ok(())
}
#[test]
fn check_cache() -> anyhow::Result<()> {
let (_temp, _origin, deduped, cache) = setup()?;
assert!(
Hydrator::new(deduped.to_path_buf(), vec![cache.to_path_buf()]).check_cache(3),
"Cache checking failed when it shouldn't"
);
std::fs::remove_dir_all(deduped.child("data").read_dir()?.next().unwrap()?.path())?;
assert!(
!Hydrator::new(deduped.to_path_buf(), vec![cache.to_path_buf()]).check_cache(3),
"Cache checking didn't fail when it should"
);
Ok(())
}
#[test]
fn check_list_extra() -> anyhow::Result<()> {
let (_temp, _origin, deduped, cache) = setup()?;
assert_eq!(
Hydrator::new(deduped.to_path_buf(), vec![cache.to_path_buf()])
.list_extra_files(3)
.count(),
0,
"Extra files present when there shouldn't be"
);
deduped
.child("data")
.child("extra_file")
.write_str("Hello, world!")?;
assert_eq!(
Hydrator::new(deduped.to_path_buf(), vec![cache.to_path_buf()])
.list_extra_files(3)
.count(),
1,
"Number of extra files present is not 1"
);
deduped
.child("data")
.child("e")
.child("x")
.child("t")
.child("extra_file")
.write_str("Hello, world!")?;
assert_eq!(
Hydrator::new(deduped.to_path_buf(), vec![cache.to_path_buf()])
.list_extra_files(3)
.count(),
2,
"Number of extra files present is not 2"
);
Ok(())
}
#[cfg(not(windows))]
#[test]
fn check_files_with_exotic_characters() -> anyhow::Result<()> {
let temp = TempDir::new()?;
let origin = temp.child("origin");
origin.create_dir_all()?;
let deduped = temp.child("deduped");
deduped.create_dir_all()?;
let cache = temp.child("cache.json");
let filename_with_newline = "new\nline.txt";
let file_with_newline = origin.child(filename_with_newline);
file_with_newline.write_str("content")?;
let filename_with_japanese = "日本語ファイル名";
let file_with_japanese = origin.child(filename_with_japanese);
file_with_japanese.write_str("content")?;
let try_dedup = || -> anyhow::Result<()> {
let mut deduper = Deduper::new(
origin.to_path_buf(),
vec![cache.to_path_buf()],
HashingAlgorithm::MD5,
true,
);
assert!(
deduper.cache.get(filename_with_newline).is_some(),
"File with newline is missing from cache"
);
assert!(
deduper.cache.get(filename_with_japanese).is_some(),
"File with Japanese is missing from cache"
);
deduper.write_chunks(deduped.to_path_buf(), 3)?;
deduper.write_cache();
Ok(())
};
try_dedup()?;
try_dedup()?;
Ok(())
}
}